1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * regexport.c |
4 | * Functions for exporting info about a regex's NFA |
5 | * |
6 | * In this implementation, the NFA defines a necessary but not sufficient |
7 | * condition for a string to match the regex: that is, there can be strings |
8 | * that match the NFA but don't match the full regex, but not vice versa. |
9 | * Thus, for example, it is okay for the functions below to treat lookaround |
10 | * constraints as no-ops, since they merely constrain the string some more. |
11 | * |
12 | * Notice that these functions return info into caller-provided arrays |
13 | * rather than doing their own malloc's. This simplifies the APIs by |
14 | * eliminating a class of error conditions, and in the case of colors |
15 | * allows the caller to decide how big is too big to bother with. |
16 | * |
17 | * |
18 | * Portions Copyright (c) 2013-2019, PostgreSQL Global Development Group |
19 | * Portions Copyright (c) 1998, 1999 Henry Spencer |
20 | * |
21 | * IDENTIFICATION |
22 | * src/backend/regex/regexport.c |
23 | * |
24 | *------------------------------------------------------------------------- |
25 | */ |
26 | |
27 | #include "regex/regguts.h" |
28 | |
29 | #include "regex/regexport.h" |
30 | |
31 | |
32 | /* |
33 | * Get total number of NFA states. |
34 | */ |
35 | int |
36 | pg_reg_getnumstates(const regex_t *regex) |
37 | { |
38 | struct cnfa *cnfa; |
39 | |
40 | assert(regex != NULL && regex->re_magic == REMAGIC); |
41 | cnfa = &((struct guts *) regex->re_guts)->search; |
42 | |
43 | return cnfa->nstates; |
44 | } |
45 | |
46 | /* |
47 | * Get initial state of NFA. |
48 | */ |
49 | int |
50 | pg_reg_getinitialstate(const regex_t *regex) |
51 | { |
52 | struct cnfa *cnfa; |
53 | |
54 | assert(regex != NULL && regex->re_magic == REMAGIC); |
55 | cnfa = &((struct guts *) regex->re_guts)->search; |
56 | |
57 | return cnfa->pre; |
58 | } |
59 | |
60 | /* |
61 | * Get final state of NFA. |
62 | */ |
63 | int |
64 | pg_reg_getfinalstate(const regex_t *regex) |
65 | { |
66 | struct cnfa *cnfa; |
67 | |
68 | assert(regex != NULL && regex->re_magic == REMAGIC); |
69 | cnfa = &((struct guts *) regex->re_guts)->search; |
70 | |
71 | return cnfa->post; |
72 | } |
73 | |
74 | /* |
75 | * pg_reg_getnumoutarcs() and pg_reg_getoutarcs() mask the existence of LACON |
76 | * arcs from the caller, treating any LACON as being automatically satisfied. |
77 | * Since the output representation does not support arcs that consume no |
78 | * character when traversed, we have to recursively traverse LACON arcs here, |
79 | * and report whatever normal arcs are reachable by traversing LACON arcs. |
80 | * Note that this wouldn't work if it were possible to reach the final state |
81 | * via LACON traversal, but the regex library never builds NFAs that have |
82 | * LACON arcs leading directly to the final state. (This is because the |
83 | * regex executor is designed to consume one character beyond the nominal |
84 | * match end --- possibly an EOS indicator --- so there is always a set of |
85 | * ordinary arcs leading to the final state.) |
86 | * |
87 | * traverse_lacons is a recursive subroutine used by both exported functions |
88 | * to count and then emit the reachable regular arcs. *arcs_count is |
89 | * incremented by the number of reachable arcs, and as many as will fit in |
90 | * arcs_len (possibly 0) are emitted into arcs[]. |
91 | */ |
92 | static void |
93 | traverse_lacons(struct cnfa *cnfa, int st, |
94 | int *arcs_count, |
95 | regex_arc_t *arcs, int arcs_len) |
96 | { |
97 | struct carc *ca; |
98 | |
99 | /* |
100 | * Since this function recurses, it could theoretically be driven to stack |
101 | * overflow. In practice, this is mostly useful to backstop against a |
102 | * failure of the regex compiler to remove a loop of LACON arcs. |
103 | */ |
104 | check_stack_depth(); |
105 | |
106 | for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) |
107 | { |
108 | if (ca->co < cnfa->ncolors) |
109 | { |
110 | /* Ordinary arc, so count and possibly emit it */ |
111 | int ndx = (*arcs_count)++; |
112 | |
113 | if (ndx < arcs_len) |
114 | { |
115 | arcs[ndx].co = ca->co; |
116 | arcs[ndx].to = ca->to; |
117 | } |
118 | } |
119 | else |
120 | { |
121 | /* LACON arc --- assume it's satisfied and recurse... */ |
122 | /* ... but first, assert it doesn't lead directly to post state */ |
123 | Assert(ca->to != cnfa->post); |
124 | |
125 | traverse_lacons(cnfa, ca->to, arcs_count, arcs, arcs_len); |
126 | } |
127 | } |
128 | } |
129 | |
130 | /* |
131 | * Get number of outgoing NFA arcs of state number "st". |
132 | */ |
133 | int |
134 | pg_reg_getnumoutarcs(const regex_t *regex, int st) |
135 | { |
136 | struct cnfa *cnfa; |
137 | int arcs_count; |
138 | |
139 | assert(regex != NULL && regex->re_magic == REMAGIC); |
140 | cnfa = &((struct guts *) regex->re_guts)->search; |
141 | |
142 | if (st < 0 || st >= cnfa->nstates) |
143 | return 0; |
144 | arcs_count = 0; |
145 | traverse_lacons(cnfa, st, &arcs_count, NULL, 0); |
146 | return arcs_count; |
147 | } |
148 | |
149 | /* |
150 | * Write array of outgoing NFA arcs of state number "st" into arcs[], |
151 | * whose length arcs_len must be at least as long as indicated by |
152 | * pg_reg_getnumoutarcs(), else not all arcs will be returned. |
153 | */ |
154 | void |
155 | pg_reg_getoutarcs(const regex_t *regex, int st, |
156 | regex_arc_t *arcs, int arcs_len) |
157 | { |
158 | struct cnfa *cnfa; |
159 | int arcs_count; |
160 | |
161 | assert(regex != NULL && regex->re_magic == REMAGIC); |
162 | cnfa = &((struct guts *) regex->re_guts)->search; |
163 | |
164 | if (st < 0 || st >= cnfa->nstates || arcs_len <= 0) |
165 | return; |
166 | arcs_count = 0; |
167 | traverse_lacons(cnfa, st, &arcs_count, arcs, arcs_len); |
168 | } |
169 | |
170 | /* |
171 | * Get total number of colors. |
172 | */ |
173 | int |
174 | pg_reg_getnumcolors(const regex_t *regex) |
175 | { |
176 | struct colormap *cm; |
177 | |
178 | assert(regex != NULL && regex->re_magic == REMAGIC); |
179 | cm = &((struct guts *) regex->re_guts)->cmap; |
180 | |
181 | return cm->max + 1; |
182 | } |
183 | |
184 | /* |
185 | * Check if color is beginning of line/string. |
186 | * |
187 | * (We might at some point need to offer more refined handling of pseudocolors, |
188 | * but this will do for now.) |
189 | */ |
190 | int |
191 | pg_reg_colorisbegin(const regex_t *regex, int co) |
192 | { |
193 | struct cnfa *cnfa; |
194 | |
195 | assert(regex != NULL && regex->re_magic == REMAGIC); |
196 | cnfa = &((struct guts *) regex->re_guts)->search; |
197 | |
198 | if (co == cnfa->bos[0] || co == cnfa->bos[1]) |
199 | return true; |
200 | else |
201 | return false; |
202 | } |
203 | |
204 | /* |
205 | * Check if color is end of line/string. |
206 | */ |
207 | int |
208 | pg_reg_colorisend(const regex_t *regex, int co) |
209 | { |
210 | struct cnfa *cnfa; |
211 | |
212 | assert(regex != NULL && regex->re_magic == REMAGIC); |
213 | cnfa = &((struct guts *) regex->re_guts)->search; |
214 | |
215 | if (co == cnfa->eos[0] || co == cnfa->eos[1]) |
216 | return true; |
217 | else |
218 | return false; |
219 | } |
220 | |
221 | /* |
222 | * Get number of member chrs of color number "co". |
223 | * |
224 | * Note: we return -1 if the color number is invalid, or if it is a special |
225 | * color (WHITE or a pseudocolor), or if the number of members is uncertain. |
226 | * Callers should not try to extract the members if -1 is returned. |
227 | */ |
228 | int |
229 | pg_reg_getnumcharacters(const regex_t *regex, int co) |
230 | { |
231 | struct colormap *cm; |
232 | |
233 | assert(regex != NULL && regex->re_magic == REMAGIC); |
234 | cm = &((struct guts *) regex->re_guts)->cmap; |
235 | |
236 | if (co <= 0 || co > cm->max) /* we reject 0 which is WHITE */ |
237 | return -1; |
238 | if (cm->cd[co].flags & PSEUDO) /* also pseudocolors (BOS etc) */ |
239 | return -1; |
240 | |
241 | /* |
242 | * If the color appears anywhere in the high colormap, treat its number of |
243 | * members as uncertain. In principle we could determine all the specific |
244 | * chrs corresponding to each such entry, but it would be expensive |
245 | * (particularly if character class tests are required) and it doesn't |
246 | * seem worth it. |
247 | */ |
248 | if (cm->cd[co].nuchrs != 0) |
249 | return -1; |
250 | |
251 | /* OK, return the known number of member chrs */ |
252 | return cm->cd[co].nschrs; |
253 | } |
254 | |
255 | /* |
256 | * Write array of member chrs of color number "co" into chars[], |
257 | * whose length chars_len must be at least as long as indicated by |
258 | * pg_reg_getnumcharacters(), else not all chars will be returned. |
259 | * |
260 | * Fetching the members of WHITE or a pseudocolor is not supported. |
261 | * |
262 | * Caution: this is a relatively expensive operation. |
263 | */ |
264 | void |
265 | pg_reg_getcharacters(const regex_t *regex, int co, |
266 | pg_wchar *chars, int chars_len) |
267 | { |
268 | struct colormap *cm; |
269 | chr c; |
270 | |
271 | assert(regex != NULL && regex->re_magic == REMAGIC); |
272 | cm = &((struct guts *) regex->re_guts)->cmap; |
273 | |
274 | if (co <= 0 || co > cm->max || chars_len <= 0) |
275 | return; |
276 | if (cm->cd[co].flags & PSEUDO) |
277 | return; |
278 | |
279 | /* |
280 | * We need only examine the low character map; there should not be any |
281 | * matching entries in the high map. |
282 | */ |
283 | for (c = CHR_MIN; c <= MAX_SIMPLE_CHR; c++) |
284 | { |
285 | if (cm->locolormap[c - CHR_MIN] == co) |
286 | { |
287 | *chars++ = c; |
288 | if (--chars_len == 0) |
289 | break; |
290 | } |
291 | } |
292 | } |
293 | |