1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * regprefix.c |
4 | * Extract a common prefix, if any, from a compiled regex. |
5 | * |
6 | * |
7 | * Portions Copyright (c) 2012-2019, PostgreSQL Global Development Group |
8 | * Portions Copyright (c) 1998, 1999 Henry Spencer |
9 | * |
10 | * IDENTIFICATION |
11 | * src/backend/regex/regprefix.c |
12 | * |
13 | *------------------------------------------------------------------------- |
14 | */ |
15 | |
16 | #include "regex/regguts.h" |
17 | |
18 | |
19 | /* |
20 | * forward declarations |
21 | */ |
22 | static int findprefix(struct cnfa *cnfa, struct colormap *cm, |
23 | chr *string, size_t *slength); |
24 | |
25 | |
26 | /* |
27 | * pg_regprefix - get common prefix for regular expression |
28 | * |
29 | * Returns one of: |
30 | * REG_NOMATCH: there is no common prefix of strings matching the regex |
31 | * REG_PREFIX: there is a common prefix of strings matching the regex |
32 | * REG_EXACT: all strings satisfying the regex must match the same string |
33 | * or a REG_XXX error code |
34 | * |
35 | * In the non-failure cases, *string is set to a malloc'd string containing |
36 | * the common prefix or exact value, of length *slength (measured in chrs |
37 | * not bytes!). |
38 | * |
39 | * This function does not analyze all complex cases (such as lookaround |
40 | * constraints) exactly. Therefore it is possible that some strings matching |
41 | * the reported prefix or exact-match string do not satisfy the regex. But |
42 | * it should never be the case that a string satisfying the regex does not |
43 | * match the reported prefix or exact-match string. |
44 | */ |
45 | int |
46 | pg_regprefix(regex_t *re, |
47 | chr **string, |
48 | size_t *slength) |
49 | { |
50 | struct guts *g; |
51 | struct cnfa *cnfa; |
52 | int st; |
53 | |
54 | /* sanity checks */ |
55 | if (string == NULL || slength == NULL) |
56 | return REG_INVARG; |
57 | *string = NULL; /* initialize for failure cases */ |
58 | *slength = 0; |
59 | if (re == NULL || re->re_magic != REMAGIC) |
60 | return REG_INVARG; |
61 | if (re->re_csize != sizeof(chr)) |
62 | return REG_MIXED; |
63 | |
64 | /* Initialize locale-dependent support */ |
65 | pg_set_regex_collation(re->re_collation); |
66 | |
67 | /* setup */ |
68 | g = (struct guts *) re->re_guts; |
69 | if (g->info & REG_UIMPOSSIBLE) |
70 | return REG_NOMATCH; |
71 | |
72 | /* |
73 | * This implementation considers only the search NFA for the topmost regex |
74 | * tree node. Therefore, constraints such as backrefs are not fully |
75 | * applied, which is allowed per the function's API spec. |
76 | */ |
77 | assert(g->tree != NULL); |
78 | cnfa = &g->tree->cnfa; |
79 | |
80 | /* |
81 | * Since a correct NFA should never contain any exit-free loops, it should |
82 | * not be possible for our traversal to return to a previously visited NFA |
83 | * state. Hence we need at most nstates chrs in the output string. |
84 | */ |
85 | *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr)); |
86 | if (*string == NULL) |
87 | return REG_ESPACE; |
88 | |
89 | /* do it */ |
90 | st = findprefix(cnfa, &g->cmap, *string, slength); |
91 | |
92 | assert(*slength <= cnfa->nstates); |
93 | |
94 | /* clean up */ |
95 | if (st != REG_PREFIX && st != REG_EXACT) |
96 | { |
97 | FREE(*string); |
98 | *string = NULL; |
99 | *slength = 0; |
100 | } |
101 | |
102 | return st; |
103 | } |
104 | |
105 | /* |
106 | * findprefix - extract common prefix from cNFA |
107 | * |
108 | * Results are returned into the preallocated chr array string[], with |
109 | * *slength (which must be preset to zero) incremented for each chr. |
110 | */ |
111 | static int /* regprefix return code */ |
112 | findprefix(struct cnfa *cnfa, |
113 | struct colormap *cm, |
114 | chr *string, |
115 | size_t *slength) |
116 | { |
117 | int st; |
118 | int nextst; |
119 | color thiscolor; |
120 | chr c; |
121 | struct carc *ca; |
122 | |
123 | /* |
124 | * The "pre" state must have only BOS/BOL outarcs, else pattern isn't |
125 | * anchored left. If we have both BOS and BOL, they must go to the same |
126 | * next state. |
127 | */ |
128 | st = cnfa->pre; |
129 | nextst = -1; |
130 | for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) |
131 | { |
132 | if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1]) |
133 | { |
134 | if (nextst == -1) |
135 | nextst = ca->to; |
136 | else if (nextst != ca->to) |
137 | return REG_NOMATCH; |
138 | } |
139 | else |
140 | return REG_NOMATCH; |
141 | } |
142 | if (nextst == -1) |
143 | return REG_NOMATCH; |
144 | |
145 | /* |
146 | * Scan through successive states, stopping as soon as we find one with |
147 | * more than one acceptable transition character (either multiple colors |
148 | * on out-arcs, or a color with more than one member chr). |
149 | * |
150 | * We could find a state with multiple out-arcs that are all labeled with |
151 | * the same singleton color; this comes from patterns like "^ab(cde|cxy)". |
152 | * In that case we add the chr "c" to the output string but then exit the |
153 | * loop with nextst == -1. This leaves a little bit on the table: if the |
154 | * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added |
155 | * to the prefix. But chasing multiple parallel state chains doesn't seem |
156 | * worth the trouble. |
157 | */ |
158 | do |
159 | { |
160 | st = nextst; |
161 | nextst = -1; |
162 | thiscolor = COLORLESS; |
163 | for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) |
164 | { |
165 | /* We can ignore BOS/BOL arcs */ |
166 | if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1]) |
167 | continue; |
168 | /* ... but EOS/EOL arcs terminate the search, as do LACONs */ |
169 | if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1] || |
170 | ca->co >= cnfa->ncolors) |
171 | { |
172 | thiscolor = COLORLESS; |
173 | break; |
174 | } |
175 | if (thiscolor == COLORLESS) |
176 | { |
177 | /* First plain outarc */ |
178 | thiscolor = ca->co; |
179 | nextst = ca->to; |
180 | } |
181 | else if (thiscolor == ca->co) |
182 | { |
183 | /* Another plain outarc for same color */ |
184 | nextst = -1; |
185 | } |
186 | else |
187 | { |
188 | /* More than one plain outarc color terminates the search */ |
189 | thiscolor = COLORLESS; |
190 | break; |
191 | } |
192 | } |
193 | /* Done if we didn't find exactly one color on plain outarcs */ |
194 | if (thiscolor == COLORLESS) |
195 | break; |
196 | /* The color must be a singleton */ |
197 | if (cm->cd[thiscolor].nschrs != 1) |
198 | break; |
199 | /* Must not have any high-color-map entries */ |
200 | if (cm->cd[thiscolor].nuchrs != 0) |
201 | break; |
202 | |
203 | /* |
204 | * Identify the color's sole member chr and add it to the prefix |
205 | * string. In general the colormap data structure doesn't provide a |
206 | * way to find color member chrs, except by trying GETCOLOR() on each |
207 | * possible chr value, which won't do at all. However, for the cases |
208 | * we care about it should be sufficient to test the "firstchr" value, |
209 | * that is the first chr ever added to the color. There are cases |
210 | * where this might no longer be a member of the color (so we do need |
211 | * to test), but none of them are likely to arise for a character that |
212 | * is a member of a common prefix. If we do hit such a corner case, |
213 | * we just fall out without adding anything to the prefix string. |
214 | */ |
215 | c = cm->cd[thiscolor].firstchr; |
216 | if (GETCOLOR(cm, c) != thiscolor) |
217 | break; |
218 | |
219 | string[(*slength)++] = c; |
220 | |
221 | /* Advance to next state, but only if we have a unique next state */ |
222 | } while (nextst != -1); |
223 | |
224 | /* |
225 | * If we ended at a state that only has EOS/EOL outarcs leading to the |
226 | * "post" state, then we have an exact-match string. Note this is true |
227 | * even if the string is of zero length. |
228 | */ |
229 | nextst = -1; |
230 | for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) |
231 | { |
232 | if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1]) |
233 | { |
234 | if (nextst == -1) |
235 | nextst = ca->to; |
236 | else if (nextst != ca->to) |
237 | { |
238 | nextst = -1; |
239 | break; |
240 | } |
241 | } |
242 | else |
243 | { |
244 | nextst = -1; |
245 | break; |
246 | } |
247 | } |
248 | if (nextst == cnfa->post) |
249 | return REG_EXACT; |
250 | |
251 | /* |
252 | * Otherwise, if we were unable to identify any prefix characters, say |
253 | * NOMATCH --- the pattern is anchored left, but doesn't specify any |
254 | * particular first character. |
255 | */ |
256 | if (*slength > 0) |
257 | return REG_PREFIX; |
258 | |
259 | return REG_NOMATCH; |
260 | } |
261 | |