1/*-------------------------------------------------------------------------
2 *
3 * regprefix.c
4 * Extract a common prefix, if any, from a compiled regex.
5 *
6 *
7 * Portions Copyright (c) 2012-2019, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1998, 1999 Henry Spencer
9 *
10 * IDENTIFICATION
11 * src/backend/regex/regprefix.c
12 *
13 *-------------------------------------------------------------------------
14 */
15
16#include "regex/regguts.h"
17
18
19/*
20 * forward declarations
21 */
22static int findprefix(struct cnfa *cnfa, struct colormap *cm,
23 chr *string, size_t *slength);
24
25
26/*
27 * pg_regprefix - get common prefix for regular expression
28 *
29 * Returns one of:
30 * REG_NOMATCH: there is no common prefix of strings matching the regex
31 * REG_PREFIX: there is a common prefix of strings matching the regex
32 * REG_EXACT: all strings satisfying the regex must match the same string
33 * or a REG_XXX error code
34 *
35 * In the non-failure cases, *string is set to a malloc'd string containing
36 * the common prefix or exact value, of length *slength (measured in chrs
37 * not bytes!).
38 *
39 * This function does not analyze all complex cases (such as lookaround
40 * constraints) exactly. Therefore it is possible that some strings matching
41 * the reported prefix or exact-match string do not satisfy the regex. But
42 * it should never be the case that a string satisfying the regex does not
43 * match the reported prefix or exact-match string.
44 */
45int
46pg_regprefix(regex_t *re,
47 chr **string,
48 size_t *slength)
49{
50 struct guts *g;
51 struct cnfa *cnfa;
52 int st;
53
54 /* sanity checks */
55 if (string == NULL || slength == NULL)
56 return REG_INVARG;
57 *string = NULL; /* initialize for failure cases */
58 *slength = 0;
59 if (re == NULL || re->re_magic != REMAGIC)
60 return REG_INVARG;
61 if (re->re_csize != sizeof(chr))
62 return REG_MIXED;
63
64 /* Initialize locale-dependent support */
65 pg_set_regex_collation(re->re_collation);
66
67 /* setup */
68 g = (struct guts *) re->re_guts;
69 if (g->info & REG_UIMPOSSIBLE)
70 return REG_NOMATCH;
71
72 /*
73 * This implementation considers only the search NFA for the topmost regex
74 * tree node. Therefore, constraints such as backrefs are not fully
75 * applied, which is allowed per the function's API spec.
76 */
77 assert(g->tree != NULL);
78 cnfa = &g->tree->cnfa;
79
80 /*
81 * Since a correct NFA should never contain any exit-free loops, it should
82 * not be possible for our traversal to return to a previously visited NFA
83 * state. Hence we need at most nstates chrs in the output string.
84 */
85 *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr));
86 if (*string == NULL)
87 return REG_ESPACE;
88
89 /* do it */
90 st = findprefix(cnfa, &g->cmap, *string, slength);
91
92 assert(*slength <= cnfa->nstates);
93
94 /* clean up */
95 if (st != REG_PREFIX && st != REG_EXACT)
96 {
97 FREE(*string);
98 *string = NULL;
99 *slength = 0;
100 }
101
102 return st;
103}
104
105/*
106 * findprefix - extract common prefix from cNFA
107 *
108 * Results are returned into the preallocated chr array string[], with
109 * *slength (which must be preset to zero) incremented for each chr.
110 */
111static int /* regprefix return code */
112findprefix(struct cnfa *cnfa,
113 struct colormap *cm,
114 chr *string,
115 size_t *slength)
116{
117 int st;
118 int nextst;
119 color thiscolor;
120 chr c;
121 struct carc *ca;
122
123 /*
124 * The "pre" state must have only BOS/BOL outarcs, else pattern isn't
125 * anchored left. If we have both BOS and BOL, they must go to the same
126 * next state.
127 */
128 st = cnfa->pre;
129 nextst = -1;
130 for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
131 {
132 if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
133 {
134 if (nextst == -1)
135 nextst = ca->to;
136 else if (nextst != ca->to)
137 return REG_NOMATCH;
138 }
139 else
140 return REG_NOMATCH;
141 }
142 if (nextst == -1)
143 return REG_NOMATCH;
144
145 /*
146 * Scan through successive states, stopping as soon as we find one with
147 * more than one acceptable transition character (either multiple colors
148 * on out-arcs, or a color with more than one member chr).
149 *
150 * We could find a state with multiple out-arcs that are all labeled with
151 * the same singleton color; this comes from patterns like "^ab(cde|cxy)".
152 * In that case we add the chr "c" to the output string but then exit the
153 * loop with nextst == -1. This leaves a little bit on the table: if the
154 * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
155 * to the prefix. But chasing multiple parallel state chains doesn't seem
156 * worth the trouble.
157 */
158 do
159 {
160 st = nextst;
161 nextst = -1;
162 thiscolor = COLORLESS;
163 for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
164 {
165 /* We can ignore BOS/BOL arcs */
166 if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
167 continue;
168 /* ... but EOS/EOL arcs terminate the search, as do LACONs */
169 if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1] ||
170 ca->co >= cnfa->ncolors)
171 {
172 thiscolor = COLORLESS;
173 break;
174 }
175 if (thiscolor == COLORLESS)
176 {
177 /* First plain outarc */
178 thiscolor = ca->co;
179 nextst = ca->to;
180 }
181 else if (thiscolor == ca->co)
182 {
183 /* Another plain outarc for same color */
184 nextst = -1;
185 }
186 else
187 {
188 /* More than one plain outarc color terminates the search */
189 thiscolor = COLORLESS;
190 break;
191 }
192 }
193 /* Done if we didn't find exactly one color on plain outarcs */
194 if (thiscolor == COLORLESS)
195 break;
196 /* The color must be a singleton */
197 if (cm->cd[thiscolor].nschrs != 1)
198 break;
199 /* Must not have any high-color-map entries */
200 if (cm->cd[thiscolor].nuchrs != 0)
201 break;
202
203 /*
204 * Identify the color's sole member chr and add it to the prefix
205 * string. In general the colormap data structure doesn't provide a
206 * way to find color member chrs, except by trying GETCOLOR() on each
207 * possible chr value, which won't do at all. However, for the cases
208 * we care about it should be sufficient to test the "firstchr" value,
209 * that is the first chr ever added to the color. There are cases
210 * where this might no longer be a member of the color (so we do need
211 * to test), but none of them are likely to arise for a character that
212 * is a member of a common prefix. If we do hit such a corner case,
213 * we just fall out without adding anything to the prefix string.
214 */
215 c = cm->cd[thiscolor].firstchr;
216 if (GETCOLOR(cm, c) != thiscolor)
217 break;
218
219 string[(*slength)++] = c;
220
221 /* Advance to next state, but only if we have a unique next state */
222 } while (nextst != -1);
223
224 /*
225 * If we ended at a state that only has EOS/EOL outarcs leading to the
226 * "post" state, then we have an exact-match string. Note this is true
227 * even if the string is of zero length.
228 */
229 nextst = -1;
230 for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
231 {
232 if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
233 {
234 if (nextst == -1)
235 nextst = ca->to;
236 else if (nextst != ca->to)
237 {
238 nextst = -1;
239 break;
240 }
241 }
242 else
243 {
244 nextst = -1;
245 break;
246 }
247 }
248 if (nextst == cnfa->post)
249 return REG_EXACT;
250
251 /*
252 * Otherwise, if we were unable to identify any prefix characters, say
253 * NOMATCH --- the pattern is anchored left, but doesn't specify any
254 * particular first character.
255 */
256 if (*slength > 0)
257 return REG_PREFIX;
258
259 return REG_NOMATCH;
260}
261