regprefix.c source code [PostgreSQL/src/backend/regex/regprefix.c]

1	/-------------------------------------------------------------------------*
2	*
3	* regprefix.c
4	* Extract a common prefix, if any, from a compiled regex.
5	*
6	*
7	* Portions Copyright (c) 2012-2019, PostgreSQL Global Development Group
8	* Portions Copyright (c) 1998, 1999 Henry Spencer
9	*
10	* IDENTIFICATION
11	* src/backend/regex/regprefix.c
12	*
13	*-------------------------------------------------------------------------
14	*/
15
16	#include "regex/regguts.h"
17
18
19	/*
20	* forward declarations
21	*/
22	static int findprefix(struct cnfa cnfa, struct* colormap *cm,
23	chr string, size_t slength);
24
25
26	/*
27	* pg_regprefix - get common prefix for regular expression
28	*
29	* Returns one of:
30	* REG_NOMATCH: there is no common prefix of strings matching the regex
31	* REG_PREFIX: there is a common prefix of strings matching the regex
32	* REG_EXACT: all strings satisfying the regex must match the same string
33	* or a REG_XXX error code
34	*
35	* In the non-failure cases, *string is set to a malloc'd string containing
36	* the common prefix or exact value, of length *slength (measured in chrs
37	* not bytes!).
38	*
39	* This function does not analyze all complex cases (such as lookaround
40	* constraints) exactly. Therefore it is possible that some strings matching
41	* the reported prefix or exact-match string do not satisfy the regex. But
42	* it should never be the case that a string satisfying the regex does not
43	* match the reported prefix or exact-match string.
44	*/
45	int
46	pg_regprefix(regex_t *re,
47	chr **string,
48	size_t *slength)
49	{
50	struct guts *g;
51	struct cnfa *cnfa;
52	int st;
53
54	/ sanity checks /
55	if (string == NULL \|\| slength == NULL)
56	return REG_INVARG;
57	string = NULL; /* initialize for failure cases /
58	*slength = `0`;
59	if (re == NULL \|\| re->re_magic != REMAGIC)
60	return REG_INVARG;
61	if (re->re_csize != sizeof(chr))
62	return REG_MIXED;
63
64	/ Initialize locale-dependent support /
65	pg_set_regex_collation(re->re_collation);
66
67	/ setup /
68	g = (struct guts *) re->re_guts;
69	if (g->info & REG_UIMPOSSIBLE)
70	return REG_NOMATCH;
71
72	/*
73	* This implementation considers only the search NFA for the topmost regex
74	* tree node. Therefore, constraints such as backrefs are not fully
75	* applied, which is allowed per the function's API spec.
76	*/
77	assert(g->tree != NULL);
78	cnfa = &g->tree->cnfa;
79
80	/*
81	* Since a correct NFA should never contain any exit-free loops, it should
82	* not be possible for our traversal to return to a previously visited NFA
83	* state. Hence we need at most nstates chrs in the output string.
84	*/
85	string = (chr ) MALLOC(cnfa->nstates * sizeof(chr));
86	if (*string == NULL)
87	return REG_ESPACE;
88
89	/ do it /
90	st = findprefix(cnfa, &g->cmap, *string, slength);
91
92	assert(*slength <= cnfa->nstates);
93
94	/ clean up /
95	if (st != REG_PREFIX && st != REG_EXACT)
96	{
97	FREE(*string);
98	*string = NULL;
99	*slength = `0`;
100	}
101
102	return st;
103	}
104
105	/*
106	* findprefix - extract common prefix from cNFA
107	*
108	* Results are returned into the preallocated chr array string[], with
109	* *slength (which must be preset to zero) incremented for each chr.
110	*/
111	static int / regprefix return code /
112	findprefix(struct cnfa *cnfa,
113	struct colormap *cm,
114	chr *string,
115	size_t *slength)
116	{
117	int st;
118	int nextst;
119	color thiscolor;
120	chr c;
121	struct carc *ca;
122
123	/*
124	* The "pre" state must have only BOS/BOL outarcs, else pattern isn't
125	* anchored left. If we have both BOS and BOL, they must go to the same
126	* next state.
127	*/
128	st = cnfa->pre;
129	nextst = -`1`;
130	for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
131	{
132	if (ca->co == cnfa->bos[`0`] \|\| ca->co == cnfa->bos[`1`])
133	{
134	if (nextst == -`1`)
135	nextst = ca->to;
136	else if (nextst != ca->to)
137	return REG_NOMATCH;
138	}
139	else
140	return REG_NOMATCH;
141	}
142	if (nextst == -`1`)
143	return REG_NOMATCH;
144
145	/*
146	* Scan through successive states, stopping as soon as we find one with
147	* more than one acceptable transition character (either multiple colors
148	* on out-arcs, or a color with more than one member chr).
149	*
150	* We could find a state with multiple out-arcs that are all labeled with
151	* the same singleton color; this comes from patterns like "^ab(cde\|cxy)".
152	* In that case we add the chr "c" to the output string but then exit the
153	* loop with nextst == -1. This leaves a little bit on the table: if the
154	* pattern is like "^ab(cde\|cdy)", we won't notice that "d" could be added
155	* to the prefix. But chasing multiple parallel state chains doesn't seem
156	* worth the trouble.
157	*/
158	do
159	{
160	st = nextst;
161	nextst = -`1`;
162	thiscolor = COLORLESS;
163	for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
164	{
165	/ We can ignore BOS/BOL arcs /
166	if (ca->co == cnfa->bos[`0`] \|\| ca->co == cnfa->bos[`1`])
167	continue;
168	/ ... but EOS/EOL arcs terminate the search, as do LACONs /
169	if (ca->co == cnfa->eos[`0`] \|\| ca->co == cnfa->eos[`1`] \|\|
170	ca->co >= cnfa->ncolors)
171	{
172	thiscolor = COLORLESS;
173	break;
174	}
175	if (thiscolor == COLORLESS)
176	{
177	/ First plain outarc /
178	thiscolor = ca->co;
179	nextst = ca->to;
180	}
181	else if (thiscolor == ca->co)
182	{
183	/ Another plain outarc for same color /
184	nextst = -`1`;
185	}
186	else
187	{
188	/ More than one plain outarc color terminates the search /
189	thiscolor = COLORLESS;
190	break;
191	}
192	}
193	/ Done if we didn't find exactly one color on plain outarcs /
194	if (thiscolor == COLORLESS)
195	break;
196	/ The color must be a singleton /
197	if (cm->cd[thiscolor].nschrs != `1`)
198	break;
199	/ Must not have any high-color-map entries /
200	if (cm->cd[thiscolor].nuchrs != `0`)
201	break;
202
203	/*
204	* Identify the color's sole member chr and add it to the prefix
205	* string. In general the colormap data structure doesn't provide a
206	* way to find color member chrs, except by trying GETCOLOR() on each
207	* possible chr value, which won't do at all. However, for the cases
208	* we care about it should be sufficient to test the "firstchr" value,
209	* that is the first chr ever added to the color. There are cases
210	* where this might no longer be a member of the color (so we do need
211	* to test), but none of them are likely to arise for a character that
212	* is a member of a common prefix. If we do hit such a corner case,
213	* we just fall out without adding anything to the prefix string.
214	*/
215	c = cm->cd[thiscolor].firstchr;
216	if (GETCOLOR(cm, c) != thiscolor)
217	break;
218
219	string[(*slength)++] = c;
220
221	/ Advance to next state, but only if we have a unique next state /
222	} while (nextst != -`1`);
223
224	/*
225	* If we ended at a state that only has EOS/EOL outarcs leading to the
226	* "post" state, then we have an exact-match string. Note this is true
227	* even if the string is of zero length.
228	*/
229	nextst = -`1`;
230	for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
231	{
232	if (ca->co == cnfa->eos[`0`] \|\| ca->co == cnfa->eos[`1`])
233	{
234	if (nextst == -`1`)
235	nextst = ca->to;
236	else if (nextst != ca->to)
237	{
238	nextst = -`1`;
239	break;
240	}
241	}
242	else
243	{
244	nextst = -`1`;
245	break;
246	}
247	}
248	if (nextst == cnfa->post)
249	return REG_EXACT;
250
251	/*
252	* Otherwise, if we were unable to identify any prefix characters, say
253	* NOMATCH --- the pattern is anchored left, but doesn't specify any
254	* particular first character.
255	*/
256	if (*slength > `0`)
257	return REG_PREFIX;
258
259	return REG_NOMATCH;
260	}
261

Browse the source code of PostgreSQL/src/backend/regex/regprefix.c