regexec.c source code [PostgreSQL/src/backend/regex/regexec.c]

1	/*
2	* re_*exec and friends - match REs
3	*
4	* Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
5	*
6	* Development of this software was funded, in part, by Cray Research Inc.,
7	* UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
8	* Corporation, none of whom are responsible for the results. The author
9	* thanks all of them.
10	*
11	* Redistribution and use in source and binary forms -- with or without
12	* modification -- are permitted for any purpose, provided that
13	* redistributions in source form retain this entire copyright notice and
14	* indicate the origin and nature of any modifications.
15	*
16	* I'd appreciate being given credit for this package in the documentation
17	* of software which uses it, but that is not a requirement.
18	*
19	* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
20	* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
21	* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
22	* HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25	* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26	* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27	* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28	* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29	*
30	* src/backend/regex/regexec.c
31	*
32	*/
33
34	#include "regex/regguts.h"
35
36
37
38	/ lazy-DFA representation /
39	struct arcp
40	{ / "pointer" to an outarc /
41	struct sset *ss;
42	color co;
43	};
44
45	struct sset
46	{ / state set /
47	unsigned states; /* pointer to bitvector /
48	unsigned hash; / hash of bitvector /
49	#define HASH(bv, nw) (((nw) == 1) ? *(bv) : hash(bv, nw))
50	#define HIT(h,bv,ss,nw) ((ss)->hash == (h) && ((nw) == 1 \|\| \
51	memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0))
52	int flags;
53	#define STARTER 01 /* the initial state set */
54	#define POSTSTATE 02 /* includes the goal state */
55	#define LOCKED 04 /* locked in cache */
56	#define NOPROGRESS 010 /* zero-progress state set */
57	struct arcp ins; / chain of inarcs pointing here /
58	chr lastseen; /* last entered on arrival here /
59	struct sset *outs; /* outarc vector indexed by color /
60	struct arcp inchain; /* chain-pointer vector for outarcs /
61	};
62
63	struct dfa
64	{
65	int nssets; / size of cache /
66	int nssused; / how many entries occupied yet /
67	int nstates; / number of states /
68	int ncolors; / length of outarc and inchain vectors /
69	int wordsper; / length of state-set bitvectors /
70	struct sset ssets; /* state-set cache /
71	unsigned statesarea; /* bitvector storage /
72	unsigned work; /* pointer to work area within statesarea /
73	struct sset *outsarea; /* outarc-vector storage /
74	struct arcp incarea; /* inchain storage /
75	struct cnfa *cnfa;
76	struct colormap *cm;
77	chr lastpost; /* location of last cache-flushed success /
78	chr lastnopr; /* location of last cache-flushed NOPROGRESS /
79	struct sset search; /* replacement-search-pointer memory /
80	int cptsmalloced; / were the areas individually malloced? /
81	char mallocarea; /* self, or master malloced area, or NULL /
82	};
83
84	#define WORK 1 /* number of work bitvectors needed */
85
86	/ setup for non-malloc allocation for small cases /
87	#define FEWSTATES 20 /* must be less than UBITS */
88	#define FEWCOLORS 15
89	struct smalldfa
90	{
91	struct dfa dfa;
92	struct sset ssets[FEWSTATES * `2`];
93	unsigned statesarea[FEWSTATES * `2` + WORK];
94	struct sset outsarea[FEWSTATES `2` * FEWCOLORS];
95	struct arcp incarea[FEWSTATES * `2` * FEWCOLORS];
96	};
97
98	#define DOMALLOC ((struct smalldfa )NULL) / force malloc */
99
100
101
102	/ internal variables, bundled for easy passing around /
103	struct vars
104	{
105	regex_t *re;
106	struct guts *g;
107	int eflags; / copies of arguments /
108	size_t nmatch;
109	regmatch_t *pmatch;
110	rm_detail_t *details;
111	chr start; /* start of string /
112	chr search_start; /* search start of string /
113	chr stop; /* just past end of string /
114	int err; / error code if any (0 none) /
115	struct dfa *subdfas; /* per-tree-subre DFAs /
116	struct dfa *ladfas; /* per-lacon-subre DFAs /
117	struct sset *lblastcss; /* per-lacon-subre lookbehind restart data /
118	chr *lblastcp; /* per-lacon-subre lookbehind restart data /
119	struct smalldfa dfa1;
120	struct smalldfa dfa2;
121	};
122
123	#define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */
124	#define ISERR() VISERR(v)
125	#define VERR(vv,e) ((vv)->err = ((vv)->err ? (vv)->err : (e)))
126	#define ERR(e) VERR(v, e) /* record an error */
127	#define NOERR() {if (ISERR()) return v->err;} /* if error seen, return it */
128	#define OFF(p) ((p) - v->start)
129	#define LOFF(p) ((long)OFF(p))
130
131
132
133	/*
134	* forward declarations
135	*/
136	/ === regexec.c === /
137	static struct dfa getsubdfa(struct* vars , struct* subre *);
138	static struct dfa getladfa(struct* vars , int*);
139	static int find(struct vars , struct* cnfa , struct* colormap *);
140	static int cfind(struct vars , struct* cnfa , struct* colormap *);
141	static int cfindloop(struct vars , struct* cnfa , struct* colormap , struct* dfa , struct* dfa , chr *);
142	static void zapallsubs(regmatch_t *, size_t);
143	static void zaptreesubs(struct vars , struct* subre *);
144	static void subset(struct vars , struct* subre , chr , chr *);
145	static int cdissect(struct vars , struct* subre , chr , chr *);
146	static int ccondissect(struct vars , struct* subre , chr , chr *);
147	static int crevcondissect(struct vars , struct* subre , chr , chr *);
148	static int cbrdissect(struct vars , struct* subre , chr , chr *);
149	static int caltdissect(struct vars , struct* subre , chr , chr *);
150	static int citerdissect(struct vars , struct* subre , chr , chr *);
151	static int creviterdissect(struct vars , struct* subre , chr , chr *);
152
153	/ === rege_dfa.c === /
154	static chr longest(struct* vars , struct* dfa , chr , chr , int* *);
155	static chr shortest(struct* vars , struct* dfa , chr , chr , chr , chr *, int* *);
156	static int matchuntil(struct vars , struct* dfa , chr , struct sset , chr );
157	static chr lastcold(struct* vars , struct* dfa *);
158	static struct dfa newdfa(struct* vars , struct* cnfa , struct* colormap , struct* smalldfa *);
159	static void freedfa(struct dfa *);
160	static unsigned hash(unsigned , int*);
161	static struct sset initialize(struct* vars , struct* dfa , chr );
162	static struct sset miss(struct* vars , struct* dfa , struct* sset , color, chr , chr *);
163	static int lacon(struct vars , struct* cnfa , chr , color);
164	static struct sset getvacant(struct* vars , struct* dfa , chr , chr *);
165	static struct sset pickss(struct* vars , struct* dfa , chr , chr *);
166
167
168	/*
169	* pg_regexec - match regular expression
170	*/
171	int
172	pg_regexec(regex_t *re,
173	const chr *string,
174	size_t len,
175	size_t search_start,
176	rm_detail_t *details,
177	size_t nmatch,
178	regmatch_t pmatch[],
179	int flags)
180	{
181	struct vars var;
182	register struct vars *v = &var;
183	int st;
184	size_t n;
185	size_t i;
186	int backref;
187
188	#define LOCALMAT 20
189	regmatch_t mat[LOCALMAT];
190
191	#define LOCALDFAS 40
192	struct dfa *subdfas[LOCALDFAS];
193
194	/ sanity checks /
195	if (re == NULL \|\| string == NULL \|\| re->re_magic != REMAGIC)
196	return REG_INVARG;
197	if (re->re_csize != sizeof(chr))
198	return REG_MIXED;
199
200	/ Initialize locale-dependent support /
201	pg_set_regex_collation(re->re_collation);
202
203	/ setup /
204	v->re = re;
205	v->g = (struct guts *) re->re_guts;
206	if ((v->g->cflags & REG_EXPECT) && details == NULL)
207	return REG_INVARG;
208	if (v->g->info & REG_UIMPOSSIBLE)
209	return REG_NOMATCH;
210	backref = (v->g->info & REG_UBACKREF) ? `1` : `0`;
211	v->eflags = flags;
212	if (v->g->cflags & REG_NOSUB)
213	nmatch = `0`; / override client /
214	v->nmatch = nmatch;
215	if (backref)
216	{
217	/ need work area /
218	if (v->g->nsub + `1` <= LOCALMAT)
219	v->pmatch = mat;
220	else
221	v->pmatch = (regmatch_t ) MALLOC((v->g->nsub + `1`)
222	sizeof(regmatch_t));
223	if (v->pmatch == NULL)
224	return REG_ESPACE;
225	v->nmatch = v->g->nsub + `1`;
226	}
227	else
228	v->pmatch = pmatch;
229	v->details = details;
230	v->start = (chr *) string;
231	v->search_start = (chr *) string + search_start;
232	v->stop = (chr *) string + len;
233	v->err = `0`;
234	v->subdfas = NULL;
235	v->ladfas = NULL;
236	v->lblastcss = NULL;
237	v->lblastcp = NULL;
238	/ below this point, "goto cleanup" will behave sanely /
239
240	assert(v->g->ntree >= `0`);
241	n = (size_t) v->g->ntree;
242	if (n <= LOCALDFAS)
243	v->subdfas = subdfas;
244	else
245	{
246	v->subdfas = (struct dfa *) MALLOC(n sizeof(struct dfa *));
247	if (v->subdfas == NULL)
248	{
249	st = REG_ESPACE;
250	goto cleanup;
251	}
252	}
253	for (i = `0`; i < n; i++)
254	v->subdfas[i] = NULL;
255
256	assert(v->g->nlacons >= `0`);
257	n = (size_t) v->g->nlacons;
258	if (n > `0`)
259	{
260	v->ladfas = (struct dfa *) MALLOC(n sizeof(struct dfa *));
261	if (v->ladfas == NULL)
262	{
263	st = REG_ESPACE;
264	goto cleanup;
265	}
266	for (i = `0`; i < n; i++)
267	v->ladfas[i] = NULL;
268	v->lblastcss = (struct sset *) MALLOC(n sizeof(struct sset *));
269	v->lblastcp = (chr *) MALLOC(n sizeof(chr *));
270	if (v->lblastcss == NULL \|\| v->lblastcp == NULL)
271	{
272	st = REG_ESPACE;
273	goto cleanup;
274	}
275	for (i = `0`; i < n; i++)
276	{
277	v->lblastcss[i] = NULL;
278	v->lblastcp[i] = NULL;
279	}
280	}
281
282	/ do it /
283	assert(v->g->tree != NULL);
284	if (backref)
285	st = cfind(v, &v->g->tree->cnfa, &v->g->cmap);
286	else
287	st = find(v, &v->g->tree->cnfa, &v->g->cmap);
288
289	/ copy (portion of) match vector over if necessary /
290	if (st == REG_OKAY && v->pmatch != pmatch && nmatch > `0`)
291	{
292	zapallsubs(pmatch, nmatch);
293	n = (nmatch < v->nmatch) ? nmatch : v->nmatch;
294	memcpy(VS(pmatch), VS(v->pmatch), n * sizeof(regmatch_t));
295	}
296
297	/ clean up /
298	cleanup:
299	if (v->pmatch != pmatch && v->pmatch != mat)
300	FREE(v->pmatch);
301	if (v->subdfas != NULL)
302	{
303	n = (size_t) v->g->ntree;
304	for (i = `0`; i < n; i++)
305	{
306	if (v->subdfas[i] != NULL)
307	freedfa(v->subdfas[i]);
308	}
309	if (v->subdfas != subdfas)
310	FREE(v->subdfas);
311	}
312	if (v->ladfas != NULL)
313	{
314	n = (size_t) v->g->nlacons;
315	for (i = `0`; i < n; i++)
316	{
317	if (v->ladfas[i] != NULL)
318	freedfa(v->ladfas[i]);
319	}
320	FREE(v->ladfas);
321	}
322	if (v->lblastcss != NULL)
323	FREE(v->lblastcss);
324	if (v->lblastcp != NULL)
325	FREE(v->lblastcp);
326
327	return st;
328	}
329
330	/*
331	* getsubdfa - create or re-fetch the DFA for a tree subre node
332	*
333	* We only need to create the DFA once per overall regex execution.
334	* The DFA will be freed by the cleanup step in pg_regexec().
335	*/
336	static struct dfa *
337	getsubdfa(struct vars *v,
338	struct subre *t)
339	{
340	if (v->subdfas[t->id] == NULL)
341	{
342	v->subdfas[t->id] = newdfa(v, &t->cnfa, &v->g->cmap, DOMALLOC);
343	if (ISERR())
344	return NULL;
345	}
346	return v->subdfas[t->id];
347	}
348
349	/*
350	* getladfa - create or re-fetch the DFA for a LACON subre node
351	*
352	* Same as above, but for LACONs.
353	*/
354	static struct dfa *
355	getladfa(struct vars *v,
356	int n)
357	{
358	assert(n > `0` && n < v->g->nlacons && v->g->lacons != NULL);
359
360	if (v->ladfas[n] == NULL)
361	{
362	struct subre *sub = &v->g->lacons[n];
363
364	v->ladfas[n] = newdfa(v, &sub->cnfa, &v->g->cmap, DOMALLOC);
365	if (ISERR())
366	return NULL;
367	}
368	return v->ladfas[n];
369	}
370
371	/*
372	* find - find a match for the main NFA (no-complications case)
373	*/
374	static int
375	find(struct vars *v,
376	struct cnfa *cnfa,
377	struct colormap *cm)
378	{
379	struct dfa *s;
380	struct dfa *d;
381	chr *begin;
382	chr *end = NULL;
383	chr *cold;
384	chr open; /* open and close of range of possible starts /
385	chr *close;
386	int hitend;
387	int shorter = (v->g->tree->flags & SHORTER) ? `1` : `0`;
388
389	/ first, a shot with the search RE /
390	s = newdfa(v, &v->g->search, cm, &v->dfa1);
391	assert(!(ISERR() && s != NULL));
392	NOERR();
393	MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
394	cold = NULL;
395	close = shortest(v, s, v->search_start, v->search_start, v->stop,
396	&cold, (int *) NULL);
397	freedfa(s);
398	NOERR();
399	if (v->g->cflags & REG_EXPECT)
400	{
401	assert(v->details != NULL);
402	if (cold != NULL)
403	v->details->rm_extend.rm_so = OFF(cold);
404	else
405	v->details->rm_extend.rm_so = OFF(v->stop);
406	v->details->rm_extend.rm_eo = OFF(v->stop); / unknown /
407	}
408	if (close == NULL) / not found /
409	return REG_NOMATCH;
410	if (v->nmatch == `0`) / found, don't need exact location /
411	return REG_OKAY;
412
413	/ find starting point and match /
414	assert(cold != NULL);
415	open = cold;
416	cold = NULL;
417	MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close)));
418	d = newdfa(v, cnfa, cm, &v->dfa1);
419	assert(!(ISERR() && d != NULL));
420	NOERR();
421	for (begin = open; begin <= close; begin++)
422	{
423	MDEBUG(("\nfind trying at %ld\n", LOFF(begin)));
424	if (shorter)
425	end = shortest(v, d, begin, begin, v->stop,
426	(chr **) NULL, &hitend);
427	else
428	end = longest(v, d, begin, v->stop, &hitend);
429	if (ISERR())
430	{
431	freedfa(d);
432	return v->err;
433	}
434	if (hitend && cold == NULL)
435	cold = begin;
436	if (end != NULL)
437	break; / NOTE BREAK OUT /
438	}
439	assert(end != NULL); / search RE succeeded so loop should /
440	freedfa(d);
441
442	/ and pin down details /
443	assert(v->nmatch > `0`);
444	v->pmatch[`0`].rm_so = OFF(begin);
445	v->pmatch[`0`].rm_eo = OFF(end);
446	if (v->g->cflags & REG_EXPECT)
447	{
448	if (cold != NULL)
449	v->details->rm_extend.rm_so = OFF(cold);
450	else
451	v->details->rm_extend.rm_so = OFF(v->stop);
452	v->details->rm_extend.rm_eo = OFF(v->stop); / unknown /
453	}
454	if (v->nmatch == `1`) / no need for submatches /
455	return REG_OKAY;
456
457	/ find submatches /
458	zapallsubs(v->pmatch, v->nmatch);
459	return cdissect(v, v->g->tree, begin, end);
460	}
461
462	/*
463	* cfind - find a match for the main NFA (with complications)
464	*/
465	static int
466	cfind(struct vars *v,
467	struct cnfa *cnfa,
468	struct colormap *cm)
469	{
470	struct dfa *s;
471	struct dfa *d;
472	chr *cold;
473	int ret;
474
475	s = newdfa(v, &v->g->search, cm, &v->dfa1);
476	NOERR();
477	d = newdfa(v, cnfa, cm, &v->dfa2);
478	if (ISERR())
479	{
480	assert(d == NULL);
481	freedfa(s);
482	return v->err;
483	}
484
485	ret = cfindloop(v, cnfa, cm, d, s, &cold);
486
487	freedfa(d);
488	freedfa(s);
489	NOERR();
490	if (v->g->cflags & REG_EXPECT)
491	{
492	assert(v->details != NULL);
493	if (cold != NULL)
494	v->details->rm_extend.rm_so = OFF(cold);
495	else
496	v->details->rm_extend.rm_so = OFF(v->stop);
497	v->details->rm_extend.rm_eo = OFF(v->stop); / unknown /
498	}
499	return ret;
500	}
501
502	/*
503	* cfindloop - the heart of cfind
504	*/
505	static int
506	cfindloop(struct vars *v,
507	struct cnfa *cnfa,
508	struct colormap *cm,
509	struct dfa *d,
510	struct dfa *s,
511	chr *coldp) /* where to put coldstart pointer /
512	{
513	chr *begin;
514	chr *end;
515	chr *cold;
516	chr open; /* open and close of range of possible starts /
517	chr *close;
518	chr *estart;
519	chr *estop;
520	int er;
521	int shorter = v->g->tree->flags & SHORTER;
522	int hitend;
523
524	assert(d != NULL && s != NULL);
525	cold = NULL;
526	close = v->search_start;
527	do
528	{
529	/ Search with the search RE for match range at/beyond "close" /
530	MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
531	close = shortest(v, s, close, close, v->stop, &cold, (int *) NULL);
532	if (ISERR())
533	{
534	*coldp = cold;
535	return v->err;
536	}
537	if (close == NULL)
538	break; / no more possible match anywhere /
539	assert(cold != NULL);
540	open = cold;
541	cold = NULL;
542	/ Search for matches starting between "open" and "close" inclusive /
543	MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close)));
544	for (begin = open; begin <= close; begin++)
545	{
546	MDEBUG(("\ncfind trying at %ld\n", LOFF(begin)));
547	estart = begin;
548	estop = v->stop;
549	for (;;)
550	{
551	/ Here we use the top node's detailed RE /
552	if (shorter)
553	end = shortest(v, d, begin, estart,
554	estop, (chr **) NULL, &hitend);
555	else
556	end = longest(v, d, begin, estop,
557	&hitend);
558	if (ISERR())
559	{
560	*coldp = cold;
561	return v->err;
562	}
563	if (hitend && cold == NULL)
564	cold = begin;
565	if (end == NULL)
566	break; / no match with this begin point, try next /
567	MDEBUG(("tentative end %ld\n", LOFF(end)));
568	/ Dissect the potential match to see if it really matches /
569	zapallsubs(v->pmatch, v->nmatch);
570	er = cdissect(v, v->g->tree, begin, end);
571	if (er == REG_OKAY)
572	{
573	if (v->nmatch > `0`)
574	{
575	v->pmatch[`0`].rm_so = OFF(begin);
576	v->pmatch[`0`].rm_eo = OFF(end);
577	}
578	*coldp = cold;
579	return REG_OKAY;
580	}
581	if (er != REG_NOMATCH)
582	{
583	ERR(er);
584	*coldp = cold;
585	return er;
586	}
587	/ Try next longer/shorter match with same begin point /
588	if (shorter)
589	{
590	if (end == estop)
591	break; / no more, so try next begin point /
592	estart = end + `1`;
593	}
594	else
595	{
596	if (end == begin)
597	break; / no more, so try next begin point /
598	estop = end - `1`;
599	}
600	} / end loop over endpoint positions /
601	} / end loop over beginning positions /
602
603	/*
604	* If we get here, there is no possible match starting at or before
605	* "close", so consider matches beyond that. We'll do a fresh search
606	* with the search RE to find a new promising match range.
607	*/
608	close++;
609	} while (close < v->stop);
610
611	*coldp = cold;
612	return REG_NOMATCH;
613	}
614
615	/*
616	* zapallsubs - initialize all subexpression matches to "no match"
617	*/
618	static void
619	zapallsubs(regmatch_t *p,
620	size_t n)
621	{
622	size_t i;
623
624	for (i = n - `1`; i > `0`; i--)
625	{
626	p[i].rm_so = -`1`;
627	p[i].rm_eo = -`1`;
628	}
629	}
630
631	/*
632	* zaptreesubs - initialize subexpressions within subtree to "no match"
633	*/
634	static void
635	zaptreesubs(struct vars *v,
636	struct subre *t)
637	{
638	if (t->op == `'('`)
639	{
640	int n = t->subno;
641
642	assert(n > `0`);
643	if ((size_t) n < v->nmatch)
644	{
645	v->pmatch[n].rm_so = -`1`;
646	v->pmatch[n].rm_eo = -`1`;
647	}
648	}
649
650	if (t->left != NULL)
651	zaptreesubs(v, t->left);
652	if (t->right != NULL)
653	zaptreesubs(v, t->right);
654	}
655
656	/*
657	* subset - set subexpression match data for a successful subre
658	*/
659	static void
660	subset(struct vars *v,
661	struct subre *sub,
662	chr *begin,
663	chr *end)
664	{
665	int n = sub->subno;
666
667	assert(n > `0`);
668	if ((size_t) n >= v->nmatch)
669	return;
670
671	MDEBUG(("setting %d\n", n));
672	v->pmatch[n].rm_so = OFF(begin);
673	v->pmatch[n].rm_eo = OFF(end);
674	}
675
676	/*
677	* cdissect - check backrefs and determine subexpression matches
678	*
679	* cdissect recursively processes a subre tree to check matching of backrefs
680	* and/or identify submatch boundaries for capture nodes. The proposed match
681	* runs from "begin" to "end" (not including "end"), and we are basically
682	* "dissecting" it to see where the submatches are.
683	*
684	* Before calling any level of cdissect, the caller must have run the node's
685	* DFA and found that the proposed substring satisfies the DFA. (We make
686	* the caller do that because in concatenation and iteration nodes, it's
687	* much faster to check all the substrings against the child DFAs before we
688	* recurse.) Also, caller must have cleared subexpression match data via
689	* zaptreesubs (or zapallsubs at the top level).
690	*/
691	static int / regexec return code /
692	cdissect(struct vars *v,
693	struct subre *t,
694	chr begin, /* beginning of relevant substring /
695	chr end) /* end of same /
696	{
697	int er;
698
699	assert(t != NULL);
700	MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op));
701
702	/ handy place to check for operation cancel /
703	if (CANCEL_REQUESTED(v->re))
704	return REG_CANCEL;
705	/ ... and stack overrun /
706	if (STACK_TOO_DEEP(v->re))
707	return REG_ETOOBIG;
708
709	switch (t->op)
710	{
711	case `'='`: / terminal node /
712	assert(t->left == NULL && t->right == NULL);
713	er = REG_OKAY; / no action, parent did the work /
714	break;
715	case `'b'`: / back reference /
716	assert(t->left == NULL && t->right == NULL);
717	er = cbrdissect(v, t, begin, end);
718	break;
719	case `'.'`: / concatenation /
720	assert(t->left != NULL && t->right != NULL);
721	if (t->left->flags & SHORTER) / reverse scan /
722	er = crevcondissect(v, t, begin, end);
723	else
724	er = ccondissect(v, t, begin, end);
725	break;
726	case `'\|'`: / alternation /
727	assert(t->left != NULL);
728	er = caltdissect(v, t, begin, end);
729	break;
730	case `''`: /* iteration /
731	assert(t->left != NULL);
732	if (t->left->flags & SHORTER) / reverse scan /
733	er = creviterdissect(v, t, begin, end);
734	else
735	er = citerdissect(v, t, begin, end);
736	break;
737	case `'('`: / capturing /
738	assert(t->left != NULL && t->right == NULL);
739	assert(t->subno > `0`);
740	er = cdissect(v, t->left, begin, end);
741	if (er == REG_OKAY)
742	subset(v, t, begin, end);
743	break;
744	default:
745	er = REG_ASSERT;
746	break;
747	}
748
749	/*
750	* We should never have a match failure unless backrefs lurk below;
751	* otherwise, either caller failed to check the DFA, or there's some
752	* inconsistency between the DFA and the node's innards.
753	*/
754	assert(er != REG_NOMATCH \|\| (t->flags & BACKR));
755
756	return er;
757	}
758
759	/*
760	* ccondissect - dissect match for concatenation node
761	*/
762	static int / regexec return code /
763	ccondissect(struct vars *v,
764	struct subre *t,
765	chr begin, /* beginning of relevant substring /
766	chr end) /* end of same /
767	{
768	struct dfa *d;
769	struct dfa *d2;
770	chr *mid;
771	int er;
772
773	assert(t->op == `'.'`);
774	assert(t->left != NULL && t->left->cnfa.nstates > `0`);
775	assert(t->right != NULL && t->right->cnfa.nstates > `0`);
776	assert(!(t->left->flags & SHORTER));
777
778	d = getsubdfa(v, t->left);
779	NOERR();
780	d2 = getsubdfa(v, t->right);
781	NOERR();
782	MDEBUG(("cconcat %d\n", t->id));
783
784	/ pick a tentative midpoint /
785	mid = longest(v, d, begin, end, (int *) NULL);
786	NOERR();
787	if (mid == NULL)
788	return REG_NOMATCH;
789	MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
790
791	/ iterate until satisfaction or failure /
792	for (;;)
793	{
794	/ try this midpoint on for size /
795	if (longest(v, d2, mid, end, (int *) NULL) == end)
796	{
797	er = cdissect(v, t->left, begin, mid);
798	if (er == REG_OKAY)
799	{
800	er = cdissect(v, t->right, mid, end);
801	if (er == REG_OKAY)
802	{
803	/ satisfaction /
804	MDEBUG(("successful\n"));
805	return REG_OKAY;
806	}
807	}
808	if (er != REG_NOMATCH)
809	return er;
810	}
811	NOERR();
812
813	/ that midpoint didn't work, find a new one /
814	if (mid == begin)
815	{
816	/ all possibilities exhausted /
817	MDEBUG(("%d no midpoint\n", t->id));
818	return REG_NOMATCH;
819	}
820	mid = longest(v, d, begin, mid - `1`, (int *) NULL);
821	NOERR();
822	if (mid == NULL)
823	{
824	/ failed to find a new one /
825	MDEBUG(("%d failed midpoint\n", t->id));
826	return REG_NOMATCH;
827	}
828	MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
829	zaptreesubs(v, t->left);
830	zaptreesubs(v, t->right);
831	}
832
833	/ can't get here /
834	return REG_ASSERT;
835	}
836
837	/*
838	* crevcondissect - dissect match for concatenation node, shortest-first
839	*/
840	static int / regexec return code /
841	crevcondissect(struct vars *v,
842	struct subre *t,
843	chr begin, /* beginning of relevant substring /
844	chr end) /* end of same /
845	{
846	struct dfa *d;
847	struct dfa *d2;
848	chr *mid;
849	int er;
850
851	assert(t->op == `'.'`);
852	assert(t->left != NULL && t->left->cnfa.nstates > `0`);
853	assert(t->right != NULL && t->right->cnfa.nstates > `0`);
854	assert(t->left->flags & SHORTER);
855
856	d = getsubdfa(v, t->left);
857	NOERR();
858	d2 = getsubdfa(v, t->right);
859	NOERR();
860	MDEBUG(("crevcon %d\n", t->id));
861
862	/ pick a tentative midpoint /
863	mid = shortest(v, d, begin, begin, end, (chr *) NULL, (int* *) NULL);
864	NOERR();
865	if (mid == NULL)
866	return REG_NOMATCH;
867	MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
868
869	/ iterate until satisfaction or failure /
870	for (;;)
871	{
872	/ try this midpoint on for size /
873	if (longest(v, d2, mid, end, (int *) NULL) == end)
874	{
875	er = cdissect(v, t->left, begin, mid);
876	if (er == REG_OKAY)
877	{
878	er = cdissect(v, t->right, mid, end);
879	if (er == REG_OKAY)
880	{
881	/ satisfaction /
882	MDEBUG(("successful\n"));
883	return REG_OKAY;
884	}
885	}
886	if (er != REG_NOMATCH)
887	return er;
888	}
889	NOERR();
890
891	/ that midpoint didn't work, find a new one /
892	if (mid == end)
893	{
894	/ all possibilities exhausted /
895	MDEBUG(("%d no midpoint\n", t->id));
896	return REG_NOMATCH;
897	}
898	mid = shortest(v, d, begin, mid + `1`, end, (chr *) NULL, (int* *) NULL);
899	NOERR();
900	if (mid == NULL)
901	{
902	/ failed to find a new one /
903	MDEBUG(("%d failed midpoint\n", t->id));
904	return REG_NOMATCH;
905	}
906	MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
907	zaptreesubs(v, t->left);
908	zaptreesubs(v, t->right);
909	}
910
911	/ can't get here /
912	return REG_ASSERT;
913	}
914
915	/*
916	* cbrdissect - dissect match for backref node
917	*/
918	static int / regexec return code /
919	cbrdissect(struct vars *v,
920	struct subre *t,
921	chr begin, /* beginning of relevant substring /
922	chr end) /* end of same /
923	{
924	int n = t->subno;
925	size_t numreps;
926	size_t tlen;
927	size_t brlen;
928	chr *brstring;
929	chr *p;
930	int min = t->min;
931	int max = t->max;
932
933	assert(t != NULL);
934	assert(t->op == `'b'`);
935	assert(n >= `0`);
936	assert((size_t) n < v->nmatch);
937
938	MDEBUG(("cbackref n%d %d{%d-%d}\n", t->id, n, min, max));
939
940	/ get the backreferenced string /
941	if (v->pmatch[n].rm_so == -`1`)
942	return REG_NOMATCH;
943	brstring = v->start + v->pmatch[n].rm_so;
944	brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
945
946	/ special cases for zero-length strings /
947	if (brlen == `0`)
948	{
949	/*
950	* matches only if target is zero length, but any number of
951	* repetitions can be considered to be present
952	*/
953	if (begin == end && min <= max)
954	{
955	MDEBUG(("cbackref matched trivially\n"));
956	return REG_OKAY;
957	}
958	return REG_NOMATCH;
959	}
960	if (begin == end)
961	{
962	/ matches only if zero repetitions are okay /
963	if (min == `0`)
964	{
965	MDEBUG(("cbackref matched trivially\n"));
966	return REG_OKAY;
967	}
968	return REG_NOMATCH;
969	}
970
971	/*
972	* check target length to see if it could possibly be an allowed number of
973	* repetitions of brstring
974	*/
975	assert(end > begin);
976	tlen = end - begin;
977	if (tlen % brlen != `0`)
978	return REG_NOMATCH;
979	numreps = tlen / brlen;
980	if (numreps < min \|\| (numreps > max && max != DUPINF))
981	return REG_NOMATCH;
982
983	/ okay, compare the actual string contents /
984	p = begin;
985	while (numreps-- > `0`)
986	{
987	if ((*v->g->compare) (brstring, p, brlen) != `0`)
988	return REG_NOMATCH;
989	p += brlen;
990	}
991
992	MDEBUG(("cbackref matched\n"));
993	return REG_OKAY;
994	}
995
996	/*
997	* caltdissect - dissect match for alternation node
998	*/
999	static int / regexec return code /
1000	caltdissect(struct vars *v,
1001	struct subre *t,
1002	chr begin, /* beginning of relevant substring /
1003	chr end) /* end of same /
1004	{
1005	struct dfa *d;
1006	int er;
1007
1008	/ We loop, rather than tail-recurse, to handle a chain of alternatives /
1009	while (t != NULL)
1010	{
1011	assert(t->op == `'\|'`);
1012	assert(t->left != NULL && t->left->cnfa.nstates > `0`);
1013
1014	MDEBUG(("calt n%d\n", t->id));
1015
1016	d = getsubdfa(v, t->left);
1017	NOERR();
1018	if (longest(v, d, begin, end, (int *) NULL) == end)
1019	{
1020	MDEBUG(("calt matched\n"));
1021	er = cdissect(v, t->left, begin, end);
1022	if (er != REG_NOMATCH)
1023	return er;
1024	}
1025	NOERR();
1026
1027	t = t->right;
1028	}
1029
1030	return REG_NOMATCH;
1031	}
1032
1033	/*
1034	* citerdissect - dissect match for iteration node
1035	*/
1036	static int / regexec return code /
1037	citerdissect(struct vars *v,
1038	struct subre *t,
1039	chr begin, /* beginning of relevant substring /
1040	chr end) /* end of same /
1041	{
1042	struct dfa *d;
1043	chr **endpts;
1044	chr *limit;
1045	int min_matches;
1046	size_t max_matches;
1047	int nverified;
1048	int k;
1049	int i;
1050	int er;
1051
1052	assert(t->op == `'*'`);
1053	assert(t->left != NULL && t->left->cnfa.nstates > `0`);
1054	assert(!(t->left->flags & SHORTER));
1055	assert(begin <= end);
1056
1057	/*
1058	* For the moment, assume the minimum number of matches is 1. If zero
1059	* matches are allowed, and the target string is empty, we are allowed to
1060	* match regardless of the contents of the iter node --- but we would
1061	* prefer to match once, so that capturing parens get set. (An example of
1062	* the concern here is a pattern like "()*\1", which historically this
1063	* code has allowed to succeed.) Therefore, we deal with the zero-matches
1064	* case at the bottom, after failing to find any other way to match.
1065	*/
1066	min_matches = t->min;
1067	if (min_matches <= `0`)
1068	min_matches = `1`;
1069
1070	/*
1071	* We need workspace to track the endpoints of each sub-match. Normally
1072	* we consider only nonzero-length sub-matches, so there can be at most
1073	* end-begin of them. However, if min is larger than that, we will also
1074	* consider zero-length sub-matches in order to find enough matches.
1075	*
1076	* For convenience, endpts[0] contains the "begin" pointer and we store
1077	* sub-match endpoints in endpts[1..max_matches].
1078	*/
1079	max_matches = end - begin;
1080	if (max_matches > t->max && t->max != DUPINF)
1081	max_matches = t->max;
1082	if (max_matches < min_matches)
1083	max_matches = min_matches;
1084	endpts = (chr *) MALLOC((max_matches + `1`) sizeof(chr *));
1085	if (endpts == NULL)
1086	return REG_ESPACE;
1087	endpts[`0`] = begin;
1088
1089	d = getsubdfa(v, t->left);
1090	if (ISERR())
1091	{
1092	FREE(endpts);
1093	return v->err;
1094	}
1095	MDEBUG(("citer %d\n", t->id));
1096
1097	/*
1098	* Our strategy is to first find a set of sub-match endpoints that are
1099	* valid according to the child node's DFA, and then recursively dissect
1100	* each sub-match to confirm validity. If any validity check fails,
1101	* backtrack the last sub-match and try again. And, when we next try for
1102	* a validity check, we need not recheck any successfully verified
1103	* sub-matches that we didn't move the endpoints of. nverified remembers
1104	* how many sub-matches are currently known okay.
1105	*/
1106
1107	/ initialize to consider first sub-match /
1108	nverified = `0`;
1109	k = `1`;
1110	limit = end;
1111
1112	/ iterate until satisfaction or failure /
1113	while (k > `0`)
1114	{
1115	/ try to find an endpoint for the k'th sub-match /
1116	endpts[k] = longest(v, d, endpts[k - `1`], limit, (int *) NULL);
1117	if (ISERR())
1118	{
1119	FREE(endpts);
1120	return v->err;
1121	}
1122	if (endpts[k] == NULL)
1123	{
1124	/ no match possible, so see if we can shorten previous one /
1125	k--;
1126	goto backtrack;
1127	}
1128	MDEBUG(("%d: working endpoint %d: %ld\n",
1129	t->id, k, LOFF(endpts[k])));
1130
1131	/ k'th sub-match can no longer be considered verified /
1132	if (nverified >= k)
1133	nverified = k - `1`;
1134
1135	if (endpts[k] != end)
1136	{
1137	/ haven't reached end yet, try another iteration if allowed /
1138	if (k >= max_matches)
1139	{
1140	/ must try to shorten some previous match /
1141	k--;
1142	goto backtrack;
1143	}
1144
1145	/ reject zero-length match unless necessary to achieve min /
1146	if (endpts[k] == endpts[k - `1`] &&
1147	(k >= min_matches \|\| min_matches - k < end - endpts[k]))
1148	goto backtrack;
1149
1150	k++;
1151	limit = end;
1152	continue;
1153	}
1154
1155	/*
1156	* We've identified a way to divide the string into k sub-matches that
1157	* works so far as the child DFA can tell. If k is an allowed number
1158	* of matches, start the slow part: recurse to verify each sub-match.
1159	* We always have k <= max_matches, needn't check that.
1160	*/
1161	if (k < min_matches)
1162	goto backtrack;
1163
1164	MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + `1`, k));
1165
1166	for (i = nverified + `1`; i <= k; i++)
1167	{
1168	zaptreesubs(v, t->left);
1169	er = cdissect(v, t->left, endpts[i - `1`], endpts[i]);
1170	if (er == REG_OKAY)
1171	{
1172	nverified = i;
1173	continue;
1174	}
1175	if (er == REG_NOMATCH)
1176	break;
1177	/ oops, something failed /
1178	FREE(endpts);
1179	return er;
1180	}
1181
1182	if (i > k)
1183	{
1184	/ satisfaction /
1185	MDEBUG(("%d successful\n", t->id));
1186	FREE(endpts);
1187	return REG_OKAY;
1188	}
1189
1190	/ match failed to verify, so backtrack /
1191
1192	backtrack:
1193
1194	/*
1195	* Must consider shorter versions of the current sub-match. However,
1196	* we'll only ask for a zero-length match if necessary.
1197	*/
1198	while (k > `0`)
1199	{
1200	chr *prev_end = endpts[k - `1`];
1201
1202	if (endpts[k] > prev_end)
1203	{
1204	limit = endpts[k] - `1`;
1205	if (limit > prev_end \|\|
1206	(k < min_matches && min_matches - k >= end - prev_end))
1207	{
1208	/ break out of backtrack loop, continue the outer one /
1209	break;
1210	}
1211	}
1212	/ can't shorten k'th sub-match any more, consider previous one /
1213	k--;
1214	}
1215	}
1216
1217	/ all possibilities exhausted /
1218	FREE(endpts);
1219
1220	/*
1221	* Now consider the possibility that we can match to a zero-length string
1222	* by using zero repetitions.
1223	*/
1224	if (t->min == `0` && begin == end)
1225	{
1226	MDEBUG(("%d allowing zero matches\n", t->id));
1227	return REG_OKAY;
1228	}
1229
1230	MDEBUG(("%d failed\n", t->id));
1231	return REG_NOMATCH;
1232	}
1233
1234	/*
1235	* creviterdissect - dissect match for iteration node, shortest-first
1236	*/
1237	static int / regexec return code /
1238	creviterdissect(struct vars *v,
1239	struct subre *t,
1240	chr begin, /* beginning of relevant substring /
1241	chr end) /* end of same /
1242	{
1243	struct dfa *d;
1244	chr **endpts;
1245	chr *limit;
1246	int min_matches;
1247	size_t max_matches;
1248	int nverified;
1249	int k;
1250	int i;
1251	int er;
1252
1253	assert(t->op == `'*'`);
1254	assert(t->left != NULL && t->left->cnfa.nstates > `0`);
1255	assert(t->left->flags & SHORTER);
1256	assert(begin <= end);
1257
1258	/*
1259	* If zero matches are allowed, and target string is empty, just declare
1260	* victory. OTOH, if target string isn't empty, zero matches can't work
1261	* so we pretend the min is 1.
1262	*/
1263	min_matches = t->min;
1264	if (min_matches <= `0`)
1265	{
1266	if (begin == end)
1267	return REG_OKAY;
1268	min_matches = `1`;
1269	}
1270
1271	/*
1272	* We need workspace to track the endpoints of each sub-match. Normally
1273	* we consider only nonzero-length sub-matches, so there can be at most
1274	* end-begin of them. However, if min is larger than that, we will also
1275	* consider zero-length sub-matches in order to find enough matches.
1276	*
1277	* For convenience, endpts[0] contains the "begin" pointer and we store
1278	* sub-match endpoints in endpts[1..max_matches].
1279	*/
1280	max_matches = end - begin;
1281	if (max_matches > t->max && t->max != DUPINF)
1282	max_matches = t->max;
1283	if (max_matches < min_matches)
1284	max_matches = min_matches;
1285	endpts = (chr *) MALLOC((max_matches + `1`) sizeof(chr *));
1286	if (endpts == NULL)
1287	return REG_ESPACE;
1288	endpts[`0`] = begin;
1289
1290	d = getsubdfa(v, t->left);
1291	if (ISERR())
1292	{
1293	FREE(endpts);
1294	return v->err;
1295	}
1296	MDEBUG(("creviter %d\n", t->id));
1297
1298	/*
1299	* Our strategy is to first find a set of sub-match endpoints that are
1300	* valid according to the child node's DFA, and then recursively dissect
1301	* each sub-match to confirm validity. If any validity check fails,
1302	* backtrack the last sub-match and try again. And, when we next try for
1303	* a validity check, we need not recheck any successfully verified
1304	* sub-matches that we didn't move the endpoints of. nverified remembers
1305	* how many sub-matches are currently known okay.
1306	*/
1307
1308	/ initialize to consider first sub-match /
1309	nverified = `0`;
1310	k = `1`;
1311	limit = begin;
1312
1313	/ iterate until satisfaction or failure /
1314	while (k > `0`)
1315	{
1316	/ disallow zero-length match unless necessary to achieve min /
1317	if (limit == endpts[k - `1`] &&
1318	limit != end &&
1319	(k >= min_matches \|\| min_matches - k < end - limit))
1320	limit++;
1321
1322	/ if this is the last allowed sub-match, it must reach to the end /
1323	if (k >= max_matches)
1324	limit = end;
1325
1326	/ try to find an endpoint for the k'th sub-match /
1327	endpts[k] = shortest(v, d, endpts[k - `1`], limit, end,
1328	(chr *) NULL, (int* *) NULL);
1329	if (ISERR())
1330	{
1331	FREE(endpts);
1332	return v->err;
1333	}
1334	if (endpts[k] == NULL)
1335	{
1336	/ no match possible, so see if we can lengthen previous one /
1337	k--;
1338	goto backtrack;
1339	}
1340	MDEBUG(("%d: working endpoint %d: %ld\n",
1341	t->id, k, LOFF(endpts[k])));
1342
1343	/ k'th sub-match can no longer be considered verified /
1344	if (nverified >= k)
1345	nverified = k - `1`;
1346
1347	if (endpts[k] != end)
1348	{
1349	/ haven't reached end yet, try another iteration if allowed /
1350	if (k >= max_matches)
1351	{
1352	/ must try to lengthen some previous match /
1353	k--;
1354	goto backtrack;
1355	}
1356
1357	k++;
1358	limit = endpts[k - `1`];
1359	continue;
1360	}
1361
1362	/*
1363	* We've identified a way to divide the string into k sub-matches that
1364	* works so far as the child DFA can tell. If k is an allowed number
1365	* of matches, start the slow part: recurse to verify each sub-match.
1366	* We always have k <= max_matches, needn't check that.
1367	*/
1368	if (k < min_matches)
1369	goto backtrack;
1370
1371	MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + `1`, k));
1372
1373	for (i = nverified + `1`; i <= k; i++)
1374	{
1375	zaptreesubs(v, t->left);
1376	er = cdissect(v, t->left, endpts[i - `1`], endpts[i]);
1377	if (er == REG_OKAY)
1378	{
1379	nverified = i;
1380	continue;
1381	}
1382	if (er == REG_NOMATCH)
1383	break;
1384	/ oops, something failed /
1385	FREE(endpts);
1386	return er;
1387	}
1388
1389	if (i > k)
1390	{
1391	/ satisfaction /
1392	MDEBUG(("%d successful\n", t->id));
1393	FREE(endpts);
1394	return REG_OKAY;
1395	}
1396
1397	/ match failed to verify, so backtrack /
1398
1399	backtrack:
1400
1401	/*
1402	* Must consider longer versions of the current sub-match.
1403	*/
1404	while (k > `0`)
1405	{
1406	if (endpts[k] < end)
1407	{
1408	limit = endpts[k] + `1`;
1409	/ break out of backtrack loop, continue the outer one /
1410	break;
1411	}
1412	/ can't lengthen k'th sub-match any more, consider previous one /
1413	k--;
1414	}
1415	}
1416
1417	/ all possibilities exhausted /
1418	MDEBUG(("%d failed\n", t->id));
1419	FREE(endpts);
1420	return REG_NOMATCH;
1421	}
1422
1423
1424
1425	#include "rege_dfa.c"
1426

Browse the source code of PostgreSQL/src/backend/regex/regexec.c