regcomp.c source code [PostgreSQL/src/backend/regex/regcomp.c]

1	/*
2	* re_*comp and friends - compile REs
3	* This file #includes several others (see the bottom).
4	*
5	* Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
6	*
7	* Development of this software was funded, in part, by Cray Research Inc.,
8	* UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9	* Corporation, none of whom are responsible for the results. The author
10	* thanks all of them.
11	*
12	* Redistribution and use in source and binary forms -- with or without
13	* modification -- are permitted for any purpose, provided that
14	* redistributions in source form retain this entire copyright notice and
15	* indicate the origin and nature of any modifications.
16	*
17	* I'd appreciate being given credit for this package in the documentation
18	* of software which uses it, but that is not a requirement.
19	*
20	* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21	* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22	* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23	* HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26	* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27	* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28	* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29	* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30	*
31	* src/backend/regex/regcomp.c
32	*
33	*/
34
35	#include "regex/regguts.h"
36
37	/*
38	* forward declarations, up here so forward datatypes etc. are defined early
39	*/
40	/ === regcomp.c === /
41	static void moresubs(struct vars , int*);
42	static int freev(struct vars , int*);
43	static void makesearch(struct vars , struct* nfa *);
44	static struct subre parse(struct* vars , int, int, struct* state , struct* state *);
45	static struct subre parsebranch(struct* vars , int, int, struct* state , struct* state , int*);
46	static void parseqatom(struct vars , int, int, struct* state , struct* state , struct* subre *);
47	static void nonword(struct vars , int, struct* state , struct* state *);
48	static void word(struct vars , int, struct* state , struct* state *);
49	static int scannum(struct vars *);
50	static void repeat(struct vars , struct* state , struct* state , int, int*);
51	static void bracket(struct vars , struct* state , struct* state *);
52	static void cbracket(struct vars , struct* state , struct* state *);
53	static void brackpart(struct vars , struct* state , struct* state *);
54	static const chr scanplain(struct* vars *);
55	static void onechr(struct vars , chr, struct* state , struct* state *);
56	static void wordchrs(struct vars *);
57	static void processlacon(struct vars , struct* state , struct* state , int*,
58	struct state , struct* state *);
59	static struct subre subre(struct* vars , int, int, struct* state , struct* state *);
60	static void freesubre(struct vars , struct* subre *);
61	static void freesrnode(struct vars , struct* subre *);
62	static void optst(struct vars , struct* subre *);
63	static int numst(struct subre , int*);
64	static void markst(struct subre *);
65	static void cleanst(struct vars *);
66	static long nfatree(struct vars , struct* subre , FILE );
67	static long nfanode(struct vars , struct* subre , int, FILE );
68	static int newlacon(struct vars , struct* state , struct* state , int*);
69	static void freelacons(struct subre , int*);
70	static void rfree(regex_t *);
71	static int rcancelrequested(void);
72	static int rstacktoodeep(void);
73
74	#ifdef REG_DEBUG
75	static void dump(regex_t , FILE );
76	static void dumpst(struct subre , FILE , int);
77	static void stdump(struct subre , FILE , int);
78	static const char stid(struct* subre , char* *, size_t);
79	#endif
80	/ === regc_lex.c === /
81	static void lexstart(struct vars *);
82	static void prefixes(struct vars *);
83	static void lexnest(struct vars , const* chr , const* chr *);
84	static void lexword(struct vars *);
85	static int next(struct vars *);
86	static int lexescape(struct vars *);
87	static chr lexdigits(struct vars , int, int, int*);
88	static int brenext(struct vars *, chr);
89	static void skip(struct vars *);
90	static chr newline(void);
91	static chr chrnamed(struct vars , const* chr , const* chr *, chr);
92
93	/ === regc_color.c === /
94	static void initcm(struct vars , struct* colormap *);
95	static void freecm(struct colormap *);
96	static color maxcolor(struct colormap *);
97	static color newcolor(struct colormap *);
98	static void freecolor(struct colormap *, color);
99	static color pseudocolor(struct colormap *);
100	static color subcolor(struct colormap *, chr);
101	static color subcolorhi(struct colormap , color );
102	static color newsub(struct colormap *, color);
103	static int newhicolorrow(struct colormap , int*);
104	static void newhicolorcols(struct colormap *);
105	static void subcolorcvec(struct vars , struct* cvec , struct* state , struct* state *);
106	static void subcoloronechr(struct vars , chr, struct* state , struct* state , color );
107	static void subcoloronerange(struct vars , chr, chr, struct* state , struct* state , color );
108	static void subcoloronerow(struct vars , int, struct* state , struct* state , color );
109	static void okcolors(struct nfa , struct* colormap *);
110	static void colorchain(struct colormap , struct* arc *);
111	static void uncolorchain(struct colormap , struct* arc *);
112	static void rainbow(struct nfa , struct* colormap , int, color, struct* state , struct* state *);
113	static void colorcomplement(struct nfa , struct* colormap , int, struct* state , struct* state , struct* state *);
114
115	#ifdef REG_DEBUG
116	static void dumpcolors(struct colormap , FILE );
117	static void dumpchr(chr, FILE *);
118	#endif
119	/ === regc_nfa.c === /
120	static struct nfa newnfa(struct* vars , struct* colormap , struct* nfa *);
121	static void freenfa(struct nfa *);
122	static struct state newstate(struct* nfa *);
123	static struct state newfstate(struct* nfa , int* flag);
124	static void dropstate(struct nfa , struct* state *);
125	static void freestate(struct nfa , struct* state *);
126	static void destroystate(struct nfa , struct* state *);
127	static void newarc(struct nfa , int, color, struct* state , struct* state *);
128	static void createarc(struct nfa , int, color, struct* state , struct* state *);
129	static struct arc allocarc(struct* nfa , struct* state *);
130	static void freearc(struct nfa , struct* arc *);
131	static void changearctarget(struct arc , struct* state *);
132	static int hasnonemptyout(struct state *);
133	static struct arc findarc(struct* state , int*, color);
134	static void cparc(struct nfa , struct* arc , struct* state , struct* state *);
135	static void sortins(struct nfa , struct* state *);
136	static int sortins_cmp(const void , const* void *);
137	static void sortouts(struct nfa , struct* state *);
138	static int sortouts_cmp(const void , const* void *);
139	static void moveins(struct nfa , struct* state , struct* state *);
140	static void copyins(struct nfa , struct* state , struct* state *);
141	static void mergeins(struct nfa , struct* state , struct* arc *, int*);
142	static void moveouts(struct nfa , struct* state , struct* state *);
143	static void copyouts(struct nfa , struct* state , struct* state *);
144	static void cloneouts(struct nfa , struct* state , struct* state , struct* state , int*);
145	static void delsub(struct nfa , struct* state , struct* state *);
146	static void deltraverse(struct nfa , struct* state , struct* state *);
147	static void dupnfa(struct nfa , struct* state , struct* state , struct* state , struct* state *);
148	static void duptraverse(struct nfa , struct* state , struct* state *);
149	static void cleartraverse(struct nfa , struct* state *);
150	static struct state single_color_transition(struct* state , struct* state *);
151	static void specialcolors(struct nfa *);
152	static long optimize(struct nfa , FILE );
153	static void pullback(struct nfa , FILE );
154	static int pull(struct nfa , struct* arc , struct* state **);
155	static void pushfwd(struct nfa , FILE );
156	static int push(struct nfa , struct* arc , struct* state **);
157
158	#define INCOMPATIBLE 1 /* destroys arc */
159	#define SATISFIED 2 /* constraint satisfied */
160	#define COMPATIBLE 3 /* compatible but not satisfied yet */
161	static int combine(struct arc , struct* arc *);
162	static void fixempties(struct nfa , FILE );
163	static struct state emptyreachable(struct* nfa , struct* state *,
164	struct state , struct* arc **);
165	static int isconstraintarc(struct arc *);
166	static int hasconstraintout(struct state *);
167	static void fixconstraintloops(struct nfa , FILE );
168	static int findconstraintloop(struct nfa , struct* state *);
169	static void breakconstraintloop(struct nfa , struct* state *);
170	static void clonesuccessorstates(struct nfa , struct* state , struct* state *,
171	struct state , struct* arc *,
172	char , char* , int*);
173	static void cleanup(struct nfa *);
174	static void markreachable(struct nfa , struct* state , struct* state , struct* state *);
175	static void markcanreach(struct nfa , struct* state , struct* state , struct* state *);
176	static long analyze(struct nfa *);
177	static void compact(struct nfa , struct* cnfa *);
178	static void carcsort(struct carc *, size_t);
179	static int carc_cmp(const void , const* void *);
180	static void freecnfa(struct cnfa *);
181	static void dumpnfa(struct nfa , FILE );
182
183	#ifdef REG_DEBUG
184	static void dumpstate(struct state , FILE );
185	static void dumparcs(struct state , FILE );
186	static void dumparc(struct arc , struct* state , FILE );
187	static void dumpcnfa(struct cnfa , FILE );
188	static void dumpcstate(int, struct cnfa , FILE );
189	#endif
190	/ === regc_cvec.c === /
191	static struct cvec newcvec(int, int*);
192	static struct cvec clearcvec(struct* cvec *);
193	static void addchr(struct cvec *, chr);
194	static void addrange(struct cvec *, chr, chr);
195	static struct cvec getcvec(struct* vars , int, int*);
196	static void freecvec(struct cvec *);
197
198	/ === regc_pg_locale.c === /
199	static int pg_wc_isdigit(pg_wchar c);
200	static int pg_wc_isalpha(pg_wchar c);
201	static int pg_wc_isalnum(pg_wchar c);
202	static int pg_wc_isupper(pg_wchar c);
203	static int pg_wc_islower(pg_wchar c);
204	static int pg_wc_isgraph(pg_wchar c);
205	static int pg_wc_isprint(pg_wchar c);
206	static int pg_wc_ispunct(pg_wchar c);
207	static int pg_wc_isspace(pg_wchar c);
208	static pg_wchar pg_wc_toupper(pg_wchar c);
209	static pg_wchar pg_wc_tolower(pg_wchar c);
210
211	/ === regc_locale.c === /
212	static chr element(struct vars , const* chr , const* chr *);
213	static struct cvec range(struct* vars , chr, chr, int*);
214	static int before(chr, chr);
215	static struct cvec eclass(struct* vars , chr, int*);
216	static struct cvec cclass(struct* vars , const* chr , const* chr , int*);
217	static int cclass_column_index(struct colormap *, chr);
218	static struct cvec allcases(struct* vars *, chr);
219	static int cmp(const chr , const* chr *, size_t);
220	static int casecmp(const chr , const* chr *, size_t);
221
222
223	/ internal variables, bundled for easy passing around /
224	struct vars
225	{
226	regex_t *re;
227	const chr now; /* scan pointer into string /
228	const chr stop; /* end of string /
229	const chr savenow; /* saved now and stop for "subroutine call" /
230	const chr *savestop;
231	int err; / error code (0 if none) /
232	int cflags; / copy of compile flags /
233	int lasttype; / type of previous token /
234	int nexttype; / type of next token /
235	chr nextvalue; / value (if any) of next token /
236	int lexcon; / lexical context type (see lex.c) /
237	int nsubexp; / subexpression count /
238	struct subre *subs; /* subRE pointer vector /
239	size_t nsubs; / length of vector /
240	struct subre sub10[`10`]; /* initial vector, enough for most /
241	struct nfa nfa; /* the NFA /
242	struct colormap cm; /* character color map /
243	color nlcolor; / color of newline /
244	struct state wordchrs; /* state in nfa holding word-char outarcs /
245	struct subre tree; /* subexpression tree /
246	struct subre treechain; /* all tree nodes allocated /
247	struct subre treefree; /* any free tree nodes /
248	int ntree; / number of tree nodes, plus one /
249	struct cvec cv; /* interface cvec /
250	struct cvec cv2; /* utility cvec /
251	struct subre lacons; /* lookaround-constraint vector /
252	int nlacons; / size of lacons[]; note that only slots*
253	* numbered 1 .. nlacons-1 are used */
254	size_t spaceused; / approx. space used for compilation /
255	};
256
/*
 * parsing macros; most know that `v' is the struct vars pointer
 *
 * The error macros are sticky: once an error code is recorded, later ERR()
 * calls keep the first code, and the next token is forced to EOS.
 */
#define NEXT()	(next(v))		/* advance by one token */
#define SEE(t)	(v->nexttype == (t))	/* is next token this? */
#define EAT(t)	(SEE(t) && next(v)) /* if next is this, swallow it */
#define VISERR(vv)	((vv)->err != 0)	/* have we seen an error yet? */
#define ISERR() VISERR(v)
#define VERR(vv,e)	((vv)->nexttype = EOS, \
					 (vv)->err = ((vv)->err ? (vv)->err : (e)))
#define ERR(e)	VERR(v, e)		/* record an error */
#define NOERR() {if (ISERR()) return;}	/* if error seen, return */
#define NOERRN()	{if (ISERR()) return NULL;} /* NOERR with retval */
#define NOERRZ()	{if (ISERR()) return 0;}	/* NOERR with retval */
#define INSIST(c, e) do { if (!(c)) ERR(e); } while (0) /* error if c false */
#define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */
#define EMPTYARC(x, y)	newarc(v->nfa, EMPTY, 0, x, y)
272
/* token type codes, some also used as NFA arc types */
#define EMPTY	'n'				/* no token present */
#define EOS 'e'					/* end of string */
#define PLAIN	'p'				/* ordinary character */
#define DIGIT	'd'				/* digit (in bound) */
#define BACKREF 'b'				/* back reference */
#define COLLEL	'I'				/* start of [. */
#define ECLASS	'E'				/* start of [= */
#define CCLASS	'C'				/* start of [: */
#define END 'X'					/* end of [. [= [: */
#define RANGE	'R'				/* - within [] which might be range delim. */
#define LACON	'L'				/* lookaround constraint subRE */
#define AHEAD	'a'				/* color-lookahead arc */
#define BEHIND	'r'				/* color-lookbehind arc */
#define WBDRY	'w'				/* word boundary constraint */
#define NWBDRY	'W'				/* non-word-boundary constraint */
#define SBEGIN	'A'				/* beginning of string (even if not BOL) */
#define SEND	'Z'				/* end of string (even if not EOL) */
#define PREFER	'P'				/* length preference */
292
/*
 * is an arc colored, and hence on a color chain?
 *
 * True for arcs of type PLAIN, AHEAD or BEHIND; note that `a' is evaluated
 * up to three times.
 */
#define COLORED(a) \
	((a)->type == PLAIN || (a)->type == AHEAD || (a)->type == BEHIND)
296
297
298	/ static function list /
299	static const struct fns functions = {
300	rfree, / regfree insides /
301	rcancelrequested, / check for cancel request /
302	rstacktoodeep / check for stack getting dangerously deep /
303	};
304
305
306
307	/*
308	* pg_regcomp - compile regular expression
309	*
310	* Note: on failure, no resources remain allocated, so pg_regfree()
311	* need not be applied to re.
312	*/
313	int
314	pg_regcomp(regex_t *re,
315	const chr *string,
316	size_t len,
317	int flags,
318	Oid collation)
319	{
320	struct vars var;
321	struct vars *v = &var;
322	struct guts *g;
323	int i;
324	size_t j;
325
326	#ifdef REG_DEBUG
327	FILE debug = (flags & REG_PROGRESS) ? stdout : (FILE ) NULL;
328	#else
329	FILE debug = (FILE ) NULL;
330	#endif
331
332	#define CNOERR() { if (ISERR()) return freev(v, v->err); }
333
334	/ sanity checks /
335
336	if (re == NULL \|\| string == NULL)
337	return REG_INVARG;
338	if ((flags & REG_QUOTE) &&
339	(flags & (REG_ADVANCED \| REG_EXPANDED \| REG_NEWLINE)))
340	return REG_INVARG;
341	if (!(flags & REG_EXTENDED) && (flags & REG_ADVF))
342	return REG_INVARG;
343
344	/ Initialize locale-dependent support /
345	pg_set_regex_collation(collation);
346
347	/ initial setup (after which freev() is callable) /
348	v->re = re;
349	v->now = string;
350	v->stop = v->now + len;
351	v->savenow = v->savestop = NULL;
352	v->err = `0`;
353	v->cflags = flags;
354	v->nsubexp = `0`;
355	v->subs = v->sub10;
356	v->nsubs = `10`;
357	for (j = `0`; j < v->nsubs; j++)
358	v->subs[j] = NULL;
359	v->nfa = NULL;
360	v->cm = NULL;
361	v->nlcolor = COLORLESS;
362	v->wordchrs = NULL;
363	v->tree = NULL;
364	v->treechain = NULL;
365	v->treefree = NULL;
366	v->cv = NULL;
367	v->cv2 = NULL;
368	v->lacons = NULL;
369	v->nlacons = `0`;
370	v->spaceused = `0`;
371	re->re_magic = REMAGIC;
372	re->re_info = `0`; / bits get set during parse /
373	re->re_csize = sizeof(chr);
374	re->re_collation = collation;
375	re->re_guts = NULL;
376	re->re_fns = VS(&functions);
377
378	/ more complex setup, malloced things /
379	re->re_guts = VS(MALLOC(sizeof(struct guts)));
380	if (re->re_guts == NULL)
381	return freev(v, REG_ESPACE);
382	g = (struct guts *) re->re_guts;
383	g->tree = NULL;
384	initcm(v, &g->cmap);
385	v->cm = &g->cmap;
386	g->lacons = NULL;
387	g->nlacons = `0`;
388	ZAPCNFA(g->search);
389	v->nfa = newnfa(v, v->cm, (struct nfa *) NULL);
390	CNOERR();
391	/ set up a reasonably-sized transient cvec for getcvec usage /
392	v->cv = newcvec(`100`, `20`);
393	if (v->cv == NULL)
394	return freev(v, REG_ESPACE);
395
396	/ parsing /
397	lexstart(v); / also handles prefixes /
398	if ((v->cflags & REG_NLSTOP) \|\| (v->cflags & REG_NLANCH))
399	{
400	/ assign newline a unique color /
401	v->nlcolor = subcolor(v->cm, newline());
402	okcolors(v->nfa, v->cm);
403	}
404	CNOERR();
405	v->tree = parse(v, EOS, PLAIN, v->nfa->init, v->nfa->final);
406	assert(SEE(EOS)); / even if error; ISERR() => SEE(EOS) /
407	CNOERR();
408	assert(v->tree != NULL);
409
410	/ finish setup of nfa and its subre tree /
411	specialcolors(v->nfa);
412	CNOERR();
413	#ifdef REG_DEBUG
414	if (debug != NULL)
415	{
416	fprintf(debug, "\n\n\n========= RAW ==========\n");
417	dumpnfa(v->nfa, debug);
418	dumpst(v->tree, debug, `1`);
419	}
420	#endif
421	optst(v, v->tree);
422	v->ntree = numst(v->tree, `1`);
423	markst(v->tree);
424	cleanst(v);
425	#ifdef REG_DEBUG
426	if (debug != NULL)
427	{
428	fprintf(debug, "\n\n\n========= TREE FIXED ==========\n");
429	dumpst(v->tree, debug, `1`);
430	}
431	#endif
432
433	/ build compacted NFAs for tree and lacons /
434	re->re_info \|= nfatree(v, v->tree, debug);
435	CNOERR();
436	assert(v->nlacons == `0` \|\| v->lacons != NULL);
437	for (i = `1`; i < v->nlacons; i++)
438	{
439	struct subre *lasub = &v->lacons[i];
440
441	#ifdef REG_DEBUG
442	if (debug != NULL)
443	fprintf(debug, "\n\n\n========= LA%d ==========\n", i);
444	#endif
445
446	/ Prepend .* to pattern if it's a lookbehind LACON /
447	nfanode(v, lasub, !LATYPE_IS_AHEAD(lasub->subno), debug);
448	}
449	CNOERR();
450	if (v->tree->flags & SHORTER)
451	NOTE(REG_USHORTEST);
452
453	/ build compacted NFAs for tree, lacons, fast search /
454	#ifdef REG_DEBUG
455	if (debug != NULL)
456	fprintf(debug, "\n\n\n========= SEARCH ==========\n");
457	#endif
458	/ can sacrifice main NFA now, so use it as work area /
459	(DISCARD) optimize(v->nfa, debug);
460	CNOERR();
461	makesearch(v, v->nfa);
462	CNOERR();
463	compact(v->nfa, &g->search);
464	CNOERR();
465
466	/ looks okay, package it up /
467	re->re_nsub = v->nsubexp;
468	v->re = NULL; / freev no longer frees re /
469	g->magic = GUTSMAGIC;
470	g->cflags = v->cflags;
471	g->info = re->re_info;
472	g->nsub = re->re_nsub;
473	g->tree = v->tree;
474	v->tree = NULL;
475	g->ntree = v->ntree;
476	g->compare = (v->cflags & REG_ICASE) ? casecmp : cmp;
477	g->lacons = v->lacons;
478	v->lacons = NULL;
479	g->nlacons = v->nlacons;
480
481	#ifdef REG_DEBUG
482	if (flags & REG_DUMP)
483	dump(re, stdout);
484	#endif
485
486	assert(v->err == `0`);
487	return freev(v, `0`);
488	}
489
490	/*
491	* moresubs - enlarge subRE vector
492	*/
493	static void
494	moresubs(struct vars *v,
495	int wanted) / want enough room for this one /
496	{
497	struct subre **p;
498	size_t n;
499
500	assert(wanted > `0` && (size_t) wanted >= v->nsubs);
501	n = (size_t) wanted * `3` / `2` + `1`;
502
503	if (v->subs == v->sub10)
504	{
505	p = (struct subre *) MALLOC(n sizeof(struct subre *));
506	if (p != NULL)
507	memcpy(VS(p), VS(v->subs),
508	v->nsubs * sizeof(struct subre *));
509	}
510	else
511	p = (struct subre *) REALLOC(v->subs, n sizeof(struct subre *));
512	if (p == NULL)
513	{
514	ERR(REG_ESPACE);
515	return;
516	}
517	v->subs = p;
518	for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++)
519	*p = NULL;
520	assert(v->nsubs == n);
521	assert((size_t) wanted < v->nsubs);
522	}
523
524	/*
525	* freev - free vars struct's substructures where necessary
526	*
527	* Optionally does error-number setting, and always returns error code
528	* (if any), to make error-handling code terser.
529	*/
530	static int
531	freev(struct vars *v,
532	int err)
533	{
534	if (v->re != NULL)
535	rfree(v->re);
536	if (v->subs != v->sub10)
537	FREE(v->subs);
538	if (v->nfa != NULL)
539	freenfa(v->nfa);
540	if (v->tree != NULL)
541	freesubre(v, v->tree);
542	if (v->treechain != NULL)
543	cleanst(v);
544	if (v->cv != NULL)
545	freecvec(v->cv);
546	if (v->cv2 != NULL)
547	freecvec(v->cv2);
548	if (v->lacons != NULL)
549	freelacons(v->lacons, v->nlacons);
550	ERR(err); / nop if err==0 /
551
552	return v->err;
553	}
554
555	/*
556	* makesearch - turn an NFA into a search NFA (implicit prepend of .*?)
557	* NFA must have been optimize()d already.
558	*/
559	static void
560	makesearch(struct vars *v,
561	struct nfa *nfa)
562	{
563	struct arc *a;
564	struct arc *b;
565	struct state *pre = nfa->pre;
566	struct state *s;
567	struct state *s2;
568	struct state *slist;
569
570	/ no loops are needed if it's anchored /
571	for (a = pre->outs; a != NULL; a = a->outchain)
572	{
573	assert(a->type == PLAIN);
574	if (a->co != nfa->bos[`0`] && a->co != nfa->bos[`1`])
575	break;
576	}
577	if (a != NULL)
578	{
579	/ add implicit .* in front /
580	rainbow(nfa, v->cm, PLAIN, COLORLESS, pre, pre);
581
582	/ and ^* and \A* too -- not always necessary, but harmless /
583	newarc(nfa, PLAIN, nfa->bos[`0`], pre, pre);
584	newarc(nfa, PLAIN, nfa->bos[`1`], pre, pre);
585	}
586
587	/*
588	* Now here's the subtle part. Because many REs have no lookback
589	* constraints, often knowing when you were in the pre state tells you
590	* little; it's the next state(s) that are informative. But some of them
591	* may have other inarcs, i.e. it may be possible to make actual progress
592	* and then return to one of them. We must de-optimize such cases,
593	* splitting each such state into progress and no-progress states.
594	*/
595
596	/ first, make a list of the states reachable from pre and elsewhere /
597	slist = NULL;
598	for (a = pre->outs; a != NULL; a = a->outchain)
599	{
600	s = a->to;
601	for (b = s->ins; b != NULL; b = b->inchain)
602	{
603	if (b->from != pre)
604	break;
605	}
606
607	/*
608	* We want to mark states as being in the list already by having non
609	* NULL tmp fields, but we can't just store the old slist value in tmp
610	* because that doesn't work for the first such state. Instead, the
611	* first list entry gets its own address in tmp.
612	*/
613	if (b != NULL && s->tmp == NULL)
614	{
615	s->tmp = (slist != NULL) ? slist : s;
616	slist = s;
617	}
618	}
619
620	/ do the splits /
621	for (s = slist; s != NULL; s = s2)
622	{
623	s2 = newstate(nfa);
624	NOERR();
625	copyouts(nfa, s, s2);
626	NOERR();
627	for (a = s->ins; a != NULL; a = b)
628	{
629	b = a->inchain;
630	if (a->from != pre)
631	{
632	cparc(nfa, a, a->from, s2);
633	freearc(nfa, a);
634	}
635	}
636	s2 = (s->tmp != s) ? s->tmp : NULL;
637	s->tmp = NULL; / clean up while we're at it /
638	}
639	}
640
641	/*
642	* parse - parse an RE
643	*
644	* This is actually just the top level, which parses a bunch of branches
645	* tied together with '\|'. They appear in the tree as the left children
646	* of a chain of '\|' subres.
647	*/
648	static struct subre *
649	parse(struct vars *v,
650	int stopper, / EOS or ')' /
651	int type, / LACON (lookaround subRE) or PLAIN /
652	struct state init, /* initial state /
653	struct state final) /* final state /
654	{
655	struct state left; /* scaffolding for branch /
656	struct state *right;
657	struct subre branches; /* top level /
658	struct subre branch; /* current branch /
659	struct subre t; /* temporary /
660	int firstbranch; / is this the first branch? /
661
662	assert(stopper == `')'` \|\| stopper == EOS);
663
664	branches = subre(v, `'\|'`, LONGER, init, final);
665	NOERRN();
666	branch = branches;
667	firstbranch = `1`;
668	do
669	{ / a branch /
670	if (!firstbranch)
671	{
672	/ need a place to hang it /
673	branch->right = subre(v, `'\|'`, LONGER, init, final);
674	NOERRN();
675	branch = branch->right;
676	}
677	firstbranch = `0`;
678	left = newstate(v->nfa);
679	right = newstate(v->nfa);
680	NOERRN();
681	EMPTYARC(init, left);
682	EMPTYARC(right, final);
683	NOERRN();
684	branch->left = parsebranch(v, stopper, type, left, right, `0`);
685	NOERRN();
686	branch->flags \|= UP(branch->flags \| branch->left->flags);
687	if ((branch->flags & ~branches->flags) != `0`) / new flags /
688	for (t = branches; t != branch; t = t->right)
689	t->flags \|= branch->flags;
690	} while (EAT(`'\|'`));
691	assert(SEE(stopper) \|\| SEE(EOS));
692
693	if (!SEE(stopper))
694	{
695	assert(stopper == `')'` && SEE(EOS));
696	ERR(REG_EPAREN);
697	}
698
699	/ optimize out simple cases /
700	if (branch == branches)
701	{ / only one branch /
702	assert(branch->right == NULL);
703	t = branch->left;
704	branch->left = NULL;
705	freesubre(v, branches);
706	branches = t;
707	}
708	else if (!MESSY(branches->flags))
709	{ / no interesting innards /
710	freesubre(v, branches->left);
711	branches->left = NULL;
712	freesubre(v, branches->right);
713	branches->right = NULL;
714	branches->op = `'='`;
715	}
716
717	return branches;
718	}
719
720	/*
721	* parsebranch - parse one branch of an RE
722	*
723	* This mostly manages concatenation, working closely with parseqatom().
724	* Concatenated things are bundled up as much as possible, with separate
725	* ',' nodes introduced only when necessary due to substructure.
726	*/
727	static struct subre *
728	parsebranch(struct vars *v,
729	int stopper, / EOS or ')' /
730	int type, / LACON (lookaround subRE) or PLAIN /
731	struct state left, /* leftmost state /
732	struct state right, /* rightmost state /
733	int partial) / is this only part of a branch? /
734	{
735	struct state lp; /* left end of current construct /
736	int seencontent; / is there anything in this branch yet? /
737	struct subre *t;
738
739	lp = left;
740	seencontent = `0`;
741	t = subre(v, `'='`, `0`, left, right); / op '=' is tentative /
742	NOERRN();
743	while (!SEE(`'\|'`) && !SEE(stopper) && !SEE(EOS))
744	{
745	if (seencontent)
746	{ / implicit concat operator /
747	lp = newstate(v->nfa);
748	NOERRN();
749	moveins(v->nfa, right, lp);
750	}
751	seencontent = `1`;
752
753	/ NB, recursion in parseqatom() may swallow rest of branch /
754	parseqatom(v, stopper, type, lp, right, t);
755	NOERRN();
756	}
757
758	if (!seencontent)
759	{ / empty branch /
760	if (!partial)
761	NOTE(REG_UUNSPEC);
762	assert(lp == left);
763	EMPTYARC(left, right);
764	}
765
766	return t;
767	}
768
769	/*
770	* parseqatom - parse one quantified atom or constraint of an RE
771	*
772	* The bookkeeping near the end cooperates very closely with parsebranch();
773	* in particular, it contains a recursion that can involve parsing the rest
774	* of the branch, making this function's name somewhat inaccurate.
775	*/
776	static void
777	parseqatom(struct vars *v,
778	int stopper, / EOS or ')' /
779	int type, / LACON (lookaround subRE) or PLAIN /
780	struct state lp, /* left state to hang it on /
781	struct state rp, /* right state to hang it on /
782	struct subre top) /* subtree top /
783	{
784	struct state s; /* temporaries for new states /
785	struct state *s2;
786
787	#define ARCV(t, val) newarc(v->nfa, t, val, lp, rp)
788	int m,
789	n;
790	struct subre atom; /* atom's subtree /
791	struct subre *t;
792	int cap; / capturing parens? /
793	int latype; / lookaround constraint type /
794	int subno; / capturing-parens or backref number /
795	int atomtype;
796	int qprefer; / quantifier short/long preference /
797	int f;
798	struct subre *atomp; /* where the pointer to atom is /
799
800	/ initial bookkeeping /
801	atom = NULL;
802	assert(lp->nouts == `0`); / must string new code /
803	assert(rp->nins == `0`); / between lp and rp /
804	subno = `0`; / just to shut lint up /
805
806	/ an atom or constraint... /
807	atomtype = v->nexttype;
808	switch (atomtype)
809	{
810	/ first, constraints, which end by returning /
811	case `'^'`:
812	ARCV(`'^'`, `1`);
813	if (v->cflags & REG_NLANCH)
814	ARCV(BEHIND, v->nlcolor);
815	NEXT();
816	return;
817	break;
818	case `'$'`:
819	ARCV(`'$'`, `1`);
820	if (v->cflags & REG_NLANCH)
821	ARCV(AHEAD, v->nlcolor);
822	NEXT();
823	return;
824	break;
825	case SBEGIN:
826	ARCV(`'^'`, `1`); / BOL /
827	ARCV(`'^'`, `0`); / or BOS /
828	NEXT();
829	return;
830	break;
831	case SEND:
832	ARCV(`'$'`, `1`); / EOL /
833	ARCV(`'$'`, `0`); / or EOS /
834	NEXT();
835	return;
836	break;
837	case `'<'`:
838	wordchrs(v); / does NEXT() /
839	s = newstate(v->nfa);
840	NOERR();
841	nonword(v, BEHIND, lp, s);
842	word(v, AHEAD, s, rp);
843	return;
844	break;
845	case `'>'`:
846	wordchrs(v); / does NEXT() /
847	s = newstate(v->nfa);
848	NOERR();
849	word(v, BEHIND, lp, s);
850	nonword(v, AHEAD, s, rp);
851	return;
852	break;
853	case WBDRY:
854	wordchrs(v); / does NEXT() /
855	s = newstate(v->nfa);
856	NOERR();
857	nonword(v, BEHIND, lp, s);
858	word(v, AHEAD, s, rp);
859	s = newstate(v->nfa);
860	NOERR();
861	word(v, BEHIND, lp, s);
862	nonword(v, AHEAD, s, rp);
863	return;
864	break;
865	case NWBDRY:
866	wordchrs(v); / does NEXT() /
867	s = newstate(v->nfa);
868	NOERR();
869	word(v, BEHIND, lp, s);
870	word(v, AHEAD, s, rp);
871	s = newstate(v->nfa);
872	NOERR();
873	nonword(v, BEHIND, lp, s);
874	nonword(v, AHEAD, s, rp);
875	return;
876	break;
877	case LACON: / lookaround constraint /
878	latype = v->nextvalue;
879	NEXT();
880	s = newstate(v->nfa);
881	s2 = newstate(v->nfa);
882	NOERR();
883	t = parse(v, `')'`, LACON, s, s2);
884	freesubre(v, t); / internal structure irrelevant /
885	NOERR();
886	assert(SEE(`')'`));
887	NEXT();
888	processlacon(v, s, s2, latype, lp, rp);
889	return;
890	break;
891	/ then errors, to get them out of the way /
892	case `'*'`:
893	case `'+'`:
894	case `'?'`:
895	case `'{'`:
896	ERR(REG_BADRPT);
897	return;
898	break;
899	default:
900	ERR(REG_ASSERT);
901	return;
902	break;
903	/ then plain characters, and minor variants on that theme /
904	case `')'`: / unbalanced paren /
905	if ((v->cflags & REG_ADVANCED) != REG_EXTENDED)
906	{
907	ERR(REG_EPAREN);
908	return;
909	}
910	/ legal in EREs due to specification botch /
911	NOTE(REG_UPBOTCH);
912	/ fall through into case PLAIN /
913	/ FALLTHROUGH /
914	case PLAIN:
915	onechr(v, v->nextvalue, lp, rp);
916	okcolors(v->nfa, v->cm);
917	NOERR();
918	NEXT();
919	break;
920	case `'['`:
921	if (v->nextvalue == `1`)
922	bracket(v, lp, rp);
923	else
924	cbracket(v, lp, rp);
925	assert(SEE(`']'`) \|\| ISERR());
926	NEXT();
927	break;
928	case `'.'`:
929	rainbow(v->nfa, v->cm, PLAIN,
930	(v->cflags & REG_NLSTOP) ? v->nlcolor : COLORLESS,
931	lp, rp);
932	NEXT();
933	break;
934	/ and finally the ugly stuff /
935	case `'('`: / value flags as capturing or non /
936	cap = (type == LACON) ? `0` : v->nextvalue;
937	if (cap)
938	{
939	v->nsubexp++;
940	subno = v->nsubexp;
941	if ((size_t) subno >= v->nsubs)
942	moresubs(v, subno);
943	assert((size_t) subno < v->nsubs);
944	}
945	else
946	atomtype = PLAIN; / something that's not '(' /
947	NEXT();
948	/ need new endpoints because tree will contain pointers /
949	s = newstate(v->nfa);
950	s2 = newstate(v->nfa);
951	NOERR();
952	EMPTYARC(lp, s);
953	EMPTYARC(s2, rp);
954	NOERR();
955	atom = parse(v, `')'`, type, s, s2);
956	assert(SEE(`')'`) \|\| ISERR());
957	NEXT();
958	NOERR();
959	if (cap)
960	{
961	v->subs[subno] = atom;
962	t = subre(v, `'('`, atom->flags \| CAP, lp, rp);
963	NOERR();
964	t->subno = subno;
965	t->left = atom;
966	atom = t;
967	}
968	/ postpone everything else pending possible {0} /
969	break;
970	case BACKREF: / the Feature From The Black Lagoon /
971	INSIST(type != LACON, REG_ESUBREG);
972	INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
973	INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
974	NOERR();
975	assert(v->nextvalue > `0`);
976	atom = subre(v, `'b'`, BACKR, lp, rp);
977	NOERR();
978	subno = v->nextvalue;
979	atom->subno = subno;
980	EMPTYARC(lp, rp); / temporarily, so there's something /
981	NEXT();
982	break;
983	}
984
985	/ ...and an atom may be followed by a quantifier /
986	switch (v->nexttype)
987	{
988	case `'*'`:
989	m = `0`;
990	n = DUPINF;
991	qprefer = (v->nextvalue) ? LONGER : SHORTER;
992	NEXT();
993	break;
994	case `'+'`:
995	m = `1`;
996	n = DUPINF;
997	qprefer = (v->nextvalue) ? LONGER : SHORTER;
998	NEXT();
999	break;
1000	case `'?'`:
1001	m = `0`;
1002	n = `1`;
1003	qprefer = (v->nextvalue) ? LONGER : SHORTER;
1004	NEXT();
1005	break;
1006	case `'{'`:
1007	NEXT();
1008	m = scannum(v);
1009	if (EAT(`','`))
1010	{
1011	if (SEE(DIGIT))
1012	n = scannum(v);
1013	else
1014	n = DUPINF;
1015	if (m > n)
1016	{
1017	ERR(REG_BADBR);
1018	return;
1019	}
1020	/ {m,n} exercises preference, even if it's {m,m} /
1021	qprefer = (v->nextvalue) ? LONGER : SHORTER;
1022	}
1023	else
1024	{
1025	n = m;
1026	/ {m} passes operand's preference through /
1027	qprefer = `0`;
1028	}
1029	if (!SEE(`'}'`))
1030	{ / catches errors too /
1031	ERR(REG_BADBR);
1032	return;
1033	}
1034	NEXT();
1035	break;
1036	default: / no quantifier /
1037	m = n = `1`;
1038	qprefer = `0`;
1039	break;
1040	}
1041
1042	/ annoying special case: {0} or {0,0} cancels everything /
1043	if (m == `0` && n == `0`)
1044	{
1045	if (atom != NULL)
1046	freesubre(v, atom);
1047	if (atomtype == `'('`)
1048	v->subs[subno] = NULL;
1049	delsub(v->nfa, lp, rp);
1050	EMPTYARC(lp, rp);
1051	return;
1052	}
1053
1054	/ if not a messy case, avoid hard part /
1055	assert(!MESSY(top->flags));
1056	f = top->flags \| qprefer \| ((atom != NULL) ? atom->flags : `0`);
1057	if (atomtype != `'('` && atomtype != BACKREF && !MESSY(UP(f)))
1058	{
1059	if (!(m == `1` && n == `1`))
1060	repeat(v, lp, rp, m, n);
1061	if (atom != NULL)
1062	freesubre(v, atom);
1063	top->flags = f;
1064	return;
1065	}
1066
1067	/*
1068	* hard part: something messy
1069	*
1070	* That is, capturing parens, back reference, short/long clash, or an atom
1071	* with substructure containing one of those.
1072	*/
1073
1074	/ now we'll need a subre for the contents even if they're boring /
1075	if (atom == NULL)
1076	{
1077	atom = subre(v, `'='`, `0`, lp, rp);
1078	NOERR();
1079	}
1080
1081	/----------*
1082	* Prepare a general-purpose state skeleton.
1083	*
1084	* In the no-backrefs case, we want this:
1085	*
1086	* [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]
1087	*
1088	* where prefix is some repetitions of atom. In the general case we need
1089	*
1090	* [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
1091	*
1092	* where the iterator wraps around [begin] ---atom---> [end]
1093	*
1094	* We make the s state here for both cases; s2 is made below if needed
1095	*----------
1096	*/
1097	s = newstate(v->nfa); / first, new endpoints for the atom /
1098	s2 = newstate(v->nfa);
1099	NOERR();
1100	moveouts(v->nfa, lp, s);
1101	moveins(v->nfa, rp, s2);
1102	NOERR();
1103	atom->begin = s;
1104	atom->end = s2;
1105	s = newstate(v->nfa); / set up starting state /
1106	NOERR();
1107	EMPTYARC(lp, s);
1108	NOERR();
1109
1110	/ break remaining subRE into x{...} and what follows /
1111	t = subre(v, `'.'`, COMBINE(qprefer, atom->flags), lp, rp);
1112	NOERR();
1113	t->left = atom;
1114	atomp = &t->left;
1115
1116	/ here we should recurse... but we must postpone that to the end /
1117
1118	/ split top into prefix and remaining /
1119	assert(top->op == `'='` && top->left == NULL && top->right == NULL);
1120	top->left = subre(v, `'='`, top->flags, top->begin, lp);
1121	NOERR();
1122	top->op = `'.'`;
1123	top->right = t;
1124
1125	/ if it's a backref, now is the time to replicate the subNFA /
1126	if (atomtype == BACKREF)
1127	{
1128	assert(atom->begin->nouts == `1`); / just the EMPTY /
1129	delsub(v->nfa, atom->begin, atom->end);
1130	assert(v->subs[subno] != NULL);
1131
1132	/*
1133	* And here's why the recursion got postponed: it must wait until the
1134	* skeleton is filled in, because it may hit a backref that wants to
1135	* copy the filled-in skeleton.
1136	*/
1137	dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
1138	atom->begin, atom->end);
1139	NOERR();
1140	}
1141
1142	/*
1143	* It's quantifier time. If the atom is just a backref, we'll let it deal
1144	* with quantifiers internally.
1145	*/
1146	if (atomtype == BACKREF)
1147	{
1148	/ special case: backrefs have internal quantifiers /
1149	EMPTYARC(s, atom->begin); / empty prefix /
1150	/ just stuff everything into atom /
1151	repeat(v, atom->begin, atom->end, m, n);
1152	atom->min = (short) m;
1153	atom->max = (short) n;
1154	atom->flags \|= COMBINE(qprefer, atom->flags);
1155	/ rest of branch can be strung starting from atom->end /
1156	s2 = atom->end;
1157	}
1158	else if (m == `1` && n == `1` &&
1159	(qprefer == `0` \|\|
1160	(atom->flags & (LONGER \| SHORTER \| MIXED)) == `0` \|\|
1161	qprefer == (atom->flags & (LONGER \| SHORTER \| MIXED))))
1162	{
1163	/ no/vacuous quantifier: done /
1164	EMPTYARC(s, atom->begin); / empty prefix /
1165	/ rest of branch can be strung starting from atom->end /
1166	s2 = atom->end;
1167	}
1168	else if (m > `0` && !(atom->flags & BACKR))
1169	{
1170	/*
1171	* If there's no backrefs involved, we can turn x{m,n} into
1172	* x{m-1,n-1}x, with capturing parens in only the second x. This is
1173	* valid because we only care about capturing matches from the final
1174	* iteration of the quantifier. It's a win because we can implement
1175	* the backref-free left side as a plain DFA node, since we don't
1176	* really care where its submatches are.
1177	*/
1178	dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
1179	assert(m >= `1` && m != DUPINF && n >= `1`);
1180	repeat(v, s, atom->begin, m - `1`, (n == DUPINF) ? n : n - `1`);
1181	f = COMBINE(qprefer, atom->flags);
1182	t = subre(v, `'.'`, f, s, atom->end); / prefix and atom /
1183	NOERR();
1184	t->left = subre(v, `'='`, PREF(f), s, atom->begin);
1185	NOERR();
1186	t->right = atom;
1187	*atomp = t;
1188	/ rest of branch can be strung starting from atom->end /
1189	s2 = atom->end;
1190	}
1191	else
1192	{
1193	/ general case: need an iteration node /
1194	s2 = newstate(v->nfa);
1195	NOERR();
1196	moveouts(v->nfa, atom->end, s2);
1197	NOERR();
1198	dupnfa(v->nfa, atom->begin, atom->end, s, s2);
1199	repeat(v, s, s2, m, n);
1200	f = COMBINE(qprefer, atom->flags);
1201	t = subre(v, `'*'`, f, s, s2);
1202	NOERR();
1203	t->min = (short) m;
1204	t->max = (short) n;
1205	t->left = atom;
1206	*atomp = t;
1207	/ rest of branch is to be strung from iteration's end state /
1208	}
1209
1210	/ and finally, look after that postponed recursion /
1211	t = top->right;
1212	if (!(SEE(`'\|'`) \|\| SEE(stopper) \|\| SEE(EOS)))
1213	t->right = parsebranch(v, stopper, type, s2, rp, `1`);
1214	else
1215	{
1216	EMPTYARC(s2, rp);
1217	t->right = subre(v, `'='`, `0`, s2, rp);
1218	}
1219	NOERR();
1220	assert(SEE(`'\|'`) \|\| SEE(stopper) \|\| SEE(EOS));
1221	t->flags \|= COMBINE(t->flags, t->right->flags);
1222	top->flags \|= COMBINE(top->flags, t->flags);
1223	}
1224
1225	/*
1226	* nonword - generate arcs for non-word-character ahead or behind
1227	*/
1228	static void
1229	nonword(struct vars *v,
1230	int dir, / AHEAD or BEHIND /
1231	struct state *lp,
1232	struct state *rp)
1233	{
1234	int anchor = (dir == AHEAD) ? `'$'` : `'^'`;
1235
1236	assert(dir == AHEAD \|\| dir == BEHIND);
1237	newarc(v->nfa, anchor, `1`, lp, rp);
1238	newarc(v->nfa, anchor, `0`, lp, rp);
1239	colorcomplement(v->nfa, v->cm, dir, v->wordchrs, lp, rp);
1240	/ (no need for special attention to \n) /
1241	}
1242
1243	/*
1244	* word - generate arcs for word character ahead or behind
1245	*/
1246	static void
1247	word(struct vars *v,
1248	int dir, / AHEAD or BEHIND /
1249	struct state *lp,
1250	struct state *rp)
1251	{
1252	assert(dir == AHEAD \|\| dir == BEHIND);
1253	cloneouts(v->nfa, v->wordchrs, lp, rp, dir);
1254	/ (no need for special attention to \n) /
1255	}
1256
1257	/*
1258	* scannum - scan a number
1259	*/
1260	static int / value, <= DUPMAX /
1261	scannum(struct vars *v)
1262	{
1263	int n = `0`;
1264
1265	while (SEE(DIGIT) && n < DUPMAX)
1266	{
1267	n = n * `10` + v->nextvalue;
1268	NEXT();
1269	}
1270	if (SEE(DIGIT) \|\| n > DUPMAX)
1271	{
1272	ERR(REG_BADBR);
1273	return `0`;
1274	}
1275	return n;
1276	}
1277
1278	/*
1279	* repeat - replicate subNFA for quantifiers
1280	*
1281	* The sub-NFA strung from lp to rp is modified to represent m to n
1282	* repetitions of its initial contents.
1283	*
1284	* The duplication sequences used here are chosen carefully so that any
1285	* pointers starting out pointing into the subexpression end up pointing into
1286	* the last occurrence. (Note that it may not be strung between the same
1287	* left and right end states, however!) This used to be important for the
1288	* subRE tree, although the important bits are now handled by the in-line
1289	* code in parse(), and when this is called, it doesn't matter any more.
1290	*/
1291	static void
1292	repeat(struct vars *v,
1293	struct state *lp,
1294	struct state *rp,
1295	int m,
1296	int n)
1297	{
1298	#define SOME 2
1299	#define INF 3
1300	#define PAIR(x, y) ((x)*4 + (y))
1301	#define REDUCE(x) ( ((x) == DUPINF) ? INF : (((x) > 1) ? SOME : (x)) )
1302	const int rm = REDUCE(m);
1303	const int rn = REDUCE(n);
1304	struct state *s;
1305	struct state *s2;
1306
1307	switch (PAIR(rm, rn))
1308	{
1309	case PAIR(`0`, `0`): / empty string /
1310	delsub(v->nfa, lp, rp);
1311	EMPTYARC(lp, rp);
1312	break;
1313	case PAIR(`0`, `1`): / do as x\| /
1314	EMPTYARC(lp, rp);
1315	break;
1316	case PAIR(`0`, SOME): / do as x{1,n}\| /
1317	repeat(v, lp, rp, `1`, n);
1318	NOERR();
1319	EMPTYARC(lp, rp);
1320	break;
1321	case PAIR(`0`, INF): / loop x around /
1322	s = newstate(v->nfa);
1323	NOERR();
1324	moveouts(v->nfa, lp, s);
1325	moveins(v->nfa, rp, s);
1326	EMPTYARC(lp, s);
1327	EMPTYARC(s, rp);
1328	break;
1329	case PAIR(`1`, `1`): / no action required /
1330	break;
1331	case PAIR(`1`, SOME): / do as x{0,n-1}x = (x{1,n-1}\|)x /
1332	s = newstate(v->nfa);
1333	NOERR();
1334	moveouts(v->nfa, lp, s);
1335	dupnfa(v->nfa, s, rp, lp, s);
1336	NOERR();
1337	repeat(v, lp, s, `1`, n - `1`);
1338	NOERR();
1339	EMPTYARC(lp, s);
1340	break;
1341	case PAIR(`1`, INF): / add loopback arc /
1342	s = newstate(v->nfa);
1343	s2 = newstate(v->nfa);
1344	NOERR();
1345	moveouts(v->nfa, lp, s);
1346	moveins(v->nfa, rp, s2);
1347	EMPTYARC(lp, s);
1348	EMPTYARC(s2, rp);
1349	EMPTYARC(s2, s);
1350	break;
1351	case PAIR(SOME, SOME): / do as x{m-1,n-1}x /
1352	s = newstate(v->nfa);
1353	NOERR();
1354	moveouts(v->nfa, lp, s);
1355	dupnfa(v->nfa, s, rp, lp, s);
1356	NOERR();
1357	repeat(v, lp, s, m - `1`, n - `1`);
1358	break;
1359	case PAIR(SOME, INF): / do as x{m-1,}x /
1360	s = newstate(v->nfa);
1361	NOERR();
1362	moveouts(v->nfa, lp, s);
1363	dupnfa(v->nfa, s, rp, lp, s);
1364	NOERR();
1365	repeat(v, lp, s, m - `1`, n);
1366	break;
1367	default:
1368	ERR(REG_ASSERT);
1369	break;
1370	}
1371	}
1372
1373	/*
1374	* bracket - handle non-complemented bracket expression
1375	* Also called from cbracket for complemented bracket expressions.
1376	*/
1377	static void
1378	bracket(struct vars *v,
1379	struct state *lp,
1380	struct state *rp)
1381	{
1382	assert(SEE(`'['`));
1383	NEXT();
1384	while (!SEE(`']'`) && !SEE(EOS))
1385	brackpart(v, lp, rp);
1386	assert(SEE(`']'`) \|\| ISERR());
1387	okcolors(v->nfa, v->cm);
1388	}
1389
1390	/*
1391	* cbracket - handle complemented bracket expression
1392	* We do it by calling bracket() with dummy endpoints, and then complementing
1393	* the result. The alternative would be to invoke rainbow(), and then delete
1394	* arcs as the b.e. is seen... but that gets messy.
1395	*/
1396	static void
1397	cbracket(struct vars *v,
1398	struct state *lp,
1399	struct state *rp)
1400	{
1401	struct state *left = newstate(v->nfa);
1402	struct state *right = newstate(v->nfa);
1403
1404	NOERR();
1405	bracket(v, left, right);
1406	if (v->cflags & REG_NLSTOP)
1407	newarc(v->nfa, PLAIN, v->nlcolor, left, right);
1408	NOERR();
1409
1410	assert(lp->nouts == `0`); / all outarcs will be ours /
1411
1412	/*
1413	* Easy part of complementing, and all there is to do since the MCCE code
1414	* was removed.
1415	*/
1416	colorcomplement(v->nfa, v->cm, PLAIN, left, lp, rp);
1417	NOERR();
1418	dropstate(v->nfa, left);
1419	assert(right->nins == `0`);
1420	freestate(v->nfa, right);
1421	}
1422
1423	/*
1424	* brackpart - handle one item (or range) within a bracket expression
1425	*/
1426	static void
1427	brackpart(struct vars *v,
1428	struct state *lp,
1429	struct state *rp)
1430	{
1431	chr startc;
1432	chr endc;
1433	struct cvec *cv;
1434	const chr *startp;
1435	const chr *endp;
1436	chr c[`1`];
1437
1438	/ parse something, get rid of special cases, take shortcuts /
1439	switch (v->nexttype)
1440	{
1441	case RANGE: / a-b-c or other botch /
1442	ERR(REG_ERANGE);
1443	return;
1444	break;
1445	case PLAIN:
1446	c[`0`] = v->nextvalue;
1447	NEXT();
1448	/ shortcut for ordinary chr (not range) /
1449	if (!SEE(RANGE))
1450	{
1451	onechr(v, c[`0`], lp, rp);
1452	return;
1453	}
1454	startc = element(v, c, c + `1`);
1455	NOERR();
1456	break;
1457	case COLLEL:
1458	startp = v->now;
1459	endp = scanplain(v);
1460	INSIST(startp < endp, REG_ECOLLATE);
1461	NOERR();
1462	startc = element(v, startp, endp);
1463	NOERR();
1464	break;
1465	case ECLASS:
1466	startp = v->now;
1467	endp = scanplain(v);
1468	INSIST(startp < endp, REG_ECOLLATE);
1469	NOERR();
1470	startc = element(v, startp, endp);
1471	NOERR();
1472	cv = eclass(v, startc, (v->cflags & REG_ICASE));
1473	NOERR();
1474	subcolorcvec(v, cv, lp, rp);
1475	return;
1476	break;
1477	case CCLASS:
1478	startp = v->now;
1479	endp = scanplain(v);
1480	INSIST(startp < endp, REG_ECTYPE);
1481	NOERR();
1482	cv = cclass(v, startp, endp, (v->cflags & REG_ICASE));
1483	NOERR();
1484	subcolorcvec(v, cv, lp, rp);
1485	return;
1486	break;
1487	default:
1488	ERR(REG_ASSERT);
1489	return;
1490	break;
1491	}
1492
1493	if (SEE(RANGE))
1494	{
1495	NEXT();
1496	switch (v->nexttype)
1497	{
1498	case PLAIN:
1499	case RANGE:
1500	c[`0`] = v->nextvalue;
1501	NEXT();
1502	endc = element(v, c, c + `1`);
1503	NOERR();
1504	break;
1505	case COLLEL:
1506	startp = v->now;
1507	endp = scanplain(v);
1508	INSIST(startp < endp, REG_ECOLLATE);
1509	NOERR();
1510	endc = element(v, startp, endp);
1511	NOERR();
1512	break;
1513	default:
1514	ERR(REG_ERANGE);
1515	return;
1516	break;
1517	}
1518	}
1519	else
1520	endc = startc;
1521
1522	/*
1523	* Ranges are unportable. Actually, standard C does guarantee that digits
1524	* are contiguous, but making that an exception is just too complicated.
1525	*/
1526	if (startc != endc)
1527	NOTE(REG_UUNPORT);
1528	cv = range(v, startc, endc, (v->cflags & REG_ICASE));
1529	NOERR();
1530	subcolorcvec(v, cv, lp, rp);
1531	}
1532
1533	/*
1534	* scanplain - scan PLAIN contents of [. etc.
1535	*
1536	* Certain bits of trickery in lex.c know that this code does not try
1537	* to look past the final bracket of the [. etc.
1538	*/
1539	static const chr * / just after end of sequence /
1540	scanplain(struct vars *v)
1541	{
1542	const chr *endp;
1543
1544	assert(SEE(COLLEL) \|\| SEE(ECLASS) \|\| SEE(CCLASS));
1545	NEXT();
1546
1547	endp = v->now;
1548	while (SEE(PLAIN))
1549	{
1550	endp = v->now;
1551	NEXT();
1552	}
1553
1554	assert(SEE(END) \|\| ISERR());
1555	NEXT();
1556
1557	return endp;
1558	}
1559
1560	/*
1561	* onechr - fill in arcs for a plain character, and possible case complements
1562	* This is mostly a shortcut for efficient handling of the common case.
1563	*/
1564	static void
1565	onechr(struct vars *v,
1566	chr c,
1567	struct state *lp,
1568	struct state *rp)
1569	{
1570	if (!(v->cflags & REG_ICASE))
1571	{
1572	color lastsubcolor = COLORLESS;
1573
1574	subcoloronechr(v, c, lp, rp, &lastsubcolor);
1575	return;
1576	}
1577
1578	/ rats, need general case anyway... /
1579	subcolorcvec(v, allcases(v, c), lp, rp);
1580	}
1581
1582	/*
1583	* wordchrs - set up word-chr list for word-boundary stuff, if needed
1584	*
1585	* The list is kept as a bunch of arcs between two dummy states; it's
1586	* disposed of by the unreachable-states sweep in NFA optimization.
1587	* Does NEXT(). Must not be called from any unusual lexical context.
1588	* This should be reconciled with the \w etc. handling in lex.c, and
1589	* should be cleaned up to reduce dependencies on input scanning.
1590	*/
1591	static void
1592	wordchrs(struct vars *v)
1593	{
1594	struct state *left;
1595	struct state *right;
1596
1597	if (v->wordchrs != NULL)
1598	{
1599	NEXT(); / for consistency /
1600	return;
1601	}
1602
1603	left = newstate(v->nfa);
1604	right = newstate(v->nfa);
1605	NOERR();
1606	/ fine point: implemented with [::], and lexer will set REG_ULOCALE /
1607	lexword(v);
1608	NEXT();
1609	assert(v->savenow != NULL && SEE(`'['`));
1610	bracket(v, left, right);
1611	assert((v->savenow != NULL && SEE(`']'`)) \|\| ISERR());
1612	NEXT();
1613	NOERR();
1614	v->wordchrs = left;
1615	}
1616
1617	/*
1618	* processlacon - generate the NFA representation of a LACON
1619	*
1620	* In the general case this is just newlacon() + newarc(), but some cases
1621	* can be optimized.
1622	*/
1623	static void
1624	processlacon(struct vars *v,
1625	struct state begin, /* start of parsed LACON sub-re /
1626	struct state end, /* end of parsed LACON sub-re /
1627	int latype,
1628	struct state lp, /* left state to hang it on /
1629	struct state rp) /* right state to hang it on /
1630	{
1631	struct state *s1;
1632	int n;
1633
1634	/*
1635	* Check for lookaround RE consisting of a single plain color arc (or set
1636	* of arcs); this would typically be a simple chr or a bracket expression.
1637	*/
1638	s1 = single_color_transition(begin, end);
1639	switch (latype)
1640	{
1641	case LATYPE_AHEAD_POS:
1642	/ If lookahead RE is just colorset C, convert to AHEAD(C) /
1643	if (s1 != NULL)
1644	{
1645	cloneouts(v->nfa, s1, lp, rp, AHEAD);
1646	return;
1647	}
1648	break;
1649	case LATYPE_AHEAD_NEG:
1650	/ If lookahead RE is just colorset C, convert to AHEAD(^C)\|$ /
1651	if (s1 != NULL)
1652	{
1653	colorcomplement(v->nfa, v->cm, AHEAD, s1, lp, rp);
1654	newarc(v->nfa, `'$'`, `1`, lp, rp);
1655	newarc(v->nfa, `'$'`, `0`, lp, rp);
1656	return;
1657	}
1658	break;
1659	case LATYPE_BEHIND_POS:
1660	/ If lookbehind RE is just colorset C, convert to BEHIND(C) /
1661	if (s1 != NULL)
1662	{
1663	cloneouts(v->nfa, s1, lp, rp, BEHIND);
1664	return;
1665	}
1666	break;
1667	case LATYPE_BEHIND_NEG:
1668	/ If lookbehind RE is just colorset C, convert to BEHIND(^C)\|^ /
1669	if (s1 != NULL)
1670	{
1671	colorcomplement(v->nfa, v->cm, BEHIND, s1, lp, rp);
1672	newarc(v->nfa, `'^'`, `1`, lp, rp);
1673	newarc(v->nfa, `'^'`, `0`, lp, rp);
1674	return;
1675	}
1676	break;
1677	default:
1678	assert(NOTREACHED);
1679	}
1680
1681	/ General case: we need a LACON subre and arc /
1682	n = newlacon(v, begin, end, latype);
1683	newarc(v->nfa, LACON, n, lp, rp);
1684	}
1685
1686	/*
1687	* subre - allocate a subre
1688	*/
1689	static struct subre *
1690	subre(struct vars *v,
1691	int op,
1692	int flags,
1693	struct state *begin,
1694	struct state *end)
1695	{
1696	struct subre *ret = v->treefree;
1697
1698	/*
1699	* Checking for stack overflow here is sufficient to protect parse() and
1700	* its recursive subroutines.
1701	*/
1702	if (STACK_TOO_DEEP(v->re))
1703	{
1704	ERR(REG_ETOOBIG);
1705	return NULL;
1706	}
1707
1708	if (ret != NULL)
1709	v->treefree = ret->left;
1710	else
1711	{
1712	ret = (struct subre ) MALLOC(sizeof(struct* subre));
1713	if (ret == NULL)
1714	{
1715	ERR(REG_ESPACE);
1716	return NULL;
1717	}
1718	ret->chain = v->treechain;
1719	v->treechain = ret;
1720	}
1721
1722	assert(strchr("=b\|.*(", op) != NULL);
1723
1724	ret->op = op;
1725	ret->flags = flags;
1726	ret->id = `0`; / will be assigned later /
1727	ret->subno = `0`;
1728	ret->min = ret->max = `1`;
1729	ret->left = NULL;
1730	ret->right = NULL;
1731	ret->begin = begin;
1732	ret->end = end;
1733	ZAPCNFA(ret->cnfa);
1734
1735	return ret;
1736	}
1737
1738	/*
1739	* freesubre - free a subRE subtree
1740	*/
1741	static void
1742	freesubre(struct vars v, /* might be NULL /
1743	struct subre *sr)
1744	{
1745	if (sr == NULL)
1746	return;
1747
1748	if (sr->left != NULL)
1749	freesubre(v, sr->left);
1750	if (sr->right != NULL)
1751	freesubre(v, sr->right);
1752
1753	freesrnode(v, sr);
1754	}
1755
1756	/*
1757	* freesrnode - free one node in a subRE subtree
1758	*/
1759	static void
1760	freesrnode(struct vars v, /* might be NULL /
1761	struct subre *sr)
1762	{
1763	if (sr == NULL)
1764	return;
1765
1766	if (!NULLCNFA(sr->cnfa))
1767	freecnfa(&sr->cnfa);
1768	sr->flags = `0`;
1769
1770	if (v != NULL && v->treechain != NULL)
1771	{
1772	/ we're still parsing, maybe we can reuse the subre /
1773	sr->left = v->treefree;
1774	v->treefree = sr;
1775	}
1776	else
1777	FREE(sr);
1778	}
1779
/*
 * optst - optimize a subRE subtree
 *
 * Currently a deliberate no-op.
 */
static void
optst(struct vars *v,
      struct subre *t)
{
    /*
     * DGP (2007-11-13): I assume it was the programmer's intent to
     * eventually come back and add code to optimize subRE trees, but the
     * routine coded just spends effort traversing the tree and doing
     * nothing.  We can do nothing with less effort.
     */
    (void) v;
    (void) t;
}
1795
1796	/*
1797	* numst - number tree nodes (assigning "id" indexes)
1798	*/
1799	static int / next number /
1800	numst(struct subre *t,
1801	int start) / starting point for subtree numbers /
1802	{
1803	int i;
1804
1805	assert(t != NULL);
1806
1807	i = start;
1808	t->id = (short) i++;
1809	if (t->left != NULL)
1810	i = numst(t->left, i);
1811	if (t->right != NULL)
1812	i = numst(t->right, i);
1813	return i;
1814	}
1815
1816	/*
1817	* markst - mark tree nodes as INUSE
1818	*
1819	* Note: this is a great deal more subtle than it looks. During initial
1820	* parsing of a regex, all subres are linked into the treechain list;
1821	* discarded ones are also linked into the treefree list for possible reuse.
1822	* After we are done creating all subres required for a regex, we run markst()
1823	* then cleanst(), which results in discarding all subres not reachable from
1824	* v->tree. We then clear v->treechain, indicating that subres must be found
1825	* by descending from v->tree. This changes the behavior of freesubre(): it
1826	* will henceforth FREE() unwanted subres rather than sticking them into the
1827	* treefree list. (Doing that any earlier would result in dangling links in
1828	* the treechain list.) This all means that freev() will clean up correctly
1829	* if invoked before or after markst()+cleanst(); but it would not work if
1830	* called partway through this state conversion, so we mustn't error out
1831	* in or between these two functions.
1832	*/
1833	static void
1834	markst(struct subre *t)
1835	{
1836	assert(t != NULL);
1837
1838	t->flags \|= INUSE;
1839	if (t->left != NULL)
1840	markst(t->left);
1841	if (t->right != NULL)
1842	markst(t->right);
1843	}
1844
1845	/*
1846	* cleanst - free any tree nodes not marked INUSE
1847	*/
1848	static void
1849	cleanst(struct vars *v)
1850	{
1851	struct subre *t;
1852	struct subre *next;
1853
1854	for (t = v->treechain; t != NULL; t = next)
1855	{
1856	next = t->chain;
1857	if (!(t->flags & INUSE))
1858	FREE(t);
1859	}
1860	v->treechain = NULL;
1861	v->treefree = NULL; / just on general principles /
1862	}
1863
1864	/*
1865	* nfatree - turn a subRE subtree into a tree of compacted NFAs
1866	*/
1867	static long / optimize results from top node /
1868	nfatree(struct vars *v,
1869	struct subre *t,
1870	FILE f) /* for debug output /
1871	{
1872	assert(t != NULL && t->begin != NULL);
1873
1874	if (t->left != NULL)
1875	(DISCARD) nfatree(v, t->left, f);
1876	if (t->right != NULL)
1877	(DISCARD) nfatree(v, t->right, f);
1878
1879	return nfanode(v, t, `0`, f);
1880	}
1881
1882	/*
1883	* nfanode - do one NFA for nfatree or lacons
1884	*
1885	* If converttosearch is true, apply makesearch() to the NFA.
1886	*/
1887	static long / optimize results /
1888	nfanode(struct vars *v,
1889	struct subre *t,
1890	int converttosearch,
1891	FILE f) /* for debug output /
1892	{
1893	struct nfa *nfa;
1894	long ret = `0`;
1895
1896	assert(t->begin != NULL);
1897
1898	#ifdef REG_DEBUG
1899	if (f != NULL)
1900	{
1901	char idbuf[`50`];
1902
1903	fprintf(f, "\n\n\n========= TREE NODE %s ==========\n",
1904	stid(t, idbuf, sizeof(idbuf)));
1905	}
1906	#endif
1907	nfa = newnfa(v, v->cm, v->nfa);
1908	NOERRZ();
1909	dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final);
1910	if (!ISERR())
1911	specialcolors(nfa);
1912	if (!ISERR())
1913	ret = optimize(nfa, f);
1914	if (converttosearch && !ISERR())
1915	makesearch(v, nfa);
1916	if (!ISERR())
1917	compact(nfa, &t->cnfa);
1918
1919	freenfa(nfa);
1920	return ret;
1921	}
1922
1923	/*
1924	* newlacon - allocate a lookaround-constraint subRE
1925	*/
1926	static int / lacon number /
1927	newlacon(struct vars *v,
1928	struct state *begin,
1929	struct state *end,
1930	int latype)
1931	{
1932	int n;
1933	struct subre *newlacons;
1934	struct subre *sub;
1935
1936	if (v->nlacons == `0`)
1937	{
1938	n = `1`; / skip 0th /
1939	newlacons = (struct subre ) MALLOC(`2` sizeof(struct subre));
1940	}
1941	else
1942	{
1943	n = v->nlacons;
1944	newlacons = (struct subre *) REALLOC(v->lacons,
1945	(n + `1`) * sizeof(struct subre));
1946	}
1947	if (newlacons == NULL)
1948	{
1949	ERR(REG_ESPACE);
1950	return `0`;
1951	}
1952	v->lacons = newlacons;
1953	v->nlacons = n + `1`;
1954	sub = &v->lacons[n];
1955	sub->begin = begin;
1956	sub->end = end;
1957	sub->subno = latype;
1958	ZAPCNFA(sub->cnfa);
1959	return n;
1960	}
1961
1962	/*
1963	* freelacons - free lookaround-constraint subRE vector
1964	*/
1965	static void
1966	freelacons(struct subre *subs,
1967	int n)
1968	{
1969	struct subre *sub;
1970	int i;
1971
1972	assert(n > `0`);
1973	for (sub = subs + `1`, i = n - `1`; i > `0`; sub++, i--) / no 0th /
1974	if (!NULLCNFA(sub->cnfa))
1975	freecnfa(&sub->cnfa);
1976	FREE(subs);
1977	}
1978
1979	/*
1980	* rfree - free a whole RE (insides of regfree)
1981	*/
1982	static void
1983	rfree(regex_t *re)
1984	{
1985	struct guts *g;
1986
1987	if (re == NULL \|\| re->re_magic != REMAGIC)
1988	return;
1989
1990	re->re_magic = `0`; / invalidate RE /
1991	g = (struct guts *) re->re_guts;
1992	re->re_guts = NULL;
1993	re->re_fns = NULL;
1994	if (g != NULL)
1995	{
1996	g->magic = `0`;
1997	freecm(&g->cmap);
1998	if (g->tree != NULL)
1999	freesubre((struct vars *) NULL, g->tree);
2000	if (g->lacons != NULL)
2001	freelacons(g->lacons, g->nlacons);
2002	if (!NULLCNFA(g->search))
2003	freecnfa(&g->search);
2004	FREE(g);
2005	}
2006	}
2007
2008	/*
2009	* rcancelrequested - check for external request to cancel regex operation
2010	*
2011	* Return nonzero to fail the operation with error code REG_CANCEL,
2012	* zero to keep going
2013	*
2014	* The current implementation is Postgres-specific. If we ever get around
2015	* to splitting the regex code out as a standalone library, there will need
2016	* to be some API to let applications define a callback function for this.
2017	*/
2018	static int
2019	rcancelrequested(void)
2020	{
2021	return InterruptPending && (QueryCancelPending \|\| ProcDiePending);
2022	}
2023
/*
 * rstacktoodeep - check for stack getting dangerously deep
 *
 * Return nonzero to fail the operation with error code REG_ETOOBIG,
 * zero to keep going
 *
 * The current implementation is Postgres-specific.  If we ever get around
 * to splitting the regex code out as a standalone library, there will need
 * to be some API to let applications define a callback function for this.
 */
static int
rstacktoodeep(void)
{
    int         toodeep = stack_is_too_deep();

    return toodeep;
}
2039
2040	#ifdef REG_DEBUG
2041
2042	/*
2043	* dump - dump an RE in human-readable form
2044	*/
2045	static void
2046	dump(regex_t *re,
2047	FILE *f)
2048	{
2049	struct guts *g;
2050	int i;
2051
2052	if (re->re_magic != REMAGIC)
2053	fprintf(f, "bad magic number (0x%x not 0x%x)\n", re->re_magic,
2054	REMAGIC);
2055	if (re->re_guts == NULL)
2056	{
2057	fprintf(f, "NULL guts!!!\n");
2058	return;
2059	}
2060	g = (struct guts *) re->re_guts;
2061	if (g->magic != GUTSMAGIC)
2062	fprintf(f, "bad guts magic number (0x%x not 0x%x)\n", g->magic,
2063	GUTSMAGIC);
2064
2065	fprintf(f, "\n\n\n========= DUMP ==========\n");
2066	fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n",
2067	(int) re->re_nsub, re->re_info, re->re_csize, g->ntree);
2068
2069	dumpcolors(&g->cmap, f);
2070	if (!NULLCNFA(g->search))
2071	{
2072	fprintf(f, "\nsearch:\n");
2073	dumpcnfa(&g->search, f);
2074	}
2075	for (i = `1`; i < g->nlacons; i++)
2076	{
2077	struct subre *lasub = &g->lacons[i];
2078	const char *latype;
2079
2080	switch (lasub->subno)
2081	{
2082	case LATYPE_AHEAD_POS:
2083	latype = "positive lookahead";
2084	break;
2085	case LATYPE_AHEAD_NEG:
2086	latype = "negative lookahead";
2087	break;
2088	case LATYPE_BEHIND_POS:
2089	latype = "positive lookbehind";
2090	break;
2091	case LATYPE_BEHIND_NEG:
2092	latype = "negative lookbehind";
2093	break;
2094	default:
2095	latype = "???";
2096	break;
2097	}
2098	fprintf(f, "\nla%d (%s):\n", i, latype);
2099	dumpcnfa(&lasub->cnfa, f);
2100	}
2101	fprintf(f, "\n");
2102	dumpst(g->tree, f, `0`);
2103	}
2104
/*
 * dumpst - dump a subRE tree
 *
 * Prints "null tree" for a NULL tree, otherwise delegates to stdump();
 * the stream is flushed either way.
 */
static void
dumpst(struct subre *t,
       FILE *f,
       int nfapresent)          /* is the original NFA still around? */
{
    if (t != NULL)
        stdump(t, f, nfapresent);
    else
        fprintf(f, "null tree\n");

    fflush(f);
}
2119
2120	/*
2121	* stdump - recursive guts of dumpst
2122	*/
2123	static void
2124	stdump(struct subre *t,
2125	FILE *f,
2126	int nfapresent) / is the original NFA still around? /
2127	{
2128	char idbuf[`50`];
2129
2130	fprintf(f, "%s. `%c'", stid(t, idbuf, sizeof(idbuf)), t->op);
2131	if (t->flags & LONGER)
2132	fprintf(f, " longest");
2133	if (t->flags & SHORTER)
2134	fprintf(f, " shortest");
2135	if (t->flags & MIXED)
2136	fprintf(f, " hasmixed");
2137	if (t->flags & CAP)
2138	fprintf(f, " hascapture");
2139	if (t->flags & BACKR)
2140	fprintf(f, " hasbackref");
2141	if (!(t->flags & INUSE))
2142	fprintf(f, " UNUSED");
2143	if (t->subno != `0`)
2144	fprintf(f, " (#%d)", t->subno);
2145	if (t->min != `1` \|\| t->max != `1`)
2146	{
2147	fprintf(f, " {%d,", t->min);
2148	if (t->max != DUPINF)
2149	fprintf(f, "%d", t->max);
2150	fprintf(f, "}");
2151	}
2152	if (nfapresent)
2153	fprintf(f, " %ld-%ld", (long) t->begin->no, (long) t->end->no);
2154	if (t->left != NULL)
2155	fprintf(f, " L:%s", stid(t->left, idbuf, sizeof(idbuf)));
2156	if (t->right != NULL)
2157	fprintf(f, " R:%s", stid(t->right, idbuf, sizeof(idbuf)));
2158	if (!NULLCNFA(t->cnfa))
2159	{
2160	fprintf(f, "\n");
2161	dumpcnfa(&t->cnfa, f);
2162	}
2163	fprintf(f, "\n");
2164	if (t->left != NULL)
2165	stdump(t->left, f, nfapresent);
2166	if (t->right != NULL)
2167	stdump(t->right, f, nfapresent);
2168	}
2169
2170	/*
2171	* stid - identify a subtree node for dumping
2172	*/
2173	static const char * / points to buf or constant string /
2174	stid(struct subre *t,
2175	char *buf,
2176	size_t bufsize)
2177	{
2178	/ big enough for hex int or decimal t->id? /
2179	if (bufsize < sizeof(void ) `2` + `3` \|\| bufsize < sizeof(t->id) * `3` + `1`)
2180	return "unable";
2181	if (t->id != `0`)
2182	sprintf(buf, "%d", t->id);
2183	else
2184	sprintf(buf, "%p", t);
2185	return buf;
2186	}
2187	#endif /* REG_DEBUG */
2188
2189
2190	#include "regc_lex.c"
2191	#include "regc_color.c"
2192	#include "regc_nfa.c"
2193	#include "regc_cvec.c"
2194	#include "regc_pg_locale.c"
2195	#include "regc_locale.c"
2196

Browse the source code of PostgreSQL/src/backend/regex/regcomp.c