regc_lex.c source code [PostgreSQL/src/backend/regex/regc_lex.c]

1	/*
2	* lexical analyzer
3	* This file is #included by regcomp.c.
4	*
5	* Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
6	*
7	* Development of this software was funded, in part, by Cray Research Inc.,
8	* UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9	* Corporation, none of whom are responsible for the results. The author
10	* thanks all of them.
11	*
12	* Redistribution and use in source and binary forms -- with or without
13	* modification -- are permitted for any purpose, provided that
14	* redistributions in source form retain this entire copyright notice and
15	* indicate the origin and nature of any modifications.
16	*
17	* I'd appreciate being given credit for this package in the documentation
18	* of software which uses it, but that is not a requirement.
19	*
20	* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21	* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22	* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23	* HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26	* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27	* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28	* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29	* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30	*
31	* src/backend/regex/regc_lex.c
32	*
33	*/
34
35	/ scanning macros (know about v) /
36	#define ATEOS() (v->now >= v->stop)
37	#define HAVE(n) (v->stop - v->now >= (n))
38	#define NEXT1(c) (!ATEOS() && *v->now == CHR(c))
39	#define NEXT2(a,b) (HAVE(2) && v->now == CHR(a) && (v->now+1) == CHR(b))
40	#define NEXT3(a,b,c) (HAVE(3) && *v->now == CHR(a) && \
41	*(v->now+1) == CHR(b) && \
42	*(v->now+2) == CHR(c))
43	#define SET(c) (v->nexttype = (c))
44	#define SETV(c, n) (v->nexttype = (c), v->nextvalue = (n))
45	#define RET(c) return (SET(c), 1)
46	#define RETV(c, n) return (SETV(c, n), 1)
47	#define FAILW(e) return (ERR(e), 0) /* ERR does SET(EOS) */
48	#define LASTTYPE(t) (v->lasttype == (t))
49
50	/ lexical contexts /
51	#define L_ERE 1 /* mainline ERE/ARE */
52	#define L_BRE 2 /* mainline BRE */
53	#define L_Q 3 /* REG_QUOTE */
54	#define L_EBND 4 /* ERE/ARE bound */
55	#define L_BBND 5 /* BRE bound */
56	#define L_BRACK 6 /* brackets */
57	#define L_CEL 7 /* collating element */
58	#define L_ECL 8 /* equivalence class */
59	#define L_CCL 9 /* character class */
60	#define INTOCON(c) (v->lexcon = (c))
61	#define INCON(con) (v->lexcon == (con))
62
63	/ construct pointer past end of chr array /
64	#define ENDOF(array) ((array) + sizeof(array)/sizeof(chr))
65
66	/*
67	* lexstart - set up lexical stuff, scan leading options
68	*/
69	static void
70	lexstart(struct vars *v)
71	{
72	prefixes(v); / may turn on new type bits etc. /
73	NOERR();
74
75	if (v->cflags & REG_QUOTE)
76	{
77	assert(!(v->cflags & (REG_ADVANCED \| REG_EXPANDED \| REG_NEWLINE)));
78	INTOCON(L_Q);
79	}
80	else if (v->cflags & REG_EXTENDED)
81	{
82	assert(!(v->cflags & REG_QUOTE));
83	INTOCON(L_ERE);
84	}
85	else
86	{
87	assert(!(v->cflags & (REG_QUOTE \| REG_ADVF)));
88	INTOCON(L_BRE);
89	}
90
91	v->nexttype = EMPTY; / remember we were at the start /
92	next(v); / set up the first token /
93	}
94
95	/*
96	* prefixes - implement various special prefixes
97	*/
98	static void
99	prefixes(struct vars *v)
100	{
101	/ literal string doesn't get any of this stuff /
102	if (v->cflags & REG_QUOTE)
103	return;
104
105	/ initial "**" gets special things /*
106	if (HAVE(`4`) && NEXT3(`''`, `''`, `'*'`))
107	switch (*(v->now + `3`))
108	{
109	case CHR(`'?'`): / "**?" error, msg shows version /*
110	ERR(REG_BADPAT);
111	return; / proceed no further /
112	break;
113	case CHR(`'='`): / "**=" shifts to literal string /*
114	NOTE(REG_UNONPOSIX);
115	v->cflags \|= REG_QUOTE;
116	v->cflags &= ~(REG_ADVANCED \| REG_EXPANDED \| REG_NEWLINE);
117	v->now += `4`;
118	return; / and there can be no more prefixes /
119	break;
120	case CHR(`':'`): / "**:" shifts to AREs /*
121	NOTE(REG_UNONPOSIX);
122	v->cflags \|= REG_ADVANCED;
123	v->now += `4`;
124	break;
125	default: / otherwise *** is just an error /
126	ERR(REG_BADRPT);
127	return;
128	break;
129	}
130
131	/ BREs and EREs don't get embedded options /
132	if ((v->cflags & REG_ADVANCED) != REG_ADVANCED)
133	return;
134
135	/ embedded options (AREs only) /
136	if (HAVE(`3`) && NEXT2(`'('`, `'?'`) && iscalpha(*(v->now + `2`)))
137	{
138	NOTE(REG_UNONPOSIX);
139	v->now += `2`;
140	for (; !ATEOS() && iscalpha(*v->now); v->now++)
141	switch (*v->now)
142	{
143	case CHR(`'b'`): / BREs (but why???) /
144	v->cflags &= ~(REG_ADVANCED \| REG_QUOTE);
145	break;
146	case CHR(`'c'`): / case sensitive /
147	v->cflags &= ~REG_ICASE;
148	break;
149	case CHR(`'e'`): / plain EREs /
150	v->cflags \|= REG_EXTENDED;
151	v->cflags &= ~(REG_ADVF \| REG_QUOTE);
152	break;
153	case CHR(`'i'`): / case insensitive /
154	v->cflags \|= REG_ICASE;
155	break;
156	case CHR(`'m'`): / Perloid synonym for n /
157	case CHR(`'n'`): / \n affects ^ $ . [^ /
158	v->cflags \|= REG_NEWLINE;
159	break;
160	case CHR(`'p'`): / ~Perl, \n affects . [^ /
161	v->cflags \|= REG_NLSTOP;
162	v->cflags &= ~REG_NLANCH;
163	break;
164	case CHR(`'q'`): / literal string /
165	v->cflags \|= REG_QUOTE;
166	v->cflags &= ~REG_ADVANCED;
167	break;
168	case CHR(`'s'`): / single line, \n ordinary /
169	v->cflags &= ~REG_NEWLINE;
170	break;
171	case CHR(`'t'`): / tight syntax /
172	v->cflags &= ~REG_EXPANDED;
173	break;
174	case CHR(`'w'`): / weird, \n affects ^ $ only /
175	v->cflags &= ~REG_NLSTOP;
176	v->cflags \|= REG_NLANCH;
177	break;
178	case CHR(`'x'`): / expanded syntax /
179	v->cflags \|= REG_EXPANDED;
180	break;
181	default:
182	ERR(REG_BADOPT);
183	return;
184	}
185	if (!NEXT1(`')'`))
186	{
187	ERR(REG_BADOPT);
188	return;
189	}
190	v->now++;
191	if (v->cflags & REG_QUOTE)
192	v->cflags &= ~(REG_EXPANDED \| REG_NEWLINE);
193	}
194	}
195
196	/*
197	* lexnest - "call a subroutine", interpolating string at the lexical level
198	*
199	* Note, this is not a very general facility. There are a number of
200	* implicit assumptions about what sorts of strings can be subroutines.
201	*/
202	static void
203	lexnest(struct vars *v,
204	const chr beginp, /* start of interpolation /
205	const chr endp) /* one past end of interpolation /
206	{
207	assert(v->savenow == NULL); / only one level of nesting /
208	v->savenow = v->now;
209	v->savestop = v->stop;
210	v->now = beginp;
211	v->stop = endp;
212	}
213
214	/*
215	* string constants to interpolate as expansions of things like \d
216	*/
217	static const chr backd[] = { / \d /
218	CHR(`'['`), CHR(`'['`), CHR(`':'`),
219	CHR(`'d'`), CHR(`'i'`), CHR(`'g'`), CHR(`'i'`), CHR(`'t'`),
220	CHR(`':'`), CHR(`']'`), CHR(`']'`)
221	};
222	static const chr backD[] = { / \D /
223	CHR(`'['`), CHR(`'^'`), CHR(`'['`), CHR(`':'`),
224	CHR(`'d'`), CHR(`'i'`), CHR(`'g'`), CHR(`'i'`), CHR(`'t'`),
225	CHR(`':'`), CHR(`']'`), CHR(`']'`)
226	};
227	static const chr brbackd[] = { / \d within brackets /
228	CHR(`'['`), CHR(`':'`),
229	CHR(`'d'`), CHR(`'i'`), CHR(`'g'`), CHR(`'i'`), CHR(`'t'`),
230	CHR(`':'`), CHR(`']'`)
231	};
232	static const chr backs[] = { / \s /
233	CHR(`'['`), CHR(`'['`), CHR(`':'`),
234	CHR(`'s'`), CHR(`'p'`), CHR(`'a'`), CHR(`'c'`), CHR(`'e'`),
235	CHR(`':'`), CHR(`']'`), CHR(`']'`)
236	};
237	static const chr backS[] = { / \S /
238	CHR(`'['`), CHR(`'^'`), CHR(`'['`), CHR(`':'`),
239	CHR(`'s'`), CHR(`'p'`), CHR(`'a'`), CHR(`'c'`), CHR(`'e'`),
240	CHR(`':'`), CHR(`']'`), CHR(`']'`)
241	};
242	static const chr brbacks[] = { / \s within brackets /
243	CHR(`'['`), CHR(`':'`),
244	CHR(`'s'`), CHR(`'p'`), CHR(`'a'`), CHR(`'c'`), CHR(`'e'`),
245	CHR(`':'`), CHR(`']'`)
246	};
247	static const chr backw[] = { / \w /
248	CHR(`'['`), CHR(`'['`), CHR(`':'`),
249	CHR(`'a'`), CHR(`'l'`), CHR(`'n'`), CHR(`'u'`), CHR(`'m'`),
250	CHR(`':'`), CHR(`']'`), CHR(`'_'`), CHR(`']'`)
251	};
252	static const chr backW[] = { / \W /
253	CHR(`'['`), CHR(`'^'`), CHR(`'['`), CHR(`':'`),
254	CHR(`'a'`), CHR(`'l'`), CHR(`'n'`), CHR(`'u'`), CHR(`'m'`),
255	CHR(`':'`), CHR(`']'`), CHR(`'_'`), CHR(`']'`)
256	};
257	static const chr brbackw[] = { / \w within brackets /
258	CHR(`'['`), CHR(`':'`),
259	CHR(`'a'`), CHR(`'l'`), CHR(`'n'`), CHR(`'u'`), CHR(`'m'`),
260	CHR(`':'`), CHR(`']'`), CHR(`'_'`)
261	};
262
263	/*
264	* lexword - interpolate a bracket expression for word characters
265	* Possibly ought to inquire whether there is a "word" character class.
266	*/
267	static void
268	lexword(struct vars *v)
269	{
270	lexnest(v, backw, ENDOF(backw));
271	}
272
273	/*
274	* next - get next token
275	*/
276	static int / 1 normal, 0 failure /
277	next(struct vars *v)
278	{
279	chr c;
280
281	/ errors yield an infinite sequence of failures /
282	if (ISERR())
283	return `0`; / the error has set nexttype to EOS /
284
285	/ remember flavor of last token /
286	v->lasttype = v->nexttype;
287
288	/ REG_BOSONLY /
289	if (v->nexttype == EMPTY && (v->cflags & REG_BOSONLY))
290	{
291	/ at start of a REG_BOSONLY RE /
292	RETV(SBEGIN, `0`); / same as \A /
293	}
294
295	/ if we're nested and we've hit end, return to outer level /
296	if (v->savenow != NULL && ATEOS())
297	{
298	v->now = v->savenow;
299	v->stop = v->savestop;
300	v->savenow = v->savestop = NULL;
301	}
302
303	/ skip white space etc. if appropriate (not in literal or []) /
304	if (v->cflags & REG_EXPANDED)
305	switch (v->lexcon)
306	{
307	case L_ERE:
308	case L_BRE:
309	case L_EBND:
310	case L_BBND:
311	skip(v);
312	break;
313	}
314
315	/ handle EOS, depending on context /
316	if (ATEOS())
317	{
318	switch (v->lexcon)
319	{
320	case L_ERE:
321	case L_BRE:
322	case L_Q:
323	RET(EOS);
324	break;
325	case L_EBND:
326	case L_BBND:
327	FAILW(REG_EBRACE);
328	break;
329	case L_BRACK:
330	case L_CEL:
331	case L_ECL:
332	case L_CCL:
333	FAILW(REG_EBRACK);
334	break;
335	}
336	assert(NOTREACHED);
337	}
338
339	/ okay, time to actually get a character /
340	c = *v->now++;
341
342	/ deal with the easy contexts, punt EREs to code below /
343	switch (v->lexcon)
344	{
345	case L_BRE: / punt BREs to separate function /
346	return brenext(v, c);
347	break;
348	case L_ERE: / see below /
349	break;
350	case L_Q: / literal strings are easy /
351	RETV(PLAIN, c);
352	break;
353	case L_BBND: / bounds are fairly simple /
354	case L_EBND:
355	switch (c)
356	{
357	case CHR(`'0'`):
358	case CHR(`'1'`):
359	case CHR(`'2'`):
360	case CHR(`'3'`):
361	case CHR(`'4'`):
362	case CHR(`'5'`):
363	case CHR(`'6'`):
364	case CHR(`'7'`):
365	case CHR(`'8'`):
366	case CHR(`'9'`):
367	RETV(DIGIT, (chr) DIGITVAL(c));
368	break;
369	case CHR(`','`):
370	RET(`','`);
371	break;
372	case CHR(`'}'`): / ERE bound ends with } /
373	if (INCON(L_EBND))
374	{
375	INTOCON(L_ERE);
376	if ((v->cflags & REG_ADVF) && NEXT1(`'?'`))
377	{
378	v->now++;
379	NOTE(REG_UNONPOSIX);
380	RETV(`'}'`, `0`);
381	}
382	RETV(`'}'`, `1`);
383	}
384	else
385	FAILW(REG_BADBR);
386	break;
387	case CHR(`'\\'`): / BRE bound ends with \} /
388	if (INCON(L_BBND) && NEXT1(`'}'`))
389	{
390	v->now++;
391	INTOCON(L_BRE);
392	RET(`'}'`);
393	}
394	else
395	FAILW(REG_BADBR);
396	break;
397	default:
398	FAILW(REG_BADBR);
399	break;
400	}
401	assert(NOTREACHED);
402	break;
403	case L_BRACK: / brackets are not too hard /
404	switch (c)
405	{
406	case CHR(`']'`):
407	if (LASTTYPE(`'['`))
408	RETV(PLAIN, c);
409	else
410	{
411	INTOCON((v->cflags & REG_EXTENDED) ?
412	L_ERE : L_BRE);
413	RET(`']'`);
414	}
415	break;
416	case CHR(`'\\'`):
417	NOTE(REG_UBBS);
418	if (!(v->cflags & REG_ADVF))
419	RETV(PLAIN, c);
420	NOTE(REG_UNONPOSIX);
421	if (ATEOS())
422	FAILW(REG_EESCAPE);
423	(DISCARD) lexescape(v);
424	switch (v->nexttype)
425	{ / not all escapes okay here /
426	case PLAIN:
427	return `1`;
428	break;
429	case CCLASS:
430	switch (v->nextvalue)
431	{
432	case `'d'`:
433	lexnest(v, brbackd, ENDOF(brbackd));
434	break;
435	case `'s'`:
436	lexnest(v, brbacks, ENDOF(brbacks));
437	break;
438	case `'w'`:
439	lexnest(v, brbackw, ENDOF(brbackw));
440	break;
441	default:
442	FAILW(REG_EESCAPE);
443	break;
444	}
445	/ lexnest done, back up and try again /
446	v->nexttype = v->lasttype;
447	return next(v);
448	break;
449	}
450	/ not one of the acceptable escapes /
451	FAILW(REG_EESCAPE);
452	break;
453	case CHR(`'-'`):
454	if (LASTTYPE(`'['`) \|\| NEXT1(`']'`))
455	RETV(PLAIN, c);
456	else
457	RETV(RANGE, c);
458	break;
459	case CHR(`'['`):
460	if (ATEOS())
461	FAILW(REG_EBRACK);
462	switch (*v->now++)
463	{
464	case CHR(`'.'`):
465	INTOCON(L_CEL);
466	/ might or might not be locale-specific /
467	RET(COLLEL);
468	break;
469	case CHR(`'='`):
470	INTOCON(L_ECL);
471	NOTE(REG_ULOCALE);
472	RET(ECLASS);
473	break;
474	case CHR(`':'`):
475	INTOCON(L_CCL);
476	NOTE(REG_ULOCALE);
477	RET(CCLASS);
478	break;
479	default: / oops /
480	v->now--;
481	RETV(PLAIN, c);
482	break;
483	}
484	assert(NOTREACHED);
485	break;
486	default:
487	RETV(PLAIN, c);
488	break;
489	}
490	assert(NOTREACHED);
491	break;
492	case L_CEL: / collating elements are easy /
493	if (c == CHR(`'.'`) && NEXT1(`']'`))
494	{
495	v->now++;
496	INTOCON(L_BRACK);
497	RETV(END, `'.'`);
498	}
499	else
500	RETV(PLAIN, c);
501	break;
502	case L_ECL: / ditto equivalence classes /
503	if (c == CHR(`'='`) && NEXT1(`']'`))
504	{
505	v->now++;
506	INTOCON(L_BRACK);
507	RETV(END, `'='`);
508	}
509	else
510	RETV(PLAIN, c);
511	break;
512	case L_CCL: / ditto character classes /
513	if (c == CHR(`':'`) && NEXT1(`']'`))
514	{
515	v->now++;
516	INTOCON(L_BRACK);
517	RETV(END, `':'`);
518	}
519	else
520	RETV(PLAIN, c);
521	break;
522	default:
523	assert(NOTREACHED);
524	break;
525	}
526
527	/ that got rid of everything except EREs and AREs /
528	assert(INCON(L_ERE));
529
530	/ deal with EREs and AREs, except for backslashes /
531	switch (c)
532	{
533	case CHR(`'\|'`):
534	RET(`'\|'`);
535	break;
536	case CHR(`'*'`):
537	if ((v->cflags & REG_ADVF) && NEXT1(`'?'`))
538	{
539	v->now++;
540	NOTE(REG_UNONPOSIX);
541	RETV(`'*'`, `0`);
542	}
543	RETV(`'*'`, `1`);
544	break;
545	case CHR(`'+'`):
546	if ((v->cflags & REG_ADVF) && NEXT1(`'?'`))
547	{
548	v->now++;
549	NOTE(REG_UNONPOSIX);
550	RETV(`'+'`, `0`);
551	}
552	RETV(`'+'`, `1`);
553	break;
554	case CHR(`'?'`):
555	if ((v->cflags & REG_ADVF) && NEXT1(`'?'`))
556	{
557	v->now++;
558	NOTE(REG_UNONPOSIX);
559	RETV(`'?'`, `0`);
560	}
561	RETV(`'?'`, `1`);
562	break;
563	case CHR(`'{'`): / bounds start or plain character /
564	if (v->cflags & REG_EXPANDED)
565	skip(v);
566	if (ATEOS() \|\| !iscdigit(*v->now))
567	{
568	NOTE(REG_UBRACES);
569	NOTE(REG_UUNSPEC);
570	RETV(PLAIN, c);
571	}
572	else
573	{
574	NOTE(REG_UBOUNDS);
575	INTOCON(L_EBND);
576	RET(`'{'`);
577	}
578	assert(NOTREACHED);
579	break;
580	case CHR(`'('`): / parenthesis, or advanced extension /
581	if ((v->cflags & REG_ADVF) && NEXT1(`'?'`))
582	{
583	NOTE(REG_UNONPOSIX);
584	v->now++;
585	if (ATEOS())
586	FAILW(REG_BADRPT);
587	switch (*v->now++)
588	{
589	case CHR(`':'`): / non-capturing paren /
590	RETV(`'('`, `0`);
591	break;
592	case CHR(`'#'`): / comment /
593	while (!ATEOS() && *v->now != CHR(`')'`))
594	v->now++;
595	if (!ATEOS())
596	v->now++;
597	assert(v->nexttype == v->lasttype);
598	return next(v);
599	break;
600	case CHR(`'='`): / positive lookahead /
601	NOTE(REG_ULOOKAROUND);
602	RETV(LACON, LATYPE_AHEAD_POS);
603	break;
604	case CHR(`'!'`): / negative lookahead /
605	NOTE(REG_ULOOKAROUND);
606	RETV(LACON, LATYPE_AHEAD_NEG);
607	break;
608	case CHR(`'<'`):
609	if (ATEOS())
610	FAILW(REG_BADRPT);
611	switch (*v->now++)
612	{
613	case CHR(`'='`): / positive lookbehind /
614	NOTE(REG_ULOOKAROUND);
615	RETV(LACON, LATYPE_BEHIND_POS);
616	break;
617	case CHR(`'!'`): / negative lookbehind /
618	NOTE(REG_ULOOKAROUND);
619	RETV(LACON, LATYPE_BEHIND_NEG);
620	break;
621	default:
622	FAILW(REG_BADRPT);
623	break;
624	}
625	assert(NOTREACHED);
626	break;
627	default:
628	FAILW(REG_BADRPT);
629	break;
630	}
631	assert(NOTREACHED);
632	}
633	if (v->cflags & REG_NOSUB)
634	RETV(`'('`, `0`); / all parens non-capturing /
635	else
636	RETV(`'('`, `1`);
637	break;
638	case CHR(`')'`):
639	if (LASTTYPE(`'('`))
640	NOTE(REG_UUNSPEC);
641	RETV(`')'`, c);
642	break;
643	case CHR(`'['`): / easy except for [[:<:]] and [[:>:]] /
644	if (HAVE(`6`) && *(v->now + `0`) == CHR(`'['`) &&
645	*(v->now + `1`) == CHR(`':'`) &&
646	(*(v->now + `2`) == CHR(`'<'`) \|\|
647	*(v->now + `2`) == CHR(`'>'`)) &&
648	*(v->now + `3`) == CHR(`':'`) &&
649	*(v->now + `4`) == CHR(`']'`) &&
650	*(v->now + `5`) == CHR(`']'`))
651	{
652	c = *(v->now + `2`);
653	v->now += `6`;
654	NOTE(REG_UNONPOSIX);
655	RET((c == CHR(`'<'`)) ? `'<'` : `'>'`);
656	}
657	INTOCON(L_BRACK);
658	if (NEXT1(`'^'`))
659	{
660	v->now++;
661	RETV(`'['`, `0`);
662	}
663	RETV(`'['`, `1`);
664	break;
665	case CHR(`'.'`):
666	RET(`'.'`);
667	break;
668	case CHR(`'^'`):
669	RET(`'^'`);
670	break;
671	case CHR(`'$'`):
672	RET(`'$'`);
673	break;
674	case CHR(`'\\'`): / mostly punt backslashes to code below /
675	if (ATEOS())
676	FAILW(REG_EESCAPE);
677	break;
678	default: / ordinary character /
679	RETV(PLAIN, c);
680	break;
681	}
682
683	/ ERE/ARE backslash handling; backslash already eaten /
684	assert(!ATEOS());
685	if (!(v->cflags & REG_ADVF))
686	{ / only AREs have non-trivial escapes /
687	if (iscalnum(*v->now))
688	{
689	NOTE(REG_UBSALNUM);
690	NOTE(REG_UUNSPEC);
691	}
692	RETV(PLAIN, *v->now++);
693	}
694	(DISCARD) lexescape(v);
695	if (ISERR())
696	FAILW(REG_EESCAPE);
697	if (v->nexttype == CCLASS)
698	{ / fudge at lexical level /
699	switch (v->nextvalue)
700	{
701	case `'d'`:
702	lexnest(v, backd, ENDOF(backd));
703	break;
704	case `'D'`:
705	lexnest(v, backD, ENDOF(backD));
706	break;
707	case `'s'`:
708	lexnest(v, backs, ENDOF(backs));
709	break;
710	case `'S'`:
711	lexnest(v, backS, ENDOF(backS));
712	break;
713	case `'w'`:
714	lexnest(v, backw, ENDOF(backw));
715	break;
716	case `'W'`:
717	lexnest(v, backW, ENDOF(backW));
718	break;
719	default:
720	assert(NOTREACHED);
721	FAILW(REG_ASSERT);
722	break;
723	}
724	/ lexnest done, back up and try again /
725	v->nexttype = v->lasttype;
726	return next(v);
727	}
728	/ otherwise, lexescape has already done the work /
729	return !ISERR();
730	}
731
732	/*
733	* lexescape - parse an ARE backslash escape (backslash already eaten)
734	* Note slightly nonstandard use of the CCLASS type code.
735	*/
736	static int / not actually used, but convenient for RETV /
737	lexescape(struct vars *v)
738	{
739	chr c;
740	static const chr alert[] = {
741	CHR(`'a'`), CHR(`'l'`), CHR(`'e'`), CHR(`'r'`), CHR(`'t'`)
742	};
743	static const chr esc[] = {
744	CHR(`'E'`), CHR(`'S'`), CHR(`'C'`)
745	};
746	const chr *save;
747
748	assert(v->cflags & REG_ADVF);
749
750	assert(!ATEOS());
751	c = *v->now++;
752	if (!iscalnum(c))
753	RETV(PLAIN, c);
754
755	NOTE(REG_UNONPOSIX);
756	switch (c)
757	{
758	case CHR(`'a'`):
759	RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR(`'\007'`)));
760	break;
761	case CHR(`'A'`):
762	RETV(SBEGIN, `0`);
763	break;
764	case CHR(`'b'`):
765	RETV(PLAIN, CHR(`'\b'`));
766	break;
767	case CHR(`'B'`):
768	RETV(PLAIN, CHR(`'\\'`));
769	break;
770	case CHR(`'c'`):
771	NOTE(REG_UUNPORT);
772	if (ATEOS())
773	FAILW(REG_EESCAPE);
774	RETV(PLAIN, (chr) (*v->now++ & `037`));
775	break;
776	case CHR(`'d'`):
777	NOTE(REG_ULOCALE);
778	RETV(CCLASS, `'d'`);
779	break;
780	case CHR(`'D'`):
781	NOTE(REG_ULOCALE);
782	RETV(CCLASS, `'D'`);
783	break;
784	case CHR(`'e'`):
785	NOTE(REG_UUNPORT);
786	RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR(`'\033'`)));
787	break;
788	case CHR(`'f'`):
789	RETV(PLAIN, CHR(`'\f'`));
790	break;
791	case CHR(`'m'`):
792	RET(`'<'`);
793	break;
794	case CHR(`'M'`):
795	RET(`'>'`);
796	break;
797	case CHR(`'n'`):
798	RETV(PLAIN, CHR(`'\n'`));
799	break;
800	case CHR(`'r'`):
801	RETV(PLAIN, CHR(`'\r'`));
802	break;
803	case CHR(`'s'`):
804	NOTE(REG_ULOCALE);
805	RETV(CCLASS, `'s'`);
806	break;
807	case CHR(`'S'`):
808	NOTE(REG_ULOCALE);
809	RETV(CCLASS, `'S'`);
810	break;
811	case CHR(`'t'`):
812	RETV(PLAIN, CHR(`'\t'`));
813	break;
814	case CHR(`'u'`):
815	c = lexdigits(v, `16`, `4`, `4`);
816	if (ISERR() \|\| !CHR_IS_IN_RANGE(c))
817	FAILW(REG_EESCAPE);
818	RETV(PLAIN, c);
819	break;
820	case CHR(`'U'`):
821	c = lexdigits(v, `16`, `8`, `8`);
822	if (ISERR() \|\| !CHR_IS_IN_RANGE(c))
823	FAILW(REG_EESCAPE);
824	RETV(PLAIN, c);
825	break;
826	case CHR(`'v'`):
827	RETV(PLAIN, CHR(`'\v'`));
828	break;
829	case CHR(`'w'`):
830	NOTE(REG_ULOCALE);
831	RETV(CCLASS, `'w'`);
832	break;
833	case CHR(`'W'`):
834	NOTE(REG_ULOCALE);
835	RETV(CCLASS, `'W'`);
836	break;
837	case CHR(`'x'`):
838	NOTE(REG_UUNPORT);
839	c = lexdigits(v, `16`, `1`, `255`); / REs >255 long outside spec /
840	if (ISERR() \|\| !CHR_IS_IN_RANGE(c))
841	FAILW(REG_EESCAPE);
842	RETV(PLAIN, c);
843	break;
844	case CHR(`'y'`):
845	NOTE(REG_ULOCALE);
846	RETV(WBDRY, `0`);
847	break;
848	case CHR(`'Y'`):
849	NOTE(REG_ULOCALE);
850	RETV(NWBDRY, `0`);
851	break;
852	case CHR(`'Z'`):
853	RETV(SEND, `0`);
854	break;
855	case CHR(`'1'`):
856	case CHR(`'2'`):
857	case CHR(`'3'`):
858	case CHR(`'4'`):
859	case CHR(`'5'`):
860	case CHR(`'6'`):
861	case CHR(`'7'`):
862	case CHR(`'8'`):
863	case CHR(`'9'`):
864	save = v->now;
865	v->now--; / put first digit back /
866	c = lexdigits(v, `10`, `1`, `255`); / REs >255 long outside spec /
867	if (ISERR())
868	FAILW(REG_EESCAPE);
869	/ ugly heuristic (first test is "exactly 1 digit?") /
870	if (v->now == save \|\| ((int) c > `0` && (int) c <= v->nsubexp))
871	{
872	NOTE(REG_UBACKREF);
873	RETV(BACKREF, c);
874	}
875	/ oops, doesn't look like it's a backref after all... /
876	v->now = save;
877	/ and fall through into octal number /
878	/ FALLTHROUGH /
879	case CHR(`'0'`):
880	NOTE(REG_UUNPORT);
881	v->now--; / put first digit back /
882	c = lexdigits(v, `8`, `1`, `3`);
883	if (ISERR())
884	FAILW(REG_EESCAPE);
885	if (c > `0xff`)
886	{
887	/ out of range, so we handled one digit too much /
888	v->now--;
889	c >>= `3`;
890	}
891	RETV(PLAIN, c);
892	break;
893	default:
894	assert(iscalpha(c));
895	FAILW(REG_EESCAPE); / unknown alphabetic escape /
896	break;
897	}
898	assert(NOTREACHED);
899	}
900
901	/*
902	* lexdigits - slurp up digits and return chr value
903	*
904	* This does not account for overflow; callers should range-check the result
905	* if maxlen is large enough to make that possible.
906	*/
907	static chr / chr value; errors signalled via ERR /
908	lexdigits(struct vars *v,
909	int base,
910	int minlen,
911	int maxlen)
912	{
913	uchr n; / unsigned to avoid overflow misbehavior /
914	int len;
915	chr c;
916	int d;
917	const uchr ub = (uchr) base;
918
919	n = `0`;
920	for (len = `0`; len < maxlen && !ATEOS(); len++)
921	{
922	c = *v->now++;
923	switch (c)
924	{
925	case CHR(`'0'`):
926	case CHR(`'1'`):
927	case CHR(`'2'`):
928	case CHR(`'3'`):
929	case CHR(`'4'`):
930	case CHR(`'5'`):
931	case CHR(`'6'`):
932	case CHR(`'7'`):
933	case CHR(`'8'`):
934	case CHR(`'9'`):
935	d = DIGITVAL(c);
936	break;
937	case CHR(`'a'`):
938	case CHR(`'A'`):
939	d = `10`;
940	break;
941	case CHR(`'b'`):
942	case CHR(`'B'`):
943	d = `11`;
944	break;
945	case CHR(`'c'`):
946	case CHR(`'C'`):
947	d = `12`;
948	break;
949	case CHR(`'d'`):
950	case CHR(`'D'`):
951	d = `13`;
952	break;
953	case CHR(`'e'`):
954	case CHR(`'E'`):
955	d = `14`;
956	break;
957	case CHR(`'f'`):
958	case CHR(`'F'`):
959	d = `15`;
960	break;
961	default:
962	v->now--; / oops, not a digit at all /
963	d = -`1`;
964	break;
965	}
966
967	if (d >= base)
968	{ / not a plausible digit /
969	v->now--;
970	d = -`1`;
971	}
972	if (d < `0`)
973	break; / NOTE BREAK OUT /
974	n = n * ub + (uchr) d;
975	}
976	if (len < minlen)
977	ERR(REG_EESCAPE);
978
979	return (chr) n;
980	}
981
982	/*
983	* brenext - get next BRE token
984	*
985	* This is much like EREs except for all the stupid backslashes and the
986	* context-dependency of some things.
987	*/
988	static int / 1 normal, 0 failure /
989	brenext(struct vars *v,
990	chr c)
991	{
992	switch (c)
993	{
994	case CHR(`'*'`):
995	if (LASTTYPE(EMPTY) \|\| LASTTYPE(`'('`) \|\| LASTTYPE(`'^'`))
996	RETV(PLAIN, c);
997	RET(`'*'`);
998	break;
999	case CHR(`'['`):
1000	if (HAVE(`6`) && *(v->now + `0`) == CHR(`'['`) &&
1001	*(v->now + `1`) == CHR(`':'`) &&
1002	(*(v->now + `2`) == CHR(`'<'`) \|\|
1003	*(v->now + `2`) == CHR(`'>'`)) &&
1004	*(v->now + `3`) == CHR(`':'`) &&
1005	*(v->now + `4`) == CHR(`']'`) &&
1006	*(v->now + `5`) == CHR(`']'`))
1007	{
1008	c = *(v->now + `2`);
1009	v->now += `6`;
1010	NOTE(REG_UNONPOSIX);
1011	RET((c == CHR(`'<'`)) ? `'<'` : `'>'`);
1012	}
1013	INTOCON(L_BRACK);
1014	if (NEXT1(`'^'`))
1015	{
1016	v->now++;
1017	RETV(`'['`, `0`);
1018	}
1019	RETV(`'['`, `1`);
1020	break;
1021	case CHR(`'.'`):
1022	RET(`'.'`);
1023	break;
1024	case CHR(`'^'`):
1025	if (LASTTYPE(EMPTY))
1026	RET(`'^'`);
1027	if (LASTTYPE(`'('`))
1028	{
1029	NOTE(REG_UUNSPEC);
1030	RET(`'^'`);
1031	}
1032	RETV(PLAIN, c);
1033	break;
1034	case CHR(`'$'`):
1035	if (v->cflags & REG_EXPANDED)
1036	skip(v);
1037	if (ATEOS())
1038	RET(`'$'`);
1039	if (NEXT2(`'\\'`, `')'`))
1040	{
1041	NOTE(REG_UUNSPEC);
1042	RET(`'$'`);
1043	}
1044	RETV(PLAIN, c);
1045	break;
1046	case CHR(`'\\'`):
1047	break; / see below /
1048	default:
1049	RETV(PLAIN, c);
1050	break;
1051	}
1052
1053	assert(c == CHR(`'\\'`));
1054
1055	if (ATEOS())
1056	FAILW(REG_EESCAPE);
1057
1058	c = *v->now++;
1059	switch (c)
1060	{
1061	case CHR(`'{'`):
1062	INTOCON(L_BBND);
1063	NOTE(REG_UBOUNDS);
1064	RET(`'{'`);
1065	break;
1066	case CHR(`'('`):
1067	RETV(`'('`, `1`);
1068	break;
1069	case CHR(`')'`):
1070	RETV(`')'`, c);
1071	break;
1072	case CHR(`'<'`):
1073	NOTE(REG_UNONPOSIX);
1074	RET(`'<'`);
1075	break;
1076	case CHR(`'>'`):
1077	NOTE(REG_UNONPOSIX);
1078	RET(`'>'`);
1079	break;
1080	case CHR(`'1'`):
1081	case CHR(`'2'`):
1082	case CHR(`'3'`):
1083	case CHR(`'4'`):
1084	case CHR(`'5'`):
1085	case CHR(`'6'`):
1086	case CHR(`'7'`):
1087	case CHR(`'8'`):
1088	case CHR(`'9'`):
1089	NOTE(REG_UBACKREF);
1090	RETV(BACKREF, (chr) DIGITVAL(c));
1091	break;
1092	default:
1093	if (iscalnum(c))
1094	{
1095	NOTE(REG_UBSALNUM);
1096	NOTE(REG_UUNSPEC);
1097	}
1098	RETV(PLAIN, c);
1099	break;
1100	}
1101
1102	assert(NOTREACHED);
1103	return `0`;
1104	}
1105
1106	/*
1107	* skip - skip white space and comments in expanded form
1108	*/
1109	static void
1110	skip(struct vars *v)
1111	{
1112	const chr *start = v->now;
1113
1114	assert(v->cflags & REG_EXPANDED);
1115
1116	for (;;)
1117	{
1118	while (!ATEOS() && iscspace(*v->now))
1119	v->now++;
1120	if (ATEOS() \|\| *v->now != CHR(`'#'`))
1121	break; / NOTE BREAK OUT /
1122	assert(NEXT1(`'#'`));
1123	while (!ATEOS() && *v->now != CHR(`'\n'`))
1124	v->now++;
1125	/ leave the newline to be picked up by the iscspace loop /
1126	}
1127
1128	if (v->now != start)
1129	NOTE(REG_UNONPOSIX);
1130	}
1131
1132	/*
1133	* newline - return the chr for a newline
1134	*
1135	* This helps confine use of CHR to this source file.
1136	*/
1137	static chr
1138	newline(void)
1139	{
1140	return CHR(`'\n'`);
1141	}
1142
1143	/*
1144	* chrnamed - return the chr known by a given (chr string) name
1145	*
1146	* The code is a bit clumsy, but this routine gets only such specialized
1147	* use that it hardly matters.
1148	*/
1149	static chr
1150	chrnamed(struct vars *v,
1151	const chr startp, /* start of name /
1152	const chr endp, /* just past end of name /
1153	chr lastresort) / what to return if name lookup fails /
1154	{
1155	chr c;
1156	int errsave;
1157	int e;
1158	struct cvec *cv;
1159
1160	errsave = v->err;
1161	v->err = `0`;
1162	c = element(v, startp, endp);
1163	e = v->err;
1164	v->err = errsave;
1165
1166	if (e != `0`)
1167	return lastresort;
1168
1169	cv = range(v, c, c, `0`);
1170	if (cv->nchrs == `0`)
1171	return lastresort;
1172	return cv->chrs[`0`];
1173	}
1174

Browse the source code of PostgreSQL/src/backend/regex/regc_lex.c