1/*
2 * lexical analyzer
3 * This file is #included by regcomp.c.
4 *
5 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
6 *
7 * Development of this software was funded, in part, by Cray Research Inc.,
8 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 * Corporation, none of whom are responsible for the results. The author
10 * thanks all of them.
11 *
12 * Redistribution and use in source and binary forms -- with or without
13 * modification -- are permitted for any purpose, provided that
14 * redistributions in source form retain this entire copyright notice and
15 * indicate the origin and nature of any modifications.
16 *
17 * I'd appreciate being given credit for this package in the documentation
18 * of software which uses it, but that is not a requirement.
19 *
20 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 *
31 * src/backend/regex/regc_lex.c
32 *
33 */
34
35/* scanning macros (know about v) */
36#define ATEOS() (v->now >= v->stop)
37#define HAVE(n) (v->stop - v->now >= (n))
38#define NEXT1(c) (!ATEOS() && *v->now == CHR(c))
39#define NEXT2(a,b) (HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b))
40#define NEXT3(a,b,c) (HAVE(3) && *v->now == CHR(a) && \
41 *(v->now+1) == CHR(b) && \
42 *(v->now+2) == CHR(c))
43#define SET(c) (v->nexttype = (c))
44#define SETV(c, n) (v->nexttype = (c), v->nextvalue = (n))
45#define RET(c) return (SET(c), 1)
46#define RETV(c, n) return (SETV(c, n), 1)
47#define FAILW(e) return (ERR(e), 0) /* ERR does SET(EOS) */
48#define LASTTYPE(t) (v->lasttype == (t))
49
50/* lexical contexts */
51#define L_ERE 1 /* mainline ERE/ARE */
52#define L_BRE 2 /* mainline BRE */
53#define L_Q 3 /* REG_QUOTE */
54#define L_EBND 4 /* ERE/ARE bound */
55#define L_BBND 5 /* BRE bound */
56#define L_BRACK 6 /* brackets */
57#define L_CEL 7 /* collating element */
58#define L_ECL 8 /* equivalence class */
59#define L_CCL 9 /* character class */
60#define INTOCON(c) (v->lexcon = (c))
61#define INCON(con) (v->lexcon == (con))
62
63/* construct pointer past end of chr array */
64#define ENDOF(array) ((array) + sizeof(array)/sizeof(chr))
65
66/*
67 * lexstart - set up lexical stuff, scan leading options
68 */
69static void
70lexstart(struct vars *v)
71{
72 prefixes(v); /* may turn on new type bits etc. */
73 NOERR();
74
75 if (v->cflags & REG_QUOTE)
76 {
77 assert(!(v->cflags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE)));
78 INTOCON(L_Q);
79 }
80 else if (v->cflags & REG_EXTENDED)
81 {
82 assert(!(v->cflags & REG_QUOTE));
83 INTOCON(L_ERE);
84 }
85 else
86 {
87 assert(!(v->cflags & (REG_QUOTE | REG_ADVF)));
88 INTOCON(L_BRE);
89 }
90
91 v->nexttype = EMPTY; /* remember we were at the start */
92 next(v); /* set up the first token */
93}
94
95/*
96 * prefixes - implement various special prefixes
97 */
98static void
99prefixes(struct vars *v)
100{
101 /* literal string doesn't get any of this stuff */
102 if (v->cflags & REG_QUOTE)
103 return;
104
105 /* initial "***" gets special things */
106 if (HAVE(4) && NEXT3('*', '*', '*'))
107 switch (*(v->now + 3))
108 {
109 case CHR('?'): /* "***?" error, msg shows version */
110 ERR(REG_BADPAT);
111 return; /* proceed no further */
112 break;
113 case CHR('='): /* "***=" shifts to literal string */
114 NOTE(REG_UNONPOSIX);
115 v->cflags |= REG_QUOTE;
116 v->cflags &= ~(REG_ADVANCED | REG_EXPANDED | REG_NEWLINE);
117 v->now += 4;
118 return; /* and there can be no more prefixes */
119 break;
120 case CHR(':'): /* "***:" shifts to AREs */
121 NOTE(REG_UNONPOSIX);
122 v->cflags |= REG_ADVANCED;
123 v->now += 4;
124 break;
125 default: /* otherwise *** is just an error */
126 ERR(REG_BADRPT);
127 return;
128 break;
129 }
130
131 /* BREs and EREs don't get embedded options */
132 if ((v->cflags & REG_ADVANCED) != REG_ADVANCED)
133 return;
134
135 /* embedded options (AREs only) */
136 if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2)))
137 {
138 NOTE(REG_UNONPOSIX);
139 v->now += 2;
140 for (; !ATEOS() && iscalpha(*v->now); v->now++)
141 switch (*v->now)
142 {
143 case CHR('b'): /* BREs (but why???) */
144 v->cflags &= ~(REG_ADVANCED | REG_QUOTE);
145 break;
146 case CHR('c'): /* case sensitive */
147 v->cflags &= ~REG_ICASE;
148 break;
149 case CHR('e'): /* plain EREs */
150 v->cflags |= REG_EXTENDED;
151 v->cflags &= ~(REG_ADVF | REG_QUOTE);
152 break;
153 case CHR('i'): /* case insensitive */
154 v->cflags |= REG_ICASE;
155 break;
156 case CHR('m'): /* Perloid synonym for n */
157 case CHR('n'): /* \n affects ^ $ . [^ */
158 v->cflags |= REG_NEWLINE;
159 break;
160 case CHR('p'): /* ~Perl, \n affects . [^ */
161 v->cflags |= REG_NLSTOP;
162 v->cflags &= ~REG_NLANCH;
163 break;
164 case CHR('q'): /* literal string */
165 v->cflags |= REG_QUOTE;
166 v->cflags &= ~REG_ADVANCED;
167 break;
168 case CHR('s'): /* single line, \n ordinary */
169 v->cflags &= ~REG_NEWLINE;
170 break;
171 case CHR('t'): /* tight syntax */
172 v->cflags &= ~REG_EXPANDED;
173 break;
174 case CHR('w'): /* weird, \n affects ^ $ only */
175 v->cflags &= ~REG_NLSTOP;
176 v->cflags |= REG_NLANCH;
177 break;
178 case CHR('x'): /* expanded syntax */
179 v->cflags |= REG_EXPANDED;
180 break;
181 default:
182 ERR(REG_BADOPT);
183 return;
184 }
185 if (!NEXT1(')'))
186 {
187 ERR(REG_BADOPT);
188 return;
189 }
190 v->now++;
191 if (v->cflags & REG_QUOTE)
192 v->cflags &= ~(REG_EXPANDED | REG_NEWLINE);
193 }
194}
195
196/*
197 * lexnest - "call a subroutine", interpolating string at the lexical level
198 *
199 * Note, this is not a very general facility. There are a number of
200 * implicit assumptions about what sorts of strings can be subroutines.
201 */
202static void
203lexnest(struct vars *v,
204 const chr *beginp, /* start of interpolation */
205 const chr *endp) /* one past end of interpolation */
206{
207 assert(v->savenow == NULL); /* only one level of nesting */
208 v->savenow = v->now;
209 v->savestop = v->stop;
210 v->now = beginp;
211 v->stop = endp;
212}
213
214/*
215 * string constants to interpolate as expansions of things like \d
216 */
217static const chr backd[] = { /* \d */
218 CHR('['), CHR('['), CHR(':'),
219 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
220 CHR(':'), CHR(']'), CHR(']')
221};
222static const chr backD[] = { /* \D */
223 CHR('['), CHR('^'), CHR('['), CHR(':'),
224 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
225 CHR(':'), CHR(']'), CHR(']')
226};
227static const chr brbackd[] = { /* \d within brackets */
228 CHR('['), CHR(':'),
229 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
230 CHR(':'), CHR(']')
231};
232static const chr backs[] = { /* \s */
233 CHR('['), CHR('['), CHR(':'),
234 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
235 CHR(':'), CHR(']'), CHR(']')
236};
237static const chr backS[] = { /* \S */
238 CHR('['), CHR('^'), CHR('['), CHR(':'),
239 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
240 CHR(':'), CHR(']'), CHR(']')
241};
242static const chr brbacks[] = { /* \s within brackets */
243 CHR('['), CHR(':'),
244 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
245 CHR(':'), CHR(']')
246};
247static const chr backw[] = { /* \w */
248 CHR('['), CHR('['), CHR(':'),
249 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
250 CHR(':'), CHR(']'), CHR('_'), CHR(']')
251};
252static const chr backW[] = { /* \W */
253 CHR('['), CHR('^'), CHR('['), CHR(':'),
254 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
255 CHR(':'), CHR(']'), CHR('_'), CHR(']')
256};
257static const chr brbackw[] = { /* \w within brackets */
258 CHR('['), CHR(':'),
259 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
260 CHR(':'), CHR(']'), CHR('_')
261};
262
263/*
264 * lexword - interpolate a bracket expression for word characters
265 * Possibly ought to inquire whether there is a "word" character class.
266 */
267static void
268lexword(struct vars *v)
269{
270 lexnest(v, backw, ENDOF(backw));
271}
272
273/*
274 * next - get next token
275 */
276static int /* 1 normal, 0 failure */
277next(struct vars *v)
278{
279 chr c;
280
281 /* errors yield an infinite sequence of failures */
282 if (ISERR())
283 return 0; /* the error has set nexttype to EOS */
284
285 /* remember flavor of last token */
286 v->lasttype = v->nexttype;
287
288 /* REG_BOSONLY */
289 if (v->nexttype == EMPTY && (v->cflags & REG_BOSONLY))
290 {
291 /* at start of a REG_BOSONLY RE */
292 RETV(SBEGIN, 0); /* same as \A */
293 }
294
295 /* if we're nested and we've hit end, return to outer level */
296 if (v->savenow != NULL && ATEOS())
297 {
298 v->now = v->savenow;
299 v->stop = v->savestop;
300 v->savenow = v->savestop = NULL;
301 }
302
303 /* skip white space etc. if appropriate (not in literal or []) */
304 if (v->cflags & REG_EXPANDED)
305 switch (v->lexcon)
306 {
307 case L_ERE:
308 case L_BRE:
309 case L_EBND:
310 case L_BBND:
311 skip(v);
312 break;
313 }
314
315 /* handle EOS, depending on context */
316 if (ATEOS())
317 {
318 switch (v->lexcon)
319 {
320 case L_ERE:
321 case L_BRE:
322 case L_Q:
323 RET(EOS);
324 break;
325 case L_EBND:
326 case L_BBND:
327 FAILW(REG_EBRACE);
328 break;
329 case L_BRACK:
330 case L_CEL:
331 case L_ECL:
332 case L_CCL:
333 FAILW(REG_EBRACK);
334 break;
335 }
336 assert(NOTREACHED);
337 }
338
339 /* okay, time to actually get a character */
340 c = *v->now++;
341
342 /* deal with the easy contexts, punt EREs to code below */
343 switch (v->lexcon)
344 {
345 case L_BRE: /* punt BREs to separate function */
346 return brenext(v, c);
347 break;
348 case L_ERE: /* see below */
349 break;
350 case L_Q: /* literal strings are easy */
351 RETV(PLAIN, c);
352 break;
353 case L_BBND: /* bounds are fairly simple */
354 case L_EBND:
355 switch (c)
356 {
357 case CHR('0'):
358 case CHR('1'):
359 case CHR('2'):
360 case CHR('3'):
361 case CHR('4'):
362 case CHR('5'):
363 case CHR('6'):
364 case CHR('7'):
365 case CHR('8'):
366 case CHR('9'):
367 RETV(DIGIT, (chr) DIGITVAL(c));
368 break;
369 case CHR(','):
370 RET(',');
371 break;
372 case CHR('}'): /* ERE bound ends with } */
373 if (INCON(L_EBND))
374 {
375 INTOCON(L_ERE);
376 if ((v->cflags & REG_ADVF) && NEXT1('?'))
377 {
378 v->now++;
379 NOTE(REG_UNONPOSIX);
380 RETV('}', 0);
381 }
382 RETV('}', 1);
383 }
384 else
385 FAILW(REG_BADBR);
386 break;
387 case CHR('\\'): /* BRE bound ends with \} */
388 if (INCON(L_BBND) && NEXT1('}'))
389 {
390 v->now++;
391 INTOCON(L_BRE);
392 RET('}');
393 }
394 else
395 FAILW(REG_BADBR);
396 break;
397 default:
398 FAILW(REG_BADBR);
399 break;
400 }
401 assert(NOTREACHED);
402 break;
403 case L_BRACK: /* brackets are not too hard */
404 switch (c)
405 {
406 case CHR(']'):
407 if (LASTTYPE('['))
408 RETV(PLAIN, c);
409 else
410 {
411 INTOCON((v->cflags & REG_EXTENDED) ?
412 L_ERE : L_BRE);
413 RET(']');
414 }
415 break;
416 case CHR('\\'):
417 NOTE(REG_UBBS);
418 if (!(v->cflags & REG_ADVF))
419 RETV(PLAIN, c);
420 NOTE(REG_UNONPOSIX);
421 if (ATEOS())
422 FAILW(REG_EESCAPE);
423 (DISCARD) lexescape(v);
424 switch (v->nexttype)
425 { /* not all escapes okay here */
426 case PLAIN:
427 return 1;
428 break;
429 case CCLASS:
430 switch (v->nextvalue)
431 {
432 case 'd':
433 lexnest(v, brbackd, ENDOF(brbackd));
434 break;
435 case 's':
436 lexnest(v, brbacks, ENDOF(brbacks));
437 break;
438 case 'w':
439 lexnest(v, brbackw, ENDOF(brbackw));
440 break;
441 default:
442 FAILW(REG_EESCAPE);
443 break;
444 }
445 /* lexnest done, back up and try again */
446 v->nexttype = v->lasttype;
447 return next(v);
448 break;
449 }
450 /* not one of the acceptable escapes */
451 FAILW(REG_EESCAPE);
452 break;
453 case CHR('-'):
454 if (LASTTYPE('[') || NEXT1(']'))
455 RETV(PLAIN, c);
456 else
457 RETV(RANGE, c);
458 break;
459 case CHR('['):
460 if (ATEOS())
461 FAILW(REG_EBRACK);
462 switch (*v->now++)
463 {
464 case CHR('.'):
465 INTOCON(L_CEL);
466 /* might or might not be locale-specific */
467 RET(COLLEL);
468 break;
469 case CHR('='):
470 INTOCON(L_ECL);
471 NOTE(REG_ULOCALE);
472 RET(ECLASS);
473 break;
474 case CHR(':'):
475 INTOCON(L_CCL);
476 NOTE(REG_ULOCALE);
477 RET(CCLASS);
478 break;
479 default: /* oops */
480 v->now--;
481 RETV(PLAIN, c);
482 break;
483 }
484 assert(NOTREACHED);
485 break;
486 default:
487 RETV(PLAIN, c);
488 break;
489 }
490 assert(NOTREACHED);
491 break;
492 case L_CEL: /* collating elements are easy */
493 if (c == CHR('.') && NEXT1(']'))
494 {
495 v->now++;
496 INTOCON(L_BRACK);
497 RETV(END, '.');
498 }
499 else
500 RETV(PLAIN, c);
501 break;
502 case L_ECL: /* ditto equivalence classes */
503 if (c == CHR('=') && NEXT1(']'))
504 {
505 v->now++;
506 INTOCON(L_BRACK);
507 RETV(END, '=');
508 }
509 else
510 RETV(PLAIN, c);
511 break;
512 case L_CCL: /* ditto character classes */
513 if (c == CHR(':') && NEXT1(']'))
514 {
515 v->now++;
516 INTOCON(L_BRACK);
517 RETV(END, ':');
518 }
519 else
520 RETV(PLAIN, c);
521 break;
522 default:
523 assert(NOTREACHED);
524 break;
525 }
526
527 /* that got rid of everything except EREs and AREs */
528 assert(INCON(L_ERE));
529
530 /* deal with EREs and AREs, except for backslashes */
531 switch (c)
532 {
533 case CHR('|'):
534 RET('|');
535 break;
536 case CHR('*'):
537 if ((v->cflags & REG_ADVF) && NEXT1('?'))
538 {
539 v->now++;
540 NOTE(REG_UNONPOSIX);
541 RETV('*', 0);
542 }
543 RETV('*', 1);
544 break;
545 case CHR('+'):
546 if ((v->cflags & REG_ADVF) && NEXT1('?'))
547 {
548 v->now++;
549 NOTE(REG_UNONPOSIX);
550 RETV('+', 0);
551 }
552 RETV('+', 1);
553 break;
554 case CHR('?'):
555 if ((v->cflags & REG_ADVF) && NEXT1('?'))
556 {
557 v->now++;
558 NOTE(REG_UNONPOSIX);
559 RETV('?', 0);
560 }
561 RETV('?', 1);
562 break;
563 case CHR('{'): /* bounds start or plain character */
564 if (v->cflags & REG_EXPANDED)
565 skip(v);
566 if (ATEOS() || !iscdigit(*v->now))
567 {
568 NOTE(REG_UBRACES);
569 NOTE(REG_UUNSPEC);
570 RETV(PLAIN, c);
571 }
572 else
573 {
574 NOTE(REG_UBOUNDS);
575 INTOCON(L_EBND);
576 RET('{');
577 }
578 assert(NOTREACHED);
579 break;
580 case CHR('('): /* parenthesis, or advanced extension */
581 if ((v->cflags & REG_ADVF) && NEXT1('?'))
582 {
583 NOTE(REG_UNONPOSIX);
584 v->now++;
585 if (ATEOS())
586 FAILW(REG_BADRPT);
587 switch (*v->now++)
588 {
589 case CHR(':'): /* non-capturing paren */
590 RETV('(', 0);
591 break;
592 case CHR('#'): /* comment */
593 while (!ATEOS() && *v->now != CHR(')'))
594 v->now++;
595 if (!ATEOS())
596 v->now++;
597 assert(v->nexttype == v->lasttype);
598 return next(v);
599 break;
600 case CHR('='): /* positive lookahead */
601 NOTE(REG_ULOOKAROUND);
602 RETV(LACON, LATYPE_AHEAD_POS);
603 break;
604 case CHR('!'): /* negative lookahead */
605 NOTE(REG_ULOOKAROUND);
606 RETV(LACON, LATYPE_AHEAD_NEG);
607 break;
608 case CHR('<'):
609 if (ATEOS())
610 FAILW(REG_BADRPT);
611 switch (*v->now++)
612 {
613 case CHR('='): /* positive lookbehind */
614 NOTE(REG_ULOOKAROUND);
615 RETV(LACON, LATYPE_BEHIND_POS);
616 break;
617 case CHR('!'): /* negative lookbehind */
618 NOTE(REG_ULOOKAROUND);
619 RETV(LACON, LATYPE_BEHIND_NEG);
620 break;
621 default:
622 FAILW(REG_BADRPT);
623 break;
624 }
625 assert(NOTREACHED);
626 break;
627 default:
628 FAILW(REG_BADRPT);
629 break;
630 }
631 assert(NOTREACHED);
632 }
633 if (v->cflags & REG_NOSUB)
634 RETV('(', 0); /* all parens non-capturing */
635 else
636 RETV('(', 1);
637 break;
638 case CHR(')'):
639 if (LASTTYPE('('))
640 NOTE(REG_UUNSPEC);
641 RETV(')', c);
642 break;
643 case CHR('['): /* easy except for [[:<:]] and [[:>:]] */
644 if (HAVE(6) && *(v->now + 0) == CHR('[') &&
645 *(v->now + 1) == CHR(':') &&
646 (*(v->now + 2) == CHR('<') ||
647 *(v->now + 2) == CHR('>')) &&
648 *(v->now + 3) == CHR(':') &&
649 *(v->now + 4) == CHR(']') &&
650 *(v->now + 5) == CHR(']'))
651 {
652 c = *(v->now + 2);
653 v->now += 6;
654 NOTE(REG_UNONPOSIX);
655 RET((c == CHR('<')) ? '<' : '>');
656 }
657 INTOCON(L_BRACK);
658 if (NEXT1('^'))
659 {
660 v->now++;
661 RETV('[', 0);
662 }
663 RETV('[', 1);
664 break;
665 case CHR('.'):
666 RET('.');
667 break;
668 case CHR('^'):
669 RET('^');
670 break;
671 case CHR('$'):
672 RET('$');
673 break;
674 case CHR('\\'): /* mostly punt backslashes to code below */
675 if (ATEOS())
676 FAILW(REG_EESCAPE);
677 break;
678 default: /* ordinary character */
679 RETV(PLAIN, c);
680 break;
681 }
682
683 /* ERE/ARE backslash handling; backslash already eaten */
684 assert(!ATEOS());
685 if (!(v->cflags & REG_ADVF))
686 { /* only AREs have non-trivial escapes */
687 if (iscalnum(*v->now))
688 {
689 NOTE(REG_UBSALNUM);
690 NOTE(REG_UUNSPEC);
691 }
692 RETV(PLAIN, *v->now++);
693 }
694 (DISCARD) lexescape(v);
695 if (ISERR())
696 FAILW(REG_EESCAPE);
697 if (v->nexttype == CCLASS)
698 { /* fudge at lexical level */
699 switch (v->nextvalue)
700 {
701 case 'd':
702 lexnest(v, backd, ENDOF(backd));
703 break;
704 case 'D':
705 lexnest(v, backD, ENDOF(backD));
706 break;
707 case 's':
708 lexnest(v, backs, ENDOF(backs));
709 break;
710 case 'S':
711 lexnest(v, backS, ENDOF(backS));
712 break;
713 case 'w':
714 lexnest(v, backw, ENDOF(backw));
715 break;
716 case 'W':
717 lexnest(v, backW, ENDOF(backW));
718 break;
719 default:
720 assert(NOTREACHED);
721 FAILW(REG_ASSERT);
722 break;
723 }
724 /* lexnest done, back up and try again */
725 v->nexttype = v->lasttype;
726 return next(v);
727 }
728 /* otherwise, lexescape has already done the work */
729 return !ISERR();
730}
731
732/*
733 * lexescape - parse an ARE backslash escape (backslash already eaten)
734 * Note slightly nonstandard use of the CCLASS type code.
735 */
736static int /* not actually used, but convenient for RETV */
737lexescape(struct vars *v)
738{
739 chr c;
740 static const chr alert[] = {
741 CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
742 };
743 static const chr esc[] = {
744 CHR('E'), CHR('S'), CHR('C')
745 };
746 const chr *save;
747
748 assert(v->cflags & REG_ADVF);
749
750 assert(!ATEOS());
751 c = *v->now++;
752 if (!iscalnum(c))
753 RETV(PLAIN, c);
754
755 NOTE(REG_UNONPOSIX);
756 switch (c)
757 {
758 case CHR('a'):
759 RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007')));
760 break;
761 case CHR('A'):
762 RETV(SBEGIN, 0);
763 break;
764 case CHR('b'):
765 RETV(PLAIN, CHR('\b'));
766 break;
767 case CHR('B'):
768 RETV(PLAIN, CHR('\\'));
769 break;
770 case CHR('c'):
771 NOTE(REG_UUNPORT);
772 if (ATEOS())
773 FAILW(REG_EESCAPE);
774 RETV(PLAIN, (chr) (*v->now++ & 037));
775 break;
776 case CHR('d'):
777 NOTE(REG_ULOCALE);
778 RETV(CCLASS, 'd');
779 break;
780 case CHR('D'):
781 NOTE(REG_ULOCALE);
782 RETV(CCLASS, 'D');
783 break;
784 case CHR('e'):
785 NOTE(REG_UUNPORT);
786 RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033')));
787 break;
788 case CHR('f'):
789 RETV(PLAIN, CHR('\f'));
790 break;
791 case CHR('m'):
792 RET('<');
793 break;
794 case CHR('M'):
795 RET('>');
796 break;
797 case CHR('n'):
798 RETV(PLAIN, CHR('\n'));
799 break;
800 case CHR('r'):
801 RETV(PLAIN, CHR('\r'));
802 break;
803 case CHR('s'):
804 NOTE(REG_ULOCALE);
805 RETV(CCLASS, 's');
806 break;
807 case CHR('S'):
808 NOTE(REG_ULOCALE);
809 RETV(CCLASS, 'S');
810 break;
811 case CHR('t'):
812 RETV(PLAIN, CHR('\t'));
813 break;
814 case CHR('u'):
815 c = lexdigits(v, 16, 4, 4);
816 if (ISERR() || !CHR_IS_IN_RANGE(c))
817 FAILW(REG_EESCAPE);
818 RETV(PLAIN, c);
819 break;
820 case CHR('U'):
821 c = lexdigits(v, 16, 8, 8);
822 if (ISERR() || !CHR_IS_IN_RANGE(c))
823 FAILW(REG_EESCAPE);
824 RETV(PLAIN, c);
825 break;
826 case CHR('v'):
827 RETV(PLAIN, CHR('\v'));
828 break;
829 case CHR('w'):
830 NOTE(REG_ULOCALE);
831 RETV(CCLASS, 'w');
832 break;
833 case CHR('W'):
834 NOTE(REG_ULOCALE);
835 RETV(CCLASS, 'W');
836 break;
837 case CHR('x'):
838 NOTE(REG_UUNPORT);
839 c = lexdigits(v, 16, 1, 255); /* REs >255 long outside spec */
840 if (ISERR() || !CHR_IS_IN_RANGE(c))
841 FAILW(REG_EESCAPE);
842 RETV(PLAIN, c);
843 break;
844 case CHR('y'):
845 NOTE(REG_ULOCALE);
846 RETV(WBDRY, 0);
847 break;
848 case CHR('Y'):
849 NOTE(REG_ULOCALE);
850 RETV(NWBDRY, 0);
851 break;
852 case CHR('Z'):
853 RETV(SEND, 0);
854 break;
855 case CHR('1'):
856 case CHR('2'):
857 case CHR('3'):
858 case CHR('4'):
859 case CHR('5'):
860 case CHR('6'):
861 case CHR('7'):
862 case CHR('8'):
863 case CHR('9'):
864 save = v->now;
865 v->now--; /* put first digit back */
866 c = lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */
867 if (ISERR())
868 FAILW(REG_EESCAPE);
869 /* ugly heuristic (first test is "exactly 1 digit?") */
870 if (v->now == save || ((int) c > 0 && (int) c <= v->nsubexp))
871 {
872 NOTE(REG_UBACKREF);
873 RETV(BACKREF, c);
874 }
875 /* oops, doesn't look like it's a backref after all... */
876 v->now = save;
877 /* and fall through into octal number */
878 /* FALLTHROUGH */
879 case CHR('0'):
880 NOTE(REG_UUNPORT);
881 v->now--; /* put first digit back */
882 c = lexdigits(v, 8, 1, 3);
883 if (ISERR())
884 FAILW(REG_EESCAPE);
885 if (c > 0xff)
886 {
887 /* out of range, so we handled one digit too much */
888 v->now--;
889 c >>= 3;
890 }
891 RETV(PLAIN, c);
892 break;
893 default:
894 assert(iscalpha(c));
895 FAILW(REG_EESCAPE); /* unknown alphabetic escape */
896 break;
897 }
898 assert(NOTREACHED);
899}
900
901/*
902 * lexdigits - slurp up digits and return chr value
903 *
904 * This does not account for overflow; callers should range-check the result
905 * if maxlen is large enough to make that possible.
906 */
907static chr /* chr value; errors signalled via ERR */
908lexdigits(struct vars *v,
909 int base,
910 int minlen,
911 int maxlen)
912{
913 uchr n; /* unsigned to avoid overflow misbehavior */
914 int len;
915 chr c;
916 int d;
917 const uchr ub = (uchr) base;
918
919 n = 0;
920 for (len = 0; len < maxlen && !ATEOS(); len++)
921 {
922 c = *v->now++;
923 switch (c)
924 {
925 case CHR('0'):
926 case CHR('1'):
927 case CHR('2'):
928 case CHR('3'):
929 case CHR('4'):
930 case CHR('5'):
931 case CHR('6'):
932 case CHR('7'):
933 case CHR('8'):
934 case CHR('9'):
935 d = DIGITVAL(c);
936 break;
937 case CHR('a'):
938 case CHR('A'):
939 d = 10;
940 break;
941 case CHR('b'):
942 case CHR('B'):
943 d = 11;
944 break;
945 case CHR('c'):
946 case CHR('C'):
947 d = 12;
948 break;
949 case CHR('d'):
950 case CHR('D'):
951 d = 13;
952 break;
953 case CHR('e'):
954 case CHR('E'):
955 d = 14;
956 break;
957 case CHR('f'):
958 case CHR('F'):
959 d = 15;
960 break;
961 default:
962 v->now--; /* oops, not a digit at all */
963 d = -1;
964 break;
965 }
966
967 if (d >= base)
968 { /* not a plausible digit */
969 v->now--;
970 d = -1;
971 }
972 if (d < 0)
973 break; /* NOTE BREAK OUT */
974 n = n * ub + (uchr) d;
975 }
976 if (len < minlen)
977 ERR(REG_EESCAPE);
978
979 return (chr) n;
980}
981
982/*
983 * brenext - get next BRE token
984 *
985 * This is much like EREs except for all the stupid backslashes and the
986 * context-dependency of some things.
987 */
988static int /* 1 normal, 0 failure */
989brenext(struct vars *v,
990 chr c)
991{
992 switch (c)
993 {
994 case CHR('*'):
995 if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^'))
996 RETV(PLAIN, c);
997 RET('*');
998 break;
999 case CHR('['):
1000 if (HAVE(6) && *(v->now + 0) == CHR('[') &&
1001 *(v->now + 1) == CHR(':') &&
1002 (*(v->now + 2) == CHR('<') ||
1003 *(v->now + 2) == CHR('>')) &&
1004 *(v->now + 3) == CHR(':') &&
1005 *(v->now + 4) == CHR(']') &&
1006 *(v->now + 5) == CHR(']'))
1007 {
1008 c = *(v->now + 2);
1009 v->now += 6;
1010 NOTE(REG_UNONPOSIX);
1011 RET((c == CHR('<')) ? '<' : '>');
1012 }
1013 INTOCON(L_BRACK);
1014 if (NEXT1('^'))
1015 {
1016 v->now++;
1017 RETV('[', 0);
1018 }
1019 RETV('[', 1);
1020 break;
1021 case CHR('.'):
1022 RET('.');
1023 break;
1024 case CHR('^'):
1025 if (LASTTYPE(EMPTY))
1026 RET('^');
1027 if (LASTTYPE('('))
1028 {
1029 NOTE(REG_UUNSPEC);
1030 RET('^');
1031 }
1032 RETV(PLAIN, c);
1033 break;
1034 case CHR('$'):
1035 if (v->cflags & REG_EXPANDED)
1036 skip(v);
1037 if (ATEOS())
1038 RET('$');
1039 if (NEXT2('\\', ')'))
1040 {
1041 NOTE(REG_UUNSPEC);
1042 RET('$');
1043 }
1044 RETV(PLAIN, c);
1045 break;
1046 case CHR('\\'):
1047 break; /* see below */
1048 default:
1049 RETV(PLAIN, c);
1050 break;
1051 }
1052
1053 assert(c == CHR('\\'));
1054
1055 if (ATEOS())
1056 FAILW(REG_EESCAPE);
1057
1058 c = *v->now++;
1059 switch (c)
1060 {
1061 case CHR('{'):
1062 INTOCON(L_BBND);
1063 NOTE(REG_UBOUNDS);
1064 RET('{');
1065 break;
1066 case CHR('('):
1067 RETV('(', 1);
1068 break;
1069 case CHR(')'):
1070 RETV(')', c);
1071 break;
1072 case CHR('<'):
1073 NOTE(REG_UNONPOSIX);
1074 RET('<');
1075 break;
1076 case CHR('>'):
1077 NOTE(REG_UNONPOSIX);
1078 RET('>');
1079 break;
1080 case CHR('1'):
1081 case CHR('2'):
1082 case CHR('3'):
1083 case CHR('4'):
1084 case CHR('5'):
1085 case CHR('6'):
1086 case CHR('7'):
1087 case CHR('8'):
1088 case CHR('9'):
1089 NOTE(REG_UBACKREF);
1090 RETV(BACKREF, (chr) DIGITVAL(c));
1091 break;
1092 default:
1093 if (iscalnum(c))
1094 {
1095 NOTE(REG_UBSALNUM);
1096 NOTE(REG_UUNSPEC);
1097 }
1098 RETV(PLAIN, c);
1099 break;
1100 }
1101
1102 assert(NOTREACHED);
1103 return 0;
1104}
1105
1106/*
1107 * skip - skip white space and comments in expanded form
1108 */
1109static void
1110skip(struct vars *v)
1111{
1112 const chr *start = v->now;
1113
1114 assert(v->cflags & REG_EXPANDED);
1115
1116 for (;;)
1117 {
1118 while (!ATEOS() && iscspace(*v->now))
1119 v->now++;
1120 if (ATEOS() || *v->now != CHR('#'))
1121 break; /* NOTE BREAK OUT */
1122 assert(NEXT1('#'));
1123 while (!ATEOS() && *v->now != CHR('\n'))
1124 v->now++;
1125 /* leave the newline to be picked up by the iscspace loop */
1126 }
1127
1128 if (v->now != start)
1129 NOTE(REG_UNONPOSIX);
1130}
1131
1132/*
1133 * newline - return the chr for a newline
1134 *
1135 * This helps confine use of CHR to this source file.
1136 */
1137static chr
1138newline(void)
1139{
1140 return CHR('\n');
1141}
1142
1143/*
1144 * chrnamed - return the chr known by a given (chr string) name
1145 *
1146 * The code is a bit clumsy, but this routine gets only such specialized
1147 * use that it hardly matters.
1148 */
1149static chr
1150chrnamed(struct vars *v,
1151 const chr *startp, /* start of name */
1152 const chr *endp, /* just past end of name */
1153 chr lastresort) /* what to return if name lookup fails */
1154{
1155 chr c;
1156 int errsave;
1157 int e;
1158 struct cvec *cv;
1159
1160 errsave = v->err;
1161 v->err = 0;
1162 c = element(v, startp, endp);
1163 e = v->err;
1164 v->err = errsave;
1165
1166 if (e != 0)
1167 return lastresort;
1168
1169 cv = range(v, c, c, 0);
1170 if (cv->nchrs == 0)
1171 return lastresort;
1172 return cv->chrs[0];
1173}
1174