1 | /* |
2 | * lexical analyzer |
3 | * This file is #included by regcomp.c. |
4 | * |
5 | * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. |
6 | * |
7 | * Development of this software was funded, in part, by Cray Research Inc., |
8 | * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics |
9 | * Corporation, none of whom are responsible for the results. The author |
10 | * thanks all of them. |
11 | * |
12 | * Redistribution and use in source and binary forms -- with or without |
13 | * modification -- are permitted for any purpose, provided that |
14 | * redistributions in source form retain this entire copyright notice and |
15 | * indicate the origin and nature of any modifications. |
16 | * |
17 | * I'd appreciate being given credit for this package in the documentation |
18 | * of software which uses it, but that is not a requirement. |
19 | * |
20 | * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, |
21 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY |
22 | * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL |
23 | * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
24 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
25 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; |
26 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
27 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR |
28 | * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF |
29 | * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
30 | * |
31 | * src/backend/regex/regc_lex.c |
32 | * |
33 | */ |
34 | |
35 | /* scanning macros (know about v) */ |
36 | #define ATEOS() (v->now >= v->stop) |
37 | #define HAVE(n) (v->stop - v->now >= (n)) |
38 | #define NEXT1(c) (!ATEOS() && *v->now == CHR(c)) |
39 | #define NEXT2(a,b) (HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b)) |
40 | #define NEXT3(a,b,c) (HAVE(3) && *v->now == CHR(a) && \ |
41 | *(v->now+1) == CHR(b) && \ |
42 | *(v->now+2) == CHR(c)) |
43 | #define SET(c) (v->nexttype = (c)) |
44 | #define SETV(c, n) (v->nexttype = (c), v->nextvalue = (n)) |
45 | #define RET(c) return (SET(c), 1) |
46 | #define RETV(c, n) return (SETV(c, n), 1) |
47 | #define FAILW(e) return (ERR(e), 0) /* ERR does SET(EOS) */ |
48 | #define LASTTYPE(t) (v->lasttype == (t)) |
49 | |
50 | /* lexical contexts */ |
51 | #define L_ERE 1 /* mainline ERE/ARE */ |
52 | #define L_BRE 2 /* mainline BRE */ |
53 | #define L_Q 3 /* REG_QUOTE */ |
54 | #define L_EBND 4 /* ERE/ARE bound */ |
55 | #define L_BBND 5 /* BRE bound */ |
56 | #define L_BRACK 6 /* brackets */ |
57 | #define L_CEL 7 /* collating element */ |
58 | #define L_ECL 8 /* equivalence class */ |
59 | #define L_CCL 9 /* character class */ |
60 | #define INTOCON(c) (v->lexcon = (c)) |
61 | #define INCON(con) (v->lexcon == (con)) |
62 | |
63 | /* construct pointer past end of chr array */ |
64 | #define ENDOF(array) ((array) + sizeof(array)/sizeof(chr)) |
65 | |
66 | /* |
67 | * lexstart - set up lexical stuff, scan leading options |
68 | */ |
69 | static void |
70 | lexstart(struct vars *v) |
71 | { |
72 | prefixes(v); /* may turn on new type bits etc. */ |
73 | NOERR(); |
74 | |
75 | if (v->cflags & REG_QUOTE) |
76 | { |
77 | assert(!(v->cflags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE))); |
78 | INTOCON(L_Q); |
79 | } |
80 | else if (v->cflags & REG_EXTENDED) |
81 | { |
82 | assert(!(v->cflags & REG_QUOTE)); |
83 | INTOCON(L_ERE); |
84 | } |
85 | else |
86 | { |
87 | assert(!(v->cflags & (REG_QUOTE | REG_ADVF))); |
88 | INTOCON(L_BRE); |
89 | } |
90 | |
91 | v->nexttype = EMPTY; /* remember we were at the start */ |
92 | next(v); /* set up the first token */ |
93 | } |
94 | |
95 | /* |
96 | * prefixes - implement various special prefixes |
97 | */ |
98 | static void |
99 | prefixes(struct vars *v) |
100 | { |
101 | /* literal string doesn't get any of this stuff */ |
102 | if (v->cflags & REG_QUOTE) |
103 | return; |
104 | |
105 | /* initial "***" gets special things */ |
106 | if (HAVE(4) && NEXT3('*', '*', '*')) |
107 | switch (*(v->now + 3)) |
108 | { |
109 | case CHR('?'): /* "***?" error, msg shows version */ |
110 | ERR(REG_BADPAT); |
111 | return; /* proceed no further */ |
112 | break; |
113 | case CHR('='): /* "***=" shifts to literal string */ |
114 | NOTE(REG_UNONPOSIX); |
115 | v->cflags |= REG_QUOTE; |
116 | v->cflags &= ~(REG_ADVANCED | REG_EXPANDED | REG_NEWLINE); |
117 | v->now += 4; |
118 | return; /* and there can be no more prefixes */ |
119 | break; |
120 | case CHR(':'): /* "***:" shifts to AREs */ |
121 | NOTE(REG_UNONPOSIX); |
122 | v->cflags |= REG_ADVANCED; |
123 | v->now += 4; |
124 | break; |
125 | default: /* otherwise *** is just an error */ |
126 | ERR(REG_BADRPT); |
127 | return; |
128 | break; |
129 | } |
130 | |
131 | /* BREs and EREs don't get embedded options */ |
132 | if ((v->cflags & REG_ADVANCED) != REG_ADVANCED) |
133 | return; |
134 | |
135 | /* embedded options (AREs only) */ |
136 | if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2))) |
137 | { |
138 | NOTE(REG_UNONPOSIX); |
139 | v->now += 2; |
140 | for (; !ATEOS() && iscalpha(*v->now); v->now++) |
141 | switch (*v->now) |
142 | { |
143 | case CHR('b'): /* BREs (but why???) */ |
144 | v->cflags &= ~(REG_ADVANCED | REG_QUOTE); |
145 | break; |
146 | case CHR('c'): /* case sensitive */ |
147 | v->cflags &= ~REG_ICASE; |
148 | break; |
149 | case CHR('e'): /* plain EREs */ |
150 | v->cflags |= REG_EXTENDED; |
151 | v->cflags &= ~(REG_ADVF | REG_QUOTE); |
152 | break; |
153 | case CHR('i'): /* case insensitive */ |
154 | v->cflags |= REG_ICASE; |
155 | break; |
156 | case CHR('m'): /* Perloid synonym for n */ |
157 | case CHR('n'): /* \n affects ^ $ . [^ */ |
158 | v->cflags |= REG_NEWLINE; |
159 | break; |
160 | case CHR('p'): /* ~Perl, \n affects . [^ */ |
161 | v->cflags |= REG_NLSTOP; |
162 | v->cflags &= ~REG_NLANCH; |
163 | break; |
164 | case CHR('q'): /* literal string */ |
165 | v->cflags |= REG_QUOTE; |
166 | v->cflags &= ~REG_ADVANCED; |
167 | break; |
168 | case CHR('s'): /* single line, \n ordinary */ |
169 | v->cflags &= ~REG_NEWLINE; |
170 | break; |
171 | case CHR('t'): /* tight syntax */ |
172 | v->cflags &= ~REG_EXPANDED; |
173 | break; |
174 | case CHR('w'): /* weird, \n affects ^ $ only */ |
175 | v->cflags &= ~REG_NLSTOP; |
176 | v->cflags |= REG_NLANCH; |
177 | break; |
178 | case CHR('x'): /* expanded syntax */ |
179 | v->cflags |= REG_EXPANDED; |
180 | break; |
181 | default: |
182 | ERR(REG_BADOPT); |
183 | return; |
184 | } |
185 | if (!NEXT1(')')) |
186 | { |
187 | ERR(REG_BADOPT); |
188 | return; |
189 | } |
190 | v->now++; |
191 | if (v->cflags & REG_QUOTE) |
192 | v->cflags &= ~(REG_EXPANDED | REG_NEWLINE); |
193 | } |
194 | } |
195 | |
196 | /* |
197 | * lexnest - "call a subroutine", interpolating string at the lexical level |
198 | * |
199 | * Note, this is not a very general facility. There are a number of |
200 | * implicit assumptions about what sorts of strings can be subroutines. |
201 | */ |
202 | static void |
203 | lexnest(struct vars *v, |
204 | const chr *beginp, /* start of interpolation */ |
205 | const chr *endp) /* one past end of interpolation */ |
206 | { |
207 | assert(v->savenow == NULL); /* only one level of nesting */ |
208 | v->savenow = v->now; |
209 | v->savestop = v->stop; |
210 | v->now = beginp; |
211 | v->stop = endp; |
212 | } |
213 | |
214 | /* |
215 | * string constants to interpolate as expansions of things like \d |
216 | */ |
217 | static const chr backd[] = { /* \d */ |
218 | CHR('['), CHR('['), CHR(':'), |
219 | CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), |
220 | CHR(':'), CHR(']'), CHR(']') |
221 | }; |
222 | static const chr backD[] = { /* \D */ |
223 | CHR('['), CHR('^'), CHR('['), CHR(':'), |
224 | CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), |
225 | CHR(':'), CHR(']'), CHR(']') |
226 | }; |
227 | static const chr brbackd[] = { /* \d within brackets */ |
228 | CHR('['), CHR(':'), |
229 | CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), |
230 | CHR(':'), CHR(']') |
231 | }; |
232 | static const chr backs[] = { /* \s */ |
233 | CHR('['), CHR('['), CHR(':'), |
234 | CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), |
235 | CHR(':'), CHR(']'), CHR(']') |
236 | }; |
237 | static const chr backS[] = { /* \S */ |
238 | CHR('['), CHR('^'), CHR('['), CHR(':'), |
239 | CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), |
240 | CHR(':'), CHR(']'), CHR(']') |
241 | }; |
242 | static const chr brbacks[] = { /* \s within brackets */ |
243 | CHR('['), CHR(':'), |
244 | CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), |
245 | CHR(':'), CHR(']') |
246 | }; |
247 | static const chr backw[] = { /* \w */ |
248 | CHR('['), CHR('['), CHR(':'), |
249 | CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), |
250 | CHR(':'), CHR(']'), CHR('_'), CHR(']') |
251 | }; |
252 | static const chr backW[] = { /* \W */ |
253 | CHR('['), CHR('^'), CHR('['), CHR(':'), |
254 | CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), |
255 | CHR(':'), CHR(']'), CHR('_'), CHR(']') |
256 | }; |
257 | static const chr brbackw[] = { /* \w within brackets */ |
258 | CHR('['), CHR(':'), |
259 | CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), |
260 | CHR(':'), CHR(']'), CHR('_') |
261 | }; |
262 | |
263 | /* |
264 | * lexword - interpolate a bracket expression for word characters |
265 | * Possibly ought to inquire whether there is a "word" character class. |
266 | */ |
267 | static void |
268 | lexword(struct vars *v) |
269 | { |
270 | lexnest(v, backw, ENDOF(backw)); |
271 | } |
272 | |
273 | /* |
274 | * next - get next token |
275 | */ |
276 | static int /* 1 normal, 0 failure */ |
277 | next(struct vars *v) |
278 | { |
279 | chr c; |
280 | |
281 | /* errors yield an infinite sequence of failures */ |
282 | if (ISERR()) |
283 | return 0; /* the error has set nexttype to EOS */ |
284 | |
285 | /* remember flavor of last token */ |
286 | v->lasttype = v->nexttype; |
287 | |
288 | /* REG_BOSONLY */ |
289 | if (v->nexttype == EMPTY && (v->cflags & REG_BOSONLY)) |
290 | { |
291 | /* at start of a REG_BOSONLY RE */ |
292 | RETV(SBEGIN, 0); /* same as \A */ |
293 | } |
294 | |
295 | /* if we're nested and we've hit end, return to outer level */ |
296 | if (v->savenow != NULL && ATEOS()) |
297 | { |
298 | v->now = v->savenow; |
299 | v->stop = v->savestop; |
300 | v->savenow = v->savestop = NULL; |
301 | } |
302 | |
303 | /* skip white space etc. if appropriate (not in literal or []) */ |
304 | if (v->cflags & REG_EXPANDED) |
305 | switch (v->lexcon) |
306 | { |
307 | case L_ERE: |
308 | case L_BRE: |
309 | case L_EBND: |
310 | case L_BBND: |
311 | skip(v); |
312 | break; |
313 | } |
314 | |
315 | /* handle EOS, depending on context */ |
316 | if (ATEOS()) |
317 | { |
318 | switch (v->lexcon) |
319 | { |
320 | case L_ERE: |
321 | case L_BRE: |
322 | case L_Q: |
323 | RET(EOS); |
324 | break; |
325 | case L_EBND: |
326 | case L_BBND: |
327 | FAILW(REG_EBRACE); |
328 | break; |
329 | case L_BRACK: |
330 | case L_CEL: |
331 | case L_ECL: |
332 | case L_CCL: |
333 | FAILW(REG_EBRACK); |
334 | break; |
335 | } |
336 | assert(NOTREACHED); |
337 | } |
338 | |
339 | /* okay, time to actually get a character */ |
340 | c = *v->now++; |
341 | |
342 | /* deal with the easy contexts, punt EREs to code below */ |
343 | switch (v->lexcon) |
344 | { |
345 | case L_BRE: /* punt BREs to separate function */ |
346 | return brenext(v, c); |
347 | break; |
348 | case L_ERE: /* see below */ |
349 | break; |
350 | case L_Q: /* literal strings are easy */ |
351 | RETV(PLAIN, c); |
352 | break; |
353 | case L_BBND: /* bounds are fairly simple */ |
354 | case L_EBND: |
355 | switch (c) |
356 | { |
357 | case CHR('0'): |
358 | case CHR('1'): |
359 | case CHR('2'): |
360 | case CHR('3'): |
361 | case CHR('4'): |
362 | case CHR('5'): |
363 | case CHR('6'): |
364 | case CHR('7'): |
365 | case CHR('8'): |
366 | case CHR('9'): |
367 | RETV(DIGIT, (chr) DIGITVAL(c)); |
368 | break; |
369 | case CHR(','): |
370 | RET(','); |
371 | break; |
372 | case CHR('}'): /* ERE bound ends with } */ |
373 | if (INCON(L_EBND)) |
374 | { |
375 | INTOCON(L_ERE); |
376 | if ((v->cflags & REG_ADVF) && NEXT1('?')) |
377 | { |
378 | v->now++; |
379 | NOTE(REG_UNONPOSIX); |
380 | RETV('}', 0); |
381 | } |
382 | RETV('}', 1); |
383 | } |
384 | else |
385 | FAILW(REG_BADBR); |
386 | break; |
387 | case CHR('\\'): /* BRE bound ends with \} */ |
388 | if (INCON(L_BBND) && NEXT1('}')) |
389 | { |
390 | v->now++; |
391 | INTOCON(L_BRE); |
392 | RET('}'); |
393 | } |
394 | else |
395 | FAILW(REG_BADBR); |
396 | break; |
397 | default: |
398 | FAILW(REG_BADBR); |
399 | break; |
400 | } |
401 | assert(NOTREACHED); |
402 | break; |
403 | case L_BRACK: /* brackets are not too hard */ |
404 | switch (c) |
405 | { |
406 | case CHR(']'): |
407 | if (LASTTYPE('[')) |
408 | RETV(PLAIN, c); |
409 | else |
410 | { |
411 | INTOCON((v->cflags & REG_EXTENDED) ? |
412 | L_ERE : L_BRE); |
413 | RET(']'); |
414 | } |
415 | break; |
416 | case CHR('\\'): |
417 | NOTE(REG_UBBS); |
418 | if (!(v->cflags & REG_ADVF)) |
419 | RETV(PLAIN, c); |
420 | NOTE(REG_UNONPOSIX); |
421 | if (ATEOS()) |
422 | FAILW(REG_EESCAPE); |
423 | (DISCARD) lexescape(v); |
424 | switch (v->nexttype) |
425 | { /* not all escapes okay here */ |
426 | case PLAIN: |
427 | return 1; |
428 | break; |
429 | case CCLASS: |
430 | switch (v->nextvalue) |
431 | { |
432 | case 'd': |
433 | lexnest(v, brbackd, ENDOF(brbackd)); |
434 | break; |
435 | case 's': |
436 | lexnest(v, brbacks, ENDOF(brbacks)); |
437 | break; |
438 | case 'w': |
439 | lexnest(v, brbackw, ENDOF(brbackw)); |
440 | break; |
441 | default: |
442 | FAILW(REG_EESCAPE); |
443 | break; |
444 | } |
445 | /* lexnest done, back up and try again */ |
446 | v->nexttype = v->lasttype; |
447 | return next(v); |
448 | break; |
449 | } |
450 | /* not one of the acceptable escapes */ |
451 | FAILW(REG_EESCAPE); |
452 | break; |
453 | case CHR('-'): |
454 | if (LASTTYPE('[') || NEXT1(']')) |
455 | RETV(PLAIN, c); |
456 | else |
457 | RETV(RANGE, c); |
458 | break; |
459 | case CHR('['): |
460 | if (ATEOS()) |
461 | FAILW(REG_EBRACK); |
462 | switch (*v->now++) |
463 | { |
464 | case CHR('.'): |
465 | INTOCON(L_CEL); |
466 | /* might or might not be locale-specific */ |
467 | RET(COLLEL); |
468 | break; |
469 | case CHR('='): |
470 | INTOCON(L_ECL); |
471 | NOTE(REG_ULOCALE); |
472 | RET(ECLASS); |
473 | break; |
474 | case CHR(':'): |
475 | INTOCON(L_CCL); |
476 | NOTE(REG_ULOCALE); |
477 | RET(CCLASS); |
478 | break; |
479 | default: /* oops */ |
480 | v->now--; |
481 | RETV(PLAIN, c); |
482 | break; |
483 | } |
484 | assert(NOTREACHED); |
485 | break; |
486 | default: |
487 | RETV(PLAIN, c); |
488 | break; |
489 | } |
490 | assert(NOTREACHED); |
491 | break; |
492 | case L_CEL: /* collating elements are easy */ |
493 | if (c == CHR('.') && NEXT1(']')) |
494 | { |
495 | v->now++; |
496 | INTOCON(L_BRACK); |
497 | RETV(END, '.'); |
498 | } |
499 | else |
500 | RETV(PLAIN, c); |
501 | break; |
502 | case L_ECL: /* ditto equivalence classes */ |
503 | if (c == CHR('=') && NEXT1(']')) |
504 | { |
505 | v->now++; |
506 | INTOCON(L_BRACK); |
507 | RETV(END, '='); |
508 | } |
509 | else |
510 | RETV(PLAIN, c); |
511 | break; |
512 | case L_CCL: /* ditto character classes */ |
513 | if (c == CHR(':') && NEXT1(']')) |
514 | { |
515 | v->now++; |
516 | INTOCON(L_BRACK); |
517 | RETV(END, ':'); |
518 | } |
519 | else |
520 | RETV(PLAIN, c); |
521 | break; |
522 | default: |
523 | assert(NOTREACHED); |
524 | break; |
525 | } |
526 | |
527 | /* that got rid of everything except EREs and AREs */ |
528 | assert(INCON(L_ERE)); |
529 | |
530 | /* deal with EREs and AREs, except for backslashes */ |
531 | switch (c) |
532 | { |
533 | case CHR('|'): |
534 | RET('|'); |
535 | break; |
536 | case CHR('*'): |
537 | if ((v->cflags & REG_ADVF) && NEXT1('?')) |
538 | { |
539 | v->now++; |
540 | NOTE(REG_UNONPOSIX); |
541 | RETV('*', 0); |
542 | } |
543 | RETV('*', 1); |
544 | break; |
545 | case CHR('+'): |
546 | if ((v->cflags & REG_ADVF) && NEXT1('?')) |
547 | { |
548 | v->now++; |
549 | NOTE(REG_UNONPOSIX); |
550 | RETV('+', 0); |
551 | } |
552 | RETV('+', 1); |
553 | break; |
554 | case CHR('?'): |
555 | if ((v->cflags & REG_ADVF) && NEXT1('?')) |
556 | { |
557 | v->now++; |
558 | NOTE(REG_UNONPOSIX); |
559 | RETV('?', 0); |
560 | } |
561 | RETV('?', 1); |
562 | break; |
563 | case CHR('{'): /* bounds start or plain character */ |
564 | if (v->cflags & REG_EXPANDED) |
565 | skip(v); |
566 | if (ATEOS() || !iscdigit(*v->now)) |
567 | { |
568 | NOTE(REG_UBRACES); |
569 | NOTE(REG_UUNSPEC); |
570 | RETV(PLAIN, c); |
571 | } |
572 | else |
573 | { |
574 | NOTE(REG_UBOUNDS); |
575 | INTOCON(L_EBND); |
576 | RET('{'); |
577 | } |
578 | assert(NOTREACHED); |
579 | break; |
580 | case CHR('('): /* parenthesis, or advanced extension */ |
581 | if ((v->cflags & REG_ADVF) && NEXT1('?')) |
582 | { |
583 | NOTE(REG_UNONPOSIX); |
584 | v->now++; |
585 | if (ATEOS()) |
586 | FAILW(REG_BADRPT); |
587 | switch (*v->now++) |
588 | { |
589 | case CHR(':'): /* non-capturing paren */ |
590 | RETV('(', 0); |
591 | break; |
592 | case CHR('#'): /* comment */ |
593 | while (!ATEOS() && *v->now != CHR(')')) |
594 | v->now++; |
595 | if (!ATEOS()) |
596 | v->now++; |
597 | assert(v->nexttype == v->lasttype); |
598 | return next(v); |
599 | break; |
600 | case CHR('='): /* positive lookahead */ |
601 | NOTE(REG_ULOOKAROUND); |
602 | RETV(LACON, LATYPE_AHEAD_POS); |
603 | break; |
604 | case CHR('!'): /* negative lookahead */ |
605 | NOTE(REG_ULOOKAROUND); |
606 | RETV(LACON, LATYPE_AHEAD_NEG); |
607 | break; |
608 | case CHR('<'): |
609 | if (ATEOS()) |
610 | FAILW(REG_BADRPT); |
611 | switch (*v->now++) |
612 | { |
613 | case CHR('='): /* positive lookbehind */ |
614 | NOTE(REG_ULOOKAROUND); |
615 | RETV(LACON, LATYPE_BEHIND_POS); |
616 | break; |
617 | case CHR('!'): /* negative lookbehind */ |
618 | NOTE(REG_ULOOKAROUND); |
619 | RETV(LACON, LATYPE_BEHIND_NEG); |
620 | break; |
621 | default: |
622 | FAILW(REG_BADRPT); |
623 | break; |
624 | } |
625 | assert(NOTREACHED); |
626 | break; |
627 | default: |
628 | FAILW(REG_BADRPT); |
629 | break; |
630 | } |
631 | assert(NOTREACHED); |
632 | } |
633 | if (v->cflags & REG_NOSUB) |
634 | RETV('(', 0); /* all parens non-capturing */ |
635 | else |
636 | RETV('(', 1); |
637 | break; |
638 | case CHR(')'): |
639 | if (LASTTYPE('(')) |
640 | NOTE(REG_UUNSPEC); |
641 | RETV(')', c); |
642 | break; |
643 | case CHR('['): /* easy except for [[:<:]] and [[:>:]] */ |
644 | if (HAVE(6) && *(v->now + 0) == CHR('[') && |
645 | *(v->now + 1) == CHR(':') && |
646 | (*(v->now + 2) == CHR('<') || |
647 | *(v->now + 2) == CHR('>')) && |
648 | *(v->now + 3) == CHR(':') && |
649 | *(v->now + 4) == CHR(']') && |
650 | *(v->now + 5) == CHR(']')) |
651 | { |
652 | c = *(v->now + 2); |
653 | v->now += 6; |
654 | NOTE(REG_UNONPOSIX); |
655 | RET((c == CHR('<')) ? '<' : '>'); |
656 | } |
657 | INTOCON(L_BRACK); |
658 | if (NEXT1('^')) |
659 | { |
660 | v->now++; |
661 | RETV('[', 0); |
662 | } |
663 | RETV('[', 1); |
664 | break; |
665 | case CHR('.'): |
666 | RET('.'); |
667 | break; |
668 | case CHR('^'): |
669 | RET('^'); |
670 | break; |
671 | case CHR('$'): |
672 | RET('$'); |
673 | break; |
674 | case CHR('\\'): /* mostly punt backslashes to code below */ |
675 | if (ATEOS()) |
676 | FAILW(REG_EESCAPE); |
677 | break; |
678 | default: /* ordinary character */ |
679 | RETV(PLAIN, c); |
680 | break; |
681 | } |
682 | |
683 | /* ERE/ARE backslash handling; backslash already eaten */ |
684 | assert(!ATEOS()); |
685 | if (!(v->cflags & REG_ADVF)) |
686 | { /* only AREs have non-trivial escapes */ |
687 | if (iscalnum(*v->now)) |
688 | { |
689 | NOTE(REG_UBSALNUM); |
690 | NOTE(REG_UUNSPEC); |
691 | } |
692 | RETV(PLAIN, *v->now++); |
693 | } |
694 | (DISCARD) lexescape(v); |
695 | if (ISERR()) |
696 | FAILW(REG_EESCAPE); |
697 | if (v->nexttype == CCLASS) |
698 | { /* fudge at lexical level */ |
699 | switch (v->nextvalue) |
700 | { |
701 | case 'd': |
702 | lexnest(v, backd, ENDOF(backd)); |
703 | break; |
704 | case 'D': |
705 | lexnest(v, backD, ENDOF(backD)); |
706 | break; |
707 | case 's': |
708 | lexnest(v, backs, ENDOF(backs)); |
709 | break; |
710 | case 'S': |
711 | lexnest(v, backS, ENDOF(backS)); |
712 | break; |
713 | case 'w': |
714 | lexnest(v, backw, ENDOF(backw)); |
715 | break; |
716 | case 'W': |
717 | lexnest(v, backW, ENDOF(backW)); |
718 | break; |
719 | default: |
720 | assert(NOTREACHED); |
721 | FAILW(REG_ASSERT); |
722 | break; |
723 | } |
724 | /* lexnest done, back up and try again */ |
725 | v->nexttype = v->lasttype; |
726 | return next(v); |
727 | } |
728 | /* otherwise, lexescape has already done the work */ |
729 | return !ISERR(); |
730 | } |
731 | |
732 | /* |
733 | * lexescape - parse an ARE backslash escape (backslash already eaten) |
734 | * Note slightly nonstandard use of the CCLASS type code. |
735 | */ |
736 | static int /* not actually used, but convenient for RETV */ |
737 | lexescape(struct vars *v) |
738 | { |
739 | chr c; |
740 | static const chr alert[] = { |
741 | CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t') |
742 | }; |
743 | static const chr esc[] = { |
744 | CHR('E'), CHR('S'), CHR('C') |
745 | }; |
746 | const chr *save; |
747 | |
748 | assert(v->cflags & REG_ADVF); |
749 | |
750 | assert(!ATEOS()); |
751 | c = *v->now++; |
752 | if (!iscalnum(c)) |
753 | RETV(PLAIN, c); |
754 | |
755 | NOTE(REG_UNONPOSIX); |
756 | switch (c) |
757 | { |
758 | case CHR('a'): |
759 | RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007'))); |
760 | break; |
761 | case CHR('A'): |
762 | RETV(SBEGIN, 0); |
763 | break; |
764 | case CHR('b'): |
765 | RETV(PLAIN, CHR('\b')); |
766 | break; |
767 | case CHR('B'): |
768 | RETV(PLAIN, CHR('\\')); |
769 | break; |
770 | case CHR('c'): |
771 | NOTE(REG_UUNPORT); |
772 | if (ATEOS()) |
773 | FAILW(REG_EESCAPE); |
774 | RETV(PLAIN, (chr) (*v->now++ & 037)); |
775 | break; |
776 | case CHR('d'): |
777 | NOTE(REG_ULOCALE); |
778 | RETV(CCLASS, 'd'); |
779 | break; |
780 | case CHR('D'): |
781 | NOTE(REG_ULOCALE); |
782 | RETV(CCLASS, 'D'); |
783 | break; |
784 | case CHR('e'): |
785 | NOTE(REG_UUNPORT); |
786 | RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033'))); |
787 | break; |
788 | case CHR('f'): |
789 | RETV(PLAIN, CHR('\f')); |
790 | break; |
791 | case CHR('m'): |
792 | RET('<'); |
793 | break; |
794 | case CHR('M'): |
795 | RET('>'); |
796 | break; |
797 | case CHR('n'): |
798 | RETV(PLAIN, CHR('\n')); |
799 | break; |
800 | case CHR('r'): |
801 | RETV(PLAIN, CHR('\r')); |
802 | break; |
803 | case CHR('s'): |
804 | NOTE(REG_ULOCALE); |
805 | RETV(CCLASS, 's'); |
806 | break; |
807 | case CHR('S'): |
808 | NOTE(REG_ULOCALE); |
809 | RETV(CCLASS, 'S'); |
810 | break; |
811 | case CHR('t'): |
812 | RETV(PLAIN, CHR('\t')); |
813 | break; |
814 | case CHR('u'): |
815 | c = lexdigits(v, 16, 4, 4); |
816 | if (ISERR() || !CHR_IS_IN_RANGE(c)) |
817 | FAILW(REG_EESCAPE); |
818 | RETV(PLAIN, c); |
819 | break; |
820 | case CHR('U'): |
821 | c = lexdigits(v, 16, 8, 8); |
822 | if (ISERR() || !CHR_IS_IN_RANGE(c)) |
823 | FAILW(REG_EESCAPE); |
824 | RETV(PLAIN, c); |
825 | break; |
826 | case CHR('v'): |
827 | RETV(PLAIN, CHR('\v')); |
828 | break; |
829 | case CHR('w'): |
830 | NOTE(REG_ULOCALE); |
831 | RETV(CCLASS, 'w'); |
832 | break; |
833 | case CHR('W'): |
834 | NOTE(REG_ULOCALE); |
835 | RETV(CCLASS, 'W'); |
836 | break; |
837 | case CHR('x'): |
838 | NOTE(REG_UUNPORT); |
839 | c = lexdigits(v, 16, 1, 255); /* REs >255 long outside spec */ |
840 | if (ISERR() || !CHR_IS_IN_RANGE(c)) |
841 | FAILW(REG_EESCAPE); |
842 | RETV(PLAIN, c); |
843 | break; |
844 | case CHR('y'): |
845 | NOTE(REG_ULOCALE); |
846 | RETV(WBDRY, 0); |
847 | break; |
848 | case CHR('Y'): |
849 | NOTE(REG_ULOCALE); |
850 | RETV(NWBDRY, 0); |
851 | break; |
852 | case CHR('Z'): |
853 | RETV(SEND, 0); |
854 | break; |
855 | case CHR('1'): |
856 | case CHR('2'): |
857 | case CHR('3'): |
858 | case CHR('4'): |
859 | case CHR('5'): |
860 | case CHR('6'): |
861 | case CHR('7'): |
862 | case CHR('8'): |
863 | case CHR('9'): |
864 | save = v->now; |
865 | v->now--; /* put first digit back */ |
866 | c = lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */ |
867 | if (ISERR()) |
868 | FAILW(REG_EESCAPE); |
869 | /* ugly heuristic (first test is "exactly 1 digit?") */ |
870 | if (v->now == save || ((int) c > 0 && (int) c <= v->nsubexp)) |
871 | { |
872 | NOTE(REG_UBACKREF); |
873 | RETV(BACKREF, c); |
874 | } |
875 | /* oops, doesn't look like it's a backref after all... */ |
876 | v->now = save; |
877 | /* and fall through into octal number */ |
878 | /* FALLTHROUGH */ |
879 | case CHR('0'): |
880 | NOTE(REG_UUNPORT); |
881 | v->now--; /* put first digit back */ |
882 | c = lexdigits(v, 8, 1, 3); |
883 | if (ISERR()) |
884 | FAILW(REG_EESCAPE); |
885 | if (c > 0xff) |
886 | { |
887 | /* out of range, so we handled one digit too much */ |
888 | v->now--; |
889 | c >>= 3; |
890 | } |
891 | RETV(PLAIN, c); |
892 | break; |
893 | default: |
894 | assert(iscalpha(c)); |
895 | FAILW(REG_EESCAPE); /* unknown alphabetic escape */ |
896 | break; |
897 | } |
898 | assert(NOTREACHED); |
899 | } |
900 | |
901 | /* |
902 | * lexdigits - slurp up digits and return chr value |
903 | * |
904 | * This does not account for overflow; callers should range-check the result |
905 | * if maxlen is large enough to make that possible. |
906 | */ |
907 | static chr /* chr value; errors signalled via ERR */ |
908 | lexdigits(struct vars *v, |
909 | int base, |
910 | int minlen, |
911 | int maxlen) |
912 | { |
913 | uchr n; /* unsigned to avoid overflow misbehavior */ |
914 | int len; |
915 | chr c; |
916 | int d; |
917 | const uchr ub = (uchr) base; |
918 | |
919 | n = 0; |
920 | for (len = 0; len < maxlen && !ATEOS(); len++) |
921 | { |
922 | c = *v->now++; |
923 | switch (c) |
924 | { |
925 | case CHR('0'): |
926 | case CHR('1'): |
927 | case CHR('2'): |
928 | case CHR('3'): |
929 | case CHR('4'): |
930 | case CHR('5'): |
931 | case CHR('6'): |
932 | case CHR('7'): |
933 | case CHR('8'): |
934 | case CHR('9'): |
935 | d = DIGITVAL(c); |
936 | break; |
937 | case CHR('a'): |
938 | case CHR('A'): |
939 | d = 10; |
940 | break; |
941 | case CHR('b'): |
942 | case CHR('B'): |
943 | d = 11; |
944 | break; |
945 | case CHR('c'): |
946 | case CHR('C'): |
947 | d = 12; |
948 | break; |
949 | case CHR('d'): |
950 | case CHR('D'): |
951 | d = 13; |
952 | break; |
953 | case CHR('e'): |
954 | case CHR('E'): |
955 | d = 14; |
956 | break; |
957 | case CHR('f'): |
958 | case CHR('F'): |
959 | d = 15; |
960 | break; |
961 | default: |
962 | v->now--; /* oops, not a digit at all */ |
963 | d = -1; |
964 | break; |
965 | } |
966 | |
967 | if (d >= base) |
968 | { /* not a plausible digit */ |
969 | v->now--; |
970 | d = -1; |
971 | } |
972 | if (d < 0) |
973 | break; /* NOTE BREAK OUT */ |
974 | n = n * ub + (uchr) d; |
975 | } |
976 | if (len < minlen) |
977 | ERR(REG_EESCAPE); |
978 | |
979 | return (chr) n; |
980 | } |
981 | |
982 | /* |
983 | * brenext - get next BRE token |
984 | * |
985 | * This is much like EREs except for all the stupid backslashes and the |
986 | * context-dependency of some things. |
987 | */ |
988 | static int /* 1 normal, 0 failure */ |
989 | brenext(struct vars *v, |
990 | chr c) |
991 | { |
992 | switch (c) |
993 | { |
994 | case CHR('*'): |
995 | if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^')) |
996 | RETV(PLAIN, c); |
997 | RET('*'); |
998 | break; |
999 | case CHR('['): |
1000 | if (HAVE(6) && *(v->now + 0) == CHR('[') && |
1001 | *(v->now + 1) == CHR(':') && |
1002 | (*(v->now + 2) == CHR('<') || |
1003 | *(v->now + 2) == CHR('>')) && |
1004 | *(v->now + 3) == CHR(':') && |
1005 | *(v->now + 4) == CHR(']') && |
1006 | *(v->now + 5) == CHR(']')) |
1007 | { |
1008 | c = *(v->now + 2); |
1009 | v->now += 6; |
1010 | NOTE(REG_UNONPOSIX); |
1011 | RET((c == CHR('<')) ? '<' : '>'); |
1012 | } |
1013 | INTOCON(L_BRACK); |
1014 | if (NEXT1('^')) |
1015 | { |
1016 | v->now++; |
1017 | RETV('[', 0); |
1018 | } |
1019 | RETV('[', 1); |
1020 | break; |
1021 | case CHR('.'): |
1022 | RET('.'); |
1023 | break; |
1024 | case CHR('^'): |
1025 | if (LASTTYPE(EMPTY)) |
1026 | RET('^'); |
1027 | if (LASTTYPE('(')) |
1028 | { |
1029 | NOTE(REG_UUNSPEC); |
1030 | RET('^'); |
1031 | } |
1032 | RETV(PLAIN, c); |
1033 | break; |
1034 | case CHR('$'): |
1035 | if (v->cflags & REG_EXPANDED) |
1036 | skip(v); |
1037 | if (ATEOS()) |
1038 | RET('$'); |
1039 | if (NEXT2('\\', ')')) |
1040 | { |
1041 | NOTE(REG_UUNSPEC); |
1042 | RET('$'); |
1043 | } |
1044 | RETV(PLAIN, c); |
1045 | break; |
1046 | case CHR('\\'): |
1047 | break; /* see below */ |
1048 | default: |
1049 | RETV(PLAIN, c); |
1050 | break; |
1051 | } |
1052 | |
1053 | assert(c == CHR('\\')); |
1054 | |
1055 | if (ATEOS()) |
1056 | FAILW(REG_EESCAPE); |
1057 | |
1058 | c = *v->now++; |
1059 | switch (c) |
1060 | { |
1061 | case CHR('{'): |
1062 | INTOCON(L_BBND); |
1063 | NOTE(REG_UBOUNDS); |
1064 | RET('{'); |
1065 | break; |
1066 | case CHR('('): |
1067 | RETV('(', 1); |
1068 | break; |
1069 | case CHR(')'): |
1070 | RETV(')', c); |
1071 | break; |
1072 | case CHR('<'): |
1073 | NOTE(REG_UNONPOSIX); |
1074 | RET('<'); |
1075 | break; |
1076 | case CHR('>'): |
1077 | NOTE(REG_UNONPOSIX); |
1078 | RET('>'); |
1079 | break; |
1080 | case CHR('1'): |
1081 | case CHR('2'): |
1082 | case CHR('3'): |
1083 | case CHR('4'): |
1084 | case CHR('5'): |
1085 | case CHR('6'): |
1086 | case CHR('7'): |
1087 | case CHR('8'): |
1088 | case CHR('9'): |
1089 | NOTE(REG_UBACKREF); |
1090 | RETV(BACKREF, (chr) DIGITVAL(c)); |
1091 | break; |
1092 | default: |
1093 | if (iscalnum(c)) |
1094 | { |
1095 | NOTE(REG_UBSALNUM); |
1096 | NOTE(REG_UUNSPEC); |
1097 | } |
1098 | RETV(PLAIN, c); |
1099 | break; |
1100 | } |
1101 | |
1102 | assert(NOTREACHED); |
1103 | return 0; |
1104 | } |
1105 | |
1106 | /* |
1107 | * skip - skip white space and comments in expanded form |
1108 | */ |
1109 | static void |
1110 | skip(struct vars *v) |
1111 | { |
1112 | const chr *start = v->now; |
1113 | |
1114 | assert(v->cflags & REG_EXPANDED); |
1115 | |
1116 | for (;;) |
1117 | { |
1118 | while (!ATEOS() && iscspace(*v->now)) |
1119 | v->now++; |
1120 | if (ATEOS() || *v->now != CHR('#')) |
1121 | break; /* NOTE BREAK OUT */ |
1122 | assert(NEXT1('#')); |
1123 | while (!ATEOS() && *v->now != CHR('\n')) |
1124 | v->now++; |
1125 | /* leave the newline to be picked up by the iscspace loop */ |
1126 | } |
1127 | |
1128 | if (v->now != start) |
1129 | NOTE(REG_UNONPOSIX); |
1130 | } |
1131 | |
1132 | /* |
1133 | * newline - return the chr for a newline |
1134 | * |
1135 | * This helps confine use of CHR to this source file. |
1136 | */ |
1137 | static chr |
1138 | newline(void) |
1139 | { |
1140 | return CHR('\n'); |
1141 | } |
1142 | |
1143 | /* |
1144 | * chrnamed - return the chr known by a given (chr string) name |
1145 | * |
1146 | * The code is a bit clumsy, but this routine gets only such specialized |
1147 | * use that it hardly matters. |
1148 | */ |
1149 | static chr |
1150 | chrnamed(struct vars *v, |
1151 | const chr *startp, /* start of name */ |
1152 | const chr *endp, /* just past end of name */ |
1153 | chr lastresort) /* what to return if name lookup fails */ |
1154 | { |
1155 | chr c; |
1156 | int errsave; |
1157 | int e; |
1158 | struct cvec *cv; |
1159 | |
1160 | errsave = v->err; |
1161 | v->err = 0; |
1162 | c = element(v, startp, endp); |
1163 | e = v->err; |
1164 | v->err = errsave; |
1165 | |
1166 | if (e != 0) |
1167 | return lastresort; |
1168 | |
1169 | cv = range(v, c, c, 0); |
1170 | if (cv->nchrs == 0) |
1171 | return lastresort; |
1172 | return cv->chrs[0]; |
1173 | } |
1174 | |