1#include <stdlib.h>
2#include <stdio.h>
3#include <string.h>
4#include <setjmp.h>
5#include <limits.h>
6
7#include "regexp.h"
8#include "utf.h"
9
10#define emit regemit
11#define next regnext
12#define accept regaccept
13
14#define nelem(a) (int)(sizeof (a) / sizeof (a)[0])
15
16#define REPINF 255
17#define MAXSUB REG_MAXSUB
18#define MAXPROG (32 << 10)
19#define MAXREC 1024
20
21typedef struct Reclass Reclass;
22typedef struct Renode Renode;
23typedef struct Reinst Reinst;
24typedef struct Rethread Rethread;
25
26struct Reclass {
27 Rune *end;
28 Rune spans[64];
29};
30
31struct Reprog {
32 Reinst *start, *end;
33 int flags;
34 int nsub;
35 Reclass cclass[16];
36};
37
38struct cstate {
39 Reprog *prog;
40 Renode *pstart, *pend;
41
42 const char *source;
43 int ncclass;
44 int nsub;
45 Renode *sub[MAXSUB];
46
47 int lookahead;
48 Rune yychar;
49 Reclass *yycc;
50 int yymin, yymax;
51
52 const char *error;
53 jmp_buf kaboom;
54};
55
56static void die(struct cstate *g, const char *message)
57{
58 g->error = message;
59 longjmp(g->kaboom, 1);
60}
61
62static int canon(Rune c)
63{
64 Rune u = toupperrune(c);
65 if (c >= 128 && u < 128)
66 return c;
67 return u;
68}
69
70/* Scan */
71
72enum {
73 L_CHAR = 256,
74 L_CCLASS, /* character class */
75 L_NCCLASS, /* negative character class */
76 L_NC, /* "(?:" no capture */
77 L_PLA, /* "(?=" positive lookahead */
78 L_NLA, /* "(?!" negative lookahead */
79 L_WORD, /* "\b" word boundary */
80 L_NWORD, /* "\B" non-word boundary */
81 L_REF, /* "\1" back-reference */
82 L_COUNT, /* {M,N} */
83};
84
85static int hex(struct cstate *g, int c)
86{
87 if (c >= '0' && c <= '9') return c - '0';
88 if (c >= 'a' && c <= 'f') return c - 'a' + 0xA;
89 if (c >= 'A' && c <= 'F') return c - 'A' + 0xA;
90 die(g, "invalid escape sequence");
91 return 0;
92}
93
94static int dec(struct cstate *g, int c)
95{
96 if (c >= '0' && c <= '9') return c - '0';
97 die(g, "invalid quantifier");
98 return 0;
99}
100
101#define ESCAPES "BbDdSsWw^$\\.*+?()[]{}|0123456789"
102
103static int isunicodeletter(int c)
104{
105 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || isalpharune(c);
106}
107
108static int nextrune(struct cstate *g)
109{
110 g->source += chartorune(&g->yychar, g->source);
111 if (g->yychar == '\\') {
112 g->source += chartorune(&g->yychar, g->source);
113 switch (g->yychar) {
114 case 0: die(g, "unterminated escape sequence"); break;
115 case 'f': g->yychar = '\f'; return 0;
116 case 'n': g->yychar = '\n'; return 0;
117 case 'r': g->yychar = '\r'; return 0;
118 case 't': g->yychar = '\t'; return 0;
119 case 'v': g->yychar = '\v'; return 0;
120 case 'c':
121 g->yychar = (*g->source++) & 31;
122 return 0;
123 case 'x':
124 g->yychar = hex(g, *g->source++) << 4;
125 g->yychar += hex(g, *g->source++);
126 if (g->yychar == 0) {
127 g->yychar = '0';
128 return 1;
129 }
130 return 0;
131 case 'u':
132 g->yychar = hex(g, *g->source++) << 12;
133 g->yychar += hex(g, *g->source++) << 8;
134 g->yychar += hex(g, *g->source++) << 4;
135 g->yychar += hex(g, *g->source++);
136 if (g->yychar == 0) {
137 g->yychar = '0';
138 return 1;
139 }
140 return 0;
141 }
142 if (strchr(ESCAPES, g->yychar))
143 return 1;
144 if (isunicodeletter(g->yychar) || g->yychar == '_') /* check identity escape */
145 die(g, "invalid escape character");
146 return 0;
147 }
148 return 0;
149}
150
151static int lexcount(struct cstate *g)
152{
153 g->yychar = *g->source++;
154
155 g->yymin = dec(g, g->yychar);
156 g->yychar = *g->source++;
157 while (g->yychar != ',' && g->yychar != '}') {
158 g->yymin = g->yymin * 10 + dec(g, g->yychar);
159 g->yychar = *g->source++;
160 if (g->yymin >= REPINF)
161 die(g, "numeric overflow");
162 }
163
164 if (g->yychar == ',') {
165 g->yychar = *g->source++;
166 if (g->yychar == '}') {
167 g->yymax = REPINF;
168 } else {
169 g->yymax = dec(g, g->yychar);
170 g->yychar = *g->source++;
171 while (g->yychar != '}') {
172 g->yymax = g->yymax * 10 + dec(g, g->yychar);
173 g->yychar = *g->source++;
174 if (g->yymax >= REPINF)
175 die(g, "numeric overflow");
176 }
177 }
178 } else {
179 g->yymax = g->yymin;
180 }
181
182 return L_COUNT;
183}
184
185static void newcclass(struct cstate *g)
186{
187 if (g->ncclass >= nelem(g->prog->cclass))
188 die(g, "too many character classes");
189 g->yycc = g->prog->cclass + g->ncclass++;
190 g->yycc->end = g->yycc->spans;
191}
192
193static void addrange(struct cstate *g, Rune a, Rune b)
194{
195 if (a > b)
196 die(g, "invalid character class range");
197 if (g->yycc->end + 2 == g->yycc->spans + nelem(g->yycc->spans))
198 die(g, "too many character class ranges");
199 *g->yycc->end++ = a;
200 *g->yycc->end++ = b;
201}
202
203static void addranges_d(struct cstate *g)
204{
205 addrange(g, '0', '9');
206}
207
208static void addranges_D(struct cstate *g)
209{
210 addrange(g, 0, '0'-1);
211 addrange(g, '9'+1, 0xFFFF);
212}
213
214static void addranges_s(struct cstate *g)
215{
216 addrange(g, 0x9, 0xD);
217 addrange(g, 0x20, 0x20);
218 addrange(g, 0xA0, 0xA0);
219 addrange(g, 0x2028, 0x2029);
220 addrange(g, 0xFEFF, 0xFEFF);
221}
222
223static void addranges_S(struct cstate *g)
224{
225 addrange(g, 0, 0x9-1);
226 addrange(g, 0xD+1, 0x20-1);
227 addrange(g, 0x20+1, 0xA0-1);
228 addrange(g, 0xA0+1, 0x2028-1);
229 addrange(g, 0x2029+1, 0xFEFF-1);
230 addrange(g, 0xFEFF+1, 0xFFFF);
231}
232
233static void addranges_w(struct cstate *g)
234{
235 addrange(g, '0', '9');
236 addrange(g, 'A', 'Z');
237 addrange(g, '_', '_');
238 addrange(g, 'a', 'z');
239}
240
241static void addranges_W(struct cstate *g)
242{
243 addrange(g, 0, '0'-1);
244 addrange(g, '9'+1, 'A'-1);
245 addrange(g, 'Z'+1, '_'-1);
246 addrange(g, '_'+1, 'a'-1);
247 addrange(g, 'z'+1, 0xFFFF);
248}
249
250static int lexclass(struct cstate *g)
251{
252 int type = L_CCLASS;
253 int quoted, havesave, havedash;
254 Rune save = 0;
255
256 newcclass(g);
257
258 quoted = nextrune(g);
259 if (!quoted && g->yychar == '^') {
260 type = L_NCCLASS;
261 quoted = nextrune(g);
262 }
263
264 havesave = havedash = 0;
265 for (;;) {
266 if (g->yychar == 0)
267 die(g, "unterminated character class");
268 if (!quoted && g->yychar == ']')
269 break;
270
271 if (!quoted && g->yychar == '-') {
272 if (havesave) {
273 if (havedash) {
274 addrange(g, save, '-');
275 havesave = havedash = 0;
276 } else {
277 havedash = 1;
278 }
279 } else {
280 save = '-';
281 havesave = 1;
282 }
283 } else if (quoted && strchr("DSWdsw", g->yychar)) {
284 if (havesave) {
285 addrange(g, save, save);
286 if (havedash)
287 addrange(g, '-', '-');
288 }
289 switch (g->yychar) {
290 case 'd': addranges_d(g); break;
291 case 's': addranges_s(g); break;
292 case 'w': addranges_w(g); break;
293 case 'D': addranges_D(g); break;
294 case 'S': addranges_S(g); break;
295 case 'W': addranges_W(g); break;
296 }
297 havesave = havedash = 0;
298 } else {
299 if (quoted) {
300 if (g->yychar == 'b')
301 g->yychar = '\b';
302 else if (g->yychar == '0')
303 g->yychar = 0;
304 /* else identity escape */
305 }
306 if (havesave) {
307 if (havedash) {
308 addrange(g, save, g->yychar);
309 havesave = havedash = 0;
310 } else {
311 addrange(g, save, save);
312 save = g->yychar;
313 }
314 } else {
315 save = g->yychar;
316 havesave = 1;
317 }
318 }
319
320 quoted = nextrune(g);
321 }
322
323 if (havesave) {
324 addrange(g, save, save);
325 if (havedash)
326 addrange(g, '-', '-');
327 }
328
329 return type;
330}
331
332static int lex(struct cstate *g)
333{
334 int quoted = nextrune(g);
335 if (quoted) {
336 switch (g->yychar) {
337 case 'b': return L_WORD;
338 case 'B': return L_NWORD;
339 case 'd': newcclass(g); addranges_d(g); return L_CCLASS;
340 case 's': newcclass(g); addranges_s(g); return L_CCLASS;
341 case 'w': newcclass(g); addranges_w(g); return L_CCLASS;
342 case 'D': newcclass(g); addranges_d(g); return L_NCCLASS;
343 case 'S': newcclass(g); addranges_s(g); return L_NCCLASS;
344 case 'W': newcclass(g); addranges_w(g); return L_NCCLASS;
345 case '0': g->yychar = 0; return L_CHAR;
346 }
347 if (g->yychar >= '0' && g->yychar <= '9') {
348 g->yychar -= '0';
349 if (*g->source >= '0' && *g->source <= '9')
350 g->yychar = g->yychar * 10 + *g->source++ - '0';
351 return L_REF;
352 }
353 return L_CHAR;
354 }
355
356 switch (g->yychar) {
357 case 0:
358 case '$': case ')': case '*': case '+':
359 case '.': case '?': case '^': case '|':
360 return g->yychar;
361 }
362
363 if (g->yychar == '{')
364 return lexcount(g);
365 if (g->yychar == '[')
366 return lexclass(g);
367 if (g->yychar == '(') {
368 if (g->source[0] == '?') {
369 if (g->source[1] == ':') {
370 g->source += 2;
371 return L_NC;
372 }
373 if (g->source[1] == '=') {
374 g->source += 2;
375 return L_PLA;
376 }
377 if (g->source[1] == '!') {
378 g->source += 2;
379 return L_NLA;
380 }
381 }
382 return '(';
383 }
384
385 return L_CHAR;
386}
387
388/* Parse */
389
390enum {
391 P_CAT, P_ALT, P_REP,
392 P_BOL, P_EOL, P_WORD, P_NWORD,
393 P_PAR, P_PLA, P_NLA,
394 P_ANY, P_CHAR, P_CCLASS, P_NCCLASS,
395 P_REF,
396};
397
398struct Renode {
399 unsigned char type;
400 unsigned char ng, m, n;
401 Rune c;
402 Reclass *cc;
403 Renode *x;
404 Renode *y;
405};
406
407static Renode *newnode(struct cstate *g, int type)
408{
409 Renode *node = g->pend++;
410 node->type = type;
411 node->cc = NULL;
412 node->c = 0;
413 node->ng = 0;
414 node->m = 0;
415 node->n = 0;
416 node->x = node->y = NULL;
417 return node;
418}
419
420static int empty(Renode *node)
421{
422 if (!node) return 1;
423 switch (node->type) {
424 default: return 1;
425 case P_CAT: return empty(node->x) && empty(node->y);
426 case P_ALT: return empty(node->x) || empty(node->y);
427 case P_REP: return empty(node->x) || node->m == 0;
428 case P_PAR: return empty(node->x);
429 case P_REF: return empty(node->x);
430 case P_ANY: case P_CHAR: case P_CCLASS: case P_NCCLASS: return 0;
431 }
432}
433
434static Renode *newrep(struct cstate *g, Renode *atom, int ng, int min, int max)
435{
436 Renode *rep = newnode(g, P_REP);
437 if (max == REPINF && empty(atom))
438 die(g, "infinite loop matching the empty string");
439 rep->ng = ng;
440 rep->m = min;
441 rep->n = max;
442 rep->x = atom;
443 return rep;
444}
445
446static void next(struct cstate *g)
447{
448 g->lookahead = lex(g);
449}
450
451static int accept(struct cstate *g, int t)
452{
453 if (g->lookahead == t) {
454 next(g);
455 return 1;
456 }
457 return 0;
458}
459
460static Renode *parsealt(struct cstate *g);
461
462static Renode *parseatom(struct cstate *g)
463{
464 Renode *atom;
465 if (g->lookahead == L_CHAR) {
466 atom = newnode(g, P_CHAR);
467 atom->c = g->yychar;
468 next(g);
469 return atom;
470 }
471 if (g->lookahead == L_CCLASS) {
472 atom = newnode(g, P_CCLASS);
473 atom->cc = g->yycc;
474 next(g);
475 return atom;
476 }
477 if (g->lookahead == L_NCCLASS) {
478 atom = newnode(g, P_NCCLASS);
479 atom->cc = g->yycc;
480 next(g);
481 return atom;
482 }
483 if (g->lookahead == L_REF) {
484 atom = newnode(g, P_REF);
485 if (g->yychar == 0 || g->yychar >= g->nsub || !g->sub[g->yychar])
486 die(g, "invalid back-reference");
487 atom->n = g->yychar;
488 atom->x = g->sub[g->yychar];
489 next(g);
490 return atom;
491 }
492 if (accept(g, '.'))
493 return newnode(g, P_ANY);
494 if (accept(g, '(')) {
495 atom = newnode(g, P_PAR);
496 if (g->nsub == MAXSUB)
497 die(g, "too many captures");
498 atom->n = g->nsub++;
499 atom->x = parsealt(g);
500 g->sub[atom->n] = atom;
501 if (!accept(g, ')'))
502 die(g, "unmatched '('");
503 return atom;
504 }
505 if (accept(g, L_NC)) {
506 atom = parsealt(g);
507 if (!accept(g, ')'))
508 die(g, "unmatched '('");
509 return atom;
510 }
511 if (accept(g, L_PLA)) {
512 atom = newnode(g, P_PLA);
513 atom->x = parsealt(g);
514 if (!accept(g, ')'))
515 die(g, "unmatched '('");
516 return atom;
517 }
518 if (accept(g, L_NLA)) {
519 atom = newnode(g, P_NLA);
520 atom->x = parsealt(g);
521 if (!accept(g, ')'))
522 die(g, "unmatched '('");
523 return atom;
524 }
525 die(g, "syntax error");
526 return NULL;
527}
528
529static Renode *parserep(struct cstate *g)
530{
531 Renode *atom;
532
533 if (accept(g, '^')) return newnode(g, P_BOL);
534 if (accept(g, '$')) return newnode(g, P_EOL);
535 if (accept(g, L_WORD)) return newnode(g, P_WORD);
536 if (accept(g, L_NWORD)) return newnode(g, P_NWORD);
537
538 atom = parseatom(g);
539 if (g->lookahead == L_COUNT) {
540 int min = g->yymin, max = g->yymax;
541 next(g);
542 if (max < min)
543 die(g, "invalid quantifier");
544 return newrep(g, atom, accept(g, '?'), min, max);
545 }
546 if (accept(g, '*')) return newrep(g, atom, accept(g, '?'), 0, REPINF);
547 if (accept(g, '+')) return newrep(g, atom, accept(g, '?'), 1, REPINF);
548 if (accept(g, '?')) return newrep(g, atom, accept(g, '?'), 0, 1);
549 return atom;
550}
551
552static Renode *parsecat(struct cstate *g)
553{
554 Renode *cat, *head, **tail;
555 if (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
556 /* Build a right-leaning tree by splicing in new 'cat' at the tail. */
557 head = parserep(g);
558 tail = &head;
559 while (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
560 cat = newnode(g, P_CAT);
561 cat->x = *tail;
562 cat->y = parserep(g);
563 *tail = cat;
564 tail = &cat->y;
565 }
566 return head;
567 }
568 return NULL;
569}
570
571static Renode *parsealt(struct cstate *g)
572{
573 Renode *alt, *x;
574 alt = parsecat(g);
575 while (accept(g, '|')) {
576 x = alt;
577 alt = newnode(g, P_ALT);
578 alt->x = x;
579 alt->y = parsecat(g);
580 }
581 return alt;
582}
583
584/* Compile */
585
586enum {
587 I_END, I_JUMP, I_SPLIT, I_PLA, I_NLA,
588 I_ANYNL, I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF,
589 I_BOL, I_EOL, I_WORD, I_NWORD,
590 I_LPAR, I_RPAR
591};
592
593struct Reinst {
594 unsigned char opcode;
595 unsigned char n;
596 Rune c;
597 Reclass *cc;
598 Reinst *x;
599 Reinst *y;
600};
601
602static int count(struct cstate *g, Renode *node)
603{
604 int min, max, n;
605 if (!node) return 0;
606 switch (node->type) {
607 default: return 1;
608 case P_CAT: return count(g, node->x) + count(g, node->y);
609 case P_ALT: return count(g, node->x) + count(g, node->y) + 2;
610 case P_REP:
611 min = node->m;
612 max = node->n;
613 if (min == max) n = count(g, node->x) * min;
614 else if (max < REPINF) n = count(g, node->x) * max + (max - min);
615 else n = count(g, node->x) * (min + 1) + 2;
616 if (n < 0 || n > MAXPROG) die(g, "program too large");
617 return n;
618 case P_PAR: return count(g, node->x) + 2;
619 case P_PLA: return count(g, node->x) + 2;
620 case P_NLA: return count(g, node->x) + 2;
621 }
622}
623
624static Reinst *emit(Reprog *prog, int opcode)
625{
626 Reinst *inst = prog->end++;
627 inst->opcode = opcode;
628 inst->n = 0;
629 inst->c = 0;
630 inst->cc = NULL;
631 inst->x = inst->y = NULL;
632 return inst;
633}
634
635static void compile(Reprog *prog, Renode *node)
636{
637 Reinst *inst, *split, *jump;
638 int i;
639
640 if (!node)
641 return;
642
643loop:
644 switch (node->type) {
645 case P_CAT:
646 compile(prog, node->x);
647 node = node->y;
648 goto loop;
649
650 case P_ALT:
651 split = emit(prog, I_SPLIT);
652 compile(prog, node->x);
653 jump = emit(prog, I_JUMP);
654 compile(prog, node->y);
655 split->x = split + 1;
656 split->y = jump + 1;
657 jump->x = prog->end;
658 break;
659
660 case P_REP:
661 inst = NULL; /* silence compiler warning. assert(node->m > 0). */
662 for (i = 0; i < node->m; ++i) {
663 inst = prog->end;
664 compile(prog, node->x);
665 }
666 if (node->m == node->n)
667 break;
668 if (node->n < REPINF) {
669 for (i = node->m; i < node->n; ++i) {
670 split = emit(prog, I_SPLIT);
671 compile(prog, node->x);
672 if (node->ng) {
673 split->y = split + 1;
674 split->x = prog->end;
675 } else {
676 split->x = split + 1;
677 split->y = prog->end;
678 }
679 }
680 } else if (node->m == 0) {
681 split = emit(prog, I_SPLIT);
682 compile(prog, node->x);
683 jump = emit(prog, I_JUMP);
684 if (node->ng) {
685 split->y = split + 1;
686 split->x = prog->end;
687 } else {
688 split->x = split + 1;
689 split->y = prog->end;
690 }
691 jump->x = split;
692 } else {
693 split = emit(prog, I_SPLIT);
694 if (node->ng) {
695 split->y = inst;
696 split->x = prog->end;
697 } else {
698 split->x = inst;
699 split->y = prog->end;
700 }
701 }
702 break;
703
704 case P_BOL: emit(prog, I_BOL); break;
705 case P_EOL: emit(prog, I_EOL); break;
706 case P_WORD: emit(prog, I_WORD); break;
707 case P_NWORD: emit(prog, I_NWORD); break;
708
709 case P_PAR:
710 inst = emit(prog, I_LPAR);
711 inst->n = node->n;
712 compile(prog, node->x);
713 inst = emit(prog, I_RPAR);
714 inst->n = node->n;
715 break;
716 case P_PLA:
717 split = emit(prog, I_PLA);
718 compile(prog, node->x);
719 emit(prog, I_END);
720 split->x = split + 1;
721 split->y = prog->end;
722 break;
723 case P_NLA:
724 split = emit(prog, I_NLA);
725 compile(prog, node->x);
726 emit(prog, I_END);
727 split->x = split + 1;
728 split->y = prog->end;
729 break;
730
731 case P_ANY:
732 emit(prog, I_ANY);
733 break;
734 case P_CHAR:
735 inst = emit(prog, I_CHAR);
736 inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c;
737 break;
738 case P_CCLASS:
739 inst = emit(prog, I_CCLASS);
740 inst->cc = node->cc;
741 break;
742 case P_NCCLASS:
743 inst = emit(prog, I_NCCLASS);
744 inst->cc = node->cc;
745 break;
746 case P_REF:
747 inst = emit(prog, I_REF);
748 inst->n = node->n;
749 break;
750 }
751}
752
753#ifdef TEST
754static void dumpnode(Renode *node)
755{
756 Rune *p;
757 if (!node) { printf("Empty"); return; }
758 switch (node->type) {
759 case P_CAT: printf("Cat("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
760 case P_ALT: printf("Alt("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
761 case P_REP:
762 printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n);
763 dumpnode(node->x);
764 printf(")");
765 break;
766 case P_BOL: printf("Bol"); break;
767 case P_EOL: printf("Eol"); break;
768 case P_WORD: printf("Word"); break;
769 case P_NWORD: printf("NotWord"); break;
770 case P_PAR: printf("Par(%d,", node->n); dumpnode(node->x); printf(")"); break;
771 case P_PLA: printf("PLA("); dumpnode(node->x); printf(")"); break;
772 case P_NLA: printf("NLA("); dumpnode(node->x); printf(")"); break;
773 case P_ANY: printf("Any"); break;
774 case P_CHAR: printf("Char(%c)", node->c); break;
775 case P_CCLASS:
776 printf("Class(");
777 for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
778 printf(")");
779 break;
780 case P_NCCLASS:
781 printf("NotClass(");
782 for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
783 printf(")");
784 break;
785 case P_REF: printf("Ref(%d)", node->n); break;
786 }
787}
788
789static void dumpprog(Reprog *prog)
790{
791 Reinst *inst;
792 int i;
793 for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) {
794 printf("% 5d: ", i);
795 switch (inst->opcode) {
796 case I_END: puts("end"); break;
797 case I_JUMP: printf("jump %d\n", (int)(inst->x - prog->start)); break;
798 case I_SPLIT: printf("split %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
799 case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
800 case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
801 case I_ANY: puts("any"); break;
802 case I_ANYNL: puts("anynl"); break;
803 case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n", inst->c); break;
804 case I_CCLASS: puts("cclass"); break;
805 case I_NCCLASS: puts("ncclass"); break;
806 case I_REF: printf("ref %d\n", inst->n); break;
807 case I_BOL: puts("bol"); break;
808 case I_EOL: puts("eol"); break;
809 case I_WORD: puts("word"); break;
810 case I_NWORD: puts("nword"); break;
811 case I_LPAR: printf("lpar %d\n", inst->n); break;
812 case I_RPAR: printf("rpar %d\n", inst->n); break;
813 }
814 }
815}
816#endif
817
818Reprog *regcompx(void *(*alloc)(void *ctx, void *p, int n), void *ctx,
819 const char *pattern, int cflags, const char **errorp)
820{
821 struct cstate g;
822 Renode *node;
823 Reinst *split, *jump;
824 int i, n;
825
826 g.pstart = NULL;
827 g.prog = NULL;
828
829 if (setjmp(g.kaboom)) {
830 if (errorp) *errorp = g.error;
831 alloc(ctx, g.pstart, 0);
832 alloc(ctx, g.prog, 0);
833 return NULL;
834 }
835
836 g.prog = alloc(ctx, NULL, sizeof (Reprog));
837 if (!g.prog)
838 die(&g, "cannot allocate regular expression");
839 n = strlen(pattern) * 2;
840 if (n > MAXPROG)
841 die(&g, "program too large");
842 if (n > 0) {
843 g.pstart = g.pend = alloc(ctx, NULL, sizeof (Renode) * n);
844 if (!g.pstart)
845 die(&g, "cannot allocate regular expression parse list");
846 }
847
848 g.source = pattern;
849 g.ncclass = 0;
850 g.nsub = 1;
851 for (i = 0; i < MAXSUB; ++i)
852 g.sub[i] = 0;
853
854 g.prog->flags = cflags;
855
856 next(&g);
857 node = parsealt(&g);
858 if (g.lookahead == ')')
859 die(&g, "unmatched ')'");
860 if (g.lookahead != 0)
861 die(&g, "syntax error");
862
863#ifdef TEST
864 dumpnode(node);
865 putchar('\n');
866#endif
867
868 n = 6 + count(&g, node);
869 if (n < 0 || n > MAXPROG)
870 die(&g, "program too large");
871
872 g.prog->nsub = g.nsub;
873 g.prog->start = g.prog->end = alloc(ctx, NULL, n * sizeof (Reinst));
874 if (!g.prog->start)
875 die(&g, "cannot allocate regular expression instruction list");
876
877 split = emit(g.prog, I_SPLIT);
878 split->x = split + 3;
879 split->y = split + 1;
880 emit(g.prog, I_ANYNL);
881 jump = emit(g.prog, I_JUMP);
882 jump->x = split;
883 emit(g.prog, I_LPAR);
884 compile(g.prog, node);
885 emit(g.prog, I_RPAR);
886 emit(g.prog, I_END);
887
888#ifdef TEST
889 dumpprog(g.prog);
890#endif
891
892 alloc(ctx, g.pstart, 0);
893
894 if (errorp) *errorp = NULL;
895 return g.prog;
896}
897
898void regfreex(void *(*alloc)(void *ctx, void *p, int n), void *ctx, Reprog *prog)
899{
900 if (prog) {
901 alloc(ctx, prog->start, 0);
902 alloc(ctx, prog, 0);
903 }
904}
905
906static void *default_alloc(void *ctx, void *p, int n)
907{
908 return realloc(p, (size_t)n);
909}
910
911Reprog *regcomp(const char *pattern, int cflags, const char **errorp)
912{
913 return regcompx(default_alloc, NULL, pattern, cflags, errorp);
914}
915
916void regfree(Reprog *prog)
917{
918 regfreex(default_alloc, NULL, prog);
919}
920
921/* Match */
922
923static int isnewline(int c)
924{
925 return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029;
926}
927
928static int iswordchar(int c)
929{
930 return c == '_' ||
931 (c >= 'a' && c <= 'z') ||
932 (c >= 'A' && c <= 'Z') ||
933 (c >= '0' && c <= '9');
934}
935
936static int incclass(Reclass *cc, Rune c)
937{
938 Rune *p;
939 for (p = cc->spans; p < cc->end; p += 2)
940 if (p[0] <= c && c <= p[1])
941 return 1;
942 return 0;
943}
944
945static int incclasscanon(Reclass *cc, Rune c)
946{
947 Rune *p, r;
948 for (p = cc->spans; p < cc->end; p += 2)
949 for (r = p[0]; r <= p[1]; ++r)
950 if (c == canon(r))
951 return 1;
952 return 0;
953}
954
955static int strncmpcanon(const char *a, const char *b, int n)
956{
957 Rune ra, rb;
958 int c;
959 while (n--) {
960 if (!*a) return -1;
961 if (!*b) return 1;
962 a += chartorune(&ra, a);
963 b += chartorune(&rb, b);
964 c = canon(ra) - canon(rb);
965 if (c)
966 return c;
967 }
968 return 0;
969}
970
971static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out, int depth)
972{
973 Resub scratch;
974 int result;
975 int i;
976 Rune c;
977
978 /* stack overflow */
979 if (depth > MAXREC)
980 return -1;
981
982 for (;;) {
983 switch (pc->opcode) {
984 case I_END:
985 return 0;
986 case I_JUMP:
987 pc = pc->x;
988 break;
989 case I_SPLIT:
990 scratch = *out;
991 result = match(pc->x, sp, bol, flags, &scratch, depth+1);
992 if (result == -1)
993 return -1;
994 if (result == 0) {
995 *out = scratch;
996 return 0;
997 }
998 pc = pc->y;
999 break;
1000
1001 case I_PLA:
1002 result = match(pc->x, sp, bol, flags, out, depth+1);
1003 if (result == -1)
1004 return -1;
1005 if (result == 1)
1006 return 1;
1007 pc = pc->y;
1008 break;
1009 case I_NLA:
1010 scratch = *out;
1011 result = match(pc->x, sp, bol, flags, &scratch, depth+1);
1012 if (result == -1)
1013 return -1;
1014 if (result == 0)
1015 return 1;
1016 pc = pc->y;
1017 break;
1018
1019 case I_ANYNL:
1020 sp += chartorune(&c, sp);
1021 if (c == 0)
1022 return 1;
1023 pc = pc + 1;
1024 break;
1025 case I_ANY:
1026 sp += chartorune(&c, sp);
1027 if (c == 0)
1028 return 1;
1029 if (isnewline(c))
1030 return 1;
1031 pc = pc + 1;
1032 break;
1033 case I_CHAR:
1034 sp += chartorune(&c, sp);
1035 if (c == 0)
1036 return 1;
1037 if (flags & REG_ICASE)
1038 c = canon(c);
1039 if (c != pc->c)
1040 return 1;
1041 pc = pc + 1;
1042 break;
1043 case I_CCLASS:
1044 sp += chartorune(&c, sp);
1045 if (c == 0)
1046 return 1;
1047 if (flags & REG_ICASE) {
1048 if (!incclasscanon(pc->cc, canon(c)))
1049 return 1;
1050 } else {
1051 if (!incclass(pc->cc, c))
1052 return 1;
1053 }
1054 pc = pc + 1;
1055 break;
1056 case I_NCCLASS:
1057 sp += chartorune(&c, sp);
1058 if (c == 0)
1059 return 1;
1060 if (flags & REG_ICASE) {
1061 if (incclasscanon(pc->cc, canon(c)))
1062 return 1;
1063 } else {
1064 if (incclass(pc->cc, c))
1065 return 1;
1066 }
1067 pc = pc + 1;
1068 break;
1069 case I_REF:
1070 i = out->sub[pc->n].ep - out->sub[pc->n].sp;
1071 if (flags & REG_ICASE) {
1072 if (strncmpcanon(sp, out->sub[pc->n].sp, i))
1073 return 1;
1074 } else {
1075 if (strncmp(sp, out->sub[pc->n].sp, i))
1076 return 1;
1077 }
1078 if (i > 0)
1079 sp += i;
1080 pc = pc + 1;
1081 break;
1082
1083 case I_BOL:
1084 if (sp == bol && !(flags & REG_NOTBOL)) {
1085 pc = pc + 1;
1086 break;
1087 }
1088 if (flags & REG_NEWLINE) {
1089 if (sp > bol && isnewline(sp[-1])) {
1090 pc = pc + 1;
1091 break;
1092 }
1093 }
1094 return 1;
1095 case I_EOL:
1096 if (*sp == 0) {
1097 pc = pc + 1;
1098 break;
1099 }
1100 if (flags & REG_NEWLINE) {
1101 if (isnewline(*sp)) {
1102 pc = pc + 1;
1103 break;
1104 }
1105 }
1106 return 1;
1107 case I_WORD:
1108 i = sp > bol && iswordchar(sp[-1]);
1109 i ^= iswordchar(sp[0]);
1110 if (!i)
1111 return 1;
1112 pc = pc + 1;
1113 break;
1114 case I_NWORD:
1115 i = sp > bol && iswordchar(sp[-1]);
1116 i ^= iswordchar(sp[0]);
1117 if (i)
1118 return 1;
1119 pc = pc + 1;
1120 break;
1121
1122 case I_LPAR:
1123 out->sub[pc->n].sp = sp;
1124 pc = pc + 1;
1125 break;
1126 case I_RPAR:
1127 out->sub[pc->n].ep = sp;
1128 pc = pc + 1;
1129 break;
1130 default:
1131 return 1;
1132 }
1133 }
1134}
1135
1136int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags)
1137{
1138 Resub scratch;
1139 int i;
1140
1141 if (!sub)
1142 sub = &scratch;
1143
1144 sub->nsub = prog->nsub;
1145 for (i = 0; i < MAXSUB; ++i)
1146 sub->sub[i].sp = sub->sub[i].ep = NULL;
1147
1148 return match(prog->start, sp, sp, prog->flags | eflags, sub, 0);
1149}
1150
1151#ifdef TEST
1152int main(int argc, char **argv)
1153{
1154 const char *error;
1155 const char *s;
1156 Reprog *p;
1157 Resub m;
1158 int i;
1159
1160 if (argc > 1) {
1161 p = regcomp(argv[1], 0, &error);
1162 if (!p) {
1163 fprintf(stderr, "regcomp: %s\n", error);
1164 return 1;
1165 }
1166
1167 if (argc > 2) {
1168 s = argv[2];
1169 printf("nsub = %d\n", p->nsub);
1170 if (!regexec(p, s, &m, 0)) {
1171 for (i = 0; i < m.nsub; ++i) {
1172 int n = m.sub[i].ep - m.sub[i].sp;
1173 if (n > 0)
1174 printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp);
1175 else
1176 printf("match %d: n=0 ''\n", i);
1177 }
1178 } else {
1179 printf("no match\n");
1180 }
1181 }
1182 }
1183
1184 return 0;
1185}
1186#endif
1187