1 | #include <stdlib.h> |
2 | #include <stdio.h> |
3 | #include <string.h> |
4 | #include <setjmp.h> |
5 | #include <limits.h> |
6 | |
7 | #include "regexp.h" |
8 | #include "utf.h" |
9 | |
10 | #define emit regemit |
11 | #define next regnext |
12 | #define accept regaccept |
13 | |
14 | #define nelem(a) (int)(sizeof (a) / sizeof (a)[0]) |
15 | |
16 | #define REPINF 255 |
17 | #define MAXSUB REG_MAXSUB |
18 | #define MAXPROG (32 << 10) |
19 | #define MAXREC 1024 |
20 | |
21 | typedef struct Reclass Reclass; |
22 | typedef struct Renode Renode; |
23 | typedef struct Reinst Reinst; |
24 | typedef struct Rethread Rethread; |
25 | |
26 | struct Reclass { |
27 | Rune *end; |
28 | Rune spans[64]; |
29 | }; |
30 | |
31 | struct Reprog { |
32 | Reinst *start, *end; |
33 | int flags; |
34 | int nsub; |
35 | Reclass cclass[16]; |
36 | }; |
37 | |
38 | struct cstate { |
39 | Reprog *prog; |
40 | Renode *pstart, *pend; |
41 | |
42 | const char *source; |
43 | int ncclass; |
44 | int nsub; |
45 | Renode *sub[MAXSUB]; |
46 | |
47 | int lookahead; |
48 | Rune yychar; |
49 | Reclass *yycc; |
50 | int yymin, yymax; |
51 | |
52 | const char *error; |
53 | jmp_buf kaboom; |
54 | }; |
55 | |
56 | static void die(struct cstate *g, const char *message) |
57 | { |
58 | g->error = message; |
59 | longjmp(g->kaboom, 1); |
60 | } |
61 | |
62 | static int canon(Rune c) |
63 | { |
64 | Rune u = toupperrune(c); |
65 | if (c >= 128 && u < 128) |
66 | return c; |
67 | return u; |
68 | } |
69 | |
70 | /* Scan */ |
71 | |
72 | enum { |
73 | L_CHAR = 256, |
74 | L_CCLASS, /* character class */ |
75 | L_NCCLASS, /* negative character class */ |
76 | L_NC, /* "(?:" no capture */ |
77 | L_PLA, /* "(?=" positive lookahead */ |
78 | L_NLA, /* "(?!" negative lookahead */ |
79 | L_WORD, /* "\b" word boundary */ |
80 | L_NWORD, /* "\B" non-word boundary */ |
81 | L_REF, /* "\1" back-reference */ |
82 | L_COUNT, /* {M,N} */ |
83 | }; |
84 | |
85 | static int hex(struct cstate *g, int c) |
86 | { |
87 | if (c >= '0' && c <= '9') return c - '0'; |
88 | if (c >= 'a' && c <= 'f') return c - 'a' + 0xA; |
89 | if (c >= 'A' && c <= 'F') return c - 'A' + 0xA; |
90 | die(g, "invalid escape sequence" ); |
91 | return 0; |
92 | } |
93 | |
94 | static int dec(struct cstate *g, int c) |
95 | { |
96 | if (c >= '0' && c <= '9') return c - '0'; |
97 | die(g, "invalid quantifier" ); |
98 | return 0; |
99 | } |
100 | |
101 | #define ESCAPES "BbDdSsWw^$\\.*+?()[]{}|0123456789" |
102 | |
103 | static int isunicodeletter(int c) |
104 | { |
105 | return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || isalpharune(c); |
106 | } |
107 | |
108 | static int nextrune(struct cstate *g) |
109 | { |
110 | g->source += chartorune(&g->yychar, g->source); |
111 | if (g->yychar == '\\') { |
112 | g->source += chartorune(&g->yychar, g->source); |
113 | switch (g->yychar) { |
114 | case 0: die(g, "unterminated escape sequence" ); break; |
115 | case 'f': g->yychar = '\f'; return 0; |
116 | case 'n': g->yychar = '\n'; return 0; |
117 | case 'r': g->yychar = '\r'; return 0; |
118 | case 't': g->yychar = '\t'; return 0; |
119 | case 'v': g->yychar = '\v'; return 0; |
120 | case 'c': |
121 | g->yychar = (*g->source++) & 31; |
122 | return 0; |
123 | case 'x': |
124 | g->yychar = hex(g, *g->source++) << 4; |
125 | g->yychar += hex(g, *g->source++); |
126 | if (g->yychar == 0) { |
127 | g->yychar = '0'; |
128 | return 1; |
129 | } |
130 | return 0; |
131 | case 'u': |
132 | g->yychar = hex(g, *g->source++) << 12; |
133 | g->yychar += hex(g, *g->source++) << 8; |
134 | g->yychar += hex(g, *g->source++) << 4; |
135 | g->yychar += hex(g, *g->source++); |
136 | if (g->yychar == 0) { |
137 | g->yychar = '0'; |
138 | return 1; |
139 | } |
140 | return 0; |
141 | } |
142 | if (strchr(ESCAPES, g->yychar)) |
143 | return 1; |
144 | if (isunicodeletter(g->yychar) || g->yychar == '_') /* check identity escape */ |
145 | die(g, "invalid escape character" ); |
146 | return 0; |
147 | } |
148 | return 0; |
149 | } |
150 | |
151 | static int lexcount(struct cstate *g) |
152 | { |
153 | g->yychar = *g->source++; |
154 | |
155 | g->yymin = dec(g, g->yychar); |
156 | g->yychar = *g->source++; |
157 | while (g->yychar != ',' && g->yychar != '}') { |
158 | g->yymin = g->yymin * 10 + dec(g, g->yychar); |
159 | g->yychar = *g->source++; |
160 | if (g->yymin >= REPINF) |
161 | die(g, "numeric overflow" ); |
162 | } |
163 | |
164 | if (g->yychar == ',') { |
165 | g->yychar = *g->source++; |
166 | if (g->yychar == '}') { |
167 | g->yymax = REPINF; |
168 | } else { |
169 | g->yymax = dec(g, g->yychar); |
170 | g->yychar = *g->source++; |
171 | while (g->yychar != '}') { |
172 | g->yymax = g->yymax * 10 + dec(g, g->yychar); |
173 | g->yychar = *g->source++; |
174 | if (g->yymax >= REPINF) |
175 | die(g, "numeric overflow" ); |
176 | } |
177 | } |
178 | } else { |
179 | g->yymax = g->yymin; |
180 | } |
181 | |
182 | return L_COUNT; |
183 | } |
184 | |
185 | static void newcclass(struct cstate *g) |
186 | { |
187 | if (g->ncclass >= nelem(g->prog->cclass)) |
188 | die(g, "too many character classes" ); |
189 | g->yycc = g->prog->cclass + g->ncclass++; |
190 | g->yycc->end = g->yycc->spans; |
191 | } |
192 | |
193 | static void addrange(struct cstate *g, Rune a, Rune b) |
194 | { |
195 | if (a > b) |
196 | die(g, "invalid character class range" ); |
197 | if (g->yycc->end + 2 == g->yycc->spans + nelem(g->yycc->spans)) |
198 | die(g, "too many character class ranges" ); |
199 | *g->yycc->end++ = a; |
200 | *g->yycc->end++ = b; |
201 | } |
202 | |
203 | static void addranges_d(struct cstate *g) |
204 | { |
205 | addrange(g, '0', '9'); |
206 | } |
207 | |
208 | static void addranges_D(struct cstate *g) |
209 | { |
210 | addrange(g, 0, '0'-1); |
211 | addrange(g, '9'+1, 0xFFFF); |
212 | } |
213 | |
214 | static void addranges_s(struct cstate *g) |
215 | { |
216 | addrange(g, 0x9, 0xD); |
217 | addrange(g, 0x20, 0x20); |
218 | addrange(g, 0xA0, 0xA0); |
219 | addrange(g, 0x2028, 0x2029); |
220 | addrange(g, 0xFEFF, 0xFEFF); |
221 | } |
222 | |
223 | static void addranges_S(struct cstate *g) |
224 | { |
225 | addrange(g, 0, 0x9-1); |
226 | addrange(g, 0xD+1, 0x20-1); |
227 | addrange(g, 0x20+1, 0xA0-1); |
228 | addrange(g, 0xA0+1, 0x2028-1); |
229 | addrange(g, 0x2029+1, 0xFEFF-1); |
230 | addrange(g, 0xFEFF+1, 0xFFFF); |
231 | } |
232 | |
233 | static void addranges_w(struct cstate *g) |
234 | { |
235 | addrange(g, '0', '9'); |
236 | addrange(g, 'A', 'Z'); |
237 | addrange(g, '_', '_'); |
238 | addrange(g, 'a', 'z'); |
239 | } |
240 | |
241 | static void addranges_W(struct cstate *g) |
242 | { |
243 | addrange(g, 0, '0'-1); |
244 | addrange(g, '9'+1, 'A'-1); |
245 | addrange(g, 'Z'+1, '_'-1); |
246 | addrange(g, '_'+1, 'a'-1); |
247 | addrange(g, 'z'+1, 0xFFFF); |
248 | } |
249 | |
250 | static int lexclass(struct cstate *g) |
251 | { |
252 | int type = L_CCLASS; |
253 | int quoted, havesave, havedash; |
254 | Rune save = 0; |
255 | |
256 | newcclass(g); |
257 | |
258 | quoted = nextrune(g); |
259 | if (!quoted && g->yychar == '^') { |
260 | type = L_NCCLASS; |
261 | quoted = nextrune(g); |
262 | } |
263 | |
264 | havesave = havedash = 0; |
265 | for (;;) { |
266 | if (g->yychar == 0) |
267 | die(g, "unterminated character class" ); |
268 | if (!quoted && g->yychar == ']') |
269 | break; |
270 | |
271 | if (!quoted && g->yychar == '-') { |
272 | if (havesave) { |
273 | if (havedash) { |
274 | addrange(g, save, '-'); |
275 | havesave = havedash = 0; |
276 | } else { |
277 | havedash = 1; |
278 | } |
279 | } else { |
280 | save = '-'; |
281 | havesave = 1; |
282 | } |
283 | } else if (quoted && strchr("DSWdsw" , g->yychar)) { |
284 | if (havesave) { |
285 | addrange(g, save, save); |
286 | if (havedash) |
287 | addrange(g, '-', '-'); |
288 | } |
289 | switch (g->yychar) { |
290 | case 'd': addranges_d(g); break; |
291 | case 's': addranges_s(g); break; |
292 | case 'w': addranges_w(g); break; |
293 | case 'D': addranges_D(g); break; |
294 | case 'S': addranges_S(g); break; |
295 | case 'W': addranges_W(g); break; |
296 | } |
297 | havesave = havedash = 0; |
298 | } else { |
299 | if (quoted) { |
300 | if (g->yychar == 'b') |
301 | g->yychar = '\b'; |
302 | else if (g->yychar == '0') |
303 | g->yychar = 0; |
304 | /* else identity escape */ |
305 | } |
306 | if (havesave) { |
307 | if (havedash) { |
308 | addrange(g, save, g->yychar); |
309 | havesave = havedash = 0; |
310 | } else { |
311 | addrange(g, save, save); |
312 | save = g->yychar; |
313 | } |
314 | } else { |
315 | save = g->yychar; |
316 | havesave = 1; |
317 | } |
318 | } |
319 | |
320 | quoted = nextrune(g); |
321 | } |
322 | |
323 | if (havesave) { |
324 | addrange(g, save, save); |
325 | if (havedash) |
326 | addrange(g, '-', '-'); |
327 | } |
328 | |
329 | return type; |
330 | } |
331 | |
332 | static int lex(struct cstate *g) |
333 | { |
334 | int quoted = nextrune(g); |
335 | if (quoted) { |
336 | switch (g->yychar) { |
337 | case 'b': return L_WORD; |
338 | case 'B': return L_NWORD; |
339 | case 'd': newcclass(g); addranges_d(g); return L_CCLASS; |
340 | case 's': newcclass(g); addranges_s(g); return L_CCLASS; |
341 | case 'w': newcclass(g); addranges_w(g); return L_CCLASS; |
342 | case 'D': newcclass(g); addranges_d(g); return L_NCCLASS; |
343 | case 'S': newcclass(g); addranges_s(g); return L_NCCLASS; |
344 | case 'W': newcclass(g); addranges_w(g); return L_NCCLASS; |
345 | case '0': g->yychar = 0; return L_CHAR; |
346 | } |
347 | if (g->yychar >= '0' && g->yychar <= '9') { |
348 | g->yychar -= '0'; |
349 | if (*g->source >= '0' && *g->source <= '9') |
350 | g->yychar = g->yychar * 10 + *g->source++ - '0'; |
351 | return L_REF; |
352 | } |
353 | return L_CHAR; |
354 | } |
355 | |
356 | switch (g->yychar) { |
357 | case 0: |
358 | case '$': case ')': case '*': case '+': |
359 | case '.': case '?': case '^': case '|': |
360 | return g->yychar; |
361 | } |
362 | |
363 | if (g->yychar == '{') |
364 | return lexcount(g); |
365 | if (g->yychar == '[') |
366 | return lexclass(g); |
367 | if (g->yychar == '(') { |
368 | if (g->source[0] == '?') { |
369 | if (g->source[1] == ':') { |
370 | g->source += 2; |
371 | return L_NC; |
372 | } |
373 | if (g->source[1] == '=') { |
374 | g->source += 2; |
375 | return L_PLA; |
376 | } |
377 | if (g->source[1] == '!') { |
378 | g->source += 2; |
379 | return L_NLA; |
380 | } |
381 | } |
382 | return '('; |
383 | } |
384 | |
385 | return L_CHAR; |
386 | } |
387 | |
388 | /* Parse */ |
389 | |
390 | enum { |
391 | P_CAT, P_ALT, P_REP, |
392 | P_BOL, P_EOL, P_WORD, P_NWORD, |
393 | P_PAR, P_PLA, P_NLA, |
394 | P_ANY, P_CHAR, P_CCLASS, P_NCCLASS, |
395 | P_REF, |
396 | }; |
397 | |
398 | struct Renode { |
399 | unsigned char type; |
400 | unsigned char ng, m, n; |
401 | Rune c; |
402 | Reclass *cc; |
403 | Renode *x; |
404 | Renode *y; |
405 | }; |
406 | |
407 | static Renode *newnode(struct cstate *g, int type) |
408 | { |
409 | Renode *node = g->pend++; |
410 | node->type = type; |
411 | node->cc = NULL; |
412 | node->c = 0; |
413 | node->ng = 0; |
414 | node->m = 0; |
415 | node->n = 0; |
416 | node->x = node->y = NULL; |
417 | return node; |
418 | } |
419 | |
420 | static int empty(Renode *node) |
421 | { |
422 | if (!node) return 1; |
423 | switch (node->type) { |
424 | default: return 1; |
425 | case P_CAT: return empty(node->x) && empty(node->y); |
426 | case P_ALT: return empty(node->x) || empty(node->y); |
427 | case P_REP: return empty(node->x) || node->m == 0; |
428 | case P_PAR: return empty(node->x); |
429 | case P_REF: return empty(node->x); |
430 | case P_ANY: case P_CHAR: case P_CCLASS: case P_NCCLASS: return 0; |
431 | } |
432 | } |
433 | |
434 | static Renode *newrep(struct cstate *g, Renode *atom, int ng, int min, int max) |
435 | { |
436 | Renode *rep = newnode(g, P_REP); |
437 | if (max == REPINF && empty(atom)) |
438 | die(g, "infinite loop matching the empty string" ); |
439 | rep->ng = ng; |
440 | rep->m = min; |
441 | rep->n = max; |
442 | rep->x = atom; |
443 | return rep; |
444 | } |
445 | |
446 | static void next(struct cstate *g) |
447 | { |
448 | g->lookahead = lex(g); |
449 | } |
450 | |
451 | static int accept(struct cstate *g, int t) |
452 | { |
453 | if (g->lookahead == t) { |
454 | next(g); |
455 | return 1; |
456 | } |
457 | return 0; |
458 | } |
459 | |
460 | static Renode *parsealt(struct cstate *g); |
461 | |
462 | static Renode *parseatom(struct cstate *g) |
463 | { |
464 | Renode *atom; |
465 | if (g->lookahead == L_CHAR) { |
466 | atom = newnode(g, P_CHAR); |
467 | atom->c = g->yychar; |
468 | next(g); |
469 | return atom; |
470 | } |
471 | if (g->lookahead == L_CCLASS) { |
472 | atom = newnode(g, P_CCLASS); |
473 | atom->cc = g->yycc; |
474 | next(g); |
475 | return atom; |
476 | } |
477 | if (g->lookahead == L_NCCLASS) { |
478 | atom = newnode(g, P_NCCLASS); |
479 | atom->cc = g->yycc; |
480 | next(g); |
481 | return atom; |
482 | } |
483 | if (g->lookahead == L_REF) { |
484 | atom = newnode(g, P_REF); |
485 | if (g->yychar == 0 || g->yychar >= g->nsub || !g->sub[g->yychar]) |
486 | die(g, "invalid back-reference" ); |
487 | atom->n = g->yychar; |
488 | atom->x = g->sub[g->yychar]; |
489 | next(g); |
490 | return atom; |
491 | } |
492 | if (accept(g, '.')) |
493 | return newnode(g, P_ANY); |
494 | if (accept(g, '(')) { |
495 | atom = newnode(g, P_PAR); |
496 | if (g->nsub == MAXSUB) |
497 | die(g, "too many captures" ); |
498 | atom->n = g->nsub++; |
499 | atom->x = parsealt(g); |
500 | g->sub[atom->n] = atom; |
501 | if (!accept(g, ')')) |
502 | die(g, "unmatched '('" ); |
503 | return atom; |
504 | } |
505 | if (accept(g, L_NC)) { |
506 | atom = parsealt(g); |
507 | if (!accept(g, ')')) |
508 | die(g, "unmatched '('" ); |
509 | return atom; |
510 | } |
511 | if (accept(g, L_PLA)) { |
512 | atom = newnode(g, P_PLA); |
513 | atom->x = parsealt(g); |
514 | if (!accept(g, ')')) |
515 | die(g, "unmatched '('" ); |
516 | return atom; |
517 | } |
518 | if (accept(g, L_NLA)) { |
519 | atom = newnode(g, P_NLA); |
520 | atom->x = parsealt(g); |
521 | if (!accept(g, ')')) |
522 | die(g, "unmatched '('" ); |
523 | return atom; |
524 | } |
525 | die(g, "syntax error" ); |
526 | return NULL; |
527 | } |
528 | |
529 | static Renode *parserep(struct cstate *g) |
530 | { |
531 | Renode *atom; |
532 | |
533 | if (accept(g, '^')) return newnode(g, P_BOL); |
534 | if (accept(g, '$')) return newnode(g, P_EOL); |
535 | if (accept(g, L_WORD)) return newnode(g, P_WORD); |
536 | if (accept(g, L_NWORD)) return newnode(g, P_NWORD); |
537 | |
538 | atom = parseatom(g); |
539 | if (g->lookahead == L_COUNT) { |
540 | int min = g->yymin, max = g->yymax; |
541 | next(g); |
542 | if (max < min) |
543 | die(g, "invalid quantifier" ); |
544 | return newrep(g, atom, accept(g, '?'), min, max); |
545 | } |
546 | if (accept(g, '*')) return newrep(g, atom, accept(g, '?'), 0, REPINF); |
547 | if (accept(g, '+')) return newrep(g, atom, accept(g, '?'), 1, REPINF); |
548 | if (accept(g, '?')) return newrep(g, atom, accept(g, '?'), 0, 1); |
549 | return atom; |
550 | } |
551 | |
552 | static Renode *parsecat(struct cstate *g) |
553 | { |
554 | Renode *cat, *head, **tail; |
555 | if (g->lookahead && g->lookahead != '|' && g->lookahead != ')') { |
556 | /* Build a right-leaning tree by splicing in new 'cat' at the tail. */ |
557 | head = parserep(g); |
558 | tail = &head; |
559 | while (g->lookahead && g->lookahead != '|' && g->lookahead != ')') { |
560 | cat = newnode(g, P_CAT); |
561 | cat->x = *tail; |
562 | cat->y = parserep(g); |
563 | *tail = cat; |
564 | tail = &cat->y; |
565 | } |
566 | return head; |
567 | } |
568 | return NULL; |
569 | } |
570 | |
571 | static Renode *parsealt(struct cstate *g) |
572 | { |
573 | Renode *alt, *x; |
574 | alt = parsecat(g); |
575 | while (accept(g, '|')) { |
576 | x = alt; |
577 | alt = newnode(g, P_ALT); |
578 | alt->x = x; |
579 | alt->y = parsecat(g); |
580 | } |
581 | return alt; |
582 | } |
583 | |
584 | /* Compile */ |
585 | |
586 | enum { |
587 | I_END, I_JUMP, I_SPLIT, I_PLA, I_NLA, |
588 | I_ANYNL, I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF, |
589 | I_BOL, I_EOL, I_WORD, I_NWORD, |
590 | I_LPAR, I_RPAR |
591 | }; |
592 | |
593 | struct Reinst { |
594 | unsigned char opcode; |
595 | unsigned char n; |
596 | Rune c; |
597 | Reclass *cc; |
598 | Reinst *x; |
599 | Reinst *y; |
600 | }; |
601 | |
602 | static int count(struct cstate *g, Renode *node) |
603 | { |
604 | int min, max, n; |
605 | if (!node) return 0; |
606 | switch (node->type) { |
607 | default: return 1; |
608 | case P_CAT: return count(g, node->x) + count(g, node->y); |
609 | case P_ALT: return count(g, node->x) + count(g, node->y) + 2; |
610 | case P_REP: |
611 | min = node->m; |
612 | max = node->n; |
613 | if (min == max) n = count(g, node->x) * min; |
614 | else if (max < REPINF) n = count(g, node->x) * max + (max - min); |
615 | else n = count(g, node->x) * (min + 1) + 2; |
616 | if (n < 0 || n > MAXPROG) die(g, "program too large" ); |
617 | return n; |
618 | case P_PAR: return count(g, node->x) + 2; |
619 | case P_PLA: return count(g, node->x) + 2; |
620 | case P_NLA: return count(g, node->x) + 2; |
621 | } |
622 | } |
623 | |
624 | static Reinst *emit(Reprog *prog, int opcode) |
625 | { |
626 | Reinst *inst = prog->end++; |
627 | inst->opcode = opcode; |
628 | inst->n = 0; |
629 | inst->c = 0; |
630 | inst->cc = NULL; |
631 | inst->x = inst->y = NULL; |
632 | return inst; |
633 | } |
634 | |
635 | static void compile(Reprog *prog, Renode *node) |
636 | { |
637 | Reinst *inst, *split, *jump; |
638 | int i; |
639 | |
640 | if (!node) |
641 | return; |
642 | |
643 | loop: |
644 | switch (node->type) { |
645 | case P_CAT: |
646 | compile(prog, node->x); |
647 | node = node->y; |
648 | goto loop; |
649 | |
650 | case P_ALT: |
651 | split = emit(prog, I_SPLIT); |
652 | compile(prog, node->x); |
653 | jump = emit(prog, I_JUMP); |
654 | compile(prog, node->y); |
655 | split->x = split + 1; |
656 | split->y = jump + 1; |
657 | jump->x = prog->end; |
658 | break; |
659 | |
660 | case P_REP: |
661 | inst = NULL; /* silence compiler warning. assert(node->m > 0). */ |
662 | for (i = 0; i < node->m; ++i) { |
663 | inst = prog->end; |
664 | compile(prog, node->x); |
665 | } |
666 | if (node->m == node->n) |
667 | break; |
668 | if (node->n < REPINF) { |
669 | for (i = node->m; i < node->n; ++i) { |
670 | split = emit(prog, I_SPLIT); |
671 | compile(prog, node->x); |
672 | if (node->ng) { |
673 | split->y = split + 1; |
674 | split->x = prog->end; |
675 | } else { |
676 | split->x = split + 1; |
677 | split->y = prog->end; |
678 | } |
679 | } |
680 | } else if (node->m == 0) { |
681 | split = emit(prog, I_SPLIT); |
682 | compile(prog, node->x); |
683 | jump = emit(prog, I_JUMP); |
684 | if (node->ng) { |
685 | split->y = split + 1; |
686 | split->x = prog->end; |
687 | } else { |
688 | split->x = split + 1; |
689 | split->y = prog->end; |
690 | } |
691 | jump->x = split; |
692 | } else { |
693 | split = emit(prog, I_SPLIT); |
694 | if (node->ng) { |
695 | split->y = inst; |
696 | split->x = prog->end; |
697 | } else { |
698 | split->x = inst; |
699 | split->y = prog->end; |
700 | } |
701 | } |
702 | break; |
703 | |
704 | case P_BOL: emit(prog, I_BOL); break; |
705 | case P_EOL: emit(prog, I_EOL); break; |
706 | case P_WORD: emit(prog, I_WORD); break; |
707 | case P_NWORD: emit(prog, I_NWORD); break; |
708 | |
709 | case P_PAR: |
710 | inst = emit(prog, I_LPAR); |
711 | inst->n = node->n; |
712 | compile(prog, node->x); |
713 | inst = emit(prog, I_RPAR); |
714 | inst->n = node->n; |
715 | break; |
716 | case P_PLA: |
717 | split = emit(prog, I_PLA); |
718 | compile(prog, node->x); |
719 | emit(prog, I_END); |
720 | split->x = split + 1; |
721 | split->y = prog->end; |
722 | break; |
723 | case P_NLA: |
724 | split = emit(prog, I_NLA); |
725 | compile(prog, node->x); |
726 | emit(prog, I_END); |
727 | split->x = split + 1; |
728 | split->y = prog->end; |
729 | break; |
730 | |
731 | case P_ANY: |
732 | emit(prog, I_ANY); |
733 | break; |
734 | case P_CHAR: |
735 | inst = emit(prog, I_CHAR); |
736 | inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c; |
737 | break; |
738 | case P_CCLASS: |
739 | inst = emit(prog, I_CCLASS); |
740 | inst->cc = node->cc; |
741 | break; |
742 | case P_NCCLASS: |
743 | inst = emit(prog, I_NCCLASS); |
744 | inst->cc = node->cc; |
745 | break; |
746 | case P_REF: |
747 | inst = emit(prog, I_REF); |
748 | inst->n = node->n; |
749 | break; |
750 | } |
751 | } |
752 | |
753 | #ifdef TEST |
754 | static void dumpnode(Renode *node) |
755 | { |
756 | Rune *p; |
757 | if (!node) { printf("Empty" ); return; } |
758 | switch (node->type) { |
759 | case P_CAT: printf("Cat(" ); dumpnode(node->x); printf(", " ); dumpnode(node->y); printf(")" ); break; |
760 | case P_ALT: printf("Alt(" ); dumpnode(node->x); printf(", " ); dumpnode(node->y); printf(")" ); break; |
761 | case P_REP: |
762 | printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d," , node->m, node->n); |
763 | dumpnode(node->x); |
764 | printf(")" ); |
765 | break; |
766 | case P_BOL: printf("Bol" ); break; |
767 | case P_EOL: printf("Eol" ); break; |
768 | case P_WORD: printf("Word" ); break; |
769 | case P_NWORD: printf("NotWord" ); break; |
770 | case P_PAR: printf("Par(%d," , node->n); dumpnode(node->x); printf(")" ); break; |
771 | case P_PLA: printf("PLA(" ); dumpnode(node->x); printf(")" ); break; |
772 | case P_NLA: printf("NLA(" ); dumpnode(node->x); printf(")" ); break; |
773 | case P_ANY: printf("Any" ); break; |
774 | case P_CHAR: printf("Char(%c)" , node->c); break; |
775 | case P_CCLASS: |
776 | printf("Class(" ); |
777 | for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X," , p[0], p[1]); |
778 | printf(")" ); |
779 | break; |
780 | case P_NCCLASS: |
781 | printf("NotClass(" ); |
782 | for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X," , p[0], p[1]); |
783 | printf(")" ); |
784 | break; |
785 | case P_REF: printf("Ref(%d)" , node->n); break; |
786 | } |
787 | } |
788 | |
789 | static void dumpprog(Reprog *prog) |
790 | { |
791 | Reinst *inst; |
792 | int i; |
793 | for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) { |
794 | printf("% 5d: " , i); |
795 | switch (inst->opcode) { |
796 | case I_END: puts("end" ); break; |
797 | case I_JUMP: printf("jump %d\n" , (int)(inst->x - prog->start)); break; |
798 | case I_SPLIT: printf("split %d %d\n" , (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break; |
799 | case I_PLA: printf("pla %d %d\n" , (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break; |
800 | case I_NLA: printf("nla %d %d\n" , (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break; |
801 | case I_ANY: puts("any" ); break; |
802 | case I_ANYNL: puts("anynl" ); break; |
803 | case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n" , inst->c); break; |
804 | case I_CCLASS: puts("cclass" ); break; |
805 | case I_NCCLASS: puts("ncclass" ); break; |
806 | case I_REF: printf("ref %d\n" , inst->n); break; |
807 | case I_BOL: puts("bol" ); break; |
808 | case I_EOL: puts("eol" ); break; |
809 | case I_WORD: puts("word" ); break; |
810 | case I_NWORD: puts("nword" ); break; |
811 | case I_LPAR: printf("lpar %d\n" , inst->n); break; |
812 | case I_RPAR: printf("rpar %d\n" , inst->n); break; |
813 | } |
814 | } |
815 | } |
816 | #endif |
817 | |
818 | Reprog *regcompx(void *(*alloc)(void *ctx, void *p, int n), void *ctx, |
819 | const char *pattern, int cflags, const char **errorp) |
820 | { |
821 | struct cstate g; |
822 | Renode *node; |
823 | Reinst *split, *jump; |
824 | int i, n; |
825 | |
826 | g.pstart = NULL; |
827 | g.prog = NULL; |
828 | |
829 | if (setjmp(g.kaboom)) { |
830 | if (errorp) *errorp = g.error; |
831 | alloc(ctx, g.pstart, 0); |
832 | alloc(ctx, g.prog, 0); |
833 | return NULL; |
834 | } |
835 | |
836 | g.prog = alloc(ctx, NULL, sizeof (Reprog)); |
837 | if (!g.prog) |
838 | die(&g, "cannot allocate regular expression" ); |
839 | n = strlen(pattern) * 2; |
840 | if (n > MAXPROG) |
841 | die(&g, "program too large" ); |
842 | if (n > 0) { |
843 | g.pstart = g.pend = alloc(ctx, NULL, sizeof (Renode) * n); |
844 | if (!g.pstart) |
845 | die(&g, "cannot allocate regular expression parse list" ); |
846 | } |
847 | |
848 | g.source = pattern; |
849 | g.ncclass = 0; |
850 | g.nsub = 1; |
851 | for (i = 0; i < MAXSUB; ++i) |
852 | g.sub[i] = 0; |
853 | |
854 | g.prog->flags = cflags; |
855 | |
856 | next(&g); |
857 | node = parsealt(&g); |
858 | if (g.lookahead == ')') |
859 | die(&g, "unmatched ')'" ); |
860 | if (g.lookahead != 0) |
861 | die(&g, "syntax error" ); |
862 | |
863 | #ifdef TEST |
864 | dumpnode(node); |
865 | putchar('\n'); |
866 | #endif |
867 | |
868 | n = 6 + count(&g, node); |
869 | if (n < 0 || n > MAXPROG) |
870 | die(&g, "program too large" ); |
871 | |
872 | g.prog->nsub = g.nsub; |
873 | g.prog->start = g.prog->end = alloc(ctx, NULL, n * sizeof (Reinst)); |
874 | if (!g.prog->start) |
875 | die(&g, "cannot allocate regular expression instruction list" ); |
876 | |
877 | split = emit(g.prog, I_SPLIT); |
878 | split->x = split + 3; |
879 | split->y = split + 1; |
880 | emit(g.prog, I_ANYNL); |
881 | jump = emit(g.prog, I_JUMP); |
882 | jump->x = split; |
883 | emit(g.prog, I_LPAR); |
884 | compile(g.prog, node); |
885 | emit(g.prog, I_RPAR); |
886 | emit(g.prog, I_END); |
887 | |
888 | #ifdef TEST |
889 | dumpprog(g.prog); |
890 | #endif |
891 | |
892 | alloc(ctx, g.pstart, 0); |
893 | |
894 | if (errorp) *errorp = NULL; |
895 | return g.prog; |
896 | } |
897 | |
898 | void regfreex(void *(*alloc)(void *ctx, void *p, int n), void *ctx, Reprog *prog) |
899 | { |
900 | if (prog) { |
901 | alloc(ctx, prog->start, 0); |
902 | alloc(ctx, prog, 0); |
903 | } |
904 | } |
905 | |
906 | static void *default_alloc(void *ctx, void *p, int n) |
907 | { |
908 | return realloc(p, (size_t)n); |
909 | } |
910 | |
911 | Reprog *regcomp(const char *pattern, int cflags, const char **errorp) |
912 | { |
913 | return regcompx(default_alloc, NULL, pattern, cflags, errorp); |
914 | } |
915 | |
916 | void regfree(Reprog *prog) |
917 | { |
918 | regfreex(default_alloc, NULL, prog); |
919 | } |
920 | |
921 | /* Match */ |
922 | |
923 | static int isnewline(int c) |
924 | { |
925 | return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029; |
926 | } |
927 | |
928 | static int iswordchar(int c) |
929 | { |
930 | return c == '_' || |
931 | (c >= 'a' && c <= 'z') || |
932 | (c >= 'A' && c <= 'Z') || |
933 | (c >= '0' && c <= '9'); |
934 | } |
935 | |
936 | static int incclass(Reclass *cc, Rune c) |
937 | { |
938 | Rune *p; |
939 | for (p = cc->spans; p < cc->end; p += 2) |
940 | if (p[0] <= c && c <= p[1]) |
941 | return 1; |
942 | return 0; |
943 | } |
944 | |
945 | static int incclasscanon(Reclass *cc, Rune c) |
946 | { |
947 | Rune *p, r; |
948 | for (p = cc->spans; p < cc->end; p += 2) |
949 | for (r = p[0]; r <= p[1]; ++r) |
950 | if (c == canon(r)) |
951 | return 1; |
952 | return 0; |
953 | } |
954 | |
955 | static int strncmpcanon(const char *a, const char *b, int n) |
956 | { |
957 | Rune ra, rb; |
958 | int c; |
959 | while (n--) { |
960 | if (!*a) return -1; |
961 | if (!*b) return 1; |
962 | a += chartorune(&ra, a); |
963 | b += chartorune(&rb, b); |
964 | c = canon(ra) - canon(rb); |
965 | if (c) |
966 | return c; |
967 | } |
968 | return 0; |
969 | } |
970 | |
971 | static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out, int depth) |
972 | { |
973 | Resub scratch; |
974 | int result; |
975 | int i; |
976 | Rune c; |
977 | |
978 | /* stack overflow */ |
979 | if (depth > MAXREC) |
980 | return -1; |
981 | |
982 | for (;;) { |
983 | switch (pc->opcode) { |
984 | case I_END: |
985 | return 0; |
986 | case I_JUMP: |
987 | pc = pc->x; |
988 | break; |
989 | case I_SPLIT: |
990 | scratch = *out; |
991 | result = match(pc->x, sp, bol, flags, &scratch, depth+1); |
992 | if (result == -1) |
993 | return -1; |
994 | if (result == 0) { |
995 | *out = scratch; |
996 | return 0; |
997 | } |
998 | pc = pc->y; |
999 | break; |
1000 | |
1001 | case I_PLA: |
1002 | result = match(pc->x, sp, bol, flags, out, depth+1); |
1003 | if (result == -1) |
1004 | return -1; |
1005 | if (result == 1) |
1006 | return 1; |
1007 | pc = pc->y; |
1008 | break; |
1009 | case I_NLA: |
1010 | scratch = *out; |
1011 | result = match(pc->x, sp, bol, flags, &scratch, depth+1); |
1012 | if (result == -1) |
1013 | return -1; |
1014 | if (result == 0) |
1015 | return 1; |
1016 | pc = pc->y; |
1017 | break; |
1018 | |
1019 | case I_ANYNL: |
1020 | sp += chartorune(&c, sp); |
1021 | if (c == 0) |
1022 | return 1; |
1023 | pc = pc + 1; |
1024 | break; |
1025 | case I_ANY: |
1026 | sp += chartorune(&c, sp); |
1027 | if (c == 0) |
1028 | return 1; |
1029 | if (isnewline(c)) |
1030 | return 1; |
1031 | pc = pc + 1; |
1032 | break; |
1033 | case I_CHAR: |
1034 | sp += chartorune(&c, sp); |
1035 | if (c == 0) |
1036 | return 1; |
1037 | if (flags & REG_ICASE) |
1038 | c = canon(c); |
1039 | if (c != pc->c) |
1040 | return 1; |
1041 | pc = pc + 1; |
1042 | break; |
1043 | case I_CCLASS: |
1044 | sp += chartorune(&c, sp); |
1045 | if (c == 0) |
1046 | return 1; |
1047 | if (flags & REG_ICASE) { |
1048 | if (!incclasscanon(pc->cc, canon(c))) |
1049 | return 1; |
1050 | } else { |
1051 | if (!incclass(pc->cc, c)) |
1052 | return 1; |
1053 | } |
1054 | pc = pc + 1; |
1055 | break; |
1056 | case I_NCCLASS: |
1057 | sp += chartorune(&c, sp); |
1058 | if (c == 0) |
1059 | return 1; |
1060 | if (flags & REG_ICASE) { |
1061 | if (incclasscanon(pc->cc, canon(c))) |
1062 | return 1; |
1063 | } else { |
1064 | if (incclass(pc->cc, c)) |
1065 | return 1; |
1066 | } |
1067 | pc = pc + 1; |
1068 | break; |
1069 | case I_REF: |
1070 | i = out->sub[pc->n].ep - out->sub[pc->n].sp; |
1071 | if (flags & REG_ICASE) { |
1072 | if (strncmpcanon(sp, out->sub[pc->n].sp, i)) |
1073 | return 1; |
1074 | } else { |
1075 | if (strncmp(sp, out->sub[pc->n].sp, i)) |
1076 | return 1; |
1077 | } |
1078 | if (i > 0) |
1079 | sp += i; |
1080 | pc = pc + 1; |
1081 | break; |
1082 | |
1083 | case I_BOL: |
1084 | if (sp == bol && !(flags & REG_NOTBOL)) { |
1085 | pc = pc + 1; |
1086 | break; |
1087 | } |
1088 | if (flags & REG_NEWLINE) { |
1089 | if (sp > bol && isnewline(sp[-1])) { |
1090 | pc = pc + 1; |
1091 | break; |
1092 | } |
1093 | } |
1094 | return 1; |
1095 | case I_EOL: |
1096 | if (*sp == 0) { |
1097 | pc = pc + 1; |
1098 | break; |
1099 | } |
1100 | if (flags & REG_NEWLINE) { |
1101 | if (isnewline(*sp)) { |
1102 | pc = pc + 1; |
1103 | break; |
1104 | } |
1105 | } |
1106 | return 1; |
1107 | case I_WORD: |
1108 | i = sp > bol && iswordchar(sp[-1]); |
1109 | i ^= iswordchar(sp[0]); |
1110 | if (!i) |
1111 | return 1; |
1112 | pc = pc + 1; |
1113 | break; |
1114 | case I_NWORD: |
1115 | i = sp > bol && iswordchar(sp[-1]); |
1116 | i ^= iswordchar(sp[0]); |
1117 | if (i) |
1118 | return 1; |
1119 | pc = pc + 1; |
1120 | break; |
1121 | |
1122 | case I_LPAR: |
1123 | out->sub[pc->n].sp = sp; |
1124 | pc = pc + 1; |
1125 | break; |
1126 | case I_RPAR: |
1127 | out->sub[pc->n].ep = sp; |
1128 | pc = pc + 1; |
1129 | break; |
1130 | default: |
1131 | return 1; |
1132 | } |
1133 | } |
1134 | } |
1135 | |
1136 | int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags) |
1137 | { |
1138 | Resub scratch; |
1139 | int i; |
1140 | |
1141 | if (!sub) |
1142 | sub = &scratch; |
1143 | |
1144 | sub->nsub = prog->nsub; |
1145 | for (i = 0; i < MAXSUB; ++i) |
1146 | sub->sub[i].sp = sub->sub[i].ep = NULL; |
1147 | |
1148 | return match(prog->start, sp, sp, prog->flags | eflags, sub, 0); |
1149 | } |
1150 | |
1151 | #ifdef TEST |
1152 | int main(int argc, char **argv) |
1153 | { |
1154 | const char *error; |
1155 | const char *s; |
1156 | Reprog *p; |
1157 | Resub m; |
1158 | int i; |
1159 | |
1160 | if (argc > 1) { |
1161 | p = regcomp(argv[1], 0, &error); |
1162 | if (!p) { |
1163 | fprintf(stderr, "regcomp: %s\n" , error); |
1164 | return 1; |
1165 | } |
1166 | |
1167 | if (argc > 2) { |
1168 | s = argv[2]; |
1169 | printf("nsub = %d\n" , p->nsub); |
1170 | if (!regexec(p, s, &m, 0)) { |
1171 | for (i = 0; i < m.nsub; ++i) { |
1172 | int n = m.sub[i].ep - m.sub[i].sp; |
1173 | if (n > 0) |
1174 | printf("match %d: s=%d e=%d n=%d '%.*s'\n" , i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp); |
1175 | else |
1176 | printf("match %d: n=0 ''\n" , i); |
1177 | } |
1178 | } else { |
1179 | printf("no match\n" ); |
1180 | } |
1181 | } |
1182 | } |
1183 | |
1184 | return 0; |
1185 | } |
1186 | #endif |
1187 | |