1/*
2 * Regular Expression Engine
3 *
4 * Copyright (c) 2017-2018 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24#include <stdlib.h>
25#include <stdio.h>
26#include <stdarg.h>
27#include <inttypes.h>
28#include <string.h>
29#include <assert.h>
30
31#include "cutils.h"
32#include "libregexp.h"
33#include "libunicode.h"
34
35/*
36 TODO:
37
38 - Add a lock step execution mode (=linear time execution guaranteed)
39 when the regular expression is "simple" i.e. no backreference nor
40 complicated lookahead. The opcodes are designed for this execution
41 model.
42*/
43
44#if defined(TEST)
45#define DUMP_REOP
46#endif
47
48typedef enum {
49#define DEF(id, size) REOP_ ## id,
50#include "libregexp-opcode.h"
51#undef DEF
52 REOP_COUNT,
53} REOPCodeEnum;
54
55#define CAPTURE_COUNT_MAX 255
56#define STACK_SIZE_MAX 255
57/* must be large enough to have a negligible runtime cost and small
58 enough to call the interrupt callback often. */
59#define INTERRUPT_COUNTER_INIT 10000
60
61/* unicode code points */
62#define CP_LS 0x2028
63#define CP_PS 0x2029
64
65#define TMP_BUF_SIZE 128
66
67typedef struct {
68 DynBuf byte_code;
69 const uint8_t *buf_ptr;
70 const uint8_t *buf_end;
71 const uint8_t *buf_start;
72 int re_flags;
73 BOOL is_unicode;
74 BOOL ignore_case;
75 BOOL dotall;
76 int capture_count;
77 int total_capture_count; /* -1 = not computed yet */
78 int has_named_captures; /* -1 = don't know, 0 = no, 1 = yes */
79 void *opaque;
80 DynBuf group_names;
81 union {
82 char error_msg[TMP_BUF_SIZE];
83 char tmp_buf[TMP_BUF_SIZE];
84 } u;
85} REParseState;
86
87typedef struct {
88#ifdef DUMP_REOP
89 const char *name;
90#endif
91 uint8_t size;
92} REOpCode;
93
94static const REOpCode reopcode_info[REOP_COUNT] = {
95#ifdef DUMP_REOP
96#define DEF(id, size) { #id, size },
97#else
98#define DEF(id, size) { size },
99#endif
100#include "libregexp-opcode.h"
101#undef DEF
102};
103
104#define RE_HEADER_FLAGS 0
105#define RE_HEADER_CAPTURE_COUNT 1
106#define RE_HEADER_STACK_SIZE 2
107#define RE_HEADER_BYTECODE_LEN 3
108
109#define RE_HEADER_LEN 7
110
111static inline int is_digit(int c) {
112 return c >= '0' && c <= '9';
113}
114
115/* insert 'len' bytes at position 'pos'. Return < 0 if error. */
116static int dbuf_insert(DynBuf *s, int pos, int len)
117{
118 if (dbuf_realloc(s, s->size + len))
119 return -1;
120 memmove(s->buf + pos + len, s->buf + pos, s->size - pos);
121 s->size += len;
122 return 0;
123}
124
125static const uint16_t char_range_d[] = {
126 1,
127 0x0030, 0x0039 + 1,
128};
129
130/* code point ranges for Zs,Zl or Zp property */
131static const uint16_t char_range_s[] = {
132 10,
133 0x0009, 0x000D + 1,
134 0x0020, 0x0020 + 1,
135 0x00A0, 0x00A0 + 1,
136 0x1680, 0x1680 + 1,
137 0x2000, 0x200A + 1,
138 /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
139 /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
140 0x2028, 0x2029 + 1,
141 0x202F, 0x202F + 1,
142 0x205F, 0x205F + 1,
143 0x3000, 0x3000 + 1,
144 /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
145 0xFEFF, 0xFEFF + 1,
146};
147
148static const uint16_t char_range_w[] = {
149 4,
150 0x0030, 0x0039 + 1,
151 0x0041, 0x005A + 1,
152 0x005F, 0x005F + 1,
153 0x0061, 0x007A + 1,
154};
155
156#define CLASS_RANGE_BASE 0x40000000
157
158typedef enum {
159 CHAR_RANGE_d,
160 CHAR_RANGE_D,
161 CHAR_RANGE_s,
162 CHAR_RANGE_S,
163 CHAR_RANGE_w,
164 CHAR_RANGE_W,
165} CharRangeEnum;
166
167static const uint16_t * const char_range_table[] = {
168 char_range_d,
169 char_range_s,
170 char_range_w,
171};
172
173static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c)
174{
175 BOOL invert;
176 const uint16_t *c_pt;
177 int len, i;
178
179 invert = c & 1;
180 c_pt = char_range_table[c >> 1];
181 len = *c_pt++;
182 cr_init(cr, s->opaque, lre_realloc);
183 for(i = 0; i < len * 2; i++) {
184 if (cr_add_point(cr, c_pt[i]))
185 goto fail;
186 }
187 if (invert) {
188 if (cr_invert(cr))
189 goto fail;
190 }
191 return 0;
192 fail:
193 cr_free(cr);
194 return -1;
195}
196
197#ifdef DUMP_REOP
198static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
199 int buf_len)
200{
201 int pos, len, opcode, bc_len, re_flags, i;
202 uint32_t val;
203
204 assert(buf_len >= RE_HEADER_LEN);
205
206 re_flags = lre_get_flags(buf);
207 bc_len = get_u32(buf + RE_HEADER_BYTECODE_LEN);
208 assert(bc_len + RE_HEADER_LEN <= buf_len);
209 printf("flags: 0x%x capture_count=%d stack_size=%d\n",
210 re_flags, buf[RE_HEADER_CAPTURE_COUNT], buf[RE_HEADER_STACK_SIZE]);
211 if (re_flags & LRE_FLAG_NAMED_GROUPS) {
212 const char *p;
213 p = (char *)buf + RE_HEADER_LEN + bc_len;
214 printf("named groups: ");
215 for(i = 1; i < buf[RE_HEADER_CAPTURE_COUNT]; i++) {
216 if (i != 1)
217 printf(",");
218 printf("<%s>", p);
219 p += strlen(p) + 1;
220 }
221 printf("\n");
222 assert(p == (char *)(buf + buf_len));
223 }
224 printf("bytecode_len=%d\n", bc_len);
225
226 buf += RE_HEADER_LEN;
227 pos = 0;
228 while (pos < bc_len) {
229 printf("%5u: ", pos);
230 opcode = buf[pos];
231 len = reopcode_info[opcode].size;
232 if (opcode >= REOP_COUNT) {
233 printf(" invalid opcode=0x%02x\n", opcode);
234 break;
235 }
236 if ((pos + len) > bc_len) {
237 printf(" buffer overflow (opcode=0x%02x)\n", opcode);
238 break;
239 }
240 printf("%s", reopcode_info[opcode].name);
241 switch(opcode) {
242 case REOP_char:
243 val = get_u16(buf + pos + 1);
244 if (val >= ' ' && val <= 126)
245 printf(" '%c'", val);
246 else
247 printf(" 0x%04x", val);
248 break;
249 case REOP_char32:
250 val = get_u32(buf + pos + 1);
251 if (val >= ' ' && val <= 126)
252 printf(" '%c'", val);
253 else
254 printf(" 0x%08x", val);
255 break;
256 case REOP_goto:
257 case REOP_split_goto_first:
258 case REOP_split_next_first:
259 case REOP_loop:
260 case REOP_lookahead:
261 case REOP_negative_lookahead:
262 val = get_u32(buf + pos + 1);
263 val += (pos + 5);
264 printf(" %u", val);
265 break;
266 case REOP_simple_greedy_quant:
267 printf(" %u %u %u %u",
268 get_u32(buf + pos + 1) + (pos + 17),
269 get_u32(buf + pos + 1 + 4),
270 get_u32(buf + pos + 1 + 8),
271 get_u32(buf + pos + 1 + 12));
272 break;
273 case REOP_save_start:
274 case REOP_save_end:
275 case REOP_back_reference:
276 case REOP_backward_back_reference:
277 printf(" %u", buf[pos + 1]);
278 break;
279 case REOP_save_reset:
280 printf(" %u %u", buf[pos + 1], buf[pos + 2]);
281 break;
282 case REOP_push_i32:
283 val = get_u32(buf + pos + 1);
284 printf(" %d", val);
285 break;
286 case REOP_range:
287 {
288 int n, i;
289 n = get_u16(buf + pos + 1);
290 len += n * 4;
291 for(i = 0; i < n * 2; i++) {
292 val = get_u16(buf + pos + 3 + i * 2);
293 printf(" 0x%04x", val);
294 }
295 }
296 break;
297 case REOP_range32:
298 {
299 int n, i;
300 n = get_u16(buf + pos + 1);
301 len += n * 8;
302 for(i = 0; i < n * 2; i++) {
303 val = get_u32(buf + pos + 3 + i * 4);
304 printf(" 0x%08x", val);
305 }
306 }
307 break;
308 default:
309 break;
310 }
311 printf("\n");
312 pos += len;
313 }
314}
315#endif
316
317static void re_emit_op(REParseState *s, int op)
318{
319 dbuf_putc(&s->byte_code, op);
320}
321
322/* return the offset of the u32 value */
323static int re_emit_op_u32(REParseState *s, int op, uint32_t val)
324{
325 int pos;
326 dbuf_putc(&s->byte_code, op);
327 pos = s->byte_code.size;
328 dbuf_put_u32(&s->byte_code, val);
329 return pos;
330}
331
332static int re_emit_goto(REParseState *s, int op, uint32_t val)
333{
334 int pos;
335 dbuf_putc(&s->byte_code, op);
336 pos = s->byte_code.size;
337 dbuf_put_u32(&s->byte_code, val - (pos + 4));
338 return pos;
339}
340
341static void re_emit_op_u8(REParseState *s, int op, uint32_t val)
342{
343 dbuf_putc(&s->byte_code, op);
344 dbuf_putc(&s->byte_code, val);
345}
346
347static void re_emit_op_u16(REParseState *s, int op, uint32_t val)
348{
349 dbuf_putc(&s->byte_code, op);
350 dbuf_put_u16(&s->byte_code, val);
351}
352
353static int __attribute__((format(printf, 2, 3))) re_parse_error(REParseState *s, const char *fmt, ...)
354{
355 va_list ap;
356 va_start(ap, fmt);
357 vsnprintf(s->u.error_msg, sizeof(s->u.error_msg), fmt, ap);
358 va_end(ap);
359 return -1;
360}
361
362static int re_parse_out_of_memory(REParseState *s)
363{
364 return re_parse_error(s, "out of memory");
365}
366
367/* If allow_overflow is false, return -1 in case of
368 overflow. Otherwise return INT32_MAX. */
369static int parse_digits(const uint8_t **pp, BOOL allow_overflow)
370{
371 const uint8_t *p;
372 uint64_t v;
373 int c;
374
375 p = *pp;
376 v = 0;
377 for(;;) {
378 c = *p;
379 if (c < '0' || c > '9')
380 break;
381 v = v * 10 + c - '0';
382 if (v >= INT32_MAX) {
383 if (allow_overflow)
384 v = INT32_MAX;
385 else
386 return -1;
387 }
388 p++;
389 }
390 *pp = p;
391 return v;
392}
393
394static int re_parse_expect(REParseState *s, const uint8_t **pp, int c)
395{
396 const uint8_t *p;
397 p = *pp;
398 if (*p != c)
399 return re_parse_error(s, "expecting '%c'", c);
400 p++;
401 *pp = p;
402 return 0;
403}
404
405/* Parse an escape sequence, *pp points after the '\':
406 allow_utf16 value:
407 0 : no UTF-16 escapes allowed
408 1 : UTF-16 escapes allowed
409 2 : UTF-16 escapes allowed and escapes of surrogate pairs are
410 converted to a unicode character (unicode regexp case).
411
412 Return the unicode char and update *pp if recognized,
413 return -1 if malformed escape,
414 return -2 otherwise. */
415int lre_parse_escape(const uint8_t **pp, int allow_utf16)
416{
417 const uint8_t *p;
418 uint32_t c;
419
420 p = *pp;
421 c = *p++;
422 switch(c) {
423 case 'b':
424 c = '\b';
425 break;
426 case 'f':
427 c = '\f';
428 break;
429 case 'n':
430 c = '\n';
431 break;
432 case 'r':
433 c = '\r';
434 break;
435 case 't':
436 c = '\t';
437 break;
438 case 'v':
439 c = '\v';
440 break;
441 case 'x':
442 case 'u':
443 {
444 int h, n, i;
445 uint32_t c1;
446
447 if (*p == '{' && allow_utf16) {
448 p++;
449 c = 0;
450 for(;;) {
451 h = from_hex(*p++);
452 if (h < 0)
453 return -1;
454 c = (c << 4) | h;
455 if (c > 0x10FFFF)
456 return -1;
457 if (*p == '}')
458 break;
459 }
460 p++;
461 } else {
462 if (c == 'x') {
463 n = 2;
464 } else {
465 n = 4;
466 }
467
468 c = 0;
469 for(i = 0; i < n; i++) {
470 h = from_hex(*p++);
471 if (h < 0) {
472 return -1;
473 }
474 c = (c << 4) | h;
475 }
476 if (is_hi_surrogate(c) &&
477 allow_utf16 == 2 && p[0] == '\\' && p[1] == 'u') {
478 /* convert an escaped surrogate pair into a
479 unicode char */
480 c1 = 0;
481 for(i = 0; i < 4; i++) {
482 h = from_hex(p[2 + i]);
483 if (h < 0)
484 break;
485 c1 = (c1 << 4) | h;
486 }
487 if (i == 4 && is_lo_surrogate(c1)) {
488 p += 6;
489 c = from_surrogate(c, c1);
490 }
491 }
492 }
493 }
494 break;
495 case '0': case '1': case '2': case '3':
496 case '4': case '5': case '6': case '7':
497 c -= '0';
498 if (allow_utf16 == 2) {
499 /* only accept \0 not followed by digit */
500 if (c != 0 || is_digit(*p))
501 return -1;
502 } else {
503 /* parse a legacy octal sequence */
504 uint32_t v;
505 v = *p - '0';
506 if (v > 7)
507 break;
508 c = (c << 3) | v;
509 p++;
510 if (c >= 32)
511 break;
512 v = *p - '0';
513 if (v > 7)
514 break;
515 c = (c << 3) | v;
516 p++;
517 }
518 break;
519 default:
520 return -2;
521 }
522 *pp = p;
523 return c;
524}
525
526#ifdef CONFIG_ALL_UNICODE
527/* XXX: we use the same chars for name and value */
528static BOOL is_unicode_char(int c)
529{
530 return ((c >= '0' && c <= '9') ||
531 (c >= 'A' && c <= 'Z') ||
532 (c >= 'a' && c <= 'z') ||
533 (c == '_'));
534}
535
536static int parse_unicode_property(REParseState *s, CharRange *cr,
537 const uint8_t **pp, BOOL is_inv)
538{
539 const uint8_t *p;
540 char name[64], value[64];
541 char *q;
542 BOOL script_ext;
543 int ret;
544
545 p = *pp;
546 if (*p != '{')
547 return re_parse_error(s, "expecting '{' after \\p");
548 p++;
549 q = name;
550 while (is_unicode_char(*p)) {
551 if ((q - name) >= sizeof(name) - 1)
552 goto unknown_property_name;
553 *q++ = *p++;
554 }
555 *q = '\0';
556 q = value;
557 if (*p == '=') {
558 p++;
559 while (is_unicode_char(*p)) {
560 if ((q - value) >= sizeof(value) - 1)
561 return re_parse_error(s, "unknown unicode property value");
562 *q++ = *p++;
563 }
564 }
565 *q = '\0';
566 if (*p != '}')
567 return re_parse_error(s, "expecting '}'");
568 p++;
569 // printf("name=%s value=%s\n", name, value);
570
571 if (!strcmp(name, "Script") || !strcmp(name, "sc")) {
572 script_ext = FALSE;
573 goto do_script;
574 } else if (!strcmp(name, "Script_Extensions") || !strcmp(name, "scx")) {
575 script_ext = TRUE;
576 do_script:
577 cr_init(cr, s->opaque, lre_realloc);
578 ret = unicode_script(cr, value, script_ext);
579 if (ret) {
580 cr_free(cr);
581 if (ret == -2)
582 return re_parse_error(s, "unknown unicode script");
583 else
584 goto out_of_memory;
585 }
586 } else if (!strcmp(name, "General_Category") || !strcmp(name, "gc")) {
587 cr_init(cr, s->opaque, lre_realloc);
588 ret = unicode_general_category(cr, value);
589 if (ret) {
590 cr_free(cr);
591 if (ret == -2)
592 return re_parse_error(s, "unknown unicode general category");
593 else
594 goto out_of_memory;
595 }
596 } else if (value[0] == '\0') {
597 cr_init(cr, s->opaque, lre_realloc);
598 ret = unicode_general_category(cr, name);
599 if (ret == -1) {
600 cr_free(cr);
601 goto out_of_memory;
602 }
603 if (ret < 0) {
604 ret = unicode_prop(cr, name);
605 if (ret) {
606 cr_free(cr);
607 if (ret == -2)
608 goto unknown_property_name;
609 else
610 goto out_of_memory;
611 }
612 }
613 } else {
614 unknown_property_name:
615 return re_parse_error(s, "unknown unicode property name");
616 }
617
618 if (is_inv) {
619 if (cr_invert(cr)) {
620 cr_free(cr);
621 return -1;
622 }
623 }
624 *pp = p;
625 return 0;
626 out_of_memory:
627 return re_parse_out_of_memory(s);
628}
629#endif /* CONFIG_ALL_UNICODE */
630
631/* return -1 if error otherwise the character or a class range
632 (CLASS_RANGE_BASE). In case of class range, 'cr' is
633 initialized. Otherwise, it is ignored. */
634static int get_class_atom(REParseState *s, CharRange *cr,
635 const uint8_t **pp, BOOL inclass)
636{
637 const uint8_t *p;
638 uint32_t c;
639 int ret;
640
641 p = *pp;
642
643 c = *p;
644 switch(c) {
645 case '\\':
646 p++;
647 if (p >= s->buf_end)
648 goto unexpected_end;
649 c = *p++;
650 switch(c) {
651 case 'd':
652 c = CHAR_RANGE_d;
653 goto class_range;
654 case 'D':
655 c = CHAR_RANGE_D;
656 goto class_range;
657 case 's':
658 c = CHAR_RANGE_s;
659 goto class_range;
660 case 'S':
661 c = CHAR_RANGE_S;
662 goto class_range;
663 case 'w':
664 c = CHAR_RANGE_w;
665 goto class_range;
666 case 'W':
667 c = CHAR_RANGE_W;
668 class_range:
669 if (cr_init_char_range(s, cr, c))
670 return -1;
671 c = CLASS_RANGE_BASE;
672 break;
673 case 'c':
674 c = *p;
675 if ((c >= 'a' && c <= 'z') ||
676 (c >= 'A' && c <= 'Z') ||
677 (((c >= '0' && c <= '9') || c == '_') &&
678 inclass && !s->is_unicode)) { /* Annex B.1.4 */
679 c &= 0x1f;
680 p++;
681 } else if (s->is_unicode) {
682 goto invalid_escape;
683 } else {
684 /* otherwise return '\' and 'c' */
685 p--;
686 c = '\\';
687 }
688 break;
689 case '-':
690 if (!inclass && s->is_unicode)
691 goto invalid_escape;
692 break;
693#ifdef CONFIG_ALL_UNICODE
694 case 'p':
695 case 'P':
696 if (s->is_unicode) {
697 if (parse_unicode_property(s, cr, &p, (c == 'P')))
698 return -1;
699 c = CLASS_RANGE_BASE;
700 break;
701 }
702 /* fall thru */
703#endif
704 default:
705 p--;
706 ret = lre_parse_escape(&p, s->is_unicode * 2);
707 if (ret >= 0) {
708 c = ret;
709 } else {
710 if (ret == -2 && *p != '\0' && strchr("^$\\.*+?()[]{}|/", *p)) {
711 /* always valid to escape these characters */
712 goto normal_char;
713 } else if (s->is_unicode) {
714 invalid_escape:
715 return re_parse_error(s, "invalid escape sequence in regular expression");
716 } else {
717 /* just ignore the '\' */
718 goto normal_char;
719 }
720 }
721 break;
722 }
723 break;
724 case '\0':
725 if (p >= s->buf_end) {
726 unexpected_end:
727 return re_parse_error(s, "unexpected end");
728 }
729 /* fall thru */
730 default:
731 normal_char:
732 /* normal char */
733 if (c >= 128) {
734 c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
735 if ((unsigned)c > 0xffff && !s->is_unicode) {
736 /* XXX: should handle non BMP-1 code points */
737 return re_parse_error(s, "malformed unicode char");
738 }
739 } else {
740 p++;
741 }
742 break;
743 }
744 *pp = p;
745 return c;
746}
747
748static int re_emit_range(REParseState *s, const CharRange *cr)
749{
750 int len, i;
751 uint32_t high;
752
753 len = (unsigned)cr->len / 2;
754 if (len >= 65535)
755 return re_parse_error(s, "too many ranges");
756 if (len == 0) {
757 /* not sure it can really happen. Emit a match that is always
758 false */
759 re_emit_op_u32(s, REOP_char32, -1);
760 } else {
761 high = cr->points[cr->len - 1];
762 if (high == UINT32_MAX)
763 high = cr->points[cr->len - 2];
764 if (high <= 0xffff) {
765 /* can use 16 bit ranges with the conversion that 0xffff =
766 infinity */
767 re_emit_op_u16(s, REOP_range, len);
768 for(i = 0; i < cr->len; i += 2) {
769 dbuf_put_u16(&s->byte_code, cr->points[i]);
770 high = cr->points[i + 1] - 1;
771 if (high == UINT32_MAX - 1)
772 high = 0xffff;
773 dbuf_put_u16(&s->byte_code, high);
774 }
775 } else {
776 re_emit_op_u16(s, REOP_range32, len);
777 for(i = 0; i < cr->len; i += 2) {
778 dbuf_put_u32(&s->byte_code, cr->points[i]);
779 dbuf_put_u32(&s->byte_code, cr->points[i + 1] - 1);
780 }
781 }
782 }
783 return 0;
784}
785
786static int re_parse_char_class(REParseState *s, const uint8_t **pp)
787{
788 const uint8_t *p;
789 uint32_t c1, c2;
790 CharRange cr_s, *cr = &cr_s;
791 CharRange cr1_s, *cr1 = &cr1_s;
792 BOOL invert;
793
794 cr_init(cr, s->opaque, lre_realloc);
795 p = *pp;
796 p++; /* skip '[' */
797
798 invert = FALSE;
799 if (*p == '^') {
800 p++;
801 invert = TRUE;
802 }
803
804 for(;;) {
805 if (*p == ']')
806 break;
807 c1 = get_class_atom(s, cr1, &p, TRUE);
808 if ((int)c1 < 0)
809 goto fail;
810 if (*p == '-' && p[1] != ']') {
811 const uint8_t *p0 = p + 1;
812 if (c1 >= CLASS_RANGE_BASE) {
813 if (s->is_unicode) {
814 cr_free(cr1);
815 goto invalid_class_range;
816 }
817 /* Annex B: match '-' character */
818 goto class_atom;
819 }
820 c2 = get_class_atom(s, cr1, &p0, TRUE);
821 if ((int)c2 < 0)
822 goto fail;
823 if (c2 >= CLASS_RANGE_BASE) {
824 cr_free(cr1);
825 if (s->is_unicode) {
826 goto invalid_class_range;
827 }
828 /* Annex B: match '-' character */
829 goto class_atom;
830 }
831 p = p0;
832 if (c2 < c1) {
833 invalid_class_range:
834 re_parse_error(s, "invalid class range");
835 goto fail;
836 }
837 if (cr_union_interval(cr, c1, c2))
838 goto memory_error;
839 } else {
840 class_atom:
841 if (c1 >= CLASS_RANGE_BASE) {
842 int ret;
843 ret = cr_union1(cr, cr1->points, cr1->len);
844 cr_free(cr1);
845 if (ret)
846 goto memory_error;
847 } else {
848 if (cr_union_interval(cr, c1, c1))
849 goto memory_error;
850 }
851 }
852 }
853 if (s->ignore_case) {
854 if (cr_regexp_canonicalize(cr, s->is_unicode))
855 goto memory_error;
856 }
857 if (invert) {
858 if (cr_invert(cr))
859 goto memory_error;
860 }
861 if (re_emit_range(s, cr))
862 goto fail;
863 cr_free(cr);
864 p++; /* skip ']' */
865 *pp = p;
866 return 0;
867 memory_error:
868 re_parse_out_of_memory(s);
869 fail:
870 cr_free(cr);
871 return -1;
872}
873
874/* Return:
875 - true if the opcodes may not advance the char pointer
876 - false if the opcodes always advance the char pointer
877*/
878static BOOL re_need_check_advance(const uint8_t *bc_buf, int bc_buf_len)
879{
880 int pos, opcode, len;
881 uint32_t val;
882 BOOL ret;
883
884 ret = TRUE;
885 pos = 0;
886 while (pos < bc_buf_len) {
887 opcode = bc_buf[pos];
888 len = reopcode_info[opcode].size;
889 switch(opcode) {
890 case REOP_range:
891 val = get_u16(bc_buf + pos + 1);
892 len += val * 4;
893 goto simple_char;
894 case REOP_range32:
895 val = get_u16(bc_buf + pos + 1);
896 len += val * 8;
897 goto simple_char;
898 case REOP_char:
899 case REOP_char32:
900 case REOP_dot:
901 case REOP_any:
902 simple_char:
903 ret = FALSE;
904 break;
905 case REOP_line_start:
906 case REOP_line_end:
907 case REOP_push_i32:
908 case REOP_push_char_pos:
909 case REOP_drop:
910 case REOP_word_boundary:
911 case REOP_not_word_boundary:
912 case REOP_prev:
913 /* no effect */
914 break;
915 case REOP_save_start:
916 case REOP_save_end:
917 case REOP_save_reset:
918 case REOP_back_reference:
919 case REOP_backward_back_reference:
920 break;
921 default:
922 /* safe behavior: we cannot predict the outcome */
923 return TRUE;
924 }
925 pos += len;
926 }
927 return ret;
928}
929
930/* return -1 if a simple quantifier cannot be used. Otherwise return
931 the number of characters in the atom. */
932static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len)
933{
934 int pos, opcode, len, count;
935 uint32_t val;
936
937 count = 0;
938 pos = 0;
939 while (pos < bc_buf_len) {
940 opcode = bc_buf[pos];
941 len = reopcode_info[opcode].size;
942 switch(opcode) {
943 case REOP_range:
944 val = get_u16(bc_buf + pos + 1);
945 len += val * 4;
946 goto simple_char;
947 case REOP_range32:
948 val = get_u16(bc_buf + pos + 1);
949 len += val * 8;
950 goto simple_char;
951 case REOP_char:
952 case REOP_char32:
953 case REOP_dot:
954 case REOP_any:
955 simple_char:
956 count++;
957 break;
958 case REOP_line_start:
959 case REOP_line_end:
960 case REOP_word_boundary:
961 case REOP_not_word_boundary:
962 break;
963 default:
964 return -1;
965 }
966 pos += len;
967 }
968 return count;
969}
970
971/* '*pp' is the first char after '<' */
972static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp)
973{
974 const uint8_t *p, *p1;
975 uint32_t c, d;
976 char *q;
977
978 p = *pp;
979 q = buf;
980 for(;;) {
981 c = *p;
982 if (c == '\\') {
983 p++;
984 if (*p != 'u')
985 return -1;
986 c = lre_parse_escape(&p, 2); // accept surrogate pairs
987 } else if (c == '>') {
988 break;
989 } else if (c >= 128) {
990 c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
991 if (is_hi_surrogate(c)) {
992 d = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1);
993 if (is_lo_surrogate(d)) {
994 c = from_surrogate(c, d);
995 p = p1;
996 }
997 }
998 } else {
999 p++;
1000 }
1001 if (c > 0x10FFFF)
1002 return -1;
1003 if (q == buf) {
1004 if (!lre_js_is_ident_first(c))
1005 return -1;
1006 } else {
1007 if (!lre_js_is_ident_next(c))
1008 return -1;
1009 }
1010 if ((q - buf + UTF8_CHAR_LEN_MAX + 1) > buf_size)
1011 return -1;
1012 if (c < 128) {
1013 *q++ = c;
1014 } else {
1015 q += unicode_to_utf8((uint8_t*)q, c);
1016 }
1017 }
1018 if (q == buf)
1019 return -1;
1020 *q = '\0';
1021 p++;
1022 *pp = p;
1023 return 0;
1024}
1025
1026/* if capture_name = NULL: return the number of captures + 1.
1027 Otherwise, return the capture index corresponding to capture_name
1028 or -1 if none */
1029static int re_parse_captures(REParseState *s, int *phas_named_captures,
1030 const char *capture_name)
1031{
1032 const uint8_t *p;
1033 int capture_index;
1034 char name[TMP_BUF_SIZE];
1035
1036 capture_index = 1;
1037 *phas_named_captures = 0;
1038 for (p = s->buf_start; p < s->buf_end; p++) {
1039 switch (*p) {
1040 case '(':
1041 if (p[1] == '?') {
1042 if (p[2] == '<' && p[3] != '=' && p[3] != '!') {
1043 *phas_named_captures = 1;
1044 /* potential named capture */
1045 if (capture_name) {
1046 p += 3;
1047 if (re_parse_group_name(name, sizeof(name), &p) == 0) {
1048 if (!strcmp(name, capture_name))
1049 return capture_index;
1050 }
1051 }
1052 capture_index++;
1053 if (capture_index >= CAPTURE_COUNT_MAX)
1054 goto done;
1055 }
1056 } else {
1057 capture_index++;
1058 if (capture_index >= CAPTURE_COUNT_MAX)
1059 goto done;
1060 }
1061 break;
1062 case '\\':
1063 p++;
1064 break;
1065 case '[':
1066 for (p += 1 + (*p == ']'); p < s->buf_end && *p != ']'; p++) {
1067 if (*p == '\\')
1068 p++;
1069 }
1070 break;
1071 }
1072 }
1073 done:
1074 if (capture_name)
1075 return -1;
1076 else
1077 return capture_index;
1078}
1079
1080static int re_count_captures(REParseState *s)
1081{
1082 if (s->total_capture_count < 0) {
1083 s->total_capture_count = re_parse_captures(s, &s->has_named_captures,
1084 NULL);
1085 }
1086 return s->total_capture_count;
1087}
1088
1089static BOOL re_has_named_captures(REParseState *s)
1090{
1091 if (s->has_named_captures < 0)
1092 re_count_captures(s);
1093 return s->has_named_captures;
1094}
1095
1096static int find_group_name(REParseState *s, const char *name)
1097{
1098 const char *p, *buf_end;
1099 size_t len, name_len;
1100 int capture_index;
1101
1102 p = (char *)s->group_names.buf;
1103 if (!p) return -1;
1104 buf_end = (char *)s->group_names.buf + s->group_names.size;
1105 name_len = strlen(name);
1106 capture_index = 1;
1107 while (p < buf_end) {
1108 len = strlen(p);
1109 if (len == name_len && memcmp(name, p, name_len) == 0)
1110 return capture_index;
1111 p += len + 1;
1112 capture_index++;
1113 }
1114 return -1;
1115}
1116
1117static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir);
1118
1119static int re_parse_term(REParseState *s, BOOL is_backward_dir)
1120{
1121 const uint8_t *p;
1122 int c, last_atom_start, quant_min, quant_max, last_capture_count;
1123 BOOL greedy, add_zero_advance_check, is_neg, is_backward_lookahead;
1124 CharRange cr_s, *cr = &cr_s;
1125
1126 last_atom_start = -1;
1127 last_capture_count = 0;
1128 p = s->buf_ptr;
1129 c = *p;
1130 switch(c) {
1131 case '^':
1132 p++;
1133 re_emit_op(s, REOP_line_start);
1134 break;
1135 case '$':
1136 p++;
1137 re_emit_op(s, REOP_line_end);
1138 break;
1139 case '.':
1140 p++;
1141 last_atom_start = s->byte_code.size;
1142 last_capture_count = s->capture_count;
1143 if (is_backward_dir)
1144 re_emit_op(s, REOP_prev);
1145 re_emit_op(s, s->dotall ? REOP_any : REOP_dot);
1146 if (is_backward_dir)
1147 re_emit_op(s, REOP_prev);
1148 break;
1149 case '{':
1150 if (s->is_unicode) {
1151 return re_parse_error(s, "syntax error");
1152 } else if (!is_digit(p[1])) {
1153 /* Annex B: we accept '{' not followed by digits as a
1154 normal atom */
1155 goto parse_class_atom;
1156 } else {
1157 const uint8_t *p1 = p + 1;
1158 /* Annex B: error if it is like a repetition count */
1159 parse_digits(&p1, TRUE);
1160 if (*p1 == ',') {
1161 p1++;
1162 if (is_digit(*p1)) {
1163 parse_digits(&p1, TRUE);
1164 }
1165 }
1166 if (*p1 != '}') {
1167 goto parse_class_atom;
1168 }
1169 }
1170 /* fall thru */
1171 case '*':
1172 case '+':
1173 case '?':
1174 return re_parse_error(s, "nothing to repeat");
1175 case '(':
1176 if (p[1] == '?') {
1177 if (p[2] == ':') {
1178 p += 3;
1179 last_atom_start = s->byte_code.size;
1180 last_capture_count = s->capture_count;
1181 s->buf_ptr = p;
1182 if (re_parse_disjunction(s, is_backward_dir))
1183 return -1;
1184 p = s->buf_ptr;
1185 if (re_parse_expect(s, &p, ')'))
1186 return -1;
1187 } else if ((p[2] == '=' || p[2] == '!')) {
1188 is_neg = (p[2] == '!');
1189 is_backward_lookahead = FALSE;
1190 p += 3;
1191 goto lookahead;
1192 } else if (p[2] == '<' &&
1193 (p[3] == '=' || p[3] == '!')) {
1194 int pos;
1195 is_neg = (p[3] == '!');
1196 is_backward_lookahead = TRUE;
1197 p += 4;
1198 /* lookahead */
1199 lookahead:
1200 /* Annex B allows lookahead to be used as an atom for
1201 the quantifiers */
1202 if (!s->is_unicode && !is_backward_lookahead) {
1203 last_atom_start = s->byte_code.size;
1204 last_capture_count = s->capture_count;
1205 }
1206 pos = re_emit_op_u32(s, REOP_lookahead + is_neg, 0);
1207 s->buf_ptr = p;
1208 if (re_parse_disjunction(s, is_backward_lookahead))
1209 return -1;
1210 p = s->buf_ptr;
1211 if (re_parse_expect(s, &p, ')'))
1212 return -1;
1213 re_emit_op(s, REOP_match);
1214 /* jump after the 'match' after the lookahead is successful */
1215 if (dbuf_error(&s->byte_code))
1216 return -1;
1217 put_u32(s->byte_code.buf + pos, s->byte_code.size - (pos + 4));
1218 } else if (p[2] == '<') {
1219 p += 3;
1220 if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
1221 &p)) {
1222 return re_parse_error(s, "invalid group name");
1223 }
1224 if (find_group_name(s, s->u.tmp_buf) > 0) {
1225 return re_parse_error(s, "duplicate group name");
1226 }
1227 /* group name with a trailing zero */
1228 dbuf_put(&s->group_names, (uint8_t *)s->u.tmp_buf,
1229 strlen(s->u.tmp_buf) + 1);
1230 s->has_named_captures = 1;
1231 goto parse_capture;
1232 } else {
1233 return re_parse_error(s, "invalid group");
1234 }
1235 } else {
1236 int capture_index;
1237 p++;
1238 /* capture without group name */
1239 dbuf_putc(&s->group_names, 0);
1240 parse_capture:
1241 if (s->capture_count >= CAPTURE_COUNT_MAX)
1242 return re_parse_error(s, "too many captures");
1243 last_atom_start = s->byte_code.size;
1244 last_capture_count = s->capture_count;
1245 capture_index = s->capture_count++;
1246 re_emit_op_u8(s, REOP_save_start + is_backward_dir,
1247 capture_index);
1248
1249 s->buf_ptr = p;
1250 if (re_parse_disjunction(s, is_backward_dir))
1251 return -1;
1252 p = s->buf_ptr;
1253
1254 re_emit_op_u8(s, REOP_save_start + 1 - is_backward_dir,
1255 capture_index);
1256
1257 if (re_parse_expect(s, &p, ')'))
1258 return -1;
1259 }
1260 break;
1261 case '\\':
1262 switch(p[1]) {
1263 case 'b':
1264 case 'B':
1265 re_emit_op(s, REOP_word_boundary + (p[1] != 'b'));
1266 p += 2;
1267 break;
1268 case 'k':
1269 {
1270 const uint8_t *p1;
1271 int dummy_res;
1272
1273 p1 = p;
1274 if (p1[2] != '<') {
1275 /* annex B: we tolerate invalid group names in non
1276 unicode mode if there is no named capture
1277 definition */
1278 if (s->is_unicode || re_has_named_captures(s))
1279 return re_parse_error(s, "expecting group name");
1280 else
1281 goto parse_class_atom;
1282 }
1283 p1 += 3;
1284 if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
1285 &p1)) {
1286 if (s->is_unicode || re_has_named_captures(s))
1287 return re_parse_error(s, "invalid group name");
1288 else
1289 goto parse_class_atom;
1290 }
1291 c = find_group_name(s, s->u.tmp_buf);
1292 if (c < 0) {
1293 /* no capture name parsed before, try to look
1294 after (inefficient, but hopefully not common */
1295 c = re_parse_captures(s, &dummy_res, s->u.tmp_buf);
1296 if (c < 0) {
1297 if (s->is_unicode || re_has_named_captures(s))
1298 return re_parse_error(s, "group name not defined");
1299 else
1300 goto parse_class_atom;
1301 }
1302 }
1303 p = p1;
1304 }
1305 goto emit_back_reference;
1306 case '0':
1307 p += 2;
1308 c = 0;
1309 if (s->is_unicode) {
1310 if (is_digit(*p)) {
1311 return re_parse_error(s, "invalid decimal escape in regular expression");
1312 }
1313 } else {
1314 /* Annex B.1.4: accept legacy octal */
1315 if (*p >= '0' && *p <= '7') {
1316 c = *p++ - '0';
1317 if (*p >= '0' && *p <= '7') {
1318 c = (c << 3) + *p++ - '0';
1319 }
1320 }
1321 }
1322 goto normal_char;
1323 case '1': case '2': case '3': case '4':
1324 case '5': case '6': case '7': case '8':
1325 case '9':
1326 {
1327 const uint8_t *q = ++p;
1328
1329 c = parse_digits(&p, FALSE);
1330 if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) {
1331 if (!s->is_unicode) {
1332 /* Annex B.1.4: accept legacy octal */
1333 p = q;
1334 if (*p <= '7') {
1335 c = 0;
1336 if (*p <= '3')
1337 c = *p++ - '0';
1338 if (*p >= '0' && *p <= '7') {
1339 c = (c << 3) + *p++ - '0';
1340 if (*p >= '0' && *p <= '7') {
1341 c = (c << 3) + *p++ - '0';
1342 }
1343 }
1344 } else {
1345 c = *p++;
1346 }
1347 goto normal_char;
1348 }
1349 return re_parse_error(s, "back reference out of range in regular expression");
1350 }
1351 emit_back_reference:
1352 last_atom_start = s->byte_code.size;
1353 last_capture_count = s->capture_count;
1354 re_emit_op_u8(s, REOP_back_reference + is_backward_dir, c);
1355 }
1356 break;
1357 default:
1358 goto parse_class_atom;
1359 }
1360 break;
1361 case '[':
1362 last_atom_start = s->byte_code.size;
1363 last_capture_count = s->capture_count;
1364 if (is_backward_dir)
1365 re_emit_op(s, REOP_prev);
1366 if (re_parse_char_class(s, &p))
1367 return -1;
1368 if (is_backward_dir)
1369 re_emit_op(s, REOP_prev);
1370 break;
1371 case ']':
1372 case '}':
1373 if (s->is_unicode)
1374 return re_parse_error(s, "syntax error");
1375 goto parse_class_atom;
1376 default:
1377 parse_class_atom:
1378 c = get_class_atom(s, cr, &p, FALSE);
1379 if ((int)c < 0)
1380 return -1;
1381 normal_char:
1382 last_atom_start = s->byte_code.size;
1383 last_capture_count = s->capture_count;
1384 if (is_backward_dir)
1385 re_emit_op(s, REOP_prev);
1386 if (c >= CLASS_RANGE_BASE) {
1387 int ret;
1388 /* Note: canonicalization is not needed */
1389 ret = re_emit_range(s, cr);
1390 cr_free(cr);
1391 if (ret)
1392 return -1;
1393 } else {
1394 if (s->ignore_case)
1395 c = lre_canonicalize(c, s->is_unicode);
1396 if (c <= 0xffff)
1397 re_emit_op_u16(s, REOP_char, c);
1398 else
1399 re_emit_op_u32(s, REOP_char32, c);
1400 }
1401 if (is_backward_dir)
1402 re_emit_op(s, REOP_prev);
1403 break;
1404 }
1405
1406 /* quantifier */
1407 if (last_atom_start >= 0) {
1408 c = *p;
1409 switch(c) {
1410 case '*':
1411 p++;
1412 quant_min = 0;
1413 quant_max = INT32_MAX;
1414 goto quantifier;
1415 case '+':
1416 p++;
1417 quant_min = 1;
1418 quant_max = INT32_MAX;
1419 goto quantifier;
1420 case '?':
1421 p++;
1422 quant_min = 0;
1423 quant_max = 1;
1424 goto quantifier;
1425 case '{':
1426 {
1427 const uint8_t *p1 = p;
1428 /* As an extension (see ES6 annex B), we accept '{' not
1429 followed by digits as a normal atom */
1430 if (!is_digit(p[1])) {
1431 if (s->is_unicode)
1432 goto invalid_quant_count;
1433 break;
1434 }
1435 p++;
1436 quant_min = parse_digits(&p, TRUE);
1437 quant_max = quant_min;
1438 if (*p == ',') {
1439 p++;
1440 if (is_digit(*p)) {
1441 quant_max = parse_digits(&p, TRUE);
1442 if (quant_max < quant_min) {
1443 invalid_quant_count:
1444 return re_parse_error(s, "invalid repetition count");
1445 }
1446 } else {
1447 quant_max = INT32_MAX; /* infinity */
1448 }
1449 }
1450 if (*p != '}' && !s->is_unicode) {
1451 /* Annex B: normal atom if invalid '{' syntax */
1452 p = p1;
1453 break;
1454 }
1455 if (re_parse_expect(s, &p, '}'))
1456 return -1;
1457 }
1458 quantifier:
1459 greedy = TRUE;
1460 if (*p == '?') {
1461 p++;
1462 greedy = FALSE;
1463 }
1464 if (last_atom_start < 0) {
1465 return re_parse_error(s, "nothing to repeat");
1466 }
1467 if (greedy) {
1468 int len, pos;
1469
1470 if (quant_max > 0) {
1471 /* specific optimization for simple quantifiers */
1472 if (dbuf_error(&s->byte_code))
1473 goto out_of_memory;
1474 len = re_is_simple_quantifier(s->byte_code.buf + last_atom_start,
1475 s->byte_code.size - last_atom_start);
1476 if (len > 0) {
1477 re_emit_op(s, REOP_match);
1478
1479 if (dbuf_insert(&s->byte_code, last_atom_start, 17))
1480 goto out_of_memory;
1481 pos = last_atom_start;
1482 s->byte_code.buf[pos++] = REOP_simple_greedy_quant;
1483 put_u32(&s->byte_code.buf[pos],
1484 s->byte_code.size - last_atom_start - 17);
1485 pos += 4;
1486 put_u32(&s->byte_code.buf[pos], quant_min);
1487 pos += 4;
1488 put_u32(&s->byte_code.buf[pos], quant_max);
1489 pos += 4;
1490 put_u32(&s->byte_code.buf[pos], len);
1491 pos += 4;
1492 goto done;
1493 }
1494 }
1495
1496 if (dbuf_error(&s->byte_code))
1497 goto out_of_memory;
1498 }
1499 /* the spec tells that if there is no advance when
1500 running the atom after the first quant_min times,
1501 then there is no match. We remove this test when we
1502 are sure the atom always advances the position. */
1503 add_zero_advance_check = re_need_check_advance(s->byte_code.buf + last_atom_start,
1504 s->byte_code.size - last_atom_start);
1505
1506 {
1507 int len, pos;
1508 len = s->byte_code.size - last_atom_start;
1509 if (quant_min == 0) {
1510 /* need to reset the capture in case the atom is
1511 not executed */
1512 if (last_capture_count != s->capture_count) {
1513 if (dbuf_insert(&s->byte_code, last_atom_start, 3))
1514 goto out_of_memory;
1515 s->byte_code.buf[last_atom_start++] = REOP_save_reset;
1516 s->byte_code.buf[last_atom_start++] = last_capture_count;
1517 s->byte_code.buf[last_atom_start++] = s->capture_count - 1;
1518 }
1519 if (quant_max == 0) {
1520 s->byte_code.size = last_atom_start;
1521 } else if (quant_max == 1 || quant_max == INT32_MAX) {
1522 BOOL has_goto = (quant_max == INT32_MAX);
1523 if (dbuf_insert(&s->byte_code, last_atom_start, 5 + add_zero_advance_check))
1524 goto out_of_memory;
1525 s->byte_code.buf[last_atom_start] = REOP_split_goto_first +
1526 greedy;
1527 put_u32(s->byte_code.buf + last_atom_start + 1,
1528 len + 5 * has_goto + add_zero_advance_check * 2);
1529 if (add_zero_advance_check) {
1530 s->byte_code.buf[last_atom_start + 1 + 4] = REOP_push_char_pos;
1531 re_emit_op(s, REOP_check_advance);
1532 }
1533 if (has_goto)
1534 re_emit_goto(s, REOP_goto, last_atom_start);
1535 } else {
1536 if (dbuf_insert(&s->byte_code, last_atom_start, 10 + add_zero_advance_check))
1537 goto out_of_memory;
1538 pos = last_atom_start;
1539 s->byte_code.buf[pos++] = REOP_push_i32;
1540 put_u32(s->byte_code.buf + pos, quant_max);
1541 pos += 4;
1542 s->byte_code.buf[pos++] = REOP_split_goto_first + greedy;
1543 put_u32(s->byte_code.buf + pos, len + 5 + add_zero_advance_check * 2);
1544 pos += 4;
1545 if (add_zero_advance_check) {
1546 s->byte_code.buf[pos++] = REOP_push_char_pos;
1547 re_emit_op(s, REOP_check_advance);
1548 }
1549 re_emit_goto(s, REOP_loop, last_atom_start + 5);
1550 re_emit_op(s, REOP_drop);
1551 }
1552 } else if (quant_min == 1 && quant_max == INT32_MAX &&
1553 !add_zero_advance_check) {
1554 re_emit_goto(s, REOP_split_next_first - greedy,
1555 last_atom_start);
1556 } else {
1557 if (quant_min == 1) {
1558 /* nothing to add */
1559 } else {
1560 if (dbuf_insert(&s->byte_code, last_atom_start, 5))
1561 goto out_of_memory;
1562 s->byte_code.buf[last_atom_start] = REOP_push_i32;
1563 put_u32(s->byte_code.buf + last_atom_start + 1,
1564 quant_min);
1565 last_atom_start += 5;
1566 re_emit_goto(s, REOP_loop, last_atom_start);
1567 re_emit_op(s, REOP_drop);
1568 }
1569 if (quant_max == INT32_MAX) {
1570 pos = s->byte_code.size;
1571 re_emit_op_u32(s, REOP_split_goto_first + greedy,
1572 len + 5 + add_zero_advance_check * 2);
1573 if (add_zero_advance_check)
1574 re_emit_op(s, REOP_push_char_pos);
1575 /* copy the atom */
1576 dbuf_put_self(&s->byte_code, last_atom_start, len);
1577 if (add_zero_advance_check)
1578 re_emit_op(s, REOP_check_advance);
1579 re_emit_goto(s, REOP_goto, pos);
1580 } else if (quant_max > quant_min) {
1581 re_emit_op_u32(s, REOP_push_i32, quant_max - quant_min);
1582 pos = s->byte_code.size;
1583 re_emit_op_u32(s, REOP_split_goto_first + greedy,
1584 len + 5 + add_zero_advance_check * 2);
1585 if (add_zero_advance_check)
1586 re_emit_op(s, REOP_push_char_pos);
1587 /* copy the atom */
1588 dbuf_put_self(&s->byte_code, last_atom_start, len);
1589 if (add_zero_advance_check)
1590 re_emit_op(s, REOP_check_advance);
1591 re_emit_goto(s, REOP_loop, pos);
1592 re_emit_op(s, REOP_drop);
1593 }
1594 }
1595 last_atom_start = -1;
1596 }
1597 break;
1598 default:
1599 break;
1600 }
1601 }
1602 done:
1603 s->buf_ptr = p;
1604 return 0;
1605 out_of_memory:
1606 return re_parse_out_of_memory(s);
1607}
1608
1609static int re_parse_alternative(REParseState *s, BOOL is_backward_dir)
1610{
1611 const uint8_t *p;
1612 int ret;
1613 size_t start, term_start, end, term_size;
1614
1615 start = s->byte_code.size;
1616 for(;;) {
1617 p = s->buf_ptr;
1618 if (p >= s->buf_end)
1619 break;
1620 if (*p == '|' || *p == ')')
1621 break;
1622 term_start = s->byte_code.size;
1623 ret = re_parse_term(s, is_backward_dir);
1624 if (ret)
1625 return ret;
1626 if (is_backward_dir) {
1627 /* reverse the order of the terms (XXX: inefficient, but
1628 speed is not really critical here) */
1629 end = s->byte_code.size;
1630 term_size = end - term_start;
1631 if (dbuf_realloc(&s->byte_code, end + term_size))
1632 return -1;
1633 memmove(s->byte_code.buf + start + term_size,
1634 s->byte_code.buf + start,
1635 end - start);
1636 memcpy(s->byte_code.buf + start, s->byte_code.buf + end,
1637 term_size);
1638 }
1639 }
1640 return 0;
1641}
1642
1643static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir)
1644{
1645 int start, len, pos;
1646
1647 if (lre_check_stack_overflow(s->opaque, 0))
1648 return re_parse_error(s, "stack overflow");
1649
1650 start = s->byte_code.size;
1651 if (re_parse_alternative(s, is_backward_dir))
1652 return -1;
1653 while (*s->buf_ptr == '|') {
1654 s->buf_ptr++;
1655
1656 len = s->byte_code.size - start;
1657
1658 /* insert a split before the first alternative */
1659 if (dbuf_insert(&s->byte_code, start, 5)) {
1660 return re_parse_out_of_memory(s);
1661 }
1662 s->byte_code.buf[start] = REOP_split_next_first;
1663 put_u32(s->byte_code.buf + start + 1, len + 5);
1664
1665 pos = re_emit_op_u32(s, REOP_goto, 0);
1666
1667 if (re_parse_alternative(s, is_backward_dir))
1668 return -1;
1669
1670 /* patch the goto */
1671 len = s->byte_code.size - (pos + 4);
1672 put_u32(s->byte_code.buf + pos, len);
1673 }
1674 return 0;
1675}
1676
1677/* the control flow is recursive so the analysis can be linear */
1678static int compute_stack_size(const uint8_t *bc_buf, int bc_buf_len)
1679{
1680 int stack_size, stack_size_max, pos, opcode, len;
1681 uint32_t val;
1682
1683 stack_size = 0;
1684 stack_size_max = 0;
1685 bc_buf += RE_HEADER_LEN;
1686 bc_buf_len -= RE_HEADER_LEN;
1687 pos = 0;
1688 while (pos < bc_buf_len) {
1689 opcode = bc_buf[pos];
1690 len = reopcode_info[opcode].size;
1691 assert(opcode < REOP_COUNT);
1692 assert((pos + len) <= bc_buf_len);
1693 switch(opcode) {
1694 case REOP_push_i32:
1695 case REOP_push_char_pos:
1696 stack_size++;
1697 if (stack_size > stack_size_max) {
1698 if (stack_size > STACK_SIZE_MAX)
1699 return -1;
1700 stack_size_max = stack_size;
1701 }
1702 break;
1703 case REOP_drop:
1704 case REOP_check_advance:
1705 assert(stack_size > 0);
1706 stack_size--;
1707 break;
1708 case REOP_range:
1709 val = get_u16(bc_buf + pos + 1);
1710 len += val * 4;
1711 break;
1712 case REOP_range32:
1713 val = get_u16(bc_buf + pos + 1);
1714 len += val * 8;
1715 break;
1716 }
1717 pos += len;
1718 }
1719 return stack_size_max;
1720}
1721
1722/* 'buf' must be a zero terminated UTF-8 string of length buf_len.
1723 Return NULL if error and allocate an error message in *perror_msg,
1724 otherwise the compiled bytecode and its length in plen.
1725*/
1726uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
1727 const char *buf, size_t buf_len, int re_flags,
1728 void *opaque)
1729{
1730 REParseState s_s, *s = &s_s;
1731 int stack_size;
1732 BOOL is_sticky;
1733
1734 memset(s, 0, sizeof(*s));
1735 s->opaque = opaque;
1736 s->buf_ptr = (const uint8_t *)buf;
1737 s->buf_end = s->buf_ptr + buf_len;
1738 s->buf_start = s->buf_ptr;
1739 s->re_flags = re_flags;
1740 s->is_unicode = ((re_flags & LRE_FLAG_UNICODE) != 0);
1741 is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0);
1742 s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0);
1743 s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0);
1744 s->capture_count = 1;
1745 s->total_capture_count = -1;
1746 s->has_named_captures = -1;
1747
1748 dbuf_init2(&s->byte_code, opaque, lre_realloc);
1749 dbuf_init2(&s->group_names, opaque, lre_realloc);
1750
1751 dbuf_putc(&s->byte_code, re_flags); /* first element is the flags */
1752 dbuf_putc(&s->byte_code, 0); /* second element is the number of captures */
1753 dbuf_putc(&s->byte_code, 0); /* stack size */
1754 dbuf_put_u32(&s->byte_code, 0); /* bytecode length */
1755
1756 if (!is_sticky) {
1757 /* iterate thru all positions (about the same as .*?( ... ) )
1758 . We do it without an explicit loop so that lock step
1759 thread execution will be possible in an optimized
1760 implementation */
1761 re_emit_op_u32(s, REOP_split_goto_first, 1 + 5);
1762 re_emit_op(s, REOP_any);
1763 re_emit_op_u32(s, REOP_goto, -(5 + 1 + 5));
1764 }
1765 re_emit_op_u8(s, REOP_save_start, 0);
1766
1767 if (re_parse_disjunction(s, FALSE)) {
1768 error:
1769 dbuf_free(&s->byte_code);
1770 dbuf_free(&s->group_names);
1771 pstrcpy(error_msg, error_msg_size, s->u.error_msg);
1772 *plen = 0;
1773 return NULL;
1774 }
1775
1776 re_emit_op_u8(s, REOP_save_end, 0);
1777
1778 re_emit_op(s, REOP_match);
1779
1780 if (*s->buf_ptr != '\0') {
1781 re_parse_error(s, "extraneous characters at the end");
1782 goto error;
1783 }
1784
1785 if (dbuf_error(&s->byte_code)) {
1786 re_parse_out_of_memory(s);
1787 goto error;
1788 }
1789
1790 stack_size = compute_stack_size(s->byte_code.buf, s->byte_code.size);
1791 if (stack_size < 0) {
1792 re_parse_error(s, "too many imbricated quantifiers");
1793 goto error;
1794 }
1795
1796 s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count;
1797 s->byte_code.buf[RE_HEADER_STACK_SIZE] = stack_size;
1798 put_u32(s->byte_code.buf + RE_HEADER_BYTECODE_LEN,
1799 s->byte_code.size - RE_HEADER_LEN);
1800
1801 /* add the named groups if needed */
1802 if (s->group_names.size > (s->capture_count - 1)) {
1803 dbuf_put(&s->byte_code, s->group_names.buf, s->group_names.size);
1804 s->byte_code.buf[RE_HEADER_FLAGS] |= LRE_FLAG_NAMED_GROUPS;
1805 }
1806 dbuf_free(&s->group_names);
1807
1808#ifdef DUMP_REOP
1809 lre_dump_bytecode(s->byte_code.buf, s->byte_code.size);
1810#endif
1811
1812 error_msg[0] = '\0';
1813 *plen = s->byte_code.size;
1814 return s->byte_code.buf;
1815}
1816
1817static BOOL is_line_terminator(uint32_t c)
1818{
1819 return (c == '\n' || c == '\r' || c == CP_LS || c == CP_PS);
1820}
1821
1822static BOOL is_word_char(uint32_t c)
1823{
1824 return ((c >= '0' && c <= '9') ||
1825 (c >= 'a' && c <= 'z') ||
1826 (c >= 'A' && c <= 'Z') ||
1827 (c == '_'));
1828}
1829
1830#define GET_CHAR(c, cptr, cbuf_end, cbuf_type) \
1831 do { \
1832 if (cbuf_type == 0) { \
1833 c = *cptr++; \
1834 } else { \
1835 const uint16_t *_p = (const uint16_t *)cptr; \
1836 const uint16_t *_end = (const uint16_t *)cbuf_end; \
1837 c = *_p++; \
1838 if (is_hi_surrogate(c) && cbuf_type == 2) { \
1839 if (_p < _end && is_lo_surrogate(*_p)) { \
1840 c = from_surrogate(c, *_p++); \
1841 } \
1842 } \
1843 cptr = (const void *)_p; \
1844 } \
1845 } while (0)
1846
1847#define PEEK_CHAR(c, cptr, cbuf_end, cbuf_type) \
1848 do { \
1849 if (cbuf_type == 0) { \
1850 c = cptr[0]; \
1851 } else { \
1852 const uint16_t *_p = (const uint16_t *)cptr; \
1853 const uint16_t *_end = (const uint16_t *)cbuf_end; \
1854 c = *_p++; \
1855 if (is_hi_surrogate(c) && cbuf_type == 2) { \
1856 if (_p < _end && is_lo_surrogate(*_p)) { \
1857 c = from_surrogate(c, *_p); \
1858 } \
1859 } \
1860 } \
1861 } while (0)
1862
1863#define PEEK_PREV_CHAR(c, cptr, cbuf_start, cbuf_type) \
1864 do { \
1865 if (cbuf_type == 0) { \
1866 c = cptr[-1]; \
1867 } else { \
1868 const uint16_t *_p = (const uint16_t *)cptr - 1; \
1869 const uint16_t *_start = (const uint16_t *)cbuf_start; \
1870 c = *_p; \
1871 if (is_lo_surrogate(c) && cbuf_type == 2) { \
1872 if (_p > _start && is_hi_surrogate(_p[-1])) { \
1873 c = from_surrogate(*--_p, c); \
1874 } \
1875 } \
1876 } \
1877 } while (0)
1878
1879#define GET_PREV_CHAR(c, cptr, cbuf_start, cbuf_type) \
1880 do { \
1881 if (cbuf_type == 0) { \
1882 cptr--; \
1883 c = cptr[0]; \
1884 } else { \
1885 const uint16_t *_p = (const uint16_t *)cptr - 1; \
1886 const uint16_t *_start = (const uint16_t *)cbuf_start; \
1887 c = *_p; \
1888 if (is_lo_surrogate(c) && cbuf_type == 2) { \
1889 if (_p > _start && is_hi_surrogate(_p[-1])) { \
1890 c = from_surrogate(*--_p, c); \
1891 } \
1892 } \
1893 cptr = (const void *)_p; \
1894 } \
1895 } while (0)
1896
1897#define PREV_CHAR(cptr, cbuf_start, cbuf_type) \
1898 do { \
1899 if (cbuf_type == 0) { \
1900 cptr--; \
1901 } else { \
1902 const uint16_t *_p = (const uint16_t *)cptr - 1; \
1903 const uint16_t *_start = (const uint16_t *)cbuf_start; \
1904 if (is_lo_surrogate(*_p) && cbuf_type == 2) { \
1905 if (_p > _start && is_hi_surrogate(_p[-1])) { \
1906 --_p; \
1907 } \
1908 } \
1909 cptr = (const void *)_p; \
1910 } \
1911 } while (0)
1912
1913typedef uintptr_t StackInt;
1914
1915typedef enum {
1916 RE_EXEC_STATE_SPLIT,
1917 RE_EXEC_STATE_LOOKAHEAD,
1918 RE_EXEC_STATE_NEGATIVE_LOOKAHEAD,
1919 RE_EXEC_STATE_GREEDY_QUANT,
1920} REExecStateEnum;
1921
1922typedef struct REExecState {
1923 REExecStateEnum type : 8;
1924 uint8_t stack_len;
1925 size_t count; /* only used for RE_EXEC_STATE_GREEDY_QUANT */
1926 const uint8_t *cptr;
1927 const uint8_t *pc;
1928 void *buf[0];
1929} REExecState;
1930
1931typedef struct {
1932 const uint8_t *cbuf;
1933 const uint8_t *cbuf_end;
1934 /* 0 = 8 bit chars, 1 = 16 bit chars, 2 = 16 bit chars, UTF-16 */
1935 int cbuf_type;
1936 int capture_count;
1937 int stack_size_max;
1938 BOOL multi_line;
1939 BOOL ignore_case;
1940 BOOL is_unicode;
1941 int interrupt_counter;
1942 void *opaque; /* used for stack overflow check */
1943
1944 size_t state_size;
1945 uint8_t *state_stack;
1946 size_t state_stack_size;
1947 size_t state_stack_len;
1948} REExecContext;
1949
1950static int push_state(REExecContext *s,
1951 uint8_t **capture,
1952 StackInt *stack, size_t stack_len,
1953 const uint8_t *pc, const uint8_t *cptr,
1954 REExecStateEnum type, size_t count)
1955{
1956 REExecState *rs;
1957 uint8_t *new_stack;
1958 size_t new_size, i, n;
1959 StackInt *stack_buf;
1960
1961 if (unlikely((s->state_stack_len + 1) > s->state_stack_size)) {
1962 /* reallocate the stack */
1963 new_size = s->state_stack_size * 3 / 2;
1964 if (new_size < 8)
1965 new_size = 8;
1966 new_stack = lre_realloc(s->opaque, s->state_stack, new_size * s->state_size);
1967 if (!new_stack)
1968 return -1;
1969 s->state_stack_size = new_size;
1970 s->state_stack = new_stack;
1971 }
1972 rs = (REExecState *)(s->state_stack + s->state_stack_len * s->state_size);
1973 s->state_stack_len++;
1974 rs->type = type;
1975 rs->count = count;
1976 rs->stack_len = stack_len;
1977 rs->cptr = cptr;
1978 rs->pc = pc;
1979 n = 2 * s->capture_count;
1980 for(i = 0; i < n; i++)
1981 rs->buf[i] = capture[i];
1982 stack_buf = (StackInt *)(rs->buf + n);
1983 for(i = 0; i < stack_len; i++)
1984 stack_buf[i] = stack[i];
1985 return 0;
1986}
1987
1988static int lre_poll_timeout(REExecContext *s)
1989{
1990 if (unlikely(--s->interrupt_counter <= 0)) {
1991 s->interrupt_counter = INTERRUPT_COUNTER_INIT;
1992 if (lre_check_timeout(s->opaque))
1993 return LRE_RET_TIMEOUT;
1994 }
1995 return 0;
1996}
1997
1998/* return 1 if match, 0 if not match or < 0 if error. */
1999static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
2000 StackInt *stack, int stack_len,
2001 const uint8_t *pc, const uint8_t *cptr,
2002 BOOL no_recurse)
2003{
2004 int opcode, ret;
2005 int cbuf_type;
2006 uint32_t val, c;
2007 const uint8_t *cbuf_end;
2008
2009 cbuf_type = s->cbuf_type;
2010 cbuf_end = s->cbuf_end;
2011
2012 for(;;) {
2013 // printf("top=%p: pc=%d\n", th_list.top, (int)(pc - (bc_buf + RE_HEADER_LEN)));
2014 opcode = *pc++;
2015 switch(opcode) {
2016 case REOP_match:
2017 {
2018 REExecState *rs;
2019 if (no_recurse)
2020 return (intptr_t)cptr;
2021 ret = 1;
2022 goto recurse;
2023 no_match:
2024 if (no_recurse)
2025 return 0;
2026 ret = 0;
2027 recurse:
2028 for(;;) {
2029 if (lre_poll_timeout(s))
2030 return LRE_RET_TIMEOUT;
2031 if (s->state_stack_len == 0)
2032 return ret;
2033 rs = (REExecState *)(s->state_stack +
2034 (s->state_stack_len - 1) * s->state_size);
2035 if (rs->type == RE_EXEC_STATE_SPLIT) {
2036 if (!ret) {
2037 pop_state:
2038 memcpy(capture, rs->buf,
2039 sizeof(capture[0]) * 2 * s->capture_count);
2040 pop_state1:
2041 pc = rs->pc;
2042 cptr = rs->cptr;
2043 stack_len = rs->stack_len;
2044 memcpy(stack, rs->buf + 2 * s->capture_count,
2045 stack_len * sizeof(stack[0]));
2046 s->state_stack_len--;
2047 break;
2048 }
2049 } else if (rs->type == RE_EXEC_STATE_GREEDY_QUANT) {
2050 if (!ret) {
2051 uint32_t char_count, i;
2052 memcpy(capture, rs->buf,
2053 sizeof(capture[0]) * 2 * s->capture_count);
2054 stack_len = rs->stack_len;
2055 memcpy(stack, rs->buf + 2 * s->capture_count,
2056 stack_len * sizeof(stack[0]));
2057 pc = rs->pc;
2058 cptr = rs->cptr;
2059 /* go backward */
2060 char_count = get_u32(pc + 12);
2061 for(i = 0; i < char_count; i++) {
2062 PREV_CHAR(cptr, s->cbuf, cbuf_type);
2063 }
2064 pc = (pc + 16) + (int)get_u32(pc);
2065 rs->cptr = cptr;
2066 rs->count--;
2067 if (rs->count == 0) {
2068 s->state_stack_len--;
2069 }
2070 break;
2071 }
2072 } else {
2073 ret = ((rs->type == RE_EXEC_STATE_LOOKAHEAD && ret) ||
2074 (rs->type == RE_EXEC_STATE_NEGATIVE_LOOKAHEAD && !ret));
2075 if (ret) {
2076 /* keep the capture in case of positive lookahead */
2077 if (rs->type == RE_EXEC_STATE_LOOKAHEAD)
2078 goto pop_state1;
2079 else
2080 goto pop_state;
2081 }
2082 }
2083 s->state_stack_len--;
2084 }
2085 }
2086 break;
2087 case REOP_char32:
2088 val = get_u32(pc);
2089 pc += 4;
2090 goto test_char;
2091 case REOP_char:
2092 val = get_u16(pc);
2093 pc += 2;
2094 test_char:
2095 if (cptr >= cbuf_end)
2096 goto no_match;
2097 GET_CHAR(c, cptr, cbuf_end, cbuf_type);
2098 if (s->ignore_case) {
2099 c = lre_canonicalize(c, s->is_unicode);
2100 }
2101 if (val != c)
2102 goto no_match;
2103 break;
2104 case REOP_split_goto_first:
2105 case REOP_split_next_first:
2106 {
2107 const uint8_t *pc1;
2108
2109 val = get_u32(pc);
2110 pc += 4;
2111 if (opcode == REOP_split_next_first) {
2112 pc1 = pc + (int)val;
2113 } else {
2114 pc1 = pc;
2115 pc = pc + (int)val;
2116 }
2117 ret = push_state(s, capture, stack, stack_len,
2118 pc1, cptr, RE_EXEC_STATE_SPLIT, 0);
2119 if (ret < 0)
2120 return LRE_RET_MEMORY_ERROR;
2121 break;
2122 }
2123 case REOP_lookahead:
2124 case REOP_negative_lookahead:
2125 val = get_u32(pc);
2126 pc += 4;
2127 ret = push_state(s, capture, stack, stack_len,
2128 pc + (int)val, cptr,
2129 RE_EXEC_STATE_LOOKAHEAD + opcode - REOP_lookahead,
2130 0);
2131 if (ret < 0)
2132 return LRE_RET_MEMORY_ERROR;
2133 break;
2134
2135 case REOP_goto:
2136 val = get_u32(pc);
2137 pc += 4 + (int)val;
2138 if (lre_poll_timeout(s))
2139 return LRE_RET_TIMEOUT;
2140 break;
2141 case REOP_line_start:
2142 if (cptr == s->cbuf)
2143 break;
2144 if (!s->multi_line)
2145 goto no_match;
2146 PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
2147 if (!is_line_terminator(c))
2148 goto no_match;
2149 break;
2150 case REOP_line_end:
2151 if (cptr == cbuf_end)
2152 break;
2153 if (!s->multi_line)
2154 goto no_match;
2155 PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
2156 if (!is_line_terminator(c))
2157 goto no_match;
2158 break;
2159 case REOP_dot:
2160 if (cptr == cbuf_end)
2161 goto no_match;
2162 GET_CHAR(c, cptr, cbuf_end, cbuf_type);
2163 if (is_line_terminator(c))
2164 goto no_match;
2165 break;
2166 case REOP_any:
2167 if (cptr == cbuf_end)
2168 goto no_match;
2169 GET_CHAR(c, cptr, cbuf_end, cbuf_type);
2170 break;
2171 case REOP_save_start:
2172 case REOP_save_end:
2173 val = *pc++;
2174 assert(val < s->capture_count);
2175 capture[2 * val + opcode - REOP_save_start] = (uint8_t *)cptr;
2176 break;
2177 case REOP_save_reset:
2178 {
2179 uint32_t val2;
2180 val = pc[0];
2181 val2 = pc[1];
2182 pc += 2;
2183 assert(val2 < s->capture_count);
2184 while (val <= val2) {
2185 capture[2 * val] = NULL;
2186 capture[2 * val + 1] = NULL;
2187 val++;
2188 }
2189 }
2190 break;
2191 case REOP_push_i32:
2192 val = get_u32(pc);
2193 pc += 4;
2194 stack[stack_len++] = val;
2195 break;
2196 case REOP_drop:
2197 stack_len--;
2198 break;
2199 case REOP_loop:
2200 val = get_u32(pc);
2201 pc += 4;
2202 if (--stack[stack_len - 1] != 0) {
2203 pc += (int)val;
2204 if (lre_poll_timeout(s))
2205 return LRE_RET_TIMEOUT;
2206 }
2207 break;
2208 case REOP_push_char_pos:
2209 stack[stack_len++] = (uintptr_t)cptr;
2210 break;
2211 case REOP_check_advance:
2212 if (stack[--stack_len] == (uintptr_t)cptr)
2213 goto no_match;
2214 break;
2215 case REOP_word_boundary:
2216 case REOP_not_word_boundary:
2217 {
2218 BOOL v1, v2;
2219 /* char before */
2220 if (cptr == s->cbuf) {
2221 v1 = FALSE;
2222 } else {
2223 PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
2224 v1 = is_word_char(c);
2225 }
2226 /* current char */
2227 if (cptr >= cbuf_end) {
2228 v2 = FALSE;
2229 } else {
2230 PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
2231 v2 = is_word_char(c);
2232 }
2233 if (v1 ^ v2 ^ (REOP_not_word_boundary - opcode))
2234 goto no_match;
2235 }
2236 break;
2237 case REOP_back_reference:
2238 case REOP_backward_back_reference:
2239 {
2240 const uint8_t *cptr1, *cptr1_end, *cptr1_start;
2241 uint32_t c1, c2;
2242
2243 val = *pc++;
2244 if (val >= s->capture_count)
2245 goto no_match;
2246 cptr1_start = capture[2 * val];
2247 cptr1_end = capture[2 * val + 1];
2248 if (!cptr1_start || !cptr1_end)
2249 break;
2250 if (opcode == REOP_back_reference) {
2251 cptr1 = cptr1_start;
2252 while (cptr1 < cptr1_end) {
2253 if (cptr >= cbuf_end)
2254 goto no_match;
2255 GET_CHAR(c1, cptr1, cptr1_end, cbuf_type);
2256 GET_CHAR(c2, cptr, cbuf_end, cbuf_type);
2257 if (s->ignore_case) {
2258 c1 = lre_canonicalize(c1, s->is_unicode);
2259 c2 = lre_canonicalize(c2, s->is_unicode);
2260 }
2261 if (c1 != c2)
2262 goto no_match;
2263 }
2264 } else {
2265 cptr1 = cptr1_end;
2266 while (cptr1 > cptr1_start) {
2267 if (cptr == s->cbuf)
2268 goto no_match;
2269 GET_PREV_CHAR(c1, cptr1, cptr1_start, cbuf_type);
2270 GET_PREV_CHAR(c2, cptr, s->cbuf, cbuf_type);
2271 if (s->ignore_case) {
2272 c1 = lre_canonicalize(c1, s->is_unicode);
2273 c2 = lre_canonicalize(c2, s->is_unicode);
2274 }
2275 if (c1 != c2)
2276 goto no_match;
2277 }
2278 }
2279 }
2280 break;
2281 case REOP_range:
2282 {
2283 int n;
2284 uint32_t low, high, idx_min, idx_max, idx;
2285
2286 n = get_u16(pc); /* n must be >= 1 */
2287 pc += 2;
2288 if (cptr >= cbuf_end)
2289 goto no_match;
2290 GET_CHAR(c, cptr, cbuf_end, cbuf_type);
2291 if (s->ignore_case) {
2292 c = lre_canonicalize(c, s->is_unicode);
2293 }
2294 idx_min = 0;
2295 low = get_u16(pc + 0 * 4);
2296 if (c < low)
2297 goto no_match;
2298 idx_max = n - 1;
2299 high = get_u16(pc + idx_max * 4 + 2);
2300 /* 0xffff in for last value means +infinity */
2301 if (unlikely(c >= 0xffff) && high == 0xffff)
2302 goto range_match;
2303 if (c > high)
2304 goto no_match;
2305 while (idx_min <= idx_max) {
2306 idx = (idx_min + idx_max) / 2;
2307 low = get_u16(pc + idx * 4);
2308 high = get_u16(pc + idx * 4 + 2);
2309 if (c < low)
2310 idx_max = idx - 1;
2311 else if (c > high)
2312 idx_min = idx + 1;
2313 else
2314 goto range_match;
2315 }
2316 goto no_match;
2317 range_match:
2318 pc += 4 * n;
2319 }
2320 break;
2321 case REOP_range32:
2322 {
2323 int n;
2324 uint32_t low, high, idx_min, idx_max, idx;
2325
2326 n = get_u16(pc); /* n must be >= 1 */
2327 pc += 2;
2328 if (cptr >= cbuf_end)
2329 goto no_match;
2330 GET_CHAR(c, cptr, cbuf_end, cbuf_type);
2331 if (s->ignore_case) {
2332 c = lre_canonicalize(c, s->is_unicode);
2333 }
2334 idx_min = 0;
2335 low = get_u32(pc + 0 * 8);
2336 if (c < low)
2337 goto no_match;
2338 idx_max = n - 1;
2339 high = get_u32(pc + idx_max * 8 + 4);
2340 if (c > high)
2341 goto no_match;
2342 while (idx_min <= idx_max) {
2343 idx = (idx_min + idx_max) / 2;
2344 low = get_u32(pc + idx * 8);
2345 high = get_u32(pc + idx * 8 + 4);
2346 if (c < low)
2347 idx_max = idx - 1;
2348 else if (c > high)
2349 idx_min = idx + 1;
2350 else
2351 goto range32_match;
2352 }
2353 goto no_match;
2354 range32_match:
2355 pc += 8 * n;
2356 }
2357 break;
2358 case REOP_prev:
2359 /* go to the previous char */
2360 if (cptr == s->cbuf)
2361 goto no_match;
2362 PREV_CHAR(cptr, s->cbuf, cbuf_type);
2363 break;
2364 case REOP_simple_greedy_quant:
2365 {
2366 uint32_t next_pos, quant_min, quant_max;
2367 size_t q;
2368 intptr_t res;
2369 const uint8_t *pc1;
2370
2371 next_pos = get_u32(pc);
2372 quant_min = get_u32(pc + 4);
2373 quant_max = get_u32(pc + 8);
2374 pc += 16;
2375 pc1 = pc;
2376 pc += (int)next_pos;
2377
2378 q = 0;
2379 for(;;) {
2380 if (lre_poll_timeout(s))
2381 return LRE_RET_TIMEOUT;
2382 res = lre_exec_backtrack(s, capture, stack, stack_len,
2383 pc1, cptr, TRUE);
2384 if (res == LRE_RET_MEMORY_ERROR ||
2385 res == LRE_RET_TIMEOUT)
2386 return res;
2387 if (!res)
2388 break;
2389 cptr = (uint8_t *)res;
2390 q++;
2391 if (q >= quant_max && quant_max != INT32_MAX)
2392 break;
2393 }
2394 if (q < quant_min)
2395 goto no_match;
2396 if (q > quant_min) {
2397 /* will examine all matches down to quant_min */
2398 ret = push_state(s, capture, stack, stack_len,
2399 pc1 - 16, cptr,
2400 RE_EXEC_STATE_GREEDY_QUANT,
2401 q - quant_min);
2402 if (ret < 0)
2403 return LRE_RET_MEMORY_ERROR;
2404 }
2405 }
2406 break;
2407 default:
2408 abort();
2409 }
2410 }
2411}
2412
2413/* Return 1 if match, 0 if not match or < 0 if error (see LRE_RET_x). cindex is the
2414 starting position of the match and must be such as 0 <= cindex <=
2415 clen. */
2416int lre_exec(uint8_t **capture,
2417 const uint8_t *bc_buf, const uint8_t *cbuf, int cindex, int clen,
2418 int cbuf_type, void *opaque)
2419{
2420 REExecContext s_s, *s = &s_s;
2421 int re_flags, i, alloca_size, ret;
2422 StackInt *stack_buf;
2423
2424 re_flags = lre_get_flags(bc_buf);
2425 s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0;
2426 s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0;
2427 s->is_unicode = (re_flags & LRE_FLAG_UNICODE) != 0;
2428 s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT];
2429 s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE];
2430 s->cbuf = cbuf;
2431 s->cbuf_end = cbuf + (clen << cbuf_type);
2432 s->cbuf_type = cbuf_type;
2433 if (s->cbuf_type == 1 && s->is_unicode)
2434 s->cbuf_type = 2;
2435 s->interrupt_counter = INTERRUPT_COUNTER_INIT;
2436 s->opaque = opaque;
2437
2438 s->state_size = sizeof(REExecState) +
2439 s->capture_count * sizeof(capture[0]) * 2 +
2440 s->stack_size_max * sizeof(stack_buf[0]);
2441 s->state_stack = NULL;
2442 s->state_stack_len = 0;
2443 s->state_stack_size = 0;
2444
2445 for(i = 0; i < s->capture_count * 2; i++)
2446 capture[i] = NULL;
2447 alloca_size = s->stack_size_max * sizeof(stack_buf[0]);
2448 stack_buf = alloca(alloca_size);
2449 ret = lre_exec_backtrack(s, capture, stack_buf, 0, bc_buf + RE_HEADER_LEN,
2450 cbuf + (cindex << cbuf_type), FALSE);
2451 lre_realloc(s->opaque, s->state_stack, 0);
2452 return ret;
2453}
2454
2455int lre_get_capture_count(const uint8_t *bc_buf)
2456{
2457 return bc_buf[RE_HEADER_CAPTURE_COUNT];
2458}
2459
2460int lre_get_flags(const uint8_t *bc_buf)
2461{
2462 return bc_buf[RE_HEADER_FLAGS];
2463}
2464
2465/* Return NULL if no group names. Otherwise, return a pointer to
2466 'capture_count - 1' zero terminated UTF-8 strings. */
2467const char *lre_get_groupnames(const uint8_t *bc_buf)
2468{
2469 uint32_t re_bytecode_len;
2470 if ((lre_get_flags(bc_buf) & LRE_FLAG_NAMED_GROUPS) == 0)
2471 return NULL;
2472 re_bytecode_len = get_u32(bc_buf + RE_HEADER_BYTECODE_LEN);
2473 return (const char *)(bc_buf + RE_HEADER_LEN + re_bytecode_len);
2474}
2475
2476#ifdef TEST
2477
2478BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size)
2479{
2480 return FALSE;
2481}
2482
2483void *lre_realloc(void *opaque, void *ptr, size_t size)
2484{
2485 return realloc(ptr, size);
2486}
2487
2488int main(int argc, char **argv)
2489{
2490 int len, flags, ret, i;
2491 uint8_t *bc;
2492 char error_msg[64];
2493 uint8_t *capture[CAPTURE_COUNT_MAX * 2];
2494 const char *input;
2495 int input_len, capture_count;
2496
2497 if (argc < 4) {
2498 printf("usage: %s regexp flags input\n", argv[0]);
2499 return 1;
2500 }
2501 flags = atoi(argv[2]);
2502 bc = lre_compile(&len, error_msg, sizeof(error_msg), argv[1],
2503 strlen(argv[1]), flags, NULL);
2504 if (!bc) {
2505 fprintf(stderr, "error: %s\n", error_msg);
2506 exit(1);
2507 }
2508
2509 input = argv[3];
2510 input_len = strlen(input);
2511
2512 ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL);
2513 printf("ret=%d\n", ret);
2514 if (ret == 1) {
2515 capture_count = lre_get_capture_count(bc);
2516 for(i = 0; i < 2 * capture_count; i++) {
2517 uint8_t *ptr;
2518 ptr = capture[i];
2519 printf("%d: ", i);
2520 if (!ptr)
2521 printf("<nil>");
2522 else
2523 printf("%u", (int)(ptr - (uint8_t *)input));
2524 printf("\n");
2525 }
2526 }
2527 return 0;
2528}
2529#endif
2530