1 | /* |
2 | * Regular Expression Engine |
3 | * |
4 | * Copyright (c) 2017-2018 Fabrice Bellard |
5 | * |
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
7 | * of this software and associated documentation files (the "Software"), to deal |
8 | * in the Software without restriction, including without limitation the rights |
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
10 | * copies of the Software, and to permit persons to whom the Software is |
11 | * furnished to do so, subject to the following conditions: |
12 | * |
13 | * The above copyright notice and this permission notice shall be included in |
14 | * all copies or substantial portions of the Software. |
15 | * |
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
19 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
22 | * THE SOFTWARE. |
23 | */ |
24 | #include <stdlib.h> |
25 | #include <stdio.h> |
26 | #include <stdarg.h> |
27 | #include <inttypes.h> |
28 | #include <string.h> |
29 | #include <assert.h> |
30 | |
31 | #include "cutils.h" |
32 | #include "libregexp.h" |
33 | #include "libunicode.h" |
34 | |
35 | /* |
36 | TODO: |
37 | |
38 | - Add a lock step execution mode (=linear time execution guaranteed) |
39 | when the regular expression is "simple" i.e. no backreference nor |
40 | complicated lookahead. The opcodes are designed for this execution |
41 | model. |
42 | */ |
43 | |
44 | #if defined(TEST) |
45 | #define DUMP_REOP |
46 | #endif |
47 | |
48 | typedef enum { |
49 | #define DEF(id, size) REOP_ ## id, |
50 | #include "libregexp-opcode.h" |
51 | #undef DEF |
52 | REOP_COUNT, |
53 | } REOPCodeEnum; |
54 | |
55 | #define CAPTURE_COUNT_MAX 255 |
56 | #define STACK_SIZE_MAX 255 |
57 | /* must be large enough to have a negligible runtime cost and small |
58 | enough to call the interrupt callback often. */ |
59 | #define INTERRUPT_COUNTER_INIT 10000 |
60 | |
61 | /* unicode code points */ |
62 | #define CP_LS 0x2028 |
63 | #define CP_PS 0x2029 |
64 | |
65 | #define TMP_BUF_SIZE 128 |
66 | |
67 | typedef struct { |
68 | DynBuf byte_code; |
69 | const uint8_t *buf_ptr; |
70 | const uint8_t *buf_end; |
71 | const uint8_t *buf_start; |
72 | int re_flags; |
73 | BOOL is_unicode; |
74 | BOOL ignore_case; |
75 | BOOL dotall; |
76 | int capture_count; |
77 | int total_capture_count; /* -1 = not computed yet */ |
78 | int has_named_captures; /* -1 = don't know, 0 = no, 1 = yes */ |
79 | void *opaque; |
80 | DynBuf group_names; |
81 | union { |
82 | char error_msg[TMP_BUF_SIZE]; |
83 | char tmp_buf[TMP_BUF_SIZE]; |
84 | } u; |
85 | } REParseState; |
86 | |
87 | typedef struct { |
88 | #ifdef DUMP_REOP |
89 | const char *name; |
90 | #endif |
91 | uint8_t size; |
92 | } REOpCode; |
93 | |
94 | static const REOpCode reopcode_info[REOP_COUNT] = { |
95 | #ifdef DUMP_REOP |
96 | #define DEF(id, size) { #id, size }, |
97 | #else |
98 | #define DEF(id, size) { size }, |
99 | #endif |
100 | #include "libregexp-opcode.h" |
101 | #undef DEF |
102 | }; |
103 | |
104 | #define 0 |
105 | #define 1 |
106 | #define 2 |
107 | #define 3 |
108 | |
109 | #define 7 |
110 | |
111 | static inline int is_digit(int c) { |
112 | return c >= '0' && c <= '9'; |
113 | } |
114 | |
115 | /* insert 'len' bytes at position 'pos'. Return < 0 if error. */ |
116 | static int dbuf_insert(DynBuf *s, int pos, int len) |
117 | { |
118 | if (dbuf_realloc(s, s->size + len)) |
119 | return -1; |
120 | memmove(s->buf + pos + len, s->buf + pos, s->size - pos); |
121 | s->size += len; |
122 | return 0; |
123 | } |
124 | |
125 | static const uint16_t char_range_d[] = { |
126 | 1, |
127 | 0x0030, 0x0039 + 1, |
128 | }; |
129 | |
130 | /* code point ranges for Zs,Zl or Zp property */ |
131 | static const uint16_t char_range_s[] = { |
132 | 10, |
133 | 0x0009, 0x000D + 1, |
134 | 0x0020, 0x0020 + 1, |
135 | 0x00A0, 0x00A0 + 1, |
136 | 0x1680, 0x1680 + 1, |
137 | 0x2000, 0x200A + 1, |
138 | /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */ |
139 | /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */ |
140 | 0x2028, 0x2029 + 1, |
141 | 0x202F, 0x202F + 1, |
142 | 0x205F, 0x205F + 1, |
143 | 0x3000, 0x3000 + 1, |
144 | /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */ |
145 | 0xFEFF, 0xFEFF + 1, |
146 | }; |
147 | |
148 | static const uint16_t char_range_w[] = { |
149 | 4, |
150 | 0x0030, 0x0039 + 1, |
151 | 0x0041, 0x005A + 1, |
152 | 0x005F, 0x005F + 1, |
153 | 0x0061, 0x007A + 1, |
154 | }; |
155 | |
156 | #define CLASS_RANGE_BASE 0x40000000 |
157 | |
158 | typedef enum { |
159 | CHAR_RANGE_d, |
160 | CHAR_RANGE_D, |
161 | CHAR_RANGE_s, |
162 | CHAR_RANGE_S, |
163 | CHAR_RANGE_w, |
164 | CHAR_RANGE_W, |
165 | } CharRangeEnum; |
166 | |
167 | static const uint16_t * const char_range_table[] = { |
168 | char_range_d, |
169 | char_range_s, |
170 | char_range_w, |
171 | }; |
172 | |
173 | static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c) |
174 | { |
175 | BOOL invert; |
176 | const uint16_t *c_pt; |
177 | int len, i; |
178 | |
179 | invert = c & 1; |
180 | c_pt = char_range_table[c >> 1]; |
181 | len = *c_pt++; |
182 | cr_init(cr, s->opaque, lre_realloc); |
183 | for(i = 0; i < len * 2; i++) { |
184 | if (cr_add_point(cr, c_pt[i])) |
185 | goto fail; |
186 | } |
187 | if (invert) { |
188 | if (cr_invert(cr)) |
189 | goto fail; |
190 | } |
191 | return 0; |
192 | fail: |
193 | cr_free(cr); |
194 | return -1; |
195 | } |
196 | |
197 | #ifdef DUMP_REOP |
198 | static __maybe_unused void lre_dump_bytecode(const uint8_t *buf, |
199 | int buf_len) |
200 | { |
201 | int pos, len, opcode, bc_len, re_flags, i; |
202 | uint32_t val; |
203 | |
204 | assert(buf_len >= RE_HEADER_LEN); |
205 | |
206 | re_flags = lre_get_flags(buf); |
207 | bc_len = get_u32(buf + RE_HEADER_BYTECODE_LEN); |
208 | assert(bc_len + RE_HEADER_LEN <= buf_len); |
209 | printf("flags: 0x%x capture_count=%d stack_size=%d\n" , |
210 | re_flags, buf[RE_HEADER_CAPTURE_COUNT], buf[RE_HEADER_STACK_SIZE]); |
211 | if (re_flags & LRE_FLAG_NAMED_GROUPS) { |
212 | const char *p; |
213 | p = (char *)buf + RE_HEADER_LEN + bc_len; |
214 | printf("named groups: " ); |
215 | for(i = 1; i < buf[RE_HEADER_CAPTURE_COUNT]; i++) { |
216 | if (i != 1) |
217 | printf("," ); |
218 | printf("<%s>" , p); |
219 | p += strlen(p) + 1; |
220 | } |
221 | printf("\n" ); |
222 | assert(p == (char *)(buf + buf_len)); |
223 | } |
224 | printf("bytecode_len=%d\n" , bc_len); |
225 | |
226 | buf += RE_HEADER_LEN; |
227 | pos = 0; |
228 | while (pos < bc_len) { |
229 | printf("%5u: " , pos); |
230 | opcode = buf[pos]; |
231 | len = reopcode_info[opcode].size; |
232 | if (opcode >= REOP_COUNT) { |
233 | printf(" invalid opcode=0x%02x\n" , opcode); |
234 | break; |
235 | } |
236 | if ((pos + len) > bc_len) { |
237 | printf(" buffer overflow (opcode=0x%02x)\n" , opcode); |
238 | break; |
239 | } |
240 | printf("%s" , reopcode_info[opcode].name); |
241 | switch(opcode) { |
242 | case REOP_char: |
243 | val = get_u16(buf + pos + 1); |
244 | if (val >= ' ' && val <= 126) |
245 | printf(" '%c'" , val); |
246 | else |
247 | printf(" 0x%04x" , val); |
248 | break; |
249 | case REOP_char32: |
250 | val = get_u32(buf + pos + 1); |
251 | if (val >= ' ' && val <= 126) |
252 | printf(" '%c'" , val); |
253 | else |
254 | printf(" 0x%08x" , val); |
255 | break; |
256 | case REOP_goto: |
257 | case REOP_split_goto_first: |
258 | case REOP_split_next_first: |
259 | case REOP_loop: |
260 | case REOP_lookahead: |
261 | case REOP_negative_lookahead: |
262 | val = get_u32(buf + pos + 1); |
263 | val += (pos + 5); |
264 | printf(" %u" , val); |
265 | break; |
266 | case REOP_simple_greedy_quant: |
267 | printf(" %u %u %u %u" , |
268 | get_u32(buf + pos + 1) + (pos + 17), |
269 | get_u32(buf + pos + 1 + 4), |
270 | get_u32(buf + pos + 1 + 8), |
271 | get_u32(buf + pos + 1 + 12)); |
272 | break; |
273 | case REOP_save_start: |
274 | case REOP_save_end: |
275 | case REOP_back_reference: |
276 | case REOP_backward_back_reference: |
277 | printf(" %u" , buf[pos + 1]); |
278 | break; |
279 | case REOP_save_reset: |
280 | printf(" %u %u" , buf[pos + 1], buf[pos + 2]); |
281 | break; |
282 | case REOP_push_i32: |
283 | val = get_u32(buf + pos + 1); |
284 | printf(" %d" , val); |
285 | break; |
286 | case REOP_range: |
287 | { |
288 | int n, i; |
289 | n = get_u16(buf + pos + 1); |
290 | len += n * 4; |
291 | for(i = 0; i < n * 2; i++) { |
292 | val = get_u16(buf + pos + 3 + i * 2); |
293 | printf(" 0x%04x" , val); |
294 | } |
295 | } |
296 | break; |
297 | case REOP_range32: |
298 | { |
299 | int n, i; |
300 | n = get_u16(buf + pos + 1); |
301 | len += n * 8; |
302 | for(i = 0; i < n * 2; i++) { |
303 | val = get_u32(buf + pos + 3 + i * 4); |
304 | printf(" 0x%08x" , val); |
305 | } |
306 | } |
307 | break; |
308 | default: |
309 | break; |
310 | } |
311 | printf("\n" ); |
312 | pos += len; |
313 | } |
314 | } |
315 | #endif |
316 | |
317 | static void re_emit_op(REParseState *s, int op) |
318 | { |
319 | dbuf_putc(&s->byte_code, op); |
320 | } |
321 | |
322 | /* return the offset of the u32 value */ |
323 | static int re_emit_op_u32(REParseState *s, int op, uint32_t val) |
324 | { |
325 | int pos; |
326 | dbuf_putc(&s->byte_code, op); |
327 | pos = s->byte_code.size; |
328 | dbuf_put_u32(&s->byte_code, val); |
329 | return pos; |
330 | } |
331 | |
332 | static int re_emit_goto(REParseState *s, int op, uint32_t val) |
333 | { |
334 | int pos; |
335 | dbuf_putc(&s->byte_code, op); |
336 | pos = s->byte_code.size; |
337 | dbuf_put_u32(&s->byte_code, val - (pos + 4)); |
338 | return pos; |
339 | } |
340 | |
341 | static void re_emit_op_u8(REParseState *s, int op, uint32_t val) |
342 | { |
343 | dbuf_putc(&s->byte_code, op); |
344 | dbuf_putc(&s->byte_code, val); |
345 | } |
346 | |
347 | static void re_emit_op_u16(REParseState *s, int op, uint32_t val) |
348 | { |
349 | dbuf_putc(&s->byte_code, op); |
350 | dbuf_put_u16(&s->byte_code, val); |
351 | } |
352 | |
353 | static int __attribute__((format(printf, 2, 3))) re_parse_error(REParseState *s, const char *fmt, ...) |
354 | { |
355 | va_list ap; |
356 | va_start(ap, fmt); |
357 | vsnprintf(s->u.error_msg, sizeof(s->u.error_msg), fmt, ap); |
358 | va_end(ap); |
359 | return -1; |
360 | } |
361 | |
362 | static int re_parse_out_of_memory(REParseState *s) |
363 | { |
364 | return re_parse_error(s, "out of memory" ); |
365 | } |
366 | |
367 | /* If allow_overflow is false, return -1 in case of |
368 | overflow. Otherwise return INT32_MAX. */ |
369 | static int parse_digits(const uint8_t **pp, BOOL allow_overflow) |
370 | { |
371 | const uint8_t *p; |
372 | uint64_t v; |
373 | int c; |
374 | |
375 | p = *pp; |
376 | v = 0; |
377 | for(;;) { |
378 | c = *p; |
379 | if (c < '0' || c > '9') |
380 | break; |
381 | v = v * 10 + c - '0'; |
382 | if (v >= INT32_MAX) { |
383 | if (allow_overflow) |
384 | v = INT32_MAX; |
385 | else |
386 | return -1; |
387 | } |
388 | p++; |
389 | } |
390 | *pp = p; |
391 | return v; |
392 | } |
393 | |
394 | static int re_parse_expect(REParseState *s, const uint8_t **pp, int c) |
395 | { |
396 | const uint8_t *p; |
397 | p = *pp; |
398 | if (*p != c) |
399 | return re_parse_error(s, "expecting '%c'" , c); |
400 | p++; |
401 | *pp = p; |
402 | return 0; |
403 | } |
404 | |
405 | /* Parse an escape sequence, *pp points after the '\': |
406 | allow_utf16 value: |
407 | 0 : no UTF-16 escapes allowed |
408 | 1 : UTF-16 escapes allowed |
409 | 2 : UTF-16 escapes allowed and escapes of surrogate pairs are |
410 | converted to a unicode character (unicode regexp case). |
411 | |
412 | Return the unicode char and update *pp if recognized, |
413 | return -1 if malformed escape, |
414 | return -2 otherwise. */ |
415 | int lre_parse_escape(const uint8_t **pp, int allow_utf16) |
416 | { |
417 | const uint8_t *p; |
418 | uint32_t c; |
419 | |
420 | p = *pp; |
421 | c = *p++; |
422 | switch(c) { |
423 | case 'b': |
424 | c = '\b'; |
425 | break; |
426 | case 'f': |
427 | c = '\f'; |
428 | break; |
429 | case 'n': |
430 | c = '\n'; |
431 | break; |
432 | case 'r': |
433 | c = '\r'; |
434 | break; |
435 | case 't': |
436 | c = '\t'; |
437 | break; |
438 | case 'v': |
439 | c = '\v'; |
440 | break; |
441 | case 'x': |
442 | case 'u': |
443 | { |
444 | int h, n, i; |
445 | uint32_t c1; |
446 | |
447 | if (*p == '{' && allow_utf16) { |
448 | p++; |
449 | c = 0; |
450 | for(;;) { |
451 | h = from_hex(*p++); |
452 | if (h < 0) |
453 | return -1; |
454 | c = (c << 4) | h; |
455 | if (c > 0x10FFFF) |
456 | return -1; |
457 | if (*p == '}') |
458 | break; |
459 | } |
460 | p++; |
461 | } else { |
462 | if (c == 'x') { |
463 | n = 2; |
464 | } else { |
465 | n = 4; |
466 | } |
467 | |
468 | c = 0; |
469 | for(i = 0; i < n; i++) { |
470 | h = from_hex(*p++); |
471 | if (h < 0) { |
472 | return -1; |
473 | } |
474 | c = (c << 4) | h; |
475 | } |
476 | if (is_hi_surrogate(c) && |
477 | allow_utf16 == 2 && p[0] == '\\' && p[1] == 'u') { |
478 | /* convert an escaped surrogate pair into a |
479 | unicode char */ |
480 | c1 = 0; |
481 | for(i = 0; i < 4; i++) { |
482 | h = from_hex(p[2 + i]); |
483 | if (h < 0) |
484 | break; |
485 | c1 = (c1 << 4) | h; |
486 | } |
487 | if (i == 4 && is_lo_surrogate(c1)) { |
488 | p += 6; |
489 | c = from_surrogate(c, c1); |
490 | } |
491 | } |
492 | } |
493 | } |
494 | break; |
495 | case '0': case '1': case '2': case '3': |
496 | case '4': case '5': case '6': case '7': |
497 | c -= '0'; |
498 | if (allow_utf16 == 2) { |
499 | /* only accept \0 not followed by digit */ |
500 | if (c != 0 || is_digit(*p)) |
501 | return -1; |
502 | } else { |
503 | /* parse a legacy octal sequence */ |
504 | uint32_t v; |
505 | v = *p - '0'; |
506 | if (v > 7) |
507 | break; |
508 | c = (c << 3) | v; |
509 | p++; |
510 | if (c >= 32) |
511 | break; |
512 | v = *p - '0'; |
513 | if (v > 7) |
514 | break; |
515 | c = (c << 3) | v; |
516 | p++; |
517 | } |
518 | break; |
519 | default: |
520 | return -2; |
521 | } |
522 | *pp = p; |
523 | return c; |
524 | } |
525 | |
526 | #ifdef CONFIG_ALL_UNICODE |
527 | /* XXX: we use the same chars for name and value */ |
528 | static BOOL is_unicode_char(int c) |
529 | { |
530 | return ((c >= '0' && c <= '9') || |
531 | (c >= 'A' && c <= 'Z') || |
532 | (c >= 'a' && c <= 'z') || |
533 | (c == '_')); |
534 | } |
535 | |
536 | static int parse_unicode_property(REParseState *s, CharRange *cr, |
537 | const uint8_t **pp, BOOL is_inv) |
538 | { |
539 | const uint8_t *p; |
540 | char name[64], value[64]; |
541 | char *q; |
542 | BOOL script_ext; |
543 | int ret; |
544 | |
545 | p = *pp; |
546 | if (*p != '{') |
547 | return re_parse_error(s, "expecting '{' after \\p" ); |
548 | p++; |
549 | q = name; |
550 | while (is_unicode_char(*p)) { |
551 | if ((q - name) >= sizeof(name) - 1) |
552 | goto unknown_property_name; |
553 | *q++ = *p++; |
554 | } |
555 | *q = '\0'; |
556 | q = value; |
557 | if (*p == '=') { |
558 | p++; |
559 | while (is_unicode_char(*p)) { |
560 | if ((q - value) >= sizeof(value) - 1) |
561 | return re_parse_error(s, "unknown unicode property value" ); |
562 | *q++ = *p++; |
563 | } |
564 | } |
565 | *q = '\0'; |
566 | if (*p != '}') |
567 | return re_parse_error(s, "expecting '}'" ); |
568 | p++; |
569 | // printf("name=%s value=%s\n", name, value); |
570 | |
571 | if (!strcmp(name, "Script" ) || !strcmp(name, "sc" )) { |
572 | script_ext = FALSE; |
573 | goto do_script; |
574 | } else if (!strcmp(name, "Script_Extensions" ) || !strcmp(name, "scx" )) { |
575 | script_ext = TRUE; |
576 | do_script: |
577 | cr_init(cr, s->opaque, lre_realloc); |
578 | ret = unicode_script(cr, value, script_ext); |
579 | if (ret) { |
580 | cr_free(cr); |
581 | if (ret == -2) |
582 | return re_parse_error(s, "unknown unicode script" ); |
583 | else |
584 | goto out_of_memory; |
585 | } |
586 | } else if (!strcmp(name, "General_Category" ) || !strcmp(name, "gc" )) { |
587 | cr_init(cr, s->opaque, lre_realloc); |
588 | ret = unicode_general_category(cr, value); |
589 | if (ret) { |
590 | cr_free(cr); |
591 | if (ret == -2) |
592 | return re_parse_error(s, "unknown unicode general category" ); |
593 | else |
594 | goto out_of_memory; |
595 | } |
596 | } else if (value[0] == '\0') { |
597 | cr_init(cr, s->opaque, lre_realloc); |
598 | ret = unicode_general_category(cr, name); |
599 | if (ret == -1) { |
600 | cr_free(cr); |
601 | goto out_of_memory; |
602 | } |
603 | if (ret < 0) { |
604 | ret = unicode_prop(cr, name); |
605 | if (ret) { |
606 | cr_free(cr); |
607 | if (ret == -2) |
608 | goto unknown_property_name; |
609 | else |
610 | goto out_of_memory; |
611 | } |
612 | } |
613 | } else { |
614 | unknown_property_name: |
615 | return re_parse_error(s, "unknown unicode property name" ); |
616 | } |
617 | |
618 | if (is_inv) { |
619 | if (cr_invert(cr)) { |
620 | cr_free(cr); |
621 | return -1; |
622 | } |
623 | } |
624 | *pp = p; |
625 | return 0; |
626 | out_of_memory: |
627 | return re_parse_out_of_memory(s); |
628 | } |
629 | #endif /* CONFIG_ALL_UNICODE */ |
630 | |
631 | /* return -1 if error otherwise the character or a class range |
632 | (CLASS_RANGE_BASE). In case of class range, 'cr' is |
633 | initialized. Otherwise, it is ignored. */ |
634 | static int get_class_atom(REParseState *s, CharRange *cr, |
635 | const uint8_t **pp, BOOL inclass) |
636 | { |
637 | const uint8_t *p; |
638 | uint32_t c; |
639 | int ret; |
640 | |
641 | p = *pp; |
642 | |
643 | c = *p; |
644 | switch(c) { |
645 | case '\\': |
646 | p++; |
647 | if (p >= s->buf_end) |
648 | goto unexpected_end; |
649 | c = *p++; |
650 | switch(c) { |
651 | case 'd': |
652 | c = CHAR_RANGE_d; |
653 | goto class_range; |
654 | case 'D': |
655 | c = CHAR_RANGE_D; |
656 | goto class_range; |
657 | case 's': |
658 | c = CHAR_RANGE_s; |
659 | goto class_range; |
660 | case 'S': |
661 | c = CHAR_RANGE_S; |
662 | goto class_range; |
663 | case 'w': |
664 | c = CHAR_RANGE_w; |
665 | goto class_range; |
666 | case 'W': |
667 | c = CHAR_RANGE_W; |
668 | class_range: |
669 | if (cr_init_char_range(s, cr, c)) |
670 | return -1; |
671 | c = CLASS_RANGE_BASE; |
672 | break; |
673 | case 'c': |
674 | c = *p; |
675 | if ((c >= 'a' && c <= 'z') || |
676 | (c >= 'A' && c <= 'Z') || |
677 | (((c >= '0' && c <= '9') || c == '_') && |
678 | inclass && !s->is_unicode)) { /* Annex B.1.4 */ |
679 | c &= 0x1f; |
680 | p++; |
681 | } else if (s->is_unicode) { |
682 | goto invalid_escape; |
683 | } else { |
684 | /* otherwise return '\' and 'c' */ |
685 | p--; |
686 | c = '\\'; |
687 | } |
688 | break; |
689 | case '-': |
690 | if (!inclass && s->is_unicode) |
691 | goto invalid_escape; |
692 | break; |
693 | #ifdef CONFIG_ALL_UNICODE |
694 | case 'p': |
695 | case 'P': |
696 | if (s->is_unicode) { |
697 | if (parse_unicode_property(s, cr, &p, (c == 'P'))) |
698 | return -1; |
699 | c = CLASS_RANGE_BASE; |
700 | break; |
701 | } |
702 | /* fall thru */ |
703 | #endif |
704 | default: |
705 | p--; |
706 | ret = lre_parse_escape(&p, s->is_unicode * 2); |
707 | if (ret >= 0) { |
708 | c = ret; |
709 | } else { |
710 | if (ret == -2 && *p != '\0' && strchr("^$\\.*+?()[]{}|/" , *p)) { |
711 | /* always valid to escape these characters */ |
712 | goto normal_char; |
713 | } else if (s->is_unicode) { |
714 | invalid_escape: |
715 | return re_parse_error(s, "invalid escape sequence in regular expression" ); |
716 | } else { |
717 | /* just ignore the '\' */ |
718 | goto normal_char; |
719 | } |
720 | } |
721 | break; |
722 | } |
723 | break; |
724 | case '\0': |
725 | if (p >= s->buf_end) { |
726 | unexpected_end: |
727 | return re_parse_error(s, "unexpected end" ); |
728 | } |
729 | /* fall thru */ |
730 | default: |
731 | normal_char: |
732 | /* normal char */ |
733 | if (c >= 128) { |
734 | c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); |
735 | if ((unsigned)c > 0xffff && !s->is_unicode) { |
736 | /* XXX: should handle non BMP-1 code points */ |
737 | return re_parse_error(s, "malformed unicode char" ); |
738 | } |
739 | } else { |
740 | p++; |
741 | } |
742 | break; |
743 | } |
744 | *pp = p; |
745 | return c; |
746 | } |
747 | |
748 | static int re_emit_range(REParseState *s, const CharRange *cr) |
749 | { |
750 | int len, i; |
751 | uint32_t high; |
752 | |
753 | len = (unsigned)cr->len / 2; |
754 | if (len >= 65535) |
755 | return re_parse_error(s, "too many ranges" ); |
756 | if (len == 0) { |
757 | /* not sure it can really happen. Emit a match that is always |
758 | false */ |
759 | re_emit_op_u32(s, REOP_char32, -1); |
760 | } else { |
761 | high = cr->points[cr->len - 1]; |
762 | if (high == UINT32_MAX) |
763 | high = cr->points[cr->len - 2]; |
764 | if (high <= 0xffff) { |
765 | /* can use 16 bit ranges with the conversion that 0xffff = |
766 | infinity */ |
767 | re_emit_op_u16(s, REOP_range, len); |
768 | for(i = 0; i < cr->len; i += 2) { |
769 | dbuf_put_u16(&s->byte_code, cr->points[i]); |
770 | high = cr->points[i + 1] - 1; |
771 | if (high == UINT32_MAX - 1) |
772 | high = 0xffff; |
773 | dbuf_put_u16(&s->byte_code, high); |
774 | } |
775 | } else { |
776 | re_emit_op_u16(s, REOP_range32, len); |
777 | for(i = 0; i < cr->len; i += 2) { |
778 | dbuf_put_u32(&s->byte_code, cr->points[i]); |
779 | dbuf_put_u32(&s->byte_code, cr->points[i + 1] - 1); |
780 | } |
781 | } |
782 | } |
783 | return 0; |
784 | } |
785 | |
786 | static int re_parse_char_class(REParseState *s, const uint8_t **pp) |
787 | { |
788 | const uint8_t *p; |
789 | uint32_t c1, c2; |
790 | CharRange cr_s, *cr = &cr_s; |
791 | CharRange cr1_s, *cr1 = &cr1_s; |
792 | BOOL invert; |
793 | |
794 | cr_init(cr, s->opaque, lre_realloc); |
795 | p = *pp; |
796 | p++; /* skip '[' */ |
797 | |
798 | invert = FALSE; |
799 | if (*p == '^') { |
800 | p++; |
801 | invert = TRUE; |
802 | } |
803 | |
804 | for(;;) { |
805 | if (*p == ']') |
806 | break; |
807 | c1 = get_class_atom(s, cr1, &p, TRUE); |
808 | if ((int)c1 < 0) |
809 | goto fail; |
810 | if (*p == '-' && p[1] != ']') { |
811 | const uint8_t *p0 = p + 1; |
812 | if (c1 >= CLASS_RANGE_BASE) { |
813 | if (s->is_unicode) { |
814 | cr_free(cr1); |
815 | goto invalid_class_range; |
816 | } |
817 | /* Annex B: match '-' character */ |
818 | goto class_atom; |
819 | } |
820 | c2 = get_class_atom(s, cr1, &p0, TRUE); |
821 | if ((int)c2 < 0) |
822 | goto fail; |
823 | if (c2 >= CLASS_RANGE_BASE) { |
824 | cr_free(cr1); |
825 | if (s->is_unicode) { |
826 | goto invalid_class_range; |
827 | } |
828 | /* Annex B: match '-' character */ |
829 | goto class_atom; |
830 | } |
831 | p = p0; |
832 | if (c2 < c1) { |
833 | invalid_class_range: |
834 | re_parse_error(s, "invalid class range" ); |
835 | goto fail; |
836 | } |
837 | if (cr_union_interval(cr, c1, c2)) |
838 | goto memory_error; |
839 | } else { |
840 | class_atom: |
841 | if (c1 >= CLASS_RANGE_BASE) { |
842 | int ret; |
843 | ret = cr_union1(cr, cr1->points, cr1->len); |
844 | cr_free(cr1); |
845 | if (ret) |
846 | goto memory_error; |
847 | } else { |
848 | if (cr_union_interval(cr, c1, c1)) |
849 | goto memory_error; |
850 | } |
851 | } |
852 | } |
853 | if (s->ignore_case) { |
854 | if (cr_regexp_canonicalize(cr, s->is_unicode)) |
855 | goto memory_error; |
856 | } |
857 | if (invert) { |
858 | if (cr_invert(cr)) |
859 | goto memory_error; |
860 | } |
861 | if (re_emit_range(s, cr)) |
862 | goto fail; |
863 | cr_free(cr); |
864 | p++; /* skip ']' */ |
865 | *pp = p; |
866 | return 0; |
867 | memory_error: |
868 | re_parse_out_of_memory(s); |
869 | fail: |
870 | cr_free(cr); |
871 | return -1; |
872 | } |
873 | |
874 | /* Return: |
875 | - true if the opcodes may not advance the char pointer |
876 | - false if the opcodes always advance the char pointer |
877 | */ |
878 | static BOOL re_need_check_advance(const uint8_t *bc_buf, int bc_buf_len) |
879 | { |
880 | int pos, opcode, len; |
881 | uint32_t val; |
882 | BOOL ret; |
883 | |
884 | ret = TRUE; |
885 | pos = 0; |
886 | while (pos < bc_buf_len) { |
887 | opcode = bc_buf[pos]; |
888 | len = reopcode_info[opcode].size; |
889 | switch(opcode) { |
890 | case REOP_range: |
891 | val = get_u16(bc_buf + pos + 1); |
892 | len += val * 4; |
893 | goto simple_char; |
894 | case REOP_range32: |
895 | val = get_u16(bc_buf + pos + 1); |
896 | len += val * 8; |
897 | goto simple_char; |
898 | case REOP_char: |
899 | case REOP_char32: |
900 | case REOP_dot: |
901 | case REOP_any: |
902 | simple_char: |
903 | ret = FALSE; |
904 | break; |
905 | case REOP_line_start: |
906 | case REOP_line_end: |
907 | case REOP_push_i32: |
908 | case REOP_push_char_pos: |
909 | case REOP_drop: |
910 | case REOP_word_boundary: |
911 | case REOP_not_word_boundary: |
912 | case REOP_prev: |
913 | /* no effect */ |
914 | break; |
915 | case REOP_save_start: |
916 | case REOP_save_end: |
917 | case REOP_save_reset: |
918 | case REOP_back_reference: |
919 | case REOP_backward_back_reference: |
920 | break; |
921 | default: |
922 | /* safe behavior: we cannot predict the outcome */ |
923 | return TRUE; |
924 | } |
925 | pos += len; |
926 | } |
927 | return ret; |
928 | } |
929 | |
930 | /* return -1 if a simple quantifier cannot be used. Otherwise return |
931 | the number of characters in the atom. */ |
932 | static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len) |
933 | { |
934 | int pos, opcode, len, count; |
935 | uint32_t val; |
936 | |
937 | count = 0; |
938 | pos = 0; |
939 | while (pos < bc_buf_len) { |
940 | opcode = bc_buf[pos]; |
941 | len = reopcode_info[opcode].size; |
942 | switch(opcode) { |
943 | case REOP_range: |
944 | val = get_u16(bc_buf + pos + 1); |
945 | len += val * 4; |
946 | goto simple_char; |
947 | case REOP_range32: |
948 | val = get_u16(bc_buf + pos + 1); |
949 | len += val * 8; |
950 | goto simple_char; |
951 | case REOP_char: |
952 | case REOP_char32: |
953 | case REOP_dot: |
954 | case REOP_any: |
955 | simple_char: |
956 | count++; |
957 | break; |
958 | case REOP_line_start: |
959 | case REOP_line_end: |
960 | case REOP_word_boundary: |
961 | case REOP_not_word_boundary: |
962 | break; |
963 | default: |
964 | return -1; |
965 | } |
966 | pos += len; |
967 | } |
968 | return count; |
969 | } |
970 | |
971 | /* '*pp' is the first char after '<' */ |
972 | static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp) |
973 | { |
974 | const uint8_t *p, *p1; |
975 | uint32_t c, d; |
976 | char *q; |
977 | |
978 | p = *pp; |
979 | q = buf; |
980 | for(;;) { |
981 | c = *p; |
982 | if (c == '\\') { |
983 | p++; |
984 | if (*p != 'u') |
985 | return -1; |
986 | c = lre_parse_escape(&p, 2); // accept surrogate pairs |
987 | } else if (c == '>') { |
988 | break; |
989 | } else if (c >= 128) { |
990 | c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); |
991 | if (is_hi_surrogate(c)) { |
992 | d = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1); |
993 | if (is_lo_surrogate(d)) { |
994 | c = from_surrogate(c, d); |
995 | p = p1; |
996 | } |
997 | } |
998 | } else { |
999 | p++; |
1000 | } |
1001 | if (c > 0x10FFFF) |
1002 | return -1; |
1003 | if (q == buf) { |
1004 | if (!lre_js_is_ident_first(c)) |
1005 | return -1; |
1006 | } else { |
1007 | if (!lre_js_is_ident_next(c)) |
1008 | return -1; |
1009 | } |
1010 | if ((q - buf + UTF8_CHAR_LEN_MAX + 1) > buf_size) |
1011 | return -1; |
1012 | if (c < 128) { |
1013 | *q++ = c; |
1014 | } else { |
1015 | q += unicode_to_utf8((uint8_t*)q, c); |
1016 | } |
1017 | } |
1018 | if (q == buf) |
1019 | return -1; |
1020 | *q = '\0'; |
1021 | p++; |
1022 | *pp = p; |
1023 | return 0; |
1024 | } |
1025 | |
1026 | /* if capture_name = NULL: return the number of captures + 1. |
1027 | Otherwise, return the capture index corresponding to capture_name |
1028 | or -1 if none */ |
1029 | static int re_parse_captures(REParseState *s, int *phas_named_captures, |
1030 | const char *capture_name) |
1031 | { |
1032 | const uint8_t *p; |
1033 | int capture_index; |
1034 | char name[TMP_BUF_SIZE]; |
1035 | |
1036 | capture_index = 1; |
1037 | *phas_named_captures = 0; |
1038 | for (p = s->buf_start; p < s->buf_end; p++) { |
1039 | switch (*p) { |
1040 | case '(': |
1041 | if (p[1] == '?') { |
1042 | if (p[2] == '<' && p[3] != '=' && p[3] != '!') { |
1043 | *phas_named_captures = 1; |
1044 | /* potential named capture */ |
1045 | if (capture_name) { |
1046 | p += 3; |
1047 | if (re_parse_group_name(name, sizeof(name), &p) == 0) { |
1048 | if (!strcmp(name, capture_name)) |
1049 | return capture_index; |
1050 | } |
1051 | } |
1052 | capture_index++; |
1053 | if (capture_index >= CAPTURE_COUNT_MAX) |
1054 | goto done; |
1055 | } |
1056 | } else { |
1057 | capture_index++; |
1058 | if (capture_index >= CAPTURE_COUNT_MAX) |
1059 | goto done; |
1060 | } |
1061 | break; |
1062 | case '\\': |
1063 | p++; |
1064 | break; |
1065 | case '[': |
1066 | for (p += 1 + (*p == ']'); p < s->buf_end && *p != ']'; p++) { |
1067 | if (*p == '\\') |
1068 | p++; |
1069 | } |
1070 | break; |
1071 | } |
1072 | } |
1073 | done: |
1074 | if (capture_name) |
1075 | return -1; |
1076 | else |
1077 | return capture_index; |
1078 | } |
1079 | |
1080 | static int re_count_captures(REParseState *s) |
1081 | { |
1082 | if (s->total_capture_count < 0) { |
1083 | s->total_capture_count = re_parse_captures(s, &s->has_named_captures, |
1084 | NULL); |
1085 | } |
1086 | return s->total_capture_count; |
1087 | } |
1088 | |
1089 | static BOOL re_has_named_captures(REParseState *s) |
1090 | { |
1091 | if (s->has_named_captures < 0) |
1092 | re_count_captures(s); |
1093 | return s->has_named_captures; |
1094 | } |
1095 | |
1096 | static int find_group_name(REParseState *s, const char *name) |
1097 | { |
1098 | const char *p, *buf_end; |
1099 | size_t len, name_len; |
1100 | int capture_index; |
1101 | |
1102 | p = (char *)s->group_names.buf; |
1103 | if (!p) return -1; |
1104 | buf_end = (char *)s->group_names.buf + s->group_names.size; |
1105 | name_len = strlen(name); |
1106 | capture_index = 1; |
1107 | while (p < buf_end) { |
1108 | len = strlen(p); |
1109 | if (len == name_len && memcmp(name, p, name_len) == 0) |
1110 | return capture_index; |
1111 | p += len + 1; |
1112 | capture_index++; |
1113 | } |
1114 | return -1; |
1115 | } |
1116 | |
1117 | static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir); |
1118 | |
1119 | static int re_parse_term(REParseState *s, BOOL is_backward_dir) |
1120 | { |
1121 | const uint8_t *p; |
1122 | int c, last_atom_start, quant_min, quant_max, last_capture_count; |
1123 | BOOL greedy, add_zero_advance_check, is_neg, is_backward_lookahead; |
1124 | CharRange cr_s, *cr = &cr_s; |
1125 | |
1126 | last_atom_start = -1; |
1127 | last_capture_count = 0; |
1128 | p = s->buf_ptr; |
1129 | c = *p; |
1130 | switch(c) { |
1131 | case '^': |
1132 | p++; |
1133 | re_emit_op(s, REOP_line_start); |
1134 | break; |
1135 | case '$': |
1136 | p++; |
1137 | re_emit_op(s, REOP_line_end); |
1138 | break; |
1139 | case '.': |
1140 | p++; |
1141 | last_atom_start = s->byte_code.size; |
1142 | last_capture_count = s->capture_count; |
1143 | if (is_backward_dir) |
1144 | re_emit_op(s, REOP_prev); |
1145 | re_emit_op(s, s->dotall ? REOP_any : REOP_dot); |
1146 | if (is_backward_dir) |
1147 | re_emit_op(s, REOP_prev); |
1148 | break; |
1149 | case '{': |
1150 | if (s->is_unicode) { |
1151 | return re_parse_error(s, "syntax error" ); |
1152 | } else if (!is_digit(p[1])) { |
1153 | /* Annex B: we accept '{' not followed by digits as a |
1154 | normal atom */ |
1155 | goto parse_class_atom; |
1156 | } else { |
1157 | const uint8_t *p1 = p + 1; |
1158 | /* Annex B: error if it is like a repetition count */ |
1159 | parse_digits(&p1, TRUE); |
1160 | if (*p1 == ',') { |
1161 | p1++; |
1162 | if (is_digit(*p1)) { |
1163 | parse_digits(&p1, TRUE); |
1164 | } |
1165 | } |
1166 | if (*p1 != '}') { |
1167 | goto parse_class_atom; |
1168 | } |
1169 | } |
1170 | /* fall thru */ |
1171 | case '*': |
1172 | case '+': |
1173 | case '?': |
1174 | return re_parse_error(s, "nothing to repeat" ); |
1175 | case '(': |
1176 | if (p[1] == '?') { |
1177 | if (p[2] == ':') { |
1178 | p += 3; |
1179 | last_atom_start = s->byte_code.size; |
1180 | last_capture_count = s->capture_count; |
1181 | s->buf_ptr = p; |
1182 | if (re_parse_disjunction(s, is_backward_dir)) |
1183 | return -1; |
1184 | p = s->buf_ptr; |
1185 | if (re_parse_expect(s, &p, ')')) |
1186 | return -1; |
1187 | } else if ((p[2] == '=' || p[2] == '!')) { |
1188 | is_neg = (p[2] == '!'); |
1189 | is_backward_lookahead = FALSE; |
1190 | p += 3; |
1191 | goto lookahead; |
1192 | } else if (p[2] == '<' && |
1193 | (p[3] == '=' || p[3] == '!')) { |
1194 | int pos; |
1195 | is_neg = (p[3] == '!'); |
1196 | is_backward_lookahead = TRUE; |
1197 | p += 4; |
1198 | /* lookahead */ |
1199 | lookahead: |
1200 | /* Annex B allows lookahead to be used as an atom for |
1201 | the quantifiers */ |
1202 | if (!s->is_unicode && !is_backward_lookahead) { |
1203 | last_atom_start = s->byte_code.size; |
1204 | last_capture_count = s->capture_count; |
1205 | } |
1206 | pos = re_emit_op_u32(s, REOP_lookahead + is_neg, 0); |
1207 | s->buf_ptr = p; |
1208 | if (re_parse_disjunction(s, is_backward_lookahead)) |
1209 | return -1; |
1210 | p = s->buf_ptr; |
1211 | if (re_parse_expect(s, &p, ')')) |
1212 | return -1; |
1213 | re_emit_op(s, REOP_match); |
1214 | /* jump after the 'match' after the lookahead is successful */ |
1215 | if (dbuf_error(&s->byte_code)) |
1216 | return -1; |
1217 | put_u32(s->byte_code.buf + pos, s->byte_code.size - (pos + 4)); |
1218 | } else if (p[2] == '<') { |
1219 | p += 3; |
1220 | if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf), |
1221 | &p)) { |
1222 | return re_parse_error(s, "invalid group name" ); |
1223 | } |
1224 | if (find_group_name(s, s->u.tmp_buf) > 0) { |
1225 | return re_parse_error(s, "duplicate group name" ); |
1226 | } |
1227 | /* group name with a trailing zero */ |
1228 | dbuf_put(&s->group_names, (uint8_t *)s->u.tmp_buf, |
1229 | strlen(s->u.tmp_buf) + 1); |
1230 | s->has_named_captures = 1; |
1231 | goto parse_capture; |
1232 | } else { |
1233 | return re_parse_error(s, "invalid group" ); |
1234 | } |
1235 | } else { |
1236 | int capture_index; |
1237 | p++; |
1238 | /* capture without group name */ |
1239 | dbuf_putc(&s->group_names, 0); |
1240 | parse_capture: |
1241 | if (s->capture_count >= CAPTURE_COUNT_MAX) |
1242 | return re_parse_error(s, "too many captures" ); |
1243 | last_atom_start = s->byte_code.size; |
1244 | last_capture_count = s->capture_count; |
1245 | capture_index = s->capture_count++; |
1246 | re_emit_op_u8(s, REOP_save_start + is_backward_dir, |
1247 | capture_index); |
1248 | |
1249 | s->buf_ptr = p; |
1250 | if (re_parse_disjunction(s, is_backward_dir)) |
1251 | return -1; |
1252 | p = s->buf_ptr; |
1253 | |
1254 | re_emit_op_u8(s, REOP_save_start + 1 - is_backward_dir, |
1255 | capture_index); |
1256 | |
1257 | if (re_parse_expect(s, &p, ')')) |
1258 | return -1; |
1259 | } |
1260 | break; |
1261 | case '\\': |
1262 | switch(p[1]) { |
1263 | case 'b': |
1264 | case 'B': |
1265 | re_emit_op(s, REOP_word_boundary + (p[1] != 'b')); |
1266 | p += 2; |
1267 | break; |
1268 | case 'k': |
1269 | { |
1270 | const uint8_t *p1; |
1271 | int dummy_res; |
1272 | |
1273 | p1 = p; |
1274 | if (p1[2] != '<') { |
1275 | /* annex B: we tolerate invalid group names in non |
1276 | unicode mode if there is no named capture |
1277 | definition */ |
1278 | if (s->is_unicode || re_has_named_captures(s)) |
1279 | return re_parse_error(s, "expecting group name" ); |
1280 | else |
1281 | goto parse_class_atom; |
1282 | } |
1283 | p1 += 3; |
1284 | if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf), |
1285 | &p1)) { |
1286 | if (s->is_unicode || re_has_named_captures(s)) |
1287 | return re_parse_error(s, "invalid group name" ); |
1288 | else |
1289 | goto parse_class_atom; |
1290 | } |
1291 | c = find_group_name(s, s->u.tmp_buf); |
1292 | if (c < 0) { |
1293 | /* no capture name parsed before, try to look |
1294 | after (inefficient, but hopefully not common */ |
1295 | c = re_parse_captures(s, &dummy_res, s->u.tmp_buf); |
1296 | if (c < 0) { |
1297 | if (s->is_unicode || re_has_named_captures(s)) |
1298 | return re_parse_error(s, "group name not defined" ); |
1299 | else |
1300 | goto parse_class_atom; |
1301 | } |
1302 | } |
1303 | p = p1; |
1304 | } |
1305 | goto emit_back_reference; |
1306 | case '0': |
1307 | p += 2; |
1308 | c = 0; |
1309 | if (s->is_unicode) { |
1310 | if (is_digit(*p)) { |
1311 | return re_parse_error(s, "invalid decimal escape in regular expression" ); |
1312 | } |
1313 | } else { |
1314 | /* Annex B.1.4: accept legacy octal */ |
1315 | if (*p >= '0' && *p <= '7') { |
1316 | c = *p++ - '0'; |
1317 | if (*p >= '0' && *p <= '7') { |
1318 | c = (c << 3) + *p++ - '0'; |
1319 | } |
1320 | } |
1321 | } |
1322 | goto normal_char; |
1323 | case '1': case '2': case '3': case '4': |
1324 | case '5': case '6': case '7': case '8': |
1325 | case '9': |
1326 | { |
1327 | const uint8_t *q = ++p; |
1328 | |
1329 | c = parse_digits(&p, FALSE); |
1330 | if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) { |
1331 | if (!s->is_unicode) { |
1332 | /* Annex B.1.4: accept legacy octal */ |
1333 | p = q; |
1334 | if (*p <= '7') { |
1335 | c = 0; |
1336 | if (*p <= '3') |
1337 | c = *p++ - '0'; |
1338 | if (*p >= '0' && *p <= '7') { |
1339 | c = (c << 3) + *p++ - '0'; |
1340 | if (*p >= '0' && *p <= '7') { |
1341 | c = (c << 3) + *p++ - '0'; |
1342 | } |
1343 | } |
1344 | } else { |
1345 | c = *p++; |
1346 | } |
1347 | goto normal_char; |
1348 | } |
1349 | return re_parse_error(s, "back reference out of range in regular expression" ); |
1350 | } |
1351 | emit_back_reference: |
1352 | last_atom_start = s->byte_code.size; |
1353 | last_capture_count = s->capture_count; |
1354 | re_emit_op_u8(s, REOP_back_reference + is_backward_dir, c); |
1355 | } |
1356 | break; |
1357 | default: |
1358 | goto parse_class_atom; |
1359 | } |
1360 | break; |
1361 | case '[': |
1362 | last_atom_start = s->byte_code.size; |
1363 | last_capture_count = s->capture_count; |
1364 | if (is_backward_dir) |
1365 | re_emit_op(s, REOP_prev); |
1366 | if (re_parse_char_class(s, &p)) |
1367 | return -1; |
1368 | if (is_backward_dir) |
1369 | re_emit_op(s, REOP_prev); |
1370 | break; |
1371 | case ']': |
1372 | case '}': |
1373 | if (s->is_unicode) |
1374 | return re_parse_error(s, "syntax error" ); |
1375 | goto parse_class_atom; |
1376 | default: |
1377 | parse_class_atom: |
1378 | c = get_class_atom(s, cr, &p, FALSE); |
1379 | if ((int)c < 0) |
1380 | return -1; |
1381 | normal_char: |
1382 | last_atom_start = s->byte_code.size; |
1383 | last_capture_count = s->capture_count; |
1384 | if (is_backward_dir) |
1385 | re_emit_op(s, REOP_prev); |
1386 | if (c >= CLASS_RANGE_BASE) { |
1387 | int ret; |
1388 | /* Note: canonicalization is not needed */ |
1389 | ret = re_emit_range(s, cr); |
1390 | cr_free(cr); |
1391 | if (ret) |
1392 | return -1; |
1393 | } else { |
1394 | if (s->ignore_case) |
1395 | c = lre_canonicalize(c, s->is_unicode); |
1396 | if (c <= 0xffff) |
1397 | re_emit_op_u16(s, REOP_char, c); |
1398 | else |
1399 | re_emit_op_u32(s, REOP_char32, c); |
1400 | } |
1401 | if (is_backward_dir) |
1402 | re_emit_op(s, REOP_prev); |
1403 | break; |
1404 | } |
1405 | |
1406 | /* quantifier */ |
1407 | if (last_atom_start >= 0) { |
1408 | c = *p; |
1409 | switch(c) { |
1410 | case '*': |
1411 | p++; |
1412 | quant_min = 0; |
1413 | quant_max = INT32_MAX; |
1414 | goto quantifier; |
1415 | case '+': |
1416 | p++; |
1417 | quant_min = 1; |
1418 | quant_max = INT32_MAX; |
1419 | goto quantifier; |
1420 | case '?': |
1421 | p++; |
1422 | quant_min = 0; |
1423 | quant_max = 1; |
1424 | goto quantifier; |
1425 | case '{': |
1426 | { |
1427 | const uint8_t *p1 = p; |
1428 | /* As an extension (see ES6 annex B), we accept '{' not |
1429 | followed by digits as a normal atom */ |
1430 | if (!is_digit(p[1])) { |
1431 | if (s->is_unicode) |
1432 | goto invalid_quant_count; |
1433 | break; |
1434 | } |
1435 | p++; |
1436 | quant_min = parse_digits(&p, TRUE); |
1437 | quant_max = quant_min; |
1438 | if (*p == ',') { |
1439 | p++; |
1440 | if (is_digit(*p)) { |
1441 | quant_max = parse_digits(&p, TRUE); |
1442 | if (quant_max < quant_min) { |
1443 | invalid_quant_count: |
1444 | return re_parse_error(s, "invalid repetition count" ); |
1445 | } |
1446 | } else { |
1447 | quant_max = INT32_MAX; /* infinity */ |
1448 | } |
1449 | } |
1450 | if (*p != '}' && !s->is_unicode) { |
1451 | /* Annex B: normal atom if invalid '{' syntax */ |
1452 | p = p1; |
1453 | break; |
1454 | } |
1455 | if (re_parse_expect(s, &p, '}')) |
1456 | return -1; |
1457 | } |
1458 | quantifier: |
1459 | greedy = TRUE; |
1460 | if (*p == '?') { |
1461 | p++; |
1462 | greedy = FALSE; |
1463 | } |
1464 | if (last_atom_start < 0) { |
1465 | return re_parse_error(s, "nothing to repeat" ); |
1466 | } |
1467 | if (greedy) { |
1468 | int len, pos; |
1469 | |
1470 | if (quant_max > 0) { |
1471 | /* specific optimization for simple quantifiers */ |
1472 | if (dbuf_error(&s->byte_code)) |
1473 | goto out_of_memory; |
1474 | len = re_is_simple_quantifier(s->byte_code.buf + last_atom_start, |
1475 | s->byte_code.size - last_atom_start); |
1476 | if (len > 0) { |
1477 | re_emit_op(s, REOP_match); |
1478 | |
1479 | if (dbuf_insert(&s->byte_code, last_atom_start, 17)) |
1480 | goto out_of_memory; |
1481 | pos = last_atom_start; |
1482 | s->byte_code.buf[pos++] = REOP_simple_greedy_quant; |
1483 | put_u32(&s->byte_code.buf[pos], |
1484 | s->byte_code.size - last_atom_start - 17); |
1485 | pos += 4; |
1486 | put_u32(&s->byte_code.buf[pos], quant_min); |
1487 | pos += 4; |
1488 | put_u32(&s->byte_code.buf[pos], quant_max); |
1489 | pos += 4; |
1490 | put_u32(&s->byte_code.buf[pos], len); |
1491 | pos += 4; |
1492 | goto done; |
1493 | } |
1494 | } |
1495 | |
1496 | if (dbuf_error(&s->byte_code)) |
1497 | goto out_of_memory; |
1498 | } |
1499 | /* the spec tells that if there is no advance when |
1500 | running the atom after the first quant_min times, |
1501 | then there is no match. We remove this test when we |
1502 | are sure the atom always advances the position. */ |
1503 | add_zero_advance_check = re_need_check_advance(s->byte_code.buf + last_atom_start, |
1504 | s->byte_code.size - last_atom_start); |
1505 | |
1506 | { |
1507 | int len, pos; |
1508 | len = s->byte_code.size - last_atom_start; |
1509 | if (quant_min == 0) { |
1510 | /* need to reset the capture in case the atom is |
1511 | not executed */ |
1512 | if (last_capture_count != s->capture_count) { |
1513 | if (dbuf_insert(&s->byte_code, last_atom_start, 3)) |
1514 | goto out_of_memory; |
1515 | s->byte_code.buf[last_atom_start++] = REOP_save_reset; |
1516 | s->byte_code.buf[last_atom_start++] = last_capture_count; |
1517 | s->byte_code.buf[last_atom_start++] = s->capture_count - 1; |
1518 | } |
1519 | if (quant_max == 0) { |
1520 | s->byte_code.size = last_atom_start; |
1521 | } else if (quant_max == 1 || quant_max == INT32_MAX) { |
1522 | BOOL has_goto = (quant_max == INT32_MAX); |
1523 | if (dbuf_insert(&s->byte_code, last_atom_start, 5 + add_zero_advance_check)) |
1524 | goto out_of_memory; |
1525 | s->byte_code.buf[last_atom_start] = REOP_split_goto_first + |
1526 | greedy; |
1527 | put_u32(s->byte_code.buf + last_atom_start + 1, |
1528 | len + 5 * has_goto + add_zero_advance_check * 2); |
1529 | if (add_zero_advance_check) { |
1530 | s->byte_code.buf[last_atom_start + 1 + 4] = REOP_push_char_pos; |
1531 | re_emit_op(s, REOP_check_advance); |
1532 | } |
1533 | if (has_goto) |
1534 | re_emit_goto(s, REOP_goto, last_atom_start); |
1535 | } else { |
1536 | if (dbuf_insert(&s->byte_code, last_atom_start, 10 + add_zero_advance_check)) |
1537 | goto out_of_memory; |
1538 | pos = last_atom_start; |
1539 | s->byte_code.buf[pos++] = REOP_push_i32; |
1540 | put_u32(s->byte_code.buf + pos, quant_max); |
1541 | pos += 4; |
1542 | s->byte_code.buf[pos++] = REOP_split_goto_first + greedy; |
1543 | put_u32(s->byte_code.buf + pos, len + 5 + add_zero_advance_check * 2); |
1544 | pos += 4; |
1545 | if (add_zero_advance_check) { |
1546 | s->byte_code.buf[pos++] = REOP_push_char_pos; |
1547 | re_emit_op(s, REOP_check_advance); |
1548 | } |
1549 | re_emit_goto(s, REOP_loop, last_atom_start + 5); |
1550 | re_emit_op(s, REOP_drop); |
1551 | } |
1552 | } else if (quant_min == 1 && quant_max == INT32_MAX && |
1553 | !add_zero_advance_check) { |
1554 | re_emit_goto(s, REOP_split_next_first - greedy, |
1555 | last_atom_start); |
1556 | } else { |
1557 | if (quant_min == 1) { |
1558 | /* nothing to add */ |
1559 | } else { |
1560 | if (dbuf_insert(&s->byte_code, last_atom_start, 5)) |
1561 | goto out_of_memory; |
1562 | s->byte_code.buf[last_atom_start] = REOP_push_i32; |
1563 | put_u32(s->byte_code.buf + last_atom_start + 1, |
1564 | quant_min); |
1565 | last_atom_start += 5; |
1566 | re_emit_goto(s, REOP_loop, last_atom_start); |
1567 | re_emit_op(s, REOP_drop); |
1568 | } |
1569 | if (quant_max == INT32_MAX) { |
1570 | pos = s->byte_code.size; |
1571 | re_emit_op_u32(s, REOP_split_goto_first + greedy, |
1572 | len + 5 + add_zero_advance_check * 2); |
1573 | if (add_zero_advance_check) |
1574 | re_emit_op(s, REOP_push_char_pos); |
1575 | /* copy the atom */ |
1576 | dbuf_put_self(&s->byte_code, last_atom_start, len); |
1577 | if (add_zero_advance_check) |
1578 | re_emit_op(s, REOP_check_advance); |
1579 | re_emit_goto(s, REOP_goto, pos); |
1580 | } else if (quant_max > quant_min) { |
1581 | re_emit_op_u32(s, REOP_push_i32, quant_max - quant_min); |
1582 | pos = s->byte_code.size; |
1583 | re_emit_op_u32(s, REOP_split_goto_first + greedy, |
1584 | len + 5 + add_zero_advance_check * 2); |
1585 | if (add_zero_advance_check) |
1586 | re_emit_op(s, REOP_push_char_pos); |
1587 | /* copy the atom */ |
1588 | dbuf_put_self(&s->byte_code, last_atom_start, len); |
1589 | if (add_zero_advance_check) |
1590 | re_emit_op(s, REOP_check_advance); |
1591 | re_emit_goto(s, REOP_loop, pos); |
1592 | re_emit_op(s, REOP_drop); |
1593 | } |
1594 | } |
1595 | last_atom_start = -1; |
1596 | } |
1597 | break; |
1598 | default: |
1599 | break; |
1600 | } |
1601 | } |
1602 | done: |
1603 | s->buf_ptr = p; |
1604 | return 0; |
1605 | out_of_memory: |
1606 | return re_parse_out_of_memory(s); |
1607 | } |
1608 | |
1609 | static int re_parse_alternative(REParseState *s, BOOL is_backward_dir) |
1610 | { |
1611 | const uint8_t *p; |
1612 | int ret; |
1613 | size_t start, term_start, end, term_size; |
1614 | |
1615 | start = s->byte_code.size; |
1616 | for(;;) { |
1617 | p = s->buf_ptr; |
1618 | if (p >= s->buf_end) |
1619 | break; |
1620 | if (*p == '|' || *p == ')') |
1621 | break; |
1622 | term_start = s->byte_code.size; |
1623 | ret = re_parse_term(s, is_backward_dir); |
1624 | if (ret) |
1625 | return ret; |
1626 | if (is_backward_dir) { |
1627 | /* reverse the order of the terms (XXX: inefficient, but |
1628 | speed is not really critical here) */ |
1629 | end = s->byte_code.size; |
1630 | term_size = end - term_start; |
1631 | if (dbuf_realloc(&s->byte_code, end + term_size)) |
1632 | return -1; |
1633 | memmove(s->byte_code.buf + start + term_size, |
1634 | s->byte_code.buf + start, |
1635 | end - start); |
1636 | memcpy(s->byte_code.buf + start, s->byte_code.buf + end, |
1637 | term_size); |
1638 | } |
1639 | } |
1640 | return 0; |
1641 | } |
1642 | |
1643 | static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir) |
1644 | { |
1645 | int start, len, pos; |
1646 | |
1647 | if (lre_check_stack_overflow(s->opaque, 0)) |
1648 | return re_parse_error(s, "stack overflow" ); |
1649 | |
1650 | start = s->byte_code.size; |
1651 | if (re_parse_alternative(s, is_backward_dir)) |
1652 | return -1; |
1653 | while (*s->buf_ptr == '|') { |
1654 | s->buf_ptr++; |
1655 | |
1656 | len = s->byte_code.size - start; |
1657 | |
1658 | /* insert a split before the first alternative */ |
1659 | if (dbuf_insert(&s->byte_code, start, 5)) { |
1660 | return re_parse_out_of_memory(s); |
1661 | } |
1662 | s->byte_code.buf[start] = REOP_split_next_first; |
1663 | put_u32(s->byte_code.buf + start + 1, len + 5); |
1664 | |
1665 | pos = re_emit_op_u32(s, REOP_goto, 0); |
1666 | |
1667 | if (re_parse_alternative(s, is_backward_dir)) |
1668 | return -1; |
1669 | |
1670 | /* patch the goto */ |
1671 | len = s->byte_code.size - (pos + 4); |
1672 | put_u32(s->byte_code.buf + pos, len); |
1673 | } |
1674 | return 0; |
1675 | } |
1676 | |
1677 | /* the control flow is recursive so the analysis can be linear */ |
1678 | static int compute_stack_size(const uint8_t *bc_buf, int bc_buf_len) |
1679 | { |
1680 | int stack_size, stack_size_max, pos, opcode, len; |
1681 | uint32_t val; |
1682 | |
1683 | stack_size = 0; |
1684 | stack_size_max = 0; |
1685 | bc_buf += RE_HEADER_LEN; |
1686 | bc_buf_len -= RE_HEADER_LEN; |
1687 | pos = 0; |
1688 | while (pos < bc_buf_len) { |
1689 | opcode = bc_buf[pos]; |
1690 | len = reopcode_info[opcode].size; |
1691 | assert(opcode < REOP_COUNT); |
1692 | assert((pos + len) <= bc_buf_len); |
1693 | switch(opcode) { |
1694 | case REOP_push_i32: |
1695 | case REOP_push_char_pos: |
1696 | stack_size++; |
1697 | if (stack_size > stack_size_max) { |
1698 | if (stack_size > STACK_SIZE_MAX) |
1699 | return -1; |
1700 | stack_size_max = stack_size; |
1701 | } |
1702 | break; |
1703 | case REOP_drop: |
1704 | case REOP_check_advance: |
1705 | assert(stack_size > 0); |
1706 | stack_size--; |
1707 | break; |
1708 | case REOP_range: |
1709 | val = get_u16(bc_buf + pos + 1); |
1710 | len += val * 4; |
1711 | break; |
1712 | case REOP_range32: |
1713 | val = get_u16(bc_buf + pos + 1); |
1714 | len += val * 8; |
1715 | break; |
1716 | } |
1717 | pos += len; |
1718 | } |
1719 | return stack_size_max; |
1720 | } |
1721 | |
1722 | /* 'buf' must be a zero terminated UTF-8 string of length buf_len. |
1723 | Return NULL if error and allocate an error message in *perror_msg, |
1724 | otherwise the compiled bytecode and its length in plen. |
1725 | */ |
1726 | uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size, |
1727 | const char *buf, size_t buf_len, int re_flags, |
1728 | void *opaque) |
1729 | { |
1730 | REParseState s_s, *s = &s_s; |
1731 | int stack_size; |
1732 | BOOL is_sticky; |
1733 | |
1734 | memset(s, 0, sizeof(*s)); |
1735 | s->opaque = opaque; |
1736 | s->buf_ptr = (const uint8_t *)buf; |
1737 | s->buf_end = s->buf_ptr + buf_len; |
1738 | s->buf_start = s->buf_ptr; |
1739 | s->re_flags = re_flags; |
1740 | s->is_unicode = ((re_flags & LRE_FLAG_UNICODE) != 0); |
1741 | is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0); |
1742 | s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0); |
1743 | s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0); |
1744 | s->capture_count = 1; |
1745 | s->total_capture_count = -1; |
1746 | s->has_named_captures = -1; |
1747 | |
1748 | dbuf_init2(&s->byte_code, opaque, lre_realloc); |
1749 | dbuf_init2(&s->group_names, opaque, lre_realloc); |
1750 | |
1751 | dbuf_putc(&s->byte_code, re_flags); /* first element is the flags */ |
1752 | dbuf_putc(&s->byte_code, 0); /* second element is the number of captures */ |
1753 | dbuf_putc(&s->byte_code, 0); /* stack size */ |
1754 | dbuf_put_u32(&s->byte_code, 0); /* bytecode length */ |
1755 | |
1756 | if (!is_sticky) { |
1757 | /* iterate thru all positions (about the same as .*?( ... ) ) |
1758 | . We do it without an explicit loop so that lock step |
1759 | thread execution will be possible in an optimized |
1760 | implementation */ |
1761 | re_emit_op_u32(s, REOP_split_goto_first, 1 + 5); |
1762 | re_emit_op(s, REOP_any); |
1763 | re_emit_op_u32(s, REOP_goto, -(5 + 1 + 5)); |
1764 | } |
1765 | re_emit_op_u8(s, REOP_save_start, 0); |
1766 | |
1767 | if (re_parse_disjunction(s, FALSE)) { |
1768 | error: |
1769 | dbuf_free(&s->byte_code); |
1770 | dbuf_free(&s->group_names); |
1771 | pstrcpy(error_msg, error_msg_size, s->u.error_msg); |
1772 | *plen = 0; |
1773 | return NULL; |
1774 | } |
1775 | |
1776 | re_emit_op_u8(s, REOP_save_end, 0); |
1777 | |
1778 | re_emit_op(s, REOP_match); |
1779 | |
1780 | if (*s->buf_ptr != '\0') { |
1781 | re_parse_error(s, "extraneous characters at the end" ); |
1782 | goto error; |
1783 | } |
1784 | |
1785 | if (dbuf_error(&s->byte_code)) { |
1786 | re_parse_out_of_memory(s); |
1787 | goto error; |
1788 | } |
1789 | |
1790 | stack_size = compute_stack_size(s->byte_code.buf, s->byte_code.size); |
1791 | if (stack_size < 0) { |
1792 | re_parse_error(s, "too many imbricated quantifiers" ); |
1793 | goto error; |
1794 | } |
1795 | |
1796 | s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count; |
1797 | s->byte_code.buf[RE_HEADER_STACK_SIZE] = stack_size; |
1798 | put_u32(s->byte_code.buf + RE_HEADER_BYTECODE_LEN, |
1799 | s->byte_code.size - RE_HEADER_LEN); |
1800 | |
1801 | /* add the named groups if needed */ |
1802 | if (s->group_names.size > (s->capture_count - 1)) { |
1803 | dbuf_put(&s->byte_code, s->group_names.buf, s->group_names.size); |
1804 | s->byte_code.buf[RE_HEADER_FLAGS] |= LRE_FLAG_NAMED_GROUPS; |
1805 | } |
1806 | dbuf_free(&s->group_names); |
1807 | |
1808 | #ifdef DUMP_REOP |
1809 | lre_dump_bytecode(s->byte_code.buf, s->byte_code.size); |
1810 | #endif |
1811 | |
1812 | error_msg[0] = '\0'; |
1813 | *plen = s->byte_code.size; |
1814 | return s->byte_code.buf; |
1815 | } |
1816 | |
1817 | static BOOL is_line_terminator(uint32_t c) |
1818 | { |
1819 | return (c == '\n' || c == '\r' || c == CP_LS || c == CP_PS); |
1820 | } |
1821 | |
1822 | static BOOL is_word_char(uint32_t c) |
1823 | { |
1824 | return ((c >= '0' && c <= '9') || |
1825 | (c >= 'a' && c <= 'z') || |
1826 | (c >= 'A' && c <= 'Z') || |
1827 | (c == '_')); |
1828 | } |
1829 | |
1830 | #define GET_CHAR(c, cptr, cbuf_end, cbuf_type) \ |
1831 | do { \ |
1832 | if (cbuf_type == 0) { \ |
1833 | c = *cptr++; \ |
1834 | } else { \ |
1835 | const uint16_t *_p = (const uint16_t *)cptr; \ |
1836 | const uint16_t *_end = (const uint16_t *)cbuf_end; \ |
1837 | c = *_p++; \ |
1838 | if (is_hi_surrogate(c) && cbuf_type == 2) { \ |
1839 | if (_p < _end && is_lo_surrogate(*_p)) { \ |
1840 | c = from_surrogate(c, *_p++); \ |
1841 | } \ |
1842 | } \ |
1843 | cptr = (const void *)_p; \ |
1844 | } \ |
1845 | } while (0) |
1846 | |
1847 | #define PEEK_CHAR(c, cptr, cbuf_end, cbuf_type) \ |
1848 | do { \ |
1849 | if (cbuf_type == 0) { \ |
1850 | c = cptr[0]; \ |
1851 | } else { \ |
1852 | const uint16_t *_p = (const uint16_t *)cptr; \ |
1853 | const uint16_t *_end = (const uint16_t *)cbuf_end; \ |
1854 | c = *_p++; \ |
1855 | if (is_hi_surrogate(c) && cbuf_type == 2) { \ |
1856 | if (_p < _end && is_lo_surrogate(*_p)) { \ |
1857 | c = from_surrogate(c, *_p); \ |
1858 | } \ |
1859 | } \ |
1860 | } \ |
1861 | } while (0) |
1862 | |
1863 | #define PEEK_PREV_CHAR(c, cptr, cbuf_start, cbuf_type) \ |
1864 | do { \ |
1865 | if (cbuf_type == 0) { \ |
1866 | c = cptr[-1]; \ |
1867 | } else { \ |
1868 | const uint16_t *_p = (const uint16_t *)cptr - 1; \ |
1869 | const uint16_t *_start = (const uint16_t *)cbuf_start; \ |
1870 | c = *_p; \ |
1871 | if (is_lo_surrogate(c) && cbuf_type == 2) { \ |
1872 | if (_p > _start && is_hi_surrogate(_p[-1])) { \ |
1873 | c = from_surrogate(*--_p, c); \ |
1874 | } \ |
1875 | } \ |
1876 | } \ |
1877 | } while (0) |
1878 | |
1879 | #define GET_PREV_CHAR(c, cptr, cbuf_start, cbuf_type) \ |
1880 | do { \ |
1881 | if (cbuf_type == 0) { \ |
1882 | cptr--; \ |
1883 | c = cptr[0]; \ |
1884 | } else { \ |
1885 | const uint16_t *_p = (const uint16_t *)cptr - 1; \ |
1886 | const uint16_t *_start = (const uint16_t *)cbuf_start; \ |
1887 | c = *_p; \ |
1888 | if (is_lo_surrogate(c) && cbuf_type == 2) { \ |
1889 | if (_p > _start && is_hi_surrogate(_p[-1])) { \ |
1890 | c = from_surrogate(*--_p, c); \ |
1891 | } \ |
1892 | } \ |
1893 | cptr = (const void *)_p; \ |
1894 | } \ |
1895 | } while (0) |
1896 | |
1897 | #define PREV_CHAR(cptr, cbuf_start, cbuf_type) \ |
1898 | do { \ |
1899 | if (cbuf_type == 0) { \ |
1900 | cptr--; \ |
1901 | } else { \ |
1902 | const uint16_t *_p = (const uint16_t *)cptr - 1; \ |
1903 | const uint16_t *_start = (const uint16_t *)cbuf_start; \ |
1904 | if (is_lo_surrogate(*_p) && cbuf_type == 2) { \ |
1905 | if (_p > _start && is_hi_surrogate(_p[-1])) { \ |
1906 | --_p; \ |
1907 | } \ |
1908 | } \ |
1909 | cptr = (const void *)_p; \ |
1910 | } \ |
1911 | } while (0) |
1912 | |
1913 | typedef uintptr_t StackInt; |
1914 | |
1915 | typedef enum { |
1916 | RE_EXEC_STATE_SPLIT, |
1917 | RE_EXEC_STATE_LOOKAHEAD, |
1918 | RE_EXEC_STATE_NEGATIVE_LOOKAHEAD, |
1919 | RE_EXEC_STATE_GREEDY_QUANT, |
1920 | } REExecStateEnum; |
1921 | |
1922 | typedef struct REExecState { |
1923 | REExecStateEnum type : 8; |
1924 | uint8_t stack_len; |
1925 | size_t count; /* only used for RE_EXEC_STATE_GREEDY_QUANT */ |
1926 | const uint8_t *cptr; |
1927 | const uint8_t *pc; |
1928 | void *buf[0]; |
1929 | } REExecState; |
1930 | |
1931 | typedef struct { |
1932 | const uint8_t *cbuf; |
1933 | const uint8_t *cbuf_end; |
1934 | /* 0 = 8 bit chars, 1 = 16 bit chars, 2 = 16 bit chars, UTF-16 */ |
1935 | int cbuf_type; |
1936 | int capture_count; |
1937 | int stack_size_max; |
1938 | BOOL multi_line; |
1939 | BOOL ignore_case; |
1940 | BOOL is_unicode; |
1941 | int interrupt_counter; |
1942 | void *opaque; /* used for stack overflow check */ |
1943 | |
1944 | size_t state_size; |
1945 | uint8_t *state_stack; |
1946 | size_t state_stack_size; |
1947 | size_t state_stack_len; |
1948 | } REExecContext; |
1949 | |
1950 | static int push_state(REExecContext *s, |
1951 | uint8_t **capture, |
1952 | StackInt *stack, size_t stack_len, |
1953 | const uint8_t *pc, const uint8_t *cptr, |
1954 | REExecStateEnum type, size_t count) |
1955 | { |
1956 | REExecState *rs; |
1957 | uint8_t *new_stack; |
1958 | size_t new_size, i, n; |
1959 | StackInt *stack_buf; |
1960 | |
1961 | if (unlikely((s->state_stack_len + 1) > s->state_stack_size)) { |
1962 | /* reallocate the stack */ |
1963 | new_size = s->state_stack_size * 3 / 2; |
1964 | if (new_size < 8) |
1965 | new_size = 8; |
1966 | new_stack = lre_realloc(s->opaque, s->state_stack, new_size * s->state_size); |
1967 | if (!new_stack) |
1968 | return -1; |
1969 | s->state_stack_size = new_size; |
1970 | s->state_stack = new_stack; |
1971 | } |
1972 | rs = (REExecState *)(s->state_stack + s->state_stack_len * s->state_size); |
1973 | s->state_stack_len++; |
1974 | rs->type = type; |
1975 | rs->count = count; |
1976 | rs->stack_len = stack_len; |
1977 | rs->cptr = cptr; |
1978 | rs->pc = pc; |
1979 | n = 2 * s->capture_count; |
1980 | for(i = 0; i < n; i++) |
1981 | rs->buf[i] = capture[i]; |
1982 | stack_buf = (StackInt *)(rs->buf + n); |
1983 | for(i = 0; i < stack_len; i++) |
1984 | stack_buf[i] = stack[i]; |
1985 | return 0; |
1986 | } |
1987 | |
1988 | static int lre_poll_timeout(REExecContext *s) |
1989 | { |
1990 | if (unlikely(--s->interrupt_counter <= 0)) { |
1991 | s->interrupt_counter = INTERRUPT_COUNTER_INIT; |
1992 | if (lre_check_timeout(s->opaque)) |
1993 | return LRE_RET_TIMEOUT; |
1994 | } |
1995 | return 0; |
1996 | } |
1997 | |
1998 | /* return 1 if match, 0 if not match or < 0 if error. */ |
1999 | static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, |
2000 | StackInt *stack, int stack_len, |
2001 | const uint8_t *pc, const uint8_t *cptr, |
2002 | BOOL no_recurse) |
2003 | { |
2004 | int opcode, ret; |
2005 | int cbuf_type; |
2006 | uint32_t val, c; |
2007 | const uint8_t *cbuf_end; |
2008 | |
2009 | cbuf_type = s->cbuf_type; |
2010 | cbuf_end = s->cbuf_end; |
2011 | |
2012 | for(;;) { |
2013 | // printf("top=%p: pc=%d\n", th_list.top, (int)(pc - (bc_buf + RE_HEADER_LEN))); |
2014 | opcode = *pc++; |
2015 | switch(opcode) { |
2016 | case REOP_match: |
2017 | { |
2018 | REExecState *rs; |
2019 | if (no_recurse) |
2020 | return (intptr_t)cptr; |
2021 | ret = 1; |
2022 | goto recurse; |
2023 | no_match: |
2024 | if (no_recurse) |
2025 | return 0; |
2026 | ret = 0; |
2027 | recurse: |
2028 | for(;;) { |
2029 | if (lre_poll_timeout(s)) |
2030 | return LRE_RET_TIMEOUT; |
2031 | if (s->state_stack_len == 0) |
2032 | return ret; |
2033 | rs = (REExecState *)(s->state_stack + |
2034 | (s->state_stack_len - 1) * s->state_size); |
2035 | if (rs->type == RE_EXEC_STATE_SPLIT) { |
2036 | if (!ret) { |
2037 | pop_state: |
2038 | memcpy(capture, rs->buf, |
2039 | sizeof(capture[0]) * 2 * s->capture_count); |
2040 | pop_state1: |
2041 | pc = rs->pc; |
2042 | cptr = rs->cptr; |
2043 | stack_len = rs->stack_len; |
2044 | memcpy(stack, rs->buf + 2 * s->capture_count, |
2045 | stack_len * sizeof(stack[0])); |
2046 | s->state_stack_len--; |
2047 | break; |
2048 | } |
2049 | } else if (rs->type == RE_EXEC_STATE_GREEDY_QUANT) { |
2050 | if (!ret) { |
2051 | uint32_t char_count, i; |
2052 | memcpy(capture, rs->buf, |
2053 | sizeof(capture[0]) * 2 * s->capture_count); |
2054 | stack_len = rs->stack_len; |
2055 | memcpy(stack, rs->buf + 2 * s->capture_count, |
2056 | stack_len * sizeof(stack[0])); |
2057 | pc = rs->pc; |
2058 | cptr = rs->cptr; |
2059 | /* go backward */ |
2060 | char_count = get_u32(pc + 12); |
2061 | for(i = 0; i < char_count; i++) { |
2062 | PREV_CHAR(cptr, s->cbuf, cbuf_type); |
2063 | } |
2064 | pc = (pc + 16) + (int)get_u32(pc); |
2065 | rs->cptr = cptr; |
2066 | rs->count--; |
2067 | if (rs->count == 0) { |
2068 | s->state_stack_len--; |
2069 | } |
2070 | break; |
2071 | } |
2072 | } else { |
2073 | ret = ((rs->type == RE_EXEC_STATE_LOOKAHEAD && ret) || |
2074 | (rs->type == RE_EXEC_STATE_NEGATIVE_LOOKAHEAD && !ret)); |
2075 | if (ret) { |
2076 | /* keep the capture in case of positive lookahead */ |
2077 | if (rs->type == RE_EXEC_STATE_LOOKAHEAD) |
2078 | goto pop_state1; |
2079 | else |
2080 | goto pop_state; |
2081 | } |
2082 | } |
2083 | s->state_stack_len--; |
2084 | } |
2085 | } |
2086 | break; |
2087 | case REOP_char32: |
2088 | val = get_u32(pc); |
2089 | pc += 4; |
2090 | goto test_char; |
2091 | case REOP_char: |
2092 | val = get_u16(pc); |
2093 | pc += 2; |
2094 | test_char: |
2095 | if (cptr >= cbuf_end) |
2096 | goto no_match; |
2097 | GET_CHAR(c, cptr, cbuf_end, cbuf_type); |
2098 | if (s->ignore_case) { |
2099 | c = lre_canonicalize(c, s->is_unicode); |
2100 | } |
2101 | if (val != c) |
2102 | goto no_match; |
2103 | break; |
2104 | case REOP_split_goto_first: |
2105 | case REOP_split_next_first: |
2106 | { |
2107 | const uint8_t *pc1; |
2108 | |
2109 | val = get_u32(pc); |
2110 | pc += 4; |
2111 | if (opcode == REOP_split_next_first) { |
2112 | pc1 = pc + (int)val; |
2113 | } else { |
2114 | pc1 = pc; |
2115 | pc = pc + (int)val; |
2116 | } |
2117 | ret = push_state(s, capture, stack, stack_len, |
2118 | pc1, cptr, RE_EXEC_STATE_SPLIT, 0); |
2119 | if (ret < 0) |
2120 | return LRE_RET_MEMORY_ERROR; |
2121 | break; |
2122 | } |
2123 | case REOP_lookahead: |
2124 | case REOP_negative_lookahead: |
2125 | val = get_u32(pc); |
2126 | pc += 4; |
2127 | ret = push_state(s, capture, stack, stack_len, |
2128 | pc + (int)val, cptr, |
2129 | RE_EXEC_STATE_LOOKAHEAD + opcode - REOP_lookahead, |
2130 | 0); |
2131 | if (ret < 0) |
2132 | return LRE_RET_MEMORY_ERROR; |
2133 | break; |
2134 | |
2135 | case REOP_goto: |
2136 | val = get_u32(pc); |
2137 | pc += 4 + (int)val; |
2138 | if (lre_poll_timeout(s)) |
2139 | return LRE_RET_TIMEOUT; |
2140 | break; |
2141 | case REOP_line_start: |
2142 | if (cptr == s->cbuf) |
2143 | break; |
2144 | if (!s->multi_line) |
2145 | goto no_match; |
2146 | PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type); |
2147 | if (!is_line_terminator(c)) |
2148 | goto no_match; |
2149 | break; |
2150 | case REOP_line_end: |
2151 | if (cptr == cbuf_end) |
2152 | break; |
2153 | if (!s->multi_line) |
2154 | goto no_match; |
2155 | PEEK_CHAR(c, cptr, cbuf_end, cbuf_type); |
2156 | if (!is_line_terminator(c)) |
2157 | goto no_match; |
2158 | break; |
2159 | case REOP_dot: |
2160 | if (cptr == cbuf_end) |
2161 | goto no_match; |
2162 | GET_CHAR(c, cptr, cbuf_end, cbuf_type); |
2163 | if (is_line_terminator(c)) |
2164 | goto no_match; |
2165 | break; |
2166 | case REOP_any: |
2167 | if (cptr == cbuf_end) |
2168 | goto no_match; |
2169 | GET_CHAR(c, cptr, cbuf_end, cbuf_type); |
2170 | break; |
2171 | case REOP_save_start: |
2172 | case REOP_save_end: |
2173 | val = *pc++; |
2174 | assert(val < s->capture_count); |
2175 | capture[2 * val + opcode - REOP_save_start] = (uint8_t *)cptr; |
2176 | break; |
2177 | case REOP_save_reset: |
2178 | { |
2179 | uint32_t val2; |
2180 | val = pc[0]; |
2181 | val2 = pc[1]; |
2182 | pc += 2; |
2183 | assert(val2 < s->capture_count); |
2184 | while (val <= val2) { |
2185 | capture[2 * val] = NULL; |
2186 | capture[2 * val + 1] = NULL; |
2187 | val++; |
2188 | } |
2189 | } |
2190 | break; |
2191 | case REOP_push_i32: |
2192 | val = get_u32(pc); |
2193 | pc += 4; |
2194 | stack[stack_len++] = val; |
2195 | break; |
2196 | case REOP_drop: |
2197 | stack_len--; |
2198 | break; |
2199 | case REOP_loop: |
2200 | val = get_u32(pc); |
2201 | pc += 4; |
2202 | if (--stack[stack_len - 1] != 0) { |
2203 | pc += (int)val; |
2204 | if (lre_poll_timeout(s)) |
2205 | return LRE_RET_TIMEOUT; |
2206 | } |
2207 | break; |
2208 | case REOP_push_char_pos: |
2209 | stack[stack_len++] = (uintptr_t)cptr; |
2210 | break; |
2211 | case REOP_check_advance: |
2212 | if (stack[--stack_len] == (uintptr_t)cptr) |
2213 | goto no_match; |
2214 | break; |
2215 | case REOP_word_boundary: |
2216 | case REOP_not_word_boundary: |
2217 | { |
2218 | BOOL v1, v2; |
2219 | /* char before */ |
2220 | if (cptr == s->cbuf) { |
2221 | v1 = FALSE; |
2222 | } else { |
2223 | PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type); |
2224 | v1 = is_word_char(c); |
2225 | } |
2226 | /* current char */ |
2227 | if (cptr >= cbuf_end) { |
2228 | v2 = FALSE; |
2229 | } else { |
2230 | PEEK_CHAR(c, cptr, cbuf_end, cbuf_type); |
2231 | v2 = is_word_char(c); |
2232 | } |
2233 | if (v1 ^ v2 ^ (REOP_not_word_boundary - opcode)) |
2234 | goto no_match; |
2235 | } |
2236 | break; |
2237 | case REOP_back_reference: |
2238 | case REOP_backward_back_reference: |
2239 | { |
2240 | const uint8_t *cptr1, *cptr1_end, *cptr1_start; |
2241 | uint32_t c1, c2; |
2242 | |
2243 | val = *pc++; |
2244 | if (val >= s->capture_count) |
2245 | goto no_match; |
2246 | cptr1_start = capture[2 * val]; |
2247 | cptr1_end = capture[2 * val + 1]; |
2248 | if (!cptr1_start || !cptr1_end) |
2249 | break; |
2250 | if (opcode == REOP_back_reference) { |
2251 | cptr1 = cptr1_start; |
2252 | while (cptr1 < cptr1_end) { |
2253 | if (cptr >= cbuf_end) |
2254 | goto no_match; |
2255 | GET_CHAR(c1, cptr1, cptr1_end, cbuf_type); |
2256 | GET_CHAR(c2, cptr, cbuf_end, cbuf_type); |
2257 | if (s->ignore_case) { |
2258 | c1 = lre_canonicalize(c1, s->is_unicode); |
2259 | c2 = lre_canonicalize(c2, s->is_unicode); |
2260 | } |
2261 | if (c1 != c2) |
2262 | goto no_match; |
2263 | } |
2264 | } else { |
2265 | cptr1 = cptr1_end; |
2266 | while (cptr1 > cptr1_start) { |
2267 | if (cptr == s->cbuf) |
2268 | goto no_match; |
2269 | GET_PREV_CHAR(c1, cptr1, cptr1_start, cbuf_type); |
2270 | GET_PREV_CHAR(c2, cptr, s->cbuf, cbuf_type); |
2271 | if (s->ignore_case) { |
2272 | c1 = lre_canonicalize(c1, s->is_unicode); |
2273 | c2 = lre_canonicalize(c2, s->is_unicode); |
2274 | } |
2275 | if (c1 != c2) |
2276 | goto no_match; |
2277 | } |
2278 | } |
2279 | } |
2280 | break; |
2281 | case REOP_range: |
2282 | { |
2283 | int n; |
2284 | uint32_t low, high, idx_min, idx_max, idx; |
2285 | |
2286 | n = get_u16(pc); /* n must be >= 1 */ |
2287 | pc += 2; |
2288 | if (cptr >= cbuf_end) |
2289 | goto no_match; |
2290 | GET_CHAR(c, cptr, cbuf_end, cbuf_type); |
2291 | if (s->ignore_case) { |
2292 | c = lre_canonicalize(c, s->is_unicode); |
2293 | } |
2294 | idx_min = 0; |
2295 | low = get_u16(pc + 0 * 4); |
2296 | if (c < low) |
2297 | goto no_match; |
2298 | idx_max = n - 1; |
2299 | high = get_u16(pc + idx_max * 4 + 2); |
2300 | /* 0xffff in for last value means +infinity */ |
2301 | if (unlikely(c >= 0xffff) && high == 0xffff) |
2302 | goto range_match; |
2303 | if (c > high) |
2304 | goto no_match; |
2305 | while (idx_min <= idx_max) { |
2306 | idx = (idx_min + idx_max) / 2; |
2307 | low = get_u16(pc + idx * 4); |
2308 | high = get_u16(pc + idx * 4 + 2); |
2309 | if (c < low) |
2310 | idx_max = idx - 1; |
2311 | else if (c > high) |
2312 | idx_min = idx + 1; |
2313 | else |
2314 | goto range_match; |
2315 | } |
2316 | goto no_match; |
2317 | range_match: |
2318 | pc += 4 * n; |
2319 | } |
2320 | break; |
2321 | case REOP_range32: |
2322 | { |
2323 | int n; |
2324 | uint32_t low, high, idx_min, idx_max, idx; |
2325 | |
2326 | n = get_u16(pc); /* n must be >= 1 */ |
2327 | pc += 2; |
2328 | if (cptr >= cbuf_end) |
2329 | goto no_match; |
2330 | GET_CHAR(c, cptr, cbuf_end, cbuf_type); |
2331 | if (s->ignore_case) { |
2332 | c = lre_canonicalize(c, s->is_unicode); |
2333 | } |
2334 | idx_min = 0; |
2335 | low = get_u32(pc + 0 * 8); |
2336 | if (c < low) |
2337 | goto no_match; |
2338 | idx_max = n - 1; |
2339 | high = get_u32(pc + idx_max * 8 + 4); |
2340 | if (c > high) |
2341 | goto no_match; |
2342 | while (idx_min <= idx_max) { |
2343 | idx = (idx_min + idx_max) / 2; |
2344 | low = get_u32(pc + idx * 8); |
2345 | high = get_u32(pc + idx * 8 + 4); |
2346 | if (c < low) |
2347 | idx_max = idx - 1; |
2348 | else if (c > high) |
2349 | idx_min = idx + 1; |
2350 | else |
2351 | goto range32_match; |
2352 | } |
2353 | goto no_match; |
2354 | range32_match: |
2355 | pc += 8 * n; |
2356 | } |
2357 | break; |
2358 | case REOP_prev: |
2359 | /* go to the previous char */ |
2360 | if (cptr == s->cbuf) |
2361 | goto no_match; |
2362 | PREV_CHAR(cptr, s->cbuf, cbuf_type); |
2363 | break; |
2364 | case REOP_simple_greedy_quant: |
2365 | { |
2366 | uint32_t next_pos, quant_min, quant_max; |
2367 | size_t q; |
2368 | intptr_t res; |
2369 | const uint8_t *pc1; |
2370 | |
2371 | next_pos = get_u32(pc); |
2372 | quant_min = get_u32(pc + 4); |
2373 | quant_max = get_u32(pc + 8); |
2374 | pc += 16; |
2375 | pc1 = pc; |
2376 | pc += (int)next_pos; |
2377 | |
2378 | q = 0; |
2379 | for(;;) { |
2380 | if (lre_poll_timeout(s)) |
2381 | return LRE_RET_TIMEOUT; |
2382 | res = lre_exec_backtrack(s, capture, stack, stack_len, |
2383 | pc1, cptr, TRUE); |
2384 | if (res == LRE_RET_MEMORY_ERROR || |
2385 | res == LRE_RET_TIMEOUT) |
2386 | return res; |
2387 | if (!res) |
2388 | break; |
2389 | cptr = (uint8_t *)res; |
2390 | q++; |
2391 | if (q >= quant_max && quant_max != INT32_MAX) |
2392 | break; |
2393 | } |
2394 | if (q < quant_min) |
2395 | goto no_match; |
2396 | if (q > quant_min) { |
2397 | /* will examine all matches down to quant_min */ |
2398 | ret = push_state(s, capture, stack, stack_len, |
2399 | pc1 - 16, cptr, |
2400 | RE_EXEC_STATE_GREEDY_QUANT, |
2401 | q - quant_min); |
2402 | if (ret < 0) |
2403 | return LRE_RET_MEMORY_ERROR; |
2404 | } |
2405 | } |
2406 | break; |
2407 | default: |
2408 | abort(); |
2409 | } |
2410 | } |
2411 | } |
2412 | |
2413 | /* Return 1 if match, 0 if not match or < 0 if error (see LRE_RET_x). cindex is the |
2414 | starting position of the match and must be such as 0 <= cindex <= |
2415 | clen. */ |
2416 | int lre_exec(uint8_t **capture, |
2417 | const uint8_t *bc_buf, const uint8_t *cbuf, int cindex, int clen, |
2418 | int cbuf_type, void *opaque) |
2419 | { |
2420 | REExecContext s_s, *s = &s_s; |
2421 | int re_flags, i, alloca_size, ret; |
2422 | StackInt *stack_buf; |
2423 | |
2424 | re_flags = lre_get_flags(bc_buf); |
2425 | s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0; |
2426 | s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0; |
2427 | s->is_unicode = (re_flags & LRE_FLAG_UNICODE) != 0; |
2428 | s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT]; |
2429 | s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE]; |
2430 | s->cbuf = cbuf; |
2431 | s->cbuf_end = cbuf + (clen << cbuf_type); |
2432 | s->cbuf_type = cbuf_type; |
2433 | if (s->cbuf_type == 1 && s->is_unicode) |
2434 | s->cbuf_type = 2; |
2435 | s->interrupt_counter = INTERRUPT_COUNTER_INIT; |
2436 | s->opaque = opaque; |
2437 | |
2438 | s->state_size = sizeof(REExecState) + |
2439 | s->capture_count * sizeof(capture[0]) * 2 + |
2440 | s->stack_size_max * sizeof(stack_buf[0]); |
2441 | s->state_stack = NULL; |
2442 | s->state_stack_len = 0; |
2443 | s->state_stack_size = 0; |
2444 | |
2445 | for(i = 0; i < s->capture_count * 2; i++) |
2446 | capture[i] = NULL; |
2447 | alloca_size = s->stack_size_max * sizeof(stack_buf[0]); |
2448 | stack_buf = alloca(alloca_size); |
2449 | ret = lre_exec_backtrack(s, capture, stack_buf, 0, bc_buf + RE_HEADER_LEN, |
2450 | cbuf + (cindex << cbuf_type), FALSE); |
2451 | lre_realloc(s->opaque, s->state_stack, 0); |
2452 | return ret; |
2453 | } |
2454 | |
2455 | int lre_get_capture_count(const uint8_t *bc_buf) |
2456 | { |
2457 | return bc_buf[RE_HEADER_CAPTURE_COUNT]; |
2458 | } |
2459 | |
2460 | int lre_get_flags(const uint8_t *bc_buf) |
2461 | { |
2462 | return bc_buf[RE_HEADER_FLAGS]; |
2463 | } |
2464 | |
2465 | /* Return NULL if no group names. Otherwise, return a pointer to |
2466 | 'capture_count - 1' zero terminated UTF-8 strings. */ |
2467 | const char *lre_get_groupnames(const uint8_t *bc_buf) |
2468 | { |
2469 | uint32_t re_bytecode_len; |
2470 | if ((lre_get_flags(bc_buf) & LRE_FLAG_NAMED_GROUPS) == 0) |
2471 | return NULL; |
2472 | re_bytecode_len = get_u32(bc_buf + RE_HEADER_BYTECODE_LEN); |
2473 | return (const char *)(bc_buf + RE_HEADER_LEN + re_bytecode_len); |
2474 | } |
2475 | |
2476 | #ifdef TEST |
2477 | |
2478 | BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size) |
2479 | { |
2480 | return FALSE; |
2481 | } |
2482 | |
2483 | void *lre_realloc(void *opaque, void *ptr, size_t size) |
2484 | { |
2485 | return realloc(ptr, size); |
2486 | } |
2487 | |
2488 | int main(int argc, char **argv) |
2489 | { |
2490 | int len, flags, ret, i; |
2491 | uint8_t *bc; |
2492 | char error_msg[64]; |
2493 | uint8_t *capture[CAPTURE_COUNT_MAX * 2]; |
2494 | const char *input; |
2495 | int input_len, capture_count; |
2496 | |
2497 | if (argc < 4) { |
2498 | printf("usage: %s regexp flags input\n" , argv[0]); |
2499 | return 1; |
2500 | } |
2501 | flags = atoi(argv[2]); |
2502 | bc = lre_compile(&len, error_msg, sizeof(error_msg), argv[1], |
2503 | strlen(argv[1]), flags, NULL); |
2504 | if (!bc) { |
2505 | fprintf(stderr, "error: %s\n" , error_msg); |
2506 | exit(1); |
2507 | } |
2508 | |
2509 | input = argv[3]; |
2510 | input_len = strlen(input); |
2511 | |
2512 | ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL); |
2513 | printf("ret=%d\n" , ret); |
2514 | if (ret == 1) { |
2515 | capture_count = lre_get_capture_count(bc); |
2516 | for(i = 0; i < 2 * capture_count; i++) { |
2517 | uint8_t *ptr; |
2518 | ptr = capture[i]; |
2519 | printf("%d: " , i); |
2520 | if (!ptr) |
2521 | printf("<nil>" ); |
2522 | else |
2523 | printf("%u" , (int)(ptr - (uint8_t *)input)); |
2524 | printf("\n" ); |
2525 | } |
2526 | } |
2527 | return 0; |
2528 | } |
2529 | #endif |
2530 | |