libregexp.c source code [QuickJS/libregexp.c]

1	/*
2	* Regular Expression Engine
3	*
4	* Copyright (c) 2017-2018 Fabrice Bellard
5	*
6	* Permission is hereby granted, free of charge, to any person obtaining a copy
7	* of this software and associated documentation files (the "Software"), to deal
8	* in the Software without restriction, including without limitation the rights
9	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10	* copies of the Software, and to permit persons to whom the Software is
11	* furnished to do so, subject to the following conditions:
12	*
13	* The above copyright notice and this permission notice shall be included in
14	* all copies or substantial portions of the Software.
15	*
16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22	* THE SOFTWARE.
23	*/
24	#include <stdlib.h>
25	#include <stdio.h>
26	#include <stdarg.h>
27	#include <inttypes.h>
28	#include <string.h>
29	#include <assert.h>
30
31	#include "cutils.h"
32	#include "libregexp.h"
33	#include "libunicode.h"
34
35	/*
36	TODO:
37
38	- Add a lock step execution mode (=linear time execution guaranteed)
39	when the regular expression is "simple" i.e. no backreference nor
40	complicated lookahead. The opcodes are designed for this execution
41	model.
42	*/
43
44	#if defined(TEST)
45	#define DUMP_REOP
46	#endif
47
48	typedef enum {
49	#define DEF(id, size) REOP_ ## id,
50	#include "libregexp-opcode.h"
51	#undef DEF
52	REOP_COUNT,
53	} REOPCodeEnum;
54
55	#define CAPTURE_COUNT_MAX 255
56	#define STACK_SIZE_MAX 255
57	/ must be large enough to have a negligible runtime cost and small*
58	enough to call the interrupt callback often. /*
59	#define INTERRUPT_COUNTER_INIT 10000
60
61	/ unicode code points /
62	#define CP_LS 0x2028
63	#define CP_PS 0x2029
64
65	#define TMP_BUF_SIZE 128
66
67	typedef struct {
68	DynBuf byte_code;
69	const uint8_t *buf_ptr;
70	const uint8_t *buf_end;
71	const uint8_t *buf_start;
72	int re_flags;
73	BOOL is_unicode;
74	BOOL ignore_case;
75	BOOL dotall;
76	int capture_count;
77	int total_capture_count; / -1 = not computed yet /
78	int has_named_captures; / -1 = don't know, 0 = no, 1 = yes /
79	void *opaque;
80	DynBuf group_names;
81	union {
82	char error_msg[TMP_BUF_SIZE];
83	char tmp_buf[TMP_BUF_SIZE];
84	} u;
85	} REParseState;
86
87	typedef struct {
88	#ifdef DUMP_REOP
89	const char *name;
90	#endif
91	uint8_t size;
92	} REOpCode;
93
94	static const REOpCode reopcode_info[REOP_COUNT] = {
95	#ifdef DUMP_REOP
96	#define DEF(id, size) { #id, size },
97	#else
98	#define DEF(id, size) { size },
99	#endif
100	#include "libregexp-opcode.h"
101	#undef DEF
102	};
103
104	#define RE_HEADER_FLAGS 0
105	#define RE_HEADER_CAPTURE_COUNT 1
106	#define RE_HEADER_STACK_SIZE 2
107	#define RE_HEADER_BYTECODE_LEN 3
108
109	#define RE_HEADER_LEN 7
110
111	static inline int is_digit(int c) {
112	return c >= `'0'` && c <= `'9'`;
113	}
114
115	/ insert 'len' bytes at position 'pos'. Return < 0 if error. /
116	static int dbuf_insert(DynBuf s, int* pos, int len)
117	{
118	if (dbuf_realloc(s, s->size + len))
119	return -`1`;
120	memmove(s->buf + pos + len, s->buf + pos, s->size - pos);
121	s->size += len;
122	return `0`;
123	}
124
125	static const uint16_t char_range_d[] = {
126	`1`,
127	`0x0030`, `0x0039` + `1`,
128	};
129
130	/ code point ranges for Zs,Zl or Zp property /
131	static const uint16_t char_range_s[] = {
132	`10`,
133	`0x0009`, `0x000D` + `1`,
134	`0x0020`, `0x0020` + `1`,
135	`0x00A0`, `0x00A0` + `1`,
136	`0x1680`, `0x1680` + `1`,
137	`0x2000`, `0x200A` + `1`,
138	/ 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; /
139	/ 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; /
140	`0x2028`, `0x2029` + `1`,
141	`0x202F`, `0x202F` + `1`,
142	`0x205F`, `0x205F` + `1`,
143	`0x3000`, `0x3000` + `1`,
144	/ FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; /
145	`0xFEFF`, `0xFEFF` + `1`,
146	};
147
148	static const uint16_t char_range_w[] = {
149	`4`,
150	`0x0030`, `0x0039` + `1`,
151	`0x0041`, `0x005A` + `1`,
152	`0x005F`, `0x005F` + `1`,
153	`0x0061`, `0x007A` + `1`,
154	};
155
156	#define CLASS_RANGE_BASE 0x40000000
157
158	typedef enum {
159	CHAR_RANGE_d,
160	CHAR_RANGE_D,
161	CHAR_RANGE_s,
162	CHAR_RANGE_S,
163	CHAR_RANGE_w,
164	CHAR_RANGE_W,
165	} CharRangeEnum;
166
167	static const uint16_t * const char_range_table[] = {
168	char_range_d,
169	char_range_s,
170	char_range_w,
171	};
172
173	static int cr_init_char_range(REParseState s, CharRange cr, uint32_t c)
174	{
175	BOOL invert;
176	const uint16_t *c_pt;
177	int len, i;
178
179	invert = c & `1`;
180	c_pt = char_range_table[c >> `1`];
181	len = *c_pt++;
182	cr_init(cr, s->opaque, lre_realloc);
183	for(i = `0`; i < len * `2`; i++) {
184	if (cr_add_point(cr, c_pt[i]))
185	goto fail;
186	}
187	if (invert) {
188	if (cr_invert(cr))
189	goto fail;
190	}
191	return `0`;
192	fail:
193	cr_free(cr);
194	return -`1`;
195	}
196
197	#ifdef DUMP_REOP
198	static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
199	int buf_len)
200	{
201	int pos, len, opcode, bc_len, re_flags, i;
202	uint32_t val;
203
204	assert(buf_len >= RE_HEADER_LEN);
205
206	re_flags = lre_get_flags(buf);
207	bc_len = get_u32(buf + RE_HEADER_BYTECODE_LEN);
208	assert(bc_len + RE_HEADER_LEN <= buf_len);
209	printf("flags: 0x%x capture_count=%d stack_size=%d\n",
210	re_flags, buf[RE_HEADER_CAPTURE_COUNT], buf[RE_HEADER_STACK_SIZE]);
211	if (re_flags & LRE_FLAG_NAMED_GROUPS) {
212	const char *p;
213	p = (char *)buf + RE_HEADER_LEN + bc_len;
214	printf("named groups: ");
215	for(i = `1`; i < buf[RE_HEADER_CAPTURE_COUNT]; i++) {
216	if (i != `1`)
217	printf(",");
218	printf("<%s>", p);
219	p += strlen(p) + `1`;
220	}
221	printf("\n");
222	assert(p == (char *)(buf + buf_len));
223	}
224	printf("bytecode_len=%d\n", bc_len);
225
226	buf += RE_HEADER_LEN;
227	pos = `0`;
228	while (pos < bc_len) {
229	printf("%5u: ", pos);
230	opcode = buf[pos];
231	len = reopcode_info[opcode].size;
232	if (opcode >= REOP_COUNT) {
233	printf(" invalid opcode=0x%02x\n", opcode);
234	break;
235	}
236	if ((pos + len) > bc_len) {
237	printf(" buffer overflow (opcode=0x%02x)\n", opcode);
238	break;
239	}
240	printf("%s", reopcode_info[opcode].name);
241	switch(opcode) {
242	case REOP_char:
243	val = get_u16(buf + pos + `1`);
244	if (val >= `' '` && val <= `126`)
245	printf(" '%c'", val);
246	else
247	printf(" 0x%04x", val);
248	break;
249	case REOP_char32:
250	val = get_u32(buf + pos + `1`);
251	if (val >= `' '` && val <= `126`)
252	printf(" '%c'", val);
253	else
254	printf(" 0x%08x", val);
255	break;
256	case REOP_goto:
257	case REOP_split_goto_first:
258	case REOP_split_next_first:
259	case REOP_loop:
260	case REOP_lookahead:
261	case REOP_negative_lookahead:
262	val = get_u32(buf + pos + `1`);
263	val += (pos + `5`);
264	printf(" %u", val);
265	break;
266	case REOP_simple_greedy_quant:
267	printf(" %u %u %u %u",
268	get_u32(buf + pos + `1`) + (pos + `17`),
269	get_u32(buf + pos + `1` + `4`),
270	get_u32(buf + pos + `1` + `8`),
271	get_u32(buf + pos + `1` + `12`));
272	break;
273	case REOP_save_start:
274	case REOP_save_end:
275	case REOP_back_reference:
276	case REOP_backward_back_reference:
277	printf(" %u", buf[pos + `1`]);
278	break;
279	case REOP_save_reset:
280	printf(" %u %u", buf[pos + `1`], buf[pos + `2`]);
281	break;
282	case REOP_push_i32:
283	val = get_u32(buf + pos + `1`);
284	printf(" %d", val);
285	break;
286	case REOP_range:
287	{
288	int n, i;
289	n = get_u16(buf + pos + `1`);
290	len += n * `4`;
291	for(i = `0`; i < n * `2`; i++) {
292	val = get_u16(buf + pos + `3` + i * `2`);
293	printf(" 0x%04x", val);
294	}
295	}
296	break;
297	case REOP_range32:
298	{
299	int n, i;
300	n = get_u16(buf + pos + `1`);
301	len += n * `8`;
302	for(i = `0`; i < n * `2`; i++) {
303	val = get_u32(buf + pos + `3` + i * `4`);
304	printf(" 0x%08x", val);
305	}
306	}
307	break;
308	default:
309	break;
310	}
311	printf("\n");
312	pos += len;
313	}
314	}
315	#endif
316
317	static void re_emit_op(REParseState s, int* op)
318	{
319	dbuf_putc(&s->byte_code, op);
320	}
321
322	/ return the offset of the u32 value /
323	static int re_emit_op_u32(REParseState s, int* op, uint32_t val)
324	{
325	int pos;
326	dbuf_putc(&s->byte_code, op);
327	pos = s->byte_code.size;
328	dbuf_put_u32(&s->byte_code, val);
329	return pos;
330	}
331
332	static int re_emit_goto(REParseState s, int* op, uint32_t val)
333	{
334	int pos;
335	dbuf_putc(&s->byte_code, op);
336	pos = s->byte_code.size;
337	dbuf_put_u32(&s->byte_code, val - (pos + `4`));
338	return pos;
339	}
340
341	static void re_emit_op_u8(REParseState s, int* op, uint32_t val)
342	{
343	dbuf_putc(&s->byte_code, op);
344	dbuf_putc(&s->byte_code, val);
345	}
346
347	static void re_emit_op_u16(REParseState s, int* op, uint32_t val)
348	{
349	dbuf_putc(&s->byte_code, op);
350	dbuf_put_u16(&s->byte_code, val);
351	}
352
353	static int __attribute__((format(printf, `2`, `3`))) re_parse_error(REParseState s, const* char *fmt, ...)
354	{
355	va_list ap;
356	va_start(ap, fmt);
357	vsnprintf(s->u.error_msg, sizeof(s->u.error_msg), fmt, ap);
358	va_end(ap);
359	return -`1`;
360	}
361
362	static int re_parse_out_of_memory(REParseState *s)
363	{
364	return re_parse_error(s, "out of memory");
365	}
366
367	/ If allow_overflow is false, return -1 in case of*
368	overflow. Otherwise return INT32_MAX. /*
369	static int parse_digits(const uint8_t **pp, BOOL allow_overflow)
370	{
371	const uint8_t *p;
372	uint64_t v;
373	int c;
374
375	p = *pp;
376	v = `0`;
377	for(;;) {
378	c = *p;
379	if (c < `'0'` \|\| c > `'9'`)
380	break;
381	v = v * `10` + c - `'0'`;
382	if (v >= INT32_MAX) {
383	if (allow_overflow)
384	v = INT32_MAX;
385	else
386	return -`1`;
387	}
388	p++;
389	}
390	*pp = p;
391	return v;
392	}
393
394	static int re_parse_expect(REParseState s, const* uint8_t *pp, int* c)
395	{
396	const uint8_t *p;
397	p = *pp;
398	if (*p != c)
399	return re_parse_error(s, "expecting '%c'", c);
400	p++;
401	*pp = p;
402	return `0`;
403	}
404
405	/ Parse an escape sequence, pp points after the '\':
406	allow_utf16 value:
407	0 : no UTF-16 escapes allowed
408	1 : UTF-16 escapes allowed
409	2 : UTF-16 escapes allowed and escapes of surrogate pairs are
410	converted to a unicode character (unicode regexp case).
411
412	Return the unicode char and update pp if recognized,*
413	return -1 if malformed escape,
414	return -2 otherwise. /*
415	int lre_parse_escape(const uint8_t *pp, int* allow_utf16)
416	{
417	const uint8_t *p;
418	uint32_t c;
419
420	p = *pp;
421	c = *p++;
422	switch(c) {
423	case `'b'`:
424	c = `'\b'`;
425	break;
426	case `'f'`:
427	c = `'\f'`;
428	break;
429	case `'n'`:
430	c = `'\n'`;
431	break;
432	case `'r'`:
433	c = `'\r'`;
434	break;
435	case `'t'`:
436	c = `'\t'`;
437	break;
438	case `'v'`:
439	c = `'\v'`;
440	break;
441	case `'x'`:
442	case `'u'`:
443	{
444	int h, n, i;
445	uint32_t c1;
446
447	if (*p == `'{'` && allow_utf16) {
448	p++;
449	c = `0`;
450	for(;;) {
451	h = from_hex(*p++);
452	if (h < `0`)
453	return -`1`;
454	c = (c << `4`) \| h;
455	if (c > `0x10FFFF`)
456	return -`1`;
457	if (*p == `'}'`)
458	break;
459	}
460	p++;
461	} else {
462	if (c == `'x'`) {
463	n = `2`;
464	} else {
465	n = `4`;
466	}
467
468	c = `0`;
469	for(i = `0`; i < n; i++) {
470	h = from_hex(*p++);
471	if (h < `0`) {
472	return -`1`;
473	}
474	c = (c << `4`) \| h;
475	}
476	if (is_hi_surrogate(c) &&
477	allow_utf16 == `2` && p[`0`] == `'\\'` && p[`1`] == `'u'`) {
478	/ convert an escaped surrogate pair into a*
479	unicode char /*
480	c1 = `0`;
481	for(i = `0`; i < `4`; i++) {
482	h = from_hex(p[`2` + i]);
483	if (h < `0`)
484	break;
485	c1 = (c1 << `4`) \| h;
486	}
487	if (i == `4` && is_lo_surrogate(c1)) {
488	p += `6`;
489	c = from_surrogate(c, c1);
490	}
491	}
492	}
493	}
494	break;
495	case `'0'`: case `'1'`: case `'2'`: case `'3'`:
496	case `'4'`: case `'5'`: case `'6'`: case `'7'`:
497	c -= `'0'`;
498	if (allow_utf16 == `2`) {
499	/ only accept \0 not followed by digit /
500	if (c != `0` \|\| is_digit(*p))
501	return -`1`;
502	} else {
503	/ parse a legacy octal sequence /
504	uint32_t v;
505	v = *p - `'0'`;
506	if (v > `7`)
507	break;
508	c = (c << `3`) \| v;
509	p++;
510	if (c >= `32`)
511	break;
512	v = *p - `'0'`;
513	if (v > `7`)
514	break;
515	c = (c << `3`) \| v;
516	p++;
517	}
518	break;
519	default:
520	return -`2`;
521	}
522	*pp = p;
523	return c;
524	}
525
526	#ifdef CONFIG_ALL_UNICODE
527	/ XXX: we use the same chars for name and value /
528	static BOOL is_unicode_char(int c)
529	{
530	return ((c >= `'0'` && c <= `'9'`) \|\|
531	(c >= `'A'` && c <= `'Z'`) \|\|
532	(c >= `'a'` && c <= `'z'`) \|\|
533	(c == `'_'`));
534	}
535
536	static int parse_unicode_property(REParseState s, CharRange cr,
537	const uint8_t **pp, BOOL is_inv)
538	{
539	const uint8_t *p;
540	char name[`64`], value[`64`];
541	char *q;
542	BOOL script_ext;
543	int ret;
544
545	p = *pp;
546	if (*p != `'{'`)
547	return re_parse_error(s, "expecting '{' after \\p");
548	p++;
549	q = name;
550	while (is_unicode_char(*p)) {
551	if ((q - name) >= sizeof(name) - `1`)
552	goto unknown_property_name;
553	q++ = p++;
554	}
555	*q = `'\0'`;
556	q = value;
557	if (*p == `'='`) {
558	p++;
559	while (is_unicode_char(*p)) {
560	if ((q - value) >= sizeof(value) - `1`)
561	return re_parse_error(s, "unknown unicode property value");
562	q++ = p++;
563	}
564	}
565	*q = `'\0'`;
566	if (*p != `'}'`)
567	return re_parse_error(s, "expecting '}'");
568	p++;
569	// printf("name=%s value=%s\n", name, value);
570
571	if (!strcmp(name, "Script") \|\| !strcmp(name, "sc")) {
572	script_ext = FALSE;
573	goto do_script;
574	} else if (!strcmp(name, "Script_Extensions") \|\| !strcmp(name, "scx")) {
575	script_ext = TRUE;
576	do_script:
577	cr_init(cr, s->opaque, lre_realloc);
578	ret = unicode_script(cr, value, script_ext);
579	if (ret) {
580	cr_free(cr);
581	if (ret == -`2`)
582	return re_parse_error(s, "unknown unicode script");
583	else
584	goto out_of_memory;
585	}
586	} else if (!strcmp(name, "General_Category") \|\| !strcmp(name, "gc")) {
587	cr_init(cr, s->opaque, lre_realloc);
588	ret = unicode_general_category(cr, value);
589	if (ret) {
590	cr_free(cr);
591	if (ret == -`2`)
592	return re_parse_error(s, "unknown unicode general category");
593	else
594	goto out_of_memory;
595	}
596	} else if (value[`0`] == `'\0'`) {
597	cr_init(cr, s->opaque, lre_realloc);
598	ret = unicode_general_category(cr, name);
599	if (ret == -`1`) {
600	cr_free(cr);
601	goto out_of_memory;
602	}
603	if (ret < `0`) {
604	ret = unicode_prop(cr, name);
605	if (ret) {
606	cr_free(cr);
607	if (ret == -`2`)
608	goto unknown_property_name;
609	else
610	goto out_of_memory;
611	}
612	}
613	} else {
614	unknown_property_name:
615	return re_parse_error(s, "unknown unicode property name");
616	}
617
618	if (is_inv) {
619	if (cr_invert(cr)) {
620	cr_free(cr);
621	return -`1`;
622	}
623	}
624	*pp = p;
625	return `0`;
626	out_of_memory:
627	return re_parse_out_of_memory(s);
628	}
629	#endif /* CONFIG_ALL_UNICODE */
630
631	/ return -1 if error otherwise the character or a class range*
632	(CLASS_RANGE_BASE). In case of class range, 'cr' is
633	initialized. Otherwise, it is ignored. /*
634	static int get_class_atom(REParseState s, CharRange cr,
635	const uint8_t **pp, BOOL inclass)
636	{
637	const uint8_t *p;
638	uint32_t c;
639	int ret;
640
641	p = *pp;
642
643	c = *p;
644	switch(c) {
645	case `'\\'`:
646	p++;
647	if (p >= s->buf_end)
648	goto unexpected_end;
649	c = *p++;
650	switch(c) {
651	case `'d'`:
652	c = CHAR_RANGE_d;
653	goto class_range;
654	case `'D'`:
655	c = CHAR_RANGE_D;
656	goto class_range;
657	case `'s'`:
658	c = CHAR_RANGE_s;
659	goto class_range;
660	case `'S'`:
661	c = CHAR_RANGE_S;
662	goto class_range;
663	case `'w'`:
664	c = CHAR_RANGE_w;
665	goto class_range;
666	case `'W'`:
667	c = CHAR_RANGE_W;
668	class_range:
669	if (cr_init_char_range(s, cr, c))
670	return -`1`;
671	c = CLASS_RANGE_BASE;
672	break;
673	case `'c'`:
674	c = *p;
675	if ((c >= `'a'` && c <= `'z'`) \|\|
676	(c >= `'A'` && c <= `'Z'`) \|\|
677	(((c >= `'0'` && c <= `'9'`) \|\| c == `'_'`) &&
678	inclass && !s->is_unicode)) { / Annex B.1.4 /
679	c &= `0x1f`;
680	p++;
681	} else if (s->is_unicode) {
682	goto invalid_escape;
683	} else {
684	/ otherwise return '\' and 'c' /
685	p--;
686	c = `'\\'`;
687	}
688	break;
689	case `'-'`:
690	if (!inclass && s->is_unicode)
691	goto invalid_escape;
692	break;
693	#ifdef CONFIG_ALL_UNICODE
694	case `'p'`:
695	case `'P'`:
696	if (s->is_unicode) {
697	if (parse_unicode_property(s, cr, &p, (c == `'P'`)))
698	return -`1`;
699	c = CLASS_RANGE_BASE;
700	break;
701	}
702	/ fall thru /
703	#endif
704	default:
705	p--;
706	ret = lre_parse_escape(&p, s->is_unicode * `2`);
707	if (ret >= `0`) {
708	c = ret;
709	} else {
710	if (ret == -`2` && p != `'\0'` && strchr("^$\\.+?()[]{}\|/", *p)) {
711	/ always valid to escape these characters /
712	goto normal_char;
713	} else if (s->is_unicode) {
714	invalid_escape:
715	return re_parse_error(s, "invalid escape sequence in regular expression");
716	} else {
717	/ just ignore the '\' /
718	goto normal_char;
719	}
720	}
721	break;
722	}
723	break;
724	case `'\0'`:
725	if (p >= s->buf_end) {
726	unexpected_end:
727	return re_parse_error(s, "unexpected end");
728	}
729	/ fall thru /
730	default:
731	normal_char:
732	/ normal char /
733	if (c >= `128`) {
734	c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
735	if ((unsigned)c > `0xffff` && !s->is_unicode) {
736	/ XXX: should handle non BMP-1 code points /
737	return re_parse_error(s, "malformed unicode char");
738	}
739	} else {
740	p++;
741	}
742	break;
743	}
744	*pp = p;
745	return c;
746	}
747
748	static int re_emit_range(REParseState s, const* CharRange *cr)
749	{
750	int len, i;
751	uint32_t high;
752
753	len = (unsigned)cr->len / `2`;
754	if (len >= `65535`)
755	return re_parse_error(s, "too many ranges");
756	if (len == `0`) {
757	/ not sure it can really happen. Emit a match that is always*
758	false /*
759	re_emit_op_u32(s, REOP_char32, -`1`);
760	} else {
761	high = cr->points[cr->len - `1`];
762	if (high == UINT32_MAX)
763	high = cr->points[cr->len - `2`];
764	if (high <= `0xffff`) {
765	/ can use 16 bit ranges with the conversion that 0xffff =*
766	infinity /*
767	re_emit_op_u16(s, REOP_range, len);
768	for(i = `0`; i < cr->len; i += `2`) {
769	dbuf_put_u16(&s->byte_code, cr->points[i]);
770	high = cr->points[i + `1`] - `1`;
771	if (high == UINT32_MAX - `1`)
772	high = `0xffff`;
773	dbuf_put_u16(&s->byte_code, high);
774	}
775	} else {
776	re_emit_op_u16(s, REOP_range32, len);
777	for(i = `0`; i < cr->len; i += `2`) {
778	dbuf_put_u32(&s->byte_code, cr->points[i]);
779	dbuf_put_u32(&s->byte_code, cr->points[i + `1`] - `1`);
780	}
781	}
782	}
783	return `0`;
784	}
785
786	static int re_parse_char_class(REParseState s, const* uint8_t **pp)
787	{
788	const uint8_t *p;
789	uint32_t c1, c2;
790	CharRange cr_s, *cr = &cr_s;
791	CharRange cr1_s, *cr1 = &cr1_s;
792	BOOL invert;
793
794	cr_init(cr, s->opaque, lre_realloc);
795	p = *pp;
796	p++; / skip '[' /
797
798	invert = FALSE;
799	if (*p == `'^'`) {
800	p++;
801	invert = TRUE;
802	}
803
804	for(;;) {
805	if (*p == `']'`)
806	break;
807	c1 = get_class_atom(s, cr1, &p, TRUE);
808	if ((int)c1 < `0`)
809	goto fail;
810	if (*p == `'-'` && p[`1`] != `']'`) {
811	const uint8_t *p0 = p + `1`;
812	if (c1 >= CLASS_RANGE_BASE) {
813	if (s->is_unicode) {
814	cr_free(cr1);
815	goto invalid_class_range;
816	}
817	/ Annex B: match '-' character /
818	goto class_atom;
819	}
820	c2 = get_class_atom(s, cr1, &p0, TRUE);
821	if ((int)c2 < `0`)
822	goto fail;
823	if (c2 >= CLASS_RANGE_BASE) {
824	cr_free(cr1);
825	if (s->is_unicode) {
826	goto invalid_class_range;
827	}
828	/ Annex B: match '-' character /
829	goto class_atom;
830	}
831	p = p0;
832	if (c2 < c1) {
833	invalid_class_range:
834	re_parse_error(s, "invalid class range");
835	goto fail;
836	}
837	if (cr_union_interval(cr, c1, c2))
838	goto memory_error;
839	} else {
840	class_atom:
841	if (c1 >= CLASS_RANGE_BASE) {
842	int ret;
843	ret = cr_union1(cr, cr1->points, cr1->len);
844	cr_free(cr1);
845	if (ret)
846	goto memory_error;
847	} else {
848	if (cr_union_interval(cr, c1, c1))
849	goto memory_error;
850	}
851	}
852	}
853	if (s->ignore_case) {
854	if (cr_regexp_canonicalize(cr, s->is_unicode))
855	goto memory_error;
856	}
857	if (invert) {
858	if (cr_invert(cr))
859	goto memory_error;
860	}
861	if (re_emit_range(s, cr))
862	goto fail;
863	cr_free(cr);
864	p++; / skip ']' /
865	*pp = p;
866	return `0`;
867	memory_error:
868	re_parse_out_of_memory(s);
869	fail:
870	cr_free(cr);
871	return -`1`;
872	}
873
874	/ Return:*
875	- true if the opcodes may not advance the char pointer
876	- false if the opcodes always advance the char pointer
877	*/
878	static BOOL re_need_check_advance(const uint8_t bc_buf, int* bc_buf_len)
879	{
880	int pos, opcode, len;
881	uint32_t val;
882	BOOL ret;
883
884	ret = TRUE;
885	pos = `0`;
886	while (pos < bc_buf_len) {
887	opcode = bc_buf[pos];
888	len = reopcode_info[opcode].size;
889	switch(opcode) {
890	case REOP_range:
891	val = get_u16(bc_buf + pos + `1`);
892	len += val * `4`;
893	goto simple_char;
894	case REOP_range32:
895	val = get_u16(bc_buf + pos + `1`);
896	len += val * `8`;
897	goto simple_char;
898	case REOP_char:
899	case REOP_char32:
900	case REOP_dot:
901	case REOP_any:
902	simple_char:
903	ret = FALSE;
904	break;
905	case REOP_line_start:
906	case REOP_line_end:
907	case REOP_push_i32:
908	case REOP_push_char_pos:
909	case REOP_drop:
910	case REOP_word_boundary:
911	case REOP_not_word_boundary:
912	case REOP_prev:
913	/ no effect /
914	break;
915	case REOP_save_start:
916	case REOP_save_end:
917	case REOP_save_reset:
918	case REOP_back_reference:
919	case REOP_backward_back_reference:
920	break;
921	default:
922	/ safe behavior: we cannot predict the outcome /
923	return TRUE;
924	}
925	pos += len;
926	}
927	return ret;
928	}
929
930	/ return -1 if a simple quantifier cannot be used. Otherwise return*
931	the number of characters in the atom. /*
932	static int re_is_simple_quantifier(const uint8_t bc_buf, int* bc_buf_len)
933	{
934	int pos, opcode, len, count;
935	uint32_t val;
936
937	count = `0`;
938	pos = `0`;
939	while (pos < bc_buf_len) {
940	opcode = bc_buf[pos];
941	len = reopcode_info[opcode].size;
942	switch(opcode) {
943	case REOP_range:
944	val = get_u16(bc_buf + pos + `1`);
945	len += val * `4`;
946	goto simple_char;
947	case REOP_range32:
948	val = get_u16(bc_buf + pos + `1`);
949	len += val * `8`;
950	goto simple_char;
951	case REOP_char:
952	case REOP_char32:
953	case REOP_dot:
954	case REOP_any:
955	simple_char:
956	count++;
957	break;
958	case REOP_line_start:
959	case REOP_line_end:
960	case REOP_word_boundary:
961	case REOP_not_word_boundary:
962	break;
963	default:
964	return -`1`;
965	}
966	pos += len;
967	}
968	return count;
969	}
970
971	/ 'pp' is the first char after '<' /*
972	static int re_parse_group_name(char buf, int* buf_size, const uint8_t **pp)
973	{
974	const uint8_t p, p1;
975	uint32_t c, d;
976	char *q;
977
978	p = *pp;
979	q = buf;
980	for(;;) {
981	c = *p;
982	if (c == `'\\'`) {
983	p++;
984	if (*p != `'u'`)
985	return -`1`;
986	c = lre_parse_escape(&p, `2`); // accept surrogate pairs
987	} else if (c == `'>'`) {
988	break;
989	} else if (c >= `128`) {
990	c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
991	if (is_hi_surrogate(c)) {
992	d = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1);
993	if (is_lo_surrogate(d)) {
994	c = from_surrogate(c, d);
995	p = p1;
996	}
997	}
998	} else {
999	p++;
1000	}
1001	if (c > `0x10FFFF`)
1002	return -`1`;
1003	if (q == buf) {
1004	if (!lre_js_is_ident_first(c))
1005	return -`1`;
1006	} else {
1007	if (!lre_js_is_ident_next(c))
1008	return -`1`;
1009	}
1010	if ((q - buf + UTF8_CHAR_LEN_MAX + `1`) > buf_size)
1011	return -`1`;
1012	if (c < `128`) {
1013	*q++ = c;
1014	} else {
1015	q += unicode_to_utf8((uint8_t*)q, c);
1016	}
1017	}
1018	if (q == buf)
1019	return -`1`;
1020	*q = `'\0'`;
1021	p++;
1022	*pp = p;
1023	return `0`;
1024	}
1025
1026	/ if capture_name = NULL: return the number of captures + 1.*
1027	Otherwise, return the capture index corresponding to capture_name
1028	or -1 if none /*
1029	static int re_parse_captures(REParseState s, int* *phas_named_captures,
1030	const char *capture_name)
1031	{
1032	const uint8_t *p;
1033	int capture_index;
1034	char name[TMP_BUF_SIZE];
1035
1036	capture_index = `1`;
1037	*phas_named_captures = `0`;
1038	for (p = s->buf_start; p < s->buf_end; p++) {
1039	switch (*p) {
1040	case `'('`:
1041	if (p[`1`] == `'?'`) {
1042	if (p[`2`] == `'<'` && p[`3`] != `'='` && p[`3`] != `'!'`) {
1043	*phas_named_captures = `1`;
1044	/ potential named capture /
1045	if (capture_name) {
1046	p += `3`;
1047	if (re_parse_group_name(name, sizeof(name), &p) == `0`) {
1048	if (!strcmp(name, capture_name))
1049	return capture_index;
1050	}
1051	}
1052	capture_index++;
1053	if (capture_index >= CAPTURE_COUNT_MAX)
1054	goto done;
1055	}
1056	} else {
1057	capture_index++;
1058	if (capture_index >= CAPTURE_COUNT_MAX)
1059	goto done;
1060	}
1061	break;
1062	case `'\\'`:
1063	p++;
1064	break;
1065	case `'['`:
1066	for (p += `1` + (p == `']'`); p < s->buf_end && p != `']'`; p++) {
1067	if (*p == `'\\'`)
1068	p++;
1069	}
1070	break;
1071	}
1072	}
1073	done:
1074	if (capture_name)
1075	return -`1`;
1076	else
1077	return capture_index;
1078	}
1079
1080	static int re_count_captures(REParseState *s)
1081	{
1082	if (s->total_capture_count < `0`) {
1083	s->total_capture_count = re_parse_captures(s, &s->has_named_captures,
1084	NULL);
1085	}
1086	return s->total_capture_count;
1087	}
1088
1089	static BOOL re_has_named_captures(REParseState *s)
1090	{
1091	if (s->has_named_captures < `0`)
1092	re_count_captures(s);
1093	return s->has_named_captures;
1094	}
1095
1096	static int find_group_name(REParseState s, const* char *name)
1097	{
1098	const char p, buf_end;
1099	size_t len, name_len;
1100	int capture_index;
1101
1102	p = (char *)s->group_names.buf;
1103	if (!p) return -`1`;
1104	buf_end = (char *)s->group_names.buf + s->group_names.size;
1105	name_len = strlen(name);
1106	capture_index = `1`;
1107	while (p < buf_end) {
1108	len = strlen(p);
1109	if (len == name_len && memcmp(name, p, name_len) == `0`)
1110	return capture_index;
1111	p += len + `1`;
1112	capture_index++;
1113	}
1114	return -`1`;
1115	}
1116
1117	static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir);
1118
1119	static int re_parse_term(REParseState *s, BOOL is_backward_dir)
1120	{
1121	const uint8_t *p;
1122	int c, last_atom_start, quant_min, quant_max, last_capture_count;
1123	BOOL greedy, add_zero_advance_check, is_neg, is_backward_lookahead;
1124	CharRange cr_s, *cr = &cr_s;
1125
1126	last_atom_start = -`1`;
1127	last_capture_count = `0`;
1128	p = s->buf_ptr;
1129	c = *p;
1130	switch(c) {
1131	case `'^'`:
1132	p++;
1133	re_emit_op(s, REOP_line_start);
1134	break;
1135	case `'$'`:
1136	p++;
1137	re_emit_op(s, REOP_line_end);
1138	break;
1139	case `'.'`:
1140	p++;
1141	last_atom_start = s->byte_code.size;
1142	last_capture_count = s->capture_count;
1143	if (is_backward_dir)
1144	re_emit_op(s, REOP_prev);
1145	re_emit_op(s, s->dotall ? REOP_any : REOP_dot);
1146	if (is_backward_dir)
1147	re_emit_op(s, REOP_prev);
1148	break;
1149	case `'{'`:
1150	if (s->is_unicode) {
1151	return re_parse_error(s, "syntax error");
1152	} else if (!is_digit(p[`1`])) {
1153	/ Annex B: we accept '{' not followed by digits as a*
1154	normal atom /*
1155	goto parse_class_atom;
1156	} else {
1157	const uint8_t *p1 = p + `1`;
1158	/ Annex B: error if it is like a repetition count /
1159	parse_digits(&p1, TRUE);
1160	if (*p1 == `','`) {
1161	p1++;
1162	if (is_digit(*p1)) {
1163	parse_digits(&p1, TRUE);
1164	}
1165	}
1166	if (*p1 != `'}'`) {
1167	goto parse_class_atom;
1168	}
1169	}
1170	/ fall thru /
1171	case `'*'`:
1172	case `'+'`:
1173	case `'?'`:
1174	return re_parse_error(s, "nothing to repeat");
1175	case `'('`:
1176	if (p[`1`] == `'?'`) {
1177	if (p[`2`] == `':'`) {
1178	p += `3`;
1179	last_atom_start = s->byte_code.size;
1180	last_capture_count = s->capture_count;
1181	s->buf_ptr = p;
1182	if (re_parse_disjunction(s, is_backward_dir))
1183	return -`1`;
1184	p = s->buf_ptr;
1185	if (re_parse_expect(s, &p, `')'`))
1186	return -`1`;
1187	} else if ((p[`2`] == `'='` \|\| p[`2`] == `'!'`)) {
1188	is_neg = (p[`2`] == `'!'`);
1189	is_backward_lookahead = FALSE;
1190	p += `3`;
1191	goto lookahead;
1192	} else if (p[`2`] == `'<'` &&
1193	(p[`3`] == `'='` \|\| p[`3`] == `'!'`)) {
1194	int pos;
1195	is_neg = (p[`3`] == `'!'`);
1196	is_backward_lookahead = TRUE;
1197	p += `4`;
1198	/ lookahead /
1199	lookahead:
1200	/ Annex B allows lookahead to be used as an atom for*
1201	the quantifiers /*
1202	if (!s->is_unicode && !is_backward_lookahead) {
1203	last_atom_start = s->byte_code.size;
1204	last_capture_count = s->capture_count;
1205	}
1206	pos = re_emit_op_u32(s, REOP_lookahead + is_neg, `0`);
1207	s->buf_ptr = p;
1208	if (re_parse_disjunction(s, is_backward_lookahead))
1209	return -`1`;
1210	p = s->buf_ptr;
1211	if (re_parse_expect(s, &p, `')'`))
1212	return -`1`;
1213	re_emit_op(s, REOP_match);
1214	/ jump after the 'match' after the lookahead is successful /
1215	if (dbuf_error(&s->byte_code))
1216	return -`1`;
1217	put_u32(s->byte_code.buf + pos, s->byte_code.size - (pos + `4`));
1218	} else if (p[`2`] == `'<'`) {
1219	p += `3`;
1220	if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
1221	&p)) {
1222	return re_parse_error(s, "invalid group name");
1223	}
1224	if (find_group_name(s, s->u.tmp_buf) > `0`) {
1225	return re_parse_error(s, "duplicate group name");
1226	}
1227	/ group name with a trailing zero /
1228	dbuf_put(&s->group_names, (uint8_t *)s->u.tmp_buf,
1229	strlen(s->u.tmp_buf) + `1`);
1230	s->has_named_captures = `1`;
1231	goto parse_capture;
1232	} else {
1233	return re_parse_error(s, "invalid group");
1234	}
1235	} else {
1236	int capture_index;
1237	p++;
1238	/ capture without group name /
1239	dbuf_putc(&s->group_names, `0`);
1240	parse_capture:
1241	if (s->capture_count >= CAPTURE_COUNT_MAX)
1242	return re_parse_error(s, "too many captures");
1243	last_atom_start = s->byte_code.size;
1244	last_capture_count = s->capture_count;
1245	capture_index = s->capture_count++;
1246	re_emit_op_u8(s, REOP_save_start + is_backward_dir,
1247	capture_index);
1248
1249	s->buf_ptr = p;
1250	if (re_parse_disjunction(s, is_backward_dir))
1251	return -`1`;
1252	p = s->buf_ptr;
1253
1254	re_emit_op_u8(s, REOP_save_start + `1` - is_backward_dir,
1255	capture_index);
1256
1257	if (re_parse_expect(s, &p, `')'`))
1258	return -`1`;
1259	}
1260	break;
1261	case `'\\'`:
1262	switch(p[`1`]) {
1263	case `'b'`:
1264	case `'B'`:
1265	re_emit_op(s, REOP_word_boundary + (p[`1`] != `'b'`));
1266	p += `2`;
1267	break;
1268	case `'k'`:
1269	{
1270	const uint8_t *p1;
1271	int dummy_res;
1272
1273	p1 = p;
1274	if (p1[`2`] != `'<'`) {
1275	/ annex B: we tolerate invalid group names in non*
1276	unicode mode if there is no named capture
1277	definition /*
1278	if (s->is_unicode \|\| re_has_named_captures(s))
1279	return re_parse_error(s, "expecting group name");
1280	else
1281	goto parse_class_atom;
1282	}
1283	p1 += `3`;
1284	if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
1285	&p1)) {
1286	if (s->is_unicode \|\| re_has_named_captures(s))
1287	return re_parse_error(s, "invalid group name");
1288	else
1289	goto parse_class_atom;
1290	}
1291	c = find_group_name(s, s->u.tmp_buf);
1292	if (c < `0`) {
1293	/ no capture name parsed before, try to look*
1294	after (inefficient, but hopefully not common /*
1295	c = re_parse_captures(s, &dummy_res, s->u.tmp_buf);
1296	if (c < `0`) {
1297	if (s->is_unicode \|\| re_has_named_captures(s))
1298	return re_parse_error(s, "group name not defined");
1299	else
1300	goto parse_class_atom;
1301	}
1302	}
1303	p = p1;
1304	}
1305	goto emit_back_reference;
1306	case `'0'`:
1307	p += `2`;
1308	c = `0`;
1309	if (s->is_unicode) {
1310	if (is_digit(*p)) {
1311	return re_parse_error(s, "invalid decimal escape in regular expression");
1312	}
1313	} else {
1314	/ Annex B.1.4: accept legacy octal /
1315	if (p >= `'0'` && p <= `'7'`) {
1316	c = *p++ - `'0'`;
1317	if (p >= `'0'` && p <= `'7'`) {
1318	c = (c << `3`) + *p++ - `'0'`;
1319	}
1320	}
1321	}
1322	goto normal_char;
1323	case `'1'`: case `'2'`: case `'3'`: case `'4'`:
1324	case `'5'`: case `'6'`: case `'7'`: case `'8'`:
1325	case `'9'`:
1326	{
1327	const uint8_t *q = ++p;
1328
1329	c = parse_digits(&p, FALSE);
1330	if (c < `0` \|\| (c >= s->capture_count && c >= re_count_captures(s))) {
1331	if (!s->is_unicode) {
1332	/ Annex B.1.4: accept legacy octal /
1333	p = q;
1334	if (*p <= `'7'`) {
1335	c = `0`;
1336	if (*p <= `'3'`)
1337	c = *p++ - `'0'`;
1338	if (p >= `'0'` && p <= `'7'`) {
1339	c = (c << `3`) + *p++ - `'0'`;
1340	if (p >= `'0'` && p <= `'7'`) {
1341	c = (c << `3`) + *p++ - `'0'`;
1342	}
1343	}
1344	} else {
1345	c = *p++;
1346	}
1347	goto normal_char;
1348	}
1349	return re_parse_error(s, "back reference out of range in regular expression");
1350	}
1351	emit_back_reference:
1352	last_atom_start = s->byte_code.size;
1353	last_capture_count = s->capture_count;
1354	re_emit_op_u8(s, REOP_back_reference + is_backward_dir, c);
1355	}
1356	break;
1357	default:
1358	goto parse_class_atom;
1359	}
1360	break;
1361	case `'['`:
1362	last_atom_start = s->byte_code.size;
1363	last_capture_count = s->capture_count;
1364	if (is_backward_dir)
1365	re_emit_op(s, REOP_prev);
1366	if (re_parse_char_class(s, &p))
1367	return -`1`;
1368	if (is_backward_dir)
1369	re_emit_op(s, REOP_prev);
1370	break;
1371	case `']'`:
1372	case `'}'`:
1373	if (s->is_unicode)
1374	return re_parse_error(s, "syntax error");
1375	goto parse_class_atom;
1376	default:
1377	parse_class_atom:
1378	c = get_class_atom(s, cr, &p, FALSE);
1379	if ((int)c < `0`)
1380	return -`1`;
1381	normal_char:
1382	last_atom_start = s->byte_code.size;
1383	last_capture_count = s->capture_count;
1384	if (is_backward_dir)
1385	re_emit_op(s, REOP_prev);
1386	if (c >= CLASS_RANGE_BASE) {
1387	int ret;
1388	/ Note: canonicalization is not needed /
1389	ret = re_emit_range(s, cr);
1390	cr_free(cr);
1391	if (ret)
1392	return -`1`;
1393	} else {
1394	if (s->ignore_case)
1395	c = lre_canonicalize(c, s->is_unicode);
1396	if (c <= `0xffff`)
1397	re_emit_op_u16(s, REOP_char, c);
1398	else
1399	re_emit_op_u32(s, REOP_char32, c);
1400	}
1401	if (is_backward_dir)
1402	re_emit_op(s, REOP_prev);
1403	break;
1404	}
1405
1406	/ quantifier /
1407	if (last_atom_start >= `0`) {
1408	c = *p;
1409	switch(c) {
1410	case `'*'`:
1411	p++;
1412	quant_min = `0`;
1413	quant_max = INT32_MAX;
1414	goto quantifier;
1415	case `'+'`:
1416	p++;
1417	quant_min = `1`;
1418	quant_max = INT32_MAX;
1419	goto quantifier;
1420	case `'?'`:
1421	p++;
1422	quant_min = `0`;
1423	quant_max = `1`;
1424	goto quantifier;
1425	case `'{'`:
1426	{
1427	const uint8_t *p1 = p;
1428	/ As an extension (see ES6 annex B), we accept '{' not*
1429	followed by digits as a normal atom /*
1430	if (!is_digit(p[`1`])) {
1431	if (s->is_unicode)
1432	goto invalid_quant_count;
1433	break;
1434	}
1435	p++;
1436	quant_min = parse_digits(&p, TRUE);
1437	quant_max = quant_min;
1438	if (*p == `','`) {
1439	p++;
1440	if (is_digit(*p)) {
1441	quant_max = parse_digits(&p, TRUE);
1442	if (quant_max < quant_min) {
1443	invalid_quant_count:
1444	return re_parse_error(s, "invalid repetition count");
1445	}
1446	} else {
1447	quant_max = INT32_MAX; / infinity /
1448	}
1449	}
1450	if (*p != `'}'` && !s->is_unicode) {
1451	/ Annex B: normal atom if invalid '{' syntax /
1452	p = p1;
1453	break;
1454	}
1455	if (re_parse_expect(s, &p, `'}'`))
1456	return -`1`;
1457	}
1458	quantifier:
1459	greedy = TRUE;
1460	if (*p == `'?'`) {
1461	p++;
1462	greedy = FALSE;
1463	}
1464	if (last_atom_start < `0`) {
1465	return re_parse_error(s, "nothing to repeat");
1466	}
1467	if (greedy) {
1468	int len, pos;
1469
1470	if (quant_max > `0`) {
1471	/ specific optimization for simple quantifiers /
1472	if (dbuf_error(&s->byte_code))
1473	goto out_of_memory;
1474	len = re_is_simple_quantifier(s->byte_code.buf + last_atom_start,
1475	s->byte_code.size - last_atom_start);
1476	if (len > `0`) {
1477	re_emit_op(s, REOP_match);
1478
1479	if (dbuf_insert(&s->byte_code, last_atom_start, `17`))
1480	goto out_of_memory;
1481	pos = last_atom_start;
1482	s->byte_code.buf[pos++] = REOP_simple_greedy_quant;
1483	put_u32(&s->byte_code.buf[pos],
1484	s->byte_code.size - last_atom_start - `17`);
1485	pos += `4`;
1486	put_u32(&s->byte_code.buf[pos], quant_min);
1487	pos += `4`;
1488	put_u32(&s->byte_code.buf[pos], quant_max);
1489	pos += `4`;
1490	put_u32(&s->byte_code.buf[pos], len);
1491	pos += `4`;
1492	goto done;
1493	}
1494	}
1495
1496	if (dbuf_error(&s->byte_code))
1497	goto out_of_memory;
1498	}
1499	/ the spec tells that if there is no advance when*
1500	running the atom after the first quant_min times,
1501	then there is no match. We remove this test when we
1502	are sure the atom always advances the position. /*
1503	add_zero_advance_check = re_need_check_advance(s->byte_code.buf + last_atom_start,
1504	s->byte_code.size - last_atom_start);
1505
1506	{
1507	int len, pos;
1508	len = s->byte_code.size - last_atom_start;
1509	if (quant_min == `0`) {
1510	/ need to reset the capture in case the atom is*
1511	not executed /*
1512	if (last_capture_count != s->capture_count) {
1513	if (dbuf_insert(&s->byte_code, last_atom_start, `3`))
1514	goto out_of_memory;
1515	s->byte_code.buf[last_atom_start++] = REOP_save_reset;
1516	s->byte_code.buf[last_atom_start++] = last_capture_count;
1517	s->byte_code.buf[last_atom_start++] = s->capture_count - `1`;
1518	}
1519	if (quant_max == `0`) {
1520	s->byte_code.size = last_atom_start;
1521	} else if (quant_max == `1` \|\| quant_max == INT32_MAX) {
1522	BOOL has_goto = (quant_max == INT32_MAX);
1523	if (dbuf_insert(&s->byte_code, last_atom_start, `5` + add_zero_advance_check))
1524	goto out_of_memory;
1525	s->byte_code.buf[last_atom_start] = REOP_split_goto_first +
1526	greedy;
1527	put_u32(s->byte_code.buf + last_atom_start + `1`,
1528	len + `5` * has_goto + add_zero_advance_check * `2`);
1529	if (add_zero_advance_check) {
1530	s->byte_code.buf[last_atom_start + `1` + `4`] = REOP_push_char_pos;
1531	re_emit_op(s, REOP_check_advance);
1532	}
1533	if (has_goto)
1534	re_emit_goto(s, REOP_goto, last_atom_start);
1535	} else {
1536	if (dbuf_insert(&s->byte_code, last_atom_start, `10` + add_zero_advance_check))
1537	goto out_of_memory;
1538	pos = last_atom_start;
1539	s->byte_code.buf[pos++] = REOP_push_i32;
1540	put_u32(s->byte_code.buf + pos, quant_max);
1541	pos += `4`;
1542	s->byte_code.buf[pos++] = REOP_split_goto_first + greedy;
1543	put_u32(s->byte_code.buf + pos, len + `5` + add_zero_advance_check * `2`);
1544	pos += `4`;
1545	if (add_zero_advance_check) {
1546	s->byte_code.buf[pos++] = REOP_push_char_pos;
1547	re_emit_op(s, REOP_check_advance);
1548	}
1549	re_emit_goto(s, REOP_loop, last_atom_start + `5`);
1550	re_emit_op(s, REOP_drop);
1551	}
1552	} else if (quant_min == `1` && quant_max == INT32_MAX &&
1553	!add_zero_advance_check) {
1554	re_emit_goto(s, REOP_split_next_first - greedy,
1555	last_atom_start);
1556	} else {
1557	if (quant_min == `1`) {
1558	/ nothing to add /
1559	} else {
1560	if (dbuf_insert(&s->byte_code, last_atom_start, `5`))
1561	goto out_of_memory;
1562	s->byte_code.buf[last_atom_start] = REOP_push_i32;
1563	put_u32(s->byte_code.buf + last_atom_start + `1`,
1564	quant_min);
1565	last_atom_start += `5`;
1566	re_emit_goto(s, REOP_loop, last_atom_start);
1567	re_emit_op(s, REOP_drop);
1568	}
1569	if (quant_max == INT32_MAX) {
1570	pos = s->byte_code.size;
1571	re_emit_op_u32(s, REOP_split_goto_first + greedy,
1572	len + `5` + add_zero_advance_check * `2`);
1573	if (add_zero_advance_check)
1574	re_emit_op(s, REOP_push_char_pos);
1575	/ copy the atom /
1576	dbuf_put_self(&s->byte_code, last_atom_start, len);
1577	if (add_zero_advance_check)
1578	re_emit_op(s, REOP_check_advance);
1579	re_emit_goto(s, REOP_goto, pos);
1580	} else if (quant_max > quant_min) {
1581	re_emit_op_u32(s, REOP_push_i32, quant_max - quant_min);
1582	pos = s->byte_code.size;
1583	re_emit_op_u32(s, REOP_split_goto_first + greedy,
1584	len + `5` + add_zero_advance_check * `2`);
1585	if (add_zero_advance_check)
1586	re_emit_op(s, REOP_push_char_pos);
1587	/ copy the atom /
1588	dbuf_put_self(&s->byte_code, last_atom_start, len);
1589	if (add_zero_advance_check)
1590	re_emit_op(s, REOP_check_advance);
1591	re_emit_goto(s, REOP_loop, pos);
1592	re_emit_op(s, REOP_drop);
1593	}
1594	}
1595	last_atom_start = -`1`;
1596	}
1597	break;
1598	default:
1599	break;
1600	}
1601	}
1602	done:
1603	s->buf_ptr = p;
1604	return `0`;
1605	out_of_memory:
1606	return re_parse_out_of_memory(s);
1607	}
1608
1609	static int re_parse_alternative(REParseState *s, BOOL is_backward_dir)
1610	{
1611	const uint8_t *p;
1612	int ret;
1613	size_t start, term_start, end, term_size;
1614
1615	start = s->byte_code.size;
1616	for(;;) {
1617	p = s->buf_ptr;
1618	if (p >= s->buf_end)
1619	break;
1620	if (p == `'\|'` \|\| p == `')'`)
1621	break;
1622	term_start = s->byte_code.size;
1623	ret = re_parse_term(s, is_backward_dir);
1624	if (ret)
1625	return ret;
1626	if (is_backward_dir) {
1627	/ reverse the order of the terms (XXX: inefficient, but*
1628	speed is not really critical here) /*
1629	end = s->byte_code.size;
1630	term_size = end - term_start;
1631	if (dbuf_realloc(&s->byte_code, end + term_size))
1632	return -`1`;
1633	memmove(s->byte_code.buf + start + term_size,
1634	s->byte_code.buf + start,
1635	end - start);
1636	memcpy(s->byte_code.buf + start, s->byte_code.buf + end,
1637	term_size);
1638	}
1639	}
1640	return `0`;
1641	}
1642
1643	static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir)
1644	{
1645	int start, len, pos;
1646
1647	if (lre_check_stack_overflow(s->opaque, `0`))
1648	return re_parse_error(s, "stack overflow");
1649
1650	start = s->byte_code.size;
1651	if (re_parse_alternative(s, is_backward_dir))
1652	return -`1`;
1653	while (*s->buf_ptr == `'\|'`) {
1654	s->buf_ptr++;
1655
1656	len = s->byte_code.size - start;
1657
1658	/ insert a split before the first alternative /
1659	if (dbuf_insert(&s->byte_code, start, `5`)) {
1660	return re_parse_out_of_memory(s);
1661	}
1662	s->byte_code.buf[start] = REOP_split_next_first;
1663	put_u32(s->byte_code.buf + start + `1`, len + `5`);
1664
1665	pos = re_emit_op_u32(s, REOP_goto, `0`);
1666
1667	if (re_parse_alternative(s, is_backward_dir))
1668	return -`1`;
1669
1670	/ patch the goto /
1671	len = s->byte_code.size - (pos + `4`);
1672	put_u32(s->byte_code.buf + pos, len);
1673	}
1674	return `0`;
1675	}
1676
1677	/ the control flow is recursive so the analysis can be linear /
1678	static int compute_stack_size(const uint8_t bc_buf, int* bc_buf_len)
1679	{
1680	int stack_size, stack_size_max, pos, opcode, len;
1681	uint32_t val;
1682
1683	stack_size = `0`;
1684	stack_size_max = `0`;
1685	bc_buf += RE_HEADER_LEN;
1686	bc_buf_len -= RE_HEADER_LEN;
1687	pos = `0`;
1688	while (pos < bc_buf_len) {
1689	opcode = bc_buf[pos];
1690	len = reopcode_info[opcode].size;
1691	assert(opcode < REOP_COUNT);
1692	assert((pos + len) <= bc_buf_len);
1693	switch(opcode) {
1694	case REOP_push_i32:
1695	case REOP_push_char_pos:
1696	stack_size++;
1697	if (stack_size > stack_size_max) {
1698	if (stack_size > STACK_SIZE_MAX)
1699	return -`1`;
1700	stack_size_max = stack_size;
1701	}
1702	break;
1703	case REOP_drop:
1704	case REOP_check_advance:
1705	assert(stack_size > `0`);
1706	stack_size--;
1707	break;
1708	case REOP_range:
1709	val = get_u16(bc_buf + pos + `1`);
1710	len += val * `4`;
1711	break;
1712	case REOP_range32:
1713	val = get_u16(bc_buf + pos + `1`);
1714	len += val * `8`;
1715	break;
1716	}
1717	pos += len;
1718	}
1719	return stack_size_max;
1720	}
1721
1722	/ 'buf' must be a zero terminated UTF-8 string of length buf_len.*
1723	Return NULL if error and allocate an error message in perror_msg,*
1724	otherwise the compiled bytecode and its length in plen.
1725	*/
1726	uint8_t lre_compile(int* plen, char* error_msg, int* error_msg_size,
1727	const char buf, size_t buf_len, int* re_flags,
1728	void *opaque)
1729	{
1730	REParseState s_s, *s = &s_s;
1731	int stack_size;
1732	BOOL is_sticky;
1733
1734	memset(s, `0`, sizeof(*s));
1735	s->opaque = opaque;
1736	s->buf_ptr = (const uint8_t *)buf;
1737	s->buf_end = s->buf_ptr + buf_len;
1738	s->buf_start = s->buf_ptr;
1739	s->re_flags = re_flags;
1740	s->is_unicode = ((re_flags & LRE_FLAG_UNICODE) != `0`);
1741	is_sticky = ((re_flags & LRE_FLAG_STICKY) != `0`);
1742	s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != `0`);
1743	s->dotall = ((re_flags & LRE_FLAG_DOTALL) != `0`);
1744	s->capture_count = `1`;
1745	s->total_capture_count = -`1`;
1746	s->has_named_captures = -`1`;
1747
1748	dbuf_init2(&s->byte_code, opaque, lre_realloc);
1749	dbuf_init2(&s->group_names, opaque, lre_realloc);
1750
1751	dbuf_putc(&s->byte_code, re_flags); / first element is the flags /
1752	dbuf_putc(&s->byte_code, `0`); / second element is the number of captures /
1753	dbuf_putc(&s->byte_code, `0`); / stack size /
1754	dbuf_put_u32(&s->byte_code, `0`); / bytecode length /
1755
1756	if (!is_sticky) {
1757	/ iterate thru all positions (about the same as .?( ... ) )
1758	. We do it without an explicit loop so that lock step
1759	thread execution will be possible in an optimized
1760	implementation /*
1761	re_emit_op_u32(s, REOP_split_goto_first, `1` + `5`);
1762	re_emit_op(s, REOP_any);
1763	re_emit_op_u32(s, REOP_goto, -(`5` + `1` + `5`));
1764	}
1765	re_emit_op_u8(s, REOP_save_start, `0`);
1766
1767	if (re_parse_disjunction(s, FALSE)) {
1768	error:
1769	dbuf_free(&s->byte_code);
1770	dbuf_free(&s->group_names);
1771	pstrcpy(error_msg, error_msg_size, s->u.error_msg);
1772	*plen = `0`;
1773	return NULL;
1774	}
1775
1776	re_emit_op_u8(s, REOP_save_end, `0`);
1777
1778	re_emit_op(s, REOP_match);
1779
1780	if (*s->buf_ptr != `'\0'`) {
1781	re_parse_error(s, "extraneous characters at the end");
1782	goto error;
1783	}
1784
1785	if (dbuf_error(&s->byte_code)) {
1786	re_parse_out_of_memory(s);
1787	goto error;
1788	}
1789
1790	stack_size = compute_stack_size(s->byte_code.buf, s->byte_code.size);
1791	if (stack_size < `0`) {
1792	re_parse_error(s, "too many imbricated quantifiers");
1793	goto error;
1794	}
1795
1796	s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count;
1797	s->byte_code.buf[RE_HEADER_STACK_SIZE] = stack_size;
1798	put_u32(s->byte_code.buf + RE_HEADER_BYTECODE_LEN,
1799	s->byte_code.size - RE_HEADER_LEN);
1800
1801	/ add the named groups if needed /
1802	if (s->group_names.size > (s->capture_count - `1`)) {
1803	dbuf_put(&s->byte_code, s->group_names.buf, s->group_names.size);
1804	s->byte_code.buf[RE_HEADER_FLAGS] \|= LRE_FLAG_NAMED_GROUPS;
1805	}
1806	dbuf_free(&s->group_names);
1807
1808	#ifdef DUMP_REOP
1809	lre_dump_bytecode(s->byte_code.buf, s->byte_code.size);
1810	#endif
1811
1812	error_msg[`0`] = `'\0'`;
1813	*plen = s->byte_code.size;
1814	return s->byte_code.buf;
1815	}
1816
1817	static BOOL is_line_terminator(uint32_t c)
1818	{
1819	return (c == `'\n'` \|\| c == `'\r'` \|\| c == CP_LS \|\| c == CP_PS);
1820	}
1821
1822	static BOOL is_word_char(uint32_t c)
1823	{
1824	return ((c >= `'0'` && c <= `'9'`) \|\|
1825	(c >= `'a'` && c <= `'z'`) \|\|
1826	(c >= `'A'` && c <= `'Z'`) \|\|
1827	(c == `'_'`));
1828	}
1829
1830	#define GET_CHAR(c, cptr, cbuf_end, cbuf_type) \
1831	do { \
1832	if (cbuf_type == 0) { \
1833	c = *cptr++; \
1834	} else { \
1835	const uint16_t _p = (const uint16_t )cptr; \
1836	const uint16_t _end = (const uint16_t )cbuf_end; \
1837	c = *_p++; \
1838	if (is_hi_surrogate(c) && cbuf_type == 2) { \
1839	if (_p < _end && is_lo_surrogate(*_p)) { \
1840	c = from_surrogate(c, *_p++); \
1841	} \
1842	} \
1843	cptr = (const void *)_p; \
1844	} \
1845	} while (0)
1846
1847	#define PEEK_CHAR(c, cptr, cbuf_end, cbuf_type) \
1848	do { \
1849	if (cbuf_type == 0) { \
1850	c = cptr[0]; \
1851	} else { \
1852	const uint16_t _p = (const uint16_t )cptr; \
1853	const uint16_t _end = (const uint16_t )cbuf_end; \
1854	c = *_p++; \
1855	if (is_hi_surrogate(c) && cbuf_type == 2) { \
1856	if (_p < _end && is_lo_surrogate(*_p)) { \
1857	c = from_surrogate(c, *_p); \
1858	} \
1859	} \
1860	} \
1861	} while (0)
1862
1863	#define PEEK_PREV_CHAR(c, cptr, cbuf_start, cbuf_type) \
1864	do { \
1865	if (cbuf_type == 0) { \
1866	c = cptr[-1]; \
1867	} else { \
1868	const uint16_t _p = (const uint16_t )cptr - 1; \
1869	const uint16_t _start = (const uint16_t )cbuf_start; \
1870	c = *_p; \
1871	if (is_lo_surrogate(c) && cbuf_type == 2) { \
1872	if (_p > _start && is_hi_surrogate(_p[-1])) { \
1873	c = from_surrogate(*--_p, c); \
1874	} \
1875	} \
1876	} \
1877	} while (0)
1878
1879	#define GET_PREV_CHAR(c, cptr, cbuf_start, cbuf_type) \
1880	do { \
1881	if (cbuf_type == 0) { \
1882	cptr--; \
1883	c = cptr[0]; \
1884	} else { \
1885	const uint16_t _p = (const uint16_t )cptr - 1; \
1886	const uint16_t _start = (const uint16_t )cbuf_start; \
1887	c = *_p; \
1888	if (is_lo_surrogate(c) && cbuf_type == 2) { \
1889	if (_p > _start && is_hi_surrogate(_p[-1])) { \
1890	c = from_surrogate(*--_p, c); \
1891	} \
1892	} \
1893	cptr = (const void *)_p; \
1894	} \
1895	} while (0)
1896
1897	#define PREV_CHAR(cptr, cbuf_start, cbuf_type) \
1898	do { \
1899	if (cbuf_type == 0) { \
1900	cptr--; \
1901	} else { \
1902	const uint16_t _p = (const uint16_t )cptr - 1; \
1903	const uint16_t _start = (const uint16_t )cbuf_start; \
1904	if (is_lo_surrogate(*_p) && cbuf_type == 2) { \
1905	if (_p > _start && is_hi_surrogate(_p[-1])) { \
1906	--_p; \
1907	} \
1908	} \
1909	cptr = (const void *)_p; \
1910	} \
1911	} while (0)
1912
1913	typedef uintptr_t StackInt;
1914
1915	typedef enum {
1916	RE_EXEC_STATE_SPLIT,
1917	RE_EXEC_STATE_LOOKAHEAD,
1918	RE_EXEC_STATE_NEGATIVE_LOOKAHEAD,
1919	RE_EXEC_STATE_GREEDY_QUANT,
1920	} REExecStateEnum;
1921
1922	typedef struct REExecState {
1923	REExecStateEnum type : `8`;
1924	uint8_t stack_len;
1925	size_t count; / only used for RE_EXEC_STATE_GREEDY_QUANT /
1926	const uint8_t *cptr;
1927	const uint8_t *pc;
1928	void *buf[`0`];
1929	} REExecState;
1930
1931	typedef struct {
1932	const uint8_t *cbuf;
1933	const uint8_t *cbuf_end;
1934	/ 0 = 8 bit chars, 1 = 16 bit chars, 2 = 16 bit chars, UTF-16 /
1935	int cbuf_type;
1936	int capture_count;
1937	int stack_size_max;
1938	BOOL multi_line;
1939	BOOL ignore_case;
1940	BOOL is_unicode;
1941	int interrupt_counter;
1942	void opaque; /* used for stack overflow check /
1943
1944	size_t state_size;
1945	uint8_t *state_stack;
1946	size_t state_stack_size;
1947	size_t state_stack_len;
1948	} REExecContext;
1949
1950	static int push_state(REExecContext *s,
1951	uint8_t **capture,
1952	StackInt *stack, size_t stack_len,
1953	const uint8_t pc, const* uint8_t *cptr,
1954	REExecStateEnum type, size_t count)
1955	{
1956	REExecState *rs;
1957	uint8_t *new_stack;
1958	size_t new_size, i, n;
1959	StackInt *stack_buf;
1960
1961	if (unlikely((s->state_stack_len + `1`) > s->state_stack_size)) {
1962	/ reallocate the stack /
1963	new_size = s->state_stack_size * `3` / `2`;
1964	if (new_size < `8`)
1965	new_size = `8`;
1966	new_stack = lre_realloc(s->opaque, s->state_stack, new_size * s->state_size);
1967	if (!new_stack)
1968	return -`1`;
1969	s->state_stack_size = new_size;
1970	s->state_stack = new_stack;
1971	}
1972	rs = (REExecState )(s->state_stack + s->state_stack_len s->state_size);
1973	s->state_stack_len++;
1974	rs->type = type;
1975	rs->count = count;
1976	rs->stack_len = stack_len;
1977	rs->cptr = cptr;
1978	rs->pc = pc;
1979	n = `2` * s->capture_count;
1980	for(i = `0`; i < n; i++)
1981	rs->buf[i] = capture[i];
1982	stack_buf = (StackInt *)(rs->buf + n);
1983	for(i = `0`; i < stack_len; i++)
1984	stack_buf[i] = stack[i];
1985	return `0`;
1986	}
1987
1988	static int lre_poll_timeout(REExecContext *s)
1989	{
1990	if (unlikely(--s->interrupt_counter <= `0`)) {
1991	s->interrupt_counter = INTERRUPT_COUNTER_INIT;
1992	if (lre_check_timeout(s->opaque))
1993	return LRE_RET_TIMEOUT;
1994	}
1995	return `0`;
1996	}
1997
1998	/ return 1 if match, 0 if not match or < 0 if error. /
1999	static intptr_t lre_exec_backtrack(REExecContext s, uint8_t *capture,
2000	StackInt stack, int* stack_len,
2001	const uint8_t pc, const* uint8_t *cptr,
2002	BOOL no_recurse)
2003	{
2004	int opcode, ret;
2005	int cbuf_type;
2006	uint32_t val, c;
2007	const uint8_t *cbuf_end;
2008
2009	cbuf_type = s->cbuf_type;
2010	cbuf_end = s->cbuf_end;
2011
2012	for(;;) {
2013	// printf("top=%p: pc=%d\n", th_list.top, (int)(pc - (bc_buf + RE_HEADER_LEN)));
2014	opcode = *pc++;
2015	switch(opcode) {
2016	case REOP_match:
2017	{
2018	REExecState *rs;
2019	if (no_recurse)
2020	return (intptr_t)cptr;
2021	ret = `1`;
2022	goto recurse;
2023	no_match:
2024	if (no_recurse)
2025	return `0`;
2026	ret = `0`;
2027	recurse:
2028	for(;;) {
2029	if (lre_poll_timeout(s))
2030	return LRE_RET_TIMEOUT;
2031	if (s->state_stack_len == `0`)
2032	return ret;
2033	rs = (REExecState *)(s->state_stack +
2034	(s->state_stack_len - `1`) * s->state_size);
2035	if (rs->type == RE_EXEC_STATE_SPLIT) {
2036	if (!ret) {
2037	pop_state:
2038	memcpy(capture, rs->buf,
2039	sizeof(capture[`0`]) * `2` * s->capture_count);
2040	pop_state1:
2041	pc = rs->pc;
2042	cptr = rs->cptr;
2043	stack_len = rs->stack_len;
2044	memcpy(stack, rs->buf + `2` * s->capture_count,
2045	stack_len * sizeof(stack[`0`]));
2046	s->state_stack_len--;
2047	break;
2048	}
2049	} else if (rs->type == RE_EXEC_STATE_GREEDY_QUANT) {
2050	if (!ret) {
2051	uint32_t char_count, i;
2052	memcpy(capture, rs->buf,
2053	sizeof(capture[`0`]) * `2` * s->capture_count);
2054	stack_len = rs->stack_len;
2055	memcpy(stack, rs->buf + `2` * s->capture_count,
2056	stack_len * sizeof(stack[`0`]));
2057	pc = rs->pc;
2058	cptr = rs->cptr;
2059	/ go backward /
2060	char_count = get_u32(pc + `12`);
2061	for(i = `0`; i < char_count; i++) {
2062	PREV_CHAR(cptr, s->cbuf, cbuf_type);
2063	}
2064	pc = (pc + `16`) + (int)get_u32(pc);
2065	rs->cptr = cptr;
2066	rs->count--;
2067	if (rs->count == `0`) {
2068	s->state_stack_len--;
2069	}
2070	break;
2071	}
2072	} else {
2073	ret = ((rs->type == RE_EXEC_STATE_LOOKAHEAD && ret) \|\|
2074	(rs->type == RE_EXEC_STATE_NEGATIVE_LOOKAHEAD && !ret));
2075	if (ret) {
2076	/ keep the capture in case of positive lookahead /
2077	if (rs->type == RE_EXEC_STATE_LOOKAHEAD)
2078	goto pop_state1;
2079	else
2080	goto pop_state;
2081	}
2082	}
2083	s->state_stack_len--;
2084	}
2085	}
2086	break;
2087	case REOP_char32:
2088	val = get_u32(pc);
2089	pc += `4`;
2090	goto test_char;
2091	case REOP_char:
2092	val = get_u16(pc);
2093	pc += `2`;
2094	test_char:
2095	if (cptr >= cbuf_end)
2096	goto no_match;
2097	GET_CHAR(c, cptr, cbuf_end, cbuf_type);
2098	if (s->ignore_case) {
2099	c = lre_canonicalize(c, s->is_unicode);
2100	}
2101	if (val != c)
2102	goto no_match;
2103	break;
2104	case REOP_split_goto_first:
2105	case REOP_split_next_first:
2106	{
2107	const uint8_t *pc1;
2108
2109	val = get_u32(pc);
2110	pc += `4`;
2111	if (opcode == REOP_split_next_first) {
2112	pc1 = pc + (int)val;
2113	} else {
2114	pc1 = pc;
2115	pc = pc + (int)val;
2116	}
2117	ret = push_state(s, capture, stack, stack_len,
2118	pc1, cptr, RE_EXEC_STATE_SPLIT, `0`);
2119	if (ret < `0`)
2120	return LRE_RET_MEMORY_ERROR;
2121	break;
2122	}
2123	case REOP_lookahead:
2124	case REOP_negative_lookahead:
2125	val = get_u32(pc);
2126	pc += `4`;
2127	ret = push_state(s, capture, stack, stack_len,
2128	pc + (int)val, cptr,
2129	RE_EXEC_STATE_LOOKAHEAD + opcode - REOP_lookahead,
2130	`0`);
2131	if (ret < `0`)
2132	return LRE_RET_MEMORY_ERROR;
2133	break;
2134
2135	case REOP_goto:
2136	val = get_u32(pc);
2137	pc += `4` + (int)val;
2138	if (lre_poll_timeout(s))
2139	return LRE_RET_TIMEOUT;
2140	break;
2141	case REOP_line_start:
2142	if (cptr == s->cbuf)
2143	break;
2144	if (!s->multi_line)
2145	goto no_match;
2146	PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
2147	if (!is_line_terminator(c))
2148	goto no_match;
2149	break;
2150	case REOP_line_end:
2151	if (cptr == cbuf_end)
2152	break;
2153	if (!s->multi_line)
2154	goto no_match;
2155	PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
2156	if (!is_line_terminator(c))
2157	goto no_match;
2158	break;
2159	case REOP_dot:
2160	if (cptr == cbuf_end)
2161	goto no_match;
2162	GET_CHAR(c, cptr, cbuf_end, cbuf_type);
2163	if (is_line_terminator(c))
2164	goto no_match;
2165	break;
2166	case REOP_any:
2167	if (cptr == cbuf_end)
2168	goto no_match;
2169	GET_CHAR(c, cptr, cbuf_end, cbuf_type);
2170	break;
2171	case REOP_save_start:
2172	case REOP_save_end:
2173	val = *pc++;
2174	assert(val < s->capture_count);
2175	capture[`2` * val + opcode - REOP_save_start] = (uint8_t *)cptr;
2176	break;
2177	case REOP_save_reset:
2178	{
2179	uint32_t val2;
2180	val = pc[`0`];
2181	val2 = pc[`1`];
2182	pc += `2`;
2183	assert(val2 < s->capture_count);
2184	while (val <= val2) {
2185	capture[`2` * val] = NULL;
2186	capture[`2` * val + `1`] = NULL;
2187	val++;
2188	}
2189	}
2190	break;
2191	case REOP_push_i32:
2192	val = get_u32(pc);
2193	pc += `4`;
2194	stack[stack_len++] = val;
2195	break;
2196	case REOP_drop:
2197	stack_len--;
2198	break;
2199	case REOP_loop:
2200	val = get_u32(pc);
2201	pc += `4`;
2202	if (--stack[stack_len - `1`] != `0`) {
2203	pc += (int)val;
2204	if (lre_poll_timeout(s))
2205	return LRE_RET_TIMEOUT;
2206	}
2207	break;
2208	case REOP_push_char_pos:
2209	stack[stack_len++] = (uintptr_t)cptr;
2210	break;
2211	case REOP_check_advance:
2212	if (stack[--stack_len] == (uintptr_t)cptr)
2213	goto no_match;
2214	break;
2215	case REOP_word_boundary:
2216	case REOP_not_word_boundary:
2217	{
2218	BOOL v1, v2;
2219	/ char before /
2220	if (cptr == s->cbuf) {
2221	v1 = FALSE;
2222	} else {
2223	PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
2224	v1 = is_word_char(c);
2225	}
2226	/ current char /
2227	if (cptr >= cbuf_end) {
2228	v2 = FALSE;
2229	} else {
2230	PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
2231	v2 = is_word_char(c);
2232	}
2233	if (v1 ^ v2 ^ (REOP_not_word_boundary - opcode))
2234	goto no_match;
2235	}
2236	break;
2237	case REOP_back_reference:
2238	case REOP_backward_back_reference:
2239	{
2240	const uint8_t cptr1, cptr1_end, *cptr1_start;
2241	uint32_t c1, c2;
2242
2243	val = *pc++;
2244	if (val >= s->capture_count)
2245	goto no_match;
2246	cptr1_start = capture[`2` * val];
2247	cptr1_end = capture[`2` * val + `1`];
2248	if (!cptr1_start \|\| !cptr1_end)
2249	break;
2250	if (opcode == REOP_back_reference) {
2251	cptr1 = cptr1_start;
2252	while (cptr1 < cptr1_end) {
2253	if (cptr >= cbuf_end)
2254	goto no_match;
2255	GET_CHAR(c1, cptr1, cptr1_end, cbuf_type);
2256	GET_CHAR(c2, cptr, cbuf_end, cbuf_type);
2257	if (s->ignore_case) {
2258	c1 = lre_canonicalize(c1, s->is_unicode);
2259	c2 = lre_canonicalize(c2, s->is_unicode);
2260	}
2261	if (c1 != c2)
2262	goto no_match;
2263	}
2264	} else {
2265	cptr1 = cptr1_end;
2266	while (cptr1 > cptr1_start) {
2267	if (cptr == s->cbuf)
2268	goto no_match;
2269	GET_PREV_CHAR(c1, cptr1, cptr1_start, cbuf_type);
2270	GET_PREV_CHAR(c2, cptr, s->cbuf, cbuf_type);
2271	if (s->ignore_case) {
2272	c1 = lre_canonicalize(c1, s->is_unicode);
2273	c2 = lre_canonicalize(c2, s->is_unicode);
2274	}
2275	if (c1 != c2)
2276	goto no_match;
2277	}
2278	}
2279	}
2280	break;
2281	case REOP_range:
2282	{
2283	int n;
2284	uint32_t low, high, idx_min, idx_max, idx;
2285
2286	n = get_u16(pc); / n must be >= 1 /
2287	pc += `2`;
2288	if (cptr >= cbuf_end)
2289	goto no_match;
2290	GET_CHAR(c, cptr, cbuf_end, cbuf_type);
2291	if (s->ignore_case) {
2292	c = lre_canonicalize(c, s->is_unicode);
2293	}
2294	idx_min = `0`;
2295	low = get_u16(pc + `0` * `4`);
2296	if (c < low)
2297	goto no_match;
2298	idx_max = n - `1`;
2299	high = get_u16(pc + idx_max * `4` + `2`);
2300	/ 0xffff in for last value means +infinity /
2301	if (unlikely(c >= `0xffff`) && high == `0xffff`)
2302	goto range_match;
2303	if (c > high)
2304	goto no_match;
2305	while (idx_min <= idx_max) {
2306	idx = (idx_min + idx_max) / `2`;
2307	low = get_u16(pc + idx * `4`);
2308	high = get_u16(pc + idx * `4` + `2`);
2309	if (c < low)
2310	idx_max = idx - `1`;
2311	else if (c > high)
2312	idx_min = idx + `1`;
2313	else
2314	goto range_match;
2315	}
2316	goto no_match;
2317	range_match:
2318	pc += `4` * n;
2319	}
2320	break;
2321	case REOP_range32:
2322	{
2323	int n;
2324	uint32_t low, high, idx_min, idx_max, idx;
2325
2326	n = get_u16(pc); / n must be >= 1 /
2327	pc += `2`;
2328	if (cptr >= cbuf_end)
2329	goto no_match;
2330	GET_CHAR(c, cptr, cbuf_end, cbuf_type);
2331	if (s->ignore_case) {
2332	c = lre_canonicalize(c, s->is_unicode);
2333	}
2334	idx_min = `0`;
2335	low = get_u32(pc + `0` * `8`);
2336	if (c < low)
2337	goto no_match;
2338	idx_max = n - `1`;
2339	high = get_u32(pc + idx_max * `8` + `4`);
2340	if (c > high)
2341	goto no_match;
2342	while (idx_min <= idx_max) {
2343	idx = (idx_min + idx_max) / `2`;
2344	low = get_u32(pc + idx * `8`);
2345	high = get_u32(pc + idx * `8` + `4`);
2346	if (c < low)
2347	idx_max = idx - `1`;
2348	else if (c > high)
2349	idx_min = idx + `1`;
2350	else
2351	goto range32_match;
2352	}
2353	goto no_match;
2354	range32_match:
2355	pc += `8` * n;
2356	}
2357	break;
2358	case REOP_prev:
2359	/ go to the previous char /
2360	if (cptr == s->cbuf)
2361	goto no_match;
2362	PREV_CHAR(cptr, s->cbuf, cbuf_type);
2363	break;
2364	case REOP_simple_greedy_quant:
2365	{
2366	uint32_t next_pos, quant_min, quant_max;
2367	size_t q;
2368	intptr_t res;
2369	const uint8_t *pc1;
2370
2371	next_pos = get_u32(pc);
2372	quant_min = get_u32(pc + `4`);
2373	quant_max = get_u32(pc + `8`);
2374	pc += `16`;
2375	pc1 = pc;
2376	pc += (int)next_pos;
2377
2378	q = `0`;
2379	for(;;) {
2380	if (lre_poll_timeout(s))
2381	return LRE_RET_TIMEOUT;
2382	res = lre_exec_backtrack(s, capture, stack, stack_len,
2383	pc1, cptr, TRUE);
2384	if (res == LRE_RET_MEMORY_ERROR \|\|
2385	res == LRE_RET_TIMEOUT)
2386	return res;
2387	if (!res)
2388	break;
2389	cptr = (uint8_t *)res;
2390	q++;
2391	if (q >= quant_max && quant_max != INT32_MAX)
2392	break;
2393	}
2394	if (q < quant_min)
2395	goto no_match;
2396	if (q > quant_min) {
2397	/ will examine all matches down to quant_min /
2398	ret = push_state(s, capture, stack, stack_len,
2399	pc1 - `16`, cptr,
2400	RE_EXEC_STATE_GREEDY_QUANT,
2401	q - quant_min);
2402	if (ret < `0`)
2403	return LRE_RET_MEMORY_ERROR;
2404	}
2405	}
2406	break;
2407	default:
2408	abort();
2409	}
2410	}
2411	}
2412
2413	/ Return 1 if match, 0 if not match or < 0 if error (see LRE_RET_x). cindex is the*
2414	starting position of the match and must be such as 0 <= cindex <=
2415	clen. /*
2416	int lre_exec(uint8_t **capture,
2417	const uint8_t bc_buf, const* uint8_t cbuf, int* cindex, int clen,
2418	int cbuf_type, void *opaque)
2419	{
2420	REExecContext s_s, *s = &s_s;
2421	int re_flags, i, alloca_size, ret;
2422	StackInt *stack_buf;
2423
2424	re_flags = lre_get_flags(bc_buf);
2425	s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != `0`;
2426	s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != `0`;
2427	s->is_unicode = (re_flags & LRE_FLAG_UNICODE) != `0`;
2428	s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT];
2429	s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE];
2430	s->cbuf = cbuf;
2431	s->cbuf_end = cbuf + (clen << cbuf_type);
2432	s->cbuf_type = cbuf_type;
2433	if (s->cbuf_type == `1` && s->is_unicode)
2434	s->cbuf_type = `2`;
2435	s->interrupt_counter = INTERRUPT_COUNTER_INIT;
2436	s->opaque = opaque;
2437
2438	s->state_size = sizeof(REExecState) +
2439	s->capture_count * sizeof(capture[`0`]) * `2` +
2440	s->stack_size_max * sizeof(stack_buf[`0`]);
2441	s->state_stack = NULL;
2442	s->state_stack_len = `0`;
2443	s->state_stack_size = `0`;
2444
2445	for(i = `0`; i < s->capture_count * `2`; i++)
2446	capture[i] = NULL;
2447	alloca_size = s->stack_size_max * sizeof(stack_buf[`0`]);
2448	stack_buf = alloca(alloca_size);
2449	ret = lre_exec_backtrack(s, capture, stack_buf, `0`, bc_buf + RE_HEADER_LEN,
2450	cbuf + (cindex << cbuf_type), FALSE);
2451	lre_realloc(s->opaque, s->state_stack, `0`);
2452	return ret;
2453	}
2454
2455	int lre_get_capture_count(const uint8_t *bc_buf)
2456	{
2457	return bc_buf[RE_HEADER_CAPTURE_COUNT];
2458	}
2459
2460	int lre_get_flags(const uint8_t *bc_buf)
2461	{
2462	return bc_buf[RE_HEADER_FLAGS];
2463	}
2464
2465	/ Return NULL if no group names. Otherwise, return a pointer to*
2466	'capture_count - 1' zero terminated UTF-8 strings. /*
2467	const char lre_get_groupnames(const* uint8_t *bc_buf)
2468	{
2469	uint32_t re_bytecode_len;
2470	if ((lre_get_flags(bc_buf) & LRE_FLAG_NAMED_GROUPS) == `0`)
2471	return NULL;
2472	re_bytecode_len = get_u32(bc_buf + RE_HEADER_BYTECODE_LEN);
2473	return (const char *)(bc_buf + RE_HEADER_LEN + re_bytecode_len);
2474	}
2475
2476	#ifdef TEST
2477
2478	BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size)
2479	{
2480	return FALSE;
2481	}
2482
2483	void lre_realloc(void* opaque, void* *ptr, size_t size)
2484	{
2485	return realloc(ptr, size);
2486	}
2487
2488	int main(int argc, char **argv)
2489	{
2490	int len, flags, ret, i;
2491	uint8_t *bc;
2492	char error_msg[`64`];
2493	uint8_t capture[CAPTURE_COUNT_MAX `2`];
2494	const char *input;
2495	int input_len, capture_count;
2496
2497	if (argc < `4`) {
2498	printf("usage: %s regexp flags input\n", argv[`0`]);
2499	return `1`;
2500	}
2501	flags = atoi(argv[`2`]);
2502	bc = lre_compile(&len, error_msg, sizeof(error_msg), argv[`1`],
2503	strlen(argv[`1`]), flags, NULL);
2504	if (!bc) {
2505	fprintf(stderr, "error: %s\n", error_msg);
2506	exit(`1`);
2507	}
2508
2509	input = argv[`3`];
2510	input_len = strlen(input);
2511
2512	ret = lre_exec(capture, bc, (uint8_t *)input, `0`, input_len, `0`, NULL);
2513	printf("ret=%d\n", ret);
2514	if (ret == `1`) {
2515	capture_count = lre_get_capture_count(bc);
2516	for(i = `0`; i < `2` * capture_count; i++) {
2517	uint8_t *ptr;
2518	ptr = capture[i];
2519	printf("%d: ", i);
2520	if (!ptr)
2521	printf("<nil>");
2522	else
2523	printf("%u", (int)(ptr - (uint8_t *)input));
2524	printf("\n");
2525	}
2526	}
2527	return `0`;
2528	}
2529	#endif
2530

Browse the source code of QuickJS/libregexp.c