regexp.c source code [MuPDF/thirdparty/mujs/regexp.c]

1	#include <stdlib.h>
2	#include <stdio.h>
3	#include <string.h>
4	#include <setjmp.h>
5	#include <limits.h>
6
7	#include "regexp.h"
8	#include "utf.h"
9
10	#define emit regemit
11	#define next regnext
12	#define accept regaccept
13
14	#define nelem(a) (int)(sizeof (a) / sizeof (a)[0])
15
16	#define REPINF 255
17	#define MAXSUB REG_MAXSUB
18	#define MAXPROG (32 << 10)
19	#define MAXREC 1024
20
21	typedef struct Reclass Reclass;
22	typedef struct Renode Renode;
23	typedef struct Reinst Reinst;
24	typedef struct Rethread Rethread;
25
26	struct Reclass {
27	Rune *end;
28	Rune spans[`64`];
29	};
30
31	struct Reprog {
32	Reinst start, end;
33	int flags;
34	int nsub;
35	Reclass cclass[`16`];
36	};
37
38	struct cstate {
39	Reprog *prog;
40	Renode pstart, pend;
41
42	const char *source;
43	int ncclass;
44	int nsub;
45	Renode *sub[MAXSUB];
46
47	int lookahead;
48	Rune yychar;
49	Reclass *yycc;
50	int yymin, yymax;
51
52	const char *error;
53	jmp_buf kaboom;
54	};
55
56	static void die(struct cstate g, const* char *message)
57	{
58	g->error = message;
59	longjmp(g->kaboom, `1`);
60	}
61
62	static int canon(Rune c)
63	{
64	Rune u = toupperrune(c);
65	if (c >= `128` && u < `128`)
66	return c;
67	return u;
68	}
69
70	/ Scan /
71
72	enum {
73	L_CHAR = `256`,
74	L_CCLASS, / character class /
75	L_NCCLASS, / negative character class /
76	L_NC, / "(?:" no capture /
77	L_PLA, / "(?=" positive lookahead /
78	L_NLA, / "(?!" negative lookahead /
79	L_WORD, / "\b" word boundary /
80	L_NWORD, / "\B" non-word boundary /
81	L_REF, / "\1" back-reference /
82	L_COUNT, / {M,N} /
83	};
84
85	static int hex(struct cstate g, int* c)
86	{
87	if (c >= `'0'` && c <= `'9'`) return c - `'0'`;
88	if (c >= `'a'` && c <= `'f'`) return c - `'a'` + `0xA`;
89	if (c >= `'A'` && c <= `'F'`) return c - `'A'` + `0xA`;
90	die(g, "invalid escape sequence");
91	return `0`;
92	}
93
94	static int dec(struct cstate g, int* c)
95	{
96	if (c >= `'0'` && c <= `'9'`) return c - `'0'`;
97	die(g, "invalid quantifier");
98	return `0`;
99	}
100
101	#define ESCAPES "BbDdSsWw^$\\.*+?()[]{}\|0123456789"
102
103	static int isunicodeletter(int c)
104	{
105	return (c >= `'a'` && c <= `'z'`) \|\| (c >= `'A'` && c <= `'Z'`) \|\| isalpharune(c);
106	}
107
108	static int nextrune(struct cstate *g)
109	{
110	g->source += chartorune(&g->yychar, g->source);
111	if (g->yychar == `'\\'`) {
112	g->source += chartorune(&g->yychar, g->source);
113	switch (g->yychar) {
114	case `0`: die(g, "unterminated escape sequence"); break;
115	case `'f'`: g->yychar = `'\f'`; return `0`;
116	case `'n'`: g->yychar = `'\n'`; return `0`;
117	case `'r'`: g->yychar = `'\r'`; return `0`;
118	case `'t'`: g->yychar = `'\t'`; return `0`;
119	case `'v'`: g->yychar = `'\v'`; return `0`;
120	case `'c'`:
121	g->yychar = (*g->source++) & `31`;
122	return `0`;
123	case `'x'`:
124	g->yychar = hex(g, *g->source++) << `4`;
125	g->yychar += hex(g, *g->source++);
126	if (g->yychar == `0`) {
127	g->yychar = `'0'`;
128	return `1`;
129	}
130	return `0`;
131	case `'u'`:
132	g->yychar = hex(g, *g->source++) << `12`;
133	g->yychar += hex(g, *g->source++) << `8`;
134	g->yychar += hex(g, *g->source++) << `4`;
135	g->yychar += hex(g, *g->source++);
136	if (g->yychar == `0`) {
137	g->yychar = `'0'`;
138	return `1`;
139	}
140	return `0`;
141	}
142	if (strchr(ESCAPES, g->yychar))
143	return `1`;
144	if (isunicodeletter(g->yychar) \|\| g->yychar == `'_'`) / check identity escape /
145	die(g, "invalid escape character");
146	return `0`;
147	}
148	return `0`;
149	}
150
151	static int lexcount(struct cstate *g)
152	{
153	g->yychar = *g->source++;
154
155	g->yymin = dec(g, g->yychar);
156	g->yychar = *g->source++;
157	while (g->yychar != `','` && g->yychar != `'}'`) {
158	g->yymin = g->yymin * `10` + dec(g, g->yychar);
159	g->yychar = *g->source++;
160	if (g->yymin >= REPINF)
161	die(g, "numeric overflow");
162	}
163
164	if (g->yychar == `','`) {
165	g->yychar = *g->source++;
166	if (g->yychar == `'}'`) {
167	g->yymax = REPINF;
168	} else {
169	g->yymax = dec(g, g->yychar);
170	g->yychar = *g->source++;
171	while (g->yychar != `'}'`) {
172	g->yymax = g->yymax * `10` + dec(g, g->yychar);
173	g->yychar = *g->source++;
174	if (g->yymax >= REPINF)
175	die(g, "numeric overflow");
176	}
177	}
178	} else {
179	g->yymax = g->yymin;
180	}
181
182	return L_COUNT;
183	}
184
185	static void newcclass(struct cstate *g)
186	{
187	if (g->ncclass >= nelem(g->prog->cclass))
188	die(g, "too many character classes");
189	g->yycc = g->prog->cclass + g->ncclass++;
190	g->yycc->end = g->yycc->spans;
191	}
192
193	static void addrange(struct cstate *g, Rune a, Rune b)
194	{
195	if (a > b)
196	die(g, "invalid character class range");
197	if (g->yycc->end + `2` == g->yycc->spans + nelem(g->yycc->spans))
198	die(g, "too many character class ranges");
199	*g->yycc->end++ = a;
200	*g->yycc->end++ = b;
201	}
202
203	static void addranges_d(struct cstate *g)
204	{
205	addrange(g, `'0'`, `'9'`);
206	}
207
208	static void addranges_D(struct cstate *g)
209	{
210	addrange(g, `0`, `'0'`-`1`);
211	addrange(g, `'9'`+`1`, `0xFFFF`);
212	}
213
214	static void addranges_s(struct cstate *g)
215	{
216	addrange(g, `0x9`, `0xD`);
217	addrange(g, `0x20`, `0x20`);
218	addrange(g, `0xA0`, `0xA0`);
219	addrange(g, `0x2028`, `0x2029`);
220	addrange(g, `0xFEFF`, `0xFEFF`);
221	}
222
223	static void addranges_S(struct cstate *g)
224	{
225	addrange(g, `0`, `0x9`-`1`);
226	addrange(g, `0xD`+`1`, `0x20`-`1`);
227	addrange(g, `0x20`+`1`, `0xA0`-`1`);
228	addrange(g, `0xA0`+`1`, `0x2028`-`1`);
229	addrange(g, `0x2029`+`1`, `0xFEFF`-`1`);
230	addrange(g, `0xFEFF`+`1`, `0xFFFF`);
231	}
232
233	static void addranges_w(struct cstate *g)
234	{
235	addrange(g, `'0'`, `'9'`);
236	addrange(g, `'A'`, `'Z'`);
237	addrange(g, `'_'`, `'_'`);
238	addrange(g, `'a'`, `'z'`);
239	}
240
241	static void addranges_W(struct cstate *g)
242	{
243	addrange(g, `0`, `'0'`-`1`);
244	addrange(g, `'9'`+`1`, `'A'`-`1`);
245	addrange(g, `'Z'`+`1`, `'_'`-`1`);
246	addrange(g, `'_'`+`1`, `'a'`-`1`);
247	addrange(g, `'z'`+`1`, `0xFFFF`);
248	}
249
250	static int lexclass(struct cstate *g)
251	{
252	int type = L_CCLASS;
253	int quoted, havesave, havedash;
254	Rune save = `0`;
255
256	newcclass(g);
257
258	quoted = nextrune(g);
259	if (!quoted && g->yychar == `'^'`) {
260	type = L_NCCLASS;
261	quoted = nextrune(g);
262	}
263
264	havesave = havedash = `0`;
265	for (;;) {
266	if (g->yychar == `0`)
267	die(g, "unterminated character class");
268	if (!quoted && g->yychar == `']'`)
269	break;
270
271	if (!quoted && g->yychar == `'-'`) {
272	if (havesave) {
273	if (havedash) {
274	addrange(g, save, `'-'`);
275	havesave = havedash = `0`;
276	} else {
277	havedash = `1`;
278	}
279	} else {
280	save = `'-'`;
281	havesave = `1`;
282	}
283	} else if (quoted && strchr("DSWdsw", g->yychar)) {
284	if (havesave) {
285	addrange(g, save, save);
286	if (havedash)
287	addrange(g, `'-'`, `'-'`);
288	}
289	switch (g->yychar) {
290	case `'d'`: addranges_d(g); break;
291	case `'s'`: addranges_s(g); break;
292	case `'w'`: addranges_w(g); break;
293	case `'D'`: addranges_D(g); break;
294	case `'S'`: addranges_S(g); break;
295	case `'W'`: addranges_W(g); break;
296	}
297	havesave = havedash = `0`;
298	} else {
299	if (quoted) {
300	if (g->yychar == `'b'`)
301	g->yychar = `'\b'`;
302	else if (g->yychar == `'0'`)
303	g->yychar = `0`;
304	/ else identity escape /
305	}
306	if (havesave) {
307	if (havedash) {
308	addrange(g, save, g->yychar);
309	havesave = havedash = `0`;
310	} else {
311	addrange(g, save, save);
312	save = g->yychar;
313	}
314	} else {
315	save = g->yychar;
316	havesave = `1`;
317	}
318	}
319
320	quoted = nextrune(g);
321	}
322
323	if (havesave) {
324	addrange(g, save, save);
325	if (havedash)
326	addrange(g, `'-'`, `'-'`);
327	}
328
329	return type;
330	}
331
332	static int lex(struct cstate *g)
333	{
334	int quoted = nextrune(g);
335	if (quoted) {
336	switch (g->yychar) {
337	case `'b'`: return L_WORD;
338	case `'B'`: return L_NWORD;
339	case `'d'`: newcclass(g); addranges_d(g); return L_CCLASS;
340	case `'s'`: newcclass(g); addranges_s(g); return L_CCLASS;
341	case `'w'`: newcclass(g); addranges_w(g); return L_CCLASS;
342	case `'D'`: newcclass(g); addranges_d(g); return L_NCCLASS;
343	case `'S'`: newcclass(g); addranges_s(g); return L_NCCLASS;
344	case `'W'`: newcclass(g); addranges_w(g); return L_NCCLASS;
345	case `'0'`: g->yychar = `0`; return L_CHAR;
346	}
347	if (g->yychar >= `'0'` && g->yychar <= `'9'`) {
348	g->yychar -= `'0'`;
349	if (g->source >= `'0'` && g->source <= `'9'`)
350	g->yychar = g->yychar * `10` + *g->source++ - `'0'`;
351	return L_REF;
352	}
353	return L_CHAR;
354	}
355
356	switch (g->yychar) {
357	case `0`:
358	case `'$'`: case `')'`: case `''`: case* `'+'`:
359	case `'.'`: case `'?'`: case `'^'`: case `'\|'`:
360	return g->yychar;
361	}
362
363	if (g->yychar == `'{'`)
364	return lexcount(g);
365	if (g->yychar == `'['`)
366	return lexclass(g);
367	if (g->yychar == `'('`) {
368	if (g->source[`0`] == `'?'`) {
369	if (g->source[`1`] == `':'`) {
370	g->source += `2`;
371	return L_NC;
372	}
373	if (g->source[`1`] == `'='`) {
374	g->source += `2`;
375	return L_PLA;
376	}
377	if (g->source[`1`] == `'!'`) {
378	g->source += `2`;
379	return L_NLA;
380	}
381	}
382	return `'('`;
383	}
384
385	return L_CHAR;
386	}
387
388	/ Parse /
389
390	enum {
391	P_CAT, P_ALT, P_REP,
392	P_BOL, P_EOL, P_WORD, P_NWORD,
393	P_PAR, P_PLA, P_NLA,
394	P_ANY, P_CHAR, P_CCLASS, P_NCCLASS,
395	P_REF,
396	};
397
398	struct Renode {
399	unsigned char type;
400	unsigned char ng, m, n;
401	Rune c;
402	Reclass *cc;
403	Renode *x;
404	Renode *y;
405	};
406
407	static Renode newnode(struct* cstate g, int* type)
408	{
409	Renode *node = g->pend++;
410	node->type = type;
411	node->cc = NULL;
412	node->c = `0`;
413	node->ng = `0`;
414	node->m = `0`;
415	node->n = `0`;
416	node->x = node->y = NULL;
417	return node;
418	}
419
420	static int empty(Renode *node)
421	{
422	if (!node) return `1`;
423	switch (node->type) {
424	default: return `1`;
425	case P_CAT: return empty(node->x) && empty(node->y);
426	case P_ALT: return empty(node->x) \|\| empty(node->y);
427	case P_REP: return empty(node->x) \|\| node->m == `0`;
428	case P_PAR: return empty(node->x);
429	case P_REF: return empty(node->x);
430	case P_ANY: case P_CHAR: case P_CCLASS: case P_NCCLASS: return `0`;
431	}
432	}
433
434	static Renode newrep(struct* cstate g, Renode atom, int ng, int min, int max)
435	{
436	Renode *rep = newnode(g, P_REP);
437	if (max == REPINF && empty(atom))
438	die(g, "infinite loop matching the empty string");
439	rep->ng = ng;
440	rep->m = min;
441	rep->n = max;
442	rep->x = atom;
443	return rep;
444	}
445
446	static void next(struct cstate *g)
447	{
448	g->lookahead = lex(g);
449	}
450
451	static int accept(struct cstate g, int* t)
452	{
453	if (g->lookahead == t) {
454	next(g);
455	return `1`;
456	}
457	return `0`;
458	}
459
460	static Renode parsealt(struct* cstate *g);
461
462	static Renode parseatom(struct* cstate *g)
463	{
464	Renode *atom;
465	if (g->lookahead == L_CHAR) {
466	atom = newnode(g, P_CHAR);
467	atom->c = g->yychar;
468	next(g);
469	return atom;
470	}
471	if (g->lookahead == L_CCLASS) {
472	atom = newnode(g, P_CCLASS);
473	atom->cc = g->yycc;
474	next(g);
475	return atom;
476	}
477	if (g->lookahead == L_NCCLASS) {
478	atom = newnode(g, P_NCCLASS);
479	atom->cc = g->yycc;
480	next(g);
481	return atom;
482	}
483	if (g->lookahead == L_REF) {
484	atom = newnode(g, P_REF);
485	if (g->yychar == `0` \|\| g->yychar >= g->nsub \|\| !g->sub[g->yychar])
486	die(g, "invalid back-reference");
487	atom->n = g->yychar;
488	atom->x = g->sub[g->yychar];
489	next(g);
490	return atom;
491	}
492	if (accept(g, `'.'`))
493	return newnode(g, P_ANY);
494	if (accept(g, `'('`)) {
495	atom = newnode(g, P_PAR);
496	if (g->nsub == MAXSUB)
497	die(g, "too many captures");
498	atom->n = g->nsub++;
499	atom->x = parsealt(g);
500	g->sub[atom->n] = atom;
501	if (!accept(g, `')'`))
502	die(g, "unmatched '('");
503	return atom;
504	}
505	if (accept(g, L_NC)) {
506	atom = parsealt(g);
507	if (!accept(g, `')'`))
508	die(g, "unmatched '('");
509	return atom;
510	}
511	if (accept(g, L_PLA)) {
512	atom = newnode(g, P_PLA);
513	atom->x = parsealt(g);
514	if (!accept(g, `')'`))
515	die(g, "unmatched '('");
516	return atom;
517	}
518	if (accept(g, L_NLA)) {
519	atom = newnode(g, P_NLA);
520	atom->x = parsealt(g);
521	if (!accept(g, `')'`))
522	die(g, "unmatched '('");
523	return atom;
524	}
525	die(g, "syntax error");
526	return NULL;
527	}
528
529	static Renode parserep(struct* cstate *g)
530	{
531	Renode *atom;
532
533	if (accept(g, `'^'`)) return newnode(g, P_BOL);
534	if (accept(g, `'$'`)) return newnode(g, P_EOL);
535	if (accept(g, L_WORD)) return newnode(g, P_WORD);
536	if (accept(g, L_NWORD)) return newnode(g, P_NWORD);
537
538	atom = parseatom(g);
539	if (g->lookahead == L_COUNT) {
540	int min = g->yymin, max = g->yymax;
541	next(g);
542	if (max < min)
543	die(g, "invalid quantifier");
544	return newrep(g, atom, accept(g, `'?'`), min, max);
545	}
546	if (accept(g, `''`)) return* newrep(g, atom, accept(g, `'?'`), `0`, REPINF);
547	if (accept(g, `'+'`)) return newrep(g, atom, accept(g, `'?'`), `1`, REPINF);
548	if (accept(g, `'?'`)) return newrep(g, atom, accept(g, `'?'`), `0`, `1`);
549	return atom;
550	}
551
552	static Renode parsecat(struct* cstate *g)
553	{
554	Renode cat, head, **tail;
555	if (g->lookahead && g->lookahead != `'\|'` && g->lookahead != `')'`) {
556	/ Build a right-leaning tree by splicing in new 'cat' at the tail. /
557	head = parserep(g);
558	tail = &head;
559	while (g->lookahead && g->lookahead != `'\|'` && g->lookahead != `')'`) {
560	cat = newnode(g, P_CAT);
561	cat->x = *tail;
562	cat->y = parserep(g);
563	*tail = cat;
564	tail = &cat->y;
565	}
566	return head;
567	}
568	return NULL;
569	}
570
571	static Renode parsealt(struct* cstate *g)
572	{
573	Renode alt, x;
574	alt = parsecat(g);
575	while (accept(g, `'\|'`)) {
576	x = alt;
577	alt = newnode(g, P_ALT);
578	alt->x = x;
579	alt->y = parsecat(g);
580	}
581	return alt;
582	}
583
584	/ Compile /
585
586	enum {
587	I_END, I_JUMP, I_SPLIT, I_PLA, I_NLA,
588	I_ANYNL, I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF,
589	I_BOL, I_EOL, I_WORD, I_NWORD,
590	I_LPAR, I_RPAR
591	};
592
593	struct Reinst {
594	unsigned char opcode;
595	unsigned char n;
596	Rune c;
597	Reclass *cc;
598	Reinst *x;
599	Reinst *y;
600	};
601
602	static int count(struct cstate g, Renode node)
603	{
604	int min, max, n;
605	if (!node) return `0`;
606	switch (node->type) {
607	default: return `1`;
608	case P_CAT: return count(g, node->x) + count(g, node->y);
609	case P_ALT: return count(g, node->x) + count(g, node->y) + `2`;
610	case P_REP:
611	min = node->m;
612	max = node->n;
613	if (min == max) n = count(g, node->x) * min;
614	else if (max < REPINF) n = count(g, node->x) * max + (max - min);
615	else n = count(g, node->x) * (min + `1`) + `2`;
616	if (n < `0` \|\| n > MAXPROG) die(g, "program too large");
617	return n;
618	case P_PAR: return count(g, node->x) + `2`;
619	case P_PLA: return count(g, node->x) + `2`;
620	case P_NLA: return count(g, node->x) + `2`;
621	}
622	}
623
624	static Reinst emit(Reprog prog, int opcode)
625	{
626	Reinst *inst = prog->end++;
627	inst->opcode = opcode;
628	inst->n = `0`;
629	inst->c = `0`;
630	inst->cc = NULL;
631	inst->x = inst->y = NULL;
632	return inst;
633	}
634
635	static void compile(Reprog prog, Renode node)
636	{
637	Reinst inst, split, *jump;
638	int i;
639
640	if (!node)
641	return;
642
643	loop:
644	switch (node->type) {
645	case P_CAT:
646	compile(prog, node->x);
647	node = node->y;
648	goto loop;
649
650	case P_ALT:
651	split = emit(prog, I_SPLIT);
652	compile(prog, node->x);
653	jump = emit(prog, I_JUMP);
654	compile(prog, node->y);
655	split->x = split + `1`;
656	split->y = jump + `1`;
657	jump->x = prog->end;
658	break;
659
660	case P_REP:
661	inst = NULL; / silence compiler warning. assert(node->m > 0). /
662	for (i = `0`; i < node->m; ++i) {
663	inst = prog->end;
664	compile(prog, node->x);
665	}
666	if (node->m == node->n)
667	break;
668	if (node->n < REPINF) {
669	for (i = node->m; i < node->n; ++i) {
670	split = emit(prog, I_SPLIT);
671	compile(prog, node->x);
672	if (node->ng) {
673	split->y = split + `1`;
674	split->x = prog->end;
675	} else {
676	split->x = split + `1`;
677	split->y = prog->end;
678	}
679	}
680	} else if (node->m == `0`) {
681	split = emit(prog, I_SPLIT);
682	compile(prog, node->x);
683	jump = emit(prog, I_JUMP);
684	if (node->ng) {
685	split->y = split + `1`;
686	split->x = prog->end;
687	} else {
688	split->x = split + `1`;
689	split->y = prog->end;
690	}
691	jump->x = split;
692	} else {
693	split = emit(prog, I_SPLIT);
694	if (node->ng) {
695	split->y = inst;
696	split->x = prog->end;
697	} else {
698	split->x = inst;
699	split->y = prog->end;
700	}
701	}
702	break;
703
704	case P_BOL: emit(prog, I_BOL); break;
705	case P_EOL: emit(prog, I_EOL); break;
706	case P_WORD: emit(prog, I_WORD); break;
707	case P_NWORD: emit(prog, I_NWORD); break;
708
709	case P_PAR:
710	inst = emit(prog, I_LPAR);
711	inst->n = node->n;
712	compile(prog, node->x);
713	inst = emit(prog, I_RPAR);
714	inst->n = node->n;
715	break;
716	case P_PLA:
717	split = emit(prog, I_PLA);
718	compile(prog, node->x);
719	emit(prog, I_END);
720	split->x = split + `1`;
721	split->y = prog->end;
722	break;
723	case P_NLA:
724	split = emit(prog, I_NLA);
725	compile(prog, node->x);
726	emit(prog, I_END);
727	split->x = split + `1`;
728	split->y = prog->end;
729	break;
730
731	case P_ANY:
732	emit(prog, I_ANY);
733	break;
734	case P_CHAR:
735	inst = emit(prog, I_CHAR);
736	inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c;
737	break;
738	case P_CCLASS:
739	inst = emit(prog, I_CCLASS);
740	inst->cc = node->cc;
741	break;
742	case P_NCCLASS:
743	inst = emit(prog, I_NCCLASS);
744	inst->cc = node->cc;
745	break;
746	case P_REF:
747	inst = emit(prog, I_REF);
748	inst->n = node->n;
749	break;
750	}
751	}
752
753	#ifdef TEST
754	static void dumpnode(Renode *node)
755	{
756	Rune *p;
757	if (!node) { printf("Empty"); return; }
758	switch (node->type) {
759	case P_CAT: printf("Cat("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
760	case P_ALT: printf("Alt("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
761	case P_REP:
762	printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n);
763	dumpnode(node->x);
764	printf(")");
765	break;
766	case P_BOL: printf("Bol"); break;
767	case P_EOL: printf("Eol"); break;
768	case P_WORD: printf("Word"); break;
769	case P_NWORD: printf("NotWord"); break;
770	case P_PAR: printf("Par(%d,", node->n); dumpnode(node->x); printf(")"); break;
771	case P_PLA: printf("PLA("); dumpnode(node->x); printf(")"); break;
772	case P_NLA: printf("NLA("); dumpnode(node->x); printf(")"); break;
773	case P_ANY: printf("Any"); break;
774	case P_CHAR: printf("Char(%c)", node->c); break;
775	case P_CCLASS:
776	printf("Class(");
777	for (p = node->cc->spans; p < node->cc->end; p += `2`) printf("%02X-%02X,", p[`0`], p[`1`]);
778	printf(")");
779	break;
780	case P_NCCLASS:
781	printf("NotClass(");
782	for (p = node->cc->spans; p < node->cc->end; p += `2`) printf("%02X-%02X,", p[`0`], p[`1`]);
783	printf(")");
784	break;
785	case P_REF: printf("Ref(%d)", node->n); break;
786	}
787	}
788
789	static void dumpprog(Reprog *prog)
790	{
791	Reinst *inst;
792	int i;
793	for (i = `0`, inst = prog->start; inst < prog->end; ++i, ++inst) {
794	printf("% 5d: ", i);
795	switch (inst->opcode) {
796	case I_END: puts("end"); break;
797	case I_JUMP: printf("jump %d\n", (int)(inst->x - prog->start)); break;
798	case I_SPLIT: printf("split %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
799	case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
800	case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
801	case I_ANY: puts("any"); break;
802	case I_ANYNL: puts("anynl"); break;
803	case I_CHAR: printf(inst->c >= `32` && inst->c < `127` ? "char '%c'\n" : "char U+%04X\n", inst->c); break;
804	case I_CCLASS: puts("cclass"); break;
805	case I_NCCLASS: puts("ncclass"); break;
806	case I_REF: printf("ref %d\n", inst->n); break;
807	case I_BOL: puts("bol"); break;
808	case I_EOL: puts("eol"); break;
809	case I_WORD: puts("word"); break;
810	case I_NWORD: puts("nword"); break;
811	case I_LPAR: printf("lpar %d\n", inst->n); break;
812	case I_RPAR: printf("rpar %d\n", inst->n); break;
813	}
814	}
815	}
816	#endif
817
818	Reprog regcompx(void* (alloc)(void ctx, void* p, int* n), void *ctx,
819	const char pattern, int* cflags, const char **errorp)
820	{
821	struct cstate g;
822	Renode *node;
823	Reinst split, jump;
824	int i, n;
825
826	g.pstart = NULL;
827	g.prog = NULL;
828
829	if (setjmp(g.kaboom)) {
830	if (errorp) *errorp = g.error;
831	alloc(ctx, g.pstart, `0`);
832	alloc(ctx, g.prog, `0`);
833	return NULL;
834	}
835
836	g.prog = alloc(ctx, NULL, sizeof (Reprog));
837	if (!g.prog)
838	die(&g, "cannot allocate regular expression");
839	n = strlen(pattern) * `2`;
840	if (n > MAXPROG)
841	die(&g, "program too large");
842	if (n > `0`) {
843	g.pstart = g.pend = alloc(ctx, NULL, sizeof (Renode) * n);
844	if (!g.pstart)
845	die(&g, "cannot allocate regular expression parse list");
846	}
847
848	g.source = pattern;
849	g.ncclass = `0`;
850	g.nsub = `1`;
851	for (i = `0`; i < MAXSUB; ++i)
852	g.sub[i] = `0`;
853
854	g.prog->flags = cflags;
855
856	next(&g);
857	node = parsealt(&g);
858	if (g.lookahead == `')'`)
859	die(&g, "unmatched ')'");
860	if (g.lookahead != `0`)
861	die(&g, "syntax error");
862
863	#ifdef TEST
864	dumpnode(node);
865	putchar(`'\n'`);
866	#endif
867
868	n = `6` + count(&g, node);
869	if (n < `0` \|\| n > MAXPROG)
870	die(&g, "program too large");
871
872	g.prog->nsub = g.nsub;
873	g.prog->start = g.prog->end = alloc(ctx, NULL, n * sizeof (Reinst));
874	if (!g.prog->start)
875	die(&g, "cannot allocate regular expression instruction list");
876
877	split = emit(g.prog, I_SPLIT);
878	split->x = split + `3`;
879	split->y = split + `1`;
880	emit(g.prog, I_ANYNL);
881	jump = emit(g.prog, I_JUMP);
882	jump->x = split;
883	emit(g.prog, I_LPAR);
884	compile(g.prog, node);
885	emit(g.prog, I_RPAR);
886	emit(g.prog, I_END);
887
888	#ifdef TEST
889	dumpprog(g.prog);
890	#endif
891
892	alloc(ctx, g.pstart, `0`);
893
894	if (errorp) *errorp = NULL;
895	return g.prog;
896	}
897
898	void regfreex(void (alloc)(void ctx, void* p, int* n), void ctx, Reprog prog)
899	{
900	if (prog) {
901	alloc(ctx, prog->start, `0`);
902	alloc(ctx, prog, `0`);
903	}
904	}
905
906	static void default_alloc(void* ctx, void* p, int* n)
907	{
908	return realloc(p, (size_t)n);
909	}
910
911	Reprog regcomp(const* char pattern, int* cflags, const char **errorp)
912	{
913	return regcompx(default_alloc, NULL, pattern, cflags, errorp);
914	}
915
916	void regfree(Reprog *prog)
917	{
918	regfreex(default_alloc, NULL, prog);
919	}
920
921	/ Match /
922
923	static int isnewline(int c)
924	{
925	return c == `0xA` \|\| c == `0xD` \|\| c == `0x2028` \|\| c == `0x2029`;
926	}
927
928	static int iswordchar(int c)
929	{
930	return c == `'_'` \|\|
931	(c >= `'a'` && c <= `'z'`) \|\|
932	(c >= `'A'` && c <= `'Z'`) \|\|
933	(c >= `'0'` && c <= `'9'`);
934	}
935
936	static int incclass(Reclass *cc, Rune c)
937	{
938	Rune *p;
939	for (p = cc->spans; p < cc->end; p += `2`)
940	if (p[`0`] <= c && c <= p[`1`])
941	return `1`;
942	return `0`;
943	}
944
945	static int incclasscanon(Reclass *cc, Rune c)
946	{
947	Rune *p, r;
948	for (p = cc->spans; p < cc->end; p += `2`)
949	for (r = p[`0`]; r <= p[`1`]; ++r)
950	if (c == canon(r))
951	return `1`;
952	return `0`;
953	}
954
955	static int strncmpcanon(const char a, const* char b, int* n)
956	{
957	Rune ra, rb;
958	int c;
959	while (n--) {
960	if (!a) return* -`1`;
961	if (!b) return* `1`;
962	a += chartorune(&ra, a);
963	b += chartorune(&rb, b);
964	c = canon(ra) - canon(rb);
965	if (c)
966	return c;
967	}
968	return `0`;
969	}
970
971	static int match(Reinst pc, const* char sp, const* char bol, int* flags, Resub out, int* depth)
972	{
973	Resub scratch;
974	int result;
975	int i;
976	Rune c;
977
978	/ stack overflow /
979	if (depth > MAXREC)
980	return -`1`;
981
982	for (;;) {
983	switch (pc->opcode) {
984	case I_END:
985	return `0`;
986	case I_JUMP:
987	pc = pc->x;
988	break;
989	case I_SPLIT:
990	scratch = *out;
991	result = match(pc->x, sp, bol, flags, &scratch, depth+`1`);
992	if (result == -`1`)
993	return -`1`;
994	if (result == `0`) {
995	*out = scratch;
996	return `0`;
997	}
998	pc = pc->y;
999	break;
1000
1001	case I_PLA:
1002	result = match(pc->x, sp, bol, flags, out, depth+`1`);
1003	if (result == -`1`)
1004	return -`1`;
1005	if (result == `1`)
1006	return `1`;
1007	pc = pc->y;
1008	break;
1009	case I_NLA:
1010	scratch = *out;
1011	result = match(pc->x, sp, bol, flags, &scratch, depth+`1`);
1012	if (result == -`1`)
1013	return -`1`;
1014	if (result == `0`)
1015	return `1`;
1016	pc = pc->y;
1017	break;
1018
1019	case I_ANYNL:
1020	sp += chartorune(&c, sp);
1021	if (c == `0`)
1022	return `1`;
1023	pc = pc + `1`;
1024	break;
1025	case I_ANY:
1026	sp += chartorune(&c, sp);
1027	if (c == `0`)
1028	return `1`;
1029	if (isnewline(c))
1030	return `1`;
1031	pc = pc + `1`;
1032	break;
1033	case I_CHAR:
1034	sp += chartorune(&c, sp);
1035	if (c == `0`)
1036	return `1`;
1037	if (flags & REG_ICASE)
1038	c = canon(c);
1039	if (c != pc->c)
1040	return `1`;
1041	pc = pc + `1`;
1042	break;
1043	case I_CCLASS:
1044	sp += chartorune(&c, sp);
1045	if (c == `0`)
1046	return `1`;
1047	if (flags & REG_ICASE) {
1048	if (!incclasscanon(pc->cc, canon(c)))
1049	return `1`;
1050	} else {
1051	if (!incclass(pc->cc, c))
1052	return `1`;
1053	}
1054	pc = pc + `1`;
1055	break;
1056	case I_NCCLASS:
1057	sp += chartorune(&c, sp);
1058	if (c == `0`)
1059	return `1`;
1060	if (flags & REG_ICASE) {
1061	if (incclasscanon(pc->cc, canon(c)))
1062	return `1`;
1063	} else {
1064	if (incclass(pc->cc, c))
1065	return `1`;
1066	}
1067	pc = pc + `1`;
1068	break;
1069	case I_REF:
1070	i = out->sub[pc->n].ep - out->sub[pc->n].sp;
1071	if (flags & REG_ICASE) {
1072	if (strncmpcanon(sp, out->sub[pc->n].sp, i))
1073	return `1`;
1074	} else {
1075	if (strncmp(sp, out->sub[pc->n].sp, i))
1076	return `1`;
1077	}
1078	if (i > `0`)
1079	sp += i;
1080	pc = pc + `1`;
1081	break;
1082
1083	case I_BOL:
1084	if (sp == bol && !(flags & REG_NOTBOL)) {
1085	pc = pc + `1`;
1086	break;
1087	}
1088	if (flags & REG_NEWLINE) {
1089	if (sp > bol && isnewline(sp[-`1`])) {
1090	pc = pc + `1`;
1091	break;
1092	}
1093	}
1094	return `1`;
1095	case I_EOL:
1096	if (*sp == `0`) {
1097	pc = pc + `1`;
1098	break;
1099	}
1100	if (flags & REG_NEWLINE) {
1101	if (isnewline(*sp)) {
1102	pc = pc + `1`;
1103	break;
1104	}
1105	}
1106	return `1`;
1107	case I_WORD:
1108	i = sp > bol && iswordchar(sp[-`1`]);
1109	i ^= iswordchar(sp[`0`]);
1110	if (!i)
1111	return `1`;
1112	pc = pc + `1`;
1113	break;
1114	case I_NWORD:
1115	i = sp > bol && iswordchar(sp[-`1`]);
1116	i ^= iswordchar(sp[`0`]);
1117	if (i)
1118	return `1`;
1119	pc = pc + `1`;
1120	break;
1121
1122	case I_LPAR:
1123	out->sub[pc->n].sp = sp;
1124	pc = pc + `1`;
1125	break;
1126	case I_RPAR:
1127	out->sub[pc->n].ep = sp;
1128	pc = pc + `1`;
1129	break;
1130	default:
1131	return `1`;
1132	}
1133	}
1134	}
1135
1136	int regexec(Reprog prog, const* char sp, Resub sub, int eflags)
1137	{
1138	Resub scratch;
1139	int i;
1140
1141	if (!sub)
1142	sub = &scratch;
1143
1144	sub->nsub = prog->nsub;
1145	for (i = `0`; i < MAXSUB; ++i)
1146	sub->sub[i].sp = sub->sub[i].ep = NULL;
1147
1148	return match(prog->start, sp, sp, prog->flags \| eflags, sub, `0`);
1149	}
1150
1151	#ifdef TEST
1152	int main(int argc, char **argv)
1153	{
1154	const char *error;
1155	const char *s;
1156	Reprog *p;
1157	Resub m;
1158	int i;
1159
1160	if (argc > `1`) {
1161	p = regcomp(argv[`1`], `0`, &error);
1162	if (!p) {
1163	fprintf(stderr, "regcomp: %s\n", error);
1164	return `1`;
1165	}
1166
1167	if (argc > `2`) {
1168	s = argv[`2`];
1169	printf("nsub = %d\n", p->nsub);
1170	if (!regexec(p, s, &m, `0`)) {
1171	for (i = `0`; i < m.nsub; ++i) {
1172	int n = m.sub[i].ep - m.sub[i].sp;
1173	if (n > `0`)
1174	printf("match %d: s=%d e=%d n=%d '%.s'\n", i, (int)(m.sub[i].sp - s), (int*)(m.sub[i].ep - s), n, n, m.sub[i].sp);
1175	else
1176	printf("match %d: n=0 ''\n", i);
1177	}
1178	} else {
1179	printf("no match\n");
1180	}
1181	}
1182	}
1183
1184	return `0`;
1185	}
1186	#endif
1187

Browse the source code of MuPDF/thirdparty/mujs/regexp.c