1 | // Copyright 2014 Paul Sokolovsky. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | #include "re1.5.h" |
6 | |
7 | #define INSERT_CODE(at, num, pc) \ |
8 | ((code ? memmove(code + at + num, code + at, pc - at) : 0), pc += num) |
9 | #define REL(at, to) (to - at - 2) |
10 | #define EMIT(at, byte) (code ? (code[at] = byte) : (at)) |
11 | #define EMIT_CHECKED(at, byte) (_emit_checked(at, code, byte, &err)) |
12 | #define PC (prog->bytelen) |
13 | |
14 | static void _emit_checked(int at, char *code, int val, bool *err) { |
15 | *err |= val != (int8_t)val; |
16 | if (code) { |
17 | code[at] = val; |
18 | } |
19 | } |
20 | |
21 | static const char *_compilecode(const char *re, ByteProg *prog, int sizecode) |
22 | { |
23 | char *code = sizecode ? NULL : prog->insts; |
24 | bool err = false; |
25 | int start = PC; |
26 | int term = PC; |
27 | int alt_label = 0; |
28 | |
29 | for (; *re && *re != ')'; re++) { |
30 | switch (*re) { |
31 | case '\\': |
32 | re++; |
33 | if (!*re) return NULL; // Trailing backslash |
34 | if ((*re | 0x20) == 'd' || (*re | 0x20) == 's' || (*re | 0x20) == 'w') { |
35 | term = PC; |
36 | EMIT(PC++, NamedClass); |
37 | EMIT(PC++, *re); |
38 | prog->len++; |
39 | break; |
40 | } |
41 | MP_FALLTHROUGH |
42 | default: |
43 | term = PC; |
44 | EMIT(PC++, Char); |
45 | EMIT(PC++, *re); |
46 | prog->len++; |
47 | break; |
48 | case '.': |
49 | term = PC; |
50 | EMIT(PC++, Any); |
51 | prog->len++; |
52 | break; |
53 | case '[': { |
54 | int cnt; |
55 | term = PC; |
56 | re++; |
57 | if (*re == '^') { |
58 | EMIT(PC++, ClassNot); |
59 | re++; |
60 | } else { |
61 | EMIT(PC++, Class); |
62 | } |
63 | PC++; // Skip # of pair byte |
64 | prog->len++; |
65 | for (cnt = 0; *re != ']'; re++, cnt++) { |
66 | if (*re == '\\') { |
67 | ++re; |
68 | } |
69 | if (!*re) return NULL; |
70 | EMIT(PC++, *re); |
71 | if (re[1] == '-' && re[2] != ']') { |
72 | re += 2; |
73 | } |
74 | EMIT(PC++, *re); |
75 | } |
76 | EMIT_CHECKED(term + 1, cnt); |
77 | break; |
78 | } |
79 | case '(': { |
80 | term = PC; |
81 | int sub = 0; |
82 | int capture = re[1] != '?' || re[2] != ':'; |
83 | |
84 | if (capture) { |
85 | sub = ++prog->sub; |
86 | EMIT(PC++, Save); |
87 | EMIT_CHECKED(PC++, 2 * sub); |
88 | prog->len++; |
89 | } else { |
90 | re += 2; |
91 | } |
92 | |
93 | re = _compilecode(re + 1, prog, sizecode); |
94 | if (re == NULL || *re != ')') return NULL; // error, or no matching paren |
95 | |
96 | if (capture) { |
97 | EMIT(PC++, Save); |
98 | EMIT_CHECKED(PC++, 2 * sub + 1); |
99 | prog->len++; |
100 | } |
101 | |
102 | break; |
103 | } |
104 | case '?': |
105 | if (PC == term) return NULL; // nothing to repeat |
106 | INSERT_CODE(term, 2, PC); |
107 | if (re[1] == '?') { |
108 | EMIT(term, RSplit); |
109 | re++; |
110 | } else { |
111 | EMIT(term, Split); |
112 | } |
113 | EMIT_CHECKED(term + 1, REL(term, PC)); |
114 | prog->len++; |
115 | term = PC; |
116 | break; |
117 | case '*': |
118 | if (PC == term) return NULL; // nothing to repeat |
119 | INSERT_CODE(term, 2, PC); |
120 | EMIT(PC, Jmp); |
121 | EMIT_CHECKED(PC + 1, REL(PC, term)); |
122 | PC += 2; |
123 | if (re[1] == '?') { |
124 | EMIT(term, RSplit); |
125 | re++; |
126 | } else { |
127 | EMIT(term, Split); |
128 | } |
129 | EMIT_CHECKED(term + 1, REL(term, PC)); |
130 | prog->len += 2; |
131 | term = PC; |
132 | break; |
133 | case '+': |
134 | if (PC == term) return NULL; // nothing to repeat |
135 | if (re[1] == '?') { |
136 | EMIT(PC, Split); |
137 | re++; |
138 | } else { |
139 | EMIT(PC, RSplit); |
140 | } |
141 | EMIT_CHECKED(PC + 1, REL(PC, term)); |
142 | PC += 2; |
143 | prog->len++; |
144 | term = PC; |
145 | break; |
146 | case '|': |
147 | if (alt_label) { |
148 | EMIT_CHECKED(alt_label, REL(alt_label, PC) + 1); |
149 | } |
150 | INSERT_CODE(start, 2, PC); |
151 | EMIT(PC++, Jmp); |
152 | alt_label = PC++; |
153 | EMIT(start, Split); |
154 | EMIT_CHECKED(start + 1, REL(start, PC)); |
155 | prog->len += 2; |
156 | term = PC; |
157 | break; |
158 | case '^': |
159 | EMIT(PC++, Bol); |
160 | prog->len++; |
161 | term = PC; |
162 | break; |
163 | case '$': |
164 | EMIT(PC++, Eol); |
165 | prog->len++; |
166 | term = PC; |
167 | break; |
168 | } |
169 | } |
170 | |
171 | if (alt_label) { |
172 | EMIT_CHECKED(alt_label, REL(alt_label, PC) + 1); |
173 | } |
174 | return err ? NULL : re; |
175 | } |
176 | |
177 | int re1_5_sizecode(const char *re) |
178 | { |
179 | ByteProg dummyprog = { |
180 | // Save 0, Save 1, Match; more bytes for "search" (vs "match") prefix code |
181 | .bytelen = 5 + NON_ANCHORED_PREFIX |
182 | }; |
183 | |
184 | if (_compilecode(re, &dummyprog, /*sizecode*/1) == NULL) return -1; |
185 | |
186 | return dummyprog.bytelen; |
187 | } |
188 | |
189 | int re1_5_compilecode(ByteProg *prog, const char *re) |
190 | { |
191 | prog->len = 0; |
192 | prog->bytelen = 0; |
193 | prog->sub = 0; |
194 | |
195 | // Add code to implement non-anchored operation ("search"), |
196 | // for anchored operation ("match"), this code will be just skipped. |
197 | // TODO: Implement search in much more efficient manner |
198 | prog->insts[prog->bytelen++] = RSplit; |
199 | prog->insts[prog->bytelen++] = 3; |
200 | prog->insts[prog->bytelen++] = Any; |
201 | prog->insts[prog->bytelen++] = Jmp; |
202 | prog->insts[prog->bytelen++] = -5; |
203 | prog->len += 3; |
204 | |
205 | prog->insts[prog->bytelen++] = Save; |
206 | prog->insts[prog->bytelen++] = 0; |
207 | prog->len++; |
208 | |
209 | re = _compilecode(re, prog, /*sizecode*/0); |
210 | if (re == NULL || *re) return 1; |
211 | |
212 | prog->insts[prog->bytelen++] = Save; |
213 | prog->insts[prog->bytelen++] = 1; |
214 | prog->len++; |
215 | |
216 | prog->insts[prog->bytelen++] = Match; |
217 | prog->len++; |
218 | |
219 | return 0; |
220 | } |
221 | |
222 | #if 0 |
223 | int main(int argc, char *argv[]) |
224 | { |
225 | int pc = 0; |
226 | ByteProg *code = re1_5_compilecode(argv[1]); |
227 | re1_5_dumpcode(code); |
228 | } |
229 | #endif |
230 | |