1 | /* |
2 | ** $Id: lpcode.c,v 1.24 2016/09/15 17:46:13 roberto Exp $ |
3 | ** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license) |
4 | */ |
5 | |
6 | #include <limits.h> |
7 | |
8 | |
9 | #include "lua.h" |
10 | #include "lauxlib.h" |
11 | |
12 | #include "lptypes.h" |
13 | #include "lpcode.h" |
14 | |
15 | |
16 | /* signals a "no-instruction */ |
17 | #define NOINST -1 |
18 | |
19 | |
20 | |
21 | static const Charset fullset_ = |
22 | {{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
23 | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
24 | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
25 | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}}; |
26 | |
27 | static const Charset *fullset = &fullset_; |
28 | |
29 | /* |
30 | ** {====================================================== |
31 | ** Analysis and some optimizations |
32 | ** ======================================================= |
33 | */ |
34 | |
35 | /* |
36 | ** Check whether a charset is empty (returns IFail), singleton (IChar), |
37 | ** full (IAny), or none of those (ISet). When singleton, '*c' returns |
38 | ** which character it is. (When generic set, the set was the input, |
39 | ** so there is no need to return it.) |
40 | */ |
41 | static Opcode charsettype (const byte *cs, int *c) { |
42 | int count = 0; /* number of characters in the set */ |
43 | int i; |
44 | int candidate = -1; /* candidate position for the singleton char */ |
45 | for (i = 0; i < CHARSETSIZE; i++) { /* for each byte */ |
46 | int b = cs[i]; |
47 | if (b == 0) { /* is byte empty? */ |
48 | if (count > 1) /* was set neither empty nor singleton? */ |
49 | return ISet; /* neither full nor empty nor singleton */ |
50 | /* else set is still empty or singleton */ |
51 | } |
52 | else if (b == 0xFF) { /* is byte full? */ |
53 | if (count < (i * BITSPERCHAR)) /* was set not full? */ |
54 | return ISet; /* neither full nor empty nor singleton */ |
55 | else count += BITSPERCHAR; /* set is still full */ |
56 | } |
57 | else if ((b & (b - 1)) == 0) { /* has byte only one bit? */ |
58 | if (count > 0) /* was set not empty? */ |
59 | return ISet; /* neither full nor empty nor singleton */ |
60 | else { /* set has only one char till now; track it */ |
61 | count++; |
62 | candidate = i; |
63 | } |
64 | } |
65 | else return ISet; /* byte is neither empty, full, nor singleton */ |
66 | } |
67 | switch (count) { |
68 | case 0: return IFail; /* empty set */ |
69 | case 1: { /* singleton; find character bit inside byte */ |
70 | int b = cs[candidate]; |
71 | *c = candidate * BITSPERCHAR; |
72 | if ((b & 0xF0) != 0) { *c += 4; b >>= 4; } |
73 | if ((b & 0x0C) != 0) { *c += 2; b >>= 2; } |
74 | if ((b & 0x02) != 0) { *c += 1; } |
75 | return IChar; |
76 | } |
77 | default: { |
78 | assert(count == CHARSETSIZE * BITSPERCHAR); /* full set */ |
79 | return IAny; |
80 | } |
81 | } |
82 | } |
83 | |
84 | |
85 | /* |
86 | ** A few basic operations on Charsets |
87 | */ |
88 | static void cs_complement (Charset *cs) { |
89 | loopset(i, cs->cs[i] = ~cs->cs[i]); |
90 | } |
91 | |
92 | static int cs_equal (const byte *cs1, const byte *cs2) { |
93 | loopset(i, if (cs1[i] != cs2[i]) return 0); |
94 | return 1; |
95 | } |
96 | |
97 | static int cs_disjoint (const Charset *cs1, const Charset *cs2) { |
98 | loopset(i, if ((cs1->cs[i] & cs2->cs[i]) != 0) return 0;) |
99 | return 1; |
100 | } |
101 | |
102 | |
103 | /* |
104 | ** If 'tree' is a 'char' pattern (TSet, TChar, TAny), convert it into a |
105 | ** charset and return 1; else return 0. |
106 | */ |
107 | int tocharset (TTree *tree, Charset *cs) { |
108 | switch (tree->tag) { |
109 | case TSet: { /* copy set */ |
110 | loopset(i, cs->cs[i] = treebuffer(tree)[i]); |
111 | return 1; |
112 | } |
113 | case TChar: { /* only one char */ |
114 | assert(0 <= tree->u.n && tree->u.n <= UCHAR_MAX); |
115 | loopset(i, cs->cs[i] = 0); /* erase all chars */ |
116 | setchar(cs->cs, tree->u.n); /* add that one */ |
117 | return 1; |
118 | } |
119 | case TAny: { |
120 | loopset(i, cs->cs[i] = 0xFF); /* add all characters to the set */ |
121 | return 1; |
122 | } |
123 | default: return 0; |
124 | } |
125 | } |
126 | |
127 | |
128 | /* |
129 | ** Visit a TCall node taking care to stop recursion. If node not yet |
130 | ** visited, return 'f(sib2(tree))', otherwise return 'def' (default |
131 | ** value) |
132 | */ |
133 | static int callrecursive (TTree *tree, int f (TTree *t), int def) { |
134 | int key = tree->key; |
135 | assert(tree->tag == TCall); |
136 | assert(sib2(tree)->tag == TRule); |
137 | if (key == 0) /* node already visited? */ |
138 | return def; /* return default value */ |
139 | else { /* first visit */ |
140 | int result; |
141 | tree->key = 0; /* mark call as already visited */ |
142 | result = f(sib2(tree)); /* go to called rule */ |
143 | tree->key = key; /* restore tree */ |
144 | return result; |
145 | } |
146 | } |
147 | |
148 | |
149 | /* |
150 | ** Check whether a pattern tree has captures |
151 | */ |
152 | int hascaptures (TTree *tree) { |
153 | tailcall: |
154 | switch (tree->tag) { |
155 | case TCapture: case TRunTime: |
156 | return 1; |
157 | case TCall: |
158 | return callrecursive(tree, hascaptures, 0); |
159 | case TRule: /* do not follow siblings */ |
160 | tree = sib1(tree); goto tailcall; |
161 | case TOpenCall: assert(0); |
162 | default: { |
163 | switch (numsiblings[tree->tag]) { |
164 | case 1: /* return hascaptures(sib1(tree)); */ |
165 | tree = sib1(tree); goto tailcall; |
166 | case 2: |
167 | if (hascaptures(sib1(tree))) |
168 | return 1; |
169 | /* else return hascaptures(sib2(tree)); */ |
170 | tree = sib2(tree); goto tailcall; |
171 | default: assert(numsiblings[tree->tag] == 0); return 0; |
172 | } |
173 | } |
174 | } |
175 | } |
176 | |
177 | |
178 | /* |
179 | ** Checks how a pattern behaves regarding the empty string, |
180 | ** in one of two different ways: |
181 | ** A pattern is *nullable* if it can match without consuming any character; |
182 | ** A pattern is *nofail* if it never fails for any string |
183 | ** (including the empty string). |
184 | ** The difference is only for predicates and run-time captures; |
185 | ** for other patterns, the two properties are equivalent. |
186 | ** (With predicates, &'a' is nullable but not nofail. Of course, |
187 | ** nofail => nullable.) |
188 | ** These functions are all convervative in the following way: |
189 | ** p is nullable => nullable(p) |
190 | ** nofail(p) => p cannot fail |
191 | ** The function assumes that TOpenCall is not nullable; |
192 | ** this will be checked again when the grammar is fixed. |
193 | ** Run-time captures can do whatever they want, so the result |
194 | ** is conservative. |
195 | */ |
196 | int checkaux (TTree *tree, int pred) { |
197 | tailcall: |
198 | switch (tree->tag) { |
199 | case TChar: case TSet: case TAny: |
200 | case TFalse: case TOpenCall: |
201 | return 0; /* not nullable */ |
202 | case TRep: case TTrue: |
203 | return 1; /* no fail */ |
204 | case TNot: case TBehind: /* can match empty, but can fail */ |
205 | if (pred == PEnofail) return 0; |
206 | else return 1; /* PEnullable */ |
207 | case TAnd: /* can match empty; fail iff body does */ |
208 | if (pred == PEnullable) return 1; |
209 | /* else return checkaux(sib1(tree), pred); */ |
210 | tree = sib1(tree); goto tailcall; |
211 | case TRunTime: /* can fail; match empty iff body does */ |
212 | if (pred == PEnofail) return 0; |
213 | /* else return checkaux(sib1(tree), pred); */ |
214 | tree = sib1(tree); goto tailcall; |
215 | case TSeq: |
216 | if (!checkaux(sib1(tree), pred)) return 0; |
217 | /* else return checkaux(sib2(tree), pred); */ |
218 | tree = sib2(tree); goto tailcall; |
219 | case TChoice: |
220 | if (checkaux(sib2(tree), pred)) return 1; |
221 | /* else return checkaux(sib1(tree), pred); */ |
222 | tree = sib1(tree); goto tailcall; |
223 | case TCapture: case TGrammar: case TRule: |
224 | /* return checkaux(sib1(tree), pred); */ |
225 | tree = sib1(tree); goto tailcall; |
226 | case TCall: /* return checkaux(sib2(tree), pred); */ |
227 | tree = sib2(tree); goto tailcall; |
228 | default: assert(0); return 0; |
229 | } |
230 | } |
231 | |
232 | |
233 | /* |
234 | ** number of characters to match a pattern (or -1 if variable) |
235 | */ |
236 | int fixedlen (TTree *tree) { |
237 | int len = 0; /* to accumulate in tail calls */ |
238 | tailcall: |
239 | switch (tree->tag) { |
240 | case TChar: case TSet: case TAny: |
241 | return len + 1; |
242 | case TFalse: case TTrue: case TNot: case TAnd: case TBehind: |
243 | return len; |
244 | case TRep: case TRunTime: case TOpenCall: |
245 | return -1; |
246 | case TCapture: case TRule: case TGrammar: |
247 | /* return fixedlen(sib1(tree)); */ |
248 | tree = sib1(tree); goto tailcall; |
249 | case TCall: { |
250 | int n1 = callrecursive(tree, fixedlen, -1); |
251 | if (n1 < 0) |
252 | return -1; |
253 | else |
254 | return len + n1; |
255 | } |
256 | case TSeq: { |
257 | int n1 = fixedlen(sib1(tree)); |
258 | if (n1 < 0) |
259 | return -1; |
260 | /* else return fixedlen(sib2(tree)) + len; */ |
261 | len += n1; tree = sib2(tree); goto tailcall; |
262 | } |
263 | case TChoice: { |
264 | int n1 = fixedlen(sib1(tree)); |
265 | int n2 = fixedlen(sib2(tree)); |
266 | if (n1 != n2 || n1 < 0) |
267 | return -1; |
268 | else |
269 | return len + n1; |
270 | } |
271 | default: assert(0); return 0; |
272 | }; |
273 | } |
274 | |
275 | |
276 | /* |
277 | ** Computes the 'first set' of a pattern. |
278 | ** The result is a conservative aproximation: |
279 | ** match p ax -> x (for some x) ==> a belongs to first(p) |
280 | ** or |
281 | ** a not in first(p) ==> match p ax -> fail (for all x) |
282 | ** |
283 | ** The set 'follow' is the first set of what follows the |
284 | ** pattern (full set if nothing follows it). |
285 | ** |
286 | ** The function returns 0 when this resulting set can be used for |
287 | ** test instructions that avoid the pattern altogether. |
288 | ** A non-zero return can happen for two reasons: |
289 | ** 1) match p '' -> '' ==> return has bit 1 set |
290 | ** (tests cannot be used because they would always fail for an empty input); |
291 | ** 2) there is a match-time capture ==> return has bit 2 set |
292 | ** (optimizations should not bypass match-time captures). |
293 | */ |
294 | static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { |
295 | tailcall: |
296 | switch (tree->tag) { |
297 | case TChar: case TSet: case TAny: { |
298 | tocharset(tree, firstset); |
299 | return 0; |
300 | } |
301 | case TTrue: { |
302 | loopset(i, firstset->cs[i] = follow->cs[i]); |
303 | return 1; /* accepts the empty string */ |
304 | } |
305 | case TFalse: { |
306 | loopset(i, firstset->cs[i] = 0); |
307 | return 0; |
308 | } |
309 | case TChoice: { |
310 | Charset csaux; |
311 | int e1 = getfirst(sib1(tree), follow, firstset); |
312 | int e2 = getfirst(sib2(tree), follow, &csaux); |
313 | loopset(i, firstset->cs[i] |= csaux.cs[i]); |
314 | return e1 | e2; |
315 | } |
316 | case TSeq: { |
317 | if (!nullable(sib1(tree))) { |
318 | /* when p1 is not nullable, p2 has nothing to contribute; |
319 | return getfirst(sib1(tree), fullset, firstset); */ |
320 | tree = sib1(tree); follow = fullset; goto tailcall; |
321 | } |
322 | else { /* FIRST(p1 p2, fl) = FIRST(p1, FIRST(p2, fl)) */ |
323 | Charset csaux; |
324 | int e2 = getfirst(sib2(tree), follow, &csaux); |
325 | int e1 = getfirst(sib1(tree), &csaux, firstset); |
326 | if (e1 == 0) return 0; /* 'e1' ensures that first can be used */ |
327 | else if ((e1 | e2) & 2) /* one of the children has a matchtime? */ |
328 | return 2; /* pattern has a matchtime capture */ |
329 | else return e2; /* else depends on 'e2' */ |
330 | } |
331 | } |
332 | case TRep: { |
333 | getfirst(sib1(tree), follow, firstset); |
334 | loopset(i, firstset->cs[i] |= follow->cs[i]); |
335 | return 1; /* accept the empty string */ |
336 | } |
337 | case TCapture: case TGrammar: case TRule: { |
338 | /* return getfirst(sib1(tree), follow, firstset); */ |
339 | tree = sib1(tree); goto tailcall; |
340 | } |
341 | case TRunTime: { /* function invalidates any follow info. */ |
342 | int e = getfirst(sib1(tree), fullset, firstset); |
343 | if (e) return 2; /* function is not "protected"? */ |
344 | else return 0; /* pattern inside capture ensures first can be used */ |
345 | } |
346 | case TCall: { |
347 | /* return getfirst(sib2(tree), follow, firstset); */ |
348 | tree = sib2(tree); goto tailcall; |
349 | } |
350 | case TAnd: { |
351 | int e = getfirst(sib1(tree), follow, firstset); |
352 | loopset(i, firstset->cs[i] &= follow->cs[i]); |
353 | return e; |
354 | } |
355 | case TNot: { |
356 | if (tocharset(sib1(tree), firstset)) { |
357 | cs_complement(firstset); |
358 | return 1; |
359 | } |
360 | /* else go through */ |
361 | } |
362 | case TBehind: { /* instruction gives no new information */ |
363 | /* call 'getfirst' only to check for math-time captures */ |
364 | int e = getfirst(sib1(tree), follow, firstset); |
365 | loopset(i, firstset->cs[i] = follow->cs[i]); /* uses follow */ |
366 | return e | 1; /* always can accept the empty string */ |
367 | } |
368 | default: assert(0); return 0; |
369 | } |
370 | } |
371 | |
372 | |
373 | /* |
374 | ** If 'headfail(tree)' true, then 'tree' can fail only depending on the |
375 | ** next character of the subject. |
376 | */ |
377 | static int headfail (TTree *tree) { |
378 | tailcall: |
379 | switch (tree->tag) { |
380 | case TChar: case TSet: case TAny: case TFalse: |
381 | return 1; |
382 | case TTrue: case TRep: case TRunTime: case TNot: |
383 | case TBehind: |
384 | return 0; |
385 | case TCapture: case TGrammar: case TRule: case TAnd: |
386 | tree = sib1(tree); goto tailcall; /* return headfail(sib1(tree)); */ |
387 | case TCall: |
388 | tree = sib2(tree); goto tailcall; /* return headfail(sib2(tree)); */ |
389 | case TSeq: |
390 | if (!nofail(sib2(tree))) return 0; |
391 | /* else return headfail(sib1(tree)); */ |
392 | tree = sib1(tree); goto tailcall; |
393 | case TChoice: |
394 | if (!headfail(sib1(tree))) return 0; |
395 | /* else return headfail(sib2(tree)); */ |
396 | tree = sib2(tree); goto tailcall; |
397 | default: assert(0); return 0; |
398 | } |
399 | } |
400 | |
401 | |
402 | /* |
403 | ** Check whether the code generation for the given tree can benefit |
404 | ** from a follow set (to avoid computing the follow set when it is |
405 | ** not needed) |
406 | */ |
407 | static int needfollow (TTree *tree) { |
408 | tailcall: |
409 | switch (tree->tag) { |
410 | case TChar: case TSet: case TAny: |
411 | case TFalse: case TTrue: case TAnd: case TNot: |
412 | case TRunTime: case TGrammar: case TCall: case TBehind: |
413 | return 0; |
414 | case TChoice: case TRep: |
415 | return 1; |
416 | case TCapture: |
417 | tree = sib1(tree); goto tailcall; |
418 | case TSeq: |
419 | tree = sib2(tree); goto tailcall; |
420 | default: assert(0); return 0; |
421 | } |
422 | } |
423 | |
424 | /* }====================================================== */ |
425 | |
426 | |
427 | |
428 | /* |
429 | ** {====================================================== |
430 | ** Code generation |
431 | ** ======================================================= |
432 | */ |
433 | |
434 | |
435 | /* |
436 | ** size of an instruction |
437 | */ |
438 | int sizei (const Instruction *i) { |
439 | switch((Opcode)i->i.code) { |
440 | case ISet: case ISpan: return CHARSETINSTSIZE; |
441 | case ITestSet: return CHARSETINSTSIZE + 1; |
442 | case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall: |
443 | case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit: |
444 | return 2; |
445 | default: return 1; |
446 | } |
447 | } |
448 | |
449 | |
450 | /* |
451 | ** state for the compiler |
452 | */ |
453 | typedef struct CompileState { |
454 | Pattern *p; /* pattern being compiled */ |
455 | int ncode; /* next position in p->code to be filled */ |
456 | lua_State *L; |
457 | } CompileState; |
458 | |
459 | |
460 | /* |
461 | ** code generation is recursive; 'opt' indicates that the code is being |
462 | ** generated as the last thing inside an optional pattern (so, if that |
463 | ** code is optional too, it can reuse the 'IChoice' already in place for |
464 | ** the outer pattern). 'tt' points to a previous test protecting this |
465 | ** code (or NOINST). 'fl' is the follow set of the pattern. |
466 | */ |
467 | static void codegen (CompileState *compst, TTree *tree, int opt, int tt, |
468 | const Charset *fl); |
469 | |
470 | |
471 | void realloccode (lua_State *L, Pattern *p, int nsize) { |
472 | void *ud; |
473 | lua_Alloc f = lua_getallocf(L, &ud); |
474 | void *newblock = f(ud, p->code, p->codesize * sizeof(Instruction), |
475 | nsize * sizeof(Instruction)); |
476 | if (newblock == NULL && nsize > 0) |
477 | luaL_error(L, "not enough memory" ); |
478 | p->code = (Instruction *)newblock; |
479 | p->codesize = nsize; |
480 | } |
481 | |
482 | |
483 | static int nextinstruction (CompileState *compst) { |
484 | int size = compst->p->codesize; |
485 | if (compst->ncode >= size) |
486 | realloccode(compst->L, compst->p, size * 2); |
487 | return compst->ncode++; |
488 | } |
489 | |
490 | |
491 | #define getinstr(cs,i) ((cs)->p->code[i]) |
492 | |
493 | |
494 | static int addinstruction (CompileState *compst, Opcode op, int aux) { |
495 | int i = nextinstruction(compst); |
496 | getinstr(compst, i).i.code = op; |
497 | getinstr(compst, i).i.aux = aux; |
498 | return i; |
499 | } |
500 | |
501 | |
502 | /* |
503 | ** Add an instruction followed by space for an offset (to be set later) |
504 | */ |
505 | static int addoffsetinst (CompileState *compst, Opcode op) { |
506 | int i = addinstruction(compst, op, 0); /* instruction */ |
507 | addinstruction(compst, (Opcode)0, 0); /* open space for offset */ |
508 | assert(op == ITestSet || sizei(&getinstr(compst, i)) == 2); |
509 | return i; |
510 | } |
511 | |
512 | |
513 | /* |
514 | ** Set the offset of an instruction |
515 | */ |
516 | static void setoffset (CompileState *compst, int instruction, int offset) { |
517 | getinstr(compst, instruction + 1).offset = offset; |
518 | } |
519 | |
520 | |
521 | /* |
522 | ** Add a capture instruction: |
523 | ** 'op' is the capture instruction; 'cap' the capture kind; |
524 | ** 'key' the key into ktable; 'aux' is the optional capture offset |
525 | ** |
526 | */ |
527 | static int addinstcap (CompileState *compst, Opcode op, int cap, int key, |
528 | int aux) { |
529 | int i = addinstruction(compst, op, joinkindoff(cap, aux)); |
530 | getinstr(compst, i).i.key = key; |
531 | return i; |
532 | } |
533 | |
534 | |
535 | #define gethere(compst) ((compst)->ncode) |
536 | |
537 | #define target(code,i) ((i) + code[i + 1].offset) |
538 | |
539 | |
540 | /* |
541 | ** Patch 'instruction' to jump to 'target' |
542 | */ |
543 | static void jumptothere (CompileState *compst, int instruction, int target) { |
544 | if (instruction >= 0) |
545 | setoffset(compst, instruction, target - instruction); |
546 | } |
547 | |
548 | |
549 | /* |
550 | ** Patch 'instruction' to jump to current position |
551 | */ |
552 | static void jumptohere (CompileState *compst, int instruction) { |
553 | jumptothere(compst, instruction, gethere(compst)); |
554 | } |
555 | |
556 | |
557 | /* |
558 | ** Code an IChar instruction, or IAny if there is an equivalent |
559 | ** test dominating it |
560 | */ |
561 | static void codechar (CompileState *compst, int c, int tt) { |
562 | if (tt >= 0 && getinstr(compst, tt).i.code == ITestChar && |
563 | getinstr(compst, tt).i.aux == c) |
564 | addinstruction(compst, IAny, 0); |
565 | else |
566 | addinstruction(compst, IChar, c); |
567 | } |
568 | |
569 | |
570 | /* |
571 | ** Add a charset posfix to an instruction |
572 | */ |
573 | static void addcharset (CompileState *compst, const byte *cs) { |
574 | int p = gethere(compst); |
575 | int i; |
576 | for (i = 0; i < (int)CHARSETINSTSIZE - 1; i++) |
577 | nextinstruction(compst); /* space for buffer */ |
578 | /* fill buffer with charset */ |
579 | loopset(j, getinstr(compst, p).buff[j] = cs[j]); |
580 | } |
581 | |
582 | |
583 | /* |
584 | ** code a char set, optimizing unit sets for IChar, "complete" |
585 | ** sets for IAny, and empty sets for IFail; also use an IAny |
586 | ** when instruction is dominated by an equivalent test. |
587 | */ |
588 | static void codecharset (CompileState *compst, const byte *cs, int tt) { |
589 | int c = 0; /* (=) to avoid warnings */ |
590 | Opcode op = charsettype(cs, &c); |
591 | switch (op) { |
592 | case IChar: codechar(compst, c, tt); break; |
593 | case ISet: { /* non-trivial set? */ |
594 | if (tt >= 0 && getinstr(compst, tt).i.code == ITestSet && |
595 | cs_equal(cs, getinstr(compst, tt + 2).buff)) |
596 | addinstruction(compst, IAny, 0); |
597 | else { |
598 | addinstruction(compst, ISet, 0); |
599 | addcharset(compst, cs); |
600 | } |
601 | break; |
602 | } |
603 | default: addinstruction(compst, op, c); break; |
604 | } |
605 | } |
606 | |
607 | |
608 | /* |
609 | ** code a test set, optimizing unit sets for ITestChar, "complete" |
610 | ** sets for ITestAny, and empty sets for IJmp (always fails). |
611 | ** 'e' is true iff test should accept the empty string. (Test |
612 | ** instructions in the current VM never accept the empty string.) |
613 | */ |
614 | static int codetestset (CompileState *compst, Charset *cs, int e) { |
615 | if (e) return NOINST; /* no test */ |
616 | else { |
617 | int c = 0; |
618 | Opcode op = charsettype(cs->cs, &c); |
619 | switch (op) { |
620 | case IFail: return addoffsetinst(compst, IJmp); /* always jump */ |
621 | case IAny: return addoffsetinst(compst, ITestAny); |
622 | case IChar: { |
623 | int i = addoffsetinst(compst, ITestChar); |
624 | getinstr(compst, i).i.aux = c; |
625 | return i; |
626 | } |
627 | case ISet: { |
628 | int i = addoffsetinst(compst, ITestSet); |
629 | addcharset(compst, cs->cs); |
630 | return i; |
631 | } |
632 | default: assert(0); return 0; |
633 | } |
634 | } |
635 | } |
636 | |
637 | |
638 | /* |
639 | ** Find the final destination of a sequence of jumps |
640 | */ |
641 | static int finaltarget (Instruction *code, int i) { |
642 | while (code[i].i.code == IJmp) |
643 | i = target(code, i); |
644 | return i; |
645 | } |
646 | |
647 | |
648 | /* |
649 | ** final label (after traversing any jumps) |
650 | */ |
651 | static int finallabel (Instruction *code, int i) { |
652 | return finaltarget(code, target(code, i)); |
653 | } |
654 | |
655 | |
656 | /* |
657 | ** <behind(p)> == behind n; <p> (where n = fixedlen(p)) |
658 | */ |
659 | static void codebehind (CompileState *compst, TTree *tree) { |
660 | if (tree->u.n > 0) |
661 | addinstruction(compst, IBehind, tree->u.n); |
662 | codegen(compst, sib1(tree), 0, NOINST, fullset); |
663 | } |
664 | |
665 | |
666 | /* |
667 | ** Choice; optimizations: |
668 | ** - when p1 is headfail or |
669 | ** when first(p1) and first(p2) are disjoint, than |
670 | ** a character not in first(p1) cannot go to p1, and a character |
671 | ** in first(p1) cannot go to p2 (at it is not in first(p2)). |
672 | ** (The optimization is not valid if p1 accepts the empty string, |
673 | ** as then there is no character at all...) |
674 | ** - when p2 is empty and opt is true; a IPartialCommit can reuse |
675 | ** the Choice already active in the stack. |
676 | */ |
677 | static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt, |
678 | const Charset *fl) { |
679 | int emptyp2 = (p2->tag == TTrue); |
680 | Charset cs1, cs2; |
681 | int e1 = getfirst(p1, fullset, &cs1); |
682 | if (headfail(p1) || |
683 | (!e1 && (getfirst(p2, fl, &cs2), cs_disjoint(&cs1, &cs2)))) { |
684 | /* <p1 / p2> == test (fail(p1)) -> L1 ; p1 ; jmp L2; L1: p2; L2: */ |
685 | int test = codetestset(compst, &cs1, 0); |
686 | int jmp = NOINST; |
687 | codegen(compst, p1, 0, test, fl); |
688 | if (!emptyp2) |
689 | jmp = addoffsetinst(compst, IJmp); |
690 | jumptohere(compst, test); |
691 | codegen(compst, p2, opt, NOINST, fl); |
692 | jumptohere(compst, jmp); |
693 | } |
694 | else if (opt && emptyp2) { |
695 | /* p1? == IPartialCommit; p1 */ |
696 | jumptohere(compst, addoffsetinst(compst, IPartialCommit)); |
697 | codegen(compst, p1, 1, NOINST, fullset); |
698 | } |
699 | else { |
700 | /* <p1 / p2> == |
701 | test(first(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: */ |
702 | int pcommit; |
703 | int test = codetestset(compst, &cs1, e1); |
704 | int pchoice = addoffsetinst(compst, IChoice); |
705 | codegen(compst, p1, emptyp2, test, fullset); |
706 | pcommit = addoffsetinst(compst, ICommit); |
707 | jumptohere(compst, pchoice); |
708 | jumptohere(compst, test); |
709 | codegen(compst, p2, opt, NOINST, fl); |
710 | jumptohere(compst, pcommit); |
711 | } |
712 | } |
713 | |
714 | |
715 | /* |
716 | ** And predicate |
717 | ** optimization: fixedlen(p) = n ==> <&p> == <p>; behind n |
718 | ** (valid only when 'p' has no captures) |
719 | */ |
720 | static void codeand (CompileState *compst, TTree *tree, int tt) { |
721 | int n = fixedlen(tree); |
722 | if (n >= 0 && n <= MAXBEHIND && !hascaptures(tree)) { |
723 | codegen(compst, tree, 0, tt, fullset); |
724 | if (n > 0) |
725 | addinstruction(compst, IBehind, n); |
726 | } |
727 | else { /* default: Choice L1; p1; BackCommit L2; L1: Fail; L2: */ |
728 | int pcommit; |
729 | int pchoice = addoffsetinst(compst, IChoice); |
730 | codegen(compst, tree, 0, tt, fullset); |
731 | pcommit = addoffsetinst(compst, IBackCommit); |
732 | jumptohere(compst, pchoice); |
733 | addinstruction(compst, IFail, 0); |
734 | jumptohere(compst, pcommit); |
735 | } |
736 | } |
737 | |
738 | |
739 | /* |
740 | ** Captures: if pattern has fixed (and not too big) length, and it |
741 | ** has no nested captures, use a single IFullCapture instruction |
742 | ** after the match; otherwise, enclose the pattern with OpenCapture - |
743 | ** CloseCapture. |
744 | */ |
745 | static void codecapture (CompileState *compst, TTree *tree, int tt, |
746 | const Charset *fl) { |
747 | int len = fixedlen(sib1(tree)); |
748 | if (len >= 0 && len <= MAXOFF && !hascaptures(sib1(tree))) { |
749 | codegen(compst, sib1(tree), 0, tt, fl); |
750 | addinstcap(compst, IFullCapture, tree->cap, tree->key, len); |
751 | } |
752 | else { |
753 | addinstcap(compst, IOpenCapture, tree->cap, tree->key, 0); |
754 | codegen(compst, sib1(tree), 0, tt, fl); |
755 | addinstcap(compst, ICloseCapture, Cclose, 0, 0); |
756 | } |
757 | } |
758 | |
759 | |
760 | static void coderuntime (CompileState *compst, TTree *tree, int tt) { |
761 | addinstcap(compst, IOpenCapture, Cgroup, tree->key, 0); |
762 | codegen(compst, sib1(tree), 0, tt, fullset); |
763 | addinstcap(compst, ICloseRunTime, Cclose, 0, 0); |
764 | } |
765 | |
766 | |
767 | /* |
768 | ** Repetion; optimizations: |
769 | ** When pattern is a charset, can use special instruction ISpan. |
770 | ** When pattern is head fail, or if it starts with characters that |
771 | ** are disjoint from what follows the repetions, a simple test |
772 | ** is enough (a fail inside the repetition would backtrack to fail |
773 | ** again in the following pattern, so there is no need for a choice). |
774 | ** When 'opt' is true, the repetion can reuse the Choice already |
775 | ** active in the stack. |
776 | */ |
777 | static void coderep (CompileState *compst, TTree *tree, int opt, |
778 | const Charset *fl) { |
779 | Charset st; |
780 | if (tocharset(tree, &st)) { |
781 | addinstruction(compst, ISpan, 0); |
782 | addcharset(compst, st.cs); |
783 | } |
784 | else { |
785 | int e1 = getfirst(tree, fullset, &st); |
786 | if (headfail(tree) || (!e1 && cs_disjoint(&st, fl))) { |
787 | /* L1: test (fail(p1)) -> L2; <p>; jmp L1; L2: */ |
788 | int jmp; |
789 | int test = codetestset(compst, &st, 0); |
790 | codegen(compst, tree, 0, test, fullset); |
791 | jmp = addoffsetinst(compst, IJmp); |
792 | jumptohere(compst, test); |
793 | jumptothere(compst, jmp, test); |
794 | } |
795 | else { |
796 | /* test(fail(p1)) -> L2; choice L2; L1: <p>; partialcommit L1; L2: */ |
797 | /* or (if 'opt'): partialcommit L1; L1: <p>; partialcommit L1; */ |
798 | int commit, l2; |
799 | int test = codetestset(compst, &st, e1); |
800 | int pchoice = NOINST; |
801 | if (opt) |
802 | jumptohere(compst, addoffsetinst(compst, IPartialCommit)); |
803 | else |
804 | pchoice = addoffsetinst(compst, IChoice); |
805 | l2 = gethere(compst); |
806 | codegen(compst, tree, 0, NOINST, fullset); |
807 | commit = addoffsetinst(compst, IPartialCommit); |
808 | jumptothere(compst, commit, l2); |
809 | jumptohere(compst, pchoice); |
810 | jumptohere(compst, test); |
811 | } |
812 | } |
813 | } |
814 | |
815 | |
816 | /* |
817 | ** Not predicate; optimizations: |
818 | ** In any case, if first test fails, 'not' succeeds, so it can jump to |
819 | ** the end. If pattern is headfail, that is all (it cannot fail |
820 | ** in other parts); this case includes 'not' of simple sets. Otherwise, |
821 | ** use the default code (a choice plus a failtwice). |
822 | */ |
823 | static void codenot (CompileState *compst, TTree *tree) { |
824 | Charset st; |
825 | int e = getfirst(tree, fullset, &st); |
826 | int test = codetestset(compst, &st, e); |
827 | if (headfail(tree)) /* test (fail(p1)) -> L1; fail; L1: */ |
828 | addinstruction(compst, IFail, 0); |
829 | else { |
830 | /* test(fail(p))-> L1; choice L1; <p>; failtwice; L1: */ |
831 | int pchoice = addoffsetinst(compst, IChoice); |
832 | codegen(compst, tree, 0, NOINST, fullset); |
833 | addinstruction(compst, IFailTwice, 0); |
834 | jumptohere(compst, pchoice); |
835 | } |
836 | jumptohere(compst, test); |
837 | } |
838 | |
839 | |
840 | /* |
841 | ** change open calls to calls, using list 'positions' to find |
842 | ** correct offsets; also optimize tail calls |
843 | */ |
844 | static void correctcalls (CompileState *compst, int *positions, |
845 | int from, int to) { |
846 | int i; |
847 | Instruction *code = compst->p->code; |
848 | for (i = from; i < to; i += sizei(&code[i])) { |
849 | if (code[i].i.code == IOpenCall) { |
850 | int n = code[i].i.key; /* rule number */ |
851 | int rule = positions[n]; /* rule position */ |
852 | assert(rule == from || code[rule - 1].i.code == IRet); |
853 | if (code[finaltarget(code, i + 2)].i.code == IRet) /* call; ret ? */ |
854 | code[i].i.code = IJmp; /* tail call */ |
855 | else |
856 | code[i].i.code = ICall; |
857 | jumptothere(compst, i, rule); /* call jumps to respective rule */ |
858 | } |
859 | } |
860 | assert(i == to); |
861 | } |
862 | |
863 | |
864 | /* |
865 | ** Code for a grammar: |
866 | ** call L1; jmp L2; L1: rule 1; ret; rule 2; ret; ...; L2: |
867 | */ |
868 | static void codegrammar (CompileState *compst, TTree *grammar) { |
869 | int positions[MAXRULES]; |
870 | int rulenumber = 0; |
871 | TTree *rule; |
872 | int firstcall = addoffsetinst(compst, ICall); /* call initial rule */ |
873 | int jumptoend = addoffsetinst(compst, IJmp); /* jump to the end */ |
874 | int start = gethere(compst); /* here starts the initial rule */ |
875 | jumptohere(compst, firstcall); |
876 | for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) { |
877 | positions[rulenumber++] = gethere(compst); /* save rule position */ |
878 | codegen(compst, sib1(rule), 0, NOINST, fullset); /* code rule */ |
879 | addinstruction(compst, IRet, 0); |
880 | } |
881 | assert(rule->tag == TTrue); |
882 | jumptohere(compst, jumptoend); |
883 | correctcalls(compst, positions, start, gethere(compst)); |
884 | } |
885 | |
886 | |
887 | static void codecall (CompileState *compst, TTree *call) { |
888 | int c = addoffsetinst(compst, IOpenCall); /* to be corrected later */ |
889 | getinstr(compst, c).i.key = sib2(call)->cap; /* rule number */ |
890 | assert(sib2(call)->tag == TRule); |
891 | } |
892 | |
893 | |
894 | /* |
895 | ** Code first child of a sequence |
896 | ** (second child is called in-place to allow tail call) |
897 | ** Return 'tt' for second child |
898 | */ |
899 | static int codeseq1 (CompileState *compst, TTree *p1, TTree *p2, |
900 | int tt, const Charset *fl) { |
901 | if (needfollow(p1)) { |
902 | Charset fl1; |
903 | getfirst(p2, fl, &fl1); /* p1 follow is p2 first */ |
904 | codegen(compst, p1, 0, tt, &fl1); |
905 | } |
906 | else /* use 'fullset' as follow */ |
907 | codegen(compst, p1, 0, tt, fullset); |
908 | if (fixedlen(p1) != 0) /* can 'p1' consume anything? */ |
909 | return NOINST; /* invalidate test */ |
910 | else return tt; /* else 'tt' still protects sib2 */ |
911 | } |
912 | |
913 | |
914 | /* |
915 | ** Main code-generation function: dispatch to auxiliar functions |
916 | ** according to kind of tree. ('needfollow' should return true |
917 | ** only for consructions that use 'fl'.) |
918 | */ |
919 | static void codegen (CompileState *compst, TTree *tree, int opt, int tt, |
920 | const Charset *fl) { |
921 | tailcall: |
922 | switch (tree->tag) { |
923 | case TChar: codechar(compst, tree->u.n, tt); break; |
924 | case TAny: addinstruction(compst, IAny, 0); break; |
925 | case TSet: codecharset(compst, treebuffer(tree), tt); break; |
926 | case TTrue: break; |
927 | case TFalse: addinstruction(compst, IFail, 0); break; |
928 | case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break; |
929 | case TRep: coderep(compst, sib1(tree), opt, fl); break; |
930 | case TBehind: codebehind(compst, tree); break; |
931 | case TNot: codenot(compst, sib1(tree)); break; |
932 | case TAnd: codeand(compst, sib1(tree), tt); break; |
933 | case TCapture: codecapture(compst, tree, tt, fl); break; |
934 | case TRunTime: coderuntime(compst, tree, tt); break; |
935 | case TGrammar: codegrammar(compst, tree); break; |
936 | case TCall: codecall(compst, tree); break; |
937 | case TSeq: { |
938 | tt = codeseq1(compst, sib1(tree), sib2(tree), tt, fl); /* code 'p1' */ |
939 | /* codegen(compst, p2, opt, tt, fl); */ |
940 | tree = sib2(tree); goto tailcall; |
941 | } |
942 | default: assert(0); |
943 | } |
944 | } |
945 | |
946 | |
947 | /* |
948 | ** Optimize jumps and other jump-like instructions. |
949 | ** * Update labels of instructions with labels to their final |
950 | ** destinations (e.g., choice L1; ... L1: jmp L2: becomes |
951 | ** choice L2) |
952 | ** * Jumps to other instructions that do jumps become those |
953 | ** instructions (e.g., jump to return becomes a return; jump |
954 | ** to commit becomes a commit) |
955 | */ |
956 | static void peephole (CompileState *compst) { |
957 | Instruction *code = compst->p->code; |
958 | int i; |
959 | for (i = 0; i < compst->ncode; i += sizei(&code[i])) { |
960 | redo: |
961 | switch (code[i].i.code) { |
962 | case IChoice: case ICall: case ICommit: case IPartialCommit: |
963 | case IBackCommit: case ITestChar: case ITestSet: |
964 | case ITestAny: { /* instructions with labels */ |
965 | jumptothere(compst, i, finallabel(code, i)); /* optimize label */ |
966 | break; |
967 | } |
968 | case IJmp: { |
969 | int ft = finaltarget(code, i); |
970 | switch (code[ft].i.code) { /* jumping to what? */ |
971 | case IRet: case IFail: case IFailTwice: |
972 | case IEnd: { /* instructions with unconditional implicit jumps */ |
973 | code[i] = code[ft]; /* jump becomes that instruction */ |
974 | code[i + 1].i.code = IAny; /* 'no-op' for target position */ |
975 | break; |
976 | } |
977 | case ICommit: case IPartialCommit: |
978 | case IBackCommit: { /* inst. with unconditional explicit jumps */ |
979 | int fft = finallabel(code, ft); |
980 | code[i] = code[ft]; /* jump becomes that instruction... */ |
981 | jumptothere(compst, i, fft); /* but must correct its offset */ |
982 | goto redo; /* reoptimize its label */ |
983 | } |
984 | default: { |
985 | jumptothere(compst, i, ft); /* optimize label */ |
986 | break; |
987 | } |
988 | } |
989 | break; |
990 | } |
991 | default: break; |
992 | } |
993 | } |
994 | assert(code[i - 1].i.code == IEnd); |
995 | } |
996 | |
997 | |
998 | /* |
999 | ** Compile a pattern |
1000 | */ |
1001 | Instruction *compile (lua_State *L, Pattern *p) { |
1002 | CompileState compst; |
1003 | compst.p = p; compst.ncode = 0; compst.L = L; |
1004 | realloccode(L, p, 2); /* minimum initial size */ |
1005 | codegen(&compst, p->tree, 0, NOINST, fullset); |
1006 | addinstruction(&compst, IEnd, 0); |
1007 | realloccode(L, p, compst.ncode); /* set final size */ |
1008 | peephole(&compst); |
1009 | return p->code; |
1010 | } |
1011 | |
1012 | |
1013 | /* }====================================================== */ |
1014 | |
1015 | |