1 | /* |
2 | * DFA routines |
3 | * This file is #included by regexec.c. |
4 | * |
5 | * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. |
6 | * |
7 | * Development of this software was funded, in part, by Cray Research Inc., |
8 | * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics |
9 | * Corporation, none of whom are responsible for the results. The author |
10 | * thanks all of them. |
11 | * |
12 | * Redistribution and use in source and binary forms -- with or without |
13 | * modification -- are permitted for any purpose, provided that |
14 | * redistributions in source form retain this entire copyright notice and |
15 | * indicate the origin and nature of any modifications. |
16 | * |
17 | * I'd appreciate being given credit for this package in the documentation |
18 | * of software which uses it, but that is not a requirement. |
19 | * |
20 | * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, |
21 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY |
22 | * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL |
23 | * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
24 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
25 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; |
26 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
27 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR |
28 | * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF |
29 | * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
30 | * |
31 | * src/backend/regex/rege_dfa.c |
32 | * |
33 | */ |
34 | |
35 | /* |
36 | * longest - longest-preferred matching engine |
37 | * |
38 | * On success, returns match endpoint address. Returns NULL on no match. |
39 | * Internal errors also return NULL, with v->err set. |
40 | */ |
41 | static chr * |
42 | longest(struct vars *v, |
43 | struct dfa *d, |
44 | chr *start, /* where the match should start */ |
45 | chr *stop, /* match must end at or before here */ |
46 | int *hitstopp) /* record whether hit v->stop, if non-NULL */ |
47 | { |
48 | chr *cp; |
49 | chr *realstop = (stop == v->stop) ? stop : stop + 1; |
50 | color co; |
51 | struct sset *css; |
52 | struct sset *ss; |
53 | chr *post; |
54 | int i; |
55 | struct colormap *cm = d->cm; |
56 | |
57 | /* prevent "uninitialized variable" warnings */ |
58 | if (hitstopp != NULL) |
59 | *hitstopp = 0; |
60 | |
61 | /* initialize */ |
62 | css = initialize(v, d, start); |
63 | if (css == NULL) |
64 | return NULL; |
65 | cp = start; |
66 | |
67 | /* startup */ |
68 | FDEBUG(("+++ startup +++\n" )); |
69 | if (cp == v->start) |
70 | { |
71 | co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1]; |
72 | FDEBUG(("color %ld\n" , (long) co)); |
73 | } |
74 | else |
75 | { |
76 | co = GETCOLOR(cm, *(cp - 1)); |
77 | FDEBUG(("char %c, color %ld\n" , (char) *(cp - 1), (long) co)); |
78 | } |
79 | css = miss(v, d, css, co, cp, start); |
80 | if (css == NULL) |
81 | return NULL; |
82 | css->lastseen = cp; |
83 | |
84 | /* |
85 | * This is the main text-scanning loop. It seems worth having two copies |
86 | * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG |
87 | * builds, when you're not actively tracing. |
88 | */ |
89 | #ifdef REG_DEBUG |
90 | if (v->eflags & REG_FTRACE) |
91 | { |
92 | while (cp < realstop) |
93 | { |
94 | FDEBUG(("+++ at c%d +++\n" , (int) (css - d->ssets))); |
95 | co = GETCOLOR(cm, *cp); |
96 | FDEBUG(("char %c, color %ld\n" , (char) *cp, (long) co)); |
97 | ss = css->outs[co]; |
98 | if (ss == NULL) |
99 | { |
100 | ss = miss(v, d, css, co, cp + 1, start); |
101 | if (ss == NULL) |
102 | break; /* NOTE BREAK OUT */ |
103 | } |
104 | cp++; |
105 | ss->lastseen = cp; |
106 | css = ss; |
107 | } |
108 | } |
109 | else |
110 | #endif |
111 | { |
112 | while (cp < realstop) |
113 | { |
114 | co = GETCOLOR(cm, *cp); |
115 | ss = css->outs[co]; |
116 | if (ss == NULL) |
117 | { |
118 | ss = miss(v, d, css, co, cp + 1, start); |
119 | if (ss == NULL) |
120 | break; /* NOTE BREAK OUT */ |
121 | } |
122 | cp++; |
123 | ss->lastseen = cp; |
124 | css = ss; |
125 | } |
126 | } |
127 | |
128 | if (ISERR()) |
129 | return NULL; |
130 | |
131 | /* shutdown */ |
132 | FDEBUG(("+++ shutdown at c%d +++\n" , (int) (css - d->ssets))); |
133 | if (cp == v->stop && stop == v->stop) |
134 | { |
135 | if (hitstopp != NULL) |
136 | *hitstopp = 1; |
137 | co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1]; |
138 | FDEBUG(("color %ld\n" , (long) co)); |
139 | ss = miss(v, d, css, co, cp, start); |
140 | if (ISERR()) |
141 | return NULL; |
142 | /* special case: match ended at eol? */ |
143 | if (ss != NULL && (ss->flags & POSTSTATE)) |
144 | return cp; |
145 | else if (ss != NULL) |
146 | ss->lastseen = cp; /* to be tidy */ |
147 | } |
148 | |
149 | /* find last match, if any */ |
150 | post = d->lastpost; |
151 | for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--) |
152 | if ((ss->flags & POSTSTATE) && post != ss->lastseen && |
153 | (post == NULL || post < ss->lastseen)) |
154 | post = ss->lastseen; |
155 | if (post != NULL) /* found one */ |
156 | return post - 1; |
157 | |
158 | return NULL; |
159 | } |
160 | |
161 | /* |
162 | * shortest - shortest-preferred matching engine |
163 | * |
164 | * On success, returns match endpoint address. Returns NULL on no match. |
165 | * Internal errors also return NULL, with v->err set. |
166 | */ |
167 | static chr * |
168 | shortest(struct vars *v, |
169 | struct dfa *d, |
170 | chr *start, /* where the match should start */ |
171 | chr *min, /* match must end at or after here */ |
172 | chr *max, /* match must end at or before here */ |
173 | chr **coldp, /* store coldstart pointer here, if non-NULL */ |
174 | int *hitstopp) /* record whether hit v->stop, if non-NULL */ |
175 | { |
176 | chr *cp; |
177 | chr *realmin = (min == v->stop) ? min : min + 1; |
178 | chr *realmax = (max == v->stop) ? max : max + 1; |
179 | color co; |
180 | struct sset *css; |
181 | struct sset *ss; |
182 | struct colormap *cm = d->cm; |
183 | |
184 | /* prevent "uninitialized variable" warnings */ |
185 | if (coldp != NULL) |
186 | *coldp = NULL; |
187 | if (hitstopp != NULL) |
188 | *hitstopp = 0; |
189 | |
190 | /* initialize */ |
191 | css = initialize(v, d, start); |
192 | if (css == NULL) |
193 | return NULL; |
194 | cp = start; |
195 | |
196 | /* startup */ |
197 | FDEBUG(("--- startup ---\n" )); |
198 | if (cp == v->start) |
199 | { |
200 | co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1]; |
201 | FDEBUG(("color %ld\n" , (long) co)); |
202 | } |
203 | else |
204 | { |
205 | co = GETCOLOR(cm, *(cp - 1)); |
206 | FDEBUG(("char %c, color %ld\n" , (char) *(cp - 1), (long) co)); |
207 | } |
208 | css = miss(v, d, css, co, cp, start); |
209 | if (css == NULL) |
210 | return NULL; |
211 | css->lastseen = cp; |
212 | ss = css; |
213 | |
214 | /* |
215 | * This is the main text-scanning loop. It seems worth having two copies |
216 | * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG |
217 | * builds, when you're not actively tracing. |
218 | */ |
219 | #ifdef REG_DEBUG |
220 | if (v->eflags & REG_FTRACE) |
221 | { |
222 | while (cp < realmax) |
223 | { |
224 | FDEBUG(("--- at c%d ---\n" , (int) (css - d->ssets))); |
225 | co = GETCOLOR(cm, *cp); |
226 | FDEBUG(("char %c, color %ld\n" , (char) *cp, (long) co)); |
227 | ss = css->outs[co]; |
228 | if (ss == NULL) |
229 | { |
230 | ss = miss(v, d, css, co, cp + 1, start); |
231 | if (ss == NULL) |
232 | break; /* NOTE BREAK OUT */ |
233 | } |
234 | cp++; |
235 | ss->lastseen = cp; |
236 | css = ss; |
237 | if ((ss->flags & POSTSTATE) && cp >= realmin) |
238 | break; /* NOTE BREAK OUT */ |
239 | } |
240 | } |
241 | else |
242 | #endif |
243 | { |
244 | while (cp < realmax) |
245 | { |
246 | co = GETCOLOR(cm, *cp); |
247 | ss = css->outs[co]; |
248 | if (ss == NULL) |
249 | { |
250 | ss = miss(v, d, css, co, cp + 1, start); |
251 | if (ss == NULL) |
252 | break; /* NOTE BREAK OUT */ |
253 | } |
254 | cp++; |
255 | ss->lastseen = cp; |
256 | css = ss; |
257 | if ((ss->flags & POSTSTATE) && cp >= realmin) |
258 | break; /* NOTE BREAK OUT */ |
259 | } |
260 | } |
261 | |
262 | if (ss == NULL) |
263 | return NULL; |
264 | |
265 | if (coldp != NULL) /* report last no-progress state set, if any */ |
266 | *coldp = lastcold(v, d); |
267 | |
268 | if ((ss->flags & POSTSTATE) && cp > min) |
269 | { |
270 | assert(cp >= realmin); |
271 | cp--; |
272 | } |
273 | else if (cp == v->stop && max == v->stop) |
274 | { |
275 | co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1]; |
276 | FDEBUG(("color %ld\n" , (long) co)); |
277 | ss = miss(v, d, css, co, cp, start); |
278 | /* match might have ended at eol */ |
279 | if ((ss == NULL || !(ss->flags & POSTSTATE)) && hitstopp != NULL) |
280 | *hitstopp = 1; |
281 | } |
282 | |
283 | if (ss == NULL || !(ss->flags & POSTSTATE)) |
284 | return NULL; |
285 | |
286 | return cp; |
287 | } |
288 | |
289 | /* |
290 | * matchuntil - incremental matching engine |
291 | * |
292 | * This is meant for use with a search-style NFA (that is, the pattern is |
293 | * known to act as though it had a leading .*). We determine whether a |
294 | * match exists starting at v->start and ending at probe. Multiple calls |
295 | * require only O(N) time not O(N^2) so long as the probe values are |
296 | * nondecreasing. *lastcss and *lastcp must be initialized to NULL before |
297 | * starting a series of calls. |
298 | * |
299 | * Returns 1 if a match exists, 0 if not. |
300 | * Internal errors also return 0, with v->err set. |
301 | */ |
302 | static int |
303 | matchuntil(struct vars *v, |
304 | struct dfa *d, |
305 | chr *probe, /* we want to know if a match ends here */ |
306 | struct sset **lastcss, /* state storage across calls */ |
307 | chr **lastcp) /* state storage across calls */ |
308 | { |
309 | chr *cp = *lastcp; |
310 | color co; |
311 | struct sset *css = *lastcss; |
312 | struct sset *ss; |
313 | struct colormap *cm = d->cm; |
314 | |
315 | /* initialize and startup, or restart, if necessary */ |
316 | if (cp == NULL || cp > probe) |
317 | { |
318 | cp = v->start; |
319 | css = initialize(v, d, cp); |
320 | if (css == NULL) |
321 | return 0; |
322 | |
323 | FDEBUG((">>> startup >>>\n" )); |
324 | co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1]; |
325 | FDEBUG(("color %ld\n" , (long) co)); |
326 | |
327 | css = miss(v, d, css, co, cp, v->start); |
328 | if (css == NULL) |
329 | return 0; |
330 | css->lastseen = cp; |
331 | } |
332 | else if (css == NULL) |
333 | { |
334 | /* we previously found that no match is possible beyond *lastcp */ |
335 | return 0; |
336 | } |
337 | ss = css; |
338 | |
339 | /* |
340 | * This is the main text-scanning loop. It seems worth having two copies |
341 | * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG |
342 | * builds, when you're not actively tracing. |
343 | */ |
344 | #ifdef REG_DEBUG |
345 | if (v->eflags & REG_FTRACE) |
346 | { |
347 | while (cp < probe) |
348 | { |
349 | FDEBUG((">>> at c%d >>>\n" , (int) (css - d->ssets))); |
350 | co = GETCOLOR(cm, *cp); |
351 | FDEBUG(("char %c, color %ld\n" , (char) *cp, (long) co)); |
352 | ss = css->outs[co]; |
353 | if (ss == NULL) |
354 | { |
355 | ss = miss(v, d, css, co, cp + 1, v->start); |
356 | if (ss == NULL) |
357 | break; /* NOTE BREAK OUT */ |
358 | } |
359 | cp++; |
360 | ss->lastseen = cp; |
361 | css = ss; |
362 | } |
363 | } |
364 | else |
365 | #endif |
366 | { |
367 | while (cp < probe) |
368 | { |
369 | co = GETCOLOR(cm, *cp); |
370 | ss = css->outs[co]; |
371 | if (ss == NULL) |
372 | { |
373 | ss = miss(v, d, css, co, cp + 1, v->start); |
374 | if (ss == NULL) |
375 | break; /* NOTE BREAK OUT */ |
376 | } |
377 | cp++; |
378 | ss->lastseen = cp; |
379 | css = ss; |
380 | } |
381 | } |
382 | |
383 | *lastcss = ss; |
384 | *lastcp = cp; |
385 | |
386 | if (ss == NULL) |
387 | return 0; /* impossible match, or internal error */ |
388 | |
389 | /* We need to process one more chr, or the EOS symbol, to check match */ |
390 | if (cp < v->stop) |
391 | { |
392 | FDEBUG((">>> at c%d >>>\n" , (int) (css - d->ssets))); |
393 | co = GETCOLOR(cm, *cp); |
394 | FDEBUG(("char %c, color %ld\n" , (char) *cp, (long) co)); |
395 | ss = css->outs[co]; |
396 | if (ss == NULL) |
397 | ss = miss(v, d, css, co, cp + 1, v->start); |
398 | } |
399 | else |
400 | { |
401 | assert(cp == v->stop); |
402 | co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1]; |
403 | FDEBUG(("color %ld\n" , (long) co)); |
404 | ss = miss(v, d, css, co, cp, v->start); |
405 | } |
406 | |
407 | if (ss == NULL || !(ss->flags & POSTSTATE)) |
408 | return 0; |
409 | |
410 | return 1; |
411 | } |
412 | |
413 | /* |
414 | * lastcold - determine last point at which no progress had been made |
415 | */ |
416 | static chr * /* endpoint, or NULL */ |
417 | lastcold(struct vars *v, |
418 | struct dfa *d) |
419 | { |
420 | struct sset *ss; |
421 | chr *nopr; |
422 | int i; |
423 | |
424 | nopr = d->lastnopr; |
425 | if (nopr == NULL) |
426 | nopr = v->start; |
427 | for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--) |
428 | if ((ss->flags & NOPROGRESS) && nopr < ss->lastseen) |
429 | nopr = ss->lastseen; |
430 | return nopr; |
431 | } |
432 | |
433 | /* |
434 | * newdfa - set up a fresh DFA |
435 | */ |
436 | static struct dfa * |
437 | newdfa(struct vars *v, |
438 | struct cnfa *cnfa, |
439 | struct colormap *cm, |
440 | struct smalldfa *sml) /* preallocated space, may be NULL */ |
441 | { |
442 | struct dfa *d; |
443 | size_t nss = cnfa->nstates * 2; |
444 | int wordsper = (cnfa->nstates + UBITS - 1) / UBITS; |
445 | struct smalldfa *smallwas = sml; |
446 | |
447 | assert(cnfa != NULL && cnfa->nstates != 0); |
448 | |
449 | if (nss <= FEWSTATES && cnfa->ncolors <= FEWCOLORS) |
450 | { |
451 | assert(wordsper == 1); |
452 | if (sml == NULL) |
453 | { |
454 | sml = (struct smalldfa *) MALLOC(sizeof(struct smalldfa)); |
455 | if (sml == NULL) |
456 | { |
457 | ERR(REG_ESPACE); |
458 | return NULL; |
459 | } |
460 | } |
461 | d = &sml->dfa; |
462 | d->ssets = sml->ssets; |
463 | d->statesarea = sml->statesarea; |
464 | d->work = &d->statesarea[nss]; |
465 | d->outsarea = sml->outsarea; |
466 | d->incarea = sml->incarea; |
467 | d->cptsmalloced = 0; |
468 | d->mallocarea = (smallwas == NULL) ? (char *) sml : NULL; |
469 | } |
470 | else |
471 | { |
472 | d = (struct dfa *) MALLOC(sizeof(struct dfa)); |
473 | if (d == NULL) |
474 | { |
475 | ERR(REG_ESPACE); |
476 | return NULL; |
477 | } |
478 | d->ssets = (struct sset *) MALLOC(nss * sizeof(struct sset)); |
479 | d->statesarea = (unsigned *) MALLOC((nss + WORK) * wordsper * |
480 | sizeof(unsigned)); |
481 | d->work = &d->statesarea[nss * wordsper]; |
482 | d->outsarea = (struct sset **) MALLOC(nss * cnfa->ncolors * |
483 | sizeof(struct sset *)); |
484 | d->incarea = (struct arcp *) MALLOC(nss * cnfa->ncolors * |
485 | sizeof(struct arcp)); |
486 | d->cptsmalloced = 1; |
487 | d->mallocarea = (char *) d; |
488 | if (d->ssets == NULL || d->statesarea == NULL || |
489 | d->outsarea == NULL || d->incarea == NULL) |
490 | { |
491 | freedfa(d); |
492 | ERR(REG_ESPACE); |
493 | return NULL; |
494 | } |
495 | } |
496 | |
497 | d->nssets = (v->eflags & REG_SMALL) ? 7 : nss; |
498 | d->nssused = 0; |
499 | d->nstates = cnfa->nstates; |
500 | d->ncolors = cnfa->ncolors; |
501 | d->wordsper = wordsper; |
502 | d->cnfa = cnfa; |
503 | d->cm = cm; |
504 | d->lastpost = NULL; |
505 | d->lastnopr = NULL; |
506 | d->search = d->ssets; |
507 | |
508 | /* initialization of sset fields is done as needed */ |
509 | |
510 | return d; |
511 | } |
512 | |
513 | /* |
514 | * freedfa - free a DFA |
515 | */ |
516 | static void |
517 | freedfa(struct dfa *d) |
518 | { |
519 | if (d->cptsmalloced) |
520 | { |
521 | if (d->ssets != NULL) |
522 | FREE(d->ssets); |
523 | if (d->statesarea != NULL) |
524 | FREE(d->statesarea); |
525 | if (d->outsarea != NULL) |
526 | FREE(d->outsarea); |
527 | if (d->incarea != NULL) |
528 | FREE(d->incarea); |
529 | } |
530 | |
531 | if (d->mallocarea != NULL) |
532 | FREE(d->mallocarea); |
533 | } |
534 | |
/*
 * hash - construct a hash code for a bitvector
 *
 * The code is simply the XOR of the first n words of uv (0 when n is not
 * positive).  There are probably better ways, but they're more expensive.
 */
static unsigned
hash(unsigned *uv,
	 int n)
{
	unsigned	acc = 0;

	while (n-- > 0)
	{
		acc ^= *uv++;
	}

	return acc;
}
552 | |
553 | /* |
554 | * initialize - hand-craft a cache entry for startup, otherwise get ready |
555 | */ |
556 | static struct sset * |
557 | initialize(struct vars *v, |
558 | struct dfa *d, |
559 | chr *start) |
560 | { |
561 | struct sset *ss; |
562 | int i; |
563 | |
564 | /* is previous one still there? */ |
565 | if (d->nssused > 0 && (d->ssets[0].flags & STARTER)) |
566 | ss = &d->ssets[0]; |
567 | else |
568 | { /* no, must (re)build it */ |
569 | ss = getvacant(v, d, start, start); |
570 | if (ss == NULL) |
571 | return NULL; |
572 | for (i = 0; i < d->wordsper; i++) |
573 | ss->states[i] = 0; |
574 | BSET(ss->states, d->cnfa->pre); |
575 | ss->hash = HASH(ss->states, d->wordsper); |
576 | assert(d->cnfa->pre != d->cnfa->post); |
577 | ss->flags = STARTER | LOCKED | NOPROGRESS; |
578 | /* lastseen dealt with below */ |
579 | } |
580 | |
581 | for (i = 0; i < d->nssused; i++) |
582 | d->ssets[i].lastseen = NULL; |
583 | ss->lastseen = start; /* maybe untrue, but harmless */ |
584 | d->lastpost = NULL; |
585 | d->lastnopr = NULL; |
586 | return ss; |
587 | } |
588 | |
589 | /* |
590 | * miss - handle a stateset cache miss |
591 | * |
592 | * css is the current stateset, co is the color of the current input character, |
593 | * cp points to the character after that (which is where we may need to test |
594 | * LACONs). start does not affect matching behavior but is needed for pickss' |
595 | * heuristics about which stateset cache entry to replace. |
596 | * |
597 | * Ordinarily, returns the address of the next stateset (the one that is |
598 | * valid after consuming the input character). Returns NULL if no valid |
599 | * NFA states remain, ie we have a certain match failure. |
600 | * Internal errors also return NULL, with v->err set. |
601 | */ |
602 | static struct sset * |
603 | miss(struct vars *v, |
604 | struct dfa *d, |
605 | struct sset *css, |
606 | color co, |
607 | chr *cp, /* next chr */ |
608 | chr *start) /* where the attempt got started */ |
609 | { |
610 | struct cnfa *cnfa = d->cnfa; |
611 | int i; |
612 | unsigned h; |
613 | struct carc *ca; |
614 | struct sset *p; |
615 | int ispost; |
616 | int noprogress; |
617 | int gotstate; |
618 | int dolacons; |
619 | int sawlacons; |
620 | |
621 | /* for convenience, we can be called even if it might not be a miss */ |
622 | if (css->outs[co] != NULL) |
623 | { |
624 | FDEBUG(("hit\n" )); |
625 | return css->outs[co]; |
626 | } |
627 | FDEBUG(("miss\n" )); |
628 | |
629 | /* |
630 | * Checking for operation cancel in the inner text search loop seems |
631 | * unduly expensive. As a compromise, check during cache misses. |
632 | */ |
633 | if (CANCEL_REQUESTED(v->re)) |
634 | { |
635 | ERR(REG_CANCEL); |
636 | return NULL; |
637 | } |
638 | |
639 | /* |
640 | * What set of states would we end up in after consuming the co character? |
641 | * We first consider PLAIN arcs that consume the character, and then look |
642 | * to see what LACON arcs could be traversed after consuming it. |
643 | */ |
644 | for (i = 0; i < d->wordsper; i++) |
645 | d->work[i] = 0; /* build new stateset bitmap in d->work */ |
646 | ispost = 0; |
647 | noprogress = 1; |
648 | gotstate = 0; |
649 | for (i = 0; i < d->nstates; i++) |
650 | if (ISBSET(css->states, i)) |
651 | for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++) |
652 | if (ca->co == co) |
653 | { |
654 | BSET(d->work, ca->to); |
655 | gotstate = 1; |
656 | if (ca->to == cnfa->post) |
657 | ispost = 1; |
658 | if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS)) |
659 | noprogress = 0; |
660 | FDEBUG(("%d -> %d\n" , i, ca->to)); |
661 | } |
662 | if (!gotstate) |
663 | return NULL; /* character cannot reach any new state */ |
664 | dolacons = (cnfa->flags & HASLACONS); |
665 | sawlacons = 0; |
666 | /* outer loop handles transitive closure of reachable-by-LACON states */ |
667 | while (dolacons) |
668 | { |
669 | dolacons = 0; |
670 | for (i = 0; i < d->nstates; i++) |
671 | if (ISBSET(d->work, i)) |
672 | for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++) |
673 | { |
674 | if (ca->co < cnfa->ncolors) |
675 | continue; /* not a LACON arc */ |
676 | if (ISBSET(d->work, ca->to)) |
677 | continue; /* arc would be a no-op anyway */ |
678 | sawlacons = 1; /* this LACON affects our result */ |
679 | if (!lacon(v, cnfa, cp, ca->co)) |
680 | { |
681 | if (ISERR()) |
682 | return NULL; |
683 | continue; /* LACON arc cannot be traversed */ |
684 | } |
685 | if (ISERR()) |
686 | return NULL; |
687 | BSET(d->work, ca->to); |
688 | dolacons = 1; |
689 | if (ca->to == cnfa->post) |
690 | ispost = 1; |
691 | if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS)) |
692 | noprogress = 0; |
693 | FDEBUG(("%d :> %d\n" , i, ca->to)); |
694 | } |
695 | } |
696 | h = HASH(d->work, d->wordsper); |
697 | |
698 | /* Is this stateset already in the cache? */ |
699 | for (p = d->ssets, i = d->nssused; i > 0; p++, i--) |
700 | if (HIT(h, d->work, p, d->wordsper)) |
701 | { |
702 | FDEBUG(("cached c%d\n" , (int) (p - d->ssets))); |
703 | break; /* NOTE BREAK OUT */ |
704 | } |
705 | if (i == 0) |
706 | { /* nope, need a new cache entry */ |
707 | p = getvacant(v, d, cp, start); |
708 | if (p == NULL) |
709 | return NULL; |
710 | assert(p != css); |
711 | for (i = 0; i < d->wordsper; i++) |
712 | p->states[i] = d->work[i]; |
713 | p->hash = h; |
714 | p->flags = (ispost) ? POSTSTATE : 0; |
715 | if (noprogress) |
716 | p->flags |= NOPROGRESS; |
717 | /* lastseen to be dealt with by caller */ |
718 | } |
719 | |
720 | /* |
721 | * Link new stateset to old, unless a LACON affected the result, in which |
722 | * case we don't create the link. That forces future transitions across |
723 | * this same arc (same prior stateset and character color) to come through |
724 | * miss() again, so that we can recheck the LACON(s), which might or might |
725 | * not pass since context will be different. |
726 | */ |
727 | if (!sawlacons) |
728 | { |
729 | FDEBUG(("c%d[%d]->c%d\n" , |
730 | (int) (css - d->ssets), co, (int) (p - d->ssets))); |
731 | css->outs[co] = p; |
732 | css->inchain[co] = p->ins; |
733 | p->ins.ss = css; |
734 | p->ins.co = co; |
735 | } |
736 | return p; |
737 | } |
738 | |
739 | /* |
740 | * lacon - lookaround-constraint checker for miss() |
741 | */ |
742 | static int /* predicate: constraint satisfied? */ |
743 | lacon(struct vars *v, |
744 | struct cnfa *pcnfa, /* parent cnfa */ |
745 | chr *cp, |
746 | color co) /* "color" of the lookaround constraint */ |
747 | { |
748 | int n; |
749 | struct subre *sub; |
750 | struct dfa *d; |
751 | chr *end; |
752 | int satisfied; |
753 | |
754 | /* Since this is recursive, it could be driven to stack overflow */ |
755 | if (STACK_TOO_DEEP(v->re)) |
756 | { |
757 | ERR(REG_ETOOBIG); |
758 | return 0; |
759 | } |
760 | |
761 | n = co - pcnfa->ncolors; |
762 | assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL); |
763 | FDEBUG(("=== testing lacon %d\n" , n)); |
764 | sub = &v->g->lacons[n]; |
765 | d = getladfa(v, n); |
766 | if (d == NULL) |
767 | return 0; |
768 | if (LATYPE_IS_AHEAD(sub->subno)) |
769 | { |
770 | /* used to use longest() here, but shortest() could be much cheaper */ |
771 | end = shortest(v, d, cp, cp, v->stop, |
772 | (chr **) NULL, (int *) NULL); |
773 | satisfied = LATYPE_IS_POS(sub->subno) ? (end != NULL) : (end == NULL); |
774 | } |
775 | else |
776 | { |
777 | /* |
778 | * To avoid doing O(N^2) work when repeatedly testing a lookbehind |
779 | * constraint in an N-character string, we use matchuntil() which can |
780 | * cache the DFA state across calls. We only need to restart if the |
781 | * probe point decreases, which is not common. The NFA we're using is |
782 | * a search NFA, so it doesn't mind scanning over stuff before the |
783 | * nominal match. |
784 | */ |
785 | satisfied = matchuntil(v, d, cp, &v->lblastcss[n], &v->lblastcp[n]); |
786 | if (!LATYPE_IS_POS(sub->subno)) |
787 | satisfied = !satisfied; |
788 | } |
789 | FDEBUG(("=== lacon %d satisfied %d\n" , n, satisfied)); |
790 | return satisfied; |
791 | } |
792 | |
793 | /* |
794 | * getvacant - get a vacant state set |
795 | * |
796 | * This routine clears out the inarcs and outarcs, but does not otherwise |
797 | * clear the innards of the state set -- that's up to the caller. |
798 | */ |
799 | static struct sset * |
800 | getvacant(struct vars *v, |
801 | struct dfa *d, |
802 | chr *cp, |
803 | chr *start) |
804 | { |
805 | int i; |
806 | struct sset *ss; |
807 | struct sset *p; |
808 | struct arcp ap; |
809 | color co; |
810 | |
811 | ss = pickss(v, d, cp, start); |
812 | if (ss == NULL) |
813 | return NULL; |
814 | assert(!(ss->flags & LOCKED)); |
815 | |
816 | /* clear out its inarcs, including self-referential ones */ |
817 | ap = ss->ins; |
818 | while ((p = ap.ss) != NULL) |
819 | { |
820 | co = ap.co; |
821 | FDEBUG(("zapping c%d's %ld outarc\n" , (int) (p - d->ssets), (long) co)); |
822 | p->outs[co] = NULL; |
823 | ap = p->inchain[co]; |
824 | p->inchain[co].ss = NULL; /* paranoia */ |
825 | } |
826 | ss->ins.ss = NULL; |
827 | |
828 | /* take it off the inarc chains of the ssets reached by its outarcs */ |
829 | for (i = 0; i < d->ncolors; i++) |
830 | { |
831 | p = ss->outs[i]; |
832 | assert(p != ss); /* not self-referential */ |
833 | if (p == NULL) |
834 | continue; /* NOTE CONTINUE */ |
835 | FDEBUG(("del outarc %d from c%d's in chn\n" , i, (int) (p - d->ssets))); |
836 | if (p->ins.ss == ss && p->ins.co == i) |
837 | p->ins = ss->inchain[i]; |
838 | else |
839 | { |
840 | struct arcp lastap = {NULL, 0}; |
841 | |
842 | assert(p->ins.ss != NULL); |
843 | for (ap = p->ins; ap.ss != NULL && |
844 | !(ap.ss == ss && ap.co == i); |
845 | ap = ap.ss->inchain[ap.co]) |
846 | lastap = ap; |
847 | assert(ap.ss != NULL); |
848 | lastap.ss->inchain[lastap.co] = ss->inchain[i]; |
849 | } |
850 | ss->outs[i] = NULL; |
851 | ss->inchain[i].ss = NULL; |
852 | } |
853 | |
854 | /* if ss was a success state, may need to remember location */ |
855 | if ((ss->flags & POSTSTATE) && ss->lastseen != d->lastpost && |
856 | (d->lastpost == NULL || d->lastpost < ss->lastseen)) |
857 | d->lastpost = ss->lastseen; |
858 | |
859 | /* likewise for a no-progress state */ |
860 | if ((ss->flags & NOPROGRESS) && ss->lastseen != d->lastnopr && |
861 | (d->lastnopr == NULL || d->lastnopr < ss->lastseen)) |
862 | d->lastnopr = ss->lastseen; |
863 | |
864 | return ss; |
865 | } |
866 | |
867 | /* |
868 | * pickss - pick the next stateset to be used |
869 | */ |
870 | static struct sset * |
871 | pickss(struct vars *v, |
872 | struct dfa *d, |
873 | chr *cp, |
874 | chr *start) |
875 | { |
876 | int i; |
877 | struct sset *ss; |
878 | struct sset *end; |
879 | chr *ancient; |
880 | |
881 | /* shortcut for cases where cache isn't full */ |
882 | if (d->nssused < d->nssets) |
883 | { |
884 | i = d->nssused; |
885 | d->nssused++; |
886 | ss = &d->ssets[i]; |
887 | FDEBUG(("new c%d\n" , i)); |
888 | /* set up innards */ |
889 | ss->states = &d->statesarea[i * d->wordsper]; |
890 | ss->flags = 0; |
891 | ss->ins.ss = NULL; |
892 | ss->ins.co = WHITE; /* give it some value */ |
893 | ss->outs = &d->outsarea[i * d->ncolors]; |
894 | ss->inchain = &d->incarea[i * d->ncolors]; |
895 | for (i = 0; i < d->ncolors; i++) |
896 | { |
897 | ss->outs[i] = NULL; |
898 | ss->inchain[i].ss = NULL; |
899 | } |
900 | return ss; |
901 | } |
902 | |
903 | /* look for oldest, or old enough anyway */ |
904 | if (cp - start > d->nssets * 2 / 3) /* oldest 33% are expendable */ |
905 | ancient = cp - d->nssets * 2 / 3; |
906 | else |
907 | ancient = start; |
908 | for (ss = d->search, end = &d->ssets[d->nssets]; ss < end; ss++) |
909 | if ((ss->lastseen == NULL || ss->lastseen < ancient) && |
910 | !(ss->flags & LOCKED)) |
911 | { |
912 | d->search = ss + 1; |
913 | FDEBUG(("replacing c%d\n" , (int) (ss - d->ssets))); |
914 | return ss; |
915 | } |
916 | for (ss = d->ssets, end = d->search; ss < end; ss++) |
917 | if ((ss->lastseen == NULL || ss->lastseen < ancient) && |
918 | !(ss->flags & LOCKED)) |
919 | { |
920 | d->search = ss + 1; |
921 | FDEBUG(("replacing c%d\n" , (int) (ss - d->ssets))); |
922 | return ss; |
923 | } |
924 | |
925 | /* nobody's old enough?!? -- something's really wrong */ |
926 | FDEBUG(("cannot find victim to replace!\n" )); |
927 | ERR(REG_ASSERT); |
928 | return NULL; |
929 | } |
930 | |