1/*
2 * re_*exec and friends - match REs
3 *
4 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
5 *
6 * Development of this software was funded, in part, by Cray Research Inc.,
7 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
8 * Corporation, none of whom are responsible for the results. The author
9 * thanks all of them.
10 *
11 * Redistribution and use in source and binary forms -- with or without
12 * modification -- are permitted for any purpose, provided that
13 * redistributions in source form retain this entire copyright notice and
14 * indicate the origin and nature of any modifications.
15 *
16 * I'd appreciate being given credit for this package in the documentation
17 * of software which uses it, but that is not a requirement.
18 *
19 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
20 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
21 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
22 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 *
30 * src/backend/regex/regexec.c
31 *
32 */
33
34#include "regex/regguts.h"
35
36
37
38/* lazy-DFA representation */
39struct arcp
40{ /* "pointer" to an outarc */
41 struct sset *ss;
42 color co;
43};
44
45struct sset
46{ /* state set */
47 unsigned *states; /* pointer to bitvector */
48 unsigned hash; /* hash of bitvector */
49#define HASH(bv, nw) (((nw) == 1) ? *(bv) : hash(bv, nw))
50#define HIT(h,bv,ss,nw) ((ss)->hash == (h) && ((nw) == 1 || \
51 memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0))
52 int flags;
53#define STARTER 01 /* the initial state set */
54#define POSTSTATE 02 /* includes the goal state */
55#define LOCKED 04 /* locked in cache */
56#define NOPROGRESS 010 /* zero-progress state set */
57 struct arcp ins; /* chain of inarcs pointing here */
58 chr *lastseen; /* last entered on arrival here */
59 struct sset **outs; /* outarc vector indexed by color */
60 struct arcp *inchain; /* chain-pointer vector for outarcs */
61};
62
63struct dfa
64{
65 int nssets; /* size of cache */
66 int nssused; /* how many entries occupied yet */
67 int nstates; /* number of states */
68 int ncolors; /* length of outarc and inchain vectors */
69 int wordsper; /* length of state-set bitvectors */
70 struct sset *ssets; /* state-set cache */
71 unsigned *statesarea; /* bitvector storage */
72 unsigned *work; /* pointer to work area within statesarea */
73 struct sset **outsarea; /* outarc-vector storage */
74 struct arcp *incarea; /* inchain storage */
75 struct cnfa *cnfa;
76 struct colormap *cm;
77 chr *lastpost; /* location of last cache-flushed success */
78 chr *lastnopr; /* location of last cache-flushed NOPROGRESS */
79 struct sset *search; /* replacement-search-pointer memory */
80 int cptsmalloced; /* were the areas individually malloced? */
81 char *mallocarea; /* self, or master malloced area, or NULL */
82};
83
84#define WORK 1 /* number of work bitvectors needed */
85
86/* setup for non-malloc allocation for small cases */
87#define FEWSTATES 20 /* must be less than UBITS */
88#define FEWCOLORS 15
89struct smalldfa
90{
91 struct dfa dfa;
92 struct sset ssets[FEWSTATES * 2];
93 unsigned statesarea[FEWSTATES * 2 + WORK];
94 struct sset *outsarea[FEWSTATES * 2 * FEWCOLORS];
95 struct arcp incarea[FEWSTATES * 2 * FEWCOLORS];
96};
97
98#define DOMALLOC ((struct smalldfa *)NULL) /* force malloc */
99
100
101
102/* internal variables, bundled for easy passing around */
103struct vars
104{
105 regex_t *re;
106 struct guts *g;
107 int eflags; /* copies of arguments */
108 size_t nmatch;
109 regmatch_t *pmatch;
110 rm_detail_t *details;
111 chr *start; /* start of string */
112 chr *search_start; /* search start of string */
113 chr *stop; /* just past end of string */
114 int err; /* error code if any (0 none) */
115 struct dfa **subdfas; /* per-tree-subre DFAs */
116 struct dfa **ladfas; /* per-lacon-subre DFAs */
117 struct sset **lblastcss; /* per-lacon-subre lookbehind restart data */
118 chr **lblastcp; /* per-lacon-subre lookbehind restart data */
119 struct smalldfa dfa1;
120 struct smalldfa dfa2;
121};
122
123#define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */
124#define ISERR() VISERR(v)
125#define VERR(vv,e) ((vv)->err = ((vv)->err ? (vv)->err : (e)))
126#define ERR(e) VERR(v, e) /* record an error */
127#define NOERR() {if (ISERR()) return v->err;} /* if error seen, return it */
128#define OFF(p) ((p) - v->start)
129#define LOFF(p) ((long)OFF(p))
130
131
132
133/*
134 * forward declarations
135 */
136/* === regexec.c === */
137static struct dfa *getsubdfa(struct vars *, struct subre *);
138static struct dfa *getladfa(struct vars *, int);
139static int find(struct vars *, struct cnfa *, struct colormap *);
140static int cfind(struct vars *, struct cnfa *, struct colormap *);
141static int cfindloop(struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **);
142static void zapallsubs(regmatch_t *, size_t);
143static void zaptreesubs(struct vars *, struct subre *);
144static void subset(struct vars *, struct subre *, chr *, chr *);
145static int cdissect(struct vars *, struct subre *, chr *, chr *);
146static int ccondissect(struct vars *, struct subre *, chr *, chr *);
147static int crevcondissect(struct vars *, struct subre *, chr *, chr *);
148static int cbrdissect(struct vars *, struct subre *, chr *, chr *);
149static int caltdissect(struct vars *, struct subre *, chr *, chr *);
150static int citerdissect(struct vars *, struct subre *, chr *, chr *);
151static int creviterdissect(struct vars *, struct subre *, chr *, chr *);
152
153/* === rege_dfa.c === */
154static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *);
155static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *);
156static int matchuntil(struct vars *, struct dfa *, chr *, struct sset **, chr **);
157static chr *lastcold(struct vars *, struct dfa *);
158static struct dfa *newdfa(struct vars *, struct cnfa *, struct colormap *, struct smalldfa *);
159static void freedfa(struct dfa *);
160static unsigned hash(unsigned *, int);
161static struct sset *initialize(struct vars *, struct dfa *, chr *);
162static struct sset *miss(struct vars *, struct dfa *, struct sset *, color, chr *, chr *);
163static int lacon(struct vars *, struct cnfa *, chr *, color);
164static struct sset *getvacant(struct vars *, struct dfa *, chr *, chr *);
165static struct sset *pickss(struct vars *, struct dfa *, chr *, chr *);
166
167
168/*
169 * pg_regexec - match regular expression
170 */
171int
172pg_regexec(regex_t *re,
173 const chr *string,
174 size_t len,
175 size_t search_start,
176 rm_detail_t *details,
177 size_t nmatch,
178 regmatch_t pmatch[],
179 int flags)
180{
181 struct vars var;
182 register struct vars *v = &var;
183 int st;
184 size_t n;
185 size_t i;
186 int backref;
187
188#define LOCALMAT 20
189 regmatch_t mat[LOCALMAT];
190
191#define LOCALDFAS 40
192 struct dfa *subdfas[LOCALDFAS];
193
194 /* sanity checks */
195 if (re == NULL || string == NULL || re->re_magic != REMAGIC)
196 return REG_INVARG;
197 if (re->re_csize != sizeof(chr))
198 return REG_MIXED;
199
200 /* Initialize locale-dependent support */
201 pg_set_regex_collation(re->re_collation);
202
203 /* setup */
204 v->re = re;
205 v->g = (struct guts *) re->re_guts;
206 if ((v->g->cflags & REG_EXPECT) && details == NULL)
207 return REG_INVARG;
208 if (v->g->info & REG_UIMPOSSIBLE)
209 return REG_NOMATCH;
210 backref = (v->g->info & REG_UBACKREF) ? 1 : 0;
211 v->eflags = flags;
212 if (v->g->cflags & REG_NOSUB)
213 nmatch = 0; /* override client */
214 v->nmatch = nmatch;
215 if (backref)
216 {
217 /* need work area */
218 if (v->g->nsub + 1 <= LOCALMAT)
219 v->pmatch = mat;
220 else
221 v->pmatch = (regmatch_t *) MALLOC((v->g->nsub + 1) *
222 sizeof(regmatch_t));
223 if (v->pmatch == NULL)
224 return REG_ESPACE;
225 v->nmatch = v->g->nsub + 1;
226 }
227 else
228 v->pmatch = pmatch;
229 v->details = details;
230 v->start = (chr *) string;
231 v->search_start = (chr *) string + search_start;
232 v->stop = (chr *) string + len;
233 v->err = 0;
234 v->subdfas = NULL;
235 v->ladfas = NULL;
236 v->lblastcss = NULL;
237 v->lblastcp = NULL;
238 /* below this point, "goto cleanup" will behave sanely */
239
240 assert(v->g->ntree >= 0);
241 n = (size_t) v->g->ntree;
242 if (n <= LOCALDFAS)
243 v->subdfas = subdfas;
244 else
245 {
246 v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
247 if (v->subdfas == NULL)
248 {
249 st = REG_ESPACE;
250 goto cleanup;
251 }
252 }
253 for (i = 0; i < n; i++)
254 v->subdfas[i] = NULL;
255
256 assert(v->g->nlacons >= 0);
257 n = (size_t) v->g->nlacons;
258 if (n > 0)
259 {
260 v->ladfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
261 if (v->ladfas == NULL)
262 {
263 st = REG_ESPACE;
264 goto cleanup;
265 }
266 for (i = 0; i < n; i++)
267 v->ladfas[i] = NULL;
268 v->lblastcss = (struct sset **) MALLOC(n * sizeof(struct sset *));
269 v->lblastcp = (chr **) MALLOC(n * sizeof(chr *));
270 if (v->lblastcss == NULL || v->lblastcp == NULL)
271 {
272 st = REG_ESPACE;
273 goto cleanup;
274 }
275 for (i = 0; i < n; i++)
276 {
277 v->lblastcss[i] = NULL;
278 v->lblastcp[i] = NULL;
279 }
280 }
281
282 /* do it */
283 assert(v->g->tree != NULL);
284 if (backref)
285 st = cfind(v, &v->g->tree->cnfa, &v->g->cmap);
286 else
287 st = find(v, &v->g->tree->cnfa, &v->g->cmap);
288
289 /* copy (portion of) match vector over if necessary */
290 if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0)
291 {
292 zapallsubs(pmatch, nmatch);
293 n = (nmatch < v->nmatch) ? nmatch : v->nmatch;
294 memcpy(VS(pmatch), VS(v->pmatch), n * sizeof(regmatch_t));
295 }
296
297 /* clean up */
298cleanup:
299 if (v->pmatch != pmatch && v->pmatch != mat)
300 FREE(v->pmatch);
301 if (v->subdfas != NULL)
302 {
303 n = (size_t) v->g->ntree;
304 for (i = 0; i < n; i++)
305 {
306 if (v->subdfas[i] != NULL)
307 freedfa(v->subdfas[i]);
308 }
309 if (v->subdfas != subdfas)
310 FREE(v->subdfas);
311 }
312 if (v->ladfas != NULL)
313 {
314 n = (size_t) v->g->nlacons;
315 for (i = 0; i < n; i++)
316 {
317 if (v->ladfas[i] != NULL)
318 freedfa(v->ladfas[i]);
319 }
320 FREE(v->ladfas);
321 }
322 if (v->lblastcss != NULL)
323 FREE(v->lblastcss);
324 if (v->lblastcp != NULL)
325 FREE(v->lblastcp);
326
327 return st;
328}
329
330/*
331 * getsubdfa - create or re-fetch the DFA for a tree subre node
332 *
333 * We only need to create the DFA once per overall regex execution.
334 * The DFA will be freed by the cleanup step in pg_regexec().
335 */
336static struct dfa *
337getsubdfa(struct vars *v,
338 struct subre *t)
339{
340 if (v->subdfas[t->id] == NULL)
341 {
342 v->subdfas[t->id] = newdfa(v, &t->cnfa, &v->g->cmap, DOMALLOC);
343 if (ISERR())
344 return NULL;
345 }
346 return v->subdfas[t->id];
347}
348
349/*
350 * getladfa - create or re-fetch the DFA for a LACON subre node
351 *
352 * Same as above, but for LACONs.
353 */
354static struct dfa *
355getladfa(struct vars *v,
356 int n)
357{
358 assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
359
360 if (v->ladfas[n] == NULL)
361 {
362 struct subre *sub = &v->g->lacons[n];
363
364 v->ladfas[n] = newdfa(v, &sub->cnfa, &v->g->cmap, DOMALLOC);
365 if (ISERR())
366 return NULL;
367 }
368 return v->ladfas[n];
369}
370
371/*
372 * find - find a match for the main NFA (no-complications case)
373 */
374static int
375find(struct vars *v,
376 struct cnfa *cnfa,
377 struct colormap *cm)
378{
379 struct dfa *s;
380 struct dfa *d;
381 chr *begin;
382 chr *end = NULL;
383 chr *cold;
384 chr *open; /* open and close of range of possible starts */
385 chr *close;
386 int hitend;
387 int shorter = (v->g->tree->flags & SHORTER) ? 1 : 0;
388
389 /* first, a shot with the search RE */
390 s = newdfa(v, &v->g->search, cm, &v->dfa1);
391 assert(!(ISERR() && s != NULL));
392 NOERR();
393 MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
394 cold = NULL;
395 close = shortest(v, s, v->search_start, v->search_start, v->stop,
396 &cold, (int *) NULL);
397 freedfa(s);
398 NOERR();
399 if (v->g->cflags & REG_EXPECT)
400 {
401 assert(v->details != NULL);
402 if (cold != NULL)
403 v->details->rm_extend.rm_so = OFF(cold);
404 else
405 v->details->rm_extend.rm_so = OFF(v->stop);
406 v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
407 }
408 if (close == NULL) /* not found */
409 return REG_NOMATCH;
410 if (v->nmatch == 0) /* found, don't need exact location */
411 return REG_OKAY;
412
413 /* find starting point and match */
414 assert(cold != NULL);
415 open = cold;
416 cold = NULL;
417 MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close)));
418 d = newdfa(v, cnfa, cm, &v->dfa1);
419 assert(!(ISERR() && d != NULL));
420 NOERR();
421 for (begin = open; begin <= close; begin++)
422 {
423 MDEBUG(("\nfind trying at %ld\n", LOFF(begin)));
424 if (shorter)
425 end = shortest(v, d, begin, begin, v->stop,
426 (chr **) NULL, &hitend);
427 else
428 end = longest(v, d, begin, v->stop, &hitend);
429 if (ISERR())
430 {
431 freedfa(d);
432 return v->err;
433 }
434 if (hitend && cold == NULL)
435 cold = begin;
436 if (end != NULL)
437 break; /* NOTE BREAK OUT */
438 }
439 assert(end != NULL); /* search RE succeeded so loop should */
440 freedfa(d);
441
442 /* and pin down details */
443 assert(v->nmatch > 0);
444 v->pmatch[0].rm_so = OFF(begin);
445 v->pmatch[0].rm_eo = OFF(end);
446 if (v->g->cflags & REG_EXPECT)
447 {
448 if (cold != NULL)
449 v->details->rm_extend.rm_so = OFF(cold);
450 else
451 v->details->rm_extend.rm_so = OFF(v->stop);
452 v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
453 }
454 if (v->nmatch == 1) /* no need for submatches */
455 return REG_OKAY;
456
457 /* find submatches */
458 zapallsubs(v->pmatch, v->nmatch);
459 return cdissect(v, v->g->tree, begin, end);
460}
461
462/*
463 * cfind - find a match for the main NFA (with complications)
464 */
465static int
466cfind(struct vars *v,
467 struct cnfa *cnfa,
468 struct colormap *cm)
469{
470 struct dfa *s;
471 struct dfa *d;
472 chr *cold;
473 int ret;
474
475 s = newdfa(v, &v->g->search, cm, &v->dfa1);
476 NOERR();
477 d = newdfa(v, cnfa, cm, &v->dfa2);
478 if (ISERR())
479 {
480 assert(d == NULL);
481 freedfa(s);
482 return v->err;
483 }
484
485 ret = cfindloop(v, cnfa, cm, d, s, &cold);
486
487 freedfa(d);
488 freedfa(s);
489 NOERR();
490 if (v->g->cflags & REG_EXPECT)
491 {
492 assert(v->details != NULL);
493 if (cold != NULL)
494 v->details->rm_extend.rm_so = OFF(cold);
495 else
496 v->details->rm_extend.rm_so = OFF(v->stop);
497 v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
498 }
499 return ret;
500}
501
502/*
503 * cfindloop - the heart of cfind
504 */
505static int
506cfindloop(struct vars *v,
507 struct cnfa *cnfa,
508 struct colormap *cm,
509 struct dfa *d,
510 struct dfa *s,
511 chr **coldp) /* where to put coldstart pointer */
512{
513 chr *begin;
514 chr *end;
515 chr *cold;
516 chr *open; /* open and close of range of possible starts */
517 chr *close;
518 chr *estart;
519 chr *estop;
520 int er;
521 int shorter = v->g->tree->flags & SHORTER;
522 int hitend;
523
524 assert(d != NULL && s != NULL);
525 cold = NULL;
526 close = v->search_start;
527 do
528 {
529 /* Search with the search RE for match range at/beyond "close" */
530 MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
531 close = shortest(v, s, close, close, v->stop, &cold, (int *) NULL);
532 if (ISERR())
533 {
534 *coldp = cold;
535 return v->err;
536 }
537 if (close == NULL)
538 break; /* no more possible match anywhere */
539 assert(cold != NULL);
540 open = cold;
541 cold = NULL;
542 /* Search for matches starting between "open" and "close" inclusive */
543 MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close)));
544 for (begin = open; begin <= close; begin++)
545 {
546 MDEBUG(("\ncfind trying at %ld\n", LOFF(begin)));
547 estart = begin;
548 estop = v->stop;
549 for (;;)
550 {
551 /* Here we use the top node's detailed RE */
552 if (shorter)
553 end = shortest(v, d, begin, estart,
554 estop, (chr **) NULL, &hitend);
555 else
556 end = longest(v, d, begin, estop,
557 &hitend);
558 if (ISERR())
559 {
560 *coldp = cold;
561 return v->err;
562 }
563 if (hitend && cold == NULL)
564 cold = begin;
565 if (end == NULL)
566 break; /* no match with this begin point, try next */
567 MDEBUG(("tentative end %ld\n", LOFF(end)));
568 /* Dissect the potential match to see if it really matches */
569 zapallsubs(v->pmatch, v->nmatch);
570 er = cdissect(v, v->g->tree, begin, end);
571 if (er == REG_OKAY)
572 {
573 if (v->nmatch > 0)
574 {
575 v->pmatch[0].rm_so = OFF(begin);
576 v->pmatch[0].rm_eo = OFF(end);
577 }
578 *coldp = cold;
579 return REG_OKAY;
580 }
581 if (er != REG_NOMATCH)
582 {
583 ERR(er);
584 *coldp = cold;
585 return er;
586 }
587 /* Try next longer/shorter match with same begin point */
588 if (shorter)
589 {
590 if (end == estop)
591 break; /* no more, so try next begin point */
592 estart = end + 1;
593 }
594 else
595 {
596 if (end == begin)
597 break; /* no more, so try next begin point */
598 estop = end - 1;
599 }
600 } /* end loop over endpoint positions */
601 } /* end loop over beginning positions */
602
603 /*
604 * If we get here, there is no possible match starting at or before
605 * "close", so consider matches beyond that. We'll do a fresh search
606 * with the search RE to find a new promising match range.
607 */
608 close++;
609 } while (close < v->stop);
610
611 *coldp = cold;
612 return REG_NOMATCH;
613}
614
615/*
616 * zapallsubs - initialize all subexpression matches to "no match"
617 */
618static void
619zapallsubs(regmatch_t *p,
620 size_t n)
621{
622 size_t i;
623
624 for (i = n - 1; i > 0; i--)
625 {
626 p[i].rm_so = -1;
627 p[i].rm_eo = -1;
628 }
629}
630
631/*
632 * zaptreesubs - initialize subexpressions within subtree to "no match"
633 */
634static void
635zaptreesubs(struct vars *v,
636 struct subre *t)
637{
638 if (t->op == '(')
639 {
640 int n = t->subno;
641
642 assert(n > 0);
643 if ((size_t) n < v->nmatch)
644 {
645 v->pmatch[n].rm_so = -1;
646 v->pmatch[n].rm_eo = -1;
647 }
648 }
649
650 if (t->left != NULL)
651 zaptreesubs(v, t->left);
652 if (t->right != NULL)
653 zaptreesubs(v, t->right);
654}
655
656/*
657 * subset - set subexpression match data for a successful subre
658 */
659static void
660subset(struct vars *v,
661 struct subre *sub,
662 chr *begin,
663 chr *end)
664{
665 int n = sub->subno;
666
667 assert(n > 0);
668 if ((size_t) n >= v->nmatch)
669 return;
670
671 MDEBUG(("setting %d\n", n));
672 v->pmatch[n].rm_so = OFF(begin);
673 v->pmatch[n].rm_eo = OFF(end);
674}
675
676/*
677 * cdissect - check backrefs and determine subexpression matches
678 *
679 * cdissect recursively processes a subre tree to check matching of backrefs
680 * and/or identify submatch boundaries for capture nodes. The proposed match
681 * runs from "begin" to "end" (not including "end"), and we are basically
682 * "dissecting" it to see where the submatches are.
683 *
684 * Before calling any level of cdissect, the caller must have run the node's
685 * DFA and found that the proposed substring satisfies the DFA. (We make
686 * the caller do that because in concatenation and iteration nodes, it's
687 * much faster to check all the substrings against the child DFAs before we
688 * recurse.) Also, caller must have cleared subexpression match data via
689 * zaptreesubs (or zapallsubs at the top level).
690 */
691static int /* regexec return code */
692cdissect(struct vars *v,
693 struct subre *t,
694 chr *begin, /* beginning of relevant substring */
695 chr *end) /* end of same */
696{
697 int er;
698
699 assert(t != NULL);
700 MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op));
701
702 /* handy place to check for operation cancel */
703 if (CANCEL_REQUESTED(v->re))
704 return REG_CANCEL;
705 /* ... and stack overrun */
706 if (STACK_TOO_DEEP(v->re))
707 return REG_ETOOBIG;
708
709 switch (t->op)
710 {
711 case '=': /* terminal node */
712 assert(t->left == NULL && t->right == NULL);
713 er = REG_OKAY; /* no action, parent did the work */
714 break;
715 case 'b': /* back reference */
716 assert(t->left == NULL && t->right == NULL);
717 er = cbrdissect(v, t, begin, end);
718 break;
719 case '.': /* concatenation */
720 assert(t->left != NULL && t->right != NULL);
721 if (t->left->flags & SHORTER) /* reverse scan */
722 er = crevcondissect(v, t, begin, end);
723 else
724 er = ccondissect(v, t, begin, end);
725 break;
726 case '|': /* alternation */
727 assert(t->left != NULL);
728 er = caltdissect(v, t, begin, end);
729 break;
730 case '*': /* iteration */
731 assert(t->left != NULL);
732 if (t->left->flags & SHORTER) /* reverse scan */
733 er = creviterdissect(v, t, begin, end);
734 else
735 er = citerdissect(v, t, begin, end);
736 break;
737 case '(': /* capturing */
738 assert(t->left != NULL && t->right == NULL);
739 assert(t->subno > 0);
740 er = cdissect(v, t->left, begin, end);
741 if (er == REG_OKAY)
742 subset(v, t, begin, end);
743 break;
744 default:
745 er = REG_ASSERT;
746 break;
747 }
748
749 /*
750 * We should never have a match failure unless backrefs lurk below;
751 * otherwise, either caller failed to check the DFA, or there's some
752 * inconsistency between the DFA and the node's innards.
753 */
754 assert(er != REG_NOMATCH || (t->flags & BACKR));
755
756 return er;
757}
758
759/*
760 * ccondissect - dissect match for concatenation node
761 */
762static int /* regexec return code */
763ccondissect(struct vars *v,
764 struct subre *t,
765 chr *begin, /* beginning of relevant substring */
766 chr *end) /* end of same */
767{
768 struct dfa *d;
769 struct dfa *d2;
770 chr *mid;
771 int er;
772
773 assert(t->op == '.');
774 assert(t->left != NULL && t->left->cnfa.nstates > 0);
775 assert(t->right != NULL && t->right->cnfa.nstates > 0);
776 assert(!(t->left->flags & SHORTER));
777
778 d = getsubdfa(v, t->left);
779 NOERR();
780 d2 = getsubdfa(v, t->right);
781 NOERR();
782 MDEBUG(("cconcat %d\n", t->id));
783
784 /* pick a tentative midpoint */
785 mid = longest(v, d, begin, end, (int *) NULL);
786 NOERR();
787 if (mid == NULL)
788 return REG_NOMATCH;
789 MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
790
791 /* iterate until satisfaction or failure */
792 for (;;)
793 {
794 /* try this midpoint on for size */
795 if (longest(v, d2, mid, end, (int *) NULL) == end)
796 {
797 er = cdissect(v, t->left, begin, mid);
798 if (er == REG_OKAY)
799 {
800 er = cdissect(v, t->right, mid, end);
801 if (er == REG_OKAY)
802 {
803 /* satisfaction */
804 MDEBUG(("successful\n"));
805 return REG_OKAY;
806 }
807 }
808 if (er != REG_NOMATCH)
809 return er;
810 }
811 NOERR();
812
813 /* that midpoint didn't work, find a new one */
814 if (mid == begin)
815 {
816 /* all possibilities exhausted */
817 MDEBUG(("%d no midpoint\n", t->id));
818 return REG_NOMATCH;
819 }
820 mid = longest(v, d, begin, mid - 1, (int *) NULL);
821 NOERR();
822 if (mid == NULL)
823 {
824 /* failed to find a new one */
825 MDEBUG(("%d failed midpoint\n", t->id));
826 return REG_NOMATCH;
827 }
828 MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
829 zaptreesubs(v, t->left);
830 zaptreesubs(v, t->right);
831 }
832
833 /* can't get here */
834 return REG_ASSERT;
835}
836
837/*
838 * crevcondissect - dissect match for concatenation node, shortest-first
839 */
840static int /* regexec return code */
841crevcondissect(struct vars *v,
842 struct subre *t,
843 chr *begin, /* beginning of relevant substring */
844 chr *end) /* end of same */
845{
846 struct dfa *d;
847 struct dfa *d2;
848 chr *mid;
849 int er;
850
851 assert(t->op == '.');
852 assert(t->left != NULL && t->left->cnfa.nstates > 0);
853 assert(t->right != NULL && t->right->cnfa.nstates > 0);
854 assert(t->left->flags & SHORTER);
855
856 d = getsubdfa(v, t->left);
857 NOERR();
858 d2 = getsubdfa(v, t->right);
859 NOERR();
860 MDEBUG(("crevcon %d\n", t->id));
861
862 /* pick a tentative midpoint */
863 mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL);
864 NOERR();
865 if (mid == NULL)
866 return REG_NOMATCH;
867 MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
868
869 /* iterate until satisfaction or failure */
870 for (;;)
871 {
872 /* try this midpoint on for size */
873 if (longest(v, d2, mid, end, (int *) NULL) == end)
874 {
875 er = cdissect(v, t->left, begin, mid);
876 if (er == REG_OKAY)
877 {
878 er = cdissect(v, t->right, mid, end);
879 if (er == REG_OKAY)
880 {
881 /* satisfaction */
882 MDEBUG(("successful\n"));
883 return REG_OKAY;
884 }
885 }
886 if (er != REG_NOMATCH)
887 return er;
888 }
889 NOERR();
890
891 /* that midpoint didn't work, find a new one */
892 if (mid == end)
893 {
894 /* all possibilities exhausted */
895 MDEBUG(("%d no midpoint\n", t->id));
896 return REG_NOMATCH;
897 }
898 mid = shortest(v, d, begin, mid + 1, end, (chr **) NULL, (int *) NULL);
899 NOERR();
900 if (mid == NULL)
901 {
902 /* failed to find a new one */
903 MDEBUG(("%d failed midpoint\n", t->id));
904 return REG_NOMATCH;
905 }
906 MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
907 zaptreesubs(v, t->left);
908 zaptreesubs(v, t->right);
909 }
910
911 /* can't get here */
912 return REG_ASSERT;
913}
914
915/*
916 * cbrdissect - dissect match for backref node
917 */
918static int /* regexec return code */
919cbrdissect(struct vars *v,
920 struct subre *t,
921 chr *begin, /* beginning of relevant substring */
922 chr *end) /* end of same */
923{
924 int n = t->subno;
925 size_t numreps;
926 size_t tlen;
927 size_t brlen;
928 chr *brstring;
929 chr *p;
930 int min = t->min;
931 int max = t->max;
932
933 assert(t != NULL);
934 assert(t->op == 'b');
935 assert(n >= 0);
936 assert((size_t) n < v->nmatch);
937
938 MDEBUG(("cbackref n%d %d{%d-%d}\n", t->id, n, min, max));
939
940 /* get the backreferenced string */
941 if (v->pmatch[n].rm_so == -1)
942 return REG_NOMATCH;
943 brstring = v->start + v->pmatch[n].rm_so;
944 brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
945
946 /* special cases for zero-length strings */
947 if (brlen == 0)
948 {
949 /*
950 * matches only if target is zero length, but any number of
951 * repetitions can be considered to be present
952 */
953 if (begin == end && min <= max)
954 {
955 MDEBUG(("cbackref matched trivially\n"));
956 return REG_OKAY;
957 }
958 return REG_NOMATCH;
959 }
960 if (begin == end)
961 {
962 /* matches only if zero repetitions are okay */
963 if (min == 0)
964 {
965 MDEBUG(("cbackref matched trivially\n"));
966 return REG_OKAY;
967 }
968 return REG_NOMATCH;
969 }
970
971 /*
972 * check target length to see if it could possibly be an allowed number of
973 * repetitions of brstring
974 */
975 assert(end > begin);
976 tlen = end - begin;
977 if (tlen % brlen != 0)
978 return REG_NOMATCH;
979 numreps = tlen / brlen;
980 if (numreps < min || (numreps > max && max != DUPINF))
981 return REG_NOMATCH;
982
983 /* okay, compare the actual string contents */
984 p = begin;
985 while (numreps-- > 0)
986 {
987 if ((*v->g->compare) (brstring, p, brlen) != 0)
988 return REG_NOMATCH;
989 p += brlen;
990 }
991
992 MDEBUG(("cbackref matched\n"));
993 return REG_OKAY;
994}
995
996/*
997 * caltdissect - dissect match for alternation node
998 */
999static int /* regexec return code */
1000caltdissect(struct vars *v,
1001 struct subre *t,
1002 chr *begin, /* beginning of relevant substring */
1003 chr *end) /* end of same */
1004{
1005 struct dfa *d;
1006 int er;
1007
1008 /* We loop, rather than tail-recurse, to handle a chain of alternatives */
1009 while (t != NULL)
1010 {
1011 assert(t->op == '|');
1012 assert(t->left != NULL && t->left->cnfa.nstates > 0);
1013
1014 MDEBUG(("calt n%d\n", t->id));
1015
1016 d = getsubdfa(v, t->left);
1017 NOERR();
1018 if (longest(v, d, begin, end, (int *) NULL) == end)
1019 {
1020 MDEBUG(("calt matched\n"));
1021 er = cdissect(v, t->left, begin, end);
1022 if (er != REG_NOMATCH)
1023 return er;
1024 }
1025 NOERR();
1026
1027 t = t->right;
1028 }
1029
1030 return REG_NOMATCH;
1031}
1032
1033/*
1034 * citerdissect - dissect match for iteration node
1035 */
1036static int /* regexec return code */
1037citerdissect(struct vars *v,
1038 struct subre *t,
1039 chr *begin, /* beginning of relevant substring */
1040 chr *end) /* end of same */
1041{
1042 struct dfa *d;
1043 chr **endpts;
1044 chr *limit;
1045 int min_matches;
1046 size_t max_matches;
1047 int nverified;
1048 int k;
1049 int i;
1050 int er;
1051
1052 assert(t->op == '*');
1053 assert(t->left != NULL && t->left->cnfa.nstates > 0);
1054 assert(!(t->left->flags & SHORTER));
1055 assert(begin <= end);
1056
1057 /*
1058 * For the moment, assume the minimum number of matches is 1. If zero
1059 * matches are allowed, and the target string is empty, we are allowed to
1060 * match regardless of the contents of the iter node --- but we would
1061 * prefer to match once, so that capturing parens get set. (An example of
1062 * the concern here is a pattern like "()*\1", which historically this
1063 * code has allowed to succeed.) Therefore, we deal with the zero-matches
1064 * case at the bottom, after failing to find any other way to match.
1065 */
1066 min_matches = t->min;
1067 if (min_matches <= 0)
1068 min_matches = 1;
1069
1070 /*
1071 * We need workspace to track the endpoints of each sub-match. Normally
1072 * we consider only nonzero-length sub-matches, so there can be at most
1073 * end-begin of them. However, if min is larger than that, we will also
1074 * consider zero-length sub-matches in order to find enough matches.
1075 *
1076 * For convenience, endpts[0] contains the "begin" pointer and we store
1077 * sub-match endpoints in endpts[1..max_matches].
1078 */
1079 max_matches = end - begin;
1080 if (max_matches > t->max && t->max != DUPINF)
1081 max_matches = t->max;
1082 if (max_matches < min_matches)
1083 max_matches = min_matches;
1084 endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
1085 if (endpts == NULL)
1086 return REG_ESPACE;
1087 endpts[0] = begin;
1088
1089 d = getsubdfa(v, t->left);
1090 if (ISERR())
1091 {
1092 FREE(endpts);
1093 return v->err;
1094 }
1095 MDEBUG(("citer %d\n", t->id));
1096
1097 /*
1098 * Our strategy is to first find a set of sub-match endpoints that are
1099 * valid according to the child node's DFA, and then recursively dissect
1100 * each sub-match to confirm validity. If any validity check fails,
1101 * backtrack the last sub-match and try again. And, when we next try for
1102 * a validity check, we need not recheck any successfully verified
1103 * sub-matches that we didn't move the endpoints of. nverified remembers
1104 * how many sub-matches are currently known okay.
1105 */
1106
1107 /* initialize to consider first sub-match */
1108 nverified = 0;
1109 k = 1;
1110 limit = end;
1111
1112 /* iterate until satisfaction or failure */
1113 while (k > 0)
1114 {
1115 /* try to find an endpoint for the k'th sub-match */
1116 endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL);
1117 if (ISERR())
1118 {
1119 FREE(endpts);
1120 return v->err;
1121 }
1122 if (endpts[k] == NULL)
1123 {
1124 /* no match possible, so see if we can shorten previous one */
1125 k--;
1126 goto backtrack;
1127 }
1128 MDEBUG(("%d: working endpoint %d: %ld\n",
1129 t->id, k, LOFF(endpts[k])));
1130
1131 /* k'th sub-match can no longer be considered verified */
1132 if (nverified >= k)
1133 nverified = k - 1;
1134
1135 if (endpts[k] != end)
1136 {
1137 /* haven't reached end yet, try another iteration if allowed */
1138 if (k >= max_matches)
1139 {
1140 /* must try to shorten some previous match */
1141 k--;
1142 goto backtrack;
1143 }
1144
1145 /* reject zero-length match unless necessary to achieve min */
1146 if (endpts[k] == endpts[k - 1] &&
1147 (k >= min_matches || min_matches - k < end - endpts[k]))
1148 goto backtrack;
1149
1150 k++;
1151 limit = end;
1152 continue;
1153 }
1154
1155 /*
1156 * We've identified a way to divide the string into k sub-matches that
1157 * works so far as the child DFA can tell. If k is an allowed number
1158 * of matches, start the slow part: recurse to verify each sub-match.
1159 * We always have k <= max_matches, needn't check that.
1160 */
1161 if (k < min_matches)
1162 goto backtrack;
1163
1164 MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
1165
1166 for (i = nverified + 1; i <= k; i++)
1167 {
1168 zaptreesubs(v, t->left);
1169 er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
1170 if (er == REG_OKAY)
1171 {
1172 nverified = i;
1173 continue;
1174 }
1175 if (er == REG_NOMATCH)
1176 break;
1177 /* oops, something failed */
1178 FREE(endpts);
1179 return er;
1180 }
1181
1182 if (i > k)
1183 {
1184 /* satisfaction */
1185 MDEBUG(("%d successful\n", t->id));
1186 FREE(endpts);
1187 return REG_OKAY;
1188 }
1189
1190 /* match failed to verify, so backtrack */
1191
1192backtrack:
1193
1194 /*
1195 * Must consider shorter versions of the current sub-match. However,
1196 * we'll only ask for a zero-length match if necessary.
1197 */
1198 while (k > 0)
1199 {
1200 chr *prev_end = endpts[k - 1];
1201
1202 if (endpts[k] > prev_end)
1203 {
1204 limit = endpts[k] - 1;
1205 if (limit > prev_end ||
1206 (k < min_matches && min_matches - k >= end - prev_end))
1207 {
1208 /* break out of backtrack loop, continue the outer one */
1209 break;
1210 }
1211 }
1212 /* can't shorten k'th sub-match any more, consider previous one */
1213 k--;
1214 }
1215 }
1216
1217 /* all possibilities exhausted */
1218 FREE(endpts);
1219
1220 /*
1221 * Now consider the possibility that we can match to a zero-length string
1222 * by using zero repetitions.
1223 */
1224 if (t->min == 0 && begin == end)
1225 {
1226 MDEBUG(("%d allowing zero matches\n", t->id));
1227 return REG_OKAY;
1228 }
1229
1230 MDEBUG(("%d failed\n", t->id));
1231 return REG_NOMATCH;
1232}
1233
1234/*
1235 * creviterdissect - dissect match for iteration node, shortest-first
1236 */
1237static int /* regexec return code */
1238creviterdissect(struct vars *v,
1239 struct subre *t,
1240 chr *begin, /* beginning of relevant substring */
1241 chr *end) /* end of same */
1242{
1243 struct dfa *d;
1244 chr **endpts;
1245 chr *limit;
1246 int min_matches;
1247 size_t max_matches;
1248 int nverified;
1249 int k;
1250 int i;
1251 int er;
1252
1253 assert(t->op == '*');
1254 assert(t->left != NULL && t->left->cnfa.nstates > 0);
1255 assert(t->left->flags & SHORTER);
1256 assert(begin <= end);
1257
1258 /*
1259 * If zero matches are allowed, and target string is empty, just declare
1260 * victory. OTOH, if target string isn't empty, zero matches can't work
1261 * so we pretend the min is 1.
1262 */
1263 min_matches = t->min;
1264 if (min_matches <= 0)
1265 {
1266 if (begin == end)
1267 return REG_OKAY;
1268 min_matches = 1;
1269 }
1270
1271 /*
1272 * We need workspace to track the endpoints of each sub-match. Normally
1273 * we consider only nonzero-length sub-matches, so there can be at most
1274 * end-begin of them. However, if min is larger than that, we will also
1275 * consider zero-length sub-matches in order to find enough matches.
1276 *
1277 * For convenience, endpts[0] contains the "begin" pointer and we store
1278 * sub-match endpoints in endpts[1..max_matches].
1279 */
1280 max_matches = end - begin;
1281 if (max_matches > t->max && t->max != DUPINF)
1282 max_matches = t->max;
1283 if (max_matches < min_matches)
1284 max_matches = min_matches;
1285 endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
1286 if (endpts == NULL)
1287 return REG_ESPACE;
1288 endpts[0] = begin;
1289
1290 d = getsubdfa(v, t->left);
1291 if (ISERR())
1292 {
1293 FREE(endpts);
1294 return v->err;
1295 }
1296 MDEBUG(("creviter %d\n", t->id));
1297
1298 /*
1299 * Our strategy is to first find a set of sub-match endpoints that are
1300 * valid according to the child node's DFA, and then recursively dissect
1301 * each sub-match to confirm validity. If any validity check fails,
1302 * backtrack the last sub-match and try again. And, when we next try for
1303 * a validity check, we need not recheck any successfully verified
1304 * sub-matches that we didn't move the endpoints of. nverified remembers
1305 * how many sub-matches are currently known okay.
1306 */
1307
1308 /* initialize to consider first sub-match */
1309 nverified = 0;
1310 k = 1;
1311 limit = begin;
1312
1313 /* iterate until satisfaction or failure */
1314 while (k > 0)
1315 {
1316 /* disallow zero-length match unless necessary to achieve min */
1317 if (limit == endpts[k - 1] &&
1318 limit != end &&
1319 (k >= min_matches || min_matches - k < end - limit))
1320 limit++;
1321
1322 /* if this is the last allowed sub-match, it must reach to the end */
1323 if (k >= max_matches)
1324 limit = end;
1325
1326 /* try to find an endpoint for the k'th sub-match */
1327 endpts[k] = shortest(v, d, endpts[k - 1], limit, end,
1328 (chr **) NULL, (int *) NULL);
1329 if (ISERR())
1330 {
1331 FREE(endpts);
1332 return v->err;
1333 }
1334 if (endpts[k] == NULL)
1335 {
1336 /* no match possible, so see if we can lengthen previous one */
1337 k--;
1338 goto backtrack;
1339 }
1340 MDEBUG(("%d: working endpoint %d: %ld\n",
1341 t->id, k, LOFF(endpts[k])));
1342
1343 /* k'th sub-match can no longer be considered verified */
1344 if (nverified >= k)
1345 nverified = k - 1;
1346
1347 if (endpts[k] != end)
1348 {
1349 /* haven't reached end yet, try another iteration if allowed */
1350 if (k >= max_matches)
1351 {
1352 /* must try to lengthen some previous match */
1353 k--;
1354 goto backtrack;
1355 }
1356
1357 k++;
1358 limit = endpts[k - 1];
1359 continue;
1360 }
1361
1362 /*
1363 * We've identified a way to divide the string into k sub-matches that
1364 * works so far as the child DFA can tell. If k is an allowed number
1365 * of matches, start the slow part: recurse to verify each sub-match.
1366 * We always have k <= max_matches, needn't check that.
1367 */
1368 if (k < min_matches)
1369 goto backtrack;
1370
1371 MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
1372
1373 for (i = nverified + 1; i <= k; i++)
1374 {
1375 zaptreesubs(v, t->left);
1376 er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
1377 if (er == REG_OKAY)
1378 {
1379 nverified = i;
1380 continue;
1381 }
1382 if (er == REG_NOMATCH)
1383 break;
1384 /* oops, something failed */
1385 FREE(endpts);
1386 return er;
1387 }
1388
1389 if (i > k)
1390 {
1391 /* satisfaction */
1392 MDEBUG(("%d successful\n", t->id));
1393 FREE(endpts);
1394 return REG_OKAY;
1395 }
1396
1397 /* match failed to verify, so backtrack */
1398
1399backtrack:
1400
1401 /*
1402 * Must consider longer versions of the current sub-match.
1403 */
1404 while (k > 0)
1405 {
1406 if (endpts[k] < end)
1407 {
1408 limit = endpts[k] + 1;
1409 /* break out of backtrack loop, continue the outer one */
1410 break;
1411 }
1412 /* can't lengthen k'th sub-match any more, consider previous one */
1413 k--;
1414 }
1415 }
1416
1417 /* all possibilities exhausted */
1418 MDEBUG(("%d failed\n", t->id));
1419 FREE(endpts);
1420 return REG_NOMATCH;
1421}
1422
1423
1424
1425#include "rege_dfa.c"
1426