1// Copyright 2006-2007 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Tested by search_test.cc.
6//
7// Prog::SearchNFA, an NFA search.
8// This is an actual NFA like the theorists talk about,
9// not the pseudo-NFA found in backtracking regexp implementations.
10//
11// IMPLEMENTATION
12//
13// This algorithm is a variant of one that appeared in Rob Pike's sam editor,
14// which is a variant of the one described in Thompson's 1968 CACM paper.
15// See http://swtch.com/~rsc/regexp/ for various history. The main feature
16// over the DFA implementation is that it tracks submatch boundaries.
17//
18// When the choice of submatch boundaries is ambiguous, this particular
19// implementation makes the same choices that traditional backtracking
20// implementations (in particular, Perl and PCRE) do.
21// Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
22// time in the length of the input.
23//
24// Like Thompson's original machine and like the DFA implementation, this
25// implementation notices a match only once it is one byte past it.
26
27#include <stdio.h>
28#include <string.h>
29#include <algorithm>
30#include <string>
31#include <utility>
32#include <vector>
33
34#include "util/logging.h"
35#include "util/strutil.h"
36#include "re2/pod_array.h"
37#include "re2/prog.h"
38#include "re2/regexp.h"
39#include "re2/sparse_array.h"
40#include "re2/sparse_set.h"
41
42namespace re2 {
43
44static const bool ExtraDebug = false;
45
46class NFA {
47 public:
48 NFA(Prog* prog);
49 ~NFA();
50
51 // Searches for a matching string.
52 // * If anchored is true, only considers matches starting at offset.
53 // Otherwise finds lefmost match at or after offset.
54 // * If longest is true, returns the longest match starting
55 // at the chosen start point. Otherwise returns the so-called
56 // left-biased match, the one traditional backtracking engines
57 // (like Perl and PCRE) find.
58 // Records submatch boundaries in submatch[1..nsubmatch-1].
59 // Submatch[0] is the entire match. When there is a choice in
60 // which text matches each subexpression, the submatch boundaries
61 // are chosen to match what a backtracking implementation would choose.
62 bool Search(const StringPiece& text, const StringPiece& context,
63 bool anchored, bool longest,
64 StringPiece* submatch, int nsubmatch);
65
66 private:
67 struct Thread {
68 union {
69 int ref;
70 Thread* next; // when on free list
71 };
72 const char** capture;
73 };
74
75 // State for explicit stack in AddToThreadq.
76 struct AddState {
77 int id; // Inst to process
78 Thread* t; // if not null, set t0 = t before processing id
79 };
80
81 // Threadq is a list of threads. The list is sorted by the order
82 // in which Perl would explore that particular state -- the earlier
83 // choices appear earlier in the list.
84 typedef SparseArray<Thread*> Threadq;
85
86 inline Thread* AllocThread();
87 inline Thread* Incref(Thread* t);
88 inline void Decref(Thread* t);
89
90 // Follows all empty arrows from id0 and enqueues all the states reached.
91 // Enqueues only the ByteRange instructions that match byte c.
92 // context is used (with p) for evaluating empty-width specials.
93 // p is the current input position, and t0 is the current thread.
94 void AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
95 const char* p, Thread* t0);
96
97 // Run runq on byte c, appending new states to nextq.
98 // Updates matched_ and match_ as new, better matches are found.
99 // context is used (with p) for evaluating empty-width specials.
100 // p is the position of byte c in the input string for AddToThreadq;
101 // p-1 will be used when processing Match instructions.
102 // Frees all the threads on runq.
103 // If there is a shortcut to the end, returns that shortcut.
104 int Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
105 const char* p);
106
107 // Returns text version of capture information, for debugging.
108 std::string FormatCapture(const char** capture);
109
110 inline void CopyCapture(const char** dst, const char** src);
111
112 Prog* prog_; // underlying program
113 int start_; // start instruction in program
114 int ncapture_; // number of submatches to track
115 bool longest_; // whether searching for longest match
116 bool endmatch_; // whether match must end at text.end()
117 const char* btext_; // beginning of text being matched (for FormatSubmatch)
118 const char* etext_; // end of text being matched (for endmatch_)
119 Threadq q0_, q1_; // pre-allocated for Search.
120 PODArray<AddState> stack_; // pre-allocated for AddToThreadq
121 Thread* free_threads_; // free list
122 const char** match_; // best match so far
123 bool matched_; // any match so far?
124
125 NFA(const NFA&) = delete;
126 NFA& operator=(const NFA&) = delete;
127};
128
129NFA::NFA(Prog* prog) {
130 prog_ = prog;
131 start_ = prog_->start();
132 ncapture_ = 0;
133 longest_ = false;
134 endmatch_ = false;
135 btext_ = NULL;
136 etext_ = NULL;
137 q0_.resize(prog_->size());
138 q1_.resize(prog_->size());
139 // See NFA::AddToThreadq() for why this is so.
140 int nstack = 2*prog_->inst_count(kInstCapture) +
141 prog_->inst_count(kInstEmptyWidth) +
142 prog_->inst_count(kInstNop) + 1; // + 1 for start inst
143 stack_ = PODArray<AddState>(nstack);
144 free_threads_ = NULL;
145 match_ = NULL;
146 matched_ = false;
147}
148
149NFA::~NFA() {
150 delete[] match_;
151 Thread* next;
152 for (Thread* t = free_threads_; t; t = next) {
153 next = t->next;
154 delete[] t->capture;
155 delete t;
156 }
157}
158
159NFA::Thread* NFA::AllocThread() {
160 Thread* t = free_threads_;
161 if (t == NULL) {
162 t = new Thread;
163 t->ref = 1;
164 t->capture = new const char*[ncapture_];
165 return t;
166 }
167 free_threads_ = t->next;
168 t->ref = 1;
169 return t;
170}
171
172NFA::Thread* NFA::Incref(Thread* t) {
173 DCHECK(t != NULL);
174 t->ref++;
175 return t;
176}
177
178void NFA::Decref(Thread* t) {
179 if (t == NULL)
180 return;
181 t->ref--;
182 if (t->ref > 0)
183 return;
184 DCHECK_EQ(t->ref, 0);
185 t->next = free_threads_;
186 free_threads_ = t;
187}
188
189void NFA::CopyCapture(const char** dst, const char** src) {
190 for (int i = 0; i < ncapture_; i+=2) {
191 dst[i] = src[i];
192 dst[i+1] = src[i+1];
193 }
194}
195
196// Follows all empty arrows from id0 and enqueues all the states reached.
197// Enqueues only the ByteRange instructions that match byte c.
198// context is used (with p) for evaluating empty-width specials.
199// p is the current input position, and t0 is the current thread.
200void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
201 const char* p, Thread* t0) {
202 if (id0 == 0)
203 return;
204
205 // Use stack_ to hold our stack of instructions yet to process.
206 // It was preallocated as follows:
207 // two entries per Capture;
208 // one entry per EmptyWidth; and
209 // one entry per Nop.
210 // This reflects the maximum number of stack pushes that each can
211 // perform. (Each instruction can be processed at most once.)
212 AddState* stk = stack_.data();
213 int nstk = 0;
214
215 stk[nstk++] = {id0, NULL};
216 while (nstk > 0) {
217 DCHECK_LE(nstk, stack_.size());
218 AddState a = stk[--nstk];
219
220 Loop:
221 if (a.t != NULL) {
222 // t0 was a thread that we allocated and copied in order to
223 // record the capture, so we must now decref it.
224 Decref(t0);
225 t0 = a.t;
226 }
227
228 int id = a.id;
229 if (id == 0)
230 continue;
231 if (q->has_index(id)) {
232 if (ExtraDebug)
233 fprintf(stderr, " [%d%s]\n", id, FormatCapture(t0->capture).c_str());
234 continue;
235 }
236
237 // Create entry in q no matter what. We might fill it in below,
238 // or we might not. Even if not, it is necessary to have it,
239 // so that we don't revisit id0 during the recursion.
240 q->set_new(id, NULL);
241 Thread** tp = &q->get_existing(id);
242 int j;
243 Thread* t;
244 Prog::Inst* ip = prog_->inst(id);
245 switch (ip->opcode()) {
246 default:
247 LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
248 break;
249
250 case kInstFail:
251 break;
252
253 case kInstAltMatch:
254 // Save state; will pick up at next byte.
255 t = Incref(t0);
256 *tp = t;
257
258 DCHECK(!ip->last());
259 a = {id+1, NULL};
260 goto Loop;
261
262 case kInstNop:
263 if (!ip->last())
264 stk[nstk++] = {id+1, NULL};
265
266 // Continue on.
267 a = {ip->out(), NULL};
268 goto Loop;
269
270 case kInstCapture:
271 if (!ip->last())
272 stk[nstk++] = {id+1, NULL};
273
274 if ((j=ip->cap()) < ncapture_) {
275 // Push a dummy whose only job is to restore t0
276 // once we finish exploring this possibility.
277 stk[nstk++] = {0, t0};
278
279 // Record capture.
280 t = AllocThread();
281 CopyCapture(t->capture, t0->capture);
282 t->capture[j] = p;
283 t0 = t;
284 }
285 a = {ip->out(), NULL};
286 goto Loop;
287
288 case kInstByteRange:
289 if (!ip->Matches(c))
290 goto Next;
291
292 // Save state; will pick up at next byte.
293 t = Incref(t0);
294 *tp = t;
295 if (ExtraDebug)
296 fprintf(stderr, " + %d%s\n", id, FormatCapture(t0->capture).c_str());
297
298 if (ip->hint() == 0)
299 break;
300 a = {id+ip->hint(), NULL};
301 goto Loop;
302
303 case kInstMatch:
304 // Save state; will pick up at next byte.
305 t = Incref(t0);
306 *tp = t;
307 if (ExtraDebug)
308 fprintf(stderr, " ! %d%s\n", id, FormatCapture(t0->capture).c_str());
309
310 Next:
311 if (ip->last())
312 break;
313 a = {id+1, NULL};
314 goto Loop;
315
316 case kInstEmptyWidth:
317 if (!ip->last())
318 stk[nstk++] = {id+1, NULL};
319
320 // Continue on if we have all the right flag bits.
321 if (ip->empty() & ~Prog::EmptyFlags(context, p))
322 break;
323 a = {ip->out(), NULL};
324 goto Loop;
325 }
326 }
327}
328
329// Run runq on byte c, appending new states to nextq.
330// Updates matched_ and match_ as new, better matches are found.
331// context is used (with p) for evaluating empty-width specials.
332// p is the position of byte c in the input string for AddToThreadq;
333// p-1 will be used when processing Match instructions.
334// Frees all the threads on runq.
335// If there is a shortcut to the end, returns that shortcut.
336int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
337 const char* p) {
338 nextq->clear();
339
340 for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
341 Thread* t = i->value();
342 if (t == NULL)
343 continue;
344
345 if (longest_) {
346 // Can skip any threads started after our current best match.
347 if (matched_ && match_[0] < t->capture[0]) {
348 Decref(t);
349 continue;
350 }
351 }
352
353 int id = i->index();
354 Prog::Inst* ip = prog_->inst(id);
355
356 switch (ip->opcode()) {
357 default:
358 // Should only see the values handled below.
359 LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
360 break;
361
362 case kInstByteRange:
363 AddToThreadq(nextq, ip->out(), c, context, p, t);
364 break;
365
366 case kInstAltMatch:
367 if (i != runq->begin())
368 break;
369 // The match is ours if we want it.
370 if (ip->greedy(prog_) || longest_) {
371 CopyCapture(match_, t->capture);
372 matched_ = true;
373
374 Decref(t);
375 for (++i; i != runq->end(); ++i)
376 Decref(i->value());
377 runq->clear();
378 if (ip->greedy(prog_))
379 return ip->out1();
380 return ip->out();
381 }
382 break;
383
384 case kInstMatch: {
385 // Avoid invoking undefined behavior (arithmetic on a null pointer)
386 // by storing p instead of p-1. (What would the latter even mean?!)
387 // This complements the special case in NFA::Search().
388 if (p == NULL) {
389 CopyCapture(match_, t->capture);
390 match_[1] = p;
391 matched_ = true;
392 break;
393 }
394
395 if (endmatch_ && p-1 != etext_)
396 break;
397
398 if (longest_) {
399 // Leftmost-longest mode: save this match only if
400 // it is either farther to the left or at the same
401 // point but longer than an existing match.
402 if (!matched_ || t->capture[0] < match_[0] ||
403 (t->capture[0] == match_[0] && p-1 > match_[1])) {
404 CopyCapture(match_, t->capture);
405 match_[1] = p-1;
406 matched_ = true;
407 }
408 } else {
409 // Leftmost-biased mode: this match is by definition
410 // better than what we've already found (see next line).
411 CopyCapture(match_, t->capture);
412 match_[1] = p-1;
413 matched_ = true;
414
415 // Cut off the threads that can only find matches
416 // worse than the one we just found: don't run the
417 // rest of the current Threadq.
418 Decref(t);
419 for (++i; i != runq->end(); ++i)
420 Decref(i->value());
421 runq->clear();
422 return 0;
423 }
424 break;
425 }
426 }
427 Decref(t);
428 }
429 runq->clear();
430 return 0;
431}
432
433std::string NFA::FormatCapture(const char** capture) {
434 std::string s;
435 for (int i = 0; i < ncapture_; i+=2) {
436 if (capture[i] == NULL)
437 s += "(?,?)";
438 else if (capture[i+1] == NULL)
439 s += StringPrintf("(%td,?)",
440 capture[i] - btext_);
441 else
442 s += StringPrintf("(%td,%td)",
443 capture[i] - btext_,
444 capture[i+1] - btext_);
445 }
446 return s;
447}
448
449bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
450 bool anchored, bool longest,
451 StringPiece* submatch, int nsubmatch) {
452 if (start_ == 0)
453 return false;
454
455 StringPiece context = const_context;
456 if (context.data() == NULL)
457 context = text;
458
459 // Sanity check: make sure that text lies within context.
460 if (text.begin() < context.begin() || text.end() > context.end()) {
461 LOG(DFATAL) << "context does not contain text";
462 return false;
463 }
464
465 if (prog_->anchor_start() && context.begin() != text.begin())
466 return false;
467 if (prog_->anchor_end() && context.end() != text.end())
468 return false;
469 anchored |= prog_->anchor_start();
470 if (prog_->anchor_end()) {
471 longest = true;
472 endmatch_ = true;
473 }
474
475 if (nsubmatch < 0) {
476 LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
477 return false;
478 }
479
480 // Save search parameters.
481 ncapture_ = 2*nsubmatch;
482 longest_ = longest;
483
484 if (nsubmatch == 0) {
485 // We need to maintain match[0], both to distinguish the
486 // longest match (if longest is true) and also to tell
487 // whether we've seen any matches at all.
488 ncapture_ = 2;
489 }
490
491 match_ = new const char*[ncapture_];
492 matched_ = false;
493
494 // For debugging prints.
495 btext_ = context.data();
496 // For convenience.
497 etext_ = text.data() + text.size();
498
499 if (ExtraDebug)
500 fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
501 std::string(text).c_str(), std::string(context).c_str(), anchored, longest);
502
503 // Set up search.
504 Threadq* runq = &q0_;
505 Threadq* nextq = &q1_;
506 runq->clear();
507 nextq->clear();
508 memset(&match_[0], 0, ncapture_*sizeof match_[0]);
509
510 // Loop over the text, stepping the machine.
511 for (const char* p = text.data();; p++) {
512 if (ExtraDebug) {
513 int c = 0;
514 if (p == btext_)
515 c = '^';
516 else if (p > etext_)
517 c = '$';
518 else if (p < etext_)
519 c = p[0] & 0xFF;
520
521 fprintf(stderr, "%c:", c);
522 for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
523 Thread* t = i->value();
524 if (t == NULL)
525 continue;
526 fprintf(stderr, " %d%s", i->index(), FormatCapture(t->capture).c_str());
527 }
528 fprintf(stderr, "\n");
529 }
530
531 // This is a no-op the first time around the loop because runq is empty.
532 int id = Step(runq, nextq, p < etext_ ? p[0] & 0xFF : -1, context, p);
533 DCHECK_EQ(runq->size(), 0);
534 using std::swap;
535 swap(nextq, runq);
536 nextq->clear();
537 if (id != 0) {
538 // We're done: full match ahead.
539 p = etext_;
540 for (;;) {
541 Prog::Inst* ip = prog_->inst(id);
542 switch (ip->opcode()) {
543 default:
544 LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
545 break;
546
547 case kInstCapture:
548 if (ip->cap() < ncapture_)
549 match_[ip->cap()] = p;
550 id = ip->out();
551 continue;
552
553 case kInstNop:
554 id = ip->out();
555 continue;
556
557 case kInstMatch:
558 match_[1] = p;
559 matched_ = true;
560 break;
561 }
562 break;
563 }
564 break;
565 }
566
567 if (p > etext_)
568 break;
569
570 // Start a new thread if there have not been any matches.
571 // (No point in starting a new thread if there have been
572 // matches, since it would be to the right of the match
573 // we already found.)
574 if (!matched_ && (!anchored || p == text.data())) {
575 // If there's a required first byte for an unanchored search
576 // and we're not in the middle of any possible matches,
577 // use memchr to search for the byte quickly.
578 int fb = prog_->first_byte();
579 if (!anchored && runq->size() == 0 &&
580 fb >= 0 && p < etext_ && (p[0] & 0xFF) != fb) {
581 p = reinterpret_cast<const char*>(memchr(p, fb, etext_ - p));
582 if (p == NULL) {
583 p = etext_;
584 }
585 }
586
587 Thread* t = AllocThread();
588 CopyCapture(t->capture, match_);
589 t->capture[0] = p;
590 AddToThreadq(runq, start_, p < etext_ ? p[0] & 0xFF : -1, context, p,
591 t);
592 Decref(t);
593 }
594
595 // If all the threads have died, stop early.
596 if (runq->size() == 0) {
597 if (ExtraDebug)
598 fprintf(stderr, "dead\n");
599 break;
600 }
601
602 // Avoid invoking undefined behavior (arithmetic on a null pointer)
603 // by simply not continuing the loop.
604 // This complements the special case in NFA::Step().
605 if (p == NULL) {
606 (void)Step(runq, nextq, p < etext_ ? p[0] & 0xFF : -1, context, p);
607 DCHECK_EQ(runq->size(), 0);
608 using std::swap;
609 swap(nextq, runq);
610 nextq->clear();
611 break;
612 }
613 }
614
615 for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
616 Decref(i->value());
617
618 if (matched_) {
619 for (int i = 0; i < nsubmatch; i++)
620 submatch[i] =
621 StringPiece(match_[2 * i],
622 static_cast<size_t>(match_[2 * i + 1] - match_[2 * i]));
623 if (ExtraDebug)
624 fprintf(stderr, "match (%td,%td)\n",
625 match_[0] - btext_,
626 match_[1] - btext_);
627 return true;
628 }
629 return false;
630}
631
632// Computes whether all successful matches have a common first byte,
633// and if so, returns that byte. If not, returns -1.
634int Prog::ComputeFirstByte() {
635 int b = -1;
636 SparseSet q(size());
637 q.insert(start());
638 for (SparseSet::iterator it = q.begin(); it != q.end(); ++it) {
639 int id = *it;
640 Prog::Inst* ip = inst(id);
641 switch (ip->opcode()) {
642 default:
643 LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
644 break;
645
646 case kInstMatch:
647 // The empty string matches: no first byte.
648 return -1;
649
650 case kInstByteRange:
651 if (!ip->last())
652 q.insert(id+1);
653
654 // Must match only a single byte
655 if (ip->lo() != ip->hi())
656 return -1;
657 if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')
658 return -1;
659 // If we haven't seen any bytes yet, record it;
660 // otherwise must match the one we saw before.
661 if (b == -1)
662 b = ip->lo();
663 else if (b != ip->lo())
664 return -1;
665 break;
666
667 case kInstNop:
668 case kInstCapture:
669 case kInstEmptyWidth:
670 if (!ip->last())
671 q.insert(id+1);
672
673 // Continue on.
674 // Ignore ip->empty() flags for kInstEmptyWidth
675 // in order to be as conservative as possible
676 // (assume all possible empty-width flags are true).
677 if (ip->out())
678 q.insert(ip->out());
679 break;
680
681 case kInstAltMatch:
682 DCHECK(!ip->last());
683 q.insert(id+1);
684 break;
685
686 case kInstFail:
687 break;
688 }
689 }
690 return b;
691}
692
693bool
694Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
695 Anchor anchor, MatchKind kind,
696 StringPiece* match, int nmatch) {
697 if (ExtraDebug)
698 Dump();
699
700 NFA nfa(this);
701 StringPiece sp;
702 if (kind == kFullMatch) {
703 anchor = kAnchored;
704 if (nmatch == 0) {
705 match = &sp;
706 nmatch = 1;
707 }
708 }
709 if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
710 return false;
711 if (kind == kFullMatch && match[0].end() != text.end())
712 return false;
713 return true;
714}
715
716// For each instruction i in the program reachable from the start, compute the
717// number of instructions reachable from i by following only empty transitions
718// and record that count as fanout[i].
719//
720// fanout holds the results and is also the work queue for the outer iteration.
721// reachable holds the reached nodes for the inner iteration.
722void Prog::Fanout(SparseArray<int>* fanout) {
723 DCHECK_EQ(fanout->max_size(), size());
724 SparseSet reachable(size());
725 fanout->clear();
726 fanout->set_new(start(), 0);
727 for (SparseArray<int>::iterator i = fanout->begin(); i != fanout->end(); ++i) {
728 int* count = &i->value();
729 reachable.clear();
730 reachable.insert(i->index());
731 for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) {
732 int id = *j;
733 Prog::Inst* ip = inst(id);
734 switch (ip->opcode()) {
735 default:
736 LOG(DFATAL) << "unhandled " << ip->opcode() << " in Prog::Fanout()";
737 break;
738
739 case kInstByteRange:
740 if (!ip->last())
741 reachable.insert(id+1);
742
743 (*count)++;
744 if (!fanout->has_index(ip->out())) {
745 fanout->set_new(ip->out(), 0);
746 }
747 break;
748
749 case kInstAltMatch:
750 DCHECK(!ip->last());
751 reachable.insert(id+1);
752 break;
753
754 case kInstCapture:
755 case kInstEmptyWidth:
756 case kInstNop:
757 if (!ip->last())
758 reachable.insert(id+1);
759
760 reachable.insert(ip->out());
761 break;
762
763 case kInstMatch:
764 if (!ip->last())
765 reachable.insert(id+1);
766 break;
767
768 case kInstFail:
769 break;
770 }
771 }
772 }
773}
774
775} // namespace re2
776