1 | // Copyright 2006-2007 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | // Tested by search_test.cc. |
6 | // |
7 | // Prog::SearchNFA, an NFA search. |
8 | // This is an actual NFA like the theorists talk about, |
9 | // not the pseudo-NFA found in backtracking regexp implementations. |
10 | // |
11 | // IMPLEMENTATION |
12 | // |
13 | // This algorithm is a variant of one that appeared in Rob Pike's sam editor, |
14 | // which is a variant of the one described in Thompson's 1968 CACM paper. |
15 | // See http://swtch.com/~rsc/regexp/ for various history. The main feature |
16 | // over the DFA implementation is that it tracks submatch boundaries. |
17 | // |
18 | // When the choice of submatch boundaries is ambiguous, this particular |
19 | // implementation makes the same choices that traditional backtracking |
20 | // implementations (in particular, Perl and PCRE) do. |
21 | // Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential |
22 | // time in the length of the input. |
23 | // |
24 | // Like Thompson's original machine and like the DFA implementation, this |
25 | // implementation notices a match only once it is one byte past it. |
26 | |
27 | #include <stdio.h> |
28 | #include <string.h> |
29 | #include <algorithm> |
30 | #include <string> |
31 | #include <utility> |
32 | #include <vector> |
33 | |
34 | #include "util/logging.h" |
35 | #include "util/strutil.h" |
36 | #include "re2/pod_array.h" |
37 | #include "re2/prog.h" |
38 | #include "re2/regexp.h" |
39 | #include "re2/sparse_array.h" |
40 | #include "re2/sparse_set.h" |
41 | |
42 | namespace re2 { |
43 | |
44 | static const bool = false; |
45 | |
46 | class NFA { |
47 | public: |
48 | NFA(Prog* prog); |
49 | ~NFA(); |
50 | |
51 | // Searches for a matching string. |
52 | // * If anchored is true, only considers matches starting at offset. |
53 | // Otherwise finds lefmost match at or after offset. |
54 | // * If longest is true, returns the longest match starting |
55 | // at the chosen start point. Otherwise returns the so-called |
56 | // left-biased match, the one traditional backtracking engines |
57 | // (like Perl and PCRE) find. |
58 | // Records submatch boundaries in submatch[1..nsubmatch-1]. |
59 | // Submatch[0] is the entire match. When there is a choice in |
60 | // which text matches each subexpression, the submatch boundaries |
61 | // are chosen to match what a backtracking implementation would choose. |
62 | bool Search(const StringPiece& text, const StringPiece& context, |
63 | bool anchored, bool longest, |
64 | StringPiece* submatch, int nsubmatch); |
65 | |
66 | private: |
67 | struct Thread { |
68 | union { |
69 | int ref; |
70 | Thread* next; // when on free list |
71 | }; |
72 | const char** capture; |
73 | }; |
74 | |
75 | // State for explicit stack in AddToThreadq. |
76 | struct AddState { |
77 | int id; // Inst to process |
78 | Thread* t; // if not null, set t0 = t before processing id |
79 | }; |
80 | |
81 | // Threadq is a list of threads. The list is sorted by the order |
82 | // in which Perl would explore that particular state -- the earlier |
83 | // choices appear earlier in the list. |
84 | typedef SparseArray<Thread*> Threadq; |
85 | |
86 | inline Thread* AllocThread(); |
87 | inline Thread* Incref(Thread* t); |
88 | inline void Decref(Thread* t); |
89 | |
90 | // Follows all empty arrows from id0 and enqueues all the states reached. |
91 | // Enqueues only the ByteRange instructions that match byte c. |
92 | // context is used (with p) for evaluating empty-width specials. |
93 | // p is the current input position, and t0 is the current thread. |
94 | void AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context, |
95 | const char* p, Thread* t0); |
96 | |
97 | // Run runq on byte c, appending new states to nextq. |
98 | // Updates matched_ and match_ as new, better matches are found. |
99 | // context is used (with p) for evaluating empty-width specials. |
100 | // p is the position of byte c in the input string for AddToThreadq; |
101 | // p-1 will be used when processing Match instructions. |
102 | // Frees all the threads on runq. |
103 | // If there is a shortcut to the end, returns that shortcut. |
104 | int Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context, |
105 | const char* p); |
106 | |
107 | // Returns text version of capture information, for debugging. |
108 | std::string FormatCapture(const char** capture); |
109 | |
110 | inline void CopyCapture(const char** dst, const char** src); |
111 | |
112 | Prog* prog_; // underlying program |
113 | int start_; // start instruction in program |
114 | int ncapture_; // number of submatches to track |
115 | bool longest_; // whether searching for longest match |
116 | bool endmatch_; // whether match must end at text.end() |
117 | const char* btext_; // beginning of text being matched (for FormatSubmatch) |
118 | const char* etext_; // end of text being matched (for endmatch_) |
119 | Threadq q0_, q1_; // pre-allocated for Search. |
120 | PODArray<AddState> stack_; // pre-allocated for AddToThreadq |
121 | Thread* free_threads_; // free list |
122 | const char** match_; // best match so far |
123 | bool matched_; // any match so far? |
124 | |
125 | NFA(const NFA&) = delete; |
126 | NFA& operator=(const NFA&) = delete; |
127 | }; |
128 | |
129 | NFA::NFA(Prog* prog) { |
130 | prog_ = prog; |
131 | start_ = prog_->start(); |
132 | ncapture_ = 0; |
133 | longest_ = false; |
134 | endmatch_ = false; |
135 | btext_ = NULL; |
136 | etext_ = NULL; |
137 | q0_.resize(prog_->size()); |
138 | q1_.resize(prog_->size()); |
139 | // See NFA::AddToThreadq() for why this is so. |
140 | int nstack = 2*prog_->inst_count(kInstCapture) + |
141 | prog_->inst_count(kInstEmptyWidth) + |
142 | prog_->inst_count(kInstNop) + 1; // + 1 for start inst |
143 | stack_ = PODArray<AddState>(nstack); |
144 | free_threads_ = NULL; |
145 | match_ = NULL; |
146 | matched_ = false; |
147 | } |
148 | |
149 | NFA::~NFA() { |
150 | delete[] match_; |
151 | Thread* next; |
152 | for (Thread* t = free_threads_; t; t = next) { |
153 | next = t->next; |
154 | delete[] t->capture; |
155 | delete t; |
156 | } |
157 | } |
158 | |
159 | NFA::Thread* NFA::AllocThread() { |
160 | Thread* t = free_threads_; |
161 | if (t == NULL) { |
162 | t = new Thread; |
163 | t->ref = 1; |
164 | t->capture = new const char*[ncapture_]; |
165 | return t; |
166 | } |
167 | free_threads_ = t->next; |
168 | t->ref = 1; |
169 | return t; |
170 | } |
171 | |
172 | NFA::Thread* NFA::Incref(Thread* t) { |
173 | DCHECK(t != NULL); |
174 | t->ref++; |
175 | return t; |
176 | } |
177 | |
178 | void NFA::Decref(Thread* t) { |
179 | if (t == NULL) |
180 | return; |
181 | t->ref--; |
182 | if (t->ref > 0) |
183 | return; |
184 | DCHECK_EQ(t->ref, 0); |
185 | t->next = free_threads_; |
186 | free_threads_ = t; |
187 | } |
188 | |
189 | void NFA::CopyCapture(const char** dst, const char** src) { |
190 | for (int i = 0; i < ncapture_; i+=2) { |
191 | dst[i] = src[i]; |
192 | dst[i+1] = src[i+1]; |
193 | } |
194 | } |
195 | |
196 | // Follows all empty arrows from id0 and enqueues all the states reached. |
197 | // Enqueues only the ByteRange instructions that match byte c. |
198 | // context is used (with p) for evaluating empty-width specials. |
199 | // p is the current input position, and t0 is the current thread. |
200 | void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context, |
201 | const char* p, Thread* t0) { |
202 | if (id0 == 0) |
203 | return; |
204 | |
205 | // Use stack_ to hold our stack of instructions yet to process. |
206 | // It was preallocated as follows: |
207 | // two entries per Capture; |
208 | // one entry per EmptyWidth; and |
209 | // one entry per Nop. |
210 | // This reflects the maximum number of stack pushes that each can |
211 | // perform. (Each instruction can be processed at most once.) |
212 | AddState* stk = stack_.data(); |
213 | int nstk = 0; |
214 | |
215 | stk[nstk++] = {id0, NULL}; |
216 | while (nstk > 0) { |
217 | DCHECK_LE(nstk, stack_.size()); |
218 | AddState a = stk[--nstk]; |
219 | |
220 | Loop: |
221 | if (a.t != NULL) { |
222 | // t0 was a thread that we allocated and copied in order to |
223 | // record the capture, so we must now decref it. |
224 | Decref(t0); |
225 | t0 = a.t; |
226 | } |
227 | |
228 | int id = a.id; |
229 | if (id == 0) |
230 | continue; |
231 | if (q->has_index(id)) { |
232 | if (ExtraDebug) |
233 | fprintf(stderr, " [%d%s]\n" , id, FormatCapture(t0->capture).c_str()); |
234 | continue; |
235 | } |
236 | |
237 | // Create entry in q no matter what. We might fill it in below, |
238 | // or we might not. Even if not, it is necessary to have it, |
239 | // so that we don't revisit id0 during the recursion. |
240 | q->set_new(id, NULL); |
241 | Thread** tp = &q->get_existing(id); |
242 | int j; |
243 | Thread* t; |
244 | Prog::Inst* ip = prog_->inst(id); |
245 | switch (ip->opcode()) { |
246 | default: |
247 | LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq" ; |
248 | break; |
249 | |
250 | case kInstFail: |
251 | break; |
252 | |
253 | case kInstAltMatch: |
254 | // Save state; will pick up at next byte. |
255 | t = Incref(t0); |
256 | *tp = t; |
257 | |
258 | DCHECK(!ip->last()); |
259 | a = {id+1, NULL}; |
260 | goto Loop; |
261 | |
262 | case kInstNop: |
263 | if (!ip->last()) |
264 | stk[nstk++] = {id+1, NULL}; |
265 | |
266 | // Continue on. |
267 | a = {ip->out(), NULL}; |
268 | goto Loop; |
269 | |
270 | case kInstCapture: |
271 | if (!ip->last()) |
272 | stk[nstk++] = {id+1, NULL}; |
273 | |
274 | if ((j=ip->cap()) < ncapture_) { |
275 | // Push a dummy whose only job is to restore t0 |
276 | // once we finish exploring this possibility. |
277 | stk[nstk++] = {0, t0}; |
278 | |
279 | // Record capture. |
280 | t = AllocThread(); |
281 | CopyCapture(t->capture, t0->capture); |
282 | t->capture[j] = p; |
283 | t0 = t; |
284 | } |
285 | a = {ip->out(), NULL}; |
286 | goto Loop; |
287 | |
288 | case kInstByteRange: |
289 | if (!ip->Matches(c)) |
290 | goto Next; |
291 | |
292 | // Save state; will pick up at next byte. |
293 | t = Incref(t0); |
294 | *tp = t; |
295 | if (ExtraDebug) |
296 | fprintf(stderr, " + %d%s\n" , id, FormatCapture(t0->capture).c_str()); |
297 | |
298 | if (ip->hint() == 0) |
299 | break; |
300 | a = {id+ip->hint(), NULL}; |
301 | goto Loop; |
302 | |
303 | case kInstMatch: |
304 | // Save state; will pick up at next byte. |
305 | t = Incref(t0); |
306 | *tp = t; |
307 | if (ExtraDebug) |
308 | fprintf(stderr, " ! %d%s\n" , id, FormatCapture(t0->capture).c_str()); |
309 | |
310 | Next: |
311 | if (ip->last()) |
312 | break; |
313 | a = {id+1, NULL}; |
314 | goto Loop; |
315 | |
316 | case kInstEmptyWidth: |
317 | if (!ip->last()) |
318 | stk[nstk++] = {id+1, NULL}; |
319 | |
320 | // Continue on if we have all the right flag bits. |
321 | if (ip->empty() & ~Prog::EmptyFlags(context, p)) |
322 | break; |
323 | a = {ip->out(), NULL}; |
324 | goto Loop; |
325 | } |
326 | } |
327 | } |
328 | |
329 | // Run runq on byte c, appending new states to nextq. |
330 | // Updates matched_ and match_ as new, better matches are found. |
331 | // context is used (with p) for evaluating empty-width specials. |
332 | // p is the position of byte c in the input string for AddToThreadq; |
333 | // p-1 will be used when processing Match instructions. |
334 | // Frees all the threads on runq. |
335 | // If there is a shortcut to the end, returns that shortcut. |
336 | int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context, |
337 | const char* p) { |
338 | nextq->clear(); |
339 | |
340 | for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { |
341 | Thread* t = i->value(); |
342 | if (t == NULL) |
343 | continue; |
344 | |
345 | if (longest_) { |
346 | // Can skip any threads started after our current best match. |
347 | if (matched_ && match_[0] < t->capture[0]) { |
348 | Decref(t); |
349 | continue; |
350 | } |
351 | } |
352 | |
353 | int id = i->index(); |
354 | Prog::Inst* ip = prog_->inst(id); |
355 | |
356 | switch (ip->opcode()) { |
357 | default: |
358 | // Should only see the values handled below. |
359 | LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step" ; |
360 | break; |
361 | |
362 | case kInstByteRange: |
363 | AddToThreadq(nextq, ip->out(), c, context, p, t); |
364 | break; |
365 | |
366 | case kInstAltMatch: |
367 | if (i != runq->begin()) |
368 | break; |
369 | // The match is ours if we want it. |
370 | if (ip->greedy(prog_) || longest_) { |
371 | CopyCapture(match_, t->capture); |
372 | matched_ = true; |
373 | |
374 | Decref(t); |
375 | for (++i; i != runq->end(); ++i) |
376 | Decref(i->value()); |
377 | runq->clear(); |
378 | if (ip->greedy(prog_)) |
379 | return ip->out1(); |
380 | return ip->out(); |
381 | } |
382 | break; |
383 | |
384 | case kInstMatch: { |
385 | // Avoid invoking undefined behavior (arithmetic on a null pointer) |
386 | // by storing p instead of p-1. (What would the latter even mean?!) |
387 | // This complements the special case in NFA::Search(). |
388 | if (p == NULL) { |
389 | CopyCapture(match_, t->capture); |
390 | match_[1] = p; |
391 | matched_ = true; |
392 | break; |
393 | } |
394 | |
395 | if (endmatch_ && p-1 != etext_) |
396 | break; |
397 | |
398 | if (longest_) { |
399 | // Leftmost-longest mode: save this match only if |
400 | // it is either farther to the left or at the same |
401 | // point but longer than an existing match. |
402 | if (!matched_ || t->capture[0] < match_[0] || |
403 | (t->capture[0] == match_[0] && p-1 > match_[1])) { |
404 | CopyCapture(match_, t->capture); |
405 | match_[1] = p-1; |
406 | matched_ = true; |
407 | } |
408 | } else { |
409 | // Leftmost-biased mode: this match is by definition |
410 | // better than what we've already found (see next line). |
411 | CopyCapture(match_, t->capture); |
412 | match_[1] = p-1; |
413 | matched_ = true; |
414 | |
415 | // Cut off the threads that can only find matches |
416 | // worse than the one we just found: don't run the |
417 | // rest of the current Threadq. |
418 | Decref(t); |
419 | for (++i; i != runq->end(); ++i) |
420 | Decref(i->value()); |
421 | runq->clear(); |
422 | return 0; |
423 | } |
424 | break; |
425 | } |
426 | } |
427 | Decref(t); |
428 | } |
429 | runq->clear(); |
430 | return 0; |
431 | } |
432 | |
433 | std::string NFA::FormatCapture(const char** capture) { |
434 | std::string s; |
435 | for (int i = 0; i < ncapture_; i+=2) { |
436 | if (capture[i] == NULL) |
437 | s += "(?,?)" ; |
438 | else if (capture[i+1] == NULL) |
439 | s += StringPrintf("(%td,?)" , |
440 | capture[i] - btext_); |
441 | else |
442 | s += StringPrintf("(%td,%td)" , |
443 | capture[i] - btext_, |
444 | capture[i+1] - btext_); |
445 | } |
446 | return s; |
447 | } |
448 | |
449 | bool NFA::Search(const StringPiece& text, const StringPiece& const_context, |
450 | bool anchored, bool longest, |
451 | StringPiece* submatch, int nsubmatch) { |
452 | if (start_ == 0) |
453 | return false; |
454 | |
455 | StringPiece context = const_context; |
456 | if (context.data() == NULL) |
457 | context = text; |
458 | |
459 | // Sanity check: make sure that text lies within context. |
460 | if (text.begin() < context.begin() || text.end() > context.end()) { |
461 | LOG(DFATAL) << "context does not contain text" ; |
462 | return false; |
463 | } |
464 | |
465 | if (prog_->anchor_start() && context.begin() != text.begin()) |
466 | return false; |
467 | if (prog_->anchor_end() && context.end() != text.end()) |
468 | return false; |
469 | anchored |= prog_->anchor_start(); |
470 | if (prog_->anchor_end()) { |
471 | longest = true; |
472 | endmatch_ = true; |
473 | } |
474 | |
475 | if (nsubmatch < 0) { |
476 | LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch; |
477 | return false; |
478 | } |
479 | |
480 | // Save search parameters. |
481 | ncapture_ = 2*nsubmatch; |
482 | longest_ = longest; |
483 | |
484 | if (nsubmatch == 0) { |
485 | // We need to maintain match[0], both to distinguish the |
486 | // longest match (if longest is true) and also to tell |
487 | // whether we've seen any matches at all. |
488 | ncapture_ = 2; |
489 | } |
490 | |
491 | match_ = new const char*[ncapture_]; |
492 | matched_ = false; |
493 | |
494 | // For debugging prints. |
495 | btext_ = context.data(); |
496 | // For convenience. |
497 | etext_ = text.data() + text.size(); |
498 | |
499 | if (ExtraDebug) |
500 | fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n" , |
501 | std::string(text).c_str(), std::string(context).c_str(), anchored, longest); |
502 | |
503 | // Set up search. |
504 | Threadq* runq = &q0_; |
505 | Threadq* nextq = &q1_; |
506 | runq->clear(); |
507 | nextq->clear(); |
508 | memset(&match_[0], 0, ncapture_*sizeof match_[0]); |
509 | |
510 | // Loop over the text, stepping the machine. |
511 | for (const char* p = text.data();; p++) { |
512 | if (ExtraDebug) { |
513 | int c = 0; |
514 | if (p == btext_) |
515 | c = '^'; |
516 | else if (p > etext_) |
517 | c = '$'; |
518 | else if (p < etext_) |
519 | c = p[0] & 0xFF; |
520 | |
521 | fprintf(stderr, "%c:" , c); |
522 | for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { |
523 | Thread* t = i->value(); |
524 | if (t == NULL) |
525 | continue; |
526 | fprintf(stderr, " %d%s" , i->index(), FormatCapture(t->capture).c_str()); |
527 | } |
528 | fprintf(stderr, "\n" ); |
529 | } |
530 | |
531 | // This is a no-op the first time around the loop because runq is empty. |
532 | int id = Step(runq, nextq, p < etext_ ? p[0] & 0xFF : -1, context, p); |
533 | DCHECK_EQ(runq->size(), 0); |
534 | using std::swap; |
535 | swap(nextq, runq); |
536 | nextq->clear(); |
537 | if (id != 0) { |
538 | // We're done: full match ahead. |
539 | p = etext_; |
540 | for (;;) { |
541 | Prog::Inst* ip = prog_->inst(id); |
542 | switch (ip->opcode()) { |
543 | default: |
544 | LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode(); |
545 | break; |
546 | |
547 | case kInstCapture: |
548 | if (ip->cap() < ncapture_) |
549 | match_[ip->cap()] = p; |
550 | id = ip->out(); |
551 | continue; |
552 | |
553 | case kInstNop: |
554 | id = ip->out(); |
555 | continue; |
556 | |
557 | case kInstMatch: |
558 | match_[1] = p; |
559 | matched_ = true; |
560 | break; |
561 | } |
562 | break; |
563 | } |
564 | break; |
565 | } |
566 | |
567 | if (p > etext_) |
568 | break; |
569 | |
570 | // Start a new thread if there have not been any matches. |
571 | // (No point in starting a new thread if there have been |
572 | // matches, since it would be to the right of the match |
573 | // we already found.) |
574 | if (!matched_ && (!anchored || p == text.data())) { |
575 | // If there's a required first byte for an unanchored search |
576 | // and we're not in the middle of any possible matches, |
577 | // use memchr to search for the byte quickly. |
578 | int fb = prog_->first_byte(); |
579 | if (!anchored && runq->size() == 0 && |
580 | fb >= 0 && p < etext_ && (p[0] & 0xFF) != fb) { |
581 | p = reinterpret_cast<const char*>(memchr(p, fb, etext_ - p)); |
582 | if (p == NULL) { |
583 | p = etext_; |
584 | } |
585 | } |
586 | |
587 | Thread* t = AllocThread(); |
588 | CopyCapture(t->capture, match_); |
589 | t->capture[0] = p; |
590 | AddToThreadq(runq, start_, p < etext_ ? p[0] & 0xFF : -1, context, p, |
591 | t); |
592 | Decref(t); |
593 | } |
594 | |
595 | // If all the threads have died, stop early. |
596 | if (runq->size() == 0) { |
597 | if (ExtraDebug) |
598 | fprintf(stderr, "dead\n" ); |
599 | break; |
600 | } |
601 | |
602 | // Avoid invoking undefined behavior (arithmetic on a null pointer) |
603 | // by simply not continuing the loop. |
604 | // This complements the special case in NFA::Step(). |
605 | if (p == NULL) { |
606 | (void)Step(runq, nextq, p < etext_ ? p[0] & 0xFF : -1, context, p); |
607 | DCHECK_EQ(runq->size(), 0); |
608 | using std::swap; |
609 | swap(nextq, runq); |
610 | nextq->clear(); |
611 | break; |
612 | } |
613 | } |
614 | |
615 | for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) |
616 | Decref(i->value()); |
617 | |
618 | if (matched_) { |
619 | for (int i = 0; i < nsubmatch; i++) |
620 | submatch[i] = |
621 | StringPiece(match_[2 * i], |
622 | static_cast<size_t>(match_[2 * i + 1] - match_[2 * i])); |
623 | if (ExtraDebug) |
624 | fprintf(stderr, "match (%td,%td)\n" , |
625 | match_[0] - btext_, |
626 | match_[1] - btext_); |
627 | return true; |
628 | } |
629 | return false; |
630 | } |
631 | |
632 | // Computes whether all successful matches have a common first byte, |
633 | // and if so, returns that byte. If not, returns -1. |
634 | int Prog::ComputeFirstByte() { |
635 | int b = -1; |
636 | SparseSet q(size()); |
637 | q.insert(start()); |
638 | for (SparseSet::iterator it = q.begin(); it != q.end(); ++it) { |
639 | int id = *it; |
640 | Prog::Inst* ip = inst(id); |
641 | switch (ip->opcode()) { |
642 | default: |
643 | LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte" ; |
644 | break; |
645 | |
646 | case kInstMatch: |
647 | // The empty string matches: no first byte. |
648 | return -1; |
649 | |
650 | case kInstByteRange: |
651 | if (!ip->last()) |
652 | q.insert(id+1); |
653 | |
654 | // Must match only a single byte |
655 | if (ip->lo() != ip->hi()) |
656 | return -1; |
657 | if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z') |
658 | return -1; |
659 | // If we haven't seen any bytes yet, record it; |
660 | // otherwise must match the one we saw before. |
661 | if (b == -1) |
662 | b = ip->lo(); |
663 | else if (b != ip->lo()) |
664 | return -1; |
665 | break; |
666 | |
667 | case kInstNop: |
668 | case kInstCapture: |
669 | case kInstEmptyWidth: |
670 | if (!ip->last()) |
671 | q.insert(id+1); |
672 | |
673 | // Continue on. |
674 | // Ignore ip->empty() flags for kInstEmptyWidth |
675 | // in order to be as conservative as possible |
676 | // (assume all possible empty-width flags are true). |
677 | if (ip->out()) |
678 | q.insert(ip->out()); |
679 | break; |
680 | |
681 | case kInstAltMatch: |
682 | DCHECK(!ip->last()); |
683 | q.insert(id+1); |
684 | break; |
685 | |
686 | case kInstFail: |
687 | break; |
688 | } |
689 | } |
690 | return b; |
691 | } |
692 | |
693 | bool |
694 | Prog::SearchNFA(const StringPiece& text, const StringPiece& context, |
695 | Anchor anchor, MatchKind kind, |
696 | StringPiece* match, int nmatch) { |
697 | if (ExtraDebug) |
698 | Dump(); |
699 | |
700 | NFA nfa(this); |
701 | StringPiece sp; |
702 | if (kind == kFullMatch) { |
703 | anchor = kAnchored; |
704 | if (nmatch == 0) { |
705 | match = &sp; |
706 | nmatch = 1; |
707 | } |
708 | } |
709 | if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch)) |
710 | return false; |
711 | if (kind == kFullMatch && match[0].end() != text.end()) |
712 | return false; |
713 | return true; |
714 | } |
715 | |
716 | // For each instruction i in the program reachable from the start, compute the |
717 | // number of instructions reachable from i by following only empty transitions |
718 | // and record that count as fanout[i]. |
719 | // |
720 | // fanout holds the results and is also the work queue for the outer iteration. |
721 | // reachable holds the reached nodes for the inner iteration. |
722 | void Prog::Fanout(SparseArray<int>* fanout) { |
723 | DCHECK_EQ(fanout->max_size(), size()); |
724 | SparseSet reachable(size()); |
725 | fanout->clear(); |
726 | fanout->set_new(start(), 0); |
727 | for (SparseArray<int>::iterator i = fanout->begin(); i != fanout->end(); ++i) { |
728 | int* count = &i->value(); |
729 | reachable.clear(); |
730 | reachable.insert(i->index()); |
731 | for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) { |
732 | int id = *j; |
733 | Prog::Inst* ip = inst(id); |
734 | switch (ip->opcode()) { |
735 | default: |
736 | LOG(DFATAL) << "unhandled " << ip->opcode() << " in Prog::Fanout()" ; |
737 | break; |
738 | |
739 | case kInstByteRange: |
740 | if (!ip->last()) |
741 | reachable.insert(id+1); |
742 | |
743 | (*count)++; |
744 | if (!fanout->has_index(ip->out())) { |
745 | fanout->set_new(ip->out(), 0); |
746 | } |
747 | break; |
748 | |
749 | case kInstAltMatch: |
750 | DCHECK(!ip->last()); |
751 | reachable.insert(id+1); |
752 | break; |
753 | |
754 | case kInstCapture: |
755 | case kInstEmptyWidth: |
756 | case kInstNop: |
757 | if (!ip->last()) |
758 | reachable.insert(id+1); |
759 | |
760 | reachable.insert(ip->out()); |
761 | break; |
762 | |
763 | case kInstMatch: |
764 | if (!ip->last()) |
765 | reachable.insert(id+1); |
766 | break; |
767 | |
768 | case kInstFail: |
769 | break; |
770 | } |
771 | } |
772 | } |
773 | } |
774 | |
775 | } // namespace re2 |
776 | |