1 | // Copyright 2008 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | // A DFA (deterministic finite automaton)-based regular expression search. |
6 | // |
7 | // The DFA search has two main parts: the construction of the automaton, |
8 | // which is represented by a graph of State structures, and the execution |
9 | // of the automaton over a given input string. |
10 | // |
11 | // The basic idea is that the State graph is constructed so that the |
12 | // execution can simply start with a state s, and then for each byte c in |
13 | // the input string, execute "s = s->next[c]", checking at each point whether |
14 | // the current s represents a matching state. |
15 | // |
16 | // The simple explanation just given does convey the essence of this code, |
17 | // but it omits the details of how the State graph gets constructed as well |
18 | // as some performance-driven optimizations to the execution of the automaton. |
19 | // All these details are explained in the comments for the code following |
20 | // the definition of class DFA. |
21 | // |
22 | // See http://swtch.com/~rsc/regexp/ for a very bare-bones equivalent. |
23 | |
24 | #include <stddef.h> |
25 | #include <stdint.h> |
26 | #include <stdio.h> |
27 | #include <string.h> |
28 | #include <algorithm> |
29 | #include <atomic> |
30 | #include <deque> |
31 | #include <mutex> |
32 | #include <new> |
33 | #include <string> |
34 | #include <unordered_map> |
35 | #include <unordered_set> |
36 | #include <utility> |
37 | #include <vector> |
38 | |
39 | #include "util/logging.h" |
40 | #include "util/mix.h" |
41 | #include "util/mutex.h" |
42 | #include "util/strutil.h" |
43 | #include "re2/pod_array.h" |
44 | #include "re2/prog.h" |
45 | #include "re2/sparse_set.h" |
46 | #include "re2/stringpiece.h" |
47 | |
48 | // Silence "zero-sized array in struct/union" warning for DFA::State::next_. |
49 | #ifdef _MSC_VER |
50 | #pragma warning(disable: 4200) |
51 | #endif |
52 | |
53 | namespace re2 { |
54 | |
55 | #if !defined(__linux__) /* only Linux seems to have memrchr */ |
56 | static void* memrchr(const void* s, int c, size_t n) { |
57 | const unsigned char* p = (const unsigned char*)s; |
58 | for (p += n; n > 0; n--) |
59 | if (*--p == c) |
60 | return (void*)p; |
61 | |
62 | return NULL; |
63 | } |
64 | #endif |
65 | |
66 | // Controls whether the DFA should bail out early if the NFA would be faster. |
67 | static bool dfa_should_bail_when_slow = true; |
68 | |
69 | // Changing this to true compiles in prints that trace execution of the DFA. |
70 | // Generates a lot of output -- only useful for debugging. |
71 | static const bool = false; |
72 | |
73 | // A DFA implementation of a regular expression program. |
74 | // Since this is entirely a forward declaration mandated by C++, |
75 | // some of the comments here are better understood after reading |
76 | // the comments in the sections that follow the DFA definition. |
77 | class DFA { |
78 | public: |
79 | DFA(Prog* prog, Prog::MatchKind kind, int64_t max_mem); |
80 | ~DFA(); |
81 | bool ok() const { return !init_failed_; } |
82 | Prog::MatchKind kind() { return kind_; } |
83 | |
84 | // Searches for the regular expression in text, which is considered |
85 | // as a subsection of context for the purposes of interpreting flags |
86 | // like ^ and $ and \A and \z. |
87 | // Returns whether a match was found. |
88 | // If a match is found, sets *ep to the end point of the best match in text. |
89 | // If "anchored", the match must begin at the start of text. |
90 | // If "want_earliest_match", the match that ends first is used, not |
91 | // necessarily the best one. |
92 | // If "run_forward" is true, the DFA runs from text.begin() to text.end(). |
93 | // If it is false, the DFA runs from text.end() to text.begin(), |
94 | // returning the leftmost end of the match instead of the rightmost one. |
95 | // If the DFA cannot complete the search (for example, if it is out of |
96 | // memory), it sets *failed and returns false. |
97 | bool Search(const StringPiece& text, const StringPiece& context, |
98 | bool anchored, bool want_earliest_match, bool run_forward, |
99 | bool* failed, const char** ep, SparseSet* matches); |
100 | |
101 | // Builds out all states for the entire DFA. |
102 | // If cb is not empty, it receives one callback per state built. |
103 | // Returns the number of states built. |
104 | // FOR TESTING OR EXPERIMENTAL PURPOSES ONLY. |
105 | int BuildAllStates(const Prog::DFAStateCallback& cb); |
106 | |
107 | // Computes min and max for matching strings. Won't return strings |
108 | // bigger than maxlen. |
109 | bool PossibleMatchRange(std::string* min, std::string* max, int maxlen); |
110 | |
111 | // These data structures are logically private, but C++ makes it too |
112 | // difficult to mark them as such. |
113 | class RWLocker; |
114 | class StateSaver; |
115 | class Workq; |
116 | |
117 | // A single DFA state. The DFA is represented as a graph of these |
118 | // States, linked by the next_ pointers. If in state s and reading |
119 | // byte c, the next state should be s->next_[c]. |
120 | struct State { |
121 | inline bool IsMatch() const { return (flag_ & kFlagMatch) != 0; } |
122 | |
123 | int* inst_; // Instruction pointers in the state. |
124 | int ninst_; // # of inst_ pointers. |
125 | uint32_t flag_; // Empty string bitfield flags in effect on the way |
126 | // into this state, along with kFlagMatch if this |
127 | // is a matching state. |
128 | |
129 | // Work around the bug affecting flexible array members in GCC 6.x (for x >= 1). |
130 | // (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70932) |
131 | #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 6 && __GNUC_MINOR__ >= 1 |
132 | std::atomic<State*> next_[0]; // Outgoing arrows from State, |
133 | #else |
134 | std::atomic<State*> next_[]; // Outgoing arrows from State, |
135 | #endif |
136 | |
137 | // one per input byte class |
138 | }; |
139 | |
140 | enum { |
141 | kByteEndText = 256, // imaginary byte at end of text |
142 | |
143 | kFlagEmptyMask = 0xFF, // State.flag_: bits holding kEmptyXXX flags |
144 | kFlagMatch = 0x0100, // State.flag_: this is a matching state |
145 | kFlagLastWord = 0x0200, // State.flag_: last byte was a word char |
146 | kFlagNeedShift = 16, // needed kEmpty bits are or'ed in shifted left |
147 | }; |
148 | |
149 | struct StateHash { |
150 | size_t operator()(const State* a) const { |
151 | DCHECK(a != NULL); |
152 | HashMix mix(a->flag_); |
153 | for (int i = 0; i < a->ninst_; i++) |
154 | mix.Mix(a->inst_[i]); |
155 | mix.Mix(0); |
156 | return mix.get(); |
157 | } |
158 | }; |
159 | |
160 | struct StateEqual { |
161 | bool operator()(const State* a, const State* b) const { |
162 | DCHECK(a != NULL); |
163 | DCHECK(b != NULL); |
164 | if (a == b) |
165 | return true; |
166 | if (a->flag_ != b->flag_) |
167 | return false; |
168 | if (a->ninst_ != b->ninst_) |
169 | return false; |
170 | for (int i = 0; i < a->ninst_; i++) |
171 | if (a->inst_[i] != b->inst_[i]) |
172 | return false; |
173 | return true; |
174 | } |
175 | }; |
176 | |
177 | typedef std::unordered_set<State*, StateHash, StateEqual> StateSet; |
178 | |
179 | private: |
180 | // Special "first_byte" values for a state. (Values >= 0 denote actual bytes.) |
181 | enum { |
182 | kFbUnknown = -1, // No analysis has been performed. |
183 | kFbNone = -2, // The first-byte trick cannot be used. |
184 | }; |
185 | |
186 | enum { |
187 | // Indices into start_ for unanchored searches. |
188 | // Add kStartAnchored for anchored searches. |
189 | kStartBeginText = 0, // text at beginning of context |
190 | kStartBeginLine = 2, // text at beginning of line |
191 | kStartAfterWordChar = 4, // text follows a word character |
192 | kStartAfterNonWordChar = 6, // text follows non-word character |
193 | kMaxStart = 8, |
194 | |
195 | kStartAnchored = 1, |
196 | }; |
197 | |
198 | // Resets the DFA State cache, flushing all saved State* information. |
199 | // Releases and reacquires cache_mutex_ via cache_lock, so any |
200 | // State* existing before the call are not valid after the call. |
201 | // Use a StateSaver to preserve important states across the call. |
202 | // cache_mutex_.r <= L < mutex_ |
203 | // After: cache_mutex_.w <= L < mutex_ |
204 | void ResetCache(RWLocker* cache_lock); |
205 | |
206 | // Looks up and returns the State corresponding to a Workq. |
207 | // L >= mutex_ |
208 | State* WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag); |
209 | |
210 | // Looks up and returns a State matching the inst, ninst, and flag. |
211 | // L >= mutex_ |
212 | State* CachedState(int* inst, int ninst, uint32_t flag); |
213 | |
214 | // Clear the cache entirely. |
215 | // Must hold cache_mutex_.w or be in destructor. |
216 | void ClearCache(); |
217 | |
218 | // Converts a State into a Workq: the opposite of WorkqToCachedState. |
219 | // L >= mutex_ |
220 | void StateToWorkq(State* s, Workq* q); |
221 | |
222 | // Runs a State on a given byte, returning the next state. |
223 | State* RunStateOnByteUnlocked(State*, int); // cache_mutex_.r <= L < mutex_ |
224 | State* RunStateOnByte(State*, int); // L >= mutex_ |
225 | |
226 | // Runs a Workq on a given byte followed by a set of empty-string flags, |
227 | // producing a new Workq in nq. If a match instruction is encountered, |
228 | // sets *ismatch to true. |
229 | // L >= mutex_ |
230 | void RunWorkqOnByte(Workq* q, Workq* nq, |
231 | int c, uint32_t flag, bool* ismatch); |
232 | |
233 | // Runs a Workq on a set of empty-string flags, producing a new Workq in nq. |
234 | // L >= mutex_ |
235 | void RunWorkqOnEmptyString(Workq* q, Workq* nq, uint32_t flag); |
236 | |
237 | // Adds the instruction id to the Workq, following empty arrows |
238 | // according to flag. |
239 | // L >= mutex_ |
240 | void AddToQueue(Workq* q, int id, uint32_t flag); |
241 | |
242 | // For debugging, returns a text representation of State. |
243 | static std::string DumpState(State* state); |
244 | |
245 | // For debugging, returns a text representation of a Workq. |
246 | static std::string DumpWorkq(Workq* q); |
247 | |
248 | // Search parameters |
249 | struct SearchParams { |
250 | SearchParams(const StringPiece& text, const StringPiece& context, |
251 | RWLocker* cache_lock) |
252 | : text(text), context(context), |
253 | anchored(false), |
254 | want_earliest_match(false), |
255 | run_forward(false), |
256 | start(NULL), |
257 | first_byte(kFbUnknown), |
258 | cache_lock(cache_lock), |
259 | failed(false), |
260 | ep(NULL), |
261 | matches(NULL) { } |
262 | |
263 | StringPiece text; |
264 | StringPiece context; |
265 | bool anchored; |
266 | bool want_earliest_match; |
267 | bool run_forward; |
268 | State* start; |
269 | int first_byte; |
270 | RWLocker *cache_lock; |
271 | bool failed; // "out" parameter: whether search gave up |
272 | const char* ep; // "out" parameter: end pointer for match |
273 | SparseSet* matches; |
274 | |
275 | private: |
276 | SearchParams(const SearchParams&) = delete; |
277 | SearchParams& operator=(const SearchParams&) = delete; |
278 | }; |
279 | |
280 | // Before each search, the parameters to Search are analyzed by |
281 | // AnalyzeSearch to determine the state in which to start and the |
282 | // "first_byte" for that state, if any. |
283 | struct StartInfo { |
284 | StartInfo() : start(NULL), first_byte(kFbUnknown) {} |
285 | State* start; |
286 | std::atomic<int> first_byte; |
287 | }; |
288 | |
289 | // Fills in params->start and params->first_byte using |
290 | // the other search parameters. Returns true on success, |
291 | // false on failure. |
292 | // cache_mutex_.r <= L < mutex_ |
293 | bool AnalyzeSearch(SearchParams* params); |
294 | bool AnalyzeSearchHelper(SearchParams* params, StartInfo* info, |
295 | uint32_t flags); |
296 | |
297 | // The generic search loop, inlined to create specialized versions. |
298 | // cache_mutex_.r <= L < mutex_ |
299 | // Might unlock and relock cache_mutex_ via params->cache_lock. |
300 | inline bool InlinedSearchLoop(SearchParams* params, |
301 | bool have_first_byte, |
302 | bool want_earliest_match, |
303 | bool run_forward); |
304 | |
305 | // The specialized versions of InlinedSearchLoop. The three letters |
306 | // at the ends of the name denote the true/false values used as the |
307 | // last three parameters of InlinedSearchLoop. |
308 | // cache_mutex_.r <= L < mutex_ |
309 | // Might unlock and relock cache_mutex_ via params->cache_lock. |
310 | bool SearchFFF(SearchParams* params); |
311 | bool SearchFFT(SearchParams* params); |
312 | bool SearchFTF(SearchParams* params); |
313 | bool SearchFTT(SearchParams* params); |
314 | bool SearchTFF(SearchParams* params); |
315 | bool SearchTFT(SearchParams* params); |
316 | bool SearchTTF(SearchParams* params); |
317 | bool SearchTTT(SearchParams* params); |
318 | |
319 | // The main search loop: calls an appropriate specialized version of |
320 | // InlinedSearchLoop. |
321 | // cache_mutex_.r <= L < mutex_ |
322 | // Might unlock and relock cache_mutex_ via params->cache_lock. |
323 | bool FastSearchLoop(SearchParams* params); |
324 | |
325 | // For debugging, a slow search loop that calls InlinedSearchLoop |
326 | // directly -- because the booleans passed are not constants, the |
327 | // loop is not specialized like the SearchFFF etc. versions, so it |
328 | // runs much more slowly. Useful only for debugging. |
329 | // cache_mutex_.r <= L < mutex_ |
330 | // Might unlock and relock cache_mutex_ via params->cache_lock. |
331 | bool SlowSearchLoop(SearchParams* params); |
332 | |
333 | // Looks up bytes in bytemap_ but handles case c == kByteEndText too. |
334 | int ByteMap(int c) { |
335 | if (c == kByteEndText) |
336 | return prog_->bytemap_range(); |
337 | return prog_->bytemap()[c]; |
338 | } |
339 | |
340 | // Constant after initialization. |
341 | Prog* prog_; // The regular expression program to run. |
342 | Prog::MatchKind kind_; // The kind of DFA. |
343 | bool init_failed_; // initialization failed (out of memory) |
344 | |
345 | Mutex mutex_; // mutex_ >= cache_mutex_.r |
346 | |
347 | // Scratch areas, protected by mutex_. |
348 | Workq* q0_; // Two pre-allocated work queues. |
349 | Workq* q1_; |
350 | PODArray<int> stack_; // Pre-allocated stack for AddToQueue |
351 | |
352 | // State* cache. Many threads use and add to the cache simultaneously, |
353 | // holding cache_mutex_ for reading and mutex_ (above) when adding. |
354 | // If the cache fills and needs to be discarded, the discarding is done |
355 | // while holding cache_mutex_ for writing, to avoid interrupting other |
356 | // readers. Any State* pointers are only valid while cache_mutex_ |
357 | // is held. |
358 | Mutex cache_mutex_; |
359 | int64_t mem_budget_; // Total memory budget for all States. |
360 | int64_t state_budget_; // Amount of memory remaining for new States. |
361 | StateSet state_cache_; // All States computed so far. |
362 | StartInfo start_[kMaxStart]; |
363 | }; |
364 | |
365 | // Shorthand for casting to uint8_t*. |
366 | static inline const uint8_t* BytePtr(const void* v) { |
367 | return reinterpret_cast<const uint8_t*>(v); |
368 | } |
369 | |
370 | // Work queues |
371 | |
372 | // Marks separate thread groups of different priority |
373 | // in the work queue when in leftmost-longest matching mode. |
374 | #define Mark (-1) |
375 | |
376 | // Separates the match IDs from the instructions in inst_. |
377 | // Used only for "many match" DFA states. |
378 | #define MatchSep (-2) |
379 | |
380 | // Internally, the DFA uses a sparse array of |
381 | // program instruction pointers as a work queue. |
382 | // In leftmost longest mode, marks separate sections |
383 | // of workq that started executing at different |
384 | // locations in the string (earlier locations first). |
385 | class DFA::Workq : public SparseSet { |
386 | public: |
387 | // Constructor: n is number of normal slots, maxmark number of mark slots. |
388 | Workq(int n, int maxmark) : |
389 | SparseSet(n+maxmark), |
390 | n_(n), |
391 | maxmark_(maxmark), |
392 | nextmark_(n), |
393 | last_was_mark_(true) { |
394 | } |
395 | |
396 | bool is_mark(int i) { return i >= n_; } |
397 | |
398 | int maxmark() { return maxmark_; } |
399 | |
400 | void clear() { |
401 | SparseSet::clear(); |
402 | nextmark_ = n_; |
403 | } |
404 | |
405 | void mark() { |
406 | if (last_was_mark_) |
407 | return; |
408 | last_was_mark_ = false; |
409 | SparseSet::insert_new(nextmark_++); |
410 | } |
411 | |
412 | int size() { |
413 | return n_ + maxmark_; |
414 | } |
415 | |
416 | void insert(int id) { |
417 | if (contains(id)) |
418 | return; |
419 | insert_new(id); |
420 | } |
421 | |
422 | void insert_new(int id) { |
423 | last_was_mark_ = false; |
424 | SparseSet::insert_new(id); |
425 | } |
426 | |
427 | private: |
428 | int n_; // size excluding marks |
429 | int maxmark_; // maximum number of marks |
430 | int nextmark_; // id of next mark |
431 | bool last_was_mark_; // last inserted was mark |
432 | |
433 | Workq(const Workq&) = delete; |
434 | Workq& operator=(const Workq&) = delete; |
435 | }; |
436 | |
437 | DFA::DFA(Prog* prog, Prog::MatchKind kind, int64_t max_mem) |
438 | : prog_(prog), |
439 | kind_(kind), |
440 | init_failed_(false), |
441 | q0_(NULL), |
442 | q1_(NULL), |
443 | mem_budget_(max_mem) { |
444 | if (ExtraDebug) |
445 | fprintf(stderr, "\nkind %d\n%s\n" , kind_, prog_->DumpUnanchored().c_str()); |
446 | int nmark = 0; |
447 | if (kind_ == Prog::kLongestMatch) |
448 | nmark = prog_->size(); |
449 | // See DFA::AddToQueue() for why this is so. |
450 | int nstack = prog_->inst_count(kInstCapture) + |
451 | prog_->inst_count(kInstEmptyWidth) + |
452 | prog_->inst_count(kInstNop) + |
453 | nmark + 1; // + 1 for start inst |
454 | |
455 | // Account for space needed for DFA, q0, q1, stack. |
456 | mem_budget_ -= sizeof(DFA); |
457 | mem_budget_ -= (prog_->size() + nmark) * |
458 | (sizeof(int)+sizeof(int)) * 2; // q0, q1 |
459 | mem_budget_ -= nstack * sizeof(int); // stack |
460 | if (mem_budget_ < 0) { |
461 | init_failed_ = true; |
462 | return; |
463 | } |
464 | |
465 | state_budget_ = mem_budget_; |
466 | |
467 | // Make sure there is a reasonable amount of working room left. |
468 | // At minimum, the search requires room for two states in order |
469 | // to limp along, restarting frequently. We'll get better performance |
470 | // if there is room for a larger number of states, say 20. |
471 | // Note that a state stores list heads only, so we use the program |
472 | // list count for the upper bound, not the program size. |
473 | int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot |
474 | int64_t one_state = sizeof(State) + nnext*sizeof(std::atomic<State*>) + |
475 | (prog_->list_count()+nmark)*sizeof(int); |
476 | if (state_budget_ < 20*one_state) { |
477 | init_failed_ = true; |
478 | return; |
479 | } |
480 | |
481 | q0_ = new Workq(prog_->size(), nmark); |
482 | q1_ = new Workq(prog_->size(), nmark); |
483 | stack_ = PODArray<int>(nstack); |
484 | } |
485 | |
486 | DFA::~DFA() { |
487 | delete q0_; |
488 | delete q1_; |
489 | ClearCache(); |
490 | } |
491 | |
492 | // In the DFA state graph, s->next[c] == NULL means that the |
493 | // state has not yet been computed and needs to be. We need |
494 | // a different special value to signal that s->next[c] is a |
495 | // state that can never lead to a match (and thus the search |
496 | // can be called off). Hence DeadState. |
497 | #define DeadState reinterpret_cast<State*>(1) |
498 | |
499 | // Signals that the rest of the string matches no matter what it is. |
500 | #define FullMatchState reinterpret_cast<State*>(2) |
501 | |
502 | #define SpecialStateMax FullMatchState |
503 | |
504 | // Debugging printouts |
505 | |
506 | // For debugging, returns a string representation of the work queue. |
507 | std::string DFA::DumpWorkq(Workq* q) { |
508 | std::string s; |
509 | const char* sep = "" ; |
510 | for (Workq::iterator it = q->begin(); it != q->end(); ++it) { |
511 | if (q->is_mark(*it)) { |
512 | s += "|" ; |
513 | sep = "" ; |
514 | } else { |
515 | s += StringPrintf("%s%d" , sep, *it); |
516 | sep = "," ; |
517 | } |
518 | } |
519 | return s; |
520 | } |
521 | |
522 | // For debugging, returns a string representation of the state. |
523 | std::string DFA::DumpState(State* state) { |
524 | if (state == NULL) |
525 | return "_" ; |
526 | if (state == DeadState) |
527 | return "X" ; |
528 | if (state == FullMatchState) |
529 | return "*" ; |
530 | std::string s; |
531 | const char* sep = "" ; |
532 | s += StringPrintf("(%p)" , state); |
533 | for (int i = 0; i < state->ninst_; i++) { |
534 | if (state->inst_[i] == Mark) { |
535 | s += "|" ; |
536 | sep = "" ; |
537 | } else if (state->inst_[i] == MatchSep) { |
538 | s += "||" ; |
539 | sep = "" ; |
540 | } else { |
541 | s += StringPrintf("%s%d" , sep, state->inst_[i]); |
542 | sep = "," ; |
543 | } |
544 | } |
545 | s += StringPrintf(" flag=%#x" , state->flag_); |
546 | return s; |
547 | } |
548 | |
549 | ////////////////////////////////////////////////////////////////////// |
550 | // |
551 | // DFA state graph construction. |
552 | // |
553 | // The DFA state graph is a heavily-linked collection of State* structures. |
554 | // The state_cache_ is a set of all the State structures ever allocated, |
555 | // so that if the same state is reached by two different paths, |
556 | // the same State structure can be used. This reduces allocation |
557 | // requirements and also avoids duplication of effort across the two |
558 | // identical states. |
559 | // |
560 | // A State is defined by an ordered list of instruction ids and a flag word. |
561 | // |
562 | // The choice of an ordered list of instructions differs from a typical |
563 | // textbook DFA implementation, which would use an unordered set. |
564 | // Textbook descriptions, however, only care about whether |
565 | // the DFA matches, not where it matches in the text. To decide where the |
566 | // DFA matches, we need to mimic the behavior of the dominant backtracking |
567 | // implementations like PCRE, which try one possible regular expression |
568 | // execution, then another, then another, stopping when one of them succeeds. |
569 | // The DFA execution tries these many executions in parallel, representing |
570 | // each by an instruction id. These pointers are ordered in the State.inst_ |
571 | // list in the same order that the executions would happen in a backtracking |
572 | // search: if a match is found during execution of inst_[2], inst_[i] for i>=3 |
573 | // can be discarded. |
574 | // |
575 | // Textbooks also typically do not consider context-aware empty string operators |
576 | // like ^ or $. These are handled by the flag word, which specifies the set |
577 | // of empty-string operators that should be matched when executing at the |
578 | // current text position. These flag bits are defined in prog.h. |
579 | // The flag word also contains two DFA-specific bits: kFlagMatch if the state |
580 | // is a matching state (one that reached a kInstMatch in the program) |
581 | // and kFlagLastWord if the last processed byte was a word character, for the |
582 | // implementation of \B and \b. |
583 | // |
584 | // The flag word also contains, shifted up 16 bits, the bits looked for by |
585 | // any kInstEmptyWidth instructions in the state. These provide a useful |
586 | // summary indicating when new flags might be useful. |
587 | // |
588 | // The permanent representation of a State's instruction ids is just an array, |
589 | // but while a state is being analyzed, these instruction ids are represented |
590 | // as a Workq, which is an array that allows iteration in insertion order. |
591 | |
592 | // NOTE(rsc): The choice of State construction determines whether the DFA |
593 | // mimics backtracking implementations (so-called leftmost first matching) or |
594 | // traditional DFA implementations (so-called leftmost longest matching as |
595 | // prescribed by POSIX). This implementation chooses to mimic the |
596 | // backtracking implementations, because we want to replace PCRE. To get |
597 | // POSIX behavior, the states would need to be considered not as a simple |
598 | // ordered list of instruction ids, but as a list of unordered sets of instruction |
599 | // ids. A match by a state in one set would inhibit the running of sets |
600 | // farther down the list but not other instruction ids in the same set. Each |
601 | // set would correspond to matches beginning at a given point in the string. |
602 | // This is implemented by separating different sets with Mark pointers. |
603 | |
604 | // Looks in the State cache for a State matching q, flag. |
605 | // If one is found, returns it. If one is not found, allocates one, |
606 | // inserts it in the cache, and returns it. |
607 | // If mq is not null, MatchSep and the match IDs in mq will be appended |
608 | // to the State. |
609 | DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { |
610 | //mutex_.AssertHeld(); |
611 | |
612 | // Construct array of instruction ids for the new state. |
613 | // Only ByteRange, EmptyWidth, and Match instructions are useful to keep: |
614 | // those are the only operators with any effect in |
615 | // RunWorkqOnEmptyString or RunWorkqOnByte. |
616 | int* inst = new int[q->size()]; |
617 | int n = 0; |
618 | uint32_t needflags = 0; // flags needed by kInstEmptyWidth instructions |
619 | bool sawmatch = false; // whether queue contains guaranteed kInstMatch |
620 | bool sawmark = false; // whether queue contains a Mark |
621 | if (ExtraDebug) |
622 | fprintf(stderr, "WorkqToCachedState %s [%#x]" , DumpWorkq(q).c_str(), flag); |
623 | for (Workq::iterator it = q->begin(); it != q->end(); ++it) { |
624 | int id = *it; |
625 | if (sawmatch && (kind_ == Prog::kFirstMatch || q->is_mark(id))) |
626 | break; |
627 | if (q->is_mark(id)) { |
628 | if (n > 0 && inst[n-1] != Mark) { |
629 | sawmark = true; |
630 | inst[n++] = Mark; |
631 | } |
632 | continue; |
633 | } |
634 | Prog::Inst* ip = prog_->inst(id); |
635 | switch (ip->opcode()) { |
636 | case kInstAltMatch: |
637 | // This state will continue to a match no matter what |
638 | // the rest of the input is. If it is the highest priority match |
639 | // being considered, return the special FullMatchState |
640 | // to indicate that it's all matches from here out. |
641 | if (kind_ != Prog::kManyMatch && |
642 | (kind_ != Prog::kFirstMatch || |
643 | (it == q->begin() && ip->greedy(prog_))) && |
644 | (kind_ != Prog::kLongestMatch || !sawmark) && |
645 | (flag & kFlagMatch)) { |
646 | delete[] inst; |
647 | if (ExtraDebug) |
648 | fprintf(stderr, " -> FullMatchState\n" ); |
649 | return FullMatchState; |
650 | } |
651 | FALLTHROUGH_INTENDED; |
652 | default: |
653 | // Record iff id is the head of its list, which must |
654 | // be the case if id-1 is the last of *its* list. :) |
655 | if (prog_->inst(id-1)->last()) |
656 | inst[n++] = *it; |
657 | if (ip->opcode() == kInstEmptyWidth) |
658 | needflags |= ip->empty(); |
659 | if (ip->opcode() == kInstMatch && !prog_->anchor_end()) |
660 | sawmatch = true; |
661 | break; |
662 | } |
663 | } |
664 | DCHECK_LE(n, q->size()); |
665 | if (n > 0 && inst[n-1] == Mark) |
666 | n--; |
667 | |
668 | // If there are no empty-width instructions waiting to execute, |
669 | // then the extra flag bits will not be used, so there is no |
670 | // point in saving them. (Discarding them reduces the number |
671 | // of distinct states.) |
672 | if (needflags == 0) |
673 | flag &= kFlagMatch; |
674 | |
675 | // NOTE(rsc): The code above cannot do flag &= needflags, |
676 | // because if the right flags were present to pass the current |
677 | // kInstEmptyWidth instructions, new kInstEmptyWidth instructions |
678 | // might be reached that in turn need different flags. |
679 | // The only sure thing is that if there are no kInstEmptyWidth |
680 | // instructions at all, no flags will be needed. |
681 | // We could do the extra work to figure out the full set of |
682 | // possibly needed flags by exploring past the kInstEmptyWidth |
683 | // instructions, but the check above -- are any flags needed |
684 | // at all? -- handles the most common case. More fine-grained |
685 | // analysis can only be justified by measurements showing that |
686 | // too many redundant states are being allocated. |
687 | |
688 | // If there are no Insts in the list, it's a dead state, |
689 | // which is useful to signal with a special pointer so that |
690 | // the execution loop can stop early. This is only okay |
691 | // if the state is *not* a matching state. |
692 | if (n == 0 && flag == 0) { |
693 | delete[] inst; |
694 | if (ExtraDebug) |
695 | fprintf(stderr, " -> DeadState\n" ); |
696 | return DeadState; |
697 | } |
698 | |
699 | // If we're in longest match mode, the state is a sequence of |
700 | // unordered state sets separated by Marks. Sort each set |
701 | // to canonicalize, to reduce the number of distinct sets stored. |
702 | if (kind_ == Prog::kLongestMatch) { |
703 | int* ip = inst; |
704 | int* ep = ip + n; |
705 | while (ip < ep) { |
706 | int* markp = ip; |
707 | while (markp < ep && *markp != Mark) |
708 | markp++; |
709 | std::sort(ip, markp); |
710 | if (markp < ep) |
711 | markp++; |
712 | ip = markp; |
713 | } |
714 | } |
715 | |
716 | // If we're in many match mode, canonicalize for similar reasons: |
717 | // we have an unordered set of states (i.e. we don't have Marks) |
718 | // and sorting will reduce the number of distinct sets stored. |
719 | if (kind_ == Prog::kManyMatch) { |
720 | int* ip = inst; |
721 | int* ep = ip + n; |
722 | std::sort(ip, ep); |
723 | } |
724 | |
725 | // Append MatchSep and the match IDs in mq if necessary. |
726 | if (mq != NULL) { |
727 | inst[n++] = MatchSep; |
728 | for (Workq::iterator i = mq->begin(); i != mq->end(); ++i) { |
729 | int id = *i; |
730 | Prog::Inst* ip = prog_->inst(id); |
731 | if (ip->opcode() == kInstMatch) |
732 | inst[n++] = ip->match_id(); |
733 | } |
734 | } |
735 | |
736 | // Save the needed empty-width flags in the top bits for use later. |
737 | flag |= needflags << kFlagNeedShift; |
738 | |
739 | State* state = CachedState(inst, n, flag); |
740 | delete[] inst; |
741 | return state; |
742 | } |
743 | |
744 | // Looks in the State cache for a State matching inst, ninst, flag. |
745 | // If one is found, returns it. If one is not found, allocates one, |
746 | // inserts it in the cache, and returns it. |
747 | DFA::State* DFA::CachedState(int* inst, int ninst, uint32_t flag) { |
748 | //mutex_.AssertHeld(); |
749 | |
750 | // Look in the cache for a pre-existing state. |
751 | // We have to initialise the struct like this because otherwise |
752 | // MSVC will complain about the flexible array member. :( |
753 | State state; |
754 | state.inst_ = inst; |
755 | state.ninst_ = ninst; |
756 | state.flag_ = flag; |
757 | StateSet::iterator it = state_cache_.find(&state); |
758 | if (it != state_cache_.end()) { |
759 | if (ExtraDebug) |
760 | fprintf(stderr, " -cached-> %s\n" , DumpState(*it).c_str()); |
761 | return *it; |
762 | } |
763 | |
764 | // Must have enough memory for new state. |
765 | // In addition to what we're going to allocate, |
766 | // the state cache hash table seems to incur about 40 bytes per |
767 | // State*, empirically. |
768 | const int kStateCacheOverhead = 40; |
769 | int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot |
770 | int mem = sizeof(State) + nnext*sizeof(std::atomic<State*>) + |
771 | ninst*sizeof(int); |
772 | if (mem_budget_ < mem + kStateCacheOverhead) { |
773 | mem_budget_ = -1; |
774 | return NULL; |
775 | } |
776 | mem_budget_ -= mem + kStateCacheOverhead; |
777 | |
778 | // Allocate new state along with room for next_ and inst_. |
779 | char* space = std::allocator<char>().allocate(mem); |
780 | State* s = new (space) State; |
781 | (void) new (s->next_) std::atomic<State*>[nnext]; |
782 | // Work around a unfortunate bug in older versions of libstdc++. |
783 | // (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64658) |
784 | for (int i = 0; i < nnext; i++) |
785 | (void) new (s->next_ + i) std::atomic<State*>(NULL); |
786 | s->inst_ = new (s->next_ + nnext) int[ninst]; |
787 | memmove(s->inst_, inst, ninst*sizeof s->inst_[0]); |
788 | s->ninst_ = ninst; |
789 | s->flag_ = flag; |
790 | if (ExtraDebug) |
791 | fprintf(stderr, " -> %s\n" , DumpState(s).c_str()); |
792 | |
793 | // Put state in cache and return it. |
794 | state_cache_.insert(s); |
795 | return s; |
796 | } |
797 | |
798 | // Clear the cache. Must hold cache_mutex_.w or be in destructor. |
799 | void DFA::ClearCache() { |
800 | StateSet::iterator begin = state_cache_.begin(); |
801 | StateSet::iterator end = state_cache_.end(); |
802 | while (begin != end) { |
803 | StateSet::iterator tmp = begin; |
804 | ++begin; |
805 | // Deallocate the blob of memory that we allocated in DFA::CachedState(). |
806 | // We recompute mem in order to benefit from sized delete where possible. |
807 | int ninst = (*tmp)->ninst_; |
808 | int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot |
809 | int mem = sizeof(State) + nnext*sizeof(std::atomic<State*>) + |
810 | ninst*sizeof(int); |
811 | std::allocator<char>().deallocate(reinterpret_cast<char*>(*tmp), mem); |
812 | } |
813 | state_cache_.clear(); |
814 | } |
815 | |
816 | // Copies insts in state s to the work queue q. |
817 | void DFA::StateToWorkq(State* s, Workq* q) { |
818 | q->clear(); |
819 | for (int i = 0; i < s->ninst_; i++) { |
820 | if (s->inst_[i] == Mark) { |
821 | q->mark(); |
822 | } else if (s->inst_[i] == MatchSep) { |
823 | // Nothing after this is an instruction! |
824 | break; |
825 | } else { |
826 | // Explore from the head of the list. |
827 | AddToQueue(q, s->inst_[i], s->flag_ & kFlagEmptyMask); |
828 | } |
829 | } |
830 | } |
831 | |
832 | // Adds ip to the work queue, following empty arrows according to flag. |
833 | void DFA::AddToQueue(Workq* q, int id, uint32_t flag) { |
834 | |
835 | // Use stack_ to hold our stack of instructions yet to process. |
836 | // It was preallocated as follows: |
837 | // one entry per Capture; |
838 | // one entry per EmptyWidth; and |
839 | // one entry per Nop. |
840 | // This reflects the maximum number of stack pushes that each can |
841 | // perform. (Each instruction can be processed at most once.) |
842 | // When using marks, we also added nmark == prog_->size(). |
843 | // (Otherwise, nmark == 0.) |
844 | int* stk = stack_.data(); |
845 | int nstk = 0; |
846 | |
847 | stk[nstk++] = id; |
848 | while (nstk > 0) { |
849 | DCHECK_LE(nstk, stack_.size()); |
850 | id = stk[--nstk]; |
851 | |
852 | Loop: |
853 | if (id == Mark) { |
854 | q->mark(); |
855 | continue; |
856 | } |
857 | |
858 | if (id == 0) |
859 | continue; |
860 | |
861 | // If ip is already on the queue, nothing to do. |
862 | // Otherwise add it. We don't actually keep all the |
863 | // ones that get added, but adding all of them here |
864 | // increases the likelihood of q->contains(id), |
865 | // reducing the amount of duplicated work. |
866 | if (q->contains(id)) |
867 | continue; |
868 | q->insert_new(id); |
869 | |
870 | // Process instruction. |
871 | Prog::Inst* ip = prog_->inst(id); |
872 | switch (ip->opcode()) { |
873 | default: |
874 | LOG(DFATAL) << "unhandled opcode: " << ip->opcode(); |
875 | break; |
876 | |
877 | case kInstByteRange: // just save these on the queue |
878 | case kInstMatch: |
879 | if (ip->last()) |
880 | break; |
881 | id = id+1; |
882 | goto Loop; |
883 | |
884 | case kInstCapture: // DFA treats captures as no-ops. |
885 | case kInstNop: |
886 | if (!ip->last()) |
887 | stk[nstk++] = id+1; |
888 | |
889 | // If this instruction is the [00-FF]* loop at the beginning of |
890 | // a leftmost-longest unanchored search, separate with a Mark so |
891 | // that future threads (which will start farther to the right in |
892 | // the input string) are lower priority than current threads. |
893 | if (ip->opcode() == kInstNop && q->maxmark() > 0 && |
894 | id == prog_->start_unanchored() && id != prog_->start()) |
895 | stk[nstk++] = Mark; |
896 | id = ip->out(); |
897 | goto Loop; |
898 | |
899 | case kInstAltMatch: |
900 | DCHECK(!ip->last()); |
901 | id = id+1; |
902 | goto Loop; |
903 | |
904 | case kInstEmptyWidth: |
905 | if (!ip->last()) |
906 | stk[nstk++] = id+1; |
907 | |
908 | // Continue on if we have all the right flag bits. |
909 | if (ip->empty() & ~flag) |
910 | break; |
911 | id = ip->out(); |
912 | goto Loop; |
913 | } |
914 | } |
915 | } |
916 | |
917 | // Running of work queues. In the work queue, order matters: |
918 | // the queue is sorted in priority order. If instruction i comes before j, |
919 | // then the instructions that i produces during the run must come before |
920 | // the ones that j produces. In order to keep this invariant, all the |
921 | // work queue runners have to take an old queue to process and then |
922 | // also a new queue to fill in. It's not acceptable to add to the end of |
923 | // an existing queue, because new instructions will not end up in the |
924 | // correct position. |
925 | |
926 | // Runs the work queue, processing the empty strings indicated by flag. |
927 | // For example, flag == kEmptyBeginLine|kEmptyEndLine means to match |
928 | // both ^ and $. It is important that callers pass all flags at once: |
929 | // processing both ^ and $ is not the same as first processing only ^ |
930 | // and then processing only $. Doing the two-step sequence won't match |
931 | // ^$^$^$ but processing ^ and $ simultaneously will (and is the behavior |
932 | // exhibited by existing implementations). |
933 | void DFA::RunWorkqOnEmptyString(Workq* oldq, Workq* newq, uint32_t flag) { |
934 | newq->clear(); |
935 | for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) { |
936 | if (oldq->is_mark(*i)) |
937 | AddToQueue(newq, Mark, flag); |
938 | else |
939 | AddToQueue(newq, *i, flag); |
940 | } |
941 | } |
942 | |
943 | // Runs the work queue, processing the single byte c followed by any empty |
944 | // strings indicated by flag. For example, c == 'a' and flag == kEmptyEndLine, |
945 | // means to match c$. Sets the bool *ismatch to true if the end of the |
946 | // regular expression program has been reached (the regexp has matched). |
947 | void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq, |
948 | int c, uint32_t flag, bool* ismatch) { |
949 | //mutex_.AssertHeld(); |
950 | |
951 | newq->clear(); |
952 | for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) { |
953 | if (oldq->is_mark(*i)) { |
954 | if (*ismatch) |
955 | return; |
956 | newq->mark(); |
957 | continue; |
958 | } |
959 | int id = *i; |
960 | Prog::Inst* ip = prog_->inst(id); |
961 | switch (ip->opcode()) { |
962 | default: |
963 | LOG(DFATAL) << "unhandled opcode: " << ip->opcode(); |
964 | break; |
965 | |
966 | case kInstFail: // never succeeds |
967 | case kInstCapture: // already followed |
968 | case kInstNop: // already followed |
969 | case kInstAltMatch: // already followed |
970 | case kInstEmptyWidth: // already followed |
971 | break; |
972 | |
973 | case kInstByteRange: // can follow if c is in range |
974 | if (ip->Matches(c)) |
975 | AddToQueue(newq, ip->out(), flag); |
976 | break; |
977 | |
978 | case kInstMatch: |
979 | if (prog_->anchor_end() && c != kByteEndText && |
980 | kind_ != Prog::kManyMatch) |
981 | break; |
982 | *ismatch = true; |
983 | if (kind_ == Prog::kFirstMatch) { |
984 | // Can stop processing work queue since we found a match. |
985 | return; |
986 | } |
987 | break; |
988 | } |
989 | } |
990 | |
991 | if (ExtraDebug) |
992 | fprintf(stderr, "%s on %d[%#x] -> %s [%d]\n" , |
993 | DumpWorkq(oldq).c_str(), c, flag, DumpWorkq(newq).c_str(), *ismatch); |
994 | } |
995 | |
996 | // Processes input byte c in state, returning new state. |
997 | // Caller does not hold mutex. |
998 | DFA::State* DFA::RunStateOnByteUnlocked(State* state, int c) { |
999 | // Keep only one RunStateOnByte going |
1000 | // even if the DFA is being run by multiple threads. |
1001 | MutexLock l(&mutex_); |
1002 | return RunStateOnByte(state, c); |
1003 | } |
1004 | |
1005 | // Processes input byte c in state, returning new state. |
1006 | DFA::State* DFA::RunStateOnByte(State* state, int c) { |
1007 | //mutex_.AssertHeld(); |
1008 | |
1009 | if (state <= SpecialStateMax) { |
1010 | if (state == FullMatchState) { |
1011 | // It is convenient for routines like PossibleMatchRange |
1012 | // if we implement RunStateOnByte for FullMatchState: |
1013 | // once you get into this state you never get out, |
1014 | // so it's pretty easy. |
1015 | return FullMatchState; |
1016 | } |
1017 | if (state == DeadState) { |
1018 | LOG(DFATAL) << "DeadState in RunStateOnByte" ; |
1019 | return NULL; |
1020 | } |
1021 | if (state == NULL) { |
1022 | LOG(DFATAL) << "NULL state in RunStateOnByte" ; |
1023 | return NULL; |
1024 | } |
1025 | LOG(DFATAL) << "Unexpected special state in RunStateOnByte" ; |
1026 | return NULL; |
1027 | } |
1028 | |
1029 | // If someone else already computed this, return it. |
1030 | State* ns = state->next_[ByteMap(c)].load(std::memory_order_relaxed); |
1031 | if (ns != NULL) |
1032 | return ns; |
1033 | |
1034 | // Convert state into Workq. |
1035 | StateToWorkq(state, q0_); |
1036 | |
1037 | // Flags marking the kinds of empty-width things (^ $ etc) |
1038 | // around this byte. Before the byte we have the flags recorded |
1039 | // in the State structure itself. After the byte we have |
1040 | // nothing yet (but that will change: read on). |
1041 | uint32_t needflag = state->flag_ >> kFlagNeedShift; |
1042 | uint32_t beforeflag = state->flag_ & kFlagEmptyMask; |
1043 | uint32_t oldbeforeflag = beforeflag; |
1044 | uint32_t afterflag = 0; |
1045 | |
1046 | if (c == '\n') { |
1047 | // Insert implicit $ and ^ around \n |
1048 | beforeflag |= kEmptyEndLine; |
1049 | afterflag |= kEmptyBeginLine; |
1050 | } |
1051 | |
1052 | if (c == kByteEndText) { |
1053 | // Insert implicit $ and \z before the fake "end text" byte. |
1054 | beforeflag |= kEmptyEndLine | kEmptyEndText; |
1055 | } |
1056 | |
1057 | // The state flag kFlagLastWord says whether the last |
1058 | // byte processed was a word character. Use that info to |
1059 | // insert empty-width (non-)word boundaries. |
1060 | bool islastword = (state->flag_ & kFlagLastWord) != 0; |
1061 | bool isword = c != kByteEndText && Prog::IsWordChar(static_cast<uint8_t>(c)); |
1062 | if (isword == islastword) |
1063 | beforeflag |= kEmptyNonWordBoundary; |
1064 | else |
1065 | beforeflag |= kEmptyWordBoundary; |
1066 | |
1067 | // Okay, finally ready to run. |
1068 | // Only useful to rerun on empty string if there are new, useful flags. |
1069 | if (beforeflag & ~oldbeforeflag & needflag) { |
1070 | RunWorkqOnEmptyString(q0_, q1_, beforeflag); |
1071 | using std::swap; |
1072 | swap(q0_, q1_); |
1073 | } |
1074 | bool ismatch = false; |
1075 | RunWorkqOnByte(q0_, q1_, c, afterflag, &ismatch); |
1076 | using std::swap; |
1077 | swap(q0_, q1_); |
1078 | |
1079 | // Save afterflag along with ismatch and isword in new state. |
1080 | uint32_t flag = afterflag; |
1081 | if (ismatch) |
1082 | flag |= kFlagMatch; |
1083 | if (isword) |
1084 | flag |= kFlagLastWord; |
1085 | |
1086 | if (ismatch && kind_ == Prog::kManyMatch) |
1087 | ns = WorkqToCachedState(q0_, q1_, flag); |
1088 | else |
1089 | ns = WorkqToCachedState(q0_, NULL, flag); |
1090 | |
1091 | // Flush ns before linking to it. |
1092 | // Write barrier before updating state->next_ so that the |
1093 | // main search loop can proceed without any locking, for speed. |
1094 | // (Otherwise it would need one mutex operation per input byte.) |
1095 | state->next_[ByteMap(c)].store(ns, std::memory_order_release); |
1096 | return ns; |
1097 | } |
1098 | |
1099 | |
1100 | ////////////////////////////////////////////////////////////////////// |
1101 | // DFA cache reset. |
1102 | |
1103 | // Reader-writer lock helper. |
1104 | // |
1105 | // The DFA uses a reader-writer mutex to protect the state graph itself. |
1106 | // Traversing the state graph requires holding the mutex for reading, |
1107 | // and discarding the state graph and starting over requires holding the |
1108 | // lock for writing. If a search needs to expand the graph but is out |
1109 | // of memory, it will need to drop its read lock and then acquire the |
1110 | // write lock. Since it cannot then atomically downgrade from write lock |
1111 | // to read lock, it runs the rest of the search holding the write lock. |
1112 | // (This probably helps avoid repeated contention, but really the decision |
1113 | // is forced by the Mutex interface.) It's a bit complicated to keep |
1114 | // track of whether the lock is held for reading or writing and thread |
1115 | // that through the search, so instead we encapsulate it in the RWLocker |
1116 | // and pass that around. |
1117 | |
1118 | class DFA::RWLocker { |
1119 | public: |
1120 | explicit RWLocker(Mutex* mu); |
1121 | ~RWLocker(); |
1122 | |
1123 | // If the lock is only held for reading right now, |
1124 | // drop the read lock and re-acquire for writing. |
1125 | // Subsequent calls to LockForWriting are no-ops. |
1126 | // Notice that the lock is *released* temporarily. |
1127 | void LockForWriting(); |
1128 | |
1129 | private: |
1130 | Mutex* mu_; |
1131 | bool writing_; |
1132 | |
1133 | RWLocker(const RWLocker&) = delete; |
1134 | RWLocker& operator=(const RWLocker&) = delete; |
1135 | }; |
1136 | |
1137 | DFA::RWLocker::RWLocker(Mutex* mu) : mu_(mu), writing_(false) { |
1138 | mu_->ReaderLock(); |
1139 | } |
1140 | |
1141 | // This function is marked as NO_THREAD_SAFETY_ANALYSIS because |
1142 | // the annotations don't support lock upgrade. |
1143 | void DFA::RWLocker::LockForWriting() NO_THREAD_SAFETY_ANALYSIS { |
1144 | if (!writing_) { |
1145 | mu_->ReaderUnlock(); |
1146 | mu_->WriterLock(); |
1147 | writing_ = true; |
1148 | } |
1149 | } |
1150 | |
1151 | DFA::RWLocker::~RWLocker() { |
1152 | if (!writing_) |
1153 | mu_->ReaderUnlock(); |
1154 | else |
1155 | mu_->WriterUnlock(); |
1156 | } |
1157 | |
1158 | |
1159 | // When the DFA's State cache fills, we discard all the states in the |
1160 | // cache and start over. Many threads can be using and adding to the |
1161 | // cache at the same time, so we synchronize using the cache_mutex_ |
1162 | // to keep from stepping on other threads. Specifically, all the |
1163 | // threads using the current cache hold cache_mutex_ for reading. |
1164 | // When a thread decides to flush the cache, it drops cache_mutex_ |
1165 | // and then re-acquires it for writing. That ensures there are no |
1166 | // other threads accessing the cache anymore. The rest of the search |
1167 | // runs holding cache_mutex_ for writing, avoiding any contention |
1168 | // with or cache pollution caused by other threads. |
1169 | |
1170 | void DFA::ResetCache(RWLocker* cache_lock) { |
1171 | // Re-acquire the cache_mutex_ for writing (exclusive use). |
1172 | cache_lock->LockForWriting(); |
1173 | |
1174 | // Clear the cache, reset the memory budget. |
1175 | for (int i = 0; i < kMaxStart; i++) { |
1176 | start_[i].start = NULL; |
1177 | start_[i].first_byte.store(kFbUnknown, std::memory_order_relaxed); |
1178 | } |
1179 | ClearCache(); |
1180 | mem_budget_ = state_budget_; |
1181 | } |
1182 | |
1183 | // Typically, a couple States do need to be preserved across a cache |
1184 | // reset, like the State at the current point in the search. |
1185 | // The StateSaver class helps keep States across cache resets. |
1186 | // It makes a copy of the state's guts outside the cache (before the reset) |
1187 | // and then can be asked, after the reset, to recreate the State |
1188 | // in the new cache. For example, in a DFA method ("this" is a DFA): |
1189 | // |
1190 | // StateSaver saver(this, s); |
1191 | // ResetCache(cache_lock); |
1192 | // s = saver.Restore(); |
1193 | // |
1194 | // The saver should always have room in the cache to re-create the state, |
1195 | // because resetting the cache locks out all other threads, and the cache |
1196 | // is known to have room for at least a couple states (otherwise the DFA |
1197 | // constructor fails). |
1198 | |
1199 | class DFA::StateSaver { |
1200 | public: |
1201 | explicit StateSaver(DFA* dfa, State* state); |
1202 | ~StateSaver(); |
1203 | |
1204 | // Recreates and returns a state equivalent to the |
1205 | // original state passed to the constructor. |
1206 | // Returns NULL if the cache has filled, but |
1207 | // since the DFA guarantees to have room in the cache |
1208 | // for a couple states, should never return NULL |
1209 | // if used right after ResetCache. |
1210 | State* Restore(); |
1211 | |
1212 | private: |
1213 | DFA* dfa_; // the DFA to use |
1214 | int* inst_; // saved info from State |
1215 | int ninst_; |
1216 | uint32_t flag_; |
1217 | bool is_special_; // whether original state was special |
1218 | State* special_; // if is_special_, the original state |
1219 | |
1220 | StateSaver(const StateSaver&) = delete; |
1221 | StateSaver& operator=(const StateSaver&) = delete; |
1222 | }; |
1223 | |
1224 | DFA::StateSaver::StateSaver(DFA* dfa, State* state) { |
1225 | dfa_ = dfa; |
1226 | if (state <= SpecialStateMax) { |
1227 | inst_ = NULL; |
1228 | ninst_ = 0; |
1229 | flag_ = 0; |
1230 | is_special_ = true; |
1231 | special_ = state; |
1232 | return; |
1233 | } |
1234 | is_special_ = false; |
1235 | special_ = NULL; |
1236 | flag_ = state->flag_; |
1237 | ninst_ = state->ninst_; |
1238 | inst_ = new int[ninst_]; |
1239 | memmove(inst_, state->inst_, ninst_*sizeof inst_[0]); |
1240 | } |
1241 | |
1242 | DFA::StateSaver::~StateSaver() { |
1243 | if (!is_special_) |
1244 | delete[] inst_; |
1245 | } |
1246 | |
1247 | DFA::State* DFA::StateSaver::Restore() { |
1248 | if (is_special_) |
1249 | return special_; |
1250 | MutexLock l(&dfa_->mutex_); |
1251 | State* s = dfa_->CachedState(inst_, ninst_, flag_); |
1252 | if (s == NULL) |
1253 | LOG(DFATAL) << "StateSaver failed to restore state." ; |
1254 | return s; |
1255 | } |
1256 | |
1257 | |
1258 | ////////////////////////////////////////////////////////////////////// |
1259 | // |
1260 | // DFA execution. |
1261 | // |
1262 | // The basic search loop is easy: start in a state s and then for each |
1263 | // byte c in the input, s = s->next[c]. |
1264 | // |
1265 | // This simple description omits a few efficiency-driven complications. |
1266 | // |
1267 | // First, the State graph is constructed incrementally: it is possible |
1268 | // that s->next[c] is null, indicating that that state has not been |
1269 | // fully explored. In this case, RunStateOnByte must be invoked to |
1270 | // determine the next state, which is cached in s->next[c] to save |
1271 | // future effort. An alternative reason for s->next[c] to be null is |
1272 | // that the DFA has reached a so-called "dead state", in which any match |
1273 | // is no longer possible. In this case RunStateOnByte will return NULL |
1274 | // and the processing of the string can stop early. |
1275 | // |
1276 | // Second, a 256-element pointer array for s->next_ makes each State |
1277 | // quite large (2kB on 64-bit machines). Instead, dfa->bytemap_[] |
1278 | // maps from bytes to "byte classes" and then next_ only needs to have |
1279 | // as many pointers as there are byte classes. A byte class is simply a |
1280 | // range of bytes that the regexp never distinguishes between. |
1281 | // A regexp looking for a[abc] would have four byte ranges -- 0 to 'a'-1, |
1282 | // 'a', 'b' to 'c', and 'c' to 0xFF. The bytemap slows us a little bit |
1283 | // but in exchange we typically cut the size of a State (and thus our |
1284 | // memory footprint) by about 5-10x. The comments still refer to |
1285 | // s->next[c] for simplicity, but code should refer to s->next_[bytemap_[c]]. |
1286 | // |
1287 | // Third, it is common for a DFA for an unanchored match to begin in a |
1288 | // state in which only one particular byte value can take the DFA to a |
1289 | // different state. That is, s->next[c] != s for only one c. In this |
1290 | // situation, the DFA can do better than executing the simple loop. |
1291 | // Instead, it can call memchr to search very quickly for the byte c. |
1292 | // Whether the start state has this property is determined during a |
1293 | // pre-compilation pass, and if so, the byte b is passed to the search |
1294 | // loop as the "first_byte" argument, along with a boolean "have_first_byte". |
1295 | // |
1296 | // Fourth, the desired behavior is to search for the leftmost-best match |
1297 | // (approximately, the same one that Perl would find), which is not |
1298 | // necessarily the match ending earliest in the string. Each time a |
1299 | // match is found, it must be noted, but the DFA must continue on in |
1300 | // hope of finding a higher-priority match. In some cases, the caller only |
1301 | // cares whether there is any match at all, not which one is found. |
1302 | // The "want_earliest_match" flag causes the search to stop at the first |
1303 | // match found. |
1304 | // |
1305 | // Fifth, one algorithm that uses the DFA needs it to run over the |
1306 | // input string backward, beginning at the end and ending at the beginning. |
1307 | // Passing false for the "run_forward" flag causes the DFA to run backward. |
1308 | // |
1309 | // The checks for these last three cases, which in a naive implementation |
1310 | // would be performed once per input byte, slow the general loop enough |
1311 | // to merit specialized versions of the search loop for each of the |
1312 | // eight possible settings of the three booleans. Rather than write |
1313 | // eight different functions, we write one general implementation and then |
1314 | // inline it to create the specialized ones. |
1315 | // |
1316 | // Note that matches are delayed by one byte, to make it easier to |
1317 | // accomodate match conditions depending on the next input byte (like $ and \b). |
1318 | // When s->next[c]->IsMatch(), it means that there is a match ending just |
1319 | // *before* byte c. |
1320 | |
1321 | // The generic search loop. Searches text for a match, returning |
1322 | // the pointer to the end of the chosen match, or NULL if no match. |
1323 | // The bools are equal to the same-named variables in params, but |
1324 | // making them function arguments lets the inliner specialize |
1325 | // this function to each combination (see two paragraphs above). |
1326 | inline bool DFA::InlinedSearchLoop(SearchParams* params, |
1327 | bool have_first_byte, |
1328 | bool want_earliest_match, |
1329 | bool run_forward) { |
1330 | State* start = params->start; |
1331 | const uint8_t* bp = BytePtr(params->text.data()); // start of text |
1332 | const uint8_t* p = bp; // text scanning point |
1333 | const uint8_t* ep = BytePtr(params->text.data() + |
1334 | params->text.size()); // end of text |
1335 | const uint8_t* resetp = NULL; // p at last cache reset |
1336 | if (!run_forward) { |
1337 | using std::swap; |
1338 | swap(p, ep); |
1339 | } |
1340 | |
1341 | const uint8_t* bytemap = prog_->bytemap(); |
1342 | const uint8_t* lastmatch = NULL; // most recent matching position in text |
1343 | bool matched = false; |
1344 | |
1345 | State* s = start; |
1346 | if (ExtraDebug) |
1347 | fprintf(stderr, "@stx: %s\n" , DumpState(s).c_str()); |
1348 | |
1349 | if (s->IsMatch()) { |
1350 | matched = true; |
1351 | lastmatch = p; |
1352 | if (ExtraDebug) |
1353 | fprintf(stderr, "match @stx! [%s]\n" , DumpState(s).c_str()); |
1354 | if (params->matches != NULL && kind_ == Prog::kManyMatch) { |
1355 | for (int i = s->ninst_ - 1; i >= 0; i--) { |
1356 | int id = s->inst_[i]; |
1357 | if (id == MatchSep) |
1358 | break; |
1359 | params->matches->insert(id); |
1360 | } |
1361 | } |
1362 | if (want_earliest_match) { |
1363 | params->ep = reinterpret_cast<const char*>(lastmatch); |
1364 | return true; |
1365 | } |
1366 | } |
1367 | |
1368 | while (p != ep) { |
1369 | if (ExtraDebug) |
1370 | fprintf(stderr, "@%td: %s\n" , p - bp, DumpState(s).c_str()); |
1371 | |
1372 | if (have_first_byte && s == start) { |
1373 | // In start state, only way out is to find first_byte, |
1374 | // so use optimized assembly in memchr to skip ahead. |
1375 | // If first_byte isn't found, we can skip to the end |
1376 | // of the string. |
1377 | if (run_forward) { |
1378 | if ((p = BytePtr(memchr(p, params->first_byte, ep - p))) == NULL) { |
1379 | p = ep; |
1380 | break; |
1381 | } |
1382 | } else { |
1383 | if ((p = BytePtr(memrchr(ep, params->first_byte, p - ep))) == NULL) { |
1384 | p = ep; |
1385 | break; |
1386 | } |
1387 | p++; |
1388 | } |
1389 | } |
1390 | |
1391 | int c; |
1392 | if (run_forward) |
1393 | c = *p++; |
1394 | else |
1395 | c = *--p; |
1396 | |
1397 | // Note that multiple threads might be consulting |
1398 | // s->next_[bytemap[c]] simultaneously. |
1399 | // RunStateOnByte takes care of the appropriate locking, |
1400 | // including a memory barrier so that the unlocked access |
1401 | // (sometimes known as "double-checked locking") is safe. |
1402 | // The alternative would be either one DFA per thread |
1403 | // or one mutex operation per input byte. |
1404 | // |
1405 | // ns == DeadState means the state is known to be dead |
1406 | // (no more matches are possible). |
1407 | // ns == NULL means the state has not yet been computed |
1408 | // (need to call RunStateOnByteUnlocked). |
1409 | // RunStateOnByte returns ns == NULL if it is out of memory. |
1410 | // ns == FullMatchState means the rest of the string matches. |
1411 | // |
1412 | // Okay to use bytemap[] not ByteMap() here, because |
1413 | // c is known to be an actual byte and not kByteEndText. |
1414 | |
1415 | State* ns = s->next_[bytemap[c]].load(std::memory_order_acquire); |
1416 | if (ns == NULL) { |
1417 | ns = RunStateOnByteUnlocked(s, c); |
1418 | if (ns == NULL) { |
1419 | // After we reset the cache, we hold cache_mutex exclusively, |
1420 | // so if resetp != NULL, it means we filled the DFA state |
1421 | // cache with this search alone (without any other threads). |
1422 | // Benchmarks show that doing a state computation on every |
1423 | // byte runs at about 0.2 MB/s, while the NFA (nfa.cc) can do the |
1424 | // same at about 2 MB/s. Unless we're processing an average |
1425 | // of 10 bytes per state computation, fail so that RE2 can |
1426 | // fall back to the NFA. However, RE2::Set cannot fall back, |
1427 | // so we just have to keep on keeping on in that case. |
1428 | if (dfa_should_bail_when_slow && resetp != NULL && |
1429 | static_cast<size_t>(p - resetp) < 10*state_cache_.size() && |
1430 | kind_ != Prog::kManyMatch) { |
1431 | params->failed = true; |
1432 | return false; |
1433 | } |
1434 | resetp = p; |
1435 | |
1436 | // Prepare to save start and s across the reset. |
1437 | StateSaver save_start(this, start); |
1438 | StateSaver save_s(this, s); |
1439 | |
1440 | // Discard all the States in the cache. |
1441 | ResetCache(params->cache_lock); |
1442 | |
1443 | // Restore start and s so we can continue. |
1444 | if ((start = save_start.Restore()) == NULL || |
1445 | (s = save_s.Restore()) == NULL) { |
1446 | // Restore already did LOG(DFATAL). |
1447 | params->failed = true; |
1448 | return false; |
1449 | } |
1450 | ns = RunStateOnByteUnlocked(s, c); |
1451 | if (ns == NULL) { |
1452 | LOG(DFATAL) << "RunStateOnByteUnlocked failed after ResetCache" ; |
1453 | params->failed = true; |
1454 | return false; |
1455 | } |
1456 | } |
1457 | } |
1458 | if (ns <= SpecialStateMax) { |
1459 | if (ns == DeadState) { |
1460 | params->ep = reinterpret_cast<const char*>(lastmatch); |
1461 | return matched; |
1462 | } |
1463 | // FullMatchState |
1464 | params->ep = reinterpret_cast<const char*>(ep); |
1465 | return true; |
1466 | } |
1467 | |
1468 | s = ns; |
1469 | if (s->IsMatch()) { |
1470 | matched = true; |
1471 | // The DFA notices the match one byte late, |
1472 | // so adjust p before using it in the match. |
1473 | if (run_forward) |
1474 | lastmatch = p - 1; |
1475 | else |
1476 | lastmatch = p + 1; |
1477 | if (ExtraDebug) |
1478 | fprintf(stderr, "match @%td! [%s]\n" , lastmatch - bp, DumpState(s).c_str()); |
1479 | if (params->matches != NULL && kind_ == Prog::kManyMatch) { |
1480 | for (int i = s->ninst_ - 1; i >= 0; i--) { |
1481 | int id = s->inst_[i]; |
1482 | if (id == MatchSep) |
1483 | break; |
1484 | params->matches->insert(id); |
1485 | } |
1486 | } |
1487 | if (want_earliest_match) { |
1488 | params->ep = reinterpret_cast<const char*>(lastmatch); |
1489 | return true; |
1490 | } |
1491 | } |
1492 | } |
1493 | |
1494 | // Process one more byte to see if it triggers a match. |
1495 | // (Remember, matches are delayed one byte.) |
1496 | if (ExtraDebug) |
1497 | fprintf(stderr, "@etx: %s\n" , DumpState(s).c_str()); |
1498 | |
1499 | int lastbyte; |
1500 | if (run_forward) { |
1501 | if (params->text.end() == params->context.end()) |
1502 | lastbyte = kByteEndText; |
1503 | else |
1504 | lastbyte = params->text.end()[0] & 0xFF; |
1505 | } else { |
1506 | if (params->text.begin() == params->context.begin()) |
1507 | lastbyte = kByteEndText; |
1508 | else |
1509 | lastbyte = params->text.begin()[-1] & 0xFF; |
1510 | } |
1511 | |
1512 | State* ns = s->next_[ByteMap(lastbyte)].load(std::memory_order_acquire); |
1513 | if (ns == NULL) { |
1514 | ns = RunStateOnByteUnlocked(s, lastbyte); |
1515 | if (ns == NULL) { |
1516 | StateSaver save_s(this, s); |
1517 | ResetCache(params->cache_lock); |
1518 | if ((s = save_s.Restore()) == NULL) { |
1519 | params->failed = true; |
1520 | return false; |
1521 | } |
1522 | ns = RunStateOnByteUnlocked(s, lastbyte); |
1523 | if (ns == NULL) { |
1524 | LOG(DFATAL) << "RunStateOnByteUnlocked failed after Reset" ; |
1525 | params->failed = true; |
1526 | return false; |
1527 | } |
1528 | } |
1529 | } |
1530 | if (ns <= SpecialStateMax) { |
1531 | if (ns == DeadState) { |
1532 | params->ep = reinterpret_cast<const char*>(lastmatch); |
1533 | return matched; |
1534 | } |
1535 | // FullMatchState |
1536 | params->ep = reinterpret_cast<const char*>(ep); |
1537 | return true; |
1538 | } |
1539 | |
1540 | s = ns; |
1541 | if (s->IsMatch()) { |
1542 | matched = true; |
1543 | lastmatch = p; |
1544 | if (ExtraDebug) |
1545 | fprintf(stderr, "match @etx! [%s]\n" , DumpState(s).c_str()); |
1546 | if (params->matches != NULL && kind_ == Prog::kManyMatch) { |
1547 | for (int i = s->ninst_ - 1; i >= 0; i--) { |
1548 | int id = s->inst_[i]; |
1549 | if (id == MatchSep) |
1550 | break; |
1551 | params->matches->insert(id); |
1552 | } |
1553 | } |
1554 | } |
1555 | |
1556 | params->ep = reinterpret_cast<const char*>(lastmatch); |
1557 | return matched; |
1558 | } |
1559 | |
1560 | // Inline specializations of the general loop. |
1561 | bool DFA::SearchFFF(SearchParams* params) { |
1562 | return InlinedSearchLoop(params, 0, 0, 0); |
1563 | } |
1564 | bool DFA::SearchFFT(SearchParams* params) { |
1565 | return InlinedSearchLoop(params, 0, 0, 1); |
1566 | } |
1567 | bool DFA::SearchFTF(SearchParams* params) { |
1568 | return InlinedSearchLoop(params, 0, 1, 0); |
1569 | } |
1570 | bool DFA::SearchFTT(SearchParams* params) { |
1571 | return InlinedSearchLoop(params, 0, 1, 1); |
1572 | } |
1573 | bool DFA::SearchTFF(SearchParams* params) { |
1574 | return InlinedSearchLoop(params, 1, 0, 0); |
1575 | } |
1576 | bool DFA::SearchTFT(SearchParams* params) { |
1577 | return InlinedSearchLoop(params, 1, 0, 1); |
1578 | } |
1579 | bool DFA::SearchTTF(SearchParams* params) { |
1580 | return InlinedSearchLoop(params, 1, 1, 0); |
1581 | } |
1582 | bool DFA::SearchTTT(SearchParams* params) { |
1583 | return InlinedSearchLoop(params, 1, 1, 1); |
1584 | } |
1585 | |
1586 | // For debugging, calls the general code directly. |
1587 | bool DFA::SlowSearchLoop(SearchParams* params) { |
1588 | return InlinedSearchLoop(params, |
1589 | params->first_byte >= 0, |
1590 | params->want_earliest_match, |
1591 | params->run_forward); |
1592 | } |
1593 | |
1594 | // For performance, calls the appropriate specialized version |
1595 | // of InlinedSearchLoop. |
1596 | bool DFA::FastSearchLoop(SearchParams* params) { |
1597 | // Because the methods are private, the Searches array |
1598 | // cannot be declared at top level. |
1599 | static bool (DFA::*Searches[])(SearchParams*) = { |
1600 | &DFA::SearchFFF, |
1601 | &DFA::SearchFFT, |
1602 | &DFA::SearchFTF, |
1603 | &DFA::SearchFTT, |
1604 | &DFA::SearchTFF, |
1605 | &DFA::SearchTFT, |
1606 | &DFA::SearchTTF, |
1607 | &DFA::SearchTTT, |
1608 | }; |
1609 | |
1610 | bool have_first_byte = params->first_byte >= 0; |
1611 | int index = 4 * have_first_byte + |
1612 | 2 * params->want_earliest_match + |
1613 | 1 * params->run_forward; |
1614 | return (this->*Searches[index])(params); |
1615 | } |
1616 | |
1617 | |
1618 | // The discussion of DFA execution above ignored the question of how |
1619 | // to determine the initial state for the search loop. There are two |
1620 | // factors that influence the choice of start state. |
1621 | // |
1622 | // The first factor is whether the search is anchored or not. |
1623 | // The regexp program (Prog*) itself has |
1624 | // two different entry points: one for anchored searches and one for |
1625 | // unanchored searches. (The unanchored version starts with a leading ".*?" |
1626 | // and then jumps to the anchored one.) |
1627 | // |
1628 | // The second factor is where text appears in the larger context, which |
1629 | // determines which empty-string operators can be matched at the beginning |
1630 | // of execution. If text is at the very beginning of context, \A and ^ match. |
1631 | // Otherwise if text is at the beginning of a line, then ^ matches. |
1632 | // Otherwise it matters whether the character before text is a word character |
1633 | // or a non-word character. |
1634 | // |
1635 | // The two cases (unanchored vs not) and four cases (empty-string flags) |
1636 | // combine to make the eight cases recorded in the DFA's begin_text_[2], |
1637 | // begin_line_[2], after_wordchar_[2], and after_nonwordchar_[2] cached |
1638 | // StartInfos. The start state for each is filled in the first time it |
1639 | // is used for an actual search. |
1640 | |
1641 | // Examines text, context, and anchored to determine the right start |
1642 | // state for the DFA search loop. Fills in params and returns true on success. |
1643 | // Returns false on failure. |
1644 | bool DFA::AnalyzeSearch(SearchParams* params) { |
1645 | const StringPiece& text = params->text; |
1646 | const StringPiece& context = params->context; |
1647 | |
1648 | // Sanity check: make sure that text lies within context. |
1649 | if (text.begin() < context.begin() || text.end() > context.end()) { |
1650 | LOG(DFATAL) << "context does not contain text" ; |
1651 | params->start = DeadState; |
1652 | return true; |
1653 | } |
1654 | |
1655 | // Determine correct search type. |
1656 | int start; |
1657 | uint32_t flags; |
1658 | if (params->run_forward) { |
1659 | if (text.begin() == context.begin()) { |
1660 | start = kStartBeginText; |
1661 | flags = kEmptyBeginText|kEmptyBeginLine; |
1662 | } else if (text.begin()[-1] == '\n') { |
1663 | start = kStartBeginLine; |
1664 | flags = kEmptyBeginLine; |
1665 | } else if (Prog::IsWordChar(text.begin()[-1] & 0xFF)) { |
1666 | start = kStartAfterWordChar; |
1667 | flags = kFlagLastWord; |
1668 | } else { |
1669 | start = kStartAfterNonWordChar; |
1670 | flags = 0; |
1671 | } |
1672 | } else { |
1673 | if (text.end() == context.end()) { |
1674 | start = kStartBeginText; |
1675 | flags = kEmptyBeginText|kEmptyBeginLine; |
1676 | } else if (text.end()[0] == '\n') { |
1677 | start = kStartBeginLine; |
1678 | flags = kEmptyBeginLine; |
1679 | } else if (Prog::IsWordChar(text.end()[0] & 0xFF)) { |
1680 | start = kStartAfterWordChar; |
1681 | flags = kFlagLastWord; |
1682 | } else { |
1683 | start = kStartAfterNonWordChar; |
1684 | flags = 0; |
1685 | } |
1686 | } |
1687 | if (params->anchored) |
1688 | start |= kStartAnchored; |
1689 | StartInfo* info = &start_[start]; |
1690 | |
1691 | // Try once without cache_lock for writing. |
1692 | // Try again after resetting the cache |
1693 | // (ResetCache will relock cache_lock for writing). |
1694 | if (!AnalyzeSearchHelper(params, info, flags)) { |
1695 | ResetCache(params->cache_lock); |
1696 | if (!AnalyzeSearchHelper(params, info, flags)) { |
1697 | LOG(DFATAL) << "Failed to analyze start state." ; |
1698 | params->failed = true; |
1699 | return false; |
1700 | } |
1701 | } |
1702 | |
1703 | if (ExtraDebug) |
1704 | fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s first_byte=%d\n" , |
1705 | params->anchored, params->run_forward, flags, |
1706 | DumpState(info->start).c_str(), info->first_byte.load()); |
1707 | |
1708 | params->start = info->start; |
1709 | params->first_byte = info->first_byte.load(std::memory_order_acquire); |
1710 | |
1711 | return true; |
1712 | } |
1713 | |
1714 | // Fills in info if needed. Returns true on success, false on failure. |
1715 | bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info, |
1716 | uint32_t flags) { |
1717 | // Quick check. |
1718 | int fb = info->first_byte.load(std::memory_order_acquire); |
1719 | if (fb != kFbUnknown) |
1720 | return true; |
1721 | |
1722 | MutexLock l(&mutex_); |
1723 | fb = info->first_byte.load(std::memory_order_relaxed); |
1724 | if (fb != kFbUnknown) |
1725 | return true; |
1726 | |
1727 | q0_->clear(); |
1728 | AddToQueue(q0_, |
1729 | params->anchored ? prog_->start() : prog_->start_unanchored(), |
1730 | flags); |
1731 | info->start = WorkqToCachedState(q0_, NULL, flags); |
1732 | if (info->start == NULL) |
1733 | return false; |
1734 | |
1735 | if (info->start == DeadState) { |
1736 | // Synchronize with "quick check" above. |
1737 | info->first_byte.store(kFbNone, std::memory_order_release); |
1738 | return true; |
1739 | } |
1740 | |
1741 | if (info->start == FullMatchState) { |
1742 | // Synchronize with "quick check" above. |
1743 | info->first_byte.store(kFbNone, std::memory_order_release); // will be ignored |
1744 | return true; |
1745 | } |
1746 | |
1747 | // Even if we have a first_byte, we cannot use it when anchored and, |
1748 | // less obviously, we cannot use it when we are going to need flags. |
1749 | // This trick works only when there is a single byte that leads to a |
1750 | // different state! |
1751 | int first_byte = prog_->first_byte(); |
1752 | if (first_byte == -1 || |
1753 | params->anchored || |
1754 | info->start->flag_ >> kFlagNeedShift != 0) |
1755 | first_byte = kFbNone; |
1756 | |
1757 | // Synchronize with "quick check" above. |
1758 | info->first_byte.store(first_byte, std::memory_order_release); |
1759 | return true; |
1760 | } |
1761 | |
1762 | // The actual DFA search: calls AnalyzeSearch and then FastSearchLoop. |
1763 | bool DFA::Search(const StringPiece& text, |
1764 | const StringPiece& context, |
1765 | bool anchored, |
1766 | bool want_earliest_match, |
1767 | bool run_forward, |
1768 | bool* failed, |
1769 | const char** epp, |
1770 | SparseSet* matches) { |
1771 | *epp = NULL; |
1772 | if (!ok()) { |
1773 | *failed = true; |
1774 | return false; |
1775 | } |
1776 | *failed = false; |
1777 | |
1778 | if (ExtraDebug) { |
1779 | fprintf(stderr, "\nprogram:\n%s\n" , prog_->DumpUnanchored().c_str()); |
1780 | fprintf(stderr, "text %s anchored=%d earliest=%d fwd=%d kind %d\n" , |
1781 | std::string(text).c_str(), anchored, want_earliest_match, run_forward, kind_); |
1782 | } |
1783 | |
1784 | RWLocker l(&cache_mutex_); |
1785 | SearchParams params(text, context, &l); |
1786 | params.anchored = anchored; |
1787 | params.want_earliest_match = want_earliest_match; |
1788 | params.run_forward = run_forward; |
1789 | params.matches = matches; |
1790 | |
1791 | if (!AnalyzeSearch(¶ms)) { |
1792 | *failed = true; |
1793 | return false; |
1794 | } |
1795 | if (params.start == DeadState) |
1796 | return false; |
1797 | if (params.start == FullMatchState) { |
1798 | if (run_forward == want_earliest_match) |
1799 | *epp = text.data(); |
1800 | else |
1801 | *epp = text.data() + text.size(); |
1802 | return true; |
1803 | } |
1804 | if (ExtraDebug) |
1805 | fprintf(stderr, "start %s\n" , DumpState(params.start).c_str()); |
1806 | bool ret = FastSearchLoop(¶ms); |
1807 | if (params.failed) { |
1808 | *failed = true; |
1809 | return false; |
1810 | } |
1811 | *epp = params.ep; |
1812 | return ret; |
1813 | } |
1814 | |
1815 | DFA* Prog::GetDFA(MatchKind kind) { |
1816 | // For a forward DFA, half the memory goes to each DFA. |
1817 | // However, if it is a "many match" DFA, then there is |
1818 | // no counterpart with which the memory must be shared. |
1819 | // |
1820 | // For a reverse DFA, all the memory goes to the |
1821 | // "longest match" DFA, because RE2 never does reverse |
1822 | // "first match" searches. |
1823 | if (kind == kFirstMatch) { |
1824 | std::call_once(dfa_first_once_, [](Prog* prog) { |
1825 | prog->dfa_first_ = new DFA(prog, kFirstMatch, prog->dfa_mem_ / 2); |
1826 | }, this); |
1827 | return dfa_first_; |
1828 | } else if (kind == kManyMatch) { |
1829 | std::call_once(dfa_first_once_, [](Prog* prog) { |
1830 | prog->dfa_first_ = new DFA(prog, kManyMatch, prog->dfa_mem_); |
1831 | }, this); |
1832 | return dfa_first_; |
1833 | } else { |
1834 | std::call_once(dfa_longest_once_, [](Prog* prog) { |
1835 | if (!prog->reversed_) |
1836 | prog->dfa_longest_ = new DFA(prog, kLongestMatch, prog->dfa_mem_ / 2); |
1837 | else |
1838 | prog->dfa_longest_ = new DFA(prog, kLongestMatch, prog->dfa_mem_); |
1839 | }, this); |
1840 | return dfa_longest_; |
1841 | } |
1842 | } |
1843 | |
1844 | void Prog::DeleteDFA(DFA* dfa) { |
1845 | delete dfa; |
1846 | } |
1847 | |
1848 | // Executes the regexp program to search in text, |
1849 | // which itself is inside the larger context. (As a convenience, |
1850 | // passing a NULL context is equivalent to passing text.) |
1851 | // Returns true if a match is found, false if not. |
1852 | // If a match is found, fills in match0->end() to point at the end of the match |
1853 | // and sets match0->begin() to text.begin(), since the DFA can't track |
1854 | // where the match actually began. |
1855 | // |
1856 | // This is the only external interface (class DFA only exists in this file). |
1857 | // |
1858 | bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context, |
1859 | Anchor anchor, MatchKind kind, StringPiece* match0, |
1860 | bool* failed, SparseSet* matches) { |
1861 | *failed = false; |
1862 | |
1863 | StringPiece context = const_context; |
1864 | if (context.data() == NULL) |
1865 | context = text; |
1866 | bool carat = anchor_start(); |
1867 | bool dollar = anchor_end(); |
1868 | if (reversed_) { |
1869 | using std::swap; |
1870 | swap(carat, dollar); |
1871 | } |
1872 | if (carat && context.begin() != text.begin()) |
1873 | return false; |
1874 | if (dollar && context.end() != text.end()) |
1875 | return false; |
1876 | |
1877 | // Handle full match by running an anchored longest match |
1878 | // and then checking if it covers all of text. |
1879 | bool anchored = anchor == kAnchored || anchor_start() || kind == kFullMatch; |
1880 | bool endmatch = false; |
1881 | if (kind == kManyMatch) { |
1882 | // This is split out in order to avoid clobbering kind. |
1883 | } else if (kind == kFullMatch || anchor_end()) { |
1884 | endmatch = true; |
1885 | kind = kLongestMatch; |
1886 | } |
1887 | |
1888 | // If the caller doesn't care where the match is (just whether one exists), |
1889 | // then we can stop at the very first match we find, the so-called |
1890 | // "earliest match". |
1891 | bool want_earliest_match = false; |
1892 | if (kind == kManyMatch) { |
1893 | // This is split out in order to avoid clobbering kind. |
1894 | if (matches == NULL) { |
1895 | want_earliest_match = true; |
1896 | } |
1897 | } else if (match0 == NULL && !endmatch) { |
1898 | want_earliest_match = true; |
1899 | kind = kLongestMatch; |
1900 | } |
1901 | |
1902 | DFA* dfa = GetDFA(kind); |
1903 | const char* ep; |
1904 | bool matched = dfa->Search(text, context, anchored, |
1905 | want_earliest_match, !reversed_, |
1906 | failed, &ep, matches); |
1907 | if (*failed) |
1908 | return false; |
1909 | if (!matched) |
1910 | return false; |
1911 | if (endmatch && ep != (reversed_ ? text.data() : text.data() + text.size())) |
1912 | return false; |
1913 | |
1914 | // If caller cares, record the boundary of the match. |
1915 | // We only know where it ends, so use the boundary of text |
1916 | // as the beginning. |
1917 | if (match0) { |
1918 | if (reversed_) |
1919 | *match0 = |
1920 | StringPiece(ep, static_cast<size_t>(text.data() + text.size() - ep)); |
1921 | else |
1922 | *match0 = |
1923 | StringPiece(text.data(), static_cast<size_t>(ep - text.data())); |
1924 | } |
1925 | return true; |
1926 | } |
1927 | |
1928 | // Build out all states in DFA. Returns number of states. |
1929 | int DFA::BuildAllStates(const Prog::DFAStateCallback& cb) { |
1930 | if (!ok()) |
1931 | return 0; |
1932 | |
1933 | // Pick out start state for unanchored search |
1934 | // at beginning of text. |
1935 | RWLocker l(&cache_mutex_); |
1936 | SearchParams params(StringPiece(), StringPiece(), &l); |
1937 | params.anchored = false; |
1938 | if (!AnalyzeSearch(¶ms) || |
1939 | params.start == NULL || |
1940 | params.start == DeadState) |
1941 | return 0; |
1942 | |
1943 | // Add start state to work queue. |
1944 | // Note that any State* that we handle here must point into the cache, |
1945 | // so we can simply depend on pointer-as-a-number hashing and equality. |
1946 | std::unordered_map<State*, int> m; |
1947 | std::deque<State*> q; |
1948 | m.emplace(params.start, static_cast<int>(m.size())); |
1949 | q.push_back(params.start); |
1950 | |
1951 | // Compute the input bytes needed to cover all of the next pointers. |
1952 | int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot |
1953 | std::vector<int> input(nnext); |
1954 | for (int c = 0; c < 256; c++) { |
1955 | int b = prog_->bytemap()[c]; |
1956 | while (c < 256-1 && prog_->bytemap()[c+1] == b) |
1957 | c++; |
1958 | input[b] = c; |
1959 | } |
1960 | input[prog_->bytemap_range()] = kByteEndText; |
1961 | |
1962 | // Scratch space for the output. |
1963 | std::vector<int> output(nnext); |
1964 | |
1965 | // Flood to expand every state. |
1966 | bool oom = false; |
1967 | while (!q.empty()) { |
1968 | State* s = q.front(); |
1969 | q.pop_front(); |
1970 | for (int c : input) { |
1971 | State* ns = RunStateOnByteUnlocked(s, c); |
1972 | if (ns == NULL) { |
1973 | oom = true; |
1974 | break; |
1975 | } |
1976 | if (ns == DeadState) { |
1977 | output[ByteMap(c)] = -1; |
1978 | continue; |
1979 | } |
1980 | if (m.find(ns) == m.end()) { |
1981 | m.emplace(ns, static_cast<int>(m.size())); |
1982 | q.push_back(ns); |
1983 | } |
1984 | output[ByteMap(c)] = m[ns]; |
1985 | } |
1986 | if (cb) |
1987 | cb(oom ? NULL : output.data(), |
1988 | s == FullMatchState || s->IsMatch()); |
1989 | if (oom) |
1990 | break; |
1991 | } |
1992 | |
1993 | return static_cast<int>(m.size()); |
1994 | } |
1995 | |
1996 | // Build out all states in DFA for kind. Returns number of states. |
1997 | int Prog::BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb) { |
1998 | return GetDFA(kind)->BuildAllStates(cb); |
1999 | } |
2000 | |
2001 | void Prog::TEST_dfa_should_bail_when_slow(bool b) { |
2002 | dfa_should_bail_when_slow = b; |
2003 | } |
2004 | |
2005 | // Computes min and max for matching string. |
2006 | // Won't return strings bigger than maxlen. |
2007 | bool DFA::PossibleMatchRange(std::string* min, std::string* max, int maxlen) { |
2008 | if (!ok()) |
2009 | return false; |
2010 | |
2011 | // NOTE: if future users of PossibleMatchRange want more precision when |
2012 | // presented with infinitely repeated elements, consider making this a |
2013 | // parameter to PossibleMatchRange. |
2014 | static int kMaxEltRepetitions = 0; |
2015 | |
2016 | // Keep track of the number of times we've visited states previously. We only |
2017 | // revisit a given state if it's part of a repeated group, so if the value |
2018 | // portion of the map tuple exceeds kMaxEltRepetitions we bail out and set |
2019 | // |*max| to |PrefixSuccessor(*max)|. |
2020 | // |
2021 | // Also note that previously_visited_states[UnseenStatePtr] will, in the STL |
2022 | // tradition, implicitly insert a '0' value at first use. We take advantage |
2023 | // of that property below. |
2024 | std::unordered_map<State*, int> previously_visited_states; |
2025 | |
2026 | // Pick out start state for anchored search at beginning of text. |
2027 | RWLocker l(&cache_mutex_); |
2028 | SearchParams params(StringPiece(), StringPiece(), &l); |
2029 | params.anchored = true; |
2030 | if (!AnalyzeSearch(¶ms)) |
2031 | return false; |
2032 | if (params.start == DeadState) { // No matching strings |
2033 | *min = "" ; |
2034 | *max = "" ; |
2035 | return true; |
2036 | } |
2037 | if (params.start == FullMatchState) // Every string matches: no max |
2038 | return false; |
2039 | |
2040 | // The DFA is essentially a big graph rooted at params.start, |
2041 | // and paths in the graph correspond to accepted strings. |
2042 | // Each node in the graph has potentially 256+1 arrows |
2043 | // coming out, one for each byte plus the magic end of |
2044 | // text character kByteEndText. |
2045 | |
2046 | // To find the smallest possible prefix of an accepted |
2047 | // string, we just walk the graph preferring to follow |
2048 | // arrows with the lowest bytes possible. To find the |
2049 | // largest possible prefix, we follow the largest bytes |
2050 | // possible. |
2051 | |
2052 | // The test for whether there is an arrow from s on byte j is |
2053 | // ns = RunStateOnByteUnlocked(s, j); |
2054 | // if (ns == NULL) |
2055 | // return false; |
2056 | // if (ns != DeadState && ns->ninst > 0) |
2057 | // The RunStateOnByteUnlocked call asks the DFA to build out the graph. |
2058 | // It returns NULL only if the DFA has run out of memory, |
2059 | // in which case we can't be sure of anything. |
2060 | // The second check sees whether there was graph built |
2061 | // and whether it is interesting graph. Nodes might have |
2062 | // ns->ninst == 0 if they exist only to represent the fact |
2063 | // that a match was found on the previous byte. |
2064 | |
2065 | // Build minimum prefix. |
2066 | State* s = params.start; |
2067 | min->clear(); |
2068 | MutexLock lock(&mutex_); |
2069 | for (int i = 0; i < maxlen; i++) { |
2070 | if (previously_visited_states[s] > kMaxEltRepetitions) |
2071 | break; |
2072 | previously_visited_states[s]++; |
2073 | |
2074 | // Stop if min is a match. |
2075 | State* ns = RunStateOnByte(s, kByteEndText); |
2076 | if (ns == NULL) // DFA out of memory |
2077 | return false; |
2078 | if (ns != DeadState && (ns == FullMatchState || ns->IsMatch())) |
2079 | break; |
2080 | |
2081 | // Try to extend the string with low bytes. |
2082 | bool extended = false; |
2083 | for (int j = 0; j < 256; j++) { |
2084 | ns = RunStateOnByte(s, j); |
2085 | if (ns == NULL) // DFA out of memory |
2086 | return false; |
2087 | if (ns == FullMatchState || |
2088 | (ns > SpecialStateMax && ns->ninst_ > 0)) { |
2089 | extended = true; |
2090 | min->append(1, static_cast<char>(j)); |
2091 | s = ns; |
2092 | break; |
2093 | } |
2094 | } |
2095 | if (!extended) |
2096 | break; |
2097 | } |
2098 | |
2099 | // Build maximum prefix. |
2100 | previously_visited_states.clear(); |
2101 | s = params.start; |
2102 | max->clear(); |
2103 | for (int i = 0; i < maxlen; i++) { |
2104 | if (previously_visited_states[s] > kMaxEltRepetitions) |
2105 | break; |
2106 | previously_visited_states[s] += 1; |
2107 | |
2108 | // Try to extend the string with high bytes. |
2109 | bool extended = false; |
2110 | for (int j = 255; j >= 0; j--) { |
2111 | State* ns = RunStateOnByte(s, j); |
2112 | if (ns == NULL) |
2113 | return false; |
2114 | if (ns == FullMatchState || |
2115 | (ns > SpecialStateMax && ns->ninst_ > 0)) { |
2116 | extended = true; |
2117 | max->append(1, static_cast<char>(j)); |
2118 | s = ns; |
2119 | break; |
2120 | } |
2121 | } |
2122 | if (!extended) { |
2123 | // Done, no need for PrefixSuccessor. |
2124 | return true; |
2125 | } |
2126 | } |
2127 | |
2128 | // Stopped while still adding to *max - round aaaaaaaaaa... to aaaa...b |
2129 | PrefixSuccessor(max); |
2130 | |
2131 | // If there are no bytes left, we have no way to say "there is no maximum |
2132 | // string". We could make the interface more complicated and be able to |
2133 | // return "there is no maximum but here is a minimum", but that seems like |
2134 | // overkill -- the most common no-max case is all possible strings, so not |
2135 | // telling the caller that the empty string is the minimum match isn't a |
2136 | // great loss. |
2137 | if (max->empty()) |
2138 | return false; |
2139 | |
2140 | return true; |
2141 | } |
2142 | |
2143 | // PossibleMatchRange for a Prog. |
2144 | bool Prog::PossibleMatchRange(std::string* min, std::string* max, int maxlen) { |
2145 | // Have to use dfa_longest_ to get all strings for full matches. |
2146 | // For example, (a|aa) never matches aa in first-match mode. |
2147 | return GetDFA(kLongestMatch)->PossibleMatchRange(min, max, maxlen); |
2148 | } |
2149 | |
2150 | } // namespace re2 |
2151 | |