1 | // Copyright 2007 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | #ifndef RE2_PROG_H_ |
6 | #define RE2_PROG_H_ |
7 | |
8 | // Compiled representation of regular expressions. |
9 | // See regexp.h for the Regexp class, which represents a regular |
10 | // expression symbolically. |
11 | |
12 | #include <stdint.h> |
13 | #include <functional> |
14 | #include <mutex> |
15 | #include <string> |
16 | #include <vector> |
17 | #include <type_traits> |
18 | |
19 | #include "util/util.h" |
20 | #include "util/logging.h" |
21 | #include "re2/pod_array.h" |
22 | #include "re2/re2.h" |
23 | #include "re2/sparse_array.h" |
24 | #include "re2/sparse_set.h" |
25 | |
26 | namespace re2 { |
27 | |
28 | // Opcodes for Inst |
29 | enum InstOp { |
30 | kInstAlt = 0, // choose between out_ and out1_ |
31 | kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa. |
32 | kInstByteRange, // next (possible case-folded) byte must be in [lo_, hi_] |
33 | kInstCapture, // capturing parenthesis number cap_ |
34 | kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_ |
35 | kInstMatch, // found a match! |
36 | kInstNop, // no-op; occasionally unavoidable |
37 | kInstFail, // never match; occasionally unavoidable |
38 | kNumInst, |
39 | }; |
40 | |
41 | // Bit flags for empty-width specials |
42 | enum EmptyOp { |
43 | kEmptyBeginLine = 1<<0, // ^ - beginning of line |
44 | kEmptyEndLine = 1<<1, // $ - end of line |
45 | kEmptyBeginText = 1<<2, // \A - beginning of text |
46 | kEmptyEndText = 1<<3, // \z - end of text |
47 | kEmptyWordBoundary = 1<<4, // \b - word boundary |
48 | kEmptyNonWordBoundary = 1<<5, // \B - not \b |
49 | kEmptyAllFlags = (1<<6)-1, |
50 | }; |
51 | |
52 | class DFA; |
53 | class Regexp; |
54 | |
55 | // Compiled form of regexp program. |
56 | class Prog { |
57 | public: |
58 | Prog(); |
59 | ~Prog(); |
60 | |
61 | // Single instruction in regexp program. |
62 | class Inst { |
63 | public: |
64 | // See the assertion below for why this is so. |
65 | Inst() = default; |
66 | |
67 | // Copyable. |
68 | Inst(const Inst&) = default; |
69 | Inst& operator=(const Inst&) = default; |
70 | |
71 | // Constructors per opcode |
72 | void InitAlt(uint32_t out, uint32_t out1); |
73 | void InitByteRange(int lo, int hi, int foldcase, uint32_t out); |
74 | void InitCapture(int cap, uint32_t out); |
75 | void InitEmptyWidth(EmptyOp empty, uint32_t out); |
76 | void InitMatch(int id); |
77 | void InitNop(uint32_t out); |
78 | void InitFail(); |
79 | |
80 | // Getters |
81 | int id(Prog* p) { return static_cast<int>(this - p->inst_.data()); } |
82 | InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); } |
83 | int last() { return (out_opcode_>>3)&1; } |
84 | int out() { return out_opcode_>>4; } |
85 | int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; } |
86 | int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; } |
87 | int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; } |
88 | int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; } |
89 | int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return hint_foldcase_&1; } |
90 | int hint() { DCHECK_EQ(opcode(), kInstByteRange); return hint_foldcase_>>1; } |
91 | int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; } |
92 | EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; } |
93 | |
94 | bool greedy(Prog* p) { |
95 | DCHECK_EQ(opcode(), kInstAltMatch); |
96 | return p->inst(out())->opcode() == kInstByteRange || |
97 | (p->inst(out())->opcode() == kInstNop && |
98 | p->inst(p->inst(out())->out())->opcode() == kInstByteRange); |
99 | } |
100 | |
101 | // Does this inst (an kInstByteRange) match c? |
102 | inline bool Matches(int c) { |
103 | DCHECK_EQ(opcode(), kInstByteRange); |
104 | if (foldcase() && 'A' <= c && c <= 'Z') |
105 | c += 'a' - 'A'; |
106 | return lo_ <= c && c <= hi_; |
107 | } |
108 | |
109 | // Returns string representation for debugging. |
110 | std::string Dump(); |
111 | |
112 | // Maximum instruction id. |
113 | // (Must fit in out_opcode_. PatchList/last steal another bit.) |
114 | static const int kMaxInst = (1<<28) - 1; |
115 | |
116 | private: |
117 | void set_opcode(InstOp opcode) { |
118 | out_opcode_ = (out()<<4) | (last()<<3) | opcode; |
119 | } |
120 | |
121 | void set_last() { |
122 | out_opcode_ = (out()<<4) | (1<<3) | opcode(); |
123 | } |
124 | |
125 | void set_out(int out) { |
126 | out_opcode_ = (out<<4) | (last()<<3) | opcode(); |
127 | } |
128 | |
129 | void set_out_opcode(int out, InstOp opcode) { |
130 | out_opcode_ = (out<<4) | (last()<<3) | opcode; |
131 | } |
132 | |
133 | uint32_t out_opcode_; // 28 bits: out, 1 bit: last, 3 (low) bits: opcode |
134 | union { // additional instruction arguments: |
135 | uint32_t out1_; // opcode == kInstAlt |
136 | // alternate next instruction |
137 | |
138 | int32_t cap_; // opcode == kInstCapture |
139 | // Index of capture register (holds text |
140 | // position recorded by capturing parentheses). |
141 | // For \n (the submatch for the nth parentheses), |
142 | // the left parenthesis captures into register 2*n |
143 | // and the right one captures into register 2*n+1. |
144 | |
145 | int32_t match_id_; // opcode == kInstMatch |
146 | // Match ID to identify this match (for re2::Set). |
147 | |
148 | struct { // opcode == kInstByteRange |
149 | uint8_t lo_; // byte range is lo_-hi_ inclusive |
150 | uint8_t hi_; // |
151 | uint16_t hint_foldcase_; // 15 bits: hint, 1 (low) bit: foldcase |
152 | // hint to execution engines: the delta to the |
153 | // next instruction (in the current list) worth |
154 | // exploring iff this instruction matched; 0 |
155 | // means there are no remaining possibilities, |
156 | // which is most likely for character classes. |
157 | // foldcase: A-Z -> a-z before checking range. |
158 | }; |
159 | |
160 | EmptyOp empty_; // opcode == kInstEmptyWidth |
161 | // empty_ is bitwise OR of kEmpty* flags above. |
162 | }; |
163 | |
164 | friend class Compiler; |
165 | friend struct PatchList; |
166 | friend class Prog; |
167 | }; |
168 | |
169 | // Inst must be trivial so that we can freely clear it with memset(3). |
170 | // Arrays of Inst are initialised by copying the initial elements with |
171 | // memmove(3) and then clearing any remaining elements with memset(3). |
172 | static_assert(std::is_trivial<Inst>::value, "Inst must be trivial" ); |
173 | |
174 | // Whether to anchor the search. |
175 | enum Anchor { |
176 | kUnanchored, // match anywhere |
177 | kAnchored, // match only starting at beginning of text |
178 | }; |
179 | |
180 | // Kind of match to look for (for anchor != kFullMatch) |
181 | // |
182 | // kLongestMatch mode finds the overall longest |
183 | // match but still makes its submatch choices the way |
184 | // Perl would, not in the way prescribed by POSIX. |
185 | // The POSIX rules are much more expensive to implement, |
186 | // and no one has needed them. |
187 | // |
188 | // kFullMatch is not strictly necessary -- we could use |
189 | // kLongestMatch and then check the length of the match -- but |
190 | // the matching code can run faster if it knows to consider only |
191 | // full matches. |
192 | enum MatchKind { |
193 | kFirstMatch, // like Perl, PCRE |
194 | kLongestMatch, // like egrep or POSIX |
195 | kFullMatch, // match only entire text; implies anchor==kAnchored |
196 | kManyMatch // for SearchDFA, records set of matches |
197 | }; |
198 | |
199 | Inst *inst(int id) { return &inst_[id]; } |
200 | int start() { return start_; } |
201 | int start_unanchored() { return start_unanchored_; } |
202 | void set_start(int start) { start_ = start; } |
203 | void set_start_unanchored(int start) { start_unanchored_ = start; } |
204 | int size() { return size_; } |
205 | bool reversed() { return reversed_; } |
206 | void set_reversed(bool reversed) { reversed_ = reversed; } |
207 | int list_count() { return list_count_; } |
208 | int inst_count(InstOp op) { return inst_count_[op]; } |
209 | uint16_t* list_heads() { return list_heads_.data(); } |
210 | void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; } |
211 | int64_t dfa_mem() { return dfa_mem_; } |
212 | int flags() { return flags_; } |
213 | void set_flags(int flags) { flags_ = flags; } |
214 | bool anchor_start() { return anchor_start_; } |
215 | void set_anchor_start(bool b) { anchor_start_ = b; } |
216 | bool anchor_end() { return anchor_end_; } |
217 | void set_anchor_end(bool b) { anchor_end_ = b; } |
218 | int bytemap_range() { return bytemap_range_; } |
219 | const uint8_t* bytemap() { return bytemap_; } |
220 | |
221 | // Lazily computed. |
222 | int first_byte(); |
223 | |
224 | // Returns string representation of program for debugging. |
225 | std::string Dump(); |
226 | std::string DumpUnanchored(); |
227 | std::string DumpByteMap(); |
228 | |
229 | // Returns the set of kEmpty flags that are in effect at |
230 | // position p within context. |
231 | static uint32_t EmptyFlags(const StringPiece& context, const char* p); |
232 | |
233 | // Returns whether byte c is a word character: ASCII only. |
234 | // Used by the implementation of \b and \B. |
235 | // This is not right for Unicode, but: |
236 | // - it's hard to get right in a byte-at-a-time matching world |
237 | // (the DFA has only one-byte lookahead). |
238 | // - even if the lookahead were possible, the Progs would be huge. |
239 | // This crude approximation is the same one PCRE uses. |
240 | static bool IsWordChar(uint8_t c) { |
241 | return ('A' <= c && c <= 'Z') || |
242 | ('a' <= c && c <= 'z') || |
243 | ('0' <= c && c <= '9') || |
244 | c == '_'; |
245 | } |
246 | |
247 | // Execution engines. They all search for the regexp (run the prog) |
248 | // in text, which is in the larger context (used for ^ $ \b etc). |
249 | // Anchor and kind control the kind of search. |
250 | // Returns true if match found, false if not. |
251 | // If match found, fills match[0..nmatch-1] with submatch info. |
252 | // match[0] is overall match, match[1] is first set of parens, etc. |
253 | // If a particular submatch is not matched during the regexp match, |
254 | // it is set to NULL. |
255 | // |
256 | // Matching text == StringPiece(NULL, 0) is treated as any other empty |
257 | // string, but note that on return, it will not be possible to distinguish |
258 | // submatches that matched that empty string from submatches that didn't |
259 | // match anything. Either way, match[i] == NULL. |
260 | |
261 | // Search using NFA: can find submatches but kind of slow. |
262 | bool SearchNFA(const StringPiece& text, const StringPiece& context, |
263 | Anchor anchor, MatchKind kind, |
264 | StringPiece* match, int nmatch); |
265 | |
266 | // Search using DFA: much faster than NFA but only finds |
267 | // end of match and can use a lot more memory. |
268 | // Returns whether a match was found. |
269 | // If the DFA runs out of memory, sets *failed to true and returns false. |
270 | // If matches != NULL and kind == kManyMatch and there is a match, |
271 | // SearchDFA fills matches with the match IDs of the final matching state. |
272 | bool SearchDFA(const StringPiece& text, const StringPiece& context, |
273 | Anchor anchor, MatchKind kind, StringPiece* match0, |
274 | bool* failed, SparseSet* matches); |
275 | |
276 | // The callback issued after building each DFA state with BuildEntireDFA(). |
277 | // If next is null, then the memory budget has been exhausted and building |
278 | // will halt. Otherwise, the state has been built and next points to an array |
279 | // of bytemap_range()+1 slots holding the next states as per the bytemap and |
280 | // kByteEndText. The number of the state is implied by the callback sequence: |
281 | // the first callback is for state 0, the second callback is for state 1, ... |
282 | // match indicates whether the state is a matching state. |
283 | using DFAStateCallback = std::function<void(const int* next, bool match)>; |
284 | |
285 | // Build the entire DFA for the given match kind. |
286 | // Usually the DFA is built out incrementally, as needed, which |
287 | // avoids lots of unnecessary work. |
288 | // If cb is not empty, it receives one callback per state built. |
289 | // Returns the number of states built. |
290 | // FOR TESTING OR EXPERIMENTAL PURPOSES ONLY. |
291 | int BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb); |
292 | |
293 | // Controls whether the DFA should bail out early if the NFA would be faster. |
294 | // FOR TESTING ONLY. |
295 | static void TEST_dfa_should_bail_when_slow(bool b); |
296 | |
297 | // Compute bytemap. |
298 | void ComputeByteMap(); |
299 | |
300 | // Computes whether all matches must begin with the same first |
301 | // byte, and if so, returns that byte. If not, returns -1. |
302 | int ComputeFirstByte(); |
303 | |
304 | // Run peep-hole optimizer on program. |
305 | void Optimize(); |
306 | |
307 | // One-pass NFA: only correct if IsOnePass() is true, |
308 | // but much faster than NFA (competitive with PCRE) |
309 | // for those expressions. |
310 | bool IsOnePass(); |
311 | bool SearchOnePass(const StringPiece& text, const StringPiece& context, |
312 | Anchor anchor, MatchKind kind, |
313 | StringPiece* match, int nmatch); |
314 | |
315 | // Bit-state backtracking. Fast on small cases but uses memory |
316 | // proportional to the product of the list count and the text size. |
317 | bool CanBitState() { return list_heads_.data() != NULL; } |
318 | bool SearchBitState(const StringPiece& text, const StringPiece& context, |
319 | Anchor anchor, MatchKind kind, |
320 | StringPiece* match, int nmatch); |
321 | |
322 | static const int kMaxOnePassCapture = 5; // $0 through $4 |
323 | |
324 | // Backtracking search: the gold standard against which the other |
325 | // implementations are checked. FOR TESTING ONLY. |
326 | // It allocates a ton of memory to avoid running forever. |
327 | // It is also recursive, so can't use in production (will overflow stacks). |
328 | // The name "Unsafe" here is supposed to be a flag that |
329 | // you should not be using this function. |
330 | bool UnsafeSearchBacktrack(const StringPiece& text, |
331 | const StringPiece& context, |
332 | Anchor anchor, MatchKind kind, |
333 | StringPiece* match, int nmatch); |
334 | |
335 | // Computes range for any strings matching regexp. The min and max can in |
336 | // some cases be arbitrarily precise, so the caller gets to specify the |
337 | // maximum desired length of string returned. |
338 | // |
339 | // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any |
340 | // string s that is an anchored match for this regexp satisfies |
341 | // min <= s && s <= max. |
342 | // |
343 | // Note that PossibleMatchRange() will only consider the first copy of an |
344 | // infinitely repeated element (i.e., any regexp element followed by a '*' or |
345 | // '+' operator). Regexps with "{N}" constructions are not affected, as those |
346 | // do not compile down to infinite repetitions. |
347 | // |
348 | // Returns true on success, false on error. |
349 | bool PossibleMatchRange(std::string* min, std::string* max, int maxlen); |
350 | |
351 | // EXPERIMENTAL! SUBJECT TO CHANGE! |
352 | // Outputs the program fanout into the given sparse array. |
353 | void Fanout(SparseArray<int>* fanout); |
354 | |
355 | // Compiles a collection of regexps to Prog. Each regexp will have |
356 | // its own Match instruction recording the index in the output vector. |
357 | static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem); |
358 | |
359 | // Flattens the Prog from "tree" form to "list" form. This is an in-place |
360 | // operation in the sense that the old instructions are lost. |
361 | void Flatten(); |
362 | |
363 | // Walks the Prog; the "successor roots" or predecessors of the reachable |
364 | // instructions are marked in rootmap or predmap/predvec, respectively. |
365 | // reachable and stk are preallocated scratch structures. |
366 | void MarkSuccessors(SparseArray<int>* rootmap, |
367 | SparseArray<int>* predmap, |
368 | std::vector<std::vector<int>>* predvec, |
369 | SparseSet* reachable, std::vector<int>* stk); |
370 | |
371 | // Walks the Prog from the given "root" instruction; the "dominator root" |
372 | // of the reachable instructions (if such exists) is marked in rootmap. |
373 | // reachable and stk are preallocated scratch structures. |
374 | void MarkDominator(int root, SparseArray<int>* rootmap, |
375 | SparseArray<int>* predmap, |
376 | std::vector<std::vector<int>>* predvec, |
377 | SparseSet* reachable, std::vector<int>* stk); |
378 | |
379 | // Walks the Prog from the given "root" instruction; the reachable |
380 | // instructions are emitted in "list" form and appended to flat. |
381 | // reachable and stk are preallocated scratch structures. |
382 | void EmitList(int root, SparseArray<int>* rootmap, |
383 | std::vector<Inst>* flat, |
384 | SparseSet* reachable, std::vector<int>* stk); |
385 | |
386 | // Computes hints for ByteRange instructions in [begin, end). |
387 | void ComputeHints(std::vector<Inst>* flat, int begin, int end); |
388 | |
389 | private: |
390 | friend class Compiler; |
391 | |
392 | DFA* GetDFA(MatchKind kind); |
393 | void DeleteDFA(DFA* dfa); |
394 | |
395 | bool anchor_start_; // regexp has explicit start anchor |
396 | bool anchor_end_; // regexp has explicit end anchor |
397 | bool reversed_; // whether program runs backward over input |
398 | bool did_flatten_; // has Flatten been called? |
399 | bool did_onepass_; // has IsOnePass been called? |
400 | |
401 | int start_; // entry point for program |
402 | int start_unanchored_; // unanchored entry point for program |
403 | int size_; // number of instructions |
404 | int bytemap_range_; // bytemap_[x] < bytemap_range_ |
405 | int first_byte_; // required first byte for match, or -1 if none |
406 | int flags_; // regexp parse flags |
407 | |
408 | int list_count_; // count of lists (see above) |
409 | int inst_count_[kNumInst]; // count of instructions by opcode |
410 | PODArray<uint16_t> list_heads_; // sparse array enumerating list heads |
411 | // not populated if size_ is overly large |
412 | |
413 | PODArray<Inst> inst_; // pointer to instruction array |
414 | PODArray<uint8_t> onepass_nodes_; // data for OnePass nodes |
415 | |
416 | int64_t dfa_mem_; // Maximum memory for DFAs. |
417 | DFA* dfa_first_; // DFA cached for kFirstMatch/kManyMatch |
418 | DFA* dfa_longest_; // DFA cached for kLongestMatch/kFullMatch |
419 | |
420 | uint8_t bytemap_[256]; // map from input bytes to byte classes |
421 | |
422 | std::once_flag first_byte_once_; |
423 | std::once_flag dfa_first_once_; |
424 | std::once_flag dfa_longest_once_; |
425 | |
426 | Prog(const Prog&) = delete; |
427 | Prog& operator=(const Prog&) = delete; |
428 | }; |
429 | |
430 | } // namespace re2 |
431 | |
432 | #endif // RE2_PROG_H_ |
433 | |