1 | // Copyright 2007 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | // Compile regular expression to Prog. |
6 | // |
7 | // Prog and Inst are defined in prog.h. |
8 | // This file's external interface is just Regexp::CompileToProg. |
9 | // The Compiler class defined in this file is private. |
10 | |
11 | #include <stdint.h> |
12 | #include <string.h> |
13 | #include <unordered_map> |
14 | #include <utility> |
15 | |
16 | #include "util/logging.h" |
17 | #include "util/utf.h" |
18 | #include "re2/pod_array.h" |
19 | #include "re2/prog.h" |
20 | #include "re2/re2.h" |
21 | #include "re2/regexp.h" |
22 | #include "re2/walker-inl.h" |
23 | |
24 | namespace re2 { |
25 | |
26 | // List of pointers to Inst* that need to be filled in (patched). |
27 | // Because the Inst* haven't been filled in yet, |
28 | // we can use the Inst* word to hold the list's "next" pointer. |
29 | // It's kind of sleazy, but it works well in practice. |
30 | // See http://swtch.com/~rsc/regexp/regexp1.html for inspiration. |
31 | // |
32 | // Because the out and out1 fields in Inst are no longer pointers, |
33 | // we can't use pointers directly here either. Instead, p refers |
34 | // to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1). |
35 | // p == 0 represents the NULL list. This is okay because instruction #0 |
36 | // is always the fail instruction, which never appears on a list. |
37 | |
38 | struct PatchList { |
39 | uint32_t p; |
40 | |
41 | // Returns patch list containing just p. |
42 | static PatchList Mk(uint32_t p); |
43 | |
44 | // Patches all the entries on l to have value v. |
45 | // Caller must not ever use patch list again. |
46 | static void Patch(Prog::Inst *inst0, PatchList l, uint32_t v); |
47 | |
48 | // Deref returns the next pointer pointed at by p. |
49 | static PatchList Deref(Prog::Inst *inst0, PatchList l); |
50 | |
51 | // Appends two patch lists and returns result. |
52 | static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2); |
53 | }; |
54 | |
55 | static PatchList nullPatchList = { 0 }; |
56 | |
57 | // Returns patch list containing just p. |
58 | PatchList PatchList::Mk(uint32_t p) { |
59 | PatchList l; |
60 | l.p = p; |
61 | return l; |
62 | } |
63 | |
64 | // Returns the next pointer pointed at by l. |
65 | PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) { |
66 | Prog::Inst* ip = &inst0[l.p>>1]; |
67 | if (l.p&1) |
68 | l.p = ip->out1(); |
69 | else |
70 | l.p = ip->out(); |
71 | return l; |
72 | } |
73 | |
74 | // Patches all the entries on l to have value v. |
75 | void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32_t val) { |
76 | while (l.p != 0) { |
77 | Prog::Inst* ip = &inst0[l.p>>1]; |
78 | if (l.p&1) { |
79 | l.p = ip->out1(); |
80 | ip->out1_ = val; |
81 | } else { |
82 | l.p = ip->out(); |
83 | ip->set_out(val); |
84 | } |
85 | } |
86 | } |
87 | |
88 | // Appends two patch lists and returns result. |
89 | PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) { |
90 | if (l1.p == 0) |
91 | return l2; |
92 | if (l2.p == 0) |
93 | return l1; |
94 | |
95 | PatchList l = l1; |
96 | for (;;) { |
97 | PatchList next = PatchList::Deref(inst0, l); |
98 | if (next.p == 0) |
99 | break; |
100 | l = next; |
101 | } |
102 | |
103 | Prog::Inst* ip = &inst0[l.p>>1]; |
104 | if (l.p&1) |
105 | ip->out1_ = l2.p; |
106 | else |
107 | ip->set_out(l2.p); |
108 | |
109 | return l1; |
110 | } |
111 | |
112 | // Compiled program fragment. |
113 | struct Frag { |
114 | uint32_t begin; |
115 | PatchList end; |
116 | |
117 | Frag() : begin(0) { end.p = 0; } // needed so Frag can go in vector |
118 | Frag(uint32_t begin, PatchList end) : begin(begin), end(end) {} |
119 | }; |
120 | |
121 | // Input encodings. |
122 | enum Encoding { |
123 | kEncodingUTF8 = 1, // UTF-8 (0-10FFFF) |
124 | kEncodingLatin1, // Latin-1 (0-FF) |
125 | }; |
126 | |
127 | class Compiler : public Regexp::Walker<Frag> { |
128 | public: |
129 | explicit Compiler(); |
130 | ~Compiler(); |
131 | |
132 | // Compiles Regexp to a new Prog. |
133 | // Caller is responsible for deleting Prog when finished with it. |
134 | // If reversed is true, compiles for walking over the input |
135 | // string backward (reverses all concatenations). |
136 | static Prog *Compile(Regexp* re, bool reversed, int64_t max_mem); |
137 | |
138 | // Compiles alternation of all the re to a new Prog. |
139 | // Each re has a match with an id equal to its index in the vector. |
140 | static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem); |
141 | |
142 | // Interface for Regexp::Walker, which helps traverse the Regexp. |
143 | // The walk is purely post-recursive: given the machines for the |
144 | // children, PostVisit combines them to create the machine for |
145 | // the current node. The child_args are Frags. |
146 | // The Compiler traverses the Regexp parse tree, visiting |
147 | // each node in depth-first order. It invokes PreVisit before |
148 | // visiting the node's children and PostVisit after visiting |
149 | // the children. |
150 | Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop); |
151 | Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args, |
152 | int nchild_args); |
153 | Frag ShortVisit(Regexp* re, Frag parent_arg); |
154 | Frag Copy(Frag arg); |
155 | |
156 | // Given fragment a, returns a+ or a+?; a* or a*?; a? or a?? |
157 | Frag Plus(Frag a, bool nongreedy); |
158 | Frag Star(Frag a, bool nongreedy); |
159 | Frag Quest(Frag a, bool nongreedy); |
160 | |
161 | // Given fragment a, returns (a) capturing as \n. |
162 | Frag Capture(Frag a, int n); |
163 | |
164 | // Given fragments a and b, returns ab; a|b |
165 | Frag Cat(Frag a, Frag b); |
166 | Frag Alt(Frag a, Frag b); |
167 | |
168 | // Returns a fragment that can't match anything. |
169 | Frag NoMatch(); |
170 | |
171 | // Returns a fragment that matches the empty string. |
172 | Frag Match(int32_t id); |
173 | |
174 | // Returns a no-op fragment. |
175 | Frag Nop(); |
176 | |
177 | // Returns a fragment matching the byte range lo-hi. |
178 | Frag ByteRange(int lo, int hi, bool foldcase); |
179 | |
180 | // Returns a fragment matching an empty-width special op. |
181 | Frag EmptyWidth(EmptyOp op); |
182 | |
183 | // Adds n instructions to the program. |
184 | // Returns the index of the first one. |
185 | // Returns -1 if no more instructions are available. |
186 | int AllocInst(int n); |
187 | |
188 | // Rune range compiler. |
189 | |
190 | // Begins a new alternation. |
191 | void BeginRange(); |
192 | |
193 | // Adds a fragment matching the rune range lo-hi. |
194 | void AddRuneRange(Rune lo, Rune hi, bool foldcase); |
195 | void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase); |
196 | void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase); |
197 | void Add_80_10ffff(); |
198 | |
199 | // New suffix that matches the byte range lo-hi, then goes to next. |
200 | int UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next); |
201 | int CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next); |
202 | |
203 | // Returns true iff the suffix is cached. |
204 | bool IsCachedRuneByteSuffix(int id); |
205 | |
206 | // Adds a suffix to alternation. |
207 | void AddSuffix(int id); |
208 | |
209 | // Adds a suffix to the trie starting from the given root node. |
210 | // Returns zero iff allocating an instruction fails. Otherwise, returns |
211 | // the current root node, which might be different from what was given. |
212 | int AddSuffixRecursive(int root, int id); |
213 | |
214 | // Finds the trie node for the given suffix. Returns a Frag in order to |
215 | // distinguish between pointing at the root node directly (end.p == 0) |
216 | // and pointing at an Alt's out1 or out (end.p&1 == 1 or 0, respectively). |
217 | Frag FindByteRange(int root, int id); |
218 | |
219 | // Compares two ByteRanges and returns true iff they are equal. |
220 | bool ByteRangeEqual(int id1, int id2); |
221 | |
222 | // Returns the alternation of all the added suffixes. |
223 | Frag EndRange(); |
224 | |
225 | // Single rune. |
226 | Frag Literal(Rune r, bool foldcase); |
227 | |
228 | void Setup(Regexp::ParseFlags, int64_t, RE2::Anchor); |
229 | Prog* Finish(); |
230 | |
231 | // Returns .* where dot = any byte |
232 | Frag DotStar(); |
233 | |
234 | private: |
235 | Prog* prog_; // Program being built. |
236 | bool failed_; // Did we give up compiling? |
237 | Encoding encoding_; // Input encoding |
238 | bool reversed_; // Should program run backward over text? |
239 | |
240 | PODArray<Prog::Inst> inst_; |
241 | int ninst_; // Number of instructions used. |
242 | int max_ninst_; // Maximum number of instructions. |
243 | |
244 | int64_t max_mem_; // Total memory budget. |
245 | |
246 | std::unordered_map<uint64_t, int> rune_cache_; |
247 | Frag rune_range_; |
248 | |
249 | RE2::Anchor anchor_; // anchor mode for RE2::Set |
250 | |
251 | Compiler(const Compiler&) = delete; |
252 | Compiler& operator=(const Compiler&) = delete; |
253 | }; |
254 | |
255 | Compiler::Compiler() { |
256 | prog_ = new Prog(); |
257 | failed_ = false; |
258 | encoding_ = kEncodingUTF8; |
259 | reversed_ = false; |
260 | ninst_ = 0; |
261 | max_ninst_ = 1; // make AllocInst for fail instruction okay |
262 | max_mem_ = 0; |
263 | int fail = AllocInst(1); |
264 | inst_[fail].InitFail(); |
265 | max_ninst_ = 0; // Caller must change |
266 | } |
267 | |
268 | Compiler::~Compiler() { |
269 | delete prog_; |
270 | } |
271 | |
272 | int Compiler::AllocInst(int n) { |
273 | if (failed_ || ninst_ + n > max_ninst_) { |
274 | failed_ = true; |
275 | return -1; |
276 | } |
277 | |
278 | if (ninst_ + n > inst_.size()) { |
279 | int cap = inst_.size(); |
280 | if (cap == 0) |
281 | cap = 8; |
282 | while (ninst_ + n > cap) |
283 | cap *= 2; |
284 | PODArray<Prog::Inst> inst(cap); |
285 | if (inst_.data() != NULL) |
286 | memmove(inst.data(), inst_.data(), ninst_*sizeof inst_[0]); |
287 | memset(inst.data() + ninst_, 0, (cap - ninst_)*sizeof inst_[0]); |
288 | inst_ = std::move(inst); |
289 | } |
290 | int id = ninst_; |
291 | ninst_ += n; |
292 | return id; |
293 | } |
294 | |
295 | // These routines are somewhat hard to visualize in text -- |
296 | // see http://swtch.com/~rsc/regexp/regexp1.html for |
297 | // pictures explaining what is going on here. |
298 | |
299 | // Returns an unmatchable fragment. |
300 | Frag Compiler::NoMatch() { |
301 | return Frag(0, nullPatchList); |
302 | } |
303 | |
304 | // Is a an unmatchable fragment? |
305 | static bool IsNoMatch(Frag a) { |
306 | return a.begin == 0; |
307 | } |
308 | |
309 | // Given fragments a and b, returns fragment for ab. |
310 | Frag Compiler::Cat(Frag a, Frag b) { |
311 | if (IsNoMatch(a) || IsNoMatch(b)) |
312 | return NoMatch(); |
313 | |
314 | // Elide no-op. |
315 | Prog::Inst* begin = &inst_[a.begin]; |
316 | if (begin->opcode() == kInstNop && |
317 | a.end.p == (a.begin << 1) && |
318 | begin->out() == 0) { |
319 | // in case refs to a somewhere |
320 | PatchList::Patch(inst_.data(), a.end, b.begin); |
321 | return b; |
322 | } |
323 | |
324 | // To run backward over string, reverse all concatenations. |
325 | if (reversed_) { |
326 | PatchList::Patch(inst_.data(), b.end, a.begin); |
327 | return Frag(b.begin, a.end); |
328 | } |
329 | |
330 | PatchList::Patch(inst_.data(), a.end, b.begin); |
331 | return Frag(a.begin, b.end); |
332 | } |
333 | |
334 | // Given fragments for a and b, returns fragment for a|b. |
335 | Frag Compiler::Alt(Frag a, Frag b) { |
336 | // Special case for convenience in loops. |
337 | if (IsNoMatch(a)) |
338 | return b; |
339 | if (IsNoMatch(b)) |
340 | return a; |
341 | |
342 | int id = AllocInst(1); |
343 | if (id < 0) |
344 | return NoMatch(); |
345 | |
346 | inst_[id].InitAlt(a.begin, b.begin); |
347 | return Frag(id, PatchList::Append(inst_.data(), a.end, b.end)); |
348 | } |
349 | |
350 | // When capturing submatches in like-Perl mode, a kOpAlt Inst |
351 | // treats out_ as the first choice, out1_ as the second. |
352 | // |
353 | // For *, +, and ?, if out_ causes another repetition, |
354 | // then the operator is greedy. If out1_ is the repetition |
355 | // (and out_ moves forward), then the operator is non-greedy. |
356 | |
357 | // Given a fragment a, returns a fragment for a* or a*? (if nongreedy) |
358 | Frag Compiler::Star(Frag a, bool nongreedy) { |
359 | int id = AllocInst(1); |
360 | if (id < 0) |
361 | return NoMatch(); |
362 | inst_[id].InitAlt(0, 0); |
363 | PatchList::Patch(inst_.data(), a.end, id); |
364 | if (nongreedy) { |
365 | inst_[id].out1_ = a.begin; |
366 | return Frag(id, PatchList::Mk(id << 1)); |
367 | } else { |
368 | inst_[id].set_out(a.begin); |
369 | return Frag(id, PatchList::Mk((id << 1) | 1)); |
370 | } |
371 | } |
372 | |
373 | // Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy) |
374 | Frag Compiler::Plus(Frag a, bool nongreedy) { |
375 | // a+ is just a* with a different entry point. |
376 | Frag f = Star(a, nongreedy); |
377 | return Frag(a.begin, f.end); |
378 | } |
379 | |
380 | // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy) |
381 | Frag Compiler::Quest(Frag a, bool nongreedy) { |
382 | if (IsNoMatch(a)) |
383 | return Nop(); |
384 | int id = AllocInst(1); |
385 | if (id < 0) |
386 | return NoMatch(); |
387 | PatchList pl; |
388 | if (nongreedy) { |
389 | inst_[id].InitAlt(0, a.begin); |
390 | pl = PatchList::Mk(id << 1); |
391 | } else { |
392 | inst_[id].InitAlt(a.begin, 0); |
393 | pl = PatchList::Mk((id << 1) | 1); |
394 | } |
395 | return Frag(id, PatchList::Append(inst_.data(), pl, a.end)); |
396 | } |
397 | |
398 | // Returns a fragment for the byte range lo-hi. |
399 | Frag Compiler::ByteRange(int lo, int hi, bool foldcase) { |
400 | int id = AllocInst(1); |
401 | if (id < 0) |
402 | return NoMatch(); |
403 | inst_[id].InitByteRange(lo, hi, foldcase, 0); |
404 | return Frag(id, PatchList::Mk(id << 1)); |
405 | } |
406 | |
407 | // Returns a no-op fragment. Sometimes unavoidable. |
408 | Frag Compiler::Nop() { |
409 | int id = AllocInst(1); |
410 | if (id < 0) |
411 | return NoMatch(); |
412 | inst_[id].InitNop(0); |
413 | return Frag(id, PatchList::Mk(id << 1)); |
414 | } |
415 | |
416 | // Returns a fragment that signals a match. |
417 | Frag Compiler::Match(int32_t match_id) { |
418 | int id = AllocInst(1); |
419 | if (id < 0) |
420 | return NoMatch(); |
421 | inst_[id].InitMatch(match_id); |
422 | return Frag(id, nullPatchList); |
423 | } |
424 | |
425 | // Returns a fragment matching a particular empty-width op (like ^ or $) |
426 | Frag Compiler::EmptyWidth(EmptyOp empty) { |
427 | int id = AllocInst(1); |
428 | if (id < 0) |
429 | return NoMatch(); |
430 | inst_[id].InitEmptyWidth(empty, 0); |
431 | return Frag(id, PatchList::Mk(id << 1)); |
432 | } |
433 | |
434 | // Given a fragment a, returns a fragment with capturing parens around a. |
435 | Frag Compiler::Capture(Frag a, int n) { |
436 | if (IsNoMatch(a)) |
437 | return NoMatch(); |
438 | int id = AllocInst(2); |
439 | if (id < 0) |
440 | return NoMatch(); |
441 | inst_[id].InitCapture(2*n, a.begin); |
442 | inst_[id+1].InitCapture(2*n+1, 0); |
443 | PatchList::Patch(inst_.data(), a.end, id+1); |
444 | |
445 | return Frag(id, PatchList::Mk((id+1) << 1)); |
446 | } |
447 | |
448 | // A Rune is a name for a Unicode code point. |
449 | // Returns maximum rune encoded by UTF-8 sequence of length len. |
450 | static int MaxRune(int len) { |
451 | int b; // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax) |
452 | if (len == 1) |
453 | b = 7; |
454 | else |
455 | b = 8-(len+1) + 6*(len-1); |
456 | return (1<<b) - 1; // maximum Rune for b bits. |
457 | } |
458 | |
459 | // The rune range compiler caches common suffix fragments, |
460 | // which are very common in UTF-8 (e.g., [80-bf]). |
461 | // The fragment suffixes are identified by their start |
462 | // instructions. NULL denotes the eventual end match. |
463 | // The Frag accumulates in rune_range_. Caching common |
464 | // suffixes reduces the UTF-8 "." from 32 to 24 instructions, |
465 | // and it reduces the corresponding one-pass NFA from 16 nodes to 8. |
466 | |
467 | void Compiler::BeginRange() { |
468 | rune_cache_.clear(); |
469 | rune_range_.begin = 0; |
470 | rune_range_.end = nullPatchList; |
471 | } |
472 | |
473 | int Compiler::UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, |
474 | int next) { |
475 | Frag f = ByteRange(lo, hi, foldcase); |
476 | if (next != 0) { |
477 | PatchList::Patch(inst_.data(), f.end, next); |
478 | } else { |
479 | rune_range_.end = PatchList::Append(inst_.data(), rune_range_.end, f.end); |
480 | } |
481 | return f.begin; |
482 | } |
483 | |
484 | static uint64_t MakeRuneCacheKey(uint8_t lo, uint8_t hi, bool foldcase, |
485 | int next) { |
486 | return (uint64_t)next << 17 | |
487 | (uint64_t)lo << 9 | |
488 | (uint64_t)hi << 1 | |
489 | (uint64_t)foldcase; |
490 | } |
491 | |
492 | int Compiler::CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, |
493 | int next) { |
494 | uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next); |
495 | std::unordered_map<uint64_t, int>::const_iterator it = rune_cache_.find(key); |
496 | if (it != rune_cache_.end()) |
497 | return it->second; |
498 | int id = UncachedRuneByteSuffix(lo, hi, foldcase, next); |
499 | rune_cache_[key] = id; |
500 | return id; |
501 | } |
502 | |
503 | bool Compiler::IsCachedRuneByteSuffix(int id) { |
504 | uint8_t lo = inst_[id].lo_; |
505 | uint8_t hi = inst_[id].hi_; |
506 | bool foldcase = inst_[id].foldcase() != 0; |
507 | int next = inst_[id].out(); |
508 | |
509 | uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next); |
510 | return rune_cache_.find(key) != rune_cache_.end(); |
511 | } |
512 | |
513 | void Compiler::AddSuffix(int id) { |
514 | if (failed_) |
515 | return; |
516 | |
517 | if (rune_range_.begin == 0) { |
518 | rune_range_.begin = id; |
519 | return; |
520 | } |
521 | |
522 | if (encoding_ == kEncodingUTF8) { |
523 | // Build a trie in order to reduce fanout. |
524 | rune_range_.begin = AddSuffixRecursive(rune_range_.begin, id); |
525 | return; |
526 | } |
527 | |
528 | int alt = AllocInst(1); |
529 | if (alt < 0) { |
530 | rune_range_.begin = 0; |
531 | return; |
532 | } |
533 | inst_[alt].InitAlt(rune_range_.begin, id); |
534 | rune_range_.begin = alt; |
535 | } |
536 | |
537 | int Compiler::AddSuffixRecursive(int root, int id) { |
538 | DCHECK(inst_[root].opcode() == kInstAlt || |
539 | inst_[root].opcode() == kInstByteRange); |
540 | |
541 | Frag f = FindByteRange(root, id); |
542 | if (IsNoMatch(f)) { |
543 | int alt = AllocInst(1); |
544 | if (alt < 0) |
545 | return 0; |
546 | inst_[alt].InitAlt(root, id); |
547 | return alt; |
548 | } |
549 | |
550 | int br; |
551 | if (f.end.p == 0) |
552 | br = root; |
553 | else if (f.end.p&1) |
554 | br = inst_[f.begin].out1(); |
555 | else |
556 | br = inst_[f.begin].out(); |
557 | |
558 | if (IsCachedRuneByteSuffix(br)) { |
559 | // We can't fiddle with cached suffixes, so make a clone of the head. |
560 | int byterange = AllocInst(1); |
561 | if (byterange < 0) |
562 | return 0; |
563 | inst_[byterange].InitByteRange(inst_[br].lo(), inst_[br].hi(), |
564 | inst_[br].foldcase(), inst_[br].out()); |
565 | |
566 | // Ensure that the parent points to the clone, not to the original. |
567 | // Note that this could leave the head unreachable except via the cache. |
568 | br = byterange; |
569 | if (f.end.p == 0) |
570 | root = br; |
571 | else if (f.end.p&1) |
572 | inst_[f.begin].out1_ = br; |
573 | else |
574 | inst_[f.begin].set_out(br); |
575 | } |
576 | |
577 | int out = inst_[id].out(); |
578 | if (!IsCachedRuneByteSuffix(id)) { |
579 | // The head should be the instruction most recently allocated, so free it |
580 | // instead of leaving it unreachable. |
581 | DCHECK_EQ(id, ninst_-1); |
582 | inst_[id].out_opcode_ = 0; |
583 | inst_[id].out1_ = 0; |
584 | ninst_--; |
585 | } |
586 | |
587 | out = AddSuffixRecursive(inst_[br].out(), out); |
588 | if (out == 0) |
589 | return 0; |
590 | |
591 | inst_[br].set_out(out); |
592 | return root; |
593 | } |
594 | |
595 | bool Compiler::ByteRangeEqual(int id1, int id2) { |
596 | return inst_[id1].lo() == inst_[id2].lo() && |
597 | inst_[id1].hi() == inst_[id2].hi() && |
598 | inst_[id1].foldcase() == inst_[id2].foldcase(); |
599 | } |
600 | |
601 | Frag Compiler::FindByteRange(int root, int id) { |
602 | if (inst_[root].opcode() == kInstByteRange) { |
603 | if (ByteRangeEqual(root, id)) |
604 | return Frag(root, nullPatchList); |
605 | else |
606 | return NoMatch(); |
607 | } |
608 | |
609 | while (inst_[root].opcode() == kInstAlt) { |
610 | int out1 = inst_[root].out1(); |
611 | if (ByteRangeEqual(out1, id)) |
612 | return Frag(root, PatchList::Mk((root << 1) | 1)); |
613 | |
614 | // CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't |
615 | // what we're looking for, then we can stop immediately. Unfortunately, we |
616 | // can't short-circuit the search in reverse mode. |
617 | if (!reversed_) |
618 | return NoMatch(); |
619 | |
620 | int out = inst_[root].out(); |
621 | if (inst_[out].opcode() == kInstAlt) |
622 | root = out; |
623 | else if (ByteRangeEqual(out, id)) |
624 | return Frag(root, PatchList::Mk(root << 1)); |
625 | else |
626 | return NoMatch(); |
627 | } |
628 | |
629 | LOG(DFATAL) << "should never happen" ; |
630 | return NoMatch(); |
631 | } |
632 | |
633 | Frag Compiler::EndRange() { |
634 | return rune_range_; |
635 | } |
636 | |
637 | // Converts rune range lo-hi into a fragment that recognizes |
638 | // the bytes that would make up those runes in the current |
639 | // encoding (Latin 1 or UTF-8). |
640 | // This lets the machine work byte-by-byte even when |
641 | // using multibyte encodings. |
642 | |
643 | void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) { |
644 | switch (encoding_) { |
645 | default: |
646 | case kEncodingUTF8: |
647 | AddRuneRangeUTF8(lo, hi, foldcase); |
648 | break; |
649 | case kEncodingLatin1: |
650 | AddRuneRangeLatin1(lo, hi, foldcase); |
651 | break; |
652 | } |
653 | } |
654 | |
655 | void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) { |
656 | // Latin-1 is easy: runes *are* bytes. |
657 | if (lo > hi || lo > 0xFF) |
658 | return; |
659 | if (hi > 0xFF) |
660 | hi = 0xFF; |
661 | AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo), |
662 | static_cast<uint8_t>(hi), foldcase, 0)); |
663 | } |
664 | |
665 | // Table describing how to make a UTF-8 matching machine |
666 | // for the rune range 80-10FFFF (Runeself-Runemax). |
667 | // This range happens frequently enough (for example /./ and /[^a-z]/) |
668 | // and the rune_cache_ map is slow enough that this is worth |
669 | // special handling. Makes compilation of a small expression |
670 | // with a dot in it about 10% faster. |
671 | // The * in the comments below mark whole sequences. |
672 | static struct ByteRangeProg { |
673 | int next; |
674 | int lo; |
675 | int hi; |
676 | } prog_80_10ffff[] = { |
677 | // Two-byte |
678 | { -1, 0x80, 0xBF, }, // 0: 80-BF |
679 | { 0, 0xC2, 0xDF, }, // 1: C2-DF 80-BF* |
680 | |
681 | // Three-byte |
682 | { 0, 0xA0, 0xBF, }, // 2: A0-BF 80-BF |
683 | { 2, 0xE0, 0xE0, }, // 3: E0 A0-BF 80-BF* |
684 | { 0, 0x80, 0xBF, }, // 4: 80-BF 80-BF |
685 | { 4, 0xE1, 0xEF, }, // 5: E1-EF 80-BF 80-BF* |
686 | |
687 | // Four-byte |
688 | { 4, 0x90, 0xBF, }, // 6: 90-BF 80-BF 80-BF |
689 | { 6, 0xF0, 0xF0, }, // 7: F0 90-BF 80-BF 80-BF* |
690 | { 4, 0x80, 0xBF, }, // 8: 80-BF 80-BF 80-BF |
691 | { 8, 0xF1, 0xF3, }, // 9: F1-F3 80-BF 80-BF 80-BF* |
692 | { 4, 0x80, 0x8F, }, // 10: 80-8F 80-BF 80-BF |
693 | { 10, 0xF4, 0xF4, }, // 11: F4 80-8F 80-BF 80-BF* |
694 | }; |
695 | |
696 | void Compiler::Add_80_10ffff() { |
697 | int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialized; silences gcc warning |
698 | for (size_t i = 0; i < arraysize(prog_80_10ffff); i++) { |
699 | const ByteRangeProg& p = prog_80_10ffff[i]; |
700 | int next = 0; |
701 | if (p.next >= 0) |
702 | next = inst[p.next]; |
703 | inst[i] = UncachedRuneByteSuffix(static_cast<uint8_t>(p.lo), |
704 | static_cast<uint8_t>(p.hi), false, next); |
705 | if ((p.lo & 0xC0) != 0x80) |
706 | AddSuffix(inst[i]); |
707 | } |
708 | } |
709 | |
710 | void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { |
711 | if (lo > hi) |
712 | return; |
713 | |
714 | // Pick off 80-10FFFF as a common special case |
715 | // that can bypass the slow rune_cache_. |
716 | if (lo == 0x80 && hi == 0x10ffff && !reversed_) { |
717 | Add_80_10ffff(); |
718 | return; |
719 | } |
720 | |
721 | // Split range into same-length sized ranges. |
722 | for (int i = 1; i < UTFmax; i++) { |
723 | Rune max = MaxRune(i); |
724 | if (lo <= max && max < hi) { |
725 | AddRuneRangeUTF8(lo, max, foldcase); |
726 | AddRuneRangeUTF8(max+1, hi, foldcase); |
727 | return; |
728 | } |
729 | } |
730 | |
731 | // ASCII range is always a special case. |
732 | if (hi < Runeself) { |
733 | AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo), |
734 | static_cast<uint8_t>(hi), foldcase, 0)); |
735 | return; |
736 | } |
737 | |
738 | // Split range into sections that agree on leading bytes. |
739 | for (int i = 1; i < UTFmax; i++) { |
740 | uint32_t m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence |
741 | if ((lo & ~m) != (hi & ~m)) { |
742 | if ((lo & m) != 0) { |
743 | AddRuneRangeUTF8(lo, lo|m, foldcase); |
744 | AddRuneRangeUTF8((lo|m)+1, hi, foldcase); |
745 | return; |
746 | } |
747 | if ((hi & m) != m) { |
748 | AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase); |
749 | AddRuneRangeUTF8(hi&~m, hi, foldcase); |
750 | return; |
751 | } |
752 | } |
753 | } |
754 | |
755 | // Finally. Generate byte matching equivalent for lo-hi. |
756 | uint8_t ulo[UTFmax], uhi[UTFmax]; |
757 | int n = runetochar(reinterpret_cast<char*>(ulo), &lo); |
758 | int m = runetochar(reinterpret_cast<char*>(uhi), &hi); |
759 | (void)m; // USED(m) |
760 | DCHECK_EQ(n, m); |
761 | |
762 | // The logic below encodes this thinking: |
763 | // |
764 | // 1. When we have built the whole suffix, we know that it cannot |
765 | // possibly be a suffix of anything longer: in forward mode, nothing |
766 | // else can occur before the leading byte; in reverse mode, nothing |
767 | // else can occur after the last continuation byte or else the leading |
768 | // byte would have to change. Thus, there is no benefit to caching |
769 | // the first byte of the suffix whereas there is a cost involved in |
770 | // cloning it if it begins a common prefix, which is fairly likely. |
771 | // |
772 | // 2. Conversely, the last byte of the suffix cannot possibly be a |
773 | // prefix of anything because next == 0, so we will never want to |
774 | // clone it, but it is fairly likely to be a common suffix. Perhaps |
775 | // more so in reverse mode than in forward mode because the former is |
776 | // "converging" towards lower entropy, but caching is still worthwhile |
777 | // for the latter in cases such as 80-BF. |
778 | // |
779 | // 3. Handling the bytes between the first and the last is less |
780 | // straightforward and, again, the approach depends on whether we are |
781 | // "converging" towards lower entropy: in forward mode, a single byte |
782 | // is unlikely to be part of a common suffix whereas a byte range |
783 | // is more likely so; in reverse mode, a byte range is unlikely to |
784 | // be part of a common suffix whereas a single byte is more likely |
785 | // so. The same benefit versus cost argument applies here. |
786 | int id = 0; |
787 | if (reversed_) { |
788 | for (int i = 0; i < n; i++) { |
789 | // In reverse UTF-8 mode: cache the leading byte; don't cache the last |
790 | // continuation byte; cache anything else iff it's a single byte (XX-XX). |
791 | if (i == 0 || (ulo[i] == uhi[i] && i != n-1)) |
792 | id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id); |
793 | else |
794 | id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id); |
795 | } |
796 | } else { |
797 | for (int i = n-1; i >= 0; i--) { |
798 | // In forward UTF-8 mode: don't cache the leading byte; cache the last |
799 | // continuation byte; cache anything else iff it's a byte range (XX-YY). |
800 | if (i == n-1 || (ulo[i] < uhi[i] && i != 0)) |
801 | id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id); |
802 | else |
803 | id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id); |
804 | } |
805 | } |
806 | AddSuffix(id); |
807 | } |
808 | |
809 | // Should not be called. |
810 | Frag Compiler::Copy(Frag arg) { |
811 | // We're using WalkExponential; there should be no copying. |
812 | LOG(DFATAL) << "Compiler::Copy called!" ; |
813 | failed_ = true; |
814 | return NoMatch(); |
815 | } |
816 | |
817 | // Visits a node quickly; called once WalkExponential has |
818 | // decided to cut this walk short. |
819 | Frag Compiler::ShortVisit(Regexp* re, Frag) { |
820 | failed_ = true; |
821 | return NoMatch(); |
822 | } |
823 | |
824 | // Called before traversing a node's children during the walk. |
825 | Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) { |
826 | // Cut off walk if we've already failed. |
827 | if (failed_) |
828 | *stop = true; |
829 | |
830 | return Frag(); // not used by caller |
831 | } |
832 | |
833 | Frag Compiler::Literal(Rune r, bool foldcase) { |
834 | switch (encoding_) { |
835 | default: |
836 | return Frag(); |
837 | |
838 | case kEncodingLatin1: |
839 | return ByteRange(r, r, foldcase); |
840 | |
841 | case kEncodingUTF8: { |
842 | if (r < Runeself) // Make common case fast. |
843 | return ByteRange(r, r, foldcase); |
844 | uint8_t buf[UTFmax]; |
845 | int n = runetochar(reinterpret_cast<char*>(buf), &r); |
846 | Frag f = ByteRange((uint8_t)buf[0], buf[0], false); |
847 | for (int i = 1; i < n; i++) |
848 | f = Cat(f, ByteRange((uint8_t)buf[i], buf[i], false)); |
849 | return f; |
850 | } |
851 | } |
852 | } |
853 | |
854 | // Called after traversing the node's children during the walk. |
855 | // Given their frags, build and return the frag for this re. |
856 | Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, |
857 | int nchild_frags) { |
858 | // If a child failed, don't bother going forward, especially |
859 | // since the child_frags might contain Frags with NULLs in them. |
860 | if (failed_) |
861 | return NoMatch(); |
862 | |
863 | // Given the child fragments, return the fragment for this node. |
864 | switch (re->op()) { |
865 | case kRegexpRepeat: |
866 | // Should not see; code at bottom of function will print error |
867 | break; |
868 | |
869 | case kRegexpNoMatch: |
870 | return NoMatch(); |
871 | |
872 | case kRegexpEmptyMatch: |
873 | return Nop(); |
874 | |
875 | case kRegexpHaveMatch: { |
876 | Frag f = Match(re->match_id()); |
877 | if (anchor_ == RE2::ANCHOR_BOTH) { |
878 | // Append \z or else the subexpression will effectively be unanchored. |
879 | // Complemented by the UNANCHORED case in CompileSet(). |
880 | f = Cat(EmptyWidth(kEmptyEndText), f); |
881 | } |
882 | return f; |
883 | } |
884 | |
885 | case kRegexpConcat: { |
886 | Frag f = child_frags[0]; |
887 | for (int i = 1; i < nchild_frags; i++) |
888 | f = Cat(f, child_frags[i]); |
889 | return f; |
890 | } |
891 | |
892 | case kRegexpAlternate: { |
893 | Frag f = child_frags[0]; |
894 | for (int i = 1; i < nchild_frags; i++) |
895 | f = Alt(f, child_frags[i]); |
896 | return f; |
897 | } |
898 | |
899 | case kRegexpStar: |
900 | return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0); |
901 | |
902 | case kRegexpPlus: |
903 | return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0); |
904 | |
905 | case kRegexpQuest: |
906 | return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0); |
907 | |
908 | case kRegexpLiteral: |
909 | return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0); |
910 | |
911 | case kRegexpLiteralString: { |
912 | // Concatenation of literals. |
913 | if (re->nrunes() == 0) |
914 | return Nop(); |
915 | Frag f; |
916 | for (int i = 0; i < re->nrunes(); i++) { |
917 | Frag f1 = Literal(re->runes()[i], |
918 | (re->parse_flags()&Regexp::FoldCase) != 0); |
919 | if (i == 0) |
920 | f = f1; |
921 | else |
922 | f = Cat(f, f1); |
923 | } |
924 | return f; |
925 | } |
926 | |
927 | case kRegexpAnyChar: |
928 | BeginRange(); |
929 | AddRuneRange(0, Runemax, false); |
930 | return EndRange(); |
931 | |
932 | case kRegexpAnyByte: |
933 | return ByteRange(0x00, 0xFF, false); |
934 | |
935 | case kRegexpCharClass: { |
936 | CharClass* cc = re->cc(); |
937 | if (cc->empty()) { |
938 | // This can't happen. |
939 | LOG(DFATAL) << "No ranges in char class" ; |
940 | failed_ = true; |
941 | return NoMatch(); |
942 | } |
943 | |
944 | // ASCII case-folding optimization: if the char class |
945 | // behaves the same on A-Z as it does on a-z, |
946 | // discard any ranges wholly contained in A-Z |
947 | // and mark the other ranges as foldascii. |
948 | // This reduces the size of a program for |
949 | // (?i)abc from 3 insts per letter to 1 per letter. |
950 | bool foldascii = cc->FoldsASCII(); |
951 | |
952 | // Character class is just a big OR of the different |
953 | // character ranges in the class. |
954 | BeginRange(); |
955 | for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) { |
956 | // ASCII case-folding optimization (see above). |
957 | if (foldascii && 'A' <= i->lo && i->hi <= 'Z') |
958 | continue; |
959 | |
960 | // If this range contains all of A-Za-z or none of it, |
961 | // the fold flag is unnecessary; don't bother. |
962 | bool fold = foldascii; |
963 | if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo || |
964 | ('Z' < i->lo && i->hi < 'a')) |
965 | fold = false; |
966 | |
967 | AddRuneRange(i->lo, i->hi, fold); |
968 | } |
969 | return EndRange(); |
970 | } |
971 | |
972 | case kRegexpCapture: |
973 | // If this is a non-capturing parenthesis -- (?:foo) -- |
974 | // just use the inner expression. |
975 | if (re->cap() < 0) |
976 | return child_frags[0]; |
977 | return Capture(child_frags[0], re->cap()); |
978 | |
979 | case kRegexpBeginLine: |
980 | return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine); |
981 | |
982 | case kRegexpEndLine: |
983 | return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine); |
984 | |
985 | case kRegexpBeginText: |
986 | return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText); |
987 | |
988 | case kRegexpEndText: |
989 | return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText); |
990 | |
991 | case kRegexpWordBoundary: |
992 | return EmptyWidth(kEmptyWordBoundary); |
993 | |
994 | case kRegexpNoWordBoundary: |
995 | return EmptyWidth(kEmptyNonWordBoundary); |
996 | } |
997 | LOG(DFATAL) << "Missing case in Compiler: " << re->op(); |
998 | failed_ = true; |
999 | return NoMatch(); |
1000 | } |
1001 | |
1002 | // Is this regexp required to start at the beginning of the text? |
1003 | // Only approximate; can return false for complicated regexps like (\Aa|\Ab), |
1004 | // but handles (\A(a|b)). Could use the Walker to write a more exact one. |
1005 | static bool IsAnchorStart(Regexp** pre, int depth) { |
1006 | Regexp* re = *pre; |
1007 | Regexp* sub; |
1008 | // The depth limit makes sure that we don't overflow |
1009 | // the stack on a deeply nested regexp. As the comment |
1010 | // above says, IsAnchorStart is conservative, so returning |
1011 | // a false negative is okay. The exact limit is somewhat arbitrary. |
1012 | if (re == NULL || depth >= 4) |
1013 | return false; |
1014 | switch (re->op()) { |
1015 | default: |
1016 | break; |
1017 | case kRegexpConcat: |
1018 | if (re->nsub() > 0) { |
1019 | sub = re->sub()[0]->Incref(); |
1020 | if (IsAnchorStart(&sub, depth+1)) { |
1021 | PODArray<Regexp*> subcopy(re->nsub()); |
1022 | subcopy[0] = sub; // already have reference |
1023 | for (int i = 1; i < re->nsub(); i++) |
1024 | subcopy[i] = re->sub()[i]->Incref(); |
1025 | *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags()); |
1026 | re->Decref(); |
1027 | return true; |
1028 | } |
1029 | sub->Decref(); |
1030 | } |
1031 | break; |
1032 | case kRegexpCapture: |
1033 | sub = re->sub()[0]->Incref(); |
1034 | if (IsAnchorStart(&sub, depth+1)) { |
1035 | *pre = Regexp::Capture(sub, re->parse_flags(), re->cap()); |
1036 | re->Decref(); |
1037 | return true; |
1038 | } |
1039 | sub->Decref(); |
1040 | break; |
1041 | case kRegexpBeginText: |
1042 | *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); |
1043 | re->Decref(); |
1044 | return true; |
1045 | } |
1046 | return false; |
1047 | } |
1048 | |
1049 | // Is this regexp required to start at the end of the text? |
1050 | // Only approximate; can return false for complicated regexps like (a\z|b\z), |
1051 | // but handles ((a|b)\z). Could use the Walker to write a more exact one. |
1052 | static bool IsAnchorEnd(Regexp** pre, int depth) { |
1053 | Regexp* re = *pre; |
1054 | Regexp* sub; |
1055 | // The depth limit makes sure that we don't overflow |
1056 | // the stack on a deeply nested regexp. As the comment |
1057 | // above says, IsAnchorEnd is conservative, so returning |
1058 | // a false negative is okay. The exact limit is somewhat arbitrary. |
1059 | if (re == NULL || depth >= 4) |
1060 | return false; |
1061 | switch (re->op()) { |
1062 | default: |
1063 | break; |
1064 | case kRegexpConcat: |
1065 | if (re->nsub() > 0) { |
1066 | sub = re->sub()[re->nsub() - 1]->Incref(); |
1067 | if (IsAnchorEnd(&sub, depth+1)) { |
1068 | PODArray<Regexp*> subcopy(re->nsub()); |
1069 | subcopy[re->nsub() - 1] = sub; // already have reference |
1070 | for (int i = 0; i < re->nsub() - 1; i++) |
1071 | subcopy[i] = re->sub()[i]->Incref(); |
1072 | *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags()); |
1073 | re->Decref(); |
1074 | return true; |
1075 | } |
1076 | sub->Decref(); |
1077 | } |
1078 | break; |
1079 | case kRegexpCapture: |
1080 | sub = re->sub()[0]->Incref(); |
1081 | if (IsAnchorEnd(&sub, depth+1)) { |
1082 | *pre = Regexp::Capture(sub, re->parse_flags(), re->cap()); |
1083 | re->Decref(); |
1084 | return true; |
1085 | } |
1086 | sub->Decref(); |
1087 | break; |
1088 | case kRegexpEndText: |
1089 | *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); |
1090 | re->Decref(); |
1091 | return true; |
1092 | } |
1093 | return false; |
1094 | } |
1095 | |
1096 | void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem, |
1097 | RE2::Anchor anchor) { |
1098 | prog_->set_flags(flags); |
1099 | |
1100 | if (flags & Regexp::Latin1) |
1101 | encoding_ = kEncodingLatin1; |
1102 | max_mem_ = max_mem; |
1103 | if (max_mem <= 0) { |
1104 | max_ninst_ = 100000; // more than enough |
1105 | } else if (static_cast<size_t>(max_mem) <= sizeof(Prog)) { |
1106 | // No room for anything. |
1107 | max_ninst_ = 0; |
1108 | } else { |
1109 | int64_t m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst); |
1110 | // Limit instruction count so that inst->id() fits nicely in an int. |
1111 | // SparseArray also assumes that the indices (inst->id()) are ints. |
1112 | // The call to WalkExponential uses 2*max_ninst_ below, |
1113 | // and other places in the code use 2 or 3 * prog->size(). |
1114 | // Limiting to 2^24 should avoid overflow in those places. |
1115 | // (The point of allowing more than 32 bits of memory is to |
1116 | // have plenty of room for the DFA states, not to use it up |
1117 | // on the program.) |
1118 | if (m >= 1<<24) |
1119 | m = 1<<24; |
1120 | |
1121 | // Inst imposes its own limit (currently bigger than 2^24 but be safe). |
1122 | if (m > Prog::Inst::kMaxInst) |
1123 | m = Prog::Inst::kMaxInst; |
1124 | |
1125 | max_ninst_ = static_cast<int>(m); |
1126 | } |
1127 | |
1128 | anchor_ = anchor; |
1129 | } |
1130 | |
1131 | // Compiles re, returning program. |
1132 | // Caller is responsible for deleting prog_. |
1133 | // If reversed is true, compiles a program that expects |
1134 | // to run over the input string backward (reverses all concatenations). |
1135 | // The reversed flag is also recorded in the returned program. |
1136 | Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) { |
1137 | Compiler c; |
1138 | c.Setup(re->parse_flags(), max_mem, RE2::UNANCHORED /* unused */); |
1139 | c.reversed_ = reversed; |
1140 | |
1141 | // Simplify to remove things like counted repetitions |
1142 | // and character classes like \d. |
1143 | Regexp* sre = re->Simplify(); |
1144 | if (sre == NULL) |
1145 | return NULL; |
1146 | |
1147 | // Record whether prog is anchored, removing the anchors. |
1148 | // (They get in the way of other optimizations.) |
1149 | bool is_anchor_start = IsAnchorStart(&sre, 0); |
1150 | bool is_anchor_end = IsAnchorEnd(&sre, 0); |
1151 | |
1152 | // Generate fragment for entire regexp. |
1153 | Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_); |
1154 | sre->Decref(); |
1155 | if (c.failed_) |
1156 | return NULL; |
1157 | |
1158 | // Success! Finish by putting Match node at end, and record start. |
1159 | // Turn off c.reversed_ (if it is set) to force the remaining concatenations |
1160 | // to behave normally. |
1161 | c.reversed_ = false; |
1162 | all = c.Cat(all, c.Match(0)); |
1163 | |
1164 | c.prog_->set_reversed(reversed); |
1165 | if (c.prog_->reversed()) { |
1166 | c.prog_->set_anchor_start(is_anchor_end); |
1167 | c.prog_->set_anchor_end(is_anchor_start); |
1168 | } else { |
1169 | c.prog_->set_anchor_start(is_anchor_start); |
1170 | c.prog_->set_anchor_end(is_anchor_end); |
1171 | } |
1172 | |
1173 | c.prog_->set_start(all.begin); |
1174 | if (!c.prog_->anchor_start()) { |
1175 | // Also create unanchored version, which starts with a .*? loop. |
1176 | all = c.Cat(c.DotStar(), all); |
1177 | } |
1178 | c.prog_->set_start_unanchored(all.begin); |
1179 | |
1180 | // Hand ownership of prog_ to caller. |
1181 | return c.Finish(); |
1182 | } |
1183 | |
1184 | Prog* Compiler::Finish() { |
1185 | if (failed_) |
1186 | return NULL; |
1187 | |
1188 | if (prog_->start() == 0 && prog_->start_unanchored() == 0) { |
1189 | // No possible matches; keep Fail instruction only. |
1190 | ninst_ = 1; |
1191 | } |
1192 | |
1193 | // Hand off the array to Prog. |
1194 | prog_->inst_ = std::move(inst_); |
1195 | prog_->size_ = ninst_; |
1196 | |
1197 | prog_->Optimize(); |
1198 | prog_->Flatten(); |
1199 | prog_->ComputeByteMap(); |
1200 | |
1201 | // Record remaining memory for DFA. |
1202 | if (max_mem_ <= 0) { |
1203 | prog_->set_dfa_mem(1<<20); |
1204 | } else { |
1205 | int64_t m = max_mem_ - sizeof(Prog); |
1206 | m -= prog_->size_*sizeof(Prog::Inst); // account for inst_ |
1207 | if (prog_->CanBitState()) |
1208 | m -= prog_->size_*sizeof(uint16_t); // account for list_heads_ |
1209 | if (m < 0) |
1210 | m = 0; |
1211 | prog_->set_dfa_mem(m); |
1212 | } |
1213 | |
1214 | Prog* p = prog_; |
1215 | prog_ = NULL; |
1216 | return p; |
1217 | } |
1218 | |
1219 | // Converts Regexp to Prog. |
1220 | Prog* Regexp::CompileToProg(int64_t max_mem) { |
1221 | return Compiler::Compile(this, false, max_mem); |
1222 | } |
1223 | |
1224 | Prog* Regexp::CompileToReverseProg(int64_t max_mem) { |
1225 | return Compiler::Compile(this, true, max_mem); |
1226 | } |
1227 | |
1228 | Frag Compiler::DotStar() { |
1229 | return Star(ByteRange(0x00, 0xff, false), true); |
1230 | } |
1231 | |
1232 | // Compiles RE set to Prog. |
1233 | Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) { |
1234 | Compiler c; |
1235 | c.Setup(re->parse_flags(), max_mem, anchor); |
1236 | |
1237 | Regexp* sre = re->Simplify(); |
1238 | if (sre == NULL) |
1239 | return NULL; |
1240 | |
1241 | Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_); |
1242 | sre->Decref(); |
1243 | if (c.failed_) |
1244 | return NULL; |
1245 | |
1246 | c.prog_->set_anchor_start(true); |
1247 | c.prog_->set_anchor_end(true); |
1248 | |
1249 | if (anchor == RE2::UNANCHORED) { |
1250 | // Prepend .* or else the expression will effectively be anchored. |
1251 | // Complemented by the ANCHOR_BOTH case in PostVisit(). |
1252 | all = c.Cat(c.DotStar(), all); |
1253 | } |
1254 | c.prog_->set_start(all.begin); |
1255 | c.prog_->set_start_unanchored(all.begin); |
1256 | |
1257 | Prog* prog = c.Finish(); |
1258 | if (prog == NULL) |
1259 | return NULL; |
1260 | |
1261 | // Make sure DFA has enough memory to operate, |
1262 | // since we're not going to fall back to the NFA. |
1263 | bool dfa_failed = false; |
1264 | StringPiece sp = "hello, world" ; |
1265 | prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch, |
1266 | NULL, &dfa_failed, NULL); |
1267 | if (dfa_failed) { |
1268 | delete prog; |
1269 | return NULL; |
1270 | } |
1271 | |
1272 | return prog; |
1273 | } |
1274 | |
1275 | Prog* Prog::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) { |
1276 | return Compiler::CompileSet(re, anchor, max_mem); |
1277 | } |
1278 | |
1279 | } // namespace re2 |
1280 | |