1// Copyright 2007 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Compile regular expression to Prog.
6//
7// Prog and Inst are defined in prog.h.
8// This file's external interface is just Regexp::CompileToProg.
9// The Compiler class defined in this file is private.
10
11#include <stdint.h>
12#include <string.h>
13#include <unordered_map>
14#include <utility>
15
16#include "util/logging.h"
17#include "util/utf.h"
18#include "re2/pod_array.h"
19#include "re2/prog.h"
20#include "re2/re2.h"
21#include "re2/regexp.h"
22#include "re2/walker-inl.h"
23
24namespace re2 {
25
26// List of pointers to Inst* that need to be filled in (patched).
27// Because the Inst* haven't been filled in yet,
28// we can use the Inst* word to hold the list's "next" pointer.
29// It's kind of sleazy, but it works well in practice.
30// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
31//
32// Because the out and out1 fields in Inst are no longer pointers,
33// we can't use pointers directly here either. Instead, p refers
34// to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1).
35// p == 0 represents the NULL list. This is okay because instruction #0
36// is always the fail instruction, which never appears on a list.
37
struct PatchList {
  uint32_t p;

  // Returns patch list containing just p.
  static PatchList Mk(uint32_t p);

  // Patches all the entries on l to have value v.
  // Caller must not ever use patch list again.
  static void Patch(Prog::Inst *inst0, PatchList l, uint32_t v);

  // Deref returns the next pointer pointed at by p.
  static PatchList Deref(Prog::Inst *inst0, PatchList l);

  // Appends two patch lists and returns result.
  static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2);
};

// The empty patch list: p == 0 is the end-of-list sentinel (see above).
static PatchList nullPatchList = { 0 };
56
57// Returns patch list containing just p.
58PatchList PatchList::Mk(uint32_t p) {
59 PatchList l;
60 l.p = p;
61 return l;
62}
63
64// Returns the next pointer pointed at by l.
65PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) {
66 Prog::Inst* ip = &inst0[l.p>>1];
67 if (l.p&1)
68 l.p = ip->out1();
69 else
70 l.p = ip->out();
71 return l;
72}
73
74// Patches all the entries on l to have value v.
75void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32_t val) {
76 while (l.p != 0) {
77 Prog::Inst* ip = &inst0[l.p>>1];
78 if (l.p&1) {
79 l.p = ip->out1();
80 ip->out1_ = val;
81 } else {
82 l.p = ip->out();
83 ip->set_out(val);
84 }
85 }
86}
87
88// Appends two patch lists and returns result.
// Appends two patch lists and returns result.
PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {
  if (l1.p == 0)
    return l2;
  if (l2.p == 0)
    return l1;

  // Walk to the last entry of l1.
  PatchList l = l1;
  for (;;) {
    PatchList next = PatchList::Deref(inst0, l);
    if (next.p == 0)
      break;
    l = next;
  }

  // Splice l2 onto the end of l1 by storing l2's head in the last
  // entry's next-link field.
  Prog::Inst* ip = &inst0[l.p>>1];
  if (l.p&1)
    ip->out1_ = l2.p;
  else
    ip->set_out(l2.p);

  return l1;
}
111
112// Compiled program fragment.
struct Frag {
  uint32_t begin;  // index of the fragment's entry instruction
  PatchList end;   // dangling exits still to be patched

  Frag() : begin(0) { end.p = 0; }  // needed so Frag can go in vector
  Frag(uint32_t begin, PatchList end) : begin(begin), end(end) {}
};
120
121// Input encodings.
enum Encoding {
  kEncodingUTF8 = 1,  // UTF-8 (0-10FFFF)
  kEncodingLatin1,    // Latin-1 (0-FF); one byte per rune
};
126
class Compiler : public Regexp::Walker<Frag> {
 public:
  explicit Compiler();
  ~Compiler();

  // Compiles Regexp to a new Prog.
  // Caller is responsible for deleting Prog when finished with it.
  // If reversed is true, compiles for walking over the input
  // string backward (reverses all concatenations).
  static Prog *Compile(Regexp* re, bool reversed, int64_t max_mem);

  // Compiles alternation of all the re to a new Prog.
  // Each re has a match with an id equal to its index in the vector.
  static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);

  // Interface for Regexp::Walker, which helps traverse the Regexp.
  // The walk is purely post-recursive: given the machines for the
  // children, PostVisit combines them to create the machine for
  // the current node. The child_args are Frags.
  // The Compiler traverses the Regexp parse tree, visiting
  // each node in depth-first order. It invokes PreVisit before
  // visiting the node's children and PostVisit after visiting
  // the children.
  Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop);
  Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args,
                 int nchild_args);
  Frag ShortVisit(Regexp* re, Frag parent_arg);
  Frag Copy(Frag arg);

  // Given fragment a, returns a+ or a+?; a* or a*?; a? or a??
  Frag Plus(Frag a, bool nongreedy);
  Frag Star(Frag a, bool nongreedy);
  Frag Quest(Frag a, bool nongreedy);

  // Given fragment a, returns (a) capturing as \n.
  Frag Capture(Frag a, int n);

  // Given fragments a and b, returns ab; a|b
  Frag Cat(Frag a, Frag b);
  Frag Alt(Frag a, Frag b);

  // Returns a fragment that can't match anything.
  Frag NoMatch();

  // Returns a fragment that matches the empty string.
  Frag Match(int32_t id);

  // Returns a no-op fragment.
  Frag Nop();

  // Returns a fragment matching the byte range lo-hi.
  Frag ByteRange(int lo, int hi, bool foldcase);

  // Returns a fragment matching an empty-width special op.
  Frag EmptyWidth(EmptyOp op);

  // Adds n instructions to the program.
  // Returns the index of the first one.
  // Returns -1 if no more instructions are available.
  int AllocInst(int n);

  // Rune range compiler.

  // Begins a new alternation.
  void BeginRange();

  // Adds a fragment matching the rune range lo-hi.
  void AddRuneRange(Rune lo, Rune hi, bool foldcase);
  void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
  void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
  void Add_80_10ffff();

  // New suffix that matches the byte range lo-hi, then goes to next.
  int UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
  int CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);

  // Returns true iff the suffix is cached.
  bool IsCachedRuneByteSuffix(int id);

  // Adds a suffix to alternation.
  void AddSuffix(int id);

  // Adds a suffix to the trie starting from the given root node.
  // Returns zero iff allocating an instruction fails. Otherwise, returns
  // the current root node, which might be different from what was given.
  int AddSuffixRecursive(int root, int id);

  // Finds the trie node for the given suffix. Returns a Frag in order to
  // distinguish between pointing at the root node directly (end.p == 0)
  // and pointing at an Alt's out1 or out (end.p&1 == 1 or 0, respectively).
  Frag FindByteRange(int root, int id);

  // Compares two ByteRanges and returns true iff they are equal.
  bool ByteRangeEqual(int id1, int id2);

  // Returns the alternation of all the added suffixes.
  Frag EndRange();

  // Single rune.
  Frag Literal(Rune r, bool foldcase);

  void Setup(Regexp::ParseFlags, int64_t, RE2::Anchor);
  Prog* Finish();

  // Returns .* where dot = any byte
  Frag DotStar();

 private:
  Prog* prog_;         // Program being built.
  bool failed_;        // Did we give up compiling?
  Encoding encoding_;  // Input encoding
  bool reversed_;      // Should program run backward over text?

  PODArray<Prog::Inst> inst_;  // Instruction buffer, grown on demand.
  int ninst_;                  // Number of instructions used.
  int max_ninst_;              // Maximum number of instructions.

  int64_t max_mem_;    // Total memory budget.

  // Memoizes common byte-range suffix fragments, keyed by
  // (lo, hi, foldcase, next) packed into a uint64_t.
  std::unordered_map<uint64_t, int> rune_cache_;
  Frag rune_range_;    // Alternation accumulated by BeginRange/AddSuffix.

  RE2::Anchor anchor_;  // anchor mode for RE2::Set

  Compiler(const Compiler&) = delete;
  Compiler& operator=(const Compiler&) = delete;
};
254
Compiler::Compiler() {
  prog_ = new Prog();
  failed_ = false;
  encoding_ = kEncodingUTF8;
  reversed_ = false;
  ninst_ = 0;
  // Temporarily allow exactly one instruction so AllocInst can emit the
  // mandatory fail instruction at index 0 (patch lists rely on index 0
  // never being a real instruction -- see the PatchList comment above).
  max_ninst_ = 1;  // make AllocInst for fail instruction okay
  max_mem_ = 0;
  int fail = AllocInst(1);
  inst_[fail].InitFail();
  max_ninst_ = 0;  // Caller must change
  // NOTE(review): anchor_ is not initialized here; presumably Setup()
  // assigns it before compilation -- confirm against Setup's body.
}

Compiler::~Compiler() {
  // prog_ is owned by the compiler until handed off (hand-off not
  // visible in this excerpt).
  delete prog_;
}
271
int Compiler::AllocInst(int n) {
  // Enforce the instruction budget: once exceeded, compilation has
  // failed and every later allocation fails too.
  if (failed_ || ninst_ + n > max_ninst_) {
    failed_ = true;
    return -1;
  }

  // Grow the buffer geometrically (doubling) when needed.
  if (ninst_ + n > inst_.size()) {
    int cap = inst_.size();
    if (cap == 0)
      cap = 8;
    while (ninst_ + n > cap)
      cap *= 2;
    PODArray<Prog::Inst> inst(cap);
    if (inst_.data() != NULL)
      memmove(inst.data(), inst_.data(), ninst_*sizeof inst_[0]);
    // Zero the tail so freshly allocated instructions start out empty.
    memset(inst.data() + ninst_, 0, (cap - ninst_)*sizeof inst_[0]);
    inst_ = std::move(inst);
  }
  int id = ninst_;
  ninst_ += n;
  return id;
}
294
295// These routines are somewhat hard to visualize in text --
296// see http://swtch.com/~rsc/regexp/regexp1.html for
297// pictures explaining what is going on here.
298
299// Returns an unmatchable fragment.
Frag Compiler::NoMatch() {
  // begin == 0 (the fail instruction) marks an unmatchable fragment.
  return Frag(0, nullPatchList);
}

// Is a an unmatchable fragment?
static bool IsNoMatch(Frag a) {
  return a.begin == 0;
}
308
309// Given fragments a and b, returns fragment for ab.
Frag Compiler::Cat(Frag a, Frag b) {
  if (IsNoMatch(a) || IsNoMatch(b))
    return NoMatch();

  // Elide no-op: if a is a single still-unpatched Nop, route callers
  // straight to b rather than keeping the Nop in the program.
  Prog::Inst* begin = &inst_[a.begin];
  if (begin->opcode() == kInstNop &&
      a.end.p == (a.begin << 1) &&
      begin->out() == 0) {
    // in case refs to a somewhere
    PatchList::Patch(inst_.data(), a.end, b.begin);
    return b;
  }

  // To run backward over string, reverse all concatenations.
  if (reversed_) {
    PatchList::Patch(inst_.data(), b.end, a.begin);
    return Frag(b.begin, a.end);
  }

  // Forward mode: a's dangling exits now lead into b.
  PatchList::Patch(inst_.data(), a.end, b.begin);
  return Frag(a.begin, b.end);
}
333
334// Given fragments for a and b, returns fragment for a|b.
Frag Compiler::Alt(Frag a, Frag b) {
  // Special case for convenience in loops.
  if (IsNoMatch(a))
    return b;
  if (IsNoMatch(b))
    return a;

  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();

  // The new Alt branches to both fragments; the combined fragment's
  // dangling exits are the union of both fragments' exits.
  inst_[id].InitAlt(a.begin, b.begin);
  return Frag(id, PatchList::Append(inst_.data(), a.end, b.end));
}
349
350// When capturing submatches in like-Perl mode, a kOpAlt Inst
351// treats out_ as the first choice, out1_ as the second.
352//
353// For *, +, and ?, if out_ causes another repetition,
354// then the operator is greedy. If out1_ is the repetition
355// (and out_ moves forward), then the operator is non-greedy.
356
357// Given a fragment a, returns a fragment for a* or a*? (if nongreedy)
Frag Compiler::Star(Frag a, bool nongreedy) {
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  inst_[id].InitAlt(0, 0);
  // Loop a's exits back to the Alt.
  PatchList::Patch(inst_.data(), a.end, id);
  if (nongreedy) {
    // out1 repeats; out (preferred) exits and is left dangling.
    inst_[id].out1_ = a.begin;
    return Frag(id, PatchList::Mk(id << 1));
  } else {
    // out (preferred) repeats; out1 exits and is left dangling.
    inst_[id].set_out(a.begin);
    return Frag(id, PatchList::Mk((id << 1) | 1));
  }
}
372
373// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
374Frag Compiler::Plus(Frag a, bool nongreedy) {
375 // a+ is just a* with a different entry point.
376 Frag f = Star(a, nongreedy);
377 return Frag(a.begin, f.end);
378}
379
380// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
Frag Compiler::Quest(Frag a, bool nongreedy) {
  if (IsNoMatch(a))
    return Nop();
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  PatchList pl;
  if (nongreedy) {
    // Prefer the empty path: out skips a and stays dangling.
    inst_[id].InitAlt(0, a.begin);
    pl = PatchList::Mk(id << 1);
  } else {
    // Prefer entering a: out1 skips a and stays dangling.
    inst_[id].InitAlt(a.begin, 0);
    pl = PatchList::Mk((id << 1) | 1);
  }
  // Exits are the skip edge plus whatever a leaves dangling.
  return Frag(id, PatchList::Append(inst_.data(), pl, a.end));
}
397
398// Returns a fragment for the byte range lo-hi.
Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  inst_[id].InitByteRange(lo, hi, foldcase, 0);
  // The instruction's out field (encoded as id << 1) is left dangling.
  return Frag(id, PatchList::Mk(id << 1));
}
406
407// Returns a no-op fragment. Sometimes unavoidable.
Frag Compiler::Nop() {
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  inst_[id].InitNop(0);
  return Frag(id, PatchList::Mk(id << 1));
}

// Returns a fragment that signals a match.
Frag Compiler::Match(int32_t match_id) {
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  inst_[id].InitMatch(match_id);
  // A Match instruction has no dangling exits, so its patch list is empty.
  return Frag(id, nullPatchList);
}

// Returns a fragment matching a particular empty-width op (like ^ or $)
Frag Compiler::EmptyWidth(EmptyOp empty) {
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  inst_[id].InitEmptyWidth(empty, 0);
  return Frag(id, PatchList::Mk(id << 1));
}
433
434// Given a fragment a, returns a fragment with capturing parens around a.
Frag Compiler::Capture(Frag a, int n) {
  if (IsNoMatch(a))
    return NoMatch();
  // Two instructions: slot 2*n records the opening paren position,
  // slot 2*n+1 the closing one.
  int id = AllocInst(2);
  if (id < 0)
    return NoMatch();
  inst_[id].InitCapture(2*n, a.begin);
  inst_[id+1].InitCapture(2*n+1, 0);
  // a's exits lead into the closing capture; its out stays dangling.
  PatchList::Patch(inst_.data(), a.end, id+1);

  return Frag(id, PatchList::Mk((id+1) << 1));
}
447
448// A Rune is a name for a Unicode code point.
449// Returns maximum rune encoded by UTF-8 sequence of length len.
static int MaxRune(int len) {
  // A 1-byte sequence carries 7 payload bits; an n-byte sequence
  // carries (7 - n) bits in the lead byte plus 6 bits per
  // continuation byte (len < UTFmax).
  int bits;
  if (len == 1)
    bits = 7;
  else
    bits = (7 - len) + 6*(len - 1);
  return (1 << bits) - 1;  // maximum Rune for `bits` bits
}
458
459// The rune range compiler caches common suffix fragments,
460// which are very common in UTF-8 (e.g., [80-bf]).
461// The fragment suffixes are identified by their start
462// instructions. NULL denotes the eventual end match.
463// The Frag accumulates in rune_range_. Caching common
464// suffixes reduces the UTF-8 "." from 32 to 24 instructions,
465// and it reduces the corresponding one-pass NFA from 16 nodes to 8.
466
void Compiler::BeginRange() {
  // Reset the suffix cache and the accumulated alternation.
  rune_cache_.clear();
  rune_range_.begin = 0;
  rune_range_.end = nullPatchList;
}
472
int Compiler::UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
                                     int next) {
  Frag f = ByteRange(lo, hi, foldcase);
  if (next != 0) {
    // Wire this byte range into the already-built rest of the suffix.
    PatchList::Patch(inst_.data(), f.end, next);
  } else {
    // Final byte of the suffix: accumulate its dangling exit so that
    // EndRange() hands all exits back for patching.
    rune_range_.end = PatchList::Append(inst_.data(), rune_range_.end, f.end);
  }
  return f.begin;
}
483
static uint64_t MakeRuneCacheKey(uint8_t lo, uint8_t hi, bool foldcase,
                                 int next) {
  // Pack the fields into disjoint bit ranges:
  // bit 0 = foldcase, bits 1-8 = hi, bits 9-16 = lo, bits 17+ = next.
  uint64_t key = foldcase ? 1 : 0;
  key |= static_cast<uint64_t>(hi) << 1;
  key |= static_cast<uint64_t>(lo) << 9;
  key |= static_cast<uint64_t>(next) << 17;
  return key;
}
491
492int Compiler::CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
493 int next) {
494 uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
495 std::unordered_map<uint64_t, int>::const_iterator it = rune_cache_.find(key);
496 if (it != rune_cache_.end())
497 return it->second;
498 int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
499 rune_cache_[key] = id;
500 return id;
501}
502
bool Compiler::IsCachedRuneByteSuffix(int id) {
  uint8_t lo = inst_[id].lo_;
  uint8_t hi = inst_[id].hi_;
  bool foldcase = inst_[id].foldcase() != 0;
  int next = inst_[id].out();

  // Reconstruct the cache key from the instruction itself and probe
  // for it; a hit means this suffix head must not be mutated.
  uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
  return rune_cache_.find(key) != rune_cache_.end();
}
512
void Compiler::AddSuffix(int id) {
  if (failed_)
    return;

  // First suffix: it becomes the whole alternation so far.
  if (rune_range_.begin == 0) {
    rune_range_.begin = id;
    return;
  }

  if (encoding_ == kEncodingUTF8) {
    // Build a trie in order to reduce fanout.
    rune_range_.begin = AddSuffixRecursive(rune_range_.begin, id);
    return;
  }

  // Latin-1: chain suffixes together with a plain Alt.
  int alt = AllocInst(1);
  if (alt < 0) {
    rune_range_.begin = 0;
    return;
  }
  inst_[alt].InitAlt(rune_range_.begin, id);
  rune_range_.begin = alt;
}
536
int Compiler::AddSuffixRecursive(int root, int id) {
  DCHECK(inst_[root].opcode() == kInstAlt ||
         inst_[root].opcode() == kInstByteRange);

  // If no existing branch shares this suffix's first byte range,
  // just add a new alternative.
  Frag f = FindByteRange(root, id);
  if (IsNoMatch(f)) {
    int alt = AllocInst(1);
    if (alt < 0)
      return 0;
    inst_[alt].InitAlt(root, id);
    return alt;
  }

  // Locate the matching ByteRange instruction; see FindByteRange for
  // how f encodes the parent link.
  int br;
  if (f.end.p == 0)
    br = root;
  else if (f.end.p&1)
    br = inst_[f.begin].out1();
  else
    br = inst_[f.begin].out();

  if (IsCachedRuneByteSuffix(br)) {
    // We can't fiddle with cached suffixes, so make a clone of the head.
    int byterange = AllocInst(1);
    if (byterange < 0)
      return 0;
    inst_[byterange].InitByteRange(inst_[br].lo(), inst_[br].hi(),
                                   inst_[br].foldcase(), inst_[br].out());

    // Ensure that the parent points to the clone, not to the original.
    // Note that this could leave the head unreachable except via the cache.
    br = byterange;
    if (f.end.p == 0)
      root = br;
    else if (f.end.p&1)
      inst_[f.begin].out1_ = br;
    else
      inst_[f.begin].set_out(br);
  }

  int out = inst_[id].out();
  if (!IsCachedRuneByteSuffix(id)) {
    // The head should be the instruction most recently allocated, so free it
    // instead of leaving it unreachable.
    DCHECK_EQ(id, ninst_-1);
    inst_[id].out_opcode_ = 0;
    inst_[id].out1_ = 0;
    ninst_--;
  }

  // Merge the remainder of the new suffix below the shared byte range.
  out = AddSuffixRecursive(inst_[br].out(), out);
  if (out == 0)
    return 0;

  inst_[br].set_out(out);
  return root;
}
594
595bool Compiler::ByteRangeEqual(int id1, int id2) {
596 return inst_[id1].lo() == inst_[id2].lo() &&
597 inst_[id1].hi() == inst_[id2].hi() &&
598 inst_[id1].foldcase() == inst_[id2].foldcase();
599}
600
Frag Compiler::FindByteRange(int root, int id) {
  // Root is itself a ByteRange: either it matches id directly or
  // nothing in this trie branch does.
  if (inst_[root].opcode() == kInstByteRange) {
    if (ByteRangeEqual(root, id))
      return Frag(root, nullPatchList);
    else
      return NoMatch();
  }

  while (inst_[root].opcode() == kInstAlt) {
    int out1 = inst_[root].out1();
    if (ByteRangeEqual(out1, id))
      return Frag(root, PatchList::Mk((root << 1) | 1));

    // CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't
    // what we're looking for, then we can stop immediately. Unfortunately, we
    // can't short-circuit the search in reverse mode.
    if (!reversed_)
      return NoMatch();

    int out = inst_[root].out();
    if (inst_[out].opcode() == kInstAlt)
      root = out;
    else if (ByteRangeEqual(out, id))
      return Frag(root, PatchList::Mk(root << 1));
    else
      return NoMatch();
  }

  LOG(DFATAL) << "should never happen";
  return NoMatch();
}
632
Frag Compiler::EndRange() {
  // The alternation accumulated since BeginRange() via AddSuffix().
  return rune_range_;
}
636
637// Converts rune range lo-hi into a fragment that recognizes
638// the bytes that would make up those runes in the current
639// encoding (Latin 1 or UTF-8).
640// This lets the machine work byte-by-byte even when
641// using multibyte encodings.
642
643void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) {
644 switch (encoding_) {
645 default:
646 case kEncodingUTF8:
647 AddRuneRangeUTF8(lo, hi, foldcase);
648 break;
649 case kEncodingLatin1:
650 AddRuneRangeLatin1(lo, hi, foldcase);
651 break;
652 }
653}
654
655void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
656 // Latin-1 is easy: runes *are* bytes.
657 if (lo > hi || lo > 0xFF)
658 return;
659 if (hi > 0xFF)
660 hi = 0xFF;
661 AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
662 static_cast<uint8_t>(hi), foldcase, 0));
663}
664
665// Table describing how to make a UTF-8 matching machine
666// for the rune range 80-10FFFF (Runeself-Runemax).
667// This range happens frequently enough (for example /./ and /[^a-z]/)
668// and the rune_cache_ map is slow enough that this is worth
669// special handling. Makes compilation of a small expression
670// with a dot in it about 10% faster.
671// The * in the comments below mark whole sequences.
static struct ByteRangeProg {
  int next;  // index of the already-built suffix this range chains to, or -1
  int lo;
  int hi;
} prog_80_10ffff[] = {
  // Two-byte
  { -1, 0x80, 0xBF, },  // 0: 80-BF
  {  0, 0xC2, 0xDF, },  // 1: C2-DF 80-BF*

  // Three-byte
  {  0, 0xA0, 0xBF, },  // 2: A0-BF 80-BF
  {  2, 0xE0, 0xE0, },  // 3: E0 A0-BF 80-BF*
  {  0, 0x80, 0xBF, },  // 4: 80-BF 80-BF
  {  4, 0xE1, 0xEF, },  // 5: E1-EF 80-BF 80-BF*

  // Four-byte
  {  4, 0x90, 0xBF, },  // 6: 90-BF 80-BF 80-BF
  {  6, 0xF0, 0xF0, },  // 7: F0 90-BF 80-BF 80-BF*
  {  4, 0x80, 0xBF, },  // 8: 80-BF 80-BF 80-BF
  {  8, 0xF1, 0xF3, },  // 9: F1-F3 80-BF 80-BF 80-BF*
  {  4, 0x80, 0x8F, },  // 10: 80-8F 80-BF 80-BF
  { 10, 0xF4, 0xF4, },  // 11: F4 80-8F 80-BF 80-BF*
};
695
void Compiler::Add_80_10ffff() {
  int inst[arraysize(prog_80_10ffff)] = { 0 };  // does not need to be initialized; silences gcc warning
  for (size_t i = 0; i < arraysize(prog_80_10ffff); i++) {
    const ByteRangeProg& p = prog_80_10ffff[i];
    int next = 0;
    if (p.next >= 0)
      next = inst[p.next];
    inst[i] = UncachedRuneByteSuffix(static_cast<uint8_t>(p.lo),
                                     static_cast<uint8_t>(p.hi), false, next);
    // A row whose lo is not a continuation byte (80-BF) starts a whole
    // UTF-8 sequence (the * rows in the table), so add it as an
    // alternative of the range being built.
    if ((p.lo & 0xC0) != 0x80)
      AddSuffix(inst[i]);
  }
}
709
void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
  if (lo > hi)
    return;

  // Pick off 80-10FFFF as a common special case
  // that can bypass the slow rune_cache_.
  if (lo == 0x80 && hi == 0x10ffff && !reversed_) {
    Add_80_10ffff();
    return;
  }

  // Split range into same-length sized ranges.
  // After this, lo and hi encode to the same number of UTF-8 bytes.
  for (int i = 1; i < UTFmax; i++) {
    Rune max = MaxRune(i);
    if (lo <= max && max < hi) {
      AddRuneRangeUTF8(lo, max, foldcase);
      AddRuneRangeUTF8(max+1, hi, foldcase);
      return;
    }
  }

  // ASCII range is always a special case.
  if (hi < Runeself) {
    AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
                                     static_cast<uint8_t>(hi), foldcase, 0));
    return;
  }

  // Split range into sections that agree on leading bytes.
  // After this, every byte position is either a fixed byte shared by
  // lo and hi or spans its full possible range.
  for (int i = 1; i < UTFmax; i++) {
    uint32_t m = (1<<(6*i)) - 1;  // last i bytes of a UTF-8 sequence
    if ((lo & ~m) != (hi & ~m)) {
      if ((lo & m) != 0) {
        AddRuneRangeUTF8(lo, lo|m, foldcase);
        AddRuneRangeUTF8((lo|m)+1, hi, foldcase);
        return;
      }
      if ((hi & m) != m) {
        AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase);
        AddRuneRangeUTF8(hi&~m, hi, foldcase);
        return;
      }
    }
  }

  // Finally. Generate byte matching equivalent for lo-hi.
  uint8_t ulo[UTFmax], uhi[UTFmax];
  int n = runetochar(reinterpret_cast<char*>(ulo), &lo);
  int m = runetochar(reinterpret_cast<char*>(uhi), &hi);
  (void)m;  // USED(m)
  DCHECK_EQ(n, m);

  // The logic below encodes this thinking:
  //
  // 1. When we have built the whole suffix, we know that it cannot
  // possibly be a suffix of anything longer: in forward mode, nothing
  // else can occur before the leading byte; in reverse mode, nothing
  // else can occur after the last continuation byte or else the leading
  // byte would have to change. Thus, there is no benefit to caching
  // the first byte of the suffix whereas there is a cost involved in
  // cloning it if it begins a common prefix, which is fairly likely.
  //
  // 2. Conversely, the last byte of the suffix cannot possibly be a
  // prefix of anything because next == 0, so we will never want to
  // clone it, but it is fairly likely to be a common suffix. Perhaps
  // more so in reverse mode than in forward mode because the former is
  // "converging" towards lower entropy, but caching is still worthwhile
  // for the latter in cases such as 80-BF.
  //
  // 3. Handling the bytes between the first and the last is less
  // straightforward and, again, the approach depends on whether we are
  // "converging" towards lower entropy: in forward mode, a single byte
  // is unlikely to be part of a common suffix whereas a byte range
  // is more likely so; in reverse mode, a byte range is unlikely to
  // be part of a common suffix whereas a single byte is more likely
  // so. The same benefit versus cost argument applies here.
  int id = 0;
  if (reversed_) {
    for (int i = 0; i < n; i++) {
      // In reverse UTF-8 mode: cache the leading byte; don't cache the last
      // continuation byte; cache anything else iff it's a single byte (XX-XX).
      if (i == 0 || (ulo[i] == uhi[i] && i != n-1))
        id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
      else
        id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
    }
  } else {
    for (int i = n-1; i >= 0; i--) {
      // In forward UTF-8 mode: don't cache the leading byte; cache the last
      // continuation byte; cache anything else iff it's a byte range (XX-YY).
      if (i == n-1 || (ulo[i] < uhi[i] && i != 0))
        id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
      else
        id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
    }
  }
  AddSuffix(id);
}
808
809// Should not be called.
Frag Compiler::Copy(Frag arg) {
  // We're using WalkExponential; there should be no copying.
  // arg is intentionally unused: reaching this method is itself the bug.
  LOG(DFATAL) << "Compiler::Copy called!";
  failed_ = true;
  return NoMatch();
}
816
817// Visits a node quickly; called once WalkExponential has
818// decided to cut this walk short.
Frag Compiler::ShortVisit(Regexp* re, Frag) {
  // The walk was cut short, so compilation cannot complete; mark failure.
  failed_ = true;
  return NoMatch();
}

// Called before traversing a node's children during the walk.
Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) {
  // Cut off walk if we've already failed.
  if (failed_)
    *stop = true;

  return Frag();  // not used by caller
}
832
Frag Compiler::Literal(Rune r, bool foldcase) {
  switch (encoding_) {
    default:
      return Frag();

    case kEncodingLatin1:
      return ByteRange(r, r, foldcase);

    case kEncodingUTF8: {
      if (r < Runeself)  // Make common case fast.
        return ByteRange(r, r, foldcase);
      // Multi-byte rune: concatenate one ByteRange per UTF-8 byte.
      // foldcase is dropped here: ASCII case folding never applies to
      // bytes of a multi-byte sequence.
      uint8_t buf[UTFmax];
      int n = runetochar(reinterpret_cast<char*>(buf), &r);
      Frag f = ByteRange((uint8_t)buf[0], buf[0], false);
      for (int i = 1; i < n; i++)
        f = Cat(f, ByteRange((uint8_t)buf[i], buf[i], false));
      return f;
    }
  }
}
853
854// Called after traversing the node's children during the walk.
855// Given their frags, build and return the frag for this re.
Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
                         int nchild_frags) {
  // If a child failed, don't bother going forward, especially
  // since the child_frags might contain Frags with NULLs in them.
  if (failed_)
    return NoMatch();

  // Given the child fragments, return the fragment for this node.
  switch (re->op()) {
    case kRegexpRepeat:
      // Should not see; code at bottom of function will print error
      break;

    case kRegexpNoMatch:
      return NoMatch();

    case kRegexpEmptyMatch:
      return Nop();

    case kRegexpHaveMatch: {
      Frag f = Match(re->match_id());
      if (anchor_ == RE2::ANCHOR_BOTH) {
        // Append \z or else the subexpression will effectively be unanchored.
        // Complemented by the UNANCHORED case in CompileSet().
        f = Cat(EmptyWidth(kEmptyEndText), f);
      }
      return f;
    }

    case kRegexpConcat: {
      // Fold the children left to right; Cat() itself takes care of
      // reversed mode.
      Frag f = child_frags[0];
      for (int i = 1; i < nchild_frags; i++)
        f = Cat(f, child_frags[i]);
      return f;
    }

    case kRegexpAlternate: {
      Frag f = child_frags[0];
      for (int i = 1; i < nchild_frags; i++)
        f = Alt(f, child_frags[i]);
      return f;
    }

    case kRegexpStar:
      return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);

    case kRegexpPlus:
      return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);

    case kRegexpQuest:
      return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);

    case kRegexpLiteral:
      return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0);

    case kRegexpLiteralString: {
      // Concatenation of literals.
      if (re->nrunes() == 0)
        return Nop();
      Frag f;
      for (int i = 0; i < re->nrunes(); i++) {
        Frag f1 = Literal(re->runes()[i],
                          (re->parse_flags()&Regexp::FoldCase) != 0);
        if (i == 0)
          f = f1;
        else
          f = Cat(f, f1);
      }
      return f;
    }

    case kRegexpAnyChar:
      BeginRange();
      AddRuneRange(0, Runemax, false);
      return EndRange();

    case kRegexpAnyByte:
      return ByteRange(0x00, 0xFF, false);

    case kRegexpCharClass: {
      CharClass* cc = re->cc();
      if (cc->empty()) {
        // This can't happen.
        LOG(DFATAL) << "No ranges in char class";
        failed_ = true;
        return NoMatch();
      }

      // ASCII case-folding optimization: if the char class
      // behaves the same on A-Z as it does on a-z,
      // discard any ranges wholly contained in A-Z
      // and mark the other ranges as foldascii.
      // This reduces the size of a program for
      // (?i)abc from 3 insts per letter to 1 per letter.
      bool foldascii = cc->FoldsASCII();

      // Character class is just a big OR of the different
      // character ranges in the class.
      BeginRange();
      for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) {
        // ASCII case-folding optimization (see above).
        if (foldascii && 'A' <= i->lo && i->hi <= 'Z')
          continue;

        // If this range contains all of A-Za-z or none of it,
        // the fold flag is unnecessary; don't bother.
        bool fold = foldascii;
        if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo ||
            ('Z' < i->lo && i->hi < 'a'))
          fold = false;

        AddRuneRange(i->lo, i->hi, fold);
      }
      return EndRange();
    }

    case kRegexpCapture:
      // If this is a non-capturing parenthesis -- (?:foo) --
      // just use the inner expression.
      if (re->cap() < 0)
        return child_frags[0];
      return Capture(child_frags[0], re->cap());

    // In reversed mode the program walks the text backward, so
    // begin-of-text/line and end-of-text/line assertions swap.
    case kRegexpBeginLine:
      return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine);

    case kRegexpEndLine:
      return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine);

    case kRegexpBeginText:
      return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText);

    case kRegexpEndText:
      return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText);

    case kRegexpWordBoundary:
      return EmptyWidth(kEmptyWordBoundary);

    case kRegexpNoWordBoundary:
      return EmptyWidth(kEmptyNonWordBoundary);
  }
  LOG(DFATAL) << "Missing case in Compiler: " << re->op();
  failed_ = true;
  return NoMatch();
}
1001
1002// Is this regexp required to start at the beginning of the text?
1003// Only approximate; can return false for complicated regexps like (\Aa|\Ab),
1004// but handles (\A(a|b)). Could use the Walker to write a more exact one.
static bool IsAnchorStart(Regexp** pre, int depth) {
  Regexp* re = *pre;
  Regexp* sub;
  // The depth limit makes sure that we don't overflow
  // the stack on a deeply nested regexp. As the comment
  // above says, IsAnchorStart is conservative, so returning
  // a false negative is okay. The exact limit is somewhat arbitrary.
  if (re == NULL || depth >= 4)
    return false;
  switch (re->op()) {
    default:
      break;
    case kRegexpConcat:
      // Only the first subexpression can carry the start anchor.
      if (re->nsub() > 0) {
        sub = re->sub()[0]->Incref();
        if (IsAnchorStart(&sub, depth+1)) {
          // Rebuild the concat around the anchor-stripped first child.
          PODArray<Regexp*> subcopy(re->nsub());
          subcopy[0] = sub;  // already have reference
          for (int i = 1; i < re->nsub(); i++)
            subcopy[i] = re->sub()[i]->Incref();
          *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
          re->Decref();
          return true;
        }
        sub->Decref();
      }
      break;
    case kRegexpCapture:
      sub = re->sub()[0]->Incref();
      if (IsAnchorStart(&sub, depth+1)) {
        *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
        re->Decref();
        return true;
      }
      sub->Decref();
      break;
    case kRegexpBeginText:
      // \A itself: strip it by replacing it with an empty literal string.
      *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
      re->Decref();
      return true;
  }
  return false;
}
1048
// Is this regexp required to start at the end of the text?
// Only approximate; can return false for complicated regexps like (a\z|b\z),
// but handles ((a|b)\z). Could use the Walker to write a more exact one.
//
// Mirror image of IsAnchorStart: on success, *pre is rewritten in place to
// an equivalent regexp with the trailing \z removed. The old *pre loses the
// caller's reference; the new *pre carries one reference for the caller.
static bool IsAnchorEnd(Regexp** pre, int depth) {
  Regexp* re = *pre;
  Regexp* sub;
  // The depth limit makes sure that we don't overflow
  // the stack on a deeply nested regexp. As the comment
  // above says, IsAnchorEnd is conservative, so returning
  // a false negative is okay. The exact limit is somewhat arbitrary.
  if (re == NULL || depth >= 4)
    return false;
  switch (re->op()) {
  default:
    break;
  case kRegexpConcat:
    // A concatenation is end-anchored iff its last element is.
    if (re->nsub() > 0) {
      sub = re->sub()[re->nsub() - 1]->Incref();
      if (IsAnchorEnd(&sub, depth+1)) {
        // Rebuild the concatenation with the rewritten (de-anchored)
        // last element; the remaining elements are shared via Incref.
        PODArray<Regexp*> subcopy(re->nsub());
        subcopy[re->nsub() - 1] = sub;  // already have reference
        for (int i = 0; i < re->nsub() - 1; i++)
          subcopy[i] = re->sub()[i]->Incref();
        *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
        re->Decref();
        return true;
      }
      // Recursion failed: drop the extra reference taken above.
      sub->Decref();
    }
    break;
  case kRegexpCapture:
    // A capture group is end-anchored iff its body is; keep the
    // capture wrapper (same group number) around the rewritten body.
    sub = re->sub()[0]->Incref();
    if (IsAnchorEnd(&sub, depth+1)) {
      *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
      re->Decref();
      return true;
    }
    sub->Decref();
    break;
  case kRegexpEndText:
    // Found \z itself: replace it with an empty literal string.
    *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
    re->Decref();
    return true;
  }
  return false;
}
1095
1096void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem,
1097 RE2::Anchor anchor) {
1098 prog_->set_flags(flags);
1099
1100 if (flags & Regexp::Latin1)
1101 encoding_ = kEncodingLatin1;
1102 max_mem_ = max_mem;
1103 if (max_mem <= 0) {
1104 max_ninst_ = 100000; // more than enough
1105 } else if (static_cast<size_t>(max_mem) <= sizeof(Prog)) {
1106 // No room for anything.
1107 max_ninst_ = 0;
1108 } else {
1109 int64_t m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
1110 // Limit instruction count so that inst->id() fits nicely in an int.
1111 // SparseArray also assumes that the indices (inst->id()) are ints.
1112 // The call to WalkExponential uses 2*max_ninst_ below,
1113 // and other places in the code use 2 or 3 * prog->size().
1114 // Limiting to 2^24 should avoid overflow in those places.
1115 // (The point of allowing more than 32 bits of memory is to
1116 // have plenty of room for the DFA states, not to use it up
1117 // on the program.)
1118 if (m >= 1<<24)
1119 m = 1<<24;
1120
1121 // Inst imposes its own limit (currently bigger than 2^24 but be safe).
1122 if (m > Prog::Inst::kMaxInst)
1123 m = Prog::Inst::kMaxInst;
1124
1125 max_ninst_ = static_cast<int>(m);
1126 }
1127
1128 anchor_ = anchor;
1129}
1130
// Compiles re, returning program.
// Caller is responsible for deleting prog_.
// If reversed is true, compiles a program that expects
// to run over the input string backward (reverses all concatenations).
// The reversed flag is also recorded in the returned program.
// Returns NULL if simplification or compilation fails.
Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) {
  Compiler c;
  c.Setup(re->parse_flags(), max_mem, RE2::UNANCHORED /* unused */);
  c.reversed_ = reversed;

  // Simplify to remove things like counted repetitions
  // and character classes like \d.
  Regexp* sre = re->Simplify();
  if (sre == NULL)
    return NULL;

  // Record whether prog is anchored, removing the anchors.
  // (They get in the way of other optimizations.)
  bool is_anchor_start = IsAnchorStart(&sre, 0);
  bool is_anchor_end = IsAnchorEnd(&sre, 0);

  // Generate fragment for entire regexp.
  Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
  sre->Decref();
  if (c.failed_)
    return NULL;

  // Success!  Finish by putting Match node at end, and record start.
  // Turn off c.reversed_ (if it is set) to force the remaining concatenations
  // to behave normally.
  c.reversed_ = false;
  all = c.Cat(all, c.Match(0));

  c.prog_->set_reversed(reversed);
  if (c.prog_->reversed()) {
    // A reversed program scans the text backward, so the roles of the
    // start and end anchors swap.
    c.prog_->set_anchor_start(is_anchor_end);
    c.prog_->set_anchor_end(is_anchor_start);
  } else {
    c.prog_->set_anchor_start(is_anchor_start);
    c.prog_->set_anchor_end(is_anchor_end);
  }

  c.prog_->set_start(all.begin);
  if (!c.prog_->anchor_start()) {
    // Also create unanchored version, which starts with a .*? loop.
    all = c.Cat(c.DotStar(), all);
  }
  c.prog_->set_start_unanchored(all.begin);

  // Hand ownership of prog_ to caller.
  return c.Finish();
}
1183
// Finalizes the compiled program: hands the instruction array to the Prog,
// runs the Prog's post-compilation passes, computes the memory left over
// for the DFA, and transfers ownership of prog_ to the caller.
// Returns NULL if compilation failed.
Prog* Compiler::Finish() {
  if (failed_)
    return NULL;

  if (prog_->start() == 0 && prog_->start_unanchored() == 0) {
    // No possible matches; keep Fail instruction only.
    ninst_ = 1;
  }

  // Hand off the array to Prog.
  prog_->inst_ = std::move(inst_);
  prog_->size_ = ninst_;

  prog_->Optimize();
  prog_->Flatten();
  prog_->ComputeByteMap();

  // Record remaining memory for DFA.
  if (max_mem_ <= 0) {
    // No explicit budget: give the DFA a fixed 1 MB.
    prog_->set_dfa_mem(1<<20);
  } else {
    // Whatever the budget has left after the Prog and its arrays is
    // available for DFA states (clamped at zero).
    int64_t m = max_mem_ - sizeof(Prog);
    m -= prog_->size_*sizeof(Prog::Inst);  // account for inst_
    if (prog_->CanBitState())
      m -= prog_->size_*sizeof(uint16_t);  // account for list_heads_
    if (m < 0)
      m = 0;
    prog_->set_dfa_mem(m);
  }

  // Transfer ownership: clear prog_ so this Compiler no longer refers to it.
  Prog* p = prog_;
  prog_ = NULL;
  return p;
}
1218
// Converts Regexp to Prog for forward matching.
// Thin wrapper over Compiler::Compile with reversed == false.
Prog* Regexp::CompileToProg(int64_t max_mem) {
  return Compiler::Compile(this, false, max_mem);
}
1223
// Converts Regexp to a Prog that runs over the input backward.
// Thin wrapper over Compiler::Compile with reversed == true.
Prog* Regexp::CompileToReverseProg(int64_t max_mem) {
  return Compiler::Compile(this, true, max_mem);
}
1227
1228Frag Compiler::DotStar() {
1229 return Star(ByteRange(0x00, 0xff, false), true);
1230}
1231
// Compiles RE set to Prog.
// Returns NULL if simplification or compilation fails, or if the DFA
// cannot run within the memory budget (sets have no NFA fallback).
Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
  Compiler c;
  c.Setup(re->parse_flags(), max_mem, anchor);

  // Simplify to remove things like counted repetitions
  // and character classes like \d (same first step as Compile above).
  Regexp* sre = re->Simplify();
  if (sre == NULL)
    return NULL;

  // Generate fragment for the entire set expression.
  Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
  sre->Decref();
  if (c.failed_)
    return NULL;

  // Set programs are marked anchored at both ends.
  c.prog_->set_anchor_start(true);
  c.prog_->set_anchor_end(true);

  if (anchor == RE2::UNANCHORED) {
    // Prepend .* or else the expression will effectively be anchored.
    // Complemented by the ANCHOR_BOTH case in PostVisit().
    all = c.Cat(c.DotStar(), all);
  }
  // Both entry points are the same fragment for a set program.
  c.prog_->set_start(all.begin);
  c.prog_->set_start_unanchored(all.begin);

  Prog* prog = c.Finish();
  if (prog == NULL)
    return NULL;

  // Make sure DFA has enough memory to operate,
  // since we're not going to fall back to the NFA.
  // The probe input is arbitrary; the search exists only to force the
  // DFA to build and report failure up front if it cannot.
  bool dfa_failed = false;
  StringPiece sp = "hello, world";
  prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
                  NULL, &dfa_failed, NULL);
  if (dfa_failed) {
    delete prog;
    return NULL;
  }

  return prog;
}
1274
// Public entry point for set compilation; forwards to Compiler::CompileSet.
Prog* Prog::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
  return Compiler::CompileSet(re, anchor, max_mem);
}
1278
1279} // namespace re2
1280