1// Copyright 2009 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5#include "re2/prefilter.h"
6
7#include <stddef.h>
8#include <stdint.h>
9#include <string>
10#include <vector>
11
12#include "util/util.h"
13#include "util/logging.h"
14#include "util/strutil.h"
15#include "util/utf.h"
16#include "re2/re2.h"
17#include "re2/unicode_casefold.h"
18#include "re2/walker-inl.h"
19
20namespace re2 {
21
// When true, the Info-building code in this file logs extra detail.
static const bool ExtraDebug = false;

// Iterator shorthands for sets of strings.
using SSIter = std::set<std::string>::iterator;
using ConstSSIter = std::set<std::string>::const_iterator;
26
27// Initializes a Prefilter, allocating subs_ as necessary.
28Prefilter::Prefilter(Op op) {
29 op_ = op;
30 subs_ = NULL;
31 if (op_ == AND || op_ == OR)
32 subs_ = new std::vector<Prefilter*>;
33}
34
35// Destroys a Prefilter.
36Prefilter::~Prefilter() {
37 if (subs_) {
38 for (size_t i = 0; i < subs_->size(); i++)
39 delete (*subs_)[i];
40 delete subs_;
41 subs_ = NULL;
42 }
43}
44
45// Simplify if the node is an empty Or or And.
46Prefilter* Prefilter::Simplify() {
47 if (op_ != AND && op_ != OR) {
48 return this;
49 }
50
51 // Nothing left in the AND/OR.
52 if (subs_->empty()) {
53 if (op_ == AND)
54 op_ = ALL; // AND of nothing is true
55 else
56 op_ = NONE; // OR of nothing is false
57
58 return this;
59 }
60
61 // Just one subnode: throw away wrapper.
62 if (subs_->size() == 1) {
63 Prefilter* a = (*subs_)[0];
64 subs_->clear();
65 delete this;
66 return a->Simplify();
67 }
68
69 return this;
70}
71
72// Combines two Prefilters together to create an "op" (AND or OR).
73// The passed Prefilters will be part of the returned Prefilter or deleted.
74// Does lots of work to avoid creating unnecessarily complicated structures.
75Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
76 // If a, b can be rewritten as op, do so.
77 a = a->Simplify();
78 b = b->Simplify();
79
80 // Canonicalize: a->op <= b->op.
81 if (a->op() > b->op()) {
82 Prefilter* t = a;
83 a = b;
84 b = t;
85 }
86
87 // Trivial cases.
88 // ALL AND b = b
89 // NONE OR b = b
90 // ALL OR b = ALL
91 // NONE AND b = NONE
92 // Don't need to look at b, because of canonicalization above.
93 // ALL and NONE are smallest opcodes.
94 if (a->op() == ALL || a->op() == NONE) {
95 if ((a->op() == ALL && op == AND) ||
96 (a->op() == NONE && op == OR)) {
97 delete a;
98 return b;
99 } else {
100 delete b;
101 return a;
102 }
103 }
104
105 // If a and b match op, merge their contents.
106 if (a->op() == op && b->op() == op) {
107 for (size_t i = 0; i < b->subs()->size(); i++) {
108 Prefilter* bb = (*b->subs())[i];
109 a->subs()->push_back(bb);
110 }
111 b->subs()->clear();
112 delete b;
113 return a;
114 }
115
116 // If a already has the same op as the op that is under construction
117 // add in b (similarly if b already has the same op, add in a).
118 if (b->op() == op) {
119 Prefilter* t = a;
120 a = b;
121 b = t;
122 }
123 if (a->op() == op) {
124 a->subs()->push_back(b);
125 return a;
126 }
127
128 // Otherwise just return the op.
129 Prefilter* c = new Prefilter(op);
130 c->subs()->push_back(a);
131 c->subs()->push_back(b);
132 return c;
133}
134
135Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
136 return AndOr(AND, a, b);
137}
138
139Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
140 return AndOr(OR, a, b);
141}
142
// Removes redundant strings from *ss, in place.
//
// A string is redundant when some other non-empty member of the set is a
// substring of it. For example, if we know "ab" is a required string, it
// does not help to also know that "abc" is required: when performing a
// string search to filter regexps, matching "ab" already makes the regexp
// a candidate, so "abc" is deleted.
//
// The set is ordered lexicographically, so a superstring can sort either
// after its substring ("abc" vs "ab") or before it ("aab" vs "ab").
// Every other element is therefore examined for each candidate substring,
// not only the ones that follow it.
//
// "" is never used as a candidate substring because find() would locate it
// at the start of everything and the whole set would be erased.
static void SimplifyStringSet(std::set<std::string>* ss) {
  typedef std::set<std::string>::iterator Iter;
  for (Iter i = ss->begin(); i != ss->end(); ++i) {
    if (i->empty())
      continue;
    Iter j = ss->begin();
    while (j != ss->end()) {
      // Elements are unique, so a proper superstring of *i is strictly
      // longer than *i. The length test also skips i itself and avoids
      // pointless find() calls. Erasing j never invalidates i.
      if (j->size() > i->size() && j->find(*i) != std::string::npos) {
        j = ss->erase(j);
        continue;
      }
      ++j;
    }
  }
}
166
167Prefilter* Prefilter::OrStrings(std::set<std::string>* ss) {
168 Prefilter* or_prefilter = new Prefilter(NONE);
169 SimplifyStringSet(ss);
170 for (SSIter i = ss->begin(); i != ss->end(); ++i)
171 or_prefilter = Or(or_prefilter, FromString(*i));
172 return or_prefilter;
173}
174
175static Rune ToLowerRune(Rune r) {
176 if (r < Runeself) {
177 if ('A' <= r && r <= 'Z')
178 r += 'a' - 'A';
179 return r;
180 }
181
182 const CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
183 if (f == NULL || r < f->lo)
184 return r;
185 return ApplyFold(f, r);
186}
187
188static Rune ToLowerRuneLatin1(Rune r) {
189 if ('A' <= r && r <= 'Z')
190 r += 'a' - 'A';
191 return r;
192}
193
194Prefilter* Prefilter::FromString(const std::string& str) {
195 Prefilter* m = new Prefilter(Prefilter::ATOM);
196 m->atom_ = str;
197 return m;
198}
199
200// Information about a regexp used during computation of Prefilter.
201// Can be thought of as information about the set of strings matching
202// the given regular expression.
203class Prefilter::Info {
204 public:
205 Info();
206 ~Info();
207
208 // More constructors. They delete their Info* arguments.
209 static Info* Alt(Info* a, Info* b);
210 static Info* Concat(Info* a, Info* b);
211 static Info* And(Info* a, Info* b);
212 static Info* Star(Info* a);
213 static Info* Plus(Info* a);
214 static Info* Quest(Info* a);
215 static Info* EmptyString();
216 static Info* NoMatch();
217 static Info* AnyCharOrAnyByte();
218 static Info* CClass(CharClass* cc, bool latin1);
219 static Info* Literal(Rune r);
220 static Info* LiteralLatin1(Rune r);
221 static Info* AnyMatch();
222
223 // Format Info as a string.
224 std::string ToString();
225
226 // Caller takes ownership of the Prefilter.
227 Prefilter* TakeMatch();
228
229 std::set<std::string>& exact() { return exact_; }
230
231 bool is_exact() const { return is_exact_; }
232
233 class Walker;
234
235 private:
236 std::set<std::string> exact_;
237
238 // When is_exact_ is true, the strings that match
239 // are placed in exact_. When it is no longer an exact
240 // set of strings that match this RE, then is_exact_
241 // is false and the match_ contains the required match
242 // criteria.
243 bool is_exact_;
244
245 // Accumulated Prefilter query that any
246 // match for this regexp is guaranteed to match.
247 Prefilter* match_;
248};
249
250
251Prefilter::Info::Info()
252 : is_exact_(false),
253 match_(NULL) {
254}
255
256Prefilter::Info::~Info() {
257 delete match_;
258}
259
260Prefilter* Prefilter::Info::TakeMatch() {
261 if (is_exact_) {
262 match_ = Prefilter::OrStrings(&exact_);
263 is_exact_ = false;
264 }
265 Prefilter* m = match_;
266 match_ = NULL;
267 return m;
268}
269
270// Format a Info in string form.
271std::string Prefilter::Info::ToString() {
272 if (is_exact_) {
273 int n = 0;
274 std::string s;
275 for (SSIter i = exact_.begin(); i != exact_.end(); ++i) {
276 if (n++ > 0)
277 s += ",";
278 s += *i;
279 }
280 return s;
281 }
282
283 if (match_)
284 return match_->DebugString();
285
286 return "";
287}
288
// Inserts every string of src into *dst; strings already present in
// *dst are left as they are.
static void CopyIn(const std::set<std::string>& src,
                   std::set<std::string>* dst) {
  dst->insert(src.begin(), src.end());
}
295
// Inserts into *dst the concatenation x+y of every x in a with every
// y in b. If either input is empty nothing is added.
static void CrossProduct(const std::set<std::string>& a,
                         const std::set<std::string>& b,
                         std::set<std::string>* dst) {
  for (const std::string& head : a) {
    for (const std::string& tail : b) {
      dst->insert(head + tail);
    }
  }
}
305
306// Concats a and b. Requires that both are exact sets.
307// Forms an exact set that is a crossproduct of a and b.
308Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
309 if (a == NULL)
310 return b;
311 DCHECK(a->is_exact_);
312 DCHECK(b && b->is_exact_);
313 Info *ab = new Info();
314
315 CrossProduct(a->exact_, b->exact_, &ab->exact_);
316 ab->is_exact_ = true;
317
318 delete a;
319 delete b;
320 return ab;
321}
322
323// Constructs an inexact Info for ab given a and b.
324// Used only when a or b is not exact or when the
325// exact cross product is likely to be too big.
326Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
327 if (a == NULL)
328 return b;
329 if (b == NULL)
330 return a;
331
332 Info *ab = new Info();
333
334 ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
335 ab->is_exact_ = false;
336 delete a;
337 delete b;
338 return ab;
339}
340
341// Constructs Info for a|b given a and b.
342Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
343 Info *ab = new Info();
344
345 if (a->is_exact_ && b->is_exact_) {
346 CopyIn(a->exact_, &ab->exact_);
347 CopyIn(b->exact_, &ab->exact_);
348 ab->is_exact_ = true;
349 } else {
350 // Either a or b has is_exact_ = false. If the other
351 // one has is_exact_ = true, we move it to match_ and
352 // then create a OR of a,b. The resulting Info has
353 // is_exact_ = false.
354 ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
355 ab->is_exact_ = false;
356 }
357
358 delete a;
359 delete b;
360 return ab;
361}
362
363// Constructs Info for a? given a.
364Prefilter::Info* Prefilter::Info::Quest(Info *a) {
365 Info *ab = new Info();
366
367 ab->is_exact_ = false;
368 ab->match_ = new Prefilter(ALL);
369 delete a;
370 return ab;
371}
372
373// Constructs Info for a* given a.
374// Same as a? -- not much to do.
375Prefilter::Info* Prefilter::Info::Star(Info *a) {
376 return Quest(a);
377}
378
379// Constructs Info for a+ given a. If a was exact set, it isn't
380// anymore.
381Prefilter::Info* Prefilter::Info::Plus(Info *a) {
382 Info *ab = new Info();
383
384 ab->match_ = a->TakeMatch();
385 ab->is_exact_ = false;
386
387 delete a;
388 return ab;
389}
390
391static std::string RuneToString(Rune r) {
392 char buf[UTFmax];
393 int n = runetochar(buf, &r);
394 return std::string(buf, n);
395}
396
397static std::string RuneToStringLatin1(Rune r) {
398 char c = r & 0xff;
399 return std::string(&c, 1);
400}
401
402// Constructs Info for literal rune.
403Prefilter::Info* Prefilter::Info::Literal(Rune r) {
404 Info* info = new Info();
405 info->exact_.insert(RuneToString(ToLowerRune(r)));
406 info->is_exact_ = true;
407 return info;
408}
409
410// Constructs Info for literal rune for Latin1 encoded string.
411Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
412 Info* info = new Info();
413 info->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
414 info->is_exact_ = true;
415 return info;
416}
417
418// Constructs Info for dot (any character) or \C (any byte).
419Prefilter::Info* Prefilter::Info::AnyCharOrAnyByte() {
420 Prefilter::Info* info = new Prefilter::Info();
421 info->match_ = new Prefilter(ALL);
422 return info;
423}
424
425// Constructs Prefilter::Info for no possible match.
426Prefilter::Info* Prefilter::Info::NoMatch() {
427 Prefilter::Info* info = new Prefilter::Info();
428 info->match_ = new Prefilter(NONE);
429 return info;
430}
431
432// Constructs Prefilter::Info for any possible match.
433// This Prefilter::Info is valid for any regular expression,
434// since it makes no assertions whatsoever about the
435// strings being matched.
436Prefilter::Info* Prefilter::Info::AnyMatch() {
437 Prefilter::Info *info = new Prefilter::Info();
438 info->match_ = new Prefilter(ALL);
439 return info;
440}
441
442// Constructs Prefilter::Info for just the empty string.
443Prefilter::Info* Prefilter::Info::EmptyString() {
444 Prefilter::Info* info = new Prefilter::Info();
445 info->is_exact_ = true;
446 info->exact_.insert("");
447 return info;
448}
449
450// Constructs Prefilter::Info for a character class.
451typedef CharClass::iterator CCIter;
452Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
453 bool latin1) {
454 if (ExtraDebug) {
455 LOG(ERROR) << "CharClassInfo:";
456 for (CCIter i = cc->begin(); i != cc->end(); ++i)
457 LOG(ERROR) << " " << i->lo << "-" << i->hi;
458 }
459
460 // If the class is too large, it's okay to overestimate.
461 if (cc->size() > 10)
462 return AnyCharOrAnyByte();
463
464 Prefilter::Info *a = new Prefilter::Info();
465 for (CCIter i = cc->begin(); i != cc->end(); ++i)
466 for (Rune r = i->lo; r <= i->hi; r++) {
467 if (latin1) {
468 a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
469 } else {
470 a->exact_.insert(RuneToString(ToLowerRune(r)));
471 }
472 }
473
474
475 a->is_exact_ = true;
476
477 if (ExtraDebug)
478 LOG(ERROR) << " = " << a->ToString();
479
480 return a;
481}
482
483class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
484 public:
485 Walker(bool latin1) : latin1_(latin1) {}
486
487 virtual Info* PostVisit(
488 Regexp* re, Info* parent_arg,
489 Info* pre_arg,
490 Info** child_args, int nchild_args);
491
492 virtual Info* ShortVisit(
493 Regexp* re,
494 Info* parent_arg);
495
496 bool latin1() { return latin1_; }
497 private:
498 bool latin1_;
499
500 Walker(const Walker&) = delete;
501 Walker& operator=(const Walker&) = delete;
502};
503
504Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
505 if (ExtraDebug)
506 LOG(ERROR) << "BuildPrefilter::Info: " << re->ToString();
507
508 bool latin1 = (re->parse_flags() & Regexp::Latin1) != 0;
509 Prefilter::Info::Walker w(latin1);
510 Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
511
512 if (w.stopped_early()) {
513 delete info;
514 return NULL;
515 }
516
517 return info;
518}
519
520Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
521 Regexp* re, Prefilter::Info* parent_arg) {
522 return AnyMatch();
523}
524
525// Constructs the Prefilter::Info for the given regular expression.
526// Assumes re is simplified.
527Prefilter::Info* Prefilter::Info::Walker::PostVisit(
528 Regexp* re, Prefilter::Info* parent_arg,
529 Prefilter::Info* pre_arg, Prefilter::Info** child_args,
530 int nchild_args) {
531 Prefilter::Info *info;
532 switch (re->op()) {
533 default:
534 case kRegexpRepeat:
535 LOG(DFATAL) << "Bad regexp op " << re->op();
536 info = EmptyString();
537 break;
538
539 case kRegexpNoMatch:
540 info = NoMatch();
541 break;
542
543 // These ops match the empty string:
544 case kRegexpEmptyMatch: // anywhere
545 case kRegexpBeginLine: // at beginning of line
546 case kRegexpEndLine: // at end of line
547 case kRegexpBeginText: // at beginning of text
548 case kRegexpEndText: // at end of text
549 case kRegexpWordBoundary: // at word boundary
550 case kRegexpNoWordBoundary: // not at word boundary
551 info = EmptyString();
552 break;
553
554 case kRegexpLiteral:
555 if (latin1()) {
556 info = LiteralLatin1(re->rune());
557 }
558 else {
559 info = Literal(re->rune());
560 }
561 break;
562
563 case kRegexpLiteralString:
564 if (re->nrunes() == 0) {
565 info = NoMatch();
566 break;
567 }
568 if (latin1()) {
569 info = LiteralLatin1(re->runes()[0]);
570 for (int i = 1; i < re->nrunes(); i++) {
571 info = Concat(info, LiteralLatin1(re->runes()[i]));
572 }
573 } else {
574 info = Literal(re->runes()[0]);
575 for (int i = 1; i < re->nrunes(); i++) {
576 info = Concat(info, Literal(re->runes()[i]));
577 }
578 }
579 break;
580
581 case kRegexpConcat: {
582 // Accumulate in info.
583 // Exact is concat of recent contiguous exact nodes.
584 info = NULL;
585 Info* exact = NULL;
586 for (int i = 0; i < nchild_args; i++) {
587 Info* ci = child_args[i]; // child info
588 if (!ci->is_exact() ||
589 (exact && ci->exact().size() * exact->exact().size() > 16)) {
590 // Exact run is over.
591 info = And(info, exact);
592 exact = NULL;
593 // Add this child's info.
594 info = And(info, ci);
595 } else {
596 // Append to exact run.
597 exact = Concat(exact, ci);
598 }
599 }
600 info = And(info, exact);
601 }
602 break;
603
604 case kRegexpAlternate:
605 info = child_args[0];
606 for (int i = 1; i < nchild_args; i++)
607 info = Alt(info, child_args[i]);
608 break;
609
610 case kRegexpStar:
611 info = Star(child_args[0]);
612 break;
613
614 case kRegexpQuest:
615 info = Quest(child_args[0]);
616 break;
617
618 case kRegexpPlus:
619 info = Plus(child_args[0]);
620 break;
621
622 case kRegexpAnyChar:
623 case kRegexpAnyByte:
624 // Claim nothing, except that it's not empty.
625 info = AnyCharOrAnyByte();
626 break;
627
628 case kRegexpCharClass:
629 info = CClass(re->cc(), latin1());
630 break;
631
632 case kRegexpCapture:
633 // These don't affect the set of matching strings.
634 info = child_args[0];
635 break;
636 }
637
638 if (ExtraDebug)
639 LOG(ERROR) << "BuildInfo " << re->ToString()
640 << ": " << (info ? info->ToString() : "");
641
642 return info;
643}
644
645
646Prefilter* Prefilter::FromRegexp(Regexp* re) {
647 if (re == NULL)
648 return NULL;
649
650 Regexp* simple = re->Simplify();
651 Prefilter::Info *info = BuildInfo(simple);
652
653 simple->Decref();
654 if (info == NULL)
655 return NULL;
656
657 Prefilter* m = info->TakeMatch();
658
659 delete info;
660 return m;
661}
662
663std::string Prefilter::DebugString() const {
664 switch (op_) {
665 default:
666 LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
667 return StringPrintf("op%d", op_);
668 case NONE:
669 return "*no-matches*";
670 case ATOM:
671 return atom_;
672 case ALL:
673 return "";
674 case AND: {
675 std::string s = "";
676 for (size_t i = 0; i < subs_->size(); i++) {
677 if (i > 0)
678 s += " ";
679 Prefilter* sub = (*subs_)[i];
680 s += sub ? sub->DebugString() : "<nil>";
681 }
682 return s;
683 }
684 case OR: {
685 std::string s = "(";
686 for (size_t i = 0; i < subs_->size(); i++) {
687 if (i > 0)
688 s += "|";
689 Prefilter* sub = (*subs_)[i];
690 s += sub ? sub->DebugString() : "<nil>";
691 }
692 s += ")";
693 return s;
694 }
695 }
696}
697
698Prefilter* Prefilter::FromRE2(const RE2* re2) {
699 if (re2 == NULL)
700 return NULL;
701
702 Regexp* regexp = re2->Regexp();
703 if (regexp == NULL)
704 return NULL;
705
706 return FromRegexp(regexp);
707}
708
709
710} // namespace re2
711