1// Copyright 2009 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5#ifndef RE2_FILTERED_RE2_H_
6#define RE2_FILTERED_RE2_H_
7
8// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
9// It provides a prefilter mechanism that helps in cutting down the
10// number of regexps that need to be actually searched.
11//
12// By design, it does not include a string matching engine. This is to
13// allow the user of the class to use their favorite string match
14// engine. The overall flow is: Add all the regexps using Add, then
15// Compile the FilteredRE2. The compile returns strings that need to
16// be matched. Note that all returned strings are lowercase. For
17// applying regexps to a search text, the caller does the string
18// matching using the strings returned. When doing the string match,
19// note that the caller has to do that on lower cased version of the
20// search text. Then call FirstMatch or AllMatches with a vector of
21// indices of strings that were found in the text to get the actual
22// regexp matches.
23
24#include <string>
25#include <vector>
26
27#include "re2/re2.h"
28
29namespace re2 {
30
31class PrefilterTree;
32
33class FilteredRE2 {
34 public:
35 FilteredRE2();
36 explicit FilteredRE2(int min_atom_len);
37 ~FilteredRE2();
38
39 // Uses RE2 constructor to create a RE2 object (re). Returns
40 // re->error_code(). If error_code is other than NoError, then re is
41 // deleted and not added to re2_vec_.
42 RE2::ErrorCode Add(const StringPiece& pattern,
43 const RE2::Options& options,
44 int *id);
45
46 // Prepares the regexps added by Add for filtering. Returns a set
47 // of strings that the caller should check for in candidate texts.
48 // The returned strings are lowercased. When doing string matching,
49 // the search text should be lowercased first to find matching
50 // strings from the set of strings returned by Compile. Call after
51 // all Add calls are done.
52 void Compile(std::vector<std::string>* strings_to_match);
53
54 // Returns the index of the first matching regexp.
55 // Returns -1 on no match. Can be called prior to Compile.
56 // Does not do any filtering: simply tries to Match the
57 // regexps in a loop.
58 int SlowFirstMatch(const StringPiece& text) const;
59
60 // Returns the index of the first matching regexp.
61 // Returns -1 on no match. Compile has to be called before
62 // calling this.
63 int FirstMatch(const StringPiece& text,
64 const std::vector<int>& atoms) const;
65
66 // Returns the indices of all matching regexps, after first clearing
67 // matched_regexps.
68 bool AllMatches(const StringPiece& text,
69 const std::vector<int>& atoms,
70 std::vector<int>* matching_regexps) const;
71
72 // Returns the indices of all potentially matching regexps after first
73 // clearing potential_regexps.
74 // A regexp is potentially matching if it passes the filter.
75 // If a regexp passes the filter it may still not match.
76 // A regexp that does not pass the filter is guaranteed to not match.
77 void AllPotentials(const std::vector<int>& atoms,
78 std::vector<int>* potential_regexps) const;
79
80 // The number of regexps added.
81 int NumRegexps() const { return static_cast<int>(re2_vec_.size()); }
82
83 // Get the individual RE2 objects.
84 const RE2& GetRE2(int regexpid) const { return *re2_vec_[regexpid]; }
85
86 private:
87 // Print prefilter.
88 void PrintPrefilter(int regexpid);
89
90 // Useful for testing and debugging.
91 void RegexpsGivenStrings(const std::vector<int>& matched_atoms,
92 std::vector<int>* passed_regexps);
93
94 // All the regexps in the FilteredRE2.
95 std::vector<RE2*> re2_vec_;
96
97 // Has the FilteredRE2 been compiled using Compile()
98 bool compiled_;
99
100 // An AND-OR tree of string atoms used for filtering regexps.
101 PrefilterTree* prefilter_tree_;
102
103 FilteredRE2(const FilteredRE2&) = delete;
104 FilteredRE2& operator=(const FilteredRE2&) = delete;
105};
106
107} // namespace re2
108
109#endif // RE2_FILTERED_RE2_H_
110