1 | // Copyright 2009 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | #ifndef RE2_FILTERED_RE2_H_ |
6 | #define RE2_FILTERED_RE2_H_ |
7 | |
8 | // The class FilteredRE2 is used as a wrapper to multiple RE2 regexps. |
9 | // It provides a prefilter mechanism that helps in cutting down the |
10 | // number of regexps that need to be actually searched. |
11 | // |
12 | // By design, it does not include a string matching engine. This is to |
13 | // allow the user of the class to use their favorite string match |
14 | // engine. The overall flow is: Add all the regexps using Add, then |
15 | // Compile the FilteredRE2. The compile returns strings that need to |
16 | // be matched. Note that all returned strings are lowercase. For |
17 | // applying regexps to a search text, the caller does the string |
18 | // matching using the strings returned. When doing the string match, |
19 | // note that the caller has to do that on lower cased version of the |
20 | // search text. Then call FirstMatch or AllMatches with a vector of |
21 | // indices of strings that were found in the text to get the actual |
22 | // regexp matches. |
23 | |
24 | #include <string> |
25 | #include <vector> |
26 | |
27 | #include "re2/re2.h" |
28 | |
29 | namespace re2 { |
30 | |
31 | class PrefilterTree; |
32 | |
33 | class FilteredRE2 { |
34 | public: |
35 | FilteredRE2(); |
36 | explicit FilteredRE2(int min_atom_len); |
37 | ~FilteredRE2(); |
38 | |
39 | // Uses RE2 constructor to create a RE2 object (re). Returns |
40 | // re->error_code(). If error_code is other than NoError, then re is |
41 | // deleted and not added to re2_vec_. |
42 | RE2::ErrorCode Add(const StringPiece& pattern, |
43 | const RE2::Options& options, |
44 | int *id); |
45 | |
46 | // Prepares the regexps added by Add for filtering. Returns a set |
47 | // of strings that the caller should check for in candidate texts. |
48 | // The returned strings are lowercased. When doing string matching, |
49 | // the search text should be lowercased first to find matching |
50 | // strings from the set of strings returned by Compile. Call after |
51 | // all Add calls are done. |
52 | void Compile(std::vector<std::string>* strings_to_match); |
53 | |
54 | // Returns the index of the first matching regexp. |
55 | // Returns -1 on no match. Can be called prior to Compile. |
56 | // Does not do any filtering: simply tries to Match the |
57 | // regexps in a loop. |
58 | int SlowFirstMatch(const StringPiece& text) const; |
59 | |
60 | // Returns the index of the first matching regexp. |
61 | // Returns -1 on no match. Compile has to be called before |
62 | // calling this. |
63 | int FirstMatch(const StringPiece& text, |
64 | const std::vector<int>& atoms) const; |
65 | |
66 | // Returns the indices of all matching regexps, after first clearing |
67 | // matched_regexps. |
68 | bool AllMatches(const StringPiece& text, |
69 | const std::vector<int>& atoms, |
70 | std::vector<int>* matching_regexps) const; |
71 | |
72 | // Returns the indices of all potentially matching regexps after first |
73 | // clearing potential_regexps. |
74 | // A regexp is potentially matching if it passes the filter. |
75 | // If a regexp passes the filter it may still not match. |
76 | // A regexp that does not pass the filter is guaranteed to not match. |
77 | void AllPotentials(const std::vector<int>& atoms, |
78 | std::vector<int>* potential_regexps) const; |
79 | |
80 | // The number of regexps added. |
81 | int NumRegexps() const { return static_cast<int>(re2_vec_.size()); } |
82 | |
83 | // Get the individual RE2 objects. |
84 | const RE2& GetRE2(int regexpid) const { return *re2_vec_[regexpid]; } |
85 | |
86 | private: |
87 | // Print prefilter. |
88 | void PrintPrefilter(int regexpid); |
89 | |
90 | // Useful for testing and debugging. |
91 | void RegexpsGivenStrings(const std::vector<int>& matched_atoms, |
92 | std::vector<int>* passed_regexps); |
93 | |
94 | // All the regexps in the FilteredRE2. |
95 | std::vector<RE2*> re2_vec_; |
96 | |
97 | // Has the FilteredRE2 been compiled using Compile() |
98 | bool compiled_; |
99 | |
100 | // An AND-OR tree of string atoms used for filtering regexps. |
101 | PrefilterTree* prefilter_tree_; |
102 | |
103 | FilteredRE2(const FilteredRE2&) = delete; |
104 | FilteredRE2& operator=(const FilteredRE2&) = delete; |
105 | }; |
106 | |
107 | } // namespace re2 |
108 | |
109 | #endif // RE2_FILTERED_RE2_H_ |
110 | |