1 | // Copyright 2009 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | #ifndef RE2_PREFILTER_TREE_H_ |
6 | #define RE2_PREFILTER_TREE_H_ |
7 | |
8 | // The PrefilterTree class is used to form an AND-OR tree of strings |
9 | // that would trigger each regexp. The 'prefilter' of each regexp is |
10 | // added to PrefilterTree, and then PrefilterTree is used to find all |
11 | // the unique strings across the prefilters. During search, by using |
12 | // matches from a string matching engine, PrefilterTree deduces the |
13 | // set of regexps that are to be triggered. The 'string matching |
14 | // engine' itself is outside of this class, and the caller can use any |
15 | // favorite engine. PrefilterTree provides a set of strings (called |
16 | // atoms) that the user of this class should use to do the string |
17 | // matching. |
18 | |
19 | #include <map> |
20 | #include <string> |
21 | #include <vector> |
22 | |
23 | #include "util/util.h" |
24 | #include "re2/prefilter.h" |
25 | #include "re2/sparse_array.h" |
26 | |
27 | namespace re2 { |
28 | |
29 | class PrefilterTree { |
30 | public: |
31 | PrefilterTree(); |
32 | explicit PrefilterTree(int min_atom_len); |
33 | ~PrefilterTree(); |
34 | |
35 | // Adds the prefilter for the next regexp. Note that we assume that |
36 | // Add called sequentially for all regexps. All Add calls |
37 | // must precede Compile. |
38 | void Add(Prefilter* prefilter); |
39 | |
40 | // The Compile returns a vector of string in atom_vec. |
41 | // Call this after all the prefilters are added through Add. |
42 | // No calls to Add after Compile are allowed. |
43 | // The caller should use the returned set of strings to do string matching. |
44 | // Each time a string matches, the corresponding index then has to be |
45 | // and passed to RegexpsGivenStrings below. |
46 | void Compile(std::vector<std::string>* atom_vec); |
47 | |
48 | // Given the indices of the atoms that matched, returns the indexes |
49 | // of regexps that should be searched. The matched_atoms should |
50 | // contain all the ids of string atoms that were found to match the |
51 | // content. The caller can use any string match engine to perform |
52 | // this function. This function is thread safe. |
53 | void RegexpsGivenStrings(const std::vector<int>& matched_atoms, |
54 | std::vector<int>* regexps) const; |
55 | |
56 | // Print debug prefilter. Also prints unique ids associated with |
57 | // nodes of the prefilter of the regexp. |
58 | void PrintPrefilter(int regexpid); |
59 | |
60 | private: |
61 | typedef SparseArray<int> IntMap; |
62 | typedef std::map<int, int> StdIntMap; |
63 | typedef std::map<std::string, Prefilter*> NodeMap; |
64 | |
65 | // Each unique node has a corresponding Entry that helps in |
66 | // passing the matching trigger information along the tree. |
67 | struct Entry { |
68 | public: |
69 | // How many children should match before this node triggers the |
70 | // parent. For an atom and an OR node, this is 1 and for an AND |
71 | // node, it is the number of unique children. |
72 | int propagate_up_at_count; |
73 | |
74 | // When this node is ready to trigger the parent, what are the indices |
75 | // of the parent nodes to trigger. The reason there may be more than |
76 | // one is because of sharing. For example (abc | def) and (xyz | def) |
77 | // are two different nodes, but they share the atom 'def'. So when |
78 | // 'def' matches, it triggers two parents, corresponding to the two |
79 | // different OR nodes. |
80 | StdIntMap* parents; |
81 | |
82 | // When this node is ready to trigger the parent, what are the |
83 | // regexps that are triggered. |
84 | std::vector<int> regexps; |
85 | }; |
86 | |
87 | // Returns true if the prefilter node should be kept. |
88 | bool KeepNode(Prefilter* node) const; |
89 | |
90 | // This function assigns unique ids to various parts of the |
91 | // prefilter, by looking at if these nodes are already in the |
92 | // PrefilterTree. |
93 | void AssignUniqueIds(NodeMap* nodes, std::vector<std::string>* atom_vec); |
94 | |
95 | // Given the matching atoms, find the regexps to be triggered. |
96 | void PropagateMatch(const std::vector<int>& atom_ids, |
97 | IntMap* regexps) const; |
98 | |
99 | // Returns the prefilter node that has the same NodeString as this |
100 | // node. For the canonical node, returns node. |
101 | Prefilter* CanonicalNode(NodeMap* nodes, Prefilter* node); |
102 | |
103 | // A string that uniquely identifies the node. Assumes that the |
104 | // children of node has already been assigned unique ids. |
105 | std::string NodeString(Prefilter* node) const; |
106 | |
107 | // Recursively constructs a readable prefilter string. |
108 | std::string DebugNodeString(Prefilter* node) const; |
109 | |
110 | // Used for debugging. |
111 | void PrintDebugInfo(NodeMap* nodes); |
112 | |
113 | // These are all the nodes formed by Compile. Essentially, there is |
114 | // one node for each unique atom and each unique AND/OR node. |
115 | std::vector<Entry> entries_; |
116 | |
117 | // indices of regexps that always pass through the filter (since we |
118 | // found no required literals in these regexps). |
119 | std::vector<int> unfiltered_; |
120 | |
121 | // vector of Prefilter for all regexps. |
122 | std::vector<Prefilter*> prefilter_vec_; |
123 | |
124 | // Atom index in returned strings to entry id mapping. |
125 | std::vector<int> atom_index_to_id_; |
126 | |
127 | // Has the prefilter tree been compiled. |
128 | bool compiled_; |
129 | |
130 | // Strings less than this length are not stored as atoms. |
131 | const int min_atom_len_; |
132 | |
133 | PrefilterTree(const PrefilterTree&) = delete; |
134 | PrefilterTree& operator=(const PrefilterTree&) = delete; |
135 | }; |
136 | |
137 | } // namespace |
138 | |
139 | #endif // RE2_PREFILTER_TREE_H_ |
140 | |