1// Copyright 2009 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5#ifndef RE2_PREFILTER_TREE_H_
6#define RE2_PREFILTER_TREE_H_
7
8// The PrefilterTree class is used to form an AND-OR tree of strings
9// that would trigger each regexp. The 'prefilter' of each regexp is
10// added to PrefilterTree, and then PrefilterTree is used to find all
11// the unique strings across the prefilters. During search, by using
12// matches from a string matching engine, PrefilterTree deduces the
13// set of regexps that are to be triggered. The 'string matching
14// engine' itself is outside of this class, and the caller can use any
15// favorite engine. PrefilterTree provides a set of strings (called
16// atoms) that the user of this class should use to do the string
17// matching.
18
19#include <map>
20#include <string>
21#include <vector>
22
23#include "util/util.h"
24#include "re2/prefilter.h"
25#include "re2/sparse_array.h"
26
27namespace re2 {
28
29class PrefilterTree {
30 public:
31 PrefilterTree();
32 explicit PrefilterTree(int min_atom_len);
33 ~PrefilterTree();
34
35 // Adds the prefilter for the next regexp. Note that we assume that
36 // Add called sequentially for all regexps. All Add calls
37 // must precede Compile.
38 void Add(Prefilter* prefilter);
39
40 // The Compile returns a vector of string in atom_vec.
41 // Call this after all the prefilters are added through Add.
42 // No calls to Add after Compile are allowed.
43 // The caller should use the returned set of strings to do string matching.
44 // Each time a string matches, the corresponding index then has to be
45 // and passed to RegexpsGivenStrings below.
46 void Compile(std::vector<std::string>* atom_vec);
47
48 // Given the indices of the atoms that matched, returns the indexes
49 // of regexps that should be searched. The matched_atoms should
50 // contain all the ids of string atoms that were found to match the
51 // content. The caller can use any string match engine to perform
52 // this function. This function is thread safe.
53 void RegexpsGivenStrings(const std::vector<int>& matched_atoms,
54 std::vector<int>* regexps) const;
55
56 // Print debug prefilter. Also prints unique ids associated with
57 // nodes of the prefilter of the regexp.
58 void PrintPrefilter(int regexpid);
59
60 private:
61 typedef SparseArray<int> IntMap;
62 typedef std::map<int, int> StdIntMap;
63 typedef std::map<std::string, Prefilter*> NodeMap;
64
65 // Each unique node has a corresponding Entry that helps in
66 // passing the matching trigger information along the tree.
67 struct Entry {
68 public:
69 // How many children should match before this node triggers the
70 // parent. For an atom and an OR node, this is 1 and for an AND
71 // node, it is the number of unique children.
72 int propagate_up_at_count;
73
74 // When this node is ready to trigger the parent, what are the indices
75 // of the parent nodes to trigger. The reason there may be more than
76 // one is because of sharing. For example (abc | def) and (xyz | def)
77 // are two different nodes, but they share the atom 'def'. So when
78 // 'def' matches, it triggers two parents, corresponding to the two
79 // different OR nodes.
80 StdIntMap* parents;
81
82 // When this node is ready to trigger the parent, what are the
83 // regexps that are triggered.
84 std::vector<int> regexps;
85 };
86
87 // Returns true if the prefilter node should be kept.
88 bool KeepNode(Prefilter* node) const;
89
90 // This function assigns unique ids to various parts of the
91 // prefilter, by looking at if these nodes are already in the
92 // PrefilterTree.
93 void AssignUniqueIds(NodeMap* nodes, std::vector<std::string>* atom_vec);
94
95 // Given the matching atoms, find the regexps to be triggered.
96 void PropagateMatch(const std::vector<int>& atom_ids,
97 IntMap* regexps) const;
98
99 // Returns the prefilter node that has the same NodeString as this
100 // node. For the canonical node, returns node.
101 Prefilter* CanonicalNode(NodeMap* nodes, Prefilter* node);
102
103 // A string that uniquely identifies the node. Assumes that the
104 // children of node has already been assigned unique ids.
105 std::string NodeString(Prefilter* node) const;
106
107 // Recursively constructs a readable prefilter string.
108 std::string DebugNodeString(Prefilter* node) const;
109
110 // Used for debugging.
111 void PrintDebugInfo(NodeMap* nodes);
112
113 // These are all the nodes formed by Compile. Essentially, there is
114 // one node for each unique atom and each unique AND/OR node.
115 std::vector<Entry> entries_;
116
117 // indices of regexps that always pass through the filter (since we
118 // found no required literals in these regexps).
119 std::vector<int> unfiltered_;
120
121 // vector of Prefilter for all regexps.
122 std::vector<Prefilter*> prefilter_vec_;
123
124 // Atom index in returned strings to entry id mapping.
125 std::vector<int> atom_index_to_id_;
126
127 // Has the prefilter tree been compiled.
128 bool compiled_;
129
130 // Strings less than this length are not stored as atoms.
131 const int min_atom_len_;
132
133 PrefilterTree(const PrefilterTree&) = delete;
134 PrefilterTree& operator=(const PrefilterTree&) = delete;
135};
136
137} // namespace
138
139#endif // RE2_PREFILTER_TREE_H_
140