1 | // Copyright 2009 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | #include "re2/prefilter_tree.h" |
6 | |
7 | #include <stddef.h> |
8 | #include <algorithm> |
9 | #include <map> |
10 | #include <memory> |
11 | #include <set> |
12 | #include <string> |
13 | #include <utility> |
14 | #include <vector> |
15 | |
16 | #include "util/util.h" |
17 | #include "util/logging.h" |
18 | #include "util/strutil.h" |
19 | #include "re2/prefilter.h" |
20 | #include "re2/re2.h" |
21 | |
22 | namespace re2 { |
23 | |
24 | static const bool = false; |
25 | |
26 | PrefilterTree::PrefilterTree() |
27 | : compiled_(false), |
28 | min_atom_len_(3) { |
29 | } |
30 | |
31 | PrefilterTree::PrefilterTree(int min_atom_len) |
32 | : compiled_(false), |
33 | min_atom_len_(min_atom_len) { |
34 | } |
35 | |
36 | PrefilterTree::~PrefilterTree() { |
37 | for (size_t i = 0; i < prefilter_vec_.size(); i++) |
38 | delete prefilter_vec_[i]; |
39 | |
40 | for (size_t i = 0; i < entries_.size(); i++) |
41 | delete entries_[i].parents; |
42 | } |
43 | |
44 | void PrefilterTree::Add(Prefilter* prefilter) { |
45 | if (compiled_) { |
46 | LOG(DFATAL) << "Add called after Compile." ; |
47 | return; |
48 | } |
49 | if (prefilter != NULL && !KeepNode(prefilter)) { |
50 | delete prefilter; |
51 | prefilter = NULL; |
52 | } |
53 | |
54 | prefilter_vec_.push_back(prefilter); |
55 | } |
56 | |
57 | void PrefilterTree::Compile(std::vector<std::string>* atom_vec) { |
58 | if (compiled_) { |
59 | LOG(DFATAL) << "Compile called already." ; |
60 | return; |
61 | } |
62 | |
63 | // Some legacy users of PrefilterTree call Compile() before |
64 | // adding any regexps and expect Compile() to have no effect. |
65 | if (prefilter_vec_.empty()) |
66 | return; |
67 | |
68 | compiled_ = true; |
69 | |
70 | // TODO(junyer): Use std::unordered_set<Prefilter*> instead? |
71 | NodeMap nodes; |
72 | AssignUniqueIds(&nodes, atom_vec); |
73 | |
74 | // Identify nodes that are too common among prefilters and are |
75 | // triggering too many parents. Then get rid of them if possible. |
76 | // Note that getting rid of a prefilter node simply means they are |
77 | // no longer necessary for their parent to trigger; that is, we do |
78 | // not miss out on any regexps triggering by getting rid of a |
79 | // prefilter node. |
80 | for (size_t i = 0; i < entries_.size(); i++) { |
81 | StdIntMap* parents = entries_[i].parents; |
82 | if (parents->size() > 8) { |
83 | // This one triggers too many things. If all the parents are AND |
84 | // nodes and have other things guarding them, then get rid of |
85 | // this trigger. TODO(vsri): Adjust the threshold appropriately, |
86 | // make it a function of total number of nodes? |
87 | bool have_other_guard = true; |
88 | for (StdIntMap::iterator it = parents->begin(); |
89 | it != parents->end(); ++it) { |
90 | have_other_guard = have_other_guard && |
91 | (entries_[it->first].propagate_up_at_count > 1); |
92 | } |
93 | |
94 | if (have_other_guard) { |
95 | for (StdIntMap::iterator it = parents->begin(); |
96 | it != parents->end(); ++it) |
97 | entries_[it->first].propagate_up_at_count -= 1; |
98 | |
99 | parents->clear(); // Forget the parents |
100 | } |
101 | } |
102 | } |
103 | |
104 | if (ExtraDebug) |
105 | PrintDebugInfo(&nodes); |
106 | } |
107 | |
108 | Prefilter* PrefilterTree::CanonicalNode(NodeMap* nodes, Prefilter* node) { |
109 | std::string node_string = NodeString(node); |
110 | std::map<std::string, Prefilter*>::iterator iter = nodes->find(node_string); |
111 | if (iter == nodes->end()) |
112 | return NULL; |
113 | return (*iter).second; |
114 | } |
115 | |
116 | std::string PrefilterTree::NodeString(Prefilter* node) const { |
117 | // Adding the operation disambiguates AND/OR/atom nodes. |
118 | std::string s = StringPrintf("%d" , node->op()) + ":" ; |
119 | if (node->op() == Prefilter::ATOM) { |
120 | s += node->atom(); |
121 | } else { |
122 | for (size_t i = 0; i < node->subs()->size(); i++) { |
123 | if (i > 0) |
124 | s += ','; |
125 | s += StringPrintf("%d" , (*node->subs())[i]->unique_id()); |
126 | } |
127 | } |
128 | return s; |
129 | } |
130 | |
131 | bool PrefilterTree::KeepNode(Prefilter* node) const { |
132 | if (node == NULL) |
133 | return false; |
134 | |
135 | switch (node->op()) { |
136 | default: |
137 | LOG(DFATAL) << "Unexpected op in KeepNode: " << node->op(); |
138 | return false; |
139 | |
140 | case Prefilter::ALL: |
141 | case Prefilter::NONE: |
142 | return false; |
143 | |
144 | case Prefilter::ATOM: |
145 | return node->atom().size() >= static_cast<size_t>(min_atom_len_); |
146 | |
147 | case Prefilter::AND: { |
148 | int j = 0; |
149 | std::vector<Prefilter*>* subs = node->subs(); |
150 | for (size_t i = 0; i < subs->size(); i++) |
151 | if (KeepNode((*subs)[i])) |
152 | (*subs)[j++] = (*subs)[i]; |
153 | else |
154 | delete (*subs)[i]; |
155 | |
156 | subs->resize(j); |
157 | return j > 0; |
158 | } |
159 | |
160 | case Prefilter::OR: |
161 | for (size_t i = 0; i < node->subs()->size(); i++) |
162 | if (!KeepNode((*node->subs())[i])) |
163 | return false; |
164 | return true; |
165 | } |
166 | } |
167 | |
168 | void PrefilterTree::AssignUniqueIds(NodeMap* nodes, |
169 | std::vector<std::string>* atom_vec) { |
170 | atom_vec->clear(); |
171 | |
172 | // Build vector of all filter nodes, sorted topologically |
173 | // from top to bottom in v. |
174 | std::vector<Prefilter*> v; |
175 | |
176 | // Add the top level nodes of each regexp prefilter. |
177 | for (size_t i = 0; i < prefilter_vec_.size(); i++) { |
178 | Prefilter* f = prefilter_vec_[i]; |
179 | if (f == NULL) |
180 | unfiltered_.push_back(static_cast<int>(i)); |
181 | |
182 | // We push NULL also on to v, so that we maintain the |
183 | // mapping of index==regexpid for level=0 prefilter nodes. |
184 | v.push_back(f); |
185 | } |
186 | |
187 | // Now add all the descendant nodes. |
188 | for (size_t i = 0; i < v.size(); i++) { |
189 | Prefilter* f = v[i]; |
190 | if (f == NULL) |
191 | continue; |
192 | if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) { |
193 | const std::vector<Prefilter*>& subs = *f->subs(); |
194 | for (size_t j = 0; j < subs.size(); j++) |
195 | v.push_back(subs[j]); |
196 | } |
197 | } |
198 | |
199 | // Identify unique nodes. |
200 | int unique_id = 0; |
201 | for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) { |
202 | Prefilter *node = v[i]; |
203 | if (node == NULL) |
204 | continue; |
205 | node->set_unique_id(-1); |
206 | Prefilter* canonical = CanonicalNode(nodes, node); |
207 | if (canonical == NULL) { |
208 | // Any further nodes that have the same node string |
209 | // will find this node as the canonical node. |
210 | nodes->emplace(NodeString(node), node); |
211 | if (node->op() == Prefilter::ATOM) { |
212 | atom_vec->push_back(node->atom()); |
213 | atom_index_to_id_.push_back(unique_id); |
214 | } |
215 | node->set_unique_id(unique_id++); |
216 | } else { |
217 | node->set_unique_id(canonical->unique_id()); |
218 | } |
219 | } |
220 | entries_.resize(nodes->size()); |
221 | |
222 | // Create parent StdIntMap for the entries. |
223 | for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) { |
224 | Prefilter* prefilter = v[i]; |
225 | if (prefilter == NULL) |
226 | continue; |
227 | |
228 | if (CanonicalNode(nodes, prefilter) != prefilter) |
229 | continue; |
230 | |
231 | Entry* entry = &entries_[prefilter->unique_id()]; |
232 | entry->parents = new StdIntMap(); |
233 | } |
234 | |
235 | // Fill the entries. |
236 | for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) { |
237 | Prefilter* prefilter = v[i]; |
238 | if (prefilter == NULL) |
239 | continue; |
240 | |
241 | if (CanonicalNode(nodes, prefilter) != prefilter) |
242 | continue; |
243 | |
244 | Entry* entry = &entries_[prefilter->unique_id()]; |
245 | |
246 | switch (prefilter->op()) { |
247 | default: |
248 | case Prefilter::ALL: |
249 | LOG(DFATAL) << "Unexpected op: " << prefilter->op(); |
250 | return; |
251 | |
252 | case Prefilter::ATOM: |
253 | entry->propagate_up_at_count = 1; |
254 | break; |
255 | |
256 | case Prefilter::OR: |
257 | case Prefilter::AND: { |
258 | std::set<int> uniq_child; |
259 | for (size_t j = 0; j < prefilter->subs()->size(); j++) { |
260 | Prefilter* child = (*prefilter->subs())[j]; |
261 | Prefilter* canonical = CanonicalNode(nodes, child); |
262 | if (canonical == NULL) { |
263 | LOG(DFATAL) << "Null canonical node" ; |
264 | return; |
265 | } |
266 | int child_id = canonical->unique_id(); |
267 | uniq_child.insert(child_id); |
268 | // To the child, we want to add to parent indices. |
269 | Entry* child_entry = &entries_[child_id]; |
270 | if (child_entry->parents->find(prefilter->unique_id()) == |
271 | child_entry->parents->end()) { |
272 | (*child_entry->parents)[prefilter->unique_id()] = 1; |
273 | } |
274 | } |
275 | entry->propagate_up_at_count = prefilter->op() == Prefilter::AND |
276 | ? static_cast<int>(uniq_child.size()) |
277 | : 1; |
278 | |
279 | break; |
280 | } |
281 | } |
282 | } |
283 | |
284 | // For top level nodes, populate regexp id. |
285 | for (size_t i = 0; i < prefilter_vec_.size(); i++) { |
286 | if (prefilter_vec_[i] == NULL) |
287 | continue; |
288 | int id = CanonicalNode(nodes, prefilter_vec_[i])->unique_id(); |
289 | DCHECK_LE(0, id); |
290 | Entry* entry = &entries_[id]; |
291 | entry->regexps.push_back(static_cast<int>(i)); |
292 | } |
293 | } |
294 | |
295 | // Functions for triggering during search. |
296 | void PrefilterTree::RegexpsGivenStrings( |
297 | const std::vector<int>& matched_atoms, |
298 | std::vector<int>* regexps) const { |
299 | regexps->clear(); |
300 | if (!compiled_) { |
301 | // Some legacy users of PrefilterTree call Compile() before |
302 | // adding any regexps and expect Compile() to have no effect. |
303 | // This kludge is a counterpart to that kludge. |
304 | if (prefilter_vec_.empty()) |
305 | return; |
306 | |
307 | LOG(ERROR) << "RegexpsGivenStrings called before Compile." ; |
308 | for (size_t i = 0; i < prefilter_vec_.size(); i++) |
309 | regexps->push_back(static_cast<int>(i)); |
310 | } else { |
311 | IntMap regexps_map(static_cast<int>(prefilter_vec_.size())); |
312 | std::vector<int> matched_atom_ids; |
313 | for (size_t j = 0; j < matched_atoms.size(); j++) |
314 | matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]); |
315 | PropagateMatch(matched_atom_ids, ®exps_map); |
316 | for (IntMap::iterator it = regexps_map.begin(); |
317 | it != regexps_map.end(); |
318 | ++it) |
319 | regexps->push_back(it->index()); |
320 | |
321 | regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end()); |
322 | } |
323 | std::sort(regexps->begin(), regexps->end()); |
324 | } |
325 | |
326 | void PrefilterTree::PropagateMatch(const std::vector<int>& atom_ids, |
327 | IntMap* regexps) const { |
328 | IntMap count(static_cast<int>(entries_.size())); |
329 | IntMap work(static_cast<int>(entries_.size())); |
330 | for (size_t i = 0; i < atom_ids.size(); i++) |
331 | work.set(atom_ids[i], 1); |
332 | for (IntMap::iterator it = work.begin(); it != work.end(); ++it) { |
333 | const Entry& entry = entries_[it->index()]; |
334 | // Record regexps triggered. |
335 | for (size_t i = 0; i < entry.regexps.size(); i++) |
336 | regexps->set(entry.regexps[i], 1); |
337 | int c; |
338 | // Pass trigger up to parents. |
339 | for (StdIntMap::iterator it = entry.parents->begin(); |
340 | it != entry.parents->end(); |
341 | ++it) { |
342 | int j = it->first; |
343 | const Entry& parent = entries_[j]; |
344 | // Delay until all the children have succeeded. |
345 | if (parent.propagate_up_at_count > 1) { |
346 | if (count.has_index(j)) { |
347 | c = count.get_existing(j) + 1; |
348 | count.set_existing(j, c); |
349 | } else { |
350 | c = 1; |
351 | count.set_new(j, c); |
352 | } |
353 | if (c < parent.propagate_up_at_count) |
354 | continue; |
355 | } |
356 | // Trigger the parent. |
357 | work.set(j, 1); |
358 | } |
359 | } |
360 | } |
361 | |
362 | // Debugging help. |
363 | void PrefilterTree::PrintPrefilter(int regexpid) { |
364 | LOG(ERROR) << DebugNodeString(prefilter_vec_[regexpid]); |
365 | } |
366 | |
367 | void PrefilterTree::PrintDebugInfo(NodeMap* nodes) { |
368 | LOG(ERROR) << "#Unique Atoms: " << atom_index_to_id_.size(); |
369 | LOG(ERROR) << "#Unique Nodes: " << entries_.size(); |
370 | |
371 | for (size_t i = 0; i < entries_.size(); i++) { |
372 | StdIntMap* parents = entries_[i].parents; |
373 | const std::vector<int>& regexps = entries_[i].regexps; |
374 | LOG(ERROR) << "EntryId: " << i |
375 | << " N: " << parents->size() << " R: " << regexps.size(); |
376 | for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it) |
377 | LOG(ERROR) << it->first; |
378 | } |
379 | LOG(ERROR) << "Map:" ; |
380 | for (std::map<std::string, Prefilter*>::const_iterator iter = nodes->begin(); |
381 | iter != nodes->end(); ++iter) |
382 | LOG(ERROR) << "NodeId: " << (*iter).second->unique_id() |
383 | << " Str: " << (*iter).first; |
384 | } |
385 | |
386 | std::string PrefilterTree::DebugNodeString(Prefilter* node) const { |
387 | std::string node_string = "" ; |
388 | if (node->op() == Prefilter::ATOM) { |
389 | DCHECK(!node->atom().empty()); |
390 | node_string += node->atom(); |
391 | } else { |
392 | // Adding the operation disambiguates AND and OR nodes. |
393 | node_string += node->op() == Prefilter::AND ? "AND" : "OR" ; |
394 | node_string += "(" ; |
395 | for (size_t i = 0; i < node->subs()->size(); i++) { |
396 | if (i > 0) |
397 | node_string += ','; |
398 | node_string += StringPrintf("%d" , (*node->subs())[i]->unique_id()); |
399 | node_string += ":" ; |
400 | node_string += DebugNodeString((*node->subs())[i]); |
401 | } |
402 | node_string += ")" ; |
403 | } |
404 | return node_string; |
405 | } |
406 | |
407 | } // namespace re2 |
408 | |