1 | /* |
2 | * Copyright (c) 2015-2018, Intel Corporation |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are met: |
6 | * |
7 | * * Redistributions of source code must retain the above copyright notice, |
8 | * this list of conditions and the following disclaimer. |
9 | * * Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * * Neither the name of Intel Corporation nor the names of its contributors |
13 | * may be used to endorse or promote products derived from this software |
14 | * without specific prior written permission. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | /** \file |
30 | * \brief Splits an NFA graph into its connected components. |
31 | * |
32 | * This pass takes a NGHolder and splits its graph into a set of connected |
33 | * components, returning them as individual NGHolder graphs. For example, the |
34 | * graph for the regex /foo.*bar|[a-z]{7,13}|hatstand|teakettle$/ will be split |
35 | * into four NGHolders, representing these four components: |
36 | * |
37 | * - /foo.*bar/ |
38 | * - /[a-z]{7,13}/ |
39 | * - /hatstand/ |
40 | * - /teakettle$/ |
41 | * |
42 | * The pass operates by creating an undirected graph from the input graph, and |
43 | * then using the BGL's connected_components algorithm to do the work, cloning |
44 | * the identified components into their own graphs. A "shell" of vertices |
45 | * is identified and removed first from the head and tail of the graph, in |
46 | * order to handle cases where there is a common head/tail region. |
47 | * |
48 | * Trivial cases, such as an alternation of single vertices like /a|b|c|d|e|f/, |
49 | * are not split, as later optimisations will handle these cases efficiently. |
50 | */ |
51 | #include "ng_calc_components.h" |
52 | |
53 | #include "ng_depth.h" |
54 | #include "ng_holder.h" |
55 | #include "ng_prune.h" |
56 | #include "ng_util.h" |
57 | #include "grey.h" |
58 | #include "ue2common.h" |
59 | #include "util/graph_range.h" |
60 | #include "util/graph_undirected.h" |
61 | #include "util/make_unique.h" |
62 | |
63 | #include <map> |
64 | #include <vector> |
65 | |
66 | #include <boost/graph/connected_components.hpp> |
67 | #include <boost/graph/filtered_graph.hpp> |
68 | |
69 | using namespace std; |
70 | |
71 | namespace ue2 { |
72 | |
73 | static constexpr u32 MAX_HEAD_SHELL_DEPTH = 3; |
74 | static constexpr u32 MAX_TAIL_SHELL_DEPTH = 3; |
75 | |
76 | /** |
77 | * \brief Returns true if the whole graph is just an alternation of character |
78 | * classes. |
79 | */ |
80 | bool isAlternationOfClasses(const NGHolder &g) { |
81 | for (auto v : vertices_range(g)) { |
82 | if (is_special(v, g)) { |
83 | continue; |
84 | } |
85 | // Vertex must have in edges from starts only. |
86 | for (auto u : inv_adjacent_vertices_range(v, g)) { |
87 | if (!is_any_start(u, g)) { |
88 | return false; |
89 | } |
90 | } |
91 | // Vertex must have out edges to accepts only. |
92 | for (auto w : adjacent_vertices_range(v, g)) { |
93 | if (!is_any_accept(w, g)) { |
94 | return false; |
95 | } |
96 | } |
97 | } |
98 | |
99 | DEBUG_PRINTF("alternation of single states, treating as one comp\n" ); |
100 | return true; |
101 | } |
102 | |
103 | /** |
104 | * \brief Compute initial max distance to v from start (i.e. ignoring its own |
105 | * self-loop). |
106 | */ |
107 | static |
108 | depth max_dist_from_start(const NGHolder &g, |
109 | const vector<NFAVertexBidiDepth> &depths, |
110 | NFAVertex v) { |
111 | depth max_depth(0); |
112 | for (const auto u : inv_adjacent_vertices_range(v, g)) { |
113 | if (u == v) { |
114 | continue; |
115 | } |
116 | const auto &d = depths.at(g[u].index); |
117 | if (d.fromStart.max.is_reachable()) { |
118 | max_depth = max(max_depth, d.fromStart.max); |
119 | } |
120 | if (d.fromStartDotStar.max.is_reachable()) { |
121 | max_depth = max(max_depth, d.fromStartDotStar.max); |
122 | } |
123 | } |
124 | return max_depth + 1; |
125 | } |
126 | |
127 | /** |
128 | * \brief Compute initial max depth from v from accept (i.e. ignoring its own |
129 | * self-loop). |
130 | */ |
131 | static |
132 | depth max_dist_to_accept(const NGHolder &g, |
133 | const vector<NFAVertexBidiDepth> &depths, |
134 | NFAVertex v) { |
135 | depth max_depth(0); |
136 | for (const auto w : adjacent_vertices_range(v, g)) { |
137 | if (w == v) { |
138 | continue; |
139 | } |
140 | const auto &d = depths.at(g[w].index); |
141 | if (d.toAccept.max.is_reachable()) { |
142 | max_depth = max(max_depth, d.toAccept.max); |
143 | } |
144 | if (d.toAcceptEod.max.is_reachable()) { |
145 | max_depth = max(max_depth, d.toAcceptEod.max); |
146 | } |
147 | } |
148 | return max_depth + 1; |
149 | } |
150 | |
151 | static |
152 | flat_set<NFAVertex> findHeadShell(const NGHolder &g, |
153 | const vector<NFAVertexBidiDepth> &depths, |
154 | const depth &max_dist) { |
155 | flat_set<NFAVertex> shell; |
156 | |
157 | for (auto v : vertices_range(g)) { |
158 | if (is_special(v, g)) { |
159 | continue; |
160 | } |
161 | if (max_dist_from_start(g, depths, v) <= max_dist) { |
162 | shell.insert(v); |
163 | } |
164 | } |
165 | |
166 | for (UNUSED auto v : shell) { |
167 | DEBUG_PRINTF("shell: %zu\n" , g[v].index); |
168 | } |
169 | |
170 | return shell; |
171 | } |
172 | |
173 | static |
174 | flat_set<NFAVertex> findTailShell(const NGHolder &g, |
175 | const vector<NFAVertexBidiDepth> &depths, |
176 | const depth &max_dist) { |
177 | flat_set<NFAVertex> shell; |
178 | |
179 | for (auto v : vertices_range(g)) { |
180 | if (is_special(v, g)) { |
181 | continue; |
182 | } |
183 | if (max_dist_to_accept(g, depths, v) <= max_dist) { |
184 | shell.insert(v); |
185 | } |
186 | } |
187 | |
188 | for (UNUSED auto v : shell) { |
189 | DEBUG_PRINTF("shell: %zu\n" , g[v].index); |
190 | } |
191 | |
192 | return shell; |
193 | } |
194 | |
195 | static |
196 | vector<NFAEdge> findShellEdges(const NGHolder &g, |
197 | const flat_set<NFAVertex> &head_shell, |
198 | const flat_set<NFAVertex> &tail_shell) { |
199 | vector<NFAEdge> shell_edges; |
200 | |
201 | for (const auto &e : edges_range(g)) { |
202 | auto u = source(e, g); |
203 | auto v = target(e, g); |
204 | |
205 | if (v == g.startDs && is_any_start(u, g)) { |
206 | continue; |
207 | } |
208 | if (u == g.accept && v == g.acceptEod) { |
209 | continue; |
210 | } |
211 | |
212 | if ((is_special(u, g) || contains(head_shell, u)) && |
213 | (is_special(v, g) || contains(tail_shell, v))) { |
214 | DEBUG_PRINTF("edge (%zu,%zu) is a shell edge\n" , g[u].index, |
215 | g[v].index); |
216 | shell_edges.push_back(e); |
217 | } |
218 | } |
219 | |
220 | return shell_edges; |
221 | } |
222 | |
223 | template<typename GetAdjRange> |
224 | bool shellHasOnePath(const NGHolder &g, const flat_set<NFAVertex> &shell, |
225 | GetAdjRange adj_range_func) { |
226 | if (shell.empty()) { |
227 | DEBUG_PRINTF("no shell\n" ); |
228 | return false; |
229 | } |
230 | |
231 | NFAVertex exit_vertex = NGHolder::null_vertex(); |
232 | for (auto u : shell) { |
233 | for (auto v : adj_range_func(u, g)) { |
234 | if (contains(shell, v)) { |
235 | continue; |
236 | } |
237 | if (!exit_vertex) { |
238 | exit_vertex = v; |
239 | continue; |
240 | } |
241 | if (exit_vertex == v) { |
242 | continue; |
243 | } |
244 | return false; |
245 | } |
246 | } |
247 | |
248 | return true; |
249 | } |
250 | |
251 | /** |
252 | * True if all edges out of vertices in the head shell lead to at most a single |
253 | * outside vertex, or the inverse for the tail shell. |
254 | */ |
255 | static |
256 | bool shellHasOnePath(const NGHolder &g, const flat_set<NFAVertex> &head_shell, |
257 | const flat_set<NFAVertex> &tail_shell) { |
258 | if (shellHasOnePath(g, head_shell, adjacent_vertices_range<NGHolder>)) { |
259 | DEBUG_PRINTF("head shell has only one path through it\n" ); |
260 | return true; |
261 | } |
262 | if (shellHasOnePath(g, tail_shell, inv_adjacent_vertices_range<NGHolder>)) { |
263 | DEBUG_PRINTF("tail shell has only one path into it\n" ); |
264 | return true; |
265 | } |
266 | return false; |
267 | } |
268 | |
269 | /** |
270 | * Common code called by calc- and recalc- below. Splits the given holder into |
271 | * one or more connected components, adding them to the comps deque. |
272 | */ |
273 | static |
274 | void splitIntoComponents(unique_ptr<NGHolder> g, |
275 | deque<unique_ptr<NGHolder>> &comps, |
276 | const depth &max_head_depth, |
277 | const depth &max_tail_depth, bool *shell_comp) { |
278 | DEBUG_PRINTF("graph has %zu vertices\n" , num_vertices(*g)); |
279 | |
280 | assert(shell_comp); |
281 | *shell_comp = false; |
282 | |
283 | // Compute "shell" head and tail subgraphs. |
284 | auto depths = calcBidiDepths(*g); |
285 | auto head_shell = findHeadShell(*g, depths, max_head_depth); |
286 | auto tail_shell = findTailShell(*g, depths, max_tail_depth); |
287 | for (auto v : head_shell) { |
288 | tail_shell.erase(v); |
289 | } |
290 | |
291 | if (head_shell.size() + tail_shell.size() + N_SPECIALS >= |
292 | num_vertices(*g)) { |
293 | DEBUG_PRINTF("all in shell component\n" ); |
294 | comps.push_back(std::move(g)); |
295 | *shell_comp = true; |
296 | return; |
297 | } |
298 | |
299 | // Find edges connecting the head and tail shells directly. |
300 | vector<NFAEdge> shell_edges = findShellEdges(*g, head_shell, tail_shell); |
301 | |
302 | DEBUG_PRINTF("%zu vertices in head, %zu in tail, %zu shell edges\n" , |
303 | head_shell.size(), tail_shell.size(), shell_edges.size()); |
304 | |
305 | // If there are no shell edges and only one path out of the head shell or |
306 | // into the tail shell, we aren't going to find more than one component. |
307 | if (shell_edges.empty() && shellHasOnePath(*g, head_shell, tail_shell)) { |
308 | DEBUG_PRINTF("single component\n" ); |
309 | comps.push_back(std::move(g)); |
310 | return; |
311 | } |
312 | |
313 | auto ug = make_undirected_graph(*g); |
314 | |
315 | // Filter specials and shell vertices from undirected graph. |
316 | unordered_set<NFAVertex> bad_vertices( |
317 | {g->start, g->startDs, g->accept, g->acceptEod}); |
318 | bad_vertices.insert(head_shell.begin(), head_shell.end()); |
319 | bad_vertices.insert(tail_shell.begin(), tail_shell.end()); |
320 | |
321 | auto filtered_ug = boost::make_filtered_graph( |
322 | ug, boost::keep_all(), make_bad_vertex_filter(&bad_vertices)); |
323 | |
324 | // Actually run the connected components algorithm. |
325 | map<NFAVertex, u32> split_components; |
326 | const u32 num = connected_components( |
327 | filtered_ug, boost::make_assoc_property_map(split_components)); |
328 | |
329 | assert(num > 0); |
330 | if (num == 1 && shell_edges.empty()) { |
331 | DEBUG_PRINTF("single component\n" ); |
332 | comps.push_back(std::move(g)); |
333 | return; |
334 | } |
335 | |
336 | DEBUG_PRINTF("broke graph into %u components\n" , num); |
337 | |
338 | vector<deque<NFAVertex>> verts(num); |
339 | |
340 | // Collect vertex lists per component. |
341 | for (const auto &m : split_components) { |
342 | NFAVertex v = m.first; |
343 | u32 c = m.second; |
344 | verts[c].push_back(v); |
345 | DEBUG_PRINTF("vertex %zu is in comp %u\n" , (*g)[v].index, c); |
346 | } |
347 | |
348 | unordered_map<NFAVertex, NFAVertex> v_map; // temp map for fillHolder |
349 | for (auto &vv : verts) { |
350 | // Shells are in every component. |
351 | vv.insert(vv.end(), begin(head_shell), end(head_shell)); |
352 | vv.insert(vv.end(), begin(tail_shell), end(tail_shell)); |
353 | |
354 | /* Sort for determinism. Still required as NFAUndirectedVertex have |
355 | * no deterministic ordering (split_components map). */ |
356 | sort(begin(vv), end(vv)); |
357 | |
358 | auto gc = ue2::make_unique<NGHolder>(); |
359 | v_map.clear(); |
360 | fillHolder(gc.get(), *g, vv, &v_map); |
361 | |
362 | // Remove shell edges, which will get their own component. |
363 | for (const auto &e : shell_edges) { |
364 | auto cu = v_map.at(source(e, *g)); |
365 | auto cv = v_map.at(target(e, *g)); |
366 | assert(edge(cu, cv, *gc).second); |
367 | remove_edge(cu, cv, *gc); |
368 | } |
369 | |
370 | pruneUseless(*gc); |
371 | DEBUG_PRINTF("component %zu has %zu vertices\n" , comps.size(), |
372 | num_vertices(*gc)); |
373 | comps.push_back(move(gc)); |
374 | } |
375 | |
376 | // Another component to handle the direct shell-to-shell edges. |
377 | if (!shell_edges.empty()) { |
378 | deque<NFAVertex> vv; |
379 | vv.insert(vv.end(), begin(head_shell), end(head_shell)); |
380 | vv.insert(vv.end(), begin(tail_shell), end(tail_shell)); |
381 | |
382 | auto gc = ue2::make_unique<NGHolder>(); |
383 | v_map.clear(); |
384 | fillHolder(gc.get(), *g, vv, &v_map); |
385 | |
386 | pruneUseless(*gc); |
387 | DEBUG_PRINTF("shell edge component %zu has %zu vertices\n" , |
388 | comps.size(), num_vertices(*gc)); |
389 | comps.push_back(move(gc)); |
390 | *shell_comp = true; |
391 | } |
392 | |
393 | // Ensure that only vertices with accept edges have reports. |
394 | for (auto &gc : comps) { |
395 | assert(gc); |
396 | clearReports(*gc); |
397 | } |
398 | |
399 | // We should never produce empty component graphs. |
400 | assert(all_of(begin(comps), end(comps), |
401 | [](const unique_ptr<NGHolder> &g_comp) { |
402 | return num_vertices(*g_comp) > N_SPECIALS; |
403 | })); |
404 | } |
405 | |
406 | deque<unique_ptr<NGHolder>> calcComponents(unique_ptr<NGHolder> g, |
407 | const Grey &grey) { |
408 | deque<unique_ptr<NGHolder>> comps; |
409 | |
410 | // For trivial cases, we needn't bother running the full |
411 | // connected_components algorithm. |
412 | if (!grey.calcComponents || isAlternationOfClasses(*g)) { |
413 | comps.push_back(std::move(g)); |
414 | return comps; |
415 | } |
416 | |
417 | bool shell_comp = false; |
418 | splitIntoComponents(std::move(g), comps, depth(MAX_HEAD_SHELL_DEPTH), |
419 | depth(MAX_TAIL_SHELL_DEPTH), &shell_comp); |
420 | |
421 | if (shell_comp) { |
422 | DEBUG_PRINTF("re-running on shell comp\n" ); |
423 | assert(!comps.empty()); |
424 | auto sc = std::move(comps.back()); |
425 | comps.pop_back(); |
426 | splitIntoComponents(std::move(sc), comps, depth(0), depth(0), |
427 | &shell_comp); |
428 | } |
429 | |
430 | DEBUG_PRINTF("finished; split into %zu components\n" , comps.size()); |
431 | return comps; |
432 | } |
433 | |
434 | void recalcComponents(deque<unique_ptr<NGHolder>> &comps, const Grey &grey) { |
435 | if (!grey.calcComponents) { |
436 | return; |
437 | } |
438 | |
439 | deque<unique_ptr<NGHolder>> out; |
440 | |
441 | for (auto &gc : comps) { |
442 | if (!gc) { |
443 | continue; // graph has been consumed already. |
444 | } |
445 | |
446 | if (isAlternationOfClasses(*gc)) { |
447 | out.push_back(std::move(gc)); |
448 | continue; |
449 | } |
450 | |
451 | auto gc_comps = calcComponents(std::move(gc), grey); |
452 | out.insert(end(out), std::make_move_iterator(begin(gc_comps)), |
453 | std::make_move_iterator(end(gc_comps))); |
454 | } |
455 | |
456 | // Replace comps with our recalculated list. |
457 | comps.swap(out); |
458 | } |
459 | |
460 | } // namespace ue2 |
461 | |