1/*
2 * Copyright (c) 2015-2018, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** \file
30 * \brief Splits an NFA graph into its connected components.
31 *
32 * This pass takes a NGHolder and splits its graph into a set of connected
33 * components, returning them as individual NGHolder graphs. For example, the
34 * graph for the regex /foo.*bar|[a-z]{7,13}|hatstand|teakettle$/ will be split
35 * into four NGHolders, representing these four components:
36 *
37 * - /foo.*bar/
38 * - /[a-z]{7,13}/
39 * - /hatstand/
40 * - /teakettle$/
41 *
42 * The pass operates by creating an undirected graph from the input graph, and
43 * then using the BGL's connected_components algorithm to do the work, cloning
44 * the identified components into their own graphs. A "shell" of vertices
45 * is identified and removed first from the head and tail of the graph, in
46 * order to handle cases where there is a common head/tail region.
47 *
48 * Trivial cases, such as an alternation of single vertices like /a|b|c|d|e|f/,
49 * are not split, as later optimisations will handle these cases efficiently.
50 */
51#include "ng_calc_components.h"
52
53#include "ng_depth.h"
54#include "ng_holder.h"
55#include "ng_prune.h"
56#include "ng_util.h"
57#include "grey.h"
58#include "ue2common.h"
59#include "util/graph_range.h"
60#include "util/graph_undirected.h"
61#include "util/make_unique.h"
62
63#include <map>
64#include <vector>
65
66#include <boost/graph/connected_components.hpp>
67#include <boost/graph/filtered_graph.hpp>
68
69using namespace std;
70
71namespace ue2 {
72
73static constexpr u32 MAX_HEAD_SHELL_DEPTH = 3;
74static constexpr u32 MAX_TAIL_SHELL_DEPTH = 3;
75
76/**
77 * \brief Returns true if the whole graph is just an alternation of character
78 * classes.
79 */
80bool isAlternationOfClasses(const NGHolder &g) {
81 for (auto v : vertices_range(g)) {
82 if (is_special(v, g)) {
83 continue;
84 }
85 // Vertex must have in edges from starts only.
86 for (auto u : inv_adjacent_vertices_range(v, g)) {
87 if (!is_any_start(u, g)) {
88 return false;
89 }
90 }
91 // Vertex must have out edges to accepts only.
92 for (auto w : adjacent_vertices_range(v, g)) {
93 if (!is_any_accept(w, g)) {
94 return false;
95 }
96 }
97 }
98
99 DEBUG_PRINTF("alternation of single states, treating as one comp\n");
100 return true;
101}
102
103/**
104 * \brief Compute initial max distance to v from start (i.e. ignoring its own
105 * self-loop).
106 */
107static
108depth max_dist_from_start(const NGHolder &g,
109 const vector<NFAVertexBidiDepth> &depths,
110 NFAVertex v) {
111 depth max_depth(0);
112 for (const auto u : inv_adjacent_vertices_range(v, g)) {
113 if (u == v) {
114 continue;
115 }
116 const auto &d = depths.at(g[u].index);
117 if (d.fromStart.max.is_reachable()) {
118 max_depth = max(max_depth, d.fromStart.max);
119 }
120 if (d.fromStartDotStar.max.is_reachable()) {
121 max_depth = max(max_depth, d.fromStartDotStar.max);
122 }
123 }
124 return max_depth + 1;
125}
126
127/**
128 * \brief Compute initial max depth from v from accept (i.e. ignoring its own
129 * self-loop).
130 */
131static
132depth max_dist_to_accept(const NGHolder &g,
133 const vector<NFAVertexBidiDepth> &depths,
134 NFAVertex v) {
135 depth max_depth(0);
136 for (const auto w : adjacent_vertices_range(v, g)) {
137 if (w == v) {
138 continue;
139 }
140 const auto &d = depths.at(g[w].index);
141 if (d.toAccept.max.is_reachable()) {
142 max_depth = max(max_depth, d.toAccept.max);
143 }
144 if (d.toAcceptEod.max.is_reachable()) {
145 max_depth = max(max_depth, d.toAcceptEod.max);
146 }
147 }
148 return max_depth + 1;
149}
150
151static
152flat_set<NFAVertex> findHeadShell(const NGHolder &g,
153 const vector<NFAVertexBidiDepth> &depths,
154 const depth &max_dist) {
155 flat_set<NFAVertex> shell;
156
157 for (auto v : vertices_range(g)) {
158 if (is_special(v, g)) {
159 continue;
160 }
161 if (max_dist_from_start(g, depths, v) <= max_dist) {
162 shell.insert(v);
163 }
164 }
165
166 for (UNUSED auto v : shell) {
167 DEBUG_PRINTF("shell: %zu\n", g[v].index);
168 }
169
170 return shell;
171}
172
173static
174flat_set<NFAVertex> findTailShell(const NGHolder &g,
175 const vector<NFAVertexBidiDepth> &depths,
176 const depth &max_dist) {
177 flat_set<NFAVertex> shell;
178
179 for (auto v : vertices_range(g)) {
180 if (is_special(v, g)) {
181 continue;
182 }
183 if (max_dist_to_accept(g, depths, v) <= max_dist) {
184 shell.insert(v);
185 }
186 }
187
188 for (UNUSED auto v : shell) {
189 DEBUG_PRINTF("shell: %zu\n", g[v].index);
190 }
191
192 return shell;
193}
194
195static
196vector<NFAEdge> findShellEdges(const NGHolder &g,
197 const flat_set<NFAVertex> &head_shell,
198 const flat_set<NFAVertex> &tail_shell) {
199 vector<NFAEdge> shell_edges;
200
201 for (const auto &e : edges_range(g)) {
202 auto u = source(e, g);
203 auto v = target(e, g);
204
205 if (v == g.startDs && is_any_start(u, g)) {
206 continue;
207 }
208 if (u == g.accept && v == g.acceptEod) {
209 continue;
210 }
211
212 if ((is_special(u, g) || contains(head_shell, u)) &&
213 (is_special(v, g) || contains(tail_shell, v))) {
214 DEBUG_PRINTF("edge (%zu,%zu) is a shell edge\n", g[u].index,
215 g[v].index);
216 shell_edges.push_back(e);
217 }
218 }
219
220 return shell_edges;
221}
222
223template<typename GetAdjRange>
224bool shellHasOnePath(const NGHolder &g, const flat_set<NFAVertex> &shell,
225 GetAdjRange adj_range_func) {
226 if (shell.empty()) {
227 DEBUG_PRINTF("no shell\n");
228 return false;
229 }
230
231 NFAVertex exit_vertex = NGHolder::null_vertex();
232 for (auto u : shell) {
233 for (auto v : adj_range_func(u, g)) {
234 if (contains(shell, v)) {
235 continue;
236 }
237 if (!exit_vertex) {
238 exit_vertex = v;
239 continue;
240 }
241 if (exit_vertex == v) {
242 continue;
243 }
244 return false;
245 }
246 }
247
248 return true;
249}
250
251/**
252 * True if all edges out of vertices in the head shell lead to at most a single
253 * outside vertex, or the inverse for the tail shell.
254 */
255static
256bool shellHasOnePath(const NGHolder &g, const flat_set<NFAVertex> &head_shell,
257 const flat_set<NFAVertex> &tail_shell) {
258 if (shellHasOnePath(g, head_shell, adjacent_vertices_range<NGHolder>)) {
259 DEBUG_PRINTF("head shell has only one path through it\n");
260 return true;
261 }
262 if (shellHasOnePath(g, tail_shell, inv_adjacent_vertices_range<NGHolder>)) {
263 DEBUG_PRINTF("tail shell has only one path into it\n");
264 return true;
265 }
266 return false;
267}
268
269/**
270 * Common code called by calc- and recalc- below. Splits the given holder into
271 * one or more connected components, adding them to the comps deque.
272 */
273static
274void splitIntoComponents(unique_ptr<NGHolder> g,
275 deque<unique_ptr<NGHolder>> &comps,
276 const depth &max_head_depth,
277 const depth &max_tail_depth, bool *shell_comp) {
278 DEBUG_PRINTF("graph has %zu vertices\n", num_vertices(*g));
279
280 assert(shell_comp);
281 *shell_comp = false;
282
283 // Compute "shell" head and tail subgraphs.
284 auto depths = calcBidiDepths(*g);
285 auto head_shell = findHeadShell(*g, depths, max_head_depth);
286 auto tail_shell = findTailShell(*g, depths, max_tail_depth);
287 for (auto v : head_shell) {
288 tail_shell.erase(v);
289 }
290
291 if (head_shell.size() + tail_shell.size() + N_SPECIALS >=
292 num_vertices(*g)) {
293 DEBUG_PRINTF("all in shell component\n");
294 comps.push_back(std::move(g));
295 *shell_comp = true;
296 return;
297 }
298
299 // Find edges connecting the head and tail shells directly.
300 vector<NFAEdge> shell_edges = findShellEdges(*g, head_shell, tail_shell);
301
302 DEBUG_PRINTF("%zu vertices in head, %zu in tail, %zu shell edges\n",
303 head_shell.size(), tail_shell.size(), shell_edges.size());
304
305 // If there are no shell edges and only one path out of the head shell or
306 // into the tail shell, we aren't going to find more than one component.
307 if (shell_edges.empty() && shellHasOnePath(*g, head_shell, tail_shell)) {
308 DEBUG_PRINTF("single component\n");
309 comps.push_back(std::move(g));
310 return;
311 }
312
313 auto ug = make_undirected_graph(*g);
314
315 // Filter specials and shell vertices from undirected graph.
316 unordered_set<NFAVertex> bad_vertices(
317 {g->start, g->startDs, g->accept, g->acceptEod});
318 bad_vertices.insert(head_shell.begin(), head_shell.end());
319 bad_vertices.insert(tail_shell.begin(), tail_shell.end());
320
321 auto filtered_ug = boost::make_filtered_graph(
322 ug, boost::keep_all(), make_bad_vertex_filter(&bad_vertices));
323
324 // Actually run the connected components algorithm.
325 map<NFAVertex, u32> split_components;
326 const u32 num = connected_components(
327 filtered_ug, boost::make_assoc_property_map(split_components));
328
329 assert(num > 0);
330 if (num == 1 && shell_edges.empty()) {
331 DEBUG_PRINTF("single component\n");
332 comps.push_back(std::move(g));
333 return;
334 }
335
336 DEBUG_PRINTF("broke graph into %u components\n", num);
337
338 vector<deque<NFAVertex>> verts(num);
339
340 // Collect vertex lists per component.
341 for (const auto &m : split_components) {
342 NFAVertex v = m.first;
343 u32 c = m.second;
344 verts[c].push_back(v);
345 DEBUG_PRINTF("vertex %zu is in comp %u\n", (*g)[v].index, c);
346 }
347
348 unordered_map<NFAVertex, NFAVertex> v_map; // temp map for fillHolder
349 for (auto &vv : verts) {
350 // Shells are in every component.
351 vv.insert(vv.end(), begin(head_shell), end(head_shell));
352 vv.insert(vv.end(), begin(tail_shell), end(tail_shell));
353
354 /* Sort for determinism. Still required as NFAUndirectedVertex have
355 * no deterministic ordering (split_components map). */
356 sort(begin(vv), end(vv));
357
358 auto gc = ue2::make_unique<NGHolder>();
359 v_map.clear();
360 fillHolder(gc.get(), *g, vv, &v_map);
361
362 // Remove shell edges, which will get their own component.
363 for (const auto &e : shell_edges) {
364 auto cu = v_map.at(source(e, *g));
365 auto cv = v_map.at(target(e, *g));
366 assert(edge(cu, cv, *gc).second);
367 remove_edge(cu, cv, *gc);
368 }
369
370 pruneUseless(*gc);
371 DEBUG_PRINTF("component %zu has %zu vertices\n", comps.size(),
372 num_vertices(*gc));
373 comps.push_back(move(gc));
374 }
375
376 // Another component to handle the direct shell-to-shell edges.
377 if (!shell_edges.empty()) {
378 deque<NFAVertex> vv;
379 vv.insert(vv.end(), begin(head_shell), end(head_shell));
380 vv.insert(vv.end(), begin(tail_shell), end(tail_shell));
381
382 auto gc = ue2::make_unique<NGHolder>();
383 v_map.clear();
384 fillHolder(gc.get(), *g, vv, &v_map);
385
386 pruneUseless(*gc);
387 DEBUG_PRINTF("shell edge component %zu has %zu vertices\n",
388 comps.size(), num_vertices(*gc));
389 comps.push_back(move(gc));
390 *shell_comp = true;
391 }
392
393 // Ensure that only vertices with accept edges have reports.
394 for (auto &gc : comps) {
395 assert(gc);
396 clearReports(*gc);
397 }
398
399 // We should never produce empty component graphs.
400 assert(all_of(begin(comps), end(comps),
401 [](const unique_ptr<NGHolder> &g_comp) {
402 return num_vertices(*g_comp) > N_SPECIALS;
403 }));
404}
405
406deque<unique_ptr<NGHolder>> calcComponents(unique_ptr<NGHolder> g,
407 const Grey &grey) {
408 deque<unique_ptr<NGHolder>> comps;
409
410 // For trivial cases, we needn't bother running the full
411 // connected_components algorithm.
412 if (!grey.calcComponents || isAlternationOfClasses(*g)) {
413 comps.push_back(std::move(g));
414 return comps;
415 }
416
417 bool shell_comp = false;
418 splitIntoComponents(std::move(g), comps, depth(MAX_HEAD_SHELL_DEPTH),
419 depth(MAX_TAIL_SHELL_DEPTH), &shell_comp);
420
421 if (shell_comp) {
422 DEBUG_PRINTF("re-running on shell comp\n");
423 assert(!comps.empty());
424 auto sc = std::move(comps.back());
425 comps.pop_back();
426 splitIntoComponents(std::move(sc), comps, depth(0), depth(0),
427 &shell_comp);
428 }
429
430 DEBUG_PRINTF("finished; split into %zu components\n", comps.size());
431 return comps;
432}
433
434void recalcComponents(deque<unique_ptr<NGHolder>> &comps, const Grey &grey) {
435 if (!grey.calcComponents) {
436 return;
437 }
438
439 deque<unique_ptr<NGHolder>> out;
440
441 for (auto &gc : comps) {
442 if (!gc) {
443 continue; // graph has been consumed already.
444 }
445
446 if (isAlternationOfClasses(*gc)) {
447 out.push_back(std::move(gc));
448 continue;
449 }
450
451 auto gc_comps = calcComponents(std::move(gc), grey);
452 out.insert(end(out), std::make_move_iterator(begin(gc_comps)),
453 std::make_move_iterator(end(gc_comps)));
454 }
455
456 // Replace comps with our recalculated list.
457 comps.swap(out);
458}
459
460} // namespace ue2
461