1 | /* |
2 | * Copyright (c) 2016-2018, Intel Corporation |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are met: |
6 | * |
7 | * * Redistributions of source code must retain the above copyright notice, |
8 | * this list of conditions and the following disclaimer. |
9 | * * Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * * Neither the name of Intel Corporation nor the names of its contributors |
13 | * may be used to endorse or promote products derived from this software |
14 | * without specific prior written permission. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | #include "config.h" |
30 | |
31 | #include "ng_violet.h" |
32 | |
33 | #include "grey.h" |
34 | #include "ng_depth.h" |
35 | #include "ng_dominators.h" |
36 | #include "ng_dump.h" |
37 | #include "ng_equivalence.h" |
38 | #include "ng_holder.h" |
39 | #include "ng_is_equal.h" |
40 | #include "ng_literal_analysis.h" |
41 | #include "ng_limex.h" |
42 | #include "ng_mcclellan.h" |
43 | #include "ng_netflow.h" |
44 | #include "ng_prune.h" |
45 | #include "ng_redundancy.h" |
46 | #include "ng_region.h" |
47 | #include "ng_reports.h" |
48 | #include "ng_split.h" |
49 | #include "ng_util.h" |
50 | #include "ng_width.h" |
51 | #include "nfa/rdfa.h" |
52 | #include "rose/rose_build.h" |
53 | #include "rose/rose_build_util.h" |
54 | #include "rose/rose_in_dump.h" |
55 | #include "rose/rose_in_graph.h" |
56 | #include "rose/rose_in_util.h" |
57 | #include "util/compare.h" |
58 | #include "util/compile_context.h" |
59 | #include "util/container.h" |
60 | #include "util/flat_containers.h" |
61 | #include "util/graph.h" |
62 | #include "util/graph_range.h" |
63 | #include "util/graph_small_color_map.h" |
64 | #include "util/insertion_ordered.h" |
65 | #include "util/make_unique.h" |
66 | #include "util/order_check.h" |
67 | #include "util/target_info.h" |
68 | #include "util/ue2string.h" |
69 | |
70 | #include <set> |
71 | #include <utility> |
72 | #include <vector> |
73 | #include <boost/dynamic_bitset.hpp> |
74 | #include <boost/range/adaptor/map.hpp> |
75 | |
76 | #define STAGE_DEBUG_PRINTF DEBUG_PRINTF |
77 | |
78 | using namespace std; |
79 | using boost::adaptors::map_values; |
80 | |
81 | namespace ue2 { |
82 | |
83 | /* createsAnchoredLHS() is conservative as the depths take into account |
84 | * back edges that come from beyond the split point and would be missing after |
85 | * the graph is split. */ |
86 | static |
87 | bool createsAnchoredLHS(const NGHolder &g, const vector<NFAVertex> &vv, |
88 | const vector<NFAVertexDepth> &depths, |
89 | const Grey &grey, depth max_depth = depth::infinity()) { |
90 | max_depth = min(max_depth, depth(grey.maxAnchoredRegion)); |
91 | |
92 | for (auto v : vv) { |
93 | /* avoid issues of self loops blowing out depths: |
94 | * look at preds, add 1 */ |
95 | for (auto u : inv_adjacent_vertices_range(v, g)) { |
96 | if (u == v) { |
97 | continue; |
98 | } |
99 | |
100 | u32 idx = g[u].index; |
101 | assert(idx < depths.size()); |
102 | if (maxDistFromStartOfData(depths.at(idx)) >= max_depth) { |
103 | return false; |
104 | } |
105 | } |
106 | } |
107 | return true; |
108 | } |
109 | |
110 | /* createsTransientLHS() is conservative as the depths take into account |
111 | * back edges that come from beyond the split point and would be missing after |
112 | * the graph is split. */ |
113 | static |
114 | bool createsTransientLHS(const NGHolder &g, const vector<NFAVertex> &vv, |
115 | const vector<NFAVertexDepth> &depths, |
116 | const Grey &grey) { |
117 | const depth max_depth(grey.maxHistoryAvailable); |
118 | |
119 | for (auto v : vv) { |
120 | /* avoid issues of self loops blowing out depths: |
121 | * look at preds, add 1 */ |
122 | for (auto u : inv_adjacent_vertices_range(v, g)) { |
123 | if (u == v) { |
124 | continue; |
125 | } |
126 | |
127 | u32 idx = g[u].index; |
128 | assert(idx < depths.size()); |
129 | if (maxDistFromInit(depths.at(idx)) >= max_depth) { |
130 | return false; |
131 | } |
132 | } |
133 | } |
134 | return true; |
135 | } |
136 | |
137 | /** |
138 | * Counts the number of vertices that are reachable from the set of sources |
139 | * given. |
140 | */ |
141 | static |
142 | size_t count_reachable(const NGHolder &g, const vector<NFAVertex> &sources, |
143 | small_color_map<decltype(get(vertex_index, g))> &color_map) { |
144 | auto null_visitor = boost::make_dfs_visitor(boost::null_visitor()); |
145 | color_map.fill(small_color::white); |
146 | |
147 | for (auto v : sources) { |
148 | boost::depth_first_visit(g, v, null_visitor, color_map); |
149 | } |
150 | |
151 | return color_map.count(small_color::black); |
152 | } |
153 | |
154 | static |
155 | size_t shorter_than(const set<ue2_literal> &s, size_t limit) { |
156 | return count_if(s.begin(), s.end(), |
157 | [&](const ue2_literal &a) { return a.length() < limit; }); |
158 | } |
159 | |
160 | static |
161 | u32 min_len(const set<ue2_literal> &s) { |
162 | u32 rv = ~0U; |
163 | |
164 | for (const auto &lit : s) { |
165 | rv = min(rv, (u32)lit.length()); |
166 | } |
167 | |
168 | return rv; |
169 | } |
170 | |
171 | static |
172 | u32 min_period(const set<ue2_literal> &s) { |
173 | u32 rv = ~0U; |
174 | |
175 | for (const auto &lit : s) { |
176 | rv = min(rv, (u32)minStringPeriod(lit)); |
177 | } |
178 | DEBUG_PRINTF("min period %u\n" , rv); |
179 | return rv; |
180 | } |
181 | |
182 | namespace { |
183 | /** |
184 | * Information on a cut: vertices and literals. |
185 | */ |
186 | struct VertLitInfo { |
187 | VertLitInfo() {} |
188 | VertLitInfo(NFAVertex v, const set<ue2_literal> &litlit, bool c_anch, |
189 | bool c_tran = false) |
190 | : vv(vector<NFAVertex>(1, v)), lit(litlit), creates_anchored(c_anch), |
191 | creates_transient(c_tran) {} |
192 | VertLitInfo(const vector<NFAVertex> &vv_in, const set<ue2_literal> &lit_in, |
193 | bool c_anch) |
194 | : vv(vv_in), lit(lit_in), creates_anchored(c_anch) {} |
195 | vector<NFAVertex> vv; |
196 | set<ue2_literal> lit; |
197 | |
198 | bool creates_anchored = false; |
199 | bool creates_transient = false; |
200 | double split_ratio = 0; |
201 | }; |
202 | |
203 | #define LAST_CHANCE_STRONG_LEN 1 |
204 | |
205 | /** |
206 | * \brief Comparator class for comparing different literal cuts. |
207 | */ |
208 | class LitComparator { |
209 | public: |
210 | LitComparator(const NGHolder &g_in, bool sa, bool st, bool lc) |
211 | : g(g_in), seeking_anchored(sa), seeking_transient(st), |
212 | last_chance(lc) {} |
213 | bool operator()(const unique_ptr<VertLitInfo> &a, |
214 | const unique_ptr<VertLitInfo> &b) const { |
215 | assert(a && b); |
216 | |
217 | if (seeking_anchored) { |
218 | if (a->creates_anchored != b->creates_anchored) { |
219 | return a->creates_anchored < b->creates_anchored; |
220 | } |
221 | } |
222 | |
223 | if (seeking_transient) { |
224 | if (a->creates_transient != b->creates_transient) { |
225 | return a->creates_transient < b->creates_transient; |
226 | } |
227 | } |
228 | |
229 | if (last_chance |
230 | && min_len(a->lit) > LAST_CHANCE_STRONG_LEN |
231 | && min_len(b->lit) > LAST_CHANCE_STRONG_LEN) { |
232 | DEBUG_PRINTF("using split ratio %g , %g\n" , a->split_ratio, |
233 | b->split_ratio); |
234 | return a->split_ratio < b->split_ratio; |
235 | } |
236 | |
237 | u64a score_a = scoreSet(a->lit); |
238 | u64a score_b = scoreSet(b->lit); |
239 | |
240 | if (score_a != score_b) { |
241 | return score_a > score_b; |
242 | } |
243 | |
244 | /* vertices should only be in one candidate cut */ |
245 | assert(a->vv == b->vv || a->vv.front() != b->vv.front()); |
246 | return g[a->vv.front()].index > g[b->vv.front()].index; |
247 | } |
248 | |
249 | private: |
250 | const NGHolder &g; /**< graph on which cuts are found */ |
251 | |
252 | bool seeking_anchored; |
253 | bool seeking_transient; |
254 | bool last_chance; |
255 | }; |
256 | } |
257 | |
258 | #define MIN_ANCHORED_LEN 2 |
259 | #define MIN_ANCHORED_DESPERATE_LEN 1 |
260 | |
261 | /* anchored here means that the cut creates a 'usefully' anchored LHS */ |
262 | static |
263 | bool validateRoseLiteralSetQuality(const set<ue2_literal> &s, u64a score, |
264 | bool anchored, u32 min_allowed_floating_len, |
265 | bool desperation, bool last_chance) { |
266 | u32 min_allowed_len = anchored ? MIN_ANCHORED_LEN |
267 | : min_allowed_floating_len; |
268 | if (anchored && last_chance) { |
269 | min_allowed_len = MIN_ANCHORED_DESPERATE_LEN; |
270 | } |
271 | if (last_chance) { |
272 | desperation = true; |
273 | } |
274 | |
275 | DEBUG_PRINTF("validating%s set, min allowed len %u\n" , |
276 | anchored ? " anchored" : "" , min_allowed_len); |
277 | |
278 | assert(none_of(begin(s), end(s), bad_mixed_sensitivity)); |
279 | |
280 | if (score >= NO_LITERAL_AT_EDGE_SCORE) { |
281 | DEBUG_PRINTF("candidate is too bad %llu/%zu\n" , score, s.size()); |
282 | return false; |
283 | } |
284 | |
285 | assert(!s.empty()); |
286 | if (s.empty()) { |
287 | DEBUG_PRINTF("candidate is too bad/something went wrong\n" ); |
288 | return false; |
289 | } |
290 | |
291 | u32 s_min_len = min_len(s); |
292 | u32 s_min_period = min_period(s); |
293 | size_t short_count = shorter_than(s, 5); |
294 | |
295 | DEBUG_PRINTF("cand '%s': score %llu count=%zu min_len=%u min_period=%u" |
296 | " short_count=%zu desp=%d\n" , |
297 | dumpString(*s.begin()).c_str(), score, s.size(), s_min_len, |
298 | s_min_period, short_count, (int)desperation); |
299 | |
300 | bool ok = true; |
301 | |
302 | if (s.size() > 10 /* magic number is magic */ |
303 | || s_min_len < min_allowed_len |
304 | || (s_min_period <= 1 && min_allowed_len != 1)) { |
305 | DEBUG_PRINTF("candidate may be bad\n" ); |
306 | ok = false; |
307 | } |
308 | |
309 | if (!ok && desperation |
310 | && s.size() <= 20 /* more magic numbers are magical */ |
311 | && (s_min_len > 5 || (s_min_len > 2 && short_count <= 10)) |
312 | && s_min_period > 1) { |
313 | DEBUG_PRINTF("candidate is ok\n" ); |
314 | ok = true; |
315 | } |
316 | |
317 | if (!ok && desperation |
318 | && s.size() <= 50 /* more magic numbers are magical */ |
319 | && s_min_len > 10 |
320 | && s_min_period > 1) { |
321 | DEBUG_PRINTF("candidate is ok\n" ); |
322 | ok = true; |
323 | } |
324 | |
325 | if (!ok) { |
326 | DEBUG_PRINTF("candidate is too shitty\n" ); |
327 | return false; |
328 | } |
329 | |
330 | return true; |
331 | } |
332 | |
333 | static UNUSED |
334 | void dumpRoseLiteralSet(const set<ue2_literal> &s) { |
335 | for (UNUSED const auto &lit : s) { |
336 | DEBUG_PRINTF(" lit: %s\n" , dumpString(lit).c_str()); |
337 | } |
338 | } |
339 | |
340 | static |
341 | void getSimpleRoseLiterals(const NGHolder &g, bool seeking_anchored, |
342 | const vector<NFAVertexDepth> *depths, |
343 | const set<NFAVertex> &a_dom, |
344 | vector<unique_ptr<VertLitInfo>> *lits, |
345 | u32 min_allowed_len, bool desperation, |
346 | bool last_chance, const CompileContext &cc) { |
347 | assert(depths || !seeking_anchored); |
348 | |
349 | map<NFAVertex, u64a> scores; |
350 | map<NFAVertex, unique_ptr<VertLitInfo>> lit_info; |
351 | set<ue2_literal> s; |
352 | |
353 | for (auto v : a_dom) { |
354 | s = getLiteralSet(g, v, true); /* RHS will take responsibility for any |
355 | revisits to the target vertex */ |
356 | |
357 | if (s.empty()) { |
358 | DEBUG_PRINTF("candidate is too shitty\n" ); |
359 | continue; |
360 | } |
361 | |
362 | DEBUG_PRINTF("|candidate raw literal set| = %zu\n" , s.size()); |
363 | dumpRoseLiteralSet(s); |
364 | u64a score = sanitizeAndCompressAndScore(s); |
365 | |
366 | bool anchored = false; |
367 | if (seeking_anchored) { |
368 | anchored = createsAnchoredLHS(g, {v}, *depths, cc.grey); |
369 | } |
370 | |
371 | if (!validateRoseLiteralSetQuality(s, score, anchored, min_allowed_len, |
372 | desperation, last_chance)) { |
373 | continue; |
374 | } |
375 | |
376 | DEBUG_PRINTF("candidate is a candidate\n" ); |
377 | scores[v] = score; |
378 | lit_info[v] = ue2::make_unique<VertLitInfo>(v, s, anchored); |
379 | } |
380 | |
381 | /* try to filter out cases where appending some characters produces worse |
382 | * literals. Only bother to look back one byte, TODO make better */ |
383 | for (auto u : a_dom) { |
384 | if (out_degree(u, g) != 1 || !scores[u]) { |
385 | continue; |
386 | } |
387 | NFAVertex v = *adjacent_vertices(u, g).first; |
388 | if (contains(scores, v) && scores[v] >= scores[u]) { |
389 | DEBUG_PRINTF("killing off v as score %llu >= %llu\n" , |
390 | scores[v], scores[u]); |
391 | lit_info.erase(v); |
392 | } |
393 | } |
394 | |
395 | lits->reserve(lit_info.size()); |
396 | for (auto &m : lit_info) { |
397 | lits->push_back(move(m.second)); |
398 | } |
399 | DEBUG_PRINTF("%zu candidate literal sets\n" , lits->size()); |
400 | } |
401 | |
402 | static |
403 | void getRegionRoseLiterals(const NGHolder &g, bool seeking_anchored, |
404 | const vector<NFAVertexDepth> *depths, |
405 | const set<NFAVertex> &bad, |
406 | const set<NFAVertex> *allowed, |
407 | vector<unique_ptr<VertLitInfo>> *lits, |
408 | u32 min_allowed_len, bool desperation, |
409 | bool last_chance, const CompileContext &cc) { |
410 | /* This allows us to get more places to split the graph as we are not |
411 | limited to points where there is a single vertex to split at. */ |
412 | |
413 | assert(depths || !seeking_anchored); |
414 | |
415 | /* TODO: operate over 'proto-regions' which ignore back edges */ |
416 | auto regions = assignRegions(g); |
417 | |
418 | set<u32> mand, optional; |
419 | map<u32, vector<NFAVertex> > exits; |
420 | |
421 | for (auto v : vertices_range(g)) { |
422 | u32 region = regions[v]; |
423 | if (is_any_start(v, g) || region == 0) { |
424 | continue; |
425 | } |
426 | |
427 | if (is_any_accept(v, g)) { |
428 | continue; |
429 | } |
430 | |
431 | if (!generates_callbacks(g) && is_match_vertex(v, g)) { |
432 | /* we cannot leave a completely vacuous infix */ |
433 | continue; |
434 | } |
435 | |
436 | if (isRegionExit(g, v, regions)) { |
437 | exits[region].push_back(v); |
438 | } |
439 | |
440 | if (isRegionEntry(g, v, regions)) { |
441 | // Determine whether this region is mandatory or optional. We only |
442 | // need to do this check for the first entry vertex we encounter |
443 | // for this region. |
444 | if (!contains(mand, region) && !contains(optional, region)) { |
445 | if (isOptionalRegion(g, v, regions)) { |
446 | optional.insert(region); |
447 | } else { |
448 | mand.insert(region); |
449 | } |
450 | } |
451 | } |
452 | } |
453 | |
454 | for (const auto &m : exits) { |
455 | if (false) { |
456 | next_cand: |
457 | continue; |
458 | } |
459 | |
460 | const u32 region = m.first; |
461 | const vector<NFAVertex> &vv = m.second; |
462 | assert(!vv.empty()); |
463 | |
464 | if (!contains(mand, region)) { |
465 | continue; |
466 | } |
467 | |
468 | for (auto v : vv) { |
469 | /* if an exit is in bad, the region is already handled well |
470 | * by getSimpleRoseLiterals or is otherwise bad */ |
471 | if (contains(bad, v)) { |
472 | goto next_cand; |
473 | } |
474 | /* if we are only allowed to consider some vertices, v must be in |
475 | the list; */ |
476 | if (allowed && !contains(*allowed, v)) { |
477 | goto next_cand; |
478 | } |
479 | } |
480 | |
481 | /* the final region may not have a neat exit. validate that all exits |
482 | * have an edge to each accept or none do */ |
483 | bool edge_to_a = edge(vv[0], g.accept, g).second; |
484 | bool edge_to_aeod = edge(vv[0], g.acceptEod, g).second; |
485 | const auto &reports = g[vv[0]].reports; |
486 | for (auto v : vv) { |
487 | if (edge_to_a != edge(v, g.accept, g).second) { |
488 | goto next_cand; |
489 | } |
490 | |
491 | if (edge_to_aeod != edge(v, g.acceptEod, g).second) { |
492 | goto next_cand; |
493 | } |
494 | |
495 | if (g[v].reports != reports) { |
496 | goto next_cand; |
497 | } |
498 | } |
499 | |
500 | DEBUG_PRINTF("inspecting region %u\n" , region); |
501 | set<ue2_literal> s; |
502 | for (auto v : vv) { |
503 | DEBUG_PRINTF(" exit vertex: %zu\n" , g[v].index); |
504 | /* Note: RHS can not be depended on to take all subsequent revisits |
505 | * to this vertex */ |
506 | set<ue2_literal> ss = getLiteralSet(g, v, false); |
507 | if (ss.empty()) { |
508 | DEBUG_PRINTF("candidate is too shitty\n" ); |
509 | goto next_cand; |
510 | } |
511 | insert(&s, ss); |
512 | } |
513 | |
514 | assert(!s.empty()); |
515 | |
516 | DEBUG_PRINTF("|candidate raw literal set| = %zu\n" , s.size()); |
517 | dumpRoseLiteralSet(s); |
518 | u64a score = sanitizeAndCompressAndScore(s); |
519 | |
520 | DEBUG_PRINTF("|candidate literal set| = %zu\n" , s.size()); |
521 | dumpRoseLiteralSet(s); |
522 | |
523 | bool anchored = false; |
524 | if (seeking_anchored) { |
525 | anchored = createsAnchoredLHS(g, vv, *depths, cc.grey); |
526 | } |
527 | |
528 | if (!validateRoseLiteralSetQuality(s, score, anchored, min_allowed_len, |
529 | desperation, last_chance)) { |
530 | goto next_cand; |
531 | } |
532 | |
533 | DEBUG_PRINTF("candidate is a candidate\n" ); |
534 | lits->push_back(ue2::make_unique<VertLitInfo>(vv, s, anchored)); |
535 | } |
536 | } |
537 | |
538 | static |
539 | void filterCandPivots(const NGHolder &g, const set<NFAVertex> &cand_raw, |
540 | set<NFAVertex> *out) { |
541 | for (auto u : cand_raw) { |
542 | const CharReach &u_cr = g[u].char_reach; |
543 | if (u_cr.count() > 40) { |
544 | continue; /* too wide to be plausible */ |
545 | } |
546 | |
547 | if (u_cr.count() > 2) { |
548 | /* include u as a candidate as successor may have backed away from |
549 | * expanding through it */ |
550 | out->insert(u); |
551 | continue; |
552 | } |
553 | |
554 | NFAVertex v = getSoleDestVertex(g, u); |
555 | if (v && in_degree(v, g) == 1 && out_degree(u, g) == 1) { |
556 | const CharReach &v_cr = g[v].char_reach; |
557 | if (v_cr.count() == 1 || v_cr.isCaselessChar()) { |
558 | continue; /* v will always generate better literals */ |
559 | } |
560 | } |
561 | |
562 | out->insert(u); |
563 | } |
564 | } |
565 | |
566 | /* cand_raw is the candidate set before filtering points which are clearly |
567 | * a bad idea. */ |
568 | static |
569 | void getCandidatePivots(const NGHolder &g, set<NFAVertex> *cand, |
570 | set<NFAVertex> *cand_raw) { |
571 | auto dominators = findDominators(g); |
572 | |
573 | set<NFAVertex> accepts; |
574 | |
575 | for (auto v : inv_adjacent_vertices_range(g.accept, g)) { |
576 | if (is_special(v, g)) { |
577 | continue; |
578 | } |
579 | accepts.insert(v); |
580 | } |
581 | for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) { |
582 | if (is_special(v, g)) { |
583 | continue; |
584 | } |
585 | accepts.insert(v); |
586 | } |
587 | |
588 | assert(!accepts.empty()); |
589 | |
590 | vector<NFAVertex> dom_trace; |
591 | auto ait = accepts.begin(); |
592 | assert(ait != accepts.end()); |
593 | NFAVertex curr = *ait; |
594 | while (curr && !is_special(curr, g)) { |
595 | dom_trace.push_back(curr); |
596 | curr = dominators[curr]; |
597 | } |
598 | reverse(dom_trace.begin(), dom_trace.end()); |
599 | for (++ait; ait != accepts.end(); ++ait) { |
600 | curr = *ait; |
601 | vector<NFAVertex> dom_trace2; |
602 | while (curr && !is_special(curr, g)) { |
603 | dom_trace2.push_back(curr); |
604 | curr = dominators[curr]; |
605 | } |
606 | reverse(dom_trace2.begin(), dom_trace2.end()); |
607 | auto dti = dom_trace.begin(), dtie = dom_trace.end(); |
608 | auto dtj = dom_trace2.begin(), dtje = dom_trace2.end(); |
609 | while (dti != dtie && dtj != dtje && *dti == *dtj) { |
610 | ++dti; |
611 | ++dtj; |
612 | } |
613 | dom_trace.erase(dti, dtie); |
614 | } |
615 | |
616 | cand_raw->insert(dom_trace.begin(), dom_trace.end()); |
617 | |
618 | filterCandPivots(g, *cand_raw, cand); |
619 | } |
620 | |
621 | static |
622 | unique_ptr<VertLitInfo> findBestSplit(const NGHolder &g, |
623 | const vector<NFAVertexDepth> *depths, |
624 | bool for_prefix, u32 min_len, |
625 | const set<NFAVertex> *allowed_cand, |
626 | const set<NFAVertex> *disallowed_cand, |
627 | bool last_chance, |
628 | const CompileContext &cc) { |
629 | assert(!for_prefix || depths); |
630 | |
631 | /* look for a single simple split point */ |
632 | set<NFAVertex> cand; |
633 | set<NFAVertex> cand_raw; |
634 | |
635 | getCandidatePivots(g, &cand, &cand_raw); |
636 | |
637 | if (allowed_cand) { |
638 | set<NFAVertex> cand2; |
639 | set<NFAVertex> cand2_raw; |
640 | set_intersection(allowed_cand->begin(), allowed_cand->end(), |
641 | cand.begin(), cand.end(), |
642 | inserter(cand2, cand2.begin())); |
643 | |
644 | set_intersection(allowed_cand->begin(), allowed_cand->end(), |
645 | cand_raw.begin(), cand_raw.end(), |
646 | inserter(cand2_raw, cand2_raw.begin())); |
647 | |
648 | cand = std::move(cand2); |
649 | cand_raw = std::move(cand2_raw); |
650 | } |
651 | if (disallowed_cand) { |
652 | DEBUG_PRINTF("%zu disallowed candidates\n" , disallowed_cand->size()); |
653 | DEBUG_PRINTF("|old cand| = %zu\n" , cand.size()); |
654 | erase_all(&cand, *disallowed_cand); |
655 | insert(&cand_raw, *disallowed_cand); |
656 | } |
657 | |
658 | if (!generates_callbacks(g)) { |
659 | /* not output exposed so must leave some RHS */ |
660 | for (NFAVertex v : inv_adjacent_vertices_range(g.accept, g)) { |
661 | cand.erase(v); |
662 | cand_raw.erase(v); |
663 | } |
664 | |
665 | for (NFAVertex v : inv_adjacent_vertices_range(g.acceptEod, g)) { |
666 | cand.erase(v); |
667 | cand_raw.erase(v); |
668 | } |
669 | } |
670 | |
671 | DEBUG_PRINTF("|cand| = %zu\n" , cand.size()); |
672 | |
673 | bool seeking_anchored = for_prefix; |
674 | bool seeking_transient = for_prefix; |
675 | |
676 | bool desperation = for_prefix && cc.streaming; |
677 | |
678 | vector<unique_ptr<VertLitInfo>> lits; /**< sorted list of potential cuts */ |
679 | |
680 | getSimpleRoseLiterals(g, seeking_anchored, depths, cand, &lits, min_len, |
681 | desperation, last_chance, cc); |
682 | getRegionRoseLiterals(g, seeking_anchored, depths, cand_raw, allowed_cand, |
683 | &lits, min_len, desperation, last_chance, cc); |
684 | |
685 | if (lits.empty()) { |
686 | DEBUG_PRINTF("no literals found\n" ); |
687 | return nullptr; |
688 | } |
689 | |
690 | if (seeking_transient) { |
691 | for (auto &a : lits) { |
692 | a->creates_transient |
693 | = createsTransientLHS(g, a->vv, *depths, cc.grey); |
694 | } |
695 | } |
696 | |
697 | if (last_chance) { |
698 | const size_t num_verts = num_vertices(g); |
699 | auto color_map = make_small_color_map(g); |
700 | for (auto &a : lits) { |
701 | size_t num_reachable = count_reachable(g, a->vv, color_map); |
702 | double ratio = (double)num_reachable / (double)num_verts; |
703 | a->split_ratio = ratio > 0.5 ? 1 - ratio : ratio; |
704 | } |
705 | } |
706 | |
707 | auto cmp = LitComparator(g, seeking_anchored, seeking_transient, |
708 | last_chance); |
709 | |
710 | unique_ptr<VertLitInfo> best = move(lits.back()); |
711 | lits.pop_back(); |
712 | while (!lits.empty()) { |
713 | if (cmp(best, lits.back())) { |
714 | best = move(lits.back()); |
715 | } |
716 | lits.pop_back(); |
717 | } |
718 | |
719 | DEBUG_PRINTF("best is '%s' %zu a%d t%d\n" , |
720 | dumpString(*best->lit.begin()).c_str(), |
721 | g[best->vv.front()].index, |
722 | depths ? (int)createsAnchoredLHS(g, best->vv, *depths, cc.grey) : 0, |
723 | depths ? (int)createsTransientLHS(g, best->vv, *depths, cc.grey) : 0); |
724 | |
725 | return best; |
726 | } |
727 | |
728 | static |
729 | void poisonFromSuccessor(const NGHolder &h, const ue2_literal &succ, |
730 | bool overhang_ok, flat_set<NFAEdge> &bad) { |
731 | DEBUG_PRINTF("poisoning holder of size %zu, succ len %zu\n" , |
732 | num_vertices(h), succ.length()); |
733 | |
734 | using EdgeSet = boost::dynamic_bitset<>; |
735 | |
736 | const size_t edge_count = num_edges(h); |
737 | EdgeSet bad_edges(edge_count); |
738 | |
739 | unordered_map<NFAVertex, EdgeSet> curr; |
740 | for (const auto &e : in_edges_range(h.accept, h)) { |
741 | auto &path_set = curr[source(e, h)]; |
742 | if (path_set.empty()) { |
743 | path_set.resize(edge_count); |
744 | } |
745 | path_set.set(h[e].index); |
746 | } |
747 | |
748 | unordered_map<NFAVertex, EdgeSet> next; |
749 | for (auto it = succ.rbegin(); it != succ.rend(); ++it) { |
750 | for (const auto &path : curr) { |
751 | NFAVertex u = path.first; |
752 | const auto &path_set = path.second; |
753 | if (u == h.start && overhang_ok) { |
754 | DEBUG_PRINTF("poisoning early %zu [overhang]\n" , |
755 | path_set.count()); |
756 | bad_edges |= path_set; |
757 | continue; |
758 | } |
759 | if (overlaps(h[u].char_reach, *it)) { |
760 | for (const auto &e : in_edges_range(u, h)) { |
761 | auto &new_path_set = next[source(e, h)]; |
762 | if (new_path_set.empty()) { |
763 | new_path_set.resize(edge_count); |
764 | } |
765 | new_path_set |= path_set; |
766 | new_path_set.set(h[e].index); |
767 | } |
768 | } |
769 | } |
770 | DEBUG_PRINTF("succ char matches at %zu paths\n" , next.size()); |
771 | assert(overhang_ok || !curr.empty()); |
772 | swap(curr, next); |
773 | next.clear(); |
774 | } |
775 | |
776 | assert(overhang_ok || !curr.empty()); |
777 | for (const auto &path : curr) { |
778 | bad_edges |= path.second; |
779 | DEBUG_PRINTF("poisoning %zu vertices\n" , path.second.count()); |
780 | } |
781 | |
782 | for (const auto &e : edges_range(h)) { |
783 | if (bad_edges.test(h[e].index)) { |
784 | bad.insert(e); |
785 | } |
786 | } |
787 | } |
788 | |
789 | static |
790 | void poisonForGoodPrefix(const NGHolder &h, |
791 | const vector<NFAVertexDepth> &depths, |
792 | flat_set<NFAEdge> &bad, const Grey &grey) { |
793 | for (const auto &v : vertices_range(h)) { |
794 | if (!createsAnchoredLHS(h, {v}, depths, grey) |
795 | && !createsTransientLHS(h, {v}, depths, grey)) { |
796 | insert(&bad, in_edges_range(v, h)); |
797 | } |
798 | } |
799 | } |
800 | |
801 | static UNUSED |
802 | bool is_any_accept_type(RoseInVertexType t) { |
803 | return t == RIV_ACCEPT || t == RIV_ACCEPT_EOD; |
804 | } |
805 | |
806 | static |
807 | flat_set<NFAEdge> poisonEdges(const NGHolder &h, |
808 | const vector<NFAVertexDepth> *depths, |
809 | const RoseInGraph &vg, const vector<RoseInEdge> &ee, |
810 | bool for_prefix, const Grey &grey) { |
811 | DEBUG_PRINTF("poisoning edges %zu successor edges\n" , ee.size()); |
812 | |
813 | /* poison edges covered by successor literal */ |
814 | |
815 | set<pair<ue2_literal, bool> > succs; |
816 | for (const RoseInEdge &ve : ee) { |
817 | if (vg[target(ve, vg)].type != RIV_LITERAL) { |
818 | /* nothing to poison in suffixes/outfixes */ |
819 | assert(generates_callbacks(h)); |
820 | assert(is_any_accept_type(vg[target(ve, vg)].type)); |
821 | continue; |
822 | } |
823 | succs.insert({vg[target(ve, vg)].s, |
824 | vg[source(ve, vg)].type == RIV_LITERAL}); |
825 | |
826 | } |
827 | |
828 | DEBUG_PRINTF("poisoning edges %zu successor literals\n" , succs.size()); |
829 | |
830 | flat_set<NFAEdge> bad; |
831 | for (const auto &p : succs) { |
832 | poisonFromSuccessor(h, p.first, p.second, bad); |
833 | } |
834 | |
835 | /* poison edges which don't significantly improve a prefix */ |
836 | |
837 | if (for_prefix) { |
838 | poisonForGoodPrefix(h, *depths, bad, grey); |
839 | } |
840 | |
841 | return bad; |
842 | } |
843 | |
844 | static |
845 | set<NFAVertex> poisonVertices(const NGHolder &h, const RoseInGraph &vg, |
846 | const vector<RoseInEdge> &ee, const Grey &grey) { |
847 | flat_set<NFAEdge> bad_edges = poisonEdges(h, nullptr, vg, ee, false, grey); |
848 | set<NFAVertex> bad_vertices; |
849 | for (const NFAEdge &e : bad_edges) { |
850 | bad_vertices.insert(target(e, h)); |
851 | DEBUG_PRINTF("bad: %zu->%zu\n" , h[source(e, h)].index, |
852 | h[target(e, h)].index); |
853 | } |
854 | |
855 | return bad_vertices; |
856 | } |
857 | |
858 | static |
859 | unique_ptr<VertLitInfo> findBestNormalSplit(const NGHolder &g, |
860 | const RoseInGraph &vg, |
861 | const vector<RoseInEdge> &ee, |
862 | const CompileContext &cc) { |
863 | assert(g.kind == NFA_OUTFIX || g.kind == NFA_INFIX || g.kind == NFA_SUFFIX); |
864 | set<NFAVertex> bad_vertices = poisonVertices(g, vg, ee, cc.grey); |
865 | |
866 | return findBestSplit(g, nullptr, false, cc.grey.minRoseLiteralLength, |
867 | nullptr, &bad_vertices, false, cc); |
868 | } |
869 | |
870 | static |
871 | unique_ptr<VertLitInfo> findBestLastChanceSplit(const NGHolder &g, |
872 | const RoseInGraph &vg, |
873 | const vector<RoseInEdge> &ee, |
874 | const CompileContext &cc) { |
875 | assert(g.kind == NFA_OUTFIX || g.kind == NFA_INFIX || g.kind == NFA_SUFFIX); |
876 | set<NFAVertex> bad_vertices = poisonVertices(g, vg, ee, cc.grey); |
877 | |
878 | return findBestSplit(g, nullptr, false, cc.grey.minRoseLiteralLength, |
879 | nullptr, &bad_vertices, true, cc); |
880 | } |
881 | |
882 | static |
883 | unique_ptr<VertLitInfo> findSimplePrefixSplit(const NGHolder &g, |
884 | const CompileContext &cc) { |
885 | DEBUG_PRINTF("looking for simple prefix split\n" ); |
886 | bool anchored = !proper_out_degree(g.startDs, g); |
887 | NFAVertex u = anchored ? g.start : g.startDs; |
888 | |
889 | if (out_degree(u, g) != 2) { /* startDs + succ */ |
890 | return nullptr; |
891 | } |
892 | |
893 | NFAVertex v = NGHolder::null_vertex(); |
894 | for (NFAVertex t : adjacent_vertices_range(u, g)) { |
895 | if (t != g.startDs) { |
896 | assert(!v); |
897 | v = t; |
898 | } |
899 | } |
900 | assert(v); |
901 | |
902 | if (!anchored) { |
903 | if (out_degree(g.start, g) > 2) { |
904 | return nullptr; |
905 | } |
906 | if (out_degree(g.start, g) == 2 && !edge(g.start, v, g).second) { |
907 | return nullptr; |
908 | } |
909 | } |
910 | |
911 | NFAVertex best_v = NGHolder::null_vertex(); |
912 | ue2_literal best_lit; |
913 | |
914 | u32 limit = cc.grey.maxHistoryAvailable; |
915 | if (anchored) { |
916 | LIMIT_TO_AT_MOST(&limit, cc.grey.maxAnchoredRegion); |
917 | } |
918 | |
919 | ue2_literal curr_lit; |
920 | for (u32 i = 0; i < limit; i++) { |
921 | const auto &v_cr = g[v].char_reach; |
922 | if (v_cr.count() == 1 || v_cr.isCaselessChar()) { |
923 | curr_lit.push_back(v_cr.find_first(), v_cr.isCaselessChar()); |
924 | } else { |
925 | curr_lit.clear(); |
926 | } |
927 | |
928 | if (curr_lit.length() > best_lit.length()) { |
929 | best_lit = curr_lit; |
930 | best_v = v; |
931 | } |
932 | |
933 | if (out_degree(v, g) != 1) { |
934 | break; |
935 | } |
936 | v = *adjacent_vertices(v, g).first; |
937 | } |
938 | |
939 | if (best_lit.length() < cc.grey.minRoseLiteralLength) { |
940 | return nullptr; |
941 | } |
942 | |
943 | set<ue2_literal> best_lit_set({best_lit}); |
944 | if (bad_mixed_sensitivity(best_lit)) { |
945 | sanitizeAndCompressAndScore(best_lit_set); |
946 | } |
947 | |
948 | return ue2::make_unique<VertLitInfo>(best_v, best_lit_set, anchored, true); |
949 | } |
950 | |
951 | static |
952 | unique_ptr<VertLitInfo> findBestPrefixSplit(const NGHolder &g, |
953 | const vector<NFAVertexDepth> &depths, |
954 | const RoseInGraph &vg, |
955 | const vector<RoseInEdge> &ee, |
956 | bool last_chance, |
957 | const CompileContext &cc) { |
958 | assert(g.kind == NFA_PREFIX || g.kind == NFA_OUTFIX); |
959 | set<NFAVertex> bad_vertices = poisonVertices(g, vg, ee, cc.grey); |
960 | auto rv = findBestSplit(g, &depths, true, cc.grey.minRoseLiteralLength, |
961 | nullptr, &bad_vertices, last_chance, cc); |
962 | |
963 | /* large back edges may prevent us identifying anchored or transient cases |
964 | * properly - use a simple walk instead */ |
965 | if (!rv || !(rv->creates_transient || rv->creates_anchored)) { |
966 | auto rv2 = findSimplePrefixSplit(g, cc); |
967 | if (rv2) { |
968 | return rv2; |
969 | } |
970 | } |
971 | |
972 | return rv; |
973 | } |
974 | |
975 | static |
976 | unique_ptr<VertLitInfo> findBestCleanSplit(const NGHolder &g, |
977 | const CompileContext &cc) { |
978 | assert(g.kind != NFA_PREFIX); |
979 | set<NFAVertex> cleanSplits; |
980 | for (NFAVertex v : vertices_range(g)) { |
981 | if (!g[v].char_reach.all() || !edge(v, v, g).second) { |
982 | continue; |
983 | } |
984 | insert(&cleanSplits, inv_adjacent_vertices(v, g)); |
985 | cleanSplits.erase(v); |
986 | } |
987 | cleanSplits.erase(g.start); |
988 | if (cleanSplits.empty()) { |
989 | return nullptr; |
990 | } |
991 | return findBestSplit(g, nullptr, false, cc.grey.violetEarlyCleanLiteralLen, |
992 | &cleanSplits, nullptr, false, cc); |
993 | } |
994 | |
995 | static |
996 | bool can_match(const NGHolder &g, const ue2_literal &lit, bool overhang_ok) { |
997 | set<NFAVertex> curr, next; |
998 | curr.insert(g.accept); |
999 | |
1000 | for (auto it = lit.rbegin(); it != lit.rend(); ++it) { |
1001 | next.clear(); |
1002 | |
1003 | for (auto v : curr) { |
1004 | for (auto u : inv_adjacent_vertices_range(v, g)) { |
1005 | if (u == g.start) { |
1006 | if (overhang_ok) { |
1007 | DEBUG_PRINTF("bail\n" ); |
1008 | return true; |
1009 | } else { |
1010 | continue; /* it is not possible for a lhs literal to |
1011 | * overhang the start */ |
1012 | } |
1013 | } |
1014 | |
1015 | const CharReach &cr = g[u].char_reach; |
1016 | if (!overlaps(*it, cr)) { |
1017 | continue; |
1018 | } |
1019 | |
1020 | next.insert(u); |
1021 | } |
1022 | } |
1023 | |
1024 | curr.swap(next); |
1025 | } |
1026 | |
1027 | return !curr.empty(); |
1028 | } |
1029 | |
1030 | static |
1031 | bool splitRoseEdge(const NGHolder &base_graph, RoseInGraph &vg, |
1032 | const vector<RoseInEdge> &ee, const VertLitInfo &split) { |
1033 | const vector<NFAVertex> &splitters = split.vv; |
1034 | assert(!splitters.empty()); |
1035 | |
1036 | shared_ptr<NGHolder> lhs = make_shared<NGHolder>(); |
1037 | shared_ptr<NGHolder> rhs = make_shared<NGHolder>(); |
1038 | |
1039 | unordered_map<NFAVertex, NFAVertex> lhs_map; |
1040 | unordered_map<NFAVertex, NFAVertex> rhs_map; |
1041 | |
1042 | splitGraph(base_graph, splitters, lhs.get(), &lhs_map, rhs.get(), &rhs_map); |
1043 | DEBUG_PRINTF("split %s:%zu into %s:%zu + %s:%zu\n" , |
1044 | to_string(base_graph.kind).c_str(), num_vertices(base_graph), |
1045 | to_string(lhs->kind).c_str(), num_vertices(*lhs), |
1046 | to_string(rhs->kind).c_str(), num_vertices(*rhs)); |
1047 | |
1048 | bool suffix = generates_callbacks(base_graph); |
1049 | |
1050 | if (is_triggered(base_graph)) { |
1051 | /* if we are already guarded, check if the split reduces the size of |
1052 | * the problem before continuing with the split */ |
1053 | if (num_vertices(*lhs) >= num_vertices(base_graph) |
1054 | && !(suffix && isVacuous(*rhs))) { |
1055 | DEBUG_PRINTF("split's lhs is no smaller\n" ); |
1056 | return false; |
1057 | } |
1058 | |
1059 | if (num_vertices(*rhs) >= num_vertices(base_graph)) { |
1060 | DEBUG_PRINTF("split's rhs is no smaller\n" ); |
1061 | return false; |
1062 | } |
1063 | } |
1064 | |
1065 | bool do_accept = false; |
1066 | bool do_accept_eod = false; |
1067 | assert(rhs); |
1068 | if (isVacuous(*rhs) && suffix) { |
1069 | if (edge(rhs->start, rhs->accept, *rhs).second) { |
1070 | DEBUG_PRINTF("rhs has a cliche\n" ); |
1071 | do_accept = true; |
1072 | remove_edge(rhs->start, rhs->accept, *rhs); |
1073 | } |
1074 | |
1075 | if (edge(rhs->start, rhs->acceptEod, *rhs).second) { |
1076 | DEBUG_PRINTF("rhs has an eod cliche\n" ); |
1077 | do_accept_eod = true; |
1078 | remove_edge(rhs->start, rhs->acceptEod, *rhs); |
1079 | } |
1080 | |
1081 | renumber_edges(*rhs); |
1082 | } |
1083 | |
1084 | /* check if we still have a useful graph left over */ |
1085 | bool do_norm = out_degree(rhs->start, *rhs) != 1; |
1086 | |
1087 | set<ReportID> splitter_reports; |
1088 | for (auto v : splitters) { |
1089 | insert(&splitter_reports, base_graph[v].reports); |
1090 | } |
1091 | |
1092 | /* find the targets of each source vertex; insertion_ordered_map used to |
1093 | * preserve deterministic ordering */ |
1094 | insertion_ordered_map<RoseInVertex, vector<RoseInVertex>> images; |
1095 | for (const RoseInEdge &e : ee) { |
1096 | RoseInVertex src = source(e, vg); |
1097 | RoseInVertex dest = target(e, vg); |
1098 | images[src].push_back(dest); |
1099 | remove_edge(e, vg); |
1100 | } |
1101 | |
1102 | map<vector<RoseInVertex>, vector<RoseInVertex>> verts_by_image; |
1103 | |
1104 | for (const auto &m : images) { |
1105 | const auto &u = m.first; |
1106 | const auto &image = m.second; |
1107 | |
1108 | if (contains(verts_by_image, image)) { |
1109 | for (RoseInVertex v : verts_by_image[image]) { |
1110 | add_edge(u, v, RoseInEdgeProps(lhs, 0U), vg); |
1111 | } |
1112 | continue; |
1113 | } |
1114 | |
1115 | for (const auto &lit : split.lit) { |
1116 | assert(!bad_mixed_sensitivity(lit)); |
1117 | |
1118 | /* don't allow overhang in can_match() as literals should |
1119 | * correspond to the edge graph being split; overhanging the graph |
1120 | * would indicate a false path.*/ |
1121 | if (!can_match(*lhs, lit, false)) { |
1122 | DEBUG_PRINTF("'%s' did not match lhs\n" , |
1123 | escapeString(lit).c_str()); |
1124 | continue; |
1125 | } |
1126 | |
1127 | DEBUG_PRINTF("best is '%s'\n" , escapeString(lit).c_str()); |
1128 | auto v = add_vertex(RoseInVertexProps::makeLiteral(lit), vg); |
1129 | add_edge(u, v, RoseInEdgeProps(lhs, 0U), vg); |
1130 | |
1131 | /* work out delay later */ |
1132 | if (do_accept) { |
1133 | DEBUG_PRINTF("rhs has a cliche\n" ); |
1134 | auto tt = add_vertex(RoseInVertexProps::makeAccept( |
1135 | splitter_reports), vg); |
1136 | add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg); |
1137 | } |
1138 | |
1139 | if (do_accept_eod) { |
1140 | DEBUG_PRINTF("rhs has an eod cliche\n" ); |
1141 | auto tt = add_vertex(RoseInVertexProps::makeAcceptEod( |
1142 | splitter_reports), vg); |
1143 | add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg); |
1144 | } |
1145 | |
1146 | if (do_norm) { |
1147 | assert(out_degree(rhs->start, *rhs) > 1); |
1148 | for (RoseInVertex dest : image) { |
1149 | add_edge(v, dest, RoseInEdgeProps(rhs, 0U), vg); |
1150 | } |
1151 | } |
1152 | verts_by_image[image].push_back(v); |
1153 | } |
1154 | } |
1155 | |
1156 | assert(hasCorrectlyNumberedVertices(*rhs)); |
1157 | assert(hasCorrectlyNumberedEdges(*rhs)); |
1158 | assert(isCorrectlyTopped(*rhs)); |
1159 | assert(hasCorrectlyNumberedVertices(*lhs)); |
1160 | assert(hasCorrectlyNumberedEdges(*lhs)); |
1161 | assert(isCorrectlyTopped(*lhs)); |
1162 | |
1163 | return true; |
1164 | } |
1165 | |
1166 | #define MAX_NETFLOW_CUT_WIDTH 40 /* magic number is magic */ |
1167 | #define MAX_LEN_2_LITERALS_PER_CUT 3 |
1168 | |
1169 | static |
1170 | bool checkValidNetflowLits(NGHolder &h, const vector<u64a> &scores, |
1171 | const map<NFAEdge, set<ue2_literal>> &cut_lits, |
1172 | u32 min_allowed_length) { |
1173 | DEBUG_PRINTF("cut width %zu; min allowed %u\n" , cut_lits.size(), |
1174 | min_allowed_length); |
1175 | if (cut_lits.size() > MAX_NETFLOW_CUT_WIDTH) { |
1176 | return false; |
1177 | } |
1178 | |
1179 | u32 len_2_count = 0; |
1180 | |
1181 | for (const auto &cut : cut_lits) { |
1182 | if (scores[h[cut.first].index] >= NO_LITERAL_AT_EDGE_SCORE) { |
1183 | DEBUG_PRINTF("cut uses a forbidden edge\n" ); |
1184 | return false; |
1185 | } |
1186 | |
1187 | if (min_len(cut.second) < min_allowed_length) { |
1188 | DEBUG_PRINTF("cut uses a bad literal\n" ); |
1189 | return false; |
1190 | } |
1191 | |
1192 | for (const auto &lit : cut.second) { |
1193 | if (lit.length() == 2) { |
1194 | len_2_count++; |
1195 | } |
1196 | } |
1197 | } |
1198 | |
1199 | if (len_2_count > MAX_LEN_2_LITERALS_PER_CUT) { |
1200 | return false; |
1201 | } |
1202 | |
1203 | return true; |
1204 | } |
1205 | |
1206 | static |
1207 | void splitEdgesByCut(NGHolder &h, RoseInGraph &vg, |
1208 | const vector<RoseInEdge> &to_cut, |
1209 | const vector<NFAEdge> &cut, |
1210 | const map<NFAEdge, set<ue2_literal>> &cut_lits) { |
1211 | DEBUG_PRINTF("splitting %s (%zu vertices)\n" , to_string(h.kind).c_str(), |
1212 | num_vertices(h)); |
1213 | |
1214 | /* create literal vertices and connect preds */ |
1215 | unordered_set<RoseInVertex> done_sources; |
1216 | map<RoseInVertex, vector<pair<RoseInVertex, NFAVertex>>> verts_by_source; |
1217 | for (const RoseInEdge &ve : to_cut) { |
1218 | assert(&h == &*vg[ve].graph); |
1219 | RoseInVertex src = source(ve, vg); |
1220 | if (!done_sources.insert(src).second) { |
1221 | continue; /* already processed */ |
1222 | } |
1223 | |
1224 | /* iterate over cut for determinism */ |
1225 | for (const auto &e : cut) { |
1226 | NFAVertex prev_v = source(e, h); |
1227 | NFAVertex pivot = target(e, h); |
1228 | |
1229 | DEBUG_PRINTF("splitting on pivot %zu\n" , h[pivot].index); |
1230 | unordered_map<NFAVertex, NFAVertex> temp_map; |
1231 | shared_ptr<NGHolder> new_lhs = make_shared<NGHolder>(); |
1232 | splitLHS(h, pivot, new_lhs.get(), &temp_map); |
1233 | |
1234 | /* want to cut off paths to pivot from things other than the pivot - |
1235 | * makes a more svelte graphy */ |
1236 | clear_in_edges(temp_map[pivot], *new_lhs); |
1237 | NFAEdge pivot_edge = add_edge(temp_map[prev_v], temp_map[pivot], |
1238 | *new_lhs); |
1239 | if (is_triggered(h) && prev_v == h.start) { |
1240 | (*new_lhs)[pivot_edge].tops.insert(DEFAULT_TOP); |
1241 | } |
1242 | |
1243 | pruneUseless(*new_lhs, false); |
1244 | renumber_vertices(*new_lhs); |
1245 | renumber_edges(*new_lhs); |
1246 | |
1247 | DEBUG_PRINTF(" into lhs %s (%zu vertices)\n" , |
1248 | to_string(new_lhs->kind).c_str(), |
1249 | num_vertices(*new_lhs)); |
1250 | |
1251 | assert(hasCorrectlyNumberedVertices(*new_lhs)); |
1252 | assert(hasCorrectlyNumberedEdges(*new_lhs)); |
1253 | assert(isCorrectlyTopped(*new_lhs)); |
1254 | |
1255 | const set<ue2_literal> &lits = cut_lits.at(e); |
1256 | for (const auto &lit : lits) { |
1257 | if (!can_match(*new_lhs, lit, is_triggered(h))) { |
1258 | continue; |
1259 | } |
1260 | |
1261 | RoseInVertex v |
1262 | = add_vertex(RoseInVertexProps::makeLiteral(lit), vg); |
1263 | |
1264 | /* if this is a prefix/infix an edge directly to accept should |
1265 | * represent a false path as we have poisoned vertices covered |
1266 | * by the literals. */ |
1267 | if (generates_callbacks(h)) { |
1268 | if (edge(pivot, h.accept, h).second) { |
1269 | DEBUG_PRINTF("adding acceptEod\n" ); |
1270 | /* literal has a direct connection to accept */ |
1271 | const flat_set<ReportID> &reports = h[pivot].reports; |
1272 | auto tt = add_vertex( |
1273 | RoseInVertexProps::makeAccept(reports), vg); |
1274 | add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg); |
1275 | } |
1276 | |
1277 | if (edge(pivot, h.acceptEod, h).second) { |
1278 | assert(generates_callbacks(h)); |
1279 | DEBUG_PRINTF("adding acceptEod\n" ); |
1280 | /* literal has a direct connection to accept */ |
1281 | const flat_set<ReportID> &reports = h[pivot].reports; |
1282 | auto tt = add_vertex( |
1283 | RoseInVertexProps::makeAcceptEod(reports), vg); |
1284 | add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg); |
1285 | } |
1286 | } |
1287 | |
1288 | add_edge(src, v, RoseInEdgeProps(new_lhs, 0), vg); |
1289 | verts_by_source[src].push_back({v, pivot}); |
1290 | } |
1291 | } |
1292 | } |
1293 | |
1294 | /* wire the literal vertices up to successors */ |
1295 | map<vector<NFAVertex>, shared_ptr<NGHolder> > done_rhs; |
1296 | for (const RoseInEdge &ve : to_cut) { |
1297 | RoseInVertex src = source(ve, vg); |
1298 | RoseInVertex dest = target(ve, vg); |
1299 | |
1300 | /* iterate over cut for determinism */ |
1301 | for (const auto &elem : verts_by_source[src]) { |
1302 | NFAVertex pivot = elem.second; |
1303 | RoseInVertex v = elem.first; |
1304 | |
1305 | vector<NFAVertex> adj; |
1306 | insert(&adj, adj.end(), adjacent_vertices(pivot, h)); |
1307 | /* we can ignore presence of accept, accepteod in adj as it is best |
1308 | effort */ |
1309 | |
1310 | if (!contains(done_rhs, adj)) { |
1311 | unordered_map<NFAVertex, NFAVertex> temp_map; |
1312 | shared_ptr<NGHolder> new_rhs = make_shared<NGHolder>(); |
1313 | splitRHS(h, adj, new_rhs.get(), &temp_map); |
1314 | remove_edge(new_rhs->start, new_rhs->accept, *new_rhs); |
1315 | remove_edge(new_rhs->start, new_rhs->acceptEod, *new_rhs); |
1316 | renumber_edges(*new_rhs); |
1317 | DEBUG_PRINTF(" into rhs %s (%zu vertices)\n" , |
1318 | to_string(new_rhs->kind).c_str(), |
1319 | num_vertices(*new_rhs)); |
1320 | done_rhs.emplace(adj, new_rhs); |
1321 | assert(isCorrectlyTopped(*new_rhs)); |
1322 | } |
1323 | |
1324 | assert(done_rhs[adj].get()); |
1325 | shared_ptr<NGHolder> new_rhs = done_rhs[adj]; |
1326 | |
1327 | assert(hasCorrectlyNumberedVertices(*new_rhs)); |
1328 | assert(hasCorrectlyNumberedEdges(*new_rhs)); |
1329 | assert(isCorrectlyTopped(*new_rhs)); |
1330 | |
1331 | if (vg[dest].type == RIV_LITERAL |
1332 | && !can_match(*new_rhs, vg[dest].s, true)) { |
1333 | continue; |
1334 | } |
1335 | |
1336 | if (out_degree(new_rhs->start, *new_rhs) != 1) { |
1337 | add_edge(v, dest, RoseInEdgeProps(new_rhs, 0), vg); |
1338 | } |
1339 | } |
1340 | |
1341 | remove_edge(ve, vg); |
1342 | } |
1343 | } |
1344 | |
1345 | static |
1346 | bool doNetflowCut(NGHolder &h, |
1347 | const vector<NFAVertexDepth> *depths, |
1348 | RoseInGraph &vg, |
1349 | const vector<RoseInEdge> &ee, bool for_prefix, |
1350 | const Grey &grey, u32 min_allowed_length = 0U) { |
1351 | ENSURE_AT_LEAST(&min_allowed_length, grey.minRoseNetflowLiteralLength); |
1352 | |
1353 | DEBUG_PRINTF("doing netflow cut\n" ); |
1354 | /* TODO: we should really get literals/scores from the full graph as this |
1355 | * allows us to overlap with previous cuts. */ |
1356 | assert(!ee.empty()); |
1357 | assert(&h == &*vg[ee.front()].graph); |
1358 | assert(!for_prefix || depths); |
1359 | |
1360 | if (num_edges(h) > grey.maxRoseNetflowEdges) { |
1361 | /* We have a limit on this because scoring edges and running netflow |
1362 | * gets very slow for big graphs. */ |
1363 | DEBUG_PRINTF("too many edges, skipping netflow cut\n" ); |
1364 | return false; |
1365 | } |
1366 | |
1367 | assert(hasCorrectlyNumberedVertices(h)); |
1368 | assert(hasCorrectlyNumberedEdges(h)); |
1369 | |
1370 | auto known_bad = poisonEdges(h, depths, vg, ee, for_prefix, grey); |
1371 | |
1372 | /* Step 1: Get scores for all edges */ |
1373 | vector<u64a> scores = scoreEdges(h, known_bad); /* scores by edge_index */ |
1374 | |
1375 | /* Step 2: Find cutset based on scores */ |
1376 | vector<NFAEdge> cut = findMinCut(h, scores); |
1377 | |
1378 | /* Step 3: Get literals corresponding to cut edges */ |
1379 | map<NFAEdge, set<ue2_literal>> cut_lits; |
1380 | for (const auto &e : cut) { |
1381 | set<ue2_literal> lits = getLiteralSet(h, e); |
1382 | sanitizeAndCompressAndScore(lits); |
1383 | |
1384 | cut_lits[e] = lits; |
1385 | } |
1386 | |
1387 | /* if literals are underlength bail or if it involves a forbidden edge*/ |
1388 | if (!checkValidNetflowLits(h, scores, cut_lits, min_allowed_length)) { |
1389 | return false; |
1390 | } |
1391 | DEBUG_PRINTF("splitting\n" ); |
1392 | |
1393 | /* Step 4: Split graph based on cuts */ |
1394 | splitEdgesByCut(h, vg, ee, cut, cut_lits); |
1395 | |
1396 | return true; |
1397 | } |
1398 | |
1399 | static |
1400 | bool deanchorIfNeeded(NGHolder &g) { |
1401 | DEBUG_PRINTF("hi\n" ); |
1402 | if (proper_out_degree(g.startDs, g)) { |
1403 | return false; |
1404 | } |
1405 | |
1406 | /* look for a non-special dot with a loop following start */ |
1407 | set<NFAVertex> succ_g; |
1408 | insert(&succ_g, adjacent_vertices(g.start, g)); |
1409 | succ_g.erase(g.startDs); |
1410 | |
1411 | for (auto v : adjacent_vertices_range(g.start, g)) { |
1412 | DEBUG_PRINTF("inspecting cand %zu || = %zu\n" , g[v].index, |
1413 | g[v].char_reach.count()); |
1414 | |
1415 | if (v == g.startDs || !g[v].char_reach.all()) { |
1416 | continue; |
1417 | } |
1418 | |
1419 | set<NFAVertex> succ_v; |
1420 | insert(&succ_v, adjacent_vertices(v, g)); |
1421 | |
1422 | if (succ_v == succ_g) { |
1423 | DEBUG_PRINTF("found ^.*\n" ); |
1424 | for (auto succ : adjacent_vertices_range(g.start, g)) { |
1425 | if (succ == g.startDs) { |
1426 | continue; |
1427 | } |
1428 | add_edge(g.startDs, succ, g); |
1429 | } |
1430 | clear_vertex(v, g); |
1431 | remove_vertex(v, g); |
1432 | renumber_vertices(g); |
1433 | return true; |
1434 | } |
1435 | |
1436 | if (succ_g.size() == 1 && hasSelfLoop(v, g)) { |
1437 | DEBUG_PRINTF("found ^.+\n" ); |
1438 | add_edge(g.startDs, v, g); |
1439 | remove_edge(v, v, g); |
1440 | return true; |
1441 | } |
1442 | } |
1443 | |
1444 | return false; |
1445 | } |
1446 | |
1447 | static |
1448 | RoseInGraph populateTrivialGraph(const NGHolder &h) { |
1449 | RoseInGraph g; |
1450 | shared_ptr<NGHolder> root_g = cloneHolder(h); |
1451 | bool orig_anch = isAnchored(*root_g); |
1452 | orig_anch |= deanchorIfNeeded(*root_g); |
1453 | |
1454 | DEBUG_PRINTF("orig_anch %d\n" , (int)orig_anch); |
1455 | |
1456 | auto start = add_vertex(RoseInVertexProps::makeStart(orig_anch), g); |
1457 | auto accept = add_vertex(RoseInVertexProps::makeAccept(set<ReportID>()), g); |
1458 | |
1459 | add_edge(start, accept, RoseInEdgeProps(root_g, 0), g); |
1460 | |
1461 | return g; |
1462 | } |
1463 | |
1464 | static |
1465 | void avoidOutfixes(RoseInGraph &vg, bool last_chance, |
1466 | const CompileContext &cc) { |
1467 | STAGE_DEBUG_PRINTF("AVOIDING OUTFIX\n" ); |
1468 | assert(num_vertices(vg) == 2); |
1469 | assert(num_edges(vg) == 1); |
1470 | |
1471 | RoseInEdge e = *edges(vg).first; |
1472 | |
1473 | NGHolder &h = *vg[e].graph; |
1474 | assert(isCorrectlyTopped(h)); |
1475 | |
1476 | renumber_vertices(h); |
1477 | renumber_edges(h); |
1478 | |
1479 | unique_ptr<VertLitInfo> split = findBestNormalSplit(h, vg, {e}, cc); |
1480 | |
1481 | if (split && splitRoseEdge(h, vg, {e}, *split)) { |
1482 | DEBUG_PRINTF("split on simple literal\n" ); |
1483 | return; |
1484 | } |
1485 | |
1486 | if (last_chance) { |
1487 | /* look for a prefix split as it allows us to accept very weak anchored |
1488 | * literals. */ |
1489 | auto depths = calcDepths(h); |
1490 | |
1491 | split = findBestPrefixSplit(h, depths, vg, {e}, last_chance, cc); |
1492 | |
1493 | if (split && splitRoseEdge(h, vg, {e}, *split)) { |
1494 | DEBUG_PRINTF("split on simple literal\n" ); |
1495 | return; |
1496 | } |
1497 | } |
1498 | |
1499 | doNetflowCut(h, nullptr, vg, {e}, false, cc.grey); |
1500 | } |
1501 | |
1502 | static |
1503 | void removeRedundantPrefixes(RoseInGraph &g) { |
1504 | STAGE_DEBUG_PRINTF("REMOVING REDUNDANT PREFIXES\n" ); |
1505 | |
1506 | for (const RoseInEdge &e : edges_range(g)) { |
1507 | RoseInVertex s = source(e, g); |
1508 | RoseInVertex t = target(e, g); |
1509 | |
1510 | if (g[s].type != RIV_START || g[t].type != RIV_LITERAL) { |
1511 | continue; |
1512 | } |
1513 | |
1514 | if (!g[e].graph) { |
1515 | continue; |
1516 | } |
1517 | |
1518 | assert(!g[t].delay); |
1519 | const ue2_literal &lit = g[t].s; |
1520 | |
1521 | if (!literalIsWholeGraph(*g[e].graph, lit)) { |
1522 | DEBUG_PRINTF("not whole graph\n" ); |
1523 | continue; |
1524 | } |
1525 | |
1526 | if (!isFloating(*g[e].graph)) { |
1527 | DEBUG_PRINTF("not floating\n" ); |
1528 | continue; |
1529 | } |
1530 | g[e].graph.reset(); |
1531 | } |
1532 | } |
1533 | |
1534 | static |
1535 | u32 maxDelay(const CompileContext &cc) { |
1536 | if (!cc.streaming) { |
1537 | return MO_INVALID_IDX; |
1538 | } |
1539 | return cc.grey.maxHistoryAvailable; |
1540 | } |
1541 | |
1542 | static |
1543 | void removeRedundantLiteralsFromPrefixes(RoseInGraph &g, |
1544 | const CompileContext &cc) { |
1545 | STAGE_DEBUG_PRINTF("REMOVING LITERALS FROM PREFIXES\n" ); |
1546 | |
1547 | vector<RoseInEdge> to_anchor; |
1548 | for (const RoseInEdge &e : edges_range(g)) { |
1549 | RoseInVertex s = source(e, g); |
1550 | RoseInVertex t = target(e, g); |
1551 | |
1552 | if (g[s].type != RIV_START && g[s].type != RIV_ANCHORED_START) { |
1553 | continue; |
1554 | } |
1555 | |
1556 | if (g[t].type != RIV_LITERAL) { |
1557 | continue; |
1558 | } |
1559 | |
1560 | if (!g[e].graph) { |
1561 | continue; |
1562 | } |
1563 | |
1564 | if (g[e].graph_lag) { |
1565 | /* already removed redundant parts of literals */ |
1566 | continue; |
1567 | } |
1568 | |
1569 | if (g[e].dfa) { |
1570 | /* if we removed any more states, we would need to rebuild the |
1571 | * the dfa which can be time consuming. */ |
1572 | continue; |
1573 | } |
1574 | |
1575 | assert(!g[t].delay); |
1576 | const ue2_literal &lit = g[t].s; |
1577 | |
1578 | DEBUG_PRINTF("removing states for literal: %s\n" , |
1579 | dumpString(lit).c_str()); |
1580 | |
1581 | unique_ptr<NGHolder> h = cloneHolder(*g[e].graph); |
1582 | const u32 max_delay = maxDelay(cc); |
1583 | |
1584 | u32 delay = removeTrailingLiteralStates(*h, lit, max_delay, |
1585 | false /* can't overhang start */); |
1586 | |
1587 | DEBUG_PRINTF("got delay %u (max allowed %u)\n" , delay, max_delay); |
1588 | |
1589 | if (edge(h->startDs, h->accept, *h).second) { |
1590 | /* we should have delay == lit.length(), but in really complex |
1591 | * cases we may fail to identify that we can remove the whole |
1592 | * graph. Regardless, the fact that sds is wired to accept means the |
1593 | * graph serves no purpose. */ |
1594 | DEBUG_PRINTF("whole graph\n" ); |
1595 | g[e].graph.reset(); |
1596 | continue; |
1597 | } |
1598 | |
1599 | if (delay == lit.length() && edge(h->start, h->accept, *h).second |
1600 | && num_vertices(*h) == N_SPECIALS) { |
1601 | to_anchor.push_back(e); |
1602 | continue; |
1603 | } |
1604 | |
1605 | /* if we got here we should still have an interesting graph */ |
1606 | assert(delay == max_delay || num_vertices(*h) > N_SPECIALS); |
1607 | |
1608 | if (delay && delay != MO_INVALID_IDX) { |
1609 | DEBUG_PRINTF("setting delay %u on lhs %p\n" , delay, h.get()); |
1610 | |
1611 | g[e].graph = move(h); |
1612 | g[e].graph_lag = delay; |
1613 | } |
1614 | } |
1615 | |
1616 | if (!to_anchor.empty()) { |
1617 | RoseInVertex anch = add_vertex(RoseInVertexProps::makeStart(true), g); |
1618 | |
1619 | for (RoseInEdge e : to_anchor) { |
1620 | DEBUG_PRINTF("rehoming to anchor\n" ); |
1621 | RoseInVertex v = target(e, g); |
1622 | add_edge(anch, v, g); |
1623 | remove_edge(e, g); |
1624 | } |
1625 | } |
1626 | } |
1627 | |
1628 | static |
1629 | bool isStarCliche(const NGHolder &g) { |
1630 | DEBUG_PRINTF("checking graph with %zu vertices\n" , num_vertices(g)); |
1631 | |
1632 | bool nonspecials_seen = false; |
1633 | |
1634 | for (auto v : vertices_range(g)) { |
1635 | if (is_special(v, g)) { |
1636 | continue; |
1637 | } |
1638 | |
1639 | if (nonspecials_seen) { |
1640 | return false; |
1641 | } |
1642 | nonspecials_seen = true; |
1643 | |
1644 | if (!g[v].char_reach.all()) { |
1645 | return false; |
1646 | } |
1647 | |
1648 | if (!hasSelfLoop(v, g)) { |
1649 | return false; |
1650 | } |
1651 | if (!edge(v, g.accept, g).second) { |
1652 | return false; |
1653 | } |
1654 | } |
1655 | |
1656 | if (!nonspecials_seen) { |
1657 | return false; |
1658 | } |
1659 | |
1660 | if (!edge(g.start, g.accept, g).second) { |
1661 | return false; |
1662 | } |
1663 | |
1664 | return true; |
1665 | } |
1666 | |
1667 | static |
1668 | void removeRedundantLiteralsFromInfix(const NGHolder &h, RoseInGraph &ig, |
1669 | const vector<RoseInEdge> &ee, |
1670 | const CompileContext &cc) { |
1671 | /* TODO: This could be better by not creating a separate graph for each |
1672 | * successor literal. This would require using distinct report ids and also |
1673 | * taking into account overlap of successor literals. */ |
1674 | |
1675 | set<ue2_literal> preds; |
1676 | set<ue2_literal> succs; |
1677 | for (const RoseInEdge &e : ee) { |
1678 | RoseInVertex u = source(e, ig); |
1679 | assert(ig[u].type == RIV_LITERAL); |
1680 | assert(!ig[u].delay); |
1681 | preds.insert(ig[u].s); |
1682 | |
1683 | RoseInVertex v = target(e, ig); |
1684 | assert(ig[v].type == RIV_LITERAL); |
1685 | assert(!ig[v].delay); |
1686 | succs.insert(ig[v].s); |
1687 | |
1688 | if (ig[e].graph_lag) { |
1689 | /* already removed redundant parts of literals */ |
1690 | return; |
1691 | } |
1692 | |
1693 | assert(!ig[e].dfa); |
1694 | } |
1695 | |
1696 | map<ue2_literal, pair<shared_ptr<NGHolder>, u32> > graphs; /* + delay */ |
1697 | |
1698 | for (const ue2_literal &right : succs) { |
1699 | size_t max_overlap = 0; |
1700 | for (const ue2_literal &left : preds) { |
1701 | size_t overlap = maxOverlap(left, right, 0); |
1702 | ENSURE_AT_LEAST(&max_overlap, overlap); |
1703 | } |
1704 | |
1705 | u32 max_allowed_delay = right.length() - max_overlap; |
1706 | |
1707 | if (cc.streaming) { |
1708 | LIMIT_TO_AT_MOST(&max_allowed_delay, cc.grey.maxHistoryAvailable); |
1709 | } |
1710 | |
1711 | if (!max_allowed_delay) { |
1712 | continue; |
1713 | } |
1714 | |
1715 | shared_ptr<NGHolder> h_new = cloneHolder(h); |
1716 | |
1717 | u32 delay = removeTrailingLiteralStates(*h_new, right, |
1718 | max_allowed_delay); |
1719 | |
1720 | if (delay == MO_INVALID_IDX) { |
1721 | /* successor literal could not match infix -> ignore false path */ |
1722 | assert(0); |
1723 | continue; |
1724 | } |
1725 | |
1726 | if (!delay) { |
1727 | /* unable to trim graph --> no point swapping to new holder */ |
1728 | continue; |
1729 | } |
1730 | |
1731 | assert(isCorrectlyTopped(*h_new)); |
1732 | graphs[right] = make_pair(h_new, delay); |
1733 | } |
1734 | |
1735 | for (const RoseInEdge &e : ee) { |
1736 | RoseInVertex v = target(e, ig); |
1737 | const ue2_literal &succ = ig[v].s; |
1738 | if (!contains(graphs, succ)) { |
1739 | continue; |
1740 | } |
1741 | |
1742 | ig[e].graph = graphs[succ].first; |
1743 | ig[e].graph_lag = graphs[succ].second; |
1744 | |
1745 | if (isStarCliche(*ig[e].graph)) { |
1746 | DEBUG_PRINTF("is a X star!\n" ); |
1747 | ig[e].graph.reset(); |
1748 | ig[e].graph_lag = 0; |
1749 | } |
1750 | } |
1751 | } |
1752 | |
1753 | static |
1754 | void removeRedundantLiteralsFromInfixes(RoseInGraph &g, |
1755 | const CompileContext &cc) { |
1756 | insertion_ordered_map<NGHolder *, vector<RoseInEdge>> infixes; |
1757 | |
1758 | for (const RoseInEdge &e : edges_range(g)) { |
1759 | RoseInVertex s = source(e, g); |
1760 | RoseInVertex t = target(e, g); |
1761 | |
1762 | if (g[s].type != RIV_LITERAL || g[t].type != RIV_LITERAL) { |
1763 | continue; |
1764 | } |
1765 | |
1766 | if (!g[e].graph) { |
1767 | continue; |
1768 | } |
1769 | |
1770 | assert(!g[t].delay); |
1771 | if (g[e].dfa) { |
1772 | /* if we removed any more states, we would need to rebuild the |
1773 | * the dfa which can be time consuming. */ |
1774 | continue; |
1775 | } |
1776 | |
1777 | NGHolder *h = g[e].graph.get(); |
1778 | infixes[h].push_back(e); |
1779 | } |
1780 | |
1781 | for (const auto &m : infixes) { |
1782 | NGHolder *h = m.first; |
1783 | const auto &edges = m.second; |
1784 | removeRedundantLiteralsFromInfix(*h, g, edges, cc); |
1785 | } |
1786 | } |
1787 | |
1788 | static |
1789 | void removeRedundantLiterals(RoseInGraph &g, const CompileContext &cc) { |
1790 | removeRedundantLiteralsFromPrefixes(g, cc); |
1791 | removeRedundantLiteralsFromInfixes(g, cc); |
1792 | } |
1793 | |
1794 | static |
1795 | RoseInVertex getStart(RoseInGraph &vg) { |
1796 | for (RoseInVertex v : vertices_range(vg)) { |
1797 | if (vg[v].type == RIV_START || vg[v].type == RIV_ANCHORED_START) { |
1798 | return v; |
1799 | } |
1800 | } |
1801 | assert(0); |
1802 | return RoseInGraph::null_vertex(); |
1803 | } |
1804 | |
1805 | /** |
1806 | * Finds the initial accept vertex created to which suffix/outfixes are |
1807 | * attached. |
1808 | */ |
1809 | static |
1810 | RoseInVertex getPrimaryAccept(RoseInGraph &vg) { |
1811 | for (RoseInVertex v : vertices_range(vg)) { |
1812 | if (vg[v].type == RIV_ACCEPT && vg[v].reports.empty()) { |
1813 | return v; |
1814 | } |
1815 | } |
1816 | assert(0); |
1817 | return RoseInGraph::null_vertex(); |
1818 | } |
1819 | |
1820 | static |
1821 | bool willBeTransient(const depth &max_depth, const CompileContext &cc) { |
1822 | if (!cc.streaming) { |
1823 | return max_depth <= depth(ROSE_BLOCK_TRANSIENT_MAX_WIDTH); |
1824 | } else { |
1825 | return max_depth <= depth(cc.grey.maxHistoryAvailable + 1); |
1826 | } |
1827 | } |
1828 | |
1829 | static |
1830 | bool willBeAnchoredTable(const depth &max_depth, const Grey &grey) { |
1831 | return max_depth <= depth(grey.maxAnchoredRegion); |
1832 | } |
1833 | |
1834 | static |
1835 | unique_ptr<NGHolder> make_chain(u32 count) { |
1836 | assert(count); |
1837 | |
1838 | auto rv = ue2::make_unique<NGHolder>(NFA_INFIX); |
1839 | |
1840 | NGHolder &h = *rv; |
1841 | |
1842 | NFAVertex u = h.start; |
1843 | for (u32 i = 0; i < count; i++) { |
1844 | NFAVertex v = add_vertex(h); |
1845 | h[v].char_reach = CharReach::dot(); |
1846 | add_edge(u, v, h); |
1847 | u = v; |
1848 | } |
1849 | h[u].reports.insert(0); |
1850 | add_edge(u, h.accept, h); |
1851 | |
1852 | setTops(h); |
1853 | |
1854 | return rv; |
1855 | } |
1856 | |
1857 | #define SHORT_TRIGGER_LEN 16 |
1858 | |
1859 | static |
1860 | bool makeTransientFromLongLiteral(NGHolder &h, RoseInGraph &vg, |
1861 | const vector<RoseInEdge> &ee, |
1862 | const CompileContext &cc) { |
1863 | /* check max width and literal lengths to see if possible */ |
1864 | size_t min_lit = (size_t)~0ULL; |
1865 | for (const RoseInEdge &e : ee) { |
1866 | RoseInVertex v = target(e, vg); |
1867 | LIMIT_TO_AT_MOST(&min_lit, vg[v].s.length()); |
1868 | } |
1869 | |
1870 | if (min_lit <= SHORT_TRIGGER_LEN || min_lit >= UINT_MAX) { |
1871 | return false; |
1872 | } |
1873 | |
1874 | depth max_width = findMaxWidth(h); |
1875 | |
1876 | u32 delta = min_lit - SHORT_TRIGGER_LEN; |
1877 | |
1878 | if (!willBeTransient(max_width - depth(delta), cc) |
1879 | && !willBeAnchoredTable(max_width - depth(delta), cc.grey)) { |
1880 | return false; |
1881 | } |
1882 | |
1883 | DEBUG_PRINTF("candidate for splitting long literal (len %zu)\n" , min_lit); |
1884 | DEBUG_PRINTF("delta = %u\n" , delta); |
1885 | |
1886 | /* try split */ |
1887 | map<RoseInVertex, shared_ptr<NGHolder> > graphs; |
1888 | for (const RoseInEdge &e : ee) { |
1889 | RoseInVertex v = target(e, vg); |
1890 | |
1891 | shared_ptr<NGHolder> h_new = cloneHolder(h); |
1892 | |
1893 | u32 delay = removeTrailingLiteralStates(*h_new, vg[v].s, delta); |
1894 | |
1895 | DEBUG_PRINTF("delay %u\n" , delay); |
1896 | |
1897 | if (delay != delta) { |
1898 | DEBUG_PRINTF("unable to trim literal\n" ); |
1899 | return false; |
1900 | } |
1901 | |
1902 | if (in_degree(v, vg) != 1) { |
1903 | DEBUG_PRINTF("complicated\n" ); |
1904 | return false; |
1905 | } |
1906 | |
1907 | DEBUG_PRINTF("new mw = %u\n" , (u32)findMaxWidth(*h_new)); |
1908 | assert(willBeTransient(findMaxWidth(*h_new), cc) |
1909 | || willBeAnchoredTable(findMaxWidth(*h_new), cc.grey)); |
1910 | |
1911 | assert(isCorrectlyTopped(*h_new)); |
1912 | graphs[v] = h_new; |
1913 | } |
1914 | |
1915 | /* add .{repeats} from prefixes to long literals */ |
1916 | for (const RoseInEdge &e : ee) { |
1917 | RoseInVertex s = source(e, vg); |
1918 | RoseInVertex t = target(e, vg); |
1919 | |
1920 | remove_edge(e, vg); |
1921 | const ue2_literal &orig_lit = vg[t].s; |
1922 | |
1923 | ue2_literal lit(orig_lit.begin(), orig_lit.end() - delta); |
1924 | |
1925 | ue2_literal lit2(orig_lit.end() - delta, orig_lit.end()); |
1926 | |
1927 | assert(lit.length() + delta == orig_lit.length()); |
1928 | |
1929 | vg[t].s = lit2; |
1930 | |
1931 | RoseInVertex v = add_vertex(RoseInVertexProps::makeLiteral(lit), vg); |
1932 | add_edge(s, v, RoseInEdgeProps(graphs[t], 0), vg); |
1933 | add_edge(v, t, RoseInEdgeProps(make_chain(delta), 0), vg); |
1934 | } |
1935 | |
1936 | DEBUG_PRINTF("success\n" ); |
1937 | /* TODO: alter split point to avoid pathological splits */ |
1938 | return true; |
1939 | } |
1940 | |
1941 | static |
1942 | void restoreTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, |
1943 | u32 delay, const vector<NFAVertex> &preds) { |
1944 | assert(delay <= lit.length()); |
1945 | assert(isCorrectlyTopped(g)); |
1946 | DEBUG_PRINTF("adding on '%s' %u\n" , dumpString(lit).c_str(), delay); |
1947 | |
1948 | NFAVertex prev = g.accept; |
1949 | auto it = lit.rbegin(); |
1950 | while (delay--) { |
1951 | NFAVertex curr = add_vertex(g); |
1952 | assert(it != lit.rend()); |
1953 | g[curr].char_reach = *it; |
1954 | add_edge(curr, prev, g); |
1955 | ++it; |
1956 | prev = curr; |
1957 | } |
1958 | |
1959 | for (auto v : preds) { |
1960 | NFAEdge e = add_edge_if_not_present(v, prev, g); |
1961 | if (v == g.start && is_triggered(g)) { |
1962 | g[e].tops.insert(DEFAULT_TOP); |
1963 | } |
1964 | } |
1965 | |
1966 | // Every predecessor of accept must have a report. |
1967 | set_report(g, 0); |
1968 | |
1969 | renumber_vertices(g); |
1970 | renumber_edges(g); |
1971 | assert(allMatchStatesHaveReports(g)); |
1972 | assert(isCorrectlyTopped(g)); |
1973 | } |
1974 | |
1975 | static |
1976 | void restoreTrailingLiteralStates(NGHolder &g, |
1977 | const vector<pair<ue2_literal, u32>> &lits) { |
1978 | vector<NFAVertex> preds; |
1979 | insert(&preds, preds.end(), inv_adjacent_vertices(g.accept, g)); |
1980 | clear_in_edges(g.accept, g); |
1981 | |
1982 | for (auto v : preds) { |
1983 | g[v].reports.clear(); /* clear report from old accepts */ |
1984 | } |
1985 | |
1986 | for (const auto &p : lits) { |
1987 | const ue2_literal &lit = p.first; |
1988 | u32 delay = p.second; |
1989 | |
1990 | restoreTrailingLiteralStates(g, lit, delay, preds); |
1991 | } |
1992 | } |
1993 | |
1994 | static |
1995 | bool improvePrefix(NGHolder &h, RoseInGraph &vg, const vector<RoseInEdge> &ee, |
1996 | const CompileContext &cc) { |
1997 | DEBUG_PRINTF("trying to improve prefix %p, %zu verts\n" , &h, |
1998 | num_vertices(h)); |
1999 | assert(isCorrectlyTopped(h)); |
2000 | |
2001 | renumber_vertices(h); |
2002 | renumber_edges(h); |
2003 | |
2004 | auto depths = calcDepths(h); |
2005 | |
2006 | /* If the reason the prefix is not transient is due to a very long literal |
2007 | * following, we can make it transient by restricting ourselves to using |
2008 | * just the head of the literal. */ |
2009 | if (makeTransientFromLongLiteral(h, vg, ee, cc)) { |
2010 | return true; |
2011 | } |
2012 | |
2013 | auto split = findBestPrefixSplit(h, depths, vg, ee, false, cc); |
2014 | |
2015 | if (split && (split->creates_transient || split->creates_anchored) |
2016 | && splitRoseEdge(h, vg, ee, *split)) { |
2017 | DEBUG_PRINTF("split on simple literal\n" ); |
2018 | return true; |
2019 | } |
2020 | |
2021 | /* large back edges may prevent us identifing anchored or transient cases |
2022 | * properly - use a simple walk instead */ |
2023 | |
2024 | if (doNetflowCut(h, &depths, vg, ee, true, cc.grey)) { |
2025 | return true; |
2026 | } |
2027 | |
2028 | if (split && splitRoseEdge(h, vg, ee, *split)) { |
2029 | /* use the simple split even though it doesn't create a transient |
2030 | * prefix */ |
2031 | DEBUG_PRINTF("split on simple literal\n" ); |
2032 | return true; |
2033 | } |
2034 | |
2035 | /* look for netflow cuts which don't produce good prefixes */ |
2036 | if (doNetflowCut(h, &depths, vg, ee, false, cc.grey)) { |
2037 | return true; |
2038 | } |
2039 | |
2040 | if (ee.size() > 1) { |
2041 | DEBUG_PRINTF("split the prefix apart based on succ literals\n" ); |
2042 | unordered_map<shared_ptr<NGHolder>, vector<pair<RoseInEdge, u32> >, |
2043 | NGHolderHasher, NGHolderEqual> trimmed; |
2044 | |
2045 | for (const auto &e : ee) { |
2046 | shared_ptr<NGHolder> hh = cloneHolder(h); |
2047 | auto succ_lit = vg[target(e, vg)].s; |
2048 | assert(isCorrectlyTopped(*hh)); |
2049 | u32 delay = removeTrailingLiteralStates(*hh, succ_lit, |
2050 | succ_lit.length(), |
2051 | false /* can't overhang start */); |
2052 | if (!delay) { |
2053 | DEBUG_PRINTF("could not remove any literal, skip over\n" ); |
2054 | continue; |
2055 | } |
2056 | |
2057 | assert(isCorrectlyTopped(*hh)); |
2058 | trimmed[hh].emplace_back(e, delay); |
2059 | } |
2060 | |
2061 | if (trimmed.size() == 1) { |
2062 | return false; |
2063 | } |
2064 | |
2065 | /* shift the contents to a vector so we can modify the graphs without |
2066 | * violating the map's invariants. */ |
2067 | vector<pair<shared_ptr<NGHolder>, vector<pair<RoseInEdge, u32> > > > |
2068 | trimmed_vec(trimmed.begin(), trimmed.end()); |
2069 | trimmed.clear(); |
2070 | for (auto &elem : trimmed_vec) { |
2071 | shared_ptr<NGHolder> &hp = elem.first; |
2072 | vector<pair<ue2_literal, u32>> succ_lits; |
2073 | |
2074 | for (const auto &edge_delay : elem.second) { |
2075 | const RoseInEdge &e = edge_delay.first; |
2076 | u32 delay = edge_delay.second; |
2077 | auto lit = vg[target(e, vg)].s; |
2078 | |
2079 | vg[e].graph = hp; |
2080 | assert(delay <= lit.length()); |
2081 | succ_lits.emplace_back(lit, delay); |
2082 | } |
2083 | restoreTrailingLiteralStates(*hp, succ_lits); |
2084 | } |
2085 | return true; |
2086 | } |
2087 | |
2088 | return false; |
2089 | } |
2090 | |
2091 | #define MAX_FIND_BETTER_PREFIX_GEN 4 |
2092 | #define MAX_FIND_BETTER_PREFIX_COUNT 100 |
2093 | |
2094 | static |
2095 | void findBetterPrefixes(RoseInGraph &vg, const CompileContext &cc) { |
2096 | STAGE_DEBUG_PRINTF("FIND BETTER PREFIXES\n" ); |
2097 | RoseInVertex start = getStart(vg); |
2098 | |
2099 | insertion_ordered_map<NGHolder *, vector<RoseInEdge>> prefixes; |
2100 | bool changed; |
2101 | u32 gen = 0; |
2102 | do { |
2103 | DEBUG_PRINTF("gen %u\n" , gen); |
2104 | changed = false; |
2105 | prefixes.clear(); |
2106 | |
2107 | /* find prefixes */ |
2108 | for (const RoseInEdge &e : out_edges_range(start, vg)) { |
2109 | /* outfixes shouldn't have made it this far */ |
2110 | assert(vg[target(e, vg)].type == RIV_LITERAL); |
2111 | if (vg[e].graph) { |
2112 | NGHolder *h = vg[e].graph.get(); |
2113 | prefixes[h].push_back(e); |
2114 | } |
2115 | } |
2116 | |
2117 | if (prefixes.size() > MAX_FIND_BETTER_PREFIX_COUNT) { |
2118 | break; |
2119 | } |
2120 | |
2121 | /* look for bad prefixes and try to split */ |
2122 | for (const auto &m : prefixes) { |
2123 | NGHolder *h = m.first; |
2124 | const auto &edges = m.second; |
2125 | depth max_width = findMaxWidth(*h); |
2126 | if (willBeTransient(max_width, cc) |
2127 | || willBeAnchoredTable(max_width, cc.grey)) { |
2128 | continue; |
2129 | } |
2130 | |
2131 | changed = improvePrefix(*h, vg, edges, cc); |
2132 | } |
2133 | } while (changed && gen++ < MAX_FIND_BETTER_PREFIX_GEN); |
2134 | } |
2135 | |
2136 | #define STRONG_LITERAL_LENGTH 20 |
2137 | #define 10 |
2138 | |
2139 | static |
2140 | bool (NGHolder &h, RoseInGraph &vg, |
2141 | const vector<RoseInEdge> &ee, |
2142 | const CompileContext &cc) { |
2143 | DEBUG_PRINTF("looking for string literal\n" ); |
2144 | unique_ptr<VertLitInfo> split = findBestNormalSplit(h, vg, ee, cc); |
2145 | |
2146 | if (split && min_len(split->lit) >= STRONG_LITERAL_LENGTH) { |
2147 | DEBUG_PRINTF("splitting simple literal\n" ); |
2148 | return splitRoseEdge(h, vg, ee, *split); |
2149 | } |
2150 | |
2151 | return false; |
2152 | } |
2153 | |
2154 | static |
2155 | void (RoseInGraph &vg, const CompileContext &cc) { |
2156 | if (!cc.grey.violetExtractStrongLiterals) { |
2157 | return; |
2158 | } |
2159 | |
2160 | STAGE_DEBUG_PRINTF("EXTRACT STRONG LITERALS\n" ); |
2161 | |
2162 | unordered_set<NGHolder *> stuck; |
2163 | insertion_ordered_map<NGHolder *, vector<RoseInEdge>> edges_by_graph; |
2164 | bool changed; |
2165 | |
2166 | do { |
2167 | changed = false; |
2168 | |
2169 | edges_by_graph.clear(); |
2170 | for (const RoseInEdge &ve : edges_range(vg)) { |
2171 | if (vg[source(ve, vg)].type != RIV_LITERAL) { |
2172 | continue; |
2173 | } |
2174 | |
2175 | if (vg[ve].graph) { |
2176 | NGHolder *h = vg[ve].graph.get(); |
2177 | edges_by_graph[h].push_back(ve); |
2178 | } |
2179 | } |
2180 | |
2181 | if (edges_by_graph.size() > MAX_EXTRACT_STRONG_LITERAL_GRAPHS) { |
2182 | DEBUG_PRINTF("too many graphs, stopping\n" ); |
2183 | return; |
2184 | } |
2185 | |
2186 | for (const auto &m : edges_by_graph) { |
2187 | NGHolder *g = m.first; |
2188 | const auto &edges = m.second; |
2189 | if (contains(stuck, g)) { |
2190 | DEBUG_PRINTF("already known to be bad\n" ); |
2191 | continue; |
2192 | } |
2193 | bool rv = extractStrongLiteral(*g, vg, edges, cc); |
2194 | if (rv) { |
2195 | changed = true; |
2196 | } else { |
2197 | stuck.insert(g); |
2198 | } |
2199 | } |
2200 | } while (changed); |
2201 | } |
2202 | |
2203 | #define INFIX_STRONG_GUARD_LEN 8 |
2204 | #define INFIX_MIN_SPLIT_LITERAL_LEN 12 |
2205 | |
2206 | static |
2207 | bool improveInfix(NGHolder &h, RoseInGraph &vg, const vector<RoseInEdge> &ee, |
2208 | const CompileContext &cc) { |
2209 | unique_ptr<VertLitInfo> split = findBestNormalSplit(h, vg, ee, cc); |
2210 | |
2211 | if (split && min_len(split->lit) >= INFIX_MIN_SPLIT_LITERAL_LEN |
2212 | && splitRoseEdge(h, vg, ee, *split)) { |
2213 | DEBUG_PRINTF("splitting simple literal\n" ); |
2214 | return true; |
2215 | } |
2216 | |
2217 | DEBUG_PRINTF("trying for a netflow cut\n" ); |
2218 | /* look for netflow cuts which don't produce good prefixes */ |
2219 | bool rv = doNetflowCut(h, nullptr, vg, ee, false, cc.grey, 8); |
2220 | |
2221 | DEBUG_PRINTF("did netfow cut? = %d\n" , (int)rv); |
2222 | |
2223 | return rv; |
2224 | } |
2225 | |
2226 | /** |
2227 | * Infixes which are weakly guarded can, in effect, act like prefixes as they |
2228 | * will often be live. We should try to split these infixes further if they |
2229 | * contain strong literals so that we are at least running smaller weak infixes |
2230 | * which can hopeful be accelerated/miracled. |
2231 | */ |
2232 | static |
2233 | void improveWeakInfixes(RoseInGraph &vg, const CompileContext &cc) { |
2234 | if (!cc.grey.violetAvoidWeakInfixes) { |
2235 | return; |
2236 | } |
2237 | STAGE_DEBUG_PRINTF("IMPROVE WEAK INFIXES\n" ); |
2238 | |
2239 | RoseInVertex start = getStart(vg); |
2240 | |
2241 | unordered_set<NGHolder *> weak; |
2242 | |
2243 | for (RoseInVertex vv : adjacent_vertices_range(start, vg)) { |
2244 | /* outfixes shouldn't have made it this far */ |
2245 | assert(vg[vv].type == RIV_LITERAL); |
2246 | if (vg[vv].s.length() >= INFIX_STRONG_GUARD_LEN) { |
2247 | continue; |
2248 | } |
2249 | |
2250 | for (const RoseInEdge &e : out_edges_range(vv, vg)) { |
2251 | if (vg[target(e, vg)].type != RIV_LITERAL || !vg[e].graph) { |
2252 | continue; |
2253 | } |
2254 | |
2255 | NGHolder *h = vg[e].graph.get(); |
2256 | DEBUG_PRINTF("'%s' guards %p\n" , dumpString(vg[vv].s).c_str(), h); |
2257 | weak.insert(h); |
2258 | } |
2259 | } |
2260 | |
2261 | insertion_ordered_map<NGHolder *, vector<RoseInEdge>> weak_edges; |
2262 | for (const RoseInEdge &ve : edges_range(vg)) { |
2263 | NGHolder *h = vg[ve].graph.get(); |
2264 | if (contains(weak, h)) { |
2265 | weak_edges[h].push_back(ve); |
2266 | } |
2267 | } |
2268 | |
2269 | for (const auto &m : weak_edges) { |
2270 | NGHolder *h = m.first; |
2271 | const auto &edges = m.second; |
2272 | improveInfix(*h, vg, edges, cc); |
2273 | } |
2274 | } |
2275 | |
2276 | static |
2277 | void splitEdgesForSuffix(const NGHolder &base_graph, RoseInGraph &vg, |
2278 | const vector<RoseInEdge> &ee, const VertLitInfo &split, |
2279 | bool eod, const flat_set<ReportID> &reports) { |
2280 | const vector<NFAVertex> &splitters = split.vv; |
2281 | assert(!splitters.empty()); |
2282 | |
2283 | shared_ptr<NGHolder> lhs = make_shared<NGHolder>(); |
2284 | unordered_map<NFAVertex, NFAVertex> v_map; |
2285 | cloneHolder(*lhs, base_graph, &v_map); |
2286 | lhs->kind = NFA_INFIX; |
2287 | clear_in_edges(lhs->accept, *lhs); |
2288 | clear_in_edges(lhs->acceptEod, *lhs); |
2289 | add_edge(lhs->accept, lhs->acceptEod, *lhs); |
2290 | clearReports(*lhs); |
2291 | for (NFAVertex v : splitters) { |
2292 | NFAEdge e = add_edge(v_map[v], lhs->accept, *lhs); |
2293 | if (v == base_graph.start) { |
2294 | (*lhs)[e].tops.insert(DEFAULT_TOP); |
2295 | } |
2296 | (*lhs)[v_map[v]].reports.insert(0); |
2297 | |
2298 | } |
2299 | pruneUseless(*lhs); |
2300 | assert(isCorrectlyTopped(*lhs)); |
2301 | |
2302 | /* create literal vertices and connect preds */ |
2303 | for (const auto &lit : split.lit) { |
2304 | if (!can_match(*lhs, lit, is_triggered(*lhs))) { |
2305 | continue; |
2306 | } |
2307 | |
2308 | DEBUG_PRINTF("best is '%s'\n" , escapeString(lit).c_str()); |
2309 | RoseInVertex v = add_vertex(RoseInVertexProps::makeLiteral(lit), vg); |
2310 | |
2311 | RoseInVertex tt; |
2312 | if (eod) { |
2313 | DEBUG_PRINTF("doing eod\n" ); |
2314 | tt = add_vertex(RoseInVertexProps::makeAcceptEod(reports), vg); |
2315 | } else { |
2316 | DEBUG_PRINTF("doing non-eod\n" ); |
2317 | tt = add_vertex(RoseInVertexProps::makeAccept(reports), vg); |
2318 | } |
2319 | add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg); |
2320 | |
2321 | for (const RoseInEdge &e : ee) { |
2322 | RoseInVertex u = source(e, vg); |
2323 | assert(!edge(u, v, vg).second); |
2324 | add_edge(u, v, RoseInEdgeProps(lhs, 0U), vg); |
2325 | } |
2326 | } |
2327 | } |
2328 | |
2329 | #define MIN_SUFFIX_LEN 6 |
2330 | |
2331 | static |
2332 | bool replaceSuffixWithInfix(const NGHolder &h, RoseInGraph &vg, |
2333 | const vector<RoseInEdge> &suffix_edges, |
2334 | const CompileContext &cc) { |
2335 | DEBUG_PRINTF("inspecting suffix : %p on %zu edges\n" , &h, |
2336 | suffix_edges.size()); |
2337 | /* |
2338 | * We would, in general, rather not have output exposed engines because |
2339 | * once they are triggered, they must be run while infixes only have to run |
2340 | * if the successor literal is seen. Matches from output exposed engines |
2341 | * also have to be placed in a priority queue and interleaved with matches |
2342 | * from other sources. |
2343 | * |
2344 | * Note: |
2345 | * - if the LHS is extremely unlikely we may be better off leaving |
2346 | * a suffix unguarded. |
2347 | * |
2348 | * - limited width suffixes may be less bad as they won't be continuously |
2349 | * active, we may want to have (a) stronger controls on if we want to pick |
2350 | * a trailing literal in these cases and/or (b) look also for literals |
2351 | * near accept as well as right on accept |
2352 | * |
2353 | * TODO: improve heuristics, splitting logic. |
2354 | */ |
2355 | |
2356 | /* we may do multiple splits corresponding to different report behaviour */ |
2357 | set<NFAVertex> seen; |
2358 | map<pair<bool, flat_set<ReportID> >, VertLitInfo> by_reports; /* eod, rep */ |
2359 | |
2360 | for (NFAVertex v : inv_adjacent_vertices_range(h.accept, h)) { |
2361 | set<ue2_literal> ss = getLiteralSet(h, v, false); |
2362 | if (ss.empty()) { |
2363 | DEBUG_PRINTF("candidate is too shitty\n" ); |
2364 | return false; |
2365 | } |
2366 | |
2367 | VertLitInfo &vli = by_reports[make_pair(false, h[v].reports)]; |
2368 | insert(&vli.lit, ss); |
2369 | vli.vv.push_back(v); |
2370 | seen.insert(v); |
2371 | } |
2372 | |
2373 | seen.insert(h.accept); |
2374 | for (NFAVertex v : inv_adjacent_vertices_range(h.acceptEod, h)) { |
2375 | if (contains(seen, v)) { |
2376 | continue; |
2377 | } |
2378 | |
2379 | set<ue2_literal> ss = getLiteralSet(h, v, false); |
2380 | if (ss.empty()) { |
2381 | DEBUG_PRINTF("candidate is too shitty\n" ); |
2382 | return false; |
2383 | } |
2384 | |
2385 | VertLitInfo &vli = by_reports[make_pair(true, h[v].reports)]; |
2386 | insert(&vli.lit, ss); |
2387 | vli.vv.push_back(v); |
2388 | } |
2389 | |
2390 | assert(!by_reports.empty()); |
2391 | |
2392 | /* TODO: how strong a min len do we want here ? */ |
2393 | u32 min_len = cc.grey.minRoseLiteralLength; |
2394 | ENSURE_AT_LEAST(&min_len, MIN_SUFFIX_LEN); |
2395 | |
2396 | for (auto &vli : by_reports | map_values) { |
2397 | u64a score = sanitizeAndCompressAndScore(vli.lit); |
2398 | |
2399 | if (vli.lit.empty() |
2400 | || !validateRoseLiteralSetQuality(vli.lit, score, false, min_len, |
2401 | false, false)) { |
2402 | return false; |
2403 | } |
2404 | } |
2405 | |
2406 | for (const auto &info : by_reports) { |
2407 | DEBUG_PRINTF("splitting on simple literals\n" ); |
2408 | splitEdgesForSuffix(h, vg, suffix_edges, info.second, |
2409 | info.first.first /* eod */, |
2410 | info.first.second /* reports */); |
2411 | } |
2412 | |
2413 | for (const RoseInEdge &e : suffix_edges) { |
2414 | remove_edge(e, vg); |
2415 | } |
2416 | return true; |
2417 | } |
2418 | |
2419 | static |
2420 | void avoidSuffixes(RoseInGraph &vg, const CompileContext &cc) { |
2421 | if (!cc.grey.violetAvoidSuffixes) { |
2422 | return; |
2423 | } |
2424 | |
2425 | STAGE_DEBUG_PRINTF("AVOID SUFFIXES\n" ); |
2426 | |
2427 | RoseInVertex accept = getPrimaryAccept(vg); |
2428 | |
2429 | insertion_ordered_map<const NGHolder *, vector<RoseInEdge>> suffixes; |
2430 | |
2431 | /* find suffixes */ |
2432 | for (const RoseInEdge &e : in_edges_range(accept, vg)) { |
2433 | /* outfixes shouldn't have made it this far */ |
2434 | assert(vg[source(e, vg)].type == RIV_LITERAL); |
2435 | assert(vg[e].graph); /* non suffix paths should be wired to other |
2436 | accepts */ |
2437 | const NGHolder *h = vg[e].graph.get(); |
2438 | suffixes[h].push_back(e); |
2439 | } |
2440 | |
2441 | /* look at suffixes and try to split */ |
2442 | for (const auto &m : suffixes) { |
2443 | const NGHolder *h = m.first; |
2444 | const auto &edges = m.second; |
2445 | replaceSuffixWithInfix(*h, vg, edges, cc); |
2446 | } |
2447 | } |
2448 | |
2449 | static |
2450 | bool leadingDotStartLiteral(const NGHolder &h, VertLitInfo *out) { |
2451 | if (out_degree(h.start, h) != 3) { |
2452 | return false; |
2453 | } |
2454 | |
2455 | NFAVertex v = NGHolder::null_vertex(); |
2456 | NFAVertex ds = NGHolder::null_vertex(); |
2457 | |
2458 | for (NFAVertex a : adjacent_vertices_range(h.start, h)) { |
2459 | if (a == h.startDs) { |
2460 | continue; |
2461 | } |
2462 | if (h[a].char_reach.all()) { |
2463 | ds = a; |
2464 | if (out_degree(ds, h) != 2 || !edge(ds, ds, h).second) { |
2465 | return false; |
2466 | } |
2467 | } else { |
2468 | v = a; |
2469 | } |
2470 | } |
2471 | |
2472 | if (!v || !ds || !edge(ds, v, h).second) { |
2473 | return false; |
2474 | } |
2475 | |
2476 | if (h[v].char_reach.count() != 1 && !h[v].char_reach.isCaselessChar()) { |
2477 | return false; |
2478 | } |
2479 | |
2480 | ue2_literal lit; |
2481 | lit.push_back(h[v].char_reach.find_first(), |
2482 | h[v].char_reach.isCaselessChar()); |
2483 | while (out_degree(v, h) == 1) { |
2484 | NFAVertex vv = *adjacent_vertices(v, h).first; |
2485 | if (h[vv].char_reach.count() != 1 |
2486 | && !h[vv].char_reach.isCaselessChar()) { |
2487 | break; |
2488 | } |
2489 | |
2490 | v = vv; |
2491 | |
2492 | lit.push_back(h[v].char_reach.find_first(), |
2493 | h[v].char_reach.isCaselessChar()); |
2494 | } |
2495 | |
2496 | if (is_match_vertex(v, h) && h.kind != NFA_SUFFIX) { |
2497 | /* we have rediscovered the post-infix literal */ |
2498 | return false; |
2499 | } |
2500 | |
2501 | if (bad_mixed_sensitivity(lit)) { |
2502 | make_nocase(&lit); |
2503 | } |
2504 | |
2505 | DEBUG_PRINTF("%zu found %s\n" , h[v].index, dumpString(lit).c_str()); |
2506 | out->vv = {v}; |
2507 | out->lit = {lit}; |
2508 | return true; |
2509 | } |
2510 | |
2511 | static |
2512 | bool lookForDoubleCut(const NGHolder &h, const vector<RoseInEdge> &ee, |
2513 | RoseInGraph &vg, const Grey &grey) { |
2514 | VertLitInfo info; |
2515 | if (!leadingDotStartLiteral(h, &info) |
2516 | || min_len(info.lit) < grey.violetDoubleCutLiteralLen) { |
2517 | return false; |
2518 | } |
2519 | DEBUG_PRINTF("performing split\n" ); |
2520 | return splitRoseEdge(h, vg, ee, {info}); |
2521 | } |
2522 | |
2523 | static |
2524 | void lookForDoubleCut(RoseInGraph &vg, const CompileContext &cc) { |
2525 | if (!cc.grey.violetDoubleCut) { |
2526 | return; |
2527 | } |
2528 | |
2529 | insertion_ordered_map<const NGHolder *, vector<RoseInEdge>> right_edges; |
2530 | for (const RoseInEdge &ve : edges_range(vg)) { |
2531 | if (vg[ve].graph && vg[source(ve, vg)].type == RIV_LITERAL) { |
2532 | const NGHolder *h = vg[ve].graph.get(); |
2533 | right_edges[h].push_back(ve); |
2534 | } |
2535 | } |
2536 | |
2537 | for (const auto &m : right_edges) { |
2538 | const NGHolder *h = m.first; |
2539 | const auto &edges = m.second; |
2540 | lookForDoubleCut(*h, edges, vg, cc.grey); |
2541 | } |
2542 | } |
2543 | |
2544 | static |
2545 | pair<NFAVertex, ue2_literal> findLiteralBefore(const NGHolder &h, NFAVertex v) { |
2546 | ue2_literal lit; |
2547 | if (h[v].char_reach.count() != 1 && !h[v].char_reach.isCaselessChar()) { |
2548 | return {v, std::move(lit) }; |
2549 | } |
2550 | lit.push_back(h[v].char_reach.find_first(), |
2551 | h[v].char_reach.isCaselessChar()); |
2552 | |
2553 | while (in_degree(v, h) == 1) { |
2554 | NFAVertex vv = *inv_adjacent_vertices(v, h).first; |
2555 | if (h[vv].char_reach.count() != 1 |
2556 | && !h[vv].char_reach.isCaselessChar()) { |
2557 | break; |
2558 | } |
2559 | |
2560 | lit.push_back(h[vv].char_reach.find_first(), |
2561 | h[vv].char_reach.isCaselessChar()); |
2562 | v = vv; |
2563 | } |
2564 | |
2565 | return {v, std::move(lit) }; |
2566 | } |
2567 | |
2568 | static |
2569 | bool lookForDotStarPred(NFAVertex v, const NGHolder &h, |
2570 | NFAVertex *u, NFAVertex *ds) { |
2571 | *u = NGHolder::null_vertex(); |
2572 | *ds = NGHolder::null_vertex(); |
2573 | for (NFAVertex a : inv_adjacent_vertices_range(v, h)) { |
2574 | if (h[a].char_reach.all()) { |
2575 | if (!edge(a, a, h).second) { |
2576 | return false; |
2577 | } |
2578 | |
2579 | if (*ds) { |
2580 | return false; |
2581 | } |
2582 | |
2583 | *ds = a; |
2584 | } else { |
2585 | if (*u) { |
2586 | return false; |
2587 | } |
2588 | *u = a; |
2589 | } |
2590 | } |
2591 | |
2592 | if (!*u || !*ds) { |
2593 | return false; |
2594 | } |
2595 | |
2596 | return true; |
2597 | } |
2598 | |
2599 | static |
2600 | bool trailingDotStarLiteral(const NGHolder &h, VertLitInfo *out) { |
2601 | /* Note: there is no delay yet - so the final literal is the already |
2602 | * discovered successor literal - we are in fact interested in the literal |
2603 | * before it. */ |
2604 | |
2605 | if (in_degree(h.accept, h) != 1) { |
2606 | return false; |
2607 | } |
2608 | |
2609 | if (in_degree(h.acceptEod, h) != 1) { |
2610 | assert(0); |
2611 | return false; |
2612 | } |
2613 | |
2614 | NFAVertex v |
2615 | = findLiteralBefore(h, *inv_adjacent_vertices(h.accept, h).first).first; |
2616 | |
2617 | NFAVertex u; |
2618 | NFAVertex ds; |
2619 | |
2620 | if (!lookForDotStarPred(v, h, &u, &ds)) { |
2621 | return false; |
2622 | } |
2623 | |
2624 | v = u; |
2625 | auto rv = findLiteralBefore(h, v); |
2626 | |
2627 | if (!lookForDotStarPred(v, h, &u, &ds)) { |
2628 | return false; |
2629 | } |
2630 | |
2631 | ue2_literal lit = reverse_literal(rv.second); |
2632 | DEBUG_PRINTF("%zu found %s\n" , h[v].index, dumpString(lit).c_str()); |
2633 | |
2634 | if (bad_mixed_sensitivity(lit)) { |
2635 | make_nocase(&lit); |
2636 | } |
2637 | |
2638 | out->vv = {v}; |
2639 | out->lit = {lit}; |
2640 | return true; |
2641 | } |
2642 | |
2643 | static |
2644 | bool lookForTrailingLiteralDotStar(const NGHolder &h, |
2645 | const vector<RoseInEdge> &ee, |
2646 | RoseInGraph &vg, const Grey &grey) { |
2647 | VertLitInfo info; |
2648 | if (!trailingDotStarLiteral(h, &info) |
2649 | || min_len(info.lit) < grey.violetDoubleCutLiteralLen) { |
2650 | return false; |
2651 | } |
2652 | DEBUG_PRINTF("performing split\n" ); |
2653 | return splitRoseEdge(h, vg, ee, info); |
2654 | } |
2655 | |
2656 | /* In streaming mode, active engines have to be caught up at stream boundaries |
2657 | * and have to be stored in stream state, so we prefer to decompose patterns |
2658 | * in to literals with no state between them if possible. */ |
2659 | static |
2660 | void decomposeLiteralChains(RoseInGraph &vg, const CompileContext &cc) { |
2661 | if (!cc.grey.violetLiteralChains) { |
2662 | return; |
2663 | } |
2664 | |
2665 | insertion_ordered_map<const NGHolder *, vector<RoseInEdge>> right_edges; |
2666 | bool changed; |
2667 | do { |
2668 | changed = false; |
2669 | |
2670 | right_edges.clear(); |
2671 | for (const RoseInEdge &ve : edges_range(vg)) { |
2672 | if (vg[ve].graph && vg[source(ve, vg)].type == RIV_LITERAL) { |
2673 | const NGHolder *h = vg[ve].graph.get(); |
2674 | right_edges[h].push_back(ve); |
2675 | } |
2676 | } |
2677 | |
2678 | for (const auto &m : right_edges) { |
2679 | const NGHolder *h = m.first; |
2680 | const vector<RoseInEdge> &ee = m.second; |
2681 | bool rv = lookForDoubleCut(*h, ee, vg, cc.grey); |
2682 | if (!rv && h->kind != NFA_SUFFIX) { |
2683 | rv = lookForTrailingLiteralDotStar(*h, ee, vg, cc.grey); |
2684 | } |
2685 | changed |= rv; |
2686 | } |
2687 | } while (changed); |
2688 | } |
2689 | |
2690 | static |
2691 | bool lookForCleanSplit(const NGHolder &h, const vector<RoseInEdge> &ee, |
2692 | RoseInGraph &vg, const CompileContext &cc) { |
2693 | unique_ptr<VertLitInfo> split = findBestCleanSplit(h, cc); |
2694 | |
2695 | if (split) { |
2696 | return splitRoseEdge(h, vg, {ee}, *split); |
2697 | } |
2698 | |
2699 | return false; |
2700 | } |
2701 | |
2702 | #define MAX_DESIRED_CLEAN_SPLIT_DEPTH 4 |
2703 | |
2704 | static |
2705 | void lookForCleanEarlySplits(RoseInGraph &vg, const CompileContext &cc) { |
2706 | u32 gen = 0; |
2707 | |
2708 | insertion_ordered_set<RoseInVertex> prev({getStart(vg)}); |
2709 | insertion_ordered_set<RoseInVertex> curr; |
2710 | |
2711 | while (gen < MAX_DESIRED_CLEAN_SPLIT_DEPTH) { |
2712 | curr.clear(); |
2713 | for (RoseInVertex u : prev) { |
2714 | for (auto v : adjacent_vertices_range(u, vg)) { |
2715 | curr.insert(v); |
2716 | } |
2717 | } |
2718 | |
2719 | insertion_ordered_map<const NGHolder *, vector<RoseInEdge>> rightfixes; |
2720 | for (RoseInVertex v : curr) { |
2721 | for (const RoseInEdge &e : out_edges_range(v, vg)) { |
2722 | if (vg[e].graph) { |
2723 | NGHolder *h = vg[e].graph.get(); |
2724 | rightfixes[h].push_back(e); |
2725 | } |
2726 | } |
2727 | } |
2728 | |
2729 | for (const auto &m : rightfixes) { |
2730 | const NGHolder *h = m.first; |
2731 | const auto &edges = m.second; |
2732 | lookForCleanSplit(*h, edges, vg, cc); |
2733 | } |
2734 | |
2735 | prev = std::move(curr); |
2736 | gen++; |
2737 | } |
2738 | } |
2739 | |
2740 | static |
2741 | void rehomeEodSuffixes(RoseInGraph &vg) { |
2742 | // Find edges to accept with EOD-anchored graphs that we can move over to |
2743 | // acceptEod. |
2744 | vector<RoseInEdge> acc_edges; |
2745 | for (const auto &e : edges_range(vg)) { |
2746 | if (vg[target(e, vg)].type != RIV_ACCEPT) { |
2747 | continue; |
2748 | } |
2749 | if (vg[e].haig || !vg[e].graph) { |
2750 | continue; |
2751 | } |
2752 | |
2753 | const NGHolder &h = *vg[e].graph; |
2754 | |
2755 | if (in_degree(h.accept, h)) { |
2756 | DEBUG_PRINTF("graph isn't eod anchored\n" ); |
2757 | continue; |
2758 | } |
2759 | |
2760 | acc_edges.push_back(e); |
2761 | } |
2762 | |
2763 | for (const RoseInEdge &e : acc_edges) { |
2764 | // Move this edge from accept to acceptEod |
2765 | RoseInVertex w = add_vertex(RoseInVertexProps::makeAcceptEod(), vg); |
2766 | add_edge(source(e, vg), w, vg[e], vg); |
2767 | remove_edge(e, vg); |
2768 | } |
2769 | |
2770 | /* old accept vertices will be tidied up by final pruneUseless() call */ |
2771 | } |
2772 | |
2773 | static |
2774 | bool tryForEarlyDfa(const NGHolder &h, const CompileContext &cc) { |
2775 | switch (h.kind) { |
2776 | case NFA_OUTFIX: /* 'prefix' of eod */ |
2777 | case NFA_PREFIX: |
2778 | return cc.grey.earlyMcClellanPrefix; |
2779 | case NFA_INFIX: |
2780 | return cc.grey.earlyMcClellanInfix; |
2781 | case NFA_SUFFIX: |
2782 | return cc.grey.earlyMcClellanSuffix; |
2783 | default: |
2784 | DEBUG_PRINTF("kind %u\n" , (u32)h.kind); |
2785 | assert(0); |
2786 | return false; |
2787 | } |
2788 | } |
2789 | |
2790 | static |
2791 | vector<vector<CharReach>> getDfaTriggers(RoseInGraph &vg, |
2792 | const vector<RoseInEdge> &edges, |
2793 | bool *single_trigger) { |
2794 | vector<vector<CharReach>> triggers; |
2795 | u32 min_offset = ~0U; |
2796 | u32 max_offset = 0; |
2797 | for (const auto &e : edges) { |
2798 | RoseInVertex s = source(e, vg); |
2799 | if (vg[s].type == RIV_LITERAL) { |
2800 | triggers.push_back(as_cr_seq(vg[s].s)); |
2801 | } |
2802 | ENSURE_AT_LEAST(&max_offset, vg[s].max_offset); |
2803 | LIMIT_TO_AT_MOST(&min_offset, vg[s].min_offset); |
2804 | } |
2805 | |
2806 | *single_trigger = min_offset == max_offset; |
2807 | DEBUG_PRINTF("trigger offset (%u, %u)\n" , min_offset, max_offset); |
2808 | |
2809 | return triggers; |
2810 | } |
2811 | |
2812 | static |
2813 | bool doEarlyDfa(RoseBuild &rose, RoseInGraph &vg, NGHolder &h, |
2814 | const vector<RoseInEdge> &edges, bool final_chance, |
2815 | const ReportManager &rm, const CompileContext &cc) { |
2816 | DEBUG_PRINTF("trying for dfa\n" ); |
2817 | |
2818 | bool single_trigger; |
2819 | for (const auto &e : edges) { |
2820 | if (vg[target(e, vg)].type == RIV_ACCEPT_EOD) { |
2821 | /* TODO: support eod prefixes */ |
2822 | return false; |
2823 | } |
2824 | } |
2825 | |
2826 | auto triggers = getDfaTriggers(vg, edges, &single_trigger); |
2827 | |
2828 | /* TODO: literal delay things */ |
2829 | if (!generates_callbacks(h)) { |
2830 | set_report(h, rose.getNewNfaReport()); |
2831 | } |
2832 | |
2833 | shared_ptr<raw_dfa> dfa = buildMcClellan(h, &rm, single_trigger, triggers, |
2834 | cc.grey, final_chance); |
2835 | |
2836 | if (!dfa) { |
2837 | return false; |
2838 | } |
2839 | |
2840 | DEBUG_PRINTF("dfa ok\n" ); |
2841 | for (const auto &e : edges) { |
2842 | vg[e].dfa = dfa; |
2843 | } |
2844 | |
2845 | return true; |
2846 | } |
2847 | |
2848 | #define MAX_EDGES_FOR_IMPLEMENTABILITY 50 |
2849 | |
2850 | static |
2851 | bool splitForImplementability(RoseInGraph &vg, NGHolder &h, |
2852 | const vector<RoseInEdge> &edges, |
2853 | const CompileContext &cc) { |
2854 | vector<pair<ue2_literal, u32>> succ_lits; |
2855 | DEBUG_PRINTF("trying to split %s with %zu vertices on %zu edges\n" , |
2856 | to_string(h.kind).c_str(), num_vertices(h), edges.size()); |
2857 | |
2858 | if (edges.size() > MAX_EDGES_FOR_IMPLEMENTABILITY) { |
2859 | return false; |
2860 | } |
2861 | |
2862 | if (!generates_callbacks(h)) { |
2863 | for (const auto &e : edges) { |
2864 | const auto &lit = vg[target(e, vg)].s; |
2865 | u32 delay = vg[e].graph_lag; |
2866 | vg[e].graph_lag = 0; |
2867 | |
2868 | assert(delay <= lit.length()); |
2869 | succ_lits.emplace_back(lit, delay); |
2870 | } |
2871 | restoreTrailingLiteralStates(h, succ_lits); |
2872 | } |
2873 | |
2874 | unique_ptr<VertLitInfo> split; |
2875 | bool last_chance = true; |
2876 | if (h.kind == NFA_PREFIX) { |
2877 | auto depths = calcDepths(h); |
2878 | |
2879 | split = findBestPrefixSplit(h, depths, vg, edges, last_chance, cc); |
2880 | } else { |
2881 | split = findBestLastChanceSplit(h, vg, edges, cc); |
2882 | } |
2883 | |
2884 | if (split && splitRoseEdge(h, vg, edges, *split)) { |
2885 | DEBUG_PRINTF("split on simple literal\n" ); |
2886 | return true; |
2887 | } |
2888 | |
2889 | DEBUG_PRINTF("trying to netflow\n" ); |
2890 | bool rv = doNetflowCut(h, nullptr, vg, edges, false, cc.grey); |
2891 | DEBUG_PRINTF("done\n" ); |
2892 | |
2893 | return rv; |
2894 | } |
2895 | |
2896 | #define MAX_IMPLEMENTABLE_SPLITS 50 |
2897 | |
2898 | bool ensureImplementable(RoseBuild &rose, RoseInGraph &vg, bool allow_changes, |
2899 | bool final_chance, const ReportManager &rm, |
2900 | const CompileContext &cc) { |
2901 | DEBUG_PRINTF("checking for impl %d\n" , final_chance); |
2902 | bool changed = false; |
2903 | bool need_to_recalc = false; |
2904 | u32 added_count = 0; |
2905 | unordered_set<shared_ptr<NGHolder>> good; /* known to be implementable */ |
2906 | do { |
2907 | changed = false; |
2908 | DEBUG_PRINTF("added %u\n" , added_count); |
2909 | insertion_ordered_map<shared_ptr<NGHolder>, |
2910 | vector<RoseInEdge>> edges_by_graph; |
2911 | for (const RoseInEdge &ve : edges_range(vg)) { |
2912 | if (vg[ve].graph && !vg[ve].dfa) { |
2913 | auto &h = vg[ve].graph; |
2914 | edges_by_graph[h].push_back(ve); |
2915 | } |
2916 | } |
2917 | for (auto &m : edges_by_graph) { |
2918 | auto &h = m.first; |
2919 | if (contains(good, h)) { |
2920 | continue; |
2921 | } |
2922 | reduceGraphEquivalences(*h, cc); |
2923 | if (isImplementableNFA(*h, &rm, cc)) { |
2924 | good.insert(h); |
2925 | continue; |
2926 | } |
2927 | |
2928 | const auto &edges = m.second; |
2929 | |
2930 | if (tryForEarlyDfa(*h, cc) && |
2931 | doEarlyDfa(rose, vg, *h, edges, final_chance, rm, cc)) { |
2932 | continue; |
2933 | } |
2934 | |
2935 | DEBUG_PRINTF("eek\n" ); |
2936 | if (!allow_changes) { |
2937 | return false; |
2938 | } |
2939 | |
2940 | if (splitForImplementability(vg, *h, edges, cc)) { |
2941 | added_count++; |
2942 | if (added_count > MAX_IMPLEMENTABLE_SPLITS) { |
2943 | DEBUG_PRINTF("added_count hit limit\n" ); |
2944 | return false; |
2945 | } |
2946 | changed = true; |
2947 | continue; |
2948 | } |
2949 | |
2950 | return false; |
2951 | } |
2952 | |
2953 | assert(added_count <= MAX_IMPLEMENTABLE_SPLITS); |
2954 | |
2955 | if (changed) { |
2956 | removeRedundantLiterals(vg, cc); |
2957 | pruneUseless(vg); |
2958 | need_to_recalc = true; |
2959 | } |
2960 | } while (changed); |
2961 | |
2962 | if (need_to_recalc) { |
2963 | renumber_vertices(vg); |
2964 | calcVertexOffsets(vg); |
2965 | } |
2966 | |
2967 | DEBUG_PRINTF("ok!\n" ); |
2968 | return true; |
2969 | } |
2970 | |
2971 | static |
2972 | RoseInGraph doInitialVioletTransform(const NGHolder &h, bool last_chance, |
2973 | const CompileContext &cc) { |
2974 | assert(!can_never_match(h)); |
2975 | |
2976 | RoseInGraph vg = populateTrivialGraph(h); |
2977 | |
2978 | if (!cc.grey.allowViolet) { |
2979 | return vg; |
2980 | } |
2981 | |
2982 | /* Avoid running the Violet analysis at all on graphs with no vertices with |
2983 | * small reach, since we will not be able to extract any literals. */ |
2984 | if (!hasNarrowReachVertex(h)) { |
2985 | DEBUG_PRINTF("fail, no vertices with small reach\n" ); |
2986 | return vg; |
2987 | } |
2988 | |
2989 | DEBUG_PRINTF("hello world\n" ); |
2990 | |
2991 | /* Step 1: avoid outfixes as we always have to run them. */ |
2992 | avoidOutfixes(vg, last_chance, cc); |
2993 | |
2994 | if (num_vertices(vg) <= 2) { |
2995 | return vg; /* unable to transform pattern */ |
2996 | } |
2997 | |
2998 | removeRedundantPrefixes(vg); |
2999 | dumpPreRoseGraph(vg, cc.grey, "pre_prefix_rose.dot" ); |
3000 | |
3001 | /* Step 2: avoid non-transient prefixes (esp in streaming mode) */ |
3002 | findBetterPrefixes(vg, cc); |
3003 | |
3004 | dumpPreRoseGraph(vg, cc.grey, "post_prefix_rose.dot" ); |
3005 | |
3006 | extractStrongLiterals(vg, cc); |
3007 | dumpPreRoseGraph(vg, cc.grey, "post_extract_rose.dot" ); |
3008 | improveWeakInfixes(vg, cc); |
3009 | dumpPreRoseGraph(vg, cc.grey, "post_infix_rose.dot" ); |
3010 | |
3011 | /* Step 3: avoid output exposed engines if there is a strong trailing |
3012 | literal) */ |
3013 | avoidSuffixes(vg, cc); |
3014 | |
3015 | /* Step 4: look for infixes/suffixes with leading .*literals |
3016 | * This can reduce the amount of work a heavily picked literal has to do and |
3017 | * reduce the amount of state used as .* is handled internally to rose. */ |
3018 | lookForDoubleCut(vg, cc); |
3019 | |
3020 | if (cc.streaming) { |
3021 | lookForCleanEarlySplits(vg, cc); |
3022 | decomposeLiteralChains(vg, cc); |
3023 | } |
3024 | |
3025 | rehomeEodSuffixes(vg); |
3026 | removeRedundantLiterals(vg, cc); |
3027 | |
3028 | pruneUseless(vg); |
3029 | dumpPreRoseGraph(vg, cc.grey); |
3030 | renumber_vertices(vg); |
3031 | calcVertexOffsets(vg); |
3032 | |
3033 | return vg; |
3034 | } |
3035 | |
3036 | bool doViolet(RoseBuild &rose, const NGHolder &h, bool prefilter, |
3037 | bool last_chance, const ReportManager &rm, |
3038 | const CompileContext &cc) { |
3039 | auto vg = doInitialVioletTransform(h, last_chance, cc); |
3040 | if (num_vertices(vg) <= 2) { |
3041 | return false; |
3042 | } |
3043 | |
3044 | /* Step 5: avoid unimplementable, or overly large engines if possible */ |
3045 | if (!ensureImplementable(rose, vg, last_chance, last_chance, rm, cc)) { |
3046 | return false; |
3047 | } |
3048 | dumpPreRoseGraph(vg, cc.grey, "post_ensure_rose.dot" ); |
3049 | |
3050 | /* Step 6: send to rose */ |
3051 | bool rv = rose.addRose(vg, prefilter); |
3052 | DEBUG_PRINTF("violet: %s\n" , rv ? "success" : "fail" ); |
3053 | return rv; |
3054 | } |
3055 | |
3056 | bool checkViolet(const ReportManager &rm, const NGHolder &h, bool prefilter, |
3057 | const CompileContext &cc) { |
3058 | auto vg = doInitialVioletTransform(h, true, cc); |
3059 | if (num_vertices(vg) <= 2) { |
3060 | return false; |
3061 | } |
3062 | |
3063 | bool rv = roseCheckRose(vg, prefilter, rm, cc); |
3064 | DEBUG_PRINTF("violet: %s\n" , rv ? "success" : "fail" ); |
3065 | return rv; |
3066 | } |
3067 | |
3068 | } |
3069 | |