1/*
2 * Copyright (c) 2015-2018, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** \file
30 * \brief Bounded repeat analysis.
31 */
32#include "ng_repeat.h"
33
34#include "grey.h"
35#include "ng_depth.h"
36#include "ng_holder.h"
37#include "ng_limex_accel.h"
38#include "ng_prune.h"
39#include "ng_reports.h"
40#include "ng_som_util.h"
41#include "ng_util.h"
42#include "nfa/accel.h"
43#include "nfa/limex_limits.h"
44#include "nfa/repeat_internal.h"
45#include "nfa/repeatcompile.h"
46#include "util/container.h"
47#include "util/dump_charclass.h"
48#include "util/graph_range.h"
49#include "util/graph_small_color_map.h"
50#include "util/graph_undirected.h"
51#include "util/report_manager.h"
52#include "util/unordered.h"
53
54#include <algorithm>
55#include <map>
56#include <queue>
57#include <unordered_map>
58#include <unordered_set>
59
60#include <boost/graph/connected_components.hpp>
61#include <boost/graph/depth_first_search.hpp>
62#include <boost/graph/filtered_graph.hpp>
63#include <boost/graph/reverse_graph.hpp>
64#include <boost/graph/topological_sort.hpp>
65#include <boost/icl/interval_set.hpp>
66
67using namespace std;
68using boost::depth_first_search;
69using boost::depth_first_visit;
70using boost::make_assoc_property_map;
71
72namespace ue2 {
73
74namespace {
75
76/**
77 * \brief Filter that retains only edges between vertices with the same
78 * reachability. Special vertices are dropped.
79 */
80template<class Graph>
81struct ReachFilter {
82 ReachFilter() = default;
83 explicit ReachFilter(const Graph *g_in) : g(g_in) {}
84
85 // Convenience typedefs.
86 using Traits = typename boost::graph_traits<Graph>;
87 using VertexDescriptor = typename Traits::vertex_descriptor;
88 using EdgeDescriptor = typename Traits::edge_descriptor;
89
90 bool operator()(const VertexDescriptor &v) const {
91 assert(g);
92 // Disallow special vertices, as otherwise we will try to remove them
93 // later.
94 return !is_special(v, *g);
95 }
96
97 bool operator()(const EdgeDescriptor &e) const {
98 assert(g);
99 // Vertices must have the same reach.
100 auto u = source(e, *g), v = target(e, *g);
101 const CharReach &cr_u = (*g)[u].char_reach;
102 const CharReach &cr_v = (*g)[v].char_reach;
103 return cr_u == cr_v;
104 }
105
106 const Graph *g = nullptr;
107};
108
109using RepeatGraph = boost::filtered_graph<NGHolder, ReachFilter<NGHolder>,
110 ReachFilter<NGHolder>>;
111
112struct ReachSubgraph {
113 vector<NFAVertex> vertices;
114 depth repeatMin{0};
115 depth repeatMax{0};
116 u32 minPeriod = 1;
117 bool is_reset = false;
118 enum RepeatType historyType = REPEAT_RING;
119 bool bad = false; // if true, ignore this case
120};
121
122} // namespace
123
124static
125void findInitDepths(const NGHolder &g,
126 unordered_map<NFAVertex, NFAVertexDepth> &depths) {
127 auto d = calcDepths(g);
128
129 for (auto v : vertices_range(g)) {
130 size_t idx = g[v].index;
131 assert(idx < d.size());
132 depths.emplace(v, d[idx]);
133 }
134}
135
136static
137vector<NFAVertex> buildTopoOrder(const RepeatGraph &g) {
138 /* Note: RepeatGraph is a filtered version of NGHolder and still has
139 * NFAVertex as its vertex descriptor */
140
141 typedef unordered_set<NFAEdge> EdgeSet;
142 EdgeSet deadEdges;
143
144 // We don't have indices spanning [0,N] on our filtered graph, so we
145 // provide a colour map.
146 unordered_map<NFAVertex, boost::default_color_type> colours;
147
148 depth_first_search(g, visitor(BackEdges<EdgeSet>(deadEdges)).
149 color_map(make_assoc_property_map(colours)));
150 auto acyclic_g = make_filtered_graph(g, make_bad_edge_filter(&deadEdges));
151
152 vector<NFAVertex> topoOrder;
153 topological_sort(acyclic_g, back_inserter(topoOrder),
154 color_map(make_assoc_property_map(colours)));
155
156 reverse(topoOrder.begin(), topoOrder.end());
157
158 return topoOrder;
159}
160
161static
162void proper_pred(const NGHolder &g, NFAVertex v,
163 unordered_set<NFAVertex> &p) {
164 pred(g, v, &p);
165 p.erase(v); // self-loops
166}
167
168static
169void proper_succ(const NGHolder &g, NFAVertex v,
170 unordered_set<NFAVertex> &s) {
171 succ(g, v, &s);
172 s.erase(v); // self-loops
173}
174
175static
176bool roguePredecessor(const NGHolder &g, NFAVertex v,
177 const unordered_set<NFAVertex> &involved,
178 const unordered_set<NFAVertex> &pred) {
179 u32 seen = 0;
180
181 for (auto u : inv_adjacent_vertices_range(v, g)) {
182 if (contains(involved, u)) {
183 continue;
184 }
185 if (!contains(pred, u)) {
186 DEBUG_PRINTF("%zu is a rogue pred\n", g[u].index);
187 return true;
188 }
189
190 seen++;
191 }
192
193 // We must have edges from either (a) none of our external predecessors, or
194 // (b) all of our external predecessors.
195 if (!seen) {
196 return false;
197 }
198 return pred.size() != seen;
199}
200
201static
202bool rogueSuccessor(const NGHolder &g, NFAVertex v,
203 const unordered_set<NFAVertex> &involved,
204 const unordered_set<NFAVertex> &succ) {
205 u32 seen = 0;
206 for (auto w : adjacent_vertices_range(v, g)) {
207 if (contains(involved, w)) {
208 continue;
209 }
210
211 if (!contains(succ, w)) {
212 DEBUG_PRINTF("%zu is a rogue succ\n", g[w].index);
213 return true;
214 }
215
216 seen++;
217 }
218
219 // We must have edges to either (a) none of our external successors, or
220 // (b) all of our external successors.
221 if (!seen) {
222 return false;
223 }
224 return succ.size() != seen;
225}
226
227static
228bool hasDifferentTops(const NGHolder &g, const vector<NFAVertex> &verts) {
229 /* TODO: check that we need this now that we allow multiple tops */
230 const flat_set<u32> *tops = nullptr;
231
232 for (auto v : verts) {
233 for (const auto &e : in_edges_range(v, g)) {
234 NFAVertex u = source(e, g);
235 if (u != g.start && u != g.startDs) {
236 continue; // Only edges from starts have valid top properties.
237 }
238 DEBUG_PRINTF("edge (%zu,%zu) with %zu tops\n", g[u].index,
239 g[v].index, g[e].tops.size());
240 if (!tops) {
241 tops = &g[e].tops;
242 } else if (g[e].tops != *tops) {
243 return true; // More than one set of tops.
244 }
245 }
246 }
247
248 return false;
249}
250
251static
252bool vertexIsBad(const NGHolder &g, NFAVertex v,
253 const unordered_set<NFAVertex> &involved,
254 const unordered_set<NFAVertex> &tail,
255 const unordered_set<NFAVertex> &pred,
256 const unordered_set<NFAVertex> &succ,
257 const flat_set<ReportID> &reports) {
258 DEBUG_PRINTF("check vertex %zu\n", g[v].index);
259
260 // We must drop any vertex that is the target of a back-edge within
261 // our subgraph. The tail set contains all vertices that are after v in a
262 // topo ordering.
263 for (auto u : inv_adjacent_vertices_range(v, g)) {
264 if (contains(tail, u)) {
265 DEBUG_PRINTF("back-edge (%zu,%zu) in subgraph found\n",
266 g[u].index, g[v].index);
267 return true;
268 }
269 }
270
271 // If this vertex has an entry from outside our subgraph, it must have
272 // edges from *all* the vertices in pred and no other external entries.
273 // Similarly for exits.
274 if (roguePredecessor(g, v, involved, pred)) {
275 DEBUG_PRINTF("preds for %zu not well-formed\n", g[v].index);
276 return true;
277 }
278
279 if (rogueSuccessor(g, v, involved, succ)) {
280 DEBUG_PRINTF("succs for %zu not well-formed\n", g[v].index);
281 return true;
282 }
283
284 // All reporting vertices should have the same reports.
285 if (is_match_vertex(v, g) && reports != g[v].reports) {
286 DEBUG_PRINTF("report mismatch to %zu\n", g[v].index);
287 return true;
288 }
289
290 return false;
291}
292
293static
294void splitSubgraph(const NGHolder &g, const deque<NFAVertex> &verts,
295 const u32 minNumVertices, queue<ReachSubgraph> &q) {
296 DEBUG_PRINTF("entry\n");
297
298 // We construct a copy of the graph using just the vertices we want, rather
299 // than using a filtered_graph -- this way is faster.
300 NGHolder verts_g;
301 unordered_map<NFAVertex, NFAVertex> verts_map; // in g -> in verts_g
302 fillHolder(&verts_g, g, verts, &verts_map);
303
304 const auto ug = make_undirected_graph(verts_g);
305
306 unordered_map<NFAVertex, u32> repeatMap;
307
308 size_t num = connected_components(ug, make_assoc_property_map(repeatMap));
309 DEBUG_PRINTF("found %zu connected repeat components\n", num);
310 assert(num > 0);
311
312 vector<ReachSubgraph> rs(num);
313
314 for (auto v : verts) {
315 assert(!is_special(v, g));
316 auto vu = verts_map.at(v);
317 auto rit = repeatMap.find(vu);
318 if (rit == repeatMap.end()) {
319 continue; /* not part of a repeat */
320 }
321 u32 comp_id = rit->second;
322 assert(comp_id < num);
323 rs[comp_id].vertices.push_back(v);
324 }
325
326 for (const auto &rsi : rs) {
327 if (rsi.vertices.empty()) {
328 // Empty elements can happen when connected_components finds a
329 // subgraph consisting entirely of specials (which aren't added to
330 // ReachSubgraph in the loop above). There's nothing we can do with
331 // these, so we skip them.
332 continue;
333 }
334 DEBUG_PRINTF("repeat with %zu vertices\n", rsi.vertices.size());
335 if (rsi.vertices.size() >= minNumVertices) {
336 DEBUG_PRINTF("enqueuing\n");
337 q.push(rsi);
338 }
339 }
340}
341
342static
343void findFirstReports(const NGHolder &g, const ReachSubgraph &rsi,
344 flat_set<ReportID> &reports) {
345 for (auto v : rsi.vertices) {
346 if (is_match_vertex(v, g)) {
347 reports = g[v].reports;
348 return;
349 }
350 }
351}
352
353static
354void checkReachSubgraphs(const NGHolder &g, vector<ReachSubgraph> &rs,
355 const u32 minNumVertices) {
356 if (rs.empty()) {
357 return;
358 }
359
360 DEBUG_PRINTF("%zu subgraphs\n", rs.size());
361
362 vector<ReachSubgraph> rs_out;
363
364 queue<ReachSubgraph> q;
365 for (const auto &rsi : rs) {
366 if (rsi.vertices.size() < minNumVertices) {
367 continue;
368 }
369 q.push(rsi);
370 }
371
372 while (!q.empty()) {
373 const ReachSubgraph &rsi = q.front();
374
375 if (rsi.vertices.size() < minNumVertices) {
376 q.pop(); // Too small for consideration as a repeat.
377 continue;
378 }
379
380 DEBUG_PRINTF("subgraph with %zu vertices\n", rsi.vertices.size());
381
382 // Check that all the edges from outside have the same tops. TODO: we
383 // don't have to throw the whole subgraph out, we could do this check
384 // on a per vertex basis.
385 if (hasDifferentTops(g, rsi.vertices)) {
386 DEBUG_PRINTF("different tops!\n");
387 q.pop();
388 continue;
389 }
390
391 unordered_set<NFAVertex> involved(rsi.vertices.begin(),
392 rsi.vertices.end());
393 unordered_set<NFAVertex> tail(involved); // to look for back-edges.
394 unordered_set<NFAVertex> pred, succ;
395 proper_pred(g, rsi.vertices.front(), pred);
396 proper_succ(g, rsi.vertices.back(), succ);
397
398 flat_set<ReportID> reports;
399 findFirstReports(g, rsi, reports);
400
401 bool recalc = false;
402 deque<NFAVertex> verts;
403
404 for (auto v : rsi.vertices) {
405 tail.erase(v); // now contains all vertices _after_ this one.
406
407 if (vertexIsBad(g, v, involved, tail, pred, succ, reports)) {
408 recalc = true;
409 continue;
410 }
411
412 verts.push_back(v);
413 }
414
415 if (recalc) {
416 if (verts.size() < minNumVertices) {
417 DEBUG_PRINTF("subgraph got too small\n");
418 q.pop();
419 continue;
420 }
421 splitSubgraph(g, verts, minNumVertices, q);
422 } else {
423 DEBUG_PRINTF("subgraph is ok\n");
424 rs_out.push_back(rsi);
425 }
426 q.pop();
427 }
428
429 rs.swap(rs_out);
430}
431
432namespace {
433class DistanceSet {
434private:
435 // We use boost::icl to do the heavy lifting.
436 typedef boost::icl::closed_interval<u32> ClosedInterval;
437 typedef boost::icl::interval_set<u32, std::less, ClosedInterval>
438 IntervalSet;
439 IntervalSet distances;
440public:
441 // Add a distance.
442 void insert(u32 d) {
443 distances.insert(d);
444 }
445
446 void add(const DistanceSet &a) {
447 distances += a.distances; // union operation
448 }
449
450 // Increment all the distances by one and add.
451 void add_incremented(const DistanceSet &a) {
452 for (const auto &d : a.distances) {
453 u32 lo = lower(d) + 1;
454 u32 hi = upper(d) + 1;
455 distances.insert(boost::icl::construct<ClosedInterval>(lo, hi));
456 }
457 }
458
459#ifdef DEBUG
460 void dump() const {
461 if (distances.empty()) {
462 printf("<empty>");
463 return;
464 }
465
466 for (const auto &d : distances) {
467 printf("[%u,%u] ", lower(d), upper(d));
468 }
469 }
470#endif
471
472 // True if this distance set is a single contiguous interval.
473 bool is_contiguous() const {
474 IntervalSet::const_iterator it = distances.begin();
475 if (it == distances.end()) {
476 return false;
477 }
478 ++it;
479 return (it == distances.end());
480 }
481
482 pair<u32, u32> get_range() const {
483 assert(is_contiguous());
484 return make_pair(lower(distances), upper(distances));
485 }
486};
487}
488
489/**
490 * Returns false if the given bounds are too large to be implemented with our
491 * runtime engines that handle bounded repeats.
492 */
493static
494bool tooLargeToImplement(const depth &repeatMin, const depth &repeatMax) {
495 if (!repeatMin.is_finite()) {
496 DEBUG_PRINTF("non-finite min bound %s\n", repeatMin.str().c_str());
497 assert(0); // this is a surprise!
498 return true;
499 }
500
501 if ((u32)repeatMin >= REPEAT_INF) {
502 DEBUG_PRINTF("min bound %s too large\n", repeatMin.str().c_str());
503 return true;
504 }
505
506 if (repeatMax.is_finite() && (u32)repeatMax >= REPEAT_INF) {
507 DEBUG_PRINTF("finite max bound %s too large\n", repeatMax.str().c_str());
508 return true;
509 }
510
511 return false;
512}
513
514/** Returns false if the graph is not a supported bounded repeat. */
515static
516bool processSubgraph(const NGHolder &g, ReachSubgraph &rsi,
517 u32 minNumVertices) {
518 DEBUG_PRINTF("reach subgraph has %zu vertices\n", rsi.vertices.size());
519
520 if (rsi.vertices.size() < minNumVertices) {
521 DEBUG_PRINTF("too small, min is %u\n", minNumVertices);
522 return false;
523 }
524
525 NFAVertex first = rsi.vertices.front();
526 NFAVertex last = rsi.vertices.back();
527
528 typedef unordered_map<NFAVertex, DistanceSet> DistanceMap;
529 DistanceMap dist;
530
531 // Initial distance sets.
532 for (auto u : inv_adjacent_vertices_range(first, g)) {
533 if (u == first) {
534 continue; // no self-loops
535 }
536 DEBUG_PRINTF("pred vertex %zu\n", g[u].index);
537 dist[u].insert(0);
538 }
539
540 for (auto v : rsi.vertices) {
541 for (auto u : inv_adjacent_vertices_range(v, g)) {
542 if (u == v) {
543 continue; // no self-loops
544 }
545
546 auto di = dist.find(u);
547 if (di == dist.end()) {
548 assert(0);
549 return false;
550 }
551
552 dist[v].add_incremented(di->second);
553 }
554 }
555
556 // Remove pred distances from our map.
557 for (auto u : inv_adjacent_vertices_range(first, g)) {
558 if (u == first) {
559 continue; // no self-loops
560 }
561 dist.erase(u);
562 }
563
564 // Calculate final union of distances.
565 DistanceSet final_d;
566 for (auto v : adjacent_vertices_range(last, g)) {
567 if (v == last) {
568 continue; // no self-loops
569 }
570 for (auto u : inv_adjacent_vertices_range(v, g)) {
571 if (u == v) {
572 continue; // no self-loops
573 }
574 auto di = dist.find(u);
575 if (di == dist.end()) {
576 continue;
577 }
578 final_d.add(di->second);
579 }
580 }
581
582#ifdef DEBUG
583 DEBUG_PRINTF("final_d dists: ");
584 final_d.dump();
585 printf("\n");
586#endif
587
588 if (!final_d.is_contiguous()) {
589 // not handled right now
590 DEBUG_PRINTF("not contiguous!\n");
591 return false;
592 }
593
594 pair<u32, u32> range = final_d.get_range();
595 if (range.first > depth::max_value() || range.second > depth::max_value()) {
596 DEBUG_PRINTF("repeat (%u,%u) not representable with depths\n",
597 range.first, range.second);
598 return false;
599 }
600 rsi.repeatMin = depth(range.first);
601 rsi.repeatMax = depth(range.second);
602
603 // If we've got a self-loop anywhere, we've got inf max.
604 if (anySelfLoop(g, rsi.vertices.begin(), rsi.vertices.end())) {
605 DEBUG_PRINTF("repeat contains self-loop, setting max to INF\n");
606 rsi.repeatMax = depth::infinity();
607 }
608
609 // If our pattern contains a bounded repeat that we wouldn't be able to
610 // implement as runtime, then we have no strategy that leads to
611 // implementation -- it's not like falling back to a DFA or other
612 // non-repeat engine is going to succeed.
613 if (tooLargeToImplement(rsi.repeatMin, rsi.repeatMax)) {
614 throw CompileError("Pattern too large.");
615 }
616
617 return true;
618}
619
620static
621bool allPredsInSubgraph(NFAVertex v, const NGHolder &g,
622 const unordered_set<NFAVertex> &involved) {
623 for (auto u : inv_adjacent_vertices_range(v, g)) {
624 if (!contains(involved, u)) {
625 return false;
626 }
627 }
628 return true;
629}
630
631static
632void buildTugTrigger(NGHolder &g, NFAVertex cyclic, NFAVertex v,
633 const unordered_set<NFAVertex> &involved,
634 unordered_map<NFAVertex, NFAVertexDepth> &depths,
635 vector<NFAVertex> &tugs) {
636 if (allPredsInSubgraph(v, g, involved)) {
637 // We can transform this vertex into a tug trigger in-place.
638 DEBUG_PRINTF("all preds in subgraph, vertex %zu becomes tug\n",
639 g[v].index);
640 add_edge(cyclic, v, g);
641 tugs.push_back(v);
642 return;
643 }
644
645 // Some predecessors of v are not in the subgraph, so we need to clone v
646 // and split up its in-edges.
647 NFAVertex t = clone_vertex(g, v);
648 depths[t] = depths[v];
649
650 DEBUG_PRINTF("there are other paths, cloned tug %zu from vertex %zu\n",
651 g[t].index, g[v].index);
652
653 tugs.push_back(t);
654 add_edge(cyclic, t, g);
655
656 // New vertex gets all of v's successors, including v itself if it's
657 // cyclic.
658 clone_out_edges(g, v, t);
659}
660
661static
662NFAVertex createCyclic(NGHolder &g, ReachSubgraph &rsi) {
663 NFAVertex last = rsi.vertices.back();
664 NFAVertex cyclic = clone_vertex(g, last);
665 add_edge(cyclic, cyclic, g);
666
667 DEBUG_PRINTF("created cyclic vertex %zu\n", g[cyclic].index);
668 return cyclic;
669}
670
671static
672NFAVertex createPos(NGHolder &g, ReachSubgraph &rsi) {
673 NFAVertex pos = add_vertex(g);
674 NFAVertex first = rsi.vertices.front();
675
676 g[pos].char_reach = g[first].char_reach;
677
678 DEBUG_PRINTF("created pos vertex %zu\n", g[pos].index);
679 return pos;
680}
681
682// 2 if v is directly connected to an accept, or 1 if one hop away,
683// or 0 otherwise.
684static
685u32 isCloseToAccept(const NGHolder &g, NFAVertex v) {
686 if (is_any_accept(v, g)) {
687 return 2;
688 }
689
690 for (auto w : adjacent_vertices_range(v, g)) {
691 if (is_any_accept(w, g)) {
692 return 1;
693 }
694 }
695
696 return 0;
697}
698
699static
700u32 unpeelAmount(const NGHolder &g, const ReachSubgraph &rsi) {
701 const NFAVertex last = rsi.vertices.back();
702 u32 rv = 0;
703
704 for (auto v : adjacent_vertices_range(last, g)) {
705 rv = max(rv, isCloseToAccept(g, v));
706 }
707
708 return rv;
709}
710
711static
712void unpeelNearEnd(NGHolder &g, ReachSubgraph &rsi,
713 unordered_map<NFAVertex, NFAVertexDepth> &depths,
714 vector<NFAVertex> *succs) {
715 u32 unpeel = unpeelAmount(g, rsi);
716 DEBUG_PRINTF("unpeeling %u vertices\n", unpeel);
717
718 while (unpeel) {
719 NFAVertex last = rsi.vertices.back();
720 NFAVertex first = rsi.vertices.front();
721
722 NFAVertex d = clone_vertex(g, last);
723 depths[d] = depths[last];
724 DEBUG_PRINTF("created vertex %zu\n", g[d].index);
725
726 for (auto v : *succs) {
727 add_edge(d, v, g);
728 }
729
730 if (rsi.repeatMin > depth(1)) {
731 rsi.repeatMin -= 1;
732 } else {
733 /* Skip edge for the cyclic state; note that we must clone their
734 * edge properties as they may include tops. */
735 for (const auto &e : in_edges_range(first, g)) {
736 add_edge(source(e, g), d, g[e], g);
737 }
738 }
739
740 succs->clear();
741 succs->push_back(d);
742
743 rsi.repeatMax -= 1;
744
745 assert(rsi.repeatMin > depth(0));
746 assert(rsi.repeatMax > depth(0));
747
748 unpeel--;
749 }
750}
751
752/** Fetch the set of successor vertices of this subgraph. */
753static
754void getSuccessors(const NGHolder &g, const ReachSubgraph &rsi,
755 vector<NFAVertex> *succs) {
756 assert(!rsi.vertices.empty());
757 // Successors come from successors of last vertex.
758 NFAVertex last = rsi.vertices.back();
759
760 for (auto v : adjacent_vertices_range(last, g)) {
761 if (v == last) { /* ignore self loop */
762 continue;
763 }
764 succs->push_back(v);
765 }
766}
767
768/** Disconnect the given subgraph from its predecessors and successors in the
769 * NFA graph and replace it with a cyclic state. */
770static
771void replaceSubgraphWithSpecial(NGHolder &g, ReachSubgraph &rsi,
772 vector<BoundedRepeatData> *repeats,
773 unordered_map<NFAVertex, NFAVertexDepth> &depths,
774 unordered_set<NFAVertex> &created) {
775 assert(!rsi.bad);
776 /* As we may need to unpeel 2 vertices, we need the width to be more than 2.
777 * This should only happen if the graph did not have redundancy pass
778 * performed on as vertex count checks would be prevent us reaching here.
779 */
780 if (rsi.repeatMax <= depth(2)) {
781 return;
782 }
783 assert(rsi.repeatMin > depth(0));
784 assert(rsi.repeatMax >= rsi.repeatMin);
785 assert(rsi.repeatMax > depth(2));
786
787 DEBUG_PRINTF("entry\n");
788
789 const unordered_set<NFAVertex> involved(rsi.vertices.begin(),
790 rsi.vertices.end());
791 vector<NFAVertex> succs;
792 getSuccessors(g, rsi, &succs);
793
794 unpeelNearEnd(g, rsi, depths, &succs);
795
796 // Create our replacement cyclic state with the same reachability and
797 // report info as the last vertex in our topo-ordered list.
798 NFAVertex cyclic = createCyclic(g, rsi);
799 created.insert(cyclic);
800
801 // One more special vertex is necessary: the positive trigger (same
802 // reach as cyclic).
803 NFAVertex pos_trigger = createPos(g, rsi);
804 created.insert(pos_trigger);
805 add_edge(pos_trigger, cyclic, g);
806
807 // Update depths for our new vertices.
808 NFAVertex first = rsi.vertices.front(), last = rsi.vertices.back();
809 depths[pos_trigger] = depths[first];
810 depths[cyclic].fromStart =
811 unionDepthMinMax(depths[first].fromStart, depths[last].fromStart);
812 depths[cyclic].fromStartDotStar = unionDepthMinMax(
813 depths[first].fromStartDotStar, depths[last].fromStartDotStar);
814
815 // Wire predecessors to positive trigger.
816 for (const auto &e : in_edges_range(first, g)) {
817 add_edge(source(e, g), pos_trigger, g[e], g);
818 }
819
820 // Wire cyclic state to tug trigger states built from successors.
821 vector<NFAVertex> tugs;
822 for (auto v : succs) {
823 buildTugTrigger(g, cyclic, v, involved, depths, tugs);
824 }
825 created.insert(tugs.begin(), tugs.end());
826 assert(!tugs.empty());
827
828 // Wire pos trigger to tugs if min repeat is one -- this deals with cases
829 // where we can get a pos and tug trigger on the same byte.
830 if (rsi.repeatMin == depth(1)) {
831 for (auto v : tugs) {
832 add_edge(pos_trigger, v, g);
833 }
834 }
835
836 // Remove the vertices/edges in the subgraph.
837 remove_vertices(rsi.vertices, g, false);
838 erase_all(&depths, rsi.vertices);
839
840 repeats->push_back(BoundedRepeatData(rsi.historyType, rsi.repeatMin,
841 rsi.repeatMax, rsi.minPeriod, cyclic,
842 pos_trigger, tugs));
843}
844
845/** Variant for Rose-specific graphs that terminate in a sole accept, so we can
846 * use a "lazy tug". See UE-1636. */
847static
848void replaceSubgraphWithLazySpecial(NGHolder &g, ReachSubgraph &rsi,
849 vector<BoundedRepeatData> *repeats,
850 unordered_map<NFAVertex, NFAVertexDepth> &depths,
851 unordered_set<NFAVertex> &created) {
852 assert(!rsi.bad);
853 assert(rsi.repeatMin);
854 assert(rsi.repeatMax >= rsi.repeatMin);
855
856 DEBUG_PRINTF("entry\n");
857
858 const unordered_set<NFAVertex> involved(rsi.vertices.begin(),
859 rsi.vertices.end());
860 vector<NFAVertex> succs;
861 getSuccessors(g, rsi, &succs);
862
863 // Create our replacement cyclic state with the same reachability and
864 // report info as the last vertex in our topo-ordered list.
865 NFAVertex cyclic = createCyclic(g, rsi);
866 created.insert(cyclic);
867
868 // One more special vertex is necessary: the positive trigger (same
869 // reach as cyclic).
870 NFAVertex pos_trigger = createPos(g, rsi);
871 created.insert(pos_trigger);
872 add_edge(pos_trigger, cyclic, g);
873
874 // Update depths for our new vertices.
875 NFAVertex first = rsi.vertices.front(), last = rsi.vertices.back();
876 depths[pos_trigger] = depths[first];
877 depths[cyclic].fromStart =
878 unionDepthMinMax(depths[first].fromStart, depths[last].fromStart);
879 depths[cyclic].fromStartDotStar = unionDepthMinMax(
880 depths[first].fromStartDotStar, depths[last].fromStartDotStar);
881
882 // Wire predecessors to positive trigger.
883 for (const auto &e : in_edges_range(first, g)) {
884 add_edge(source(e, g), pos_trigger, g[e], g);
885 }
886
887 // In the rose case, our tug is our cyclic, and it's wired to our
888 // successors (which should be just the accept).
889 vector<NFAVertex> tugs;
890 assert(succs.size() == 1);
891 for (auto v : succs) {
892 add_edge(cyclic, v, g);
893 }
894
895 // Wire pos trigger to accept if min repeat is one -- this deals with cases
896 // where we can get a pos and tug trigger on the same byte.
897 if (rsi.repeatMin == depth(1)) {
898 for (auto v : succs) {
899 add_edge(pos_trigger, v, g);
900 g[pos_trigger].reports = g[cyclic].reports;
901 }
902 }
903
904 // Remove the vertices/edges in the subgraph.
905 remove_vertices(rsi.vertices, g, false);
906 erase_all(&depths, rsi.vertices);
907
908 repeats->push_back(BoundedRepeatData(rsi.historyType, rsi.repeatMin,
909 rsi.repeatMax, rsi.minPeriod, cyclic,
910 pos_trigger, tugs));
911}
912
913static
914bool isCompBigEnough(const RepeatGraph &rg, const u32 minRepeat) {
915 // filtered_graph doesn't filter the num_vertices call.
916 size_t n = 0;
917 RepeatGraph::vertex_iterator vi, ve;
918 for (tie(vi, ve) = vertices(rg); vi != ve; ++vi) {
919 if (++n >= minRepeat) {
920 return true;
921 }
922 }
923 return false;
924}
925
926// Marks the subgraph as bad if it can't be handled.
927static
928void reprocessSubgraph(const NGHolder &h, const Grey &grey,
929 ReachSubgraph &rsi) {
930 vector<ReachSubgraph> rs(1, rsi);
931 checkReachSubgraphs(h, rs, grey.minExtBoundedRepeatSize);
932 if (rs.size() != 1) {
933 DEBUG_PRINTF("subgraph split into %zu\n", rs.size());
934 rsi.bad = true;
935 return;
936 }
937
938 rsi = rs.back(); // Potentially modified.
939
940 if (processSubgraph(h, rsi, grey.minExtBoundedRepeatSize)) {
941 DEBUG_PRINTF("reprocessed subgraph is {%s,%s} repeat\n",
942 rsi.repeatMin.str().c_str(), rsi.repeatMax.str().c_str());
943 } else {
944 DEBUG_PRINTF("reprocessed subgraph is bad\n");
945 rsi.bad = true;
946 }
947}
948
949/** Remove vertices from the beginning and end of the vertex set that are
950 * involved in other repeats as a result of earlier repeat transformations. */
951static
952bool peelSubgraph(const NGHolder &g, const Grey &grey, ReachSubgraph &rsi,
953 const unordered_set<NFAVertex> &created) {
954 assert(!rsi.bad);
955
956 if (created.empty()) {
957 return true;
958 }
959
960 if (rsi.vertices.empty()) {
961 return false;
962 }
963
964 // Peel involved vertices from the front.
965 vector<NFAVertex>::iterator zap = rsi.vertices.end();
966 for (auto it = rsi.vertices.begin(), ite = rsi.vertices.end(); it != ite;
967 ++it) {
968 if (!contains(created, *it)) {
969 zap = it;
970 break;
971 } else {
972 DEBUG_PRINTF("%zu is involved in another repeat\n", g[*it].index);
973 }
974 }
975 DEBUG_PRINTF("peeling %zu vertices from front\n",
976 distance(rsi.vertices.begin(), zap));
977 rsi.vertices.erase(rsi.vertices.begin(), zap);
978
979 // Peel involved vertices and vertices with edges to involved vertices from
980 // the back; otherwise we may try to transform a POS into a TUG.
981 zap = rsi.vertices.begin();
982 for (auto it = rsi.vertices.rbegin(), ite = rsi.vertices.rend(); it != ite;
983 ++it) {
984 if (!contains(created, *it) &&
985 !contains_any_of(created, adjacent_vertices(*it, g))) {
986 zap = it.base(); // Note: erases everything after it.
987 break;
988 } else {
989 DEBUG_PRINTF("%zu is involved in another repeat\n", g[*it].index);
990 }
991 }
992 DEBUG_PRINTF("peeling %zu vertices from back\n",
993 distance(zap, rsi.vertices.end()));
994 rsi.vertices.erase(zap, rsi.vertices.end());
995
996 // If vertices in the middle are involved in other repeats, it's a definite
997 // no-no.
998 for (auto v : rsi.vertices) {
999 if (contains(created, v)) {
1000 DEBUG_PRINTF("vertex %zu is in another repeat\n", g[v].index);
1001 return false;
1002 }
1003 }
1004
1005 reprocessSubgraph(g, grey, rsi);
1006 return !rsi.bad;
1007}
1008
1009/** For performance reasons, it's nice not to have an exceptional state right
1010 * next to a startDs state: that way we can do double-byte accel, whereas
1011 * otherwise the NEG trigger would limit us to single. This might be a good
1012 * idea to extend to cyclic states, too. */
1013static
1014void peelStartDotStar(const NGHolder &g,
1015 const unordered_map<NFAVertex, NFAVertexDepth> &depths,
1016 const Grey &grey, ReachSubgraph &rsi) {
1017 if (rsi.vertices.size() < 1) {
1018 return;
1019 }
1020
1021 NFAVertex first = rsi.vertices.front();
1022 if (depths.at(first).fromStartDotStar.min == depth(1)) {
1023 DEBUG_PRINTF("peeling start front vertex %zu\n", g[first].index);
1024 rsi.vertices.erase(rsi.vertices.begin());
1025 reprocessSubgraph(g, grey, rsi);
1026 }
1027}
1028
1029static
1030void buildReachSubgraphs(const NGHolder &g, vector<ReachSubgraph> &rs,
1031 const u32 minNumVertices) {
1032 const ReachFilter<NGHolder> fil(&g);
1033 const RepeatGraph rg(g, fil, fil);
1034
1035 if (!isCompBigEnough(rg, minNumVertices)) {
1036 DEBUG_PRINTF("component not big enough, bailing\n");
1037 return;
1038 }
1039
1040 const auto ug = make_undirected_graph(rg);
1041
1042 unordered_map<NFAVertex, u32> repeatMap;
1043
1044 unsigned int num;
1045 num = connected_components(ug, make_assoc_property_map(repeatMap));
1046 DEBUG_PRINTF("found %u connected repeat components\n", num);
1047
1048 // Now, we build a set of topo-ordered ReachSubgraphs.
1049 vector<NFAVertex> topoOrder = buildTopoOrder(rg);
1050
1051 rs.resize(num);
1052
1053 for (auto v : topoOrder) {
1054 auto rit = repeatMap.find(v);
1055 if (rit == repeatMap.end()) {
1056 continue; /* not part of a repeat */
1057 }
1058 u32 comp_id = rit->second;
1059 assert(comp_id < num);
1060 rs[comp_id].vertices.push_back(v);
1061 }
1062
1063#ifdef DEBUG
1064 for (size_t i = 0; i < rs.size(); i++) {
1065 DEBUG_PRINTF("rs %zu has %zu vertices.\n", i, rs[i].vertices.size());
1066 }
1067#endif
1068}
1069
1070static
1071bool hasSkipEdges(const NGHolder &g, const ReachSubgraph &rsi) {
1072 assert(!rsi.vertices.empty());
1073
1074 const NFAVertex first = rsi.vertices.front();
1075 const NFAVertex last = rsi.vertices.back();
1076
1077 // All of the preds of first must have edges to all the successors of last.
1078 for (auto u : inv_adjacent_vertices_range(first, g)) {
1079 for (auto v : adjacent_vertices_range(last, g)) {
1080 if (!edge(u, v, g).second) {
1081 return false;
1082 }
1083 }
1084 }
1085
1086 return true;
1087}
1088
1089/* depth info is valid as calculated at entry */
1090static
1091bool entered_at_fixed_offset(NFAVertex v, const NGHolder &g,
1092 const unordered_map<NFAVertex, NFAVertexDepth> &depths,
1093 const unordered_set<NFAVertex> &reached_by_fixed_tops) {
1094 DEBUG_PRINTF("|reached_by_fixed_tops| %zu\n",
1095 reached_by_fixed_tops.size());
1096 if (is_triggered(g) && !contains(reached_by_fixed_tops, v)) {
1097 /* can't do this for infix/suffixes unless we know trigger literals
1098 * can only occur at one offset */
1099 DEBUG_PRINTF("bad top(s) for %zu\n", g[v].index);
1100 return false;
1101 }
1102
1103 if (depths.at(v).fromStartDotStar.min.is_reachable()) {
1104 DEBUG_PRINTF("reachable from startDs\n");
1105 return false;
1106 }
1107
1108 /* look at preds as v may be cyclic */
1109 const depth &first = depths.at(v).fromStart.min;
1110 assert(first.is_reachable());
1111 if (!first.is_finite()) {
1112 DEBUG_PRINTF("first not finite\n");
1113 return false;
1114 }
1115 DEBUG_PRINTF("first is at least %s from start\n", first.str().c_str());
1116
1117 for (auto u : inv_adjacent_vertices_range(v, g)) {
1118 const depth &u_max_depth = depths.at(u).fromStart.max;
1119 DEBUG_PRINTF("pred %zu max depth %s from start\n", g[u].index,
1120 u_max_depth.str().c_str());
1121 if (u_max_depth != first - depth(1)) {
1122 return false;
1123 }
1124 }
1125
1126 return true;
1127}
1128
1129static
1130NFAVertex buildTriggerStates(NGHolder &g, const vector<CharReach> &trigger,
1131 u32 top) {
1132 NFAVertex u = g.start;
1133 for (const auto &cr : trigger) {
1134 NFAVertex v = add_vertex(g);
1135 g[v].char_reach = cr;
1136 add_edge(u, v, g);
1137 if (u == g.start) {
1138 g[edge(u, v, g)].tops.insert(top);
1139 }
1140 u = v;
1141 }
1142
1143 DEBUG_PRINTF("trigger len=%zu has sink %zu\n", trigger.size(), g[u].index);
1144 return u;
1145}
1146
1147/**
1148 * For triggered graphs, replace the "top" edges from start with the triggers
1149 * they represent, for the purposes of determining sole entry.
1150 */
1151static
1152void addTriggers(NGHolder &g,
1153 const map<u32, vector<vector<CharReach>>> &triggers) {
1154 if (!is_triggered(g)) {
1155 assert(triggers.empty());
1156 return;
1157 }
1158
1159 vector<NFAEdge> dead;
1160 map<u32, vector<NFAVertex>> starts_by_top;
1161
1162 for (const auto &e : out_edges_range(g.start, g)) {
1163 const NFAVertex &v = target(e, g);
1164 if (v == g.startDs) {
1165 continue;
1166 }
1167
1168 const auto &tops = g[e].tops;
1169
1170 // The caller may not have given us complete trigger information. If we
1171 // don't have any triggers for a particular top, we should just leave
1172 // it alone.
1173 for (u32 top : tops) {
1174 if (!contains(triggers, top)) {
1175 DEBUG_PRINTF("no triggers for top %u\n", top);
1176 goto next_edge;
1177 }
1178
1179 starts_by_top[top].push_back(v);
1180 }
1181 dead.push_back(e);
1182 next_edge:;
1183 }
1184
1185 remove_edges(dead, g);
1186
1187 for (const auto &m : starts_by_top) {
1188 const auto &top = m.first;
1189 const auto &starts = m.second;
1190
1191 assert(contains(triggers, top));
1192 const auto &top_triggers = triggers.at(top);
1193
1194 for (const auto &trigger : top_triggers) {
1195 NFAVertex u = buildTriggerStates(g, trigger, top);
1196 for (const auto &v : starts) {
1197 add_edge_if_not_present(u, v, g);
1198 }
1199 }
1200 }
1201}
1202
1203static
1204CharReach predReach(const NGHolder &g, NFAVertex v) {
1205 CharReach cr;
1206 for (auto u : inv_adjacent_vertices_range(v, g)) {
1207 cr |= g[u].char_reach;
1208 }
1209 return cr;
1210}
1211
1212/**
1213 * Filter the given vertex map (which maps from vertices in another graph to
1214 * vertices in subg) so that it only contains vertices that actually exist in
1215 * subg.
1216 */
1217static
1218void filterMap(const NGHolder &subg,
1219 unordered_map<NFAVertex, NFAVertex> &vmap) {
1220 NGHolder::vertex_iterator vi, ve;
1221 tie(vi, ve) = vertices(subg);
1222 const unordered_set<NFAVertex> remaining_verts(vi, ve);
1223
1224 unordered_map<NFAVertex, NFAVertex> fmap; // filtered map
1225
1226 for (const auto &m : vmap) {
1227 if (contains(remaining_verts, m.second)) {
1228 fmap.insert(m);
1229 }
1230 }
1231
1232 vmap.swap(fmap);
1233}
1234
1235/** Construct a graph for sole entry analysis that only considers paths through
1236 * the bounded repeat. */
1237static
1238void buildRepeatGraph(NGHolder &rg,
1239 unordered_map<NFAVertex, NFAVertex> &rg_map,
1240 const NGHolder &g, const ReachSubgraph &rsi,
1241 const map<u32, vector<vector<CharReach>>> &triggers) {
1242 cloneHolder(rg, g, &rg_map);
1243 assert(rg.kind == g.kind);
1244
1245 clear_in_edges(rg.accept, rg);
1246 clear_in_edges(rg.acceptEod, rg);
1247 add_edge(rg.accept, rg.acceptEod, rg);
1248
1249 // Find the set of vertices in rg involved in the repeat.
1250 unordered_set<NFAVertex> rg_involved;
1251 for (const auto &v : rsi.vertices) {
1252 assert(contains(rg_map, v));
1253 rg_involved.insert(rg_map.at(v));
1254 }
1255
1256 // Remove all out-edges from repeat vertices that aren't to other repeat
1257 // vertices, then connect terminal repeat vertices to accept.
1258 for (const auto &v : rsi.vertices) {
1259 NFAVertex rv = rg_map.at(v);
1260 remove_out_edge_if(rv, [&](const NFAEdge &e) {
1261 return !contains(rg_involved, target(e, rg));
1262 }, rg);
1263 if (!has_successor(rv, rg)) { // no interior out-edges
1264 add_edge(rv, rg.accept, rg);
1265 }
1266 }
1267
1268 pruneUseless(rg);
1269
1270 if (is_triggered(rg)) {
1271 // Add vertices for all our triggers
1272 addTriggers(rg, triggers);
1273 renumber_vertices(rg);
1274
1275 // We don't know anything about how often this graph is triggered, so we
1276 // make the start vertex cyclic for the purposes of this analysis ONLY.
1277 add_edge(rg.start, rg.start, rg);
1278 }
1279
1280 filterMap(rg, rg_map);
1281
1282 // All of our repeat vertices should have vertices in rg.
1283 assert(all_of(begin(rsi.vertices), end(rsi.vertices),
1284 [&](const NFAVertex &v) { return contains(rg_map, v); }));
1285}
1286
1287/**
1288 * Construct an input DAG which accepts on all entries to the repeat.
1289 */
1290static
1291void buildInputGraph(NGHolder &lhs,
1292 unordered_map<NFAVertex, NFAVertex> &lhs_map,
1293 const NGHolder &g, const NFAVertex first,
1294 const map<u32, vector<vector<CharReach>>> &triggers) {
1295 DEBUG_PRINTF("building lhs with first=%zu\n", g[first].index);
1296 cloneHolder(lhs, g, &lhs_map);
1297 assert(g.kind == lhs.kind);
1298 addTriggers(lhs, triggers);
1299 renumber_vertices(lhs);
1300
1301 // Replace each back-edge (u,v) with an edge (startDs,v), which will
1302 // generate entries at at least the rate of the loop created by that
1303 // back-edge.
1304 set<NFAEdge> dead;
1305 BackEdges<set<NFAEdge> > backEdgeVisitor(dead);
1306 depth_first_search(lhs, visitor(backEdgeVisitor).root_vertex(lhs.start));
1307 for (const auto &e : dead) {
1308 const NFAVertex u = source(e, lhs), v = target(e, lhs);
1309 if (u == v) {
1310 continue; // Self-loops are OK.
1311 }
1312
1313 DEBUG_PRINTF("replacing back-edge (%zu,%zu) with edge (startDs,%zu)\n",
1314 lhs[u].index, lhs[v].index, lhs[v].index);
1315
1316 add_edge_if_not_present(lhs.startDs, v, lhs);
1317 remove_edge(e, lhs);
1318 }
1319
1320 clear_in_edges(lhs.accept, lhs);
1321 clear_in_edges(lhs.acceptEod, lhs);
1322 add_edge(lhs.accept, lhs.acceptEod, lhs);
1323
1324 // Wire the predecessors of the first repeat vertex to accept, then prune.
1325 NFAVertex lhs_first = lhs_map.at(first);
1326 for (auto u : inv_adjacent_vertices_range(lhs_first, lhs)) {
1327 add_edge_if_not_present(u, lhs.accept, lhs);
1328 }
1329
1330 pruneUseless(lhs);
1331 filterMap(lhs, lhs_map);
1332}
1333
1334/**
1335 * Maximum number of vertices in the input DAG to actually allow sole entry
1336 * calculation (as very large cases make sentClearsTail take a long, long time
1337 * to complete.)
1338 */
1339static const size_t MAX_SOLE_ENTRY_VERTICES = 10000;
1340
1341/** True if (1) fixed offset or (2) reentries to this subgraph must involve a
1342 * character which escapes the repeat, meaning that we only need to store a
1343 * single offset at runtime. See UE-1361. */
1344static
1345bool hasSoleEntry(const NGHolder &g, const ReachSubgraph &rsi,
1346 const unordered_map<NFAVertex, NFAVertexDepth> &depths,
1347 const unordered_set<NFAVertex> &reached_by_fixed_tops,
1348 const map<u32, vector<vector<CharReach>>> &triggers) {
1349 DEBUG_PRINTF("checking repeat {%s,%s}\n", rsi.repeatMin.str().c_str(),
1350 rsi.repeatMax.str().c_str());
1351 NFAVertex first = rsi.vertices.front();
1352 const CharReach &repeatReach = g[first].char_reach;
1353
1354 /* trivial case first is at a fixed depth */
1355 if (entered_at_fixed_offset(first, g, depths, reached_by_fixed_tops)) {
1356 DEBUG_PRINTF("fixed depth\n");
1357 return true;
1358 }
1359
1360 DEBUG_PRINTF("repeat reach is %s\n", describeClass(repeatReach).c_str());
1361
1362 // Nothing can escape a dot repeat.
1363 if (repeatReach.all()) {
1364 DEBUG_PRINTF("dot repeat cannot be escaped\n");
1365 return false;
1366 }
1367
1368 // Another easy case: if the union of the reach of all entries to the
1369 // repeat will always escape the repeat, we have sole entry.
1370 if (predReach(g, first).isSubsetOf(~repeatReach)) {
1371 DEBUG_PRINTF("pred reach %s, which is subset of repeat escape\n",
1372 describeClass(predReach(g, first)).c_str());
1373 return true;
1374 }
1375
1376 NGHolder rg;
1377 unordered_map<NFAVertex, NFAVertex> rg_map;
1378 buildRepeatGraph(rg, rg_map, g, rsi, triggers);
1379 assert(rg.kind == g.kind);
1380
1381 NGHolder lhs;
1382 unordered_map<NFAVertex, NFAVertex> lhs_map;
1383 buildInputGraph(lhs, lhs_map, g, first, triggers);
1384 assert(lhs.kind == g.kind);
1385
1386 if (num_vertices(lhs) > MAX_SOLE_ENTRY_VERTICES) {
1387 DEBUG_PRINTF("too many vertices (%zu) for sole entry test.\n",
1388 num_vertices(lhs));
1389 return false;
1390 }
1391
1392 // Split the repeat graph into two regions: vertices in the LHS input DAG
1393 // are in one region, vertices in the bounded repeat are in another.
1394 const u32 lhs_region = 1;
1395 const u32 repeat_region = 2;
1396 unordered_map<NFAVertex, u32> region_map;
1397
1398 for (const auto &v : rsi.vertices) {
1399 assert(!is_special(v, g)); // no specials in repeats
1400 assert(contains(rg_map, v));
1401 DEBUG_PRINTF("rg vertex %zu in repeat\n", rg[rg_map.at(v)].index);
1402 region_map.emplace(rg_map.at(v), repeat_region);
1403 }
1404
1405 for (const auto &v : vertices_range(rg)) {
1406 if (!contains(region_map, v)) {
1407 DEBUG_PRINTF("rg vertex %zu in lhs (trigger)\n", rg[v].index);
1408 region_map.emplace(v, lhs_region);
1409 }
1410 }
1411
1412 u32 bad_region = 0;
1413 if (sentClearsTail(rg, region_map, lhs, lhs_region, &bad_region)) {
1414 DEBUG_PRINTF("input dag clears repeat: sole entry\n");
1415 return true;
1416 }
1417
1418 DEBUG_PRINTF("not sole entry\n");
1419 return false;
1420}
1421
1422namespace {
1423
1424template<class Graph>
1425struct StrawWalker {
1426 StrawWalker(const NGHolder &h_in, const Graph &g_in,
1427 const vector<BoundedRepeatData> &all_repeats)
1428 : h(h_in), g(g_in), repeats(all_repeats) {}
1429
1430 /** True if v is a cyclic that belongs to a bounded repeat (one without an
1431 * inf max bound). */
1432 bool isBoundedRepeatCyclic(NFAVertex v) const {
1433 for (const auto &r : repeats) {
1434 if (r.repeatMax.is_finite() && r.cyclic == v) {
1435 return true;
1436 }
1437 }
1438 return false;
1439 }
1440
1441 NFAVertex step(NFAVertex v) const {
1442 typename Graph::adjacency_iterator ai, ae;
1443 tie(ai, ae) = adjacent_vertices(v, g);
1444 assert(ai != ae);
1445 NFAVertex next = *ai;
1446 if (next == v) { // Ignore self loop.
1447 ++ai;
1448 if (ai == ae) {
1449 return NGHolder::null_vertex();
1450 }
1451 next = *ai;
1452 }
1453 ++ai;
1454 if (ai != ae && *ai == v) { // Ignore self loop
1455 ++ai;
1456 }
1457 if (ai != ae) {
1458 DEBUG_PRINTF("more than one succ\n");
1459 set<NFAVertex> succs;
1460 insert(&succs, adjacent_vertices(v, g));
1461 succs.erase(v);
1462 for (tie(ai, ae) = adjacent_vertices(v, g); ai != ae; ++ai) {
1463 next = *ai;
1464 DEBUG_PRINTF("checking %zu\n", g[next].index);
1465 if (next == v) {
1466 continue;
1467 }
1468 set<NFAVertex> lsuccs;
1469 insert(&lsuccs, adjacent_vertices(next, g));
1470
1471 if (lsuccs != succs) {
1472 continue;
1473 }
1474
1475 // Ensure that if v is in connected to accept, the reports
1476 // on `next` much match.
1477 if (is_match_vertex(v, h) && g[v].reports != g[next].reports) {
1478 DEBUG_PRINTF("report mismatch\n");
1479 continue;
1480 }
1481
1482 return next;
1483 }
1484 DEBUG_PRINTF("bailing\n");
1485 return NGHolder::null_vertex();
1486 }
1487 return next;
1488 }
1489
1490 NFAVertex walk(NFAVertex v, vector<NFAVertex> &straw) const {
1491 DEBUG_PRINTF("walk from %zu\n", g[v].index);
1492 unordered_set<NFAVertex> visited;
1493 straw.clear();
1494
1495 while (!is_special(v, g)) {
1496 DEBUG_PRINTF("checking %zu\n", g[v].index);
1497 NFAVertex next = step(v);
1498 if (next == NGHolder::null_vertex()) {
1499 break;
1500 }
1501 if (!visited.insert(next).second) {
1502 DEBUG_PRINTF("already visited %zu, bailing\n", g[next].index);
1503 break; /* don't want to get stuck in any complicated loops */
1504 }
1505
1506 const CharReach &reach_v = g[v].char_reach;
1507 const CharReach &reach_next = g[next].char_reach;
1508 if (!reach_v.isSubsetOf(reach_next)) {
1509 DEBUG_PRINTF("%zu's reach is not a superset of %zu's\n",
1510 g[next].index, g[v].index);
1511 break;
1512 }
1513
1514 // If this is cyclic with the right reach, we're done. Note that
1515 // startDs fulfils this requirement.
1516 if (hasSelfLoop(next, g) && !isBoundedRepeatCyclic(next)) {
1517 DEBUG_PRINTF("found cyclic %zu\n", g[next].index);
1518 return next;
1519 }
1520
1521 v = next;
1522 straw.push_back(v);
1523 }
1524
1525 straw.clear();
1526 return NGHolder::null_vertex();
1527 }
1528
1529private:
1530 const NGHolder &h; // underlying graph
1531 const Graph &g;
1532 const vector<BoundedRepeatData> &repeats;
1533};
1534
1535} // namespace
1536
1537static
1538NFAVertex walkStrawToCyclicRev(const NGHolder &g, NFAVertex v,
1539 const vector<BoundedRepeatData> &all_repeats,
1540 vector<NFAVertex> &straw) {
1541 typedef boost::reverse_graph<NGHolder, const NGHolder &> RevGraph;
1542 const RevGraph revg(g);
1543
1544 auto cyclic = StrawWalker<RevGraph>(g, revg, all_repeats).walk(v, straw);
1545 reverse(begin(straw), end(straw)); // path comes from cyclic
1546 return cyclic;
1547}
1548
1549static
1550NFAVertex walkStrawToCyclicFwd(const NGHolder &g, NFAVertex v,
1551 const vector<BoundedRepeatData> &all_repeats,
1552 vector<NFAVertex> &straw) {
1553 return StrawWalker<NGHolder>(g, g, all_repeats).walk(v, straw);
1554}
1555
1556/** True if entries to this subgraph must pass through a cyclic state with
1557 * reachability that is a superset of the reach of the repeat, and
1558 * reachabilities along this path "nest" into the reaches of their
1559 * predecessors.
1560 *
1561 * This is what is called a 'straw' in the region code. */
1562static
1563bool hasCyclicSupersetEntryPath(const NGHolder &g, const ReachSubgraph &rsi,
1564 const vector<BoundedRepeatData> &all_repeats) {
1565 // Cope with peeling by following a chain of single vertices backwards
1566 // until we encounter our cyclic, all of which must have superset reach.
1567 vector<NFAVertex> straw;
1568 return walkStrawToCyclicRev(g, rsi.vertices.front(), all_repeats, straw) !=
1569 NGHolder::null_vertex();
1570}
1571
1572static
1573bool hasCyclicSupersetExitPath(const NGHolder &g, const ReachSubgraph &rsi,
1574 const vector<BoundedRepeatData> &all_repeats) {
1575 vector<NFAVertex> straw;
1576 return walkStrawToCyclicFwd(g, rsi.vertices.back(), all_repeats, straw) !=
1577 NGHolder::null_vertex();
1578}
1579
1580static
1581bool leadsOnlyToAccept(const NGHolder &g, const ReachSubgraph &rsi) {
1582 const NFAVertex u = rsi.vertices.back();
1583 for (auto v : adjacent_vertices_range(u, g)) {
1584 if (v != g.accept) {
1585 return false;
1586 }
1587 }
1588 assert(out_degree(u, g));
1589 return true;
1590}
1591
1592static
1593bool allSimpleHighlander(const ReportManager &rm,
1594 const flat_set<ReportID> &reports) {
1595 assert(!reports.empty());
1596 for (auto report : reports) {
1597 if (!isSimpleExhaustible(rm.getReport(report))) {
1598 return false;
1599 }
1600 }
1601
1602 return true;
1603}
1604
1605// Finds a single, fairly unrefined trigger for the repeat by walking backwards
1606// and collecting the unioned reach at each step.
1607static
1608vector<CharReach> getUnionedTrigger(const NGHolder &g, const NFAVertex v) {
1609 const size_t MAX_TRIGGER_STEPS = 32;
1610
1611 vector<CharReach> trigger;
1612
1613 flat_set<NFAVertex> curr, next;
1614 insert(&curr, inv_adjacent_vertices(v, g));
1615
1616 if (contains(curr, g.start)) {
1617 DEBUG_PRINTF("start in repeat's immediate preds\n");
1618 trigger.push_back(CharReach::dot()); // Trigger could be anything!
1619 return trigger;
1620 }
1621
1622 for (size_t num_steps = 0; num_steps < MAX_TRIGGER_STEPS; num_steps++) {
1623 next.clear();
1624 trigger.push_back(CharReach());
1625 CharReach &cr = trigger.back();
1626
1627 for (auto v_c : curr) {
1628 cr |= g[v_c].char_reach;
1629 insert(&next, inv_adjacent_vertices(v_c, g));
1630 }
1631
1632 DEBUG_PRINTF("cr[%zu]=%s\n", num_steps, describeClass(cr).c_str());
1633
1634 if (next.empty() || contains(next, g.start)) {
1635 break;
1636 }
1637
1638 curr.swap(next);
1639 }
1640
1641 reverse(trigger.begin(), trigger.end());
1642 return trigger;
1643}
1644
1645static
1646vector<vector<CharReach>> getRepeatTriggers(const NGHolder &g,
1647 const NFAVertex sink) {
1648 const size_t MAX_TRIGGER_STEPS = 32;
1649 const size_t UNIONED_FALLBACK_THRESHOLD = 100;
1650
1651 using Path = deque<NFAVertex>;
1652
1653 vector<vector<CharReach>> triggers;
1654
1655 deque<Path> q; // work queue
1656 deque<Path> done; // finished paths
1657
1658 size_t max_len = MAX_TRIGGER_STEPS;
1659
1660 // Find a set of paths leading to vertex v by depth first search.
1661
1662 for (auto u : inv_adjacent_vertices_range(sink, g)) {
1663 if (is_any_start(u, g)) {
1664 triggers.push_back({}); // empty
1665 return triggers;
1666 }
1667 q.push_back(Path(1, u));
1668 }
1669
1670 while (!q.empty()) {
1671 Path &path = q.front();
1672 NFAVertex v = path.back();
1673
1674 if (path.size() >= max_len) {
1675 max_len = min(max_len, path.size());
1676 done.push_back(path);
1677 goto next_path;
1678 }
1679
1680 for (auto u : inv_adjacent_vertices_range(v, g)) {
1681 if (is_any_start(u, g)) {
1682 // Found an accept. There's no point expanding this path any
1683 // further, we're done.
1684 max_len = min(max_len, path.size());
1685 done.push_back(path);
1686 goto next_path;
1687 }
1688
1689 if (path.size() + 1 >= max_len) {
1690 done.push_back(path);
1691 done.back().push_back(u);
1692 } else {
1693 q.push_back(path); // copy
1694 q.back().push_back(u);
1695 }
1696 }
1697
1698 next_path:
1699 q.pop_front();
1700
1701 // If our queue or our finished trigger list gets too large, fall back
1702 // to generating a single trigger with union reach.
1703 if (q.size() + done.size() > UNIONED_FALLBACK_THRESHOLD) {
1704 DEBUG_PRINTF("search too large, fall back to union trigger\n");
1705 triggers.clear();
1706 triggers.push_back(getUnionedTrigger(g, sink));
1707 return triggers;
1708 }
1709 }
1710
1711 assert(!done.empty());
1712
1713 // Convert our path list into a set of unique triggers.
1714 ue2_unordered_set<vector<CharReach>> unique_triggers;
1715 for (const auto &path : done) {
1716 vector<CharReach> reach_path;
1717 for (auto jt = path.rbegin(), jte = path.rend(); jt != jte; ++jt) {
1718 reach_path.push_back(g[*jt].char_reach);
1719 }
1720 unique_triggers.insert(reach_path);
1721 }
1722
1723 insert(&triggers, triggers.end(), unique_triggers);
1724 sort(triggers.begin(), triggers.end());
1725 DEBUG_PRINTF("built %zu unique triggers, max_len=%zu\n", triggers.size(),
1726 max_len);
1727 return triggers;
1728}
1729
1730static
1731void findMinPeriod(const NGHolder &g,
1732 const map<u32, vector<vector<CharReach>>> &triggers,
1733 ReachSubgraph &rsi) {
1734 const auto v = rsi.vertices.front();
1735 const CharReach &cr = g[v].char_reach;
1736
1737 vector<vector<CharReach>> repeat_triggers;
1738
1739 if (is_triggered(g)) {
1740 // Construct a temporary copy of the graph that also contains its
1741 // triggers, potentially lengthening the repeat's triggers.
1742 NGHolder tg;
1743 unordered_map<NFAVertex, NFAVertex> tg_map;
1744 cloneHolder(tg, g, &tg_map);
1745 addTriggers(tg, triggers);
1746 assert(contains(tg_map, v));
1747 repeat_triggers = getRepeatTriggers(tg, tg_map.at(v));
1748 } else {
1749 // Not triggered, no need to mutate the graph.
1750 repeat_triggers = getRepeatTriggers(g, v);
1751 }
1752
1753 rsi.minPeriod = minPeriod(repeat_triggers, cr, &rsi.is_reset);
1754 DEBUG_PRINTF("%zu triggers, minPeriod=%u, is_reset=%d\n",
1755 repeat_triggers.size(), rsi.minPeriod, (int)rsi.is_reset);
1756}
1757
1758static
1759void
1760selectHistoryScheme(const NGHolder &g, const ReportManager *rm,
1761 ReachSubgraph &rsi,
1762 const unordered_map<NFAVertex, NFAVertexDepth> &depths,
1763 const unordered_set<NFAVertex> &reached_by_fixed_tops,
1764 const map<u32, vector<vector<CharReach>>> &triggers,
1765 const vector<BoundedRepeatData> &all_repeats,
1766 const bool simple_model_selection) {
1767 // {N,} cases use the FIRST history mechanism.
1768 if (rsi.repeatMax.is_infinite()) {
1769 DEBUG_PRINTF("selected FIRST history\n");
1770 rsi.historyType = REPEAT_FIRST;
1771 return;
1772 }
1773
1774 /* If we have a repeat which only raises a highlander, only the first match
1775 * matters */
1776 if (rm && leadsOnlyToAccept(g, rsi)
1777 && allSimpleHighlander(*rm, g[rsi.vertices.back()].reports)) {
1778 DEBUG_PRINTF("selected FIRST history (as highlander)\n");
1779 rsi.historyType = REPEAT_FIRST;
1780 rsi.repeatMax = depth::infinity(); /* for consistency */
1781 return;
1782 }
1783
1784 // {N,M} cases can use the FIRST mechanism if they follow a cyclic which
1785 // includes their reachability via a "straw" path. (see UE-1589)
1786 if (hasCyclicSupersetEntryPath(g, rsi, all_repeats)) {
1787 DEBUG_PRINTF("selected FIRST history due to cyclic pred with "
1788 "superset of reach\n");
1789 rsi.historyType = REPEAT_FIRST;
1790 rsi.repeatMax = depth::infinity(); /* will continue to pump out matches */
1791 return;
1792 }
1793
1794 // Similarly, {N,M} cases can use the FIRST mechanism if they precede a
1795 // cyclic which includes their reachability via a "straw" path.
1796 if (hasCyclicSupersetExitPath(g, rsi, all_repeats)) {
1797 DEBUG_PRINTF("selected FIRST history due to cyclic succ with "
1798 "superset of reach\n");
1799 rsi.historyType = REPEAT_FIRST;
1800 rsi.repeatMax = depth::infinity(); /* will continue to pump out matches */
1801 return;
1802 }
1803
1804 // Could have skip edges and therefore be a {0,N} repeat.
1805 if (rsi.repeatMin == depth(1) && hasSkipEdges(g, rsi)) {
1806 DEBUG_PRINTF("selected LAST history\n");
1807 rsi.historyType = REPEAT_LAST;
1808 return;
1809 }
1810
1811 // Fill minPeriod, is_reset flags
1812 findMinPeriod(g, triggers, rsi);
1813
1814 // If we can't re-enter this cyclic state, we have a reset case.
1815 // This check can be very expensive, so we don't do it if we've been asked
1816 // for simple model selection.
1817 if (!simple_model_selection && !rsi.is_reset &&
1818 hasSoleEntry(g, rsi, depths, reached_by_fixed_tops, triggers)) {
1819 DEBUG_PRINTF("repeat is sole entry -> reset\n");
1820 rsi.is_reset = true;
1821 }
1822
1823 // We can lean on the common selection code for the remainder of our repeat
1824 // models.
1825 rsi.historyType = chooseRepeatType(rsi.repeatMin, rsi.repeatMax,
1826 rsi.minPeriod, rsi.is_reset);
1827}
1828
1829static
1830void buildFeeder(NGHolder &g, const BoundedRepeatData &rd,
1831 unordered_set<NFAVertex> &created,
1832 const vector<NFAVertex> &straw) {
1833 if (!g[rd.cyclic].char_reach.all()) {
1834 // Create another cyclic feeder state with flipped reach. It has an
1835 // edge from the repeat's cyclic state and pos_trigger, an edge to the
1836 // straw, and edges from every vertex along the straw.
1837 NFAVertex feeder = clone_vertex(g, rd.cyclic);
1838 created.insert(feeder);
1839 g[feeder].char_reach.flip();
1840 add_edge(feeder, feeder, g);
1841 add_edge(rd.pos_trigger, feeder, g);
1842 add_edge(rd.cyclic, feeder, g);
1843 add_edge(feeder, straw.front(), g);
1844
1845 // An edge from every vertex in the straw.
1846 for (auto v : straw) {
1847 add_edge(v, feeder, g);
1848 }
1849
1850 // An edge to the feeder from the first vertex in the straw and all of
1851 // its predecessors (other than the feeder itself, we've already
1852 // created that edge!)
1853 for (auto u : inv_adjacent_vertices_range(straw.front(), g)) {
1854 if (u == feeder) {
1855 continue;
1856 }
1857 add_edge(u, feeder, g);
1858 }
1859
1860 DEBUG_PRINTF("added feeder %zu\n", g[feeder].index);
1861 } else {
1862 // No neg trigger means feeder is empty, and unnecessary.
1863 assert(g[rd.pos_trigger].char_reach.all());
1864 }
1865}
1866
1867/**
1868 * If we have a leading first repeat, we can split startDs so that it is not
1869 * cyclic so that the repeat is only triggered once, rather than every byte. If we
1870 * perform this transform we must create another cyclic state to retrigger the
1871 * repeat after we see an escape for the repeat.
1872 *
1873 * We do not use the anchored start state to allow us to restart the NFA at a deep
1874 * offset.
1875 */
1876static
1877bool improveLeadingRepeat(NGHolder &g, BoundedRepeatData &rd,
1878 unordered_set<NFAVertex> &created,
1879 const vector<BoundedRepeatData> &all_repeats) {
1880 assert(edge(g.startDs, g.startDs, g).second);
1881
1882 // UE-1617: can rewire FIRST history cases that are preceded by
1883 // startDs.
1884 if (rd.type != REPEAT_FIRST) {
1885 return false;
1886 }
1887
1888 const CharReach &cyc_cr = g[rd.cyclic].char_reach;
1889
1890 // This transformation is only worth doing if this would allow us to
1891 // accelerate the cyclic state (UE-2055).
1892 if ((~cyc_cr).count() > ACCEL_MAX_STOP_CHAR) {
1893 DEBUG_PRINTF("we wouldn't be able to accel this case\n");
1894 return false;
1895 }
1896
1897 vector<NFAVertex> straw;
1898 NFAVertex pred =
1899 walkStrawToCyclicRev(g, rd.pos_trigger, all_repeats, straw);
1900 if (pred != g.startDs) {
1901 DEBUG_PRINTF("straw walk doesn't lead to startDs\n");
1902 return false;
1903 }
1904
1905 // This transformation is only safe if the straw path from startDs that
1906 // we've discovered can *only* lead to this repeat, since we're going to
1907 // remove the self-loop on startDs.
1908 if (proper_out_degree(g.startDs, g) > 1) {
1909 DEBUG_PRINTF("startDs has other successors\n");
1910 return false;
1911 }
1912 for (const auto &v : straw) {
1913 if (proper_out_degree(v, g) != 1) {
1914 DEBUG_PRINTF("branch between startDs and repeat, from vertex %zu\n",
1915 g[v].index);
1916 return false;
1917 }
1918 }
1919
1920 if (g[rd.pos_trigger].char_reach.count() < ACCEL_MAX_STOP_CHAR) {
1921 DEBUG_PRINTF("entry is narrow, could be accelerable\n");
1922 return false;
1923 }
1924
1925 assert(!straw.empty());
1926
1927 /* If there is overlap between the feeder and the first vertex in the straw
1928 * fun things happen. TODO: handle fun things happening (requires more
1929 * edges and more vertices). */
1930 if (!g[straw.front()].char_reach.isSubsetOf(cyc_cr)) {
1931 DEBUG_PRINTF("straw has `interesting' reach\n");
1932 return false;
1933 }
1934
1935 DEBUG_PRINTF("repeat can be improved by removing startDs loop!\n");
1936
1937 // Remove the self-loop on startDs! What a blast!
1938 remove_edge(g.startDs, g.startDs, g);
1939
1940 // Wire up feeder state to straw.
1941 buildFeeder(g, rd, created, straw);
1942
1943 return true;
1944}
1945
1946static
1947vector<NFAVertex> makeOwnStraw(NGHolder &g, BoundedRepeatData &rd,
1948 const vector<NFAVertex> &straw) {
1949 // Straw runs from startDs to our pos trigger.
1950 assert(!straw.empty());
1951 assert(edge(g.startDs, straw.front(), g).second);
1952 assert(edge(straw.back(), rd.pos_trigger, g).second);
1953
1954 vector<NFAVertex> own_straw;
1955 for (const auto &v : straw) {
1956 NFAVertex v2 = clone_vertex(g, v);
1957 if (hasSelfLoop(v, g)) {
1958 add_edge(v2, v2, g);
1959 }
1960 if (!own_straw.empty()) {
1961 add_edge(own_straw.back(), v2, g);
1962 }
1963 own_straw.push_back(v2);
1964 }
1965
1966 // Wire our straw to start, not startDs.
1967 add_edge(g.start, own_straw.front(), g);
1968
1969 // Swap over to using our own straw to get to the POS trigger.
1970 remove_edge(straw.back(), rd.pos_trigger, g);
1971 add_edge(own_straw.back(), rd.pos_trigger, g);
1972
1973 return own_straw;
1974}
1975
1976/**
1977 * Specialized version of improveLeadingRepeat for outfixes, in which we can
1978 * rewire the straw to start instead of removing the startDs self-loop.
1979 */
1980static
1981bool improveLeadingRepeatOutfix(NGHolder &g, BoundedRepeatData &rd,
1982 unordered_set<NFAVertex> &created,
1983 const vector<BoundedRepeatData> &all_repeats) {
1984 assert(g.kind == NFA_OUTFIX);
1985
1986 // UE-1617: can rewire FIRST history cases that are preceded by
1987 // startDs.
1988 if (rd.type != REPEAT_FIRST) {
1989 return false;
1990 }
1991
1992 const CharReach &cyc_cr = g[rd.cyclic].char_reach;
1993
1994 // This transformation is only worth doing if this would allow us to
1995 // accelerate the cyclic state (UE-2055).
1996 if ((~cyc_cr).count() > ACCEL_MAX_STOP_CHAR) {
1997 DEBUG_PRINTF("we wouldn't be able to accel this case\n");
1998 return false;
1999 }
2000
2001 vector<NFAVertex> straw;
2002 NFAVertex pred =
2003 walkStrawToCyclicRev(g, rd.pos_trigger, all_repeats, straw);
2004 if (pred != g.startDs) {
2005 DEBUG_PRINTF("straw walk doesn't lead to startDs\n");
2006 return false;
2007 }
2008
2009 if (g[rd.pos_trigger].char_reach.count() < ACCEL_MAX_STOP_CHAR) {
2010 DEBUG_PRINTF("entry is narrow, could be accelerable\n");
2011 return false;
2012 }
2013
2014 assert(!straw.empty());
2015
2016 /* If there is overlap between the feeder and the first vertex in the straw
2017 * fun things happen. TODO: handle fun things happening (requires more
2018 * edges and more vertices). */
2019 if (!g[straw.front()].char_reach.isSubsetOf(cyc_cr)) {
2020 DEBUG_PRINTF("straw has `interesting' reach\n");
2021 return false;
2022 }
2023
2024 DEBUG_PRINTF("repeat can be improved by rebuilding its entry\n");
2025
2026 const auto own_straw = makeOwnStraw(g, rd, straw);
2027 insert(&created, own_straw);
2028
2029 // Wire up feeder state to our new straw.
2030 buildFeeder(g, rd, created, own_straw);
2031
2032 // We may no longer need the original straw.
2033 pruneUseless(g);
2034
2035 return true;
2036}
2037
2038/** Returns true if doing the bounded repeat transformation on this case
2039 * results in a smaller NFA model. */
2040static
2041bool givesBetterModel(const NGHolder &g, const vector<ReachSubgraph> &rs) {
2042 static const u32 MAX_FAST_STATES = 128; // bigger NFAs are fat and slow.
2043
2044 // We use vertex count as an upper bound for the number of states.
2045 u32 curr_states = num_vertices(g) - 2; // accepts don't have states
2046
2047 if (curr_states <= MAX_FAST_STATES) {
2048 return false;
2049 }
2050 if (curr_states > NFA_MAX_STATES) {
2051 return true;
2052 }
2053
2054 u32 expected_states = curr_states;
2055 for (const auto &rsi : rs) {
2056 /* may be off as unpeeling not done yet */
2057 expected_states += 2; /* cyclic and pos */
2058 expected_states -= rsi.vertices.size();
2059 }
2060
2061 return ROUNDUP_N(curr_states, 128) != ROUNDUP_N(expected_states, 128);
2062}
2063
2064/** True if this repeat terminates with a vertex that leads only to accept. */
2065static
2066bool endsInAccept(const NGHolder &g, const ReachSubgraph &rsi) {
2067 NFAVertex last = rsi.vertices.back();
2068 return getSoleDestVertex(g, last) == g.accept;
2069}
2070
2071static
2072bool endsInAcceptEod(const NGHolder &g, const ReachSubgraph &rsi) {
2073 NFAVertex last = rsi.vertices.back();
2074 return getSoleDestVertex(g, last) == g.acceptEod;
2075}
2076
2077namespace {
2078class pfti_visitor : public boost::default_dfs_visitor {
2079public:
2080 pfti_visitor(unordered_map<NFAVertex, depth> &top_depths_in,
2081 const depth &our_depth_in)
2082 : top_depths(top_depths_in), our_depth(our_depth_in) {}
2083
2084 void discover_vertex(NFAVertex v, UNUSED const NGHolder &g) {
2085 DEBUG_PRINTF("discovered %zu (depth %s)\n", g[v].index,
2086 our_depth.str().c_str());
2087
2088 auto it = top_depths.find(v);
2089 if (it != top_depths.end() && it->second != our_depth) {
2090 // already seen at a different depth, remove from consideration.
2091 it->second = depth::infinity();
2092 } else {
2093 top_depths[v] = our_depth;
2094 }
2095 }
2096 unordered_map<NFAVertex, depth> &top_depths;
2097 const depth &our_depth;
2098};
2099} // namespace
2100
2101static
2102void populateFixedTopInfo(const map<u32, u32> &fixed_depth_tops,
2103 const NGHolder &g,
2104 unordered_set<NFAVertex> *reached_by_fixed_tops) {
2105 if (fixed_depth_tops.empty()) {
2106 return; /* we will never find anything */
2107 }
2108
2109 assert(!proper_out_degree(g.startDs, g));
2110 unordered_map<NFAVertex, depth> top_depths;
2111 auto colours = make_small_color_map(g);
2112
2113 for (const auto &e : out_edges_range(g.start, g)) {
2114 NFAVertex v = target(e, g);
2115 if (v == g.startDs) {
2116 continue;
2117 }
2118
2119 depth td = depth::infinity();
2120 for (u32 top : g[e].tops) {
2121 if (!contains(fixed_depth_tops, top)) {
2122 td = depth::infinity();
2123 break;
2124 }
2125 depth td_t(fixed_depth_tops.at(top));
2126 if (td == td_t) {
2127 continue;
2128 } else if (td == depth::infinity()) {
2129 td = td_t;
2130 } else {
2131 td = depth::infinity();
2132 break;
2133 }
2134 }
2135
2136 DEBUG_PRINTF("scanning from %zu depth=%s\n", g[v].index,
2137 td.str().c_str());
2138 /* for each vertex reachable from v update its map to reflect that it is
2139 * reachable from a top of depth td. */
2140
2141 depth_first_visit(g, v, pfti_visitor(top_depths, td), colours);
2142 }
2143
2144 for (const auto &v_depth : top_depths) {
2145 const NFAVertex v = v_depth.first;
2146 const depth &d = v_depth.second;
2147 if (d.is_finite()) {
2148 DEBUG_PRINTF("%zu reached by fixed tops at depth %s\n",
2149 g[v].index, d.str().c_str());
2150 reached_by_fixed_tops->insert(v);
2151 }
2152 }
2153}
2154
2155#ifndef NDEBUG
2156/** Assertion use only. Returns true if the given bounded repeats share any
2157 * vertices, which we don't allow. */
2158static
2159bool hasOverlappingRepeats(UNUSED const NGHolder &g,
2160 const vector<BoundedRepeatData> &repeats) {
2161 unordered_set<NFAVertex> involved;
2162
2163 for (const auto &br : repeats) {
2164 if (contains(involved, br.cyclic)) {
2165 DEBUG_PRINTF("already seen cyclic %zu\n", g[br.cyclic].index);
2166 return true;
2167 }
2168 if (contains(involved, br.pos_trigger)) {
2169 DEBUG_PRINTF("already seen pos %zu\n", g[br.pos_trigger].index);
2170 return true;
2171 }
2172 for (auto v : br.tug_triggers) {
2173 if (contains(involved, v)) {
2174 DEBUG_PRINTF("already seen tug %zu\n", g[v].index);
2175 return true;
2176 }
2177 }
2178
2179 involved.insert(br.cyclic);
2180 involved.insert(br.pos_trigger);
2181 involved.insert(br.tug_triggers.begin(), br.tug_triggers.end());
2182 }
2183
2184 return false;
2185}
2186
2187#endif // NDEBUG
2188
2189/**
2190 * Identifies so-called "nasty" repeats, in which the reachability of both the
2191 * repeat itself and its tugs are wide, which means that executing the NFA will
2192 * likely be bogged down in exception processing.
2193 */
2194static
2195bool repeatIsNasty(const NGHolder &g, const ReachSubgraph &rsi,
2196 const unordered_map<NFAVertex, NFAVertexDepth> &depths) {
2197 if (num_vertices(g) > NFA_MAX_STATES) {
2198 // We may have no choice but to implement this repeat to get the graph
2199 // down to a tractable number of vertices.
2200 return false;
2201 }
2202
2203 if (!generates_callbacks(g) && endsInAccept(g, rsi)) {
2204 DEBUG_PRINTF("would generate a lazy tug, repeat is OK\n");
2205 return false;
2206 }
2207
2208 const NFAVertex first = rsi.vertices.front();
2209 DEBUG_PRINTF("min depth from startds = %s\n",
2210 depths.at(first).fromStartDotStar.min.str().c_str());
2211 if (depths.at(first).fromStartDotStar.min > depth(2)) {
2212 return false;
2213 }
2214
2215 NFAVertex last = rsi.vertices.back();
2216 const CharReach &cyclicreach = g[last].char_reach;
2217 CharReach tugreach;
2218 for (auto v : adjacent_vertices_range(last, g)) {
2219 if (v == last || is_special(v, g)) {
2220 continue;
2221 }
2222 tugreach |= g[v].char_reach;
2223 }
2224 // Deal with unpeeled cases.
2225 if (tugreach.none()) {
2226 tugreach = cyclicreach;
2227 }
2228 DEBUG_PRINTF("tugreach.count=%zu, cyclicreach.count=%zu\n",
2229 tugreach.count(), cyclicreach.count());
2230 return (tugreach.count() > 200) && (cyclicreach.count() > 200);
2231}
2232
2233void analyseRepeats(NGHolder &g, const ReportManager *rm,
2234 const map<u32, u32> &fixed_depth_tops,
2235 const map<u32, vector<vector<CharReach>>> &triggers,
2236 vector<BoundedRepeatData> *repeats, bool streaming,
2237 bool simple_model_selection, const Grey &grey,
2238 bool *reformed_start_ds) {
2239 if (!grey.allowExtendedNFA || !grey.allowLimExNFA) {
2240 return;
2241 }
2242
2243 // Quick sanity test.
2244 assert(allMatchStatesHaveReports(g));
2245
2246#ifndef NDEBUG
2247 // So we can assert that the number of tops hasn't changed at the end of
2248 // this analysis.
2249 const flat_set<u32> allTops = getTops(g);
2250#endif
2251
2252 // Later on, we're (a little bit) dependent on depth information for
2253 // unpeeling and so forth. Note that these depths MUST be maintained when
2254 // new vertices are added.
2255 unordered_map<NFAVertex, NFAVertexDepth> depths;
2256 findInitDepths(g, depths);
2257
2258 // Construct our list of subgraphs with the same reach using BGL magic.
2259 vector<ReachSubgraph> rs;
2260 buildReachSubgraphs(g, rs, grey.minExtBoundedRepeatSize);
2261
2262 // Validate and split subgraphs.
2263 checkReachSubgraphs(g, rs, grey.minExtBoundedRepeatSize);
2264
2265 // Identify which subgraphs represent bounded repeats in forms ("cliches")
2266 // that we accept, and mark the others as bad.
2267 for (auto &rsi: rs) {
2268 if (!processSubgraph(g, rsi, grey.minExtBoundedRepeatSize)) {
2269 rsi.bad = true;
2270 continue;
2271 }
2272
2273 DEBUG_PRINTF("rsi min %s=max=%s\n", rsi.repeatMin.str().c_str(),
2274 rsi.repeatMax.str().c_str());
2275
2276 // Identify repeats with wide cyclic and tug reach which will produce
2277 // low-performance implementations and avoid doing them.
2278 if (repeatIsNasty(g, rsi, depths)) {
2279 DEBUG_PRINTF("marking nasty repeat as bad\n");
2280 rsi.bad = true;
2281 }
2282 }
2283
2284 // Remove bad cases, then sort remaining subgraphs in descending size
2285 // order.
2286 rs.erase(remove_if(rs.begin(), rs.end(),
2287 [](const ReachSubgraph &r) { return r.bad; }),
2288 rs.end());
2289 stable_sort(rs.begin(), rs.end(),
2290 [](const ReachSubgraph &a, const ReachSubgraph &b) {
2291 return a.vertices.size() > b.vertices.size();
2292 });
2293
2294 if (!streaming && !givesBetterModel(g, rs)) {
2295 /* in block mode, there is no state space so we are only looking for
2296 * performance wins */
2297 DEBUG_PRINTF("repeat would not reduce NFA model size, skipping\n");
2298 return;
2299 }
2300
2301 if (rs.empty()) {
2302 /* no good repeats */
2303 return;
2304 }
2305
2306 // Store a copy of the original, unmodified graph in case we need to revert
2307 // back: in particular, due to tug cloning it is possible to build a graph
2308 // that was bigger than the original. See UE-2370. FIXME: smarter analysis
2309 // could make this unnecessary?
2310 const unique_ptr<const NGHolder> orig_g(cloneHolder(g));
2311
2312 unordered_set<NFAVertex> reached_by_fixed_tops;
2313 if (is_triggered(g)) {
2314 populateFixedTopInfo(fixed_depth_tops, g, &reached_by_fixed_tops);
2315 }
2316
2317 // Go to town on the remaining acceptable subgraphs.
2318 unordered_set<NFAVertex> created;
2319 for (auto &rsi : rs) {
2320 DEBUG_PRINTF("subgraph (beginning vertex %zu) is a {%s,%s} repeat\n",
2321 g[rsi.vertices.front()].index,
2322 rsi.repeatMin.str().c_str(), rsi.repeatMax.str().c_str());
2323
2324 if (!peelSubgraph(g, grey, rsi, created)) {
2325 DEBUG_PRINTF("peel failed, skipping\n");
2326 continue;
2327 }
2328
2329 // Attempt to peel a vertex if we're up against startDs, for
2330 // performance reasons.
2331 peelStartDotStar(g, depths, grey, rsi);
2332
2333 // Our peeling passes may have killed off this repeat.
2334 if (rsi.bad) {
2335 continue;
2336 }
2337
2338 selectHistoryScheme(g, rm, rsi, depths, reached_by_fixed_tops, triggers,
2339 *repeats, simple_model_selection);
2340
2341 if (!generates_callbacks(g) && endsInAccept(g, rsi)) {
2342 DEBUG_PRINTF("accepty-rosy graph\n");
2343 replaceSubgraphWithLazySpecial(g, rsi, repeats, depths, created);
2344 } else if (endsInAcceptEod(g, rsi)) {
2345 DEBUG_PRINTF("accepty-rosy graph\n");
2346 replaceSubgraphWithLazySpecial(g, rsi, repeats, depths, created);
2347 } else {
2348 replaceSubgraphWithSpecial(g, rsi, repeats, depths, created);
2349 }
2350
2351 // Some of our analyses require correctly numbered vertices, so we
2352 // renumber after changes.
2353 renumber_vertices(g);
2354 }
2355
2356 bool modified_start_ds = false;
2357
2358 // We may be able to make improvements to the graph for performance
2359 // reasons. Note that this may do 'orrible things like remove the startDs
2360 // cycle, this should only happen quite late in the graph lifecycle.
2361 if (repeats->size() == 1) {
2362 if (g.kind == NFA_OUTFIX) {
2363 improveLeadingRepeatOutfix(g, repeats->back(), created, *repeats);
2364 // (Does not modify startDs, so we don't need to set
2365 // reformed_start_ds for this case.)
2366 } else {
2367 modified_start_ds =
2368 improveLeadingRepeat(g, repeats->back(), created, *repeats);
2369 }
2370 }
2371
2372 if (reformed_start_ds) {
2373 *reformed_start_ds = modified_start_ds;
2374 }
2375
2376 if (!repeats->empty()) {
2377 if (num_vertices(g) > NFA_MAX_STATES) {
2378 // We've managed to build an unimplementable NFA. Swap back to the
2379 // original.
2380 DEBUG_PRINTF("NFA has %zu vertices; swapping back to the "
2381 "original graph\n", num_vertices(g));
2382 clear_graph(g);
2383 assert(orig_g);
2384 cloneHolder(g, *orig_g);
2385 repeats->clear();
2386 }
2387
2388 // Sanity test: we don't want any repeats that share special vertices
2389 // as our construction code later can't cope with it.
2390 assert(!hasOverlappingRepeats(g, *repeats));
2391
2392 // We have modified the graph, so we need to ensure that our edges
2393 // and vertices are correctly numbered.
2394 renumber_vertices(g);
2395 renumber_edges(g);
2396 // Remove stray report IDs.
2397 clearReports(g);
2398 }
2399
2400 // Quick sanity tests.
2401 assert(allMatchStatesHaveReports(g));
2402 assert(!is_triggered(g) || getTops(g) == allTops);
2403}
2404
2405/**
2406 * \brief True if the non-special vertices in the given graph all have the same
2407 * character reachability.
2408 */
2409static
2410bool allOneReach(const NGHolder &g) {
2411 const CharReach *cr = nullptr;
2412 for (const auto &v : vertices_range(g)) {
2413 if (is_special(v, g)) {
2414 continue;
2415 }
2416 if (!cr) {
2417 cr = &g[v].char_reach;
2418 } else {
2419 if (*cr != g[v].char_reach) {
2420 return false;
2421 }
2422 }
2423 }
2424 return true;
2425}
2426
2427bool isPureRepeat(const NGHolder &g, PureRepeat &repeat) {
2428 assert(allMatchStatesHaveReports(g));
2429
2430 DEBUG_PRINTF("entry\n");
2431
2432 // Must be start anchored.
2433 assert(edge(g.startDs, g.startDs, g).second);
2434 if (out_degree(g.startDs, g) > 1) {
2435 DEBUG_PRINTF("Unanchored\n");
2436 return false;
2437 }
2438
2439 // Must not be EOD-anchored.
2440 assert(edge(g.accept, g.acceptEod, g).second);
2441 if (in_degree(g.acceptEod, g) > 1) {
2442 DEBUG_PRINTF("EOD anchored\n");
2443 return false;
2444 }
2445
2446 // Must have precisely one top.
2447 if (is_triggered(g) && !onlyOneTop(g)) {
2448 DEBUG_PRINTF("Too many tops\n");
2449 return false;
2450 }
2451
2452 if (!allOneReach(g)) {
2453 DEBUG_PRINTF("vertices with different reach\n");
2454 return false;
2455 }
2456
2457 // We allow this code to report true for any repeat, even for '.*' or '.+'
2458 // cases.
2459 const u32 minNumVertices = 1;
2460
2461 vector<ReachSubgraph> rs;
2462 buildReachSubgraphs(g, rs, minNumVertices);
2463 checkReachSubgraphs(g, rs, minNumVertices);
2464 if (rs.size() != 1) {
2465 DEBUG_PRINTF("too many subgraphs\n");
2466 return false;
2467 }
2468
2469 ReachSubgraph &rsi = *rs.begin();
2470 if (!processSubgraph(g, rsi, minNumVertices)) {
2471 DEBUG_PRINTF("not a supported repeat\n");
2472 return false;
2473 }
2474
2475 if (rsi.vertices.size() + N_SPECIALS != num_vertices(g)) {
2476 DEBUG_PRINTF("repeat doesn't span graph\n");
2477 return false;
2478 }
2479
2480 assert(!rsi.bad);
2481 assert(rsi.vertices.size() >= minNumVertices);
2482
2483 const NFAVertex v = rsi.vertices.back();
2484
2485 repeat.reach = g[v].char_reach;
2486 repeat.bounds.min = rsi.repeatMin;
2487 repeat.bounds.max = rsi.repeatMax;
2488 insert(&repeat.reports, g[v].reports);
2489
2490 if (isVacuous(g)) {
2491 // This graph might be a {0,N} or {0,} repeat. For this to be true, we
2492 // must have found a {1,N} or {1,} repeat and the start vertex must
2493 // have the same report set as the vertices in the repeat.
2494 if (repeat.bounds.min == depth(1) &&
2495 g[g.start].reports == g[v].reports) {
2496 repeat.bounds.min = depth(0);
2497 DEBUG_PRINTF("graph is %s repeat\n", repeat.bounds.str().c_str());
2498 } else {
2499 DEBUG_PRINTF("not a supported repeat\n");
2500 return false;
2501 }
2502 }
2503
2504 assert(all_reports(g) == set<ReportID>(begin(g[v].reports),
2505 end(g[v].reports)));
2506 return true;
2507}
2508
2509void findRepeats(const NGHolder &h, u32 minRepeatVertices,
2510 vector<GraphRepeatInfo> *repeats_out) {
2511 // Construct our list of subgraphs with the same reach using BGL magic.
2512 vector<ReachSubgraph> rs;
2513 buildReachSubgraphs(h, rs, minRepeatVertices);
2514 checkReachSubgraphs(h, rs, minRepeatVertices);
2515
2516 for (auto &rsi : rs) {
2517 if (!processSubgraph(h, rsi, minRepeatVertices)) {
2518 continue;
2519 }
2520
2521 DEBUG_PRINTF("rsi min=%s max=%s\n", rsi.repeatMin.str().c_str(),
2522 rsi.repeatMax.str().c_str());
2523
2524 depth repeatMax = rsi.repeatMax;
2525
2526 vector<BoundedRepeatData> all_repeats; /* we don't mutate the graph in
2527 * this path */
2528 if (hasCyclicSupersetEntryPath(h, rsi, all_repeats)) {
2529 DEBUG_PRINTF("selected FIRST history due to cyclic pred with "
2530 "superset of reach\n");
2531 repeatMax = depth::infinity(); /* will continue to pump out matches */
2532 }
2533 if (hasCyclicSupersetExitPath(h, rsi, all_repeats)) {
2534 DEBUG_PRINTF("selected FIRST history due to cyclic succ with "
2535 "superset of reach\n");
2536 repeatMax = depth::infinity(); /* will continue to pump out matches */
2537 }
2538
2539 repeats_out->push_back(GraphRepeatInfo());
2540 GraphRepeatInfo &ri = repeats_out->back();
2541 ri.vertices.swap(rsi.vertices);
2542 ri.repeatMin = rsi.repeatMin;
2543 ri.repeatMax = repeatMax;
2544 }
2545}
2546
2547} // namespace ue2
2548