1/*
2 * Copyright (c) 2015-2017, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** \file
30 * \brief Literal analysis and scoring.
31 */
32#include "ng_literal_analysis.h"
33
34#include "ng_holder.h"
35#include "ng_split.h"
36#include "ng_util.h"
37#include "ue2common.h"
38#include "rose/rose_common.h"
39#include "util/compare.h"
40#include "util/depth.h"
41#include "util/graph.h"
42#include "util/graph_range.h"
43#include "util/graph_small_color_map.h"
44#include "util/ue2_graph.h"
45#include "util/ue2string.h"
46
47#include <algorithm>
48#include <fstream>
49#include <queue>
50
51#include <boost/graph/boykov_kolmogorov_max_flow.hpp>
52
53using namespace std;
54
55namespace ue2 {
56
57/** Maximum number of paths to generate. */
58static const u32 MAX_WIDTH = 11;
59
60/** Scoring adjustment for 'uniqueness' in literal. */
61static const u64a WEIGHT_OF_UNIQUENESS = 250;
62
63namespace {
64
65/* Small literal graph type used for the suffix tree used in
66 * compressAndScore. */
67
68struct LitGraphVertexProps {
69 LitGraphVertexProps() = default;
70 explicit LitGraphVertexProps(ue2_literal::elem c_in) : c(move(c_in)) {}
71 ue2_literal::elem c; // string element (char + bool)
72 size_t index; // managed by ue2_graph
73};
74
75struct LitGraphEdgeProps {
76 LitGraphEdgeProps() = default;
77 explicit LitGraphEdgeProps(u64a score_in) : score(score_in) {}
78 u64a score = NO_LITERAL_AT_EDGE_SCORE;
79 size_t index; // managed by ue2_graph
80};
81
82struct LitGraph
83 : public ue2_graph<LitGraph, LitGraphVertexProps, LitGraphEdgeProps> {
84
85 LitGraph() : root(add_vertex(*this)), sink(add_vertex(*this)) {}
86
87 const vertex_descriptor root;
88 const vertex_descriptor sink;
89};
90
91typedef LitGraph::vertex_descriptor LitVertex;
92typedef LitGraph::edge_descriptor LitEdge;
93
94typedef pair<LitVertex, NFAVertex> VertexPair;
95typedef std::queue<VertexPair> LitVertexQ;
96
97} // namespace
98
99#ifdef DUMP_SUPPORT
100
101/** \brief Dump the literal graph in Graphviz format. */
102static UNUSED
103void dumpGraph(const char *filename, const LitGraph &lg) {
104 ofstream fout(filename);
105
106 fout << "digraph G {" << endl;
107
108 for (auto v : vertices_range(lg)) {
109 fout << lg[v].index;
110 if (v == lg.root) {
111 fout << "[label=\"ROOT\"];";
112 } else if (v == lg.sink) {
113 fout << "[label=\"SINK\"];";
114 } else {
115 ue2_literal s;
116 s.push_back(lg[v].c);
117 fout << "[label=\"" << dumpString(s) << "\"];";
118 }
119 fout << endl;
120 }
121
122 for (const auto &e : edges_range(lg)) {
123 LitVertex u = source(e, lg), v = target(e, lg);
124 fout << lg[u].index << " -> " << lg[v].index << "[label=\""
125 << lg[e].score << "\"]"
126 << ";" << endl;
127 }
128
129 fout << "}" << endl;
130}
131
132#endif // DUMP_SUPPORT
133
134static
135bool allowExpand(size_t numItems, size_t totalPathsSoFar) {
136 if (numItems == 0) {
137 return false;
138 }
139
140 if (numItems + totalPathsSoFar > MAX_WIDTH) {
141 return false;
142 }
143
144 return true;
145}
146
147static
148LitVertex addToLitGraph(LitGraph &lg, LitVertex pred,
149 const ue2_literal::elem &c) {
150 // Check if we already have this in the graph.
151 for (auto v : adjacent_vertices_range(pred, lg)) {
152 if (v == lg.sink) {
153 continue;
154 }
155 if (lg[v].c == c) {
156 return v;
157 }
158 }
159
160 LitVertex lv = add_vertex(LitGraphVertexProps(c), lg);
161 add_edge(pred, lv, lg);
162 return lv;
163}
164
165static
166void addToQueue(LitVertexQ &workQ, LitGraph &lg, LitVertex pred,
167 const CharReach &cr, NFAVertex v) {
168 for (size_t i = cr.find_first(); i != CharReach::npos;
169 i = cr.find_next(i)) {
170 if (myisupper(i) && cr.test(mytolower(i))) {
171 // ignore upper half of a nocase pair
172 continue;
173 }
174
175 bool nocase = myislower(i) && cr.test(mytoupper(i));
176 ue2_literal::elem c((char)i, nocase);
177 LitVertex lv = addToLitGraph(lg, pred, c);
178 workQ.push(VertexPair(lv, v));
179 }
180}
181
182static
183void initWorkQueue(LitVertexQ &workQ, LitGraph &lg, const NGHolder &g,
184 const NFAEdge &e) {
185 NFAVertex u = source(e, g);
186 NFAVertex v = target(e, g);
187 const CharReach &cr = g[v].char_reach;
188
189 if (!allowExpand(cr.count(), 0)) {
190 return;
191 }
192
193 addToQueue(workQ, lg, lg.root, cr, u);
194}
195
196static
197u32 crCardinality(const CharReach &cr) {
198 // Special-case for handling dots, much faster than running the find_next
199 // loop below.
200 if (cr.all()) {
201 return 230; // [^A-Z]
202 }
203
204 u32 rv = 0;
205 for (size_t i = cr.find_first(); i != CharReach::npos;
206 i = cr.find_next(i)) {
207 if (myisupper(i) && cr.test(mytolower(i))) {
208 // ignore upper half of a nocase pair
209 continue;
210 }
211 rv++;
212 }
213
214 return rv;
215}
216
217/** Filter out literals that include other literals as suffixes. We do this by
218 * identifying vertices connected to the sink and removing their other
219 * out-edges. */
220static
221void filterLitGraph(LitGraph &lg) {
222 for (auto v : inv_adjacent_vertices_range(lg.sink, lg)) {
223 remove_out_edge_if(v, [&lg](const LitEdge &e) {
224 return target(e, lg) != lg.sink;
225 }, lg);
226 }
227
228 // We could do a DFS-and-prune here, if we wanted. Right now, we just
229 // handle it in extractLiterals by throwing away paths that don't run all
230 // the way from sink to root.
231}
232
233/** Extracts all the literals from the given literal graph. Walks the graph
234 * from each predecessor of the sink (note: it's a suffix tree except for this
235 * convenience) towards the source, storing each string as we go. */
236static
237void extractLiterals(const LitGraph &lg, set<ue2_literal> &s) {
238 ue2_literal lit;
239
240 for (auto u : inv_adjacent_vertices_range(lg.sink, lg)) {
241 lit.clear();
242 while (u != lg.root) {
243 lit.push_back(lg[u].c);
244 assert(in_degree(u, lg) <= 1);
245 LitGraph::inv_adjacency_iterator ai2, ae2;
246 tie(ai2, ae2) = inv_adjacent_vertices(u, lg);
247 if (ai2 == ae2) {
248 // Path has been cut, time for the next literal.
249 goto next_literal;
250 }
251 u = *ai2;
252 }
253 s.insert(lit);
254next_literal:
255 ;
256 }
257}
258
259#ifndef NDEBUG
260static
261bool hasSuffixLiterals(const set<ue2_literal> &s) {
262 for (auto it = s.begin(), ite = s.end(); it != ite; ++it) {
263 for (auto jt = std::next(it); jt != ite; ++jt) {
264 if (isSuffix(*it, *jt) || isSuffix(*jt, *it)) {
265 DEBUG_PRINTF("'%s' and '%s' have suffix issues\n",
266 dumpString(*it).c_str(),
267 dumpString(*jt).c_str());
268 return true;
269 }
270 }
271 }
272 return false;
273}
274#endif
275
276static
277void processWorkQueue(const NGHolder &g, const NFAEdge &e,
278 set<ue2_literal> &s) {
279 if (is_special(target(e, g), g)) {
280 return;
281 }
282
283 LitGraph lg;
284
285 LitVertexQ workQ;
286 initWorkQueue(workQ, lg, g, e);
287
288 while (!workQ.empty()) {
289 const LitVertex lv = workQ.front().first;
290 const NFAVertex &t = workQ.front().second;
291 const CharReach &cr = g[t].char_reach;
292
293 u32 cr_card = crCardinality(cr);
294 size_t numItems = cr_card * in_degree(t, g);
295 size_t committed_count = workQ.size() + in_degree(lg.sink, lg) - 1;
296
297 if (g[t].index == NODE_START) {
298 // reached start, add to literal set
299 add_edge_if_not_present(lv, lg.sink, lg);
300 goto next_work_elem;
301 }
302
303 // Expand next vertex
304 if (allowExpand(numItems, committed_count)) {
305 for (auto u : inv_adjacent_vertices_range(t, g)) {
306 addToQueue(workQ, lg, lv, cr, u);
307 }
308 goto next_work_elem;
309 }
310
311 // Expand this vertex
312 if (allowExpand(cr_card, committed_count)) {
313 for (size_t i = cr.find_first(); i != CharReach::npos;
314 i = cr.find_next(i)) {
315 if (myisupper(i) && cr.test(mytolower(i))) {
316 // ignore upper half of a nocase pair
317 continue;
318 }
319
320 bool nocase = myislower(i) && cr.test(mytoupper(i));
321 ue2_literal::elem c((char)i, nocase);
322 LitVertex lt = addToLitGraph(lg, lv, c);
323 add_edge_if_not_present(lt, lg.sink, lg);
324 }
325 goto next_work_elem;
326 }
327
328 // add to literal set
329 add_edge_if_not_present(lv, lg.sink, lg);
330 next_work_elem:
331 workQ.pop();
332 }
333
334 filterLitGraph(lg);
335 //dumpGraph("litgraph.dot", lg);
336 extractLiterals(lg, s);
337
338 // Our literal set should contain no literal that is a suffix of another.
339 assert(!hasSuffixLiterals(s));
340
341 DEBUG_PRINTF("edge %zu (%zu->%zu) produced %zu literals\n", g[e].index,
342 g[source(e, g)].index, g[target(e, g)].index, s.size());
343}
344
345bool bad_mixed_sensitivity(const ue2_literal &s) {
346 /* TODO: if the mixed cases is entirely within MAX_MASK2_WIDTH of the end,
347 * we should be able to handle it */
348 return mixed_sensitivity(s) && s.length() > MAX_MASK2_WIDTH;
349}
350
351static
352u64a litUniqueness(const string &s) {
353 CharReach seen(s);
354 return seen.count();
355}
356
357/** Count the significant bits of this literal (i.e. seven for nocase alpha,
358 * eight for everything else). */
359static
360u64a litCountBits(const ue2_literal &lit) {
361 u64a n = 0;
362 for (const auto &c : lit) {
363 n += c.nocase ? 7 : 8;
364 }
365 return n;
366}
367
368/** Returns a fairly arbitrary score for the given literal, used to compare the
369 * suitability of different candidates. */
370static
371u64a scoreLiteral(const ue2_literal &s) {
372 // old scoring scheme: SUM(s in S: 1/s.len()^2)
373 // now weight (currently 75/25) with number of unique chars
374 // in the string
375 u64a len = litCountBits(s);
376 u64a lenUnique = litUniqueness(s.get_string()) * 8;
377
378 u64a weightedLen = (1000ULL - WEIGHT_OF_UNIQUENESS) * len +
379 WEIGHT_OF_UNIQUENESS * lenUnique;
380 weightedLen /= 8;
381
382 DEBUG_PRINTF("scored literal '%s' %llu\n",
383 escapeString(s.get_string()).c_str(), weightedLen);
384
385 return weightedLen;
386}
387
388
389/**
390 * calculateScore has the following properties:
391 * - score of literal is the same as the score of the reversed literal;
392 * - score of substring of literal is worse than the original literal's score;
393 * - score of any literal should be non-zero.
394 */
395static
396u64a calculateScore(const ue2_literal &s) {
397 if (s.empty()) {
398 return NO_LITERAL_AT_EDGE_SCORE;
399 }
400
401 u64a weightedLen = scoreLiteral(s);
402
403 DEBUG_PRINTF("len %zu, wl %llu\n", s.length(), weightedLen);
404 u64a rv = 1000000000000000ULL/(weightedLen * weightedLen * weightedLen);
405
406 if (!rv) {
407 rv = 1;
408 }
409 DEBUG_PRINTF("len %zu, score %llu\n", s.length(), rv);
410 return rv;
411}
412
413/** Adds a literal in reverse order, building up a suffix tree. */
414static
415void addReversedLiteral(const ue2_literal &lit, LitGraph &lg) {
416 DEBUG_PRINTF("literal: '%s'\n", escapeString(lit).c_str());
417 ue2_literal suffix;
418 LitVertex v = lg.root;
419 for (auto it = lit.rbegin(), ite = lit.rend(); it != ite; ++it) {
420 suffix.push_back(*it);
421 LitVertex w;
422 for (auto v2 : adjacent_vertices_range(v, lg)) {
423 if (v2 != lg.sink && lg[v2].c == *it) {
424 w = v2;
425 goto next_char;
426 }
427 }
428 w = add_vertex(LitGraphVertexProps(*it), lg);
429 add_edge(v, w, LitGraphEdgeProps(calculateScore(suffix)), lg);
430next_char:
431 v = w;
432 }
433
434 // Wire the last vertex to the sink.
435 add_edge(v, lg.sink, lg);
436}
437
438static
439void extractLiterals(const vector<LitEdge> &cutset, const LitGraph &lg,
440 set<ue2_literal> &s) {
441 for (const auto &e : cutset) {
442 LitVertex u = source(e, lg);
443 LitVertex v = target(e, lg);
444 ue2_literal lit;
445 lit.push_back(lg[v].c);
446 while (u != lg.root) {
447 lit.push_back(lg[u].c);
448 assert(in_degree(u, lg) == 1);
449 LitGraph::inv_adjacency_iterator ai, ae;
450 tie(ai, ae) = inv_adjacent_vertices(u, lg);
451 if (ai == ae) {
452 // Path has been cut, time for the next literal.
453 goto next_literal;
454 }
455 u = *ai;
456 }
457 DEBUG_PRINTF("extracted: '%s'\n", escapeString(lit).c_str());
458 s.insert(lit);
459next_literal:
460 ;
461 }
462}
463
464#ifdef DEBUG
465static UNUSED
466const char *describeColor(small_color c) {
467 switch (c) {
468 case small_color::white:
469 return "white";
470 case small_color::gray:
471 return "gray";
472 case small_color::black:
473 return "black";
474 default:
475 return "unknown";
476 }
477}
478#endif
479
480/**
481 * The BGL's boykov_kolmogorov_max_flow requires that all edges have their
482 * reverse edge in the graph. This function adds them, returning a vector
483 * mapping edge index to reverse edge. Note: LitGraph should be a DAG so there
484 * should be no existing reverse_edges.
485 */
486static
487vector<LitEdge> add_reverse_edges_and_index(LitGraph &lg) {
488 const size_t edge_count = num_edges(lg);
489 vector<LitEdge> fwd_edges;
490 fwd_edges.reserve(edge_count);
491 for (const auto &e : edges_range(lg)) {
492 fwd_edges.push_back(e);
493 }
494
495 vector<LitEdge> rev_map(2 * edge_count);
496
497 for (const auto &e : fwd_edges) {
498 LitVertex u = source(e, lg);
499 LitVertex v = target(e, lg);
500
501 assert(!edge(v, u, lg).second);
502
503 LitEdge rev = add_edge(v, u, LitGraphEdgeProps(0), lg).first;
504 rev_map[lg[e].index] = rev;
505 rev_map[lg[rev].index] = e;
506 }
507
508 return rev_map;
509}
510
511static
512void findMinCut(LitGraph &lg, vector<LitEdge> &cutset) {
513 cutset.clear();
514
515 //dumpGraph("litgraph.dot", lg);
516
517 assert(!in_degree(lg.root, lg));
518 assert(!out_degree(lg.sink, lg));
519 size_t num_real_edges = num_edges(lg);
520
521 // Add reverse edges for the convenience of the BGL's max flow algorithm.
522 vector<LitEdge> rev_edges = add_reverse_edges_and_index(lg);
523
524 const auto v_index_map = get(&LitGraphVertexProps::index, lg);
525 const auto e_index_map = get(&LitGraphEdgeProps::index, lg);
526 const size_t num_verts = num_vertices(lg);
527 auto colors = make_small_color_map(lg);
528 vector<s32> distances(num_verts);
529 vector<LitEdge> predecessors(num_verts);
530 vector<u64a> residuals(num_edges(lg));
531
532 UNUSED u64a flow = boykov_kolmogorov_max_flow(lg,
533 get(&LitGraphEdgeProps::score, lg),
534 make_iterator_property_map(residuals.begin(), e_index_map),
535 make_iterator_property_map(rev_edges.begin(), e_index_map),
536 make_iterator_property_map(predecessors.begin(), v_index_map),
537 colors,
538 make_iterator_property_map(distances.begin(), v_index_map),
539 v_index_map, lg.root, lg.sink);
540 DEBUG_PRINTF("done, flow = %llu\n", flow);
541
542 /* remove reverse edges */
543 remove_edge_if([&](const LitEdge &e) {
544 return lg[e].index >= num_real_edges;
545 }, lg);
546
547 vector<LitEdge> white_cut, black_cut;
548 u64a white_flow = 0, black_flow = 0;
549
550 for (const auto &e : edges_range(lg)) {
551 const LitVertex u = source(e, lg), v = target(e, lg);
552 const auto ucolor = get(colors, u);
553 const auto vcolor = get(colors, v);
554
555 DEBUG_PRINTF("edge %zu:%s -> %zu:%s score %llu\n", lg[u].index,
556 describeColor(ucolor), lg[v].index, describeColor(vcolor),
557 lg[e].score);
558
559 if (ucolor != small_color::white && vcolor == small_color::white) {
560 assert(v != lg.sink);
561 white_cut.push_back(e);
562 white_flow += lg[e].score;
563 }
564 if (ucolor == small_color::black && vcolor != small_color::black) {
565 assert(v != lg.sink);
566 black_cut.push_back(e);
567 black_flow += lg[e].score;
568 }
569 }
570
571 DEBUG_PRINTF("white flow = %llu, black flow = %llu\n",
572 white_flow, black_flow);
573 assert(white_flow && black_flow);
574
575 if (white_flow <= black_flow) {
576 DEBUG_PRINTF("selected white cut\n");
577 cutset.swap(white_cut);
578 } else {
579 DEBUG_PRINTF("selected black cut\n");
580 cutset.swap(black_cut);
581 }
582
583 DEBUG_PRINTF("min cut has %zu edges\n", cutset.size());
584 assert(!cutset.empty());
585}
586
587/** Takes a set of literals and derives a better one from them, returning its
588 * score. Literals with a common suffix S will be replaced with S. (for
589 * example, {foobar, fooobar} -> {oobar}).
590 */
591u64a compressAndScore(set<ue2_literal> &s) {
592 if (s.empty()) {
593 return NO_LITERAL_AT_EDGE_SCORE;
594 }
595
596 if (s.size() == 1) {
597 return calculateScore(*s.begin());
598 }
599
600 UNUSED u64a initialScore = scoreSet(s);
601 DEBUG_PRINTF("begin, initial literals have score %llu\n",
602 initialScore);
603
604 LitGraph lg;
605
606 for (const auto &lit : s) {
607 addReversedLiteral(lit, lg);
608 }
609
610 DEBUG_PRINTF("suffix tree has %zu vertices and %zu edges\n",
611 num_vertices(lg), num_edges(lg));
612
613 vector<LitEdge> cutset;
614 findMinCut(lg, cutset);
615
616 s.clear();
617 extractLiterals(cutset, lg, s);
618
619 u64a score = scoreSet(s);
620 DEBUG_PRINTF("compressed score is %llu\n", score);
621 assert(score <= initialScore);
622 return score;
623}
624
625/* like compressAndScore, but replaces long mixed sensitivity literals with
626 * something weaker. */
627u64a sanitizeAndCompressAndScore(set<ue2_literal> &lits) {
628 const size_t maxExploded = 8; // only case-explode this far
629
630 /* TODO: the whole compression thing could be made better by systematically
631 * considering replacing literal sets not just by common suffixes but also
632 * by nocase literals. */
633
634 vector<ue2_literal> replacements;
635
636 for (auto it = lits.begin(); it != lits.end();) {
637 auto jt = it;
638 ++it;
639
640 if (!bad_mixed_sensitivity(*jt)) {
641 continue;
642 }
643
644 /* we have to replace *jt with something... */
645 ue2_literal s = *jt;
646 lits.erase(jt);
647
648 vector<ue2_literal> exploded;
649 for (auto cit = caseIterateBegin(s); cit != caseIterateEnd(); ++cit) {
650 exploded.emplace_back(*cit, false);
651 if (exploded.size() > maxExploded) {
652 goto dont_explode;
653 }
654 }
655 insert(&replacements, replacements.end(), exploded);
656
657 continue;
658 dont_explode:
659 make_nocase(&s);
660 replacements.push_back(s);
661 }
662
663 insert(&lits, replacements);
664 return compressAndScore(lits);
665}
666
667u64a scoreSet(const set<ue2_literal> &s) {
668 if (s.empty()) {
669 return NO_LITERAL_AT_EDGE_SCORE;
670 }
671
672 u64a score = 1ULL;
673
674 for (const auto &lit : s) {
675 score += calculateScore(lit);
676 }
677
678 return score;
679}
680
681set<ue2_literal> getLiteralSet(const NGHolder &g, const NFAEdge &e) {
682 set<ue2_literal> s;
683 processWorkQueue(g, e, s);
684 return s;
685}
686
687set<ue2_literal> getLiteralSet(const NGHolder &g, const NFAVertex &v,
688 bool only_first_encounter) {
689 set<ue2_literal> s;
690
691 if (is_special(v, g)) {
692 return s;
693 }
694
695 set<ue2_literal> ls;
696
697 for (const auto &e : in_edges_range(v, g)) {
698 if (source(e, g) == v && only_first_encounter) {
699 continue; /* ignore self loop on root vertex as we are interested in
700 * the first time we visit the vertex on the way to
701 * accept. In fact, we can ignore any back edges - but
702 * they would require a bit of effort to discover. */
703 }
704
705 ls = getLiteralSet(g, e);
706 if (ls.empty()) {
707 s.clear();
708 return s;
709 } else {
710 s.insert(ls.begin(), ls.end());
711 }
712 }
713
714 return s;
715}
716
717vector<u64a> scoreEdges(const NGHolder &g, const flat_set<NFAEdge> &known_bad) {
718 assert(hasCorrectlyNumberedEdges(g));
719
720 vector<u64a> scores(num_edges(g));
721
722 for (const auto &e : edges_range(g)) {
723 u32 eidx = g[e].index;
724 assert(eidx < scores.size());
725 if (contains(known_bad, e)) {
726 scores[eidx] = NO_LITERAL_AT_EDGE_SCORE;
727 } else {
728 set<ue2_literal> ls = getLiteralSet(g, e);
729 scores[eidx] = compressAndScore(ls);
730 }
731 }
732
733 return scores;
734}
735
736bool splitOffLeadingLiteral(const NGHolder &g, ue2_literal *lit_out,
737 NGHolder *rhs) {
738 DEBUG_PRINTF("looking for leading floating literal\n");
739 set<NFAVertex> s_succ;
740 insert(&s_succ, adjacent_vertices(g.start, g));
741
742 set<NFAVertex> sds_succ;
743 insert(&sds_succ, adjacent_vertices(g.startDs, g));
744
745 bool floating = is_subset_of(s_succ, sds_succ);
746 if (!floating) {
747 DEBUG_PRINTF("not floating\n");
748 return false;
749 }
750
751 sds_succ.erase(g.startDs);
752 if (sds_succ.size() != 1) {
753 DEBUG_PRINTF("branchy root\n");
754 return false;
755 }
756
757 NFAVertex u = g.startDs;
758 NFAVertex v = *sds_succ.begin();
759
760 while (true) {
761 DEBUG_PRINTF("validating vertex %zu\n", g[v].index);
762
763 assert(v != g.acceptEod && v != g.accept);
764
765 const CharReach &cr = g[v].char_reach;
766 if (cr.count() != 1 && !cr.isCaselessChar()) {
767 break;
768 }
769
770 // Rose can only handle mixed-sensitivity literals up to the max mask
771 // length.
772 if (lit_out->length() >= MAX_MASK2_WIDTH) {
773 if (mixed_sensitivity(*lit_out)) {
774 DEBUG_PRINTF("long and mixed sensitivity\n");
775 break;
776 }
777 if (ourisalpha((char)cr.find_first())) {
778 if (cr.isCaselessChar() != lit_out->any_nocase()) {
779 DEBUG_PRINTF("stop at mixed sensitivity on '%c'\n",
780 (char)cr.find_first());
781 break;
782 }
783 }
784 }
785
786 if (edge(v, g.accept, g).second || edge(v, g.acceptEod, g).second) {
787 DEBUG_PRINTF("connection to accept\n");
788 break;
789 }
790
791 lit_out->push_back(cr.find_first(), cr.isCaselessChar());
792 u = v;
793
794 if (out_degree(v, g) != 1) {
795 DEBUG_PRINTF("out_degree != 1\n");
796 break;
797 }
798
799 v = *adjacent_vertices(v, g).first;
800
801 if (in_degree(v, g) != 1) {
802 DEBUG_PRINTF("blargh\n"); /* picks up cases where there is no path
803 * to case accept (large cycles),
804 * ensures term */
805 break;
806 }
807 }
808
809 if (lit_out->empty()) {
810 return false;
811 }
812 assert(u != g.startDs);
813
814 unordered_map<NFAVertex, NFAVertex> rhs_map;
815 vector<NFAVertex> pivots = make_vector_from(adjacent_vertices(u, g));
816 splitRHS(g, pivots, rhs, &rhs_map);
817
818 DEBUG_PRINTF("literal is '%s' (len %zu)\n", dumpString(*lit_out).c_str(),
819 lit_out->length());
820 assert(is_triggered(*rhs));
821 return true;
822}
823
824bool getTrailingLiteral(const NGHolder &g, ue2_literal *lit_out) {
825 if (in_degree(g.acceptEod, g) != 1) {
826 return false;
827 }
828
829 NFAVertex v = getSoleSourceVertex(g, g.accept);
830
831 if (!v) {
832 return false;
833 }
834
835 set<ue2_literal> s = getLiteralSet(g, v, false);
836
837 if (s.size() != 1) {
838 return false;
839 }
840
841 const ue2_literal &lit = *s.begin();
842
843 if (lit.length() > MAX_MASK2_WIDTH && mixed_sensitivity(lit)) {
844 DEBUG_PRINTF("long & mixed-sensitivity, Rose can't handle this.\n");
845 return false;
846 }
847
848 *lit_out = lit;
849 return true;
850}
851
852bool literalIsWholeGraph(const NGHolder &g, const ue2_literal &lit) {
853 NFAVertex v = g.accept;
854
855 for (auto it = lit.rbegin(), ite = lit.rend(); it != ite; ++it) {
856 NGHolder::inv_adjacency_iterator ai, ae;
857 tie(ai, ae) = inv_adjacent_vertices(v, g);
858 if (ai == ae) {
859 assert(0); // no predecessors?
860 return false;
861 }
862 v = *ai++;
863 if (ai != ae) {
864 DEBUG_PRINTF("branch, fail\n");
865 return false;
866 }
867
868 if (is_special(v, g)) {
869 DEBUG_PRINTF("special found, fail\n");
870 return false;
871 }
872
873 const CharReach &cr_g = g[v].char_reach;
874 const CharReach &cr_l = *it;
875
876 if (!cr_l.isSubsetOf(cr_g)) {
877 /* running over the prefix is needed to prevent false postives */
878 DEBUG_PRINTF("reach fail\n");
879 return false;
880 }
881 }
882
883 // Our last value for v should have only start states for predecessors.
884 for (auto u : inv_adjacent_vertices_range(v, g)) {
885 if (!is_any_start(u, g)) {
886 DEBUG_PRINTF("pred is not start\n");
887 return false;
888 }
889 }
890
891 assert(num_vertices(g) == lit.length() + N_SPECIALS);
892
893 DEBUG_PRINTF("ok\n");
894 return true;
895}
896
897} // namespace ue2
898