1 | /* |
2 | * Copyright (c) 2015-2017, Intel Corporation |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are met: |
6 | * |
7 | * * Redistributions of source code must retain the above copyright notice, |
8 | * this list of conditions and the following disclaimer. |
9 | * * Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * * Neither the name of Intel Corporation nor the names of its contributors |
13 | * may be used to endorse or promote products derived from this software |
14 | * without specific prior written permission. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | #include "rose_build_convert.h" |
30 | |
31 | #include "grey.h" |
32 | #include "rose_build.h" |
33 | #include "rose_build_impl.h" |
34 | #include "rose_build_util.h" |
35 | #include "ue2common.h" |
36 | #include "hwlm/hwlm_build.h" |
37 | #include "nfa/castlecompile.h" |
38 | #include "nfa/limex_limits.h" |
39 | #include "nfagraph/ng_dump.h" |
40 | #include "nfagraph/ng_limex.h" |
41 | #include "nfagraph/ng_repeat.h" |
42 | #include "nfagraph/ng_reports.h" |
43 | #include "nfagraph/ng_split.h" |
44 | #include "nfagraph/ng_util.h" |
45 | #include "nfagraph/ng_width.h" |
46 | #include "util/bitutils.h" |
47 | #include "util/charreach.h" |
48 | #include "util/charreach_util.h" |
49 | #include "util/compile_context.h" |
50 | #include "util/depth.h" |
51 | #include "util/graph_range.h" |
52 | #include "util/make_unique.h" |
53 | #include "util/order_check.h" |
54 | #include "util/ue2string.h" |
55 | |
56 | #include <algorithm> |
57 | #include <map> |
58 | #include <queue> |
59 | #include <set> |
60 | #include <string> |
61 | #include <unordered_map> |
62 | #include <utility> |
63 | #include <vector> |
64 | |
65 | #include <boost/range/adaptor/map.hpp> |
66 | |
67 | using namespace std; |
68 | using boost::adaptors::map_values; |
69 | |
70 | namespace ue2 { |
71 | |
72 | static |
73 | NFAVertex addHolderVertex(const CharReach &cr, NGHolder &out) { |
74 | assert(cr.any()); |
75 | NFAVertex v = add_vertex(out); |
76 | out[v].char_reach = cr; |
77 | return v; |
78 | } |
79 | |
80 | static |
81 | size_t suffixFloodLen(const ue2_literal &s) { |
82 | if (s.empty()) { |
83 | return 0; |
84 | } |
85 | |
86 | const ue2_literal::elem &c = s.back(); |
87 | auto it = find_if(s.rbegin(), s.rend(), |
88 | [&c](const ue2_literal::elem &e) { return e != c; }); |
89 | return distance(s.rbegin(), it); |
90 | } |
91 | |
92 | static |
93 | unique_ptr<NGHolder> makeFloodProneSuffix(const ue2_literal &s, size_t len, |
94 | const flat_set<ReportID> &reports) { |
95 | assert(len < s.length()); |
96 | assert(!reports.empty()); |
97 | |
98 | unique_ptr<NGHolder> h = ue2::make_unique<NGHolder>(NFA_SUFFIX); |
99 | |
100 | NFAVertex u = h->start; |
101 | for (auto it = s.begin() + s.length() - len; it != s.end(); ++it) { |
102 | NFAVertex v = addHolderVertex(*it, *h); |
103 | NFAEdge e = add_edge(u, v, *h); |
104 | if (u == h->start) { |
105 | (*h)[e].tops.insert(DEFAULT_TOP); |
106 | } |
107 | u = v; |
108 | } |
109 | |
110 | (*h)[u].reports.insert(reports.begin(), reports.end()); |
111 | add_edge(u, h->accept, *h); |
112 | return h; |
113 | } |
114 | |
115 | static |
116 | unique_ptr<NGHolder> makeRosePrefix(const ue2_literal &s) { |
117 | unique_ptr<NGHolder> h = ue2::make_unique<NGHolder>(NFA_PREFIX); |
118 | |
119 | NFAVertex u = h->startDs; |
120 | for (const auto &c : s) { |
121 | NFAVertex v = addHolderVertex(c, *h); |
122 | add_edge(u, v, *h); |
123 | u = v; |
124 | } |
125 | add_edge(u, h->accept, *h); |
126 | return h; |
127 | } |
128 | |
129 | static |
130 | void replaceWithLitPrefix(RoseBuildImpl &tbi, RoseVertex v, u32 lit_id, |
131 | const rose_literal_id &lit, size_t suffixlen, |
132 | size_t delay) { |
133 | assert(suffixlen < lit.s.length()); |
134 | |
135 | DEBUG_PRINTF("replacing '%s' with prefix, length=%zu, delay=%zu\n" , |
136 | dumpString(lit.s).c_str(), lit.s.length() - suffixlen, delay); |
137 | |
138 | RoseGraph &g = tbi.g; |
139 | ue2_literal new_lit = lit.s.substr(0, lit.s.length() - suffixlen); |
140 | u32 new_id = tbi.getLiteralId(new_lit, delay, ROSE_FLOATING); |
141 | rose_literal_info &old_info = tbi.literal_info.at(lit_id); |
142 | old_info.vertices.erase(v); |
143 | tbi.literal_info.at(new_id).vertices.insert(v); |
144 | g[v].literals.clear(); |
145 | g[v].literals.insert(new_id); |
146 | } |
147 | |
148 | static |
149 | bool delayLiteralWithPrefix(RoseBuildImpl &tbi, RoseVertex v, u32 lit_id, |
150 | const rose_literal_id &lit, size_t suffixlen) { |
151 | if (suffixlen > MAX_DELAY) { |
152 | DEBUG_PRINTF("delay too large\n" ); |
153 | return false; |
154 | } |
155 | |
156 | if (!tbi.isDirectReport(lit_id)) { |
157 | DEBUG_PRINTF("literal is not direct report\n" ); |
158 | return false; |
159 | } |
160 | |
161 | if (tbi.cc.streaming && |
162 | lit.s.length() > tbi.cc.grey.maxHistoryAvailable + 1) { |
163 | DEBUG_PRINTF("insufficient history to delay literal of len %zu\n" , |
164 | lit.s.length()); |
165 | return false; |
166 | } |
167 | |
168 | shared_ptr<NGHolder> h = makeRosePrefix(lit.s); |
169 | ReportID prefix_report = 0; |
170 | set_report(*h, prefix_report); |
171 | |
172 | if (!isImplementableNFA(*h, &tbi.rm, tbi.cc)) { |
173 | DEBUG_PRINTF("prefix not implementable\n" ); |
174 | return false; |
175 | } |
176 | |
177 | RoseGraph &g = tbi.g; |
178 | assert(!g[v].left); |
179 | g[v].left.graph = h; |
180 | g[v].left.lag = 0; |
181 | g[v].left.leftfix_report = prefix_report; |
182 | |
183 | // Swap v's literal for a shorter one, delayed by suffix len. |
184 | replaceWithLitPrefix(tbi, v, lit_id, lit, suffixlen, suffixlen); |
185 | |
186 | return true; |
187 | } |
188 | |
189 | static |
190 | void convertFloodProneSuffix(RoseBuildImpl &tbi, RoseVertex v, u32 lit_id, |
191 | const rose_literal_id &lit, size_t suffixlen) { |
192 | DEBUG_PRINTF("flood-prone leaf '%s'\n" , dumpString(lit.s).c_str()); |
193 | DEBUG_PRINTF("turning last %zu chars into a suffix NFA\n" , suffixlen); |
194 | RoseGraph &g = tbi.g; |
195 | assert(!g[v].eod_accept); |
196 | |
197 | // If we're a direct report literal, we may be able to convert this case |
198 | // into a delayed literal with a (very boring) transient prefix that |
199 | // handles our flood-prone suffix. |
200 | if (delayLiteralWithPrefix(tbi, v, lit_id, lit, suffixlen)) { |
201 | DEBUG_PRINTF("implemented as delayed literal with a rose prefix\n" ); |
202 | return; |
203 | } |
204 | |
205 | // General case: create a suffix that implements the flood-prone portion. |
206 | |
207 | // Create the NFA. |
208 | auto h = makeFloodProneSuffix(lit.s, suffixlen, g[v].reports); |
209 | if (!isImplementableNFA(*h, &tbi.rm, tbi.cc)) { |
210 | DEBUG_PRINTF("not implementable\n" ); |
211 | return; |
212 | } |
213 | |
214 | // Apply the NFA. |
215 | assert(!g[v].suffix); |
216 | g[v].suffix.graph = move(h); |
217 | g[v].reports.clear(); |
218 | |
219 | // Swap v's literal for a shorter one. |
220 | replaceWithLitPrefix(tbi, v, lit_id, lit, suffixlen, 0); |
221 | |
222 | // It's possible that min_offset might be an underestimate, so we |
223 | // subtract min(min_offset, suffixlen) for safety. |
224 | g[v].min_offset -= min((size_t)g[v].min_offset, suffixlen); |
225 | |
226 | if (g[v].max_offset < ROSE_BOUND_INF) { |
227 | assert(g[v].max_offset >= suffixlen); |
228 | g[v].max_offset -= suffixlen; |
229 | } |
230 | } |
231 | |
232 | /** |
233 | * Collect an estimate of the number of literals in the floating table, and use |
234 | * this to estimate the flood prone suffix length. |
235 | */ |
236 | static |
237 | size_t findFloodProneSuffixLen(const RoseBuildImpl &tbi) { |
238 | size_t numLiterals = 0; |
239 | for (const rose_literal_id &lit : tbi.literals) { |
240 | if (lit.delay) { |
241 | continue; // delay ids are virtual-ish |
242 | } |
243 | if (lit.table != ROSE_FLOATING) { |
244 | continue; |
245 | } |
246 | |
247 | numLiterals++; |
248 | } |
249 | |
250 | return hwlmFloodProneSuffixLen(numLiterals, tbi.cc); |
251 | } |
252 | |
253 | /** |
254 | * \brief Convert flood-prone literal suffixes into suffix NFAs. |
255 | * |
256 | * For any trailing string in Rose (string cannot lead to more Rose roles or |
257 | * NFAs, etc) ending with a continuous run of a single character with more than |
258 | * 3 copies of that single character, |
259 | * |
260 | * If the result of removing all but 2 copies of that character yields a string |
261 | * that is greater than FLOOD_PRONE_LIT_MIN_LENGTH characters, remove those |
262 | * final characters from the literal and move them into a suffix NFA. |
263 | */ |
264 | void convertFloodProneSuffixes(RoseBuildImpl &tbi) { |
265 | static const size_t FLOOD_PRONE_LIT_MIN_LENGTH = 5; |
266 | |
267 | if (!tbi.cc.grey.roseConvertFloodProneSuffixes) { |
268 | return; |
269 | } |
270 | |
271 | const size_t floodProneLen = findFloodProneSuffixLen(tbi); |
272 | DEBUG_PRINTF("flood prone suffix len = %zu\n" , floodProneLen); |
273 | |
274 | RoseGraph &g = tbi.g; |
275 | |
276 | for (auto v : vertices_range(g)) { |
277 | if (!isLeafNode(v, g)) { |
278 | continue; |
279 | } |
280 | |
281 | if (g[v].reports.empty()) { |
282 | continue; |
283 | } |
284 | |
285 | // TODO: currently only boring vertices. |
286 | if (!g[v].isBoring()) { |
287 | continue; |
288 | } |
289 | |
290 | // Currently only handles vertices with a single literal (should always |
291 | // be the case this early in Rose construction). |
292 | if (g[v].literals.size() != 1) { |
293 | continue; |
294 | } |
295 | |
296 | u32 lit_id = *g[v].literals.begin(); |
297 | const rose_literal_id &lit = tbi.literals.at(lit_id); |
298 | |
299 | // anchored or delayed literals need thought. |
300 | if (lit.table != ROSE_FLOATING || lit.delay) { |
301 | continue; |
302 | } |
303 | |
304 | // don't do this to literals with msk/cmp. |
305 | if (!lit.msk.empty()) { |
306 | continue; |
307 | } |
308 | |
309 | // Can't safely do this operation to vertices with delayed |
310 | // predecessors. |
311 | if (tbi.hasDelayPred(v)) { |
312 | DEBUG_PRINTF("delayed pred\n" ); |
313 | continue; |
314 | } |
315 | |
316 | if (lit.s.length() <= FLOOD_PRONE_LIT_MIN_LENGTH) { |
317 | DEBUG_PRINTF("literal is short enough already\n" ); |
318 | continue; |
319 | } |
320 | |
321 | size_t floodLen = suffixFloodLen(lit.s); |
322 | if (floodLen < floodProneLen) { |
323 | DEBUG_PRINTF("literal not flood-prone\n" ); |
324 | continue; |
325 | } |
326 | |
327 | if (floodLen == lit.s.length()) { |
328 | DEBUG_PRINTF("whole literal is a flood\n" ); |
329 | // Removing the part of the flood from the end of the literal would |
330 | // leave us with a shorter, but still flood-prone, prefix. Better |
331 | // to leave it alone. |
332 | continue; |
333 | } |
334 | |
335 | size_t suffixLen = floodLen - (floodProneLen - 1); |
336 | if (lit.s.length() - suffixLen < FLOOD_PRONE_LIT_MIN_LENGTH) { |
337 | DEBUG_PRINTF("removing flood would leave literal too short\n" ); |
338 | continue; |
339 | } |
340 | |
341 | convertFloodProneSuffix(tbi, v, lit_id, lit, suffixLen); |
342 | } |
343 | } |
344 | |
345 | static |
346 | CharReach getReachOfNormalVertex(const NGHolder &g) { |
347 | for (auto v : vertices_range(g)) { |
348 | if (is_special(v, g)) { |
349 | continue; |
350 | } |
351 | return g[v].char_reach; |
352 | } |
353 | assert(0); |
354 | return CharReach(); |
355 | } |
356 | |
357 | /** |
358 | * \brief Set the edge bounds and appropriate history on the given edge in the |
359 | * Rose graph. |
360 | */ |
361 | static |
362 | void setEdgeBounds(RoseGraph &g, const RoseEdge &e, u32 min_bound, |
363 | u32 max_bound) { |
364 | assert(min_bound <= max_bound); |
365 | assert(max_bound <= ROSE_BOUND_INF); |
366 | |
367 | g[e].minBound = min_bound; |
368 | g[e].maxBound = max_bound; |
369 | |
370 | if (min_bound || max_bound < ROSE_BOUND_INF) { |
371 | g[e].history = ROSE_ROLE_HISTORY_ANCH; |
372 | } else { |
373 | g[e].history = ROSE_ROLE_HISTORY_NONE; |
374 | } |
375 | } |
376 | |
377 | static |
378 | bool handleStartPrefixCliche(const NGHolder &h, RoseGraph &g, RoseVertex v, |
379 | const RoseEdge &e_old, RoseVertex ar, |
380 | vector<RoseEdge> *to_delete) { |
381 | DEBUG_PRINTF("hi\n" ); |
382 | |
383 | /* check for prefix cliches connected to start (^.{N,M}) */ |
384 | if (!getReachOfNormalVertex(h).all()) { |
385 | DEBUG_PRINTF(":(\n" ); |
386 | return false; |
387 | } |
388 | |
389 | PureRepeat repeat; |
390 | if (!isPureRepeat(h, repeat)) { |
391 | DEBUG_PRINTF(":(\n" ); |
392 | return false; |
393 | } |
394 | |
395 | assert(repeat.bounds.min.is_finite()); |
396 | assert(repeat.bounds.max.is_reachable()); |
397 | assert(repeat.bounds.min <= repeat.bounds.max); |
398 | |
399 | DEBUG_PRINTF("prefix is ^.{%s,%s}\n" , repeat.bounds.min.str().c_str(), |
400 | repeat.bounds.max.str().c_str()); |
401 | |
402 | /* update bounds on edge */ |
403 | |
404 | // Convert to Rose graph bounds, which are not (yet?) depth classes. |
405 | u32 bound_min = repeat.bounds.min; |
406 | u32 bound_max = |
407 | repeat.bounds.max.is_finite() ? (u32)repeat.bounds.max : ROSE_BOUND_INF; |
408 | |
409 | if (source(e_old, g) == ar) { |
410 | assert(g[e_old].minBound <= bound_min); |
411 | assert(g[e_old].maxBound >= bound_max); |
412 | setEdgeBounds(g, e_old, bound_min, bound_max); |
413 | } else { |
414 | RoseEdge e_new = add_edge(ar, v, g); |
415 | setEdgeBounds(g, e_new, bound_min, bound_max); |
416 | to_delete->push_back(e_old); |
417 | } |
418 | |
419 | g[v].left.reset(); /* clear the prefix info */ |
420 | return true; |
421 | } |
422 | |
423 | static |
424 | bool handleStartDsPrefixCliche(const NGHolder &h, RoseGraph &g, RoseVertex v, |
425 | const RoseEdge &e) { |
426 | DEBUG_PRINTF("hi\n" ); |
427 | /* check for prefix cliches connected to start-ds (.{N}, ^.{N,}) */ |
428 | u32 repeatCount = 0; |
429 | NFAVertex hu = h.startDs; |
430 | |
431 | auto start_succ = succs<set<NFAVertex>>(h.start, h); |
432 | auto startds_succ = succs<set<NFAVertex>>(h.startDs, h); |
433 | |
434 | if (!is_subset_of(start_succ, startds_succ)) { |
435 | DEBUG_PRINTF("not a simple chain\n" ); |
436 | return false; |
437 | } |
438 | |
439 | set<NFAVertex> seen; |
440 | do { |
441 | if (!h[hu].char_reach.all()) { |
442 | return false; |
443 | } |
444 | NFAVertex hv = getSoleDestVertex(h, hu); |
445 | if (!hv) { |
446 | return false; |
447 | } |
448 | if (contains(seen, hv)) { |
449 | assert(0); |
450 | return false; |
451 | } |
452 | hu = hv; |
453 | repeatCount++; |
454 | if (hu == h.accept) { |
455 | break; |
456 | } |
457 | } while(1); |
458 | |
459 | assert(hu == h.accept); |
460 | |
461 | repeatCount--; /* do not count accept as part of the chain */ |
462 | |
463 | DEBUG_PRINTF("prefix is ^.{%u,}\n" , repeatCount); |
464 | |
465 | /* update bounds on edge */ |
466 | assert(g[e].minBound <= repeatCount); |
467 | setEdgeBounds(g, e, repeatCount, ROSE_BOUND_INF); |
468 | |
469 | g[v].left.reset(); /* clear the prefix info */ |
470 | |
471 | return true; |
472 | } |
473 | |
474 | static |
475 | bool handleMixedPrefixCliche(const NGHolder &h, RoseGraph &g, RoseVertex v, |
476 | const RoseEdge &e_old, RoseVertex ar, |
477 | vector<RoseEdge> *to_delete, |
478 | const CompileContext &cc) { |
479 | assert(in_degree(h.acceptEod, h) == 1); |
480 | |
481 | bool anchored = !proper_out_degree(h.startDs, h); |
482 | NFAVertex key = NGHolder::null_vertex(); |
483 | NFAVertex base = anchored ? h.start : h.startDs; |
484 | |
485 | if (!anchored) { |
486 | auto start_succ = succs<set<NFAVertex>>(h.start, h); |
487 | auto startds_succ = succs<set<NFAVertex>>(h.startDs, h); |
488 | |
489 | if (!is_subset_of(start_succ, startds_succ)) { |
490 | DEBUG_PRINTF("not a simple chain\n" ); |
491 | return false; |
492 | } |
493 | } |
494 | |
495 | for (auto w : adjacent_vertices_range(base, h)) { |
496 | DEBUG_PRINTF("checking %zu\n" , h[w].index); |
497 | if (!h[w].char_reach.all()) { |
498 | continue; |
499 | } |
500 | |
501 | if (!is_special(w, h)) { |
502 | key = w; |
503 | break; |
504 | } |
505 | } |
506 | |
507 | if (!key) { |
508 | return false; |
509 | } |
510 | |
511 | vector<GraphRepeatInfo> repeats; |
512 | findRepeats(h, 2, &repeats); |
513 | |
514 | vector<GraphRepeatInfo>::const_iterator it; |
515 | for (it = repeats.begin(); it != repeats.end(); ++it) { |
516 | DEBUG_PRINTF("checking.. %zu verts\n" , it->vertices.size()); |
517 | if (find(it->vertices.begin(), it->vertices.end(), key) |
518 | != it->vertices.end()) { |
519 | break; |
520 | } |
521 | } |
522 | if (it == repeats.end()) { |
523 | DEBUG_PRINTF("no repeat found\n" ); |
524 | return false; |
525 | } |
526 | |
527 | GraphRepeatInfo ri = *it; |
528 | |
529 | set<NFAVertex> exits_and_repeat_verts; |
530 | for (auto repeat_v : ri.vertices) { |
531 | DEBUG_PRINTF("repeat vertex %zu\n" , h[repeat_v].index); |
532 | succ(h, repeat_v, &exits_and_repeat_verts); |
533 | exits_and_repeat_verts.insert(repeat_v); |
534 | } |
535 | |
536 | DEBUG_PRINTF("repeat {%s,%s}\n" , ri.repeatMin.str().c_str(), |
537 | ri.repeatMax.str().c_str()); |
538 | |
539 | set<NFAVertex> rep_verts; |
540 | insert(&rep_verts, ri.vertices); |
541 | |
542 | set<NFAVertex> exits; |
543 | exits = exits_and_repeat_verts; |
544 | erase_all(&exits, rep_verts); |
545 | |
546 | auto base_succ = succs<set<NFAVertex>>(base, h); |
547 | base_succ.erase(h.startDs); |
548 | |
549 | if (is_subset_of(base_succ, rep_verts)) { |
550 | /* all good: repeat dominates the rest of the pattern */ |
551 | } else if (ri.repeatMin == depth(1) |
552 | && is_subset_of(exits, base_succ) |
553 | && is_subset_of(base_succ, exits_and_repeat_verts)) { |
554 | /* we have a jump edge */ |
555 | ri.repeatMin = depth(0); |
556 | } else { |
557 | return false; |
558 | } |
559 | |
560 | DEBUG_PRINTF("repeat {%s,%s}\n" , ri.repeatMin.str().c_str(), |
561 | ri.repeatMax.str().c_str()); |
562 | DEBUG_PRINTF("woot?\n" ); |
563 | |
564 | shared_ptr<NGHolder> h_new = make_shared<NGHolder>(); |
565 | unordered_map<NFAVertex, NFAVertex> rhs_map; |
566 | vector<NFAVertex> exits_vec; |
567 | insert(&exits_vec, exits_vec.end(), exits); |
568 | splitRHS(h, exits_vec, h_new.get(), &rhs_map); |
569 | h_new->kind = NFA_PREFIX; |
570 | |
571 | if (num_vertices(*h_new) <= N_SPECIALS) { |
572 | DEBUG_PRINTF("not a hybrid??\n" ); |
573 | /* TODO: pick up these cases, unify code */ |
574 | return false; |
575 | } |
576 | |
577 | for (auto w : adjacent_vertices_range(h_new->start, *h_new)) { |
578 | if (w != h_new->startDs) { |
579 | add_edge(h_new->startDs, w, *h_new); |
580 | } |
581 | } |
582 | clear_out_edges(h_new->start, *h_new); |
583 | add_edge(h_new->start, h_new->startDs, *h_new); |
584 | |
585 | depth width = findMinWidth(*h_new); |
586 | if (width != findMaxWidth(*h_new)) { |
587 | return false; |
588 | } |
589 | |
590 | if (g[v].left.dfa) { |
591 | /* we were unable to implement initial graph as an nfa; |
592 | * we need to to check if we still need a dfa and, if so, rebuild. */ |
593 | if (!isImplementableNFA(*h_new, nullptr, cc)) { |
594 | return false; /* TODO: handle rebuilding dfa */ |
595 | } |
596 | } |
597 | |
598 | if (anchored) { |
599 | if (ri.repeatMax.is_infinite()) { |
600 | return false; /* TODO */ |
601 | } |
602 | |
603 | if (source(e_old, g) == ar) { |
604 | setEdgeBounds(g, e_old, ri.repeatMin + width, ri.repeatMax + width); |
605 | } else { |
606 | RoseEdge e_new = add_edge(ar, v, g); |
607 | setEdgeBounds(g, e_new, ri.repeatMin + width, ri.repeatMax + width); |
608 | to_delete->push_back(e_old); |
609 | } |
610 | |
611 | } else { |
612 | assert(g[e_old].minBound <= ri.repeatMin + width); |
613 | setEdgeBounds(g, e_old, ri.repeatMin + width, ROSE_BOUND_INF); |
614 | } |
615 | |
616 | g[v].left.dfa.reset(); |
617 | g[v].left.graph = h_new; |
618 | |
619 | return true; |
620 | } |
621 | |
622 | /* turns simple prefixes like /^.{30,} into bounds on the root roles */ |
623 | void convertPrefixToBounds(RoseBuildImpl &tbi) { |
624 | RoseGraph &g = tbi.g; |
625 | |
626 | vector<RoseEdge> to_delete; |
627 | RoseVertex ar = tbi.anchored_root; |
628 | |
629 | /* graphs with prefixes produced by rose are wired to tbi.root */ |
630 | |
631 | for (const auto &e : out_edges_range(tbi.root, g)) { |
632 | RoseVertex v = target(e, g); |
633 | |
634 | if (in_degree(v, g) != 1) { |
635 | continue; |
636 | } |
637 | |
638 | if (!g[v].left.graph) { |
639 | continue; |
640 | } |
641 | |
642 | if (g[v].left.tracksSom()) { |
643 | continue; |
644 | } |
645 | |
646 | const NGHolder &h = *g[v].left.graph; |
647 | |
648 | if (g[v].left.lag != tbi.minLiteralLen(v) |
649 | || g[v].left.lag != tbi.maxLiteralLen(v)) { |
650 | continue; |
651 | } |
652 | |
653 | if (all_reports(h).size() != 1) { |
654 | assert(0); |
655 | continue; |
656 | } |
657 | |
658 | DEBUG_PRINTF("inspecting prefix of %zu\n" , g[v].index); |
659 | |
660 | if (!proper_out_degree(h.startDs, h)) { |
661 | if (handleStartPrefixCliche(h, g, v, e, ar, &to_delete)) { |
662 | continue; |
663 | } |
664 | } else { |
665 | if (handleStartDsPrefixCliche(h, g, v, e)) { |
666 | continue; |
667 | } |
668 | } |
669 | |
670 | /* prefix is not just a simple dot repeat. However, it is still |
671 | * possible that it consists of dot repeat and fixed width mask that we |
672 | * can handle. */ |
673 | handleMixedPrefixCliche(h, g, v, e, ar, &to_delete, tbi.cc); |
674 | } |
675 | |
676 | for (const auto &e : out_edges_range(ar, g)) { |
677 | RoseVertex v = target(e, g); |
678 | |
679 | /* note: vertices that we have rehomed will currently have an in-degree |
680 | * of 2 */ |
681 | if (in_degree(v, g) != 1) { |
682 | continue; |
683 | } |
684 | |
685 | if (!g[v].left.graph) { |
686 | continue; |
687 | } |
688 | |
689 | if (g[v].left.tracksSom()) { |
690 | continue; |
691 | } |
692 | |
693 | if (g[v].left.lag != tbi.minLiteralLen(v) |
694 | || g[v].left.lag != tbi.maxLiteralLen(v)) { |
695 | continue; |
696 | } |
697 | |
698 | const NGHolder &h = *g[v].left.graph; |
699 | if (all_reports(h).size() != 1) { |
700 | assert(0); |
701 | continue; |
702 | } |
703 | |
704 | DEBUG_PRINTF("inspecting prefix of %zu\n" , g[v].index); |
705 | |
706 | if (!proper_out_degree(h.startDs, h)) { |
707 | if (handleStartPrefixCliche(h, g, v, e, ar, &to_delete)) { |
708 | continue; |
709 | } |
710 | } else { |
711 | if (handleStartDsPrefixCliche(h, g, v, e)) { |
712 | continue; |
713 | } |
714 | } |
715 | |
716 | /* prefix is not just a simple dot repeat. However, it is still |
717 | * possible that it consists of dot repeat and fixed width mask that we |
718 | * can handle. */ |
719 | handleMixedPrefixCliche(h, g, v, e, ar, &to_delete, tbi.cc); |
720 | } |
721 | |
722 | for (const auto &e : to_delete) { |
723 | remove_edge(e, g); |
724 | } |
725 | } |
726 | |
727 | /** |
728 | * Identify dot-repeat infixes after fixed-depth literals and convert them to |
729 | * edges with ROSE_ROLE_HISTORY_ANCH history and equivalent bounds. |
730 | */ |
731 | void convertAnchPrefixToBounds(RoseBuildImpl &tbi) { |
732 | RoseGraph &g = tbi.g; |
733 | |
734 | for (const auto v : vertices_range(g)) { |
735 | if (!g[v].left) { |
736 | continue; |
737 | } |
738 | |
739 | DEBUG_PRINTF("vertex %zu\n" , g[v].index); |
740 | |
741 | // This pass runs after makeCastles, so we use the fact that bounded |
742 | // repeat detection has already been done for us. |
743 | |
744 | if (!g[v].left.castle) { |
745 | DEBUG_PRINTF("not a castle\n" ); |
746 | continue; |
747 | } |
748 | |
749 | const CastleProto &castle = *g[v].left.castle; |
750 | |
751 | if (castle.repeats.size() != 1) { |
752 | DEBUG_PRINTF("too many repeats\n" ); |
753 | assert(0); // Castles should not have been merged yet. |
754 | continue; |
755 | } |
756 | |
757 | if (!castle.reach().all()) { |
758 | DEBUG_PRINTF("not dot\n" ); |
759 | continue; |
760 | } |
761 | |
762 | if (in_degree(v, g) != 1) { |
763 | DEBUG_PRINTF("too many in-edges\n" ); |
764 | continue; |
765 | } |
766 | |
767 | RoseEdge e = *in_edges(v, g).first; |
768 | RoseVertex u = source(e, g); |
769 | |
770 | if (g[e].history != ROSE_ROLE_HISTORY_NONE) { |
771 | DEBUG_PRINTF("history already set to something other than NONE?\n" ); |
772 | assert(0); |
773 | continue; |
774 | } |
775 | |
776 | if (g[u].min_offset != g[u].max_offset) { |
777 | DEBUG_PRINTF("pred not fixed offset\n" ); |
778 | continue; |
779 | } |
780 | DEBUG_PRINTF("pred is fixed offset, at %u\n" , g[u].min_offset); |
781 | assert(g[u].min_offset < ROSE_BOUND_INF); |
782 | |
783 | size_t lit_length = tbi.minLiteralLen(v); |
784 | if (lit_length != tbi.maxLiteralLen(v)) { |
785 | assert(0); |
786 | DEBUG_PRINTF("variable literal lengths\n" ); |
787 | continue; |
788 | } |
789 | |
790 | u32 lag = g[v].left.lag; |
791 | DEBUG_PRINTF("lit_length=%zu, lag=%u\n" , lit_length, lag); |
792 | assert(lag <= lit_length); |
793 | depth delay_adj(lit_length - lag); |
794 | |
795 | const PureRepeat &pr = castle.repeats.begin()->second; |
796 | DEBUG_PRINTF("castle has repeat %s\n" , pr.bounds.str().c_str()); |
797 | DEBUG_PRINTF("delay adj %u\n" , (u32)delay_adj); |
798 | |
799 | if (delay_adj >= pr.bounds.max) { |
800 | DEBUG_PRINTF("delay adj too large\n" ); |
801 | continue; |
802 | } |
803 | |
804 | DepthMinMax bounds(pr.bounds); // copy |
805 | if (delay_adj > bounds.min) { |
806 | bounds.min = depth(0); |
807 | } else { |
808 | bounds.min -= delay_adj; |
809 | } |
810 | bounds.max -= delay_adj; |
811 | setEdgeBounds(g, e, bounds.min, bounds.max.is_finite() |
812 | ? (u32)bounds.max |
813 | : ROSE_BOUND_INF); |
814 | g[v].left.reset(); |
815 | } |
816 | } |
817 | |
818 | } // namespace ue2 |
819 | |