/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Rose compile-time analysis for lookaround masks.
 */
#include "rose_build_lookaround.h"

#include "rose_build_impl.h"
#include "nfa/castlecompile.h"
#include "nfa/goughcompile.h"
#include "nfa/rdfa.h"
#include "nfagraph/ng_repeat.h"
#include "nfagraph/ng_util.h"
#include "util/container.h"
#include "util/dump_charclass.h"
#include "util/graph_range.h"
#include "util/flat_containers.h"
#include "util/verify_types.h"

#include <cstdlib>
#include <queue>
#include <sstream>

using namespace std;

namespace ue2 {

/** \brief Max search distance for reachability in front of a role. */
static const u32 MAX_FWD_LEN = 64;

/** \brief Max search distance for reachability behind a role. */
static const u32 MAX_BACK_LEN = 64;

/** \brief Max lookaround entries for a role. */
static const u32 MAX_LOOKAROUND_ENTRIES = 16;

/** \brief We would rather have lookarounds with smaller reach than this. */
static const u32 LOOKAROUND_WIDE_REACH = 200;

#if defined(DEBUG) || defined(DUMP_SUPPORT)
static UNUSED
string dump(const map<s32, CharReach> &look) {
    ostringstream oss;
    for (auto it = look.begin(), ite = look.end(); it != ite; ++it) {
        if (it != look.begin()) {
            oss << ", ";
        }
        oss << "{" << it->first << ": " << describeClass(it->second) << "}";
    }
    return oss.str();
}
#endif

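/**
 * \brief Accumulate the union of reach at each offset in front of an NFA
 * graph's start, following only the successors triggered by the given top.
 */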
static
void getForwardReach(const NGHolder &g, u32 top, map<s32, CharReach> &look) {
    flat_set<NFAVertex> curr, next;

    // Consider only successors of start with the required top.
    for (const auto &e : out_edges_range(g.start, g)) {
        NFAVertex v = target(e, g);
        if (v == g.startDs) {
            continue;
        }
        if (contains(g[e].tops, top)) {
            curr.insert(v);
        }
    }

    for (u32 i = 0; i < MAX_FWD_LEN; i++) {
        if (curr.empty() || contains(curr, g.accept) ||
            contains(curr, g.acceptEod)) {
            break;
        }

        next.clear();
        CharReach cr;

        for (auto v : curr) {
            assert(!is_special(v, g));
            cr |= g[v].char_reach;
            insert(&next, adjacent_vertices(v, g));
        }

        assert(cr.any());
        look[i] |= cr;
        curr.swap(next);
    }
}

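/**
 * \brief Accumulate the union of reach at each offset behind the accepts for
 * the given report, beginning at offset -(lag + 1).
 */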
static
void getBackwardReach(const NGHolder &g, ReportID report, u32 lag,
                      map<s32, CharReach> &look) {
    flat_set<NFAVertex> curr, next;

    for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
        if (contains(g[v].reports, report)) {
            curr.insert(v);
        }
    }

    for (u32 i = lag + 1; i <= MAX_BACK_LEN; i++) {
        if (curr.empty() || contains(curr, g.start) ||
            contains(curr, g.startDs)) {
            break;
        }

        next.clear();
        CharReach cr;

        for (auto v : curr) {
            assert(!is_special(v, g));
            cr |= g[v].char_reach;
            insert(&next, inv_adjacent_vertices(v, g));
        }

        assert(cr.any());
        look[0 - i] |= cr;
        curr.swap(next);
    }
}

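/**
 * \brief Forward reach for a castle: its uniform reach applies to the first
 * min(bounds.min, MAX_FWD_LEN) offsets for the given top.
 */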
static
void getForwardReach(const CastleProto &castle, u32 top,
                     map<s32, CharReach> &look) {
    depth len = castle.repeats.at(top).bounds.min;
    len = min(len, depth(MAX_FWD_LEN));
    assert(len.is_finite());

    const CharReach &cr = castle.reach();
    for (u32 i = 0; i < len; i++) {
        look[i] |= cr;
    }
}

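/**
 * \brief Backward reach for a castle: its uniform reach applies from offset
 * -(lag + 1) back as far as the smallest min bound over the repeats that
 * fire the given report.
 */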
static
void getBackwardReach(const CastleProto &castle, ReportID report, u32 lag,
                      map<s32, CharReach> &look) {
    depth min_depth = depth::infinity();
    for (const auto &m : castle.repeats) {
        const PureRepeat &pr = m.second;
        if (contains(pr.reports, report)) {
            min_depth = min(min_depth, pr.bounds.min);
        }
    }

    if (!min_depth.is_finite()) {
        assert(0);
        return;
    }

    const CharReach &cr = castle.reach();
    for (u32 i = lag + 1; i <= min(lag + (u32)min_depth, MAX_BACK_LEN); i++) {
        look[0 - i] |= cr;
    }
}

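/**
 * \brief Forward reach for a DFA: breadth-first walk from the anchored start
 * state, giving up as soon as an accepting state is reached.
 */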
static
void getForwardReach(const raw_dfa &rdfa, map<s32, CharReach> &look) {
    if (rdfa.states.size() < 2) {
        return;
    }

    flat_set<dstate_id_t> curr, next;
    curr.insert(rdfa.start_anchored);

    for (u32 i = 0; i < MAX_FWD_LEN && !curr.empty(); i++) {
        next.clear();
        CharReach cr;

        for (const auto state_id : curr) {
            const dstate &ds = rdfa.states[state_id];

            if (!ds.reports.empty() || !ds.reports_eod.empty()) {
                return;
            }

            for (unsigned c = 0; c < N_CHARS; c++) {
                dstate_id_t succ = ds.next[rdfa.alpha_remap[c]];
                if (succ != DEAD_STATE) {
                    cr.set(c);
                    next.insert(succ);
                }
            }
        }

        assert(cr.any());
        look[i] |= cr;
        curr.swap(next);
    }
}

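/** \brief Forward reach in front of a suffix engine, dispatched on type. */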
static
void getSuffixForwardReach(const suffix_id &suff, u32 top,
                           map<s32, CharReach> &look) {
    if (suff.graph()) {
        getForwardReach(*suff.graph(), top, look);
    } else if (suff.castle()) {
        getForwardReach(*suff.castle(), top, look);
    } else if (suff.dfa()) {
        assert(top == 0); // DFA isn't multi-top capable.
        getForwardReach(*suff.dfa(), look);
    } else if (suff.haig()) {
        assert(top == 0); // DFA isn't multi-top capable.
        getForwardReach(*suff.haig(), look);
    }
}

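/**
 * \brief Forward reach in front of a rose (leftfix) engine, dispatched on
 * engine type.
 */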
static
void getRoseForwardReach(const left_id &left, u32 top,
                         map<s32, CharReach> &look) {
    if (left.graph()) {
        getForwardReach(*left.graph(), top, look);
    } else if (left.castle()) {
        getForwardReach(*left.castle(), top, look);
    } else if (left.dfa()) {
        assert(top == 0); // DFA isn't multi-top capable.
        getForwardReach(*left.dfa(), look);
    } else if (left.haig()) {
        assert(top == 0); // DFA isn't multi-top capable.
        getForwardReach(*left.haig(), look);
    }
}

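/**
 * \brief Union the per-engine forward masks; an offset that any engine
 * leaves unconstrained becomes "all characters".
 */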
static
void combineForwardMasks(const vector<map<s32, CharReach>> &rose_look,
                         map<s32, CharReach> &look) {
    for (u32 i = 0; i < MAX_FWD_LEN; i++) {
        for (const auto &rlook : rose_look) {
            if (contains(rlook, i)) {
                look[i] |= rlook.at(i);
            } else {
                look[i].setall();
            }
        }
    }
}

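/**
 * \brief Forward reach for a role: combines the masks of every successor
 * leftfix, plus the role's suffix if it has one.
 */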
static
void findForwardReach(const RoseGraph &g, const RoseVertex v,
                      map<s32, CharReach> &look) {
    if (!g[v].reports.empty()) {
        DEBUG_PRINTF("acceptor\n");
        return;
    }

    // Non-leaf vertices can pick up a mask per successor prefix rose
    // engine.
    vector<map<s32, CharReach>> rose_look;
    for (const auto &e : out_edges_range(v, g)) {
        RoseVertex t = target(e, g);
        if (!g[t].left) {
            DEBUG_PRINTF("successor %zu has no leftfix\n", g[t].index);
            return;
        }
        rose_look.push_back(map<s32, CharReach>());
        getRoseForwardReach(g[t].left, g[e].rose_top, rose_look.back());
    }

    if (g[v].suffix) {
        DEBUG_PRINTF("suffix engine\n");
        rose_look.push_back(map<s32, CharReach>());
        getSuffixForwardReach(g[v].suffix, g[v].suffix.top, rose_look.back());
    }

    combineForwardMasks(rose_look, look);
}

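/**
 * \brief Backward reach for a role, derived from its own leftfix (graph and
 * castle engines only).
 */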
static
void findBackwardReach(const RoseGraph &g, const RoseVertex v,
                       map<s32, CharReach> &look) {
    if (!g[v].left) {
        return;
    }

    DEBUG_PRINTF("leftfix, report=%u, lag=%u\n", g[v].left.leftfix_report,
                 g[v].left.lag);

    if (g[v].left.graph) {
        getBackwardReach(*g[v].left.graph, g[v].left.leftfix_report,
                         g[v].left.lag, look);
    } else if (g[v].left.castle) {
        getBackwardReach(*g[v].left.castle, g[v].left.leftfix_report,
                         g[v].left.lag, look);
    }

    // TODO: implement DFA variants if necessary.
}

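/**
 * \brief Erase entries whose reach is "all characters"; they test nothing.
 */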
static
void normalise(map<s32, CharReach> &look) {
    // We can erase entries where the reach is "all characters".
    vector<s32> dead;
    for (const auto &m : look) {
        if (m.second.all()) {
            dead.push_back(m.first);
        }
    }
    erase_all(&look, dead);
}

namespace {

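/**
 * \brief Comparator for use with a priority_queue: pops the entry with the
 * widest reach first, breaking ties in favour of offsets further from the
 * end of the literal.
 */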
struct LookPriority {
    explicit LookPriority(const map<s32, CharReach> &look_in) : look(look_in) {}

    bool operator()(s32 a, s32 b) const {
        const CharReach &a_reach = look.at(a);
        const CharReach &b_reach = look.at(b);
        if (a_reach.count() != b_reach.count()) {
            return a_reach.count() < b_reach.count();
        }
        return abs(a) < abs(b);
    }

private:
    const map<s32, CharReach> &look;
};

} // namespace

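/**
 * \brief True if every entry in \a look overlaps \a flood_cr, i.e. the
 * lookaround cannot reject a flood of those characters.
 */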
static
bool isFloodProne(const map<s32, CharReach> &look, const CharReach &flood_cr) {
    for (const auto &m : look) {
        const CharReach &look_cr = m.second;
        if (!overlaps(look_cr, flood_cr)) {
            return false;
        }
    }
    DEBUG_PRINTF("look can't escape flood on %s\n",
                 describeClass(flood_cr).c_str());
    return true;
}

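/**
 * \brief True if the lookaround is flood-prone for any of the given flood
 * reaches.
 */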
static
bool isFloodProne(const map<s32, CharReach> &look,
                  const set<CharReach> &flood_reach) {
    if (flood_reach.empty()) {
        return false;
    }

    for (const CharReach &flood_cr : flood_reach) {
        if (isFloodProne(look, flood_cr)) {
            return true;
        }
    }

    return false;
}

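/**
 * \brief Cut the lookaround down to MAX_LOOKAROUND_ENTRIES by greedily
 * erasing the least valuable entries, then erase very wide entries, in both
 * passes declining any removal that would leave us flood-prone.
 */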
static
void reduce(map<s32, CharReach> &look, set<CharReach> &flood_reach) {
    if (look.size() <= MAX_LOOKAROUND_ENTRIES) {
        return;
    }

    DEBUG_PRINTF("before reduce: %s\n", dump(look).c_str());

    // First, remove floods that we already can't escape; they shouldn't
    // affect the analysis below.
    for (auto it = flood_reach.begin(); it != flood_reach.end();) {
        if (isFloodProne(look, *it)) {
            DEBUG_PRINTF("removing inescapable flood on %s from analysis\n",
                         describeClass(*it).c_str());
            flood_reach.erase(it++);
        } else {
            ++it;
        }
    }

    LookPriority cmp(look);
    priority_queue<s32, vector<s32>, LookPriority> pq(cmp);
    for (const auto &m : look) {
        pq.push(m.first);
    }

    while (!pq.empty() && look.size() > MAX_LOOKAROUND_ENTRIES) {
        s32 d = pq.top();
        assert(contains(look, d));
        const CharReach cr(look[d]); // copy
        pq.pop();

        DEBUG_PRINTF("erasing {%d: %s}\n", d, describeClass(cr).c_str());
        look.erase(d);

        // If removing this entry would result in us becoming flood_prone on
        // a particular flood_reach case, reinstate it and move on.
        if (isFloodProne(look, flood_reach)) {
            DEBUG_PRINTF("reinstating {%d: %s} due to flood-prone check\n", d,
                         describeClass(cr).c_str());
            look.insert(make_pair(d, cr));
        }
    }

    while (!pq.empty()) {
        s32 d = pq.top();
        assert(contains(look, d));
        const CharReach cr(look[d]); // copy
        pq.pop();

        if (cr.count() < LOOKAROUND_WIDE_REACH) {
            continue;
        }

        DEBUG_PRINTF("erasing {%d: %s}\n", d, describeClass(cr).c_str());
        look.erase(d);

        // If removing this entry would result in us becoming flood_prone on
        // a particular flood_reach case, reinstate it and move on.
        if (isFloodProne(look, flood_reach)) {
            DEBUG_PRINTF("reinstating {%d: %s} due to flood-prone check\n", d,
                         describeClass(cr).c_str());
            look.insert(make_pair(d, cr));
        }
    }

    DEBUG_PRINTF("after reduce: %s\n", dump(look).c_str());
}

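/**
 * \brief Collect the reach of each flood-prone literal (one whose characters
 * all share the same reach) attached to the vertex.
 */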
static
void findFloodReach(const RoseBuildImpl &tbi, const RoseVertex v,
                    set<CharReach> &flood_reach) {
    for (u32 lit_id : tbi.g[v].literals) {
        const ue2_literal &s = tbi.literals.at(lit_id).s;
        if (s.empty()) {
            continue;
        }
        if (is_flood(s)) {
            CharReach cr(*s.begin());
            DEBUG_PRINTF("flood-prone with reach: %s\n",
                         describeClass(cr).c_str());
            flood_reach.insert(cr);
        }
    }
}

namespace {
struct LookProto {
    LookProto(s32 offset_in, CharReach reach_in)
        : offset(offset_in), reach(std::move(reach_in)) {}
    s32 offset;
    CharReach reach;
};
} // namespace

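/**
 * \brief Per-byte reach of a literal, at offsets -(length + delay) through
 * -(delay + 1) relative to the end of its match.
 */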
static
vector<LookProto> findLiteralReach(const rose_literal_id &lit) {
    vector<LookProto> look;
    look.reserve(lit.s.length());

    s32 i = 0 - lit.s.length() - lit.delay;
    for (const auto &c : lit.s) {
        look.emplace_back(i, c);
        i++;
    }

    return look;
}

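/**
 * \brief Merge the per-byte reach of all of the vertex's literals, keeping
 * only the offsets covered by every literal and taking the union of their
 * reach at each one.
 */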
static
vector<LookProto> findLiteralReach(const RoseBuildImpl &build,
                                   const RoseVertex v) {
    bool first = true;
    vector<LookProto> look;

    for (u32 lit_id : build.g[v].literals) {
        const rose_literal_id &lit = build.literals.at(lit_id);
        auto lit_look = findLiteralReach(lit);

        if (first) {
            look = std::move(lit_look);
            first = false;
            continue;
        }

        // Erase elements from look with keys not in lit_look. Where a key is
        // in both maps, union its reach with the lookaround.
        auto jt = begin(lit_look);
        for (auto it = begin(look); it != end(look);) {
            if (jt == end(lit_look)) {
                // No further lit_look entries, erase remaining elements from
                // look.
                look.erase(it, end(look));
                break;
            }
            if (it->offset < jt->offset) {
                // Offset is present in look but not in lit_look, erase.
                it = look.erase(it);
            } else if (it->offset > jt->offset) {
                // Offset is present in lit_look but not in look, ignore.
                ++jt;
            } else {
                // Offset is present in both, union its reach with look.
                it->reach |= jt->reach;
                ++it;
                ++jt;
            }
        }
    }

    return look;
}

/**
 * Trim lookaround checks from the prefix that overlap with the literals
 * themselves.
 */
static
void trimLiterals(const RoseBuildImpl &build, const RoseVertex v,
                  map<s32, CharReach> &look) {
    DEBUG_PRINTF("pre-trim lookaround: %s\n", dump(look).c_str());

    for (const auto &m : findLiteralReach(build, v)) {
        auto it = look.find(m.offset);
        if (it == end(look)) {
            continue;
        }
        if (m.reach.isSubsetOf(it->second)) {
            DEBUG_PRINTF("can trim entry at %d\n", it->first);
            look.erase(it);
        }
    }

    DEBUG_PRINTF("post-trim lookaround: %s\n", dump(look).c_str());
}

static
void normaliseLeftfix(map<s32, CharReach> &look) {
    // We can erase entries where the reach is "all characters", except for the
    // very first one -- this might be required to establish a minimum bound on
    // the literal's match offset.

    // TODO: It would be cleaner to use a literal program instruction to check
    // the minimum bound explicitly.

    if (look.empty()) {
        return;
    }

    const auto earliest = begin(look)->first;

    vector<s32> dead;
    for (const auto &m : look) {
        if (m.second.all() && m.first != earliest) {
            dead.push_back(m.first);
        }
    }
    erase_all(&look, dead);
}

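/**
 * \brief Trim and normalise the mask for each path; fails if any path ends
 * up empty or with more than MAX_LOOKAROUND_ENTRIES entries.
 */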
static
bool trimMultipathLeftfix(const RoseBuildImpl &build, const RoseVertex v,
                          vector<map<s32, CharReach>> &looks) {
    size_t path_count = 0;
    for (auto &look : looks) {
        ++path_count;
        DEBUG_PRINTF("Path #%zu\n", path_count);

        assert(!look.empty());
        trimLiterals(build, v, look);

        if (look.empty()) {
            return false;
        }

        // Could be optimized: only the longest path needs to keep its
        // earliest full-reach entry.
        normaliseLeftfix(look);

        if (look.size() > MAX_LOOKAROUND_ENTRIES) {
            DEBUG_PRINTF("lookaround too big (%zu entries)\n", look.size());
            return false;
        }
    }
    return true;
}

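/**
 * \brief Convert per-path masks into LookEntry vectors, giving up entirely
 * if any offset falls outside the range of an s8.
 */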
static
void transToLookaround(const vector<map<s32, CharReach>> &looks,
                       vector<vector<LookEntry>> &lookarounds) {
    for (const auto &look : looks) {
        vector<LookEntry> lookaround;
        DEBUG_PRINTF("lookaround: %s\n", dump(look).c_str());
        lookaround.reserve(look.size());
        for (const auto &m : look) {
            if (m.first < -128 || m.first > 127) {
                DEBUG_PRINTF("range too big\n");
                lookarounds.clear();
                return;
            }
            s8 offset = verify_s8(m.first);
            lookaround.emplace_back(offset, m.second);
        }
        lookarounds.push_back(lookaround);
    }
}

void findLookaroundMasks(const RoseBuildImpl &tbi, const RoseVertex v,
                         vector<LookEntry> &lookaround) {
    lookaround.clear();

    const RoseGraph &g = tbi.g;

    map<s32, CharReach> look;
    findBackwardReach(g, v, look);
    findForwardReach(g, v, look);
    trimLiterals(tbi, v, look);

    if (look.empty()) {
        return;
    }

    normalise(look);

    if (look.empty()) {
        return;
    }

    set<CharReach> flood_reach;
    findFloodReach(tbi, v, flood_reach);
    reduce(look, flood_reach);

    if (look.empty()) {
        return;
    }

    DEBUG_PRINTF("lookaround: %s\n", dump(look).c_str());
    lookaround.reserve(look.size());
    for (const auto &m : look) {
        s8 offset = verify_s8(m.first);
        lookaround.emplace_back(offset, m.second);
    }
}

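/**
 * \brief Check whether the reaches on all paths can be encoded with at most
 * \a bucket_size shufti buckets, counting one bucket for each distinct pair
 * of low-nibble and high-nibble masks.
 */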
static
bool checkShuftiBuckets(const vector<map<s32, CharReach>> &looks,
                        u32 bucket_size) {
    set<u32> bucket;
    for (const auto &look : looks) {
        for (const auto &l : look) {
            CharReach cr = l.second;
            if (cr.count() > 128) {
                cr.flip();
            }
            map<u16, u16> lo2hi;

            for (size_t i = cr.find_first(); i != CharReach::npos;) {
                u8 it_hi = i >> 4;
                u16 low_encode = 0;
                while (i != CharReach::npos && (i >> 4) == it_hi) {
                    low_encode |= 1 << (i & 0xf);
                    i = cr.find_next(i);
                }
                lo2hi[low_encode] |= 1 << it_hi;
            }

            for (const auto &it : lo2hi) {
                u32 hi_lo = (it.second << 16) | it.first;
                bucket.insert(hi_lo);
            }
        }
    }
    DEBUG_PRINTF("shufti has %zu bucket(s)\n", bucket.size());
    return bucket.size() <= bucket_size;
}

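/**
 * \brief Build one lookaround mask per backward path through an acyclic,
 * floating, transient prefix graph; fails if the graph is cyclic or its
 * paths are too long, too numerous or would need too many shufti buckets.
 */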
static
bool getTransientPrefixReach(const NGHolder &g, ReportID report, u32 lag,
                             vector<map<s32, CharReach>> &looks) {
    if (!isAcyclic(g)) {
        DEBUG_PRINTF("contains back-edge\n");
        return false;
    }

    // Must be floating chains wired to startDs.
    if (!isFloating(g)) {
        DEBUG_PRINTF("not a floating start\n");
        return false;
    }

    vector<NFAVertex> curr;
    for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
        if (v == g.start || v == g.startDs) {
            DEBUG_PRINTF("empty graph\n");
            return true;
        }
        if (contains(g[v].reports, report)) {
            curr.push_back(v);
        }
    }

    assert(!curr.empty());

    u32 total_len = curr.size();

    for (const auto &v : curr) {
        looks.emplace_back(map<s32, CharReach>());
        looks.back()[0 - (lag + 1)] = g[v].char_reach;
    }

    bool curr_active = false;

    /* For each offset -i, we trace the paths backwards through the vertices
     * in curr. Once there are more than 8 paths and total_len exceeds 64,
     * neither MULTIPATH_LOOKAROUND nor MULTIPATH_SHUFTI can be built, so we
     * give up on path finding. Otherwise, the loop halts when every vertex
     * in curr is startDs.
     */
    for (u32 i = lag + 2; i < (lag + 2) + MAX_BACK_LEN; i++) {
        curr_active = false;
        size_t curr_size = curr.size();
        if (curr.size() > 1 && i > lag + MULTIPATH_MAX_LEN) {
            DEBUG_PRINTF("range is larger than 16 in multi-path\n");
            return false;
        }

        for (size_t idx = 0; idx < curr_size; idx++) {
            NFAVertex v = curr[idx];
            if (v == g.startDs) {
                continue;
            }
            assert(!is_special(v, g));

            for (auto u : inv_adjacent_vertices_range(v, g)) {
                if (u == g.start || u == g.startDs) {
                    curr[idx] = g.startDs;
                    break;
                }
            }

            if (is_special(curr[idx], g)) {
                continue;
            }

            for (auto u : inv_adjacent_vertices_range(v, g)) {
                curr_active = true;
                if (curr[idx] == v) {
                    curr[idx] = u;
                    looks[idx][0 - i] = g[u].char_reach;
                    total_len++;
                } else {
                    curr.push_back(u);
                    looks.push_back(looks[idx]);
                    (looks.back())[0 - i] = g[u].char_reach;
                    total_len += looks.back().size();
                }

                if (curr.size() > MAX_LOOKAROUND_PATHS && total_len > 64) {
                    DEBUG_PRINTF("too many branches\n");
                    return false;
                }
            }
        }
        if (!curr_active) {
            break;
        }
    }

    if (curr_active) {
        DEBUG_PRINTF("single path too long\n");
        return false;
    }

    // More than 8 paths, check multi-path shufti.
    if (curr.size() > MAX_LOOKAROUND_PATHS) {
        u32 bucket_size = total_len > 32 ? 8 : 16;
        if (!checkShuftiBuckets(looks, bucket_size)) {
            DEBUG_PRINTF("shufti has too many buckets\n");
            return false;
        }
    }

    assert(!looks.empty());
    if (looks.size() == 1) {
        DEBUG_PRINTF("single lookaround\n");
    } else {
        DEBUG_PRINTF("multi-path lookaround\n");
    }
    DEBUG_PRINTF("done\n");
    return true;
}

bool makeLeftfixLookaround(const RoseBuildImpl &build, const RoseVertex v,
                           vector<vector<LookEntry>> &lookaround) {
    lookaround.clear();

    const RoseGraph &g = build.g;
    const left_id leftfix(g[v].left);

    if (!contains(build.transient, leftfix)) {
        DEBUG_PRINTF("not transient\n");
        return false;
    }

    if (!leftfix.graph()) {
        DEBUG_PRINTF("only supported for graphs so far\n");
        return false;
    }

    vector<map<s32, CharReach>> looks;
    if (!getTransientPrefixReach(*leftfix.graph(), g[v].left.leftfix_report,
                                 g[v].left.lag, looks)) {
        DEBUG_PRINTF("graph has a loop or is too large\n");
        return false;
    }

    if (!trimMultipathLeftfix(build, v, looks)) {
        return false;
    }
    transToLookaround(looks, lookaround);

    return !lookaround.empty();
}

void mergeLookaround(vector<LookEntry> &lookaround,
                     const vector<LookEntry> &more_lookaround) {
    if (lookaround.size() >= MAX_LOOKAROUND_ENTRIES) {
        DEBUG_PRINTF("big enough!\n");
        return;
    }

    // Don't merge lookarounds at offsets we already have entries for.
    flat_set<s8> offsets;
    for (const auto &e : lookaround) {
        offsets.insert(e.offset);
    }

    map<s32, CharReach> more;
    LookPriority cmp(more);
    priority_queue<s32, vector<s32>, LookPriority> pq(cmp);
    for (const auto &e : more_lookaround) {
        if (!contains(offsets, e.offset)) {
            more.emplace(e.offset, e.reach);
            pq.push(e.offset);
        }
    }

    while (!pq.empty() && lookaround.size() < MAX_LOOKAROUND_ENTRIES) {
        const s32 offset = pq.top();
        pq.pop();
        const auto &cr = more.at(offset);
        DEBUG_PRINTF("added {%d,%s}\n", offset, describeClass(cr).c_str());
        lookaround.emplace_back(verify_s8(offset), cr);
    }

    // Order by offset.
    sort(begin(lookaround), end(lookaround),
         [](const LookEntry &a, const LookEntry &b) {
             return a.offset < b.offset;
         });
}

} // namespace ue2