1 | /* |
2 | * Copyright (c) 2015-2017, Intel Corporation |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are met: |
6 | * |
7 | * * Redistributions of source code must retain the above copyright notice, |
8 | * this list of conditions and the following disclaimer. |
9 | * * Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * * Neither the name of Intel Corporation nor the names of its contributors |
13 | * may be used to endorse or promote products derived from this software |
14 | * without specific prior written permission. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | /** |
30 | * \file |
31 | * \brief Propagate extended parameters to vertex reports and reduce graph if |
32 | * possible. |
33 | * |
34 | * This code handles the propagation of the extension parameters specified by |
35 | * the user with the \ref hs_expr_ext structure into the reports on the graph's |
36 | * vertices. |
37 | * |
38 | * There are also some analyses that prune edges that cannot contribute to a |
39 | * match given these constraints, or transform the graph in order to make a |
40 | * constraint implicit. |
41 | */ |
42 | |
43 | #include "ng_extparam.h" |
44 | |
45 | #include "ng.h" |
46 | #include "ng_depth.h" |
47 | #include "ng_dump.h" |
48 | #include "ng_prune.h" |
49 | #include "ng_reports.h" |
50 | #include "ng_som_util.h" |
51 | #include "ng_width.h" |
52 | #include "ng_util.h" |
53 | #include "ue2common.h" |
54 | #include "compiler/compiler.h" |
55 | #include "parser/position.h" |
56 | #include "util/compile_context.h" |
57 | #include "util/compile_error.h" |
58 | #include "util/container.h" |
59 | #include "util/graph.h" |
60 | #include "util/graph_range.h" |
61 | |
62 | #include <sstream> |
63 | #include <string> |
64 | |
65 | using namespace std; |
66 | |
67 | namespace ue2 { |
68 | |
69 | static const u32 MAX_MAXOFFSET_TO_ANCHOR = 2000; |
70 | static const u32 MAX_MINLENGTH_TO_CONVERT = 2000; |
71 | |
72 | /** True if all the given reports have the same extparam bounds. */ |
73 | template<typename Container> |
74 | bool hasSameBounds(const Container &reports, const ReportManager &rm) { |
75 | assert(!reports.empty()); |
76 | |
77 | const auto &first = rm.getReport(*reports.begin()); |
78 | for (auto id : reports) { |
79 | const auto &report = rm.getReport(id); |
80 | if (report.minOffset != first.minOffset || |
81 | report.maxOffset != first.maxOffset || |
82 | report.minLength != first.minLength) { |
83 | return false; |
84 | } |
85 | } |
86 | |
87 | return true; |
88 | } |
89 | |
90 | /** |
91 | * \brief Find the (min, max) offset adjustment for the reports on a given |
92 | * vertex. |
93 | */ |
94 | static |
95 | pair<s32,s32> getMinMaxOffsetAdjust(const ReportManager &rm, |
96 | const NGHolder &g, NFAVertex v) { |
97 | s32 minAdj = 0, maxAdj = 0; |
98 | const auto &reports = g[v].reports; |
99 | for (auto ri = reports.begin(), re = reports.end(); ri != re; ++ri) { |
100 | const Report &ir = rm.getReport(*ri); |
101 | if (ri == reports.begin()) { |
102 | minAdj = ir.offsetAdjust; |
103 | maxAdj = ir.offsetAdjust; |
104 | } else { |
105 | minAdj = min(minAdj, ir.offsetAdjust); |
106 | maxAdj = max(maxAdj, ir.offsetAdjust); |
107 | } |
108 | } |
109 | |
110 | return make_pair(minAdj, maxAdj); |
111 | } |
112 | |
113 | /** \brief Find the (min, max) length of any match for the given holder. */ |
114 | static |
115 | DepthMinMax findMatchLengths(const ReportManager &rm, const NGHolder &g) { |
116 | DepthMinMax match_depths; |
117 | |
118 | vector<DepthMinMax> depths = getDistancesFromSOM(g); |
119 | |
120 | pair<s32, s32> adj; |
121 | |
122 | for (auto v : inv_adjacent_vertices_range(g.accept, g)) { |
123 | u32 idx = g[v].index; |
124 | DepthMinMax d = depths[idx]; // copy |
125 | adj = getMinMaxOffsetAdjust(rm, g, v); |
126 | DEBUG_PRINTF("vertex %u: depths=%s, adj=[%d,%d]\n" , idx, |
127 | d.str().c_str(), adj.first, adj.second); |
128 | d.min += adj.first; |
129 | d.max += adj.second; |
130 | match_depths = unionDepthMinMax(match_depths, d); |
131 | } |
132 | |
133 | for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) { |
134 | if (v == g.accept) { |
135 | continue; |
136 | } |
137 | u32 idx = g[v].index; |
138 | DepthMinMax d = depths[idx]; // copy |
139 | adj = getMinMaxOffsetAdjust(rm, g, v); |
140 | DEBUG_PRINTF("vertex %u: depths=%s, adj=[%d,%d]\n" , idx, |
141 | d.str().c_str(), adj.first, adj.second); |
142 | d.min += adj.first; |
143 | d.max += adj.second; |
144 | match_depths = unionDepthMinMax(match_depths, d); |
145 | } |
146 | |
147 | DEBUG_PRINTF("match_depths=%s\n" , match_depths.str().c_str()); |
148 | |
149 | assert(match_depths.min.is_reachable()); |
150 | assert(match_depths.max.is_reachable()); |
151 | return match_depths; |
152 | } |
153 | |
154 | template<typename Function> |
155 | void replaceReports(NGHolder &g, NFAVertex accept, flat_set<NFAVertex> &seen, |
156 | Function func) { |
157 | for (auto v : inv_adjacent_vertices_range(accept, g)) { |
158 | if (v == g.accept) { |
159 | // Don't operate on accept: the accept->acceptEod edge is stylised. |
160 | assert(accept == g.acceptEod); |
161 | assert(g[v].reports.empty()); |
162 | continue; |
163 | } |
164 | |
165 | if (!seen.insert(v).second) { |
166 | continue; // We have already processed v. |
167 | } |
168 | |
169 | auto &reports = g[v].reports; |
170 | if (reports.empty()) { |
171 | continue; |
172 | } |
173 | decltype(g[v].reports) new_reports; |
174 | for (auto id : g[v].reports) { |
175 | new_reports.insert(func(v, id)); |
176 | } |
177 | reports = std::move(new_reports); |
178 | } |
179 | } |
180 | |
181 | /** |
182 | * Generic function for replacing all the reports in the graph. |
183 | * |
184 | * Pass this a function that takes a vertex and a ReportID returns another |
185 | * ReportID (or the same one) to replace it with. |
186 | */ |
187 | template<typename Function> |
188 | void replaceReports(NGHolder &g, Function func) { |
189 | flat_set<NFAVertex> seen; |
190 | replaceReports(g, g.accept, seen, func); |
191 | replaceReports(g, g.acceptEod, seen, func); |
192 | } |
193 | |
194 | /** \brief Replace the graph's reports with new reports that specify bounds. */ |
195 | static |
196 | void updateReportBounds(ReportManager &rm, NGHolder &g, |
197 | const ExpressionInfo &expr) { |
198 | DEBUG_PRINTF("updating report bounds\n" ); |
199 | replaceReports(g, [&](NFAVertex, ReportID id) { |
200 | Report report = rm.getReport(id); // make a copy |
201 | assert(!report.hasBounds()); |
202 | |
203 | // Note that we need to cope with offset adjustment here. |
204 | |
205 | report.minOffset = expr.min_offset - report.offsetAdjust; |
206 | if (expr.max_offset == MAX_OFFSET) { |
207 | report.maxOffset = MAX_OFFSET; |
208 | } else { |
209 | report.maxOffset = expr.max_offset - report.offsetAdjust; |
210 | } |
211 | assert(report.maxOffset >= report.minOffset); |
212 | |
213 | report.minLength = expr.min_length; |
214 | if (expr.min_length && !expr.som) { |
215 | report.quashSom = true; |
216 | } |
217 | |
218 | DEBUG_PRINTF("id %u -> min_offset=%llu, max_offset=%llu, " |
219 | "min_length=%llu\n" , id, report.minOffset, |
220 | report.maxOffset, report.minLength); |
221 | |
222 | return rm.getInternalId(report); |
223 | }); |
224 | } |
225 | |
226 | static |
227 | bool hasVirtualStarts(const NGHolder &g) { |
228 | for (auto v : adjacent_vertices_range(g.start, g)) { |
229 | if (g[v].assert_flags & POS_FLAG_VIRTUAL_START) { |
230 | return true; |
231 | } |
232 | } |
233 | return false; |
234 | } |
235 | |
236 | /** Set the min_length param for all reports to zero. */ |
237 | static |
238 | void clearMinLengthParam(NGHolder &g, ReportManager &rm) { |
239 | DEBUG_PRINTF("clearing min length\n" ); |
240 | replaceReports(g, [&rm](NFAVertex, ReportID id) { |
241 | const auto &report = rm.getReport(id); |
242 | if (report.minLength) { |
243 | Report new_report = report; |
244 | new_report.minLength = 0; |
245 | return rm.getInternalId(new_report); |
246 | } |
247 | return id; |
248 | }); |
249 | } |
250 | |
251 | /** |
252 | * Set the min_offset param to zero and the max_offset param to MAX_OFFSET for |
253 | * all reports. |
254 | */ |
255 | static |
256 | void clearOffsetParams(NGHolder &g, ReportManager &rm) { |
257 | DEBUG_PRINTF("clearing min and max offset\n" ); |
258 | replaceReports(g, [&rm](NFAVertex, ReportID id) { |
259 | const auto &report = rm.getReport(id); |
260 | if (report.minLength) { |
261 | Report new_report = report; |
262 | new_report.minOffset = 0; |
263 | new_report.maxOffset = MAX_OFFSET; |
264 | return rm.getInternalId(new_report); |
265 | } |
266 | return id; |
267 | }); |
268 | } |
269 | |
270 | /** |
271 | * If the pattern is unanchored, has a max_offset and has not asked for SOM, we |
272 | * can use that knowledge to anchor it which will limit its lifespan. Note that |
273 | * we can't use this transformation if there's a min_length, as it's currently |
274 | * handled using "sly SOM". |
275 | * |
276 | * Note that it is possible to handle graphs that have a combination of |
277 | * anchored and unanchored paths, but it's too tricky for the moment. |
278 | */ |
279 | static |
280 | bool anchorPatternWithBoundedRepeat(NGHolder &g, ReportManager &rm) { |
281 | if (!isFloating(g)) { |
282 | return false; |
283 | } |
284 | |
285 | const auto &reports = all_reports(g); |
286 | if (reports.empty()) { |
287 | return false; |
288 | } |
289 | |
290 | if (any_of_in(reports, [&](ReportID id) { |
291 | const auto &report = rm.getReport(id); |
292 | return report.maxOffset == MAX_OFFSET || report.minLength || |
293 | report.offsetAdjust; |
294 | })) { |
295 | return false; |
296 | } |
297 | |
298 | if (!hasSameBounds(reports, rm)) { |
299 | DEBUG_PRINTF("mixed report bounds\n" ); |
300 | return false; |
301 | } |
302 | |
303 | const depth minWidth = findMinWidth(g); |
304 | const depth maxWidth = findMaxWidth(g); |
305 | |
306 | assert(minWidth <= maxWidth); |
307 | assert(maxWidth.is_reachable()); |
308 | |
309 | const auto &first_report = rm.getReport(*reports.begin()); |
310 | const auto min_offset = first_report.minOffset; |
311 | const auto max_offset = first_report.maxOffset; |
312 | assert(max_offset < MAX_OFFSET); |
313 | |
314 | DEBUG_PRINTF("widths=[%s,%s], min/max offsets=[%llu,%llu]\n" , |
315 | minWidth.str().c_str(), maxWidth.str().c_str(), |
316 | min_offset, max_offset); |
317 | |
318 | if (max_offset > MAX_MAXOFFSET_TO_ANCHOR) { |
319 | return false; |
320 | } |
321 | |
322 | if (max_offset < minWidth) { |
323 | assert(0); |
324 | return false; |
325 | } |
326 | |
327 | // If the pattern has virtual starts, we probably don't want to touch it. |
328 | if (hasVirtualStarts(g)) { |
329 | DEBUG_PRINTF("virtual starts, bailing\n" ); |
330 | return false; |
331 | } |
332 | |
333 | // Similarly, bail if the pattern is vacuous. TODO: this could be done, we |
334 | // would just need to be a little careful with reports. |
335 | if (isVacuous(g)) { |
336 | DEBUG_PRINTF("vacuous, bailing\n" ); |
337 | return false; |
338 | } |
339 | |
340 | u32 min_bound, max_bound; |
341 | if (maxWidth.is_infinite()) { |
342 | min_bound = 0; |
343 | max_bound = max_offset - minWidth; |
344 | } else { |
345 | min_bound = min_offset > maxWidth ? min_offset - maxWidth : 0; |
346 | max_bound = max_offset - minWidth; |
347 | } |
348 | |
349 | DEBUG_PRINTF("prepending ^.{%u,%u}\n" , min_bound, max_bound); |
350 | |
351 | vector<NFAVertex> initials; |
352 | for (auto v : adjacent_vertices_range(g.startDs, g)) { |
353 | if (v == g.startDs) { |
354 | continue; |
355 | } |
356 | initials.push_back(v); |
357 | } |
358 | if (initials.empty()) { |
359 | DEBUG_PRINTF("no initial vertices\n" ); |
360 | return false; |
361 | } |
362 | |
363 | // Wire up 'min_offset' mandatory dots from anchored start. |
364 | NFAVertex u = g.start; |
365 | for (u32 i = 0; i < min_bound; i++) { |
366 | NFAVertex v = add_vertex(g); |
367 | g[v].char_reach.setall(); |
368 | add_edge(u, v, g); |
369 | u = v; |
370 | } |
371 | |
372 | NFAVertex head = u; |
373 | |
374 | // Wire up optional dots for (max_offset - min_offset). |
375 | for (u32 i = 0; i < max_bound - min_bound; i++) { |
376 | NFAVertex v = add_vertex(g); |
377 | g[v].char_reach.setall(); |
378 | if (head != u) { |
379 | add_edge(head, v, g); |
380 | } |
381 | add_edge(u, v, g); |
382 | u = v; |
383 | } |
384 | |
385 | // Remove edges from starts and wire both head and u to our initials. |
386 | for (auto v : initials) { |
387 | remove_edge(g.startDs, v, g); |
388 | remove_edge(g.start, v, g); |
389 | |
390 | if (head != u) { |
391 | add_edge(head, v, g); |
392 | } |
393 | add_edge(u, v, g); |
394 | } |
395 | |
396 | renumber_vertices(g); |
397 | renumber_edges(g); |
398 | |
399 | if (minWidth == maxWidth) { |
400 | // For a fixed width pattern, we can retire the offsets as |
401 | // they are implicit in the graph now. |
402 | clearOffsetParams(g, rm); |
403 | } |
404 | |
405 | clearReports(g); |
406 | return true; |
407 | } |
408 | |
409 | static |
410 | NFAVertex findSingleCyclic(const NGHolder &g) { |
411 | NFAVertex v = NGHolder::null_vertex(); |
412 | for (const auto &e : edges_range(g)) { |
413 | if (source(e, g) == target(e, g)) { |
414 | if (source(e, g) == g.startDs) { |
415 | continue; |
416 | } |
417 | if (v != NGHolder::null_vertex()) { |
418 | // More than one cyclic vertex. |
419 | return NGHolder::null_vertex(); |
420 | } |
421 | v = source(e, g); |
422 | } |
423 | } |
424 | |
425 | if (v != NGHolder::null_vertex()) { |
426 | DEBUG_PRINTF("cyclic is %zu\n" , g[v].index); |
427 | assert(!is_special(v, g)); |
428 | } |
429 | return v; |
430 | } |
431 | |
432 | static |
433 | bool hasOffsetAdjust(const ReportManager &rm, NGHolder &g, |
434 | int *adjust) { |
435 | const auto &reports = all_reports(g); |
436 | if (reports.empty()) { |
437 | assert(0); |
438 | return false; |
439 | } |
440 | |
441 | int offsetAdjust = rm.getReport(*reports.begin()).offsetAdjust; |
442 | for (auto report : reports) { |
443 | const Report &ir = rm.getReport(report); |
444 | if (ir.offsetAdjust != offsetAdjust) { |
445 | DEBUG_PRINTF("different adjusts!\n" ); |
446 | return false; |
447 | } |
448 | } |
449 | |
450 | *adjust = offsetAdjust; |
451 | return true; |
452 | } |
453 | |
454 | /** |
455 | * If the pattern has a min_length and is of "ratchet" form with one unbounded |
456 | * repeat, that repeat can become a bounded repeat. |
457 | * |
458 | * /foo.*bar/{min_length=100} --> /foo.{94,}bar/ |
459 | */ |
460 | static |
461 | bool transformMinLengthToRepeat(NGHolder &g, ReportManager &rm) { |
462 | const auto &reports = all_reports(g); |
463 | |
464 | if (reports.empty()) { |
465 | return false; |
466 | } |
467 | |
468 | if (!hasSameBounds(reports, rm)) { |
469 | DEBUG_PRINTF("mixed report bounds\n" ); |
470 | return false; |
471 | } |
472 | |
473 | const auto &min_length = rm.getReport(*reports.begin()).minLength; |
474 | if (!min_length || min_length > MAX_MINLENGTH_TO_CONVERT) { |
475 | return false; |
476 | } |
477 | |
478 | // If the pattern has virtual starts, we probably don't want to touch it. |
479 | if (hasVirtualStarts(g)) { |
480 | DEBUG_PRINTF("virtual starts, bailing\n" ); |
481 | return false; |
482 | } |
483 | |
484 | // The graph must contain a single cyclic vertex (other than startDs), and |
485 | // that vertex can have one pred and one successor. |
486 | NFAVertex cyclic = findSingleCyclic(g); |
487 | if (cyclic == NGHolder::null_vertex()) { |
488 | return false; |
489 | } |
490 | |
491 | NGHolder::adjacency_iterator ai, ae; |
492 | tie(ai, ae) = adjacent_vertices(g.start, g); |
493 | if (*ai == g.startDs) { |
494 | ++ai; |
495 | } |
496 | NFAVertex v = *ai; |
497 | if (++ai != ae) { |
498 | DEBUG_PRINTF("more than one initial vertex\n" ); |
499 | return false; |
500 | } |
501 | |
502 | u32 width = 0; |
503 | |
504 | // Walk from the start vertex to the cyclic state and ensure we have a |
505 | // chain of vertices. |
506 | while (v != cyclic) { |
507 | DEBUG_PRINTF("vertex %zu\n" , g[v].index); |
508 | width++; |
509 | auto succ = succs(v, g); |
510 | if (contains(succ, cyclic)) { |
511 | if (succ.size() == 1) { |
512 | v = cyclic; |
513 | } else if (succ.size() == 2) { |
514 | // Cyclic and jump edge. |
515 | succ.erase(cyclic); |
516 | NFAVertex v2 = *succ.begin(); |
517 | if (!edge(cyclic, v2, g).second) { |
518 | DEBUG_PRINTF("bad form\n" ); |
519 | return false; |
520 | } |
521 | v = cyclic; |
522 | } else { |
523 | DEBUG_PRINTF("bad form\n" ); |
524 | return false; |
525 | } |
526 | } else { |
527 | if (succ.size() != 1) { |
528 | DEBUG_PRINTF("bad form\n" ); |
529 | return false; |
530 | } |
531 | v = *succ.begin(); |
532 | } |
533 | } |
534 | |
535 | // Check the cyclic state is A-OK. |
536 | v = getSoleDestVertex(g, cyclic); |
537 | if (v == NGHolder::null_vertex()) { |
538 | DEBUG_PRINTF("cyclic has more than one successor\n" ); |
539 | return false; |
540 | } |
541 | |
542 | // Walk from the cyclic state to an accept and ensure we have a chain of |
543 | // vertices. |
544 | while (!is_any_accept(v, g)) { |
545 | DEBUG_PRINTF("vertex %zu\n" , g[v].index); |
546 | width++; |
547 | auto succ = succs(v, g); |
548 | if (succ.size() != 1) { |
549 | DEBUG_PRINTF("bad form\n" ); |
550 | return false; |
551 | } |
552 | v = *succ.begin(); |
553 | } |
554 | |
555 | int offsetAdjust = 0; |
556 | if (!hasOffsetAdjust(rm, g, &offsetAdjust)) { |
557 | return false; |
558 | } |
559 | DEBUG_PRINTF("adjusting width by %d\n" , offsetAdjust); |
560 | width += offsetAdjust; |
561 | |
562 | DEBUG_PRINTF("width=%u, vertex %zu is cyclic\n" , width, |
563 | g[cyclic].index); |
564 | |
565 | if (width >= min_length) { |
566 | DEBUG_PRINTF("min_length=%llu is guaranteed, as width=%u\n" , |
567 | min_length, width); |
568 | clearMinLengthParam(g, rm); |
569 | return true; |
570 | } |
571 | |
572 | vector<NFAVertex> preds; |
573 | vector<NFAEdge> dead; |
574 | for (auto u : inv_adjacent_vertices_range(cyclic, g)) { |
575 | DEBUG_PRINTF("pred %zu\n" , g[u].index); |
576 | if (u == cyclic) { |
577 | continue; |
578 | } |
579 | preds.push_back(u); |
580 | |
581 | // We want to delete the out-edges of each predecessor, but need to |
582 | // make sure we don't delete the startDs self loop. |
583 | for (const auto &e : out_edges_range(u, g)) { |
584 | if (target(e, g) != g.startDs) { |
585 | dead.push_back(e); |
586 | } |
587 | } |
588 | } |
589 | |
590 | remove_edges(dead, g); |
591 | |
592 | assert(!preds.empty()); |
593 | |
594 | const CharReach &cr = g[cyclic].char_reach; |
595 | |
596 | for (u32 i = 0; i < min_length - width - 1; ++i) { |
597 | v = add_vertex(g); |
598 | g[v].char_reach = cr; |
599 | |
600 | for (auto u : preds) { |
601 | add_edge(u, v, g); |
602 | } |
603 | preds.clear(); |
604 | preds.push_back(v); |
605 | } |
606 | assert(!preds.empty()); |
607 | for (auto u : preds) { |
608 | add_edge(u, cyclic, g); |
609 | } |
610 | |
611 | renumber_vertices(g); |
612 | renumber_edges(g); |
613 | clearMinLengthParam(g, rm); |
614 | clearReports(g); |
615 | return true; |
616 | } |
617 | |
618 | static |
619 | bool hasExtParams(const ExpressionInfo &expr) { |
620 | if (expr.min_length != 0) { |
621 | return true; |
622 | } |
623 | if (expr.min_offset != 0) { |
624 | return true; |
625 | } |
626 | if (expr.max_offset != MAX_OFFSET) { |
627 | return true; |
628 | } |
629 | return false; |
630 | } |
631 | |
632 | static |
633 | const depth& maxDistToAccept(const NFAVertexBidiDepth &d) { |
634 | if (d.toAccept.max.is_unreachable()) { |
635 | return d.toAcceptEod.max; |
636 | } else if (d.toAcceptEod.max.is_unreachable()) { |
637 | return d.toAccept.max; |
638 | } |
639 | return max(d.toAccept.max, d.toAcceptEod.max); |
640 | } |
641 | |
642 | static |
643 | const depth& minDistFromStart(const NFAVertexBidiDepth &d) { |
644 | return min(d.fromStartDotStar.min, d.fromStart.min); |
645 | } |
646 | |
647 | static |
648 | const depth& minDistToAccept(const NFAVertexBidiDepth &d) { |
649 | return min(d.toAccept.min, d.toAcceptEod.min); |
650 | } |
651 | |
652 | static |
653 | bool isEdgePrunable(const NGHolder &g, const Report &report, |
654 | const vector<NFAVertexBidiDepth> &depths, |
655 | const NFAEdge &e) { |
656 | const NFAVertex u = source(e, g); |
657 | const NFAVertex v = target(e, g); |
658 | |
659 | DEBUG_PRINTF("edge (%zu,%zu)\n" , g[u].index, g[v].index); |
660 | |
661 | // Leave our special-to-special edges alone. |
662 | if (is_special(u, g) && is_special(v, g)) { |
663 | DEBUG_PRINTF("ignoring special-to-special\n" ); |
664 | return false; |
665 | } |
666 | |
667 | // We must be careful around start: we don't want to remove (start, v) if |
668 | // (startDs, v) exists as well, since later code will assume the presence |
669 | // of both edges, but other cases are OK. |
670 | if (u == g.start && edge(g.startDs, v, g).second) { |
671 | DEBUG_PRINTF("ignoring unanchored start edge\n" ); |
672 | return false; |
673 | } |
674 | |
675 | u32 u_idx = g[u].index; |
676 | u32 v_idx = g[v].index; |
677 | assert(u_idx < depths.size() && v_idx < depths.size()); |
678 | |
679 | const NFAVertexBidiDepth &du = depths.at(u_idx); |
680 | const NFAVertexBidiDepth &dv = depths.at(v_idx); |
681 | |
682 | if (report.minOffset) { |
683 | depth max_offset = maxDistFromStartOfData(du) + maxDistToAccept(dv); |
684 | if (max_offset.is_finite() && max_offset < report.minOffset) { |
685 | DEBUG_PRINTF("max_offset=%s too small\n" , max_offset.str().c_str()); |
686 | return true; |
687 | } |
688 | } |
689 | |
690 | if (report.maxOffset != MAX_OFFSET) { |
691 | depth min_offset = minDistFromStart(du) + minDistToAccept(dv); |
692 | assert(min_offset.is_finite()); |
693 | |
694 | if (min_offset > report.maxOffset) { |
695 | DEBUG_PRINTF("min_offset=%s too large\n" , min_offset.str().c_str()); |
696 | return true; |
697 | } |
698 | } |
699 | |
700 | if (report.minLength && is_any_accept(v, g)) { |
701 | // Simple take on min_length. If we're an edge to accept and our max |
702 | // dist from start is too small, we can be pruned. |
703 | const depth &width = maxDistFromInit(du); |
704 | if (width.is_finite() && width < report.minLength) { |
705 | DEBUG_PRINTF("max width %s from start too small for min_length\n" , |
706 | width.str().c_str()); |
707 | return true; |
708 | } |
709 | } |
710 | |
711 | return false; |
712 | } |
713 | |
714 | static |
715 | void pruneExtUnreachable(NGHolder &g, const ReportManager &rm) { |
716 | const auto &reports = all_reports(g); |
717 | if (reports.empty()) { |
718 | return; |
719 | } |
720 | |
721 | if (!hasSameBounds(reports, rm)) { |
722 | DEBUG_PRINTF("report bounds vary\n" ); |
723 | return; |
724 | } |
725 | |
726 | const auto &report = rm.getReport(*reports.begin()); |
727 | |
728 | auto depths = calcBidiDepths(g); |
729 | |
730 | vector<NFAEdge> dead; |
731 | |
732 | for (const auto &e : edges_range(g)) { |
733 | if (isEdgePrunable(g, report, depths, e)) { |
734 | DEBUG_PRINTF("pruning\n" ); |
735 | dead.push_back(e); |
736 | } |
737 | } |
738 | |
739 | if (dead.empty()) { |
740 | return; |
741 | } |
742 | |
743 | remove_edges(dead, g); |
744 | pruneUseless(g); |
745 | clearReports(g); |
746 | } |
747 | |
748 | /** |
749 | * Remove vacuous edges in graphs where the min_offset or min_length |
750 | * constraints dictate that they can never produce a match. |
751 | */ |
752 | static |
753 | void pruneVacuousEdges(NGHolder &g, const ReportManager &rm) { |
754 | vector<NFAEdge> dead; |
755 | |
756 | auto has_min_offset = [&](NFAVertex v) { |
757 | assert(!g[v].reports.empty()); // must be reporter |
758 | return all_of_in(g[v].reports, [&](ReportID id) { |
759 | return rm.getReport(id).minOffset > 0; |
760 | }); |
761 | }; |
762 | |
763 | auto has_min_length = [&](NFAVertex v) { |
764 | assert(!g[v].reports.empty()); // must be reporter |
765 | return all_of_in(g[v].reports, [&](ReportID id) { |
766 | return rm.getReport(id).minLength > 0; |
767 | }); |
768 | }; |
769 | |
770 | for (const auto &e : edges_range(g)) { |
771 | const NFAVertex u = source(e, g); |
772 | const NFAVertex v = target(e, g); |
773 | |
774 | // Special case: Crudely remove vacuous edges from start in graphs with |
775 | // a min_offset. |
776 | if (u == g.start && is_any_accept(v, g) && has_min_offset(u)) { |
777 | DEBUG_PRINTF("vacuous edge in graph with min_offset!\n" ); |
778 | dead.push_back(e); |
779 | continue; |
780 | } |
781 | |
782 | // If a min_length is set, vacuous edges can be removed. |
783 | if (is_any_start(u, g) && is_any_accept(v, g) && has_min_length(u)) { |
784 | DEBUG_PRINTF("vacuous edge in graph with min_length!\n" ); |
785 | dead.push_back(e); |
786 | continue; |
787 | } |
788 | } |
789 | |
790 | if (dead.empty()) { |
791 | return; |
792 | } |
793 | |
794 | DEBUG_PRINTF("removing %zu vacuous edges\n" , dead.size()); |
795 | remove_edges(dead, g); |
796 | pruneUseless(g); |
797 | clearReports(g); |
798 | } |
799 | |
800 | static |
801 | void pruneUnmatchable(NGHolder &g, const vector<DepthMinMax> &depths, |
802 | const ReportManager &rm, NFAVertex accept) { |
803 | vector<NFAEdge> dead; |
804 | |
805 | for (const auto &e : in_edges_range(accept, g)) { |
806 | NFAVertex v = source(e, g); |
807 | if (v == g.accept) { |
808 | assert(accept == g.acceptEod); // stylised edge |
809 | continue; |
810 | } |
811 | |
812 | if (!hasSameBounds(g[v].reports, rm)) { |
813 | continue; |
814 | } |
815 | const auto &report = rm.getReport(*g[v].reports.begin()); |
816 | |
817 | u32 idx = g[v].index; |
818 | DepthMinMax d = depths[idx]; // copy |
819 | pair<s32, s32> adj = getMinMaxOffsetAdjust(rm, g, v); |
820 | DEBUG_PRINTF("vertex %u: depths=%s, adj=[%d,%d]\n" , idx, |
821 | d.str().c_str(), adj.first, adj.second); |
822 | d.min += adj.first; |
823 | d.max += adj.second; |
824 | |
825 | if (d.max.is_finite() && d.max < report.minLength) { |
826 | DEBUG_PRINTF("prune, max match length %s < min_length=%llu\n" , |
827 | d.max.str().c_str(), report.minLength); |
828 | dead.push_back(e); |
829 | continue; |
830 | } |
831 | |
832 | if (report.maxOffset != MAX_OFFSET && d.min > report.maxOffset) { |
833 | DEBUG_PRINTF("prune, min match length %s > max_offset=%llu\n" , |
834 | d.min.str().c_str(), report.maxOffset); |
835 | dead.push_back(e); |
836 | continue; |
837 | } |
838 | } |
839 | |
840 | remove_edges(dead, g); |
841 | } |
842 | |
843 | /** |
844 | * Remove edges to accepts that can never produce a match long enough to |
845 | * satisfy our min_length and max_offset constraints. |
846 | */ |
847 | static |
848 | void pruneUnmatchable(NGHolder &g, const ReportManager &rm) { |
849 | if (!any_of_in(all_reports(g), [&](ReportID id) { |
850 | return rm.getReport(id).minLength > 0; |
851 | })) { |
852 | return; |
853 | } |
854 | |
855 | vector<DepthMinMax> depths = getDistancesFromSOM(g); |
856 | |
857 | pruneUnmatchable(g, depths, rm, g.accept); |
858 | pruneUnmatchable(g, depths, rm, g.acceptEod); |
859 | |
860 | pruneUseless(g); |
861 | clearReports(g); |
862 | } |
863 | |
864 | static |
865 | bool hasOffsetAdjustments(const ReportManager &rm, const NGHolder &g) { |
866 | return any_of_in(all_reports(g), [&rm](ReportID id) { |
867 | return rm.getReport(id).offsetAdjust != 0; |
868 | }); |
869 | } |
870 | |
871 | void propagateExtendedParams(NGHolder &g, ExpressionInfo &expr, |
872 | ReportManager &rm) { |
873 | if (!hasExtParams(expr)) { |
874 | return; |
875 | } |
876 | |
877 | depth minWidth = findMinWidth(g); |
878 | depth maxWidth = findMaxWidth(g); |
879 | bool is_anchored = !has_proper_successor(g.startDs, g) |
880 | && out_degree(g.start, g); |
881 | |
882 | DepthMinMax match_depths = findMatchLengths(rm, g); |
883 | DEBUG_PRINTF("match depths %s\n" , match_depths.str().c_str()); |
884 | |
885 | if (is_anchored && maxWidth.is_finite() && expr.min_offset > maxWidth) { |
886 | ostringstream oss; |
887 | oss << "Expression is anchored and cannot satisfy min_offset=" |
888 | << expr.min_offset << " as it can only produce matches of length " |
889 | << maxWidth << " bytes at most." ; |
890 | throw CompileError(expr.index, oss.str()); |
891 | } |
892 | |
893 | if (minWidth > expr.max_offset) { |
894 | ostringstream oss; |
895 | oss << "Expression has max_offset=" << expr.max_offset |
896 | << " but requires " << minWidth << " bytes to match." ; |
897 | throw CompileError(expr.index, oss.str()); |
898 | } |
899 | |
900 | if (maxWidth.is_finite() && match_depths.max < expr.min_length) { |
901 | ostringstream oss; |
902 | oss << "Expression has min_length=" << expr.min_length << " but can " |
903 | "only produce matches of length " << match_depths.max << |
904 | " bytes at most." ; |
905 | throw CompileError(expr.index, oss.str()); |
906 | } |
907 | |
908 | if (expr.min_length && expr.min_length <= match_depths.min) { |
909 | DEBUG_PRINTF("min_length=%llu constraint is unnecessary\n" , |
910 | expr.min_length); |
911 | expr.min_length = 0; |
912 | } |
913 | |
914 | if (!hasExtParams(expr)) { |
915 | return; |
916 | } |
917 | |
918 | updateReportBounds(rm, g, expr); |
919 | } |
920 | |
921 | /** |
922 | * If the pattern is completely anchored and has a min_length set, this can |
923 | * be converted to a min_offset. |
924 | */ |
925 | static |
926 | void replaceMinLengthWithOffset(NGHolder &g, ReportManager &rm) { |
927 | if (has_proper_successor(g.startDs, g)) { |
928 | return; // not wholly anchored |
929 | } |
930 | |
931 | replaceReports(g, [&rm](NFAVertex, ReportID id) { |
932 | const auto &report = rm.getReport(id); |
933 | if (report.minLength) { |
934 | Report new_report = report; |
935 | u64a min_len_offset = report.minLength - report.offsetAdjust; |
936 | new_report.minOffset = max(report.minOffset, min_len_offset); |
937 | new_report.minLength = 0; |
938 | return rm.getInternalId(new_report); |
939 | } |
940 | return id; |
941 | }); |
942 | } |
943 | |
944 | /** |
945 | * Clear offset bounds on reports that are not needed because they're satisfied |
946 | * by vertex depth. |
947 | */ |
948 | static |
949 | void removeUnneededOffsetBounds(NGHolder &g, ReportManager &rm) { |
950 | auto depths = calcDepths(g); |
951 | |
952 | replaceReports(g, [&](NFAVertex v, ReportID id) { |
953 | const auto &d = depths.at(g[v].index); |
954 | const depth &min_depth = min(d.fromStartDotStar.min, d.fromStart.min); |
955 | const depth &max_depth = maxDistFromStartOfData(d); |
956 | |
957 | DEBUG_PRINTF("vertex %zu has min_depth=%s, max_depth=%s\n" , g[v].index, |
958 | min_depth.str().c_str(), max_depth.str().c_str()); |
959 | |
960 | Report report = rm.getReport(id); // copy |
961 | bool modified = false; |
962 | if (report.minOffset && !report.offsetAdjust && |
963 | report.minOffset <= min_depth) { |
964 | report.minOffset = 0; |
965 | modified = true; |
966 | } |
967 | if (report.maxOffset != MAX_OFFSET && max_depth.is_finite() && |
968 | report.maxOffset >= max_depth) { |
969 | report.maxOffset = MAX_OFFSET; |
970 | modified = true; |
971 | } |
972 | if (modified) { |
973 | DEBUG_PRINTF("vertex %zu, changed bounds to [%llu,%llu]\n" , |
974 | g[v].index, report.minOffset, report.maxOffset); |
975 | return rm.getInternalId(report); |
976 | } |
977 | |
978 | return id; |
979 | }); |
980 | } |
981 | |
982 | void reduceExtendedParams(NGHolder &g, ReportManager &rm, som_type som) { |
983 | if (!any_of_in(all_reports(g), |
984 | [&](ReportID id) { return rm.getReport(id).hasBounds(); })) { |
985 | DEBUG_PRINTF("no extparam bounds\n" ); |
986 | return; |
987 | } |
988 | |
989 | DEBUG_PRINTF("graph has extparam bounds\n" ); |
990 | |
991 | pruneVacuousEdges(g, rm); |
992 | if (can_never_match(g)) { |
993 | return; |
994 | } |
995 | |
996 | pruneUnmatchable(g, rm); |
997 | if (can_never_match(g)) { |
998 | return; |
999 | } |
1000 | |
1001 | if (!hasOffsetAdjustments(rm, g)) { |
1002 | pruneExtUnreachable(g, rm); |
1003 | if (can_never_match(g)) { |
1004 | return; |
1005 | } |
1006 | } |
1007 | |
1008 | replaceMinLengthWithOffset(g, rm); |
1009 | if (can_never_match(g)) { |
1010 | return; |
1011 | } |
1012 | |
1013 | // If the pattern has a min_length and is of "ratchet" form with one |
1014 | // unbounded repeat, that repeat can become a bounded repeat. |
1015 | // e.g. /foo.*bar/{min_length=100} --> /foo.{94,}bar/ |
1016 | transformMinLengthToRepeat(g, rm); |
1017 | if (can_never_match(g)) { |
1018 | return; |
1019 | } |
1020 | |
1021 | // If the pattern is unanchored, has a max_offset and has not asked for |
1022 | // SOM, we can use that knowledge to anchor it which will limit its |
1023 | // lifespan. Note that we can't use this transformation if there's a |
1024 | // min_length, as it's currently handled using "sly SOM". |
1025 | if (som == SOM_NONE) { |
1026 | anchorPatternWithBoundedRepeat(g, rm); |
1027 | if (can_never_match(g)) { |
1028 | return; |
1029 | } |
1030 | } |
1031 | |
1032 | removeUnneededOffsetBounds(g, rm); |
1033 | } |
1034 | |
1035 | } // namespace ue2 |
1036 | |