1 | /* |
2 | * Copyright (c) 2015-2018, Intel Corporation |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are met: |
6 | * |
7 | * * Redistributions of source code must retain the above copyright notice, |
8 | * this list of conditions and the following disclaimer. |
9 | * * Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * * Neither the name of Intel Corporation nor the names of its contributors |
13 | * may be used to endorse or promote products derived from this software |
14 | * without specific prior written permission. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | #include "rose_build_add_internal.h" |
30 | #include "rose_build_impl.h" |
31 | |
32 | #include "ue2common.h" |
33 | #include "grey.h" |
34 | #include "rose_build_anchored.h" |
35 | #include "rose_in_util.h" |
36 | #include "hwlm/hwlm_literal.h" |
37 | #include "nfa/goughcompile.h" |
38 | #include "nfa/nfa_api_queue.h" |
39 | #include "nfagraph/ng_depth.h" |
40 | #include "nfagraph/ng_dump.h" |
41 | #include "nfagraph/ng_holder.h" |
42 | #include "nfagraph/ng_limex.h" |
43 | #include "nfagraph/ng_mcclellan.h" |
44 | #include "nfagraph/ng_prefilter.h" |
45 | #include "nfagraph/ng_prune.h" |
46 | #include "nfagraph/ng_region.h" |
47 | #include "nfagraph/ng_repeat.h" |
48 | #include "nfagraph/ng_reports.h" |
49 | #include "nfagraph/ng_util.h" |
50 | #include "nfagraph/ng_width.h" |
51 | #include "util/charreach.h" |
52 | #include "util/charreach_util.h" |
53 | #include "util/compare.h" |
54 | #include "util/compile_context.h" |
55 | #include "util/container.h" |
56 | #include "util/dump_charclass.h" |
57 | #include "util/graph_range.h" |
58 | #include "util/insertion_ordered.h" |
59 | #include "util/make_unique.h" |
60 | #include "util/noncopyable.h" |
61 | #include "util/order_check.h" |
62 | #include "util/report_manager.h" |
63 | #include "util/ue2string.h" |
64 | #include "util/verify_types.h" |
65 | |
66 | #include <algorithm> |
67 | #include <map> |
68 | #include <set> |
69 | #include <string> |
70 | #include <vector> |
71 | #include <utility> |
72 | |
73 | using namespace std; |
74 | |
75 | namespace ue2 { |
76 | |
77 | /** |
78 | * \brief Data used by most of the construction code in this file. |
79 | */ |
80 | struct RoseBuildData : noncopyable { |
81 | RoseBuildData(const RoseInGraph &ig_in, bool som_in) |
82 | : ig(ig_in), som(som_in) {} |
83 | |
84 | /** Input rose graph. */ |
85 | const RoseInGraph &ig; |
86 | |
87 | /** Edges we've transformed (in \ref transformAnchoredLiteralOverlap) which |
88 | * require ANCH history to prevent overlap. */ |
89 | unordered_set<RoseInEdge> anch_history_edges; |
90 | |
91 | /** True if we're tracking Start of Match. */ |
92 | bool som; |
93 | }; |
94 | |
95 | static |
96 | ReportID findReportId(const NGHolder &g) { |
97 | /* prefix/infix always have an edge to accept and only 1 reportid initially |
98 | */ |
99 | assert(in_degree(g.accept, g)); |
100 | const auto &rep = g[*inv_adjacent_vertices(g.accept, g).first].reports; |
101 | assert(!rep.empty()); |
102 | return *rep.begin(); |
103 | } |
104 | |
105 | static |
106 | RoseVertex createVertex(RoseBuildImpl *build, u32 literalId, u32 min_offset, |
107 | u32 max_offset) { |
108 | RoseGraph &g = build->g; |
109 | // add to tree |
110 | RoseVertex v = add_vertex(g); |
111 | g[v].min_offset = min_offset; |
112 | g[v].max_offset = max_offset; |
113 | |
114 | DEBUG_PRINTF("insert vertex %zu into literal %u's vertex set\n" , g[v].index, |
115 | literalId); |
116 | g[v].literals.insert(literalId); |
117 | build->literal_info[literalId].vertices.insert(v); |
118 | |
119 | return v; |
120 | } |
121 | |
122 | RoseVertex createVertex(RoseBuildImpl *build, const RoseVertex parent, |
123 | u32 minBound, u32 maxBound, u32 literalId, |
124 | size_t literalLength, |
125 | const flat_set<ReportID> &reports) { |
126 | assert(parent != RoseGraph::null_vertex()); |
127 | |
128 | RoseGraph &g = build->g; |
129 | // add to tree (offsets set latter) |
130 | RoseVertex v = createVertex(build, literalId, 0U, 0U); |
131 | |
132 | /* fill in report information */ |
133 | g[v].reports.insert(reports.begin(), reports.end()); |
134 | |
135 | RoseEdge e = add_edge(parent, v, g); |
136 | DEBUG_PRINTF("adding edge (%u, %u) to parent\n" , minBound, maxBound); |
137 | |
138 | g[e].minBound = minBound; |
139 | g[e].maxBound = maxBound; |
140 | g[e].rose_top = 0; |
141 | |
142 | u32 min_offset = add_rose_depth(g[parent].min_offset, minBound); |
143 | u32 max_offset = add_rose_depth(g[parent].max_offset, maxBound); |
144 | |
145 | /* take literal length into account for offsets */ |
146 | const u32 lit_len = verify_u32(literalLength); |
147 | min_offset = add_rose_depth(min_offset, lit_len); |
148 | max_offset = add_rose_depth(max_offset, lit_len); |
149 | |
150 | g[v].min_offset = min_offset; |
151 | g[v].max_offset = max_offset; |
152 | |
153 | return v; |
154 | } |
155 | |
156 | static |
157 | RoseVertex createAnchoredVertex(RoseBuildImpl *build, u32 literalId, |
158 | u32 min_offset, u32 max_offset) { |
159 | RoseGraph &g = build->g; |
160 | RoseVertex v = createVertex(build, literalId, min_offset, max_offset); |
161 | |
162 | DEBUG_PRINTF("created anchored vertex %zu with lit id %u\n" , g[v].index, |
163 | literalId); |
164 | |
165 | RoseEdge e = add_edge(build->anchored_root, v, g); |
166 | g[e].minBound = min_offset; |
167 | g[e].maxBound = max_offset; |
168 | |
169 | return v; |
170 | } |
171 | |
172 | static |
173 | RoseVertex duplicate(RoseBuildImpl *build, RoseVertex v) { |
174 | RoseGraph &g = build->g; |
175 | RoseVertex w = add_vertex(g[v], g); |
176 | DEBUG_PRINTF("added vertex %zu\n" , g[w].index); |
177 | |
178 | for (auto lit_id : g[w].literals) { |
179 | build->literal_info[lit_id].vertices.insert(w); |
180 | } |
181 | |
182 | for (const auto &e : in_edges_range(v, g)) { |
183 | RoseVertex s = source(e, g); |
184 | add_edge(s, w, g[e], g); |
185 | DEBUG_PRINTF("added edge (%zu,%zu)\n" , g[s].index, g[w].index); |
186 | } |
187 | |
188 | return w; |
189 | } |
190 | |
191 | namespace { |
192 | struct created_key { |
193 | explicit created_key(const RoseInEdgeProps &trep) |
194 | : prefix(trep.graph.get()), lag(trep.graph_lag) { |
195 | } |
196 | bool operator<(const created_key &b) const { |
197 | const created_key &a = *this; |
198 | ORDER_CHECK(prefix); |
199 | ORDER_CHECK(lag); |
200 | return false; |
201 | } |
202 | NGHolder *prefix; |
203 | u32 lag; |
204 | }; |
205 | } |
206 | |
207 | static |
208 | bool isPureAnchored(const NGHolder &h) { |
209 | return !proper_out_degree(h.startDs, h); |
210 | } |
211 | |
212 | static |
213 | RoseRoleHistory selectHistory(const RoseBuildImpl &tbi, const RoseBuildData &bd, |
214 | const RoseInEdge &rose_edge, const RoseEdge &e) { |
215 | const RoseGraph &g = tbi.g; |
216 | const RoseVertex u = source(e, g), v = target(e, g); |
217 | const bool fixed_offset_src = g[u].fixedOffset(); |
218 | const bool has_bounds = g[e].minBound || (g[e].maxBound != ROSE_BOUND_INF); |
219 | |
220 | DEBUG_PRINTF("edge %zu->%zu, bounds=[%u,%u], fixed_u=%d, prefix=%d\n" , |
221 | g[u].index, g[v].index, g[e].minBound, g[e].maxBound, |
222 | (int)g[u].fixedOffset(), (int)g[v].left); |
223 | |
224 | if (g[v].left) { |
225 | // Roles with prefix engines have their history handled by that prefix. |
226 | assert(!contains(bd.anch_history_edges, rose_edge)); |
227 | return ROSE_ROLE_HISTORY_NONE; |
228 | } |
229 | |
230 | if (contains(bd.anch_history_edges, rose_edge)) { |
231 | DEBUG_PRINTF("needs anch history\n" ); |
232 | return ROSE_ROLE_HISTORY_ANCH; |
233 | } |
234 | |
235 | if (fixed_offset_src && has_bounds) { |
236 | DEBUG_PRINTF("needs anch history\n" ); |
237 | return ROSE_ROLE_HISTORY_ANCH; |
238 | } |
239 | |
240 | return ROSE_ROLE_HISTORY_NONE; |
241 | } |
242 | |
243 | static |
244 | bool hasSuccessorLiterals(RoseInVertex iv, const RoseInGraph &ig) { |
245 | for (auto v : adjacent_vertices_range(iv, ig)) { |
246 | if (ig[v].type != RIV_ACCEPT) { |
247 | return true; |
248 | } |
249 | } |
250 | return false; |
251 | } |
252 | |
253 | static |
254 | void createVertices(RoseBuildImpl *tbi, |
255 | map<RoseInVertex, vector<RoseVertex> > &vertex_map, |
256 | const vector<pair<RoseVertex, RoseInEdge> > &parents, |
257 | RoseInVertex iv, u32 min_offset, u32 max_offset, |
258 | u32 literalId, u32 delay, const RoseBuildData &bd) { |
259 | RoseGraph &g = tbi->g; |
260 | |
261 | DEBUG_PRINTF("vertex has %zu parents\n" , parents.size()); |
262 | |
263 | map<created_key, RoseVertex> created; |
264 | |
265 | for (const auto &pv : parents) { |
266 | RoseVertex w; |
267 | const RoseInEdgeProps &edge_props = bd.ig[pv.second]; |
268 | shared_ptr<NGHolder> prefix_graph = edge_props.graph; |
269 | u32 prefix_lag = edge_props.graph_lag; |
270 | |
271 | created_key key(edge_props); |
272 | |
273 | if (!contains(created, key)) { |
274 | assert(prefix_graph || !edge_props.haig); |
275 | w = createVertex(tbi, literalId, min_offset, max_offset); |
276 | created[key] = w; |
277 | |
278 | if (prefix_graph) { |
279 | g[w].left.graph = prefix_graph; |
280 | if (edge_props.dfa) { |
281 | g[w].left.dfa = edge_props.dfa; |
282 | } |
283 | g[w].left.haig = edge_props.haig; |
284 | g[w].left.lag = prefix_lag; |
285 | |
286 | // The graph already has its report id allocated - find it. |
287 | g[w].left.leftfix_report = findReportId(*prefix_graph); |
288 | |
289 | if (g[w].left.dfa || g[w].left.haig) { |
290 | assert(prefix_graph); |
291 | g[w].left.dfa_min_width = findMinWidth(*prefix_graph); |
292 | g[w].left.dfa_max_width = findMaxWidth(*prefix_graph); |
293 | } |
294 | } |
295 | |
296 | if (bd.som && !g[w].left.haig) { |
297 | /* no prefix - som based on literal start */ |
298 | assert(!prefix_graph); |
299 | g[w].som_adjust = tbi->literals.at(literalId).elength(); |
300 | DEBUG_PRINTF("set som_adjust to %u\n" , g[w].som_adjust); |
301 | } |
302 | |
303 | DEBUG_PRINTF(" adding new vertex index=%zu\n" , tbi->g[w].index); |
304 | vertex_map[iv].push_back(w); |
305 | } else { |
306 | w = created[key]; |
307 | } |
308 | |
309 | RoseVertex p = pv.first; |
310 | |
311 | RoseEdge e = add_edge(p, w, g); |
312 | DEBUG_PRINTF("adding edge (%u,%u) to parent\n" , edge_props.minBound, |
313 | edge_props.maxBound); |
314 | g[e].minBound = edge_props.minBound; |
315 | if (p != tbi->root && g[w].left.graph |
316 | && (!tbi->isAnyStart(p) || isPureAnchored(*g[w].left.graph))) { |
317 | depth mw = findMaxWidth(*g[w].left.graph); |
318 | if (mw.is_infinite()) { |
319 | g[e].maxBound = ROSE_BOUND_INF; |
320 | } else { |
321 | DEBUG_PRINTF("setting max to %s + %u\n" , mw.str().c_str(), |
322 | prefix_lag); |
323 | g[e].maxBound = prefix_lag + mw; |
324 | } |
325 | } else { |
326 | g[e].maxBound = edge_props.maxBound; |
327 | } |
328 | g[e].rose_top = 0; |
329 | g[e].history = selectHistory(*tbi, bd, pv.second, e); |
330 | } |
331 | |
332 | if (delay && hasSuccessorLiterals(iv, bd.ig)) { |
333 | // Add an undelayed "ghost" vertex for this literal. |
334 | u32 ghostId = tbi->literal_info[literalId].undelayed_id; |
335 | DEBUG_PRINTF("creating delay ghost vertex, id=%u\n" , ghostId); |
336 | assert(ghostId != literalId); |
337 | assert(tbi->literals.at(ghostId).delay == 0); |
338 | |
339 | // Adjust offsets, removing delay. |
340 | u32 ghost_min = min_offset, ghost_max = max_offset; |
341 | assert(ghost_min < ROSE_BOUND_INF && ghost_min >= delay); |
342 | ghost_min -= delay; |
343 | ghost_max -= ghost_max == ROSE_BOUND_INF ? 0 : delay; |
344 | |
345 | RoseVertex g_v = createVertex(tbi, ghostId, ghost_min, ghost_max); |
346 | |
347 | for (const auto &pv : parents) { |
348 | const RoseInEdgeProps &edge_props = bd.ig[pv.second]; |
349 | RoseEdge e = add_edge(pv.first, g_v, tbi->g); |
350 | g[e].minBound = edge_props.minBound; |
351 | g[e].maxBound = edge_props.maxBound; |
352 | g[e].history = selectHistory(*tbi, bd, pv.second, e); |
353 | DEBUG_PRINTF("parent edge has bounds [%u,%u]\n" , |
354 | edge_props.minBound, edge_props.maxBound); |
355 | } |
356 | |
357 | for (auto &m : created) { |
358 | tbi->ghost[m.second] = g_v; |
359 | } |
360 | } |
361 | } |
362 | |
363 | /* ensure the holder does not accept any paths which do not end with lit */ |
364 | static |
365 | void removeFalsePaths(NGHolder &g, const ue2_literal &lit) { |
366 | DEBUG_PRINTF("strip '%s'\n" , dumpString(lit).c_str()); |
367 | set<NFAVertex> curr, next; |
368 | curr.insert(g.accept); |
369 | curr.insert(g.acceptEod); |
370 | |
371 | for (auto it = lit.rbegin(), ite = lit.rend(); it != ite; ++it) { |
372 | next.clear(); |
373 | for (auto curr_v : curr) { |
374 | DEBUG_PRINTF("handling %zu\n" , g[curr_v].index); |
375 | vector<NFAVertex> next_cand; |
376 | insert(&next_cand, next_cand.end(), |
377 | inv_adjacent_vertices(curr_v, g)); |
378 | clear_in_edges(curr_v, g); |
379 | if (curr_v == g.acceptEod) { |
380 | add_edge(g.accept, g.acceptEod, g); |
381 | } |
382 | |
383 | for (auto v : next_cand) { |
384 | assert(v != g.startDs); |
385 | if (v == g.start || v == g.startDs || v == g.accept) { |
386 | continue; |
387 | } |
388 | |
389 | const CharReach &cr = g[v].char_reach; |
390 | |
391 | if (!overlaps(*it, cr)) { |
392 | DEBUG_PRINTF("false edge %zu\n" , g[v].index); |
393 | continue; |
394 | } |
395 | |
396 | NFAVertex v2 = clone_vertex(g, v); |
397 | clone_in_edges(g, v, v2); |
398 | add_edge(v2, curr_v, g); |
399 | g[v2].char_reach &= *it; |
400 | DEBUG_PRINTF("next <- %zu\n" , g[v2].index); |
401 | next.insert(v2); |
402 | } |
403 | } |
404 | |
405 | curr.swap(next); |
406 | } |
407 | |
408 | pruneUseless(g); |
409 | clearReports(g); |
410 | assert(in_degree(g.accept, g) || in_degree(g.acceptEod, g) > 1); |
411 | assert(allMatchStatesHaveReports(g)); |
412 | |
413 | DEBUG_PRINTF("graph has %zu vertices left\n" , num_vertices(g)); |
414 | } |
415 | |
416 | static |
417 | RoseVertex tryForAnchoredVertex(RoseBuildImpl *tbi, |
418 | const RoseInVertexProps &iv_info, |
419 | const RoseInEdgeProps &ep) { |
420 | if (ep.graph_lag && ep.graph_lag != iv_info.s.length()) { |
421 | DEBUG_PRINTF("bad lag %u != %zu\n" , ep.graph_lag, iv_info.s.length()); |
422 | return RoseGraph::null_vertex(); /* TODO: better */ |
423 | } |
424 | |
425 | const depth anchored_max_depth(tbi->cc.grey.maxAnchoredRegion); |
426 | depth min_width(0), max_width(0); |
427 | |
428 | if (ep.graph.get()) { |
429 | const depth graph_lag(ep.graph_lag); |
430 | max_width = findMaxWidth(*ep.graph) + graph_lag; |
431 | min_width = findMinWidth(*ep.graph) + graph_lag; |
432 | if (proper_out_degree(ep.graph->startDs, *ep.graph)) { |
433 | max_width = depth::infinity(); |
434 | } |
435 | } |
436 | |
437 | DEBUG_PRINTF("mw = %s; lag = %u\n" , max_width.str().c_str(), ep.graph_lag); |
438 | |
439 | NGHolder h; |
440 | |
441 | if (ep.graph.get() && max_width <= anchored_max_depth) { |
442 | cloneHolder(h, *ep.graph); |
443 | |
444 | /* add literal/dots */ |
445 | if (ep.graph_lag) { |
446 | assert(ep.graph_lag == iv_info.s.length()); |
447 | appendLiteral(h, iv_info.s); |
448 | } else { |
449 | removeFalsePaths(h, iv_info.s); |
450 | } |
451 | } else if (!ep.graph.get() && ep.maxBound < ROSE_BOUND_INF |
452 | && iv_info.s.length() + ep.maxBound |
453 | <= tbi->cc.grey.maxAnchoredRegion) { |
454 | if (ep.maxBound || ep.minBound) { |
455 | /* TODO: handle, however these cases are not generated currently by |
456 | ng_violet */ |
457 | return RoseGraph::null_vertex(); |
458 | } |
459 | max_width = depth(ep.maxBound + iv_info.s.length()); |
460 | min_width = depth(ep.minBound + iv_info.s.length()); |
461 | add_edge(h.start, h.accept, h); |
462 | appendLiteral(h, iv_info.s); |
463 | } else { |
464 | return RoseGraph::null_vertex(); |
465 | } |
466 | |
467 | u32 anchored_exit_id = tbi->getNewLiteralId(); |
468 | u32 remap_id = 0; |
469 | DEBUG_PRINTF(" trying to add dfa stuff\n" ); |
470 | int rv = addToAnchoredMatcher(*tbi, h, anchored_exit_id, &remap_id); |
471 | |
472 | if (rv == ANCHORED_FAIL) { |
473 | return RoseGraph::null_vertex(); |
474 | } else if (rv == ANCHORED_REMAP) { |
475 | anchored_exit_id = remap_id; |
476 | } else { |
477 | assert(rv == ANCHORED_SUCCESS); |
478 | } |
479 | |
480 | // Store the literal itself in a side structure so that we can use it for |
481 | // overlap calculations later. This may be obsolete when the old Rose |
482 | // construction path (and its history selection code) goes away. |
483 | rose_literal_id lit(iv_info.s, ROSE_ANCHORED, 0); |
484 | tbi->anchoredLitSuffix.insert(make_pair(anchored_exit_id, lit)); |
485 | |
486 | assert(min_width <= anchored_max_depth); |
487 | assert(max_width <= anchored_max_depth); |
488 | assert(min_width <= max_width); |
489 | |
490 | /* Note: bounds are end-to-end as anchored lits are considered |
491 | * to have 0 length. */ |
492 | RoseVertex v = createAnchoredVertex(tbi, anchored_exit_id, min_width, |
493 | max_width); |
494 | return v; |
495 | } |
496 | |
497 | static |
498 | u32 findRoseAnchorFloatingOverlap(const RoseInEdgeProps &ep, |
499 | const RoseInVertexProps &succ_vp) { |
500 | /* we need to ensure there is enough history to find the successor literal |
501 | * when we enable its group. |
502 | */ |
503 | |
504 | if (!ep.graph.get()) { |
505 | return 0; /* non overlapping */ |
506 | } |
507 | depth graph_min_width = findMinWidth(*ep.graph); |
508 | u32 min_width = ep.graph_lag + graph_min_width; |
509 | u32 s_len = succ_vp.s.length(); |
510 | |
511 | if (s_len <= min_width) { |
512 | return 0; /* no overlap */ |
513 | } |
514 | |
515 | u32 overlap = s_len - min_width; |
516 | DEBUG_PRINTF("found overlap of %u\n" , overlap); |
517 | return overlap; |
518 | } |
519 | |
520 | static |
521 | void findRoseLiteralMask(const NGHolder &h, const u32 lag, vector<u8> &msk, |
522 | vector<u8> &cmp) { |
523 | if (lag >= HWLM_MASKLEN) { |
524 | msk.clear(); cmp.clear(); |
525 | return; |
526 | } |
527 | |
528 | assert(in_degree(h.acceptEod, h) == 1); // no eod reports |
529 | |
530 | // Start with the set of reporter vertices for this rose. |
531 | set<NFAVertex> curr, next; |
532 | insert(&curr, inv_adjacent_vertices(h.accept, h)); |
533 | assert(!curr.empty()); |
534 | |
535 | msk.assign(HWLM_MASKLEN, 0); |
536 | cmp.assign(HWLM_MASKLEN, 0); |
537 | size_t i = HWLM_MASKLEN - lag - 1; |
538 | do { |
539 | if (curr.empty() || contains(curr, h.start) || |
540 | contains(curr, h.startDs)) { |
541 | DEBUG_PRINTF("end of the road\n" ); |
542 | break; |
543 | } |
544 | |
545 | next.clear(); |
546 | CharReach cr; |
547 | for (auto v : curr) { |
548 | DEBUG_PRINTF("vertex %zu, reach %s\n" , h[v].index, |
549 | describeClass(h[v].char_reach).c_str()); |
550 | cr |= h[v].char_reach; |
551 | insert(&next, inv_adjacent_vertices(v, h)); |
552 | } |
553 | make_and_cmp_mask(cr, &msk[i], &cmp[i]); |
554 | DEBUG_PRINTF("%zu: reach=%s, msk=%u, cmp=%u\n" , i, |
555 | describeClass(cr).c_str(), msk.at(i), cmp.at(i)); |
556 | curr.swap(next); |
557 | } while (i-- > 0); |
558 | } |
559 | |
560 | static |
561 | void doRoseLiteralVertex(RoseBuildImpl *tbi, bool use_eod_table, |
562 | map<RoseInVertex, vector<RoseVertex> > &vertex_map, |
563 | const vector<pair<RoseVertex, RoseInEdge> > &parents, |
564 | RoseInVertex iv, const RoseBuildData &bd) { |
565 | const RoseInGraph &ig = bd.ig; |
566 | const RoseInVertexProps &iv_info = ig[iv]; |
567 | assert(iv_info.type == RIV_LITERAL); |
568 | assert(!parents.empty()); /* start vertices should not be here */ |
569 | |
570 | // ng_violet should have ensured that mixed-sensitivity literals are no |
571 | // longer than the benefits max width. |
572 | assert(iv_info.s.length() <= MAX_MASK2_WIDTH || |
573 | !mixed_sensitivity(iv_info.s)); |
574 | |
575 | // Rose graph construction process should have given us a min_offset. |
576 | assert(iv_info.min_offset > 0); |
577 | |
578 | if (use_eod_table) { |
579 | goto floating; |
580 | } |
581 | |
582 | DEBUG_PRINTF("rose find vertex\n" ); |
583 | if (parents.size() == 1) { |
584 | const RoseVertex u = parents.front().first; |
585 | const RoseInEdgeProps &ep = ig[parents.front().second]; |
586 | |
587 | if (!tbi->isAnyStart(u)) { |
588 | goto floating; |
589 | } |
590 | |
591 | if (!ep.graph && ep.maxBound == ROSE_BOUND_INF) { |
592 | goto floating; |
593 | } |
594 | if (ep.graph && !isAnchored(*ep.graph)) { |
595 | goto floating; |
596 | } |
597 | |
598 | DEBUG_PRINTF("cand for anchored maxBound %u, %p (%d)\n" , ep.maxBound, |
599 | ep.graph.get(), ep.graph ? (int)isAnchored(*ep.graph) : 3); |
600 | |
601 | /* need to check if putting iv into the anchored table would create |
602 | * any bad_overlap relationships with its successor literals */ |
603 | for (const auto &e : out_edges_range(iv, ig)) { |
604 | RoseInVertex t = target(e, ig); |
605 | u32 overlap = findRoseAnchorFloatingOverlap(ig[e], ig[t]); |
606 | DEBUG_PRINTF("found overlap of %u\n" , overlap); |
607 | if (overlap > tbi->cc.grey.maxHistoryAvailable + 1) { |
608 | goto floating; |
609 | } |
610 | } |
611 | |
612 | RoseVertex v = tryForAnchoredVertex(tbi, iv_info, ep); |
613 | if (v != RoseGraph::null_vertex()) { |
614 | DEBUG_PRINTF("add anchored literal vertex\n" ); |
615 | vertex_map[iv].push_back(v); |
616 | return; |
617 | } |
618 | } |
619 | |
620 | floating: |
621 | vector<u8> msk, cmp; |
622 | if (tbi->cc.grey.roseHamsterMasks && in_degree(iv, ig) == 1) { |
623 | RoseInEdge e = *in_edges(iv, ig).first; |
624 | if (ig[e].graph) { |
625 | findRoseLiteralMask(*ig[e].graph, ig[e].graph_lag, msk, cmp); |
626 | } |
627 | } |
628 | |
629 | u32 delay = iv_info.delay; |
630 | rose_literal_table table = use_eod_table ? ROSE_EOD_ANCHORED : ROSE_FLOATING; |
631 | |
632 | u32 literalId = tbi->getLiteralId(iv_info.s, msk, cmp, delay, table); |
633 | |
634 | DEBUG_PRINTF("literal=%u (len=%zu, delay=%u, offsets=[%u,%u] '%s')\n" , |
635 | literalId, iv_info.s.length(), delay, iv_info.min_offset, |
636 | iv_info.max_offset, dumpString(iv_info.s).c_str()); |
637 | |
638 | createVertices(tbi, vertex_map, parents, iv, iv_info.min_offset, |
639 | iv_info.max_offset, literalId, delay, bd); |
640 | } |
641 | |
642 | static |
643 | unique_ptr<NGHolder> makeRoseEodPrefix(const NGHolder &h, RoseBuildImpl &build, |
644 | map<flat_set<ReportID>, ReportID> &remap) { |
645 | assert(generates_callbacks(h)); |
646 | assert(!in_degree(h.accept, h)); |
647 | auto gg = cloneHolder(h); |
648 | NGHolder &g = *gg; |
649 | g.kind = is_triggered(h) ? NFA_INFIX : NFA_PREFIX; |
650 | |
651 | // Move acceptEod edges over to accept. |
652 | vector<NFAEdge> dead; |
653 | for (const auto &e : in_edges_range(g.acceptEod, g)) { |
654 | NFAVertex u = source(e, g); |
655 | if (u == g.accept) { |
656 | continue; |
657 | } |
658 | add_edge_if_not_present(u, g.accept, g); |
659 | dead.push_back(e); |
660 | |
661 | if (!contains(remap, g[u].reports)) { |
662 | remap[g[u].reports] = build.getNewNfaReport(); |
663 | } |
664 | |
665 | g[u].reports = { remap[g[u].reports] }; |
666 | } |
667 | |
668 | remove_edges(dead, g); |
669 | return gg; |
670 | } |
671 | |
672 | static |
673 | u32 getEodEventID(RoseBuildImpl &build) { |
674 | // Allocate the EOD event if it hasn't been already. |
675 | if (build.eod_event_literal_id == MO_INVALID_IDX) { |
676 | build.eod_event_literal_id = build.getLiteralId({}, 0, ROSE_EVENT); |
677 | } |
678 | |
679 | return build.eod_event_literal_id; |
680 | } |
681 | |
682 | static |
683 | void makeEodEventLeftfix(RoseBuildImpl &build, RoseVertex u, |
684 | const NGHolder &h) { |
685 | assert(!build.isInETable(u)); |
686 | |
687 | RoseGraph &g = build.g; |
688 | map<flat_set<ReportID>, ReportID> report_remap; |
689 | shared_ptr<NGHolder> eod_leftfix |
690 | = makeRoseEodPrefix(h, build, report_remap); |
691 | |
692 | u32 eod_event = getEodEventID(build); |
693 | |
694 | for (const auto &report_mapping : report_remap) { |
695 | RoseVertex v = add_vertex(g); |
696 | g[v].literals.insert(eod_event); |
697 | build.literal_info[eod_event].vertices.insert(v); |
698 | |
699 | g[v].left.graph = eod_leftfix; |
700 | g[v].left.leftfix_report = report_mapping.second; |
701 | g[v].left.lag = 0; |
702 | RoseEdge e1 = add_edge(u, v, g); |
703 | g[e1].minBound = 0; |
704 | g[e1].maxBound = ROSE_BOUND_INF; |
705 | g[v].min_offset = add_rose_depth(g[u].min_offset, |
706 | findMinWidth(*g[v].left.graph)); |
707 | g[v].max_offset = ROSE_BOUND_INF; |
708 | |
709 | depth max_width = findMaxWidth(*g[v].left.graph); |
710 | if (u != build.root && max_width.is_finite() |
711 | && (!build.isAnyStart(u) || isPureAnchored(*g[v].left.graph))) { |
712 | g[e1].maxBound = max_width; |
713 | g[v].max_offset = add_rose_depth(g[u].max_offset, max_width); |
714 | } |
715 | |
716 | g[e1].history = ROSE_ROLE_HISTORY_NONE; // handled by prefix |
717 | RoseVertex w = add_vertex(g); |
718 | g[w].eod_accept = true; |
719 | g[w].reports = report_mapping.first; |
720 | g[w].min_offset = g[v].min_offset; |
721 | g[w].max_offset = g[v].max_offset; |
722 | RoseEdge e = add_edge(v, w, g); |
723 | g[e].minBound = 0; |
724 | g[e].maxBound = 0; |
725 | /* No need to set history as the event is only delivered at the last |
726 | * byte anyway - no need to invalidate stale entries. */ |
727 | g[e].history = ROSE_ROLE_HISTORY_NONE; |
728 | DEBUG_PRINTF("accept eod vertex (index=%zu)\n" , g[w].index); |
729 | } |
730 | } |
731 | |
732 | static |
733 | void doRoseAcceptVertex(RoseBuildImpl *tbi, |
734 | const vector<pair<RoseVertex, RoseInEdge> > &parents, |
735 | RoseInVertex iv, const RoseBuildData &bd) { |
736 | const RoseInGraph &ig = bd.ig; |
737 | assert(ig[iv].type == RIV_ACCEPT || ig[iv].type == RIV_ACCEPT_EOD); |
738 | |
739 | RoseGraph &g = tbi->g; |
740 | |
741 | for (const auto &pv : parents) { |
742 | RoseVertex u = pv.first; |
743 | const RoseInEdgeProps &edge_props = bd.ig[pv.second]; |
744 | |
745 | /* We need to duplicate the parent vertices if: |
746 | * |
747 | * 1) It already has a suffix, etc as we are going to add the specified |
748 | * suffix, etc to the parents and we do not want to overwrite the |
749 | * existing information. |
750 | * |
751 | * 2) We are making the an EOD accept and the vertex already has other |
752 | * out-edges - The LAST_BYTE history used for EOD accepts is |
753 | * incompatible with normal successors. As accepts are processed last we |
754 | * do not need to worry about other normal successors being added later. |
755 | */ |
756 | if (g[u].suffix || !g[u].reports.empty() |
757 | || (ig[iv].type == RIV_ACCEPT_EOD && out_degree(u, g) |
758 | && !edge_props.graph) |
759 | || (!isLeafNode(u, g) && !tbi->isAnyStart(u))) { |
760 | DEBUG_PRINTF("duplicating for parent %zu\n" , g[u].index); |
761 | assert(!tbi->isAnyStart(u)); |
762 | u = duplicate(tbi, u); |
763 | g[u].suffix.reset(); |
764 | g[u].eod_accept = false; |
765 | } |
766 | |
767 | assert(!g[u].suffix); |
768 | if (ig[iv].type == RIV_ACCEPT) { |
769 | assert(!tbi->isAnyStart(u)); |
770 | if (edge_props.dfa) { |
771 | DEBUG_PRINTF("adding early dfa suffix to i%zu\n" , g[u].index); |
772 | g[u].suffix.rdfa = edge_props.dfa; |
773 | g[u].suffix.dfa_min_width = findMinWidth(*edge_props.graph); |
774 | g[u].suffix.dfa_max_width = findMaxWidth(*edge_props.graph); |
775 | } else if (edge_props.graph) { |
776 | DEBUG_PRINTF("adding suffix to i%zu\n" , g[u].index); |
777 | g[u].suffix.graph = edge_props.graph; |
778 | assert(g[u].suffix.graph->kind == NFA_SUFFIX); |
779 | /* TODO: set dfa_(min|max)_width */ |
780 | } else if (edge_props.haig) { |
781 | DEBUG_PRINTF("adding suffaig to i%zu\n" , g[u].index); |
782 | g[u].suffix.haig = edge_props.haig; |
783 | } else { |
784 | DEBUG_PRINTF("adding boring accept to i%zu\n" , g[u].index); |
785 | assert(!g[u].eod_accept); |
786 | g[u].reports = ig[iv].reports; |
787 | } |
788 | } else { |
789 | assert(ig[iv].type == RIV_ACCEPT_EOD); |
790 | assert(!edge_props.haig); |
791 | |
792 | if (!edge_props.graph) { |
793 | RoseVertex w = add_vertex(g); |
794 | g[w].eod_accept = true; |
795 | g[w].reports = ig[iv].reports; |
796 | g[w].min_offset = g[u].min_offset; |
797 | g[w].max_offset = g[u].max_offset; |
798 | RoseEdge e = add_edge(u, w, g); |
799 | g[e].minBound = 0; |
800 | g[e].maxBound = 0; |
801 | g[e].history = ROSE_ROLE_HISTORY_LAST_BYTE; |
802 | DEBUG_PRINTF("accept eod vertex (index=%zu)\n" , g[w].index); |
803 | continue; |
804 | } |
805 | |
806 | const NGHolder &h = *edge_props.graph; |
807 | assert(!in_degree(h.accept, h)); |
808 | assert(generates_callbacks(h)); |
809 | |
810 | if (tbi->isInETable(u)) { |
811 | assert(h.kind == NFA_SUFFIX); |
812 | assert(!tbi->isAnyStart(u)); |
813 | /* etable can't/shouldn't use eod event */ |
814 | DEBUG_PRINTF("adding suffix to i%zu\n" , g[u].index); |
815 | g[u].suffix.graph = edge_props.graph; |
816 | continue; |
817 | } |
818 | |
819 | makeEodEventLeftfix(*tbi, u, h); |
820 | } |
821 | } |
822 | } |
823 | |
824 | static |
825 | bool suitableForEod(const RoseInGraph &ig, vector<RoseInVertex> topo, |
826 | u32 *max_len, const CompileContext &cc) { |
827 | map<RoseInVertex, u32> max_depth_from_eod; |
828 | *max_len = 0; |
829 | |
830 | reverse(topo.begin(), topo.end()); /* we want to start at accept end */ |
831 | |
832 | for (auto v : topo) { |
833 | u32 v_depth = 0; |
834 | |
835 | if (ig[v].type == RIV_ACCEPT) { |
836 | DEBUG_PRINTF("[ACCEPT]\n" ); |
837 | for (const auto &e : in_edges_range(v, ig)) { |
838 | if (!ig[e].graph || !can_only_match_at_eod(*ig[e].graph)) { |
839 | DEBUG_PRINTF("floating accept\n" ); |
840 | return false; |
841 | } |
842 | } |
843 | } |
844 | |
845 | switch (ig[v].type) { |
846 | case RIV_LITERAL: |
847 | DEBUG_PRINTF("[LITERAL]\n" ); |
848 | break; |
849 | case RIV_START: |
850 | DEBUG_PRINTF("[START]\n" ); |
851 | break; |
852 | case RIV_ANCHORED_START: |
853 | DEBUG_PRINTF("[ANCHOR]\n" ); |
854 | break; |
855 | case RIV_ACCEPT: |
856 | break; |
857 | case RIV_ACCEPT_EOD: |
858 | DEBUG_PRINTF("[EOD]\n" ); |
859 | break; |
860 | default: |
861 | assert(0); |
862 | DEBUG_PRINTF("????\n" ); |
863 | return false; |
864 | } |
865 | |
866 | for (const auto &e : out_edges_range(v, ig)) { |
867 | RoseInVertex t = target(e, ig); |
868 | |
869 | assert(contains(max_depth_from_eod, t)); |
870 | u64a max_width; |
871 | |
872 | if (ig[v].type == RIV_START || ig[v].type == RIV_ANCHORED_START) { |
873 | /* start itself doesn't need to be in history buffer |
874 | * just need to make sure all succ literals are ok */ |
875 | if (ig[t].type == RIV_LITERAL) { |
876 | max_width = ig[t].s.length(); |
877 | } else { |
878 | max_width = 0; |
879 | } |
880 | if (ig[e].graph) { |
881 | depth graph_max_width = findMaxWidth(*ig[e].graph); |
882 | DEBUG_PRINTF("graph max width %s, lag %u\n" , |
883 | graph_max_width.str().c_str(), |
884 | ig[e].graph_lag); |
885 | if (!graph_max_width.is_finite()) { |
886 | DEBUG_PRINTF("fail due to graph with inf max width\n" ); |
887 | return false; |
888 | } |
889 | max_width += graph_max_width; |
890 | } |
891 | } else if (ig[e].haig) { |
892 | DEBUG_PRINTF("fail due to haig\n" ); |
893 | return false; |
894 | } else if (ig[e].graph) { |
895 | depth graph_max_width = findMaxWidth(*ig[e].graph); |
896 | DEBUG_PRINTF("graph max width %s, lag %u\n" , |
897 | graph_max_width.str().c_str(), ig[e].graph_lag); |
898 | if (!graph_max_width.is_finite()) { |
899 | DEBUG_PRINTF("fail due to graph with inf max width\n" ); |
900 | return false; |
901 | } |
902 | max_width = ig[e].graph_lag + graph_max_width; |
903 | } else { |
904 | max_width = ig[e].maxBound; |
905 | if (ig[t].type == RIV_LITERAL) { |
906 | max_width += ig[t].s.length(); |
907 | } |
908 | } |
909 | |
910 | max_width += max_depth_from_eod[t]; |
911 | if (max_width > ROSE_BOUND_INF) { |
912 | max_width = ROSE_BOUND_INF; |
913 | } |
914 | |
915 | DEBUG_PRINTF("max_width=%llu\n" , max_width); |
916 | |
917 | ENSURE_AT_LEAST(&v_depth, (u32)max_width); |
918 | } |
919 | |
920 | if (v_depth == ROSE_BOUND_INF |
921 | || v_depth > cc.grey.maxHistoryAvailable) { |
922 | DEBUG_PRINTF("not suitable for eod table %u\n" , v_depth); |
923 | return false; |
924 | } |
925 | |
926 | max_depth_from_eod[v] = v_depth; |
927 | ENSURE_AT_LEAST(max_len, v_depth); |
928 | } |
929 | |
930 | DEBUG_PRINTF("to the eod table and beyond\n" ); |
931 | return true; |
932 | } |
933 | |
934 | static |
935 | void shift_accepts_to_end(const RoseInGraph &ig, |
936 | vector<RoseInVertex> &topo_order) { |
937 | stable_partition(begin(topo_order), end(topo_order), |
938 | [&](RoseInVertex v){ return !is_any_accept(v, ig); }); |
939 | } |
940 | |
941 | static |
942 | void populateRoseGraph(RoseBuildImpl *tbi, RoseBuildData &bd) { |
943 | const RoseInGraph &ig = bd.ig; |
944 | |
945 | /* add the pattern in to the main rose graph */ |
946 | DEBUG_PRINTF("%srose pop\n" , bd.som ? "som " : "" ); |
947 | |
948 | /* Note: an input vertex may need to create several rose vertices. This is |
949 | * primarily because a RoseVertex can only have 1 one leftfix */ |
950 | map<RoseInVertex, vector<RoseVertex> > vertex_map; |
951 | |
952 | vector<RoseInVertex> v_order = topo_order(ig); |
953 | shift_accepts_to_end(ig, v_order); |
954 | |
955 | u32 eod_space_required; |
956 | bool use_eod_table = suitableForEod(ig, v_order, &eod_space_required, |
957 | tbi->cc); |
958 | if (use_eod_table) { |
959 | ENSURE_AT_LEAST(&tbi->ematcher_region_size, eod_space_required); |
960 | } |
961 | |
962 | assert(ig[v_order.front()].type == RIV_START |
963 | || ig[v_order.front()].type == RIV_ANCHORED_START); |
964 | |
965 | for (RoseInVertex iv : v_order) { |
966 | DEBUG_PRINTF("vertex %zu\n" , ig[iv].index); |
967 | |
968 | if (ig[iv].type == RIV_START) { |
969 | DEBUG_PRINTF("is root\n" ); |
970 | vertex_map[iv].push_back(tbi->root); |
971 | continue; |
972 | } else if (ig[iv].type == RIV_ANCHORED_START) { |
973 | DEBUG_PRINTF("is anchored root\n" ); |
974 | vertex_map[iv].push_back(tbi->anchored_root); |
975 | continue; |
976 | } |
977 | |
978 | vector<pair<RoseVertex, RoseInEdge> > parents; |
979 | for (const auto &e : in_edges_range(iv, ig)) { |
980 | RoseInVertex u = source(e, ig); |
981 | assert(contains(vertex_map, u)); |
982 | const vector<RoseVertex> &images = vertex_map[u]; |
983 | |
984 | // We should have no dupes. |
985 | assert(set<RoseVertex>(images.begin(), images.end()).size() |
986 | == images.size()); |
987 | |
988 | for (auto v_image : images) { |
989 | // v_image should NOT already be in our parents list. |
990 | assert(find_if(parents.begin(), parents.end(), |
991 | [&v_image](const pair<RoseVertex, RoseInEdge> &p) { |
992 | return p.first == v_image; |
993 | }) == parents.end()); |
994 | |
995 | parents.emplace_back(v_image, e); |
996 | |
997 | if (tbi->isAnchored(v_image)) { |
998 | assert(!use_eod_table); |
999 | u32 overlap = findRoseAnchorFloatingOverlap(ig[e], ig[iv]); |
1000 | assert(overlap <= tbi->cc.grey.maxHistoryAvailable + 1); |
1001 | ENSURE_AT_LEAST(&tbi->max_rose_anchored_floating_overlap, |
1002 | overlap); |
1003 | } |
1004 | } |
1005 | } |
1006 | |
1007 | if (ig[iv].type == RIV_LITERAL) { |
1008 | DEBUG_PRINTF("LITERAL '%s'\n" , dumpString(ig[iv].s).c_str()); |
1009 | assert(!isLeafNode(iv, ig)); |
1010 | doRoseLiteralVertex(tbi, use_eod_table, vertex_map, parents, iv, |
1011 | bd); |
1012 | } else { |
1013 | if (ig[iv].type == RIV_ACCEPT) { |
1014 | DEBUG_PRINTF("ACCEPT\n" ); |
1015 | } else { |
1016 | assert(ig[iv].type == RIV_ACCEPT_EOD); |
1017 | DEBUG_PRINTF("ACCEPT_EOD\n" ); |
1018 | } |
1019 | assert(isLeafNode(iv, ig)); /* accepts are final */ |
1020 | doRoseAcceptVertex(tbi, parents, iv, bd); |
1021 | } |
1022 | } |
1023 | DEBUG_PRINTF("done\n" ); |
1024 | } |
1025 | |
1026 | template<typename GraphT> |
1027 | static |
1028 | bool empty(const GraphT &g) { |
1029 | typename GraphT::vertex_iterator vi, ve; |
1030 | tie(vi, ve) = vertices(g); |
1031 | return vi == ve; |
1032 | } |
1033 | |
1034 | static |
1035 | bool canImplementGraph(NGHolder &h, bool prefilter, const ReportManager &rm, |
1036 | const CompileContext &cc) { |
1037 | if (isImplementableNFA(h, &rm, cc)) { |
1038 | return true; |
1039 | } |
1040 | |
1041 | if (prefilter && cc.grey.prefilterReductions) { |
1042 | // If we're prefiltering, we can have another go with a reduced graph. |
1043 | UNUSED size_t numBefore = num_vertices(h); |
1044 | prefilterReductions(h, cc); |
1045 | UNUSED size_t numAfter = num_vertices(h); |
1046 | DEBUG_PRINTF("reduced from %zu to %zu vertices\n" , numBefore, numAfter); |
1047 | |
1048 | if (isImplementableNFA(h, &rm, cc)) { |
1049 | return true; |
1050 | } |
1051 | } |
1052 | |
1053 | DEBUG_PRINTF("unable to build engine\n" ); |
1054 | return false; |
1055 | } |
1056 | |
1057 | static |
1058 | bool predsAreDelaySensitive(const RoseInGraph &ig, RoseInVertex v) { |
1059 | assert(in_degree(v, ig)); |
1060 | |
1061 | for (const auto &e : in_edges_range(v, ig)) { |
1062 | if (ig[e].graph || ig[e].haig) { |
1063 | DEBUG_PRINTF("edge graph\n" ); |
1064 | return true; |
1065 | } |
1066 | if (ig[e].minBound || ig[e].maxBound != ROSE_BOUND_INF) { |
1067 | DEBUG_PRINTF("edge bounds\n" ); |
1068 | return true; |
1069 | } |
1070 | |
1071 | RoseInVertex u = source(e, ig); |
1072 | if (ig[u].type == RIV_START) { |
1073 | continue; |
1074 | } |
1075 | if (ig[u].type != RIV_LITERAL) { |
1076 | DEBUG_PRINTF("unsafe pred vertex\n" ); |
1077 | return true; |
1078 | } |
1079 | if (ig[u].delay) { |
1080 | DEBUG_PRINTF("pred has delay\n" ); |
1081 | return true; |
1082 | } |
1083 | } |
1084 | |
1085 | return false; |
1086 | } |
1087 | |
1088 | static |
1089 | u32 maxAvailableDelay(const ue2_literal &pred_key, const ue2_literal &lit_key) { |
1090 | /* overly conservative if only part of the string is nocase */ |
1091 | string pred = pred_key.get_string(); |
1092 | string lit = lit_key.get_string(); |
1093 | |
1094 | if (pred_key.any_nocase() || lit_key.any_nocase()) { |
1095 | upperString(pred); |
1096 | upperString(lit); |
1097 | } |
1098 | |
1099 | string::size_type last = pred.rfind(lit); |
1100 | if (last == string::npos) { |
1101 | return MAX_DELAY; |
1102 | } |
1103 | |
1104 | u32 raw = pred.size() - last - 1; |
1105 | return MIN(raw, MAX_DELAY); |
1106 | } |
1107 | |
1108 | static |
1109 | u32 findMaxSafeDelay(const RoseInGraph &ig, RoseInVertex u, RoseInVertex v) { |
1110 | // First, check the overlap constraints on (u,v). |
1111 | size_t max_delay; |
1112 | if (ig[v].type == RIV_LITERAL) { |
1113 | DEBUG_PRINTF("lit->lit edge: '%s' -> '%s'\n" , |
1114 | escapeString(ig[u].s).c_str(), |
1115 | escapeString(ig[v].s).c_str()); |
1116 | max_delay = maxAvailableDelay(ig[u].s, ig[v].s); |
1117 | } else if (ig[v].type == RIV_ACCEPT) { |
1118 | DEBUG_PRINTF("lit->accept edge: '%s' -> ACCEPT\n" , |
1119 | escapeString(ig[u].s).c_str()); |
1120 | max_delay = MAX_DELAY; |
1121 | } else { |
1122 | assert(0); |
1123 | return 0; |
1124 | } |
1125 | |
1126 | DEBUG_PRINTF("max safe delay for this edge: %zu\n" , max_delay); |
1127 | |
1128 | // Now consider the predecessors of u. |
1129 | for (const auto &e : in_edges_range(u, ig)) { |
1130 | RoseInVertex w = source(e, ig); |
1131 | if (ig[w].type == RIV_START) { |
1132 | continue; |
1133 | } |
1134 | assert(ig[w].type == RIV_LITERAL); |
1135 | assert(ig[w].delay == 0); |
1136 | |
1137 | DEBUG_PRINTF("pred lit->lit edge: '%s' -> '%s'\n" , |
1138 | escapeString(ig[w].s).c_str(), |
1139 | escapeString(ig[u].s).c_str()); |
1140 | |
1141 | // We cannot delay the literal on u so much that a predecessor literal |
1142 | // could occur in the delayed region. For example, consider |
1143 | // 'barman.*foobar': if we allow 'foobar' to be delayed by 3, then |
1144 | // 'barman' could occur in the input string and race with 'foobar', as |
1145 | // in 'foobarman'. |
1146 | |
1147 | const size_t pred_len = ig[w].s.length(); |
1148 | size_t overlap = maxOverlap(ig[u].s, ig[w].s, 0); |
1149 | DEBUG_PRINTF("pred_len=%zu, overlap=%zu\n" , pred_len, overlap); |
1150 | assert(overlap <= pred_len); |
1151 | size_t max_lit_delay = pred_len - min(overlap + 1, pred_len); |
1152 | DEBUG_PRINTF("overlap=%zu -> max_lit_delay=%zu\n" , overlap, |
1153 | max_lit_delay); |
1154 | max_delay = min(max_delay, max_lit_delay); |
1155 | } |
1156 | |
1157 | DEBUG_PRINTF("max_delay=%zu\n" , max_delay); |
1158 | assert(max_delay <= MAX_DELAY); |
1159 | return max_delay; |
1160 | } |
1161 | |
1162 | static |
1163 | bool transformInfixToDelay(const RoseInGraph &ig, const RoseInEdge &e, |
1164 | const CompileContext &cc, u32 *delay_out) { |
1165 | const u32 max_history = |
1166 | cc.streaming ? cc.grey.maxHistoryAvailable : ROSE_BOUND_INF; |
1167 | |
1168 | const RoseInVertex u = source(e, ig), v = target(e, ig); |
1169 | const u32 graph_lag = ig[e].graph_lag; |
1170 | |
1171 | // Clone a copy of the graph, as we need to be able to roll back this |
1172 | // operation. |
1173 | NGHolder h; |
1174 | cloneHolder(h, *ig[e].graph); |
1175 | |
1176 | DEBUG_PRINTF("target literal: %s\n" , dumpString(ig[v].s).c_str()); |
1177 | DEBUG_PRINTF("graph with %zu vertices and graph_lag %u\n" , num_vertices(h), |
1178 | graph_lag); |
1179 | |
1180 | assert(graph_lag <= ig[v].s.length()); |
1181 | if (graph_lag < ig[v].s.length()) { |
1182 | size_t len = ig[v].s.length() - graph_lag; |
1183 | ue2_literal lit(ig[v].s.substr(0, len)); |
1184 | DEBUG_PRINTF("lit2=%s\n" , dumpString(lit).c_str()); |
1185 | u32 delay2 = removeTrailingLiteralStates(h, lit, max_history); |
1186 | if (delay2 == MO_INVALID_IDX) { |
1187 | DEBUG_PRINTF("couldn't remove trailing literal\n" ); |
1188 | return false; |
1189 | } |
1190 | if (delay2 != len) { |
1191 | DEBUG_PRINTF("couldn't remove entire trailing literal\n" ); |
1192 | return false; |
1193 | } |
1194 | } |
1195 | |
1196 | PureRepeat repeat; |
1197 | if (!isPureRepeat(h, repeat)) { |
1198 | DEBUG_PRINTF("graph is not repeat\n" ); |
1199 | return false; |
1200 | } |
1201 | DEBUG_PRINTF("graph is %s repeat\n" , repeat.bounds.str().c_str()); |
1202 | if (!repeat.bounds.max.is_infinite()) { |
1203 | DEBUG_PRINTF("not inf\n" ); |
1204 | return false; |
1205 | } |
1206 | |
1207 | if (!repeat.reach.all()) { |
1208 | DEBUG_PRINTF("non-dot reach\n" ); |
1209 | return false; |
1210 | } |
1211 | |
1212 | u32 delay = ig[v].s.length() + repeat.bounds.min; |
1213 | if (delay > MAX_DELAY) { |
1214 | DEBUG_PRINTF("delay %u > MAX_DELAY\n" , delay); |
1215 | return false; |
1216 | } |
1217 | |
1218 | if (delay + ig[u].s.length() - 1 > max_history) { |
1219 | DEBUG_PRINTF("delay too large for history\n" ); |
1220 | return false; |
1221 | } |
1222 | |
1223 | *delay_out = delay; |
1224 | return true; |
1225 | } |
1226 | |
1227 | static |
1228 | void transformLiteralDelay(RoseInGraph &ig, const CompileContext &cc) { |
1229 | if (!cc.grey.roseTransformDelay) { |
1230 | return; |
1231 | } |
1232 | |
1233 | for (auto u : vertices_range(ig)) { |
1234 | if (ig[u].type != RIV_LITERAL) { |
1235 | continue; |
1236 | } |
1237 | if (out_degree(u, ig) != 1) { |
1238 | continue; |
1239 | } |
1240 | |
1241 | RoseInEdge e = *out_edges(u, ig).first; |
1242 | RoseInVertex v = target(e, ig); |
1243 | if (ig[v].type != RIV_LITERAL) { |
1244 | continue; |
1245 | } |
1246 | if (ig[e].haig) { |
1247 | continue; |
1248 | } |
1249 | if (!ig[e].graph) { |
1250 | continue; |
1251 | } |
1252 | |
1253 | if (predsAreDelaySensitive(ig, u)) { |
1254 | DEBUG_PRINTF("preds are delay sensitive\n" ); |
1255 | continue; |
1256 | } |
1257 | |
1258 | u32 max_delay = findMaxSafeDelay(ig, u, v); |
1259 | |
1260 | DEBUG_PRINTF("lit->lit edge with graph: '%s' -> '%s'\n" , |
1261 | escapeString(ig[u].s).c_str(), |
1262 | escapeString(ig[v].s).c_str()); |
1263 | |
1264 | u32 delay = 0; |
1265 | if (!transformInfixToDelay(ig, e, cc, &delay)) { |
1266 | continue; |
1267 | } |
1268 | |
1269 | if (delay > max_delay) { |
1270 | DEBUG_PRINTF("delay=%u > max_delay=%u\n" , delay, max_delay); |
1271 | continue; |
1272 | } |
1273 | |
1274 | DEBUG_PRINTF("setting lit delay to %u and deleting graph\n" , delay); |
1275 | ig[u].delay = delay; |
1276 | ig[u].min_offset = add_rose_depth(ig[u].min_offset, delay); |
1277 | ig[u].max_offset = add_rose_depth(ig[u].max_offset, delay); |
1278 | ig[e].graph_lag = 0; |
1279 | ig[e].graph.reset(); |
1280 | ig[e].minBound = 0; |
1281 | ig[e].maxBound = ROSE_BOUND_INF; |
1282 | } |
1283 | } |
1284 | |
1285 | static |
1286 | bool transformInfixToAnchBounds(const RoseInGraph &ig, const RoseInEdge &e, |
1287 | const CompileContext &cc, DepthMinMax *bounds) { |
1288 | const u32 max_history = cc.streaming ? cc.grey.maxHistoryAvailable |
1289 | : ROSE_BOUND_INF; |
1290 | |
1291 | const RoseInVertex v = target(e, ig); |
1292 | const u32 graph_lag = ig[e].graph_lag; |
1293 | |
1294 | // Clone a copy of the graph, as we need to be able to roll back this |
1295 | // operation. |
1296 | NGHolder h; |
1297 | cloneHolder(h, *ig[e].graph); |
1298 | |
1299 | DEBUG_PRINTF("graph with %zu vertices and graph_lag %u\n" , num_vertices(h), |
1300 | graph_lag); |
1301 | |
1302 | assert(graph_lag <= ig[v].s.length()); |
1303 | if (graph_lag < ig[v].s.length()) { |
1304 | size_t len = ig[v].s.length() - graph_lag; |
1305 | ue2_literal lit(ig[v].s.substr(0, len)); |
1306 | DEBUG_PRINTF("lit2=%s\n" , dumpString(lit).c_str()); |
1307 | u32 delay2 = removeTrailingLiteralStates(h, lit, max_history); |
1308 | if (delay2 == MO_INVALID_IDX) { |
1309 | DEBUG_PRINTF("couldn't remove trailing literal\n" ); |
1310 | return false; |
1311 | } |
1312 | if (delay2 != len) { |
1313 | DEBUG_PRINTF("couldn't remove entire trailing literal\n" ); |
1314 | return false; |
1315 | } |
1316 | } |
1317 | |
1318 | PureRepeat repeat; |
1319 | if (!isPureRepeat(h, repeat)) { |
1320 | DEBUG_PRINTF("graph is not repeat\n" ); |
1321 | return false; |
1322 | } |
1323 | DEBUG_PRINTF("graph is %s repeat\n" , repeat.bounds.str().c_str()); |
1324 | if (!repeat.bounds.max.is_infinite()) { |
1325 | DEBUG_PRINTF("not inf\n" ); |
1326 | return false; |
1327 | } |
1328 | |
1329 | if (!repeat.reach.all()) { |
1330 | DEBUG_PRINTF("non-dot reach\n" ); |
1331 | return false; |
1332 | } |
1333 | |
1334 | *bounds = repeat.bounds; |
1335 | return true; |
1336 | } |
1337 | |
1338 | static |
1339 | void transformAnchoredLiteralOverlap(RoseInGraph &ig, RoseBuildData &bd, |
1340 | const CompileContext &cc) { |
1341 | if (!cc.grey.roseTransformDelay) { |
1342 | return; |
1343 | } |
1344 | |
1345 | for (const auto &e : edges_range(ig)) { |
1346 | const RoseInVertex u = source(e, ig); |
1347 | const RoseInVertex v = target(e, ig); |
1348 | |
1349 | if (ig[u].type != RIV_LITERAL || ig[v].type != RIV_LITERAL) { |
1350 | continue; |
1351 | } |
1352 | if (ig[e].haig || !ig[e].graph) { |
1353 | continue; |
1354 | } |
1355 | |
1356 | if (ig[u].min_offset != ig[u].max_offset) { |
1357 | DEBUG_PRINTF("u not fixed depth\n" ); |
1358 | continue; |
1359 | } |
1360 | |
1361 | DEBUG_PRINTF("anch_lit->lit edge with graph: '%s' -> '%s'\n" , |
1362 | escapeString(ig[u].s).c_str(), |
1363 | escapeString(ig[v].s).c_str()); |
1364 | |
1365 | DepthMinMax bounds; |
1366 | if (!transformInfixToAnchBounds(ig, e, cc, &bounds)) { |
1367 | continue; |
1368 | } |
1369 | |
1370 | DEBUG_PRINTF("setting bounds to %s and deleting graph\n" , |
1371 | bounds.str().c_str()); |
1372 | ig[e].graph_lag = 0; |
1373 | ig[e].graph.reset(); |
1374 | ig[e].minBound = bounds.min; |
1375 | ig[e].maxBound = bounds.max.is_finite() ? (u32)bounds.max |
1376 | : ROSE_BOUND_INF; |
1377 | bd.anch_history_edges.insert(e); |
1378 | } |
1379 | } |
1380 | |
1381 | /** |
1382 | * \brief Transform small trailing dot repeat suffixes into delay on the last |
1383 | * literal. |
1384 | * |
1385 | * For example, the case /hatstand.*teakettle./s can just delay 'teakettle' +1 |
1386 | * rather than having a suffix to handle the dot. |
1387 | * |
1388 | * This transformation looks for literal->accept edges and transforms them if |
1389 | * appropriate. It doesn't handle complex cases where the literal has more than |
1390 | * one successor. |
1391 | */ |
1392 | static |
1393 | void transformSuffixDelay(RoseInGraph &ig, const CompileContext &cc) { |
1394 | if (!cc.grey.roseTransformDelay) { |
1395 | return; |
1396 | } |
1397 | |
1398 | const u32 max_history = cc.streaming ? cc.grey.maxHistoryAvailable |
1399 | : ROSE_BOUND_INF; |
1400 | |
1401 | set<RoseInVertex> modified_accepts; // may be dead after transform |
1402 | |
1403 | for (auto u : vertices_range(ig)) { |
1404 | if (ig[u].type != RIV_LITERAL) { |
1405 | continue; |
1406 | } |
1407 | if (out_degree(u, ig) != 1) { |
1408 | continue; |
1409 | } |
1410 | |
1411 | RoseInEdge e = *out_edges(u, ig).first; |
1412 | RoseInVertex v = target(e, ig); |
1413 | if (ig[v].type != RIV_ACCEPT) { |
1414 | continue; |
1415 | } |
1416 | if (ig[e].haig) { |
1417 | continue; |
1418 | } |
1419 | if (!ig[e].graph) { |
1420 | continue; |
1421 | } |
1422 | |
1423 | if (predsAreDelaySensitive(ig, u)) { |
1424 | DEBUG_PRINTF("preds are delay sensitive\n" ); |
1425 | continue; |
1426 | } |
1427 | |
1428 | DEBUG_PRINTF("lit->accept edge with graph: lit='%s'\n" , |
1429 | escapeString(ig[u].s).c_str()); |
1430 | |
1431 | const NGHolder &h = *ig[e].graph; |
1432 | const set<ReportID> reports = all_reports(h); |
1433 | if (reports.size() != 1) { |
1434 | DEBUG_PRINTF("too many reports\n" ); |
1435 | continue; |
1436 | } |
1437 | |
1438 | PureRepeat repeat; |
1439 | if (!isPureRepeat(h, repeat)) { |
1440 | DEBUG_PRINTF("suffix graph is not repeat\n" ); |
1441 | continue; |
1442 | } |
1443 | DEBUG_PRINTF("suffix graph is %s repeat\n" , |
1444 | repeat.bounds.str().c_str()); |
1445 | |
1446 | if (!repeat.reach.all()) { |
1447 | DEBUG_PRINTF("non-dot reach\n" ); |
1448 | continue; |
1449 | } |
1450 | |
1451 | if (repeat.bounds.min != repeat.bounds.max || |
1452 | repeat.bounds.min > depth(MAX_DELAY)) { |
1453 | DEBUG_PRINTF("repeat is variable or too large\n" ); |
1454 | continue; |
1455 | } |
1456 | |
1457 | u32 max_delay = findMaxSafeDelay(ig, u, v); |
1458 | |
1459 | u32 delay = repeat.bounds.min; |
1460 | if (delay > max_delay) { |
1461 | DEBUG_PRINTF("delay=%u > max_delay=%u\n" , delay, max_delay); |
1462 | continue; |
1463 | } |
1464 | |
1465 | if (delay + ig[u].s.length() - 1 > max_history) { |
1466 | DEBUG_PRINTF("delay too large for history\n" ); |
1467 | continue; |
1468 | } |
1469 | |
1470 | DEBUG_PRINTF("setting lit delay to %u and removing suffix\n" , delay); |
1471 | ig[u].delay = delay; |
1472 | ig[u].min_offset = add_rose_depth(ig[u].min_offset, delay); |
1473 | ig[u].max_offset = add_rose_depth(ig[u].max_offset, delay); |
1474 | |
1475 | // Construct a new accept vertex for this report and remove edge e. |
1476 | // (This allows us to cope if v has more than one in-edge). |
1477 | RoseInVertex v2 = |
1478 | add_vertex(RoseInVertexProps::makeAccept(reports), ig); |
1479 | add_edge(u, v2, ig); |
1480 | remove_edge(e, ig); |
1481 | modified_accepts.insert(v); |
1482 | } |
1483 | |
1484 | DEBUG_PRINTF("%zu modified accepts\n" , modified_accepts.size()); |
1485 | |
1486 | for (auto v : modified_accepts) { |
1487 | if (in_degree(v, ig) == 0) { |
1488 | DEBUG_PRINTF("removing accept vertex with no preds\n" ); |
1489 | remove_vertex(v, ig); |
1490 | } |
1491 | } |
1492 | } |
1493 | |
1494 | #ifndef NDEBUG |
1495 | static |
1496 | bool validateKinds(const RoseInGraph &g) { |
1497 | for (const auto &e : edges_range(g)) { |
1498 | if (g[e].graph && g[e].graph->kind != whatRoseIsThis(g, e)) { |
1499 | return false; |
1500 | } |
1501 | } |
1502 | |
1503 | return true; |
1504 | } |
1505 | #endif |
1506 | |
1507 | bool RoseBuildImpl::addRose(const RoseInGraph &ig, bool prefilter) { |
1508 | DEBUG_PRINTF("trying to rose\n" ); |
1509 | assert(validateKinds(ig)); |
1510 | assert(hasCorrectlyNumberedVertices(ig)); |
1511 | |
1512 | if (::ue2::empty(ig)) { |
1513 | assert(0); |
1514 | return false; |
1515 | } |
1516 | |
1517 | const unique_ptr<RoseInGraph> in_ptr = cloneRoseGraph(ig); |
1518 | RoseInGraph &in = *in_ptr; |
1519 | |
1520 | RoseBuildData bd(in, false); |
1521 | |
1522 | transformLiteralDelay(in, cc); |
1523 | transformAnchoredLiteralOverlap(in, bd, cc); |
1524 | transformSuffixDelay(in, cc); |
1525 | |
1526 | renumber_vertices(in); |
1527 | assert(validateKinds(in)); |
1528 | |
1529 | insertion_ordered_map<NGHolder *, vector<RoseInEdge>> graphs; |
1530 | |
1531 | for (const auto &e : edges_range(in)) { |
1532 | if (!in[e].graph) { |
1533 | assert(!in[e].dfa); |
1534 | assert(!in[e].haig); |
1535 | continue; // no graph |
1536 | } |
1537 | |
1538 | if (in[e].haig || in[e].dfa) { |
1539 | /* Early DFAs/Haigs are always implementable (we've already built |
1540 | * the raw DFA). */ |
1541 | continue; |
1542 | } |
1543 | |
1544 | NGHolder *h = in[e].graph.get(); |
1545 | |
1546 | assert(isCorrectlyTopped(*h)); |
1547 | graphs[h].push_back(e); |
1548 | } |
1549 | |
1550 | vector<RoseInEdge> graph_edges; |
1551 | |
1552 | for (const auto &m : graphs) { |
1553 | NGHolder *h = m.first; |
1554 | if (!canImplementGraph(*h, prefilter, rm, cc)) { |
1555 | return false; |
1556 | } |
1557 | insert(&graph_edges, graph_edges.end(), m.second); |
1558 | } |
1559 | |
1560 | /* we are now past the point of no return. We can start making irreversible |
1561 | changes to the rose graph, etc */ |
1562 | |
1563 | for (const auto &e : graph_edges) { |
1564 | assert(in[e].graph); |
1565 | assert(!in[e].haig); |
1566 | NGHolder &h = *in[e].graph; |
1567 | DEBUG_PRINTF("handling %p\n" , &h); |
1568 | assert(allMatchStatesHaveReports(h)); |
1569 | |
1570 | if (!generates_callbacks(whatRoseIsThis(in, e)) |
1571 | && in[target(e, in)].type != RIV_ACCEPT_EOD) { |
1572 | set_report(h, getNewNfaReport()); |
1573 | } |
1574 | } |
1575 | |
1576 | populateRoseGraph(this, bd); |
1577 | |
1578 | return true; |
1579 | } |
1580 | |
1581 | bool RoseBuildImpl::addSombeRose(const RoseInGraph &ig) { |
1582 | DEBUG_PRINTF("rose is trying to consume a sombe\n" ); |
1583 | assert(validateKinds(ig)); |
1584 | |
1585 | if (::ue2::empty(ig)) { |
1586 | assert(0); |
1587 | return false; |
1588 | } |
1589 | |
1590 | RoseBuildData bd(ig, true); |
1591 | |
1592 | for (const auto &e : edges_range(ig)) { |
1593 | if (!ig[e].graph) { |
1594 | continue; // no graph |
1595 | } |
1596 | DEBUG_PRINTF("handling %p\n" , ig[e].graph.get()); |
1597 | assert(allMatchStatesHaveReports(*ig[e].graph)); |
1598 | assert(ig[e].haig); |
1599 | } |
1600 | |
1601 | populateRoseGraph(this, bd); |
1602 | |
1603 | return true; |
1604 | } |
1605 | |
1606 | bool roseCheckRose(const RoseInGraph &ig, bool prefilter, |
1607 | const ReportManager &rm, const CompileContext &cc) { |
1608 | assert(validateKinds(ig)); |
1609 | |
1610 | if (::ue2::empty(ig)) { |
1611 | assert(0); |
1612 | return false; |
1613 | } |
1614 | |
1615 | vector<NGHolder *> graphs; |
1616 | |
1617 | for (const auto &e : edges_range(ig)) { |
1618 | if (!ig[e].graph) { |
1619 | continue; // no graph |
1620 | } |
1621 | |
1622 | if (ig[e].haig) { |
1623 | // Haigs are always implementable (we've already built the raw DFA). |
1624 | continue; |
1625 | } |
1626 | |
1627 | graphs.push_back(ig[e].graph.get()); |
1628 | } |
1629 | |
1630 | for (const auto &g : graphs) { |
1631 | if (!canImplementGraph(*g, prefilter, rm, cc)) { |
1632 | return false; |
1633 | } |
1634 | } |
1635 | |
1636 | return true; |
1637 | } |
1638 | |
1639 | void RoseBuildImpl::add(bool anchored, bool eod, const ue2_literal &lit, |
1640 | const flat_set<ReportID> &reports) { |
1641 | assert(!reports.empty()); |
1642 | |
1643 | if (cc.grey.floodAsPuffette && !anchored && !eod && is_flood(lit) && |
1644 | lit.length() > 3) { |
1645 | DEBUG_PRINTF("adding as puffette\n" ); |
1646 | const CharReach &cr = *lit.begin(); |
1647 | for (const auto &report : reports) { |
1648 | addOutfix(raw_puff(lit.length(), true, report, cr, true)); |
1649 | } |
1650 | |
1651 | return; |
1652 | } |
1653 | |
1654 | RoseInGraph ig; |
1655 | RoseInVertex start = add_vertex(RoseInVertexProps::makeStart(anchored), ig); |
1656 | RoseInVertex accept = add_vertex( |
1657 | eod ? RoseInVertexProps::makeAcceptEod(set<ReportID>()) |
1658 | : RoseInVertexProps::makeAccept(set<ReportID>()), ig); |
1659 | RoseInVertex v = add_vertex(RoseInVertexProps::makeLiteral(lit), ig); |
1660 | |
1661 | add_edge(start, v, RoseInEdgeProps(0U, anchored ? 0U : ROSE_BOUND_INF), ig); |
1662 | add_edge(v, accept, RoseInEdgeProps(0U, 0U), ig); |
1663 | |
1664 | calcVertexOffsets(ig); |
1665 | |
1666 | ig[accept].reports.insert(reports.begin(), reports.end()); |
1667 | |
1668 | addRose(ig, false); |
1669 | } |
1670 | |
1671 | static |
1672 | u32 findMaxBAWidth(const NGHolder &h) { |
1673 | // Must be bi-anchored: no out-edges from startDs (other than its |
1674 | // self-loop), no in-edges to accept. |
1675 | if (out_degree(h.startDs, h) > 1 || in_degree(h.accept, h)) { |
1676 | return ROSE_BOUND_INF; |
1677 | } |
1678 | depth d = findMaxWidth(h); |
1679 | assert(d.is_reachable()); |
1680 | |
1681 | if (!d.is_finite()) { |
1682 | return ROSE_BOUND_INF; |
1683 | } |
1684 | return d; |
1685 | } |
1686 | |
1687 | static |
1688 | void populateOutfixInfo(OutfixInfo &outfix, const NGHolder &h, |
1689 | const RoseBuildImpl &tbi) { |
1690 | outfix.maxBAWidth = findMaxBAWidth(h); |
1691 | outfix.minWidth = findMinWidth(h); |
1692 | outfix.maxWidth = findMaxWidth(h); |
1693 | outfix.maxOffset = findMaxOffset(h, tbi.rm); |
1694 | populateReverseAccelerationInfo(outfix.rev_info, h); |
1695 | } |
1696 | |
1697 | static |
1698 | bool addEodOutfix(RoseBuildImpl &build, const NGHolder &h) { |
1699 | map<flat_set<ReportID>, ReportID> report_remap; |
1700 | shared_ptr<NGHolder> eod_leftfix |
1701 | = makeRoseEodPrefix(h, build, report_remap); |
1702 | |
1703 | bool nfa_ok = isImplementableNFA(h, &build.rm, build.cc); |
1704 | |
1705 | /* TODO: check if early dfa is possible */ |
1706 | |
1707 | if (!nfa_ok) { |
1708 | DEBUG_PRINTF("could not build as NFA\n" ); |
1709 | return false; |
1710 | } |
1711 | |
1712 | u32 eod_event = getEodEventID(build); |
1713 | |
1714 | auto &g = build.g; |
1715 | for (const auto &report_mapping : report_remap) { |
1716 | RoseVertex v = add_vertex(g); |
1717 | g[v].literals.insert(eod_event); |
1718 | build.literal_info[eod_event].vertices.insert(v); |
1719 | |
1720 | g[v].left.graph = eod_leftfix; |
1721 | g[v].left.leftfix_report = report_mapping.second; |
1722 | g[v].left.lag = 0; |
1723 | RoseEdge e1 = add_edge(build.anchored_root, v, g); |
1724 | g[e1].minBound = 0; |
1725 | g[e1].maxBound = ROSE_BOUND_INF; |
1726 | g[v].min_offset = findMinWidth(*eod_leftfix); |
1727 | g[v].max_offset = ROSE_BOUND_INF; |
1728 | |
1729 | depth max_width = findMaxWidth(*g[v].left.graph); |
1730 | if (max_width.is_finite() && isPureAnchored(*eod_leftfix)) { |
1731 | g[e1].maxBound = max_width; |
1732 | g[v].max_offset = max_width; |
1733 | } |
1734 | |
1735 | g[e1].history = ROSE_ROLE_HISTORY_NONE; // handled by prefix |
1736 | RoseVertex w = add_vertex(g); |
1737 | g[w].eod_accept = true; |
1738 | g[w].reports = report_mapping.first; |
1739 | g[w].min_offset = g[v].min_offset; |
1740 | g[w].max_offset = g[v].max_offset; |
1741 | RoseEdge e = add_edge(v, w, g); |
1742 | g[e].minBound = 0; |
1743 | g[e].maxBound = 0; |
1744 | g[e].history = ROSE_ROLE_HISTORY_NONE; |
1745 | DEBUG_PRINTF("accept eod vertex (index=%zu)\n" , g[w].index); |
1746 | } |
1747 | |
1748 | return true; |
1749 | } |
1750 | |
1751 | bool RoseBuildImpl::addOutfix(const NGHolder &h) { |
1752 | DEBUG_PRINTF("%zu vertices, %zu edges\n" , num_vertices(h), num_edges(h)); |
1753 | |
1754 | /* TODO: handle more than one report */ |
1755 | if (!in_degree(h.accept, h) |
1756 | && all_reports(h).size() == 1 |
1757 | && addEodOutfix(*this, h)) { |
1758 | return true; |
1759 | } |
1760 | |
1761 | const u32 nfa_states = isImplementableNFA(h, &rm, cc); |
1762 | if (nfa_states) { |
1763 | DEBUG_PRINTF("implementable as an NFA in %u states\n" , nfa_states); |
1764 | } else { |
1765 | DEBUG_PRINTF("not implementable as an NFA\n" ); |
1766 | } |
1767 | |
1768 | bool dfa_cand = !nfa_states || nfa_states > 128 /* slow model */ |
1769 | || can_exhaust(h, rm); /* can be pruned */ |
1770 | |
1771 | unique_ptr<raw_dfa> rdfa; |
1772 | |
1773 | if (!nfa_states || cc.grey.roseMcClellanOutfix == 2 || |
1774 | (cc.grey.roseMcClellanOutfix == 1 && dfa_cand)) { |
1775 | rdfa = buildMcClellan(h, &rm, cc.grey); |
1776 | } |
1777 | |
1778 | if (!nfa_states && !rdfa) { |
1779 | DEBUG_PRINTF("could not build as either an NFA or a DFA\n" ); |
1780 | return false; |
1781 | } |
1782 | |
1783 | if (rdfa) { |
1784 | outfixes.push_back(OutfixInfo(move(rdfa))); |
1785 | } else { |
1786 | outfixes.push_back(OutfixInfo(cloneHolder(h))); |
1787 | } |
1788 | |
1789 | populateOutfixInfo(outfixes.back(), h, *this); |
1790 | |
1791 | return true; |
1792 | } |
1793 | |
1794 | bool RoseBuildImpl::addOutfix(const NGHolder &h, const raw_som_dfa &haig) { |
1795 | DEBUG_PRINTF("haig with %zu states\n" , haig.states.size()); |
1796 | |
1797 | outfixes.push_back(OutfixInfo(ue2::make_unique<raw_som_dfa>(haig))); |
1798 | populateOutfixInfo(outfixes.back(), h, *this); |
1799 | |
1800 | return true; /* failure is not yet an option */ |
1801 | } |
1802 | |
1803 | bool RoseBuildImpl::addOutfix(const raw_puff &rp) { |
1804 | if (!mpv_outfix) { |
1805 | mpv_outfix = make_unique<OutfixInfo>(MpvProto()); |
1806 | } |
1807 | |
1808 | auto *mpv = mpv_outfix->mpv(); |
1809 | assert(mpv); |
1810 | mpv->puffettes.push_back(rp); |
1811 | |
1812 | mpv_outfix->maxBAWidth = ROSE_BOUND_INF; /* not ba */ |
1813 | mpv_outfix->minWidth = min(mpv_outfix->minWidth, depth(rp.repeats)); |
1814 | mpv_outfix->maxWidth = rp.unbounded |
1815 | ? depth::infinity() |
1816 | : max(mpv_outfix->maxWidth, depth(rp.repeats)); |
1817 | |
1818 | if (mpv_outfix->maxOffset == ROSE_BOUND_INF || rp.unbounded) { |
1819 | mpv_outfix->maxOffset = ROSE_BOUND_INF; |
1820 | } else { |
1821 | mpv_outfix->maxOffset = MAX(mpv_outfix->maxOffset, rp.repeats); |
1822 | } |
1823 | |
1824 | return true; /* failure is not yet an option */ |
1825 | } |
1826 | |
1827 | bool RoseBuildImpl::addChainTail(const raw_puff &rp, u32 *queue_out, |
1828 | u32 *event_out) { |
1829 | if (!mpv_outfix) { |
1830 | mpv_outfix = make_unique<OutfixInfo>(MpvProto()); |
1831 | } |
1832 | |
1833 | auto *mpv = mpv_outfix->mpv(); |
1834 | assert(mpv); |
1835 | mpv->triggered_puffettes.push_back(rp); |
1836 | |
1837 | mpv_outfix->maxBAWidth = ROSE_BOUND_INF; /* not ba */ |
1838 | mpv_outfix->minWidth = min(mpv_outfix->minWidth, depth(rp.repeats)); |
1839 | mpv_outfix->maxWidth = rp.unbounded |
1840 | ? depth::infinity() |
1841 | : max(mpv_outfix->maxWidth, depth(rp.repeats)); |
1842 | |
1843 | mpv_outfix->maxOffset = ROSE_BOUND_INF; /* TODO: we could get information from |
1844 | * the caller */ |
1845 | |
1846 | *queue_out = mpv_outfix->get_queue(qif); |
1847 | *event_out = MQE_TOP_FIRST + mpv->triggered_puffettes.size() - 1; |
1848 | |
1849 | return true; /* failure is not yet an option */ |
1850 | } |
1851 | |
1852 | static |
1853 | bool prepAcceptForAddAnchoredNFA(RoseBuildImpl &tbi, const NGHolder &w, |
1854 | NFAVertex u, |
1855 | const vector<DepthMinMax> &vertexDepths, |
1856 | map<u32, DepthMinMax> &depthMap, |
1857 | map<NFAVertex, set<u32>> &reportMap, |
1858 | map<ReportID, u32> &allocated_reports, |
1859 | flat_set<u32> &added_lit_ids) { |
1860 | const depth max_anchored_depth(tbi.cc.grey.maxAnchoredRegion); |
1861 | const size_t index = w[u].index; |
1862 | assert(index < vertexDepths.size()); |
1863 | const DepthMinMax &d = vertexDepths.at(index); |
1864 | |
1865 | for (const auto &int_report : w[u].reports) { |
1866 | assert(int_report != MO_INVALID_IDX); |
1867 | |
1868 | u32 lit_id; |
1869 | if (!contains(allocated_reports, int_report)) { |
1870 | lit_id = tbi.getNewLiteralId(); |
1871 | added_lit_ids.insert(lit_id); |
1872 | allocated_reports[int_report] = lit_id; |
1873 | } else { |
1874 | lit_id = allocated_reports[int_report]; |
1875 | } |
1876 | |
1877 | reportMap[u].insert(lit_id); |
1878 | |
1879 | if (!contains(depthMap, lit_id)) { |
1880 | depthMap[lit_id] = d; |
1881 | } else { |
1882 | depthMap[lit_id] = unionDepthMinMax(depthMap[lit_id], d); |
1883 | } |
1884 | |
1885 | if (depthMap[lit_id].max > max_anchored_depth) { |
1886 | DEBUG_PRINTF("depth=%s exceeds maxAnchoredRegion=%u\n" , |
1887 | depthMap[lit_id].max.str().c_str(), |
1888 | tbi.cc.grey.maxAnchoredRegion); |
1889 | return false; |
1890 | } |
1891 | } |
1892 | |
1893 | return true; |
1894 | } |
1895 | |
1896 | // Failure path for addAnchoredAcyclic: removes the literal IDs that have been |
1897 | // added to support anchored NFAs. Assumes that they are a contiguous range at |
1898 | // the end of the RoseBuildImpl::literal_info vector. |
1899 | static |
1900 | void removeAddedLiterals(RoseBuildImpl &tbi, const flat_set<u32> &lit_ids) { |
1901 | if (lit_ids.empty()) { |
1902 | return; |
1903 | } |
1904 | |
1905 | DEBUG_PRINTF("remove last %zu literals\n" , lit_ids.size()); |
1906 | |
1907 | // lit_ids should be a contiguous range. |
1908 | assert(lit_ids.size() == *lit_ids.rbegin() - *lit_ids.begin() + 1); |
1909 | assert(*lit_ids.rbegin() == tbi.literals.size() - 1); |
1910 | |
1911 | assert(all_of_in(lit_ids, [&](u32 lit_id) { |
1912 | return lit_id < tbi.literal_info.size() && |
1913 | tbi.literals.at(lit_id).table == ROSE_ANCHORED && |
1914 | tbi.literal_info[lit_id].vertices.empty(); |
1915 | })); |
1916 | |
1917 | tbi.literals.erase_back(lit_ids.size()); |
1918 | assert(tbi.literals.size() == *lit_ids.begin()); |
1919 | |
1920 | // lit_ids should be at the end of tbi.literal_info. |
1921 | assert(tbi.literal_info.size() == *lit_ids.rbegin() + 1); |
1922 | tbi.literal_info.resize(*lit_ids.begin()); // remove all ids in lit_ids |
1923 | } |
1924 | |
1925 | bool RoseBuildImpl::addAnchoredAcyclic(const NGHolder &h) { |
1926 | auto vertexDepths = calcDepthsFrom(h, h.start); |
1927 | |
1928 | map<NFAVertex, set<u32> > reportMap; /* NFAVertex -> literal ids */ |
1929 | map<u32, DepthMinMax> depthMap; /* literal id -> min/max depth */ |
1930 | map<ReportID, u32> allocated_reports; /* report -> literal id */ |
1931 | flat_set<u32> added_lit_ids; /* literal ids added for this NFA */ |
1932 | |
1933 | for (auto v : inv_adjacent_vertices_range(h.accept, h)) { |
1934 | if (!prepAcceptForAddAnchoredNFA(*this, h, v, vertexDepths, depthMap, |
1935 | reportMap, allocated_reports, |
1936 | added_lit_ids)) { |
1937 | removeAddedLiterals(*this, added_lit_ids); |
1938 | return false; |
1939 | } |
1940 | } |
1941 | |
1942 | map<ReportID, u32> allocated_reports_eod; /* report -> literal id */ |
1943 | |
1944 | for (auto v : inv_adjacent_vertices_range(h.acceptEod, h)) { |
1945 | if (v == h.accept) { |
1946 | continue; |
1947 | } |
1948 | if (!prepAcceptForAddAnchoredNFA(*this, h, v, vertexDepths, depthMap, |
1949 | reportMap, allocated_reports_eod, |
1950 | added_lit_ids)) { |
1951 | removeAddedLiterals(*this, added_lit_ids); |
1952 | return false; |
1953 | } |
1954 | } |
1955 | |
1956 | assert(!reportMap.empty()); |
1957 | |
1958 | int rv = addAnchoredNFA(*this, h, reportMap); |
1959 | if (rv != ANCHORED_FAIL) { |
1960 | assert(rv != ANCHORED_REMAP); |
1961 | DEBUG_PRINTF("added anchored nfa\n" ); |
1962 | /* add edges to the rose graph to bubble the match up */ |
1963 | for (const auto &m : allocated_reports) { |
1964 | const ReportID &report = m.first; |
1965 | const u32 &lit_id = m.second; |
1966 | assert(depthMap[lit_id].max.is_finite()); |
1967 | u32 minBound = depthMap[lit_id].min; |
1968 | u32 maxBound = depthMap[lit_id].max; |
1969 | RoseVertex v |
1970 | = createAnchoredVertex(this, lit_id, minBound, maxBound); |
1971 | g[v].reports.insert(report); |
1972 | } |
1973 | |
1974 | for (const auto &m : allocated_reports_eod) { |
1975 | const ReportID &report = m.first; |
1976 | const u32 &lit_id = m.second; |
1977 | assert(depthMap[lit_id].max.is_finite()); |
1978 | u32 minBound = depthMap[lit_id].min; |
1979 | u32 maxBound = depthMap[lit_id].max; |
1980 | RoseVertex v |
1981 | = createAnchoredVertex(this, lit_id, minBound, maxBound); |
1982 | RoseVertex eod = add_vertex(g); |
1983 | g[eod].eod_accept = true; |
1984 | g[eod].reports.insert(report); |
1985 | g[eod].min_offset = g[v].min_offset; |
1986 | g[eod].max_offset = g[v].max_offset; |
1987 | add_edge(v, eod, g); |
1988 | } |
1989 | |
1990 | return true; |
1991 | } else { |
1992 | DEBUG_PRINTF("failed to add anchored nfa\n" ); |
1993 | removeAddedLiterals(*this, added_lit_ids); |
1994 | return false; |
1995 | } |
1996 | } |
1997 | |
1998 | } // namespace ue2 |
1999 | |