1 | /* |
2 | * Copyright (c) 2015-2018, Intel Corporation |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are met: |
6 | * |
7 | * * Redistributions of source code must retain the above copyright notice, |
8 | * this list of conditions and the following disclaimer. |
9 | * * Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * * Neither the name of Intel Corporation nor the names of its contributors |
13 | * may be used to endorse or promote products derived from this software |
14 | * without specific prior written permission. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | #include "rose_build_misc.h" |
30 | #include "rose_build_impl.h" |
31 | |
32 | #include "rose_build_resources.h" |
33 | #include "hwlm/hwlm_literal.h" |
34 | #include "nfa/castlecompile.h" |
35 | #include "nfa/goughcompile.h" |
36 | #include "nfa/mcclellancompile_util.h" |
37 | #include "nfa/nfa_api.h" |
38 | #include "nfa/rdfa.h" |
39 | #include "nfa/tamaramacompile.h" |
40 | #include "nfagraph/ng_holder.h" |
41 | #include "nfagraph/ng_limex.h" |
42 | #include "nfagraph/ng_reports.h" |
43 | #include "nfagraph/ng_repeat.h" |
44 | #include "nfagraph/ng_util.h" |
45 | #include "nfagraph/ng_width.h" |
46 | #include "smallwrite/smallwrite_build.h" |
47 | #include "util/alloc.h" |
48 | #include "util/boundary_reports.h" |
49 | #include "util/compile_context.h" |
50 | #include "util/container.h" |
51 | #include "util/graph.h" |
52 | #include "util/graph_range.h" |
53 | #include "util/make_unique.h" |
54 | #include "util/order_check.h" |
55 | #include "util/report_manager.h" |
56 | #include "util/ue2string.h" |
57 | #include "util/verify_types.h" |
58 | #include "ue2common.h" |
59 | #include "grey.h" |
60 | |
61 | #include <boost/graph/breadth_first_search.hpp> |
62 | |
63 | using namespace std; |
64 | |
65 | namespace ue2 { |
66 | |
67 | // just to get it out of the header |
68 | RoseBuild::~RoseBuild() { } |
69 | |
70 | RoseBuildImpl::RoseBuildImpl(ReportManager &rm_in, |
71 | SomSlotManager &ssm_in, |
72 | SmallWriteBuild &smwr_in, |
73 | const CompileContext &cc_in, |
74 | const BoundaryReports &boundary_in) |
75 | : cc(cc_in), |
76 | root(add_vertex(g)), |
77 | anchored_root(add_vertex(g)), |
78 | hasSom(false), |
79 | group_end(0), |
80 | ematcher_region_size(0), |
81 | eod_event_literal_id(MO_INVALID_IDX), |
82 | max_rose_anchored_floating_overlap(0), |
83 | rm(rm_in), |
84 | ssm(ssm_in), |
85 | smwr(smwr_in), |
86 | boundary(boundary_in), |
87 | next_nfa_report(0) { |
88 | // add root vertices to graph |
89 | g[root].min_offset = 0; |
90 | g[root].max_offset = 0; |
91 | |
92 | g[anchored_root].min_offset = 0; |
93 | g[anchored_root].max_offset = 0; |
94 | } |
95 | |
96 | RoseBuildImpl::~RoseBuildImpl() { |
97 | // empty |
98 | } |
99 | |
100 | bool RoseVertexProps::isBoring(void) const { |
101 | return !suffix && !left; |
102 | } |
103 | |
104 | bool RoseVertexProps::fixedOffset(void) const { |
105 | assert(min_offset <= max_offset); /* ensure offsets calculated */ |
106 | return max_offset == min_offset && max_offset != ROSE_BOUND_INF; |
107 | } |
108 | |
109 | bool RoseBuildImpl::isRootSuccessor(const RoseVertex &v) const { |
110 | for (auto u : inv_adjacent_vertices_range(v, g)) { |
111 | if (isAnyStart(u)) { |
112 | return true; |
113 | } |
114 | } |
115 | return false; |
116 | } |
117 | |
118 | bool RoseBuildImpl::isNonRootSuccessor(const RoseVertex &v) const { |
119 | for (auto u : inv_adjacent_vertices_range(v, g)) { |
120 | if (!isAnyStart(u)) { |
121 | return true; |
122 | } |
123 | } |
124 | return false; |
125 | } |
126 | |
127 | bool hasAnchHistorySucc(const RoseGraph &g, RoseVertex v) { |
128 | for (const auto &e : out_edges_range(v, g)) { |
129 | if (g[e].history == ROSE_ROLE_HISTORY_ANCH) { |
130 | return true; |
131 | } |
132 | } |
133 | |
134 | return false; |
135 | } |
136 | |
137 | bool hasLastByteHistorySucc(const RoseGraph &g, RoseVertex v) { |
138 | for (const auto &e : out_edges_range(v, g)) { |
139 | if (g[e].history == ROSE_ROLE_HISTORY_LAST_BYTE) { |
140 | return true; |
141 | } |
142 | } |
143 | |
144 | return false; |
145 | } |
146 | |
147 | static |
148 | bool isInTable(const RoseBuildImpl &tbi, RoseVertex v, |
149 | rose_literal_table table) { |
150 | const auto &lit_ids = tbi.g[v].literals; |
151 | if (lit_ids.empty()) { |
152 | return false; // special role with no literals |
153 | } |
154 | |
155 | // All literals for a given vertex will be in the same table, so we need |
156 | // only inspect the first one. |
157 | const auto lit_table = tbi.literals.at(*lit_ids.begin()).table; |
158 | |
159 | // Verify that all literals for this vertex are in the same table. |
160 | assert(all_of_in(lit_ids, [&](u32 lit_id) { |
161 | return tbi.literals.at(lit_id).table == lit_table; |
162 | })); |
163 | |
164 | return lit_table == table; |
165 | } |
166 | |
167 | bool RoseBuildImpl::isAnchored(RoseVertex v) const { |
168 | return isInTable(*this, v, ROSE_ANCHORED); |
169 | } |
170 | |
171 | bool RoseBuildImpl::isFloating(RoseVertex v) const { |
172 | return isInTable(*this, v, ROSE_FLOATING); |
173 | } |
174 | |
175 | bool RoseBuildImpl::isInETable(RoseVertex v) const { |
176 | return isInTable(*this, v, ROSE_EOD_ANCHORED); |
177 | } |
178 | |
179 | bool RoseBuildImpl::hasLiteralInTable(RoseVertex v, |
180 | enum rose_literal_table t) const { |
181 | return isInTable(*this, v, t); |
182 | } |
183 | |
184 | /* Indicates that the floating table (if it exists) will be only run |
185 | conditionally based on matches from the anchored table. */ |
186 | bool RoseBuildImpl::hasNoFloatingRoots() const { |
187 | for (auto v : adjacent_vertices_range(root, g)) { |
188 | if (isFloating(v)) { |
189 | DEBUG_PRINTF("direct floating root %zu\n" , g[v].index); |
190 | return false; |
191 | } |
192 | } |
193 | |
194 | /* need to check if the anchored_root has any literals which are too deep */ |
195 | for (auto v : adjacent_vertices_range(anchored_root, g)) { |
196 | if (isFloating(v)) { |
197 | DEBUG_PRINTF("indirect floating root %zu\n" , g[v].index); |
198 | return false; |
199 | } |
200 | } |
201 | |
202 | return true; |
203 | } |
204 | |
205 | size_t RoseBuildImpl::maxLiteralLen(RoseVertex v) const { |
206 | const auto &lit_ids = g[v].literals; |
207 | assert(!lit_ids.empty()); |
208 | |
209 | size_t maxlen = 0; |
210 | |
211 | for (const auto &lit_id : lit_ids) { |
212 | maxlen = max(maxlen, literals.at(lit_id).elength()); |
213 | } |
214 | |
215 | return maxlen; |
216 | } |
217 | |
218 | size_t RoseBuildImpl::minLiteralLen(RoseVertex v) const { |
219 | const auto &lit_ids = g[v].literals; |
220 | assert(!lit_ids.empty()); |
221 | |
222 | size_t minlen = ROSE_BOUND_INF; |
223 | |
224 | for (const auto &lit_id : lit_ids) { |
225 | minlen = min(minlen, literals.at(lit_id).elength()); |
226 | } |
227 | |
228 | return minlen; |
229 | } |
230 | |
231 | // RoseBuild factory |
232 | unique_ptr<RoseBuild> makeRoseBuilder(ReportManager &rm, |
233 | SomSlotManager &ssm, |
234 | SmallWriteBuild &smwr, |
235 | const CompileContext &cc, |
236 | const BoundaryReports &boundary) { |
237 | return ue2::make_unique<RoseBuildImpl>(rm, ssm, smwr, cc, boundary); |
238 | } |
239 | |
240 | bool roseIsPureLiteral(const RoseEngine *t) { |
241 | return t->runtimeImpl == ROSE_RUNTIME_PURE_LITERAL; |
242 | } |
243 | |
244 | // Returns non-zero max overlap len if a suffix of the literal 'a' overlaps |
245 | // with a prefix of the literal 'b' or 'a' can be contained in 'b'. |
246 | size_t maxOverlap(const ue2_literal &a, const ue2_literal &b, u32 b_delay) { |
247 | /* overly conservative if only part of the string is nocase */ |
248 | bool nocase = a.any_nocase() || b.any_nocase(); |
249 | DEBUG_PRINTF("max overlap %s %s+%u %d\n" , dumpString(a).c_str(), |
250 | dumpString(b).c_str(), b_delay, (int)nocase); |
251 | size_t a_len = a.length(); |
252 | size_t b_len = b.length(); |
253 | const char *a_end = a.c_str() + a_len; |
254 | const char *b_end = b.c_str() + b_len; |
255 | if (b_delay >= a_len) { |
256 | return b_len + b_delay; |
257 | } else if (b_delay) { |
258 | /* a can be a substring of b which overlaps some of the end dots |
259 | * OR b can be a substring near the end of a */ |
260 | /* ignore overlap due to the final trailing dot as delayed literals |
261 | * are delivered before undelayed */ |
262 | for (u32 j = b_delay - 1; j > 0; j--) { |
263 | if (b_len + j >= a_len) { |
264 | if (!cmp(a.c_str(), b_end + j - a_len, a_len - j, nocase)) { |
265 | return b_len + j; |
266 | } |
267 | } else { |
268 | if (!cmp(a_end - j - b_len, b.c_str(), b_len, nocase)) { |
269 | return b_len + j; |
270 | } |
271 | } |
272 | } |
273 | } |
274 | |
275 | return maxStringOverlap(a.get_string(), b.get_string(), nocase); |
276 | } |
277 | |
278 | // Returns non-zero max overlap len if a suffix of the literal ID 'a' overlaps |
279 | // with a prefix of the literal ID 'b' or 'a' can be contained in 'b'. |
280 | size_t maxOverlap(const rose_literal_id &a, const rose_literal_id &b) { |
281 | assert(!a.delay); |
282 | return maxOverlap(a.s, b.s, b.delay); |
283 | } |
284 | |
285 | static |
286 | const rose_literal_id &getOverlapLiteral(const RoseBuildImpl &tbi, |
287 | u32 literal_id) { |
288 | auto it = tbi.anchoredLitSuffix.find(literal_id); |
289 | if (it != tbi.anchoredLitSuffix.end()) { |
290 | return it->second; |
291 | } |
292 | return tbi.literals.at(literal_id); |
293 | } |
294 | |
295 | ue2_literal findNonOverlappingTail(const set<ue2_literal> &lits, |
296 | const ue2_literal &s) { |
297 | size_t max_overlap = 0; |
298 | |
299 | for (const auto &lit : lits) { |
300 | size_t overlap = lit != s ? maxStringOverlap(lit, s) |
301 | : maxStringSelfOverlap(s); |
302 | max_overlap = max(max_overlap, overlap); |
303 | } |
304 | |
305 | /* find the tail that doesn't overlap */ |
306 | ue2_literal tail = s.substr(max_overlap); |
307 | DEBUG_PRINTF("%zu overlap, tail: '%s'\n" , max_overlap, |
308 | dumpString(tail).c_str()); |
309 | return tail; |
310 | } |
311 | |
312 | size_t RoseBuildImpl::maxLiteralOverlap(RoseVertex u, RoseVertex v) const { |
313 | size_t overlap = 0; |
314 | for (auto u_lit_id : g[u].literals) { |
315 | const rose_literal_id &ul = getOverlapLiteral(*this, u_lit_id); |
316 | for (auto v_lit_id : g[v].literals) { |
317 | const rose_literal_id &vl = getOverlapLiteral(*this, v_lit_id); |
318 | overlap = max(overlap, maxOverlap(ul, vl)); |
319 | } |
320 | } |
321 | return overlap; |
322 | } |
323 | |
324 | void RoseBuildImpl::removeVertices(const vector<RoseVertex> &dead) { |
325 | for (auto v : dead) { |
326 | assert(!isAnyStart(v)); |
327 | DEBUG_PRINTF("removing vertex %zu\n" , g[v].index); |
328 | for (auto lit_id : g[v].literals) { |
329 | literal_info[lit_id].vertices.erase(v); |
330 | } |
331 | clear_vertex(v, g); |
332 | remove_vertex(v, g); |
333 | } |
334 | renumber_vertices(g); |
335 | } |
336 | |
337 | // Find the maximum bound on the edges to this vertex's successors ignoring |
338 | // those via infixes. |
339 | u32 RoseBuildImpl::calcSuccMaxBound(RoseVertex u) const { |
340 | u32 maxBound = 0; |
341 | for (const auto &e : out_edges_range(u, g)) { |
342 | RoseVertex v = target(e, g); |
343 | |
344 | if (g[v].left) { |
345 | continue; |
346 | } |
347 | |
348 | u32 thisBound = g[e].maxBound; |
349 | |
350 | if (thisBound == ROSE_BOUND_INF) { |
351 | return ROSE_BOUND_INF; |
352 | } |
353 | |
354 | if (!g[v].eod_accept) { |
355 | // Add the length of the longest of our literals. |
356 | thisBound += maxLiteralLen(v); |
357 | } |
358 | |
359 | maxBound = max(maxBound, thisBound); |
360 | } |
361 | |
362 | assert(maxBound <= ROSE_BOUND_INF); |
363 | return maxBound; |
364 | } |
365 | |
366 | u32 RoseBuildImpl::getLiteralId(const ue2_literal &s, u32 delay, |
367 | rose_literal_table table) { |
368 | DEBUG_PRINTF("getting id for %s in table %d\n" , dumpString(s).c_str(), |
369 | table); |
370 | assert(table != ROSE_ANCHORED); |
371 | rose_literal_id key(s, table, delay); |
372 | |
373 | auto m = literals.insert(key); |
374 | u32 id = m.first; |
375 | bool inserted = m.second; |
376 | |
377 | if (inserted) { |
378 | literal_info.push_back(rose_literal_info()); |
379 | assert(literal_info.size() == id + 1); |
380 | |
381 | if (delay) { |
382 | u32 undelayed_id = getLiteralId(s, 0, table); |
383 | literal_info[id].undelayed_id = undelayed_id; |
384 | literal_info[undelayed_id].delayed_ids.insert(id); |
385 | } else { |
386 | literal_info[id].undelayed_id = id; |
387 | } |
388 | } |
389 | return id; |
390 | } |
391 | |
392 | // Function that operates on a msk/cmp pair and a literal, as used in |
393 | // hwlmLiteral, and zeroes msk elements that don't add any power to the |
394 | // literal. |
395 | void normaliseLiteralMask(const ue2_literal &s_in, vector<u8> &msk, |
396 | vector<u8> &cmp) { |
397 | assert(msk.size() == cmp.size()); |
398 | assert(msk.size() <= HWLM_MASKLEN); |
399 | |
400 | if (msk.empty()) { |
401 | return; |
402 | } |
403 | |
404 | // Work over a caseless copy if the string contains nocase chars. This will |
405 | // ensure that we treat masks designed to handle mixed-sensitivity literals |
406 | // correctly: these will be matched by the literal matcher in caseless |
407 | // mode, with the mask used to narrow the matches. |
408 | ue2_literal s(s_in); |
409 | if (s.any_nocase()) { |
410 | make_nocase(&s); |
411 | } |
412 | |
413 | ue2_literal::const_reverse_iterator it = s.rbegin(), ite = s.rend(); |
414 | size_t i = msk.size(); |
415 | while (i-- != 0 && it != ite) { |
416 | const CharReach &cr = *it; |
417 | for (size_t c = cr.find_first(); c != CharReach::npos; |
418 | c = cr.find_next(c)) { |
419 | if (((u8)c & msk[i]) != cmp[i]) { |
420 | goto skip; |
421 | } |
422 | } |
423 | |
424 | // If we didn't jump out of the loop to skip, then this mask position |
425 | // doesn't further narrow the set of acceptable literals from those |
426 | // accepted by s. So we can zero this element. |
427 | msk[i] = 0; |
428 | cmp[i] = 0; |
429 | skip: |
430 | ++it; |
431 | } |
432 | |
433 | // Wipe out prefix zeroes. |
434 | while (!msk.empty() && msk[0] == 0) { |
435 | msk.erase(msk.begin()); |
436 | cmp.erase(cmp.begin()); |
437 | } |
438 | } |
439 | |
440 | rose_literal_id::rose_literal_id(const ue2_literal &s_in, |
441 | const vector<u8> &msk_in, const vector<u8> &cmp_in, |
442 | rose_literal_table table_in, u32 delay_in) |
443 | : s(s_in), msk(msk_in), cmp(cmp_in), table(table_in), |
444 | delay(delay_in), distinctiveness(0) { |
445 | assert(msk.size() == cmp.size()); |
446 | assert(msk.size() <= HWLM_MASKLEN); |
447 | assert(delay <= MAX_DELAY); |
448 | |
449 | normaliseLiteralMask(s, msk, cmp); |
450 | } |
451 | |
452 | u32 RoseBuildImpl::getLiteralId(const ue2_literal &s, const vector<u8> &msk, |
453 | const vector<u8> &cmp, u32 delay, |
454 | rose_literal_table table) { |
455 | DEBUG_PRINTF("getting id for %s in table %d\n" , dumpString(s).c_str(), |
456 | table); |
457 | assert(table != ROSE_ANCHORED); |
458 | rose_literal_id key(s, msk, cmp, table, delay); |
459 | |
460 | /* ue2_literals are always uppercased if nocase and must have an |
461 | * alpha char */ |
462 | |
463 | auto m = literals.insert(key); |
464 | u32 id = m.first; |
465 | bool inserted = m.second; |
466 | |
467 | if (inserted) { |
468 | literal_info.push_back(rose_literal_info()); |
469 | assert(literal_info.size() == id + 1); |
470 | |
471 | if (delay) { |
472 | u32 undelayed_id = getLiteralId(s, msk, cmp, 0, table); |
473 | literal_info[id].undelayed_id = undelayed_id; |
474 | literal_info[undelayed_id].delayed_ids.insert(id); |
475 | } else { |
476 | literal_info[id].undelayed_id = id; |
477 | } |
478 | } |
479 | return id; |
480 | } |
481 | |
482 | u32 RoseBuildImpl::getNewLiteralId() { |
483 | rose_literal_id key(ue2_literal(), ROSE_ANCHORED, 0); |
484 | u32 numLiterals = verify_u32(literals.size()); |
485 | key.distinctiveness = numLiterals; |
486 | |
487 | auto m = literals.insert(key); |
488 | assert(m.second); |
489 | u32 id = m.first; |
490 | |
491 | literal_info.push_back(rose_literal_info()); |
492 | assert(literal_info.size() == id + 1); |
493 | |
494 | literal_info[id].undelayed_id = id; |
495 | |
496 | return id; |
497 | } |
498 | |
499 | bool operator<(const RoseEdgeProps &a, const RoseEdgeProps &b) { |
500 | ORDER_CHECK(minBound); |
501 | ORDER_CHECK(maxBound); |
502 | ORDER_CHECK(history); |
503 | return false; |
504 | } |
505 | |
506 | #ifndef NDEBUG |
507 | bool roseHasTops(const RoseBuildImpl &build, RoseVertex v) { |
508 | const RoseGraph &g = build.g; |
509 | assert(g[v].left); |
510 | |
511 | set<u32> graph_tops; |
512 | if (!build.isRootSuccessor(v)) { |
513 | for (const auto &e : in_edges_range(v, g)) { |
514 | graph_tops.insert(g[e].rose_top); |
515 | } |
516 | } |
517 | |
518 | return is_subset_of(graph_tops, all_tops(g[v].left)); |
519 | } |
520 | #endif |
521 | |
522 | u32 OutfixInfo::get_queue(QueueIndexFactory &qif) { |
523 | if (queue == ~0U) { |
524 | queue = qif.get_queue(); |
525 | } |
526 | |
527 | return queue; |
528 | } |
529 | |
530 | namespace { |
531 | class OutfixAllReports : public boost::static_visitor<set<ReportID>> { |
532 | public: |
533 | set<ReportID> operator()(const boost::blank &) const { |
534 | return set<ReportID>(); |
535 | } |
536 | |
537 | template<class T> |
538 | set<ReportID> operator()(const unique_ptr<T> &x) const { |
539 | return all_reports(*x); |
540 | } |
541 | |
542 | set<ReportID> operator()(const MpvProto &mpv) const { |
543 | set<ReportID> reports; |
544 | for (const auto &puff : mpv.puffettes) { |
545 | reports.insert(puff.report); |
546 | } |
547 | for (const auto &puff : mpv.triggered_puffettes) { |
548 | reports.insert(puff.report); |
549 | } |
550 | return reports; |
551 | } |
552 | }; |
553 | } |
554 | |
555 | set<ReportID> all_reports(const OutfixInfo &outfix) { |
556 | auto reports = boost::apply_visitor(OutfixAllReports(), outfix.proto); |
557 | assert(!reports.empty()); |
558 | return reports; |
559 | } |
560 | |
561 | bool RoseSuffixInfo::operator==(const RoseSuffixInfo &b) const { |
562 | return top == b.top && graph == b.graph && castle == b.castle && |
563 | rdfa == b.rdfa && haig == b.haig && tamarama == b.tamarama; |
564 | } |
565 | |
566 | bool RoseSuffixInfo::operator<(const RoseSuffixInfo &b) const { |
567 | const RoseSuffixInfo &a = *this; |
568 | ORDER_CHECK(top); |
569 | ORDER_CHECK(graph); |
570 | ORDER_CHECK(castle); |
571 | ORDER_CHECK(haig); |
572 | ORDER_CHECK(rdfa); |
573 | ORDER_CHECK(tamarama); |
574 | assert(a.dfa_min_width == b.dfa_min_width); |
575 | assert(a.dfa_max_width == b.dfa_max_width); |
576 | return false; |
577 | } |
578 | |
579 | size_t RoseSuffixInfo::hash() const { |
580 | return hash_all(top, graph, castle, rdfa, haig, tamarama); |
581 | } |
582 | |
583 | void RoseSuffixInfo::reset(void) { |
584 | top = 0; |
585 | graph.reset(); |
586 | castle.reset(); |
587 | rdfa.reset(); |
588 | haig.reset(); |
589 | tamarama.reset(); |
590 | dfa_min_width = depth(0); |
591 | dfa_max_width = depth::infinity(); |
592 | } |
593 | |
594 | std::set<ReportID> all_reports(const suffix_id &s) { |
595 | assert(s.graph() || s.castle() || s.haig() || s.dfa()); |
596 | if (s.tamarama()) { |
597 | return all_reports(*s.tamarama()); |
598 | } else if (s.graph()) { |
599 | return all_reports(*s.graph()); |
600 | } else if (s.castle()) { |
601 | return all_reports(*s.castle()); |
602 | } else if (s.dfa()) { |
603 | return all_reports(*s.dfa()); |
604 | } else { |
605 | return all_reports(*s.haig()); |
606 | } |
607 | } |
608 | |
609 | depth findMinWidth(const suffix_id &s) { |
610 | assert(s.graph() || s.castle() || s.haig() || s.dfa()); |
611 | if (s.graph()) { |
612 | return findMinWidth(*s.graph()); |
613 | } else if (s.castle()) { |
614 | return findMinWidth(*s.castle()); |
615 | } else { |
616 | return s.dfa_min_width; |
617 | } |
618 | } |
619 | |
620 | depth findMinWidth(const suffix_id &s, u32 top) { |
621 | assert(s.graph() || s.castle() || s.haig() || s.dfa()); |
622 | if (s.graph()) { |
623 | return findMinWidth(*s.graph(), top); |
624 | } else if (s.castle()) { |
625 | return findMinWidth(*s.castle(), top); |
626 | } else { |
627 | return s.dfa_min_width; |
628 | } |
629 | } |
630 | |
631 | depth findMaxWidth(const suffix_id &s) { |
632 | assert(s.graph() || s.castle() || s.haig() || s.dfa()); |
633 | if (s.graph()) { |
634 | return findMaxWidth(*s.graph()); |
635 | } else if (s.castle()) { |
636 | return findMaxWidth(*s.castle()); |
637 | } else { |
638 | return s.dfa_max_width; |
639 | } |
640 | } |
641 | |
642 | depth findMaxWidth(const suffix_id &s, u32 top) { |
643 | assert(s.graph() || s.castle() || s.haig() || s.dfa()); |
644 | if (s.graph()) { |
645 | return findMaxWidth(*s.graph(), top); |
646 | } else if (s.castle()) { |
647 | return findMaxWidth(*s.castle(), top); |
648 | } else { |
649 | return s.dfa_max_width; |
650 | } |
651 | } |
652 | |
653 | bool has_eod_accepts(const suffix_id &s) { |
654 | assert(s.graph() || s.castle() || s.haig() || s.dfa()); |
655 | if (s.graph()) { |
656 | /* ignore accept -> eod edge */ |
657 | return in_degree(s.graph()->acceptEod, *s.graph()) > 1; |
658 | } else if (s.castle()) { |
659 | return false; |
660 | } else if (s.dfa()) { |
661 | return has_eod_accepts(*s.dfa()); |
662 | } else { |
663 | return has_eod_accepts(*s.haig()); |
664 | } |
665 | } |
666 | |
667 | bool has_non_eod_accepts(const suffix_id &s) { |
668 | assert(s.graph() || s.castle() || s.haig() || s.dfa()); |
669 | if (s.graph()) { |
670 | return in_degree(s.graph()->accept, *s.graph()); |
671 | } else if (s.castle()) { |
672 | return true; |
673 | } else if (s.dfa()) { |
674 | return has_non_eod_accepts(*s.dfa()); |
675 | } else { |
676 | return has_non_eod_accepts(*s.haig()); |
677 | } |
678 | } |
679 | |
680 | set<u32> all_tops(const suffix_id &s) { |
681 | assert(s.graph() || s.castle() || s.haig() || s.dfa()); |
682 | if (s.graph()) { |
683 | flat_set<u32> tops = getTops(*s.graph()); |
684 | assert(!tops.empty()); |
685 | return {tops.begin(), tops.end()}; |
686 | } |
687 | |
688 | if (s.castle()) { |
689 | return assoc_keys(s.castle()->repeats); |
690 | } |
691 | |
692 | // Other types of suffix are not multi-top. |
693 | return {0}; |
694 | } |
695 | |
696 | size_t suffix_id::hash() const { |
697 | return hash_all(g, c, d, h, t); |
698 | } |
699 | |
700 | bool isAnchored(const left_id &r) { |
701 | assert(r.graph() || r.castle() || r.haig() || r.dfa()); |
702 | if (r.graph()) { |
703 | return isAnchored(*r.graph()); |
704 | } |
705 | if (r.dfa()) { |
706 | return r.dfa()->start_anchored == DEAD_STATE; |
707 | } |
708 | if (r.haig()) { |
709 | return r.haig()->start_anchored == DEAD_STATE; |
710 | } |
711 | |
712 | // All other types are explicitly anchored. |
713 | return true; |
714 | } |
715 | |
716 | depth findMinWidth(const left_id &r) { |
717 | assert(r.graph() || r.castle() || r.haig() || r.dfa()); |
718 | if (r.graph()) { |
719 | return findMinWidth(*r.graph()); |
720 | } else if (r.castle()) { |
721 | return findMinWidth(*r.castle()); |
722 | } else { |
723 | return r.dfa_min_width; |
724 | } |
725 | } |
726 | |
727 | depth findMaxWidth(const left_id &r) { |
728 | assert(r.graph() || r.castle() || r.haig() || r.dfa()); |
729 | if (r.graph()) { |
730 | return findMaxWidth(*r.graph()); |
731 | } else if (r.castle()) { |
732 | return findMaxWidth(*r.castle()); |
733 | } else { |
734 | return r.dfa_max_width; |
735 | } |
736 | } |
737 | |
738 | set<u32> all_tops(const left_id &r) { |
739 | assert(r.graph() || r.castle() || r.haig() || r.dfa()); |
740 | if (r.graph()) { |
741 | flat_set<u32> tops = getTops(*r.graph()); |
742 | return {tops.begin(), tops.end()}; |
743 | } |
744 | |
745 | if (r.castle()) { |
746 | return assoc_keys(r.castle()->repeats); |
747 | } |
748 | |
749 | // Other types of rose are not multi-top. |
750 | return {0}; |
751 | } |
752 | |
753 | set<u32> all_reports(const left_id &left) { |
754 | assert(left.graph() || left.castle() || left.haig() || left.dfa()); |
755 | if (left.graph()) { |
756 | return all_reports(*left.graph()); |
757 | } else if (left.castle()) { |
758 | return all_reports(*left.castle()); |
759 | } else if (left.dfa()) { |
760 | return all_reports(*left.dfa()); |
761 | } else { |
762 | return all_reports(*left.haig()); |
763 | } |
764 | } |
765 | |
766 | u32 num_tops(const left_id &r) { |
767 | return all_tops(r).size(); |
768 | } |
769 | |
770 | size_t left_id::hash() const { |
771 | return hash_all(g, c, d, h); |
772 | } |
773 | |
774 | u64a findMaxOffset(const set<ReportID> &reports, const ReportManager &rm) { |
775 | assert(!reports.empty()); |
776 | u64a maxOffset = 0; |
777 | for (const auto &report_id : reports) { |
778 | const Report &ir = rm.getReport(report_id); |
779 | if (ir.hasBounds()) { |
780 | maxOffset = max(maxOffset, ir.maxOffset); |
781 | } else { |
782 | return MAX_OFFSET; |
783 | } |
784 | } |
785 | return maxOffset; |
786 | } |
787 | |
788 | size_t LeftEngInfo::hash() const { |
789 | return hash_all(graph, castle, dfa, haig, tamarama, lag, leftfix_report); |
790 | } |
791 | |
792 | void LeftEngInfo::reset(void) { |
793 | graph.reset(); |
794 | castle.reset(); |
795 | dfa.reset(); |
796 | haig.reset(); |
797 | tamarama.reset(); |
798 | lag = 0; |
799 | leftfix_report = MO_INVALID_IDX; |
800 | dfa_min_width = depth(0); |
801 | dfa_max_width = depth::infinity(); |
802 | } |
803 | |
804 | LeftEngInfo::operator bool() const { |
805 | assert((int)!!castle + (int)!!dfa + (int)!!haig <= 1); |
806 | assert(!castle || !graph); |
807 | assert(!dfa || graph); /* dfas always have the graph as well */ |
808 | assert(!haig || graph); |
809 | return graph || castle || dfa || haig; |
810 | } |
811 | |
812 | u32 roseQuality(const RoseResources &res, const RoseEngine *t) { |
813 | /* Rose is low quality if the atable is a Mcclellan 16 or has multiple DFAs |
814 | */ |
815 | if (res.has_anchored) { |
816 | if (res.has_anchored_multiple) { |
817 | DEBUG_PRINTF("multiple atable engines\n" ); |
818 | return 0; |
819 | } |
820 | |
821 | if (res.has_anchored_large) { |
822 | DEBUG_PRINTF("m16 atable engine\n" ); |
823 | return 0; |
824 | } |
825 | } |
826 | |
827 | /* if we always run multiple engines then we are slow */ |
828 | u32 always_run = 0; |
829 | |
830 | if (res.has_anchored) { |
831 | always_run++; |
832 | } |
833 | |
834 | if (t->eagerIterOffset) { |
835 | /* eager prefixes are always run */ |
836 | always_run++; |
837 | } |
838 | |
839 | if (res.has_floating) { |
840 | /* TODO: ignore conditional ftables, or ftables beyond smwr region */ |
841 | always_run++; |
842 | } |
843 | |
844 | if (t->ematcherOffset) { |
845 | always_run++; |
846 | } |
847 | |
848 | /* ignore mpv outfixes as they are v good, mpv outfixes are before begin */ |
849 | if (t->outfixBeginQueue != t->outfixEndQueue) { |
850 | /* TODO: ignore outfixes > smwr region */ |
851 | always_run++; |
852 | } |
853 | |
854 | bool eod_prefix = false; |
855 | |
856 | const LeftNfaInfo *left = getLeftTable(t); |
857 | for (u32 i = 0; i < t->activeLeftCount; i++) { |
858 | if (left->eod_check) { |
859 | eod_prefix = true; |
860 | break; |
861 | } |
862 | } |
863 | |
864 | if (eod_prefix) { |
865 | always_run++; |
866 | DEBUG_PRINTF("eod prefixes are slow" ); |
867 | return 0; |
868 | } |
869 | |
870 | if (always_run > 1) { |
871 | DEBUG_PRINTF("we always run %u engines\n" , always_run); |
872 | return 0; |
873 | } |
874 | |
875 | return 1; |
876 | } |
877 | |
878 | u32 findMinOffset(const RoseBuildImpl &build, u32 lit_id) { |
879 | const auto &lit_vertices = build.literal_info.at(lit_id).vertices; |
880 | assert(!lit_vertices.empty()); |
881 | |
882 | u32 min_offset = UINT32_MAX; |
883 | for (const auto &v : lit_vertices) { |
884 | min_offset = min(min_offset, build.g[v].min_offset); |
885 | } |
886 | |
887 | return min_offset; |
888 | } |
889 | |
890 | u32 findMaxOffset(const RoseBuildImpl &build, u32 lit_id) { |
891 | const auto &lit_vertices = build.literal_info.at(lit_id).vertices; |
892 | assert(!lit_vertices.empty()); |
893 | |
894 | u32 max_offset = 0; |
895 | for (const auto &v : lit_vertices) { |
896 | max_offset = max(max_offset, build.g[v].max_offset); |
897 | } |
898 | |
899 | return max_offset; |
900 | } |
901 | |
902 | bool canEagerlyReportAtEod(const RoseBuildImpl &build, const RoseEdge &e) { |
903 | const auto &g = build.g; |
904 | const auto v = target(e, g); |
905 | |
906 | if (!build.g[v].eod_accept) { |
907 | return false; |
908 | } |
909 | |
910 | // If there's a graph between us and EOD, we shouldn't be eager. |
911 | if (build.g[v].left) { |
912 | return false; |
913 | } |
914 | |
915 | // Must be exactly at EOD. |
916 | if (g[e].minBound != 0 || g[e].maxBound != 0) { |
917 | return false; |
918 | } |
919 | |
920 | // In streaming mode, we can only eagerly report EOD for literals in the |
921 | // EOD-anchored table, as that's the only time we actually know where EOD |
922 | // is. In block mode, we always have this information. |
923 | const auto u = source(e, g); |
924 | if (build.cc.streaming && !build.isInETable(u)) { |
925 | return false; |
926 | } |
927 | |
928 | return true; |
929 | } |
930 | |
931 | #ifndef NDEBUG |
932 | /** \brief Returns true if all the graphs (NFA, DFA, Haig, etc) in this Rose |
933 | * graph are implementable. */ |
934 | bool canImplementGraphs(const RoseBuildImpl &tbi) { |
935 | const RoseGraph &g = tbi.g; |
936 | |
937 | // First, check the Rose leftfixes. |
938 | |
939 | for (auto v : vertices_range(g)) { |
940 | DEBUG_PRINTF("leftfix: check vertex %zu\n" , g[v].index); |
941 | |
942 | if (g[v].left.castle) { |
943 | DEBUG_PRINTF("castle ok\n" ); |
944 | continue; |
945 | } |
946 | if (g[v].left.dfa) { |
947 | DEBUG_PRINTF("dfa ok\n" ); |
948 | continue; |
949 | } |
950 | if (g[v].left.haig) { |
951 | DEBUG_PRINTF("haig ok\n" ); |
952 | continue; |
953 | } |
954 | if (g[v].left.graph) { |
955 | assert(g[v].left.graph->kind |
956 | == (tbi.isRootSuccessor(v) ? NFA_PREFIX : NFA_INFIX)); |
957 | if (!isImplementableNFA(*g[v].left.graph, nullptr, tbi.cc)) { |
958 | DEBUG_PRINTF("nfa prefix %zu failed (%zu vertices)\n" , |
959 | g[v].index, num_vertices(*g[v].left.graph)); |
960 | return false; |
961 | } |
962 | } |
963 | } |
964 | |
965 | // Suffix graphs. |
966 | |
967 | for (auto v : vertices_range(g)) { |
968 | DEBUG_PRINTF("suffix: check vertex %zu\n" , g[v].index); |
969 | |
970 | const RoseSuffixInfo &suffix = g[v].suffix; |
971 | if (suffix.castle) { |
972 | DEBUG_PRINTF("castle suffix ok\n" ); |
973 | continue; |
974 | } |
975 | if (suffix.rdfa) { |
976 | DEBUG_PRINTF("dfa suffix ok\n" ); |
977 | continue; |
978 | } |
979 | if (suffix.haig) { |
980 | DEBUG_PRINTF("haig suffix ok\n" ); |
981 | continue; |
982 | } |
983 | if (suffix.graph) { |
984 | assert(suffix.graph->kind == NFA_SUFFIX); |
985 | if (!isImplementableNFA(*suffix.graph, &tbi.rm, tbi.cc)) { |
986 | DEBUG_PRINTF("nfa suffix %zu failed (%zu vertices)\n" , |
987 | g[v].index, num_vertices(*suffix.graph)); |
988 | return false; |
989 | } |
990 | } |
991 | } |
992 | |
993 | return true; |
994 | } |
995 | |
996 | /** |
997 | * \brief True if there is an engine with a top that is not triggered by a |
998 | * vertex in the Rose graph. This is a consistency check used in assertions. |
999 | */ |
1000 | bool hasOrphanedTops(const RoseBuildImpl &build) { |
1001 | const RoseGraph &g = build.g; |
1002 | |
1003 | unordered_map<left_id, set<u32>> leftfixes; |
1004 | unordered_map<suffix_id, set<u32>> suffixes; |
1005 | |
1006 | for (auto v : vertices_range(g)) { |
1007 | if (g[v].left) { |
1008 | set<u32> &tops = leftfixes[g[v].left]; |
1009 | if (!build.isRootSuccessor(v)) { |
1010 | // Tops for infixes come from the in-edges. |
1011 | for (const auto &e : in_edges_range(v, g)) { |
1012 | tops.insert(g[e].rose_top); |
1013 | } |
1014 | } |
1015 | } |
1016 | if (g[v].suffix) { |
1017 | suffixes[g[v].suffix].insert(g[v].suffix.top); |
1018 | } |
1019 | } |
1020 | |
1021 | for (const auto &e : leftfixes) { |
1022 | if (all_tops(e.first) != e.second) { |
1023 | DEBUG_PRINTF("rose tops (%s) don't match rose graph (%s)\n" , |
1024 | as_string_list(all_tops(e.first)).c_str(), |
1025 | as_string_list(e.second).c_str()); |
1026 | return true; |
1027 | } |
1028 | } |
1029 | |
1030 | for (const auto &e : suffixes) { |
1031 | if (all_tops(e.first) != e.second) { |
1032 | DEBUG_PRINTF("suffix tops (%s) don't match rose graph (%s)\n" , |
1033 | as_string_list(all_tops(e.first)).c_str(), |
1034 | as_string_list(e.second).c_str()); |
1035 | return true; |
1036 | } |
1037 | } |
1038 | |
1039 | return false; |
1040 | } |
1041 | |
1042 | #endif // NDEBUG |
1043 | |
1044 | } // namespace ue2 |
1045 | |