| 1 | /* |
| 2 | * Copyright (c) 2015-2018, Intel Corporation |
| 3 | * |
| 4 | * Redistribution and use in source and binary forms, with or without |
| 5 | * modification, are permitted provided that the following conditions are met: |
| 6 | * |
| 7 | * * Redistributions of source code must retain the above copyright notice, |
| 8 | * this list of conditions and the following disclaimer. |
| 9 | * * Redistributions in binary form must reproduce the above copyright |
| 10 | * notice, this list of conditions and the following disclaimer in the |
| 11 | * documentation and/or other materials provided with the distribution. |
| 12 | * * Neither the name of Intel Corporation nor the names of its contributors |
| 13 | * may be used to endorse or promote products derived from this software |
| 14 | * without specific prior written permission. |
| 15 | * |
| 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| 17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| 20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| 24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| 25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| 26 | * POSSIBILITY OF SUCH DAMAGE. |
| 27 | */ |
| 28 | |
| 29 | #include "rose_build_misc.h" |
| 30 | #include "rose_build_impl.h" |
| 31 | |
| 32 | #include "rose_build_resources.h" |
| 33 | #include "hwlm/hwlm_literal.h" |
| 34 | #include "nfa/castlecompile.h" |
| 35 | #include "nfa/goughcompile.h" |
| 36 | #include "nfa/mcclellancompile_util.h" |
| 37 | #include "nfa/nfa_api.h" |
| 38 | #include "nfa/rdfa.h" |
| 39 | #include "nfa/tamaramacompile.h" |
| 40 | #include "nfagraph/ng_holder.h" |
| 41 | #include "nfagraph/ng_limex.h" |
| 42 | #include "nfagraph/ng_reports.h" |
| 43 | #include "nfagraph/ng_repeat.h" |
| 44 | #include "nfagraph/ng_util.h" |
| 45 | #include "nfagraph/ng_width.h" |
| 46 | #include "smallwrite/smallwrite_build.h" |
| 47 | #include "util/alloc.h" |
| 48 | #include "util/boundary_reports.h" |
| 49 | #include "util/compile_context.h" |
| 50 | #include "util/container.h" |
| 51 | #include "util/graph.h" |
| 52 | #include "util/graph_range.h" |
| 53 | #include "util/make_unique.h" |
| 54 | #include "util/order_check.h" |
| 55 | #include "util/report_manager.h" |
| 56 | #include "util/ue2string.h" |
| 57 | #include "util/verify_types.h" |
| 58 | #include "ue2common.h" |
| 59 | #include "grey.h" |
| 60 | |
| 61 | #include <boost/graph/breadth_first_search.hpp> |
| 62 | |
| 63 | using namespace std; |
| 64 | |
| 65 | namespace ue2 { |
| 66 | |
| 67 | // just to get it out of the header |
| 68 | RoseBuild::~RoseBuild() { } |
| 69 | |
| 70 | RoseBuildImpl::RoseBuildImpl(ReportManager &rm_in, |
| 71 | SomSlotManager &ssm_in, |
| 72 | SmallWriteBuild &smwr_in, |
| 73 | const CompileContext &cc_in, |
| 74 | const BoundaryReports &boundary_in) |
| 75 | : cc(cc_in), |
| 76 | root(add_vertex(g)), |
| 77 | anchored_root(add_vertex(g)), |
| 78 | hasSom(false), |
| 79 | group_end(0), |
| 80 | ematcher_region_size(0), |
| 81 | eod_event_literal_id(MO_INVALID_IDX), |
| 82 | max_rose_anchored_floating_overlap(0), |
| 83 | rm(rm_in), |
| 84 | ssm(ssm_in), |
| 85 | smwr(smwr_in), |
| 86 | boundary(boundary_in), |
| 87 | next_nfa_report(0) { |
| 88 | // add root vertices to graph |
| 89 | g[root].min_offset = 0; |
| 90 | g[root].max_offset = 0; |
| 91 | |
| 92 | g[anchored_root].min_offset = 0; |
| 93 | g[anchored_root].max_offset = 0; |
| 94 | } |
| 95 | |
| 96 | RoseBuildImpl::~RoseBuildImpl() { |
| 97 | // empty |
| 98 | } |
| 99 | |
| 100 | bool RoseVertexProps::isBoring(void) const { |
| 101 | return !suffix && !left; |
| 102 | } |
| 103 | |
| 104 | bool RoseVertexProps::fixedOffset(void) const { |
| 105 | assert(min_offset <= max_offset); /* ensure offsets calculated */ |
| 106 | return max_offset == min_offset && max_offset != ROSE_BOUND_INF; |
| 107 | } |
| 108 | |
| 109 | bool RoseBuildImpl::isRootSuccessor(const RoseVertex &v) const { |
| 110 | for (auto u : inv_adjacent_vertices_range(v, g)) { |
| 111 | if (isAnyStart(u)) { |
| 112 | return true; |
| 113 | } |
| 114 | } |
| 115 | return false; |
| 116 | } |
| 117 | |
| 118 | bool RoseBuildImpl::isNonRootSuccessor(const RoseVertex &v) const { |
| 119 | for (auto u : inv_adjacent_vertices_range(v, g)) { |
| 120 | if (!isAnyStart(u)) { |
| 121 | return true; |
| 122 | } |
| 123 | } |
| 124 | return false; |
| 125 | } |
| 126 | |
| 127 | bool hasAnchHistorySucc(const RoseGraph &g, RoseVertex v) { |
| 128 | for (const auto &e : out_edges_range(v, g)) { |
| 129 | if (g[e].history == ROSE_ROLE_HISTORY_ANCH) { |
| 130 | return true; |
| 131 | } |
| 132 | } |
| 133 | |
| 134 | return false; |
| 135 | } |
| 136 | |
| 137 | bool hasLastByteHistorySucc(const RoseGraph &g, RoseVertex v) { |
| 138 | for (const auto &e : out_edges_range(v, g)) { |
| 139 | if (g[e].history == ROSE_ROLE_HISTORY_LAST_BYTE) { |
| 140 | return true; |
| 141 | } |
| 142 | } |
| 143 | |
| 144 | return false; |
| 145 | } |
| 146 | |
| 147 | static |
| 148 | bool isInTable(const RoseBuildImpl &tbi, RoseVertex v, |
| 149 | rose_literal_table table) { |
| 150 | const auto &lit_ids = tbi.g[v].literals; |
| 151 | if (lit_ids.empty()) { |
| 152 | return false; // special role with no literals |
| 153 | } |
| 154 | |
| 155 | // All literals for a given vertex will be in the same table, so we need |
| 156 | // only inspect the first one. |
| 157 | const auto lit_table = tbi.literals.at(*lit_ids.begin()).table; |
| 158 | |
| 159 | // Verify that all literals for this vertex are in the same table. |
| 160 | assert(all_of_in(lit_ids, [&](u32 lit_id) { |
| 161 | return tbi.literals.at(lit_id).table == lit_table; |
| 162 | })); |
| 163 | |
| 164 | return lit_table == table; |
| 165 | } |
| 166 | |
| 167 | bool RoseBuildImpl::isAnchored(RoseVertex v) const { |
| 168 | return isInTable(*this, v, ROSE_ANCHORED); |
| 169 | } |
| 170 | |
| 171 | bool RoseBuildImpl::isFloating(RoseVertex v) const { |
| 172 | return isInTable(*this, v, ROSE_FLOATING); |
| 173 | } |
| 174 | |
| 175 | bool RoseBuildImpl::isInETable(RoseVertex v) const { |
| 176 | return isInTable(*this, v, ROSE_EOD_ANCHORED); |
| 177 | } |
| 178 | |
| 179 | bool RoseBuildImpl::hasLiteralInTable(RoseVertex v, |
| 180 | enum rose_literal_table t) const { |
| 181 | return isInTable(*this, v, t); |
| 182 | } |
| 183 | |
| 184 | /* Indicates that the floating table (if it exists) will be only run |
| 185 | conditionally based on matches from the anchored table. */ |
| 186 | bool RoseBuildImpl::hasNoFloatingRoots() const { |
| 187 | for (auto v : adjacent_vertices_range(root, g)) { |
| 188 | if (isFloating(v)) { |
| 189 | DEBUG_PRINTF("direct floating root %zu\n" , g[v].index); |
| 190 | return false; |
| 191 | } |
| 192 | } |
| 193 | |
| 194 | /* need to check if the anchored_root has any literals which are too deep */ |
| 195 | for (auto v : adjacent_vertices_range(anchored_root, g)) { |
| 196 | if (isFloating(v)) { |
| 197 | DEBUG_PRINTF("indirect floating root %zu\n" , g[v].index); |
| 198 | return false; |
| 199 | } |
| 200 | } |
| 201 | |
| 202 | return true; |
| 203 | } |
| 204 | |
| 205 | size_t RoseBuildImpl::maxLiteralLen(RoseVertex v) const { |
| 206 | const auto &lit_ids = g[v].literals; |
| 207 | assert(!lit_ids.empty()); |
| 208 | |
| 209 | size_t maxlen = 0; |
| 210 | |
| 211 | for (const auto &lit_id : lit_ids) { |
| 212 | maxlen = max(maxlen, literals.at(lit_id).elength()); |
| 213 | } |
| 214 | |
| 215 | return maxlen; |
| 216 | } |
| 217 | |
| 218 | size_t RoseBuildImpl::minLiteralLen(RoseVertex v) const { |
| 219 | const auto &lit_ids = g[v].literals; |
| 220 | assert(!lit_ids.empty()); |
| 221 | |
| 222 | size_t minlen = ROSE_BOUND_INF; |
| 223 | |
| 224 | for (const auto &lit_id : lit_ids) { |
| 225 | minlen = min(minlen, literals.at(lit_id).elength()); |
| 226 | } |
| 227 | |
| 228 | return minlen; |
| 229 | } |
| 230 | |
| 231 | // RoseBuild factory |
| 232 | unique_ptr<RoseBuild> makeRoseBuilder(ReportManager &rm, |
| 233 | SomSlotManager &ssm, |
| 234 | SmallWriteBuild &smwr, |
| 235 | const CompileContext &cc, |
| 236 | const BoundaryReports &boundary) { |
| 237 | return ue2::make_unique<RoseBuildImpl>(rm, ssm, smwr, cc, boundary); |
| 238 | } |
| 239 | |
| 240 | bool roseIsPureLiteral(const RoseEngine *t) { |
| 241 | return t->runtimeImpl == ROSE_RUNTIME_PURE_LITERAL; |
| 242 | } |
| 243 | |
| 244 | // Returns non-zero max overlap len if a suffix of the literal 'a' overlaps |
| 245 | // with a prefix of the literal 'b' or 'a' can be contained in 'b'. |
| 246 | size_t maxOverlap(const ue2_literal &a, const ue2_literal &b, u32 b_delay) { |
| 247 | /* overly conservative if only part of the string is nocase */ |
| 248 | bool nocase = a.any_nocase() || b.any_nocase(); |
| 249 | DEBUG_PRINTF("max overlap %s %s+%u %d\n" , dumpString(a).c_str(), |
| 250 | dumpString(b).c_str(), b_delay, (int)nocase); |
| 251 | size_t a_len = a.length(); |
| 252 | size_t b_len = b.length(); |
| 253 | const char *a_end = a.c_str() + a_len; |
| 254 | const char *b_end = b.c_str() + b_len; |
| 255 | if (b_delay >= a_len) { |
| 256 | return b_len + b_delay; |
| 257 | } else if (b_delay) { |
| 258 | /* a can be a substring of b which overlaps some of the end dots |
| 259 | * OR b can be a substring near the end of a */ |
| 260 | /* ignore overlap due to the final trailing dot as delayed literals |
| 261 | * are delivered before undelayed */ |
| 262 | for (u32 j = b_delay - 1; j > 0; j--) { |
| 263 | if (b_len + j >= a_len) { |
| 264 | if (!cmp(a.c_str(), b_end + j - a_len, a_len - j, nocase)) { |
| 265 | return b_len + j; |
| 266 | } |
| 267 | } else { |
| 268 | if (!cmp(a_end - j - b_len, b.c_str(), b_len, nocase)) { |
| 269 | return b_len + j; |
| 270 | } |
| 271 | } |
| 272 | } |
| 273 | } |
| 274 | |
| 275 | return maxStringOverlap(a.get_string(), b.get_string(), nocase); |
| 276 | } |
| 277 | |
| 278 | // Returns non-zero max overlap len if a suffix of the literal ID 'a' overlaps |
| 279 | // with a prefix of the literal ID 'b' or 'a' can be contained in 'b'. |
| 280 | size_t maxOverlap(const rose_literal_id &a, const rose_literal_id &b) { |
| 281 | assert(!a.delay); |
| 282 | return maxOverlap(a.s, b.s, b.delay); |
| 283 | } |
| 284 | |
| 285 | static |
| 286 | const rose_literal_id &getOverlapLiteral(const RoseBuildImpl &tbi, |
| 287 | u32 literal_id) { |
| 288 | auto it = tbi.anchoredLitSuffix.find(literal_id); |
| 289 | if (it != tbi.anchoredLitSuffix.end()) { |
| 290 | return it->second; |
| 291 | } |
| 292 | return tbi.literals.at(literal_id); |
| 293 | } |
| 294 | |
| 295 | ue2_literal findNonOverlappingTail(const set<ue2_literal> &lits, |
| 296 | const ue2_literal &s) { |
| 297 | size_t max_overlap = 0; |
| 298 | |
| 299 | for (const auto &lit : lits) { |
| 300 | size_t overlap = lit != s ? maxStringOverlap(lit, s) |
| 301 | : maxStringSelfOverlap(s); |
| 302 | max_overlap = max(max_overlap, overlap); |
| 303 | } |
| 304 | |
| 305 | /* find the tail that doesn't overlap */ |
| 306 | ue2_literal tail = s.substr(max_overlap); |
| 307 | DEBUG_PRINTF("%zu overlap, tail: '%s'\n" , max_overlap, |
| 308 | dumpString(tail).c_str()); |
| 309 | return tail; |
| 310 | } |
| 311 | |
| 312 | size_t RoseBuildImpl::maxLiteralOverlap(RoseVertex u, RoseVertex v) const { |
| 313 | size_t overlap = 0; |
| 314 | for (auto u_lit_id : g[u].literals) { |
| 315 | const rose_literal_id &ul = getOverlapLiteral(*this, u_lit_id); |
| 316 | for (auto v_lit_id : g[v].literals) { |
| 317 | const rose_literal_id &vl = getOverlapLiteral(*this, v_lit_id); |
| 318 | overlap = max(overlap, maxOverlap(ul, vl)); |
| 319 | } |
| 320 | } |
| 321 | return overlap; |
| 322 | } |
| 323 | |
| 324 | void RoseBuildImpl::removeVertices(const vector<RoseVertex> &dead) { |
| 325 | for (auto v : dead) { |
| 326 | assert(!isAnyStart(v)); |
| 327 | DEBUG_PRINTF("removing vertex %zu\n" , g[v].index); |
| 328 | for (auto lit_id : g[v].literals) { |
| 329 | literal_info[lit_id].vertices.erase(v); |
| 330 | } |
| 331 | clear_vertex(v, g); |
| 332 | remove_vertex(v, g); |
| 333 | } |
| 334 | renumber_vertices(g); |
| 335 | } |
| 336 | |
| 337 | // Find the maximum bound on the edges to this vertex's successors ignoring |
| 338 | // those via infixes. |
| 339 | u32 RoseBuildImpl::calcSuccMaxBound(RoseVertex u) const { |
| 340 | u32 maxBound = 0; |
| 341 | for (const auto &e : out_edges_range(u, g)) { |
| 342 | RoseVertex v = target(e, g); |
| 343 | |
| 344 | if (g[v].left) { |
| 345 | continue; |
| 346 | } |
| 347 | |
| 348 | u32 thisBound = g[e].maxBound; |
| 349 | |
| 350 | if (thisBound == ROSE_BOUND_INF) { |
| 351 | return ROSE_BOUND_INF; |
| 352 | } |
| 353 | |
| 354 | if (!g[v].eod_accept) { |
| 355 | // Add the length of the longest of our literals. |
| 356 | thisBound += maxLiteralLen(v); |
| 357 | } |
| 358 | |
| 359 | maxBound = max(maxBound, thisBound); |
| 360 | } |
| 361 | |
| 362 | assert(maxBound <= ROSE_BOUND_INF); |
| 363 | return maxBound; |
| 364 | } |
| 365 | |
| 366 | u32 RoseBuildImpl::getLiteralId(const ue2_literal &s, u32 delay, |
| 367 | rose_literal_table table) { |
| 368 | DEBUG_PRINTF("getting id for %s in table %d\n" , dumpString(s).c_str(), |
| 369 | table); |
| 370 | assert(table != ROSE_ANCHORED); |
| 371 | rose_literal_id key(s, table, delay); |
| 372 | |
| 373 | auto m = literals.insert(key); |
| 374 | u32 id = m.first; |
| 375 | bool inserted = m.second; |
| 376 | |
| 377 | if (inserted) { |
| 378 | literal_info.push_back(rose_literal_info()); |
| 379 | assert(literal_info.size() == id + 1); |
| 380 | |
| 381 | if (delay) { |
| 382 | u32 undelayed_id = getLiteralId(s, 0, table); |
| 383 | literal_info[id].undelayed_id = undelayed_id; |
| 384 | literal_info[undelayed_id].delayed_ids.insert(id); |
| 385 | } else { |
| 386 | literal_info[id].undelayed_id = id; |
| 387 | } |
| 388 | } |
| 389 | return id; |
| 390 | } |
| 391 | |
| 392 | // Function that operates on a msk/cmp pair and a literal, as used in |
| 393 | // hwlmLiteral, and zeroes msk elements that don't add any power to the |
| 394 | // literal. |
| 395 | void normaliseLiteralMask(const ue2_literal &s_in, vector<u8> &msk, |
| 396 | vector<u8> &cmp) { |
| 397 | assert(msk.size() == cmp.size()); |
| 398 | assert(msk.size() <= HWLM_MASKLEN); |
| 399 | |
| 400 | if (msk.empty()) { |
| 401 | return; |
| 402 | } |
| 403 | |
| 404 | // Work over a caseless copy if the string contains nocase chars. This will |
| 405 | // ensure that we treat masks designed to handle mixed-sensitivity literals |
| 406 | // correctly: these will be matched by the literal matcher in caseless |
| 407 | // mode, with the mask used to narrow the matches. |
| 408 | ue2_literal s(s_in); |
| 409 | if (s.any_nocase()) { |
| 410 | make_nocase(&s); |
| 411 | } |
| 412 | |
| 413 | ue2_literal::const_reverse_iterator it = s.rbegin(), ite = s.rend(); |
| 414 | size_t i = msk.size(); |
| 415 | while (i-- != 0 && it != ite) { |
| 416 | const CharReach &cr = *it; |
| 417 | for (size_t c = cr.find_first(); c != CharReach::npos; |
| 418 | c = cr.find_next(c)) { |
| 419 | if (((u8)c & msk[i]) != cmp[i]) { |
| 420 | goto skip; |
| 421 | } |
| 422 | } |
| 423 | |
| 424 | // If we didn't jump out of the loop to skip, then this mask position |
| 425 | // doesn't further narrow the set of acceptable literals from those |
| 426 | // accepted by s. So we can zero this element. |
| 427 | msk[i] = 0; |
| 428 | cmp[i] = 0; |
| 429 | skip: |
| 430 | ++it; |
| 431 | } |
| 432 | |
| 433 | // Wipe out prefix zeroes. |
| 434 | while (!msk.empty() && msk[0] == 0) { |
| 435 | msk.erase(msk.begin()); |
| 436 | cmp.erase(cmp.begin()); |
| 437 | } |
| 438 | } |
| 439 | |
| 440 | rose_literal_id::rose_literal_id(const ue2_literal &s_in, |
| 441 | const vector<u8> &msk_in, const vector<u8> &cmp_in, |
| 442 | rose_literal_table table_in, u32 delay_in) |
| 443 | : s(s_in), msk(msk_in), cmp(cmp_in), table(table_in), |
| 444 | delay(delay_in), distinctiveness(0) { |
| 445 | assert(msk.size() == cmp.size()); |
| 446 | assert(msk.size() <= HWLM_MASKLEN); |
| 447 | assert(delay <= MAX_DELAY); |
| 448 | |
| 449 | normaliseLiteralMask(s, msk, cmp); |
| 450 | } |
| 451 | |
| 452 | u32 RoseBuildImpl::getLiteralId(const ue2_literal &s, const vector<u8> &msk, |
| 453 | const vector<u8> &cmp, u32 delay, |
| 454 | rose_literal_table table) { |
| 455 | DEBUG_PRINTF("getting id for %s in table %d\n" , dumpString(s).c_str(), |
| 456 | table); |
| 457 | assert(table != ROSE_ANCHORED); |
| 458 | rose_literal_id key(s, msk, cmp, table, delay); |
| 459 | |
| 460 | /* ue2_literals are always uppercased if nocase and must have an |
| 461 | * alpha char */ |
| 462 | |
| 463 | auto m = literals.insert(key); |
| 464 | u32 id = m.first; |
| 465 | bool inserted = m.second; |
| 466 | |
| 467 | if (inserted) { |
| 468 | literal_info.push_back(rose_literal_info()); |
| 469 | assert(literal_info.size() == id + 1); |
| 470 | |
| 471 | if (delay) { |
| 472 | u32 undelayed_id = getLiteralId(s, msk, cmp, 0, table); |
| 473 | literal_info[id].undelayed_id = undelayed_id; |
| 474 | literal_info[undelayed_id].delayed_ids.insert(id); |
| 475 | } else { |
| 476 | literal_info[id].undelayed_id = id; |
| 477 | } |
| 478 | } |
| 479 | return id; |
| 480 | } |
| 481 | |
| 482 | u32 RoseBuildImpl::getNewLiteralId() { |
| 483 | rose_literal_id key(ue2_literal(), ROSE_ANCHORED, 0); |
| 484 | u32 numLiterals = verify_u32(literals.size()); |
| 485 | key.distinctiveness = numLiterals; |
| 486 | |
| 487 | auto m = literals.insert(key); |
| 488 | assert(m.second); |
| 489 | u32 id = m.first; |
| 490 | |
| 491 | literal_info.push_back(rose_literal_info()); |
| 492 | assert(literal_info.size() == id + 1); |
| 493 | |
| 494 | literal_info[id].undelayed_id = id; |
| 495 | |
| 496 | return id; |
| 497 | } |
| 498 | |
| 499 | bool operator<(const RoseEdgeProps &a, const RoseEdgeProps &b) { |
| 500 | ORDER_CHECK(minBound); |
| 501 | ORDER_CHECK(maxBound); |
| 502 | ORDER_CHECK(history); |
| 503 | return false; |
| 504 | } |
| 505 | |
| 506 | #ifndef NDEBUG |
| 507 | bool roseHasTops(const RoseBuildImpl &build, RoseVertex v) { |
| 508 | const RoseGraph &g = build.g; |
| 509 | assert(g[v].left); |
| 510 | |
| 511 | set<u32> graph_tops; |
| 512 | if (!build.isRootSuccessor(v)) { |
| 513 | for (const auto &e : in_edges_range(v, g)) { |
| 514 | graph_tops.insert(g[e].rose_top); |
| 515 | } |
| 516 | } |
| 517 | |
| 518 | return is_subset_of(graph_tops, all_tops(g[v].left)); |
| 519 | } |
| 520 | #endif |
| 521 | |
| 522 | u32 OutfixInfo::get_queue(QueueIndexFactory &qif) { |
| 523 | if (queue == ~0U) { |
| 524 | queue = qif.get_queue(); |
| 525 | } |
| 526 | |
| 527 | return queue; |
| 528 | } |
| 529 | |
| 530 | namespace { |
| 531 | class OutfixAllReports : public boost::static_visitor<set<ReportID>> { |
| 532 | public: |
| 533 | set<ReportID> operator()(const boost::blank &) const { |
| 534 | return set<ReportID>(); |
| 535 | } |
| 536 | |
| 537 | template<class T> |
| 538 | set<ReportID> operator()(const unique_ptr<T> &x) const { |
| 539 | return all_reports(*x); |
| 540 | } |
| 541 | |
| 542 | set<ReportID> operator()(const MpvProto &mpv) const { |
| 543 | set<ReportID> reports; |
| 544 | for (const auto &puff : mpv.puffettes) { |
| 545 | reports.insert(puff.report); |
| 546 | } |
| 547 | for (const auto &puff : mpv.triggered_puffettes) { |
| 548 | reports.insert(puff.report); |
| 549 | } |
| 550 | return reports; |
| 551 | } |
| 552 | }; |
| 553 | } |
| 554 | |
| 555 | set<ReportID> all_reports(const OutfixInfo &outfix) { |
| 556 | auto reports = boost::apply_visitor(OutfixAllReports(), outfix.proto); |
| 557 | assert(!reports.empty()); |
| 558 | return reports; |
| 559 | } |
| 560 | |
| 561 | bool RoseSuffixInfo::operator==(const RoseSuffixInfo &b) const { |
| 562 | return top == b.top && graph == b.graph && castle == b.castle && |
| 563 | rdfa == b.rdfa && haig == b.haig && tamarama == b.tamarama; |
| 564 | } |
| 565 | |
| 566 | bool RoseSuffixInfo::operator<(const RoseSuffixInfo &b) const { |
| 567 | const RoseSuffixInfo &a = *this; |
| 568 | ORDER_CHECK(top); |
| 569 | ORDER_CHECK(graph); |
| 570 | ORDER_CHECK(castle); |
| 571 | ORDER_CHECK(haig); |
| 572 | ORDER_CHECK(rdfa); |
| 573 | ORDER_CHECK(tamarama); |
| 574 | assert(a.dfa_min_width == b.dfa_min_width); |
| 575 | assert(a.dfa_max_width == b.dfa_max_width); |
| 576 | return false; |
| 577 | } |
| 578 | |
| 579 | size_t RoseSuffixInfo::hash() const { |
| 580 | return hash_all(top, graph, castle, rdfa, haig, tamarama); |
| 581 | } |
| 582 | |
| 583 | void RoseSuffixInfo::reset(void) { |
| 584 | top = 0; |
| 585 | graph.reset(); |
| 586 | castle.reset(); |
| 587 | rdfa.reset(); |
| 588 | haig.reset(); |
| 589 | tamarama.reset(); |
| 590 | dfa_min_width = depth(0); |
| 591 | dfa_max_width = depth::infinity(); |
| 592 | } |
| 593 | |
| 594 | std::set<ReportID> all_reports(const suffix_id &s) { |
| 595 | assert(s.graph() || s.castle() || s.haig() || s.dfa()); |
| 596 | if (s.tamarama()) { |
| 597 | return all_reports(*s.tamarama()); |
| 598 | } else if (s.graph()) { |
| 599 | return all_reports(*s.graph()); |
| 600 | } else if (s.castle()) { |
| 601 | return all_reports(*s.castle()); |
| 602 | } else if (s.dfa()) { |
| 603 | return all_reports(*s.dfa()); |
| 604 | } else { |
| 605 | return all_reports(*s.haig()); |
| 606 | } |
| 607 | } |
| 608 | |
| 609 | depth findMinWidth(const suffix_id &s) { |
| 610 | assert(s.graph() || s.castle() || s.haig() || s.dfa()); |
| 611 | if (s.graph()) { |
| 612 | return findMinWidth(*s.graph()); |
| 613 | } else if (s.castle()) { |
| 614 | return findMinWidth(*s.castle()); |
| 615 | } else { |
| 616 | return s.dfa_min_width; |
| 617 | } |
| 618 | } |
| 619 | |
| 620 | depth findMinWidth(const suffix_id &s, u32 top) { |
| 621 | assert(s.graph() || s.castle() || s.haig() || s.dfa()); |
| 622 | if (s.graph()) { |
| 623 | return findMinWidth(*s.graph(), top); |
| 624 | } else if (s.castle()) { |
| 625 | return findMinWidth(*s.castle(), top); |
| 626 | } else { |
| 627 | return s.dfa_min_width; |
| 628 | } |
| 629 | } |
| 630 | |
| 631 | depth findMaxWidth(const suffix_id &s) { |
| 632 | assert(s.graph() || s.castle() || s.haig() || s.dfa()); |
| 633 | if (s.graph()) { |
| 634 | return findMaxWidth(*s.graph()); |
| 635 | } else if (s.castle()) { |
| 636 | return findMaxWidth(*s.castle()); |
| 637 | } else { |
| 638 | return s.dfa_max_width; |
| 639 | } |
| 640 | } |
| 641 | |
| 642 | depth findMaxWidth(const suffix_id &s, u32 top) { |
| 643 | assert(s.graph() || s.castle() || s.haig() || s.dfa()); |
| 644 | if (s.graph()) { |
| 645 | return findMaxWidth(*s.graph(), top); |
| 646 | } else if (s.castle()) { |
| 647 | return findMaxWidth(*s.castle(), top); |
| 648 | } else { |
| 649 | return s.dfa_max_width; |
| 650 | } |
| 651 | } |
| 652 | |
| 653 | bool has_eod_accepts(const suffix_id &s) { |
| 654 | assert(s.graph() || s.castle() || s.haig() || s.dfa()); |
| 655 | if (s.graph()) { |
| 656 | /* ignore accept -> eod edge */ |
| 657 | return in_degree(s.graph()->acceptEod, *s.graph()) > 1; |
| 658 | } else if (s.castle()) { |
| 659 | return false; |
| 660 | } else if (s.dfa()) { |
| 661 | return has_eod_accepts(*s.dfa()); |
| 662 | } else { |
| 663 | return has_eod_accepts(*s.haig()); |
| 664 | } |
| 665 | } |
| 666 | |
| 667 | bool has_non_eod_accepts(const suffix_id &s) { |
| 668 | assert(s.graph() || s.castle() || s.haig() || s.dfa()); |
| 669 | if (s.graph()) { |
| 670 | return in_degree(s.graph()->accept, *s.graph()); |
| 671 | } else if (s.castle()) { |
| 672 | return true; |
| 673 | } else if (s.dfa()) { |
| 674 | return has_non_eod_accepts(*s.dfa()); |
| 675 | } else { |
| 676 | return has_non_eod_accepts(*s.haig()); |
| 677 | } |
| 678 | } |
| 679 | |
| 680 | set<u32> all_tops(const suffix_id &s) { |
| 681 | assert(s.graph() || s.castle() || s.haig() || s.dfa()); |
| 682 | if (s.graph()) { |
| 683 | flat_set<u32> tops = getTops(*s.graph()); |
| 684 | assert(!tops.empty()); |
| 685 | return {tops.begin(), tops.end()}; |
| 686 | } |
| 687 | |
| 688 | if (s.castle()) { |
| 689 | return assoc_keys(s.castle()->repeats); |
| 690 | } |
| 691 | |
| 692 | // Other types of suffix are not multi-top. |
| 693 | return {0}; |
| 694 | } |
| 695 | |
| 696 | size_t suffix_id::hash() const { |
| 697 | return hash_all(g, c, d, h, t); |
| 698 | } |
| 699 | |
| 700 | bool isAnchored(const left_id &r) { |
| 701 | assert(r.graph() || r.castle() || r.haig() || r.dfa()); |
| 702 | if (r.graph()) { |
| 703 | return isAnchored(*r.graph()); |
| 704 | } |
| 705 | if (r.dfa()) { |
| 706 | return r.dfa()->start_anchored == DEAD_STATE; |
| 707 | } |
| 708 | if (r.haig()) { |
| 709 | return r.haig()->start_anchored == DEAD_STATE; |
| 710 | } |
| 711 | |
| 712 | // All other types are explicitly anchored. |
| 713 | return true; |
| 714 | } |
| 715 | |
| 716 | depth findMinWidth(const left_id &r) { |
| 717 | assert(r.graph() || r.castle() || r.haig() || r.dfa()); |
| 718 | if (r.graph()) { |
| 719 | return findMinWidth(*r.graph()); |
| 720 | } else if (r.castle()) { |
| 721 | return findMinWidth(*r.castle()); |
| 722 | } else { |
| 723 | return r.dfa_min_width; |
| 724 | } |
| 725 | } |
| 726 | |
| 727 | depth findMaxWidth(const left_id &r) { |
| 728 | assert(r.graph() || r.castle() || r.haig() || r.dfa()); |
| 729 | if (r.graph()) { |
| 730 | return findMaxWidth(*r.graph()); |
| 731 | } else if (r.castle()) { |
| 732 | return findMaxWidth(*r.castle()); |
| 733 | } else { |
| 734 | return r.dfa_max_width; |
| 735 | } |
| 736 | } |
| 737 | |
| 738 | set<u32> all_tops(const left_id &r) { |
| 739 | assert(r.graph() || r.castle() || r.haig() || r.dfa()); |
| 740 | if (r.graph()) { |
| 741 | flat_set<u32> tops = getTops(*r.graph()); |
| 742 | return {tops.begin(), tops.end()}; |
| 743 | } |
| 744 | |
| 745 | if (r.castle()) { |
| 746 | return assoc_keys(r.castle()->repeats); |
| 747 | } |
| 748 | |
| 749 | // Other types of rose are not multi-top. |
| 750 | return {0}; |
| 751 | } |
| 752 | |
| 753 | set<u32> all_reports(const left_id &left) { |
| 754 | assert(left.graph() || left.castle() || left.haig() || left.dfa()); |
| 755 | if (left.graph()) { |
| 756 | return all_reports(*left.graph()); |
| 757 | } else if (left.castle()) { |
| 758 | return all_reports(*left.castle()); |
| 759 | } else if (left.dfa()) { |
| 760 | return all_reports(*left.dfa()); |
| 761 | } else { |
| 762 | return all_reports(*left.haig()); |
| 763 | } |
| 764 | } |
| 765 | |
| 766 | u32 num_tops(const left_id &r) { |
| 767 | return all_tops(r).size(); |
| 768 | } |
| 769 | |
| 770 | size_t left_id::hash() const { |
| 771 | return hash_all(g, c, d, h); |
| 772 | } |
| 773 | |
| 774 | u64a findMaxOffset(const set<ReportID> &reports, const ReportManager &rm) { |
| 775 | assert(!reports.empty()); |
| 776 | u64a maxOffset = 0; |
| 777 | for (const auto &report_id : reports) { |
| 778 | const Report &ir = rm.getReport(report_id); |
| 779 | if (ir.hasBounds()) { |
| 780 | maxOffset = max(maxOffset, ir.maxOffset); |
| 781 | } else { |
| 782 | return MAX_OFFSET; |
| 783 | } |
| 784 | } |
| 785 | return maxOffset; |
| 786 | } |
| 787 | |
| 788 | size_t LeftEngInfo::hash() const { |
| 789 | return hash_all(graph, castle, dfa, haig, tamarama, lag, leftfix_report); |
| 790 | } |
| 791 | |
| 792 | void LeftEngInfo::reset(void) { |
| 793 | graph.reset(); |
| 794 | castle.reset(); |
| 795 | dfa.reset(); |
| 796 | haig.reset(); |
| 797 | tamarama.reset(); |
| 798 | lag = 0; |
| 799 | leftfix_report = MO_INVALID_IDX; |
| 800 | dfa_min_width = depth(0); |
| 801 | dfa_max_width = depth::infinity(); |
| 802 | } |
| 803 | |
| 804 | LeftEngInfo::operator bool() const { |
| 805 | assert((int)!!castle + (int)!!dfa + (int)!!haig <= 1); |
| 806 | assert(!castle || !graph); |
| 807 | assert(!dfa || graph); /* dfas always have the graph as well */ |
| 808 | assert(!haig || graph); |
| 809 | return graph || castle || dfa || haig; |
| 810 | } |
| 811 | |
| 812 | u32 roseQuality(const RoseResources &res, const RoseEngine *t) { |
| 813 | /* Rose is low quality if the atable is a Mcclellan 16 or has multiple DFAs |
| 814 | */ |
| 815 | if (res.has_anchored) { |
| 816 | if (res.has_anchored_multiple) { |
| 817 | DEBUG_PRINTF("multiple atable engines\n" ); |
| 818 | return 0; |
| 819 | } |
| 820 | |
| 821 | if (res.has_anchored_large) { |
| 822 | DEBUG_PRINTF("m16 atable engine\n" ); |
| 823 | return 0; |
| 824 | } |
| 825 | } |
| 826 | |
| 827 | /* if we always run multiple engines then we are slow */ |
| 828 | u32 always_run = 0; |
| 829 | |
| 830 | if (res.has_anchored) { |
| 831 | always_run++; |
| 832 | } |
| 833 | |
| 834 | if (t->eagerIterOffset) { |
| 835 | /* eager prefixes are always run */ |
| 836 | always_run++; |
| 837 | } |
| 838 | |
| 839 | if (res.has_floating) { |
| 840 | /* TODO: ignore conditional ftables, or ftables beyond smwr region */ |
| 841 | always_run++; |
| 842 | } |
| 843 | |
| 844 | if (t->ematcherOffset) { |
| 845 | always_run++; |
| 846 | } |
| 847 | |
| 848 | /* ignore mpv outfixes as they are v good, mpv outfixes are before begin */ |
| 849 | if (t->outfixBeginQueue != t->outfixEndQueue) { |
| 850 | /* TODO: ignore outfixes > smwr region */ |
| 851 | always_run++; |
| 852 | } |
| 853 | |
| 854 | bool eod_prefix = false; |
| 855 | |
| 856 | const LeftNfaInfo *left = getLeftTable(t); |
| 857 | for (u32 i = 0; i < t->activeLeftCount; i++) { |
| 858 | if (left->eod_check) { |
| 859 | eod_prefix = true; |
| 860 | break; |
| 861 | } |
| 862 | } |
| 863 | |
| 864 | if (eod_prefix) { |
| 865 | always_run++; |
| 866 | DEBUG_PRINTF("eod prefixes are slow" ); |
| 867 | return 0; |
| 868 | } |
| 869 | |
| 870 | if (always_run > 1) { |
| 871 | DEBUG_PRINTF("we always run %u engines\n" , always_run); |
| 872 | return 0; |
| 873 | } |
| 874 | |
| 875 | return 1; |
| 876 | } |
| 877 | |
| 878 | u32 findMinOffset(const RoseBuildImpl &build, u32 lit_id) { |
| 879 | const auto &lit_vertices = build.literal_info.at(lit_id).vertices; |
| 880 | assert(!lit_vertices.empty()); |
| 881 | |
| 882 | u32 min_offset = UINT32_MAX; |
| 883 | for (const auto &v : lit_vertices) { |
| 884 | min_offset = min(min_offset, build.g[v].min_offset); |
| 885 | } |
| 886 | |
| 887 | return min_offset; |
| 888 | } |
| 889 | |
| 890 | u32 findMaxOffset(const RoseBuildImpl &build, u32 lit_id) { |
| 891 | const auto &lit_vertices = build.literal_info.at(lit_id).vertices; |
| 892 | assert(!lit_vertices.empty()); |
| 893 | |
| 894 | u32 max_offset = 0; |
| 895 | for (const auto &v : lit_vertices) { |
| 896 | max_offset = max(max_offset, build.g[v].max_offset); |
| 897 | } |
| 898 | |
| 899 | return max_offset; |
| 900 | } |
| 901 | |
| 902 | bool canEagerlyReportAtEod(const RoseBuildImpl &build, const RoseEdge &e) { |
| 903 | const auto &g = build.g; |
| 904 | const auto v = target(e, g); |
| 905 | |
| 906 | if (!build.g[v].eod_accept) { |
| 907 | return false; |
| 908 | } |
| 909 | |
| 910 | // If there's a graph between us and EOD, we shouldn't be eager. |
| 911 | if (build.g[v].left) { |
| 912 | return false; |
| 913 | } |
| 914 | |
| 915 | // Must be exactly at EOD. |
| 916 | if (g[e].minBound != 0 || g[e].maxBound != 0) { |
| 917 | return false; |
| 918 | } |
| 919 | |
| 920 | // In streaming mode, we can only eagerly report EOD for literals in the |
| 921 | // EOD-anchored table, as that's the only time we actually know where EOD |
| 922 | // is. In block mode, we always have this information. |
| 923 | const auto u = source(e, g); |
| 924 | if (build.cc.streaming && !build.isInETable(u)) { |
| 925 | return false; |
| 926 | } |
| 927 | |
| 928 | return true; |
| 929 | } |
| 930 | |
| 931 | #ifndef NDEBUG |
| 932 | /** \brief Returns true if all the graphs (NFA, DFA, Haig, etc) in this Rose |
| 933 | * graph are implementable. */ |
| 934 | bool canImplementGraphs(const RoseBuildImpl &tbi) { |
| 935 | const RoseGraph &g = tbi.g; |
| 936 | |
| 937 | // First, check the Rose leftfixes. |
| 938 | |
| 939 | for (auto v : vertices_range(g)) { |
| 940 | DEBUG_PRINTF("leftfix: check vertex %zu\n" , g[v].index); |
| 941 | |
| 942 | if (g[v].left.castle) { |
| 943 | DEBUG_PRINTF("castle ok\n" ); |
| 944 | continue; |
| 945 | } |
| 946 | if (g[v].left.dfa) { |
| 947 | DEBUG_PRINTF("dfa ok\n" ); |
| 948 | continue; |
| 949 | } |
| 950 | if (g[v].left.haig) { |
| 951 | DEBUG_PRINTF("haig ok\n" ); |
| 952 | continue; |
| 953 | } |
| 954 | if (g[v].left.graph) { |
| 955 | assert(g[v].left.graph->kind |
| 956 | == (tbi.isRootSuccessor(v) ? NFA_PREFIX : NFA_INFIX)); |
| 957 | if (!isImplementableNFA(*g[v].left.graph, nullptr, tbi.cc)) { |
| 958 | DEBUG_PRINTF("nfa prefix %zu failed (%zu vertices)\n" , |
| 959 | g[v].index, num_vertices(*g[v].left.graph)); |
| 960 | return false; |
| 961 | } |
| 962 | } |
| 963 | } |
| 964 | |
| 965 | // Suffix graphs. |
| 966 | |
| 967 | for (auto v : vertices_range(g)) { |
| 968 | DEBUG_PRINTF("suffix: check vertex %zu\n" , g[v].index); |
| 969 | |
| 970 | const RoseSuffixInfo &suffix = g[v].suffix; |
| 971 | if (suffix.castle) { |
| 972 | DEBUG_PRINTF("castle suffix ok\n" ); |
| 973 | continue; |
| 974 | } |
| 975 | if (suffix.rdfa) { |
| 976 | DEBUG_PRINTF("dfa suffix ok\n" ); |
| 977 | continue; |
| 978 | } |
| 979 | if (suffix.haig) { |
| 980 | DEBUG_PRINTF("haig suffix ok\n" ); |
| 981 | continue; |
| 982 | } |
| 983 | if (suffix.graph) { |
| 984 | assert(suffix.graph->kind == NFA_SUFFIX); |
| 985 | if (!isImplementableNFA(*suffix.graph, &tbi.rm, tbi.cc)) { |
| 986 | DEBUG_PRINTF("nfa suffix %zu failed (%zu vertices)\n" , |
| 987 | g[v].index, num_vertices(*suffix.graph)); |
| 988 | return false; |
| 989 | } |
| 990 | } |
| 991 | } |
| 992 | |
| 993 | return true; |
| 994 | } |
| 995 | |
| 996 | /** |
| 997 | * \brief True if there is an engine with a top that is not triggered by a |
| 998 | * vertex in the Rose graph. This is a consistency check used in assertions. |
| 999 | */ |
| 1000 | bool hasOrphanedTops(const RoseBuildImpl &build) { |
| 1001 | const RoseGraph &g = build.g; |
| 1002 | |
| 1003 | unordered_map<left_id, set<u32>> leftfixes; |
| 1004 | unordered_map<suffix_id, set<u32>> suffixes; |
| 1005 | |
| 1006 | for (auto v : vertices_range(g)) { |
| 1007 | if (g[v].left) { |
| 1008 | set<u32> &tops = leftfixes[g[v].left]; |
| 1009 | if (!build.isRootSuccessor(v)) { |
| 1010 | // Tops for infixes come from the in-edges. |
| 1011 | for (const auto &e : in_edges_range(v, g)) { |
| 1012 | tops.insert(g[e].rose_top); |
| 1013 | } |
| 1014 | } |
| 1015 | } |
| 1016 | if (g[v].suffix) { |
| 1017 | suffixes[g[v].suffix].insert(g[v].suffix.top); |
| 1018 | } |
| 1019 | } |
| 1020 | |
| 1021 | for (const auto &e : leftfixes) { |
| 1022 | if (all_tops(e.first) != e.second) { |
| 1023 | DEBUG_PRINTF("rose tops (%s) don't match rose graph (%s)\n" , |
| 1024 | as_string_list(all_tops(e.first)).c_str(), |
| 1025 | as_string_list(e.second).c_str()); |
| 1026 | return true; |
| 1027 | } |
| 1028 | } |
| 1029 | |
| 1030 | for (const auto &e : suffixes) { |
| 1031 | if (all_tops(e.first) != e.second) { |
| 1032 | DEBUG_PRINTF("suffix tops (%s) don't match rose graph (%s)\n" , |
| 1033 | as_string_list(all_tops(e.first)).c_str(), |
| 1034 | as_string_list(e.second).c_str()); |
| 1035 | return true; |
| 1036 | } |
| 1037 | } |
| 1038 | |
| 1039 | return false; |
| 1040 | } |
| 1041 | |
| 1042 | #endif // NDEBUG |
| 1043 | |
| 1044 | } // namespace ue2 |
| 1045 | |