| 1 | /* |
| 2 | * Copyright (c) 2015-2017, Intel Corporation |
| 3 | * |
| 4 | * Redistribution and use in source and binary forms, with or without |
| 5 | * modification, are permitted provided that the following conditions are met: |
| 6 | * |
| 7 | * * Redistributions of source code must retain the above copyright notice, |
| 8 | * this list of conditions and the following disclaimer. |
| 9 | * * Redistributions in binary form must reproduce the above copyright |
| 10 | * notice, this list of conditions and the following disclaimer in the |
| 11 | * documentation and/or other materials provided with the distribution. |
| 12 | * * Neither the name of Intel Corporation nor the names of its contributors |
| 13 | * may be used to endorse or promote products derived from this software |
| 14 | * without specific prior written permission. |
| 15 | * |
| 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| 17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| 20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| 24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| 25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| 26 | * POSSIBILITY OF SUCH DAMAGE. |
| 27 | */ |
| 28 | |
| 29 | #include "rose_build_impl.h" |
| 30 | |
| 31 | #include "ue2common.h" |
| 32 | #include "grey.h" |
| 33 | #include "rose_build_add_internal.h" |
| 34 | #include "rose_build_anchored.h" |
| 35 | #include "rose_in_util.h" |
| 36 | #include "hwlm/hwlm_literal.h" |
| 37 | #include "nfagraph/ng_depth.h" |
| 38 | #include "nfagraph/ng_dump.h" |
| 39 | #include "nfagraph/ng_holder.h" |
| 40 | #include "nfagraph/ng_limex.h" |
| 41 | #include "nfagraph/ng_reports.h" |
| 42 | #include "nfagraph/ng_util.h" |
| 43 | #include "nfagraph/ng_width.h" |
| 44 | #include "util/charreach.h" |
| 45 | #include "util/charreach_util.h" |
| 46 | #include "util/compare.h" |
| 47 | #include "util/compile_context.h" |
| 48 | #include "util/container.h" |
| 49 | #include "util/dump_charclass.h" |
| 50 | #include "util/graph.h" |
| 51 | #include "util/make_unique.h" |
| 52 | #include "util/ue2string.h" |
| 53 | #include "util/verify_types.h" |
| 54 | |
| 55 | #include <algorithm> |
| 56 | #include <map> |
| 57 | #include <set> |
| 58 | #include <string> |
| 59 | #include <vector> |
| 60 | #include <utility> |
| 61 | |
| 62 | using namespace std; |
| 63 | |
| 64 | namespace ue2 { |
| 65 | |
| 66 | #define MIN_MASK_LIT_LEN 2 |
| 67 | #define MAX_MASK_SIZE 255 |
| 68 | #define MAX_MASK_LITS 30 |
| 69 | |
| 70 | static |
| 71 | void findMaskLiteral(const vector<CharReach> &mask, bool streaming, |
| 72 | ue2_literal *lit, u32 *offset, const Grey &grey) { |
| 73 | bool case_fixed = false; |
| 74 | bool nocase = false; |
| 75 | |
| 76 | size_t best_begin = 0; |
| 77 | size_t best_end = 0; |
| 78 | size_t best_len = 0; |
| 79 | |
| 80 | size_t begin = 0; |
| 81 | size_t end = 0; |
| 82 | |
| 83 | for (size_t i = 0; i < mask.size(); i++) { |
| 84 | bool fail = false; |
| 85 | if (mask[i].count() != 1 && !mask[i].isCaselessChar()) { |
| 86 | DEBUG_PRINTF("hit non-literal char, resetting at %zu\n" , i); |
| 87 | fail = true; |
| 88 | } |
| 89 | |
| 90 | if (!fail && streaming && (end >= grey.maxHistoryAvailable + 1)) { |
| 91 | DEBUG_PRINTF("hit literal limit, resetting at %zu\n" , i); |
| 92 | fail = true; |
| 93 | } |
| 94 | |
| 95 | if (!fail && case_fixed && mask[i].isAlpha()) { |
| 96 | if (nocase && mask[i].count() != 2) { |
| 97 | fail = true; |
| 98 | } |
| 99 | |
| 100 | if (!nocase && mask[i].count() != 1) { |
| 101 | fail = true; |
| 102 | } |
| 103 | } |
| 104 | |
| 105 | if (fail) { |
| 106 | case_fixed = false; |
| 107 | nocase = false; |
| 108 | size_t len = end - begin; |
| 109 | bool better = len > best_len; |
| 110 | if (better) { |
| 111 | best_begin = begin; |
| 112 | best_end = end; |
| 113 | best_len = len; |
| 114 | } |
| 115 | begin = i + 1; |
| 116 | end = i + 1; |
| 117 | } else { |
| 118 | assert(end == i); |
| 119 | end = i + 1; |
| 120 | |
| 121 | if (mask[i].isAlpha()) { |
| 122 | case_fixed = true; |
| 123 | nocase = mask[i].count() == 2; |
| 124 | } |
| 125 | } |
| 126 | } |
| 127 | |
| 128 | size_t len = end - begin; |
| 129 | /* Everybody would rather be trigger towards the end */ |
| 130 | bool better = len >= best_len && mask.size() - end <= MAX_DELAY; |
| 131 | |
| 132 | if (better) { |
| 133 | best_begin = begin; |
| 134 | best_end = end; |
| 135 | best_len = len; |
| 136 | } |
| 137 | |
| 138 | for (size_t i = best_begin; i < best_end; i++) { |
| 139 | assert(mask[i].count() == 1 || mask[i].count() == 2); |
| 140 | lit->push_back(mask[i].find_first(), mask[i].count() > 1); |
| 141 | } |
| 142 | |
| 143 | *offset = verify_u32(best_begin); |
| 144 | } |
| 145 | |
| 146 | static |
| 147 | bool initFmlCandidates(const CharReach &cr, vector<ue2_literal> &cand) { |
| 148 | for (size_t i = cr.find_first(); i != cr.npos; i = cr.find_next(i)) { |
| 149 | char c = (char)i; |
| 150 | bool nocase = myisupper(c) && cr.test(mytolower(c)); |
| 151 | if (myislower(c) && cr.test(mytoupper(c))) { |
| 152 | continue; |
| 153 | } |
| 154 | |
| 155 | if (cand.size() >= MAX_MASK_LITS) { |
| 156 | DEBUG_PRINTF("hit lit limit of %u\n" , MAX_MASK_LITS); |
| 157 | return false; |
| 158 | } |
| 159 | |
| 160 | cand.emplace_back(c, nocase); |
| 161 | } |
| 162 | |
| 163 | assert(cand.size() <= MAX_MASK_LITS); |
| 164 | return !cand.empty(); |
| 165 | } |
| 166 | |
| 167 | static |
| 168 | bool expandFmlCandidates(const CharReach &cr, vector<ue2_literal> &curr, |
| 169 | vector<ue2_literal> &cand) { |
| 170 | DEBUG_PRINTF("expanding string with cr of %zu\n" , cr.count()); |
| 171 | DEBUG_PRINTF(" current cand list size %zu\n" , cand.size()); |
| 172 | |
| 173 | curr.clear(); |
| 174 | |
| 175 | for (size_t i = cr.find_first(); i != cr.npos; i = cr.find_next(i)) { |
| 176 | char c = (char)i; |
| 177 | bool nocase = myisupper(c) && cr.test(mytolower(c)); |
| 178 | if (myislower(c) && cr.test(mytoupper(c))) { |
| 179 | continue; |
| 180 | } |
| 181 | |
| 182 | for (const auto &lit : cand) { |
| 183 | if (curr.size() >= MAX_MASK_LITS) { |
| 184 | DEBUG_PRINTF("hit lit limit of %u\n" , MAX_MASK_LITS); |
| 185 | return false; |
| 186 | } |
| 187 | |
| 188 | curr.push_back(lit); |
| 189 | curr.back().push_back(c, nocase); |
| 190 | } |
| 191 | } |
| 192 | |
| 193 | if (curr.back().length() > MAX_MASK2_WIDTH && |
| 194 | any_of(begin(curr), end(curr), mixed_sensitivity)) { |
| 195 | DEBUG_PRINTF("mixed-sensitivity lit is too long, stopping\n" ); |
| 196 | return false; |
| 197 | } |
| 198 | |
| 199 | assert(curr.size() <= MAX_MASK_LITS); |
| 200 | cand.swap(curr); |
| 201 | return true; |
| 202 | } |
| 203 | |
| 204 | static |
| 205 | u32 scoreFmlCandidates(const vector<ue2_literal> &cand) { |
| 206 | if (cand.empty()) { |
| 207 | DEBUG_PRINTF("no candidates\n" ); |
| 208 | return 0; |
| 209 | } |
| 210 | |
| 211 | const u32 len = cand.back().length(); |
| 212 | |
| 213 | DEBUG_PRINTF("length = %u count %zu\n" , len, cand.size()); |
| 214 | u32 min_period = len; |
| 215 | |
| 216 | for (const auto &lit : cand) { |
| 217 | DEBUG_PRINTF("candidate: %s\n" , dumpString(lit).c_str()); |
| 218 | u32 period = lit.length() - maxStringSelfOverlap(lit); |
| 219 | min_period = min(min_period, period); |
| 220 | } |
| 221 | DEBUG_PRINTF("min_period %u\n" , min_period); |
| 222 | u32 length_score = |
| 223 | (5 * min_period + len) * (cand.back().any_nocase() ? 90 : 100); |
| 224 | u32 count_penalty; |
| 225 | if (len > 4) { |
| 226 | count_penalty = 9 * len * cand.size(); |
| 227 | } else { |
| 228 | count_penalty = 5 * cand.size(); |
| 229 | } |
| 230 | if (length_score <= count_penalty) { |
| 231 | return 1; |
| 232 | } |
| 233 | return length_score - count_penalty; |
| 234 | } |
| 235 | |
| 236 | /* favours later literals */ |
| 237 | static |
| 238 | bool findMaskLiterals(const vector<CharReach> &mask, vector<ue2_literal> *lit, |
| 239 | u32 *minBound, u32 *length) { |
| 240 | *minBound = 0; |
| 241 | *length = 0; |
| 242 | |
| 243 | vector<ue2_literal> candidates, best_candidates, curr_candidates; |
| 244 | u32 best_score = 0; |
| 245 | u32 best_minOffset = 0; |
| 246 | |
| 247 | for (auto it = mask.begin(); it != mask.end(); ++it) { |
| 248 | candidates.clear(); |
| 249 | if (!initFmlCandidates(*it, candidates)) { |
| 250 | DEBUG_PRINTF("failed to init\n" ); |
| 251 | continue; |
| 252 | } |
| 253 | DEBUG_PRINTF("++\n" ); |
| 254 | auto jt = it; |
| 255 | while (jt != mask.begin()) { |
| 256 | --jt; |
| 257 | DEBUG_PRINTF("--\n" ); |
| 258 | if (!expandFmlCandidates(*jt, curr_candidates, candidates)) { |
| 259 | DEBUG_PRINTF("expansion stopped\n" ); |
| 260 | break; |
| 261 | } |
| 262 | } |
| 263 | |
| 264 | // Candidates have been expanded in reverse order. |
| 265 | for (auto &cand : candidates) { |
| 266 | cand = reverse_literal(cand); |
| 267 | } |
| 268 | |
| 269 | u32 score = scoreFmlCandidates(candidates); |
| 270 | DEBUG_PRINTF("scored %u for literal set of size %zu\n" , score, |
| 271 | candidates.size()); |
| 272 | if (!candidates.empty() && score >= best_score) { |
| 273 | best_minOffset = it - mask.begin() - candidates.back().length() + 1; |
| 274 | best_candidates.swap(candidates); |
| 275 | best_score = score; |
| 276 | } |
| 277 | } |
| 278 | |
| 279 | if (!best_score) { |
| 280 | DEBUG_PRINTF("no lits\n" ); |
| 281 | return false; |
| 282 | } |
| 283 | |
| 284 | *minBound = best_minOffset; |
| 285 | *length = best_candidates.back().length(); |
| 286 | |
| 287 | DEBUG_PRINTF("best minbound %u length %u\n" , *minBound, *length); |
| 288 | |
| 289 | assert(all_of_in(best_candidates, [&](const ue2_literal &s) { |
| 290 | return s.length() == *length; |
| 291 | })); |
| 292 | |
| 293 | *lit = std::move(best_candidates); |
| 294 | return true; |
| 295 | } |
| 296 | |
| 297 | static |
| 298 | unique_ptr<NGHolder> buildMaskLhs(bool anchored, u32 prefix_len, |
| 299 | const vector<CharReach> &mask) { |
| 300 | DEBUG_PRINTF("build %slhs len %u/%zu\n" , anchored ? "anc " : "" , prefix_len, |
| 301 | mask.size()); |
| 302 | |
| 303 | unique_ptr<NGHolder> lhs = ue2::make_unique<NGHolder>(NFA_PREFIX); |
| 304 | |
| 305 | assert(prefix_len); |
| 306 | assert(mask.size() >= prefix_len); |
| 307 | NFAVertex pred = anchored ? lhs->start : lhs->startDs; |
| 308 | |
| 309 | u32 m_idx = 0; |
| 310 | while (prefix_len--) { |
| 311 | NFAVertex v = add_vertex(*lhs); |
| 312 | (*lhs)[v].char_reach = mask[m_idx++]; |
| 313 | add_edge(pred, v, *lhs); |
| 314 | pred = v; |
| 315 | } |
| 316 | add_edge(pred, lhs->accept, *lhs); |
| 317 | (*lhs)[pred].reports.insert(0); |
| 318 | |
| 319 | return lhs; |
| 320 | } |
| 321 | |
| 322 | static |
| 323 | void buildLiteralMask(const vector<CharReach> &mask, vector<u8> &msk, |
| 324 | vector<u8> &cmp, u32 delay) { |
| 325 | msk.clear(); |
| 326 | cmp.clear(); |
| 327 | if (mask.size() <= delay) { |
| 328 | return; |
| 329 | } |
| 330 | |
| 331 | // Construct an and/cmp mask from our mask ending at delay positions before |
| 332 | // the end of the literal, with max length HWLM_MASKLEN. |
| 333 | |
| 334 | auto ite = mask.end() - delay; |
| 335 | auto it = ite - min(size_t{HWLM_MASKLEN}, mask.size() - delay); |
| 336 | |
| 337 | for (; it != ite; ++it) { |
| 338 | msk.push_back(0); |
| 339 | cmp.push_back(0); |
| 340 | make_and_cmp_mask(*it, &msk.back(), &cmp.back()); |
| 341 | } |
| 342 | |
| 343 | assert(msk.size() == cmp.size()); |
| 344 | assert(msk.size() <= HWLM_MASKLEN); |
| 345 | } |
| 346 | |
| 347 | static |
| 348 | bool validateTransientMask(const vector<CharReach> &mask, bool anchored, |
| 349 | bool eod, const Grey &grey) { |
| 350 | assert(!mask.empty()); |
| 351 | |
| 352 | // An EOD anchored mask requires that everything fit into history, while an |
| 353 | // ordinary floating case can handle one byte more (i.e., max history size |
| 354 | // and one byte in the buffer). |
| 355 | const size_t max_width = grey.maxHistoryAvailable + (eod ? 0 : 1); |
| 356 | if (mask.size() > max_width) { |
| 357 | DEBUG_PRINTF("mask too long for max available history\n" ); |
| 358 | return false; |
| 359 | } |
| 360 | |
| 361 | /* although anchored masks cannot be transient, short masks may be placed |
| 362 | * into the atable. */ |
| 363 | if (anchored && mask.size() > grey.maxAnchoredRegion) { |
| 364 | return false; |
| 365 | } |
| 366 | |
| 367 | vector<ue2_literal> lits; |
| 368 | u32 lit_minBound; /* minBound of each literal in lit */ |
| 369 | u32 lit_length; /* length of each literal in lit */ |
| 370 | if (!findMaskLiterals(mask, &lits, &lit_minBound, &lit_length)) { |
| 371 | DEBUG_PRINTF("failed to find any lits\n" ); |
| 372 | return false; |
| 373 | } |
| 374 | |
| 375 | if (lits.empty()) { |
| 376 | return false; |
| 377 | } |
| 378 | |
| 379 | const u32 delay = mask.size() - lit_length - lit_minBound; |
| 380 | if (delay > MAX_DELAY) { |
| 381 | DEBUG_PRINTF("delay %u is too much\n" , delay); |
| 382 | return false; |
| 383 | } |
| 384 | |
| 385 | if (lit_length == 1 && lits.size() > 3) { |
| 386 | DEBUG_PRINTF("no decent trigger\n" ); |
| 387 | return false; |
| 388 | } |
| 389 | |
| 390 | // Mixed-sensitivity literals require benefits masks to implement, and thus |
| 391 | // have a maximum length. This has been taken into account in |
| 392 | // findMaskLiterals. |
| 393 | assert(lit_length <= MAX_MASK2_WIDTH || |
| 394 | none_of(begin(lits), end(lits), mixed_sensitivity)); |
| 395 | |
| 396 | // Build the HWLM literal mask. |
| 397 | vector<u8> msk, cmp; |
| 398 | if (grey.roseHamsterMasks) { |
| 399 | buildLiteralMask(mask, msk, cmp, delay); |
| 400 | } |
| 401 | |
| 402 | // We consider the HWLM mask length to run from the first non-zero byte to |
| 403 | // the end, and let max(mask length, literal length) be the effective |
| 404 | // literal length. |
| 405 | // |
| 406 | // A one-byte literal with no mask is too short, but a one-byte literal |
| 407 | // with a few bytes of mask information is OK. |
| 408 | |
| 409 | u32 msk_length = distance(find_if(begin(msk), end(msk), |
| 410 | [](u8 v) { return v != 0; }), end(msk)); |
| 411 | u32 eff_lit_length = max(lit_length, msk_length); |
| 412 | DEBUG_PRINTF("msk_length=%u, eff_lit_length = %u\n" , msk_length, |
| 413 | eff_lit_length); |
| 414 | |
| 415 | if (eff_lit_length < MIN_MASK_LIT_LEN) { |
| 416 | DEBUG_PRINTF("literals too short\n" ); |
| 417 | return false; |
| 418 | } |
| 419 | |
| 420 | DEBUG_PRINTF("mask is ok\n" ); |
| 421 | return true; |
| 422 | } |
| 423 | |
| 424 | static |
| 425 | bool maskIsNeeded(const ue2_literal &lit, const NGHolder &g) { |
| 426 | flat_set<NFAVertex> curr = {g.accept}; |
| 427 | flat_set<NFAVertex> next; |
| 428 | |
| 429 | for (auto it = lit.rbegin(), ite = lit.rend(); it != ite; ++it) { |
| 430 | const CharReach &cr = *it; |
| 431 | DEBUG_PRINTF("check %s\n" , describeClass(*it).c_str()); |
| 432 | next.clear(); |
| 433 | for (auto v : curr) { |
| 434 | for (auto u : inv_adjacent_vertices_range(v, g)) { |
| 435 | if (isSubsetOf(cr, g[u].char_reach)) { |
| 436 | next.insert(u); |
| 437 | } |
| 438 | } |
| 439 | } |
| 440 | if (next.empty()) { |
| 441 | DEBUG_PRINTF("no path to start\n" ); |
| 442 | return true; |
| 443 | } |
| 444 | curr.swap(next); |
| 445 | } |
| 446 | |
| 447 | for (auto v : curr) { |
| 448 | for (auto u : inv_adjacent_vertices_range(v, g)) { |
| 449 | if (u == g.start || u == g.startDs) { |
| 450 | DEBUG_PRINTF("literal spans graph from start to accept\n" ); |
| 451 | return false; |
| 452 | |
| 453 | } |
| 454 | } |
| 455 | } |
| 456 | |
| 457 | DEBUG_PRINTF("literal doesn't reach start\n" ); |
| 458 | return true; |
| 459 | } |
| 460 | |
| 461 | static |
| 462 | void addTransientMask(RoseBuildImpl &build, const vector<CharReach> &mask, |
| 463 | const flat_set<ReportID> &reports, bool anchored, |
| 464 | bool eod) { |
| 465 | vector<ue2_literal> lits; |
| 466 | u32 lit_minBound; /* minBound of each literal in lit */ |
| 467 | u32 lit_length; /* length of each literal in lit */ |
| 468 | if (!findMaskLiterals(mask, &lits, &lit_minBound, &lit_length)) { |
| 469 | DEBUG_PRINTF("failed to find any lits\n" ); |
| 470 | assert(0); |
| 471 | return; |
| 472 | } |
| 473 | |
| 474 | DEBUG_PRINTF("%zu literals, minBound=%u, length=%u\n" , lits.size(), |
| 475 | lit_minBound, lit_length); |
| 476 | |
| 477 | if (lits.empty()) { |
| 478 | assert(0); |
| 479 | return; |
| 480 | } |
| 481 | |
| 482 | u32 delay = mask.size() - lit_length - lit_minBound; |
| 483 | assert(delay <= MAX_DELAY); |
| 484 | DEBUG_PRINTF("delay=%u\n" , delay); |
| 485 | |
| 486 | shared_ptr<NGHolder> mask_graph = buildMaskLhs(anchored, mask.size(), mask); |
| 487 | |
| 488 | u32 mask_lag = 0; /* TODO */ |
| 489 | |
| 490 | // Everyone gets the same report ID. |
| 491 | ReportID mask_report = build.getNewNfaReport(); |
| 492 | set_report(*mask_graph, mask_report); |
| 493 | |
| 494 | // Build the HWLM literal mask. |
| 495 | vector<u8> msk, cmp; |
| 496 | if (build.cc.grey.roseHamsterMasks) { |
| 497 | buildLiteralMask(mask, msk, cmp, delay); |
| 498 | } |
| 499 | |
| 500 | /* adjust bounds to be relative to trigger rather than mask */ |
| 501 | const u32 v_min_offset = add_rose_depth(0, mask.size()); |
| 502 | const u32 v_max_offset = |
| 503 | add_rose_depth(anchored ? 0 : ROSE_BOUND_INF, mask.size()); |
| 504 | |
| 505 | RoseGraph &g = build.g; |
| 506 | |
| 507 | // By default, masked literals go into the floating table (except for eod |
| 508 | // cases). |
| 509 | enum rose_literal_table table = ROSE_FLOATING; |
| 510 | |
| 511 | RoseVertex eod_v = RoseGraph::null_vertex(); |
| 512 | if (eod) { |
| 513 | eod_v = add_vertex(g); |
| 514 | g[eod_v].eod_accept = true; |
| 515 | insert(&g[eod_v].reports, reports); |
| 516 | g[eod_v].min_offset = v_min_offset; |
| 517 | g[eod_v].max_offset = v_max_offset; |
| 518 | |
| 519 | // Note: because this is a transient mask, we know that we can match it |
| 520 | // completely inside the history buffer. So, using the EOD literal |
| 521 | // table is always safe. |
| 522 | table = ROSE_EOD_ANCHORED; |
| 523 | |
| 524 | // Widen the EOD table window to cover the mask. |
| 525 | ENSURE_AT_LEAST(&build.ematcher_region_size, mask.size()); |
| 526 | } |
| 527 | |
| 528 | const flat_set<ReportID> no_reports; |
| 529 | |
| 530 | for (const auto &lit : lits) { |
| 531 | u32 lit_id = build.getLiteralId(lit, msk, cmp, delay, table); |
| 532 | const RoseVertex parent = anchored ? build.anchored_root : build.root; |
| 533 | bool use_mask = delay || maskIsNeeded(lit, *mask_graph); |
| 534 | |
| 535 | auto v = createVertex(&build, parent, 0, ROSE_BOUND_INF, lit_id, |
| 536 | lit.length(), eod ? no_reports : reports); |
| 537 | |
| 538 | if (use_mask) { |
| 539 | g[v].left.graph = mask_graph; |
| 540 | g[v].left.lag = mask_lag; |
| 541 | g[v].left.leftfix_report = mask_report; |
| 542 | } else { |
| 543 | // Make sure our edge bounds are correct. |
| 544 | RoseEdge e = edge(parent, v, g); |
| 545 | g[e].minBound = 0; |
| 546 | g[e].maxBound = anchored ? 0 : ROSE_BOUND_INF; |
| 547 | g[e].history = anchored ? ROSE_ROLE_HISTORY_ANCH |
| 548 | : ROSE_ROLE_HISTORY_NONE; |
| 549 | } |
| 550 | |
| 551 | // Set offsets correctly. |
| 552 | g[v].min_offset = v_min_offset; |
| 553 | g[v].max_offset = v_max_offset; |
| 554 | |
| 555 | if (eod) { |
| 556 | RoseEdge e = add_edge(v, eod_v, g); |
| 557 | g[e].minBound = 0; |
| 558 | g[e].maxBound = 0; |
| 559 | g[e].history = ROSE_ROLE_HISTORY_LAST_BYTE; |
| 560 | } |
| 561 | } |
| 562 | } |
| 563 | |
| 564 | static |
| 565 | unique_ptr<NGHolder> buildMaskRhs(const flat_set<ReportID> &reports, |
| 566 | const vector<CharReach> &mask, |
| 567 | u32 suffix_len) { |
| 568 | assert(suffix_len); |
| 569 | assert(mask.size() > suffix_len); |
| 570 | |
| 571 | unique_ptr<NGHolder> rhs = ue2::make_unique<NGHolder>(NFA_SUFFIX); |
| 572 | NGHolder &h = *rhs; |
| 573 | |
| 574 | NFAVertex succ = h.accept; |
| 575 | u32 m_idx = mask.size() - 1; |
| 576 | while (suffix_len--) { |
| 577 | NFAVertex u = add_vertex(h); |
| 578 | if (succ == h.accept) { |
| 579 | h[u].reports.insert(reports.begin(), reports.end()); |
| 580 | } |
| 581 | h[u].char_reach = mask[m_idx--]; |
| 582 | add_edge(u, succ, h); |
| 583 | succ = u; |
| 584 | } |
| 585 | |
| 586 | NFAEdge e = add_edge(h.start, succ, h); |
| 587 | h[e].tops.insert(DEFAULT_TOP); |
| 588 | |
| 589 | return rhs; |
| 590 | } |
| 591 | |
| 592 | static |
| 593 | void doAddMask(RoseBuildImpl &tbi, bool anchored, const vector<CharReach> &mask, |
| 594 | const ue2_literal &lit, u32 prefix_len, u32 suffix_len, |
| 595 | const flat_set<ReportID> &reports) { |
| 596 | /* Note: bounds are relative to literal start */ |
| 597 | RoseInGraph ig; |
| 598 | RoseInVertex s = add_vertex(RoseInVertexProps::makeStart(anchored), ig); |
| 599 | RoseInVertex v = add_vertex(RoseInVertexProps::makeLiteral(lit), ig); |
| 600 | |
| 601 | DEBUG_PRINTF("pref + lit = %u\n" , prefix_len); |
| 602 | assert(prefix_len >= lit.length()); |
| 603 | |
| 604 | // prefix len is relative to end of literal. |
| 605 | u32 minBound = prefix_len - lit.length(); |
| 606 | |
| 607 | if (minBound) { |
| 608 | if (anchored && prefix_len > tbi.cc.grey.maxAnchoredRegion) { |
| 609 | DEBUG_PRINTF("too deep\n" ); |
| 610 | /* see if there is an anchored literal we can also hang off */ |
| 611 | |
| 612 | ue2_literal lit2; |
| 613 | u32 lit2_offset; |
| 614 | vector<CharReach> mask2 = mask; |
| 615 | assert(mask2.size() > tbi.cc.grey.maxAnchoredRegion); |
| 616 | mask2.resize(MIN(tbi.cc.grey.maxAnchoredRegion, minBound)); |
| 617 | |
| 618 | findMaskLiteral(mask2, tbi.cc.streaming, &lit2, &lit2_offset, |
| 619 | tbi.cc.grey); |
| 620 | |
| 621 | if (lit2.length() >= MIN_MASK_LIT_LEN) { |
| 622 | u32 prefix2_len = lit2_offset + lit2.length(); |
| 623 | assert(prefix2_len < minBound); |
| 624 | RoseInVertex u |
| 625 | = add_vertex(RoseInVertexProps::makeLiteral(lit2), ig); |
| 626 | if (lit2_offset){ |
| 627 | DEBUG_PRINTF("building lhs (off %u)\n" , lit2_offset); |
| 628 | shared_ptr<NGHolder> lhs2 |
| 629 | = buildMaskLhs(true, lit2_offset, mask); |
| 630 | add_edge(s, u, RoseInEdgeProps(lhs2, lit2.length()), ig); |
| 631 | } else { |
| 632 | add_edge(s, u, RoseInEdgeProps(0, 0), ig); |
| 633 | } |
| 634 | |
| 635 | /* midfix */ |
| 636 | DEBUG_PRINTF("building mhs\n" ); |
| 637 | vector<CharReach> mask3(mask.begin() + prefix2_len, mask.end()); |
| 638 | u32 overlap = maxOverlap(lit2, lit, 0); |
| 639 | u32 delay = lit.length() - overlap; |
| 640 | shared_ptr<NGHolder> mhs |
| 641 | = buildMaskLhs(true, minBound - prefix2_len + overlap, |
| 642 | mask3); |
| 643 | mhs->kind = NFA_INFIX; |
| 644 | setTops(*mhs); |
| 645 | add_edge(u, v, RoseInEdgeProps(mhs, delay), ig); |
| 646 | |
| 647 | DEBUG_PRINTF("add anch literal too!\n" ); |
| 648 | goto do_rhs; |
| 649 | } |
| 650 | } |
| 651 | |
| 652 | shared_ptr<NGHolder> lhs = buildMaskLhs(anchored, minBound, mask); |
| 653 | add_edge(s, v, RoseInEdgeProps(lhs, lit.length()), ig); |
| 654 | } else { |
| 655 | u32 maxBound = anchored ? minBound : ROSE_BOUND_INF; |
| 656 | add_edge(s, v, RoseInEdgeProps(minBound, maxBound), ig); |
| 657 | } |
| 658 | |
| 659 | do_rhs: |
| 660 | if (suffix_len) { |
| 661 | shared_ptr<NGHolder> rhs = buildMaskRhs(reports, mask, suffix_len); |
| 662 | RoseInVertex a = |
| 663 | add_vertex(RoseInVertexProps::makeAccept(set<ReportID>()), ig); |
| 664 | add_edge(v, a, RoseInEdgeProps(rhs, 0), ig); |
| 665 | } else { |
| 666 | /* Note: masks have no eod connections */ |
| 667 | RoseInVertex a |
| 668 | = add_vertex(RoseInVertexProps::makeAccept(reports), ig); |
| 669 | add_edge(v, a, RoseInEdgeProps(0U, 0U), ig); |
| 670 | } |
| 671 | |
| 672 | calcVertexOffsets(ig); |
| 673 | |
| 674 | bool rv = tbi.addRose(ig, false); |
| 675 | |
| 676 | assert(rv); /* checkAllowMask should have prevented this */ |
| 677 | if (!rv) { |
| 678 | throw std::exception(); |
| 679 | } |
| 680 | } |
| 681 | |
| 682 | static |
| 683 | bool checkAllowMask(const vector<CharReach> &mask, ue2_literal *lit, |
| 684 | u32 *prefix_len, u32 *suffix_len, |
| 685 | const CompileContext &cc) { |
| 686 | assert(!mask.empty()); |
| 687 | u32 lit_offset; |
| 688 | findMaskLiteral(mask, cc.streaming, lit, &lit_offset, cc.grey); |
| 689 | |
| 690 | if (lit->length() < MIN_MASK_LIT_LEN && lit->length() != mask.size()) { |
| 691 | DEBUG_PRINTF("need more literal - bad mask\n" ); |
| 692 | return false; |
| 693 | } |
| 694 | |
| 695 | DEBUG_PRINTF("mask lit '%s', len=%zu at offset=%u\n" , |
| 696 | dumpString(*lit).c_str(), lit->length(), lit_offset); |
| 697 | |
| 698 | assert(!cc.streaming || lit->length() <= cc.grey.maxHistoryAvailable + 1); |
| 699 | |
| 700 | /* literal is included in the prefix nfa so that matches from the prefix |
| 701 | * can't occur in the history buffer - probably should tweak the NFA API |
| 702 | * to allow such matches not to be suppressed */ |
| 703 | *prefix_len = lit_offset + lit->length(); |
| 704 | *suffix_len = mask.size() - *prefix_len; |
| 705 | DEBUG_PRINTF("prefix_len=%u, suffix_len=%u\n" , *prefix_len, *suffix_len); |
| 706 | |
| 707 | /* check if we can backtrack sufficiently */ |
| 708 | if (cc.streaming && *prefix_len > cc.grey.maxHistoryAvailable + 1) { |
| 709 | DEBUG_PRINTF("too much lag\n" ); |
| 710 | return false; |
| 711 | } |
| 712 | |
| 713 | if (*suffix_len > MAX_MASK_SIZE || *prefix_len > MAX_MASK_SIZE) { |
| 714 | DEBUG_PRINTF("too big\n" ); |
| 715 | return false; |
| 716 | } |
| 717 | |
| 718 | return true; |
| 719 | } |
| 720 | |
| 721 | bool RoseBuildImpl::add(bool anchored, const vector<CharReach> &mask, |
| 722 | const flat_set<ReportID> &reports) { |
| 723 | if (validateTransientMask(mask, anchored, false, cc.grey)) { |
| 724 | bool eod = false; |
| 725 | addTransientMask(*this, mask, reports, anchored, eod); |
| 726 | return true; |
| 727 | } |
| 728 | |
| 729 | ue2_literal lit; |
| 730 | u32 prefix_len = 0; |
| 731 | u32 suffix_len = 0; |
| 732 | |
| 733 | if (!checkAllowMask(mask, &lit, &prefix_len, &suffix_len, cc)) { |
| 734 | return false; |
| 735 | } |
| 736 | |
| 737 | /* we know that the mask can be handled now, start playing with the rose |
| 738 | * graph */ |
| 739 | doAddMask(*this, anchored, mask, lit, prefix_len, suffix_len, reports); |
| 740 | |
| 741 | return true; |
| 742 | } |
| 743 | |
| 744 | bool RoseBuildImpl::validateMask(const vector<CharReach> &mask, |
| 745 | UNUSED const flat_set<ReportID> &reports, |
| 746 | bool anchored, bool eod) const { |
| 747 | return validateTransientMask(mask, anchored, eod, cc.grey); |
| 748 | } |
| 749 | |
| 750 | static |
| 751 | unique_ptr<NGHolder> makeAnchoredGraph(const vector<CharReach> &mask, |
| 752 | const flat_set<ReportID> &reports, |
| 753 | bool eod) { |
| 754 | auto gp = ue2::make_unique<NGHolder>(); |
| 755 | NGHolder &g = *gp; |
| 756 | |
| 757 | NFAVertex u = g.start; |
| 758 | for (const auto &cr : mask) { |
| 759 | NFAVertex v = add_vertex(g); |
| 760 | g[v].char_reach = cr; |
| 761 | add_edge(u, v, g); |
| 762 | u = v; |
| 763 | } |
| 764 | |
| 765 | |
| 766 | g[u].reports = reports; |
| 767 | add_edge(u, eod ? g.acceptEod : g.accept, g); |
| 768 | |
| 769 | return gp; |
| 770 | } |
| 771 | |
| 772 | static |
| 773 | bool addAnchoredMask(RoseBuildImpl &build, const vector<CharReach> &mask, |
| 774 | const flat_set<ReportID> &reports, bool eod) { |
| 775 | if (!build.cc.grey.allowAnchoredAcyclic) { |
| 776 | return false; |
| 777 | } |
| 778 | |
| 779 | auto g = makeAnchoredGraph(mask, reports, eod); |
| 780 | assert(g); |
| 781 | |
| 782 | return build.addAnchoredAcyclic(*g); |
| 783 | } |
| 784 | |
| 785 | void RoseBuildImpl::addMask(const vector<CharReach> &mask, |
| 786 | const flat_set<ReportID> &reports, bool anchored, |
| 787 | bool eod) { |
| 788 | if (anchored && addAnchoredMask(*this, mask, reports, eod)) { |
| 789 | DEBUG_PRINTF("added mask as anchored acyclic graph\n" ); |
| 790 | return; |
| 791 | } |
| 792 | |
| 793 | addTransientMask(*this, mask, reports, anchored, eod); |
| 794 | } |
| 795 | |
| 796 | } // namespace ue2 |
| 797 | |