| 1 | /* |
| 2 | * Copyright (c) 2015-2017, Intel Corporation |
| 3 | * |
| 4 | * Redistribution and use in source and binary forms, with or without |
| 5 | * modification, are permitted provided that the following conditions are met: |
| 6 | * |
| 7 | * * Redistributions of source code must retain the above copyright notice, |
| 8 | * this list of conditions and the following disclaimer. |
| 9 | * * Redistributions in binary form must reproduce the above copyright |
| 10 | * notice, this list of conditions and the following disclaimer in the |
| 11 | * documentation and/or other materials provided with the distribution. |
| 12 | * * Neither the name of Intel Corporation nor the names of its contributors |
| 13 | * may be used to endorse or promote products derived from this software |
| 14 | * without specific prior written permission. |
| 15 | * |
| 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| 17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| 20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| 24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| 25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| 26 | * POSSIBILITY OF SUCH DAMAGE. |
| 27 | */ |
| 28 | |
| 29 | #include "rose_build_convert.h" |
| 30 | |
| 31 | #include "grey.h" |
| 32 | #include "rose_build.h" |
| 33 | #include "rose_build_impl.h" |
| 34 | #include "rose_build_util.h" |
| 35 | #include "ue2common.h" |
| 36 | #include "hwlm/hwlm_build.h" |
| 37 | #include "nfa/castlecompile.h" |
| 38 | #include "nfa/limex_limits.h" |
| 39 | #include "nfagraph/ng_dump.h" |
| 40 | #include "nfagraph/ng_limex.h" |
| 41 | #include "nfagraph/ng_repeat.h" |
| 42 | #include "nfagraph/ng_reports.h" |
| 43 | #include "nfagraph/ng_split.h" |
| 44 | #include "nfagraph/ng_util.h" |
| 45 | #include "nfagraph/ng_width.h" |
| 46 | #include "util/bitutils.h" |
| 47 | #include "util/charreach.h" |
| 48 | #include "util/charreach_util.h" |
| 49 | #include "util/compile_context.h" |
| 50 | #include "util/depth.h" |
| 51 | #include "util/graph_range.h" |
| 52 | #include "util/make_unique.h" |
| 53 | #include "util/order_check.h" |
| 54 | #include "util/ue2string.h" |
| 55 | |
| 56 | #include <algorithm> |
| 57 | #include <map> |
| 58 | #include <queue> |
| 59 | #include <set> |
| 60 | #include <string> |
| 61 | #include <unordered_map> |
| 62 | #include <utility> |
| 63 | #include <vector> |
| 64 | |
| 65 | #include <boost/range/adaptor/map.hpp> |
| 66 | |
| 67 | using namespace std; |
| 68 | using boost::adaptors::map_values; |
| 69 | |
| 70 | namespace ue2 { |
| 71 | |
| 72 | static |
| 73 | NFAVertex addHolderVertex(const CharReach &cr, NGHolder &out) { |
| 74 | assert(cr.any()); |
| 75 | NFAVertex v = add_vertex(out); |
| 76 | out[v].char_reach = cr; |
| 77 | return v; |
| 78 | } |
| 79 | |
| 80 | static |
| 81 | size_t suffixFloodLen(const ue2_literal &s) { |
| 82 | if (s.empty()) { |
| 83 | return 0; |
| 84 | } |
| 85 | |
| 86 | const ue2_literal::elem &c = s.back(); |
| 87 | auto it = find_if(s.rbegin(), s.rend(), |
| 88 | [&c](const ue2_literal::elem &e) { return e != c; }); |
| 89 | return distance(s.rbegin(), it); |
| 90 | } |
| 91 | |
| 92 | static |
| 93 | unique_ptr<NGHolder> makeFloodProneSuffix(const ue2_literal &s, size_t len, |
| 94 | const flat_set<ReportID> &reports) { |
| 95 | assert(len < s.length()); |
| 96 | assert(!reports.empty()); |
| 97 | |
| 98 | unique_ptr<NGHolder> h = ue2::make_unique<NGHolder>(NFA_SUFFIX); |
| 99 | |
| 100 | NFAVertex u = h->start; |
| 101 | for (auto it = s.begin() + s.length() - len; it != s.end(); ++it) { |
| 102 | NFAVertex v = addHolderVertex(*it, *h); |
| 103 | NFAEdge e = add_edge(u, v, *h); |
| 104 | if (u == h->start) { |
| 105 | (*h)[e].tops.insert(DEFAULT_TOP); |
| 106 | } |
| 107 | u = v; |
| 108 | } |
| 109 | |
| 110 | (*h)[u].reports.insert(reports.begin(), reports.end()); |
| 111 | add_edge(u, h->accept, *h); |
| 112 | return h; |
| 113 | } |
| 114 | |
| 115 | static |
| 116 | unique_ptr<NGHolder> makeRosePrefix(const ue2_literal &s) { |
| 117 | unique_ptr<NGHolder> h = ue2::make_unique<NGHolder>(NFA_PREFIX); |
| 118 | |
| 119 | NFAVertex u = h->startDs; |
| 120 | for (const auto &c : s) { |
| 121 | NFAVertex v = addHolderVertex(c, *h); |
| 122 | add_edge(u, v, *h); |
| 123 | u = v; |
| 124 | } |
| 125 | add_edge(u, h->accept, *h); |
| 126 | return h; |
| 127 | } |
| 128 | |
| 129 | static |
| 130 | void replaceWithLitPrefix(RoseBuildImpl &tbi, RoseVertex v, u32 lit_id, |
| 131 | const rose_literal_id &lit, size_t suffixlen, |
| 132 | size_t delay) { |
| 133 | assert(suffixlen < lit.s.length()); |
| 134 | |
| 135 | DEBUG_PRINTF("replacing '%s' with prefix, length=%zu, delay=%zu\n" , |
| 136 | dumpString(lit.s).c_str(), lit.s.length() - suffixlen, delay); |
| 137 | |
| 138 | RoseGraph &g = tbi.g; |
| 139 | ue2_literal new_lit = lit.s.substr(0, lit.s.length() - suffixlen); |
| 140 | u32 new_id = tbi.getLiteralId(new_lit, delay, ROSE_FLOATING); |
| 141 | rose_literal_info &old_info = tbi.literal_info.at(lit_id); |
| 142 | old_info.vertices.erase(v); |
| 143 | tbi.literal_info.at(new_id).vertices.insert(v); |
| 144 | g[v].literals.clear(); |
| 145 | g[v].literals.insert(new_id); |
| 146 | } |
| 147 | |
| 148 | static |
| 149 | bool delayLiteralWithPrefix(RoseBuildImpl &tbi, RoseVertex v, u32 lit_id, |
| 150 | const rose_literal_id &lit, size_t suffixlen) { |
| 151 | if (suffixlen > MAX_DELAY) { |
| 152 | DEBUG_PRINTF("delay too large\n" ); |
| 153 | return false; |
| 154 | } |
| 155 | |
| 156 | if (!tbi.isDirectReport(lit_id)) { |
| 157 | DEBUG_PRINTF("literal is not direct report\n" ); |
| 158 | return false; |
| 159 | } |
| 160 | |
| 161 | if (tbi.cc.streaming && |
| 162 | lit.s.length() > tbi.cc.grey.maxHistoryAvailable + 1) { |
| 163 | DEBUG_PRINTF("insufficient history to delay literal of len %zu\n" , |
| 164 | lit.s.length()); |
| 165 | return false; |
| 166 | } |
| 167 | |
| 168 | shared_ptr<NGHolder> h = makeRosePrefix(lit.s); |
| 169 | ReportID prefix_report = 0; |
| 170 | set_report(*h, prefix_report); |
| 171 | |
| 172 | if (!isImplementableNFA(*h, &tbi.rm, tbi.cc)) { |
| 173 | DEBUG_PRINTF("prefix not implementable\n" ); |
| 174 | return false; |
| 175 | } |
| 176 | |
| 177 | RoseGraph &g = tbi.g; |
| 178 | assert(!g[v].left); |
| 179 | g[v].left.graph = h; |
| 180 | g[v].left.lag = 0; |
| 181 | g[v].left.leftfix_report = prefix_report; |
| 182 | |
| 183 | // Swap v's literal for a shorter one, delayed by suffix len. |
| 184 | replaceWithLitPrefix(tbi, v, lit_id, lit, suffixlen, suffixlen); |
| 185 | |
| 186 | return true; |
| 187 | } |
| 188 | |
| 189 | static |
| 190 | void convertFloodProneSuffix(RoseBuildImpl &tbi, RoseVertex v, u32 lit_id, |
| 191 | const rose_literal_id &lit, size_t suffixlen) { |
| 192 | DEBUG_PRINTF("flood-prone leaf '%s'\n" , dumpString(lit.s).c_str()); |
| 193 | DEBUG_PRINTF("turning last %zu chars into a suffix NFA\n" , suffixlen); |
| 194 | RoseGraph &g = tbi.g; |
| 195 | assert(!g[v].eod_accept); |
| 196 | |
| 197 | // If we're a direct report literal, we may be able to convert this case |
| 198 | // into a delayed literal with a (very boring) transient prefix that |
| 199 | // handles our flood-prone suffix. |
| 200 | if (delayLiteralWithPrefix(tbi, v, lit_id, lit, suffixlen)) { |
| 201 | DEBUG_PRINTF("implemented as delayed literal with a rose prefix\n" ); |
| 202 | return; |
| 203 | } |
| 204 | |
| 205 | // General case: create a suffix that implements the flood-prone portion. |
| 206 | |
| 207 | // Create the NFA. |
| 208 | auto h = makeFloodProneSuffix(lit.s, suffixlen, g[v].reports); |
| 209 | if (!isImplementableNFA(*h, &tbi.rm, tbi.cc)) { |
| 210 | DEBUG_PRINTF("not implementable\n" ); |
| 211 | return; |
| 212 | } |
| 213 | |
| 214 | // Apply the NFA. |
| 215 | assert(!g[v].suffix); |
| 216 | g[v].suffix.graph = move(h); |
| 217 | g[v].reports.clear(); |
| 218 | |
| 219 | // Swap v's literal for a shorter one. |
| 220 | replaceWithLitPrefix(tbi, v, lit_id, lit, suffixlen, 0); |
| 221 | |
| 222 | // It's possible that min_offset might be an underestimate, so we |
| 223 | // subtract min(min_offset, suffixlen) for safety. |
| 224 | g[v].min_offset -= min((size_t)g[v].min_offset, suffixlen); |
| 225 | |
| 226 | if (g[v].max_offset < ROSE_BOUND_INF) { |
| 227 | assert(g[v].max_offset >= suffixlen); |
| 228 | g[v].max_offset -= suffixlen; |
| 229 | } |
| 230 | } |
| 231 | |
| 232 | /** |
| 233 | * Collect an estimate of the number of literals in the floating table, and use |
| 234 | * this to estimate the flood prone suffix length. |
| 235 | */ |
| 236 | static |
| 237 | size_t findFloodProneSuffixLen(const RoseBuildImpl &tbi) { |
| 238 | size_t numLiterals = 0; |
| 239 | for (const rose_literal_id &lit : tbi.literals) { |
| 240 | if (lit.delay) { |
| 241 | continue; // delay ids are virtual-ish |
| 242 | } |
| 243 | if (lit.table != ROSE_FLOATING) { |
| 244 | continue; |
| 245 | } |
| 246 | |
| 247 | numLiterals++; |
| 248 | } |
| 249 | |
| 250 | return hwlmFloodProneSuffixLen(numLiterals, tbi.cc); |
| 251 | } |
| 252 | |
| 253 | /** |
| 254 | * \brief Convert flood-prone literal suffixes into suffix NFAs. |
| 255 | * |
| 256 | * For any trailing string in Rose (string cannot lead to more Rose roles or |
| 257 | * NFAs, etc) ending with a continuous run of a single character with more than |
| 258 | * 3 copies of that single character, |
| 259 | * |
| 260 | * If the result of removing all but 2 copies of that character yields a string |
| 261 | * that is greater than FLOOD_PRONE_LIT_MIN_LENGTH characters, remove those |
| 262 | * final characters from the literal and move them into a suffix NFA. |
| 263 | */ |
| 264 | void convertFloodProneSuffixes(RoseBuildImpl &tbi) { |
| 265 | static const size_t FLOOD_PRONE_LIT_MIN_LENGTH = 5; |
| 266 | |
| 267 | if (!tbi.cc.grey.roseConvertFloodProneSuffixes) { |
| 268 | return; |
| 269 | } |
| 270 | |
| 271 | const size_t floodProneLen = findFloodProneSuffixLen(tbi); |
| 272 | DEBUG_PRINTF("flood prone suffix len = %zu\n" , floodProneLen); |
| 273 | |
| 274 | RoseGraph &g = tbi.g; |
| 275 | |
| 276 | for (auto v : vertices_range(g)) { |
| 277 | if (!isLeafNode(v, g)) { |
| 278 | continue; |
| 279 | } |
| 280 | |
| 281 | if (g[v].reports.empty()) { |
| 282 | continue; |
| 283 | } |
| 284 | |
| 285 | // TODO: currently only boring vertices. |
| 286 | if (!g[v].isBoring()) { |
| 287 | continue; |
| 288 | } |
| 289 | |
| 290 | // Currently only handles vertices with a single literal (should always |
| 291 | // be the case this early in Rose construction). |
| 292 | if (g[v].literals.size() != 1) { |
| 293 | continue; |
| 294 | } |
| 295 | |
| 296 | u32 lit_id = *g[v].literals.begin(); |
| 297 | const rose_literal_id &lit = tbi.literals.at(lit_id); |
| 298 | |
| 299 | // anchored or delayed literals need thought. |
| 300 | if (lit.table != ROSE_FLOATING || lit.delay) { |
| 301 | continue; |
| 302 | } |
| 303 | |
| 304 | // don't do this to literals with msk/cmp. |
| 305 | if (!lit.msk.empty()) { |
| 306 | continue; |
| 307 | } |
| 308 | |
| 309 | // Can't safely do this operation to vertices with delayed |
| 310 | // predecessors. |
| 311 | if (tbi.hasDelayPred(v)) { |
| 312 | DEBUG_PRINTF("delayed pred\n" ); |
| 313 | continue; |
| 314 | } |
| 315 | |
| 316 | if (lit.s.length() <= FLOOD_PRONE_LIT_MIN_LENGTH) { |
| 317 | DEBUG_PRINTF("literal is short enough already\n" ); |
| 318 | continue; |
| 319 | } |
| 320 | |
| 321 | size_t floodLen = suffixFloodLen(lit.s); |
| 322 | if (floodLen < floodProneLen) { |
| 323 | DEBUG_PRINTF("literal not flood-prone\n" ); |
| 324 | continue; |
| 325 | } |
| 326 | |
| 327 | if (floodLen == lit.s.length()) { |
| 328 | DEBUG_PRINTF("whole literal is a flood\n" ); |
| 329 | // Removing the part of the flood from the end of the literal would |
| 330 | // leave us with a shorter, but still flood-prone, prefix. Better |
| 331 | // to leave it alone. |
| 332 | continue; |
| 333 | } |
| 334 | |
| 335 | size_t suffixLen = floodLen - (floodProneLen - 1); |
| 336 | if (lit.s.length() - suffixLen < FLOOD_PRONE_LIT_MIN_LENGTH) { |
| 337 | DEBUG_PRINTF("removing flood would leave literal too short\n" ); |
| 338 | continue; |
| 339 | } |
| 340 | |
| 341 | convertFloodProneSuffix(tbi, v, lit_id, lit, suffixLen); |
| 342 | } |
| 343 | } |
| 344 | |
| 345 | static |
| 346 | CharReach getReachOfNormalVertex(const NGHolder &g) { |
| 347 | for (auto v : vertices_range(g)) { |
| 348 | if (is_special(v, g)) { |
| 349 | continue; |
| 350 | } |
| 351 | return g[v].char_reach; |
| 352 | } |
| 353 | assert(0); |
| 354 | return CharReach(); |
| 355 | } |
| 356 | |
| 357 | /** |
| 358 | * \brief Set the edge bounds and appropriate history on the given edge in the |
| 359 | * Rose graph. |
| 360 | */ |
| 361 | static |
| 362 | void setEdgeBounds(RoseGraph &g, const RoseEdge &e, u32 min_bound, |
| 363 | u32 max_bound) { |
| 364 | assert(min_bound <= max_bound); |
| 365 | assert(max_bound <= ROSE_BOUND_INF); |
| 366 | |
| 367 | g[e].minBound = min_bound; |
| 368 | g[e].maxBound = max_bound; |
| 369 | |
| 370 | if (min_bound || max_bound < ROSE_BOUND_INF) { |
| 371 | g[e].history = ROSE_ROLE_HISTORY_ANCH; |
| 372 | } else { |
| 373 | g[e].history = ROSE_ROLE_HISTORY_NONE; |
| 374 | } |
| 375 | } |
| 376 | |
| 377 | static |
| 378 | bool handleStartPrefixCliche(const NGHolder &h, RoseGraph &g, RoseVertex v, |
| 379 | const RoseEdge &e_old, RoseVertex ar, |
| 380 | vector<RoseEdge> *to_delete) { |
| 381 | DEBUG_PRINTF("hi\n" ); |
| 382 | |
| 383 | /* check for prefix cliches connected to start (^.{N,M}) */ |
| 384 | if (!getReachOfNormalVertex(h).all()) { |
| 385 | DEBUG_PRINTF(":(\n" ); |
| 386 | return false; |
| 387 | } |
| 388 | |
| 389 | PureRepeat repeat; |
| 390 | if (!isPureRepeat(h, repeat)) { |
| 391 | DEBUG_PRINTF(":(\n" ); |
| 392 | return false; |
| 393 | } |
| 394 | |
| 395 | assert(repeat.bounds.min.is_finite()); |
| 396 | assert(repeat.bounds.max.is_reachable()); |
| 397 | assert(repeat.bounds.min <= repeat.bounds.max); |
| 398 | |
| 399 | DEBUG_PRINTF("prefix is ^.{%s,%s}\n" , repeat.bounds.min.str().c_str(), |
| 400 | repeat.bounds.max.str().c_str()); |
| 401 | |
| 402 | /* update bounds on edge */ |
| 403 | |
| 404 | // Convert to Rose graph bounds, which are not (yet?) depth classes. |
| 405 | u32 bound_min = repeat.bounds.min; |
| 406 | u32 bound_max = |
| 407 | repeat.bounds.max.is_finite() ? (u32)repeat.bounds.max : ROSE_BOUND_INF; |
| 408 | |
| 409 | if (source(e_old, g) == ar) { |
| 410 | assert(g[e_old].minBound <= bound_min); |
| 411 | assert(g[e_old].maxBound >= bound_max); |
| 412 | setEdgeBounds(g, e_old, bound_min, bound_max); |
| 413 | } else { |
| 414 | RoseEdge e_new = add_edge(ar, v, g); |
| 415 | setEdgeBounds(g, e_new, bound_min, bound_max); |
| 416 | to_delete->push_back(e_old); |
| 417 | } |
| 418 | |
| 419 | g[v].left.reset(); /* clear the prefix info */ |
| 420 | return true; |
| 421 | } |
| 422 | |
| 423 | static |
| 424 | bool handleStartDsPrefixCliche(const NGHolder &h, RoseGraph &g, RoseVertex v, |
| 425 | const RoseEdge &e) { |
| 426 | DEBUG_PRINTF("hi\n" ); |
| 427 | /* check for prefix cliches connected to start-ds (.{N}, ^.{N,}) */ |
| 428 | u32 repeatCount = 0; |
| 429 | NFAVertex hu = h.startDs; |
| 430 | |
| 431 | auto start_succ = succs<set<NFAVertex>>(h.start, h); |
| 432 | auto startds_succ = succs<set<NFAVertex>>(h.startDs, h); |
| 433 | |
| 434 | if (!is_subset_of(start_succ, startds_succ)) { |
| 435 | DEBUG_PRINTF("not a simple chain\n" ); |
| 436 | return false; |
| 437 | } |
| 438 | |
| 439 | set<NFAVertex> seen; |
| 440 | do { |
| 441 | if (!h[hu].char_reach.all()) { |
| 442 | return false; |
| 443 | } |
| 444 | NFAVertex hv = getSoleDestVertex(h, hu); |
| 445 | if (!hv) { |
| 446 | return false; |
| 447 | } |
| 448 | if (contains(seen, hv)) { |
| 449 | assert(0); |
| 450 | return false; |
| 451 | } |
| 452 | hu = hv; |
| 453 | repeatCount++; |
| 454 | if (hu == h.accept) { |
| 455 | break; |
| 456 | } |
| 457 | } while(1); |
| 458 | |
| 459 | assert(hu == h.accept); |
| 460 | |
| 461 | repeatCount--; /* do not count accept as part of the chain */ |
| 462 | |
| 463 | DEBUG_PRINTF("prefix is ^.{%u,}\n" , repeatCount); |
| 464 | |
| 465 | /* update bounds on edge */ |
| 466 | assert(g[e].minBound <= repeatCount); |
| 467 | setEdgeBounds(g, e, repeatCount, ROSE_BOUND_INF); |
| 468 | |
| 469 | g[v].left.reset(); /* clear the prefix info */ |
| 470 | |
| 471 | return true; |
| 472 | } |
| 473 | |
| 474 | static |
| 475 | bool handleMixedPrefixCliche(const NGHolder &h, RoseGraph &g, RoseVertex v, |
| 476 | const RoseEdge &e_old, RoseVertex ar, |
| 477 | vector<RoseEdge> *to_delete, |
| 478 | const CompileContext &cc) { |
| 479 | assert(in_degree(h.acceptEod, h) == 1); |
| 480 | |
| 481 | bool anchored = !proper_out_degree(h.startDs, h); |
| 482 | NFAVertex key = NGHolder::null_vertex(); |
| 483 | NFAVertex base = anchored ? h.start : h.startDs; |
| 484 | |
| 485 | if (!anchored) { |
| 486 | auto start_succ = succs<set<NFAVertex>>(h.start, h); |
| 487 | auto startds_succ = succs<set<NFAVertex>>(h.startDs, h); |
| 488 | |
| 489 | if (!is_subset_of(start_succ, startds_succ)) { |
| 490 | DEBUG_PRINTF("not a simple chain\n" ); |
| 491 | return false; |
| 492 | } |
| 493 | } |
| 494 | |
| 495 | for (auto w : adjacent_vertices_range(base, h)) { |
| 496 | DEBUG_PRINTF("checking %zu\n" , h[w].index); |
| 497 | if (!h[w].char_reach.all()) { |
| 498 | continue; |
| 499 | } |
| 500 | |
| 501 | if (!is_special(w, h)) { |
| 502 | key = w; |
| 503 | break; |
| 504 | } |
| 505 | } |
| 506 | |
| 507 | if (!key) { |
| 508 | return false; |
| 509 | } |
| 510 | |
| 511 | vector<GraphRepeatInfo> repeats; |
| 512 | findRepeats(h, 2, &repeats); |
| 513 | |
| 514 | vector<GraphRepeatInfo>::const_iterator it; |
| 515 | for (it = repeats.begin(); it != repeats.end(); ++it) { |
| 516 | DEBUG_PRINTF("checking.. %zu verts\n" , it->vertices.size()); |
| 517 | if (find(it->vertices.begin(), it->vertices.end(), key) |
| 518 | != it->vertices.end()) { |
| 519 | break; |
| 520 | } |
| 521 | } |
| 522 | if (it == repeats.end()) { |
| 523 | DEBUG_PRINTF("no repeat found\n" ); |
| 524 | return false; |
| 525 | } |
| 526 | |
| 527 | GraphRepeatInfo ri = *it; |
| 528 | |
| 529 | set<NFAVertex> exits_and_repeat_verts; |
| 530 | for (auto repeat_v : ri.vertices) { |
| 531 | DEBUG_PRINTF("repeat vertex %zu\n" , h[repeat_v].index); |
| 532 | succ(h, repeat_v, &exits_and_repeat_verts); |
| 533 | exits_and_repeat_verts.insert(repeat_v); |
| 534 | } |
| 535 | |
| 536 | DEBUG_PRINTF("repeat {%s,%s}\n" , ri.repeatMin.str().c_str(), |
| 537 | ri.repeatMax.str().c_str()); |
| 538 | |
| 539 | set<NFAVertex> rep_verts; |
| 540 | insert(&rep_verts, ri.vertices); |
| 541 | |
| 542 | set<NFAVertex> exits; |
| 543 | exits = exits_and_repeat_verts; |
| 544 | erase_all(&exits, rep_verts); |
| 545 | |
| 546 | auto base_succ = succs<set<NFAVertex>>(base, h); |
| 547 | base_succ.erase(h.startDs); |
| 548 | |
| 549 | if (is_subset_of(base_succ, rep_verts)) { |
| 550 | /* all good: repeat dominates the rest of the pattern */ |
| 551 | } else if (ri.repeatMin == depth(1) |
| 552 | && is_subset_of(exits, base_succ) |
| 553 | && is_subset_of(base_succ, exits_and_repeat_verts)) { |
| 554 | /* we have a jump edge */ |
| 555 | ri.repeatMin = depth(0); |
| 556 | } else { |
| 557 | return false; |
| 558 | } |
| 559 | |
| 560 | DEBUG_PRINTF("repeat {%s,%s}\n" , ri.repeatMin.str().c_str(), |
| 561 | ri.repeatMax.str().c_str()); |
| 562 | DEBUG_PRINTF("woot?\n" ); |
| 563 | |
| 564 | shared_ptr<NGHolder> h_new = make_shared<NGHolder>(); |
| 565 | unordered_map<NFAVertex, NFAVertex> rhs_map; |
| 566 | vector<NFAVertex> exits_vec; |
| 567 | insert(&exits_vec, exits_vec.end(), exits); |
| 568 | splitRHS(h, exits_vec, h_new.get(), &rhs_map); |
| 569 | h_new->kind = NFA_PREFIX; |
| 570 | |
| 571 | if (num_vertices(*h_new) <= N_SPECIALS) { |
| 572 | DEBUG_PRINTF("not a hybrid??\n" ); |
| 573 | /* TODO: pick up these cases, unify code */ |
| 574 | return false; |
| 575 | } |
| 576 | |
| 577 | for (auto w : adjacent_vertices_range(h_new->start, *h_new)) { |
| 578 | if (w != h_new->startDs) { |
| 579 | add_edge(h_new->startDs, w, *h_new); |
| 580 | } |
| 581 | } |
| 582 | clear_out_edges(h_new->start, *h_new); |
| 583 | add_edge(h_new->start, h_new->startDs, *h_new); |
| 584 | |
| 585 | depth width = findMinWidth(*h_new); |
| 586 | if (width != findMaxWidth(*h_new)) { |
| 587 | return false; |
| 588 | } |
| 589 | |
| 590 | if (g[v].left.dfa) { |
| 591 | /* we were unable to implement initial graph as an nfa; |
| 592 | * we need to to check if we still need a dfa and, if so, rebuild. */ |
| 593 | if (!isImplementableNFA(*h_new, nullptr, cc)) { |
| 594 | return false; /* TODO: handle rebuilding dfa */ |
| 595 | } |
| 596 | } |
| 597 | |
| 598 | if (anchored) { |
| 599 | if (ri.repeatMax.is_infinite()) { |
| 600 | return false; /* TODO */ |
| 601 | } |
| 602 | |
| 603 | if (source(e_old, g) == ar) { |
| 604 | setEdgeBounds(g, e_old, ri.repeatMin + width, ri.repeatMax + width); |
| 605 | } else { |
| 606 | RoseEdge e_new = add_edge(ar, v, g); |
| 607 | setEdgeBounds(g, e_new, ri.repeatMin + width, ri.repeatMax + width); |
| 608 | to_delete->push_back(e_old); |
| 609 | } |
| 610 | |
| 611 | } else { |
| 612 | assert(g[e_old].minBound <= ri.repeatMin + width); |
| 613 | setEdgeBounds(g, e_old, ri.repeatMin + width, ROSE_BOUND_INF); |
| 614 | } |
| 615 | |
| 616 | g[v].left.dfa.reset(); |
| 617 | g[v].left.graph = h_new; |
| 618 | |
| 619 | return true; |
| 620 | } |
| 621 | |
| 622 | /* turns simple prefixes like /^.{30,} into bounds on the root roles */ |
| 623 | void convertPrefixToBounds(RoseBuildImpl &tbi) { |
| 624 | RoseGraph &g = tbi.g; |
| 625 | |
| 626 | vector<RoseEdge> to_delete; |
| 627 | RoseVertex ar = tbi.anchored_root; |
| 628 | |
| 629 | /* graphs with prefixes produced by rose are wired to tbi.root */ |
| 630 | |
| 631 | for (const auto &e : out_edges_range(tbi.root, g)) { |
| 632 | RoseVertex v = target(e, g); |
| 633 | |
| 634 | if (in_degree(v, g) != 1) { |
| 635 | continue; |
| 636 | } |
| 637 | |
| 638 | if (!g[v].left.graph) { |
| 639 | continue; |
| 640 | } |
| 641 | |
| 642 | if (g[v].left.tracksSom()) { |
| 643 | continue; |
| 644 | } |
| 645 | |
| 646 | const NGHolder &h = *g[v].left.graph; |
| 647 | |
| 648 | if (g[v].left.lag != tbi.minLiteralLen(v) |
| 649 | || g[v].left.lag != tbi.maxLiteralLen(v)) { |
| 650 | continue; |
| 651 | } |
| 652 | |
| 653 | if (all_reports(h).size() != 1) { |
| 654 | assert(0); |
| 655 | continue; |
| 656 | } |
| 657 | |
| 658 | DEBUG_PRINTF("inspecting prefix of %zu\n" , g[v].index); |
| 659 | |
| 660 | if (!proper_out_degree(h.startDs, h)) { |
| 661 | if (handleStartPrefixCliche(h, g, v, e, ar, &to_delete)) { |
| 662 | continue; |
| 663 | } |
| 664 | } else { |
| 665 | if (handleStartDsPrefixCliche(h, g, v, e)) { |
| 666 | continue; |
| 667 | } |
| 668 | } |
| 669 | |
| 670 | /* prefix is not just a simple dot repeat. However, it is still |
| 671 | * possible that it consists of dot repeat and fixed width mask that we |
| 672 | * can handle. */ |
| 673 | handleMixedPrefixCliche(h, g, v, e, ar, &to_delete, tbi.cc); |
| 674 | } |
| 675 | |
| 676 | for (const auto &e : out_edges_range(ar, g)) { |
| 677 | RoseVertex v = target(e, g); |
| 678 | |
| 679 | /* note: vertices that we have rehomed will currently have an in-degree |
| 680 | * of 2 */ |
| 681 | if (in_degree(v, g) != 1) { |
| 682 | continue; |
| 683 | } |
| 684 | |
| 685 | if (!g[v].left.graph) { |
| 686 | continue; |
| 687 | } |
| 688 | |
| 689 | if (g[v].left.tracksSom()) { |
| 690 | continue; |
| 691 | } |
| 692 | |
| 693 | if (g[v].left.lag != tbi.minLiteralLen(v) |
| 694 | || g[v].left.lag != tbi.maxLiteralLen(v)) { |
| 695 | continue; |
| 696 | } |
| 697 | |
| 698 | const NGHolder &h = *g[v].left.graph; |
| 699 | if (all_reports(h).size() != 1) { |
| 700 | assert(0); |
| 701 | continue; |
| 702 | } |
| 703 | |
| 704 | DEBUG_PRINTF("inspecting prefix of %zu\n" , g[v].index); |
| 705 | |
| 706 | if (!proper_out_degree(h.startDs, h)) { |
| 707 | if (handleStartPrefixCliche(h, g, v, e, ar, &to_delete)) { |
| 708 | continue; |
| 709 | } |
| 710 | } else { |
| 711 | if (handleStartDsPrefixCliche(h, g, v, e)) { |
| 712 | continue; |
| 713 | } |
| 714 | } |
| 715 | |
| 716 | /* prefix is not just a simple dot repeat. However, it is still |
| 717 | * possible that it consists of dot repeat and fixed width mask that we |
| 718 | * can handle. */ |
| 719 | handleMixedPrefixCliche(h, g, v, e, ar, &to_delete, tbi.cc); |
| 720 | } |
| 721 | |
| 722 | for (const auto &e : to_delete) { |
| 723 | remove_edge(e, g); |
| 724 | } |
| 725 | } |
| 726 | |
| 727 | /** |
| 728 | * Identify dot-repeat infixes after fixed-depth literals and convert them to |
| 729 | * edges with ROSE_ROLE_HISTORY_ANCH history and equivalent bounds. |
| 730 | */ |
| 731 | void convertAnchPrefixToBounds(RoseBuildImpl &tbi) { |
| 732 | RoseGraph &g = tbi.g; |
| 733 | |
| 734 | for (const auto v : vertices_range(g)) { |
| 735 | if (!g[v].left) { |
| 736 | continue; |
| 737 | } |
| 738 | |
| 739 | DEBUG_PRINTF("vertex %zu\n" , g[v].index); |
| 740 | |
| 741 | // This pass runs after makeCastles, so we use the fact that bounded |
| 742 | // repeat detection has already been done for us. |
| 743 | |
| 744 | if (!g[v].left.castle) { |
| 745 | DEBUG_PRINTF("not a castle\n" ); |
| 746 | continue; |
| 747 | } |
| 748 | |
| 749 | const CastleProto &castle = *g[v].left.castle; |
| 750 | |
| 751 | if (castle.repeats.size() != 1) { |
| 752 | DEBUG_PRINTF("too many repeats\n" ); |
| 753 | assert(0); // Castles should not have been merged yet. |
| 754 | continue; |
| 755 | } |
| 756 | |
| 757 | if (!castle.reach().all()) { |
| 758 | DEBUG_PRINTF("not dot\n" ); |
| 759 | continue; |
| 760 | } |
| 761 | |
| 762 | if (in_degree(v, g) != 1) { |
| 763 | DEBUG_PRINTF("too many in-edges\n" ); |
| 764 | continue; |
| 765 | } |
| 766 | |
| 767 | RoseEdge e = *in_edges(v, g).first; |
| 768 | RoseVertex u = source(e, g); |
| 769 | |
| 770 | if (g[e].history != ROSE_ROLE_HISTORY_NONE) { |
| 771 | DEBUG_PRINTF("history already set to something other than NONE?\n" ); |
| 772 | assert(0); |
| 773 | continue; |
| 774 | } |
| 775 | |
| 776 | if (g[u].min_offset != g[u].max_offset) { |
| 777 | DEBUG_PRINTF("pred not fixed offset\n" ); |
| 778 | continue; |
| 779 | } |
| 780 | DEBUG_PRINTF("pred is fixed offset, at %u\n" , g[u].min_offset); |
| 781 | assert(g[u].min_offset < ROSE_BOUND_INF); |
| 782 | |
| 783 | size_t lit_length = tbi.minLiteralLen(v); |
| 784 | if (lit_length != tbi.maxLiteralLen(v)) { |
| 785 | assert(0); |
| 786 | DEBUG_PRINTF("variable literal lengths\n" ); |
| 787 | continue; |
| 788 | } |
| 789 | |
| 790 | u32 lag = g[v].left.lag; |
| 791 | DEBUG_PRINTF("lit_length=%zu, lag=%u\n" , lit_length, lag); |
| 792 | assert(lag <= lit_length); |
| 793 | depth delay_adj(lit_length - lag); |
| 794 | |
| 795 | const PureRepeat &pr = castle.repeats.begin()->second; |
| 796 | DEBUG_PRINTF("castle has repeat %s\n" , pr.bounds.str().c_str()); |
| 797 | DEBUG_PRINTF("delay adj %u\n" , (u32)delay_adj); |
| 798 | |
| 799 | if (delay_adj >= pr.bounds.max) { |
| 800 | DEBUG_PRINTF("delay adj too large\n" ); |
| 801 | continue; |
| 802 | } |
| 803 | |
| 804 | DepthMinMax bounds(pr.bounds); // copy |
| 805 | if (delay_adj > bounds.min) { |
| 806 | bounds.min = depth(0); |
| 807 | } else { |
| 808 | bounds.min -= delay_adj; |
| 809 | } |
| 810 | bounds.max -= delay_adj; |
| 811 | setEdgeBounds(g, e, bounds.min, bounds.max.is_finite() |
| 812 | ? (u32)bounds.max |
| 813 | : ROSE_BOUND_INF); |
| 814 | g[v].left.reset(); |
| 815 | } |
| 816 | } |
| 817 | |
| 818 | } // namespace ue2 |
| 819 | |