| 1 | /* |
| 2 | * Copyright (c) 2015-2017, Intel Corporation |
| 3 | * |
| 4 | * Redistribution and use in source and binary forms, with or without |
| 5 | * modification, are permitted provided that the following conditions are met: |
| 6 | * |
| 7 | * * Redistributions of source code must retain the above copyright notice, |
| 8 | * this list of conditions and the following disclaimer. |
| 9 | * * Redistributions in binary form must reproduce the above copyright |
| 10 | * notice, this list of conditions and the following disclaimer in the |
| 11 | * documentation and/or other materials provided with the distribution. |
| 12 | * * Neither the name of Intel Corporation nor the names of its contributors |
| 13 | * may be used to endorse or promote products derived from this software |
| 14 | * without specific prior written permission. |
| 15 | * |
| 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| 17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| 20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| 24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| 25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| 26 | * POSSIBILITY OF SUCH DAMAGE. |
| 27 | */ |
| 28 | |
| 29 | /** \file |
| 30 | * \brief Pattern lifetime analysis. |
| 31 | */ |
| 32 | |
| 33 | #include "config.h" |
| 34 | |
| 35 | #include "ng_find_matches.h" |
| 36 | |
| 37 | #include "nfagraph/ng_holder.h" |
| 38 | #include "nfagraph/ng_util.h" |
| 39 | #include "parser/position.h" |
| 40 | #include "util/container.h" |
| 41 | #include "util/compare.h" |
| 42 | #include "util/report.h" |
| 43 | #include "util/report_manager.h" |
| 44 | #include "util/unordered.h" |
| 45 | |
| 46 | #include <algorithm> |
| 47 | |
| 48 | using namespace std; |
| 49 | using namespace ue2; |
| 50 | |
| 51 | using MatchSet = set<pair<size_t, size_t>>; |
| 52 | using StateBitSet = boost::dynamic_bitset<>; |
| 53 | |
| 54 | namespace { |
| 55 | |
| 56 | /** \brief Max number of states (taking edit distance into account). */ |
| 57 | static constexpr size_t STATE_COUNT_MAX = 15000; |
| 58 | |
| 59 | // returns all successors up to a given depth in a vector of sets, indexed by |
| 60 | // zero-based depth from source vertex |
| 61 | static |
| 62 | vector<flat_set<NFAVertex>> |
| 63 | gatherSuccessorsByDepth(const NGHolder &g, const NFAVertex &src, u32 depth) { |
| 64 | assert(depth > 0); |
| 65 | |
| 66 | vector<flat_set<NFAVertex>> result(depth); |
| 67 | |
| 68 | // populate current set of successors |
| 69 | for (auto v : adjacent_vertices_range(src, g)) { |
| 70 | // ignore self-loops |
| 71 | if (src == v) { |
| 72 | continue; |
| 73 | } |
| 74 | DEBUG_PRINTF("Node %zu depth 1\n" , g[v].index); |
| 75 | result[0].insert(v); |
| 76 | } |
| 77 | |
| 78 | for (u32 d = 1; d < depth; d++) { |
| 79 | // collect all successors for all current level vertices |
| 80 | const auto &cur = result[d - 1]; |
| 81 | auto &next = result[d]; |
| 82 | for (auto u : cur) { |
| 83 | // don't go past special nodes |
| 84 | if (is_special(u, g)) { |
| 85 | continue; |
| 86 | } |
| 87 | |
| 88 | for (auto v : adjacent_vertices_range(u, g)) { |
| 89 | // ignore self-loops |
| 90 | if (u == v) { |
| 91 | continue; |
| 92 | } |
| 93 | DEBUG_PRINTF("Node %zu depth %u\n" , g[v].index, d + 1); |
| 94 | next.insert(v); |
| 95 | } |
| 96 | } |
| 97 | } |
| 98 | |
| 99 | return result; |
| 100 | } |
| 101 | |
| 102 | // returns all predecessors up to a given depth in a vector of sets, indexed by |
| 103 | // zero-based depth from source vertex |
| 104 | static |
| 105 | vector<flat_set<NFAVertex>> |
| 106 | gatherPredecessorsByDepth(const NGHolder &g, NFAVertex src, u32 depth) { |
| 107 | assert(depth > 0); |
| 108 | |
| 109 | vector<flat_set<NFAVertex>> result(depth); |
| 110 | |
| 111 | // populate current set of successors |
| 112 | for (auto v : inv_adjacent_vertices_range(src, g)) { |
| 113 | // ignore self-loops |
| 114 | if (src == v) { |
| 115 | continue; |
| 116 | } |
| 117 | DEBUG_PRINTF("Node %zu depth 1\n" , g[v].index); |
| 118 | result[0].insert(v); |
| 119 | } |
| 120 | |
| 121 | for (u32 d = 1; d < depth; d++) { |
| 122 | // collect all successors for all current level vertices |
| 123 | const auto &cur = result[d - 1]; |
| 124 | auto &next = result[d]; |
| 125 | for (auto v : cur) { |
| 126 | for (auto u : inv_adjacent_vertices_range(v, g)) { |
| 127 | // ignore self-loops |
| 128 | if (v == u) { |
| 129 | continue; |
| 130 | } |
| 131 | DEBUG_PRINTF("Node %zu depth %u\n" , g[u].index, d + 1); |
| 132 | next.insert(u); |
| 133 | } |
| 134 | } |
| 135 | } |
| 136 | |
| 137 | return result; |
| 138 | } |
| 139 | |
| 140 | // this is a per-vertex, per-shadow level state transition table |
| 141 | struct GraphCache { |
| 142 | GraphCache(u32 dist_in, u32 hamm_in, const NGHolder &g) |
| 143 | : hamming(hamm_in > 0), size(num_vertices(g)), |
| 144 | edit_distance(hamming ? hamm_in : dist_in) |
| 145 | { |
| 146 | auto dist_max = edit_distance + 1; |
| 147 | |
| 148 | allocateStateTransitionTable(dist_max); |
| 149 | populateTransitionCache(g, dist_max); |
| 150 | populateAcceptCache(g, dist_max); |
| 151 | } |
| 152 | |
| 153 | void allocateStateTransitionTable(u32 dist_max) { |
| 154 | // resize level 1 - per vertex |
| 155 | shadow_transitions.resize(size); |
| 156 | helper_transitions.resize(size); |
| 157 | |
| 158 | // resize level 2 - per shadow level |
| 159 | for (u32 i = 0; i < size; i++) { |
| 160 | shadow_transitions[i].resize(dist_max); |
| 161 | helper_transitions[i].resize(dist_max); |
| 162 | |
| 163 | // resize level 3 - per vertex |
| 164 | for (u32 d = 0; d < dist_max; d++) { |
| 165 | shadow_transitions[i][d].resize(size); |
| 166 | helper_transitions[i][d].resize(size); |
| 167 | } |
| 168 | } |
| 169 | |
| 170 | // accept states are indexed by edit distance |
| 171 | accept_states.resize(dist_max); |
| 172 | accept_eod_states.resize(dist_max); |
| 173 | |
| 174 | // vertex report maps are indexed by edit distance |
| 175 | vertex_reports_by_level.resize(dist_max); |
| 176 | vertex_eod_reports_by_level.resize(dist_max); |
| 177 | } |
| 178 | |
| 179 | /* |
| 180 | * certain transitions to helpers are disallowed: |
| 181 | * 1. transitions from accept/acceptEod |
| 182 | * 2. transitions to accept/acceptEod |
| 183 | * 3. from start to startDs |
| 184 | * 4. to a virtual/multiline start |
| 185 | * |
| 186 | * everything else is allowed. |
| 187 | */ |
| 188 | bool canTransitionToHelper(NFAVertex u, NFAVertex v, const NGHolder &g) const { |
| 189 | if (is_any_accept(u, g)) { |
| 190 | return false; |
| 191 | } |
| 192 | if (is_any_accept(v, g)) { |
| 193 | return false; |
| 194 | } |
| 195 | if (u == g.start && v == g.startDs) { |
| 196 | return false; |
| 197 | } |
| 198 | if (is_virtual_start(v, g)) { |
| 199 | return false; |
| 200 | } |
| 201 | return true; |
| 202 | } |
| 203 | |
| 204 | void populateTransitionCache(const NGHolder &g, u32 dist_max) { |
| 205 | // populate mapping of vertex index to vertex |
| 206 | vector<NFAVertex> idx_to_v(size); |
| 207 | for (auto v : vertices_range(g)) { |
| 208 | idx_to_v[g[v].index] = v; |
| 209 | } |
| 210 | |
| 211 | for (u32 i = 0; i < size; i++) { |
| 212 | auto cur_v = idx_to_v[i]; |
| 213 | |
| 214 | // set up transition tables |
| 215 | auto succs = gatherSuccessorsByDepth(g, cur_v, dist_max); |
| 216 | |
| 217 | assert(succs.size() == dist_max); |
| 218 | |
| 219 | for (u32 d = 0; d < dist_max; d++) { |
| 220 | auto &v_shadows = shadow_transitions[i][d]; |
| 221 | auto cur_v_bit = i; |
| 222 | |
| 223 | // enable transition to next level helper (this handles insertion) |
| 224 | if (!hamming && d < edit_distance && !is_any_accept(cur_v, g)) { |
| 225 | auto &next_v_helpers = helper_transitions[i][d + 1]; |
| 226 | |
| 227 | next_v_helpers.set(cur_v_bit); |
| 228 | } |
| 229 | |
| 230 | // if vertex has a self-loop, we can also transition to it, |
| 231 | // but only if we're at shadow level 0 |
| 232 | if (edge(cur_v, cur_v, g).second && d == 0) { |
| 233 | v_shadows.set(cur_v_bit); |
| 234 | } |
| 235 | |
| 236 | if (hamming && d > 0) { |
| 237 | continue; |
| 238 | } |
| 239 | |
| 240 | // populate state transition tables |
| 241 | for (auto v : succs[d]) { |
| 242 | auto v_bit = g[v].index; |
| 243 | |
| 244 | // we cannot transition to startDs on any level other than |
| 245 | // level 0 |
| 246 | if (v != g.startDs || d == 0) { |
| 247 | // this handles direct transitions as well as removals |
| 248 | v_shadows.set(v_bit); |
| 249 | } |
| 250 | |
| 251 | // we can also transition to next-level helper (handles |
| 252 | // replace), provided we meet the criteria |
| 253 | if (d < edit_distance && canTransitionToHelper(cur_v, v, g)) { |
| 254 | auto &next_v_helpers = helper_transitions[i][d + 1]; |
| 255 | |
| 256 | next_v_helpers.set(v_bit); |
| 257 | } |
| 258 | } |
| 259 | } |
| 260 | } |
| 261 | } |
| 262 | |
| 263 | void populateAcceptCache(const NGHolder &g, u32 dist_max) { |
| 264 | // set up accept states masks |
| 265 | StateBitSet accept(size); |
| 266 | accept.set(g[g.accept].index); |
| 267 | StateBitSet accept_eod(size); |
| 268 | accept_eod.set(g[g.acceptEod].index); |
| 269 | |
| 270 | // gather accept and acceptEod states |
| 271 | for (u32 base_dist = 0; base_dist < dist_max; base_dist++) { |
| 272 | auto &states = accept_states[base_dist]; |
| 273 | auto &eod_states = accept_eod_states[base_dist]; |
| 274 | |
| 275 | states.resize(size); |
| 276 | eod_states.resize(size); |
| 277 | |
| 278 | // inspect each vertex |
| 279 | for (u32 i = 0; i < size; i++) { |
| 280 | // inspect all shadow levels from base_dist to dist_max |
| 281 | for (u32 d = 0; d < dist_max - base_dist; d++) { |
| 282 | auto &shadows = shadow_transitions[i][d]; |
| 283 | |
| 284 | // if this state transitions to accept, set its bit |
| 285 | if ((shadows & accept).any()) { |
| 286 | states.set(i); |
| 287 | } |
| 288 | if ((shadows & accept_eod).any()) { |
| 289 | eod_states.set(i); |
| 290 | } |
| 291 | } |
| 292 | } |
| 293 | } |
| 294 | |
| 295 | // populate accepts cache |
| 296 | for (auto v : inv_adjacent_vertices_range(g.accept, g)) { |
| 297 | const auto &rs = g[v].reports; |
| 298 | |
| 299 | for (u32 d = 0; d <= edit_distance; d++) { |
| 300 | // add self to report list at all levels |
| 301 | vertex_reports_by_level[d][v].insert(rs.begin(), rs.end()); |
| 302 | } |
| 303 | |
| 304 | if (edit_distance == 0 || hamming) { |
| 305 | // if edit distance is 0, no predecessors will have reports |
| 306 | continue; |
| 307 | } |
| 308 | |
| 309 | auto preds_by_depth = gatherPredecessorsByDepth(g, v, edit_distance); |
| 310 | for (u32 pd = 0; pd < preds_by_depth.size(); pd++) { |
| 311 | const auto &preds = preds_by_depth[pd]; |
| 312 | // for each predecessor, add reports up to maximum edit distance |
| 313 | // for current depth from source vertex |
| 314 | for (auto pred : preds) { |
| 315 | for (u32 d = 0; d < edit_distance - pd; d++) { |
| 316 | vertex_reports_by_level[d][pred].insert(rs.begin(), rs.end()); |
| 317 | } |
| 318 | } |
| 319 | } |
| 320 | } |
| 321 | for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) { |
| 322 | const auto &rs = g[v].reports; |
| 323 | |
| 324 | if (v == g.accept) { |
| 325 | continue; |
| 326 | } |
| 327 | |
| 328 | for (u32 d = 0; d <= edit_distance; d++) { |
| 329 | // add self to report list at all levels |
| 330 | vertex_eod_reports_by_level[d][v].insert(rs.begin(), rs.end()); |
| 331 | } |
| 332 | if (edit_distance == 0 || hamming) { |
| 333 | // if edit distance is 0, no predecessors will have reports |
| 334 | continue; |
| 335 | } |
| 336 | |
| 337 | auto preds_by_depth = gatherPredecessorsByDepth(g, v, edit_distance); |
| 338 | for (u32 pd = 0; pd < preds_by_depth.size(); pd++) { |
| 339 | const auto &preds = preds_by_depth[pd]; |
| 340 | // for each predecessor, add reports up to maximum edit distance |
| 341 | // for current depth from source vertex |
| 342 | for (auto pred : preds) { |
| 343 | for (u32 d = 0; d < edit_distance - pd; d++) { |
| 344 | vertex_eod_reports_by_level[d][pred].insert(rs.begin(), rs.end()); |
| 345 | } |
| 346 | } |
| 347 | } |
| 348 | } |
| 349 | } |
| 350 | |
| 351 | #ifdef DEBUG |
| 352 | void dumpStateTransitionTable(const NGHolder &g) { |
| 353 | StateBitSet accept(size); |
| 354 | accept.set(g[g.accept].index); |
| 355 | StateBitSet accept_eod(size); |
| 356 | accept_eod.set(g[g.acceptEod].index); |
| 357 | |
| 358 | DEBUG_PRINTF("Dumping state transition tables\n" ); |
| 359 | DEBUG_PRINTF("Shadows:\n" ); |
| 360 | for (u32 i = 0; i < num_vertices(g); i++) { |
| 361 | DEBUG_PRINTF("%-7s %3u:" , "Vertex" , i); |
| 362 | for (u32 j = 0; j < num_vertices(g); j++) { |
| 363 | printf("%3i" , j); |
| 364 | } |
| 365 | printf("\n" ); |
| 366 | for (u32 d = 0; d <= edit_distance; d++) { |
| 367 | DEBUG_PRINTF("%-7s %3u:" , "Level" , d); |
| 368 | const auto &s = getShadowTransitions(i, d); |
| 369 | for (u32 j = 0; j < num_vertices(g); j++) { |
| 370 | printf("%3i" , s.test(j)); |
| 371 | } |
| 372 | printf("\n" ); |
| 373 | } |
| 374 | DEBUG_PRINTF("\n" ); |
| 375 | } |
| 376 | |
| 377 | DEBUG_PRINTF("Helpers:\n" ); |
| 378 | for (u32 i = 0; i < num_vertices(g); i++) { |
| 379 | DEBUG_PRINTF("%-7s %3u:" , "Vertex" , i); |
| 380 | for (u32 j = 0; j < num_vertices(g); j++) { |
| 381 | printf("%3i" , j); |
| 382 | } |
| 383 | printf("\n" ); |
| 384 | for (u32 d = 0; d <= edit_distance; d++) { |
| 385 | DEBUG_PRINTF("%-7s %3u:" , "Level" , d); |
| 386 | const auto &s = getHelperTransitions(i, d); |
| 387 | for (u32 j = 0; j < num_vertices(g); j++) { |
| 388 | printf("%3i" , s.test(j)); |
| 389 | } |
| 390 | printf("\n" ); |
| 391 | } |
| 392 | DEBUG_PRINTF("\n" ); |
| 393 | } |
| 394 | |
| 395 | DEBUG_PRINTF("Accept transitions:\n" ); |
| 396 | DEBUG_PRINTF("%-12s" , "Vertex idx:" ); |
| 397 | for (u32 j = 0; j < num_vertices(g); j++) { |
| 398 | printf("%3i" , j); |
| 399 | } |
| 400 | printf("\n" ); |
| 401 | for (u32 d = 0; d <= edit_distance; d++) { |
| 402 | DEBUG_PRINTF("%-7s %3u:" , "Level" , d); |
| 403 | const auto &s = getAcceptTransitions(d); |
| 404 | for (u32 j = 0; j < num_vertices(g); j++) { |
| 405 | printf("%3i" , s.test(j)); |
| 406 | } |
| 407 | printf("\n" ); |
| 408 | } |
| 409 | DEBUG_PRINTF("\n" ); |
| 410 | |
| 411 | DEBUG_PRINTF("Accept EOD transitions:\n" ); |
| 412 | DEBUG_PRINTF("%-12s" , "Vertex idx:" ); |
| 413 | for (u32 j = 0; j < num_vertices(g); j++) { |
| 414 | printf("%3i" , j); |
| 415 | } |
| 416 | printf("\n" ); |
| 417 | for (u32 d = 0; d <= edit_distance; d++) { |
| 418 | DEBUG_PRINTF("%-7s %3u:" , "Level" , d); |
| 419 | const auto &s = getAcceptEodTransitions(d); |
| 420 | for (u32 j = 0; j < num_vertices(g); j++) { |
| 421 | printf("%3i" , s.test(j)); |
| 422 | } |
| 423 | printf("\n" ); |
| 424 | } |
| 425 | DEBUG_PRINTF("\n" ); |
| 426 | |
| 427 | DEBUG_PRINTF("%-12s " , "Accepts:" ); |
| 428 | for (u32 i = 0; i < num_vertices(g); i++) { |
| 429 | printf("%3i" , accept.test(i)); |
| 430 | } |
| 431 | printf("\n" ); |
| 432 | |
| 433 | DEBUG_PRINTF("%-12s " , "EOD Accepts:" ); |
| 434 | for (u32 i = 0; i < num_vertices(g); i++) { |
| 435 | printf("%3i" , accept_eod.test(i)); |
| 436 | } |
| 437 | printf("\n" ); |
| 438 | |
| 439 | DEBUG_PRINTF("Reports\n" ); |
| 440 | for (auto v : vertices_range(g)) { |
| 441 | for (u32 d = 0; d <= edit_distance; d++) { |
| 442 | const auto &r = vertex_reports_by_level[d][v]; |
| 443 | const auto &e = vertex_eod_reports_by_level[d][v]; |
| 444 | DEBUG_PRINTF("%-7s %3zu %-8s %3zu %-8s %3zu\n" , |
| 445 | "Vertex" , g[v].index, "rs:" , r.size(), "eod:" , e.size()); |
| 446 | } |
| 447 | } |
| 448 | printf("\n" ); |
| 449 | } |
| 450 | #endif |
| 451 | |
| 452 | const StateBitSet& getShadowTransitions(u32 idx, u32 level) const { |
| 453 | assert(idx < size); |
| 454 | assert(level <= edit_distance); |
| 455 | return shadow_transitions[idx][level]; |
| 456 | } |
| 457 | const StateBitSet& getHelperTransitions(u32 idx, u32 level) const { |
| 458 | assert(idx < size); |
| 459 | assert(level <= edit_distance); |
| 460 | return helper_transitions[idx][level]; |
| 461 | } |
| 462 | const StateBitSet& getAcceptTransitions(u32 level) const { |
| 463 | assert(level <= edit_distance); |
| 464 | return accept_states[level]; |
| 465 | } |
| 466 | const StateBitSet& getAcceptEodTransitions(u32 level) const { |
| 467 | assert(level <= edit_distance); |
| 468 | return accept_eod_states[level]; |
| 469 | } |
| 470 | |
| 471 | /* |
| 472 | * the bitsets are indexed by vertex and shadow level. the bitset's length is |
| 473 | * equal to the total number of vertices in the graph. |
| 474 | * |
| 475 | * for convenience, helper functions are provided. |
| 476 | */ |
| 477 | vector<vector<StateBitSet>> shadow_transitions; |
| 478 | vector<vector<StateBitSet>> helper_transitions; |
| 479 | |
| 480 | // accept states masks, indexed by shadow level |
| 481 | vector<StateBitSet> accept_states; |
| 482 | vector<StateBitSet> accept_eod_states; |
| 483 | |
| 484 | // map of all reports associated with any vertex, indexed by shadow level |
| 485 | vector<map<NFAVertex, flat_set<ReportID>>> vertex_reports_by_level; |
| 486 | vector<map<NFAVertex, flat_set<ReportID>>> vertex_eod_reports_by_level; |
| 487 | |
| 488 | bool hamming; |
| 489 | u32 size; |
| 490 | u32 edit_distance; |
| 491 | }; |
| 492 | |
| 493 | |
| 494 | /* |
| 495 | * SOM workflow is expected to be the following: |
| 496 | * - Caller calls getActiveStates, which reports SOM for each active states |
| 497 | * - Caller calls getSuccessorStates on each of the active states, which *doesn't* |
| 498 | * report SOM |
| 499 | * - Caller decides if the successor state should be activated, and calls |
| 500 | * activateState with SOM set to that of previous active state (not successor!) |
| 501 | * - activateState then resolves any conflicts between SOMs that may arise from |
| 502 | * multiple active states progressing to the same successor |
| 503 | */ |
| 504 | struct StateSet { |
| 505 | struct State { |
| 506 | enum node_type { |
| 507 | NODE_SHADOW = 0, |
| 508 | NODE_HELPER |
| 509 | }; |
| 510 | State(size_t idx_in, u32 level_in, size_t som_in, node_type type_in) : |
| 511 | idx(idx_in), level(level_in), som(som_in), type(type_in) {} |
| 512 | size_t idx; |
| 513 | u32 level; |
| 514 | size_t som; |
| 515 | node_type type; |
| 516 | }; |
| 517 | |
| 518 | // Temporary working data used for step() which we want to keep around |
| 519 | // (rather than reallocating vectors all the time). |
| 520 | struct WorkingData { |
| 521 | vector<State> active; |
| 522 | vector<State> succ_list; |
| 523 | }; |
| 524 | |
| 525 | StateSet(size_t sz, u32 dist_in) : |
| 526 | shadows(dist_in + 1), helpers(dist_in + 1), |
| 527 | shadows_som(dist_in + 1), helpers_som(dist_in + 1), |
| 528 | edit_distance(dist_in) { |
| 529 | for (u32 dist = 0; dist <= dist_in; dist++) { |
| 530 | shadows[dist].resize(sz, false); |
| 531 | helpers[dist].resize(sz, false); |
| 532 | shadows_som[dist].resize(sz, 0); |
| 533 | helpers_som[dist].resize(sz, 0); |
| 534 | } |
| 535 | } |
| 536 | |
| 537 | void reset() { |
| 538 | for (u32 dist = 0; dist <= edit_distance; dist++) { |
| 539 | shadows[dist].reset(); |
| 540 | helpers[dist].reset(); |
| 541 | fill(shadows_som[dist].begin(), shadows_som[dist].end(), 0); |
| 542 | fill(helpers_som[dist].begin(), helpers_som[dist].end(), 0); |
| 543 | } |
| 544 | } |
| 545 | |
| 546 | bool empty() const { |
| 547 | for (u32 dist = 0; dist <= edit_distance; dist++) { |
| 548 | if (shadows[dist].any()) { |
| 549 | return false; |
| 550 | } |
| 551 | if (helpers[dist].any()) { |
| 552 | return false; |
| 553 | } |
| 554 | } |
| 555 | return true; |
| 556 | } |
| 557 | |
| 558 | size_t count() const { |
| 559 | size_t result = 0; |
| 560 | |
| 561 | for (u32 dist = 0; dist <= edit_distance; dist++) { |
| 562 | result += shadows[dist].count(); |
| 563 | result += helpers[dist].count(); |
| 564 | } |
| 565 | |
| 566 | return result; |
| 567 | } |
| 568 | |
| 569 | bool setActive(const State &s) { |
| 570 | switch (s.type) { |
| 571 | case State::NODE_HELPER: |
| 572 | return helpers[s.level].test_set(s.idx); |
| 573 | case State::NODE_SHADOW: |
| 574 | return shadows[s.level].test_set(s.idx); |
| 575 | } |
| 576 | assert(0); |
| 577 | return false; |
| 578 | } |
| 579 | |
| 580 | size_t getCachedSom(const State &s) const { |
| 581 | switch (s.type) { |
| 582 | case State::NODE_HELPER: |
| 583 | return helpers_som[s.level][s.idx]; |
| 584 | case State::NODE_SHADOW: |
| 585 | return shadows_som[s.level][s.idx]; |
| 586 | } |
| 587 | assert(0); |
| 588 | return 0; |
| 589 | } |
| 590 | |
| 591 | void setCachedSom(const State &s, const size_t som_val) { |
| 592 | switch (s.type) { |
| 593 | case State::NODE_HELPER: |
| 594 | helpers_som[s.level][s.idx] = som_val; |
| 595 | break; |
| 596 | case State::NODE_SHADOW: |
| 597 | shadows_som[s.level][s.idx] = som_val; |
| 598 | break; |
| 599 | default: |
| 600 | assert(0); |
| 601 | } |
| 602 | } |
| 603 | |
| 604 | #ifdef DEBUG |
| 605 | void dumpActiveStates() const { |
| 606 | vector<State> states; |
| 607 | getActiveStates(states); |
| 608 | |
| 609 | DEBUG_PRINTF("Dumping active states\n" ); |
| 610 | |
| 611 | for (const auto &state : states) { |
| 612 | DEBUG_PRINTF("type: %s idx: %zu level: %u som: %zu\n" , |
| 613 | state.type == State::NODE_HELPER ? "HELPER" : "SHADOW" , |
| 614 | state.idx, state.level, state.som); |
| 615 | } |
| 616 | } |
| 617 | #endif |
| 618 | |
| 619 | void getActiveStates(vector<State> &result) const { |
| 620 | result.clear(); |
| 621 | |
| 622 | for (u32 dist = 0; dist <= edit_distance; dist++) { |
| 623 | // get all shadow vertices (including original graph) |
| 624 | const auto &cur_shadow_vertices = shadows[dist]; |
| 625 | for (size_t id = cur_shadow_vertices.find_first(); |
| 626 | id != cur_shadow_vertices.npos; |
| 627 | id = cur_shadow_vertices.find_next(id)) { |
| 628 | result.emplace_back(id, dist, shadows_som[dist][id], |
| 629 | State::NODE_SHADOW); |
| 630 | } |
| 631 | |
| 632 | // the rest is only valid for edited graphs |
| 633 | if (dist == 0) { |
| 634 | continue; |
| 635 | } |
| 636 | |
| 637 | // get all helper vertices |
| 638 | const auto &cur_helper_vertices = helpers[dist]; |
| 639 | for (size_t id = cur_helper_vertices.find_first(); |
| 640 | id != cur_helper_vertices.npos; |
| 641 | id = cur_helper_vertices.find_next(id)) { |
| 642 | result.emplace_back(id, dist, helpers_som[dist][id], |
| 643 | State::NODE_HELPER); |
| 644 | } |
| 645 | } |
| 646 | |
| 647 | sort_and_unique(result); |
| 648 | } |
| 649 | |
| 650 | // does not return SOM |
| 651 | void getSuccessors(const State &state, const GraphCache &gc, |
| 652 | vector<State> &result) const { |
| 653 | result.clear(); |
| 654 | |
| 655 | // maximum shadow depth that we can go from current level |
| 656 | u32 max_depth = edit_distance - state.level + 1; |
| 657 | |
| 658 | for (u32 d = 0; d < max_depth; d++) { |
| 659 | const auto &shadow_succ = gc.getShadowTransitions(state.idx, d); |
| 660 | for (size_t id = shadow_succ.find_first(); |
| 661 | id != shadow_succ.npos; |
| 662 | id = shadow_succ.find_next(id)) { |
| 663 | auto new_level = state.level + d; |
| 664 | result.emplace_back(id, new_level, 0, State::NODE_SHADOW); |
| 665 | } |
| 666 | |
| 667 | const auto &helper_succ = gc.getHelperTransitions(state.idx, d); |
| 668 | for (size_t id = helper_succ.find_first(); |
| 669 | id != helper_succ.npos; |
| 670 | id = helper_succ.find_next(id)) { |
| 671 | auto new_level = state.level + d; |
| 672 | result.emplace_back(id, new_level, 0, State::NODE_HELPER); |
| 673 | } |
| 674 | } |
| 675 | |
| 676 | sort_and_unique(result); |
| 677 | } |
| 678 | |
| 679 | void getAcceptStates(const GraphCache &gc, vector<State> &result) const { |
| 680 | result.clear(); |
| 681 | |
| 682 | for (u32 dist = 0; dist <= edit_distance; dist++) { |
| 683 | // get all shadow vertices (including original graph) |
| 684 | auto cur_shadow_vertices = shadows[dist]; |
| 685 | cur_shadow_vertices &= gc.getAcceptTransitions(dist); |
| 686 | for (size_t id = cur_shadow_vertices.find_first(); |
| 687 | id != cur_shadow_vertices.npos; |
| 688 | id = cur_shadow_vertices.find_next(id)) { |
| 689 | result.emplace_back(id, dist, shadows_som[dist][id], |
| 690 | State::NODE_SHADOW); |
| 691 | } |
| 692 | |
| 693 | auto cur_helper_vertices = helpers[dist]; |
| 694 | cur_helper_vertices &= gc.getAcceptTransitions(dist); |
| 695 | for (size_t id = cur_helper_vertices.find_first(); |
| 696 | id != cur_helper_vertices.npos; |
| 697 | id = cur_helper_vertices.find_next(id)) { |
| 698 | result.emplace_back(id, dist, helpers_som[dist][id], |
| 699 | State::NODE_HELPER); |
| 700 | } |
| 701 | } |
| 702 | |
| 703 | sort_and_unique(result); |
| 704 | } |
| 705 | |
| 706 | void getAcceptEodStates(const GraphCache &gc, vector<State> &result) const { |
| 707 | result.clear(); |
| 708 | |
| 709 | for (u32 dist = 0; dist <= edit_distance; dist++) { |
| 710 | // get all shadow vertices (including original graph) |
| 711 | auto cur_shadow_vertices = shadows[dist]; |
| 712 | cur_shadow_vertices &= gc.getAcceptEodTransitions(dist); |
| 713 | for (size_t id = cur_shadow_vertices.find_first(); |
| 714 | id != cur_shadow_vertices.npos; |
| 715 | id = cur_shadow_vertices.find_next(id)) { |
| 716 | result.emplace_back(id, dist, shadows_som[dist][id], |
| 717 | State::NODE_SHADOW); |
| 718 | } |
| 719 | |
| 720 | auto cur_helper_vertices = helpers[dist]; |
| 721 | cur_helper_vertices &= gc.getAcceptEodTransitions(dist); |
| 722 | for (size_t id = cur_helper_vertices.find_first(); |
| 723 | id != cur_helper_vertices.npos; |
| 724 | id = cur_helper_vertices.find_next(id)) { |
| 725 | result.emplace_back(id, dist, helpers_som[dist][id], |
| 726 | State::NODE_HELPER); |
| 727 | } |
| 728 | } |
| 729 | |
| 730 | sort_and_unique(result); |
| 731 | } |
| 732 | |
| 733 | // the caller must specify SOM at current offset, and must not attempt to |
| 734 | // resolve SOM inheritance conflicts |
| 735 | void activateState(const State &state) { |
| 736 | size_t cur_som = state.som; |
| 737 | if (setActive(state)) { |
| 738 | size_t cached_som = getCachedSom(state); |
| 739 | cur_som = min(cur_som, cached_som); |
| 740 | } |
| 741 | setCachedSom(state, cur_som); |
| 742 | } |
| 743 | |
| 744 | vector<StateBitSet> shadows; |
| 745 | vector<StateBitSet> helpers; |
| 746 | vector<vector<size_t>> shadows_som; |
| 747 | vector<vector<size_t>> helpers_som; |
| 748 | u32 edit_distance; |
| 749 | }; |
| 750 | |
| 751 | // for flat_set |
| 752 | bool operator<(const StateSet::State &a, const StateSet::State &b) { |
| 753 | ORDER_CHECK(idx); |
| 754 | ORDER_CHECK(level); |
| 755 | ORDER_CHECK(type); |
| 756 | ORDER_CHECK(som); |
| 757 | return false; |
| 758 | } |
| 759 | |
| 760 | bool operator==(const StateSet::State &a, const StateSet::State &b) { |
| 761 | return a.idx == b.idx && a.level == b.level && a.type == b.type && |
| 762 | a.som == b.som; |
| 763 | } |
| 764 | |
| 765 | /** \brief Cache to speed up edge lookups, rather than hitting the graph. */ |
| 766 | struct EdgeCache { |
| 767 | explicit EdgeCache(const NGHolder &g) { |
| 768 | cache.reserve(num_vertices(g)); |
| 769 | for (auto e : edges_range(g)) { |
| 770 | cache.emplace(make_pair(source(e, g), target(e, g)), e); |
| 771 | } |
| 772 | } |
| 773 | |
| 774 | NFAEdge get(NFAVertex u, NFAVertex v) const { |
| 775 | auto it = cache.find(make_pair(u, v)); |
| 776 | if (it != cache.end()) { |
| 777 | return it->second; |
| 778 | } |
| 779 | return NFAEdge(); |
| 780 | } |
| 781 | |
| 782 | private: |
| 783 | ue2_unordered_map<pair<NFAVertex, NFAVertex>, NFAEdge> cache; |
| 784 | }; |
| 785 | |
| 786 | struct fmstate { |
| 787 | const size_t num_states; // number of vertices in graph |
| 788 | StateSet states; // currently active states |
| 789 | StateSet next; // states on after this iteration |
| 790 | GraphCache &gc; |
| 791 | vector<NFAVertex> vertices; // mapping from index to vertex |
| 792 | EdgeCache edge_cache; |
| 793 | size_t offset = 0; |
| 794 | unsigned char cur = 0; |
| 795 | unsigned char prev = 0; |
| 796 | const bool utf8; |
| 797 | const bool allowStartDs; |
| 798 | const ReportManager &rm; |
| 799 | |
| 800 | fmstate(const NGHolder &g, GraphCache &gc_in, bool utf8_in, bool aSD_in, |
| 801 | const u32 edit_distance, const ReportManager &rm_in) |
| 802 | : num_states(num_vertices(g)), |
| 803 | states(num_states, edit_distance), |
| 804 | next(num_states, edit_distance), |
| 805 | gc(gc_in), vertices(num_vertices(g), NGHolder::null_vertex()), |
| 806 | edge_cache(g), utf8(utf8_in), allowStartDs(aSD_in), rm(rm_in) { |
| 807 | // init states |
| 808 | states.activateState( |
| 809 | StateSet::State {g[g.start].index, 0, 0, |
| 810 | StateSet::State::NODE_SHADOW}); |
| 811 | if (allowStartDs) { |
| 812 | states.activateState( |
| 813 | StateSet::State {g[g.startDs].index, 0, 0, |
| 814 | StateSet::State::NODE_SHADOW}); |
| 815 | } |
| 816 | // fill vertex mapping |
| 817 | for (auto v : vertices_range(g)) { |
| 818 | vertices[g[v].index] = v; |
| 819 | } |
| 820 | } |
| 821 | }; |
| 822 | |
| 823 | } // namespace |
| 824 | |
| 825 | static |
| 826 | bool isWordChar(const unsigned char c) { |
| 827 | // check if it's an alpha character |
| 828 | if (ourisalpha(c)) { |
| 829 | return true; |
| 830 | } |
| 831 | // check if it's a digit |
| 832 | if (c >= '0' && c <= '9') { |
| 833 | return true; |
| 834 | } |
| 835 | // check if it's an underscore |
| 836 | if (c == '_') { |
| 837 | return true; |
| 838 | } |
| 839 | return false; |
| 840 | } |
| 841 | |
| 842 | static |
| 843 | bool isUtf8CodePoint(const char c) { |
| 844 | // check if this is a start of 4-byte character |
| 845 | if ((c & 0xF8) == 0xF0) { |
| 846 | return true; |
| 847 | } |
| 848 | // check if this is a start of 3-byte character |
| 849 | if ((c & 0xF0) == 0xE0) { |
| 850 | return true; |
| 851 | } |
| 852 | // check if this is a start of 2-byte character |
| 853 | if ((c & 0xE0) == 0xC0) { |
| 854 | return true; |
| 855 | } |
| 856 | // check if this is a single-byte character |
| 857 | if ((c & 0x80) == 0) { |
| 858 | return true; |
| 859 | } |
| 860 | return false; |
| 861 | } |
| 862 | |
| 863 | static |
| 864 | bool canReach(const NGHolder &g, const NFAEdge &e, struct fmstate &state) { |
| 865 | auto flags = g[e].assert_flags; |
| 866 | if (!flags) { |
| 867 | return true; |
| 868 | } |
| 869 | |
| 870 | if (flags & POS_FLAG_ASSERT_WORD_TO_NONWORD) { |
| 871 | if (isWordChar(state.prev) && !isWordChar(state.cur)) { |
| 872 | return true; |
| 873 | } |
| 874 | } |
| 875 | |
| 876 | if (flags & POS_FLAG_ASSERT_NONWORD_TO_WORD) { |
| 877 | if (!isWordChar(state.prev) && isWordChar(state.cur)) { |
| 878 | return true; |
| 879 | } |
| 880 | } |
| 881 | |
| 882 | if (flags & POS_FLAG_ASSERT_WORD_TO_WORD) { |
| 883 | if (isWordChar(state.prev) && isWordChar(state.cur)) { |
| 884 | return true; |
| 885 | } |
| 886 | } |
| 887 | |
| 888 | if (flags & POS_FLAG_ASSERT_NONWORD_TO_NONWORD) { |
| 889 | if (!isWordChar(state.prev) && !isWordChar(state.cur)) { |
| 890 | return true; |
| 891 | } |
| 892 | } |
| 893 | |
| 894 | return false; |
| 895 | } |
| 896 | |
| 897 | static |
| 898 | void getAcceptMatches(const NGHolder &g, MatchSet &matches, |
| 899 | struct fmstate &state, NFAVertex accept_vertex, |
| 900 | vector<StateSet::State> &active_states) { |
| 901 | assert(accept_vertex == g.accept || accept_vertex == g.acceptEod); |
| 902 | |
| 903 | const bool eod = accept_vertex == g.acceptEod; |
| 904 | if (eod) { |
| 905 | state.states.getAcceptEodStates(state.gc, active_states); |
| 906 | } else { |
| 907 | state.states.getAcceptStates(state.gc, active_states); |
| 908 | } |
| 909 | |
| 910 | DEBUG_PRINTF("Number of active states: %zu\n" , active_states.size()); |
| 911 | |
| 912 | for (const auto &cur : active_states) { |
| 913 | auto u = state.vertices[cur.idx]; |
| 914 | |
| 915 | // we can't accept anything from startDs in between UTF-8 codepoints |
| 916 | if (state.utf8 && u == g.startDs && !isUtf8CodePoint(state.cur)) { |
| 917 | continue; |
| 918 | } |
| 919 | |
| 920 | const auto &reports = |
| 921 | eod ? state.gc.vertex_eod_reports_by_level[cur.level][u] |
| 922 | : state.gc.vertex_reports_by_level[cur.level][u]; |
| 923 | |
| 924 | NFAEdge e = state.edge_cache.get(u, accept_vertex); |
| 925 | |
| 926 | // we assume edge assertions only exist at level 0 |
| 927 | if (e && !canReach(g, e, state)) { |
| 928 | continue; |
| 929 | } |
| 930 | |
| 931 | DEBUG_PRINTF("%smatch found at %zu\n" , eod ? "eod " : "" , state.offset); |
| 932 | |
| 933 | assert(!reports.empty()); |
| 934 | for (const auto &report_id : reports) { |
| 935 | const Report &ri = state.rm.getReport(report_id); |
| 936 | |
| 937 | DEBUG_PRINTF("report %u has offset adjustment %d\n" , report_id, |
| 938 | ri.offsetAdjust); |
| 939 | DEBUG_PRINTF("match from (i:%zu,l:%u,t:%u): (%zu,%zu)\n" , cur.idx, |
| 940 | cur.level, cur.type, cur.som, |
| 941 | state.offset + ri.offsetAdjust); |
| 942 | matches.emplace(cur.som, state.offset + ri.offsetAdjust); |
| 943 | } |
| 944 | } |
| 945 | } |
| 946 | |
| 947 | static |
| 948 | void getMatches(const NGHolder &g, MatchSet &matches, struct fmstate &state, |
| 949 | StateSet::WorkingData &wd, bool allowEodMatches) { |
| 950 | getAcceptMatches(g, matches, state, g.accept, wd.active); |
| 951 | if (allowEodMatches) { |
| 952 | getAcceptMatches(g, matches, state, g.acceptEod, wd.active); |
| 953 | } |
| 954 | } |
| 955 | |
| 956 | static |
| 957 | void step(const NGHolder &g, fmstate &state, StateSet::WorkingData &wd) { |
| 958 | state.next.reset(); |
| 959 | |
| 960 | state.states.getActiveStates(wd.active); |
| 961 | |
| 962 | for (const auto &cur : wd.active) { |
| 963 | auto u = state.vertices[cur.idx]; |
| 964 | state.states.getSuccessors(cur, state.gc, wd.succ_list); |
| 965 | |
| 966 | for (auto succ : wd.succ_list) { |
| 967 | auto v = state.vertices[succ.idx]; |
| 968 | |
| 969 | if (is_any_accept(v, g)) { |
| 970 | continue; |
| 971 | } |
| 972 | |
| 973 | if (!state.allowStartDs && v == g.startDs) { |
| 974 | continue; |
| 975 | } |
| 976 | |
| 977 | // GraphCache doesn't differentiate between successors for shadows |
| 978 | // and helpers, and StateSet does not know anything about the graph, |
| 979 | // so the only place we can do it is here. we can't self-loop on a |
| 980 | // startDs if we're startDs's helper, so disallow it. |
| 981 | if (u == g.startDs && v == g.startDs && |
| 982 | succ.level != 0 && succ.level == cur.level) { |
| 983 | continue; |
| 984 | } |
| 985 | |
| 986 | // for the reasons outlined above, also putting this here. |
| 987 | // disallow transitions from start to startDs on levels other than zero |
| 988 | if (u == g.start && v == g.startDs && |
| 989 | cur.level != 0 && succ.level != 0) { |
| 990 | continue; |
| 991 | } |
| 992 | |
| 993 | bool can_reach = false; |
| 994 | |
| 995 | if (succ.type == StateSet::State::NODE_HELPER) { |
| 996 | can_reach = true; |
| 997 | } else { |
| 998 | // we assume edge assertions only exist on level 0 |
| 999 | const CharReach &cr = g[v].char_reach; |
| 1000 | NFAEdge e = state.edge_cache.get(u, v); |
| 1001 | |
| 1002 | if (cr.test(state.cur) && |
| 1003 | (!e || canReach(g, e, state))) { |
| 1004 | can_reach = true; |
| 1005 | } |
| 1006 | } |
| 1007 | |
| 1008 | // check edge assertions if we are allowed to reach accept |
| 1009 | DEBUG_PRINTF("reaching %zu->%zu ('%c'->'%c'): %s\n" , |
| 1010 | g[u].index, g[v].index, |
| 1011 | ourisprint(state.prev) ? state.prev : '?', |
| 1012 | ourisprint(state.cur) ? state.cur : '?', |
| 1013 | can_reach ? "yes" : "no" ); |
| 1014 | |
| 1015 | if (can_reach) { |
| 1016 | // we should use current offset as SOM if: |
| 1017 | // - we're at level 0 and we're a start vertex |
| 1018 | // - we're a fake start shadow |
| 1019 | size_t next_som; |
| 1020 | bool reset = is_any_start(u, g) && cur.level == 0; |
| 1021 | reset |= is_virtual_start(u, g) && |
| 1022 | cur.type == StateSet::State::NODE_SHADOW; |
| 1023 | |
| 1024 | if (reset) { |
| 1025 | next_som = state.offset; |
| 1026 | } else { |
| 1027 | // else, inherit SOM from predecessor |
| 1028 | next_som = cur.som; |
| 1029 | } |
| 1030 | succ.som = next_som; |
| 1031 | |
| 1032 | DEBUG_PRINTF("src: idx %zu level: %u som: %zu type: %s\n" , |
| 1033 | cur.idx, cur.level, cur.som, |
| 1034 | cur.type == StateSet::State::NODE_HELPER ? "H" : "S" ); |
| 1035 | DEBUG_PRINTF("dst: idx %zu level: %u som: %zu type: %s\n" , |
| 1036 | succ.idx, succ.level, succ.som, |
| 1037 | succ.type == StateSet::State::NODE_HELPER ? "H" : "S" ); |
| 1038 | |
| 1039 | // activate successor (SOM will be handled by activateState) |
| 1040 | state.next.activateState(succ); |
| 1041 | } |
| 1042 | } |
| 1043 | } |
| 1044 | } |
| 1045 | |
| 1046 | // filter extraneous matches |
| 1047 | static |
| 1048 | void filterMatches(MatchSet &matches) { |
| 1049 | set<size_t> eom; |
| 1050 | |
| 1051 | // first, collect all end-offset matches |
| 1052 | for (const auto &match : matches) { |
| 1053 | eom.insert(match.second); |
| 1054 | } |
| 1055 | |
| 1056 | // now, go through all the end-offsets and filter extra matches |
| 1057 | for (const auto &elem : eom) { |
| 1058 | // find minimum SOM for this EOM |
| 1059 | size_t min_som = -1U; |
| 1060 | for (const auto &match : matches) { |
| 1061 | // skip entries with wrong EOM |
| 1062 | if (match.second != elem) { |
| 1063 | continue; |
| 1064 | } |
| 1065 | |
| 1066 | min_som = min(min_som, match.first); |
| 1067 | } |
| 1068 | |
| 1069 | auto msit = matches.begin(); |
| 1070 | while (msit != matches.end()) { |
| 1071 | // skip everything that doesn't match |
| 1072 | if (msit->second != elem || msit->first <= min_som) { |
| 1073 | ++msit; |
| 1074 | continue; |
| 1075 | } |
| 1076 | DEBUG_PRINTF("erasing match %zu, %zu\n" , msit->first, msit->second); |
| 1077 | matches.erase(msit++); |
| 1078 | } |
| 1079 | } |
| 1080 | } |
| 1081 | |
| 1082 | /** \brief Find all matches for a given graph when executed against \a input. |
| 1083 | * |
| 1084 | * Fills \a matches with offsets into the data stream where a match is found. |
| 1085 | */ |
| 1086 | bool findMatches(const NGHolder &g, const ReportManager &rm, |
| 1087 | const string &input, MatchSet &matches, |
| 1088 | const u32 edit_distance, const u32 hamm_distance, |
| 1089 | const bool notEod, const bool utf8) { |
| 1090 | assert(hasCorrectlyNumberedVertices(g)); |
| 1091 | // cannot match fuzzy utf8 patterns, this should've been filtered out at |
| 1092 | // compile time, so make it an assert |
| 1093 | assert(!edit_distance || !utf8); |
| 1094 | // cannot be both edit and Hamming distance at once |
| 1095 | assert(!edit_distance || !hamm_distance); |
| 1096 | |
| 1097 | bool hamming = hamm_distance > 0; |
| 1098 | auto dist = hamming ? hamm_distance : edit_distance; |
| 1099 | |
| 1100 | const size_t total_states = num_vertices(g) * (3 * dist + 1); |
| 1101 | DEBUG_PRINTF("Finding matches (%zu total states)\n" , total_states); |
| 1102 | if (total_states > STATE_COUNT_MAX) { |
| 1103 | DEBUG_PRINTF("too big\n" ); |
| 1104 | return false; |
| 1105 | } |
| 1106 | |
| 1107 | GraphCache gc(edit_distance, hamm_distance, g); |
| 1108 | #ifdef DEBUG |
| 1109 | gc.dumpStateTransitionTable(g); |
| 1110 | #endif |
| 1111 | |
| 1112 | const bool allowStartDs = (proper_out_degree(g.startDs, g) > 0); |
| 1113 | |
| 1114 | struct fmstate state(g, gc, utf8, allowStartDs, dist, rm); |
| 1115 | |
| 1116 | StateSet::WorkingData wd; |
| 1117 | |
| 1118 | for (auto it = input.begin(), ite = input.end(); it != ite; ++it) { |
| 1119 | #ifdef DEBUG |
| 1120 | state.states.dumpActiveStates(); |
| 1121 | #endif |
| 1122 | state.offset = std::distance(input.begin(), it); |
| 1123 | state.cur = *it; |
| 1124 | |
| 1125 | step(g, state, wd); |
| 1126 | |
| 1127 | getMatches(g, matches, state, wd, false); |
| 1128 | |
| 1129 | DEBUG_PRINTF("offset %zu, %zu states on\n" , state.offset, |
| 1130 | state.next.count()); |
| 1131 | if (state.next.empty()) { |
| 1132 | filterMatches(matches); |
| 1133 | return true; |
| 1134 | } |
| 1135 | state.states = state.next; |
| 1136 | state.prev = state.cur; |
| 1137 | } |
| 1138 | #ifdef DEBUG |
| 1139 | state.states.dumpActiveStates(); |
| 1140 | #endif |
| 1141 | state.offset = input.size(); |
| 1142 | state.cur = 0; |
| 1143 | |
| 1144 | // do additional step to get matches after stream end, this time count eod |
| 1145 | // matches also (or not, if we're in notEod mode) |
| 1146 | |
| 1147 | DEBUG_PRINTF("Looking for EOD matches\n" ); |
| 1148 | getMatches(g, matches, state, wd, !notEod); |
| 1149 | |
| 1150 | filterMatches(matches); |
| 1151 | return true; |
| 1152 | } |
| 1153 | |