/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Graph fuzzer for approximate matching
 */

#include "ng_fuzzy.h"

#include "ng.h"
#include "ng_depth.h"
#include "ng_util.h"

#include <map>
#include <vector>
using namespace std;

namespace ue2 {

// returns all successors up to a given depth in a vector of sets, indexed by
// zero-based depth from source vertex
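// For example (illustrative only): for a simple chain u -> a -> b -> c with
// src = u and depth = 2, result[0] == {a} and result[1] == {b}, assuming no
// other edges exist.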
static
vector<flat_set<NFAVertex>> gatherSuccessorsByDepth(const NGHolder &g,
                                                    NFAVertex src, u32 depth) {
    vector<flat_set<NFAVertex>> result(depth);
    flat_set<NFAVertex> cur, next;

    assert(depth > 0);

    // populate current set of successors
    for (auto v : adjacent_vertices_range(src, g)) {
        // ignore self-loops
        if (src == v) {
            continue;
        }
        DEBUG_PRINTF("Node %zu depth 1\n", g[v].index);
        cur.insert(v);
    }
    result[0] = cur;

    for (unsigned d = 1; d < depth; d++) {
        // collect all successors for all current level vertices
        for (auto v : cur) {
            // don't go past special nodes
            if (is_special(v, g)) {
                continue;
            }

            for (auto succ : adjacent_vertices_range(v, g)) {
                // ignore self-loops
                if (v == succ) {
                    continue;
                }
                DEBUG_PRINTF("Node %zu depth %u\n", g[succ].index, d + 1);
                next.insert(succ);
            }
        }
        result[d] = next;
        next.swap(cur);
        next.clear();
    }

    return result;
}

// returns all predecessors up to a given depth in a vector of sets, indexed by
// zero-based depth from source vertex
static
vector<flat_set<NFAVertex>> gatherPredecessorsByDepth(const NGHolder &g,
                                                      NFAVertex src,
                                                      u32 depth) {
    vector<flat_set<NFAVertex>> result(depth);
    flat_set<NFAVertex> cur, next;

    assert(depth > 0);

    // populate current set of predecessors
    for (auto v : inv_adjacent_vertices_range(src, g)) {
        // ignore self-loops
        if (src == v) {
            continue;
        }
        DEBUG_PRINTF("Node %zu depth 1\n", g[v].index);
        cur.insert(v);
    }
    result[0] = cur;

    for (unsigned d = 1; d < depth; d++) {
        // collect all predecessors for all current level vertices
        for (auto v : cur) {
            for (auto pred : inv_adjacent_vertices_range(v, g)) {
                // ignore self-loops
                if (v == pred) {
                    continue;
                }
                DEBUG_PRINTF("Node %zu depth %u\n", g[pred].index, d + 1);
                next.insert(pred);
            }
        }
        result[d] = next;
        next.swap(cur);
        next.clear();
    }

    return result;
}

/*
 * This struct produces a fuzzed graph; that is, a graph that is able to match
 * the original pattern, as well as input data within a certain edit distance.
 * Construct the struct, then call fuzz_graph() to transform the graph.
 *
 * Terminology used:
 * - Shadow vertices: vertices mirroring the original graph at various edit
 *   distances
 * - Shadow graph level: edit distance of a particular shadow graph
 * - Helpers: dot vertices assigned to shadow vertices, used for insert/replace
 */
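// Illustrative sketch (not generated output): fuzzing the pattern "ab" at
// edit distance 1 keeps the original path start -> a -> b -> accept and adds
// a level-1 shadow path a' -> b' plus dot helpers, wired so that a single
// insertion, replacement or removal anywhere along the original path still
// reaches accept (in Hamming mode, only replacements are wired).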
struct ShadowGraph {
    NGHolder &g;
    u32 edit_distance;
    bool hamming;
    map<pair<NFAVertex, u32>, NFAVertex> shadow_map;
    map<pair<NFAVertex, u32>, NFAVertex> helper_map;
    map<NFAVertex, NFAVertex> clones;
    // edge creation is deferred
    vector<pair<NFAVertex, NFAVertex>> edges_to_be_added;
    flat_set<NFAVertex> orig;

    ShadowGraph(NGHolder &g_in, u32 ed_in, bool hamm_in)
        : g(g_in), edit_distance(ed_in), hamming(hamm_in) {}

    void fuzz_graph() {
        if (edit_distance == 0) {
            return;
        }

        DEBUG_PRINTF("edit distance = %u hamming = %s\n", edit_distance,
                     hamming ? "true" : "false");

        // step 1: prepare the vertices, helpers and shadows according to
        // the original graph
        prepare_graph();

        // step 2: add shadow and helper nodes
        build_shadow_graph();

        // step 3: set up reports for newly created vertices (and make clones
        // if necessary)
        if (!hamming) {
            create_reports();
        }

        // step 4: wire up shadow graph and helpers for insert/replace/remove
        connect_shadow_graph();

        // step 5: commit all the edge wirings
        DEBUG_PRINTF("Committing edge wirings\n");
        for (const auto &p : edges_to_be_added) {
            add_edge_if_not_present(p.first, p.second, g);
        }

        DEBUG_PRINTF("Done!\n");
    }

private:
    const NFAVertex& get_clone(const NFAVertex &v) {
        return contains(clones, v) ? clones[v] : v;
    }

    void connect_to_clones(const NFAVertex &u, const NFAVertex &v) {
        const NFAVertex &clone_u = get_clone(u);
        const NFAVertex &clone_v = get_clone(v);

        edges_to_be_added.emplace_back(u, v);
        DEBUG_PRINTF("Adding edge: %zu -> %zu\n", g[u].index, g[v].index);

        // do not connect clones to accepts, we do it during cloning
        if (is_any_accept(clone_v, g)) {
            return;
        }
        edges_to_be_added.emplace_back(clone_u, clone_v);
        DEBUG_PRINTF("Adding edge: %zu -> %zu\n", g[clone_u].index,
                     g[clone_v].index);
    }

    void prepare_graph() {
        DEBUG_PRINTF("Building shadow graphs\n");

        for (auto v : vertices_range(g)) {
            // all level 0 vertices are their own helpers and their own shadows
            helper_map[make_pair(v, 0)] = v;
            shadow_map[make_pair(v, 0)] = v;

            // find special nodes
            if (is_any_accept(v, g)) {
                DEBUG_PRINTF("Node %zu is a special node\n", g[v].index);
                for (unsigned edit = 1; edit <= edit_distance; edit++) {
                    // all accepts are their own shadows and helpers at all
                    // levels
                    shadow_map[make_pair(v, edit)] = v;
                    helper_map[make_pair(v, edit)] = v;
                }
                continue;
            }
            DEBUG_PRINTF("Node %zu is to be shadowed\n", g[v].index);
            orig.insert(v);
        }
    }

    void build_shadow_graph() {
        for (auto v : orig) {
            DEBUG_PRINTF("Adding shadow/helper nodes for node %zu\n",
                         g[v].index);
            for (unsigned dist = 1; dist <= edit_distance; dist++) {
                auto shadow_v = v;

                // start and startDs cannot have shadows but do have helpers
                if (!is_any_start(v, g)) {
                    shadow_v = clone_vertex(g, v);
                    DEBUG_PRINTF("New shadow node ID: %zu (level %u)\n",
                                 g[shadow_v].index, dist);
                }
                shadow_map[make_pair(v, dist)] = shadow_v;

                // if there's nowhere to go from this vertex, no helper needed
                if (proper_out_degree(v, g) < 1) {
                    DEBUG_PRINTF("No helper for node ID: %zu (level %u)\n",
                                 g[shadow_v].index, dist);
                    helper_map[make_pair(v, dist)] = shadow_v;
                    continue;
                }

                // start and startDs only use helpers for insert, which
                // Hamming mode doesn't allow, so skip them here
                if (hamming && is_any_start(v, g)) {
                    DEBUG_PRINTF("No helper for node ID: %zu (level %u)\n",
                                 g[shadow_v].index, dist);
                    helper_map[make_pair(v, dist)] = shadow_v;
                    continue;
                }

                auto helper_v = clone_vertex(g, v);
                DEBUG_PRINTF("New helper node ID: %zu (level %u)\n",
                             g[helper_v].index, dist);

                // this is a helper, so make it a dot
                g[helper_v].char_reach = CharReach::dot();
                // do not copy virtual start's assert flags
                if (is_virtual_start(v, g)) {
                    DEBUG_PRINTF("Helper node ID is virtual start: %zu "
                                 "(level %u)\n",
                                 g[helper_v].index, dist);
                    g[helper_v].assert_flags = 0;
                }
                helper_map[make_pair(v, dist)] = helper_v;
            }
        }
    }

    // wire up successors according to the original graph, wire helpers
    // to shadow successors (insert/replace)
    void connect_succs(NFAVertex v, u32 dist) {
        DEBUG_PRINTF("Wiring up successors for node %zu shadow level %u\n",
                     g[v].index, dist);
        const auto &cur_shadow_v = shadow_map[make_pair(v, dist)];
        const auto &cur_shadow_helper = helper_map[make_pair(v, dist)];

        // multiple insert
        if (!hamming && dist > 1) {
            const auto &prev_level_helper = helper_map[make_pair(v, dist - 1)];
            connect_to_clones(prev_level_helper, cur_shadow_helper);
        }

        for (auto orig_dst : adjacent_vertices_range(v, g)) {
            const auto &shadow_dst = shadow_map[make_pair(orig_dst, dist)];

            connect_to_clones(cur_shadow_v, shadow_dst);

            // ignore startDs for insert/replace
            if (orig_dst == g.startDs) {
                continue;
            }

            connect_to_clones(cur_shadow_helper, shadow_dst);
        }
    }

    // wire up predecessors according to the original graph, wire
    // predecessors to helpers (replace), wire predecessor helpers to
    // helpers (multiple replace)
    void connect_preds(NFAVertex v, u32 dist) {
        DEBUG_PRINTF("Wiring up predecessors for node %zu shadow level %u\n",
                     g[v].index, dist);
        const auto &cur_shadow_v = shadow_map[make_pair(v, dist)];
        const auto &cur_shadow_helper = helper_map[make_pair(v, dist)];

        auto orig_src_vertices = inv_adjacent_vertices_range(v, g);
        for (auto orig_src : orig_src_vertices) {
            // ignore edges from start to startDs
            if (v == g.startDs && orig_src == g.start) {
                continue;
            }
            // ignore self-loops for replace
            if (orig_src != v) {
                // do not wire a replace node for start vertices if we
                // have a virtual start
                if (is_virtual_start(v, g) && is_any_start(orig_src, g)) {
                    continue;
                }

                if (dist) {
                    const auto &prev_level_src =
                        shadow_map[make_pair(orig_src, dist - 1)];
                    const auto &prev_level_helper =
                        helper_map[make_pair(orig_src, dist - 1)];

                    connect_to_clones(prev_level_src, cur_shadow_helper);
                    connect_to_clones(prev_level_helper, cur_shadow_helper);
                }
            }
            // wire predecessor according to original graph
            const auto &shadow_src = shadow_map[make_pair(orig_src, dist)];

            connect_to_clones(shadow_src, cur_shadow_v);
        }
    }

    // wire up previous level shadow to current level helper (insert)
    void connect_helpers(NFAVertex v, u32 dist) {
        DEBUG_PRINTF("Wiring up helpers for node %zu shadow level %u\n",
                     g[v].index, dist);
        const auto &cur_shadow_helper = helper_map[make_pair(v, dist)];
        auto prev_level_v = shadow_map[make_pair(v, dist - 1)];

        connect_to_clones(prev_level_v, cur_shadow_helper);
    }

    /*
     * wiring edges for removal is a special case.
     *
     * when wiring edges for removal, as well as wiring up immediate
     * predecessors to immediate successors, we also need to wire up more
     * distant successors to their respective shadow graph levels.
     *
     * for example, consider graph start->a->b->c->d->accept.
     *
     * at edit distance 1, we need removal edges start->b, a->c, b->d, and
     * c->accept, all going from the original graph (level 0) to shadow graph
     * level 1.
     *
     * at edit distance 2, we also need edges start->c, a->d and b->accept,
     * all going from level 0 to shadow graph level 2.
     *
     * this is propagated to all shadow levels; that is, given edit
     * distance 3, we will have edges from shadow levels 0->1, 0->2,
     * 0->3, 1->2, 1->3, and 2->3.
     *
     * therefore, we wire them in steps: first wire with step 1 (0->1, 1->2,
     * 2->3) at depth 1, then wire with step 2 (0->2, 1->3) at depth 2, etc.
     *
     * we also have to wire helpers to their removal successors, to
     * accommodate a replace followed by a remove, on all shadow levels.
     *
     * and finally, we also have to wire source shadows into removal
     * successor helpers on a level above, to accommodate a remove
     * followed by a replace.
     */
    void connect_removals(NFAVertex v) {
        DEBUG_PRINTF("Wiring up remove edges for node %zu\n", g[v].index);

        // vertices returned by this function don't include self-loops
        auto dst_vertices_by_depth =
            gatherSuccessorsByDepth(g, v, edit_distance);
        auto orig_src_vertices = inv_adjacent_vertices_range(v, g);
        for (auto orig_src : orig_src_vertices) {
            // ignore self-loops
            if (orig_src == v) {
                continue;
            }
            for (unsigned step = 1; step <= edit_distance; step++) {
                for (unsigned dist = step; dist <= edit_distance; dist++) {
                    auto &dst_vertices = dst_vertices_by_depth[step - 1];
                    for (auto &orig_dst : dst_vertices) {
                        const auto &shadow_src =
                            shadow_map[make_pair(orig_src, dist - step)];
                        const auto &shadow_helper =
                            helper_map[make_pair(orig_src, dist - step)];
                        const auto &shadow_dst =
                            shadow_map[make_pair(orig_dst, dist)];

                        // removal
                        connect_to_clones(shadow_src, shadow_dst);

                        // removal from helper vertex
                        connect_to_clones(shadow_helper, shadow_dst);

                        // removal into helper, requires additional edit
                        if ((dist + 1) <= edit_distance) {
                            const auto &next_level_helper =
                                helper_map[make_pair(orig_dst, dist + 1)];

                            connect_to_clones(shadow_src, next_level_helper);
                        }
                    }
                }
            }
        }
    }

    void connect_shadow_graph() {
        DEBUG_PRINTF("Wiring up the graph\n");

        for (auto v : orig) {

            DEBUG_PRINTF("Wiring up edges for node %zu\n", g[v].index);

            for (unsigned dist = 0; dist <= edit_distance; dist++) {

                // handle insert/replace
                connect_succs(v, dist);

                // handle replace/multiple replace
                connect_preds(v, dist);

                // handle helpers
                if (!hamming && dist > 0) {
                    connect_helpers(v, dist);
                }
            }

            // handle removals
            if (!hamming) {
                connect_removals(v);
            }
        }
    }

    void connect_to_targets(NFAVertex src, const flat_set<NFAVertex> &targets) {
        for (auto dst : targets) {
            DEBUG_PRINTF("Adding edge: %zu -> %zu\n", g[src].index,
                         g[dst].index);
            edges_to_be_added.emplace_back(src, dst);
        }
    }

    // create a clone of the vertex, but overwrite its report set
    void create_clone(NFAVertex v, const flat_set<ReportID> &reports,
                      unsigned max_edit_distance,
                      const flat_set<NFAVertex> &targets) {
        // some vertices may have the same reports, but different successors;
        // therefore, we may need to connect them multiple times, but still
        // only clone once
        bool needs_cloning = !contains(clones, v);

        DEBUG_PRINTF("Cloning node %zu\n", g[v].index);
        // go through all shadows and helpers, including
        // original vertex
        for (unsigned d = 0; d < max_edit_distance; d++) {
            auto shadow_v = shadow_map[make_pair(v, d)];
            auto helper_v = helper_map[make_pair(v, d)];

            NFAVertex new_shadow_v, new_helper_v;

            // make sure we don't clone the same vertex twice
            if (needs_cloning) {
                new_shadow_v = clone_vertex(g, shadow_v);
                DEBUG_PRINTF("New shadow node ID: %zu (level %u)\n",
                             g[new_shadow_v].index, d);
                clones[shadow_v] = new_shadow_v;
            } else {
                new_shadow_v = clones[shadow_v];
            }
            g[new_shadow_v].reports = reports;

            connect_to_targets(new_shadow_v, targets);

            if (shadow_v == helper_v) {
                continue;
            }
            if (needs_cloning) {
                new_helper_v = clone_vertex(g, helper_v);
                DEBUG_PRINTF("New helper node ID: %zu (level %u)\n",
                             g[new_helper_v].index, d);
                clones[helper_v] = new_helper_v;
            } else {
                new_helper_v = clones[helper_v];
            }
            g[new_helper_v].reports = reports;

            connect_to_targets(new_helper_v, targets);
        }
    }

    void write_reports(NFAVertex v, const flat_set<ReportID> &reports,
                       unsigned max_edit_distance,
                       const flat_set<NFAVertex> &targets) {
        // we're overwriting reports, but we're not losing any
        // information as we already cached all the different report
        // sets, so vertices having different reports will be cloned and set
        // up with the correct report set

        // go through all shadows and helpers, including original
        // vertex
        for (unsigned d = 0; d < max_edit_distance; d++) {
            auto shadow_v = shadow_map[make_pair(v, d)];
            auto helper_v = helper_map[make_pair(v, d)];
            DEBUG_PRINTF("Setting up reports for shadow node: %zu "
                         "(level %u)\n",
                         g[shadow_v].index, d);
            DEBUG_PRINTF("Setting up reports for helper node: %zu "
                         "(level %u)\n",
                         g[helper_v].index, d);
            g[shadow_v].reports = reports;
            g[helper_v].reports = reports;

            connect_to_targets(shadow_v, targets);
            connect_to_targets(helper_v, targets);
        }
    }

    /*
     * we may have multiple report sets per graph. that means, whenever we
     * construct additional paths through the graph (alternations, removals),
     * we have to account for the fact that some vertices are predecessors to
     * vertices with different report sets.
     *
     * whenever that happens, we have to clone the paths for both report sets,
     * and set up these new vertices with their respective report sets as
     * well.
     *
     * in order to do that, we first have to get all the predecessors of the
     * accept and acceptEod vertices. then, go through them one by one, and
     * take note of the report lists. the first report set we find wins; the
     * rest we clone.
     *
     * we also have to do this in two passes, because there may be vertices
     * that are predecessors to vertices with different report sets, so to
     * avoid overwriting reports we will be caching reports info instead.
     */
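    // For instance (an assumption about typical inputs, not stated above): a
    // graph in which some accept predecessors report through accept while
    // others report through acceptEod can yield two distinct report sets,
    // which is why the assert below permits at most two.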
    void create_reports() {
        map<flat_set<ReportID>, flat_set<NFAVertex>> reports_to_vertices;
        flat_set<NFAVertex> accepts{g.accept, g.acceptEod};

        // gather reports info from all vertices connected to accept
        for (auto accept : accepts) {
            for (auto src : inv_adjacent_vertices_range(accept, g)) {
                // skip special vertices
                if (is_special(src, g)) {
                    continue;
                }
                reports_to_vertices[g[src].reports].insert(src);
            }
        }

        // we expect to see at most two report sets
        assert(reports_to_vertices.size() > 0 &&
               reports_to_vertices.size() <= 2);

        // set up all reports
        bool clone = false;
        for (auto &pair : reports_to_vertices) {
            const auto &reports = pair.first;
            const auto &vertices = pair.second;

            for (auto src : vertices) {
                // get all predecessors up to edit distance
                auto src_vertices_by_depth =
                    gatherPredecessorsByDepth(g, src, edit_distance);

                // find which accepts source vertex connects to
                flat_set<NFAVertex> targets;
                for (const auto &accept : accepts) {
                    NFAEdge e = edge(src, accept, g);
                    if (e) {
                        targets.insert(accept);
                    }
                }
                assert(targets.size());

                for (unsigned d = 0; d < src_vertices_by_depth.size(); d++) {
                    const auto &preds = src_vertices_by_depth[d];
                    for (auto v : preds) {
                        // only clone a node if it already contains reports
                        if (clone && !g[v].reports.empty()) {
                            create_clone(v, reports, edit_distance - d,
                                         targets);
                        } else {
                            write_reports(v, reports, edit_distance - d,
                                          targets);
                        }
                    }
                }
            }
            // clone vertices only if it's not our first report set
            clone = true;
        }
    }
};

// check if we will edit our way into a vacuous pattern
static
bool will_turn_vacuous(const NGHolder &g, u32 edit_distance) {
    auto depths = calcRevDepths(g);

    depth min_depth = depth::infinity();
    auto idx = g[g.start].index;

    // check distance from start to accept/acceptEod
    if (depths[idx].toAccept.min.is_finite()) {
        min_depth = min(depths[idx].toAccept.min, min_depth);
    }
    if (depths[idx].toAcceptEod.min.is_finite()) {
        min_depth = min(depths[idx].toAcceptEod.min, min_depth);
    }

    idx = g[g.startDs].index;

    // check distance from startDs to accept/acceptEod
    if (depths[idx].toAccept.min.is_finite()) {
        min_depth = min(depths[idx].toAccept.min, min_depth);
    }
    if (depths[idx].toAcceptEod.min.is_finite()) {
        min_depth = min(depths[idx].toAcceptEod.min, min_depth);
    }

    assert(min_depth.is_finite());

    // now, check if we can edit our way into a vacuous pattern
    if (min_depth <= (u64a)edit_distance + 1) {
        DEBUG_PRINTF("Pattern will turn vacuous if approximately matched\n");
        return true;
    }
    return false;
}

void validate_fuzzy_compile(const NGHolder &g, u32 edit_distance, bool hamming,
                            bool utf8, const Grey &grey) {
    if (edit_distance == 0) {
        return;
    }
    if (!grey.allowApproximateMatching) {
        throw CompileError("Approximate matching is disabled.");
    }
    if (edit_distance > grey.maxEditDistance) {
        throw CompileError("Edit distance is too big.");
    }
    if (utf8) {
        throw CompileError("UTF-8 is disallowed for approximate matching.");
    }
    // graph isn't fuzzable if there are edge assertions anywhere in the graph
    for (auto e : edges_range(g)) {
        if (g[e].assert_flags) {
            throw CompileError("Zero-width assertions are disallowed for "
                               "approximate matching.");
        }
    }
    if (!hamming && will_turn_vacuous(g, edit_distance)) {
        throw CompileError("Approximate matching patterns that reduce to "
                           "vacuous patterns are disallowed.");
    }
}
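// Typical call sequence (a sketch of intended usage; the actual call sites
// live elsewhere in the compiler):
//
//     validate_fuzzy_compile(g, edit_distance, hamming, utf8, grey);
//     make_fuzzy(g, edit_distance, hamming, grey);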

void make_fuzzy(NGHolder &g, u32 edit_distance, bool hamming,
                const Grey &grey) {
    if (edit_distance == 0) {
        return;
    }

    assert(grey.allowApproximateMatching);
    assert(grey.maxEditDistance >= edit_distance);

    ShadowGraph sg(g, edit_distance, hamming);
    sg.fuzz_graph();

    // For safety, enforce limit on actual vertex count.
    if (num_vertices(g) > grey.limitApproxMatchingVertices) {
        DEBUG_PRINTF("built %zu vertices > limit of %u\n", num_vertices(g),
                     grey.limitApproxMatchingVertices);
        throw ResourceLimitError();
    }
}

} // namespace ue2