1 | /* |
2 | * Copyright (c) 2015-2017, Intel Corporation |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are met: |
6 | * |
7 | * * Redistributions of source code must retain the above copyright notice, |
8 | * this list of conditions and the following disclaimer. |
9 | * * Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * * Neither the name of Intel Corporation nor the names of its contributors |
13 | * may be used to endorse or promote products derived from this software |
14 | * without specific prior written permission. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | /** \file |
30 | * \brief Pattern lifetime analysis. |
31 | */ |
32 | |
33 | #include "config.h" |
34 | |
35 | #include "ng_find_matches.h" |
36 | |
37 | #include "nfagraph/ng_holder.h" |
38 | #include "nfagraph/ng_util.h" |
39 | #include "parser/position.h" |
40 | #include "util/container.h" |
41 | #include "util/compare.h" |
42 | #include "util/report.h" |
43 | #include "util/report_manager.h" |
44 | #include "util/unordered.h" |
45 | |
46 | #include <algorithm> |
47 | |
48 | using namespace std; |
49 | using namespace ue2; |
50 | |
51 | using MatchSet = set<pair<size_t, size_t>>; |
52 | using StateBitSet = boost::dynamic_bitset<>; |
53 | |
54 | namespace { |
55 | |
56 | /** \brief Max number of states (taking edit distance into account). */ |
57 | static constexpr size_t STATE_COUNT_MAX = 15000; |
58 | |
59 | // returns all successors up to a given depth in a vector of sets, indexed by |
60 | // zero-based depth from source vertex |
61 | static |
62 | vector<flat_set<NFAVertex>> |
63 | gatherSuccessorsByDepth(const NGHolder &g, const NFAVertex &src, u32 depth) { |
64 | assert(depth > 0); |
65 | |
66 | vector<flat_set<NFAVertex>> result(depth); |
67 | |
68 | // populate current set of successors |
69 | for (auto v : adjacent_vertices_range(src, g)) { |
70 | // ignore self-loops |
71 | if (src == v) { |
72 | continue; |
73 | } |
74 | DEBUG_PRINTF("Node %zu depth 1\n" , g[v].index); |
75 | result[0].insert(v); |
76 | } |
77 | |
78 | for (u32 d = 1; d < depth; d++) { |
79 | // collect all successors for all current level vertices |
80 | const auto &cur = result[d - 1]; |
81 | auto &next = result[d]; |
82 | for (auto u : cur) { |
83 | // don't go past special nodes |
84 | if (is_special(u, g)) { |
85 | continue; |
86 | } |
87 | |
88 | for (auto v : adjacent_vertices_range(u, g)) { |
89 | // ignore self-loops |
90 | if (u == v) { |
91 | continue; |
92 | } |
93 | DEBUG_PRINTF("Node %zu depth %u\n" , g[v].index, d + 1); |
94 | next.insert(v); |
95 | } |
96 | } |
97 | } |
98 | |
99 | return result; |
100 | } |
101 | |
102 | // returns all predecessors up to a given depth in a vector of sets, indexed by |
103 | // zero-based depth from source vertex |
104 | static |
105 | vector<flat_set<NFAVertex>> |
106 | gatherPredecessorsByDepth(const NGHolder &g, NFAVertex src, u32 depth) { |
107 | assert(depth > 0); |
108 | |
109 | vector<flat_set<NFAVertex>> result(depth); |
110 | |
111 | // populate current set of successors |
112 | for (auto v : inv_adjacent_vertices_range(src, g)) { |
113 | // ignore self-loops |
114 | if (src == v) { |
115 | continue; |
116 | } |
117 | DEBUG_PRINTF("Node %zu depth 1\n" , g[v].index); |
118 | result[0].insert(v); |
119 | } |
120 | |
121 | for (u32 d = 1; d < depth; d++) { |
122 | // collect all successors for all current level vertices |
123 | const auto &cur = result[d - 1]; |
124 | auto &next = result[d]; |
125 | for (auto v : cur) { |
126 | for (auto u : inv_adjacent_vertices_range(v, g)) { |
127 | // ignore self-loops |
128 | if (v == u) { |
129 | continue; |
130 | } |
131 | DEBUG_PRINTF("Node %zu depth %u\n" , g[u].index, d + 1); |
132 | next.insert(u); |
133 | } |
134 | } |
135 | } |
136 | |
137 | return result; |
138 | } |
139 | |
140 | // this is a per-vertex, per-shadow level state transition table |
141 | struct GraphCache { |
142 | GraphCache(u32 dist_in, u32 hamm_in, const NGHolder &g) |
143 | : hamming(hamm_in > 0), size(num_vertices(g)), |
144 | edit_distance(hamming ? hamm_in : dist_in) |
145 | { |
146 | auto dist_max = edit_distance + 1; |
147 | |
148 | allocateStateTransitionTable(dist_max); |
149 | populateTransitionCache(g, dist_max); |
150 | populateAcceptCache(g, dist_max); |
151 | } |
152 | |
153 | void allocateStateTransitionTable(u32 dist_max) { |
154 | // resize level 1 - per vertex |
155 | shadow_transitions.resize(size); |
156 | helper_transitions.resize(size); |
157 | |
158 | // resize level 2 - per shadow level |
159 | for (u32 i = 0; i < size; i++) { |
160 | shadow_transitions[i].resize(dist_max); |
161 | helper_transitions[i].resize(dist_max); |
162 | |
163 | // resize level 3 - per vertex |
164 | for (u32 d = 0; d < dist_max; d++) { |
165 | shadow_transitions[i][d].resize(size); |
166 | helper_transitions[i][d].resize(size); |
167 | } |
168 | } |
169 | |
170 | // accept states are indexed by edit distance |
171 | accept_states.resize(dist_max); |
172 | accept_eod_states.resize(dist_max); |
173 | |
174 | // vertex report maps are indexed by edit distance |
175 | vertex_reports_by_level.resize(dist_max); |
176 | vertex_eod_reports_by_level.resize(dist_max); |
177 | } |
178 | |
179 | /* |
180 | * certain transitions to helpers are disallowed: |
181 | * 1. transitions from accept/acceptEod |
182 | * 2. transitions to accept/acceptEod |
183 | * 3. from start to startDs |
184 | * 4. to a virtual/multiline start |
185 | * |
186 | * everything else is allowed. |
187 | */ |
188 | bool canTransitionToHelper(NFAVertex u, NFAVertex v, const NGHolder &g) const { |
189 | if (is_any_accept(u, g)) { |
190 | return false; |
191 | } |
192 | if (is_any_accept(v, g)) { |
193 | return false; |
194 | } |
195 | if (u == g.start && v == g.startDs) { |
196 | return false; |
197 | } |
198 | if (is_virtual_start(v, g)) { |
199 | return false; |
200 | } |
201 | return true; |
202 | } |
203 | |
204 | void populateTransitionCache(const NGHolder &g, u32 dist_max) { |
205 | // populate mapping of vertex index to vertex |
206 | vector<NFAVertex> idx_to_v(size); |
207 | for (auto v : vertices_range(g)) { |
208 | idx_to_v[g[v].index] = v; |
209 | } |
210 | |
211 | for (u32 i = 0; i < size; i++) { |
212 | auto cur_v = idx_to_v[i]; |
213 | |
214 | // set up transition tables |
215 | auto succs = gatherSuccessorsByDepth(g, cur_v, dist_max); |
216 | |
217 | assert(succs.size() == dist_max); |
218 | |
219 | for (u32 d = 0; d < dist_max; d++) { |
220 | auto &v_shadows = shadow_transitions[i][d]; |
221 | auto cur_v_bit = i; |
222 | |
223 | // enable transition to next level helper (this handles insertion) |
224 | if (!hamming && d < edit_distance && !is_any_accept(cur_v, g)) { |
225 | auto &next_v_helpers = helper_transitions[i][d + 1]; |
226 | |
227 | next_v_helpers.set(cur_v_bit); |
228 | } |
229 | |
230 | // if vertex has a self-loop, we can also transition to it, |
231 | // but only if we're at shadow level 0 |
232 | if (edge(cur_v, cur_v, g).second && d == 0) { |
233 | v_shadows.set(cur_v_bit); |
234 | } |
235 | |
236 | if (hamming && d > 0) { |
237 | continue; |
238 | } |
239 | |
240 | // populate state transition tables |
241 | for (auto v : succs[d]) { |
242 | auto v_bit = g[v].index; |
243 | |
244 | // we cannot transition to startDs on any level other than |
245 | // level 0 |
246 | if (v != g.startDs || d == 0) { |
247 | // this handles direct transitions as well as removals |
248 | v_shadows.set(v_bit); |
249 | } |
250 | |
251 | // we can also transition to next-level helper (handles |
252 | // replace), provided we meet the criteria |
253 | if (d < edit_distance && canTransitionToHelper(cur_v, v, g)) { |
254 | auto &next_v_helpers = helper_transitions[i][d + 1]; |
255 | |
256 | next_v_helpers.set(v_bit); |
257 | } |
258 | } |
259 | } |
260 | } |
261 | } |
262 | |
263 | void populateAcceptCache(const NGHolder &g, u32 dist_max) { |
264 | // set up accept states masks |
265 | StateBitSet accept(size); |
266 | accept.set(g[g.accept].index); |
267 | StateBitSet accept_eod(size); |
268 | accept_eod.set(g[g.acceptEod].index); |
269 | |
270 | // gather accept and acceptEod states |
271 | for (u32 base_dist = 0; base_dist < dist_max; base_dist++) { |
272 | auto &states = accept_states[base_dist]; |
273 | auto &eod_states = accept_eod_states[base_dist]; |
274 | |
275 | states.resize(size); |
276 | eod_states.resize(size); |
277 | |
278 | // inspect each vertex |
279 | for (u32 i = 0; i < size; i++) { |
280 | // inspect all shadow levels from base_dist to dist_max |
281 | for (u32 d = 0; d < dist_max - base_dist; d++) { |
282 | auto &shadows = shadow_transitions[i][d]; |
283 | |
284 | // if this state transitions to accept, set its bit |
285 | if ((shadows & accept).any()) { |
286 | states.set(i); |
287 | } |
288 | if ((shadows & accept_eod).any()) { |
289 | eod_states.set(i); |
290 | } |
291 | } |
292 | } |
293 | } |
294 | |
295 | // populate accepts cache |
296 | for (auto v : inv_adjacent_vertices_range(g.accept, g)) { |
297 | const auto &rs = g[v].reports; |
298 | |
299 | for (u32 d = 0; d <= edit_distance; d++) { |
300 | // add self to report list at all levels |
301 | vertex_reports_by_level[d][v].insert(rs.begin(), rs.end()); |
302 | } |
303 | |
304 | if (edit_distance == 0 || hamming) { |
305 | // if edit distance is 0, no predecessors will have reports |
306 | continue; |
307 | } |
308 | |
309 | auto preds_by_depth = gatherPredecessorsByDepth(g, v, edit_distance); |
310 | for (u32 pd = 0; pd < preds_by_depth.size(); pd++) { |
311 | const auto &preds = preds_by_depth[pd]; |
312 | // for each predecessor, add reports up to maximum edit distance |
313 | // for current depth from source vertex |
314 | for (auto pred : preds) { |
315 | for (u32 d = 0; d < edit_distance - pd; d++) { |
316 | vertex_reports_by_level[d][pred].insert(rs.begin(), rs.end()); |
317 | } |
318 | } |
319 | } |
320 | } |
321 | for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) { |
322 | const auto &rs = g[v].reports; |
323 | |
324 | if (v == g.accept) { |
325 | continue; |
326 | } |
327 | |
328 | for (u32 d = 0; d <= edit_distance; d++) { |
329 | // add self to report list at all levels |
330 | vertex_eod_reports_by_level[d][v].insert(rs.begin(), rs.end()); |
331 | } |
332 | if (edit_distance == 0 || hamming) { |
333 | // if edit distance is 0, no predecessors will have reports |
334 | continue; |
335 | } |
336 | |
337 | auto preds_by_depth = gatherPredecessorsByDepth(g, v, edit_distance); |
338 | for (u32 pd = 0; pd < preds_by_depth.size(); pd++) { |
339 | const auto &preds = preds_by_depth[pd]; |
340 | // for each predecessor, add reports up to maximum edit distance |
341 | // for current depth from source vertex |
342 | for (auto pred : preds) { |
343 | for (u32 d = 0; d < edit_distance - pd; d++) { |
344 | vertex_eod_reports_by_level[d][pred].insert(rs.begin(), rs.end()); |
345 | } |
346 | } |
347 | } |
348 | } |
349 | } |
350 | |
351 | #ifdef DEBUG |
352 | void dumpStateTransitionTable(const NGHolder &g) { |
353 | StateBitSet accept(size); |
354 | accept.set(g[g.accept].index); |
355 | StateBitSet accept_eod(size); |
356 | accept_eod.set(g[g.acceptEod].index); |
357 | |
358 | DEBUG_PRINTF("Dumping state transition tables\n" ); |
359 | DEBUG_PRINTF("Shadows:\n" ); |
360 | for (u32 i = 0; i < num_vertices(g); i++) { |
361 | DEBUG_PRINTF("%-7s %3u:" , "Vertex" , i); |
362 | for (u32 j = 0; j < num_vertices(g); j++) { |
363 | printf("%3i" , j); |
364 | } |
365 | printf("\n" ); |
366 | for (u32 d = 0; d <= edit_distance; d++) { |
367 | DEBUG_PRINTF("%-7s %3u:" , "Level" , d); |
368 | const auto &s = getShadowTransitions(i, d); |
369 | for (u32 j = 0; j < num_vertices(g); j++) { |
370 | printf("%3i" , s.test(j)); |
371 | } |
372 | printf("\n" ); |
373 | } |
374 | DEBUG_PRINTF("\n" ); |
375 | } |
376 | |
377 | DEBUG_PRINTF("Helpers:\n" ); |
378 | for (u32 i = 0; i < num_vertices(g); i++) { |
379 | DEBUG_PRINTF("%-7s %3u:" , "Vertex" , i); |
380 | for (u32 j = 0; j < num_vertices(g); j++) { |
381 | printf("%3i" , j); |
382 | } |
383 | printf("\n" ); |
384 | for (u32 d = 0; d <= edit_distance; d++) { |
385 | DEBUG_PRINTF("%-7s %3u:" , "Level" , d); |
386 | const auto &s = getHelperTransitions(i, d); |
387 | for (u32 j = 0; j < num_vertices(g); j++) { |
388 | printf("%3i" , s.test(j)); |
389 | } |
390 | printf("\n" ); |
391 | } |
392 | DEBUG_PRINTF("\n" ); |
393 | } |
394 | |
395 | DEBUG_PRINTF("Accept transitions:\n" ); |
396 | DEBUG_PRINTF("%-12s" , "Vertex idx:" ); |
397 | for (u32 j = 0; j < num_vertices(g); j++) { |
398 | printf("%3i" , j); |
399 | } |
400 | printf("\n" ); |
401 | for (u32 d = 0; d <= edit_distance; d++) { |
402 | DEBUG_PRINTF("%-7s %3u:" , "Level" , d); |
403 | const auto &s = getAcceptTransitions(d); |
404 | for (u32 j = 0; j < num_vertices(g); j++) { |
405 | printf("%3i" , s.test(j)); |
406 | } |
407 | printf("\n" ); |
408 | } |
409 | DEBUG_PRINTF("\n" ); |
410 | |
411 | DEBUG_PRINTF("Accept EOD transitions:\n" ); |
412 | DEBUG_PRINTF("%-12s" , "Vertex idx:" ); |
413 | for (u32 j = 0; j < num_vertices(g); j++) { |
414 | printf("%3i" , j); |
415 | } |
416 | printf("\n" ); |
417 | for (u32 d = 0; d <= edit_distance; d++) { |
418 | DEBUG_PRINTF("%-7s %3u:" , "Level" , d); |
419 | const auto &s = getAcceptEodTransitions(d); |
420 | for (u32 j = 0; j < num_vertices(g); j++) { |
421 | printf("%3i" , s.test(j)); |
422 | } |
423 | printf("\n" ); |
424 | } |
425 | DEBUG_PRINTF("\n" ); |
426 | |
427 | DEBUG_PRINTF("%-12s " , "Accepts:" ); |
428 | for (u32 i = 0; i < num_vertices(g); i++) { |
429 | printf("%3i" , accept.test(i)); |
430 | } |
431 | printf("\n" ); |
432 | |
433 | DEBUG_PRINTF("%-12s " , "EOD Accepts:" ); |
434 | for (u32 i = 0; i < num_vertices(g); i++) { |
435 | printf("%3i" , accept_eod.test(i)); |
436 | } |
437 | printf("\n" ); |
438 | |
439 | DEBUG_PRINTF("Reports\n" ); |
440 | for (auto v : vertices_range(g)) { |
441 | for (u32 d = 0; d <= edit_distance; d++) { |
442 | const auto &r = vertex_reports_by_level[d][v]; |
443 | const auto &e = vertex_eod_reports_by_level[d][v]; |
444 | DEBUG_PRINTF("%-7s %3zu %-8s %3zu %-8s %3zu\n" , |
445 | "Vertex" , g[v].index, "rs:" , r.size(), "eod:" , e.size()); |
446 | } |
447 | } |
448 | printf("\n" ); |
449 | } |
450 | #endif |
451 | |
452 | const StateBitSet& getShadowTransitions(u32 idx, u32 level) const { |
453 | assert(idx < size); |
454 | assert(level <= edit_distance); |
455 | return shadow_transitions[idx][level]; |
456 | } |
457 | const StateBitSet& getHelperTransitions(u32 idx, u32 level) const { |
458 | assert(idx < size); |
459 | assert(level <= edit_distance); |
460 | return helper_transitions[idx][level]; |
461 | } |
462 | const StateBitSet& getAcceptTransitions(u32 level) const { |
463 | assert(level <= edit_distance); |
464 | return accept_states[level]; |
465 | } |
466 | const StateBitSet& getAcceptEodTransitions(u32 level) const { |
467 | assert(level <= edit_distance); |
468 | return accept_eod_states[level]; |
469 | } |
470 | |
471 | /* |
472 | * the bitsets are indexed by vertex and shadow level. the bitset's length is |
473 | * equal to the total number of vertices in the graph. |
474 | * |
475 | * for convenience, helper functions are provided. |
476 | */ |
477 | vector<vector<StateBitSet>> shadow_transitions; |
478 | vector<vector<StateBitSet>> helper_transitions; |
479 | |
480 | // accept states masks, indexed by shadow level |
481 | vector<StateBitSet> accept_states; |
482 | vector<StateBitSet> accept_eod_states; |
483 | |
484 | // map of all reports associated with any vertex, indexed by shadow level |
485 | vector<map<NFAVertex, flat_set<ReportID>>> vertex_reports_by_level; |
486 | vector<map<NFAVertex, flat_set<ReportID>>> vertex_eod_reports_by_level; |
487 | |
488 | bool hamming; |
489 | u32 size; |
490 | u32 edit_distance; |
491 | }; |
492 | |
493 | |
494 | /* |
495 | * SOM workflow is expected to be the following: |
496 | * - Caller calls getActiveStates, which reports SOM for each active states |
497 | * - Caller calls getSuccessorStates on each of the active states, which *doesn't* |
498 | * report SOM |
499 | * - Caller decides if the successor state should be activated, and calls |
500 | * activateState with SOM set to that of previous active state (not successor!) |
501 | * - activateState then resolves any conflicts between SOMs that may arise from |
502 | * multiple active states progressing to the same successor |
503 | */ |
504 | struct StateSet { |
505 | struct State { |
506 | enum node_type { |
507 | NODE_SHADOW = 0, |
508 | NODE_HELPER |
509 | }; |
510 | State(size_t idx_in, u32 level_in, size_t som_in, node_type type_in) : |
511 | idx(idx_in), level(level_in), som(som_in), type(type_in) {} |
512 | size_t idx; |
513 | u32 level; |
514 | size_t som; |
515 | node_type type; |
516 | }; |
517 | |
518 | // Temporary working data used for step() which we want to keep around |
519 | // (rather than reallocating vectors all the time). |
520 | struct WorkingData { |
521 | vector<State> active; |
522 | vector<State> succ_list; |
523 | }; |
524 | |
525 | StateSet(size_t sz, u32 dist_in) : |
526 | shadows(dist_in + 1), helpers(dist_in + 1), |
527 | shadows_som(dist_in + 1), helpers_som(dist_in + 1), |
528 | edit_distance(dist_in) { |
529 | for (u32 dist = 0; dist <= dist_in; dist++) { |
530 | shadows[dist].resize(sz, false); |
531 | helpers[dist].resize(sz, false); |
532 | shadows_som[dist].resize(sz, 0); |
533 | helpers_som[dist].resize(sz, 0); |
534 | } |
535 | } |
536 | |
537 | void reset() { |
538 | for (u32 dist = 0; dist <= edit_distance; dist++) { |
539 | shadows[dist].reset(); |
540 | helpers[dist].reset(); |
541 | fill(shadows_som[dist].begin(), shadows_som[dist].end(), 0); |
542 | fill(helpers_som[dist].begin(), helpers_som[dist].end(), 0); |
543 | } |
544 | } |
545 | |
546 | bool empty() const { |
547 | for (u32 dist = 0; dist <= edit_distance; dist++) { |
548 | if (shadows[dist].any()) { |
549 | return false; |
550 | } |
551 | if (helpers[dist].any()) { |
552 | return false; |
553 | } |
554 | } |
555 | return true; |
556 | } |
557 | |
558 | size_t count() const { |
559 | size_t result = 0; |
560 | |
561 | for (u32 dist = 0; dist <= edit_distance; dist++) { |
562 | result += shadows[dist].count(); |
563 | result += helpers[dist].count(); |
564 | } |
565 | |
566 | return result; |
567 | } |
568 | |
569 | bool setActive(const State &s) { |
570 | switch (s.type) { |
571 | case State::NODE_HELPER: |
572 | return helpers[s.level].test_set(s.idx); |
573 | case State::NODE_SHADOW: |
574 | return shadows[s.level].test_set(s.idx); |
575 | } |
576 | assert(0); |
577 | return false; |
578 | } |
579 | |
580 | size_t getCachedSom(const State &s) const { |
581 | switch (s.type) { |
582 | case State::NODE_HELPER: |
583 | return helpers_som[s.level][s.idx]; |
584 | case State::NODE_SHADOW: |
585 | return shadows_som[s.level][s.idx]; |
586 | } |
587 | assert(0); |
588 | return 0; |
589 | } |
590 | |
591 | void setCachedSom(const State &s, const size_t som_val) { |
592 | switch (s.type) { |
593 | case State::NODE_HELPER: |
594 | helpers_som[s.level][s.idx] = som_val; |
595 | break; |
596 | case State::NODE_SHADOW: |
597 | shadows_som[s.level][s.idx] = som_val; |
598 | break; |
599 | default: |
600 | assert(0); |
601 | } |
602 | } |
603 | |
604 | #ifdef DEBUG |
605 | void dumpActiveStates() const { |
606 | vector<State> states; |
607 | getActiveStates(states); |
608 | |
609 | DEBUG_PRINTF("Dumping active states\n" ); |
610 | |
611 | for (const auto &state : states) { |
612 | DEBUG_PRINTF("type: %s idx: %zu level: %u som: %zu\n" , |
613 | state.type == State::NODE_HELPER ? "HELPER" : "SHADOW" , |
614 | state.idx, state.level, state.som); |
615 | } |
616 | } |
617 | #endif |
618 | |
619 | void getActiveStates(vector<State> &result) const { |
620 | result.clear(); |
621 | |
622 | for (u32 dist = 0; dist <= edit_distance; dist++) { |
623 | // get all shadow vertices (including original graph) |
624 | const auto &cur_shadow_vertices = shadows[dist]; |
625 | for (size_t id = cur_shadow_vertices.find_first(); |
626 | id != cur_shadow_vertices.npos; |
627 | id = cur_shadow_vertices.find_next(id)) { |
628 | result.emplace_back(id, dist, shadows_som[dist][id], |
629 | State::NODE_SHADOW); |
630 | } |
631 | |
632 | // the rest is only valid for edited graphs |
633 | if (dist == 0) { |
634 | continue; |
635 | } |
636 | |
637 | // get all helper vertices |
638 | const auto &cur_helper_vertices = helpers[dist]; |
639 | for (size_t id = cur_helper_vertices.find_first(); |
640 | id != cur_helper_vertices.npos; |
641 | id = cur_helper_vertices.find_next(id)) { |
642 | result.emplace_back(id, dist, helpers_som[dist][id], |
643 | State::NODE_HELPER); |
644 | } |
645 | } |
646 | |
647 | sort_and_unique(result); |
648 | } |
649 | |
650 | // does not return SOM |
651 | void getSuccessors(const State &state, const GraphCache &gc, |
652 | vector<State> &result) const { |
653 | result.clear(); |
654 | |
655 | // maximum shadow depth that we can go from current level |
656 | u32 max_depth = edit_distance - state.level + 1; |
657 | |
658 | for (u32 d = 0; d < max_depth; d++) { |
659 | const auto &shadow_succ = gc.getShadowTransitions(state.idx, d); |
660 | for (size_t id = shadow_succ.find_first(); |
661 | id != shadow_succ.npos; |
662 | id = shadow_succ.find_next(id)) { |
663 | auto new_level = state.level + d; |
664 | result.emplace_back(id, new_level, 0, State::NODE_SHADOW); |
665 | } |
666 | |
667 | const auto &helper_succ = gc.getHelperTransitions(state.idx, d); |
668 | for (size_t id = helper_succ.find_first(); |
669 | id != helper_succ.npos; |
670 | id = helper_succ.find_next(id)) { |
671 | auto new_level = state.level + d; |
672 | result.emplace_back(id, new_level, 0, State::NODE_HELPER); |
673 | } |
674 | } |
675 | |
676 | sort_and_unique(result); |
677 | } |
678 | |
679 | void getAcceptStates(const GraphCache &gc, vector<State> &result) const { |
680 | result.clear(); |
681 | |
682 | for (u32 dist = 0; dist <= edit_distance; dist++) { |
683 | // get all shadow vertices (including original graph) |
684 | auto cur_shadow_vertices = shadows[dist]; |
685 | cur_shadow_vertices &= gc.getAcceptTransitions(dist); |
686 | for (size_t id = cur_shadow_vertices.find_first(); |
687 | id != cur_shadow_vertices.npos; |
688 | id = cur_shadow_vertices.find_next(id)) { |
689 | result.emplace_back(id, dist, shadows_som[dist][id], |
690 | State::NODE_SHADOW); |
691 | } |
692 | |
693 | auto cur_helper_vertices = helpers[dist]; |
694 | cur_helper_vertices &= gc.getAcceptTransitions(dist); |
695 | for (size_t id = cur_helper_vertices.find_first(); |
696 | id != cur_helper_vertices.npos; |
697 | id = cur_helper_vertices.find_next(id)) { |
698 | result.emplace_back(id, dist, helpers_som[dist][id], |
699 | State::NODE_HELPER); |
700 | } |
701 | } |
702 | |
703 | sort_and_unique(result); |
704 | } |
705 | |
706 | void getAcceptEodStates(const GraphCache &gc, vector<State> &result) const { |
707 | result.clear(); |
708 | |
709 | for (u32 dist = 0; dist <= edit_distance; dist++) { |
710 | // get all shadow vertices (including original graph) |
711 | auto cur_shadow_vertices = shadows[dist]; |
712 | cur_shadow_vertices &= gc.getAcceptEodTransitions(dist); |
713 | for (size_t id = cur_shadow_vertices.find_first(); |
714 | id != cur_shadow_vertices.npos; |
715 | id = cur_shadow_vertices.find_next(id)) { |
716 | result.emplace_back(id, dist, shadows_som[dist][id], |
717 | State::NODE_SHADOW); |
718 | } |
719 | |
720 | auto cur_helper_vertices = helpers[dist]; |
721 | cur_helper_vertices &= gc.getAcceptEodTransitions(dist); |
722 | for (size_t id = cur_helper_vertices.find_first(); |
723 | id != cur_helper_vertices.npos; |
724 | id = cur_helper_vertices.find_next(id)) { |
725 | result.emplace_back(id, dist, helpers_som[dist][id], |
726 | State::NODE_HELPER); |
727 | } |
728 | } |
729 | |
730 | sort_and_unique(result); |
731 | } |
732 | |
733 | // the caller must specify SOM at current offset, and must not attempt to |
734 | // resolve SOM inheritance conflicts |
735 | void activateState(const State &state) { |
736 | size_t cur_som = state.som; |
737 | if (setActive(state)) { |
738 | size_t cached_som = getCachedSom(state); |
739 | cur_som = min(cur_som, cached_som); |
740 | } |
741 | setCachedSom(state, cur_som); |
742 | } |
743 | |
744 | vector<StateBitSet> shadows; |
745 | vector<StateBitSet> helpers; |
746 | vector<vector<size_t>> shadows_som; |
747 | vector<vector<size_t>> helpers_som; |
748 | u32 edit_distance; |
749 | }; |
750 | |
751 | // for flat_set |
752 | bool operator<(const StateSet::State &a, const StateSet::State &b) { |
753 | ORDER_CHECK(idx); |
754 | ORDER_CHECK(level); |
755 | ORDER_CHECK(type); |
756 | ORDER_CHECK(som); |
757 | return false; |
758 | } |
759 | |
760 | bool operator==(const StateSet::State &a, const StateSet::State &b) { |
761 | return a.idx == b.idx && a.level == b.level && a.type == b.type && |
762 | a.som == b.som; |
763 | } |
764 | |
765 | /** \brief Cache to speed up edge lookups, rather than hitting the graph. */ |
766 | struct EdgeCache { |
767 | explicit EdgeCache(const NGHolder &g) { |
768 | cache.reserve(num_vertices(g)); |
769 | for (auto e : edges_range(g)) { |
770 | cache.emplace(make_pair(source(e, g), target(e, g)), e); |
771 | } |
772 | } |
773 | |
774 | NFAEdge get(NFAVertex u, NFAVertex v) const { |
775 | auto it = cache.find(make_pair(u, v)); |
776 | if (it != cache.end()) { |
777 | return it->second; |
778 | } |
779 | return NFAEdge(); |
780 | } |
781 | |
782 | private: |
783 | ue2_unordered_map<pair<NFAVertex, NFAVertex>, NFAEdge> cache; |
784 | }; |
785 | |
786 | struct fmstate { |
787 | const size_t num_states; // number of vertices in graph |
788 | StateSet states; // currently active states |
789 | StateSet next; // states on after this iteration |
790 | GraphCache &gc; |
791 | vector<NFAVertex> vertices; // mapping from index to vertex |
792 | EdgeCache edge_cache; |
793 | size_t offset = 0; |
794 | unsigned char cur = 0; |
795 | unsigned char prev = 0; |
796 | const bool utf8; |
797 | const bool allowStartDs; |
798 | const ReportManager &rm; |
799 | |
800 | fmstate(const NGHolder &g, GraphCache &gc_in, bool utf8_in, bool aSD_in, |
801 | const u32 edit_distance, const ReportManager &rm_in) |
802 | : num_states(num_vertices(g)), |
803 | states(num_states, edit_distance), |
804 | next(num_states, edit_distance), |
805 | gc(gc_in), vertices(num_vertices(g), NGHolder::null_vertex()), |
806 | edge_cache(g), utf8(utf8_in), allowStartDs(aSD_in), rm(rm_in) { |
807 | // init states |
808 | states.activateState( |
809 | StateSet::State {g[g.start].index, 0, 0, |
810 | StateSet::State::NODE_SHADOW}); |
811 | if (allowStartDs) { |
812 | states.activateState( |
813 | StateSet::State {g[g.startDs].index, 0, 0, |
814 | StateSet::State::NODE_SHADOW}); |
815 | } |
816 | // fill vertex mapping |
817 | for (auto v : vertices_range(g)) { |
818 | vertices[g[v].index] = v; |
819 | } |
820 | } |
821 | }; |
822 | |
823 | } // namespace |
824 | |
825 | static |
826 | bool isWordChar(const unsigned char c) { |
827 | // check if it's an alpha character |
828 | if (ourisalpha(c)) { |
829 | return true; |
830 | } |
831 | // check if it's a digit |
832 | if (c >= '0' && c <= '9') { |
833 | return true; |
834 | } |
835 | // check if it's an underscore |
836 | if (c == '_') { |
837 | return true; |
838 | } |
839 | return false; |
840 | } |
841 | |
842 | static |
843 | bool isUtf8CodePoint(const char c) { |
844 | // check if this is a start of 4-byte character |
845 | if ((c & 0xF8) == 0xF0) { |
846 | return true; |
847 | } |
848 | // check if this is a start of 3-byte character |
849 | if ((c & 0xF0) == 0xE0) { |
850 | return true; |
851 | } |
852 | // check if this is a start of 2-byte character |
853 | if ((c & 0xE0) == 0xC0) { |
854 | return true; |
855 | } |
856 | // check if this is a single-byte character |
857 | if ((c & 0x80) == 0) { |
858 | return true; |
859 | } |
860 | return false; |
861 | } |
862 | |
863 | static |
864 | bool canReach(const NGHolder &g, const NFAEdge &e, struct fmstate &state) { |
865 | auto flags = g[e].assert_flags; |
866 | if (!flags) { |
867 | return true; |
868 | } |
869 | |
870 | if (flags & POS_FLAG_ASSERT_WORD_TO_NONWORD) { |
871 | if (isWordChar(state.prev) && !isWordChar(state.cur)) { |
872 | return true; |
873 | } |
874 | } |
875 | |
876 | if (flags & POS_FLAG_ASSERT_NONWORD_TO_WORD) { |
877 | if (!isWordChar(state.prev) && isWordChar(state.cur)) { |
878 | return true; |
879 | } |
880 | } |
881 | |
882 | if (flags & POS_FLAG_ASSERT_WORD_TO_WORD) { |
883 | if (isWordChar(state.prev) && isWordChar(state.cur)) { |
884 | return true; |
885 | } |
886 | } |
887 | |
888 | if (flags & POS_FLAG_ASSERT_NONWORD_TO_NONWORD) { |
889 | if (!isWordChar(state.prev) && !isWordChar(state.cur)) { |
890 | return true; |
891 | } |
892 | } |
893 | |
894 | return false; |
895 | } |
896 | |
897 | static |
898 | void getAcceptMatches(const NGHolder &g, MatchSet &matches, |
899 | struct fmstate &state, NFAVertex accept_vertex, |
900 | vector<StateSet::State> &active_states) { |
901 | assert(accept_vertex == g.accept || accept_vertex == g.acceptEod); |
902 | |
903 | const bool eod = accept_vertex == g.acceptEod; |
904 | if (eod) { |
905 | state.states.getAcceptEodStates(state.gc, active_states); |
906 | } else { |
907 | state.states.getAcceptStates(state.gc, active_states); |
908 | } |
909 | |
910 | DEBUG_PRINTF("Number of active states: %zu\n" , active_states.size()); |
911 | |
912 | for (const auto &cur : active_states) { |
913 | auto u = state.vertices[cur.idx]; |
914 | |
915 | // we can't accept anything from startDs in between UTF-8 codepoints |
916 | if (state.utf8 && u == g.startDs && !isUtf8CodePoint(state.cur)) { |
917 | continue; |
918 | } |
919 | |
920 | const auto &reports = |
921 | eod ? state.gc.vertex_eod_reports_by_level[cur.level][u] |
922 | : state.gc.vertex_reports_by_level[cur.level][u]; |
923 | |
924 | NFAEdge e = state.edge_cache.get(u, accept_vertex); |
925 | |
926 | // we assume edge assertions only exist at level 0 |
927 | if (e && !canReach(g, e, state)) { |
928 | continue; |
929 | } |
930 | |
931 | DEBUG_PRINTF("%smatch found at %zu\n" , eod ? "eod " : "" , state.offset); |
932 | |
933 | assert(!reports.empty()); |
934 | for (const auto &report_id : reports) { |
935 | const Report &ri = state.rm.getReport(report_id); |
936 | |
937 | DEBUG_PRINTF("report %u has offset adjustment %d\n" , report_id, |
938 | ri.offsetAdjust); |
939 | DEBUG_PRINTF("match from (i:%zu,l:%u,t:%u): (%zu,%zu)\n" , cur.idx, |
940 | cur.level, cur.type, cur.som, |
941 | state.offset + ri.offsetAdjust); |
942 | matches.emplace(cur.som, state.offset + ri.offsetAdjust); |
943 | } |
944 | } |
945 | } |
946 | |
947 | static |
948 | void getMatches(const NGHolder &g, MatchSet &matches, struct fmstate &state, |
949 | StateSet::WorkingData &wd, bool allowEodMatches) { |
950 | getAcceptMatches(g, matches, state, g.accept, wd.active); |
951 | if (allowEodMatches) { |
952 | getAcceptMatches(g, matches, state, g.acceptEod, wd.active); |
953 | } |
954 | } |
955 | |
956 | static |
957 | void step(const NGHolder &g, fmstate &state, StateSet::WorkingData &wd) { |
958 | state.next.reset(); |
959 | |
960 | state.states.getActiveStates(wd.active); |
961 | |
962 | for (const auto &cur : wd.active) { |
963 | auto u = state.vertices[cur.idx]; |
964 | state.states.getSuccessors(cur, state.gc, wd.succ_list); |
965 | |
966 | for (auto succ : wd.succ_list) { |
967 | auto v = state.vertices[succ.idx]; |
968 | |
969 | if (is_any_accept(v, g)) { |
970 | continue; |
971 | } |
972 | |
973 | if (!state.allowStartDs && v == g.startDs) { |
974 | continue; |
975 | } |
976 | |
977 | // GraphCache doesn't differentiate between successors for shadows |
978 | // and helpers, and StateSet does not know anything about the graph, |
979 | // so the only place we can do it is here. we can't self-loop on a |
980 | // startDs if we're startDs's helper, so disallow it. |
981 | if (u == g.startDs && v == g.startDs && |
982 | succ.level != 0 && succ.level == cur.level) { |
983 | continue; |
984 | } |
985 | |
986 | // for the reasons outlined above, also putting this here. |
987 | // disallow transitions from start to startDs on levels other than zero |
988 | if (u == g.start && v == g.startDs && |
989 | cur.level != 0 && succ.level != 0) { |
990 | continue; |
991 | } |
992 | |
993 | bool can_reach = false; |
994 | |
995 | if (succ.type == StateSet::State::NODE_HELPER) { |
996 | can_reach = true; |
997 | } else { |
998 | // we assume edge assertions only exist on level 0 |
999 | const CharReach &cr = g[v].char_reach; |
1000 | NFAEdge e = state.edge_cache.get(u, v); |
1001 | |
1002 | if (cr.test(state.cur) && |
1003 | (!e || canReach(g, e, state))) { |
1004 | can_reach = true; |
1005 | } |
1006 | } |
1007 | |
1008 | // check edge assertions if we are allowed to reach accept |
1009 | DEBUG_PRINTF("reaching %zu->%zu ('%c'->'%c'): %s\n" , |
1010 | g[u].index, g[v].index, |
1011 | ourisprint(state.prev) ? state.prev : '?', |
1012 | ourisprint(state.cur) ? state.cur : '?', |
1013 | can_reach ? "yes" : "no" ); |
1014 | |
1015 | if (can_reach) { |
1016 | // we should use current offset as SOM if: |
1017 | // - we're at level 0 and we're a start vertex |
1018 | // - we're a fake start shadow |
1019 | size_t next_som; |
1020 | bool reset = is_any_start(u, g) && cur.level == 0; |
1021 | reset |= is_virtual_start(u, g) && |
1022 | cur.type == StateSet::State::NODE_SHADOW; |
1023 | |
1024 | if (reset) { |
1025 | next_som = state.offset; |
1026 | } else { |
1027 | // else, inherit SOM from predecessor |
1028 | next_som = cur.som; |
1029 | } |
1030 | succ.som = next_som; |
1031 | |
1032 | DEBUG_PRINTF("src: idx %zu level: %u som: %zu type: %s\n" , |
1033 | cur.idx, cur.level, cur.som, |
1034 | cur.type == StateSet::State::NODE_HELPER ? "H" : "S" ); |
1035 | DEBUG_PRINTF("dst: idx %zu level: %u som: %zu type: %s\n" , |
1036 | succ.idx, succ.level, succ.som, |
1037 | succ.type == StateSet::State::NODE_HELPER ? "H" : "S" ); |
1038 | |
1039 | // activate successor (SOM will be handled by activateState) |
1040 | state.next.activateState(succ); |
1041 | } |
1042 | } |
1043 | } |
1044 | } |
1045 | |
1046 | // filter extraneous matches |
1047 | static |
1048 | void filterMatches(MatchSet &matches) { |
1049 | set<size_t> eom; |
1050 | |
1051 | // first, collect all end-offset matches |
1052 | for (const auto &match : matches) { |
1053 | eom.insert(match.second); |
1054 | } |
1055 | |
1056 | // now, go through all the end-offsets and filter extra matches |
1057 | for (const auto &elem : eom) { |
1058 | // find minimum SOM for this EOM |
1059 | size_t min_som = -1U; |
1060 | for (const auto &match : matches) { |
1061 | // skip entries with wrong EOM |
1062 | if (match.second != elem) { |
1063 | continue; |
1064 | } |
1065 | |
1066 | min_som = min(min_som, match.first); |
1067 | } |
1068 | |
1069 | auto msit = matches.begin(); |
1070 | while (msit != matches.end()) { |
1071 | // skip everything that doesn't match |
1072 | if (msit->second != elem || msit->first <= min_som) { |
1073 | ++msit; |
1074 | continue; |
1075 | } |
1076 | DEBUG_PRINTF("erasing match %zu, %zu\n" , msit->first, msit->second); |
1077 | matches.erase(msit++); |
1078 | } |
1079 | } |
1080 | } |
1081 | |
1082 | /** \brief Find all matches for a given graph when executed against \a input. |
1083 | * |
1084 | * Fills \a matches with offsets into the data stream where a match is found. |
1085 | */ |
1086 | bool findMatches(const NGHolder &g, const ReportManager &rm, |
1087 | const string &input, MatchSet &matches, |
1088 | const u32 edit_distance, const u32 hamm_distance, |
1089 | const bool notEod, const bool utf8) { |
1090 | assert(hasCorrectlyNumberedVertices(g)); |
1091 | // cannot match fuzzy utf8 patterns, this should've been filtered out at |
1092 | // compile time, so make it an assert |
1093 | assert(!edit_distance || !utf8); |
1094 | // cannot be both edit and Hamming distance at once |
1095 | assert(!edit_distance || !hamm_distance); |
1096 | |
1097 | bool hamming = hamm_distance > 0; |
1098 | auto dist = hamming ? hamm_distance : edit_distance; |
1099 | |
1100 | const size_t total_states = num_vertices(g) * (3 * dist + 1); |
1101 | DEBUG_PRINTF("Finding matches (%zu total states)\n" , total_states); |
1102 | if (total_states > STATE_COUNT_MAX) { |
1103 | DEBUG_PRINTF("too big\n" ); |
1104 | return false; |
1105 | } |
1106 | |
1107 | GraphCache gc(edit_distance, hamm_distance, g); |
1108 | #ifdef DEBUG |
1109 | gc.dumpStateTransitionTable(g); |
1110 | #endif |
1111 | |
1112 | const bool allowStartDs = (proper_out_degree(g.startDs, g) > 0); |
1113 | |
1114 | struct fmstate state(g, gc, utf8, allowStartDs, dist, rm); |
1115 | |
1116 | StateSet::WorkingData wd; |
1117 | |
1118 | for (auto it = input.begin(), ite = input.end(); it != ite; ++it) { |
1119 | #ifdef DEBUG |
1120 | state.states.dumpActiveStates(); |
1121 | #endif |
1122 | state.offset = std::distance(input.begin(), it); |
1123 | state.cur = *it; |
1124 | |
1125 | step(g, state, wd); |
1126 | |
1127 | getMatches(g, matches, state, wd, false); |
1128 | |
1129 | DEBUG_PRINTF("offset %zu, %zu states on\n" , state.offset, |
1130 | state.next.count()); |
1131 | if (state.next.empty()) { |
1132 | filterMatches(matches); |
1133 | return true; |
1134 | } |
1135 | state.states = state.next; |
1136 | state.prev = state.cur; |
1137 | } |
1138 | #ifdef DEBUG |
1139 | state.states.dumpActiveStates(); |
1140 | #endif |
1141 | state.offset = input.size(); |
1142 | state.cur = 0; |
1143 | |
1144 | // do additional step to get matches after stream end, this time count eod |
1145 | // matches also (or not, if we're in notEod mode) |
1146 | |
1147 | DEBUG_PRINTF("Looking for EOD matches\n" ); |
1148 | getMatches(g, matches, state, wd, !notEod); |
1149 | |
1150 | filterMatches(matches); |
1151 | return true; |
1152 | } |
1153 | |