1/*
2 * Copyright (c) 2015-2017, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** \file
30 * \brief Pattern lifetime analysis.
31 */
32
33#include "config.h"
34
35#include "ng_find_matches.h"
36
37#include "nfagraph/ng_holder.h"
38#include "nfagraph/ng_util.h"
39#include "parser/position.h"
40#include "util/container.h"
41#include "util/compare.h"
42#include "util/report.h"
43#include "util/report_manager.h"
44#include "util/unordered.h"
45
46#include <algorithm>
47
48using namespace std;
49using namespace ue2;
50
51using MatchSet = set<pair<size_t, size_t>>;
52using StateBitSet = boost::dynamic_bitset<>;
53
54namespace {
55
56/** \brief Max number of states (taking edit distance into account). */
57static constexpr size_t STATE_COUNT_MAX = 15000;
58
59// returns all successors up to a given depth in a vector of sets, indexed by
60// zero-based depth from source vertex
61static
62vector<flat_set<NFAVertex>>
63gatherSuccessorsByDepth(const NGHolder &g, const NFAVertex &src, u32 depth) {
64 assert(depth > 0);
65
66 vector<flat_set<NFAVertex>> result(depth);
67
68 // populate current set of successors
69 for (auto v : adjacent_vertices_range(src, g)) {
70 // ignore self-loops
71 if (src == v) {
72 continue;
73 }
74 DEBUG_PRINTF("Node %zu depth 1\n", g[v].index);
75 result[0].insert(v);
76 }
77
78 for (u32 d = 1; d < depth; d++) {
79 // collect all successors for all current level vertices
80 const auto &cur = result[d - 1];
81 auto &next = result[d];
82 for (auto u : cur) {
83 // don't go past special nodes
84 if (is_special(u, g)) {
85 continue;
86 }
87
88 for (auto v : adjacent_vertices_range(u, g)) {
89 // ignore self-loops
90 if (u == v) {
91 continue;
92 }
93 DEBUG_PRINTF("Node %zu depth %u\n", g[v].index, d + 1);
94 next.insert(v);
95 }
96 }
97 }
98
99 return result;
100}
101
102// returns all predecessors up to a given depth in a vector of sets, indexed by
103// zero-based depth from source vertex
104static
105vector<flat_set<NFAVertex>>
106gatherPredecessorsByDepth(const NGHolder &g, NFAVertex src, u32 depth) {
107 assert(depth > 0);
108
109 vector<flat_set<NFAVertex>> result(depth);
110
111 // populate current set of successors
112 for (auto v : inv_adjacent_vertices_range(src, g)) {
113 // ignore self-loops
114 if (src == v) {
115 continue;
116 }
117 DEBUG_PRINTF("Node %zu depth 1\n", g[v].index);
118 result[0].insert(v);
119 }
120
121 for (u32 d = 1; d < depth; d++) {
122 // collect all successors for all current level vertices
123 const auto &cur = result[d - 1];
124 auto &next = result[d];
125 for (auto v : cur) {
126 for (auto u : inv_adjacent_vertices_range(v, g)) {
127 // ignore self-loops
128 if (v == u) {
129 continue;
130 }
131 DEBUG_PRINTF("Node %zu depth %u\n", g[u].index, d + 1);
132 next.insert(u);
133 }
134 }
135 }
136
137 return result;
138}
139
140// this is a per-vertex, per-shadow level state transition table
141struct GraphCache {
142 GraphCache(u32 dist_in, u32 hamm_in, const NGHolder &g)
143 : hamming(hamm_in > 0), size(num_vertices(g)),
144 edit_distance(hamming ? hamm_in : dist_in)
145 {
146 auto dist_max = edit_distance + 1;
147
148 allocateStateTransitionTable(dist_max);
149 populateTransitionCache(g, dist_max);
150 populateAcceptCache(g, dist_max);
151 }
152
153 void allocateStateTransitionTable(u32 dist_max) {
154 // resize level 1 - per vertex
155 shadow_transitions.resize(size);
156 helper_transitions.resize(size);
157
158 // resize level 2 - per shadow level
159 for (u32 i = 0; i < size; i++) {
160 shadow_transitions[i].resize(dist_max);
161 helper_transitions[i].resize(dist_max);
162
163 // resize level 3 - per vertex
164 for (u32 d = 0; d < dist_max; d++) {
165 shadow_transitions[i][d].resize(size);
166 helper_transitions[i][d].resize(size);
167 }
168 }
169
170 // accept states are indexed by edit distance
171 accept_states.resize(dist_max);
172 accept_eod_states.resize(dist_max);
173
174 // vertex report maps are indexed by edit distance
175 vertex_reports_by_level.resize(dist_max);
176 vertex_eod_reports_by_level.resize(dist_max);
177 }
178
179 /*
180 * certain transitions to helpers are disallowed:
181 * 1. transitions from accept/acceptEod
182 * 2. transitions to accept/acceptEod
183 * 3. from start to startDs
184 * 4. to a virtual/multiline start
185 *
186 * everything else is allowed.
187 */
188 bool canTransitionToHelper(NFAVertex u, NFAVertex v, const NGHolder &g) const {
189 if (is_any_accept(u, g)) {
190 return false;
191 }
192 if (is_any_accept(v, g)) {
193 return false;
194 }
195 if (u == g.start && v == g.startDs) {
196 return false;
197 }
198 if (is_virtual_start(v, g)) {
199 return false;
200 }
201 return true;
202 }
203
204 void populateTransitionCache(const NGHolder &g, u32 dist_max) {
205 // populate mapping of vertex index to vertex
206 vector<NFAVertex> idx_to_v(size);
207 for (auto v : vertices_range(g)) {
208 idx_to_v[g[v].index] = v;
209 }
210
211 for (u32 i = 0; i < size; i++) {
212 auto cur_v = idx_to_v[i];
213
214 // set up transition tables
215 auto succs = gatherSuccessorsByDepth(g, cur_v, dist_max);
216
217 assert(succs.size() == dist_max);
218
219 for (u32 d = 0; d < dist_max; d++) {
220 auto &v_shadows = shadow_transitions[i][d];
221 auto cur_v_bit = i;
222
223 // enable transition to next level helper (this handles insertion)
224 if (!hamming && d < edit_distance && !is_any_accept(cur_v, g)) {
225 auto &next_v_helpers = helper_transitions[i][d + 1];
226
227 next_v_helpers.set(cur_v_bit);
228 }
229
230 // if vertex has a self-loop, we can also transition to it,
231 // but only if we're at shadow level 0
232 if (edge(cur_v, cur_v, g).second && d == 0) {
233 v_shadows.set(cur_v_bit);
234 }
235
236 if (hamming && d > 0) {
237 continue;
238 }
239
240 // populate state transition tables
241 for (auto v : succs[d]) {
242 auto v_bit = g[v].index;
243
244 // we cannot transition to startDs on any level other than
245 // level 0
246 if (v != g.startDs || d == 0) {
247 // this handles direct transitions as well as removals
248 v_shadows.set(v_bit);
249 }
250
251 // we can also transition to next-level helper (handles
252 // replace), provided we meet the criteria
253 if (d < edit_distance && canTransitionToHelper(cur_v, v, g)) {
254 auto &next_v_helpers = helper_transitions[i][d + 1];
255
256 next_v_helpers.set(v_bit);
257 }
258 }
259 }
260 }
261 }
262
263 void populateAcceptCache(const NGHolder &g, u32 dist_max) {
264 // set up accept states masks
265 StateBitSet accept(size);
266 accept.set(g[g.accept].index);
267 StateBitSet accept_eod(size);
268 accept_eod.set(g[g.acceptEod].index);
269
270 // gather accept and acceptEod states
271 for (u32 base_dist = 0; base_dist < dist_max; base_dist++) {
272 auto &states = accept_states[base_dist];
273 auto &eod_states = accept_eod_states[base_dist];
274
275 states.resize(size);
276 eod_states.resize(size);
277
278 // inspect each vertex
279 for (u32 i = 0; i < size; i++) {
280 // inspect all shadow levels from base_dist to dist_max
281 for (u32 d = 0; d < dist_max - base_dist; d++) {
282 auto &shadows = shadow_transitions[i][d];
283
284 // if this state transitions to accept, set its bit
285 if ((shadows & accept).any()) {
286 states.set(i);
287 }
288 if ((shadows & accept_eod).any()) {
289 eod_states.set(i);
290 }
291 }
292 }
293 }
294
295 // populate accepts cache
296 for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
297 const auto &rs = g[v].reports;
298
299 for (u32 d = 0; d <= edit_distance; d++) {
300 // add self to report list at all levels
301 vertex_reports_by_level[d][v].insert(rs.begin(), rs.end());
302 }
303
304 if (edit_distance == 0 || hamming) {
305 // if edit distance is 0, no predecessors will have reports
306 continue;
307 }
308
309 auto preds_by_depth = gatherPredecessorsByDepth(g, v, edit_distance);
310 for (u32 pd = 0; pd < preds_by_depth.size(); pd++) {
311 const auto &preds = preds_by_depth[pd];
312 // for each predecessor, add reports up to maximum edit distance
313 // for current depth from source vertex
314 for (auto pred : preds) {
315 for (u32 d = 0; d < edit_distance - pd; d++) {
316 vertex_reports_by_level[d][pred].insert(rs.begin(), rs.end());
317 }
318 }
319 }
320 }
321 for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) {
322 const auto &rs = g[v].reports;
323
324 if (v == g.accept) {
325 continue;
326 }
327
328 for (u32 d = 0; d <= edit_distance; d++) {
329 // add self to report list at all levels
330 vertex_eod_reports_by_level[d][v].insert(rs.begin(), rs.end());
331 }
332 if (edit_distance == 0 || hamming) {
333 // if edit distance is 0, no predecessors will have reports
334 continue;
335 }
336
337 auto preds_by_depth = gatherPredecessorsByDepth(g, v, edit_distance);
338 for (u32 pd = 0; pd < preds_by_depth.size(); pd++) {
339 const auto &preds = preds_by_depth[pd];
340 // for each predecessor, add reports up to maximum edit distance
341 // for current depth from source vertex
342 for (auto pred : preds) {
343 for (u32 d = 0; d < edit_distance - pd; d++) {
344 vertex_eod_reports_by_level[d][pred].insert(rs.begin(), rs.end());
345 }
346 }
347 }
348 }
349 }
350
351#ifdef DEBUG
352 void dumpStateTransitionTable(const NGHolder &g) {
353 StateBitSet accept(size);
354 accept.set(g[g.accept].index);
355 StateBitSet accept_eod(size);
356 accept_eod.set(g[g.acceptEod].index);
357
358 DEBUG_PRINTF("Dumping state transition tables\n");
359 DEBUG_PRINTF("Shadows:\n");
360 for (u32 i = 0; i < num_vertices(g); i++) {
361 DEBUG_PRINTF("%-7s %3u:", "Vertex", i);
362 for (u32 j = 0; j < num_vertices(g); j++) {
363 printf("%3i", j);
364 }
365 printf("\n");
366 for (u32 d = 0; d <= edit_distance; d++) {
367 DEBUG_PRINTF("%-7s %3u:", "Level", d);
368 const auto &s = getShadowTransitions(i, d);
369 for (u32 j = 0; j < num_vertices(g); j++) {
370 printf("%3i", s.test(j));
371 }
372 printf("\n");
373 }
374 DEBUG_PRINTF("\n");
375 }
376
377 DEBUG_PRINTF("Helpers:\n");
378 for (u32 i = 0; i < num_vertices(g); i++) {
379 DEBUG_PRINTF("%-7s %3u:", "Vertex", i);
380 for (u32 j = 0; j < num_vertices(g); j++) {
381 printf("%3i", j);
382 }
383 printf("\n");
384 for (u32 d = 0; d <= edit_distance; d++) {
385 DEBUG_PRINTF("%-7s %3u:", "Level", d);
386 const auto &s = getHelperTransitions(i, d);
387 for (u32 j = 0; j < num_vertices(g); j++) {
388 printf("%3i", s.test(j));
389 }
390 printf("\n");
391 }
392 DEBUG_PRINTF("\n");
393 }
394
395 DEBUG_PRINTF("Accept transitions:\n");
396 DEBUG_PRINTF("%-12s", "Vertex idx:");
397 for (u32 j = 0; j < num_vertices(g); j++) {
398 printf("%3i", j);
399 }
400 printf("\n");
401 for (u32 d = 0; d <= edit_distance; d++) {
402 DEBUG_PRINTF("%-7s %3u:", "Level", d);
403 const auto &s = getAcceptTransitions(d);
404 for (u32 j = 0; j < num_vertices(g); j++) {
405 printf("%3i", s.test(j));
406 }
407 printf("\n");
408 }
409 DEBUG_PRINTF("\n");
410
411 DEBUG_PRINTF("Accept EOD transitions:\n");
412 DEBUG_PRINTF("%-12s", "Vertex idx:");
413 for (u32 j = 0; j < num_vertices(g); j++) {
414 printf("%3i", j);
415 }
416 printf("\n");
417 for (u32 d = 0; d <= edit_distance; d++) {
418 DEBUG_PRINTF("%-7s %3u:", "Level", d);
419 const auto &s = getAcceptEodTransitions(d);
420 for (u32 j = 0; j < num_vertices(g); j++) {
421 printf("%3i", s.test(j));
422 }
423 printf("\n");
424 }
425 DEBUG_PRINTF("\n");
426
427 DEBUG_PRINTF("%-12s ", "Accepts:");
428 for (u32 i = 0; i < num_vertices(g); i++) {
429 printf("%3i", accept.test(i));
430 }
431 printf("\n");
432
433 DEBUG_PRINTF("%-12s ", "EOD Accepts:");
434 for (u32 i = 0; i < num_vertices(g); i++) {
435 printf("%3i", accept_eod.test(i));
436 }
437 printf("\n");
438
439 DEBUG_PRINTF("Reports\n");
440 for (auto v : vertices_range(g)) {
441 for (u32 d = 0; d <= edit_distance; d++) {
442 const auto &r = vertex_reports_by_level[d][v];
443 const auto &e = vertex_eod_reports_by_level[d][v];
444 DEBUG_PRINTF("%-7s %3zu %-8s %3zu %-8s %3zu\n",
445 "Vertex", g[v].index, "rs:", r.size(), "eod:", e.size());
446 }
447 }
448 printf("\n");
449 }
450#endif
451
452 const StateBitSet& getShadowTransitions(u32 idx, u32 level) const {
453 assert(idx < size);
454 assert(level <= edit_distance);
455 return shadow_transitions[idx][level];
456 }
457 const StateBitSet& getHelperTransitions(u32 idx, u32 level) const {
458 assert(idx < size);
459 assert(level <= edit_distance);
460 return helper_transitions[idx][level];
461 }
462 const StateBitSet& getAcceptTransitions(u32 level) const {
463 assert(level <= edit_distance);
464 return accept_states[level];
465 }
466 const StateBitSet& getAcceptEodTransitions(u32 level) const {
467 assert(level <= edit_distance);
468 return accept_eod_states[level];
469 }
470
471 /*
472 * the bitsets are indexed by vertex and shadow level. the bitset's length is
473 * equal to the total number of vertices in the graph.
474 *
475 * for convenience, helper functions are provided.
476 */
477 vector<vector<StateBitSet>> shadow_transitions;
478 vector<vector<StateBitSet>> helper_transitions;
479
480 // accept states masks, indexed by shadow level
481 vector<StateBitSet> accept_states;
482 vector<StateBitSet> accept_eod_states;
483
484 // map of all reports associated with any vertex, indexed by shadow level
485 vector<map<NFAVertex, flat_set<ReportID>>> vertex_reports_by_level;
486 vector<map<NFAVertex, flat_set<ReportID>>> vertex_eod_reports_by_level;
487
488 bool hamming;
489 u32 size;
490 u32 edit_distance;
491};
492
493
494/*
495 * SOM workflow is expected to be the following:
496 * - Caller calls getActiveStates, which reports SOM for each active states
497 * - Caller calls getSuccessorStates on each of the active states, which *doesn't*
498 * report SOM
499 * - Caller decides if the successor state should be activated, and calls
500 * activateState with SOM set to that of previous active state (not successor!)
501 * - activateState then resolves any conflicts between SOMs that may arise from
502 * multiple active states progressing to the same successor
503 */
504struct StateSet {
505 struct State {
506 enum node_type {
507 NODE_SHADOW = 0,
508 NODE_HELPER
509 };
510 State(size_t idx_in, u32 level_in, size_t som_in, node_type type_in) :
511 idx(idx_in), level(level_in), som(som_in), type(type_in) {}
512 size_t idx;
513 u32 level;
514 size_t som;
515 node_type type;
516 };
517
518 // Temporary working data used for step() which we want to keep around
519 // (rather than reallocating vectors all the time).
520 struct WorkingData {
521 vector<State> active;
522 vector<State> succ_list;
523 };
524
525 StateSet(size_t sz, u32 dist_in) :
526 shadows(dist_in + 1), helpers(dist_in + 1),
527 shadows_som(dist_in + 1), helpers_som(dist_in + 1),
528 edit_distance(dist_in) {
529 for (u32 dist = 0; dist <= dist_in; dist++) {
530 shadows[dist].resize(sz, false);
531 helpers[dist].resize(sz, false);
532 shadows_som[dist].resize(sz, 0);
533 helpers_som[dist].resize(sz, 0);
534 }
535 }
536
537 void reset() {
538 for (u32 dist = 0; dist <= edit_distance; dist++) {
539 shadows[dist].reset();
540 helpers[dist].reset();
541 fill(shadows_som[dist].begin(), shadows_som[dist].end(), 0);
542 fill(helpers_som[dist].begin(), helpers_som[dist].end(), 0);
543 }
544 }
545
546 bool empty() const {
547 for (u32 dist = 0; dist <= edit_distance; dist++) {
548 if (shadows[dist].any()) {
549 return false;
550 }
551 if (helpers[dist].any()) {
552 return false;
553 }
554 }
555 return true;
556 }
557
558 size_t count() const {
559 size_t result = 0;
560
561 for (u32 dist = 0; dist <= edit_distance; dist++) {
562 result += shadows[dist].count();
563 result += helpers[dist].count();
564 }
565
566 return result;
567 }
568
569 bool setActive(const State &s) {
570 switch (s.type) {
571 case State::NODE_HELPER:
572 return helpers[s.level].test_set(s.idx);
573 case State::NODE_SHADOW:
574 return shadows[s.level].test_set(s.idx);
575 }
576 assert(0);
577 return false;
578 }
579
580 size_t getCachedSom(const State &s) const {
581 switch (s.type) {
582 case State::NODE_HELPER:
583 return helpers_som[s.level][s.idx];
584 case State::NODE_SHADOW:
585 return shadows_som[s.level][s.idx];
586 }
587 assert(0);
588 return 0;
589 }
590
591 void setCachedSom(const State &s, const size_t som_val) {
592 switch (s.type) {
593 case State::NODE_HELPER:
594 helpers_som[s.level][s.idx] = som_val;
595 break;
596 case State::NODE_SHADOW:
597 shadows_som[s.level][s.idx] = som_val;
598 break;
599 default:
600 assert(0);
601 }
602 }
603
604#ifdef DEBUG
605 void dumpActiveStates() const {
606 vector<State> states;
607 getActiveStates(states);
608
609 DEBUG_PRINTF("Dumping active states\n");
610
611 for (const auto &state : states) {
612 DEBUG_PRINTF("type: %s idx: %zu level: %u som: %zu\n",
613 state.type == State::NODE_HELPER ? "HELPER" : "SHADOW",
614 state.idx, state.level, state.som);
615 }
616 }
617#endif
618
619 void getActiveStates(vector<State> &result) const {
620 result.clear();
621
622 for (u32 dist = 0; dist <= edit_distance; dist++) {
623 // get all shadow vertices (including original graph)
624 const auto &cur_shadow_vertices = shadows[dist];
625 for (size_t id = cur_shadow_vertices.find_first();
626 id != cur_shadow_vertices.npos;
627 id = cur_shadow_vertices.find_next(id)) {
628 result.emplace_back(id, dist, shadows_som[dist][id],
629 State::NODE_SHADOW);
630 }
631
632 // the rest is only valid for edited graphs
633 if (dist == 0) {
634 continue;
635 }
636
637 // get all helper vertices
638 const auto &cur_helper_vertices = helpers[dist];
639 for (size_t id = cur_helper_vertices.find_first();
640 id != cur_helper_vertices.npos;
641 id = cur_helper_vertices.find_next(id)) {
642 result.emplace_back(id, dist, helpers_som[dist][id],
643 State::NODE_HELPER);
644 }
645 }
646
647 sort_and_unique(result);
648 }
649
650 // does not return SOM
651 void getSuccessors(const State &state, const GraphCache &gc,
652 vector<State> &result) const {
653 result.clear();
654
655 // maximum shadow depth that we can go from current level
656 u32 max_depth = edit_distance - state.level + 1;
657
658 for (u32 d = 0; d < max_depth; d++) {
659 const auto &shadow_succ = gc.getShadowTransitions(state.idx, d);
660 for (size_t id = shadow_succ.find_first();
661 id != shadow_succ.npos;
662 id = shadow_succ.find_next(id)) {
663 auto new_level = state.level + d;
664 result.emplace_back(id, new_level, 0, State::NODE_SHADOW);
665 }
666
667 const auto &helper_succ = gc.getHelperTransitions(state.idx, d);
668 for (size_t id = helper_succ.find_first();
669 id != helper_succ.npos;
670 id = helper_succ.find_next(id)) {
671 auto new_level = state.level + d;
672 result.emplace_back(id, new_level, 0, State::NODE_HELPER);
673 }
674 }
675
676 sort_and_unique(result);
677 }
678
679 void getAcceptStates(const GraphCache &gc, vector<State> &result) const {
680 result.clear();
681
682 for (u32 dist = 0; dist <= edit_distance; dist++) {
683 // get all shadow vertices (including original graph)
684 auto cur_shadow_vertices = shadows[dist];
685 cur_shadow_vertices &= gc.getAcceptTransitions(dist);
686 for (size_t id = cur_shadow_vertices.find_first();
687 id != cur_shadow_vertices.npos;
688 id = cur_shadow_vertices.find_next(id)) {
689 result.emplace_back(id, dist, shadows_som[dist][id],
690 State::NODE_SHADOW);
691 }
692
693 auto cur_helper_vertices = helpers[dist];
694 cur_helper_vertices &= gc.getAcceptTransitions(dist);
695 for (size_t id = cur_helper_vertices.find_first();
696 id != cur_helper_vertices.npos;
697 id = cur_helper_vertices.find_next(id)) {
698 result.emplace_back(id, dist, helpers_som[dist][id],
699 State::NODE_HELPER);
700 }
701 }
702
703 sort_and_unique(result);
704 }
705
706 void getAcceptEodStates(const GraphCache &gc, vector<State> &result) const {
707 result.clear();
708
709 for (u32 dist = 0; dist <= edit_distance; dist++) {
710 // get all shadow vertices (including original graph)
711 auto cur_shadow_vertices = shadows[dist];
712 cur_shadow_vertices &= gc.getAcceptEodTransitions(dist);
713 for (size_t id = cur_shadow_vertices.find_first();
714 id != cur_shadow_vertices.npos;
715 id = cur_shadow_vertices.find_next(id)) {
716 result.emplace_back(id, dist, shadows_som[dist][id],
717 State::NODE_SHADOW);
718 }
719
720 auto cur_helper_vertices = helpers[dist];
721 cur_helper_vertices &= gc.getAcceptEodTransitions(dist);
722 for (size_t id = cur_helper_vertices.find_first();
723 id != cur_helper_vertices.npos;
724 id = cur_helper_vertices.find_next(id)) {
725 result.emplace_back(id, dist, helpers_som[dist][id],
726 State::NODE_HELPER);
727 }
728 }
729
730 sort_and_unique(result);
731 }
732
733 // the caller must specify SOM at current offset, and must not attempt to
734 // resolve SOM inheritance conflicts
735 void activateState(const State &state) {
736 size_t cur_som = state.som;
737 if (setActive(state)) {
738 size_t cached_som = getCachedSom(state);
739 cur_som = min(cur_som, cached_som);
740 }
741 setCachedSom(state, cur_som);
742 }
743
744 vector<StateBitSet> shadows;
745 vector<StateBitSet> helpers;
746 vector<vector<size_t>> shadows_som;
747 vector<vector<size_t>> helpers_som;
748 u32 edit_distance;
749};
750
751// for flat_set
752bool operator<(const StateSet::State &a, const StateSet::State &b) {
753 ORDER_CHECK(idx);
754 ORDER_CHECK(level);
755 ORDER_CHECK(type);
756 ORDER_CHECK(som);
757 return false;
758}
759
760bool operator==(const StateSet::State &a, const StateSet::State &b) {
761 return a.idx == b.idx && a.level == b.level && a.type == b.type &&
762 a.som == b.som;
763}
764
765/** \brief Cache to speed up edge lookups, rather than hitting the graph. */
766struct EdgeCache {
767 explicit EdgeCache(const NGHolder &g) {
768 cache.reserve(num_vertices(g));
769 for (auto e : edges_range(g)) {
770 cache.emplace(make_pair(source(e, g), target(e, g)), e);
771 }
772 }
773
774 NFAEdge get(NFAVertex u, NFAVertex v) const {
775 auto it = cache.find(make_pair(u, v));
776 if (it != cache.end()) {
777 return it->second;
778 }
779 return NFAEdge();
780 }
781
782private:
783 ue2_unordered_map<pair<NFAVertex, NFAVertex>, NFAEdge> cache;
784};
785
786struct fmstate {
787 const size_t num_states; // number of vertices in graph
788 StateSet states; // currently active states
789 StateSet next; // states on after this iteration
790 GraphCache &gc;
791 vector<NFAVertex> vertices; // mapping from index to vertex
792 EdgeCache edge_cache;
793 size_t offset = 0;
794 unsigned char cur = 0;
795 unsigned char prev = 0;
796 const bool utf8;
797 const bool allowStartDs;
798 const ReportManager &rm;
799
800 fmstate(const NGHolder &g, GraphCache &gc_in, bool utf8_in, bool aSD_in,
801 const u32 edit_distance, const ReportManager &rm_in)
802 : num_states(num_vertices(g)),
803 states(num_states, edit_distance),
804 next(num_states, edit_distance),
805 gc(gc_in), vertices(num_vertices(g), NGHolder::null_vertex()),
806 edge_cache(g), utf8(utf8_in), allowStartDs(aSD_in), rm(rm_in) {
807 // init states
808 states.activateState(
809 StateSet::State {g[g.start].index, 0, 0,
810 StateSet::State::NODE_SHADOW});
811 if (allowStartDs) {
812 states.activateState(
813 StateSet::State {g[g.startDs].index, 0, 0,
814 StateSet::State::NODE_SHADOW});
815 }
816 // fill vertex mapping
817 for (auto v : vertices_range(g)) {
818 vertices[g[v].index] = v;
819 }
820 }
821};
822
823} // namespace
824
825static
826bool isWordChar(const unsigned char c) {
827 // check if it's an alpha character
828 if (ourisalpha(c)) {
829 return true;
830 }
831 // check if it's a digit
832 if (c >= '0' && c <= '9') {
833 return true;
834 }
835 // check if it's an underscore
836 if (c == '_') {
837 return true;
838 }
839 return false;
840}
841
842static
843bool isUtf8CodePoint(const char c) {
844 // check if this is a start of 4-byte character
845 if ((c & 0xF8) == 0xF0) {
846 return true;
847 }
848 // check if this is a start of 3-byte character
849 if ((c & 0xF0) == 0xE0) {
850 return true;
851 }
852 // check if this is a start of 2-byte character
853 if ((c & 0xE0) == 0xC0) {
854 return true;
855 }
856 // check if this is a single-byte character
857 if ((c & 0x80) == 0) {
858 return true;
859 }
860 return false;
861}
862
863static
864bool canReach(const NGHolder &g, const NFAEdge &e, struct fmstate &state) {
865 auto flags = g[e].assert_flags;
866 if (!flags) {
867 return true;
868 }
869
870 if (flags & POS_FLAG_ASSERT_WORD_TO_NONWORD) {
871 if (isWordChar(state.prev) && !isWordChar(state.cur)) {
872 return true;
873 }
874 }
875
876 if (flags & POS_FLAG_ASSERT_NONWORD_TO_WORD) {
877 if (!isWordChar(state.prev) && isWordChar(state.cur)) {
878 return true;
879 }
880 }
881
882 if (flags & POS_FLAG_ASSERT_WORD_TO_WORD) {
883 if (isWordChar(state.prev) && isWordChar(state.cur)) {
884 return true;
885 }
886 }
887
888 if (flags & POS_FLAG_ASSERT_NONWORD_TO_NONWORD) {
889 if (!isWordChar(state.prev) && !isWordChar(state.cur)) {
890 return true;
891 }
892 }
893
894 return false;
895}
896
897static
898void getAcceptMatches(const NGHolder &g, MatchSet &matches,
899 struct fmstate &state, NFAVertex accept_vertex,
900 vector<StateSet::State> &active_states) {
901 assert(accept_vertex == g.accept || accept_vertex == g.acceptEod);
902
903 const bool eod = accept_vertex == g.acceptEod;
904 if (eod) {
905 state.states.getAcceptEodStates(state.gc, active_states);
906 } else {
907 state.states.getAcceptStates(state.gc, active_states);
908 }
909
910 DEBUG_PRINTF("Number of active states: %zu\n", active_states.size());
911
912 for (const auto &cur : active_states) {
913 auto u = state.vertices[cur.idx];
914
915 // we can't accept anything from startDs in between UTF-8 codepoints
916 if (state.utf8 && u == g.startDs && !isUtf8CodePoint(state.cur)) {
917 continue;
918 }
919
920 const auto &reports =
921 eod ? state.gc.vertex_eod_reports_by_level[cur.level][u]
922 : state.gc.vertex_reports_by_level[cur.level][u];
923
924 NFAEdge e = state.edge_cache.get(u, accept_vertex);
925
926 // we assume edge assertions only exist at level 0
927 if (e && !canReach(g, e, state)) {
928 continue;
929 }
930
931 DEBUG_PRINTF("%smatch found at %zu\n", eod ? "eod " : "", state.offset);
932
933 assert(!reports.empty());
934 for (const auto &report_id : reports) {
935 const Report &ri = state.rm.getReport(report_id);
936
937 DEBUG_PRINTF("report %u has offset adjustment %d\n", report_id,
938 ri.offsetAdjust);
939 DEBUG_PRINTF("match from (i:%zu,l:%u,t:%u): (%zu,%zu)\n", cur.idx,
940 cur.level, cur.type, cur.som,
941 state.offset + ri.offsetAdjust);
942 matches.emplace(cur.som, state.offset + ri.offsetAdjust);
943 }
944 }
945}
946
947static
948void getMatches(const NGHolder &g, MatchSet &matches, struct fmstate &state,
949 StateSet::WorkingData &wd, bool allowEodMatches) {
950 getAcceptMatches(g, matches, state, g.accept, wd.active);
951 if (allowEodMatches) {
952 getAcceptMatches(g, matches, state, g.acceptEod, wd.active);
953 }
954}
955
956static
957void step(const NGHolder &g, fmstate &state, StateSet::WorkingData &wd) {
958 state.next.reset();
959
960 state.states.getActiveStates(wd.active);
961
962 for (const auto &cur : wd.active) {
963 auto u = state.vertices[cur.idx];
964 state.states.getSuccessors(cur, state.gc, wd.succ_list);
965
966 for (auto succ : wd.succ_list) {
967 auto v = state.vertices[succ.idx];
968
969 if (is_any_accept(v, g)) {
970 continue;
971 }
972
973 if (!state.allowStartDs && v == g.startDs) {
974 continue;
975 }
976
977 // GraphCache doesn't differentiate between successors for shadows
978 // and helpers, and StateSet does not know anything about the graph,
979 // so the only place we can do it is here. we can't self-loop on a
980 // startDs if we're startDs's helper, so disallow it.
981 if (u == g.startDs && v == g.startDs &&
982 succ.level != 0 && succ.level == cur.level) {
983 continue;
984 }
985
986 // for the reasons outlined above, also putting this here.
987 // disallow transitions from start to startDs on levels other than zero
988 if (u == g.start && v == g.startDs &&
989 cur.level != 0 && succ.level != 0) {
990 continue;
991 }
992
993 bool can_reach = false;
994
995 if (succ.type == StateSet::State::NODE_HELPER) {
996 can_reach = true;
997 } else {
998 // we assume edge assertions only exist on level 0
999 const CharReach &cr = g[v].char_reach;
1000 NFAEdge e = state.edge_cache.get(u, v);
1001
1002 if (cr.test(state.cur) &&
1003 (!e || canReach(g, e, state))) {
1004 can_reach = true;
1005 }
1006 }
1007
1008 // check edge assertions if we are allowed to reach accept
1009 DEBUG_PRINTF("reaching %zu->%zu ('%c'->'%c'): %s\n",
1010 g[u].index, g[v].index,
1011 ourisprint(state.prev) ? state.prev : '?',
1012 ourisprint(state.cur) ? state.cur : '?',
1013 can_reach ? "yes" : "no");
1014
1015 if (can_reach) {
1016 // we should use current offset as SOM if:
1017 // - we're at level 0 and we're a start vertex
1018 // - we're a fake start shadow
1019 size_t next_som;
1020 bool reset = is_any_start(u, g) && cur.level == 0;
1021 reset |= is_virtual_start(u, g) &&
1022 cur.type == StateSet::State::NODE_SHADOW;
1023
1024 if (reset) {
1025 next_som = state.offset;
1026 } else {
1027 // else, inherit SOM from predecessor
1028 next_som = cur.som;
1029 }
1030 succ.som = next_som;
1031
1032 DEBUG_PRINTF("src: idx %zu level: %u som: %zu type: %s\n",
1033 cur.idx, cur.level, cur.som,
1034 cur.type == StateSet::State::NODE_HELPER ? "H" : "S");
1035 DEBUG_PRINTF("dst: idx %zu level: %u som: %zu type: %s\n",
1036 succ.idx, succ.level, succ.som,
1037 succ.type == StateSet::State::NODE_HELPER ? "H" : "S");
1038
1039 // activate successor (SOM will be handled by activateState)
1040 state.next.activateState(succ);
1041 }
1042 }
1043 }
1044}
1045
1046// filter extraneous matches
1047static
1048void filterMatches(MatchSet &matches) {
1049 set<size_t> eom;
1050
1051 // first, collect all end-offset matches
1052 for (const auto &match : matches) {
1053 eom.insert(match.second);
1054 }
1055
1056 // now, go through all the end-offsets and filter extra matches
1057 for (const auto &elem : eom) {
1058 // find minimum SOM for this EOM
1059 size_t min_som = -1U;
1060 for (const auto &match : matches) {
1061 // skip entries with wrong EOM
1062 if (match.second != elem) {
1063 continue;
1064 }
1065
1066 min_som = min(min_som, match.first);
1067 }
1068
1069 auto msit = matches.begin();
1070 while (msit != matches.end()) {
1071 // skip everything that doesn't match
1072 if (msit->second != elem || msit->first <= min_som) {
1073 ++msit;
1074 continue;
1075 }
1076 DEBUG_PRINTF("erasing match %zu, %zu\n", msit->first, msit->second);
1077 matches.erase(msit++);
1078 }
1079 }
1080}
1081
1082/** \brief Find all matches for a given graph when executed against \a input.
1083 *
1084 * Fills \a matches with offsets into the data stream where a match is found.
1085 */
1086bool findMatches(const NGHolder &g, const ReportManager &rm,
1087 const string &input, MatchSet &matches,
1088 const u32 edit_distance, const u32 hamm_distance,
1089 const bool notEod, const bool utf8) {
1090 assert(hasCorrectlyNumberedVertices(g));
1091 // cannot match fuzzy utf8 patterns, this should've been filtered out at
1092 // compile time, so make it an assert
1093 assert(!edit_distance || !utf8);
1094 // cannot be both edit and Hamming distance at once
1095 assert(!edit_distance || !hamm_distance);
1096
1097 bool hamming = hamm_distance > 0;
1098 auto dist = hamming ? hamm_distance : edit_distance;
1099
1100 const size_t total_states = num_vertices(g) * (3 * dist + 1);
1101 DEBUG_PRINTF("Finding matches (%zu total states)\n", total_states);
1102 if (total_states > STATE_COUNT_MAX) {
1103 DEBUG_PRINTF("too big\n");
1104 return false;
1105 }
1106
1107 GraphCache gc(edit_distance, hamm_distance, g);
1108#ifdef DEBUG
1109 gc.dumpStateTransitionTable(g);
1110#endif
1111
1112 const bool allowStartDs = (proper_out_degree(g.startDs, g) > 0);
1113
1114 struct fmstate state(g, gc, utf8, allowStartDs, dist, rm);
1115
1116 StateSet::WorkingData wd;
1117
1118 for (auto it = input.begin(), ite = input.end(); it != ite; ++it) {
1119#ifdef DEBUG
1120 state.states.dumpActiveStates();
1121#endif
1122 state.offset = std::distance(input.begin(), it);
1123 state.cur = *it;
1124
1125 step(g, state, wd);
1126
1127 getMatches(g, matches, state, wd, false);
1128
1129 DEBUG_PRINTF("offset %zu, %zu states on\n", state.offset,
1130 state.next.count());
1131 if (state.next.empty()) {
1132 filterMatches(matches);
1133 return true;
1134 }
1135 state.states = state.next;
1136 state.prev = state.cur;
1137 }
1138#ifdef DEBUG
1139 state.states.dumpActiveStates();
1140#endif
1141 state.offset = input.size();
1142 state.cur = 0;
1143
1144 // do additional step to get matches after stream end, this time count eod
1145 // matches also (or not, if we're in notEod mode)
1146
1147 DEBUG_PRINTF("Looking for EOD matches\n");
1148 getMatches(g, matches, state, wd, !notEod);
1149
1150 filterMatches(matches);
1151 return true;
1152}
1153