1/*
2 * Copyright (c) 2015-2018, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** \file
30 * \brief Build code for McClellan DFA.
31 */
32#include "ng_mcclellan.h"
33
34#include "grey.h"
35#include "nfa/dfa_min.h"
36#include "nfa/rdfa.h"
37#include "ng_holder.h"
38#include "ng_mcclellan_internal.h"
39#include "ng_squash.h"
40#include "ng_util.h"
41#include "ue2common.h"
42#include "util/bitfield.h"
43#include "util/determinise.h"
44#include "util/flat_containers.h"
45#include "util/graph_range.h"
46#include "util/hash.h"
47#include "util/hash_dynamic_bitset.h"
48#include "util/make_unique.h"
49#include "util/report_manager.h"
50
51#include <algorithm>
52#include <functional>
53#include <map>
54#include <set>
55#include <unordered_map>
56#include <vector>
57
58#include <boost/dynamic_bitset.hpp>
59
60using namespace std;
61using boost::dynamic_bitset;
62
63namespace ue2 {
64
65#define FINAL_DFA_STATE_LIMIT 16383
66#define DFA_STATE_LIMIT 1024
67#define NFA_STATE_LIMIT 256
68
69u16 buildAlphabetFromEquivSets(const std::vector<CharReach> &esets,
70 array<u16, ALPHABET_SIZE> &alpha,
71 array<u16, ALPHABET_SIZE> &unalpha) {
72 u16 i = 0;
73 for (; i < esets.size(); i++) {
74 const CharReach &cr = esets[i];
75
76#ifdef DEBUG
77 DEBUG_PRINTF("eq set: ");
78 for (size_t s = cr.find_first(); s != CharReach::npos;
79 s = cr.find_next(s)) {
80 printf("%02hhx ", (u8)s);
81 }
82 printf("-> %u\n", i);
83#endif
84 u16 leader = cr.find_first();
85 for (size_t s = cr.find_first(); s != CharReach::npos;
86 s = cr.find_next(s)) {
87 alpha[s] = i;
88 }
89 unalpha[i] = leader;
90 }
91
92 for (u16 j = N_CHARS; j < ALPHABET_SIZE; j++, i++) {
93 alpha[j] = i;
94 unalpha[i] = j;
95 }
96
97 return i; // alphabet size
98}
99
100void calculateAlphabet(const NGHolder &g, array<u16, ALPHABET_SIZE> &alpha,
101 array<u16, ALPHABET_SIZE> &unalpha, u16 *alphasize) {
102 vector<CharReach> esets(1, CharReach::dot());
103
104 for (auto v : vertices_range(g)) {
105 if (is_special(v, g)) {
106 continue;
107 }
108
109 const CharReach &cr = g[v].char_reach;
110
111 for (size_t i = 0; i < esets.size(); i++) {
112 if (esets[i].count() == 1) {
113 continue;
114 }
115
116 CharReach t = cr & esets[i];
117 if (t.any() && t != esets[i]) {
118 esets[i] &= ~t;
119 esets.push_back(t);
120 }
121 }
122 }
123 // for deterministic compiles
124 sort(esets.begin(), esets.end());
125
126 assert(alphasize);
127 *alphasize = buildAlphabetFromEquivSets(esets, alpha, unalpha);
128}
129
130static
131bool allExternalReports(const ReportManager &rm,
132 const flat_set<ReportID> &reports) {
133 for (auto report_id : reports) {
134 if (!isExternalReport(rm.getReport(report_id))) {
135 return false;
136 }
137 }
138
139 return true;
140}
141
142static
143dstate_id_t successor(const vector<dstate> &dstates, dstate_id_t c,
144 const array<u16, ALPHABET_SIZE> &alpha, symbol_t s) {
145 return dstates[c].next[alpha[s]];
146}
147
148void getFullTransitionFromState(const raw_dfa &n, dstate_id_t state,
149 dstate_id_t *out_table) {
150 for (u32 i = 0; i < ALPHABET_SIZE; i++) {
151 out_table[i] = successor(n.states, state, n.alpha_remap, i);
152 }
153}
154
155template<typename stateset>
156static
157void populateInit(const NGHolder &g, const flat_set<NFAVertex> &unused,
158 stateset *init, stateset *init_deep,
159 vector<NFAVertex> *v_by_index) {
160 for (auto v : vertices_range(g)) {
161 if (contains(unused, v)) {
162 continue;
163 }
164
165 u32 vert_id = g[v].index;
166 assert(vert_id < init->size());
167
168 if (is_any_start(v, g)) {
169 init->set(vert_id);
170 if (hasSelfLoop(v, g) || is_triggered(g)) {
171 DEBUG_PRINTF("setting %u\n", vert_id);
172 init_deep->set(vert_id);
173 }
174 }
175 }
176
177 v_by_index->clear();
178 v_by_index->resize(num_vertices(g), NGHolder::null_vertex());
179
180 for (auto v : vertices_range(g)) {
181 u32 vert_id = g[v].index;
182 assert((*v_by_index)[vert_id] == NGHolder::null_vertex());
183 (*v_by_index)[vert_id] = v;
184 }
185
186 if (is_triggered(g)) {
187 *init_deep = *init;
188 }
189}
190
191template<typename StateSet>
192void populateAccepts(const NGHolder &g, const flat_set<NFAVertex> &unused,
193 StateSet *accept, StateSet *acceptEod) {
194 for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
195 if (contains(unused, v)) {
196 continue;
197 }
198 accept->set(g[v].index);
199 }
200 for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) {
201 if (v == g.accept) {
202 continue;
203 }
204 if (contains(unused, v)) {
205 continue;
206 }
207 acceptEod->set(g[v].index);
208 }
209}
210
211static
212bool canPruneEdgesFromAccept(const ReportManager &rm, const NGHolder &g) {
213 bool seen = false;
214 u32 ekey = 0;
215
216 for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
217 if (is_special(v, g)) {
218 continue;
219 }
220
221 for (auto report_id : g[v].reports) {
222 const Report &ir = rm.getReport(report_id);
223
224 if (!isSimpleExhaustible(ir)) {
225 return false;
226 }
227
228 if (!seen) {
229 seen = true;
230 ekey = ir.ekey;
231 } else if (ekey != ir.ekey) {
232 return false;
233 }
234 }
235 }
236
237 /* need to check accept eod does not have any unseen reports as well */
238 for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) {
239 if (is_special(v, g)) {
240 continue;
241 }
242
243 for (auto report_id : g[v].reports) {
244 const Report &ir = rm.getReport(report_id);
245
246 if (!isSimpleExhaustible(ir)) {
247 return false;
248 }
249
250 if (!seen) {
251 seen = true;
252 ekey = ir.ekey;
253 } else if (ekey != ir.ekey) {
254 return false;
255 }
256 }
257 }
258
259 return true;
260}
261
262static
263bool overhangMatchesTrigger(const vector<vector<CharReach> > &all_triggers,
264 vector<CharReach>::const_reverse_iterator itb,
265 vector<CharReach>::const_reverse_iterator ite) {
266 for (const auto &trigger : all_triggers) {
267 vector<CharReach>::const_reverse_iterator it = itb;
268 vector<CharReach>::const_reverse_iterator kt = trigger.rbegin();
269 for (; it != ite && kt != trigger.rend(); ++it, ++kt) {
270 if ((*it & *kt).none()) {
271 /* this trigger does not match the overhang, try next */
272 goto try_next_trigger;
273 }
274 }
275
276 return true;
277 try_next_trigger:;
278 }
279
280 return false; /* no trigger matches the over hang */
281}
282
283static
284bool triggerAllowed(const NGHolder &g, const NFAVertex v,
285 const vector<vector<CharReach> > &all_triggers,
286 const vector<CharReach> &trigger) {
287 flat_set<NFAVertex> curr({v});
288 flat_set<NFAVertex> next;
289
290 for (auto it = trigger.rbegin(); it != trigger.rend(); ++it) {
291 next.clear();
292
293 for (auto u : curr) {
294 assert(u != g.startDs); /* triggered graphs should not use sds */
295 if (u == g.start) {
296 if (overhangMatchesTrigger(all_triggers, it, trigger.rend())) {
297 return true;
298 }
299 continue;
300 }
301
302 if ((g[u].char_reach & *it).none()) {
303 continue;
304 }
305 insert(&next, inv_adjacent_vertices(u, g));
306 }
307
308 if (next.empty()) {
309 return false;
310 }
311
312 next.swap(curr);
313 }
314
315 return true;
316}
317
318void markToppableStarts(const NGHolder &g, const flat_set<NFAVertex> &unused,
319 bool single_trigger,
320 const vector<vector<CharReach>> &triggers,
321 dynamic_bitset<> *out) {
322 if (single_trigger) {
323 return; /* no live states can lead to new states */
324 }
325
326 for (auto v : vertices_range(g)) {
327 if (contains(unused, v)) {
328 continue;
329 }
330 for (const auto &trigger : triggers) {
331 if (triggerAllowed(g, v, triggers, trigger)) {
332 DEBUG_PRINTF("idx %zu is valid location for top\n", g[v].index);
333 out->set(g[v].index);
334 break;
335 }
336 }
337 }
338
339 assert(out->test(g[g.start].index));
340}
341
342namespace {
343
344template<typename Automaton_Traits>
345class Automaton_Base {
346public:
347 using StateSet = typename Automaton_Traits::StateSet;
348 using StateMap = typename Automaton_Traits::StateMap;
349
350 Automaton_Base(const ReportManager *rm_in, const NGHolder &graph_in,
351 bool single_trigger,
352 const vector<vector<CharReach>> &triggers, bool prunable_in)
353 : rm(rm_in), graph(graph_in), numStates(num_vertices(graph)),
354 unused(getRedundantStarts(graph_in)),
355 init(Automaton_Traits::init_states(numStates)),
356 initDS(Automaton_Traits::init_states(numStates)),
357 squash(Automaton_Traits::init_states(numStates)),
358 accept(Automaton_Traits::init_states(numStates)),
359 acceptEod(Automaton_Traits::init_states(numStates)),
360 toppable(Automaton_Traits::init_states(numStates)),
361 dead(Automaton_Traits::init_states(numStates)),
362 prunable(prunable_in) {
363 populateInit(graph, unused, &init, &initDS, &v_by_index);
364 populateAccepts(graph, unused, &accept, &acceptEod);
365
366 start_anchored = DEAD_STATE + 1;
367 if (initDS == init) {
368 start_floating = start_anchored;
369 } else if (initDS.any()) {
370 start_floating = start_anchored + 1;
371 } else {
372 start_floating = DEAD_STATE;
373 }
374
375 calculateAlphabet(graph, alpha, unalpha, &alphasize);
376
377 for (const auto &sq : findSquashers(graph)) {
378 NFAVertex v = sq.first;
379 u32 vert_id = graph[v].index;
380 squash.set(vert_id);
381 squash_mask[vert_id]
382 = Automaton_Traits::copy_states(std::move(sq.second),
383 numStates);
384 }
385
386 cr_by_index = populateCR(graph, v_by_index, alpha);
387 if (is_triggered(graph)) {
388 dynamic_bitset<> temp(numStates);
389 markToppableStarts(graph, unused, single_trigger, triggers,
390 &temp);
391 toppable = Automaton_Traits::copy_states(std::move(temp),
392 numStates);
393 }
394 }
395
396public:
397 void transition(const StateSet &in, StateSet *next) {
398 transition_graph(*this, v_by_index, in, next);
399 }
400
401 const vector<StateSet> initial() {
402 vector<StateSet> rv = {init};
403 if (start_floating != DEAD_STATE && start_floating != start_anchored) {
404 rv.push_back(initDS);
405 }
406 return rv;
407 }
408
409private:
410 void reports_i(const StateSet &in, bool eod, flat_set<ReportID> &rv) {
411 StateSet acc = in & (eod ? acceptEod : accept);
412 for (size_t i = acc.find_first(); i != StateSet::npos;
413 i = acc.find_next(i)) {
414 NFAVertex v = v_by_index[i];
415 DEBUG_PRINTF("marking report\n");
416 const auto &my_reports = graph[v].reports;
417 rv.insert(my_reports.begin(), my_reports.end());
418 }
419 }
420
421public:
422 void reports(const StateSet &in, flat_set<ReportID> &rv) {
423 reports_i(in, false, rv);
424 }
425 void reportsEod(const StateSet &in, flat_set<ReportID> &rv) {
426 reports_i(in, true, rv);
427 }
428
429 bool canPrune(const flat_set<ReportID> &test_reports) const {
430 if (!rm || !prunable || !canPruneEdgesFromAccept(*rm, graph)) {
431 return false;
432 }
433 return allExternalReports(*rm, test_reports);
434 }
435
436private:
437 const ReportManager *rm;
438public:
439 const NGHolder &graph;
440 u32 numStates;
441 const flat_set<NFAVertex> unused;
442 vector<NFAVertex> v_by_index;
443 vector<CharReach> cr_by_index; /* pre alpha'ed */
444 StateSet init;
445 StateSet initDS;
446 StateSet squash; /* states which allow us to mask out other states */
447 StateSet accept;
448 StateSet acceptEod;
449 StateSet toppable; /* states which are allowed to be on when a top arrives,
450 * triggered dfas only */
451 StateSet dead;
452 map<u32, StateSet> squash_mask;
453 bool prunable;
454 array<u16, ALPHABET_SIZE> alpha;
455 array<u16, ALPHABET_SIZE> unalpha;
456 u16 alphasize;
457
458 u16 start_anchored;
459 u16 start_floating;
460};
461
462struct Big_Traits {
463 using StateSet = dynamic_bitset<>;
464 using StateMap = unordered_map<StateSet, dstate_id_t, hash_dynamic_bitset>;
465
466 static StateSet init_states(u32 num) {
467 return StateSet(num);
468 }
469
470 static StateSet copy_states(dynamic_bitset<> in, UNUSED u32 num) {
471 assert(in.size() == num);
472 return in;
473 }
474};
475
476class Automaton_Big : public Automaton_Base<Big_Traits> {
477public:
478 Automaton_Big(const ReportManager *rm_in, const NGHolder &graph_in,
479 bool single_trigger,
480 const vector<vector<CharReach>> &triggers, bool prunable_in)
481 : Automaton_Base(rm_in, graph_in, single_trigger, triggers,
482 prunable_in) {}
483};
484
485struct Graph_Traits {
486 using StateSet = bitfield<NFA_STATE_LIMIT>;
487 using StateMap = unordered_map<StateSet, dstate_id_t>;
488
489 static StateSet init_states(UNUSED u32 num) {
490 assert(num <= NFA_STATE_LIMIT);
491 return StateSet();
492 }
493
494 static StateSet copy_states(const dynamic_bitset<> &in, u32 num) {
495 StateSet out = init_states(num);
496 for (size_t i = in.find_first(); i != in.npos && i < out.size();
497 i = in.find_next(i)) {
498 out.set(i);
499 }
500 return out;
501 }
502};
503
504class Automaton_Graph : public Automaton_Base<Graph_Traits> {
505public:
506 Automaton_Graph(const ReportManager *rm_in, const NGHolder &graph_in,
507 bool single_trigger,
508 const vector<vector<CharReach>> &triggers, bool prunable_in)
509 : Automaton_Base(rm_in, graph_in, single_trigger, triggers,
510 prunable_in) {}
511};
512
513} // namespace
514
515static
516bool startIsRedundant(const NGHolder &g) {
517 set<NFAVertex> start;
518 set<NFAVertex> startDs;
519
520 insert(&start, adjacent_vertices(g.start, g));
521 insert(&startDs, adjacent_vertices(g.startDs, g));
522
523 return start == startDs;
524}
525
526flat_set<NFAVertex> getRedundantStarts(const NGHolder &g) {
527 flat_set<NFAVertex> dead;
528 if (startIsRedundant(g)) {
529 dead.insert(g.start);
530 }
531 if (proper_out_degree(g.startDs, g) == 0) {
532 dead.insert(g.startDs);
533 }
534 return dead;
535}
536
537unique_ptr<raw_dfa> buildMcClellan(const NGHolder &graph,
538 const ReportManager *rm, bool single_trigger,
539 const vector<vector<CharReach>> &triggers,
540 const Grey &grey, bool finalChance) {
541 if (!grey.allowMcClellan) {
542 return nullptr;
543 }
544
545 DEBUG_PRINTF("attempting to build %s mcclellan\n",
546 to_string(graph.kind).c_str());
547 assert(allMatchStatesHaveReports(graph));
548
549 bool prunable = grey.highlanderPruneDFA && has_managed_reports(graph);
550 assert(rm || !has_managed_reports(graph));
551 if (!has_managed_reports(graph)) {
552 rm = nullptr;
553 }
554
555 assert(triggers.empty() == !is_triggered(graph));
556
557 /* We must be getting desperate if it is an outfix, so use the final chance
558 * state limit logic */
559 u32 state_limit
560 = (graph.kind == NFA_OUTFIX || finalChance) ? FINAL_DFA_STATE_LIMIT
561 : DFA_STATE_LIMIT;
562
563 const u32 numStates = num_vertices(graph);
564 DEBUG_PRINTF("determinising nfa with %u vertices\n", numStates);
565
566 if (numStates > FINAL_DFA_STATE_LIMIT) {
567 DEBUG_PRINTF("rejecting nfa as too many vertices\n");
568 return nullptr;
569 }
570
571 auto rdfa = ue2::make_unique<raw_dfa>(graph.kind);
572
573 if (numStates <= NFA_STATE_LIMIT) {
574 /* Fast path. Automaton_Graph uses a bitfield internally to represent
575 * states and is quicker than Automaton_Big. */
576 Automaton_Graph n(rm, graph, single_trigger, triggers, prunable);
577 if (!determinise(n, rdfa->states, state_limit)) {
578 DEBUG_PRINTF("state limit exceeded\n");
579 return nullptr; /* over state limit */
580 }
581
582 rdfa->start_anchored = n.start_anchored;
583 rdfa->start_floating = n.start_floating;
584 rdfa->alpha_size = n.alphasize;
585 rdfa->alpha_remap = n.alpha;
586 } else {
587 /* Slow path. Too many states to use Automaton_Graph. */
588 Automaton_Big n(rm, graph, single_trigger, triggers, prunable);
589 if (!determinise(n, rdfa->states, state_limit)) {
590 DEBUG_PRINTF("state limit exceeded\n");
591 return nullptr; /* over state limit */
592 }
593
594 rdfa->start_anchored = n.start_anchored;
595 rdfa->start_floating = n.start_floating;
596 rdfa->alpha_size = n.alphasize;
597 rdfa->alpha_remap = n.alpha;
598 }
599
600 minimize_hopcroft(*rdfa, grey);
601
602 DEBUG_PRINTF("after determinised into %zu states, building impl dfa "
603 "(a,f) = (%hu,%hu)\n", rdfa->states.size(),
604 rdfa->start_anchored, rdfa->start_floating);
605
606 return rdfa;
607}
608
609unique_ptr<raw_dfa> buildMcClellan(const NGHolder &g, const ReportManager *rm,
610 const Grey &grey) {
611 assert(!is_triggered(g));
612 vector<vector<CharReach>> triggers;
613 return buildMcClellan(g, rm, false, triggers, grey);
614}
615
616} // namespace ue2
617