1/*
2 * Copyright (c) 2015-2017, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/**
30 * \file
31 * \brief Main NFA build code.
32 */
33
34#include "limex_compile.h"
35
36#include "accel.h"
37#include "accelcompile.h"
38#include "grey.h"
39#include "limex_internal.h"
40#include "limex_limits.h"
41#include "nfa_build_util.h"
42#include "nfagraph/ng_dominators.h"
43#include "nfagraph/ng_holder.h"
44#include "nfagraph/ng_limex_accel.h"
45#include "nfagraph/ng_repeat.h"
46#include "nfagraph/ng_squash.h"
47#include "nfagraph/ng_util.h"
48#include "ue2common.h"
49#include "repeatcompile.h"
50#include "util/alloc.h"
51#include "util/bitutils.h"
52#include "util/bytecode_ptr.h"
53#include "util/charreach.h"
54#include "util/compile_context.h"
55#include "util/container.h"
56#include "util/flat_containers.h"
57#include "util/graph.h"
58#include "util/graph_range.h"
59#include "util/graph_small_color_map.h"
60#include "util/order_check.h"
61#include "util/unordered.h"
62#include "util/verify_types.h"
63
64#include <algorithm>
65#include <cassert>
66#include <cstddef>
67#include <cstdlib>
68#include <cstring>
69#include <map>
70#include <set>
71#include <vector>
72
73#include <boost/graph/breadth_first_search.hpp>
74#include <boost/graph/depth_first_search.hpp>
75#include <boost/range/adaptor/map.hpp>
76
77using namespace std;
78using boost::adaptors::map_values;
79
80namespace ue2 {
81
82/**
83 * \brief Special state index value meaning that the vertex will not
84 * participate in an (NFA/DFA/etc) implementation.
85 */
86static constexpr u32 NO_STATE = ~0;
87
88namespace {
89
90struct precalcAccel {
91 precalcAccel() : single_offset(0), double_offset(0) {}
92 CharReach single_cr;
93 u32 single_offset;
94
95 CharReach double_cr;
96 flat_set<pair<u8, u8>> double_lits; /* double-byte accel stop literals */
97 u32 double_offset;
98};
99
100struct limex_accel_info {
101 unordered_set<NFAVertex> accelerable;
102 map<NFAStateSet, precalcAccel> precalc;
103 unordered_map<NFAVertex, flat_set<NFAVertex>> friends;
104 unordered_map<NFAVertex, AccelScheme> accel_map;
105};
106
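// Reindex the given squash map so that its value masks are indexed by state id
// rather than by vertex index; bits for vertices with no state assigned
// (NO_STATE) are dropped.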
107static
108unordered_map<NFAVertex, NFAStateSet>
109reindexByStateId(const unordered_map<NFAVertex, NFAStateSet> &in,
110 const NGHolder &g,
111 const unordered_map<NFAVertex, u32> &state_ids,
112 const u32 num_states) {
113 unordered_map<NFAVertex, NFAStateSet> out;
114 out.reserve(in.size());
115
116 vector<u32> indexToState(num_vertices(g), NO_STATE);
117 for (const auto &m : state_ids) {
118 u32 vert_id = g[m.first].index;
119 assert(vert_id < indexToState.size());
120 indexToState[vert_id] = m.second;
121 }
122
123 for (const auto &m : in) {
124 NFAVertex v = m.first;
125 assert(m.second.size() <= indexToState.size());
126
127 NFAStateSet mask(num_states);
128 for (size_t i = m.second.find_first(); i != m.second.npos;
129 i = m.second.find_next(i)) {
130 u32 state_id = indexToState[i];
131 if (state_id == NO_STATE) {
132 continue;
133 }
134 mask.set(state_id);
135 }
136 out.emplace(v, mask);
137 }
138
139 return out;
140}
141
142struct build_info {
143 build_info(NGHolder &hi,
144 const unordered_map<NFAVertex, u32> &states_in,
145 const vector<BoundedRepeatData> &ri,
146 const unordered_map<NFAVertex, NFAStateSet> &rsmi,
147 const unordered_map<NFAVertex, NFAStateSet> &smi,
148 const map<u32, set<NFAVertex>> &ti, const set<NFAVertex> &zi,
149 bool dai, bool sci, const CompileContext &cci, u32 nsi)
150 : h(hi), state_ids(states_in), repeats(ri), tops(ti), tugs(nsi),
151 zombies(zi), do_accel(dai), stateCompression(sci), cc(cci),
152 num_states(nsi) {
153 for (const auto &br : repeats) {
154 for (auto v : br.tug_triggers) {
155 assert(state_ids.at(v) != NO_STATE);
156 tugs.set(state_ids.at(v));
157 }
158 br_cyclic[br.cyclic] =
159 BoundedRepeatSummary(br.repeatMin, br.repeatMax);
160 }
161
162 // Convert squash maps to be indexed by state index rather than
163 // vertex_index.
164 squashMap = reindexByStateId(smi, h, state_ids, num_states);
165 reportSquashMap = reindexByStateId(rsmi, h, state_ids, num_states);
166 }
167
168 NGHolder &h;
169 const unordered_map<NFAVertex, u32> &state_ids;
170 const vector<BoundedRepeatData> &repeats;
171
172 // Squash maps; state sets are indexed by state_id.
173 unordered_map<NFAVertex, NFAStateSet> reportSquashMap;
174 unordered_map<NFAVertex, NFAStateSet> squashMap;
175
176 const map<u32, set<NFAVertex>> &tops;
177 NFAStateSet tugs;
178 map<NFAVertex, BoundedRepeatSummary> br_cyclic;
179 const set<NFAVertex> &zombies;
180 bool do_accel;
181 bool stateCompression;
182 const CompileContext &cc;
183 u32 num_states;
184 limex_accel_info accel;
185};
186
187#define LAST_LIMEX_NFA LIMEX_NFA_512
188
189// Constants for scoring mechanism
190const int SHIFT_COST = 10; // limex: cost per shift mask
191const int EXCEPTION_COST = 4; // limex: per exception
192
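// NFATraits is specialised per engine type (elsewhere in this file); the code
// below relies on its implNFA_t, exception_t, tableRow_t and
// scratch_state_size members.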
193template<NFAEngineType t> struct NFATraits { };
194
195template<template<NFAEngineType t> class sfunc, typename rv_t, typename arg_t,
196 NFAEngineType lb>
197struct DISPATCH_BY_LIMEX_TYPE_INT {
198 static rv_t doOp(NFAEngineType i, const arg_t &arg) {
199 if (i == lb) {
200 return sfunc<lb>::call(arg);
201 } else {
202 return DISPATCH_BY_LIMEX_TYPE_INT<sfunc, rv_t, arg_t,
203 (NFAEngineType)(lb + 1)>
204 ::doOp(i, arg);
205 }
206 }
207};
208
209template<template<NFAEngineType t> class sfunc, typename rv_t, typename arg_t>
210struct DISPATCH_BY_LIMEX_TYPE_INT<sfunc, rv_t, arg_t,
211 (NFAEngineType)(LAST_LIMEX_NFA + 1)> {
212 // dummy
213 static rv_t doOp(NFAEngineType, const arg_t &) {
214 assert(0);
215 throw std::logic_error("Unreachable");
216 }
217};
218
219#define DISPATCH_BY_LIMEX_TYPE(i, op, arg) \
220 DISPATCH_BY_LIMEX_TYPE_INT<op, decltype(op<(NFAEngineType)0>::call(arg)), \
221 decltype(arg), (NFAEngineType)0>::doOp(i, arg)
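// Example (names are illustrative): DISPATCH_BY_LIMEX_TYPE(type, doBuild, args)
// walks the engine types at compile time and calls doBuild<type>::call(args)
// for the runtime value of 'type'.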
222
223// Given a number of states, find the size of the smallest container NFA it
224// will fit in. We support NFAs of the following sizes: 32, 64, 128, 256, 384,
225// 512.
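// For example (assuming lg2() is floor(log2)): 40 states -> 64, 200 -> 256,
// 300 -> 384, 400 -> 512.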
226size_t findContainerSize(size_t states) {
227 if (states > 256 && states <= 384) {
228 return 384;
229 }
230 return 1ULL << (lg2(states - 1) + 1);
231}
232
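// A transition is "limited" if it moves forward by at most maxshift states and
// both endpoints lie in the same 64-state block, so that it can be implemented
// as a shift within a 64-bit word (the "bollard" below).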
233bool isLimitedTransition(int from, int to, int maxshift) {
234 int diff = to - from;
235
236 // within our shift?
237 if (diff < 0 || diff > maxshift) {
238 return false;
239 }
240
241 // can't jump over a bollard
242 return (from & ~63) == (to & ~63);
243}
244
245// Fill a bit mask
246template<class Mask>
247void maskFill(Mask &m, u8 c) {
248 memset(&m, c, sizeof(m));
249}
250
251// Clear a bit mask.
252template<class Mask>
253void maskClear(Mask &m) {
254 memset(&m, 0, sizeof(m));
255}
256
257template<class Mask>
258u8 *maskGetByte(Mask &m, u32 bit) {
259 assert(bit < sizeof(m)*8);
260 u8 *m8 = (u8 *)&m;
261
262 return m8 + bit/8;
263}
264
265// Set a bit in a mask, starting from the little end.
266template<class Mask>
267void maskSetBit(Mask &m, const unsigned int bit) {
268 u8 *byte = maskGetByte(m, bit);
269 *byte |= 1U << (bit % 8);
270}
271
272template<class Mask>
273void maskSetBits(Mask &m, const NFAStateSet &bits) {
274 for (size_t i = bits.find_first(); i != bits.npos; i = bits.find_next(i)) {
275 maskSetBit(m, i);
276 }
277}
278
279template<class Mask>
280bool isMaskZero(Mask &m) {
281 u8 *m8 = (u8 *)&m;
282 for (u32 i = 0; i < sizeof(m); i++) {
283 if (m8[i]) {
284 return false;
285 }
286 }
287 return true;
288}
289
290// Sets an entire byte in a mask to the given value
291template<class Mask>
292void maskSetByte(Mask &m, const unsigned int idx, const char val) {
293 assert(idx < sizeof(m));
294 char *m8 = (char *)&m;
295 char &byte = m8[idx];
296 byte = val;
297}
298
299// Clear a bit in the mask, starting from the little end.
300template<class Mask>
301void maskClearBit(Mask &m, const u32 bit) {
302 u8 *byte = maskGetByte(m, bit);
303 *byte &= ~(1U << (bit % 8));
304}
305
306/*
307 * Common code: the following code operates on parts of the NFA that are common
308 * to both the (defunct) General and the LimEx models.
309 */
310
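// Build the reachability tables: reach[] holds the distinct sets of states
// whose CharReach includes a given input character, and reachMap[c] maps each
// character c to the index of its set in reach[].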
311static
312void buildReachMapping(const build_info &args, vector<NFAStateSet> &reach,
313 vector<u8> &reachMap) {
314 const NGHolder &h = args.h;
315 const auto &state_ids = args.state_ids;
316
317 // Build a list of vertices with a state index assigned.
318 vector<NFAVertex> verts;
319 verts.reserve(args.num_states);
320 for (auto v : vertices_range(h)) {
321 if (state_ids.at(v) != NO_STATE) {
322 verts.push_back(v);
323 }
324 }
325
326 // Build a mapping from set-of-states -> reachability.
327 map<NFAStateSet, CharReach> mapping;
328 NFAStateSet states(args.num_states);
329 for (size_t i = 0; i < N_CHARS; i++) {
330 states.reset();
331 for (auto v : verts) {
332 const CharReach &cr = h[v].char_reach;
333 if (cr.test(i)) {
334 u32 state_id = state_ids.at(v);
335 states.set(state_id);
336 }
337 }
338 mapping[states].set(i);
339 }
340
341 DEBUG_PRINTF("%zu distinct reachability entries\n", mapping.size());
342 assert(!mapping.empty());
343
344 // Build a vector of distinct reachability entries and a mapping from every
345 // character to one of those entries.
346
347 reach.reserve(mapping.size());
348 reachMap.assign(N_CHARS, 0);
349
350 u8 num = 0;
351 for (auto mi = mapping.begin(), me = mapping.end(); mi != me; ++mi, ++num) {
352 // Reach entry.
353 reach.push_back(mi->first);
354
355 // Character mapping.
356 const CharReach &cr = mi->second;
357 for (size_t i = cr.find_first(); i != CharReach::npos;
358 i = cr.find_next(i)) {
359 reachMap[i] = num;
360 }
361 }
362}
363
364struct AccelBuild {
365 AccelBuild() : v(NGHolder::null_vertex()), state(0), offset(0) {}
366 NFAVertex v;
367 u32 state;
368 u32 offset; // offset correction to apply
369 CharReach stop1; // single-byte accel stop literals
370 flat_set<pair<u8, u8>> stop2; // double-byte accel stop literals
371};
372
373static
374void findStopLiterals(const build_info &bi, NFAVertex v, AccelBuild &build) {
375 u32 state = bi.state_ids.at(v);
376 build.v = v;
377 build.state = state;
378 NFAStateSet ss(bi.num_states);
379 ss.set(state);
380
381 if (!contains(bi.accel.precalc, ss)) {
382 build.stop1 = CharReach::dot();
383 } else {
384 const precalcAccel &precalc = bi.accel.precalc.at(ss);
385 if (precalc.double_lits.empty()) {
386 build.stop1 = precalc.single_cr;
387 build.offset = precalc.single_offset;
388 } else {
389 build.stop1 = precalc.double_cr;
390 build.stop2 = precalc.double_lits;
391 build.offset = precalc.double_offset;
392 }
393 }
394
395#ifdef DEBUG
396 printf("state %u stop1:", state);
397 for (size_t j = build.stop1.find_first(); j != build.stop1.npos;
398 j = build.stop1.find_next(j)) {
399 printf(" 0x%02x", (u32)j);
400 }
401 printf("\n");
402 printf("state %u stop2:", state);
403 for (auto it = build.stop2.begin(); it != build.stop2.end(); ++it) {
404 printf(" 0x%02hhx%02hhx", it->first, it->second);
405 }
406 printf("\n");
407#endif
408}
409
410// Generate all the data we need for at most NFA_MAX_ACCEL_STATES accelerable
411// states.
412static
413void gatherAccelStates(const build_info &bi, vector<AccelBuild> &accelStates) {
414 for (auto v : bi.accel.accelerable) {
415 DEBUG_PRINTF("state %u is accelerable\n", bi.state_ids.at(v));
416 AccelBuild a;
417 findStopLiterals(bi, v, a);
418 accelStates.push_back(a);
419 }
420
421 // AccelStates should be sorted by state number, so that we build our accel
422 // masks correctly.
423 sort(accelStates.begin(), accelStates.end(),
424 [](const AccelBuild &a, const AccelBuild &b) {
425 return a.state < b.state;
426 });
427
428 // Our caller shouldn't have fed us too many accel states.
429 assert(accelStates.size() <= NFA_MAX_ACCEL_STATES);
430 if (accelStates.size() > NFA_MAX_ACCEL_STATES) {
431 accelStates.resize(NFA_MAX_ACCEL_STATES);
432 }
433}
434
435static
436void combineAccel(const AccelBuild &in, AccelBuild &out) {
437 // stop1 and stop2 union
438 out.stop1 |= in.stop1;
439 out.stop2.insert(in.stop2.begin(), in.stop2.end());
440 // offset is maximum of the two
441 out.offset = max(out.offset, in.offset);
442}
443
444static
445void minimiseAccel(AccelBuild &build) {
446 flat_set<pair<u8, u8>> new_stop2;
447 // Any two-byte accels beginning with a one-byte accel should be removed
448 for (const auto &si : build.stop2) {
449 if (!build.stop1.test(si.first)) {
450 new_stop2.insert(si);
451 }
452 }
453 build.stop2 = new_stop2;
454}
455
456struct AccelAuxCmp {
457 explicit AccelAuxCmp(const AccelAux &aux_in) : aux(aux_in) {}
458 bool operator()(const AccelAux &a) const {
459 return !memcmp(&a, &aux, sizeof(AccelAux));
460 }
461private:
462 const AccelAux &aux;
463};
464
465static
466bool allow_wide_accel(NFAVertex v, const NGHolder &g, NFAVertex sds_or_proxy) {
467 return v == sds_or_proxy || edge(g.start, v, g).second;
468}
469
470static
471bool allow_wide_accel(const vector<NFAVertex> &vv, const NGHolder &g,
472 NFAVertex sds_or_proxy) {
473 for (auto v : vv) {
474 if (allow_wide_accel(v, g, sds_or_proxy)) {
475 return true;
476 }
477 }
478
479 return false;
480}
481
482// identify and mark states that we feel are accelerable (for a limex NFA)
483/* Note: leftfix nfas allow accepts to be accelerated */
484static
485void nfaFindAccelSchemes(const NGHolder &g,
486 const map<NFAVertex, BoundedRepeatSummary> &br_cyclic,
487 unordered_map<NFAVertex, AccelScheme> *out) {
488 vector<CharReach> refined_cr = reduced_cr(g, br_cyclic);
489
490 NFAVertex sds_or_proxy = get_sds_or_proxy(g);
491
492 for (auto v : vertices_range(g)) {
493 // We want to skip any vertices that don't lead to at least one other
494 // (self-loops don't count) vertex.
495 if (!has_proper_successor(v, g)) {
496 DEBUG_PRINTF("skipping vertex %zu\n", g[v].index);
497 continue;
498 }
499
500 bool allow_wide = allow_wide_accel(v, g, sds_or_proxy);
501
502 AccelScheme as;
503 if (nfaCheckAccel(g, v, refined_cr, br_cyclic, &as, allow_wide)) {
504 DEBUG_PRINTF("graph vertex %zu is accelerable with offset %u.\n",
505 g[v].index, as.offset);
506 (*out)[v] = as;
507 }
508 }
509}
510
511struct fas_visitor : public boost::default_bfs_visitor {
512 fas_visitor(const unordered_map<NFAVertex, AccelScheme> &am_in,
513 unordered_map<NFAVertex, AccelScheme> *out_in)
514 : accel_map(am_in), out(out_in) {}
515
516 void discover_vertex(NFAVertex v, const NGHolder &) {
517 if (accel_map.find(v) != accel_map.end()) {
518 (*out)[v] = accel_map.find(v)->second;
519 }
520 if (out->size() >= NFA_MAX_ACCEL_STATES) {
521 throw this; /* done */
522 }
523 }
524 const unordered_map<NFAVertex, AccelScheme> &accel_map;
525 unordered_map<NFAVertex, AccelScheme> *out;
526};
527
528static
529void filterAccelStates(NGHolder &g, const map<u32, set<NFAVertex>> &tops,
530 unordered_map<NFAVertex, AccelScheme> *accel_map) {
531 /* We want the NFA_MAX_ACCEL_STATES best acceleration states, everything
532 * else should be ditched. We use a simple BFS to choose accel states near
533 * the start. */
534
535 vector<NFAEdge> tempEdges;
536 for (const auto &vv : tops | map_values) {
537 for (NFAVertex v : vv) {
538 if (!edge(g.start, v, g).second) {
539 tempEdges.push_back(add_edge(g.start, v, g).first);
540 }
541 }
542 }
543
544 // Similarly, connect (start, startDs) if necessary.
545 if (!edge(g.start, g.startDs, g).second) {
546 NFAEdge e = add_edge(g.start, g.startDs, g);
547 tempEdges.push_back(e); // Remove edge later.
548 }
549
550 unordered_map<NFAVertex, AccelScheme> out;
551
552 try {
553 boost::breadth_first_search(g, g.start,
554 visitor(fas_visitor(*accel_map, &out))
555 .color_map(make_small_color_map(g)));
556 } catch (fas_visitor *) {
557 ; /* found max accel_states */
558 }
559
560 remove_edges(tempEdges, g);
561
562 assert(out.size() <= NFA_MAX_ACCEL_STATES);
563 accel_map->swap(out);
564}
565
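// Returns true if clearing any single state from state_set leaves a non-empty
// set (other than the lone startDs/proxy state) for which no accel scheme was
// precalculated; if such a subset failed to build, the full set would too.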
566static
567bool containsBadSubset(const limex_accel_info &accel,
568 const NFAStateSet &state_set, const u32 effective_sds) {
569 NFAStateSet subset(state_set.size());
570 for (size_t j = state_set.find_first(); j != state_set.npos;
571 j = state_set.find_next(j)) {
572 subset = state_set;
573 subset.reset(j);
574
575 if (effective_sds != NO_STATE && subset.count() == 1 &&
576 subset.test(effective_sds)) {
577 continue;
578 }
579
580 if (subset.any() && !contains(accel.precalc, subset)) {
581 return true;
582 }
583 }
584 return false;
585}
586
587static
588bool is_too_wide(const AccelScheme &as) {
589 return as.cr.count() > MAX_MERGED_ACCEL_STOPS;
590}
591
592static
593void fillAccelInfo(build_info &bi) {
594 if (!bi.do_accel) {
595 return;
596 }
597
598 NGHolder &g = bi.h;
599 limex_accel_info &accel = bi.accel;
600 unordered_map<NFAVertex, AccelScheme> &accel_map = accel.accel_map;
601 const map<NFAVertex, BoundedRepeatSummary> &br_cyclic = bi.br_cyclic;
602 const unordered_map<NFAVertex, u32> &state_ids = bi.state_ids;
603 const u32 num_states = bi.num_states;
604
605 nfaFindAccelSchemes(g, br_cyclic, &accel_map);
606 filterAccelStates(g, bi.tops, &accel_map);
607
608 assert(accel_map.size() <= NFA_MAX_ACCEL_STATES);
609
610 vector<CharReach> refined_cr = reduced_cr(g, br_cyclic);
611
612 vector<NFAVertex> astates;
613 for (const auto &m : accel_map) {
614 astates.push_back(m.first);
615 }
616
617 NFAStateSet useful(num_states);
618 NFAStateSet state_set(num_states);
619 vector<NFAVertex> states;
620
621 NFAVertex sds_or_proxy = get_sds_or_proxy(g);
622 const u32 effective_sds = state_ids.at(sds_or_proxy);
623
    /* for each subset of the accel keys we need to find an accel scheme */
625 assert(astates.size() < 32);
626 sort(astates.begin(), astates.end());
627
628 for (u32 i = 1, i_end = 1U << astates.size(); i < i_end; i++) {
629 DEBUG_PRINTF("saving info for accel %u\n", i);
630 states.clear();
631 state_set.reset();
632 for (u32 j = 0, j_end = astates.size(); j < j_end; j++) {
633 if (i & (1U << j)) {
634 NFAVertex v = astates[j];
635 states.push_back(v);
636 state_set.set(state_ids.at(v));
637 }
638 }
639
640 if (containsBadSubset(accel, state_set, effective_sds)) {
641 DEBUG_PRINTF("accel %u has bad subset\n", i);
642 continue; /* if a subset failed to build we would too */
643 }
644
645 const bool allow_wide = allow_wide_accel(states, g, sds_or_proxy);
646
647 AccelScheme as = nfaFindAccel(g, states, refined_cr, br_cyclic,
648 allow_wide, true);
649 if (is_too_wide(as)) {
650 DEBUG_PRINTF("accel %u too wide (%zu, %d)\n", i,
651 as.cr.count(), MAX_MERGED_ACCEL_STOPS);
652 continue;
653 }
654
655 DEBUG_PRINTF("accel %u ok with offset s%u, d%u\n", i, as.offset,
656 as.double_offset);
657
658 precalcAccel &pa = accel.precalc[state_set];
659 pa.single_offset = as.offset;
660 pa.single_cr = as.cr;
661
662 if (as.double_byte.size() != 0) {
663 pa.double_offset = as.double_offset;
664 pa.double_lits = as.double_byte;
665 pa.double_cr = as.double_cr;
666 }
667
668 useful |= state_set;
669 }
670
671 for (const auto &m : accel_map) {
672 NFAVertex v = m.first;
673 const u32 state_id = state_ids.at(v);
674
        /* if we were unable to make a scheme out of the state in any context,
         * there is no point marking it as accelerable */
677 if (!useful.test(state_id)) {
678 continue;
679 }
680
681 u32 offset = 0;
682 state_set.reset();
683 state_set.set(state_id);
684
685 accel.accelerable.insert(v);
686 findAccelFriends(g, v, br_cyclic, offset, &accel.friends[v]);
687 }
688}
689
690/** The AccelAux structure has large alignment specified, and this makes some
691 * compilers do odd things unless we specify a custom allocator. */
692typedef vector<AccelAux, AlignedAllocator<AccelAux, alignof(AccelAux)>>
693 AccelAuxVector;
694
695#define IMPOSSIBLE_ACCEL_MASK (~0U)
696
697static
698u32 getEffectiveAccelStates(const build_info &args,
699 const unordered_map<NFAVertex, NFAVertex> &dom_map,
700 u32 active_accel_mask,
701 const vector<AccelBuild> &accelStates) {
702 /* accelStates is indexed by the acceleration bit index and contains a
703 * reference to the original vertex & state_id */
704
705 /* Cases to consider:
706 *
707 * 1: Accel states a and b are on and b can squash a
708 * --> we can ignore a. This will result in a no longer being accurately
709 * modelled - we may miss escapes turning it off and we may also miss
710 * its successors being activated.
711 *
712 * 2: Accel state b is on but accel state a is off and a is .* and must be
713 * seen before b is reached (and would not be covered by (1))
714 * --> if a is squashable (or may die unexpectedly) we should continue
715 * as is
716 * --> if a is not squashable we can treat this as a+b or as a no accel,
717 * impossible case
718 * --> this case could be extended to handle non dot reaches by
719 * effectively creating something similar to squash masks for the
720 * reverse graph
721 *
722 *
723 * Other cases:
724 *
725 * 3: Accel states a and b are on but have incompatible reaches
726 * --> we should treat this as an impossible case. Actually, this case
727 * is unlikely to arise as we pick states with wide reaches to
728 * accelerate so an empty intersection is unlikely.
729 *
730 * Note: we need to be careful when dealing with accel states corresponding
731 * to bounded repeat cyclics - they may 'turn off' based on a max bound and
     * so we may still need earlier states to be accurately modelled.
733 */
734 const NGHolder &h = args.h;
735
736 /* map from accel_id to mask of accel_ids that it is dominated by */
737 vector<u32> dominated_by(accelStates.size());
738
739 map<NFAVertex, u32> accel_id_map;
740 for (u32 accel_id = 0; accel_id < accelStates.size(); accel_id++) {
741 NFAVertex v = accelStates[accel_id].v;
742 accel_id_map[v] = accel_id;
743 }
744
745 /* Note: we want a slightly less strict defn of dominate as skip edges
746 * prevent .* 'truly' dominating */
747 for (u32 local_accel_mask = active_accel_mask; local_accel_mask; ) {
748 u32 accel_id = findAndClearLSB_32(&local_accel_mask);
749 assert(accel_id < accelStates.size());
750 NFAVertex v = accelStates[accel_id].v;
751 while (contains(dom_map, v) && dom_map.at(v)) {
752 v = dom_map.at(v);
753 if (contains(accel_id_map, v)) {
754 dominated_by[accel_id] |= 1U << accel_id_map[v];
755 }
756 /* TODO: could also look at inv_adj vertices to handle fan-in */
757 for (NFAVertex a : adjacent_vertices_range(v, h)) {
758 if (a == v || !contains(accel_id_map, a)
759 || a == accelStates[accel_id].v /* not likely */) {
760 continue;
761 }
762 if (!is_subset_of(h[v].reports, h[a].reports)) {
763 continue;
764 }
765 auto v_succ = succs(v, h);
766 auto a_succ = succs(a, h);
767 if (is_subset_of(v_succ, a_succ)) {
768 dominated_by[accel_id] |= 1U << accel_id_map[a];
769 }
770 }
771 }
772 }
773
774 u32 may_turn_off = 0; /* BR with max bound, non-dots, squashed, etc */
775 for (u32 local_accel_mask = active_accel_mask; local_accel_mask; ) {
776 u32 accel_id = findAndClearLSB_32(&local_accel_mask);
777 NFAVertex v = accelStates[accel_id].v;
778 u32 state_id = accelStates[accel_id].state;
779 assert(contains(args.accel.accelerable, v));
780 if (!h[v].char_reach.all()) {
781 may_turn_off |= 1U << accel_id;
782 continue;
783 }
784 if (contains(args.br_cyclic, v)
785 && args.br_cyclic.at(v).repeatMax != depth::infinity()) {
786 may_turn_off |= 1U << accel_id;
787 continue;
788 }
789 for (const auto &s_mask : args.squashMap | map_values) {
790 if (!s_mask.test(state_id)) {
791 may_turn_off |= 1U << accel_id;
792 break;
793 }
794 }
795 for (const auto &s_mask : args.reportSquashMap | map_values) {
796 if (!s_mask.test(state_id)) {
797 may_turn_off |= 1U << accel_id;
798 break;
799 }
800 }
801 }
802
803 /* Case 1: */
804 u32 ignored = 0;
805 for (u32 local_accel_mask = active_accel_mask; local_accel_mask; ) {
806 u32 accel_id_b = findAndClearLSB_32(&local_accel_mask);
807 NFAVertex v = accelStates[accel_id_b].v;
808 if (!contains(args.squashMap, v)) {
809 continue;
810 }
811 assert(!contains(args.br_cyclic, v)
812 || args.br_cyclic.at(v).repeatMax == depth::infinity());
813 NFAStateSet squashed = args.squashMap.at(v);
814 squashed.flip(); /* default sense for mask of survivors */
815
816 for (u32 local_accel_mask2 = active_accel_mask; local_accel_mask2; ) {
817 u32 accel_id_a = findAndClearLSB_32(&local_accel_mask2);
818 if (squashed.test(accelStates[accel_id_a].state)) {
819 ignored |= 1U << accel_id_a;
820 }
821 }
822 }
823
824 /* Case 2: */
825 for (u32 local_accel_mask = active_accel_mask; local_accel_mask; ) {
826 u32 accel_id = findAndClearLSB_32(&local_accel_mask);
827
828 u32 stuck_dominators = dominated_by[accel_id] & ~may_turn_off;
829 if ((stuck_dominators & active_accel_mask) != stuck_dominators) {
830 DEBUG_PRINTF("only %08x on, but we require %08x\n",
831 active_accel_mask, stuck_dominators);
832 return IMPOSSIBLE_ACCEL_MASK;
833 }
834 }
835
836 if (ignored) {
837 DEBUG_PRINTF("in %08x, ignoring %08x\n", active_accel_mask, ignored);
838 }
839
840 return active_accel_mask & ~ignored;
841}
842
843static
844void buildAccel(const build_info &args, NFAStateSet &accelMask,
845 NFAStateSet &accelFriendsMask, AccelAuxVector &auxvec,
846 vector<u8> &accelTable) {
847 const limex_accel_info &accel = args.accel;
848
849 // Init, all zeroes.
850 accelMask.resize(args.num_states);
851 accelFriendsMask.resize(args.num_states);
852
853 if (!args.do_accel) {
854 return;
855 }
856
857 vector<AccelBuild> accelStates;
858 gatherAccelStates(args, accelStates);
859
860 if (accelStates.empty()) {
861 DEBUG_PRINTF("no accelerable states\n");
862 return;
863 }
864
865 const auto dom_map = findDominators(args.h);
866
867 // We have 2^n different accel entries, one for each possible
868 // combination of accelerable states.
869 assert(accelStates.size() < 32);
870 const u32 accelCount = 1U << accelStates.size();
871 assert(accelCount <= 256);
872
873 // Set up a unioned AccelBuild for every possible combination of the set
874 // bits in accelStates.
875 vector<AccelBuild> accelOuts(accelCount);
876 vector<u32> effective_accel_set;
877 effective_accel_set.push_back(0); /* empty is effectively empty */
878
879 for (u32 i = 1; i < accelCount; i++) {
880 u32 effective_i = getEffectiveAccelStates(args, dom_map, i,
881 accelStates);
882 effective_accel_set.push_back(effective_i);
883
884 if (effective_i == IMPOSSIBLE_ACCEL_MASK) {
885 DEBUG_PRINTF("this combination of accel states is not possible\n");
886 accelOuts[i].stop1 = CharReach::dot();
887 continue;
888 }
889
890 while (effective_i) {
891 u32 base_accel_state = findAndClearLSB_32(&effective_i);
892 combineAccel(accelStates[base_accel_state], accelOuts[i]);
893 }
894 minimiseAccel(accelOuts[i]);
895 }
896
897 accelTable.resize(accelCount);
898
899 // We dedupe our AccelAux structures here, so that we only write one copy
900 // of each unique accel scheme into the bytecode, using the accelTable as
901 // an index.
902
903 // Start with the NONE case.
904 auxvec.push_back(AccelAux());
905 memset(&auxvec[0], 0, sizeof(AccelAux));
906 auxvec[0].accel_type = ACCEL_NONE; // no states on.
907
908 AccelAux aux;
909 for (u32 i = 1; i < accelCount; i++) {
910 memset(&aux, 0, sizeof(aux));
911
912 NFAStateSet effective_states(args.num_states);
913 u32 effective_i = effective_accel_set[i];
914
915 AccelInfo ainfo;
916 ainfo.double_offset = accelOuts[i].offset;
917 ainfo.double_stop1 = accelOuts[i].stop1;
918 ainfo.double_stop2 = accelOuts[i].stop2;
919
920 if (effective_i != IMPOSSIBLE_ACCEL_MASK) {
921 while (effective_i) {
922 u32 base_accel_id = findAndClearLSB_32(&effective_i);
923 effective_states.set(accelStates[base_accel_id].state);
924 }
925
926 if (contains(accel.precalc, effective_states)) {
927 const auto &precalc = accel.precalc.at(effective_states);
928 ainfo.single_offset = precalc.single_offset;
929 ainfo.single_stops = precalc.single_cr;
930 }
931 }
932
933 buildAccelAux(ainfo, &aux);
934
935 // FIXME: We may want a faster way to find AccelAux structures that
936 // we've already built before.
937 auto it = find_if(auxvec.begin(), auxvec.end(), AccelAuxCmp(aux));
938 if (it == auxvec.end()) {
939 accelTable[i] = verify_u8(auxvec.size());
940 auxvec.push_back(aux);
941 } else {
942 accelTable[i] = verify_u8(it - auxvec.begin());
943 }
944 }
945
946 DEBUG_PRINTF("%zu unique accel schemes (of max %u)\n", auxvec.size(),
947 accelCount);
948
949 // XXX: ACCEL_NONE?
950 for (const auto &as : accelStates) {
951 NFAVertex v = as.v;
952 assert(v && args.state_ids.at(v) == as.state);
953
954 accelMask.set(as.state);
955 accelFriendsMask.set(as.state);
956
957 if (!contains(accel.friends, v)) {
958 continue;
959 }
960 // Add the friends of this state to the friends mask.
961 const flat_set<NFAVertex> &friends = accel.friends.at(v);
962 DEBUG_PRINTF("%u has %zu friends\n", as.state, friends.size());
963 for (auto friend_v : friends) {
964 u32 state_id = args.state_ids.at(friend_v);
965 DEBUG_PRINTF("--> %u\n", state_id);
966 accelFriendsMask.set(state_id);
967 }
968 }
969}
970
971static
972u32 addSquashMask(const build_info &args, const NFAVertex &v,
973 vector<NFAStateSet> &squash) {
974 auto sit = args.reportSquashMap.find(v);
975 if (sit == args.reportSquashMap.end()) {
976 return MO_INVALID_IDX;
977 }
978
979 // This state has a squash mask. Paw through the existing vector to
980 // see if we've already seen it, otherwise add a new one.
981 auto it = find(squash.begin(), squash.end(), sit->second);
982 if (it != squash.end()) {
983 return verify_u32(std::distance(squash.begin(), it));
984 }
985 u32 idx = verify_u32(squash.size());
986 squash.push_back(sit->second);
987 return idx;
988}
989
990using ReportListCache = ue2_unordered_map<vector<ReportID>, u32>;
991
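// Add a report list (terminated by MO_INVALID_IDX) to the global report list,
// reusing an existing identical run found via the cache or a linear search
// before appending a new copy.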
992static
993u32 addReports(const flat_set<ReportID> &r, vector<ReportID> &reports,
994 ReportListCache &reports_cache) {
995 assert(!r.empty());
996
997 vector<ReportID> my_reports(begin(r), end(r));
998 my_reports.push_back(MO_INVALID_IDX); // sentinel
999
1000 auto cache_it = reports_cache.find(my_reports);
1001 if (cache_it != end(reports_cache)) {
1002 u32 offset = cache_it->second;
1003 DEBUG_PRINTF("reusing cached report list at %u\n", offset);
1004 return offset;
1005 }
1006
1007 auto it = search(begin(reports), end(reports), begin(my_reports),
1008 end(my_reports));
1009 if (it != end(reports)) {
1010 u32 offset = verify_u32(std::distance(begin(reports), it));
1011 DEBUG_PRINTF("reusing found report list at %u\n", offset);
1012 return offset;
1013 }
1014
1015 u32 offset = verify_u32(reports.size());
1016 insert(&reports, reports.end(), my_reports);
1017 reports_cache.emplace(move(my_reports), offset);
1018 return offset;
1019}
1020
1021static
1022void buildAcceptsList(const build_info &args, ReportListCache &reports_cache,
1023 vector<NFAVertex> &verts, vector<NFAAccept> &accepts,
1024 vector<ReportID> &reports, vector<NFAStateSet> &squash) {
1025 if (verts.empty()) {
1026 return;
1027 }
1028
1029 DEBUG_PRINTF("building accept lists for %zu states\n", verts.size());
1030
1031 auto cmp_state_id = [&args](NFAVertex a, NFAVertex b) {
1032 u32 a_state = args.state_ids.at(a);
1033 u32 b_state = args.state_ids.at(b);
1034 assert(a_state != b_state || a == b);
1035 return a_state < b_state;
1036 };
1037
1038 sort(begin(verts), end(verts), cmp_state_id);
1039
1040 const NGHolder &h = args.h;
1041 for (const auto &v : verts) {
1042 DEBUG_PRINTF("state=%u, reports: [%s]\n", args.state_ids.at(v),
1043 as_string_list(h[v].reports).c_str());
1044 NFAAccept a;
1045 memset(&a, 0, sizeof(a));
1046 assert(!h[v].reports.empty());
1047 if (h[v].reports.size() == 1) {
1048 a.single_report = 1;
1049 a.reports = *h[v].reports.begin();
1050 } else {
1051 a.single_report = 0;
1052 a.reports = addReports(h[v].reports, reports, reports_cache);
1053 }
1054 a.squash = addSquashMask(args, v, squash);
1055 accepts.push_back(move(a));
1056 }
1057}
1058
1059static
1060void buildAccepts(const build_info &args, ReportListCache &reports_cache,
1061 NFAStateSet &acceptMask, NFAStateSet &acceptEodMask,
1062 vector<NFAAccept> &accepts, vector<NFAAccept> &acceptsEod,
1063 vector<ReportID> &reports, vector<NFAStateSet> &squash) {
1064 const NGHolder &h = args.h;
1065
1066 acceptMask.resize(args.num_states);
1067 acceptEodMask.resize(args.num_states);
1068
1069 vector<NFAVertex> verts_accept, verts_accept_eod;
1070
1071 for (auto v : vertices_range(h)) {
1072 u32 state_id = args.state_ids.at(v);
1073
1074 if (state_id == NO_STATE || !is_match_vertex(v, h)) {
1075 continue;
1076 }
1077
1078 if (edge(v, h.accept, h).second) {
1079 acceptMask.set(state_id);
1080 verts_accept.push_back(v);
1081 } else {
1082 assert(edge(v, h.acceptEod, h).second);
1083 acceptEodMask.set(state_id);
1084 verts_accept_eod.push_back(v);
1085 }
1086 }
1087
1088 buildAcceptsList(args, reports_cache, verts_accept, accepts, reports,
1089 squash);
1090 buildAcceptsList(args, reports_cache, verts_accept_eod, acceptsEod, reports,
1091 squash);
1092}
1093
1094static
1095void buildTopMasks(const build_info &args, vector<NFAStateSet> &topMasks) {
1096 if (args.tops.empty()) {
1097 return; // No tops, probably an outfix NFA.
1098 }
1099
    u32 numMasks = args.tops.rbegin()->first + 1; // largest mask index + 1
1101 DEBUG_PRINTF("we have %u top masks\n", numMasks);
1102
1103 topMasks.assign(numMasks, NFAStateSet(args.num_states)); // all zeroes
1104
1105 for (const auto &m : args.tops) {
1106 u32 mask_idx = m.first;
1107 for (NFAVertex v : m.second) {
1108 u32 state_id = args.state_ids.at(v);
1109 DEBUG_PRINTF("state %u is in top mask %u\n", state_id, mask_idx);
1110
1111 assert(mask_idx < numMasks);
1112 assert(state_id != NO_STATE);
1113
1114 topMasks[mask_idx].set(state_id);
1115 }
1116 }
1117}
1118
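// e.g. 57 states -> ROUNDUP_N(57, 8) / 8 = 8 bytes of state.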
1119static
1120u32 uncompressedStateSize(u32 num_states) {
1121 // Number of bytes required to store all our states.
1122 return ROUNDUP_N(num_states, 8)/8;
1123}
1124
1125static
1126u32 compressedStateSize(const NGHolder &h, const NFAStateSet &maskedStates,
1127 const unordered_map<NFAVertex, u32> &state_ids) {
1128 // Shrink state requirement to enough to fit the compressed largest reach.
1129 vector<u32> allreach(N_CHARS, 0);
1130
1131 for (auto v : vertices_range(h)) {
1132 u32 i = state_ids.at(v);
1133 if (i == NO_STATE || maskedStates.test(i)) {
1134 continue;
1135 }
1136 const CharReach &cr = h[v].char_reach;
1137 for (size_t j = cr.find_first(); j != cr.npos; j = cr.find_next(j)) {
1138 allreach[j]++; // state 'i' can reach character 'j'.
1139 }
1140 }
1141
1142 u32 maxreach = *max_element(allreach.begin(), allreach.end());
1143 DEBUG_PRINTF("max reach is %u\n", maxreach);
1144 return (maxreach + 7) / 8;
1145}
1146
1147static
1148bool hasSquashableInitDs(const build_info &args) {
1149 const NGHolder &h = args.h;
1150
1151 if (args.squashMap.empty()) {
1152 DEBUG_PRINTF("squash map is empty\n");
1153 return false;
1154 }
1155
1156 NFAStateSet initDs(args.num_states);
1157 u32 sds_state = args.state_ids.at(h.startDs);
1158 if (sds_state == NO_STATE) {
1159 DEBUG_PRINTF("no states in initds\n");
1160 return false;
1161 }
1162
1163 initDs.set(sds_state);
1164
1165 /* TODO: simplify */
1166
1167 // Check normal squash map.
1168 for (const auto &m : args.squashMap) {
1169 DEBUG_PRINTF("checking squash mask for state %u\n",
1170 args.state_ids.at(m.first));
1171 NFAStateSet squashed = ~(m.second); // flip mask
1172 assert(squashed.size() == initDs.size());
1173 if (squashed.intersects(initDs)) {
1174 DEBUG_PRINTF("state %u squashes initds states\n",
1175 args.state_ids.at(m.first));
1176 return true;
1177 }
1178 }
1179
1180 // Check report squash map.
1181 for (const auto &m : args.reportSquashMap) {
1182 DEBUG_PRINTF("checking report squash mask for state %u\n",
1183 args.state_ids.at(m.first));
1184 NFAStateSet squashed = ~(m.second); // flip mask
1185 assert(squashed.size() == initDs.size());
1186 if (squashed.intersects(initDs)) {
1187 DEBUG_PRINTF("state %u squashes initds states\n",
1188 args.state_ids.at(m.first));
1189 return true;
1190 }
1191 }
1192
1193 return false;
1194}
1195
1196static
1197bool hasInitDsStates(const NGHolder &h,
1198 const unordered_map<NFAVertex, u32> &state_ids) {
1199 if (state_ids.at(h.startDs) != NO_STATE) {
1200 return true;
1201 }
1202
1203 if (is_triggered(h) && state_ids.at(h.start) != NO_STATE) {
1204 return true;
1205 }
1206
1207 return false;
1208}
1209
1210static
1211void findMaskedCompressionStates(const build_info &args,
1212 NFAStateSet &maskedStates) {
1213 const NGHolder &h = args.h;
1214 if (!generates_callbacks(h)) {
1215 // Rose leftfixes can mask out initds, which is worth doing if it will
1216 // stay on forever (i.e. it's not squashable).
1217 u32 sds_i = args.state_ids.at(h.startDs);
1218 if (sds_i != NO_STATE && !hasSquashableInitDs(args)) {
1219 maskedStates.set(sds_i);
1220 DEBUG_PRINTF("masking out initds state\n");
1221 }
1222 }
1223
1224 // Suffixes and outfixes can mask out leaf states, which should all be
1225 // accepts. Right now we can only do this when there is nothing in initDs,
1226 // as we switch that on unconditionally in the expand call.
1227 if (!inspects_states_for_accepts(h)
1228 && !hasInitDsStates(h, args.state_ids)) {
1229 NFAStateSet nonleaf(args.num_states);
1230 for (const auto &e : edges_range(h)) {
1231 u32 from = args.state_ids.at(source(e, h));
1232 u32 to = args.state_ids.at(target(e, h));
1233 if (from == NO_STATE) {
1234 continue;
1235 }
1236
1237 // We cannot mask out EOD accepts, as they have to perform an
1238 // action after they're switched on that may be delayed until the
1239 // next stream write.
1240 if (to == NO_STATE && target(e, h) != h.acceptEod) {
1241 continue;
1242 }
1243
1244 nonleaf.set(from);
1245 }
1246
1247 for (u32 i = 0; i < args.num_states; i++) {
1248 if (!nonleaf.test(i)) {
1249 maskedStates.set(i);
1250 }
1251 }
1252
1253 DEBUG_PRINTF("masking out %zu leaf states\n", maskedStates.count());
1254 }
1255}
1256
1257/** \brief Sets a given flag in the LimEx structure. */
1258template<class implNFA_t>
1259static
1260void setLimexFlag(implNFA_t *limex, u32 flag) {
1261 assert(flag);
1262 assert((flag & (flag - 1)) == 0);
1263 limex->flags |= flag;
1264}
1265
1266/** \brief Sets a given flag in the NFA structure */
1267static
1268void setNfaFlag(NFA *nfa, u32 flag) {
1269 assert(flag);
1270 assert((flag & (flag - 1)) == 0);
1271 nfa->flags |= flag;
1272}
1273
1274// Some of our NFA types support compressing the state down if we're not using
1275// all of it.
1276template<class implNFA_t>
1277static
1278void findStateSize(const build_info &args, implNFA_t *limex) {
1279 // Nothing is masked off by default.
1280 maskFill(limex->compressMask, 0xff);
1281
1282 u32 sizeUncompressed = uncompressedStateSize(args.num_states);
1283 assert(sizeUncompressed <= sizeof(limex->compressMask));
1284
1285 if (!args.stateCompression) {
1286 DEBUG_PRINTF("compression disabled, uncompressed state size %u\n",
1287 sizeUncompressed);
1288 limex->stateSize = sizeUncompressed;
1289 return;
1290 }
1291
1292 NFAStateSet maskedStates(args.num_states);
1293 findMaskedCompressionStates(args, maskedStates);
1294
1295 u32 sizeCompressed = compressedStateSize(args.h, maskedStates, args.state_ids);
1296 assert(sizeCompressed <= sizeof(limex->compressMask));
1297
1298 DEBUG_PRINTF("compressed=%u, uncompressed=%u\n", sizeCompressed,
1299 sizeUncompressed);
1300
1301 // Must be at least a 10% saving.
1302 if ((sizeCompressed * 100) <= (sizeUncompressed * 90)) {
1303 DEBUG_PRINTF("using compression, state size %u\n",
1304 sizeCompressed);
1305 setLimexFlag(limex, LIMEX_FLAG_COMPRESS_STATE);
1306 limex->stateSize = sizeCompressed;
1307
1308 if (maskedStates.any()) {
1309 DEBUG_PRINTF("masking %zu states\n", maskedStates.count());
1310 setLimexFlag(limex, LIMEX_FLAG_COMPRESS_MASKED);
1311 for (size_t i = maskedStates.find_first(); i != NFAStateSet::npos;
1312 i = maskedStates.find_next(i)) {
1313 maskClearBit(limex->compressMask, i);
1314 }
1315 }
1316 } else {
1317 DEBUG_PRINTF("not using compression, state size %u\n",
1318 sizeUncompressed);
1319 limex->stateSize = sizeUncompressed;
1320 }
1321}
1322
1323/*
1324 * LimEx NFA: code for building NFAs in the Limited+Exceptional model. Most
1325 * transitions are limited, with transitions outside the constraints of our
1326 * shifts taken care of as 'exceptions'. Exceptions are also used to handle
1327 * accepts and squash behaviour.
1328 */
1329
1330/**
1331 * \brief Prototype exception class.
1332 *
1333 * Used to build up the map of exceptions before being converted to real
1334 * NFAException32 (etc) structures.
1335 */
1336struct ExceptionProto {
1337 u32 reports_index = MO_INVALID_IDX;
1338 NFAStateSet succ_states;
1339 NFAStateSet squash_states;
1340 u32 repeat_index = MO_INVALID_IDX;
1341 enum LimExTrigger trigger = LIMEX_TRIGGER_NONE;
1342 enum LimExSquash squash = LIMEX_SQUASH_NONE;
1343
1344 explicit ExceptionProto(u32 num_states)
1345 : succ_states(num_states), squash_states(num_states) {
1346 // Squash states are represented as the set of states to leave on,
1347 // so we start with all-ones.
1348 squash_states.set();
1349 }
1350
1351 bool operator<(const ExceptionProto &b) const {
1352 const ExceptionProto &a = *this;
1353
1354 ORDER_CHECK(reports_index);
1355 ORDER_CHECK(repeat_index);
1356 ORDER_CHECK(trigger);
1357 ORDER_CHECK(squash);
1358 ORDER_CHECK(succ_states);
1359 ORDER_CHECK(squash_states);
1360
1361 return false;
1362 }
1363};
1364
1365static
1366u32 buildExceptionMap(const build_info &args, ReportListCache &reports_cache,
1367 const unordered_set<NFAEdge> &exceptional,
1368 map<ExceptionProto, vector<u32>> &exceptionMap,
1369 vector<ReportID> &reportList) {
1370 const NGHolder &h = args.h;
1371 const u32 num_states = args.num_states;
1372 u32 exceptionCount = 0;
1373
1374 unordered_map<NFAVertex, u32> pos_trigger;
1375 unordered_map<NFAVertex, u32> tug_trigger;
1376
1377 for (u32 i = 0; i < args.repeats.size(); i++) {
1378 const BoundedRepeatData &br = args.repeats[i];
1379 assert(!contains(pos_trigger, br.pos_trigger));
1380 pos_trigger[br.pos_trigger] = i;
1381 for (auto v : br.tug_triggers) {
1382 assert(!contains(tug_trigger, v));
1383 tug_trigger[v] = i;
1384 }
1385 }
1386
1387 for (auto v : vertices_range(h)) {
1388 const u32 i = args.state_ids.at(v);
1389
1390 if (i == NO_STATE) {
1391 continue;
1392 }
1393
1394 bool addMe = false;
1395 ExceptionProto e(num_states);
1396
1397 if (edge(v, h.accept, h).second && generates_callbacks(h)) {
1398 /* if nfa is never used to produce callbacks, no need to mark
1399 * states as exceptional */
1400 const auto &reports = h[v].reports;
1401
1402 DEBUG_PRINTF("state %u is exceptional due to accept "
1403 "(%zu reports)\n", i, reports.size());
1404
1405 if (reports.empty()) {
1406 e.reports_index = MO_INVALID_IDX;
1407 } else {
1408 e.reports_index =
1409 addReports(reports, reportList, reports_cache);
1410 }
1411
1412 // We may be applying a report squash too.
1413 auto mi = args.reportSquashMap.find(v);
1414 if (mi != args.reportSquashMap.end()) {
1415 DEBUG_PRINTF("report squashes states\n");
1416 assert(e.squash_states.size() == mi->second.size());
1417 e.squash_states = mi->second;
1418 e.squash = LIMEX_SQUASH_REPORT;
1419 }
1420
1421 addMe = true;
1422 }
1423
1424 if (contains(pos_trigger, v)) {
1425 u32 repeat_index = pos_trigger[v];
1426 assert(e.trigger == LIMEX_TRIGGER_NONE);
1427 e.trigger = LIMEX_TRIGGER_POS;
1428 e.repeat_index = repeat_index;
1429 DEBUG_PRINTF("state %u has pos trigger for repeat %u\n", i,
1430 repeat_index);
1431 addMe = true;
1432 }
1433
1434 if (contains(tug_trigger, v)) {
1435 u32 repeat_index = tug_trigger[v];
1436 assert(e.trigger == LIMEX_TRIGGER_NONE);
1437 e.trigger = LIMEX_TRIGGER_TUG;
1438 e.repeat_index = repeat_index;
1439
1440 // TUG triggers can squash the preceding cyclic state.
1441 u32 cyclic = args.state_ids.at(args.repeats[repeat_index].cyclic);
1442 e.squash_states.reset(cyclic);
1443 e.squash = LIMEX_SQUASH_TUG;
1444 DEBUG_PRINTF("state %u has tug trigger for repeat %u, can squash "
1445 "state %u\n", i, repeat_index, cyclic);
1446 addMe = true;
1447 }
1448
1449 // are we a non-limited transition?
1450 for (const auto &oe : out_edges_range(v, h)) {
1451 if (contains(exceptional, oe)) {
1452 NFAVertex w = target(oe, h);
1453 u32 w_idx = args.state_ids.at(w);
1454 assert(w_idx != NO_STATE);
1455 e.succ_states.set(w_idx);
1456 DEBUG_PRINTF("exceptional transition %u->%u\n", i, w_idx);
1457 addMe = true;
1458 }
1459 }
1460
1461 // do we lead SOLELY to a squasher state? (we use the successors as
1462 // a proxy for the out-edge here, so there must be only one for us
1463 // to do this safely)
1464 /* The above comment is IMHO bogus and would result in all squashing
1465 * being disabled around stars */
1466 if (e.trigger != LIMEX_TRIGGER_TUG) {
1467 for (auto w : adjacent_vertices_range(v, h)) {
1468 if (w == v) {
1469 continue;
1470 }
1471 u32 j = args.state_ids.at(w);
1472 if (j == NO_STATE) {
1473 continue;
1474 }
1475 DEBUG_PRINTF("we are checking if succ %u is a squasher\n", j);
1476 auto mi = args.squashMap.find(w);
1477 if (mi != args.squashMap.end()) {
1478 DEBUG_PRINTF("squasher edge (%u, %u)\n", i, j);
1479 DEBUG_PRINTF("e.squash_states.size() == %zu, "
1480 "mi->second.size() = %zu\n",
1481 e.squash_states.size(), mi->second.size());
1482 assert(e.squash_states.size() == mi->second.size());
1483 e.squash_states = mi->second;
1484
1485 // NOTE: this might be being combined with the report
1486 // squashing above.
1487
1488 e.squash = LIMEX_SQUASH_CYCLIC;
1489 DEBUG_PRINTF("squashing succ %u (turns off %zu states)\n",
1490 j, mi->second.size() - mi->second.count());
1491 addMe = true;
1492 }
1493 }
1494 }
1495
1496 if (addMe) {
1497 // Add 'e' if it isn't in the map, and push state i on to its list
1498 // of states.
1499 assert(e.succ_states.size() == num_states);
1500 assert(e.squash_states.size() == num_states);
1501 exceptionMap[e].push_back(i);
1502 exceptionCount++;
1503 }
1504 }
1505
1506 DEBUG_PRINTF("%u exceptions found (%zu unique)\n", exceptionCount,
1507 exceptionMap.size());
1508 return exceptionCount;
1509}
1510
1511static
1512u32 depth_to_u32(const depth &d) {
1513 assert(d.is_reachable());
1514 if (d.is_infinite()) {
1515 return REPEAT_INF;
1516 }
1517
1518 u32 d_val = d;
1519 assert(d_val < REPEAT_INF);
1520 return d_val;
1521}
1522
1523static
1524bool isExceptionalTransition(u32 from, u32 to, const build_info &args,
1525 u32 maxShift) {
1526 if (!isLimitedTransition(from, to, maxShift)) {
1527 return true;
1528 }
1529
1530 // All transitions out of a tug trigger are exceptional.
1531 if (args.tugs.test(from)) {
1532 return true;
1533 }
1534 return false;
1535}
1536
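// Find the largest shift distance that a scheme with nShifts distinct shift
// amounts would use: gather the distances of all limited transitions and keep
// the nShifts smallest of them.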
1537static
1538u32 findMaxVarShift(const build_info &args, u32 nShifts) {
1539 const NGHolder &h = args.h;
1540 u32 shiftMask = 0;
1541 for (const auto &e : edges_range(h)) {
1542 u32 from = args.state_ids.at(source(e, h));
1543 u32 to = args.state_ids.at(target(e, h));
1544 if (from == NO_STATE || to == NO_STATE) {
1545 continue;
1546 }
1547 if (!isExceptionalTransition(from, to, args, MAX_SHIFT_AMOUNT)) {
1548 shiftMask |= (1UL << (to - from));
1549 }
1550 }
1551
1552 u32 maxVarShift = 0;
1553 for (u32 shiftCnt = 0; shiftMask != 0 && shiftCnt < nShifts; shiftCnt++) {
1554 maxVarShift = findAndClearLSB_32(&shiftMask);
1555 }
1556
1557 return maxVarShift;
1558}
1559
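// Cost model for a candidate scheme using nShifts shift masks: each shift mask
// costs SHIFT_COST and each state with an exceptional outgoing transition
// costs EXCEPTION_COST; lower scores are better.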
1560static
1561int getLimexScore(const build_info &args, u32 nShifts) {
1562 const NGHolder &h = args.h;
1563 u32 maxVarShift = nShifts;
1564 int score = 0;
1565
1566 score += SHIFT_COST * nShifts;
1567 maxVarShift = findMaxVarShift(args, nShifts);
1568
1569 NFAStateSet exceptionalStates(args.num_states);
1570 for (const auto &e : edges_range(h)) {
1571 u32 from = args.state_ids.at(source(e, h));
1572 u32 to = args.state_ids.at(target(e, h));
1573 if (from == NO_STATE || to == NO_STATE) {
1574 continue;
1575 }
1576 if (isExceptionalTransition(from, to, args, maxVarShift)) {
1577 exceptionalStates.set(from);
1578 }
1579 }
1580 score += EXCEPTION_COST * exceptionalStates.count();
1581 return score;
1582}
1583
// This function finds the best (lowest-scoring) shift scheme.
// Returns the number of shifts for that scheme and, via bestScoreRet, its
// score. Returns zero if no appropriate scheme was found.
1587static
1588u32 findBestNumOfVarShifts(const build_info &args,
1589 int *bestScoreRet = nullptr) {
1590 u32 bestNumOfVarShifts = 0;
1591 int bestScore = INT_MAX;
1592 for (u32 shiftCount = 1; shiftCount <= MAX_SHIFT_COUNT; shiftCount++) {
1593 int score = getLimexScore(args, shiftCount);
1594 if (score < bestScore) {
1595 bestScore = score;
1596 bestNumOfVarShifts = shiftCount;
1597 }
1598 }
1599 if (bestScoreRet != nullptr) {
1600 *bestScoreRet = bestScore;
1601 }
1602 return bestNumOfVarShifts;
1603}
1604
1605static
1606bool cannotDie(const build_info &args, const set<NFAVertex> &tops) {
1607 const auto &h = args.h;
1608
1609 // When this top is activated, all of the vertices in 'tops' are switched
1610 // on. If any of those lead to a graph that cannot die, then this top
1611 // cannot die.
1612
1613 // For each top, we use a depth-first search to traverse the graph from the
1614 // top, looking for a cyclic path consisting of vertices of dot reach. If
    // one exists, then the NFA cannot die after this top is triggered.
1616
1617 auto colour_map = make_small_color_map(h);
1618
1619 struct CycleFound {};
1620 struct CannotDieVisitor : public boost::default_dfs_visitor {
1621 void back_edge(const NFAEdge &e, const NGHolder &g) const {
1622 DEBUG_PRINTF("back-edge %zu,%zu\n", g[source(e, g)].index,
1623 g[target(e, g)].index);
1624 if (g[target(e, g)].char_reach.all()) {
1625 assert(g[source(e, g)].char_reach.all());
1626 throw CycleFound();
1627 }
1628 }
1629 };
1630
1631 try {
1632 for (const auto &top : tops) {
1633 DEBUG_PRINTF("checking top vertex %zu\n", h[top].index);
1634
1635 // Constrain the search to the top vertices and any dot vertices it
1636 // can reach.
1637 auto term_func = [&](NFAVertex v, const NGHolder &g) {
1638 if (v == top) {
1639 return false;
1640 }
1641 if (!g[v].char_reach.all()) {
1642 return true;
1643 }
1644 if (contains(args.br_cyclic, v) &&
1645 args.br_cyclic.at(v).repeatMax != depth::infinity()) {
1646 // Bounded repeat vertices without inf max can be turned
1647 // off.
1648 return true;
1649 }
1650 return false;
1651 };
1652
1653 boost::depth_first_visit(h, top, CannotDieVisitor(), colour_map,
1654 term_func);
1655 }
1656 } catch (const CycleFound &) {
1657 DEBUG_PRINTF("cycle found\n");
1658 return true;
1659 }
1660
1661 return false;
1662}
1663
1664/** \brief True if this NFA cannot ever be in no states at all. */
1665static
1666bool cannotDie(const build_info &args) {
1667 const auto &h = args.h;
1668 const auto &state_ids = args.state_ids;
1669
1670 // If we have a startDs we're actually using, we can't die.
1671 if (state_ids.at(h.startDs) != NO_STATE) {
1672 DEBUG_PRINTF("is using startDs\n");
1673 return true;
1674 }
1675
1676 return all_of_in(args.tops | map_values, [&](const set<NFAVertex> &verts) {
1677 return cannotDie(args, verts);
1678 });
1679}
1680
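// Factory: the engine-size-specific pieces of LimEx construction (masks,
// repeats, exceptions, state sizing), parameterised by engine type via
// NFATraits.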
1681template<NFAEngineType dtype>
1682struct Factory {
1683 // typedefs for readability, for types derived from traits
1684 typedef typename NFATraits<dtype>::exception_t exception_t;
1685 typedef typename NFATraits<dtype>::implNFA_t implNFA_t;
1686 typedef typename NFATraits<dtype>::tableRow_t tableRow_t;
1687
1688 static
1689 void allocState(NFA *nfa, u32 repeatscratchStateSize,
1690 u32 repeatStreamState) {
1691 implNFA_t *limex = (implNFA_t *)getMutableImplNfa(nfa);
1692
1693 // LimEx NFAs now store the following in state:
1694 // 1. state bitvector (always present)
1695 // 2. space associated with repeats
1696 // This function just needs to size these correctly.
1697
1698 u32 stateSize = limex->stateSize;
1699
1700 DEBUG_PRINTF("bitvector=%zu/%u, repeat full=%u, stream=%u\n",
1701 sizeof(limex->init), stateSize, repeatscratchStateSize,
1702 repeatStreamState);
1703
1704 size_t scratchStateSize = NFATraits<dtype>::scratch_state_size;
1705
1706 if (repeatscratchStateSize) {
1707 scratchStateSize
1708 = ROUNDUP_N(scratchStateSize, alignof(RepeatControl));
1709 scratchStateSize += repeatscratchStateSize;
1710 }
1711 size_t streamStateSize = stateSize + repeatStreamState;
1712
1713 nfa->scratchStateSize = verify_u32(scratchStateSize);
1714 nfa->streamStateSize = verify_u32(streamStateSize);
1715 }
1716
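// Size (and record the layout of) one repeat's info block: NFARepeatInfo
// header, RepeatInfo, an optional sparse lookup table for
// REPEAT_SPARSE_OPTIMAL_P, then the tug mask, rounded up to a cache line.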
1717 static
1718 size_t repeatAllocSize(const BoundedRepeatData &br, u32 *tableOffset,
1719 u32 *tugMaskOffset) {
1720 size_t len = sizeof(NFARepeatInfo) + sizeof(RepeatInfo);
1721
1722 // sparse lookup table.
1723 if (br.type == REPEAT_SPARSE_OPTIMAL_P) {
1724 len = ROUNDUP_N(len, alignof(u64a));
1725 *tableOffset = verify_u32(len);
1726 len += sizeof(u64a) * (br.repeatMax + 1);
1727 } else {
1728 *tableOffset = 0;
1729 }
1730
1731 // tug mask.
1732 len = ROUNDUP_N(len, alignof(tableRow_t));
1733 *tugMaskOffset = verify_u32(len);
1734 len += sizeof(tableRow_t);
1735
1736 // to simplify layout.
1737 len = ROUNDUP_CL(len);
1738
1739 return len;
1740 }
1741
    static
    void buildRepeats(const build_info &args,
                      vector<bytecode_ptr<NFARepeatInfo>> &out,
                      u32 *scratchStateSize, u32 *streamState) {
        out.reserve(args.repeats.size());

        u32 repeat_idx = 0;
        for (auto it = args.repeats.begin(), ite = args.repeats.end();
             it != ite; ++it, ++repeat_idx) {
            const BoundedRepeatData &br = *it;
            assert(args.state_ids.at(br.cyclic) != NO_STATE);

            u32 tableOffset, tugMaskOffset;
            size_t len = repeatAllocSize(br, &tableOffset, &tugMaskOffset);
            auto info = make_zeroed_bytecode_ptr<NFARepeatInfo>(len);
            char *info_ptr = (char *)info.get();

            // Collect state space info.
            RepeatStateInfo rsi(br.type, br.repeatMin, br.repeatMax,
                                br.minPeriod);
            u32 streamStateLen = rsi.packedCtrlSize + rsi.stateSize;

            // Fill the NFARepeatInfo structure.
            info->cyclicState = args.state_ids.at(br.cyclic);
            info->ctrlIndex = repeat_idx;
            info->packedCtrlOffset = *streamState;
            info->stateOffset = *streamState + rsi.packedCtrlSize;
            info->stateSize = streamStateLen;
            info->tugMaskOffset = tugMaskOffset;

            // Fill the RepeatInfo structure.
            RepeatInfo *repeat =
                (RepeatInfo *)(info_ptr + sizeof(NFARepeatInfo));
            repeat->type = br.type;
            repeat->repeatMin = depth_to_u32(br.repeatMin);
            repeat->repeatMax = depth_to_u32(br.repeatMax);
            repeat->horizon = rsi.horizon;
            repeat->packedCtrlSize = rsi.packedCtrlSize;
            repeat->stateSize = rsi.stateSize;
            copy_bytes(repeat->packedFieldSizes, rsi.packedFieldSizes);
            repeat->patchCount = rsi.patchCount;
            repeat->patchSize = rsi.patchSize;
            repeat->encodingSize = rsi.encodingSize;
            repeat->patchesOffset = rsi.patchesOffset;

            u32 repeat_len = sizeof(RepeatInfo);
            if (br.type == REPEAT_SPARSE_OPTIMAL_P) {
                repeat_len += sizeof(u64a) * (rsi.patchSize + 1);
            }
            repeat->length = repeat_len;

            // Copy in the sparse lookup table.
            if (br.type == REPEAT_SPARSE_OPTIMAL_P) {
                assert(!rsi.table.empty());
                copy_bytes(info_ptr + tableOffset, rsi.table);
            }

            // Fill the tug mask.
            tableRow_t *tugMask = (tableRow_t *)(info_ptr + tugMaskOffset);
            for (auto v : br.tug_triggers) {
                u32 state_id = args.state_ids.at(v);
                assert(state_id != NO_STATE);
                maskSetBit(*tugMask, state_id);
            }

            assert(streamStateLen);
            *streamState += streamStateLen;
            *scratchStateSize += sizeof(RepeatControl);

            out.emplace_back(std::move(info));
        }
    }

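    /**
     * \brief Write the init, initDS, zombie and repeat-cyclic masks into
     * the LimEx structure, derived from the graph's start states, zombie
     * vertices and bounded repeats.
     */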
    static
    void writeLimexMasks(const build_info &args, implNFA_t *limex) {
        const NGHolder &h = args.h;

        // Init masks.
        u32 s_i = args.state_ids.at(h.start);
        u32 sds_i = args.state_ids.at(h.startDs);

        if (s_i != NO_STATE) {
            maskSetBit(limex->init, s_i);
            if (is_triggered(h)) {
                maskSetBit(limex->initDS, s_i);
            }
        }

        if (sds_i != NO_STATE) {
            maskSetBit(limex->init, sds_i);
            maskSetBit(limex->initDS, sds_i);
        }

        // Zombie mask.
        for (auto v : args.zombies) {
            u32 state_id = args.state_ids.at(v);
            assert(state_id != NO_STATE);
            maskSetBit(limex->zombieMask, state_id);
        }

        // Repeat cyclic mask.
        for (const auto &br : args.repeats) {
            u32 cyclic = args.state_ids.at(br.cyclic);
            assert(cyclic != NO_STATE);
            maskSetBit(limex->repeatCyclicMask, cyclic);
        }
        /* also include tugs in repeat cyclic mask */
        for (size_t i = args.tugs.find_first(); i != args.tugs.npos;
             i = args.tugs.find_next(i)) {
            maskSetBit(limex->repeatCyclicMask, i);
        }
    }

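    /**
     * \brief Populate the shift amounts and per-shift masks used for the
     * limited (non-exceptional) transitions: each edge whose state-id delta
     * is a permitted shift sets its source state's bit in the mask for that
     * shift amount.
     */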
    static
    void writeShiftMasks(const build_info &args, implNFA_t *limex) {
        const NGHolder &h = args.h;
        u32 maxShift = findMaxVarShift(args, limex->shiftCount);
        u32 shiftMask = 0;
        int shiftMaskIdx = 0;

        for (const auto &e : edges_range(h)) {
            u32 from = args.state_ids.at(source(e, h));
            u32 to = args.state_ids.at(target(e, h));
            if (from == NO_STATE || to == NO_STATE) {
                continue;
            }

            // We check for exceptional transitions here, as we don't want tug
            // trigger transitions emitted as limited transitions (even if they
            // could be in this model).
            if (!isExceptionalTransition(from, to, args, maxShift)) {
                u32 shift = to - from;
                if ((shiftMask & (1UL << shift)) == 0UL) {
                    shiftMask |= (1UL << shift);
                    limex->shiftAmount[shiftMaskIdx++] = (u8)shift;
                }
                assert(limex->shiftCount <= MAX_SHIFT_COUNT);
                for (u32 i = 0; i < limex->shiftCount; i++) {
                    if (limex->shiftAmount[i] == (u8)shift) {
                        maskSetBit(limex->shift[i], from);
                        break;
                    }
                }
            }
        }
        if (maxShift && limex->shiftCount > 1) {
            for (u32 i = 0; i < limex->shiftCount; i++) {
                assert(!isMaskZero(limex->shift[i]));
            }
        }
    }

    static
    void findExceptionalTransitions(const build_info &args,
                                    unordered_set<NFAEdge> &exceptional,
                                    u32 maxShift) {
        const NGHolder &h = args.h;

        for (const auto &e : edges_range(h)) {
            u32 from = args.state_ids.at(source(e, h));
            u32 to = args.state_ids.at(target(e, h));
            if (from == NO_STATE || to == NO_STATE) {
                continue;
            }

            if (isExceptionalTransition(from, to, args, maxShift)) {
                exceptional.insert(e);
            }
        }
    }

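    /**
     * \brief Write the exception table: one exception_t per state with an
     * exceptional transition, carrying its successor and squash masks and
     * the offsets of its report list and bounded repeat (if any).
     */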
    static
    void writeExceptions(const map<ExceptionProto, vector<u32>> &exceptionMap,
                         const vector<u32> &repeatOffsets, implNFA_t *limex,
                         const u32 exceptionsOffset,
                         const u32 reportListOffset) {
        DEBUG_PRINTF("exceptionsOffset=%u\n", exceptionsOffset);

        exception_t *etable = (exception_t *)((char *)limex + exceptionsOffset);
        assert(ISALIGNED(etable));

        map<u32, ExceptionProto> exception_by_state;
        for (const auto &m : exceptionMap) {
            const ExceptionProto &proto = m.first;
            const vector<u32> &states = m.second;
            for (u32 i : states) {
                assert(!contains(exception_by_state, i));
                exception_by_state.emplace(i, proto);
            }
        }

        u32 ecount = 0;
        for (const auto &m : exception_by_state) {
            const ExceptionProto &proto = m.second;
            u32 state_id = m.first;
            DEBUG_PRINTF("exception %u, triggered by state %u\n", ecount,
                         state_id);

            // Write the exception entry.
            exception_t &e = etable[ecount];
            maskSetBits(e.squash, proto.squash_states);
            maskSetBits(e.successors, proto.succ_states);
            if (proto.reports_index == MO_INVALID_IDX) {
                e.reports = MO_INVALID_IDX;
            } else {
                e.reports = reportListOffset +
                            proto.reports_index * sizeof(ReportID);
            }
            e.hasSquash = verify_u8(proto.squash);
            e.trigger = verify_u8(proto.trigger);
            u32 repeat_offset = proto.repeat_index == MO_INVALID_IDX
                                    ? MO_INVALID_IDX
                                    : repeatOffsets[proto.repeat_index];
            e.repeatOffset = repeat_offset;

            // Set this state's bit in the exception mask, so that the state
            // that switches this exception on is treated as exceptional.
            maskSetBit(limex->exceptionMask, state_id);

            ecount++;
        }

        limex->exceptionOffset = exceptionsOffset;
        limex->exceptionCount = ecount;
    }

    static
    void writeReachMapping(const vector<NFAStateSet> &reach,
                           const vector<u8> &reachMap, implNFA_t *limex,
                           const u32 reachOffset) {
        DEBUG_PRINTF("reachOffset=%u\n", reachOffset);

        // Reach mapping is inside the LimEx structure.
        copy(reachMap.begin(), reachMap.end(), &limex->reachMap[0]);

        // Reach table is right after the LimEx structure.
        tableRow_t *reachMask = (tableRow_t *)((char *)limex + reachOffset);
        assert(ISALIGNED(reachMask));
        for (size_t i = 0, end = reach.size(); i < end; i++) {
            maskSetBits(reachMask[i], reach[i]);
        }
        limex->reachSize = verify_u32(reach.size());
    }

    static
    void writeTopMasks(const vector<NFAStateSet> &tops, implNFA_t *limex,
                       const u32 topsOffset) {
        DEBUG_PRINTF("topsOffset=%u\n", topsOffset);

        limex->topOffset = topsOffset;
        tableRow_t *topMasks = (tableRow_t *)((char *)limex + topsOffset);
        assert(ISALIGNED(topMasks));

        for (size_t i = 0, end = tops.size(); i < end; i++) {
            maskSetBits(topMasks[i], tops[i]);
        }

        limex->topCount = verify_u32(tops.size());
    }

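    /**
     * \brief Write the PSHUFB permute/compare masks used by the SSSE3
     * acceleration path: for each accelerable state, record which byte of
     * its 128-bit lane holds the state (permute) and the complement of its
     * bit within that byte (compare).
     */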
    static
    void writeAccelSsse3Masks(const NFAStateSet &accelMask, implNFA_t *limex) {
        char *perm_base = (char *)&limex->accelPermute;
        char *comp_base = (char *)&limex->accelCompare;

        u32 num = 0; // index in accel table.
        for (size_t i = accelMask.find_first(); i != accelMask.npos;
             i = accelMask.find_next(i), ++num) {
            u32 state_id = verify_u32(i);
            DEBUG_PRINTF("accel num=%u, state=%u\n", num, state_id);

            // PSHUFB permute and compare masks.
            size_t mask_idx = sizeof(u_128) * (state_id / 128U);
            DEBUG_PRINTF("mask_idx=%zu\n", mask_idx);
            u_128 *perm = (u_128 *)(perm_base + mask_idx);
            u_128 *comp = (u_128 *)(comp_base + mask_idx);
            maskSetByte(*perm, num, ((state_id % 128U) / 8U));
            maskSetByte(*comp, num, ~(1U << (state_id % 8U)));
        }
    }

    static
    void writeAccel(const NFAStateSet &accelMask,
                    const NFAStateSet &accelFriendsMask,
                    const AccelAuxVector &accelAux,
                    const vector<u8> &accelTable, implNFA_t *limex,
                    const u32 accelTableOffset, const u32 accelAuxOffset) {
        DEBUG_PRINTF("accelTableOffset=%u, accelAuxOffset=%u\n",
                     accelTableOffset, accelAuxOffset);

        // Write accel lookup table.
        limex->accelTableOffset = accelTableOffset;
        copy(accelTable.begin(), accelTable.end(),
             (u8 *)((char *)limex + accelTableOffset));

        // Write accel aux structures.
        limex->accelAuxOffset = accelAuxOffset;
        AccelAux *auxTable = (AccelAux *)((char *)limex + accelAuxOffset);
        assert(ISALIGNED(auxTable));
        copy(accelAux.begin(), accelAux.end(), auxTable);

        // Write LimEx structure members.
        limex->accelCount = verify_u32(accelTable.size());
        // FIXME: accelAuxCount is unused?
        limex->accelAuxCount = verify_u32(accelAux.size());

        // Write LimEx masks.
        maskSetBits(limex->accel, accelMask);
        maskSetBits(limex->accel_and_friends, accelFriendsMask);

        // We can use PSHUFB-based shuffles for models >= 128 states. These
        // require some additional masks in the bytecode.
        maskClear(limex->accelCompare);
        maskFill(limex->accelPermute, (char)0x80);
        if (NFATraits<dtype>::maxStates >= 128) {
            writeAccelSsse3Masks(accelMask, limex);
        }
    }

    static
    void writeAccepts(const NFAStateSet &acceptMask,
                      const NFAStateSet &acceptEodMask,
                      const vector<NFAAccept> &accepts,
                      const vector<NFAAccept> &acceptsEod,
                      const vector<NFAStateSet> &squash, implNFA_t *limex,
                      const u32 acceptsOffset, const u32 acceptsEodOffset,
                      const u32 squashOffset, const u32 reportListOffset) {
        char *limex_base = (char *)limex;

        DEBUG_PRINTF("acceptsOffset=%u, acceptsEodOffset=%u, squashOffset=%u\n",
                     acceptsOffset, acceptsEodOffset, squashOffset);

        // LimEx masks (in structure).
        maskSetBits(limex->accept, acceptMask);
        maskSetBits(limex->acceptAtEOD, acceptEodMask);

        // Transforms the indices (report list, squash mask) into offsets
        // relative to the base of the limex.
        auto transform_offset_fn = [&](NFAAccept a) {
            if (!a.single_report) {
                a.reports = reportListOffset + a.reports * sizeof(ReportID);
            }
            a.squash = squashOffset + a.squash * sizeof(tableRow_t);
            return a;
        };

        // Write accept table.
        limex->acceptOffset = acceptsOffset;
        limex->acceptCount = verify_u32(accepts.size());
        DEBUG_PRINTF("NFA has %zu accepts\n", accepts.size());
        NFAAccept *acceptsTable = (NFAAccept *)(limex_base + acceptsOffset);
        assert(ISALIGNED(acceptsTable));
        transform(accepts.begin(), accepts.end(), acceptsTable,
                  transform_offset_fn);

        // Write EOD accept table.
        limex->acceptEodOffset = acceptsEodOffset;
        limex->acceptEodCount = verify_u32(acceptsEod.size());
        DEBUG_PRINTF("NFA has %zu EOD accepts\n", acceptsEod.size());
        NFAAccept *acceptsEodTable = (NFAAccept *)(limex_base + acceptsEodOffset);
        assert(ISALIGNED(acceptsEodTable));
        transform(acceptsEod.begin(), acceptsEod.end(), acceptsEodTable,
                  transform_offset_fn);

        // Write squash mask table.
        limex->squashCount = verify_u32(squash.size());
        limex->squashOffset = squashOffset;
        DEBUG_PRINTF("NFA has %zu report squash masks\n", squash.size());
        tableRow_t *mask = (tableRow_t *)(limex_base + squashOffset);
        assert(ISALIGNED(mask));
        for (size_t i = 0, end = squash.size(); i < end; i++) {
            maskSetBits(mask[i], squash[i]);
        }
    }

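    /**
     * \brief Copy the per-repeat NFARepeatInfo structures into the bytecode
     * and build the offset lookup table mapping repeat index to location.
     */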
    static
    void writeRepeats(const vector<bytecode_ptr<NFARepeatInfo>> &repeats,
                      vector<u32> &repeatOffsets, implNFA_t *limex,
                      const u32 repeatOffsetsOffset, const u32 repeatOffset) {
        const u32 num_repeats = verify_u32(repeats.size());

        DEBUG_PRINTF("repeatOffsetsOffset=%u, repeatOffset=%u\n",
                     repeatOffsetsOffset, repeatOffset);

        repeatOffsets.resize(num_repeats);
        u32 offset = repeatOffset;

        for (u32 i = 0; i < num_repeats; i++) {
            repeatOffsets[i] = offset;
            assert(repeats[i]);
            memcpy((char *)limex + offset, repeats[i].get(), repeats[i].size());
            offset += repeats[i].size();
        }

        // Write repeat offset lookup table.
        assert(ISALIGNED_N((char *)limex + repeatOffsetsOffset, alignof(u32)));
        copy_bytes((char *)limex + repeatOffsetsOffset, repeatOffsets);

        limex->repeatOffset = repeatOffsetsOffset;
        limex->repeatCount = num_repeats;
    }

    static
    void writeReportList(const vector<ReportID> &reports, implNFA_t *limex,
                         const u32 reportListOffset) {
        DEBUG_PRINTF("reportListOffset=%u\n", reportListOffset);
        assert(ISALIGNED_N((char *)limex + reportListOffset,
                           alignof(ReportID)));
        copy_bytes((char *)limex + reportListOffset, reports);
    }

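    /**
     * \brief Assemble the complete LimEx NFA bytecode for this model size:
     * build all component tables (repeats, exceptions, reach, tops, accel,
     * accepts), lay them out after the LimEx header, then write them into a
     * freshly allocated NFA. Returns nullptr if the graph has too many
     * states for this model.
     */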
    static
    bytecode_ptr<NFA> generateNfa(const build_info &args) {
        if (args.num_states > NFATraits<dtype>::maxStates) {
            return nullptr;
        }

        // Build bounded repeat structures.
        vector<bytecode_ptr<NFARepeatInfo>> repeats;
        u32 repeats_full_state = 0;
        u32 repeats_stream_state = 0;
        buildRepeats(args, repeats, &repeats_full_state, &repeats_stream_state);
        size_t repeatSize = 0;
        for (size_t i = 0; i < repeats.size(); i++) {
            repeatSize += repeats[i].size();
        }

        // We track report lists that have already been written into the global
        // list in case we can reuse them.
        ReportListCache reports_cache;

        unordered_set<NFAEdge> exceptional;
        u32 shiftCount = findBestNumOfVarShifts(args);
        assert(shiftCount);
        u32 maxShift = findMaxVarShift(args, shiftCount);
        findExceptionalTransitions(args, exceptional, maxShift);

        map<ExceptionProto, vector<u32>> exceptionMap;
        vector<ReportID> reportList;

        u32 exceptionCount = buildExceptionMap(args, reports_cache, exceptional,
                                               exceptionMap, reportList);

        assert(exceptionCount <= args.num_states);

        // Build reach table and character mapping.
        vector<NFAStateSet> reach;
        vector<u8> reachMap;
        buildReachMapping(args, reach, reachMap);

        // Build top masks.
        vector<NFAStateSet> tops;
        buildTopMasks(args, tops);

        // Build all our accept info.
        NFAStateSet acceptMask, acceptEodMask;
        vector<NFAAccept> accepts, acceptsEod;
        vector<NFAStateSet> squash;
        buildAccepts(args, reports_cache, acceptMask, acceptEodMask, accepts,
                     acceptsEod, reportList, squash);

        // Build all our accel info.
        NFAStateSet accelMask, accelFriendsMask;
        AccelAuxVector accelAux;
        vector<u8> accelTable;
        buildAccel(args, accelMask, accelFriendsMask, accelAux, accelTable);

        // Compute the offsets in the bytecode for this LimEx NFA for all of
        // our structures. First, the NFA and LimEx structures. All other
        // offsets are relative to the start of the LimEx struct, starting with
        // the reach table.
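        // The resulting layout, relative to the LimEx struct, is:
        //
        //   [implNFA_t][reach table][top masks][accel table][accel aux]
        //   [accepts][EOD accepts][squash masks][exceptions][report list]
        //   [repeat offsets][repeat structures]
        //
        // with alignment/cacheline padding inserted between regions as
        // required below.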
        u32 offset = sizeof(implNFA_t);

        const u32 reachOffset = offset;
        offset += sizeof(tableRow_t) * reach.size();

        const u32 topsOffset = offset;
        offset += sizeof(tableRow_t) * tops.size();

        const u32 accelTableOffset = offset;
        offset += sizeof(u8) * accelTable.size();

        offset = ROUNDUP_N(offset, alignof(AccelAux));
        const u32 accelAuxOffset = offset;
        offset += sizeof(AccelAux) * accelAux.size();

        offset = ROUNDUP_N(offset, alignof(NFAAccept));
        const u32 acceptsOffset = offset;
        offset += sizeof(NFAAccept) * accepts.size();
        const u32 acceptsEodOffset = offset;
        offset += sizeof(NFAAccept) * acceptsEod.size();

        offset = ROUNDUP_CL(offset);
        const u32 squashOffset = offset;
        offset += sizeof(tableRow_t) * squash.size();

        offset = ROUNDUP_CL(offset);
        const u32 exceptionsOffset = offset;
        offset += sizeof(exception_t) * exceptionCount;

        const u32 reportListOffset = offset;
        offset += sizeof(ReportID) * reportList.size();

        const u32 repeatOffsetsOffset = offset;
        offset += sizeof(u32) * args.repeats.size();

        offset = ROUNDUP_CL(offset);
        const u32 repeatsOffset = offset;
        offset += repeatSize;

        // Now we can allocate space for the NFA and get to work on layout.

        size_t nfaSize = sizeof(NFA) + offset;
        DEBUG_PRINTF("nfa size %zu\n", nfaSize);
        auto nfa = make_zeroed_bytecode_ptr<NFA>(nfaSize);
        assert(nfa); // otherwise we would have thrown std::bad_alloc

        implNFA_t *limex = (implNFA_t *)getMutableImplNfa(nfa.get());
        assert(ISALIGNED(limex));

        writeReachMapping(reach, reachMap, limex, reachOffset);

        writeTopMasks(tops, limex, topsOffset);

        writeAccel(accelMask, accelFriendsMask, accelAux, accelTable,
                   limex, accelTableOffset, accelAuxOffset);

        writeAccepts(acceptMask, acceptEodMask, accepts, acceptsEod, squash,
                     limex, acceptsOffset, acceptsEodOffset, squashOffset,
                     reportListOffset);

        limex->shiftCount = shiftCount;
        writeShiftMasks(args, limex);

        if (cannotDie(args)) {
            DEBUG_PRINTF("nfa cannot die\n");
            setLimexFlag(limex, LIMEX_FLAG_CANNOT_DIE);
        }

        // Determine the state required for our state vector.
        findStateSize(args, limex);

        writeReportList(reportList, limex, reportListOffset);

        // Repeat structures and offset table.
        vector<u32> repeatOffsets;
        writeRepeats(repeats, repeatOffsets, limex, repeatOffsetsOffset,
                     repeatsOffset);

        writeExceptions(exceptionMap, repeatOffsets, limex, exceptionsOffset,
                        reportListOffset);

        writeLimexMasks(args, limex);

        allocState(nfa.get(), repeats_full_state, repeats_stream_state);

        nfa->type = dtype;
        nfa->length = verify_u32(nfaSize);
        nfa->nPositions = args.num_states;

        if (!args.zombies.empty()) {
            setNfaFlag(nfa.get(), NFA_ZOMBIE);
        }
        if (!acceptsEod.empty()) {
            setNfaFlag(nfa.get(), NFA_ACCEPTS_EOD);
        }

        return nfa;
    }

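    /**
     * \brief Score this model size for the given graph: reject it (return
     * -1) if the required container size does not match, otherwise rate it
     * by the number of exceptions and variable shifts it would need (lower
     * is better).
     */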
    static int score(const build_info &args) {
        // LimEx NFAs are available in sizes from 32 to 512 bits.
        size_t num_states = args.num_states;

        size_t sz = findContainerSize(num_states);
        if (sz < 32) {
            sz = 32;
        }

        if (args.cc.grey.nfaForceSize) {
            sz = args.cc.grey.nfaForceSize;
        }

        if (sz != NFATraits<dtype>::maxStates) {
            return -1; // fail, size not appropriate
        }

        // We are of the right size, calculate a score based on the number
        // of exceptions and the number of shifts used by this LimEx.
        int score;
        u32 shiftCount = findBestNumOfVarShifts(args, &score);
        if (shiftCount == 0) {
            return -1;
        }
        return score;
    }
};

template<NFAEngineType dtype>
struct generateNfa {
    static bytecode_ptr<NFA> call(const build_info &args) {
        return Factory<dtype>::generateNfa(args);
    }
};

template<NFAEngineType dtype>
struct scoreNfa {
    static int call(const build_info &args) {
        return Factory<dtype>::score(args);
    }
};

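/*
 * Per-model traits: for each LimEx engine size, the implementation NFA
 * structure, the row type used for masks and tables, the exception record
 * type, the maximum number of states and the scratch state size.
 */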
#define MAKE_LIMEX_TRAITS(mlt_size)                                         \
    template<> struct NFATraits<LIMEX_NFA_##mlt_size> {                     \
        typedef LimExNFA##mlt_size implNFA_t;                               \
        typedef u_##mlt_size tableRow_t;                                    \
        typedef NFAException##mlt_size exception_t;                         \
        static const size_t maxStates = mlt_size;                           \
        static const size_t scratch_state_size = mlt_size == 64             \
                                                     ? sizeof(m128)         \
                                                     : sizeof(tableRow_t);  \
    };

MAKE_LIMEX_TRAITS(32)
MAKE_LIMEX_TRAITS(64)
MAKE_LIMEX_TRAITS(128)
MAKE_LIMEX_TRAITS(256)
MAKE_LIMEX_TRAITS(384)
MAKE_LIMEX_TRAITS(512)

} // namespace

#ifndef NDEBUG
// Some sanity tests, called by an assertion in generate().
static UNUSED
bool isSane(const NGHolder &h, const map<u32, set<NFAVertex>> &tops,
            const unordered_map<NFAVertex, u32> &state_ids,
            u32 num_states) {
    unordered_set<u32> seen;
    unordered_set<NFAVertex> top_starts;
    for (const auto &vv : tops | map_values) {
        insert(&top_starts, vv);
    }

    for (auto v : vertices_range(h)) {
        if (!contains(state_ids, v)) {
            DEBUG_PRINTF("no entry for vertex %zu in state map\n", h[v].index);
            return false;
        }
        const u32 i = state_ids.at(v);
        if (i == NO_STATE) {
            continue;
        }

        DEBUG_PRINTF("checking vertex %zu (state %u)\n", h[v].index, i);

        if (i >= num_states || contains(seen, i)) {
            DEBUG_PRINTF("vertex %u/%u has invalid state\n", i, num_states);
            return false;
        }
        seen.insert(i);

        // All our states should be reachable and have a state assigned.
        if (h[v].char_reach.none()) {
            DEBUG_PRINTF("vertex %zu has empty reachability\n", h[v].index);
            return false;
        }

        // Every state that isn't a start state (or top, in triggered NFAs)
        // must have at least one predecessor that is not itself.
        if (v != h.start && v != h.startDs && !contains(top_starts, v)
            && !proper_in_degree(v, h)) {
            DEBUG_PRINTF("vertex %zu has no pred\n", h[v].index);
            return false;
        }
    }

    if (seen.size() != num_states) {
        return false;
    }

    return true;
}
#endif // NDEBUG

static
u32 max_state(const unordered_map<NFAVertex, u32> &state_ids) {
    u32 rv = 0;
    for (const auto &m : state_ids) {
        DEBUG_PRINTF("state %u\n", m.second);
        if (m.second != NO_STATE) {
            rv = max(m.second, rv);
        }
    }
    DEBUG_PRINTF("max %u\n", rv);
    return rv;
}

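/**
 * \brief Build a LimEx NFA implementation for the given graph, trying the
 * candidate engine sizes in order of their score (best first), or only the
 * type given by \p hint. Returns nullptr if no suitable LimEx model can be
 * built.
 */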
bytecode_ptr<NFA> generate(NGHolder &h,
                           const unordered_map<NFAVertex, u32> &states,
                           const vector<BoundedRepeatData> &repeats,
                           const unordered_map<NFAVertex, NFAStateSet> &reportSquashMap,
                           const unordered_map<NFAVertex, NFAStateSet> &squashMap,
                           const map<u32, set<NFAVertex>> &tops,
                           const set<NFAVertex> &zombies, bool do_accel,
                           bool stateCompression, u32 hint,
                           const CompileContext &cc) {
    const u32 num_states = max_state(states) + 1;
    DEBUG_PRINTF("total states: %u\n", num_states);

    if (!cc.grey.allowLimExNFA) {
        DEBUG_PRINTF("limex not allowed\n");
        return nullptr;
    }

    // If you ask for a particular type, it had better be an NFA.
    assert(hint == INVALID_NFA || hint <= LAST_LIMEX_NFA);
    DEBUG_PRINTF("hint=%u\n", hint);

    // Sanity check the input data.
    assert(isSane(h, tops, states, num_states));

    // Build arguments used in the rest of this file.
    build_info arg(h, states, repeats, reportSquashMap, squashMap, tops,
                   zombies, do_accel, stateCompression, cc, num_states);

    // Acceleration analysis.
    fillAccelInfo(arg);

    vector<pair<int, NFAEngineType>> scores;

    if (hint != INVALID_NFA) {
        // The caller has told us what to (attempt to) build.
        scores.emplace_back(0, (NFAEngineType)hint);
    } else {
        for (size_t i = 0; i <= LAST_LIMEX_NFA; i++) {
            NFAEngineType ntype = (NFAEngineType)i;
            int score = DISPATCH_BY_LIMEX_TYPE(ntype, scoreNfa, arg);
            if (score >= 0) {
                DEBUG_PRINTF("%s scores %d\n", nfa_type_name(ntype), score);
                scores.emplace_back(score, ntype);
            }
        }
    }

    if (scores.empty()) {
        DEBUG_PRINTF("No NFA returned a valid score for this case.\n");
        return nullptr;
    }

    // Sort acceptable models in priority order, lowest score first.
    sort(scores.begin(), scores.end());

    for (const auto &elem : scores) {
        assert(elem.first >= 0);
        NFAEngineType limex_model = elem.second;
        auto nfa = DISPATCH_BY_LIMEX_TYPE(limex_model, generateNfa, arg);
        if (nfa) {
            DEBUG_PRINTF("successful build with NFA engine: %s\n",
                         nfa_type_name(limex_model));
            return nfa;
        }
    }

    DEBUG_PRINTF("NFA build failed.\n");
    return nullptr;
}

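/**
 * \brief Run acceleration analysis on the graph and return the number of
 * states that would be accelerable in a LimEx implementation, without
 * building the NFA.
 */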
u32 countAccelStates(NGHolder &h,
                     const unordered_map<NFAVertex, u32> &states,
                     const vector<BoundedRepeatData> &repeats,
                     const unordered_map<NFAVertex, NFAStateSet> &reportSquashMap,
                     const unordered_map<NFAVertex, NFAStateSet> &squashMap,
                     const map<u32, set<NFAVertex>> &tops,
                     const set<NFAVertex> &zombies,
                     const CompileContext &cc) {
    const u32 num_states = max_state(states) + 1;
    DEBUG_PRINTF("total states: %u\n", num_states);

    if (!cc.grey.allowLimExNFA) {
        DEBUG_PRINTF("limex not allowed\n");
        return 0;
    }

    // Sanity check the input data.
    assert(isSane(h, tops, states, num_states));

    const bool do_accel = true;
    const bool state_compression = false;

    // Build arguments used in the rest of this file.
    build_info bi(h, states, repeats, reportSquashMap, squashMap, tops, zombies,
                  do_accel, state_compression, cc, num_states);

    // Acceleration analysis.
    nfaFindAccelSchemes(bi.h, bi.br_cyclic, &bi.accel.accel_map);

    u32 num_accel = verify_u32(bi.accel.accel_map.size());
    DEBUG_PRINTF("found %u accel states\n", num_accel);
    return num_accel;
}

} // namespace ue2