1/*
2 * Copyright (c) 2015-2017, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include "rose_build_anchored.h"
30
31#include "grey.h"
32#include "rose_build_impl.h"
33#include "rose_build_matchers.h"
34#include "rose_internal.h"
35#include "ue2common.h"
36#include "nfa/dfa_min.h"
37#include "nfa/mcclellancompile.h"
38#include "nfa/mcclellancompile_util.h"
39#include "nfa/nfa_build_util.h"
40#include "nfa/rdfa_merge.h"
41#include "nfagraph/ng_holder.h"
42#include "nfagraph/ng_repeat.h"
43#include "nfagraph/ng_util.h"
44#include "nfagraph/ng_mcclellan_internal.h"
45#include "util/alloc.h"
46#include "util/bitfield.h"
47#include "util/charreach.h"
48#include "util/compile_context.h"
49#include "util/compile_error.h"
50#include "util/container.h"
51#include "util/determinise.h"
52#include "util/flat_containers.h"
53#include "util/graph_range.h"
54#include "util/make_unique.h"
55#include "util/order_check.h"
56#include "util/ue2string.h"
57#include "util/unordered.h"
58#include "util/verify_types.h"
59
60#include <map>
61#include <queue>
62#include <set>
63#include <vector>
64
65using namespace std;
66
67namespace ue2 {
68
69#define ANCHORED_NFA_STATE_LIMIT 512
70#define MAX_DFA_STATES 16000
71#define DFA_PAIR_MERGE_THRESHOLD 5000
72#define MAX_SMALL_START_REACH 4
73
74#define INIT_STATE (DEAD_STATE + 1)
75
76#define NO_FRAG_ID (~0U)
77
78// Adds a vertex with the given reach.
79static
80NFAVertex add_vertex(NGHolder &h, const CharReach &cr) {
81 NFAVertex v = add_vertex(h);
82 h[v].char_reach = cr;
83 return v;
84}
85
86static
87void add_edges(const set<NFAVertex> &parents, NFAVertex v, NGHolder &h) {
88 for (auto p : parents) {
89 add_edge(p, v, h);
90 }
91}
92
93static
94set<NFAVertex> addDotsToGraph(NGHolder &h, NFAVertex start, u32 min, u32 max,
95 const CharReach &cr) {
96 DEBUG_PRINTF("adding [%u, %u] to graph\n", min, max);
97 u32 i = 0;
98 set<NFAVertex> curr;
99 curr.insert(start);
100 for (; i < min; i++) {
101 NFAVertex next = add_vertex(h, cr);
102 add_edges(curr, next, h);
103 curr.clear();
104 curr.insert(next);
105 }
106
107 assert(max != ROSE_BOUND_INF);
108
109 set<NFAVertex> orig = curr;
110 for (; i < max; i++) {
111 NFAVertex next = add_vertex(h, cr);
112 add_edges(curr, next, h);
113 curr.clear();
114 curr.insert(next);
115 curr.insert(orig.begin(), orig.end());
116 }
117
118 return curr;
119}
120
121static
122NFAVertex addToGraph(NGHolder &h, const set<NFAVertex> &curr,
123 const ue2_literal &s) {
124 DEBUG_PRINTF("adding %s to graph\n", dumpString(s).c_str());
125 assert(!s.empty());
126
127 ue2_literal::const_iterator it = s.begin();
128 NFAVertex u = add_vertex(h, *it);
129 add_edges(curr, u, h);
130
131 for (++it; it != s.end(); ++it) {
132 NFAVertex next = add_vertex(h, *it);
133 add_edge(u, next, h);
134 u = next;
135 }
136
137 return u;
138}
139
140static
141void mergeAnchoredDfas(vector<unique_ptr<raw_dfa>> &dfas,
142 const RoseBuildImpl &build) {
143 // First, group our DFAs into "small start" and "big start" sets.
144 vector<unique_ptr<raw_dfa>> small_starts, big_starts;
145 for (auto &rdfa : dfas) {
146 u32 start_size = mcclellanStartReachSize(rdfa.get());
147 if (start_size <= MAX_SMALL_START_REACH) {
148 small_starts.push_back(move(rdfa));
149 } else {
150 big_starts.push_back(move(rdfa));
151 }
152 }
153 dfas.clear();
154
155 DEBUG_PRINTF("%zu dfas with small starts, %zu dfas with big starts\n",
156 small_starts.size(), big_starts.size());
157 mergeDfas(small_starts, MAX_DFA_STATES, nullptr, build.cc.grey);
158 mergeDfas(big_starts, MAX_DFA_STATES, nullptr, build.cc.grey);
159
160 // Rehome our groups into one vector.
161 for (auto &rdfa : small_starts) {
162 dfas.push_back(move(rdfa));
163 }
164 for (auto &rdfa : big_starts) {
165 dfas.push_back(move(rdfa));
166 }
167
168 // Final test: if we've built two DFAs here that are small enough, we can
169 // try to merge them.
170 if (dfas.size() == 2) {
171 size_t total_states = dfas[0]->states.size() + dfas[1]->states.size();
172 if (total_states < DFA_PAIR_MERGE_THRESHOLD) {
173 DEBUG_PRINTF("doing small pair merge\n");
174 mergeDfas(dfas, MAX_DFA_STATES, nullptr, build.cc.grey);
175 }
176 }
177}
178
179static
180void remapAnchoredReports(raw_dfa &rdfa, const vector<u32> &frag_map) {
181 for (dstate &ds : rdfa.states) {
182 assert(ds.reports_eod.empty()); // Not used in anchored matcher.
183 if (ds.reports.empty()) {
184 continue;
185 }
186
187 flat_set<ReportID> new_reports;
188 for (auto id : ds.reports) {
189 assert(id < frag_map.size());
190 new_reports.insert(frag_map[id]);
191 }
192 ds.reports = std::move(new_reports);
193 }
194}
195
196/**
197 * \brief Replaces the report ids currently in the dfas (rose graph literal
198 * ids) with the fragment id for each literal.
199 */
200static
201void remapAnchoredReports(RoseBuildImpl &build, const vector<u32> &frag_map) {
202 for (auto &m : build.anchored_nfas) {
203 for (auto &rdfa : m.second) {
204 assert(rdfa);
205 remapAnchoredReports(*rdfa, frag_map);
206 }
207 }
208}
209
210/**
211 * Returns mapping from literal ids to fragment ids.
212 */
213static
214vector<u32> reverseFragMap(const RoseBuildImpl &build,
215 const vector<LitFragment> &fragments) {
216 vector<u32> rev(build.literal_info.size(), NO_FRAG_ID);
217 for (const auto &f : fragments) {
218 for (u32 lit_id : f.lit_ids) {
219 assert(lit_id < rev.size());
220 rev[lit_id] = f.fragment_id;
221 }
222 }
223 return rev;
224}
225
226/**
227 * \brief Replace the reports (which are literal final_ids) in the given
228 * raw_dfa with program offsets.
229 */
230static
231void remapIdsToPrograms(const vector<LitFragment> &fragments, raw_dfa &rdfa) {
232 for (dstate &ds : rdfa.states) {
233 assert(ds.reports_eod.empty()); // Not used in anchored matcher.
234 if (ds.reports.empty()) {
235 continue;
236 }
237
238 flat_set<ReportID> new_reports;
239 for (auto fragment_id : ds.reports) {
240 const auto &frag = fragments.at(fragment_id);
241 new_reports.insert(frag.lit_program_offset);
242 }
243 ds.reports = std::move(new_reports);
244 }
245}
246
247static
248unique_ptr<NGHolder> populate_holder(const simple_anchored_info &sai,
249 const flat_set<u32> &exit_ids) {
250 DEBUG_PRINTF("populating holder for ^.{%u,%u}%s\n", sai.min_bound,
251 sai.max_bound, dumpString(sai.literal).c_str());
252 auto h_ptr = make_unique<NGHolder>();
253 NGHolder &h = *h_ptr;
254 auto ends = addDotsToGraph(h, h.start, sai.min_bound, sai.max_bound,
255 CharReach::dot());
256 NFAVertex v = addToGraph(h, ends, sai.literal);
257 add_edge(v, h.accept, h);
258 h[v].reports.insert(exit_ids.begin(), exit_ids.end());
259 return h_ptr;
260}
261
262u32 anchoredStateSize(const anchored_matcher_info &atable) {
263 const struct anchored_matcher_info *curr = &atable;
264
265 // Walk the list until we find the last element; total state size will be
266 // that engine's state offset plus its state requirement.
267 while (curr->next_offset) {
268 curr = (const anchored_matcher_info *)
269 ((const char *)curr + curr->next_offset);
270 }
271
272 const NFA *nfa = (const NFA *)((const char *)curr + sizeof(*curr));
273 return curr->state_offset + nfa->streamStateSize;
274}
275
276namespace {
277
278using nfa_state_set = bitfield<ANCHORED_NFA_STATE_LIMIT>;
279
280struct Holder_StateSet {
281 Holder_StateSet() : wdelay(0) {}
282
283 nfa_state_set wrap_state;
284 u32 wdelay;
285
286 bool operator==(const Holder_StateSet &b) const {
287 return wdelay == b.wdelay && wrap_state == b.wrap_state;
288 }
289
290 size_t hash() const {
291 return hash_all(wrap_state, wdelay);
292 }
293};
294
295class Automaton_Holder {
296public:
297 using StateSet = Holder_StateSet;
298 using StateMap = ue2_unordered_map<StateSet, dstate_id_t>;
299
300 explicit Automaton_Holder(const NGHolder &g_in) : g(g_in) {
301 for (auto v : vertices_range(g)) {
302 vertexToIndex[v] = indexToVertex.size();
303 indexToVertex.push_back(v);
304 }
305
306 assert(indexToVertex.size() <= ANCHORED_NFA_STATE_LIMIT);
307
308 DEBUG_PRINTF("%zu states\n", indexToVertex.size());
309 init.wdelay = 0;
310 init.wrap_state.set(vertexToIndex[g.start]);
311
312 DEBUG_PRINTF("init wdelay %u\n", init.wdelay);
313
314 calculateAlphabet();
315 cr_by_index = populateCR(g, indexToVertex, alpha);
316 }
317
318private:
319 void calculateAlphabet() {
320 vector<CharReach> esets(1, CharReach::dot());
321
322 for (auto v : indexToVertex) {
323 const CharReach &cr = g[v].char_reach;
324
325 for (size_t i = 0; i < esets.size(); i++) {
326 if (esets[i].count() == 1) {
327 continue;
328 }
329
330 CharReach t = cr & esets[i];
331
332 if (t.any() && t != esets[i]) {
333 esets[i] &= ~t;
334 esets.push_back(t);
335 }
336 }
337 }
338
339 alphasize = buildAlphabetFromEquivSets(esets, alpha, unalpha);
340 }
341
342public:
343 void transition(const StateSet &in, StateSet *next) {
344 /* track the dfa state, reset nfa states */
345 u32 wdelay = in.wdelay ? in.wdelay - 1 : 0;
346
347 for (symbol_t s = 0; s < alphasize; s++) {
348 next[s].wrap_state.reset();
349 next[s].wdelay = wdelay;
350 }
351
352 nfa_state_set succ;
353
354 if (wdelay != in.wdelay) {
355 DEBUG_PRINTF("enabling start\n");
356 succ.set(vertexToIndex[g.startDs]);
357 }
358
359 for (size_t i = in.wrap_state.find_first(); i != nfa_state_set::npos;
360 i = in.wrap_state.find_next(i)) {
361 NFAVertex v = indexToVertex[i];
362 for (auto w : adjacent_vertices_range(v, g)) {
363 if (!contains(vertexToIndex, w)
364 || w == g.accept || w == g.acceptEod) {
365 continue;
366 }
367
368 if (w == g.startDs) {
369 continue;
370 }
371
372 succ.set(vertexToIndex[w]);
373 }
374 }
375
376 for (size_t j = succ.find_first(); j != nfa_state_set::npos;
377 j = succ.find_next(j)) {
378 const CharReach &cr = cr_by_index[j];
379 for (size_t s = cr.find_first(); s != CharReach::npos;
380 s = cr.find_next(s)) {
381 next[s].wrap_state.set(j); /* pre alpha'ed */
382 }
383 }
384
385 next[alpha[TOP]] = in;
386 }
387
388 const vector<StateSet> initial() {
389 return {init};
390 }
391
392 void reports(const StateSet &in, flat_set<ReportID> &rv) {
393 rv.clear();
394 for (size_t i = in.wrap_state.find_first(); i != nfa_state_set::npos;
395 i = in.wrap_state.find_next(i)) {
396 NFAVertex v = indexToVertex[i];
397 if (edge(v, g.accept, g).second) {
398 assert(!g[v].reports.empty());
399 insert(&rv, g[v].reports);
400 } else {
401 assert(g[v].reports.empty());
402 }
403 }
404 }
405
406 void reportsEod(const StateSet &, flat_set<ReportID> &r) {
407 r.clear();
408 }
409
410 static bool canPrune(const flat_set<ReportID> &) {
411 /* used by ng_ to prune states after highlander accepts */
412 return false;
413 }
414
415private:
416 const NGHolder &g;
417 unordered_map<NFAVertex, u32> vertexToIndex;
418 vector<NFAVertex> indexToVertex;
419 vector<CharReach> cr_by_index;
420 StateSet init;
421public:
422 StateSet dead;
423 array<u16, ALPHABET_SIZE> alpha;
424 array<u16, ALPHABET_SIZE> unalpha;
425 u16 alphasize;
426};
427
428} // namespace
429
430static
431bool check_dupe(const raw_dfa &rdfa,
432 const vector<unique_ptr<raw_dfa>> &existing, ReportID *remap) {
433 if (!remap) {
434 DEBUG_PRINTF("no remap\n");
435 return false;
436 }
437
438 set<ReportID> rdfa_reports;
439 for (const auto &ds : rdfa.states) {
440 rdfa_reports.insert(ds.reports.begin(), ds.reports.end());
441 }
442 if (rdfa_reports.size() != 1) {
443 return false; /* too complicated for now would need mapping TODO */
444 }
445
446 for (const auto &e_rdfa : existing) {
447 assert(e_rdfa);
448 const raw_dfa &b = *e_rdfa;
449
450 if (rdfa.start_anchored != b.start_anchored ||
451 rdfa.alpha_size != b.alpha_size ||
452 rdfa.states.size() != b.states.size() ||
453 rdfa.alpha_remap != b.alpha_remap) {
454 continue;
455 }
456
457 set<ReportID> b_reports;
458
459 for (u32 i = 0; i < b.states.size(); i++) {
460 assert(b.states[i].reports_eod.empty());
461 assert(rdfa.states[i].reports_eod.empty());
462 if (rdfa.states[i].reports.size() != b.states[i].reports.size()) {
463 goto next_dfa;
464 }
465 b_reports.insert(b.states[i].reports.begin(),
466 b.states[i].reports.end());
467
468 assert(rdfa.states[i].next.size() == b.states[i].next.size());
469 if (!equal(rdfa.states[i].next.begin(), rdfa.states[i].next.end(),
470 b.states[i].next.begin())) {
471 goto next_dfa;
472 }
473 }
474
475 if (b_reports.size() != 1) {
476 continue;
477 }
478
479 *remap = *b_reports.begin();
480 DEBUG_PRINTF("dupe found remapping to %u\n", *remap);
481 return true;
482 next_dfa:;
483 }
484
485 return false;
486}
487
488static
489bool check_dupe_simple(const RoseBuildImpl &build, u32 min_bound, u32 max_bound,
490 const ue2_literal &lit, ReportID *remap) {
491 if (!remap) {
492 DEBUG_PRINTF("no remap\n");
493 return false;
494 }
495
496 simple_anchored_info sai(min_bound, max_bound, lit);
497 if (contains(build.anchored_simple, sai)) {
498 *remap = *build.anchored_simple.at(sai).begin();
499 return true;
500 }
501
502 return false;
503}
504
505static
506NFAVertex extractLiteral(const NGHolder &h, ue2_literal *lit) {
507 vector<NFAVertex> lit_verts;
508 NFAVertex v = h.accept;
509 while ((v = getSoleSourceVertex(h, v))) {
510 const CharReach &cr = h[v].char_reach;
511 if (cr.count() > 1 && !cr.isCaselessChar()) {
512 break;
513 }
514 lit_verts.push_back(v);
515 }
516
517 if (lit_verts.empty()) {
518 return NGHolder::null_vertex();
519 }
520
521 bool nocase = false;
522 bool case_set = false;
523
524 for (auto it = lit_verts.rbegin(), ite = lit_verts.rend(); it != ite;
525 ++it) {
526 const CharReach &cr = h[*it].char_reach;
527 if (cr.isAlpha()) {
528 bool cr_nocase = cr.count() != 1;
529 if (case_set && cr_nocase != nocase) {
530 return NGHolder::null_vertex();
531 }
532
533 case_set = true;
534 nocase = cr_nocase;
535 lit->push_back(cr.find_first(), nocase);
536 } else {
537 lit->push_back(cr.find_first(), false);
538 }
539 }
540
541 return lit_verts.back();
542}
543
544static
545bool isSimple(const NGHolder &h, u32 *min_bound, u32 *max_bound,
546 ue2_literal *lit, u32 *report) {
547 assert(!proper_out_degree(h.startDs, h));
548 assert(in_degree(h.acceptEod, h) == 1);
549
550 DEBUG_PRINTF("looking for simple case\n");
551 NFAVertex lit_head = extractLiteral(h, lit);
552
553 if (lit_head == NGHolder::null_vertex()) {
554 DEBUG_PRINTF("no literal found\n");
555 return false;
556 }
557
558 const auto &reps = h[*inv_adjacent_vertices(h.accept, h).first].reports;
559
560 if (reps.size() != 1) {
561 return false;
562 }
563 *report = *reps.begin();
564
565 assert(!lit->empty());
566
567 set<NFAVertex> rep_exits;
568
569 /* lit should only be connected to dot vertices */
570 for (auto u : inv_adjacent_vertices_range(lit_head, h)) {
571 DEBUG_PRINTF("checking %zu\n", h[u].index);
572 if (!h[u].char_reach.all()) {
573 return false;
574 }
575
576 if (u != h.start) {
577 rep_exits.insert(u);
578 }
579 }
580
581 if (rep_exits.empty()) {
582 DEBUG_PRINTF("direct anchored\n");
583 assert(edge(h.start, lit_head, h).second);
584 *min_bound = 0;
585 *max_bound = 0;
586 return true;
587 }
588
589 NFAVertex key = *rep_exits.begin();
590
591 // Special-case the check for '^.foo' or '^.?foo'.
592 if (rep_exits.size() == 1 && edge(h.start, key, h).second &&
593 out_degree(key, h) == 1) {
594 DEBUG_PRINTF("one exit\n");
595 assert(edge(h.start, h.startDs, h).second);
596 size_t num_enters = out_degree(h.start, h);
597 if (num_enters == 2) {
598 DEBUG_PRINTF("^.{1,1} prefix\n");
599 *min_bound = 1;
600 *max_bound = 1;
601 return true;
602 }
603 if (num_enters == 3 && edge(h.start, lit_head, h).second) {
604 DEBUG_PRINTF("^.{0,1} prefix\n");
605 *min_bound = 0;
606 *max_bound = 1;
607 return true;
608 }
609 }
610
611 vector<GraphRepeatInfo> repeats;
612 findRepeats(h, 2, &repeats);
613
614 vector<GraphRepeatInfo>::const_iterator it;
615 for (it = repeats.begin(); it != repeats.end(); ++it) {
616 DEBUG_PRINTF("checking.. %zu verts\n", it->vertices.size());
617 if (find(it->vertices.begin(), it->vertices.end(), key)
618 != it->vertices.end()) {
619 break;
620 }
621 }
622 if (it == repeats.end()) {
623 DEBUG_PRINTF("no repeat found\n");
624 return false;
625 }
626
627 set<NFAVertex> rep_verts;
628 insert(&rep_verts, it->vertices);
629 if (!is_subset_of(rep_exits, rep_verts)) {
630 DEBUG_PRINTF("bad exit check\n");
631 return false;
632 }
633
634 set<NFAVertex> rep_enters;
635 insert(&rep_enters, adjacent_vertices(h.start, h));
636 rep_enters.erase(lit_head);
637 rep_enters.erase(h.startDs);
638
639 if (!is_subset_of(rep_enters, rep_verts)) {
640 DEBUG_PRINTF("bad entry check\n");
641 return false;
642 }
643
644 u32 min_b = it->repeatMin;
645 if (edge(h.start, lit_head, h).second) { /* jump edge */
646 if (min_b != 1) {
647 DEBUG_PRINTF("jump edge around repeat with min bound\n");
648 return false;
649 }
650
651 min_b = 0;
652 }
653 *min_bound = min_b;
654 *max_bound = it->repeatMax;
655
656 DEBUG_PRINTF("repeat %u %u before %s\n", *min_bound, *max_bound,
657 dumpString(*lit).c_str());
658 return true;
659}
660
661static
662int finalise_out(RoseBuildImpl &build, const NGHolder &h,
663 const Automaton_Holder &autom, unique_ptr<raw_dfa> out_dfa,
664 ReportID *remap) {
665 u32 min_bound = ~0U;
666 u32 max_bound = ~0U;
667 ue2_literal lit;
668 u32 simple_report = MO_INVALID_IDX;
669 if (isSimple(h, &min_bound, &max_bound, &lit, &simple_report)) {
670 assert(simple_report != MO_INVALID_IDX);
671 if (check_dupe_simple(build, min_bound, max_bound, lit, remap)) {
672 DEBUG_PRINTF("found duplicate remapping to %u\n", *remap);
673 return ANCHORED_REMAP;
674 }
675 DEBUG_PRINTF("add with report %u\n", simple_report);
676 build.anchored_simple[simple_anchored_info(min_bound, max_bound, lit)]
677 .insert(simple_report);
678 return ANCHORED_SUCCESS;
679 }
680
681 out_dfa->start_anchored = INIT_STATE;
682 out_dfa->start_floating = DEAD_STATE;
683 out_dfa->alpha_size = autom.alphasize;
684 out_dfa->alpha_remap = autom.alpha;
685 auto hash = hash_dfa_no_reports(*out_dfa);
686 if (check_dupe(*out_dfa, build.anchored_nfas[hash], remap)) {
687 return ANCHORED_REMAP;
688 }
689 build.anchored_nfas[hash].push_back(move(out_dfa));
690 return ANCHORED_SUCCESS;
691}
692
693static
694int addAutomaton(RoseBuildImpl &build, const NGHolder &h, ReportID *remap) {
695 if (num_vertices(h) > ANCHORED_NFA_STATE_LIMIT) {
696 DEBUG_PRINTF("autom bad!\n");
697 return ANCHORED_FAIL;
698 }
699
700 Automaton_Holder autom(h);
701
702 auto out_dfa = ue2::make_unique<raw_dfa>(NFA_OUTFIX_RAW);
703 if (determinise(autom, out_dfa->states, MAX_DFA_STATES)) {
704 return finalise_out(build, h, autom, move(out_dfa), remap);
705 }
706
707 DEBUG_PRINTF("determinise failed\n");
708 return ANCHORED_FAIL;
709}
710
711static
712void setReports(NGHolder &h, const map<NFAVertex, set<u32>> &reportMap,
713 const unordered_map<NFAVertex, NFAVertex> &orig_to_copy) {
714 for (const auto &m : reportMap) {
715 NFAVertex t = orig_to_copy.at(m.first);
716 assert(!m.second.empty());
717 add_edge(t, h.accept, h);
718 insert(&h[t].reports, m.second);
719 }
720}
721
722int addAnchoredNFA(RoseBuildImpl &build, const NGHolder &wrapper,
723 const map<NFAVertex, set<u32>> &reportMap) {
724 NGHolder h;
725 unordered_map<NFAVertex, NFAVertex> orig_to_copy;
726 cloneHolder(h, wrapper, &orig_to_copy);
727 clear_in_edges(h.accept, h);
728 clear_in_edges(h.acceptEod, h);
729 add_edge(h.accept, h.acceptEod, h);
730 clearReports(h);
731 setReports(h, reportMap, orig_to_copy);
732
733 return addAutomaton(build, h, nullptr);
734}
735
736int addToAnchoredMatcher(RoseBuildImpl &build, const NGHolder &anchored,
737 u32 exit_id, ReportID *remap) {
738 NGHolder h;
739 cloneHolder(h, anchored);
740 clearReports(h);
741 assert(in_degree(h.acceptEod, h) == 1);
742 for (auto v : inv_adjacent_vertices_range(h.accept, h)) {
743 h[v].reports.clear();
744 h[v].reports.insert(exit_id);
745 }
746
747 return addAutomaton(build, h, remap);
748}
749
750static
751void buildSimpleDfas(const RoseBuildImpl &build, const vector<u32> &frag_map,
752 vector<unique_ptr<raw_dfa>> *anchored_dfas) {
753 /* we should have determinised all of these before so there should be no
754 * chance of failure. */
755 flat_set<u32> exit_ids;
756 for (const auto &simple : build.anchored_simple) {
757 exit_ids.clear();
758 for (auto lit_id : simple.second) {
759 assert(lit_id < frag_map.size());
760 exit_ids.insert(frag_map[lit_id]);
761 }
762 auto h = populate_holder(simple.first, exit_ids);
763 Automaton_Holder autom(*h);
764 auto rdfa = ue2::make_unique<raw_dfa>(NFA_OUTFIX_RAW);
765 UNUSED bool rv = determinise(autom, rdfa->states, MAX_DFA_STATES);
766 assert(rv);
767 rdfa->start_anchored = INIT_STATE;
768 rdfa->start_floating = DEAD_STATE;
769 rdfa->alpha_size = autom.alphasize;
770 rdfa->alpha_remap = autom.alpha;
771 anchored_dfas->push_back(move(rdfa));
772 }
773}
774
775/**
776 * Fill the given vector with all of the raw_dfas we need to compile into the
777 * anchored matcher. Takes ownership of the input structures, clearing them
778 * from RoseBuildImpl.
779 */
780static
781vector<unique_ptr<raw_dfa>> getAnchoredDfas(RoseBuildImpl &build,
782 const vector<u32> &frag_map) {
783 vector<unique_ptr<raw_dfa>> dfas;
784
785 // DFAs that already exist as raw_dfas.
786 for (auto &anch_dfas : build.anchored_nfas) {
787 for (auto &rdfa : anch_dfas.second) {
788 dfas.push_back(move(rdfa));
789 }
790 }
791 build.anchored_nfas.clear();
792
793 // DFAs we currently have as simple literals.
794 if (!build.anchored_simple.empty()) {
795 buildSimpleDfas(build, frag_map, &dfas);
796 build.anchored_simple.clear();
797 }
798
799 return dfas;
800}
801
802/**
803 * \brief Builds our anchored DFAs into runtime NFAs.
804 *
805 * Constructs a vector of NFA structures and a vector of their start offsets
806 * (number of dots removed from the prefix) from the raw_dfa structures given.
807 *
808 * Note: frees the raw_dfa structures on completion.
809 *
810 * \return Total bytes required for the complete anchored matcher.
811 */
812static
813size_t buildNfas(vector<raw_dfa> &anchored_dfas,
814 vector<bytecode_ptr<NFA>> *nfas,
815 vector<u32> *start_offset, const CompileContext &cc,
816 const ReportManager &rm) {
817 const size_t num_dfas = anchored_dfas.size();
818
819 nfas->reserve(num_dfas);
820 start_offset->reserve(num_dfas);
821
822 size_t total_size = 0;
823
824 for (auto &rdfa : anchored_dfas) {
825 u32 removed_dots = remove_leading_dots(rdfa);
826 start_offset->push_back(removed_dots);
827
828 minimize_hopcroft(rdfa, cc.grey);
829
830 auto nfa = mcclellanCompile(rdfa, cc, rm, false);
831 if (!nfa) {
832 assert(0);
833 throw std::bad_alloc();
834 }
835
836 assert(nfa->length);
837 total_size += ROUNDUP_CL(sizeof(anchored_matcher_info) + nfa->length);
838 nfas->push_back(move(nfa));
839 }
840
841 // We no longer need to keep the raw_dfa structures around.
842 anchored_dfas.clear();
843
844 return total_size;
845}
846
847vector<raw_dfa> buildAnchoredDfas(RoseBuildImpl &build,
848 const vector<LitFragment> &fragments) {
849 vector<raw_dfa> dfas;
850
851 if (build.anchored_nfas.empty() && build.anchored_simple.empty()) {
852 DEBUG_PRINTF("empty\n");
853 return dfas;
854 }
855
856 const auto frag_map = reverseFragMap(build, fragments);
857 remapAnchoredReports(build, frag_map);
858
859 auto anch_dfas = getAnchoredDfas(build, frag_map);
860 mergeAnchoredDfas(anch_dfas, build);
861
862 dfas.reserve(anch_dfas.size());
863 for (auto &rdfa : anch_dfas) {
864 assert(rdfa);
865 dfas.push_back(move(*rdfa));
866 }
867 return dfas;
868}
869
870bytecode_ptr<anchored_matcher_info>
871buildAnchoredMatcher(RoseBuildImpl &build, const vector<LitFragment> &fragments,
872 vector<raw_dfa> &dfas) {
873 const CompileContext &cc = build.cc;
874
875 if (dfas.empty()) {
876 DEBUG_PRINTF("empty\n");
877 return nullptr;
878 }
879
880 for (auto &rdfa : dfas) {
881 remapIdsToPrograms(fragments, rdfa);
882 }
883
884 vector<bytecode_ptr<NFA>> nfas;
885 vector<u32> start_offset; // start offset for each dfa (dots removed)
886 size_t total_size = buildNfas(dfas, &nfas, &start_offset, cc, build.rm);
887
888 if (total_size > cc.grey.limitRoseAnchoredSize) {
889 throw ResourceLimitError();
890 }
891
892 auto atable =
893 make_zeroed_bytecode_ptr<anchored_matcher_info>(total_size, 64);
894 char *curr = (char *)atable.get();
895
896 u32 state_offset = 0;
897 for (size_t i = 0; i < nfas.size(); i++) {
898 const NFA *nfa = nfas[i].get();
899 anchored_matcher_info *ami = (anchored_matcher_info *)curr;
900 char *prev_curr = curr;
901
902 curr += sizeof(anchored_matcher_info);
903
904 memcpy(curr, nfa, nfa->length);
905 curr += nfa->length;
906 curr = ROUNDUP_PTR(curr, 64);
907
908 if (i + 1 == nfas.size()) {
909 ami->next_offset = 0U;
910 } else {
911 ami->next_offset = verify_u32(curr - prev_curr);
912 }
913
914 ami->state_offset = state_offset;
915 state_offset += nfa->streamStateSize;
916 ami->anchoredMinDistance = start_offset[i];
917 }
918
919 DEBUG_PRINTF("success %zu\n", atable.size());
920 return atable;
921}
922
923} // namespace ue2
924