1/*
2 * Copyright (c) 2015-2017, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include "rose_build_impl.h"
30
31#include "ue2common.h"
32#include "grey.h"
33#include "rose_build_add_internal.h"
34#include "rose_build_anchored.h"
35#include "rose_in_util.h"
36#include "hwlm/hwlm_literal.h"
37#include "nfagraph/ng_depth.h"
38#include "nfagraph/ng_dump.h"
39#include "nfagraph/ng_holder.h"
40#include "nfagraph/ng_limex.h"
41#include "nfagraph/ng_reports.h"
42#include "nfagraph/ng_util.h"
43#include "nfagraph/ng_width.h"
44#include "util/charreach.h"
45#include "util/charreach_util.h"
46#include "util/compare.h"
47#include "util/compile_context.h"
48#include "util/container.h"
49#include "util/dump_charclass.h"
50#include "util/graph.h"
51#include "util/make_unique.h"
52#include "util/ue2string.h"
53#include "util/verify_types.h"
54
55#include <algorithm>
56#include <map>
57#include <set>
58#include <string>
59#include <vector>
60#include <utility>
61
62using namespace std;
63
64namespace ue2 {
65
66#define MIN_MASK_LIT_LEN 2
67#define MAX_MASK_SIZE 255
68#define MAX_MASK_LITS 30
69
70static
71void findMaskLiteral(const vector<CharReach> &mask, bool streaming,
72 ue2_literal *lit, u32 *offset, const Grey &grey) {
73 bool case_fixed = false;
74 bool nocase = false;
75
76 size_t best_begin = 0;
77 size_t best_end = 0;
78 size_t best_len = 0;
79
80 size_t begin = 0;
81 size_t end = 0;
82
83 for (size_t i = 0; i < mask.size(); i++) {
84 bool fail = false;
85 if (mask[i].count() != 1 && !mask[i].isCaselessChar()) {
86 DEBUG_PRINTF("hit non-literal char, resetting at %zu\n", i);
87 fail = true;
88 }
89
90 if (!fail && streaming && (end >= grey.maxHistoryAvailable + 1)) {
91 DEBUG_PRINTF("hit literal limit, resetting at %zu\n", i);
92 fail = true;
93 }
94
95 if (!fail && case_fixed && mask[i].isAlpha()) {
96 if (nocase && mask[i].count() != 2) {
97 fail = true;
98 }
99
100 if (!nocase && mask[i].count() != 1) {
101 fail = true;
102 }
103 }
104
105 if (fail) {
106 case_fixed = false;
107 nocase = false;
108 size_t len = end - begin;
109 bool better = len > best_len;
110 if (better) {
111 best_begin = begin;
112 best_end = end;
113 best_len = len;
114 }
115 begin = i + 1;
116 end = i + 1;
117 } else {
118 assert(end == i);
119 end = i + 1;
120
121 if (mask[i].isAlpha()) {
122 case_fixed = true;
123 nocase = mask[i].count() == 2;
124 }
125 }
126 }
127
128 size_t len = end - begin;
129 /* Everybody would rather be trigger towards the end */
130 bool better = len >= best_len && mask.size() - end <= MAX_DELAY;
131
132 if (better) {
133 best_begin = begin;
134 best_end = end;
135 best_len = len;
136 }
137
138 for (size_t i = best_begin; i < best_end; i++) {
139 assert(mask[i].count() == 1 || mask[i].count() == 2);
140 lit->push_back(mask[i].find_first(), mask[i].count() > 1);
141 }
142
143 *offset = verify_u32(best_begin);
144}
145
146static
147bool initFmlCandidates(const CharReach &cr, vector<ue2_literal> &cand) {
148 for (size_t i = cr.find_first(); i != cr.npos; i = cr.find_next(i)) {
149 char c = (char)i;
150 bool nocase = myisupper(c) && cr.test(mytolower(c));
151 if (myislower(c) && cr.test(mytoupper(c))) {
152 continue;
153 }
154
155 if (cand.size() >= MAX_MASK_LITS) {
156 DEBUG_PRINTF("hit lit limit of %u\n", MAX_MASK_LITS);
157 return false;
158 }
159
160 cand.emplace_back(c, nocase);
161 }
162
163 assert(cand.size() <= MAX_MASK_LITS);
164 return !cand.empty();
165}
166
167static
168bool expandFmlCandidates(const CharReach &cr, vector<ue2_literal> &curr,
169 vector<ue2_literal> &cand) {
170 DEBUG_PRINTF("expanding string with cr of %zu\n", cr.count());
171 DEBUG_PRINTF(" current cand list size %zu\n", cand.size());
172
173 curr.clear();
174
175 for (size_t i = cr.find_first(); i != cr.npos; i = cr.find_next(i)) {
176 char c = (char)i;
177 bool nocase = myisupper(c) && cr.test(mytolower(c));
178 if (myislower(c) && cr.test(mytoupper(c))) {
179 continue;
180 }
181
182 for (const auto &lit : cand) {
183 if (curr.size() >= MAX_MASK_LITS) {
184 DEBUG_PRINTF("hit lit limit of %u\n", MAX_MASK_LITS);
185 return false;
186 }
187
188 curr.push_back(lit);
189 curr.back().push_back(c, nocase);
190 }
191 }
192
193 if (curr.back().length() > MAX_MASK2_WIDTH &&
194 any_of(begin(curr), end(curr), mixed_sensitivity)) {
195 DEBUG_PRINTF("mixed-sensitivity lit is too long, stopping\n");
196 return false;
197 }
198
199 assert(curr.size() <= MAX_MASK_LITS);
200 cand.swap(curr);
201 return true;
202}
203
204static
205u32 scoreFmlCandidates(const vector<ue2_literal> &cand) {
206 if (cand.empty()) {
207 DEBUG_PRINTF("no candidates\n");
208 return 0;
209 }
210
211 const u32 len = cand.back().length();
212
213 DEBUG_PRINTF("length = %u count %zu\n", len, cand.size());
214 u32 min_period = len;
215
216 for (const auto &lit : cand) {
217 DEBUG_PRINTF("candidate: %s\n", dumpString(lit).c_str());
218 u32 period = lit.length() - maxStringSelfOverlap(lit);
219 min_period = min(min_period, period);
220 }
221 DEBUG_PRINTF("min_period %u\n", min_period);
222 u32 length_score =
223 (5 * min_period + len) * (cand.back().any_nocase() ? 90 : 100);
224 u32 count_penalty;
225 if (len > 4) {
226 count_penalty = 9 * len * cand.size();
227 } else {
228 count_penalty = 5 * cand.size();
229 }
230 if (length_score <= count_penalty) {
231 return 1;
232 }
233 return length_score - count_penalty;
234}
235
236/* favours later literals */
237static
238bool findMaskLiterals(const vector<CharReach> &mask, vector<ue2_literal> *lit,
239 u32 *minBound, u32 *length) {
240 *minBound = 0;
241 *length = 0;
242
243 vector<ue2_literal> candidates, best_candidates, curr_candidates;
244 u32 best_score = 0;
245 u32 best_minOffset = 0;
246
247 for (auto it = mask.begin(); it != mask.end(); ++it) {
248 candidates.clear();
249 if (!initFmlCandidates(*it, candidates)) {
250 DEBUG_PRINTF("failed to init\n");
251 continue;
252 }
253 DEBUG_PRINTF("++\n");
254 auto jt = it;
255 while (jt != mask.begin()) {
256 --jt;
257 DEBUG_PRINTF("--\n");
258 if (!expandFmlCandidates(*jt, curr_candidates, candidates)) {
259 DEBUG_PRINTF("expansion stopped\n");
260 break;
261 }
262 }
263
264 // Candidates have been expanded in reverse order.
265 for (auto &cand : candidates) {
266 cand = reverse_literal(cand);
267 }
268
269 u32 score = scoreFmlCandidates(candidates);
270 DEBUG_PRINTF("scored %u for literal set of size %zu\n", score,
271 candidates.size());
272 if (!candidates.empty() && score >= best_score) {
273 best_minOffset = it - mask.begin() - candidates.back().length() + 1;
274 best_candidates.swap(candidates);
275 best_score = score;
276 }
277 }
278
279 if (!best_score) {
280 DEBUG_PRINTF("no lits\n");
281 return false;
282 }
283
284 *minBound = best_minOffset;
285 *length = best_candidates.back().length();
286
287 DEBUG_PRINTF("best minbound %u length %u\n", *minBound, *length);
288
289 assert(all_of_in(best_candidates, [&](const ue2_literal &s) {
290 return s.length() == *length;
291 }));
292
293 *lit = std::move(best_candidates);
294 return true;
295}
296
297static
298unique_ptr<NGHolder> buildMaskLhs(bool anchored, u32 prefix_len,
299 const vector<CharReach> &mask) {
300 DEBUG_PRINTF("build %slhs len %u/%zu\n", anchored ? "anc " : "", prefix_len,
301 mask.size());
302
303 unique_ptr<NGHolder> lhs = ue2::make_unique<NGHolder>(NFA_PREFIX);
304
305 assert(prefix_len);
306 assert(mask.size() >= prefix_len);
307 NFAVertex pred = anchored ? lhs->start : lhs->startDs;
308
309 u32 m_idx = 0;
310 while (prefix_len--) {
311 NFAVertex v = add_vertex(*lhs);
312 (*lhs)[v].char_reach = mask[m_idx++];
313 add_edge(pred, v, *lhs);
314 pred = v;
315 }
316 add_edge(pred, lhs->accept, *lhs);
317 (*lhs)[pred].reports.insert(0);
318
319 return lhs;
320}
321
322static
323void buildLiteralMask(const vector<CharReach> &mask, vector<u8> &msk,
324 vector<u8> &cmp, u32 delay) {
325 msk.clear();
326 cmp.clear();
327 if (mask.size() <= delay) {
328 return;
329 }
330
331 // Construct an and/cmp mask from our mask ending at delay positions before
332 // the end of the literal, with max length HWLM_MASKLEN.
333
334 auto ite = mask.end() - delay;
335 auto it = ite - min(size_t{HWLM_MASKLEN}, mask.size() - delay);
336
337 for (; it != ite; ++it) {
338 msk.push_back(0);
339 cmp.push_back(0);
340 make_and_cmp_mask(*it, &msk.back(), &cmp.back());
341 }
342
343 assert(msk.size() == cmp.size());
344 assert(msk.size() <= HWLM_MASKLEN);
345}
346
347static
348bool validateTransientMask(const vector<CharReach> &mask, bool anchored,
349 bool eod, const Grey &grey) {
350 assert(!mask.empty());
351
352 // An EOD anchored mask requires that everything fit into history, while an
353 // ordinary floating case can handle one byte more (i.e., max history size
354 // and one byte in the buffer).
355 const size_t max_width = grey.maxHistoryAvailable + (eod ? 0 : 1);
356 if (mask.size() > max_width) {
357 DEBUG_PRINTF("mask too long for max available history\n");
358 return false;
359 }
360
361 /* although anchored masks cannot be transient, short masks may be placed
362 * into the atable. */
363 if (anchored && mask.size() > grey.maxAnchoredRegion) {
364 return false;
365 }
366
367 vector<ue2_literal> lits;
368 u32 lit_minBound; /* minBound of each literal in lit */
369 u32 lit_length; /* length of each literal in lit */
370 if (!findMaskLiterals(mask, &lits, &lit_minBound, &lit_length)) {
371 DEBUG_PRINTF("failed to find any lits\n");
372 return false;
373 }
374
375 if (lits.empty()) {
376 return false;
377 }
378
379 const u32 delay = mask.size() - lit_length - lit_minBound;
380 if (delay > MAX_DELAY) {
381 DEBUG_PRINTF("delay %u is too much\n", delay);
382 return false;
383 }
384
385 if (lit_length == 1 && lits.size() > 3) {
386 DEBUG_PRINTF("no decent trigger\n");
387 return false;
388 }
389
390 // Mixed-sensitivity literals require benefits masks to implement, and thus
391 // have a maximum length. This has been taken into account in
392 // findMaskLiterals.
393 assert(lit_length <= MAX_MASK2_WIDTH ||
394 none_of(begin(lits), end(lits), mixed_sensitivity));
395
396 // Build the HWLM literal mask.
397 vector<u8> msk, cmp;
398 if (grey.roseHamsterMasks) {
399 buildLiteralMask(mask, msk, cmp, delay);
400 }
401
402 // We consider the HWLM mask length to run from the first non-zero byte to
403 // the end, and let max(mask length, literal length) be the effective
404 // literal length.
405 //
406 // A one-byte literal with no mask is too short, but a one-byte literal
407 // with a few bytes of mask information is OK.
408
409 u32 msk_length = distance(find_if(begin(msk), end(msk),
410 [](u8 v) { return v != 0; }), end(msk));
411 u32 eff_lit_length = max(lit_length, msk_length);
412 DEBUG_PRINTF("msk_length=%u, eff_lit_length = %u\n", msk_length,
413 eff_lit_length);
414
415 if (eff_lit_length < MIN_MASK_LIT_LEN) {
416 DEBUG_PRINTF("literals too short\n");
417 return false;
418 }
419
420 DEBUG_PRINTF("mask is ok\n");
421 return true;
422}
423
424static
425bool maskIsNeeded(const ue2_literal &lit, const NGHolder &g) {
426 flat_set<NFAVertex> curr = {g.accept};
427 flat_set<NFAVertex> next;
428
429 for (auto it = lit.rbegin(), ite = lit.rend(); it != ite; ++it) {
430 const CharReach &cr = *it;
431 DEBUG_PRINTF("check %s\n", describeClass(*it).c_str());
432 next.clear();
433 for (auto v : curr) {
434 for (auto u : inv_adjacent_vertices_range(v, g)) {
435 if (isSubsetOf(cr, g[u].char_reach)) {
436 next.insert(u);
437 }
438 }
439 }
440 if (next.empty()) {
441 DEBUG_PRINTF("no path to start\n");
442 return true;
443 }
444 curr.swap(next);
445 }
446
447 for (auto v : curr) {
448 for (auto u : inv_adjacent_vertices_range(v, g)) {
449 if (u == g.start || u == g.startDs) {
450 DEBUG_PRINTF("literal spans graph from start to accept\n");
451 return false;
452
453 }
454 }
455 }
456
457 DEBUG_PRINTF("literal doesn't reach start\n");
458 return true;
459}
460
461static
462void addTransientMask(RoseBuildImpl &build, const vector<CharReach> &mask,
463 const flat_set<ReportID> &reports, bool anchored,
464 bool eod) {
465 vector<ue2_literal> lits;
466 u32 lit_minBound; /* minBound of each literal in lit */
467 u32 lit_length; /* length of each literal in lit */
468 if (!findMaskLiterals(mask, &lits, &lit_minBound, &lit_length)) {
469 DEBUG_PRINTF("failed to find any lits\n");
470 assert(0);
471 return;
472 }
473
474 DEBUG_PRINTF("%zu literals, minBound=%u, length=%u\n", lits.size(),
475 lit_minBound, lit_length);
476
477 if (lits.empty()) {
478 assert(0);
479 return;
480 }
481
482 u32 delay = mask.size() - lit_length - lit_minBound;
483 assert(delay <= MAX_DELAY);
484 DEBUG_PRINTF("delay=%u\n", delay);
485
486 shared_ptr<NGHolder> mask_graph = buildMaskLhs(anchored, mask.size(), mask);
487
488 u32 mask_lag = 0; /* TODO */
489
490 // Everyone gets the same report ID.
491 ReportID mask_report = build.getNewNfaReport();
492 set_report(*mask_graph, mask_report);
493
494 // Build the HWLM literal mask.
495 vector<u8> msk, cmp;
496 if (build.cc.grey.roseHamsterMasks) {
497 buildLiteralMask(mask, msk, cmp, delay);
498 }
499
500 /* adjust bounds to be relative to trigger rather than mask */
501 const u32 v_min_offset = add_rose_depth(0, mask.size());
502 const u32 v_max_offset =
503 add_rose_depth(anchored ? 0 : ROSE_BOUND_INF, mask.size());
504
505 RoseGraph &g = build.g;
506
507 // By default, masked literals go into the floating table (except for eod
508 // cases).
509 enum rose_literal_table table = ROSE_FLOATING;
510
511 RoseVertex eod_v = RoseGraph::null_vertex();
512 if (eod) {
513 eod_v = add_vertex(g);
514 g[eod_v].eod_accept = true;
515 insert(&g[eod_v].reports, reports);
516 g[eod_v].min_offset = v_min_offset;
517 g[eod_v].max_offset = v_max_offset;
518
519 // Note: because this is a transient mask, we know that we can match it
520 // completely inside the history buffer. So, using the EOD literal
521 // table is always safe.
522 table = ROSE_EOD_ANCHORED;
523
524 // Widen the EOD table window to cover the mask.
525 ENSURE_AT_LEAST(&build.ematcher_region_size, mask.size());
526 }
527
528 const flat_set<ReportID> no_reports;
529
530 for (const auto &lit : lits) {
531 u32 lit_id = build.getLiteralId(lit, msk, cmp, delay, table);
532 const RoseVertex parent = anchored ? build.anchored_root : build.root;
533 bool use_mask = delay || maskIsNeeded(lit, *mask_graph);
534
535 auto v = createVertex(&build, parent, 0, ROSE_BOUND_INF, lit_id,
536 lit.length(), eod ? no_reports : reports);
537
538 if (use_mask) {
539 g[v].left.graph = mask_graph;
540 g[v].left.lag = mask_lag;
541 g[v].left.leftfix_report = mask_report;
542 } else {
543 // Make sure our edge bounds are correct.
544 RoseEdge e = edge(parent, v, g);
545 g[e].minBound = 0;
546 g[e].maxBound = anchored ? 0 : ROSE_BOUND_INF;
547 g[e].history = anchored ? ROSE_ROLE_HISTORY_ANCH
548 : ROSE_ROLE_HISTORY_NONE;
549 }
550
551 // Set offsets correctly.
552 g[v].min_offset = v_min_offset;
553 g[v].max_offset = v_max_offset;
554
555 if (eod) {
556 RoseEdge e = add_edge(v, eod_v, g);
557 g[e].minBound = 0;
558 g[e].maxBound = 0;
559 g[e].history = ROSE_ROLE_HISTORY_LAST_BYTE;
560 }
561 }
562}
563
564static
565unique_ptr<NGHolder> buildMaskRhs(const flat_set<ReportID> &reports,
566 const vector<CharReach> &mask,
567 u32 suffix_len) {
568 assert(suffix_len);
569 assert(mask.size() > suffix_len);
570
571 unique_ptr<NGHolder> rhs = ue2::make_unique<NGHolder>(NFA_SUFFIX);
572 NGHolder &h = *rhs;
573
574 NFAVertex succ = h.accept;
575 u32 m_idx = mask.size() - 1;
576 while (suffix_len--) {
577 NFAVertex u = add_vertex(h);
578 if (succ == h.accept) {
579 h[u].reports.insert(reports.begin(), reports.end());
580 }
581 h[u].char_reach = mask[m_idx--];
582 add_edge(u, succ, h);
583 succ = u;
584 }
585
586 NFAEdge e = add_edge(h.start, succ, h);
587 h[e].tops.insert(DEFAULT_TOP);
588
589 return rhs;
590}
591
592static
593void doAddMask(RoseBuildImpl &tbi, bool anchored, const vector<CharReach> &mask,
594 const ue2_literal &lit, u32 prefix_len, u32 suffix_len,
595 const flat_set<ReportID> &reports) {
596 /* Note: bounds are relative to literal start */
597 RoseInGraph ig;
598 RoseInVertex s = add_vertex(RoseInVertexProps::makeStart(anchored), ig);
599 RoseInVertex v = add_vertex(RoseInVertexProps::makeLiteral(lit), ig);
600
601 DEBUG_PRINTF("pref + lit = %u\n", prefix_len);
602 assert(prefix_len >= lit.length());
603
604 // prefix len is relative to end of literal.
605 u32 minBound = prefix_len - lit.length();
606
607 if (minBound) {
608 if (anchored && prefix_len > tbi.cc.grey.maxAnchoredRegion) {
609 DEBUG_PRINTF("too deep\n");
610 /* see if there is an anchored literal we can also hang off */
611
612 ue2_literal lit2;
613 u32 lit2_offset;
614 vector<CharReach> mask2 = mask;
615 assert(mask2.size() > tbi.cc.grey.maxAnchoredRegion);
616 mask2.resize(MIN(tbi.cc.grey.maxAnchoredRegion, minBound));
617
618 findMaskLiteral(mask2, tbi.cc.streaming, &lit2, &lit2_offset,
619 tbi.cc.grey);
620
621 if (lit2.length() >= MIN_MASK_LIT_LEN) {
622 u32 prefix2_len = lit2_offset + lit2.length();
623 assert(prefix2_len < minBound);
624 RoseInVertex u
625 = add_vertex(RoseInVertexProps::makeLiteral(lit2), ig);
626 if (lit2_offset){
627 DEBUG_PRINTF("building lhs (off %u)\n", lit2_offset);
628 shared_ptr<NGHolder> lhs2
629 = buildMaskLhs(true, lit2_offset, mask);
630 add_edge(s, u, RoseInEdgeProps(lhs2, lit2.length()), ig);
631 } else {
632 add_edge(s, u, RoseInEdgeProps(0, 0), ig);
633 }
634
635 /* midfix */
636 DEBUG_PRINTF("building mhs\n");
637 vector<CharReach> mask3(mask.begin() + prefix2_len, mask.end());
638 u32 overlap = maxOverlap(lit2, lit, 0);
639 u32 delay = lit.length() - overlap;
640 shared_ptr<NGHolder> mhs
641 = buildMaskLhs(true, minBound - prefix2_len + overlap,
642 mask3);
643 mhs->kind = NFA_INFIX;
644 setTops(*mhs);
645 add_edge(u, v, RoseInEdgeProps(mhs, delay), ig);
646
647 DEBUG_PRINTF("add anch literal too!\n");
648 goto do_rhs;
649 }
650 }
651
652 shared_ptr<NGHolder> lhs = buildMaskLhs(anchored, minBound, mask);
653 add_edge(s, v, RoseInEdgeProps(lhs, lit.length()), ig);
654 } else {
655 u32 maxBound = anchored ? minBound : ROSE_BOUND_INF;
656 add_edge(s, v, RoseInEdgeProps(minBound, maxBound), ig);
657 }
658
659 do_rhs:
660 if (suffix_len) {
661 shared_ptr<NGHolder> rhs = buildMaskRhs(reports, mask, suffix_len);
662 RoseInVertex a =
663 add_vertex(RoseInVertexProps::makeAccept(set<ReportID>()), ig);
664 add_edge(v, a, RoseInEdgeProps(rhs, 0), ig);
665 } else {
666 /* Note: masks have no eod connections */
667 RoseInVertex a
668 = add_vertex(RoseInVertexProps::makeAccept(reports), ig);
669 add_edge(v, a, RoseInEdgeProps(0U, 0U), ig);
670 }
671
672 calcVertexOffsets(ig);
673
674 bool rv = tbi.addRose(ig, false);
675
676 assert(rv); /* checkAllowMask should have prevented this */
677 if (!rv) {
678 throw std::exception();
679 }
680}
681
682static
683bool checkAllowMask(const vector<CharReach> &mask, ue2_literal *lit,
684 u32 *prefix_len, u32 *suffix_len,
685 const CompileContext &cc) {
686 assert(!mask.empty());
687 u32 lit_offset;
688 findMaskLiteral(mask, cc.streaming, lit, &lit_offset, cc.grey);
689
690 if (lit->length() < MIN_MASK_LIT_LEN && lit->length() != mask.size()) {
691 DEBUG_PRINTF("need more literal - bad mask\n");
692 return false;
693 }
694
695 DEBUG_PRINTF("mask lit '%s', len=%zu at offset=%u\n",
696 dumpString(*lit).c_str(), lit->length(), lit_offset);
697
698 assert(!cc.streaming || lit->length() <= cc.grey.maxHistoryAvailable + 1);
699
700 /* literal is included in the prefix nfa so that matches from the prefix
701 * can't occur in the history buffer - probably should tweak the NFA API
702 * to allow such matches not to be suppressed */
703 *prefix_len = lit_offset + lit->length();
704 *suffix_len = mask.size() - *prefix_len;
705 DEBUG_PRINTF("prefix_len=%u, suffix_len=%u\n", *prefix_len, *suffix_len);
706
707 /* check if we can backtrack sufficiently */
708 if (cc.streaming && *prefix_len > cc.grey.maxHistoryAvailable + 1) {
709 DEBUG_PRINTF("too much lag\n");
710 return false;
711 }
712
713 if (*suffix_len > MAX_MASK_SIZE || *prefix_len > MAX_MASK_SIZE) {
714 DEBUG_PRINTF("too big\n");
715 return false;
716 }
717
718 return true;
719}
720
721bool RoseBuildImpl::add(bool anchored, const vector<CharReach> &mask,
722 const flat_set<ReportID> &reports) {
723 if (validateTransientMask(mask, anchored, false, cc.grey)) {
724 bool eod = false;
725 addTransientMask(*this, mask, reports, anchored, eod);
726 return true;
727 }
728
729 ue2_literal lit;
730 u32 prefix_len = 0;
731 u32 suffix_len = 0;
732
733 if (!checkAllowMask(mask, &lit, &prefix_len, &suffix_len, cc)) {
734 return false;
735 }
736
737 /* we know that the mask can be handled now, start playing with the rose
738 * graph */
739 doAddMask(*this, anchored, mask, lit, prefix_len, suffix_len, reports);
740
741 return true;
742}
743
744bool RoseBuildImpl::validateMask(const vector<CharReach> &mask,
745 UNUSED const flat_set<ReportID> &reports,
746 bool anchored, bool eod) const {
747 return validateTransientMask(mask, anchored, eod, cc.grey);
748}
749
750static
751unique_ptr<NGHolder> makeAnchoredGraph(const vector<CharReach> &mask,
752 const flat_set<ReportID> &reports,
753 bool eod) {
754 auto gp = ue2::make_unique<NGHolder>();
755 NGHolder &g = *gp;
756
757 NFAVertex u = g.start;
758 for (const auto &cr : mask) {
759 NFAVertex v = add_vertex(g);
760 g[v].char_reach = cr;
761 add_edge(u, v, g);
762 u = v;
763 }
764
765
766 g[u].reports = reports;
767 add_edge(u, eod ? g.acceptEod : g.accept, g);
768
769 return gp;
770}
771
772static
773bool addAnchoredMask(RoseBuildImpl &build, const vector<CharReach> &mask,
774 const flat_set<ReportID> &reports, bool eod) {
775 if (!build.cc.grey.allowAnchoredAcyclic) {
776 return false;
777 }
778
779 auto g = makeAnchoredGraph(mask, reports, eod);
780 assert(g);
781
782 return build.addAnchoredAcyclic(*g);
783}
784
785void RoseBuildImpl::addMask(const vector<CharReach> &mask,
786 const flat_set<ReportID> &reports, bool anchored,
787 bool eod) {
788 if (anchored && addAnchoredMask(*this, mask, reports, eod)) {
789 DEBUG_PRINTF("added mask as anchored acyclic graph\n");
790 return;
791 }
792
793 addTransientMask(*this, mask, reports, anchored, eod);
794}
795
796} // namespace ue2
797