1 | /* |
2 | * Copyright (c) 2015-2017, Intel Corporation |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are met: |
6 | * |
7 | * * Redistributions of source code must retain the above copyright notice, |
8 | * this list of conditions and the following disclaimer. |
9 | * * Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * * Neither the name of Intel Corporation nor the names of its contributors |
13 | * may be used to endorse or promote products derived from this software |
14 | * without specific prior written permission. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | #include "rose_build_impl.h" |
30 | |
31 | #include "ue2common.h" |
32 | #include "grey.h" |
33 | #include "rose_build_add_internal.h" |
34 | #include "rose_build_anchored.h" |
35 | #include "rose_in_util.h" |
36 | #include "hwlm/hwlm_literal.h" |
37 | #include "nfagraph/ng_depth.h" |
38 | #include "nfagraph/ng_dump.h" |
39 | #include "nfagraph/ng_holder.h" |
40 | #include "nfagraph/ng_limex.h" |
41 | #include "nfagraph/ng_reports.h" |
42 | #include "nfagraph/ng_util.h" |
43 | #include "nfagraph/ng_width.h" |
44 | #include "util/charreach.h" |
45 | #include "util/charreach_util.h" |
46 | #include "util/compare.h" |
47 | #include "util/compile_context.h" |
48 | #include "util/container.h" |
49 | #include "util/dump_charclass.h" |
50 | #include "util/graph.h" |
51 | #include "util/make_unique.h" |
52 | #include "util/ue2string.h" |
53 | #include "util/verify_types.h" |
54 | |
55 | #include <algorithm> |
56 | #include <map> |
57 | #include <set> |
58 | #include <string> |
59 | #include <vector> |
60 | #include <utility> |
61 | |
62 | using namespace std; |
63 | |
64 | namespace ue2 { |
65 | |
66 | #define MIN_MASK_LIT_LEN 2 |
67 | #define MAX_MASK_SIZE 255 |
68 | #define MAX_MASK_LITS 30 |
69 | |
70 | static |
71 | void findMaskLiteral(const vector<CharReach> &mask, bool streaming, |
72 | ue2_literal *lit, u32 *offset, const Grey &grey) { |
73 | bool case_fixed = false; |
74 | bool nocase = false; |
75 | |
76 | size_t best_begin = 0; |
77 | size_t best_end = 0; |
78 | size_t best_len = 0; |
79 | |
80 | size_t begin = 0; |
81 | size_t end = 0; |
82 | |
83 | for (size_t i = 0; i < mask.size(); i++) { |
84 | bool fail = false; |
85 | if (mask[i].count() != 1 && !mask[i].isCaselessChar()) { |
86 | DEBUG_PRINTF("hit non-literal char, resetting at %zu\n" , i); |
87 | fail = true; |
88 | } |
89 | |
90 | if (!fail && streaming && (end >= grey.maxHistoryAvailable + 1)) { |
91 | DEBUG_PRINTF("hit literal limit, resetting at %zu\n" , i); |
92 | fail = true; |
93 | } |
94 | |
95 | if (!fail && case_fixed && mask[i].isAlpha()) { |
96 | if (nocase && mask[i].count() != 2) { |
97 | fail = true; |
98 | } |
99 | |
100 | if (!nocase && mask[i].count() != 1) { |
101 | fail = true; |
102 | } |
103 | } |
104 | |
105 | if (fail) { |
106 | case_fixed = false; |
107 | nocase = false; |
108 | size_t len = end - begin; |
109 | bool better = len > best_len; |
110 | if (better) { |
111 | best_begin = begin; |
112 | best_end = end; |
113 | best_len = len; |
114 | } |
115 | begin = i + 1; |
116 | end = i + 1; |
117 | } else { |
118 | assert(end == i); |
119 | end = i + 1; |
120 | |
121 | if (mask[i].isAlpha()) { |
122 | case_fixed = true; |
123 | nocase = mask[i].count() == 2; |
124 | } |
125 | } |
126 | } |
127 | |
128 | size_t len = end - begin; |
129 | /* Everybody would rather be trigger towards the end */ |
130 | bool better = len >= best_len && mask.size() - end <= MAX_DELAY; |
131 | |
132 | if (better) { |
133 | best_begin = begin; |
134 | best_end = end; |
135 | best_len = len; |
136 | } |
137 | |
138 | for (size_t i = best_begin; i < best_end; i++) { |
139 | assert(mask[i].count() == 1 || mask[i].count() == 2); |
140 | lit->push_back(mask[i].find_first(), mask[i].count() > 1); |
141 | } |
142 | |
143 | *offset = verify_u32(best_begin); |
144 | } |
145 | |
146 | static |
147 | bool initFmlCandidates(const CharReach &cr, vector<ue2_literal> &cand) { |
148 | for (size_t i = cr.find_first(); i != cr.npos; i = cr.find_next(i)) { |
149 | char c = (char)i; |
150 | bool nocase = myisupper(c) && cr.test(mytolower(c)); |
151 | if (myislower(c) && cr.test(mytoupper(c))) { |
152 | continue; |
153 | } |
154 | |
155 | if (cand.size() >= MAX_MASK_LITS) { |
156 | DEBUG_PRINTF("hit lit limit of %u\n" , MAX_MASK_LITS); |
157 | return false; |
158 | } |
159 | |
160 | cand.emplace_back(c, nocase); |
161 | } |
162 | |
163 | assert(cand.size() <= MAX_MASK_LITS); |
164 | return !cand.empty(); |
165 | } |
166 | |
167 | static |
168 | bool expandFmlCandidates(const CharReach &cr, vector<ue2_literal> &curr, |
169 | vector<ue2_literal> &cand) { |
170 | DEBUG_PRINTF("expanding string with cr of %zu\n" , cr.count()); |
171 | DEBUG_PRINTF(" current cand list size %zu\n" , cand.size()); |
172 | |
173 | curr.clear(); |
174 | |
175 | for (size_t i = cr.find_first(); i != cr.npos; i = cr.find_next(i)) { |
176 | char c = (char)i; |
177 | bool nocase = myisupper(c) && cr.test(mytolower(c)); |
178 | if (myislower(c) && cr.test(mytoupper(c))) { |
179 | continue; |
180 | } |
181 | |
182 | for (const auto &lit : cand) { |
183 | if (curr.size() >= MAX_MASK_LITS) { |
184 | DEBUG_PRINTF("hit lit limit of %u\n" , MAX_MASK_LITS); |
185 | return false; |
186 | } |
187 | |
188 | curr.push_back(lit); |
189 | curr.back().push_back(c, nocase); |
190 | } |
191 | } |
192 | |
193 | if (curr.back().length() > MAX_MASK2_WIDTH && |
194 | any_of(begin(curr), end(curr), mixed_sensitivity)) { |
195 | DEBUG_PRINTF("mixed-sensitivity lit is too long, stopping\n" ); |
196 | return false; |
197 | } |
198 | |
199 | assert(curr.size() <= MAX_MASK_LITS); |
200 | cand.swap(curr); |
201 | return true; |
202 | } |
203 | |
204 | static |
205 | u32 scoreFmlCandidates(const vector<ue2_literal> &cand) { |
206 | if (cand.empty()) { |
207 | DEBUG_PRINTF("no candidates\n" ); |
208 | return 0; |
209 | } |
210 | |
211 | const u32 len = cand.back().length(); |
212 | |
213 | DEBUG_PRINTF("length = %u count %zu\n" , len, cand.size()); |
214 | u32 min_period = len; |
215 | |
216 | for (const auto &lit : cand) { |
217 | DEBUG_PRINTF("candidate: %s\n" , dumpString(lit).c_str()); |
218 | u32 period = lit.length() - maxStringSelfOverlap(lit); |
219 | min_period = min(min_period, period); |
220 | } |
221 | DEBUG_PRINTF("min_period %u\n" , min_period); |
222 | u32 length_score = |
223 | (5 * min_period + len) * (cand.back().any_nocase() ? 90 : 100); |
224 | u32 count_penalty; |
225 | if (len > 4) { |
226 | count_penalty = 9 * len * cand.size(); |
227 | } else { |
228 | count_penalty = 5 * cand.size(); |
229 | } |
230 | if (length_score <= count_penalty) { |
231 | return 1; |
232 | } |
233 | return length_score - count_penalty; |
234 | } |
235 | |
236 | /* favours later literals */ |
237 | static |
238 | bool findMaskLiterals(const vector<CharReach> &mask, vector<ue2_literal> *lit, |
239 | u32 *minBound, u32 *length) { |
240 | *minBound = 0; |
241 | *length = 0; |
242 | |
243 | vector<ue2_literal> candidates, best_candidates, curr_candidates; |
244 | u32 best_score = 0; |
245 | u32 best_minOffset = 0; |
246 | |
247 | for (auto it = mask.begin(); it != mask.end(); ++it) { |
248 | candidates.clear(); |
249 | if (!initFmlCandidates(*it, candidates)) { |
250 | DEBUG_PRINTF("failed to init\n" ); |
251 | continue; |
252 | } |
253 | DEBUG_PRINTF("++\n" ); |
254 | auto jt = it; |
255 | while (jt != mask.begin()) { |
256 | --jt; |
257 | DEBUG_PRINTF("--\n" ); |
258 | if (!expandFmlCandidates(*jt, curr_candidates, candidates)) { |
259 | DEBUG_PRINTF("expansion stopped\n" ); |
260 | break; |
261 | } |
262 | } |
263 | |
264 | // Candidates have been expanded in reverse order. |
265 | for (auto &cand : candidates) { |
266 | cand = reverse_literal(cand); |
267 | } |
268 | |
269 | u32 score = scoreFmlCandidates(candidates); |
270 | DEBUG_PRINTF("scored %u for literal set of size %zu\n" , score, |
271 | candidates.size()); |
272 | if (!candidates.empty() && score >= best_score) { |
273 | best_minOffset = it - mask.begin() - candidates.back().length() + 1; |
274 | best_candidates.swap(candidates); |
275 | best_score = score; |
276 | } |
277 | } |
278 | |
279 | if (!best_score) { |
280 | DEBUG_PRINTF("no lits\n" ); |
281 | return false; |
282 | } |
283 | |
284 | *minBound = best_minOffset; |
285 | *length = best_candidates.back().length(); |
286 | |
287 | DEBUG_PRINTF("best minbound %u length %u\n" , *minBound, *length); |
288 | |
289 | assert(all_of_in(best_candidates, [&](const ue2_literal &s) { |
290 | return s.length() == *length; |
291 | })); |
292 | |
293 | *lit = std::move(best_candidates); |
294 | return true; |
295 | } |
296 | |
297 | static |
298 | unique_ptr<NGHolder> buildMaskLhs(bool anchored, u32 prefix_len, |
299 | const vector<CharReach> &mask) { |
300 | DEBUG_PRINTF("build %slhs len %u/%zu\n" , anchored ? "anc " : "" , prefix_len, |
301 | mask.size()); |
302 | |
303 | unique_ptr<NGHolder> lhs = ue2::make_unique<NGHolder>(NFA_PREFIX); |
304 | |
305 | assert(prefix_len); |
306 | assert(mask.size() >= prefix_len); |
307 | NFAVertex pred = anchored ? lhs->start : lhs->startDs; |
308 | |
309 | u32 m_idx = 0; |
310 | while (prefix_len--) { |
311 | NFAVertex v = add_vertex(*lhs); |
312 | (*lhs)[v].char_reach = mask[m_idx++]; |
313 | add_edge(pred, v, *lhs); |
314 | pred = v; |
315 | } |
316 | add_edge(pred, lhs->accept, *lhs); |
317 | (*lhs)[pred].reports.insert(0); |
318 | |
319 | return lhs; |
320 | } |
321 | |
322 | static |
323 | void buildLiteralMask(const vector<CharReach> &mask, vector<u8> &msk, |
324 | vector<u8> &cmp, u32 delay) { |
325 | msk.clear(); |
326 | cmp.clear(); |
327 | if (mask.size() <= delay) { |
328 | return; |
329 | } |
330 | |
331 | // Construct an and/cmp mask from our mask ending at delay positions before |
332 | // the end of the literal, with max length HWLM_MASKLEN. |
333 | |
334 | auto ite = mask.end() - delay; |
335 | auto it = ite - min(size_t{HWLM_MASKLEN}, mask.size() - delay); |
336 | |
337 | for (; it != ite; ++it) { |
338 | msk.push_back(0); |
339 | cmp.push_back(0); |
340 | make_and_cmp_mask(*it, &msk.back(), &cmp.back()); |
341 | } |
342 | |
343 | assert(msk.size() == cmp.size()); |
344 | assert(msk.size() <= HWLM_MASKLEN); |
345 | } |
346 | |
347 | static |
348 | bool validateTransientMask(const vector<CharReach> &mask, bool anchored, |
349 | bool eod, const Grey &grey) { |
350 | assert(!mask.empty()); |
351 | |
352 | // An EOD anchored mask requires that everything fit into history, while an |
353 | // ordinary floating case can handle one byte more (i.e., max history size |
354 | // and one byte in the buffer). |
355 | const size_t max_width = grey.maxHistoryAvailable + (eod ? 0 : 1); |
356 | if (mask.size() > max_width) { |
357 | DEBUG_PRINTF("mask too long for max available history\n" ); |
358 | return false; |
359 | } |
360 | |
361 | /* although anchored masks cannot be transient, short masks may be placed |
362 | * into the atable. */ |
363 | if (anchored && mask.size() > grey.maxAnchoredRegion) { |
364 | return false; |
365 | } |
366 | |
367 | vector<ue2_literal> lits; |
368 | u32 lit_minBound; /* minBound of each literal in lit */ |
369 | u32 lit_length; /* length of each literal in lit */ |
370 | if (!findMaskLiterals(mask, &lits, &lit_minBound, &lit_length)) { |
371 | DEBUG_PRINTF("failed to find any lits\n" ); |
372 | return false; |
373 | } |
374 | |
375 | if (lits.empty()) { |
376 | return false; |
377 | } |
378 | |
379 | const u32 delay = mask.size() - lit_length - lit_minBound; |
380 | if (delay > MAX_DELAY) { |
381 | DEBUG_PRINTF("delay %u is too much\n" , delay); |
382 | return false; |
383 | } |
384 | |
385 | if (lit_length == 1 && lits.size() > 3) { |
386 | DEBUG_PRINTF("no decent trigger\n" ); |
387 | return false; |
388 | } |
389 | |
390 | // Mixed-sensitivity literals require benefits masks to implement, and thus |
391 | // have a maximum length. This has been taken into account in |
392 | // findMaskLiterals. |
393 | assert(lit_length <= MAX_MASK2_WIDTH || |
394 | none_of(begin(lits), end(lits), mixed_sensitivity)); |
395 | |
396 | // Build the HWLM literal mask. |
397 | vector<u8> msk, cmp; |
398 | if (grey.roseHamsterMasks) { |
399 | buildLiteralMask(mask, msk, cmp, delay); |
400 | } |
401 | |
402 | // We consider the HWLM mask length to run from the first non-zero byte to |
403 | // the end, and let max(mask length, literal length) be the effective |
404 | // literal length. |
405 | // |
406 | // A one-byte literal with no mask is too short, but a one-byte literal |
407 | // with a few bytes of mask information is OK. |
408 | |
409 | u32 msk_length = distance(find_if(begin(msk), end(msk), |
410 | [](u8 v) { return v != 0; }), end(msk)); |
411 | u32 eff_lit_length = max(lit_length, msk_length); |
412 | DEBUG_PRINTF("msk_length=%u, eff_lit_length = %u\n" , msk_length, |
413 | eff_lit_length); |
414 | |
415 | if (eff_lit_length < MIN_MASK_LIT_LEN) { |
416 | DEBUG_PRINTF("literals too short\n" ); |
417 | return false; |
418 | } |
419 | |
420 | DEBUG_PRINTF("mask is ok\n" ); |
421 | return true; |
422 | } |
423 | |
424 | static |
425 | bool maskIsNeeded(const ue2_literal &lit, const NGHolder &g) { |
426 | flat_set<NFAVertex> curr = {g.accept}; |
427 | flat_set<NFAVertex> next; |
428 | |
429 | for (auto it = lit.rbegin(), ite = lit.rend(); it != ite; ++it) { |
430 | const CharReach &cr = *it; |
431 | DEBUG_PRINTF("check %s\n" , describeClass(*it).c_str()); |
432 | next.clear(); |
433 | for (auto v : curr) { |
434 | for (auto u : inv_adjacent_vertices_range(v, g)) { |
435 | if (isSubsetOf(cr, g[u].char_reach)) { |
436 | next.insert(u); |
437 | } |
438 | } |
439 | } |
440 | if (next.empty()) { |
441 | DEBUG_PRINTF("no path to start\n" ); |
442 | return true; |
443 | } |
444 | curr.swap(next); |
445 | } |
446 | |
447 | for (auto v : curr) { |
448 | for (auto u : inv_adjacent_vertices_range(v, g)) { |
449 | if (u == g.start || u == g.startDs) { |
450 | DEBUG_PRINTF("literal spans graph from start to accept\n" ); |
451 | return false; |
452 | |
453 | } |
454 | } |
455 | } |
456 | |
457 | DEBUG_PRINTF("literal doesn't reach start\n" ); |
458 | return true; |
459 | } |
460 | |
461 | static |
462 | void addTransientMask(RoseBuildImpl &build, const vector<CharReach> &mask, |
463 | const flat_set<ReportID> &reports, bool anchored, |
464 | bool eod) { |
465 | vector<ue2_literal> lits; |
466 | u32 lit_minBound; /* minBound of each literal in lit */ |
467 | u32 lit_length; /* length of each literal in lit */ |
468 | if (!findMaskLiterals(mask, &lits, &lit_minBound, &lit_length)) { |
469 | DEBUG_PRINTF("failed to find any lits\n" ); |
470 | assert(0); |
471 | return; |
472 | } |
473 | |
474 | DEBUG_PRINTF("%zu literals, minBound=%u, length=%u\n" , lits.size(), |
475 | lit_minBound, lit_length); |
476 | |
477 | if (lits.empty()) { |
478 | assert(0); |
479 | return; |
480 | } |
481 | |
482 | u32 delay = mask.size() - lit_length - lit_minBound; |
483 | assert(delay <= MAX_DELAY); |
484 | DEBUG_PRINTF("delay=%u\n" , delay); |
485 | |
486 | shared_ptr<NGHolder> mask_graph = buildMaskLhs(anchored, mask.size(), mask); |
487 | |
488 | u32 mask_lag = 0; /* TODO */ |
489 | |
490 | // Everyone gets the same report ID. |
491 | ReportID mask_report = build.getNewNfaReport(); |
492 | set_report(*mask_graph, mask_report); |
493 | |
494 | // Build the HWLM literal mask. |
495 | vector<u8> msk, cmp; |
496 | if (build.cc.grey.roseHamsterMasks) { |
497 | buildLiteralMask(mask, msk, cmp, delay); |
498 | } |
499 | |
500 | /* adjust bounds to be relative to trigger rather than mask */ |
501 | const u32 v_min_offset = add_rose_depth(0, mask.size()); |
502 | const u32 v_max_offset = |
503 | add_rose_depth(anchored ? 0 : ROSE_BOUND_INF, mask.size()); |
504 | |
505 | RoseGraph &g = build.g; |
506 | |
507 | // By default, masked literals go into the floating table (except for eod |
508 | // cases). |
509 | enum rose_literal_table table = ROSE_FLOATING; |
510 | |
511 | RoseVertex eod_v = RoseGraph::null_vertex(); |
512 | if (eod) { |
513 | eod_v = add_vertex(g); |
514 | g[eod_v].eod_accept = true; |
515 | insert(&g[eod_v].reports, reports); |
516 | g[eod_v].min_offset = v_min_offset; |
517 | g[eod_v].max_offset = v_max_offset; |
518 | |
519 | // Note: because this is a transient mask, we know that we can match it |
520 | // completely inside the history buffer. So, using the EOD literal |
521 | // table is always safe. |
522 | table = ROSE_EOD_ANCHORED; |
523 | |
524 | // Widen the EOD table window to cover the mask. |
525 | ENSURE_AT_LEAST(&build.ematcher_region_size, mask.size()); |
526 | } |
527 | |
528 | const flat_set<ReportID> no_reports; |
529 | |
530 | for (const auto &lit : lits) { |
531 | u32 lit_id = build.getLiteralId(lit, msk, cmp, delay, table); |
532 | const RoseVertex parent = anchored ? build.anchored_root : build.root; |
533 | bool use_mask = delay || maskIsNeeded(lit, *mask_graph); |
534 | |
535 | auto v = createVertex(&build, parent, 0, ROSE_BOUND_INF, lit_id, |
536 | lit.length(), eod ? no_reports : reports); |
537 | |
538 | if (use_mask) { |
539 | g[v].left.graph = mask_graph; |
540 | g[v].left.lag = mask_lag; |
541 | g[v].left.leftfix_report = mask_report; |
542 | } else { |
543 | // Make sure our edge bounds are correct. |
544 | RoseEdge e = edge(parent, v, g); |
545 | g[e].minBound = 0; |
546 | g[e].maxBound = anchored ? 0 : ROSE_BOUND_INF; |
547 | g[e].history = anchored ? ROSE_ROLE_HISTORY_ANCH |
548 | : ROSE_ROLE_HISTORY_NONE; |
549 | } |
550 | |
551 | // Set offsets correctly. |
552 | g[v].min_offset = v_min_offset; |
553 | g[v].max_offset = v_max_offset; |
554 | |
555 | if (eod) { |
556 | RoseEdge e = add_edge(v, eod_v, g); |
557 | g[e].minBound = 0; |
558 | g[e].maxBound = 0; |
559 | g[e].history = ROSE_ROLE_HISTORY_LAST_BYTE; |
560 | } |
561 | } |
562 | } |
563 | |
564 | static |
565 | unique_ptr<NGHolder> buildMaskRhs(const flat_set<ReportID> &reports, |
566 | const vector<CharReach> &mask, |
567 | u32 suffix_len) { |
568 | assert(suffix_len); |
569 | assert(mask.size() > suffix_len); |
570 | |
571 | unique_ptr<NGHolder> rhs = ue2::make_unique<NGHolder>(NFA_SUFFIX); |
572 | NGHolder &h = *rhs; |
573 | |
574 | NFAVertex succ = h.accept; |
575 | u32 m_idx = mask.size() - 1; |
576 | while (suffix_len--) { |
577 | NFAVertex u = add_vertex(h); |
578 | if (succ == h.accept) { |
579 | h[u].reports.insert(reports.begin(), reports.end()); |
580 | } |
581 | h[u].char_reach = mask[m_idx--]; |
582 | add_edge(u, succ, h); |
583 | succ = u; |
584 | } |
585 | |
586 | NFAEdge e = add_edge(h.start, succ, h); |
587 | h[e].tops.insert(DEFAULT_TOP); |
588 | |
589 | return rhs; |
590 | } |
591 | |
592 | static |
593 | void doAddMask(RoseBuildImpl &tbi, bool anchored, const vector<CharReach> &mask, |
594 | const ue2_literal &lit, u32 prefix_len, u32 suffix_len, |
595 | const flat_set<ReportID> &reports) { |
596 | /* Note: bounds are relative to literal start */ |
597 | RoseInGraph ig; |
598 | RoseInVertex s = add_vertex(RoseInVertexProps::makeStart(anchored), ig); |
599 | RoseInVertex v = add_vertex(RoseInVertexProps::makeLiteral(lit), ig); |
600 | |
601 | DEBUG_PRINTF("pref + lit = %u\n" , prefix_len); |
602 | assert(prefix_len >= lit.length()); |
603 | |
604 | // prefix len is relative to end of literal. |
605 | u32 minBound = prefix_len - lit.length(); |
606 | |
607 | if (minBound) { |
608 | if (anchored && prefix_len > tbi.cc.grey.maxAnchoredRegion) { |
609 | DEBUG_PRINTF("too deep\n" ); |
610 | /* see if there is an anchored literal we can also hang off */ |
611 | |
612 | ue2_literal lit2; |
613 | u32 lit2_offset; |
614 | vector<CharReach> mask2 = mask; |
615 | assert(mask2.size() > tbi.cc.grey.maxAnchoredRegion); |
616 | mask2.resize(MIN(tbi.cc.grey.maxAnchoredRegion, minBound)); |
617 | |
618 | findMaskLiteral(mask2, tbi.cc.streaming, &lit2, &lit2_offset, |
619 | tbi.cc.grey); |
620 | |
621 | if (lit2.length() >= MIN_MASK_LIT_LEN) { |
622 | u32 prefix2_len = lit2_offset + lit2.length(); |
623 | assert(prefix2_len < minBound); |
624 | RoseInVertex u |
625 | = add_vertex(RoseInVertexProps::makeLiteral(lit2), ig); |
626 | if (lit2_offset){ |
627 | DEBUG_PRINTF("building lhs (off %u)\n" , lit2_offset); |
628 | shared_ptr<NGHolder> lhs2 |
629 | = buildMaskLhs(true, lit2_offset, mask); |
630 | add_edge(s, u, RoseInEdgeProps(lhs2, lit2.length()), ig); |
631 | } else { |
632 | add_edge(s, u, RoseInEdgeProps(0, 0), ig); |
633 | } |
634 | |
635 | /* midfix */ |
636 | DEBUG_PRINTF("building mhs\n" ); |
637 | vector<CharReach> mask3(mask.begin() + prefix2_len, mask.end()); |
638 | u32 overlap = maxOverlap(lit2, lit, 0); |
639 | u32 delay = lit.length() - overlap; |
640 | shared_ptr<NGHolder> mhs |
641 | = buildMaskLhs(true, minBound - prefix2_len + overlap, |
642 | mask3); |
643 | mhs->kind = NFA_INFIX; |
644 | setTops(*mhs); |
645 | add_edge(u, v, RoseInEdgeProps(mhs, delay), ig); |
646 | |
647 | DEBUG_PRINTF("add anch literal too!\n" ); |
648 | goto do_rhs; |
649 | } |
650 | } |
651 | |
652 | shared_ptr<NGHolder> lhs = buildMaskLhs(anchored, minBound, mask); |
653 | add_edge(s, v, RoseInEdgeProps(lhs, lit.length()), ig); |
654 | } else { |
655 | u32 maxBound = anchored ? minBound : ROSE_BOUND_INF; |
656 | add_edge(s, v, RoseInEdgeProps(minBound, maxBound), ig); |
657 | } |
658 | |
659 | do_rhs: |
660 | if (suffix_len) { |
661 | shared_ptr<NGHolder> rhs = buildMaskRhs(reports, mask, suffix_len); |
662 | RoseInVertex a = |
663 | add_vertex(RoseInVertexProps::makeAccept(set<ReportID>()), ig); |
664 | add_edge(v, a, RoseInEdgeProps(rhs, 0), ig); |
665 | } else { |
666 | /* Note: masks have no eod connections */ |
667 | RoseInVertex a |
668 | = add_vertex(RoseInVertexProps::makeAccept(reports), ig); |
669 | add_edge(v, a, RoseInEdgeProps(0U, 0U), ig); |
670 | } |
671 | |
672 | calcVertexOffsets(ig); |
673 | |
674 | bool rv = tbi.addRose(ig, false); |
675 | |
676 | assert(rv); /* checkAllowMask should have prevented this */ |
677 | if (!rv) { |
678 | throw std::exception(); |
679 | } |
680 | } |
681 | |
682 | static |
683 | bool checkAllowMask(const vector<CharReach> &mask, ue2_literal *lit, |
684 | u32 *prefix_len, u32 *suffix_len, |
685 | const CompileContext &cc) { |
686 | assert(!mask.empty()); |
687 | u32 lit_offset; |
688 | findMaskLiteral(mask, cc.streaming, lit, &lit_offset, cc.grey); |
689 | |
690 | if (lit->length() < MIN_MASK_LIT_LEN && lit->length() != mask.size()) { |
691 | DEBUG_PRINTF("need more literal - bad mask\n" ); |
692 | return false; |
693 | } |
694 | |
695 | DEBUG_PRINTF("mask lit '%s', len=%zu at offset=%u\n" , |
696 | dumpString(*lit).c_str(), lit->length(), lit_offset); |
697 | |
698 | assert(!cc.streaming || lit->length() <= cc.grey.maxHistoryAvailable + 1); |
699 | |
700 | /* literal is included in the prefix nfa so that matches from the prefix |
701 | * can't occur in the history buffer - probably should tweak the NFA API |
702 | * to allow such matches not to be suppressed */ |
703 | *prefix_len = lit_offset + lit->length(); |
704 | *suffix_len = mask.size() - *prefix_len; |
705 | DEBUG_PRINTF("prefix_len=%u, suffix_len=%u\n" , *prefix_len, *suffix_len); |
706 | |
707 | /* check if we can backtrack sufficiently */ |
708 | if (cc.streaming && *prefix_len > cc.grey.maxHistoryAvailable + 1) { |
709 | DEBUG_PRINTF("too much lag\n" ); |
710 | return false; |
711 | } |
712 | |
713 | if (*suffix_len > MAX_MASK_SIZE || *prefix_len > MAX_MASK_SIZE) { |
714 | DEBUG_PRINTF("too big\n" ); |
715 | return false; |
716 | } |
717 | |
718 | return true; |
719 | } |
720 | |
721 | bool RoseBuildImpl::add(bool anchored, const vector<CharReach> &mask, |
722 | const flat_set<ReportID> &reports) { |
723 | if (validateTransientMask(mask, anchored, false, cc.grey)) { |
724 | bool eod = false; |
725 | addTransientMask(*this, mask, reports, anchored, eod); |
726 | return true; |
727 | } |
728 | |
729 | ue2_literal lit; |
730 | u32 prefix_len = 0; |
731 | u32 suffix_len = 0; |
732 | |
733 | if (!checkAllowMask(mask, &lit, &prefix_len, &suffix_len, cc)) { |
734 | return false; |
735 | } |
736 | |
737 | /* we know that the mask can be handled now, start playing with the rose |
738 | * graph */ |
739 | doAddMask(*this, anchored, mask, lit, prefix_len, suffix_len, reports); |
740 | |
741 | return true; |
742 | } |
743 | |
744 | bool RoseBuildImpl::validateMask(const vector<CharReach> &mask, |
745 | UNUSED const flat_set<ReportID> &reports, |
746 | bool anchored, bool eod) const { |
747 | return validateTransientMask(mask, anchored, eod, cc.grey); |
748 | } |
749 | |
750 | static |
751 | unique_ptr<NGHolder> makeAnchoredGraph(const vector<CharReach> &mask, |
752 | const flat_set<ReportID> &reports, |
753 | bool eod) { |
754 | auto gp = ue2::make_unique<NGHolder>(); |
755 | NGHolder &g = *gp; |
756 | |
757 | NFAVertex u = g.start; |
758 | for (const auto &cr : mask) { |
759 | NFAVertex v = add_vertex(g); |
760 | g[v].char_reach = cr; |
761 | add_edge(u, v, g); |
762 | u = v; |
763 | } |
764 | |
765 | |
766 | g[u].reports = reports; |
767 | add_edge(u, eod ? g.acceptEod : g.accept, g); |
768 | |
769 | return gp; |
770 | } |
771 | |
772 | static |
773 | bool addAnchoredMask(RoseBuildImpl &build, const vector<CharReach> &mask, |
774 | const flat_set<ReportID> &reports, bool eod) { |
775 | if (!build.cc.grey.allowAnchoredAcyclic) { |
776 | return false; |
777 | } |
778 | |
779 | auto g = makeAnchoredGraph(mask, reports, eod); |
780 | assert(g); |
781 | |
782 | return build.addAnchoredAcyclic(*g); |
783 | } |
784 | |
785 | void RoseBuildImpl::addMask(const vector<CharReach> &mask, |
786 | const flat_set<ReportID> &reports, bool anchored, |
787 | bool eod) { |
788 | if (anchored && addAnchoredMask(*this, mask, reports, eod)) { |
789 | DEBUG_PRINTF("added mask as anchored acyclic graph\n" ); |
790 | return; |
791 | } |
792 | |
793 | addTransientMask(*this, mask, reports, anchored, eod); |
794 | } |
795 | |
796 | } // namespace ue2 |
797 | |