1/*
2 * Copyright (c) 2015-2018, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** \file
30 * \brief NG and graph handling.
31 */
32#include "ng.h"
33
34#include "grey.h"
35#include "ng_anchored_acyclic.h"
36#include "ng_anchored_dots.h"
37#include "ng_asserts.h"
38#include "ng_calc_components.h"
39#include "ng_cyclic_redundancy.h"
40#include "ng_dump.h"
41#include "ng_edge_redundancy.h"
42#include "ng_equivalence.h"
43#include "ng_extparam.h"
44#include "ng_fixed_width.h"
45#include "ng_fuzzy.h"
46#include "ng_haig.h"
47#include "ng_literal_component.h"
48#include "ng_literal_decorated.h"
49#include "ng_misc_opt.h"
50#include "ng_puff.h"
51#include "ng_prefilter.h"
52#include "ng_prune.h"
53#include "ng_redundancy.h"
54#include "ng_region.h"
55#include "ng_region_redundancy.h"
56#include "ng_reports.h"
57#include "ng_sep.h"
58#include "ng_small_literal_set.h"
59#include "ng_som.h"
60#include "ng_vacuous.h"
61#include "ng_violet.h"
62#include "ng_utf8.h"
63#include "ng_util.h"
64#include "ng_width.h"
65#include "ue2common.h"
66#include "compiler/compiler.h"
67#include "nfa/goughcompile.h"
68#include "rose/rose_build.h"
69#include "smallwrite/smallwrite_build.h"
70#include "util/compile_error.h"
71#include "util/container.h"
72#include "util/depth.h"
73#include "util/graph_range.h"
74#include "util/make_unique.h"
75#include "util/ue2string.h"
76
77using namespace std;
78
79namespace ue2 {
80
81NG::NG(const CompileContext &in_cc, size_t num_patterns,
82 unsigned in_somPrecision)
83 : maxSomRevHistoryAvailable(in_cc.grey.somMaxRevNfaLength),
84 minWidth(depth::infinity()),
85 rm(in_cc.grey),
86 ssm(in_somPrecision),
87 cc(in_cc),
88 smwr(makeSmallWriteBuilder(num_patterns, rm, cc)),
89 rose(makeRoseBuilder(rm, ssm, *smwr, cc, boundary)) {
90}
91
92NG::~NG() {
93 // empty
94}
95
96/** \brief SOM handling code, called by \ref addComponent.
97 *
98 * \return true if the component was handled completely by something (e.g. a
99 * Haig outfix), false if SOM could be established but implementation via an
100 * engine will be required.
101 *
102 * \throw CompileError if SOM cannot be supported for the component.
103 */
104static
105bool addComponentSom(NG &ng, NGHolder &g, const ExpressionInfo &expr,
106 const som_type som, const u32 comp_id) {
107 DEBUG_PRINTF("doing som\n");
108 dumpComponent(g, "03_presom", expr.index, comp_id, ng.cc.grey);
109 assert(hasCorrectlyNumberedVertices(g));
110 assert(allMatchStatesHaveReports(g));
111
112 // First, we try the "SOM chain" support in ng_som.cpp.
113
114 sombe_rv rv = doSom(ng, g, expr, comp_id, som);
115 if (rv == SOMBE_HANDLED_INTERNAL) {
116 return false;
117 } else if (rv == SOMBE_HANDLED_ALL) {
118 return true;
119 }
120 assert(rv == SOMBE_FAIL);
121
122 /* Next, Sombe style approaches */
123 rv = doSomWithHaig(ng, g, expr, comp_id, som);
124 if (rv == SOMBE_HANDLED_INTERNAL) {
125 return false;
126 } else if (rv == SOMBE_HANDLED_ALL) {
127 return true;
128 }
129 assert(rv == SOMBE_FAIL);
130
131 // If the previous approach could not support this pattern, we try treating
132 // it monolithically, as a Haig outfix.
133
134 vector<vector<CharReach> > triggers; /* empty for outfix */
135
136 assert(g.kind == NFA_OUTFIX);
137 dumpComponent(g, "haig", expr.index, comp_id, ng.cc.grey);
138 makeReportsSomPass(ng.rm, g);
139 auto haig = attemptToBuildHaig(g, som, ng.ssm.somPrecision(), triggers,
140 ng.cc.grey);
141 if (haig) {
142 DEBUG_PRINTF("built haig outfix\n");
143 ng.rose->addOutfix(g, *haig);
144 return true;
145 }
146
147 /* Our various strategies for supporting SOM for this pattern have failed.
148 * Provide a generic pattern not supported/too large return value as it is
149 * unclear what the meaning of a specific SOM error would be */
150 throw CompileError(expr.index, "Pattern is too large.");
151
152 assert(0); // unreachable
153 return false;
154}
155
156void reduceGraph(NGHolder &g, som_type som, bool utf8,
157 const CompileContext &cc) {
158 if (!cc.grey.performGraphSimplification) {
159 return;
160 }
161
162 // We run reduction passes until either the graph stops changing or we hit
163 // a (small) limit.
164
165 if (!som) {
166 mergeCyclicDotStars(g);
167 }
168
169 const unsigned MAX_PASSES = 3;
170 for (unsigned pass = 1; pass <= MAX_PASSES; pass++) {
171 bool changed = false;
172 DEBUG_PRINTF("reduce pass %u/%u\n", pass, MAX_PASSES);
173 changed |= removeEdgeRedundancy(g, som, cc);
174 changed |= reduceGraphEquivalences(g, cc);
175 changed |= removeRedundancy(g, som);
176 changed |= removeCyclicPathRedundancy(g);
177 if (!changed) {
178 DEBUG_PRINTF("graph unchanged after pass %u, stopping\n", pass);
179 break;
180 }
181 }
182
183 if (utf8) {
184 utf8DotRestoration(g, som);
185 }
186
187 /* Minor non-redundancy improvements */
188 if (improveGraph(g, som)) {
189 /* may be some more edges to remove */
190 removeEdgeRedundancy(g, som, cc);
191 }
192
193 removeCyclicDominated(g, som);
194
195 if (!som) {
196 mergeCyclicDotStars(g);
197 }
198
199 if (!som) {
200 removeSiblingsOfStartDotStar(g);
201 }
202}
203
204static
205bool addComponent(NG &ng, NGHolder &g, const ExpressionInfo &expr,
206 const som_type som, const u32 comp_id) {
207 const CompileContext &cc = ng.cc;
208 assert(hasCorrectlyNumberedVertices(g));
209
210 DEBUG_PRINTF("expr=%u, comp=%u: %zu vertices, %zu edges\n",
211 expr.index, comp_id, num_vertices(g), num_edges(g));
212
213 dumpComponent(g, "01_begin", expr.index, comp_id, ng.cc.grey);
214
215 assert(allMatchStatesHaveReports(g));
216
217 reduceExtendedParams(g, ng.rm, som);
218 reduceGraph(g, som, expr.utf8, cc);
219
220 dumpComponent(g, "02_reduced", expr.index, comp_id, ng.cc.grey);
221
222 // There may be redundant regions that we can remove
223 if (cc.grey.performGraphSimplification) {
224 removeRegionRedundancy(g, som);
225 }
226
227 // We might be done at this point: if we've run out of vertices, we can
228 // stop processing.
229 if (num_vertices(g) == N_SPECIALS) {
230 DEBUG_PRINTF("all vertices claimed\n");
231 return true;
232 }
233
234 // "Short Exhaustible Passthrough" patterns always become outfixes.
235 if (!som && isSEP(g, ng.rm, cc.grey)) {
236 DEBUG_PRINTF("graph is SEP\n");
237 if (ng.rose->addOutfix(g)) {
238 return true;
239 }
240 }
241
242 // Start Of Match handling.
243 if (som) {
244 if (addComponentSom(ng, g, expr, som, comp_id)) {
245 return true;
246 }
247 }
248
249 assert(allMatchStatesHaveReports(g));
250
251 if (splitOffAnchoredAcyclic(*ng.rose, g, cc)) {
252 return true;
253 }
254
255 if (handleSmallLiteralSets(*ng.rose, g, cc)
256 || handleFixedWidth(*ng.rose, g, cc.grey)) {
257 return true;
258 }
259
260 if (handleDecoratedLiterals(*ng.rose, g, cc)) {
261 return true;
262 }
263
264 if (doViolet(*ng.rose, g, expr.prefilter, false, ng.rm, cc)) {
265 return true;
266 }
267
268 if (splitOffPuffs(*ng.rose, ng.rm, g, expr.prefilter, cc)) {
269 return true;
270 }
271
272 if (handleSmallLiteralSets(*ng.rose, g, cc)
273 || handleFixedWidth(*ng.rose, g, cc.grey)) {
274 return true;
275 }
276
277 if (handleDecoratedLiterals(*ng.rose, g, cc)) {
278 return true;
279 }
280
281 if (doViolet(*ng.rose, g, expr.prefilter, true, ng.rm, cc)) {
282 return true;
283 }
284
285 DEBUG_PRINTF("testing for outfix\n");
286 assert(allMatchStatesHaveReports(g));
287 if (ng.rose->addOutfix(g)) {
288 return true;
289 }
290
291 return false;
292}
293
294// Returns true if all components have been added.
295static
296bool processComponents(NG &ng, ExpressionInfo &expr,
297 deque<unique_ptr<NGHolder>> &g_comp,
298 const som_type som) {
299 const u32 num_components = g_comp.size();
300
301 u32 failed = 0;
302 for (u32 i = 0; i < num_components; i++) {
303 if (!g_comp[i]) {
304 continue;
305 }
306 if (addComponent(ng, *g_comp[i], expr, som, i)) {
307 g_comp[i].reset();
308 continue;
309 }
310
311 if (som) { /* bail immediately */
312 return false;
313 }
314 failed++;
315 }
316
317 if (!failed) {
318 DEBUG_PRINTF("all components claimed\n");
319 return true;
320 }
321
322 DEBUG_PRINTF("%u components still remain\n", failed);
323 return false;
324}
325
326bool NG::addGraph(ExpressionInfo &expr, unique_ptr<NGHolder> g_ptr) {
327 assert(g_ptr);
328 NGHolder &g = *g_ptr;
329
330 // remove reports that aren't on vertices connected to accept.
331 clearReports(g);
332
333 som_type som = expr.som;
334 if (som && isVacuous(g)) {
335 throw CompileError(expr.index, "Start of match is not "
336 "currently supported for patterns which match an "
337 "empty buffer.");
338 }
339
340 dumpDotWrapper(g, expr, "01_initial", cc.grey);
341 assert(allMatchStatesHaveReports(g));
342
343 /* ensure utf8 starts at cp boundary */
344 ensureCodePointStart(rm, g, expr);
345
346 if (can_never_match(g)) {
347 throw CompileError(expr.index, "Pattern can never match.");
348 }
349
350 bool hamming = expr.hamm_distance > 0;
351 u32 e_dist = hamming ? expr.hamm_distance : expr.edit_distance;
352
353 DEBUG_PRINTF("edit distance = %u hamming = %s\n", e_dist, hamming ? "true" : "false");
354
355 // validate graph's suitability for fuzzing before resolving asserts
356 validate_fuzzy_compile(g, e_dist, hamming, expr.utf8, cc.grey);
357
358 resolveAsserts(rm, g, expr);
359 dumpDotWrapper(g, expr, "02_post_assert_resolve", cc.grey);
360 assert(allMatchStatesHaveReports(g));
361
362 make_fuzzy(g, e_dist, hamming, cc.grey);
363 dumpDotWrapper(g, expr, "02a_post_fuzz", cc.grey);
364
365 pruneUseless(g);
366 pruneEmptyVertices(g);
367
368 if (can_never_match(g)) {
369 throw CompileError(expr.index, "Pattern can never match.");
370 }
371
372 optimiseVirtualStarts(g); /* good for som */
373
374 propagateExtendedParams(g, expr, rm);
375 reduceExtendedParams(g, rm, som);
376
377 // We may have removed all the edges to accept, in which case this
378 // expression cannot match.
379 if (can_never_match(g)) {
380 throw CompileError(expr.index, "Extended parameter constraints can not "
381 "be satisfied for any match from this "
382 "expression.");
383 }
384
385 if (any_of_in(all_reports(g), [&](ReportID id) {
386 return rm.getReport(id).minLength;
387 })) {
388 // We have at least one report with a minimum length constraint, which
389 // we currently use SOM to satisfy.
390 som = SOM_LEFT;
391 ssm.somPrecision(8);
392 }
393
394 if (som) {
395 rose->setSom();
396 }
397
398 // first, we can perform graph work that can be done on an individual
399 // expression basis.
400
401 if (expr.utf8) {
402 relaxForbiddenUtf8(g, expr);
403 }
404
405 if (all_of_in(all_reports(g), [&](ReportID id) {
406 const auto &report = rm.getReport(id);
407 return report.ekey != INVALID_EKEY && !report.minLength &&
408 !report.minOffset;
409 })) {
410 // In highlander mode: if we don't have constraints on our reports that
411 // may prevent us accepting our first match (i.e. extended params) we
412 // can prune the other out-edges of all vertices connected to accept.
413 // TODO: shift the report checking down into pruneHighlanderAccepts()
414 // to allow us to handle the parts we can in mixed cases.
415 pruneHighlanderAccepts(g, rm);
416 }
417
418 dumpDotWrapper(g, expr, "02b_fairly_early", cc.grey);
419
420 // If we're a vacuous pattern, we can handle this early.
421 if (splitOffVacuous(boundary, rm, g, expr)) {
422 DEBUG_PRINTF("split off vacuous\n");
423 }
424
425 // We might be done at this point: if we've run out of vertices, we can
426 // stop processing.
427 if (num_vertices(g) == N_SPECIALS) {
428 DEBUG_PRINTF("all vertices claimed by vacuous handling\n");
429 return true;
430 }
431
432 // Now that vacuous edges have been removed, update the min width exclusive
433 // of boundary reports.
434 minWidth = min(minWidth, findMinWidth(g));
435
436 // Add the pattern to the small write builder.
437 smwr->add(g, expr);
438
439 if (!som) {
440 removeSiblingsOfStartDotStar(g);
441 }
442
443 dumpDotWrapper(g, expr, "03_early", cc.grey);
444
445 // Perform a reduction pass to merge sibling character classes together.
446 if (cc.grey.performGraphSimplification) {
447 removeRedundancy(g, som);
448 prunePathsRedundantWithSuccessorOfCyclics(g, som);
449 }
450
451 dumpDotWrapper(g, expr, "04_reduced", cc.grey);
452
453 // If we've got some literals that span the graph from start to accept, we
454 // can split them off into Rose from here.
455 if (!som) {
456 if (splitOffLiterals(*this, g)) {
457 DEBUG_PRINTF("some vertices claimed by literals\n");
458 }
459 }
460
461 // We might be done at this point: if we've run out of vertices, we can
462 // stop processing.
463 if (num_vertices(g) == N_SPECIALS) {
464 DEBUG_PRINTF("all vertices claimed before calc components\n");
465 return true;
466 }
467
468 // Split the graph into a set of connected components and process those.
469 // Note: this invalidates g_ptr.
470
471 auto g_comp = calcComponents(std::move(g_ptr), cc.grey);
472 assert(!g_comp.empty());
473
474 if (!som) {
475 for (auto &gc : g_comp) {
476 assert(gc);
477 reformLeadingDots(*gc);
478 }
479
480 recalcComponents(g_comp, cc.grey);
481 }
482
483 if (processComponents(*this, expr, g_comp, som)) {
484 return true;
485 }
486
487 // If we're in prefiltering mode, we can run the prefilter reductions and
488 // have another shot at accepting the graph.
489
490 if (cc.grey.prefilterReductions && expr.prefilter) {
491 for (auto &gc : g_comp) {
492 if (!gc) {
493 continue;
494 }
495 prefilterReductions(*gc, cc);
496 }
497
498 if (processComponents(*this, expr, g_comp, som)) {
499 return true;
500 }
501 }
502
503 // We must have components that could not be compiled.
504 for (u32 i = 0; i < g_comp.size(); i++) {
505 if (g_comp[i]) {
506 DEBUG_PRINTF("could not compile component %u with %zu vertices\n",
507 i, num_vertices(*g_comp[i]));
508 throw CompileError(expr.index, "Pattern is too large.");
509 }
510 }
511
512 assert(0); // should have thrown.
513 return false;
514}
515
516/** \brief Used from SOM mode to add an arbitrary NGHolder as an engine. */
517bool NG::addHolder(NGHolder &g) {
518 DEBUG_PRINTF("adding holder of %zu states\n", num_vertices(g));
519 assert(allMatchStatesHaveReports(g));
520 assert(hasCorrectlyNumberedVertices(g));
521
522 /* We don't update the global minWidth here as we care about the min width
523 * of the whole pattern - not a just a prefix of it. */
524
525 bool prefilter = false;
526 //dumpDotComp(comp, g, *this, 20, "prefix_init");
527
528 som_type som = SOM_NONE; /* the prefixes created by the SOM code do not
529 themselves track som */
530 bool utf8 = false; // handling done earlier
531 reduceGraph(g, som, utf8, cc);
532
533 // There may be redundant regions that we can remove
534 if (cc.grey.performGraphSimplification) {
535 removeRegionRedundancy(g, som);
536 }
537
538 // "Short Exhaustible Passthrough" patterns always become outfixes.
539 if (isSEP(g, rm, cc.grey)) {
540 DEBUG_PRINTF("graph is SEP\n");
541 if (rose->addOutfix(g)) {
542 return true;
543 }
544 }
545
546 if (splitOffAnchoredAcyclic(*rose, g, cc)) {
547 return true;
548 }
549
550 if (handleSmallLiteralSets(*rose, g, cc)
551 || handleFixedWidth(*rose, g, cc.grey)) {
552 return true;
553 }
554
555 if (handleDecoratedLiterals(*rose, g, cc)) {
556 return true;
557 }
558
559 if (doViolet(*rose, g, prefilter, false, rm, cc)) {
560 return true;
561 }
562 if (splitOffPuffs(*rose, rm, g, prefilter, cc)) {
563 return true;
564 }
565 if (doViolet(*rose, g, prefilter, true, rm, cc)) {
566 return true;
567 }
568
569 DEBUG_PRINTF("trying for outfix\n");
570 if (rose->addOutfix(g)) {
571 DEBUG_PRINTF("ok\n");
572 return true;
573 }
574 DEBUG_PRINTF("trying for outfix - failed\n");
575 DEBUG_PRINTF("nobody would take us\n");
576 return false;
577}
578
579bool NG::addLiteral(const ue2_literal &literal, u32 expr_index,
580 u32 external_report, bool highlander, som_type som,
581 bool quiet) {
582 assert(!literal.empty());
583
584 if (!cc.grey.shortcutLiterals) {
585 return false;
586 }
587
588 // We can't natively handle arbitrary literals with mixed case sensitivity
589 // in Rose -- they require mechanisms like benefits masks, which have
590 // length limits etc. Better to let those go through full graph processing.
591 if (mixed_sensitivity(literal)) {
592 DEBUG_PRINTF("mixed sensitivity\n");
593 return false;
594 }
595
596 // Register external report and validate highlander constraints.
597 rm.registerExtReport(external_report,
598 external_report_info(highlander, expr_index));
599
600 ReportID id;
601 if (som) {
602 assert(!highlander); // not allowed, checked earlier.
603 Report r = makeSomRelativeCallback(external_report, 0, literal.length());
604 id = rm.getInternalId(r);
605 rose->setSom();
606 } else {
607 u32 ekey = highlander ? rm.getExhaustibleKey(external_report)
608 : INVALID_EKEY;
609 Report r = makeECallback(external_report, 0, ekey, quiet);
610 id = rm.getInternalId(r);
611 }
612
613 DEBUG_PRINTF("success: graph is literal '%s', report ID %u\n",
614 dumpString(literal).c_str(), id);
615
616 rose->add(false, false, literal, {id});
617
618 minWidth = min(minWidth, depth(literal.length()));
619
620 /* inform small write handler about this literal */
621 smwr->add(literal, id);
622
623 return true;
624}
625
626} // namespace ue2
627