1/*
2 * Copyright (c) 2015-2017, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** \file
30 * \brief Resolve special assert vertices.
31 *
32 * The assert resolution algorithm proceeds by iterating over those edges with
33 * assertion flags, considering source and target vertices of each edge. If a
34 * vertex has a superset of the reachability demanded by the assertion on the
35 * edge, it is split into alternatives providing the word and non-word paths
36 * through that vertex.
37 *
38 * A great deal of the complexity in the resolveAsserts pass is devoted to
39 * handling these assertions when the UCP flag is specified (meaning \\w and \\W
40 * are implemented with Unicode properties, rather than their ASCII
41 * interpretation) and the prefiltering flag is also used. Complete,
42 * non-prefiltering UCP support is not available yet.
43 */
44#include "ng_asserts.h"
45
46#include "ng.h"
47#include "ng_prune.h"
48#include "ng_redundancy.h"
49#include "ng_util.h"
50#include "compiler/compiler.h"
51#include "parser/position.h" // for POS flags
52#include "util/bitutils.h" // for findAndClearLSB_32
53#include "util/boundary_reports.h"
54#include "util/container.h"
55#include "util/compile_context.h"
56#include "util/compile_error.h"
57#include "util/graph_range.h"
58#include "util/report_manager.h"
59#include "util/unicode_def.h"
60
61#include <queue>
62
63using namespace std;
64
65namespace ue2 {
66
67/** \brief Hard limit on the maximum number of vertices we'll clone before we
68 * throw up our hands and report 'Pattern too large.' */
69static const size_t MAX_CLONED_VERTICES = 2048;
70
71/** \brief The definition of \\w, since we use it everywhere in here. */
72static const CharReach CHARREACH_WORD(CharReach('a', 'z') |
73 CharReach('A', 'Z') | CharReach('0', '9') | CharReach('_'));
74
75/** \brief \\W is the inverse of \\w */
76static const CharReach CHARREACH_NONWORD(~CHARREACH_WORD);
77
78/** \brief Prefiltering definition of \\w for UCP mode.
79 *
80 * Includes all high bytes as to capture all non-ASCII, however depending on
81 * direction only continuers or starters are strictly required - as the input
82 * is well-formed, this laxness will not cost us. */
83static const CharReach CHARREACH_WORD_UCP_PRE(CHARREACH_WORD
84 | CharReach(128, 255));
85
86/** \brief Prefiltering definition of \\W for UCP Mode.
87 *
88 * (non-word already includes high bytes) */
89static const CharReach CHARREACH_NONWORD_UCP_PRE(CHARREACH_NONWORD);
90
91/** \brief Find all the edges with assertion flags. */
92static
93vector<NFAEdge> getAsserts(const NGHolder &g) {
94 vector<NFAEdge> out;
95 for (const auto &e : edges_range(g)) {
96 if (g[e].assert_flags) {
97 out.push_back(e);
98 }
99 }
100 return out;
101}
102
103static
104void addToSplit(const NGHolder &g, NFAVertex v, map<u32, NFAVertex> *to_split) {
105 DEBUG_PRINTF("%zu needs splitting\n", g[v].index);
106 to_split->emplace(g[v].index, v);
107}
108
109/** \brief Find vertices that need to be split due to an assertion edge.
110 *
111 * A vertex needs to be split if has an edge to/from it with an assert with a
112 * restriction on the relevant end. */
113static
114void findSplitters(const NGHolder &g, const vector<NFAEdge> &asserts,
115 map<u32, NFAVertex> *to_split,
116 map<u32, NFAVertex> *to_split_ucp) {
117 for (const auto &e : asserts) {
118 NFAVertex u = source(e, g);
119 NFAVertex v = target(e, g);
120 u32 flags = g[e].assert_flags;
121 assert(flags);
122
123 const CharReach &u_cr = g[u].char_reach;
124 const CharReach &v_cr = g[v].char_reach;
125
126 bool ucp_assert = flags & UCP_ASSERT_FLAGS;
127 bool normal_assert = flags & NON_UCP_ASSERT_FLAGS;
128 /* In reality, an expression can only be entirely ucp or not ucp */
129 assert(ucp_assert != normal_assert);
130
131 if (normal_assert) {
132 /* assume any flag results in us have to split if the vertex is not
133 * a subset of word or completely disjoint from it. We could be more
134 * nuanced if flags is a disjunction of multiple assertions. */
135 if (!u_cr.isSubsetOf(CHARREACH_WORD)
136 && !u_cr.isSubsetOf(CHARREACH_NONWORD)
137 && u != g.start) { /* start is always considered a nonword */
138 addToSplit(g, u, to_split);
139 }
140
141 if (!v_cr.isSubsetOf(CHARREACH_WORD)
142 && !v_cr.isSubsetOf(CHARREACH_NONWORD)
143 && v != g.accept /* accept require special handling, done on a
144 * per edge basis in resolve asserts
145 */
146 && v != g.acceptEod) { /* eod is always considered a nonword */
147 addToSplit(g, v, to_split);
148 }
149 }
150
151 if (ucp_assert) {
152 /* note: the ucp prefilter crs overlap - requires a bit more care */
153 if (u == g.start) { /* start never needs to be split,
154 * treat nonword */
155 } else if (flags & POS_FLAG_ASSERT_WORD_TO_ANY_UCP) {
156 if (!u_cr.isSubsetOf(CHARREACH_WORD_UCP_PRE)
157 && !u_cr.isSubsetOf(~CHARREACH_WORD_UCP_PRE)) {
158 addToSplit(g, u, to_split_ucp);
159 }
160 } else {
161 assert(flags & POS_FLAG_ASSERT_NONWORD_TO_ANY_UCP);
162 if (!u_cr.isSubsetOf(CHARREACH_NONWORD_UCP_PRE)
163 && !u_cr.isSubsetOf(~CHARREACH_NONWORD_UCP_PRE)) {
164 addToSplit(g, u, to_split_ucp);
165 }
166 }
167
168 if (v == g.acceptEod /* eod is always considered a nonword */
169 || v == g.accept) { /* accept require special handling, done on
170 * a per edge basis in resolve asserts */
171 } else if (flags & POS_FLAG_ASSERT_ANY_TO_WORD_UCP) {
172 if (!v_cr.isSubsetOf(CHARREACH_WORD_UCP_PRE)
173 && !v_cr.isSubsetOf(~CHARREACH_WORD_UCP_PRE)) {
174 addToSplit(g, v, to_split_ucp);
175 }
176 } else {
177 assert(flags & POS_FLAG_ASSERT_ANY_TO_NONWORD_UCP);
178 if (!v_cr.isSubsetOf(CHARREACH_NONWORD_UCP_PRE)
179 && !v_cr.isSubsetOf(~CHARREACH_NONWORD_UCP_PRE)) {
180 addToSplit(g, v, to_split_ucp);
181 }
182 }
183 }
184 }
185}
186
187static
188void setReportId(ReportManager &rm, NGHolder &g, const ExpressionInfo &expr,
189 NFAVertex v, s32 adj) {
190 // Don't try and set the report ID of a special vertex.
191 assert(!is_special(v, g));
192
193 // If there's a report set already, we're replacing it.
194 g[v].reports.clear();
195
196 Report ir = rm.getBasicInternalReport(expr, adj);
197
198 g[v].reports.insert(rm.getInternalId(ir));
199 DEBUG_PRINTF("set report id for vertex %zu, adj %d\n", g[v].index, adj);
200}
201
202static
203NFAVertex makeClone(ReportManager &rm, NGHolder &g, const ExpressionInfo &expr,
204 NFAVertex v, const CharReach &cr_mask) {
205 NFAVertex clone = clone_vertex(g, v);
206 g[clone].char_reach &= cr_mask;
207 clone_out_edges(g, v, clone);
208 clone_in_edges(g, v, clone);
209
210 if (v == g.startDs) {
211 if (expr.utf8) {
212 g[clone].char_reach &= ~UTF_START_CR;
213 }
214
215 DEBUG_PRINTF("marked as virt\n");
216 g[clone].assert_flags = POS_FLAG_VIRTUAL_START;
217
218 setReportId(rm, g, expr, clone, 0);
219 }
220
221 return clone;
222}
223
224static
225void splitVertex(ReportManager &rm, NGHolder &g, const ExpressionInfo &expr,
226 NFAVertex v, bool ucp) {
227 assert(v != g.start);
228 assert(v != g.accept);
229 assert(v != g.acceptEod);
230 DEBUG_PRINTF("partitioning vertex %zu ucp:%d\n", g[v].index, (int)ucp);
231
232 CharReach cr_word = ucp ? CHARREACH_WORD_UCP_PRE : CHARREACH_WORD;
233 CharReach cr_nonword = ucp ? CHARREACH_NONWORD_UCP_PRE : CHARREACH_NONWORD;
234
235 auto has_no_assert = [&g](const NFAEdge &e) { return !g[e].assert_flags; };
236
237 // Split v into word/nonword vertices with only asserting out-edges.
238 NFAVertex w_out = makeClone(rm, g, expr, v, cr_word);
239 NFAVertex nw_out = makeClone(rm, g, expr, v, cr_nonword);
240 remove_out_edge_if(w_out, has_no_assert, g);
241 remove_out_edge_if(nw_out, has_no_assert, g);
242
243 // Split v into word/nonword vertices with only asserting in-edges.
244 NFAVertex w_in = makeClone(rm, g, expr, v, cr_word);
245 NFAVertex nw_in = makeClone(rm, g, expr, v, cr_nonword);
246 remove_in_edge_if(w_in, has_no_assert, g);
247 remove_in_edge_if(nw_in, has_no_assert, g);
248
249 // Prune edges with asserts from original v.
250 auto has_assert = [&g](const NFAEdge &e) { return g[e].assert_flags; };
251 remove_in_edge_if(v, has_assert, g);
252 remove_out_edge_if(v, has_assert, g);
253}
254
255static
256void resolveEdges(ReportManager &rm, NGHolder &g, const ExpressionInfo &expr,
257 set<NFAEdge> *dead) {
258 for (const auto &e : edges_range(g)) {
259 u32 flags = g[e].assert_flags;
260 if (!flags) {
261 continue;
262 }
263
264 NFAVertex u = source(e, g);
265 NFAVertex v = target(e, g);
266
267 assert(u != g.startDs);
268
269 const CharReach &u_cr = g[u].char_reach;
270 const CharReach &v_cr = g[v].char_reach;
271
272 bool impassable = true;
273 bool ucp = flags & UCP_ASSERT_FLAGS;
274 DEBUG_PRINTF("resolving edge %zu->%zu (flags=0x%x, ucp=%d)\n",
275 g[u].index, g[v].index, flags, (int)ucp);
276 while (flags && impassable) {
277 u32 flag = 1U << findAndClearLSB_32(&flags);
278 switch (flag) {
279 case POS_FLAG_ASSERT_NONWORD_TO_NONWORD:
280 case POS_FLAG_ASSERT_NONWORD_TO_WORD:
281 if ((u_cr & CHARREACH_NONWORD).none() && u != g.start) {
282 continue;
283 }
284 break;
285 case POS_FLAG_ASSERT_WORD_TO_NONWORD:
286 case POS_FLAG_ASSERT_WORD_TO_WORD:
287 if ((u_cr & CHARREACH_WORD).none() || u == g.start) {
288 continue;
289 }
290 break;
291 case POS_FLAG_ASSERT_NONWORD_TO_NONWORD_UCP:
292 case POS_FLAG_ASSERT_NONWORD_TO_WORD_UCP:
293 if ((u_cr & ~CHARREACH_NONWORD_UCP_PRE).any() && u != g.start) {
294 continue;
295 }
296 break;
297 case POS_FLAG_ASSERT_WORD_TO_NONWORD_UCP:
298 case POS_FLAG_ASSERT_WORD_TO_WORD_UCP:
299 if ((u_cr & ~CHARREACH_WORD_UCP_PRE).any() || u == g.start) {
300 continue;
301 }
302 break;
303 default:
304 assert(0);
305 }
306
307 if (v == g.accept) {
308 /* accept special will need to be treated specially later */
309 impassable = false;
310 continue;
311 }
312
313 switch (flag) {
314 case POS_FLAG_ASSERT_NONWORD_TO_NONWORD:
315 case POS_FLAG_ASSERT_WORD_TO_NONWORD:
316 if ((v_cr & CHARREACH_NONWORD).none() && v != g.acceptEod) {
317 continue;
318 }
319 break;
320 case POS_FLAG_ASSERT_WORD_TO_WORD:
321 case POS_FLAG_ASSERT_NONWORD_TO_WORD:
322 if ((v_cr & CHARREACH_WORD).none() || v == g.acceptEod) {
323 continue;
324 }
325 break;
326 case POS_FLAG_ASSERT_NONWORD_TO_NONWORD_UCP:
327 case POS_FLAG_ASSERT_WORD_TO_NONWORD_UCP:
328 if ((v_cr & ~CHARREACH_NONWORD_UCP_PRE).any()
329 && v != g.acceptEod) {
330 continue;
331 }
332 break;
333 case POS_FLAG_ASSERT_WORD_TO_WORD_UCP:
334 case POS_FLAG_ASSERT_NONWORD_TO_WORD_UCP:
335 if ((v_cr & ~CHARREACH_WORD_UCP_PRE).any()
336 || v == g.acceptEod) {
337 continue;
338 }
339 break;
340 default:
341 assert(0);
342 }
343 impassable = false;
344 }
345
346 if (impassable) {
347 dead->insert(e);
348 } else if (v == g.accept && !ucp) {
349 bool u_w = (u_cr & CHARREACH_NONWORD).none() && u != g.start;
350 UNUSED bool u_nw = (u_cr & CHARREACH_WORD).none() || u == g.start;
351 assert(u_w != u_nw);
352 bool v_w = false;
353 bool v_nw = false;
354
355 flags = g[e].assert_flags;
356 if (u_w) {
357 v_w = flags & POS_FLAG_ASSERT_WORD_TO_WORD;
358 v_nw = flags & POS_FLAG_ASSERT_WORD_TO_NONWORD;
359 } else {
360 v_w = flags & POS_FLAG_ASSERT_NONWORD_TO_WORD;
361 v_nw = flags & POS_FLAG_ASSERT_NONWORD_TO_NONWORD;
362 }
363 assert(v_w || v_nw);
364 if (v_w && v_nw) {
365 /* edge is effectively unconditional */
366 g[e].assert_flags = 0;
367 } else if (v_w) {
368 /* need to add a word byte */
369 NFAVertex vv = add_vertex(g);
370 setReportId(rm, g, expr, vv, -1);
371 g[vv].char_reach = CHARREACH_WORD;
372 add_edge(vv, g.accept, g);
373 g[e].assert_flags = 0;
374 add_edge(u, vv, g[e], g);
375 dead->insert(e);
376 } else {
377 /* need to add a non word byte or see eod */
378 NFAVertex vv = add_vertex(g);
379 setReportId(rm, g, expr, vv, -1);
380 g[vv].char_reach = CHARREACH_NONWORD;
381 add_edge(vv, g.accept, g);
382 g[e].assert_flags = 0;
383 add_edge(u, vv, g[e], g);
384 /* there may already be a different edge from start to eod if so
385 * we need to make it unconditional and alive
386 */
387 if (NFAEdge start_eod = edge(u, g.acceptEod, g)) {
388 g[start_eod].assert_flags = 0;
389 dead->erase(start_eod);
390 } else {
391 add_edge(u, g.acceptEod, g[e], g);
392 }
393 dead->insert(e);
394 }
395 } else if (v == g.accept && ucp) {
396 DEBUG_PRINTF("resolving ucp assert to accept\n");
397 assert(u_cr.any());
398 bool u_w = (u_cr & CHARREACH_WORD_UCP_PRE).any()
399 && u != g.start;
400 bool u_nw = (u_cr & CHARREACH_NONWORD_UCP_PRE).any()
401 || u == g.start;
402 assert(u_w || u_nw);
403
404 bool v_w = false;
405 bool v_nw = false;
406
407 flags = g[e].assert_flags;
408 if (u_w) {
409 v_w |= flags & POS_FLAG_ASSERT_WORD_TO_WORD_UCP;
410 v_nw |= flags & POS_FLAG_ASSERT_WORD_TO_NONWORD_UCP;
411 }
412 if (u_nw) {
413 v_w |= flags & POS_FLAG_ASSERT_NONWORD_TO_WORD_UCP;
414 v_nw |= flags & POS_FLAG_ASSERT_NONWORD_TO_NONWORD_UCP;
415 }
416 assert(v_w || v_nw);
417 if (v_w && v_nw) {
418 /* edge is effectively unconditional */
419 g[e].assert_flags = 0;
420 } else if (v_w) {
421 /* need to add a word byte */
422 NFAVertex vv = add_vertex(g);
423 setReportId(rm, g, expr, vv, -1);
424 g[vv].char_reach = CHARREACH_WORD_UCP_PRE;
425 add_edge(vv, g.accept, g);
426 g[e].assert_flags = 0;
427 add_edge(u, vv, g[e], g);
428 dead->insert(e);
429 } else {
430 /* need to add a non word byte or see eod */
431 NFAVertex vv = add_vertex(g);
432 setReportId(rm, g, expr, vv, -1);
433 g[vv].char_reach = CHARREACH_NONWORD_UCP_PRE;
434 add_edge(vv, g.accept, g);
435 g[e].assert_flags = 0;
436 add_edge(u, vv, g[e], g);
437 /* there may already be a different edge from start to eod if so
438 * we need to make it unconditional and alive
439 */
440 if (NFAEdge start_eod = edge(u, g.acceptEod, g)) {
441 g[start_eod].assert_flags = 0;
442 dead->erase(start_eod);
443 } else {
444 add_edge(u, g.acceptEod, g[e], g);
445 }
446 dead->insert(e);
447 }
448 } else {
449 /* we can remove the asserts as we have partitioned the vertices
450 * into w/nw around the assert edges
451 */
452 g[e].assert_flags = 0;
453 }
454 }
455}
456
457void resolveAsserts(ReportManager &rm, NGHolder &g,
458 const ExpressionInfo &expr) {
459 vector<NFAEdge> asserts = getAsserts(g);
460 if (asserts.empty()) {
461 return;
462 }
463
464 map<u32, NFAVertex> to_split; /* by index, for determinism */
465 map<u32, NFAVertex> to_split_ucp; /* by index, for determinism */
466 findSplitters(g, asserts, &to_split, &to_split_ucp);
467 if (to_split.size() + to_split_ucp.size() > MAX_CLONED_VERTICES) {
468 throw CompileError(expr.index, "Pattern is too large.");
469 }
470
471 for (const auto &m : to_split) {
472 assert(!contains(to_split_ucp, m.first));
473 splitVertex(rm, g, expr, m.second, false);
474 }
475
476 for (const auto &m : to_split_ucp) {
477 splitVertex(rm, g, expr, m.second, true);
478 }
479
480 set<NFAEdge> dead;
481 resolveEdges(rm, g, expr, &dead);
482
483 remove_edges(dead, g);
484 renumber_vertices(g);
485 pruneUseless(g);
486 pruneEmptyVertices(g);
487
488 renumber_vertices(g);
489 renumber_edges(g);
490 clearReports(g);
491}
492
493void ensureCodePointStart(ReportManager &rm, NGHolder &g,
494 const ExpressionInfo &expr) {
495 /* In utf8 mode there is an implicit assertion that we start at codepoint
496 * boundaries. Assert resolution handles the badness coming from asserts.
497 * The only other source of trouble is startDs->accept connections.
498 */
499 NFAEdge orig = edge(g.startDs, g.accept, g);
500 if (expr.utf8 && orig) {
501 DEBUG_PRINTF("rectifying %u\n", expr.report);
502 Report ir = rm.getBasicInternalReport(expr);
503 ReportID rep = rm.getInternalId(ir);
504
505 NFAVertex v_a = add_vertex(g);
506 g[v_a].assert_flags = POS_FLAG_VIRTUAL_START;
507 g[v_a].char_reach = UTF_ASCII_CR;
508 add_edge(v_a, g.accept, g[orig], g);
509
510 NFAVertex v_2 = add_vertex(g);
511 g[v_2].assert_flags = POS_FLAG_VIRTUAL_START;
512 g[v_2].char_reach = CharReach(UTF_TWO_BYTE_MIN, UTF_TWO_BYTE_MAX);
513
514 NFAVertex v_3 = add_vertex(g);
515 g[v_3].assert_flags = POS_FLAG_VIRTUAL_START;
516 g[v_3].char_reach = CharReach(UTF_THREE_BYTE_MIN, UTF_THREE_BYTE_MAX);
517
518 NFAVertex v_4 = add_vertex(g);
519 g[v_4].assert_flags = POS_FLAG_VIRTUAL_START;
520 g[v_4].char_reach = CharReach(UTF_FOUR_BYTE_MIN, UTF_FOUR_BYTE_MAX);
521
522 NFAVertex v_c = add_vertex(g);
523 g[v_c].assert_flags = POS_FLAG_VIRTUAL_START;
524 g[v_c].char_reach = UTF_CONT_CR;
525 add_edge(v_c, g.accept, g[orig], g);
526
527 add_edge(v_2, v_c, g);
528
529 NFAVertex v_3c = add_vertex(g);
530 g[v_3c].assert_flags = POS_FLAG_VIRTUAL_START;
531 g[v_3c].char_reach = UTF_CONT_CR;
532 add_edge(v_3c, v_c, g);
533 add_edge(v_3, v_3c, g);
534
535 NFAVertex v_4c = add_vertex(g);
536 g[v_4c].assert_flags = POS_FLAG_VIRTUAL_START;
537 g[v_4c].char_reach = UTF_CONT_CR;
538 add_edge(v_4c, v_3c, g);
539 add_edge(v_4, v_4c, g);
540
541 g[v_a].reports.insert(rep);
542 g[v_c].reports.insert(rep);
543
544 add_edge(g.start, v_a, g);
545 add_edge(g.startDs, v_a, g);
546 add_edge(g.start, v_2, g);
547 add_edge(g.startDs, v_2, g);
548 add_edge(g.start, v_3, g);
549 add_edge(g.startDs, v_3, g);
550 add_edge(g.start, v_4, g);
551 add_edge(g.startDs, v_4, g);
552 remove_edge(orig, g);
553 renumber_edges(g);
554 clearReports(g);
555 }
556}
557
558} // namespace ue2
559