1/*
2 * Copyright (c) 2015-2017, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** \file
30 * \brief Analysis pass to reform leading dots.
31 *
32 * We have found that many regexes found in the wild use an anchored dot-repeat
33 * to represent an unanchored pattern, particularly if they have been used with
34 * a regex engine that assumes that a pattern is anchored. This pass reforms
35 * patterns that begin with sequences of dots into a more standard form.
36 *
37 * In addition, both anchored and unanchored patterns with dot repeats as
38 * prefixes will have these prefixes reformed into a canonical form, which some
39 * later analyses depend upon.
40 */
41#include "ng_anchored_dots.h"
42
43#include "grey.h"
44#include "ng_holder.h"
45#include "ng_util.h"
46#include "ue2common.h"
47#include "util/container.h"
48#include "util/depth.h"
49#include "util/graph_range.h"
50
51#include <algorithm>
52#include <queue>
53#include <set>
54#include <vector>
55
56using namespace std;
57
58namespace ue2 {
59
60static
61bool findStarts(const NGHolder &g, set<NFAVertex> &anchored,
62 set<NFAVertex> &unanchored) {
63 // Populate unanchored map
64 for (auto v : adjacent_vertices_range(g.startDs, g)) {
65 if (is_special(v, g)) {
66 continue;
67 }
68 unanchored.insert(v);
69 }
70
71 // Populate anchored map
72 for (auto v : adjacent_vertices_range(g.start, g)) {
73 if (is_special(v, g)) {
74 continue;
75 }
76 anchored.insert(v);
77 }
78
79 if (unanchored == anchored) {
80 anchored.clear();
81 } else if (!unanchored.empty() && !anchored.empty()) {
82 return false;
83 }
84
85 return !anchored.empty() || !unanchored.empty();
86}
87
88namespace {
89class DotInfo {
90public:
91 DotInfo(NFAVertex v, bool se, u32 idx)
92 : vertex(v), hasSelfLoop(se), index(idx) {}
93
94 bool operator<(const DotInfo &other) const {
95 if (hasSelfLoop != other.hasSelfLoop)
96 return hasSelfLoop < other.hasSelfLoop;
97 // tie break with vertex id: lowest ID wins
98 return index > other.index;
99 }
100
101 NFAVertex vertex;
102 bool hasSelfLoop;
103 u32 index;
104};
105}
106
107// Returns nullptr if all vertices in the given set are not dots.
108// We can only pick one dot vertex, so we go for a dot-star if it exists,
109// otherwise the dot without a self-edge with the lowest ID.
110static
111NFAVertex findReformable(const NGHolder &g, const set<NFAVertex> &starts,
112 set<NFAVertex> &otherV) {
113 priority_queue<DotInfo> dotq;
114 for (auto v : starts) {
115 if (is_dot(v, g)) {
116 u32 idx = g[v].index;
117 dotq.push(DotInfo(v, hasSelfLoop(v, g), idx));
118 }
119 }
120
121 if (dotq.empty()) {
122 return NGHolder::null_vertex();
123 }
124
125 const DotInfo &dot = dotq.top();
126 otherV = starts;
127 otherV.erase(dot.vertex);
128 DEBUG_PRINTF("selected dot vertex %u (%s)\n", dot.index,
129 dot.hasSelfLoop ? "has self-edge" : "no self-edge");
130 DEBUG_PRINTF("%zu other vertices\n", otherV.size());
131 return dot.vertex;
132}
133
134// Returns true if the given vertex is only preceded by start. If start is
135// graph.startDs (i.e. unanchored), the given vertex can also be connected to
136// graph.start. If selfLoopIsAcceptable is set, self-loops are ignored.
137static
138bool isStartNode(NFAVertex v, NFAVertex start, const NGHolder &g,
139 bool selfLoopIsAcceptable) {
140 for (auto u : inv_adjacent_vertices_range(v, g)) {
141 if (selfLoopIsAcceptable && u == v) {
142 continue;
143 } else if (u == start) {
144 continue;
145 } else if (start == g.startDs && u == g.start) {
146 continue;
147 } else {
148 return false;
149 }
150 }
151 return true;
152}
153
154// Note: this will only remove the anchored first dot in the chain -- any other
155// removable nodes will be handled by the unanchored case below.
156static
157void reformAnchoredRepeatsComponent(NGHolder &g,
158 set<NFAVertex> &compAnchoredStarts,
159 set<NFAVertex> &compUnanchoredStarts,
160 set<NFAVertex> &dead, depth *startBegin,
161 depth *startEnd) {
162 // anchored cases can not have any unanchored starts
163 if (!compUnanchoredStarts.empty()) {
164 DEBUG_PRINTF("we have unanchored starts, skipping\n");
165 return;
166 }
167
168 NFAVertex dotV = NGHolder::null_vertex();
169 set<NFAVertex> otherV;
170 dotV = findReformable(g, compAnchoredStarts, otherV);
171 if (dotV == NGHolder::null_vertex()) {
172 DEBUG_PRINTF("no candidate reformable dot found.\n");
173 return;
174 }
175
176 NFAEdge loopEdge;
177 bool selfLoop = false;
178 bool bustOut = false;
179
180 for (const auto &e : out_edges_range(dotV, g)) {
181 NFAVertex t = target(e, g);
182 if (t == dotV) {
183 selfLoop = true;
184 loopEdge = e;
185 continue;
186 }
187
188 if (is_special(t, g)) {
189 bustOut = true;
190 break;
191 }
192
193 if (!otherV.empty() && otherV.find(t) == otherV.end()) {
194 bustOut = true;
195 break;
196 }
197 }
198
199 if (bustOut) {
200 DEBUG_PRINTF("busting out\n");
201 return;
202 }
203
204 if (!isStartNode(dotV, g.start, g, true)) {
205 DEBUG_PRINTF("fleeing: vertex %zu has other preds\n", g[dotV].index);
206 return;
207 }
208
209 /* get bounds */
210 depth min;
211 depth max(1);
212
213 if (selfLoop) {
214 // A self-loop indicates that this is a '.+' or '.*'
215 max = depth::infinity();
216 }
217
218 if (!otherV.empty()) {
219 /* We require that the successors of the dot node are are the same
220 * as the start vertex. TODO: remember why.
221 */
222 if (selfLoop) {
223 if (otherV.size() != out_degree(dotV, g) - 1) {
224 return;
225 }
226 } else {
227 if (otherV.size() != out_degree(dotV, g)) {
228 return;
229 }
230 }
231
232 min = depth(0);
233 } else {
234 min = depth(1);
235 }
236
237 *startBegin = min;
238 *startEnd = max;
239
240 for (auto t : adjacent_vertices_range(dotV, g)) {
241 if (t != dotV) {
242 add_edge_if_not_present(g.startDs, t, g);
243 add_edge_if_not_present(g.start, t, g);
244 compUnanchoredStarts.insert(t);
245 }
246 }
247
248 for (auto v : otherV) {
249 remove_edge(g.start, v, g);
250 }
251
252 DEBUG_PRINTF("removing vertex %zu\n", g[dotV].index);
253 clear_vertex(dotV, g);
254 dead.insert(dotV);
255 compAnchoredStarts.erase(dotV);
256}
257
258static
259void reformUnanchoredRepeatsComponent(NGHolder &g,
260 set<NFAVertex> &compAnchoredStarts,
261 set<NFAVertex> &compUnanchoredStarts,
262 set<NFAVertex> &dead,
263 depth *startBegin, depth *startEnd) {
264 // unanchored cases can not have any anchored starts
265 if (!compAnchoredStarts.empty()) {
266 DEBUG_PRINTF("we have anchored starts, skipping\n");
267 return;
268 }
269
270 while (true) {
271 NFAVertex dotV = NGHolder::null_vertex();
272 set<NFAVertex> otherV;
273 dotV = findReformable(g, compUnanchoredStarts, otherV);
274 if (dotV == NGHolder::null_vertex()) {
275 DEBUG_PRINTF("no candidate reformable dot found.\n");
276 return;
277 }
278
279 NFAEdge loopEdge;
280 bool selfLoop = false;
281 bool bustOut = false;
282
283 for (const auto &e : out_edges_range(dotV, g)) {
284 NFAVertex t = target(e, g);
285
286 if (t == dotV) {
287 selfLoop = true;
288 loopEdge = e;
289 continue;
290 }
291
292 if (is_special(t, g)) {
293 bustOut = true;
294 break;
295 }
296
297 if (!otherV.empty() && otherV.find(t) == otherV.end()) {
298 bustOut = true;
299 break;
300 }
301 }
302
303 if (bustOut) {
304 DEBUG_PRINTF("busting out\n");
305 if (!selfLoop) {
306 return;
307 }
308
309 for (auto v : otherV) {
310 if (!edge(dotV, v, g).second) {
311 return;
312 }
313 }
314
315 // A self-loop indicates that this is a '.+' or '.*'
316 DEBUG_PRINTF("self-loop detected on %zu\n", g[dotV].index);
317 *startEnd = depth::infinity();
318 remove_edge(dotV, dotV, g);
319 return;
320 }
321
322 if (!isStartNode(dotV, g.startDs, g, true)) {
323 DEBUG_PRINTF("fleeing: vertex %zu has other preds\n",
324 g[dotV].index);
325 return;
326 }
327
328 /* get bounds */
329 depth min(1);
330 depth max(1);
331
332 if (selfLoop) {
333 // A self-loop indicates that this is a '.+' or '.*'
334 DEBUG_PRINTF("self-loop detected\n");
335 max = depth::infinity();
336 }
337
338 if (!otherV.empty()) {
339 if (!selfLoop && otherV.size() != out_degree(dotV, g)) {
340 return;
341 }
342
343 if (selfLoop && otherV.size() != out_degree(dotV, g) - 1) {
344 return;
345 }
346
347 if (min > depth(1)) {
348 /* this is not a case we can handle */
349 DEBUG_PRINTF("min greater than one, skipping\n");
350 return;
351 }
352 min = depth(0);
353 }
354
355 *startBegin += min;
356 *startEnd += max;
357
358 for (auto v : otherV) {
359 remove_edge(g.start, v, g);
360 remove_edge(g.startDs, v, g);
361 }
362
363 compUnanchoredStarts.clear();
364 for (auto t : adjacent_vertices_range(dotV, g)) {
365 if (t != dotV) {
366 DEBUG_PRINTF("connecting sds -> %zu\n", g[t].index);
367 add_edge(g.startDs, t, g);
368 add_edge(g.start, t, g);
369 compUnanchoredStarts.insert(t);
370 }
371 }
372
373 DEBUG_PRINTF("removing vertex %zu\n", g[dotV].index);
374 dead.insert(dotV);
375 clear_vertex(dotV, g);
376 compUnanchoredStarts.erase(dotV);
377 }
378}
379
380// for t to be another optional dot, it must have only in-edges from v and from
381// starts
382static
383bool isOptionalDot(NFAVertex t, NFAVertex v, const NGHolder &g) {
384 if (!is_dot(t, g)) {
385 return false;
386 }
387
388 bool found_v = false, found_start = false;
389
390 for (auto u : inv_adjacent_vertices_range(t, g)) {
391 if (u == v) {
392 found_v = true;
393 } else if (u == g.start || u == g.startDs) {
394 found_start = true;
395 } else {
396 return false;
397 }
398 }
399
400 return found_v && found_start;
401}
402
403static
404bool gatherParticipants(const NGHolder &g,
405 NFAVertex start, NFAVertex initialDot,
406 set<NFAVertex> &dots, set<NFAVertex> &succ) {
407 // Walk the graph downwards from the initial dot; each dot will have:
408 // 1) a single optional dot successor, or
409 // 2) N successors (our terminating case)
410 dots.insert(initialDot);
411 NFAVertex v = initialDot;
412
413 while (out_degree(v, g) == 1) {
414 NFAVertex t = *(adjacent_vertices(v, g).first);
415 // for t to be another optional dot, it must have only in-edges from v
416 // and from starts
417 if (isOptionalDot(t, v, g)) {
418 // another dot; bail if we've seen it once already
419 if (dots.find(t) != dots.end()) {
420 DEBUG_PRINTF("cycle detected at vertex %zu\n", g[t].index);
421 return false;
422 }
423 dots.insert(t);
424 v = t;
425 continue;
426 }
427 // otherwise, we found a terminating dot state
428 break;
429 }
430
431 // Our terminating states are the successors of v.
432 // All of these MUST have an edge from start as well.
433 for (auto w : adjacent_vertices_range(v, g)) {
434 succ.insert(w);
435 if (!edge(start, w, g).second) {
436 DEBUG_PRINTF("failing, vertex %zu does not have edge from start\n",
437 g[w].index);
438 return false;
439 }
440 }
441
442 /* All the non chained v connected to start must be in succ as well
443 * TODO: remember why (and document). */
444 for (auto u : adjacent_vertices_range(start, g)) {
445 if (is_special(u, g)) {
446 continue;
447 }
448 if (!contains(dots, u) && !contains(succ, u)) {
449 return false;
450 }
451 }
452
453 return !succ.empty();
454}
455
456static
457void collapseVariableDotRepeat(NGHolder &g, NFAVertex start,
458 set<NFAVertex> &dead, UNUSED depth *startBegin,
459 depth *startEnd) {
460 // Handle optional dot repeat prefixes, e.g.
461 // /^.{0,30}foo/s, /^.{0,5}foo/s, unanchored equivs
462 // Note that this code assumes that fixed repeats ('^.{5,20}') have been
463 // pruned already, down (in this case) to '^.{0,15}'.
464
465 // The first of our optional dots must be connected to start. The jump edge
466 // past it will be verified in gatherParticipants(). If start is
467 // graph.start, it should not be connected to startDs.
468 NFAVertex initialDot = NGHolder::null_vertex();
469 for (auto v : adjacent_vertices_range(start, g)) {
470 if (is_special(v, g)) {
471 continue;
472 }
473 if (is_dot(v, g) && isStartNode(v, start, g, false)) {
474 if (initialDot) {
475 return;
476 }
477 initialDot = v;
478 DEBUG_PRINTF("initial dot vertex is %zu\n", g[v].index);
479 }
480 }
481
482 if (!initialDot) {
483 return;
484 }
485
486 // Collect all the other optional dot vertices and the successor vertices
487 // by walking down the graph from initialDot
488 set<NFAVertex> dots, succ;
489 if (!gatherParticipants(g, start, initialDot, dots, succ)) {
490 DEBUG_PRINTF("gatherParticipants failed\n");
491 return;
492 }
493
494 DEBUG_PRINTF("optional dot repeat with %zu participants, "
495 "terminating in %zu non-dot nodes\n",
496 dots.size(), succ.size());
497
498 // Remove all the participants and set the start offset
499 dead.insert(dots.begin(), dots.end());
500
501 DEBUG_PRINTF("current offsets: %s-%s\n", startBegin->str().c_str(),
502 startEnd->str().c_str());
503
504 if (start == g.start && startEnd->is_infinite()) {
505 *startEnd = depth(dots.size());
506 } else if (startEnd->is_finite()) {
507 *startEnd += dots.size();
508 }
509 assert(startEnd->is_reachable());
510
511 // Connect our successor vertices to both start and startDs.
512 for (auto v : succ) {
513 add_edge_if_not_present(g.start, v, g);
514 add_edge_if_not_present(g.startDs, v, g);
515 }
516}
517
518static
519void deleteVertices(set<NFAVertex> &dead, NGHolder &g) {
520 if (!dead.empty()) {
521 DEBUG_PRINTF("pruning %zu vertices\n", dead.size());
522 remove_vertices(dead, g);
523 }
524 dead.clear();
525}
526
527static
528void reformAnchoredRepeats(NGHolder &g, depth *startBegin, depth *startEnd) {
529 DEBUG_PRINTF("component\n");
530 set<NFAVertex> anchored, unanchored, dead;
531 if (!findStarts(g, anchored, unanchored)) {
532 DEBUG_PRINTF("no starts\n");
533 return;
534 }
535
536 reformAnchoredRepeatsComponent(g, anchored, unanchored, dead, startBegin,
537 startEnd);
538 deleteVertices(dead, g);
539
540 reformUnanchoredRepeatsComponent(g, anchored, unanchored, dead, startBegin,
541 startEnd);
542 deleteVertices(dead, g);
543}
544
545static
546void collapseVariableRepeats(NGHolder &g, depth *startBegin, depth *startEnd) {
547 DEBUG_PRINTF("collapseVariableRepeats\n");
548 set<NFAVertex> dead;
549
550 collapseVariableDotRepeat(g, g.start, dead, startBegin, startEnd);
551 deleteVertices(dead, g);
552
553 collapseVariableDotRepeat(g, g.startDs, dead, startBegin, startEnd);
554 deleteVertices(dead, g);
555}
556
557static
558void addDotsBetween(NGHolder &g, NFAVertex lhs, vector<NFAVertex> &rhs,
559 depth min_repeat, depth max_repeat) {
560 const bool unbounded = max_repeat.is_infinite();
561 if (unbounded) {
562 max_repeat = min_repeat;
563 }
564
565 assert(max_repeat.is_finite());
566
567 NFAVertex u = lhs;
568
569 if (!min_repeat && unbounded) {
570 NFAVertex v = add_vertex(g);
571 add_edge(u, v, g);
572 g[v].char_reach.setall();
573
574 for (auto w : rhs) {
575 add_edge(lhs, w, g);
576 }
577 }
578
579 for (u32 i = 0; i < min_repeat; i++) {
580 NFAVertex v = add_vertex(g);
581 add_edge(u, v, g);
582 g[v].char_reach.setall();
583 u = v;
584 }
585
586 NFAVertex split = u;
587 /* lhs now split point for optional */
588 for (u32 i = min_repeat; i < max_repeat; i++) {
589 NFAVertex v = add_vertex(g);
590 add_edge(u, v, g);
591 if (u != split) {
592 add_edge(split, v, g);
593 }
594 g[v].char_reach.setall();
595 u = v;
596 }
597
598 if (unbounded) {
599 add_edge(u, u, g);
600 }
601
602 for (auto w : rhs) {
603 add_edge(u, w, g);
604 if (split != u) {
605 add_edge(split, w, g);
606 }
607 }
608}
609
610static
611void restoreLeadingDots(NGHolder &g, const depth &startBegin,
612 const depth &startEnd) {
613 if (startBegin == depth(0) && startEnd.is_infinite()) {
614 return;
615 }
616 DEBUG_PRINTF("ungobble (%s, %s)\n", startBegin.str().c_str(),
617 startEnd.str().c_str());
618
619 for (UNUSED auto v : adjacent_vertices_range(g.start, g)) {
620 assert(edge(g.startDs, v, g).second);
621 }
622 clear_out_edges(g.start, g);
623 add_edge(g.start, g.startDs, g);
624
625 const bool unbounded = startEnd.is_infinite();
626
627 NFAVertex root = unbounded ? g.startDs : g.start;
628
629 vector<NFAVertex> rhs;
630 insert(&rhs, rhs.end(), adjacent_vertices(g.startDs, g));
631 rhs.erase(remove(rhs.begin(), rhs.end(), g.startDs), rhs.end());
632 for (auto v : rhs) {
633 remove_edge(g.startDs, v, g);
634 }
635
636 addDotsBetween(g, root, rhs, startBegin, startEnd);
637 renumber_vertices(g);
638 renumber_edges(g);
639}
640
641// Entry point.
642void reformLeadingDots(NGHolder &g) {
643 depth startBegin(0);
644 depth startEnd = depth::infinity();
645
646 reformAnchoredRepeats(g, &startBegin, &startEnd);
647 collapseVariableRepeats(g, &startBegin, &startEnd);
648 restoreLeadingDots(g, startBegin, startEnd);
649}
650
651} // namespace ue2
652