1/*
2 * Copyright (c) 2015-2017, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** \file
30 * \brief Convert temporary assert vertices (from construction method) to
31 * edge-based flags.
32 *
33 * This pass converts the temporary assert vertices created by the Glushkov
34 * construction process above (vertices with special assertions flags) into
35 * edges between those vertices' neighbours in the graph.
36 *
37 * These edges have the appropriate flags applied to them -- a path (u,t,v)
38 * through an assert vertex t will be replaced with the edge (u,v) with the
39 * assertion flags from t.
40 *
41 * Edges with mutually incompatible flags (such as the conjunction of
42 * word-to-word and word-to-nonword) are dropped.
43 */
44#include "asserts.h"
45
46#include "compiler/compiler.h"
47#include "nfagraph/ng.h"
48#include "nfagraph/ng_prune.h"
49#include "nfagraph/ng_redundancy.h"
50#include "nfagraph/ng_util.h"
51#include "parser/position.h" // for POS flags
52#include "util/compile_error.h"
53#include "util/graph_range.h"
54
55#include <queue>
56#include <set>
57
58using namespace std;
59
60namespace ue2 {
61
62/** Hard limit on the maximum number of edges we'll clone before we throw up
63 * our hands and report 'Pattern too large.' */
64static const size_t MAX_ASSERT_EDGES = 300000;
65
66/** Flags representing the word-boundary assertions, \\b or \\B. */
67static const int WORDBOUNDARY_FLAGS = POS_FLAG_ASSERT_WORD_TO_WORD
68 | POS_FLAG_ASSERT_WORD_TO_NONWORD
69 | POS_FLAG_ASSERT_NONWORD_TO_WORD
70 | POS_FLAG_ASSERT_NONWORD_TO_NONWORD
71 | POS_FLAG_ASSERT_WORD_TO_WORD_UCP
72 | POS_FLAG_ASSERT_WORD_TO_NONWORD_UCP
73 | POS_FLAG_ASSERT_NONWORD_TO_WORD_UCP
74 | POS_FLAG_ASSERT_NONWORD_TO_NONWORD_UCP;
75
76#define OPEN_EDGE 0U
77#define DEAD_EDGE (~0U)
78
79static
80u32 disjunct(u32 flags1, u32 flags2) {
81 /* from two asserts in parallel */
82 DEBUG_PRINTF("disjunct %x %x\n", flags1, flags2);
83 u32 rv;
84 if (flags1 == DEAD_EDGE) {
85 rv = flags2;
86 } else if (flags2 == DEAD_EDGE) {
87 rv = flags1;
88 } else if (flags1 == OPEN_EDGE || flags2 == OPEN_EDGE) {
89 rv = OPEN_EDGE;
90 } else {
91 rv = flags1 | flags2;
92 }
93 DEBUG_PRINTF("--> %x\n", rv);
94 return rv;
95}
96
97static
98u32 conjunct(u32 flags1, u32 flags2) {
99 /* from two asserts in series */
100 DEBUG_PRINTF("conjunct %x %x\n", flags1, flags2);
101 u32 rv;
102 if (flags1 == OPEN_EDGE) {
103 rv = flags2;
104 } else if (flags2 == OPEN_EDGE) {
105 rv = flags1;
106 } else if (flags1 & flags2) {
107 rv = flags1 & flags2;
108 } else {
109 rv = DEAD_EDGE; /* the conjunction of two different word boundary
110 * assertion is impassable */
111 }
112
113 DEBUG_PRINTF("--> %x\n", rv);
114 return rv;
115}
116
117typedef map<pair<NFAVertex, NFAVertex>, NFAEdge> edge_cache_t;
118
119static
120void replaceAssertVertex(NGHolder &g, NFAVertex t, const ExpressionInfo &expr,
121 edge_cache_t &edge_cache, u32 &assert_edge_count) {
122 DEBUG_PRINTF("replacing assert vertex %zu\n", g[t].index);
123
124 const u32 flags = g[t].assert_flags;
125 DEBUG_PRINTF("consider assert vertex %zu with flags %u\n", g[t].index,
126 flags);
127
128 // Wire up all the predecessors to all the successors.
129
130 for (const auto &inEdge : in_edges_range(t, g)) {
131 NFAVertex u = source(inEdge, g);
132 if (u == t) {
133 continue; // ignore self-loops
134 }
135
136 const u32 flags_inc_in = conjunct(g[inEdge].assert_flags,
137 flags);
138 if (flags_inc_in == DEAD_EDGE) {
139 DEBUG_PRINTF("fail, in-edge has bad flags %d\n",
140 g[inEdge].assert_flags);
141 continue;
142 }
143
144 for (const auto &outEdge : out_edges_range(t, g)) {
145 NFAVertex v = target(outEdge, g);
146
147 DEBUG_PRINTF("consider path [%zu,%zu,%zu]\n", g[u].index,
148 g[t].index, g[v].index);
149
150 if (v == t) {
151 continue; // ignore self-loops
152 }
153
154 const u32 flags_final = conjunct(g[outEdge].assert_flags,
155 flags_inc_in);
156
157 if (flags_final == DEAD_EDGE) {
158 DEBUG_PRINTF("fail, out-edge has bad flags %d\n",
159 g[outEdge].assert_flags);
160 continue;
161 }
162
163 if ((g[u].assert_flags & POS_FLAG_MULTILINE_START)
164 && v == g.acceptEod) {
165 DEBUG_PRINTF("fail, (?m)^ does not match \\n at eod\n");
166 continue;
167 }
168
169 /* Replace path (u,t,v) with direct edge (u,v), unless the edge
170 * already exists, in which case we just need to edit its
171 * properties.
172 *
173 * Use edge_cache to prevent us going O(N).
174 */
175 auto cache_key = make_pair(u, v);
176 auto ecit = edge_cache.find(cache_key);
177 if (ecit == edge_cache.end()) {
178 DEBUG_PRINTF("adding edge %zu %zu\n", g[u].index, g[v].index);
179 NFAEdge e = add_edge(u, v, g);
180 edge_cache.emplace(cache_key, e);
181 g[e].assert_flags = flags;
182 if (++assert_edge_count > MAX_ASSERT_EDGES) {
183 throw CompileError(expr.index, "Pattern is too large.");
184 }
185 } else {
186 NFAEdge e = ecit->second;
187 DEBUG_PRINTF("updating edge %zu %zu [a %zu]\n", g[u].index,
188 g[v].index, g[t].index);
189 // Edge already exists.
190 u32 &e_flags = g[e].assert_flags;
191 e_flags = disjunct(e_flags, flags_final);
192 assert(e_flags != DEAD_EDGE);
193 }
194 }
195 }
196
197 // Clear vertex t to remove all the old edges.
198 /* no need to clear the cache, as we will never look up its edge as it is
199 * unreachable */
200 clear_vertex(t, g);
201}
202
203static
204void setReportId(ReportManager &rm, NGHolder &g, const ExpressionInfo &expr,
205 NFAVertex v, s32 adj) {
206 // Don't try and set the report ID of a special vertex.
207 assert(!is_special(v, g));
208
209 // There should be no reports set already.
210 assert(g[v].reports.empty());
211
212 Report r = rm.getBasicInternalReport(expr, adj);
213
214 g[v].reports.insert(rm.getInternalId(r));
215 DEBUG_PRINTF("set report id for vertex %zu, adj %d\n", g[v].index, adj);
216}
217
218static
219void checkForMultilineStart(ReportManager &rm, NGHolder &g,
220 const ExpressionInfo &expr) {
221 vector<NFAEdge> dead;
222 for (auto v : adjacent_vertices_range(g.start, g)) {
223 if (!(g[v].assert_flags & POS_FLAG_MULTILINE_START)) {
224 continue;
225 }
226 DEBUG_PRINTF("mls %zu %08x\n", g[v].index, g[v].assert_flags);
227
228 /* we have found a multi-line start (maybe more than one) */
229
230 /* we need to interpose a dummy dot vertex between v and accept if
231 * required so that ^ doesn't match trailing \n */
232 for (const auto &e : out_edges_range(v, g)) {
233 if (target(e, g) == g.accept) {
234 dead.push_back(e);
235 }
236 }
237 /* assert has been resolved; clear flag */
238 g[v].assert_flags &= ~POS_FLAG_MULTILINE_START;
239 }
240
241 for (const auto &e : dead) {
242 NFAVertex dummy = add_vertex(g);
243 g[dummy].char_reach.setall();
244 setReportId(rm, g, expr, dummy, -1);
245 add_edge(source(e, g), dummy, g[e], g);
246 add_edge(dummy, g.accept, g);
247 }
248
249 remove_edges(dead, g);
250}
251
252static
253bool hasAssertVertices(const NGHolder &g) {
254 for (auto v : vertices_range(g)) {
255 int flags = g[v].assert_flags;
256 if (flags & WORDBOUNDARY_FLAGS) {
257 return true;
258 }
259 }
260 return false;
261}
262
263/** \brief Convert temporary assert vertices (from construction method) to
264 * edge-based flags.
265 *
266 * Remove the horrors that are the temporary assert vertices which arise from
267 * our construction method. Allows the rest of our code base to live in
268 * blissful ignorance of their existence. */
269void removeAssertVertices(ReportManager &rm, NGHolder &g,
270 const ExpressionInfo &expr) {
271 size_t num = 0;
272
273 DEBUG_PRINTF("before: graph has %zu vertices\n", num_vertices(g));
274
275 // Sweep over the graph and ascertain that we do actually have vertices
276 // with assertion flags set. Otherwise, we're done.
277 if (!hasAssertVertices(g)) {
278 DEBUG_PRINTF("no assert vertices, done\n");
279 return;
280 }
281
282 u32 assert_edge_count = 0;
283
284 // Build a cache of (u, v) vertex pairs to edge descriptors.
285 edge_cache_t edge_cache;
286 for (const auto &e : edges_range(g)) {
287 edge_cache[make_pair(source(e, g), target(e, g))] = e;
288 }
289
290 for (auto v : vertices_range(g)) {
291 if (g[v].assert_flags & WORDBOUNDARY_FLAGS) {
292 replaceAssertVertex(g, v, expr, edge_cache, assert_edge_count);
293 num++;
294 }
295 }
296
297 checkForMultilineStart(rm, g, expr);
298
299 if (num) {
300 DEBUG_PRINTF("resolved %zu assert vertices\n", num);
301 pruneUseless(g);
302 pruneEmptyVertices(g);
303 renumber_vertices(g);
304 renumber_edges(g);
305 }
306
307 DEBUG_PRINTF("after: graph has %zu vertices\n", num_vertices(g));
308 assert(!hasAssertVertices(g));
309}
310
311} // namespace ue2
312