1 | /* |
2 | * Copyright (c) 2015-2017, Intel Corporation |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are met: |
6 | * |
7 | * * Redistributions of source code must retain the above copyright notice, |
8 | * this list of conditions and the following disclaimer. |
9 | * * Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * * Neither the name of Intel Corporation nor the names of its contributors |
13 | * may be used to endorse or promote products derived from this software |
14 | * without specific prior written permission. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | /** \file |
30 | * \brief Convert temporary assert vertices (from construction method) to |
31 | * edge-based flags. |
32 | * |
33 | * This pass converts the temporary assert vertices created by the Glushkov |
34 | * construction process above (vertices with special assertions flags) into |
35 | * edges between those vertices' neighbours in the graph. |
36 | * |
37 | * These edges have the appropriate flags applied to them -- a path (u,t,v) |
38 | * through an assert vertex t will be replaced with the edge (u,v) with the |
39 | * assertion flags from t. |
40 | * |
41 | * Edges with mutually incompatible flags (such as the conjunction of |
42 | * word-to-word and word-to-nonword) are dropped. |
43 | */ |
44 | #include "asserts.h" |
45 | |
46 | #include "compiler/compiler.h" |
47 | #include "nfagraph/ng.h" |
48 | #include "nfagraph/ng_prune.h" |
49 | #include "nfagraph/ng_redundancy.h" |
50 | #include "nfagraph/ng_util.h" |
51 | #include "parser/position.h" // for POS flags |
52 | #include "util/compile_error.h" |
53 | #include "util/graph_range.h" |
54 | |
55 | #include <queue> |
56 | #include <set> |
57 | |
58 | using namespace std; |
59 | |
60 | namespace ue2 { |
61 | |
62 | /** Hard limit on the maximum number of edges we'll clone before we throw up |
63 | * our hands and report 'Pattern too large.' */ |
64 | static const size_t MAX_ASSERT_EDGES = 300000; |
65 | |
66 | /** Flags representing the word-boundary assertions, \\b or \\B. */ |
67 | static const int WORDBOUNDARY_FLAGS = POS_FLAG_ASSERT_WORD_TO_WORD |
68 | | POS_FLAG_ASSERT_WORD_TO_NONWORD |
69 | | POS_FLAG_ASSERT_NONWORD_TO_WORD |
70 | | POS_FLAG_ASSERT_NONWORD_TO_NONWORD |
71 | | POS_FLAG_ASSERT_WORD_TO_WORD_UCP |
72 | | POS_FLAG_ASSERT_WORD_TO_NONWORD_UCP |
73 | | POS_FLAG_ASSERT_NONWORD_TO_WORD_UCP |
74 | | POS_FLAG_ASSERT_NONWORD_TO_NONWORD_UCP; |
75 | |
76 | #define OPEN_EDGE 0U |
77 | #define DEAD_EDGE (~0U) |
78 | |
79 | static |
80 | u32 disjunct(u32 flags1, u32 flags2) { |
81 | /* from two asserts in parallel */ |
82 | DEBUG_PRINTF("disjunct %x %x\n" , flags1, flags2); |
83 | u32 rv; |
84 | if (flags1 == DEAD_EDGE) { |
85 | rv = flags2; |
86 | } else if (flags2 == DEAD_EDGE) { |
87 | rv = flags1; |
88 | } else if (flags1 == OPEN_EDGE || flags2 == OPEN_EDGE) { |
89 | rv = OPEN_EDGE; |
90 | } else { |
91 | rv = flags1 | flags2; |
92 | } |
93 | DEBUG_PRINTF("--> %x\n" , rv); |
94 | return rv; |
95 | } |
96 | |
97 | static |
98 | u32 conjunct(u32 flags1, u32 flags2) { |
99 | /* from two asserts in series */ |
100 | DEBUG_PRINTF("conjunct %x %x\n" , flags1, flags2); |
101 | u32 rv; |
102 | if (flags1 == OPEN_EDGE) { |
103 | rv = flags2; |
104 | } else if (flags2 == OPEN_EDGE) { |
105 | rv = flags1; |
106 | } else if (flags1 & flags2) { |
107 | rv = flags1 & flags2; |
108 | } else { |
109 | rv = DEAD_EDGE; /* the conjunction of two different word boundary |
110 | * assertion is impassable */ |
111 | } |
112 | |
113 | DEBUG_PRINTF("--> %x\n" , rv); |
114 | return rv; |
115 | } |
116 | |
117 | typedef map<pair<NFAVertex, NFAVertex>, NFAEdge> edge_cache_t; |
118 | |
119 | static |
120 | void replaceAssertVertex(NGHolder &g, NFAVertex t, const ExpressionInfo &expr, |
121 | edge_cache_t &edge_cache, u32 &assert_edge_count) { |
122 | DEBUG_PRINTF("replacing assert vertex %zu\n" , g[t].index); |
123 | |
124 | const u32 flags = g[t].assert_flags; |
125 | DEBUG_PRINTF("consider assert vertex %zu with flags %u\n" , g[t].index, |
126 | flags); |
127 | |
128 | // Wire up all the predecessors to all the successors. |
129 | |
130 | for (const auto &inEdge : in_edges_range(t, g)) { |
131 | NFAVertex u = source(inEdge, g); |
132 | if (u == t) { |
133 | continue; // ignore self-loops |
134 | } |
135 | |
136 | const u32 flags_inc_in = conjunct(g[inEdge].assert_flags, |
137 | flags); |
138 | if (flags_inc_in == DEAD_EDGE) { |
139 | DEBUG_PRINTF("fail, in-edge has bad flags %d\n" , |
140 | g[inEdge].assert_flags); |
141 | continue; |
142 | } |
143 | |
144 | for (const auto &outEdge : out_edges_range(t, g)) { |
145 | NFAVertex v = target(outEdge, g); |
146 | |
147 | DEBUG_PRINTF("consider path [%zu,%zu,%zu]\n" , g[u].index, |
148 | g[t].index, g[v].index); |
149 | |
150 | if (v == t) { |
151 | continue; // ignore self-loops |
152 | } |
153 | |
154 | const u32 flags_final = conjunct(g[outEdge].assert_flags, |
155 | flags_inc_in); |
156 | |
157 | if (flags_final == DEAD_EDGE) { |
158 | DEBUG_PRINTF("fail, out-edge has bad flags %d\n" , |
159 | g[outEdge].assert_flags); |
160 | continue; |
161 | } |
162 | |
163 | if ((g[u].assert_flags & POS_FLAG_MULTILINE_START) |
164 | && v == g.acceptEod) { |
165 | DEBUG_PRINTF("fail, (?m)^ does not match \\n at eod\n" ); |
166 | continue; |
167 | } |
168 | |
169 | /* Replace path (u,t,v) with direct edge (u,v), unless the edge |
170 | * already exists, in which case we just need to edit its |
171 | * properties. |
172 | * |
173 | * Use edge_cache to prevent us going O(N). |
174 | */ |
175 | auto cache_key = make_pair(u, v); |
176 | auto ecit = edge_cache.find(cache_key); |
177 | if (ecit == edge_cache.end()) { |
178 | DEBUG_PRINTF("adding edge %zu %zu\n" , g[u].index, g[v].index); |
179 | NFAEdge e = add_edge(u, v, g); |
180 | edge_cache.emplace(cache_key, e); |
181 | g[e].assert_flags = flags; |
182 | if (++assert_edge_count > MAX_ASSERT_EDGES) { |
183 | throw CompileError(expr.index, "Pattern is too large." ); |
184 | } |
185 | } else { |
186 | NFAEdge e = ecit->second; |
187 | DEBUG_PRINTF("updating edge %zu %zu [a %zu]\n" , g[u].index, |
188 | g[v].index, g[t].index); |
189 | // Edge already exists. |
190 | u32 &e_flags = g[e].assert_flags; |
191 | e_flags = disjunct(e_flags, flags_final); |
192 | assert(e_flags != DEAD_EDGE); |
193 | } |
194 | } |
195 | } |
196 | |
197 | // Clear vertex t to remove all the old edges. |
198 | /* no need to clear the cache, as we will never look up its edge as it is |
199 | * unreachable */ |
200 | clear_vertex(t, g); |
201 | } |
202 | |
203 | static |
204 | void setReportId(ReportManager &rm, NGHolder &g, const ExpressionInfo &expr, |
205 | NFAVertex v, s32 adj) { |
206 | // Don't try and set the report ID of a special vertex. |
207 | assert(!is_special(v, g)); |
208 | |
209 | // There should be no reports set already. |
210 | assert(g[v].reports.empty()); |
211 | |
212 | Report r = rm.getBasicInternalReport(expr, adj); |
213 | |
214 | g[v].reports.insert(rm.getInternalId(r)); |
215 | DEBUG_PRINTF("set report id for vertex %zu, adj %d\n" , g[v].index, adj); |
216 | } |
217 | |
218 | static |
219 | void checkForMultilineStart(ReportManager &rm, NGHolder &g, |
220 | const ExpressionInfo &expr) { |
221 | vector<NFAEdge> dead; |
222 | for (auto v : adjacent_vertices_range(g.start, g)) { |
223 | if (!(g[v].assert_flags & POS_FLAG_MULTILINE_START)) { |
224 | continue; |
225 | } |
226 | DEBUG_PRINTF("mls %zu %08x\n" , g[v].index, g[v].assert_flags); |
227 | |
228 | /* we have found a multi-line start (maybe more than one) */ |
229 | |
230 | /* we need to interpose a dummy dot vertex between v and accept if |
231 | * required so that ^ doesn't match trailing \n */ |
232 | for (const auto &e : out_edges_range(v, g)) { |
233 | if (target(e, g) == g.accept) { |
234 | dead.push_back(e); |
235 | } |
236 | } |
237 | /* assert has been resolved; clear flag */ |
238 | g[v].assert_flags &= ~POS_FLAG_MULTILINE_START; |
239 | } |
240 | |
241 | for (const auto &e : dead) { |
242 | NFAVertex dummy = add_vertex(g); |
243 | g[dummy].char_reach.setall(); |
244 | setReportId(rm, g, expr, dummy, -1); |
245 | add_edge(source(e, g), dummy, g[e], g); |
246 | add_edge(dummy, g.accept, g); |
247 | } |
248 | |
249 | remove_edges(dead, g); |
250 | } |
251 | |
252 | static |
253 | bool hasAssertVertices(const NGHolder &g) { |
254 | for (auto v : vertices_range(g)) { |
255 | int flags = g[v].assert_flags; |
256 | if (flags & WORDBOUNDARY_FLAGS) { |
257 | return true; |
258 | } |
259 | } |
260 | return false; |
261 | } |
262 | |
263 | /** \brief Convert temporary assert vertices (from construction method) to |
264 | * edge-based flags. |
265 | * |
266 | * Remove the horrors that are the temporary assert vertices which arise from |
267 | * our construction method. Allows the rest of our code base to live in |
268 | * blissful ignorance of their existence. */ |
269 | void removeAssertVertices(ReportManager &rm, NGHolder &g, |
270 | const ExpressionInfo &expr) { |
271 | size_t num = 0; |
272 | |
273 | DEBUG_PRINTF("before: graph has %zu vertices\n" , num_vertices(g)); |
274 | |
275 | // Sweep over the graph and ascertain that we do actually have vertices |
276 | // with assertion flags set. Otherwise, we're done. |
277 | if (!hasAssertVertices(g)) { |
278 | DEBUG_PRINTF("no assert vertices, done\n" ); |
279 | return; |
280 | } |
281 | |
282 | u32 assert_edge_count = 0; |
283 | |
284 | // Build a cache of (u, v) vertex pairs to edge descriptors. |
285 | edge_cache_t edge_cache; |
286 | for (const auto &e : edges_range(g)) { |
287 | edge_cache[make_pair(source(e, g), target(e, g))] = e; |
288 | } |
289 | |
290 | for (auto v : vertices_range(g)) { |
291 | if (g[v].assert_flags & WORDBOUNDARY_FLAGS) { |
292 | replaceAssertVertex(g, v, expr, edge_cache, assert_edge_count); |
293 | num++; |
294 | } |
295 | } |
296 | |
297 | checkForMultilineStart(rm, g, expr); |
298 | |
299 | if (num) { |
300 | DEBUG_PRINTF("resolved %zu assert vertices\n" , num); |
301 | pruneUseless(g); |
302 | pruneEmptyVertices(g); |
303 | renumber_vertices(g); |
304 | renumber_edges(g); |
305 | } |
306 | |
307 | DEBUG_PRINTF("after: graph has %zu vertices\n" , num_vertices(g)); |
308 | assert(!hasAssertVertices(g)); |
309 | } |
310 | |
311 | } // namespace ue2 |
312 | |