/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Utility functions related to SOM ("Start of Match").
 */
#include "ng_som_util.h"

#include "ng_depth.h"
#include "ng_execute.h"
#include "ng_holder.h"
#include "ng_prune.h"
#include "ng_util.h"
#include "util/container.h"
#include "util/graph_range.h"

using namespace std;

namespace ue2 {

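/** Wire every successor of vertex \p u directly to the graph's start vertex. */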
static
void wireSuccessorsToStart(NGHolder &g, NFAVertex u) {
    for (auto v : adjacent_vertices_range(u, g)) {
        add_edge_if_not_present(g.start, v, g);
    }
}

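/** \brief Compute the min/max depth from the start of match for every vertex.
 *
 * Returns a vector of DepthMinMax values, indexed by vertex index in
 * \p g_orig.
 */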
vector<DepthMinMax> getDistancesFromSOM(const NGHolder &g_orig) {
    // We operate on a temporary copy of the original graph here, so we don't
    // have to mutate the original.
    NGHolder g;
    unordered_map<NFAVertex, NFAVertex> vmap; // vertex in g_orig to vertex in g
    cloneHolder(g, g_orig, &vmap);

    vector<NFAVertex> vstarts;
    for (auto v : vertices_range(g)) {
        if (is_virtual_start(v, g)) {
            vstarts.push_back(v);
        }
    }
    vstarts.push_back(g.startDs);

    // Wire the successors of every virtual start or startDs to g.start.
    for (auto v : vstarts) {
        wireSuccessorsToStart(g, v);
    }

    // Drop the in-edges of every virtual start so that they don't participate
    // in the depth calculation.
    for (auto v : vstarts) {
        clear_in_edges(v, g);
    }

    //dumpGraph("som_depth.dot", g);

    // Find depths, indexed by vertex index in g.
    auto temp_depths = calcDepthsFrom(g, g.start);

    // Transfer depths, indexed by vertex index in g_orig.
    vector<DepthMinMax> depths(num_vertices(g_orig));

    for (auto v_orig : vertices_range(g_orig)) {
        assert(contains(vmap, v_orig));
        NFAVertex v_new = vmap[v_orig];

        u32 orig_idx = g_orig[v_orig].index;

        DepthMinMax &d = depths.at(orig_idx);

        if (v_orig == g_orig.startDs || is_virtual_start(v_orig, g_orig)) {
            // StartDs and virtual starts always have zero depth.
            d = DepthMinMax(depth(0), depth(0));
        } else {
            u32 new_idx = g[v_new].index;
            d = temp_depths.at(new_idx);
        }
    }

    return depths;
}

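/** \brief Returns true if the first match of \p p by end offset is guaranteed
 * to also be the first match by start offset, i.e. no match of \p p is a
 * proper infix of another match.
 */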
bool firstMatchIsFirst(const NGHolder &p) {
    /* If the first match (by end offset) is not the first match (by start
     * offset) then we can't create a lock after it.
     *
     * Consider: 4009:/(foobar|ob).*bugger/s
     *
     * We don't care about races on the last byte as they can be resolved
     * easily at runtime, as in /(foobar|obar).*hi/
     *
     * It should be obvious that we don't care about one match being a prefix
     * of another, as they share the same start offset.
     *
     * Therefore, the case where we cannot establish that the som does not
     * regress is when there exist s1 and s2 in the language of p such that s2
     * is a proper infix of s1.
     *
     * It is tempting to add the further restriction that there does not exist
     * a prefix of s1 that is in the language of p (as in that case we would
     * presume the lock has already been set). However, we have no way of
     * knowing if the lock can be cleared by some characters, and if so,
     * whether it is still set. TODO: if we knew the lock's escapes, we could
     * verify that the rest of s1 does not clear the lock. (1)
     */

    DEBUG_PRINTF("entry\n");

    /* If there are any big cycles, throw up our hands in despair */
    if (hasBigCycles(p)) {
        DEBUG_PRINTF("fail, big cycles\n");
        return false;
    }

    flat_set<NFAVertex> states;
    /* turn on all states (except starts - avoid suffix matches) */
    /* If we were doing (1) we would also exclude states leading to accepts -
       avoid prefix matches */
    for (auto v : vertices_range(p)) {
        assert(!is_virtual_start(v, p));
        if (!is_special(v, p)) {
            DEBUG_PRINTF("turning on %zu\n", p[v].index);
            states.insert(v);
        }
    }

    /* run the prefix through the main graph */
    states = execute_graph(p, p, states);

    for (auto v : states) {
        /* We need to check whether this vertex may represent an infix match,
         * i.e. it does not have an edge to accept. */
        DEBUG_PRINTF("check %zu\n", p[v].index);
        if (!edge(v, p.accept, p).second) {
            DEBUG_PRINTF("fail %zu\n", p[v].index);
            return false;
        }
    }

    DEBUG_PRINTF("done first is first check\n");
    return true;
}

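/** \brief Conservatively determine whether the start of match may regress at
 * vertex \p u, i.e. whether some match of the graph up to \p u contains a
 * proper infix that is also a match up to \p u.
 *
 * Results are memoised in \p cache; may return true in cases it cannot prove
 * to be safe.
 */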
bool somMayGoBackwards(NFAVertex u, const NGHolder &g,
                       const unordered_map<NFAVertex, u32> &region_map,
                       smgb_cache &cache) {
    /* We need to ensure that no match of the graph g up to u contains an
     * infix which is also a match of the graph up to u.
     *
     * This is basically the same check as firstMatchIsFirst, except that g is
     * not always a DAG. As we haven't gotten around to writing an
     * execute_graph that operates on general graphs, we take some (hopefully)
     * conservative shortcuts.
     *
     * Note: if u can be jumped, we will take jump edges into account as a
     * possibility of som going backwards.
     *
     * TODO: write a generalised ng_execute_graph/make this less hacky
     */
    assert(&g == &cache.g);
    if (contains(cache.smgb, u)) {
        return cache.smgb[u];
    }

    DEBUG_PRINTF("checking if som can go backwards on %zu\n", g[u].index);

    set<NFAEdge> be;
    BackEdges<set<NFAEdge>> backEdgeVisitor(be);
    boost::depth_first_search(g, visitor(backEdgeVisitor).root_vertex(g.start));

    bool rv;
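    /* The if (0) block below is only reachable via the "exit" label: every
     * code path that computes rv jumps here so the result is stored in the
     * cache before returning. */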
    if (0) {
    exit:
        DEBUG_PRINTF("using cached result\n");
        cache.smgb[u] = rv;
        return rv;
    }

    assert(contains(region_map, u));
    const u32 u_region = region_map.at(u);

    for (const auto &e : be) {
        NFAVertex s = source(e, g);
        NFAVertex t = target(e, g);
        /* only need to worry about big cycles including/before u */
        DEBUG_PRINTF("back edge %zu %zu\n", g[s].index, g[t].index);
        if (s != t && region_map.at(s) <= u_region) {
            DEBUG_PRINTF("eek big cycle\n");
            rv = true; /* big cycle -> eek */
            goto exit;
        }
    }

    unordered_map<NFAVertex, NFAVertex> orig_to_copy;
    NGHolder c_g;
    cloneHolder(c_g, g, &orig_to_copy);

    /* treat virtual starts as unconditional - wire to startDs instead */
    for (NFAVertex v : vertices_range(g)) {
        if (!is_virtual_start(v, g)) {
            continue;
        }
        NFAVertex c_v = orig_to_copy[v];
        orig_to_copy[v] = c_g.startDs;
        for (NFAVertex c_w : adjacent_vertices_range(c_v, c_g)) {
            add_edge_if_not_present(c_g.startDs, c_w, c_g);
        }
        clear_vertex(c_v, c_g);
    }

    /* treat u as the only accept state */
    NFAVertex c_u = orig_to_copy[u];
    clear_in_edges(c_g.acceptEod, c_g);
    add_edge(c_g.accept, c_g.acceptEod, c_g);
    clear_in_edges(c_g.accept, c_g);
    clear_out_edges(c_u, c_g);
    if (hasSelfLoop(u, g)) {
        add_edge(c_u, c_u, c_g);
    }
    add_edge(c_u, c_g.accept, c_g);

    set<NFAVertex> u_succ;
    insert(&u_succ, adjacent_vertices(u, g));
    u_succ.erase(u);

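    /* If a predecessor t of u also has an edge to a successor of u, then u
     * can be "jumped": wire t to accept in the copy so that such paths are
     * taken into account as well. */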
    for (auto t : inv_adjacent_vertices_range(u, g)) {
        if (t == u) {
            continue;
        }
        for (auto v : adjacent_vertices_range(t, g)) {
            if (contains(u_succ, v)) {
                /* due to virtual starts being aliased with normal starts in
                 * the copy of the graph, we may have already added the
                 * edges. */
                add_edge_if_not_present(orig_to_copy[t], c_g.accept, c_g);
                break;
            }
        }
    }

    pruneUseless(c_g);

    be.clear();
    boost::depth_first_search(c_g, visitor(backEdgeVisitor)
                                       .root_vertex(c_g.start));

    for (const auto &e : be) {
        NFAVertex s = source(e, c_g);
        NFAVertex t = target(e, c_g);
        DEBUG_PRINTF("back edge %zu %zu\n", c_g[s].index, c_g[t].index);
        if (s != t) {
            assert(0);
            DEBUG_PRINTF("eek big cycle\n");
            rv = true; /* big cycle -> eek */
            goto exit;
        }
    }

    DEBUG_PRINTF("checking acyclic+selfloop graph\n");

    rv = !firstMatchIsFirst(c_g);
    DEBUG_PRINTF("som may regress? %d\n", (int)rv);
    goto exit;
}

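/** \brief Returns true if a subsequent match of the prefix \p sent always
 * clears all states in the tail regions of \p g (those after
 * \p last_head_region), so it suffices to track only the last prefix match.
 * On failure, the first offending region is written to \p bad_region.
 */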
bool sentClearsTail(const NGHolder &g,
                    const unordered_map<NFAVertex, u32> &region_map,
                    const NGHolder &sent, u32 last_head_region,
                    u32 *bad_region) {
    /* If a subsequent match from the prefix clears the rest of the pattern,
     * we can just keep track of the last match of the prefix.
     * To see if this property holds, we could:
     *
     * 1A: turn on all states in the tail and run all strings that may match
     * the prefix past the tail; if we are still in any states then this
     * property does not hold.
     *
     * 1B: turn on the initial states of the tail and run any strings which
     * may finish any partial matches in the prefix and see if we end up with
     * anything, which would also imply that this property does not hold.
     *
     * OR
     *
     * 2: just turn everything on and run the prefix inputs past it and see
     * what we are left with. I think that is equivalent to scheme 1 and is
     * easier to implement. TODO: ponder
     *
     * Anyway, we are going with scheme 2 until further notice.
     */

    u32 first_bad_region = ~0U;
    flat_set<NFAVertex> states;
    /* turn on all states */
    DEBUG_PRINTF("region %u is cutover\n", last_head_region);
    for (auto v : vertices_range(g)) {
        if (v != g.accept && v != g.acceptEod) {
            states.insert(v);
        }
    }

    for (UNUSED auto v : states) {
        DEBUG_PRINTF("start state: %zu\n", g[v].index);
    }

    /* run the prefix through the main graph */
    states = execute_graph(g, sent, states);

    /* ... and check if we are left with anything in the tail region */
    for (auto v : states) {
        if (v == g.start || v == g.startDs) {
            continue; /* not in tail */
        }

        DEBUG_PRINTF("v %zu is still on\n", g[v].index);
        assert(v != g.accept && v != g.acceptEod); /* no cr */

        assert(contains(region_map, v));
        const u32 v_region = region_map.at(v);
        if (v_region > last_head_region) {
            DEBUG_PRINTF("bailing, %u > %u\n", v_region, last_head_region);
            first_bad_region = min(first_bad_region, v_region);
        }
    }

    if (first_bad_region != ~0U) {
        DEBUG_PRINTF("first bad region is %u\n", first_bad_region);
        *bad_region = first_bad_region;
        return false;
    }

    return true;
}

} // namespace ue2