| 1 | /* |
| 2 | * Copyright (c) 2015-2017, Intel Corporation |
| 3 | * |
| 4 | * Redistribution and use in source and binary forms, with or without |
| 5 | * modification, are permitted provided that the following conditions are met: |
| 6 | * |
| 7 | * * Redistributions of source code must retain the above copyright notice, |
| 8 | * this list of conditions and the following disclaimer. |
| 9 | * * Redistributions in binary form must reproduce the above copyright |
| 10 | * notice, this list of conditions and the following disclaimer in the |
| 11 | * documentation and/or other materials provided with the distribution. |
| 12 | * * Neither the name of Intel Corporation nor the names of its contributors |
| 13 | * may be used to endorse or promote products derived from this software |
| 14 | * without specific prior written permission. |
| 15 | * |
| 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| 17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| 20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| 24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| 25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| 26 | * POSSIBILITY OF SUCH DAMAGE. |
| 27 | */ |
| 28 | |
| 29 | /** \file |
| 30 | * \brief Utility functions related to SOM ("Start of Match"). |
| 31 | */ |
| 32 | #include "ng_som_util.h" |
| 33 | |
| 34 | #include "ng_depth.h" |
| 35 | #include "ng_execute.h" |
| 36 | #include "ng_holder.h" |
| 37 | #include "ng_prune.h" |
| 38 | #include "ng_util.h" |
| 39 | #include "util/container.h" |
| 40 | #include "util/graph_range.h" |
| 41 | |
| 42 | using namespace std; |
| 43 | |
| 44 | namespace ue2 { |
| 45 | |
| 46 | static |
| 47 | void wireSuccessorsToStart(NGHolder &g, NFAVertex u) { |
| 48 | for (auto v : adjacent_vertices_range(u, g)) { |
| 49 | add_edge_if_not_present(g.start, v, g); |
| 50 | } |
| 51 | } |
| 52 | |
| 53 | vector<DepthMinMax> getDistancesFromSOM(const NGHolder &g_orig) { |
| 54 | // We operate on a temporary copy of the original graph here, so we don't |
| 55 | // have to mutate the original. |
| 56 | NGHolder g; |
| 57 | unordered_map<NFAVertex, NFAVertex> vmap; // vertex in g_orig to vertex in g |
| 58 | cloneHolder(g, g_orig, &vmap); |
| 59 | |
| 60 | vector<NFAVertex> vstarts; |
| 61 | for (auto v : vertices_range(g)) { |
| 62 | if (is_virtual_start(v, g)) { |
| 63 | vstarts.push_back(v); |
| 64 | } |
| 65 | } |
| 66 | vstarts.push_back(g.startDs); |
| 67 | |
| 68 | // wire the successors of every virtual start or startDs to g.start. |
| 69 | for (auto v : vstarts) { |
| 70 | wireSuccessorsToStart(g, v); |
| 71 | } |
| 72 | |
| 73 | // drop the in-edges of every virtual start so that they don't participate |
| 74 | // in the depth calculation. |
| 75 | for (auto v : vstarts) { |
| 76 | clear_in_edges(v, g); |
| 77 | } |
| 78 | |
| 79 | //dumpGraph("som_depth.dot", g); |
| 80 | |
| 81 | // Find depths, indexed by vertex index in g |
| 82 | auto temp_depths = calcDepthsFrom(g, g.start); |
| 83 | |
| 84 | // Transfer depths, indexed by vertex index in g_orig. |
| 85 | vector<DepthMinMax> depths(num_vertices(g_orig)); |
| 86 | |
| 87 | for (auto v_orig : vertices_range(g_orig)) { |
| 88 | assert(contains(vmap, v_orig)); |
| 89 | NFAVertex v_new = vmap[v_orig]; |
| 90 | |
| 91 | u32 orig_idx = g_orig[v_orig].index; |
| 92 | |
| 93 | DepthMinMax &d = depths.at(orig_idx); |
| 94 | |
| 95 | if (v_orig == g_orig.startDs || is_virtual_start(v_orig, g_orig)) { |
| 96 | // StartDs and virtual starts always have zero depth. |
| 97 | d = DepthMinMax(depth(0), depth(0)); |
| 98 | } else { |
| 99 | u32 new_idx = g[v_new].index; |
| 100 | d = temp_depths.at(new_idx); |
| 101 | } |
| 102 | } |
| 103 | |
| 104 | return depths; |
| 105 | } |
| 106 | |
| 107 | bool firstMatchIsFirst(const NGHolder &p) { |
| 108 | /* If the first match (by end offset) is not the first match (by start |
| 109 | * offset) then we can't create a lock after it. |
| 110 | * |
| 111 | * Consider: 4009:/(foobar|ob).*bugger/s |
| 112 | * |
| 113 | * We don't care about races on the last byte as they can be resolved easily |
| 114 | * at runtime /(foobar|obar).*hi/ |
| 115 | * |
| 116 | * It should be obvious we don't care about one match being a prefix |
| 117 | * of another as they share the same start offset. |
| 118 | * |
| 119 | * Therefore, the case were we cannot establish that the som does not |
| 120 | * regress is when there exists s1 and s2 in the language of p and s2 is a |
| 121 | * proper infix of s1. |
| 122 | * |
| 123 | * It is tempting to add the further restriction that there does not exist a |
| 124 | * prefix of s1 that is in the language of p (as in which case we would |
| 125 | * presume, the lock has already been set). However, we have no way of |
| 126 | * knowing if the lock can be cleared by some characters, and if so, if it |
| 127 | * is still set. TODO: if we knew the lock's escapes where we could verify |
| 128 | * that the rest of s1 does not clear the lock. (1) |
| 129 | */ |
| 130 | |
| 131 | DEBUG_PRINTF("entry\n" ); |
| 132 | |
| 133 | /* If there are any big cycles throw up our hands in despair */ |
| 134 | if (hasBigCycles(p)) { |
| 135 | DEBUG_PRINTF("fail, big cycles\n" ); |
| 136 | return false; |
| 137 | } |
| 138 | |
| 139 | flat_set<NFAVertex> states; |
| 140 | /* turn on all states (except starts - avoid suffix matches) */ |
| 141 | /* If we were doing (1) we would also except states leading to accepts - |
| 142 | avoid prefix matches */ |
| 143 | for (auto v : vertices_range(p)) { |
| 144 | assert(!is_virtual_start(v, p)); |
| 145 | if (!is_special(v, p)) { |
| 146 | DEBUG_PRINTF("turning on %zu\n" , p[v].index); |
| 147 | states.insert(v); |
| 148 | } |
| 149 | } |
| 150 | |
| 151 | /* run the prefix the main graph */ |
| 152 | states = execute_graph(p, p, states); |
| 153 | |
| 154 | for (auto v : states) { |
| 155 | /* need to check if this vertex may represent an infix match - ie |
| 156 | * it does not have an edge to accept. */ |
| 157 | DEBUG_PRINTF("check %zu\n" , p[v].index); |
| 158 | if (!edge(v, p.accept, p).second) { |
| 159 | DEBUG_PRINTF("fail %zu\n" , p[v].index); |
| 160 | return false; |
| 161 | } |
| 162 | } |
| 163 | |
| 164 | DEBUG_PRINTF("done first is first check\n" ); |
| 165 | return true; |
| 166 | } |
| 167 | |
bool somMayGoBackwards(NFAVertex u, const NGHolder &g,
                       const unordered_map<NFAVertex, u32> &region_map,
                       smgb_cache &cache) {
    /* Need to ensure all matches of the graph g up to u contain no infixes
     * which are also matches of the graph to u.
     *
     * This is basically the same as firstMatchIsFirst except that g is not
     * always a dag. As we haven't gotten around to writing an execute_graph
     * that operates on general graphs, we take some (hopefully) conservative
     * short cuts.
     *
     * Note: if the u can be jumped we will take jump edges
     * into account as a possibility of som going backwards
     *
     * TODO: write a generalised ng_execute_graph/make this less hacky
     */
    assert(&g == &cache.g);
    if (contains(cache.smgb, u)) {
        return cache.smgb[u];
    }

    DEBUG_PRINTF("checking if som can go backwards on %zu\n" , g[u].index);

    // Collect the back edges of g: each one indicates a cycle.
    set<NFAEdge> be;
    BackEdges<set<NFAEdge>> backEdgeVisitor(be);
    boost::depth_first_search(g, visitor(backEdgeVisitor).root_vertex(g.start));

    bool rv;
    /* The never-executed "if (0)" merely hosts a common exit label: every
     * path that has computed rv jumps here to record it in the cache before
     * returning. (The debug message is misleading: this is where the cache is
     * written, not read.) */
    if (0) {
    exit:
        DEBUG_PRINTF("using cached result\n" );
        cache.smgb[u] = rv;
        return rv;
    }

    assert(contains(region_map, u));
    const u32 u_region = region_map.at(u);

    for (const auto &e : be) {
        NFAVertex s = source(e, g);
        NFAVertex t = target(e, g);
        /* only need to worry about big cycles including/before u */
        DEBUG_PRINTF("back edge %zu %zu\n" , g[s].index, g[t].index);
        // A non-self-loop back edge whose source is in u's region or earlier
        // is a "big" cycle; conservatively assume SOM may regress.
        if (s != t && region_map.at(s) <= u_region) {
            DEBUG_PRINTF("eek big cycle\n" );
            rv = true; /* big cycle -> eek */
            goto exit;
        }
    }

    // Build a private copy of the graph that we can carve up below.
    unordered_map<NFAVertex, NFAVertex> orig_to_copy;
    NGHolder c_g;
    cloneHolder(c_g, g, &orig_to_copy);

    /* treat virtual starts as unconditional - wire to startDs instead */
    for (NFAVertex v : vertices_range(g)) {
        if (!is_virtual_start(v, g)) {
            continue;
        }
        NFAVertex c_v = orig_to_copy[v];
        // Alias the virtual start to startDs in the copy's vertex map, give
        // startDs the virtual start's successors, then disconnect it.
        orig_to_copy[v] = c_g.startDs;
        for (NFAVertex c_w : adjacent_vertices_range(c_v, c_g)) {
            add_edge_if_not_present(c_g.startDs, c_w, c_g);
        }
        clear_vertex(c_v, c_g);
    }

    /* treat u as the only accept state */
    NFAVertex c_u = orig_to_copy[u];
    clear_in_edges(c_g.acceptEod, c_g);
    add_edge(c_g.accept, c_g.acceptEod, c_g);
    clear_in_edges(c_g.accept, c_g);
    clear_out_edges(c_u, c_g);
    // Preserve u's self-loop (if any) so it can continue to match; everything
    // else out of u is replaced by a single edge to accept.
    if (hasSelfLoop(u, g)) {
        add_edge(c_u, c_u, c_g);
    }
    add_edge(c_u, c_g.accept, c_g);

    // Successors of u in the original graph, excluding u itself.
    set<NFAVertex> u_succ;
    insert(&u_succ, adjacent_vertices(u, g));
    u_succ.erase(u);

    // For any predecessor t of u that also reaches a successor of u, a path
    // can bypass u; wire t to accept in the copy so such paths are treated as
    // reaching the match point as well.
    for (auto t : inv_adjacent_vertices_range(u, g)) {
        if (t == u) {
            continue;
        }
        for (auto v : adjacent_vertices_range(t, g)) {
            if (contains(u_succ, v)) {
                /* due to virtual starts being aliased with normal starts in the
                 * copy of the graph, we may have already added the edges. */
                add_edge_if_not_present(orig_to_copy[t], c_g.accept, c_g);
                break;
            }
        }
    }

    pruneUseless(c_g);

    // Re-run the back-edge search on the carved-up copy; after the surgery
    // above, only self-loops should remain. (backEdgeVisitor still writes
    // into be, which we reuse after clearing it.)
    be.clear();
    boost::depth_first_search(c_g, visitor(backEdgeVisitor)
                              .root_vertex(c_g.start));

    for (const auto &e : be) {
        NFAVertex s = source(e, c_g);
        NFAVertex t = target(e, c_g);
        DEBUG_PRINTF("back edge %zu %zu\n" , c_g[s].index, c_g[t].index);
        if (s != t) {
            // Not expected to be reachable; bail conservatively if it is.
            assert(0);
            DEBUG_PRINTF("eek big cycle\n" );
            rv = true; /* big cycle -> eek */
            goto exit;
        }
    }

    DEBUG_PRINTF("checking acyclic+selfloop graph\n" );

    // Copy is now acyclic apart from self-loops, so firstMatchIsFirst's
    // execute_graph-based check is applicable. SOM may regress iff the first
    // match is not first.
    rv = !firstMatchIsFirst(c_g);
    DEBUG_PRINTF("som may regress? %d\n" , (int)rv);
    goto exit;
}
| 288 | |
| 289 | bool sentClearsTail(const NGHolder &g, |
| 290 | const unordered_map<NFAVertex, u32> ®ion_map, |
| 291 | const NGHolder &sent, u32 last_head_region, |
| 292 | u32 *bad_region) { |
| 293 | /* if a subsequent match from the prefix clears the rest of the pattern |
| 294 | * we can just keep track of the last match of the prefix. |
| 295 | * To see if this property holds, we could: |
| 296 | * |
| 297 | * 1A: turn on all states in the tail and run all strings that may |
| 298 | * match the prefix past the tail, if we are still in any states then |
| 299 | * this property does not hold. |
| 300 | * |
| 301 | * 1B: we turn on the initial states of the tail and run any strings which |
| 302 | * may finish any partial matches in the prefix and see if we end up with |
| 303 | * anything which would also imply that this property does not hold. |
| 304 | * |
| 305 | * OR |
| 306 | * |
| 307 | * 2: we just turn everything and run the prefix inputs past it and see what |
| 308 | * we are left with. I think that is equivalent to scheme 1 and is easier to |
| 309 | * implement. TODO: ponder |
| 310 | * |
| 311 | * Anyway, we are going with scheme 2 until further notice. |
| 312 | */ |
| 313 | |
| 314 | u32 first_bad_region = ~0U; |
| 315 | flat_set<NFAVertex> states; |
| 316 | /* turn on all states */ |
| 317 | DEBUG_PRINTF("region %u is cutover\n" , last_head_region); |
| 318 | for (auto v : vertices_range(g)) { |
| 319 | if (v != g.accept && v != g.acceptEod) { |
| 320 | states.insert(v); |
| 321 | } |
| 322 | } |
| 323 | |
| 324 | for (UNUSED auto v : states) { |
| 325 | DEBUG_PRINTF("start state: %zu\n" , g[v].index); |
| 326 | } |
| 327 | |
| 328 | /* run the prefix the main graph */ |
| 329 | states = execute_graph(g, sent, states); |
| 330 | |
| 331 | /* .. and check if we are left with anything in the tail region */ |
| 332 | for (auto v : states) { |
| 333 | if (v == g.start || v == g.startDs) { |
| 334 | continue; /* not in tail */ |
| 335 | } |
| 336 | |
| 337 | DEBUG_PRINTF("v %zu is still on\n" , g[v].index); |
| 338 | assert(v != g.accept && v != g.acceptEod); /* no cr */ |
| 339 | |
| 340 | assert(contains(region_map, v)); |
| 341 | const u32 v_region = region_map.at(v); |
| 342 | if (v_region > last_head_region) { |
| 343 | DEBUG_PRINTF("bailing, %u > %u\n" , v_region, last_head_region); |
| 344 | first_bad_region = min(first_bad_region, v_region); |
| 345 | } |
| 346 | } |
| 347 | |
| 348 | if (first_bad_region != ~0U) { |
| 349 | DEBUG_PRINTF("first bad region is %u\n" , first_bad_region); |
| 350 | *bad_region = first_bad_region; |
| 351 | return false; |
| 352 | } |
| 353 | |
| 354 | return true; |
| 355 | } |
| 356 | |
| 357 | } // namespace ue2 |
| 358 | |