ng_execute.cpp source code [ClickHouse/contrib/hyperscan/src/nfagraph/ng_execute.cpp]

1	/*
2	* Copyright (c) 2015-2016, Intel Corporation
3	*
4	* Redistribution and use in source and binary forms, with or without
5	* modification, are permitted provided that the following conditions are met:
6	*
7	* * Redistributions of source code must retain the above copyright notice,
8	* this list of conditions and the following disclaimer.
9	* * Redistributions in binary form must reproduce the above copyright
10	* notice, this list of conditions and the following disclaimer in the
11	* documentation and/or other materials provided with the distribution.
12	* * Neither the name of Intel Corporation nor the names of its contributors
13	* may be used to endorse or promote products derived from this software
14	* without specific prior written permission.
15	*
16	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19	* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20	* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26	* POSSIBILITY OF SUCH DAMAGE.
27	*/
28
29	/* \file*
30	* \brief Execute an NFA over a given input, returning the set of states that
31	* are active afterwards.
32	*
33	* Note: although our external interfaces for execute_graph() use std::set, we
34	* use a dynamic bitset containing the vertex indices internally for
35	* performance.
36	*/
37	#include "ng_execute.h"
38
39	#include "ng_holder.h"
40	#include "ng_util.h"
41	#include "ue2common.h"
42	#include "util/container.h"
43	#include "util/dump_charclass.h"
44	#include "util/graph_range.h"
45	#include "util/ue2string.h"
46
47	#include <sstream>
48	#include <string>
49
50	#include <boost/dynamic_bitset.hpp>
51	#include <boost/graph/depth_first_search.hpp>
52	#include <boost/graph/reverse_graph.hpp>
53
54	using namespace std;
55	using boost::dynamic_bitset;
56
57	namespace ue2 {
58
59	struct StateInfo {
60	StateInfo(NFAVertex v, const CharReach &cr) : vertex (v), reach (cr) {}
61	StateInfo() : vertex (NGHolder::null_vertex()) {}
62	NFAVertex vertex;
63	CharReach reach;
64	};
65
66	#ifdef DEBUG
67	static
68	std::string dumpStates(const dynamic_bitset<> &s) {
69	std::ostringstream oss;
70	for (size_t i = s.find_first(); i != s.npos; i = s.find_next(i)) {
71	oss << i << " ";
72	}
73	return oss.str();
74	}
75	#endif
76
77	static
78	void step(const NGHolder &g, const vector<StateInfo> &info,
79	const dynamic_bitset<> &in, dynamic_bitset<> *out) {
80	out->reset();
81	for (size_t i = in.find_first(); i != in.npos; i = in.find_next(i)) {
82	NFAVertex u = info [i].vertex;
83	for (auto v : adjacent_vertices_range(u, g)) {
84	out->set(g [v].index);
85	}
86	}
87	}
88
89	static
90	void filter_by_reach(const vector<StateInfo> &info, dynamic_bitset<> *states,
91	const CharReach &cr) {
92	for (size_t i = states->find_first(); i != states->npos;
93	i = states->find_next(i)) {
94	if ((info [i].reach & cr).none()) {
95	states->reset(i);
96	}
97	}
98	}
99
100	template<typename inputT>
101	static
102	void execute_graph_i(const NGHolder &g, const vector<StateInfo> &info,
103	const inputT &input, dynamic_bitset<> *states,
104	bool kill_sds) {
105	dynamic_bitset<> &curr = *states;
106	dynamic_bitset<> next(curr.size());
107	DEBUG_PRINTF("%zu states in\n", states->count());
108
109	for (const auto &e : input) {
110	DEBUG_PRINTF("processing %s\n", describeClass(e).c_str());
111	step(g, info, curr, &next);
112	if (kill_sds) {
113	next.reset(NODE_START_DOTSTAR);
114	}
115	filter_by_reach(info, &next, e);
116	next.swap(curr);
117
118	if (curr.empty()) {
119	DEBUG_PRINTF("went dead\n");
120	break;
121	}
122	}
123
124	DEBUG_PRINTF("%zu states out\n", states->size());
125	}
126
127	static
128	dynamic_bitset<> makeStateBitset(const NGHolder &g,
129	const flat_set<NFAVertex> &in) {
130	dynamic_bitset<> work_states(num_vertices(g));
131	for (const auto &v : in) {
132	u32 idx = g [v].index;
133	work_states.set(idx);
134	}
135	return work_states;
136	}
137
138	static
139	flat_set<NFAVertex> getVertices(const dynamic_bitset<> &in,
140	const vector<StateInfo> &info) {
141	flat_set<NFAVertex> out;
142	for (size_t i = in.find_first(); i != in.npos; i = in.find_next(i)) {
143	out.insert(info [i].vertex);
144	}
145	return out;
146	}
147
148	static
149	vector<StateInfo> makeInfoTable(const NGHolder &g) {
150	vector<StateInfo> info(num_vertices(g));
151	for (auto v : vertices_range(g)) {
152	u32 idx = g [v].index;
153	const CharReach &cr = g [v].char_reach;
154	assert(idx < info.size());
155	info [idx] = StateInfo (v, cr);
156	}
157	return info;
158	}
159
160	flat_set<NFAVertex> execute_graph(const NGHolder &g, const ue2_literal &input,
161	const flat_set<NFAVertex> &initial_states,
162	bool kill_sds) {
163	assert(hasCorrectlyNumberedVertices(g));
164
165	auto info = makeInfoTable(g);
166	auto work_states = makeStateBitset(g, initial_states);
167
168	execute_graph_i(g, info, input, &work_states, kill_sds);
169
170	return getVertices(work_states, info);
171	}
172
173	flat_set<NFAVertex> execute_graph(const NGHolder &g,
174	const vector<CharReach> &input,
175	const flat_set<NFAVertex> &initial_states) {
176	assert(hasCorrectlyNumberedVertices(g));
177
178	auto info = makeInfoTable(g);
179	auto work_states = makeStateBitset(g, initial_states);
180
181	execute_graph_i(g, info, input, &work_states, false);
182
183	return getVertices(work_states, info);
184	}
185
186	namespace {
187	class eg_visitor : public boost::default_dfs_visitor {
188	public:
189	eg_visitor(const NGHolder &running_g_in, const vector<StateInfo> &info_in,
190	const NGHolder &input_g_in,
191	map<NFAVertex, dynamic_bitset<> > &states_in)
192	: vertex_count(num_vertices(running_g_in)), running_g(running_g_in),
193	info(info_in), input_g(input_g_in), states(states_in),
194	succs (vertex_count) {}
195
196	void finish_vertex(NFAVertex input_v,
197	const boost::reverse_graph<NGHolder, const NGHolder &> &) {
198	if (input_v == input_g.accept) {
199	return;
200	}
201	assert(input_v != input_g.acceptEod);
202
203	DEBUG_PRINTF("finished p%zu\n", input_g[input_v].index);
204
205	/ finish vertex is called on vertex --> implies that all its parents*
206	* (in the forward graph) are also finished. Our parents will have
207	* pushed all of their successors for us into our stateset. */
208	states [input_v].resize(vertex_count);
209	dynamic_bitset<> our_states = states [input_v];
210	states [input_v].reset();
211
212	filter_by_reach(info, &our_states,
213	input_g [input_v].char_reach);
214
215	if (input_v != input_g.startDs &&
216	edge(input_v, input_v, input_g).second) {
217	bool changed;
218	do {
219	DEBUG_PRINTF("actually not finished -> have self loop\n");
220	succs.reset();
221	step(running_g, info, our_states, &succs);
222	filter_by_reach(info, &succs,
223	input_g [input_v].char_reach);
224	dynamic_bitset<> our_states2 = our_states \| succs;
225	changed = our_states2 != our_states;
226	our_states.swap(our_states2);
227	} while (changed);
228	}
229
230	DEBUG_PRINTF(" active rstates: %s\n", dumpStates(our_states).c_str());
231
232	succs.reset();
233	step(running_g, info, our_states, &succs);
234
235	/ we need to push into all our (forward) children their successors*
236	* from us. */
237	for (auto v : adjacent_vertices_range(input_v, input_g)) {
238	DEBUG_PRINTF("pushing our states to pstate %zu\n",
239	input_g[v].index);
240	if (v == input_g.startDs) {
241	/ no need for intra start edges /
242	continue;
243	}
244
245	states [v].resize(vertex_count); // May not yet exist
246
247	if (v != input_g.accept) {
248	states [v] \|= succs;
249	} else {
250	/ accept is a magical pseudo state which does not consume*
251	* characters and we are using to collect the output states. We
252	* must fill it with our states rather than our succs. */
253	DEBUG_PRINTF("prev outputted rstates: %s\n",
254	dumpStates(states[v]).c_str());
255	DEBUG_PRINTF("outputted rstates: %s\n",
256	dumpStates(our_states).c_str());
257
258	states [v] \|= our_states;
259
260	DEBUG_PRINTF("new outputted rstates: %s\n",
261	dumpStates(states[v]).c_str());
262	}
263	}
264
265	/ note: the states at this vertex are no longer required /
266	}
267
268	private:
269	const size_t vertex_count;
270	const NGHolder &running_g;
271	const vector<StateInfo> &info;
272	const NGHolder &input_g;
273	map<NFAVertex, dynamic_bitset<> > &states; / vertex in input_g -> set of*
274	states in running_g /*
275	dynamic_bitset<> succs; // temp use internally
276	};
277	} // namespace
278
279	flat_set<NFAVertex> execute_graph(const NGHolder &running_g,
280	const NGHolder &input_dag,
281	const flat_set<NFAVertex> &input_start_states,
282	const flat_set<NFAVertex> &initial_states) {
283	DEBUG_PRINTF("g has %zu vertices, input_dag has %zu vertices\n",
284	num_vertices(running_g), num_vertices(input_dag));
285	assert(hasCorrectlyNumberedVertices(running_g));
286	assert(in_degree(input_dag.acceptEod, input_dag) == `1`);
287
288	map<NFAVertex, boost::default_color_type> colours;
289	/ could just a topo order, but really it is time to pull a slightly bigger*
290	* gun: DFS */
291	boost::reverse_graph<NGHolder, const NGHolder &> revg(input_dag);
292	map<NFAVertex, dynamic_bitset<> > dfs_states;
293
294	auto info = makeInfoTable(running_g);
295	auto input_fs = makeStateBitset(running_g, initial_states);
296
297	for (auto v : input_start_states) {
298	dfs_states [v] = input_fs;
299	}
300
301	depth_first_visit(revg, input_dag.accept,
302	eg_visitor (running_g, info, input_dag, dfs_states),
303	make_assoc_property_map(colours));
304
305	auto states = getVertices(dfs_states [input_dag.accept], info);
306
307	#ifdef DEBUG
308	DEBUG_PRINTF(" output rstates:");
309	for (const auto &v : states) {
310	printf(" %zu", running_g[v].index);
311	}
312	printf("\n");
313	#endif
314
315	return states;
316	}
317
318	flat_set<NFAVertex> execute_graph(const NGHolder &running_g,
319	const NGHolder &input_dag,
320	const flat_set<NFAVertex> &initial_states) {
321	auto input_start_states = {input_dag.start, input_dag.startDs};
322	return execute_graph(running_g, input_dag, input_start_states,
323	initial_states);
324	}
325
326	static
327	bool can_die_early(const NGHolder &g, const vector<StateInfo> &info,
328	const dynamic_bitset<> &s,
329	map<dynamic_bitset<>, u32> &visited, u32 age_limit) {
330	if (contains(visited, s) && visited [s] >= age_limit) {
331	/ we have already (or are in the process) of visiting here with a*
332	* looser limit. */
333	return false;
334	}
335	visited [s] = age_limit;
336
337	if (s.none()) {
338	DEBUG_PRINTF("dead\n");
339	return true;
340	}
341
342	if (age_limit == `0`) {
343	return false;
344	}
345
346	dynamic_bitset<> all_succ(s.size());
347	step(g, info, s, &all_succ);
348	all_succ.reset(NODE_START_DOTSTAR);
349
350	for (u32 i = `0`; i < N_CHARS; i++) {
351	dynamic_bitset<> next = all_succ;
352	filter_by_reach(info, &next, CharReach (i));
353	if (can_die_early(g, info, next, visited, age_limit - `1`)) {
354	return true;
355	}
356	}
357
358	return false;
359	}
360
361	bool can_die_early(const NGHolder &g, u32 age_limit) {
362	if (proper_out_degree(g.startDs, g)) {
363	return false;
364	}
365	const vector<StateInfo> &info = makeInfoTable(g);
366	map<dynamic_bitset<>, u32> visited;
367	return can_die_early(g, info, makeStateBitset(g, {g.start}), visited,
368	age_limit);
369	}
370
371	} // namespace ue2
372

Browse the source code of ClickHouse/contrib/hyperscan/src/nfagraph/ng_execute.cpp