ng_mcclellan.cpp source code [ClickHouse/contrib/hyperscan/src/nfagraph/ng_mcclellan.cpp]

1	/*
2	* Copyright (c) 2015-2018, Intel Corporation
3	*
4	* Redistribution and use in source and binary forms, with or without
5	* modification, are permitted provided that the following conditions are met:
6	*
7	* * Redistributions of source code must retain the above copyright notice,
8	* this list of conditions and the following disclaimer.
9	* * Redistributions in binary form must reproduce the above copyright
10	* notice, this list of conditions and the following disclaimer in the
11	* documentation and/or other materials provided with the distribution.
12	* * Neither the name of Intel Corporation nor the names of its contributors
13	* may be used to endorse or promote products derived from this software
14	* without specific prior written permission.
15	*
16	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19	* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20	* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26	* POSSIBILITY OF SUCH DAMAGE.
27	*/
28
29	/* \file*
30	* \brief Build code for McClellan DFA.
31	*/
32	#include "ng_mcclellan.h"
33
34	#include "grey.h"
35	#include "nfa/dfa_min.h"
36	#include "nfa/rdfa.h"
37	#include "ng_holder.h"
38	#include "ng_mcclellan_internal.h"
39	#include "ng_squash.h"
40	#include "ng_util.h"
41	#include "ue2common.h"
42	#include "util/bitfield.h"
43	#include "util/determinise.h"
44	#include "util/flat_containers.h"
45	#include "util/graph_range.h"
46	#include "util/hash.h"
47	#include "util/hash_dynamic_bitset.h"
48	#include "util/make_unique.h"
49	#include "util/report_manager.h"
50
51	#include <algorithm>
52	#include <functional>
53	#include <map>
54	#include <set>
55	#include <unordered_map>
56	#include <vector>
57
58	#include <boost/dynamic_bitset.hpp>
59
60	using namespace std;
61	using boost::dynamic_bitset;
62
63	namespace ue2 {
64
65	#define FINAL_DFA_STATE_LIMIT 16383
66	#define DFA_STATE_LIMIT 1024
67	#define NFA_STATE_LIMIT 256
68
69	u16 buildAlphabetFromEquivSets(const std::vector<CharReach> &esets,
70	array<u16, ALPHABET_SIZE> &alpha,
71	array<u16, ALPHABET_SIZE> &unalpha) {
72	u16 i = `0`;
73	for (; i < esets.size(); i++) {
74	const CharReach &cr = esets [i];
75
76	#ifdef DEBUG
77	DEBUG_PRINTF("eq set: ");
78	for (size_t s = cr.find_first(); s != CharReach::npos;
79	s = cr.find_next(s)) {
80	printf("%02hhx ", (u8)s);
81	}
82	printf("-> %u\n", i);
83	#endif
84	u16 leader = cr.find_first();
85	for (size_t s = cr.find_first(); s != CharReach::npos;
86	s = cr.find_next(s)) {
87	alpha [s] = i;
88	}
89	unalpha [i] = leader;
90	}
91
92	for (u16 j = N_CHARS; j < ALPHABET_SIZE; j++, i++) {
93	alpha [j] = i;
94	unalpha [i] = j;
95	}
96
97	return i; // alphabet size
98	}
99
100	void calculateAlphabet(const NGHolder &g, array<u16, ALPHABET_SIZE> &alpha,
101	array<u16, ALPHABET_SIZE> &unalpha, u16 *alphasize) {
102	vector<CharReach> esets(`1`, CharReach::dot());
103
104	for (auto v : vertices_range(g)) {
105	if (is_special(v, g)) {
106	continue;
107	}
108
109	const CharReach &cr = g [v].char_reach;
110
111	for (size_t i = `0`; i < esets.size(); i++) {
112	if (esets [i].count() == `1`) {
113	continue;
114	}
115
116	CharReach t = cr & esets [i];
117	if (t.any() && t != esets [i]) {
118	esets [i] &= ~t;
119	esets.push_back(t);
120	}
121	}
122	}
123	// for deterministic compiles
124	sort(esets.begin(), esets.end());
125
126	assert(alphasize);
127	*alphasize = buildAlphabetFromEquivSets(esets, alpha, unalpha);
128	}
129
130	static
131	bool allExternalReports(const ReportManager &rm,
132	const flat_set<ReportID> &reports) {
133	for (auto report_id : reports) {
134	if (!isExternalReport(rm.getReport(report_id))) {
135	return false;
136	}
137	}
138
139	return true;
140	}
141
142	static
143	dstate_id_t successor(const vector<dstate> &dstates, dstate_id_t c,
144	const array<u16, ALPHABET_SIZE> &alpha, symbol_t s) {
145	return dstates [c].next [alpha [s]];
146	}
147
148	void getFullTransitionFromState(const raw_dfa &n, dstate_id_t state,
149	dstate_id_t *out_table) {
150	for (u32 i = `0`; i < ALPHABET_SIZE; i++) {
151	out_table[i] = successor(n.states, state, n.alpha_remap, i);
152	}
153	}
154
155	template<typename stateset>
156	static
157	void populateInit(const NGHolder &g, const flat_set<NFAVertex> &unused,
158	stateset init, stateset init_deep,
159	vector<NFAVertex> *v_by_index) {
160	for (auto v : vertices_range(g)) {
161	if (contains(unused, v)) {
162	continue;
163	}
164
165	u32 vert_id = g [v].index;
166	assert(vert_id < init->size());
167
168	if (is_any_start(v, g)) {
169	init->set(vert_id);
170	if (hasSelfLoop(v, g) \|\| is_triggered(g)) {
171	DEBUG_PRINTF("setting %u\n", vert_id);
172	init_deep->set(vert_id);
173	}
174	}
175	}
176
177	v_by_index->clear();
178	v_by_index->resize(num_vertices(g), NGHolder::null_vertex());
179
180	for (auto v : vertices_range(g)) {
181	u32 vert_id = g [v].index;
182	assert((*v_by_index)[vert_id] == NGHolder::null_vertex());
183	(*v_by_index)[vert_id] = v;
184	}
185
186	if (is_triggered(g)) {
187	init_deep = init;
188	}
189	}
190
191	template<typename StateSet>
192	void populateAccepts(const NGHolder &g, const flat_set<NFAVertex> &unused,
193	StateSet accept, StateSet acceptEod) {
194	for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
195	if (contains(unused, v)) {
196	continue;
197	}
198	accept->set(g [v].index);
199	}
200	for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) {
201	if (v == g.accept) {
202	continue;
203	}
204	if (contains(unused, v)) {
205	continue;
206	}
207	acceptEod->set(g [v].index);
208	}
209	}
210
211	static
212	bool canPruneEdgesFromAccept(const ReportManager &rm, const NGHolder &g) {
213	bool seen = false;
214	u32 ekey = `0`;
215
216	for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
217	if (is_special(v, g)) {
218	continue;
219	}
220
221	for (auto report_id : g [v].reports) {
222	const Report &ir = rm.getReport(report_id);
223
224	if (!isSimpleExhaustible(ir)) {
225	return false;
226	}
227
228	if (!seen) {
229	seen = true;
230	ekey = ir.ekey;
231	} else if (ekey != ir.ekey) {
232	return false;
233	}
234	}
235	}
236
237	/ need to check accept eod does not have any unseen reports as well /
238	for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) {
239	if (is_special(v, g)) {
240	continue;
241	}
242
243	for (auto report_id : g [v].reports) {
244	const Report &ir = rm.getReport(report_id);
245
246	if (!isSimpleExhaustible(ir)) {
247	return false;
248	}
249
250	if (!seen) {
251	seen = true;
252	ekey = ir.ekey;
253	} else if (ekey != ir.ekey) {
254	return false;
255	}
256	}
257	}
258
259	return true;
260	}
261
262	static
263	bool overhangMatchesTrigger(const vector<vector<CharReach> > &all_triggers,
264	vector<CharReach>::const_reverse_iterator itb,
265	vector<CharReach>::const_reverse_iterator ite) {
266	for (const auto &trigger : all_triggers) {
267	vector<CharReach>::const_reverse_iterator it = itb;
268	vector<CharReach>::const_reverse_iterator kt = trigger.rbegin();
269	for (; it != ite && kt != trigger.rend(); ++it, ++kt) {
270	if ((it & kt).none()) {
271	/ this trigger does not match the overhang, try next /
272	goto try_next_trigger;
273	}
274	}
275
276	return true;
277	try_next_trigger:;
278	}
279
280	return false; / no trigger matches the over hang /
281	}
282
283	static
284	bool triggerAllowed(const NGHolder &g, const NFAVertex v,
285	const vector<vector<CharReach> > &all_triggers,
286	const vector<CharReach> &trigger) {
287	flat_set<NFAVertex> curr({v});
288	flat_set<NFAVertex> next;
289
290	for (auto it = trigger.rbegin(); it != trigger.rend(); ++it) {
291	next.clear();
292
293	for (auto u : curr) {
294	assert(u != g.startDs); / triggered graphs should not use sds /
295	if (u == g.start) {
296	if (overhangMatchesTrigger(all_triggers, it, trigger.rend())) {
297	return true;
298	}
299	continue;
300	}
301
302	if ((g [u].char_reach & *it).none()) {
303	continue;
304	}
305	insert(&next, inv_adjacent_vertices(u, g));
306	}
307
308	if (next.empty()) {
309	return false;
310	}
311
312	next.swap(curr);
313	}
314
315	return true;
316	}
317
318	void markToppableStarts(const NGHolder &g, const flat_set<NFAVertex> &unused,
319	bool single_trigger,
320	const vector<vector<CharReach>> &triggers,
321	dynamic_bitset<> *out) {
322	if (single_trigger) {
323	return; / no live states can lead to new states /
324	}
325
326	for (auto v : vertices_range(g)) {
327	if (contains(unused, v)) {
328	continue;
329	}
330	for (const auto &trigger : triggers) {
331	if (triggerAllowed(g, v, triggers, trigger)) {
332	DEBUG_PRINTF("idx %zu is valid location for top\n", g[v].index);
333	out->set(g [v].index);
334	break;
335	}
336	}
337	}
338
339	assert(out->test(g[g.start].index));
340	}
341
342	namespace {
343
344	template<typename Automaton_Traits>
345	class Automaton_Base {
346	public:
347	using StateSet = typename Automaton_Traits::StateSet;
348	using StateMap = typename Automaton_Traits::StateMap;
349
350	Automaton_Base(const ReportManager rm_in, const* NGHolder &graph_in,
351	bool single_trigger,
352	const vector<vector<CharReach>> &triggers, bool prunable_in)
353	: rm(rm_in), graph(graph_in), numStates(num_vertices(graph)),
354	unused (getRedundantStarts(graph_in)),
355	init(Automaton_Traits::init_states(numStates)),
356	initDS(Automaton_Traits::init_states(numStates)),
357	squash(Automaton_Traits::init_states(numStates)),
358	accept(Automaton_Traits::init_states(numStates)),
359	acceptEod(Automaton_Traits::init_states(numStates)),
360	toppable(Automaton_Traits::init_states(numStates)),
361	dead(Automaton_Traits::init_states(numStates)),
362	prunable(prunable_in) {
363	populateInit(graph, unused, &init, &initDS, &v_by_index);
364	populateAccepts(graph, unused, &accept, &acceptEod);
365
366	start_anchored = DEAD_STATE + `1`;
367	if (initDS == init) {
368	start_floating = start_anchored;
369	} else if (initDS.any()) {
370	start_floating = start_anchored + `1`;
371	} else {
372	start_floating = DEAD_STATE;
373	}
374
375	calculateAlphabet(graph, alpha, unalpha, &alphasize);
376
377	for (const auto &sq : findSquashers(graph)) {
378	NFAVertex v = sq.first;
379	u32 vert_id = graph[v].index;
380	squash.set(vert_id);
381	squash_mask[vert_id]
382	= Automaton_Traits::copy_states(std::move(sq.second),
383	numStates);
384	}
385
386	cr_by_index = populateCR(graph, v_by_index, alpha);
387	if (is_triggered(graph)) {
388	dynamic_bitset<> temp(numStates);
389	markToppableStarts(graph, unused, single_trigger, triggers,
390	&temp);
391	toppable = Automaton_Traits::copy_states(std::move(temp),
392	numStates);
393	}
394	}
395
396	public:
397	void transition(const StateSet &in, StateSet *next) {
398	transition_graph(*this, v_by_index, in, next);
399	}
400
401	const vector<StateSet> initial() {
402	vector<StateSet> rv = {init};
403	if (start_floating != DEAD_STATE && start_floating != start_anchored) {
404	rv.push_back(initDS);
405	}
406	return rv;
407	}
408
409	private:
410	void reports_i(const StateSet &in, bool eod, flat_set<ReportID> &rv) {
411	StateSet acc = in & (eod ? acceptEod : accept);
412	for (size_t i = acc.find_first(); i != StateSet::npos;
413	i = acc.find_next(i)) {
414	NFAVertex v = v_by_index[i];
415	DEBUG_PRINTF("marking report\n");
416	const auto &my_reports = graph[v].reports;
417	rv.insert(my_reports.begin(), my_reports.end());
418	}
419	}
420
421	public:
422	void reports(const StateSet &in, flat_set<ReportID> &rv) {
423	reports_i(in, false, rv);
424	}
425	void reportsEod(const StateSet &in, flat_set<ReportID> &rv) {
426	reports_i(in, true, rv);
427	}
428
429	bool canPrune(const flat_set<ReportID> &test_reports) const {
430	if (!rm \|\| !prunable \|\| !canPruneEdgesFromAccept(*rm, graph)) {
431	return false;
432	}
433	return allExternalReports(*rm, test_reports);
434	}
435
436	private:
437	const ReportManager *rm;
438	public:
439	const NGHolder &graph;
440	u32 numStates;
441	const flat_set<NFAVertex> unused;
442	vector<NFAVertex> v_by_index;
443	vector<CharReach> cr_by_index; / pre alpha'ed /
444	StateSet init;
445	StateSet initDS;
446	StateSet squash; / states which allow us to mask out other states /
447	StateSet accept;
448	StateSet acceptEod;
449	StateSet toppable; / states which are allowed to be on when a top arrives,*
450	* triggered dfas only */
451	StateSet dead;
452	map<u32, StateSet> squash_mask;
453	bool prunable;
454	array<u16, ALPHABET_SIZE> alpha;
455	array<u16, ALPHABET_SIZE> unalpha;
456	u16 alphasize;
457
458	u16 start_anchored;
459	u16 start_floating;
460	};
461
462	struct Big_Traits {
463	using StateSet = dynamic_bitset<>;
464	using StateMap = unordered_map<StateSet, dstate_id_t, hash_dynamic_bitset>;
465
466	static StateSet init_states(u32 num) {
467	return StateSet (num);
468	}
469
470	static StateSet copy_states(dynamic_bitset<> in, UNUSED u32 num) {
471	assert(in.size() == num);
472	return in;
473	}
474	};
475
476	class Automaton_Big : public Automaton_Base<Big_Traits> {
477	public:
478	Automaton_Big(const ReportManager rm_in, const* NGHolder &graph_in,
479	bool single_trigger,
480	const vector<vector<CharReach>> &triggers, bool prunable_in)
481	: Automaton_Base (rm_in, graph_in, single_trigger, triggers,
482	prunable_in) {}
483	};
484
485	struct Graph_Traits {
486	using StateSet = bitfield<NFA_STATE_LIMIT>;
487	using StateMap = unordered_map<StateSet, dstate_id_t>;
488
489	static StateSet init_states(UNUSED u32 num) {
490	assert(num <= NFA_STATE_LIMIT);
491	return StateSet ();
492	}
493
494	static StateSet copy_states(const dynamic_bitset<> &in, u32 num) {
495	StateSet out = init_states(num);
496	for (size_t i = in.find_first(); i != in.npos && i < out.size();
497	i = in.find_next(i)) {
498	out.set(i);
499	}
500	return out;
501	}
502	};
503
504	class Automaton_Graph : public Automaton_Base<Graph_Traits> {
505	public:
506	Automaton_Graph(const ReportManager rm_in, const* NGHolder &graph_in,
507	bool single_trigger,
508	const vector<vector<CharReach>> &triggers, bool prunable_in)
509	: Automaton_Base (rm_in, graph_in, single_trigger, triggers,
510	prunable_in) {}
511	};
512
513	} // namespace
514
515	static
516	bool startIsRedundant(const NGHolder &g) {
517	set<NFAVertex> start;
518	set<NFAVertex> startDs;
519
520	insert(&start, adjacent_vertices(g.start, g));
521	insert(&startDs, adjacent_vertices(g.startDs, g));
522
523	return start == startDs;
524	}
525
526	flat_set<NFAVertex> getRedundantStarts(const NGHolder &g) {
527	flat_set<NFAVertex> dead;
528	if (startIsRedundant(g)) {
529	dead.insert(g.start);
530	}
531	if (proper_out_degree(g.startDs, g) == `0`) {
532	dead.insert(g.startDs);
533	}
534	return dead;
535	}
536
537	unique_ptr<raw_dfa> buildMcClellan(const NGHolder &graph,
538	const ReportManager rm, bool* single_trigger,
539	const vector<vector<CharReach>> &triggers,
540	const Grey &grey, bool finalChance) {
541	if (!grey.allowMcClellan) {
542	return nullptr;
543	}
544
545	DEBUG_PRINTF("attempting to build %s mcclellan\n",
546	to_string(graph.kind).c_str());
547	assert(allMatchStatesHaveReports(graph));
548
549	bool prunable = grey.highlanderPruneDFA && has_managed_reports(graph);
550	assert(rm \|\| !has_managed_reports(graph));
551	if (!has_managed_reports(graph)) {
552	rm = nullptr;
553	}
554
555	assert(triggers.empty() == !is_triggered(graph));
556
557	/ We must be getting desperate if it is an outfix, so use the final chance*
558	* state limit logic */
559	u32 state_limit
560	= (graph.kind == NFA_OUTFIX \|\| finalChance) ? FINAL_DFA_STATE_LIMIT
561	: DFA_STATE_LIMIT;
562
563	const u32 numStates = num_vertices(graph);
564	DEBUG_PRINTF("determinising nfa with %u vertices\n", numStates);
565
566	if (numStates > FINAL_DFA_STATE_LIMIT) {
567	DEBUG_PRINTF("rejecting nfa as too many vertices\n");
568	return nullptr;
569	}
570
571	auto rdfa = ue2::make_unique<raw_dfa>(graph.kind);
572
573	if (numStates <= NFA_STATE_LIMIT) {
574	/ Fast path. Automaton_Graph uses a bitfield internally to represent*
575	* states and is quicker than Automaton_Big. */
576	Automaton_Graph n(rm, graph, single_trigger, triggers, prunable);
577	if (!determinise(n, rdfa ->states, state_limit)) {
578	DEBUG_PRINTF("state limit exceeded\n");
579	return nullptr; / over state limit /
580	}
581
582	rdfa ->start_anchored = n.start_anchored;
583	rdfa ->start_floating = n.start_floating;
584	rdfa ->alpha_size = n.alphasize;
585	rdfa ->alpha_remap = n.alpha;
586	} else {
587	/ Slow path. Too many states to use Automaton_Graph. /
588	Automaton_Big n(rm, graph, single_trigger, triggers, prunable);
589	if (!determinise(n, rdfa ->states, state_limit)) {
590	DEBUG_PRINTF("state limit exceeded\n");
591	return nullptr; / over state limit /
592	}
593
594	rdfa ->start_anchored = n.start_anchored;
595	rdfa ->start_floating = n.start_floating;
596	rdfa ->alpha_size = n.alphasize;
597	rdfa ->alpha_remap = n.alpha;
598	}
599
600	minimize_hopcroft(*rdfa, grey);
601
602	DEBUG_PRINTF("after determinised into %zu states, building impl dfa "
603	"(a,f) = (%hu,%hu)\n", rdfa->states.size(),
604	rdfa->start_anchored, rdfa->start_floating);
605
606	return rdfa;
607	}
608
609	unique_ptr<raw_dfa> buildMcClellan(const NGHolder &g, const ReportManager *rm,
610	const Grey &grey) {
611	assert(!is_triggered(g));
612	vector<vector<CharReach>> triggers;
613	return buildMcClellan(g, rm, false, triggers, grey);
614	}
615
616	} // namespace ue2
617

Browse the source code of ClickHouse/contrib/hyperscan/src/nfagraph/ng_mcclellan.cpp