rdfa_merge.cpp source code [ClickHouse/contrib/hyperscan/src/nfa/rdfa_merge.cpp]

1	/*
2	* Copyright (c) 2015-2017, Intel Corporation
3	*
4	* Redistribution and use in source and binary forms, with or without
5	* modification, are permitted provided that the following conditions are met:
6	*
7	* * Redistributions of source code must retain the above copyright notice,
8	* this list of conditions and the following disclaimer.
9	* * Redistributions in binary form must reproduce the above copyright
10	* notice, this list of conditions and the following disclaimer in the
11	* documentation and/or other materials provided with the distribution.
12	* * Neither the name of Intel Corporation nor the names of its contributors
13	* may be used to endorse or promote products derived from this software
14	* without specific prior written permission.
15	*
16	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19	* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20	* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26	* POSSIBILITY OF SUCH DAMAGE.
27	*/
28
29	#include "rdfa_merge.h"
30
31	#include "grey.h"
32	#include "dfa_min.h"
33	#include "mcclellancompile_util.h"
34	#include "rdfa.h"
35	#include "ue2common.h"
36	#include "nfagraph/ng_mcclellan_internal.h"
37	#include "util/container.h"
38	#include "util/determinise.h"
39	#include "util/flat_containers.h"
40	#include "util/make_unique.h"
41	#include "util/report_manager.h"
42	#include "util/unordered.h"
43
44	#include <algorithm>
45	#include <queue>
46
47	using namespace std;
48
49	namespace ue2 {
50
51	#define MAX_DFA_STATES 16383
52
53	namespace {
54
55	class Automaton_Merge {
56	public:
57	using StateSet = vector<u16>;
58	using StateMap = ue2_unordered_map<StateSet, dstate_id_t>;
59
60	Automaton_Merge(const raw_dfa rdfa1, const* raw_dfa *rdfa2,
61	const ReportManager rm_in, const* Grey &grey_in)
62	: rm(rm_in), grey(grey_in), nfas {rdfa1, rdfa2}, dead (`2`) {
63	calculateAlphabet();
64	populateAsFs();
65	prunable = isPrunable();
66	}
67
68	Automaton_Merge(const vector<const raw_dfa *> &dfas,
69	const ReportManager rm_in, const* Grey &grey_in)
70	: rm(rm_in), grey(grey_in), nfas (dfas), dead (nfas.size()) {
71	calculateAlphabet();
72	populateAsFs();
73	prunable = isPrunable();
74	}
75
76	void populateAsFs(void) {
77	bool fs_same = true;
78	bool fs_dead = true;
79
80	as.resize(nfas.size());
81	fs.resize(nfas.size());
82	for (size_t i = `0`, end = nfas.size(); i < end; i++) {
83	as [i] = nfas [i]->start_anchored;
84	fs [i] = nfas [i]->start_floating;
85
86	if (fs [i]) {
87	fs_dead = false;
88	}
89
90	if (as [i] != fs [i]) {
91	fs_same = false;
92	}
93	}
94
95	start_anchored = DEAD_STATE + `1`;
96	if (fs_same) {
97	start_floating = start_anchored;
98	} else if (fs_dead) {
99	start_floating = DEAD_STATE;
100	} else {
101	start_floating = start_anchored + `1`;
102	}
103	}
104
105	void calculateAlphabet(void) {
106	DEBUG_PRINTF("calculating alphabet\n");
107	vector<CharReach> esets = {CharReach::dot()};
108
109	for (const auto &rdfa : nfas) {
110	DEBUG_PRINTF("...next dfa alphabet\n");
111	assert(rdfa);
112	const auto &alpha_remap = rdfa->alpha_remap;
113
114	for (size_t i = `0`; i < esets.size(); i++) {
115	assert(esets[i].count());
116	if (esets [i].count() == `1`) {
117	DEBUG_PRINTF("skipping singleton eq set\n");
118	continue;
119	}
120
121	CharReach t;
122	u8 leader_s = alpha_remap [esets [i].find_first()];
123
124	DEBUG_PRINTF("checking eq set, leader %02hhx \n", leader_s);
125
126	for (size_t s = esets [i].find_first(); s != CharReach::npos;
127	s = esets [i].find_next(s)) {
128	if (alpha_remap [s] != leader_s) {
129	t.set(s);
130	}
131	}
132
133	if (t.any() && t != esets [i]) {
134	esets [i] &= ~t;
135	esets.push_back(t);
136	}
137	}
138	}
139
140	// Sort so that our alphabet mapping isn't dependent on the order of
141	// rdfas passed in.
142	sort(esets.begin(), esets.end());
143
144	alphasize = buildAlphabetFromEquivSets(esets, alpha, unalpha);
145	}
146
147	bool isPrunable() const {
148	if (!grey.highlanderPruneDFA \|\| !rm) {
149	DEBUG_PRINTF("disabled, or not managed reports\n");
150	return false;
151	}
152
153	assert(!nfas.empty());
154	if (!generates_callbacks(nfas.front()->kind)) {
155	DEBUG_PRINTF("doesn't generate callbacks\n");
156	return false;
157	}
158
159	// Collect all reports from all merge candidates.
160	flat_set<ReportID> merge_reports;
161	for (const auto &rdfa : nfas) {
162	insert(&merge_reports, all_reports(*rdfa));
163	}
164
165	DEBUG_PRINTF("all reports: %s\n", as_string_list(merge_reports).c_str());
166
167	// Return true if they're all exhaustible with the same exhaustion key.
168	u32 ekey = INVALID_EKEY;
169	for (const auto &report_id : merge_reports) {
170	const Report &r = rm->getReport(report_id);
171	if (!isSimpleExhaustible(r)) {
172	DEBUG_PRINTF("report %u not simple exhaustible\n", report_id);
173	return false;
174	}
175	assert(r.ekey != INVALID_EKEY);
176	if (ekey == INVALID_EKEY) {
177	ekey = r.ekey;
178	} else if (ekey != r.ekey) {
179	DEBUG_PRINTF("two different ekeys, %u and %u\n", ekey, r.ekey);
180	return false;
181	}
182	}
183
184	DEBUG_PRINTF("is prunable\n");
185	return true;
186	}
187
188
189	void transition(const StateSet &in, StateSet *next) {
190	u16 t[ALPHABET_SIZE];
191
192	for (u32 i = `0`; i < alphasize; i++) {
193	next[i].resize(nfas.size());
194	}
195
196	for (size_t j = `0`, j_end = nfas.size(); j < j_end; j++) {
197	getFullTransitionFromState(*nfas [j], in [j], t);
198	for (u32 i = `0`; i < alphasize; i++) {
199	next[i][j] = t[unalpha [i]];
200	}
201	}
202	}
203
204	const vector<StateSet> initial() {
205	vector<StateSet> rv = {as};
206	if (start_floating != DEAD_STATE && start_floating != start_anchored) {
207	rv.push_back(fs);
208	}
209	return rv;
210	}
211
212	private:
213	void reports_i(const StateSet &in, flat_set<ReportID> dstate::*r_set,
214	flat_set<ReportID> &r) const {
215	for (size_t i = `0`, end = nfas.size(); i < end; i++) {
216	const auto &rs = nfas [i]->states [in [i]].*r_set;
217	insert(&r, rs);
218	}
219	}
220
221	public:
222	void reports(const StateSet &in, flat_set<ReportID> &rv) const {
223	reports_i(in, &dstate::reports, rv);
224	}
225	void reportsEod(const StateSet &in, flat_set<ReportID> &rv) const {
226	reports_i(in, &dstate::reports_eod, rv);
227	}
228
229	bool canPrune(const flat_set<ReportID> &test_reports) const {
230	if (!grey.highlanderPruneDFA \|\| !prunable) {
231	return false;
232	}
233
234	// Must all be external reports.
235	assert(rm);
236	for (const auto &report_id : test_reports) {
237	if (!isExternalReport(rm->getReport(report_id))) {
238	return false;
239	}
240	}
241
242	return true;
243	}
244
245	/* True if the minimization algorithm should be run after merging. /
246	bool shouldMinimize() const {
247	// We only need to run minimization if our merged DFAs shared a report.
248	flat_set<ReportID> seen_reports;
249	for (const auto &rdfa : nfas) {
250	for (const auto &report_id : all_reports(*rdfa)) {
251	if (!seen_reports.insert(report_id).second) {
252	DEBUG_PRINTF("report %u in several dfas\n", report_id);
253	return true;
254	}
255	}
256	}
257
258	return false;
259	}
260
261	private:
262	const ReportManager *rm;
263	const Grey &grey;
264
265	vector<const raw_dfa *> nfas;
266	vector<dstate_id_t> as;
267	vector<dstate_id_t> fs;
268
269	bool prunable = false;
270
271	public:
272	std::array<u16, ALPHABET_SIZE> alpha;
273	std::array<u16, ALPHABET_SIZE> unalpha;
274	u16 alphasize;
275	StateSet dead;
276
277	u16 start_anchored;
278	u16 start_floating;
279	};
280
281	} // namespace
282
283	unique_ptr<raw_dfa> mergeTwoDfas(const raw_dfa d1, const* raw_dfa *d2,
284	size_t max_states, const ReportManager *rm,
285	const Grey &grey) {
286	assert(d1 && d2);
287	assert(d1->kind == d2->kind);
288	assert(max_states <= MAX_DFA_STATES);
289
290	auto rdfa = ue2::make_unique<raw_dfa>(d1->kind);
291
292	Automaton_Merge autom(d1, d2, rm, grey);
293	if (determinise(autom, rdfa ->states, max_states)) {
294	rdfa ->start_anchored = autom.start_anchored;
295	rdfa ->start_floating = autom.start_floating;
296	rdfa ->alpha_size = autom.alphasize;
297	rdfa ->alpha_remap = autom.alpha;
298	DEBUG_PRINTF("merge succeeded, %zu states\n", rdfa->states.size());
299
300	if (autom.shouldMinimize()) {
301	minimize_hopcroft(*rdfa, grey);
302	DEBUG_PRINTF("minimized, %zu states\n", rdfa->states.size());
303	}
304
305	return rdfa;
306	}
307
308	return nullptr;
309	}
310
311	void mergeDfas(vector<unique_ptr<raw_dfa>> &dfas, size_t max_states,
312	const ReportManager rm, const* Grey &grey) {
313	assert(max_states <= MAX_DFA_STATES);
314
315	if (dfas.size() <= `1`) {
316	return;
317	}
318
319	DEBUG_PRINTF("before merging, we have %zu dfas\n", dfas.size());
320
321	queue<unique_ptr<raw_dfa>> q;
322	for (auto &dfa : dfas) {
323	q.push(move(dfa));
324	}
325
326	// All DFAs are now on the queue, so we'll clear the vector and use it for
327	// output from here.
328	dfas.clear();
329
330	while (q.size() > `1`) {
331	// Attempt to merge the two front elements of the queue.
332	unique_ptr<raw_dfa> d1 = move(q.front());
333	q.pop();
334	unique_ptr<raw_dfa> d2 = move(q.front());
335	q.pop();
336
337	auto rdfa = mergeTwoDfas(d1.get(), d2.get(), max_states, rm, grey);
338	if (rdfa) {
339	q.push(move(rdfa));
340	} else {
341	DEBUG_PRINTF("failed to merge\n");
342	// Put the larger of the two DFAs on the output list, retain the
343	// smaller one on the queue for further merge attempts.
344	if (d2 ->states.size() > d1 ->states.size()) {
345	dfas.push_back(move(d2));
346	q.push(move(d1));
347	} else {
348	dfas.push_back(move(d1));
349	q.push(move(d2));
350	}
351	}
352	}
353
354	while (!q.empty()) {
355	dfas.push_back(move(q.front()));
356	q.pop();
357	}
358
359	DEBUG_PRINTF("after merging, we have %zu dfas\n", dfas.size());
360	}
361
362	unique_ptr<raw_dfa> mergeAllDfas(const vector<const raw_dfa *> &dfas,
363	size_t max_states, const ReportManager *rm,
364	const Grey &grey) {
365	assert(max_states <= MAX_DFA_STATES);
366	assert(!dfas.empty());
367
368	// All the DFAs should be of the same kind.
369	const auto kind = dfas.front()->kind;
370	assert(all_of(begin(dfas), end(dfas),
371	[&kind](const raw_dfa rdfa) { return* rdfa->kind == kind; }));
372
373	auto rdfa = ue2::make_unique<raw_dfa>(kind);
374	Automaton_Merge n(dfas, rm, grey);
375
376	DEBUG_PRINTF("merging dfa\n");
377
378	if (!determinise(n, rdfa ->states, max_states)) {
379	DEBUG_PRINTF("state limit (%zu) exceeded\n", max_states);
380	return nullptr; / over state limit /
381	}
382
383	rdfa ->start_anchored = n.start_anchored;
384	rdfa ->start_floating = n.start_floating;
385	rdfa ->alpha_size = n.alphasize;
386	rdfa ->alpha_remap = n.alpha;
387
388	DEBUG_PRINTF("merged, building impl dfa (a,f) = (%hu,%hu)\n",
389	rdfa->start_anchored, rdfa->start_floating);
390
391	if (n.shouldMinimize()) {
392	minimize_hopcroft(*rdfa, grey);
393	DEBUG_PRINTF("minimized, %zu states\n", rdfa->states.size());
394	}
395
396	return rdfa;
397	}
398
399	} // namespace ue2
400

Browse the source code of ClickHouse/contrib/hyperscan/src/nfa/rdfa_merge.cpp