dfa_min.cpp source code [ClickHouse/contrib/hyperscan/src/nfa/dfa_min.cpp]

1	/*
2	* Copyright (c) 2015-2017, Intel Corporation
3	*
4	* Redistribution and use in source and binary forms, with or without
5	* modification, are permitted provided that the following conditions are met:
6	*
7	* * Redistributions of source code must retain the above copyright notice,
8	* this list of conditions and the following disclaimer.
9	* * Redistributions in binary form must reproduce the above copyright
10	* notice, this list of conditions and the following disclaimer in the
11	* documentation and/or other materials provided with the distribution.
12	* * Neither the name of Intel Corporation nor the names of its contributors
13	* may be used to endorse or promote products derived from this software
14	* without specific prior written permission.
15	*
16	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19	* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20	* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26	* POSSIBILITY OF SUCH DAMAGE.
27	*/
28
29	/**
30	* \file
31	* \brief Build code for DFA minimization.
32	*/
33
34	/**
35	* /Summary of the Hopcroft minimisation algorithm/
36	*
37	* partition := {F, Q \ F};
38	* work_queue := {F};
39	* while (work_queue is not empty) do
40	* choose and remove a set A from work_queue
41	* for each c in . do
42	* let X be the set of states for which a transition on c
43	* leads to a state in A
44	* for each set Y in partition for which X . Y is nonempty and
45	* Y \ X is nonempty do
46	* replace Y in partition by the two sets X . Y and Y \ X
47	* if Y is in work_queue
48	* replace Y in work_queue by the same two sets
49	* else
50	* if \|X . Y\| <= \|Y \ X\|
51	* add X . Y to work_queue
52	* else
53	* add Y \ X to work_queue
54	* end;
55	* end;
56	* end;
57	*/
58
59	#include "dfa_min.h"
60
61	#include "grey.h"
62	#include "mcclellancompile_util.h"
63	#include "rdfa.h"
64	#include "ue2common.h"
65	#include "util/container.h"
66	#include "util/flat_containers.h"
67	#include "util/noncopyable.h"
68	#include "util/partitioned_set.h"
69
70	#include <algorithm>
71	#include <functional>
72	#include <iterator>
73	#include <map>
74	#include <queue>
75	#include <set>
76	#include <vector>
77
78	using namespace std;
79
80	namespace ue2 {
81
82	namespace {
83
84	struct hopcroft_state_info {
85	explicit hopcroft_state_info(size_t alpha_size) : prev (alpha_size) {}
86
87	/* \brief Mapping from symbol to a list of predecessors that transition to*
88	* this state on that symbol. */
89	vector<vector<dstate_id_t>> prev;
90	};
91
92	struct HopcroftInfo : noncopyable {
93	size_t alpha_size; //!< Size of DFA alphabet.
94	queue<size_t> work_queue; //!< Hopcroft work queue of partition indices.
95	partitioned_set<dstate_id_t> partition; //!< Partition set of DFA states.
96	vector<hopcroft_state_info> states; //!< Pre-calculated state info (preds)
97
98	explicit HopcroftInfo(const raw_dfa &rdfa);
99	};
100
101	} // namespace
102
103	/**
104	* \brief Create an initial partitioning and work_queue.
105	*
106	* Initial partition contains {accepting states..., Non-accepting states}
107	* Initial work_queue contains accepting state subsets
108	*
109	* The initial partitioning needs to distinguish between the different
110	* reporting behaviours (unlike standard Hopcroft) --> more than one subset
111	* possible for the accepting states.
112	*
113	* Look for accepting states in both reports and reports_eod.
114	* Creates a map with a key(reports, reports_eod) and an id.
115	* Reports of each state are searched against the map and
116	* added to the corresponding id -> partition[id] and work_queue[id].
117	* Non Accept states are added to partition[id+1].
118	*/
119	static
120	vector<size_t> create_map(const raw_dfa &rdfa, queue<size_t> &work_queue) {
121	using ReportKey = pair<flat_set<ReportID>, flat_set<ReportID>>;
122	map<ReportKey, size_t> subset_map;
123	vector<size_t> state_to_subset(rdfa.states.size(), INVALID_SUBSET);
124
125	for (size_t i = `0`; i < rdfa.states.size(); i++) {
126	const auto &ds = rdfa.states [i];
127	if (!ds.reports.empty() \|\| !ds.reports_eod.empty()) {
128	ReportKey key(ds.reports, ds.reports_eod);
129	if (contains(subset_map, key)) {
130	state_to_subset [i] = subset_map [key];
131	} else {
132	size_t sub = subset_map.size();
133	subset_map.emplace(std::move(key), sub);
134	state_to_subset [i] = sub;
135	work_queue.push(sub);
136	}
137	}
138	}
139
140	/ Give non-accept states their own subset. /
141	size_t non_accept_sub = subset_map.size();
142	replace(state_to_subset.begin(), state_to_subset.end(), INVALID_SUBSET,
143	non_accept_sub);
144
145	return state_to_subset;
146	}
147
148	HopcroftInfo::HopcroftInfo(const raw_dfa &rdfa)
149	: alpha_size(rdfa.alpha_size), partition (create_map(rdfa, work_queue)),
150	states (rdfa.states.size(), hopcroft_state_info (alpha_size)) {
151	/ Construct predecessor lists for each state, indexed by symbol. /
152	for (size_t i = `0`; i < states.size(); i++) { // i is the previous state
153	for (size_t sym = `0`; sym < alpha_size; sym++) {
154	dstate_id_t present_state = rdfa.states [i].next [sym];
155	states [present_state].prev [sym].push_back(i);
156	}
157	}
158	}
159
160	/**
161	* For a split set X, each subset S (given by part_index) in the partition, two
162	* sets are created: v_inter (X intersection S) and v_sub (S - X).
163	*
164	* For each subset S in the partition that could be split (v_inter is nonempty
165	* and v_sub is nonempty):
166	* - replace S in partition by the two sets v_inter and v_sub.
167	* - if S is in work_queue:
168	* - replace S in work_queue by the two subsets.
169	* - else:
170	* - replace S in work_queue by the smaller of the two sets.
171	*/
172	static
173	void split_and_replace_set(const size_t part_index, HopcroftInfo &info,
174	const flat_set<dstate_id_t> &splitter) {
175	/ singleton sets cannot be split /
176	if (info.partition [part_index].size() == `1`) {
177	return;
178	}
179
180	size_t small_index = info.partition.split(part_index, splitter);
181
182	if (small_index == INVALID_SUBSET) {
183	/ the set could not be split /
184	return;
185	}
186
187	/ larger subset remains at the input subset index, if the input subset was*
188	* already in the work queue then the larger subset will remain there. */
189
190	info.work_queue.push(small_index);
191	}
192
193	/**
194	* \brief Core of the Hopcroft minimisation algorithm.
195	*/
196	static
197	void dfa_min(HopcroftInfo &info) {
198	flat_set<dstate_id_t> curr, sym_preds;
199	vector<size_t> cand_subsets;
200
201	while (!info.work_queue.empty()) {
202	/ Choose and remove a set of states (curr, or A in the description*
203	* above) from the work queue. Note that we copy the set because the
204	* partition may be split by the loop below. */
205	curr.clear();
206	insert(&curr, info.partition [info.work_queue.front()]);
207	info.work_queue.pop();
208
209	for (size_t sym = `0`; sym < info.alpha_size; sym++) {
210	/ Find the set of states sym_preds for which a transition on the*
211	* given symbol leads to a state in curr. */
212	sym_preds.clear();
213	for (dstate_id_t s : curr) {
214	insert(&sym_preds, info.states [s].prev [sym]);
215	}
216
217	if (sym_preds.empty()) {
218	continue;
219	}
220
221	/ we only need to consider subsets with at least one member in*
222	* sym_preds for splitting */
223	cand_subsets.clear();
224	info.partition.find_overlapping(sym_preds, &cand_subsets);
225
226	for (size_t sub : cand_subsets) {
227	split_and_replace_set(sub, info, sym_preds);
228	}
229	}
230	}
231	}
232
233	/**
234	* \brief Build the new DFA state table.
235	*/
236	static
237	void mapping_new_states(const HopcroftInfo &info,
238	vector<dstate_id_t> &old_to_new, raw_dfa &rdfa) {
239	const size_t num_partitions = info.partition.size();
240
241	// Mapping from equiv class's first state to equiv class index.
242	map<dstate_id_t, size_t> ordering;
243
244	// New state id for each equiv class.
245	vector<dstate_id_t> eq_state(num_partitions);
246
247	for (size_t i = `0`; i < num_partitions; i++) {
248	ordering [*info.partition [i].begin()] = i;
249	}
250
251	dstate_id_t new_id = `0`;
252	for (const auto &m : ordering) {
253	eq_state [m.second] = new_id++;
254	}
255
256	for (size_t t = `0`; t < info.partition.size(); t++) {
257	for (dstate_id_t id : info.partition [t]) {
258	old_to_new [id] = eq_state [t];
259	}
260	}
261
262	vector<dstate> new_states;
263	new_states.reserve(num_partitions);
264
265	for (const auto &m : ordering) {
266	new_states.push_back(rdfa.states [m.first]);
267	}
268	rdfa.states = std::move(new_states);
269	}
270
271	static
272	void renumber_new_states(const HopcroftInfo &info,
273	const vector<dstate_id_t> &old_to_new, raw_dfa &rdfa) {
274	for (size_t i = `0`; i < info.partition.size(); i++) {
275	for (size_t sym = `0`; sym < info.alpha_size; sym++) {
276	dstate_id_t output = rdfa.states [i].next [sym];
277	rdfa.states [i].next [sym] = old_to_new [output];
278	}
279	dstate_id_t dad = rdfa.states [i].daddy;
280	rdfa.states [i].daddy = old_to_new [dad];
281	}
282
283	rdfa.start_floating = old_to_new [rdfa.start_floating];
284	rdfa.start_anchored = old_to_new [rdfa.start_anchored];
285	}
286
287	static
288	void new_dfa(raw_dfa &rdfa, const HopcroftInfo &info) {
289	if (info.partition.size() == info.states.size()) {
290	return;
291	}
292
293	vector<dstate_id_t> old_to_new(info.states.size());
294	mapping_new_states(info, old_to_new, rdfa);
295	renumber_new_states(info, old_to_new, rdfa);
296	}
297
298	void minimize_hopcroft(raw_dfa &rdfa, const Grey &grey) {
299	if (!grey.minimizeDFA) {
300	return;
301	}
302
303	if (is_dead(rdfa)) {
304	DEBUG_PRINTF("dfa is empty\n");
305	}
306
307	UNUSED const size_t states_before = rdfa.states.size();
308
309	HopcroftInfo info(rdfa);
310
311	dfa_min(info);
312	new_dfa(rdfa, info);
313
314	DEBUG_PRINTF("reduced from %zu to %zu states\n", states_before,
315	rdfa.states.size());
316	}
317
318	} // namespace ue2
319

Browse the source code of ClickHouse/contrib/hyperscan/src/nfa/dfa_min.cpp