asserts.cpp source code [ClickHouse/contrib/hyperscan/src/compiler/asserts.cpp]

1	/*
2	* Copyright (c) 2015-2017, Intel Corporation
3	*
4	* Redistribution and use in source and binary forms, with or without
5	* modification, are permitted provided that the following conditions are met:
6	*
7	* * Redistributions of source code must retain the above copyright notice,
8	* this list of conditions and the following disclaimer.
9	* * Redistributions in binary form must reproduce the above copyright
10	* notice, this list of conditions and the following disclaimer in the
11	* documentation and/or other materials provided with the distribution.
12	* * Neither the name of Intel Corporation nor the names of its contributors
13	* may be used to endorse or promote products derived from this software
14	* without specific prior written permission.
15	*
16	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19	* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20	* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26	* POSSIBILITY OF SUCH DAMAGE.
27	*/
28
29	/* \file*
30	* \brief Convert temporary assert vertices (from construction method) to
31	* edge-based flags.
32	*
33	* This pass converts the temporary assert vertices created by the Glushkov
34	* construction process above (vertices with special assertions flags) into
35	* edges between those vertices' neighbours in the graph.
36	*
37	* These edges have the appropriate flags applied to them -- a path (u,t,v)
38	* through an assert vertex t will be replaced with the edge (u,v) with the
39	* assertion flags from t.
40	*
41	* Edges with mutually incompatible flags (such as the conjunction of
42	* word-to-word and word-to-nonword) are dropped.
43	*/
44	#include "asserts.h"
45
46	#include "compiler/compiler.h"
47	#include "nfagraph/ng.h"
48	#include "nfagraph/ng_prune.h"
49	#include "nfagraph/ng_redundancy.h"
50	#include "nfagraph/ng_util.h"
51	#include "parser/position.h" // for POS flags
52	#include "util/compile_error.h"
53	#include "util/graph_range.h"
54
55	#include <queue>
56	#include <set>
57
58	using namespace std;
59
60	namespace ue2 {
61
62	/* Hard limit on the maximum number of edges we'll clone before we throw up*
63	* our hands and report 'Pattern too large.' */
64	static const size_t MAX_ASSERT_EDGES = `300000`;
65
66	/* Flags representing the word-boundary assertions, \\b or \\B. /
67	static const int WORDBOUNDARY_FLAGS = POS_FLAG_ASSERT_WORD_TO_WORD
68	\| POS_FLAG_ASSERT_WORD_TO_NONWORD
69	\| POS_FLAG_ASSERT_NONWORD_TO_WORD
70	\| POS_FLAG_ASSERT_NONWORD_TO_NONWORD
71	\| POS_FLAG_ASSERT_WORD_TO_WORD_UCP
72	\| POS_FLAG_ASSERT_WORD_TO_NONWORD_UCP
73	\| POS_FLAG_ASSERT_NONWORD_TO_WORD_UCP
74	\| POS_FLAG_ASSERT_NONWORD_TO_NONWORD_UCP;
75
76	#define OPEN_EDGE 0U
77	#define DEAD_EDGE (~0U)
78
79	static
80	u32 disjunct(u32 flags1, u32 flags2) {
81	/ from two asserts in parallel /
82	DEBUG_PRINTF("disjunct %x %x\n", flags1, flags2);
83	u32 rv;
84	if (flags1 == DEAD_EDGE) {
85	rv = flags2;
86	} else if (flags2 == DEAD_EDGE) {
87	rv = flags1;
88	} else if (flags1 == OPEN_EDGE \|\| flags2 == OPEN_EDGE) {
89	rv = OPEN_EDGE;
90	} else {
91	rv = flags1 \| flags2;
92	}
93	DEBUG_PRINTF("--> %x\n", rv);
94	return rv;
95	}
96
97	static
98	u32 conjunct(u32 flags1, u32 flags2) {
99	/ from two asserts in series /
100	DEBUG_PRINTF("conjunct %x %x\n", flags1, flags2);
101	u32 rv;
102	if (flags1 == OPEN_EDGE) {
103	rv = flags2;
104	} else if (flags2 == OPEN_EDGE) {
105	rv = flags1;
106	} else if (flags1 & flags2) {
107	rv = flags1 & flags2;
108	} else {
109	rv = DEAD_EDGE; / the conjunction of two different word boundary*
110	* assertion is impassable */
111	}
112
113	DEBUG_PRINTF("--> %x\n", rv);
114	return rv;
115	}
116
117	typedef map<pair<NFAVertex, NFAVertex>, NFAEdge> edge_cache_t;
118
119	static
120	void replaceAssertVertex(NGHolder &g, NFAVertex t, const ExpressionInfo &expr,
121	edge_cache_t &edge_cache, u32 &assert_edge_count) {
122	DEBUG_PRINTF("replacing assert vertex %zu\n", g[t].index);
123
124	const u32 flags = g [t].assert_flags;
125	DEBUG_PRINTF("consider assert vertex %zu with flags %u\n", g[t].index,
126	flags);
127
128	// Wire up all the predecessors to all the successors.
129
130	for (const auto &inEdge : in_edges_range(t, g)) {
131	NFAVertex u = source(inEdge, g);
132	if (u == t) {
133	continue; // ignore self-loops
134	}
135
136	const u32 flags_inc_in = conjunct(g [inEdge].assert_flags,
137	flags);
138	if (flags_inc_in == DEAD_EDGE) {
139	DEBUG_PRINTF("fail, in-edge has bad flags %d\n",
140	g[inEdge].assert_flags);
141	continue;
142	}
143
144	for (const auto &outEdge : out_edges_range(t, g)) {
145	NFAVertex v = target(outEdge, g);
146
147	DEBUG_PRINTF("consider path [%zu,%zu,%zu]\n", g[u].index,
148	g[t].index, g[v].index);
149
150	if (v == t) {
151	continue; // ignore self-loops
152	}
153
154	const u32 flags_final = conjunct(g [outEdge].assert_flags,
155	flags_inc_in);
156
157	if (flags_final == DEAD_EDGE) {
158	DEBUG_PRINTF("fail, out-edge has bad flags %d\n",
159	g[outEdge].assert_flags);
160	continue;
161	}
162
163	if ((g [u].assert_flags & POS_FLAG_MULTILINE_START)
164	&& v == g.acceptEod) {
165	DEBUG_PRINTF("fail, (?m)^ does not match \\n at eod\n");
166	continue;
167	}
168
169	/ Replace path (u,t,v) with direct edge (u,v), unless the edge*
170	* already exists, in which case we just need to edit its
171	* properties.
172	*
173	* Use edge_cache to prevent us going O(N).
174	*/
175	auto cache_key = make_pair(u, v);
176	auto ecit = edge_cache.find(cache_key);
177	if (ecit == edge_cache.end()) {
178	DEBUG_PRINTF("adding edge %zu %zu\n", g[u].index, g[v].index);
179	NFAEdge e = add_edge(u, v, g);
180	edge_cache.emplace(cache_key, e);
181	g [e].assert_flags = flags;
182	if (++assert_edge_count > MAX_ASSERT_EDGES) {
183	throw CompileError (expr.index, "Pattern is too large.");
184	}
185	} else {
186	NFAEdge e = ecit ->second;
187	DEBUG_PRINTF("updating edge %zu %zu [a %zu]\n", g[u].index,
188	g[v].index, g[t].index);
189	// Edge already exists.
190	u32 &e_flags = g [e].assert_flags;
191	e_flags = disjunct(e_flags, flags_final);
192	assert(e_flags != DEAD_EDGE);
193	}
194	}
195	}
196
197	// Clear vertex t to remove all the old edges.
198	/ no need to clear the cache, as we will never look up its edge as it is*
199	* unreachable */
200	clear_vertex(t, g);
201	}
202
203	static
204	void setReportId(ReportManager &rm, NGHolder &g, const ExpressionInfo &expr,
205	NFAVertex v, s32 adj) {
206	// Don't try and set the report ID of a special vertex.
207	assert(!is_special(v, g));
208
209	// There should be no reports set already.
210	assert(g[v].reports.empty());
211
212	Report r = rm.getBasicInternalReport(expr, adj);
213
214	g [v].reports.insert(rm.getInternalId(r));
215	DEBUG_PRINTF("set report id for vertex %zu, adj %d\n", g[v].index, adj);
216	}
217
218	static
219	void checkForMultilineStart(ReportManager &rm, NGHolder &g,
220	const ExpressionInfo &expr) {
221	vector<NFAEdge> dead;
222	for (auto v : adjacent_vertices_range(g.start, g)) {
223	if (!(g [v].assert_flags & POS_FLAG_MULTILINE_START)) {
224	continue;
225	}
226	DEBUG_PRINTF("mls %zu %08x\n", g[v].index, g[v].assert_flags);
227
228	/ we have found a multi-line start (maybe more than one) /
229
230	/ we need to interpose a dummy dot vertex between v and accept if*
231	* required so that ^ doesn't match trailing \n */
232	for (const auto &e : out_edges_range(v, g)) {
233	if (target(e, g) == g.accept) {
234	dead.push_back(e);
235	}
236	}
237	/ assert has been resolved; clear flag /
238	g [v].assert_flags &= ~POS_FLAG_MULTILINE_START;
239	}
240
241	for (const auto &e : dead) {
242	NFAVertex dummy = add_vertex(g);
243	g [dummy].char_reach.setall();
244	setReportId(rm, g, expr, dummy, -`1`);
245	add_edge(source(e, g), dummy, g [e], g);
246	add_edge(dummy, g.accept, g);
247	}
248
249	remove_edges(dead, g);
250	}
251
252	static
253	bool hasAssertVertices(const NGHolder &g) {
254	for (auto v : vertices_range(g)) {
255	int flags = g [v].assert_flags;
256	if (flags & WORDBOUNDARY_FLAGS) {
257	return true;
258	}
259	}
260	return false;
261	}
262
263	/* \brief Convert temporary assert vertices (from construction method) to*
264	* edge-based flags.
265	*
266	* Remove the horrors that are the temporary assert vertices which arise from
267	* our construction method. Allows the rest of our code base to live in
268	* blissful ignorance of their existence. */
269	void removeAssertVertices(ReportManager &rm, NGHolder &g,
270	const ExpressionInfo &expr) {
271	size_t num = `0`;
272
273	DEBUG_PRINTF("before: graph has %zu vertices\n", num_vertices(g));
274
275	// Sweep over the graph and ascertain that we do actually have vertices
276	// with assertion flags set. Otherwise, we're done.
277	if (!hasAssertVertices(g)) {
278	DEBUG_PRINTF("no assert vertices, done\n");
279	return;
280	}
281
282	u32 assert_edge_count = `0`;
283
284	// Build a cache of (u, v) vertex pairs to edge descriptors.
285	edge_cache_t edge_cache;
286	for (const auto &e : edges_range(g)) {
287	edge_cache [make_pair(source(e, g), target(e, g))] = e;
288	}
289
290	for (auto v : vertices_range(g)) {
291	if (g [v].assert_flags & WORDBOUNDARY_FLAGS) {
292	replaceAssertVertex(g, v, expr, edge_cache, assert_edge_count);
293	num++;
294	}
295	}
296
297	checkForMultilineStart(rm, g, expr);
298
299	if (num) {
300	DEBUG_PRINTF("resolved %zu assert vertices\n", num);
301	pruneUseless(g);
302	pruneEmptyVertices(g);
303	renumber_vertices(g);
304	renumber_edges(g);
305	}
306
307	DEBUG_PRINTF("after: graph has %zu vertices\n", num_vertices(g));
308	assert(!hasAssertVertices(g));
309	}
310
311	} // namespace ue2
312

Browse the source code of ClickHouse/contrib/hyperscan/src/compiler/asserts.cpp