rose_build_lookaround.cpp source code [ClickHouse/contrib/hyperscan/src/rose/rose_build_lookaround.cpp]

1	/*
2	* Copyright (c) 2015-2017, Intel Corporation
3	*
4	* Redistribution and use in source and binary forms, with or without
5	* modification, are permitted provided that the following conditions are met:
6	*
7	* * Redistributions of source code must retain the above copyright notice,
8	* this list of conditions and the following disclaimer.
9	* * Redistributions in binary form must reproduce the above copyright
10	* notice, this list of conditions and the following disclaimer in the
11	* documentation and/or other materials provided with the distribution.
12	* * Neither the name of Intel Corporation nor the names of its contributors
13	* may be used to endorse or promote products derived from this software
14	* without specific prior written permission.
15	*
16	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19	* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20	* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26	* POSSIBILITY OF SUCH DAMAGE.
27	*/
28
29	/* \file*
30	* \brief Rose compile-time analysis for lookaround masks.
31	*/
32	#include "rose_build_lookaround.h"
33
34	#include "rose_build_impl.h"
35	#include "nfa/castlecompile.h"
36	#include "nfa/goughcompile.h"
37	#include "nfa/rdfa.h"
38	#include "nfagraph/ng_repeat.h"
39	#include "nfagraph/ng_util.h"
40	#include "util/container.h"
41	#include "util/dump_charclass.h"
42	#include "util/graph_range.h"
43	#include "util/flat_containers.h"
44	#include "util/verify_types.h"
45
46	#include <cstdlib>
47	#include <queue>
48	#include <sstream>
49
50	using namespace std;
51
52	namespace ue2 {
53
54	/* \brief Max search distance for reachability in front of a role. /
55	static const u32 MAX_FWD_LEN = `64`;
56
57	/* \brief Max search distance for reachability behind a role. /
58	static const u32 MAX_BACK_LEN = `64`;
59
60	/* \brief Max lookaround entries for a role. /
61	static const u32 MAX_LOOKAROUND_ENTRIES = `16`;
62
63	/* \brief We would rather have lookarounds with smaller reach than this. /
64	static const u32 LOOKAROUND_WIDE_REACH = `200`;
65
66	#if defined(DEBUG) \|\| defined(DUMP_SUPPORT)
67	static UNUSED
68	string dump(const map<s32, CharReach> &look) {
69	ostringstream oss;
70	for (auto it = look.begin(), ite = look.end(); it != ite; ++it) {
71	if (it != look.begin()) {
72	oss << ", ";
73	}
74	oss << "{" << it->first << ": " << describeClass(it->second) << "}";
75	}
76	return oss.str();
77	}
78	#endif
79
80	static
81	void getForwardReach(const NGHolder &g, u32 top, map<s32, CharReach> &look) {
82	flat_set<NFAVertex> curr, next;
83
84	// Consider only successors of start with the required top.
85	for (const auto &e : out_edges_range(g.start, g)) {
86	NFAVertex v = target(e, g);
87	if (v == g.startDs) {
88	continue;
89	}
90	if (contains(g [e].tops, top)) {
91	curr.insert(v);
92	}
93	}
94
95	for (u32 i = `0`; i < MAX_FWD_LEN; i++) {
96	if (curr.empty() \|\| contains(curr, g.accept) \|\|
97	contains(curr, g.acceptEod)) {
98	break;
99	}
100
101	next.clear();
102	CharReach cr;
103
104	for (auto v : curr) {
105	assert(!is_special(v, g));
106	cr \|= g [v].char_reach;
107	insert(&next, adjacent_vertices(v, g));
108	}
109
110	assert(cr.any());
111	look [i] \|= cr;
112	curr.swap(next);
113	}
114	}
115
116	static
117	void getBackwardReach(const NGHolder &g, ReportID report, u32 lag,
118	map<s32, CharReach> &look) {
119	flat_set<NFAVertex> curr, next;
120
121	for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
122	if (contains(g [v].reports, report)) {
123	curr.insert(v);
124	}
125	}
126
127	for (u32 i = lag + `1`; i <= MAX_BACK_LEN; i++) {
128	if (curr.empty() \|\| contains(curr, g.start) \|\|
129	contains(curr, g.startDs)) {
130	break;
131	}
132
133	next.clear();
134	CharReach cr;
135
136	for (auto v : curr) {
137	assert(!is_special(v, g));
138	cr \|= g [v].char_reach;
139	insert(&next, inv_adjacent_vertices(v, g));
140	}
141
142	assert(cr.any());
143	look [`0` - i] \|= cr;
144	curr.swap(next);
145	}
146	}
147
148	static
149	void getForwardReach(const CastleProto &castle, u32 top,
150	map<s32, CharReach> &look) {
151	depth len = castle.repeats.at(top).bounds.min;
152	len = min(len, depth (MAX_FWD_LEN));
153	assert(len.is_finite());
154
155	const CharReach &cr = castle.reach();
156	for (u32 i = `0`; i < len; i++) {
157	look [i] \|= cr;
158	}
159	}
160
161	static
162	void getBackwardReach(const CastleProto &castle, ReportID report, u32 lag,
163	map<s32, CharReach> &look) {
164	depth min_depth = depth::infinity();
165	for (const auto &m : castle.repeats) {
166	const PureRepeat &pr = m.second;
167	if (contains(pr.reports, report)) {
168	min_depth = min(min_depth, pr.bounds.min);
169	}
170	}
171
172	if (!min_depth.is_finite()) {
173	assert(`0`);
174	return;
175	}
176
177	const CharReach &cr = castle.reach();
178	for (u32 i = lag + `1`; i <= min(lag + (u32)min_depth, MAX_BACK_LEN);
179	i++) {
180	look [`0` - i] \|= cr;
181	}
182	}
183
184	static
185	void getForwardReach(const raw_dfa &rdfa, map<s32, CharReach> &look) {
186	if (rdfa.states.size() < `2`) {
187	return;
188	}
189
190	flat_set<dstate_id_t> curr, next;
191	curr.insert(rdfa.start_anchored);
192
193	for (u32 i = `0`; i < MAX_FWD_LEN && !curr.empty(); i++) {
194	next.clear();
195	CharReach cr;
196
197	for (const auto state_id : curr) {
198	const dstate &ds = rdfa.states [state_id];
199
200	if (!ds.reports.empty() \|\| !ds.reports_eod.empty()) {
201	return;
202	}
203
204	for (unsigned c = `0`; c < N_CHARS; c++) {
205	dstate_id_t succ = ds.next [rdfa.alpha_remap [c]];
206	if (succ != DEAD_STATE) {
207	cr.set(c);
208	next.insert(succ);
209	}
210	}
211	}
212
213	assert(cr.any());
214	look [i] \|= cr;
215	curr.swap(next);
216	}
217	}
218
219	static
220	void getSuffixForwardReach(const suffix_id &suff, u32 top,
221	map<s32, CharReach> &look) {
222	if (suff.graph()) {
223	getForwardReach(*suff.graph(), top, look);
224	} else if (suff.castle()) {
225	getForwardReach(*suff.castle(), top, look);
226	} else if (suff.dfa()) {
227	assert(top == `0`); // DFA isn't multi-top capable.
228	getForwardReach(*suff.dfa(), look);
229	} else if (suff.haig()) {
230	assert(top == `0`); // DFA isn't multi-top capable.
231	getForwardReach(*suff.haig(), look);
232	}
233	}
234
235	static
236	void getRoseForwardReach(const left_id &left, u32 top,
237	map<s32, CharReach> &look) {
238	if (left.graph()) {
239	getForwardReach(*left.graph(), top, look);
240	} else if (left.castle()) {
241	getForwardReach(*left.castle(), top, look);
242	} else if (left.dfa()) {
243	assert(top == `0`); // DFA isn't multi-top capable.
244	getForwardReach(*left.dfa(), look);
245	} else if (left.haig()) {
246	assert(top == `0`); // DFA isn't multi-top capable.
247	getForwardReach(*left.haig(), look);
248	}
249	}
250
251	static
252	void combineForwardMasks(const vector<map<s32, CharReach> > &rose_look,
253	map<s32, CharReach> &look) {
254	for (u32 i = `0`; i < MAX_FWD_LEN; i++) {
255	for (const auto &rlook : rose_look) {
256	if (contains(rlook, i)) {
257	look [i] \|= rlook.at(i);
258	} else {
259	look [i].setall();
260	}
261	}
262	}
263	}
264
265	static
266	void findForwardReach(const RoseGraph &g, const RoseVertex v,
267	map<s32, CharReach> &look) {
268	if (!g [v].reports.empty()) {
269	DEBUG_PRINTF("acceptor\n");
270	return;
271	}
272
273	// Non-leaf vertices can pick up a mask per successor prefix rose
274	// engine.
275	vector<map<s32, CharReach>> rose_look;
276	for (const auto &e : out_edges_range(v, g)) {
277	RoseVertex t = target(e, g);
278	if (!g [t].left) {
279	DEBUG_PRINTF("successor %zu has no leftfix\n", g[t].index);
280	return;
281	}
282	rose_look.push_back(map<s32, CharReach>());
283	getRoseForwardReach(g [t].left, g [e].rose_top, rose_look.back());
284	}
285
286	if (g [v].suffix) {
287	DEBUG_PRINTF("suffix engine\n");
288	rose_look.push_back(map<s32, CharReach>());
289	getSuffixForwardReach(g [v].suffix, g [v].suffix.top, rose_look.back());
290	}
291
292	combineForwardMasks(rose_look, look);
293	}
294
295	static
296	void findBackwardReach(const RoseGraph &g, const RoseVertex v,
297	map<s32, CharReach> &look) {
298	if (!g [v].left) {
299	return;
300	}
301
302	DEBUG_PRINTF("leftfix, report=%u, lag=%u\n", g[v].left.leftfix_report,
303	g[v].left.lag);
304
305	if (g [v].left.graph) {
306	getBackwardReach(*g [v].left.graph, g [v].left.leftfix_report,
307	g [v].left.lag, look);
308	} else if (g [v].left.castle) {
309	getBackwardReach(*g [v].left.castle, g [v].left.leftfix_report,
310	g [v].left.lag, look);
311	}
312
313	// TODO: implement DFA variants if necessary.
314	}
315
316	static
317	void normalise(map<s32, CharReach> &look) {
318	// We can erase entries where the reach is "all characters".
319	vector<s32> dead;
320	for (const auto &m : look) {
321	if (m.second.all()) {
322	dead.push_back(m.first);
323	}
324	}
325	erase_all(&look, dead);
326	}
327
328	namespace {
329
330	struct LookPriority {
331	explicit LookPriority(const map<s32, CharReach> &look_in) : look(look_in) {}
332
333	bool operator()(s32 a, s32 b) const {
334	const CharReach &a_reach = look.at(a);
335	const CharReach &b_reach = look.at(b);
336	if (a_reach.count() != b_reach.count()) {
337	return a_reach.count() < b_reach.count();
338	}
339	return abs(a) < abs(b);
340	}
341
342	private:
343	const map<s32, CharReach> &look;
344	};
345
346	} // namespace
347
348	static
349	bool isFloodProne(const map<s32, CharReach> &look, const CharReach &flood_cr) {
350	for (const auto &m : look) {
351	const CharReach &look_cr = m.second;
352	if (!overlaps(look_cr, flood_cr)) {
353	return false;
354	}
355	}
356	DEBUG_PRINTF("look can't escape flood on %s\n",
357	describeClass(flood_cr).c_str());
358	return true;
359	}
360
361	static
362	bool isFloodProne(const map<s32, CharReach> &look,
363	const set<CharReach> &flood_reach) {
364	if (flood_reach.empty()) {
365	return false;
366	}
367
368	for (const CharReach &flood_cr : flood_reach) {
369	if (isFloodProne(look, flood_cr)) {
370	return true;
371	}
372	}
373
374	return false;
375	}
376
377	static
378	void reduce(map<s32, CharReach> &look, set<CharReach> &flood_reach) {
379	if (look.size() <= MAX_LOOKAROUND_ENTRIES) {
380	return;
381	}
382
383	DEBUG_PRINTF("before reduce: %s\n", dump(look).c_str());
384
385	// First, remove floods that we already can't escape; they shouldn't affect
386	// the analysis below.
387	for (auto it = flood_reach.begin(); it != flood_reach.end();) {
388	if (isFloodProne(look, *it)) {
389	DEBUG_PRINTF("removing inescapable flood on %s from analysis\n",
390	describeClass(*it).c_str());
391	flood_reach.erase(it ++);
392	} else {
393	++it;
394	}
395	}
396
397	LookPriority cmp(look);
398	priority_queue<s32, vector<s32>, LookPriority> pq(cmp);
399	for (const auto &m : look) {
400	pq.push(m.first);
401	}
402
403	while (!pq.empty() && look.size() > MAX_LOOKAROUND_ENTRIES) {
404	s32 d = pq.top();
405	assert(contains(look, d));
406	const CharReach cr(look [d]); // copy
407	pq.pop();
408
409	DEBUG_PRINTF("erasing {%d: %s}\n", d, describeClass(cr).c_str());
410	look.erase(d);
411
412	// If removing this entry would result in us becoming flood_prone on a
413	// particular flood_reach case, reinstate it and move on.
414	if (isFloodProne(look, flood_reach)) {
415	DEBUG_PRINTF("reinstating {%d: %s} due to flood-prone check\n", d,
416	describeClass(cr).c_str());
417	look.insert(make_pair(d, cr));
418	}
419	}
420
421	while (!pq.empty()) {
422	s32 d = pq.top();
423	assert(contains(look, d));
424	const CharReach cr(look [d]); // copy
425	pq.pop();
426
427	if (cr.count() < LOOKAROUND_WIDE_REACH) {
428	continue;
429	}
430
431	DEBUG_PRINTF("erasing {%d: %s}\n", d, describeClass(cr).c_str());
432	look.erase(d);
433
434	// If removing this entry would result in us becoming flood_prone on a
435	// particular flood_reach case, reinstate it and move on.
436	if (isFloodProne(look, flood_reach)) {
437	DEBUG_PRINTF("reinstating {%d: %s} due to flood-prone check\n", d,
438	describeClass(cr).c_str());
439	look.insert(make_pair(d, cr));
440	}
441	}
442
443	DEBUG_PRINTF("after reduce: %s\n", dump(look).c_str());
444	}
445
446	static
447	void findFloodReach(const RoseBuildImpl &tbi, const RoseVertex v,
448	set<CharReach> &flood_reach) {
449	for (u32 lit_id : tbi.g [v].literals) {
450	const ue2_literal &s = tbi.literals.at(lit_id).s;
451	if (s.empty()) {
452	continue;
453	}
454	if (is_flood(s)) {
455	CharReach cr(*s.begin());
456	DEBUG_PRINTF("flood-prone with reach: %s\n",
457	describeClass(cr).c_str());
458	flood_reach.insert(cr);
459	}
460	}
461	}
462
463
464	namespace {
465	struct LookProto {
466	LookProto(s32 offset_in, CharReach reach_in)
467	: offset(offset_in), reach (move(reach_in)) {}
468	s32 offset;
469	CharReach reach;
470	};
471	}
472
473	static
474	vector<LookProto> findLiteralReach(const rose_literal_id &lit) {
475	vector<LookProto> look;
476	look.reserve(lit.s.length());
477
478	s32 i = `0` - lit.s.length() - lit.delay;
479	for (const auto &c : lit.s) {
480	look.emplace_back(i, c);
481	i++;
482	}
483
484	return look;
485	}
486
487	static
488	vector<LookProto> findLiteralReach(const RoseBuildImpl &build,
489	const RoseVertex v) {
490	bool first = true;
491	vector<LookProto> look;
492
493	for (u32 lit_id : build.g [v].literals) {
494	const rose_literal_id &lit = build.literals.at(lit_id);
495	auto lit_look = findLiteralReach(lit);
496
497	if (first) {
498	look = std::move(lit_look);
499	first = false;
500	continue;
501	}
502
503	// Erase elements from look with keys not in lit_look. Where a key is
504	// in both maps, union its reach with the lookaround.
505	auto jt = begin(lit_look);
506	for (auto it = begin(look); it != end(look);) {
507	if (jt == end(lit_look)) {
508	// No further lit_look entries, erase remaining elements from
509	// look.
510	look.erase(it, end(look));
511	break;
512	}
513	if (it ->offset < jt ->offset) {
514	// Offset is present in look but not in lit_look, erase.
515	it = look.erase(it);
516	} else if (it ->offset > jt ->offset) {
517	// Offset is preset in lit_look but not in look, ignore.
518	++jt;
519	} else {
520	// Offset is present in both, union its reach with look.
521	it ->reach \|= jt ->reach;
522	++it;
523	++jt;
524	}
525	}
526	}
527
528	return look;
529	}
530
531	/**
532	* Trim lookaround checks from the prefix that overlap with the literals
533	* themselves.
534	*/
535	static
536	void trimLiterals(const RoseBuildImpl &build, const RoseVertex v,
537	map<s32, CharReach> &look) {
538	DEBUG_PRINTF("pre-trim lookaround: %s\n", dump(look).c_str());
539
540	for (const auto &m : findLiteralReach(build, v)) {
541	auto it = look.find(m.offset);
542	if (it == end(look)) {
543	continue;
544	}
545	if (m.reach.isSubsetOf(it ->second)) {
546	DEBUG_PRINTF("can trim entry at %d\n", it->first);
547	look.erase(it);
548	}
549	}
550
551	DEBUG_PRINTF("post-trim lookaround: %s\n", dump(look).c_str());
552	}
553
554	static
555	void normaliseLeftfix(map<s32, CharReach> &look) {
556	// We can erase entries where the reach is "all characters", except for the
557	// very first one -- this might be required to establish a minimum bound on
558	// the literal's match offset.
559
560	// TODO: It would be cleaner to use a literal program instruction to check
561	// the minimum bound explicitly.
562
563	if (look.empty()) {
564	return;
565	}
566
567	const auto earliest = begin(look)->first;
568
569	vector<s32> dead;
570	for (const auto &m : look) {
571	if (m.second.all() && m.first != earliest) {
572	dead.push_back(m.first);
573	}
574	}
575	erase_all(&look, dead);
576	}
577
578	static
579	bool trimMultipathLeftfix(const RoseBuildImpl &build, const RoseVertex v,
580	vector<map<s32, CharReach>> &looks) {
581	size_t path_count = `0`;
582	for (auto &look : looks) {
583	++path_count;
584	DEBUG_PRINTF("Path #%ld\n", path_count);
585
586	assert(!look.empty());
587	trimLiterals(build, v, look);
588
589	if (look.empty()) {
590	return false;
591	}
592
593	// Could be optimized here, just keep the empty byte of the longest path
594	normaliseLeftfix(look);
595
596	if (look.size() > MAX_LOOKAROUND_ENTRIES) {
597	DEBUG_PRINTF("lookaround too big (%zu entries)\n", look.size());
598	return false;
599	}
600	}
601	return true;
602	}
603
604	static
605	void transToLookaround(const vector<map<s32, CharReach>> &looks,
606	vector<vector<LookEntry>> &lookarounds) {
607	for (const auto &look : looks) {
608	vector<LookEntry> lookaround;
609	DEBUG_PRINTF("lookaround: %s\n", dump(look).c_str());
610	lookaround.reserve(look.size());
611	for (const auto &m : look) {
612	if (m.first < -`128` \|\| m.first > `127`) {
613	DEBUG_PRINTF("range too big\n");
614	lookarounds.clear();
615	return;
616	}
617	s8 offset = verify_s8(m.first);
618	lookaround.emplace_back(offset, m.second);
619	}
620	lookarounds.push_back(lookaround);
621	}
622	}
623
624	void findLookaroundMasks(const RoseBuildImpl &tbi, const RoseVertex v,
625	vector<LookEntry> &lookaround) {
626	lookaround.clear();
627
628	const RoseGraph &g = tbi.g;
629
630	map<s32, CharReach> look;
631	findBackwardReach(g, v, look);
632	findForwardReach(g, v, look);
633	trimLiterals(tbi, v, look);
634
635	if (look.empty()) {
636	return;
637	}
638
639	normalise(look);
640
641	if (look.empty()) {
642	return;
643	}
644
645	set<CharReach> flood_reach;
646	findFloodReach(tbi, v, flood_reach);
647	reduce(look, flood_reach);
648
649	if (look.empty()) {
650	return;
651	}
652
653	DEBUG_PRINTF("lookaround: %s\n", dump(look).c_str());
654	lookaround.reserve(look.size());
655	for (const auto &m : look) {
656	s8 offset = verify_s8(m.first);
657	lookaround.emplace_back(offset, m.second);
658	}
659	}
660
661	static
662	bool checkShuftiBuckets(const vector<map<s32, CharReach>> &looks,
663	u32 bucket_size) {
664	set<u32> bucket;
665	for (const auto &look : looks) {
666	for (const auto &l : look) {
667	CharReach cr = l.second;
668	if (cr.count() > `128`) {
669	cr.flip();
670	}
671	map <u16, u16> lo2hi;
672
673	for (size_t i = cr.find_first(); i != CharReach::npos;) {
674	u8 it_hi = i >> `4`;
675	u16 low_encode = `0`;
676	while (i != CharReach::npos && (i >> `4`) == it_hi) {
677	low_encode \|= `1` << (i &`0xf`);
678	i = cr.find_next(i);
679	}
680	lo2hi [low_encode] \|= `1` << it_hi;
681	}
682
683	for (const auto &it : lo2hi) {
684	u32 hi_lo = (it.second << `16`) \| it.first;
685	bucket.insert(hi_lo);
686	}
687	}
688	}
689	DEBUG_PRINTF("shufti has %lu bucket(s)\n", bucket.size());
690	return bucket.size() <= bucket_size;
691	}
692
693	static
694	bool getTransientPrefixReach(const NGHolder &g, ReportID report, u32 lag,
695	vector<map<s32, CharReach>> &looks) {
696	if (!isAcyclic(g)) {
697	DEBUG_PRINTF("contains back-edge\n");
698	return false;
699	}
700
701	// Must be floating chains wired to startDs.
702	if (!isFloating(g)) {
703	DEBUG_PRINTF("not a floating start\n");
704	return false;
705	}
706
707	vector<NFAVertex> curr;
708	for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
709	if (v == g.start \|\| v == g.startDs) {
710	DEBUG_PRINTF("empty graph\n");
711	return true;
712	}
713	if (contains(g [v].reports, report)) {
714	curr.push_back(v);
715	}
716	}
717
718	assert(!curr.empty());
719
720	u32 total_len = curr.size();
721
722	for (const auto &v : curr) {
723	looks.emplace_back(map<s32, CharReach>());
724	looks.back()[`0` - (lag + `1`)] = g [v].char_reach;
725	}
726
727	bool curr_active = false;
728
729	/ For each offset -i, we backwardly trace the path by vertices in curr.*
730	* Once there are more than 8 paths and more than 64 bits total_len,
731	* which means that neither MULTIPATH_LOOKAROUND nor MULTIPATH_SHUFTI
732	* could be successfully built, we will give up the path finding.
733	* Otherwise, the loop will halt when all vertices in curr are startDs.
734	*/
735	for (u32 i = lag + `2`; i < (lag + `2`) + MAX_BACK_LEN; i++) {
736	curr_active = false;
737	size_t curr_size = curr.size();
738	if (curr.size() > `1` && i > lag + MULTIPATH_MAX_LEN) {
739	DEBUG_PRINTF("range is larger than 16 in multi-path\n");
740	return false;
741	}
742
743	for (size_t idx = `0`; idx < curr_size; idx++) {
744	NFAVertex v = curr [idx];
745	if (v == g.startDs) {
746	continue;
747	}
748	assert(!is_special(v, g));
749
750	for (auto u : inv_adjacent_vertices_range(v, g)) {
751	if (u == g.start \|\| u == g.startDs) {
752	curr [idx] = g.startDs;
753	break;
754	}
755	}
756
757	if (is_special(curr [idx], g)) {
758	continue;
759	}
760
761	for (auto u : inv_adjacent_vertices_range(v, g)) {
762	curr_active = true;
763	if (curr [idx] == v) {
764	curr [idx] = u;
765	looks [idx][`0` - i] = g [u].char_reach;
766	total_len++;
767	} else {
768	curr.push_back(u);
769	looks.push_back(looks [idx]);
770	(looks.back())[`0` - i] = g [u].char_reach;
771	total_len += looks.back().size();
772	}
773
774	if (curr.size() > MAX_LOOKAROUND_PATHS && total_len > `64`) {
775	DEBUG_PRINTF("too many branches\n");
776	return false;
777	}
778	}
779	}
780	if (!curr_active) {
781	break;
782	}
783	}
784
785	if (curr_active) {
786	DEBUG_PRINTF("single path too long\n");
787	return false;
788	}
789
790	// More than 8 paths, check multi-path shufti.
791	if (curr.size() > MAX_LOOKAROUND_PATHS) {
792	u32 bucket_size = total_len > `32` ? `8` : `16`;
793	if (!checkShuftiBuckets(looks, bucket_size)) {
794	DEBUG_PRINTF("shufti has too many buckets\n");
795	return false;
796	}
797	}
798
799	assert(!looks.empty());
800	if (looks.size() == `1`) {
801	DEBUG_PRINTF("single lookaround\n");
802	} else {
803	DEBUG_PRINTF("multi-path lookaround\n");
804	}
805	DEBUG_PRINTF("done\n");
806	return true;
807	}
808
809	bool makeLeftfixLookaround(const RoseBuildImpl &build, const RoseVertex v,
810	vector<vector<LookEntry>> &lookaround) {
811	lookaround.clear();
812
813	const RoseGraph &g = build.g;
814	const left_id leftfix(g [v].left);
815
816	if (!contains(build.transient, leftfix)) {
817	DEBUG_PRINTF("not transient\n");
818	return false;
819	}
820
821	if (!leftfix.graph()) {
822	DEBUG_PRINTF("only supported for graphs so far\n");
823	return false;
824	}
825
826	vector<map<s32, CharReach>> looks;
827	if (!getTransientPrefixReach(*leftfix.graph(), g [v].left.leftfix_report,
828	g [v].left.lag, looks)) {
829	DEBUG_PRINTF("graph has loop or too large\n");
830	return false;
831	}
832
833	if (!trimMultipathLeftfix(build, v, looks)) {
834	return false;
835	}
836	transToLookaround(looks, lookaround);
837
838	return !lookaround.empty();
839	}
840
841	void mergeLookaround(vector<LookEntry> &lookaround,
842	const vector<LookEntry> &more_lookaround) {
843	if (lookaround.size() >= MAX_LOOKAROUND_ENTRIES) {
844	DEBUG_PRINTF("big enough!\n");
845	return;
846	}
847
848	// Don't merge lookarounds at offsets we already have entries for.
849	flat_set<s8> offsets;
850	for (const auto &e : lookaround) {
851	offsets.insert(e.offset);
852	}
853
854	map<s32, CharReach> more;
855	LookPriority cmp(more);
856	priority_queue<s32, vector<s32>, LookPriority> pq(cmp);
857	for (const auto &e : more_lookaround) {
858	if (!contains(offsets, e.offset)) {
859	more.emplace(e.offset, e.reach);
860	pq.push(e.offset);
861	}
862	}
863
864	while (!pq.empty() && lookaround.size() < MAX_LOOKAROUND_ENTRIES) {
865	const s32 offset = pq.top();
866	pq.pop();
867	const auto &cr = more.at(offset);
868	DEBUG_PRINTF("added {%d,%s}\n", offset, describeClass(cr).c_str());
869	lookaround.emplace_back(verify_s8(offset), cr);
870	}
871
872	// Order by offset.
873	sort(begin(lookaround), end(lookaround),
874	[](const LookEntry &a, const LookEntry &b) {
875	return a.offset < b.offset;
876	});
877	}
878
879	} // namespace ue2
880

Browse the source code of ClickHouse/contrib/hyperscan/src/rose/rose_build_lookaround.cpp