ng_literal_analysis.cpp source code [ClickHouse/contrib/hyperscan/src/nfagraph/ng_literal_analysis.cpp]

1	/*
2	* Copyright (c) 2015-2017, Intel Corporation
3	*
4	* Redistribution and use in source and binary forms, with or without
5	* modification, are permitted provided that the following conditions are met:
6	*
7	* * Redistributions of source code must retain the above copyright notice,
8	* this list of conditions and the following disclaimer.
9	* * Redistributions in binary form must reproduce the above copyright
10	* notice, this list of conditions and the following disclaimer in the
11	* documentation and/or other materials provided with the distribution.
12	* * Neither the name of Intel Corporation nor the names of its contributors
13	* may be used to endorse or promote products derived from this software
14	* without specific prior written permission.
15	*
16	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19	* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20	* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26	* POSSIBILITY OF SUCH DAMAGE.
27	*/
28
29	/* \file*
30	* \brief Literal analysis and scoring.
31	*/
32	#include "ng_literal_analysis.h"
33
34	#include "ng_holder.h"
35	#include "ng_split.h"
36	#include "ng_util.h"
37	#include "ue2common.h"
38	#include "rose/rose_common.h"
39	#include "util/compare.h"
40	#include "util/depth.h"
41	#include "util/graph.h"
42	#include "util/graph_range.h"
43	#include "util/graph_small_color_map.h"
44	#include "util/ue2_graph.h"
45	#include "util/ue2string.h"
46
47	#include <algorithm>
48	#include <fstream>
49	#include <queue>
50
51	#include <boost/graph/boykov_kolmogorov_max_flow.hpp>
52
53	using namespace std;
54
55	namespace ue2 {
56
57	/* Maximum number of paths to generate. /
58	static const u32 MAX_WIDTH = `11`;
59
60	/* Scoring adjustment for 'uniqueness' in literal. /
61	static const u64a WEIGHT_OF_UNIQUENESS = `250`;
62
63	namespace {
64
65	/ Small literal graph type used for the suffix tree used in*
66	* compressAndScore. */
67
68	struct LitGraphVertexProps {
69	LitGraphVertexProps() = default;
70	explicit LitGraphVertexProps(ue2_literal::elem c_in) : c (move(c_in)) {}
71	ue2_literal::elem c; // string element (char + bool)
72	size_t index; // managed by ue2_graph
73	};
74
75	struct LitGraphEdgeProps {
76	LitGraphEdgeProps() = default;
77	explicit LitGraphEdgeProps(u64a score_in) : score(score_in) {}
78	u64a score = NO_LITERAL_AT_EDGE_SCORE;
79	size_t index; // managed by ue2_graph
80	};
81
82	struct LitGraph
83	: public ue2_graph<LitGraph, LitGraphVertexProps, LitGraphEdgeProps> {
84
85	LitGraph() : root (add_vertex(*this)), sink (add_vertex(*this)) {}
86
87	const vertex_descriptor root;
88	const vertex_descriptor sink;
89	};
90
91	typedef LitGraph::vertex_descriptor LitVertex;
92	typedef LitGraph::edge_descriptor LitEdge;
93
94	typedef pair<LitVertex, NFAVertex> VertexPair;
95	typedef std::queue<VertexPair> LitVertexQ;
96
97	} // namespace
98
99	#ifdef DUMP_SUPPORT
100
101	/* \brief Dump the literal graph in Graphviz format. /
102	static UNUSED
103	void dumpGraph(const char filename, const* LitGraph &lg) {
104	ofstream fout(filename);
105
106	fout << "digraph G {" << endl;
107
108	for (auto v : vertices_range(lg)) {
109	fout << lg[v].index;
110	if (v == lg.root) {
111	fout << "[label=\"ROOT\"];";
112	} else if (v == lg.sink) {
113	fout << "[label=\"SINK\"];";
114	} else {
115	ue2_literal s;
116	s.push_back(lg[v].c);
117	fout << "[label=\"" << dumpString(s) << "\"];";
118	}
119	fout << endl;
120	}
121
122	for (const auto &e : edges_range(lg)) {
123	LitVertex u = source(e, lg), v = target(e, lg);
124	fout << lg[u].index << " -> " << lg[v].index << "[label=\""
125	<< lg[e].score << "\"]"
126	<< ";" << endl;
127	}
128
129	fout << "}" << endl;
130	}
131
132	#endif // DUMP_SUPPORT
133
134	static
135	bool allowExpand(size_t numItems, size_t totalPathsSoFar) {
136	if (numItems == `0`) {
137	return false;
138	}
139
140	if (numItems + totalPathsSoFar > MAX_WIDTH) {
141	return false;
142	}
143
144	return true;
145	}
146
147	static
148	LitVertex addToLitGraph(LitGraph &lg, LitVertex pred,
149	const ue2_literal::elem &c) {
150	// Check if we already have this in the graph.
151	for (auto v : adjacent_vertices_range(pred, lg)) {
152	if (v == lg.sink) {
153	continue;
154	}
155	if (lg [v].c == c) {
156	return v;
157	}
158	}
159
160	LitVertex lv = add_vertex(LitGraphVertexProps (c), lg);
161	add_edge(pred, lv, lg);
162	return lv;
163	}
164
165	static
166	void addToQueue(LitVertexQ &workQ, LitGraph &lg, LitVertex pred,
167	const CharReach &cr, NFAVertex v) {
168	for (size_t i = cr.find_first(); i != CharReach::npos;
169	i = cr.find_next(i)) {
170	if (myisupper(i) && cr.test(mytolower(i))) {
171	// ignore upper half of a nocase pair
172	continue;
173	}
174
175	bool nocase = myislower(i) && cr.test(mytoupper(i));
176	ue2_literal::elem c((char)i, nocase);
177	LitVertex lv = addToLitGraph(lg, pred, c);
178	workQ.push(VertexPair (lv, v));
179	}
180	}
181
182	static
183	void initWorkQueue(LitVertexQ &workQ, LitGraph &lg, const NGHolder &g,
184	const NFAEdge &e) {
185	NFAVertex u = source(e, g);
186	NFAVertex v = target(e, g);
187	const CharReach &cr = g [v].char_reach;
188
189	if (!allowExpand(cr.count(), `0`)) {
190	return;
191	}
192
193	addToQueue(workQ, lg, lg.root, cr, u);
194	}
195
196	static
197	u32 crCardinality(const CharReach &cr) {
198	// Special-case for handling dots, much faster than running the find_next
199	// loop below.
200	if (cr.all()) {
201	return `230`; // [^A-Z]
202	}
203
204	u32 rv = `0`;
205	for (size_t i = cr.find_first(); i != CharReach::npos;
206	i = cr.find_next(i)) {
207	if (myisupper(i) && cr.test(mytolower(i))) {
208	// ignore upper half of a nocase pair
209	continue;
210	}
211	rv++;
212	}
213
214	return rv;
215	}
216
217	/* Filter out literals that include other literals as suffixes. We do this by*
218	* identifying vertices connected to the sink and removing their other
219	* out-edges. */
220	static
221	void filterLitGraph(LitGraph &lg) {
222	for (auto v : inv_adjacent_vertices_range(lg.sink, lg)) {
223	remove_out_edge_if(v, [&lg](const LitEdge &e) {
224	return target(e, lg) != lg.sink;
225	}, lg);
226	}
227
228	// We could do a DFS-and-prune here, if we wanted. Right now, we just
229	// handle it in extractLiterals by throwing away paths that don't run all
230	// the way from sink to root.
231	}
232
233	/* Extracts all the literals from the given literal graph. Walks the graph*
234	* from each predecessor of the sink (note: it's a suffix tree except for this
235	* convenience) towards the source, storing each string as we go. */
236	static
237	void extractLiterals(const LitGraph &lg, set<ue2_literal> &s) {
238	ue2_literal lit;
239
240	for (auto u : inv_adjacent_vertices_range(lg.sink, lg)) {
241	lit.clear();
242	while (u != lg.root) {
243	lit.push_back(lg [u].c);
244	assert(in_degree(u, lg) <= `1`);
245	LitGraph::inv_adjacency_iterator ai2, ae2;
246	tie(ai2, ae2) = inv_adjacent_vertices(u, lg);
247	if (ai2 == ae2) {
248	// Path has been cut, time for the next literal.
249	goto next_literal;
250	}
251	u = *ai2;
252	}
253	s.insert(lit);
254	next_literal:
255	;
256	}
257	}
258
259	#ifndef NDEBUG
260	static
261	bool hasSuffixLiterals(const set<ue2_literal> &s) {
262	for (auto it = s.begin(), ite = s.end(); it != ite; ++it) {
263	for (auto jt = std::next(it); jt != ite; ++jt) {
264	if (isSuffix(it, jt) \|\| isSuffix(jt, it)) {
265	DEBUG_PRINTF("'%s' and '%s' have suffix issues\n",
266	dumpString(*it).c_str(),
267	dumpString(*jt).c_str());
268	return true;
269	}
270	}
271	}
272	return false;
273	}
274	#endif
275
276	static
277	void processWorkQueue(const NGHolder &g, const NFAEdge &e,
278	set<ue2_literal> &s) {
279	if (is_special(target(e, g), g)) {
280	return;
281	}
282
283	LitGraph lg;
284
285	LitVertexQ workQ;
286	initWorkQueue(workQ, lg, g, e);
287
288	while (!workQ.empty()) {
289	const LitVertex lv = workQ.front().first;
290	const NFAVertex &t = workQ.front().second;
291	const CharReach &cr = g [t].char_reach;
292
293	u32 cr_card = crCardinality(cr);
294	size_t numItems = cr_card * in_degree(t, g);
295	size_t committed_count = workQ.size() + in_degree(lg.sink, lg) - `1`;
296
297	if (g [t].index == NODE_START) {
298	// reached start, add to literal set
299	add_edge_if_not_present(lv, lg.sink, lg);
300	goto next_work_elem;
301	}
302
303	// Expand next vertex
304	if (allowExpand(numItems, committed_count)) {
305	for (auto u : inv_adjacent_vertices_range(t, g)) {
306	addToQueue(workQ, lg, lv, cr, u);
307	}
308	goto next_work_elem;
309	}
310
311	// Expand this vertex
312	if (allowExpand(cr_card, committed_count)) {
313	for (size_t i = cr.find_first(); i != CharReach::npos;
314	i = cr.find_next(i)) {
315	if (myisupper(i) && cr.test(mytolower(i))) {
316	// ignore upper half of a nocase pair
317	continue;
318	}
319
320	bool nocase = myislower(i) && cr.test(mytoupper(i));
321	ue2_literal::elem c((char)i, nocase);
322	LitVertex lt = addToLitGraph(lg, lv, c);
323	add_edge_if_not_present(lt, lg.sink, lg);
324	}
325	goto next_work_elem;
326	}
327
328	// add to literal set
329	add_edge_if_not_present(lv, lg.sink, lg);
330	next_work_elem:
331	workQ.pop();
332	}
333
334	filterLitGraph(lg);
335	//dumpGraph("litgraph.dot", lg);
336	extractLiterals(lg, s);
337
338	// Our literal set should contain no literal that is a suffix of another.
339	assert(!hasSuffixLiterals(s));
340
341	DEBUG_PRINTF("edge %zu (%zu->%zu) produced %zu literals\n", g[e].index,
342	g[source(e, g)].index, g[target(e, g)].index, s.size());
343	}
344
345	bool bad_mixed_sensitivity(const ue2_literal &s) {
346	/ TODO: if the mixed cases is entirely within MAX_MASK2_WIDTH of the end,*
347	* we should be able to handle it */
348	return mixed_sensitivity(s) && s.length() > MAX_MASK2_WIDTH;
349	}
350
351	static
352	u64a litUniqueness(const string &s) {
353	CharReach seen(s);
354	return seen.count();
355	}
356
357	/* Count the significant bits of this literal (i.e. seven for nocase alpha,*
358	* eight for everything else). */
359	static
360	u64a litCountBits(const ue2_literal &lit) {
361	u64a n = `0`;
362	for (const auto &c : lit) {
363	n += c.nocase ? `7` : `8`;
364	}
365	return n;
366	}
367
368	/* Returns a fairly arbitrary score for the given literal, used to compare the*
369	* suitability of different candidates. */
370	static
371	u64a scoreLiteral(const ue2_literal &s) {
372	// old scoring scheme: SUM(s in S: 1/s.len()^2)
373	// now weight (currently 75/25) with number of unique chars
374	// in the string
375	u64a len = litCountBits(s);
376	u64a lenUnique = litUniqueness(s.get_string()) * `8`;
377
378	u64a weightedLen = (`1000ULL` - WEIGHT_OF_UNIQUENESS) * len +
379	WEIGHT_OF_UNIQUENESS * lenUnique;
380	weightedLen /= `8`;
381
382	DEBUG_PRINTF("scored literal '%s' %llu\n",
383	escapeString(s.get_string()).c_str(), weightedLen);
384
385	return weightedLen;
386	}
387
388
389	/**
390	* calculateScore has the following properties:
391	* - score of literal is the same as the score of the reversed literal;
392	* - score of substring of literal is worse than the original literal's score;
393	* - score of any literal should be non-zero.
394	*/
395	static
396	u64a calculateScore(const ue2_literal &s) {
397	if (s.empty()) {
398	return NO_LITERAL_AT_EDGE_SCORE;
399	}
400
401	u64a weightedLen = scoreLiteral(s);
402
403	DEBUG_PRINTF("len %zu, wl %llu\n", s.length(), weightedLen);
404	u64a rv = `1000000000000000ULL`/(weightedLen * weightedLen * weightedLen);
405
406	if (!rv) {
407	rv = `1`;
408	}
409	DEBUG_PRINTF("len %zu, score %llu\n", s.length(), rv);
410	return rv;
411	}
412
413	/* Adds a literal in reverse order, building up a suffix tree. /
414	static
415	void addReversedLiteral(const ue2_literal &lit, LitGraph &lg) {
416	DEBUG_PRINTF("literal: '%s'\n", escapeString(lit).c_str());
417	ue2_literal suffix;
418	LitVertex v = lg.root;
419	for (auto it = lit.rbegin(), ite = lit.rend(); it != ite; ++it) {
420	suffix.push_back(*it);
421	LitVertex w;
422	for (auto v2 : adjacent_vertices_range(v, lg)) {
423	if (v2 != lg.sink && lg [v2].c == *it) {
424	w = v2;
425	goto next_char;
426	}
427	}
428	w = add_vertex(LitGraphVertexProps (*it), lg);
429	add_edge(v, w, LitGraphEdgeProps (calculateScore(suffix)), lg);
430	next_char:
431	v = w;
432	}
433
434	// Wire the last vertex to the sink.
435	add_edge(v, lg.sink, lg);
436	}
437
438	static
439	void extractLiterals(const vector<LitEdge> &cutset, const LitGraph &lg,
440	set<ue2_literal> &s) {
441	for (const auto &e : cutset) {
442	LitVertex u = source(e, lg);
443	LitVertex v = target(e, lg);
444	ue2_literal lit;
445	lit.push_back(lg [v].c);
446	while (u != lg.root) {
447	lit.push_back(lg [u].c);
448	assert(in_degree(u, lg) == `1`);
449	LitGraph::inv_adjacency_iterator ai, ae;
450	tie(ai, ae) = inv_adjacent_vertices(u, lg);
451	if (ai == ae) {
452	// Path has been cut, time for the next literal.
453	goto next_literal;
454	}
455	u = *ai;
456	}
457	DEBUG_PRINTF("extracted: '%s'\n", escapeString(lit).c_str());
458	s.insert(lit);
459	next_literal:
460	;
461	}
462	}
463
464	#ifdef DEBUG
465	static UNUSED
466	const char *describeColor(small_color c) {
467	switch (c) {
468	case small_color::white:
469	return "white";
470	case small_color::gray:
471	return "gray";
472	case small_color::black:
473	return "black";
474	default:
475	return "unknown";
476	}
477	}
478	#endif
479
480	/**
481	* The BGL's boykov_kolmogorov_max_flow requires that all edges have their
482	* reverse edge in the graph. This function adds them, returning a vector
483	* mapping edge index to reverse edge. Note: LitGraph should be a DAG so there
484	* should be no existing reverse_edges.
485	*/
486	static
487	vector<LitEdge> add_reverse_edges_and_index(LitGraph &lg) {
488	const size_t edge_count = num_edges(lg);
489	vector<LitEdge> fwd_edges;
490	fwd_edges.reserve(edge_count);
491	for (const auto &e : edges_range(lg)) {
492	fwd_edges.push_back(e);
493	}
494
495	vector<LitEdge> rev_map(`2` * edge_count);
496
497	for (const auto &e : fwd_edges) {
498	LitVertex u = source(e, lg);
499	LitVertex v = target(e, lg);
500
501	assert(!edge(v, u, lg).second);
502
503	LitEdge rev = add_edge(v, u, LitGraphEdgeProps (`0`), lg).first;
504	rev_map [lg [e].index] = rev;
505	rev_map [lg [rev].index] = e;
506	}
507
508	return rev_map;
509	}
510
511	static
512	void findMinCut(LitGraph &lg, vector<LitEdge> &cutset) {
513	cutset.clear();
514
515	//dumpGraph("litgraph.dot", lg);
516
517	assert(!in_degree(lg.root, lg));
518	assert(!out_degree(lg.sink, lg));
519	size_t num_real_edges = num_edges(lg);
520
521	// Add reverse edges for the convenience of the BGL's max flow algorithm.
522	vector<LitEdge> rev_edges = add_reverse_edges_and_index(lg);
523
524	const auto v_index_map = get(&LitGraphVertexProps::index, lg);
525	const auto e_index_map = get(&LitGraphEdgeProps::index, lg);
526	const size_t num_verts = num_vertices(lg);
527	auto colors = make_small_color_map(lg);
528	vector<s32> distances(num_verts);
529	vector<LitEdge> predecessors(num_verts);
530	vector<u64a> residuals(num_edges(lg));
531
532	UNUSED u64a flow = boykov_kolmogorov_max_flow(lg,
533	get(&LitGraphEdgeProps::score, lg),
534	make_iterator_property_map(residuals.begin(), e_index_map),
535	make_iterator_property_map(rev_edges.begin(), e_index_map),
536	make_iterator_property_map(predecessors.begin(), v_index_map),
537	colors,
538	make_iterator_property_map(distances.begin(), v_index_map),
539	v_index_map, lg.root, lg.sink);
540	DEBUG_PRINTF("done, flow = %llu\n", flow);
541
542	/ remove reverse edges /
543	remove_edge_if([&](const LitEdge &e) {
544	return lg [e].index >= num_real_edges;
545	}, lg);
546
547	vector<LitEdge> white_cut, black_cut;
548	u64a white_flow = `0`, black_flow = `0`;
549
550	for (const auto &e : edges_range(lg)) {
551	const LitVertex u = source(e, lg), v = target(e, lg);
552	const auto ucolor = get(colors, u);
553	const auto vcolor = get(colors, v);
554
555	DEBUG_PRINTF("edge %zu:%s -> %zu:%s score %llu\n", lg[u].index,
556	describeColor(ucolor), lg[v].index, describeColor(vcolor),
557	lg[e].score);
558
559	if (ucolor != small_color::white && vcolor == small_color::white) {
560	assert(v != lg.sink);
561	white_cut.push_back(e);
562	white_flow += lg [e].score;
563	}
564	if (ucolor == small_color::black && vcolor != small_color::black) {
565	assert(v != lg.sink);
566	black_cut.push_back(e);
567	black_flow += lg [e].score;
568	}
569	}
570
571	DEBUG_PRINTF("white flow = %llu, black flow = %llu\n",
572	white_flow, black_flow);
573	assert(white_flow && black_flow);
574
575	if (white_flow <= black_flow) {
576	DEBUG_PRINTF("selected white cut\n");
577	cutset.swap(white_cut);
578	} else {
579	DEBUG_PRINTF("selected black cut\n");
580	cutset.swap(black_cut);
581	}
582
583	DEBUG_PRINTF("min cut has %zu edges\n", cutset.size());
584	assert(!cutset.empty());
585	}
586
587	/* Takes a set of literals and derives a better one from them, returning its*
588	* score. Literals with a common suffix S will be replaced with S. (for
589	* example, {foobar, fooobar} -> {oobar}).
590	*/
591	u64a compressAndScore(set<ue2_literal> &s) {
592	if (s.empty()) {
593	return NO_LITERAL_AT_EDGE_SCORE;
594	}
595
596	if (s.size() == `1`) {
597	return calculateScore(*s.begin());
598	}
599
600	UNUSED u64a initialScore = scoreSet(s);
601	DEBUG_PRINTF("begin, initial literals have score %llu\n",
602	initialScore);
603
604	LitGraph lg;
605
606	for (const auto &lit : s) {
607	addReversedLiteral(lit, lg);
608	}
609
610	DEBUG_PRINTF("suffix tree has %zu vertices and %zu edges\n",
611	num_vertices(lg), num_edges(lg));
612
613	vector<LitEdge> cutset;
614	findMinCut(lg, cutset);
615
616	s.clear();
617	extractLiterals(cutset, lg, s);
618
619	u64a score = scoreSet(s);
620	DEBUG_PRINTF("compressed score is %llu\n", score);
621	assert(score <= initialScore);
622	return score;
623	}
624
625	/ like compressAndScore, but replaces long mixed sensitivity literals with*
626	* something weaker. */
627	u64a sanitizeAndCompressAndScore(set<ue2_literal> &lits) {
628	const size_t maxExploded = `8`; // only case-explode this far
629
630	/ TODO: the whole compression thing could be made better by systematically*
631	* considering replacing literal sets not just by common suffixes but also
632	* by nocase literals. */
633
634	vector<ue2_literal> replacements;
635
636	for (auto it = lits.begin(); it != lits.end();) {
637	auto jt = it;
638	++it;
639
640	if (!bad_mixed_sensitivity(*jt)) {
641	continue;
642	}
643
644	/ we have to replace jt with something... /*
645	ue2_literal s = *jt;
646	lits.erase(jt);
647
648	vector<ue2_literal> exploded;
649	for (auto cit = caseIterateBegin(s); cit != caseIterateEnd(); ++cit) {
650	exploded.emplace_back(cit, false*);
651	if (exploded.size() > maxExploded) {
652	goto dont_explode;
653	}
654	}
655	insert(&replacements, replacements.end(), exploded);
656
657	continue;
658	dont_explode:
659	make_nocase(&s);
660	replacements.push_back(s);
661	}
662
663	insert(&lits, replacements);
664	return compressAndScore(lits);
665	}
666
667	u64a scoreSet(const set<ue2_literal> &s) {
668	if (s.empty()) {
669	return NO_LITERAL_AT_EDGE_SCORE;
670	}
671
672	u64a score = `1ULL`;
673
674	for (const auto &lit : s) {
675	score += calculateScore(lit);
676	}
677
678	return score;
679	}
680
681	set<ue2_literal> getLiteralSet(const NGHolder &g, const NFAEdge &e) {
682	set<ue2_literal> s;
683	processWorkQueue(g, e, s);
684	return s;
685	}
686
687	set<ue2_literal> getLiteralSet(const NGHolder &g, const NFAVertex &v,
688	bool only_first_encounter) {
689	set<ue2_literal> s;
690
691	if (is_special(v, g)) {
692	return s;
693	}
694
695	set<ue2_literal> ls;
696
697	for (const auto &e : in_edges_range(v, g)) {
698	if (source(e, g) == v && only_first_encounter) {
699	continue; / ignore self loop on root vertex as we are interested in*
700	* the first time we visit the vertex on the way to
701	* accept. In fact, we can ignore any back edges - but
702	* they would require a bit of effort to discover. */
703	}
704
705	ls = getLiteralSet(g, e);
706	if (ls.empty()) {
707	s.clear();
708	return s;
709	} else {
710	s.insert(ls.begin(), ls.end());
711	}
712	}
713
714	return s;
715	}
716
717	vector<u64a> scoreEdges(const NGHolder &g, const flat_set<NFAEdge> &known_bad) {
718	assert(hasCorrectlyNumberedEdges(g));
719
720	vector<u64a> scores(num_edges(g));
721
722	for (const auto &e : edges_range(g)) {
723	u32 eidx = g [e].index;
724	assert(eidx < scores.size());
725	if (contains(known_bad, e)) {
726	scores [eidx] = NO_LITERAL_AT_EDGE_SCORE;
727	} else {
728	set<ue2_literal> ls = getLiteralSet(g, e);
729	scores [eidx] = compressAndScore(ls);
730	}
731	}
732
733	return scores;
734	}
735
736	bool splitOffLeadingLiteral(const NGHolder &g, ue2_literal *lit_out,
737	NGHolder *rhs) {
738	DEBUG_PRINTF("looking for leading floating literal\n");
739	set<NFAVertex> s_succ;
740	insert(&s_succ, adjacent_vertices(g.start, g));
741
742	set<NFAVertex> sds_succ;
743	insert(&sds_succ, adjacent_vertices(g.startDs, g));
744
745	bool floating = is_subset_of(s_succ, sds_succ);
746	if (!floating) {
747	DEBUG_PRINTF("not floating\n");
748	return false;
749	}
750
751	sds_succ.erase(g.startDs);
752	if (sds_succ.size() != `1`) {
753	DEBUG_PRINTF("branchy root\n");
754	return false;
755	}
756
757	NFAVertex u = g.startDs;
758	NFAVertex v = *sds_succ.begin();
759
760	while (true) {
761	DEBUG_PRINTF("validating vertex %zu\n", g[v].index);
762
763	assert(v != g.acceptEod && v != g.accept);
764
765	const CharReach &cr = g [v].char_reach;
766	if (cr.count() != `1` && !cr.isCaselessChar()) {
767	break;
768	}
769
770	// Rose can only handle mixed-sensitivity literals up to the max mask
771	// length.
772	if (lit_out->length() >= MAX_MASK2_WIDTH) {
773	if (mixed_sensitivity(*lit_out)) {
774	DEBUG_PRINTF("long and mixed sensitivity\n");
775	break;
776	}
777	if (ourisalpha((char)cr.find_first())) {
778	if (cr.isCaselessChar() != lit_out->any_nocase()) {
779	DEBUG_PRINTF("stop at mixed sensitivity on '%c'\n",
780	(char)cr.find_first());
781	break;
782	}
783	}
784	}
785
786	if (edge(v, g.accept, g).second \|\| edge(v, g.acceptEod, g).second) {
787	DEBUG_PRINTF("connection to accept\n");
788	break;
789	}
790
791	lit_out->push_back(cr.find_first(), cr.isCaselessChar());
792	u = v;
793
794	if (out_degree(v, g) != `1`) {
795	DEBUG_PRINTF("out_degree != 1\n");
796	break;
797	}
798
799	v = *adjacent_vertices(v, g).first;
800
801	if (in_degree(v, g) != `1`) {
802	DEBUG_PRINTF("blargh\n"); / picks up cases where there is no path*
803	* to case accept (large cycles),
804	* ensures term */
805	break;
806	}
807	}
808
809	if (lit_out->empty()) {
810	return false;
811	}
812	assert(u != g.startDs);
813
814	unordered_map<NFAVertex, NFAVertex> rhs_map;
815	vector<NFAVertex> pivots = make_vector_from(adjacent_vertices(u, g));
816	splitRHS(g, pivots, rhs, &rhs_map);
817
818	DEBUG_PRINTF("literal is '%s' (len %zu)\n", dumpString(*lit_out).c_str(),
819	lit_out->length());
820	assert(is_triggered(*rhs));
821	return true;
822	}
823
824	bool getTrailingLiteral(const NGHolder &g, ue2_literal *lit_out) {
825	if (in_degree(g.acceptEod, g) != `1`) {
826	return false;
827	}
828
829	NFAVertex v = getSoleSourceVertex(g, g.accept);
830
831	if (!v) {
832	return false;
833	}
834
835	set<ue2_literal> s = getLiteralSet(g, v, false);
836
837	if (s.size() != `1`) {
838	return false;
839	}
840
841	const ue2_literal &lit = *s.begin();
842
843	if (lit.length() > MAX_MASK2_WIDTH && mixed_sensitivity(lit)) {
844	DEBUG_PRINTF("long & mixed-sensitivity, Rose can't handle this.\n");
845	return false;
846	}
847
848	*lit_out = lit;
849	return true;
850	}
851
852	bool literalIsWholeGraph(const NGHolder &g, const ue2_literal &lit) {
853	NFAVertex v = g.accept;
854
855	for (auto it = lit.rbegin(), ite = lit.rend(); it != ite; ++it) {
856	NGHolder::inv_adjacency_iterator ai, ae;
857	tie(ai, ae) = inv_adjacent_vertices(v, g);
858	if (ai == ae) {
859	assert(`0`); // no predecessors?
860	return false;
861	}
862	v = *ai ++;
863	if (ai != ae) {
864	DEBUG_PRINTF("branch, fail\n");
865	return false;
866	}
867
868	if (is_special(v, g)) {
869	DEBUG_PRINTF("special found, fail\n");
870	return false;
871	}
872
873	const CharReach &cr_g = g [v].char_reach;
874	const CharReach &cr_l = *it;
875
876	if (!cr_l.isSubsetOf(cr_g)) {
877	/ running over the prefix is needed to prevent false postives /
878	DEBUG_PRINTF("reach fail\n");
879	return false;
880	}
881	}
882
883	// Our last value for v should have only start states for predecessors.
884	for (auto u : inv_adjacent_vertices_range(v, g)) {
885	if (!is_any_start(u, g)) {
886	DEBUG_PRINTF("pred is not start\n");
887	return false;
888	}
889	}
890
891	assert(num_vertices(g) == lit.length() + N_SPECIALS);
892
893	DEBUG_PRINTF("ok\n");
894	return true;
895	}
896
897	} // namespace ue2
898

Browse the source code of ClickHouse/contrib/hyperscan/src/nfagraph/ng_literal_analysis.cpp