OptimizedRegularExpression.cpp source code [ClickHouse/dbms/src/Common/OptimizedRegularExpression.cpp]

1	#include <Common/Exception.h>
2	#include <Common/PODArray.h>
3	#include <Common/OptimizedRegularExpression.h>
4
5	#define MIN_LENGTH_FOR_STRSTR 3
6	#define MAX_SUBPATTERNS 5
7
8
9	namespace DB
10	{
11	namespace ErrorCodes
12	{
13	extern const int CANNOT_COMPILE_REGEXP;
14	}
15	}
16
17
18	template <bool thread_safe>
19	void OptimizedRegularExpressionImpl<thread_safe>::analyze(
20	const std::string & regexp,
21	std::string & required_substring,
22	bool & is_trivial,
23	bool & required_substring_is_prefix)
24	{
25	/* The expression is trivial if all the metacharacters in it are escaped.*
26	* The non-alternative string is
27	* a string outside parentheses,
28	* in which all metacharacters are escaped,
29	* and also if there are no '\|' outside the brackets,
30	* and also avoid substrings of the form `http://` or `www` and some other
31	* (this is the hack for typical use case in Yandex.Metrica).
32	*/
33	const char * begin = regexp.data();
34	const char * pos = begin;
35	const char * end = regexp.data() + regexp.size();
36	int depth = `0`;
37	is_trivial = true;
38	required_substring_is_prefix = false;
39	required_substring.clear();
40	bool has_alternative_on_depth_0 = false;
41
42	/// Substring with a position.
43	using Substring = std::pair<std::string, size_t>;
44	using Substrings = std::vector<Substring>;
45
46	Substrings trivial_substrings(`1`);
47	Substring * last_substring = &trivial_substrings.back();
48
49	bool in_curly_braces = false;
50	bool in_square_braces = false;
51
52	while (pos != end)
53	{
54	switch (*pos)
55	{
56	case `'\0'`:
57	pos = end;
58	break;
59
60	case `'\\'`:
61	{
62	++pos;
63	if (pos == end)
64	break;
65
66	switch (*pos)
67	{
68	case `'\|'`: case `'('`: case `')'`: case `'^'`: case `'$'`: case `'.'`: case `'['`: case `'?'`: case `''`: case* `'+'`: case `'{'`:
69	if (depth == `0` && !in_curly_braces && !in_square_braces)
70	{
71	if (last_substring->first.empty())
72	last_substring->second = pos - begin;
73	last_substring->first.push_back(*pos);
74	}
75	break;
76	default:
77	/// all other escape sequences are not supported
78	is_trivial = false;
79	if (!last_substring->first.empty())
80	{
81	trivial_substrings.resize(trivial_substrings.size() + `1`);
82	last_substring = &trivial_substrings.back();
83	}
84	break;
85	}
86
87	++pos;
88	break;
89	}
90
91	case `'\|'`:
92	if (depth == `0`)
93	has_alternative_on_depth_0 = true;
94	is_trivial = false;
95	if (!in_square_braces && !last_substring->first.empty())
96	{
97	trivial_substrings.resize(trivial_substrings.size() + `1`);
98	last_substring = &trivial_substrings.back();
99	}
100	++pos;
101	break;
102
103	case `'('`:
104	if (!in_square_braces)
105	{
106	++depth;
107	is_trivial = false;
108	if (!last_substring->first.empty())
109	{
110	trivial_substrings.resize(trivial_substrings.size() + `1`);
111	last_substring = &trivial_substrings.back();
112	}
113	}
114	++pos;
115	break;
116
117	case `'['`:
118	in_square_braces = true;
119	++depth;
120	is_trivial = false;
121	if (!last_substring->first.empty())
122	{
123	trivial_substrings.resize(trivial_substrings.size() + `1`);
124	last_substring = &trivial_substrings.back();
125	}
126	++pos;
127	break;
128
129	case `']'`:
130	if (!in_square_braces)
131	goto ordinary;
132
133	in_square_braces = false;
134	--depth;
135	is_trivial = false;
136	if (!last_substring->first.empty())
137	{
138	trivial_substrings.resize(trivial_substrings.size() + `1`);
139	last_substring = &trivial_substrings.back();
140	}
141	++pos;
142	break;
143
144	case `')'`:
145	if (!in_square_braces)
146	{
147	--depth;
148	is_trivial = false;
149	if (!last_substring->first.empty())
150	{
151	trivial_substrings.resize(trivial_substrings.size() + `1`);
152	last_substring = &trivial_substrings.back();
153	}
154	}
155	++pos;
156	break;
157
158	case `'^'`: case `'$'`: case `'.'`: case `'+'`:
159	is_trivial = false;
160	if (!last_substring->first.empty() && !in_square_braces)
161	{
162	trivial_substrings.resize(trivial_substrings.size() + `1`);
163	last_substring = &trivial_substrings.back();
164	}
165	++pos;
166	break;
167
168	/// Quantifiers that allow a zero number of occurrences.
169	case `'{'`:
170	in_curly_braces = true;
171	[[fallthrough]];
172	case `'?'`:
173	[[fallthrough]];
174	case `'*'`:
175	is_trivial = false;
176	if (!last_substring->first.empty() && !in_square_braces)
177	{
178	last_substring->first.resize(last_substring->first.size() - `1`);
179	trivial_substrings.resize(trivial_substrings.size() + `1`);
180	last_substring = &trivial_substrings.back();
181	}
182	++pos;
183	break;
184
185	case `'}'`:
186	if (!in_curly_braces)
187	goto ordinary;
188
189	in_curly_braces = false;
190	++pos;
191	break;
192
193	ordinary: /// Normal, not escaped symbol.
194	[[fallthrough]];
195	default:
196	if (depth == `0` && !in_curly_braces && !in_square_braces)
197	{
198	if (last_substring->first.empty())
199	last_substring->second = pos - begin;
200	last_substring->first.push_back(*pos);
201	}
202	++pos;
203	break;
204	}
205	}
206
207	if (last_substring && last_substring->first.empty())
208	trivial_substrings.pop_back();
209
210	if (!is_trivial)
211	{
212	if (!has_alternative_on_depth_0)
213	{
214	/// We choose the non-alternative substring of the maximum length for first search.
215
216	/// Tuning for typical usage domain
217	auto tuning_strings_condition = [](const std::string & str)
218	{
219	return str != "://" && str != "http://" && str != "www" && str != "Windows ";
220	};
221	size_t max_length = `0`;
222	Substrings::const_iterator candidate_it = trivial_substrings.begin();
223	for (Substrings::const_iterator it = trivial_substrings.begin(); it != trivial_substrings.end(); ++it)
224	{
225	if (it ->first.size() > max_length && tuning_strings_condition(it ->first))
226	{
227	max_length = it ->first.size();
228	candidate_it = it;
229	}
230	}
231
232	if (max_length >= MIN_LENGTH_FOR_STRSTR)
233	{
234	required_substring = candidate_it ->first;
235	required_substring_is_prefix = candidate_it ->second == `0`;
236	}
237	}
238	}
239	else if (!trivial_substrings.empty())
240	{
241	required_substring = trivial_substrings.front().first;
242	required_substring_is_prefix = trivial_substrings.front().second == `0`;
243	}
244
245	/ std::cerr*
246	<< "regexp: " << regexp
247	<< ", is_trivial: " << is_trivial
248	<< ", required_substring: " << required_substring
249	<< ", required_substring_is_prefix: " << required_substring_is_prefix
250	<< std::endl;/*
251	}
252
253
254	template <bool thread_safe>
255	OptimizedRegularExpressionImpl<thread_safe>::OptimizedRegularExpressionImpl(const std::string & regexp_, int options)
256	{
257	analyze(regexp_, required_substring, is_trivial, required_substring_is_prefix);
258
259	/// Just three following options are supported
260	if (options & (~(RE_CASELESS \| RE_NO_CAPTURE \| RE_DOT_NL)))
261	throw DB::Exception ("OptimizedRegularExpression: Unsupported option.", DB::ErrorCodes::CANNOT_COMPILE_REGEXP);
262
263	is_case_insensitive = options & RE_CASELESS;
264	bool is_no_capture = options & RE_NO_CAPTURE;
265	bool is_dot_nl = options & RE_DOT_NL;
266
267	number_of_subpatterns = `0`;
268	if (!is_trivial)
269	{
270	/// Compile the re2 regular expression.
271	typename RegexType::Options regexp_options;
272
273	/// Never write error messages to stderr. It's ignorant to do it from library code.
274	regexp_options.set_log_errors(false);
275
276	if (is_case_insensitive)
277	regexp_options.set_case_sensitive(false);
278
279	if (is_dot_nl)
280	regexp_options.set_dot_nl(true);
281
282	re2 = std::make_unique<RegexType>(regexp_, regexp_options);
283	if (!re2->ok())
284	throw DB::Exception("OptimizedRegularExpression: cannot compile re2: " + regexp_ + ", error: " + re2->error() + ". Look at https://github.com/google/re2/wiki/Syntax for reference.", DB::ErrorCodes::CANNOT_COMPILE_REGEXP);
285
286	if (!is_no_capture)
287	{
288	number_of_subpatterns = re2->NumberOfCapturingGroups();
289	if (number_of_subpatterns > MAX_SUBPATTERNS)
290	throw DB::Exception ("OptimizedRegularExpression: too many subpatterns in regexp: " + regexp_, DB::ErrorCodes::CANNOT_COMPILE_REGEXP);
291	}
292	}
293	}
294
295
296	template <bool thread_safe>
297	bool OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject, size_t subject_size) const
298	{
299	if (is_trivial)
300	{
301	if (is_case_insensitive)
302	return nullptr != strcasestr(subject, required_substring.data());
303	else
304	return nullptr != strstr(subject, required_substring.data());
305	}
306	else
307	{
308	if (!required_substring.empty())
309	{
310	const char * pos;
311	if (is_case_insensitive)
312	pos = strcasestr(subject, required_substring.data());
313	else
314	pos = strstr(subject, required_substring.data());
315
316	if (nullptr == pos)
317	return `0`;
318	}
319
320	return re2->Match(StringPieceType(subject, subject_size), `0`, subject_size, RegexType::UNANCHORED, nullptr, `0`);
321	}
322	}
323
324
325	template <bool thread_safe>
326	bool OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject, size_t subject_size, Match & match) const
327	{
328	if (is_trivial)
329	{
330	const char * pos;
331	if (is_case_insensitive)
332	pos = strcasestr(subject, required_substring.data());
333	else
334	pos = strstr(subject, required_substring.data());
335
336	if (pos == nullptr)
337	return `0`;
338	else
339	{
340	match.offset = pos - subject;
341	match.length = required_substring.size();
342	return `1`;
343	}
344	}
345	else
346	{
347	if (!required_substring.empty())
348	{
349	const char * pos;
350	if (is_case_insensitive)
351	pos = strcasestr(subject, required_substring.data());
352	else
353	pos = strstr(subject, required_substring.data());
354
355	if (nullptr == pos)
356	return `0`;
357	}
358
359	StringPieceType piece;
360
361	if (!RegexType::PartialMatch(StringPieceType(subject, subject_size), *re2, &piece))
362	return `0`;
363	else
364	{
365	match.offset = piece.data() - subject;
366	match.length = piece.length();
367	return `1`;
368	}
369	}
370	}
371
372
373	template <bool thread_safe>
374	unsigned OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject, size_t subject_size, MatchVec & matches, unsigned limit) const
375	{
376	matches.clear();
377
378	if (limit == `0`)
379	return `0`;
380
381	if (limit > number_of_subpatterns + `1`)
382	limit = number_of_subpatterns + `1`;
383
384	if (is_trivial)
385	{
386	const char * pos;
387	if (is_case_insensitive)
388	pos = strcasestr(subject, required_substring.data());
389	else
390	pos = strstr(subject, required_substring.data());
391
392	if (pos == nullptr)
393	return `0`;
394	else
395	{
396	Match match;
397	match.offset = pos - subject;
398	match.length = required_substring.size();
399	matches.push_back(match);
400	return `1`;
401	}
402	}
403	else
404	{
405	if (!required_substring.empty())
406	{
407	const char * pos;
408	if (is_case_insensitive)
409	pos = strcasestr(subject, required_substring.data());
410	else
411	pos = strstr(subject, required_substring.data());
412
413	if (nullptr == pos)
414	return `0`;
415	}
416
417	DB::PODArrayWithStackMemory<StringPieceType, sizeof(StringPieceType) * (MAX_SUBPATTERNS+`1`)> pieces(limit);
418
419	if (!re2->Match(StringPieceType(subject, subject_size), `0`, subject_size, RegexType::UNANCHORED, pieces.data(), pieces.size()))
420	return `0`;
421	else
422	{
423	matches.resize(limit);
424	for (size_t i = `0`; i < limit; ++i)
425	{
426	if (pieces[i] != nullptr)
427	{
428	matches [i].offset = pieces[i].data() - subject;
429	matches [i].length = pieces[i].length();
430	}
431	else
432	{
433	matches [i].offset = std::string::npos;
434	matches [i].length = `0`;
435	}
436	}
437	return limit;
438	}
439	}
440	}
441
442	template class OptimizedRegularExpressionImpl<true>;
443	template class OptimizedRegularExpressionImpl<false>;
444

Browse the source code of ClickHouse/dbms/src/Common/OptimizedRegularExpression.cpp