1#include <Common/Exception.h>
2#include <Common/PODArray.h>
3#include <Common/OptimizedRegularExpression.h>
4
5#define MIN_LENGTH_FOR_STRSTR 3
6#define MAX_SUBPATTERNS 5
7
8
9namespace DB
10{
11 namespace ErrorCodes
12 {
13 extern const int CANNOT_COMPILE_REGEXP;
14 }
15}
16
17
18template <bool thread_safe>
19void OptimizedRegularExpressionImpl<thread_safe>::analyze(
20 const std::string & regexp,
21 std::string & required_substring,
22 bool & is_trivial,
23 bool & required_substring_is_prefix)
24{
25 /** The expression is trivial if all the metacharacters in it are escaped.
26 * The non-alternative string is
27 * a string outside parentheses,
28 * in which all metacharacters are escaped,
29 * and also if there are no '|' outside the brackets,
30 * and also avoid substrings of the form `http://` or `www` and some other
31 * (this is the hack for typical use case in Yandex.Metrica).
32 */
33 const char * begin = regexp.data();
34 const char * pos = begin;
35 const char * end = regexp.data() + regexp.size();
36 int depth = 0;
37 is_trivial = true;
38 required_substring_is_prefix = false;
39 required_substring.clear();
40 bool has_alternative_on_depth_0 = false;
41
42 /// Substring with a position.
43 using Substring = std::pair<std::string, size_t>;
44 using Substrings = std::vector<Substring>;
45
46 Substrings trivial_substrings(1);
47 Substring * last_substring = &trivial_substrings.back();
48
49 bool in_curly_braces = false;
50 bool in_square_braces = false;
51
52 while (pos != end)
53 {
54 switch (*pos)
55 {
56 case '\0':
57 pos = end;
58 break;
59
60 case '\\':
61 {
62 ++pos;
63 if (pos == end)
64 break;
65
66 switch (*pos)
67 {
68 case '|': case '(': case ')': case '^': case '$': case '.': case '[': case '?': case '*': case '+': case '{':
69 if (depth == 0 && !in_curly_braces && !in_square_braces)
70 {
71 if (last_substring->first.empty())
72 last_substring->second = pos - begin;
73 last_substring->first.push_back(*pos);
74 }
75 break;
76 default:
77 /// all other escape sequences are not supported
78 is_trivial = false;
79 if (!last_substring->first.empty())
80 {
81 trivial_substrings.resize(trivial_substrings.size() + 1);
82 last_substring = &trivial_substrings.back();
83 }
84 break;
85 }
86
87 ++pos;
88 break;
89 }
90
91 case '|':
92 if (depth == 0)
93 has_alternative_on_depth_0 = true;
94 is_trivial = false;
95 if (!in_square_braces && !last_substring->first.empty())
96 {
97 trivial_substrings.resize(trivial_substrings.size() + 1);
98 last_substring = &trivial_substrings.back();
99 }
100 ++pos;
101 break;
102
103 case '(':
104 if (!in_square_braces)
105 {
106 ++depth;
107 is_trivial = false;
108 if (!last_substring->first.empty())
109 {
110 trivial_substrings.resize(trivial_substrings.size() + 1);
111 last_substring = &trivial_substrings.back();
112 }
113 }
114 ++pos;
115 break;
116
117 case '[':
118 in_square_braces = true;
119 ++depth;
120 is_trivial = false;
121 if (!last_substring->first.empty())
122 {
123 trivial_substrings.resize(trivial_substrings.size() + 1);
124 last_substring = &trivial_substrings.back();
125 }
126 ++pos;
127 break;
128
129 case ']':
130 if (!in_square_braces)
131 goto ordinary;
132
133 in_square_braces = false;
134 --depth;
135 is_trivial = false;
136 if (!last_substring->first.empty())
137 {
138 trivial_substrings.resize(trivial_substrings.size() + 1);
139 last_substring = &trivial_substrings.back();
140 }
141 ++pos;
142 break;
143
144 case ')':
145 if (!in_square_braces)
146 {
147 --depth;
148 is_trivial = false;
149 if (!last_substring->first.empty())
150 {
151 trivial_substrings.resize(trivial_substrings.size() + 1);
152 last_substring = &trivial_substrings.back();
153 }
154 }
155 ++pos;
156 break;
157
158 case '^': case '$': case '.': case '+':
159 is_trivial = false;
160 if (!last_substring->first.empty() && !in_square_braces)
161 {
162 trivial_substrings.resize(trivial_substrings.size() + 1);
163 last_substring = &trivial_substrings.back();
164 }
165 ++pos;
166 break;
167
168 /// Quantifiers that allow a zero number of occurrences.
169 case '{':
170 in_curly_braces = true;
171 [[fallthrough]];
172 case '?':
173 [[fallthrough]];
174 case '*':
175 is_trivial = false;
176 if (!last_substring->first.empty() && !in_square_braces)
177 {
178 last_substring->first.resize(last_substring->first.size() - 1);
179 trivial_substrings.resize(trivial_substrings.size() + 1);
180 last_substring = &trivial_substrings.back();
181 }
182 ++pos;
183 break;
184
185 case '}':
186 if (!in_curly_braces)
187 goto ordinary;
188
189 in_curly_braces = false;
190 ++pos;
191 break;
192
193 ordinary: /// Normal, not escaped symbol.
194 [[fallthrough]];
195 default:
196 if (depth == 0 && !in_curly_braces && !in_square_braces)
197 {
198 if (last_substring->first.empty())
199 last_substring->second = pos - begin;
200 last_substring->first.push_back(*pos);
201 }
202 ++pos;
203 break;
204 }
205 }
206
207 if (last_substring && last_substring->first.empty())
208 trivial_substrings.pop_back();
209
210 if (!is_trivial)
211 {
212 if (!has_alternative_on_depth_0)
213 {
214 /// We choose the non-alternative substring of the maximum length for first search.
215
216 /// Tuning for typical usage domain
217 auto tuning_strings_condition = [](const std::string & str)
218 {
219 return str != "://" && str != "http://" && str != "www" && str != "Windows ";
220 };
221 size_t max_length = 0;
222 Substrings::const_iterator candidate_it = trivial_substrings.begin();
223 for (Substrings::const_iterator it = trivial_substrings.begin(); it != trivial_substrings.end(); ++it)
224 {
225 if (it->first.size() > max_length && tuning_strings_condition(it->first))
226 {
227 max_length = it->first.size();
228 candidate_it = it;
229 }
230 }
231
232 if (max_length >= MIN_LENGTH_FOR_STRSTR)
233 {
234 required_substring = candidate_it->first;
235 required_substring_is_prefix = candidate_it->second == 0;
236 }
237 }
238 }
239 else if (!trivial_substrings.empty())
240 {
241 required_substring = trivial_substrings.front().first;
242 required_substring_is_prefix = trivial_substrings.front().second == 0;
243 }
244
245/* std::cerr
246 << "regexp: " << regexp
247 << ", is_trivial: " << is_trivial
248 << ", required_substring: " << required_substring
249 << ", required_substring_is_prefix: " << required_substring_is_prefix
250 << std::endl;*/
251}
252
253
254template <bool thread_safe>
255OptimizedRegularExpressionImpl<thread_safe>::OptimizedRegularExpressionImpl(const std::string & regexp_, int options)
256{
257 analyze(regexp_, required_substring, is_trivial, required_substring_is_prefix);
258
259 /// Just three following options are supported
260 if (options & (~(RE_CASELESS | RE_NO_CAPTURE | RE_DOT_NL)))
261 throw DB::Exception("OptimizedRegularExpression: Unsupported option.", DB::ErrorCodes::CANNOT_COMPILE_REGEXP);
262
263 is_case_insensitive = options & RE_CASELESS;
264 bool is_no_capture = options & RE_NO_CAPTURE;
265 bool is_dot_nl = options & RE_DOT_NL;
266
267 number_of_subpatterns = 0;
268 if (!is_trivial)
269 {
270 /// Compile the re2 regular expression.
271 typename RegexType::Options regexp_options;
272
273 /// Never write error messages to stderr. It's ignorant to do it from library code.
274 regexp_options.set_log_errors(false);
275
276 if (is_case_insensitive)
277 regexp_options.set_case_sensitive(false);
278
279 if (is_dot_nl)
280 regexp_options.set_dot_nl(true);
281
282 re2 = std::make_unique<RegexType>(regexp_, regexp_options);
283 if (!re2->ok())
284 throw DB::Exception("OptimizedRegularExpression: cannot compile re2: " + regexp_ + ", error: " + re2->error() + ". Look at https://github.com/google/re2/wiki/Syntax for reference.", DB::ErrorCodes::CANNOT_COMPILE_REGEXP);
285
286 if (!is_no_capture)
287 {
288 number_of_subpatterns = re2->NumberOfCapturingGroups();
289 if (number_of_subpatterns > MAX_SUBPATTERNS)
290 throw DB::Exception("OptimizedRegularExpression: too many subpatterns in regexp: " + regexp_, DB::ErrorCodes::CANNOT_COMPILE_REGEXP);
291 }
292 }
293}
294
295
296template <bool thread_safe>
297bool OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject, size_t subject_size) const
298{
299 if (is_trivial)
300 {
301 if (is_case_insensitive)
302 return nullptr != strcasestr(subject, required_substring.data());
303 else
304 return nullptr != strstr(subject, required_substring.data());
305 }
306 else
307 {
308 if (!required_substring.empty())
309 {
310 const char * pos;
311 if (is_case_insensitive)
312 pos = strcasestr(subject, required_substring.data());
313 else
314 pos = strstr(subject, required_substring.data());
315
316 if (nullptr == pos)
317 return 0;
318 }
319
320 return re2->Match(StringPieceType(subject, subject_size), 0, subject_size, RegexType::UNANCHORED, nullptr, 0);
321 }
322}
323
324
325template <bool thread_safe>
326bool OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject, size_t subject_size, Match & match) const
327{
328 if (is_trivial)
329 {
330 const char * pos;
331 if (is_case_insensitive)
332 pos = strcasestr(subject, required_substring.data());
333 else
334 pos = strstr(subject, required_substring.data());
335
336 if (pos == nullptr)
337 return 0;
338 else
339 {
340 match.offset = pos - subject;
341 match.length = required_substring.size();
342 return 1;
343 }
344 }
345 else
346 {
347 if (!required_substring.empty())
348 {
349 const char * pos;
350 if (is_case_insensitive)
351 pos = strcasestr(subject, required_substring.data());
352 else
353 pos = strstr(subject, required_substring.data());
354
355 if (nullptr == pos)
356 return 0;
357 }
358
359 StringPieceType piece;
360
361 if (!RegexType::PartialMatch(StringPieceType(subject, subject_size), *re2, &piece))
362 return 0;
363 else
364 {
365 match.offset = piece.data() - subject;
366 match.length = piece.length();
367 return 1;
368 }
369 }
370}
371
372
373template <bool thread_safe>
374unsigned OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject, size_t subject_size, MatchVec & matches, unsigned limit) const
375{
376 matches.clear();
377
378 if (limit == 0)
379 return 0;
380
381 if (limit > number_of_subpatterns + 1)
382 limit = number_of_subpatterns + 1;
383
384 if (is_trivial)
385 {
386 const char * pos;
387 if (is_case_insensitive)
388 pos = strcasestr(subject, required_substring.data());
389 else
390 pos = strstr(subject, required_substring.data());
391
392 if (pos == nullptr)
393 return 0;
394 else
395 {
396 Match match;
397 match.offset = pos - subject;
398 match.length = required_substring.size();
399 matches.push_back(match);
400 return 1;
401 }
402 }
403 else
404 {
405 if (!required_substring.empty())
406 {
407 const char * pos;
408 if (is_case_insensitive)
409 pos = strcasestr(subject, required_substring.data());
410 else
411 pos = strstr(subject, required_substring.data());
412
413 if (nullptr == pos)
414 return 0;
415 }
416
417 DB::PODArrayWithStackMemory<StringPieceType, sizeof(StringPieceType) * (MAX_SUBPATTERNS+1)> pieces(limit);
418
419 if (!re2->Match(StringPieceType(subject, subject_size), 0, subject_size, RegexType::UNANCHORED, pieces.data(), pieces.size()))
420 return 0;
421 else
422 {
423 matches.resize(limit);
424 for (size_t i = 0; i < limit; ++i)
425 {
426 if (pieces[i] != nullptr)
427 {
428 matches[i].offset = pieces[i].data() - subject;
429 matches[i].length = pieces[i].length();
430 }
431 else
432 {
433 matches[i].offset = std::string::npos;
434 matches[i].length = 0;
435 }
436 }
437 return limit;
438 }
439 }
440}
441
442template class OptimizedRegularExpressionImpl<true>;
443template class OptimizedRegularExpressionImpl<false>;
444