1 | #include <Common/Exception.h> |
2 | #include <Common/PODArray.h> |
3 | #include <Common/OptimizedRegularExpression.h> |
4 | |
5 | #define MIN_LENGTH_FOR_STRSTR 3 |
6 | #define MAX_SUBPATTERNS 5 |
7 | |
8 | |
9 | namespace DB |
10 | { |
11 | namespace ErrorCodes |
12 | { |
13 | extern const int CANNOT_COMPILE_REGEXP; |
14 | } |
15 | } |
16 | |
17 | |
18 | template <bool thread_safe> |
19 | void OptimizedRegularExpressionImpl<thread_safe>::analyze( |
20 | const std::string & regexp, |
21 | std::string & required_substring, |
22 | bool & is_trivial, |
23 | bool & required_substring_is_prefix) |
24 | { |
25 | /** The expression is trivial if all the metacharacters in it are escaped. |
26 | * The non-alternative string is |
27 | * a string outside parentheses, |
28 | * in which all metacharacters are escaped, |
29 | * and also if there are no '|' outside the brackets, |
30 | * and also avoid substrings of the form `http://` or `www` and some other |
31 | * (this is the hack for typical use case in Yandex.Metrica). |
32 | */ |
33 | const char * begin = regexp.data(); |
34 | const char * pos = begin; |
35 | const char * end = regexp.data() + regexp.size(); |
36 | int depth = 0; |
37 | is_trivial = true; |
38 | required_substring_is_prefix = false; |
39 | required_substring.clear(); |
40 | bool has_alternative_on_depth_0 = false; |
41 | |
42 | /// Substring with a position. |
43 | using Substring = std::pair<std::string, size_t>; |
44 | using Substrings = std::vector<Substring>; |
45 | |
46 | Substrings trivial_substrings(1); |
47 | Substring * last_substring = &trivial_substrings.back(); |
48 | |
49 | bool in_curly_braces = false; |
50 | bool in_square_braces = false; |
51 | |
52 | while (pos != end) |
53 | { |
54 | switch (*pos) |
55 | { |
56 | case '\0': |
57 | pos = end; |
58 | break; |
59 | |
60 | case '\\': |
61 | { |
62 | ++pos; |
63 | if (pos == end) |
64 | break; |
65 | |
66 | switch (*pos) |
67 | { |
68 | case '|': case '(': case ')': case '^': case '$': case '.': case '[': case '?': case '*': case '+': case '{': |
69 | if (depth == 0 && !in_curly_braces && !in_square_braces) |
70 | { |
71 | if (last_substring->first.empty()) |
72 | last_substring->second = pos - begin; |
73 | last_substring->first.push_back(*pos); |
74 | } |
75 | break; |
76 | default: |
77 | /// all other escape sequences are not supported |
78 | is_trivial = false; |
79 | if (!last_substring->first.empty()) |
80 | { |
81 | trivial_substrings.resize(trivial_substrings.size() + 1); |
82 | last_substring = &trivial_substrings.back(); |
83 | } |
84 | break; |
85 | } |
86 | |
87 | ++pos; |
88 | break; |
89 | } |
90 | |
91 | case '|': |
92 | if (depth == 0) |
93 | has_alternative_on_depth_0 = true; |
94 | is_trivial = false; |
95 | if (!in_square_braces && !last_substring->first.empty()) |
96 | { |
97 | trivial_substrings.resize(trivial_substrings.size() + 1); |
98 | last_substring = &trivial_substrings.back(); |
99 | } |
100 | ++pos; |
101 | break; |
102 | |
103 | case '(': |
104 | if (!in_square_braces) |
105 | { |
106 | ++depth; |
107 | is_trivial = false; |
108 | if (!last_substring->first.empty()) |
109 | { |
110 | trivial_substrings.resize(trivial_substrings.size() + 1); |
111 | last_substring = &trivial_substrings.back(); |
112 | } |
113 | } |
114 | ++pos; |
115 | break; |
116 | |
117 | case '[': |
118 | in_square_braces = true; |
119 | ++depth; |
120 | is_trivial = false; |
121 | if (!last_substring->first.empty()) |
122 | { |
123 | trivial_substrings.resize(trivial_substrings.size() + 1); |
124 | last_substring = &trivial_substrings.back(); |
125 | } |
126 | ++pos; |
127 | break; |
128 | |
129 | case ']': |
130 | if (!in_square_braces) |
131 | goto ordinary; |
132 | |
133 | in_square_braces = false; |
134 | --depth; |
135 | is_trivial = false; |
136 | if (!last_substring->first.empty()) |
137 | { |
138 | trivial_substrings.resize(trivial_substrings.size() + 1); |
139 | last_substring = &trivial_substrings.back(); |
140 | } |
141 | ++pos; |
142 | break; |
143 | |
144 | case ')': |
145 | if (!in_square_braces) |
146 | { |
147 | --depth; |
148 | is_trivial = false; |
149 | if (!last_substring->first.empty()) |
150 | { |
151 | trivial_substrings.resize(trivial_substrings.size() + 1); |
152 | last_substring = &trivial_substrings.back(); |
153 | } |
154 | } |
155 | ++pos; |
156 | break; |
157 | |
158 | case '^': case '$': case '.': case '+': |
159 | is_trivial = false; |
160 | if (!last_substring->first.empty() && !in_square_braces) |
161 | { |
162 | trivial_substrings.resize(trivial_substrings.size() + 1); |
163 | last_substring = &trivial_substrings.back(); |
164 | } |
165 | ++pos; |
166 | break; |
167 | |
168 | /// Quantifiers that allow a zero number of occurrences. |
169 | case '{': |
170 | in_curly_braces = true; |
171 | [[fallthrough]]; |
172 | case '?': |
173 | [[fallthrough]]; |
174 | case '*': |
175 | is_trivial = false; |
176 | if (!last_substring->first.empty() && !in_square_braces) |
177 | { |
178 | last_substring->first.resize(last_substring->first.size() - 1); |
179 | trivial_substrings.resize(trivial_substrings.size() + 1); |
180 | last_substring = &trivial_substrings.back(); |
181 | } |
182 | ++pos; |
183 | break; |
184 | |
185 | case '}': |
186 | if (!in_curly_braces) |
187 | goto ordinary; |
188 | |
189 | in_curly_braces = false; |
190 | ++pos; |
191 | break; |
192 | |
193 | ordinary: /// Normal, not escaped symbol. |
194 | [[fallthrough]]; |
195 | default: |
196 | if (depth == 0 && !in_curly_braces && !in_square_braces) |
197 | { |
198 | if (last_substring->first.empty()) |
199 | last_substring->second = pos - begin; |
200 | last_substring->first.push_back(*pos); |
201 | } |
202 | ++pos; |
203 | break; |
204 | } |
205 | } |
206 | |
207 | if (last_substring && last_substring->first.empty()) |
208 | trivial_substrings.pop_back(); |
209 | |
210 | if (!is_trivial) |
211 | { |
212 | if (!has_alternative_on_depth_0) |
213 | { |
214 | /// We choose the non-alternative substring of the maximum length for first search. |
215 | |
216 | /// Tuning for typical usage domain |
217 | auto tuning_strings_condition = [](const std::string & str) |
218 | { |
219 | return str != "://" && str != "http://" && str != "www" && str != "Windows " ; |
220 | }; |
221 | size_t max_length = 0; |
222 | Substrings::const_iterator candidate_it = trivial_substrings.begin(); |
223 | for (Substrings::const_iterator it = trivial_substrings.begin(); it != trivial_substrings.end(); ++it) |
224 | { |
225 | if (it->first.size() > max_length && tuning_strings_condition(it->first)) |
226 | { |
227 | max_length = it->first.size(); |
228 | candidate_it = it; |
229 | } |
230 | } |
231 | |
232 | if (max_length >= MIN_LENGTH_FOR_STRSTR) |
233 | { |
234 | required_substring = candidate_it->first; |
235 | required_substring_is_prefix = candidate_it->second == 0; |
236 | } |
237 | } |
238 | } |
239 | else if (!trivial_substrings.empty()) |
240 | { |
241 | required_substring = trivial_substrings.front().first; |
242 | required_substring_is_prefix = trivial_substrings.front().second == 0; |
243 | } |
244 | |
245 | /* std::cerr |
246 | << "regexp: " << regexp |
247 | << ", is_trivial: " << is_trivial |
248 | << ", required_substring: " << required_substring |
249 | << ", required_substring_is_prefix: " << required_substring_is_prefix |
250 | << std::endl;*/ |
251 | } |
252 | |
253 | |
254 | template <bool thread_safe> |
255 | OptimizedRegularExpressionImpl<thread_safe>::OptimizedRegularExpressionImpl(const std::string & regexp_, int options) |
256 | { |
257 | analyze(regexp_, required_substring, is_trivial, required_substring_is_prefix); |
258 | |
259 | /// Just three following options are supported |
260 | if (options & (~(RE_CASELESS | RE_NO_CAPTURE | RE_DOT_NL))) |
261 | throw DB::Exception("OptimizedRegularExpression: Unsupported option." , DB::ErrorCodes::CANNOT_COMPILE_REGEXP); |
262 | |
263 | is_case_insensitive = options & RE_CASELESS; |
264 | bool is_no_capture = options & RE_NO_CAPTURE; |
265 | bool is_dot_nl = options & RE_DOT_NL; |
266 | |
267 | number_of_subpatterns = 0; |
268 | if (!is_trivial) |
269 | { |
270 | /// Compile the re2 regular expression. |
271 | typename RegexType::Options regexp_options; |
272 | |
273 | /// Never write error messages to stderr. It's ignorant to do it from library code. |
274 | regexp_options.set_log_errors(false); |
275 | |
276 | if (is_case_insensitive) |
277 | regexp_options.set_case_sensitive(false); |
278 | |
279 | if (is_dot_nl) |
280 | regexp_options.set_dot_nl(true); |
281 | |
282 | re2 = std::make_unique<RegexType>(regexp_, regexp_options); |
283 | if (!re2->ok()) |
284 | throw DB::Exception("OptimizedRegularExpression: cannot compile re2: " + regexp_ + ", error: " + re2->error() + ". Look at https://github.com/google/re2/wiki/Syntax for reference." , DB::ErrorCodes::CANNOT_COMPILE_REGEXP); |
285 | |
286 | if (!is_no_capture) |
287 | { |
288 | number_of_subpatterns = re2->NumberOfCapturingGroups(); |
289 | if (number_of_subpatterns > MAX_SUBPATTERNS) |
290 | throw DB::Exception("OptimizedRegularExpression: too many subpatterns in regexp: " + regexp_, DB::ErrorCodes::CANNOT_COMPILE_REGEXP); |
291 | } |
292 | } |
293 | } |
294 | |
295 | |
296 | template <bool thread_safe> |
297 | bool OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject, size_t subject_size) const |
298 | { |
299 | if (is_trivial) |
300 | { |
301 | if (is_case_insensitive) |
302 | return nullptr != strcasestr(subject, required_substring.data()); |
303 | else |
304 | return nullptr != strstr(subject, required_substring.data()); |
305 | } |
306 | else |
307 | { |
308 | if (!required_substring.empty()) |
309 | { |
310 | const char * pos; |
311 | if (is_case_insensitive) |
312 | pos = strcasestr(subject, required_substring.data()); |
313 | else |
314 | pos = strstr(subject, required_substring.data()); |
315 | |
316 | if (nullptr == pos) |
317 | return 0; |
318 | } |
319 | |
320 | return re2->Match(StringPieceType(subject, subject_size), 0, subject_size, RegexType::UNANCHORED, nullptr, 0); |
321 | } |
322 | } |
323 | |
324 | |
325 | template <bool thread_safe> |
326 | bool OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject, size_t subject_size, Match & match) const |
327 | { |
328 | if (is_trivial) |
329 | { |
330 | const char * pos; |
331 | if (is_case_insensitive) |
332 | pos = strcasestr(subject, required_substring.data()); |
333 | else |
334 | pos = strstr(subject, required_substring.data()); |
335 | |
336 | if (pos == nullptr) |
337 | return 0; |
338 | else |
339 | { |
340 | match.offset = pos - subject; |
341 | match.length = required_substring.size(); |
342 | return 1; |
343 | } |
344 | } |
345 | else |
346 | { |
347 | if (!required_substring.empty()) |
348 | { |
349 | const char * pos; |
350 | if (is_case_insensitive) |
351 | pos = strcasestr(subject, required_substring.data()); |
352 | else |
353 | pos = strstr(subject, required_substring.data()); |
354 | |
355 | if (nullptr == pos) |
356 | return 0; |
357 | } |
358 | |
359 | StringPieceType piece; |
360 | |
361 | if (!RegexType::PartialMatch(StringPieceType(subject, subject_size), *re2, &piece)) |
362 | return 0; |
363 | else |
364 | { |
365 | match.offset = piece.data() - subject; |
366 | match.length = piece.length(); |
367 | return 1; |
368 | } |
369 | } |
370 | } |
371 | |
372 | |
373 | template <bool thread_safe> |
374 | unsigned OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject, size_t subject_size, MatchVec & matches, unsigned limit) const |
375 | { |
376 | matches.clear(); |
377 | |
378 | if (limit == 0) |
379 | return 0; |
380 | |
381 | if (limit > number_of_subpatterns + 1) |
382 | limit = number_of_subpatterns + 1; |
383 | |
384 | if (is_trivial) |
385 | { |
386 | const char * pos; |
387 | if (is_case_insensitive) |
388 | pos = strcasestr(subject, required_substring.data()); |
389 | else |
390 | pos = strstr(subject, required_substring.data()); |
391 | |
392 | if (pos == nullptr) |
393 | return 0; |
394 | else |
395 | { |
396 | Match match; |
397 | match.offset = pos - subject; |
398 | match.length = required_substring.size(); |
399 | matches.push_back(match); |
400 | return 1; |
401 | } |
402 | } |
403 | else |
404 | { |
405 | if (!required_substring.empty()) |
406 | { |
407 | const char * pos; |
408 | if (is_case_insensitive) |
409 | pos = strcasestr(subject, required_substring.data()); |
410 | else |
411 | pos = strstr(subject, required_substring.data()); |
412 | |
413 | if (nullptr == pos) |
414 | return 0; |
415 | } |
416 | |
417 | DB::PODArrayWithStackMemory<StringPieceType, sizeof(StringPieceType) * (MAX_SUBPATTERNS+1)> pieces(limit); |
418 | |
419 | if (!re2->Match(StringPieceType(subject, subject_size), 0, subject_size, RegexType::UNANCHORED, pieces.data(), pieces.size())) |
420 | return 0; |
421 | else |
422 | { |
423 | matches.resize(limit); |
424 | for (size_t i = 0; i < limit; ++i) |
425 | { |
426 | if (pieces[i] != nullptr) |
427 | { |
428 | matches[i].offset = pieces[i].data() - subject; |
429 | matches[i].length = pieces[i].length(); |
430 | } |
431 | else |
432 | { |
433 | matches[i].offset = std::string::npos; |
434 | matches[i].length = 0; |
435 | } |
436 | } |
437 | return limit; |
438 | } |
439 | } |
440 | } |
441 | |
442 | template class OptimizedRegularExpressionImpl<true>; |
443 | template class OptimizedRegularExpressionImpl<false>; |
444 | |