1 | #pragma once |
2 | |
3 | #include <string> |
4 | #include <vector> |
5 | #include <memory> |
6 | #include <Common/config.h> |
7 | #include <re2/re2.h> |
8 | #if USE_RE2_ST |
9 | #include <re2_st/re2.h> |
10 | #else |
11 | #define re2_st re2 |
12 | #endif |
13 | |
14 | |
/** Uses two optimizations on top of the re2 engine:
  * 1. If the regular expression is trivial (reduces to finding a substring in a string),
  *    the search is replaced with strstr or strcasestr.
  * 2. If the regular expression contains a non-alternative substring of sufficient length
  *    (a substring that is required regardless of which alternative matches),
  *    strstr or strcasestr is first used to look for that substring;
  *    the regular expression is fully checked only if the substring is found.
  * In all other cases, the re2 engine is used directly.
22 | * |
23 | * This makes sense, since strstr and strcasestr in libc for Linux are well optimized. |
24 | * |
  * Suitable if the following conditions are simultaneously met:
  * - in most calls, the regular expression does not match;
  * - the regular expression is compatible with the re2 engine.
  *
  * Use at your own risk: probably not all cases are taken into account.
29 | * |
30 | * NOTE: Multi-character metasymbols such as \Pl are handled incorrectly. |
31 | */ |
32 | |
namespace OptimizedRegularExpressionDetails
{
    /// A matched range, described as a position plus a size.
    /// Both fields are zero-initialized so that a default-constructed Match
    /// never exposes indeterminate values. The struct remains an aggregate,
    /// so `Match{offset, length}` keeps working.
    struct Match
    {
        /// Start of the range; presumably counted from the beginning of the subject - confirm against the matcher.
        std::string::size_type offset = 0;
        /// Size of the range, in the same units as `offset`.
        std::string::size_type length = 0;
    };
}
41 | |
42 | template <bool thread_safe> |
43 | class OptimizedRegularExpressionImpl |
44 | { |
45 | public: |
46 | enum Options |
47 | { |
48 | RE_CASELESS = 0x00000001, |
49 | RE_NO_CAPTURE = 0x00000010, |
50 | RE_DOT_NL = 0x00000100 |
51 | }; |
52 | |
53 | using Match = OptimizedRegularExpressionDetails::Match; |
54 | using MatchVec = std::vector<Match>; |
55 | |
56 | using RegexType = std::conditional_t<thread_safe, re2::RE2, re2_st::RE2>; |
57 | using StringPieceType = std::conditional_t<thread_safe, re2::StringPiece, re2_st::StringPiece>; |
58 | |
59 | OptimizedRegularExpressionImpl(const std::string & regexp_, int options = 0); |
60 | |
61 | bool match(const std::string & subject) const |
62 | { |
63 | return match(subject.data(), subject.size()); |
64 | } |
65 | |
66 | bool match(const std::string & subject, Match & match_) const |
67 | { |
68 | return match(subject.data(), subject.size(), match_); |
69 | } |
70 | |
71 | unsigned match(const std::string & subject, MatchVec & matches) const |
72 | { |
73 | return match(subject.data(), subject.size(), matches); |
74 | } |
75 | |
76 | unsigned match(const char * subject, size_t subject_size, MatchVec & matches) const |
77 | { |
78 | return match(subject, subject_size, matches, number_of_subpatterns + 1); |
79 | } |
80 | |
81 | bool match(const char * subject, size_t subject_size) const; |
82 | bool match(const char * subject, size_t subject_size, Match & match) const; |
83 | unsigned match(const char * subject, size_t subject_size, MatchVec & matches, unsigned limit) const; |
84 | |
85 | unsigned getNumberOfSubpatterns() const { return number_of_subpatterns; } |
86 | |
87 | /// Get the regexp re2 or nullptr if the pattern is trivial (for output to the log). |
88 | const std::unique_ptr<RegexType> & getRE2() const { return re2; } |
89 | |
90 | static void analyze(const std::string & regexp_, std::string & required_substring, bool & is_trivial, bool & required_substring_is_prefix); |
91 | |
92 | void getAnalyzeResult(std::string & out_required_substring, bool & out_is_trivial, bool & out_required_substring_is_prefix) const |
93 | { |
94 | out_required_substring = required_substring; |
95 | out_is_trivial = is_trivial; |
96 | out_required_substring_is_prefix = required_substring_is_prefix; |
97 | } |
98 | |
99 | private: |
100 | bool is_trivial; |
101 | bool required_substring_is_prefix; |
102 | bool is_case_insensitive; |
103 | std::string required_substring; |
104 | std::unique_ptr<RegexType> re2; |
105 | unsigned number_of_subpatterns; |
106 | }; |
107 | |
108 | using OptimizedRegularExpression = OptimizedRegularExpressionImpl<true>; |
109 | |