| 1 | #pragma once |
| 2 | |
| 3 | #include <string> |
| 4 | #include <vector> |
| 5 | #include <memory> |
| 6 | #include <Common/config.h> |
| 7 | #include <re2/re2.h> |
| 8 | #if USE_RE2_ST |
| 9 | #include <re2_st/re2.h> |
| 10 | #else |
| 11 | #define re2_st re2 |
| 12 | #endif |
| 13 | |
| 14 | |
/** Uses two ways to optimize a regular expression, and falls back to re2 otherwise:
  * 1. If the regular expression is trivial (reduces to finding a substring in a string),
  *    then the search is replaced with strstr or strcasestr.
  * 2. If the regular expression contains a non-alternative substring of sufficient length,
  *    then strstr or strcasestr is first used to look for that substring;
  *    the regular expression is fully checked only if the substring is found.
  * 3. In other cases, the re2 engine is used.
| 22 | * |
| 23 | * This makes sense, since strstr and strcasestr in libc for Linux are well optimized. |
| 24 | * |
  * Suitable if the following conditions are simultaneously met:
  * - in most calls, the regular expression does not match;
  * - the regular expression is compatible with the re2 engine;
  * - you accept the risk that, probably, not all cases are taken into account.
| 29 | * |
| 30 | * NOTE: Multi-character metasymbols such as \Pl are handled incorrectly. |
| 31 | */ |
| 32 | |
namespace OptimizedRegularExpressionDetails
{
    /// Plain aggregate describing one matched region as a pair of sizes.
    /// Kept outside the class template so the type does not depend on its template argument.
    struct Match
    {
        /// Presumably the position of the first matched character within the subject - TODO confirm.
        std::string::size_type offset;

        /// Presumably the number of matched characters - TODO confirm.
        std::string::size_type length;
    };
}
| 41 | |
| 42 | template <bool thread_safe> |
| 43 | class OptimizedRegularExpressionImpl |
| 44 | { |
| 45 | public: |
| 46 | enum Options |
| 47 | { |
| 48 | RE_CASELESS = 0x00000001, |
| 49 | RE_NO_CAPTURE = 0x00000010, |
| 50 | RE_DOT_NL = 0x00000100 |
| 51 | }; |
| 52 | |
| 53 | using Match = OptimizedRegularExpressionDetails::Match; |
| 54 | using MatchVec = std::vector<Match>; |
| 55 | |
| 56 | using RegexType = std::conditional_t<thread_safe, re2::RE2, re2_st::RE2>; |
| 57 | using StringPieceType = std::conditional_t<thread_safe, re2::StringPiece, re2_st::StringPiece>; |
| 58 | |
| 59 | OptimizedRegularExpressionImpl(const std::string & regexp_, int options = 0); |
| 60 | |
| 61 | bool match(const std::string & subject) const |
| 62 | { |
| 63 | return match(subject.data(), subject.size()); |
| 64 | } |
| 65 | |
| 66 | bool match(const std::string & subject, Match & match_) const |
| 67 | { |
| 68 | return match(subject.data(), subject.size(), match_); |
| 69 | } |
| 70 | |
| 71 | unsigned match(const std::string & subject, MatchVec & matches) const |
| 72 | { |
| 73 | return match(subject.data(), subject.size(), matches); |
| 74 | } |
| 75 | |
| 76 | unsigned match(const char * subject, size_t subject_size, MatchVec & matches) const |
| 77 | { |
| 78 | return match(subject, subject_size, matches, number_of_subpatterns + 1); |
| 79 | } |
| 80 | |
| 81 | bool match(const char * subject, size_t subject_size) const; |
| 82 | bool match(const char * subject, size_t subject_size, Match & match) const; |
| 83 | unsigned match(const char * subject, size_t subject_size, MatchVec & matches, unsigned limit) const; |
| 84 | |
| 85 | unsigned getNumberOfSubpatterns() const { return number_of_subpatterns; } |
| 86 | |
| 87 | /// Get the regexp re2 or nullptr if the pattern is trivial (for output to the log). |
| 88 | const std::unique_ptr<RegexType> & getRE2() const { return re2; } |
| 89 | |
| 90 | static void analyze(const std::string & regexp_, std::string & required_substring, bool & is_trivial, bool & required_substring_is_prefix); |
| 91 | |
| 92 | void getAnalyzeResult(std::string & out_required_substring, bool & out_is_trivial, bool & out_required_substring_is_prefix) const |
| 93 | { |
| 94 | out_required_substring = required_substring; |
| 95 | out_is_trivial = is_trivial; |
| 96 | out_required_substring_is_prefix = required_substring_is_prefix; |
| 97 | } |
| 98 | |
| 99 | private: |
| 100 | bool is_trivial; |
| 101 | bool required_substring_is_prefix; |
| 102 | bool is_case_insensitive; |
| 103 | std::string required_substring; |
| 104 | std::unique_ptr<RegexType> re2; |
| 105 | unsigned number_of_subpatterns; |
| 106 | }; |
| 107 | |
/// Shorthand for the thread_safe = true instantiation, whose engine type is re2::RE2.
using OptimizedRegularExpression = OptimizedRegularExpressionImpl<true>;
| 109 | |