1#pragma once
2
3#include <string>
4#include <vector>
5#include <memory>
6#include <Common/config.h>
7#include <re2/re2.h>
8#if USE_RE2_ST
9 #include <re2_st/re2.h>
10#else
11 #define re2_st re2
12#endif
13
14
/** Uses two ways to optimize a regular expression, falling back to the re2 engine otherwise:
 * 1. If the regular expression is trivial (reduces to finding a substring in a string),
 *    then the search is replaced with strstr or strcasestr.
 * 2. If the regular expression contains a non-alternative substring of sufficient length,
 *    then strstr or strcasestr for that substring is used as a pre-filter;
 *    the regular expression is only fully checked if the substring is found.
 * 3. In other cases, the re2 engine is used.
22 *
23 * This makes sense, since strstr and strcasestr in libc for Linux are well optimized.
24 *
 * Suitable if the following conditions are simultaneously met:
 * - in most calls, the regular expression does not match;
 * - the regular expression is compatible with the re2 engine.
 *
 * Use at your own risk: probably not all cases are taken into account.
29 *
30 * NOTE: Multi-character metasymbols such as \Pl are handled incorrectly.
31 */
32
namespace OptimizedRegularExpressionDetails
{
    /// A matched range described by where it starts and how long it is.
    /// NOTE(review): the offset is presumably relative to the beginning of the subject — confirm in the implementation.
    struct Match
    {
        /// Start of the range.
        std::string::size_type offset;

        /// Size of the range.
        std::string::size_type length;
    };
}
41
42template <bool thread_safe>
43class OptimizedRegularExpressionImpl
44{
45public:
46 enum Options
47 {
48 RE_CASELESS = 0x00000001,
49 RE_NO_CAPTURE = 0x00000010,
50 RE_DOT_NL = 0x00000100
51 };
52
53 using Match = OptimizedRegularExpressionDetails::Match;
54 using MatchVec = std::vector<Match>;
55
56 using RegexType = std::conditional_t<thread_safe, re2::RE2, re2_st::RE2>;
57 using StringPieceType = std::conditional_t<thread_safe, re2::StringPiece, re2_st::StringPiece>;
58
59 OptimizedRegularExpressionImpl(const std::string & regexp_, int options = 0);
60
61 bool match(const std::string & subject) const
62 {
63 return match(subject.data(), subject.size());
64 }
65
66 bool match(const std::string & subject, Match & match_) const
67 {
68 return match(subject.data(), subject.size(), match_);
69 }
70
71 unsigned match(const std::string & subject, MatchVec & matches) const
72 {
73 return match(subject.data(), subject.size(), matches);
74 }
75
76 unsigned match(const char * subject, size_t subject_size, MatchVec & matches) const
77 {
78 return match(subject, subject_size, matches, number_of_subpatterns + 1);
79 }
80
81 bool match(const char * subject, size_t subject_size) const;
82 bool match(const char * subject, size_t subject_size, Match & match) const;
83 unsigned match(const char * subject, size_t subject_size, MatchVec & matches, unsigned limit) const;
84
85 unsigned getNumberOfSubpatterns() const { return number_of_subpatterns; }
86
87 /// Get the regexp re2 or nullptr if the pattern is trivial (for output to the log).
88 const std::unique_ptr<RegexType> & getRE2() const { return re2; }
89
90 static void analyze(const std::string & regexp_, std::string & required_substring, bool & is_trivial, bool & required_substring_is_prefix);
91
92 void getAnalyzeResult(std::string & out_required_substring, bool & out_is_trivial, bool & out_required_substring_is_prefix) const
93 {
94 out_required_substring = required_substring;
95 out_is_trivial = is_trivial;
96 out_required_substring_is_prefix = required_substring_is_prefix;
97 }
98
99private:
100 bool is_trivial;
101 bool required_substring_is_prefix;
102 bool is_case_insensitive;
103 std::string required_substring;
104 std::unique_ptr<RegexType> re2;
105 unsigned number_of_subpatterns;
106};
107
108using OptimizedRegularExpression = OptimizedRegularExpressionImpl<true>;
109