1 | #pragma once |
2 | |
3 | #include <map> |
4 | #include <memory> |
5 | #include <mutex> |
6 | #include <optional> |
7 | #include <string> |
8 | #include <utility> |
9 | #include <vector> |
10 | #include <Functions/likePatternToRegexp.h> |
11 | #include <Common/Exception.h> |
12 | #include <Common/ObjectPool.h> |
13 | #include <Common/OptimizedRegularExpression.h> |
14 | #include <Common/ProfileEvents.h> |
15 | #include <common/StringRef.h> |
16 | |
17 | |
18 | #include "config_functions.h" |
19 | #if USE_HYPERSCAN |
20 | # if __has_include(<hs/hs.h>) |
21 | # include <hs/hs.h> |
22 | # else |
23 | # include <hs.h> |
24 | # endif |
25 | #endif |
26 | |
27 | namespace ProfileEvents |
28 | { |
29 | extern const Event RegexpCreated; |
30 | } |
31 | |
32 | |
33 | namespace DB |
34 | { |
35 | namespace ErrorCodes |
36 | { |
37 | extern const int CANNOT_ALLOCATE_MEMORY; |
38 | extern const int LOGICAL_ERROR; |
39 | } |
40 | |
41 | namespace Regexps |
42 | { |
43 | using Regexp = OptimizedRegularExpressionImpl<false>; |
44 | using Pool = ObjectPoolMap<Regexp, String>; |
45 | |
46 | template <bool like> |
47 | inline Regexp createRegexp(const std::string & pattern, int flags) |
48 | { |
49 | return {pattern, flags}; |
50 | } |
51 | |
52 | template <> |
53 | inline Regexp createRegexp<true>(const std::string & pattern, int flags) |
54 | { |
55 | return {likePatternToRegexp(pattern), flags}; |
56 | } |
57 | |
58 | template <bool like, bool no_capture> |
59 | inline Pool::Pointer get(const std::string & pattern) |
60 | { |
61 | /// C++11 has thread-safe function-local statics on most modern compilers. |
62 | static Pool known_regexps; /// Different variables for different pattern parameters. |
63 | |
64 | return known_regexps.get(pattern, [&pattern] |
65 | { |
66 | int flags = OptimizedRegularExpression::RE_DOT_NL; |
67 | if (no_capture) |
68 | flags |= OptimizedRegularExpression::RE_NO_CAPTURE; |
69 | |
70 | ProfileEvents::increment(ProfileEvents::RegexpCreated); |
71 | return new Regexp{createRegexp<like>(pattern, flags)}; |
72 | }); |
73 | } |
74 | } |
75 | |
76 | #if USE_HYPERSCAN |
77 | |
78 | namespace MultiRegexps |
79 | { |
80 | template <typename Deleter, Deleter deleter> |
81 | struct HyperscanDeleter |
82 | { |
83 | template <typename T> |
84 | void operator()(T * ptr) const |
85 | { |
86 | deleter(ptr); |
87 | } |
88 | }; |
89 | |
90 | /// Helper unique pointers to correctly delete the allocated space when hyperscan cannot compile something and we throw an exception. |
91 | using CompilerError = std::unique_ptr<hs_compile_error_t, HyperscanDeleter<decltype(&hs_free_compile_error), &hs_free_compile_error>>; |
92 | using ScratchPtr = std::unique_ptr<hs_scratch_t, HyperscanDeleter<decltype(&hs_free_scratch), &hs_free_scratch>>; |
93 | using DataBasePtr = std::unique_ptr<hs_database_t, HyperscanDeleter<decltype(&hs_free_database), &hs_free_database>>; |
94 | |
95 | /// Database is thread safe across multiple threads and Scratch is not but we can copy it whenever we use it in the searcher. |
96 | class Regexps |
97 | { |
98 | public: |
99 | Regexps(hs_database_t * db_, hs_scratch_t * scratch_) : db{db_}, scratch{scratch_} { } |
100 | |
101 | hs_database_t * getDB() const { return db.get(); } |
102 | hs_scratch_t * getScratch() const { return scratch.get(); } |
103 | |
104 | private: |
105 | DataBasePtr db; |
106 | ScratchPtr scratch; |
107 | }; |
108 | |
109 | struct Pool |
110 | { |
111 | /// Mutex for finding in map. |
112 | std::mutex mutex; |
113 | /// Patterns + possible edit_distance to database and scratch. |
114 | std::map<std::pair<std::vector<String>, std::optional<UInt32>>, Regexps> storage; |
115 | }; |
116 | |
117 | template <bool save_indices, bool CompileForEditDistance> |
118 | inline Regexps constructRegexps(const std::vector<String> & str_patterns, std::optional<UInt32> edit_distance) |
119 | { |
120 | (void)edit_distance; |
121 | /// Common pointers |
122 | std::vector<const char *> patterns; |
123 | std::vector<unsigned int> flags; |
124 | |
125 | /// Pointer for external edit distance compilation |
126 | std::vector<hs_expr_ext> ext_exprs; |
127 | std::vector<const hs_expr_ext *> ext_exprs_ptrs; |
128 | |
129 | patterns.reserve(str_patterns.size()); |
130 | flags.reserve(str_patterns.size()); |
131 | |
132 | if constexpr (CompileForEditDistance) |
133 | { |
134 | ext_exprs.reserve(str_patterns.size()); |
135 | ext_exprs_ptrs.reserve(str_patterns.size()); |
136 | } |
137 | |
138 | for (const StringRef ref : str_patterns) |
139 | { |
140 | patterns.push_back(ref.data); |
141 | /* Flags below are the pattern matching flags. |
142 | * HS_FLAG_DOTALL is a compile flag where matching a . will not exclude newlines. This is a good |
143 | * performance practice accrording to Hyperscan API. https://intel.github.io/hyperscan/dev-reference/performance.html#dot-all-mode |
144 | * HS_FLAG_ALLOWEMPTY is a compile flag where empty strings are allowed to match. |
145 | * HS_FLAG_UTF8 is a flag where UTF8 literals are matched. |
146 | * HS_FLAG_SINGLEMATCH is a compile flag where each pattern match will be returned only once. it is a good performance practice |
147 | * as it is said in the Hyperscan documentation. https://intel.github.io/hyperscan/dev-reference/performance.html#single-match-flag |
148 | */ |
149 | flags.push_back(HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8); |
150 | if constexpr (CompileForEditDistance) |
151 | { |
152 | /// Hyperscan currently does not support UTF8 matching with edit distance. |
153 | flags.back() &= ~HS_FLAG_UTF8; |
154 | ext_exprs.emplace_back(); |
155 | /// HS_EXT_FLAG_EDIT_DISTANCE is a compile flag responsible for Levenstein distance. |
156 | ext_exprs.back().flags = HS_EXT_FLAG_EDIT_DISTANCE; |
157 | ext_exprs.back().edit_distance = edit_distance.value(); |
158 | ext_exprs_ptrs.push_back(&ext_exprs.back()); |
159 | } |
160 | } |
161 | hs_database_t * db = nullptr; |
162 | hs_compile_error_t * compile_error; |
163 | |
164 | |
165 | std::unique_ptr<unsigned int[]> ids; |
166 | |
167 | /// We mark the patterns to provide the callback results. |
168 | if constexpr (save_indices) |
169 | { |
170 | ids.reset(new unsigned int[patterns.size()]); |
171 | for (size_t i = 0; i < patterns.size(); ++i) |
172 | ids[i] = i + 1; |
173 | } |
174 | |
175 | hs_error_t err; |
176 | if constexpr (!CompileForEditDistance) |
177 | err = hs_compile_multi( |
178 | patterns.data(), |
179 | flags.data(), |
180 | ids.get(), |
181 | patterns.size(), |
182 | HS_MODE_BLOCK, |
183 | nullptr, |
184 | &db, |
185 | &compile_error); |
186 | else |
187 | err = hs_compile_ext_multi( |
188 | patterns.data(), |
189 | flags.data(), |
190 | ids.get(), |
191 | ext_exprs_ptrs.data(), |
192 | patterns.size(), |
193 | HS_MODE_BLOCK, |
194 | nullptr, |
195 | &db, |
196 | &compile_error); |
197 | |
198 | if (err != HS_SUCCESS) |
199 | { |
200 | /// CompilerError is a unique_ptr, so correct memory free after the exception is thrown. |
201 | CompilerError error(compile_error); |
202 | |
203 | if (error->expression < 0) |
204 | throw Exception(String(error->message), ErrorCodes::LOGICAL_ERROR); |
205 | else |
206 | throw Exception( |
207 | "Pattern '" + str_patterns[error->expression] + "' failed with error '" + String(error->message), |
208 | ErrorCodes::LOGICAL_ERROR); |
209 | } |
210 | |
211 | ProfileEvents::increment(ProfileEvents::RegexpCreated); |
212 | |
213 | /// We allocate the scratch space only once, then copy it across multiple threads with hs_clone_scratch |
214 | /// function which is faster than allocating scratch space each time in each thread. |
215 | hs_scratch_t * scratch = nullptr; |
216 | err = hs_alloc_scratch(db, &scratch); |
217 | |
218 | /// If not HS_SUCCESS, it is guaranteed that the memory would not be allocated for scratch. |
219 | if (err != HS_SUCCESS) |
220 | throw Exception("Could not allocate scratch space for hyperscan" , ErrorCodes::CANNOT_ALLOCATE_MEMORY); |
221 | |
222 | return Regexps{db, scratch}; |
223 | } |
224 | |
225 | /// If CompileForEditDistance is False, edit_distance must be nullopt |
226 | /// Also, we use templates here because each instantiation of function |
227 | /// template has its own copy of local static variables which must not be the same |
228 | /// for different hyperscan compilations. |
229 | template <bool save_indices, bool CompileForEditDistance> |
230 | inline Regexps * get(const std::vector<StringRef> & patterns, std::optional<UInt32> edit_distance) |
231 | { |
232 | /// C++11 has thread-safe function-local statics on most modern compilers. |
233 | static Pool known_regexps; /// Different variables for different pattern parameters. |
234 | |
235 | std::vector<String> str_patterns; |
236 | str_patterns.reserve(patterns.size()); |
237 | for (const StringRef & ref : patterns) |
238 | str_patterns.push_back(ref.toString()); |
239 | |
240 | /// Get the lock for finding database. |
241 | std::unique_lock lock(known_regexps.mutex); |
242 | |
243 | auto it = known_regexps.storage.find({str_patterns, edit_distance}); |
244 | |
245 | /// If not found, compile and let other threads wait. |
246 | if (known_regexps.storage.end() == it) |
247 | it = known_regexps.storage |
248 | .emplace( |
249 | std::pair{str_patterns, edit_distance}, |
250 | constructRegexps<save_indices, CompileForEditDistance>(str_patterns, edit_distance)) |
251 | .first; |
252 | /// If found, unlock and return the database. |
253 | lock.unlock(); |
254 | |
255 | return &it->second; |
256 | } |
257 | } |
258 | |
259 | #endif // USE_HYPERSCAN |
260 | |
261 | } |
262 | |