1#pragma once
2
3#include <map>
4#include <memory>
5#include <mutex>
6#include <optional>
7#include <string>
8#include <utility>
9#include <vector>
10#include <Functions/likePatternToRegexp.h>
11#include <Common/Exception.h>
12#include <Common/ObjectPool.h>
13#include <Common/OptimizedRegularExpression.h>
14#include <Common/ProfileEvents.h>
15#include <common/StringRef.h>
16
17
18#include "config_functions.h"
19#if USE_HYPERSCAN
20# if __has_include(<hs/hs.h>)
21# include <hs/hs.h>
22# else
23# include <hs.h>
24# endif
25#endif
26
27namespace ProfileEvents
28{
29extern const Event RegexpCreated;
30}
31
32
33namespace DB
34{
35namespace ErrorCodes
36{
37 extern const int CANNOT_ALLOCATE_MEMORY;
38 extern const int LOGICAL_ERROR;
39}
40
41namespace Regexps
42{
43 using Regexp = OptimizedRegularExpressionImpl<false>;
44 using Pool = ObjectPoolMap<Regexp, String>;
45
46 template <bool like>
47 inline Regexp createRegexp(const std::string & pattern, int flags)
48 {
49 return {pattern, flags};
50 }
51
52 template <>
53 inline Regexp createRegexp<true>(const std::string & pattern, int flags)
54 {
55 return {likePatternToRegexp(pattern), flags};
56 }
57
58 template <bool like, bool no_capture>
59 inline Pool::Pointer get(const std::string & pattern)
60 {
61 /// C++11 has thread-safe function-local statics on most modern compilers.
62 static Pool known_regexps; /// Different variables for different pattern parameters.
63
64 return known_regexps.get(pattern, [&pattern]
65 {
66 int flags = OptimizedRegularExpression::RE_DOT_NL;
67 if (no_capture)
68 flags |= OptimizedRegularExpression::RE_NO_CAPTURE;
69
70 ProfileEvents::increment(ProfileEvents::RegexpCreated);
71 return new Regexp{createRegexp<like>(pattern, flags)};
72 });
73 }
74}
75
76#if USE_HYPERSCAN
77
78namespace MultiRegexps
79{
80 template <typename Deleter, Deleter deleter>
81 struct HyperscanDeleter
82 {
83 template <typename T>
84 void operator()(T * ptr) const
85 {
86 deleter(ptr);
87 }
88 };
89
90 /// Helper unique pointers to correctly delete the allocated space when hyperscan cannot compile something and we throw an exception.
91 using CompilerError = std::unique_ptr<hs_compile_error_t, HyperscanDeleter<decltype(&hs_free_compile_error), &hs_free_compile_error>>;
92 using ScratchPtr = std::unique_ptr<hs_scratch_t, HyperscanDeleter<decltype(&hs_free_scratch), &hs_free_scratch>>;
93 using DataBasePtr = std::unique_ptr<hs_database_t, HyperscanDeleter<decltype(&hs_free_database), &hs_free_database>>;
94
95 /// Database is thread safe across multiple threads and Scratch is not but we can copy it whenever we use it in the searcher.
96 class Regexps
97 {
98 public:
99 Regexps(hs_database_t * db_, hs_scratch_t * scratch_) : db{db_}, scratch{scratch_} { }
100
101 hs_database_t * getDB() const { return db.get(); }
102 hs_scratch_t * getScratch() const { return scratch.get(); }
103
104 private:
105 DataBasePtr db;
106 ScratchPtr scratch;
107 };
108
109 struct Pool
110 {
111 /// Mutex for finding in map.
112 std::mutex mutex;
113 /// Patterns + possible edit_distance to database and scratch.
114 std::map<std::pair<std::vector<String>, std::optional<UInt32>>, Regexps> storage;
115 };
116
117 template <bool save_indices, bool CompileForEditDistance>
118 inline Regexps constructRegexps(const std::vector<String> & str_patterns, std::optional<UInt32> edit_distance)
119 {
120 (void)edit_distance;
121 /// Common pointers
122 std::vector<const char *> patterns;
123 std::vector<unsigned int> flags;
124
125 /// Pointer for external edit distance compilation
126 std::vector<hs_expr_ext> ext_exprs;
127 std::vector<const hs_expr_ext *> ext_exprs_ptrs;
128
129 patterns.reserve(str_patterns.size());
130 flags.reserve(str_patterns.size());
131
132 if constexpr (CompileForEditDistance)
133 {
134 ext_exprs.reserve(str_patterns.size());
135 ext_exprs_ptrs.reserve(str_patterns.size());
136 }
137
138 for (const StringRef ref : str_patterns)
139 {
140 patterns.push_back(ref.data);
141 /* Flags below are the pattern matching flags.
142 * HS_FLAG_DOTALL is a compile flag where matching a . will not exclude newlines. This is a good
143 * performance practice accrording to Hyperscan API. https://intel.github.io/hyperscan/dev-reference/performance.html#dot-all-mode
144 * HS_FLAG_ALLOWEMPTY is a compile flag where empty strings are allowed to match.
145 * HS_FLAG_UTF8 is a flag where UTF8 literals are matched.
146 * HS_FLAG_SINGLEMATCH is a compile flag where each pattern match will be returned only once. it is a good performance practice
147 * as it is said in the Hyperscan documentation. https://intel.github.io/hyperscan/dev-reference/performance.html#single-match-flag
148 */
149 flags.push_back(HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8);
150 if constexpr (CompileForEditDistance)
151 {
152 /// Hyperscan currently does not support UTF8 matching with edit distance.
153 flags.back() &= ~HS_FLAG_UTF8;
154 ext_exprs.emplace_back();
155 /// HS_EXT_FLAG_EDIT_DISTANCE is a compile flag responsible for Levenstein distance.
156 ext_exprs.back().flags = HS_EXT_FLAG_EDIT_DISTANCE;
157 ext_exprs.back().edit_distance = edit_distance.value();
158 ext_exprs_ptrs.push_back(&ext_exprs.back());
159 }
160 }
161 hs_database_t * db = nullptr;
162 hs_compile_error_t * compile_error;
163
164
165 std::unique_ptr<unsigned int[]> ids;
166
167 /// We mark the patterns to provide the callback results.
168 if constexpr (save_indices)
169 {
170 ids.reset(new unsigned int[patterns.size()]);
171 for (size_t i = 0; i < patterns.size(); ++i)
172 ids[i] = i + 1;
173 }
174
175 hs_error_t err;
176 if constexpr (!CompileForEditDistance)
177 err = hs_compile_multi(
178 patterns.data(),
179 flags.data(),
180 ids.get(),
181 patterns.size(),
182 HS_MODE_BLOCK,
183 nullptr,
184 &db,
185 &compile_error);
186 else
187 err = hs_compile_ext_multi(
188 patterns.data(),
189 flags.data(),
190 ids.get(),
191 ext_exprs_ptrs.data(),
192 patterns.size(),
193 HS_MODE_BLOCK,
194 nullptr,
195 &db,
196 &compile_error);
197
198 if (err != HS_SUCCESS)
199 {
200 /// CompilerError is a unique_ptr, so correct memory free after the exception is thrown.
201 CompilerError error(compile_error);
202
203 if (error->expression < 0)
204 throw Exception(String(error->message), ErrorCodes::LOGICAL_ERROR);
205 else
206 throw Exception(
207 "Pattern '" + str_patterns[error->expression] + "' failed with error '" + String(error->message),
208 ErrorCodes::LOGICAL_ERROR);
209 }
210
211 ProfileEvents::increment(ProfileEvents::RegexpCreated);
212
213 /// We allocate the scratch space only once, then copy it across multiple threads with hs_clone_scratch
214 /// function which is faster than allocating scratch space each time in each thread.
215 hs_scratch_t * scratch = nullptr;
216 err = hs_alloc_scratch(db, &scratch);
217
218 /// If not HS_SUCCESS, it is guaranteed that the memory would not be allocated for scratch.
219 if (err != HS_SUCCESS)
220 throw Exception("Could not allocate scratch space for hyperscan", ErrorCodes::CANNOT_ALLOCATE_MEMORY);
221
222 return Regexps{db, scratch};
223 }
224
225 /// If CompileForEditDistance is False, edit_distance must be nullopt
226 /// Also, we use templates here because each instantiation of function
227 /// template has its own copy of local static variables which must not be the same
228 /// for different hyperscan compilations.
229 template <bool save_indices, bool CompileForEditDistance>
230 inline Regexps * get(const std::vector<StringRef> & patterns, std::optional<UInt32> edit_distance)
231 {
232 /// C++11 has thread-safe function-local statics on most modern compilers.
233 static Pool known_regexps; /// Different variables for different pattern parameters.
234
235 std::vector<String> str_patterns;
236 str_patterns.reserve(patterns.size());
237 for (const StringRef & ref : patterns)
238 str_patterns.push_back(ref.toString());
239
240 /// Get the lock for finding database.
241 std::unique_lock lock(known_regexps.mutex);
242
243 auto it = known_regexps.storage.find({str_patterns, edit_distance});
244
245 /// If not found, compile and let other threads wait.
246 if (known_regexps.storage.end() == it)
247 it = known_regexps.storage
248 .emplace(
249 std::pair{str_patterns, edit_distance},
250 constructRegexps<save_indices, CompileForEditDistance>(str_patterns, edit_distance))
251 .first;
252 /// If found, unlock and return the database.
253 lock.unlock();
254
255 return &it->second;
256 }
257}
258
259#endif // USE_HYPERSCAN
260
261}
262