1 | #include "FunctionsStringRegex.h" |
2 | #include "FunctionsStringSearch.h" |
3 | #include <Columns/ColumnFixedString.h> |
4 | #include <DataTypes/DataTypeFixedString.h> |
5 | #include <Functions/FunctionFactory.h> |
6 | #include <Functions/Regexps.h> |
7 | #include <IO/WriteHelpers.h> |
8 | #include <re2/re2.h> |
9 | #include <re2/stringpiece.h> |
10 | #include <Poco/UTF8String.h> |
11 | #include <Common/Volnitsky.h> |
12 | #include <algorithm> |
13 | #include <memory> |
14 | #include <optional> |
15 | #include <string> |
16 | #include <vector> |
17 | |
18 | #include "config_functions.h" |
19 | #if USE_HYPERSCAN |
20 | # if __has_include(<hs/hs.h>) |
21 | # include <hs/hs.h> |
22 | # else |
23 | # include <hs.h> |
24 | # endif |
25 | #endif |
26 | |
27 | #include <Common/config.h> |
28 | #if USE_RE2_ST |
29 | # include <re2_st/re2.h> |
30 | #else |
31 | # define re2_st re2 |
32 | #endif |
33 | |
34 | namespace DB |
35 | { |
36 | namespace ErrorCodes |
37 | { |
38 | extern const int BAD_ARGUMENTS; |
39 | extern const int ILLEGAL_COLUMN; |
40 | extern const int TOO_MANY_BYTES; |
41 | extern const int NOT_IMPLEMENTED; |
42 | extern const int HYPERSCAN_CANNOT_SCAN_TEXT; |
43 | } |
44 | |
45 | /// Is the LIKE expression reduced to finding a substring in a string? |
46 | inline bool likePatternIsStrstr(const String & pattern, String & res) |
47 | { |
48 | res = "" ; |
49 | |
50 | if (pattern.size() < 2 || pattern.front() != '%' || pattern.back() != '%') |
51 | return false; |
52 | |
53 | res.reserve(pattern.size() * 2); |
54 | |
55 | const char * pos = pattern.data(); |
56 | const char * end = pos + pattern.size(); |
57 | |
58 | ++pos; |
59 | --end; |
60 | |
61 | while (pos < end) |
62 | { |
63 | switch (*pos) |
64 | { |
65 | case '%': |
66 | case '_': |
67 | return false; |
68 | case '\\': |
69 | ++pos; |
70 | if (pos == end) |
71 | return false; |
72 | else |
73 | res += *pos; |
74 | break; |
75 | default: |
76 | res += *pos; |
77 | break; |
78 | } |
79 | ++pos; |
80 | } |
81 | |
82 | return true; |
83 | } |
84 | |
85 | /** 'like' - if true, treat pattern as SQL LIKE; if false - treat pattern as re2 regexp. |
86 | * NOTE: We want to run regexp search for whole block by one call (as implemented in function 'position') |
87 | * but for that, regexp engine must support \0 bytes and their interpretation as string boundaries. |
88 | */ |
89 | template <bool like, bool revert = false> |
90 | struct MatchImpl |
91 | { |
92 | using ResultType = UInt8; |
93 | |
94 | static void vector_constant( |
95 | const ColumnString::Chars & data, const ColumnString::Offsets & offsets, const std::string & pattern, PaddedPODArray<UInt8> & res) |
96 | { |
97 | if (offsets.empty()) |
98 | return; |
99 | |
100 | String strstr_pattern; |
101 | /// A simple case where the LIKE expression reduces to finding a substring in a string |
102 | if (like && likePatternIsStrstr(pattern, strstr_pattern)) |
103 | { |
104 | const UInt8 * begin = data.data(); |
105 | const UInt8 * pos = begin; |
106 | const UInt8 * end = pos + data.size(); |
107 | |
108 | /// The current index in the array of strings. |
109 | size_t i = 0; |
110 | |
111 | /// TODO You need to make that `searcher` is common to all the calls of the function. |
112 | Volnitsky searcher(strstr_pattern.data(), strstr_pattern.size(), end - pos); |
113 | |
114 | /// We will search for the next occurrence in all rows at once. |
115 | while (pos < end && end != (pos = searcher.search(pos, end - pos))) |
116 | { |
117 | /// Let's determine which index it refers to. |
118 | while (begin + offsets[i] <= pos) |
119 | { |
120 | res[i] = revert; |
121 | ++i; |
122 | } |
123 | |
124 | /// We check that the entry does not pass through the boundaries of strings. |
125 | if (pos + strstr_pattern.size() < begin + offsets[i]) |
126 | res[i] = !revert; |
127 | else |
128 | res[i] = revert; |
129 | |
130 | pos = begin + offsets[i]; |
131 | ++i; |
132 | } |
133 | |
134 | /// Tail, in which there can be no substring. |
135 | if (i < res.size()) |
136 | memset(&res[i], revert, (res.size() - i) * sizeof(res[0])); |
137 | } |
138 | else |
139 | { |
140 | size_t size = offsets.size(); |
141 | |
142 | const auto & regexp = Regexps::get<like, true>(pattern); |
143 | |
144 | std::string required_substring; |
145 | bool is_trivial; |
146 | bool required_substring_is_prefix; /// for `anchored` execution of the regexp. |
147 | |
148 | regexp->getAnalyzeResult(required_substring, is_trivial, required_substring_is_prefix); |
149 | |
150 | if (required_substring.empty()) |
151 | { |
152 | if (!regexp->getRE2()) /// An empty regexp. Always matches. |
153 | { |
154 | if (size) |
155 | memset(res.data(), 1, size * sizeof(res[0])); |
156 | } |
157 | else |
158 | { |
159 | size_t prev_offset = 0; |
160 | for (size_t i = 0; i < size; ++i) |
161 | { |
162 | res[i] = revert |
163 | ^ regexp->getRE2()->Match( |
164 | re2_st::StringPiece(reinterpret_cast<const char *>(&data[prev_offset]), offsets[i] - prev_offset - 1), |
165 | 0, |
166 | offsets[i] - prev_offset - 1, |
167 | re2_st::RE2::UNANCHORED, |
168 | nullptr, |
169 | 0); |
170 | |
171 | prev_offset = offsets[i]; |
172 | } |
173 | } |
174 | } |
175 | else |
176 | { |
177 | /// NOTE This almost matches with the case of LikePatternIsStrstr. |
178 | |
179 | const UInt8 * begin = data.data(); |
180 | const UInt8 * pos = begin; |
181 | const UInt8 * end = pos + data.size(); |
182 | |
183 | /// The current index in the array of strings. |
184 | size_t i = 0; |
185 | |
186 | Volnitsky searcher(required_substring.data(), required_substring.size(), end - pos); |
187 | |
188 | /// We will search for the next occurrence in all rows at once. |
189 | while (pos < end && end != (pos = searcher.search(pos, end - pos))) |
190 | { |
191 | /// Determine which index it refers to. |
192 | while (begin + offsets[i] <= pos) |
193 | { |
194 | res[i] = revert; |
195 | ++i; |
196 | } |
197 | |
198 | /// We check that the entry does not pass through the boundaries of strings. |
199 | if (pos + strstr_pattern.size() < begin + offsets[i]) |
200 | { |
201 | /// And if it does not, if necessary, we check the regexp. |
202 | |
203 | if (is_trivial) |
204 | res[i] = !revert; |
205 | else |
206 | { |
207 | const char * str_data = reinterpret_cast<const char *>(&data[offsets[i - 1]]); |
208 | size_t str_size = offsets[i] - offsets[i - 1] - 1; |
209 | |
210 | /** Even in the case of `required_substring_is_prefix` use UNANCHORED check for regexp, |
211 | * so that it can match when `required_substring` occurs into the string several times, |
212 | * and at the first occurrence, the regexp is not a match. |
213 | */ |
214 | |
215 | if (required_substring_is_prefix) |
216 | res[i] = revert |
217 | ^ regexp->getRE2()->Match( |
218 | re2_st::StringPiece(str_data, str_size), |
219 | reinterpret_cast<const char *>(pos) - str_data, |
220 | str_size, |
221 | re2_st::RE2::UNANCHORED, |
222 | nullptr, |
223 | 0); |
224 | else |
225 | res[i] = revert |
226 | ^ regexp->getRE2()->Match( |
227 | re2_st::StringPiece(str_data, str_size), 0, str_size, re2_st::RE2::UNANCHORED, nullptr, 0); |
228 | } |
229 | } |
230 | else |
231 | res[i] = revert; |
232 | |
233 | pos = begin + offsets[i]; |
234 | ++i; |
235 | } |
236 | |
237 | if (i < res.size()) |
238 | memset(&res[i], revert, (res.size() - i) * sizeof(res[0])); |
239 | } |
240 | } |
241 | } |
242 | |
243 | static void constant_constant(const std::string & data, const std::string & pattern, UInt8 & res) |
244 | { |
245 | const auto & regexp = Regexps::get<like, true>(pattern); |
246 | res = revert ^ regexp->match(data); |
247 | } |
248 | |
249 | template <typename... Args> |
250 | static void vector_vector(Args &&...) |
251 | { |
252 | throw Exception("Functions 'like' and 'match' don't support non-constant needle argument" , ErrorCodes::ILLEGAL_COLUMN); |
253 | } |
254 | |
255 | /// Search different needles in single haystack. |
256 | template <typename... Args> |
257 | static void constant_vector(Args &&...) |
258 | { |
259 | throw Exception("Functions 'like' and 'match' don't support non-constant needle argument" , ErrorCodes::ILLEGAL_COLUMN); |
260 | } |
261 | }; |
262 | |
263 | |
264 | template <typename Type, bool FindAny, bool FindAnyIndex, bool MultiSearchDistance> |
265 | struct MultiMatchAnyImpl |
266 | { |
267 | static_assert(static_cast<int>(FindAny) + static_cast<int>(FindAnyIndex) == 1); |
268 | using ResultType = Type; |
269 | static constexpr bool is_using_hyperscan = true; |
270 | /// Variable for understanding, if we used offsets for the output, most |
271 | /// likely to determine whether the function returns ColumnVector of ColumnArray. |
272 | static constexpr bool is_column_array = false; |
273 | static auto ReturnType() |
274 | { |
275 | return std::make_shared<DataTypeNumber<ResultType>>(); |
276 | } |
277 | |
278 | static void vector_constant( |
279 | const ColumnString::Chars & haystack_data, |
280 | const ColumnString::Offsets & haystack_offsets, |
281 | const std::vector<StringRef> & needles, |
282 | PaddedPODArray<Type> & res, |
283 | PaddedPODArray<UInt64> & offsets) |
284 | { |
285 | vector_constant(haystack_data, haystack_offsets, needles, res, offsets, std::nullopt); |
286 | } |
287 | |
288 | static void vector_constant( |
289 | const ColumnString::Chars & haystack_data, |
290 | const ColumnString::Offsets & haystack_offsets, |
291 | const std::vector<StringRef> & needles, |
292 | PaddedPODArray<Type> & res, |
293 | [[maybe_unused]] PaddedPODArray<UInt64> & offsets, |
294 | [[maybe_unused]] std::optional<UInt32> edit_distance) |
295 | { |
296 | (void)FindAny; |
297 | (void)FindAnyIndex; |
298 | res.resize(haystack_offsets.size()); |
299 | #if USE_HYPERSCAN |
300 | const auto & hyperscan_regex = MultiRegexps::get<FindAnyIndex, MultiSearchDistance>(needles, edit_distance); |
301 | hs_scratch_t * scratch = nullptr; |
302 | hs_error_t err = hs_clone_scratch(hyperscan_regex->getScratch(), &scratch); |
303 | |
304 | if (err != HS_SUCCESS) |
305 | throw Exception("Could not clone scratch space for hyperscan" , ErrorCodes::CANNOT_ALLOCATE_MEMORY); |
306 | |
307 | MultiRegexps::ScratchPtr smart_scratch(scratch); |
308 | |
309 | auto on_match = []([[maybe_unused]] unsigned int id, |
310 | unsigned long long /* from */, |
311 | unsigned long long /* to */, |
312 | unsigned int /* flags */, |
313 | void * context) -> int |
314 | { |
315 | if constexpr (FindAnyIndex) |
316 | *reinterpret_cast<Type *>(context) = id; |
317 | else if constexpr (FindAny) |
318 | *reinterpret_cast<Type *>(context) = 1; |
319 | /// Once we hit the callback, there is no need to search for others. |
320 | return 1; |
321 | }; |
322 | const size_t haystack_offsets_size = haystack_offsets.size(); |
323 | UInt64 offset = 0; |
324 | for (size_t i = 0; i < haystack_offsets_size; ++i) |
325 | { |
326 | UInt64 length = haystack_offsets[i] - offset - 1; |
327 | /// Hyperscan restriction. |
328 | if (length > std::numeric_limits<UInt32>::max()) |
329 | throw Exception("Too long string to search" , ErrorCodes::TOO_MANY_BYTES); |
330 | /// Zero the result, scan, check, update the offset. |
331 | res[i] = 0; |
332 | err = hs_scan( |
333 | hyperscan_regex->getDB(), |
334 | reinterpret_cast<const char *>(haystack_data.data()) + offset, |
335 | length, |
336 | 0, |
337 | smart_scratch.get(), |
338 | on_match, |
339 | &res[i]); |
340 | if (err != HS_SUCCESS && err != HS_SCAN_TERMINATED) |
341 | throw Exception("Failed to scan with hyperscan" , ErrorCodes::HYPERSCAN_CANNOT_SCAN_TEXT); |
342 | offset = haystack_offsets[i]; |
343 | } |
344 | #else |
345 | /// Fallback if do not use hyperscan |
346 | if constexpr (MultiSearchDistance) |
347 | throw Exception( |
348 | "Edit distance multi-search is not implemented when hyperscan is off (is it x86 processor?)" , |
349 | ErrorCodes::NOT_IMPLEMENTED); |
350 | PaddedPODArray<UInt8> accum(res.size()); |
351 | memset(res.data(), 0, res.size() * sizeof(res.front())); |
352 | memset(accum.data(), 0, accum.size()); |
353 | for (size_t j = 0; j < needles.size(); ++j) |
354 | { |
355 | MatchImpl<false, false>::vector_constant(haystack_data, haystack_offsets, needles[j].toString(), accum); |
356 | for (size_t i = 0; i < res.size(); ++i) |
357 | { |
358 | if constexpr (FindAny) |
359 | res[i] |= accum[i]; |
360 | else if (FindAnyIndex && accum[i]) |
361 | res[i] = j + 1; |
362 | } |
363 | } |
364 | #endif // USE_HYPERSCAN |
365 | } |
366 | }; |
367 | |
368 | template <typename Type, bool MultiSearchDistance> |
369 | struct MultiMatchAllIndicesImpl |
370 | { |
371 | using ResultType = Type; |
372 | static constexpr bool is_using_hyperscan = true; |
373 | /// Variable for understanding, if we used offsets for the output, most |
374 | /// likely to determine whether the function returns ColumnVector of ColumnArray. |
375 | static constexpr bool is_column_array = true; |
376 | static auto ReturnType() |
377 | { |
378 | return std::make_shared<DataTypeArray>(std::make_shared<DataTypeUInt64>()); |
379 | } |
380 | |
381 | static void vector_constant( |
382 | const ColumnString::Chars & haystack_data, |
383 | const ColumnString::Offsets & haystack_offsets, |
384 | const std::vector<StringRef> & needles, |
385 | PaddedPODArray<Type> & res, |
386 | PaddedPODArray<UInt64> & offsets) |
387 | { |
388 | vector_constant(haystack_data, haystack_offsets, needles, res, offsets, std::nullopt); |
389 | } |
390 | |
391 | static void vector_constant( |
392 | const ColumnString::Chars & haystack_data, |
393 | const ColumnString::Offsets & haystack_offsets, |
394 | const std::vector<StringRef> & needles, |
395 | PaddedPODArray<Type> & res, |
396 | PaddedPODArray<UInt64> & offsets, |
397 | [[maybe_unused]] std::optional<UInt32> edit_distance) |
398 | { |
399 | offsets.resize(haystack_offsets.size()); |
400 | #if USE_HYPERSCAN |
401 | const auto & hyperscan_regex = MultiRegexps::get</*SaveIndices=*/true, MultiSearchDistance>(needles, edit_distance); |
402 | hs_scratch_t * scratch = nullptr; |
403 | hs_error_t err = hs_clone_scratch(hyperscan_regex->getScratch(), &scratch); |
404 | |
405 | if (err != HS_SUCCESS) |
406 | throw Exception("Could not clone scratch space for hyperscan" , ErrorCodes::CANNOT_ALLOCATE_MEMORY); |
407 | |
408 | MultiRegexps::ScratchPtr smart_scratch(scratch); |
409 | |
410 | auto on_match = [](unsigned int id, |
411 | unsigned long long /* from */, |
412 | unsigned long long /* to */, |
413 | unsigned int /* flags */, |
414 | void * context) -> int |
415 | { |
416 | static_cast<PaddedPODArray<Type>*>(context)->push_back(id); |
417 | return 0; |
418 | }; |
419 | const size_t haystack_offsets_size = haystack_offsets.size(); |
420 | UInt64 offset = 0; |
421 | for (size_t i = 0; i < haystack_offsets_size; ++i) |
422 | { |
423 | UInt64 length = haystack_offsets[i] - offset - 1; |
424 | /// Hyperscan restriction. |
425 | if (length > std::numeric_limits<UInt32>::max()) |
426 | throw Exception("Too long string to search" , ErrorCodes::TOO_MANY_BYTES); |
427 | /// Scan, check, update the offsets array and the offset of haystack. |
428 | err = hs_scan( |
429 | hyperscan_regex->getDB(), |
430 | reinterpret_cast<const char *>(haystack_data.data()) + offset, |
431 | length, |
432 | 0, |
433 | smart_scratch.get(), |
434 | on_match, |
435 | &res); |
436 | if (err != HS_SUCCESS) |
437 | throw Exception("Failed to scan with hyperscan" , ErrorCodes::HYPERSCAN_CANNOT_SCAN_TEXT); |
438 | offsets[i] = res.size(); |
439 | offset = haystack_offsets[i]; |
440 | } |
441 | #else |
442 | (void)haystack_data; |
443 | (void)haystack_offsets; |
444 | (void)needles; |
445 | (void)res; |
446 | (void)offsets; |
447 | throw Exception( |
448 | "multi-search all indices is not implemented when hyperscan is off (is it x86 processor?)" , |
449 | ErrorCodes::NOT_IMPLEMENTED); |
450 | #endif // USE_HYPERSCAN |
451 | } |
452 | }; |
453 | |
454 | |
455 | struct |
456 | { |
457 | static void ( |
458 | const ColumnString::Chars & data, |
459 | const ColumnString::Offsets & offsets, |
460 | const std::string & pattern, |
461 | ColumnString::Chars & res_data, |
462 | ColumnString::Offsets & res_offsets) |
463 | { |
464 | res_data.reserve(data.size() / 5); |
465 | res_offsets.resize(offsets.size()); |
466 | |
467 | const auto & regexp = Regexps::get<false, false>(pattern); |
468 | |
469 | unsigned capture = regexp->getNumberOfSubpatterns() > 0 ? 1 : 0; |
470 | OptimizedRegularExpression::MatchVec matches; |
471 | matches.reserve(capture + 1); |
472 | size_t prev_offset = 0; |
473 | size_t res_offset = 0; |
474 | |
475 | for (size_t i = 0; i < offsets.size(); ++i) |
476 | { |
477 | size_t cur_offset = offsets[i]; |
478 | |
479 | unsigned count |
480 | = regexp->match(reinterpret_cast<const char *>(&data[prev_offset]), cur_offset - prev_offset - 1, matches, capture + 1); |
481 | if (count > capture && matches[capture].offset != std::string::npos) |
482 | { |
483 | const auto & match = matches[capture]; |
484 | res_data.resize(res_offset + match.length + 1); |
485 | memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], &data[prev_offset + match.offset], match.length); |
486 | res_offset += match.length; |
487 | } |
488 | else |
489 | { |
490 | res_data.resize(res_offset + 1); |
491 | } |
492 | |
493 | res_data[res_offset] = 0; |
494 | ++res_offset; |
495 | res_offsets[i] = res_offset; |
496 | |
497 | prev_offset = cur_offset; |
498 | } |
499 | } |
500 | }; |
501 | |
502 | |
503 | /** Replace all matches of regexp 'needle' to string 'replacement'. 'needle' and 'replacement' are constants. |
504 | * 'replacement' could contain substitutions, for example: '\2-\3-\1' |
505 | */ |
506 | template <bool replace_one = false> |
507 | struct ReplaceRegexpImpl |
508 | { |
509 | /// Sequence of instructions, describing how to get resulting string. |
510 | /// Each element is either: |
511 | /// - substitution (in that case first element of pair is their number and second element is empty) |
512 | /// - string that need to be inserted (in that case, first element of pair is that string and second element is -1) |
513 | using Instructions = std::vector<std::pair<int, std::string>>; |
514 | |
515 | static const size_t max_captures = 10; |
516 | |
517 | |
518 | static Instructions createInstructions(const std::string & s, int num_captures) |
519 | { |
520 | Instructions instructions; |
521 | |
522 | String now = "" ; |
523 | for (size_t i = 0; i < s.size(); ++i) |
524 | { |
525 | if (s[i] == '\\' && i + 1 < s.size()) |
526 | { |
527 | if (isNumericASCII(s[i + 1])) /// Substitution |
528 | { |
529 | if (!now.empty()) |
530 | { |
531 | instructions.emplace_back(-1, now); |
532 | now = "" ; |
533 | } |
534 | instructions.emplace_back(s[i + 1] - '0', String()); |
535 | } |
536 | else |
537 | now += s[i + 1]; /// Escaping |
538 | ++i; |
539 | } |
540 | else |
541 | now += s[i]; /// Plain character |
542 | } |
543 | |
544 | if (!now.empty()) |
545 | { |
546 | instructions.emplace_back(-1, now); |
547 | now = "" ; |
548 | } |
549 | |
550 | for (const auto & it : instructions) |
551 | if (it.first >= num_captures) |
552 | throw Exception( |
553 | "Invalid replace instruction in replacement string. Id: " + toString(it.first) + ", but regexp has only " |
554 | + toString(num_captures - 1) + " subpatterns" , |
555 | ErrorCodes::BAD_ARGUMENTS); |
556 | |
557 | return instructions; |
558 | } |
559 | |
560 | |
561 | static void processString( |
562 | const re2_st::StringPiece & input, |
563 | ColumnString::Chars & res_data, |
564 | ColumnString::Offset & res_offset, |
565 | re2_st::RE2 & searcher, |
566 | int num_captures, |
567 | const Instructions & instructions) |
568 | { |
569 | re2_st::StringPiece matches[max_captures]; |
570 | |
571 | size_t start_pos = 0; |
572 | while (start_pos < static_cast<size_t>(input.length())) |
573 | { |
574 | /// If no more replacements possible for current string |
575 | bool can_finish_current_string = false; |
576 | |
577 | if (searcher.Match(input, start_pos, input.length(), re2_st::RE2::Anchor::UNANCHORED, matches, num_captures)) |
578 | { |
579 | const auto & match = matches[0]; |
580 | size_t bytes_to_copy = (match.data() - input.data()) - start_pos; |
581 | |
582 | /// Copy prefix before matched regexp without modification |
583 | res_data.resize(res_data.size() + bytes_to_copy); |
584 | memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], input.data() + start_pos, bytes_to_copy); |
585 | res_offset += bytes_to_copy; |
586 | start_pos += bytes_to_copy + match.length(); |
587 | |
588 | /// Do substitution instructions |
589 | for (const auto & it : instructions) |
590 | { |
591 | if (it.first >= 0) |
592 | { |
593 | res_data.resize(res_data.size() + matches[it.first].length()); |
594 | memcpy(&res_data[res_offset], matches[it.first].data(), matches[it.first].length()); |
595 | res_offset += matches[it.first].length(); |
596 | } |
597 | else |
598 | { |
599 | res_data.resize(res_data.size() + it.second.size()); |
600 | memcpy(&res_data[res_offset], it.second.data(), it.second.size()); |
601 | res_offset += it.second.size(); |
602 | } |
603 | } |
604 | |
605 | if (replace_one || match.length() == 0) /// Stop after match of zero length, to avoid infinite loop. |
606 | can_finish_current_string = true; |
607 | } |
608 | else |
609 | can_finish_current_string = true; |
610 | |
611 | /// If ready, append suffix after match to end of string. |
612 | if (can_finish_current_string) |
613 | { |
614 | res_data.resize(res_data.size() + input.length() - start_pos); |
615 | memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], input.data() + start_pos, input.length() - start_pos); |
616 | res_offset += input.length() - start_pos; |
617 | start_pos = input.length(); |
618 | } |
619 | } |
620 | |
621 | res_data.resize(res_data.size() + 1); |
622 | res_data[res_offset] = 0; |
623 | ++res_offset; |
624 | } |
625 | |
626 | |
627 | static void vector( |
628 | const ColumnString::Chars & data, |
629 | const ColumnString::Offsets & offsets, |
630 | const std::string & needle, |
631 | const std::string & replacement, |
632 | ColumnString::Chars & res_data, |
633 | ColumnString::Offsets & res_offsets) |
634 | { |
635 | ColumnString::Offset res_offset = 0; |
636 | res_data.reserve(data.size()); |
637 | size_t size = offsets.size(); |
638 | res_offsets.resize(size); |
639 | |
640 | re2_st::RE2 searcher(needle); |
641 | int num_captures = std::min(searcher.NumberOfCapturingGroups() + 1, static_cast<int>(max_captures)); |
642 | |
643 | Instructions instructions = createInstructions(replacement, num_captures); |
644 | |
645 | /// Cannot perform search for whole block. Will process each string separately. |
646 | for (size_t i = 0; i < size; ++i) |
647 | { |
648 | int from = i > 0 ? offsets[i - 1] : 0; |
649 | re2_st::StringPiece input(reinterpret_cast<const char *>(data.data() + from), offsets[i] - from - 1); |
650 | |
651 | processString(input, res_data, res_offset, searcher, num_captures, instructions); |
652 | res_offsets[i] = res_offset; |
653 | } |
654 | } |
655 | |
656 | static void vector_fixed( |
657 | const ColumnString::Chars & data, |
658 | size_t n, |
659 | const std::string & needle, |
660 | const std::string & replacement, |
661 | ColumnString::Chars & res_data, |
662 | ColumnString::Offsets & res_offsets) |
663 | { |
664 | ColumnString::Offset res_offset = 0; |
665 | size_t size = data.size() / n; |
666 | res_data.reserve(data.size()); |
667 | res_offsets.resize(size); |
668 | |
669 | re2_st::RE2 searcher(needle); |
670 | int num_captures = std::min(searcher.NumberOfCapturingGroups() + 1, static_cast<int>(max_captures)); |
671 | |
672 | Instructions instructions = createInstructions(replacement, num_captures); |
673 | |
674 | for (size_t i = 0; i < size; ++i) |
675 | { |
676 | int from = i * n; |
677 | re2_st::StringPiece input(reinterpret_cast<const char *>(data.data() + from), n); |
678 | |
679 | processString(input, res_data, res_offset, searcher, num_captures, instructions); |
680 | res_offsets[i] = res_offset; |
681 | } |
682 | } |
683 | }; |
684 | |
685 | |
686 | /** Replace one or all occurencies of substring 'needle' to 'replacement'. 'needle' and 'replacement' are constants. |
687 | */ |
688 | template <bool replace_one = false> |
689 | struct ReplaceStringImpl |
690 | { |
691 | static void vector( |
692 | const ColumnString::Chars & data, |
693 | const ColumnString::Offsets & offsets, |
694 | const std::string & needle, |
695 | const std::string & replacement, |
696 | ColumnString::Chars & res_data, |
697 | ColumnString::Offsets & res_offsets) |
698 | { |
699 | const UInt8 * begin = data.data(); |
700 | const UInt8 * pos = begin; |
701 | const UInt8 * end = pos + data.size(); |
702 | |
703 | ColumnString::Offset res_offset = 0; |
704 | res_data.reserve(data.size()); |
705 | size_t size = offsets.size(); |
706 | res_offsets.resize(size); |
707 | |
708 | /// The current index in the array of strings. |
709 | size_t i = 0; |
710 | |
711 | Volnitsky searcher(needle.data(), needle.size(), end - pos); |
712 | |
713 | /// We will search for the next occurrence in all rows at once. |
714 | while (pos < end) |
715 | { |
716 | const UInt8 * match = searcher.search(pos, end - pos); |
717 | |
718 | /// Copy the data without changing |
719 | res_data.resize(res_data.size() + (match - pos)); |
720 | memcpy(&res_data[res_offset], pos, match - pos); |
721 | |
722 | /// Determine which index it belongs to. |
723 | while (i < offsets.size() && begin + offsets[i] <= match) |
724 | { |
725 | res_offsets[i] = res_offset + ((begin + offsets[i]) - pos); |
726 | ++i; |
727 | } |
728 | res_offset += (match - pos); |
729 | |
730 | /// If you have reached the end, it's time to stop |
731 | if (i == offsets.size()) |
732 | break; |
733 | |
734 | /// Is it true that this string no longer needs to perform transformations. |
735 | bool can_finish_current_string = false; |
736 | |
737 | /// We check that the entry does not go through the boundaries of strings. |
738 | if (match + needle.size() < begin + offsets[i]) |
739 | { |
740 | res_data.resize(res_data.size() + replacement.size()); |
741 | memcpy(&res_data[res_offset], replacement.data(), replacement.size()); |
742 | res_offset += replacement.size(); |
743 | pos = match + needle.size(); |
744 | if (replace_one) |
745 | can_finish_current_string = true; |
746 | } |
747 | else |
748 | { |
749 | pos = match; |
750 | can_finish_current_string = true; |
751 | } |
752 | |
753 | if (can_finish_current_string) |
754 | { |
755 | res_data.resize(res_data.size() + (begin + offsets[i] - pos)); |
756 | memcpy(&res_data[res_offset], pos, (begin + offsets[i] - pos)); |
757 | res_offset += (begin + offsets[i] - pos); |
758 | res_offsets[i] = res_offset; |
759 | pos = begin + offsets[i]; |
760 | ++i; |
761 | } |
762 | } |
763 | } |
764 | |
765 | /// Note: this function converts fixed-length strings to variable-length strings |
766 | /// and each variable-length string should ends with zero byte. |
767 | static void vector_fixed( |
768 | const ColumnString::Chars & data, |
769 | size_t n, |
770 | const std::string & needle, |
771 | const std::string & replacement, |
772 | ColumnString::Chars & res_data, |
773 | ColumnString::Offsets & res_offsets) |
774 | { |
775 | const UInt8 * begin = data.data(); |
776 | const UInt8 * pos = begin; |
777 | const UInt8 * end = pos + data.size(); |
778 | |
779 | ColumnString::Offset res_offset = 0; |
780 | size_t count = data.size() / n; |
781 | res_data.reserve(data.size()); |
782 | res_offsets.resize(count); |
783 | |
784 | /// The current index in the string array. |
785 | size_t i = 0; |
786 | |
787 | Volnitsky searcher(needle.data(), needle.size(), end - pos); |
788 | |
789 | /// We will search for the next occurrence in all rows at once. |
790 | while (pos < end) |
791 | { |
792 | const UInt8 * match = searcher.search(pos, end - pos); |
793 | |
794 | #define COPY_REST_OF_CURRENT_STRING() \ |
795 | do \ |
796 | { \ |
797 | const size_t len = begin + n * (i + 1) - pos; \ |
798 | res_data.resize(res_data.size() + len + 1); \ |
799 | memcpy(&res_data[res_offset], pos, len); \ |
800 | res_offset += len; \ |
801 | res_data[res_offset++] = 0; \ |
802 | res_offsets[i] = res_offset; \ |
803 | pos = begin + n * (i + 1); \ |
804 | ++i; \ |
805 | } while (false) |
806 | |
807 | /// Copy skipped strings without any changes but |
808 | /// add zero byte to the end of each string. |
809 | while (i < count && begin + n * (i + 1) <= match) |
810 | { |
811 | COPY_REST_OF_CURRENT_STRING(); |
812 | } |
813 | |
814 | /// If you have reached the end, it's time to stop |
815 | if (i == count) |
816 | break; |
817 | |
818 | /// Copy unchanged part of current string. |
819 | res_data.resize(res_data.size() + (match - pos)); |
820 | memcpy(&res_data[res_offset], pos, match - pos); |
821 | res_offset += (match - pos); |
822 | |
823 | /// Is it true that this string no longer needs to perform conversions. |
824 | bool can_finish_current_string = false; |
825 | |
826 | /// We check that the entry does not pass through the boundaries of strings. |
827 | if (match + needle.size() <= begin + n * (i + 1)) |
828 | { |
829 | res_data.resize(res_data.size() + replacement.size()); |
830 | memcpy(&res_data[res_offset], replacement.data(), replacement.size()); |
831 | res_offset += replacement.size(); |
832 | pos = match + needle.size(); |
833 | if (replace_one || pos == begin + n * (i + 1)) |
834 | can_finish_current_string = true; |
835 | } |
836 | else |
837 | { |
838 | pos = match; |
839 | can_finish_current_string = true; |
840 | } |
841 | |
842 | if (can_finish_current_string) |
843 | { |
844 | COPY_REST_OF_CURRENT_STRING(); |
845 | } |
846 | #undef COPY_REST_OF_CURRENT_STRING |
847 | } |
848 | } |
849 | |
850 | static void constant(const std::string & data, const std::string & needle, const std::string & replacement, std::string & res_data) |
851 | { |
852 | res_data = "" ; |
853 | int replace_cnt = 0; |
854 | for (size_t i = 0; i < data.size(); ++i) |
855 | { |
856 | bool match = true; |
857 | if (i + needle.size() > data.size() || (replace_one && replace_cnt > 0)) |
858 | match = false; |
859 | for (size_t j = 0; match && j < needle.size(); ++j) |
860 | if (data[i + j] != needle[j]) |
861 | match = false; |
862 | if (match) |
863 | { |
864 | ++replace_cnt; |
865 | res_data += replacement; |
866 | i = i + needle.size() - 1; |
867 | } |
868 | else |
869 | res_data += data[i]; |
870 | } |
871 | } |
872 | }; |
873 | |
874 | |
875 | template <typename Impl, typename Name> |
876 | class FunctionStringReplace : public IFunction |
877 | { |
878 | public: |
879 | static constexpr auto name = Name::name; |
880 | static FunctionPtr create(const Context &) { return std::make_shared<FunctionStringReplace>(); } |
881 | |
882 | String getName() const override { return name; } |
883 | |
884 | size_t getNumberOfArguments() const override { return 3; } |
885 | |
886 | bool useDefaultImplementationForConstants() const override { return true; } |
887 | ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2}; } |
888 | |
889 | DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override |
890 | { |
891 | if (!isStringOrFixedString(arguments[0])) |
892 | throw Exception( |
893 | "Illegal type " + arguments[0]->getName() + " of first argument of function " + getName(), |
894 | ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
895 | |
896 | if (!isStringOrFixedString(arguments[1])) |
897 | throw Exception( |
898 | "Illegal type " + arguments[1]->getName() + " of second argument of function " + getName(), |
899 | ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
900 | |
901 | if (!isStringOrFixedString(arguments[2])) |
902 | throw Exception( |
903 | "Illegal type " + arguments[2]->getName() + " of third argument of function " + getName(), |
904 | ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
905 | |
906 | return std::make_shared<DataTypeString>(); |
907 | } |
908 | |
909 | void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override |
910 | { |
911 | const ColumnPtr column_src = block.getByPosition(arguments[0]).column; |
912 | const ColumnPtr column_needle = block.getByPosition(arguments[1]).column; |
913 | const ColumnPtr column_replacement = block.getByPosition(arguments[2]).column; |
914 | |
915 | if (!isColumnConst(*column_needle) || !isColumnConst(*column_replacement)) |
916 | throw Exception("2nd and 3rd arguments of function " + getName() + " must be constants." , ErrorCodes::ILLEGAL_COLUMN); |
917 | |
918 | const IColumn * c1 = block.getByPosition(arguments[1]).column.get(); |
919 | const IColumn * c2 = block.getByPosition(arguments[2]).column.get(); |
920 | const ColumnConst * c1_const = typeid_cast<const ColumnConst *>(c1); |
921 | const ColumnConst * c2_const = typeid_cast<const ColumnConst *>(c2); |
922 | String needle = c1_const->getValue<String>(); |
923 | String replacement = c2_const->getValue<String>(); |
924 | |
925 | if (needle.size() == 0) |
926 | throw Exception("Length of the second argument of function replace must be greater than 0." , ErrorCodes::ARGUMENT_OUT_OF_BOUND); |
927 | |
928 | if (const ColumnString * col = checkAndGetColumn<ColumnString>(column_src.get())) |
929 | { |
930 | auto col_res = ColumnString::create(); |
931 | Impl::vector(col->getChars(), col->getOffsets(), needle, replacement, col_res->getChars(), col_res->getOffsets()); |
932 | block.getByPosition(result).column = std::move(col_res); |
933 | } |
934 | else if (const ColumnFixedString * col_fixed = checkAndGetColumn<ColumnFixedString>(column_src.get())) |
935 | { |
936 | auto col_res = ColumnString::create(); |
937 | Impl::vector_fixed(col_fixed->getChars(), col_fixed->getN(), needle, replacement, col_res->getChars(), col_res->getOffsets()); |
938 | block.getByPosition(result).column = std::move(col_res); |
939 | } |
940 | else |
941 | throw Exception( |
942 | "Illegal column " + block.getByPosition(arguments[0]).column->getName() + " of first argument of function " + getName(), |
943 | ErrorCodes::ILLEGAL_COLUMN); |
944 | } |
945 | }; |
946 | |
947 | struct NameMatch |
948 | { |
949 | static constexpr auto name = "match" ; |
950 | }; |
951 | struct NameLike |
952 | { |
953 | static constexpr auto name = "like" ; |
954 | }; |
955 | struct NameNotLike |
956 | { |
957 | static constexpr auto name = "notLike" ; |
958 | }; |
959 | struct NameMultiMatchAny |
960 | { |
961 | static constexpr auto name = "multiMatchAny" ; |
962 | }; |
963 | struct NameMultiMatchAnyIndex |
964 | { |
965 | static constexpr auto name = "multiMatchAnyIndex" ; |
966 | }; |
967 | struct NameMultiMatchAllIndices |
968 | { |
969 | static constexpr auto name = "multiMatchAllIndices" ; |
970 | }; |
971 | struct NameMultiFuzzyMatchAny |
972 | { |
973 | static constexpr auto name = "multiFuzzyMatchAny" ; |
974 | }; |
975 | struct NameMultiFuzzyMatchAnyIndex |
976 | { |
977 | static constexpr auto name = "multiFuzzyMatchAnyIndex" ; |
978 | }; |
979 | struct NameMultiFuzzyMatchAllIndices |
980 | { |
981 | static constexpr auto name = "multiFuzzyMatchAllIndices" ; |
982 | }; |
983 | struct |
984 | { |
985 | static constexpr auto = "extract" ; |
986 | }; |
987 | struct NameReplaceOne |
988 | { |
989 | static constexpr auto name = "replaceOne" ; |
990 | }; |
991 | struct NameReplaceAll |
992 | { |
993 | static constexpr auto name = "replaceAll" ; |
994 | }; |
995 | struct NameReplaceRegexpOne |
996 | { |
997 | static constexpr auto name = "replaceRegexpOne" ; |
998 | }; |
999 | struct NameReplaceRegexpAll |
1000 | { |
1001 | static constexpr auto name = "replaceRegexpAll" ; |
1002 | }; |
1003 | |
1004 | |
1005 | using FunctionMatch = FunctionsStringSearch<MatchImpl<false>, NameMatch>; |
1006 | |
1007 | using FunctionMultiMatchAny = FunctionsMultiStringSearch< |
1008 | MultiMatchAnyImpl<UInt8, true, false, false>, |
1009 | NameMultiMatchAny, |
1010 | std::numeric_limits<UInt32>::max()>; |
1011 | |
1012 | using FunctionMultiMatchAnyIndex = FunctionsMultiStringSearch< |
1013 | MultiMatchAnyImpl<UInt64, false, true, false>, |
1014 | NameMultiMatchAnyIndex, |
1015 | std::numeric_limits<UInt32>::max()>; |
1016 | |
1017 | using FunctionMultiMatchAllIndices = FunctionsMultiStringSearch< |
1018 | MultiMatchAllIndicesImpl<UInt64, false>, |
1019 | NameMultiMatchAllIndices, |
1020 | std::numeric_limits<UInt32>::max()>; |
1021 | |
1022 | using FunctionMultiFuzzyMatchAny = FunctionsMultiStringFuzzySearch< |
1023 | MultiMatchAnyImpl<UInt8, true, false, true>, |
1024 | NameMultiFuzzyMatchAny, |
1025 | std::numeric_limits<UInt32>::max()>; |
1026 | |
1027 | using FunctionMultiFuzzyMatchAnyIndex = FunctionsMultiStringFuzzySearch< |
1028 | MultiMatchAnyImpl<UInt64, false, true, true>, |
1029 | NameMultiFuzzyMatchAnyIndex, |
1030 | std::numeric_limits<UInt32>::max()>; |
1031 | |
1032 | using FunctionMultiFuzzyMatchAllIndices = FunctionsMultiStringFuzzySearch< |
1033 | MultiMatchAllIndicesImpl<UInt64, true>, |
1034 | NameMultiFuzzyMatchAllIndices, |
1035 | std::numeric_limits<UInt32>::max()>; |
1036 | |
1037 | using FunctionLike = FunctionsStringSearch<MatchImpl<true>, NameLike>; |
1038 | using FunctionNotLike = FunctionsStringSearch<MatchImpl<true, true>, NameNotLike>; |
1039 | using = FunctionsStringSearchToString<ExtractImpl, NameExtract>; |
1040 | using FunctionReplaceOne = FunctionStringReplace<ReplaceStringImpl<true>, NameReplaceOne>; |
1041 | using FunctionReplaceAll = FunctionStringReplace<ReplaceStringImpl<false>, NameReplaceAll>; |
1042 | using FunctionReplaceRegexpOne = FunctionStringReplace<ReplaceRegexpImpl<true>, NameReplaceRegexpOne>; |
1043 | using FunctionReplaceRegexpAll = FunctionStringReplace<ReplaceRegexpImpl<false>, NameReplaceRegexpAll>; |
1044 | |
1045 | void registerFunctionsStringRegex(FunctionFactory & factory) |
1046 | { |
1047 | factory.registerFunction<FunctionMatch>(); |
1048 | factory.registerFunction<FunctionLike>(); |
1049 | factory.registerFunction<FunctionNotLike>(); |
1050 | factory.registerFunction<FunctionExtract>(); |
1051 | |
1052 | factory.registerFunction<FunctionReplaceOne>(); |
1053 | factory.registerFunction<FunctionReplaceAll>(); |
1054 | factory.registerFunction<FunctionReplaceRegexpOne>(); |
1055 | factory.registerFunction<FunctionReplaceRegexpAll>(); |
1056 | |
1057 | factory.registerFunction<FunctionMultiMatchAny>(); |
1058 | factory.registerFunction<FunctionMultiMatchAnyIndex>(); |
1059 | factory.registerFunction<FunctionMultiMatchAllIndices>(); |
1060 | factory.registerFunction<FunctionMultiFuzzyMatchAny>(); |
1061 | factory.registerFunction<FunctionMultiFuzzyMatchAnyIndex>(); |
1062 | factory.registerFunction<FunctionMultiFuzzyMatchAllIndices>(); |
1063 | factory.registerAlias("replace" , NameReplaceAll::name, FunctionFactory::CaseInsensitive); |
1064 | } |
1065 | } |
1066 | |