1 | #pragma once |
2 | |
3 | #include <algorithm> |
4 | #include <vector> |
5 | #include <stdint.h> |
6 | #include <string.h> |
7 | #include <Core/Types.h> |
8 | #include <Poco/UTF8Encoding.h> |
9 | #include <Poco/Unicode.h> |
10 | #include <Common/StringSearcher.h> |
11 | #include <Common/StringUtils/StringUtils.h> |
12 | #include <common/StringRef.h> |
13 | #include <common/unaligned.h> |
14 | |
15 | /** Search for a substring in a string by Volnitsky's algorithm |
16 | * http://volnitsky.com/project/str_search/ |
17 | * |
18 | * `haystack` and `needle` can contain zero bytes. |
19 | * |
20 | * Algorithm: |
21 | * - if the `needle` is too small or too large, or too small `haystack`, use std::search or memchr; |
22 | * - when initializing, fill in an open-addressing linear probing hash table of the form |
23 | * hash from the bigram of needle -> the position of this bigram in needle + 1. |
24 | * (one is added only to distinguish zero offset from an empty cell) |
25 | * - the keys are not stored in the hash table, only the values are stored; |
26 | * - bigrams can be inserted several times if they occur in the needle several times; |
27 | * - when searching, take from haystack bigram, which should correspond to the last bigram of needle (comparing from the end); |
28 | * - look for it in the hash table, if found - get the offset from the hash table and compare the string bytewise; |
29 | * - if it did not match, we check the next cell of the hash table from the collision resolution chain; |
30 | * - if not found, skip to haystack almost the size of the needle bytes; |
31 | * |
32 | * MultiVolnitsky - search for multiple substrings in a string: |
33 | * - Add bigrams to hash table with string index. Then the usual Volnitsky search is used. |
34 | * - We are adding while searching, limiting the number of fallback searchers and the total number of added bigrams |
35 | */ |
36 | |
37 | |
38 | namespace DB |
39 | { |
40 | namespace VolnitskyTraits |
41 | { |
42 | using Offset = UInt8; /// Offset in the needle. For the basic algorithm, the length of the needle must not be greater than 255. |
43 | using Id = UInt8; /// Index of the string (within the array of multiple needles), must not be greater than 255. |
44 | using Ngram = UInt16; /// n-gram (2 bytes). |
45 | |
46 | /** Fits into the L2 cache (of common Intel CPUs). |
47 | * This number is extremely good for compilers as it is numeric_limits<Uint16>::max() and there are optimizations with movzwl and other instructions with 2 bytes |
48 | */ |
49 | static constexpr size_t hash_size = 64 * 1024; |
50 | |
51 | /// min haystack size to use main algorithm instead of fallback |
52 | static constexpr size_t min_haystack_size_for_algorithm = 20000; |
53 | |
54 | static inline bool isFallbackNeedle(const size_t needle_size, size_t haystack_size_hint = 0) |
55 | { |
56 | return needle_size < 2 * sizeof(Ngram) || needle_size >= std::numeric_limits<Offset>::max() |
57 | || (haystack_size_hint && haystack_size_hint < min_haystack_size_for_algorithm); |
58 | } |
59 | |
60 | static inline Ngram toNGram(const UInt8 * const pos) { return unalignedLoad<Ngram>(pos); } |
61 | |
62 | template <typename Callback> |
63 | static inline void putNGramASCIICaseInsensitive(const UInt8 * const pos, const int offset, const Callback & putNGramBase) |
64 | { |
65 | struct Chars |
66 | { |
67 | UInt8 c0; |
68 | UInt8 c1; |
69 | }; |
70 | |
71 | union |
72 | { |
73 | Ngram n; |
74 | Chars chars; |
75 | }; |
76 | |
77 | n = toNGram(pos); |
78 | |
79 | const auto c0_al = isAlphaASCII(chars.c0); |
80 | const auto c1_al = isAlphaASCII(chars.c1); |
81 | |
82 | if (c0_al && c1_al) |
83 | { |
84 | /// 4 combinations: AB, aB, Ab, ab |
85 | putNGramBase(n, offset); |
86 | chars.c0 = alternateCaseIfAlphaASCII(chars.c0); |
87 | putNGramBase(n, offset); |
88 | chars.c1 = alternateCaseIfAlphaASCII(chars.c1); |
89 | putNGramBase(n, offset); |
90 | chars.c0 = alternateCaseIfAlphaASCII(chars.c0); |
91 | putNGramBase(n, offset); |
92 | } |
93 | else if (c0_al) |
94 | { |
95 | /// 2 combinations: A1, a1 |
96 | putNGramBase(n, offset); |
97 | chars.c0 = alternateCaseIfAlphaASCII(chars.c0); |
98 | putNGramBase(n, offset); |
99 | } |
100 | else if (c1_al) |
101 | { |
102 | /// 2 combinations: 0B, 0b |
103 | putNGramBase(n, offset); |
104 | chars.c1 = alternateCaseIfAlphaASCII(chars.c1); |
105 | putNGramBase(n, offset); |
106 | } |
107 | else |
108 | /// 1 combination: 01 |
109 | putNGramBase(n, offset); |
110 | } |
111 | |
112 | template <bool CaseSensitive, bool ASCII, typename Callback> |
113 | static inline void putNGram(const UInt8 * const pos, const int offset, [[maybe_unused]] const UInt8 * const begin, const Callback & putNGramBase) |
114 | { |
115 | if constexpr (CaseSensitive) |
116 | { |
117 | putNGramBase(toNGram(pos), offset); |
118 | } |
119 | else |
120 | { |
121 | if constexpr (ASCII) |
122 | { |
123 | putNGramASCIICaseInsensitive(pos, offset, putNGramBase); |
124 | } |
125 | else |
126 | { |
127 | struct Chars |
128 | { |
129 | UInt8 c0; |
130 | UInt8 c1; |
131 | }; |
132 | |
133 | union |
134 | { |
135 | VolnitskyTraits::Ngram n; |
136 | Chars chars; |
137 | }; |
138 | |
139 | n = toNGram(pos); |
140 | |
141 | if (isascii(chars.c0) && isascii(chars.c1)) |
142 | putNGramASCIICaseInsensitive(pos, offset, putNGramBase); |
143 | else |
144 | { |
145 | /** n-gram (in the case of n = 2) |
146 | * can be entirely located within one code point, |
147 | * or intersect with two code points. |
148 | * |
149 | * In the first case, you need to consider up to two alternatives - this code point in upper and lower case, |
150 | * and in the second case - up to four alternatives - fragments of two code points in all combinations of cases. |
151 | * |
152 | * It does not take into account the dependence of the case-transformation from the locale (for example - Turkish `Ii`) |
153 | * as well as composition / decomposition and other features. |
154 | * |
155 | * It also does not work if characters with lower and upper cases are represented by different number of bytes or code points. |
156 | */ |
157 | |
158 | using Seq = UInt8[6]; |
159 | |
160 | static const Poco::UTF8Encoding utf8; |
161 | |
162 | if (UTF8::isContinuationOctet(chars.c1)) |
163 | { |
164 | /// ngram is inside a sequence |
165 | auto seq_pos = pos; |
166 | UTF8::syncBackward(seq_pos, begin); |
167 | |
168 | const auto u32 = utf8.convert(seq_pos); |
169 | const auto l_u32 = Poco::Unicode::toLower(u32); |
170 | const auto u_u32 = Poco::Unicode::toUpper(u32); |
171 | |
172 | /// symbol is case-independent |
173 | if (l_u32 == u_u32) |
174 | putNGramBase(n, offset); |
175 | else |
176 | { |
177 | /// where is the given ngram in respect to the start of UTF-8 sequence? |
178 | const auto seq_ngram_offset = pos - seq_pos; |
179 | |
180 | Seq seq; |
181 | |
182 | /// put ngram for lowercase |
183 | utf8.convert(l_u32, seq, sizeof(seq)); |
184 | chars.c0 = seq[seq_ngram_offset]; |
185 | chars.c1 = seq[seq_ngram_offset + 1]; |
186 | putNGramBase(n, offset); |
187 | |
188 | /// put ngram for uppercase |
189 | utf8.convert(u_u32, seq, sizeof(seq)); |
190 | chars.c0 = seq[seq_ngram_offset]; //-V519 |
191 | chars.c1 = seq[seq_ngram_offset + 1]; //-V519 |
192 | putNGramBase(n, offset); |
193 | } |
194 | } |
195 | else |
196 | { |
197 | /// ngram is on the boundary of two sequences |
198 | /// first sequence may start before u_pos if it is not ASCII |
199 | auto first_seq_pos = pos; |
200 | UTF8::syncBackward(first_seq_pos, begin); |
201 | /// where is the given ngram in respect to the start of first UTF-8 sequence? |
202 | const auto seq_ngram_offset = pos - first_seq_pos; |
203 | |
204 | const auto first_u32 = utf8.convert(first_seq_pos); |
205 | const auto first_l_u32 = Poco::Unicode::toLower(first_u32); |
206 | const auto first_u_u32 = Poco::Unicode::toUpper(first_u32); |
207 | |
208 | /// second sequence always start immediately after u_pos |
209 | auto second_seq_pos = pos + 1; |
210 | |
211 | const auto second_u32 = utf8.convert(second_seq_pos); /// TODO This assumes valid UTF-8 or zero byte after needle. |
212 | const auto second_l_u32 = Poco::Unicode::toLower(second_u32); |
213 | const auto second_u_u32 = Poco::Unicode::toUpper(second_u32); |
214 | |
215 | /// both symbols are case-independent |
216 | if (first_l_u32 == first_u_u32 && second_l_u32 == second_u_u32) |
217 | { |
218 | putNGramBase(n, offset); |
219 | } |
220 | else if (first_l_u32 == first_u_u32) |
221 | { |
222 | /// first symbol is case-independent |
223 | Seq seq; |
224 | |
225 | /// put ngram for lowercase |
226 | utf8.convert(second_l_u32, seq, sizeof(seq)); |
227 | chars.c1 = seq[0]; |
228 | putNGramBase(n, offset); |
229 | |
230 | /// put ngram from uppercase, if it is different |
231 | utf8.convert(second_u_u32, seq, sizeof(seq)); |
232 | if (chars.c1 != seq[0]) |
233 | { |
234 | chars.c1 = seq[0]; |
235 | putNGramBase(n, offset); |
236 | } |
237 | } |
238 | else if (second_l_u32 == second_u_u32) |
239 | { |
240 | /// second symbol is case-independent |
241 | Seq seq; |
242 | |
243 | /// put ngram for lowercase |
244 | utf8.convert(first_l_u32, seq, sizeof(seq)); |
245 | chars.c0 = seq[seq_ngram_offset]; |
246 | putNGramBase(n, offset); |
247 | |
248 | /// put ngram for uppercase, if it is different |
249 | utf8.convert(first_u_u32, seq, sizeof(seq)); |
250 | if (chars.c0 != seq[seq_ngram_offset]) |
251 | { |
252 | chars.c0 = seq[seq_ngram_offset]; |
253 | putNGramBase(n, offset); |
254 | } |
255 | } |
256 | else |
257 | { |
258 | Seq first_l_seq; |
259 | Seq first_u_seq; |
260 | Seq second_l_seq; |
261 | Seq second_u_seq; |
262 | |
263 | utf8.convert(first_l_u32, first_l_seq, sizeof(first_l_seq)); |
264 | utf8.convert(first_u_u32, first_u_seq, sizeof(first_u_seq)); |
265 | utf8.convert(second_l_u32, second_l_seq, sizeof(second_l_seq)); |
266 | utf8.convert(second_u_u32, second_u_seq, sizeof(second_u_seq)); |
267 | |
268 | auto c0l = first_l_seq[seq_ngram_offset]; |
269 | auto c0u = first_u_seq[seq_ngram_offset]; |
270 | auto c1l = second_l_seq[0]; |
271 | auto c1u = second_u_seq[0]; |
272 | |
273 | /// ngram for ll |
274 | chars.c0 = c0l; |
275 | chars.c1 = c1l; |
276 | putNGramBase(n, offset); |
277 | |
278 | if (c0l != c0u) |
279 | { |
280 | /// ngram for Ul |
281 | chars.c0 = c0u; |
282 | chars.c1 = c1l; |
283 | putNGramBase(n, offset); |
284 | } |
285 | |
286 | if (c1l != c1u) |
287 | { |
288 | /// ngram for lU |
289 | chars.c0 = c0l; |
290 | chars.c1 = c1u; |
291 | putNGramBase(n, offset); |
292 | } |
293 | |
294 | if (c0l != c0u && c1l != c1u) |
295 | { |
296 | /// ngram for UU |
297 | chars.c0 = c0u; |
298 | chars.c1 = c1u; |
299 | putNGramBase(n, offset); |
300 | } |
301 | } |
302 | } |
303 | } |
304 | } |
305 | } |
306 | } |
307 | } |
308 | |
309 | |
310 | /// @todo store lowercase needle to speed up in case there are numerous occurrences of bigrams from needle in haystack |
311 | template <bool CaseSensitive, bool ASCII, typename FallbackSearcher> |
312 | class VolnitskyBase |
313 | { |
314 | protected: |
315 | const UInt8 * const needle; |
316 | const size_t needle_size; |
317 | const UInt8 * const needle_end = needle + needle_size; |
318 | /// For how long we move, if the n-gram from haystack is not found in the hash table. |
319 | const size_t step = needle_size - sizeof(VolnitskyTraits::Ngram) + 1; |
320 | |
321 | /** max needle length is 255, max distinct ngrams for case-sensitive is (255 - 1), case-insensitive is 4 * (255 - 1) |
322 | * storage of 64K ngrams (n = 2, 128 KB) should be large enough for both cases */ |
323 | VolnitskyTraits::Offset hash[VolnitskyTraits::hash_size]; /// Hash table. |
324 | |
325 | const bool fallback; /// Do we need to use the fallback algorithm. |
326 | |
327 | FallbackSearcher fallback_searcher; |
328 | |
329 | public: |
330 | using Searcher = FallbackSearcher; |
331 | |
332 | /** haystack_size_hint - the expected total size of the haystack for `search` calls. Optional (zero means unspecified). |
333 | * If you specify it small enough, the fallback algorithm will be used, |
334 | * since it is considered that it's useless to waste time initializing the hash table. |
335 | */ |
336 | VolnitskyBase(const char * const needle_, const size_t needle_size_, size_t haystack_size_hint = 0) |
337 | : needle{reinterpret_cast<const UInt8 *>(needle_)} |
338 | , needle_size{needle_size_} |
339 | , fallback{VolnitskyTraits::isFallbackNeedle(needle_size, haystack_size_hint)} |
340 | , fallback_searcher{needle_, needle_size} |
341 | { |
342 | if (fallback) |
343 | return; |
344 | |
345 | memset(hash, 0, sizeof(hash)); |
346 | |
347 | auto callback = [this](const VolnitskyTraits::Ngram ngram, const int offset) { return this->putNGramBase(ngram, offset); }; |
348 | /// ssize_t is used here because unsigned can't be used with condition like `i >= 0`, unsigned always >= 0 |
349 | /// And also adding from the end guarantees that we will find first occurence because we will lookup bigger offsets first. |
350 | for (auto i = static_cast<ssize_t>(needle_size - sizeof(VolnitskyTraits::Ngram)); i >= 0; --i) |
351 | VolnitskyTraits::putNGram<CaseSensitive, ASCII>(this->needle + i, i + 1, this->needle, callback); |
352 | } |
353 | |
354 | |
355 | /// If not found, the end of the haystack is returned. |
356 | const UInt8 * search(const UInt8 * const haystack, const size_t haystack_size) const |
357 | { |
358 | if (needle_size == 0) |
359 | return haystack; |
360 | |
361 | const auto haystack_end = haystack + haystack_size; |
362 | |
363 | if (fallback || haystack_size <= needle_size) |
364 | return fallback_searcher.search(haystack, haystack_end); |
365 | |
366 | /// Let's "apply" the needle to the haystack and compare the n-gram from the end of the needle. |
367 | const auto * pos = haystack + needle_size - sizeof(VolnitskyTraits::Ngram); |
368 | for (; pos <= haystack_end - needle_size; pos += step) |
369 | { |
370 | /// We look at all the cells of the hash table that can correspond to the n-gram from haystack. |
371 | for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num]; |
372 | cell_num = (cell_num + 1) % VolnitskyTraits::hash_size) |
373 | { |
374 | /// When found - compare bytewise, using the offset from the hash table. |
375 | const auto res = pos - (hash[cell_num] - 1); |
376 | |
377 | /// pointer in the code is always padded array so we can use pagesafe semantics |
378 | if (fallback_searcher.compare(haystack, haystack_end, res)) |
379 | return res; |
380 | } |
381 | } |
382 | |
383 | return fallback_searcher.search(pos - step + 1, haystack_end); |
384 | } |
385 | |
386 | const char * search(const char * haystack, size_t haystack_size) const |
387 | { |
388 | return reinterpret_cast<const char *>(search(reinterpret_cast<const UInt8 *>(haystack), haystack_size)); |
389 | } |
390 | |
391 | protected: |
392 | void putNGramBase(const VolnitskyTraits::Ngram ngram, const int offset) |
393 | { |
394 | /// Put the offset for the n-gram in the corresponding cell or the nearest free cell. |
395 | size_t cell_num = ngram % VolnitskyTraits::hash_size; |
396 | |
397 | while (hash[cell_num]) |
398 | cell_num = (cell_num + 1) % VolnitskyTraits::hash_size; /// Search for the next free cell. |
399 | |
400 | hash[cell_num] = offset; |
401 | } |
402 | }; |
403 | |
404 | |
405 | template <bool CaseSensitive, bool ASCII, typename FallbackSearcher> |
406 | class MultiVolnitskyBase |
407 | { |
408 | private: |
409 | /// needles and their offsets |
410 | const std::vector<StringRef> & needles; |
411 | |
412 | |
413 | /// fallback searchers |
414 | std::vector<size_t> fallback_needles; |
415 | std::vector<FallbackSearcher> fallback_searchers; |
416 | |
417 | /// because std::pair<> is not POD |
418 | struct OffsetId |
419 | { |
420 | VolnitskyTraits::Id id; |
421 | VolnitskyTraits::Offset off; |
422 | }; |
423 | |
424 | OffsetId hash[VolnitskyTraits::hash_size]; |
425 | |
426 | /// step for each bunch of strings |
427 | size_t step; |
428 | |
429 | /// last index of offsets that was not processed |
430 | size_t last; |
431 | |
432 | /// limit for adding to hashtable. In worst case with case insentive search, the table will be filled at most as half |
433 | static constexpr size_t small_limit = VolnitskyTraits::hash_size / 8; |
434 | |
435 | public: |
436 | MultiVolnitskyBase(const std::vector<StringRef> & needles_) : needles{needles_}, step{0}, last{0} |
437 | { |
438 | fallback_searchers.reserve(needles.size()); |
439 | } |
440 | |
441 | /** |
442 | * This function is needed to initialize hash table |
443 | * Returns `true` if there is nothing to initialize |
444 | * and `false` if we have something to initialize and initializes it. |
445 | * This function is a kind of fallback if there are many needles. |
446 | * We actually destroy the hash table and initialize it with uninitialized needles |
447 | * and search through the haystack again. |
448 | * The actual usage of this function is like this: |
449 | * while (hasMoreToSearch()) |
450 | * { |
451 | * search inside the haystack with the known needles |
452 | * } |
453 | */ |
454 | bool hasMoreToSearch() |
455 | { |
456 | if (last == needles.size()) |
457 | return false; |
458 | |
459 | memset(hash, 0, sizeof(hash)); |
460 | fallback_needles.clear(); |
461 | step = std::numeric_limits<size_t>::max(); |
462 | |
463 | size_t buf = 0; |
464 | size_t size = needles.size(); |
465 | |
466 | for (; last < size; ++last) |
467 | { |
468 | const char * cur_needle_data = needles[last].data; |
469 | const size_t cur_needle_size = needles[last].size; |
470 | |
471 | /// save the indices of fallback searchers |
472 | if (VolnitskyTraits::isFallbackNeedle(cur_needle_size)) |
473 | { |
474 | fallback_needles.push_back(last); |
475 | } |
476 | else |
477 | { |
478 | /// put all bigrams |
479 | auto callback = [this](const VolnitskyTraits::Ngram ngram, const int offset) |
480 | { |
481 | return this->putNGramBase(ngram, offset, this->last); |
482 | }; |
483 | |
484 | buf += cur_needle_size - sizeof(VolnitskyTraits::Ngram) + 1; |
485 | |
486 | /// this is the condition when we actually need to stop and start searching with known needles |
487 | if (buf > small_limit) |
488 | break; |
489 | |
490 | step = std::min(step, cur_needle_size - sizeof(VolnitskyTraits::Ngram) + 1); |
491 | for (auto i = static_cast<int>(cur_needle_size - sizeof(VolnitskyTraits::Ngram)); i >= 0; --i) |
492 | { |
493 | VolnitskyTraits::putNGram<CaseSensitive, ASCII>( |
494 | reinterpret_cast<const UInt8 *>(cur_needle_data) + i, |
495 | i + 1, |
496 | reinterpret_cast<const UInt8 *>(cur_needle_data), |
497 | callback); |
498 | } |
499 | } |
500 | fallback_searchers.emplace_back(cur_needle_data, cur_needle_size); |
501 | } |
502 | return true; |
503 | } |
504 | |
505 | inline bool searchOne(const UInt8 * haystack, const UInt8 * haystack_end) const |
506 | { |
507 | const size_t fallback_size = fallback_needles.size(); |
508 | for (size_t i = 0; i < fallback_size; ++i) |
509 | if (fallback_searchers[fallback_needles[i]].search(haystack, haystack_end) != haystack_end) |
510 | return true; |
511 | |
512 | /// check if we have one non empty volnitsky searcher |
513 | if (step != std::numeric_limits<size_t>::max()) |
514 | { |
515 | const auto * pos = haystack + step - sizeof(VolnitskyTraits::Ngram); |
516 | for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos += step) |
517 | { |
518 | for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num].off; |
519 | cell_num = (cell_num + 1) % VolnitskyTraits::hash_size) |
520 | { |
521 | if (pos >= haystack + hash[cell_num].off - 1) |
522 | { |
523 | const auto res = pos - (hash[cell_num].off - 1); |
524 | const size_t ind = hash[cell_num].id; |
525 | if (res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(haystack, haystack_end, res)) |
526 | return true; |
527 | } |
528 | } |
529 | } |
530 | } |
531 | return false; |
532 | } |
533 | |
534 | inline size_t searchOneFirstIndex(const UInt8 * haystack, const UInt8 * haystack_end) const |
535 | { |
536 | const size_t fallback_size = fallback_needles.size(); |
537 | |
538 | size_t ans = std::numeric_limits<size_t>::max(); |
539 | |
540 | for (size_t i = 0; i < fallback_size; ++i) |
541 | if (fallback_searchers[fallback_needles[i]].search(haystack, haystack_end) != haystack_end) |
542 | ans = std::min(ans, fallback_needles[i]); |
543 | |
544 | /// check if we have one non empty volnitsky searcher |
545 | if (step != std::numeric_limits<size_t>::max()) |
546 | { |
547 | const auto * pos = haystack + step - sizeof(VolnitskyTraits::Ngram); |
548 | for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos += step) |
549 | { |
550 | for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num].off; |
551 | cell_num = (cell_num + 1) % VolnitskyTraits::hash_size) |
552 | { |
553 | if (pos >= haystack + hash[cell_num].off - 1) |
554 | { |
555 | const auto res = pos - (hash[cell_num].off - 1); |
556 | const size_t ind = hash[cell_num].id; |
557 | if (res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(haystack, haystack_end, res)) |
558 | ans = std::min(ans, ind); |
559 | } |
560 | } |
561 | } |
562 | } |
563 | |
564 | /* |
565 | * if nothing was found, ans + 1 will be equal to zero and we can |
566 | * assign it into the result because we need to return the position starting with one |
567 | */ |
568 | return ans + 1; |
569 | } |
570 | |
571 | template <typename CountCharsCallback> |
572 | inline UInt64 searchOneFirstPosition(const UInt8 * haystack, const UInt8 * haystack_end, const CountCharsCallback & count_chars) const |
573 | { |
574 | const size_t fallback_size = fallback_needles.size(); |
575 | |
576 | UInt64 ans = std::numeric_limits<UInt64>::max(); |
577 | |
578 | for (size_t i = 0; i < fallback_size; ++i) |
579 | if (auto pos = fallback_searchers[fallback_needles[i]].search(haystack, haystack_end); pos != haystack_end) |
580 | ans = std::min<UInt64>(ans, pos - haystack); |
581 | |
582 | /// check if we have one non empty volnitsky searcher |
583 | if (step != std::numeric_limits<size_t>::max()) |
584 | { |
585 | const auto * pos = haystack + step - sizeof(VolnitskyTraits::Ngram); |
586 | for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos += step) |
587 | { |
588 | for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num].off; |
589 | cell_num = (cell_num + 1) % VolnitskyTraits::hash_size) |
590 | { |
591 | if (pos >= haystack + hash[cell_num].off - 1) |
592 | { |
593 | const auto res = pos - (hash[cell_num].off - 1); |
594 | const size_t ind = hash[cell_num].id; |
595 | if (res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(haystack, haystack_end, res)) |
596 | ans = std::min<UInt64>(ans, res - haystack); |
597 | } |
598 | } |
599 | } |
600 | } |
601 | if (ans == std::numeric_limits<UInt64>::max()) |
602 | return 0; |
603 | return count_chars(haystack, haystack + ans); |
604 | } |
605 | |
606 | template <typename CountCharsCallback, typename AnsType> |
607 | inline void searchOneAll(const UInt8 * haystack, const UInt8 * haystack_end, AnsType * ans, const CountCharsCallback & count_chars) const |
608 | { |
609 | const size_t fallback_size = fallback_needles.size(); |
610 | for (size_t i = 0; i < fallback_size; ++i) |
611 | { |
612 | const UInt8 * ptr = fallback_searchers[fallback_needles[i]].search(haystack, haystack_end); |
613 | if (ptr != haystack_end) |
614 | ans[fallback_needles[i]] = count_chars(haystack, ptr); |
615 | } |
616 | |
617 | /// check if we have one non empty volnitsky searcher |
618 | if (step != std::numeric_limits<size_t>::max()) |
619 | { |
620 | const auto * pos = haystack + step - sizeof(VolnitskyTraits::Ngram); |
621 | for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos += step) |
622 | { |
623 | for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num].off; |
624 | cell_num = (cell_num + 1) % VolnitskyTraits::hash_size) |
625 | { |
626 | if (pos >= haystack + hash[cell_num].off - 1) |
627 | { |
628 | const auto * res = pos - (hash[cell_num].off - 1); |
629 | const size_t ind = hash[cell_num].id; |
630 | if (ans[ind] == 0 && res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(haystack, haystack_end, res)) |
631 | ans[ind] = count_chars(haystack, res); |
632 | } |
633 | } |
634 | } |
635 | } |
636 | } |
637 | |
638 | void putNGramBase(const VolnitskyTraits::Ngram ngram, const int offset, const size_t num) |
639 | { |
640 | size_t cell_num = ngram % VolnitskyTraits::hash_size; |
641 | |
642 | while (hash[cell_num].off) |
643 | cell_num = (cell_num + 1) % VolnitskyTraits::hash_size; |
644 | |
645 | hash[cell_num] = {static_cast<VolnitskyTraits::Id>(num), static_cast<VolnitskyTraits::Offset>(offset)}; |
646 | } |
647 | }; |
648 | |
649 | |
650 | using Volnitsky = VolnitskyBase<true, true, ASCIICaseSensitiveStringSearcher>; |
651 | using VolnitskyUTF8 = VolnitskyBase<true, false, ASCIICaseSensitiveStringSearcher>; /// exactly same as Volnitsky |
652 | using VolnitskyCaseInsensitive = VolnitskyBase<false, true, ASCIICaseInsensitiveStringSearcher>; /// ignores non-ASCII bytes |
653 | using VolnitskyCaseInsensitiveUTF8 = VolnitskyBase<false, false, UTF8CaseInsensitiveStringSearcher>; |
654 | |
655 | using VolnitskyCaseSensitiveToken = VolnitskyBase<true, true, ASCIICaseSensitiveTokenSearcher>; |
656 | using VolnitskyCaseInsensitiveToken = VolnitskyBase<false, true, ASCIICaseInsensitiveTokenSearcher>; |
657 | |
658 | using MultiVolnitsky = MultiVolnitskyBase<true, true, ASCIICaseSensitiveStringSearcher>; |
659 | using MultiVolnitskyUTF8 = MultiVolnitskyBase<true, false, ASCIICaseSensitiveStringSearcher>; |
660 | using MultiVolnitskyCaseInsensitive = MultiVolnitskyBase<false, true, ASCIICaseInsensitiveStringSearcher>; |
661 | using MultiVolnitskyCaseInsensitiveUTF8 = MultiVolnitskyBase<false, false, UTF8CaseInsensitiveStringSearcher>; |
662 | |
663 | |
664 | } |
665 | |