| 1 | #pragma once |
| 2 | |
| 3 | #include <algorithm> |
| 4 | #include <vector> |
| 5 | #include <stdint.h> |
| 6 | #include <string.h> |
| 7 | #include <Core/Types.h> |
| 8 | #include <Poco/UTF8Encoding.h> |
| 9 | #include <Poco/Unicode.h> |
| 10 | #include <Common/StringSearcher.h> |
| 11 | #include <Common/StringUtils/StringUtils.h> |
| 12 | #include <common/StringRef.h> |
| 13 | #include <common/unaligned.h> |
| 14 | |
| 15 | /** Search for a substring in a string by Volnitsky's algorithm |
| 16 | * http://volnitsky.com/project/str_search/ |
| 17 | * |
| 18 | * `haystack` and `needle` can contain zero bytes. |
| 19 | * |
| 20 | * Algorithm: |
| 21 | * - if the `needle` is too small or too large, or too small `haystack`, use std::search or memchr; |
| 22 | * - when initializing, fill in an open-addressing linear probing hash table of the form |
| 23 | * hash from the bigram of needle -> the position of this bigram in needle + 1. |
| 24 | * (one is added only to distinguish zero offset from an empty cell) |
| 25 | * - the keys are not stored in the hash table, only the values are stored; |
| 26 | * - bigrams can be inserted several times if they occur in the needle several times; |
| 27 | * - when searching, take from haystack bigram, which should correspond to the last bigram of needle (comparing from the end); |
| 28 | * - look for it in the hash table, if found - get the offset from the hash table and compare the string bytewise; |
| 29 | * - if it did not match, we check the next cell of the hash table from the collision resolution chain; |
| 30 | * - if not found, skip to haystack almost the size of the needle bytes; |
| 31 | * |
| 32 | * MultiVolnitsky - search for multiple substrings in a string: |
| 33 | * - Add bigrams to hash table with string index. Then the usual Volnitsky search is used. |
| 34 | * - We are adding while searching, limiting the number of fallback searchers and the total number of added bigrams |
| 35 | */ |
| 36 | |
| 37 | |
| 38 | namespace DB |
| 39 | { |
| 40 | namespace VolnitskyTraits |
| 41 | { |
| 42 | using Offset = UInt8; /// Offset in the needle. For the basic algorithm, the length of the needle must not be greater than 255. |
| 43 | using Id = UInt8; /// Index of the string (within the array of multiple needles), must not be greater than 255. |
| 44 | using Ngram = UInt16; /// n-gram (2 bytes). |
| 45 | |
| 46 | /** Fits into the L2 cache (of common Intel CPUs). |
| 47 | * This number is extremely good for compilers as it is numeric_limits<Uint16>::max() and there are optimizations with movzwl and other instructions with 2 bytes |
| 48 | */ |
| 49 | static constexpr size_t hash_size = 64 * 1024; |
| 50 | |
| 51 | /// min haystack size to use main algorithm instead of fallback |
| 52 | static constexpr size_t min_haystack_size_for_algorithm = 20000; |
| 53 | |
| 54 | static inline bool isFallbackNeedle(const size_t needle_size, size_t haystack_size_hint = 0) |
| 55 | { |
| 56 | return needle_size < 2 * sizeof(Ngram) || needle_size >= std::numeric_limits<Offset>::max() |
| 57 | || (haystack_size_hint && haystack_size_hint < min_haystack_size_for_algorithm); |
| 58 | } |
| 59 | |
| 60 | static inline Ngram toNGram(const UInt8 * const pos) { return unalignedLoad<Ngram>(pos); } |
| 61 | |
| 62 | template <typename Callback> |
| 63 | static inline void putNGramASCIICaseInsensitive(const UInt8 * const pos, const int offset, const Callback & putNGramBase) |
| 64 | { |
| 65 | struct Chars |
| 66 | { |
| 67 | UInt8 c0; |
| 68 | UInt8 c1; |
| 69 | }; |
| 70 | |
| 71 | union |
| 72 | { |
| 73 | Ngram n; |
| 74 | Chars chars; |
| 75 | }; |
| 76 | |
| 77 | n = toNGram(pos); |
| 78 | |
| 79 | const auto c0_al = isAlphaASCII(chars.c0); |
| 80 | const auto c1_al = isAlphaASCII(chars.c1); |
| 81 | |
| 82 | if (c0_al && c1_al) |
| 83 | { |
| 84 | /// 4 combinations: AB, aB, Ab, ab |
| 85 | putNGramBase(n, offset); |
| 86 | chars.c0 = alternateCaseIfAlphaASCII(chars.c0); |
| 87 | putNGramBase(n, offset); |
| 88 | chars.c1 = alternateCaseIfAlphaASCII(chars.c1); |
| 89 | putNGramBase(n, offset); |
| 90 | chars.c0 = alternateCaseIfAlphaASCII(chars.c0); |
| 91 | putNGramBase(n, offset); |
| 92 | } |
| 93 | else if (c0_al) |
| 94 | { |
| 95 | /// 2 combinations: A1, a1 |
| 96 | putNGramBase(n, offset); |
| 97 | chars.c0 = alternateCaseIfAlphaASCII(chars.c0); |
| 98 | putNGramBase(n, offset); |
| 99 | } |
| 100 | else if (c1_al) |
| 101 | { |
| 102 | /// 2 combinations: 0B, 0b |
| 103 | putNGramBase(n, offset); |
| 104 | chars.c1 = alternateCaseIfAlphaASCII(chars.c1); |
| 105 | putNGramBase(n, offset); |
| 106 | } |
| 107 | else |
| 108 | /// 1 combination: 01 |
| 109 | putNGramBase(n, offset); |
| 110 | } |
| 111 | |
| 112 | template <bool CaseSensitive, bool ASCII, typename Callback> |
| 113 | static inline void putNGram(const UInt8 * const pos, const int offset, [[maybe_unused]] const UInt8 * const begin, const Callback & putNGramBase) |
| 114 | { |
| 115 | if constexpr (CaseSensitive) |
| 116 | { |
| 117 | putNGramBase(toNGram(pos), offset); |
| 118 | } |
| 119 | else |
| 120 | { |
| 121 | if constexpr (ASCII) |
| 122 | { |
| 123 | putNGramASCIICaseInsensitive(pos, offset, putNGramBase); |
| 124 | } |
| 125 | else |
| 126 | { |
| 127 | struct Chars |
| 128 | { |
| 129 | UInt8 c0; |
| 130 | UInt8 c1; |
| 131 | }; |
| 132 | |
| 133 | union |
| 134 | { |
| 135 | VolnitskyTraits::Ngram n; |
| 136 | Chars chars; |
| 137 | }; |
| 138 | |
| 139 | n = toNGram(pos); |
| 140 | |
| 141 | if (isascii(chars.c0) && isascii(chars.c1)) |
| 142 | putNGramASCIICaseInsensitive(pos, offset, putNGramBase); |
| 143 | else |
| 144 | { |
| 145 | /** n-gram (in the case of n = 2) |
| 146 | * can be entirely located within one code point, |
| 147 | * or intersect with two code points. |
| 148 | * |
| 149 | * In the first case, you need to consider up to two alternatives - this code point in upper and lower case, |
| 150 | * and in the second case - up to four alternatives - fragments of two code points in all combinations of cases. |
| 151 | * |
| 152 | * It does not take into account the dependence of the case-transformation from the locale (for example - Turkish `Ii`) |
| 153 | * as well as composition / decomposition and other features. |
| 154 | * |
| 155 | * It also does not work if characters with lower and upper cases are represented by different number of bytes or code points. |
| 156 | */ |
| 157 | |
| 158 | using Seq = UInt8[6]; |
| 159 | |
| 160 | static const Poco::UTF8Encoding utf8; |
| 161 | |
| 162 | if (UTF8::isContinuationOctet(chars.c1)) |
| 163 | { |
| 164 | /// ngram is inside a sequence |
| 165 | auto seq_pos = pos; |
| 166 | UTF8::syncBackward(seq_pos, begin); |
| 167 | |
| 168 | const auto u32 = utf8.convert(seq_pos); |
| 169 | const auto l_u32 = Poco::Unicode::toLower(u32); |
| 170 | const auto u_u32 = Poco::Unicode::toUpper(u32); |
| 171 | |
| 172 | /// symbol is case-independent |
| 173 | if (l_u32 == u_u32) |
| 174 | putNGramBase(n, offset); |
| 175 | else |
| 176 | { |
| 177 | /// where is the given ngram in respect to the start of UTF-8 sequence? |
| 178 | const auto seq_ngram_offset = pos - seq_pos; |
| 179 | |
| 180 | Seq seq; |
| 181 | |
| 182 | /// put ngram for lowercase |
| 183 | utf8.convert(l_u32, seq, sizeof(seq)); |
| 184 | chars.c0 = seq[seq_ngram_offset]; |
| 185 | chars.c1 = seq[seq_ngram_offset + 1]; |
| 186 | putNGramBase(n, offset); |
| 187 | |
| 188 | /// put ngram for uppercase |
| 189 | utf8.convert(u_u32, seq, sizeof(seq)); |
| 190 | chars.c0 = seq[seq_ngram_offset]; //-V519 |
| 191 | chars.c1 = seq[seq_ngram_offset + 1]; //-V519 |
| 192 | putNGramBase(n, offset); |
| 193 | } |
| 194 | } |
| 195 | else |
| 196 | { |
| 197 | /// ngram is on the boundary of two sequences |
| 198 | /// first sequence may start before u_pos if it is not ASCII |
| 199 | auto first_seq_pos = pos; |
| 200 | UTF8::syncBackward(first_seq_pos, begin); |
| 201 | /// where is the given ngram in respect to the start of first UTF-8 sequence? |
| 202 | const auto seq_ngram_offset = pos - first_seq_pos; |
| 203 | |
| 204 | const auto first_u32 = utf8.convert(first_seq_pos); |
| 205 | const auto first_l_u32 = Poco::Unicode::toLower(first_u32); |
| 206 | const auto first_u_u32 = Poco::Unicode::toUpper(first_u32); |
| 207 | |
| 208 | /// second sequence always start immediately after u_pos |
| 209 | auto second_seq_pos = pos + 1; |
| 210 | |
| 211 | const auto second_u32 = utf8.convert(second_seq_pos); /// TODO This assumes valid UTF-8 or zero byte after needle. |
| 212 | const auto second_l_u32 = Poco::Unicode::toLower(second_u32); |
| 213 | const auto second_u_u32 = Poco::Unicode::toUpper(second_u32); |
| 214 | |
| 215 | /// both symbols are case-independent |
| 216 | if (first_l_u32 == first_u_u32 && second_l_u32 == second_u_u32) |
| 217 | { |
| 218 | putNGramBase(n, offset); |
| 219 | } |
| 220 | else if (first_l_u32 == first_u_u32) |
| 221 | { |
| 222 | /// first symbol is case-independent |
| 223 | Seq seq; |
| 224 | |
| 225 | /// put ngram for lowercase |
| 226 | utf8.convert(second_l_u32, seq, sizeof(seq)); |
| 227 | chars.c1 = seq[0]; |
| 228 | putNGramBase(n, offset); |
| 229 | |
| 230 | /// put ngram from uppercase, if it is different |
| 231 | utf8.convert(second_u_u32, seq, sizeof(seq)); |
| 232 | if (chars.c1 != seq[0]) |
| 233 | { |
| 234 | chars.c1 = seq[0]; |
| 235 | putNGramBase(n, offset); |
| 236 | } |
| 237 | } |
| 238 | else if (second_l_u32 == second_u_u32) |
| 239 | { |
| 240 | /// second symbol is case-independent |
| 241 | Seq seq; |
| 242 | |
| 243 | /// put ngram for lowercase |
| 244 | utf8.convert(first_l_u32, seq, sizeof(seq)); |
| 245 | chars.c0 = seq[seq_ngram_offset]; |
| 246 | putNGramBase(n, offset); |
| 247 | |
| 248 | /// put ngram for uppercase, if it is different |
| 249 | utf8.convert(first_u_u32, seq, sizeof(seq)); |
| 250 | if (chars.c0 != seq[seq_ngram_offset]) |
| 251 | { |
| 252 | chars.c0 = seq[seq_ngram_offset]; |
| 253 | putNGramBase(n, offset); |
| 254 | } |
| 255 | } |
| 256 | else |
| 257 | { |
| 258 | Seq first_l_seq; |
| 259 | Seq first_u_seq; |
| 260 | Seq second_l_seq; |
| 261 | Seq second_u_seq; |
| 262 | |
| 263 | utf8.convert(first_l_u32, first_l_seq, sizeof(first_l_seq)); |
| 264 | utf8.convert(first_u_u32, first_u_seq, sizeof(first_u_seq)); |
| 265 | utf8.convert(second_l_u32, second_l_seq, sizeof(second_l_seq)); |
| 266 | utf8.convert(second_u_u32, second_u_seq, sizeof(second_u_seq)); |
| 267 | |
| 268 | auto c0l = first_l_seq[seq_ngram_offset]; |
| 269 | auto c0u = first_u_seq[seq_ngram_offset]; |
| 270 | auto c1l = second_l_seq[0]; |
| 271 | auto c1u = second_u_seq[0]; |
| 272 | |
| 273 | /// ngram for ll |
| 274 | chars.c0 = c0l; |
| 275 | chars.c1 = c1l; |
| 276 | putNGramBase(n, offset); |
| 277 | |
| 278 | if (c0l != c0u) |
| 279 | { |
| 280 | /// ngram for Ul |
| 281 | chars.c0 = c0u; |
| 282 | chars.c1 = c1l; |
| 283 | putNGramBase(n, offset); |
| 284 | } |
| 285 | |
| 286 | if (c1l != c1u) |
| 287 | { |
| 288 | /// ngram for lU |
| 289 | chars.c0 = c0l; |
| 290 | chars.c1 = c1u; |
| 291 | putNGramBase(n, offset); |
| 292 | } |
| 293 | |
| 294 | if (c0l != c0u && c1l != c1u) |
| 295 | { |
| 296 | /// ngram for UU |
| 297 | chars.c0 = c0u; |
| 298 | chars.c1 = c1u; |
| 299 | putNGramBase(n, offset); |
| 300 | } |
| 301 | } |
| 302 | } |
| 303 | } |
| 304 | } |
| 305 | } |
| 306 | } |
| 307 | } |
| 308 | |
| 309 | |
| 310 | /// @todo store lowercase needle to speed up in case there are numerous occurrences of bigrams from needle in haystack |
| 311 | template <bool CaseSensitive, bool ASCII, typename FallbackSearcher> |
| 312 | class VolnitskyBase |
| 313 | { |
| 314 | protected: |
| 315 | const UInt8 * const needle; |
| 316 | const size_t needle_size; |
| 317 | const UInt8 * const needle_end = needle + needle_size; |
| 318 | /// For how long we move, if the n-gram from haystack is not found in the hash table. |
| 319 | const size_t step = needle_size - sizeof(VolnitskyTraits::Ngram) + 1; |
| 320 | |
| 321 | /** max needle length is 255, max distinct ngrams for case-sensitive is (255 - 1), case-insensitive is 4 * (255 - 1) |
| 322 | * storage of 64K ngrams (n = 2, 128 KB) should be large enough for both cases */ |
| 323 | VolnitskyTraits::Offset hash[VolnitskyTraits::hash_size]; /// Hash table. |
| 324 | |
| 325 | const bool fallback; /// Do we need to use the fallback algorithm. |
| 326 | |
| 327 | FallbackSearcher fallback_searcher; |
| 328 | |
| 329 | public: |
| 330 | using Searcher = FallbackSearcher; |
| 331 | |
| 332 | /** haystack_size_hint - the expected total size of the haystack for `search` calls. Optional (zero means unspecified). |
| 333 | * If you specify it small enough, the fallback algorithm will be used, |
| 334 | * since it is considered that it's useless to waste time initializing the hash table. |
| 335 | */ |
| 336 | VolnitskyBase(const char * const needle_, const size_t needle_size_, size_t haystack_size_hint = 0) |
| 337 | : needle{reinterpret_cast<const UInt8 *>(needle_)} |
| 338 | , needle_size{needle_size_} |
| 339 | , fallback{VolnitskyTraits::isFallbackNeedle(needle_size, haystack_size_hint)} |
| 340 | , fallback_searcher{needle_, needle_size} |
| 341 | { |
| 342 | if (fallback) |
| 343 | return; |
| 344 | |
| 345 | memset(hash, 0, sizeof(hash)); |
| 346 | |
| 347 | auto callback = [this](const VolnitskyTraits::Ngram ngram, const int offset) { return this->putNGramBase(ngram, offset); }; |
| 348 | /// ssize_t is used here because unsigned can't be used with condition like `i >= 0`, unsigned always >= 0 |
| 349 | /// And also adding from the end guarantees that we will find first occurence because we will lookup bigger offsets first. |
| 350 | for (auto i = static_cast<ssize_t>(needle_size - sizeof(VolnitskyTraits::Ngram)); i >= 0; --i) |
| 351 | VolnitskyTraits::putNGram<CaseSensitive, ASCII>(this->needle + i, i + 1, this->needle, callback); |
| 352 | } |
| 353 | |
| 354 | |
| 355 | /// If not found, the end of the haystack is returned. |
| 356 | const UInt8 * search(const UInt8 * const haystack, const size_t haystack_size) const |
| 357 | { |
| 358 | if (needle_size == 0) |
| 359 | return haystack; |
| 360 | |
| 361 | const auto haystack_end = haystack + haystack_size; |
| 362 | |
| 363 | if (fallback || haystack_size <= needle_size) |
| 364 | return fallback_searcher.search(haystack, haystack_end); |
| 365 | |
| 366 | /// Let's "apply" the needle to the haystack and compare the n-gram from the end of the needle. |
| 367 | const auto * pos = haystack + needle_size - sizeof(VolnitskyTraits::Ngram); |
| 368 | for (; pos <= haystack_end - needle_size; pos += step) |
| 369 | { |
| 370 | /// We look at all the cells of the hash table that can correspond to the n-gram from haystack. |
| 371 | for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num]; |
| 372 | cell_num = (cell_num + 1) % VolnitskyTraits::hash_size) |
| 373 | { |
| 374 | /// When found - compare bytewise, using the offset from the hash table. |
| 375 | const auto res = pos - (hash[cell_num] - 1); |
| 376 | |
| 377 | /// pointer in the code is always padded array so we can use pagesafe semantics |
| 378 | if (fallback_searcher.compare(haystack, haystack_end, res)) |
| 379 | return res; |
| 380 | } |
| 381 | } |
| 382 | |
| 383 | return fallback_searcher.search(pos - step + 1, haystack_end); |
| 384 | } |
| 385 | |
| 386 | const char * search(const char * haystack, size_t haystack_size) const |
| 387 | { |
| 388 | return reinterpret_cast<const char *>(search(reinterpret_cast<const UInt8 *>(haystack), haystack_size)); |
| 389 | } |
| 390 | |
| 391 | protected: |
| 392 | void putNGramBase(const VolnitskyTraits::Ngram ngram, const int offset) |
| 393 | { |
| 394 | /// Put the offset for the n-gram in the corresponding cell or the nearest free cell. |
| 395 | size_t cell_num = ngram % VolnitskyTraits::hash_size; |
| 396 | |
| 397 | while (hash[cell_num]) |
| 398 | cell_num = (cell_num + 1) % VolnitskyTraits::hash_size; /// Search for the next free cell. |
| 399 | |
| 400 | hash[cell_num] = offset; |
| 401 | } |
| 402 | }; |
| 403 | |
| 404 | |
| 405 | template <bool CaseSensitive, bool ASCII, typename FallbackSearcher> |
| 406 | class MultiVolnitskyBase |
| 407 | { |
| 408 | private: |
| 409 | /// needles and their offsets |
| 410 | const std::vector<StringRef> & needles; |
| 411 | |
| 412 | |
| 413 | /// fallback searchers |
| 414 | std::vector<size_t> fallback_needles; |
| 415 | std::vector<FallbackSearcher> fallback_searchers; |
| 416 | |
| 417 | /// because std::pair<> is not POD |
| 418 | struct OffsetId |
| 419 | { |
| 420 | VolnitskyTraits::Id id; |
| 421 | VolnitskyTraits::Offset off; |
| 422 | }; |
| 423 | |
| 424 | OffsetId hash[VolnitskyTraits::hash_size]; |
| 425 | |
| 426 | /// step for each bunch of strings |
| 427 | size_t step; |
| 428 | |
| 429 | /// last index of offsets that was not processed |
| 430 | size_t last; |
| 431 | |
| 432 | /// limit for adding to hashtable. In worst case with case insentive search, the table will be filled at most as half |
| 433 | static constexpr size_t small_limit = VolnitskyTraits::hash_size / 8; |
| 434 | |
| 435 | public: |
| 436 | MultiVolnitskyBase(const std::vector<StringRef> & needles_) : needles{needles_}, step{0}, last{0} |
| 437 | { |
| 438 | fallback_searchers.reserve(needles.size()); |
| 439 | } |
| 440 | |
| 441 | /** |
| 442 | * This function is needed to initialize hash table |
| 443 | * Returns `true` if there is nothing to initialize |
| 444 | * and `false` if we have something to initialize and initializes it. |
| 445 | * This function is a kind of fallback if there are many needles. |
| 446 | * We actually destroy the hash table and initialize it with uninitialized needles |
| 447 | * and search through the haystack again. |
| 448 | * The actual usage of this function is like this: |
| 449 | * while (hasMoreToSearch()) |
| 450 | * { |
| 451 | * search inside the haystack with the known needles |
| 452 | * } |
| 453 | */ |
| 454 | bool hasMoreToSearch() |
| 455 | { |
| 456 | if (last == needles.size()) |
| 457 | return false; |
| 458 | |
| 459 | memset(hash, 0, sizeof(hash)); |
| 460 | fallback_needles.clear(); |
| 461 | step = std::numeric_limits<size_t>::max(); |
| 462 | |
| 463 | size_t buf = 0; |
| 464 | size_t size = needles.size(); |
| 465 | |
| 466 | for (; last < size; ++last) |
| 467 | { |
| 468 | const char * cur_needle_data = needles[last].data; |
| 469 | const size_t cur_needle_size = needles[last].size; |
| 470 | |
| 471 | /// save the indices of fallback searchers |
| 472 | if (VolnitskyTraits::isFallbackNeedle(cur_needle_size)) |
| 473 | { |
| 474 | fallback_needles.push_back(last); |
| 475 | } |
| 476 | else |
| 477 | { |
| 478 | /// put all bigrams |
| 479 | auto callback = [this](const VolnitskyTraits::Ngram ngram, const int offset) |
| 480 | { |
| 481 | return this->putNGramBase(ngram, offset, this->last); |
| 482 | }; |
| 483 | |
| 484 | buf += cur_needle_size - sizeof(VolnitskyTraits::Ngram) + 1; |
| 485 | |
| 486 | /// this is the condition when we actually need to stop and start searching with known needles |
| 487 | if (buf > small_limit) |
| 488 | break; |
| 489 | |
| 490 | step = std::min(step, cur_needle_size - sizeof(VolnitskyTraits::Ngram) + 1); |
| 491 | for (auto i = static_cast<int>(cur_needle_size - sizeof(VolnitskyTraits::Ngram)); i >= 0; --i) |
| 492 | { |
| 493 | VolnitskyTraits::putNGram<CaseSensitive, ASCII>( |
| 494 | reinterpret_cast<const UInt8 *>(cur_needle_data) + i, |
| 495 | i + 1, |
| 496 | reinterpret_cast<const UInt8 *>(cur_needle_data), |
| 497 | callback); |
| 498 | } |
| 499 | } |
| 500 | fallback_searchers.emplace_back(cur_needle_data, cur_needle_size); |
| 501 | } |
| 502 | return true; |
| 503 | } |
| 504 | |
| 505 | inline bool searchOne(const UInt8 * haystack, const UInt8 * haystack_end) const |
| 506 | { |
| 507 | const size_t fallback_size = fallback_needles.size(); |
| 508 | for (size_t i = 0; i < fallback_size; ++i) |
| 509 | if (fallback_searchers[fallback_needles[i]].search(haystack, haystack_end) != haystack_end) |
| 510 | return true; |
| 511 | |
| 512 | /// check if we have one non empty volnitsky searcher |
| 513 | if (step != std::numeric_limits<size_t>::max()) |
| 514 | { |
| 515 | const auto * pos = haystack + step - sizeof(VolnitskyTraits::Ngram); |
| 516 | for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos += step) |
| 517 | { |
| 518 | for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num].off; |
| 519 | cell_num = (cell_num + 1) % VolnitskyTraits::hash_size) |
| 520 | { |
| 521 | if (pos >= haystack + hash[cell_num].off - 1) |
| 522 | { |
| 523 | const auto res = pos - (hash[cell_num].off - 1); |
| 524 | const size_t ind = hash[cell_num].id; |
| 525 | if (res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(haystack, haystack_end, res)) |
| 526 | return true; |
| 527 | } |
| 528 | } |
| 529 | } |
| 530 | } |
| 531 | return false; |
| 532 | } |
| 533 | |
| 534 | inline size_t searchOneFirstIndex(const UInt8 * haystack, const UInt8 * haystack_end) const |
| 535 | { |
| 536 | const size_t fallback_size = fallback_needles.size(); |
| 537 | |
| 538 | size_t ans = std::numeric_limits<size_t>::max(); |
| 539 | |
| 540 | for (size_t i = 0; i < fallback_size; ++i) |
| 541 | if (fallback_searchers[fallback_needles[i]].search(haystack, haystack_end) != haystack_end) |
| 542 | ans = std::min(ans, fallback_needles[i]); |
| 543 | |
| 544 | /// check if we have one non empty volnitsky searcher |
| 545 | if (step != std::numeric_limits<size_t>::max()) |
| 546 | { |
| 547 | const auto * pos = haystack + step - sizeof(VolnitskyTraits::Ngram); |
| 548 | for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos += step) |
| 549 | { |
| 550 | for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num].off; |
| 551 | cell_num = (cell_num + 1) % VolnitskyTraits::hash_size) |
| 552 | { |
| 553 | if (pos >= haystack + hash[cell_num].off - 1) |
| 554 | { |
| 555 | const auto res = pos - (hash[cell_num].off - 1); |
| 556 | const size_t ind = hash[cell_num].id; |
| 557 | if (res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(haystack, haystack_end, res)) |
| 558 | ans = std::min(ans, ind); |
| 559 | } |
| 560 | } |
| 561 | } |
| 562 | } |
| 563 | |
| 564 | /* |
| 565 | * if nothing was found, ans + 1 will be equal to zero and we can |
| 566 | * assign it into the result because we need to return the position starting with one |
| 567 | */ |
| 568 | return ans + 1; |
| 569 | } |
| 570 | |
| 571 | template <typename CountCharsCallback> |
| 572 | inline UInt64 searchOneFirstPosition(const UInt8 * haystack, const UInt8 * haystack_end, const CountCharsCallback & count_chars) const |
| 573 | { |
| 574 | const size_t fallback_size = fallback_needles.size(); |
| 575 | |
| 576 | UInt64 ans = std::numeric_limits<UInt64>::max(); |
| 577 | |
| 578 | for (size_t i = 0; i < fallback_size; ++i) |
| 579 | if (auto pos = fallback_searchers[fallback_needles[i]].search(haystack, haystack_end); pos != haystack_end) |
| 580 | ans = std::min<UInt64>(ans, pos - haystack); |
| 581 | |
| 582 | /// check if we have one non empty volnitsky searcher |
| 583 | if (step != std::numeric_limits<size_t>::max()) |
| 584 | { |
| 585 | const auto * pos = haystack + step - sizeof(VolnitskyTraits::Ngram); |
| 586 | for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos += step) |
| 587 | { |
| 588 | for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num].off; |
| 589 | cell_num = (cell_num + 1) % VolnitskyTraits::hash_size) |
| 590 | { |
| 591 | if (pos >= haystack + hash[cell_num].off - 1) |
| 592 | { |
| 593 | const auto res = pos - (hash[cell_num].off - 1); |
| 594 | const size_t ind = hash[cell_num].id; |
| 595 | if (res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(haystack, haystack_end, res)) |
| 596 | ans = std::min<UInt64>(ans, res - haystack); |
| 597 | } |
| 598 | } |
| 599 | } |
| 600 | } |
| 601 | if (ans == std::numeric_limits<UInt64>::max()) |
| 602 | return 0; |
| 603 | return count_chars(haystack, haystack + ans); |
| 604 | } |
| 605 | |
| 606 | template <typename CountCharsCallback, typename AnsType> |
| 607 | inline void searchOneAll(const UInt8 * haystack, const UInt8 * haystack_end, AnsType * ans, const CountCharsCallback & count_chars) const |
| 608 | { |
| 609 | const size_t fallback_size = fallback_needles.size(); |
| 610 | for (size_t i = 0; i < fallback_size; ++i) |
| 611 | { |
| 612 | const UInt8 * ptr = fallback_searchers[fallback_needles[i]].search(haystack, haystack_end); |
| 613 | if (ptr != haystack_end) |
| 614 | ans[fallback_needles[i]] = count_chars(haystack, ptr); |
| 615 | } |
| 616 | |
| 617 | /// check if we have one non empty volnitsky searcher |
| 618 | if (step != std::numeric_limits<size_t>::max()) |
| 619 | { |
| 620 | const auto * pos = haystack + step - sizeof(VolnitskyTraits::Ngram); |
| 621 | for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos += step) |
| 622 | { |
| 623 | for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num].off; |
| 624 | cell_num = (cell_num + 1) % VolnitskyTraits::hash_size) |
| 625 | { |
| 626 | if (pos >= haystack + hash[cell_num].off - 1) |
| 627 | { |
| 628 | const auto * res = pos - (hash[cell_num].off - 1); |
| 629 | const size_t ind = hash[cell_num].id; |
| 630 | if (ans[ind] == 0 && res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(haystack, haystack_end, res)) |
| 631 | ans[ind] = count_chars(haystack, res); |
| 632 | } |
| 633 | } |
| 634 | } |
| 635 | } |
| 636 | } |
| 637 | |
| 638 | void putNGramBase(const VolnitskyTraits::Ngram ngram, const int offset, const size_t num) |
| 639 | { |
| 640 | size_t cell_num = ngram % VolnitskyTraits::hash_size; |
| 641 | |
| 642 | while (hash[cell_num].off) |
| 643 | cell_num = (cell_num + 1) % VolnitskyTraits::hash_size; |
| 644 | |
| 645 | hash[cell_num] = {static_cast<VolnitskyTraits::Id>(num), static_cast<VolnitskyTraits::Offset>(offset)}; |
| 646 | } |
| 647 | }; |
| 648 | |
| 649 | |
| 650 | using Volnitsky = VolnitskyBase<true, true, ASCIICaseSensitiveStringSearcher>; |
| 651 | using VolnitskyUTF8 = VolnitskyBase<true, false, ASCIICaseSensitiveStringSearcher>; /// exactly same as Volnitsky |
| 652 | using VolnitskyCaseInsensitive = VolnitskyBase<false, true, ASCIICaseInsensitiveStringSearcher>; /// ignores non-ASCII bytes |
| 653 | using VolnitskyCaseInsensitiveUTF8 = VolnitskyBase<false, false, UTF8CaseInsensitiveStringSearcher>; |
| 654 | |
| 655 | using VolnitskyCaseSensitiveToken = VolnitskyBase<true, true, ASCIICaseSensitiveTokenSearcher>; |
| 656 | using VolnitskyCaseInsensitiveToken = VolnitskyBase<false, true, ASCIICaseInsensitiveTokenSearcher>; |
| 657 | |
| 658 | using MultiVolnitsky = MultiVolnitskyBase<true, true, ASCIICaseSensitiveStringSearcher>; |
| 659 | using MultiVolnitskyUTF8 = MultiVolnitskyBase<true, false, ASCIICaseSensitiveStringSearcher>; |
| 660 | using MultiVolnitskyCaseInsensitive = MultiVolnitskyBase<false, true, ASCIICaseInsensitiveStringSearcher>; |
| 661 | using MultiVolnitskyCaseInsensitiveUTF8 = MultiVolnitskyBase<false, false, UTF8CaseInsensitiveStringSearcher>; |
| 662 | |
| 663 | |
| 664 | } |
| 665 | |