| 1 | #include <Functions/FunctionsStringSimilarity.h> |
| 2 | #include <Functions/FunctionFactory.h> |
| 3 | #include <Functions/FunctionsHashing.h> |
| 4 | #include <Common/HashTable/ClearableHashMap.h> |
| 5 | #include <Common/HashTable/Hash.h> |
| 6 | #include <Common/UTF8Helpers.h> |
| 7 | |
| 8 | #include <Core/Defines.h> |
| 9 | |
| 10 | #include <common/unaligned.h> |
| 11 | |
| 12 | #include <algorithm> |
| 13 | #include <climits> |
| 14 | #include <cstring> |
| 15 | #include <limits> |
| 16 | #include <memory> |
| 17 | #include <utility> |
| 18 | |
| 19 | #ifdef __SSE4_2__ |
| 20 | # include <nmmintrin.h> |
| 21 | #endif |
| 22 | |
| 23 | namespace DB |
| 24 | { |
| 25 | /** Distance function implementation. |
| 26 | * We calculate all the n-grams from left string and count by the index of |
| 27 | * 16 bits hash of them in the map. |
| 28 | * Then calculate all the n-grams from the right string and calculate |
| 29 | * the n-gram distance on the flight by adding and subtracting from the hashmap. |
| 30 | * Then return the map into the condition of which it was after the left string |
| 31 | * calculation. If the right string size is big (more than 2**15 bytes), |
| 32 | * the strings are not similar at all and we return 1. |
| 33 | */ |
| 34 | template <size_t N, class CodePoint, bool UTF8, bool case_insensitive, bool symmetric> |
| 35 | struct NgramDistanceImpl |
| 36 | { |
| 37 | using ResultType = Float32; |
| 38 | |
| 39 | /// map_size for ngram difference. |
| 40 | static constexpr size_t map_size = 1u << 16; |
| 41 | |
| 42 | /// If the haystack size is bigger than this, behaviour is unspecified for this function. |
| 43 | static constexpr size_t max_string_size = 1u << 15; |
| 44 | |
| 45 | /// Default padding to read safely. |
| 46 | static constexpr size_t default_padding = 16; |
| 47 | |
| 48 | /// Max codepoints to store at once. 16 is for batching usage and PODArray has this padding. |
| 49 | static constexpr size_t simultaneously_codepoints_num = default_padding + N - 1; |
| 50 | |
| 51 | /** This fits mostly in L2 cache all the time. |
| 52 | * Actually use UInt16 as addings and subtractions do not UB overflow. But think of it as a signed |
| 53 | * integer array. |
| 54 | */ |
| 55 | using NgramStats = UInt16[map_size]; |
| 56 | |
| 57 | static ALWAYS_INLINE UInt16 ASCIIHash(const CodePoint * code_points) |
| 58 | { |
| 59 | return intHashCRC32(unalignedLoad<UInt32>(code_points)) & 0xFFFFu; |
| 60 | } |
| 61 | |
| 62 | static ALWAYS_INLINE UInt16 UTF8Hash(const CodePoint * code_points) |
| 63 | { |
| 64 | UInt64 combined = (static_cast<UInt64>(code_points[0]) << 32) | code_points[1]; |
| 65 | #ifdef __SSE4_2__ |
| 66 | return _mm_crc32_u64(code_points[2], combined) & 0xFFFFu; |
| 67 | #else |
| 68 | return (intHashCRC32(combined) ^ intHashCRC32(code_points[2])) & 0xFFFFu; |
| 69 | #endif |
| 70 | } |
| 71 | |
| 72 | template <size_t Offset, class Container, size_t... I> |
| 73 | static ALWAYS_INLINE inline void unrollLowering(Container & cont, const std::index_sequence<I...> &) |
| 74 | { |
| 75 | ((cont[Offset + I] = std::tolower(cont[Offset + I])), ...); |
| 76 | } |
| 77 | |
| 78 | static ALWAYS_INLINE size_t readASCIICodePoints(CodePoint * code_points, const char *& pos, const char * end) |
| 79 | { |
| 80 | /// Offset before which we copy some data. |
| 81 | constexpr size_t padding_offset = default_padding - N + 1; |
| 82 | /// We have an array like this for ASCII (N == 4, other cases are similar) |
| 83 | /// |a0|a1|a2|a3|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18| |
| 84 | /// And we copy ^^^^^^^^^^^^^^^ these bytes to the start |
| 85 | /// Actually it is enough to copy 3 bytes, but memcpy for 4 bytes translates into 1 instruction |
| 86 | memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(CodePoint)); |
| 87 | /// Now we have an array |
| 88 | /// |a13|a14|a15|a16|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18| |
| 89 | /// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| 90 | /// Doing unaligned read of 16 bytes and copy them like above |
| 91 | /// 16 is also chosen to do two `movups`. |
| 92 | /// Such copying allow us to have 3 codepoints from the previous read to produce the 4-grams with them. |
| 93 | memcpy(code_points + (N - 1), pos, default_padding * sizeof(CodePoint)); |
| 94 | |
| 95 | if constexpr (case_insensitive) |
| 96 | { |
| 97 | /// We really need template lambdas with C++20 to do it inline |
| 98 | unrollLowering<N - 1>(code_points, std::make_index_sequence<padding_offset>()); |
| 99 | } |
| 100 | pos += padding_offset; |
| 101 | if (pos > end) |
| 102 | return default_padding - (pos - end); |
| 103 | return default_padding; |
| 104 | } |
| 105 | |
| 106 | static ALWAYS_INLINE size_t readUTF8CodePoints(CodePoint * code_points, const char *& pos, const char * end) |
| 107 | { |
| 108 | /// The same copying as described in the function above. |
| 109 | memcpy(code_points, code_points + default_padding - N + 1, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(CodePoint)); |
| 110 | |
| 111 | size_t num = N - 1; |
| 112 | while (num < default_padding && pos < end) |
| 113 | { |
| 114 | size_t length = UTF8::seqLength(*pos); |
| 115 | |
| 116 | if (pos + length > end) |
| 117 | length = end - pos; |
| 118 | |
| 119 | CodePoint res; |
| 120 | /// This is faster than just memcpy because of compiler optimizations with moving bytes. |
| 121 | switch (length) |
| 122 | { |
| 123 | case 1: |
| 124 | res = 0; |
| 125 | memcpy(&res, pos, 1); |
| 126 | break; |
| 127 | case 2: |
| 128 | res = 0; |
| 129 | memcpy(&res, pos, 2); |
| 130 | break; |
| 131 | case 3: |
| 132 | res = 0; |
| 133 | memcpy(&res, pos, 3); |
| 134 | break; |
| 135 | default: |
| 136 | memcpy(&res, pos, 4); |
| 137 | } |
| 138 | |
| 139 | /// This is not a really true case insensitive utf8. We zero the 5-th bit of every byte. |
| 140 | /// And first bit of first byte if there are two bytes. |
| 141 | /// For ASCII it works https://catonmat.net/ascii-case-conversion-trick. For most cyrrilic letters also does. |
| 142 | /// For others, we don't care now. Lowering UTF is not a cheap operation. |
| 143 | if constexpr (case_insensitive) |
| 144 | { |
| 145 | switch (length) |
| 146 | { |
| 147 | case 4: |
| 148 | res &= ~(1u << (5 + 3 * CHAR_BIT)); |
| 149 | [[fallthrough]]; |
| 150 | case 3: |
| 151 | res &= ~(1u << (5 + 2 * CHAR_BIT)); |
| 152 | [[fallthrough]]; |
| 153 | case 2: |
| 154 | res &= ~(1u); |
| 155 | res &= ~(1u << (5 + CHAR_BIT)); |
| 156 | [[fallthrough]]; |
| 157 | default: |
| 158 | res &= ~(1u << 5); |
| 159 | } |
| 160 | } |
| 161 | |
| 162 | pos += length; |
| 163 | code_points[num++] = res; |
| 164 | } |
| 165 | return num; |
| 166 | } |
| 167 | |
| 168 | template <bool save_ngrams> |
| 169 | static ALWAYS_INLINE inline size_t calculateNeedleStats( |
| 170 | const char * data, |
| 171 | const size_t size, |
| 172 | NgramStats & ngram_stats, |
| 173 | [[maybe_unused]] UInt16 * ngram_storage, |
| 174 | size_t (*read_code_points)(CodePoint *, const char *&, const char *), |
| 175 | UInt16 (*hash_functor)(const CodePoint *)) |
| 176 | { |
| 177 | const char * start = data; |
| 178 | const char * end = data + size; |
| 179 | CodePoint cp[simultaneously_codepoints_num] = {}; |
| 180 | /// read_code_points returns the position of cp where it stopped reading codepoints. |
| 181 | size_t found = read_code_points(cp, start, end); |
| 182 | /// We need to start for the first time here, because first N - 1 codepoints mean nothing. |
| 183 | size_t i = N - 1; |
| 184 | size_t len = 0; |
| 185 | do |
| 186 | { |
| 187 | for (; i + N <= found; ++i) |
| 188 | { |
| 189 | ++len; |
| 190 | UInt16 hash = hash_functor(cp + i); |
| 191 | if constexpr (save_ngrams) |
| 192 | *ngram_storage++ = hash; |
| 193 | ++ngram_stats[hash]; |
| 194 | } |
| 195 | i = 0; |
| 196 | } while (start < end && (found = read_code_points(cp, start, end))); |
| 197 | |
| 198 | return len; |
| 199 | } |
| 200 | |
| 201 | template <bool reuse_stats> |
| 202 | static ALWAYS_INLINE inline UInt64 calculateHaystackStatsAndMetric( |
| 203 | const char * data, |
| 204 | const size_t size, |
| 205 | NgramStats & ngram_stats, |
| 206 | size_t & distance, |
| 207 | [[maybe_unused]] UInt16 * ngram_storage, |
| 208 | size_t (*read_code_points)(CodePoint *, const char *&, const char *), |
| 209 | UInt16 (*hash_functor)(const CodePoint *)) |
| 210 | { |
| 211 | size_t ngram_cnt = 0; |
| 212 | const char * start = data; |
| 213 | const char * end = data + size; |
| 214 | CodePoint cp[simultaneously_codepoints_num] = {}; |
| 215 | |
| 216 | /// read_code_points returns the position of cp where it stopped reading codepoints. |
| 217 | size_t found = read_code_points(cp, start, end); |
| 218 | /// We need to start for the first time here, because first N - 1 codepoints mean nothing. |
| 219 | size_t iter = N - 1; |
| 220 | |
| 221 | do |
| 222 | { |
| 223 | for (; iter + N <= found; ++iter) |
| 224 | { |
| 225 | UInt16 hash = hash_functor(cp + iter); |
| 226 | /// For symmetric version we should add when we can't subtract to get symmetric difference. |
| 227 | if (static_cast<Int16>(ngram_stats[hash]) > 0) |
| 228 | --distance; |
| 229 | else if constexpr (symmetric) |
| 230 | ++distance; |
| 231 | if constexpr (reuse_stats) |
| 232 | ngram_storage[ngram_cnt] = hash; |
| 233 | ++ngram_cnt; |
| 234 | --ngram_stats[hash]; |
| 235 | } |
| 236 | iter = 0; |
| 237 | } while (start < end && (found = read_code_points(cp, start, end))); |
| 238 | |
| 239 | /// Return the state of hash map to its initial. |
| 240 | if constexpr (reuse_stats) |
| 241 | { |
| 242 | for (size_t i = 0; i < ngram_cnt; ++i) |
| 243 | ++ngram_stats[ngram_storage[i]]; |
| 244 | } |
| 245 | return ngram_cnt; |
| 246 | } |
| 247 | |
| 248 | template <class Callback, class... Args> |
| 249 | static inline auto dispatchSearcher(Callback callback, Args &&... args) |
| 250 | { |
| 251 | if constexpr (!UTF8) |
| 252 | return callback(std::forward<Args>(args)..., readASCIICodePoints, ASCIIHash); |
| 253 | else |
| 254 | return callback(std::forward<Args>(args)..., readUTF8CodePoints, UTF8Hash); |
| 255 | } |
| 256 | |
| 257 | static void constant_constant(std::string data, std::string needle, Float32 & res) |
| 258 | { |
| 259 | NgramStats common_stats = {}; |
| 260 | |
| 261 | /// We use unsafe versions of getting ngrams, so I decided to use padded strings. |
| 262 | const size_t needle_size = needle.size(); |
| 263 | const size_t data_size = data.size(); |
| 264 | needle.resize(needle_size + default_padding); |
| 265 | data.resize(data_size + default_padding); |
| 266 | |
| 267 | size_t second_size = dispatchSearcher(calculateNeedleStats<false>, needle.data(), needle_size, common_stats, nullptr); |
| 268 | size_t distance = second_size; |
| 269 | if (data_size <= max_string_size) |
| 270 | { |
| 271 | size_t first_size = dispatchSearcher(calculateHaystackStatsAndMetric<false>, data.data(), data_size, common_stats, distance, nullptr); |
| 272 | /// For !symmetric version we should not use first_size. |
| 273 | if constexpr (symmetric) |
| 274 | res = distance * 1.f / std::max(first_size + second_size, size_t(1)); |
| 275 | else |
| 276 | res = 1.f - distance * 1.f / std::max(second_size, size_t(1)); |
| 277 | } |
| 278 | else |
| 279 | { |
| 280 | if constexpr (symmetric) |
| 281 | res = 1.f; |
| 282 | else |
| 283 | res = 0.f; |
| 284 | } |
| 285 | } |
| 286 | |
| 287 | static void vector_vector( |
| 288 | const ColumnString::Chars & haystack_data, |
| 289 | const ColumnString::Offsets & haystack_offsets, |
| 290 | const ColumnString::Chars & needle_data, |
| 291 | const ColumnString::Offsets & needle_offsets, |
| 292 | PaddedPODArray<Float32> & res) |
| 293 | { |
| 294 | const size_t haystack_offsets_size = haystack_offsets.size(); |
| 295 | size_t prev_haystack_offset = 0; |
| 296 | size_t prev_needle_offset = 0; |
| 297 | |
| 298 | NgramStats common_stats = {}; |
| 299 | |
| 300 | /// The main motivation is to not allocate more on stack because we have already allocated a lot (128Kb). |
| 301 | /// And we can reuse these storages in one thread because we care only about what was written to first places. |
| 302 | std::unique_ptr<UInt16[]> needle_ngram_storage(new UInt16[max_string_size]); |
| 303 | std::unique_ptr<UInt16[]> haystack_ngram_storage(new UInt16[max_string_size]); |
| 304 | |
| 305 | for (size_t i = 0; i < haystack_offsets_size; ++i) |
| 306 | { |
| 307 | const char * haystack = reinterpret_cast<const char *>(&haystack_data[prev_haystack_offset]); |
| 308 | const size_t haystack_size = haystack_offsets[i] - prev_haystack_offset - 1; |
| 309 | const char * needle = reinterpret_cast<const char *>(&needle_data[prev_needle_offset]); |
| 310 | const size_t needle_size = needle_offsets[i] - prev_needle_offset - 1; |
| 311 | |
| 312 | if (needle_size <= max_string_size && haystack_size <= max_string_size) |
| 313 | { |
| 314 | /// Get needle stats. |
| 315 | const size_t needle_stats_size = dispatchSearcher( |
| 316 | calculateNeedleStats<true>, |
| 317 | needle, |
| 318 | needle_size, |
| 319 | common_stats, |
| 320 | needle_ngram_storage.get()); |
| 321 | |
| 322 | size_t distance = needle_stats_size; |
| 323 | |
| 324 | /// Combine with haystack stats, return to initial needle stats. |
| 325 | const size_t haystack_stats_size = dispatchSearcher( |
| 326 | calculateHaystackStatsAndMetric<true>, |
| 327 | haystack, |
| 328 | haystack_size, |
| 329 | common_stats, |
| 330 | distance, |
| 331 | haystack_ngram_storage.get()); |
| 332 | |
| 333 | /// Return to zero array stats. |
| 334 | for (size_t j = 0; j < needle_stats_size; ++j) |
| 335 | --common_stats[needle_ngram_storage[j]]; |
| 336 | |
| 337 | /// For now, common stats is a zero array. |
| 338 | |
| 339 | |
| 340 | /// For !symmetric version we should not use haystack_stats_size. |
| 341 | if constexpr (symmetric) |
| 342 | res[i] = distance * 1.f / std::max(haystack_stats_size + needle_stats_size, size_t(1)); |
| 343 | else |
| 344 | res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, size_t(1)); |
| 345 | } |
| 346 | else |
| 347 | { |
| 348 | /// Strings are too big, we are assuming they are not the same. This is done because of limiting number |
| 349 | /// of bigrams added and not allocating too much memory. |
| 350 | if constexpr (symmetric) |
| 351 | res[i] = 1.f; |
| 352 | else |
| 353 | res[i] = 0.f; |
| 354 | } |
| 355 | |
| 356 | prev_needle_offset = needle_offsets[i]; |
| 357 | prev_haystack_offset = haystack_offsets[i]; |
| 358 | } |
| 359 | } |
| 360 | |
| 361 | static void constant_vector( |
| 362 | std::string haystack, |
| 363 | const ColumnString::Chars & needle_data, |
| 364 | const ColumnString::Offsets & needle_offsets, |
| 365 | PaddedPODArray<Float32> & res) |
| 366 | { |
| 367 | /// For symmetric version it is better to use vector_constant |
| 368 | if constexpr (symmetric) |
| 369 | { |
| 370 | vector_constant(needle_data, needle_offsets, std::move(haystack), res); |
| 371 | } |
| 372 | else |
| 373 | { |
| 374 | const size_t haystack_size = haystack.size(); |
| 375 | haystack.resize(haystack_size + default_padding); |
| 376 | |
| 377 | /// For logic explanation see vector_vector function. |
| 378 | const size_t needle_offsets_size = needle_offsets.size(); |
| 379 | size_t prev_offset = 0; |
| 380 | |
| 381 | NgramStats common_stats = {}; |
| 382 | |
| 383 | std::unique_ptr<UInt16[]> needle_ngram_storage(new UInt16[max_string_size]); |
| 384 | std::unique_ptr<UInt16[]> haystack_ngram_storage(new UInt16[max_string_size]); |
| 385 | |
| 386 | for (size_t i = 0; i < needle_offsets_size; ++i) |
| 387 | { |
| 388 | const char * needle = reinterpret_cast<const char *>(&needle_data[prev_offset]); |
| 389 | const size_t needle_size = needle_offsets[i] - prev_offset - 1; |
| 390 | |
| 391 | if (needle_size <= max_string_size && haystack_size <= max_string_size) |
| 392 | { |
| 393 | const size_t needle_stats_size = dispatchSearcher( |
| 394 | calculateNeedleStats<true>, |
| 395 | needle, |
| 396 | needle_size, |
| 397 | common_stats, |
| 398 | needle_ngram_storage.get()); |
| 399 | |
| 400 | size_t distance = needle_stats_size; |
| 401 | |
| 402 | dispatchSearcher( |
| 403 | calculateHaystackStatsAndMetric<true>, |
| 404 | haystack.data(), |
| 405 | haystack_size, |
| 406 | common_stats, |
| 407 | distance, |
| 408 | haystack_ngram_storage.get()); |
| 409 | |
| 410 | for (size_t j = 0; j < needle_stats_size; ++j) |
| 411 | --common_stats[needle_ngram_storage[j]]; |
| 412 | |
| 413 | res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, size_t(1)); |
| 414 | } |
| 415 | else |
| 416 | { |
| 417 | res[i] = 0.f; |
| 418 | } |
| 419 | |
| 420 | prev_offset = needle_offsets[i]; |
| 421 | } |
| 422 | |
| 423 | } |
| 424 | } |
| 425 | |
| 426 | static void vector_constant( |
| 427 | const ColumnString::Chars & data, |
| 428 | const ColumnString::Offsets & offsets, |
| 429 | std::string needle, |
| 430 | PaddedPODArray<Float32> & res) |
| 431 | { |
| 432 | /// zeroing our map |
| 433 | NgramStats common_stats = {}; |
| 434 | |
| 435 | /// The main motivation is to not allocate more on stack because we have already allocated a lot (128Kb). |
| 436 | /// And we can reuse these storages in one thread because we care only about what was written to first places. |
| 437 | std::unique_ptr<UInt16[]> ngram_storage(new UInt16[max_string_size]); |
| 438 | |
| 439 | /// We use unsafe versions of getting ngrams, so I decided to use padded_data even in needle case. |
| 440 | const size_t needle_size = needle.size(); |
| 441 | needle.resize(needle_size + default_padding); |
| 442 | |
| 443 | const size_t needle_stats_size = dispatchSearcher(calculateNeedleStats<false>, needle.data(), needle_size, common_stats, nullptr); |
| 444 | |
| 445 | size_t distance = needle_stats_size; |
| 446 | size_t prev_offset = 0; |
| 447 | for (size_t i = 0; i < offsets.size(); ++i) |
| 448 | { |
| 449 | const UInt8 * haystack = &data[prev_offset]; |
| 450 | const size_t haystack_size = offsets[i] - prev_offset - 1; |
| 451 | if (haystack_size <= max_string_size) |
| 452 | { |
| 453 | size_t haystack_stats_size = dispatchSearcher( |
| 454 | calculateHaystackStatsAndMetric<true>, |
| 455 | reinterpret_cast<const char *>(haystack), |
| 456 | haystack_size, common_stats, |
| 457 | distance, |
| 458 | ngram_storage.get()); |
| 459 | /// For !symmetric version we should not use haystack_stats_size. |
| 460 | if constexpr (symmetric) |
| 461 | res[i] = distance * 1.f / std::max(haystack_stats_size + needle_stats_size, size_t(1)); |
| 462 | else |
| 463 | res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, size_t(1)); |
| 464 | } |
| 465 | else |
| 466 | { |
| 467 | /// if the strings are too big, we say they are completely not the same |
| 468 | if constexpr (symmetric) |
| 469 | res[i] = 1.f; |
| 470 | else |
| 471 | res[i] = 0.f; |
| 472 | } |
| 473 | distance = needle_stats_size; |
| 474 | prev_offset = offsets[i]; |
| 475 | } |
| 476 | } |
| 477 | }; |
| 478 | |
| 479 | |
| 480 | struct NameNgramDistance |
| 481 | { |
| 482 | static constexpr auto name = "ngramDistance" ; |
| 483 | }; |
| 484 | struct NameNgramDistanceCaseInsensitive |
| 485 | { |
| 486 | static constexpr auto name = "ngramDistanceCaseInsensitive" ; |
| 487 | }; |
| 488 | |
| 489 | struct NameNgramDistanceUTF8 |
| 490 | { |
| 491 | static constexpr auto name = "ngramDistanceUTF8" ; |
| 492 | }; |
| 493 | |
| 494 | struct NameNgramDistanceUTF8CaseInsensitive |
| 495 | { |
| 496 | static constexpr auto name = "ngramDistanceCaseInsensitiveUTF8" ; |
| 497 | }; |
| 498 | |
| 499 | struct NameNgramSearch |
| 500 | { |
| 501 | static constexpr auto name = "ngramSearch" ; |
| 502 | }; |
| 503 | struct NameNgramSearchCaseInsensitive |
| 504 | { |
| 505 | static constexpr auto name = "ngramSearchCaseInsensitive" ; |
| 506 | }; |
| 507 | struct NameNgramSearchUTF8 |
| 508 | { |
| 509 | static constexpr auto name = "ngramSearchUTF8" ; |
| 510 | }; |
| 511 | |
| 512 | struct NameNgramSearchUTF8CaseInsensitive |
| 513 | { |
| 514 | static constexpr auto name = "ngramSearchCaseInsensitiveUTF8" ; |
| 515 | }; |
| 516 | |
| 517 | using FunctionNgramDistance = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, false, true>, NameNgramDistance>; |
| 518 | using FunctionNgramDistanceCaseInsensitive = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, true, true>, NameNgramDistanceCaseInsensitive>; |
| 519 | using FunctionNgramDistanceUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, false, true>, NameNgramDistanceUTF8>; |
| 520 | using FunctionNgramDistanceCaseInsensitiveUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, true, true>, NameNgramDistanceUTF8CaseInsensitive>; |
| 521 | |
| 522 | using FunctionNgramSearch = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, false, false>, NameNgramSearch>; |
| 523 | using FunctionNgramSearchCaseInsensitive = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, true, false>, NameNgramSearchCaseInsensitive>; |
| 524 | using FunctionNgramSearchUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, false, false>, NameNgramSearchUTF8>; |
| 525 | using FunctionNgramSearchCaseInsensitiveUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, true, false>, NameNgramSearchUTF8CaseInsensitive>; |
| 526 | |
| 527 | |
| 528 | void registerFunctionsStringSimilarity(FunctionFactory & factory) |
| 529 | { |
| 530 | factory.registerFunction<FunctionNgramDistance>(); |
| 531 | factory.registerFunction<FunctionNgramDistanceCaseInsensitive>(); |
| 532 | factory.registerFunction<FunctionNgramDistanceUTF8>(); |
| 533 | factory.registerFunction<FunctionNgramDistanceCaseInsensitiveUTF8>(); |
| 534 | |
| 535 | factory.registerFunction<FunctionNgramSearch>(); |
| 536 | factory.registerFunction<FunctionNgramSearchCaseInsensitive>(); |
| 537 | factory.registerFunction<FunctionNgramSearchUTF8>(); |
| 538 | factory.registerFunction<FunctionNgramSearchCaseInsensitiveUTF8>(); |
| 539 | } |
| 540 | |
| 541 | } |
| 542 | |