1 | #include <Functions/FunctionsStringSimilarity.h> |
2 | #include <Functions/FunctionFactory.h> |
3 | #include <Functions/FunctionsHashing.h> |
4 | #include <Common/HashTable/ClearableHashMap.h> |
5 | #include <Common/HashTable/Hash.h> |
6 | #include <Common/UTF8Helpers.h> |
7 | |
8 | #include <Core/Defines.h> |
9 | |
10 | #include <common/unaligned.h> |
11 | |
12 | #include <algorithm> |
13 | #include <climits> |
14 | #include <cstring> |
15 | #include <limits> |
16 | #include <memory> |
17 | #include <utility> |
18 | |
19 | #ifdef __SSE4_2__ |
20 | # include <nmmintrin.h> |
21 | #endif |
22 | |
23 | namespace DB |
24 | { |
25 | /** Distance function implementation. |
26 | * We calculate all the n-grams from left string and count by the index of |
27 | * 16 bits hash of them in the map. |
28 | * Then calculate all the n-grams from the right string and calculate |
29 | * the n-gram distance on the flight by adding and subtracting from the hashmap. |
30 | * Then return the map into the condition of which it was after the left string |
31 | * calculation. If the right string size is big (more than 2**15 bytes), |
32 | * the strings are not similar at all and we return 1. |
33 | */ |
34 | template <size_t N, class CodePoint, bool UTF8, bool case_insensitive, bool symmetric> |
35 | struct NgramDistanceImpl |
36 | { |
37 | using ResultType = Float32; |
38 | |
39 | /// map_size for ngram difference. |
40 | static constexpr size_t map_size = 1u << 16; |
41 | |
42 | /// If the haystack size is bigger than this, behaviour is unspecified for this function. |
43 | static constexpr size_t max_string_size = 1u << 15; |
44 | |
45 | /// Default padding to read safely. |
46 | static constexpr size_t default_padding = 16; |
47 | |
48 | /// Max codepoints to store at once. 16 is for batching usage and PODArray has this padding. |
49 | static constexpr size_t simultaneously_codepoints_num = default_padding + N - 1; |
50 | |
51 | /** This fits mostly in L2 cache all the time. |
52 | * Actually use UInt16 as addings and subtractions do not UB overflow. But think of it as a signed |
53 | * integer array. |
54 | */ |
55 | using NgramStats = UInt16[map_size]; |
56 | |
57 | static ALWAYS_INLINE UInt16 ASCIIHash(const CodePoint * code_points) |
58 | { |
59 | return intHashCRC32(unalignedLoad<UInt32>(code_points)) & 0xFFFFu; |
60 | } |
61 | |
62 | static ALWAYS_INLINE UInt16 UTF8Hash(const CodePoint * code_points) |
63 | { |
64 | UInt64 combined = (static_cast<UInt64>(code_points[0]) << 32) | code_points[1]; |
65 | #ifdef __SSE4_2__ |
66 | return _mm_crc32_u64(code_points[2], combined) & 0xFFFFu; |
67 | #else |
68 | return (intHashCRC32(combined) ^ intHashCRC32(code_points[2])) & 0xFFFFu; |
69 | #endif |
70 | } |
71 | |
72 | template <size_t Offset, class Container, size_t... I> |
73 | static ALWAYS_INLINE inline void unrollLowering(Container & cont, const std::index_sequence<I...> &) |
74 | { |
75 | ((cont[Offset + I] = std::tolower(cont[Offset + I])), ...); |
76 | } |
77 | |
78 | static ALWAYS_INLINE size_t readASCIICodePoints(CodePoint * code_points, const char *& pos, const char * end) |
79 | { |
80 | /// Offset before which we copy some data. |
81 | constexpr size_t padding_offset = default_padding - N + 1; |
82 | /// We have an array like this for ASCII (N == 4, other cases are similar) |
83 | /// |a0|a1|a2|a3|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18| |
84 | /// And we copy ^^^^^^^^^^^^^^^ these bytes to the start |
85 | /// Actually it is enough to copy 3 bytes, but memcpy for 4 bytes translates into 1 instruction |
86 | memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(CodePoint)); |
87 | /// Now we have an array |
88 | /// |a13|a14|a15|a16|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18| |
89 | /// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
90 | /// Doing unaligned read of 16 bytes and copy them like above |
91 | /// 16 is also chosen to do two `movups`. |
92 | /// Such copying allow us to have 3 codepoints from the previous read to produce the 4-grams with them. |
93 | memcpy(code_points + (N - 1), pos, default_padding * sizeof(CodePoint)); |
94 | |
95 | if constexpr (case_insensitive) |
96 | { |
97 | /// We really need template lambdas with C++20 to do it inline |
98 | unrollLowering<N - 1>(code_points, std::make_index_sequence<padding_offset>()); |
99 | } |
100 | pos += padding_offset; |
101 | if (pos > end) |
102 | return default_padding - (pos - end); |
103 | return default_padding; |
104 | } |
105 | |
106 | static ALWAYS_INLINE size_t readUTF8CodePoints(CodePoint * code_points, const char *& pos, const char * end) |
107 | { |
108 | /// The same copying as described in the function above. |
109 | memcpy(code_points, code_points + default_padding - N + 1, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(CodePoint)); |
110 | |
111 | size_t num = N - 1; |
112 | while (num < default_padding && pos < end) |
113 | { |
114 | size_t length = UTF8::seqLength(*pos); |
115 | |
116 | if (pos + length > end) |
117 | length = end - pos; |
118 | |
119 | CodePoint res; |
120 | /// This is faster than just memcpy because of compiler optimizations with moving bytes. |
121 | switch (length) |
122 | { |
123 | case 1: |
124 | res = 0; |
125 | memcpy(&res, pos, 1); |
126 | break; |
127 | case 2: |
128 | res = 0; |
129 | memcpy(&res, pos, 2); |
130 | break; |
131 | case 3: |
132 | res = 0; |
133 | memcpy(&res, pos, 3); |
134 | break; |
135 | default: |
136 | memcpy(&res, pos, 4); |
137 | } |
138 | |
139 | /// This is not a really true case insensitive utf8. We zero the 5-th bit of every byte. |
140 | /// And first bit of first byte if there are two bytes. |
141 | /// For ASCII it works https://catonmat.net/ascii-case-conversion-trick. For most cyrrilic letters also does. |
142 | /// For others, we don't care now. Lowering UTF is not a cheap operation. |
143 | if constexpr (case_insensitive) |
144 | { |
145 | switch (length) |
146 | { |
147 | case 4: |
148 | res &= ~(1u << (5 + 3 * CHAR_BIT)); |
149 | [[fallthrough]]; |
150 | case 3: |
151 | res &= ~(1u << (5 + 2 * CHAR_BIT)); |
152 | [[fallthrough]]; |
153 | case 2: |
154 | res &= ~(1u); |
155 | res &= ~(1u << (5 + CHAR_BIT)); |
156 | [[fallthrough]]; |
157 | default: |
158 | res &= ~(1u << 5); |
159 | } |
160 | } |
161 | |
162 | pos += length; |
163 | code_points[num++] = res; |
164 | } |
165 | return num; |
166 | } |
167 | |
168 | template <bool save_ngrams> |
169 | static ALWAYS_INLINE inline size_t calculateNeedleStats( |
170 | const char * data, |
171 | const size_t size, |
172 | NgramStats & ngram_stats, |
173 | [[maybe_unused]] UInt16 * ngram_storage, |
174 | size_t (*read_code_points)(CodePoint *, const char *&, const char *), |
175 | UInt16 (*hash_functor)(const CodePoint *)) |
176 | { |
177 | const char * start = data; |
178 | const char * end = data + size; |
179 | CodePoint cp[simultaneously_codepoints_num] = {}; |
180 | /// read_code_points returns the position of cp where it stopped reading codepoints. |
181 | size_t found = read_code_points(cp, start, end); |
182 | /// We need to start for the first time here, because first N - 1 codepoints mean nothing. |
183 | size_t i = N - 1; |
184 | size_t len = 0; |
185 | do |
186 | { |
187 | for (; i + N <= found; ++i) |
188 | { |
189 | ++len; |
190 | UInt16 hash = hash_functor(cp + i); |
191 | if constexpr (save_ngrams) |
192 | *ngram_storage++ = hash; |
193 | ++ngram_stats[hash]; |
194 | } |
195 | i = 0; |
196 | } while (start < end && (found = read_code_points(cp, start, end))); |
197 | |
198 | return len; |
199 | } |
200 | |
201 | template <bool reuse_stats> |
202 | static ALWAYS_INLINE inline UInt64 calculateHaystackStatsAndMetric( |
203 | const char * data, |
204 | const size_t size, |
205 | NgramStats & ngram_stats, |
206 | size_t & distance, |
207 | [[maybe_unused]] UInt16 * ngram_storage, |
208 | size_t (*read_code_points)(CodePoint *, const char *&, const char *), |
209 | UInt16 (*hash_functor)(const CodePoint *)) |
210 | { |
211 | size_t ngram_cnt = 0; |
212 | const char * start = data; |
213 | const char * end = data + size; |
214 | CodePoint cp[simultaneously_codepoints_num] = {}; |
215 | |
216 | /// read_code_points returns the position of cp where it stopped reading codepoints. |
217 | size_t found = read_code_points(cp, start, end); |
218 | /// We need to start for the first time here, because first N - 1 codepoints mean nothing. |
219 | size_t iter = N - 1; |
220 | |
221 | do |
222 | { |
223 | for (; iter + N <= found; ++iter) |
224 | { |
225 | UInt16 hash = hash_functor(cp + iter); |
226 | /// For symmetric version we should add when we can't subtract to get symmetric difference. |
227 | if (static_cast<Int16>(ngram_stats[hash]) > 0) |
228 | --distance; |
229 | else if constexpr (symmetric) |
230 | ++distance; |
231 | if constexpr (reuse_stats) |
232 | ngram_storage[ngram_cnt] = hash; |
233 | ++ngram_cnt; |
234 | --ngram_stats[hash]; |
235 | } |
236 | iter = 0; |
237 | } while (start < end && (found = read_code_points(cp, start, end))); |
238 | |
239 | /// Return the state of hash map to its initial. |
240 | if constexpr (reuse_stats) |
241 | { |
242 | for (size_t i = 0; i < ngram_cnt; ++i) |
243 | ++ngram_stats[ngram_storage[i]]; |
244 | } |
245 | return ngram_cnt; |
246 | } |
247 | |
248 | template <class Callback, class... Args> |
249 | static inline auto dispatchSearcher(Callback callback, Args &&... args) |
250 | { |
251 | if constexpr (!UTF8) |
252 | return callback(std::forward<Args>(args)..., readASCIICodePoints, ASCIIHash); |
253 | else |
254 | return callback(std::forward<Args>(args)..., readUTF8CodePoints, UTF8Hash); |
255 | } |
256 | |
257 | static void constant_constant(std::string data, std::string needle, Float32 & res) |
258 | { |
259 | NgramStats common_stats = {}; |
260 | |
261 | /// We use unsafe versions of getting ngrams, so I decided to use padded strings. |
262 | const size_t needle_size = needle.size(); |
263 | const size_t data_size = data.size(); |
264 | needle.resize(needle_size + default_padding); |
265 | data.resize(data_size + default_padding); |
266 | |
267 | size_t second_size = dispatchSearcher(calculateNeedleStats<false>, needle.data(), needle_size, common_stats, nullptr); |
268 | size_t distance = second_size; |
269 | if (data_size <= max_string_size) |
270 | { |
271 | size_t first_size = dispatchSearcher(calculateHaystackStatsAndMetric<false>, data.data(), data_size, common_stats, distance, nullptr); |
272 | /// For !symmetric version we should not use first_size. |
273 | if constexpr (symmetric) |
274 | res = distance * 1.f / std::max(first_size + second_size, size_t(1)); |
275 | else |
276 | res = 1.f - distance * 1.f / std::max(second_size, size_t(1)); |
277 | } |
278 | else |
279 | { |
280 | if constexpr (symmetric) |
281 | res = 1.f; |
282 | else |
283 | res = 0.f; |
284 | } |
285 | } |
286 | |
287 | static void vector_vector( |
288 | const ColumnString::Chars & haystack_data, |
289 | const ColumnString::Offsets & haystack_offsets, |
290 | const ColumnString::Chars & needle_data, |
291 | const ColumnString::Offsets & needle_offsets, |
292 | PaddedPODArray<Float32> & res) |
293 | { |
294 | const size_t haystack_offsets_size = haystack_offsets.size(); |
295 | size_t prev_haystack_offset = 0; |
296 | size_t prev_needle_offset = 0; |
297 | |
298 | NgramStats common_stats = {}; |
299 | |
300 | /// The main motivation is to not allocate more on stack because we have already allocated a lot (128Kb). |
301 | /// And we can reuse these storages in one thread because we care only about what was written to first places. |
302 | std::unique_ptr<UInt16[]> needle_ngram_storage(new UInt16[max_string_size]); |
303 | std::unique_ptr<UInt16[]> haystack_ngram_storage(new UInt16[max_string_size]); |
304 | |
305 | for (size_t i = 0; i < haystack_offsets_size; ++i) |
306 | { |
307 | const char * haystack = reinterpret_cast<const char *>(&haystack_data[prev_haystack_offset]); |
308 | const size_t haystack_size = haystack_offsets[i] - prev_haystack_offset - 1; |
309 | const char * needle = reinterpret_cast<const char *>(&needle_data[prev_needle_offset]); |
310 | const size_t needle_size = needle_offsets[i] - prev_needle_offset - 1; |
311 | |
312 | if (needle_size <= max_string_size && haystack_size <= max_string_size) |
313 | { |
314 | /// Get needle stats. |
315 | const size_t needle_stats_size = dispatchSearcher( |
316 | calculateNeedleStats<true>, |
317 | needle, |
318 | needle_size, |
319 | common_stats, |
320 | needle_ngram_storage.get()); |
321 | |
322 | size_t distance = needle_stats_size; |
323 | |
324 | /// Combine with haystack stats, return to initial needle stats. |
325 | const size_t haystack_stats_size = dispatchSearcher( |
326 | calculateHaystackStatsAndMetric<true>, |
327 | haystack, |
328 | haystack_size, |
329 | common_stats, |
330 | distance, |
331 | haystack_ngram_storage.get()); |
332 | |
333 | /// Return to zero array stats. |
334 | for (size_t j = 0; j < needle_stats_size; ++j) |
335 | --common_stats[needle_ngram_storage[j]]; |
336 | |
337 | /// For now, common stats is a zero array. |
338 | |
339 | |
340 | /// For !symmetric version we should not use haystack_stats_size. |
341 | if constexpr (symmetric) |
342 | res[i] = distance * 1.f / std::max(haystack_stats_size + needle_stats_size, size_t(1)); |
343 | else |
344 | res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, size_t(1)); |
345 | } |
346 | else |
347 | { |
348 | /// Strings are too big, we are assuming they are not the same. This is done because of limiting number |
349 | /// of bigrams added and not allocating too much memory. |
350 | if constexpr (symmetric) |
351 | res[i] = 1.f; |
352 | else |
353 | res[i] = 0.f; |
354 | } |
355 | |
356 | prev_needle_offset = needle_offsets[i]; |
357 | prev_haystack_offset = haystack_offsets[i]; |
358 | } |
359 | } |
360 | |
361 | static void constant_vector( |
362 | std::string haystack, |
363 | const ColumnString::Chars & needle_data, |
364 | const ColumnString::Offsets & needle_offsets, |
365 | PaddedPODArray<Float32> & res) |
366 | { |
367 | /// For symmetric version it is better to use vector_constant |
368 | if constexpr (symmetric) |
369 | { |
370 | vector_constant(needle_data, needle_offsets, std::move(haystack), res); |
371 | } |
372 | else |
373 | { |
374 | const size_t haystack_size = haystack.size(); |
375 | haystack.resize(haystack_size + default_padding); |
376 | |
377 | /// For logic explanation see vector_vector function. |
378 | const size_t needle_offsets_size = needle_offsets.size(); |
379 | size_t prev_offset = 0; |
380 | |
381 | NgramStats common_stats = {}; |
382 | |
383 | std::unique_ptr<UInt16[]> needle_ngram_storage(new UInt16[max_string_size]); |
384 | std::unique_ptr<UInt16[]> haystack_ngram_storage(new UInt16[max_string_size]); |
385 | |
386 | for (size_t i = 0; i < needle_offsets_size; ++i) |
387 | { |
388 | const char * needle = reinterpret_cast<const char *>(&needle_data[prev_offset]); |
389 | const size_t needle_size = needle_offsets[i] - prev_offset - 1; |
390 | |
391 | if (needle_size <= max_string_size && haystack_size <= max_string_size) |
392 | { |
393 | const size_t needle_stats_size = dispatchSearcher( |
394 | calculateNeedleStats<true>, |
395 | needle, |
396 | needle_size, |
397 | common_stats, |
398 | needle_ngram_storage.get()); |
399 | |
400 | size_t distance = needle_stats_size; |
401 | |
402 | dispatchSearcher( |
403 | calculateHaystackStatsAndMetric<true>, |
404 | haystack.data(), |
405 | haystack_size, |
406 | common_stats, |
407 | distance, |
408 | haystack_ngram_storage.get()); |
409 | |
410 | for (size_t j = 0; j < needle_stats_size; ++j) |
411 | --common_stats[needle_ngram_storage[j]]; |
412 | |
413 | res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, size_t(1)); |
414 | } |
415 | else |
416 | { |
417 | res[i] = 0.f; |
418 | } |
419 | |
420 | prev_offset = needle_offsets[i]; |
421 | } |
422 | |
423 | } |
424 | } |
425 | |
426 | static void vector_constant( |
427 | const ColumnString::Chars & data, |
428 | const ColumnString::Offsets & offsets, |
429 | std::string needle, |
430 | PaddedPODArray<Float32> & res) |
431 | { |
432 | /// zeroing our map |
433 | NgramStats common_stats = {}; |
434 | |
435 | /// The main motivation is to not allocate more on stack because we have already allocated a lot (128Kb). |
436 | /// And we can reuse these storages in one thread because we care only about what was written to first places. |
437 | std::unique_ptr<UInt16[]> ngram_storage(new UInt16[max_string_size]); |
438 | |
439 | /// We use unsafe versions of getting ngrams, so I decided to use padded_data even in needle case. |
440 | const size_t needle_size = needle.size(); |
441 | needle.resize(needle_size + default_padding); |
442 | |
443 | const size_t needle_stats_size = dispatchSearcher(calculateNeedleStats<false>, needle.data(), needle_size, common_stats, nullptr); |
444 | |
445 | size_t distance = needle_stats_size; |
446 | size_t prev_offset = 0; |
447 | for (size_t i = 0; i < offsets.size(); ++i) |
448 | { |
449 | const UInt8 * haystack = &data[prev_offset]; |
450 | const size_t haystack_size = offsets[i] - prev_offset - 1; |
451 | if (haystack_size <= max_string_size) |
452 | { |
453 | size_t haystack_stats_size = dispatchSearcher( |
454 | calculateHaystackStatsAndMetric<true>, |
455 | reinterpret_cast<const char *>(haystack), |
456 | haystack_size, common_stats, |
457 | distance, |
458 | ngram_storage.get()); |
459 | /// For !symmetric version we should not use haystack_stats_size. |
460 | if constexpr (symmetric) |
461 | res[i] = distance * 1.f / std::max(haystack_stats_size + needle_stats_size, size_t(1)); |
462 | else |
463 | res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, size_t(1)); |
464 | } |
465 | else |
466 | { |
467 | /// if the strings are too big, we say they are completely not the same |
468 | if constexpr (symmetric) |
469 | res[i] = 1.f; |
470 | else |
471 | res[i] = 0.f; |
472 | } |
473 | distance = needle_stats_size; |
474 | prev_offset = offsets[i]; |
475 | } |
476 | } |
477 | }; |
478 | |
479 | |
480 | struct NameNgramDistance |
481 | { |
482 | static constexpr auto name = "ngramDistance" ; |
483 | }; |
484 | struct NameNgramDistanceCaseInsensitive |
485 | { |
486 | static constexpr auto name = "ngramDistanceCaseInsensitive" ; |
487 | }; |
488 | |
489 | struct NameNgramDistanceUTF8 |
490 | { |
491 | static constexpr auto name = "ngramDistanceUTF8" ; |
492 | }; |
493 | |
494 | struct NameNgramDistanceUTF8CaseInsensitive |
495 | { |
496 | static constexpr auto name = "ngramDistanceCaseInsensitiveUTF8" ; |
497 | }; |
498 | |
499 | struct NameNgramSearch |
500 | { |
501 | static constexpr auto name = "ngramSearch" ; |
502 | }; |
503 | struct NameNgramSearchCaseInsensitive |
504 | { |
505 | static constexpr auto name = "ngramSearchCaseInsensitive" ; |
506 | }; |
507 | struct NameNgramSearchUTF8 |
508 | { |
509 | static constexpr auto name = "ngramSearchUTF8" ; |
510 | }; |
511 | |
512 | struct NameNgramSearchUTF8CaseInsensitive |
513 | { |
514 | static constexpr auto name = "ngramSearchCaseInsensitiveUTF8" ; |
515 | }; |
516 | |
517 | using FunctionNgramDistance = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, false, true>, NameNgramDistance>; |
518 | using FunctionNgramDistanceCaseInsensitive = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, true, true>, NameNgramDistanceCaseInsensitive>; |
519 | using FunctionNgramDistanceUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, false, true>, NameNgramDistanceUTF8>; |
520 | using FunctionNgramDistanceCaseInsensitiveUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, true, true>, NameNgramDistanceUTF8CaseInsensitive>; |
521 | |
522 | using FunctionNgramSearch = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, false, false>, NameNgramSearch>; |
523 | using FunctionNgramSearchCaseInsensitive = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, true, false>, NameNgramSearchCaseInsensitive>; |
524 | using FunctionNgramSearchUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, false, false>, NameNgramSearchUTF8>; |
525 | using FunctionNgramSearchCaseInsensitiveUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, true, false>, NameNgramSearchUTF8CaseInsensitive>; |
526 | |
527 | |
528 | void registerFunctionsStringSimilarity(FunctionFactory & factory) |
529 | { |
530 | factory.registerFunction<FunctionNgramDistance>(); |
531 | factory.registerFunction<FunctionNgramDistanceCaseInsensitive>(); |
532 | factory.registerFunction<FunctionNgramDistanceUTF8>(); |
533 | factory.registerFunction<FunctionNgramDistanceCaseInsensitiveUTF8>(); |
534 | |
535 | factory.registerFunction<FunctionNgramSearch>(); |
536 | factory.registerFunction<FunctionNgramSearchCaseInsensitive>(); |
537 | factory.registerFunction<FunctionNgramSearchUTF8>(); |
538 | factory.registerFunction<FunctionNgramSearchCaseInsensitiveUTF8>(); |
539 | } |
540 | |
541 | } |
542 | |