1#include <Functions/FunctionsStringSimilarity.h>
2#include <Functions/FunctionFactory.h>
3#include <Functions/FunctionsHashing.h>
4#include <Common/HashTable/ClearableHashMap.h>
5#include <Common/HashTable/Hash.h>
6#include <Common/UTF8Helpers.h>
7
8#include <Core/Defines.h>
9
10#include <common/unaligned.h>
11
12#include <algorithm>
13#include <climits>
14#include <cstring>
15#include <limits>
16#include <memory>
17#include <utility>
18
19#ifdef __SSE4_2__
20# include <nmmintrin.h>
21#endif
22
23namespace DB
24{
25/** Distance function implementation.
26 * We calculate all the n-grams from left string and count by the index of
27 * 16 bits hash of them in the map.
28 * Then calculate all the n-grams from the right string and calculate
29 * the n-gram distance on the flight by adding and subtracting from the hashmap.
30 * Then return the map into the condition of which it was after the left string
31 * calculation. If the right string size is big (more than 2**15 bytes),
32 * the strings are not similar at all and we return 1.
33 */
34template <size_t N, class CodePoint, bool UTF8, bool case_insensitive, bool symmetric>
35struct NgramDistanceImpl
36{
37 using ResultType = Float32;
38
39 /// map_size for ngram difference.
40 static constexpr size_t map_size = 1u << 16;
41
42 /// If the haystack size is bigger than this, behaviour is unspecified for this function.
43 static constexpr size_t max_string_size = 1u << 15;
44
45 /// Default padding to read safely.
46 static constexpr size_t default_padding = 16;
47
48 /// Max codepoints to store at once. 16 is for batching usage and PODArray has this padding.
49 static constexpr size_t simultaneously_codepoints_num = default_padding + N - 1;
50
51 /** This fits mostly in L2 cache all the time.
52 * Actually use UInt16 as addings and subtractions do not UB overflow. But think of it as a signed
53 * integer array.
54 */
55 using NgramStats = UInt16[map_size];
56
57 static ALWAYS_INLINE UInt16 ASCIIHash(const CodePoint * code_points)
58 {
59 return intHashCRC32(unalignedLoad<UInt32>(code_points)) & 0xFFFFu;
60 }
61
62 static ALWAYS_INLINE UInt16 UTF8Hash(const CodePoint * code_points)
63 {
64 UInt64 combined = (static_cast<UInt64>(code_points[0]) << 32) | code_points[1];
65#ifdef __SSE4_2__
66 return _mm_crc32_u64(code_points[2], combined) & 0xFFFFu;
67#else
68 return (intHashCRC32(combined) ^ intHashCRC32(code_points[2])) & 0xFFFFu;
69#endif
70 }
71
72 template <size_t Offset, class Container, size_t... I>
73 static ALWAYS_INLINE inline void unrollLowering(Container & cont, const std::index_sequence<I...> &)
74 {
75 ((cont[Offset + I] = std::tolower(cont[Offset + I])), ...);
76 }
77
78 static ALWAYS_INLINE size_t readASCIICodePoints(CodePoint * code_points, const char *& pos, const char * end)
79 {
80 /// Offset before which we copy some data.
81 constexpr size_t padding_offset = default_padding - N + 1;
82 /// We have an array like this for ASCII (N == 4, other cases are similar)
83 /// |a0|a1|a2|a3|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18|
84 /// And we copy ^^^^^^^^^^^^^^^ these bytes to the start
85 /// Actually it is enough to copy 3 bytes, but memcpy for 4 bytes translates into 1 instruction
86 memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(CodePoint));
87 /// Now we have an array
88 /// |a13|a14|a15|a16|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18|
89 /// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
90 /// Doing unaligned read of 16 bytes and copy them like above
91 /// 16 is also chosen to do two `movups`.
92 /// Such copying allow us to have 3 codepoints from the previous read to produce the 4-grams with them.
93 memcpy(code_points + (N - 1), pos, default_padding * sizeof(CodePoint));
94
95 if constexpr (case_insensitive)
96 {
97 /// We really need template lambdas with C++20 to do it inline
98 unrollLowering<N - 1>(code_points, std::make_index_sequence<padding_offset>());
99 }
100 pos += padding_offset;
101 if (pos > end)
102 return default_padding - (pos - end);
103 return default_padding;
104 }
105
106 static ALWAYS_INLINE size_t readUTF8CodePoints(CodePoint * code_points, const char *& pos, const char * end)
107 {
108 /// The same copying as described in the function above.
109 memcpy(code_points, code_points + default_padding - N + 1, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(CodePoint));
110
111 size_t num = N - 1;
112 while (num < default_padding && pos < end)
113 {
114 size_t length = UTF8::seqLength(*pos);
115
116 if (pos + length > end)
117 length = end - pos;
118
119 CodePoint res;
120 /// This is faster than just memcpy because of compiler optimizations with moving bytes.
121 switch (length)
122 {
123 case 1:
124 res = 0;
125 memcpy(&res, pos, 1);
126 break;
127 case 2:
128 res = 0;
129 memcpy(&res, pos, 2);
130 break;
131 case 3:
132 res = 0;
133 memcpy(&res, pos, 3);
134 break;
135 default:
136 memcpy(&res, pos, 4);
137 }
138
139 /// This is not a really true case insensitive utf8. We zero the 5-th bit of every byte.
140 /// And first bit of first byte if there are two bytes.
141 /// For ASCII it works https://catonmat.net/ascii-case-conversion-trick. For most cyrrilic letters also does.
142 /// For others, we don't care now. Lowering UTF is not a cheap operation.
143 if constexpr (case_insensitive)
144 {
145 switch (length)
146 {
147 case 4:
148 res &= ~(1u << (5 + 3 * CHAR_BIT));
149 [[fallthrough]];
150 case 3:
151 res &= ~(1u << (5 + 2 * CHAR_BIT));
152 [[fallthrough]];
153 case 2:
154 res &= ~(1u);
155 res &= ~(1u << (5 + CHAR_BIT));
156 [[fallthrough]];
157 default:
158 res &= ~(1u << 5);
159 }
160 }
161
162 pos += length;
163 code_points[num++] = res;
164 }
165 return num;
166 }
167
168 template <bool save_ngrams>
169 static ALWAYS_INLINE inline size_t calculateNeedleStats(
170 const char * data,
171 const size_t size,
172 NgramStats & ngram_stats,
173 [[maybe_unused]] UInt16 * ngram_storage,
174 size_t (*read_code_points)(CodePoint *, const char *&, const char *),
175 UInt16 (*hash_functor)(const CodePoint *))
176 {
177 const char * start = data;
178 const char * end = data + size;
179 CodePoint cp[simultaneously_codepoints_num] = {};
180 /// read_code_points returns the position of cp where it stopped reading codepoints.
181 size_t found = read_code_points(cp, start, end);
182 /// We need to start for the first time here, because first N - 1 codepoints mean nothing.
183 size_t i = N - 1;
184 size_t len = 0;
185 do
186 {
187 for (; i + N <= found; ++i)
188 {
189 ++len;
190 UInt16 hash = hash_functor(cp + i);
191 if constexpr (save_ngrams)
192 *ngram_storage++ = hash;
193 ++ngram_stats[hash];
194 }
195 i = 0;
196 } while (start < end && (found = read_code_points(cp, start, end)));
197
198 return len;
199 }
200
201 template <bool reuse_stats>
202 static ALWAYS_INLINE inline UInt64 calculateHaystackStatsAndMetric(
203 const char * data,
204 const size_t size,
205 NgramStats & ngram_stats,
206 size_t & distance,
207 [[maybe_unused]] UInt16 * ngram_storage,
208 size_t (*read_code_points)(CodePoint *, const char *&, const char *),
209 UInt16 (*hash_functor)(const CodePoint *))
210 {
211 size_t ngram_cnt = 0;
212 const char * start = data;
213 const char * end = data + size;
214 CodePoint cp[simultaneously_codepoints_num] = {};
215
216 /// read_code_points returns the position of cp where it stopped reading codepoints.
217 size_t found = read_code_points(cp, start, end);
218 /// We need to start for the first time here, because first N - 1 codepoints mean nothing.
219 size_t iter = N - 1;
220
221 do
222 {
223 for (; iter + N <= found; ++iter)
224 {
225 UInt16 hash = hash_functor(cp + iter);
226 /// For symmetric version we should add when we can't subtract to get symmetric difference.
227 if (static_cast<Int16>(ngram_stats[hash]) > 0)
228 --distance;
229 else if constexpr (symmetric)
230 ++distance;
231 if constexpr (reuse_stats)
232 ngram_storage[ngram_cnt] = hash;
233 ++ngram_cnt;
234 --ngram_stats[hash];
235 }
236 iter = 0;
237 } while (start < end && (found = read_code_points(cp, start, end)));
238
239 /// Return the state of hash map to its initial.
240 if constexpr (reuse_stats)
241 {
242 for (size_t i = 0; i < ngram_cnt; ++i)
243 ++ngram_stats[ngram_storage[i]];
244 }
245 return ngram_cnt;
246 }
247
248 template <class Callback, class... Args>
249 static inline auto dispatchSearcher(Callback callback, Args &&... args)
250 {
251 if constexpr (!UTF8)
252 return callback(std::forward<Args>(args)..., readASCIICodePoints, ASCIIHash);
253 else
254 return callback(std::forward<Args>(args)..., readUTF8CodePoints, UTF8Hash);
255 }
256
257 static void constant_constant(std::string data, std::string needle, Float32 & res)
258 {
259 NgramStats common_stats = {};
260
261 /// We use unsafe versions of getting ngrams, so I decided to use padded strings.
262 const size_t needle_size = needle.size();
263 const size_t data_size = data.size();
264 needle.resize(needle_size + default_padding);
265 data.resize(data_size + default_padding);
266
267 size_t second_size = dispatchSearcher(calculateNeedleStats<false>, needle.data(), needle_size, common_stats, nullptr);
268 size_t distance = second_size;
269 if (data_size <= max_string_size)
270 {
271 size_t first_size = dispatchSearcher(calculateHaystackStatsAndMetric<false>, data.data(), data_size, common_stats, distance, nullptr);
272 /// For !symmetric version we should not use first_size.
273 if constexpr (symmetric)
274 res = distance * 1.f / std::max(first_size + second_size, size_t(1));
275 else
276 res = 1.f - distance * 1.f / std::max(second_size, size_t(1));
277 }
278 else
279 {
280 if constexpr (symmetric)
281 res = 1.f;
282 else
283 res = 0.f;
284 }
285 }
286
287 static void vector_vector(
288 const ColumnString::Chars & haystack_data,
289 const ColumnString::Offsets & haystack_offsets,
290 const ColumnString::Chars & needle_data,
291 const ColumnString::Offsets & needle_offsets,
292 PaddedPODArray<Float32> & res)
293 {
294 const size_t haystack_offsets_size = haystack_offsets.size();
295 size_t prev_haystack_offset = 0;
296 size_t prev_needle_offset = 0;
297
298 NgramStats common_stats = {};
299
300 /// The main motivation is to not allocate more on stack because we have already allocated a lot (128Kb).
301 /// And we can reuse these storages in one thread because we care only about what was written to first places.
302 std::unique_ptr<UInt16[]> needle_ngram_storage(new UInt16[max_string_size]);
303 std::unique_ptr<UInt16[]> haystack_ngram_storage(new UInt16[max_string_size]);
304
305 for (size_t i = 0; i < haystack_offsets_size; ++i)
306 {
307 const char * haystack = reinterpret_cast<const char *>(&haystack_data[prev_haystack_offset]);
308 const size_t haystack_size = haystack_offsets[i] - prev_haystack_offset - 1;
309 const char * needle = reinterpret_cast<const char *>(&needle_data[prev_needle_offset]);
310 const size_t needle_size = needle_offsets[i] - prev_needle_offset - 1;
311
312 if (needle_size <= max_string_size && haystack_size <= max_string_size)
313 {
314 /// Get needle stats.
315 const size_t needle_stats_size = dispatchSearcher(
316 calculateNeedleStats<true>,
317 needle,
318 needle_size,
319 common_stats,
320 needle_ngram_storage.get());
321
322 size_t distance = needle_stats_size;
323
324 /// Combine with haystack stats, return to initial needle stats.
325 const size_t haystack_stats_size = dispatchSearcher(
326 calculateHaystackStatsAndMetric<true>,
327 haystack,
328 haystack_size,
329 common_stats,
330 distance,
331 haystack_ngram_storage.get());
332
333 /// Return to zero array stats.
334 for (size_t j = 0; j < needle_stats_size; ++j)
335 --common_stats[needle_ngram_storage[j]];
336
337 /// For now, common stats is a zero array.
338
339
340 /// For !symmetric version we should not use haystack_stats_size.
341 if constexpr (symmetric)
342 res[i] = distance * 1.f / std::max(haystack_stats_size + needle_stats_size, size_t(1));
343 else
344 res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, size_t(1));
345 }
346 else
347 {
348 /// Strings are too big, we are assuming they are not the same. This is done because of limiting number
349 /// of bigrams added and not allocating too much memory.
350 if constexpr (symmetric)
351 res[i] = 1.f;
352 else
353 res[i] = 0.f;
354 }
355
356 prev_needle_offset = needle_offsets[i];
357 prev_haystack_offset = haystack_offsets[i];
358 }
359 }
360
361 static void constant_vector(
362 std::string haystack,
363 const ColumnString::Chars & needle_data,
364 const ColumnString::Offsets & needle_offsets,
365 PaddedPODArray<Float32> & res)
366 {
367 /// For symmetric version it is better to use vector_constant
368 if constexpr (symmetric)
369 {
370 vector_constant(needle_data, needle_offsets, std::move(haystack), res);
371 }
372 else
373 {
374 const size_t haystack_size = haystack.size();
375 haystack.resize(haystack_size + default_padding);
376
377 /// For logic explanation see vector_vector function.
378 const size_t needle_offsets_size = needle_offsets.size();
379 size_t prev_offset = 0;
380
381 NgramStats common_stats = {};
382
383 std::unique_ptr<UInt16[]> needle_ngram_storage(new UInt16[max_string_size]);
384 std::unique_ptr<UInt16[]> haystack_ngram_storage(new UInt16[max_string_size]);
385
386 for (size_t i = 0; i < needle_offsets_size; ++i)
387 {
388 const char * needle = reinterpret_cast<const char *>(&needle_data[prev_offset]);
389 const size_t needle_size = needle_offsets[i] - prev_offset - 1;
390
391 if (needle_size <= max_string_size && haystack_size <= max_string_size)
392 {
393 const size_t needle_stats_size = dispatchSearcher(
394 calculateNeedleStats<true>,
395 needle,
396 needle_size,
397 common_stats,
398 needle_ngram_storage.get());
399
400 size_t distance = needle_stats_size;
401
402 dispatchSearcher(
403 calculateHaystackStatsAndMetric<true>,
404 haystack.data(),
405 haystack_size,
406 common_stats,
407 distance,
408 haystack_ngram_storage.get());
409
410 for (size_t j = 0; j < needle_stats_size; ++j)
411 --common_stats[needle_ngram_storage[j]];
412
413 res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, size_t(1));
414 }
415 else
416 {
417 res[i] = 0.f;
418 }
419
420 prev_offset = needle_offsets[i];
421 }
422
423 }
424 }
425
426 static void vector_constant(
427 const ColumnString::Chars & data,
428 const ColumnString::Offsets & offsets,
429 std::string needle,
430 PaddedPODArray<Float32> & res)
431 {
432 /// zeroing our map
433 NgramStats common_stats = {};
434
435 /// The main motivation is to not allocate more on stack because we have already allocated a lot (128Kb).
436 /// And we can reuse these storages in one thread because we care only about what was written to first places.
437 std::unique_ptr<UInt16[]> ngram_storage(new UInt16[max_string_size]);
438
439 /// We use unsafe versions of getting ngrams, so I decided to use padded_data even in needle case.
440 const size_t needle_size = needle.size();
441 needle.resize(needle_size + default_padding);
442
443 const size_t needle_stats_size = dispatchSearcher(calculateNeedleStats<false>, needle.data(), needle_size, common_stats, nullptr);
444
445 size_t distance = needle_stats_size;
446 size_t prev_offset = 0;
447 for (size_t i = 0; i < offsets.size(); ++i)
448 {
449 const UInt8 * haystack = &data[prev_offset];
450 const size_t haystack_size = offsets[i] - prev_offset - 1;
451 if (haystack_size <= max_string_size)
452 {
453 size_t haystack_stats_size = dispatchSearcher(
454 calculateHaystackStatsAndMetric<true>,
455 reinterpret_cast<const char *>(haystack),
456 haystack_size, common_stats,
457 distance,
458 ngram_storage.get());
459 /// For !symmetric version we should not use haystack_stats_size.
460 if constexpr (symmetric)
461 res[i] = distance * 1.f / std::max(haystack_stats_size + needle_stats_size, size_t(1));
462 else
463 res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, size_t(1));
464 }
465 else
466 {
467 /// if the strings are too big, we say they are completely not the same
468 if constexpr (symmetric)
469 res[i] = 1.f;
470 else
471 res[i] = 0.f;
472 }
473 distance = needle_stats_size;
474 prev_offset = offsets[i];
475 }
476 }
477};
478
479
480struct NameNgramDistance
481{
482 static constexpr auto name = "ngramDistance";
483};
484struct NameNgramDistanceCaseInsensitive
485{
486 static constexpr auto name = "ngramDistanceCaseInsensitive";
487};
488
489struct NameNgramDistanceUTF8
490{
491 static constexpr auto name = "ngramDistanceUTF8";
492};
493
494struct NameNgramDistanceUTF8CaseInsensitive
495{
496 static constexpr auto name = "ngramDistanceCaseInsensitiveUTF8";
497};
498
499struct NameNgramSearch
500{
501 static constexpr auto name = "ngramSearch";
502};
503struct NameNgramSearchCaseInsensitive
504{
505 static constexpr auto name = "ngramSearchCaseInsensitive";
506};
507struct NameNgramSearchUTF8
508{
509 static constexpr auto name = "ngramSearchUTF8";
510};
511
512struct NameNgramSearchUTF8CaseInsensitive
513{
514 static constexpr auto name = "ngramSearchCaseInsensitiveUTF8";
515};
516
517using FunctionNgramDistance = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, false, true>, NameNgramDistance>;
518using FunctionNgramDistanceCaseInsensitive = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, true, true>, NameNgramDistanceCaseInsensitive>;
519using FunctionNgramDistanceUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, false, true>, NameNgramDistanceUTF8>;
520using FunctionNgramDistanceCaseInsensitiveUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, true, true>, NameNgramDistanceUTF8CaseInsensitive>;
521
522using FunctionNgramSearch = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, false, false>, NameNgramSearch>;
523using FunctionNgramSearchCaseInsensitive = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, true, false>, NameNgramSearchCaseInsensitive>;
524using FunctionNgramSearchUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, false, false>, NameNgramSearchUTF8>;
525using FunctionNgramSearchCaseInsensitiveUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, true, false>, NameNgramSearchUTF8CaseInsensitive>;
526
527
528void registerFunctionsStringSimilarity(FunctionFactory & factory)
529{
530 factory.registerFunction<FunctionNgramDistance>();
531 factory.registerFunction<FunctionNgramDistanceCaseInsensitive>();
532 factory.registerFunction<FunctionNgramDistanceUTF8>();
533 factory.registerFunction<FunctionNgramDistanceCaseInsensitiveUTF8>();
534
535 factory.registerFunction<FunctionNgramSearch>();
536 factory.registerFunction<FunctionNgramSearchCaseInsensitive>();
537 factory.registerFunction<FunctionNgramSearchUTF8>();
538 factory.registerFunction<FunctionNgramSearchCaseInsensitiveUTF8>();
539}
540
541}
542