FunctionsStringSimilarity.cpp source code [ClickHouse/dbms/src/Functions/FunctionsStringSimilarity.cpp]

1	#include <Functions/FunctionsStringSimilarity.h>
2	#include <Functions/FunctionFactory.h>
3	#include <Functions/FunctionsHashing.h>
4	#include <Common/HashTable/ClearableHashMap.h>
5	#include <Common/HashTable/Hash.h>
6	#include <Common/UTF8Helpers.h>
7
8	#include <Core/Defines.h>
9
10	#include <common/unaligned.h>
11
12	#include <algorithm>
13	#include <climits>
14	#include <cstring>
15	#include <limits>
16	#include <memory>
17	#include <utility>
18
19	#ifdef __SSE4_2__
20	# include <nmmintrin.h>
21	#endif
22
23	namespace DB
24	{
/** Distance function implementation.
  * We calculate all the n-grams of the left string and count them in a map indexed by
  * their 16-bit hash.
  * Then we calculate all the n-grams of the right string and compute
  * the n-gram distance on the fly by adding to and subtracting from the hashmap.
  * Then we return the map to the state it was in after the left string
  * calculation. If the right string size is big (more than 2**15 bytes),
  * the strings are not similar at all and we return 1.
  */
34	template <size_t N, class CodePoint, bool UTF8, bool case_insensitive, bool symmetric>
35	struct NgramDistanceImpl
36	{
37	using ResultType = Float32;
38
39	/// map_size for ngram difference.
40	static constexpr size_t map_size = `1u` << `16`;
41
42	/// If the haystack size is bigger than this, behaviour is unspecified for this function.
43	static constexpr size_t max_string_size = `1u` << `15`;
44
45	/// Default padding to read safely.
46	static constexpr size_t default_padding = `16`;
47
48	/// Max codepoints to store at once. 16 is for batching usage and PODArray has this padding.
49	static constexpr size_t simultaneously_codepoints_num = default_padding + N - `1`;
50
51	/* This fits mostly in L2 cache all the time.*
52	* Actually use UInt16 as addings and subtractions do not UB overflow. But think of it as a signed
53	* integer array.
54	*/
55	using NgramStats = UInt16[map_size];
56
57	static ALWAYS_INLINE UInt16 ASCIIHash(const CodePoint * code_points)
58	{
59	return intHashCRC32(unalignedLoad<UInt32>(code_points)) & `0xFFFFu`;
60	}
61
62	static ALWAYS_INLINE UInt16 UTF8Hash(const CodePoint * code_points)
63	{
64	UInt64 combined = (static_cast<UInt64>(code_points[`0`]) << `32`) \| code_points[`1`];
65	#ifdef __SSE4_2__
66	return _mm_crc32_u64(code_points[`2`], combined) & `0xFFFFu`;
67	#else
68	return (intHashCRC32(combined) ^ intHashCRC32(code_points[`2`])) & `0xFFFFu`;
69	#endif
70	}
71
72	template <size_t Offset, class Container, size_t... I>
73	static ALWAYS_INLINE inline void unrollLowering(Container & cont, const std::index_sequence<I...> &)
74	{
75	((cont[Offset + I] = std::tolower(cont[Offset + I])), ...);
76	}
77
78	static ALWAYS_INLINE size_t readASCIICodePoints(CodePoint * code_points, const char & pos, const* char * end)
79	{
80	/// Offset before which we copy some data.
81	constexpr size_t padding_offset = default_padding - N + `1`;
82	/// We have an array like this for ASCII (N == 4, other cases are similar)
83	/// \|a0\|a1\|a2\|a3\|a4\|a5\|a6\|a7\|a8\|a9\|a10\|a11\|a12\|a13\|a14\|a15\|a16\|a17\|a18\|
84	/// And we copy ^^^^^^^^^^^^^^^ these bytes to the start
85	/// Actually it is enough to copy 3 bytes, but memcpy for 4 bytes translates into 1 instruction
86	memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - `1`) * sizeof(CodePoint));
87	/// Now we have an array
88	/// \|a13\|a14\|a15\|a16\|a4\|a5\|a6\|a7\|a8\|a9\|a10\|a11\|a12\|a13\|a14\|a15\|a16\|a17\|a18\|
89	/// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
90	/// Doing unaligned read of 16 bytes and copy them like above
91	/// 16 is also chosen to do two `movups`.
92	/// Such copying allow us to have 3 codepoints from the previous read to produce the 4-grams with them.
93	memcpy(code_points + (N - `1`), pos, default_padding * sizeof(CodePoint));
94
95	if constexpr (case_insensitive)
96	{
97	/// We really need template lambdas with C++20 to do it inline
98	unrollLowering<N - `1`>(code_points, std::make_index_sequence<padding_offset>());
99	}
100	pos += padding_offset;
101	if (pos > end)
102	return default_padding - (pos - end);
103	return default_padding;
104	}
105
106	static ALWAYS_INLINE size_t readUTF8CodePoints(CodePoint * code_points, const char & pos, const* char * end)
107	{
108	/// The same copying as described in the function above.
109	memcpy(code_points, code_points + default_padding - N + `1`, roundUpToPowerOfTwoOrZero(N - `1`) * sizeof(CodePoint));
110
111	size_t num = N - `1`;
112	while (num < default_padding && pos < end)
113	{
114	size_t length = UTF8::seqLength(*pos);
115
116	if (pos + length > end)
117	length = end - pos;
118
119	CodePoint res;
120	/// This is faster than just memcpy because of compiler optimizations with moving bytes.
121	switch (length)
122	{
123	case `1`:
124	res = `0`;
125	memcpy(&res, pos, `1`);
126	break;
127	case `2`:
128	res = `0`;
129	memcpy(&res, pos, `2`);
130	break;
131	case `3`:
132	res = `0`;
133	memcpy(&res, pos, `3`);
134	break;
135	default:
136	memcpy(&res, pos, `4`);
137	}
138
139	/// This is not a really true case insensitive utf8. We zero the 5-th bit of every byte.
140	/// And first bit of first byte if there are two bytes.
141	/// For ASCII it works https://catonmat.net/ascii-case-conversion-trick. For most cyrrilic letters also does.
142	/// For others, we don't care now. Lowering UTF is not a cheap operation.
143	if constexpr (case_insensitive)
144	{
145	switch (length)
146	{
147	case `4`:
148	res &= ~(`1u` << (`5` + `3` * CHAR_BIT));
149	[[fallthrough]];
150	case `3`:
151	res &= ~(`1u` << (`5` + `2` * CHAR_BIT));
152	[[fallthrough]];
153	case `2`:
154	res &= ~(`1u`);
155	res &= ~(`1u` << (`5` + CHAR_BIT));
156	[[fallthrough]];
157	default:
158	res &= ~(`1u` << `5`);
159	}
160	}
161
162	pos += length;
163	code_points[num++] = res;
164	}
165	return num;
166	}
167
168	template <bool save_ngrams>
169	static ALWAYS_INLINE inline size_t calculateNeedleStats(
170	const char * data,
171	const size_t size,
172	NgramStats & ngram_stats,
173	[[maybe_unused]] UInt16 * ngram_storage,
174	size_t (read_code_points)(CodePoint , const char &, const* char *),
175	UInt16 (hash_functor)(const* CodePoint *))
176	{
177	const char * start = data;
178	const char * end = data + size;
179	CodePoint cp[simultaneously_codepoints_num] = {};
180	/// read_code_points returns the position of cp where it stopped reading codepoints.
181	size_t found = read_code_points(cp, start, end);
182	/// We need to start for the first time here, because first N - 1 codepoints mean nothing.
183	size_t i = N - `1`;
184	size_t len = `0`;
185	do
186	{
187	for (; i + N <= found; ++i)
188	{
189	++len;
190	UInt16 hash = hash_functor(cp + i);
191	if constexpr (save_ngrams)
192	*ngram_storage++ = hash;
193	++ngram_stats[hash];
194	}
195	i = `0`;
196	} while (start < end && (found = read_code_points(cp, start, end)));
197
198	return len;
199	}
200
201	template <bool reuse_stats>
202	static ALWAYS_INLINE inline UInt64 calculateHaystackStatsAndMetric(
203	const char * data,
204	const size_t size,
205	NgramStats & ngram_stats,
206	size_t & distance,
207	[[maybe_unused]] UInt16 * ngram_storage,
208	size_t (read_code_points)(CodePoint , const char &, const* char *),
209	UInt16 (hash_functor)(const* CodePoint *))
210	{
211	size_t ngram_cnt = `0`;
212	const char * start = data;
213	const char * end = data + size;
214	CodePoint cp[simultaneously_codepoints_num] = {};
215
216	/// read_code_points returns the position of cp where it stopped reading codepoints.
217	size_t found = read_code_points(cp, start, end);
218	/// We need to start for the first time here, because first N - 1 codepoints mean nothing.
219	size_t iter = N - `1`;
220
221	do
222	{
223	for (; iter + N <= found; ++iter)
224	{
225	UInt16 hash = hash_functor(cp + iter);
226	/// For symmetric version we should add when we can't subtract to get symmetric difference.
227	if (static_cast<Int16>(ngram_stats[hash]) > `0`)
228	--distance;
229	else if constexpr (symmetric)
230	++distance;
231	if constexpr (reuse_stats)
232	ngram_storage[ngram_cnt] = hash;
233	++ngram_cnt;
234	--ngram_stats[hash];
235	}
236	iter = `0`;
237	} while (start < end && (found = read_code_points(cp, start, end)));
238
239	/// Return the state of hash map to its initial.
240	if constexpr (reuse_stats)
241	{
242	for (size_t i = `0`; i < ngram_cnt; ++i)
243	++ngram_stats[ngram_storage[i]];
244	}
245	return ngram_cnt;
246	}
247
248	template <class Callback, class... Args>
249	static inline auto dispatchSearcher(Callback callback, Args &&... args)
250	{
251	if constexpr (!UTF8)
252	return callback(std::forward<Args>(args)..., readASCIICodePoints, ASCIIHash);
253	else
254	return callback(std::forward<Args>(args)..., readUTF8CodePoints, UTF8Hash);
255	}
256
257	static void constant_constant(std::string data, std::string needle, Float32 & res)
258	{
259	NgramStats common_stats = {};
260
261	/// We use unsafe versions of getting ngrams, so I decided to use padded strings.
262	const size_t needle_size = needle.size();
263	const size_t data_size = data.size();
264	needle.resize(needle_size + default_padding);
265	data.resize(data_size + default_padding);
266
267	size_t second_size = dispatchSearcher(calculateNeedleStats<false>, needle.data(), needle_size, common_stats, nullptr);
268	size_t distance = second_size;
269	if (data_size <= max_string_size)
270	{
271	size_t first_size = dispatchSearcher(calculateHaystackStatsAndMetric<false>, data.data(), data_size, common_stats, distance, nullptr);
272	/// For !symmetric version we should not use first_size.
273	if constexpr (symmetric)
274	res = distance * `1.f` / std::max(first_size + second_size, size_t(`1`));
275	else
276	res = `1.f` - distance * `1.f` / std::max(second_size, size_t(`1`));
277	}
278	else
279	{
280	if constexpr (symmetric)
281	res = `1.f`;
282	else
283	res = `0.f`;
284	}
285	}
286
287	static void vector_vector(
288	const ColumnString::Chars & haystack_data,
289	const ColumnString::Offsets & haystack_offsets,
290	const ColumnString::Chars & needle_data,
291	const ColumnString::Offsets & needle_offsets,
292	PaddedPODArray<Float32> & res)
293	{
294	const size_t haystack_offsets_size = haystack_offsets.size();
295	size_t prev_haystack_offset = `0`;
296	size_t prev_needle_offset = `0`;
297
298	NgramStats common_stats = {};
299
300	/// The main motivation is to not allocate more on stack because we have already allocated a lot (128Kb).
301	/// And we can reuse these storages in one thread because we care only about what was written to first places.
302	std::unique_ptr<UInt16[]> needle_ngram_storage(new UInt16[max_string_size]);
303	std::unique_ptr<UInt16[]> haystack_ngram_storage(new UInt16[max_string_size]);
304
305	for (size_t i = `0`; i < haystack_offsets_size; ++i)
306	{
307	const char * haystack = reinterpret_cast<const char *>(&haystack_data [prev_haystack_offset]);
308	const size_t haystack_size = haystack_offsets [i] - prev_haystack_offset - `1`;
309	const char * needle = reinterpret_cast<const char *>(&needle_data [prev_needle_offset]);
310	const size_t needle_size = needle_offsets [i] - prev_needle_offset - `1`;
311
312	if (needle_size <= max_string_size && haystack_size <= max_string_size)
313	{
314	/// Get needle stats.
315	const size_t needle_stats_size = dispatchSearcher(
316	calculateNeedleStats<true>,
317	needle,
318	needle_size,
319	common_stats,
320	needle_ngram_storage.get());
321
322	size_t distance = needle_stats_size;
323
324	/// Combine with haystack stats, return to initial needle stats.
325	const size_t haystack_stats_size = dispatchSearcher(
326	calculateHaystackStatsAndMetric<true>,
327	haystack,
328	haystack_size,
329	common_stats,
330	distance,
331	haystack_ngram_storage.get());
332
333	/// Return to zero array stats.
334	for (size_t j = `0`; j < needle_stats_size; ++j)
335	--common_stats[needle_ngram_storage [j]];
336
337	/// For now, common stats is a zero array.
338
339
340	/// For !symmetric version we should not use haystack_stats_size.
341	if constexpr (symmetric)
342	res [i] = distance * `1.f` / std::max(haystack_stats_size + needle_stats_size, size_t(`1`));
343	else
344	res [i] = `1.f` - distance * `1.f` / std::max(needle_stats_size, size_t(`1`));
345	}
346	else
347	{
348	/// Strings are too big, we are assuming they are not the same. This is done because of limiting number
349	/// of bigrams added and not allocating too much memory.
350	if constexpr (symmetric)
351	res [i] = `1.f`;
352	else
353	res [i] = `0.f`;
354	}
355
356	prev_needle_offset = needle_offsets [i];
357	prev_haystack_offset = haystack_offsets [i];
358	}
359	}
360
361	static void constant_vector(
362	std::string haystack,
363	const ColumnString::Chars & needle_data,
364	const ColumnString::Offsets & needle_offsets,
365	PaddedPODArray<Float32> & res)
366	{
367	/// For symmetric version it is better to use vector_constant
368	if constexpr (symmetric)
369	{
370	vector_constant(needle_data, needle_offsets, std::move(haystack), res);
371	}
372	else
373	{
374	const size_t haystack_size = haystack.size();
375	haystack.resize(haystack_size + default_padding);
376
377	/// For logic explanation see vector_vector function.
378	const size_t needle_offsets_size = needle_offsets.size();
379	size_t prev_offset = `0`;
380
381	NgramStats common_stats = {};
382
383	std::unique_ptr<UInt16[]> needle_ngram_storage(new UInt16[max_string_size]);
384	std::unique_ptr<UInt16[]> haystack_ngram_storage(new UInt16[max_string_size]);
385
386	for (size_t i = `0`; i < needle_offsets_size; ++i)
387	{
388	const char * needle = reinterpret_cast<const char *>(&needle_data [prev_offset]);
389	const size_t needle_size = needle_offsets [i] - prev_offset - `1`;
390
391	if (needle_size <= max_string_size && haystack_size <= max_string_size)
392	{
393	const size_t needle_stats_size = dispatchSearcher(
394	calculateNeedleStats<true>,
395	needle,
396	needle_size,
397	common_stats,
398	needle_ngram_storage.get());
399
400	size_t distance = needle_stats_size;
401
402	dispatchSearcher(
403	calculateHaystackStatsAndMetric<true>,
404	haystack.data(),
405	haystack_size,
406	common_stats,
407	distance,
408	haystack_ngram_storage.get());
409
410	for (size_t j = `0`; j < needle_stats_size; ++j)
411	--common_stats[needle_ngram_storage [j]];
412
413	res [i] = `1.f` - distance * `1.f` / std::max(needle_stats_size, size_t(`1`));
414	}
415	else
416	{
417	res [i] = `0.f`;
418	}
419
420	prev_offset = needle_offsets [i];
421	}
422
423	}
424	}
425
426	static void vector_constant(
427	const ColumnString::Chars & data,
428	const ColumnString::Offsets & offsets,
429	std::string needle,
430	PaddedPODArray<Float32> & res)
431	{
432	/// zeroing our map
433	NgramStats common_stats = {};
434
435	/// The main motivation is to not allocate more on stack because we have already allocated a lot (128Kb).
436	/// And we can reuse these storages in one thread because we care only about what was written to first places.
437	std::unique_ptr<UInt16[]> ngram_storage(new UInt16[max_string_size]);
438
439	/// We use unsafe versions of getting ngrams, so I decided to use padded_data even in needle case.
440	const size_t needle_size = needle.size();
441	needle.resize(needle_size + default_padding);
442
443	const size_t needle_stats_size = dispatchSearcher(calculateNeedleStats<false>, needle.data(), needle_size, common_stats, nullptr);
444
445	size_t distance = needle_stats_size;
446	size_t prev_offset = `0`;
447	for (size_t i = `0`; i < offsets.size(); ++i)
448	{
449	const UInt8 * haystack = &data [prev_offset];
450	const size_t haystack_size = offsets [i] - prev_offset - `1`;
451	if (haystack_size <= max_string_size)
452	{
453	size_t haystack_stats_size = dispatchSearcher(
454	calculateHaystackStatsAndMetric<true>,
455	reinterpret_cast<const char *>(haystack),
456	haystack_size, common_stats,
457	distance,
458	ngram_storage.get());
459	/// For !symmetric version we should not use haystack_stats_size.
460	if constexpr (symmetric)
461	res [i] = distance * `1.f` / std::max(haystack_stats_size + needle_stats_size, size_t(`1`));
462	else
463	res [i] = `1.f` - distance * `1.f` / std::max(needle_stats_size, size_t(`1`));
464	}
465	else
466	{
467	/// if the strings are too big, we say they are completely not the same
468	if constexpr (symmetric)
469	res [i] = `1.f`;
470	else
471	res [i] = `0.f`;
472	}
473	distance = needle_stats_size;
474	prev_offset = offsets [i];
475	}
476	}
477	};
478
479
/// Name tag carrying the function name "ngramDistance".
struct NameNgramDistance
{
    static constexpr auto name = "ngramDistance";
};
/// Name tag carrying the function name "ngramDistanceCaseInsensitive".
struct NameNgramDistanceCaseInsensitive
{
    static constexpr auto name = "ngramDistanceCaseInsensitive";
};
488
/// Name tag carrying the function name "ngramDistanceUTF8".
struct NameNgramDistanceUTF8
{
    static constexpr auto name = "ngramDistanceUTF8";
};
493
/// Name tag carrying the function name "ngramDistanceCaseInsensitiveUTF8".
/// Note the word order: the struct is named ...UTF8CaseInsensitive, the function ...CaseInsensitiveUTF8.
struct NameNgramDistanceUTF8CaseInsensitive
{
    static constexpr auto name = "ngramDistanceCaseInsensitiveUTF8";
};
498
/// Name tag carrying the function name "ngramSearch".
struct NameNgramSearch
{
    static constexpr auto name = "ngramSearch";
};
/// Name tag carrying the function name "ngramSearchCaseInsensitive".
struct NameNgramSearchCaseInsensitive
{
    static constexpr auto name = "ngramSearchCaseInsensitive";
};
/// Name tag carrying the function name "ngramSearchUTF8".
struct NameNgramSearchUTF8
{
    static constexpr auto name = "ngramSearchUTF8";
};
511
/// Name tag carrying the function name "ngramSearchCaseInsensitiveUTF8".
/// Note the word order: the struct is named ...UTF8CaseInsensitive, the function ...CaseInsensitiveUTF8.
struct NameNgramSearchUTF8CaseInsensitive
{
    static constexpr auto name = "ngramSearchCaseInsensitiveUTF8";
};
516
517	using FunctionNgramDistance = FunctionsStringSimilarity<NgramDistanceImpl<`4`, UInt8, false, false, true>, NameNgramDistance>;
518	using FunctionNgramDistanceCaseInsensitive = FunctionsStringSimilarity<NgramDistanceImpl<`4`, UInt8, false, true, true>, NameNgramDistanceCaseInsensitive>;
519	using FunctionNgramDistanceUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<`3`, UInt32, true, false, true>, NameNgramDistanceUTF8>;
520	using FunctionNgramDistanceCaseInsensitiveUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<`3`, UInt32, true, true, true>, NameNgramDistanceUTF8CaseInsensitive>;
521
522	using FunctionNgramSearch = FunctionsStringSimilarity<NgramDistanceImpl<`4`, UInt8, false, false, false>, NameNgramSearch>;
523	using FunctionNgramSearchCaseInsensitive = FunctionsStringSimilarity<NgramDistanceImpl<`4`, UInt8, false, true, false>, NameNgramSearchCaseInsensitive>;
524	using FunctionNgramSearchUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<`3`, UInt32, true, false, false>, NameNgramSearchUTF8>;
525	using FunctionNgramSearchCaseInsensitiveUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<`3`, UInt32, true, true, false>, NameNgramSearchUTF8CaseInsensitive>;
526
527
528	void registerFunctionsStringSimilarity(FunctionFactory & factory)
529	{
530	factory.registerFunction<FunctionNgramDistance>();
531	factory.registerFunction<FunctionNgramDistanceCaseInsensitive>();
532	factory.registerFunction<FunctionNgramDistanceUTF8>();
533	factory.registerFunction<FunctionNgramDistanceCaseInsensitiveUTF8>();
534
535	factory.registerFunction<FunctionNgramSearch>();
536	factory.registerFunction<FunctionNgramSearchCaseInsensitive>();
537	factory.registerFunction<FunctionNgramSearchUTF8>();
538	factory.registerFunction<FunctionNgramSearchCaseInsensitiveUTF8>();
539	}
540
541	}
542

Browse the source code of ClickHouse/dbms/src/Functions/FunctionsStringSimilarity.cpp