Volnitsky.h source code [ClickHouse/dbms/src/Common/Volnitsky.h]

1	#pragma once
2
3	#include <algorithm>
4	#include <vector>
5	#include <stdint.h>
6	#include <string.h>
7	#include <Core/Types.h>
8	#include <Poco/UTF8Encoding.h>
9	#include <Poco/Unicode.h>
10	#include <Common/StringSearcher.h>
11	#include <Common/StringUtils/StringUtils.h>
12	#include <common/StringRef.h>
13	#include <common/unaligned.h>
14
15	/* Search for a substring in a string by Volnitsky's algorithm*
16	* http://volnitsky.com/project/str_search/
17	*
18	* `haystack` and `needle` can contain zero bytes.
19	*
20	* Algorithm:
21	* - if the `needle` is too small or too large, or too small `haystack`, use std::search or memchr;
22	* - when initializing, fill in an open-addressing linear probing hash table of the form
23	* hash from the bigram of needle -> the position of this bigram in needle + 1.
24	* (one is added only to distinguish zero offset from an empty cell)
25	* - the keys are not stored in the hash table, only the values are stored;
26	* - bigrams can be inserted several times if they occur in the needle several times;
27	* - when searching, take from haystack bigram, which should correspond to the last bigram of needle (comparing from the end);
28	* - look for it in the hash table, if found - get the offset from the hash table and compare the string bytewise;
29	* - if it did not match, we check the next cell of the hash table from the collision resolution chain;
30	* - if not found, skip to haystack almost the size of the needle bytes;
31	*
32	* MultiVolnitsky - search for multiple substrings in a string:
33	* - Add bigrams to hash table with string index. Then the usual Volnitsky search is used.
34	* - We are adding while searching, limiting the number of fallback searchers and the total number of added bigrams
35	*/
36
37
38	namespace DB
39	{
40	namespace VolnitskyTraits
41	{
42	using Offset = UInt8; /// Offset in the needle. For the basic algorithm, the length of the needle must not be greater than 255.
43	using Id = UInt8; /// Index of the string (within the array of multiple needles), must not be greater than 255.
44	using Ngram = UInt16; /// n-gram (2 bytes).
45
46	/* Fits into the L2 cache (of common Intel CPUs).*
47	* This number is extremely good for compilers as it is numeric_limits<Uint16>::max() and there are optimizations with movzwl and other instructions with 2 bytes
48	*/
49	static constexpr size_t hash_size = `64` * `1024`;
50
51	/// min haystack size to use main algorithm instead of fallback
52	static constexpr size_t min_haystack_size_for_algorithm = `20000`;
53
54	static inline bool isFallbackNeedle(const size_t needle_size, size_t haystack_size_hint = `0`)
55	{
56	return needle_size < `2` * sizeof(Ngram) \|\| needle_size >= std::numeric_limits<Offset>::max()
57	\|\| (haystack_size_hint && haystack_size_hint < min_haystack_size_for_algorithm);
58	}
59
60	static inline Ngram toNGram(const UInt8 * const pos) { return unalignedLoad<Ngram>(pos); }
61
62	template <typename Callback>
63	static inline void putNGramASCIICaseInsensitive(const UInt8 * const pos, const int offset, const Callback & putNGramBase)
64	{
65	struct Chars
66	{
67	UInt8 c0;
68	UInt8 c1;
69	};
70
71	union
72	{
73	Ngram n;
74	Chars chars;
75	};
76
77	n = toNGram(pos);
78
79	const auto c0_al = isAlphaASCII(chars.c0);
80	const auto c1_al = isAlphaASCII(chars.c1);
81
82	if (c0_al && c1_al)
83	{
84	/// 4 combinations: AB, aB, Ab, ab
85	putNGramBase(n, offset);
86	chars.c0 = alternateCaseIfAlphaASCII(chars.c0);
87	putNGramBase(n, offset);
88	chars.c1 = alternateCaseIfAlphaASCII(chars.c1);
89	putNGramBase(n, offset);
90	chars.c0 = alternateCaseIfAlphaASCII(chars.c0);
91	putNGramBase(n, offset);
92	}
93	else if (c0_al)
94	{
95	/// 2 combinations: A1, a1
96	putNGramBase(n, offset);
97	chars.c0 = alternateCaseIfAlphaASCII(chars.c0);
98	putNGramBase(n, offset);
99	}
100	else if (c1_al)
101	{
102	/// 2 combinations: 0B, 0b
103	putNGramBase(n, offset);
104	chars.c1 = alternateCaseIfAlphaASCII(chars.c1);
105	putNGramBase(n, offset);
106	}
107	else
108	/// 1 combination: 01
109	putNGramBase(n, offset);
110	}
111
112	template <bool CaseSensitive, bool ASCII, typename Callback>
113	static inline void putNGram(const UInt8 * const pos, const int offset, [[maybe_unused]] const UInt8 * const begin, const Callback & putNGramBase)
114	{
115	if constexpr (CaseSensitive)
116	{
117	putNGramBase(toNGram(pos), offset);
118	}
119	else
120	{
121	if constexpr (ASCII)
122	{
123	putNGramASCIICaseInsensitive(pos, offset, putNGramBase);
124	}
125	else
126	{
127	struct Chars
128	{
129	UInt8 c0;
130	UInt8 c1;
131	};
132
133	union
134	{
135	VolnitskyTraits::Ngram n;
136	Chars chars;
137	};
138
139	n = toNGram(pos);
140
141	if (isascii(chars.c0) && isascii(chars.c1))
142	putNGramASCIICaseInsensitive(pos, offset, putNGramBase);
143	else
144	{
145	/* n-gram (in the case of n = 2)*
146	* can be entirely located within one code point,
147	* or intersect with two code points.
148	*
149	* In the first case, you need to consider up to two alternatives - this code point in upper and lower case,
150	* and in the second case - up to four alternatives - fragments of two code points in all combinations of cases.
151	*
152	* It does not take into account the dependence of the case-transformation from the locale (for example - Turkish `Ii`)
153	* as well as composition / decomposition and other features.
154	*
155	* It also does not work if characters with lower and upper cases are represented by different number of bytes or code points.
156	*/
157
158	using Seq = UInt8[`6`];
159
160	static const Poco::UTF8Encoding utf8;
161
162	if (UTF8::isContinuationOctet(chars.c1))
163	{
164	/// ngram is inside a sequence
165	auto seq_pos = pos;
166	UTF8::syncBackward(seq_pos, begin);
167
168	const auto u32 = utf8.convert(seq_pos);
169	const auto l_u32 = Poco::Unicode::toLower(u32);
170	const auto u_u32 = Poco::Unicode::toUpper(u32);
171
172	/// symbol is case-independent
173	if (l_u32 == u_u32)
174	putNGramBase(n, offset);
175	else
176	{
177	/// where is the given ngram in respect to the start of UTF-8 sequence?
178	const auto seq_ngram_offset = pos - seq_pos;
179
180	Seq seq;
181
182	/// put ngram for lowercase
183	utf8.convert(l_u32, seq, sizeof(seq));
184	chars.c0 = seq[seq_ngram_offset];
185	chars.c1 = seq[seq_ngram_offset + `1`];
186	putNGramBase(n, offset);
187
188	/// put ngram for uppercase
189	utf8.convert(u_u32, seq, sizeof(seq));
190	chars.c0 = seq[seq_ngram_offset]; //-V519
191	chars.c1 = seq[seq_ngram_offset + `1`]; //-V519
192	putNGramBase(n, offset);
193	}
194	}
195	else
196	{
197	/// ngram is on the boundary of two sequences
198	/// first sequence may start before u_pos if it is not ASCII
199	auto first_seq_pos = pos;
200	UTF8::syncBackward(first_seq_pos, begin);
201	/// where is the given ngram in respect to the start of first UTF-8 sequence?
202	const auto seq_ngram_offset = pos - first_seq_pos;
203
204	const auto first_u32 = utf8.convert(first_seq_pos);
205	const auto first_l_u32 = Poco::Unicode::toLower(first_u32);
206	const auto first_u_u32 = Poco::Unicode::toUpper(first_u32);
207
208	/// second sequence always start immediately after u_pos
209	auto second_seq_pos = pos + `1`;
210
211	const auto second_u32 = utf8.convert(second_seq_pos); /// TODO This assumes valid UTF-8 or zero byte after needle.
212	const auto second_l_u32 = Poco::Unicode::toLower(second_u32);
213	const auto second_u_u32 = Poco::Unicode::toUpper(second_u32);
214
215	/// both symbols are case-independent
216	if (first_l_u32 == first_u_u32 && second_l_u32 == second_u_u32)
217	{
218	putNGramBase(n, offset);
219	}
220	else if (first_l_u32 == first_u_u32)
221	{
222	/// first symbol is case-independent
223	Seq seq;
224
225	/// put ngram for lowercase
226	utf8.convert(second_l_u32, seq, sizeof(seq));
227	chars.c1 = seq[`0`];
228	putNGramBase(n, offset);
229
230	/// put ngram from uppercase, if it is different
231	utf8.convert(second_u_u32, seq, sizeof(seq));
232	if (chars.c1 != seq[`0`])
233	{
234	chars.c1 = seq[`0`];
235	putNGramBase(n, offset);
236	}
237	}
238	else if (second_l_u32 == second_u_u32)
239	{
240	/// second symbol is case-independent
241	Seq seq;
242
243	/// put ngram for lowercase
244	utf8.convert(first_l_u32, seq, sizeof(seq));
245	chars.c0 = seq[seq_ngram_offset];
246	putNGramBase(n, offset);
247
248	/// put ngram for uppercase, if it is different
249	utf8.convert(first_u_u32, seq, sizeof(seq));
250	if (chars.c0 != seq[seq_ngram_offset])
251	{
252	chars.c0 = seq[seq_ngram_offset];
253	putNGramBase(n, offset);
254	}
255	}
256	else
257	{
258	Seq first_l_seq;
259	Seq first_u_seq;
260	Seq second_l_seq;
261	Seq second_u_seq;
262
263	utf8.convert(first_l_u32, first_l_seq, sizeof(first_l_seq));
264	utf8.convert(first_u_u32, first_u_seq, sizeof(first_u_seq));
265	utf8.convert(second_l_u32, second_l_seq, sizeof(second_l_seq));
266	utf8.convert(second_u_u32, second_u_seq, sizeof(second_u_seq));
267
268	auto c0l = first_l_seq[seq_ngram_offset];
269	auto c0u = first_u_seq[seq_ngram_offset];
270	auto c1l = second_l_seq[`0`];
271	auto c1u = second_u_seq[`0`];
272
273	/// ngram for ll
274	chars.c0 = c0l;
275	chars.c1 = c1l;
276	putNGramBase(n, offset);
277
278	if (c0l != c0u)
279	{
280	/// ngram for Ul
281	chars.c0 = c0u;
282	chars.c1 = c1l;
283	putNGramBase(n, offset);
284	}
285
286	if (c1l != c1u)
287	{
288	/// ngram for lU
289	chars.c0 = c0l;
290	chars.c1 = c1u;
291	putNGramBase(n, offset);
292	}
293
294	if (c0l != c0u && c1l != c1u)
295	{
296	/// ngram for UU
297	chars.c0 = c0u;
298	chars.c1 = c1u;
299	putNGramBase(n, offset);
300	}
301	}
302	}
303	}
304	}
305	}
306	}
307	}
308
309
310	/// @todo store lowercase needle to speed up in case there are numerous occurrences of bigrams from needle in haystack
311	template <bool CaseSensitive, bool ASCII, typename FallbackSearcher>
312	class VolnitskyBase
313	{
314	protected:
315	const UInt8 * const needle;
316	const size_t needle_size;
317	const UInt8 * const needle_end = needle + needle_size;
318	/// For how long we move, if the n-gram from haystack is not found in the hash table.
319	const size_t step = needle_size - sizeof(VolnitskyTraits::Ngram) + `1`;
320
321	/* max needle length is 255, max distinct ngrams for case-sensitive is (255 - 1), case-insensitive is 4 * (255 - 1)*
322	* storage of 64K ngrams (n = 2, 128 KB) should be large enough for both cases */
323	VolnitskyTraits::Offset hash[VolnitskyTraits::hash_size]; /// Hash table.
324
325	const bool fallback; /// Do we need to use the fallback algorithm.
326
327	FallbackSearcher fallback_searcher;
328
329	public:
330	using Searcher = FallbackSearcher;
331
332	/* haystack_size_hint - the expected total size of the haystack for `search` calls. Optional (zero means unspecified).*
333	* If you specify it small enough, the fallback algorithm will be used,
334	* since it is considered that it's useless to waste time initializing the hash table.
335	*/
336	VolnitskyBase(const char * const needle_, const size_t needle_size_, size_t haystack_size_hint = `0`)
337	: needle{reinterpret_cast<const UInt8 *>(needle_)}
338	, needle_size{needle_size_}
339	, fallback{VolnitskyTraits::isFallbackNeedle(needle_size, haystack_size_hint)}
340	, fallback_searcher{needle_, needle_size}
341	{
342	if (fallback)
343	return;
344
345	memset(hash, `0`, sizeof(hash));
346
347	auto callback = [this](const VolnitskyTraits::Ngram ngram, const int offset) { return this->putNGramBase(ngram, offset); };
348	/// ssize_t is used here because unsigned can't be used with condition like `i >= 0`, unsigned always >= 0
349	/// And also adding from the end guarantees that we will find first occurence because we will lookup bigger offsets first.
350	for (auto i = static_cast<ssize_t>(needle_size - sizeof(VolnitskyTraits::Ngram)); i >= `0`; --i)
351	VolnitskyTraits::putNGram<CaseSensitive, ASCII>(this->needle + i, i + `1`, this->needle, callback);
352	}
353
354
355	/// If not found, the end of the haystack is returned.
356	const UInt8 * search(const UInt8 * const haystack, const size_t haystack_size) const
357	{
358	if (needle_size == `0`)
359	return haystack;
360
361	const auto haystack_end = haystack + haystack_size;
362
363	if (fallback \|\| haystack_size <= needle_size)
364	return fallback_searcher.search(haystack, haystack_end);
365
366	/// Let's "apply" the needle to the haystack and compare the n-gram from the end of the needle.
367	const auto * pos = haystack + needle_size - sizeof(VolnitskyTraits::Ngram);
368	for (; pos <= haystack_end - needle_size; pos += step)
369	{
370	/// We look at all the cells of the hash table that can correspond to the n-gram from haystack.
371	for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num];
372	cell_num = (cell_num + `1`) % VolnitskyTraits::hash_size)
373	{
374	/// When found - compare bytewise, using the offset from the hash table.
375	const auto res = pos - (hash[cell_num] - `1`);
376
377	/// pointer in the code is always padded array so we can use pagesafe semantics
378	if (fallback_searcher.compare(haystack, haystack_end, res))
379	return res;
380	}
381	}
382
383	return fallback_searcher.search(pos - step + `1`, haystack_end);
384	}
385
386	const char * search(const char * haystack, size_t haystack_size) const
387	{
388	return reinterpret_cast<const char >(search(reinterpret_cast<const* UInt8 *>(haystack), haystack_size));
389	}
390
391	protected:
392	void putNGramBase(const VolnitskyTraits::Ngram ngram, const int offset)
393	{
394	/// Put the offset for the n-gram in the corresponding cell or the nearest free cell.
395	size_t cell_num = ngram % VolnitskyTraits::hash_size;
396
397	while (hash[cell_num])
398	cell_num = (cell_num + `1`) % VolnitskyTraits::hash_size; /// Search for the next free cell.
399
400	hash[cell_num] = offset;
401	}
402	};
403
404
405	template <bool CaseSensitive, bool ASCII, typename FallbackSearcher>
406	class MultiVolnitskyBase
407	{
408	private:
409	/// needles and their offsets
410	const std::vector<StringRef> & needles;
411
412
413	/// fallback searchers
414	std::vector<size_t> fallback_needles;
415	std::vector<FallbackSearcher> fallback_searchers;
416
417	/// because std::pair<> is not POD
418	struct OffsetId
419	{
420	VolnitskyTraits::Id id;
421	VolnitskyTraits::Offset off;
422	};
423
424	OffsetId hash[VolnitskyTraits::hash_size];
425
426	/// step for each bunch of strings
427	size_t step;
428
429	/// last index of offsets that was not processed
430	size_t last;
431
432	/// limit for adding to hashtable. In worst case with case insentive search, the table will be filled at most as half
433	static constexpr size_t small_limit = VolnitskyTraits::hash_size / `8`;
434
435	public:
436	MultiVolnitskyBase(const std::vector<StringRef> & needles_) : needles{needles_}, step{`0`}, last{`0`}
437	{
438	fallback_searchers.reserve(needles.size());
439	}
440
441	/**
442	* This function is needed to initialize hash table
443	* Returns `true` if there is nothing to initialize
444	* and `false` if we have something to initialize and initializes it.
445	* This function is a kind of fallback if there are many needles.
446	* We actually destroy the hash table and initialize it with uninitialized needles
447	* and search through the haystack again.
448	* The actual usage of this function is like this:
449	* while (hasMoreToSearch())
450	* {
451	* search inside the haystack with the known needles
452	* }
453	*/
454	bool hasMoreToSearch()
455	{
456	if (last == needles.size())
457	return false;
458
459	memset(hash, `0`, sizeof(hash));
460	fallback_needles.clear();
461	step = std::numeric_limits<size_t>::max();
462
463	size_t buf = `0`;
464	size_t size = needles.size();
465
466	for (; last < size; ++last)
467	{
468	const char * cur_needle_data = needles[last].data;
469	const size_t cur_needle_size = needles[last].size;
470
471	/// save the indices of fallback searchers
472	if (VolnitskyTraits::isFallbackNeedle(cur_needle_size))
473	{
474	fallback_needles.push_back(last);
475	}
476	else
477	{
478	/// put all bigrams
479	auto callback = [this](const VolnitskyTraits::Ngram ngram, const int offset)
480	{
481	return this->putNGramBase(ngram, offset, this->last);
482	};
483
484	buf += cur_needle_size - sizeof(VolnitskyTraits::Ngram) + `1`;
485
486	/// this is the condition when we actually need to stop and start searching with known needles
487	if (buf > small_limit)
488	break;
489
490	step = std::min(step, cur_needle_size - sizeof(VolnitskyTraits::Ngram) + `1`);
491	for (auto i = static_cast<int>(cur_needle_size - sizeof(VolnitskyTraits::Ngram)); i >= `0`; --i)
492	{
493	VolnitskyTraits::putNGram<CaseSensitive, ASCII>(
494	reinterpret_cast<const UInt8 *>(cur_needle_data) + i,
495	i + `1`,
496	reinterpret_cast<const UInt8 *>(cur_needle_data),
497	callback);
498	}
499	}
500	fallback_searchers.emplace_back(cur_needle_data, cur_needle_size);
501	}
502	return true;
503	}
504
505	inline bool searchOne(const UInt8 * haystack, const UInt8 * haystack_end) const
506	{
507	const size_t fallback_size = fallback_needles.size();
508	for (size_t i = `0`; i < fallback_size; ++i)
509	if (fallback_searchers[fallback_needles[i]].search(haystack, haystack_end) != haystack_end)
510	return true;
511
512	/// check if we have one non empty volnitsky searcher
513	if (step != std::numeric_limits<size_t>::max())
514	{
515	const auto * pos = haystack + step - sizeof(VolnitskyTraits::Ngram);
516	for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos += step)
517	{
518	for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num].off;
519	cell_num = (cell_num + `1`) % VolnitskyTraits::hash_size)
520	{
521	if (pos >= haystack + hash[cell_num].off - `1`)
522	{
523	const auto res = pos - (hash[cell_num].off - `1`);
524	const size_t ind = hash[cell_num].id;
525	if (res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(haystack, haystack_end, res))
526	return true;
527	}
528	}
529	}
530	}
531	return false;
532	}
533
534	inline size_t searchOneFirstIndex(const UInt8 * haystack, const UInt8 * haystack_end) const
535	{
536	const size_t fallback_size = fallback_needles.size();
537
538	size_t ans = std::numeric_limits<size_t>::max();
539
540	for (size_t i = `0`; i < fallback_size; ++i)
541	if (fallback_searchers[fallback_needles[i]].search(haystack, haystack_end) != haystack_end)
542	ans = std::min(ans, fallback_needles[i]);
543
544	/// check if we have one non empty volnitsky searcher
545	if (step != std::numeric_limits<size_t>::max())
546	{
547	const auto * pos = haystack + step - sizeof(VolnitskyTraits::Ngram);
548	for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos += step)
549	{
550	for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num].off;
551	cell_num = (cell_num + `1`) % VolnitskyTraits::hash_size)
552	{
553	if (pos >= haystack + hash[cell_num].off - `1`)
554	{
555	const auto res = pos - (hash[cell_num].off - `1`);
556	const size_t ind = hash[cell_num].id;
557	if (res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(haystack, haystack_end, res))
558	ans = std::min(ans, ind);
559	}
560	}
561	}
562	}
563
564	/*
565	* if nothing was found, ans + 1 will be equal to zero and we can
566	* assign it into the result because we need to return the position starting with one
567	*/
568	return ans + `1`;
569	}
570
571	template <typename CountCharsCallback>
572	inline UInt64 searchOneFirstPosition(const UInt8 * haystack, const UInt8 * haystack_end, const CountCharsCallback & count_chars) const
573	{
574	const size_t fallback_size = fallback_needles.size();
575
576	UInt64 ans = std::numeric_limits<UInt64>::max();
577
578	for (size_t i = `0`; i < fallback_size; ++i)
579	if (auto pos = fallback_searchers[fallback_needles[i]].search(haystack, haystack_end); pos != haystack_end)
580	ans = std::min<UInt64>(ans, pos - haystack);
581
582	/// check if we have one non empty volnitsky searcher
583	if (step != std::numeric_limits<size_t>::max())
584	{
585	const auto * pos = haystack + step - sizeof(VolnitskyTraits::Ngram);
586	for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos += step)
587	{
588	for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num].off;
589	cell_num = (cell_num + `1`) % VolnitskyTraits::hash_size)
590	{
591	if (pos >= haystack + hash[cell_num].off - `1`)
592	{
593	const auto res = pos - (hash[cell_num].off - `1`);
594	const size_t ind = hash[cell_num].id;
595	if (res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(haystack, haystack_end, res))
596	ans = std::min<UInt64>(ans, res - haystack);
597	}
598	}
599	}
600	}
601	if (ans == std::numeric_limits<UInt64>::max())
602	return `0`;
603	return count_chars(haystack, haystack + ans);
604	}
605
606	template <typename CountCharsCallback, typename AnsType>
607	inline void searchOneAll(const UInt8 * haystack, const UInt8 * haystack_end, AnsType * ans, const CountCharsCallback & count_chars) const
608	{
609	const size_t fallback_size = fallback_needles.size();
610	for (size_t i = `0`; i < fallback_size; ++i)
611	{
612	const UInt8 * ptr = fallback_searchers[fallback_needles[i]].search(haystack, haystack_end);
613	if (ptr != haystack_end)
614	ans[fallback_needles[i]] = count_chars(haystack, ptr);
615	}
616
617	/// check if we have one non empty volnitsky searcher
618	if (step != std::numeric_limits<size_t>::max())
619	{
620	const auto * pos = haystack + step - sizeof(VolnitskyTraits::Ngram);
621	for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos += step)
622	{
623	for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num].off;
624	cell_num = (cell_num + `1`) % VolnitskyTraits::hash_size)
625	{
626	if (pos >= haystack + hash[cell_num].off - `1`)
627	{
628	const auto * res = pos - (hash[cell_num].off - `1`);
629	const size_t ind = hash[cell_num].id;
630	if (ans[ind] == `0` && res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(haystack, haystack_end, res))
631	ans[ind] = count_chars(haystack, res);
632	}
633	}
634	}
635	}
636	}
637
638	void putNGramBase(const VolnitskyTraits::Ngram ngram, const int offset, const size_t num)
639	{
640	size_t cell_num = ngram % VolnitskyTraits::hash_size;
641
642	while (hash[cell_num].off)
643	cell_num = (cell_num + `1`) % VolnitskyTraits::hash_size;
644
645	hash[cell_num] = {static_cast<VolnitskyTraits::Id>(num), static_cast<VolnitskyTraits::Offset>(offset)};
646	}
647	};
648
649
650	using Volnitsky = VolnitskyBase<true, true, ASCIICaseSensitiveStringSearcher>;
651	using VolnitskyUTF8 = VolnitskyBase<true, false, ASCIICaseSensitiveStringSearcher>; /// exactly same as Volnitsky
652	using VolnitskyCaseInsensitive = VolnitskyBase<false, true, ASCIICaseInsensitiveStringSearcher>; /// ignores non-ASCII bytes
653	using VolnitskyCaseInsensitiveUTF8 = VolnitskyBase<false, false, UTF8CaseInsensitiveStringSearcher>;
654
655	using VolnitskyCaseSensitiveToken = VolnitskyBase<true, true, ASCIICaseSensitiveTokenSearcher>;
656	using VolnitskyCaseInsensitiveToken = VolnitskyBase<false, true, ASCIICaseInsensitiveTokenSearcher>;
657
658	using MultiVolnitsky = MultiVolnitskyBase<true, true, ASCIICaseSensitiveStringSearcher>;
659	using MultiVolnitskyUTF8 = MultiVolnitskyBase<true, false, ASCIICaseSensitiveStringSearcher>;
660	using MultiVolnitskyCaseInsensitive = MultiVolnitskyBase<false, true, ASCIICaseInsensitiveStringSearcher>;
661	using MultiVolnitskyCaseInsensitiveUTF8 = MultiVolnitskyBase<false, false, UTF8CaseInsensitiveStringSearcher>;
662
663
664	}
665

Browse the source code of ClickHouse/dbms/src/Common/Volnitsky.h