1#include <Storages/MergeTree/MergeTreeIndexFullText.h>
2
3#include <Common/StringUtils/StringUtils.h>
4#include <Common/UTF8Helpers.h>
5#include <DataTypes/DataTypesNumber.h>
6#include <IO/WriteHelpers.h>
7#include <IO/ReadHelpers.h>
8#include <Interpreters/ExpressionActions.h>
9#include <Interpreters/ExpressionAnalyzer.h>
10#include <Interpreters/SyntaxAnalyzer.h>
11#include <Interpreters/misc.h>
12#include <Storages/MergeTree/MergeTreeData.h>
13#include <Storages/MergeTree/RPNBuilder.h>
14#include <Parsers/ASTIdentifier.h>
15#include <Parsers/ASTLiteral.h>
16#include <Parsers/ASTSubquery.h>
17
18#include <Poco/Logger.h>
19
20#include <boost/algorithm/string.hpp>
21
22
23namespace DB
24{
25
26namespace ErrorCodes
27{
28 extern const int INCORRECT_QUERY;
29}
30
31
32/// Adds all tokens from string to bloom filter.
33static void stringToBloomFilter(
34 const char * data, size_t size, const std::unique_ptr<ITokenExtractor> & token_extractor, BloomFilter & bloom_filter)
35{
36 size_t cur = 0;
37 size_t token_start = 0;
38 size_t token_len = 0;
39 while (cur < size && token_extractor->next(data, size, &cur, &token_start, &token_len))
40 bloom_filter.add(data + token_start, token_len);
41}
42
43/// Adds all tokens from like pattern string to bloom filter. (Because like pattern can contain `\%` and `\_`.)
44static void likeStringToBloomFilter(
45 const String & data, const std::unique_ptr<ITokenExtractor> & token_extractor, BloomFilter & bloom_filter)
46{
47 size_t cur = 0;
48 String token;
49 while (cur < data.size() && token_extractor->nextLike(data, &cur, token))
50 bloom_filter.add(token.c_str(), token.size());
51}
52/// Unified condition for equals, startsWith and endsWith
53bool MergeTreeConditionFullText::createFunctionEqualsCondition(RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx)
54{
55 out.function = RPNElement::FUNCTION_EQUALS;
56 out.bloom_filter = std::make_unique<BloomFilter>(
57 idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed);
58
59 const auto & str = value.get<String>();
60 stringToBloomFilter(str.c_str(), str.size(), idx.token_extractor_func, *out.bloom_filter);
61 return true;
62}
63
64MergeTreeIndexGranuleFullText::MergeTreeIndexGranuleFullText(const MergeTreeIndexFullText & index_)
65 : IMergeTreeIndexGranule()
66 , index(index_)
67 , bloom_filters(
68 index.columns.size(), BloomFilter(index.bloom_filter_size, index.bloom_filter_hashes, index.seed))
69 , has_elems(false) {}
70
71void MergeTreeIndexGranuleFullText::serializeBinary(WriteBuffer & ostr) const
72{
73 if (empty())
74 throw Exception("Attempt to write empty minmax index " + backQuote(index.name), ErrorCodes::LOGICAL_ERROR);
75
76 for (const auto & bloom_filter : bloom_filters)
77 ostr.write(reinterpret_cast<const char *>(bloom_filter.getFilter().data()), index.bloom_filter_size);
78}
79
80void MergeTreeIndexGranuleFullText::deserializeBinary(ReadBuffer & istr)
81{
82 for (auto & bloom_filter : bloom_filters)
83 {
84 istr.read(reinterpret_cast<char *>(bloom_filter.getFilter().data()), index.bloom_filter_size);
85 }
86 has_elems = true;
87}
88
89
90MergeTreeIndexAggregatorFullText::MergeTreeIndexAggregatorFullText(const MergeTreeIndexFullText & index_)
91 : index(index_), granule(std::make_shared<MergeTreeIndexGranuleFullText>(index)) {}
92
93MergeTreeIndexGranulePtr MergeTreeIndexAggregatorFullText::getGranuleAndReset()
94{
95 auto new_granule = std::make_shared<MergeTreeIndexGranuleFullText>(index);
96 new_granule.swap(granule);
97 return new_granule;
98}
99
100void MergeTreeIndexAggregatorFullText::update(const Block & block, size_t * pos, size_t limit)
101{
102 if (*pos >= block.rows())
103 throw Exception(
104 "The provided position is not less than the number of block rows. Position: "
105 + toString(*pos) + ", Block rows: " + toString(block.rows()) + ".", ErrorCodes::LOGICAL_ERROR);
106
107 size_t rows_read = std::min(limit, block.rows() - *pos);
108
109 for (size_t col = 0; col < index.columns.size(); ++col)
110 {
111 const auto & column = block.getByName(index.columns[col]).column;
112 for (size_t i = 0; i < rows_read; ++i)
113 {
114 auto ref = column->getDataAt(*pos + i);
115 stringToBloomFilter(ref.data, ref.size, index.token_extractor_func, granule->bloom_filters[col]);
116 }
117 }
118 granule->has_elems = true;
119 *pos += rows_read;
120}
121
122
123const MergeTreeConditionFullText::AtomMap MergeTreeConditionFullText::atom_map
124{
125 {
126 "notEquals",
127 [] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx)
128 {
129 out.function = RPNElement::FUNCTION_NOT_EQUALS;
130 out.bloom_filter = std::make_unique<BloomFilter>(
131 idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed);
132
133 const auto & str = value.get<String>();
134 stringToBloomFilter(str.c_str(), str.size(), idx.token_extractor_func, *out.bloom_filter);
135 return true;
136 }
137 },
138 {
139 "equals",
140 [] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx)
141 {
142 return createFunctionEqualsCondition(out, value, idx);
143 }
144 },
145 {
146 "like",
147 [] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx)
148 {
149 out.function = RPNElement::FUNCTION_EQUALS;
150 out.bloom_filter = std::make_unique<BloomFilter>(
151 idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed);
152
153 const auto & str = value.get<String>();
154 likeStringToBloomFilter(str, idx.token_extractor_func, *out.bloom_filter);
155 return true;
156 }
157 },
158 {
159 "notLike",
160 [] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx)
161 {
162 out.function = RPNElement::FUNCTION_NOT_EQUALS;
163 out.bloom_filter = std::make_unique<BloomFilter>(
164 idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed);
165
166 const auto & str = value.get<String>();
167 likeStringToBloomFilter(str, idx.token_extractor_func, *out.bloom_filter);
168 return true;
169 }
170 },
171 {
172 "hasToken",
173 [] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx)
174 {
175 out.function = RPNElement::FUNCTION_EQUALS;
176 out.bloom_filter = std::make_unique<BloomFilter>(
177 idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed);
178
179 const auto & str = value.get<String>();
180 stringToBloomFilter(str.c_str(), str.size(), idx.token_extractor_func, *out.bloom_filter);
181 return true;
182 }
183 },
184 {
185 "startsWith",
186 [] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx)
187 {
188 return createFunctionEqualsCondition(out, value, idx);
189 }
190 },
191 {
192 "endsWith",
193 [] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx)
194 {
195 return createFunctionEqualsCondition(out, value, idx);
196 }
197 },
198 {
199 "multiSearchAny",
200 [] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx)
201 {
202 out.function = RPNElement::FUNCTION_MULTI_SEARCH;
203
204 /// 2d vector is not needed here but is used because already exists for FUNCTION_IN
205 std::vector<std::vector<BloomFilter>> bloom_filters;
206 bloom_filters.emplace_back();
207 for (const auto & element : value.get<Array>())
208 {
209 if (element.getType() != Field::Types::String)
210 return false;
211
212 bloom_filters.back().emplace_back(idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed);
213 const auto & str = element.get<String>();
214 stringToBloomFilter(str.c_str(), str.size(), idx.token_extractor_func, bloom_filters.back().back());
215 }
216 out.set_bloom_filters = std::move(bloom_filters);
217 return true;
218 }
219 },
220 {
221 "notIn",
222 [] (RPNElement & out, const Field &, const MergeTreeIndexFullText &)
223 {
224 out.function = RPNElement::FUNCTION_NOT_IN;
225 return true;
226 }
227 },
228 {
229 "in",
230 [] (RPNElement & out, const Field &, const MergeTreeIndexFullText &)
231 {
232 out.function = RPNElement::FUNCTION_IN;
233 return true;
234 }
235 },
236};
237
238MergeTreeConditionFullText::MergeTreeConditionFullText(
239 const SelectQueryInfo & query_info,
240 const Context & context,
241 const MergeTreeIndexFullText & index_) : index(index_), prepared_sets(query_info.sets)
242{
243 rpn = std::move(
244 RPNBuilder<RPNElement>(
245 query_info, context,
246 [this] (const ASTPtr & node, const Context & /* context */, Block & block_with_constants, RPNElement & out) -> bool
247 {
248 return this->atomFromAST(node, block_with_constants, out);
249 }).extractRPN());
250}
251
252bool MergeTreeConditionFullText::alwaysUnknownOrTrue() const
253{
254 /// Check like in KeyCondition.
255 std::vector<bool> rpn_stack;
256
257 for (const auto & element : rpn)
258 {
259 if (element.function == RPNElement::FUNCTION_UNKNOWN
260 || element.function == RPNElement::ALWAYS_TRUE)
261 {
262 rpn_stack.push_back(true);
263 }
264 else if (element.function == RPNElement::FUNCTION_EQUALS
265 || element.function == RPNElement::FUNCTION_NOT_EQUALS
266 || element.function == RPNElement::FUNCTION_IN
267 || element.function == RPNElement::FUNCTION_NOT_IN
268 || element.function == RPNElement::FUNCTION_MULTI_SEARCH
269 || element.function == RPNElement::ALWAYS_FALSE)
270 {
271 rpn_stack.push_back(false);
272 }
273 else if (element.function == RPNElement::FUNCTION_NOT)
274 {
275 // do nothing
276 }
277 else if (element.function == RPNElement::FUNCTION_AND)
278 {
279 auto arg1 = rpn_stack.back();
280 rpn_stack.pop_back();
281 auto arg2 = rpn_stack.back();
282 rpn_stack.back() = arg1 && arg2;
283 }
284 else if (element.function == RPNElement::FUNCTION_OR)
285 {
286 auto arg1 = rpn_stack.back();
287 rpn_stack.pop_back();
288 auto arg2 = rpn_stack.back();
289 rpn_stack.back() = arg1 || arg2;
290 }
291 else
292 throw Exception("Unexpected function type in KeyCondition::RPNElement", ErrorCodes::LOGICAL_ERROR);
293 }
294
295 return rpn_stack[0];
296}
297
298bool MergeTreeConditionFullText::mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx_granule) const
299{
300 std::shared_ptr<MergeTreeIndexGranuleFullText> granule
301 = std::dynamic_pointer_cast<MergeTreeIndexGranuleFullText>(idx_granule);
302 if (!granule)
303 throw Exception(
304 "BloomFilter index condition got a granule with the wrong type.", ErrorCodes::LOGICAL_ERROR);
305
306 /// Check like in KeyCondition.
307 std::vector<BoolMask> rpn_stack;
308 for (const auto & element : rpn)
309 {
310 if (element.function == RPNElement::FUNCTION_UNKNOWN)
311 {
312 rpn_stack.emplace_back(true, true);
313 }
314 else if (element.function == RPNElement::FUNCTION_EQUALS
315 || element.function == RPNElement::FUNCTION_NOT_EQUALS)
316 {
317 rpn_stack.emplace_back(
318 granule->bloom_filters[element.key_column].contains(*element.bloom_filter), true);
319
320 if (element.function == RPNElement::FUNCTION_NOT_EQUALS)
321 rpn_stack.back() = !rpn_stack.back();
322 }
323 else if (element.function == RPNElement::FUNCTION_IN
324 || element.function == RPNElement::FUNCTION_NOT_IN)
325 {
326 std::vector<bool> result(element.set_bloom_filters.back().size(), true);
327
328 for (size_t column = 0; column < element.set_key_position.size(); ++column)
329 {
330 const size_t key_idx = element.set_key_position[column];
331
332 const auto & bloom_filters = element.set_bloom_filters[column];
333 for (size_t row = 0; row < bloom_filters.size(); ++row)
334 result[row] = result[row] && granule->bloom_filters[key_idx].contains(bloom_filters[row]);
335 }
336
337 rpn_stack.emplace_back(
338 std::find(std::cbegin(result), std::cend(result), true) != std::end(result), true);
339 if (element.function == RPNElement::FUNCTION_NOT_IN)
340 rpn_stack.back() = !rpn_stack.back();
341 }
342 else if (element.function == RPNElement::FUNCTION_MULTI_SEARCH)
343 {
344 std::vector<bool> result(element.set_bloom_filters.back().size(), true);
345
346 const auto & bloom_filters = element.set_bloom_filters[0];
347
348 for (size_t row = 0; row < bloom_filters.size(); ++row)
349 result[row] = result[row] && granule->bloom_filters[element.key_column].contains(bloom_filters[row]);
350
351 rpn_stack.emplace_back(
352 std::find(std::cbegin(result), std::cend(result), true) != std::end(result), true);
353 }
354 else if (element.function == RPNElement::FUNCTION_NOT)
355 {
356 rpn_stack.back() = !rpn_stack.back();
357 }
358 else if (element.function == RPNElement::FUNCTION_AND)
359 {
360 auto arg1 = rpn_stack.back();
361 rpn_stack.pop_back();
362 auto arg2 = rpn_stack.back();
363 rpn_stack.back() = arg1 & arg2;
364 }
365 else if (element.function == RPNElement::FUNCTION_OR)
366 {
367 auto arg1 = rpn_stack.back();
368 rpn_stack.pop_back();
369 auto arg2 = rpn_stack.back();
370 rpn_stack.back() = arg1 | arg2;
371 }
372 else if (element.function == RPNElement::ALWAYS_FALSE)
373 {
374 rpn_stack.emplace_back(false, true);
375 }
376 else if (element.function == RPNElement::ALWAYS_TRUE)
377 {
378 rpn_stack.emplace_back(true, false);
379 }
380 else
381 throw Exception("Unexpected function type in KeyCondition::RPNElement", ErrorCodes::LOGICAL_ERROR);
382 }
383
384 if (rpn_stack.size() != 1)
385 throw Exception("Unexpected stack size in KeyCondition::mayBeTrueInRange", ErrorCodes::LOGICAL_ERROR);
386
387 return rpn_stack[0].can_be_true;
388}
389
390bool MergeTreeConditionFullText::getKey(const ASTPtr & node, size_t & key_column_num)
391{
392 auto it = std::find(index.columns.begin(), index.columns.end(), node->getColumnName());
393 if (it == index.columns.end())
394 return false;
395
396 key_column_num = static_cast<size_t>(it - index.columns.begin());
397 return true;
398}
399
400bool MergeTreeConditionFullText::atomFromAST(
401 const ASTPtr & node, Block & block_with_constants, RPNElement & out)
402{
403 Field const_value;
404 DataTypePtr const_type;
405 if (const auto * func = typeid_cast<const ASTFunction *>(node.get()))
406 {
407 const ASTs & args = typeid_cast<const ASTExpressionList &>(*func->arguments).children;
408
409 if (args.size() != 2)
410 return false;
411
412 size_t key_arg_pos; /// Position of argument with key column (non-const argument)
413 size_t key_column_num = -1; /// Number of a key column (inside key_column_names array)
414 const auto & func_name = func->name;
415
416 if (functionIsInOrGlobalInOperator(func_name) && tryPrepareSetBloomFilter(args, out))
417 {
418 key_arg_pos = 0;
419 }
420 else if (KeyCondition::getConstant(args[1], block_with_constants, const_value, const_type) && getKey(args[0], key_column_num))
421 {
422 key_arg_pos = 0;
423 }
424 else if (KeyCondition::getConstant(args[0], block_with_constants, const_value, const_type) && getKey(args[1], key_column_num))
425 {
426 key_arg_pos = 1;
427 }
428 else
429 return false;
430
431 if (const_type && const_type->getTypeId() != TypeIndex::String
432 && const_type->getTypeId() != TypeIndex::FixedString
433 && const_type->getTypeId() != TypeIndex::Array)
434 return false;
435
436 if (key_arg_pos == 1 && (func_name != "equals" || func_name != "notEquals"))
437 return false;
438 else if (!index.token_extractor_func->supportLike() && (func_name == "like" || func_name == "notLike"))
439 return false;
440
441 const auto atom_it = atom_map.find(func_name);
442 if (atom_it == std::end(atom_map))
443 return false;
444
445 out.key_column = key_column_num;
446 return atom_it->second(out, const_value, index);
447 }
448 else if (KeyCondition::getConstant(node, block_with_constants, const_value, const_type))
449 {
450 /// Check constant like in KeyCondition
451 if (const_value.getType() == Field::Types::UInt64
452 || const_value.getType() == Field::Types::Int64
453 || const_value.getType() == Field::Types::Float64)
454 {
455 /// Zero in all types is represented in memory the same way as in UInt64.
456 out.function = const_value.get<UInt64>()
457 ? RPNElement::ALWAYS_TRUE
458 : RPNElement::ALWAYS_FALSE;
459
460 return true;
461 }
462 }
463
464 return false;
465}
466
467bool MergeTreeConditionFullText::tryPrepareSetBloomFilter(
468 const ASTs & args,
469 RPNElement & out)
470{
471 const ASTPtr & left_arg = args[0];
472 const ASTPtr & right_arg = args[1];
473
474 std::vector<KeyTuplePositionMapping> key_tuple_mapping;
475 DataTypes data_types;
476
477 const auto * left_arg_tuple = typeid_cast<const ASTFunction *>(left_arg.get());
478 if (left_arg_tuple && left_arg_tuple->name == "tuple")
479 {
480 const auto & tuple_elements = left_arg_tuple->arguments->children;
481 for (size_t i = 0; i < tuple_elements.size(); ++i)
482 {
483 size_t key = 0;
484 if (getKey(tuple_elements[i], key))
485 {
486 key_tuple_mapping.emplace_back(i, key);
487 data_types.push_back(index.data_types[key]);
488 }
489 }
490 }
491 else
492 {
493 size_t key = 0;
494 if (getKey(left_arg, key))
495 {
496 key_tuple_mapping.emplace_back(0, key);
497 data_types.push_back(index.data_types[key]);
498 }
499 }
500
501 if (key_tuple_mapping.empty())
502 return false;
503
504 PreparedSetKey set_key;
505 if (typeid_cast<const ASTSubquery *>(right_arg.get()) || typeid_cast<const ASTIdentifier *>(right_arg.get()))
506 set_key = PreparedSetKey::forSubquery(*right_arg);
507 else
508 set_key = PreparedSetKey::forLiteral(*right_arg, data_types);
509
510 auto set_it = prepared_sets.find(set_key);
511 if (set_it == prepared_sets.end())
512 return false;
513
514 const SetPtr & prepared_set = set_it->second;
515 if (!prepared_set->hasExplicitSetElements())
516 return false;
517
518 for (const auto & data_type : prepared_set->getDataTypes())
519 if (data_type->getTypeId() != TypeIndex::String && data_type->getTypeId() != TypeIndex::FixedString)
520 return false;
521
522 std::vector<std::vector<BloomFilter>> bloom_filters;
523 std::vector<size_t> key_position;
524
525 Columns columns = prepared_set->getSetElements();
526 for (size_t col = 0; col < key_tuple_mapping.size(); ++col)
527 {
528 bloom_filters.emplace_back();
529 key_position.push_back(key_tuple_mapping[col].key_index);
530
531 size_t tuple_idx = key_tuple_mapping[col].tuple_index;
532 const auto & column = columns[tuple_idx];
533 for (size_t row = 0; row < prepared_set->getTotalRowCount(); ++row)
534 {
535 bloom_filters.back().emplace_back(index.bloom_filter_size, index.bloom_filter_hashes, index.seed);
536 auto ref = column->getDataAt(row);
537 stringToBloomFilter(ref.data, ref.size, index.token_extractor_func, bloom_filters.back().back());
538 }
539 }
540
541 out.set_key_position = std::move(key_position);
542 out.set_bloom_filters = std::move(bloom_filters);
543
544 return true;
545}
546
547MergeTreeIndexGranulePtr MergeTreeIndexFullText::createIndexGranule() const
548{
549 return std::make_shared<MergeTreeIndexGranuleFullText>(*this);
550}
551
552MergeTreeIndexAggregatorPtr MergeTreeIndexFullText::createIndexAggregator() const
553{
554 return std::make_shared<MergeTreeIndexAggregatorFullText>(*this);
555}
556
557MergeTreeIndexConditionPtr MergeTreeIndexFullText::createIndexCondition(
558 const SelectQueryInfo & query, const Context & context) const
559{
560 return std::make_shared<MergeTreeConditionFullText>(query, context, *this);
561};
562
563bool MergeTreeIndexFullText::mayBenefitFromIndexForIn(const ASTPtr & node) const
564{
565 return std::find(std::cbegin(columns), std::cend(columns), node->getColumnName()) != std::cend(columns);
566}
567
568
569bool NgramTokenExtractor::next(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const
570{
571 *token_start = *pos;
572 *token_len = 0;
573 size_t code_points = 0;
574 for (; code_points < n && *token_start + *token_len < len; ++code_points)
575 {
576 size_t sz = UTF8::seqLength(static_cast<UInt8>(data[*token_start + *token_len]));
577 *token_len += sz;
578 }
579 *pos += UTF8::seqLength(static_cast<UInt8>(data[*pos]));
580 return code_points == n;
581}
582
583bool NgramTokenExtractor::nextLike(const String & str, size_t * pos, String & token) const
584{
585 token.clear();
586
587 size_t code_points = 0;
588 bool escaped = false;
589 for (size_t i = *pos; i < str.size();)
590 {
591 if (escaped && (str[i] == '%' || str[i] == '_' || str[i] == '\\'))
592 {
593 token += str[i];
594 ++code_points;
595 escaped = false;
596 ++i;
597 }
598 else if (!escaped && (str[i] == '%' || str[i] == '_'))
599 {
600 /// This token is too small, go to the next.
601 token.clear();
602 code_points = 0;
603 escaped = false;
604 *pos = ++i;
605 }
606 else if (!escaped && str[i] == '\\')
607 {
608 escaped = true;
609 ++i;
610 }
611 else
612 {
613 const size_t sz = UTF8::seqLength(static_cast<UInt8>(str[i]));
614 for (size_t j = 0; j < sz; ++j)
615 token += str[i + j];
616 i += sz;
617 ++code_points;
618 escaped = false;
619 }
620
621 if (code_points == n)
622 {
623 *pos += UTF8::seqLength(static_cast<UInt8>(str[*pos]));
624 return true;
625 }
626 }
627
628 return false;
629}
630
631bool SplitTokenExtractor::next(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const
632{
633 *token_start = *pos;
634 *token_len = 0;
635 while (*pos < len)
636 {
637 if (isASCII(data[*pos]) && !isAlphaNumericASCII(data[*pos]))
638 {
639 if (*token_len > 0)
640 return true;
641 *token_start = ++*pos;
642 }
643 else
644 {
645 const size_t sz = UTF8::seqLength(static_cast<UInt8>(data[*pos]));
646 *pos += sz;
647 *token_len += sz;
648 }
649 }
650 return *token_len > 0;
651}
652
653bool SplitTokenExtractor::nextLike(const String & str, size_t * pos, String & token) const
654{
655 token.clear();
656 bool bad_token = false; // % or _ before token
657 bool escaped = false;
658 while (*pos < str.size())
659 {
660 if (!escaped && (str[*pos] == '%' || str[*pos] == '_'))
661 {
662 token.clear();
663 bad_token = true;
664 ++*pos;
665 }
666 else if (!escaped && str[*pos] == '\\')
667 {
668 escaped = true;
669 ++*pos;
670 }
671 else if (isASCII(str[*pos]) && !isAlphaNumericASCII(str[*pos]))
672 {
673 if (!bad_token && !token.empty())
674 return true;
675
676 token.clear();
677 bad_token = false;
678 escaped = false;
679 ++*pos;
680 }
681 else
682 {
683 const size_t sz = UTF8::seqLength(static_cast<UInt8>(str[*pos]));
684 for (size_t j = 0; j < sz; ++j)
685 {
686 token += str[*pos];
687 ++*pos;
688 }
689 escaped = false;
690 }
691 }
692
693 return !bad_token && !token.empty();
694}
695
696
697std::unique_ptr<IMergeTreeIndex> bloomFilterIndexCreator(
698 const NamesAndTypesList & new_columns,
699 std::shared_ptr<ASTIndexDeclaration> node,
700 const Context & context)
701{
702 if (node->name.empty())
703 throw Exception("Index must have unique name", ErrorCodes::INCORRECT_QUERY);
704
705 ASTPtr expr_list = MergeTreeData::extractKeyExpressionList(node->expr->clone());
706
707 auto syntax = SyntaxAnalyzer(context, {}).analyze(
708 expr_list, new_columns);
709 auto index_expr = ExpressionAnalyzer(expr_list, syntax, context).getActions(false);
710
711 auto sample = ExpressionAnalyzer(expr_list, syntax, context)
712 .getActions(true)->getSampleBlock();
713
714 Names columns;
715 DataTypes data_types;
716
717 for (size_t i = 0; i < expr_list->children.size(); ++i)
718 {
719 const auto & column = sample.getByPosition(i);
720
721 columns.emplace_back(column.name);
722 data_types.emplace_back(column.type);
723
724 if (data_types.back()->getTypeId() != TypeIndex::String
725 && data_types.back()->getTypeId() != TypeIndex::FixedString)
726 throw Exception("Bloom filter index can be used only with `String` or `FixedString` column.", ErrorCodes::INCORRECT_QUERY);
727 }
728
729 boost::algorithm::to_lower(node->type->name);
730 if (node->type->name == NgramTokenExtractor::getName())
731 {
732 if (!node->type->arguments || node->type->arguments->children.size() != 4)
733 throw Exception("`ngrambf` index must have exactly 4 arguments.", ErrorCodes::INCORRECT_QUERY);
734
735 size_t n = typeid_cast<const ASTLiteral &>(
736 *node->type->arguments->children[0]).value.get<size_t>();
737 size_t bloom_filter_size = typeid_cast<const ASTLiteral &>(
738 *node->type->arguments->children[1]).value.get<size_t>();
739 size_t bloom_filter_hashes = typeid_cast<const ASTLiteral &>(
740 *node->type->arguments->children[2]).value.get<size_t>();
741 size_t seed = typeid_cast<const ASTLiteral &>(
742 *node->type->arguments->children[3]).value.get<size_t>();
743
744 auto tokenizer = std::make_unique<NgramTokenExtractor>(n);
745
746 return std::make_unique<MergeTreeIndexFullText>(
747 node->name, std::move(index_expr), columns, data_types, sample, node->granularity,
748 bloom_filter_size, bloom_filter_hashes, seed, std::move(tokenizer));
749 }
750 else if (node->type->name == SplitTokenExtractor::getName())
751 {
752 if (!node->type->arguments || node->type->arguments->children.size() != 3)
753 throw Exception("`tokenbf` index must have exactly 3 arguments.", ErrorCodes::INCORRECT_QUERY);
754
755 size_t bloom_filter_size = typeid_cast<const ASTLiteral &>(
756 *node->type->arguments->children[0]).value.get<size_t>();
757 size_t bloom_filter_hashes = typeid_cast<const ASTLiteral &>(
758 *node->type->arguments->children[1]).value.get<size_t>();
759 size_t seed = typeid_cast<const ASTLiteral &>(
760 *node->type->arguments->children[2]).value.get<size_t>();
761
762 auto tokenizer = std::make_unique<SplitTokenExtractor>();
763
764 return std::make_unique<MergeTreeIndexFullText>(
765 node->name, std::move(index_expr), columns, data_types, sample, node->granularity,
766 bloom_filter_size, bloom_filter_hashes, seed, std::move(tokenizer));
767 }
768 else
769 {
770 throw Exception("Unknown index type: " + backQuote(node->name), ErrorCodes::LOGICAL_ERROR);
771 }
772}
773
774}
775