1 | #include <Storages/MergeTree/MergeTreeIndexFullText.h> |
2 | |
3 | #include <Common/StringUtils/StringUtils.h> |
4 | #include <Common/UTF8Helpers.h> |
5 | #include <DataTypes/DataTypesNumber.h> |
6 | #include <IO/WriteHelpers.h> |
7 | #include <IO/ReadHelpers.h> |
8 | #include <Interpreters/ExpressionActions.h> |
9 | #include <Interpreters/ExpressionAnalyzer.h> |
10 | #include <Interpreters/SyntaxAnalyzer.h> |
11 | #include <Interpreters/misc.h> |
12 | #include <Storages/MergeTree/MergeTreeData.h> |
13 | #include <Storages/MergeTree/RPNBuilder.h> |
14 | #include <Parsers/ASTIdentifier.h> |
15 | #include <Parsers/ASTLiteral.h> |
16 | #include <Parsers/ASTSubquery.h> |
17 | |
18 | #include <Poco/Logger.h> |
19 | |
20 | #include <boost/algorithm/string.hpp> |
21 | |
22 | |
23 | namespace DB |
24 | { |
25 | |
26 | namespace ErrorCodes |
27 | { |
28 | extern const int INCORRECT_QUERY; |
29 | } |
30 | |
31 | |
32 | /// Adds all tokens from string to bloom filter. |
33 | static void ( |
34 | const char * data, size_t size, const std::unique_ptr<ITokenExtractor> & , BloomFilter & bloom_filter) |
35 | { |
36 | size_t cur = 0; |
37 | size_t token_start = 0; |
38 | size_t token_len = 0; |
39 | while (cur < size && token_extractor->next(data, size, &cur, &token_start, &token_len)) |
40 | bloom_filter.add(data + token_start, token_len); |
41 | } |
42 | |
43 | /// Adds all tokens from like pattern string to bloom filter. (Because like pattern can contain `\%` and `\_`.) |
44 | static void ( |
45 | const String & data, const std::unique_ptr<ITokenExtractor> & , BloomFilter & bloom_filter) |
46 | { |
47 | size_t cur = 0; |
48 | String token; |
49 | while (cur < data.size() && token_extractor->nextLike(data, &cur, token)) |
50 | bloom_filter.add(token.c_str(), token.size()); |
51 | } |
52 | /// Unified condition for equals, startsWith and endsWith |
53 | bool MergeTreeConditionFullText::createFunctionEqualsCondition(RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx) |
54 | { |
55 | out.function = RPNElement::FUNCTION_EQUALS; |
56 | out.bloom_filter = std::make_unique<BloomFilter>( |
57 | idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed); |
58 | |
59 | const auto & str = value.get<String>(); |
60 | stringToBloomFilter(str.c_str(), str.size(), idx.token_extractor_func, *out.bloom_filter); |
61 | return true; |
62 | } |
63 | |
64 | MergeTreeIndexGranuleFullText::MergeTreeIndexGranuleFullText(const MergeTreeIndexFullText & index_) |
65 | : IMergeTreeIndexGranule() |
66 | , index(index_) |
67 | , bloom_filters( |
68 | index.columns.size(), BloomFilter(index.bloom_filter_size, index.bloom_filter_hashes, index.seed)) |
69 | , has_elems(false) {} |
70 | |
71 | void MergeTreeIndexGranuleFullText::serializeBinary(WriteBuffer & ostr) const |
72 | { |
73 | if (empty()) |
74 | throw Exception("Attempt to write empty minmax index " + backQuote(index.name), ErrorCodes::LOGICAL_ERROR); |
75 | |
76 | for (const auto & bloom_filter : bloom_filters) |
77 | ostr.write(reinterpret_cast<const char *>(bloom_filter.getFilter().data()), index.bloom_filter_size); |
78 | } |
79 | |
80 | void MergeTreeIndexGranuleFullText::deserializeBinary(ReadBuffer & istr) |
81 | { |
82 | for (auto & bloom_filter : bloom_filters) |
83 | { |
84 | istr.read(reinterpret_cast<char *>(bloom_filter.getFilter().data()), index.bloom_filter_size); |
85 | } |
86 | has_elems = true; |
87 | } |
88 | |
89 | |
90 | MergeTreeIndexAggregatorFullText::MergeTreeIndexAggregatorFullText(const MergeTreeIndexFullText & index_) |
91 | : index(index_), granule(std::make_shared<MergeTreeIndexGranuleFullText>(index)) {} |
92 | |
93 | MergeTreeIndexGranulePtr MergeTreeIndexAggregatorFullText::getGranuleAndReset() |
94 | { |
95 | auto new_granule = std::make_shared<MergeTreeIndexGranuleFullText>(index); |
96 | new_granule.swap(granule); |
97 | return new_granule; |
98 | } |
99 | |
100 | void MergeTreeIndexAggregatorFullText::update(const Block & block, size_t * pos, size_t limit) |
101 | { |
102 | if (*pos >= block.rows()) |
103 | throw Exception( |
104 | "The provided position is not less than the number of block rows. Position: " |
105 | + toString(*pos) + ", Block rows: " + toString(block.rows()) + "." , ErrorCodes::LOGICAL_ERROR); |
106 | |
107 | size_t rows_read = std::min(limit, block.rows() - *pos); |
108 | |
109 | for (size_t col = 0; col < index.columns.size(); ++col) |
110 | { |
111 | const auto & column = block.getByName(index.columns[col]).column; |
112 | for (size_t i = 0; i < rows_read; ++i) |
113 | { |
114 | auto ref = column->getDataAt(*pos + i); |
115 | stringToBloomFilter(ref.data, ref.size, index.token_extractor_func, granule->bloom_filters[col]); |
116 | } |
117 | } |
118 | granule->has_elems = true; |
119 | *pos += rows_read; |
120 | } |
121 | |
122 | |
123 | const MergeTreeConditionFullText::AtomMap MergeTreeConditionFullText::atom_map |
124 | { |
125 | { |
126 | "notEquals" , |
127 | [] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx) |
128 | { |
129 | out.function = RPNElement::FUNCTION_NOT_EQUALS; |
130 | out.bloom_filter = std::make_unique<BloomFilter>( |
131 | idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed); |
132 | |
133 | const auto & str = value.get<String>(); |
134 | stringToBloomFilter(str.c_str(), str.size(), idx.token_extractor_func, *out.bloom_filter); |
135 | return true; |
136 | } |
137 | }, |
138 | { |
139 | "equals" , |
140 | [] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx) |
141 | { |
142 | return createFunctionEqualsCondition(out, value, idx); |
143 | } |
144 | }, |
145 | { |
146 | "like" , |
147 | [] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx) |
148 | { |
149 | out.function = RPNElement::FUNCTION_EQUALS; |
150 | out.bloom_filter = std::make_unique<BloomFilter>( |
151 | idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed); |
152 | |
153 | const auto & str = value.get<String>(); |
154 | likeStringToBloomFilter(str, idx.token_extractor_func, *out.bloom_filter); |
155 | return true; |
156 | } |
157 | }, |
158 | { |
159 | "notLike" , |
160 | [] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx) |
161 | { |
162 | out.function = RPNElement::FUNCTION_NOT_EQUALS; |
163 | out.bloom_filter = std::make_unique<BloomFilter>( |
164 | idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed); |
165 | |
166 | const auto & str = value.get<String>(); |
167 | likeStringToBloomFilter(str, idx.token_extractor_func, *out.bloom_filter); |
168 | return true; |
169 | } |
170 | }, |
171 | { |
172 | "hasToken" , |
173 | [] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx) |
174 | { |
175 | out.function = RPNElement::FUNCTION_EQUALS; |
176 | out.bloom_filter = std::make_unique<BloomFilter>( |
177 | idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed); |
178 | |
179 | const auto & str = value.get<String>(); |
180 | stringToBloomFilter(str.c_str(), str.size(), idx.token_extractor_func, *out.bloom_filter); |
181 | return true; |
182 | } |
183 | }, |
184 | { |
185 | "startsWith" , |
186 | [] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx) |
187 | { |
188 | return createFunctionEqualsCondition(out, value, idx); |
189 | } |
190 | }, |
191 | { |
192 | "endsWith" , |
193 | [] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx) |
194 | { |
195 | return createFunctionEqualsCondition(out, value, idx); |
196 | } |
197 | }, |
198 | { |
199 | "multiSearchAny" , |
200 | [] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx) |
201 | { |
202 | out.function = RPNElement::FUNCTION_MULTI_SEARCH; |
203 | |
204 | /// 2d vector is not needed here but is used because already exists for FUNCTION_IN |
205 | std::vector<std::vector<BloomFilter>> bloom_filters; |
206 | bloom_filters.emplace_back(); |
207 | for (const auto & element : value.get<Array>()) |
208 | { |
209 | if (element.getType() != Field::Types::String) |
210 | return false; |
211 | |
212 | bloom_filters.back().emplace_back(idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed); |
213 | const auto & str = element.get<String>(); |
214 | stringToBloomFilter(str.c_str(), str.size(), idx.token_extractor_func, bloom_filters.back().back()); |
215 | } |
216 | out.set_bloom_filters = std::move(bloom_filters); |
217 | return true; |
218 | } |
219 | }, |
220 | { |
221 | "notIn" , |
222 | [] (RPNElement & out, const Field &, const MergeTreeIndexFullText &) |
223 | { |
224 | out.function = RPNElement::FUNCTION_NOT_IN; |
225 | return true; |
226 | } |
227 | }, |
228 | { |
229 | "in" , |
230 | [] (RPNElement & out, const Field &, const MergeTreeIndexFullText &) |
231 | { |
232 | out.function = RPNElement::FUNCTION_IN; |
233 | return true; |
234 | } |
235 | }, |
236 | }; |
237 | |
238 | MergeTreeConditionFullText::MergeTreeConditionFullText( |
239 | const SelectQueryInfo & query_info, |
240 | const Context & context, |
241 | const MergeTreeIndexFullText & index_) : index(index_), prepared_sets(query_info.sets) |
242 | { |
243 | rpn = std::move( |
244 | RPNBuilder<RPNElement>( |
245 | query_info, context, |
246 | [this] (const ASTPtr & node, const Context & /* context */, Block & block_with_constants, RPNElement & out) -> bool |
247 | { |
248 | return this->atomFromAST(node, block_with_constants, out); |
249 | }).extractRPN()); |
250 | } |
251 | |
252 | bool MergeTreeConditionFullText::alwaysUnknownOrTrue() const |
253 | { |
254 | /// Check like in KeyCondition. |
255 | std::vector<bool> rpn_stack; |
256 | |
257 | for (const auto & element : rpn) |
258 | { |
259 | if (element.function == RPNElement::FUNCTION_UNKNOWN |
260 | || element.function == RPNElement::ALWAYS_TRUE) |
261 | { |
262 | rpn_stack.push_back(true); |
263 | } |
264 | else if (element.function == RPNElement::FUNCTION_EQUALS |
265 | || element.function == RPNElement::FUNCTION_NOT_EQUALS |
266 | || element.function == RPNElement::FUNCTION_IN |
267 | || element.function == RPNElement::FUNCTION_NOT_IN |
268 | || element.function == RPNElement::FUNCTION_MULTI_SEARCH |
269 | || element.function == RPNElement::ALWAYS_FALSE) |
270 | { |
271 | rpn_stack.push_back(false); |
272 | } |
273 | else if (element.function == RPNElement::FUNCTION_NOT) |
274 | { |
275 | // do nothing |
276 | } |
277 | else if (element.function == RPNElement::FUNCTION_AND) |
278 | { |
279 | auto arg1 = rpn_stack.back(); |
280 | rpn_stack.pop_back(); |
281 | auto arg2 = rpn_stack.back(); |
282 | rpn_stack.back() = arg1 && arg2; |
283 | } |
284 | else if (element.function == RPNElement::FUNCTION_OR) |
285 | { |
286 | auto arg1 = rpn_stack.back(); |
287 | rpn_stack.pop_back(); |
288 | auto arg2 = rpn_stack.back(); |
289 | rpn_stack.back() = arg1 || arg2; |
290 | } |
291 | else |
292 | throw Exception("Unexpected function type in KeyCondition::RPNElement" , ErrorCodes::LOGICAL_ERROR); |
293 | } |
294 | |
295 | return rpn_stack[0]; |
296 | } |
297 | |
298 | bool MergeTreeConditionFullText::mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx_granule) const |
299 | { |
300 | std::shared_ptr<MergeTreeIndexGranuleFullText> granule |
301 | = std::dynamic_pointer_cast<MergeTreeIndexGranuleFullText>(idx_granule); |
302 | if (!granule) |
303 | throw Exception( |
304 | "BloomFilter index condition got a granule with the wrong type." , ErrorCodes::LOGICAL_ERROR); |
305 | |
306 | /// Check like in KeyCondition. |
307 | std::vector<BoolMask> rpn_stack; |
308 | for (const auto & element : rpn) |
309 | { |
310 | if (element.function == RPNElement::FUNCTION_UNKNOWN) |
311 | { |
312 | rpn_stack.emplace_back(true, true); |
313 | } |
314 | else if (element.function == RPNElement::FUNCTION_EQUALS |
315 | || element.function == RPNElement::FUNCTION_NOT_EQUALS) |
316 | { |
317 | rpn_stack.emplace_back( |
318 | granule->bloom_filters[element.key_column].contains(*element.bloom_filter), true); |
319 | |
320 | if (element.function == RPNElement::FUNCTION_NOT_EQUALS) |
321 | rpn_stack.back() = !rpn_stack.back(); |
322 | } |
323 | else if (element.function == RPNElement::FUNCTION_IN |
324 | || element.function == RPNElement::FUNCTION_NOT_IN) |
325 | { |
326 | std::vector<bool> result(element.set_bloom_filters.back().size(), true); |
327 | |
328 | for (size_t column = 0; column < element.set_key_position.size(); ++column) |
329 | { |
330 | const size_t key_idx = element.set_key_position[column]; |
331 | |
332 | const auto & bloom_filters = element.set_bloom_filters[column]; |
333 | for (size_t row = 0; row < bloom_filters.size(); ++row) |
334 | result[row] = result[row] && granule->bloom_filters[key_idx].contains(bloom_filters[row]); |
335 | } |
336 | |
337 | rpn_stack.emplace_back( |
338 | std::find(std::cbegin(result), std::cend(result), true) != std::end(result), true); |
339 | if (element.function == RPNElement::FUNCTION_NOT_IN) |
340 | rpn_stack.back() = !rpn_stack.back(); |
341 | } |
342 | else if (element.function == RPNElement::FUNCTION_MULTI_SEARCH) |
343 | { |
344 | std::vector<bool> result(element.set_bloom_filters.back().size(), true); |
345 | |
346 | const auto & bloom_filters = element.set_bloom_filters[0]; |
347 | |
348 | for (size_t row = 0; row < bloom_filters.size(); ++row) |
349 | result[row] = result[row] && granule->bloom_filters[element.key_column].contains(bloom_filters[row]); |
350 | |
351 | rpn_stack.emplace_back( |
352 | std::find(std::cbegin(result), std::cend(result), true) != std::end(result), true); |
353 | } |
354 | else if (element.function == RPNElement::FUNCTION_NOT) |
355 | { |
356 | rpn_stack.back() = !rpn_stack.back(); |
357 | } |
358 | else if (element.function == RPNElement::FUNCTION_AND) |
359 | { |
360 | auto arg1 = rpn_stack.back(); |
361 | rpn_stack.pop_back(); |
362 | auto arg2 = rpn_stack.back(); |
363 | rpn_stack.back() = arg1 & arg2; |
364 | } |
365 | else if (element.function == RPNElement::FUNCTION_OR) |
366 | { |
367 | auto arg1 = rpn_stack.back(); |
368 | rpn_stack.pop_back(); |
369 | auto arg2 = rpn_stack.back(); |
370 | rpn_stack.back() = arg1 | arg2; |
371 | } |
372 | else if (element.function == RPNElement::ALWAYS_FALSE) |
373 | { |
374 | rpn_stack.emplace_back(false, true); |
375 | } |
376 | else if (element.function == RPNElement::ALWAYS_TRUE) |
377 | { |
378 | rpn_stack.emplace_back(true, false); |
379 | } |
380 | else |
381 | throw Exception("Unexpected function type in KeyCondition::RPNElement" , ErrorCodes::LOGICAL_ERROR); |
382 | } |
383 | |
384 | if (rpn_stack.size() != 1) |
385 | throw Exception("Unexpected stack size in KeyCondition::mayBeTrueInRange" , ErrorCodes::LOGICAL_ERROR); |
386 | |
387 | return rpn_stack[0].can_be_true; |
388 | } |
389 | |
390 | bool MergeTreeConditionFullText::getKey(const ASTPtr & node, size_t & key_column_num) |
391 | { |
392 | auto it = std::find(index.columns.begin(), index.columns.end(), node->getColumnName()); |
393 | if (it == index.columns.end()) |
394 | return false; |
395 | |
396 | key_column_num = static_cast<size_t>(it - index.columns.begin()); |
397 | return true; |
398 | } |
399 | |
400 | bool MergeTreeConditionFullText::atomFromAST( |
401 | const ASTPtr & node, Block & block_with_constants, RPNElement & out) |
402 | { |
403 | Field const_value; |
404 | DataTypePtr const_type; |
405 | if (const auto * func = typeid_cast<const ASTFunction *>(node.get())) |
406 | { |
407 | const ASTs & args = typeid_cast<const ASTExpressionList &>(*func->arguments).children; |
408 | |
409 | if (args.size() != 2) |
410 | return false; |
411 | |
412 | size_t key_arg_pos; /// Position of argument with key column (non-const argument) |
413 | size_t key_column_num = -1; /// Number of a key column (inside key_column_names array) |
414 | const auto & func_name = func->name; |
415 | |
416 | if (functionIsInOrGlobalInOperator(func_name) && tryPrepareSetBloomFilter(args, out)) |
417 | { |
418 | key_arg_pos = 0; |
419 | } |
420 | else if (KeyCondition::getConstant(args[1], block_with_constants, const_value, const_type) && getKey(args[0], key_column_num)) |
421 | { |
422 | key_arg_pos = 0; |
423 | } |
424 | else if (KeyCondition::getConstant(args[0], block_with_constants, const_value, const_type) && getKey(args[1], key_column_num)) |
425 | { |
426 | key_arg_pos = 1; |
427 | } |
428 | else |
429 | return false; |
430 | |
431 | if (const_type && const_type->getTypeId() != TypeIndex::String |
432 | && const_type->getTypeId() != TypeIndex::FixedString |
433 | && const_type->getTypeId() != TypeIndex::Array) |
434 | return false; |
435 | |
436 | if (key_arg_pos == 1 && (func_name != "equals" || func_name != "notEquals" )) |
437 | return false; |
438 | else if (!index.token_extractor_func->supportLike() && (func_name == "like" || func_name == "notLike" )) |
439 | return false; |
440 | |
441 | const auto atom_it = atom_map.find(func_name); |
442 | if (atom_it == std::end(atom_map)) |
443 | return false; |
444 | |
445 | out.key_column = key_column_num; |
446 | return atom_it->second(out, const_value, index); |
447 | } |
448 | else if (KeyCondition::getConstant(node, block_with_constants, const_value, const_type)) |
449 | { |
450 | /// Check constant like in KeyCondition |
451 | if (const_value.getType() == Field::Types::UInt64 |
452 | || const_value.getType() == Field::Types::Int64 |
453 | || const_value.getType() == Field::Types::Float64) |
454 | { |
455 | /// Zero in all types is represented in memory the same way as in UInt64. |
456 | out.function = const_value.get<UInt64>() |
457 | ? RPNElement::ALWAYS_TRUE |
458 | : RPNElement::ALWAYS_FALSE; |
459 | |
460 | return true; |
461 | } |
462 | } |
463 | |
464 | return false; |
465 | } |
466 | |
467 | bool MergeTreeConditionFullText::tryPrepareSetBloomFilter( |
468 | const ASTs & args, |
469 | RPNElement & out) |
470 | { |
471 | const ASTPtr & left_arg = args[0]; |
472 | const ASTPtr & right_arg = args[1]; |
473 | |
474 | std::vector<KeyTuplePositionMapping> key_tuple_mapping; |
475 | DataTypes data_types; |
476 | |
477 | const auto * left_arg_tuple = typeid_cast<const ASTFunction *>(left_arg.get()); |
478 | if (left_arg_tuple && left_arg_tuple->name == "tuple" ) |
479 | { |
480 | const auto & tuple_elements = left_arg_tuple->arguments->children; |
481 | for (size_t i = 0; i < tuple_elements.size(); ++i) |
482 | { |
483 | size_t key = 0; |
484 | if (getKey(tuple_elements[i], key)) |
485 | { |
486 | key_tuple_mapping.emplace_back(i, key); |
487 | data_types.push_back(index.data_types[key]); |
488 | } |
489 | } |
490 | } |
491 | else |
492 | { |
493 | size_t key = 0; |
494 | if (getKey(left_arg, key)) |
495 | { |
496 | key_tuple_mapping.emplace_back(0, key); |
497 | data_types.push_back(index.data_types[key]); |
498 | } |
499 | } |
500 | |
501 | if (key_tuple_mapping.empty()) |
502 | return false; |
503 | |
504 | PreparedSetKey set_key; |
505 | if (typeid_cast<const ASTSubquery *>(right_arg.get()) || typeid_cast<const ASTIdentifier *>(right_arg.get())) |
506 | set_key = PreparedSetKey::forSubquery(*right_arg); |
507 | else |
508 | set_key = PreparedSetKey::forLiteral(*right_arg, data_types); |
509 | |
510 | auto set_it = prepared_sets.find(set_key); |
511 | if (set_it == prepared_sets.end()) |
512 | return false; |
513 | |
514 | const SetPtr & prepared_set = set_it->second; |
515 | if (!prepared_set->hasExplicitSetElements()) |
516 | return false; |
517 | |
518 | for (const auto & data_type : prepared_set->getDataTypes()) |
519 | if (data_type->getTypeId() != TypeIndex::String && data_type->getTypeId() != TypeIndex::FixedString) |
520 | return false; |
521 | |
522 | std::vector<std::vector<BloomFilter>> bloom_filters; |
523 | std::vector<size_t> key_position; |
524 | |
525 | Columns columns = prepared_set->getSetElements(); |
526 | for (size_t col = 0; col < key_tuple_mapping.size(); ++col) |
527 | { |
528 | bloom_filters.emplace_back(); |
529 | key_position.push_back(key_tuple_mapping[col].key_index); |
530 | |
531 | size_t tuple_idx = key_tuple_mapping[col].tuple_index; |
532 | const auto & column = columns[tuple_idx]; |
533 | for (size_t row = 0; row < prepared_set->getTotalRowCount(); ++row) |
534 | { |
535 | bloom_filters.back().emplace_back(index.bloom_filter_size, index.bloom_filter_hashes, index.seed); |
536 | auto ref = column->getDataAt(row); |
537 | stringToBloomFilter(ref.data, ref.size, index.token_extractor_func, bloom_filters.back().back()); |
538 | } |
539 | } |
540 | |
541 | out.set_key_position = std::move(key_position); |
542 | out.set_bloom_filters = std::move(bloom_filters); |
543 | |
544 | return true; |
545 | } |
546 | |
547 | MergeTreeIndexGranulePtr MergeTreeIndexFullText::createIndexGranule() const |
548 | { |
549 | return std::make_shared<MergeTreeIndexGranuleFullText>(*this); |
550 | } |
551 | |
552 | MergeTreeIndexAggregatorPtr MergeTreeIndexFullText::createIndexAggregator() const |
553 | { |
554 | return std::make_shared<MergeTreeIndexAggregatorFullText>(*this); |
555 | } |
556 | |
557 | MergeTreeIndexConditionPtr MergeTreeIndexFullText::createIndexCondition( |
558 | const SelectQueryInfo & query, const Context & context) const |
559 | { |
560 | return std::make_shared<MergeTreeConditionFullText>(query, context, *this); |
561 | }; |
562 | |
563 | bool MergeTreeIndexFullText::mayBenefitFromIndexForIn(const ASTPtr & node) const |
564 | { |
565 | return std::find(std::cbegin(columns), std::cend(columns), node->getColumnName()) != std::cend(columns); |
566 | } |
567 | |
568 | |
569 | bool NgramTokenExtractor::(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const |
570 | { |
571 | *token_start = *pos; |
572 | *token_len = 0; |
573 | size_t code_points = 0; |
574 | for (; code_points < n && *token_start + *token_len < len; ++code_points) |
575 | { |
576 | size_t sz = UTF8::seqLength(static_cast<UInt8>(data[*token_start + *token_len])); |
577 | *token_len += sz; |
578 | } |
579 | *pos += UTF8::seqLength(static_cast<UInt8>(data[*pos])); |
580 | return code_points == n; |
581 | } |
582 | |
583 | bool NgramTokenExtractor::(const String & str, size_t * pos, String & token) const |
584 | { |
585 | token.clear(); |
586 | |
587 | size_t code_points = 0; |
588 | bool escaped = false; |
589 | for (size_t i = *pos; i < str.size();) |
590 | { |
591 | if (escaped && (str[i] == '%' || str[i] == '_' || str[i] == '\\')) |
592 | { |
593 | token += str[i]; |
594 | ++code_points; |
595 | escaped = false; |
596 | ++i; |
597 | } |
598 | else if (!escaped && (str[i] == '%' || str[i] == '_')) |
599 | { |
600 | /// This token is too small, go to the next. |
601 | token.clear(); |
602 | code_points = 0; |
603 | escaped = false; |
604 | *pos = ++i; |
605 | } |
606 | else if (!escaped && str[i] == '\\') |
607 | { |
608 | escaped = true; |
609 | ++i; |
610 | } |
611 | else |
612 | { |
613 | const size_t sz = UTF8::seqLength(static_cast<UInt8>(str[i])); |
614 | for (size_t j = 0; j < sz; ++j) |
615 | token += str[i + j]; |
616 | i += sz; |
617 | ++code_points; |
618 | escaped = false; |
619 | } |
620 | |
621 | if (code_points == n) |
622 | { |
623 | *pos += UTF8::seqLength(static_cast<UInt8>(str[*pos])); |
624 | return true; |
625 | } |
626 | } |
627 | |
628 | return false; |
629 | } |
630 | |
631 | bool SplitTokenExtractor::(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const |
632 | { |
633 | *token_start = *pos; |
634 | *token_len = 0; |
635 | while (*pos < len) |
636 | { |
637 | if (isASCII(data[*pos]) && !isAlphaNumericASCII(data[*pos])) |
638 | { |
639 | if (*token_len > 0) |
640 | return true; |
641 | *token_start = ++*pos; |
642 | } |
643 | else |
644 | { |
645 | const size_t sz = UTF8::seqLength(static_cast<UInt8>(data[*pos])); |
646 | *pos += sz; |
647 | *token_len += sz; |
648 | } |
649 | } |
650 | return *token_len > 0; |
651 | } |
652 | |
653 | bool SplitTokenExtractor::(const String & str, size_t * pos, String & token) const |
654 | { |
655 | token.clear(); |
656 | bool bad_token = false; // % or _ before token |
657 | bool escaped = false; |
658 | while (*pos < str.size()) |
659 | { |
660 | if (!escaped && (str[*pos] == '%' || str[*pos] == '_')) |
661 | { |
662 | token.clear(); |
663 | bad_token = true; |
664 | ++*pos; |
665 | } |
666 | else if (!escaped && str[*pos] == '\\') |
667 | { |
668 | escaped = true; |
669 | ++*pos; |
670 | } |
671 | else if (isASCII(str[*pos]) && !isAlphaNumericASCII(str[*pos])) |
672 | { |
673 | if (!bad_token && !token.empty()) |
674 | return true; |
675 | |
676 | token.clear(); |
677 | bad_token = false; |
678 | escaped = false; |
679 | ++*pos; |
680 | } |
681 | else |
682 | { |
683 | const size_t sz = UTF8::seqLength(static_cast<UInt8>(str[*pos])); |
684 | for (size_t j = 0; j < sz; ++j) |
685 | { |
686 | token += str[*pos]; |
687 | ++*pos; |
688 | } |
689 | escaped = false; |
690 | } |
691 | } |
692 | |
693 | return !bad_token && !token.empty(); |
694 | } |
695 | |
696 | |
697 | std::unique_ptr<IMergeTreeIndex> bloomFilterIndexCreator( |
698 | const NamesAndTypesList & new_columns, |
699 | std::shared_ptr<ASTIndexDeclaration> node, |
700 | const Context & context) |
701 | { |
702 | if (node->name.empty()) |
703 | throw Exception("Index must have unique name" , ErrorCodes::INCORRECT_QUERY); |
704 | |
705 | ASTPtr expr_list = MergeTreeData::extractKeyExpressionList(node->expr->clone()); |
706 | |
707 | auto syntax = SyntaxAnalyzer(context, {}).analyze( |
708 | expr_list, new_columns); |
709 | auto index_expr = ExpressionAnalyzer(expr_list, syntax, context).getActions(false); |
710 | |
711 | auto sample = ExpressionAnalyzer(expr_list, syntax, context) |
712 | .getActions(true)->getSampleBlock(); |
713 | |
714 | Names columns; |
715 | DataTypes data_types; |
716 | |
717 | for (size_t i = 0; i < expr_list->children.size(); ++i) |
718 | { |
719 | const auto & column = sample.getByPosition(i); |
720 | |
721 | columns.emplace_back(column.name); |
722 | data_types.emplace_back(column.type); |
723 | |
724 | if (data_types.back()->getTypeId() != TypeIndex::String |
725 | && data_types.back()->getTypeId() != TypeIndex::FixedString) |
726 | throw Exception("Bloom filter index can be used only with `String` or `FixedString` column." , ErrorCodes::INCORRECT_QUERY); |
727 | } |
728 | |
729 | boost::algorithm::to_lower(node->type->name); |
730 | if (node->type->name == NgramTokenExtractor::getName()) |
731 | { |
732 | if (!node->type->arguments || node->type->arguments->children.size() != 4) |
733 | throw Exception("`ngrambf` index must have exactly 4 arguments." , ErrorCodes::INCORRECT_QUERY); |
734 | |
735 | size_t n = typeid_cast<const ASTLiteral &>( |
736 | *node->type->arguments->children[0]).value.get<size_t>(); |
737 | size_t bloom_filter_size = typeid_cast<const ASTLiteral &>( |
738 | *node->type->arguments->children[1]).value.get<size_t>(); |
739 | size_t bloom_filter_hashes = typeid_cast<const ASTLiteral &>( |
740 | *node->type->arguments->children[2]).value.get<size_t>(); |
741 | size_t seed = typeid_cast<const ASTLiteral &>( |
742 | *node->type->arguments->children[3]).value.get<size_t>(); |
743 | |
744 | auto tokenizer = std::make_unique<NgramTokenExtractor>(n); |
745 | |
746 | return std::make_unique<MergeTreeIndexFullText>( |
747 | node->name, std::move(index_expr), columns, data_types, sample, node->granularity, |
748 | bloom_filter_size, bloom_filter_hashes, seed, std::move(tokenizer)); |
749 | } |
750 | else if (node->type->name == SplitTokenExtractor::getName()) |
751 | { |
752 | if (!node->type->arguments || node->type->arguments->children.size() != 3) |
753 | throw Exception("`tokenbf` index must have exactly 3 arguments." , ErrorCodes::INCORRECT_QUERY); |
754 | |
755 | size_t bloom_filter_size = typeid_cast<const ASTLiteral &>( |
756 | *node->type->arguments->children[0]).value.get<size_t>(); |
757 | size_t bloom_filter_hashes = typeid_cast<const ASTLiteral &>( |
758 | *node->type->arguments->children[1]).value.get<size_t>(); |
759 | size_t seed = typeid_cast<const ASTLiteral &>( |
760 | *node->type->arguments->children[2]).value.get<size_t>(); |
761 | |
762 | auto tokenizer = std::make_unique<SplitTokenExtractor>(); |
763 | |
764 | return std::make_unique<MergeTreeIndexFullText>( |
765 | node->name, std::move(index_expr), columns, data_types, sample, node->granularity, |
766 | bloom_filter_size, bloom_filter_hashes, seed, std::move(tokenizer)); |
767 | } |
768 | else |
769 | { |
770 | throw Exception("Unknown index type: " + backQuote(node->name), ErrorCodes::LOGICAL_ERROR); |
771 | } |
772 | } |
773 | |
774 | } |
775 | |