1 | #include <optional> |
2 | |
3 | #include <Core/Field.h> |
4 | #include <Common/FieldVisitors.h> |
5 | #include <Core/Row.h> |
6 | |
7 | #include <Columns/ColumnsNumber.h> |
8 | #include <Columns/ColumnTuple.h> |
9 | |
10 | #include <Common/typeid_cast.h> |
11 | |
12 | #include <DataStreams/IBlockInputStream.h> |
13 | |
14 | #include <DataTypes/DataTypeTuple.h> |
15 | #include <DataTypes/DataTypeNullable.h> |
16 | |
17 | #include <Parsers/ASTExpressionList.h> |
18 | #include <Parsers/ASTFunction.h> |
19 | #include <Parsers/ASTLiteral.h> |
20 | |
21 | #include <Interpreters/Set.h> |
22 | #include <Interpreters/convertFieldToType.h> |
23 | #include <Interpreters/evaluateConstantExpression.h> |
24 | #include <Interpreters/NullableUtils.h> |
25 | #include <Interpreters/sortBlock.h> |
26 | |
27 | #include <Storages/MergeTree/KeyCondition.h> |
28 | |
29 | #include <ext/range.h> |
30 | #include <DataTypes/DataTypeLowCardinality.h> |
31 | |
32 | |
33 | namespace DB |
34 | { |
35 | |
36 | namespace ErrorCodes |
37 | { |
38 | extern const int LOGICAL_ERROR; |
39 | extern const int SET_SIZE_LIMIT_EXCEEDED; |
40 | extern const int TYPE_MISMATCH; |
41 | extern const int INCORRECT_ELEMENT_OF_SET; |
42 | extern const int NUMBER_OF_COLUMNS_DOESNT_MATCH; |
43 | } |
44 | |
45 | |
46 | template <typename Method> |
47 | void NO_INLINE Set::insertFromBlockImpl( |
48 | Method & method, |
49 | const ColumnRawPtrs & key_columns, |
50 | size_t rows, |
51 | SetVariants & variants, |
52 | ConstNullMapPtr null_map, |
53 | ColumnUInt8::Container * out_filter) |
54 | { |
55 | if (null_map) |
56 | { |
57 | if (out_filter) |
58 | insertFromBlockImplCase<Method, true, true>(method, key_columns, rows, variants, null_map, out_filter); |
59 | else |
60 | insertFromBlockImplCase<Method, true, false>(method, key_columns, rows, variants, null_map, out_filter); |
61 | } |
62 | else |
63 | { |
64 | if (out_filter) |
65 | insertFromBlockImplCase<Method, false, true>(method, key_columns, rows, variants, null_map, out_filter); |
66 | else |
67 | insertFromBlockImplCase<Method, false, false>(method, key_columns, rows, variants, null_map, out_filter); |
68 | } |
69 | } |
70 | |
71 | |
72 | template <typename Method, bool has_null_map, bool build_filter> |
73 | void NO_INLINE Set::insertFromBlockImplCase( |
74 | Method & method, |
75 | const ColumnRawPtrs & key_columns, |
76 | size_t rows, |
77 | SetVariants & variants, |
78 | [[maybe_unused]] ConstNullMapPtr null_map, |
79 | [[maybe_unused]] ColumnUInt8::Container * out_filter) |
80 | { |
81 | typename Method::State state(key_columns, key_sizes, nullptr); |
82 | |
83 | /// For all rows |
84 | for (size_t i = 0; i < rows; ++i) |
85 | { |
86 | if constexpr (has_null_map) |
87 | { |
88 | if ((*null_map)[i]) |
89 | { |
90 | if constexpr (build_filter) |
91 | { |
92 | (*out_filter)[i] = false; |
93 | } |
94 | continue; |
95 | } |
96 | } |
97 | |
98 | [[maybe_unused]] auto emplace_result = state.emplaceKey(method.data, i, variants.string_pool); |
99 | |
100 | if constexpr (build_filter) |
101 | (*out_filter)[i] = emplace_result.isInserted(); |
102 | } |
103 | } |
104 | |
105 | |
106 | void Set::(const Block & block) |
107 | { |
108 | std::unique_lock lock(rwlock); |
109 | |
110 | if (!empty()) |
111 | return; |
112 | |
113 | keys_size = block.columns(); |
114 | ColumnRawPtrs key_columns; |
115 | key_columns.reserve(keys_size); |
116 | data_types.reserve(keys_size); |
117 | set_elements_types.reserve(keys_size); |
118 | |
119 | /// The constant columns to the right of IN are not supported directly. For this, they first materialize. |
120 | Columns materialized_columns; |
121 | |
122 | /// Remember the columns we will work with |
123 | for (size_t i = 0; i < keys_size; ++i) |
124 | { |
125 | materialized_columns.emplace_back(block.safeGetByPosition(i).column->convertToFullColumnIfConst()); |
126 | key_columns.emplace_back(materialized_columns.back().get()); |
127 | data_types.emplace_back(block.safeGetByPosition(i).type); |
128 | set_elements_types.emplace_back(block.safeGetByPosition(i).type); |
129 | |
130 | /// Convert low cardinality column to full. |
131 | if (auto * low_cardinality_type = typeid_cast<const DataTypeLowCardinality *>(data_types.back().get())) |
132 | { |
133 | data_types.back() = low_cardinality_type->getDictionaryType(); |
134 | materialized_columns.emplace_back(key_columns.back()->convertToFullColumnIfLowCardinality()); |
135 | key_columns.back() = materialized_columns.back().get(); |
136 | } |
137 | } |
138 | |
139 | /// We will insert to the Set only keys, where all components are not NULL. |
140 | ConstNullMapPtr null_map{}; |
141 | ColumnPtr null_map_holder = extractNestedColumnsAndNullMap(key_columns, null_map); |
142 | |
143 | if (fill_set_elements) |
144 | { |
145 | /// Create empty columns with set values in advance. |
146 | /// It is needed because set may be empty, so method 'insertFromBlock' will be never called. |
147 | set_elements.reserve(keys_size); |
148 | for (const auto & type : set_elements_types) |
149 | set_elements.emplace_back(type->createColumn()); |
150 | } |
151 | |
152 | /// Choose data structure to use for the set. |
153 | data.init(data.chooseMethod(key_columns, key_sizes)); |
154 | } |
155 | |
156 | |
157 | bool Set::insertFromBlock(const Block & block) |
158 | { |
159 | std::unique_lock lock(rwlock); |
160 | |
161 | if (empty()) |
162 | throw Exception("Method Set::setHeader must be called before Set::insertFromBlock" , ErrorCodes::LOGICAL_ERROR); |
163 | |
164 | ColumnRawPtrs key_columns; |
165 | key_columns.reserve(keys_size); |
166 | |
167 | /// The constant columns to the right of IN are not supported directly. For this, they first materialize. |
168 | Columns materialized_columns; |
169 | |
170 | /// Remember the columns we will work with |
171 | for (size_t i = 0; i < keys_size; ++i) |
172 | { |
173 | materialized_columns.emplace_back(block.safeGetByPosition(i).column->convertToFullColumnIfConst()->convertToFullColumnIfLowCardinality()); |
174 | key_columns.emplace_back(materialized_columns.back().get()); |
175 | } |
176 | |
177 | size_t rows = block.rows(); |
178 | |
179 | /// We will insert to the Set only keys, where all components are not NULL. |
180 | ConstNullMapPtr null_map{}; |
181 | ColumnPtr null_map_holder = extractNestedColumnsAndNullMap(key_columns, null_map); |
182 | |
183 | /// Filter to extract distinct values from the block. |
184 | ColumnUInt8::MutablePtr filter; |
185 | if (fill_set_elements) |
186 | filter = ColumnUInt8::create(block.rows()); |
187 | |
188 | switch (data.type) |
189 | { |
190 | case SetVariants::Type::EMPTY: |
191 | break; |
192 | #define M(NAME) \ |
193 | case SetVariants::Type::NAME: \ |
194 | insertFromBlockImpl(*data.NAME, key_columns, rows, data, null_map, filter ? &filter->getData() : nullptr); \ |
195 | break; |
196 | APPLY_FOR_SET_VARIANTS(M) |
197 | #undef M |
198 | } |
199 | |
200 | if (fill_set_elements) |
201 | { |
202 | for (size_t i = 0; i < keys_size; ++i) |
203 | { |
204 | auto filtered_column = block.getByPosition(i).column->filter(filter->getData(), rows); |
205 | if (set_elements[i]->empty()) |
206 | set_elements[i] = filtered_column; |
207 | else |
208 | set_elements[i]->insertRangeFrom(*filtered_column, 0, filtered_column->size()); |
209 | } |
210 | } |
211 | |
212 | return limits.check(getTotalRowCount(), getTotalByteCount(), "IN-set" , ErrorCodes::SET_SIZE_LIMIT_EXCEEDED); |
213 | } |
214 | |
215 | |
216 | static Field (const ASTPtr & node, const IDataType & type, const Context & context) |
217 | { |
218 | if (const auto * lit = node->as<ASTLiteral>()) |
219 | { |
220 | return convertFieldToType(lit->value, type); |
221 | } |
222 | else if (node->as<ASTFunction>()) |
223 | { |
224 | std::pair<Field, DataTypePtr> value_raw = evaluateConstantExpression(node, context); |
225 | return convertFieldToType(value_raw.first, type, value_raw.second.get()); |
226 | } |
227 | else |
228 | throw Exception("Incorrect element of set. Must be literal or constant expression." , ErrorCodes::INCORRECT_ELEMENT_OF_SET); |
229 | } |
230 | |
231 | |
232 | void Set::createFromAST(const DataTypes & types, ASTPtr node, const Context & context) |
233 | { |
234 | /// Will form a block with values from the set. |
235 | |
236 | Block ; |
237 | size_t num_columns = types.size(); |
238 | for (size_t i = 0; i < num_columns; ++i) |
239 | header.insert(ColumnWithTypeAndName(types[i]->createColumn(), types[i], "_" + toString(i))); |
240 | setHeader(header); |
241 | |
242 | MutableColumns columns = header.cloneEmptyColumns(); |
243 | |
244 | DataTypePtr tuple_type; |
245 | Row tuple_values; |
246 | const auto & list = node->as<ASTExpressionList &>(); |
247 | for (auto & elem : list.children) |
248 | { |
249 | if (num_columns == 1) |
250 | { |
251 | Field value = extractValueFromNode(elem, *types[0], context); |
252 | |
253 | if (!value.isNull()) |
254 | columns[0]->insert(value); |
255 | } |
256 | else if (const auto * func = elem->as<ASTFunction>()) |
257 | { |
258 | Field function_result; |
259 | const Tuple * tuple = nullptr; |
260 | if (func->name != "tuple" ) |
261 | { |
262 | if (!tuple_type) |
263 | tuple_type = std::make_shared<DataTypeTuple>(types); |
264 | |
265 | function_result = extractValueFromNode(elem, *tuple_type, context); |
266 | if (function_result.getType() != Field::Types::Tuple) |
267 | throw Exception("Invalid type of set. Expected tuple, got " + String(function_result.getTypeName()), |
268 | ErrorCodes::INCORRECT_ELEMENT_OF_SET); |
269 | |
270 | tuple = &function_result.get<Tuple>(); |
271 | } |
272 | |
273 | size_t tuple_size = tuple ? tuple->size() : func->arguments->children.size(); |
274 | if (tuple_size != num_columns) |
275 | throw Exception("Incorrect size of tuple in set: " + toString(tuple_size) + " instead of " + toString(num_columns), |
276 | ErrorCodes::INCORRECT_ELEMENT_OF_SET); |
277 | |
278 | if (tuple_values.empty()) |
279 | tuple_values.resize(tuple_size); |
280 | |
281 | size_t i = 0; |
282 | for (; i < tuple_size; ++i) |
283 | { |
284 | Field value = tuple ? (*tuple)[i] |
285 | : extractValueFromNode(func->arguments->children[i], *types[i], context); |
286 | |
287 | /// If at least one of the elements of the tuple has an impossible (outside the range of the type) value, then the entire tuple too. |
288 | if (value.isNull()) |
289 | break; |
290 | |
291 | tuple_values[i] = value; |
292 | } |
293 | |
294 | if (i == tuple_size) |
295 | for (i = 0; i < tuple_size; ++i) |
296 | columns[i]->insert(tuple_values[i]); |
297 | } |
298 | else |
299 | throw Exception("Incorrect element of set" , ErrorCodes::INCORRECT_ELEMENT_OF_SET); |
300 | } |
301 | |
302 | Block block = header.cloneWithColumns(std::move(columns)); |
303 | insertFromBlock(block); |
304 | finishInsert(); |
305 | } |
306 | |
307 | |
308 | ColumnPtr Set::execute(const Block & block, bool negative) const |
309 | { |
310 | size_t num_key_columns = block.columns(); |
311 | |
312 | if (0 == num_key_columns) |
313 | throw Exception("Logical error: no columns passed to Set::execute method." , ErrorCodes::LOGICAL_ERROR); |
314 | |
315 | auto res = ColumnUInt8::create(); |
316 | ColumnUInt8::Container & vec_res = res->getData(); |
317 | vec_res.resize(block.safeGetByPosition(0).column->size()); |
318 | |
319 | if (vec_res.empty()) |
320 | return res; |
321 | |
322 | std::shared_lock lock(rwlock); |
323 | |
324 | /// If the set is empty. |
325 | if (data_types.empty()) |
326 | { |
327 | if (negative) |
328 | memset(vec_res.data(), 1, vec_res.size()); |
329 | else |
330 | memset(vec_res.data(), 0, vec_res.size()); |
331 | return res; |
332 | } |
333 | |
334 | checkColumnsNumber(num_key_columns); |
335 | |
336 | /// Remember the columns we will work with. Also check that the data types are correct. |
337 | ColumnRawPtrs key_columns; |
338 | key_columns.reserve(num_key_columns); |
339 | |
340 | /// The constant columns to the left of IN are not supported directly. For this, they first materialize. |
341 | Columns materialized_columns; |
342 | |
343 | for (size_t i = 0; i < num_key_columns; ++i) |
344 | { |
345 | checkTypesEqual(i, block.safeGetByPosition(i).type); |
346 | materialized_columns.emplace_back(block.safeGetByPosition(i).column->convertToFullColumnIfConst()); |
347 | key_columns.emplace_back() = materialized_columns.back().get(); |
348 | } |
349 | |
350 | /// We will check existence in Set only for keys, where all components are not NULL. |
351 | ConstNullMapPtr null_map{}; |
352 | ColumnPtr null_map_holder = extractNestedColumnsAndNullMap(key_columns, null_map); |
353 | |
354 | executeOrdinary(key_columns, vec_res, negative, null_map); |
355 | |
356 | return res; |
357 | } |
358 | |
359 | |
360 | template <typename Method> |
361 | void NO_INLINE Set::executeImpl( |
362 | Method & method, |
363 | const ColumnRawPtrs & key_columns, |
364 | ColumnUInt8::Container & vec_res, |
365 | bool negative, |
366 | size_t rows, |
367 | ConstNullMapPtr null_map) const |
368 | { |
369 | if (null_map) |
370 | executeImplCase<Method, true>(method, key_columns, vec_res, negative, rows, null_map); |
371 | else |
372 | executeImplCase<Method, false>(method, key_columns, vec_res, negative, rows, null_map); |
373 | } |
374 | |
375 | |
376 | template <typename Method, bool has_null_map> |
377 | void NO_INLINE Set::executeImplCase( |
378 | Method & method, |
379 | const ColumnRawPtrs & key_columns, |
380 | ColumnUInt8::Container & vec_res, |
381 | bool negative, |
382 | size_t rows, |
383 | ConstNullMapPtr null_map) const |
384 | { |
385 | Arena pool; |
386 | typename Method::State state(key_columns, key_sizes, nullptr); |
387 | |
388 | /// NOTE Optimization is not used for consecutive identical strings. |
389 | |
390 | /// For all rows |
391 | for (size_t i = 0; i < rows; ++i) |
392 | { |
393 | if (has_null_map && (*null_map)[i]) |
394 | vec_res[i] = negative; |
395 | else |
396 | { |
397 | auto find_result = state.findKey(method.data, i, pool); |
398 | vec_res[i] = negative ^ find_result.isFound(); |
399 | } |
400 | } |
401 | } |
402 | |
403 | |
404 | void Set::executeOrdinary( |
405 | const ColumnRawPtrs & key_columns, |
406 | ColumnUInt8::Container & vec_res, |
407 | bool negative, |
408 | ConstNullMapPtr null_map) const |
409 | { |
410 | size_t rows = key_columns[0]->size(); |
411 | |
412 | switch (data.type) |
413 | { |
414 | case SetVariants::Type::EMPTY: |
415 | break; |
416 | #define M(NAME) \ |
417 | case SetVariants::Type::NAME: \ |
418 | executeImpl(*data.NAME, key_columns, vec_res, negative, rows, null_map); \ |
419 | break; |
420 | APPLY_FOR_SET_VARIANTS(M) |
421 | #undef M |
422 | } |
423 | } |
424 | |
425 | void Set::checkColumnsNumber(size_t num_key_columns) const |
426 | { |
427 | if (data_types.size() != num_key_columns) |
428 | { |
429 | std::stringstream message; |
430 | message << "Number of columns in section IN doesn't match. " |
431 | << num_key_columns << " at left, " << data_types.size() << " at right." ; |
432 | throw Exception(message.str(), ErrorCodes::NUMBER_OF_COLUMNS_DOESNT_MATCH); |
433 | } |
434 | } |
435 | |
436 | void Set::checkTypesEqual(size_t set_type_idx, const DataTypePtr & other_type) const |
437 | { |
438 | if (!removeNullable(recursiveRemoveLowCardinality(data_types[set_type_idx]))->equals(*removeNullable(recursiveRemoveLowCardinality(other_type)))) |
439 | throw Exception("Types of column " + toString(set_type_idx + 1) + " in section IN don't match: " |
440 | + other_type->getName() + " on the left, " |
441 | + data_types[set_type_idx]->getName() + " on the right" , ErrorCodes::TYPE_MISMATCH); |
442 | } |
443 | |
444 | MergeTreeSetIndex::MergeTreeSetIndex(const Columns & set_elements, std::vector<KeyTuplePositionMapping> && index_mapping_) |
445 | : indexes_mapping(std::move(index_mapping_)) |
446 | { |
447 | std::sort(indexes_mapping.begin(), indexes_mapping.end(), |
448 | [](const KeyTuplePositionMapping & l, const KeyTuplePositionMapping & r) |
449 | { |
450 | return std::forward_as_tuple(l.key_index, l.tuple_index) < std::forward_as_tuple(r.key_index, r.tuple_index); |
451 | }); |
452 | |
453 | indexes_mapping.erase(std::unique( |
454 | indexes_mapping.begin(), indexes_mapping.end(), |
455 | [](const KeyTuplePositionMapping & l, const KeyTuplePositionMapping & r) |
456 | { |
457 | return l.key_index == r.key_index; |
458 | }), indexes_mapping.end()); |
459 | |
460 | size_t tuple_size = indexes_mapping.size(); |
461 | ordered_set.resize(tuple_size); |
462 | for (size_t i = 0; i < tuple_size; ++i) |
463 | ordered_set[i] = set_elements[indexes_mapping[i].tuple_index]; |
464 | |
465 | Block block_to_sort; |
466 | SortDescription sort_description; |
467 | for (size_t i = 0; i < tuple_size; ++i) |
468 | { |
469 | block_to_sort.insert({ ordered_set[i], nullptr, "" }); |
470 | sort_description.emplace_back(i, 1, 1); |
471 | } |
472 | |
473 | sortBlock(block_to_sort, sort_description); |
474 | |
475 | for (size_t i = 0; i < tuple_size; ++i) |
476 | ordered_set[i] = block_to_sort.getByPosition(i).column; |
477 | } |
478 | |
479 | |
480 | /** Return the BoolMask where: |
481 | * 1: the intersection of the set and the range is non-empty |
482 | * 2: the range contains elements not in the set |
483 | */ |
484 | BoolMask MergeTreeSetIndex::mayBeTrueInRange(const std::vector<Range> & key_ranges, const DataTypes & data_types) |
485 | { |
486 | size_t tuple_size = indexes_mapping.size(); |
487 | |
488 | using FieldWithInfinityTuple = std::vector<FieldWithInfinity>; |
489 | |
490 | FieldWithInfinityTuple left_point; |
491 | FieldWithInfinityTuple right_point; |
492 | left_point.reserve(tuple_size); |
493 | right_point.reserve(tuple_size); |
494 | |
495 | bool invert_left_infinities = false; |
496 | bool invert_right_infinities = false; |
497 | |
498 | for (size_t i = 0; i < tuple_size; ++i) |
499 | { |
500 | std::optional<Range> new_range = KeyCondition::applyMonotonicFunctionsChainToRange( |
501 | key_ranges[indexes_mapping[i].key_index], |
502 | indexes_mapping[i].functions, |
503 | data_types[indexes_mapping[i].key_index]); |
504 | |
505 | if (!new_range) |
506 | return {true, true}; |
507 | |
508 | /** A range that ends in (x, y, ..., +inf) exclusive is the same as a range |
509 | * that ends in (x, y, ..., -inf) inclusive and vice versa for the left bound. |
510 | */ |
511 | if (new_range->left_bounded) |
512 | { |
513 | if (!new_range->left_included) |
514 | invert_left_infinities = true; |
515 | |
516 | left_point.push_back(FieldWithInfinity(new_range->left)); |
517 | } |
518 | else |
519 | { |
520 | if (invert_left_infinities) |
521 | left_point.push_back(FieldWithInfinity::getPlusinfinity()); |
522 | else |
523 | left_point.push_back(FieldWithInfinity::getMinusInfinity()); |
524 | } |
525 | |
526 | if (new_range->right_bounded) |
527 | { |
528 | if (!new_range->right_included) |
529 | invert_right_infinities = true; |
530 | |
531 | right_point.push_back(FieldWithInfinity(new_range->right)); |
532 | } |
533 | else |
534 | { |
535 | if (invert_right_infinities) |
536 | right_point.push_back(FieldWithInfinity::getMinusInfinity()); |
537 | else |
538 | right_point.push_back(FieldWithInfinity::getPlusinfinity()); |
539 | } |
540 | } |
541 | |
542 | /// This allows to construct tuple in 'ordered_set' at specified index for comparison with range. |
543 | |
544 | auto indices = ext::range(0, ordered_set.at(0)->size()); |
545 | |
546 | auto = [tuple_size, this](size_t i) |
547 | { |
548 | /// Inefficient. |
549 | FieldWithInfinityTuple res; |
550 | res.reserve(tuple_size); |
551 | for (size_t j = 0; j < tuple_size; ++j) |
552 | res.emplace_back((*ordered_set[j])[i]); |
553 | return res; |
554 | }; |
555 | |
556 | auto compare = [&extract_tuple](size_t i, const FieldWithInfinityTuple & rhs) |
557 | { |
558 | return extract_tuple(i) < rhs; |
559 | }; |
560 | |
561 | /** Because each parallelogram maps to a contiguous sequence of elements |
562 | * layed out in the lexicographically increasing order, the set intersects the range |
563 | * if and only if either bound coincides with an element or at least one element |
564 | * is between the lower bounds |
565 | */ |
566 | auto left_lower = std::lower_bound(indices.begin(), indices.end(), left_point, compare); |
567 | auto right_lower = std::lower_bound(indices.begin(), indices.end(), right_point, compare); |
568 | |
569 | return |
570 | { |
571 | left_lower != right_lower |
572 | || (left_lower != indices.end() && extract_tuple(*left_lower) == left_point) |
573 | || (right_lower != indices.end() && extract_tuple(*right_lower) == right_point), |
574 | true |
575 | }; |
576 | } |
577 | |
578 | } |
579 | |