#include <optional>

#include <Core/Field.h>
#include <Common/FieldVisitors.h>
#include <Core/Row.h>

#include <Columns/ColumnsNumber.h>
#include <Columns/ColumnTuple.h>

#include <Common/typeid_cast.h>

#include <DataStreams/IBlockInputStream.h>

#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeNullable.h>

#include <Parsers/ASTExpressionList.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTLiteral.h>

#include <Interpreters/Set.h>
#include <Interpreters/convertFieldToType.h>
#include <Interpreters/evaluateConstantExpression.h>
#include <Interpreters/NullableUtils.h>
#include <Interpreters/sortBlock.h>

#include <Storages/MergeTree/KeyCondition.h>

#include <ext/range.h>
#include <DataTypes/DataTypeLowCardinality.h>


namespace DB
{

namespace ErrorCodes
{
    extern const int LOGICAL_ERROR;
    extern const int SET_SIZE_LIMIT_EXCEEDED;
    extern const int TYPE_MISMATCH;
    extern const int INCORRECT_ELEMENT_OF_SET;
    extern const int NUMBER_OF_COLUMNS_DOESNT_MATCH;
}

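/// Dispatch to a specialization of insertFromBlockImplCase with compile-time flags,
/// so that NULL handling and filter building do not add per-row branches to the hot loop.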
template <typename Method>
void NO_INLINE Set::insertFromBlockImpl(
    Method & method,
    const ColumnRawPtrs & key_columns,
    size_t rows,
    SetVariants & variants,
    ConstNullMapPtr null_map,
    ColumnUInt8::Container * out_filter)
{
    if (null_map)
    {
        if (out_filter)
            insertFromBlockImplCase<Method, true, true>(method, key_columns, rows, variants, null_map, out_filter);
        else
            insertFromBlockImplCase<Method, true, false>(method, key_columns, rows, variants, null_map, out_filter);
    }
    else
    {
        if (out_filter)
            insertFromBlockImplCase<Method, false, true>(method, key_columns, rows, variants, null_map, out_filter);
        else
            insertFromBlockImplCase<Method, false, false>(method, key_columns, rows, variants, null_map, out_filter);
    }
}

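/// Insert rows into the set. Rows whose key contains a NULL component are skipped.
/// If 'build_filter' is set, 'out_filter' marks the rows whose key was inserted for the first time.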
template <typename Method, bool has_null_map, bool build_filter>
void NO_INLINE Set::insertFromBlockImplCase(
    Method & method,
    const ColumnRawPtrs & key_columns,
    size_t rows,
    SetVariants & variants,
    [[maybe_unused]] ConstNullMapPtr null_map,
    [[maybe_unused]] ColumnUInt8::Container * out_filter)
{
    typename Method::State state(key_columns, key_sizes, nullptr);

    /// For all rows
    for (size_t i = 0; i < rows; ++i)
    {
        if constexpr (has_null_map)
        {
            if ((*null_map)[i])
            {
                if constexpr (build_filter)
                {
                    (*out_filter)[i] = false;
                }
                continue;
            }
        }

        [[maybe_unused]] auto emplace_result = state.emplaceKey(method.data, i, variants.string_pool);

        if constexpr (build_filter)
            (*out_filter)[i] = emplace_result.isInserted();
    }
}

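/// Remember the structure of the right-hand side of IN: key column types, set element types
/// and the hash table variant to use. Does nothing if the set already contains data.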
void Set::setHeader(const Block & block)
{
    std::unique_lock lock(rwlock);

    if (!empty())
        return;

    keys_size = block.columns();
    ColumnRawPtrs key_columns;
    key_columns.reserve(keys_size);
    data_types.reserve(keys_size);
    set_elements_types.reserve(keys_size);

    /// Constant columns to the right of IN are not supported directly, so they are materialized first.
    Columns materialized_columns;

    /// Remember the columns we will work with.
    for (size_t i = 0; i < keys_size; ++i)
    {
        materialized_columns.emplace_back(block.safeGetByPosition(i).column->convertToFullColumnIfConst());
        key_columns.emplace_back(materialized_columns.back().get());
        data_types.emplace_back(block.safeGetByPosition(i).type);
        set_elements_types.emplace_back(block.safeGetByPosition(i).type);

        /// Convert low cardinality column to full.
        if (auto * low_cardinality_type = typeid_cast<const DataTypeLowCardinality *>(data_types.back().get()))
        {
            data_types.back() = low_cardinality_type->getDictionaryType();
            materialized_columns.emplace_back(key_columns.back()->convertToFullColumnIfLowCardinality());
            key_columns.back() = materialized_columns.back().get();
        }
    }

    /// We will insert into the Set only the keys whose components are all non-NULL.
    ConstNullMapPtr null_map{};
    ColumnPtr null_map_holder = extractNestedColumnsAndNullMap(key_columns, null_map);

    if (fill_set_elements)
    {
        /// Create empty columns for the set values in advance.
        /// This is needed because the set may stay empty, in which case 'insertFromBlock' is never called.
        set_elements.reserve(keys_size);
        for (const auto & type : set_elements_types)
            set_elements.emplace_back(type->createColumn());
    }

    /// Choose the data structure to use for the set.
    data.init(data.chooseMethod(key_columns, key_sizes));
}

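/// Add another block of the right-hand side of IN to the set.
/// Returns false if a size limit was exceeded and the overflow mode says to stop filling the set
/// (otherwise the limit check throws).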
bool Set::insertFromBlock(const Block & block)
{
    std::unique_lock lock(rwlock);

    if (empty())
        throw Exception("Method Set::setHeader must be called before Set::insertFromBlock", ErrorCodes::LOGICAL_ERROR);

    ColumnRawPtrs key_columns;
    key_columns.reserve(keys_size);

    /// Constant columns to the right of IN are not supported directly, so they are materialized first.
    Columns materialized_columns;

    /// Remember the columns we will work with.
    for (size_t i = 0; i < keys_size; ++i)
    {
        materialized_columns.emplace_back(block.safeGetByPosition(i).column->convertToFullColumnIfConst()->convertToFullColumnIfLowCardinality());
        key_columns.emplace_back(materialized_columns.back().get());
    }

    size_t rows = block.rows();

    /// We will insert into the Set only the keys whose components are all non-NULL.
    ConstNullMapPtr null_map{};
    ColumnPtr null_map_holder = extractNestedColumnsAndNullMap(key_columns, null_map);

    /// Filter to extract distinct values from the block.
    ColumnUInt8::MutablePtr filter;
    if (fill_set_elements)
        filter = ColumnUInt8::create(block.rows());

    switch (data.type)
    {
        case SetVariants::Type::EMPTY:
            break;
#define M(NAME) \
        case SetVariants::Type::NAME: \
            insertFromBlockImpl(*data.NAME, key_columns, rows, data, null_map, filter ? &filter->getData() : nullptr); \
            break;
        APPLY_FOR_SET_VARIANTS(M)
#undef M
    }

    if (fill_set_elements)
    {
        for (size_t i = 0; i < keys_size; ++i)
        {
            auto filtered_column = block.getByPosition(i).column->filter(filter->getData(), rows);
            if (set_elements[i]->empty())
                set_elements[i] = filtered_column;
            else
                set_elements[i]->insertRangeFrom(*filtered_column, 0, filtered_column->size());
        }
    }

    return limits.check(getTotalRowCount(), getTotalByteCount(), "IN-set", ErrorCodes::SET_SIZE_LIMIT_EXCEEDED);
}

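/// Extract a single value from an AST node on the right-hand side of IN:
/// either a literal or a constant expression, converted to the expected type.
/// The result is Null when the value cannot be represented in that type.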
static Field extractValueFromNode(const ASTPtr & node, const IDataType & type, const Context & context)
{
    if (const auto * lit = node->as<ASTLiteral>())
    {
        return convertFieldToType(lit->value, type);
    }
    else if (node->as<ASTFunction>())
    {
        std::pair<Field, DataTypePtr> value_raw = evaluateConstantExpression(node, context);
        return convertFieldToType(value_raw.first, type, value_raw.second.get());
    }
    else
        throw Exception("Incorrect element of set. Must be literal or constant expression.", ErrorCodes::INCORRECT_ELEMENT_OF_SET);
}

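/// Build the set from an explicit list on the right-hand side of IN, e.g. (1, 2, 3) or ((1, 'a'), (2, 'b')).
/// Elements that cannot be represented in the key types (and tuples containing such elements) are skipped,
/// since they can never be equal to a value of the column.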
void Set::createFromAST(const DataTypes & types, ASTPtr node, const Context & context)
{
    /// Will form a block with values from the set.

    Block header;
    size_t num_columns = types.size();
    for (size_t i = 0; i < num_columns; ++i)
        header.insert(ColumnWithTypeAndName(types[i]->createColumn(), types[i], "_" + toString(i)));
    setHeader(header);

    MutableColumns columns = header.cloneEmptyColumns();

    DataTypePtr tuple_type;
    Row tuple_values;
    const auto & list = node->as<ASTExpressionList &>();
    for (auto & elem : list.children)
    {
        if (num_columns == 1)
        {
            Field value = extractValueFromNode(elem, *types[0], context);

            if (!value.isNull())
                columns[0]->insert(value);
        }
        else if (const auto * func = elem->as<ASTFunction>())
        {
            Field function_result;
            const Tuple * tuple = nullptr;
            if (func->name != "tuple")
            {
                if (!tuple_type)
                    tuple_type = std::make_shared<DataTypeTuple>(types);

                function_result = extractValueFromNode(elem, *tuple_type, context);
                if (function_result.getType() != Field::Types::Tuple)
                    throw Exception("Invalid type of set. Expected tuple, got " + String(function_result.getTypeName()),
                        ErrorCodes::INCORRECT_ELEMENT_OF_SET);

                tuple = &function_result.get<Tuple>();
            }

            size_t tuple_size = tuple ? tuple->size() : func->arguments->children.size();
            if (tuple_size != num_columns)
                throw Exception("Incorrect size of tuple in set: " + toString(tuple_size) + " instead of " + toString(num_columns),
                    ErrorCodes::INCORRECT_ELEMENT_OF_SET);

            if (tuple_values.empty())
                tuple_values.resize(tuple_size);

            size_t i = 0;
            for (; i < tuple_size; ++i)
            {
                Field value = tuple ? (*tuple)[i]
                    : extractValueFromNode(func->arguments->children[i], *types[i], context);

                /// If at least one element of the tuple has an impossible value (outside the range of its type),
                /// the entire tuple is skipped.
                if (value.isNull())
                    break;

                tuple_values[i] = value;
            }

            if (i == tuple_size)
                for (i = 0; i < tuple_size; ++i)
                    columns[i]->insert(tuple_values[i]);
        }
        else
            throw Exception("Incorrect element of set", ErrorCodes::INCORRECT_ELEMENT_OF_SET);
    }

    Block block = header.cloneWithColumns(std::move(columns));
    insertFromBlock(block);
    finishInsert();
}

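/// Check which rows of the block are present in the set.
/// Returns a ColumnUInt8 of the same size: 1 for rows found in the set, 0 otherwise,
/// inverted when 'negative' is set (used for NOT IN).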
ColumnPtr Set::execute(const Block & block, bool negative) const
{
    size_t num_key_columns = block.columns();

    if (0 == num_key_columns)
        throw Exception("Logical error: no columns passed to Set::execute method.", ErrorCodes::LOGICAL_ERROR);

    auto res = ColumnUInt8::create();
    ColumnUInt8::Container & vec_res = res->getData();
    vec_res.resize(block.safeGetByPosition(0).column->size());

    if (vec_res.empty())
        return res;

    std::shared_lock lock(rwlock);

    /// If the set is empty.
    if (data_types.empty())
    {
        if (negative)
            memset(vec_res.data(), 1, vec_res.size());
        else
            memset(vec_res.data(), 0, vec_res.size());
        return res;
    }

    checkColumnsNumber(num_key_columns);

    /// Remember the columns we will work with. Also check that the data types are correct.
    ColumnRawPtrs key_columns;
    key_columns.reserve(num_key_columns);

    /// Constant columns to the left of IN are not supported directly, so they are materialized first.
    Columns materialized_columns;

    for (size_t i = 0; i < num_key_columns; ++i)
    {
        checkTypesEqual(i, block.safeGetByPosition(i).type);
        materialized_columns.emplace_back(block.safeGetByPosition(i).column->convertToFullColumnIfConst());
        key_columns.emplace_back() = materialized_columns.back().get();
    }

    /// We will check existence in the Set only for keys whose components are all non-NULL.
    ConstNullMapPtr null_map{};
    ColumnPtr null_map_holder = extractNestedColumnsAndNullMap(key_columns, null_map);

    executeOrdinary(key_columns, vec_res, negative, null_map);

    return res;
}

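/// Dispatch to a specialization of executeImplCase with a compile-time flag for NULL handling.
/// A row with a NULL key component is never considered to be in the set, so its result is simply 'negative'.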
template <typename Method>
void NO_INLINE Set::executeImpl(
    Method & method,
    const ColumnRawPtrs & key_columns,
    ColumnUInt8::Container & vec_res,
    bool negative,
    size_t rows,
    ConstNullMapPtr null_map) const
{
    if (null_map)
        executeImplCase<Method, true>(method, key_columns, vec_res, negative, rows, null_map);
    else
        executeImplCase<Method, false>(method, key_columns, vec_res, negative, rows, null_map);
}


template <typename Method, bool has_null_map>
void NO_INLINE Set::executeImplCase(
    Method & method,
    const ColumnRawPtrs & key_columns,
    ColumnUInt8::Container & vec_res,
    bool negative,
    size_t rows,
    ConstNullMapPtr null_map) const
{
    Arena pool;
    typename Method::State state(key_columns, key_sizes, nullptr);

    /// NOTE Optimization is not used for consecutive identical strings.

    /// For all rows
    for (size_t i = 0; i < rows; ++i)
    {
        if (has_null_map && (*null_map)[i])
            vec_res[i] = negative;
        else
        {
            auto find_result = state.findKey(method.data, i, pool);
            vec_res[i] = negative ^ find_result.isFound();
        }
    }
}

void Set::executeOrdinary(
    const ColumnRawPtrs & key_columns,
    ColumnUInt8::Container & vec_res,
    bool negative,
    ConstNullMapPtr null_map) const
{
    size_t rows = key_columns[0]->size();

    switch (data.type)
    {
        case SetVariants::Type::EMPTY:
            break;
#define M(NAME) \
        case SetVariants::Type::NAME: \
            executeImpl(*data.NAME, key_columns, vec_res, negative, rows, null_map); \
            break;
        APPLY_FOR_SET_VARIANTS(M)
#undef M
    }
}

void Set::checkColumnsNumber(size_t num_key_columns) const
{
    if (data_types.size() != num_key_columns)
    {
        std::stringstream message;
        message << "Number of columns in section IN doesn't match. "
            << num_key_columns << " at left, " << data_types.size() << " at right.";
        throw Exception(message.str(), ErrorCodes::NUMBER_OF_COLUMNS_DOESNT_MATCH);
    }
}

void Set::checkTypesEqual(size_t set_type_idx, const DataTypePtr & other_type) const
{
    if (!removeNullable(recursiveRemoveLowCardinality(data_types[set_type_idx]))->equals(*removeNullable(recursiveRemoveLowCardinality(other_type))))
        throw Exception("Types of column " + toString(set_type_idx + 1) + " in section IN don't match: "
            + other_type->getName() + " on the left, "
            + data_types[set_type_idx]->getName() + " on the right", ErrorCodes::TYPE_MISMATCH);
}

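/// Reorder indexes_mapping by (key_index, tuple_index) and drop duplicate key columns,
/// then sort the corresponding set columns lexicographically so that mayBeTrueInRange
/// can compare key ranges against the set with binary search.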
MergeTreeSetIndex::MergeTreeSetIndex(const Columns & set_elements, std::vector<KeyTuplePositionMapping> && index_mapping_)
    : indexes_mapping(std::move(index_mapping_))
{
    std::sort(indexes_mapping.begin(), indexes_mapping.end(),
        [](const KeyTuplePositionMapping & l, const KeyTuplePositionMapping & r)
        {
            return std::forward_as_tuple(l.key_index, l.tuple_index) < std::forward_as_tuple(r.key_index, r.tuple_index);
        });

    indexes_mapping.erase(std::unique(
        indexes_mapping.begin(), indexes_mapping.end(),
        [](const KeyTuplePositionMapping & l, const KeyTuplePositionMapping & r)
        {
            return l.key_index == r.key_index;
        }), indexes_mapping.end());

    size_t tuple_size = indexes_mapping.size();
    ordered_set.resize(tuple_size);
    for (size_t i = 0; i < tuple_size; ++i)
        ordered_set[i] = set_elements[indexes_mapping[i].tuple_index];

    Block block_to_sort;
    SortDescription sort_description;
    for (size_t i = 0; i < tuple_size; ++i)
    {
        block_to_sort.insert({ ordered_set[i], nullptr, "" });
        sort_description.emplace_back(i, 1, 1);
    }

    sortBlock(block_to_sort, sort_description);

    for (size_t i = 0; i < tuple_size; ++i)
        ordered_set[i] = block_to_sort.getByPosition(i).column;
}

/** Return the BoolMask where:
  * 1: the intersection of the set and the range is non-empty
  * 2: the range contains elements not in the set
  */
BoolMask MergeTreeSetIndex::mayBeTrueInRange(const std::vector<Range> & key_ranges, const DataTypes & data_types)
{
    size_t tuple_size = indexes_mapping.size();

    using FieldWithInfinityTuple = std::vector<FieldWithInfinity>;

    FieldWithInfinityTuple left_point;
    FieldWithInfinityTuple right_point;
    left_point.reserve(tuple_size);
    right_point.reserve(tuple_size);

    bool invert_left_infinities = false;
    bool invert_right_infinities = false;

    for (size_t i = 0; i < tuple_size; ++i)
    {
        std::optional<Range> new_range = KeyCondition::applyMonotonicFunctionsChainToRange(
            key_ranges[indexes_mapping[i].key_index],
            indexes_mapping[i].functions,
            data_types[indexes_mapping[i].key_index]);

        if (!new_range)
            return {true, true};

        /** A range that ends in (x, y, ..., +inf) exclusive is the same as a range
          * that ends in (x, y, ..., -inf) inclusive, and vice versa for the left bound.
          */
        if (new_range->left_bounded)
        {
            if (!new_range->left_included)
                invert_left_infinities = true;

            left_point.push_back(FieldWithInfinity(new_range->left));
        }
        else
        {
            if (invert_left_infinities)
                left_point.push_back(FieldWithInfinity::getPlusinfinity());
            else
                left_point.push_back(FieldWithInfinity::getMinusInfinity());
        }

        if (new_range->right_bounded)
        {
            if (!new_range->right_included)
                invert_right_infinities = true;

            right_point.push_back(FieldWithInfinity(new_range->right));
        }
        else
        {
            if (invert_right_infinities)
                right_point.push_back(FieldWithInfinity::getMinusInfinity());
            else
                right_point.push_back(FieldWithInfinity::getPlusinfinity());
        }
    }

    /// These helpers construct the tuple stored in 'ordered_set' at the specified index, for comparison with a range bound.

    auto indices = ext::range(0, ordered_set.at(0)->size());

    auto extract_tuple = [tuple_size, this](size_t i)
    {
        /// Inefficient.
        FieldWithInfinityTuple res;
        res.reserve(tuple_size);
        for (size_t j = 0; j < tuple_size; ++j)
            res.emplace_back((*ordered_set[j])[i]);
        return res;
    };

    auto compare = [&extract_tuple](size_t i, const FieldWithInfinityTuple & rhs)
    {
        return extract_tuple(i) < rhs;
    };

    /** Because each parallelogram maps to a contiguous sequence of elements
      * laid out in lexicographically increasing order, the set intersects the range
      * if and only if either bound coincides with an element, or at least one element
      * lies between the lower bounds.
      */
    auto left_lower = std::lower_bound(indices.begin(), indices.end(), left_point, compare);
    auto right_lower = std::lower_bound(indices.begin(), indices.end(), right_point, compare);

    return
    {
        left_lower != right_lower
            || (left_lower != indices.end() && extract_tuple(*left_lower) == left_point)
            || (right_lower != indices.end() && extract_tuple(*right_lower) == right_point),
        true
    };
}

}