1#include <Storages/MergeTree/MergeTreeIndexConditionBloomFilter.h>
2#include <Interpreters/misc.h>
3#include <Interpreters/BloomFilterHash.h>
4#include <Common/HashTable/ClearableHashMap.h>
5#include <Storages/MergeTree/RPNBuilder.h>
6#include <Storages/MergeTree/MergeTreeIndexGranuleBloomFilter.h>
7#include <DataTypes/DataTypeArray.h>
8#include <DataTypes/DataTypeTuple.h>
9#include <Columns/ColumnConst.h>
10#include <ext/bit_cast.h>
11#include <Parsers/ASTSubquery.h>
12#include <Parsers/ASTIdentifier.h>
13#include <Columns/ColumnTuple.h>
14#include <Interpreters/castColumn.h>
15#include <Interpreters/convertFieldToType.h>
16
17
18namespace DB
19{
20
21namespace
22{
23
24PreparedSetKey getPreparedSetKey(const ASTPtr & node, const DataTypePtr & data_type)
25{
26 /// If the data type is tuple, let's try unbox once
27 if (node->as<ASTSubquery>() || node->as<ASTIdentifier>())
28 return PreparedSetKey::forSubquery(*node);
29
30 if (const auto * date_type_tuple = typeid_cast<const DataTypeTuple *>(&*data_type))
31 return PreparedSetKey::forLiteral(*node, date_type_tuple->getElements());
32
33 return PreparedSetKey::forLiteral(*node, DataTypes(1, data_type));
34}
35
36ColumnWithTypeAndName getPreparedSetInfo(const SetPtr & prepared_set)
37{
38 if (prepared_set->getDataTypes().size() == 1)
39 return {prepared_set->getSetElements()[0], prepared_set->getElementsTypes()[0], "dummy"};
40
41 return {ColumnTuple::create(prepared_set->getSetElements()), std::make_shared<DataTypeTuple>(prepared_set->getElementsTypes()), "dummy"};
42}
43
44bool maybeTrueOnBloomFilter(const IColumn * hash_column, const BloomFilterPtr & bloom_filter, size_t hash_functions)
45{
46 const auto const_column = typeid_cast<const ColumnConst *>(hash_column);
47 const auto non_const_column = typeid_cast<const ColumnUInt64 *>(hash_column);
48
49 if (!const_column && !non_const_column)
50 throw Exception("LOGICAL ERROR: hash column must be Const Column or UInt64 Column.", ErrorCodes::LOGICAL_ERROR);
51
52 if (const_column)
53 {
54 for (size_t index = 0; index < hash_functions; ++index)
55 if (!bloom_filter->findHashWithSeed(const_column->getValue<UInt64>(), BloomFilterHash::bf_hash_seed[index]))
56 return false;
57 return true;
58 }
59 else
60 {
61 bool missing_rows = true;
62 const ColumnUInt64::Container & data = non_const_column->getData();
63
64 for (size_t index = 0, size = data.size(); missing_rows && index < size; ++index)
65 {
66 bool match_row = true;
67 for (size_t hash_index = 0; match_row && hash_index < hash_functions; ++hash_index)
68 match_row = bloom_filter->findHashWithSeed(data[index], BloomFilterHash::bf_hash_seed[hash_index]);
69
70 missing_rows = !match_row;
71 }
72
73 return !missing_rows;
74 }
75}
76
77}
78
79MergeTreeIndexConditionBloomFilter::MergeTreeIndexConditionBloomFilter(
80 const SelectQueryInfo & info_, const Context & context_, const Block & header_, size_t hash_functions_)
81 : header(header_), context(context_), query_info(info_), hash_functions(hash_functions_)
82{
83 auto atomFromAST = [this](auto & node, auto &, auto & constants, auto & out) { return traverseAtomAST(node, constants, out); };
84 rpn = std::move(RPNBuilder<RPNElement>(info_, context, atomFromAST).extractRPN());
85}
86
87bool MergeTreeIndexConditionBloomFilter::alwaysUnknownOrTrue() const
88{
89 std::vector<bool> rpn_stack;
90
91 for (const auto & element : rpn)
92 {
93 if (element.function == RPNElement::FUNCTION_UNKNOWN
94 || element.function == RPNElement::ALWAYS_TRUE)
95 {
96 rpn_stack.push_back(true);
97 }
98 else if (element.function == RPNElement::FUNCTION_EQUALS
99 || element.function == RPNElement::FUNCTION_NOT_EQUALS
100 || element.function == RPNElement::FUNCTION_HAS
101 || element.function == RPNElement::FUNCTION_IN
102 || element.function == RPNElement::FUNCTION_NOT_IN
103 || element.function == RPNElement::ALWAYS_FALSE)
104 {
105 rpn_stack.push_back(false);
106 }
107 else if (element.function == RPNElement::FUNCTION_NOT)
108 {
109 // do nothing
110 }
111 else if (element.function == RPNElement::FUNCTION_AND)
112 {
113 auto arg1 = rpn_stack.back();
114 rpn_stack.pop_back();
115 auto arg2 = rpn_stack.back();
116 rpn_stack.back() = arg1 && arg2;
117 }
118 else if (element.function == RPNElement::FUNCTION_OR)
119 {
120 auto arg1 = rpn_stack.back();
121 rpn_stack.pop_back();
122 auto arg2 = rpn_stack.back();
123 rpn_stack.back() = arg1 || arg2;
124 }
125 else
126 throw Exception("Unexpected function type in KeyCondition::RPNElement", ErrorCodes::LOGICAL_ERROR);
127 }
128
129 return rpn_stack[0];
130}
131
132bool MergeTreeIndexConditionBloomFilter::mayBeTrueOnGranule(const MergeTreeIndexGranuleBloomFilter * granule) const
133{
134 std::vector<BoolMask> rpn_stack;
135 const auto & filters = granule->getFilters();
136
137 for (const auto & element : rpn)
138 {
139 if (element.function == RPNElement::FUNCTION_UNKNOWN)
140 {
141 rpn_stack.emplace_back(true, true);
142 }
143 else if (element.function == RPNElement::FUNCTION_IN
144 || element.function == RPNElement::FUNCTION_NOT_IN
145 || element.function == RPNElement::FUNCTION_EQUALS
146 || element.function == RPNElement::FUNCTION_NOT_EQUALS
147 || element.function == RPNElement::FUNCTION_HAS)
148 {
149 bool match_rows = true;
150 const auto & predicate = element.predicate;
151 for (size_t index = 0; match_rows && index < predicate.size(); ++index)
152 {
153 const auto & query_index_hash = predicate[index];
154 const auto & filter = filters[query_index_hash.first];
155 const ColumnPtr & hash_column = query_index_hash.second;
156 match_rows = maybeTrueOnBloomFilter(&*hash_column, filter, hash_functions);
157 }
158
159 rpn_stack.emplace_back(match_rows, !match_rows);
160 if (element.function == RPNElement::FUNCTION_NOT_EQUALS || element.function == RPNElement::FUNCTION_NOT_IN)
161 rpn_stack.back() = !rpn_stack.back();
162 }
163 else if (element.function == RPNElement::FUNCTION_NOT)
164 {
165 rpn_stack.back() = !rpn_stack.back();
166 }
167 else if (element.function == RPNElement::FUNCTION_OR)
168 {
169 auto arg1 = rpn_stack.back();
170 rpn_stack.pop_back();
171 auto arg2 = rpn_stack.back();
172 rpn_stack.back() = arg1 | arg2;
173 }
174 else if (element.function == RPNElement::FUNCTION_AND)
175 {
176 auto arg1 = rpn_stack.back();
177 rpn_stack.pop_back();
178 auto arg2 = rpn_stack.back();
179 rpn_stack.back() = arg1 & arg2;
180 }
181 else if (element.function == RPNElement::ALWAYS_TRUE)
182 {
183 rpn_stack.emplace_back(true, false);
184 }
185 else if (element.function == RPNElement::ALWAYS_FALSE)
186 {
187 rpn_stack.emplace_back(false, true);
188 }
189 else
190 throw Exception("Unexpected function type in KeyCondition::RPNElement", ErrorCodes::LOGICAL_ERROR);
191 }
192
193 if (rpn_stack.size() != 1)
194 throw Exception("Unexpected stack size in KeyCondition::mayBeTrueInRange", ErrorCodes::LOGICAL_ERROR);
195
196 return rpn_stack[0].can_be_true;
197}
198
199bool MergeTreeIndexConditionBloomFilter::traverseAtomAST(const ASTPtr & node, Block & block_with_constants, RPNElement & out)
200{
201 {
202 Field const_value;
203 DataTypePtr const_type;
204 if (KeyCondition::getConstant(node, block_with_constants, const_value, const_type))
205 {
206 if (const_value.getType() == Field::Types::UInt64 || const_value.getType() == Field::Types::Int64 ||
207 const_value.getType() == Field::Types::Float64)
208 {
209 /// Zero in all types is represented in memory the same way as in UInt64.
210 out.function = const_value.get<UInt64>() ? RPNElement::ALWAYS_TRUE : RPNElement::ALWAYS_FALSE;
211 return true;
212 }
213 }
214 }
215
216 if (const auto * function = node->as<ASTFunction>())
217 {
218 const ASTs & arguments = function->arguments->children;
219
220 if (arguments.size() != 2)
221 return false;
222
223 if (functionIsInOrGlobalInOperator(function->name))
224 {
225 if (const auto & prepared_set = getPreparedSet(arguments[1]))
226 return traverseASTIn(function->name, arguments[0], prepared_set, out);
227 }
228 else if (function->name == "equals" || function->name == "notEquals" || function->name == "has")
229 {
230 Field const_value;
231 DataTypePtr const_type;
232 if (KeyCondition::getConstant(arguments[1], block_with_constants, const_value, const_type))
233 return traverseASTEquals(function->name, arguments[0], const_type, const_value, out);
234 else if (KeyCondition::getConstant(arguments[0], block_with_constants, const_value, const_type))
235 return traverseASTEquals(function->name, arguments[1], const_type, const_value, out);
236 }
237 }
238
239 return false;
240}
241
242bool MergeTreeIndexConditionBloomFilter::traverseASTIn(
243 const String & function_name, const ASTPtr & key_ast, const SetPtr & prepared_set, RPNElement & out)
244{
245 const auto prepared_info = getPreparedSetInfo(prepared_set);
246 return traverseASTIn(function_name, key_ast, prepared_info.type, prepared_info.column, out);
247}
248
249bool MergeTreeIndexConditionBloomFilter::traverseASTIn(
250 const String & function_name, const ASTPtr & key_ast, const DataTypePtr & type, const ColumnPtr & column, RPNElement & out)
251{
252 if (header.has(key_ast->getColumnName()))
253 {
254 size_t row_size = column->size();
255 size_t position = header.getPositionByName(key_ast->getColumnName());
256 const DataTypePtr & index_type = header.getByPosition(position).type;
257 const auto & converted_column = castColumn(ColumnWithTypeAndName{column, type, ""}, index_type, context);
258 out.predicate.emplace_back(std::make_pair(position, BloomFilterHash::hashWithColumn(index_type, converted_column, 0, row_size)));
259
260 if (function_name == "in" || function_name == "globalIn")
261 out.function = RPNElement::FUNCTION_IN;
262
263 if (function_name == "notIn" || function_name == "globalNotIn")
264 out.function = RPNElement::FUNCTION_NOT_IN;
265
266 return true;
267 }
268
269 if (const auto * function = key_ast->as<ASTFunction>())
270 {
271 WhichDataType which(type);
272
273 if (which.isTuple() && function->name == "tuple")
274 {
275 const auto & tuple_column = typeid_cast<const ColumnTuple *>(column.get());
276 const auto & tuple_data_type = typeid_cast<const DataTypeTuple *>(type.get());
277 const ASTs & arguments = typeid_cast<const ASTExpressionList &>(*function->arguments).children;
278
279 if (tuple_data_type->getElements().size() != arguments.size() || tuple_column->getColumns().size() != arguments.size())
280 throw Exception("Illegal types of arguments of function " + function_name, ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
281
282 bool match_with_subtype = false;
283 const auto & sub_columns = tuple_column->getColumns();
284 const auto & sub_data_types = tuple_data_type->getElements();
285
286 for (size_t index = 0; index < arguments.size(); ++index)
287 match_with_subtype |= traverseASTIn(function_name, arguments[index], sub_data_types[index], sub_columns[index], out);
288
289 return match_with_subtype;
290 }
291 }
292
293 return false;
294}
295
296bool MergeTreeIndexConditionBloomFilter::traverseASTEquals(
297 const String & function_name, const ASTPtr & key_ast, const DataTypePtr & value_type, const Field & value_field, RPNElement & out)
298{
299 if (header.has(key_ast->getColumnName()))
300 {
301 size_t position = header.getPositionByName(key_ast->getColumnName());
302 const DataTypePtr & index_type = header.getByPosition(position).type;
303 const auto * array_type = typeid_cast<const DataTypeArray *>(index_type.get());
304
305 if (function_name == "has")
306 {
307 out.function = RPNElement::FUNCTION_HAS;
308
309 if (!array_type)
310 throw Exception("First argument for function has must be an array.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
311
312 const DataTypePtr actual_type = BloomFilter::getPrimitiveType(array_type->getNestedType());
313 Field converted_field = convertFieldToType(value_field, *actual_type, value_type.get());
314 out.predicate.emplace_back(std::make_pair(position, BloomFilterHash::hashWithField(actual_type.get(), converted_field)));
315 }
316 else
317 {
318 if (array_type)
319 throw Exception("An array type of bloom_filter supports only has() function.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
320
321 out.function = function_name == "equals" ? RPNElement::FUNCTION_EQUALS : RPNElement::FUNCTION_NOT_EQUALS;
322 const DataTypePtr actual_type = BloomFilter::getPrimitiveType(index_type);
323 Field converted_field = convertFieldToType(value_field, *actual_type, value_type.get());
324 out.predicate.emplace_back(std::make_pair(position, BloomFilterHash::hashWithField(actual_type.get(), converted_field)));
325 }
326
327 return true;
328 }
329
330 if (const auto * function = key_ast->as<ASTFunction>())
331 {
332 WhichDataType which(value_type);
333
334 if (which.isTuple() && function->name == "tuple")
335 {
336 const Tuple & tuple = get<const Tuple &>(value_field);
337 const auto value_tuple_data_type = typeid_cast<const DataTypeTuple *>(value_type.get());
338 const ASTs & arguments = typeid_cast<const ASTExpressionList &>(*function->arguments).children;
339
340 if (tuple.size() != arguments.size())
341 throw Exception("Illegal types of arguments of function " + function_name, ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
342
343 bool match_with_subtype = false;
344 const DataTypes & subtypes = value_tuple_data_type->getElements();
345
346 for (size_t index = 0; index < tuple.size(); ++index)
347 match_with_subtype |= traverseASTEquals(function_name, arguments[index], subtypes[index], tuple[index], out);
348
349 return match_with_subtype;
350 }
351 }
352
353 return false;
354}
355
356SetPtr MergeTreeIndexConditionBloomFilter::getPreparedSet(const ASTPtr & node)
357{
358 if (header.has(node->getColumnName()))
359 {
360 const auto & column_and_type = header.getByName(node->getColumnName());
361 const auto & prepared_set_it = query_info.sets.find(getPreparedSetKey(node, column_and_type.type));
362
363 if (prepared_set_it != query_info.sets.end() && prepared_set_it->second->hasExplicitSetElements())
364 return prepared_set_it->second;
365 }
366 else
367 {
368 for (const auto & prepared_set_it : query_info.sets)
369 if (prepared_set_it.first.ast_hash == node->getTreeHash() && prepared_set_it.second->hasExplicitSetElements())
370 return prepared_set_it.second;
371 }
372
373 return DB::SetPtr();
374}
375
376}
377