1#include <Storages/MergeTree/MergeTreeWhereOptimizer.h>
2#include <Storages/MergeTree/MergeTreeData.h>
3#include <Storages/MergeTree/KeyCondition.h>
4#include <Interpreters/IdentifierSemantic.h>
5#include <Parsers/ASTSelectQuery.h>
6#include <Parsers/ASTFunction.h>
7#include <Parsers/ASTIdentifier.h>
8#include <Parsers/ASTLiteral.h>
9#include <Parsers/ASTExpressionList.h>
10#include <Parsers/ASTSubquery.h>
11#include <Parsers/formatAST.h>
12#include <Interpreters/misc.h>
13#include <Common/typeid_cast.h>
14#include <DataTypes/NestedUtils.h>
15#include <ext/map.h>
16
17
18namespace DB
19{
20
21namespace ErrorCodes
22{
23 extern const int LOGICAL_ERROR;
24}
25
26/// Conditions like "x = N" are considered good if abs(N) > threshold.
27/// This is used to assume that condition is likely to have good selectivity.
28static constexpr auto threshold = 2;
29
30
31MergeTreeWhereOptimizer::MergeTreeWhereOptimizer(
32 SelectQueryInfo & query_info,
33 const Context & context,
34 const MergeTreeData & data,
35 const Names & queried_columns_,
36 Logger * log_)
37 : table_columns{ext::map<std::unordered_set>(data.getColumns().getAllPhysical(),
38 [] (const NameAndTypePair & col) { return col.name; })},
39 queried_columns{queried_columns_},
40 block_with_constants{KeyCondition::getBlockWithConstants(query_info.query, query_info.syntax_analyzer_result, context)},
41 log{log_}
42{
43 if (!data.primary_key_columns.empty())
44 first_primary_key_column = data.primary_key_columns[0];
45
46 calculateColumnSizes(data, queried_columns);
47 determineArrayJoinedNames(query_info.query->as<ASTSelectQuery &>());
48 optimize(query_info.query->as<ASTSelectQuery &>());
49}
50
51
52void MergeTreeWhereOptimizer::calculateColumnSizes(const MergeTreeData & data, const Names & column_names)
53{
54 for (const auto & column_name : column_names)
55 {
56 UInt64 size = data.getColumnCompressedSize(column_name);
57 column_sizes[column_name] = size;
58 total_size_of_queried_columns += size;
59 }
60}
61
62
63static void collectIdentifiersNoSubqueries(const ASTPtr & ast, NameSet & set)
64{
65 if (auto opt_name = tryGetIdentifierName(ast))
66 return (void)set.insert(*opt_name);
67
68 if (ast->as<ASTSubquery>())
69 return;
70
71 for (const auto & child : ast->children)
72 collectIdentifiersNoSubqueries(child, set);
73}
74
75void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node) const
76{
77 if (const auto * func_and = node->as<ASTFunction>(); func_and && func_and->name == "and")
78 {
79 for (const auto & elem : func_and->arguments->children)
80 analyzeImpl(res, elem);
81 }
82 else
83 {
84 Condition cond;
85 cond.node = node;
86
87 collectIdentifiersNoSubqueries(node, cond.identifiers);
88
89 cond.viable =
90 /// Condition depend on some column. Constant expressions are not moved.
91 !cond.identifiers.empty()
92 && !cannotBeMoved(node)
93 /// Do not take into consideration the conditions consisting only of the first primary key column
94 && !hasPrimaryKeyAtoms(node)
95 /// Only table columns are considered. Not array joined columns. NOTE We're assuming that aliases was expanded.
96 && isSubsetOfTableColumns(cond.identifiers)
97 /// Do not move conditions involving all queried columns.
98 && cond.identifiers.size() < queried_columns.size();
99
100 if (cond.viable)
101 {
102 cond.columns_size = getIdentifiersColumnSize(cond.identifiers);
103 cond.good = isConditionGood(node);
104 }
105
106 res.emplace_back(std::move(cond));
107 }
108}
109
110/// Transform conjunctions chain in WHERE expression to Conditions list.
111MergeTreeWhereOptimizer::Conditions MergeTreeWhereOptimizer::analyze(const ASTPtr & expression) const
112{
113 Conditions res;
114 analyzeImpl(res, expression);
115 return res;
116}
117
118/// Transform Conditions list to WHERE or PREWHERE expression.
119ASTPtr MergeTreeWhereOptimizer::reconstruct(const Conditions & conditions) const
120{
121 if (conditions.empty())
122 return {};
123
124 if (conditions.size() == 1)
125 return conditions.front().node;
126
127 const auto function = std::make_shared<ASTFunction>();
128
129 function->name = "and";
130 function->arguments = std::make_shared<ASTExpressionList>();
131 function->children.push_back(function->arguments);
132
133 for (const auto & elem : conditions)
134 function->arguments->children.push_back(elem.node);
135
136 return function;
137}
138
139
140void MergeTreeWhereOptimizer::optimize(ASTSelectQuery & select) const
141{
142 if (!select.where() || select.prewhere())
143 return;
144
145 Conditions where_conditions = analyze(select.where());
146 Conditions prewhere_conditions;
147
148 UInt64 total_size_of_moved_conditions = 0;
149
150 /// Move condition and all other conditions depend on the same set of columns.
151 auto move_condition = [&](Conditions::iterator cond_it)
152 {
153 prewhere_conditions.splice(prewhere_conditions.end(), where_conditions, cond_it);
154 total_size_of_moved_conditions += cond_it->columns_size;
155
156 /// Move all other conditions that depend on the same set of columns.
157 for (auto jt = where_conditions.begin(); jt != where_conditions.end();)
158 {
159 if (jt->columns_size == cond_it->columns_size && jt->identifiers == cond_it->identifiers)
160 prewhere_conditions.splice(prewhere_conditions.end(), where_conditions, jt++);
161 else
162 ++jt;
163 }
164 };
165
166 /// Move conditions unless the ratio of total_size_of_moved_conditions to the total_size_of_queried_columns is less than some threshold.
167 while (!where_conditions.empty())
168 {
169 /// Move the best condition to PREWHERE if it is viable.
170
171 auto it = std::min_element(where_conditions.begin(), where_conditions.end());
172
173 if (!it->viable)
174 break;
175
176 /// 10% ratio is just a guess.
177 if (total_size_of_moved_conditions > 0 && (total_size_of_moved_conditions + it->columns_size) * 10 > total_size_of_queried_columns)
178 break;
179
180 move_condition(it);
181 }
182
183 /// Nothing was moved.
184 if (prewhere_conditions.empty())
185 return;
186
187 /// Rewrite the SELECT query.
188
189 select.setExpression(ASTSelectQuery::Expression::WHERE, reconstruct(where_conditions));
190 select.setExpression(ASTSelectQuery::Expression::PREWHERE, reconstruct(prewhere_conditions));
191
192 LOG_DEBUG(log, "MergeTreeWhereOptimizer: condition \"" << select.prewhere() << "\" moved to PREWHERE");
193}
194
195
196UInt64 MergeTreeWhereOptimizer::getIdentifiersColumnSize(const NameSet & identifiers) const
197{
198 UInt64 size = 0;
199
200 for (const auto & identifier : identifiers)
201 if (column_sizes.count(identifier))
202 size += column_sizes.at(identifier);
203
204 return size;
205}
206
207
208bool MergeTreeWhereOptimizer::isConditionGood(const ASTPtr & condition) const
209{
210 const auto * function = condition->as<ASTFunction>();
211 if (!function)
212 return false;
213
214 /** we are only considering conditions of form `equals(one, another)` or `one = another`,
215 * especially if either `one` or `another` is ASTIdentifier */
216 if (function->name != "equals")
217 return false;
218
219 auto left_arg = function->arguments->children.front().get();
220 auto right_arg = function->arguments->children.back().get();
221
222 /// try to ensure left_arg points to ASTIdentifier
223 if (!left_arg->as<ASTIdentifier>() && right_arg->as<ASTIdentifier>())
224 std::swap(left_arg, right_arg);
225
226 if (left_arg->as<ASTIdentifier>())
227 {
228 /// condition may be "good" if only right_arg is a constant and its value is outside the threshold
229 if (const auto * literal = right_arg->as<ASTLiteral>())
230 {
231 const auto & field = literal->value;
232 const auto type = field.getType();
233
234 /// check the value with respect to threshold
235 if (type == Field::Types::UInt64)
236 {
237 const auto value = field.get<UInt64>();
238 return value > threshold;
239 }
240 else if (type == Field::Types::Int64)
241 {
242 const auto value = field.get<Int64>();
243 return value < -threshold || threshold < value;
244 }
245 else if (type == Field::Types::Float64)
246 {
247 const auto value = field.get<Float64>();
248 return value < threshold || threshold < value;
249 }
250 }
251 }
252
253 return false;
254}
255
256
257bool MergeTreeWhereOptimizer::hasPrimaryKeyAtoms(const ASTPtr & ast) const
258{
259 if (const auto * func = ast->as<ASTFunction>())
260 {
261 const auto & args = func->arguments->children;
262
263 if ((func->name == "not" && 1 == args.size()) || func->name == "and" || func->name == "or")
264 {
265 for (const auto & arg : args)
266 if (hasPrimaryKeyAtoms(arg))
267 return true;
268
269 return false;
270 }
271 }
272
273 return isPrimaryKeyAtom(ast);
274}
275
276
277bool MergeTreeWhereOptimizer::isPrimaryKeyAtom(const ASTPtr & ast) const
278{
279 if (const auto * func = ast->as<ASTFunction>())
280 {
281 if (!KeyCondition::atom_map.count(func->name))
282 return false;
283
284 const auto & args = func->arguments->children;
285 if (args.size() != 2)
286 return false;
287
288 const auto & first_arg_name = args.front()->getColumnName();
289 const auto & second_arg_name = args.back()->getColumnName();
290
291 if ((first_primary_key_column == first_arg_name && isConstant(args[1]))
292 || (first_primary_key_column == second_arg_name && isConstant(args[0]))
293 || (first_primary_key_column == first_arg_name && functionIsInOrGlobalInOperator(func->name)))
294 return true;
295 }
296
297 return false;
298}
299
300
301bool MergeTreeWhereOptimizer::isConstant(const ASTPtr & expr) const
302{
303 const auto column_name = expr->getColumnName();
304
305 if (expr->as<ASTLiteral>()
306 || (block_with_constants.has(column_name) && isColumnConst(*block_with_constants.getByName(column_name).column)))
307 return true;
308
309 return false;
310}
311
312
313bool MergeTreeWhereOptimizer::isSubsetOfTableColumns(const NameSet & identifiers) const
314{
315 for (const auto & identifier : identifiers)
316 if (table_columns.count(identifier) == 0)
317 return false;
318
319 return true;
320}
321
322
323bool MergeTreeWhereOptimizer::cannotBeMoved(const ASTPtr & ptr) const
324{
325 if (const auto * function_ptr = ptr->as<ASTFunction>())
326 {
327 /// disallow arrayJoin expressions to be moved to PREWHERE for now
328 if ("arrayJoin" == function_ptr->name)
329 return true;
330
331 /// disallow GLOBAL IN, GLOBAL NOT IN
332 if ("globalIn" == function_ptr->name
333 || "globalNotIn" == function_ptr->name)
334 return true;
335
336 /// indexHint is a special function that it does not make sense to transfer to PREWHERE
337 if ("indexHint" == function_ptr->name)
338 return true;
339 }
340 else if (auto opt_name = IdentifierSemantic::getColumnName(ptr))
341 {
342 /// disallow moving result of ARRAY JOIN to PREWHERE
343 if (array_joined_names.count(*opt_name) ||
344 array_joined_names.count(Nested::extractTableName(*opt_name)))
345 return true;
346 }
347
348 for (const auto & child : ptr->children)
349 if (cannotBeMoved(child))
350 return true;
351
352 return false;
353}
354
355
356void MergeTreeWhereOptimizer::determineArrayJoinedNames(ASTSelectQuery & select)
357{
358 auto array_join_expression_list = select.array_join_expression_list();
359
360 /// much simplified code from ExpressionAnalyzer::getArrayJoinedColumns()
361 if (!array_join_expression_list)
362 return;
363
364 for (const auto & ast : array_join_expression_list->children)
365 array_joined_names.emplace(ast->getAliasOrColumnName());
366}
367
368}
369