1 | #include <Storages/MergeTree/MergeTreeWhereOptimizer.h> |
2 | #include <Storages/MergeTree/MergeTreeData.h> |
3 | #include <Storages/MergeTree/KeyCondition.h> |
4 | #include <Interpreters/IdentifierSemantic.h> |
5 | #include <Parsers/ASTSelectQuery.h> |
6 | #include <Parsers/ASTFunction.h> |
7 | #include <Parsers/ASTIdentifier.h> |
8 | #include <Parsers/ASTLiteral.h> |
9 | #include <Parsers/ASTExpressionList.h> |
10 | #include <Parsers/ASTSubquery.h> |
11 | #include <Parsers/formatAST.h> |
12 | #include <Interpreters/misc.h> |
13 | #include <Common/typeid_cast.h> |
14 | #include <DataTypes/NestedUtils.h> |
15 | #include <ext/map.h> |
16 | |
17 | |
18 | namespace DB |
19 | { |
20 | |
21 | namespace ErrorCodes |
22 | { |
23 | extern const int LOGICAL_ERROR; |
24 | } |
25 | |
26 | /// Conditions like "x = N" are considered good if abs(N) > threshold. |
27 | /// This is used to assume that condition is likely to have good selectivity. |
28 | static constexpr auto threshold = 2; |
29 | |
30 | |
31 | MergeTreeWhereOptimizer::MergeTreeWhereOptimizer( |
32 | SelectQueryInfo & query_info, |
33 | const Context & context, |
34 | const MergeTreeData & data, |
35 | const Names & queried_columns_, |
36 | Logger * log_) |
37 | : table_columns{ext::map<std::unordered_set>(data.getColumns().getAllPhysical(), |
38 | [] (const NameAndTypePair & col) { return col.name; })}, |
39 | queried_columns{queried_columns_}, |
40 | block_with_constants{KeyCondition::getBlockWithConstants(query_info.query, query_info.syntax_analyzer_result, context)}, |
41 | log{log_} |
42 | { |
43 | if (!data.primary_key_columns.empty()) |
44 | first_primary_key_column = data.primary_key_columns[0]; |
45 | |
46 | calculateColumnSizes(data, queried_columns); |
47 | determineArrayJoinedNames(query_info.query->as<ASTSelectQuery &>()); |
48 | optimize(query_info.query->as<ASTSelectQuery &>()); |
49 | } |
50 | |
51 | |
52 | void MergeTreeWhereOptimizer::calculateColumnSizes(const MergeTreeData & data, const Names & column_names) |
53 | { |
54 | for (const auto & column_name : column_names) |
55 | { |
56 | UInt64 size = data.getColumnCompressedSize(column_name); |
57 | column_sizes[column_name] = size; |
58 | total_size_of_queried_columns += size; |
59 | } |
60 | } |
61 | |
62 | |
63 | static void collectIdentifiersNoSubqueries(const ASTPtr & ast, NameSet & set) |
64 | { |
65 | if (auto opt_name = tryGetIdentifierName(ast)) |
66 | return (void)set.insert(*opt_name); |
67 | |
68 | if (ast->as<ASTSubquery>()) |
69 | return; |
70 | |
71 | for (const auto & child : ast->children) |
72 | collectIdentifiersNoSubqueries(child, set); |
73 | } |
74 | |
75 | void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node) const |
76 | { |
77 | if (const auto * func_and = node->as<ASTFunction>(); func_and && func_and->name == "and" ) |
78 | { |
79 | for (const auto & elem : func_and->arguments->children) |
80 | analyzeImpl(res, elem); |
81 | } |
82 | else |
83 | { |
84 | Condition cond; |
85 | cond.node = node; |
86 | |
87 | collectIdentifiersNoSubqueries(node, cond.identifiers); |
88 | |
89 | cond.viable = |
90 | /// Condition depend on some column. Constant expressions are not moved. |
91 | !cond.identifiers.empty() |
92 | && !cannotBeMoved(node) |
93 | /// Do not take into consideration the conditions consisting only of the first primary key column |
94 | && !hasPrimaryKeyAtoms(node) |
95 | /// Only table columns are considered. Not array joined columns. NOTE We're assuming that aliases was expanded. |
96 | && isSubsetOfTableColumns(cond.identifiers) |
97 | /// Do not move conditions involving all queried columns. |
98 | && cond.identifiers.size() < queried_columns.size(); |
99 | |
100 | if (cond.viable) |
101 | { |
102 | cond.columns_size = getIdentifiersColumnSize(cond.identifiers); |
103 | cond.good = isConditionGood(node); |
104 | } |
105 | |
106 | res.emplace_back(std::move(cond)); |
107 | } |
108 | } |
109 | |
110 | /// Transform conjunctions chain in WHERE expression to Conditions list. |
111 | MergeTreeWhereOptimizer::Conditions MergeTreeWhereOptimizer::analyze(const ASTPtr & expression) const |
112 | { |
113 | Conditions res; |
114 | analyzeImpl(res, expression); |
115 | return res; |
116 | } |
117 | |
118 | /// Transform Conditions list to WHERE or PREWHERE expression. |
119 | ASTPtr MergeTreeWhereOptimizer::reconstruct(const Conditions & conditions) const |
120 | { |
121 | if (conditions.empty()) |
122 | return {}; |
123 | |
124 | if (conditions.size() == 1) |
125 | return conditions.front().node; |
126 | |
127 | const auto function = std::make_shared<ASTFunction>(); |
128 | |
129 | function->name = "and" ; |
130 | function->arguments = std::make_shared<ASTExpressionList>(); |
131 | function->children.push_back(function->arguments); |
132 | |
133 | for (const auto & elem : conditions) |
134 | function->arguments->children.push_back(elem.node); |
135 | |
136 | return function; |
137 | } |
138 | |
139 | |
140 | void MergeTreeWhereOptimizer::optimize(ASTSelectQuery & select) const |
141 | { |
142 | if (!select.where() || select.prewhere()) |
143 | return; |
144 | |
145 | Conditions where_conditions = analyze(select.where()); |
146 | Conditions prewhere_conditions; |
147 | |
148 | UInt64 total_size_of_moved_conditions = 0; |
149 | |
150 | /// Move condition and all other conditions depend on the same set of columns. |
151 | auto move_condition = [&](Conditions::iterator cond_it) |
152 | { |
153 | prewhere_conditions.splice(prewhere_conditions.end(), where_conditions, cond_it); |
154 | total_size_of_moved_conditions += cond_it->columns_size; |
155 | |
156 | /// Move all other conditions that depend on the same set of columns. |
157 | for (auto jt = where_conditions.begin(); jt != where_conditions.end();) |
158 | { |
159 | if (jt->columns_size == cond_it->columns_size && jt->identifiers == cond_it->identifiers) |
160 | prewhere_conditions.splice(prewhere_conditions.end(), where_conditions, jt++); |
161 | else |
162 | ++jt; |
163 | } |
164 | }; |
165 | |
166 | /// Move conditions unless the ratio of total_size_of_moved_conditions to the total_size_of_queried_columns is less than some threshold. |
167 | while (!where_conditions.empty()) |
168 | { |
169 | /// Move the best condition to PREWHERE if it is viable. |
170 | |
171 | auto it = std::min_element(where_conditions.begin(), where_conditions.end()); |
172 | |
173 | if (!it->viable) |
174 | break; |
175 | |
176 | /// 10% ratio is just a guess. |
177 | if (total_size_of_moved_conditions > 0 && (total_size_of_moved_conditions + it->columns_size) * 10 > total_size_of_queried_columns) |
178 | break; |
179 | |
180 | move_condition(it); |
181 | } |
182 | |
183 | /// Nothing was moved. |
184 | if (prewhere_conditions.empty()) |
185 | return; |
186 | |
187 | /// Rewrite the SELECT query. |
188 | |
189 | select.setExpression(ASTSelectQuery::Expression::WHERE, reconstruct(where_conditions)); |
190 | select.setExpression(ASTSelectQuery::Expression::PREWHERE, reconstruct(prewhere_conditions)); |
191 | |
192 | LOG_DEBUG(log, "MergeTreeWhereOptimizer: condition \"" << select.prewhere() << "\" moved to PREWHERE" ); |
193 | } |
194 | |
195 | |
196 | UInt64 MergeTreeWhereOptimizer::getIdentifiersColumnSize(const NameSet & identifiers) const |
197 | { |
198 | UInt64 size = 0; |
199 | |
200 | for (const auto & identifier : identifiers) |
201 | if (column_sizes.count(identifier)) |
202 | size += column_sizes.at(identifier); |
203 | |
204 | return size; |
205 | } |
206 | |
207 | |
208 | bool MergeTreeWhereOptimizer::isConditionGood(const ASTPtr & condition) const |
209 | { |
210 | const auto * function = condition->as<ASTFunction>(); |
211 | if (!function) |
212 | return false; |
213 | |
214 | /** we are only considering conditions of form `equals(one, another)` or `one = another`, |
215 | * especially if either `one` or `another` is ASTIdentifier */ |
216 | if (function->name != "equals" ) |
217 | return false; |
218 | |
219 | auto left_arg = function->arguments->children.front().get(); |
220 | auto right_arg = function->arguments->children.back().get(); |
221 | |
222 | /// try to ensure left_arg points to ASTIdentifier |
223 | if (!left_arg->as<ASTIdentifier>() && right_arg->as<ASTIdentifier>()) |
224 | std::swap(left_arg, right_arg); |
225 | |
226 | if (left_arg->as<ASTIdentifier>()) |
227 | { |
228 | /// condition may be "good" if only right_arg is a constant and its value is outside the threshold |
229 | if (const auto * literal = right_arg->as<ASTLiteral>()) |
230 | { |
231 | const auto & field = literal->value; |
232 | const auto type = field.getType(); |
233 | |
234 | /// check the value with respect to threshold |
235 | if (type == Field::Types::UInt64) |
236 | { |
237 | const auto value = field.get<UInt64>(); |
238 | return value > threshold; |
239 | } |
240 | else if (type == Field::Types::Int64) |
241 | { |
242 | const auto value = field.get<Int64>(); |
243 | return value < -threshold || threshold < value; |
244 | } |
245 | else if (type == Field::Types::Float64) |
246 | { |
247 | const auto value = field.get<Float64>(); |
248 | return value < threshold || threshold < value; |
249 | } |
250 | } |
251 | } |
252 | |
253 | return false; |
254 | } |
255 | |
256 | |
257 | bool MergeTreeWhereOptimizer::hasPrimaryKeyAtoms(const ASTPtr & ast) const |
258 | { |
259 | if (const auto * func = ast->as<ASTFunction>()) |
260 | { |
261 | const auto & args = func->arguments->children; |
262 | |
263 | if ((func->name == "not" && 1 == args.size()) || func->name == "and" || func->name == "or" ) |
264 | { |
265 | for (const auto & arg : args) |
266 | if (hasPrimaryKeyAtoms(arg)) |
267 | return true; |
268 | |
269 | return false; |
270 | } |
271 | } |
272 | |
273 | return isPrimaryKeyAtom(ast); |
274 | } |
275 | |
276 | |
277 | bool MergeTreeWhereOptimizer::isPrimaryKeyAtom(const ASTPtr & ast) const |
278 | { |
279 | if (const auto * func = ast->as<ASTFunction>()) |
280 | { |
281 | if (!KeyCondition::atom_map.count(func->name)) |
282 | return false; |
283 | |
284 | const auto & args = func->arguments->children; |
285 | if (args.size() != 2) |
286 | return false; |
287 | |
288 | const auto & first_arg_name = args.front()->getColumnName(); |
289 | const auto & second_arg_name = args.back()->getColumnName(); |
290 | |
291 | if ((first_primary_key_column == first_arg_name && isConstant(args[1])) |
292 | || (first_primary_key_column == second_arg_name && isConstant(args[0])) |
293 | || (first_primary_key_column == first_arg_name && functionIsInOrGlobalInOperator(func->name))) |
294 | return true; |
295 | } |
296 | |
297 | return false; |
298 | } |
299 | |
300 | |
301 | bool MergeTreeWhereOptimizer::isConstant(const ASTPtr & expr) const |
302 | { |
303 | const auto column_name = expr->getColumnName(); |
304 | |
305 | if (expr->as<ASTLiteral>() |
306 | || (block_with_constants.has(column_name) && isColumnConst(*block_with_constants.getByName(column_name).column))) |
307 | return true; |
308 | |
309 | return false; |
310 | } |
311 | |
312 | |
313 | bool MergeTreeWhereOptimizer::isSubsetOfTableColumns(const NameSet & identifiers) const |
314 | { |
315 | for (const auto & identifier : identifiers) |
316 | if (table_columns.count(identifier) == 0) |
317 | return false; |
318 | |
319 | return true; |
320 | } |
321 | |
322 | |
323 | bool MergeTreeWhereOptimizer::cannotBeMoved(const ASTPtr & ptr) const |
324 | { |
325 | if (const auto * function_ptr = ptr->as<ASTFunction>()) |
326 | { |
327 | /// disallow arrayJoin expressions to be moved to PREWHERE for now |
328 | if ("arrayJoin" == function_ptr->name) |
329 | return true; |
330 | |
331 | /// disallow GLOBAL IN, GLOBAL NOT IN |
332 | if ("globalIn" == function_ptr->name |
333 | || "globalNotIn" == function_ptr->name) |
334 | return true; |
335 | |
336 | /// indexHint is a special function that it does not make sense to transfer to PREWHERE |
337 | if ("indexHint" == function_ptr->name) |
338 | return true; |
339 | } |
340 | else if (auto opt_name = IdentifierSemantic::getColumnName(ptr)) |
341 | { |
342 | /// disallow moving result of ARRAY JOIN to PREWHERE |
343 | if (array_joined_names.count(*opt_name) || |
344 | array_joined_names.count(Nested::extractTableName(*opt_name))) |
345 | return true; |
346 | } |
347 | |
348 | for (const auto & child : ptr->children) |
349 | if (cannotBeMoved(child)) |
350 | return true; |
351 | |
352 | return false; |
353 | } |
354 | |
355 | |
356 | void MergeTreeWhereOptimizer::determineArrayJoinedNames(ASTSelectQuery & select) |
357 | { |
358 | auto array_join_expression_list = select.array_join_expression_list(); |
359 | |
360 | /// much simplified code from ExpressionAnalyzer::getArrayJoinedColumns() |
361 | if (!array_join_expression_list) |
362 | return; |
363 | |
364 | for (const auto & ast : array_join_expression_list->children) |
365 | array_joined_names.emplace(ast->getAliasOrColumnName()); |
366 | } |
367 | |
368 | } |
369 | |