1 | #include <Core/Settings.h> |
2 | #include <Core/NamesAndTypes.h> |
3 | |
4 | #include <Interpreters/SyntaxAnalyzer.h> |
5 | #include <Interpreters/InJoinSubqueriesPreprocessor.h> |
6 | #include <Interpreters/LogicalExpressionsOptimizer.h> |
7 | #include <Interpreters/QueryAliasesVisitor.h> |
8 | #include <Interpreters/InterpreterSelectWithUnionQuery.h> |
9 | #include <Interpreters/ArrayJoinedColumnsVisitor.h> |
10 | #include <Interpreters/TranslateQualifiedNamesVisitor.h> |
11 | #include <Interpreters/Context.h> |
12 | #include <Interpreters/MarkTableIdentifiersVisitor.h> |
13 | #include <Interpreters/QueryNormalizer.h> |
14 | #include <Interpreters/ExecuteScalarSubqueriesVisitor.h> |
15 | #include <Interpreters/PredicateExpressionsOptimizer.h> |
16 | #include <Interpreters/CollectJoinOnKeysVisitor.h> |
17 | #include <Interpreters/ExternalDictionariesLoader.h> |
18 | #include <Interpreters/OptimizeIfWithConstantConditionVisitor.h> |
19 | #include <Interpreters/RequiredSourceColumnsVisitor.h> |
20 | #include <Interpreters/GetAggregatesVisitor.h> |
21 | #include <Interpreters/AnalyzedJoin.h> |
22 | #include <Interpreters/ExpressionActions.h> /// getSmallestColumn() |
23 | #include <Interpreters/getTableExpressions.h> |
24 | #include <Interpreters/OptimizeIfChains.h> |
25 | |
26 | #include <Parsers/ASTExpressionList.h> |
27 | #include <Parsers/ASTFunction.h> |
28 | #include <Parsers/ASTLiteral.h> |
29 | #include <Parsers/ASTOrderByElement.h> |
30 | #include <Parsers/ASTSelectQuery.h> |
31 | #include <Parsers/ASTTablesInSelectQuery.h> |
32 | #include <Parsers/ParserTablesInSelectQuery.h> |
33 | #include <Parsers/parseQuery.h> |
34 | #include <Parsers/queryToString.h> |
35 | |
36 | #include <DataTypes/NestedUtils.h> |
37 | #include <DataTypes/DataTypeNullable.h> |
38 | |
39 | #include <IO/WriteHelpers.h> |
40 | #include <Storages/IStorage.h> |
41 | |
42 | #include <functional> |
43 | |
44 | |
45 | namespace DB |
46 | { |
47 | |
48 | namespace ErrorCodes |
49 | { |
50 | extern const int EMPTY_NESTED_TABLE; |
51 | extern const int LOGICAL_ERROR; |
52 | extern const int INVALID_JOIN_ON_EXPRESSION; |
53 | extern const int EMPTY_LIST_OF_COLUMNS_QUERIED; |
54 | extern const int NOT_IMPLEMENTED; |
55 | extern const int UNKNOWN_IDENTIFIER; |
56 | extern const int EXPECTED_ALL_OR_ANY; |
57 | extern const int ALIAS_REQUIRED; |
58 | } |
59 | |
60 | namespace |
61 | { |
62 | |
63 | using LogAST = DebugASTLog<false>; /// set to true to enable logs |
64 | |
65 | /// Select implementation of countDistinct based on settings. |
66 | /// Important that it is done as query rewrite. It means rewritten query |
67 | /// will be sent to remote servers during distributed query execution, |
68 | /// and on all remote servers, function implementation will be same. |
69 | struct CustomizeFunctionsData |
70 | { |
71 | using TypeToVisit = ASTFunction; |
72 | |
73 | const String & count_distinct; |
74 | |
75 | void visit(ASTFunction & func, ASTPtr &) |
76 | { |
77 | if (Poco::toLower(func.name) == "countdistinct" ) |
78 | func.name = count_distinct; |
79 | } |
80 | }; |
81 | |
82 | using CustomizeFunctionsMatcher = OneTypeMatcher<CustomizeFunctionsData>; |
83 | using CustomizeFunctionsVisitor = InDepthNodeVisitor<CustomizeFunctionsMatcher, true>; |
84 | |
85 | |
86 | /// Add columns from storage to source_columns list. |
87 | void collectSourceColumns(const ColumnsDescription & columns, NamesAndTypesList & source_columns, bool add_virtuals) |
88 | { |
89 | auto physical_columns = columns.getAllPhysical(); |
90 | if (source_columns.empty()) |
91 | source_columns.swap(physical_columns); |
92 | else |
93 | source_columns.insert(source_columns.end(), physical_columns.begin(), physical_columns.end()); |
94 | |
95 | if (add_virtuals) |
96 | { |
97 | const auto & storage_aliases = columns.getAliases(); |
98 | const auto & storage_virtuals = columns.getVirtuals(); |
99 | source_columns.insert(source_columns.end(), storage_aliases.begin(), storage_aliases.end()); |
100 | source_columns.insert(source_columns.end(), storage_virtuals.begin(), storage_virtuals.end()); |
101 | } |
102 | } |
103 | |
104 | std::vector<TableWithColumnNames> getTablesWithColumns(const std::vector<const ASTTableExpression * > & table_expressions, |
105 | const Context & context) |
106 | { |
107 | std::vector<TableWithColumnNames> tables_with_columns = getDatabaseAndTablesWithColumnNames(table_expressions, context); |
108 | |
109 | auto & settings = context.getSettingsRef(); |
110 | if (settings.joined_subquery_requires_alias && tables_with_columns.size() > 1) |
111 | { |
112 | for (auto & pr : tables_with_columns) |
113 | if (pr.table.table.empty() && pr.table.alias.empty()) |
114 | throw Exception("Not unique subquery in FROM requires an alias (or joined_subquery_requires_alias=0 to disable restriction)." , |
115 | ErrorCodes::ALIAS_REQUIRED); |
116 | } |
117 | |
118 | return tables_with_columns; |
119 | } |
120 | |
121 | |
122 | /// Translate qualified names such as db.table.column, table.column, table_alias.column to names' normal form. |
123 | /// Expand asterisks and qualified asterisks with column names. |
124 | /// There would be columns in normal form & column aliases after translation. Column & column alias would be normalized in QueryNormalizer. |
125 | void translateQualifiedNames(ASTPtr & query, const ASTSelectQuery & select_query, const NameSet & source_columns_set, |
126 | std::vector<TableWithColumnNames> && tables_with_columns) |
127 | { |
128 | LogAST log; |
129 | TranslateQualifiedNamesVisitor::Data visitor_data(source_columns_set, std::move(tables_with_columns)); |
130 | TranslateQualifiedNamesVisitor visitor(visitor_data, log.stream()); |
131 | visitor.visit(query); |
132 | |
133 | /// This may happen after expansion of COLUMNS('regexp'). |
134 | if (select_query.select()->children.empty()) |
135 | throw Exception("Empty list of columns in SELECT query" , ErrorCodes::EMPTY_LIST_OF_COLUMNS_QUERIED); |
136 | } |
137 | |
138 | bool hasArrayJoin(const ASTPtr & ast) |
139 | { |
140 | if (const ASTFunction * function = ast->as<ASTFunction>()) |
141 | if (function->name == "arrayJoin" ) |
142 | return true; |
143 | |
144 | for (const auto & child : ast->children) |
145 | if (!child->as<ASTSelectQuery>() && hasArrayJoin(child)) |
146 | return true; |
147 | |
148 | return false; |
149 | } |
150 | |
151 | /// Keep number of columns for 'GLOBAL IN (SELECT 1 AS a, a)' |
152 | void renameDuplicatedColumns(const ASTSelectQuery * select_query) |
153 | { |
154 | ASTs & elements = select_query->select()->children; |
155 | |
156 | std::set<String> all_column_names; |
157 | std::set<String> assigned_column_names; |
158 | |
159 | for (auto & expr : elements) |
160 | all_column_names.insert(expr->getAliasOrColumnName()); |
161 | |
162 | for (auto & expr : elements) |
163 | { |
164 | auto name = expr->getAliasOrColumnName(); |
165 | |
166 | if (!assigned_column_names.insert(name).second) |
167 | { |
168 | size_t i = 1; |
169 | while (all_column_names.end() != all_column_names.find(name + "_" + toString(i))) |
170 | ++i; |
171 | |
172 | name = name + "_" + toString(i); |
173 | expr = expr->clone(); /// Cancels fuse of the same expressions in the tree. |
174 | expr->setAlias(name); |
175 | |
176 | all_column_names.insert(name); |
177 | assigned_column_names.insert(name); |
178 | } |
179 | } |
180 | } |
181 | |
182 | /// Sometimes we have to calculate more columns in SELECT clause than will be returned from query. |
183 | /// This is the case when we have DISTINCT or arrayJoin: we require more columns in SELECT even if we need less columns in result. |
184 | /// Also we have to remove duplicates in case of GLOBAL subqueries. Their results are placed into tables so duplicates are inpossible. |
185 | void removeUnneededColumnsFromSelectClause(const ASTSelectQuery * select_query, const Names & required_result_columns, bool remove_dups) |
186 | { |
187 | ASTs & elements = select_query->select()->children; |
188 | |
189 | std::map<String, size_t> required_columns_with_duplicate_count; |
190 | |
191 | if (!required_result_columns.empty()) |
192 | { |
193 | /// Some columns may be queried multiple times, like SELECT x, y, y FROM table. |
194 | for (const auto & name : required_result_columns) |
195 | { |
196 | if (remove_dups) |
197 | required_columns_with_duplicate_count[name] = 1; |
198 | else |
199 | ++required_columns_with_duplicate_count[name]; |
200 | } |
201 | } |
202 | else if (remove_dups) |
203 | { |
204 | /// Even if we have no requirements there could be duplicates cause of asterisks. SELECT *, t.* |
205 | for (const auto & elem : elements) |
206 | required_columns_with_duplicate_count.emplace(elem->getAliasOrColumnName(), 1); |
207 | } |
208 | else |
209 | return; |
210 | |
211 | ASTs new_elements; |
212 | new_elements.reserve(elements.size()); |
213 | |
214 | for (const auto & elem : elements) |
215 | { |
216 | String name = elem->getAliasOrColumnName(); |
217 | |
218 | auto it = required_columns_with_duplicate_count.find(name); |
219 | if (required_columns_with_duplicate_count.end() != it && it->second) |
220 | { |
221 | new_elements.push_back(elem); |
222 | --it->second; |
223 | } |
224 | else if (select_query->distinct || hasArrayJoin(elem)) |
225 | { |
226 | new_elements.push_back(elem); |
227 | } |
228 | } |
229 | |
230 | elements = std::move(new_elements); |
231 | } |
232 | |
233 | /// Replacing scalar subqueries with constant values. |
234 | void executeScalarSubqueries(ASTPtr & query, const Context & context, size_t subquery_depth, Scalars & scalars) |
235 | { |
236 | LogAST log; |
237 | ExecuteScalarSubqueriesVisitor::Data visitor_data{context, subquery_depth, scalars}; |
238 | ExecuteScalarSubqueriesVisitor(visitor_data, log.stream()).visit(query); |
239 | } |
240 | |
241 | /** Calls to these functions in the GROUP BY statement would be |
242 | * replaced by their immediate argument. |
243 | */ |
244 | const std::unordered_set<String> injective_function_names |
245 | { |
246 | "negate" , |
247 | "bitNot" , |
248 | "reverse" , |
249 | "reverseUTF8" , |
250 | "toString" , |
251 | "toFixedString" , |
252 | "IPv4NumToString" , |
253 | "IPv4StringToNum" , |
254 | "hex" , |
255 | "unhex" , |
256 | "bitmaskToList" , |
257 | "bitmaskToArray" , |
258 | "tuple" , |
259 | "regionToName" , |
260 | "concatAssumeInjective" , |
261 | }; |
262 | |
263 | const std::unordered_set<String> possibly_injective_function_names |
264 | { |
265 | "dictGetString" , |
266 | "dictGetUInt8" , |
267 | "dictGetUInt16" , |
268 | "dictGetUInt32" , |
269 | "dictGetUInt64" , |
270 | "dictGetInt8" , |
271 | "dictGetInt16" , |
272 | "dictGetInt32" , |
273 | "dictGetInt64" , |
274 | "dictGetFloat32" , |
275 | "dictGetFloat64" , |
276 | "dictGetDate" , |
277 | "dictGetDateTime" |
278 | }; |
279 | |
280 | /** You can not completely remove GROUP BY. Because if there were no aggregate functions, then it turns out that there will be no aggregation. |
281 | * Instead, leave `GROUP BY const`. |
282 | * Next, see deleting the constants in the analyzeAggregation method. |
283 | */ |
284 | void appendUnusedGroupByColumn(ASTSelectQuery * select_query, const NameSet & source_columns) |
285 | { |
286 | /// You must insert a constant that is not the name of the column in the table. Such a case is rare, but it happens. |
287 | UInt64 unused_column = 0; |
288 | String unused_column_name = toString(unused_column); |
289 | |
290 | while (source_columns.count(unused_column_name)) |
291 | { |
292 | ++unused_column; |
293 | unused_column_name = toString(unused_column); |
294 | } |
295 | |
296 | select_query->setExpression(ASTSelectQuery::Expression::GROUP_BY, std::make_shared<ASTExpressionList>()); |
297 | select_query->groupBy()->children.emplace_back(std::make_shared<ASTLiteral>(UInt64(unused_column))); |
298 | } |
299 | |
300 | /// Eliminates injective function calls and constant expressions from group by statement. |
301 | void optimizeGroupBy(ASTSelectQuery * select_query, const NameSet & source_columns, const Context & context) |
302 | { |
303 | if (!select_query->groupBy()) |
304 | { |
305 | // If there is a HAVING clause without GROUP BY, make sure we have some aggregation happen. |
306 | if (select_query->having()) |
307 | appendUnusedGroupByColumn(select_query, source_columns); |
308 | return; |
309 | } |
310 | |
311 | const auto is_literal = [] (const ASTPtr & ast) -> bool |
312 | { |
313 | return ast->as<ASTLiteral>(); |
314 | }; |
315 | |
316 | auto & group_exprs = select_query->groupBy()->children; |
317 | |
318 | /// removes expression at index idx by making it last one and calling .pop_back() |
319 | const auto remove_expr_at_index = [&group_exprs] (const size_t idx) |
320 | { |
321 | if (idx < group_exprs.size() - 1) |
322 | std::swap(group_exprs[idx], group_exprs.back()); |
323 | |
324 | group_exprs.pop_back(); |
325 | }; |
326 | |
327 | /// iterate over each GROUP BY expression, eliminate injective function calls and literals |
328 | for (size_t i = 0; i < group_exprs.size();) |
329 | { |
330 | if (const auto * function = group_exprs[i]->as<ASTFunction>()) |
331 | { |
332 | /// assert function is injective |
333 | if (possibly_injective_function_names.count(function->name)) |
334 | { |
335 | /// do not handle semantic errors here |
336 | if (function->arguments->children.size() < 2) |
337 | { |
338 | ++i; |
339 | continue; |
340 | } |
341 | |
342 | const auto & dict_name = function->arguments->children[0]->as<ASTLiteral &>().value.safeGet<String>(); |
343 | const auto & dict_ptr = context.getExternalDictionariesLoader().getDictionary(dict_name); |
344 | const auto & attr_name = function->arguments->children[1]->as<ASTLiteral &>().value.safeGet<String>(); |
345 | |
346 | if (!dict_ptr->isInjective(attr_name)) |
347 | { |
348 | ++i; |
349 | continue; |
350 | } |
351 | } |
352 | else if (!injective_function_names.count(function->name)) |
353 | { |
354 | ++i; |
355 | continue; |
356 | } |
357 | |
358 | /// copy shared pointer to args in order to ensure lifetime |
359 | auto args_ast = function->arguments; |
360 | |
361 | /** remove function call and take a step back to ensure |
362 | * next iteration does not skip not yet processed data |
363 | */ |
364 | remove_expr_at_index(i); |
365 | |
366 | /// copy non-literal arguments |
367 | std::remove_copy_if( |
368 | std::begin(args_ast->children), std::end(args_ast->children), |
369 | std::back_inserter(group_exprs), is_literal |
370 | ); |
371 | } |
372 | else if (is_literal(group_exprs[i])) |
373 | { |
374 | remove_expr_at_index(i); |
375 | } |
376 | else |
377 | { |
378 | /// if neither a function nor literal - advance to next expression |
379 | ++i; |
380 | } |
381 | } |
382 | |
383 | if (group_exprs.empty()) |
384 | appendUnusedGroupByColumn(select_query, source_columns); |
385 | } |
386 | |
387 | /// Remove duplicate items from ORDER BY. |
388 | void optimizeOrderBy(const ASTSelectQuery * select_query) |
389 | { |
390 | if (!select_query->orderBy()) |
391 | return; |
392 | |
393 | /// Make unique sorting conditions. |
394 | using NameAndLocale = std::pair<String, String>; |
395 | std::set<NameAndLocale> elems_set; |
396 | |
397 | ASTs & elems = select_query->orderBy()->children; |
398 | ASTs unique_elems; |
399 | unique_elems.reserve(elems.size()); |
400 | |
401 | for (const auto & elem : elems) |
402 | { |
403 | String name = elem->children.front()->getColumnName(); |
404 | const auto & order_by_elem = elem->as<ASTOrderByElement &>(); |
405 | |
406 | if (elems_set.emplace(name, order_by_elem.collation ? order_by_elem.collation->getColumnName() : "" ).second) |
407 | unique_elems.emplace_back(elem); |
408 | } |
409 | |
410 | if (unique_elems.size() < elems.size()) |
411 | elems = std::move(unique_elems); |
412 | } |
413 | |
414 | /// Remove duplicate items from LIMIT BY. |
415 | void optimizeLimitBy(const ASTSelectQuery * select_query) |
416 | { |
417 | if (!select_query->limitBy()) |
418 | return; |
419 | |
420 | std::set<String> elems_set; |
421 | |
422 | ASTs & elems = select_query->limitBy()->children; |
423 | ASTs unique_elems; |
424 | unique_elems.reserve(elems.size()); |
425 | |
426 | for (const auto & elem : elems) |
427 | { |
428 | if (elems_set.emplace(elem->getColumnName()).second) |
429 | unique_elems.emplace_back(elem); |
430 | } |
431 | |
432 | if (unique_elems.size() < elems.size()) |
433 | elems = std::move(unique_elems); |
434 | } |
435 | |
436 | /// Remove duplicated columns from USING(...). |
437 | void optimizeUsing(const ASTSelectQuery * select_query) |
438 | { |
439 | if (!select_query->join()) |
440 | return; |
441 | |
442 | const auto * table_join = select_query->join()->table_join->as<ASTTableJoin>(); |
443 | if (!(table_join && table_join->using_expression_list)) |
444 | return; |
445 | |
446 | ASTs & expression_list = table_join->using_expression_list->children; |
447 | ASTs uniq_expressions_list; |
448 | |
449 | std::set<String> expressions_names; |
450 | |
451 | for (const auto & expression : expression_list) |
452 | { |
453 | auto expression_name = expression->getAliasOrColumnName(); |
454 | if (expressions_names.find(expression_name) == expressions_names.end()) |
455 | { |
456 | uniq_expressions_list.push_back(expression); |
457 | expressions_names.insert(expression_name); |
458 | } |
459 | } |
460 | |
461 | if (uniq_expressions_list.size() < expression_list.size()) |
462 | expression_list = uniq_expressions_list; |
463 | } |
464 | |
465 | void getArrayJoinedColumns(ASTPtr & query, SyntaxAnalyzerResult & result, const ASTSelectQuery * select_query, |
466 | const NamesAndTypesList & source_columns, const NameSet & source_columns_set) |
467 | { |
468 | if (ASTPtr array_join_expression_list = select_query->array_join_expression_list()) |
469 | { |
470 | ArrayJoinedColumnsVisitor::Data visitor_data{result.aliases, |
471 | result.array_join_name_to_alias, |
472 | result.array_join_alias_to_name, |
473 | result.array_join_result_to_source}; |
474 | ArrayJoinedColumnsVisitor(visitor_data).visit(query); |
475 | |
476 | /// If the result of ARRAY JOIN is not used, it is necessary to ARRAY-JOIN any column, |
477 | /// to get the correct number of rows. |
478 | if (result.array_join_result_to_source.empty()) |
479 | { |
480 | ASTPtr expr = select_query->array_join_expression_list()->children.at(0); |
481 | String source_name = expr->getColumnName(); |
482 | String result_name = expr->getAliasOrColumnName(); |
483 | |
484 | /// This is an array. |
485 | if (!expr->as<ASTIdentifier>() || source_columns_set.count(source_name)) |
486 | { |
487 | result.array_join_result_to_source[result_name] = source_name; |
488 | } |
489 | else /// This is a nested table. |
490 | { |
491 | bool found = false; |
492 | for (const auto & column : source_columns) |
493 | { |
494 | auto splitted = Nested::splitName(column.name); |
495 | if (splitted.first == source_name && !splitted.second.empty()) |
496 | { |
497 | result.array_join_result_to_source[Nested::concatenateName(result_name, splitted.second)] = column.name; |
498 | found = true; |
499 | break; |
500 | } |
501 | } |
502 | if (!found) |
503 | throw Exception("No columns in nested table " + source_name, ErrorCodes::EMPTY_NESTED_TABLE); |
504 | } |
505 | } |
506 | } |
507 | } |
508 | |
509 | void setJoinStrictness(ASTSelectQuery & select_query, JoinStrictness join_default_strictness, bool old_any, ASTTableJoin & out_table_join) |
510 | { |
511 | const ASTTablesInSelectQueryElement * node = select_query.join(); |
512 | if (!node) |
513 | return; |
514 | |
515 | auto & table_join = const_cast<ASTTablesInSelectQueryElement *>(node)->table_join->as<ASTTableJoin &>(); |
516 | |
517 | if (table_join.strictness == ASTTableJoin::Strictness::Unspecified && |
518 | table_join.kind != ASTTableJoin::Kind::Cross) |
519 | { |
520 | if (join_default_strictness == JoinStrictness::ANY) |
521 | table_join.strictness = ASTTableJoin::Strictness::Any; |
522 | else if (join_default_strictness == JoinStrictness::ALL) |
523 | table_join.strictness = ASTTableJoin::Strictness::All; |
524 | else |
525 | throw Exception("Expected ANY or ALL in JOIN section, because setting (join_default_strictness) is empty" , |
526 | DB::ErrorCodes::EXPECTED_ALL_OR_ANY); |
527 | } |
528 | |
529 | if (old_any && table_join.strictness == ASTTableJoin::Strictness::Any) |
530 | table_join.strictness = ASTTableJoin::Strictness::RightAny; |
531 | |
532 | out_table_join = table_join; |
533 | } |
534 | |
535 | /// Find the columns that are obtained by JOIN. |
536 | void collectJoinedColumns(AnalyzedJoin & analyzed_join, const ASTSelectQuery & select_query, |
537 | const std::vector<TableWithColumnNames> & tables, const Aliases & aliases) |
538 | { |
539 | const ASTTablesInSelectQueryElement * node = select_query.join(); |
540 | if (!node) |
541 | return; |
542 | |
543 | const auto & table_join = node->table_join->as<ASTTableJoin &>(); |
544 | |
545 | if (table_join.using_expression_list) |
546 | { |
547 | const auto & keys = table_join.using_expression_list->as<ASTExpressionList &>(); |
548 | for (const auto & key : keys.children) |
549 | analyzed_join.addUsingKey(key); |
550 | } |
551 | else if (table_join.on_expression) |
552 | { |
553 | bool is_asof = (table_join.strictness == ASTTableJoin::Strictness::Asof); |
554 | |
555 | CollectJoinOnKeysVisitor::Data data{analyzed_join, tables[0], tables[1], aliases, is_asof}; |
556 | CollectJoinOnKeysVisitor(data).visit(table_join.on_expression); |
557 | if (!data.has_some) |
558 | throw Exception("Cannot get JOIN keys from JOIN ON section: " + queryToString(table_join.on_expression), |
559 | ErrorCodes::INVALID_JOIN_ON_EXPRESSION); |
560 | if (is_asof) |
561 | data.asofToJoinKeys(); |
562 | } |
563 | } |
564 | |
565 | void replaceJoinedTable(const ASTTablesInSelectQueryElement * join) |
566 | { |
567 | if (!join || !join->table_expression) |
568 | return; |
569 | |
570 | /// TODO: Push down for CROSS JOIN is not OK [disabled] |
571 | const auto & table_join = join->table_join->as<ASTTableJoin &>(); |
572 | if (table_join.kind == ASTTableJoin::Kind::Cross) |
573 | return; |
574 | |
575 | auto & table_expr = join->table_expression->as<ASTTableExpression &>(); |
576 | if (table_expr.database_and_table_name) |
577 | { |
578 | const auto & table_id = table_expr.database_and_table_name->as<ASTIdentifier &>(); |
579 | String expr = "(select * from " + table_id.name + ") as " + table_id.shortName(); |
580 | |
581 | // FIXME: since the expression "a as b" exposes both "a" and "b" names, which is not equivalent to "(select * from a) as b", |
582 | // we can't replace aliased tables. |
583 | // FIXME: long table names include database name, which we can't save within alias. |
584 | if (table_id.alias.empty() && table_id.isShort()) |
585 | { |
586 | ParserTableExpression parser; |
587 | table_expr = parseQuery(parser, expr, 0)->as<ASTTableExpression &>(); |
588 | } |
589 | } |
590 | } |
591 | |
592 | void checkJoin(const ASTTablesInSelectQueryElement * join) |
593 | { |
594 | if (!join->table_join) |
595 | return; |
596 | |
597 | const auto & table_join = join->table_join->as<ASTTableJoin &>(); |
598 | |
599 | if (table_join.strictness == ASTTableJoin::Strictness::Any) |
600 | if (table_join.kind == ASTTableJoin::Kind::Full) |
601 | throw Exception("ANY FULL JOINs are not implemented." , ErrorCodes::NOT_IMPLEMENTED); |
602 | } |
603 | |
604 | std::vector<const ASTFunction *> getAggregates(const ASTPtr & query) |
605 | { |
606 | if (const auto * select_query = query->as<ASTSelectQuery>()) |
607 | { |
608 | /// There can not be aggregate functions inside the WHERE and PREWHERE. |
609 | if (select_query->where()) |
610 | assertNoAggregates(select_query->where(), "in WHERE" ); |
611 | if (select_query->prewhere()) |
612 | assertNoAggregates(select_query->prewhere(), "in PREWHERE" ); |
613 | |
614 | GetAggregatesVisitor::Data data; |
615 | GetAggregatesVisitor(data).visit(query); |
616 | |
617 | /// There can not be other aggregate functions within the aggregate functions. |
618 | for (const ASTFunction * node : data.aggregates) |
619 | for (auto & arg : node->arguments->children) |
620 | assertNoAggregates(arg, "inside another aggregate function" ); |
621 | return data.aggregates; |
622 | } |
623 | else |
624 | assertNoAggregates(query, "in wrong place" ); |
625 | return {}; |
626 | } |
627 | |
628 | } |
629 | |
630 | /// Calculate which columns are required to execute the expression. |
631 | /// Then, delete all other columns from the list of available columns. |
632 | /// After execution, columns will only contain the list of columns needed to read from the table. |
633 | void SyntaxAnalyzerResult::collectUsedColumns(const ASTPtr & query, const NamesAndTypesList & additional_source_columns) |
634 | { |
635 | /// We caclulate required_source_columns with source_columns modifications and swap them on exit |
636 | required_source_columns = source_columns; |
637 | |
638 | if (!additional_source_columns.empty()) |
639 | { |
640 | source_columns.insert(source_columns.end(), additional_source_columns.begin(), additional_source_columns.end()); |
641 | removeDuplicateColumns(source_columns); |
642 | } |
643 | |
644 | RequiredSourceColumnsVisitor::Data columns_context; |
645 | RequiredSourceColumnsVisitor(columns_context).visit(query); |
646 | |
647 | NameSet source_column_names; |
648 | for (const auto & column : source_columns) |
649 | source_column_names.insert(column.name); |
650 | |
651 | NameSet required = columns_context.requiredColumns(); |
652 | |
653 | if (columns_context.has_table_join) |
654 | { |
655 | NameSet avaliable_columns; |
656 | for (const auto & name : source_columns) |
657 | avaliable_columns.insert(name.name); |
658 | |
659 | /// Add columns obtained by JOIN (if needed). |
660 | for (const auto & joined_column : analyzed_join->columnsFromJoinedTable()) |
661 | { |
662 | auto & name = joined_column.name; |
663 | if (avaliable_columns.count(name)) |
664 | continue; |
665 | |
666 | if (required.count(name)) |
667 | { |
668 | /// Optimisation: do not add columns needed only in JOIN ON section. |
669 | if (columns_context.nameInclusion(name) > analyzed_join->rightKeyInclusion(name)) |
670 | analyzed_join->addJoinedColumn(joined_column); |
671 | |
672 | required.erase(name); |
673 | } |
674 | } |
675 | } |
676 | |
677 | NameSet array_join_sources; |
678 | if (columns_context.has_array_join) |
679 | { |
680 | /// Insert the columns required for the ARRAY JOIN calculation into the required columns list. |
681 | for (const auto & result_source : array_join_result_to_source) |
682 | array_join_sources.insert(result_source.second); |
683 | |
684 | for (const auto & column_name_type : source_columns) |
685 | if (array_join_sources.count(column_name_type.name)) |
686 | required.insert(column_name_type.name); |
687 | } |
688 | |
689 | const auto * select_query = query->as<ASTSelectQuery>(); |
690 | |
691 | /// You need to read at least one column to find the number of rows. |
692 | if (select_query && required.empty()) |
693 | { |
694 | maybe_optimize_trivial_count = true; |
695 | /// We will find a column with minimum <compressed_size, type_size, uncompressed_size>. |
696 | /// Because it is the column that is cheapest to read. |
697 | struct ColumnSizeTuple |
698 | { |
699 | size_t compressed_size; |
700 | size_t type_size; |
701 | size_t uncompressed_size; |
702 | String name; |
703 | bool operator<(const ColumnSizeTuple & that) const |
704 | { |
705 | return std::tie(compressed_size, type_size, uncompressed_size) |
706 | < std::tie(that.compressed_size, that.type_size, that.uncompressed_size); |
707 | } |
708 | }; |
709 | std::vector<ColumnSizeTuple> columns; |
710 | if (storage) |
711 | { |
712 | auto column_sizes = storage->getColumnSizes(); |
713 | for (auto & source_column : source_columns) |
714 | { |
715 | auto c = column_sizes.find(source_column.name); |
716 | if (c == column_sizes.end()) |
717 | continue; |
718 | size_t type_size = source_column.type->haveMaximumSizeOfValue() ? source_column.type->getMaximumSizeOfValueInMemory() : 100; |
719 | columns.emplace_back(ColumnSizeTuple{c->second.data_compressed, type_size, c->second.data_uncompressed, source_column.name}); |
720 | } |
721 | } |
722 | if (columns.size()) |
723 | required.insert(std::min_element(columns.begin(), columns.end())->name); |
724 | else |
725 | /// If we have no information about columns sizes, choose a column of minimum size of its data type. |
726 | required.insert(ExpressionActions::getSmallestColumn(source_columns)); |
727 | } |
728 | |
729 | NameSet unknown_required_source_columns = required; |
730 | |
731 | for (NamesAndTypesList::iterator it = source_columns.begin(); it != source_columns.end();) |
732 | { |
733 | const String & column_name = it->name; |
734 | unknown_required_source_columns.erase(column_name); |
735 | |
736 | if (!required.count(column_name)) |
737 | source_columns.erase(it++); |
738 | else |
739 | ++it; |
740 | } |
741 | |
742 | /// If there are virtual columns among the unknown columns. Remove them from the list of unknown and add |
743 | /// in columns list, so that when further processing they are also considered. |
744 | if (storage) |
745 | { |
746 | for (auto it = unknown_required_source_columns.begin(); it != unknown_required_source_columns.end();) |
747 | { |
748 | if (storage->hasColumn(*it)) |
749 | { |
750 | source_columns.push_back(storage->getColumn(*it)); |
751 | unknown_required_source_columns.erase(it++); |
752 | } |
753 | else |
754 | ++it; |
755 | } |
756 | } |
757 | |
758 | if (!unknown_required_source_columns.empty()) |
759 | { |
760 | std::stringstream ss; |
761 | ss << "Missing columns:" ; |
762 | for (const auto & name : unknown_required_source_columns) |
763 | ss << " '" << name << "'" ; |
764 | ss << " while processing query: '" << queryToString(query) << "'" ; |
765 | |
766 | ss << ", required columns:" ; |
767 | for (const auto & name : columns_context.requiredColumns()) |
768 | ss << " '" << name << "'" ; |
769 | |
770 | if (!source_column_names.empty()) |
771 | { |
772 | ss << ", source columns:" ; |
773 | for (const auto & name : source_column_names) |
774 | ss << " '" << name << "'" ; |
775 | } |
776 | else |
777 | ss << ", no source columns" ; |
778 | |
779 | if (columns_context.has_table_join) |
780 | { |
781 | ss << ", joined columns:" ; |
782 | for (const auto & column : analyzed_join->columnsFromJoinedTable()) |
783 | ss << " '" << column.name << "'" ; |
784 | } |
785 | |
786 | if (!array_join_sources.empty()) |
787 | { |
788 | ss << ", arrayJoin columns:" ; |
789 | for (const auto & name : array_join_sources) |
790 | ss << " '" << name << "'" ; |
791 | } |
792 | |
793 | throw Exception(ss.str(), ErrorCodes::UNKNOWN_IDENTIFIER); |
794 | } |
795 | |
796 | required_source_columns.swap(source_columns); |
797 | } |
798 | |
799 | |
800 | SyntaxAnalyzerResultPtr SyntaxAnalyzer::analyze( |
801 | ASTPtr & query, |
802 | const NamesAndTypesList & source_columns_, |
803 | const Names & required_result_columns, |
804 | StoragePtr storage, |
805 | const NamesAndTypesList & additional_source_columns) const |
806 | { |
807 | auto * select_query = query->as<ASTSelectQuery>(); |
808 | if (!storage && select_query) |
809 | { |
810 | if (auto db_and_table = getDatabaseAndTable(*select_query, 0)) |
811 | storage = context.tryGetTable(db_and_table->database, db_and_table->table); |
812 | } |
813 | |
814 | const auto & settings = context.getSettingsRef(); |
815 | |
816 | SyntaxAnalyzerResult result; |
817 | result.storage = storage; |
818 | result.source_columns = source_columns_; |
819 | result.analyzed_join = std::make_shared<AnalyzedJoin>(settings, context.getTemporaryPath()); /// TODO: move to select_query logic |
820 | |
821 | if (storage) |
822 | collectSourceColumns(storage->getColumns(), result.source_columns, (select_query != nullptr)); |
823 | NameSet source_columns_set = removeDuplicateColumns(result.source_columns); |
824 | std::vector<TableWithColumnNames> tables_with_columns; |
825 | |
826 | if (select_query) |
827 | { |
828 | if (remove_duplicates) |
829 | renameDuplicatedColumns(select_query); |
830 | |
831 | const ASTTablesInSelectQueryElement * table_join_node = select_query->join(); |
832 | if (table_join_node) |
833 | { |
834 | if (!settings.any_join_distinct_right_table_keys) |
835 | checkJoin(table_join_node); |
836 | |
837 | if (settings.enable_optimize_predicate_expression) |
838 | replaceJoinedTable(table_join_node); |
839 | } |
840 | |
841 | std::vector<const ASTTableExpression *> table_expressions = getTableExpressions(*select_query); |
842 | tables_with_columns = getTablesWithColumns(table_expressions, context); |
843 | |
844 | if (tables_with_columns.empty()) |
845 | { |
846 | if (storage) |
847 | { |
848 | const ColumnsDescription & starage_columns = storage->getColumns(); |
849 | tables_with_columns.emplace_back(DatabaseAndTableWithAlias{}, starage_columns.getOrdinary().getNames()); |
850 | auto & table = tables_with_columns.back(); |
851 | table.addHiddenColumns(starage_columns.getMaterialized()); |
852 | table.addHiddenColumns(starage_columns.getAliases()); |
853 | table.addHiddenColumns(starage_columns.getVirtuals()); |
854 | } |
855 | else |
856 | { |
857 | Names columns; |
858 | columns.reserve(result.source_columns.size()); |
859 | for (const auto & column : result.source_columns) |
860 | columns.push_back(column.name); |
861 | tables_with_columns.emplace_back(DatabaseAndTableWithAlias{}, columns); |
862 | } |
863 | } |
864 | |
865 | if (table_expressions.size() > 1) |
866 | { |
867 | result.analyzed_join->columns_from_joined_table = getColumnsFromTableExpression(*table_expressions[1], context); |
868 | result.analyzed_join->deduplicateAndQualifyColumnNames( |
869 | source_columns_set, tables_with_columns[1].table.getQualifiedNamePrefix()); |
870 | } |
871 | |
872 | translateQualifiedNames(query, *select_query, source_columns_set, std::move(tables_with_columns)); |
873 | |
874 | /// Rewrite IN and/or JOIN for distributed tables according to distributed_product_mode setting. |
875 | InJoinSubqueriesPreprocessor(context).visit(query); |
876 | |
877 | /// Optimizes logical expressions. |
878 | LogicalExpressionsOptimizer(select_query, settings.optimize_min_equality_disjunction_chain_length.value).perform(); |
879 | } |
880 | |
881 | { |
882 | CustomizeFunctionsVisitor::Data data{settings.count_distinct_implementation}; |
883 | CustomizeFunctionsVisitor(data).visit(query); |
884 | } |
885 | |
886 | /// Creates a dictionary `aliases`: alias -> ASTPtr |
887 | { |
888 | LogAST log; |
889 | QueryAliasesVisitor::Data query_aliases_data{result.aliases}; |
890 | QueryAliasesVisitor(query_aliases_data, log.stream()).visit(query); |
891 | } |
892 | |
893 | /// Mark table ASTIdentifiers with not a column marker |
894 | { |
895 | MarkTableIdentifiersVisitor::Data data{result.aliases}; |
896 | MarkTableIdentifiersVisitor(data).visit(query); |
897 | } |
898 | |
899 | /// Common subexpression elimination. Rewrite rules. |
900 | { |
901 | QueryNormalizer::Data normalizer_data(result.aliases, context.getSettingsRef()); |
902 | QueryNormalizer(normalizer_data).visit(query); |
903 | } |
904 | |
905 | /// Remove unneeded columns according to 'required_result_columns'. |
906 | /// Leave all selected columns in case of DISTINCT; columns that contain arrayJoin function inside. |
907 | /// Must be after 'normalizeTree' (after expanding aliases, for aliases not get lost) |
908 | /// and before 'executeScalarSubqueries', 'analyzeAggregation', etc. to avoid excessive calculations. |
909 | if (select_query) |
910 | removeUnneededColumnsFromSelectClause(select_query, required_result_columns, remove_duplicates); |
911 | |
912 | /// Executing scalar subqueries - replacing them with constant values. |
913 | executeScalarSubqueries(query, context, subquery_depth, result.scalars); |
914 | |
915 | /// Optimize if with constant condition after constants was substituted instead of scalar subqueries. |
916 | OptimizeIfWithConstantConditionVisitor(result.aliases).visit(query); |
917 | |
918 | if (settings.optimize_if_chain_to_miltiif) |
919 | OptimizeIfChainsVisitor().visit(query); |
920 | |
921 | if (select_query) |
922 | { |
923 | /// GROUP BY injective function elimination. |
924 | optimizeGroupBy(select_query, source_columns_set, context); |
925 | |
926 | /// Remove duplicate items from ORDER BY. |
927 | optimizeOrderBy(select_query); |
928 | |
929 | /// Remove duplicated elements from LIMIT BY clause. |
930 | optimizeLimitBy(select_query); |
931 | |
932 | /// Remove duplicated columns from USING(...). |
933 | optimizeUsing(select_query); |
934 | |
935 | /// array_join_alias_to_name, array_join_result_to_source. |
936 | getArrayJoinedColumns(query, result, select_query, result.source_columns, source_columns_set); |
937 | |
938 | /// Push the predicate expression down to the subqueries. |
939 | result.rewrite_subqueries = PredicateExpressionsOptimizer(select_query, settings, context).optimize(); |
940 | |
941 | setJoinStrictness(*select_query, settings.join_default_strictness, settings.any_join_distinct_right_table_keys, |
942 | result.analyzed_join->table_join); |
943 | collectJoinedColumns(*result.analyzed_join, *select_query, tables_with_columns, result.aliases); |
944 | } |
945 | |
946 | result.aggregates = getAggregates(query); |
947 | result.collectUsedColumns(query, additional_source_columns); |
948 | return std::make_shared<const SyntaxAnalyzerResult>(result); |
949 | } |
950 | |
951 | } |
952 | |