1#include <Core/Settings.h>
2#include <Core/NamesAndTypes.h>
3
4#include <Interpreters/SyntaxAnalyzer.h>
5#include <Interpreters/InJoinSubqueriesPreprocessor.h>
6#include <Interpreters/LogicalExpressionsOptimizer.h>
7#include <Interpreters/QueryAliasesVisitor.h>
8#include <Interpreters/InterpreterSelectWithUnionQuery.h>
9#include <Interpreters/ArrayJoinedColumnsVisitor.h>
10#include <Interpreters/TranslateQualifiedNamesVisitor.h>
11#include <Interpreters/Context.h>
12#include <Interpreters/MarkTableIdentifiersVisitor.h>
13#include <Interpreters/QueryNormalizer.h>
14#include <Interpreters/ExecuteScalarSubqueriesVisitor.h>
15#include <Interpreters/PredicateExpressionsOptimizer.h>
16#include <Interpreters/CollectJoinOnKeysVisitor.h>
17#include <Interpreters/ExternalDictionariesLoader.h>
18#include <Interpreters/OptimizeIfWithConstantConditionVisitor.h>
19#include <Interpreters/RequiredSourceColumnsVisitor.h>
20#include <Interpreters/GetAggregatesVisitor.h>
21#include <Interpreters/AnalyzedJoin.h>
22#include <Interpreters/ExpressionActions.h> /// getSmallestColumn()
23#include <Interpreters/getTableExpressions.h>
24#include <Interpreters/OptimizeIfChains.h>
25
26#include <Parsers/ASTExpressionList.h>
27#include <Parsers/ASTFunction.h>
28#include <Parsers/ASTLiteral.h>
29#include <Parsers/ASTOrderByElement.h>
30#include <Parsers/ASTSelectQuery.h>
31#include <Parsers/ASTTablesInSelectQuery.h>
32#include <Parsers/ParserTablesInSelectQuery.h>
33#include <Parsers/parseQuery.h>
34#include <Parsers/queryToString.h>
35
36#include <DataTypes/NestedUtils.h>
37#include <DataTypes/DataTypeNullable.h>
38
39#include <IO/WriteHelpers.h>
40#include <Storages/IStorage.h>
41
42#include <functional>
43
44
45namespace DB
46{
47
/// Error codes referenced from this file. They are only declared here (extern);
/// the definitions live elsewhere.
namespace ErrorCodes
{
    extern const int EMPTY_NESTED_TABLE;
    extern const int LOGICAL_ERROR;
    extern const int INVALID_JOIN_ON_EXPRESSION;
    extern const int EMPTY_LIST_OF_COLUMNS_QUERIED;
    extern const int NOT_IMPLEMENTED;
    extern const int UNKNOWN_IDENTIFIER;
    extern const int EXPECTED_ALL_OR_ANY;
    extern const int ALIAS_REQUIRED;
}
59
60namespace
61{
62
/// Logger whose stream is handed to the AST visitors created in this file.
/// The template argument is a compile-time switch: set to true to enable logs.
using LogAST = DebugASTLog<false>;
64
65/// Select implementation of countDistinct based on settings.
66/// Important that it is done as query rewrite. It means rewritten query
67/// will be sent to remote servers during distributed query execution,
68/// and on all remote servers, function implementation will be same.
69struct CustomizeFunctionsData
70{
71 using TypeToVisit = ASTFunction;
72
73 const String & count_distinct;
74
75 void visit(ASTFunction & func, ASTPtr &)
76 {
77 if (Poco::toLower(func.name) == "countdistinct")
78 func.name = count_distinct;
79 }
80};
81
/// Matcher/visitor pair that runs CustomizeFunctionsData over the ASTFunction nodes of a query.
using CustomizeFunctionsMatcher = OneTypeMatcher<CustomizeFunctionsData>;
using CustomizeFunctionsVisitor = InDepthNodeVisitor<CustomizeFunctionsMatcher, true>;
84
85
86/// Add columns from storage to source_columns list.
87void collectSourceColumns(const ColumnsDescription & columns, NamesAndTypesList & source_columns, bool add_virtuals)
88{
89 auto physical_columns = columns.getAllPhysical();
90 if (source_columns.empty())
91 source_columns.swap(physical_columns);
92 else
93 source_columns.insert(source_columns.end(), physical_columns.begin(), physical_columns.end());
94
95 if (add_virtuals)
96 {
97 const auto & storage_aliases = columns.getAliases();
98 const auto & storage_virtuals = columns.getVirtuals();
99 source_columns.insert(source_columns.end(), storage_aliases.begin(), storage_aliases.end());
100 source_columns.insert(source_columns.end(), storage_virtuals.begin(), storage_virtuals.end());
101 }
102}
103
104std::vector<TableWithColumnNames> getTablesWithColumns(const std::vector<const ASTTableExpression * > & table_expressions,
105 const Context & context)
106{
107 std::vector<TableWithColumnNames> tables_with_columns = getDatabaseAndTablesWithColumnNames(table_expressions, context);
108
109 auto & settings = context.getSettingsRef();
110 if (settings.joined_subquery_requires_alias && tables_with_columns.size() > 1)
111 {
112 for (auto & pr : tables_with_columns)
113 if (pr.table.table.empty() && pr.table.alias.empty())
114 throw Exception("Not unique subquery in FROM requires an alias (or joined_subquery_requires_alias=0 to disable restriction).",
115 ErrorCodes::ALIAS_REQUIRED);
116 }
117
118 return tables_with_columns;
119}
120
121
122/// Translate qualified names such as db.table.column, table.column, table_alias.column to names' normal form.
123/// Expand asterisks and qualified asterisks with column names.
124/// There would be columns in normal form & column aliases after translation. Column & column alias would be normalized in QueryNormalizer.
125void translateQualifiedNames(ASTPtr & query, const ASTSelectQuery & select_query, const NameSet & source_columns_set,
126 std::vector<TableWithColumnNames> && tables_with_columns)
127{
128 LogAST log;
129 TranslateQualifiedNamesVisitor::Data visitor_data(source_columns_set, std::move(tables_with_columns));
130 TranslateQualifiedNamesVisitor visitor(visitor_data, log.stream());
131 visitor.visit(query);
132
133 /// This may happen after expansion of COLUMNS('regexp').
134 if (select_query.select()->children.empty())
135 throw Exception("Empty list of columns in SELECT query", ErrorCodes::EMPTY_LIST_OF_COLUMNS_QUERIED);
136}
137
138bool hasArrayJoin(const ASTPtr & ast)
139{
140 if (const ASTFunction * function = ast->as<ASTFunction>())
141 if (function->name == "arrayJoin")
142 return true;
143
144 for (const auto & child : ast->children)
145 if (!child->as<ASTSelectQuery>() && hasArrayJoin(child))
146 return true;
147
148 return false;
149}
150
151/// Keep number of columns for 'GLOBAL IN (SELECT 1 AS a, a)'
152void renameDuplicatedColumns(const ASTSelectQuery * select_query)
153{
154 ASTs & elements = select_query->select()->children;
155
156 std::set<String> all_column_names;
157 std::set<String> assigned_column_names;
158
159 for (auto & expr : elements)
160 all_column_names.insert(expr->getAliasOrColumnName());
161
162 for (auto & expr : elements)
163 {
164 auto name = expr->getAliasOrColumnName();
165
166 if (!assigned_column_names.insert(name).second)
167 {
168 size_t i = 1;
169 while (all_column_names.end() != all_column_names.find(name + "_" + toString(i)))
170 ++i;
171
172 name = name + "_" + toString(i);
173 expr = expr->clone(); /// Cancels fuse of the same expressions in the tree.
174 expr->setAlias(name);
175
176 all_column_names.insert(name);
177 assigned_column_names.insert(name);
178 }
179 }
180}
181
182/// Sometimes we have to calculate more columns in SELECT clause than will be returned from query.
183/// This is the case when we have DISTINCT or arrayJoin: we require more columns in SELECT even if we need less columns in result.
184/// Also we have to remove duplicates in case of GLOBAL subqueries. Their results are placed into tables so duplicates are inpossible.
185void removeUnneededColumnsFromSelectClause(const ASTSelectQuery * select_query, const Names & required_result_columns, bool remove_dups)
186{
187 ASTs & elements = select_query->select()->children;
188
189 std::map<String, size_t> required_columns_with_duplicate_count;
190
191 if (!required_result_columns.empty())
192 {
193 /// Some columns may be queried multiple times, like SELECT x, y, y FROM table.
194 for (const auto & name : required_result_columns)
195 {
196 if (remove_dups)
197 required_columns_with_duplicate_count[name] = 1;
198 else
199 ++required_columns_with_duplicate_count[name];
200 }
201 }
202 else if (remove_dups)
203 {
204 /// Even if we have no requirements there could be duplicates cause of asterisks. SELECT *, t.*
205 for (const auto & elem : elements)
206 required_columns_with_duplicate_count.emplace(elem->getAliasOrColumnName(), 1);
207 }
208 else
209 return;
210
211 ASTs new_elements;
212 new_elements.reserve(elements.size());
213
214 for (const auto & elem : elements)
215 {
216 String name = elem->getAliasOrColumnName();
217
218 auto it = required_columns_with_duplicate_count.find(name);
219 if (required_columns_with_duplicate_count.end() != it && it->second)
220 {
221 new_elements.push_back(elem);
222 --it->second;
223 }
224 else if (select_query->distinct || hasArrayJoin(elem))
225 {
226 new_elements.push_back(elem);
227 }
228 }
229
230 elements = std::move(new_elements);
231}
232
233/// Replacing scalar subqueries with constant values.
234void executeScalarSubqueries(ASTPtr & query, const Context & context, size_t subquery_depth, Scalars & scalars)
235{
236 LogAST log;
237 ExecuteScalarSubqueriesVisitor::Data visitor_data{context, subquery_depth, scalars};
238 ExecuteScalarSubqueriesVisitor(visitor_data, log.stream()).visit(query);
239}
240
/** Names of functions that optimizeGroupBy treats as injective.
  * Calls to these functions in the GROUP BY statement would be
  * replaced by their immediate (non-literal) arguments.
  * The lookup is by exact function name.
  */
const std::unordered_set<String> injective_function_names
{
    "negate",
    "bitNot",
    "reverse",
    "reverseUTF8",
    "toString",
    "toFixedString",
    "IPv4NumToString",
    "IPv4StringToNum",
    "hex",
    "unhex",
    "bitmaskToList",
    "bitmaskToArray",
    "tuple",
    "regionToName",
    "concatAssumeInjective",
};
262
/** Names of dictionary functions that are injective only for some attributes.
  * For these, optimizeGroupBy asks the dictionary (isInjective) about the requested attribute
  * before replacing the call in GROUP BY by its arguments.
  */
const std::unordered_set<String> possibly_injective_function_names
{
    "dictGetString",
    "dictGetUInt8",
    "dictGetUInt16",
    "dictGetUInt32",
    "dictGetUInt64",
    "dictGetInt8",
    "dictGetInt16",
    "dictGetInt32",
    "dictGetInt64",
    "dictGetFloat32",
    "dictGetFloat64",
    "dictGetDate",
    "dictGetDateTime"
};
279
280/** You can not completely remove GROUP BY. Because if there were no aggregate functions, then it turns out that there will be no aggregation.
281 * Instead, leave `GROUP BY const`.
282 * Next, see deleting the constants in the analyzeAggregation method.
283 */
284void appendUnusedGroupByColumn(ASTSelectQuery * select_query, const NameSet & source_columns)
285{
286 /// You must insert a constant that is not the name of the column in the table. Such a case is rare, but it happens.
287 UInt64 unused_column = 0;
288 String unused_column_name = toString(unused_column);
289
290 while (source_columns.count(unused_column_name))
291 {
292 ++unused_column;
293 unused_column_name = toString(unused_column);
294 }
295
296 select_query->setExpression(ASTSelectQuery::Expression::GROUP_BY, std::make_shared<ASTExpressionList>());
297 select_query->groupBy()->children.emplace_back(std::make_shared<ASTLiteral>(UInt64(unused_column)));
298}
299
300/// Eliminates injective function calls and constant expressions from group by statement.
301void optimizeGroupBy(ASTSelectQuery * select_query, const NameSet & source_columns, const Context & context)
302{
303 if (!select_query->groupBy())
304 {
305 // If there is a HAVING clause without GROUP BY, make sure we have some aggregation happen.
306 if (select_query->having())
307 appendUnusedGroupByColumn(select_query, source_columns);
308 return;
309 }
310
311 const auto is_literal = [] (const ASTPtr & ast) -> bool
312 {
313 return ast->as<ASTLiteral>();
314 };
315
316 auto & group_exprs = select_query->groupBy()->children;
317
318 /// removes expression at index idx by making it last one and calling .pop_back()
319 const auto remove_expr_at_index = [&group_exprs] (const size_t idx)
320 {
321 if (idx < group_exprs.size() - 1)
322 std::swap(group_exprs[idx], group_exprs.back());
323
324 group_exprs.pop_back();
325 };
326
327 /// iterate over each GROUP BY expression, eliminate injective function calls and literals
328 for (size_t i = 0; i < group_exprs.size();)
329 {
330 if (const auto * function = group_exprs[i]->as<ASTFunction>())
331 {
332 /// assert function is injective
333 if (possibly_injective_function_names.count(function->name))
334 {
335 /// do not handle semantic errors here
336 if (function->arguments->children.size() < 2)
337 {
338 ++i;
339 continue;
340 }
341
342 const auto & dict_name = function->arguments->children[0]->as<ASTLiteral &>().value.safeGet<String>();
343 const auto & dict_ptr = context.getExternalDictionariesLoader().getDictionary(dict_name);
344 const auto & attr_name = function->arguments->children[1]->as<ASTLiteral &>().value.safeGet<String>();
345
346 if (!dict_ptr->isInjective(attr_name))
347 {
348 ++i;
349 continue;
350 }
351 }
352 else if (!injective_function_names.count(function->name))
353 {
354 ++i;
355 continue;
356 }
357
358 /// copy shared pointer to args in order to ensure lifetime
359 auto args_ast = function->arguments;
360
361 /** remove function call and take a step back to ensure
362 * next iteration does not skip not yet processed data
363 */
364 remove_expr_at_index(i);
365
366 /// copy non-literal arguments
367 std::remove_copy_if(
368 std::begin(args_ast->children), std::end(args_ast->children),
369 std::back_inserter(group_exprs), is_literal
370 );
371 }
372 else if (is_literal(group_exprs[i]))
373 {
374 remove_expr_at_index(i);
375 }
376 else
377 {
378 /// if neither a function nor literal - advance to next expression
379 ++i;
380 }
381 }
382
383 if (group_exprs.empty())
384 appendUnusedGroupByColumn(select_query, source_columns);
385}
386
387/// Remove duplicate items from ORDER BY.
388void optimizeOrderBy(const ASTSelectQuery * select_query)
389{
390 if (!select_query->orderBy())
391 return;
392
393 /// Make unique sorting conditions.
394 using NameAndLocale = std::pair<String, String>;
395 std::set<NameAndLocale> elems_set;
396
397 ASTs & elems = select_query->orderBy()->children;
398 ASTs unique_elems;
399 unique_elems.reserve(elems.size());
400
401 for (const auto & elem : elems)
402 {
403 String name = elem->children.front()->getColumnName();
404 const auto & order_by_elem = elem->as<ASTOrderByElement &>();
405
406 if (elems_set.emplace(name, order_by_elem.collation ? order_by_elem.collation->getColumnName() : "").second)
407 unique_elems.emplace_back(elem);
408 }
409
410 if (unique_elems.size() < elems.size())
411 elems = std::move(unique_elems);
412}
413
414/// Remove duplicate items from LIMIT BY.
415void optimizeLimitBy(const ASTSelectQuery * select_query)
416{
417 if (!select_query->limitBy())
418 return;
419
420 std::set<String> elems_set;
421
422 ASTs & elems = select_query->limitBy()->children;
423 ASTs unique_elems;
424 unique_elems.reserve(elems.size());
425
426 for (const auto & elem : elems)
427 {
428 if (elems_set.emplace(elem->getColumnName()).second)
429 unique_elems.emplace_back(elem);
430 }
431
432 if (unique_elems.size() < elems.size())
433 elems = std::move(unique_elems);
434}
435
436/// Remove duplicated columns from USING(...).
437void optimizeUsing(const ASTSelectQuery * select_query)
438{
439 if (!select_query->join())
440 return;
441
442 const auto * table_join = select_query->join()->table_join->as<ASTTableJoin>();
443 if (!(table_join && table_join->using_expression_list))
444 return;
445
446 ASTs & expression_list = table_join->using_expression_list->children;
447 ASTs uniq_expressions_list;
448
449 std::set<String> expressions_names;
450
451 for (const auto & expression : expression_list)
452 {
453 auto expression_name = expression->getAliasOrColumnName();
454 if (expressions_names.find(expression_name) == expressions_names.end())
455 {
456 uniq_expressions_list.push_back(expression);
457 expressions_names.insert(expression_name);
458 }
459 }
460
461 if (uniq_expressions_list.size() < expression_list.size())
462 expression_list = uniq_expressions_list;
463}
464
465void getArrayJoinedColumns(ASTPtr & query, SyntaxAnalyzerResult & result, const ASTSelectQuery * select_query,
466 const NamesAndTypesList & source_columns, const NameSet & source_columns_set)
467{
468 if (ASTPtr array_join_expression_list = select_query->array_join_expression_list())
469 {
470 ArrayJoinedColumnsVisitor::Data visitor_data{result.aliases,
471 result.array_join_name_to_alias,
472 result.array_join_alias_to_name,
473 result.array_join_result_to_source};
474 ArrayJoinedColumnsVisitor(visitor_data).visit(query);
475
476 /// If the result of ARRAY JOIN is not used, it is necessary to ARRAY-JOIN any column,
477 /// to get the correct number of rows.
478 if (result.array_join_result_to_source.empty())
479 {
480 ASTPtr expr = select_query->array_join_expression_list()->children.at(0);
481 String source_name = expr->getColumnName();
482 String result_name = expr->getAliasOrColumnName();
483
484 /// This is an array.
485 if (!expr->as<ASTIdentifier>() || source_columns_set.count(source_name))
486 {
487 result.array_join_result_to_source[result_name] = source_name;
488 }
489 else /// This is a nested table.
490 {
491 bool found = false;
492 for (const auto & column : source_columns)
493 {
494 auto splitted = Nested::splitName(column.name);
495 if (splitted.first == source_name && !splitted.second.empty())
496 {
497 result.array_join_result_to_source[Nested::concatenateName(result_name, splitted.second)] = column.name;
498 found = true;
499 break;
500 }
501 }
502 if (!found)
503 throw Exception("No columns in nested table " + source_name, ErrorCodes::EMPTY_NESTED_TABLE);
504 }
505 }
506 }
507}
508
509void setJoinStrictness(ASTSelectQuery & select_query, JoinStrictness join_default_strictness, bool old_any, ASTTableJoin & out_table_join)
510{
511 const ASTTablesInSelectQueryElement * node = select_query.join();
512 if (!node)
513 return;
514
515 auto & table_join = const_cast<ASTTablesInSelectQueryElement *>(node)->table_join->as<ASTTableJoin &>();
516
517 if (table_join.strictness == ASTTableJoin::Strictness::Unspecified &&
518 table_join.kind != ASTTableJoin::Kind::Cross)
519 {
520 if (join_default_strictness == JoinStrictness::ANY)
521 table_join.strictness = ASTTableJoin::Strictness::Any;
522 else if (join_default_strictness == JoinStrictness::ALL)
523 table_join.strictness = ASTTableJoin::Strictness::All;
524 else
525 throw Exception("Expected ANY or ALL in JOIN section, because setting (join_default_strictness) is empty",
526 DB::ErrorCodes::EXPECTED_ALL_OR_ANY);
527 }
528
529 if (old_any && table_join.strictness == ASTTableJoin::Strictness::Any)
530 table_join.strictness = ASTTableJoin::Strictness::RightAny;
531
532 out_table_join = table_join;
533}
534
535/// Find the columns that are obtained by JOIN.
536void collectJoinedColumns(AnalyzedJoin & analyzed_join, const ASTSelectQuery & select_query,
537 const std::vector<TableWithColumnNames> & tables, const Aliases & aliases)
538{
539 const ASTTablesInSelectQueryElement * node = select_query.join();
540 if (!node)
541 return;
542
543 const auto & table_join = node->table_join->as<ASTTableJoin &>();
544
545 if (table_join.using_expression_list)
546 {
547 const auto & keys = table_join.using_expression_list->as<ASTExpressionList &>();
548 for (const auto & key : keys.children)
549 analyzed_join.addUsingKey(key);
550 }
551 else if (table_join.on_expression)
552 {
553 bool is_asof = (table_join.strictness == ASTTableJoin::Strictness::Asof);
554
555 CollectJoinOnKeysVisitor::Data data{analyzed_join, tables[0], tables[1], aliases, is_asof};
556 CollectJoinOnKeysVisitor(data).visit(table_join.on_expression);
557 if (!data.has_some)
558 throw Exception("Cannot get JOIN keys from JOIN ON section: " + queryToString(table_join.on_expression),
559 ErrorCodes::INVALID_JOIN_ON_EXPRESSION);
560 if (is_asof)
561 data.asofToJoinKeys();
562 }
563}
564
565void replaceJoinedTable(const ASTTablesInSelectQueryElement * join)
566{
567 if (!join || !join->table_expression)
568 return;
569
570 /// TODO: Push down for CROSS JOIN is not OK [disabled]
571 const auto & table_join = join->table_join->as<ASTTableJoin &>();
572 if (table_join.kind == ASTTableJoin::Kind::Cross)
573 return;
574
575 auto & table_expr = join->table_expression->as<ASTTableExpression &>();
576 if (table_expr.database_and_table_name)
577 {
578 const auto & table_id = table_expr.database_and_table_name->as<ASTIdentifier &>();
579 String expr = "(select * from " + table_id.name + ") as " + table_id.shortName();
580
581 // FIXME: since the expression "a as b" exposes both "a" and "b" names, which is not equivalent to "(select * from a) as b",
582 // we can't replace aliased tables.
583 // FIXME: long table names include database name, which we can't save within alias.
584 if (table_id.alias.empty() && table_id.isShort())
585 {
586 ParserTableExpression parser;
587 table_expr = parseQuery(parser, expr, 0)->as<ASTTableExpression &>();
588 }
589 }
590}
591
592void checkJoin(const ASTTablesInSelectQueryElement * join)
593{
594 if (!join->table_join)
595 return;
596
597 const auto & table_join = join->table_join->as<ASTTableJoin &>();
598
599 if (table_join.strictness == ASTTableJoin::Strictness::Any)
600 if (table_join.kind == ASTTableJoin::Kind::Full)
601 throw Exception("ANY FULL JOINs are not implemented.", ErrorCodes::NOT_IMPLEMENTED);
602}
603
604std::vector<const ASTFunction *> getAggregates(const ASTPtr & query)
605{
606 if (const auto * select_query = query->as<ASTSelectQuery>())
607 {
608 /// There can not be aggregate functions inside the WHERE and PREWHERE.
609 if (select_query->where())
610 assertNoAggregates(select_query->where(), "in WHERE");
611 if (select_query->prewhere())
612 assertNoAggregates(select_query->prewhere(), "in PREWHERE");
613
614 GetAggregatesVisitor::Data data;
615 GetAggregatesVisitor(data).visit(query);
616
617 /// There can not be other aggregate functions within the aggregate functions.
618 for (const ASTFunction * node : data.aggregates)
619 for (auto & arg : node->arguments->children)
620 assertNoAggregates(arg, "inside another aggregate function");
621 return data.aggregates;
622 }
623 else
624 assertNoAggregates(query, "in wrong place");
625 return {};
626}
627
628}
629
/// Calculate which columns are required to execute the expression.
/// Then, delete all other columns from the list of available columns.
/// After execution, required_source_columns will only contain the list of columns needed to read from the table,
/// while source_columns is restored to the list it held on entry (see the swap at the end).
///
/// `additional_source_columns` are appended to source_columns (duplicates removed) before the analysis.
/// Throws UNKNOWN_IDENTIFIER if a required column is found neither in source_columns, nor among the columns
/// obtained by JOIN, nor in the storage.
void SyntaxAnalyzerResult::collectUsedColumns(const ASTPtr & query, const NamesAndTypesList & additional_source_columns)
{
    /// We calculate required_source_columns by modifying source_columns in place and swap the two on exit.
    /// The copy saved here is what source_columns gets back after the swap.
    required_source_columns = source_columns;

    if (!additional_source_columns.empty())
    {
        source_columns.insert(source_columns.end(), additional_source_columns.begin(), additional_source_columns.end());
        removeDuplicateColumns(source_columns);
    }

    RequiredSourceColumnsVisitor::Data columns_context;
    RequiredSourceColumnsVisitor(columns_context).visit(query);

    /// Names of all available source columns; only used for the error message below,
    /// because source_columns itself is pruned before the message is built.
    NameSet source_column_names;
    for (const auto & column : source_columns)
        source_column_names.insert(column.name);

    NameSet required = columns_context.requiredColumns();

    if (columns_context.has_table_join)
    {
        NameSet avaliable_columns;
        for (const auto & name : source_columns)
            avaliable_columns.insert(name.name);

        /// Add columns obtained by JOIN (if needed).
        for (const auto & joined_column : analyzed_join->columnsFromJoinedTable())
        {
            auto & name = joined_column.name;
            /// A column with this name exists in the source columns - it is not taken from the joined table.
            if (avaliable_columns.count(name))
                continue;

            if (required.count(name))
            {
                /// Optimisation: do not add columns needed only in JOIN ON section.
                if (columns_context.nameInclusion(name) > analyzed_join->rightKeyInclusion(name))
                    analyzed_join->addJoinedColumn(joined_column);

                /// The column comes from the joined table, so it is not required from the source.
                required.erase(name);
            }
        }
    }

    NameSet array_join_sources;
    if (columns_context.has_array_join)
    {
        /// Insert the columns required for the ARRAY JOIN calculation into the required columns list.
        for (const auto & result_source : array_join_result_to_source)
            array_join_sources.insert(result_source.second);

        for (const auto & column_name_type : source_columns)
            if (array_join_sources.count(column_name_type.name))
                required.insert(column_name_type.name);
    }

    const auto * select_query = query->as<ASTSelectQuery>();

    /// You need to read at least one column to find the number of rows.
    if (select_query && required.empty())
    {
        maybe_optimize_trivial_count = true;
        /// We will find a column with minimum <compressed_size, type_size, uncompressed_size>.
        /// Because it is the column that is cheapest to read.
        struct ColumnSizeTuple
        {
            size_t compressed_size;
            size_t type_size;
            size_t uncompressed_size;
            String name;
            /// The name does not take part in the comparison.
            bool operator<(const ColumnSizeTuple & that) const
            {
                return std::tie(compressed_size, type_size, uncompressed_size)
                    < std::tie(that.compressed_size, that.type_size, that.uncompressed_size);
            }
        };
        std::vector<ColumnSizeTuple> columns;
        if (storage)
        {
            auto column_sizes = storage->getColumnSizes();
            for (auto & source_column : source_columns)
            {
                auto c = column_sizes.find(source_column.name);
                /// Columns without size information do not take part in the choice.
                if (c == column_sizes.end())
                    continue;
                /// 100 is the value used for types that report no maximum size of value.
                size_t type_size = source_column.type->haveMaximumSizeOfValue() ? source_column.type->getMaximumSizeOfValueInMemory() : 100;
                columns.emplace_back(ColumnSizeTuple{c->second.data_compressed, type_size, c->second.data_uncompressed, source_column.name});
            }
        }
        if (columns.size())
            required.insert(std::min_element(columns.begin(), columns.end())->name);
        else
            /// If we have no information about columns sizes, choose a column of minimum size of its data type.
            required.insert(ExpressionActions::getSmallestColumn(source_columns));
    }

    /// Starts as the full set of required names; every name found in source_columns is removed from it.
    NameSet unknown_required_source_columns = required;

    /// Prune source_columns down to the required ones.
    for (NamesAndTypesList::iterator it = source_columns.begin(); it != source_columns.end();)
    {
        const String & column_name = it->name;
        unknown_required_source_columns.erase(column_name);

        if (!required.count(column_name))
            source_columns.erase(it++);
        else
            ++it;
    }

    /// If there are virtual columns among the unknown columns. Remove them from the list of unknown and add
    /// in columns list, so that when further processing they are also considered.
    if (storage)
    {
        for (auto it = unknown_required_source_columns.begin(); it != unknown_required_source_columns.end();)
        {
            if (storage->hasColumn(*it))
            {
                source_columns.push_back(storage->getColumn(*it));
                unknown_required_source_columns.erase(it++);
            }
            else
                ++it;
        }
    }

    /// Anything still unknown is an error; build a message listing everything that was considered.
    if (!unknown_required_source_columns.empty())
    {
        std::stringstream ss;
        ss << "Missing columns:";
        for (const auto & name : unknown_required_source_columns)
            ss << " '" << name << "'";
        ss << " while processing query: '" << queryToString(query) << "'";

        ss << ", required columns:";
        for (const auto & name : columns_context.requiredColumns())
            ss << " '" << name << "'";

        if (!source_column_names.empty())
        {
            ss << ", source columns:";
            for (const auto & name : source_column_names)
                ss << " '" << name << "'";
        }
        else
            ss << ", no source columns";

        if (columns_context.has_table_join)
        {
            ss << ", joined columns:";
            for (const auto & column : analyzed_join->columnsFromJoinedTable())
                ss << " '" << column.name << "'";
        }

        if (!array_join_sources.empty())
        {
            ss << ", arrayJoin columns:";
            for (const auto & name : array_join_sources)
                ss << " '" << name << "'";
        }

        throw Exception(ss.str(), ErrorCodes::UNKNOWN_IDENTIFIER);
    }

    /// required_source_columns receives the pruned list; source_columns gets back the list saved on entry.
    required_source_columns.swap(source_columns);
}
798
799
800SyntaxAnalyzerResultPtr SyntaxAnalyzer::analyze(
801 ASTPtr & query,
802 const NamesAndTypesList & source_columns_,
803 const Names & required_result_columns,
804 StoragePtr storage,
805 const NamesAndTypesList & additional_source_columns) const
806{
807 auto * select_query = query->as<ASTSelectQuery>();
808 if (!storage && select_query)
809 {
810 if (auto db_and_table = getDatabaseAndTable(*select_query, 0))
811 storage = context.tryGetTable(db_and_table->database, db_and_table->table);
812 }
813
814 const auto & settings = context.getSettingsRef();
815
816 SyntaxAnalyzerResult result;
817 result.storage = storage;
818 result.source_columns = source_columns_;
819 result.analyzed_join = std::make_shared<AnalyzedJoin>(settings, context.getTemporaryPath()); /// TODO: move to select_query logic
820
821 if (storage)
822 collectSourceColumns(storage->getColumns(), result.source_columns, (select_query != nullptr));
823 NameSet source_columns_set = removeDuplicateColumns(result.source_columns);
824 std::vector<TableWithColumnNames> tables_with_columns;
825
826 if (select_query)
827 {
828 if (remove_duplicates)
829 renameDuplicatedColumns(select_query);
830
831 const ASTTablesInSelectQueryElement * table_join_node = select_query->join();
832 if (table_join_node)
833 {
834 if (!settings.any_join_distinct_right_table_keys)
835 checkJoin(table_join_node);
836
837 if (settings.enable_optimize_predicate_expression)
838 replaceJoinedTable(table_join_node);
839 }
840
841 std::vector<const ASTTableExpression *> table_expressions = getTableExpressions(*select_query);
842 tables_with_columns = getTablesWithColumns(table_expressions, context);
843
844 if (tables_with_columns.empty())
845 {
846 if (storage)
847 {
848 const ColumnsDescription & starage_columns = storage->getColumns();
849 tables_with_columns.emplace_back(DatabaseAndTableWithAlias{}, starage_columns.getOrdinary().getNames());
850 auto & table = tables_with_columns.back();
851 table.addHiddenColumns(starage_columns.getMaterialized());
852 table.addHiddenColumns(starage_columns.getAliases());
853 table.addHiddenColumns(starage_columns.getVirtuals());
854 }
855 else
856 {
857 Names columns;
858 columns.reserve(result.source_columns.size());
859 for (const auto & column : result.source_columns)
860 columns.push_back(column.name);
861 tables_with_columns.emplace_back(DatabaseAndTableWithAlias{}, columns);
862 }
863 }
864
865 if (table_expressions.size() > 1)
866 {
867 result.analyzed_join->columns_from_joined_table = getColumnsFromTableExpression(*table_expressions[1], context);
868 result.analyzed_join->deduplicateAndQualifyColumnNames(
869 source_columns_set, tables_with_columns[1].table.getQualifiedNamePrefix());
870 }
871
872 translateQualifiedNames(query, *select_query, source_columns_set, std::move(tables_with_columns));
873
874 /// Rewrite IN and/or JOIN for distributed tables according to distributed_product_mode setting.
875 InJoinSubqueriesPreprocessor(context).visit(query);
876
877 /// Optimizes logical expressions.
878 LogicalExpressionsOptimizer(select_query, settings.optimize_min_equality_disjunction_chain_length.value).perform();
879 }
880
881 {
882 CustomizeFunctionsVisitor::Data data{settings.count_distinct_implementation};
883 CustomizeFunctionsVisitor(data).visit(query);
884 }
885
886 /// Creates a dictionary `aliases`: alias -> ASTPtr
887 {
888 LogAST log;
889 QueryAliasesVisitor::Data query_aliases_data{result.aliases};
890 QueryAliasesVisitor(query_aliases_data, log.stream()).visit(query);
891 }
892
893 /// Mark table ASTIdentifiers with not a column marker
894 {
895 MarkTableIdentifiersVisitor::Data data{result.aliases};
896 MarkTableIdentifiersVisitor(data).visit(query);
897 }
898
899 /// Common subexpression elimination. Rewrite rules.
900 {
901 QueryNormalizer::Data normalizer_data(result.aliases, context.getSettingsRef());
902 QueryNormalizer(normalizer_data).visit(query);
903 }
904
905 /// Remove unneeded columns according to 'required_result_columns'.
906 /// Leave all selected columns in case of DISTINCT; columns that contain arrayJoin function inside.
907 /// Must be after 'normalizeTree' (after expanding aliases, for aliases not get lost)
908 /// and before 'executeScalarSubqueries', 'analyzeAggregation', etc. to avoid excessive calculations.
909 if (select_query)
910 removeUnneededColumnsFromSelectClause(select_query, required_result_columns, remove_duplicates);
911
912 /// Executing scalar subqueries - replacing them with constant values.
913 executeScalarSubqueries(query, context, subquery_depth, result.scalars);
914
915 /// Optimize if with constant condition after constants was substituted instead of scalar subqueries.
916 OptimizeIfWithConstantConditionVisitor(result.aliases).visit(query);
917
918 if (settings.optimize_if_chain_to_miltiif)
919 OptimizeIfChainsVisitor().visit(query);
920
921 if (select_query)
922 {
923 /// GROUP BY injective function elimination.
924 optimizeGroupBy(select_query, source_columns_set, context);
925
926 /// Remove duplicate items from ORDER BY.
927 optimizeOrderBy(select_query);
928
929 /// Remove duplicated elements from LIMIT BY clause.
930 optimizeLimitBy(select_query);
931
932 /// Remove duplicated columns from USING(...).
933 optimizeUsing(select_query);
934
935 /// array_join_alias_to_name, array_join_result_to_source.
936 getArrayJoinedColumns(query, result, select_query, result.source_columns, source_columns_set);
937
938 /// Push the predicate expression down to the subqueries.
939 result.rewrite_subqueries = PredicateExpressionsOptimizer(select_query, settings, context).optimize();
940
941 setJoinStrictness(*select_query, settings.join_default_strictness, settings.any_join_distinct_right_table_keys,
942 result.analyzed_join->table_join);
943 collectJoinedColumns(*result.analyzed_join, *select_query, tables_with_columns, result.aliases);
944 }
945
946 result.aggregates = getAggregates(query);
947 result.collectUsedColumns(query, additional_source_columns);
948 return std::make_shared<const SyntaxAnalyzerResult>(result);
949}
950
951}
952