| 1 | #include <Storages/MergeTree/MergeTreeWhereOptimizer.h> |
| 2 | #include <Storages/MergeTree/MergeTreeData.h> |
| 3 | #include <Storages/MergeTree/KeyCondition.h> |
| 4 | #include <Interpreters/IdentifierSemantic.h> |
| 5 | #include <Parsers/ASTSelectQuery.h> |
| 6 | #include <Parsers/ASTFunction.h> |
| 7 | #include <Parsers/ASTIdentifier.h> |
| 8 | #include <Parsers/ASTLiteral.h> |
| 9 | #include <Parsers/ASTExpressionList.h> |
| 10 | #include <Parsers/ASTSubquery.h> |
| 11 | #include <Parsers/formatAST.h> |
| 12 | #include <Interpreters/misc.h> |
| 13 | #include <Common/typeid_cast.h> |
| 14 | #include <DataTypes/NestedUtils.h> |
| 15 | #include <ext/map.h> |
| 16 | |
| 17 | |
| 18 | namespace DB |
| 19 | { |
| 20 | |
| 21 | namespace ErrorCodes |
| 22 | { |
| 23 | extern const int LOGICAL_ERROR; |
| 24 | } |
| 25 | |
| 26 | /// Conditions like "x = N" are considered good if abs(N) > threshold. |
| 27 | /// This is used to assume that condition is likely to have good selectivity. |
| 28 | static constexpr auto threshold = 2; |
| 29 | |
| 30 | |
| 31 | MergeTreeWhereOptimizer::MergeTreeWhereOptimizer( |
| 32 | SelectQueryInfo & query_info, |
| 33 | const Context & context, |
| 34 | const MergeTreeData & data, |
| 35 | const Names & queried_columns_, |
| 36 | Logger * log_) |
| 37 | : table_columns{ext::map<std::unordered_set>(data.getColumns().getAllPhysical(), |
| 38 | [] (const NameAndTypePair & col) { return col.name; })}, |
| 39 | queried_columns{queried_columns_}, |
| 40 | block_with_constants{KeyCondition::getBlockWithConstants(query_info.query, query_info.syntax_analyzer_result, context)}, |
| 41 | log{log_} |
| 42 | { |
| 43 | if (!data.primary_key_columns.empty()) |
| 44 | first_primary_key_column = data.primary_key_columns[0]; |
| 45 | |
| 46 | calculateColumnSizes(data, queried_columns); |
| 47 | determineArrayJoinedNames(query_info.query->as<ASTSelectQuery &>()); |
| 48 | optimize(query_info.query->as<ASTSelectQuery &>()); |
| 49 | } |
| 50 | |
| 51 | |
| 52 | void MergeTreeWhereOptimizer::calculateColumnSizes(const MergeTreeData & data, const Names & column_names) |
| 53 | { |
| 54 | for (const auto & column_name : column_names) |
| 55 | { |
| 56 | UInt64 size = data.getColumnCompressedSize(column_name); |
| 57 | column_sizes[column_name] = size; |
| 58 | total_size_of_queried_columns += size; |
| 59 | } |
| 60 | } |
| 61 | |
| 62 | |
| 63 | static void collectIdentifiersNoSubqueries(const ASTPtr & ast, NameSet & set) |
| 64 | { |
| 65 | if (auto opt_name = tryGetIdentifierName(ast)) |
| 66 | return (void)set.insert(*opt_name); |
| 67 | |
| 68 | if (ast->as<ASTSubquery>()) |
| 69 | return; |
| 70 | |
| 71 | for (const auto & child : ast->children) |
| 72 | collectIdentifiersNoSubqueries(child, set); |
| 73 | } |
| 74 | |
| 75 | void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node) const |
| 76 | { |
| 77 | if (const auto * func_and = node->as<ASTFunction>(); func_and && func_and->name == "and" ) |
| 78 | { |
| 79 | for (const auto & elem : func_and->arguments->children) |
| 80 | analyzeImpl(res, elem); |
| 81 | } |
| 82 | else |
| 83 | { |
| 84 | Condition cond; |
| 85 | cond.node = node; |
| 86 | |
| 87 | collectIdentifiersNoSubqueries(node, cond.identifiers); |
| 88 | |
| 89 | cond.viable = |
| 90 | /// Condition depend on some column. Constant expressions are not moved. |
| 91 | !cond.identifiers.empty() |
| 92 | && !cannotBeMoved(node) |
| 93 | /// Do not take into consideration the conditions consisting only of the first primary key column |
| 94 | && !hasPrimaryKeyAtoms(node) |
| 95 | /// Only table columns are considered. Not array joined columns. NOTE We're assuming that aliases was expanded. |
| 96 | && isSubsetOfTableColumns(cond.identifiers) |
| 97 | /// Do not move conditions involving all queried columns. |
| 98 | && cond.identifiers.size() < queried_columns.size(); |
| 99 | |
| 100 | if (cond.viable) |
| 101 | { |
| 102 | cond.columns_size = getIdentifiersColumnSize(cond.identifiers); |
| 103 | cond.good = isConditionGood(node); |
| 104 | } |
| 105 | |
| 106 | res.emplace_back(std::move(cond)); |
| 107 | } |
| 108 | } |
| 109 | |
| 110 | /// Transform conjunctions chain in WHERE expression to Conditions list. |
| 111 | MergeTreeWhereOptimizer::Conditions MergeTreeWhereOptimizer::analyze(const ASTPtr & expression) const |
| 112 | { |
| 113 | Conditions res; |
| 114 | analyzeImpl(res, expression); |
| 115 | return res; |
| 116 | } |
| 117 | |
| 118 | /// Transform Conditions list to WHERE or PREWHERE expression. |
| 119 | ASTPtr MergeTreeWhereOptimizer::reconstruct(const Conditions & conditions) const |
| 120 | { |
| 121 | if (conditions.empty()) |
| 122 | return {}; |
| 123 | |
| 124 | if (conditions.size() == 1) |
| 125 | return conditions.front().node; |
| 126 | |
| 127 | const auto function = std::make_shared<ASTFunction>(); |
| 128 | |
| 129 | function->name = "and" ; |
| 130 | function->arguments = std::make_shared<ASTExpressionList>(); |
| 131 | function->children.push_back(function->arguments); |
| 132 | |
| 133 | for (const auto & elem : conditions) |
| 134 | function->arguments->children.push_back(elem.node); |
| 135 | |
| 136 | return function; |
| 137 | } |
| 138 | |
| 139 | |
| 140 | void MergeTreeWhereOptimizer::optimize(ASTSelectQuery & select) const |
| 141 | { |
| 142 | if (!select.where() || select.prewhere()) |
| 143 | return; |
| 144 | |
| 145 | Conditions where_conditions = analyze(select.where()); |
| 146 | Conditions prewhere_conditions; |
| 147 | |
| 148 | UInt64 total_size_of_moved_conditions = 0; |
| 149 | |
| 150 | /// Move condition and all other conditions depend on the same set of columns. |
| 151 | auto move_condition = [&](Conditions::iterator cond_it) |
| 152 | { |
| 153 | prewhere_conditions.splice(prewhere_conditions.end(), where_conditions, cond_it); |
| 154 | total_size_of_moved_conditions += cond_it->columns_size; |
| 155 | |
| 156 | /// Move all other conditions that depend on the same set of columns. |
| 157 | for (auto jt = where_conditions.begin(); jt != where_conditions.end();) |
| 158 | { |
| 159 | if (jt->columns_size == cond_it->columns_size && jt->identifiers == cond_it->identifiers) |
| 160 | prewhere_conditions.splice(prewhere_conditions.end(), where_conditions, jt++); |
| 161 | else |
| 162 | ++jt; |
| 163 | } |
| 164 | }; |
| 165 | |
| 166 | /// Move conditions unless the ratio of total_size_of_moved_conditions to the total_size_of_queried_columns is less than some threshold. |
| 167 | while (!where_conditions.empty()) |
| 168 | { |
| 169 | /// Move the best condition to PREWHERE if it is viable. |
| 170 | |
| 171 | auto it = std::min_element(where_conditions.begin(), where_conditions.end()); |
| 172 | |
| 173 | if (!it->viable) |
| 174 | break; |
| 175 | |
| 176 | /// 10% ratio is just a guess. |
| 177 | if (total_size_of_moved_conditions > 0 && (total_size_of_moved_conditions + it->columns_size) * 10 > total_size_of_queried_columns) |
| 178 | break; |
| 179 | |
| 180 | move_condition(it); |
| 181 | } |
| 182 | |
| 183 | /// Nothing was moved. |
| 184 | if (prewhere_conditions.empty()) |
| 185 | return; |
| 186 | |
| 187 | /// Rewrite the SELECT query. |
| 188 | |
| 189 | select.setExpression(ASTSelectQuery::Expression::WHERE, reconstruct(where_conditions)); |
| 190 | select.setExpression(ASTSelectQuery::Expression::PREWHERE, reconstruct(prewhere_conditions)); |
| 191 | |
| 192 | LOG_DEBUG(log, "MergeTreeWhereOptimizer: condition \"" << select.prewhere() << "\" moved to PREWHERE" ); |
| 193 | } |
| 194 | |
| 195 | |
| 196 | UInt64 MergeTreeWhereOptimizer::getIdentifiersColumnSize(const NameSet & identifiers) const |
| 197 | { |
| 198 | UInt64 size = 0; |
| 199 | |
| 200 | for (const auto & identifier : identifiers) |
| 201 | if (column_sizes.count(identifier)) |
| 202 | size += column_sizes.at(identifier); |
| 203 | |
| 204 | return size; |
| 205 | } |
| 206 | |
| 207 | |
| 208 | bool MergeTreeWhereOptimizer::isConditionGood(const ASTPtr & condition) const |
| 209 | { |
| 210 | const auto * function = condition->as<ASTFunction>(); |
| 211 | if (!function) |
| 212 | return false; |
| 213 | |
| 214 | /** we are only considering conditions of form `equals(one, another)` or `one = another`, |
| 215 | * especially if either `one` or `another` is ASTIdentifier */ |
| 216 | if (function->name != "equals" ) |
| 217 | return false; |
| 218 | |
| 219 | auto left_arg = function->arguments->children.front().get(); |
| 220 | auto right_arg = function->arguments->children.back().get(); |
| 221 | |
| 222 | /// try to ensure left_arg points to ASTIdentifier |
| 223 | if (!left_arg->as<ASTIdentifier>() && right_arg->as<ASTIdentifier>()) |
| 224 | std::swap(left_arg, right_arg); |
| 225 | |
| 226 | if (left_arg->as<ASTIdentifier>()) |
| 227 | { |
| 228 | /// condition may be "good" if only right_arg is a constant and its value is outside the threshold |
| 229 | if (const auto * literal = right_arg->as<ASTLiteral>()) |
| 230 | { |
| 231 | const auto & field = literal->value; |
| 232 | const auto type = field.getType(); |
| 233 | |
| 234 | /// check the value with respect to threshold |
| 235 | if (type == Field::Types::UInt64) |
| 236 | { |
| 237 | const auto value = field.get<UInt64>(); |
| 238 | return value > threshold; |
| 239 | } |
| 240 | else if (type == Field::Types::Int64) |
| 241 | { |
| 242 | const auto value = field.get<Int64>(); |
| 243 | return value < -threshold || threshold < value; |
| 244 | } |
| 245 | else if (type == Field::Types::Float64) |
| 246 | { |
| 247 | const auto value = field.get<Float64>(); |
| 248 | return value < threshold || threshold < value; |
| 249 | } |
| 250 | } |
| 251 | } |
| 252 | |
| 253 | return false; |
| 254 | } |
| 255 | |
| 256 | |
| 257 | bool MergeTreeWhereOptimizer::hasPrimaryKeyAtoms(const ASTPtr & ast) const |
| 258 | { |
| 259 | if (const auto * func = ast->as<ASTFunction>()) |
| 260 | { |
| 261 | const auto & args = func->arguments->children; |
| 262 | |
| 263 | if ((func->name == "not" && 1 == args.size()) || func->name == "and" || func->name == "or" ) |
| 264 | { |
| 265 | for (const auto & arg : args) |
| 266 | if (hasPrimaryKeyAtoms(arg)) |
| 267 | return true; |
| 268 | |
| 269 | return false; |
| 270 | } |
| 271 | } |
| 272 | |
| 273 | return isPrimaryKeyAtom(ast); |
| 274 | } |
| 275 | |
| 276 | |
| 277 | bool MergeTreeWhereOptimizer::isPrimaryKeyAtom(const ASTPtr & ast) const |
| 278 | { |
| 279 | if (const auto * func = ast->as<ASTFunction>()) |
| 280 | { |
| 281 | if (!KeyCondition::atom_map.count(func->name)) |
| 282 | return false; |
| 283 | |
| 284 | const auto & args = func->arguments->children; |
| 285 | if (args.size() != 2) |
| 286 | return false; |
| 287 | |
| 288 | const auto & first_arg_name = args.front()->getColumnName(); |
| 289 | const auto & second_arg_name = args.back()->getColumnName(); |
| 290 | |
| 291 | if ((first_primary_key_column == first_arg_name && isConstant(args[1])) |
| 292 | || (first_primary_key_column == second_arg_name && isConstant(args[0])) |
| 293 | || (first_primary_key_column == first_arg_name && functionIsInOrGlobalInOperator(func->name))) |
| 294 | return true; |
| 295 | } |
| 296 | |
| 297 | return false; |
| 298 | } |
| 299 | |
| 300 | |
| 301 | bool MergeTreeWhereOptimizer::isConstant(const ASTPtr & expr) const |
| 302 | { |
| 303 | const auto column_name = expr->getColumnName(); |
| 304 | |
| 305 | if (expr->as<ASTLiteral>() |
| 306 | || (block_with_constants.has(column_name) && isColumnConst(*block_with_constants.getByName(column_name).column))) |
| 307 | return true; |
| 308 | |
| 309 | return false; |
| 310 | } |
| 311 | |
| 312 | |
| 313 | bool MergeTreeWhereOptimizer::isSubsetOfTableColumns(const NameSet & identifiers) const |
| 314 | { |
| 315 | for (const auto & identifier : identifiers) |
| 316 | if (table_columns.count(identifier) == 0) |
| 317 | return false; |
| 318 | |
| 319 | return true; |
| 320 | } |
| 321 | |
| 322 | |
| 323 | bool MergeTreeWhereOptimizer::cannotBeMoved(const ASTPtr & ptr) const |
| 324 | { |
| 325 | if (const auto * function_ptr = ptr->as<ASTFunction>()) |
| 326 | { |
| 327 | /// disallow arrayJoin expressions to be moved to PREWHERE for now |
| 328 | if ("arrayJoin" == function_ptr->name) |
| 329 | return true; |
| 330 | |
| 331 | /// disallow GLOBAL IN, GLOBAL NOT IN |
| 332 | if ("globalIn" == function_ptr->name |
| 333 | || "globalNotIn" == function_ptr->name) |
| 334 | return true; |
| 335 | |
| 336 | /// indexHint is a special function that it does not make sense to transfer to PREWHERE |
| 337 | if ("indexHint" == function_ptr->name) |
| 338 | return true; |
| 339 | } |
| 340 | else if (auto opt_name = IdentifierSemantic::getColumnName(ptr)) |
| 341 | { |
| 342 | /// disallow moving result of ARRAY JOIN to PREWHERE |
| 343 | if (array_joined_names.count(*opt_name) || |
| 344 | array_joined_names.count(Nested::extractTableName(*opt_name))) |
| 345 | return true; |
| 346 | } |
| 347 | |
| 348 | for (const auto & child : ptr->children) |
| 349 | if (cannotBeMoved(child)) |
| 350 | return true; |
| 351 | |
| 352 | return false; |
| 353 | } |
| 354 | |
| 355 | |
| 356 | void MergeTreeWhereOptimizer::determineArrayJoinedNames(ASTSelectQuery & select) |
| 357 | { |
| 358 | auto array_join_expression_list = select.array_join_expression_list(); |
| 359 | |
| 360 | /// much simplified code from ExpressionAnalyzer::getArrayJoinedColumns() |
| 361 | if (!array_join_expression_list) |
| 362 | return; |
| 363 | |
| 364 | for (const auto & ast : array_join_expression_list->children) |
| 365 | array_joined_names.emplace(ast->getAliasOrColumnName()); |
| 366 | } |
| 367 | |
| 368 | } |
| 369 | |