| 1 | #include "duckdb/function/table/table_scan.hpp" |
| 2 | #include "duckdb/optimizer/join_order/join_node.hpp" |
| 3 | #include "duckdb/optimizer/join_order/join_order_optimizer.hpp" |
| 4 | #include "duckdb/planner/filter/conjunction_filter.hpp" |
| 5 | #include "duckdb/planner/filter/constant_filter.hpp" |
| 6 | #include "duckdb/planner/operator/logical_comparison_join.hpp" |
| 7 | #include "duckdb/planner/operator/logical_get.hpp" |
| 8 | #include "duckdb/storage/data_table.hpp" |
| 9 | #include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp" |
| 10 | |
| 11 | #include <cmath> |
| 12 | |
| 13 | namespace duckdb { |
| 14 | |
| 15 | static optional_ptr<TableCatalogEntry> GetCatalogTableEntry(LogicalOperator &op) { |
| 16 | D_ASSERT(op.type == LogicalOperatorType::LOGICAL_GET); |
| 17 | auto &get = op.Cast<LogicalGet>(); |
| 18 | return get.GetTable(); |
| 19 | } |
| 20 | |
| 21 | // The filter was made on top of a logical sample or other projection, |
| 22 | // but no specific columns are referenced. See issue 4978 number 4. |
| 23 | bool CardinalityEstimator::EmptyFilter(FilterInfo &filter_info) { |
| 24 | if (!filter_info.left_set && !filter_info.right_set) { |
| 25 | return true; |
| 26 | } |
| 27 | return false; |
| 28 | } |
| 29 | |
| 30 | void CardinalityEstimator::AddRelationTdom(FilterInfo &filter_info) { |
| 31 | D_ASSERT(filter_info.set.count >= 1); |
| 32 | for (const RelationsToTDom &r2tdom : relations_to_tdoms) { |
| 33 | auto &i_set = r2tdom.equivalent_relations; |
| 34 | if (i_set.find(x: filter_info.left_binding) != i_set.end()) { |
| 35 | // found an equivalent filter |
| 36 | return; |
| 37 | } |
| 38 | } |
| 39 | auto key = ColumnBinding(filter_info.left_binding.table_index, filter_info.left_binding.column_index); |
| 40 | relations_to_tdoms.emplace_back(args: column_binding_set_t({key})); |
| 41 | } |
| 42 | |
| 43 | bool CardinalityEstimator::SingleColumnFilter(FilterInfo &filter_info) { |
| 44 | if (filter_info.left_set && filter_info.right_set) { |
| 45 | // Both set |
| 46 | return false; |
| 47 | } |
| 48 | if (EmptyFilter(filter_info)) { |
| 49 | return false; |
| 50 | } |
| 51 | return true; |
| 52 | } |
| 53 | |
| 54 | vector<idx_t> CardinalityEstimator::DetermineMatchingEquivalentSets(FilterInfo *filter_info) { |
| 55 | vector<idx_t> matching_equivalent_sets; |
| 56 | auto equivalent_relation_index = 0; |
| 57 | |
| 58 | for (const RelationsToTDom &r2tdom : relations_to_tdoms) { |
| 59 | auto &i_set = r2tdom.equivalent_relations; |
| 60 | if (i_set.find(x: filter_info->left_binding) != i_set.end()) { |
| 61 | matching_equivalent_sets.push_back(x: equivalent_relation_index); |
| 62 | } else if (i_set.find(x: filter_info->right_binding) != i_set.end()) { |
| 63 | // don't add both left and right to the matching_equivalent_sets |
| 64 | // since both left and right get added to that index anyway. |
| 65 | matching_equivalent_sets.push_back(x: equivalent_relation_index); |
| 66 | } |
| 67 | equivalent_relation_index++; |
| 68 | } |
| 69 | return matching_equivalent_sets; |
| 70 | } |
| 71 | |
| 72 | void CardinalityEstimator::AddToEquivalenceSets(FilterInfo *filter_info, vector<idx_t> matching_equivalent_sets) { |
| 73 | D_ASSERT(matching_equivalent_sets.size() <= 2); |
| 74 | if (matching_equivalent_sets.size() > 1) { |
| 75 | // an equivalence relation is connecting to sets of equivalence relations |
| 76 | // so push all relations from the second set into the first. Later we will delete |
| 77 | // the second set. |
| 78 | for (ColumnBinding i : relations_to_tdoms.at(n: matching_equivalent_sets[1]).equivalent_relations) { |
| 79 | relations_to_tdoms.at(n: matching_equivalent_sets[0]).equivalent_relations.insert(x: i); |
| 80 | } |
| 81 | relations_to_tdoms.at(n: matching_equivalent_sets[1]).equivalent_relations.clear(); |
| 82 | relations_to_tdoms.at(n: matching_equivalent_sets[0]).filters.push_back(x: filter_info); |
| 83 | // add all values of one set to the other, delete the empty one |
| 84 | } else if (matching_equivalent_sets.size() == 1) { |
| 85 | auto &tdom_i = relations_to_tdoms.at(n: matching_equivalent_sets.at(n: 0)); |
| 86 | tdom_i.equivalent_relations.insert(x: filter_info->left_binding); |
| 87 | tdom_i.equivalent_relations.insert(x: filter_info->right_binding); |
| 88 | tdom_i.filters.push_back(x: filter_info); |
| 89 | } else if (matching_equivalent_sets.empty()) { |
| 90 | column_binding_set_t tmp; |
| 91 | tmp.insert(x: filter_info->left_binding); |
| 92 | tmp.insert(x: filter_info->right_binding); |
| 93 | relations_to_tdoms.emplace_back(args&: tmp); |
| 94 | relations_to_tdoms.back().filters.push_back(x: filter_info); |
| 95 | } |
| 96 | } |
| 97 | |
| 98 | void CardinalityEstimator::AddRelationToColumnMapping(ColumnBinding key, ColumnBinding value) { |
| 99 | relation_column_to_original_column[key] = value; |
| 100 | } |
| 101 | |
| 102 | void CardinalityEstimator::CopyRelationMap(column_binding_map_t<ColumnBinding> &child_binding_map) { |
| 103 | for (auto &binding_map : relation_column_to_original_column) { |
| 104 | D_ASSERT(child_binding_map.find(binding_map.first) == child_binding_map.end()); |
| 105 | child_binding_map[binding_map.first] = binding_map.second; |
| 106 | } |
| 107 | } |
| 108 | |
| 109 | void CardinalityEstimator::AddColumnToRelationMap(idx_t table_index, idx_t column_index) { |
| 110 | relation_attributes[table_index].columns.insert(x: column_index); |
| 111 | } |
| 112 | |
| 113 | void CardinalityEstimator::InitEquivalentRelations(vector<unique_ptr<FilterInfo>> &filter_infos) { |
| 114 | // For each filter, we fill keep track of the index of the equivalent relation set |
| 115 | // the left and right relation needs to be added to. |
| 116 | for (auto &filter : filter_infos) { |
| 117 | if (SingleColumnFilter(filter_info&: *filter)) { |
| 118 | // Filter on one relation, (i.e string or range filter on a column). |
| 119 | // Grab the first relation and add it to the equivalence_relations |
| 120 | AddRelationTdom(filter_info&: *filter); |
| 121 | continue; |
| 122 | } else if (EmptyFilter(filter_info&: *filter)) { |
| 123 | continue; |
| 124 | } |
| 125 | D_ASSERT(filter->left_set->count >= 1); |
| 126 | D_ASSERT(filter->right_set->count >= 1); |
| 127 | |
| 128 | auto matching_equivalent_sets = DetermineMatchingEquivalentSets(filter_info: filter.get()); |
| 129 | AddToEquivalenceSets(filter_info: filter.get(), matching_equivalent_sets); |
| 130 | } |
| 131 | } |
| 132 | |
| 133 | void CardinalityEstimator::VerifySymmetry(JoinNode &result, JoinNode &entry) { |
| 134 | if (result.GetCardinality<double>() != entry.GetCardinality<double>()) { |
| 135 | // Currently it's possible that some entries are cartesian joins. |
| 136 | // When this is the case, you don't always have symmetry, but |
| 137 | // if the cost of the result is less, then just assure the cardinality |
| 138 | // is also less, then you have the same effect of symmetry. |
| 139 | D_ASSERT(ceil(result.GetCardinality<double>()) <= ceil(entry.GetCardinality<double>()) || |
| 140 | floor(result.GetCardinality<double>()) <= floor(entry.GetCardinality<double>())); |
| 141 | } |
| 142 | } |
| 143 | |
| 144 | void CardinalityEstimator::InitTotalDomains() { |
| 145 | auto remove_start = std::remove_if(first: relations_to_tdoms.begin(), last: relations_to_tdoms.end(), |
| 146 | pred: [](RelationsToTDom &r_2_tdom) { return r_2_tdom.equivalent_relations.empty(); }); |
| 147 | relations_to_tdoms.erase(first: remove_start, last: relations_to_tdoms.end()); |
| 148 | } |
| 149 | |
| 150 | double CardinalityEstimator::ComputeCost(JoinNode &left, JoinNode &right, double expected_cardinality) { |
| 151 | return expected_cardinality + left.GetCost() + right.GetCost(); |
| 152 | } |
| 153 | |
| 154 | double CardinalityEstimator::EstimateCrossProduct(const JoinNode &left, const JoinNode &right) { |
| 155 | // need to explicity use double here, otherwise auto converts it to an int, then |
| 156 | // there is an autocast in the return. |
| 157 | if (left.GetCardinality<double>() >= (NumericLimits<double>::Maximum() / right.GetCardinality<double>())) { |
| 158 | return NumericLimits<double>::Maximum(); |
| 159 | } |
| 160 | return left.GetCardinality<double>() * right.GetCardinality<double>(); |
| 161 | } |
| 162 | |
| 163 | void CardinalityEstimator::AddRelationColumnMapping(LogicalGet &get, idx_t relation_id) { |
| 164 | for (idx_t it = 0; it < get.column_ids.size(); it++) { |
| 165 | auto key = ColumnBinding(relation_id, it); |
| 166 | auto value = ColumnBinding(get.table_index, get.column_ids[it]); |
| 167 | AddRelationToColumnMapping(key, value); |
| 168 | } |
| 169 | } |
| 170 | |
| 171 | void UpdateDenom(Subgraph2Denominator &relation_2_denom, RelationsToTDom &relation_to_tdom) { |
| 172 | relation_2_denom.denom *= relation_to_tdom.has_tdom_hll ? relation_to_tdom.tdom_hll : relation_to_tdom.tdom_no_hll; |
| 173 | } |
| 174 | |
| 175 | void FindSubgraphMatchAndMerge(Subgraph2Denominator &merge_to, idx_t find_me, |
| 176 | vector<Subgraph2Denominator>::iterator subgraph, |
| 177 | vector<Subgraph2Denominator>::iterator end) { |
| 178 | for (; subgraph != end; subgraph++) { |
| 179 | if (subgraph->relations.count(x: find_me) >= 1) { |
| 180 | for (auto &relation : subgraph->relations) { |
| 181 | merge_to.relations.insert(x: relation); |
| 182 | } |
| 183 | subgraph->relations.clear(); |
| 184 | merge_to.denom *= subgraph->denom; |
| 185 | return; |
| 186 | } |
| 187 | } |
| 188 | } |
| 189 | |
| 190 | double CardinalityEstimator::EstimateCardinalityWithSet(JoinRelationSet &new_set) { |
| 191 | double numerator = 1; |
| 192 | unordered_set<idx_t> actual_set; |
| 193 | for (idx_t i = 0; i < new_set.count; i++) { |
| 194 | numerator *= relation_attributes[new_set.relations[i]].cardinality; |
| 195 | actual_set.insert(x: new_set.relations[i]); |
| 196 | } |
| 197 | vector<Subgraph2Denominator> subgraphs; |
| 198 | bool done = false; |
| 199 | bool found_match = false; |
| 200 | |
| 201 | // Finding the denominator is tricky. You need to go through the tdoms in decreasing order |
| 202 | // Then loop through all filters in the equivalence set of the tdom to see if both the |
| 203 | // left and right relations are in the new set, if so you can use that filter. |
| 204 | // You must also make sure that the filters all relations in the given set, so we use subgraphs |
| 205 | // that should eventually merge into one connected graph that joins all the relations |
| 206 | // TODO: Implement a method to cache subgraphs so you don't have to build them up every |
| 207 | // time the cardinality of a new set is requested |
| 208 | |
| 209 | // relations_to_tdoms has already been sorted. |
| 210 | for (auto &relation_2_tdom : relations_to_tdoms) { |
| 211 | // loop through each filter in the tdom. |
| 212 | if (done) { |
| 213 | break; |
| 214 | } |
| 215 | for (auto &filter : relation_2_tdom.filters) { |
| 216 | if (actual_set.count(x: filter->left_binding.table_index) == 0 || |
| 217 | actual_set.count(x: filter->right_binding.table_index) == 0) { |
| 218 | continue; |
| 219 | } |
| 220 | // the join filter is on relations in the new set. |
| 221 | found_match = false; |
| 222 | vector<Subgraph2Denominator>::iterator it; |
| 223 | for (it = subgraphs.begin(); it != subgraphs.end(); it++) { |
| 224 | auto left_in = it->relations.count(x: filter->left_binding.table_index); |
| 225 | auto right_in = it->relations.count(x: filter->right_binding.table_index); |
| 226 | if (left_in && right_in) { |
| 227 | // if both left and right bindings are in the subgraph, continue. |
| 228 | // This means another filter is connecting relations already in the |
| 229 | // subgraph it, but it has a tdom that is less, and we don't care. |
| 230 | found_match = true; |
| 231 | continue; |
| 232 | } |
| 233 | if (!left_in && !right_in) { |
| 234 | // if both left and right bindings are *not* in the subgraph, continue |
| 235 | // without finding a match. This will trigger the process to add a new |
| 236 | // subgraph |
| 237 | continue; |
| 238 | } |
| 239 | idx_t find_table; |
| 240 | if (left_in) { |
| 241 | find_table = filter->right_binding.table_index; |
| 242 | } else { |
| 243 | D_ASSERT(right_in); |
| 244 | find_table = filter->left_binding.table_index; |
| 245 | } |
| 246 | auto next_subgraph = it + 1; |
| 247 | // iterate through other subgraphs and merge. |
| 248 | FindSubgraphMatchAndMerge(merge_to&: *it, find_me: find_table, subgraph: next_subgraph, end: subgraphs.end()); |
| 249 | // Now insert the right binding and update denominator with the |
| 250 | // tdom of the filter |
| 251 | it->relations.insert(x: find_table); |
| 252 | UpdateDenom(relation_2_denom&: *it, relation_to_tdom&: relation_2_tdom); |
| 253 | found_match = true; |
| 254 | break; |
| 255 | } |
| 256 | // means that the filter joins relations in the given set, but there is no |
| 257 | // connection to any subgraph in subgraphs. Add a new subgraph, and maybe later there will be |
| 258 | // a connection. |
| 259 | if (!found_match) { |
| 260 | subgraphs.emplace_back(); |
| 261 | auto &subgraph = subgraphs.back(); |
| 262 | subgraph.relations.insert(x: filter->left_binding.table_index); |
| 263 | subgraph.relations.insert(x: filter->right_binding.table_index); |
| 264 | UpdateDenom(relation_2_denom&: subgraph, relation_to_tdom&: relation_2_tdom); |
| 265 | } |
| 266 | auto remove_start = std::remove_if(first: subgraphs.begin(), last: subgraphs.end(), |
| 267 | pred: [](Subgraph2Denominator &s) { return s.relations.empty(); }); |
| 268 | subgraphs.erase(first: remove_start, last: subgraphs.end()); |
| 269 | |
| 270 | if (subgraphs.size() == 1 && subgraphs.at(n: 0).relations.size() == new_set.count) { |
| 271 | // You have found enough filters to connect the relations. These are guaranteed |
| 272 | // to be the filters with the highest Tdoms. |
| 273 | done = true; |
| 274 | break; |
| 275 | } |
| 276 | } |
| 277 | } |
| 278 | double denom = 1; |
| 279 | // TODO: It's possible cross-products were added and are not present in the filters in the relation_2_tdom |
| 280 | // structures. When that's the case, multiply the denom structures that have no intersection |
| 281 | for (auto &match : subgraphs) { |
| 282 | // It's possible that in production, one of the D_ASSERTS above will fail and not all subgraphs |
| 283 | // were connected. When this happens, just use the largest denominator of all the subgraphs. |
| 284 | if (match.denom > denom) { |
| 285 | denom = match.denom; |
| 286 | } |
| 287 | } |
| 288 | // can happen if a table has cardinality 0, or a tdom is set to 0 |
| 289 | if (denom == 0) { |
| 290 | denom = 1; |
| 291 | } |
| 292 | return numerator / denom; |
| 293 | } |
| 294 | |
| 295 | static bool IsLogicalFilter(LogicalOperator &op) { |
| 296 | return op.type == LogicalOperatorType::LOGICAL_FILTER; |
| 297 | } |
| 298 | |
| 299 | static optional_ptr<LogicalGet> GetLogicalGet(LogicalOperator &op, idx_t table_index = DConstants::INVALID_INDEX) { |
| 300 | optional_ptr<LogicalGet> get; |
| 301 | switch (op.type) { |
| 302 | case LogicalOperatorType::LOGICAL_GET: |
| 303 | get = &op.Cast<LogicalGet>(); |
| 304 | break; |
| 305 | case LogicalOperatorType::LOGICAL_FILTER: |
| 306 | get = GetLogicalGet(op&: *op.children.at(n: 0), table_index); |
| 307 | break; |
| 308 | case LogicalOperatorType::LOGICAL_PROJECTION: |
| 309 | get = GetLogicalGet(op&: *op.children.at(n: 0), table_index); |
| 310 | break; |
| 311 | case LogicalOperatorType::LOGICAL_ASOF_JOIN: |
| 312 | case LogicalOperatorType::LOGICAL_COMPARISON_JOIN: { |
| 313 | auto &join = op.Cast<LogicalComparisonJoin>(); |
| 314 | // We should never be calling GetLogicalGet without a valid table_index. |
| 315 | // We are attempting to get the catalog table for a relation (for statistics/cardinality estimation) |
| 316 | // A logical join means there is a non-reorderable relation in the join plan. This means we need |
| 317 | // to know the exact table index to return. |
| 318 | D_ASSERT(table_index != DConstants::INVALID_INDEX); |
| 319 | if (join.join_type == JoinType::MARK || join.join_type == JoinType::LEFT) { |
| 320 | auto &left_child = *join.children.at(n: 0); |
| 321 | get = GetLogicalGet(op&: left_child, table_index); |
| 322 | if (get && get->table_index == table_index) { |
| 323 | return get; |
| 324 | } |
| 325 | auto &right_child = *join.children.at(n: 1); |
| 326 | get = GetLogicalGet(op&: right_child, table_index); |
| 327 | if (get && get->table_index == table_index) { |
| 328 | return get; |
| 329 | } |
| 330 | } |
| 331 | break; |
| 332 | } |
| 333 | default: |
| 334 | // return null pointer, maybe there is no logical get under this child |
| 335 | break; |
| 336 | } |
| 337 | return get; |
| 338 | } |
| 339 | |
| 340 | void CardinalityEstimator::MergeBindings(idx_t binding_index, idx_t relation_id, |
| 341 | vector<column_binding_map_t<ColumnBinding>> &child_binding_maps) { |
| 342 | for (auto &map_set : child_binding_maps) { |
| 343 | for (auto &mapping : map_set) { |
| 344 | ColumnBinding relation_bindings = mapping.first; |
| 345 | ColumnBinding actual_bindings = mapping.second; |
| 346 | |
| 347 | if (actual_bindings.table_index == binding_index) { |
| 348 | auto key = ColumnBinding(relation_id, relation_bindings.column_index); |
| 349 | AddRelationToColumnMapping(key, value: actual_bindings); |
| 350 | } |
| 351 | } |
| 352 | } |
| 353 | } |
| 354 | |
| 355 | bool SortTdoms(const RelationsToTDom &a, const RelationsToTDom &b) { |
| 356 | if (a.has_tdom_hll && b.has_tdom_hll) { |
| 357 | return a.tdom_hll > b.tdom_hll; |
| 358 | } |
| 359 | if (a.has_tdom_hll) { |
| 360 | return a.tdom_hll > b.tdom_no_hll; |
| 361 | } |
| 362 | if (b.has_tdom_hll) { |
| 363 | return a.tdom_no_hll > b.tdom_hll; |
| 364 | } |
| 365 | return a.tdom_no_hll > b.tdom_no_hll; |
| 366 | } |
| 367 | |
| 368 | void CardinalityEstimator::InitCardinalityEstimatorProps(vector<NodeOp> &node_ops, |
| 369 | vector<unique_ptr<FilterInfo>> &filter_infos) { |
| 370 | InitEquivalentRelations(filter_infos); |
| 371 | InitTotalDomains(); |
| 372 | for (idx_t i = 0; i < node_ops.size(); i++) { |
| 373 | auto &join_node = *node_ops[i].node; |
| 374 | auto &op = node_ops[i].op; |
| 375 | join_node.SetBaseTableCardinality(op.EstimateCardinality(context)); |
| 376 | if (op.type == LogicalOperatorType::LOGICAL_COMPARISON_JOIN) { |
| 377 | auto &join = op.Cast<LogicalComparisonJoin>(); |
| 378 | if (join.join_type == JoinType::LEFT) { |
| 379 | // If a base op is a Logical Comparison join it is probably a left join, |
| 380 | // so the cost of the larger table is a fine estimate. |
| 381 | // TODO: provide better estimates for cost of mark joins |
| 382 | // MARK joins are used for anti and semi joins, so the cost can conceivably be |
| 383 | // less than the base table cardinality. |
| 384 | join_node.SetCost(join_node.GetBaseTableCardinality()); |
| 385 | } |
| 386 | } else if (op.type == LogicalOperatorType::LOGICAL_ASOF_JOIN) { |
| 387 | // AsOf joins have the cardinality of the LHS |
| 388 | join_node.SetCost(join_node.GetBaseTableCardinality()); |
| 389 | } |
| 390 | // Total domains can be affected by filters. So we update base table cardinality first |
| 391 | EstimateBaseTableCardinality(node&: join_node, op); |
| 392 | // Then update total domains. |
| 393 | UpdateTotalDomains(node&: join_node, op); |
| 394 | } |
| 395 | |
| 396 | // sort relations from greatest tdom to lowest tdom. |
| 397 | std::sort(first: relations_to_tdoms.begin(), last: relations_to_tdoms.end(), comp: SortTdoms); |
| 398 | } |
| 399 | |
| 400 | void CardinalityEstimator::UpdateTotalDomains(JoinNode &node, LogicalOperator &op) { |
| 401 | auto relation_id = node.set.relations[0]; |
| 402 | relation_attributes[relation_id].cardinality = node.GetCardinality<double>(); |
| 403 | //! Initialize the distinct count for all columns used in joins with the current relation. |
| 404 | idx_t distinct_count = node.GetBaseTableCardinality(); |
| 405 | optional_ptr<TableCatalogEntry> catalog_table; |
| 406 | |
| 407 | optional_ptr<LogicalGet> get; |
| 408 | bool get_updated = true; |
| 409 | for (auto &column : relation_attributes[relation_id].columns) { |
| 410 | //! for every column used in a filter in the relation, get the distinct count via HLL, or assume it to be |
| 411 | //! the cardinality |
| 412 | ColumnBinding key = ColumnBinding(relation_id, column); |
| 413 | auto actual_binding = relation_column_to_original_column.find(x: key); |
| 414 | // each relation has columns that are either projected or used as filters |
| 415 | // In order to get column statistics we need to make sure the actual binding still |
| 416 | // refers to the same base table relation, as non-reorderable joins may involve 2+ |
| 417 | // base table relations and therefore the columns may also refer to 2 different |
| 418 | // base table relations |
| 419 | if (actual_binding != relation_column_to_original_column.end() && |
| 420 | (!get || get->table_index != actual_binding->second.table_index)) { |
| 421 | get = GetLogicalGet(op, table_index: actual_binding->second.table_index); |
| 422 | get_updated = true; |
| 423 | } else { |
| 424 | get_updated = false; |
| 425 | } |
| 426 | |
| 427 | if (get_updated) { |
| 428 | if (get) { |
| 429 | catalog_table = GetCatalogTableEntry(op&: *get); |
| 430 | } else { |
| 431 | catalog_table = nullptr; |
| 432 | } |
| 433 | } |
| 434 | |
| 435 | if (catalog_table && actual_binding != relation_column_to_original_column.end()) { |
| 436 | // Get HLL stats here |
| 437 | auto base_stats = catalog_table->GetStatistics(context, column_id: actual_binding->second.column_index); |
| 438 | if (base_stats) { |
| 439 | distinct_count = base_stats->GetDistinctCount(); |
| 440 | } |
| 441 | |
| 442 | // HLL has estimation error, distinct_count can't be greater than cardinality of the table before filters |
| 443 | if (distinct_count > node.GetBaseTableCardinality()) { |
| 444 | distinct_count = node.GetBaseTableCardinality(); |
| 445 | } |
| 446 | } else { |
| 447 | distinct_count = node.GetBaseTableCardinality(); |
| 448 | } |
| 449 | // Update the relation_to_tdom set with the estimated distinct count (or tdom) calculated above |
| 450 | for (auto &relation_to_tdom : relations_to_tdoms) { |
| 451 | column_binding_set_t i_set = relation_to_tdom.equivalent_relations; |
| 452 | if (i_set.count(x: key) != 1) { |
| 453 | continue; |
| 454 | } |
| 455 | if (catalog_table) { |
| 456 | if (relation_to_tdom.tdom_hll < distinct_count) { |
| 457 | relation_to_tdom.tdom_hll = distinct_count; |
| 458 | relation_to_tdom.has_tdom_hll = true; |
| 459 | } |
| 460 | if (relation_to_tdom.tdom_no_hll > distinct_count) { |
| 461 | relation_to_tdom.tdom_no_hll = distinct_count; |
| 462 | } |
| 463 | } else { |
| 464 | // Here we don't have catalog statistics, and the following is how we determine |
| 465 | // the tdom |
| 466 | // 1. If there is any hll data in the equivalence set, use that |
| 467 | // 2. Otherwise, use the table with the smallest cardinality |
| 468 | if (relation_to_tdom.tdom_no_hll > distinct_count && !relation_to_tdom.has_tdom_hll) { |
| 469 | relation_to_tdom.tdom_no_hll = distinct_count; |
| 470 | } |
| 471 | } |
| 472 | break; |
| 473 | } |
| 474 | } |
| 475 | } |
| 476 | |
| 477 | optional_ptr<TableFilterSet> CardinalityEstimator::GetTableFilters(LogicalOperator &op, idx_t table_index) { |
| 478 | auto get = GetLogicalGet(op, table_index); |
| 479 | return get ? &get->table_filters : nullptr; |
| 480 | } |
| 481 | |
| 482 | idx_t CardinalityEstimator::InspectConjunctionAND(idx_t cardinality, idx_t column_index, ConjunctionAndFilter &filter, |
| 483 | unique_ptr<BaseStatistics> base_stats) { |
| 484 | auto has_equality_filter = false; |
| 485 | auto cardinality_after_filters = cardinality; |
| 486 | for (auto &child_filter : filter.child_filters) { |
| 487 | if (child_filter->filter_type != TableFilterType::CONSTANT_COMPARISON) { |
| 488 | continue; |
| 489 | } |
| 490 | auto &comparison_filter = child_filter->Cast<ConstantFilter>(); |
| 491 | if (comparison_filter.comparison_type != ExpressionType::COMPARE_EQUAL) { |
| 492 | continue; |
| 493 | } |
| 494 | auto column_count = 0; |
| 495 | if (base_stats) { |
| 496 | column_count = base_stats->GetDistinctCount(); |
| 497 | } |
| 498 | auto filtered_card = cardinality; |
| 499 | // column_count = 0 when there is no column count (i.e parquet scans) |
| 500 | if (column_count > 0) { |
| 501 | // we want the ceil of cardinality/column_count. We also want to avoid compiler errors |
| 502 | filtered_card = (cardinality + column_count - 1) / column_count; |
| 503 | cardinality_after_filters = filtered_card; |
| 504 | } |
| 505 | if (has_equality_filter) { |
| 506 | cardinality_after_filters = MinValue(a: filtered_card, b: cardinality_after_filters); |
| 507 | } |
| 508 | has_equality_filter = true; |
| 509 | } |
| 510 | return cardinality_after_filters; |
| 511 | } |
| 512 | |
| 513 | idx_t CardinalityEstimator::InspectConjunctionOR(idx_t cardinality, idx_t column_index, ConjunctionOrFilter &filter, |
| 514 | unique_ptr<BaseStatistics> base_stats) { |
| 515 | auto has_equality_filter = false; |
| 516 | auto cardinality_after_filters = cardinality; |
| 517 | for (auto &child_filter : filter.child_filters) { |
| 518 | if (child_filter->filter_type != TableFilterType::CONSTANT_COMPARISON) { |
| 519 | continue; |
| 520 | } |
| 521 | auto &comparison_filter = child_filter->Cast<ConstantFilter>(); |
| 522 | if (comparison_filter.comparison_type == ExpressionType::COMPARE_EQUAL) { |
| 523 | auto column_count = cardinality_after_filters; |
| 524 | if (base_stats) { |
| 525 | column_count = base_stats->GetDistinctCount(); |
| 526 | } |
| 527 | auto increment = MaxValue<idx_t>(a: ((cardinality + column_count - 1) / column_count), b: 1); |
| 528 | if (has_equality_filter) { |
| 529 | cardinality_after_filters += increment; |
| 530 | } else { |
| 531 | cardinality_after_filters = increment; |
| 532 | } |
| 533 | has_equality_filter = true; |
| 534 | } |
| 535 | } |
| 536 | D_ASSERT(cardinality_after_filters > 0); |
| 537 | return cardinality_after_filters; |
| 538 | } |
| 539 | |
| 540 | idx_t CardinalityEstimator::InspectTableFilters(idx_t cardinality, LogicalOperator &op, TableFilterSet &table_filters, |
| 541 | idx_t table_index) { |
| 542 | idx_t cardinality_after_filters = cardinality; |
| 543 | auto get = GetLogicalGet(op, table_index); |
| 544 | unique_ptr<BaseStatistics> column_statistics; |
| 545 | for (auto &it : table_filters.filters) { |
| 546 | column_statistics = nullptr; |
| 547 | if (get->bind_data && get->function.name.compare(s: "seq_scan" ) == 0) { |
| 548 | auto &table_scan_bind_data = get->bind_data->Cast<TableScanBindData>(); |
| 549 | column_statistics = get->function.statistics(context, &table_scan_bind_data, it.first); |
| 550 | } |
| 551 | if (it.second->filter_type == TableFilterType::CONJUNCTION_AND) { |
| 552 | auto &filter = it.second->Cast<ConjunctionAndFilter>(); |
| 553 | idx_t cardinality_with_and_filter = |
| 554 | InspectConjunctionAND(cardinality, column_index: it.first, filter, base_stats: std::move(column_statistics)); |
| 555 | cardinality_after_filters = MinValue(a: cardinality_after_filters, b: cardinality_with_and_filter); |
| 556 | } else if (it.second->filter_type == TableFilterType::CONJUNCTION_OR) { |
| 557 | auto &filter = it.second->Cast<ConjunctionOrFilter>(); |
| 558 | idx_t cardinality_with_or_filter = |
| 559 | InspectConjunctionOR(cardinality, column_index: it.first, filter, base_stats: std::move(column_statistics)); |
| 560 | cardinality_after_filters = MinValue(a: cardinality_after_filters, b: cardinality_with_or_filter); |
| 561 | } |
| 562 | } |
| 563 | // if the above code didn't find an equality filter (i.e country_code = "[us]") |
| 564 | // and there are other table filters, use default selectivity. |
| 565 | bool has_equality_filter = (cardinality_after_filters != cardinality); |
| 566 | if (!has_equality_filter && !table_filters.filters.empty()) { |
| 567 | cardinality_after_filters = MaxValue<idx_t>(a: cardinality * DEFAULT_SELECTIVITY, b: 1); |
| 568 | } |
| 569 | return cardinality_after_filters; |
| 570 | } |
| 571 | |
| 572 | void CardinalityEstimator::EstimateBaseTableCardinality(JoinNode &node, LogicalOperator &op) { |
| 573 | auto has_logical_filter = IsLogicalFilter(op); |
| 574 | D_ASSERT(node.set.count == 1); |
| 575 | auto relation_id = node.set.relations[0]; |
| 576 | |
| 577 | double lowest_card_found = node.GetBaseTableCardinality(); |
| 578 | for (auto &column : relation_attributes[relation_id].columns) { |
| 579 | auto card_after_filters = node.GetBaseTableCardinality(); |
| 580 | ColumnBinding key = ColumnBinding(relation_id, column); |
| 581 | optional_ptr<TableFilterSet> table_filters; |
| 582 | auto actual_binding = relation_column_to_original_column.find(x: key); |
| 583 | if (actual_binding != relation_column_to_original_column.end()) { |
| 584 | table_filters = GetTableFilters(op, table_index: actual_binding->second.table_index); |
| 585 | } |
| 586 | |
| 587 | if (table_filters) { |
| 588 | double inspect_result = |
| 589 | (double)InspectTableFilters(cardinality: card_after_filters, op, table_filters&: *table_filters, table_index: actual_binding->second.table_index); |
| 590 | card_after_filters = MinValue(a: inspect_result, b: (double)card_after_filters); |
| 591 | } |
| 592 | if (has_logical_filter) { |
| 593 | card_after_filters *= DEFAULT_SELECTIVITY; |
| 594 | } |
| 595 | lowest_card_found = MinValue(a: card_after_filters, b: lowest_card_found); |
| 596 | } |
| 597 | node.SetEstimatedCardinality(lowest_card_found); |
| 598 | } |
| 599 | |
| 600 | } // namespace duckdb |
| 601 | |