1 | #include "duckdb/function/table/table_scan.hpp" |
2 | #include "duckdb/optimizer/join_order/join_node.hpp" |
3 | #include "duckdb/optimizer/join_order/join_order_optimizer.hpp" |
4 | #include "duckdb/planner/filter/conjunction_filter.hpp" |
5 | #include "duckdb/planner/filter/constant_filter.hpp" |
6 | #include "duckdb/planner/operator/logical_comparison_join.hpp" |
7 | #include "duckdb/planner/operator/logical_get.hpp" |
8 | #include "duckdb/storage/data_table.hpp" |
9 | #include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp" |
10 | |
11 | #include <cmath> |
12 | |
13 | namespace duckdb { |
14 | |
15 | static optional_ptr<TableCatalogEntry> GetCatalogTableEntry(LogicalOperator &op) { |
16 | D_ASSERT(op.type == LogicalOperatorType::LOGICAL_GET); |
17 | auto &get = op.Cast<LogicalGet>(); |
18 | return get.GetTable(); |
19 | } |
20 | |
21 | // The filter was made on top of a logical sample or other projection, |
22 | // but no specific columns are referenced. See issue 4978 number 4. |
23 | bool CardinalityEstimator::EmptyFilter(FilterInfo &filter_info) { |
24 | if (!filter_info.left_set && !filter_info.right_set) { |
25 | return true; |
26 | } |
27 | return false; |
28 | } |
29 | |
30 | void CardinalityEstimator::AddRelationTdom(FilterInfo &filter_info) { |
31 | D_ASSERT(filter_info.set.count >= 1); |
32 | for (const RelationsToTDom &r2tdom : relations_to_tdoms) { |
33 | auto &i_set = r2tdom.equivalent_relations; |
34 | if (i_set.find(x: filter_info.left_binding) != i_set.end()) { |
35 | // found an equivalent filter |
36 | return; |
37 | } |
38 | } |
39 | auto key = ColumnBinding(filter_info.left_binding.table_index, filter_info.left_binding.column_index); |
40 | relations_to_tdoms.emplace_back(args: column_binding_set_t({key})); |
41 | } |
42 | |
43 | bool CardinalityEstimator::SingleColumnFilter(FilterInfo &filter_info) { |
44 | if (filter_info.left_set && filter_info.right_set) { |
45 | // Both set |
46 | return false; |
47 | } |
48 | if (EmptyFilter(filter_info)) { |
49 | return false; |
50 | } |
51 | return true; |
52 | } |
53 | |
54 | vector<idx_t> CardinalityEstimator::DetermineMatchingEquivalentSets(FilterInfo *filter_info) { |
55 | vector<idx_t> matching_equivalent_sets; |
56 | auto equivalent_relation_index = 0; |
57 | |
58 | for (const RelationsToTDom &r2tdom : relations_to_tdoms) { |
59 | auto &i_set = r2tdom.equivalent_relations; |
60 | if (i_set.find(x: filter_info->left_binding) != i_set.end()) { |
61 | matching_equivalent_sets.push_back(x: equivalent_relation_index); |
62 | } else if (i_set.find(x: filter_info->right_binding) != i_set.end()) { |
63 | // don't add both left and right to the matching_equivalent_sets |
64 | // since both left and right get added to that index anyway. |
65 | matching_equivalent_sets.push_back(x: equivalent_relation_index); |
66 | } |
67 | equivalent_relation_index++; |
68 | } |
69 | return matching_equivalent_sets; |
70 | } |
71 | |
72 | void CardinalityEstimator::AddToEquivalenceSets(FilterInfo *filter_info, vector<idx_t> matching_equivalent_sets) { |
73 | D_ASSERT(matching_equivalent_sets.size() <= 2); |
74 | if (matching_equivalent_sets.size() > 1) { |
75 | // an equivalence relation is connecting to sets of equivalence relations |
76 | // so push all relations from the second set into the first. Later we will delete |
77 | // the second set. |
78 | for (ColumnBinding i : relations_to_tdoms.at(n: matching_equivalent_sets[1]).equivalent_relations) { |
79 | relations_to_tdoms.at(n: matching_equivalent_sets[0]).equivalent_relations.insert(x: i); |
80 | } |
81 | relations_to_tdoms.at(n: matching_equivalent_sets[1]).equivalent_relations.clear(); |
82 | relations_to_tdoms.at(n: matching_equivalent_sets[0]).filters.push_back(x: filter_info); |
83 | // add all values of one set to the other, delete the empty one |
84 | } else if (matching_equivalent_sets.size() == 1) { |
85 | auto &tdom_i = relations_to_tdoms.at(n: matching_equivalent_sets.at(n: 0)); |
86 | tdom_i.equivalent_relations.insert(x: filter_info->left_binding); |
87 | tdom_i.equivalent_relations.insert(x: filter_info->right_binding); |
88 | tdom_i.filters.push_back(x: filter_info); |
89 | } else if (matching_equivalent_sets.empty()) { |
90 | column_binding_set_t tmp; |
91 | tmp.insert(x: filter_info->left_binding); |
92 | tmp.insert(x: filter_info->right_binding); |
93 | relations_to_tdoms.emplace_back(args&: tmp); |
94 | relations_to_tdoms.back().filters.push_back(x: filter_info); |
95 | } |
96 | } |
97 | |
98 | void CardinalityEstimator::AddRelationToColumnMapping(ColumnBinding key, ColumnBinding value) { |
99 | relation_column_to_original_column[key] = value; |
100 | } |
101 | |
102 | void CardinalityEstimator::CopyRelationMap(column_binding_map_t<ColumnBinding> &child_binding_map) { |
103 | for (auto &binding_map : relation_column_to_original_column) { |
104 | D_ASSERT(child_binding_map.find(binding_map.first) == child_binding_map.end()); |
105 | child_binding_map[binding_map.first] = binding_map.second; |
106 | } |
107 | } |
108 | |
109 | void CardinalityEstimator::AddColumnToRelationMap(idx_t table_index, idx_t column_index) { |
110 | relation_attributes[table_index].columns.insert(x: column_index); |
111 | } |
112 | |
113 | void CardinalityEstimator::InitEquivalentRelations(vector<unique_ptr<FilterInfo>> &filter_infos) { |
114 | // For each filter, we fill keep track of the index of the equivalent relation set |
115 | // the left and right relation needs to be added to. |
116 | for (auto &filter : filter_infos) { |
117 | if (SingleColumnFilter(filter_info&: *filter)) { |
118 | // Filter on one relation, (i.e string or range filter on a column). |
119 | // Grab the first relation and add it to the equivalence_relations |
120 | AddRelationTdom(filter_info&: *filter); |
121 | continue; |
122 | } else if (EmptyFilter(filter_info&: *filter)) { |
123 | continue; |
124 | } |
125 | D_ASSERT(filter->left_set->count >= 1); |
126 | D_ASSERT(filter->right_set->count >= 1); |
127 | |
128 | auto matching_equivalent_sets = DetermineMatchingEquivalentSets(filter_info: filter.get()); |
129 | AddToEquivalenceSets(filter_info: filter.get(), matching_equivalent_sets); |
130 | } |
131 | } |
132 | |
133 | void CardinalityEstimator::VerifySymmetry(JoinNode &result, JoinNode &entry) { |
134 | if (result.GetCardinality<double>() != entry.GetCardinality<double>()) { |
135 | // Currently it's possible that some entries are cartesian joins. |
136 | // When this is the case, you don't always have symmetry, but |
137 | // if the cost of the result is less, then just assure the cardinality |
138 | // is also less, then you have the same effect of symmetry. |
139 | D_ASSERT(ceil(result.GetCardinality<double>()) <= ceil(entry.GetCardinality<double>()) || |
140 | floor(result.GetCardinality<double>()) <= floor(entry.GetCardinality<double>())); |
141 | } |
142 | } |
143 | |
144 | void CardinalityEstimator::InitTotalDomains() { |
145 | auto remove_start = std::remove_if(first: relations_to_tdoms.begin(), last: relations_to_tdoms.end(), |
146 | pred: [](RelationsToTDom &r_2_tdom) { return r_2_tdom.equivalent_relations.empty(); }); |
147 | relations_to_tdoms.erase(first: remove_start, last: relations_to_tdoms.end()); |
148 | } |
149 | |
150 | double CardinalityEstimator::ComputeCost(JoinNode &left, JoinNode &right, double expected_cardinality) { |
151 | return expected_cardinality + left.GetCost() + right.GetCost(); |
152 | } |
153 | |
154 | double CardinalityEstimator::EstimateCrossProduct(const JoinNode &left, const JoinNode &right) { |
155 | // need to explicity use double here, otherwise auto converts it to an int, then |
156 | // there is an autocast in the return. |
157 | if (left.GetCardinality<double>() >= (NumericLimits<double>::Maximum() / right.GetCardinality<double>())) { |
158 | return NumericLimits<double>::Maximum(); |
159 | } |
160 | return left.GetCardinality<double>() * right.GetCardinality<double>(); |
161 | } |
162 | |
163 | void CardinalityEstimator::AddRelationColumnMapping(LogicalGet &get, idx_t relation_id) { |
164 | for (idx_t it = 0; it < get.column_ids.size(); it++) { |
165 | auto key = ColumnBinding(relation_id, it); |
166 | auto value = ColumnBinding(get.table_index, get.column_ids[it]); |
167 | AddRelationToColumnMapping(key, value); |
168 | } |
169 | } |
170 | |
171 | void UpdateDenom(Subgraph2Denominator &relation_2_denom, RelationsToTDom &relation_to_tdom) { |
172 | relation_2_denom.denom *= relation_to_tdom.has_tdom_hll ? relation_to_tdom.tdom_hll : relation_to_tdom.tdom_no_hll; |
173 | } |
174 | |
175 | void FindSubgraphMatchAndMerge(Subgraph2Denominator &merge_to, idx_t find_me, |
176 | vector<Subgraph2Denominator>::iterator subgraph, |
177 | vector<Subgraph2Denominator>::iterator end) { |
178 | for (; subgraph != end; subgraph++) { |
179 | if (subgraph->relations.count(x: find_me) >= 1) { |
180 | for (auto &relation : subgraph->relations) { |
181 | merge_to.relations.insert(x: relation); |
182 | } |
183 | subgraph->relations.clear(); |
184 | merge_to.denom *= subgraph->denom; |
185 | return; |
186 | } |
187 | } |
188 | } |
189 | |
190 | double CardinalityEstimator::EstimateCardinalityWithSet(JoinRelationSet &new_set) { |
191 | double numerator = 1; |
192 | unordered_set<idx_t> actual_set; |
193 | for (idx_t i = 0; i < new_set.count; i++) { |
194 | numerator *= relation_attributes[new_set.relations[i]].cardinality; |
195 | actual_set.insert(x: new_set.relations[i]); |
196 | } |
197 | vector<Subgraph2Denominator> subgraphs; |
198 | bool done = false; |
199 | bool found_match = false; |
200 | |
201 | // Finding the denominator is tricky. You need to go through the tdoms in decreasing order |
202 | // Then loop through all filters in the equivalence set of the tdom to see if both the |
203 | // left and right relations are in the new set, if so you can use that filter. |
204 | // You must also make sure that the filters all relations in the given set, so we use subgraphs |
205 | // that should eventually merge into one connected graph that joins all the relations |
206 | // TODO: Implement a method to cache subgraphs so you don't have to build them up every |
207 | // time the cardinality of a new set is requested |
208 | |
209 | // relations_to_tdoms has already been sorted. |
210 | for (auto &relation_2_tdom : relations_to_tdoms) { |
211 | // loop through each filter in the tdom. |
212 | if (done) { |
213 | break; |
214 | } |
215 | for (auto &filter : relation_2_tdom.filters) { |
216 | if (actual_set.count(x: filter->left_binding.table_index) == 0 || |
217 | actual_set.count(x: filter->right_binding.table_index) == 0) { |
218 | continue; |
219 | } |
220 | // the join filter is on relations in the new set. |
221 | found_match = false; |
222 | vector<Subgraph2Denominator>::iterator it; |
223 | for (it = subgraphs.begin(); it != subgraphs.end(); it++) { |
224 | auto left_in = it->relations.count(x: filter->left_binding.table_index); |
225 | auto right_in = it->relations.count(x: filter->right_binding.table_index); |
226 | if (left_in && right_in) { |
227 | // if both left and right bindings are in the subgraph, continue. |
228 | // This means another filter is connecting relations already in the |
229 | // subgraph it, but it has a tdom that is less, and we don't care. |
230 | found_match = true; |
231 | continue; |
232 | } |
233 | if (!left_in && !right_in) { |
234 | // if both left and right bindings are *not* in the subgraph, continue |
235 | // without finding a match. This will trigger the process to add a new |
236 | // subgraph |
237 | continue; |
238 | } |
239 | idx_t find_table; |
240 | if (left_in) { |
241 | find_table = filter->right_binding.table_index; |
242 | } else { |
243 | D_ASSERT(right_in); |
244 | find_table = filter->left_binding.table_index; |
245 | } |
246 | auto next_subgraph = it + 1; |
247 | // iterate through other subgraphs and merge. |
248 | FindSubgraphMatchAndMerge(merge_to&: *it, find_me: find_table, subgraph: next_subgraph, end: subgraphs.end()); |
249 | // Now insert the right binding and update denominator with the |
250 | // tdom of the filter |
251 | it->relations.insert(x: find_table); |
252 | UpdateDenom(relation_2_denom&: *it, relation_to_tdom&: relation_2_tdom); |
253 | found_match = true; |
254 | break; |
255 | } |
256 | // means that the filter joins relations in the given set, but there is no |
257 | // connection to any subgraph in subgraphs. Add a new subgraph, and maybe later there will be |
258 | // a connection. |
259 | if (!found_match) { |
260 | subgraphs.emplace_back(); |
261 | auto &subgraph = subgraphs.back(); |
262 | subgraph.relations.insert(x: filter->left_binding.table_index); |
263 | subgraph.relations.insert(x: filter->right_binding.table_index); |
264 | UpdateDenom(relation_2_denom&: subgraph, relation_to_tdom&: relation_2_tdom); |
265 | } |
266 | auto remove_start = std::remove_if(first: subgraphs.begin(), last: subgraphs.end(), |
267 | pred: [](Subgraph2Denominator &s) { return s.relations.empty(); }); |
268 | subgraphs.erase(first: remove_start, last: subgraphs.end()); |
269 | |
270 | if (subgraphs.size() == 1 && subgraphs.at(n: 0).relations.size() == new_set.count) { |
271 | // You have found enough filters to connect the relations. These are guaranteed |
272 | // to be the filters with the highest Tdoms. |
273 | done = true; |
274 | break; |
275 | } |
276 | } |
277 | } |
278 | double denom = 1; |
279 | // TODO: It's possible cross-products were added and are not present in the filters in the relation_2_tdom |
280 | // structures. When that's the case, multiply the denom structures that have no intersection |
281 | for (auto &match : subgraphs) { |
282 | // It's possible that in production, one of the D_ASSERTS above will fail and not all subgraphs |
283 | // were connected. When this happens, just use the largest denominator of all the subgraphs. |
284 | if (match.denom > denom) { |
285 | denom = match.denom; |
286 | } |
287 | } |
288 | // can happen if a table has cardinality 0, or a tdom is set to 0 |
289 | if (denom == 0) { |
290 | denom = 1; |
291 | } |
292 | return numerator / denom; |
293 | } |
294 | |
295 | static bool IsLogicalFilter(LogicalOperator &op) { |
296 | return op.type == LogicalOperatorType::LOGICAL_FILTER; |
297 | } |
298 | |
299 | static optional_ptr<LogicalGet> GetLogicalGet(LogicalOperator &op, idx_t table_index = DConstants::INVALID_INDEX) { |
300 | optional_ptr<LogicalGet> get; |
301 | switch (op.type) { |
302 | case LogicalOperatorType::LOGICAL_GET: |
303 | get = &op.Cast<LogicalGet>(); |
304 | break; |
305 | case LogicalOperatorType::LOGICAL_FILTER: |
306 | get = GetLogicalGet(op&: *op.children.at(n: 0), table_index); |
307 | break; |
308 | case LogicalOperatorType::LOGICAL_PROJECTION: |
309 | get = GetLogicalGet(op&: *op.children.at(n: 0), table_index); |
310 | break; |
311 | case LogicalOperatorType::LOGICAL_ASOF_JOIN: |
312 | case LogicalOperatorType::LOGICAL_COMPARISON_JOIN: { |
313 | auto &join = op.Cast<LogicalComparisonJoin>(); |
314 | // We should never be calling GetLogicalGet without a valid table_index. |
315 | // We are attempting to get the catalog table for a relation (for statistics/cardinality estimation) |
316 | // A logical join means there is a non-reorderable relation in the join plan. This means we need |
317 | // to know the exact table index to return. |
318 | D_ASSERT(table_index != DConstants::INVALID_INDEX); |
319 | if (join.join_type == JoinType::MARK || join.join_type == JoinType::LEFT) { |
320 | auto &left_child = *join.children.at(n: 0); |
321 | get = GetLogicalGet(op&: left_child, table_index); |
322 | if (get && get->table_index == table_index) { |
323 | return get; |
324 | } |
325 | auto &right_child = *join.children.at(n: 1); |
326 | get = GetLogicalGet(op&: right_child, table_index); |
327 | if (get && get->table_index == table_index) { |
328 | return get; |
329 | } |
330 | } |
331 | break; |
332 | } |
333 | default: |
334 | // return null pointer, maybe there is no logical get under this child |
335 | break; |
336 | } |
337 | return get; |
338 | } |
339 | |
340 | void CardinalityEstimator::MergeBindings(idx_t binding_index, idx_t relation_id, |
341 | vector<column_binding_map_t<ColumnBinding>> &child_binding_maps) { |
342 | for (auto &map_set : child_binding_maps) { |
343 | for (auto &mapping : map_set) { |
344 | ColumnBinding relation_bindings = mapping.first; |
345 | ColumnBinding actual_bindings = mapping.second; |
346 | |
347 | if (actual_bindings.table_index == binding_index) { |
348 | auto key = ColumnBinding(relation_id, relation_bindings.column_index); |
349 | AddRelationToColumnMapping(key, value: actual_bindings); |
350 | } |
351 | } |
352 | } |
353 | } |
354 | |
355 | bool SortTdoms(const RelationsToTDom &a, const RelationsToTDom &b) { |
356 | if (a.has_tdom_hll && b.has_tdom_hll) { |
357 | return a.tdom_hll > b.tdom_hll; |
358 | } |
359 | if (a.has_tdom_hll) { |
360 | return a.tdom_hll > b.tdom_no_hll; |
361 | } |
362 | if (b.has_tdom_hll) { |
363 | return a.tdom_no_hll > b.tdom_hll; |
364 | } |
365 | return a.tdom_no_hll > b.tdom_no_hll; |
366 | } |
367 | |
368 | void CardinalityEstimator::InitCardinalityEstimatorProps(vector<NodeOp> &node_ops, |
369 | vector<unique_ptr<FilterInfo>> &filter_infos) { |
370 | InitEquivalentRelations(filter_infos); |
371 | InitTotalDomains(); |
372 | for (idx_t i = 0; i < node_ops.size(); i++) { |
373 | auto &join_node = *node_ops[i].node; |
374 | auto &op = node_ops[i].op; |
375 | join_node.SetBaseTableCardinality(op.EstimateCardinality(context)); |
376 | if (op.type == LogicalOperatorType::LOGICAL_COMPARISON_JOIN) { |
377 | auto &join = op.Cast<LogicalComparisonJoin>(); |
378 | if (join.join_type == JoinType::LEFT) { |
379 | // If a base op is a Logical Comparison join it is probably a left join, |
380 | // so the cost of the larger table is a fine estimate. |
381 | // TODO: provide better estimates for cost of mark joins |
382 | // MARK joins are used for anti and semi joins, so the cost can conceivably be |
383 | // less than the base table cardinality. |
384 | join_node.SetCost(join_node.GetBaseTableCardinality()); |
385 | } |
386 | } else if (op.type == LogicalOperatorType::LOGICAL_ASOF_JOIN) { |
387 | // AsOf joins have the cardinality of the LHS |
388 | join_node.SetCost(join_node.GetBaseTableCardinality()); |
389 | } |
390 | // Total domains can be affected by filters. So we update base table cardinality first |
391 | EstimateBaseTableCardinality(node&: join_node, op); |
392 | // Then update total domains. |
393 | UpdateTotalDomains(node&: join_node, op); |
394 | } |
395 | |
396 | // sort relations from greatest tdom to lowest tdom. |
397 | std::sort(first: relations_to_tdoms.begin(), last: relations_to_tdoms.end(), comp: SortTdoms); |
398 | } |
399 | |
400 | void CardinalityEstimator::UpdateTotalDomains(JoinNode &node, LogicalOperator &op) { |
401 | auto relation_id = node.set.relations[0]; |
402 | relation_attributes[relation_id].cardinality = node.GetCardinality<double>(); |
403 | //! Initialize the distinct count for all columns used in joins with the current relation. |
404 | idx_t distinct_count = node.GetBaseTableCardinality(); |
405 | optional_ptr<TableCatalogEntry> catalog_table; |
406 | |
407 | optional_ptr<LogicalGet> get; |
408 | bool get_updated = true; |
409 | for (auto &column : relation_attributes[relation_id].columns) { |
410 | //! for every column used in a filter in the relation, get the distinct count via HLL, or assume it to be |
411 | //! the cardinality |
412 | ColumnBinding key = ColumnBinding(relation_id, column); |
413 | auto actual_binding = relation_column_to_original_column.find(x: key); |
414 | // each relation has columns that are either projected or used as filters |
415 | // In order to get column statistics we need to make sure the actual binding still |
416 | // refers to the same base table relation, as non-reorderable joins may involve 2+ |
417 | // base table relations and therefore the columns may also refer to 2 different |
418 | // base table relations |
419 | if (actual_binding != relation_column_to_original_column.end() && |
420 | (!get || get->table_index != actual_binding->second.table_index)) { |
421 | get = GetLogicalGet(op, table_index: actual_binding->second.table_index); |
422 | get_updated = true; |
423 | } else { |
424 | get_updated = false; |
425 | } |
426 | |
427 | if (get_updated) { |
428 | if (get) { |
429 | catalog_table = GetCatalogTableEntry(op&: *get); |
430 | } else { |
431 | catalog_table = nullptr; |
432 | } |
433 | } |
434 | |
435 | if (catalog_table && actual_binding != relation_column_to_original_column.end()) { |
436 | // Get HLL stats here |
437 | auto base_stats = catalog_table->GetStatistics(context, column_id: actual_binding->second.column_index); |
438 | if (base_stats) { |
439 | distinct_count = base_stats->GetDistinctCount(); |
440 | } |
441 | |
442 | // HLL has estimation error, distinct_count can't be greater than cardinality of the table before filters |
443 | if (distinct_count > node.GetBaseTableCardinality()) { |
444 | distinct_count = node.GetBaseTableCardinality(); |
445 | } |
446 | } else { |
447 | distinct_count = node.GetBaseTableCardinality(); |
448 | } |
449 | // Update the relation_to_tdom set with the estimated distinct count (or tdom) calculated above |
450 | for (auto &relation_to_tdom : relations_to_tdoms) { |
451 | column_binding_set_t i_set = relation_to_tdom.equivalent_relations; |
452 | if (i_set.count(x: key) != 1) { |
453 | continue; |
454 | } |
455 | if (catalog_table) { |
456 | if (relation_to_tdom.tdom_hll < distinct_count) { |
457 | relation_to_tdom.tdom_hll = distinct_count; |
458 | relation_to_tdom.has_tdom_hll = true; |
459 | } |
460 | if (relation_to_tdom.tdom_no_hll > distinct_count) { |
461 | relation_to_tdom.tdom_no_hll = distinct_count; |
462 | } |
463 | } else { |
464 | // Here we don't have catalog statistics, and the following is how we determine |
465 | // the tdom |
466 | // 1. If there is any hll data in the equivalence set, use that |
467 | // 2. Otherwise, use the table with the smallest cardinality |
468 | if (relation_to_tdom.tdom_no_hll > distinct_count && !relation_to_tdom.has_tdom_hll) { |
469 | relation_to_tdom.tdom_no_hll = distinct_count; |
470 | } |
471 | } |
472 | break; |
473 | } |
474 | } |
475 | } |
476 | |
477 | optional_ptr<TableFilterSet> CardinalityEstimator::GetTableFilters(LogicalOperator &op, idx_t table_index) { |
478 | auto get = GetLogicalGet(op, table_index); |
479 | return get ? &get->table_filters : nullptr; |
480 | } |
481 | |
482 | idx_t CardinalityEstimator::InspectConjunctionAND(idx_t cardinality, idx_t column_index, ConjunctionAndFilter &filter, |
483 | unique_ptr<BaseStatistics> base_stats) { |
484 | auto has_equality_filter = false; |
485 | auto cardinality_after_filters = cardinality; |
486 | for (auto &child_filter : filter.child_filters) { |
487 | if (child_filter->filter_type != TableFilterType::CONSTANT_COMPARISON) { |
488 | continue; |
489 | } |
490 | auto &comparison_filter = child_filter->Cast<ConstantFilter>(); |
491 | if (comparison_filter.comparison_type != ExpressionType::COMPARE_EQUAL) { |
492 | continue; |
493 | } |
494 | auto column_count = 0; |
495 | if (base_stats) { |
496 | column_count = base_stats->GetDistinctCount(); |
497 | } |
498 | auto filtered_card = cardinality; |
499 | // column_count = 0 when there is no column count (i.e parquet scans) |
500 | if (column_count > 0) { |
501 | // we want the ceil of cardinality/column_count. We also want to avoid compiler errors |
502 | filtered_card = (cardinality + column_count - 1) / column_count; |
503 | cardinality_after_filters = filtered_card; |
504 | } |
505 | if (has_equality_filter) { |
506 | cardinality_after_filters = MinValue(a: filtered_card, b: cardinality_after_filters); |
507 | } |
508 | has_equality_filter = true; |
509 | } |
510 | return cardinality_after_filters; |
511 | } |
512 | |
513 | idx_t CardinalityEstimator::InspectConjunctionOR(idx_t cardinality, idx_t column_index, ConjunctionOrFilter &filter, |
514 | unique_ptr<BaseStatistics> base_stats) { |
515 | auto has_equality_filter = false; |
516 | auto cardinality_after_filters = cardinality; |
517 | for (auto &child_filter : filter.child_filters) { |
518 | if (child_filter->filter_type != TableFilterType::CONSTANT_COMPARISON) { |
519 | continue; |
520 | } |
521 | auto &comparison_filter = child_filter->Cast<ConstantFilter>(); |
522 | if (comparison_filter.comparison_type == ExpressionType::COMPARE_EQUAL) { |
523 | auto column_count = cardinality_after_filters; |
524 | if (base_stats) { |
525 | column_count = base_stats->GetDistinctCount(); |
526 | } |
527 | auto increment = MaxValue<idx_t>(a: ((cardinality + column_count - 1) / column_count), b: 1); |
528 | if (has_equality_filter) { |
529 | cardinality_after_filters += increment; |
530 | } else { |
531 | cardinality_after_filters = increment; |
532 | } |
533 | has_equality_filter = true; |
534 | } |
535 | } |
536 | D_ASSERT(cardinality_after_filters > 0); |
537 | return cardinality_after_filters; |
538 | } |
539 | |
540 | idx_t CardinalityEstimator::InspectTableFilters(idx_t cardinality, LogicalOperator &op, TableFilterSet &table_filters, |
541 | idx_t table_index) { |
542 | idx_t cardinality_after_filters = cardinality; |
543 | auto get = GetLogicalGet(op, table_index); |
544 | unique_ptr<BaseStatistics> column_statistics; |
545 | for (auto &it : table_filters.filters) { |
546 | column_statistics = nullptr; |
547 | if (get->bind_data && get->function.name.compare(s: "seq_scan" ) == 0) { |
548 | auto &table_scan_bind_data = get->bind_data->Cast<TableScanBindData>(); |
549 | column_statistics = get->function.statistics(context, &table_scan_bind_data, it.first); |
550 | } |
551 | if (it.second->filter_type == TableFilterType::CONJUNCTION_AND) { |
552 | auto &filter = it.second->Cast<ConjunctionAndFilter>(); |
553 | idx_t cardinality_with_and_filter = |
554 | InspectConjunctionAND(cardinality, column_index: it.first, filter, base_stats: std::move(column_statistics)); |
555 | cardinality_after_filters = MinValue(a: cardinality_after_filters, b: cardinality_with_and_filter); |
556 | } else if (it.second->filter_type == TableFilterType::CONJUNCTION_OR) { |
557 | auto &filter = it.second->Cast<ConjunctionOrFilter>(); |
558 | idx_t cardinality_with_or_filter = |
559 | InspectConjunctionOR(cardinality, column_index: it.first, filter, base_stats: std::move(column_statistics)); |
560 | cardinality_after_filters = MinValue(a: cardinality_after_filters, b: cardinality_with_or_filter); |
561 | } |
562 | } |
563 | // if the above code didn't find an equality filter (i.e country_code = "[us]") |
564 | // and there are other table filters, use default selectivity. |
565 | bool has_equality_filter = (cardinality_after_filters != cardinality); |
566 | if (!has_equality_filter && !table_filters.filters.empty()) { |
567 | cardinality_after_filters = MaxValue<idx_t>(a: cardinality * DEFAULT_SELECTIVITY, b: 1); |
568 | } |
569 | return cardinality_after_filters; |
570 | } |
571 | |
572 | void CardinalityEstimator::EstimateBaseTableCardinality(JoinNode &node, LogicalOperator &op) { |
573 | auto has_logical_filter = IsLogicalFilter(op); |
574 | D_ASSERT(node.set.count == 1); |
575 | auto relation_id = node.set.relations[0]; |
576 | |
577 | double lowest_card_found = node.GetBaseTableCardinality(); |
578 | for (auto &column : relation_attributes[relation_id].columns) { |
579 | auto card_after_filters = node.GetBaseTableCardinality(); |
580 | ColumnBinding key = ColumnBinding(relation_id, column); |
581 | optional_ptr<TableFilterSet> table_filters; |
582 | auto actual_binding = relation_column_to_original_column.find(x: key); |
583 | if (actual_binding != relation_column_to_original_column.end()) { |
584 | table_filters = GetTableFilters(op, table_index: actual_binding->second.table_index); |
585 | } |
586 | |
587 | if (table_filters) { |
588 | double inspect_result = |
589 | (double)InspectTableFilters(cardinality: card_after_filters, op, table_filters&: *table_filters, table_index: actual_binding->second.table_index); |
590 | card_after_filters = MinValue(a: inspect_result, b: (double)card_after_filters); |
591 | } |
592 | if (has_logical_filter) { |
593 | card_after_filters *= DEFAULT_SELECTIVITY; |
594 | } |
595 | lowest_card_found = MinValue(a: card_after_filters, b: lowest_card_found); |
596 | } |
597 | node.SetEstimatedCardinality(lowest_card_found); |
598 | } |
599 | |
600 | } // namespace duckdb |
601 | |