1 | #include "duckdb/optimizer/remove_unused_columns.hpp" |
2 | |
3 | #include "duckdb/function/aggregate/distributive_functions.hpp" |
4 | #include "duckdb/function/function_binder.hpp" |
5 | #include "duckdb/parser/parsed_data/vacuum_info.hpp" |
6 | #include "duckdb/planner/binder.hpp" |
7 | #include "duckdb/planner/column_binding_map.hpp" |
8 | #include "duckdb/planner/expression/bound_aggregate_expression.hpp" |
9 | #include "duckdb/planner/expression/bound_columnref_expression.hpp" |
10 | #include "duckdb/planner/expression/bound_constant_expression.hpp" |
11 | #include "duckdb/planner/expression_iterator.hpp" |
12 | #include "duckdb/planner/operator/logical_aggregate.hpp" |
13 | #include "duckdb/planner/operator/logical_comparison_join.hpp" |
14 | #include "duckdb/planner/operator/logical_filter.hpp" |
15 | #include "duckdb/planner/operator/logical_get.hpp" |
16 | #include "duckdb/planner/operator/logical_order.hpp" |
17 | #include "duckdb/planner/operator/logical_projection.hpp" |
18 | #include "duckdb/planner/operator/logical_set_operation.hpp" |
19 | #include "duckdb/planner/operator/logical_simple.hpp" |
20 | |
21 | namespace duckdb { |
22 | |
23 | void RemoveUnusedColumns::ReplaceBinding(ColumnBinding current_binding, ColumnBinding new_binding) { |
24 | auto colrefs = column_references.find(x: current_binding); |
25 | if (colrefs != column_references.end()) { |
26 | for (auto &colref : colrefs->second) { |
27 | D_ASSERT(colref->binding == current_binding); |
28 | colref->binding = new_binding; |
29 | } |
30 | } |
31 | } |
32 | |
33 | template <class T> |
34 | void RemoveUnusedColumns::ClearUnusedExpressions(vector<T> &list, idx_t table_idx, bool replace) { |
35 | idx_t offset = 0; |
36 | for (idx_t col_idx = 0; col_idx < list.size(); col_idx++) { |
37 | auto current_binding = ColumnBinding(table_idx, col_idx + offset); |
38 | auto entry = column_references.find(x: current_binding); |
39 | if (entry == column_references.end()) { |
40 | // this entry is not referred to, erase it from the set of expressions |
41 | list.erase(list.begin() + col_idx); |
42 | offset++; |
43 | col_idx--; |
44 | } else if (offset > 0 && replace) { |
45 | // column is used but the ColumnBinding has changed because of removed columns |
46 | ReplaceBinding(current_binding, new_binding: ColumnBinding(table_idx, col_idx)); |
47 | } |
48 | } |
49 | } |
50 | |
51 | void RemoveUnusedColumns::VisitOperator(LogicalOperator &op) { |
52 | switch (op.type) { |
53 | case LogicalOperatorType::LOGICAL_AGGREGATE_AND_GROUP_BY: { |
54 | // aggregate |
55 | if (!everything_referenced) { |
56 | // FIXME: groups that are not referenced need to stay -> but they don't need to be scanned and output! |
57 | auto &aggr = op.Cast<LogicalAggregate>(); |
58 | ClearUnusedExpressions(list&: aggr.expressions, table_idx: aggr.aggregate_index); |
59 | if (aggr.expressions.empty() && aggr.groups.empty()) { |
60 | // removed all expressions from the aggregate: push a COUNT(*) |
61 | auto count_star_fun = CountStarFun::GetFunction(); |
62 | FunctionBinder function_binder(context); |
63 | aggr.expressions.push_back( |
64 | x: function_binder.BindAggregateFunction(bound_function: count_star_fun, children: {}, filter: nullptr, aggr_type: AggregateType::NON_DISTINCT)); |
65 | } |
66 | } |
67 | |
68 | // then recurse into the children of the aggregate |
69 | RemoveUnusedColumns remove(binder, context); |
70 | remove.VisitOperatorExpressions(op); |
71 | remove.VisitOperator(op&: *op.children[0]); |
72 | return; |
73 | } |
74 | case LogicalOperatorType::LOGICAL_ASOF_JOIN: |
75 | case LogicalOperatorType::LOGICAL_DELIM_JOIN: |
76 | case LogicalOperatorType::LOGICAL_COMPARISON_JOIN: { |
77 | if (!everything_referenced) { |
78 | auto &comp_join = op.Cast<LogicalComparisonJoin>(); |
79 | |
80 | if (comp_join.join_type != JoinType::INNER) { |
81 | break; |
82 | } |
83 | // for inner joins with equality predicates in the form of (X=Y) |
84 | // we can replace any references to the RHS (Y) to references to the LHS (X) |
85 | // this reduces the amount of columns we need to extract from the join hash table |
86 | for (auto &cond : comp_join.conditions) { |
87 | if (cond.comparison == ExpressionType::COMPARE_EQUAL) { |
88 | if (cond.left->expression_class == ExpressionClass::BOUND_COLUMN_REF && |
89 | cond.right->expression_class == ExpressionClass::BOUND_COLUMN_REF) { |
90 | // comparison join between two bound column refs |
91 | // we can replace any reference to the RHS (build-side) with a reference to the LHS (probe-side) |
92 | auto &lhs_col = cond.left->Cast<BoundColumnRefExpression>(); |
93 | auto &rhs_col = cond.right->Cast<BoundColumnRefExpression>(); |
94 | // if there are any columns that refer to the RHS, |
95 | auto colrefs = column_references.find(x: rhs_col.binding); |
96 | if (colrefs != column_references.end()) { |
97 | for (auto &entry : colrefs->second) { |
98 | entry->binding = lhs_col.binding; |
99 | column_references[lhs_col.binding].push_back(x: entry); |
100 | } |
101 | column_references.erase(x: rhs_col.binding); |
102 | } |
103 | } |
104 | } |
105 | } |
106 | } |
107 | break; |
108 | } |
109 | case LogicalOperatorType::LOGICAL_ANY_JOIN: |
110 | break; |
111 | case LogicalOperatorType::LOGICAL_UNION: |
112 | if (!everything_referenced) { |
113 | // for UNION we can remove unreferenced columns as long as everything_referenced is false (i.e. we |
114 | // encounter a UNION node that is not preceded by a DISTINCT) |
115 | // this happens when UNION ALL is used |
116 | auto &setop = op.Cast<LogicalSetOperation>(); |
117 | vector<idx_t> entries; |
118 | for (idx_t i = 0; i < setop.column_count; i++) { |
119 | entries.push_back(x: i); |
120 | } |
121 | ClearUnusedExpressions(list&: entries, table_idx: setop.table_index); |
122 | if (entries.size() < setop.column_count) { |
123 | if (entries.empty()) { |
124 | // no columns referenced: this happens in the case of a COUNT(*) |
125 | // extract the first column |
126 | entries.push_back(x: 0); |
127 | } |
128 | // columns were cleared |
129 | setop.column_count = entries.size(); |
130 | |
131 | for (idx_t child_idx = 0; child_idx < op.children.size(); child_idx++) { |
132 | RemoveUnusedColumns remove(binder, context, true); |
133 | auto &child = op.children[child_idx]; |
134 | |
135 | // we push a projection under this child that references the required columns of the union |
136 | child->ResolveOperatorTypes(); |
137 | auto bindings = child->GetColumnBindings(); |
138 | vector<unique_ptr<Expression>> expressions; |
139 | expressions.reserve(n: entries.size()); |
140 | for (auto &column_idx : entries) { |
141 | expressions.push_back( |
142 | x: make_uniq<BoundColumnRefExpression>(args&: child->types[column_idx], args&: bindings[column_idx])); |
143 | } |
144 | auto new_projection = |
145 | make_uniq<LogicalProjection>(args: binder.GenerateTableIndex(), args: std::move(expressions)); |
146 | new_projection->children.push_back(x: std::move(child)); |
147 | op.children[child_idx] = std::move(new_projection); |
148 | |
149 | remove.VisitOperator(op&: *op.children[child_idx]); |
150 | } |
151 | return; |
152 | } |
153 | } |
154 | for (auto &child : op.children) { |
155 | RemoveUnusedColumns remove(binder, context, true); |
156 | remove.VisitOperator(op&: *child); |
157 | } |
158 | return; |
159 | case LogicalOperatorType::LOGICAL_EXCEPT: |
160 | case LogicalOperatorType::LOGICAL_INTERSECT: |
161 | // for INTERSECT/EXCEPT operations we can't remove anything, just recursively visit the children |
162 | for (auto &child : op.children) { |
163 | RemoveUnusedColumns remove(binder, context, true); |
164 | remove.VisitOperator(op&: *child); |
165 | } |
166 | return; |
167 | case LogicalOperatorType::LOGICAL_ORDER_BY: |
168 | if (!everything_referenced) { |
169 | auto &order = op.Cast<LogicalOrder>(); |
170 | D_ASSERT(order.projections.empty()); // should not yet be set |
171 | const auto all_bindings = order.GetColumnBindings(); |
172 | |
173 | for (idx_t col_idx = 0; col_idx < all_bindings.size(); col_idx++) { |
174 | if (column_references.find(x: all_bindings[col_idx]) != column_references.end()) { |
175 | order.projections.push_back(x: col_idx); |
176 | } |
177 | } |
178 | } |
179 | for (auto &child : op.children) { |
180 | RemoveUnusedColumns remove(binder, context, true); |
181 | remove.VisitOperator(op&: *child); |
182 | } |
183 | return; |
184 | case LogicalOperatorType::LOGICAL_PROJECTION: { |
185 | if (!everything_referenced) { |
186 | auto &proj = op.Cast<LogicalProjection>(); |
187 | ClearUnusedExpressions(list&: proj.expressions, table_idx: proj.table_index); |
188 | |
189 | if (proj.expressions.empty()) { |
190 | // nothing references the projected expressions |
191 | // this happens in the case of e.g. EXISTS(SELECT * FROM ...) |
192 | // in this case we only need to project a single constant |
193 | proj.expressions.push_back(x: make_uniq<BoundConstantExpression>(args: Value::INTEGER(value: 42))); |
194 | } |
195 | } |
196 | // then recurse into the children of this projection |
197 | RemoveUnusedColumns remove(binder, context); |
198 | remove.VisitOperatorExpressions(op); |
199 | remove.VisitOperator(op&: *op.children[0]); |
200 | return; |
201 | } |
202 | case LogicalOperatorType::LOGICAL_INSERT: |
203 | case LogicalOperatorType::LOGICAL_UPDATE: |
204 | case LogicalOperatorType::LOGICAL_DELETE: { |
205 | //! When RETURNING is used, a PROJECTION is the top level operator for INSERTS, UPDATES, and DELETES |
206 | //! We still need to project all values from these operators so the projection |
207 | //! on top of them can select from only the table values being inserted. |
208 | //! TODO: Push down the projections from the returning statement |
209 | //! TODO: Be careful because you might be adding expressions when a user returns * |
210 | RemoveUnusedColumns remove(binder, context, true); |
211 | remove.VisitOperatorExpressions(op); |
212 | remove.VisitOperator(op&: *op.children[0]); |
213 | return; |
214 | } |
215 | case LogicalOperatorType::LOGICAL_GET: |
216 | LogicalOperatorVisitor::VisitOperatorExpressions(op); |
217 | if (!everything_referenced) { |
218 | auto &get = op.Cast<LogicalGet>(); |
219 | if (!get.function.projection_pushdown) { |
220 | return; |
221 | } |
222 | |
223 | // Create "selection vector" of all column ids |
224 | vector<idx_t> proj_sel; |
225 | for (idx_t col_idx = 0; col_idx < get.column_ids.size(); col_idx++) { |
226 | proj_sel.push_back(x: col_idx); |
227 | } |
228 | // Create a copy that we can use to match ids later |
229 | auto col_sel = proj_sel; |
230 | // Clear unused ids, exclude filter columns that are projected out immediately |
231 | ClearUnusedExpressions(list&: proj_sel, table_idx: get.table_index, replace: false); |
232 | |
233 | // for every table filter, push a column binding into the column references map to prevent the column from |
234 | // being projected out |
235 | for (auto &filter : get.table_filters.filters) { |
236 | idx_t index = DConstants::INVALID_INDEX; |
237 | for (idx_t i = 0; i < get.column_ids.size(); i++) { |
238 | if (get.column_ids[i] == filter.first) { |
239 | index = i; |
240 | break; |
241 | } |
242 | } |
243 | if (index == DConstants::INVALID_INDEX) { |
244 | throw InternalException("Could not find column index for table filter" ); |
245 | } |
246 | ColumnBinding filter_binding(get.table_index, index); |
247 | if (column_references.find(x: filter_binding) == column_references.end()) { |
248 | column_references.insert(x: make_pair(x&: filter_binding, y: vector<BoundColumnRefExpression *>())); |
249 | } |
250 | } |
251 | |
252 | // Clear unused ids, include filter columns that are projected out immediately |
253 | ClearUnusedExpressions(list&: col_sel, table_idx: get.table_index); |
254 | |
255 | // Now set the column ids in the LogicalGet using the "selection vector" |
256 | vector<column_t> column_ids; |
257 | column_ids.reserve(n: col_sel.size()); |
258 | for (auto col_sel_idx : col_sel) { |
259 | column_ids.push_back(x: get.column_ids[col_sel_idx]); |
260 | } |
261 | get.column_ids = std::move(column_ids); |
262 | |
263 | if (get.function.filter_prune) { |
264 | // Now set the projection cols by matching the "selection vector" that excludes filter columns |
265 | // with the "selection vector" that includes filter columns |
266 | idx_t col_idx = 0; |
267 | for (auto proj_sel_idx : proj_sel) { |
268 | for (; col_idx < col_sel.size(); col_idx++) { |
269 | if (proj_sel_idx == col_sel[col_idx]) { |
270 | get.projection_ids.push_back(x: col_idx); |
271 | break; |
272 | } |
273 | } |
274 | } |
275 | } |
276 | |
277 | if (get.column_ids.empty()) { |
278 | // this generally means we are only interested in whether or not anything exists in the table (e.g. |
279 | // EXISTS(SELECT * FROM tbl)) in this case, we just scan the row identifier column as it means we do not |
280 | // need to read any of the columns |
281 | get.column_ids.push_back(x: COLUMN_IDENTIFIER_ROW_ID); |
282 | } |
283 | } |
284 | return; |
285 | case LogicalOperatorType::LOGICAL_FILTER: { |
286 | auto &filter = op.Cast<LogicalFilter>(); |
287 | if (!filter.projection_map.empty()) { |
288 | // if we have any entries in the filter projection map don't prune any columns |
289 | // FIXME: we can do something more clever here |
290 | everything_referenced = true; |
291 | } |
292 | break; |
293 | } |
294 | case LogicalOperatorType::LOGICAL_DISTINCT: { |
295 | // distinct, all projected columns are used for the DISTINCT computation |
296 | // mark all columns as used and continue to the children |
297 | // FIXME: DISTINCT with expression list does not implicitly reference everything |
298 | everything_referenced = true; |
299 | break; |
300 | } |
301 | case LogicalOperatorType::LOGICAL_RECURSIVE_CTE: { |
302 | everything_referenced = true; |
303 | break; |
304 | } |
305 | case LogicalOperatorType::LOGICAL_CTE_REF: { |
306 | everything_referenced = true; |
307 | break; |
308 | } |
309 | case LogicalOperatorType::LOGICAL_PIVOT: { |
310 | everything_referenced = true; |
311 | break; |
312 | } |
313 | default: |
314 | break; |
315 | } |
316 | LogicalOperatorVisitor::VisitOperatorExpressions(op); |
317 | LogicalOperatorVisitor::VisitOperatorChildren(op); |
318 | } |
319 | |
320 | unique_ptr<Expression> RemoveUnusedColumns::VisitReplace(BoundColumnRefExpression &expr, |
321 | unique_ptr<Expression> *expr_ptr) { |
322 | // add a column reference |
323 | column_references[expr.binding].push_back(x: &expr); |
324 | return nullptr; |
325 | } |
326 | |
327 | unique_ptr<Expression> RemoveUnusedColumns::VisitReplace(BoundReferenceExpression &expr, |
328 | unique_ptr<Expression> *expr_ptr) { |
329 | // BoundReferenceExpression should not be used here yet, they only belong in the physical plan |
330 | throw InternalException("BoundReferenceExpression should not be used here yet!" ); |
331 | } |
332 | |
333 | } // namespace duckdb |
334 | |