1#include "duckdb/optimizer/remove_unused_columns.hpp"
2
3#include "duckdb/function/aggregate/distributive_functions.hpp"
4#include "duckdb/function/function_binder.hpp"
5#include "duckdb/parser/parsed_data/vacuum_info.hpp"
6#include "duckdb/planner/binder.hpp"
7#include "duckdb/planner/column_binding_map.hpp"
8#include "duckdb/planner/expression/bound_aggregate_expression.hpp"
9#include "duckdb/planner/expression/bound_columnref_expression.hpp"
10#include "duckdb/planner/expression/bound_constant_expression.hpp"
11#include "duckdb/planner/expression_iterator.hpp"
12#include "duckdb/planner/operator/logical_aggregate.hpp"
13#include "duckdb/planner/operator/logical_comparison_join.hpp"
14#include "duckdb/planner/operator/logical_filter.hpp"
15#include "duckdb/planner/operator/logical_get.hpp"
16#include "duckdb/planner/operator/logical_order.hpp"
17#include "duckdb/planner/operator/logical_projection.hpp"
18#include "duckdb/planner/operator/logical_set_operation.hpp"
19#include "duckdb/planner/operator/logical_simple.hpp"
20
21namespace duckdb {
22
23void RemoveUnusedColumns::ReplaceBinding(ColumnBinding current_binding, ColumnBinding new_binding) {
24 auto colrefs = column_references.find(x: current_binding);
25 if (colrefs != column_references.end()) {
26 for (auto &colref : colrefs->second) {
27 D_ASSERT(colref->binding == current_binding);
28 colref->binding = new_binding;
29 }
30 }
31}
32
33template <class T>
34void RemoveUnusedColumns::ClearUnusedExpressions(vector<T> &list, idx_t table_idx, bool replace) {
35 idx_t offset = 0;
36 for (idx_t col_idx = 0; col_idx < list.size(); col_idx++) {
37 auto current_binding = ColumnBinding(table_idx, col_idx + offset);
38 auto entry = column_references.find(x: current_binding);
39 if (entry == column_references.end()) {
40 // this entry is not referred to, erase it from the set of expressions
41 list.erase(list.begin() + col_idx);
42 offset++;
43 col_idx--;
44 } else if (offset > 0 && replace) {
45 // column is used but the ColumnBinding has changed because of removed columns
46 ReplaceBinding(current_binding, new_binding: ColumnBinding(table_idx, col_idx));
47 }
48 }
49}
50
51void RemoveUnusedColumns::VisitOperator(LogicalOperator &op) {
52 switch (op.type) {
53 case LogicalOperatorType::LOGICAL_AGGREGATE_AND_GROUP_BY: {
54 // aggregate
55 if (!everything_referenced) {
56 // FIXME: groups that are not referenced need to stay -> but they don't need to be scanned and output!
57 auto &aggr = op.Cast<LogicalAggregate>();
58 ClearUnusedExpressions(list&: aggr.expressions, table_idx: aggr.aggregate_index);
59 if (aggr.expressions.empty() && aggr.groups.empty()) {
60 // removed all expressions from the aggregate: push a COUNT(*)
61 auto count_star_fun = CountStarFun::GetFunction();
62 FunctionBinder function_binder(context);
63 aggr.expressions.push_back(
64 x: function_binder.BindAggregateFunction(bound_function: count_star_fun, children: {}, filter: nullptr, aggr_type: AggregateType::NON_DISTINCT));
65 }
66 }
67
68 // then recurse into the children of the aggregate
69 RemoveUnusedColumns remove(binder, context);
70 remove.VisitOperatorExpressions(op);
71 remove.VisitOperator(op&: *op.children[0]);
72 return;
73 }
74 case LogicalOperatorType::LOGICAL_ASOF_JOIN:
75 case LogicalOperatorType::LOGICAL_DELIM_JOIN:
76 case LogicalOperatorType::LOGICAL_COMPARISON_JOIN: {
77 if (!everything_referenced) {
78 auto &comp_join = op.Cast<LogicalComparisonJoin>();
79
80 if (comp_join.join_type != JoinType::INNER) {
81 break;
82 }
83 // for inner joins with equality predicates in the form of (X=Y)
84 // we can replace any references to the RHS (Y) to references to the LHS (X)
85 // this reduces the amount of columns we need to extract from the join hash table
86 for (auto &cond : comp_join.conditions) {
87 if (cond.comparison == ExpressionType::COMPARE_EQUAL) {
88 if (cond.left->expression_class == ExpressionClass::BOUND_COLUMN_REF &&
89 cond.right->expression_class == ExpressionClass::BOUND_COLUMN_REF) {
90 // comparison join between two bound column refs
91 // we can replace any reference to the RHS (build-side) with a reference to the LHS (probe-side)
92 auto &lhs_col = cond.left->Cast<BoundColumnRefExpression>();
93 auto &rhs_col = cond.right->Cast<BoundColumnRefExpression>();
94 // if there are any columns that refer to the RHS,
95 auto colrefs = column_references.find(x: rhs_col.binding);
96 if (colrefs != column_references.end()) {
97 for (auto &entry : colrefs->second) {
98 entry->binding = lhs_col.binding;
99 column_references[lhs_col.binding].push_back(x: entry);
100 }
101 column_references.erase(x: rhs_col.binding);
102 }
103 }
104 }
105 }
106 }
107 break;
108 }
109 case LogicalOperatorType::LOGICAL_ANY_JOIN:
110 break;
111 case LogicalOperatorType::LOGICAL_UNION:
112 if (!everything_referenced) {
113 // for UNION we can remove unreferenced columns as long as everything_referenced is false (i.e. we
114 // encounter a UNION node that is not preceded by a DISTINCT)
115 // this happens when UNION ALL is used
116 auto &setop = op.Cast<LogicalSetOperation>();
117 vector<idx_t> entries;
118 for (idx_t i = 0; i < setop.column_count; i++) {
119 entries.push_back(x: i);
120 }
121 ClearUnusedExpressions(list&: entries, table_idx: setop.table_index);
122 if (entries.size() < setop.column_count) {
123 if (entries.empty()) {
124 // no columns referenced: this happens in the case of a COUNT(*)
125 // extract the first column
126 entries.push_back(x: 0);
127 }
128 // columns were cleared
129 setop.column_count = entries.size();
130
131 for (idx_t child_idx = 0; child_idx < op.children.size(); child_idx++) {
132 RemoveUnusedColumns remove(binder, context, true);
133 auto &child = op.children[child_idx];
134
135 // we push a projection under this child that references the required columns of the union
136 child->ResolveOperatorTypes();
137 auto bindings = child->GetColumnBindings();
138 vector<unique_ptr<Expression>> expressions;
139 expressions.reserve(n: entries.size());
140 for (auto &column_idx : entries) {
141 expressions.push_back(
142 x: make_uniq<BoundColumnRefExpression>(args&: child->types[column_idx], args&: bindings[column_idx]));
143 }
144 auto new_projection =
145 make_uniq<LogicalProjection>(args: binder.GenerateTableIndex(), args: std::move(expressions));
146 new_projection->children.push_back(x: std::move(child));
147 op.children[child_idx] = std::move(new_projection);
148
149 remove.VisitOperator(op&: *op.children[child_idx]);
150 }
151 return;
152 }
153 }
154 for (auto &child : op.children) {
155 RemoveUnusedColumns remove(binder, context, true);
156 remove.VisitOperator(op&: *child);
157 }
158 return;
159 case LogicalOperatorType::LOGICAL_EXCEPT:
160 case LogicalOperatorType::LOGICAL_INTERSECT:
161 // for INTERSECT/EXCEPT operations we can't remove anything, just recursively visit the children
162 for (auto &child : op.children) {
163 RemoveUnusedColumns remove(binder, context, true);
164 remove.VisitOperator(op&: *child);
165 }
166 return;
167 case LogicalOperatorType::LOGICAL_ORDER_BY:
168 if (!everything_referenced) {
169 auto &order = op.Cast<LogicalOrder>();
170 D_ASSERT(order.projections.empty()); // should not yet be set
171 const auto all_bindings = order.GetColumnBindings();
172
173 for (idx_t col_idx = 0; col_idx < all_bindings.size(); col_idx++) {
174 if (column_references.find(x: all_bindings[col_idx]) != column_references.end()) {
175 order.projections.push_back(x: col_idx);
176 }
177 }
178 }
179 for (auto &child : op.children) {
180 RemoveUnusedColumns remove(binder, context, true);
181 remove.VisitOperator(op&: *child);
182 }
183 return;
184 case LogicalOperatorType::LOGICAL_PROJECTION: {
185 if (!everything_referenced) {
186 auto &proj = op.Cast<LogicalProjection>();
187 ClearUnusedExpressions(list&: proj.expressions, table_idx: proj.table_index);
188
189 if (proj.expressions.empty()) {
190 // nothing references the projected expressions
191 // this happens in the case of e.g. EXISTS(SELECT * FROM ...)
192 // in this case we only need to project a single constant
193 proj.expressions.push_back(x: make_uniq<BoundConstantExpression>(args: Value::INTEGER(value: 42)));
194 }
195 }
196 // then recurse into the children of this projection
197 RemoveUnusedColumns remove(binder, context);
198 remove.VisitOperatorExpressions(op);
199 remove.VisitOperator(op&: *op.children[0]);
200 return;
201 }
202 case LogicalOperatorType::LOGICAL_INSERT:
203 case LogicalOperatorType::LOGICAL_UPDATE:
204 case LogicalOperatorType::LOGICAL_DELETE: {
205 //! When RETURNING is used, a PROJECTION is the top level operator for INSERTS, UPDATES, and DELETES
206 //! We still need to project all values from these operators so the projection
207 //! on top of them can select from only the table values being inserted.
208 //! TODO: Push down the projections from the returning statement
209 //! TODO: Be careful because you might be adding expressions when a user returns *
210 RemoveUnusedColumns remove(binder, context, true);
211 remove.VisitOperatorExpressions(op);
212 remove.VisitOperator(op&: *op.children[0]);
213 return;
214 }
215 case LogicalOperatorType::LOGICAL_GET:
216 LogicalOperatorVisitor::VisitOperatorExpressions(op);
217 if (!everything_referenced) {
218 auto &get = op.Cast<LogicalGet>();
219 if (!get.function.projection_pushdown) {
220 return;
221 }
222
223 // Create "selection vector" of all column ids
224 vector<idx_t> proj_sel;
225 for (idx_t col_idx = 0; col_idx < get.column_ids.size(); col_idx++) {
226 proj_sel.push_back(x: col_idx);
227 }
228 // Create a copy that we can use to match ids later
229 auto col_sel = proj_sel;
230 // Clear unused ids, exclude filter columns that are projected out immediately
231 ClearUnusedExpressions(list&: proj_sel, table_idx: get.table_index, replace: false);
232
233 // for every table filter, push a column binding into the column references map to prevent the column from
234 // being projected out
235 for (auto &filter : get.table_filters.filters) {
236 idx_t index = DConstants::INVALID_INDEX;
237 for (idx_t i = 0; i < get.column_ids.size(); i++) {
238 if (get.column_ids[i] == filter.first) {
239 index = i;
240 break;
241 }
242 }
243 if (index == DConstants::INVALID_INDEX) {
244 throw InternalException("Could not find column index for table filter");
245 }
246 ColumnBinding filter_binding(get.table_index, index);
247 if (column_references.find(x: filter_binding) == column_references.end()) {
248 column_references.insert(x: make_pair(x&: filter_binding, y: vector<BoundColumnRefExpression *>()));
249 }
250 }
251
252 // Clear unused ids, include filter columns that are projected out immediately
253 ClearUnusedExpressions(list&: col_sel, table_idx: get.table_index);
254
255 // Now set the column ids in the LogicalGet using the "selection vector"
256 vector<column_t> column_ids;
257 column_ids.reserve(n: col_sel.size());
258 for (auto col_sel_idx : col_sel) {
259 column_ids.push_back(x: get.column_ids[col_sel_idx]);
260 }
261 get.column_ids = std::move(column_ids);
262
263 if (get.function.filter_prune) {
264 // Now set the projection cols by matching the "selection vector" that excludes filter columns
265 // with the "selection vector" that includes filter columns
266 idx_t col_idx = 0;
267 for (auto proj_sel_idx : proj_sel) {
268 for (; col_idx < col_sel.size(); col_idx++) {
269 if (proj_sel_idx == col_sel[col_idx]) {
270 get.projection_ids.push_back(x: col_idx);
271 break;
272 }
273 }
274 }
275 }
276
277 if (get.column_ids.empty()) {
278 // this generally means we are only interested in whether or not anything exists in the table (e.g.
279 // EXISTS(SELECT * FROM tbl)) in this case, we just scan the row identifier column as it means we do not
280 // need to read any of the columns
281 get.column_ids.push_back(x: COLUMN_IDENTIFIER_ROW_ID);
282 }
283 }
284 return;
285 case LogicalOperatorType::LOGICAL_FILTER: {
286 auto &filter = op.Cast<LogicalFilter>();
287 if (!filter.projection_map.empty()) {
288 // if we have any entries in the filter projection map don't prune any columns
289 // FIXME: we can do something more clever here
290 everything_referenced = true;
291 }
292 break;
293 }
294 case LogicalOperatorType::LOGICAL_DISTINCT: {
295 // distinct, all projected columns are used for the DISTINCT computation
296 // mark all columns as used and continue to the children
297 // FIXME: DISTINCT with expression list does not implicitly reference everything
298 everything_referenced = true;
299 break;
300 }
301 case LogicalOperatorType::LOGICAL_RECURSIVE_CTE: {
302 everything_referenced = true;
303 break;
304 }
305 case LogicalOperatorType::LOGICAL_CTE_REF: {
306 everything_referenced = true;
307 break;
308 }
309 case LogicalOperatorType::LOGICAL_PIVOT: {
310 everything_referenced = true;
311 break;
312 }
313 default:
314 break;
315 }
316 LogicalOperatorVisitor::VisitOperatorExpressions(op);
317 LogicalOperatorVisitor::VisitOperatorChildren(op);
318}
319
320unique_ptr<Expression> RemoveUnusedColumns::VisitReplace(BoundColumnRefExpression &expr,
321 unique_ptr<Expression> *expr_ptr) {
322 // add a column reference
323 column_references[expr.binding].push_back(x: &expr);
324 return nullptr;
325}
326
327unique_ptr<Expression> RemoveUnusedColumns::VisitReplace(BoundReferenceExpression &expr,
328 unique_ptr<Expression> *expr_ptr) {
329 // BoundReferenceExpression should not be used here yet, they only belong in the physical plan
330 throw InternalException("BoundReferenceExpression should not be used here yet!");
331}
332
333} // namespace duckdb
334