1#include "duckdb/optimizer/remove_unused_columns.hpp"
2
3#include "duckdb/planner/expression/bound_aggregate_expression.hpp"
4#include "duckdb/planner/expression/bound_columnref_expression.hpp"
5#include "duckdb/planner/expression/bound_constant_expression.hpp"
6
7#include "duckdb/planner/operator/logical_aggregate.hpp"
8#include "duckdb/planner/operator/logical_comparison_join.hpp"
9#include "duckdb/planner/operator/logical_filter.hpp"
10#include "duckdb/planner/operator/logical_get.hpp"
11#include "duckdb/planner/operator/logical_projection.hpp"
12#include "duckdb/planner/column_binding_map.hpp"
13
14#include "duckdb/function/aggregate/distributive_functions.hpp"
15
16#include "duckdb/planner/expression_iterator.hpp"
17
18using namespace duckdb;
19using namespace std;
20
21void RemoveUnusedColumns::ReplaceBinding(ColumnBinding current_binding, ColumnBinding new_binding) {
22 auto colrefs = column_references.find(current_binding);
23 if (colrefs != column_references.end()) {
24 for (auto &colref : colrefs->second) {
25 assert(colref->binding == current_binding);
26 colref->binding = new_binding;
27 }
28 }
29}
30
31template <class T> void RemoveUnusedColumns::ClearUnusedExpressions(vector<T> &list, idx_t table_idx) {
32 idx_t offset = 0;
33 for (idx_t col_idx = 0; col_idx < list.size(); col_idx++) {
34 auto current_binding = ColumnBinding(table_idx, col_idx + offset);
35 auto entry = column_references.find(current_binding);
36 if (entry == column_references.end()) {
37 // this entry is not referred to, erase it from the set of expresisons
38 list.erase(list.begin() + col_idx);
39 offset++;
40 col_idx--;
41 } else if (offset > 0) {
42 // column is used but the ColumnBinding has changed because of removed columns
43 ReplaceBinding(current_binding, ColumnBinding(table_idx, col_idx));
44 }
45 }
46}
47
48void RemoveUnusedColumns::VisitOperator(LogicalOperator &op) {
49 switch (op.type) {
50 case LogicalOperatorType::AGGREGATE_AND_GROUP_BY: {
51 // aggregate
52 if (!everything_referenced) {
53 // FIXME: groups that are not referenced need to stay -> but they don't need to be scanned and output!
54 auto &aggr = (LogicalAggregate &)op;
55 ClearUnusedExpressions(aggr.expressions, aggr.aggregate_index);
56
57 if (aggr.expressions.size() == 0 && aggr.groups.size() == 0) {
58 // removed all expressions from the aggregate: push a COUNT(*)
59 aggr.expressions.push_back(
60 make_unique<BoundAggregateExpression>(TypeId::INT64, CountStarFun::GetFunction(), false));
61 }
62 }
63
64 // then recurse into the children of the aggregate
65 RemoveUnusedColumns remove;
66 remove.VisitOperatorExpressions(op);
67 remove.VisitOperator(*op.children[0]);
68 return;
69 }
70 case LogicalOperatorType::DELIM_JOIN:
71 case LogicalOperatorType::COMPARISON_JOIN: {
72 if (!everything_referenced) {
73 auto &comp_join = (LogicalComparisonJoin &)op;
74
75 if (comp_join.join_type != JoinType::INNER) {
76 break;
77 }
78 // for inner joins with equality predicates in the form of (X=Y)
79 // we can replace any references to the RHS (Y) to references to the LHS (X)
80 // this reduces the amount of columns we need to extract from the join hash table
81 for (auto &cond : comp_join.conditions) {
82 if (cond.comparison == ExpressionType::COMPARE_EQUAL) {
83 if (cond.left->expression_class == ExpressionClass::BOUND_COLUMN_REF &&
84 cond.right->expression_class == ExpressionClass::BOUND_COLUMN_REF) {
85 // comparison join between two bound column refs
86 // we can replace any reference to the RHS (build-side) with a reference to the LHS (probe-side)
87 auto &lhs_col = (BoundColumnRefExpression &)*cond.left;
88 auto &rhs_col = (BoundColumnRefExpression &)*cond.right;
89 // if there are any columns that refer to the RHS,
90 auto colrefs = column_references.find(rhs_col.binding);
91 if (colrefs != column_references.end()) {
92 for (auto &entry : colrefs->second) {
93 entry->binding = lhs_col.binding;
94 column_references[lhs_col.binding].push_back(entry);
95 }
96 column_references.erase(rhs_col.binding);
97 }
98 }
99 }
100 }
101 }
102 break;
103 }
104 case LogicalOperatorType::ANY_JOIN:
105 break;
106 case LogicalOperatorType::UNION:
107 case LogicalOperatorType::EXCEPT:
108 case LogicalOperatorType::INTERSECT:
109 // for set operations we don't remove anything, just recursively visit the children
110 // FIXME: for UNION we can remove unreferenced columns as long as everything_referenced is false (i.e. we
111 // encounter a UNION node that is not preceded by a DISTINCT)
112 for (auto &child : op.children) {
113 RemoveUnusedColumns remove(true);
114 remove.VisitOperator(*child);
115 }
116 return;
117 case LogicalOperatorType::PROJECTION: {
118 if (!everything_referenced) {
119 auto &proj = (LogicalProjection &)op;
120 ClearUnusedExpressions(proj.expressions, proj.table_index);
121
122 if (proj.expressions.size() == 0) {
123 // nothing references the projected expressions
124 // this happens in the case of e.g. EXISTS(SELECT * FROM ...)
125 // in this case we only need to project a single constant
126 proj.expressions.push_back(make_unique<BoundConstantExpression>(Value::INTEGER(42)));
127 }
128 }
129 // then recurse into the children of this projection
130 RemoveUnusedColumns remove;
131 remove.VisitOperatorExpressions(op);
132 remove.VisitOperator(*op.children[0]);
133 return;
134 }
135 case LogicalOperatorType::GET:
136 LogicalOperatorVisitor::VisitOperatorExpressions(op);
137 if (!everything_referenced) {
138 auto &get = (LogicalGet &)op;
139 // table scan: figure out which columns are referenced
140 ClearUnusedExpressions(get.column_ids, get.table_index);
141
142 if (get.column_ids.size() == 0) {
143 // this generally means we are only interested in whether or not anything exists in the table (e.g.
144 // EXISTS(SELECT * FROM tbl)) in this case, we just scan the row identifier column as it means we do not
145 // need to read any of the columns
146 get.column_ids.push_back(COLUMN_IDENTIFIER_ROW_ID);
147 }
148 }
149 return;
150 case LogicalOperatorType::DISTINCT: {
151 // distinct, all projected columns are used for the DISTINCT computation
152 // mark all columns as used and continue to the children
153 // FIXME: DISTINCT with expression list does not implicitly reference everything
154 everything_referenced = true;
155 break;
156 }
157 case LogicalOperatorType::RECURSIVE_CTE: {
158 everything_referenced = true;
159 break;
160 }
161 case LogicalOperatorType::CTE_REF: {
162 everything_referenced = true;
163 break;
164 }
165 default:
166 break;
167 }
168 LogicalOperatorVisitor::VisitOperatorExpressions(op);
169 LogicalOperatorVisitor::VisitOperatorChildren(op);
170}
171
172unique_ptr<Expression> RemoveUnusedColumns::VisitReplace(BoundColumnRefExpression &expr,
173 unique_ptr<Expression> *expr_ptr) {
174 // add a column reference
175 column_references[expr.binding].push_back(&expr);
176 return nullptr;
177}
178
179unique_ptr<Expression> RemoveUnusedColumns::VisitReplace(BoundReferenceExpression &expr,
180 unique_ptr<Expression> *expr_ptr) {
181 // BoundReferenceExpression should not be used here yet, they only belong in the physical plan
182 throw InternalException("BoundReferenceExpression should not be used here yet!");
183}
184