| 1 | #include "duckdb/optimizer/remove_unused_columns.hpp" |
| 2 | |
| 3 | #include "duckdb/function/aggregate/distributive_functions.hpp" |
| 4 | #include "duckdb/function/function_binder.hpp" |
| 5 | #include "duckdb/parser/parsed_data/vacuum_info.hpp" |
| 6 | #include "duckdb/planner/binder.hpp" |
| 7 | #include "duckdb/planner/column_binding_map.hpp" |
| 8 | #include "duckdb/planner/expression/bound_aggregate_expression.hpp" |
| 9 | #include "duckdb/planner/expression/bound_columnref_expression.hpp" |
| 10 | #include "duckdb/planner/expression/bound_constant_expression.hpp" |
| 11 | #include "duckdb/planner/expression_iterator.hpp" |
| 12 | #include "duckdb/planner/operator/logical_aggregate.hpp" |
| 13 | #include "duckdb/planner/operator/logical_comparison_join.hpp" |
| 14 | #include "duckdb/planner/operator/logical_filter.hpp" |
| 15 | #include "duckdb/planner/operator/logical_get.hpp" |
| 16 | #include "duckdb/planner/operator/logical_order.hpp" |
| 17 | #include "duckdb/planner/operator/logical_projection.hpp" |
| 18 | #include "duckdb/planner/operator/logical_set_operation.hpp" |
| 19 | #include "duckdb/planner/operator/logical_simple.hpp" |
| 20 | |
| 21 | namespace duckdb { |
| 22 | |
| 23 | void RemoveUnusedColumns::ReplaceBinding(ColumnBinding current_binding, ColumnBinding new_binding) { |
| 24 | auto colrefs = column_references.find(x: current_binding); |
| 25 | if (colrefs != column_references.end()) { |
| 26 | for (auto &colref : colrefs->second) { |
| 27 | D_ASSERT(colref->binding == current_binding); |
| 28 | colref->binding = new_binding; |
| 29 | } |
| 30 | } |
| 31 | } |
| 32 | |
| 33 | template <class T> |
| 34 | void RemoveUnusedColumns::ClearUnusedExpressions(vector<T> &list, idx_t table_idx, bool replace) { |
| 35 | idx_t offset = 0; |
| 36 | for (idx_t col_idx = 0; col_idx < list.size(); col_idx++) { |
| 37 | auto current_binding = ColumnBinding(table_idx, col_idx + offset); |
| 38 | auto entry = column_references.find(x: current_binding); |
| 39 | if (entry == column_references.end()) { |
| 40 | // this entry is not referred to, erase it from the set of expressions |
| 41 | list.erase(list.begin() + col_idx); |
| 42 | offset++; |
| 43 | col_idx--; |
| 44 | } else if (offset > 0 && replace) { |
| 45 | // column is used but the ColumnBinding has changed because of removed columns |
| 46 | ReplaceBinding(current_binding, new_binding: ColumnBinding(table_idx, col_idx)); |
| 47 | } |
| 48 | } |
| 49 | } |
| 50 | |
| 51 | void RemoveUnusedColumns::VisitOperator(LogicalOperator &op) { |
| 52 | switch (op.type) { |
| 53 | case LogicalOperatorType::LOGICAL_AGGREGATE_AND_GROUP_BY: { |
| 54 | // aggregate |
| 55 | if (!everything_referenced) { |
| 56 | // FIXME: groups that are not referenced need to stay -> but they don't need to be scanned and output! |
| 57 | auto &aggr = op.Cast<LogicalAggregate>(); |
| 58 | ClearUnusedExpressions(list&: aggr.expressions, table_idx: aggr.aggregate_index); |
| 59 | if (aggr.expressions.empty() && aggr.groups.empty()) { |
| 60 | // removed all expressions from the aggregate: push a COUNT(*) |
| 61 | auto count_star_fun = CountStarFun::GetFunction(); |
| 62 | FunctionBinder function_binder(context); |
| 63 | aggr.expressions.push_back( |
| 64 | x: function_binder.BindAggregateFunction(bound_function: count_star_fun, children: {}, filter: nullptr, aggr_type: AggregateType::NON_DISTINCT)); |
| 65 | } |
| 66 | } |
| 67 | |
| 68 | // then recurse into the children of the aggregate |
| 69 | RemoveUnusedColumns remove(binder, context); |
| 70 | remove.VisitOperatorExpressions(op); |
| 71 | remove.VisitOperator(op&: *op.children[0]); |
| 72 | return; |
| 73 | } |
| 74 | case LogicalOperatorType::LOGICAL_ASOF_JOIN: |
| 75 | case LogicalOperatorType::LOGICAL_DELIM_JOIN: |
| 76 | case LogicalOperatorType::LOGICAL_COMPARISON_JOIN: { |
| 77 | if (!everything_referenced) { |
| 78 | auto &comp_join = op.Cast<LogicalComparisonJoin>(); |
| 79 | |
| 80 | if (comp_join.join_type != JoinType::INNER) { |
| 81 | break; |
| 82 | } |
| 83 | // for inner joins with equality predicates in the form of (X=Y) |
| 84 | // we can replace any references to the RHS (Y) to references to the LHS (X) |
| 85 | // this reduces the amount of columns we need to extract from the join hash table |
| 86 | for (auto &cond : comp_join.conditions) { |
| 87 | if (cond.comparison == ExpressionType::COMPARE_EQUAL) { |
| 88 | if (cond.left->expression_class == ExpressionClass::BOUND_COLUMN_REF && |
| 89 | cond.right->expression_class == ExpressionClass::BOUND_COLUMN_REF) { |
| 90 | // comparison join between two bound column refs |
| 91 | // we can replace any reference to the RHS (build-side) with a reference to the LHS (probe-side) |
| 92 | auto &lhs_col = cond.left->Cast<BoundColumnRefExpression>(); |
| 93 | auto &rhs_col = cond.right->Cast<BoundColumnRefExpression>(); |
| 94 | // if there are any columns that refer to the RHS, |
| 95 | auto colrefs = column_references.find(x: rhs_col.binding); |
| 96 | if (colrefs != column_references.end()) { |
| 97 | for (auto &entry : colrefs->second) { |
| 98 | entry->binding = lhs_col.binding; |
| 99 | column_references[lhs_col.binding].push_back(x: entry); |
| 100 | } |
| 101 | column_references.erase(x: rhs_col.binding); |
| 102 | } |
| 103 | } |
| 104 | } |
| 105 | } |
| 106 | } |
| 107 | break; |
| 108 | } |
| 109 | case LogicalOperatorType::LOGICAL_ANY_JOIN: |
| 110 | break; |
| 111 | case LogicalOperatorType::LOGICAL_UNION: |
| 112 | if (!everything_referenced) { |
| 113 | // for UNION we can remove unreferenced columns as long as everything_referenced is false (i.e. we |
| 114 | // encounter a UNION node that is not preceded by a DISTINCT) |
| 115 | // this happens when UNION ALL is used |
| 116 | auto &setop = op.Cast<LogicalSetOperation>(); |
| 117 | vector<idx_t> entries; |
| 118 | for (idx_t i = 0; i < setop.column_count; i++) { |
| 119 | entries.push_back(x: i); |
| 120 | } |
| 121 | ClearUnusedExpressions(list&: entries, table_idx: setop.table_index); |
| 122 | if (entries.size() < setop.column_count) { |
| 123 | if (entries.empty()) { |
| 124 | // no columns referenced: this happens in the case of a COUNT(*) |
| 125 | // extract the first column |
| 126 | entries.push_back(x: 0); |
| 127 | } |
| 128 | // columns were cleared |
| 129 | setop.column_count = entries.size(); |
| 130 | |
| 131 | for (idx_t child_idx = 0; child_idx < op.children.size(); child_idx++) { |
| 132 | RemoveUnusedColumns remove(binder, context, true); |
| 133 | auto &child = op.children[child_idx]; |
| 134 | |
| 135 | // we push a projection under this child that references the required columns of the union |
| 136 | child->ResolveOperatorTypes(); |
| 137 | auto bindings = child->GetColumnBindings(); |
| 138 | vector<unique_ptr<Expression>> expressions; |
| 139 | expressions.reserve(n: entries.size()); |
| 140 | for (auto &column_idx : entries) { |
| 141 | expressions.push_back( |
| 142 | x: make_uniq<BoundColumnRefExpression>(args&: child->types[column_idx], args&: bindings[column_idx])); |
| 143 | } |
| 144 | auto new_projection = |
| 145 | make_uniq<LogicalProjection>(args: binder.GenerateTableIndex(), args: std::move(expressions)); |
| 146 | new_projection->children.push_back(x: std::move(child)); |
| 147 | op.children[child_idx] = std::move(new_projection); |
| 148 | |
| 149 | remove.VisitOperator(op&: *op.children[child_idx]); |
| 150 | } |
| 151 | return; |
| 152 | } |
| 153 | } |
| 154 | for (auto &child : op.children) { |
| 155 | RemoveUnusedColumns remove(binder, context, true); |
| 156 | remove.VisitOperator(op&: *child); |
| 157 | } |
| 158 | return; |
| 159 | case LogicalOperatorType::LOGICAL_EXCEPT: |
| 160 | case LogicalOperatorType::LOGICAL_INTERSECT: |
| 161 | // for INTERSECT/EXCEPT operations we can't remove anything, just recursively visit the children |
| 162 | for (auto &child : op.children) { |
| 163 | RemoveUnusedColumns remove(binder, context, true); |
| 164 | remove.VisitOperator(op&: *child); |
| 165 | } |
| 166 | return; |
| 167 | case LogicalOperatorType::LOGICAL_ORDER_BY: |
| 168 | if (!everything_referenced) { |
| 169 | auto &order = op.Cast<LogicalOrder>(); |
| 170 | D_ASSERT(order.projections.empty()); // should not yet be set |
| 171 | const auto all_bindings = order.GetColumnBindings(); |
| 172 | |
| 173 | for (idx_t col_idx = 0; col_idx < all_bindings.size(); col_idx++) { |
| 174 | if (column_references.find(x: all_bindings[col_idx]) != column_references.end()) { |
| 175 | order.projections.push_back(x: col_idx); |
| 176 | } |
| 177 | } |
| 178 | } |
| 179 | for (auto &child : op.children) { |
| 180 | RemoveUnusedColumns remove(binder, context, true); |
| 181 | remove.VisitOperator(op&: *child); |
| 182 | } |
| 183 | return; |
| 184 | case LogicalOperatorType::LOGICAL_PROJECTION: { |
| 185 | if (!everything_referenced) { |
| 186 | auto &proj = op.Cast<LogicalProjection>(); |
| 187 | ClearUnusedExpressions(list&: proj.expressions, table_idx: proj.table_index); |
| 188 | |
| 189 | if (proj.expressions.empty()) { |
| 190 | // nothing references the projected expressions |
| 191 | // this happens in the case of e.g. EXISTS(SELECT * FROM ...) |
| 192 | // in this case we only need to project a single constant |
| 193 | proj.expressions.push_back(x: make_uniq<BoundConstantExpression>(args: Value::INTEGER(value: 42))); |
| 194 | } |
| 195 | } |
| 196 | // then recurse into the children of this projection |
| 197 | RemoveUnusedColumns remove(binder, context); |
| 198 | remove.VisitOperatorExpressions(op); |
| 199 | remove.VisitOperator(op&: *op.children[0]); |
| 200 | return; |
| 201 | } |
| 202 | case LogicalOperatorType::LOGICAL_INSERT: |
| 203 | case LogicalOperatorType::LOGICAL_UPDATE: |
| 204 | case LogicalOperatorType::LOGICAL_DELETE: { |
| 205 | //! When RETURNING is used, a PROJECTION is the top level operator for INSERTS, UPDATES, and DELETES |
| 206 | //! We still need to project all values from these operators so the projection |
| 207 | //! on top of them can select from only the table values being inserted. |
| 208 | //! TODO: Push down the projections from the returning statement |
| 209 | //! TODO: Be careful because you might be adding expressions when a user returns * |
| 210 | RemoveUnusedColumns remove(binder, context, true); |
| 211 | remove.VisitOperatorExpressions(op); |
| 212 | remove.VisitOperator(op&: *op.children[0]); |
| 213 | return; |
| 214 | } |
| 215 | case LogicalOperatorType::LOGICAL_GET: |
| 216 | LogicalOperatorVisitor::VisitOperatorExpressions(op); |
| 217 | if (!everything_referenced) { |
| 218 | auto &get = op.Cast<LogicalGet>(); |
| 219 | if (!get.function.projection_pushdown) { |
| 220 | return; |
| 221 | } |
| 222 | |
| 223 | // Create "selection vector" of all column ids |
| 224 | vector<idx_t> proj_sel; |
| 225 | for (idx_t col_idx = 0; col_idx < get.column_ids.size(); col_idx++) { |
| 226 | proj_sel.push_back(x: col_idx); |
| 227 | } |
| 228 | // Create a copy that we can use to match ids later |
| 229 | auto col_sel = proj_sel; |
| 230 | // Clear unused ids, exclude filter columns that are projected out immediately |
| 231 | ClearUnusedExpressions(list&: proj_sel, table_idx: get.table_index, replace: false); |
| 232 | |
| 233 | // for every table filter, push a column binding into the column references map to prevent the column from |
| 234 | // being projected out |
| 235 | for (auto &filter : get.table_filters.filters) { |
| 236 | idx_t index = DConstants::INVALID_INDEX; |
| 237 | for (idx_t i = 0; i < get.column_ids.size(); i++) { |
| 238 | if (get.column_ids[i] == filter.first) { |
| 239 | index = i; |
| 240 | break; |
| 241 | } |
| 242 | } |
| 243 | if (index == DConstants::INVALID_INDEX) { |
| 244 | throw InternalException("Could not find column index for table filter" ); |
| 245 | } |
| 246 | ColumnBinding filter_binding(get.table_index, index); |
| 247 | if (column_references.find(x: filter_binding) == column_references.end()) { |
| 248 | column_references.insert(x: make_pair(x&: filter_binding, y: vector<BoundColumnRefExpression *>())); |
| 249 | } |
| 250 | } |
| 251 | |
| 252 | // Clear unused ids, include filter columns that are projected out immediately |
| 253 | ClearUnusedExpressions(list&: col_sel, table_idx: get.table_index); |
| 254 | |
| 255 | // Now set the column ids in the LogicalGet using the "selection vector" |
| 256 | vector<column_t> column_ids; |
| 257 | column_ids.reserve(n: col_sel.size()); |
| 258 | for (auto col_sel_idx : col_sel) { |
| 259 | column_ids.push_back(x: get.column_ids[col_sel_idx]); |
| 260 | } |
| 261 | get.column_ids = std::move(column_ids); |
| 262 | |
| 263 | if (get.function.filter_prune) { |
| 264 | // Now set the projection cols by matching the "selection vector" that excludes filter columns |
| 265 | // with the "selection vector" that includes filter columns |
| 266 | idx_t col_idx = 0; |
| 267 | for (auto proj_sel_idx : proj_sel) { |
| 268 | for (; col_idx < col_sel.size(); col_idx++) { |
| 269 | if (proj_sel_idx == col_sel[col_idx]) { |
| 270 | get.projection_ids.push_back(x: col_idx); |
| 271 | break; |
| 272 | } |
| 273 | } |
| 274 | } |
| 275 | } |
| 276 | |
| 277 | if (get.column_ids.empty()) { |
| 278 | // this generally means we are only interested in whether or not anything exists in the table (e.g. |
| 279 | // EXISTS(SELECT * FROM tbl)) in this case, we just scan the row identifier column as it means we do not |
| 280 | // need to read any of the columns |
| 281 | get.column_ids.push_back(x: COLUMN_IDENTIFIER_ROW_ID); |
| 282 | } |
| 283 | } |
| 284 | return; |
| 285 | case LogicalOperatorType::LOGICAL_FILTER: { |
| 286 | auto &filter = op.Cast<LogicalFilter>(); |
| 287 | if (!filter.projection_map.empty()) { |
| 288 | // if we have any entries in the filter projection map don't prune any columns |
| 289 | // FIXME: we can do something more clever here |
| 290 | everything_referenced = true; |
| 291 | } |
| 292 | break; |
| 293 | } |
| 294 | case LogicalOperatorType::LOGICAL_DISTINCT: { |
| 295 | // distinct, all projected columns are used for the DISTINCT computation |
| 296 | // mark all columns as used and continue to the children |
| 297 | // FIXME: DISTINCT with expression list does not implicitly reference everything |
| 298 | everything_referenced = true; |
| 299 | break; |
| 300 | } |
| 301 | case LogicalOperatorType::LOGICAL_RECURSIVE_CTE: { |
| 302 | everything_referenced = true; |
| 303 | break; |
| 304 | } |
| 305 | case LogicalOperatorType::LOGICAL_CTE_REF: { |
| 306 | everything_referenced = true; |
| 307 | break; |
| 308 | } |
| 309 | case LogicalOperatorType::LOGICAL_PIVOT: { |
| 310 | everything_referenced = true; |
| 311 | break; |
| 312 | } |
| 313 | default: |
| 314 | break; |
| 315 | } |
| 316 | LogicalOperatorVisitor::VisitOperatorExpressions(op); |
| 317 | LogicalOperatorVisitor::VisitOperatorChildren(op); |
| 318 | } |
| 319 | |
| 320 | unique_ptr<Expression> RemoveUnusedColumns::VisitReplace(BoundColumnRefExpression &expr, |
| 321 | unique_ptr<Expression> *expr_ptr) { |
| 322 | // add a column reference |
| 323 | column_references[expr.binding].push_back(x: &expr); |
| 324 | return nullptr; |
| 325 | } |
| 326 | |
| 327 | unique_ptr<Expression> RemoveUnusedColumns::VisitReplace(BoundReferenceExpression &expr, |
| 328 | unique_ptr<Expression> *expr_ptr) { |
| 329 | // BoundReferenceExpression should not be used here yet, they only belong in the physical plan |
| 330 | throw InternalException("BoundReferenceExpression should not be used here yet!" ); |
| 331 | } |
| 332 | |
| 333 | } // namespace duckdb |
| 334 | |