1 | #include "duckdb/planner/binder.hpp" |
2 | #include "duckdb/parser/statement/show_statement.hpp" |
3 | #include "duckdb/planner/operator/logical_show.hpp" |
4 | #include "duckdb/parser/query_node/select_node.hpp" |
5 | #include "duckdb/parser/expression/function_expression.hpp" |
6 | #include "duckdb/parser/expression/constant_expression.hpp" |
7 | #include "duckdb/parser/expression/cast_expression.hpp" |
8 | #include "duckdb/parser/tableref/subqueryref.hpp" |
9 | |
10 | namespace duckdb { |
11 | |
12 | static unique_ptr<ParsedExpression> SummarizeWrapUnnest(vector<unique_ptr<ParsedExpression>> &children, |
13 | const string &alias) { |
14 | auto list_function = make_uniq<FunctionExpression>(args: "list_value" , args: std::move(children)); |
15 | vector<unique_ptr<ParsedExpression>> unnest_children; |
16 | unnest_children.push_back(x: std::move(list_function)); |
17 | auto unnest_function = make_uniq<FunctionExpression>(args: "unnest" , args: std::move(unnest_children)); |
18 | unnest_function->alias = alias; |
19 | return std::move(unnest_function); |
20 | } |
21 | |
22 | static unique_ptr<ParsedExpression> SummarizeCreateAggregate(const string &aggregate, string column_name) { |
23 | vector<unique_ptr<ParsedExpression>> children; |
24 | children.push_back(x: make_uniq<ColumnRefExpression>(args: std::move(column_name))); |
25 | auto aggregate_function = make_uniq<FunctionExpression>(args: aggregate, args: std::move(children)); |
26 | auto cast_function = make_uniq<CastExpression>(args: LogicalType::VARCHAR, args: std::move(aggregate_function)); |
27 | return std::move(cast_function); |
28 | } |
29 | |
30 | static unique_ptr<ParsedExpression> SummarizeCreateAggregate(const string &aggregate, string column_name, |
31 | const Value &modifier) { |
32 | vector<unique_ptr<ParsedExpression>> children; |
33 | children.push_back(x: make_uniq<ColumnRefExpression>(args: std::move(column_name))); |
34 | children.push_back(x: make_uniq<ConstantExpression>(args: modifier)); |
35 | auto aggregate_function = make_uniq<FunctionExpression>(args: aggregate, args: std::move(children)); |
36 | auto cast_function = make_uniq<CastExpression>(args: LogicalType::VARCHAR, args: std::move(aggregate_function)); |
37 | return std::move(cast_function); |
38 | } |
39 | |
40 | static unique_ptr<ParsedExpression> SummarizeCreateCountStar() { |
41 | vector<unique_ptr<ParsedExpression>> children; |
42 | auto aggregate_function = make_uniq<FunctionExpression>(args: "count_star" , args: std::move(children)); |
43 | return std::move(aggregate_function); |
44 | } |
45 | |
46 | static unique_ptr<ParsedExpression> SummarizeCreateBinaryFunction(const string &op, unique_ptr<ParsedExpression> left, |
47 | unique_ptr<ParsedExpression> right) { |
48 | vector<unique_ptr<ParsedExpression>> children; |
49 | children.push_back(x: std::move(left)); |
50 | children.push_back(x: std::move(right)); |
51 | auto binary_function = make_uniq<FunctionExpression>(args: op, args: std::move(children)); |
52 | return std::move(binary_function); |
53 | } |
54 | |
55 | static unique_ptr<ParsedExpression> SummarizeCreateNullPercentage(string column_name) { |
56 | auto count_star = make_uniq<CastExpression>(args: LogicalType::DOUBLE, args: SummarizeCreateCountStar()); |
57 | auto count = |
58 | make_uniq<CastExpression>(args: LogicalType::DOUBLE, args: SummarizeCreateAggregate(aggregate: "count" , column_name: std::move(column_name))); |
59 | auto null_percentage = SummarizeCreateBinaryFunction(op: "/" , left: std::move(count), right: std::move(count_star)); |
60 | auto negate_x = |
61 | SummarizeCreateBinaryFunction(op: "-" , left: make_uniq<ConstantExpression>(args: Value::DOUBLE(value: 1)), right: std::move(null_percentage)); |
62 | auto percentage_x = |
63 | SummarizeCreateBinaryFunction(op: "*" , left: std::move(negate_x), right: make_uniq<ConstantExpression>(args: Value::DOUBLE(value: 100))); |
64 | auto round_x = SummarizeCreateBinaryFunction(op: "round" , left: std::move(percentage_x), |
65 | right: make_uniq<ConstantExpression>(args: Value::INTEGER(value: 2))); |
66 | auto concat_x = |
67 | SummarizeCreateBinaryFunction(op: "concat" , left: std::move(round_x), right: make_uniq<ConstantExpression>(args: Value("%" ))); |
68 | |
69 | return concat_x; |
70 | } |
71 | |
72 | BoundStatement Binder::BindSummarize(ShowStatement &stmt) { |
73 | auto query_copy = stmt.info->query->Copy(); |
74 | |
75 | // we bind the plan once in a child-node to figure out the column names and column types |
76 | auto child_binder = Binder::CreateBinder(context); |
77 | auto plan = child_binder->Bind(node&: *stmt.info->query); |
78 | D_ASSERT(plan.types.size() == plan.names.size()); |
79 | vector<unique_ptr<ParsedExpression>> name_children; |
80 | vector<unique_ptr<ParsedExpression>> type_children; |
81 | vector<unique_ptr<ParsedExpression>> min_children; |
82 | vector<unique_ptr<ParsedExpression>> max_children; |
83 | vector<unique_ptr<ParsedExpression>> unique_children; |
84 | vector<unique_ptr<ParsedExpression>> avg_children; |
85 | vector<unique_ptr<ParsedExpression>> std_children; |
86 | vector<unique_ptr<ParsedExpression>> q25_children; |
87 | vector<unique_ptr<ParsedExpression>> q50_children; |
88 | vector<unique_ptr<ParsedExpression>> q75_children; |
89 | vector<unique_ptr<ParsedExpression>> count_children; |
90 | vector<unique_ptr<ParsedExpression>> null_percentage_children; |
91 | auto select = make_uniq<SelectStatement>(); |
92 | select->node = std::move(query_copy); |
93 | for (idx_t i = 0; i < plan.names.size(); i++) { |
94 | name_children.push_back(x: make_uniq<ConstantExpression>(args: Value(plan.names[i]))); |
95 | type_children.push_back(x: make_uniq<ConstantExpression>(args: Value(plan.types[i].ToString()))); |
96 | min_children.push_back(x: SummarizeCreateAggregate(aggregate: "min" , column_name: plan.names[i])); |
97 | max_children.push_back(x: SummarizeCreateAggregate(aggregate: "max" , column_name: plan.names[i])); |
98 | unique_children.push_back(x: SummarizeCreateAggregate(aggregate: "approx_count_distinct" , column_name: plan.names[i])); |
99 | if (plan.types[i].IsNumeric()) { |
100 | avg_children.push_back(x: SummarizeCreateAggregate(aggregate: "avg" , column_name: plan.names[i])); |
101 | std_children.push_back(x: SummarizeCreateAggregate(aggregate: "stddev" , column_name: plan.names[i])); |
102 | q25_children.push_back(x: SummarizeCreateAggregate(aggregate: "approx_quantile" , column_name: plan.names[i], modifier: Value::FLOAT(value: 0.25))); |
103 | q50_children.push_back(x: SummarizeCreateAggregate(aggregate: "approx_quantile" , column_name: plan.names[i], modifier: Value::FLOAT(value: 0.50))); |
104 | q75_children.push_back(x: SummarizeCreateAggregate(aggregate: "approx_quantile" , column_name: plan.names[i], modifier: Value::FLOAT(value: 0.75))); |
105 | } else { |
106 | avg_children.push_back(x: make_uniq<ConstantExpression>(args: Value())); |
107 | std_children.push_back(x: make_uniq<ConstantExpression>(args: Value())); |
108 | q25_children.push_back(x: make_uniq<ConstantExpression>(args: Value())); |
109 | q50_children.push_back(x: make_uniq<ConstantExpression>(args: Value())); |
110 | q75_children.push_back(x: make_uniq<ConstantExpression>(args: Value())); |
111 | } |
112 | count_children.push_back(x: SummarizeCreateCountStar()); |
113 | null_percentage_children.push_back(x: SummarizeCreateNullPercentage(column_name: plan.names[i])); |
114 | } |
115 | auto subquery_ref = make_uniq<SubqueryRef>(args: std::move(select), args: "summarize_tbl" ); |
116 | subquery_ref->column_name_alias = plan.names; |
117 | |
118 | auto select_node = make_uniq<SelectNode>(); |
119 | select_node->select_list.push_back(x: SummarizeWrapUnnest(children&: name_children, alias: "column_name" )); |
120 | select_node->select_list.push_back(x: SummarizeWrapUnnest(children&: type_children, alias: "column_type" )); |
121 | select_node->select_list.push_back(x: SummarizeWrapUnnest(children&: min_children, alias: "min" )); |
122 | select_node->select_list.push_back(x: SummarizeWrapUnnest(children&: max_children, alias: "max" )); |
123 | select_node->select_list.push_back(x: SummarizeWrapUnnest(children&: unique_children, alias: "approx_unique" )); |
124 | select_node->select_list.push_back(x: SummarizeWrapUnnest(children&: avg_children, alias: "avg" )); |
125 | select_node->select_list.push_back(x: SummarizeWrapUnnest(children&: std_children, alias: "std" )); |
126 | select_node->select_list.push_back(x: SummarizeWrapUnnest(children&: q25_children, alias: "q25" )); |
127 | select_node->select_list.push_back(x: SummarizeWrapUnnest(children&: q50_children, alias: "q50" )); |
128 | select_node->select_list.push_back(x: SummarizeWrapUnnest(children&: q75_children, alias: "q75" )); |
129 | select_node->select_list.push_back(x: SummarizeWrapUnnest(children&: count_children, alias: "count" )); |
130 | select_node->select_list.push_back(x: SummarizeWrapUnnest(children&: null_percentage_children, alias: "null_percentage" )); |
131 | select_node->from_table = std::move(subquery_ref); |
132 | |
133 | properties.return_type = StatementReturnType::QUERY_RESULT; |
134 | return Bind(node&: *select_node); |
135 | } |
136 | |
137 | } // namespace duckdb |
138 | |