1 | #include "duckdb/common/exception.hpp" |
2 | #include "duckdb/common/field_writer.hpp" |
3 | #include "duckdb/common/string_util.hpp" |
4 | #include "duckdb/common/types/vector.hpp" |
5 | #include "duckdb/storage/statistics/base_statistics.hpp" |
6 | #include "duckdb/storage/statistics/list_stats.hpp" |
7 | #include "duckdb/storage/statistics/struct_stats.hpp" |
8 | |
9 | namespace duckdb { |
10 | |
11 | BaseStatistics::BaseStatistics() : type(LogicalType::INVALID) { |
12 | } |
13 | |
14 | BaseStatistics::BaseStatistics(LogicalType type) { |
15 | Construct(stats&: *this, type: std::move(type)); |
16 | } |
17 | |
18 | void BaseStatistics::Construct(BaseStatistics &stats, LogicalType type) { |
19 | stats.distinct_count = 0; |
20 | stats.type = std::move(type); |
21 | switch (GetStatsType(type: stats.type)) { |
22 | case StatisticsType::LIST_STATS: |
23 | ListStats::Construct(stats); |
24 | break; |
25 | case StatisticsType::STRUCT_STATS: |
26 | StructStats::Construct(stats); |
27 | break; |
28 | default: |
29 | break; |
30 | } |
31 | } |
32 | |
33 | BaseStatistics::~BaseStatistics() { |
34 | } |
35 | |
36 | BaseStatistics::BaseStatistics(BaseStatistics &&other) noexcept { |
37 | std::swap(a&: type, b&: other.type); |
38 | has_null = other.has_null; |
39 | has_no_null = other.has_no_null; |
40 | distinct_count = other.distinct_count; |
41 | stats_union = other.stats_union; |
42 | std::swap(a&: child_stats, b&: other.child_stats); |
43 | } |
44 | |
45 | BaseStatistics &BaseStatistics::operator=(BaseStatistics &&other) noexcept { |
46 | std::swap(a&: type, b&: other.type); |
47 | has_null = other.has_null; |
48 | has_no_null = other.has_no_null; |
49 | distinct_count = other.distinct_count; |
50 | stats_union = other.stats_union; |
51 | std::swap(a&: child_stats, b&: other.child_stats); |
52 | return *this; |
53 | } |
54 | |
55 | StatisticsType BaseStatistics::GetStatsType(const LogicalType &type) { |
56 | if (type.id() == LogicalTypeId::SQLNULL) { |
57 | return StatisticsType::BASE_STATS; |
58 | } |
59 | switch (type.InternalType()) { |
60 | case PhysicalType::BOOL: |
61 | case PhysicalType::INT8: |
62 | case PhysicalType::INT16: |
63 | case PhysicalType::INT32: |
64 | case PhysicalType::INT64: |
65 | case PhysicalType::UINT8: |
66 | case PhysicalType::UINT16: |
67 | case PhysicalType::UINT32: |
68 | case PhysicalType::UINT64: |
69 | case PhysicalType::INT128: |
70 | case PhysicalType::FLOAT: |
71 | case PhysicalType::DOUBLE: |
72 | return StatisticsType::NUMERIC_STATS; |
73 | case PhysicalType::VARCHAR: |
74 | return StatisticsType::STRING_STATS; |
75 | case PhysicalType::STRUCT: |
76 | return StatisticsType::STRUCT_STATS; |
77 | case PhysicalType::LIST: |
78 | return StatisticsType::LIST_STATS; |
79 | case PhysicalType::BIT: |
80 | case PhysicalType::INTERVAL: |
81 | default: |
82 | return StatisticsType::BASE_STATS; |
83 | } |
84 | } |
85 | |
86 | StatisticsType BaseStatistics::GetStatsType() const { |
87 | return GetStatsType(type: GetType()); |
88 | } |
89 | |
90 | void BaseStatistics::InitializeUnknown() { |
91 | has_null = true; |
92 | has_no_null = true; |
93 | } |
94 | |
95 | void BaseStatistics::InitializeEmpty() { |
96 | has_null = false; |
97 | has_no_null = true; |
98 | } |
99 | |
100 | bool BaseStatistics::CanHaveNull() const { |
101 | return has_null; |
102 | } |
103 | |
104 | bool BaseStatistics::CanHaveNoNull() const { |
105 | return has_no_null; |
106 | } |
107 | |
108 | bool BaseStatistics::IsConstant() const { |
109 | if (type.id() == LogicalTypeId::VALIDITY) { |
110 | // validity mask |
111 | if (CanHaveNull() && !CanHaveNoNull()) { |
112 | return true; |
113 | } |
114 | if (!CanHaveNull() && CanHaveNoNull()) { |
115 | return true; |
116 | } |
117 | return false; |
118 | } |
119 | switch (GetStatsType()) { |
120 | case StatisticsType::NUMERIC_STATS: |
121 | return NumericStats::IsConstant(stats: *this); |
122 | default: |
123 | break; |
124 | } |
125 | return false; |
126 | } |
127 | |
128 | void BaseStatistics::Merge(const BaseStatistics &other) { |
129 | has_null = has_null || other.has_null; |
130 | has_no_null = has_no_null || other.has_no_null; |
131 | switch (GetStatsType()) { |
132 | case StatisticsType::NUMERIC_STATS: |
133 | NumericStats::Merge(stats&: *this, other_p: other); |
134 | break; |
135 | case StatisticsType::STRING_STATS: |
136 | StringStats::Merge(stats&: *this, other); |
137 | break; |
138 | case StatisticsType::LIST_STATS: |
139 | ListStats::Merge(stats&: *this, other); |
140 | break; |
141 | case StatisticsType::STRUCT_STATS: |
142 | StructStats::Merge(stats&: *this, other); |
143 | break; |
144 | default: |
145 | break; |
146 | } |
147 | } |
148 | |
149 | idx_t BaseStatistics::GetDistinctCount() { |
150 | return distinct_count; |
151 | } |
152 | |
153 | BaseStatistics BaseStatistics::CreateUnknownType(LogicalType type) { |
154 | switch (GetStatsType(type)) { |
155 | case StatisticsType::NUMERIC_STATS: |
156 | return NumericStats::CreateUnknown(type: std::move(type)); |
157 | case StatisticsType::STRING_STATS: |
158 | return StringStats::CreateUnknown(type: std::move(type)); |
159 | case StatisticsType::LIST_STATS: |
160 | return ListStats::CreateUnknown(type: std::move(type)); |
161 | case StatisticsType::STRUCT_STATS: |
162 | return StructStats::CreateUnknown(type: std::move(type)); |
163 | default: |
164 | return BaseStatistics(std::move(type)); |
165 | } |
166 | } |
167 | |
168 | BaseStatistics BaseStatistics::CreateEmptyType(LogicalType type) { |
169 | switch (GetStatsType(type)) { |
170 | case StatisticsType::NUMERIC_STATS: |
171 | return NumericStats::CreateEmpty(type: std::move(type)); |
172 | case StatisticsType::STRING_STATS: |
173 | return StringStats::CreateEmpty(type: std::move(type)); |
174 | case StatisticsType::LIST_STATS: |
175 | return ListStats::CreateEmpty(type: std::move(type)); |
176 | case StatisticsType::STRUCT_STATS: |
177 | return StructStats::CreateEmpty(type: std::move(type)); |
178 | default: |
179 | return BaseStatistics(std::move(type)); |
180 | } |
181 | } |
182 | |
183 | BaseStatistics BaseStatistics::CreateUnknown(LogicalType type) { |
184 | auto result = CreateUnknownType(type: std::move(type)); |
185 | result.InitializeUnknown(); |
186 | return result; |
187 | } |
188 | |
189 | BaseStatistics BaseStatistics::CreateEmpty(LogicalType type) { |
190 | if (type.InternalType() == PhysicalType::BIT) { |
191 | // FIXME: this special case should not be necessary |
192 | // but currently InitializeEmpty sets StatsInfo::CAN_HAVE_VALID_VALUES |
193 | BaseStatistics result(std::move(type)); |
194 | result.Set(StatsInfo::CANNOT_HAVE_NULL_VALUES); |
195 | result.Set(StatsInfo::CANNOT_HAVE_VALID_VALUES); |
196 | return result; |
197 | } |
198 | auto result = CreateEmptyType(type: std::move(type)); |
199 | result.InitializeEmpty(); |
200 | return result; |
201 | } |
202 | |
203 | void BaseStatistics::Copy(const BaseStatistics &other) { |
204 | D_ASSERT(GetType() == other.GetType()); |
205 | CopyBase(orig: other); |
206 | stats_union = other.stats_union; |
207 | switch (GetStatsType()) { |
208 | case StatisticsType::LIST_STATS: |
209 | ListStats::Copy(stats&: *this, other); |
210 | break; |
211 | case StatisticsType::STRUCT_STATS: |
212 | StructStats::Copy(stats&: *this, other); |
213 | break; |
214 | default: |
215 | break; |
216 | } |
217 | } |
218 | |
219 | BaseStatistics BaseStatistics::Copy() const { |
220 | BaseStatistics result(type); |
221 | result.Copy(other: *this); |
222 | return result; |
223 | } |
224 | |
225 | unique_ptr<BaseStatistics> BaseStatistics::ToUnique() const { |
226 | auto result = unique_ptr<BaseStatistics>(new BaseStatistics(type)); |
227 | result->Copy(other: *this); |
228 | return result; |
229 | } |
230 | |
231 | void BaseStatistics::CopyBase(const BaseStatistics &other) { |
232 | has_null = other.has_null; |
233 | has_no_null = other.has_no_null; |
234 | distinct_count = other.distinct_count; |
235 | } |
236 | |
237 | void BaseStatistics::Set(StatsInfo info) { |
238 | switch (info) { |
239 | case StatsInfo::CAN_HAVE_NULL_VALUES: |
240 | has_null = true; |
241 | break; |
242 | case StatsInfo::CANNOT_HAVE_NULL_VALUES: |
243 | has_null = false; |
244 | break; |
245 | case StatsInfo::CAN_HAVE_VALID_VALUES: |
246 | has_no_null = true; |
247 | break; |
248 | case StatsInfo::CANNOT_HAVE_VALID_VALUES: |
249 | has_no_null = false; |
250 | break; |
251 | case StatsInfo::CAN_HAVE_NULL_AND_VALID_VALUES: |
252 | has_null = true; |
253 | has_no_null = true; |
254 | break; |
255 | default: |
256 | throw InternalException("Unrecognized StatsInfo for BaseStatistics::Set" ); |
257 | } |
258 | } |
259 | |
260 | void BaseStatistics::CombineValidity(BaseStatistics &left, BaseStatistics &right) { |
261 | has_null = left.has_null || right.has_null; |
262 | has_no_null = left.has_no_null || right.has_no_null; |
263 | } |
264 | |
265 | void BaseStatistics::CopyValidity(BaseStatistics &stats) { |
266 | has_null = stats.has_null; |
267 | has_no_null = stats.has_no_null; |
268 | } |
269 | |
270 | void BaseStatistics::Serialize(Serializer &serializer) const { |
271 | FieldWriter writer(serializer); |
272 | writer.WriteField<bool>(element: has_null); |
273 | writer.WriteField<bool>(element: has_no_null); |
274 | writer.WriteField<idx_t>(element: distinct_count); |
275 | Serialize(writer); |
276 | writer.Finalize(); |
277 | } |
278 | |
279 | void BaseStatistics::SetDistinctCount(idx_t count) { |
280 | this->distinct_count = count; |
281 | } |
282 | |
283 | void BaseStatistics::Serialize(FieldWriter &writer) const { |
284 | switch (GetStatsType()) { |
285 | case StatisticsType::NUMERIC_STATS: |
286 | NumericStats::Serialize(stats: *this, writer); |
287 | break; |
288 | case StatisticsType::STRING_STATS: |
289 | StringStats::Serialize(stats: *this, writer); |
290 | break; |
291 | case StatisticsType::LIST_STATS: |
292 | ListStats::Serialize(stats: *this, writer); |
293 | break; |
294 | case StatisticsType::STRUCT_STATS: |
295 | StructStats::Serialize(stats: *this, writer); |
296 | break; |
297 | default: |
298 | break; |
299 | } |
300 | } |
301 | BaseStatistics BaseStatistics::DeserializeType(FieldReader &reader, LogicalType type) { |
302 | switch (GetStatsType(type)) { |
303 | case StatisticsType::NUMERIC_STATS: |
304 | return NumericStats::Deserialize(reader, type: std::move(type)); |
305 | case StatisticsType::STRING_STATS: |
306 | return StringStats::Deserialize(reader, type: std::move(type)); |
307 | case StatisticsType::LIST_STATS: |
308 | return ListStats::Deserialize(reader, type: std::move(type)); |
309 | case StatisticsType::STRUCT_STATS: |
310 | return StructStats::Deserialize(reader, type: std::move(type)); |
311 | default: |
312 | return BaseStatistics(std::move(type)); |
313 | } |
314 | } |
315 | |
316 | BaseStatistics BaseStatistics::Deserialize(Deserializer &source, LogicalType type) { |
317 | FieldReader reader(source); |
318 | bool has_null = reader.ReadRequired<bool>(); |
319 | bool has_no_null = reader.ReadRequired<bool>(); |
320 | idx_t distinct_count = reader.ReadRequired<idx_t>(); |
321 | auto result = DeserializeType(reader, type: std::move(type)); |
322 | result.has_null = has_null; |
323 | result.has_no_null = has_no_null; |
324 | result.distinct_count = distinct_count; |
325 | reader.Finalize(); |
326 | return result; |
327 | } |
328 | |
329 | string BaseStatistics::ToString() const { |
330 | auto has_n = has_null ? "true" : "false" ; |
331 | auto has_n_n = has_no_null ? "true" : "false" ; |
332 | string result = |
333 | StringUtil::Format(fmt_str: "%s%s" , params: StringUtil::Format(fmt_str: "[Has Null: %s, Has No Null: %s]" , params: has_n, params: has_n_n), |
334 | params: distinct_count > 0 ? StringUtil::Format(fmt_str: "[Approx Unique: %lld]" , params: distinct_count) : "" ); |
335 | switch (GetStatsType()) { |
336 | case StatisticsType::NUMERIC_STATS: |
337 | result = NumericStats::ToString(stats: *this) + result; |
338 | break; |
339 | case StatisticsType::STRING_STATS: |
340 | result = StringStats::ToString(stats: *this) + result; |
341 | break; |
342 | case StatisticsType::LIST_STATS: |
343 | result = ListStats::ToString(stats: *this) + result; |
344 | break; |
345 | case StatisticsType::STRUCT_STATS: |
346 | result = StructStats::ToString(stats: *this) + result; |
347 | break; |
348 | default: |
349 | break; |
350 | } |
351 | return result; |
352 | } |
353 | |
354 | void BaseStatistics::Verify(Vector &vector, const SelectionVector &sel, idx_t count) const { |
355 | D_ASSERT(vector.GetType() == this->type); |
356 | switch (GetStatsType()) { |
357 | case StatisticsType::NUMERIC_STATS: |
358 | NumericStats::Verify(stats: *this, vector, sel, count); |
359 | break; |
360 | case StatisticsType::STRING_STATS: |
361 | StringStats::Verify(stats: *this, vector, sel, count); |
362 | break; |
363 | case StatisticsType::LIST_STATS: |
364 | ListStats::Verify(stats: *this, vector, sel, count); |
365 | break; |
366 | case StatisticsType::STRUCT_STATS: |
367 | StructStats::Verify(stats: *this, vector, sel, count); |
368 | break; |
369 | default: |
370 | break; |
371 | } |
372 | if (has_null && has_no_null) { |
373 | // nothing to verify |
374 | return; |
375 | } |
376 | UnifiedVectorFormat vdata; |
377 | vector.ToUnifiedFormat(count, data&: vdata); |
378 | for (idx_t i = 0; i < count; i++) { |
379 | auto idx = sel.get_index(idx: i); |
380 | auto index = vdata.sel->get_index(idx); |
381 | bool row_is_valid = vdata.validity.RowIsValid(row_idx: index); |
382 | if (row_is_valid && !has_no_null) { |
383 | throw InternalException( |
384 | "Statistics mismatch: vector labeled as having only NULL values, but vector contains valid values: %s" , |
385 | vector.ToString(count)); |
386 | } |
387 | if (!row_is_valid && !has_null) { |
388 | throw InternalException( |
389 | "Statistics mismatch: vector labeled as not having NULL values, but vector contains null values: %s" , |
390 | vector.ToString(count)); |
391 | } |
392 | } |
393 | } |
394 | |
395 | void BaseStatistics::Verify(Vector &vector, idx_t count) const { |
396 | auto sel = FlatVector::IncrementalSelectionVector(); |
397 | Verify(vector, sel: *sel, count); |
398 | } |
399 | |
400 | BaseStatistics BaseStatistics::FromConstantType(const Value &input) { |
401 | switch (GetStatsType(type: input.type())) { |
402 | case StatisticsType::NUMERIC_STATS: { |
403 | auto result = NumericStats::CreateEmpty(type: input.type()); |
404 | NumericStats::SetMin(stats&: result, val: input); |
405 | NumericStats::SetMax(stats&: result, val: input); |
406 | return result; |
407 | } |
408 | case StatisticsType::STRING_STATS: { |
409 | auto result = StringStats::CreateEmpty(type: input.type()); |
410 | if (!input.IsNull()) { |
411 | auto &string_value = StringValue::Get(value: input); |
412 | StringStats::Update(stats&: result, value: string_t(string_value)); |
413 | } |
414 | return result; |
415 | } |
416 | case StatisticsType::LIST_STATS: { |
417 | auto result = ListStats::CreateEmpty(type: input.type()); |
418 | auto &child_stats = ListStats::GetChildStats(stats&: result); |
419 | if (!input.IsNull()) { |
420 | auto &list_children = ListValue::GetChildren(value: input); |
421 | for (auto &child_element : list_children) { |
422 | child_stats.Merge(other: FromConstant(input: child_element)); |
423 | } |
424 | } |
425 | return result; |
426 | } |
427 | case StatisticsType::STRUCT_STATS: { |
428 | auto result = StructStats::CreateEmpty(type: input.type()); |
429 | auto &child_types = StructType::GetChildTypes(type: input.type()); |
430 | if (input.IsNull()) { |
431 | for (idx_t i = 0; i < child_types.size(); i++) { |
432 | StructStats::SetChildStats(stats&: result, i, new_stats: FromConstant(input: Value(child_types[i].second))); |
433 | } |
434 | } else { |
435 | auto &struct_children = StructValue::GetChildren(value: input); |
436 | for (idx_t i = 0; i < child_types.size(); i++) { |
437 | StructStats::SetChildStats(stats&: result, i, new_stats: FromConstant(input: struct_children[i])); |
438 | } |
439 | } |
440 | return result; |
441 | } |
442 | default: |
443 | return BaseStatistics(input.type()); |
444 | } |
445 | } |
446 | |
447 | BaseStatistics BaseStatistics::FromConstant(const Value &input) { |
448 | auto result = FromConstantType(input); |
449 | result.SetDistinctCount(1); |
450 | if (input.IsNull()) { |
451 | result.Set(StatsInfo::CAN_HAVE_NULL_VALUES); |
452 | result.Set(StatsInfo::CANNOT_HAVE_VALID_VALUES); |
453 | } else { |
454 | result.Set(StatsInfo::CANNOT_HAVE_NULL_VALUES); |
455 | result.Set(StatsInfo::CAN_HAVE_VALID_VALUES); |
456 | } |
457 | return result; |
458 | } |
459 | |
460 | } // namespace duckdb |
461 | |