1 | #include "duckdb/storage/statistics/distinct_statistics.hpp" |
2 | |
3 | #include "duckdb/common/field_writer.hpp" |
4 | #include "duckdb/common/string_util.hpp" |
5 | |
6 | #include <math.h> |
7 | |
8 | namespace duckdb { |
9 | |
10 | DistinctStatistics::DistinctStatistics() : log(make_uniq<HyperLogLog>()), sample_count(0), total_count(0) { |
11 | } |
12 | |
13 | DistinctStatistics::DistinctStatistics(unique_ptr<HyperLogLog> log, idx_t sample_count, idx_t total_count) |
14 | : log(std::move(log)), sample_count(sample_count), total_count(total_count) { |
15 | } |
16 | |
17 | unique_ptr<DistinctStatistics> DistinctStatistics::Copy() const { |
18 | return make_uniq<DistinctStatistics>(args: log->Copy(), args: sample_count, args: total_count); |
19 | } |
20 | |
21 | void DistinctStatistics::Merge(const DistinctStatistics &other) { |
22 | log = log->Merge(other&: *other.log); |
23 | sample_count += other.sample_count; |
24 | total_count += other.total_count; |
25 | } |
26 | |
27 | void DistinctStatistics::Serialize(Serializer &serializer) const { |
28 | FieldWriter writer(serializer); |
29 | Serialize(writer); |
30 | writer.Finalize(); |
31 | } |
32 | |
33 | void DistinctStatistics::Serialize(FieldWriter &writer) const { |
34 | writer.WriteField<idx_t>(element: sample_count); |
35 | writer.WriteField<idx_t>(element: total_count); |
36 | log->Serialize(writer); |
37 | } |
38 | |
39 | unique_ptr<DistinctStatistics> DistinctStatistics::Deserialize(Deserializer &source) { |
40 | FieldReader reader(source); |
41 | auto result = Deserialize(reader); |
42 | reader.Finalize(); |
43 | return result; |
44 | } |
45 | |
46 | unique_ptr<DistinctStatistics> DistinctStatistics::Deserialize(FieldReader &reader) { |
47 | auto sample_count = reader.ReadRequired<idx_t>(); |
48 | auto total_count = reader.ReadRequired<idx_t>(); |
49 | return make_uniq<DistinctStatistics>(args: HyperLogLog::Deserialize(reader), args&: sample_count, args&: total_count); |
50 | } |
51 | |
52 | void DistinctStatistics::Update(Vector &v, idx_t count, bool sample) { |
53 | UnifiedVectorFormat vdata; |
54 | v.ToUnifiedFormat(count, data&: vdata); |
55 | Update(update_data&: vdata, ptype: v.GetType(), count, sample); |
56 | } |
57 | |
58 | void DistinctStatistics::Update(UnifiedVectorFormat &vdata, const LogicalType &type, idx_t count, bool sample) { |
59 | if (count == 0) { |
60 | return; |
61 | } |
62 | |
63 | total_count += count; |
64 | if (sample) { |
65 | count = MinValue<idx_t>(a: idx_t(SAMPLE_RATE * MaxValue<idx_t>(STANDARD_VECTOR_SIZE, b: count)), b: count); |
66 | } |
67 | sample_count += count; |
68 | |
69 | uint64_t indices[STANDARD_VECTOR_SIZE]; |
70 | uint8_t counts[STANDARD_VECTOR_SIZE]; |
71 | |
72 | HyperLogLog::ProcessEntries(vdata, type, hashes: indices, counts, count); |
73 | log->AddToLog(vdata, count, indices, counts); |
74 | } |
75 | |
76 | string DistinctStatistics::ToString() const { |
77 | return StringUtil::Format(fmt_str: "[Approx Unique: %s]" , params: to_string(val: GetCount())); |
78 | } |
79 | |
80 | idx_t DistinctStatistics::GetCount() const { |
81 | if (sample_count == 0 || total_count == 0) { |
82 | return 0; |
83 | } |
84 | |
85 | double u = MinValue<idx_t>(a: log->Count(), b: sample_count); |
86 | double s = sample_count; |
87 | double n = total_count; |
88 | |
89 | // Assume this proportion of the the sampled values occurred only once |
90 | double u1 = pow(x: u / s, y: 2) * u; |
91 | |
92 | // Estimate total uniques using Good Turing Estimation |
93 | idx_t estimate = u + u1 / s * (n - s); |
94 | return MinValue<idx_t>(a: estimate, b: total_count); |
95 | } |
96 | |
97 | bool DistinctStatistics::TypeIsSupported(const LogicalType &type) { |
98 | return type.InternalType() != PhysicalType::LIST && type.InternalType() != PhysicalType::STRUCT; |
99 | } |
100 | |
101 | } // namespace duckdb |
102 | |