1#include "duckdb/storage/statistics/distinct_statistics.hpp"
2
3#include "duckdb/common/field_writer.hpp"
4#include "duckdb/common/string_util.hpp"
5
6#include <math.h>
7
8namespace duckdb {
9
10DistinctStatistics::DistinctStatistics() : log(make_uniq<HyperLogLog>()), sample_count(0), total_count(0) {
11}
12
13DistinctStatistics::DistinctStatistics(unique_ptr<HyperLogLog> log, idx_t sample_count, idx_t total_count)
14 : log(std::move(log)), sample_count(sample_count), total_count(total_count) {
15}
16
17unique_ptr<DistinctStatistics> DistinctStatistics::Copy() const {
18 return make_uniq<DistinctStatistics>(args: log->Copy(), args: sample_count, args: total_count);
19}
20
21void DistinctStatistics::Merge(const DistinctStatistics &other) {
22 log = log->Merge(other&: *other.log);
23 sample_count += other.sample_count;
24 total_count += other.total_count;
25}
26
27void DistinctStatistics::Serialize(Serializer &serializer) const {
28 FieldWriter writer(serializer);
29 Serialize(writer);
30 writer.Finalize();
31}
32
33void DistinctStatistics::Serialize(FieldWriter &writer) const {
34 writer.WriteField<idx_t>(element: sample_count);
35 writer.WriteField<idx_t>(element: total_count);
36 log->Serialize(writer);
37}
38
39unique_ptr<DistinctStatistics> DistinctStatistics::Deserialize(Deserializer &source) {
40 FieldReader reader(source);
41 auto result = Deserialize(reader);
42 reader.Finalize();
43 return result;
44}
45
46unique_ptr<DistinctStatistics> DistinctStatistics::Deserialize(FieldReader &reader) {
47 auto sample_count = reader.ReadRequired<idx_t>();
48 auto total_count = reader.ReadRequired<idx_t>();
49 return make_uniq<DistinctStatistics>(args: HyperLogLog::Deserialize(reader), args&: sample_count, args&: total_count);
50}
51
52void DistinctStatistics::Update(Vector &v, idx_t count, bool sample) {
53 UnifiedVectorFormat vdata;
54 v.ToUnifiedFormat(count, data&: vdata);
55 Update(update_data&: vdata, ptype: v.GetType(), count, sample);
56}
57
58void DistinctStatistics::Update(UnifiedVectorFormat &vdata, const LogicalType &type, idx_t count, bool sample) {
59 if (count == 0) {
60 return;
61 }
62
63 total_count += count;
64 if (sample) {
65 count = MinValue<idx_t>(a: idx_t(SAMPLE_RATE * MaxValue<idx_t>(STANDARD_VECTOR_SIZE, b: count)), b: count);
66 }
67 sample_count += count;
68
69 uint64_t indices[STANDARD_VECTOR_SIZE];
70 uint8_t counts[STANDARD_VECTOR_SIZE];
71
72 HyperLogLog::ProcessEntries(vdata, type, hashes: indices, counts, count);
73 log->AddToLog(vdata, count, indices, counts);
74}
75
76string DistinctStatistics::ToString() const {
77 return StringUtil::Format(fmt_str: "[Approx Unique: %s]", params: to_string(val: GetCount()));
78}
79
80idx_t DistinctStatistics::GetCount() const {
81 if (sample_count == 0 || total_count == 0) {
82 return 0;
83 }
84
85 double u = MinValue<idx_t>(a: log->Count(), b: sample_count);
86 double s = sample_count;
87 double n = total_count;
88
89 // Assume this proportion of the the sampled values occurred only once
90 double u1 = pow(x: u / s, y: 2) * u;
91
92 // Estimate total uniques using Good Turing Estimation
93 idx_t estimate = u + u1 / s * (n - s);
94 return MinValue<idx_t>(a: estimate, b: total_count);
95}
96
97bool DistinctStatistics::TypeIsSupported(const LogicalType &type) {
98 return type.InternalType() != PhysicalType::LIST && type.InternalType() != PhysicalType::STRUCT;
99}
100
101} // namespace duckdb
102