#include "duckdb/storage/statistics/distinct_statistics.hpp"

#include "duckdb/common/field_writer.hpp"
#include "duckdb/common/string_util.hpp"

#include <math.h>

namespace duckdb {
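
// DistinctStatistics approximates the number of distinct values in a column:
// the HyperLogLog sketch ("log") tracks distinct hashes, while "sample_count"
// and "total_count" record how many values were fed into the sketch versus how
// many were seen overall, so that GetCount() can extrapolate from the sample.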

DistinctStatistics::DistinctStatistics() : log(make_uniq<HyperLogLog>()), sample_count(0), total_count(0) {
}

DistinctStatistics::DistinctStatistics(unique_ptr<HyperLogLog> log, idx_t sample_count, idx_t total_count)
    : log(std::move(log)), sample_count(sample_count), total_count(total_count) {
}

unique_ptr<DistinctStatistics> DistinctStatistics::Copy() const {
	return make_uniq<DistinctStatistics>(log->Copy(), sample_count, total_count);
}

void DistinctStatistics::Merge(const DistinctStatistics &other) {
	log = log->Merge(*other.log);
	sample_count += other.sample_count;
	total_count += other.total_count;
}

void DistinctStatistics::Serialize(Serializer &serializer) const {
	FieldWriter writer(serializer);
	Serialize(writer);
	writer.Finalize();
}

void DistinctStatistics::Serialize(FieldWriter &writer) const {
	writer.WriteField<idx_t>(sample_count);
	writer.WriteField<idx_t>(total_count);
	log->Serialize(writer);
}

unique_ptr<DistinctStatistics> DistinctStatistics::Deserialize(Deserializer &source) {
	FieldReader reader(source);
	auto result = Deserialize(reader);
	reader.Finalize();
	return result;
}

unique_ptr<DistinctStatistics> DistinctStatistics::Deserialize(FieldReader &reader) {
	auto sample_count = reader.ReadRequired<idx_t>();
	auto total_count = reader.ReadRequired<idx_t>();
	return make_uniq<DistinctStatistics>(HyperLogLog::Deserialize(reader), sample_count, total_count);
}

void DistinctStatistics::Update(Vector &v, idx_t count, bool sample) {
	UnifiedVectorFormat vdata;
	v.ToUnifiedFormat(count, vdata);
	Update(vdata, v.GetType(), count, sample);
}

void DistinctStatistics::Update(UnifiedVectorFormat &vdata, const LogicalType &type, idx_t count, bool sample) {
	if (count == 0) {
		return;
	}

	total_count += count;
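	// When sampling, only a prefix of the vector is fed into the sketch: the number
	// of processed rows is capped at SAMPLE_RATE times the larger of the vector size
	// and STANDARD_VECTOR_SIZE. For instance, assuming SAMPLE_RATE were 0.1 and
	// STANDARD_VECTOR_SIZE were 2048, a 2048-row vector would contribute about
	// 204 rows to the sketch (illustrative numbers; the actual constants are
	// defined in the DuckDB headers).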
	if (sample) {
		count = MinValue<idx_t>(idx_t(SAMPLE_RATE * MaxValue<idx_t>(STANDARD_VECTOR_SIZE, count)), count);
	}
	sample_count += count;

	uint64_t indices[STANDARD_VECTOR_SIZE];
	uint8_t counts[STANDARD_VECTOR_SIZE];
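
	// The two stack arrays above are scratch space for one vector's worth of sketch
	// updates. Judging by the parameter names, ProcessEntries hashes each processed
	// row into a register index plus a rank-style count, and AddToLog then folds those
	// pairs into the HyperLogLog registers; see hyperloglog.cpp for the exact semantics.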
	HyperLogLog::ProcessEntries(vdata, type, indices, counts, count);
	log->AddToLog(vdata, count, indices, counts);
}
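
// Illustrative usage (a sketch, not code from this file): per-column table statistics
// would typically call Update once per appended vector and read the estimate back via
// GetCount, e.g.
//
//	DistinctStatistics stats;
//	stats.Update(chunk.data[col_idx], chunk.size(), true);
//	idx_t approx_unique = stats.GetCount();
//
// Here "chunk" and "col_idx" are hypothetical caller-side names.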

string DistinctStatistics::ToString() const {
	return StringUtil::Format("[Approx Unique: %s]", to_string(GetCount()));
}

idx_t DistinctStatistics::GetCount() const {
	if (sample_count == 0 || total_count == 0) {
		return 0;
	}

	double u = MinValue<idx_t>(log->Count(), sample_count);
	double s = sample_count;
	double n = total_count;

	// Assume this proportion of the sampled values occurred only once
	double u1 = pow(u / s, 2) * u;

	// Estimate total uniques using Good Turing Estimation
	idx_t estimate = u + u1 / s * (n - s);
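	// For intuition, with illustrative numbers (not from the source): if s = 1000
	// sampled rows out of n = 10000 total and the sketch saw u = 500 distinct
	// hashes, then u1 = (500/1000)^2 * 500 = 125 and the estimate is
	// 500 + 125/1000 * 9000 = 1625 distinct values.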
	return MinValue<idx_t>(estimate, total_count);
}

bool DistinctStatistics::TypeIsSupported(const LogicalType &type) {
	return type.InternalType() != PhysicalType::LIST && type.InternalType() != PhysicalType::STRUCT;
}

} // namespace duckdb