1 | #pragma once |
2 | |
3 | #include <Common/Exception.h> |
4 | |
5 | #include <algorithm> |
6 | #include <limits> |
7 | #include <tuple> |
8 | #include <type_traits> |
9 | |
10 | /** This class provides a way to evaluate the error in the result of applying the HyperLogLog algorithm. |
11 | * Empirical observations show that large errors occur at E < 5 * 2^precision, where |
12 | * E is the return value of the HyperLogLog algorithm, and `precision` is the HyperLogLog precision parameter. |
13 | * See "HyperLogLog in Practice: Algorithmic Engineering of a State of the Art Cardinality Estimation Algorithm". |
14 | * (S. Heule et al., Proceedings of the EDBT 2013 Conference). |
15 | */ |
16 | template <typename BiasData> |
17 | class HyperLogLogBiasEstimator |
18 | { |
19 | public: |
20 | static constexpr bool isTrivial() |
21 | { |
22 | return false; |
23 | } |
24 | |
25 | /// Maximum number of unique values to which the correction should apply |
26 | /// from the LinearCounting algorithm. |
27 | static double getThreshold() |
28 | { |
29 | return BiasData::getThreshold(); |
30 | } |
31 | |
32 | /// Return the error estimate. |
33 | static double getBias(double raw_estimate) |
34 | { |
35 | const auto & estimates = BiasData::getRawEstimates(); |
36 | const auto & biases = BiasData::getBiases(); |
37 | |
38 | auto it = std::lower_bound(estimates.begin(), estimates.end(), raw_estimate); |
39 | |
40 | if (it == estimates.end()) |
41 | { |
42 | return biases[estimates.size() - 1]; |
43 | } |
44 | else if (*it == raw_estimate) |
45 | { |
46 | size_t index = std::distance(estimates.begin(), it); |
47 | return biases[index]; |
48 | } |
49 | else if (it == estimates.begin()) |
50 | { |
51 | return biases[0]; |
52 | } |
53 | else |
54 | { |
55 | /// We get the error estimate by linear interpolation. |
56 | size_t index = std::distance(estimates.begin(), it); |
57 | |
58 | double estimate1 = estimates[index - 1]; |
59 | double estimate2 = estimates[index]; |
60 | |
61 | double bias1 = biases[index - 1]; |
62 | double bias2 = biases[index]; |
63 | /// It is assumed that the estimate1 < estimate2 condition is always satisfied. |
64 | double slope = (bias2 - bias1) / (estimate2 - estimate1); |
65 | |
66 | return bias1 + slope * (raw_estimate - estimate1); |
67 | } |
68 | } |
69 | |
70 | private: |
71 | /// Static checks. |
72 | using TRawEstimatesRef = decltype(BiasData::getRawEstimates()); |
73 | using TRawEstimates = std::remove_reference_t<TRawEstimatesRef>; |
74 | |
75 | using TBiasDataRef = decltype(BiasData::getBiases()); |
76 | using TBiasData = std::remove_reference_t<TBiasDataRef>; |
77 | |
78 | static_assert(std::is_same_v<TRawEstimates, TBiasData>, "Bias estimator data have inconsistent types" ); |
79 | static_assert(std::tuple_size<TRawEstimates>::value > 0, "Bias estimator has no raw estimate data" ); |
80 | static_assert(std::tuple_size<TBiasData>::value > 0, "Bias estimator has no bias data" ); |
81 | static_assert(std::tuple_size<TRawEstimates>::value == std::tuple_size<TBiasData>::value, |
82 | "Bias estimator has inconsistent data" ); |
83 | }; |
84 | |
85 | /** Trivial case of HyperLogLogBiasEstimator: used if we do not want to fix |
86 | * error. This has meaning for small values of the accuracy parameter, for example 5 or 12. |
87 | * Then the corrections from the original version of the HyperLogLog algorithm are applied. |
88 | * See "HyperLogLog: The analysis of a near-optimal cardinality estimation algorithm" |
89 | * (P. Flajolet et al., AOFA '07: Proceedings of the 2007 International Conference on Analysis |
90 | * of Algorithms) |
91 | */ |
92 | struct TrivialBiasEstimator |
93 | { |
94 | static constexpr bool isTrivial() |
95 | { |
96 | return true; |
97 | } |
98 | |
99 | static double getThreshold() |
100 | { |
101 | return 0.0; |
102 | } |
103 | |
104 | static double getBias(double /*raw_estimate*/) |
105 | { |
106 | return 0.0; |
107 | } |
108 | }; |
109 | |