1#pragma once
2
3#include <Common/Exception.h>
4
5#include <algorithm>
6#include <limits>
7#include <tuple>
8#include <type_traits>
9
10/** This class provides a way to evaluate the error in the result of applying the HyperLogLog algorithm.
11 * Empirical observations show that large errors occur at E < 5 * 2^precision, where
12 * E is the return value of the HyperLogLog algorithm, and `precision` is the HyperLogLog precision parameter.
13 * See "HyperLogLog in Practice: Algorithmic Engineering of a State of the Art Cardinality Estimation Algorithm".
14 * (S. Heule et al., Proceedings of the EDBT 2013 Conference).
15 */
16template <typename BiasData>
17class HyperLogLogBiasEstimator
18{
19public:
20 static constexpr bool isTrivial()
21 {
22 return false;
23 }
24
25 /// Maximum number of unique values to which the correction should apply
26 /// from the LinearCounting algorithm.
27 static double getThreshold()
28 {
29 return BiasData::getThreshold();
30 }
31
32 /// Return the error estimate.
33 static double getBias(double raw_estimate)
34 {
35 const auto & estimates = BiasData::getRawEstimates();
36 const auto & biases = BiasData::getBiases();
37
38 auto it = std::lower_bound(estimates.begin(), estimates.end(), raw_estimate);
39
40 if (it == estimates.end())
41 {
42 return biases[estimates.size() - 1];
43 }
44 else if (*it == raw_estimate)
45 {
46 size_t index = std::distance(estimates.begin(), it);
47 return biases[index];
48 }
49 else if (it == estimates.begin())
50 {
51 return biases[0];
52 }
53 else
54 {
55 /// We get the error estimate by linear interpolation.
56 size_t index = std::distance(estimates.begin(), it);
57
58 double estimate1 = estimates[index - 1];
59 double estimate2 = estimates[index];
60
61 double bias1 = biases[index - 1];
62 double bias2 = biases[index];
63 /// It is assumed that the estimate1 < estimate2 condition is always satisfied.
64 double slope = (bias2 - bias1) / (estimate2 - estimate1);
65
66 return bias1 + slope * (raw_estimate - estimate1);
67 }
68 }
69
70private:
71 /// Static checks.
72 using TRawEstimatesRef = decltype(BiasData::getRawEstimates());
73 using TRawEstimates = std::remove_reference_t<TRawEstimatesRef>;
74
75 using TBiasDataRef = decltype(BiasData::getBiases());
76 using TBiasData = std::remove_reference_t<TBiasDataRef>;
77
78 static_assert(std::is_same_v<TRawEstimates, TBiasData>, "Bias estimator data have inconsistent types");
79 static_assert(std::tuple_size<TRawEstimates>::value > 0, "Bias estimator has no raw estimate data");
80 static_assert(std::tuple_size<TBiasData>::value > 0, "Bias estimator has no bias data");
81 static_assert(std::tuple_size<TRawEstimates>::value == std::tuple_size<TBiasData>::value,
82 "Bias estimator has inconsistent data");
83};
84
85/** Trivial case of HyperLogLogBiasEstimator: used if we do not want to fix
86 * error. This has meaning for small values of the accuracy parameter, for example 5 or 12.
87 * Then the corrections from the original version of the HyperLogLog algorithm are applied.
88 * See "HyperLogLog: The analysis of a near-optimal cardinality estimation algorithm"
89 * (P. Flajolet et al., AOFA '07: Proceedings of the 2007 International Conference on Analysis
90 * of Algorithms)
91 */
92struct TrivialBiasEstimator
93{
94 static constexpr bool isTrivial()
95 {
96 return true;
97 }
98
99 static double getThreshold()
100 {
101 return 0.0;
102 }
103
104 static double getBias(double /*raw_estimate*/)
105 {
106 return 0.0;
107 }
108};
109