HyperLogLogBiasEstimator.h source code [ClickHouse/dbms/src/Common/HyperLogLogBiasEstimator.h]

1	#pragma once
2
3	#include <Common/Exception.h>
4
5	#include <algorithm>
6	#include <limits>
7	#include <tuple>
8	#include <type_traits>
9
10	/* This class provides a way to evaluate the error in the result of applying the HyperLogLog algorithm.*
11	* Empirical observations show that large errors occur at E < 5 * 2^precision, where
12	* E is the return value of the HyperLogLog algorithm, and `precision` is the HyperLogLog precision parameter.
13	* See "HyperLogLog in Practice: Algorithmic Engineering of a State of the Art Cardinality Estimation Algorithm".
14	* (S. Heule et al., Proceedings of the EDBT 2013 Conference).
15	*/
16	template <typename BiasData>
17	class HyperLogLogBiasEstimator
18	{
19	public:
20	static constexpr bool isTrivial()
21	{
22	return false;
23	}
24
25	/// Maximum number of unique values to which the correction should apply
26	/// from the LinearCounting algorithm.
27	static double getThreshold()
28	{
29	return BiasData::getThreshold();
30	}
31
32	/// Return the error estimate.
33	static double getBias(double raw_estimate)
34	{
35	const auto & estimates = BiasData::getRawEstimates();
36	const auto & biases = BiasData::getBiases();
37
38	auto it = std::lower_bound(estimates.begin(), estimates.end(), raw_estimate);
39
40	if (it == estimates.end())
41	{
42	return biases[estimates.size() - `1`];
43	}
44	else if (*it == raw_estimate)
45	{
46	size_t index = std::distance(estimates.begin(), it);
47	return biases[index];
48	}
49	else if (it == estimates.begin())
50	{
51	return biases[`0`];
52	}
53	else
54	{
55	/// We get the error estimate by linear interpolation.
56	size_t index = std::distance(estimates.begin(), it);
57
58	double estimate1 = estimates[index - `1`];
59	double estimate2 = estimates[index];
60
61	double bias1 = biases[index - `1`];
62	double bias2 = biases[index];
63	/// It is assumed that the estimate1 < estimate2 condition is always satisfied.
64	double slope = (bias2 - bias1) / (estimate2 - estimate1);
65
66	return bias1 + slope * (raw_estimate - estimate1);
67	}
68	}
69
70	private:
71	/// Static checks.
72	using TRawEstimatesRef = decltype(BiasData::getRawEstimates());
73	using TRawEstimates = std::remove_reference_t<TRawEstimatesRef>;
74
75	using TBiasDataRef = decltype(BiasData::getBiases());
76	using TBiasData = std::remove_reference_t<TBiasDataRef>;
77
78	static_assert(std::is_same_v<TRawEstimates, TBiasData>, "Bias estimator data have inconsistent types");
79	static_assert(std::tuple_size<TRawEstimates>::value > `0`, "Bias estimator has no raw estimate data");
80	static_assert(std::tuple_size<TBiasData>::value > `0`, "Bias estimator has no bias data");
81	static_assert(std::tuple_size<TRawEstimates>::value == std::tuple_size<TBiasData>::value,
82	"Bias estimator has inconsistent data");
83	};
84
85	/* Trivial case of HyperLogLogBiasEstimator: used if we do not want to fix*
86	* error. This has meaning for small values of the accuracy parameter, for example 5 or 12.
87	* Then the corrections from the original version of the HyperLogLog algorithm are applied.
88	* See "HyperLogLog: The analysis of a near-optimal cardinality estimation algorithm"
89	* (P. Flajolet et al., AOFA '07: Proceedings of the 2007 International Conference on Analysis
90	* of Algorithms)
91	*/
92	struct TrivialBiasEstimator
93	{
94	static constexpr bool isTrivial()
95	{
96	return true;
97	}
98
99	static double getThreshold()
100	{
101	return `0.0`;
102	}
103
104	static double getBias(double /raw_estimate/)
105	{
106	return `0.0`;
107	}
108	};
109

Browse the source code of ClickHouse/dbms/src/Common/HyperLogLogBiasEstimator.h