| 1 | #pragma once |
| 2 | |
| 3 | #include <cmath> |
| 4 | #include <Common/RadixSort.h> |
| 5 | #include <Common/PODArray.h> |
| 6 | #include <IO/WriteBuffer.h> |
| 7 | #include <IO/ReadBuffer.h> |
| 8 | #include <IO/VarInt.h> |
| 9 | |
| 10 | |
| 11 | namespace DB |
| 12 | { |
| 13 | |
namespace ErrorCodes
{
    /// Thrown by QuantileTDigest::deserialize() below when the serialized
    /// summary size exceeds params.max_unmerged (guards against corrupt input).
    extern const int TOO_LARGE_ARRAY_SIZE;
}
| 18 | |
| 19 | |
| 20 | /** The algorithm was implemented by Alexei Borzenkov https://github.com/snaury |
| 21 | * He owns the authorship of the code and half the comments in this namespace, |
| 22 | * except for merging, serialization, and sorting, as well as selecting types and other changes. |
| 23 | * We thank Alexei Borzenkov for writing the original code. |
| 24 | */ |
| 25 | |
| 26 | /** Implementation of t-digest algorithm (https://github.com/tdunning/t-digest). |
| 27 | * This option is very similar to MergingDigest on java, however the decision about |
| 28 | * the union is accepted based on the original condition from the article |
| 29 | * (via a size constraint, using the approximation of the quantile of each |
| 30 | * centroid, not the distance on the curve of the position of their boundaries). MergingDigest |
| 31 | * on java gives significantly fewer centroids than this variant, that |
| 32 | * negatively affects accuracy with the same compression factor, but gives |
| 33 | * size guarantees. The author himself on the proposal for this variant said that |
| 34 | * the size of the digest grows like O(log(n)), while the version on java |
| 35 | * does not depend on the expected number of points. Also an variant on java |
| 36 | * uses asin, which slows down the algorithm a bit. |
| 37 | */ |
| 38 | template <typename T> |
| 39 | class QuantileTDigest |
| 40 | { |
| 41 | using Value = Float32; |
| 42 | using Count = Float32; |
| 43 | |
| 44 | /** The centroid stores the weight of points around their mean value |
| 45 | */ |
| 46 | struct Centroid |
| 47 | { |
| 48 | Value mean; |
| 49 | Count count; |
| 50 | |
| 51 | Centroid() = default; |
| 52 | |
| 53 | explicit Centroid(Value mean_, Count count_) |
| 54 | : mean(mean_) |
| 55 | , count(count_) |
| 56 | {} |
| 57 | |
| 58 | Centroid & operator+=(const Centroid & other) |
| 59 | { |
| 60 | count += other.count; |
| 61 | mean += other.count * (other.mean - mean) / count; |
| 62 | return *this; |
| 63 | } |
| 64 | |
| 65 | bool operator<(const Centroid & other) const |
| 66 | { |
| 67 | return mean < other.mean; |
| 68 | } |
| 69 | }; |
| 70 | |
| 71 | |
| 72 | /** :param epsilon: value \delta from the article - error in the range |
| 73 | * quantile 0.5 (default is 0.01, i.e. 1%) |
| 74 | * :param max_unmerged: when accumulating count of new points beyond this |
| 75 | * value centroid compression is triggered |
| 76 | * (default is 2048, the higher the value - the |
| 77 | * more memory is required, but amortization of execution time increases) |
| 78 | */ |
| 79 | struct Params |
| 80 | { |
| 81 | Value epsilon = 0.01; |
| 82 | size_t max_unmerged = 2048; |
| 83 | }; |
| 84 | |
| 85 | Params params; |
| 86 | |
| 87 | /// The memory will be allocated to several elements at once, so that the state occupies 64 bytes. |
| 88 | static constexpr size_t bytes_in_arena = 128 - sizeof(PODArray<Centroid>) - sizeof(Count) - sizeof(UInt32); |
| 89 | using Summary = PODArrayWithStackMemory<Centroid, bytes_in_arena>; |
| 90 | |
| 91 | Summary summary; |
| 92 | Count count = 0; |
| 93 | UInt32 unmerged = 0; |
| 94 | |
| 95 | /** Linear interpolation at the point x on the line (x1, y1)..(x2, y2) |
| 96 | */ |
| 97 | static Value interpolate(Value x, Value x1, Value y1, Value x2, Value y2) |
| 98 | { |
| 99 | double k = (x - x1) / (x2 - x1); |
| 100 | return y1 + k * (y2 - y1); |
| 101 | } |
| 102 | |
| 103 | struct RadixSortTraits |
| 104 | { |
| 105 | using Element = Centroid; |
| 106 | using Key = Value; |
| 107 | using CountType = UInt32; |
| 108 | using KeyBits = UInt32; |
| 109 | |
| 110 | static constexpr size_t PART_SIZE_BITS = 8; |
| 111 | |
| 112 | using Transform = RadixSortFloatTransform<KeyBits>; |
| 113 | using Allocator = RadixSortMallocAllocator; |
| 114 | |
| 115 | /// The function to get the key from an array element. |
| 116 | static Key & (Element & elem) { return elem.mean; } |
| 117 | }; |
| 118 | |
| 119 | /** Adds a centroid `c` to the digest |
| 120 | */ |
| 121 | void addCentroid(const Centroid & c) |
| 122 | { |
| 123 | summary.push_back(c); |
| 124 | count += c.count; |
| 125 | ++unmerged; |
| 126 | if (unmerged >= params.max_unmerged) |
| 127 | compress(); |
| 128 | } |
| 129 | |
| 130 | /** Performs compression of accumulated centroids |
| 131 | * When merging, the invariant is retained to the maximum size of each |
| 132 | * centroid that does not exceed `4 q (1 - q) \ delta N`. |
| 133 | */ |
| 134 | void compress() |
| 135 | { |
| 136 | if (unmerged > 0) |
| 137 | { |
| 138 | RadixSort<RadixSortTraits>::executeLSD(summary.data(), summary.size()); |
| 139 | |
| 140 | if (summary.size() > 3) |
| 141 | { |
| 142 | /// A pair of consecutive bars of the histogram. |
| 143 | auto l = summary.begin(); |
| 144 | auto r = std::next(l); |
| 145 | |
| 146 | Count sum = 0; |
| 147 | while (r != summary.end()) |
| 148 | { |
| 149 | // we use quantile which gives us the smallest error |
| 150 | |
| 151 | /// The ratio of the part of the histogram to l, including the half l to the entire histogram. That is, what level quantile in position l. |
| 152 | Value ql = (sum + l->count * 0.5) / count; |
| 153 | Value err = ql * (1 - ql); |
| 154 | |
| 155 | /// The ratio of the portion of the histogram to l, including l and half r to the entire histogram. That is, what level is the quantile in position r. |
| 156 | Value qr = (sum + l->count + r->count * 0.5) / count; |
| 157 | Value err2 = qr * (1 - qr); |
| 158 | |
| 159 | if (err > err2) |
| 160 | err = err2; |
| 161 | |
| 162 | Value k = 4 * count * err * params.epsilon; |
| 163 | |
| 164 | /** The ratio of the weight of the glued column pair to all values is not greater, |
| 165 | * than epsilon multiply by a certain quadratic coefficient, which in the median is 1 (4 * 1/2 * 1/2), |
| 166 | * and at the edges decreases and is approximately equal to the distance to the edge * 4. |
| 167 | */ |
| 168 | |
| 169 | if (l->count + r->count <= k) |
| 170 | { |
| 171 | // it is possible to merge left and right |
| 172 | /// The left column "eats" the right. |
| 173 | *l += *r; |
| 174 | } |
| 175 | else |
| 176 | { |
| 177 | // not enough capacity, check the next pair |
| 178 | sum += l->count; |
| 179 | ++l; |
| 180 | |
| 181 | /// We skip all the values "eaten" earlier. |
| 182 | if (l != r) |
| 183 | *l = *r; |
| 184 | } |
| 185 | ++r; |
| 186 | } |
| 187 | |
| 188 | /// At the end of the loop, all values to the right of l were "eaten". |
| 189 | summary.resize(l - summary.begin() + 1); |
| 190 | } |
| 191 | |
| 192 | unmerged = 0; |
| 193 | } |
| 194 | } |
| 195 | |
| 196 | public: |
| 197 | /** Adds to the digest a change in `x` with a weight of `cnt` (default 1) |
| 198 | */ |
| 199 | void add(T x, UInt64 cnt = 1) |
| 200 | { |
| 201 | addCentroid(Centroid(Value(x), Count(cnt))); |
| 202 | } |
| 203 | |
| 204 | void merge(const QuantileTDigest & other) |
| 205 | { |
| 206 | for (const auto & c : other.summary) |
| 207 | addCentroid(c); |
| 208 | } |
| 209 | |
| 210 | void serialize(WriteBuffer & buf) |
| 211 | { |
| 212 | compress(); |
| 213 | writeVarUInt(summary.size(), buf); |
| 214 | buf.write(reinterpret_cast<const char *>(summary.data()), summary.size() * sizeof(summary[0])); |
| 215 | } |
| 216 | |
| 217 | void deserialize(ReadBuffer & buf) |
| 218 | { |
| 219 | size_t size = 0; |
| 220 | readVarUInt(size, buf); |
| 221 | |
| 222 | if (size > params.max_unmerged) |
| 223 | throw Exception("Too large t-digest summary size" , ErrorCodes::TOO_LARGE_ARRAY_SIZE); |
| 224 | |
| 225 | summary.resize(size); |
| 226 | buf.read(reinterpret_cast<char *>(summary.data()), size * sizeof(summary[0])); |
| 227 | |
| 228 | count = 0; |
| 229 | for (const auto & c : summary) |
| 230 | count += c.count; |
| 231 | } |
| 232 | |
| 233 | /** Calculates the quantile q [0, 1] based on the digest. |
| 234 | * For an empty digest returns NaN. |
| 235 | */ |
| 236 | template <typename ResultType> |
| 237 | ResultType getImpl(Float64 level) |
| 238 | { |
| 239 | if (summary.empty()) |
| 240 | return std::is_floating_point_v<ResultType> ? NAN : 0; |
| 241 | |
| 242 | compress(); |
| 243 | |
| 244 | if (summary.size() == 1) |
| 245 | return summary.front().mean; |
| 246 | |
| 247 | Float64 x = level * count; |
| 248 | Float64 prev_x = 0; |
| 249 | Count sum = 0; |
| 250 | Value prev_mean = summary.front().mean; |
| 251 | |
| 252 | for (const auto & c : summary) |
| 253 | { |
| 254 | Float64 current_x = sum + c.count * 0.5; |
| 255 | |
| 256 | if (current_x >= x) |
| 257 | return interpolate(x, prev_x, prev_mean, current_x, c.mean); |
| 258 | |
| 259 | sum += c.count; |
| 260 | prev_mean = c.mean; |
| 261 | prev_x = current_x; |
| 262 | } |
| 263 | |
| 264 | return summary.back().mean; |
| 265 | } |
| 266 | |
| 267 | /** Get multiple quantiles (`size` parts). |
| 268 | * levels - an array of levels of the desired quantiles. They are in a random order. |
| 269 | * levels_permutation - array-permutation levels. The i-th position will be the index of the i-th ascending level in the `levels` array. |
| 270 | * result - the array where the results are added, in order of `levels`, |
| 271 | */ |
| 272 | template <typename ResultType> |
| 273 | void getManyImpl(const Float64 * levels, const size_t * levels_permutation, size_t size, ResultType * result) |
| 274 | { |
| 275 | if (summary.empty()) |
| 276 | { |
| 277 | for (size_t result_num = 0; result_num < size; ++result_num) |
| 278 | result[result_num] = std::is_floating_point_v<ResultType> ? NAN : 0; |
| 279 | return; |
| 280 | } |
| 281 | |
| 282 | compress(); |
| 283 | |
| 284 | if (summary.size() == 1) |
| 285 | { |
| 286 | for (size_t result_num = 0; result_num < size; ++result_num) |
| 287 | result[result_num] = summary.front().mean; |
| 288 | return; |
| 289 | } |
| 290 | |
| 291 | Float64 x = levels[levels_permutation[0]] * count; |
| 292 | Float64 prev_x = 0; |
| 293 | Count sum = 0; |
| 294 | Value prev_mean = summary.front().mean; |
| 295 | |
| 296 | size_t result_num = 0; |
| 297 | for (const auto & c : summary) |
| 298 | { |
| 299 | Float64 current_x = sum + c.count * 0.5; |
| 300 | |
| 301 | while (current_x >= x) |
| 302 | { |
| 303 | result[levels_permutation[result_num]] = interpolate(x, prev_x, prev_mean, current_x, c.mean); |
| 304 | |
| 305 | ++result_num; |
| 306 | if (result_num >= size) |
| 307 | return; |
| 308 | |
| 309 | x = levels[levels_permutation[result_num]] * count; |
| 310 | } |
| 311 | |
| 312 | sum += c.count; |
| 313 | prev_mean = c.mean; |
| 314 | prev_x = current_x; |
| 315 | } |
| 316 | |
| 317 | auto rest_of_results = summary.back().mean; |
| 318 | for (; result_num < size; ++result_num) |
| 319 | result[levels_permutation[result_num]] = rest_of_results; |
| 320 | } |
| 321 | |
| 322 | T get(Float64 level) |
| 323 | { |
| 324 | return getImpl<T>(level); |
| 325 | } |
| 326 | |
| 327 | Float32 getFloat(Float64 level) |
| 328 | { |
| 329 | return getImpl<Float32>(level); |
| 330 | } |
| 331 | |
| 332 | void getMany(const Float64 * levels, const size_t * indices, size_t size, T * result) |
| 333 | { |
| 334 | getManyImpl(levels, indices, size, result); |
| 335 | } |
| 336 | |
| 337 | void getManyFloat(const Float64 * levels, const size_t * indices, size_t size, Float32 * result) |
| 338 | { |
| 339 | getManyImpl(levels, indices, size, result); |
| 340 | } |
| 341 | }; |
| 342 | |
| 343 | } |
| 344 | |