1 | #pragma once |
2 | |
3 | #include <cmath> |
4 | #include <Common/RadixSort.h> |
5 | #include <Common/PODArray.h> |
6 | #include <IO/WriteBuffer.h> |
7 | #include <IO/ReadBuffer.h> |
8 | #include <IO/VarInt.h> |
9 | |
10 | |
11 | namespace DB |
12 | { |
13 | |
/// Error codes referenced by this header; the actual values are defined in a project source file.
namespace ErrorCodes
{
    extern const int TOO_LARGE_ARRAY_SIZE;
}
18 | |
19 | |
20 | /** The algorithm was implemented by Alexei Borzenkov https://github.com/snaury |
21 | * He owns the authorship of the code and half the comments in this namespace, |
22 | * except for merging, serialization, and sorting, as well as selecting types and other changes. |
23 | * We thank Alexei Borzenkov for writing the original code. |
24 | */ |
25 | |
26 | /** Implementation of t-digest algorithm (https://github.com/tdunning/t-digest). |
27 | * This option is very similar to MergingDigest on java, however the decision about |
28 | * the union is accepted based on the original condition from the article |
29 | * (via a size constraint, using the approximation of the quantile of each |
30 | * centroid, not the distance on the curve of the position of their boundaries). MergingDigest |
31 | * on java gives significantly fewer centroids than this variant, that |
32 | * negatively affects accuracy with the same compression factor, but gives |
33 | * size guarantees. The author himself on the proposal for this variant said that |
34 | * the size of the digest grows like O(log(n)), while the version on java |
 * does not depend on the expected number of points. Also, the variant on java
 * uses asin, which slows down the algorithm a bit.
37 | */ |
38 | template <typename T> |
39 | class QuantileTDigest |
40 | { |
41 | using Value = Float32; |
42 | using Count = Float32; |
43 | |
44 | /** The centroid stores the weight of points around their mean value |
45 | */ |
46 | struct Centroid |
47 | { |
48 | Value mean; |
49 | Count count; |
50 | |
51 | Centroid() = default; |
52 | |
53 | explicit Centroid(Value mean_, Count count_) |
54 | : mean(mean_) |
55 | , count(count_) |
56 | {} |
57 | |
58 | Centroid & operator+=(const Centroid & other) |
59 | { |
60 | count += other.count; |
61 | mean += other.count * (other.mean - mean) / count; |
62 | return *this; |
63 | } |
64 | |
65 | bool operator<(const Centroid & other) const |
66 | { |
67 | return mean < other.mean; |
68 | } |
69 | }; |
70 | |
71 | |
72 | /** :param epsilon: value \delta from the article - error in the range |
73 | * quantile 0.5 (default is 0.01, i.e. 1%) |
74 | * :param max_unmerged: when accumulating count of new points beyond this |
75 | * value centroid compression is triggered |
76 | * (default is 2048, the higher the value - the |
77 | * more memory is required, but amortization of execution time increases) |
78 | */ |
79 | struct Params |
80 | { |
81 | Value epsilon = 0.01; |
82 | size_t max_unmerged = 2048; |
83 | }; |
84 | |
85 | Params params; |
86 | |
87 | /// The memory will be allocated to several elements at once, so that the state occupies 64 bytes. |
88 | static constexpr size_t bytes_in_arena = 128 - sizeof(PODArray<Centroid>) - sizeof(Count) - sizeof(UInt32); |
89 | using Summary = PODArrayWithStackMemory<Centroid, bytes_in_arena>; |
90 | |
91 | Summary summary; |
92 | Count count = 0; |
93 | UInt32 unmerged = 0; |
94 | |
95 | /** Linear interpolation at the point x on the line (x1, y1)..(x2, y2) |
96 | */ |
97 | static Value interpolate(Value x, Value x1, Value y1, Value x2, Value y2) |
98 | { |
99 | double k = (x - x1) / (x2 - x1); |
100 | return y1 + k * (y2 - y1); |
101 | } |
102 | |
103 | struct RadixSortTraits |
104 | { |
105 | using Element = Centroid; |
106 | using Key = Value; |
107 | using CountType = UInt32; |
108 | using KeyBits = UInt32; |
109 | |
110 | static constexpr size_t PART_SIZE_BITS = 8; |
111 | |
112 | using Transform = RadixSortFloatTransform<KeyBits>; |
113 | using Allocator = RadixSortMallocAllocator; |
114 | |
115 | /// The function to get the key from an array element. |
116 | static Key & (Element & elem) { return elem.mean; } |
117 | }; |
118 | |
119 | /** Adds a centroid `c` to the digest |
120 | */ |
121 | void addCentroid(const Centroid & c) |
122 | { |
123 | summary.push_back(c); |
124 | count += c.count; |
125 | ++unmerged; |
126 | if (unmerged >= params.max_unmerged) |
127 | compress(); |
128 | } |
129 | |
130 | /** Performs compression of accumulated centroids |
131 | * When merging, the invariant is retained to the maximum size of each |
132 | * centroid that does not exceed `4 q (1 - q) \ delta N`. |
133 | */ |
134 | void compress() |
135 | { |
136 | if (unmerged > 0) |
137 | { |
138 | RadixSort<RadixSortTraits>::executeLSD(summary.data(), summary.size()); |
139 | |
140 | if (summary.size() > 3) |
141 | { |
142 | /// A pair of consecutive bars of the histogram. |
143 | auto l = summary.begin(); |
144 | auto r = std::next(l); |
145 | |
146 | Count sum = 0; |
147 | while (r != summary.end()) |
148 | { |
149 | // we use quantile which gives us the smallest error |
150 | |
151 | /// The ratio of the part of the histogram to l, including the half l to the entire histogram. That is, what level quantile in position l. |
152 | Value ql = (sum + l->count * 0.5) / count; |
153 | Value err = ql * (1 - ql); |
154 | |
155 | /// The ratio of the portion of the histogram to l, including l and half r to the entire histogram. That is, what level is the quantile in position r. |
156 | Value qr = (sum + l->count + r->count * 0.5) / count; |
157 | Value err2 = qr * (1 - qr); |
158 | |
159 | if (err > err2) |
160 | err = err2; |
161 | |
162 | Value k = 4 * count * err * params.epsilon; |
163 | |
164 | /** The ratio of the weight of the glued column pair to all values is not greater, |
165 | * than epsilon multiply by a certain quadratic coefficient, which in the median is 1 (4 * 1/2 * 1/2), |
166 | * and at the edges decreases and is approximately equal to the distance to the edge * 4. |
167 | */ |
168 | |
169 | if (l->count + r->count <= k) |
170 | { |
171 | // it is possible to merge left and right |
172 | /// The left column "eats" the right. |
173 | *l += *r; |
174 | } |
175 | else |
176 | { |
177 | // not enough capacity, check the next pair |
178 | sum += l->count; |
179 | ++l; |
180 | |
181 | /// We skip all the values "eaten" earlier. |
182 | if (l != r) |
183 | *l = *r; |
184 | } |
185 | ++r; |
186 | } |
187 | |
188 | /// At the end of the loop, all values to the right of l were "eaten". |
189 | summary.resize(l - summary.begin() + 1); |
190 | } |
191 | |
192 | unmerged = 0; |
193 | } |
194 | } |
195 | |
196 | public: |
197 | /** Adds to the digest a change in `x` with a weight of `cnt` (default 1) |
198 | */ |
199 | void add(T x, UInt64 cnt = 1) |
200 | { |
201 | addCentroid(Centroid(Value(x), Count(cnt))); |
202 | } |
203 | |
204 | void merge(const QuantileTDigest & other) |
205 | { |
206 | for (const auto & c : other.summary) |
207 | addCentroid(c); |
208 | } |
209 | |
210 | void serialize(WriteBuffer & buf) |
211 | { |
212 | compress(); |
213 | writeVarUInt(summary.size(), buf); |
214 | buf.write(reinterpret_cast<const char *>(summary.data()), summary.size() * sizeof(summary[0])); |
215 | } |
216 | |
217 | void deserialize(ReadBuffer & buf) |
218 | { |
219 | size_t size = 0; |
220 | readVarUInt(size, buf); |
221 | |
222 | if (size > params.max_unmerged) |
223 | throw Exception("Too large t-digest summary size" , ErrorCodes::TOO_LARGE_ARRAY_SIZE); |
224 | |
225 | summary.resize(size); |
226 | buf.read(reinterpret_cast<char *>(summary.data()), size * sizeof(summary[0])); |
227 | |
228 | count = 0; |
229 | for (const auto & c : summary) |
230 | count += c.count; |
231 | } |
232 | |
233 | /** Calculates the quantile q [0, 1] based on the digest. |
234 | * For an empty digest returns NaN. |
235 | */ |
236 | template <typename ResultType> |
237 | ResultType getImpl(Float64 level) |
238 | { |
239 | if (summary.empty()) |
240 | return std::is_floating_point_v<ResultType> ? NAN : 0; |
241 | |
242 | compress(); |
243 | |
244 | if (summary.size() == 1) |
245 | return summary.front().mean; |
246 | |
247 | Float64 x = level * count; |
248 | Float64 prev_x = 0; |
249 | Count sum = 0; |
250 | Value prev_mean = summary.front().mean; |
251 | |
252 | for (const auto & c : summary) |
253 | { |
254 | Float64 current_x = sum + c.count * 0.5; |
255 | |
256 | if (current_x >= x) |
257 | return interpolate(x, prev_x, prev_mean, current_x, c.mean); |
258 | |
259 | sum += c.count; |
260 | prev_mean = c.mean; |
261 | prev_x = current_x; |
262 | } |
263 | |
264 | return summary.back().mean; |
265 | } |
266 | |
267 | /** Get multiple quantiles (`size` parts). |
268 | * levels - an array of levels of the desired quantiles. They are in a random order. |
269 | * levels_permutation - array-permutation levels. The i-th position will be the index of the i-th ascending level in the `levels` array. |
270 | * result - the array where the results are added, in order of `levels`, |
271 | */ |
272 | template <typename ResultType> |
273 | void getManyImpl(const Float64 * levels, const size_t * levels_permutation, size_t size, ResultType * result) |
274 | { |
275 | if (summary.empty()) |
276 | { |
277 | for (size_t result_num = 0; result_num < size; ++result_num) |
278 | result[result_num] = std::is_floating_point_v<ResultType> ? NAN : 0; |
279 | return; |
280 | } |
281 | |
282 | compress(); |
283 | |
284 | if (summary.size() == 1) |
285 | { |
286 | for (size_t result_num = 0; result_num < size; ++result_num) |
287 | result[result_num] = summary.front().mean; |
288 | return; |
289 | } |
290 | |
291 | Float64 x = levels[levels_permutation[0]] * count; |
292 | Float64 prev_x = 0; |
293 | Count sum = 0; |
294 | Value prev_mean = summary.front().mean; |
295 | |
296 | size_t result_num = 0; |
297 | for (const auto & c : summary) |
298 | { |
299 | Float64 current_x = sum + c.count * 0.5; |
300 | |
301 | while (current_x >= x) |
302 | { |
303 | result[levels_permutation[result_num]] = interpolate(x, prev_x, prev_mean, current_x, c.mean); |
304 | |
305 | ++result_num; |
306 | if (result_num >= size) |
307 | return; |
308 | |
309 | x = levels[levels_permutation[result_num]] * count; |
310 | } |
311 | |
312 | sum += c.count; |
313 | prev_mean = c.mean; |
314 | prev_x = current_x; |
315 | } |
316 | |
317 | auto rest_of_results = summary.back().mean; |
318 | for (; result_num < size; ++result_num) |
319 | result[levels_permutation[result_num]] = rest_of_results; |
320 | } |
321 | |
322 | T get(Float64 level) |
323 | { |
324 | return getImpl<T>(level); |
325 | } |
326 | |
327 | Float32 getFloat(Float64 level) |
328 | { |
329 | return getImpl<Float32>(level); |
330 | } |
331 | |
332 | void getMany(const Float64 * levels, const size_t * indices, size_t size, T * result) |
333 | { |
334 | getManyImpl(levels, indices, size, result); |
335 | } |
336 | |
337 | void getManyFloat(const Float64 * levels, const size_t * indices, size_t size, Float32 * result) |
338 | { |
339 | getManyImpl(levels, indices, size, result); |
340 | } |
341 | }; |
342 | |
343 | } |
344 | |