/*
 * Licensed to Derrick R. Burns under one or more
 * contributor license agreements. See the NOTICES file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <cstddef>
#include <limits>
#include <queue>
#include <utility>
#include <vector>

#ifdef min
#undef min
#endif

#ifdef max
#undef max
#endif

namespace duckdb_tdigest {

using Value = double;
using Weight = double;
using Index = size_t;

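// cap on the combined centroid count of a batch of digests merged at once by add();
// when a batch reaches this size it is folded into this digest and the batch restarts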
const size_t kHighWater = 40000;
const double pi = 3.14159265358979323846;

class Centroid {
public:
	Centroid() : Centroid(0.0, 0.0) {
	}

	Centroid(Value mean, Weight weight) : mean_(mean), weight_(weight) {
	}

	inline Value mean() const noexcept {
		return mean_;
	}

	inline Weight weight() const noexcept {
		return weight_;
	}

	inline void add(const Centroid &c) {
		// CHECK_GT(c.weight_, 0);
		if (weight_ != 0.0) {
			weight_ += c.weight_;
			mean_ += c.weight_ * (c.mean_ - mean_) / weight_;
		} else {
			weight_ = c.weight_;
			mean_ = c.mean_;
		}
	}

private:
	Value mean_ = 0;
	Weight weight_ = 0;
};

struct CentroidList {
	explicit CentroidList(const std::vector<Centroid> &s) : iter(s.cbegin()), end(s.cend()) {
	}
	std::vector<Centroid>::const_iterator iter;
	std::vector<Centroid>::const_iterator end;

	bool advance() {
		return ++iter != end;
	}
};

class CentroidListComparator {
public:
	CentroidListComparator() {
	}

	bool operator()(const CentroidList &left, const CentroidList &right) const {
		return left.iter->mean() > right.iter->mean();
	}
};

using CentroidListQueue = std::priority_queue<CentroidList, std::vector<CentroidList>, CentroidListComparator>;

struct CentroidComparator {
	bool operator()(const Centroid &a, const Centroid &b) const {
		return a.mean() < b.mean();
	}
};

class TDigest {
	class TDigestComparator {
	public:
		TDigestComparator() {
		}

		bool operator()(const TDigest *left, const TDigest *right) const {
			return left->totalSize() > right->totalSize();
		}
	};

	using TDigestQueue = std::priority_queue<const TDigest *, std::vector<const TDigest *>, TDigestComparator>;

public:
	TDigest() : TDigest(1000) {
	}

	explicit TDigest(Value compression) : TDigest(compression, 0) {
	}

	TDigest(Value compression, Index bufferSize) : TDigest(compression, bufferSize, 0) {
	}

	TDigest(Value compression, Index unmergedSize, Index mergedSize)
	    : compression_(compression), maxProcessed_(processedSize(mergedSize, compression)),
	      maxUnprocessed_(unprocessedSize(unmergedSize, compression)) {
		processed_.reserve(maxProcessed_);
		unprocessed_.reserve(maxUnprocessed_ + 1);
	}

	TDigest(std::vector<Centroid> &&processed, std::vector<Centroid> &&unprocessed, Value compression,
	        Index unmergedSize, Index mergedSize)
	    : TDigest(compression, unmergedSize, mergedSize) {
		processed_ = std::move(processed);
		unprocessed_ = std::move(unprocessed);

		processedWeight_ = weight(processed_);
		unprocessedWeight_ = weight(unprocessed_);
		if (!processed_.empty()) {
			min_ = std::min(min_, processed_[0].mean());
			max_ = std::max(max_, (processed_.cend() - 1)->mean());
		}
		updateCumulative();
	}

	static Weight weight(std::vector<Centroid> &centroids) noexcept {
		Weight w = 0.0;
		for (auto centroid : centroids) {
			w += centroid.weight();
		}
		return w;
	}

	TDigest &operator=(TDigest &&o) {
		compression_ = o.compression_;
		maxProcessed_ = o.maxProcessed_;
		maxUnprocessed_ = o.maxUnprocessed_;
		processedWeight_ = o.processedWeight_;
		unprocessedWeight_ = o.unprocessedWeight_;
		processed_ = std::move(o.processed_);
		unprocessed_ = std::move(o.unprocessed_);
		cumulative_ = std::move(o.cumulative_);
		min_ = o.min_;
		max_ = o.max_;
		return *this;
	}

	TDigest(TDigest &&o)
	    : TDigest(std::move(o.processed_), std::move(o.unprocessed_), o.compression_, o.maxUnprocessed_,
	              o.maxProcessed_) {
	}

	static inline Index processedSize(Index size, Value compression) noexcept {
		return (size == 0) ? static_cast<Index>(2 * std::ceil(compression)) : size;
	}

	static inline Index unprocessedSize(Index size, Value compression) noexcept {
		return (size == 0) ? static_cast<Index>(8 * std::ceil(compression)) : size;
	}

	// merge in another t-digest
	inline void merge(const TDigest *other) {
		std::vector<const TDigest *> others {other};
		add(others.cbegin(), others.cend());
	}

	const std::vector<Centroid> &processed() const {
		return processed_;
	}

	const std::vector<Centroid> &unprocessed() const {
		return unprocessed_;
	}

	Index maxUnprocessed() const {
		return maxUnprocessed_;
	}

	Index maxProcessed() const {
		return maxProcessed_;
	}

	inline void add(std::vector<const TDigest *> digests) {
		add(digests.cbegin(), digests.cend());
	}

	// merge in a vector of tdigests in the most efficient manner possible
	// in constant space
	// works for any value of kHighWater
	void add(std::vector<const TDigest *>::const_iterator iter, std::vector<const TDigest *>::const_iterator end) {
		if (iter != end) {
			auto size = std::distance(iter, end);
			TDigestQueue pq(TDigestComparator {});
			for (; iter != end; iter++) {
				pq.push((*iter));
			}
			std::vector<const TDigest *> batch;
			batch.reserve(size);

			size_t totalSize = 0;
			while (!pq.empty()) {
				auto td = pq.top();
				batch.push_back(td);
				pq.pop();
				totalSize += td->totalSize();
				if (totalSize >= kHighWater || pq.empty()) {
					mergeProcessed(batch);
					mergeUnprocessed(batch);
					processIfNecessary();
					batch.clear();
					totalSize = 0;
				}
			}
			updateCumulative();
		}
	}

	Weight processedWeight() const {
		return processedWeight_;
	}

	Weight unprocessedWeight() const {
		return unprocessedWeight_;
	}

	bool haveUnprocessed() const {
		return unprocessed_.size() > 0;
	}

	size_t totalSize() const {
		return processed_.size() + unprocessed_.size();
	}

	long totalWeight() const {
		return static_cast<long>(processedWeight_ + unprocessedWeight_);
	}

	// return the cdf on the t-digest
	Value cdf(Value x) {
		if (haveUnprocessed() || isDirty()) {
			process();
		}
		return cdfProcessed(x);
	}

	bool isDirty() {
		return processed_.size() > maxProcessed_ || unprocessed_.size() > maxUnprocessed_;
	}

	// return the cdf on the processed values
	Value cdfProcessed(Value x) const {
		if (processed_.empty()) {
			// no data to examine
			return 0.0;
		} else if (processed_.size() == 1) {
			// exactly one centroid, should have max_ == min_
			auto width = max_ - min_;
			if (x < min_) {
				return 0.0;
			} else if (x > max_) {
				return 1.0;
			} else if (x - min_ <= width) {
				// min_ and max_ are too close together to do any viable interpolation
				return 0.5;
			} else {
				// interpolate if somehow we have weight > 0 and max_ != min_
				return (x - min_) / (max_ - min_);
			}
		} else {
			auto n = processed_.size();
			if (x <= min_) {
				return 0;
			}

			if (x >= max_) {
				return 1;
			}

			// check for the left tail
			if (x <= mean(0)) {
				// note that this is different than mean(0) > min_ ... this guarantees interpolation works
				if (mean(0) - min_ > 0) {
					return (x - min_) / (mean(0) - min_) * weight(0) / processedWeight_ / 2.0;
				} else {
					return 0;
				}
			}

			// and the right tail
			if (x >= mean(n - 1)) {
				if (max_ - mean(n - 1) > 0) {
					return 1.0 - (max_ - x) / (max_ - mean(n - 1)) * weight(n - 1) / processedWeight_ / 2.0;
				} else {
					return 1;
				}
			}

			CentroidComparator cc;
			auto iter = std::upper_bound(processed_.cbegin(), processed_.cend(), Centroid(x, 0), cc);

			auto i = std::distance(processed_.cbegin(), iter);
			auto z1 = x - (iter - 1)->mean();
			auto z2 = (iter)->mean() - x;
			return weightedAverage(cumulative_[i - 1], z2, cumulative_[i], z1) / processedWeight_;
		}
	}

	// this returns a quantile on the t-digest
	Value quantile(Value q) {
		if (haveUnprocessed() || isDirty()) {
			process();
		}
		return quantileProcessed(q);
	}

	// this returns a quantile on the currently processed values without changing the t-digest
	// the value will not represent the unprocessed values
	Value quantileProcessed(Value q) const {
		if (q < 0 || q > 1) {
			return NAN;
		}

		if (processed_.size() == 0) {
			// no centroids means no data, no way to get a quantile
			return NAN;
		} else if (processed_.size() == 1) {
			// with one data point, all quantiles lead to Rome
			return mean(0);
		}

		// we know that there are at least two centroids now
		auto n = processed_.size();

		// if values were stored in a sorted array, index would be the offset we are interested in
		const auto index = q * processedWeight_;

		// at the boundaries, we return min_ or max_
		if (index <= weight(0) / 2.0) {
			return min_ + 2.0 * index / weight(0) * (mean(0) - min_);
		}

		auto iter = std::lower_bound(cumulative_.cbegin(), cumulative_.cend(), index);

		if (iter + 1 != cumulative_.cend()) {
			auto i = std::distance(cumulative_.cbegin(), iter);
			auto z1 = index - *(iter - 1);
			auto z2 = *(iter) - index;
			// LOG(INFO) << "z2 " << z2 << " index " << index << " z1 " << z1;
			return weightedAverage(mean(i - 1), z2, mean(i), z1);
		}
		auto z1 = index - processedWeight_ - weight(n - 1) / 2.0;
		auto z2 = weight(n - 1) / 2 - z1;
		return weightedAverage(mean(n - 1), z1, max_, z2);
	}

	Value compression() const {
		return compression_;
	}

	void add(Value x) {
		add(x, 1);
	}

	inline void compress() {
		process();
	}

	// add a single centroid to the unprocessed vector, processing previously unprocessed centroids if our limit has
	// been reached.
	inline bool add(Value x, Weight w) {
		if (std::isnan(x)) {
			return false;
		}
		unprocessed_.push_back(Centroid(x, w));
		unprocessedWeight_ += w;
		processIfNecessary();
		return true;
	}

	inline void add(std::vector<Centroid>::const_iterator iter, std::vector<Centroid>::const_iterator end) {
		while (iter != end) {
			const size_t diff = std::distance(iter, end);
			const size_t room = maxUnprocessed_ - unprocessed_.size();
			auto mid = iter + std::min(diff, room);
			while (iter != mid) {
				unprocessed_.push_back(*(iter++));
			}
			if (unprocessed_.size() >= maxUnprocessed_) {
				process();
			}
		}
	}

private:
	Value compression_;

	Value min_ = std::numeric_limits<Value>::max();

	// lowest finite value so that all-negative inputs still update max_
	Value max_ = std::numeric_limits<Value>::lowest();

	Index maxProcessed_;

	Index maxUnprocessed_;

	Value processedWeight_ = 0.0;

	Value unprocessedWeight_ = 0.0;

	std::vector<Centroid> processed_;

	std::vector<Centroid> unprocessed_;

	std::vector<Weight> cumulative_;

	// return mean of i-th centroid
	inline Value mean(int i) const noexcept {
		return processed_[i].mean();
	}

	// return weight of i-th centroid
	inline Weight weight(int i) const noexcept {
		return processed_[i].weight();
	}

	// append all unprocessed centroids into current unprocessed vector
	void mergeUnprocessed(const std::vector<const TDigest *> &tdigests) {
		if (tdigests.size() == 0) {
			return;
		}

		size_t total = unprocessed_.size();
		for (auto &td : tdigests) {
			total += td->unprocessed_.size();
		}

		unprocessed_.reserve(total);
		for (auto &td : tdigests) {
			unprocessed_.insert(unprocessed_.end(), td->unprocessed_.cbegin(), td->unprocessed_.cend());
			unprocessedWeight_ += td->unprocessedWeight_;
		}
	}

	// merge all processed centroids together into a single sorted vector
	void mergeProcessed(const std::vector<const TDigest *> &tdigests) {
		if (tdigests.size() == 0) {
			return;
		}

		size_t total = 0;
		CentroidListQueue pq(CentroidListComparator {});
		for (auto &td : tdigests) {
			auto &sorted = td->processed_;
			auto size = sorted.size();
			if (size > 0) {
				pq.push(CentroidList(sorted));
				total += size;
				processedWeight_ += td->processedWeight_;
			}
		}
		if (total == 0) {
			return;
		}

		if (processed_.size() > 0) {
			pq.push(CentroidList(processed_));
			total += processed_.size();
		}

		std::vector<Centroid> sorted;
		sorted.reserve(total);

		while (!pq.empty()) {
			auto best = pq.top();
			pq.pop();
			sorted.push_back(*(best.iter));
			if (best.advance()) {
				pq.push(best);
			}
		}
		processed_ = std::move(sorted);
		if (processed_.size() > 0) {
			min_ = std::min(min_, processed_[0].mean());
			max_ = std::max(max_, (processed_.cend() - 1)->mean());
		}
	}

	inline void processIfNecessary() {
		if (isDirty()) {
			process();
		}
	}

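	// rebuild cumulative_ from processed_: cumulative_[i] holds the total weight strictly to the
	// left of centroid i plus half of centroid i's own weight; the final entry is the total
	// processed weight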
	void updateCumulative() {
		const auto n = processed_.size();
		cumulative_.clear();
		cumulative_.reserve(n + 1);
		auto previous = 0.0;
		for (Index i = 0; i < n; i++) {
			auto current = weight(i);
			auto halfCurrent = current / 2.0;
			cumulative_.push_back(previous + halfCurrent);
			previous = previous + current;
		}
		cumulative_.push_back(previous);
	}

	// merges unprocessed_ centroids and processed_ centroids together and processes them
	// when complete, unprocessed_ will be empty and processed_ will have at most maxProcessed_ centroids
	inline void process() {
		CentroidComparator cc;
		std::sort(unprocessed_.begin(), unprocessed_.end(), cc);
		auto count = unprocessed_.size();
		unprocessed_.insert(unprocessed_.end(), processed_.cbegin(), processed_.cend());
		std::inplace_merge(unprocessed_.begin(), unprocessed_.begin() + count, unprocessed_.end(), cc);

		processedWeight_ += unprocessedWeight_;
		unprocessedWeight_ = 0;
		processed_.clear();

		processed_.push_back(unprocessed_[0]);
		Weight wSoFar = unprocessed_[0].weight();
		Weight wLimit = processedWeight_ * integratedQ(1.0);

		auto end = unprocessed_.end();
		for (auto iter = unprocessed_.cbegin() + 1; iter < end; iter++) {
			auto &centroid = *iter;
			Weight projectedW = wSoFar + centroid.weight();
			if (projectedW <= wLimit) {
				wSoFar = projectedW;
				(processed_.end() - 1)->add(centroid);
			} else {
				auto k1 = integratedLocation(wSoFar / processedWeight_);
				wLimit = processedWeight_ * integratedQ(k1 + 1.0);
				wSoFar += centroid.weight();
				processed_.emplace_back(centroid);
			}
		}
		unprocessed_.clear();
		min_ = std::min(min_, processed_[0].mean());
		max_ = std::max(max_, (processed_.cend() - 1)->mean());
		updateCumulative();
	}

	inline int checkWeights() {
		return checkWeights(processed_, processedWeight_);
	}

	size_t checkWeights(const std::vector<Centroid> &sorted, Value total) {
		size_t badWeight = 0;
		auto k1 = 0.0;
		auto q = 0.0;
		for (auto iter = sorted.cbegin(); iter != sorted.cend(); iter++) {
			auto w = iter->weight();
			auto dq = w / total;
			auto k2 = integratedLocation(q + dq);
			if (k2 - k1 > 1 && w != 1) {
				badWeight++;
			}
			if (k2 - k1 > 1.5 && w != 1) {
				badWeight++;
			}
			q += dq;
			k1 = k2;
		}

		return badWeight;
	}

	/**
	 * Converts a quantile into a centroid scale value. The centroid scale is nominally
	 * the number k of the centroid that a quantile point q should belong to. Due to
	 * round-offs, however, we can't align things perfectly without splitting points
	 * and centroids. We don't want to do that, so we have to allow for offsets.
	 * In the end, the criterion is that any quantile range that spans a centroid
	 * scale range more than one should be split across more than one centroid if
	 * possible. This won't be possible if the quantile range refers to a single point
	 * or an already existing centroid.
	 * <p/>
	 * This mapping is steep near q=0 or q=1 so each centroid there will correspond to
	 * less q range. Near q=0.5, the mapping is flatter so that centroids there will
	 * represent a larger chunk of quantiles.
	 *
	 * @param q The quantile scale value to be mapped.
	 * @return The centroid scale value corresponding to q.
	 */
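	// For example, with compression_ == 100: q = 0 maps to k = 0, q = 0.5 to k = 50, and q = 1 to k = 100.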
	inline Value integratedLocation(Value q) const {
		return compression_ * (std::asin(2.0 * q - 1.0) + pi / 2) / pi;
	}

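	/**
	 * Inverse mapping of integratedLocation: converts a centroid scale value k back into
	 * the quantile q it corresponds to (values of k beyond the compression are clamped),
	 * so integratedQ(integratedLocation(q)) recovers q, up to rounding, for q in [0, 1].
	 */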
	inline Value integratedQ(Value k) const {
		return (std::sin(std::min(k, compression_) * pi / compression_ - pi / 2) + 1) / 2;
	}

	/**
	 * Same as {@link #weightedAverageSorted(Value, Value, Value, Value)} but flips
	 * the order of the variables if <code>x1</code> is greater than
	 * <code>x2</code>.
	 */
	static Value weightedAverage(Value x1, Value w1, Value x2, Value w2) {
		return (x1 <= x2) ? weightedAverageSorted(x1, w1, x2, w2) : weightedAverageSorted(x2, w2, x1, w1);
	}

	/**
	 * Compute the weighted average between <code>x1</code> with a weight of
	 * <code>w1</code> and <code>x2</code> with a weight of <code>w2</code>.
	 * This expects <code>x1</code> to be less than or equal to <code>x2</code>
	 * and is guaranteed to return a number between <code>x1</code> and
	 * <code>x2</code>.
	 */
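	// e.g. weightedAverageSorted(1.0, 3.0, 5.0, 1.0) == (1.0 * 3.0 + 5.0 * 1.0) / 4.0 == 2.0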
	static Value weightedAverageSorted(Value x1, Value w1, Value x2, Value w2) {
		const Value x = (x1 * w1 + x2 * w2) / (w1 + w2);
		return std::max(x1, std::min(x, x2));
	}

	static Value interpolate(Value x, Value x0, Value x1) {
		return (x - x0) / (x1 - x0);
	}

	/**
	 * Computes an interpolated value of a quantile that is between two centroids.
	 *
	 * Index is the quantile desired multiplied by the total number of samples - 1.
	 *
	 * @param index Denormalized quantile desired
	 * @param previousIndex The denormalized quantile corresponding to the center of the previous centroid.
	 * @param nextIndex The denormalized quantile corresponding to the center of the following centroid.
	 * @param previousMean The mean of the previous centroid.
	 * @param nextMean The mean of the following centroid.
	 * @return The interpolated mean.
	 */
	static Value quantile(Value index, Value previousIndex, Value nextIndex, Value previousMean, Value nextMean) {
		const auto delta = nextIndex - previousIndex;
		const auto previousWeight = (nextIndex - index) / delta;
		const auto nextWeight = (index - previousIndex) / delta;
		return previousMean * previousWeight + nextMean * nextWeight;
	}
};
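
// Example usage (illustrative sketch; `samples` stands for caller-provided data):
//
//   duckdb_tdigest::TDigest td(100.0);       // compression parameter
//   for (double v : samples) {
//       td.add(v);                           // buffer values as unprocessed centroids
//   }
//   td.compress();                           // fold unprocessed centroids into the digest
//   double median = td.quantile(0.5);
//   double frac_below_ten = td.cdf(10.0);
//
//   duckdb_tdigest::TDigest other(100.0);    // a digest built on another data partition
//   other.add(42.0);
//   td.merge(&other);                        // combine digests before querying again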

} // namespace duckdb_tdigest