1/*
2 * Copyright 2012-present Facebook, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#pragma once
18
19#include <cmath>
20#include <vector>
21
22#include <folly/Range.h>
23#include <folly/Utility.h>
24
25namespace folly {
26
27/*
28 * TDigests are a biased quantile estimator designed to estimate the values of
29 * the quantiles of streaming data with high accuracy and low memory,
30 * particularly for quantiles at the tails (p0.1, p1, p99, p99.9). See
31 * https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf
32 * for an explanation of what the purpose of TDigests is, and how they work.
33 *
34 * There is a notable difference between the implementation here and the
35 * implementation in the paper. In the paper, the recommended scaling function
36 * for bucketing centroids is an arcsin function. The arcsin function provides
37 * high accuracy for low memory, but comes at a relatively high compute cost.
38 * A good choice algorithm has the following properties:
39 * - The value of the function k(0, delta) = 0, and k(1, delta) = delta.
40 * This is a requirement for any t-digest function.
41 * - The limit of the derivative of the function dk/dq at 0 is inf, and at
42 * 1 is inf. This provides bias to improve accuracy at the tails.
43 * - For any q <= 0.5, dk/dq(q) = dk/dq(1-q). This ensures that the accuracy
44 * of upper and lower quantiles are equivalent.
45 * As such, TDigest uses a sqrt function with these properties, which is faster
46 * than arcsin. There is a small, but relatively negligible impact to accuracy
47 * at the tail. In empirical tests, accuracy of the sqrt approach has been
48 * adequate.
49 */
50class TDigest {
51 public:
52 class Centroid {
53 public:
54 explicit Centroid(double mean = 0.0, double weight = 1.0)
55 : mean_(mean), weight_(weight) {
56 DCHECK_GT(weight, 0);
57 }
58
59 inline double mean() const {
60 return mean_;
61 }
62
63 inline double weight() const {
64 return weight_;
65 }
66
67 /*
68 * Adds the sum/weight to this centroid, and returns the new sum.
69 */
70 inline double add(double sum, double weight);
71
72 inline bool operator<(const Centroid& other) const {
73 return mean() < other.mean();
74 }
75
76 private:
77 double mean_;
78 double weight_;
79 };
80
81 explicit TDigest(size_t maxSize = 100)
82 : maxSize_(maxSize), sum_(0.0), count_(0.0), max_(NAN), min_(NAN) {}
83
84 explicit TDigest(
85 std::vector<Centroid> centroids,
86 double sum,
87 double count,
88 double max_val,
89 double min_val,
90 size_t maxSize = 100);
91
92 /*
93 * Returns a new TDigest constructed with values merged from the current
94 * digest and the given sortedValues.
95 */
96 TDigest merge(presorted_t, Range<const double*> sortedValues) const;
97 TDigest merge(Range<const double*> unsortedValues) const;
98
99 /*
100 * Returns a new TDigest constructed with values merged from the given
101 * digests.
102 */
103 static TDigest merge(Range<const TDigest*> digests);
104
105 /*
106 * Estimates the value of the given quantile.
107 */
108 double estimateQuantile(double q) const;
109
110 double mean() const {
111 return count_ ? sum_ / count_ : 0;
112 }
113
114 double sum() const {
115 return sum_;
116 }
117
118 double count() const {
119 return count_;
120 }
121
122 double min() const {
123 return min_;
124 }
125
126 double max() const {
127 return max_;
128 }
129
130 bool empty() const {
131 return centroids_.empty();
132 }
133
134 const std::vector<Centroid>& getCentroids() const {
135 return centroids_;
136 }
137
138 size_t maxSize() const {
139 return maxSize_;
140 }
141
142 private:
143 std::vector<Centroid> centroids_;
144 size_t maxSize_;
145 double sum_;
146 double count_;
147 double max_;
148 double min_;
149};
150
151} // namespace folly
152