1 | /* |
2 | * Copyright 2012-present Facebook, Inc. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | |
17 | #pragma once |
18 | |
19 | #include <cmath> |
20 | #include <vector> |
21 | |
22 | #include <folly/Range.h> |
23 | #include <folly/Utility.h> |
24 | |
25 | namespace folly { |
26 | |
27 | /* |
28 | * TDigests are a biased quantile estimator designed to estimate the values of |
29 | * the quantiles of streaming data with high accuracy and low memory, |
30 | * particularly for quantiles at the tails (p0.1, p1, p99, p99.9). See |
31 | * https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf |
32 | * for an explanation of what the purpose of TDigests is, and how they work. |
33 | * |
34 | * There is a notable difference between the implementation here and the |
35 | * implementation in the paper. In the paper, the recommended scaling function |
36 | * for bucketing centroids is an arcsin function. The arcsin function provides |
37 | * high accuracy for low memory, but comes at a relatively high compute cost. |
38 | * A good choice algorithm has the following properties: |
39 | * - The value of the function k(0, delta) = 0, and k(1, delta) = delta. |
40 | * This is a requirement for any t-digest function. |
41 | * - The limit of the derivative of the function dk/dq at 0 is inf, and at |
42 | * 1 is inf. This provides bias to improve accuracy at the tails. |
43 | * - For any q <= 0.5, dk/dq(q) = dk/dq(1-q). This ensures that the accuracy |
44 | * of upper and lower quantiles are equivalent. |
45 | * As such, TDigest uses a sqrt function with these properties, which is faster |
46 | * than arcsin. There is a small, but relatively negligible impact to accuracy |
47 | * at the tail. In empirical tests, accuracy of the sqrt approach has been |
48 | * adequate. |
49 | */ |
50 | class TDigest { |
51 | public: |
52 | class Centroid { |
53 | public: |
54 | explicit Centroid(double mean = 0.0, double weight = 1.0) |
55 | : mean_(mean), weight_(weight) { |
56 | DCHECK_GT(weight, 0); |
57 | } |
58 | |
59 | inline double mean() const { |
60 | return mean_; |
61 | } |
62 | |
63 | inline double weight() const { |
64 | return weight_; |
65 | } |
66 | |
67 | /* |
68 | * Adds the sum/weight to this centroid, and returns the new sum. |
69 | */ |
70 | inline double add(double sum, double weight); |
71 | |
72 | inline bool operator<(const Centroid& other) const { |
73 | return mean() < other.mean(); |
74 | } |
75 | |
76 | private: |
77 | double mean_; |
78 | double weight_; |
79 | }; |
80 | |
81 | explicit TDigest(size_t maxSize = 100) |
82 | : maxSize_(maxSize), sum_(0.0), count_(0.0), max_(NAN), min_(NAN) {} |
83 | |
84 | explicit TDigest( |
85 | std::vector<Centroid> centroids, |
86 | double sum, |
87 | double count, |
88 | double max_val, |
89 | double min_val, |
90 | size_t maxSize = 100); |
91 | |
92 | /* |
93 | * Returns a new TDigest constructed with values merged from the current |
94 | * digest and the given sortedValues. |
95 | */ |
96 | TDigest merge(presorted_t, Range<const double*> sortedValues) const; |
97 | TDigest merge(Range<const double*> unsortedValues) const; |
98 | |
99 | /* |
100 | * Returns a new TDigest constructed with values merged from the given |
101 | * digests. |
102 | */ |
103 | static TDigest merge(Range<const TDigest*> digests); |
104 | |
105 | /* |
106 | * Estimates the value of the given quantile. |
107 | */ |
108 | double estimateQuantile(double q) const; |
109 | |
110 | double mean() const { |
111 | return count_ ? sum_ / count_ : 0; |
112 | } |
113 | |
114 | double sum() const { |
115 | return sum_; |
116 | } |
117 | |
118 | double count() const { |
119 | return count_; |
120 | } |
121 | |
122 | double min() const { |
123 | return min_; |
124 | } |
125 | |
126 | double max() const { |
127 | return max_; |
128 | } |
129 | |
130 | bool empty() const { |
131 | return centroids_.empty(); |
132 | } |
133 | |
134 | const std::vector<Centroid>& getCentroids() const { |
135 | return centroids_; |
136 | } |
137 | |
138 | size_t maxSize() const { |
139 | return maxSize_; |
140 | } |
141 | |
142 | private: |
143 | std::vector<Centroid> centroids_; |
144 | size_t maxSize_; |
145 | double sum_; |
146 | double count_; |
147 | double max_; |
148 | double min_; |
149 | }; |
150 | |
151 | } // namespace folly |
152 | |