1/*
2 * Copyright 2013-present Facebook, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
#include <folly/stats/TDigest.h>

#include <algorithm>
#include <array>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <random>
#include <tuple>
#include <utility>
#include <vector>

#include <folly/portability/GTest.h>
23
24using namespace folly;
25
26/*
27 * Tests run at a reasonable speed with these settings, but it is good to
28 * occasionally test with kNumRandomRuns = 3000 and kNumSamples = 50000
29 */
// Number of samples drawn per randomized run (see DistributionTest).
constexpr int32_t kNumSamples = 3000;
// Number of randomized runs used to estimate the quantile-error stddev.
constexpr int32_t kNumRandomRuns = 10;
// Fixed PRNG seed so the randomized tests are deterministic.
constexpr int32_t kSeed = 0;
33
34TEST(TDigest, Basic) {
35 TDigest digest(100);
36
37 std::vector<double> values;
38 for (int i = 1; i <= 100; ++i) {
39 values.push_back(i);
40 }
41
42 digest = digest.merge(values);
43
44 EXPECT_EQ(100, digest.count());
45 EXPECT_EQ(5050, digest.sum());
46 EXPECT_EQ(50.5, digest.mean());
47 EXPECT_EQ(1, digest.min());
48 EXPECT_EQ(100, digest.max());
49
50 EXPECT_EQ(1, digest.estimateQuantile(0.001));
51 EXPECT_EQ(2.0 - 0.5, digest.estimateQuantile(0.01));
52 EXPECT_EQ(50.375, digest.estimateQuantile(0.5));
53 EXPECT_EQ(100.0 - 0.5, digest.estimateQuantile(0.99));
54 EXPECT_EQ(100, digest.estimateQuantile(0.999));
55}
56
57TEST(TDigest, Merge) {
58 TDigest digest(100);
59
60 std::vector<double> values;
61 for (int i = 1; i <= 100; ++i) {
62 values.push_back(i);
63 }
64 digest = digest.merge(values);
65
66 values.clear();
67 for (int i = 101; i <= 200; ++i) {
68 values.push_back(i);
69 }
70 digest = digest.merge(values);
71
72 EXPECT_EQ(200, digest.count());
73 EXPECT_EQ(20100, digest.sum());
74 EXPECT_EQ(100.5, digest.mean());
75 EXPECT_EQ(1, digest.min());
76 EXPECT_EQ(200, digest.max());
77
78 EXPECT_EQ(1, digest.estimateQuantile(0.001));
79 EXPECT_EQ(4.0 - 1.5, digest.estimateQuantile(0.01));
80 EXPECT_EQ(100.25, digest.estimateQuantile(0.5));
81 EXPECT_EQ(200.0 - 1.5, digest.estimateQuantile(0.99));
82 EXPECT_EQ(200, digest.estimateQuantile(0.999));
83}
84
85TEST(TDigest, MergeSmall) {
86 TDigest digest(100);
87
88 std::vector<double> values;
89 values.push_back(1);
90
91 digest = digest.merge(values);
92
93 EXPECT_EQ(1, digest.count());
94 EXPECT_EQ(1, digest.sum());
95 EXPECT_EQ(1, digest.mean());
96 EXPECT_EQ(1, digest.min());
97 EXPECT_EQ(1, digest.max());
98
99 EXPECT_EQ(1.0, digest.estimateQuantile(0.001));
100 EXPECT_EQ(1.0, digest.estimateQuantile(0.01));
101 EXPECT_EQ(1.0, digest.estimateQuantile(0.5));
102 EXPECT_EQ(1.0, digest.estimateQuantile(0.99));
103 EXPECT_EQ(1.0, digest.estimateQuantile(0.999));
104}
105
106TEST(TDigest, MergeLarge) {
107 TDigest digest(100);
108
109 std::vector<double> values;
110 for (int i = 1; i <= 1000; ++i) {
111 values.push_back(i);
112 }
113 digest = digest.merge(values);
114
115 EXPECT_EQ(1000, digest.count());
116 EXPECT_EQ(500500, digest.sum());
117 EXPECT_EQ(500.5, digest.mean());
118 EXPECT_EQ(1, digest.min());
119 EXPECT_EQ(1000, digest.max());
120
121 EXPECT_EQ(1.5, digest.estimateQuantile(0.001));
122 EXPECT_EQ(10.5, digest.estimateQuantile(0.01));
123 EXPECT_EQ(500.25, digest.estimateQuantile(0.5));
124 EXPECT_EQ(990.25, digest.estimateQuantile(0.99));
125 EXPECT_EQ(999.5, digest.estimateQuantile(0.999));
126}
127
128TEST(TDigest, MergeLargeAsDigests) {
129 std::vector<TDigest> digests;
130 TDigest digest(100);
131
132 std::vector<double> values;
133 for (int i = 1; i <= 1000; ++i) {
134 values.push_back(i);
135 }
136 // Ensure that the values do not monotonically increase across digests.
137 std::shuffle(
138 values.begin(), values.end(), std::mt19937(std::random_device()()));
139 for (int i = 0; i < 10; ++i) {
140 std::vector<double> unsorted_values(
141 values.begin() + (i * 100), values.begin() + (i + 1) * 100);
142 digests.push_back(digest.merge(unsorted_values));
143 }
144
145 digest = TDigest::merge(digests);
146
147 EXPECT_EQ(1000, digest.count());
148 EXPECT_EQ(500500, digest.sum());
149 EXPECT_EQ(500.5, digest.mean());
150 EXPECT_EQ(1, digest.min());
151 EXPECT_EQ(1000, digest.max());
152
153 EXPECT_EQ(1.5, digest.estimateQuantile(0.001));
154 EXPECT_EQ(10.5, digest.estimateQuantile(0.01));
155 EXPECT_EQ(990.25, digest.estimateQuantile(0.99));
156 EXPECT_EQ(999.5, digest.estimateQuantile(0.999));
157}
158
159TEST(TDigest, NegativeValues) {
160 std::vector<TDigest> digests;
161 TDigest digest(100);
162
163 std::vector<double> values;
164 for (int i = 1; i <= 100; ++i) {
165 values.push_back(i);
166 values.push_back(-i);
167 }
168
169 digest = digest.merge(values);
170
171 EXPECT_EQ(200, digest.count());
172 EXPECT_EQ(0, digest.sum());
173 EXPECT_EQ(0, digest.mean());
174 EXPECT_EQ(-100, digest.min());
175 EXPECT_EQ(100, digest.max());
176
177 EXPECT_EQ(-100, digest.estimateQuantile(0.0));
178 EXPECT_EQ(-100, digest.estimateQuantile(0.001));
179 EXPECT_EQ(-98.5, digest.estimateQuantile(0.01));
180 EXPECT_EQ(98.5, digest.estimateQuantile(0.99));
181 EXPECT_EQ(100, digest.estimateQuantile(0.999));
182 EXPECT_EQ(100, digest.estimateQuantile(1.0));
183}
184
185TEST(TDigest, NegativeValuesMergeDigests) {
186 std::vector<TDigest> digests;
187 TDigest digest(100);
188
189 std::vector<double> values;
190 std::vector<double> negativeValues;
191 for (int i = 1; i <= 100; ++i) {
192 values.push_back(i);
193 negativeValues.push_back(-i);
194 }
195
196 auto digest1 = digest.merge(values);
197 auto digest2 = digest.merge(negativeValues);
198
199 std::array<TDigest, 2> a{{digest1, digest2}};
200
201 digest = TDigest::merge(a);
202
203 EXPECT_EQ(200, digest.count());
204 EXPECT_EQ(0, digest.sum());
205 EXPECT_EQ(0, digest.mean());
206 EXPECT_EQ(-100, digest.min());
207 EXPECT_EQ(100, digest.max());
208
209 EXPECT_EQ(-100, digest.estimateQuantile(0.0));
210 EXPECT_EQ(-100, digest.estimateQuantile(0.001));
211 EXPECT_EQ(-98.5, digest.estimateQuantile(0.01));
212 EXPECT_EQ(98.5, digest.estimateQuantile(0.99));
213 EXPECT_EQ(100, digest.estimateQuantile(0.999));
214 EXPECT_EQ(100, digest.estimateQuantile(1.0));
215}
216
217TEST(TDigest, ConstructFromCentroids) {
218 std::vector<TDigest::Centroid> centroids{};
219
220 TDigest digest(100);
221 std::vector<double> values;
222 for (int i = 1; i <= 100; ++i) {
223 values.push_back(i);
224 }
225 auto digest1 = digest.merge(values);
226
227 size_t centroid_count = digest1.getCentroids().size();
228
229 TDigest digest2(
230 digest1.getCentroids(),
231 digest1.sum(),
232 digest1.count(),
233 digest1.max(),
234 digest1.min(),
235 100);
236
237 EXPECT_EQ(digest1.sum(), digest2.sum());
238 EXPECT_EQ(digest1.count(), digest2.count());
239 EXPECT_EQ(digest1.min(), digest2.min());
240 EXPECT_EQ(digest1.max(), digest2.max());
241 EXPECT_EQ(digest1.getCentroids().size(), digest2.getCentroids().size());
242
243 TDigest digest3(
244 digest1.getCentroids(),
245 digest1.sum(),
246 digest1.count(),
247 digest1.max(),
248 digest1.min(),
249 centroid_count - 1);
250 EXPECT_EQ(digest1.sum(), digest3.sum());
251 EXPECT_EQ(digest1.count(), digest3.count());
252 EXPECT_EQ(digest1.min(), digest3.min());
253 EXPECT_EQ(digest1.max(), digest3.max());
254 EXPECT_NE(digest1.getCentroids().size(), digest3.getCentroids().size());
255}
256
257TEST(TDigest, LargeOutlierTest) {
258 folly::TDigest digest(100);
259
260 std::vector<double> values;
261 for (double i = 0; i < 19; ++i) {
262 values.push_back(i);
263 }
264 values.push_back(1000000);
265
266 std::sort(values.begin(), values.end());
267 digest = digest.merge(values);
268 EXPECT_LT(
269 (int64_t)digest.estimateQuantile(0.5),
270 (int64_t)digest.estimateQuantile(0.90));
271}
272
273TEST(TDigest, FloatingPointSortedTest) {
274 // When combining centroids, floating point accuracy can lead to us building
275 // and unsorted digest if we are not careful. This tests that we are properly
276 // sorting the digest.
277 double val = 1.4;
278 TDigest digest1(100);
279 std::vector<double> values1;
280 for (int i = 1; i <= 100; ++i) {
281 values1.push_back(val);
282 }
283 digest1 = digest1.merge(values1);
284
285 TDigest digest2(100);
286 std::vector<double> values2;
287 for (int i = 1; i <= 100; ++i) {
288 values2.push_back(val);
289 }
290 digest2 = digest2.merge(values2);
291
292 std::array<TDigest, 2> a{{digest1, digest2}};
293 auto mergeDigest1 = TDigest::merge(a);
294
295 TDigest digest3(100);
296 std::vector<double> values3;
297 for (int i = 1; i <= 100; ++i) {
298 values3.push_back(val);
299 }
300 digest3 = digest2.merge(values3);
301 std::array<TDigest, 2> b{{digest3, mergeDigest1}};
302 auto mergeDigest2 = TDigest::merge(b);
303
304 auto centroids = mergeDigest2.getCentroids();
305 EXPECT_EQ(std::is_sorted(centroids.begin(), centroids.end()), true);
306}
307
// Parameterized fixture for ReasonableError. The tuple holds:
//  - pair<bool, size_t>: (use lognormal distribution?, number of modes)
//  - double: the quantile to estimate
//  - bool: merge via a vector of digests (true) or incrementally (false)
class DistributionTest
    : public ::testing::TestWithParam<
          std::tuple<std::pair<bool, size_t>, double, bool>> {};
311
312TEST_P(DistributionTest, ReasonableError) {
313 std::pair<bool, size_t> underlyingDistribution;
314 bool logarithmic;
315 size_t modes;
316 double quantile;
317 double reasonableError = 0;
318 bool digestMerge;
319
320 std::tie(underlyingDistribution, quantile, digestMerge) = GetParam();
321
322 std::tie(logarithmic, modes) = underlyingDistribution;
323 if (quantile == 0.001 || quantile == 0.999) {
324 reasonableError = 0.001;
325 } else if (quantile == 0.01 || quantile == 0.99) {
326 reasonableError = 0.01;
327 } else if (quantile == 0.25 || quantile == 0.5 || quantile == 0.75) {
328 reasonableError = 0.04;
329 }
330
331 std::vector<double> errors;
332
333 std::default_random_engine generator;
334 generator.seed(kSeed);
335 for (size_t iter = 0; iter < kNumRandomRuns; ++iter) {
336 TDigest digest(100);
337
338 std::vector<double> values;
339
340 if (logarithmic) {
341 std::lognormal_distribution<double> distribution(0.0, 1.0);
342
343 for (size_t i = 0; i < kNumSamples; ++i) {
344 auto mode = (int)distribution(generator) % modes;
345 values.push_back(distribution(generator) + 100.0 * mode);
346 }
347 } else {
348 std::uniform_int_distribution<int> distributionPicker(0, modes - 1);
349
350 std::vector<std::normal_distribution<double>> distributions;
351 for (size_t i = 0; i < modes; ++i) {
352 distributions.emplace_back(100.0 * (i + 1), 25);
353 }
354
355 for (size_t i = 0; i < kNumSamples; ++i) {
356 auto distributionIdx = distributionPicker(generator);
357 values.push_back(distributions[distributionIdx](generator));
358 }
359 }
360
361 std::vector<TDigest> digests;
362 for (size_t i = 0; i < kNumSamples / 1000; ++i) {
363 folly::Range<const double*> r(values, i * 1000, 1000);
364 if (digestMerge) {
365 digests.push_back(digest.merge(r));
366 } else {
367 digest = digest.merge(r);
368 }
369 }
370
371 std::sort(values.begin(), values.end());
372
373 if (digestMerge) {
374 digest = TDigest::merge(digests);
375 }
376
377 double est = digest.estimateQuantile(quantile);
378 auto it = std::lower_bound(values.begin(), values.end(), est);
379 int32_t actualRank = std::distance(values.begin(), it);
380 double actualQuantile = ((double)actualRank) / kNumSamples;
381 errors.push_back(actualQuantile - quantile);
382 }
383
384 double sum = 0.0;
385
386 for (auto error : errors) {
387 sum += error;
388 }
389
390 double mean = sum / kNumRandomRuns;
391
392 double numerator = 0.0;
393 for (auto error : errors) {
394 numerator += pow(error - mean, 2);
395 }
396
397 double stddev = std::sqrt(numerator / (kNumRandomRuns - 1));
398
399 EXPECT_GE(reasonableError, stddev);
400}
401
// Instantiate ReasonableError over the cross product of:
//  - distribution shape: (logarithmic?, number of modes) pairs
//  - target quantile
//  - merge strategy (digest-of-digests vs incremental)
// NOTE(review): INSTANTIATE_TEST_CASE_P is deprecated in newer googletest
// (renamed INSTANTIATE_TEST_SUITE_P); kept as-is for compatibility with the
// gtest version pulled in via folly/portability/GTest.h — confirm before
// migrating.
INSTANTIATE_TEST_CASE_P(
    ReasonableErrors,
    DistributionTest,
    ::testing::Combine(
        ::testing::Values(
            std::make_pair(true, 1),
            std::make_pair(true, 3),
            std::make_pair(false, 1),
            std::make_pair(false, 10)),
        ::testing::Values(0.001, 0.01, 0.25, 0.50, 0.75, 0.99, 0.999),
        ::testing::Bool()));
413