1 | /* |
2 | * Copyright 2013-present Facebook, Inc. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | |
#include <folly/stats/TDigest.h>

#include <algorithm>
#include <array>
#include <chrono>
#include <cmath>
#include <random>

#include <folly/portability/GTest.h>
23 | |
24 | using namespace folly; |
25 | |
26 | /* |
27 | * Tests run at a reasonable speed with these settings, but it is good to |
28 | * occasionally test with kNumRandomRuns = 3000 and kNumSamples = 50000 |
29 | */ |
// Number of values drawn per run of the randomized distribution tests.
constexpr int32_t kNumSamples = 3000;
// Number of independent randomized runs used to estimate the error stddev.
constexpr int32_t kNumRandomRuns = 10;
// Fixed seed so the randomized tests are deterministic between runs.
constexpr int32_t kSeed = 0;
33 | |
34 | TEST(TDigest, Basic) { |
35 | TDigest digest(100); |
36 | |
37 | std::vector<double> values; |
38 | for (int i = 1; i <= 100; ++i) { |
39 | values.push_back(i); |
40 | } |
41 | |
42 | digest = digest.merge(values); |
43 | |
44 | EXPECT_EQ(100, digest.count()); |
45 | EXPECT_EQ(5050, digest.sum()); |
46 | EXPECT_EQ(50.5, digest.mean()); |
47 | EXPECT_EQ(1, digest.min()); |
48 | EXPECT_EQ(100, digest.max()); |
49 | |
50 | EXPECT_EQ(1, digest.estimateQuantile(0.001)); |
51 | EXPECT_EQ(2.0 - 0.5, digest.estimateQuantile(0.01)); |
52 | EXPECT_EQ(50.375, digest.estimateQuantile(0.5)); |
53 | EXPECT_EQ(100.0 - 0.5, digest.estimateQuantile(0.99)); |
54 | EXPECT_EQ(100, digest.estimateQuantile(0.999)); |
55 | } |
56 | |
57 | TEST(TDigest, Merge) { |
58 | TDigest digest(100); |
59 | |
60 | std::vector<double> values; |
61 | for (int i = 1; i <= 100; ++i) { |
62 | values.push_back(i); |
63 | } |
64 | digest = digest.merge(values); |
65 | |
66 | values.clear(); |
67 | for (int i = 101; i <= 200; ++i) { |
68 | values.push_back(i); |
69 | } |
70 | digest = digest.merge(values); |
71 | |
72 | EXPECT_EQ(200, digest.count()); |
73 | EXPECT_EQ(20100, digest.sum()); |
74 | EXPECT_EQ(100.5, digest.mean()); |
75 | EXPECT_EQ(1, digest.min()); |
76 | EXPECT_EQ(200, digest.max()); |
77 | |
78 | EXPECT_EQ(1, digest.estimateQuantile(0.001)); |
79 | EXPECT_EQ(4.0 - 1.5, digest.estimateQuantile(0.01)); |
80 | EXPECT_EQ(100.25, digest.estimateQuantile(0.5)); |
81 | EXPECT_EQ(200.0 - 1.5, digest.estimateQuantile(0.99)); |
82 | EXPECT_EQ(200, digest.estimateQuantile(0.999)); |
83 | } |
84 | |
85 | TEST(TDigest, MergeSmall) { |
86 | TDigest digest(100); |
87 | |
88 | std::vector<double> values; |
89 | values.push_back(1); |
90 | |
91 | digest = digest.merge(values); |
92 | |
93 | EXPECT_EQ(1, digest.count()); |
94 | EXPECT_EQ(1, digest.sum()); |
95 | EXPECT_EQ(1, digest.mean()); |
96 | EXPECT_EQ(1, digest.min()); |
97 | EXPECT_EQ(1, digest.max()); |
98 | |
99 | EXPECT_EQ(1.0, digest.estimateQuantile(0.001)); |
100 | EXPECT_EQ(1.0, digest.estimateQuantile(0.01)); |
101 | EXPECT_EQ(1.0, digest.estimateQuantile(0.5)); |
102 | EXPECT_EQ(1.0, digest.estimateQuantile(0.99)); |
103 | EXPECT_EQ(1.0, digest.estimateQuantile(0.999)); |
104 | } |
105 | |
106 | TEST(TDigest, MergeLarge) { |
107 | TDigest digest(100); |
108 | |
109 | std::vector<double> values; |
110 | for (int i = 1; i <= 1000; ++i) { |
111 | values.push_back(i); |
112 | } |
113 | digest = digest.merge(values); |
114 | |
115 | EXPECT_EQ(1000, digest.count()); |
116 | EXPECT_EQ(500500, digest.sum()); |
117 | EXPECT_EQ(500.5, digest.mean()); |
118 | EXPECT_EQ(1, digest.min()); |
119 | EXPECT_EQ(1000, digest.max()); |
120 | |
121 | EXPECT_EQ(1.5, digest.estimateQuantile(0.001)); |
122 | EXPECT_EQ(10.5, digest.estimateQuantile(0.01)); |
123 | EXPECT_EQ(500.25, digest.estimateQuantile(0.5)); |
124 | EXPECT_EQ(990.25, digest.estimateQuantile(0.99)); |
125 | EXPECT_EQ(999.5, digest.estimateQuantile(0.999)); |
126 | } |
127 | |
128 | TEST(TDigest, MergeLargeAsDigests) { |
129 | std::vector<TDigest> digests; |
130 | TDigest digest(100); |
131 | |
132 | std::vector<double> values; |
133 | for (int i = 1; i <= 1000; ++i) { |
134 | values.push_back(i); |
135 | } |
136 | // Ensure that the values do not monotonically increase across digests. |
137 | std::shuffle( |
138 | values.begin(), values.end(), std::mt19937(std::random_device()())); |
139 | for (int i = 0; i < 10; ++i) { |
140 | std::vector<double> unsorted_values( |
141 | values.begin() + (i * 100), values.begin() + (i + 1) * 100); |
142 | digests.push_back(digest.merge(unsorted_values)); |
143 | } |
144 | |
145 | digest = TDigest::merge(digests); |
146 | |
147 | EXPECT_EQ(1000, digest.count()); |
148 | EXPECT_EQ(500500, digest.sum()); |
149 | EXPECT_EQ(500.5, digest.mean()); |
150 | EXPECT_EQ(1, digest.min()); |
151 | EXPECT_EQ(1000, digest.max()); |
152 | |
153 | EXPECT_EQ(1.5, digest.estimateQuantile(0.001)); |
154 | EXPECT_EQ(10.5, digest.estimateQuantile(0.01)); |
155 | EXPECT_EQ(990.25, digest.estimateQuantile(0.99)); |
156 | EXPECT_EQ(999.5, digest.estimateQuantile(0.999)); |
157 | } |
158 | |
159 | TEST(TDigest, NegativeValues) { |
160 | std::vector<TDigest> digests; |
161 | TDigest digest(100); |
162 | |
163 | std::vector<double> values; |
164 | for (int i = 1; i <= 100; ++i) { |
165 | values.push_back(i); |
166 | values.push_back(-i); |
167 | } |
168 | |
169 | digest = digest.merge(values); |
170 | |
171 | EXPECT_EQ(200, digest.count()); |
172 | EXPECT_EQ(0, digest.sum()); |
173 | EXPECT_EQ(0, digest.mean()); |
174 | EXPECT_EQ(-100, digest.min()); |
175 | EXPECT_EQ(100, digest.max()); |
176 | |
177 | EXPECT_EQ(-100, digest.estimateQuantile(0.0)); |
178 | EXPECT_EQ(-100, digest.estimateQuantile(0.001)); |
179 | EXPECT_EQ(-98.5, digest.estimateQuantile(0.01)); |
180 | EXPECT_EQ(98.5, digest.estimateQuantile(0.99)); |
181 | EXPECT_EQ(100, digest.estimateQuantile(0.999)); |
182 | EXPECT_EQ(100, digest.estimateQuantile(1.0)); |
183 | } |
184 | |
185 | TEST(TDigest, NegativeValuesMergeDigests) { |
186 | std::vector<TDigest> digests; |
187 | TDigest digest(100); |
188 | |
189 | std::vector<double> values; |
190 | std::vector<double> negativeValues; |
191 | for (int i = 1; i <= 100; ++i) { |
192 | values.push_back(i); |
193 | negativeValues.push_back(-i); |
194 | } |
195 | |
196 | auto digest1 = digest.merge(values); |
197 | auto digest2 = digest.merge(negativeValues); |
198 | |
199 | std::array<TDigest, 2> a{{digest1, digest2}}; |
200 | |
201 | digest = TDigest::merge(a); |
202 | |
203 | EXPECT_EQ(200, digest.count()); |
204 | EXPECT_EQ(0, digest.sum()); |
205 | EXPECT_EQ(0, digest.mean()); |
206 | EXPECT_EQ(-100, digest.min()); |
207 | EXPECT_EQ(100, digest.max()); |
208 | |
209 | EXPECT_EQ(-100, digest.estimateQuantile(0.0)); |
210 | EXPECT_EQ(-100, digest.estimateQuantile(0.001)); |
211 | EXPECT_EQ(-98.5, digest.estimateQuantile(0.01)); |
212 | EXPECT_EQ(98.5, digest.estimateQuantile(0.99)); |
213 | EXPECT_EQ(100, digest.estimateQuantile(0.999)); |
214 | EXPECT_EQ(100, digest.estimateQuantile(1.0)); |
215 | } |
216 | |
217 | TEST(TDigest, ConstructFromCentroids) { |
218 | std::vector<TDigest::Centroid> centroids{}; |
219 | |
220 | TDigest digest(100); |
221 | std::vector<double> values; |
222 | for (int i = 1; i <= 100; ++i) { |
223 | values.push_back(i); |
224 | } |
225 | auto digest1 = digest.merge(values); |
226 | |
227 | size_t centroid_count = digest1.getCentroids().size(); |
228 | |
229 | TDigest digest2( |
230 | digest1.getCentroids(), |
231 | digest1.sum(), |
232 | digest1.count(), |
233 | digest1.max(), |
234 | digest1.min(), |
235 | 100); |
236 | |
237 | EXPECT_EQ(digest1.sum(), digest2.sum()); |
238 | EXPECT_EQ(digest1.count(), digest2.count()); |
239 | EXPECT_EQ(digest1.min(), digest2.min()); |
240 | EXPECT_EQ(digest1.max(), digest2.max()); |
241 | EXPECT_EQ(digest1.getCentroids().size(), digest2.getCentroids().size()); |
242 | |
243 | TDigest digest3( |
244 | digest1.getCentroids(), |
245 | digest1.sum(), |
246 | digest1.count(), |
247 | digest1.max(), |
248 | digest1.min(), |
249 | centroid_count - 1); |
250 | EXPECT_EQ(digest1.sum(), digest3.sum()); |
251 | EXPECT_EQ(digest1.count(), digest3.count()); |
252 | EXPECT_EQ(digest1.min(), digest3.min()); |
253 | EXPECT_EQ(digest1.max(), digest3.max()); |
254 | EXPECT_NE(digest1.getCentroids().size(), digest3.getCentroids().size()); |
255 | } |
256 | |
257 | TEST(TDigest, LargeOutlierTest) { |
258 | folly::TDigest digest(100); |
259 | |
260 | std::vector<double> values; |
261 | for (double i = 0; i < 19; ++i) { |
262 | values.push_back(i); |
263 | } |
264 | values.push_back(1000000); |
265 | |
266 | std::sort(values.begin(), values.end()); |
267 | digest = digest.merge(values); |
268 | EXPECT_LT( |
269 | (int64_t)digest.estimateQuantile(0.5), |
270 | (int64_t)digest.estimateQuantile(0.90)); |
271 | } |
272 | |
273 | TEST(TDigest, FloatingPointSortedTest) { |
274 | // When combining centroids, floating point accuracy can lead to us building |
275 | // and unsorted digest if we are not careful. This tests that we are properly |
276 | // sorting the digest. |
277 | double val = 1.4; |
278 | TDigest digest1(100); |
279 | std::vector<double> values1; |
280 | for (int i = 1; i <= 100; ++i) { |
281 | values1.push_back(val); |
282 | } |
283 | digest1 = digest1.merge(values1); |
284 | |
285 | TDigest digest2(100); |
286 | std::vector<double> values2; |
287 | for (int i = 1; i <= 100; ++i) { |
288 | values2.push_back(val); |
289 | } |
290 | digest2 = digest2.merge(values2); |
291 | |
292 | std::array<TDigest, 2> a{{digest1, digest2}}; |
293 | auto mergeDigest1 = TDigest::merge(a); |
294 | |
295 | TDigest digest3(100); |
296 | std::vector<double> values3; |
297 | for (int i = 1; i <= 100; ++i) { |
298 | values3.push_back(val); |
299 | } |
300 | digest3 = digest2.merge(values3); |
301 | std::array<TDigest, 2> b{{digest3, mergeDigest1}}; |
302 | auto mergeDigest2 = TDigest::merge(b); |
303 | |
304 | auto centroids = mergeDigest2.getCentroids(); |
305 | EXPECT_EQ(std::is_sorted(centroids.begin(), centroids.end()), true); |
306 | } |
307 | |
// Parameterized over:
//  - a (logarithmic?, number of modes) pair selecting the sample
//    distribution,
//  - the quantile to estimate, and
//  - whether to accumulate into one digest (false) or merge per-chunk
//    digests at the end (true).
class DistributionTest
    : public ::testing::TestWithParam<
          std::tuple<std::pair<bool, size_t>, double, bool>> {};
311 | |
312 | TEST_P(DistributionTest, ReasonableError) { |
313 | std::pair<bool, size_t> underlyingDistribution; |
314 | bool logarithmic; |
315 | size_t modes; |
316 | double quantile; |
317 | double reasonableError = 0; |
318 | bool digestMerge; |
319 | |
320 | std::tie(underlyingDistribution, quantile, digestMerge) = GetParam(); |
321 | |
322 | std::tie(logarithmic, modes) = underlyingDistribution; |
323 | if (quantile == 0.001 || quantile == 0.999) { |
324 | reasonableError = 0.001; |
325 | } else if (quantile == 0.01 || quantile == 0.99) { |
326 | reasonableError = 0.01; |
327 | } else if (quantile == 0.25 || quantile == 0.5 || quantile == 0.75) { |
328 | reasonableError = 0.04; |
329 | } |
330 | |
331 | std::vector<double> errors; |
332 | |
333 | std::default_random_engine generator; |
334 | generator.seed(kSeed); |
335 | for (size_t iter = 0; iter < kNumRandomRuns; ++iter) { |
336 | TDigest digest(100); |
337 | |
338 | std::vector<double> values; |
339 | |
340 | if (logarithmic) { |
341 | std::lognormal_distribution<double> distribution(0.0, 1.0); |
342 | |
343 | for (size_t i = 0; i < kNumSamples; ++i) { |
344 | auto mode = (int)distribution(generator) % modes; |
345 | values.push_back(distribution(generator) + 100.0 * mode); |
346 | } |
347 | } else { |
348 | std::uniform_int_distribution<int> distributionPicker(0, modes - 1); |
349 | |
350 | std::vector<std::normal_distribution<double>> distributions; |
351 | for (size_t i = 0; i < modes; ++i) { |
352 | distributions.emplace_back(100.0 * (i + 1), 25); |
353 | } |
354 | |
355 | for (size_t i = 0; i < kNumSamples; ++i) { |
356 | auto distributionIdx = distributionPicker(generator); |
357 | values.push_back(distributions[distributionIdx](generator)); |
358 | } |
359 | } |
360 | |
361 | std::vector<TDigest> digests; |
362 | for (size_t i = 0; i < kNumSamples / 1000; ++i) { |
363 | folly::Range<const double*> r(values, i * 1000, 1000); |
364 | if (digestMerge) { |
365 | digests.push_back(digest.merge(r)); |
366 | } else { |
367 | digest = digest.merge(r); |
368 | } |
369 | } |
370 | |
371 | std::sort(values.begin(), values.end()); |
372 | |
373 | if (digestMerge) { |
374 | digest = TDigest::merge(digests); |
375 | } |
376 | |
377 | double est = digest.estimateQuantile(quantile); |
378 | auto it = std::lower_bound(values.begin(), values.end(), est); |
379 | int32_t actualRank = std::distance(values.begin(), it); |
380 | double actualQuantile = ((double)actualRank) / kNumSamples; |
381 | errors.push_back(actualQuantile - quantile); |
382 | } |
383 | |
384 | double sum = 0.0; |
385 | |
386 | for (auto error : errors) { |
387 | sum += error; |
388 | } |
389 | |
390 | double mean = sum / kNumRandomRuns; |
391 | |
392 | double numerator = 0.0; |
393 | for (auto error : errors) { |
394 | numerator += pow(error - mean, 2); |
395 | } |
396 | |
397 | double stddev = std::sqrt(numerator / (kNumRandomRuns - 1)); |
398 | |
399 | EXPECT_GE(reasonableError, stddev); |
400 | } |
401 | |
// Cross product of {lognormal, 3-mode lognormal, normal, 10-mode normal}
// x {tail and interior quantiles} x {value-merge, digest-merge}.
// NOTE(review): INSTANTIATE_TEST_CASE_P is deprecated in newer gtest in
// favor of INSTANTIATE_TEST_SUITE_P — confirm the pinned gtest version
// before migrating.
INSTANTIATE_TEST_CASE_P(
    ReasonableErrors,
    DistributionTest,
    ::testing::Combine(
        ::testing::Values(
            std::make_pair(true, 1),
            std::make_pair(true, 3),
            std::make_pair(false, 1),
            std::make_pair(false, 10)),
        ::testing::Values(0.001, 0.01, 0.25, 0.50, 0.75, 0.99, 0.999),
        ::testing::Bool()));
413 | |