1 | /* |
2 | * Copyright 2013-present Facebook, Inc. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | |
17 | #pragma once |
18 | |
19 | #include <cmath> |
20 | #include <cstddef> |
21 | #include <cstdint> |
22 | #include <limits> |
23 | #include <ostream> |
24 | #include <stdexcept> |
25 | #include <string> |
26 | #include <type_traits> |
27 | #include <vector> |
28 | |
29 | #include <folly/CPortability.h> |
30 | #include <folly/Traits.h> |
31 | #include <folly/stats/detail/Bucket.h> |
32 | |
33 | namespace folly { |
34 | |
35 | namespace detail { |
36 | |
37 | /* |
38 | * A helper class to manage a set of histogram buckets. |
39 | */ |
40 | template <typename T, typename BucketT> |
41 | class HistogramBuckets { |
42 | public: |
43 | typedef T ValueType; |
44 | typedef BucketT BucketType; |
45 | |
46 | /* |
47 | * Create a set of histogram buckets. |
48 | * |
49 | * One bucket will be created for each bucketSize interval of values within |
50 | * the specified range. Additionally, one bucket will be created to track |
51 | * all values that fall below the specified minimum, and one bucket will be |
52 | * created for all values above the specified maximum. |
53 | * |
54 | * If (max - min) is not a multiple of bucketSize, the last bucket will cover |
55 | * a smaller range of values than the other buckets. |
56 | * |
57 | * (max - min) must be larger than or equal to bucketSize. |
58 | */ |
59 | HistogramBuckets( |
60 | ValueType bucketSize, |
61 | ValueType min, |
62 | ValueType max, |
63 | const BucketType& defaultBucket); |
64 | |
65 | /* Returns the bucket size of each bucket in the histogram. */ |
66 | ValueType getBucketSize() const { |
67 | return bucketSize_; |
68 | } |
69 | |
70 | /* Returns the min value at which bucketing begins. */ |
71 | ValueType getMin() const { |
72 | return min_; |
73 | } |
74 | |
75 | /* Returns the max value at which bucketing ends. */ |
76 | ValueType getMax() const { |
77 | return max_; |
78 | } |
79 | |
80 | /* |
81 | * Returns the number of buckets. |
82 | * |
83 | * This includes the total number of buckets for the [min, max) range, |
84 | * plus 2 extra buckets, one for handling values less than min, and one for |
85 | * values greater than max. |
86 | */ |
87 | size_t getNumBuckets() const { |
88 | return buckets_.size(); |
89 | } |
90 | |
91 | /* Returns the bucket index into which the given value would fall. */ |
92 | size_t getBucketIdx(ValueType value) const; |
93 | |
94 | /* Returns the bucket for the specified value */ |
95 | BucketType& getByValue(ValueType value) { |
96 | return buckets_[getBucketIdx(value)]; |
97 | } |
98 | |
99 | /* Returns the bucket for the specified value */ |
100 | const BucketType& getByValue(ValueType value) const { |
101 | return buckets_[getBucketIdx(value)]; |
102 | } |
103 | |
104 | /* |
105 | * Returns the bucket at the specified index. |
106 | * |
107 | * Note that index 0 is the bucket for all values less than the specified |
108 | * minimum. Index 1 is the first bucket in the specified bucket range. |
109 | */ |
110 | BucketType& getByIndex(size_t idx) { |
111 | return buckets_[idx]; |
112 | } |
113 | |
114 | /* Returns the bucket at the specified index. */ |
115 | const BucketType& getByIndex(size_t idx) const { |
116 | return buckets_[idx]; |
117 | } |
118 | |
119 | /* |
120 | * Returns the minimum threshold for the bucket at the given index. |
121 | * |
122 | * The bucket at the specified index will store values in the range |
123 | * [bucketMin, bucketMin + bucketSize), or [bucketMin, max), if the overall |
124 | * max is smaller than bucketMin + bucketSize. |
125 | */ |
126 | ValueType getBucketMin(size_t idx) const { |
127 | if (idx == 0) { |
128 | return std::numeric_limits<ValueType>::min(); |
129 | } |
130 | if (idx == buckets_.size() - 1) { |
131 | return max_; |
132 | } |
133 | |
134 | return ValueType(min_ + ((idx - 1) * bucketSize_)); |
135 | } |
136 | |
137 | /* |
138 | * Returns the maximum threshold for the bucket at the given index. |
139 | * |
140 | * The bucket at the specified index will store values in the range |
141 | * [bucketMin, bucketMin + bucketSize), or [bucketMin, max), if the overall |
142 | * max is smaller than bucketMin + bucketSize. |
143 | */ |
144 | ValueType getBucketMax(size_t idx) const { |
145 | if (idx == buckets_.size() - 1) { |
146 | return std::numeric_limits<ValueType>::max(); |
147 | } |
148 | |
149 | return ValueType(min_ + (idx * bucketSize_)); |
150 | } |
151 | |
152 | /** |
153 | * Computes the total number of values stored across all buckets. |
154 | * |
155 | * Runs in O(numBuckets) |
156 | * |
157 | * @param countFn A function that takes a const BucketType&, and returns the |
158 | * number of values in that bucket |
159 | * @return Returns the total number of values stored across all buckets |
160 | */ |
161 | template <typename CountFn> |
162 | uint64_t computeTotalCount(CountFn countFromBucket) const; |
163 | |
164 | /** |
165 | * Determine which bucket the specified percentile falls into. |
166 | * |
167 | * Looks for the bucket that contains the Nth percentile data point. |
168 | * |
169 | * @param pct The desired percentile to find, as a value from 0.0 to 1.0. |
170 | * @param countFn A function that takes a const BucketType&, and returns the |
171 | * number of values in that bucket. |
172 | * @param lowPct The lowest percentile stored in the selected bucket will be |
173 | * returned via this parameter. |
174 | * @param highPct The highest percentile stored in the selected bucket will |
175 | * be returned via this parameter. |
176 | * |
177 | * @return Returns the index of the bucket that contains the Nth percentile |
178 | * data point. |
179 | */ |
180 | template <typename CountFn> |
181 | size_t getPercentileBucketIdx( |
182 | double pct, |
183 | CountFn countFromBucket, |
184 | double* lowPct = nullptr, |
185 | double* highPct = nullptr) const; |
186 | |
187 | /** |
188 | * Estimate the value at the specified percentile. |
189 | * |
190 | * @param pct The desired percentile to find, as a value from 0.0 to 1.0. |
191 | * @param countFn A function that takes a const BucketType&, and returns the |
192 | * number of values in that bucket. |
193 | * @param avgFn A function that takes a const BucketType&, and returns the |
194 | * average of all the values in that bucket. |
195 | * |
196 | * @return Returns an estimate for N, where N is the number where exactly pct |
197 | * percentage of the data points in the histogram are less than N. |
198 | */ |
199 | template <typename CountFn, typename AvgFn> |
200 | ValueType getPercentileEstimate( |
201 | double pct, |
202 | CountFn countFromBucket, |
203 | AvgFn avgFromBucket) const; |
204 | |
205 | /* |
206 | * Iterator access to the buckets. |
207 | * |
208 | * Note that the first bucket is for all values less than min, and the last |
209 | * bucket is for all values greater than max. The buckets tracking values in |
210 | * the [min, max) actually start at the second bucket. |
211 | */ |
212 | typename std::vector<BucketType>::const_iterator begin() const { |
213 | return buckets_.begin(); |
214 | } |
215 | typename std::vector<BucketType>::iterator begin() { |
216 | return buckets_.begin(); |
217 | } |
218 | typename std::vector<BucketType>::const_iterator end() const { |
219 | return buckets_.end(); |
220 | } |
221 | typename std::vector<BucketType>::iterator end() { |
222 | return buckets_.end(); |
223 | } |
224 | |
225 | private: |
226 | static constexpr bool kIsExact = std::numeric_limits<ValueType>::is_exact; |
227 | ValueType bucketSize_; |
228 | ValueType min_; |
229 | ValueType max_; |
230 | std::vector<BucketType> buckets_; |
231 | }; |
232 | |
233 | } // namespace detail |
234 | |
235 | /* |
236 | * A basic histogram class. |
237 | * |
238 | * Groups data points into equally-sized buckets, and stores the overall sum of |
239 | * the data points in each bucket, as well as the number of data points in the |
240 | * bucket. |
241 | * |
242 | * The caller must specify the minimum and maximum data points to expect ahead |
243 | * of time, as well as the bucket width. |
244 | */ |
245 | template <typename T> |
246 | class Histogram { |
247 | public: |
248 | typedef T ValueType; |
249 | typedef detail::Bucket<T> Bucket; |
250 | |
251 | Histogram(ValueType bucketSize, ValueType min, ValueType max) |
252 | : buckets_(bucketSize, min, max, Bucket()) {} |
253 | |
254 | /* Add a data point to the histogram */ |
255 | void addValue(ValueType value) { |
256 | Bucket& bucket = buckets_.getByValue(value); |
257 | // NOTE: Overflow is handled elsewhere and tests check this |
258 | // behavior (see HistogramTest.cpp TestOverflow* tests). |
259 | // TODO: It would be nice to handle overflow here and redesign this class. |
260 | auto const addend = to_unsigned(value); |
261 | bucket.sum = static_cast<ValueType>(to_unsigned(bucket.sum) + addend); |
262 | bucket.count += 1; |
263 | } |
264 | |
265 | /* Add multiple same data points to the histogram */ |
266 | void addRepeatedValue(ValueType value, uint64_t nSamples) { |
267 | Bucket& bucket = buckets_.getByValue(value); |
268 | // NOTE: Overflow is handled elsewhere and tests check this |
269 | // behavior (see HistogramTest.cpp TestOverflow* tests). |
270 | // TODO: It would be nice to handle overflow here and redesign this class. |
271 | auto const addend = to_unsigned(value) * nSamples; |
272 | bucket.sum = static_cast<ValueType>(to_unsigned(bucket.sum) + addend); |
273 | bucket.count += nSamples; |
274 | } |
275 | |
276 | /* |
277 | * Remove a data point to the histogram |
278 | * |
279 | * Note that this method does not actually verify that this exact data point |
280 | * had previously been added to the histogram; it merely subtracts the |
281 | * requested value from the appropriate bucket's sum. |
282 | */ |
283 | void removeValue(ValueType value) { |
284 | Bucket& bucket = buckets_.getByValue(value); |
285 | // NOTE: Overflow is handled elsewhere and tests check this |
286 | // behavior (see HistogramTest.cpp TestOverflow* tests). |
287 | // TODO: It would be nice to handle overflow here and redesign this class. |
288 | if (bucket.count > 0) { |
289 | auto const subtrahend = to_unsigned(value); |
290 | bucket.sum = static_cast<ValueType>(to_unsigned(bucket.sum) - subtrahend); |
291 | bucket.count -= 1; |
292 | } else { |
293 | bucket.sum = ValueType(); |
294 | bucket.count = 0; |
295 | } |
296 | } |
297 | |
298 | /* Remove multiple same data points from the histogram */ |
299 | void removeRepeatedValue(ValueType value, uint64_t nSamples) { |
300 | Bucket& bucket = buckets_.getByValue(value); |
301 | // TODO: It would be nice to handle overflow here. |
302 | if (bucket.count >= nSamples) { |
303 | bucket.sum -= value * nSamples; |
304 | bucket.count -= nSamples; |
305 | } else { |
306 | bucket.sum = ValueType(); |
307 | bucket.count = 0; |
308 | } |
309 | } |
310 | |
311 | /* Remove all data points from the histogram */ |
312 | void clear() { |
313 | for (size_t i = 0; i < buckets_.getNumBuckets(); i++) { |
314 | buckets_.getByIndex(i).clear(); |
315 | } |
316 | } |
317 | |
318 | /* Subtract another histogram data from the histogram */ |
319 | void subtract(const Histogram& hist) { |
320 | // the two histogram bucket definitions must match to support |
321 | // subtract. |
322 | if (getBucketSize() != hist.getBucketSize() || getMin() != hist.getMin() || |
323 | getMax() != hist.getMax() || getNumBuckets() != hist.getNumBuckets()) { |
324 | throw std::invalid_argument("Cannot subtract input histogram." ); |
325 | } |
326 | |
327 | for (size_t i = 0; i < buckets_.getNumBuckets(); i++) { |
328 | buckets_.getByIndex(i) -= hist.buckets_.getByIndex(i); |
329 | } |
330 | } |
331 | |
332 | /* Merge two histogram data together */ |
333 | void merge(const Histogram& hist) { |
334 | // the two histogram bucket definitions must match to support |
335 | // a merge. |
336 | if (getBucketSize() != hist.getBucketSize() || getMin() != hist.getMin() || |
337 | getMax() != hist.getMax() || getNumBuckets() != hist.getNumBuckets()) { |
338 | throw std::invalid_argument("Cannot merge from input histogram." ); |
339 | } |
340 | |
341 | for (size_t i = 0; i < buckets_.getNumBuckets(); i++) { |
342 | buckets_.getByIndex(i) += hist.buckets_.getByIndex(i); |
343 | } |
344 | } |
345 | |
346 | /* Copy bucket values from another histogram */ |
347 | void copy(const Histogram& hist) { |
348 | // the two histogram bucket definition must match |
349 | if (getBucketSize() != hist.getBucketSize() || getMin() != hist.getMin() || |
350 | getMax() != hist.getMax() || getNumBuckets() != hist.getNumBuckets()) { |
351 | throw std::invalid_argument("Cannot copy from input histogram." ); |
352 | } |
353 | |
354 | for (size_t i = 0; i < buckets_.getNumBuckets(); i++) { |
355 | buckets_.getByIndex(i) = hist.buckets_.getByIndex(i); |
356 | } |
357 | } |
358 | |
359 | /* Returns the bucket size of each bucket in the histogram. */ |
360 | ValueType getBucketSize() const { |
361 | return buckets_.getBucketSize(); |
362 | } |
363 | /* Returns the min value at which bucketing begins. */ |
364 | ValueType getMin() const { |
365 | return buckets_.getMin(); |
366 | } |
367 | /* Returns the max value at which bucketing ends. */ |
368 | ValueType getMax() const { |
369 | return buckets_.getMax(); |
370 | } |
371 | /* Returns the number of buckets */ |
372 | size_t getNumBuckets() const { |
373 | return buckets_.getNumBuckets(); |
374 | } |
375 | |
376 | /* Returns the specified bucket (for reading only!) */ |
377 | const Bucket& getBucketByIndex(size_t idx) const { |
378 | return buckets_.getByIndex(idx); |
379 | } |
380 | |
381 | /* |
382 | * Returns the minimum threshold for the bucket at the given index. |
383 | * |
384 | * The bucket at the specified index will store values in the range |
385 | * [bucketMin, bucketMin + bucketSize), or [bucketMin, max), if the overall |
386 | * max is smaller than bucketMin + bucketSize. |
387 | */ |
388 | ValueType getBucketMin(size_t idx) const { |
389 | return buckets_.getBucketMin(idx); |
390 | } |
391 | |
392 | /* |
393 | * Returns the maximum threshold for the bucket at the given index. |
394 | * |
395 | * The bucket at the specified index will store values in the range |
396 | * [bucketMin, bucketMin + bucketSize), or [bucketMin, max), if the overall |
397 | * max is smaller than bucketMin + bucketSize. |
398 | */ |
399 | ValueType getBucketMax(size_t idx) const { |
400 | return buckets_.getBucketMax(idx); |
401 | } |
402 | |
403 | /** |
404 | * Computes the total number of values stored across all buckets. |
405 | * |
406 | * Runs in O(numBuckets) |
407 | */ |
408 | uint64_t computeTotalCount() const { |
409 | CountFromBucket countFn; |
410 | return buckets_.computeTotalCount(countFn); |
411 | } |
412 | |
413 | /* |
414 | * Get the bucket that the specified percentile falls into |
415 | * |
416 | * The lowest and highest percentile data points in returned bucket will be |
417 | * returned in the lowPct and highPct arguments, if they are not nullptr. |
418 | */ |
419 | size_t getPercentileBucketIdx( |
420 | double pct, |
421 | double* lowPct = nullptr, |
422 | double* highPct = nullptr) const { |
423 | // We unfortunately can't use lambdas here yet; |
424 | // Some users of this code are still built with gcc-4.4. |
425 | CountFromBucket countFn; |
426 | return buckets_.getPercentileBucketIdx(pct, countFn, lowPct, highPct); |
427 | } |
428 | |
429 | /** |
430 | * Estimate the value at the specified percentile. |
431 | * |
432 | * @param pct The desired percentile to find, as a value from 0.0 to 1.0. |
433 | * |
434 | * @return Returns an estimate for N, where N is the number where exactly pct |
435 | * percentage of the data points in the histogram are less than N. |
436 | */ |
437 | ValueType getPercentileEstimate(double pct) const { |
438 | CountFromBucket countFn; |
439 | AvgFromBucket avgFn; |
440 | return buckets_.getPercentileEstimate(pct, countFn, avgFn); |
441 | } |
442 | |
443 | /* |
444 | * Get a human-readable string describing the histogram contents |
445 | */ |
446 | std::string debugString() const; |
447 | |
448 | /* |
449 | * Write the histogram contents in tab-separated values (TSV) format. |
450 | * Format is "min max count sum". |
451 | */ |
452 | void toTSV(std::ostream& out, bool skipEmptyBuckets = true) const; |
453 | |
454 | struct CountFromBucket { |
455 | uint64_t operator()(const Bucket& bucket) const { |
456 | return bucket.count; |
457 | } |
458 | }; |
459 | struct AvgFromBucket { |
460 | ValueType operator()(const Bucket& bucket) const { |
461 | if (bucket.count == 0) { |
462 | return ValueType(0); |
463 | } |
464 | // Cast bucket.count to a signed integer type. This ensures that we |
465 | // perform division properly here: If bucket.sum is a signed integer |
466 | // type but we divide by an unsigned number, unsigned division will be |
467 | // performed and bucket.sum will be converted to unsigned first. |
468 | // If bucket.sum is unsigned, the code will still do unsigned division |
469 | // correctly. |
470 | // |
471 | // The only downside is if bucket.count is large enough to be negative |
472 | // when treated as signed. That should be extremely unlikely, though. |
473 | return bucket.sum / static_cast<int64_t>(bucket.count); |
474 | } |
475 | }; |
476 | |
477 | private: |
478 | template <typename S, typename = std::enable_if_t<std::is_integral<S>::value>> |
479 | static constexpr std::make_unsigned_t<S> to_unsigned(S s) { |
480 | return static_cast<std::make_unsigned_t<S>>(s); |
481 | } |
482 | template < |
483 | typename S, |
484 | typename = std::enable_if_t<!std::is_integral<S>::value>> |
485 | static constexpr S to_unsigned(S s) { |
486 | return s; |
487 | } |
488 | |
489 | detail::HistogramBuckets<ValueType, Bucket> buckets_; |
490 | }; |
491 | |
492 | } // namespace folly |
493 | |
494 | // MSVC 2017 Update 3/4 has an issue with explicitly instantiating templated |
495 | // functions with default arguments inside templated classes when compiled |
496 | // with /permissive- (the default for the CMake build), so we directly include |
497 | // the -defs as if it were -inl, and don't provide the explicit instantiations. |
498 | // https://developercommunity.visualstudio.com/content/problem/81223/incorrect-error-c5037-with-permissive.html |
499 | #if defined(_MSC_VER) && _MSC_FULL_VER >= 191125506 && \ |
500 | _MSC_FULL_VER <= 191125547 |
501 | #define FOLLY_MSVC_USE_WORKAROUND_FOR_C5037 1 |
502 | #else |
503 | #define FOLLY_MSVC_USE_WORKAROUND_FOR_C5037 0 |
504 | #endif |
505 | |
506 | #if FOLLY_MSVC_USE_WORKAROUND_FOR_C5037 |
507 | #include <folly/stats/Histogram-defs.h> |
508 | #endif |
509 | |