1/*
2 * Copyright 2013-present Facebook, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#pragma once
18
19#include <folly/Conv.h>
20#include <folly/stats/Histogram.h>
21
22#include <glog/logging.h>
23
24namespace folly {
25
26namespace detail {
27
28template <typename T, typename BucketT>
29HistogramBuckets<T, BucketT>::HistogramBuckets(
30 ValueType bucketSize,
31 ValueType min,
32 ValueType max,
33 const BucketType& defaultBucket)
34 : bucketSize_(bucketSize), min_(min), max_(max) {
35 CHECK_GT(bucketSize_, ValueType(0));
36 CHECK_LT(min_, max_);
37
38 // Deliberately make this a signed type, because we're about
39 // to compare it against max-min, which is nominally signed, too.
40 int64_t numBuckets = int64_t((max - min) / bucketSize);
41 // Round up if the bucket size does not fit evenly
42 if (numBuckets * bucketSize < max - min) {
43 ++numBuckets;
44 }
45 // Add 2 for the extra 'below min' and 'above max' buckets
46 numBuckets += 2;
47 buckets_.assign(size_t(numBuckets), defaultBucket);
48}
49
50template <typename T, typename BucketType>
51size_t HistogramBuckets<T, BucketType>::getBucketIdx(ValueType value) const {
52 if (value < min_) {
53 return 0;
54 } else if (value >= max_) {
55 return buckets_.size() - 1;
56 } else {
57 // the 1 is the below_min bucket
58 return size_t(((value - min_) / bucketSize_) + 1);
59 }
60}
61
62template <typename T, typename BucketType>
63template <typename CountFn>
64uint64_t HistogramBuckets<T, BucketType>::computeTotalCount(
65 CountFn countFromBucket) const {
66 uint64_t count = 0;
67 for (size_t n = 0; n < buckets_.size(); ++n) {
68 count += countFromBucket(const_cast<const BucketType&>(buckets_[n]));
69 }
70 return count;
71}
72
73template <typename T, typename BucketType>
74template <typename CountFn>
75size_t HistogramBuckets<T, BucketType>::getPercentileBucketIdx(
76 double pct,
77 CountFn countFromBucket,
78 double* lowPct,
79 double* highPct) const {
80 CHECK_GE(pct, 0.0);
81 CHECK_LE(pct, 1.0);
82
83 auto numBuckets = buckets_.size();
84
85 // Compute the counts in each bucket
86 std::vector<uint64_t> counts(numBuckets);
87 uint64_t totalCount = 0;
88 for (size_t n = 0; n < numBuckets; ++n) {
89 uint64_t bucketCount =
90 countFromBucket(const_cast<const BucketType&>(buckets_[n]));
91 counts[n] = bucketCount;
92 totalCount += bucketCount;
93 }
94
95 // If there are no elements, just return the lowest bucket.
96 // Note that we return bucket 1, which is the first bucket in the
97 // histogram range; bucket 0 is for all values below min_.
98 if (totalCount == 0) {
99 // Set lowPct and highPct both to 0.
100 // getPercentileEstimate() will recognize this to mean that the histogram
101 // is empty.
102 if (lowPct) {
103 *lowPct = 0.0;
104 }
105 if (highPct) {
106 *highPct = 0.0;
107 }
108 return 1;
109 }
110
111 // Loop through all the buckets, keeping track of each bucket's
112 // percentile range: [0,10], [10,17], [17,45], etc. When we find a range
113 // that includes our desired percentile, we return that bucket index.
114 double prevPct = 0.0;
115 double curPct = 0.0;
116 uint64_t curCount = 0;
117 size_t idx;
118 for (idx = 0; idx < numBuckets; ++idx) {
119 if (counts[idx] == 0) {
120 // skip empty buckets
121 continue;
122 }
123
124 prevPct = curPct;
125 curCount += counts[idx];
126 curPct = static_cast<double>(curCount) / totalCount;
127 if (pct <= curPct) {
128 // This is the desired bucket
129 break;
130 }
131 }
132
133 if (lowPct) {
134 *lowPct = prevPct;
135 }
136 if (highPct) {
137 *highPct = curPct;
138 }
139 return idx;
140}
141
142template <typename T, typename BucketType>
143template <typename CountFn, typename AvgFn>
144T HistogramBuckets<T, BucketType>::getPercentileEstimate(
145 double pct,
146 CountFn countFromBucket,
147 AvgFn avgFromBucket) const {
148 // Find the bucket where this percentile falls
149 double lowPct;
150 double highPct;
151 size_t bucketIdx =
152 getPercentileBucketIdx(pct, countFromBucket, &lowPct, &highPct);
153 if (lowPct == 0.0 && highPct == 0.0) {
154 // Invalid range -- the buckets must all be empty
155 // Return the default value for ValueType.
156 return ValueType();
157 }
158 if (lowPct == highPct) {
159 // Unlikely to have exact equality,
160 // but just return the bucket average in this case.
161 // We handle this here to avoid division by 0 below.
162 return avgFromBucket(buckets_[bucketIdx]);
163 }
164
165 CHECK_GE(pct, lowPct);
166 CHECK_LE(pct, highPct);
167 CHECK_LT(lowPct, highPct);
168
169 // Compute information about this bucket
170 ValueType avg = avgFromBucket(buckets_[bucketIdx]);
171 ValueType low;
172 ValueType high;
173 if (bucketIdx == 0) {
174 if (kIsExact && min_ < avg) {
175 // This normally shouldn't happen. This bucket is only supposed to track
176 // values less than min_. Most likely this means that integer overflow
177 // occurred, and the code in avgFromBucket() returned a huge value
178 // instead of a small one. Just return the minimum possible value for
179 // now.
180 //
181 // (Note that if the counter keeps being decremented, eventually it will
182 // wrap and become small enough that we won't detect this any more, and
183 // we will return bogus information.)
184 LOG(ERROR) << "invalid average value in histogram minimum bucket: " << avg
185 << " > " << min_ << ": possible integer overflow?";
186 return getBucketMin(bucketIdx);
187 }
188 // For the below-min bucket, just assume the lowest value ever seen is
189 // twice as far away from min_ as avg.
190 high = min_;
191 low = high - (2 * (high - avg));
192 // Adjust low in case it wrapped
193 if (low > avg) {
194 low = std::numeric_limits<ValueType>::min();
195 }
196 } else if (bucketIdx == buckets_.size() - 1) {
197 if (kIsExact && avg < max_) {
198 // Most likely this means integer overflow occurred. See the comments
199 // above in the minimum case.
200 LOG(ERROR) << "invalid average value in histogram maximum bucket: " << avg
201 << " < " << max_ << ": possible integer overflow?";
202 return getBucketMax(bucketIdx);
203 }
204 // Similarly for the above-max bucket, assume the highest value ever seen
205 // is twice as far away from max_ as avg.
206 low = max_;
207 high = low + (2 * (avg - low));
208 // Adjust high in case it wrapped
209 if (high < avg) {
210 high = std::numeric_limits<ValueType>::max();
211 }
212 } else {
213 low = getBucketMin(bucketIdx);
214 high = getBucketMax(bucketIdx);
215 if (kIsExact && (avg < low || avg > high)) {
216 // Most likely this means an integer overflow occurred.
217 // See the comments above. Return the midpoint between low and high
218 // as a best guess, since avg is meaningless.
219 LOG(ERROR) << "invalid average value in histogram bucket: " << avg
220 << " not in range [" << low << ", " << high
221 << "]: possible integer overflow?";
222 return (low + high) / 2;
223 }
224 }
225
226 // Since we know the average value in this bucket, we can do slightly better
227 // than just assuming the data points in this bucket are uniformly
228 // distributed between low and high.
229 //
230 // Assume that the median value in this bucket is the same as the average
231 // value.
232 double medianPct = (lowPct + highPct) / 2.0;
233 if (pct < medianPct) {
234 // Assume that the data points lower than the median of this bucket
235 // are uniformly distributed between low and avg
236 double pctThroughSection = (pct - lowPct) / (medianPct - lowPct);
237 return T(low + ((avg - low) * pctThroughSection));
238 } else {
239 // Assume that the data points greater than the median of this bucket
240 // are uniformly distributed between avg and high
241 double pctThroughSection = (pct - medianPct) / (highPct - medianPct);
242 return T(avg + ((high - avg) * pctThroughSection));
243 }
244}
245
246} // namespace detail
247
248template <typename T>
249std::string Histogram<T>::debugString() const {
250 std::string ret = folly::to<std::string>(
251 "num buckets: ",
252 buckets_.getNumBuckets(),
253 ", bucketSize: ",
254 buckets_.getBucketSize(),
255 ", min: ",
256 buckets_.getMin(),
257 ", max: ",
258 buckets_.getMax(),
259 "\n");
260
261 for (size_t i = 0; i < buckets_.getNumBuckets(); ++i) {
262 folly::toAppend(
263 " ",
264 buckets_.getBucketMin(i),
265 ": ",
266 buckets_.getByIndex(i).count,
267 "\n",
268 &ret);
269 }
270
271 return ret;
272}
273
274template <typename T>
275void Histogram<T>::toTSV(std::ostream& out, bool skipEmptyBuckets) const {
276 for (size_t i = 0; i < buckets_.getNumBuckets(); ++i) {
277 // Do not output empty buckets in order to reduce data file size.
278 if (skipEmptyBuckets && getBucketByIndex(i).count == 0) {
279 continue;
280 }
281 const auto& bucket = getBucketByIndex(i);
282 out << getBucketMin(i) << '\t' << getBucketMax(i) << '\t' << bucket.count
283 << '\t' << bucket.sum << '\n';
284 }
285}
286
287} // namespace folly
288