1 | /* |
2 | * Copyright 2013-present Facebook, Inc. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | |
17 | #pragma once |
18 | |
19 | #include <folly/Conv.h> |
20 | #include <folly/stats/Histogram.h> |
21 | |
22 | #include <glog/logging.h> |
23 | |
24 | namespace folly { |
25 | |
26 | namespace detail { |
27 | |
28 | template <typename T, typename BucketT> |
29 | HistogramBuckets<T, BucketT>::HistogramBuckets( |
30 | ValueType bucketSize, |
31 | ValueType min, |
32 | ValueType max, |
33 | const BucketType& defaultBucket) |
34 | : bucketSize_(bucketSize), min_(min), max_(max) { |
35 | CHECK_GT(bucketSize_, ValueType(0)); |
36 | CHECK_LT(min_, max_); |
37 | |
38 | // Deliberately make this a signed type, because we're about |
39 | // to compare it against max-min, which is nominally signed, too. |
40 | int64_t numBuckets = int64_t((max - min) / bucketSize); |
41 | // Round up if the bucket size does not fit evenly |
42 | if (numBuckets * bucketSize < max - min) { |
43 | ++numBuckets; |
44 | } |
45 | // Add 2 for the extra 'below min' and 'above max' buckets |
46 | numBuckets += 2; |
47 | buckets_.assign(size_t(numBuckets), defaultBucket); |
48 | } |
49 | |
50 | template <typename T, typename BucketType> |
51 | size_t HistogramBuckets<T, BucketType>::getBucketIdx(ValueType value) const { |
52 | if (value < min_) { |
53 | return 0; |
54 | } else if (value >= max_) { |
55 | return buckets_.size() - 1; |
56 | } else { |
57 | // the 1 is the below_min bucket |
58 | return size_t(((value - min_) / bucketSize_) + 1); |
59 | } |
60 | } |
61 | |
62 | template <typename T, typename BucketType> |
63 | template <typename CountFn> |
64 | uint64_t HistogramBuckets<T, BucketType>::computeTotalCount( |
65 | CountFn countFromBucket) const { |
66 | uint64_t count = 0; |
67 | for (size_t n = 0; n < buckets_.size(); ++n) { |
68 | count += countFromBucket(const_cast<const BucketType&>(buckets_[n])); |
69 | } |
70 | return count; |
71 | } |
72 | |
73 | template <typename T, typename BucketType> |
74 | template <typename CountFn> |
75 | size_t HistogramBuckets<T, BucketType>::getPercentileBucketIdx( |
76 | double pct, |
77 | CountFn countFromBucket, |
78 | double* lowPct, |
79 | double* highPct) const { |
80 | CHECK_GE(pct, 0.0); |
81 | CHECK_LE(pct, 1.0); |
82 | |
83 | auto numBuckets = buckets_.size(); |
84 | |
85 | // Compute the counts in each bucket |
86 | std::vector<uint64_t> counts(numBuckets); |
87 | uint64_t totalCount = 0; |
88 | for (size_t n = 0; n < numBuckets; ++n) { |
89 | uint64_t bucketCount = |
90 | countFromBucket(const_cast<const BucketType&>(buckets_[n])); |
91 | counts[n] = bucketCount; |
92 | totalCount += bucketCount; |
93 | } |
94 | |
95 | // If there are no elements, just return the lowest bucket. |
96 | // Note that we return bucket 1, which is the first bucket in the |
97 | // histogram range; bucket 0 is for all values below min_. |
98 | if (totalCount == 0) { |
99 | // Set lowPct and highPct both to 0. |
100 | // getPercentileEstimate() will recognize this to mean that the histogram |
101 | // is empty. |
102 | if (lowPct) { |
103 | *lowPct = 0.0; |
104 | } |
105 | if (highPct) { |
106 | *highPct = 0.0; |
107 | } |
108 | return 1; |
109 | } |
110 | |
111 | // Loop through all the buckets, keeping track of each bucket's |
112 | // percentile range: [0,10], [10,17], [17,45], etc. When we find a range |
113 | // that includes our desired percentile, we return that bucket index. |
114 | double prevPct = 0.0; |
115 | double curPct = 0.0; |
116 | uint64_t curCount = 0; |
117 | size_t idx; |
118 | for (idx = 0; idx < numBuckets; ++idx) { |
119 | if (counts[idx] == 0) { |
120 | // skip empty buckets |
121 | continue; |
122 | } |
123 | |
124 | prevPct = curPct; |
125 | curCount += counts[idx]; |
126 | curPct = static_cast<double>(curCount) / totalCount; |
127 | if (pct <= curPct) { |
128 | // This is the desired bucket |
129 | break; |
130 | } |
131 | } |
132 | |
133 | if (lowPct) { |
134 | *lowPct = prevPct; |
135 | } |
136 | if (highPct) { |
137 | *highPct = curPct; |
138 | } |
139 | return idx; |
140 | } |
141 | |
142 | template <typename T, typename BucketType> |
143 | template <typename CountFn, typename AvgFn> |
144 | T HistogramBuckets<T, BucketType>::getPercentileEstimate( |
145 | double pct, |
146 | CountFn countFromBucket, |
147 | AvgFn avgFromBucket) const { |
148 | // Find the bucket where this percentile falls |
149 | double lowPct; |
150 | double highPct; |
151 | size_t bucketIdx = |
152 | getPercentileBucketIdx(pct, countFromBucket, &lowPct, &highPct); |
153 | if (lowPct == 0.0 && highPct == 0.0) { |
154 | // Invalid range -- the buckets must all be empty |
155 | // Return the default value for ValueType. |
156 | return ValueType(); |
157 | } |
158 | if (lowPct == highPct) { |
159 | // Unlikely to have exact equality, |
160 | // but just return the bucket average in this case. |
161 | // We handle this here to avoid division by 0 below. |
162 | return avgFromBucket(buckets_[bucketIdx]); |
163 | } |
164 | |
165 | CHECK_GE(pct, lowPct); |
166 | CHECK_LE(pct, highPct); |
167 | CHECK_LT(lowPct, highPct); |
168 | |
169 | // Compute information about this bucket |
170 | ValueType avg = avgFromBucket(buckets_[bucketIdx]); |
171 | ValueType low; |
172 | ValueType high; |
173 | if (bucketIdx == 0) { |
174 | if (kIsExact && min_ < avg) { |
175 | // This normally shouldn't happen. This bucket is only supposed to track |
176 | // values less than min_. Most likely this means that integer overflow |
177 | // occurred, and the code in avgFromBucket() returned a huge value |
178 | // instead of a small one. Just return the minimum possible value for |
179 | // now. |
180 | // |
181 | // (Note that if the counter keeps being decremented, eventually it will |
182 | // wrap and become small enough that we won't detect this any more, and |
183 | // we will return bogus information.) |
184 | LOG(ERROR) << "invalid average value in histogram minimum bucket: " << avg |
185 | << " > " << min_ << ": possible integer overflow?" ; |
186 | return getBucketMin(bucketIdx); |
187 | } |
188 | // For the below-min bucket, just assume the lowest value ever seen is |
189 | // twice as far away from min_ as avg. |
190 | high = min_; |
191 | low = high - (2 * (high - avg)); |
192 | // Adjust low in case it wrapped |
193 | if (low > avg) { |
194 | low = std::numeric_limits<ValueType>::min(); |
195 | } |
196 | } else if (bucketIdx == buckets_.size() - 1) { |
197 | if (kIsExact && avg < max_) { |
198 | // Most likely this means integer overflow occurred. See the comments |
199 | // above in the minimum case. |
200 | LOG(ERROR) << "invalid average value in histogram maximum bucket: " << avg |
201 | << " < " << max_ << ": possible integer overflow?" ; |
202 | return getBucketMax(bucketIdx); |
203 | } |
204 | // Similarly for the above-max bucket, assume the highest value ever seen |
205 | // is twice as far away from max_ as avg. |
206 | low = max_; |
207 | high = low + (2 * (avg - low)); |
208 | // Adjust high in case it wrapped |
209 | if (high < avg) { |
210 | high = std::numeric_limits<ValueType>::max(); |
211 | } |
212 | } else { |
213 | low = getBucketMin(bucketIdx); |
214 | high = getBucketMax(bucketIdx); |
215 | if (kIsExact && (avg < low || avg > high)) { |
216 | // Most likely this means an integer overflow occurred. |
217 | // See the comments above. Return the midpoint between low and high |
218 | // as a best guess, since avg is meaningless. |
219 | LOG(ERROR) << "invalid average value in histogram bucket: " << avg |
220 | << " not in range [" << low << ", " << high |
221 | << "]: possible integer overflow?" ; |
222 | return (low + high) / 2; |
223 | } |
224 | } |
225 | |
226 | // Since we know the average value in this bucket, we can do slightly better |
227 | // than just assuming the data points in this bucket are uniformly |
228 | // distributed between low and high. |
229 | // |
230 | // Assume that the median value in this bucket is the same as the average |
231 | // value. |
232 | double medianPct = (lowPct + highPct) / 2.0; |
233 | if (pct < medianPct) { |
234 | // Assume that the data points lower than the median of this bucket |
235 | // are uniformly distributed between low and avg |
236 | double pctThroughSection = (pct - lowPct) / (medianPct - lowPct); |
237 | return T(low + ((avg - low) * pctThroughSection)); |
238 | } else { |
239 | // Assume that the data points greater than the median of this bucket |
240 | // are uniformly distributed between avg and high |
241 | double pctThroughSection = (pct - medianPct) / (highPct - medianPct); |
242 | return T(avg + ((high - avg) * pctThroughSection)); |
243 | } |
244 | } |
245 | |
246 | } // namespace detail |
247 | |
248 | template <typename T> |
249 | std::string Histogram<T>::debugString() const { |
250 | std::string ret = folly::to<std::string>( |
251 | "num buckets: " , |
252 | buckets_.getNumBuckets(), |
253 | ", bucketSize: " , |
254 | buckets_.getBucketSize(), |
255 | ", min: " , |
256 | buckets_.getMin(), |
257 | ", max: " , |
258 | buckets_.getMax(), |
259 | "\n" ); |
260 | |
261 | for (size_t i = 0; i < buckets_.getNumBuckets(); ++i) { |
262 | folly::toAppend( |
263 | " " , |
264 | buckets_.getBucketMin(i), |
265 | ": " , |
266 | buckets_.getByIndex(i).count, |
267 | "\n" , |
268 | &ret); |
269 | } |
270 | |
271 | return ret; |
272 | } |
273 | |
274 | template <typename T> |
275 | void Histogram<T>::toTSV(std::ostream& out, bool skipEmptyBuckets) const { |
276 | for (size_t i = 0; i < buckets_.getNumBuckets(); ++i) { |
277 | // Do not output empty buckets in order to reduce data file size. |
278 | if (skipEmptyBuckets && getBucketByIndex(i).count == 0) { |
279 | continue; |
280 | } |
281 | const auto& bucket = getBucketByIndex(i); |
282 | out << getBucketMin(i) << '\t' << getBucketMax(i) << '\t' << bucket.count |
283 | << '\t' << bucket.sum << '\n'; |
284 | } |
285 | } |
286 | |
287 | } // namespace folly |
288 | |