Histogram-defs.h source code [folly/stats/Histogram-defs.h]

1	/*
2	* Copyright 2013-present Facebook, Inc.
3	*
4	* Licensed under the Apache License, Version 2.0 (the "License");
5	* you may not use this file except in compliance with the License.
6	* You may obtain a copy of the License at
7	*
8	* http://www.apache.org/licenses/LICENSE-2.0
9	*
10	* Unless required by applicable law or agreed to in writing, software
11	* distributed under the License is distributed on an "AS IS" BASIS,
12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13	* See the License for the specific language governing permissions and
14	* limitations under the License.
15	*/
16
17	#pragma once
18
19	#include <folly/Conv.h>
20	#include <folly/stats/Histogram.h>
21
22	#include <glog/logging.h>
23
24	namespace folly {
25
26	namespace detail {
27
28	template <typename T, typename BucketT>
29	HistogramBuckets<T, BucketT>::HistogramBuckets(
30	ValueType bucketSize,
31	ValueType min,
32	ValueType max,
33	const BucketType& defaultBucket)
34	: bucketSize_(bucketSize), min_(min), max_(max) {
35	CHECK_GT(bucketSize_, ValueType(`0`));
36	CHECK_LT(min_, max_);
37
38	// Deliberately make this a signed type, because we're about
39	// to compare it against max-min, which is nominally signed, too.
40	int64_t numBuckets = int64_t((max - min) / bucketSize);
41	// Round up if the bucket size does not fit evenly
42	if (numBuckets * bucketSize < max - min) {
43	++numBuckets;
44	}
45	// Add 2 for the extra 'below min' and 'above max' buckets
46	numBuckets += `2`;
47	buckets_.assign(size_t(numBuckets), defaultBucket);
48	}
49
50	template <typename T, typename BucketType>
51	size_t HistogramBuckets<T, BucketType>::getBucketIdx(ValueType value) const {
52	if (value < min_) {
53	return `0`;
54	} else if (value >= max_) {
55	return buckets_.size() - `1`;
56	} else {
57	// the 1 is the below_min bucket
58	return size_t(((value - min_) / bucketSize_) + `1`);
59	}
60	}
61
62	template <typename T, typename BucketType>
63	template <typename CountFn>
64	uint64_t HistogramBuckets<T, BucketType>::computeTotalCount(
65	CountFn countFromBucket) const {
66	uint64_t count = `0`;
67	for (size_t n = `0`; n < buckets_.size(); ++n) {
68	count += countFromBucket(const_cast<const BucketType&>(buckets_[n]));
69	}
70	return count;
71	}
72
73	template <typename T, typename BucketType>
74	template <typename CountFn>
75	size_t HistogramBuckets<T, BucketType>::getPercentileBucketIdx(
76	double pct,
77	CountFn countFromBucket,
78	double* lowPct,
79	double* highPct) const {
80	CHECK_GE(pct, `0.0`);
81	CHECK_LE(pct, `1.0`);
82
83	auto numBuckets = buckets_.size();
84
85	// Compute the counts in each bucket
86	std::vector<uint64_t> counts(numBuckets);
87	uint64_t totalCount = `0`;
88	for (size_t n = `0`; n < numBuckets; ++n) {
89	uint64_t bucketCount =
90	countFromBucket(const_cast<const BucketType&>(buckets_[n]));
91	counts [n] = bucketCount;
92	totalCount += bucketCount;
93	}
94
95	// If there are no elements, just return the lowest bucket.
96	// Note that we return bucket 1, which is the first bucket in the
97	// histogram range; bucket 0 is for all values below min_.
98	if (totalCount == `0`) {
99	// Set lowPct and highPct both to 0.
100	// getPercentileEstimate() will recognize this to mean that the histogram
101	// is empty.
102	if (lowPct) {
103	*lowPct = `0.0`;
104	}
105	if (highPct) {
106	*highPct = `0.0`;
107	}
108	return `1`;
109	}
110
111	// Loop through all the buckets, keeping track of each bucket's
112	// percentile range: [0,10], [10,17], [17,45], etc. When we find a range
113	// that includes our desired percentile, we return that bucket index.
114	double prevPct = `0.0`;
115	double curPct = `0.0`;
116	uint64_t curCount = `0`;
117	size_t idx;
118	for (idx = `0`; idx < numBuckets; ++idx) {
119	if (counts [idx] == `0`) {
120	// skip empty buckets
121	continue;
122	}
123
124	prevPct = curPct;
125	curCount += counts [idx];
126	curPct = static_cast<double>(curCount) / totalCount;
127	if (pct <= curPct) {
128	// This is the desired bucket
129	break;
130	}
131	}
132
133	if (lowPct) {
134	*lowPct = prevPct;
135	}
136	if (highPct) {
137	*highPct = curPct;
138	}
139	return idx;
140	}
141
142	template <typename T, typename BucketType>
143	template <typename CountFn, typename AvgFn>
144	T HistogramBuckets<T, BucketType>::getPercentileEstimate(
145	double pct,
146	CountFn countFromBucket,
147	AvgFn avgFromBucket) const {
148	// Find the bucket where this percentile falls
149	double lowPct;
150	double highPct;
151	size_t bucketIdx =
152	getPercentileBucketIdx(pct, countFromBucket, &lowPct, &highPct);
153	if (lowPct == `0.0` && highPct == `0.0`) {
154	// Invalid range -- the buckets must all be empty
155	// Return the default value for ValueType.
156	return ValueType();
157	}
158	if (lowPct == highPct) {
159	// Unlikely to have exact equality,
160	// but just return the bucket average in this case.
161	// We handle this here to avoid division by 0 below.
162	return avgFromBucket(buckets_[bucketIdx]);
163	}
164
165	CHECK_GE(pct, lowPct);
166	CHECK_LE(pct, highPct);
167	CHECK_LT(lowPct, highPct);
168
169	// Compute information about this bucket
170	ValueType avg = avgFromBucket(buckets_[bucketIdx]);
171	ValueType low;
172	ValueType high;
173	if (bucketIdx == `0`) {
174	if (kIsExact && min_ < avg) {
175	// This normally shouldn't happen. This bucket is only supposed to track
176	// values less than min_. Most likely this means that integer overflow
177	// occurred, and the code in avgFromBucket() returned a huge value
178	// instead of a small one. Just return the minimum possible value for
179	// now.
180	//
181	// (Note that if the counter keeps being decremented, eventually it will
182	// wrap and become small enough that we won't detect this any more, and
183	// we will return bogus information.)
184	LOG(ERROR) << "invalid average value in histogram minimum bucket: " << avg
185	<< " > " << min_ << ": possible integer overflow?";
186	return getBucketMin(bucketIdx);
187	}
188	// For the below-min bucket, just assume the lowest value ever seen is
189	// twice as far away from min_ as avg.
190	high = min_;
191	low = high - (`2` * (high - avg));
192	// Adjust low in case it wrapped
193	if (low > avg) {
194	low = std::numeric_limits<ValueType>::min();
195	}
196	} else if (bucketIdx == buckets_.size() - `1`) {
197	if (kIsExact && avg < max_) {
198	// Most likely this means integer overflow occurred. See the comments
199	// above in the minimum case.
200	LOG(ERROR) << "invalid average value in histogram maximum bucket: " << avg
201	<< " < " << max_ << ": possible integer overflow?";
202	return getBucketMax(bucketIdx);
203	}
204	// Similarly for the above-max bucket, assume the highest value ever seen
205	// is twice as far away from max_ as avg.
206	low = max_;
207	high = low + (`2` * (avg - low));
208	// Adjust high in case it wrapped
209	if (high < avg) {
210	high = std::numeric_limits<ValueType>::max();
211	}
212	} else {
213	low = getBucketMin(bucketIdx);
214	high = getBucketMax(bucketIdx);
215	if (kIsExact && (avg < low \|\| avg > high)) {
216	// Most likely this means an integer overflow occurred.
217	// See the comments above. Return the midpoint between low and high
218	// as a best guess, since avg is meaningless.
219	LOG(ERROR) << "invalid average value in histogram bucket: " << avg
220	<< " not in range [" << low << ", " << high
221	<< "]: possible integer overflow?";
222	return (low + high) / `2`;
223	}
224	}
225
226	// Since we know the average value in this bucket, we can do slightly better
227	// than just assuming the data points in this bucket are uniformly
228	// distributed between low and high.
229	//
230	// Assume that the median value in this bucket is the same as the average
231	// value.
232	double medianPct = (lowPct + highPct) / `2.0`;
233	if (pct < medianPct) {
234	// Assume that the data points lower than the median of this bucket
235	// are uniformly distributed between low and avg
236	double pctThroughSection = (pct - lowPct) / (medianPct - lowPct);
237	return T(low + ((avg - low) * pctThroughSection));
238	} else {
239	// Assume that the data points greater than the median of this bucket
240	// are uniformly distributed between avg and high
241	double pctThroughSection = (pct - medianPct) / (highPct - medianPct);
242	return T(avg + ((high - avg) * pctThroughSection));
243	}
244	}
245
246	} // namespace detail
247
248	template <typename T>
249	std::string Histogram<T>::debugString() const {
250	std::string ret = folly::to<std::string>(
251	"num buckets: ",
252	buckets_.getNumBuckets(),
253	", bucketSize: ",
254	buckets_.getBucketSize(),
255	", min: ",
256	buckets_.getMin(),
257	", max: ",
258	buckets_.getMax(),
259	"\n");
260
261	for (size_t i = `0`; i < buckets_.getNumBuckets(); ++i) {
262	folly::toAppend(
263	" ",
264	buckets_.getBucketMin(i),
265	": ",
266	buckets_.getByIndex(i).count,
267	"\n",
268	&ret);
269	}
270
271	return ret;
272	}
273
274	template <typename T>
275	void Histogram<T>::toTSV(std::ostream& out, bool skipEmptyBuckets) const {
276	for (size_t i = `0`; i < buckets_.getNumBuckets(); ++i) {
277	// Do not output empty buckets in order to reduce data file size.
278	if (skipEmptyBuckets && getBucketByIndex(i).count == `0`) {
279	continue;
280	}
281	const auto& bucket = getBucketByIndex(i);
282	out << getBucketMin(i) << `'\t'` << getBucketMax(i) << `'\t'` << bucket.count
283	<< `'\t'` << bucket.sum << `'\n'`;
284	}
285	}
286
287	} // namespace folly
288

Browse the source code of folly/stats/Histogram-defs.h