1/*
2 * Copyright 2012-present Facebook, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#pragma once
18
19#include <folly/Portability.h>
20#include <folly/Preprocessor.h> // for FB_ANONYMOUS_VARIABLE
21#include <folly/ScopeGuard.h>
22#include <folly/Traits.h>
23#include <folly/functional/Invoke.h>
24#include <folly/portability/GFlags.h>
25
26#include <cassert>
27#include <chrono>
28#include <functional>
29#include <limits>
30#include <type_traits>
31#include <unordered_map>
32
33#include <boost/function_types/function_arity.hpp>
34#include <glog/logging.h>
35
36DECLARE_bool(benchmark);
37
38namespace folly {
39
40/**
41 * Runs all benchmarks defined. Usually put in main().
42 */
43void runBenchmarks();
44
45/**
46 * Runs all benchmarks defined if and only if the --benchmark flag has
47 * been passed to the program. Usually put in main().
48 */
49inline bool runBenchmarksOnFlag() {
50 if (FLAGS_benchmark) {
51 runBenchmarks();
52 }
53 return FLAGS_benchmark;
54}
55
56using UserCounters = std::unordered_map<std::string, int>;
57
58namespace detail {
59struct TimeIterData {
60 std::chrono::high_resolution_clock::duration duration;
61 unsigned int niter;
62 UserCounters userCounters;
63};
64
65using BenchmarkFun = std::function<TimeIterData(unsigned int)>;
66
67struct BenchmarkRegistration {
68 std::string file;
69 std::string name;
70 BenchmarkFun func;
71 bool useCounter = false;
72};
73
74struct BenchmarkResult {
75 std::string file;
76 std::string name;
77 double timeInNs;
78 UserCounters counters;
79};
80
81/**
82 * Adds a benchmark wrapped in a std::function. Only used
83 * internally. Pass by value is intentional.
84 */
85void addBenchmarkImpl(
86 const char* file,
87 const char* name,
88 BenchmarkFun,
89 bool useCounter);
90
91} // namespace detail
92
93/**
94 * Supporting type for BENCHMARK_SUSPEND defined below.
95 */
96struct BenchmarkSuspender {
97 using Clock = std::chrono::high_resolution_clock;
98 using TimePoint = Clock::time_point;
99 using Duration = Clock::duration;
100
101 BenchmarkSuspender() {
102 start = Clock::now();
103 }
104
105 BenchmarkSuspender(const BenchmarkSuspender&) = delete;
106 BenchmarkSuspender(BenchmarkSuspender&& rhs) noexcept {
107 start = rhs.start;
108 rhs.start = {};
109 }
110
111 BenchmarkSuspender& operator=(const BenchmarkSuspender&) = delete;
112 BenchmarkSuspender& operator=(BenchmarkSuspender&& rhs) {
113 if (start != TimePoint{}) {
114 tally();
115 }
116 start = rhs.start;
117 rhs.start = {};
118 return *this;
119 }
120
121 ~BenchmarkSuspender() {
122 if (start != TimePoint{}) {
123 tally();
124 }
125 }
126
127 void dismiss() {
128 assert(start != TimePoint{});
129 tally();
130 start = {};
131 }
132
133 void rehire() {
134 assert(start == TimePoint{});
135 start = Clock::now();
136 }
137
138 template <class F>
139 auto dismissing(F f) -> invoke_result_t<F> {
140 SCOPE_EXIT {
141 rehire();
142 };
143 dismiss();
144 return f();
145 }
146
147 /**
148 * This is for use inside of if-conditions, used in BENCHMARK macros.
149 * If-conditions bypass the explicit on operator bool.
150 */
151 explicit operator bool() const {
152 return false;
153 }
154
155 /**
156 * Accumulates time spent outside benchmark.
157 */
158 static Duration timeSpent;
159
160 private:
161 void tally() {
162 auto end = Clock::now();
163 timeSpent += end - start;
164 start = end;
165 }
166
167 TimePoint start;
168};
169
170/**
171 * Adds a benchmark. Usually not called directly but instead through
172 * the macro BENCHMARK defined below. The lambda function involved
173 * must take exactly one parameter of type unsigned, and the benchmark
174 * uses it with counter semantics (iteration occurs inside the
175 * function).
176 */
177template <typename Lambda>
178typename std::enable_if<folly::is_invocable<Lambda, unsigned>::value>::type
179addBenchmark(const char* file, const char* name, Lambda&& lambda) {
180 auto execute = [=](unsigned int times) {
181 BenchmarkSuspender::timeSpent = {};
182 unsigned int niter;
183
184 // CORE MEASUREMENT STARTS
185 auto start = std::chrono::high_resolution_clock::now();
186 niter = lambda(times);
187 auto end = std::chrono::high_resolution_clock::now();
188 // CORE MEASUREMENT ENDS
189 return detail::TimeIterData{
190 (end - start) - BenchmarkSuspender::timeSpent, niter, UserCounters{}};
191 };
192
193 detail::addBenchmarkImpl(file, name, detail::BenchmarkFun(execute), false);
194}
195
196/**
197 * Adds a benchmark. Usually not called directly but instead through
198 * the macro BENCHMARK defined below. The lambda function involved
199 * must take zero parameters, and the benchmark calls it repeatedly
200 * (iteration occurs outside the function).
201 */
202template <typename Lambda>
203typename std::enable_if<folly::is_invocable<Lambda>::value>::type
204addBenchmark(const char* file, const char* name, Lambda&& lambda) {
205 addBenchmark(file, name, [=](unsigned int times) {
206 unsigned int niter = 0;
207 while (times-- > 0) {
208 niter += lambda();
209 }
210 return niter;
211 });
212}
213
214/**
215 * similar as previous two template specialization, but lambda will also take
216 * customized counters in the following two cases
217 */
218template <typename Lambda>
219typename std::enable_if<
220 folly::is_invocable<Lambda, UserCounters&, unsigned>::value>::type
221addBenchmark(const char* file, const char* name, Lambda&& lambda) {
222 auto execute = [=](unsigned int times) {
223 BenchmarkSuspender::timeSpent = {};
224 unsigned int niter;
225
226 // CORE MEASUREMENT STARTS
227 auto start = std::chrono::high_resolution_clock::now();
228 UserCounters counters;
229 niter = lambda(counters, times);
230 auto end = std::chrono::high_resolution_clock::now();
231 // CORE MEASUREMENT ENDS
232 return detail::TimeIterData{
233 (end - start) - BenchmarkSuspender::timeSpent, niter, counters};
234 };
235
236 detail::addBenchmarkImpl(
237 file,
238 name,
239 std::function<detail::TimeIterData(unsigned int)>(execute),
240 true);
241}
242
243template <typename Lambda>
244typename std::enable_if<folly::is_invocable<Lambda, UserCounters&>::value>::type
245addBenchmark(const char* file, const char* name, Lambda&& lambda) {
246 addBenchmark(file, name, [=](UserCounters& counters, unsigned int times) {
247 unsigned int niter = 0;
248 while (times-- > 0) {
249 niter += lambda(counters);
250 }
251 return niter;
252 });
253}
254
255/**
256 * Call doNotOptimizeAway(var) to ensure that var will be computed even
257 * post-optimization. Use it for variables that are computed during
258 * benchmarking but otherwise are useless. The compiler tends to do a
259 * good job at eliminating unused variables, and this function fools it
260 * into thinking var is in fact needed.
261 *
262 * Call makeUnpredictable(var) when you don't want the optimizer to use
263 * its knowledge of var to shape the following code. This is useful
264 * when constant propagation or power reduction is possible during your
265 * benchmark but not in real use cases.
266 */
267
268#ifdef _MSC_VER
269
270#pragma optimize("", off)
271
272inline void doNotOptimizeDependencySink(const void*) {}
273
274#pragma optimize("", on)
275
276template <class T>
277void doNotOptimizeAway(const T& datum) {
278 doNotOptimizeDependencySink(&datum);
279}
280
281template <typename T>
282void makeUnpredictable(T& datum) {
283 doNotOptimizeDependencySink(&datum);
284}
285
286#else
287
288namespace detail {
289template <typename T>
290struct DoNotOptimizeAwayNeedsIndirect {
291 using Decayed = typename std::decay<T>::type;
292
293 // First two constraints ensure it can be an "r" operand.
294 // std::is_pointer check is because callers seem to expect that
295 // doNotOptimizeAway(&x) is equivalent to doNotOptimizeAway(x).
296 constexpr static bool value = !folly::is_trivially_copyable<Decayed>::value ||
297 sizeof(Decayed) > sizeof(long) || std::is_pointer<Decayed>::value;
298};
299} // namespace detail
300
301template <typename T>
302auto doNotOptimizeAway(const T& datum) -> typename std::enable_if<
303 !detail::DoNotOptimizeAwayNeedsIndirect<T>::value>::type {
304 // The "r" constraint forces the compiler to make datum available
305 // in a register to the asm block, which means that it must have
306 // computed/loaded it. We use this path for things that are <=
307 // sizeof(long) (they have to fit), trivial (otherwise the compiler
308 // doesn't want to put them in a register), and not a pointer (because
309 // doNotOptimizeAway(&foo) would otherwise be a foot gun that didn't
310 // necessarily compute foo).
311 //
312 // An earlier version of this method had a more permissive input operand
313 // constraint, but that caused unnecessary variation between clang and
314 // gcc benchmarks.
315 asm volatile("" ::"r"(datum));
316}
317
318template <typename T>
319auto doNotOptimizeAway(const T& datum) -> typename std::enable_if<
320 detail::DoNotOptimizeAwayNeedsIndirect<T>::value>::type {
321 // This version of doNotOptimizeAway tells the compiler that the asm
322 // block will read datum from memory, and that in addition it might read
323 // or write from any memory location. If the memory clobber could be
324 // separated into input and output that would be preferrable.
325 asm volatile("" ::"m"(datum) : "memory");
326}
327
328template <typename T>
329auto makeUnpredictable(T& datum) -> typename std::enable_if<
330 !detail::DoNotOptimizeAwayNeedsIndirect<T>::value>::type {
331 asm volatile("" : "+r"(datum));
332}
333
334template <typename T>
335auto makeUnpredictable(T& datum) -> typename std::enable_if<
336 detail::DoNotOptimizeAwayNeedsIndirect<T>::value>::type {
337 asm volatile("" ::"m"(datum) : "memory");
338}
339
340#endif
341
342struct dynamic;
343
344void benchmarkResultsToDynamic(
345 const std::vector<detail::BenchmarkResult>& data,
346 dynamic&);
347
348void benchmarkResultsFromDynamic(
349 const dynamic&,
350 std::vector<detail::BenchmarkResult>&);
351
352void printResultComparison(
353 const std::vector<detail::BenchmarkResult>& base,
354 const std::vector<detail::BenchmarkResult>& test);
355
356} // namespace folly
357
358/**
359 * Introduces a benchmark function. Used internally, see BENCHMARK and
360 * friends below.
361 */
362
363#define BENCHMARK_IMPL(funName, stringName, rv, paramType, paramName) \
364 static void funName(paramType); \
365 static bool FB_ANONYMOUS_VARIABLE(follyBenchmarkUnused) = \
366 (::folly::addBenchmark( \
367 __FILE__, \
368 stringName, \
369 [](paramType paramName) -> unsigned { \
370 funName(paramName); \
371 return rv; \
372 }), \
373 true); \
374 static void funName(paramType paramName)
375
376#define BENCHMARK_IMPL_COUNTERS( \
377 funName, stringName, counters, rv, paramType, paramName) \
378 static void funName(UserCounters& FOLLY_PP_DETAIL_APPEND_VA_ARG(paramType)); \
379 static bool FB_ANONYMOUS_VARIABLE(follyBenchmarkUnused) = \
380 (::folly::addBenchmark( \
381 __FILE__, \
382 stringName, \
383 [](UserCounters& counters FOLLY_PP_DETAIL_APPEND_VA_ARG( \
384 paramType paramName)) -> unsigned { \
385 funName(counters FOLLY_PP_DETAIL_APPEND_VA_ARG(paramName)); \
386 return rv; \
387 }), \
388 true); \
389 static void funName(UserCounters& counters FOLLY_PP_DETAIL_APPEND_VA_ARG( \
390 paramType paramName))
391
392/**
393 * Introduces a benchmark function with support for returning the actual
394 * number of iterations. Used internally, see BENCHMARK_MULTI and friends
395 * below.
396 */
397#define BENCHMARK_MULTI_IMPL(funName, stringName, paramType, paramName) \
398 static unsigned funName(paramType); \
399 static bool FB_ANONYMOUS_VARIABLE(follyBenchmarkUnused) = \
400 (::folly::addBenchmark( \
401 __FILE__, \
402 stringName, \
403 [](paramType paramName) { return funName(paramName); }), \
404 true); \
405 static unsigned funName(paramType paramName)
406
407/**
408 * Introduces a benchmark function. Use with either one or two arguments.
409 * The first is the name of the benchmark. Use something descriptive, such
410 * as insertVectorBegin. The second argument may be missing, or could be a
411 * symbolic counter. The counter dictates how many internal iteration the
412 * benchmark does. Example:
413 *
414 * BENCHMARK(vectorPushBack) {
415 * vector<int> v;
416 * v.push_back(42);
417 * }
418 *
419 * BENCHMARK(insertVectorBegin, iters) {
420 * vector<int> v;
421 * FOR_EACH_RANGE (i, 0, iters) {
422 * v.insert(v.begin(), 42);
423 * }
424 * }
425 */
426#define BENCHMARK(name, ...) \
427 BENCHMARK_IMPL( \
428 name, \
429 FB_STRINGIZE(name), \
430 FB_ARG_2_OR_1(1, ##__VA_ARGS__), \
431 FB_ONE_OR_NONE(unsigned, ##__VA_ARGS__), \
432 __VA_ARGS__)
433
434/**
435 * Allow users to record customized counter during benchmarking,
436 * there will be one extra column showing in the output result for each counter
437 *
438 * BENCHMARK_COUNTERS(insertVectorBegin, couters, iters) {
439 * vector<int> v;
440 * FOR_EACH_RANGE (i, 0, iters) {
441 * v.insert(v.begin(), 42);
442 * }
443 * BENCHMARK_SUSPEND {
444 * counters["foo"] = 10;
445 * }
446 * }
447 */
448#define BENCHMARK_COUNTERS(name, counters, ...) \
449 BENCHMARK_IMPL_COUNTERS( \
450 name, \
451 FB_STRINGIZE(name), \
452 counters, \
453 FB_ARG_2_OR_1(1, ##__VA_ARGS__), \
454 FB_ONE_OR_NONE(unsigned, ##__VA_ARGS__), \
455 __VA_ARGS__)
456/**
457 * Like BENCHMARK above, but allows the user to return the actual
458 * number of iterations executed in the function body. This can be
459 * useful if the benchmark function doesn't know upfront how many
460 * iterations it's going to run or if it runs through a certain
461 * number of test cases, e.g.:
462 *
463 * BENCHMARK_MULTI(benchmarkSomething) {
464 * std::vector<int> testCases { 0, 1, 1, 2, 3, 5 };
465 * for (int c : testCases) {
466 * doSomething(c);
467 * }
468 * return testCases.size();
469 * }
470 */
471#define BENCHMARK_MULTI(name, ...) \
472 BENCHMARK_MULTI_IMPL( \
473 name, \
474 FB_STRINGIZE(name), \
475 FB_ONE_OR_NONE(unsigned, ##__VA_ARGS__), \
476 __VA_ARGS__)
477
478/**
479 * Defines a benchmark that passes a parameter to another one. This is
480 * common for benchmarks that need a "problem size" in addition to
481 * "number of iterations". Consider:
482 *
483 * void pushBack(uint32_t n, size_t initialSize) {
484 * vector<int> v;
485 * BENCHMARK_SUSPEND {
486 * v.resize(initialSize);
487 * }
488 * FOR_EACH_RANGE (i, 0, n) {
489 * v.push_back(i);
490 * }
491 * }
492 * BENCHMARK_PARAM(pushBack, 0)
493 * BENCHMARK_PARAM(pushBack, 1000)
494 * BENCHMARK_PARAM(pushBack, 1000000)
495 *
496 * The benchmark above estimates the speed of push_back at different
497 * initial sizes of the vector. The framework will pass 0, 1000, and
498 * 1000000 for initialSize, and the iteration count for n.
499 */
500#define BENCHMARK_PARAM(name, param) BENCHMARK_NAMED_PARAM(name, param, param)
501
502/**
503 * Same as BENCHMARK_PARAM, but allows one to return the actual number of
504 * iterations that have been run.
505 */
506#define BENCHMARK_PARAM_MULTI(name, param) \
507 BENCHMARK_NAMED_PARAM_MULTI(name, param, param)
508
509/*
510 * Like BENCHMARK_PARAM(), but allows a custom name to be specified for each
511 * parameter, rather than using the parameter value.
512 *
513 * Useful when the parameter value is not a valid token for string pasting,
514 * of when you want to specify multiple parameter arguments.
515 *
516 * For example:
517 *
518 * void addValue(uint32_t n, int64_t bucketSize, int64_t min, int64_t max) {
519 * Histogram<int64_t> hist(bucketSize, min, max);
520 * int64_t num = min;
521 * FOR_EACH_RANGE (i, 0, n) {
522 * hist.addValue(num);
523 * ++num;
524 * if (num > max) { num = min; }
525 * }
526 * }
527 *
528 * BENCHMARK_NAMED_PARAM(addValue, 0_to_100, 1, 0, 100)
529 * BENCHMARK_NAMED_PARAM(addValue, 0_to_1000, 10, 0, 1000)
530 * BENCHMARK_NAMED_PARAM(addValue, 5k_to_20k, 250, 5000, 20000)
531 */
532#define BENCHMARK_NAMED_PARAM(name, param_name, ...) \
533 BENCHMARK_IMPL( \
534 FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)), \
535 FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")", \
536 iters, \
537 unsigned, \
538 iters) { \
539 name(iters, ##__VA_ARGS__); \
540 }
541
542/**
543 * Same as BENCHMARK_NAMED_PARAM, but allows one to return the actual number
544 * of iterations that have been run.
545 */
546#define BENCHMARK_NAMED_PARAM_MULTI(name, param_name, ...) \
547 BENCHMARK_MULTI_IMPL( \
548 FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)), \
549 FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")", \
550 unsigned, \
551 iters) { \
552 return name(iters, ##__VA_ARGS__); \
553 }
554
555/**
556 * Just like BENCHMARK, but prints the time relative to a
557 * baseline. The baseline is the most recent BENCHMARK() seen in
558 * the current scope. Example:
559 *
560 * // This is the baseline
561 * BENCHMARK(insertVectorBegin, n) {
562 * vector<int> v;
563 * FOR_EACH_RANGE (i, 0, n) {
564 * v.insert(v.begin(), 42);
565 * }
566 * }
567 *
568 * BENCHMARK_RELATIVE(insertListBegin, n) {
569 * list<int> s;
570 * FOR_EACH_RANGE (i, 0, n) {
571 * s.insert(s.begin(), 42);
572 * }
573 * }
574 *
575 * Any number of relative benchmark can be associated with a
576 * baseline. Another BENCHMARK() occurrence effectively establishes a
577 * new baseline.
578 */
579#define BENCHMARK_RELATIVE(name, ...) \
580 BENCHMARK_IMPL( \
581 name, \
582 "%" FB_STRINGIZE(name), \
583 FB_ARG_2_OR_1(1, ##__VA_ARGS__), \
584 FB_ONE_OR_NONE(unsigned, ##__VA_ARGS__), \
585 __VA_ARGS__)
586
587#define BENCHMARK_COUNTERS_RELATIVE(name, counters, ...) \
588 BENCHMARK_IMPL_COUNTERS( \
589 name, \
590 "%" FB_STRINGIZE(name), \
591 counters, \
592 FB_ARG_2_OR_1(1, ##__VA_ARGS__), \
593 FB_ONE_OR_NONE(unsigned, ##__VA_ARGS__), \
594 __VA_ARGS__)
595/**
596 * Same as BENCHMARK_RELATIVE, but allows one to return the actual number
597 * of iterations that have been run.
598 */
599#define BENCHMARK_RELATIVE_MULTI(name, ...) \
600 BENCHMARK_MULTI_IMPL( \
601 name, \
602 "%" FB_STRINGIZE(name), \
603 FB_ONE_OR_NONE(unsigned, ##__VA_ARGS__), \
604 __VA_ARGS__)
605
606/**
607 * A combination of BENCHMARK_RELATIVE and BENCHMARK_PARAM.
608 */
609#define BENCHMARK_RELATIVE_PARAM(name, param) \
610 BENCHMARK_RELATIVE_NAMED_PARAM(name, param, param)
611
612/**
613 * Same as BENCHMARK_RELATIVE_PARAM, but allows one to return the actual
614 * number of iterations that have been run.
615 */
616#define BENCHMARK_RELATIVE_PARAM_MULTI(name, param) \
617 BENCHMARK_RELATIVE_NAMED_PARAM_MULTI(name, param, param)
618
619/**
620 * A combination of BENCHMARK_RELATIVE and BENCHMARK_NAMED_PARAM.
621 */
622#define BENCHMARK_RELATIVE_NAMED_PARAM(name, param_name, ...) \
623 BENCHMARK_IMPL( \
624 FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)), \
625 "%" FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")", \
626 iters, \
627 unsigned, \
628 iters) { \
629 name(iters, ##__VA_ARGS__); \
630 }
631
632/**
633 * Same as BENCHMARK_RELATIVE_NAMED_PARAM, but allows one to return the
634 * actual number of iterations that have been run.
635 */
636#define BENCHMARK_RELATIVE_NAMED_PARAM_MULTI(name, param_name, ...) \
637 BENCHMARK_MULTI_IMPL( \
638 FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)), \
639 "%" FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")", \
640 unsigned, \
641 iters) { \
642 return name(iters, ##__VA_ARGS__); \
643 }
644
645/**
646 * Draws a line of dashes.
647 */
648#define BENCHMARK_DRAW_LINE() \
649 static bool FB_ANONYMOUS_VARIABLE(follyBenchmarkUnused) = \
650 (::folly::addBenchmark(__FILE__, "-", []() -> unsigned { return 0; }), \
651 true)
652
653/**
654 * Allows execution of code that doesn't count torward the benchmark's
655 * time budget. Example:
656 *
657 * BENCHMARK_START_GROUP(insertVectorBegin, n) {
658 * vector<int> v;
659 * BENCHMARK_SUSPEND {
660 * v.reserve(n);
661 * }
662 * FOR_EACH_RANGE (i, 0, n) {
663 * v.insert(v.begin(), 42);
664 * }
665 * }
666 */
667#define BENCHMARK_SUSPEND \
668 if (auto FB_ANONYMOUS_VARIABLE(BENCHMARK_SUSPEND) = \
669 ::folly::BenchmarkSuspender()) { \
670 } else
671