/*
 * Copyright 2012-present Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// @author Andrei Alexandrescu (andrei.alexandrescu@fb.com)

#include <folly/Benchmark.h>

#include <algorithm>
#include <cmath>
#include <cstring>
#include <iostream>
#include <limits>
#include <map>
#include <memory>
#include <numeric>
#include <utility>
#include <vector>

#include <boost/regex.hpp>

#include <folly/MapUtil.h>
#include <folly/String.h>
#include <folly/container/Foreach.h>
#include <folly/json.h>

using namespace std;

DEFINE_bool(benchmark, false, "Run benchmarks.");
DEFINE_bool(json, false, "Output in JSON format.");
DEFINE_bool(json_verbose, false, "Output in verbose JSON format.");

DEFINE_string(
    bm_regex,
    "",
    "Only benchmarks whose names match this regex will be run.");

DEFINE_int64(
    bm_min_usec,
    100,
    "Minimum # of microseconds we'll accept for each benchmark.");

DEFINE_int32(
    bm_min_iters,
    1,
    "Minimum # of iterations we'll try for each benchmark.");

DEFINE_int64(
    bm_max_iters,
    1 << 30,
    "Maximum # of iterations we'll try for each benchmark.");

DEFINE_int32(
    bm_max_secs,
    1,
    "Maximum # of seconds we'll spend on each benchmark.");

namespace folly {

std::chrono::high_resolution_clock::duration BenchmarkSuspender::timeSpent;

typedef function<detail::TimeIterData(unsigned int)> BenchmarkFun;

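// Global registry of benchmark registrations. A function-local static is
// used so the vector is constructed before any BENCHMARK() registration
// runs, regardless of translation-unit initialization order.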
vector<detail::BenchmarkRegistration>& benchmarks() {
  static vector<detail::BenchmarkRegistration> _benchmarks;
  return _benchmarks;
}

#define FB_FOLLY_GLOBAL_BENCHMARK_BASELINE fbFollyGlobalBenchmarkBaseline
#define FB_STRINGIZE_X2(x) FB_STRINGIZE(x)

// Add the global baseline
BENCHMARK(FB_FOLLY_GLOBAL_BENCHMARK_BASELINE) {
#ifdef _MSC_VER
  _ReadWriteBarrier();
#else
  asm volatile("");
#endif
}

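// Locates the global baseline benchmark in the registry; its measured cost
// per iteration is subtracted from every other benchmark's timing.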
size_t getGlobalBenchmarkBaselineIndex() {
  const char* global = FB_STRINGIZE_X2(FB_FOLLY_GLOBAL_BENCHMARK_BASELINE);
  auto it = std::find_if(
      benchmarks().begin(),
      benchmarks().end(),
      [global](const detail::BenchmarkRegistration& v) {
        return v.name == global;
      });
  CHECK(it != benchmarks().end());
  return size_t(std::distance(benchmarks().begin(), it));
}

#undef FB_STRINGIZE_X2
#undef FB_FOLLY_GLOBAL_BENCHMARK_BASELINE

void detail::addBenchmarkImpl(
    const char* file,
    const char* name,
    BenchmarkFun fun,
    bool useCounter) {
  benchmarks().push_back({file, name, std::move(fun), useCounter});
}

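// Runs `fun` with geometrically increasing iteration counts until a single
// measurement lasts at least minNanoseconds, repeats that over several
// epochs within the time budget, and returns the smallest ns-per-iteration
// observed (net of the global baseline) along with that epoch's counters.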
static std::pair<double, UserCounters> runBenchmarkGetNSPerIteration(
    const BenchmarkFun& fun,
    const double globalBaseline) {
  using std::chrono::duration_cast;
  using std::chrono::high_resolution_clock;
  using std::chrono::microseconds;
  using std::chrono::nanoseconds;
  using std::chrono::seconds;

  // The key here is accuracy; too-low numbers mean the accuracy was
  // coarse. We up the ante until we get to at least minNanoseconds
  // timings.
  static_assert(
      std::is_same<high_resolution_clock::duration, nanoseconds>::value,
      "High resolution clock must be nanosecond resolution.");
  // We choose a minimum minimum (sic) of 100,000 nanoseconds, but if
  // the clock resolution is worse than that, it will be larger. In
  // essence we're aiming at making the quantization noise 0.01%.
  static const auto minNanoseconds = std::max<nanoseconds>(
      nanoseconds(100000), microseconds(FLAGS_bm_min_usec));

  // We do measurements in several epochs and take the minimum, to
  // account for jitter.
  static const unsigned int epochs = 1000;
  // We establish a total time budget as we don't want a measurement
  // to take too long. This will curtail the number of actual epochs.
  const auto timeBudget = seconds(FLAGS_bm_max_secs);
  auto global = high_resolution_clock::now();

  std::vector<std::pair<double, UserCounters>> epochResults(epochs);
  size_t actualEpochs = 0;

  for (; actualEpochs < epochs; ++actualEpochs) {
    const auto maxIters = uint32_t(FLAGS_bm_max_iters);
    for (auto n = uint32_t(FLAGS_bm_min_iters); n < maxIters; n *= 2) {
      detail::TimeIterData timeIterData = fun(static_cast<unsigned int>(n));
      if (timeIterData.duration < minNanoseconds) {
        continue;
      }
      // We got an accurate enough timing; record this epoch's result,
      // net of the per-iteration baseline cost.
      auto nsecs = duration_cast<nanoseconds>(timeIterData.duration);
      epochResults[actualEpochs] = std::make_pair(
          max(0.0, double(nsecs.count()) / timeIterData.niter - globalBaseline),
          std::move(timeIterData.userCounters));
      // Done with the current epoch, we got a meaningful timing.
      break;
    }
    auto now = high_resolution_clock::now();
    if (now - global >= timeBudget) {
      // No more time budget available.
      ++actualEpochs;
      break;
    }
  }

  // Current state of the art: get the minimum. After some
  // experimentation, it seems taking the minimum is the best.
  auto iter = min_element(
      epochResults.begin(),
      epochResults.begin() + actualEpochs,
      [](const auto& a, const auto& b) { return a.first < b.first; });

  // If the benchmark was basically drowned in baseline noise, it's
  // possible it became negative.
  return std::make_pair(max(0.0, iter->first), iter->second);
}

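// A magnitude boundary paired with the suffix printed for values at or
// above it; suffix tables are terminated by a {0, nullptr} sentinel.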
struct ScaleInfo {
  double boundary;
  const char* suffix;
};

static const ScaleInfo kTimeSuffixes[]{
    {365.25 * 24 * 3600, "years"},
    {24 * 3600, "days"},
    {3600, "hr"},
    {60, "min"},
    {1, "s"},
    {1E-3, "ms"},
    {1E-6, "us"},
    {1E-9, "ns"},
    {1E-12, "ps"},
    {1E-15, "fs"},
    {0, nullptr},
};

static const ScaleInfo kMetricSuffixes[]{
    {1E24, "Y"}, // yotta
    {1E21, "Z"}, // zetta
    {1E18, "X"}, // "exa" written with suffix 'X' so as to not create
                 // confusion with scientific notation
    {1E15, "P"}, // peta
    {1E12, "T"}, // tera
    {1E9, "G"}, // giga
    {1E6, "M"}, // mega
    {1E3, "K"}, // kilo
    {1, ""},
    {1E-3, "m"}, // milli
    {1E-6, "u"}, // micro
    {1E-9, "n"}, // nano
    {1E-12, "p"}, // pico
    {1E-15, "f"}, // femto
    {1E-18, "a"}, // atto
    {1E-21, "z"}, // zepto
    {1E-24, "y"}, // yocto
    {0, nullptr},
};

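// Formats n with `decimals` digits after the point, scaled by the largest
// boundary in `scales` that does not exceed |n| (or by the smallest
// available suffix when |n| is below every boundary).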
static string
humanReadable(double n, unsigned int decimals, const ScaleInfo* scales) {
  if (std::isinf(n) || std::isnan(n)) {
    return folly::to<string>(n);
  }

  const double absValue = fabs(n);
  const ScaleInfo* scale = scales;
  while (absValue < scale[0].boundary && scale[1].suffix != nullptr) {
    ++scale;
  }

  const double scaledValue = n / scale->boundary;
  return stringPrintf("%.*f%s", decimals, scaledValue, scale->suffix);
}

static string readableTime(double n, unsigned int decimals) {
  return humanReadable(n, decimals, kTimeSuffixes);
}

static string metricReadable(double n, unsigned int decimals) {
  return humanReadable(n, decimals, kMetricSuffixes);
}

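// Streams results to stdout as a fixed-width table, appending one extra
// column for every user counter name supplied at construction time.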
namespace {
class BenchmarkResultsPrinter {
 public:
  BenchmarkResultsPrinter() = default;
  explicit BenchmarkResultsPrinter(std::set<std::string> counterNames)
      : counterNames_(std::move(counterNames)),
        namesLength_{std::accumulate(
            counterNames_.begin(),
            counterNames_.end(),
            size_t{0},
            [](size_t acc, auto&& name) { return acc + 2 + name.length(); })} {}

  static constexpr unsigned int columns{76};
  void separator(char pad) {
    puts(string(columns + namesLength_, pad).c_str());
  }

  void header(const string& file) {
    separator('=');
    printf("%-*srelative  time/iter  iters/s", columns - 28, file.c_str());
    for (auto const& name : counterNames_) {
      printf("  %s", name.c_str());
    }
    printf("\n");
    separator('=');
  }

  void print(const vector<detail::BenchmarkResult>& data) {
    for (auto& datum : data) {
      auto file = datum.file;
      if (file != lastFile_) {
        // New file starting
        header(file);
        lastFile_ = file;
      }

      string s = datum.name;
      if (s == "-") {
        separator('-');
        continue;
      }
      bool useBaseline /* = void */;
      if (s[0] == '%') {
        s.erase(0, 1);
        useBaseline = true;
      } else {
        baselineNsPerIter_ = datum.timeInNs;
        useBaseline = false;
      }
      s.resize(columns - 29, ' ');
      auto nsPerIter = datum.timeInNs;
      auto secPerIter = nsPerIter / 1E9;
      auto itersPerSec = (secPerIter == 0)
          ? std::numeric_limits<double>::infinity()
          : (1 / secPerIter);
      if (!useBaseline) {
        // Print without baseline
        printf(
            "%*s           %9s  %7s",
            static_cast<int>(s.size()),
            s.c_str(),
            readableTime(secPerIter, 2).c_str(),
            metricReadable(itersPerSec, 2).c_str());
      } else {
        // Print with baseline
        auto rel = baselineNsPerIter_ / nsPerIter * 100.0;
        printf(
            "%*s %7.2f%%  %9s  %7s",
            static_cast<int>(s.size()),
            s.c_str(),
            rel,
            readableTime(secPerIter, 2).c_str(),
            metricReadable(itersPerSec, 2).c_str());
      }
      for (auto const& name : counterNames_) {
        if (auto ptr = folly::get_ptr(datum.counters, name)) {
          printf("  %-*d", int(name.length()), *ptr);
        } else {
          printf("  %-*s", int(name.length()), "NaN");
        }
      }
      printf("\n");
    }
  }

 private:
  std::set<std::string> counterNames_;
  size_t namesLength_{0};
  double baselineNsPerIter_{numeric_limits<double>::max()};
  string lastFile_;
};
} // namespace

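// Emits a flat JSON object mapping each benchmark name to its time per
// iteration in picoseconds (timeInNs * 1000).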
static void printBenchmarkResultsAsJson(
    const vector<detail::BenchmarkResult>& data) {
  dynamic d = dynamic::object;
  for (auto& datum : data) {
    d[datum.name] = datum.timeInNs * 1000.;
  }

  printf("%s\n", toPrettyJson(d).c_str());
}

static void printBenchmarkResultsAsVerboseJson(
    const vector<detail::BenchmarkResult>& data) {
  dynamic d;
  benchmarkResultsToDynamic(data, d);
  printf("%s\n", toPrettyJson(d).c_str());
}

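// Dispatches to one of the JSON printers above; calling this without
// --json or --json_verbose set is an error (enforced by the CHECK).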
static void printBenchmarkResults(
    const vector<detail::BenchmarkResult>& data) {
  if (FLAGS_json_verbose) {
    printBenchmarkResultsAsVerboseJson(data);
    return;
  } else if (FLAGS_json) {
    printBenchmarkResultsAsJson(data);
    return;
  }

  CHECK(FLAGS_json_verbose || FLAGS_json) << "Cannot print benchmark results";
}

void benchmarkResultsToDynamic(
    const vector<detail::BenchmarkResult>& data,
    dynamic& out) {
  out = dynamic::array;
  for (auto& datum : data) {
    out.push_back(dynamic::array(datum.file, datum.name, datum.timeInNs));
  }
}

void benchmarkResultsFromDynamic(
    const dynamic& d,
    vector<detail::BenchmarkResult>& results) {
  for (auto& datum : d) {
    results.push_back({datum[0].asString(),
                       datum[1].asString(),
                       datum[2].asDouble(),
                       UserCounters{}});
  }
}

static pair<StringPiece, StringPiece> resultKey(
    const detail::BenchmarkResult& result) {
  return pair<StringPiece, StringPiece>(result.file, result.name);
}

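// Prints the benchmarks in `test` in the usual table format, using the
// entry in `base` with the same (file, name) key, when present, as the
// baseline for the relative column.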
void printResultComparison(
    const vector<detail::BenchmarkResult>& base,
    const vector<detail::BenchmarkResult>& test) {
  map<pair<StringPiece, StringPiece>, double> baselines;

  for (auto& baseResult : base) {
    baselines[resultKey(baseResult)] = baseResult.timeInNs;
  }
  //
  // Width available
  static const unsigned int columns = 76;

  // Compute the longest benchmark name
  size_t longestName = 0;
  for (auto& datum : test) {
    longestName = max(longestName, datum.name.size());
  }

  // Print a horizontal rule
  auto separator = [&](char pad) { puts(string(columns, pad).c_str()); };

  // Print header for a file
  auto header = [&](const string& file) {
    separator('=');
    printf("%-*srelative  time/iter  iters/s\n", columns - 28, file.c_str());
    separator('=');
  };

  string lastFile;

  for (auto& datum : test) {
    folly::Optional<double> baseline =
        folly::get_optional(baselines, resultKey(datum));
    auto file = datum.file;
    if (file != lastFile) {
      // New file starting
      header(file);
      lastFile = file;
    }

    string s = datum.name;
    if (s == "-") {
      separator('-');
      continue;
    }
    if (s[0] == '%') {
      s.erase(0, 1);
    }
    s.resize(columns - 29, ' ');
    auto nsPerIter = datum.timeInNs;
    auto secPerIter = nsPerIter / 1E9;
    auto itersPerSec = (secPerIter == 0)
        ? std::numeric_limits<double>::infinity()
        : (1 / secPerIter);
    if (!baseline) {
      // Print without baseline
      printf(
          "%*s           %9s  %7s\n",
          static_cast<int>(s.size()),
          s.c_str(),
          readableTime(secPerIter, 2).c_str(),
          metricReadable(itersPerSec, 2).c_str());
    } else {
      // Print with baseline
      auto rel = *baseline / nsPerIter * 100.0;
      printf(
          "%*s %7.2f%%  %9s  %7s\n",
          static_cast<int>(s.size()),
          s.c_str(),
          rel,
          readableTime(secPerIter, 2).c_str(),
          metricReadable(itersPerSec, 2).c_str());
    }
  }
  separator('=');
}

void checkRunMode() {
  if (folly::kIsDebug || folly::kIsSanitize) {
    std::cerr << "WARNING: Benchmark running "
              << (folly::kIsDebug ? "in DEBUG mode" : "with SANITIZERS")
              << std::endl;
  }
}

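// Runs every registered benchmark: measures the global baseline first, then
// times each benchmark whose name matches --bm_regex (the baseline itself
// and "-" separator entries are not timed), and prints the results as a
// table or as JSON depending on the output flags.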
void runBenchmarks() {
  CHECK(!benchmarks().empty());

  checkRunMode();

  vector<detail::BenchmarkResult> results;
  results.reserve(benchmarks().size() - 1);

  std::unique_ptr<boost::regex> bmRegex;
  if (!FLAGS_bm_regex.empty()) {
    bmRegex = std::make_unique<boost::regex>(FLAGS_bm_regex);
  }

  // PLEASE KEEP QUIET. MEASUREMENTS IN PROGRESS.

  size_t baselineIndex = getGlobalBenchmarkBaselineIndex();

  auto const globalBaseline =
      runBenchmarkGetNSPerIteration(benchmarks()[baselineIndex].func, 0);

  bool useCounter =
      std::any_of(benchmarks().begin(), benchmarks().end(), [](const auto& bm) {
        return bm.useCounter;
      });
  BenchmarkResultsPrinter printer;
  std::set<std::string> counterNames;
  FOR_EACH_RANGE (i, 0, benchmarks().size()) {
    if (i == baselineIndex) {
      continue;
    }
    std::pair<double, UserCounters> elapsed;
    auto& bm = benchmarks()[i];
    if (bm.name != "-") { // skip separators
      if (bmRegex && !boost::regex_search(bm.name, *bmRegex)) {
        continue;
      }
      elapsed = runBenchmarkGetNSPerIteration(bm.func, globalBaseline.first);
    }

    // If custom user counters are used, results cannot be printed in real
    // time: every benchmark must run first so the complete set of counter
    // names is known before the header can be printed properly.
    if (!FLAGS_json_verbose && !FLAGS_json && !useCounter) {
      printer.print({{bm.file, bm.name, elapsed.first, elapsed.second}});
    } else {
      results.push_back({bm.file, bm.name, elapsed.first, elapsed.second});
    }

    // get all counter names
    for (auto const& kv : elapsed.second) {
      counterNames.insert(kv.first);
    }
  }

  // PLEASE MAKE NOISE. MEASUREMENTS DONE.
  if (FLAGS_json_verbose || FLAGS_json) {
    printBenchmarkResults(results);
  } else {
    printer = BenchmarkResultsPrinter{std::move(counterNames)};
    printer.print(results);
    printer.separator('=');
  }

  checkRunMode();
}

} // namespace folly