1 | /* |
2 | * Copyright 2012-present Facebook, Inc. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | |
17 | // @author Andrei Alexandrescu (andrei.alexandrescu@fb.com) |
18 | |
19 | #include <folly/Benchmark.h> |
20 | |
21 | #include <algorithm> |
22 | #include <cmath> |
23 | #include <cstring> |
24 | #include <iostream> |
25 | #include <limits> |
26 | #include <map> |
27 | #include <memory> |
28 | #include <numeric> |
29 | #include <utility> |
30 | #include <vector> |
31 | |
32 | #include <boost/regex.hpp> |
33 | |
34 | #include <folly/MapUtil.h> |
35 | #include <folly/String.h> |
36 | #include <folly/container/Foreach.h> |
37 | #include <folly/json.h> |
38 | |
39 | using namespace std; |
40 | |
41 | DEFINE_bool(benchmark, false, "Run benchmarks." ); |
42 | DEFINE_bool(json, false, "Output in JSON format." ); |
43 | DEFINE_bool(json_verbose, false, "Output in verbose JSON format." ); |
44 | |
45 | DEFINE_string( |
46 | bm_regex, |
47 | "" , |
48 | "Only benchmarks whose names match this regex will be run." ); |
49 | |
50 | DEFINE_int64( |
51 | bm_min_usec, |
52 | 100, |
53 | "Minimum # of microseconds we'll accept for each benchmark." ); |
54 | |
55 | DEFINE_int32( |
56 | bm_min_iters, |
57 | 1, |
58 | "Minimum # of iterations we'll try for each benchmark." ); |
59 | |
60 | DEFINE_int64( |
61 | bm_max_iters, |
62 | 1 << 30, |
63 | "Maximum # of iterations we'll try for each benchmark." ); |
64 | |
65 | DEFINE_int32( |
66 | bm_max_secs, |
67 | 1, |
68 | "Maximum # of seconds we'll spend on each benchmark." ); |
69 | |
70 | namespace folly { |
71 | |
72 | std::chrono::high_resolution_clock::duration BenchmarkSuspender::timeSpent; |
73 | |
74 | typedef function<detail::TimeIterData(unsigned int)> BenchmarkFun; |
75 | |
76 | vector<detail::BenchmarkRegistration>& benchmarks() { |
77 | static vector<detail::BenchmarkRegistration> _benchmarks; |
78 | return _benchmarks; |
79 | } |
80 | |
81 | #define FB_FOLLY_GLOBAL_BENCHMARK_BASELINE fbFollyGlobalBenchmarkBaseline |
82 | #define FB_STRINGIZE_X2(x) FB_STRINGIZE(x) |
83 | |
84 | // Add the global baseline |
85 | BENCHMARK(FB_FOLLY_GLOBAL_BENCHMARK_BASELINE) { |
86 | #ifdef _MSC_VER |
87 | _ReadWriteBarrier(); |
88 | #else |
89 | asm volatile("" ); |
90 | #endif |
91 | } |
92 | |
93 | size_t getGlobalBenchmarkBaselineIndex() { |
94 | const char* global = FB_STRINGIZE_X2(FB_FOLLY_GLOBAL_BENCHMARK_BASELINE); |
95 | auto it = std::find_if( |
96 | benchmarks().begin(), |
97 | benchmarks().end(), |
98 | [global](const detail::BenchmarkRegistration& v) { |
99 | return v.name == global; |
100 | }); |
101 | CHECK(it != benchmarks().end()); |
102 | return size_t(std::distance(benchmarks().begin(), it)); |
103 | } |
104 | |
105 | #undef FB_STRINGIZE_X2 |
106 | #undef FB_FOLLY_GLOBAL_BENCHMARK_BASELINE |
107 | |
108 | void detail::addBenchmarkImpl( |
109 | const char* file, |
110 | const char* name, |
111 | BenchmarkFun fun, |
112 | bool useCounter) { |
113 | benchmarks().push_back({file, name, std::move(fun), useCounter}); |
114 | } |
115 | |
116 | static std::pair<double, UserCounters> runBenchmarkGetNSPerIteration( |
117 | const BenchmarkFun& fun, |
118 | const double globalBaseline) { |
119 | using std::chrono::duration_cast; |
120 | using std::chrono::high_resolution_clock; |
121 | using std::chrono::microseconds; |
122 | using std::chrono::nanoseconds; |
123 | using std::chrono::seconds; |
124 | |
125 | // They key here is accuracy; too low numbers means the accuracy was |
126 | // coarse. We up the ante until we get to at least minNanoseconds |
127 | // timings. |
128 | static_assert( |
129 | std::is_same<high_resolution_clock::duration, nanoseconds>::value, |
130 | "High resolution clock must be nanosecond resolution." ); |
131 | // We choose a minimum minimum (sic) of 100,000 nanoseconds, but if |
132 | // the clock resolution is worse than that, it will be larger. In |
133 | // essence we're aiming at making the quantization noise 0.01%. |
134 | static const auto minNanoseconds = std::max<nanoseconds>( |
135 | nanoseconds(100000), microseconds(FLAGS_bm_min_usec)); |
136 | |
137 | // We do measurements in several epochs and take the minimum, to |
138 | // account for jitter. |
139 | static const unsigned int epochs = 1000; |
140 | // We establish a total time budget as we don't want a measurement |
141 | // to take too long. This will curtail the number of actual epochs. |
142 | const auto timeBudget = seconds(FLAGS_bm_max_secs); |
143 | auto global = high_resolution_clock::now(); |
144 | |
145 | std::vector<std::pair<double, UserCounters>> epochResults(epochs); |
146 | size_t actualEpochs = 0; |
147 | |
148 | for (; actualEpochs < epochs; ++actualEpochs) { |
149 | const auto maxIters = uint32_t(FLAGS_bm_max_iters); |
150 | for (auto n = uint32_t(FLAGS_bm_min_iters); n < maxIters; n *= 2) { |
151 | detail::TimeIterData timeIterData = fun(static_cast<unsigned int>(n)); |
152 | if (timeIterData.duration < minNanoseconds) { |
153 | continue; |
154 | } |
155 | // We got an accurate enough timing, done. But only save if |
156 | // smaller than the current result. |
157 | auto nsecs = duration_cast<nanoseconds>(timeIterData.duration); |
158 | epochResults[actualEpochs] = std::make_pair( |
159 | max(0.0, double(nsecs.count()) / timeIterData.niter - globalBaseline), |
160 | std::move(timeIterData.userCounters)); |
161 | // Done with the current epoch, we got a meaningful timing. |
162 | break; |
163 | } |
164 | auto now = high_resolution_clock::now(); |
165 | if (now - global >= timeBudget) { |
166 | // No more time budget available. |
167 | ++actualEpochs; |
168 | break; |
169 | } |
170 | } |
171 | |
172 | // Current state of the art: get the minimum. After some |
173 | // experimentation, it seems taking the minimum is the best. |
174 | auto iter = min_element( |
175 | epochResults.begin(), |
176 | epochResults.begin() + actualEpochs, |
177 | [](const auto& a, const auto& b) { return a.first < b.first; }); |
178 | |
179 | // If the benchmark was basically drowned in baseline noise, it's |
180 | // possible it became negative. |
181 | return std::make_pair(max(0.0, iter->first), iter->second); |
182 | } |
183 | |
184 | struct ScaleInfo { |
185 | double boundary; |
186 | const char* suffix; |
187 | }; |
188 | |
189 | static const ScaleInfo kTimeSuffixes[]{ |
190 | {365.25 * 24 * 3600, "years" }, |
191 | {24 * 3600, "days" }, |
192 | {3600, "hr" }, |
193 | {60, "min" }, |
194 | {1, "s" }, |
195 | {1E-3, "ms" }, |
196 | {1E-6, "us" }, |
197 | {1E-9, "ns" }, |
198 | {1E-12, "ps" }, |
199 | {1E-15, "fs" }, |
200 | {0, nullptr}, |
201 | }; |
202 | |
203 | static const ScaleInfo kMetricSuffixes[]{ |
204 | {1E24, "Y" }, // yotta |
205 | {1E21, "Z" }, // zetta |
206 | {1E18, "X" }, // "exa" written with suffix 'X' so as to not create |
207 | // confusion with scientific notation |
208 | {1E15, "P" }, // peta |
209 | {1E12, "T" }, // terra |
210 | {1E9, "G" }, // giga |
211 | {1E6, "M" }, // mega |
212 | {1E3, "K" }, // kilo |
213 | {1, "" }, |
214 | {1E-3, "m" }, // milli |
215 | {1E-6, "u" }, // micro |
216 | {1E-9, "n" }, // nano |
217 | {1E-12, "p" }, // pico |
218 | {1E-15, "f" }, // femto |
219 | {1E-18, "a" }, // atto |
220 | {1E-21, "z" }, // zepto |
221 | {1E-24, "y" }, // yocto |
222 | {0, nullptr}, |
223 | }; |
224 | |
225 | static string |
226 | humanReadable(double n, unsigned int decimals, const ScaleInfo* scales) { |
227 | if (std::isinf(n) || std::isnan(n)) { |
228 | return folly::to<string>(n); |
229 | } |
230 | |
231 | const double absValue = fabs(n); |
232 | const ScaleInfo* scale = scales; |
233 | while (absValue < scale[0].boundary && scale[1].suffix != nullptr) { |
234 | ++scale; |
235 | } |
236 | |
237 | const double scaledValue = n / scale->boundary; |
238 | return stringPrintf("%.*f%s" , decimals, scaledValue, scale->suffix); |
239 | } |
240 | |
241 | static string readableTime(double n, unsigned int decimals) { |
242 | return humanReadable(n, decimals, kTimeSuffixes); |
243 | } |
244 | |
245 | static string metricReadable(double n, unsigned int decimals) { |
246 | return humanReadable(n, decimals, kMetricSuffixes); |
247 | } |
248 | |
249 | namespace { |
250 | class BenchmarkResultsPrinter { |
251 | public: |
252 | BenchmarkResultsPrinter() = default; |
253 | explicit BenchmarkResultsPrinter(std::set<std::string> counterNames) |
254 | : counterNames_(std::move(counterNames)), |
255 | namesLength_{std::accumulate( |
256 | counterNames_.begin(), |
257 | counterNames_.end(), |
258 | size_t{0}, |
259 | [](size_t acc, auto&& name) { return acc + 2 + name.length(); })} {} |
260 | |
261 | static constexpr unsigned int columns{76}; |
262 | void separator(char pad) { |
263 | puts(string(columns + namesLength_, pad).c_str()); |
264 | } |
265 | |
266 | void (const string& file) { |
267 | separator('='); |
268 | printf("%-*srelative time/iter iters/s" , columns - 28, file.c_str()); |
269 | for (auto const& name : counterNames_) { |
270 | printf(" %s" , name.c_str()); |
271 | } |
272 | printf("\n" ); |
273 | separator('='); |
274 | } |
275 | |
276 | void print(const vector<detail::BenchmarkResult>& data) { |
277 | for (auto& datum : data) { |
278 | auto file = datum.file; |
279 | if (file != lastFile_) { |
280 | // New file starting |
281 | header(file); |
282 | lastFile_ = file; |
283 | } |
284 | |
285 | string s = datum.name; |
286 | if (s == "-" ) { |
287 | separator('-'); |
288 | continue; |
289 | } |
290 | bool useBaseline /* = void */; |
291 | if (s[0] == '%') { |
292 | s.erase(0, 1); |
293 | useBaseline = true; |
294 | } else { |
295 | baselineNsPerIter_ = datum.timeInNs; |
296 | useBaseline = false; |
297 | } |
298 | s.resize(columns - 29, ' '); |
299 | auto nsPerIter = datum.timeInNs; |
300 | auto secPerIter = nsPerIter / 1E9; |
301 | auto itersPerSec = (secPerIter == 0) |
302 | ? std::numeric_limits<double>::infinity() |
303 | : (1 / secPerIter); |
304 | if (!useBaseline) { |
305 | // Print without baseline |
306 | printf( |
307 | "%*s %9s %7s" , |
308 | static_cast<int>(s.size()), |
309 | s.c_str(), |
310 | readableTime(secPerIter, 2).c_str(), |
311 | metricReadable(itersPerSec, 2).c_str()); |
312 | } else { |
313 | // Print with baseline |
314 | auto rel = baselineNsPerIter_ / nsPerIter * 100.0; |
315 | printf( |
316 | "%*s %7.2f%% %9s %7s" , |
317 | static_cast<int>(s.size()), |
318 | s.c_str(), |
319 | rel, |
320 | readableTime(secPerIter, 2).c_str(), |
321 | metricReadable(itersPerSec, 2).c_str()); |
322 | } |
323 | for (auto const& name : counterNames_) { |
324 | if (auto ptr = folly::get_ptr(datum.counters, name)) { |
325 | printf(" %-*d" , int(name.length()), *ptr); |
326 | } else { |
327 | printf(" %-*s" , int(name.length()), "NaN" ); |
328 | } |
329 | } |
330 | printf("\n" ); |
331 | } |
332 | } |
333 | |
334 | private: |
335 | std::set<std::string> counterNames_; |
336 | size_t namesLength_{0}; |
337 | double baselineNsPerIter_{numeric_limits<double>::max()}; |
338 | string lastFile_; |
339 | }; |
340 | } // namespace |
341 | |
342 | static void printBenchmarkResultsAsJson( |
343 | const vector<detail::BenchmarkResult>& data) { |
344 | dynamic d = dynamic::object; |
345 | for (auto& datum : data) { |
346 | d[datum.name] = datum.timeInNs * 1000.; |
347 | } |
348 | |
349 | printf("%s\n" , toPrettyJson(d).c_str()); |
350 | } |
351 | |
352 | static void printBenchmarkResultsAsVerboseJson( |
353 | const vector<detail::BenchmarkResult>& data) { |
354 | dynamic d; |
355 | benchmarkResultsToDynamic(data, d); |
356 | printf("%s\n" , toPrettyJson(d).c_str()); |
357 | } |
358 | |
359 | static void printBenchmarkResults(const vector<detail::BenchmarkResult>& data) { |
360 | if (FLAGS_json_verbose) { |
361 | printBenchmarkResultsAsVerboseJson(data); |
362 | return; |
363 | } else if (FLAGS_json) { |
364 | printBenchmarkResultsAsJson(data); |
365 | return; |
366 | } |
367 | |
368 | CHECK(FLAGS_json_verbose || FLAGS_json) << "Cannot print benchmark results" ; |
369 | } |
370 | |
371 | void benchmarkResultsToDynamic( |
372 | const vector<detail::BenchmarkResult>& data, |
373 | dynamic& out) { |
374 | out = dynamic::array; |
375 | for (auto& datum : data) { |
376 | out.push_back(dynamic::array(datum.file, datum.name, datum.timeInNs)); |
377 | } |
378 | } |
379 | |
380 | void benchmarkResultsFromDynamic( |
381 | const dynamic& d, |
382 | vector<detail::BenchmarkResult>& results) { |
383 | for (auto& datum : d) { |
384 | results.push_back({datum[0].asString(), |
385 | datum[1].asString(), |
386 | datum[2].asDouble(), |
387 | UserCounters{}}); |
388 | } |
389 | } |
390 | |
391 | static pair<StringPiece, StringPiece> resultKey( |
392 | const detail::BenchmarkResult& result) { |
393 | return pair<StringPiece, StringPiece>(result.file, result.name); |
394 | } |
395 | |
396 | void printResultComparison( |
397 | const vector<detail::BenchmarkResult>& base, |
398 | const vector<detail::BenchmarkResult>& test) { |
399 | map<pair<StringPiece, StringPiece>, double> baselines; |
400 | |
401 | for (auto& baseResult : base) { |
402 | baselines[resultKey(baseResult)] = baseResult.timeInNs; |
403 | } |
404 | // |
405 | // Width available |
406 | static const unsigned int columns = 76; |
407 | |
408 | // Compute the longest benchmark name |
409 | size_t longestName = 0; |
410 | for (auto& datum : test) { |
411 | longestName = max(longestName, datum.name.size()); |
412 | } |
413 | |
414 | // Print a horizontal rule |
415 | auto separator = [&](char pad) { puts(string(columns, pad).c_str()); }; |
416 | |
417 | // Print header for a file |
418 | auto = [&](const string& file) { |
419 | separator('='); |
420 | printf("%-*srelative time/iter iters/s" , columns - 28, file.c_str()); |
421 | separator('='); |
422 | }; |
423 | |
424 | string lastFile; |
425 | |
426 | for (auto& datum : test) { |
427 | folly::Optional<double> baseline = |
428 | folly::get_optional(baselines, resultKey(datum)); |
429 | auto file = datum.file; |
430 | if (file != lastFile) { |
431 | // New file starting |
432 | header(file); |
433 | lastFile = file; |
434 | } |
435 | |
436 | string s = datum.name; |
437 | if (s == "-" ) { |
438 | separator('-'); |
439 | continue; |
440 | } |
441 | if (s[0] == '%') { |
442 | s.erase(0, 1); |
443 | } |
444 | s.resize(columns - 29, ' '); |
445 | auto nsPerIter = datum.timeInNs; |
446 | auto secPerIter = nsPerIter / 1E9; |
447 | auto itersPerSec = (secPerIter == 0) |
448 | ? std::numeric_limits<double>::infinity() |
449 | : (1 / secPerIter); |
450 | if (!baseline) { |
451 | // Print without baseline |
452 | printf( |
453 | "%*s %9s %7s\n" , |
454 | static_cast<int>(s.size()), |
455 | s.c_str(), |
456 | readableTime(secPerIter, 2).c_str(), |
457 | metricReadable(itersPerSec, 2).c_str()); |
458 | } else { |
459 | // Print with baseline |
460 | auto rel = *baseline / nsPerIter * 100.0; |
461 | printf( |
462 | "%*s %7.2f%% %9s %7s\n" , |
463 | static_cast<int>(s.size()), |
464 | s.c_str(), |
465 | rel, |
466 | readableTime(secPerIter, 2).c_str(), |
467 | metricReadable(itersPerSec, 2).c_str()); |
468 | } |
469 | } |
470 | separator('='); |
471 | } |
472 | |
473 | void checkRunMode() { |
474 | if (folly::kIsDebug || folly::kIsSanitize) { |
475 | std::cerr << "WARNING: Benchmark running " |
476 | << (folly::kIsDebug ? "in DEBUG mode" : "with SANITIZERS" ) |
477 | << std::endl; |
478 | } |
479 | } |
480 | |
481 | void runBenchmarks() { |
482 | CHECK(!benchmarks().empty()); |
483 | |
484 | checkRunMode(); |
485 | |
486 | vector<detail::BenchmarkResult> results; |
487 | results.reserve(benchmarks().size() - 1); |
488 | |
489 | std::unique_ptr<boost::regex> bmRegex; |
490 | if (!FLAGS_bm_regex.empty()) { |
491 | bmRegex = std::make_unique<boost::regex>(FLAGS_bm_regex); |
492 | } |
493 | |
494 | // PLEASE KEEP QUIET. MEASUREMENTS IN PROGRESS. |
495 | |
496 | size_t baselineIndex = getGlobalBenchmarkBaselineIndex(); |
497 | |
498 | auto const globalBaseline = |
499 | runBenchmarkGetNSPerIteration(benchmarks()[baselineIndex].func, 0); |
500 | |
501 | bool useCounter = |
502 | std::any_of(benchmarks().begin(), benchmarks().end(), [](const auto& bm) { |
503 | return bm.useCounter; |
504 | }); |
505 | BenchmarkResultsPrinter printer; |
506 | std::set<std::string> counterNames; |
507 | FOR_EACH_RANGE (i, 0, benchmarks().size()) { |
508 | if (i == baselineIndex) { |
509 | continue; |
510 | } |
511 | std::pair<double, UserCounters> elapsed; |
512 | auto& bm = benchmarks()[i]; |
513 | if (bm.name != "-" ) { // skip separators |
514 | if (bmRegex && !boost::regex_search(bm.name, *bmRegex)) { |
515 | continue; |
516 | } |
517 | elapsed = runBenchmarkGetNSPerIteration(bm.func, globalBaseline.first); |
518 | } |
519 | |
520 | // if customized user counters is used, it cannot print the result in real |
521 | // time as it needs to run all cases first to know the complete set of |
522 | // counters have been used, then the header can be printed out properly |
523 | if (!FLAGS_json_verbose && !FLAGS_json && !useCounter) { |
524 | printer.print({{bm.file, bm.name, elapsed.first, elapsed.second}}); |
525 | } else { |
526 | results.push_back({bm.file, bm.name, elapsed.first, elapsed.second}); |
527 | } |
528 | |
529 | // get all counter names |
530 | for (auto const& kv : elapsed.second) { |
531 | counterNames.insert(kv.first); |
532 | } |
533 | } |
534 | |
535 | // PLEASE MAKE NOISE. MEASUREMENTS DONE. |
536 | if (FLAGS_json_verbose || FLAGS_json) { |
537 | printBenchmarkResults(results); |
538 | } else { |
539 | printer = BenchmarkResultsPrinter{std::move(counterNames)}; |
540 | printer.print(results); |
541 | printer.separator('='); |
542 | } |
543 | |
544 | checkRunMode(); |
545 | } |
546 | |
547 | } // namespace folly |
548 | |