Benchmark.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495
  1. /*
  2. * Copyright 2012-present Facebook, Inc.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. // @author Andrei Alexandrescu (andrei.alexandrescu@fb.com)
  17. #include <folly/Benchmark.h>
  18. #include <algorithm>
  19. #include <cmath>
  20. #include <cstring>
  21. #include <iostream>
  22. #include <limits>
  23. #include <map>
  24. #include <memory>
  25. #include <utility>
  26. #include <vector>
  27. #include <boost/regex.hpp>
  28. #include <folly/MapUtil.h>
  29. #include <folly/String.h>
  30. #include <folly/container/Foreach.h>
  31. #include <folly/json.h>
  32. using namespace std;
  33. DEFINE_bool(benchmark, false, "Run benchmarks.");
  34. DEFINE_bool(json, false, "Output in JSON format.");
  35. DEFINE_bool(json_verbose, false, "Output in verbose JSON format.");
  36. DEFINE_string(
  37. bm_regex,
  38. "",
  39. "Only benchmarks whose names match this regex will be run.");
  40. DEFINE_int64(
  41. bm_min_usec,
  42. 100,
  43. "Minimum # of microseconds we'll accept for each benchmark.");
  44. DEFINE_int32(
  45. bm_min_iters,
  46. 1,
  47. "Minimum # of iterations we'll try for each benchmark.");
  48. DEFINE_int64(
  49. bm_max_iters,
  50. 1 << 30,
  51. "Maximum # of iterations we'll try for each benchmark.");
  52. DEFINE_int32(
  53. bm_max_secs,
  54. 1,
  55. "Maximum # of seconds we'll spend on each benchmark.");
  56. namespace folly {
  57. std::chrono::high_resolution_clock::duration BenchmarkSuspender::timeSpent;
  58. typedef function<detail::TimeIterPair(unsigned int)> BenchmarkFun;
  59. vector<detail::BenchmarkRegistration>& benchmarks() {
  60. static vector<detail::BenchmarkRegistration> _benchmarks;
  61. return _benchmarks;
  62. }
  63. #define FB_FOLLY_GLOBAL_BENCHMARK_BASELINE fbFollyGlobalBenchmarkBaseline
  64. #define FB_STRINGIZE_X2(x) FB_STRINGIZE(x)
  65. // Add the global baseline
  66. BENCHMARK(FB_FOLLY_GLOBAL_BENCHMARK_BASELINE) {
  67. #ifdef _MSC_VER
  68. _ReadWriteBarrier();
  69. #else
  70. asm volatile("");
  71. #endif
  72. }
  73. size_t getGlobalBenchmarkBaselineIndex() {
  74. const char* global = FB_STRINGIZE_X2(FB_FOLLY_GLOBAL_BENCHMARK_BASELINE);
  75. auto it = std::find_if(
  76. benchmarks().begin(),
  77. benchmarks().end(),
  78. [global](const detail::BenchmarkRegistration& v) {
  79. return v.name == global;
  80. });
  81. CHECK(it != benchmarks().end());
  82. return size_t(std::distance(benchmarks().begin(), it));
  83. }
  84. #undef FB_STRINGIZE_X2
  85. #undef FB_FOLLY_GLOBAL_BENCHMARK_BASELINE
  86. void detail::addBenchmarkImpl(
  87. const char* file,
  88. const char* name,
  89. BenchmarkFun fun) {
  90. benchmarks().push_back({file, name, std::move(fun)});
  91. }
  92. /**
  93. * Given a bunch of benchmark samples, estimate the actual run time.
  94. */
  95. static double estimateTime(double* begin, double* end) {
  96. assert(begin < end);
  97. // Current state of the art: get the minimum. After some
  98. // experimentation, it seems taking the minimum is the best.
  99. return *min_element(begin, end);
  100. }
  101. static double runBenchmarkGetNSPerIteration(
  102. const BenchmarkFun& fun,
  103. const double globalBaseline) {
  104. using std::chrono::duration_cast;
  105. using std::chrono::high_resolution_clock;
  106. using std::chrono::microseconds;
  107. using std::chrono::nanoseconds;
  108. using std::chrono::seconds;
  109. // They key here is accuracy; too low numbers means the accuracy was
  110. // coarse. We up the ante until we get to at least minNanoseconds
  111. // timings.
  112. static_assert(
  113. std::is_same<high_resolution_clock::duration, nanoseconds>::value,
  114. "High resolution clock must be nanosecond resolution.");
  115. // We choose a minimum minimum (sic) of 100,000 nanoseconds, but if
  116. // the clock resolution is worse than that, it will be larger. In
  117. // essence we're aiming at making the quantization noise 0.01%.
  118. static const auto minNanoseconds = std::max<nanoseconds>(
  119. nanoseconds(100000), microseconds(FLAGS_bm_min_usec));
  120. // We do measurements in several epochs and take the minimum, to
  121. // account for jitter.
  122. static const unsigned int epochs = 1000;
  123. // We establish a total time budget as we don't want a measurement
  124. // to take too long. This will curtail the number of actual epochs.
  125. const auto timeBudget = seconds(FLAGS_bm_max_secs);
  126. auto global = high_resolution_clock::now();
  127. double epochResults[epochs] = {0};
  128. size_t actualEpochs = 0;
  129. for (; actualEpochs < epochs; ++actualEpochs) {
  130. const auto maxIters = uint32_t(FLAGS_bm_max_iters);
  131. for (auto n = uint32_t(FLAGS_bm_min_iters); n < maxIters; n *= 2) {
  132. auto const nsecsAndIter = fun(static_cast<unsigned int>(n));
  133. if (nsecsAndIter.first < minNanoseconds) {
  134. continue;
  135. }
  136. // We got an accurate enough timing, done. But only save if
  137. // smaller than the current result.
  138. auto nsecs = duration_cast<nanoseconds>(nsecsAndIter.first).count();
  139. epochResults[actualEpochs] =
  140. max(0.0, double(nsecs) / nsecsAndIter.second - globalBaseline);
  141. // Done with the current epoch, we got a meaningful timing.
  142. break;
  143. }
  144. auto now = high_resolution_clock::now();
  145. if (now - global >= timeBudget) {
  146. // No more time budget available.
  147. ++actualEpochs;
  148. break;
  149. }
  150. }
  151. // If the benchmark was basically drowned in baseline noise, it's
  152. // possible it became negative.
  153. return max(0.0, estimateTime(epochResults, epochResults + actualEpochs));
  154. }
  155. struct ScaleInfo {
  156. double boundary;
  157. const char* suffix;
  158. };
  159. static const ScaleInfo kTimeSuffixes[]{
  160. {365.25 * 24 * 3600, "years"},
  161. {24 * 3600, "days"},
  162. {3600, "hr"},
  163. {60, "min"},
  164. {1, "s"},
  165. {1E-3, "ms"},
  166. {1E-6, "us"},
  167. {1E-9, "ns"},
  168. {1E-12, "ps"},
  169. {1E-15, "fs"},
  170. {0, nullptr},
  171. };
  172. static const ScaleInfo kMetricSuffixes[]{
  173. {1E24, "Y"}, // yotta
  174. {1E21, "Z"}, // zetta
  175. {1E18, "X"}, // "exa" written with suffix 'X' so as to not create
  176. // confusion with scientific notation
  177. {1E15, "P"}, // peta
  178. {1E12, "T"}, // terra
  179. {1E9, "G"}, // giga
  180. {1E6, "M"}, // mega
  181. {1E3, "K"}, // kilo
  182. {1, ""},
  183. {1E-3, "m"}, // milli
  184. {1E-6, "u"}, // micro
  185. {1E-9, "n"}, // nano
  186. {1E-12, "p"}, // pico
  187. {1E-15, "f"}, // femto
  188. {1E-18, "a"}, // atto
  189. {1E-21, "z"}, // zepto
  190. {1E-24, "y"}, // yocto
  191. {0, nullptr},
  192. };
  193. static string
  194. humanReadable(double n, unsigned int decimals, const ScaleInfo* scales) {
  195. if (std::isinf(n) || std::isnan(n)) {
  196. return folly::to<string>(n);
  197. }
  198. const double absValue = fabs(n);
  199. const ScaleInfo* scale = scales;
  200. while (absValue < scale[0].boundary && scale[1].suffix != nullptr) {
  201. ++scale;
  202. }
  203. const double scaledValue = n / scale->boundary;
  204. return stringPrintf("%.*f%s", decimals, scaledValue, scale->suffix);
  205. }
  206. static string readableTime(double n, unsigned int decimals) {
  207. return humanReadable(n, decimals, kTimeSuffixes);
  208. }
  209. static string metricReadable(double n, unsigned int decimals) {
  210. return humanReadable(n, decimals, kMetricSuffixes);
  211. }
  212. namespace {
  213. class BenchmarkResultsPrinter {
  214. public:
  215. static constexpr unsigned int columns{76};
  216. double baselineNsPerIter{numeric_limits<double>::max()};
  217. string lastFile;
  218. void separator(char pad) {
  219. puts(string(columns, pad).c_str());
  220. }
  221. void header(const string& file) {
  222. separator('=');
  223. printf("%-*srelative time/iter iters/s\n", columns - 28, file.c_str());
  224. separator('=');
  225. }
  226. void print(const vector<detail::BenchmarkResult>& data) {
  227. for (auto& datum : data) {
  228. auto file = datum.file;
  229. if (file != lastFile) {
  230. // New file starting
  231. header(file);
  232. lastFile = file;
  233. }
  234. string s = datum.name;
  235. if (s == "-") {
  236. separator('-');
  237. continue;
  238. }
  239. bool useBaseline /* = void */;
  240. if (s[0] == '%') {
  241. s.erase(0, 1);
  242. useBaseline = true;
  243. } else {
  244. baselineNsPerIter = datum.timeInNs;
  245. useBaseline = false;
  246. }
  247. s.resize(columns - 29, ' ');
  248. auto nsPerIter = datum.timeInNs;
  249. auto secPerIter = nsPerIter / 1E9;
  250. auto itersPerSec = (secPerIter == 0)
  251. ? std::numeric_limits<double>::infinity()
  252. : (1 / secPerIter);
  253. if (!useBaseline) {
  254. // Print without baseline
  255. printf(
  256. "%*s %9s %7s\n",
  257. static_cast<int>(s.size()),
  258. s.c_str(),
  259. readableTime(secPerIter, 2).c_str(),
  260. metricReadable(itersPerSec, 2).c_str());
  261. } else {
  262. // Print with baseline
  263. auto rel = baselineNsPerIter / nsPerIter * 100.0;
  264. printf(
  265. "%*s %7.2f%% %9s %7s\n",
  266. static_cast<int>(s.size()),
  267. s.c_str(),
  268. rel,
  269. readableTime(secPerIter, 2).c_str(),
  270. metricReadable(itersPerSec, 2).c_str());
  271. }
  272. }
  273. }
  274. };
  275. } // namespace
  276. static void printBenchmarkResultsAsJson(
  277. const vector<detail::BenchmarkResult>& data) {
  278. dynamic d = dynamic::object;
  279. for (auto& datum : data) {
  280. d[datum.name] = datum.timeInNs * 1000.;
  281. }
  282. printf("%s\n", toPrettyJson(d).c_str());
  283. }
  284. static void printBenchmarkResultsAsVerboseJson(
  285. const vector<detail::BenchmarkResult>& data) {
  286. dynamic d;
  287. benchmarkResultsToDynamic(data, d);
  288. printf("%s\n", toPrettyJson(d).c_str());
  289. }
  290. static void printBenchmarkResults(const vector<detail::BenchmarkResult>& data) {
  291. if (FLAGS_json_verbose) {
  292. printBenchmarkResultsAsVerboseJson(data);
  293. return;
  294. } else if (FLAGS_json) {
  295. printBenchmarkResultsAsJson(data);
  296. return;
  297. }
  298. CHECK(FLAGS_json_verbose || FLAGS_json) << "Cannot print benchmark results";
  299. }
  300. void benchmarkResultsToDynamic(
  301. const vector<detail::BenchmarkResult>& data,
  302. dynamic& out) {
  303. out = dynamic::array;
  304. for (auto& datum : data) {
  305. out.push_back(dynamic::array(datum.file, datum.name, datum.timeInNs));
  306. }
  307. }
  308. void benchmarkResultsFromDynamic(
  309. const dynamic& d,
  310. vector<detail::BenchmarkResult>& results) {
  311. for (auto& datum : d) {
  312. results.push_back(
  313. {datum[0].asString(), datum[1].asString(), datum[2].asDouble()});
  314. }
  315. }
  316. static pair<StringPiece, StringPiece> resultKey(
  317. const detail::BenchmarkResult& result) {
  318. return pair<StringPiece, StringPiece>(result.file, result.name);
  319. }
  320. void printResultComparison(
  321. const vector<detail::BenchmarkResult>& base,
  322. const vector<detail::BenchmarkResult>& test) {
  323. map<pair<StringPiece, StringPiece>, double> baselines;
  324. for (auto& baseResult : base) {
  325. baselines[resultKey(baseResult)] = baseResult.timeInNs;
  326. }
  327. //
  328. // Width available
  329. static const unsigned int columns = 76;
  330. // Compute the longest benchmark name
  331. size_t longestName = 0;
  332. for (auto& datum : test) {
  333. longestName = max(longestName, datum.name.size());
  334. }
  335. // Print a horizontal rule
  336. auto separator = [&](char pad) { puts(string(columns, pad).c_str()); };
  337. // Print header for a file
  338. auto header = [&](const string& file) {
  339. separator('=');
  340. printf("%-*srelative time/iter iters/s\n", columns - 28, file.c_str());
  341. separator('=');
  342. };
  343. string lastFile;
  344. for (auto& datum : test) {
  345. folly::Optional<double> baseline =
  346. folly::get_optional(baselines, resultKey(datum));
  347. auto file = datum.file;
  348. if (file != lastFile) {
  349. // New file starting
  350. header(file);
  351. lastFile = file;
  352. }
  353. string s = datum.name;
  354. if (s == "-") {
  355. separator('-');
  356. continue;
  357. }
  358. if (s[0] == '%') {
  359. s.erase(0, 1);
  360. }
  361. s.resize(columns - 29, ' ');
  362. auto nsPerIter = datum.timeInNs;
  363. auto secPerIter = nsPerIter / 1E9;
  364. auto itersPerSec = (secPerIter == 0)
  365. ? std::numeric_limits<double>::infinity()
  366. : (1 / secPerIter);
  367. if (!baseline) {
  368. // Print without baseline
  369. printf(
  370. "%*s %9s %7s\n",
  371. static_cast<int>(s.size()),
  372. s.c_str(),
  373. readableTime(secPerIter, 2).c_str(),
  374. metricReadable(itersPerSec, 2).c_str());
  375. } else {
  376. // Print with baseline
  377. auto rel = *baseline / nsPerIter * 100.0;
  378. printf(
  379. "%*s %7.2f%% %9s %7s\n",
  380. static_cast<int>(s.size()),
  381. s.c_str(),
  382. rel,
  383. readableTime(secPerIter, 2).c_str(),
  384. metricReadable(itersPerSec, 2).c_str());
  385. }
  386. }
  387. separator('=');
  388. }
  389. void runBenchmarks() {
  390. CHECK(!benchmarks().empty());
  391. vector<detail::BenchmarkResult> results;
  392. results.reserve(benchmarks().size() - 1);
  393. std::unique_ptr<boost::regex> bmRegex;
  394. if (!FLAGS_bm_regex.empty()) {
  395. bmRegex = std::make_unique<boost::regex>(FLAGS_bm_regex);
  396. }
  397. // PLEASE KEEP QUIET. MEASUREMENTS IN PROGRESS.
  398. size_t baselineIndex = getGlobalBenchmarkBaselineIndex();
  399. auto const globalBaseline =
  400. runBenchmarkGetNSPerIteration(benchmarks()[baselineIndex].func, 0);
  401. auto printer = BenchmarkResultsPrinter{};
  402. FOR_EACH_RANGE (i, 0, benchmarks().size()) {
  403. if (i == baselineIndex) {
  404. continue;
  405. }
  406. double elapsed = 0.0;
  407. auto& bm = benchmarks()[i];
  408. if (bm.name != "-") { // skip separators
  409. if (bmRegex && !boost::regex_search(bm.name, *bmRegex)) {
  410. continue;
  411. }
  412. elapsed = runBenchmarkGetNSPerIteration(bm.func, globalBaseline);
  413. }
  414. if (!FLAGS_json_verbose && !FLAGS_json) {
  415. printer.print({{bm.file, bm.name, elapsed}});
  416. } else {
  417. results.push_back({bm.file, bm.name, elapsed});
  418. }
  419. }
  420. // PLEASE MAKE NOISE. MEASUREMENTS DONE.
  421. if (FLAGS_json_verbose || FLAGS_json) {
  422. printBenchmarkResults(results);
  423. } else {
  424. printer.separator('=');
  425. }
  426. }
  427. } // namespace folly