/*
 * Copyright 2016-present Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <algorithm>
#include <atomic>
#include <chrono>
#include <cmath>
#include <condition_variable>
#include <cstdio>
#include <cstdlib>
#include <mutex>
#include <numeric>
#include <thread>
#include <vector>

#include <google/base/spinlock.h>

#include <folly/Benchmark.h>
#include <folly/SharedMutex.h>
#include <folly/synchronization/DistributedMutex.h>
#include <folly/synchronization/SmallLocks.h>
  27. /* "Work cycle" is just an additional nop loop iteration.
  28. * A smaller number of work cyles will result in more contention,
  29. * which is what we're trying to measure. The relative ratio of
  30. * locked to unlocked cycles will simulate how big critical sections
  31. * are in production code
  32. */
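// As a concrete example of that ratio (using the default flag values defined
// just below): work=100 and unlocked_work=1000 means each loop iteration
// spends roughly 100 / (100 + 1000) ~= 9% of its burn cycles holding the
// lock, i.e. a fairly small critical section. Raising --work relative to
// --unlocked_work simulates coarser-grained locking.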
DEFINE_int32(work, 100, "Number of work cycles");
DEFINE_int32(unlocked_work, 1000, "Number of unlocked work cycles");
DEFINE_int32(
    threads,
    std::thread::hardware_concurrency(),
    "Number of threads for fairness test");

static void burn(size_t n) {
  for (size_t i = 0; i < n; ++i) {
    folly::doNotOptimizeAway(i);
  }
}

namespace {

template <typename Mutex>
std::unique_lock<Mutex> lock(Mutex& mutex) {
  return std::unique_lock<Mutex>{mutex};
}
template <typename Mutex, typename Other>
void unlock(Mutex&, Other) {}
auto lock(folly::DistributedMutex& mutex) {
  return mutex.lock();
}
template <typename State>
void unlock(folly::DistributedMutex& mutex, State state) {
  mutex.unlock(std::move(state));
}
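// The overloads above give every lock type one uniform lock()/unlock
// interface. For ordinary Lockable types, lock() returns a std::unique_lock,
// and the generic unlock() takes that unique_lock *by value*: moving the
// state into the call destroys it when unlock() returns, which is what
// actually releases the mutex. folly::DistributedMutex instead hands back a
// state proxy from lock() that must be moved into unlock(). A minimal caller
// sketch (criticalSection is illustrative, not part of the benchmark):
//
//   template <typename Mutex>
//   void criticalSection(Mutex& mtx) {
//     auto state = lock(mtx);        // unique_lock, or DistributedMutex proxy
//     // ... work while holding the lock ...
//     unlock(mtx, std::move(state)); // releases in both cases
//   }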
struct SimpleBarrier {
  explicit SimpleBarrier(int count) : count_(count) {}

  void wait() {
    // we spin for a bit to try and get the kernel to schedule threads on
    // different cores
    for (auto i = 0; i < 100000; ++i) {
      folly::doNotOptimizeAway(i);
    }
    num_.fetch_add(1);
    while (num_.load() != count_) {
    }
  }

 private:
  std::atomic<int> num_{0};
  const int count_;
};
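// Note that SimpleBarrier is single-use (num_ is never reset), so each run
// constructs a fresh barrier. Waiters busy-wait on num_ rather than blocking
// so that no thread needs a trip through the scheduler once the last
// participant arrives.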
} // namespace

template <typename Lock>
class InitLock {
  Lock lock_;

 public:
  InitLock() {
    lock_.init();
  }
  void lock() {
    lock_.lock();
  }
  void unlock() {
    lock_.unlock();
  }
};
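// folly's small locks (MicroSpinLock, PicoSpinLock, MicroLock) are POD-style
// types that require an explicit init() call rather than a constructor, so
// that they can live in unions or zero-initialized storage. InitLock wraps
// them with a default constructor so the benchmarks can instantiate them the
// same way as std::mutex.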
class GoogleSpinLockAdapter {
 public:
  void lock() {
    lock_.Lock();
  }
  void unlock() {
    lock_.Unlock();
  }

 private:
  SpinLock lock_;
};
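// Adapts gperftools' SpinLock, which spells its operations Lock()/Unlock(),
// to the BasicLockable-style lock()/unlock() expected by std::unique_lock and
// the generic lock() helper above.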
template <typename Lock>
static void runContended(size_t numOps, size_t numThreads) {
  folly::BenchmarkSuspender braces;
  size_t totalthreads = std::thread::hardware_concurrency();
  if (totalthreads < numThreads) {
    totalthreads = numThreads;
  }
  size_t threadgroups = totalthreads / numThreads;
  struct lockstruct {
    char padding1[128];
    Lock mutex;
    char padding2[128];
    long value = 1;
  };
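  // The 128-byte pads above keep each lock and its counter away from the
  // cache lines of neighboring lockstructs (128 bytes also covers the
  // adjacent-cacheline prefetcher on common x86 CPUs), so thread groups
  // working on different locks do not false-share.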
  auto locks =
      (struct lockstruct*)calloc(threadgroups, sizeof(struct lockstruct));

  char padding3[128];
  (void)padding3;
  std::vector<std::thread> threads(totalthreads);

  SimpleBarrier runbarrier(totalthreads + 1);

  for (size_t t = 0; t < totalthreads; ++t) {
    threads[t] = std::thread([&, t] {
      lockstruct* mutex = &locks[t % threadgroups];
      runbarrier.wait();
      for (size_t op = 0; op < numOps; op += 1) {
        auto state = lock(mutex->mutex);
        burn(FLAGS_work);
        mutex->value++;
        unlock(mutex->mutex, std::move(state));
        burn(FLAGS_unlocked_work);
      }
    });
  }
  runbarrier.wait();
  braces.dismissing([&] {
    for (auto& thr : threads) {
      thr.join();
    }
  });
}
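// Threads are assigned to locks round-robin (t % threadgroups), so each lock
// is contended by roughly numThreads threads while the machine as a whole
// runs hardware_concurrency threads; that is how the n-thread benchmarks keep
// per-lock contention fixed on machines with more cores. Note also that the
// lockstructs are never formally constructed: the code appears to rely on
// calloc's zero-fill being a valid unlocked state for every lock type used
// here.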
template <typename Lock>
static void runFairness() {
  size_t numThreads = FLAGS_threads;
  size_t totalthreads = std::thread::hardware_concurrency();
  if (totalthreads < numThreads) {
    totalthreads = numThreads;
  }
  long threadgroups = totalthreads / numThreads;
  struct lockstruct {
    char padding1[128];
    Lock lock;
  };

  auto locks =
      (struct lockstruct*)calloc(threadgroups, sizeof(struct lockstruct));

  char padding3[64];
  (void)padding3;
  std::vector<std::thread> threads(totalthreads);

  std::atomic<bool> stop{false};

  std::mutex rlock;
  std::vector<long> results;
  std::vector<std::chrono::microseconds> maxes;

  std::vector<std::chrono::microseconds> aqTime;
  std::vector<unsigned long> aqTimeSq;

  SimpleBarrier runbarrier(totalthreads + 1);

  for (size_t t = 0; t < totalthreads; ++t) {
    threads[t] = std::thread([&, t] {
      lockstruct* mutex = &locks[t % threadgroups];
      long value = 0;
      std::chrono::microseconds max(0);
      std::chrono::microseconds time(0);
      unsigned long timeSq(0);
      runbarrier.wait();
      while (!stop) {
        std::chrono::steady_clock::time_point prelock =
            std::chrono::steady_clock::now();
        auto state = lock(mutex->lock);
        std::chrono::steady_clock::time_point postlock =
            std::chrono::steady_clock::now();
        auto diff = std::chrono::duration_cast<std::chrono::microseconds>(
            postlock - prelock);
        time += diff;
        timeSq += diff.count() * diff.count();
        if (diff > max) {
          max = diff;
        }
        burn(FLAGS_work);
        value++;
        unlock(mutex->lock, std::move(state));
        burn(FLAGS_unlocked_work);
      }
      {
        std::lock_guard<std::mutex> g(rlock);
        results.push_back(value);
        maxes.push_back(max);
        aqTime.push_back(time);
        aqTimeSq.push_back(timeSq);
      }
    });
  }

  runbarrier.wait();
  /* sleep override */
  std::this_thread::sleep_for(std::chrono::seconds(2));
  stop = true;

  for (auto& thr : threads) {
    thr.join();
  }
  // Calculate some stats
  unsigned long sum = std::accumulate(results.begin(), results.end(), 0UL);
  double m = static_cast<double>(sum) / results.size();

  double accum = 0.0;
  std::for_each(results.begin(), results.end(), [&](const double d) {
    accum += (d - m) * (d - m);
  });
  double stdev = std::sqrt(accum / (results.size() - 1));

  std::chrono::microseconds mx = *std::max_element(maxes.begin(), maxes.end());
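  // The aggregate lock-acquisition stats below are computed from the
  // per-thread sums: with n == sum acquisitions, Sx == agAqTime and
  // Sxx == agAqTimeSq, the sample variance is
  //
  //   variance = (n * Sxx - Sx * Sx) / (n * (n - 1))   [us^2]
  //
  // which is the expression evaluated (in integer arithmetic) below.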
  std::chrono::microseconds agAqTime = std::accumulate(
      aqTime.begin(), aqTime.end(), std::chrono::microseconds(0));
  // accumulate in unsigned long (0UL) to avoid overflowing an int accumulator
  unsigned long agAqTimeSq =
      std::accumulate(aqTimeSq.begin(), aqTimeSq.end(), 0UL);
  std::chrono::microseconds mean = agAqTime / sum;
  double variance = (sum * agAqTimeSq - (agAqTime.count() * agAqTime.count())) /
      sum / (sum - 1);
  double stddev2 = std::sqrt(variance);

  printf("Sum: %lu Mean: %.0f stddev: %.0f\n", sum, m, stdev);
  printf(
      "Lock time stats in us: mean %li stddev %.0f max %li\n",
      mean.count(),
      stddev2,
      mx.count());
}
template <typename Mutex>
void runUncontended(std::size_t iters) {
  auto&& mutex = Mutex{};
  for (auto i = std::size_t{0}; i < iters; ++i) {
    auto state = lock(mutex);
    unlock(mutex, std::move(state));
  }
}
BENCHMARK(StdMutexUncontendedBenchmark, iters) {
  runUncontended<std::mutex>(iters);
}

BENCHMARK(GoogleSpinUncontendedBenchmark, iters) {
  runUncontended<GoogleSpinLockAdapter>(iters);
}

BENCHMARK(MicroSpinLockUncontendedBenchmark, iters) {
  runUncontended<InitLock<folly::MicroSpinLock>>(iters);
}

BENCHMARK(PicoSpinLockUncontendedBenchmark, iters) {
  runUncontended<InitLock<folly::PicoSpinLock<std::uint16_t>>>(iters);
}

BENCHMARK(MicroLockUncontendedBenchmark, iters) {
  runUncontended<InitLock<folly::MicroLock>>(iters);
}

BENCHMARK(SharedMutexUncontendedBenchmark, iters) {
  runUncontended<folly::SharedMutex>(iters);
}

BENCHMARK(DistributedMutexUncontendedBenchmark, iters) {
  runUncontended<folly::DistributedMutex>(iters);
}

BENCHMARK(AtomicFetchAddUncontendedBenchmark, iters) {
  auto&& atomic = std::atomic<uint64_t>{0};
  while (iters--) {
    folly::doNotOptimizeAway(atomic.fetch_add(1));
  }
}
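// A single uncontended atomic fetch_add is a useful floor for the numbers
// above: acquiring even the cheapest lock requires at least one atomic
// read-modify-write, so no mutex's uncontended acquire/release pair should
// come in under this benchmark.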
struct VirtualBase {
  virtual void foo() = 0;
  virtual ~VirtualBase() {}
};

struct VirtualImpl : VirtualBase {
  void foo() override { /* noop */
  }
  ~VirtualImpl() override {}
};

#ifndef __clang__
__attribute__((noinline, noclone)) VirtualBase* makeVirtual() {
  return new VirtualImpl();
}

BENCHMARK(VirtualFunctionCall, iters) {
  VirtualBase* vb = makeVirtual();
  while (iters--) {
    vb->foo();
  }
  delete vb;
}
#endif
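// The virtual call benchmark gives another familiar baseline: the cost of an
// indirect call through a vtable. makeVirtual is noinline/noclone so the
// compiler cannot see the dynamic type and devirtualize the call; noclone is
// a GCC-specific attribute, which is presumably why this block is compiled
// out under clang.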
BENCHMARK_DRAW_LINE();

#define BENCH_BASE(...) FB_VA_GLUE(BENCHMARK_NAMED_PARAM, (__VA_ARGS__))
#define BENCH_REL(...) FB_VA_GLUE(BENCHMARK_RELATIVE_NAMED_PARAM, (__VA_ARGS__))
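// BENCH_BASE registers the benchmark that acts as the 100% baseline for its
// group; each BENCH_REL that follows is reported as a percentage of that
// baseline, where more than 100% means faster than the baseline. In every
// group below, std_mutex is the baseline.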
static void std_mutex(size_t numOps, size_t numThreads) {
  runContended<std::mutex>(numOps, numThreads);
}
static void google_spin(size_t numOps, size_t numThreads) {
  runContended<GoogleSpinLockAdapter>(numOps, numThreads);
}
static void folly_microspin(size_t numOps, size_t numThreads) {
  runContended<InitLock<folly::MicroSpinLock>>(numOps, numThreads);
}
static void folly_picospin(size_t numOps, size_t numThreads) {
  runContended<InitLock<folly::PicoSpinLock<uint16_t>>>(numOps, numThreads);
}
static void folly_microlock(size_t numOps, size_t numThreads) {
  runContended<folly::MicroLock>(numOps, numThreads);
}
static void folly_sharedmutex(size_t numOps, size_t numThreads) {
  runContended<folly::SharedMutex>(numOps, numThreads);
}
static void folly_distributedmutex(size_t numOps, size_t numThreads) {
  runContended<folly::DistributedMutex>(numOps, numThreads);
}

BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex, 1thread, 1)
BENCH_REL(google_spin, 1thread, 1)
BENCH_REL(folly_microspin, 1thread, 1)
BENCH_REL(folly_picospin, 1thread, 1)
BENCH_REL(folly_microlock, 1thread, 1)
BENCH_REL(folly_sharedmutex, 1thread, 1)
BENCH_REL(folly_distributedmutex, 1thread, 1)
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex, 2thread, 2)
BENCH_REL(google_spin, 2thread, 2)
BENCH_REL(folly_microspin, 2thread, 2)
BENCH_REL(folly_picospin, 2thread, 2)
BENCH_REL(folly_microlock, 2thread, 2)
BENCH_REL(folly_sharedmutex, 2thread, 2)
BENCH_REL(folly_distributedmutex, 2thread, 2)
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex, 4thread, 4)
BENCH_REL(google_spin, 4thread, 4)
BENCH_REL(folly_microspin, 4thread, 4)
BENCH_REL(folly_picospin, 4thread, 4)
BENCH_REL(folly_microlock, 4thread, 4)
BENCH_REL(folly_sharedmutex, 4thread, 4)
BENCH_REL(folly_distributedmutex, 4thread, 4)
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex, 8thread, 8)
BENCH_REL(google_spin, 8thread, 8)
BENCH_REL(folly_microspin, 8thread, 8)
BENCH_REL(folly_picospin, 8thread, 8)
BENCH_REL(folly_microlock, 8thread, 8)
BENCH_REL(folly_sharedmutex, 8thread, 8)
BENCH_REL(folly_distributedmutex, 8thread, 8)
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex, 16thread, 16)
BENCH_REL(google_spin, 16thread, 16)
BENCH_REL(folly_microspin, 16thread, 16)
BENCH_REL(folly_picospin, 16thread, 16)
BENCH_REL(folly_microlock, 16thread, 16)
BENCH_REL(folly_sharedmutex, 16thread, 16)
BENCH_REL(folly_distributedmutex, 16thread, 16)
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex, 32thread, 32)
BENCH_REL(google_spin, 32thread, 32)
BENCH_REL(folly_microspin, 32thread, 32)
BENCH_REL(folly_picospin, 32thread, 32)
BENCH_REL(folly_microlock, 32thread, 32)
BENCH_REL(folly_sharedmutex, 32thread, 32)
BENCH_REL(folly_distributedmutex, 32thread, 32)
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex, 64thread, 64)
BENCH_REL(google_spin, 64thread, 64)
BENCH_REL(folly_microspin, 64thread, 64)
BENCH_REL(folly_picospin, 64thread, 64)
BENCH_REL(folly_microlock, 64thread, 64)
BENCH_REL(folly_sharedmutex, 64thread, 64)
BENCH_REL(folly_distributedmutex, 64thread, 64)
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex, 128thread, 128)
BENCH_REL(google_spin, 128thread, 128)
BENCH_REL(folly_microspin, 128thread, 128)
BENCH_REL(folly_picospin, 128thread, 128)
BENCH_REL(folly_microlock, 128thread, 128)
BENCH_REL(folly_sharedmutex, 128thread, 128)
BENCH_REL(folly_distributedmutex, 128thread, 128)
#define FairnessTest(type) \
  {                        \
    printf(#type ": \n");  \
    runFairness<type>();   \
  }

int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);

  FairnessTest(std::mutex);
  FairnessTest(GoogleSpinLockAdapter);
  FairnessTest(InitLock<folly::MicroSpinLock>);
  FairnessTest(InitLock<folly::PicoSpinLock<uint16_t>>);
  FairnessTest(InitLock<folly::MicroLock>);
  FairnessTest(folly::SharedMutex);
  FairnessTest(folly::DistributedMutex);

  folly::runBenchmarks();

  return 0;
}
/*
./small_locks_benchmark --bm_min_iters=100000
Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz

std::mutex:
Sum: 3645010 Mean: 65089 stddev: 841
Lock time stats in us: mean 16 stddev 1178 max 21361
GoogleSpinLockAdapter:
Sum: 4329140 Mean: 77306 stddev: 2338
Lock time stats in us: mean 10 stddev 16 max 19860
InitLock<folly::MicroSpinLock>:
Sum: 3513882 Mean: 62747 stddev: 27713
Lock time stats in us: mean 31 stddev 1222 max 211624
InitLock<folly::PicoSpinLock<uint16_t>>:
Sum: 2182472 Mean: 38972 stddev: 41789
Lock time stats in us: mean 49 stddev 1967 max 228875
InitLock<folly::MicroLock>:
Sum: 1868601 Mean: 33367 stddev: 4836
Lock time stats in us: mean 48 stddev 2298 max 12235
folly::SharedMutex:
Sum: 2037742 Mean: 36388 stddev: 18204
Lock time stats in us: mean 53 stddev 2107 max 132469
folly::DistributedMutex:
Sum: 6793764 Mean: 121317 stddev: 20791
Lock time stats in us: mean 15 stddev 8 max 55696

============================================================================
folly/synchronization/test/SmallLocksBenchmark.cpp  relative  time/iter  iters/s
============================================================================
StdMutexUncontendedBenchmark                             16.73ns   59.77M
GoogleSpinUncontendedBenchmark                           11.26ns   88.80M
MicroSpinLockUncontendedBenchmark                        10.06ns   99.44M
PicoSpinLockUncontendedBenchmark                         11.25ns   88.89M
MicroLockUncontendedBenchmark                            19.20ns   52.09M
SharedMutexUncontendedBenchmark                          19.45ns   51.40M
DistributedMutexUncontendedBenchmark                     17.02ns   58.75M
AtomicFetchAddUncontendedBenchmark                        5.47ns  182.91M
----------------------------------------------------------------------------
----------------------------------------------------------------------------
std_mutex(1thread)                                      802.21ns    1.25M
google_spin(1thread)                          109.81%   730.52ns    1.37M
folly_microspin(1thread)                      119.16%   673.22ns    1.49M
folly_picospin(1thread)                       119.02%   673.99ns    1.48M
folly_microlock(1thread)                      131.67%   609.28ns    1.64M
folly_sharedmutex(1thread)                    118.41%   677.46ns    1.48M
folly_distributedmutex(1thread)               100.27%   800.02ns    1.25M
----------------------------------------------------------------------------
std_mutex(2thread)                                        1.30us  769.21K
google_spin(2thread)                          129.59%     1.00us  996.85K
folly_microspin(2thread)                      158.13%   822.13ns    1.22M
folly_picospin(2thread)                       150.43%   864.23ns    1.16M
folly_microlock(2thread)                      144.94%   896.92ns    1.11M
folly_sharedmutex(2thread)                    120.36%     1.08us  925.83K
folly_distributedmutex(2thread)               112.98%     1.15us  869.08K
----------------------------------------------------------------------------
std_mutex(4thread)                                        2.36us  424.08K
google_spin(4thread)                          120.20%     1.96us  509.75K
folly_microspin(4thread)                      109.07%     2.16us  462.53K
folly_picospin(4thread)                       113.37%     2.08us  480.78K
folly_microlock(4thread)                       83.88%     2.81us  355.71K
folly_sharedmutex(4thread)                     90.47%     2.61us  383.65K
folly_distributedmutex(4thread)               121.82%     1.94us  516.63K
----------------------------------------------------------------------------
std_mutex(8thread)                                        5.39us  185.64K
google_spin(8thread)                          127.72%     4.22us  237.10K
folly_microspin(8thread)                      106.70%     5.05us  198.08K
folly_picospin(8thread)                        88.02%     6.12us  163.41K
folly_microlock(8thread)                       79.78%     6.75us  148.11K
folly_sharedmutex(8thread)                     78.25%     6.88us  145.26K
folly_distributedmutex(8thread)               162.74%     3.31us  302.12K
----------------------------------------------------------------------------
std_mutex(16thread)                                      11.74us   85.16K
google_spin(16thread)                         109.81%    10.68us   93.60K
folly_microspin(16thread)                     103.93%    11.30us   88.50K
folly_picospin(16thread)                       50.36%    23.32us   42.89K
folly_microlock(16thread)                      55.85%    21.03us   47.56K
folly_sharedmutex(16thread)                    64.27%    18.27us   54.74K
folly_distributedmutex(16thread)              181.32%     6.48us  154.41K
----------------------------------------------------------------------------
std_mutex(32thread)                                      31.56us   31.68K
google_spin(32thread)                          95.17%    33.17us   30.15K
folly_microspin(32thread)                     100.60%    31.38us   31.87K
folly_picospin(32thread)                       31.30%   100.84us    9.92K
folly_microlock(32thread)                      55.04%    57.35us   17.44K
folly_sharedmutex(32thread)                    65.09%    48.49us   20.62K
folly_distributedmutex(32thread)              177.39%    17.79us   56.20K
----------------------------------------------------------------------------
std_mutex(64thread)                                      39.90us   25.06K
google_spin(64thread)                         110.92%    35.98us   27.80K
folly_microspin(64thread)                     105.98%    37.65us   26.56K
folly_picospin(64thread)                       33.03%   120.80us    8.28K
folly_microlock(64thread)                      58.02%    68.78us   14.54K
folly_sharedmutex(64thread)                    68.43%    58.32us   17.15K
folly_distributedmutex(64thread)              200.38%    19.91us   50.22K
----------------------------------------------------------------------------
std_mutex(128thread)                                     75.67us   13.21K
google_spin(128thread)                        116.14%    65.16us   15.35K
folly_microspin(128thread)                    100.82%    75.06us   13.32K
folly_picospin(128thread)                      44.99%   168.21us    5.94K
folly_microlock(128thread)                     53.93%   140.31us    7.13K
folly_sharedmutex(128thread)                   64.37%   117.55us    8.51K
folly_distributedmutex(128thread)             185.71%    40.75us   24.54K
============================================================================
*/