ProducerConsumerQueueBenchmark.cpp 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295
  1. /*
  2. * Copyright 2013-present Facebook, Inc.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. // @author: Bert Maher <bertrand@fb.com>
  17. #include <folly/ProducerConsumerQueue.h>
  18. #include <cstdio>
  19. #include <iostream>
  20. #include <thread>
  21. #include <glog/logging.h>
  22. #include <folly/Benchmark.h>
  23. #include <folly/portability/GFlags.h>
  24. #include <folly/portability/PThread.h>
  25. #include <folly/stats/Histogram-defs.h>
  26. #include <folly/stats/Histogram.h>
  27. namespace {
  28. using namespace folly;
  29. typedef unsigned int ThroughputType;
  30. typedef ProducerConsumerQueue<ThroughputType> ThroughputQueueType;
  31. typedef unsigned long LatencyType;
  32. typedef ProducerConsumerQueue<LatencyType> LatencyQueueType;
  33. template <class QueueType>
  34. struct ThroughputTest {
  35. explicit ThroughputTest(size_t size, int iters, int cpu0, int cpu1)
  36. : queue_(size), done_(false), iters_(iters), cpu0_(cpu0), cpu1_(cpu1) {}
  37. void producer() {
  38. if (cpu0_ > -1) {
  39. cpu_set_t cpuset;
  40. CPU_ZERO(&cpuset);
  41. CPU_SET(cpu0_, &cpuset);
  42. pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
  43. }
  44. for (int i = 0; i < iters_; ++i) {
  45. ThroughputType item = i;
  46. while (!queue_.write((ThroughputType)item)) {
  47. }
  48. }
  49. }
  50. void consumer() {
  51. if (cpu1_ > -1) {
  52. cpu_set_t cpuset;
  53. CPU_ZERO(&cpuset);
  54. CPU_SET(cpu1_, &cpuset);
  55. pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
  56. }
  57. for (int i = 0; i < iters_; ++i) {
  58. ThroughputType item = 0;
  59. while (!queue_.read(item)) {
  60. }
  61. doNotOptimizeAway(item);
  62. }
  63. }
  64. QueueType queue_;
  65. std::atomic<bool> done_;
  66. const int iters_;
  67. int cpu0_;
  68. int cpu1_;
  69. };
  70. template <class QueueType>
  71. struct LatencyTest {
  72. explicit LatencyTest(size_t size, int iters, int cpu0, int cpu1)
  73. : queue_(size),
  74. done_(false),
  75. iters_(iters),
  76. cpu0_(cpu0),
  77. cpu1_(cpu1),
  78. hist_(1, 0, 30) {
  79. computeTimeCost();
  80. }
  81. static uint64_t timespecDiff(timespec end, timespec start) {
  82. if (end.tv_sec == start.tv_sec) {
  83. assert(end.tv_nsec >= start.tv_nsec);
  84. return uint64_t(end.tv_nsec - start.tv_nsec);
  85. }
  86. assert(end.tv_sec > start.tv_sec);
  87. auto diff = uint64_t(end.tv_sec - start.tv_sec);
  88. assert(diff < std::numeric_limits<uint64_t>::max() / 1000000000ULL);
  89. return diff * 1000000000ULL + end.tv_nsec - start.tv_nsec;
  90. }
  91. void computeTimeCost() {
  92. timespec start, end;
  93. clock_gettime(CLOCK_REALTIME, &start);
  94. for (int i = 0; i < iters_; ++i) {
  95. timespec tv;
  96. clock_gettime(CLOCK_REALTIME, &tv);
  97. }
  98. clock_gettime(CLOCK_REALTIME, &end);
  99. time_cost_ = 2 * timespecDiff(end, start) / iters_;
  100. }
  101. void producer() {
  102. if (cpu0_ > -1) {
  103. cpu_set_t cpuset;
  104. CPU_ZERO(&cpuset);
  105. CPU_SET(cpu0_, &cpuset);
  106. pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
  107. }
  108. for (int i = 0; i < iters_; ++i) {
  109. timespec sleeptime, sleepstart;
  110. clock_gettime(CLOCK_REALTIME, &sleepstart);
  111. do {
  112. clock_gettime(CLOCK_REALTIME, &sleeptime);
  113. } while (timespecDiff(sleeptime, sleepstart) < 1000000);
  114. timespec tv;
  115. clock_gettime(CLOCK_REALTIME, &tv);
  116. while (!queue_.write((LatencyType)tv.tv_nsec)) {
  117. }
  118. }
  119. }
  120. void consumer() {
  121. if (cpu1_ > -1) {
  122. cpu_set_t cpuset;
  123. CPU_ZERO(&cpuset);
  124. CPU_SET(cpu1_, &cpuset);
  125. pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
  126. }
  127. for (int i = 0; i < iters_; ++i) {
  128. unsigned long enqueue_nsec;
  129. while (!queue_.read(enqueue_nsec)) {
  130. }
  131. timespec tv;
  132. clock_gettime(CLOCK_REALTIME, &tv);
  133. int diff = tv.tv_nsec - enqueue_nsec - time_cost_;
  134. if (diff < 0) {
  135. continue;
  136. }
  137. // Naive log-scale bucketing.
  138. int bucket;
  139. for (bucket = 0; bucket <= 30 && (1 << bucket) <= diff; ++bucket) {
  140. }
  141. hist_.addValue(bucket - 1);
  142. }
  143. }
  144. void printHistogram() {
  145. hist_.toTSV(std::cout);
  146. }
  147. QueueType queue_;
  148. std::atomic<bool> done_;
  149. int time_cost_;
  150. const int iters_;
  151. int cpu0_;
  152. int cpu1_;
  153. Histogram<int> hist_;
  154. };
  155. void BM_ProducerConsumer(int iters, int size) {
  156. BenchmarkSuspender susp;
  157. CHECK_GT(size, 0);
  158. ThroughputTest<ThroughputQueueType>* test =
  159. new ThroughputTest<ThroughputQueueType>(size, iters, -1, -1);
  160. susp.dismiss();
  161. std::thread producer([test] { test->producer(); });
  162. std::thread consumer([test] { test->consumer(); });
  163. producer.join();
  164. test->done_ = true;
  165. consumer.join();
  166. delete test;
  167. }
  168. void BM_ProducerConsumerAffinity(int iters, int size) {
  169. BenchmarkSuspender susp;
  170. CHECK_GT(size, 0);
  171. ThroughputTest<ThroughputQueueType>* test =
  172. new ThroughputTest<ThroughputQueueType>(size, iters, 0, 1);
  173. susp.dismiss();
  174. std::thread producer([test] { test->producer(); });
  175. std::thread consumer([test] { test->consumer(); });
  176. producer.join();
  177. test->done_ = true;
  178. consumer.join();
  179. delete test;
  180. }
  181. void BM_ProducerConsumerLatency(int /* iters */, int size) {
  182. BenchmarkSuspender susp;
  183. CHECK_GT(size, 0);
  184. LatencyTest<LatencyQueueType>* test =
  185. new LatencyTest<LatencyQueueType>(size, 100000, 0, 1);
  186. susp.dismiss();
  187. std::thread producer([test] { test->producer(); });
  188. std::thread consumer([test] { test->consumer(); });
  189. producer.join();
  190. test->done_ = true;
  191. consumer.join();
  192. test->printHistogram();
  193. delete test;
  194. }
  // Benchmark registration (macros from folly/Benchmark.h). Each
  // BENCHMARK_PARAM names one of the BM_* functions above together with the
  // value 1048574; presumably that value is passed as the function's `size`
  // argument, i.e. the queue size -- confirm against the folly Benchmark
  // documentation.
  195. BENCHMARK_DRAW_LINE();
  196. BENCHMARK_PARAM(BM_ProducerConsumer, 1048574)
  197. BENCHMARK_PARAM(BM_ProducerConsumerAffinity, 1048574)
  198. BENCHMARK_PARAM(BM_ProducerConsumerLatency, 1048574)
  199. } // namespace
  200. int main(int argc, char** argv) {
  201. google::InitGoogleLogging(argv[0]);
  202. gflags::ParseCommandLineFlags(&argc, &argv, true);
  203. runBenchmarks();
  204. return 0;
  205. }
  206. #if 0
  207. /*
  208. Benchmark
  209. $ lscpu
  210. Architecture: x86_64
  211. CPU op-mode(s): 32-bit, 64-bit
  212. Byte Order: Little Endian
  213. CPU(s): 24
  214. On-line CPU(s) list: 0-23
  215. Thread(s) per core: 1
  216. Core(s) per socket: 1
  217. Socket(s): 24
  218. NUMA node(s): 1
  219. Vendor ID: GenuineIntel
  220. CPU family: 6
  221. Model: 60
  222. Model name: Intel Core Processor (Haswell, no TSX)
  223. Stepping: 1
  224. CPU MHz: 2494.244
  225. BogoMIPS: 4988.48
  226. Hypervisor vendor: KVM
  227. Virtualization type: full
  228. L1d cache: 32K
  229. L1i cache: 32K
  230. L2 cache: 4096K
  231. NUMA node0 CPU(s): 0-23
  232. $ ../buck-out/gen/folly/test/producer_consumer_queue_benchmark
  233. 5 6 1 5
  234. 6 7 1893 11358
  235. 7 8 39671 277697
  236. 8 9 34921 279368
  237. 9 10 17799 160191
  238. 10 11 3685 36850
  239. 11 12 1075 11825
  240. 12 13 456 5472
  241. 13 14 422 5486
  242. 14 15 64 896
  243. 15 16 7 105
  244. 16 17 3 48
  245. 17 18 3 51
  246. ============================================================================
  247. folly/test/ProducerConsumerQueueBenchmark.cpp relative time/iter iters/s
  248. ============================================================================
  249. ----------------------------------------------------------------------------
  250. BM_ProducerConsumer(1048574) 5.82ns 171.75M
  251. BM_ProducerConsumerAffinity(1048574) 7.36ns 135.83M
  252. BM_ProducerConsumerLatency(1048574) 1.67min 9.99m
  253. ============================================================================
  254. */
  255. #endif