123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295 |
- /*
- * Copyright 2013-present Facebook, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- // @author: Bert Maher <bertrand@fb.com>
- #include <folly/ProducerConsumerQueue.h>
- #include <cstdio>
- #include <iostream>
- #include <thread>
- #include <glog/logging.h>
- #include <folly/Benchmark.h>
- #include <folly/portability/GFlags.h>
- #include <folly/portability/PThread.h>
- #include <folly/stats/Histogram-defs.h>
- #include <folly/stats/Histogram.h>
- namespace {
- using namespace folly;
- typedef unsigned int ThroughputType;
- typedef ProducerConsumerQueue<ThroughputType> ThroughputQueueType;
- typedef unsigned long LatencyType;
- typedef ProducerConsumerQueue<LatencyType> LatencyQueueType;
- template <class QueueType>
- struct ThroughputTest {
- explicit ThroughputTest(size_t size, int iters, int cpu0, int cpu1)
- : queue_(size), done_(false), iters_(iters), cpu0_(cpu0), cpu1_(cpu1) {}
- void producer() {
- if (cpu0_ > -1) {
- cpu_set_t cpuset;
- CPU_ZERO(&cpuset);
- CPU_SET(cpu0_, &cpuset);
- pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
- }
- for (int i = 0; i < iters_; ++i) {
- ThroughputType item = i;
- while (!queue_.write((ThroughputType)item)) {
- }
- }
- }
- void consumer() {
- if (cpu1_ > -1) {
- cpu_set_t cpuset;
- CPU_ZERO(&cpuset);
- CPU_SET(cpu1_, &cpuset);
- pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
- }
- for (int i = 0; i < iters_; ++i) {
- ThroughputType item = 0;
- while (!queue_.read(item)) {
- }
- doNotOptimizeAway(item);
- }
- }
- QueueType queue_;
- std::atomic<bool> done_;
- const int iters_;
- int cpu0_;
- int cpu1_;
- };
- template <class QueueType>
- struct LatencyTest {
- explicit LatencyTest(size_t size, int iters, int cpu0, int cpu1)
- : queue_(size),
- done_(false),
- iters_(iters),
- cpu0_(cpu0),
- cpu1_(cpu1),
- hist_(1, 0, 30) {
- computeTimeCost();
- }
- static uint64_t timespecDiff(timespec end, timespec start) {
- if (end.tv_sec == start.tv_sec) {
- assert(end.tv_nsec >= start.tv_nsec);
- return uint64_t(end.tv_nsec - start.tv_nsec);
- }
- assert(end.tv_sec > start.tv_sec);
- auto diff = uint64_t(end.tv_sec - start.tv_sec);
- assert(diff < std::numeric_limits<uint64_t>::max() / 1000000000ULL);
- return diff * 1000000000ULL + end.tv_nsec - start.tv_nsec;
- }
- void computeTimeCost() {
- timespec start, end;
- clock_gettime(CLOCK_REALTIME, &start);
- for (int i = 0; i < iters_; ++i) {
- timespec tv;
- clock_gettime(CLOCK_REALTIME, &tv);
- }
- clock_gettime(CLOCK_REALTIME, &end);
- time_cost_ = 2 * timespecDiff(end, start) / iters_;
- }
- void producer() {
- if (cpu0_ > -1) {
- cpu_set_t cpuset;
- CPU_ZERO(&cpuset);
- CPU_SET(cpu0_, &cpuset);
- pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
- }
- for (int i = 0; i < iters_; ++i) {
- timespec sleeptime, sleepstart;
- clock_gettime(CLOCK_REALTIME, &sleepstart);
- do {
- clock_gettime(CLOCK_REALTIME, &sleeptime);
- } while (timespecDiff(sleeptime, sleepstart) < 1000000);
- timespec tv;
- clock_gettime(CLOCK_REALTIME, &tv);
- while (!queue_.write((LatencyType)tv.tv_nsec)) {
- }
- }
- }
- void consumer() {
- if (cpu1_ > -1) {
- cpu_set_t cpuset;
- CPU_ZERO(&cpuset);
- CPU_SET(cpu1_, &cpuset);
- pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
- }
- for (int i = 0; i < iters_; ++i) {
- unsigned long enqueue_nsec;
- while (!queue_.read(enqueue_nsec)) {
- }
- timespec tv;
- clock_gettime(CLOCK_REALTIME, &tv);
- int diff = tv.tv_nsec - enqueue_nsec - time_cost_;
- if (diff < 0) {
- continue;
- }
- // Naive log-scale bucketing.
- int bucket;
- for (bucket = 0; bucket <= 30 && (1 << bucket) <= diff; ++bucket) {
- }
- hist_.addValue(bucket - 1);
- }
- }
- void printHistogram() {
- hist_.toTSV(std::cout);
- }
- QueueType queue_;
- std::atomic<bool> done_;
- int time_cost_;
- const int iters_;
- int cpu0_;
- int cpu1_;
- Histogram<int> hist_;
- };
- void BM_ProducerConsumer(int iters, int size) {
- BenchmarkSuspender susp;
- CHECK_GT(size, 0);
- ThroughputTest<ThroughputQueueType>* test =
- new ThroughputTest<ThroughputQueueType>(size, iters, -1, -1);
- susp.dismiss();
- std::thread producer([test] { test->producer(); });
- std::thread consumer([test] { test->consumer(); });
- producer.join();
- test->done_ = true;
- consumer.join();
- delete test;
- }
- void BM_ProducerConsumerAffinity(int iters, int size) {
- BenchmarkSuspender susp;
- CHECK_GT(size, 0);
- ThroughputTest<ThroughputQueueType>* test =
- new ThroughputTest<ThroughputQueueType>(size, iters, 0, 1);
- susp.dismiss();
- std::thread producer([test] { test->producer(); });
- std::thread consumer([test] { test->consumer(); });
- producer.join();
- test->done_ = true;
- consumer.join();
- delete test;
- }
- void BM_ProducerConsumerLatency(int /* iters */, int size) {
- BenchmarkSuspender susp;
- CHECK_GT(size, 0);
- LatencyTest<LatencyQueueType>* test =
- new LatencyTest<LatencyQueueType>(size, 100000, 0, 1);
- susp.dismiss();
- std::thread producer([test] { test->producer(); });
- std::thread consumer([test] { test->consumer(); });
- producer.join();
- test->done_ = true;
- consumer.join();
- test->printHistogram();
- delete test;
- }
- BENCHMARK_DRAW_LINE();
- BENCHMARK_PARAM(BM_ProducerConsumer, 1048574)
- BENCHMARK_PARAM(BM_ProducerConsumerAffinity, 1048574)
- BENCHMARK_PARAM(BM_ProducerConsumerLatency, 1048574)
- } // namespace
- int main(int argc, char** argv) {
- google::InitGoogleLogging(argv[0]);
- gflags::ParseCommandLineFlags(&argc, &argv, true);
- runBenchmarks();
- return 0;
- }
- #if 0
- /*
- Benchmark
- $ lscpu
- Architecture: x86_64
- CPU op-mode(s): 32-bit, 64-bit
- Byte Order: Little Endian
- CPU(s): 24
- On-line CPU(s) list: 0-23
- Thread(s) per core: 1
- Core(s) per socket: 1
- Socket(s): 24
- NUMA node(s): 1
- Vendor ID: GenuineIntel
- CPU family: 6
- Model: 60
- Model name: Intel Core Processor (Haswell, no TSX)
- Stepping: 1
- CPU MHz: 2494.244
- BogoMIPS: 4988.48
- Hypervisor vendor: KVM
- Virtualization type: full
- L1d cache: 32K
- L1i cache: 32K
- L2 cache: 4096K
- NUMA node0 CPU(s): 0-23
- $ ../buck-out/gen/folly/test/producer_consumer_queue_benchmark
- 5 6 1 5
- 6 7 1893 11358
- 7 8 39671 277697
- 8 9 34921 279368
- 9 10 17799 160191
- 10 11 3685 36850
- 11 12 1075 11825
- 12 13 456 5472
- 13 14 422 5486
- 14 15 64 896
- 15 16 7 105
- 16 17 3 48
- 17 18 3 51
- ============================================================================
- folly/test/ProducerConsumerQueueBenchmark.cpp relative time/iter iters/s
- ============================================================================
- ----------------------------------------------------------------------------
- BM_ProducerConsumer(1048574) 5.82ns 171.75M
- BM_ProducerConsumerAffinity(1048574) 7.36ns 135.83M
- BM_ProducerConsumerLatency(1048574) 1.67min 9.99m
- ============================================================================
- */
- #endif
|