RWSpinLock.h

/*
 * Copyright 2011-present Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * N.B. You most likely do _not_ want to use RWSpinLock or any other
 * kind of spinlock. Use SharedMutex instead.
 *
 * In short, spinlocks in preemptive multi-tasking operating systems
 * have serious problems and fast mutexes like SharedMutex are almost
 * certainly the better choice, because letting the OS scheduler put a
 * thread to sleep is better for system responsiveness and throughput
 * than wasting a timeslice repeatedly querying a lock held by a
 * thread that's blocked, and you can't prevent userspace
 * programs from blocking.
 *
 * Spinlocks in an operating system kernel make much more sense than
 * they do in userspace.
 *
 * -------------------------------------------------------------------
 *
 * Two Read-Write spin lock implementations.
 *
 * Ref: http://locklessinc.com/articles/locks
 *
 * Both locks here are faster than pthread_rwlock and have very low
 * overhead (usually 20-30ns). They don't use any system mutexes and
 * are very compact (4/8 bytes), so are suitable for per-instance
 * based locking, particularly when contention is not expected.
 *
 * For a spinlock, RWSpinLock is a reasonable choice. (See the note
 * above for why a spin lock is frequently a bad idea in general.)
 * RWSpinLock has minimal overhead, and comparable contention
 * performance when the number of competing threads is less than or
 * equal to the number of logical CPUs. Even as the number of
 * threads gets larger, RWSpinLock can still be very competitive in
 * READ, although it is slower on WRITE, and also inherently unfair
 * to writers.
 *
 * RWTicketSpinLock shows more balanced READ/WRITE performance. If
 * your application really needs a lot more threads, and a
 * higher-priority writer, prefer one of the RWTicketSpinLock locks.
 *
 * Caveats:
 *
 *   RWTicketSpinLock locks can only be used with GCC on x86/x86-64
 *   based systems.
 *
 *   RWTicketSpinLock<32> only allows up to 2^8 - 1 concurrent
 *   readers and writers.
 *
 *   RWTicketSpinLock<64> only allows up to 2^16 - 1 concurrent
 *   readers and writers.
 *
 *   RWTicketSpinLock<..., true> (kFavorWriter = true, that is, strict
 *   writer priority) is NOT reentrant, even for lock_shared().
 *
 *   The lock will not grant any new shared (read) accesses while a thread
 *   attempting to acquire the lock in write mode is blocked. (That is,
 *   if the lock is held in shared mode by N threads, and a thread attempts
 *   to acquire it in write mode, no one else can acquire it in shared mode
 *   until these N threads release the lock and then the blocked thread
 *   acquires and releases the exclusive lock.) This also applies to
 *   attempts to reacquire the lock in shared mode by threads that already
 *   hold it in shared mode, making the lock non-reentrant.
 *
 *   RWSpinLock handles 2^30 - 1 concurrent readers.
 *
 * @author Xin Liu <xliux@fb.com>
 */
#pragma once

/*
========================================================================
Benchmark on (Intel(R) Xeon(R) CPU L5630 @ 2.13GHz) 8 cores (16 HTs)
========================================================================

------------------------------------------------------------------------------
1. Single thread benchmark (read/write lock + unlock overhead)
Benchmark                                     Iters   Total t   t/iter  iter/sec
-------------------------------------------------------------------------------
*      BM_RWSpinLockRead                      100000  1.786 ms  17.86 ns   53.4M
+30.5% BM_RWSpinLockWrite                     100000  2.331 ms  23.31 ns  40.91M
+85.7% BM_RWTicketSpinLock32Read              100000  3.317 ms  33.17 ns  28.75M
+96.0% BM_RWTicketSpinLock32Write             100000    3.5 ms     35 ns  27.25M
+85.6% BM_RWTicketSpinLock64Read              100000  3.315 ms  33.15 ns  28.77M
+96.0% BM_RWTicketSpinLock64Write             100000    3.5 ms     35 ns  27.25M
+85.7% BM_RWTicketSpinLock32FavorWriterRead   100000  3.317 ms  33.17 ns  28.75M
+29.7% BM_RWTicketSpinLock32FavorWriterWrite  100000  2.316 ms  23.16 ns  41.18M
+85.3% BM_RWTicketSpinLock64FavorWriterRead   100000  3.309 ms  33.09 ns  28.82M
+30.2% BM_RWTicketSpinLock64FavorWriterWrite  100000  2.325 ms  23.25 ns  41.02M
+ 175% BM_PThreadRWMutexRead                  100000  4.917 ms  49.17 ns   19.4M
+ 166% BM_PThreadRWMutexWrite                 100000  4.757 ms  47.57 ns  20.05M

------------------------------------------------------------------------------
2. Contention Benchmark: 90% read, 10% write
Benchmark                 hits      average  min    max     sigma
------------------------------------------------------------------------------
---------- 8 threads ------------
RWSpinLock       Write    142666    220ns    78ns   40.8us  269ns
RWSpinLock       Read     1282297   222ns    80ns   37.7us  248ns
RWTicketSpinLock Write    85692     209ns    71ns   17.9us  252ns
RWTicketSpinLock Read     769571    215ns    78ns   33.4us  251ns
pthread_rwlock_t Write    84248     2.48us   99ns   269us   8.19us
pthread_rwlock_t Read     761646    933ns    101ns  374us   3.25us

---------- 16 threads ------------
RWSpinLock       Write    124236    237ns    78ns   261us   801ns
RWSpinLock       Read     1115807   236ns    78ns   2.27ms  2.17us
RWTicketSpinLock Write    81781     231ns    71ns   31.4us  351ns
RWTicketSpinLock Read     734518    238ns    78ns   73.6us  379ns
pthread_rwlock_t Write    83363     7.12us   99ns   785us   28.1us
pthread_rwlock_t Read     754978    2.18us   101ns  1.02ms  14.3us

---------- 50 threads ------------
RWSpinLock       Write    131142    1.37us   82ns   7.53ms  68.2us
RWSpinLock       Read     1181240   262ns    78ns   6.62ms  12.7us
RWTicketSpinLock Write    83045     397ns    73ns   7.01ms  31.5us
RWTicketSpinLock Read     744133    386ns    78ns   11ms    31.4us
pthread_rwlock_t Write    80849     112us    103ns  4.52ms  263us
pthread_rwlock_t Read     728698    24us     101ns  7.28ms  194us
*/
#include <folly/Portability.h>
#include <folly/portability/Asm.h>

#if defined(__GNUC__) && (defined(__i386) || FOLLY_X64 || defined(ARCH_K8))
#define RW_SPINLOCK_USE_X86_INTRINSIC_
#include <x86intrin.h>
#elif defined(_MSC_VER) && defined(FOLLY_X64)
#define RW_SPINLOCK_USE_X86_INTRINSIC_
#elif FOLLY_AARCH64
#define RW_SPINLOCK_USE_X86_INTRINSIC_
#else
#undef RW_SPINLOCK_USE_X86_INTRINSIC_
#endif

// iOS doesn't define _mm_cvtsi64_si128 and friends
#if (FOLLY_SSE >= 2) && !FOLLY_MOBILE && FOLLY_X64
#define RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
#else
#undef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
#endif

#include <algorithm>
#include <atomic>
#include <thread>

#include <folly/Likely.h>

namespace folly {
/*
 * A simple, small (4-bytes), but unfair rwlock. Use it when you want
 * a nice writer and don't expect a lot of write/read contention, or
 * when you need small rwlocks since you are creating a large number
 * of them.
 *
 * Note that the unfairness here is extreme: if the lock is
 * continually accessed for read, writers will never get a chance. If
 * the lock can be that highly contended this class is probably not an
 * ideal choice anyway.
 *
 * It currently implements most of the Lockable, SharedLockable and
 * UpgradeLockable concepts except the TimedLockable related locking/unlocking
 * interfaces.
 */
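/*
 * A minimal usage sketch (the Counter struct, its members, and the mutable
 * qualifier below are illustrative assumptions, not part of this header).
 * The lock follows the usual lock()/unlock() and lock_shared()/unlock_shared()
 * interface, and the nested RAII holders defined at the end of the class
 * release the lock on scope exit:
 *
 *   #include <folly/RWSpinLock.h>
 *
 *   struct Counter {
 *     // mutable so read() can take the lock from a const member function
 *     mutable folly::RWSpinLock lock_;
 *     int value_ = 0;
 *
 *     int read() const {
 *       folly::RWSpinLock::ReadHolder guard(&lock_);  // shared access
 *       return value_;
 *     }
 *     void write(int v) {
 *       folly::RWSpinLock::WriteHolder guard(&lock_); // exclusive access
 *       value_ = v;
 *     }
 *   };
 */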
class RWSpinLock {
  enum : int32_t { READER = 4, UPGRADED = 2, WRITER = 1 };

 public:
  constexpr RWSpinLock() : bits_(0) {}

  RWSpinLock(RWSpinLock const&) = delete;
  RWSpinLock& operator=(RWSpinLock const&) = delete;

  // Lockable Concept
  void lock() {
    uint_fast32_t count = 0;
    while (!LIKELY(try_lock())) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  // Writer is responsible for clearing up both the UPGRADED and WRITER bits.
  void unlock() {
    static_assert(READER > WRITER + UPGRADED, "wrong bits!");
    bits_.fetch_and(~(WRITER | UPGRADED), std::memory_order_release);
  }

  // SharedLockable Concept
  void lock_shared() {
    uint_fast32_t count = 0;
    while (!LIKELY(try_lock_shared())) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  void unlock_shared() {
    bits_.fetch_add(-READER, std::memory_order_release);
  }

  // Downgrade the lock from writer status to reader status.
  void unlock_and_lock_shared() {
    bits_.fetch_add(READER, std::memory_order_acquire);
    unlock();
  }

  // UpgradeLockable Concept
  void lock_upgrade() {
    uint_fast32_t count = 0;
    while (!try_lock_upgrade()) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  void unlock_upgrade() {
    bits_.fetch_add(-UPGRADED, std::memory_order_acq_rel);
  }

  // unlock upgrade and try to acquire write lock
  void unlock_upgrade_and_lock() {
    int64_t count = 0;
    while (!try_unlock_upgrade_and_lock()) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  // unlock upgrade and read lock atomically
  void unlock_upgrade_and_lock_shared() {
    bits_.fetch_add(READER - UPGRADED, std::memory_order_acq_rel);
  }

  // write unlock and upgrade lock atomically
  void unlock_and_lock_upgrade() {
    // need to do it in two steps here -- as the UPGRADED bit might be OR-ed at
    // the same time when other threads are trying to do try_lock_upgrade().
    bits_.fetch_or(UPGRADED, std::memory_order_acquire);
    bits_.fetch_add(-WRITER, std::memory_order_release);
  }

  // Attempt to acquire writer permission. Return false if we didn't get it.
  bool try_lock() {
    int32_t expect = 0;
    return bits_.compare_exchange_strong(
        expect, WRITER, std::memory_order_acq_rel);
  }

  // Try to get reader permission on the lock. This can fail if we
  // find out someone is a writer or upgrader.
  // Setting the UPGRADED bit would allow a writer-to-be to indicate
  // its intention to write and block any new readers while waiting
  // for existing readers to finish and release their read locks. This
  // helps avoid starving writers (promoted from upgraders).
  bool try_lock_shared() {
    // fetch_add is considerably (100%) faster than compare_exchange,
    // so here we are optimizing for the common (lock success) case.
    int32_t value = bits_.fetch_add(READER, std::memory_order_acquire);
    if (UNLIKELY(value & (WRITER | UPGRADED))) {
      bits_.fetch_add(-READER, std::memory_order_release);
      return false;
    }
    return true;
  }

  // try to unlock upgrade and write lock atomically
  bool try_unlock_upgrade_and_lock() {
    int32_t expect = UPGRADED;
    return bits_.compare_exchange_strong(
        expect, WRITER, std::memory_order_acq_rel);
  }

  // try to acquire an upgradable lock.
  bool try_lock_upgrade() {
    int32_t value = bits_.fetch_or(UPGRADED, std::memory_order_acquire);
    // Note: when failed, we cannot flip the UPGRADED bit back,
    // as in this case there is either another upgrade lock or a write lock.
    // If it's a write lock, the bit will get cleared up when that lock's done
    // with unlock().
    return ((value & (UPGRADED | WRITER)) == 0);
  }

  // mainly for debugging purposes.
  int32_t bits() const {
    return bits_.load(std::memory_order_acquire);
  }
  class ReadHolder;
  class UpgradedHolder;
  class WriteHolder;

  class ReadHolder {
   public:
    explicit ReadHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) {
        lock_->lock_shared();
      }
    }

    explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock_shared();
    }

    ReadHolder(ReadHolder&& other) noexcept : lock_(other.lock_) {
      other.lock_ = nullptr;
    }

    // down-grade
    explicit ReadHolder(UpgradedHolder&& upgraded) : lock_(upgraded.lock_) {
      upgraded.lock_ = nullptr;
      if (lock_) {
        lock_->unlock_upgrade_and_lock_shared();
      }
    }

    explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
      writer.lock_ = nullptr;
      if (lock_) {
        lock_->unlock_and_lock_shared();
      }
    }

    ReadHolder& operator=(ReadHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    ReadHolder(const ReadHolder& other) = delete;
    ReadHolder& operator=(const ReadHolder& other) = delete;

    ~ReadHolder() {
      if (lock_) {
        lock_->unlock_shared();
      }
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) {
        return;
      }
      if (lock_) {
        lock_->unlock_shared();
      }
      lock_ = lock;
      if (lock_) {
        lock_->lock_shared();
      }
    }

    void swap(ReadHolder* other) {
      std::swap(lock_, other->lock_);
    }

   private:
    friend class UpgradedHolder;
    friend class WriteHolder;
    RWSpinLock* lock_;
  };

  class UpgradedHolder {
   public:
    explicit UpgradedHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) {
        lock_->lock_upgrade();
      }
    }

    explicit UpgradedHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock_upgrade();
    }

    explicit UpgradedHolder(WriteHolder&& writer) {
      lock_ = writer.lock_;
      writer.lock_ = nullptr;
      if (lock_) {
        lock_->unlock_and_lock_upgrade();
      }
    }

    UpgradedHolder(UpgradedHolder&& other) noexcept : lock_(other.lock_) {
      other.lock_ = nullptr;
    }

    UpgradedHolder& operator=(UpgradedHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    UpgradedHolder(const UpgradedHolder& other) = delete;
    UpgradedHolder& operator=(const UpgradedHolder& other) = delete;

    ~UpgradedHolder() {
      if (lock_) {
        lock_->unlock_upgrade();
      }
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) {
        return;
      }
      if (lock_) {
        lock_->unlock_upgrade();
      }
      lock_ = lock;
      if (lock_) {
        lock_->lock_upgrade();
      }
    }

    void swap(UpgradedHolder* other) {
      using std::swap;
      swap(lock_, other->lock_);
    }

   private:
    friend class WriteHolder;
    friend class ReadHolder;
    RWSpinLock* lock_;
  };

  class WriteHolder {
   public:
    explicit WriteHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) {
        lock_->lock();
      }
    }

    explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock();
    }

    // promoted from an upgrade lock holder
    explicit WriteHolder(UpgradedHolder&& upgraded) {
      lock_ = upgraded.lock_;
      upgraded.lock_ = nullptr;
      if (lock_) {
        lock_->unlock_upgrade_and_lock();
      }
    }

    WriteHolder(WriteHolder&& other) noexcept : lock_(other.lock_) {
      other.lock_ = nullptr;
    }

    WriteHolder& operator=(WriteHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    WriteHolder(const WriteHolder& other) = delete;
    WriteHolder& operator=(const WriteHolder& other) = delete;

    ~WriteHolder() {
      if (lock_) {
        lock_->unlock();
      }
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) {
        return;
      }
      if (lock_) {
        lock_->unlock();
      }
      lock_ = lock;
      if (lock_) {
        lock_->lock();
      }
    }

    void swap(WriteHolder* other) {
      using std::swap;
      swap(lock_, other->lock_);
    }

   private:
    friend class ReadHolder;
    friend class UpgradedHolder;
    RWSpinLock* lock_;
  };

 private:
  std::atomic<int32_t> bits_;
};
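/*
 * A sketch of the upgrade protocol, assuming a hypothetical Cache type with
 * isStale()/refresh() methods and an embedded lock_ member (none of these are
 * part of this header). Holding the upgrade lock blocks new readers and
 * writers while existing readers drain; the holders convert between modes by
 * move construction:
 *
 *   void refreshIfStale(Cache& cache) {
 *     // Reserve the right to write; new readers are held off from here on.
 *     folly::RWSpinLock::UpgradedHolder upgraded(&cache.lock_);
 *     if (cache.isStale()) {
 *       // Promote to exclusive once the remaining readers have released.
 *       folly::RWSpinLock::WriteHolder writer(std::move(upgraded));
 *       cache.refresh();
 *     }
 *     // Whichever holder still owns the lock releases it on scope exit.
 *   }
 */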
#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
// A more balanced Read-Write spin lock implemented based on GCC intrinsics.

namespace detail {
template <size_t kBitWidth>
struct RWTicketIntTrait {
  static_assert(
      kBitWidth == 32 || kBitWidth == 64,
      "bit width has to be either 32 or 64 ");
};

template <>
struct RWTicketIntTrait<64> {
  typedef uint64_t FullInt;
  typedef uint32_t HalfInt;
  typedef uint16_t QuarterInt;

#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
  static __m128i make128(const uint16_t v[4]) {
    return _mm_set_epi16(
        0, 0, 0, 0, short(v[3]), short(v[2]), short(v[1]), short(v[0]));
  }
  static inline __m128i fromInteger(uint64_t from) {
    return _mm_cvtsi64_si128(int64_t(from));
  }
  static inline uint64_t toInteger(__m128i in) {
    return uint64_t(_mm_cvtsi128_si64(in));
  }
  static inline uint64_t addParallel(__m128i in, __m128i kDelta) {
    return toInteger(_mm_add_epi16(in, kDelta));
  }
#endif
};

template <>
struct RWTicketIntTrait<32> {
  typedef uint32_t FullInt;
  typedef uint16_t HalfInt;
  typedef uint8_t QuarterInt;

#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
  static __m128i make128(const uint8_t v[4]) {
    // clang-format off
    return _mm_set_epi8(
        0, 0, 0, 0,
        0, 0, 0, 0,
        0, 0, 0, 0,
        char(v[3]), char(v[2]), char(v[1]), char(v[0]));
    // clang-format on
  }
  static inline __m128i fromInteger(uint32_t from) {
    return _mm_cvtsi32_si128(int32_t(from));
  }
  static inline uint32_t toInteger(__m128i in) {
    return uint32_t(_mm_cvtsi128_si32(in));
  }
  static inline uint32_t addParallel(__m128i in, __m128i kDelta) {
    return toInteger(_mm_add_epi8(in, kDelta));
  }
#endif
};
} // namespace detail
template <size_t kBitWidth, bool kFavorWriter = false>
class RWTicketSpinLockT {
  typedef detail::RWTicketIntTrait<kBitWidth> IntTraitType;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::FullInt FullInt;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::HalfInt HalfInt;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::QuarterInt QuarterInt;

  union RWTicket {
    constexpr RWTicket() : whole(0) {}
    FullInt whole;
    HalfInt readWrite;
    __extension__ struct {
      QuarterInt write;
      QuarterInt read;
      QuarterInt users;
    };
  } ticket;

 private: // Some x64-specific utilities for atomic access to ticket.
  template <class T>
  static T load_acquire(T* addr) {
    T t = *addr; // acquire barrier
    asm_volatile_memory();
    return t;
  }

  template <class T>
  static void store_release(T* addr, T v) {
    asm_volatile_memory();
    *addr = v; // release barrier
  }

 public:
  constexpr RWTicketSpinLockT() {}

  RWTicketSpinLockT(RWTicketSpinLockT const&) = delete;
  RWTicketSpinLockT& operator=(RWTicketSpinLockT const&) = delete;

  void lock() {
    if (kFavorWriter) {
      writeLockAggressive();
    } else {
      writeLockNice();
    }
  }
  /*
   * Both try_lock and try_lock_shared diverge in our implementation from the
   * lock algorithm described in the link above.
   *
   * In the read case, it is undesirable that the readers could wait
   * for another reader (before increasing ticket.read in the other
   * implementation). Our approach gives up on
   * first-come-first-serve, but our benchmarks showed improved
   * performance for both readers and writers under heavily contended
   * cases, particularly when the number of threads exceeds the number
   * of logical CPUs.
   *
   * We have writeLockAggressive() using the original implementation
   * for a writer, which gives some advantage to the writer over the
   * readers---for that path it is guaranteed that the writer will
   * acquire the lock after all the existing readers exit.
   */
  bool try_lock() {
    RWTicket t;
    FullInt old = t.whole = load_acquire(&ticket.whole);
    if (t.users != t.write) {
      return false;
    }
    ++t.users;
    return __sync_bool_compare_and_swap(&ticket.whole, old, t.whole);
  }

  /*
   * Call this if you want to prioritize writer to avoid starvation.
   * Unlike writeLockNice, immediately acquires the write lock when
   * the existing readers (arriving before the writer) finish their
   * turns.
   */
  void writeLockAggressive() {
    // std::this_thread::yield() is needed here to avoid a pathology if the
    // number of threads attempting concurrent writes is >= the number of real
    // cores allocated to this process. This is less likely than the
    // corresponding situation in lock_shared(), but we still want to
    // avoid it
    uint_fast32_t count = 0;
    QuarterInt val = __sync_fetch_and_add(&ticket.users, 1);
    while (val != load_acquire(&ticket.write)) {
      asm_volatile_pause();
      if (UNLIKELY(++count > 1000)) {
        std::this_thread::yield();
      }
    }
  }

  // Call this when the writer should be nicer to the readers.
  void writeLockNice() {
    // Here it doesn't cpu-relax the writer.
    //
    // This is because usually we have many more readers than the
    // writers, so the writer has less chance to get the lock when
    // there are a lot of competing readers. The aggressive spinning
    // can help to avoid starving writers.
    //
    // We don't worry about std::this_thread::yield() here because the caller
    // has already explicitly abandoned fairness.
    while (!try_lock()) {
    }
  }
  // Atomically unlock the write-lock from writer and acquire the read-lock.
  void unlock_and_lock_shared() {
    QuarterInt val = __sync_fetch_and_add(&ticket.read, 1);
    (void)val; // silence the unused-variable warning
  }
  // Release writer permission on the lock.
  void unlock() {
    RWTicket t;
    t.whole = load_acquire(&ticket.whole);
#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
    FullInt old = t.whole;
    // SSE2 can reduce the lock and unlock overhead by 10%
    static const QuarterInt kDeltaBuf[4] = {1, 1, 0, 0}; // write/read/user
    static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
    __m128i m = IntTraitType::fromInteger(old);
    t.whole = IntTraitType::addParallel(m, kDelta);
#else
    ++t.read;
    ++t.write;
#endif
    store_release(&ticket.readWrite, t.readWrite);
  }

  void lock_shared() {
    // std::this_thread::yield() is important here because we can't grab the
    // shared lock if there is a pending writeLockAggressive, so we
    // need to let threads that already have a shared lock complete
    uint_fast32_t count = 0;
    while (!LIKELY(try_lock_shared())) {
      asm_volatile_pause();
      if (UNLIKELY((++count & 1023) == 0)) {
        std::this_thread::yield();
      }
    }
  }

  bool try_lock_shared() {
    RWTicket t, old;
    old.whole = t.whole = load_acquire(&ticket.whole);
    old.users = old.read;
#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
    // SSE2 may reduce the total lock and unlock overhead by 10%
    static const QuarterInt kDeltaBuf[4] = {0, 1, 1, 0}; // write/read/user
    static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
    __m128i m = IntTraitType::fromInteger(old.whole);
    t.whole = IntTraitType::addParallel(m, kDelta);
#else
    ++t.read;
    ++t.users;
#endif
    return __sync_bool_compare_and_swap(&ticket.whole, old.whole, t.whole);
  }

  void unlock_shared() {
    __sync_fetch_and_add(&ticket.write, 1);
  }
  class WriteHolder;

  typedef RWTicketSpinLockT<kBitWidth, kFavorWriter> RWSpinLock;

  class ReadHolder {
   public:
    ReadHolder(ReadHolder const&) = delete;
    ReadHolder& operator=(ReadHolder const&) = delete;

    explicit ReadHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) {
        lock_->lock_shared();
      }
    }

    explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) {
      if (lock_) {
        lock_->lock_shared();
      }
    }

    // atomically unlock the write-lock from writer and acquire the read-lock
    explicit ReadHolder(WriteHolder* writer) : lock_(nullptr) {
      std::swap(this->lock_, writer->lock_);
      if (lock_) {
        lock_->unlock_and_lock_shared();
      }
    }

    ~ReadHolder() {
      if (lock_) {
        lock_->unlock_shared();
      }
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock_) {
        lock_->unlock_shared();
      }
      lock_ = lock;
      if (lock_) {
        lock_->lock_shared();
      }
    }

    void swap(ReadHolder* other) {
      std::swap(this->lock_, other->lock_);
    }

   private:
    RWSpinLock* lock_;
  };

  class WriteHolder {
   public:
    WriteHolder(WriteHolder const&) = delete;
    WriteHolder& operator=(WriteHolder const&) = delete;

    explicit WriteHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) {
        lock_->lock();
      }
    }

    explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) {
      if (lock_) {
        lock_->lock();
      }
    }

    ~WriteHolder() {
      if (lock_) {
        lock_->unlock();
      }
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) {
        return;
      }
      if (lock_) {
        lock_->unlock();
      }
      lock_ = lock;
      if (lock_) {
        lock_->lock();
      }
    }

    void swap(WriteHolder* other) {
      std::swap(this->lock_, other->lock_);
    }

   private:
    friend class ReadHolder;
    RWSpinLock* lock_;
  };
};

typedef RWTicketSpinLockT<32> RWTicketSpinLock32;
typedef RWTicketSpinLockT<64> RWTicketSpinLock64;
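/*
 * A minimal usage sketch for the ticket variants (the reader()/writer()
 * functions and the sharedState variable are illustrative assumptions, not
 * part of this header). The ticket locks expose the same
 * lock()/lock_shared()/unlock()/unlock_shared() surface and nested holders as
 * RWSpinLock, so they can be dropped in where more balanced writer behavior
 * is wanted:
 *
 *   folly::RWTicketSpinLock64 ticketLock; // up to 2^16 - 1 concurrent users
 *   int sharedState = 0;
 *
 *   int reader() {
 *     folly::RWTicketSpinLock64::ReadHolder guard(&ticketLock);
 *     return sharedState;
 *   }
 *   void writer(int v) {
 *     folly::RWTicketSpinLock64::WriteHolder guard(&ticketLock);
 *     sharedState = v;
 *   }
 */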
#endif // RW_SPINLOCK_USE_X86_INTRINSIC_

} // namespace folly

#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
#undef RW_SPINLOCK_USE_X86_INTRINSIC_
#endif