1 | /* |
2 | * Copyright 2011-present Facebook, Inc. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | /* |
17 | * N.B. You most likely do _not_ want to use RWSpinLock or any other |
18 | * kind of spinlock. Use SharedMutex instead. |
19 | * |
20 | * In short, spinlocks in preemptive multi-tasking operating systems |
21 | * have serious problems and fast mutexes like SharedMutex are almost |
22 | * certainly the better choice, because letting the OS scheduler put a |
23 | * thread to sleep is better for system responsiveness and throughput |
24 | * than wasting a timeslice repeatedly querying a lock held by a |
25 | * thread that's blocked, and you can't prevent userspace |
 * programs from blocking.
27 | * |
28 | * Spinlocks in an operating system kernel make much more sense than |
29 | * they do in userspace. |
30 | * |
31 | * ------------------------------------------------------------------- |
32 | * |
33 | * Two Read-Write spin lock implementations. |
34 | * |
35 | * Ref: http://locklessinc.com/articles/locks |
36 | * |
37 | * Both locks here are faster than pthread_rwlock and have very low |
38 | * overhead (usually 20-30ns). They don't use any system mutexes and |
 * are very compact (4/8 bytes), so they are suitable for per-instance
 * locking, particularly when contention is not expected.
41 | * |
 * For a spinlock, RWSpinLock is a reasonable choice. (See the note
 * above for why a spin lock is frequently a bad idea in general.)
44 | * RWSpinLock has minimal overhead, and comparable contention |
45 | * performance when the number of competing threads is less than or |
46 | * equal to the number of logical CPUs. Even as the number of |
47 | * threads gets larger, RWSpinLock can still be very competitive in |
48 | * READ, although it is slower on WRITE, and also inherently unfair |
49 | * to writers. |
50 | * |
 * RWTicketSpinLock shows more balanced READ/WRITE performance. If
 * your application really needs a lot more threads than logical CPUs,
 * and a higher-priority writer, prefer one of the RWTicketSpinLock
 * locks.
54 | * |
55 | * Caveats: |
56 | * |
57 | * RWTicketSpinLock locks can only be used with GCC on x86/x86-64 |
58 | * based systems. |
59 | * |
60 | * RWTicketSpinLock<32> only allows up to 2^8 - 1 concurrent |
61 | * readers and writers. |
62 | * |
63 | * RWTicketSpinLock<64> only allows up to 2^16 - 1 concurrent |
64 | * readers and writers. |
65 | * |
66 | * RWTicketSpinLock<..., true> (kFavorWriter = true, that is, strict |
67 | * writer priority) is NOT reentrant, even for lock_shared(). |
68 | * |
69 | * The lock will not grant any new shared (read) accesses while a thread |
70 | * attempting to acquire the lock in write mode is blocked. (That is, |
71 | * if the lock is held in shared mode by N threads, and a thread attempts |
72 | * to acquire it in write mode, no one else can acquire it in shared mode |
73 | * until these N threads release the lock and then the blocked thread |
74 | * acquires and releases the exclusive lock.) This also applies for |
75 | * attempts to reacquire the lock in shared mode by threads that already |
76 | * hold it in shared mode, making the lock non-reentrant. |
77 | * |
78 | * RWSpinLock handles 2^30 - 1 concurrent readers. |
79 | * |
80 | * @author Xin Liu <xliux@fb.com> |
81 | */ |
82 | |
83 | #pragma once |
84 | |
85 | /* |
86 | ======================================================================== |
Benchmark on (Intel(R) Xeon(R) CPU L5630 @ 2.13GHz) 8 cores (16 HTs)
88 | ======================================================================== |
89 | |
90 | ------------------------------------------------------------------------------ |
91 | 1. Single thread benchmark (read/write lock + unlock overhead) |
92 | Benchmark Iters Total t t/iter iter/sec |
93 | ------------------------------------------------------------------------------- |
94 | * BM_RWSpinLockRead 100000 1.786 ms 17.86 ns 53.4M |
95 | +30.5% BM_RWSpinLockWrite 100000 2.331 ms 23.31 ns 40.91M |
96 | +85.7% BM_RWTicketSpinLock32Read 100000 3.317 ms 33.17 ns 28.75M |
97 | +96.0% BM_RWTicketSpinLock32Write 100000 3.5 ms 35 ns 27.25M |
98 | +85.6% BM_RWTicketSpinLock64Read 100000 3.315 ms 33.15 ns 28.77M |
99 | +96.0% BM_RWTicketSpinLock64Write 100000 3.5 ms 35 ns 27.25M |
100 | +85.7% BM_RWTicketSpinLock32FavorWriterRead 100000 3.317 ms 33.17 ns 28.75M |
101 | +29.7% BM_RWTicketSpinLock32FavorWriterWrite 100000 2.316 ms 23.16 ns 41.18M |
102 | +85.3% BM_RWTicketSpinLock64FavorWriterRead 100000 3.309 ms 33.09 ns 28.82M |
103 | +30.2% BM_RWTicketSpinLock64FavorWriterWrite 100000 2.325 ms 23.25 ns 41.02M |
104 | + 175% BM_PThreadRWMutexRead 100000 4.917 ms 49.17 ns 19.4M |
105 | + 166% BM_PThreadRWMutexWrite 100000 4.757 ms 47.57 ns 20.05M |
106 | |
107 | ------------------------------------------------------------------------------ |
108 | 2. Contention Benchmark 90% read 10% write |
109 | Benchmark hits average min max sigma |
110 | ------------------------------------------------------------------------------ |
111 | ---------- 8 threads ------------ |
112 | RWSpinLock Write 142666 220ns 78ns 40.8us 269ns |
113 | RWSpinLock Read 1282297 222ns 80ns 37.7us 248ns |
114 | RWTicketSpinLock Write 85692 209ns 71ns 17.9us 252ns |
115 | RWTicketSpinLock Read 769571 215ns 78ns 33.4us 251ns |
116 | pthread_rwlock_t Write 84248 2.48us 99ns 269us 8.19us |
117 | pthread_rwlock_t Read 761646 933ns 101ns 374us 3.25us |
118 | |
119 | ---------- 16 threads ------------ |
120 | RWSpinLock Write 124236 237ns 78ns 261us 801ns |
121 | RWSpinLock Read 1115807 236ns 78ns 2.27ms 2.17us |
122 | RWTicketSpinLock Write 81781 231ns 71ns 31.4us 351ns |
123 | RWTicketSpinLock Read 734518 238ns 78ns 73.6us 379ns |
124 | pthread_rwlock_t Write 83363 7.12us 99ns 785us 28.1us |
125 | pthread_rwlock_t Read 754978 2.18us 101ns 1.02ms 14.3us |
126 | |
127 | ---------- 50 threads ------------ |
128 | RWSpinLock Write 131142 1.37us 82ns 7.53ms 68.2us |
129 | RWSpinLock Read 1181240 262ns 78ns 6.62ms 12.7us |
130 | RWTicketSpinLock Write 83045 397ns 73ns 7.01ms 31.5us |
131 | RWTicketSpinLock Read 744133 386ns 78ns 11ms 31.4us |
132 | pthread_rwlock_t Write 80849 112us 103ns 4.52ms 263us |
133 | pthread_rwlock_t Read 728698 24us 101ns 7.28ms 194us |
134 | |
135 | */ |
136 | |
137 | #include <folly/Portability.h> |
138 | #include <folly/portability/Asm.h> |
139 | |
140 | #if defined(__GNUC__) && (defined(__i386) || FOLLY_X64 || defined(ARCH_K8)) |
141 | #define RW_SPINLOCK_USE_X86_INTRINSIC_ |
142 | #include <x86intrin.h> |
#elif defined(_MSC_VER) && FOLLY_X64
144 | #define RW_SPINLOCK_USE_X86_INTRINSIC_ |
145 | #elif FOLLY_AARCH64 |
146 | #define RW_SPINLOCK_USE_X86_INTRINSIC_ |
147 | #else |
148 | #undef RW_SPINLOCK_USE_X86_INTRINSIC_ |
149 | #endif |
150 | |
151 | // iOS doesn't define _mm_cvtsi64_si128 and friends |
152 | #if (FOLLY_SSE >= 2) && !FOLLY_MOBILE && FOLLY_X64 |
153 | #define RW_SPINLOCK_USE_SSE_INSTRUCTIONS_ |
154 | #else |
155 | #undef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_ |
156 | #endif |
157 | |
158 | #include <algorithm> |
159 | #include <atomic> |
160 | #include <thread> |
161 | |
162 | #include <folly/Likely.h> |
163 | |
164 | namespace folly { |
165 | |
166 | /* |
 * A simple, small (4-byte), but unfair rwlock. Use it when writers can
 * tolerate waiting behind readers and you don't expect a lot of
 * write/read contention, or when you need small rwlocks because you
 * are creating a large number of them.
171 | * |
172 | * Note that the unfairness here is extreme: if the lock is |
173 | * continually accessed for read, writers will never get a chance. If |
174 | * the lock can be that highly contended this class is probably not an |
175 | * ideal choice anyway. |
176 | * |
 * It currently implements most of the Lockable, SharedLockable and
 * UpgradeLockable concepts except the TimedLockable-related
 * locking/unlocking interfaces.
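 *
 * Because the basic (non-timed) operations are all present, the lock also
 * composes with the standard guards; a minimal illustrative sketch,
 * assuming the user includes <mutex> and <shared_mutex>:
 *
 *   folly::RWSpinLock lock;
 *   { std::lock_guard<folly::RWSpinLock> g(lock); }   // exclusive section
 *   { std::shared_lock<folly::RWSpinLock> s(lock); }  // shared section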
180 | */ |
181 | class RWSpinLock { |
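  // Layout of bits_: bit 0 is the WRITER bit, bit 1 the UPGRADED bit, and
  // the remaining 30 bits count readers in units of READER (hence the
  // 2^30 - 1 concurrent-reader limit noted in the header comment).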
182 | enum : int32_t { READER = 4, UPGRADED = 2, WRITER = 1 }; |
183 | |
184 | public: |
185 | constexpr RWSpinLock() : bits_(0) {} |
186 | |
187 | RWSpinLock(RWSpinLock const&) = delete; |
188 | RWSpinLock& operator=(RWSpinLock const&) = delete; |
189 | |
190 | // Lockable Concept |
191 | void lock() { |
192 | uint_fast32_t count = 0; |
193 | while (!LIKELY(try_lock())) { |
194 | if (++count > 1000) { |
195 | std::this_thread::yield(); |
196 | } |
197 | } |
198 | } |
199 | |
200 | // Writer is responsible for clearing up both the UPGRADED and WRITER bits. |
201 | void unlock() { |
    static_assert(READER > WRITER + UPGRADED, "wrong bits!");
203 | bits_.fetch_and(~(WRITER | UPGRADED), std::memory_order_release); |
204 | } |
205 | |
206 | // SharedLockable Concept |
207 | void lock_shared() { |
208 | uint_fast32_t count = 0; |
209 | while (!LIKELY(try_lock_shared())) { |
210 | if (++count > 1000) { |
211 | std::this_thread::yield(); |
212 | } |
213 | } |
214 | } |
215 | |
216 | void unlock_shared() { |
217 | bits_.fetch_add(-READER, std::memory_order_release); |
218 | } |
219 | |
220 | // Downgrade the lock from writer status to reader status. |
221 | void unlock_and_lock_shared() { |
222 | bits_.fetch_add(READER, std::memory_order_acquire); |
223 | unlock(); |
224 | } |
225 | |
226 | // UpgradeLockable Concept |
227 | void lock_upgrade() { |
228 | uint_fast32_t count = 0; |
229 | while (!try_lock_upgrade()) { |
230 | if (++count > 1000) { |
231 | std::this_thread::yield(); |
232 | } |
233 | } |
234 | } |
235 | |
236 | void unlock_upgrade() { |
237 | bits_.fetch_add(-UPGRADED, std::memory_order_acq_rel); |
238 | } |
239 | |
240 | // unlock upgrade and try to acquire write lock |
241 | void unlock_upgrade_and_lock() { |
    uint_fast32_t count = 0;
243 | while (!try_unlock_upgrade_and_lock()) { |
244 | if (++count > 1000) { |
245 | std::this_thread::yield(); |
246 | } |
247 | } |
248 | } |
249 | |
250 | // unlock upgrade and read lock atomically |
251 | void unlock_upgrade_and_lock_shared() { |
252 | bits_.fetch_add(READER - UPGRADED, std::memory_order_acq_rel); |
253 | } |
254 | |
255 | // write unlock and upgrade lock atomically |
256 | void unlock_and_lock_upgrade() { |
257 | // need to do it in two steps here -- as the UPGRADED bit might be OR-ed at |
258 | // the same time when other threads are trying do try_lock_upgrade(). |
259 | bits_.fetch_or(UPGRADED, std::memory_order_acquire); |
260 | bits_.fetch_add(-WRITER, std::memory_order_release); |
261 | } |
262 | |
263 | // Attempt to acquire writer permission. Return false if we didn't get it. |
264 | bool try_lock() { |
265 | int32_t expect = 0; |
266 | return bits_.compare_exchange_strong( |
267 | expect, WRITER, std::memory_order_acq_rel); |
268 | } |
269 | |
270 | // Try to get reader permission on the lock. This can fail if we |
271 | // find out someone is a writer or upgrader. |
272 | // Setting the UPGRADED bit would allow a writer-to-be to indicate |
273 | // its intention to write and block any new readers while waiting |
274 | // for existing readers to finish and release their read locks. This |
275 | // helps avoid starving writers (promoted from upgraders). |
276 | bool try_lock_shared() { |
277 | // fetch_add is considerably (100%) faster than compare_exchange, |
278 | // so here we are optimizing for the common (lock success) case. |
279 | int32_t value = bits_.fetch_add(READER, std::memory_order_acquire); |
280 | if (UNLIKELY(value & (WRITER | UPGRADED))) { |
281 | bits_.fetch_add(-READER, std::memory_order_release); |
282 | return false; |
283 | } |
284 | return true; |
285 | } |
286 | |
287 | // try to unlock upgrade and write lock atomically |
288 | bool try_unlock_upgrade_and_lock() { |
289 | int32_t expect = UPGRADED; |
290 | return bits_.compare_exchange_strong( |
291 | expect, WRITER, std::memory_order_acq_rel); |
292 | } |
293 | |
294 | // try to acquire an upgradable lock. |
295 | bool try_lock_upgrade() { |
296 | int32_t value = bits_.fetch_or(UPGRADED, std::memory_order_acquire); |
297 | |
298 | // Note: when failed, we cannot flip the UPGRADED bit back, |
299 | // as in this case there is either another upgrade lock or a write lock. |
300 | // If it's a write lock, the bit will get cleared up when that lock's done |
301 | // with unlock(). |
302 | return ((value & (UPGRADED | WRITER)) == 0); |
303 | } |
304 | |
305 | // mainly for debugging purposes. |
306 | int32_t bits() const { |
307 | return bits_.load(std::memory_order_acquire); |
308 | } |
309 | |
310 | class ReadHolder; |
311 | class UpgradedHolder; |
312 | class WriteHolder; |
313 | |
314 | class ReadHolder { |
315 | public: |
316 | explicit ReadHolder(RWSpinLock* lock) : lock_(lock) { |
317 | if (lock_) { |
318 | lock_->lock_shared(); |
319 | } |
320 | } |
321 | |
322 | explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) { |
323 | lock_->lock_shared(); |
324 | } |
325 | |
326 | ReadHolder(ReadHolder&& other) noexcept : lock_(other.lock_) { |
327 | other.lock_ = nullptr; |
328 | } |
329 | |
330 | // down-grade |
331 | explicit ReadHolder(UpgradedHolder&& upgraded) : lock_(upgraded.lock_) { |
332 | upgraded.lock_ = nullptr; |
333 | if (lock_) { |
334 | lock_->unlock_upgrade_and_lock_shared(); |
335 | } |
336 | } |
337 | |
338 | explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) { |
339 | writer.lock_ = nullptr; |
340 | if (lock_) { |
341 | lock_->unlock_and_lock_shared(); |
342 | } |
343 | } |
344 | |
345 | ReadHolder& operator=(ReadHolder&& other) { |
346 | using std::swap; |
347 | swap(lock_, other.lock_); |
348 | return *this; |
349 | } |
350 | |
351 | ReadHolder(const ReadHolder& other) = delete; |
352 | ReadHolder& operator=(const ReadHolder& other) = delete; |
353 | |
354 | ~ReadHolder() { |
355 | if (lock_) { |
356 | lock_->unlock_shared(); |
357 | } |
358 | } |
359 | |
360 | void reset(RWSpinLock* lock = nullptr) { |
361 | if (lock == lock_) { |
362 | return; |
363 | } |
364 | if (lock_) { |
365 | lock_->unlock_shared(); |
366 | } |
367 | lock_ = lock; |
368 | if (lock_) { |
369 | lock_->lock_shared(); |
370 | } |
371 | } |
372 | |
373 | void swap(ReadHolder* other) { |
374 | std::swap(lock_, other->lock_); |
375 | } |
376 | |
377 | private: |
378 | friend class UpgradedHolder; |
379 | friend class WriteHolder; |
380 | RWSpinLock* lock_; |
381 | }; |
382 | |
383 | class UpgradedHolder { |
384 | public: |
385 | explicit UpgradedHolder(RWSpinLock* lock) : lock_(lock) { |
386 | if (lock_) { |
387 | lock_->lock_upgrade(); |
388 | } |
389 | } |
390 | |
391 | explicit UpgradedHolder(RWSpinLock& lock) : lock_(&lock) { |
392 | lock_->lock_upgrade(); |
393 | } |
394 | |
395 | explicit UpgradedHolder(WriteHolder&& writer) { |
396 | lock_ = writer.lock_; |
397 | writer.lock_ = nullptr; |
398 | if (lock_) { |
399 | lock_->unlock_and_lock_upgrade(); |
400 | } |
401 | } |
402 | |
403 | UpgradedHolder(UpgradedHolder&& other) noexcept : lock_(other.lock_) { |
404 | other.lock_ = nullptr; |
405 | } |
406 | |
407 | UpgradedHolder& operator=(UpgradedHolder&& other) { |
408 | using std::swap; |
409 | swap(lock_, other.lock_); |
410 | return *this; |
411 | } |
412 | |
413 | UpgradedHolder(const UpgradedHolder& other) = delete; |
414 | UpgradedHolder& operator=(const UpgradedHolder& other) = delete; |
415 | |
416 | ~UpgradedHolder() { |
417 | if (lock_) { |
418 | lock_->unlock_upgrade(); |
419 | } |
420 | } |
421 | |
422 | void reset(RWSpinLock* lock = nullptr) { |
423 | if (lock == lock_) { |
424 | return; |
425 | } |
426 | if (lock_) { |
427 | lock_->unlock_upgrade(); |
428 | } |
429 | lock_ = lock; |
430 | if (lock_) { |
431 | lock_->lock_upgrade(); |
432 | } |
433 | } |
434 | |
435 | void swap(UpgradedHolder* other) { |
436 | using std::swap; |
437 | swap(lock_, other->lock_); |
438 | } |
439 | |
440 | private: |
441 | friend class WriteHolder; |
442 | friend class ReadHolder; |
443 | RWSpinLock* lock_; |
444 | }; |
445 | |
446 | class WriteHolder { |
447 | public: |
448 | explicit WriteHolder(RWSpinLock* lock) : lock_(lock) { |
449 | if (lock_) { |
450 | lock_->lock(); |
451 | } |
452 | } |
453 | |
454 | explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) { |
455 | lock_->lock(); |
456 | } |
457 | |
458 | // promoted from an upgrade lock holder |
459 | explicit WriteHolder(UpgradedHolder&& upgraded) { |
460 | lock_ = upgraded.lock_; |
461 | upgraded.lock_ = nullptr; |
462 | if (lock_) { |
463 | lock_->unlock_upgrade_and_lock(); |
464 | } |
465 | } |
466 | |
467 | WriteHolder(WriteHolder&& other) noexcept : lock_(other.lock_) { |
468 | other.lock_ = nullptr; |
469 | } |
470 | |
471 | WriteHolder& operator=(WriteHolder&& other) { |
472 | using std::swap; |
473 | swap(lock_, other.lock_); |
474 | return *this; |
475 | } |
476 | |
477 | WriteHolder(const WriteHolder& other) = delete; |
478 | WriteHolder& operator=(const WriteHolder& other) = delete; |
479 | |
480 | ~WriteHolder() { |
481 | if (lock_) { |
482 | lock_->unlock(); |
483 | } |
484 | } |
485 | |
486 | void reset(RWSpinLock* lock = nullptr) { |
487 | if (lock == lock_) { |
488 | return; |
489 | } |
490 | if (lock_) { |
491 | lock_->unlock(); |
492 | } |
493 | lock_ = lock; |
494 | if (lock_) { |
495 | lock_->lock(); |
496 | } |
497 | } |
498 | |
499 | void swap(WriteHolder* other) { |
500 | using std::swap; |
501 | swap(lock_, other->lock_); |
502 | } |
503 | |
504 | private: |
505 | friend class ReadHolder; |
506 | friend class UpgradedHolder; |
507 | RWSpinLock* lock_; |
508 | }; |
509 | |
510 | private: |
511 | std::atomic<int32_t> bits_; |
512 | }; |
513 | |
514 | #ifdef RW_SPINLOCK_USE_X86_INTRINSIC_ |
// A more balanced Read-Write spin lock implemented using GCC intrinsics.
516 | |
517 | namespace detail { |
518 | template <size_t kBitWidth> |
519 | struct RWTicketIntTrait { |
520 | static_assert( |
521 | kBitWidth == 32 || kBitWidth == 64, |
522 | "bit width has to be either 32 or 64 " ); |
523 | }; |
524 | |
525 | template <> |
526 | struct RWTicketIntTrait<64> { |
527 | typedef uint64_t FullInt; |
528 | typedef uint32_t HalfInt; |
529 | typedef uint16_t QuarterInt; |
530 | |
531 | #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_ |
532 | static __m128i make128(const uint16_t v[4]) { |
533 | return _mm_set_epi16( |
534 | 0, 0, 0, 0, short(v[3]), short(v[2]), short(v[1]), short(v[0])); |
535 | } |
536 | static inline __m128i fromInteger(uint64_t from) { |
537 | return _mm_cvtsi64_si128(int64_t(from)); |
538 | } |
539 | static inline uint64_t toInteger(__m128i in) { |
540 | return uint64_t(_mm_cvtsi128_si64(in)); |
541 | } |
542 | static inline uint64_t addParallel(__m128i in, __m128i kDelta) { |
543 | return toInteger(_mm_add_epi16(in, kDelta)); |
544 | } |
545 | #endif |
546 | }; |
547 | |
548 | template <> |
549 | struct RWTicketIntTrait<32> { |
550 | typedef uint32_t FullInt; |
551 | typedef uint16_t HalfInt; |
552 | typedef uint8_t QuarterInt; |
553 | |
554 | #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_ |
555 | static __m128i make128(const uint8_t v[4]) { |
556 | // clang-format off |
557 | return _mm_set_epi8( |
558 | 0, 0, 0, 0, |
559 | 0, 0, 0, 0, |
560 | 0, 0, 0, 0, |
561 | char(v[3]), char(v[2]), char(v[1]), char(v[0])); |
562 | // clang-format on |
563 | } |
564 | static inline __m128i fromInteger(uint32_t from) { |
565 | return _mm_cvtsi32_si128(int32_t(from)); |
566 | } |
567 | static inline uint32_t toInteger(__m128i in) { |
568 | return uint32_t(_mm_cvtsi128_si32(in)); |
569 | } |
570 | static inline uint32_t addParallel(__m128i in, __m128i kDelta) { |
571 | return toInteger(_mm_add_epi8(in, kDelta)); |
572 | } |
573 | #endif |
574 | }; |
575 | } // namespace detail |
576 | |
577 | template <size_t kBitWidth, bool kFavorWriter = false> |
578 | class RWTicketSpinLockT { |
579 | typedef detail::RWTicketIntTrait<kBitWidth> IntTraitType; |
580 | typedef typename detail::RWTicketIntTrait<kBitWidth>::FullInt FullInt; |
581 | typedef typename detail::RWTicketIntTrait<kBitWidth>::HalfInt HalfInt; |
582 | typedef typename detail::RWTicketIntTrait<kBitWidth>::QuarterInt QuarterInt; |
583 | |
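  // Ticket bookkeeping: `users` is the next ticket to hand out, and every
  // acquisition claims one by incrementing it. A writer holding ticket t
  // spins until `write` == t, and every release (shared or exclusive)
  // advances `write`. A reader may enter only while `users` == `read`,
  // which holds exactly when no writer holds the lock or is queued for it.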
584 | union RWTicket { |
585 | constexpr RWTicket() : whole(0) {} |
586 | FullInt whole; |
587 | HalfInt readWrite; |
588 | __extension__ struct { |
589 | QuarterInt write; |
590 | QuarterInt read; |
591 | QuarterInt users; |
592 | }; |
593 | } ticket; |
594 | |
 private: // Some x86-specific utilities for atomic access to ticket.
  // On x86's TSO memory model, a plain aligned load already has acquire
  // semantics and a plain store has release semantics; asm_volatile_memory()
  // is only a compiler barrier that prevents compiler reordering.
596 | template <class T> |
597 | static T load_acquire(T* addr) { |
598 | T t = *addr; // acquire barrier |
599 | asm_volatile_memory(); |
600 | return t; |
601 | } |
602 | |
603 | template <class T> |
604 | static void store_release(T* addr, T v) { |
605 | asm_volatile_memory(); |
606 | *addr = v; // release barrier |
607 | } |
608 | |
609 | public: |
610 | constexpr RWTicketSpinLockT() {} |
611 | |
612 | RWTicketSpinLockT(RWTicketSpinLockT const&) = delete; |
613 | RWTicketSpinLockT& operator=(RWTicketSpinLockT const&) = delete; |
614 | |
615 | void lock() { |
616 | if (kFavorWriter) { |
617 | writeLockAggressive(); |
618 | } else { |
619 | writeLockNice(); |
620 | } |
621 | } |
622 | |
623 | /* |
624 | * Both try_lock and try_lock_shared diverge in our implementation from the |
625 | * lock algorithm described in the link above. |
626 | * |
627 | * In the read case, it is undesirable that the readers could wait |
628 | * for another reader (before increasing ticket.read in the other |
 * implementation). Our approach gives up on first-come-first-served
 * ordering, but our benchmarks showed improved performance for both
 * readers and writers in heavily contended cases, particularly when
 * the number of threads exceeds the number of logical CPUs.
634 | * |
635 | * We have writeLockAggressive() using the original implementation |
636 | * for a writer, which gives some advantage to the writer over the |
637 | * readers---for that path it is guaranteed that the writer will |
638 | * acquire the lock after all the existing readers exit. |
639 | */ |
640 | bool try_lock() { |
641 | RWTicket t; |
642 | FullInt old = t.whole = load_acquire(&ticket.whole); |
    if (t.users != t.write) { // held, or someone is queued ahead of us
644 | return false; |
645 | } |
    ++t.users; // claim the next ticket
647 | return __sync_bool_compare_and_swap(&ticket.whole, old, t.whole); |
648 | } |
649 | |
650 | /* |
651 | * Call this if you want to prioritize writer to avoid starvation. |
652 | * Unlike writeLockNice, immediately acquires the write lock when |
653 | * the existing readers (arriving before the writer) finish their |
654 | * turns. |
655 | */ |
656 | void writeLockAggressive() { |
657 | // std::this_thread::yield() is needed here to avoid a pathology if the |
658 | // number of threads attempting concurrent writes is >= the number of real |
659 | // cores allocated to this process. This is less likely than the |
660 | // corresponding situation in lock_shared(), but we still want to |
    // avoid it.
662 | uint_fast32_t count = 0; |
663 | QuarterInt val = __sync_fetch_and_add(&ticket.users, 1); |
664 | while (val != load_acquire(&ticket.write)) { |
665 | asm_volatile_pause(); |
666 | if (UNLIKELY(++count > 1000)) { |
667 | std::this_thread::yield(); |
668 | } |
669 | } |
670 | } |
671 | |
672 | // Call this when the writer should be nicer to the readers. |
673 | void writeLockNice() { |
674 | // Here it doesn't cpu-relax the writer. |
675 | // |
676 | // This is because usually we have many more readers than the |
677 | // writers, so the writer has less chance to get the lock when |
678 | // there are a lot of competing readers. The aggressive spinning |
679 | // can help to avoid starving writers. |
680 | // |
681 | // We don't worry about std::this_thread::yield() here because the caller |
682 | // has already explicitly abandoned fairness. |
683 | while (!try_lock()) { |
684 | } |
685 | } |
686 | |
687 | // Atomically unlock the write-lock from writer and acquire the read-lock. |
  void unlock_and_lock_shared() {
    // Take the next read ticket without touching `users`: this converts our
    // write slot into a read slot atomically, never releasing the lock.
    __sync_fetch_and_add(&ticket.read, 1);
  }
691 | |
692 | // Release writer permission on the lock. |
693 | void unlock() { |
694 | RWTicket t; |
695 | t.whole = load_acquire(&ticket.whole); |
696 | |
697 | #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_ |
698 | FullInt old = t.whole; |
699 | // SSE2 can reduce the lock and unlock overhead by 10% |
    static const QuarterInt kDeltaBuf[4] = {1, 1, 0, 0}; // {write, read, users, 0}
701 | static const __m128i kDelta = IntTraitType::make128(kDeltaBuf); |
702 | __m128i m = IntTraitType::fromInteger(old); |
703 | t.whole = IntTraitType::addParallel(m, kDelta); |
704 | #else |
705 | ++t.read; |
706 | ++t.write; |
707 | #endif |
708 | store_release(&ticket.readWrite, t.readWrite); |
709 | } |
710 | |
711 | void lock_shared() { |
712 | // std::this_thread::yield() is important here because we can't grab the |
713 | // shared lock if there is a pending writeLockAggressive, so we |
    // need to let threads that already have a shared lock complete.
715 | uint_fast32_t count = 0; |
716 | while (!LIKELY(try_lock_shared())) { |
717 | asm_volatile_pause(); |
718 | if (UNLIKELY((++count & 1023) == 0)) { |
719 | std::this_thread::yield(); |
720 | } |
721 | } |
722 | } |
723 | |
724 | bool try_lock_shared() { |
725 | RWTicket t, old; |
726 | old.whole = t.whole = load_acquire(&ticket.whole); |
    // Requiring users == read for the CAS below to succeed filters out the
    // case where a writer currently holds the lock or is queued for it.
    old.users = old.read;
728 | #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_ |
729 | // SSE2 may reduce the total lock and unlock overhead by 10% |
    static const QuarterInt kDeltaBuf[4] = {0, 1, 1, 0}; // {write, read, users, 0}
731 | static const __m128i kDelta = IntTraitType::make128(kDeltaBuf); |
732 | __m128i m = IntTraitType::fromInteger(old.whole); |
733 | t.whole = IntTraitType::addParallel(m, kDelta); |
734 | #else |
735 | ++t.read; |
736 | ++t.users; |
737 | #endif |
738 | return __sync_bool_compare_and_swap(&ticket.whole, old.whole, t.whole); |
739 | } |
740 | |
  void unlock_shared() {
    // A reader's release advances the write ticket, which is what a queued
    // writer spins on (see writeLockAggressive()).
    __sync_fetch_and_add(&ticket.write, 1);
  }
744 | |
745 | class WriteHolder; |
746 | |
747 | typedef RWTicketSpinLockT<kBitWidth, kFavorWriter> RWSpinLock; |
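  // N.B. within this class the RWSpinLock name refers to this ticket lock
  // via the member typedef above, not to folly::RWSpinLock; this lets the
  // holders below mirror the holder interfaces of folly::RWSpinLock.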
748 | class ReadHolder { |
749 | public: |
750 | ReadHolder(ReadHolder const&) = delete; |
751 | ReadHolder& operator=(ReadHolder const&) = delete; |
752 | |
753 | explicit ReadHolder(RWSpinLock* lock) : lock_(lock) { |
754 | if (lock_) { |
755 | lock_->lock_shared(); |
756 | } |
757 | } |
758 | |
759 | explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) { |
760 | if (lock_) { |
761 | lock_->lock_shared(); |
762 | } |
763 | } |
764 | |
765 | // atomically unlock the write-lock from writer and acquire the read-lock |
766 | explicit ReadHolder(WriteHolder* writer) : lock_(nullptr) { |
767 | std::swap(this->lock_, writer->lock_); |
768 | if (lock_) { |
769 | lock_->unlock_and_lock_shared(); |
770 | } |
771 | } |
772 | |
773 | ~ReadHolder() { |
774 | if (lock_) { |
775 | lock_->unlock_shared(); |
776 | } |
777 | } |
778 | |
    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) {
        return;
      }
      if (lock_) {
        lock_->unlock_shared();
      }
      lock_ = lock;
      if (lock_) {
        lock_->lock_shared();
      }
    }
788 | |
789 | void swap(ReadHolder* other) { |
790 | std::swap(this->lock_, other->lock_); |
791 | } |
792 | |
793 | private: |
794 | RWSpinLock* lock_; |
795 | }; |
796 | |
797 | class WriteHolder { |
798 | public: |
799 | WriteHolder(WriteHolder const&) = delete; |
800 | WriteHolder& operator=(WriteHolder const&) = delete; |
801 | |
802 | explicit WriteHolder(RWSpinLock* lock) : lock_(lock) { |
803 | if (lock_) { |
804 | lock_->lock(); |
805 | } |
806 | } |
807 | explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) { |
808 | if (lock_) { |
809 | lock_->lock(); |
810 | } |
811 | } |
812 | |
813 | ~WriteHolder() { |
814 | if (lock_) { |
815 | lock_->unlock(); |
816 | } |
817 | } |
818 | |
819 | void reset(RWSpinLock* lock = nullptr) { |
820 | if (lock == lock_) { |
821 | return; |
822 | } |
823 | if (lock_) { |
824 | lock_->unlock(); |
825 | } |
826 | lock_ = lock; |
827 | if (lock_) { |
828 | lock_->lock(); |
829 | } |
830 | } |
831 | |
832 | void swap(WriteHolder* other) { |
833 | std::swap(this->lock_, other->lock_); |
834 | } |
835 | |
836 | private: |
837 | friend class ReadHolder; |
838 | RWSpinLock* lock_; |
839 | }; |
840 | }; |
841 | |
842 | typedef RWTicketSpinLockT<32> RWTicketSpinLock32; |
843 | typedef RWTicketSpinLockT<64> RWTicketSpinLock64; |
844 | |
845 | #endif // RW_SPINLOCK_USE_X86_INTRINSIC_ |
846 | |
847 | } // namespace folly |
848 | |
849 | #ifdef RW_SPINLOCK_USE_X86_INTRINSIC_ |
850 | #undef RW_SPINLOCK_USE_X86_INTRINSIC_ |
851 | #endif |
852 | |