1 | /* |
2 | * Copyright 2015-present Facebook, Inc. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | |
17 | // @author Nathan Bronson (ngbronson@fb.com) |
18 | |
19 | #pragma once |
20 | |
21 | #include <stdint.h> |
22 | |
#include <atomic>
#include <chrono>
#include <cstring>
#include <limits>
#include <mutex>
#include <thread>
#include <type_traits>
26 | |
27 | #include <folly/CPortability.h> |
28 | #include <folly/Likely.h> |
29 | #include <folly/concurrency/CacheLocality.h> |
30 | #include <folly/detail/Futex.h> |
31 | #include <folly/portability/Asm.h> |
32 | #include <folly/portability/SysResource.h> |
33 | #include <folly/synchronization/SanitizeThread.h> |
34 | |
35 | // SharedMutex is a reader-writer lock. It is small, very fast, scalable |
36 | // on multi-core, and suitable for use when readers or writers may block. |
37 | // Unlike most other reader-writer locks, its throughput with concurrent |
38 | // readers scales linearly; it is able to acquire and release the lock |
39 | // in shared mode without cache line ping-ponging. It is suitable for |
40 | // a wide range of lock hold times because it starts with spinning, |
41 | // proceeds to using sched_yield with a preemption heuristic, and then |
42 | // waits using futex and precise wakeups. |
43 | // |
44 | // SharedMutex provides all of the methods of folly::RWSpinLock, |
45 | // boost::shared_mutex, boost::upgrade_mutex, and C++14's |
46 | // std::shared_timed_mutex. All operations that can block are available |
47 | // in try, try-for, and try-until (system_clock or steady_clock) versions. |
48 | // |
49 | // SharedMutexReadPriority gives priority to readers, |
50 | // SharedMutexWritePriority gives priority to writers. SharedMutex is an |
51 | // alias for SharedMutexWritePriority, because writer starvation is more |
// likely than reader starvation for the read-heavy workloads targeted
53 | // by SharedMutex. |
54 | // |
55 | // In my tests SharedMutex is as good or better than the other |
56 | // reader-writer locks in use at Facebook for almost all use cases, |
57 | // sometimes by a wide margin. (If it is rare that there are actually |
58 | // concurrent readers then RWSpinLock can be a few nanoseconds faster.) |
59 | // I compared it to folly::RWSpinLock, folly::RWTicketSpinLock64, |
60 | // boost::shared_mutex, pthread_rwlock_t, and a RWLock that internally uses |
61 | // spinlocks to guard state and pthread_mutex_t+pthread_cond_t to block. |
62 | // (Thrift's ReadWriteMutex is based underneath on pthread_rwlock_t.) |
63 | // It is generally as good or better than the rest when evaluating size, |
64 | // speed, scalability, or latency outliers. In the corner cases where |
65 | // it is not the fastest (such as single-threaded use or heavy write |
66 | // contention) it is never very much worse than the best. See the bottom |
67 | // of folly/test/SharedMutexTest.cpp for lots of microbenchmark results. |
68 | // |
69 | // Comparison to folly::RWSpinLock: |
70 | // |
71 | // * SharedMutex is faster than RWSpinLock when there are actually |
72 | // concurrent read accesses (sometimes much faster), and ~5 nanoseconds |
73 | // slower when there is not actually any contention. SharedMutex is |
74 | // faster in every (benchmarked) scenario where the shared mode of |
75 | // the lock is actually useful. |
76 | // |
77 | // * Concurrent shared access to SharedMutex scales linearly, while total |
78 | // RWSpinLock throughput drops as more threads try to access the lock |
79 | // in shared mode. Under very heavy read contention SharedMutex can |
80 | // be two orders of magnitude faster than RWSpinLock (or any reader |
81 | // writer lock that doesn't use striping or deferral). |
82 | // |
83 | // * SharedMutex can safely protect blocking calls, because after an |
84 | // initial period of spinning it waits using futex(). |
85 | // |
86 | // * RWSpinLock prioritizes readers, SharedMutex has both reader- and |
87 | // writer-priority variants, but defaults to write priority. |
88 | // |
89 | // * RWSpinLock's upgradeable mode blocks new readers, while SharedMutex's |
90 | // doesn't. Both semantics are reasonable. The boost documentation |
91 | // doesn't explicitly talk about this behavior (except by omitting |
92 | // any statement that those lock modes conflict), but the boost |
93 | // implementations do allow new readers while the upgradeable mode |
94 | // is held. See https://github.com/boostorg/thread/blob/master/ |
95 | // include/boost/thread/pthread/shared_mutex.hpp |
96 | // |
97 | // * RWSpinLock::UpgradedHolder maps to SharedMutex::UpgradeHolder |
98 | // (UpgradeableHolder would be even more pedantically correct). |
99 | // SharedMutex's holders have fewer methods (no reset) and are less |
100 | // tolerant (promotion and downgrade crash if the donor doesn't own |
101 | // the lock, and you must use the default constructor rather than |
102 | // passing a nullptr to the pointer constructor). |
103 | // |
104 | // Both SharedMutex and RWSpinLock provide "exclusive", "upgrade", |
105 | // and "shared" modes. At all times num_threads_holding_exclusive + |
106 | // num_threads_holding_upgrade <= 1, and num_threads_holding_exclusive == |
107 | // 0 || num_threads_holding_shared == 0. RWSpinLock has the additional |
108 | // constraint that num_threads_holding_shared cannot increase while |
109 | // num_threads_holding_upgrade is non-zero. |
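//
// For example, a minimal sketch of the upgrade protocol using the raw
// methods declared below (needsRewrite() and applyChanges() are
// illustrative placeholders, not part of this header):
//
//     folly::SharedMutex mu;
//     mu.lock_upgrade();               // coexists with readers, excludes
//                                      // other upgrade/exclusive holders
//     if (needsRewrite()) {
//       mu.unlock_upgrade_and_lock();  // promote; no other writer can
//                                      // slip in between the two modes
//       applyChanges();
//       mu.unlock();
//     } else {
//       mu.unlock_upgrade();
//     }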
110 | // |
111 | // Comparison to the internal RWLock: |
112 | // |
113 | // * SharedMutex doesn't allow a maximum reader count to be configured, |
114 | // so it can't be used as a semaphore in the same way as RWLock. |
115 | // |
116 | // * SharedMutex is 4 bytes, RWLock is 256. |
117 | // |
118 | // * SharedMutex is as fast or faster than RWLock in all of my |
119 | // microbenchmarks, and has positive rather than negative scalability. |
120 | // |
121 | // * RWLock and SharedMutex are both writer priority locks. |
122 | // |
123 | // * SharedMutex avoids latency outliers as well as RWLock. |
124 | // |
125 | // * SharedMutex uses different names (t != 0 below): |
126 | // |
127 | // RWLock::lock(0) => SharedMutex::lock() |
128 | // |
129 | // RWLock::lock(t) => SharedMutex::try_lock_for(milliseconds(t)) |
130 | // |
131 | // RWLock::tryLock() => SharedMutex::try_lock() |
132 | // |
133 | // RWLock::unlock() => SharedMutex::unlock() |
134 | // |
135 | // RWLock::enter(0) => SharedMutex::lock_shared() |
136 | // |
137 | // RWLock::enter(t) => |
138 | // SharedMutex::try_lock_shared_for(milliseconds(t)) |
139 | // |
140 | // RWLock::tryEnter() => SharedMutex::try_lock_shared() |
141 | // |
142 | // RWLock::leave() => SharedMutex::unlock_shared() |
143 | // |
144 | // * RWLock allows the reader count to be adjusted by a value other |
145 | // than 1 during enter() or leave(). SharedMutex doesn't currently |
146 | // implement this feature. |
147 | // |
148 | // * RWLock's methods are marked const, SharedMutex's aren't. |
149 | // |
150 | // Reader-writer locks have the potential to allow concurrent access |
151 | // to shared read-mostly data, but in practice they often provide no |
152 | // improvement over a mutex. The problem is the cache coherence protocol |
153 | // of modern CPUs. Coherence is provided by making sure that when a cache |
154 | // line is written it is present in only one core's cache. Since a memory |
155 | // write is required to acquire a reader-writer lock in shared mode, the |
156 | // cache line holding the lock is invalidated in all of the other caches. |
157 | // This leads to cache misses when another thread wants to acquire or |
158 | // release the lock concurrently. When the RWLock is colocated with the |
// data it protects (common), cache misses can also continue to occur when
160 | // a thread that already holds the lock tries to read the protected data. |
161 | // |
162 | // Ideally, a reader-writer lock would allow multiple cores to acquire |
163 | // and release the lock in shared mode without incurring any cache misses. |
164 | // This requires that each core records its shared access in a cache line |
165 | // that isn't read or written by other read-locking cores. (Writers will |
166 | // have to check all of the cache lines.) Typical server hardware when |
167 | // this comment was written has 16 L1 caches and cache lines of 64 bytes, |
168 | // so a lock striped over all L1 caches would occupy a prohibitive 1024 |
169 | // bytes. Nothing says that we need a separate set of per-core memory |
170 | // locations for each lock, however. Each SharedMutex instance is only |
171 | // 4 bytes, but all locks together share a 2K area in which they make a |
172 | // core-local record of lock acquisitions. |
173 | // |
174 | // SharedMutex's strategy of using a shared set of core-local stripes has |
175 | // a potential downside, because it means that acquisition of any lock in |
176 | // write mode can conflict with acquisition of any lock in shared mode. |
177 | // If a lock instance doesn't actually experience concurrency then this |
// downside will outweigh the upside of improved scalability for readers.
179 | // To avoid this problem we dynamically detect concurrent accesses to |
180 | // SharedMutex, and don't start using the deferred mode unless we actually |
181 | // observe concurrency. See kNumSharedToStartDeferring. |
182 | // |
183 | // It is explicitly allowed to call unlock_shared() from a different |
184 | // thread than lock_shared(), so long as they are properly paired. |
185 | // unlock_shared() needs to find the location at which lock_shared() |
186 | // recorded the lock, which might be in the lock itself or in any of |
187 | // the shared slots. If you can conveniently pass state from lock |
188 | // acquisition to release then the fastest mechanism is to std::move |
// the SharedMutex::ReadHolder instance or a SharedMutex::Token (using
190 | // lock_shared(Token&) and unlock_shared(Token&)). The guard or token |
191 | // will tell unlock_shared where in deferredReaders[] to look for the |
192 | // deferred lock. The Token-less version of unlock_shared() works in all |
193 | // cases, but is optimized for the common (no inter-thread handoff) case. |
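//
// For example, a minimal sketch of the token fast path (only the
// SharedMutex calls are from this header; the surrounding code is
// illustrative):
//
//     folly::SharedMutex mu;
//     folly::SharedMutex::Token token;
//     mu.lock_shared(token);    // records where the shared lock landed
//     // ... read the protected data; the token (and the duty to release)
//     // may be handed to another thread ...
//     mu.unlock_shared(token);  // releases exactly the recorded slot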
194 | // |
195 | // In both read- and write-priority mode, a waiting lock() (exclusive mode) |
196 | // only blocks readers after it has waited for an active upgrade lock to be |
197 | // released; until the upgrade lock is released (or upgraded or downgraded) |
198 | // readers will still be able to enter. Preferences about lock acquisition |
199 | // are not guaranteed to be enforced perfectly (even if they were, there |
200 | // is theoretically the chance that a thread could be arbitrarily suspended |
201 | // between calling lock() and SharedMutex code actually getting executed). |
202 | // |
203 | // try_*_for methods always try at least once, even if the duration |
204 | // is zero or negative. The duration type must be compatible with |
205 | // std::chrono::steady_clock. try_*_until methods also always try at |
206 | // least once. std::chrono::system_clock and std::chrono::steady_clock |
207 | // are supported. |
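//
// For example, a minimal sketch of the timed variants:
//
//     folly::SharedMutex mu;
//     if (mu.try_lock_for(std::chrono::milliseconds(10))) {
//       // ... exclusive access ...
//       mu.unlock();
//     }
//     auto deadline =
//         std::chrono::steady_clock::now() + std::chrono::seconds(1);
//     if (mu.try_lock_shared_until(deadline)) {
//       // ... shared access ...
//       mu.unlock_shared();
//     }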
208 | // |
209 | // If you have observed by profiling that your SharedMutex-s are getting |
210 | // cache misses on deferredReaders[] due to another SharedMutex user, then |
211 | // you can use the tag type to create your own instantiation of the type. |
212 | // The contention threshold (see kNumSharedToStartDeferring) should make |
213 | // this unnecessary in all but the most extreme cases. Make sure to check |
214 | // that the increased icache and dcache footprint of the tagged result is |
215 | // worth it. |
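//
// For example, a minimal sketch of a tagged instantiation (MyAppTag is an
// illustrative name; the template parameters are those of SharedMutexImpl
// below).  Each distinct instantiation gets its own static
// deferredReaders[] array:
//
//     struct MyAppTag {};
//     using MyAppSharedMutex =
//         folly::SharedMutexImpl<false /* ReaderPriority */, MyAppTag>;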
216 | |
217 | // SharedMutex's use of thread local storage is an optimization, so |
218 | // for the case where thread local storage is not supported, define it |
219 | // away. |
220 | |
221 | // Note about TSAN (ThreadSanitizer): the SharedMutexWritePriority version |
222 | // (the default) of this mutex is annotated appropriately so that TSAN can |
223 | // perform lock inversion analysis. However, the SharedMutexReadPriority version |
224 | // is not annotated. This is because TSAN's lock order heuristic |
225 | // assumes that two calls to lock_shared must be ordered, which leads |
226 | // to too many false positives for the reader-priority case. |
227 | // |
228 | // Suppose thread A holds a SharedMutexWritePriority lock in shared mode and an |
229 | // independent thread B is waiting for exclusive access. Then a thread C's |
230 | // lock_shared can't proceed until A has released the lock. Discounting |
231 | // situations that never use exclusive mode (so no lock is necessary at all) |
232 | // this means that without higher-level reasoning it is not safe to ignore |
233 | // reader <-> reader interactions. |
234 | // |
235 | // This reasoning does not apply to SharedMutexReadPriority, because there are |
236 | // no actions by a thread B that can make C need to wait for A. Since the |
237 | // overwhelming majority of SharedMutex instances use write priority, we |
238 | // restrict the TSAN annotations to only SharedMutexWritePriority. |
239 | |
240 | #ifndef FOLLY_SHAREDMUTEX_TLS |
241 | #if !FOLLY_MOBILE |
242 | #define FOLLY_SHAREDMUTEX_TLS FOLLY_TLS |
243 | #else |
244 | #define FOLLY_SHAREDMUTEX_TLS |
245 | #endif |
246 | #endif |
247 | |
248 | namespace folly { |
249 | |
250 | struct SharedMutexToken { |
251 | enum class Type : uint16_t { |
252 | INVALID = 0, |
253 | INLINE_SHARED, |
254 | DEFERRED_SHARED, |
255 | }; |
256 | |
257 | Type type_; |
258 | uint16_t slot_; |
259 | }; |
260 | |
261 | namespace detail { |
262 | // Returns a guard that gives permission for the current thread to |
263 | // annotate, and adjust the annotation bits in, the SharedMutex at ptr. |
264 | std::unique_lock<std::mutex> sharedMutexAnnotationGuard(void* ptr); |
265 | } // namespace detail |
266 | |
267 | template < |
268 | bool ReaderPriority, |
269 | typename Tag_ = void, |
270 | template <typename> class Atom = std::atomic, |
271 | bool BlockImmediately = false, |
272 | bool AnnotateForThreadSanitizer = kIsSanitizeThread && !ReaderPriority> |
273 | class SharedMutexImpl { |
274 | public: |
275 | static constexpr bool kReaderPriority = ReaderPriority; |
276 | |
277 | typedef Tag_ Tag; |
278 | |
279 | typedef SharedMutexToken Token; |
280 | |
281 | class ReadHolder; |
282 | class UpgradeHolder; |
283 | class WriteHolder; |
284 | |
285 | constexpr SharedMutexImpl() noexcept : state_(0) {} |
286 | |
287 | SharedMutexImpl(const SharedMutexImpl&) = delete; |
288 | SharedMutexImpl(SharedMutexImpl&&) = delete; |
289 | SharedMutexImpl& operator=(const SharedMutexImpl&) = delete; |
290 | SharedMutexImpl& operator=(SharedMutexImpl&&) = delete; |
291 | |
  // It is an error to destroy a SharedMutex that still has
293 | // any outstanding locks. This is checked if NDEBUG isn't defined. |
294 | // SharedMutex's exclusive mode can be safely used to guard the lock's |
295 | // own destruction. If, for example, you acquire the lock in exclusive |
296 | // mode and then observe that the object containing the lock is no longer |
297 | // needed, you can unlock() and then immediately destroy the lock. |
298 | // See https://sourceware.org/bugzilla/show_bug.cgi?id=13690 for a |
299 | // description about why this property needs to be explicitly mentioned. |
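  //
  // For example, a minimal sketch of that pattern (obj and its prior
  // unlinking are illustrative; only lock()/unlock() come from this class):
  //
  //   // obj has already been unlinked from every shared structure, so no
  //   // new thread can reach it; the exclusive lock drains any stragglers.
  //   obj->mutex_.lock();
  //   obj->mutex_.unlock();
  //   delete obj;  // safe to destroy immediately after unlock()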
300 | ~SharedMutexImpl() { |
301 | auto state = state_.load(std::memory_order_relaxed); |
302 | if (UNLIKELY((state & kHasS) != 0)) { |
303 | cleanupTokenlessSharedDeferred(state); |
304 | } |
305 | |
306 | #ifndef NDEBUG |
307 | // These asserts check that everybody has released the lock before it |
308 | // is destroyed. If you arrive here while debugging that is likely |
309 | // the problem. (You could also have general heap corruption.) |
310 | |
311 | // if a futexWait fails to go to sleep because the value has been |
312 | // changed, we don't necessarily clean up the wait bits, so it is |
    // possible they will be set here even in a correct system
314 | assert((state & ~(kWaitingAny | kMayDefer | kAnnotationCreated)) == 0); |
315 | if ((state & kMayDefer) != 0) { |
316 | for (uint32_t slot = 0; slot < kMaxDeferredReaders; ++slot) { |
317 | auto slotValue = deferredReader(slot)->load(std::memory_order_relaxed); |
318 | assert(!slotValueIsThis(slotValue)); |
319 | } |
320 | } |
321 | #endif |
322 | annotateDestroy(); |
323 | } |
324 | |
325 | void lock() { |
326 | WaitForever ctx; |
327 | (void)lockExclusiveImpl(kHasSolo, ctx); |
328 | annotateAcquired(annotate_rwlock_level::wrlock); |
329 | } |
330 | |
331 | bool try_lock() { |
332 | WaitNever ctx; |
333 | auto result = lockExclusiveImpl(kHasSolo, ctx); |
334 | annotateTryAcquired(result, annotate_rwlock_level::wrlock); |
335 | return result; |
336 | } |
337 | |
338 | template <class Rep, class Period> |
339 | bool try_lock_for(const std::chrono::duration<Rep, Period>& duration) { |
340 | WaitForDuration<Rep, Period> ctx(duration); |
341 | auto result = lockExclusiveImpl(kHasSolo, ctx); |
342 | annotateTryAcquired(result, annotate_rwlock_level::wrlock); |
343 | return result; |
344 | } |
345 | |
346 | template <class Clock, class Duration> |
347 | bool try_lock_until( |
348 | const std::chrono::time_point<Clock, Duration>& absDeadline) { |
349 | WaitUntilDeadline<Clock, Duration> ctx{absDeadline}; |
350 | auto result = lockExclusiveImpl(kHasSolo, ctx); |
351 | annotateTryAcquired(result, annotate_rwlock_level::wrlock); |
352 | return result; |
353 | } |
354 | |
355 | void unlock() { |
356 | annotateReleased(annotate_rwlock_level::wrlock); |
357 | // It is possible that we have a left-over kWaitingNotS if the last |
358 | // unlock_shared() that let our matching lock() complete finished |
359 | // releasing before lock()'s futexWait went to sleep. Clean it up now |
360 | auto state = (state_ &= ~(kWaitingNotS | kPrevDefer | kHasE)); |
361 | assert((state & ~(kWaitingAny | kAnnotationCreated)) == 0); |
362 | wakeRegisteredWaiters(state, kWaitingE | kWaitingU | kWaitingS); |
363 | } |
364 | |
365 | // Managing the token yourself makes unlock_shared a bit faster |
366 | |
367 | void lock_shared() { |
368 | WaitForever ctx; |
369 | (void)lockSharedImpl(nullptr, ctx); |
370 | annotateAcquired(annotate_rwlock_level::rdlock); |
371 | } |
372 | |
373 | void lock_shared(Token& token) { |
374 | WaitForever ctx; |
375 | (void)lockSharedImpl(&token, ctx); |
376 | annotateAcquired(annotate_rwlock_level::rdlock); |
377 | } |
378 | |
379 | bool try_lock_shared() { |
380 | WaitNever ctx; |
381 | auto result = lockSharedImpl(nullptr, ctx); |
382 | annotateTryAcquired(result, annotate_rwlock_level::rdlock); |
383 | return result; |
384 | } |
385 | |
386 | bool try_lock_shared(Token& token) { |
387 | WaitNever ctx; |
388 | auto result = lockSharedImpl(&token, ctx); |
389 | annotateTryAcquired(result, annotate_rwlock_level::rdlock); |
390 | return result; |
391 | } |
392 | |
393 | template <class Rep, class Period> |
394 | bool try_lock_shared_for(const std::chrono::duration<Rep, Period>& duration) { |
395 | WaitForDuration<Rep, Period> ctx(duration); |
396 | auto result = lockSharedImpl(nullptr, ctx); |
397 | annotateTryAcquired(result, annotate_rwlock_level::rdlock); |
398 | return result; |
399 | } |
400 | |
401 | template <class Rep, class Period> |
402 | bool try_lock_shared_for( |
403 | const std::chrono::duration<Rep, Period>& duration, |
404 | Token& token) { |
405 | WaitForDuration<Rep, Period> ctx(duration); |
406 | auto result = lockSharedImpl(&token, ctx); |
407 | annotateTryAcquired(result, annotate_rwlock_level::rdlock); |
408 | return result; |
409 | } |
410 | |
411 | template <class Clock, class Duration> |
412 | bool try_lock_shared_until( |
413 | const std::chrono::time_point<Clock, Duration>& absDeadline) { |
414 | WaitUntilDeadline<Clock, Duration> ctx{absDeadline}; |
415 | auto result = lockSharedImpl(nullptr, ctx); |
416 | annotateTryAcquired(result, annotate_rwlock_level::rdlock); |
417 | return result; |
418 | } |
419 | |
420 | template <class Clock, class Duration> |
421 | bool try_lock_shared_until( |
422 | const std::chrono::time_point<Clock, Duration>& absDeadline, |
423 | Token& token) { |
424 | WaitUntilDeadline<Clock, Duration> ctx{absDeadline}; |
425 | auto result = lockSharedImpl(&token, ctx); |
426 | annotateTryAcquired(result, annotate_rwlock_level::rdlock); |
427 | return result; |
428 | } |
429 | |
430 | void unlock_shared() { |
431 | annotateReleased(annotate_rwlock_level::rdlock); |
432 | |
433 | auto state = state_.load(std::memory_order_acquire); |
434 | |
435 | // kPrevDefer can only be set if HasE or BegunE is set |
436 | assert((state & (kPrevDefer | kHasE | kBegunE)) != kPrevDefer); |
437 | |
438 | // lock() strips kMayDefer immediately, but then copies it to |
439 | // kPrevDefer so we can tell if the pre-lock() lock_shared() might |
440 | // have deferred |
441 | if ((state & (kMayDefer | kPrevDefer)) == 0 || |
442 | !tryUnlockTokenlessSharedDeferred()) { |
443 | // Matching lock_shared() couldn't have deferred, or the deferred |
444 | // lock has already been inlined by applyDeferredReaders() |
445 | unlockSharedInline(); |
446 | } |
447 | } |
448 | |
449 | void unlock_shared(Token& token) { |
450 | annotateReleased(annotate_rwlock_level::rdlock); |
451 | |
452 | assert( |
453 | token.type_ == Token::Type::INLINE_SHARED || |
454 | token.type_ == Token::Type::DEFERRED_SHARED); |
455 | |
456 | if (token.type_ != Token::Type::DEFERRED_SHARED || |
457 | !tryUnlockSharedDeferred(token.slot_)) { |
458 | unlockSharedInline(); |
459 | } |
460 | #ifndef NDEBUG |
461 | token.type_ = Token::Type::INVALID; |
462 | #endif |
463 | } |
464 | |
465 | void unlock_and_lock_shared() { |
466 | annotateReleased(annotate_rwlock_level::wrlock); |
467 | annotateAcquired(annotate_rwlock_level::rdlock); |
468 | // We can't use state_ -=, because we need to clear 2 bits (1 of which |
469 | // has an uncertain initial state) and set 1 other. We might as well |
470 | // clear the relevant wake bits at the same time. Note that since S |
471 | // doesn't block the beginning of a transition to E (writer priority |
472 | // can cut off new S, reader priority grabs BegunE and blocks deferred |
473 | // S) we need to wake E as well. |
474 | auto state = state_.load(std::memory_order_acquire); |
475 | do { |
476 | assert( |
477 | (state & ~(kWaitingAny | kPrevDefer | kAnnotationCreated)) == kHasE); |
478 | } while (!state_.compare_exchange_strong( |
479 | state, (state & ~(kWaitingAny | kPrevDefer | kHasE)) + kIncrHasS)); |
480 | if ((state & (kWaitingE | kWaitingU | kWaitingS)) != 0) { |
481 | futexWakeAll(kWaitingE | kWaitingU | kWaitingS); |
482 | } |
483 | } |
484 | |
485 | void unlock_and_lock_shared(Token& token) { |
486 | unlock_and_lock_shared(); |
487 | token.type_ = Token::Type::INLINE_SHARED; |
488 | } |
489 | |
490 | void lock_upgrade() { |
491 | WaitForever ctx; |
492 | (void)lockUpgradeImpl(ctx); |
493 | // For TSAN: treat upgrade locks as equivalent to read locks |
494 | annotateAcquired(annotate_rwlock_level::rdlock); |
495 | } |
496 | |
497 | bool try_lock_upgrade() { |
498 | WaitNever ctx; |
499 | auto result = lockUpgradeImpl(ctx); |
500 | annotateTryAcquired(result, annotate_rwlock_level::rdlock); |
501 | return result; |
502 | } |
503 | |
504 | template <class Rep, class Period> |
505 | bool try_lock_upgrade_for( |
506 | const std::chrono::duration<Rep, Period>& duration) { |
507 | WaitForDuration<Rep, Period> ctx(duration); |
508 | auto result = lockUpgradeImpl(ctx); |
509 | annotateTryAcquired(result, annotate_rwlock_level::rdlock); |
510 | return result; |
511 | } |
512 | |
513 | template <class Clock, class Duration> |
514 | bool try_lock_upgrade_until( |
515 | const std::chrono::time_point<Clock, Duration>& absDeadline) { |
516 | WaitUntilDeadline<Clock, Duration> ctx{absDeadline}; |
517 | auto result = lockUpgradeImpl(ctx); |
518 | annotateTryAcquired(result, annotate_rwlock_level::rdlock); |
519 | return result; |
520 | } |
521 | |
522 | void unlock_upgrade() { |
523 | annotateReleased(annotate_rwlock_level::rdlock); |
524 | auto state = (state_ -= kHasU); |
525 | assert((state & (kWaitingNotS | kHasSolo)) == 0); |
526 | wakeRegisteredWaiters(state, kWaitingE | kWaitingU); |
527 | } |
528 | |
529 | void unlock_upgrade_and_lock() { |
530 | // no waiting necessary, so waitMask is empty |
531 | WaitForever ctx; |
532 | (void)lockExclusiveImpl(0, ctx); |
533 | annotateReleased(annotate_rwlock_level::rdlock); |
534 | annotateAcquired(annotate_rwlock_level::wrlock); |
535 | } |
536 | |
537 | void unlock_upgrade_and_lock_shared() { |
538 | // No need to annotate for TSAN here because we model upgrade and shared |
539 | // locks as the same. |
540 | auto state = (state_ -= kHasU - kIncrHasS); |
541 | assert((state & (kWaitingNotS | kHasSolo)) == 0); |
542 | wakeRegisteredWaiters(state, kWaitingE | kWaitingU); |
543 | } |
544 | |
545 | void unlock_upgrade_and_lock_shared(Token& token) { |
546 | unlock_upgrade_and_lock_shared(); |
547 | token.type_ = Token::Type::INLINE_SHARED; |
548 | } |
549 | |
550 | void unlock_and_lock_upgrade() { |
551 | annotateReleased(annotate_rwlock_level::wrlock); |
552 | annotateAcquired(annotate_rwlock_level::rdlock); |
553 | // We can't use state_ -=, because we need to clear 2 bits (1 of |
554 | // which has an uncertain initial state) and set 1 other. We might |
555 | // as well clear the relevant wake bits at the same time. |
556 | auto state = state_.load(std::memory_order_acquire); |
557 | while (true) { |
558 | assert( |
559 | (state & ~(kWaitingAny | kPrevDefer | kAnnotationCreated)) == kHasE); |
560 | auto after = |
561 | (state & ~(kWaitingNotS | kWaitingS | kPrevDefer | kHasE)) + kHasU; |
562 | if (state_.compare_exchange_strong(state, after)) { |
563 | if ((state & kWaitingS) != 0) { |
564 | futexWakeAll(kWaitingS); |
565 | } |
566 | return; |
567 | } |
568 | } |
569 | } |
570 | |
571 | private: |
572 | typedef typename folly::detail::Futex<Atom> Futex; |
573 | |
574 | // Internally we use four kinds of wait contexts. These are structs |
575 | // that provide a doWait method that returns true if a futex wake |
576 | // was issued that intersects with the waitMask, false if there was a |
577 | // timeout and no more waiting should be performed. Spinning occurs |
578 | // before the wait context is invoked. |
579 | |
580 | struct WaitForever { |
581 | bool canBlock() { |
582 | return true; |
583 | } |
584 | bool canTimeOut() { |
585 | return false; |
586 | } |
587 | bool shouldTimeOut() { |
588 | return false; |
589 | } |
590 | |
591 | bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) { |
592 | detail::futexWait(&futex, expected, waitMask); |
593 | return true; |
594 | } |
595 | }; |
596 | |
597 | struct WaitNever { |
598 | bool canBlock() { |
599 | return false; |
600 | } |
601 | bool canTimeOut() { |
602 | return true; |
603 | } |
604 | bool shouldTimeOut() { |
605 | return true; |
606 | } |
607 | |
608 | bool doWait( |
609 | Futex& /* futex */, |
610 | uint32_t /* expected */, |
611 | uint32_t /* waitMask */) { |
612 | return false; |
613 | } |
614 | }; |
615 | |
616 | template <class Rep, class Period> |
617 | struct WaitForDuration { |
618 | std::chrono::duration<Rep, Period> duration_; |
619 | bool deadlineComputed_; |
620 | std::chrono::steady_clock::time_point deadline_; |
621 | |
622 | explicit WaitForDuration(const std::chrono::duration<Rep, Period>& duration) |
623 | : duration_(duration), deadlineComputed_(false) {} |
624 | |
625 | std::chrono::steady_clock::time_point deadline() { |
626 | if (!deadlineComputed_) { |
627 | deadline_ = std::chrono::steady_clock::now() + duration_; |
628 | deadlineComputed_ = true; |
629 | } |
630 | return deadline_; |
631 | } |
632 | |
633 | bool canBlock() { |
634 | return duration_.count() > 0; |
635 | } |
636 | bool canTimeOut() { |
637 | return true; |
638 | } |
639 | |
640 | bool shouldTimeOut() { |
641 | return std::chrono::steady_clock::now() > deadline(); |
642 | } |
643 | |
644 | bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) { |
645 | auto result = |
646 | detail::futexWaitUntil(&futex, expected, deadline(), waitMask); |
647 | return result != folly::detail::FutexResult::TIMEDOUT; |
648 | } |
649 | }; |
650 | |
651 | template <class Clock, class Duration> |
652 | struct WaitUntilDeadline { |
653 | std::chrono::time_point<Clock, Duration> absDeadline_; |
654 | |
655 | bool canBlock() { |
656 | return true; |
657 | } |
658 | bool canTimeOut() { |
659 | return true; |
660 | } |
661 | bool shouldTimeOut() { |
662 | return Clock::now() > absDeadline_; |
663 | } |
664 | |
665 | bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) { |
666 | auto result = |
667 | detail::futexWaitUntil(&futex, expected, absDeadline_, waitMask); |
668 | return result != folly::detail::FutexResult::TIMEDOUT; |
669 | } |
670 | }; |
671 | |
672 | void annotateLazyCreate() { |
673 | if (AnnotateForThreadSanitizer && |
674 | (state_.load() & kAnnotationCreated) == 0) { |
675 | auto guard = detail::sharedMutexAnnotationGuard(this); |
676 | // check again |
677 | if ((state_.load() & kAnnotationCreated) == 0) { |
678 | state_.fetch_or(kAnnotationCreated); |
679 | annotate_benign_race_sized( |
            &state_, sizeof(state_), "init TSAN", __FILE__, __LINE__);
681 | annotate_rwlock_create(this, __FILE__, __LINE__); |
682 | } |
683 | } |
684 | } |
685 | |
686 | void annotateDestroy() { |
687 | if (AnnotateForThreadSanitizer) { |
688 | annotateLazyCreate(); |
689 | annotate_rwlock_destroy(this, __FILE__, __LINE__); |
690 | } |
691 | } |
692 | |
693 | void annotateAcquired(annotate_rwlock_level w) { |
694 | if (AnnotateForThreadSanitizer) { |
695 | annotateLazyCreate(); |
696 | annotate_rwlock_acquired(this, w, __FILE__, __LINE__); |
697 | } |
698 | } |
699 | |
700 | void annotateTryAcquired(bool result, annotate_rwlock_level w) { |
701 | if (AnnotateForThreadSanitizer) { |
702 | annotateLazyCreate(); |
703 | annotate_rwlock_try_acquired(this, w, result, __FILE__, __LINE__); |
704 | } |
705 | } |
706 | |
707 | void annotateReleased(annotate_rwlock_level w) { |
708 | if (AnnotateForThreadSanitizer) { |
709 | assert((state_.load() & kAnnotationCreated) != 0); |
710 | annotate_rwlock_released(this, w, __FILE__, __LINE__); |
711 | } |
712 | } |
713 | |
714 | // 32 bits of state |
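  //
  // Summary of the layout (derived from the constants defined below):
  //
  //   bits 31..11  shared (S) hold count, in units of kIncrHasS
  //   bit  10      kAnnotationCreated
  //   bit  9       kMayDefer
  //   bit  8       kPrevDefer
  //   bit  7       kHasE
  //   bit  6       kBegunE
  //   bit  5       kHasU
  //   bit  4       kWaitingNotS
  //   bit  3       kWaitingEMultiple
  //   bit  2       kWaitingESingle
  //   bit  1       kWaitingU
  //   bit  0       kWaitingS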
715 | Futex state_{}; |
716 | |
717 | // S count needs to be on the end, because we explicitly allow it to |
718 | // underflow. This can occur while we are in the middle of applying |
719 | // deferred locks (we remove them from deferredReaders[] before |
720 | // inlining them), or during token-less unlock_shared() if a racing |
721 | // lock_shared();unlock_shared() moves the deferredReaders slot while |
722 | // the first unlock_shared() is scanning. The former case is cleaned |
723 | // up before we finish applying the locks. The latter case can persist |
724 | // until destruction, when it is cleaned up. |
725 | static constexpr uint32_t kIncrHasS = 1 << 11; |
726 | static constexpr uint32_t kHasS = ~(kIncrHasS - 1); |
727 | |
728 | // Set if annotation has been completed for this instance. That annotation |
729 | // (and setting this bit afterward) must be guarded by one of the mutexes in |
730 | // annotationCreationGuards. |
731 | static constexpr uint32_t kAnnotationCreated = 1 << 10; |
732 | |
733 | // If false, then there are definitely no deferred read locks for this |
734 | // instance. Cleared after initialization and when exclusively locked. |
735 | static constexpr uint32_t kMayDefer = 1 << 9; |
736 | |
  // lock() clears kMayDefer as soon as it starts draining readers (so
738 | // that it doesn't have to do a second CAS once drain completes), but |
739 | // unlock_shared() still needs to know whether to scan deferredReaders[] |
740 | // or not. We copy kMayDefer to kPrevDefer when setting kHasE or |
741 | // kBegunE, and clear it when clearing those bits. |
742 | static constexpr uint32_t kPrevDefer = 1 << 8; |
743 | |
744 | // Exclusive-locked blocks all read locks and write locks. This bit |
745 | // may be set before all readers have finished, but in that case the |
746 | // thread that sets it won't return to the caller until all read locks |
747 | // have been released. |
748 | static constexpr uint32_t kHasE = 1 << 7; |
749 | |
750 | // Exclusive-draining means that lock() is waiting for existing readers |
751 | // to leave, but that new readers may still acquire shared access. |
752 | // This is only used in reader priority mode. New readers during |
753 | // drain must be inline. The difference between this and kHasU is that |
754 | // kBegunE prevents kMayDefer from being set. |
755 | static constexpr uint32_t kBegunE = 1 << 6; |
756 | |
757 | // At most one thread may have either exclusive or upgrade lock |
758 | // ownership. Unlike exclusive mode, ownership of the lock in upgrade |
759 | // mode doesn't preclude other threads holding the lock in shared mode. |
760 | // boost's concept for this doesn't explicitly say whether new shared |
  // locks can be acquired once lock_upgrade has succeeded, but doesn't
762 | // list that as disallowed. RWSpinLock disallows new read locks after |
763 | // lock_upgrade has been acquired, but the boost implementation doesn't. |
764 | // We choose the latter. |
765 | static constexpr uint32_t kHasU = 1 << 5; |
766 | |
767 | // There are three states that we consider to be "solo", in that they |
768 | // cannot coexist with other solo states. These are kHasE, kBegunE, |
769 | // and kHasU. Note that S doesn't conflict with any of these, because |
770 | // setting the kHasE is only one of the two steps needed to actually |
771 | // acquire the lock in exclusive mode (the other is draining the existing |
772 | // S holders). |
773 | static constexpr uint32_t kHasSolo = kHasE | kBegunE | kHasU; |
774 | |
775 | // Once a thread sets kHasE it needs to wait for the current readers |
776 | // to exit the lock. We give this a separate wait identity from the |
777 | // waiting to set kHasE so that we can perform partial wakeups (wake |
778 | // one instead of wake all). |
779 | static constexpr uint32_t kWaitingNotS = 1 << 4; |
780 | |
781 | // When waking writers we can either wake them all, in which case we |
782 | // can clear kWaitingE, or we can call futexWake(1). futexWake tells |
783 | // us if anybody woke up, but even if we detect that nobody woke up we |
784 | // can't clear the bit after the fact without issuing another wakeup. |
785 | // To avoid thundering herds when there are lots of pending lock() |
786 | // without needing to call futexWake twice when there is only one |
787 | // waiter, kWaitingE actually encodes if we have observed multiple |
788 | // concurrent waiters. Tricky: ABA issues on futexWait mean that when |
789 | // we see kWaitingESingle we can't assume that there is only one. |
790 | static constexpr uint32_t kWaitingESingle = 1 << 2; |
791 | static constexpr uint32_t kWaitingEMultiple = 1 << 3; |
792 | static constexpr uint32_t kWaitingE = kWaitingESingle | kWaitingEMultiple; |
793 | |
794 | // kWaitingU is essentially a 1 bit saturating counter. It always |
795 | // requires a wakeAll. |
796 | static constexpr uint32_t kWaitingU = 1 << 1; |
797 | |
798 | // All blocked lock_shared() should be awoken, so it is correct (not |
799 | // suboptimal) to wakeAll if there are any shared readers. |
800 | static constexpr uint32_t kWaitingS = 1 << 0; |
801 | |
802 | // kWaitingAny is a mask of all of the bits that record the state of |
803 | // threads, rather than the state of the lock. It is convenient to be |
804 | // able to mask them off during asserts. |
805 | static constexpr uint32_t kWaitingAny = |
806 | kWaitingNotS | kWaitingE | kWaitingU | kWaitingS; |
807 | |
808 | // The reader count at which a reader will attempt to use the lock |
809 | // in deferred mode. If this value is 2, then the second concurrent |
810 | // reader will set kMayDefer and use deferredReaders[]. kMayDefer is |
811 | // cleared during exclusive access, so this threshold must be reached |
812 | // each time a lock is held in exclusive mode. |
813 | static constexpr uint32_t kNumSharedToStartDeferring = 2; |
814 | |
815 | // The typical number of spins that a thread will wait for a state |
816 | // transition. There is no bound on the number of threads that can wait |
817 | // for a writer, so we are pretty conservative here to limit the chance |
818 | // that we are starving the writer of CPU. Each spin is 6 or 7 nanos, |
819 | // almost all of which is in the pause instruction. |
820 | static constexpr uint32_t kMaxSpinCount = !BlockImmediately ? 1000 : 2; |
821 | |
822 | // The maximum number of soft yields before falling back to futex. |
823 | // If the preemption heuristic is activated we will fall back before |
824 | // this. A soft yield takes ~900 nanos (two sched_yield plus a call |
825 | // to getrusage, with checks of the goal at each step). Soft yields |
826 | // aren't compatible with deterministic execution under test (unlike |
827 | // futexWaitUntil, which has a capricious but deterministic back end). |
828 | static constexpr uint32_t kMaxSoftYieldCount = !BlockImmediately ? 1000 : 0; |
829 | |
830 | // If AccessSpreader assigns indexes from 0..k*n-1 on a system where some |
831 | // level of the memory hierarchy is symmetrically divided into k pieces |
832 | // (NUMA nodes, last-level caches, L1 caches, ...), then slot indexes |
833 | // that are the same after integer division by k share that resource. |
834 | // Our strategy for deferred readers is to probe up to numSlots/4 slots, |
835 | // using the full granularity of AccessSpreader for the start slot |
836 | // and then search outward. We can use AccessSpreader::current(n) |
837 | // without managing our own spreader if kMaxDeferredReaders <= |
838 | // AccessSpreader::kMaxCpus, which is currently 128. |
839 | // |
840 | // Our 2-socket E5-2660 machines have 8 L1 caches on each chip, |
841 | // with 64 byte cache lines. That means we need 64*16 bytes of |
842 | // deferredReaders[] to give each L1 its own playground. On x86_64 |
843 | // each DeferredReaderSlot is 8 bytes, so we need kMaxDeferredReaders |
844 | // * kDeferredSeparationFactor >= 64 * 16 / 8 == 128. If |
845 | // kDeferredSearchDistance * kDeferredSeparationFactor <= |
846 | // 64 / 8 then we will search only within a single cache line, which |
847 | // guarantees we won't have inter-L1 contention. We give ourselves |
848 | // a factor of 2 on the core count, which should hold us for a couple |
849 | // processor generations. deferredReaders[] is 2048 bytes currently. |
850 | public: |
851 | static constexpr uint32_t kMaxDeferredReaders = 64; |
852 | static constexpr uint32_t kDeferredSearchDistance = 2; |
853 | static constexpr uint32_t kDeferredSeparationFactor = 4; |
854 | |
855 | private: |
856 | static_assert( |
857 | !(kMaxDeferredReaders & (kMaxDeferredReaders - 1)), |
858 | "kMaxDeferredReaders must be a power of 2" ); |
859 | static_assert( |
860 | !(kDeferredSearchDistance & (kDeferredSearchDistance - 1)), |
861 | "kDeferredSearchDistance must be a power of 2" ); |
862 | |
863 | // The number of deferred locks that can be simultaneously acquired |
864 | // by a thread via the token-less methods without performing any heap |
865 | // allocations. Each of these costs 3 pointers (24 bytes, probably) |
866 | // per thread. There's not much point in making this larger than |
867 | // kDeferredSearchDistance. |
868 | static constexpr uint32_t kTokenStackTLSCapacity = 2; |
869 | |
870 | // We need to make sure that if there is a lock_shared() |
871 | // and lock_shared(token) followed by unlock_shared() and |
872 | // unlock_shared(token), the token-less unlock doesn't null |
873 | // out deferredReaders[token.slot_]. If we allowed that, then |
874 | // unlock_shared(token) wouldn't be able to assume that its lock |
875 | // had been inlined by applyDeferredReaders when it finds that |
876 | // deferredReaders[token.slot_] no longer points to this. We accomplish |
877 | // this by stealing bit 0 from the pointer to record that the slot's |
878 | // element has no token, hence our use of uintptr_t in deferredReaders[]. |
879 | static constexpr uintptr_t kTokenless = 0x1; |
880 | |
881 | // This is the starting location for Token-less unlock_shared(). |
882 | static FOLLY_SHAREDMUTEX_TLS uint32_t tls_lastTokenlessSlot; |
883 | |
884 | // Last deferred reader slot used. |
885 | static FOLLY_SHAREDMUTEX_TLS uint32_t tls_lastDeferredReaderSlot; |
886 | |
887 | // Only indexes divisible by kDeferredSeparationFactor are used. |
888 | // If any of those elements points to a SharedMutexImpl, then it |
889 | // should be considered that there is a shared lock on that instance. |
890 | // See kTokenless. |
891 | public: |
892 | typedef Atom<uintptr_t> DeferredReaderSlot; |
893 | |
894 | private: |
895 | alignas(hardware_destructive_interference_size) static DeferredReaderSlot |
896 | deferredReaders[kMaxDeferredReaders * kDeferredSeparationFactor]; |
897 | |
898 | // Performs an exclusive lock, waiting for state_ & waitMask to be |
899 | // zero first |
900 | template <class WaitContext> |
901 | bool lockExclusiveImpl(uint32_t preconditionGoalMask, WaitContext& ctx) { |
902 | uint32_t state = state_.load(std::memory_order_acquire); |
903 | if (LIKELY( |
904 | (state & (preconditionGoalMask | kMayDefer | kHasS)) == 0 && |
905 | state_.compare_exchange_strong(state, (state | kHasE) & ~kHasU))) { |
906 | return true; |
907 | } else { |
908 | return lockExclusiveImpl(state, preconditionGoalMask, ctx); |
909 | } |
910 | } |
911 | |
912 | template <class WaitContext> |
913 | bool lockExclusiveImpl( |
914 | uint32_t& state, |
915 | uint32_t preconditionGoalMask, |
916 | WaitContext& ctx) { |
917 | while (true) { |
918 | if (UNLIKELY((state & preconditionGoalMask) != 0) && |
919 | !waitForZeroBits(state, preconditionGoalMask, kWaitingE, ctx) && |
920 | ctx.canTimeOut()) { |
921 | return false; |
922 | } |
923 | |
924 | uint32_t after = (state & kMayDefer) == 0 ? 0 : kPrevDefer; |
925 | if (!kReaderPriority || (state & (kMayDefer | kHasS)) == 0) { |
926 | // Block readers immediately, either because we are in write |
927 | // priority mode or because we can acquire the lock in one |
928 | // step. Note that if state has kHasU, then we are doing an |
929 | // unlock_upgrade_and_lock() and we should clear it (reader |
930 | // priority branch also does this). |
931 | after |= (state | kHasE) & ~(kHasU | kMayDefer); |
932 | } else { |
933 | after |= (state | kBegunE) & ~(kHasU | kMayDefer); |
934 | } |
935 | if (state_.compare_exchange_strong(state, after)) { |
936 | auto before = state; |
937 | state = after; |
938 | |
939 | // If we set kHasE (writer priority) then no new readers can |
940 | // arrive. If we set kBegunE then they can still enter, but |
941 | // they must be inline. Either way we need to either spin on |
942 | // deferredReaders[] slots, or inline them so that we can wait on |
943 | // kHasS to zero itself. deferredReaders[] is pointers, which on |
944 | // x86_64 are bigger than futex() can handle, so we inline the |
945 | // deferred locks instead of trying to futexWait on each slot. |
946 | // Readers are responsible for rechecking state_ after recording |
947 | // a deferred read to avoid atomicity problems between the state_ |
948 | // CAS and applyDeferredReader's reads of deferredReaders[]. |
949 | if (UNLIKELY((before & kMayDefer) != 0)) { |
950 | applyDeferredReaders(state, ctx); |
951 | } |
952 | while (true) { |
953 | assert((state & (kHasE | kBegunE)) != 0 && (state & kHasU) == 0); |
954 | if (UNLIKELY((state & kHasS) != 0) && |
955 | !waitForZeroBits(state, kHasS, kWaitingNotS, ctx) && |
956 | ctx.canTimeOut()) { |
957 | // Ugh. We blocked new readers and other writers for a while, |
958 | // but were unable to complete. Move on. On the plus side |
959 | // we can clear kWaitingNotS because nobody else can piggyback |
960 | // on it. |
961 | state = (state_ &= ~(kPrevDefer | kHasE | kBegunE | kWaitingNotS)); |
962 | wakeRegisteredWaiters(state, kWaitingE | kWaitingU | kWaitingS); |
963 | return false; |
964 | } |
965 | |
966 | if (kReaderPriority && (state & kHasE) == 0) { |
967 | assert((state & kBegunE) != 0); |
968 | if (!state_.compare_exchange_strong( |
969 | state, (state & ~kBegunE) | kHasE)) { |
970 | continue; |
971 | } |
972 | } |
973 | |
974 | return true; |
975 | } |
976 | } |
977 | } |
978 | } |
979 | |
980 | template <class WaitContext> |
981 | bool waitForZeroBits( |
982 | uint32_t& state, |
983 | uint32_t goal, |
984 | uint32_t waitMask, |
985 | WaitContext& ctx) { |
986 | uint32_t spinCount = 0; |
987 | while (true) { |
988 | state = state_.load(std::memory_order_acquire); |
989 | if ((state & goal) == 0) { |
990 | return true; |
991 | } |
992 | asm_volatile_pause(); |
993 | ++spinCount; |
994 | if (UNLIKELY(spinCount >= kMaxSpinCount)) { |
995 | return ctx.canBlock() && |
996 | yieldWaitForZeroBits(state, goal, waitMask, ctx); |
997 | } |
998 | } |
999 | } |
1000 | |
1001 | template <class WaitContext> |
1002 | bool yieldWaitForZeroBits( |
1003 | uint32_t& state, |
1004 | uint32_t goal, |
1005 | uint32_t waitMask, |
1006 | WaitContext& ctx) { |
1007 | #ifdef RUSAGE_THREAD |
1008 | struct rusage usage; |
1009 | std::memset(&usage, 0, sizeof(usage)); |
1010 | long before = -1; |
1011 | #endif |
1012 | for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount; |
1013 | ++yieldCount) { |
1014 | for (int softState = 0; softState < 3; ++softState) { |
1015 | if (softState < 2) { |
1016 | std::this_thread::yield(); |
1017 | } else { |
1018 | #ifdef RUSAGE_THREAD |
1019 | getrusage(RUSAGE_THREAD, &usage); |
1020 | #endif |
1021 | } |
1022 | if (((state = state_.load(std::memory_order_acquire)) & goal) == 0) { |
1023 | return true; |
1024 | } |
1025 | if (ctx.shouldTimeOut()) { |
1026 | return false; |
1027 | } |
1028 | } |
1029 | #ifdef RUSAGE_THREAD |
1030 | if (before >= 0 && usage.ru_nivcsw >= before + 2) { |
1031 | // One involuntary csw might just be occasional background work, |
1032 | // but if we get two in a row then we guess that there is someone |
1033 | // else who can profitably use this CPU. Fall back to futex |
1034 | break; |
1035 | } |
1036 | before = usage.ru_nivcsw; |
1037 | #endif |
1038 | } |
1039 | return futexWaitForZeroBits(state, goal, waitMask, ctx); |
1040 | } |
1041 | |
1042 | template <class WaitContext> |
1043 | bool futexWaitForZeroBits( |
1044 | uint32_t& state, |
1045 | uint32_t goal, |
1046 | uint32_t waitMask, |
1047 | WaitContext& ctx) { |
1048 | assert( |
1049 | waitMask == kWaitingNotS || waitMask == kWaitingE || |
1050 | waitMask == kWaitingU || waitMask == kWaitingS); |
1051 | |
1052 | while (true) { |
1053 | state = state_.load(std::memory_order_acquire); |
1054 | if ((state & goal) == 0) { |
1055 | return true; |
1056 | } |
1057 | |
1058 | auto after = state; |
1059 | if (waitMask == kWaitingE) { |
1060 | if ((state & kWaitingESingle) != 0) { |
1061 | after |= kWaitingEMultiple; |
1062 | } else { |
1063 | after |= kWaitingESingle; |
1064 | } |
1065 | } else { |
1066 | after |= waitMask; |
1067 | } |
1068 | |
1069 | // CAS is better than atomic |= here, because it lets us avoid |
1070 | // setting the wait flag when the goal is concurrently achieved |
1071 | if (after != state && !state_.compare_exchange_strong(state, after)) { |
1072 | continue; |
1073 | } |
1074 | |
1075 | if (!ctx.doWait(state_, after, waitMask)) { |
1076 | // timed out |
1077 | return false; |
1078 | } |
1079 | } |
1080 | } |
1081 | |
1082 | // Wakes up waiters registered in state_ as appropriate, clearing the |
  // waiting bits for anybody that was awoken.  Tries to perform direct
1084 | // single wakeup of an exclusive waiter if appropriate |
1085 | void wakeRegisteredWaiters(uint32_t& state, uint32_t wakeMask) { |
1086 | if (UNLIKELY((state & wakeMask) != 0)) { |
1087 | wakeRegisteredWaitersImpl(state, wakeMask); |
1088 | } |
1089 | } |
1090 | |
1091 | void wakeRegisteredWaitersImpl(uint32_t& state, uint32_t wakeMask) { |
    // If there are multiple lock() calls pending, only one of them will
    // actually get to wake up, so issuing futexWakeAll will make a
    // thundering herd.
1094 | // There's nothing stopping us from issuing futexWake(1) instead, |
1095 | // so long as the wait bits are still an accurate reflection of |
1096 | // the waiters. If we notice (via futexWake's return value) that |
1097 | // nobody woke up then we can try again with the normal wake-all path. |
1098 | // Note that we can't just clear the bits at that point; we need to |
1099 | // clear the bits and then issue another wakeup. |
1100 | // |
1101 | // It is possible that we wake an E waiter but an outside S grabs the |
1102 | // lock instead, at which point we should wake pending U and S waiters. |
1103 | // Rather than tracking state to make the failing E regenerate the |
1104 | // wakeup, we just disable the optimization in the case that there |
1105 | // are waiting U or S that we are eligible to wake. |
1106 | if ((wakeMask & kWaitingE) == kWaitingE && |
1107 | (state & wakeMask) == kWaitingE && |
1108 | detail::futexWake(&state_, 1, kWaitingE) > 0) { |
1109 | // somebody woke up, so leave state_ as is and clear it later |
1110 | return; |
1111 | } |
1112 | |
1113 | if ((state & wakeMask) != 0) { |
1114 | auto prev = state_.fetch_and(~wakeMask); |
1115 | if ((prev & wakeMask) != 0) { |
1116 | futexWakeAll(wakeMask); |
1117 | } |
1118 | state = prev & ~wakeMask; |
1119 | } |
1120 | } |
1121 | |
1122 | void futexWakeAll(uint32_t wakeMask) { |
1123 | detail::futexWake(&state_, std::numeric_limits<int>::max(), wakeMask); |
1124 | } |
1125 | |
1126 | DeferredReaderSlot* deferredReader(uint32_t slot) { |
1127 | return &deferredReaders[slot * kDeferredSeparationFactor]; |
1128 | } |
1129 | |
1130 | uintptr_t tokenfulSlotValue() { |
1131 | return reinterpret_cast<uintptr_t>(this); |
1132 | } |
1133 | |
1134 | uintptr_t tokenlessSlotValue() { |
1135 | return tokenfulSlotValue() | kTokenless; |
1136 | } |
1137 | |
1138 | bool slotValueIsThis(uintptr_t slotValue) { |
1139 | return (slotValue & ~kTokenless) == tokenfulSlotValue(); |
1140 | } |
1141 | |
  // Clears any deferredReaders[] that point to this, adjusting the inline
  // shared lock count to compensate.  Does some spinning and yielding
  // first, in the hope that the readers clear their own slots and spare
  // us the work.  Always finishes the application, even if ctx times out.
1146 | template <class WaitContext> |
1147 | void applyDeferredReaders(uint32_t& state, WaitContext& ctx) { |
1148 | uint32_t slot = 0; |
1149 | |
1150 | uint32_t spinCount = 0; |
1151 | while (true) { |
1152 | while (!slotValueIsThis( |
1153 | deferredReader(slot)->load(std::memory_order_acquire))) { |
1154 | if (++slot == kMaxDeferredReaders) { |
1155 | return; |
1156 | } |
1157 | } |
1158 | asm_volatile_pause(); |
1159 | if (UNLIKELY(++spinCount >= kMaxSpinCount)) { |
1160 | applyDeferredReaders(state, ctx, slot); |
1161 | return; |
1162 | } |
1163 | } |
1164 | } |
1165 | |
1166 | template <class WaitContext> |
1167 | void applyDeferredReaders(uint32_t& state, WaitContext& ctx, uint32_t slot) { |
1168 | #ifdef RUSAGE_THREAD |
1169 | struct rusage usage; |
1170 | std::memset(&usage, 0, sizeof(usage)); |
1171 | long before = -1; |
1172 | #endif |
1173 | for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount; |
1174 | ++yieldCount) { |
1175 | for (int softState = 0; softState < 3; ++softState) { |
1176 | if (softState < 2) { |
1177 | std::this_thread::yield(); |
1178 | } else { |
1179 | #ifdef RUSAGE_THREAD |
1180 | getrusage(RUSAGE_THREAD, &usage); |
1181 | #endif |
1182 | } |
1183 | while (!slotValueIsThis( |
1184 | deferredReader(slot)->load(std::memory_order_acquire))) { |
1185 | if (++slot == kMaxDeferredReaders) { |
1186 | return; |
1187 | } |
1188 | } |
1189 | if (ctx.shouldTimeOut()) { |
1190 | // finish applying immediately on timeout |
1191 | break; |
1192 | } |
1193 | } |
1194 | #ifdef RUSAGE_THREAD |
1195 | if (before >= 0 && usage.ru_nivcsw >= before + 2) { |
1196 | // heuristic says run queue is not empty |
1197 | break; |
1198 | } |
1199 | before = usage.ru_nivcsw; |
1200 | #endif |
1201 | } |
1202 | |
1203 | uint32_t movedSlotCount = 0; |
1204 | for (; slot < kMaxDeferredReaders; ++slot) { |
1205 | auto slotPtr = deferredReader(slot); |
1206 | auto slotValue = slotPtr->load(std::memory_order_acquire); |
1207 | if (slotValueIsThis(slotValue) && |
1208 | slotPtr->compare_exchange_strong(slotValue, 0)) { |
1209 | ++movedSlotCount; |
1210 | } |
1211 | } |
1212 | |
1213 | if (movedSlotCount > 0) { |
1214 | state = (state_ += movedSlotCount * kIncrHasS); |
1215 | } |
1216 | assert((state & (kHasE | kBegunE)) != 0); |
1217 | |
1218 | // if state + kIncrHasS overflows (off the end of state) then either |
    // we have 2^(32-11) readers (almost certainly an application bug)
1220 | // or we had an underflow (also a bug) |
1221 | assert(state < state + kIncrHasS); |
1222 | } |
1223 | |
  // It is straightforward to make a token-less lock_shared() and
1225 | // unlock_shared() either by making the token-less version always use |
1226 | // INLINE_SHARED mode or by removing the token version. Supporting |
1227 | // deferred operation for both types is trickier than it appears, because |
  // the purpose of the token is so that unlock_shared doesn't have to
1229 | // look in other slots for its deferred lock. Token-less unlock_shared |
1230 | // might place a deferred lock in one place and then release a different |
1231 | // slot that was originally used by the token-ful version. If this was |
1232 | // important we could solve the problem by differentiating the deferred |
1233 | // locks so that cross-variety release wouldn't occur. The best way |
1234 | // is probably to steal a bit from the pointer, making deferredLocks[] |
1235 | // an array of Atom<uintptr_t>. |
1236 | |
1237 | template <class WaitContext> |
1238 | bool lockSharedImpl(Token* token, WaitContext& ctx) { |
1239 | uint32_t state = state_.load(std::memory_order_relaxed); |
1240 | if ((state & (kHasS | kMayDefer | kHasE)) == 0 && |
1241 | state_.compare_exchange_strong(state, state + kIncrHasS)) { |
1242 | if (token != nullptr) { |
1243 | token->type_ = Token::Type::INLINE_SHARED; |
1244 | } |
1245 | return true; |
1246 | } |
1247 | return lockSharedImpl(state, token, ctx); |
1248 | } |
1249 | |
1250 | template <class WaitContext> |
1251 | bool lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx); |
1252 | |
  // Updates the state in/out argument as if the locks were made inline,
  // but does not update state_
  void cleanupTokenlessSharedDeferred(uint32_t& state) {
    for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) {
      auto slotPtr = deferredReader(i);
      auto slotValue = slotPtr->load(std::memory_order_relaxed);
      if (slotValue == tokenlessSlotValue()) {
        slotPtr->store(0, std::memory_order_relaxed);
        state += kIncrHasS;
        if ((state & kHasS) == 0) {
          break;
        }
      }
    }
  }

  bool tryUnlockTokenlessSharedDeferred();

  // Tries to clear the deferred slot recorded in a token. Returns false
  // if the slot has already been emptied (for example because a pending
  // lock() has already moved the deferred hold inline), in which case the
  // caller falls back to unlockSharedInline()
  bool tryUnlockSharedDeferred(uint32_t slot) {
    assert(slot < kMaxDeferredReaders);
    auto slotValue = tokenfulSlotValue();
    return deferredReader(slot)->compare_exchange_strong(slotValue, 0);
  }

  // Releases a shared hold that is recorded inline in state_, waking a
  // waiting writer if this was the last reader
  uint32_t unlockSharedInline() {
    uint32_t state = (state_ -= kIncrHasS);
    assert(
        (state & (kHasE | kBegunE | kMayDefer)) != 0 ||
        state < state + kIncrHasS);
    if ((state & kHasS) == 0) {
      // Only the second half of lock() can be blocked by a non-zero
      // reader count, so that's the only thing we need to wake
      wakeRegisteredWaiters(state, kWaitingNotS);
    }
    return state;
  }

  // Acquires upgrade mode: waits for the kHasSolo bits to clear, then
  // CASes kHasU into state_
  template <class WaitContext>
  bool lockUpgradeImpl(WaitContext& ctx) {
    uint32_t state;
    do {
      if (!waitForZeroBits(state, kHasSolo, kWaitingU, ctx)) {
        return false;
      }
    } while (!state_.compare_exchange_strong(state, state | kHasU));
    return true;
  }

 public:
  class ReadHolder {
    ReadHolder() : lock_(nullptr) {}

   public:
    explicit ReadHolder(const SharedMutexImpl* lock)
        : lock_(const_cast<SharedMutexImpl*>(lock)) {
      if (lock_) {
        lock_->lock_shared(token_);
      }
    }

    explicit ReadHolder(const SharedMutexImpl& lock)
        : lock_(const_cast<SharedMutexImpl*>(&lock)) {
      lock_->lock_shared(token_);
    }

    ReadHolder(ReadHolder&& rhs) noexcept
        : lock_(rhs.lock_), token_(rhs.token_) {
      rhs.lock_ = nullptr;
    }

    // Downgrade from upgrade mode
    explicit ReadHolder(UpgradeHolder&& upgraded) : lock_(upgraded.lock_) {
      assert(upgraded.lock_ != nullptr);
      upgraded.lock_ = nullptr;
      lock_->unlock_upgrade_and_lock_shared(token_);
    }

    // Downgrade from exclusive mode
    explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
      assert(writer.lock_ != nullptr);
      writer.lock_ = nullptr;
      lock_->unlock_and_lock_shared(token_);
    }

    ReadHolder& operator=(ReadHolder&& rhs) noexcept {
      std::swap(lock_, rhs.lock_);
      std::swap(token_, rhs.token_);
      return *this;
    }

    ReadHolder(const ReadHolder& rhs) = delete;
    ReadHolder& operator=(const ReadHolder& rhs) = delete;

    ~ReadHolder() {
      unlock();
    }

    void unlock() {
      if (lock_) {
        lock_->unlock_shared(token_);
        lock_ = nullptr;
      }
    }

   private:
    friend class UpgradeHolder;
    friend class WriteHolder;
    SharedMutexImpl* lock_;
    SharedMutexToken token_;
  };
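
  // For illustration, a hedged sketch of typical ReadHolder usage (the
  // mutex and data names are hypothetical, not part of this header):
  //
  //   folly::SharedMutex mutex;
  //   std::vector<int> values;                      // guarded by mutex
  //
  //   {
  //     folly::SharedMutex::ReadHolder guard(mutex); // lock_shared()
  //     for (int v : values) {
  //       // ... read-only work ...
  //     }
  //   }                                             // unlock_shared() on
  //                                                 // scope exit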

  class UpgradeHolder {
    UpgradeHolder() : lock_(nullptr) {}

   public:
    explicit UpgradeHolder(SharedMutexImpl* lock) : lock_(lock) {
      if (lock_) {
        lock_->lock_upgrade();
      }
    }

    explicit UpgradeHolder(SharedMutexImpl& lock) : lock_(&lock) {
      lock_->lock_upgrade();
    }

    // Downgrade from exclusive mode
    explicit UpgradeHolder(WriteHolder&& writer) : lock_(writer.lock_) {
      assert(writer.lock_ != nullptr);
      writer.lock_ = nullptr;
      lock_->unlock_and_lock_upgrade();
    }

    UpgradeHolder(UpgradeHolder&& rhs) noexcept : lock_(rhs.lock_) {
      rhs.lock_ = nullptr;
    }

    UpgradeHolder& operator=(UpgradeHolder&& rhs) noexcept {
      std::swap(lock_, rhs.lock_);
      return *this;
    }

    UpgradeHolder(const UpgradeHolder& rhs) = delete;
    UpgradeHolder& operator=(const UpgradeHolder& rhs) = delete;

    ~UpgradeHolder() {
      unlock();
    }

    void unlock() {
      if (lock_) {
        lock_->unlock_upgrade();
        lock_ = nullptr;
      }
    }

   private:
    friend class WriteHolder;
    friend class ReadHolder;
    SharedMutexImpl* lock_;
  };

  class WriteHolder {
    WriteHolder() : lock_(nullptr) {}

   public:
    explicit WriteHolder(SharedMutexImpl* lock) : lock_(lock) {
      if (lock_) {
        lock_->lock();
      }
    }

    explicit WriteHolder(SharedMutexImpl& lock) : lock_(&lock) {
      lock_->lock();
    }

    // Promotion from upgrade mode
    explicit WriteHolder(UpgradeHolder&& upgrade) : lock_(upgrade.lock_) {
      assert(upgrade.lock_ != nullptr);
      upgrade.lock_ = nullptr;
      lock_->unlock_upgrade_and_lock();
    }

    // README:
    //
    // It is intended that WriteHolder(ReadHolder&& rhs) does not exist.
    //
    // Shared locks (read) cannot safely upgrade to unique locks (write).
    // That upgrade path is a well-known recipe for deadlock, so we
    // explicitly disallow it.
    //
    // If you need to do a conditional mutation, you have a few options:
    // 1. Check the condition under a shared lock and release it.
    //    Then maybe check the condition again under a unique lock and
    //    maybe do the mutation.
    // 2. Check the condition once under an upgradeable lock.
    //    Then maybe upgrade the lock to a unique lock and do the mutation
    //    (sketched after these notes).
    // 3. Check the condition and maybe perform the mutation under a unique
    //    lock.
    //
    // Relevant upgradeable lock notes:
    // * At most one upgradeable lock can be held at a time for a given
    //   shared mutex, just like a unique lock.
    // * An upgradeable lock may be held concurrently with any number of
    //   shared locks.
    // * An upgradeable lock may be upgraded atomically to a unique lock.
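    //
    // For illustration, a hedged sketch of option 2 using the holders in
    // this header (the mutex, map, key, and value names are hypothetical):
    //
    //   folly::SharedMutex mutex;
    //   std::map<std::string, int> cache;              // guarded by mutex
    //
    //   folly::SharedMutex::UpgradeHolder ulock(mutex); // lock_upgrade()
    //   if (cache.count(key) == 0) {
    //     // promote; no other writer can sneak in between the check and
    //     // the mutation
    //     folly::SharedMutex::WriteHolder wlock(std::move(ulock));
    //     cache[key] = computeValue();
    //   }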

    WriteHolder(WriteHolder&& rhs) noexcept : lock_(rhs.lock_) {
      rhs.lock_ = nullptr;
    }

    WriteHolder& operator=(WriteHolder&& rhs) noexcept {
      std::swap(lock_, rhs.lock_);
      return *this;
    }

    WriteHolder(const WriteHolder& rhs) = delete;
    WriteHolder& operator=(const WriteHolder& rhs) = delete;

    ~WriteHolder() {
      unlock();
    }

    void unlock() {
      if (lock_) {
        lock_->unlock();
        lock_ = nullptr;
      }
    }

   private:
    friend class ReadHolder;
    friend class UpgradeHolder;
    SharedMutexImpl* lock_;
  };
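
  // For illustration, a hedged sketch of downgrading a write lock while
  // keeping continuous ownership of the mutex (names are hypothetical):
  //
  //   folly::SharedMutex::WriteHolder wlock(mutex);            // exclusive
  //   config = newConfig;                                      // mutate
  //   folly::SharedMutex::ReadHolder rlock(std::move(wlock));  // downgrade
  //   publish(config);                             // read under shared lock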

  // Adapters for Synchronized<>
  friend void acquireRead(SharedMutexImpl& lock) {
    lock.lock_shared();
  }
  friend void acquireReadWrite(SharedMutexImpl& lock) {
    lock.lock();
  }
  friend void releaseRead(SharedMutexImpl& lock) {
    lock.unlock_shared();
  }
  friend void releaseReadWrite(SharedMutexImpl& lock) {
    lock.unlock();
  }
  friend bool acquireRead(SharedMutexImpl& lock, unsigned int ms) {
    return lock.try_lock_shared_for(std::chrono::milliseconds(ms));
  }
  friend bool acquireReadWrite(SharedMutexImpl& lock, unsigned int ms) {
    return lock.try_lock_for(std::chrono::milliseconds(ms));
  }
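
  // For illustration, a hedged sketch of pairing this mutex with
  // folly::Synchronized (assumes folly/Synchronized.h is included; the
  // container and key names are hypothetical):
  //
  //   folly::Synchronized<std::map<std::string, int>, folly::SharedMutex>
  //       table;
  //   table.withWLock([&](auto& m) { m[key] = 1; });    // exclusive access
  //   auto copy = table.withRLock([](const auto& m) {   // shared access
  //     return m;
  //   });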
};

typedef SharedMutexImpl<true> SharedMutexReadPriority;
typedef SharedMutexImpl<false> SharedMutexWritePriority;
typedef SharedMutexWritePriority SharedMutex;
typedef SharedMutexImpl<false, void, std::atomic, false, false>
    SharedMutexSuppressTSAN;

// Prevent the compiler from instantiating these in other translation units.
// They are instantiated once in SharedMutex.cpp
extern template class SharedMutexImpl<true>;
extern template class SharedMutexImpl<false>;

template <
    bool ReaderPriority,
    typename Tag_,
    template <typename> class Atom,
    bool BlockImmediately,
    bool AnnotateForThreadSanitizer>
alignas(hardware_destructive_interference_size) typename SharedMutexImpl<
    ReaderPriority,
    Tag_,
    Atom,
    BlockImmediately,
    AnnotateForThreadSanitizer>::DeferredReaderSlot
    SharedMutexImpl<
        ReaderPriority,
        Tag_,
        Atom,
        BlockImmediately,
        AnnotateForThreadSanitizer>::deferredReaders
        [kMaxDeferredReaders * kDeferredSeparationFactor] = {};

template <
    bool ReaderPriority,
    typename Tag_,
    template <typename> class Atom,
    bool BlockImmediately,
    bool AnnotateForThreadSanitizer>
FOLLY_SHAREDMUTEX_TLS uint32_t SharedMutexImpl<
    ReaderPriority,
    Tag_,
    Atom,
    BlockImmediately,
    AnnotateForThreadSanitizer>::tls_lastTokenlessSlot = 0;

template <
    bool ReaderPriority,
    typename Tag_,
    template <typename> class Atom,
    bool BlockImmediately,
    bool AnnotateForThreadSanitizer>
FOLLY_SHAREDMUTEX_TLS uint32_t SharedMutexImpl<
    ReaderPriority,
    Tag_,
    Atom,
    BlockImmediately,
    AnnotateForThreadSanitizer>::tls_lastDeferredReaderSlot = 0;

template <
    bool ReaderPriority,
    typename Tag_,
    template <typename> class Atom,
    bool BlockImmediately,
    bool AnnotateForThreadSanitizer>
bool SharedMutexImpl<
    ReaderPriority,
    Tag_,
    Atom,
    BlockImmediately,
    AnnotateForThreadSanitizer>::tryUnlockTokenlessSharedDeferred() {
  auto bestSlot = tls_lastTokenlessSlot;
  for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) {
    auto slotPtr = deferredReader(bestSlot ^ i);
    auto slotValue = slotPtr->load(std::memory_order_relaxed);
    if (slotValue == tokenlessSlotValue() &&
        slotPtr->compare_exchange_strong(slotValue, 0)) {
      tls_lastTokenlessSlot = bestSlot ^ i;
      return true;
    }
  }
  return false;
}

template <
    bool ReaderPriority,
    typename Tag_,
    template <typename> class Atom,
    bool BlockImmediately,
    bool AnnotateForThreadSanitizer>
template <class WaitContext>
bool SharedMutexImpl<
    ReaderPriority,
    Tag_,
    Atom,
    BlockImmediately,
    AnnotateForThreadSanitizer>::
    lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx) {
  while (true) {
    if (UNLIKELY((state & kHasE) != 0) &&
        !waitForZeroBits(state, kHasE, kWaitingS, ctx) && ctx.canTimeOut()) {
      return false;
    }

    uint32_t slot = tls_lastDeferredReaderSlot;
    uintptr_t slotValue = 1; // any non-zero value will do

    bool canAlreadyDefer = (state & kMayDefer) != 0;
    bool aboveDeferThreshold =
        (state & kHasS) >= (kNumSharedToStartDeferring - 1) * kIncrHasS;
    bool drainInProgress = ReaderPriority && (state & kBegunE) != 0;
    if (canAlreadyDefer || (aboveDeferThreshold && !drainInProgress)) {
      // Try using the most recent slot first.
      slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
      if (slotValue != 0) {
        // starting point for our empty-slot search, can change after
        // calling waitForZeroBits
        uint32_t bestSlot =
            (uint32_t)folly::AccessSpreader<Atom>::current(kMaxDeferredReaders);

        // deferred readers are already enabled, or it is time to
        // enable them if we can find a slot
        for (uint32_t i = 0; i < kDeferredSearchDistance; ++i) {
          slot = bestSlot ^ i;
          assert(slot < kMaxDeferredReaders);
          slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
          if (slotValue == 0) {
            // found empty slot
            tls_lastDeferredReaderSlot = slot;
            break;
          }
        }
      }
    }

    if (slotValue != 0) {
      // not yet deferred, or no empty slots
      if (state_.compare_exchange_strong(state, state + kIncrHasS)) {
        // successfully recorded the read lock inline
        if (token != nullptr) {
          token->type_ = Token::Type::INLINE_SHARED;
        }
        return true;
      }
      // state is updated, try again
      continue;
    }

    // record that deferred readers might be in use if necessary
    if ((state & kMayDefer) == 0) {
      if (!state_.compare_exchange_strong(state, state | kMayDefer)) {
        // keep going if CAS failed because somebody else set the bit
        // for us
        if ((state & (kHasE | kMayDefer)) != kMayDefer) {
          continue;
        }
      }
      // state = state | kMayDefer;
    }

    // try to use the slot
    bool gotSlot = deferredReader(slot)->compare_exchange_strong(
        slotValue,
        token == nullptr ? tokenlessSlotValue() : tokenfulSlotValue());

    // If we got the slot, we need to verify that an exclusive lock
    // didn't happen since we last checked. If we didn't get the slot we
    // need to recheck state_ anyway to make sure we don't waste too much
    // work. It is also possible that since we checked state_ someone
    // has acquired and released the write lock, clearing kMayDefer.
    // Both cases are covered by looking for the readers-possible bit,
    // because it is off when the exclusive lock bit is set.
    state = state_.load(std::memory_order_acquire);

    if (!gotSlot) {
      continue;
    }

    if (token == nullptr) {
      tls_lastTokenlessSlot = slot;
    }

    if ((state & kMayDefer) != 0) {
      assert((state & kHasE) == 0);
      // success
      if (token != nullptr) {
        token->type_ = Token::Type::DEFERRED_SHARED;
        token->slot_ = (uint16_t)slot;
      }
      return true;
    }

    // release the slot before retrying
    if (token == nullptr) {
      // We can't rely on slot. Token-less slot values can be freed by
      // any unlock_shared(), so we need to do the full deferredReader
      // search during unlock. Unlike unlock_shared(), we can't trust
      // kPrevDefer here. This deferred lock isn't visible to lock()
      // (that's the whole reason we're undoing it) so there might have
      // subsequently been an unlock() and lock() with no intervening
      // transition to deferred mode.
      if (!tryUnlockTokenlessSharedDeferred()) {
        unlockSharedInline();
      }
    } else {
      if (!tryUnlockSharedDeferred(slot)) {
        unlockSharedInline();
      }
    }

    // We got here not because the lock was unavailable, but because
    // we lost a compare-and-swap. Try-lock is typically allowed to
    // have spurious failures, but there is no lock efficiency gain
    // from exploiting that freedom here.
  }
}

} // namespace folly