1 | /* |
2 | * Copyright 2017-present Facebook, Inc. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | #pragma once |
17 | |
18 | #include <atomic> |
19 | #include <functional> |
20 | #include <limits> |
21 | |
22 | #include <folly/Indestructible.h> |
23 | #include <folly/Optional.h> |
24 | #include <folly/detail/TurnSequencer.h> |
25 | #include <folly/executors/QueuedImmediateExecutor.h> |
26 | #include <folly/synchronization/detail/ThreadCachedInts.h> |
27 | #include <folly/synchronization/detail/ThreadCachedLists.h> |
28 | |
29 | // Implementation of proposed Read-Copy-Update (RCU) C++ API |
30 | // http://open-std.org/JTC1/SC22/WG21/docs/papers/2017/p0566r3.pdf |
31 | |
32 | // Overview |
33 | |
34 | // This file provides a low-overhead mechanism to guarantee ordering |
35 | // between operations on shared data. In the simplest usage pattern, |
36 | // readers enter a critical section, view some state, and leave the |
37 | // critical section, while writers modify shared state and then defer |
38 | // some cleanup operations. Proper use of these classes will guarantee |
39 | // that a cleanup operation that is deferred during a reader critical |
40 | // section will not be executed until after that critical section is |
41 | // over. |
42 | |
43 | // Example |
44 | |
45 | // As an example, suppose we have some configuration data that gets |
46 | // periodically updated. We need to protect ourselves on *every* read |
// path, even if updates are only vanishingly rare, because we don't know
48 | // if some writer will come along and yank the data out from under us. |
49 | // |
50 | // Here's how that might look without deferral: |
51 | // |
52 | // void doSomething(IPAddress host); |
53 | // |
54 | // folly::SharedMutex sm; |
55 | // ConfigData* globalConfigData; |
56 | // |
57 | // void reader() { |
58 | // while (true) { |
59 | // sm.lock_shared(); |
60 | // IPAddress curManagementServer = globalConfigData->managementServerIP; |
61 | // sm.unlock_shared(); |
62 | // doSomethingWith(curManagementServer); |
63 | // } |
64 | // } |
65 | // |
66 | // void writer() { |
67 | // while (true) { |
68 | // std::this_thread::sleep_for(std::chrono::seconds(60)); |
69 | // ConfigData* oldConfigData = globalConfigData; |
70 | // ConfigData* newConfigData = loadConfigDataFromRemoteServer(); |
71 | // sm.lock(); |
72 | // globalConfigData = newConfigData; |
73 | // sm.unlock(); |
74 | // delete oldConfigData; |
75 | // } |
76 | // } |
77 | // |
78 | // The readers have to lock_shared and unlock_shared every iteration, even |
79 | // during the overwhelming majority of the time in which there's no writer |
80 | // present. These functions are surprisingly expensive; there's around 30ns of |
81 | // overhead per critical section on my machine. |
82 | // |
83 | // Let's get rid of the locking. The readers and writer may proceed |
84 | // concurrently. Here's how this would look: |
85 | // |
86 | // void doSomething(IPAddress host); |
87 | // |
88 | // std::atomic<ConfigData*> globalConfigData; |
89 | // |
90 | // void reader() { |
91 | // while (true) { |
92 | // ConfigData* configData = globalConfigData.load(); |
93 | // IPAddress curManagementServer = configData->managementServerIP; |
94 | // doSomethingWith(curManagementServer); |
95 | // } |
96 | // } |
97 | // |
98 | // void writer() { |
99 | // while (true) { |
100 | // std::this_thread::sleep_for(std::chrono::seconds(60)); |
101 | // ConfigData* newConfigData = loadConfigDataFromRemoteServer(); |
102 | // globalConfigData.store(newConfigData); |
103 | // // We can't delete the old ConfigData; we don't know when the readers |
104 | // // will be done with it! I guess we have to leak it... |
105 | // } |
106 | // } |
107 | // |
108 | // This works and is fast, but we don't ever reclaim the memory we |
109 | // allocated for the copy of the data. We need a way for the writer to |
110 | // know that no readers observed the old value of the pointer and are |
111 | // still using it. Tweaking this slightly, we want a way for the |
112 | // writer to say "I want some operation (deleting the old ConfigData) |
113 | // to happen, but only after I know that all readers that started |
// before I requested the action have finished." The classes in this
115 | // namespace allow this. Here's how we'd express this idea: |
116 | // |
117 | // void doSomething(IPAddress host); |
118 | // std::atomic<ConfigData*> globalConfigData; |
119 | // |
120 | // |
121 | // void reader() { |
122 | // while (true) { |
123 | // IPAddress curManagementServer; |
124 | // { |
125 | // // We're about to do some reads we want to protect; if we read a |
126 | // // pointer, we need to make sure that if the writer comes along and |
127 | // // updates it, the writer's cleanup operation won't happen until we're |
128 | // // done accessing the pointed-to data. We get a Guard on that |
129 | // // domain; as long as it exists, no function subsequently passed to |
130 | // // invokeEventually will execute. |
131 | // rcu_reader guard; |
132 | // ConfigData* configData = globalConfigData.load(); |
133 | // // We created a guard before we read globalConfigData; we know that the |
134 | // // pointer will remain valid until the guard is destroyed. |
135 | // curManagementServer = configData->managementServerIP; |
136 | // // Guard is released here; retired objects may be freed. |
137 | // } |
138 | // doSomethingWith(curManagementServer); |
139 | // } |
140 | // } |
141 | // |
142 | // void writer() { |
143 | // |
144 | // while (true) { |
145 | // std::this_thread::sleep_for(std::chrono::seconds(60)); |
146 | // ConfigData* oldConfigData = globalConfigData.load(); |
147 | // ConfigData* newConfigData = loadConfigDataFromRemoteServer(); |
148 | // globalConfigData.store(newConfigData); |
149 | // rcu_retire(oldConfigData); |
150 | // // alternatively, in a blocking manner: |
151 | // // synchronize_rcu(); |
152 | // // delete oldConfigData; |
153 | // } |
154 | // } |
155 | // |
156 | // This gets us close to the speed of the second solution, without |
// leaking memory. An rcu_reader costs about 4ns, faster than the
158 | // lock_shared() / unlock_shared() pair in the more traditional |
159 | // mutex-based approach from our first attempt, and the writer |
160 | // never blocks the readers. |
161 | |
162 | // Notes |
163 | |
// This implementation provides an rcu_domain, and supplies a default
// global domain for general use, per the proposed standard API.
166 | // |
167 | // rcu_domain: |
168 | // A "universe" of deferred execution. Each rcu_domain has an |
169 | // executor on which deferred functions may execute. Readers obtain |
// tokens from an rcu_domain and release them back to it.
171 | // rcu_domains should in general be completely separated; it's never |
172 | // correct to pass a token from one domain to another, and |
173 | // rcu_reader objects created on one domain do not prevent functions |
174 | // deferred on other domains from running. It's intended that most |
175 | // callers should only ever use the default, global domain. |
176 | // |
177 | // Creation of a domain takes a template tag argument, which |
178 | // defaults to RcuTag. To access different domains, you have to pass a |
179 | // different tag. The global domain is preferred for almost all |
180 | // purposes, unless a different executor is required. |
181 | // |
182 | // You should use a custom rcu_domain if you can't avoid sleeping |
183 | // during reader critical sections (because you don't want to block |
184 | // all other users of the domain while you sleep), or you want to change |
185 | // the default executor type. |
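//
// As a sketch of such a custom domain (MyTag, Obj, and mayBlockForAWhile
// are illustrative names, not part of this API):
//
//  struct MyTag;
//  folly::rcu_domain<MyTag> myDomain; // optionally pass an Executor*
//
//  void slowReader() {
//    folly::rcu_reader_domain<MyTag> guard(&myDomain);
//    mayBlockForAWhile(); // delays reclamation on myDomain only
//  }
//
//  void writer(Obj* old) {
//    folly::rcu_retire(old, std::default_delete<Obj>(), &myDomain);
//  }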
186 | |
187 | // API correctness limitations: |
188 | // - Exceptions: |
189 | // In short, nothing about this is exception safe. retire functions should |
190 | // not throw exceptions in their destructors, move constructors or call |
191 | // operators. |
192 | // |
193 | // Performance limitations: |
194 | // - Blocking: |
195 | // A blocked reader will block invocation of deferred functions until it |
196 | // becomes unblocked. Sleeping while holding a rcu_reader can have bad |
197 | // performance consequences. |
198 | // |
199 | // API questions you might have: |
200 | // - Nested critical sections: |
201 | // These are fine. The following is explicitly allowed by the standard, up to |
202 | // a nesting level of 100: |
203 | // rcu_reader reader1; |
204 | // doSomeWork(); |
205 | // rcu_reader reader2; |
206 | // doSomeMoreWork(); |
207 | // - Restrictions on retired()ed functions: |
208 | // Any operation is safe from within a retired function's |
209 | // execution; you can retire additional functions or add a new domain call to |
210 | // the domain. |
211 | // - rcu_domain destruction: |
212 | // Destruction of a domain assumes previous synchronization: all remaining |
213 | // call and retire calls are immediately added to the executor. |
214 | |
215 | // Overheads |
216 | |
217 | // Deferral latency is as low as is practical: overhead involves running |
218 | // several global memory barriers on the machine to ensure all readers are gone. |
219 | // |
// Currently, the use of MPMCQueue is the bottleneck for deferred calls.
// More specialized queues could be used if available, since only a single
// reader consumes the queue and can splice all of its items to the
// executor at once.
223 | // |
224 | // synchronize_rcu() call latency is on the order of 10ms. Multiple |
225 | // separate threads can share a synchronized period and should scale. |
226 | // |
// rcu_retire() is a queue push, on the order of 150ns. However, the
// current implementation may synchronize if the retire queue is full,
// resulting in tail latencies of ~10ms.
230 | // |
// rcu_reader creation/destruction is ~4ns. By comparison, a
// folly::SharedMutex lock_shared + unlock_shared pair is ~26ns.
233 | |
234 | // Hazard pointers vs. RCU: |
235 | // |
236 | // Hazard pointers protect pointers, generally malloc()d pieces of memory, and |
237 | // each hazptr only protects one such block. |
238 | // |
// RCU protects critical sections: *all* memory is protected as long
// as the critical section is active.
241 | // |
242 | // For example, this has implications for linked lists: If you wish to |
243 | // free an entire linked list, you simply rcu_retire() each node with |
244 | // RCU: readers will see either an entirely valid list, or no list at |
245 | // all. |
246 | // |
// Conversely, with hazptrs, lists are generally walked with
// hand-over-hand hazptrs, so only one node is protected at a time.
// Anywhere in the middle of the list, a hazptr may read NULL and have
// to throw away all current work. Alternatively, reference counting
// can be used (as if each node were a shared_ptr<node>), so that
// readers will always see *the remaining part* of the list as valid;
// however, parts before their current hazptr may be freed.
254 | // |
255 | // So roughly: RCU is simple, but an all-or-nothing affair. A single rcu_reader |
256 | // can block all reclamation. Hazptrs will reclaim exactly as much as possible, |
// at the cost of extra work writing traversal code.
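//
// As a sketch of the RCU whole-list case (ListNode is an illustrative
// type, not part of this API):
//
//  struct ListNode { ListNode* next; /* data... */ };
//  std::atomic<ListNode*> listHead;
//
//  void freeWholeList() {
//    ListNode* node = listHead.exchange(nullptr);
//    while (node != nullptr) {
//      ListNode* next = node->next;
//      folly::rcu_retire(node); // freed only after current readers exit
//      node = next;
//    }
//  }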
258 | // |
259 | // Reproduced from |
260 | // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0461r1.pdf |
261 | // |
262 | // Reference Counting RCU Hazptr |
263 | // |
264 | // Unreclaimed objects Bounded Unbounded Bounded |
265 | // |
266 | // Contention among readers High None None |
267 | // |
268 | // Traversal forward progress lock-free wait-free lock-free |
269 | // |
270 | // Reclamation forward progress lock-free blocking wait-free |
271 | // |
272 | // Traversal speed atomic no-overhead folly's is |
273 | // no-overhead |
274 | // |
275 | // Reference acquisition unconditional unconditional conditional |
276 | // |
277 | // Automatic reclamation yes no no |
278 | // |
279 | // Purpose of domains N/A isolate slow configeration |
280 | // readers |
281 | |
282 | namespace folly { |
283 | |
284 | struct RcuTag; |
285 | |
286 | template <typename Tag = RcuTag> |
287 | class rcu_domain; |
288 | |
289 | // Opaque token used to match up lock_shared() and unlock_shared() |
290 | // pairs. |
291 | template <typename Tag> |
292 | class rcu_token { |
293 | public: |
294 | rcu_token() {} |
295 | ~rcu_token() = default; |
296 | |
297 | rcu_token(const rcu_token&) = delete; |
298 | rcu_token(rcu_token&& other) = default; |
299 | rcu_token& operator=(const rcu_token& other) = delete; |
300 | rcu_token& operator=(rcu_token&& other) = default; |
301 | |
302 | private: |
303 | explicit rcu_token(uint64_t epoch) : epoch_(epoch) {} |
304 | |
305 | friend class rcu_domain<Tag>; |
306 | uint64_t epoch_; |
307 | }; |
308 | |
309 | // For most usages, rcu_domain is unnecessary, and you can use |
310 | // rcu_reader and rcu_retire/synchronize_rcu directly. |
311 | template <typename Tag> |
312 | class rcu_domain { |
313 | using list_head = typename detail::ThreadCachedLists<Tag>::ListHead; |
314 | using list_node = typename detail::ThreadCachedLists<Tag>::Node; |
315 | |
316 | public: |
317 | /* |
318 | * If an executor is passed, it is used to run calls and delete |
319 | * retired objects. |
320 | */ |
321 | explicit rcu_domain(Executor* executor = nullptr) noexcept; |
322 | |
323 | rcu_domain(const rcu_domain&) = delete; |
324 | rcu_domain(rcu_domain&&) = delete; |
325 | rcu_domain& operator=(const rcu_domain&) = delete; |
326 | rcu_domain& operator=(rcu_domain&&) = delete; |
327 | ~rcu_domain(); |
328 | |
  // Reader locks: Prevent any calls from occurring, retired memory
330 | // from being freed, and synchronize() calls from completing until |
331 | // all preceding lock_shared() sections are finished. |
332 | |
  // Note: may allocate on a thread's first use.
334 | FOLLY_ALWAYS_INLINE rcu_token<Tag> lock_shared(); |
335 | FOLLY_ALWAYS_INLINE void unlock_shared(rcu_token<Tag>&&); |
336 | |
  // Call a function after all concurrent critical sections have finished.
  // Does not block unless the queue is full, in which case it may block
  // waiting for the scheduler thread; it generally does not wait for full
  // synchronization.
341 | template <typename T> |
342 | void call(T&& cbin); |
343 | void retire(list_node* node) noexcept; |
344 | |
345 | // Ensure concurrent critical sections have finished. |
346 | // Always waits for full synchronization. |
  // The read lock *must not* be held.
348 | void synchronize() noexcept; |
349 | |
350 | private: |
351 | detail::ThreadCachedInts<Tag> counters_; |
352 | // Global epoch. |
353 | std::atomic<uint64_t> version_{0}; |
354 | // Future epochs being driven by threads in synchronize |
355 | std::atomic<uint64_t> work_{0}; |
356 | // Matches version_, for waking up threads in synchronize that are |
357 | // sharing an epoch. |
358 | detail::TurnSequencer<std::atomic> turn_; |
359 | // Only one thread can drive half_sync. |
360 | std::mutex syncMutex_; |
361 | // Currently half_sync takes ~16ms due to heavy barriers. |
362 | // Ensure that if used in a single thread, max GC time |
363 | // is limited to 1% of total CPU time. |
364 | static constexpr uint64_t syncTimePeriod_{1600 * 2 /* full sync is 2x */}; |
365 | std::atomic<uint64_t> syncTime_{0}; |
366 | // call()s waiting to move through two epochs. |
367 | detail::ThreadCachedLists<Tag> q_; |
  // Executor that callbacks will eventually run on.
369 | Executor* executor_{nullptr}; |
370 | static bool singleton_; // Ensure uniqueness per-tag. |
371 | |
372 | // Queues for callbacks waiting to go through two epochs. |
373 | list_head queues_[2]{}; |
374 | |
375 | // Move queues through one epoch (half of a full synchronize()). |
376 | // Will block waiting for readers to exit if blocking is true. |
  // blocking must *not* be true if the current thread holds the read
  // lock, or this will deadlock.
  //
  // Returns the callbacks that are ready to run via cbs.
381 | void half_sync(bool blocking, list_head& cbs); |
382 | }; |
383 | |
384 | extern folly::Indestructible<rcu_domain<RcuTag>*> rcu_default_domain_; |
385 | |
386 | inline rcu_domain<RcuTag>* rcu_default_domain() { |
387 | return *rcu_default_domain_; |
388 | } |
389 | |
390 | // Main reader guard class. |
391 | template <typename Tag = RcuTag> |
392 | class rcu_reader_domain { |
393 | public: |
394 | explicit FOLLY_ALWAYS_INLINE rcu_reader_domain( |
395 | rcu_domain<Tag>* domain = rcu_default_domain()) noexcept |
396 | : epoch_(domain->lock_shared()), domain_(domain) {} |
397 | explicit rcu_reader_domain( |
398 | std::defer_lock_t, |
399 | rcu_domain<Tag>* domain = rcu_default_domain()) noexcept |
400 | : domain_(domain) {} |
401 | rcu_reader_domain(const rcu_reader_domain&) = delete; |
402 | rcu_reader_domain(rcu_reader_domain&& other) noexcept |
403 | : epoch_(std::move(other.epoch_)), |
404 | domain_(std::exchange(other.domain_, nullptr)) {} |
405 | rcu_reader_domain& operator=(const rcu_reader_domain&) = delete; |
406 | rcu_reader_domain& operator=(rcu_reader_domain&& other) noexcept { |
407 | if (epoch_.has_value()) { |
408 | domain_->unlock_shared(std::move(epoch_.value())); |
409 | } |
410 | epoch_ = std::move(other.epoch_); |
411 | domain_ = std::move(other.domain_); |
412 | return *this; |
413 | } |
414 | |
415 | FOLLY_ALWAYS_INLINE ~rcu_reader_domain() noexcept { |
416 | if (epoch_.has_value()) { |
417 | domain_->unlock_shared(std::move(epoch_.value())); |
418 | } |
419 | } |
420 | |
421 | void swap(rcu_reader_domain& other) noexcept { |
422 | std::swap(epoch_, other.epoch_); |
423 | std::swap(domain_, other.domain_); |
424 | } |
425 | |
426 | FOLLY_ALWAYS_INLINE void lock() noexcept { |
427 | DCHECK(!epoch_.has_value()); |
428 | epoch_ = domain_->lock_shared(); |
429 | } |
430 | |
431 | FOLLY_ALWAYS_INLINE void unlock() noexcept { |
432 | DCHECK(epoch_.has_value()); |
433 | domain_->unlock_shared(std::move(epoch_.value())); |
434 | } |
435 | |
436 | private: |
437 | Optional<rcu_token<Tag>> epoch_; |
438 | rcu_domain<Tag>* domain_; |
439 | }; |
440 | |
441 | using rcu_reader = rcu_reader_domain<RcuTag>; |
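
// As a usage sketch, a guard can also be created without entering the
// critical section and locked later (std::defer_lock comes from <mutex>;
// useProtectedData is an illustrative function):
//
//  folly::rcu_reader guard(std::defer_lock); // no critical section yet
//  // ... work that needs no protection ...
//  guard.lock();   // enter the critical section
//  useProtectedData();
//  guard.unlock(); // leave it; retired objects may now be freed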
442 | |
443 | template <typename Tag = RcuTag> |
444 | inline void swap( |
445 | rcu_reader_domain<Tag>& a, |
446 | rcu_reader_domain<Tag>& b) noexcept { |
447 | a.swap(b); |
448 | } |
449 | |
450 | template <typename Tag = RcuTag> |
451 | inline void synchronize_rcu( |
452 | rcu_domain<Tag>* domain = rcu_default_domain()) noexcept { |
453 | domain->synchronize(); |
454 | } |
455 | |
456 | template <typename Tag = RcuTag> |
457 | inline void rcu_barrier( |
458 | rcu_domain<Tag>* domain = rcu_default_domain()) noexcept { |
459 | domain->synchronize(); |
460 | } |
461 | |
462 | // Free-function retire. Always allocates. |
463 | template < |
464 | typename T, |
465 | typename D = std::default_delete<T>, |
466 | typename Tag = RcuTag> |
467 | void rcu_retire( |
468 | T* p, |
469 | D d = {}, |
470 | rcu_domain<Tag>* domain = rcu_default_domain()) { |
471 | domain->call([p, del = std::move(d)]() { del(p); }); |
472 | } |
473 | |
474 | // Base class for rcu objects. retire() will use preallocated storage |
475 | // from rcu_obj_base, vs. rcu_retire() which always allocates. |
476 | template < |
477 | typename T, |
478 | typename D = std::default_delete<T>, |
479 | typename Tag = RcuTag> |
480 | class rcu_obj_base : detail::ThreadCachedListsBase::Node { |
481 | public: |
482 | void retire(D d = {}, rcu_domain<Tag>* domain = rcu_default_domain()) { |
    // This implementation assumes folly::Function has enough
    // inline storage for D; otherwise, it allocates.
485 | this->cb_ = [this, d = std::move(d)]() { d(static_cast<T*>(this)); }; |
486 | domain->retire(this); |
487 | } |
488 | }; |
489 | |
490 | } // namespace folly |
491 | |
492 | #include <folly/synchronization/Rcu-inl.h> |
493 | |