/*
 * Copyright 2017-present Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include <atomic>
#include <functional>
#include <limits>
#include <mutex>
#include <utility>

#include <glog/logging.h>

#include <folly/Indestructible.h>
#include <folly/Optional.h>
#include <folly/detail/TurnSequencer.h>
#include <folly/executors/QueuedImmediateExecutor.h>
#include <folly/synchronization/detail/ThreadCachedInts.h>
#include <folly/synchronization/detail/ThreadCachedLists.h>

// Implementation of the proposed Read-Copy-Update (RCU) C++ API
// http://open-std.org/JTC1/SC22/WG21/docs/papers/2017/p0566r3.pdf

// Overview

// This file provides a low-overhead mechanism to guarantee ordering
// between operations on shared data. In the simplest usage pattern,
// readers enter a critical section, view some state, and leave the
// critical section, while writers modify shared state and then defer
// some cleanup operations. Proper use of these classes will guarantee
// that a cleanup operation that is deferred during a reader critical
// section will not be executed until after that critical section is
// over.

// Example

// As an example, suppose we have some configuration data that gets
// periodically updated. We need to protect ourselves on *every* read
// path, even if updates are only vanishingly rare, because we don't know
// if some writer will come along and yank the data out from under us.
//
// Here's how that might look without deferral:
//
// void doSomethingWith(IPAddress host);
//
// folly::SharedMutex sm;
// ConfigData* globalConfigData;
//
// void reader() {
//   while (true) {
//     sm.lock_shared();
//     IPAddress curManagementServer = globalConfigData->managementServerIP;
//     sm.unlock_shared();
//     doSomethingWith(curManagementServer);
//   }
// }
//
// void writer() {
//   while (true) {
//     std::this_thread::sleep_for(std::chrono::seconds(60));
//     ConfigData* oldConfigData = globalConfigData;
//     ConfigData* newConfigData = loadConfigDataFromRemoteServer();
//     sm.lock();
//     globalConfigData = newConfigData;
//     sm.unlock();
//     delete oldConfigData;
//   }
// }
//
// The readers have to lock_shared and unlock_shared every iteration, even
// during the overwhelming majority of the time in which there's no writer
// present. These functions are surprisingly expensive; there's around 30ns of
// overhead per critical section on my machine.
//
// Let's get rid of the locking. The readers and writer may proceed
// concurrently. Here's how this would look:
//
// void doSomethingWith(IPAddress host);
//
// std::atomic<ConfigData*> globalConfigData;
//
// void reader() {
//   while (true) {
//     ConfigData* configData = globalConfigData.load();
//     IPAddress curManagementServer = configData->managementServerIP;
//     doSomethingWith(curManagementServer);
//   }
// }
//
// void writer() {
//   while (true) {
//     std::this_thread::sleep_for(std::chrono::seconds(60));
//     ConfigData* newConfigData = loadConfigDataFromRemoteServer();
//     globalConfigData.store(newConfigData);
//     // We can't delete the old ConfigData; we don't know when the readers
//     // will be done with it! I guess we have to leak it...
//   }
// }
//
// This works and is fast, but we don't ever reclaim the memory we
// allocated for the copy of the data. We need a way for the writer to
// know that no readers observed the old value of the pointer and are
// still using it. Tweaking this slightly, we want a way for the
// writer to say "I want some operation (deleting the old ConfigData)
// to happen, but only after I know that all readers that started
// before I requested the action have finished." The classes in this
// namespace allow this. Here's how we'd express this idea:
//
// void doSomethingWith(IPAddress host);
// std::atomic<ConfigData*> globalConfigData;
//
//
// void reader() {
//   while (true) {
//     IPAddress curManagementServer;
//     {
//       // We're about to do some reads we want to protect; if we read a
//       // pointer, we need to make sure that if the writer comes along and
//       // updates it, the writer's cleanup operation won't happen until we're
//       // done accessing the pointed-to data. We get a guard on the
//       // domain; as long as it exists, no cleanup function subsequently
//       // deferred (via rcu_retire() or rcu_domain::call()) will execute.
//       rcu_reader guard;
//       ConfigData* configData = globalConfigData.load();
//       // We created a guard before we read globalConfigData; we know that the
//       // pointer will remain valid until the guard is destroyed.
//       curManagementServer = configData->managementServerIP;
//       // Guard is released here; retired objects may be freed.
//     }
//     doSomethingWith(curManagementServer);
//   }
// }
//
// void writer() {
//
//   while (true) {
//     std::this_thread::sleep_for(std::chrono::seconds(60));
//     ConfigData* oldConfigData = globalConfigData.load();
//     ConfigData* newConfigData = loadConfigDataFromRemoteServer();
//     globalConfigData.store(newConfigData);
//     rcu_retire(oldConfigData);
//     // Alternatively, in a blocking manner:
//     //   synchronize_rcu();
//     //   delete oldConfigData;
//   }
// }
//
// This gets us close to the speed of the second solution, without
// leaking memory. An rcu_reader costs about 4 ns, faster than the
// lock_shared() / unlock_shared() pair in the more traditional
// mutex-based approach from our first attempt, and the writer
// never blocks the readers.

// Notes

// This implementation provides an rcu_domain class, and a default, global
// domain for general use, per the proposed standard API.
//
// rcu_domain:
//   A "universe" of deferred execution. Each rcu_domain has an
//   executor on which deferred functions may execute. Readers obtain
//   tokens from an rcu_domain and release them back to it.
//   rcu_domains should in general be completely separated; it's never
//   correct to pass a token from one domain to another, and
//   rcu_reader objects created on one domain do not prevent functions
//   deferred on other domains from running. It's intended that most
//   callers should only ever use the default, global domain.
//
//   Creation of a domain takes a template tag argument, which
//   defaults to RcuTag. To access different domains, you have to pass a
//   different tag. The global domain is preferred for almost all
//   purposes, unless a different executor is required.
//
//   You should use a custom rcu_domain if you can't avoid sleeping
//   during reader critical sections (because you don't want to block
//   all other users of the domain while you sleep), or if you want to
//   change the default executor type.
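//
//   As a rough sketch (MyRcuTag, myDomain, myExecutor, Widget, and oldWidget
//   are illustrative names, not part of this header), a custom domain might
//   be set up and used like this:
//
//     struct MyRcuTag;  // tag type only; never instantiated
//     folly::rcu_domain<MyRcuTag> myDomain(myExecutor);
//
//     {
//       // Guards only this domain; the default domain is unaffected.
//       folly::rcu_reader_domain<MyRcuTag> guard(&myDomain);
//       // ... protected reads ...
//     }
//
//     // Defer cleanup through the same domain.
//     folly::rcu_retire<Widget, std::default_delete<Widget>, MyRcuTag>(
//         oldWidget, {}, &myDomain);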

// API correctness limitations:
//  - Exceptions:
//    In short, nothing about this is exception safe. Retired functions must
//    not throw exceptions from their destructors, move constructors, or call
//    operators.
//
// Performance limitations:
//  - Blocking:
//    A blocked reader will block invocation of deferred functions until it
//    becomes unblocked. Sleeping while holding an rcu_reader can have bad
//    performance consequences.
//
// API questions you might have:
//  - Nested critical sections:
//    These are fine. The following is explicitly allowed by the standard, up
//    to a nesting level of 100:
//      rcu_reader reader1;
//      doSomeWork();
//      rcu_reader reader2;
//      doSomeMoreWork();
//  - Restrictions on retire()d functions:
//    Any operation is safe from within a retired function's execution; you
//    may retire additional objects or enqueue additional calls to the domain
//    (see the sketch after this list).
//  - rcu_domain destruction:
//    Destruction of a domain assumes previous synchronization: any remaining
//    call() and retire() callbacks are immediately handed to the executor.
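//
//    A minimal sketch of retiring from inside a retired callback (Widget and
//    oldWidget are hypothetical, not part of this header):
//
//      folly::rcu_retire(oldWidget, [](Widget* w) {
//        // Retiring further objects or enqueueing further calls from inside
//        // a retired callback is safe.
//        folly::rcu_default_domain()->call([] { /* more deferred work */ });
//        delete w;
//      });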

// Overheads

// Deferral latency is as low as is practical: the overhead involves running
// several global memory barriers on the machine to ensure all readers are gone.
//
// Currently, the use of MPMCQueue is the bottleneck for deferred calls; more
// specialized queues could be used if available, since only a single reader
// reads the queue and can splice all of its items to the executor.
//
// synchronize_rcu() call latency is on the order of 10ms. Multiple
// separate threads can share a synchronized period and should scale.
//
// rcu_retire() is a queue push on the order of 150 ns; however,
// the current implementation may synchronize if the retire queue is full,
// resulting in tail latencies of ~10ms.
//
// rcu_reader creation/destruction is ~4ns. By comparison, a
// folly::SharedMutex::lock_shared + unlock_shared pair is ~26ns.

// Hazard pointers vs. RCU:
//
// Hazard pointers protect pointers, generally malloc()d pieces of memory, and
// each hazptr only protects one such block.
//
// RCU protects critical sections: *all* memory is protected as long
// as the critical section is active.
//
// For example, this has implications for linked lists: with RCU, if you wish
// to free an entire linked list, you simply rcu_retire() each node; readers
// will see either an entirely valid list or no list at all (see the sketch
// below).
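//
// As a rough sketch (Node is a hypothetical singly-linked list node type, not
// part of this header):
//
//   void retireList(Node* head) {
//     while (head != nullptr) {
//       Node* next = head->next;
//       // Readers already inside a critical section keep seeing a fully
//       // valid list; each node is deleted only after they all exit.
//       folly::rcu_retire(head);
//       head = next;
//     }
//   }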
//
// Conversely, with hazptrs, lists are generally walked with
// hand-over-hand hazptrs, so only one node is protected at a time. Anywhere
// in the middle of the list, a hazptr may read NULL and the reader has to
// throw away all of its current work. Alternatively, reference counting can
// be used (as if each node were a shared_ptr<node>), so that readers will
// always see *the remaining part* of the list as valid; however, parts before
// their current hazptr may be freed.
//
// So roughly: RCU is simple, but an all-or-nothing affair. A single rcu_reader
// can block all reclamation. Hazptrs will reclaim exactly as much as possible,
// at the cost of extra work writing traversal code.
//
// Reproduced from
// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0461r1.pdf
//
//                                 Reference Counting  RCU           Hazptr
//
//   Unreclaimed objects           Bounded             Unbounded     Bounded
//
//   Contention among readers      High                None          None
//
//   Traversal forward progress    lock-free           wait-free     lock-free
//
//   Reclamation forward progress  lock-free           blocking      wait-free
//
//   Traversal speed               atomic              no-overhead   folly's is
//                                                                   no-overhead
//
//   Reference acquisition         unconditional       unconditional conditional
//
//   Automatic reclamation         yes                 no            no
//
//   Purpose of domains            N/A                 isolate slow  configuration
//                                                     readers

namespace folly {

struct RcuTag;

template <typename Tag = RcuTag>
class rcu_domain;

// Opaque token used to match up lock_shared() and unlock_shared()
// pairs.
template <typename Tag>
class rcu_token {
 public:
  rcu_token() {}
  ~rcu_token() = default;

  rcu_token(const rcu_token&) = delete;
  rcu_token(rcu_token&& other) = default;
  rcu_token& operator=(const rcu_token& other) = delete;
  rcu_token& operator=(rcu_token&& other) = default;

 private:
  explicit rcu_token(uint64_t epoch) : epoch_(epoch) {}

  friend class rcu_domain<Tag>;
  uint64_t epoch_;
};

// For most usages, rcu_domain is unnecessary, and you can use
// rcu_reader and rcu_retire/synchronize_rcu directly.
template <typename Tag>
class rcu_domain {
  using list_head = typename detail::ThreadCachedLists<Tag>::ListHead;
  using list_node = typename detail::ThreadCachedLists<Tag>::Node;

 public:
  /*
   * If an executor is passed, it is used to run calls and delete
   * retired objects.
   */
  explicit rcu_domain(Executor* executor = nullptr) noexcept;

  rcu_domain(const rcu_domain&) = delete;
  rcu_domain(rcu_domain&&) = delete;
  rcu_domain& operator=(const rcu_domain&) = delete;
  rcu_domain& operator=(rcu_domain&&) = delete;
  ~rcu_domain();

  // Reader locks: Prevent any calls from occurring, retired memory
  // from being freed, and synchronize() calls from completing until
  // all preceding lock_shared() sections are finished.

  // Note: may allocate on a thread's first use.
  FOLLY_ALWAYS_INLINE rcu_token<Tag> lock_shared();
  FOLLY_ALWAYS_INLINE void unlock_shared(rcu_token<Tag>&&);

  // Call a function after concurrent critical sections have finished.
  // Does not block unless the queue is full; in that case it may block
  // waiting for the scheduler thread, but it generally does not wait for
  // full synchronization.
  template <typename T>
  void call(T&& cbin);
  void retire(list_node* node) noexcept;

  // Ensure concurrent critical sections have finished.
  // Always waits for full synchronization.
  // The read lock *must not* be held.
  void synchronize() noexcept;

 private:
  detail::ThreadCachedInts<Tag> counters_;
  // Global epoch.
  std::atomic<uint64_t> version_{0};
  // Future epochs being driven by threads in synchronize().
  std::atomic<uint64_t> work_{0};
  // Matches version_, for waking up threads in synchronize that are
  // sharing an epoch.
  detail::TurnSequencer<std::atomic> turn_;
  // Only one thread can drive half_sync.
  std::mutex syncMutex_;
  // Currently half_sync takes ~16ms due to heavy barriers.
  // This ensures that, even if driven by a single thread, max GC time
  // is limited to ~1% of total CPU time.
  static constexpr uint64_t syncTimePeriod_{1600 * 2 /* full sync is 2x */};
  std::atomic<uint64_t> syncTime_{0};
  // call()s waiting to move through two epochs.
  detail::ThreadCachedLists<Tag> q_;
  // Executor callbacks will eventually be run on.
  Executor* executor_{nullptr};
  static bool singleton_; // Ensure uniqueness per-tag.

  // Queues for callbacks waiting to go through two epochs.
  list_head queues_[2]{};

  // Move queues through one epoch (half of a full synchronize()).
  // Will block waiting for readers to exit if blocking is true.
  // blocking must *not* be true if the current thread holds the read
  // lock, or this will deadlock.
  //
  // Returns, in cbs, a list of callbacks that are ready to run.
  void half_sync(bool blocking, list_head& cbs);
};

extern folly::Indestructible<rcu_domain<RcuTag>*> rcu_default_domain_;

inline rcu_domain<RcuTag>* rcu_default_domain() {
  return *rcu_default_domain_;
}
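
// A minimal sketch of driving the domain API directly (most code should
// prefer the rcu_reader guard defined below, which does exactly this via
// RAII):
//
//   auto* domain = folly::rcu_default_domain();
//   auto token = domain->lock_shared();  // enter a reader critical section
//   // ... protected reads ...
//   domain->unlock_shared(std::move(token));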

// Main reader guard class.
template <typename Tag = RcuTag>
class rcu_reader_domain {
 public:
  explicit FOLLY_ALWAYS_INLINE rcu_reader_domain(
      rcu_domain<Tag>* domain = rcu_default_domain()) noexcept
      : epoch_(domain->lock_shared()), domain_(domain) {}
  explicit rcu_reader_domain(
      std::defer_lock_t,
      rcu_domain<Tag>* domain = rcu_default_domain()) noexcept
      : domain_(domain) {}
  rcu_reader_domain(const rcu_reader_domain&) = delete;
  rcu_reader_domain(rcu_reader_domain&& other) noexcept
      : epoch_(std::move(other.epoch_)),
        domain_(std::exchange(other.domain_, nullptr)) {}
  rcu_reader_domain& operator=(const rcu_reader_domain&) = delete;
  rcu_reader_domain& operator=(rcu_reader_domain&& other) noexcept {
    if (epoch_.has_value()) {
      domain_->unlock_shared(std::move(epoch_.value()));
    }
    epoch_ = std::move(other.epoch_);
    domain_ = std::move(other.domain_);
    return *this;
  }

  FOLLY_ALWAYS_INLINE ~rcu_reader_domain() noexcept {
    if (epoch_.has_value()) {
      domain_->unlock_shared(std::move(epoch_.value()));
    }
  }

  void swap(rcu_reader_domain& other) noexcept {
    std::swap(epoch_, other.epoch_);
    std::swap(domain_, other.domain_);
  }

  FOLLY_ALWAYS_INLINE void lock() noexcept {
    DCHECK(!epoch_.has_value());
    epoch_ = domain_->lock_shared();
  }

  FOLLY_ALWAYS_INLINE void unlock() noexcept {
    DCHECK(epoch_.has_value());
    domain_->unlock_shared(std::move(epoch_.value()));
    // Disengage the token so that the destructor (or a later lock()) does
    // not see a stale, already-released epoch.
    epoch_.reset();
  }

 private:
  Optional<rcu_token<Tag>> epoch_;
  rcu_domain<Tag>* domain_;
};

using rcu_reader = rcu_reader_domain<RcuTag>;
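
// A brief sketch of deferred acquisition with the guard, mirroring
// std::unique_lock's std::defer_lock usage:
//
//   folly::rcu_reader guard(std::defer_lock);  // no critical section yet
//   guard.lock();    // enter the critical section when it is actually needed
//   // ... protected reads ...
//   guard.unlock();  // or just let the guard go out of scope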

template <typename Tag = RcuTag>
inline void swap(
    rcu_reader_domain<Tag>& a,
    rcu_reader_domain<Tag>& b) noexcept {
  a.swap(b);
}

template <typename Tag = RcuTag>
inline void synchronize_rcu(
    rcu_domain<Tag>* domain = rcu_default_domain()) noexcept {
  domain->synchronize();
}

template <typename Tag = RcuTag>
inline void rcu_barrier(
    rcu_domain<Tag>* domain = rcu_default_domain()) noexcept {
  domain->synchronize();
}
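
// For example, a blocking reclamation sketch (globalConfigData and
// newConfigData are from the overview example at the top of this file):
//
//   ConfigData* oldConfigData = globalConfigData.exchange(newConfigData);
//   folly::synchronize_rcu();  // wait for all pre-existing critical sections
//   delete oldConfigData;      // no reader can still be using it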

// Free-function retire. Always allocates.
template <
    typename T,
    typename D = std::default_delete<T>,
    typename Tag = RcuTag>
void rcu_retire(
    T* p,
    D d = {},
    rcu_domain<Tag>* domain = rcu_default_domain()) {
  domain->call([p, del = std::move(d)]() { del(p); });
}
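
// Usage sketch (Widget and closeWidget are hypothetical; any pointer plus an
// optional custom deleter works):
//
//   folly::rcu_retire(oldWidget);  // freed with std::default_delete<Widget>
//   folly::rcu_retire(otherWidget, [](Widget* w) { closeWidget(w); });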

// Base class for rcu objects. retire() will use preallocated storage
// from rcu_obj_base, vs. rcu_retire() which always allocates.
template <
    typename T,
    typename D = std::default_delete<T>,
    typename Tag = RcuTag>
class rcu_obj_base : detail::ThreadCachedListsBase::Node {
 public:
  void retire(D d = {}, rcu_domain<Tag>* domain = rcu_default_domain()) {
    // This implementation assumes folly::Function has enough
    // inline storage for D; otherwise, it allocates.
    this->cb_ = [this, d = std::move(d)]() { d(static_cast<T*>(this)); };
    domain->retire(this);
  }
};
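
// Usage sketch (MyData, currentData, and newData are hypothetical):
//
//   class MyData : public folly::rcu_obj_base<MyData> {
//     // ... fields ...
//   };
//
//   std::atomic<MyData*> currentData;
//   ...
//   MyData* old = currentData.exchange(newData);
//   old->retire();  // deferred delete using the node storage in rcu_obj_base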

} // namespace folly

#include <folly/synchronization/Rcu-inl.h>