1 | /* |
2 | * Copyright 2017-present Facebook, Inc. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | #pragma once |
17 | |
18 | #include <atomic> |
19 | #include <functional> |
20 | #include <limits> |
21 | |
22 | #include <folly/Indestructible.h> |
23 | #include <folly/Optional.h> |
24 | #include <folly/detail/TurnSequencer.h> |
25 | #include <folly/executors/QueuedImmediateExecutor.h> |
26 | #include <folly/synchronization/detail/ThreadCachedInts.h> |
27 | #include <folly/synchronization/detail/ThreadCachedLists.h> |
28 | |
29 | // Implementation of proposed Read-Copy-Update (RCU) C++ API |
30 | // http://open-std.org/JTC1/SC22/WG21/docs/papers/2017/p0566r3.pdf |
31 | |
32 | // Overview |
33 | |
34 | // This file provides a low-overhead mechanism to guarantee ordering |
35 | // between operations on shared data. In the simplest usage pattern, |
36 | // readers enter a critical section, view some state, and leave the |
37 | // critical section, while writers modify shared state and then defer |
38 | // some cleanup operations. Proper use of these classes will guarantee |
39 | // that a cleanup operation that is deferred during a reader critical |
40 | // section will not be executed until after that critical section is |
41 | // over. |
42 | |
43 | // Example |
44 | |
45 | // As an example, suppose we have some configuration data that gets |
46 | // periodically updated. We need to protect ourselves on *every* read |
// path, even if updates are only vanishingly rare, because we don't know
48 | // if some writer will come along and yank the data out from under us. |
49 | // |
50 | // Here's how that might look without deferral: |
51 | // |
52 | // void doSomething(IPAddress host); |
53 | // |
54 | // folly::SharedMutex sm; |
55 | // ConfigData* globalConfigData; |
56 | // |
57 | // void reader() { |
58 | // while (true) { |
59 | // sm.lock_shared(); |
60 | // IPAddress curManagementServer = globalConfigData->managementServerIP; |
61 | // sm.unlock_shared(); |
62 | // doSomethingWith(curManagementServer); |
63 | // } |
64 | // } |
65 | // |
66 | // void writer() { |
67 | // while (true) { |
68 | // std::this_thread::sleep_for(std::chrono::seconds(60)); |
69 | // ConfigData* oldConfigData = globalConfigData; |
70 | // ConfigData* newConfigData = loadConfigDataFromRemoteServer(); |
71 | // sm.lock(); |
72 | // globalConfigData = newConfigData; |
73 | // sm.unlock(); |
74 | // delete oldConfigData; |
75 | // } |
76 | // } |
77 | // |
78 | // The readers have to lock_shared and unlock_shared every iteration, even |
79 | // during the overwhelming majority of the time in which there's no writer |
80 | // present. These functions are surprisingly expensive; there's around 30ns of |
81 | // overhead per critical section on my machine. |
82 | // |
83 | // Let's get rid of the locking. The readers and writer may proceed |
84 | // concurrently. Here's how this would look: |
85 | // |
86 | // void doSomething(IPAddress host); |
87 | // |
88 | // std::atomic<ConfigData*> globalConfigData; |
89 | // |
90 | // void reader() { |
91 | // while (true) { |
92 | // ConfigData* configData = globalConfigData.load(); |
93 | // IPAddress curManagementServer = configData->managementServerIP; |
94 | // doSomethingWith(curManagementServer); |
95 | // } |
96 | // } |
97 | // |
98 | // void writer() { |
99 | // while (true) { |
100 | // std::this_thread::sleep_for(std::chrono::seconds(60)); |
101 | // ConfigData* newConfigData = loadConfigDataFromRemoteServer(); |
102 | // globalConfigData.store(newConfigData); |
103 | // // We can't delete the old ConfigData; we don't know when the readers |
104 | // // will be done with it! I guess we have to leak it... |
105 | // } |
106 | // } |
107 | // |
108 | // This works and is fast, but we don't ever reclaim the memory we |
109 | // allocated for the copy of the data. We need a way for the writer to |
110 | // know that no readers observed the old value of the pointer and are |
111 | // still using it. Tweaking this slightly, we want a way for the |
112 | // writer to say "I want some operation (deleting the old ConfigData) |
113 | // to happen, but only after I know that all readers that started |
// before I requested the action have finished." The classes in this
115 | // namespace allow this. Here's how we'd express this idea: |
116 | // |
117 | // void doSomething(IPAddress host); |
118 | // std::atomic<ConfigData*> globalConfigData; |
119 | // |
120 | // |
121 | // void reader() { |
122 | // while (true) { |
123 | // IPAddress curManagementServer; |
124 | // { |
125 | // // We're about to do some reads we want to protect; if we read a |
126 | // // pointer, we need to make sure that if the writer comes along and |
127 | // // updates it, the writer's cleanup operation won't happen until we're |
128 | // // done accessing the pointed-to data. We get a Guard on that |
129 | // // domain; as long as it exists, no function subsequently passed to |
130 | // // invokeEventually will execute. |
131 | // rcu_reader guard; |
132 | // ConfigData* configData = globalConfigData.load(); |
133 | // // We created a guard before we read globalConfigData; we know that the |
134 | // // pointer will remain valid until the guard is destroyed. |
135 | // curManagementServer = configData->managementServerIP; |
136 | // // Guard is released here; retired objects may be freed. |
137 | // } |
138 | // doSomethingWith(curManagementServer); |
139 | // } |
140 | // } |
141 | // |
142 | // void writer() { |
143 | // |
144 | // while (true) { |
145 | // std::this_thread::sleep_for(std::chrono::seconds(60)); |
146 | // ConfigData* oldConfigData = globalConfigData.load(); |
147 | // ConfigData* newConfigData = loadConfigDataFromRemoteServer(); |
148 | // globalConfigData.store(newConfigData); |
149 | // rcu_retire(oldConfigData); |
150 | // // alternatively, in a blocking manner: |
151 | // // synchronize_rcu(); |
152 | // // delete oldConfigData; |
153 | // } |
154 | // } |
155 | // |
156 | // This gets us close to the speed of the second solution, without |
// leaking memory. An rcu_reader costs about 4ns, faster than the
158 | // lock_shared() / unlock_shared() pair in the more traditional |
159 | // mutex-based approach from our first attempt, and the writer |
160 | // never blocks the readers. |
161 | |
162 | // Notes |
163 | |
// This implementation provides an rcu_domain, and supplies a default
// global domain for general use, per the proposed standard API.
166 | // |
167 | // rcu_domain: |
168 | // A "universe" of deferred execution. Each rcu_domain has an |
169 | // executor on which deferred functions may execute. Readers obtain |
// tokens from an rcu_domain and release them back to it.
171 | // rcu_domains should in general be completely separated; it's never |
172 | // correct to pass a token from one domain to another, and |
173 | // rcu_reader objects created on one domain do not prevent functions |
174 | // deferred on other domains from running. It's intended that most |
175 | // callers should only ever use the default, global domain. |
176 | // |
177 | // Creation of a domain takes a template tag argument, which |
178 | // defaults to RcuTag. To access different domains, you have to pass a |
179 | // different tag. The global domain is preferred for almost all |
180 | // purposes, unless a different executor is required. |
181 | // |
182 | // You should use a custom rcu_domain if you can't avoid sleeping |
183 | // during reader critical sections (because you don't want to block |
184 | // all other users of the domain while you sleep), or you want to change |
185 | // the default executor type. |
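//
// As a sketch of such a custom domain (MyTag, Obj, and mayBlockForAWhile
// are illustrative names, not part of this API):
//
//  struct MyTag;
//  folly::rcu_domain<MyTag> myDomain; // optionally pass an Executor*
//
//  void slowReader() {
//    folly::rcu_reader_domain<MyTag> guard(&myDomain);
//    mayBlockForAWhile(); // delays reclamation on myDomain only
//  }
//
//  void writer(Obj* old) {
//    folly::rcu_retire(old, std::default_delete<Obj>(), &myDomain);
//  }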
186 | |
187 | // API correctness limitations: |
188 | // - Exceptions: |
189 | // In short, nothing about this is exception safe. retire functions should |
190 | // not throw exceptions in their destructors, move constructors or call |
191 | // operators. |
192 | // |
193 | // Performance limitations: |
194 | // - Blocking: |
195 | // A blocked reader will block invocation of deferred functions until it |
196 | // becomes unblocked. Sleeping while holding a rcu_reader can have bad |
197 | // performance consequences. |
198 | // |
199 | // API questions you might have: |
200 | // - Nested critical sections: |
201 | // These are fine. The following is explicitly allowed by the standard, up to |
202 | // a nesting level of 100: |
203 | // rcu_reader reader1; |
204 | // doSomeWork(); |
205 | // rcu_reader reader2; |
206 | // doSomeMoreWork(); |
207 | // - Restrictions on retired()ed functions: |
208 | // Any operation is safe from within a retired function's |
209 | // execution; you can retire additional functions or add a new domain call to |
210 | // the domain. |
211 | // - rcu_domain destruction: |
212 | // Destruction of a domain assumes previous synchronization: all remaining |
213 | // call and retire calls are immediately added to the executor. |
214 | |
215 | // Overheads |
216 | |
217 | // Deferral latency is as low as is practical: overhead involves running |
218 | // several global memory barriers on the machine to ensure all readers are gone. |
219 | // |
// Currently, the use of MPMCQueue is the bottleneck for deferred calls.
// More specialized queues could be used if available, since only a single
// reader consumes the queue and can splice all of its items to the
// executor at once.
223 | // |
224 | // synchronize_rcu() call latency is on the order of 10ms. Multiple |
225 | // separate threads can share a synchronized period and should scale. |
226 | // |
// rcu_retire() is a queue push, on the order of 150ns. However, the
// current implementation may synchronize if the retire queue is full,
// resulting in tail latencies of ~10ms.
230 | // |
// rcu_reader creation/destruction is ~4ns. By comparison, a
// folly::SharedMutex lock_shared + unlock_shared pair is ~26ns.
233 | |
234 | // Hazard pointers vs. RCU: |
235 | // |
236 | // Hazard pointers protect pointers, generally malloc()d pieces of memory, and |
237 | // each hazptr only protects one such block. |
238 | // |
// RCU protects critical sections: *all* memory is protected as long
// as the critical section is active.
241 | // |
242 | // For example, this has implications for linked lists: If you wish to |
243 | // free an entire linked list, you simply rcu_retire() each node with |
244 | // RCU: readers will see either an entirely valid list, or no list at |
245 | // all. |
246 | // |
// Conversely, with hazptrs, lists are generally walked with
// hand-over-hand hazptrs, so only one node is protected at a time.
// Anywhere in the middle of the list, a hazptr may read NULL and have
// to throw away all current work. Alternatively, reference counting
// can be used (as if each node were a shared_ptr<node>), so that
// readers will always see *the remaining part* of the list as valid;
// however, parts before their current hazptr may be freed.
254 | // |
255 | // So roughly: RCU is simple, but an all-or-nothing affair. A single rcu_reader |
256 | // can block all reclamation. Hazptrs will reclaim exactly as much as possible, |
// at the cost of extra work writing traversal code.
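//
// As a sketch of the RCU whole-list case (ListNode is an illustrative
// type, not part of this API):
//
//  struct ListNode { ListNode* next; /* data... */ };
//  std::atomic<ListNode*> listHead;
//
//  void freeWholeList() {
//    ListNode* node = listHead.exchange(nullptr);
//    while (node != nullptr) {
//      ListNode* next = node->next;
//      folly::rcu_retire(node); // freed only after current readers exit
//      node = next;
//    }
//  }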
258 | // |
259 | // Reproduced from |
260 | // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0461r1.pdf |
261 | // |
262 | // Reference Counting RCU Hazptr |
263 | // |
264 | // Unreclaimed objects Bounded Unbounded Bounded |
265 | // |
266 | // Contention among readers High None None |
267 | // |
268 | // Traversal forward progress lock-free wait-free lock-free |
269 | // |
270 | // Reclamation forward progress lock-free blocking wait-free |
271 | // |
272 | // Traversal speed atomic no-overhead folly's is |
273 | // no-overhead |
274 | // |
275 | // Reference acquisition unconditional unconditional conditional |
276 | // |
277 | // Automatic reclamation yes no no |
278 | // |
279 | // Purpose of domains N/A isolate slow configeration |
280 | // readers |
281 | |
282 | namespace folly { |
283 | |
284 | struct RcuTag; |
285 | |
286 | template <typename Tag = RcuTag> |
287 | class rcu_domain; |
288 | |
289 | // Opaque token used to match up lock_shared() and unlock_shared() |
290 | // pairs. |
291 | template <typename Tag> |
292 | class rcu_token { |
293 | public: |
294 | rcu_token() {} |
295 | ~rcu_token() = default; |
296 | |
297 | rcu_token(const rcu_token&) = delete; |
298 | rcu_token(rcu_token&& other) = default; |
299 | rcu_token& operator=(const rcu_token& other) = delete; |
300 | rcu_token& operator=(rcu_token&& other) = default; |
301 | |
302 | private: |
303 | explicit rcu_token(uint64_t epoch) : epoch_(epoch) {} |
304 | |
305 | friend class rcu_domain<Tag>; |
306 | uint64_t epoch_; |
307 | }; |
308 | |
309 | // For most usages, rcu_domain is unnecessary, and you can use |
310 | // rcu_reader and rcu_retire/synchronize_rcu directly. |
311 | template <typename Tag> |
312 | class rcu_domain { |
313 | using list_head = typename detail::ThreadCachedLists<Tag>::ListHead; |
314 | using list_node = typename detail::ThreadCachedLists<Tag>::Node; |
315 | |
316 | public: |
317 | /* |
318 | * If an executor is passed, it is used to run calls and delete |
319 | * retired objects. |
320 | */ |
321 | explicit rcu_domain(Executor* executor = nullptr) noexcept; |
322 | |
323 | rcu_domain(const rcu_domain&) = delete; |
324 | rcu_domain(rcu_domain&&) = delete; |
325 | rcu_domain& operator=(const rcu_domain&) = delete; |
326 | rcu_domain& operator=(rcu_domain&&) = delete; |
327 | ~rcu_domain(); |
328 | |
  // Reader locks: Prevent any calls from occurring, retired memory
330 | // from being freed, and synchronize() calls from completing until |
331 | // all preceding lock_shared() sections are finished. |
332 | |
  // Note: may allocate on a thread's first use.
334 | FOLLY_ALWAYS_INLINE rcu_token<Tag> lock_shared(); |
335 | FOLLY_ALWAYS_INLINE void unlock_shared(rcu_token<Tag>&&); |
336 | |
  // Call a function after all concurrent critical sections have finished.
  // Does not block unless the queue is full, in which case it may block
  // waiting for the scheduler thread; it generally does not wait for full
  // synchronization.
341 | template <typename T> |
342 | void call(T&& cbin); |
343 | void retire(list_node* node) noexcept; |
344 | |
345 | // Ensure concurrent critical sections have finished. |
346 | // Always waits for full synchronization. |
  // The read lock *must not* be held.
348 | void synchronize() noexcept; |
349 | |
350 | private: |
351 | detail::ThreadCachedInts<Tag> counters_; |
352 | // Global epoch. |
353 | std::atomic<uint64_t> version_{0}; |
354 | // Future epochs being driven by threads in synchronize |
355 | std::atomic<uint64_t> work_{0}; |
356 | // Matches version_, for waking up threads in synchronize that are |
357 | // sharing an epoch. |
358 | detail::TurnSequencer<std::atomic> turn_; |
359 | // Only one thread can drive half_sync. |
360 | std::mutex syncMutex_; |
361 | // Currently half_sync takes ~16ms due to heavy barriers. |
362 | // Ensure that if used in a single thread, max GC time |
363 | // is limited to 1% of total CPU time. |
364 | static constexpr uint64_t syncTimePeriod_{1600 * 2 /* full sync is 2x */}; |
365 | std::atomic<uint64_t> syncTime_{0}; |
366 | // call()s waiting to move through two epochs. |
367 | detail::ThreadCachedLists<Tag> q_; |
  // Executor that callbacks will eventually run on.
369 | Executor* executor_{nullptr}; |
370 | static bool singleton_; // Ensure uniqueness per-tag. |
371 | |
372 | // Queues for callbacks waiting to go through two epochs. |
373 | list_head queues_[2]{}; |
374 | |
375 | // Move queues through one epoch (half of a full synchronize()). |
376 | // Will block waiting for readers to exit if blocking is true. |
  // blocking must *not* be true if the current thread holds the read
  // lock, or this will deadlock.
  //
  // Returns the callbacks that are ready to run via cbs.
381 | void half_sync(bool blocking, list_head& cbs); |
382 | }; |
383 | |
384 | extern folly::Indestructible<rcu_domain<RcuTag>*> rcu_default_domain_; |
385 | |
386 | inline rcu_domain<RcuTag>* rcu_default_domain() { |
387 | return *rcu_default_domain_; |
388 | } |
389 | |
390 | // Main reader guard class. |
391 | template <typename Tag = RcuTag> |
392 | class rcu_reader_domain { |
393 | public: |
394 | explicit FOLLY_ALWAYS_INLINE rcu_reader_domain( |
395 | rcu_domain<Tag>* domain = rcu_default_domain()) noexcept |
396 | : epoch_(domain->lock_shared()), domain_(domain) {} |
397 | explicit rcu_reader_domain( |
398 | std::defer_lock_t, |
399 | rcu_domain<Tag>* domain = rcu_default_domain()) noexcept |
400 | : domain_(domain) {} |
401 | rcu_reader_domain(const rcu_reader_domain&) = delete; |
402 | rcu_reader_domain(rcu_reader_domain&& other) noexcept |
403 | : epoch_(std::move(other.epoch_)), |
404 | domain_(std::exchange(other.domain_, nullptr)) {} |
405 | rcu_reader_domain& operator=(const rcu_reader_domain&) = delete; |
406 | rcu_reader_domain& operator=(rcu_reader_domain&& other) noexcept { |
407 | if (epoch_.has_value()) { |
408 | domain_->unlock_shared(std::move(epoch_.value())); |
409 | } |
410 | epoch_ = std::move(other.epoch_); |
411 | domain_ = std::move(other.domain_); |
412 | return *this; |
413 | } |
414 | |
415 | FOLLY_ALWAYS_INLINE ~rcu_reader_domain() noexcept { |
416 | if (epoch_.has_value()) { |
417 | domain_->unlock_shared(std::move(epoch_.value())); |
418 | } |
419 | } |
420 | |
421 | void swap(rcu_reader_domain& other) noexcept { |
422 | std::swap(epoch_, other.epoch_); |
423 | std::swap(domain_, other.domain_); |
424 | } |
425 | |
426 | FOLLY_ALWAYS_INLINE void lock() noexcept { |
427 | DCHECK(!epoch_.has_value()); |
428 | epoch_ = domain_->lock_shared(); |
429 | } |
430 | |
431 | FOLLY_ALWAYS_INLINE void unlock() noexcept { |
432 | DCHECK(epoch_.has_value()); |
433 | domain_->unlock_shared(std::move(epoch_.value())); |
434 | } |
435 | |
436 | private: |
437 | Optional<rcu_token<Tag>> epoch_; |
438 | rcu_domain<Tag>* domain_; |
439 | }; |
440 | |
441 | using rcu_reader = rcu_reader_domain<RcuTag>; |
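
// As a usage sketch, a guard can also be created without entering the
// critical section and locked later (std::defer_lock comes from <mutex>;
// useProtectedData is an illustrative function):
//
//  folly::rcu_reader guard(std::defer_lock); // no critical section yet
//  // ... work that needs no protection ...
//  guard.lock();   // enter the critical section
//  useProtectedData();
//  guard.unlock(); // leave it; retired objects may now be freed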
442 | |
443 | template <typename Tag = RcuTag> |
444 | inline void swap( |
445 | rcu_reader_domain<Tag>& a, |
446 | rcu_reader_domain<Tag>& b) noexcept { |
447 | a.swap(b); |
448 | } |
449 | |
450 | template <typename Tag = RcuTag> |
451 | inline void synchronize_rcu( |
452 | rcu_domain<Tag>* domain = rcu_default_domain()) noexcept { |
453 | domain->synchronize(); |
454 | } |
455 | |
456 | template <typename Tag = RcuTag> |
457 | inline void rcu_barrier( |
458 | rcu_domain<Tag>* domain = rcu_default_domain()) noexcept { |
459 | domain->synchronize(); |
460 | } |
461 | |
462 | // Free-function retire. Always allocates. |
463 | template < |
464 | typename T, |
465 | typename D = std::default_delete<T>, |
466 | typename Tag = RcuTag> |
467 | void rcu_retire( |
468 | T* p, |
469 | D d = {}, |
470 | rcu_domain<Tag>* domain = rcu_default_domain()) { |
471 | domain->call([p, del = std::move(d)]() { del(p); }); |
472 | } |
473 | |
474 | // Base class for rcu objects. retire() will use preallocated storage |
475 | // from rcu_obj_base, vs. rcu_retire() which always allocates. |
476 | template < |
477 | typename T, |
478 | typename D = std::default_delete<T>, |
479 | typename Tag = RcuTag> |
480 | class rcu_obj_base : detail::ThreadCachedListsBase::Node { |
481 | public: |
482 | void retire(D d = {}, rcu_domain<Tag>* domain = rcu_default_domain()) { |
    // This implementation assumes folly::Function has enough
    // inline storage for D; otherwise, it allocates.
485 | this->cb_ = [this, d = std::move(d)]() { d(static_cast<T*>(this)); }; |
486 | domain->retire(this); |
487 | } |
488 | }; |
489 | |
490 | } // namespace folly |
491 | |
492 | #include <folly/synchronization/Rcu-inl.h> |
493 | |