/*
 * Copyright 2014-present Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <assert.h>
#include <errno.h>
#include <stdint.h>

#include <algorithm>
#include <atomic>
#include <cstddef>
#include <limits>
#include <memory>
#include <new>
#include <type_traits>
#include <utility>

#include <boost/noncopyable.hpp>
#include <folly/Portability.h>
#include <folly/concurrency/CacheLocality.h>
#include <folly/portability/SysMman.h>
#include <folly/portability/Unistd.h>
#include <folly/synchronization/AtomicStruct.h>

// Ignore shadowing warnings within this file, so includers can use -Wshadow.
FOLLY_PUSH_WARNING
FOLLY_GNU_DISABLE_WARNING("-Wshadow")

namespace folly {

namespace detail {
template <typename Pool>
struct IndexedMemPoolRecycler;
} // namespace detail

template <
    typename T,
    bool EagerRecycleWhenTrivial = false,
    bool EagerRecycleWhenNotTrivial = true>
struct IndexedMemPoolTraits {
  static constexpr bool eagerRecycle() {
    return std::is_trivial<T>::value ? EagerRecycleWhenTrivial
                                     : EagerRecycleWhenNotTrivial;
  }

  /// Called when the element pointed to by ptr is allocated for the
  /// first time.
  static void initialize(T* ptr) {
    if (!eagerRecycle()) {
      new (ptr) T();
    }
  }

  /// Called when the element pointed to by ptr is freed at pool
  /// destruction time.
  static void cleanup(T* ptr) {
    if (!eagerRecycle()) {
      ptr->~T();
    }
  }

  /// Called when the element is allocated with the arguments forwarded from
  /// IndexedMemPool::allocElem.
  template <typename... Args>
  static void onAllocate(T* ptr, Args&&... args) {
    static_assert(
        sizeof...(Args) == 0 || eagerRecycle(),
        "emplace-style allocation requires eager recycle, "
        "which is defaulted only for non-trivial types");
    if (eagerRecycle()) {
      new (ptr) T(std::forward<Args>(args)...);
    }
  }

  /// Called when the element is recycled.
  static void onRecycle(T* ptr) {
    if (eagerRecycle()) {
      ptr->~T();
    }
  }
};

/// IndexedMemPool traits that implement the lazy lifecycle strategy. In this
/// strategy, elements are default-constructed the first time they are
/// allocated, and destroyed when the pool itself is destroyed.
template <typename T>
using IndexedMemPoolTraitsLazyRecycle = IndexedMemPoolTraits<T, false, false>;

/// IndexedMemPool traits that implement the eager lifecycle strategy. In this
/// strategy, elements are constructed when they are allocated from the pool
/// and destroyed when recycled.
template <typename T>
using IndexedMemPoolTraitsEagerRecycle = IndexedMemPoolTraits<T, true, true>;
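
// For illustration only, a sketch of overriding the default lifecycle by
// passing an explicit Traits argument (MyWidget is a hypothetical
// default-constructible type; the other arguments repeat the defaults):
//
//   using LazyWidgetPool = IndexedMemPool<
//       MyWidget, 32, 200, std::atomic,
//       IndexedMemPoolTraitsLazyRecycle<MyWidget>>;
//
//   using EagerWidgetPool = IndexedMemPool<
//       MyWidget, 32, 200, std::atomic,
//       IndexedMemPoolTraitsEagerRecycle<MyWidget>>;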

/// Instances of IndexedMemPool dynamically allocate and then pool their
/// element type (T), returning 4-byte integer indices that can be passed
/// to the pool's operator[] method to access or obtain pointers to the
/// actual elements. The memory backing items returned from the pool
/// will always be readable, even if items have been returned to the pool.
/// These two features are useful for lock-free algorithms. The indexing
/// behavior makes it easy to build tagged pointer-like things, since
/// a large number of elements can be managed using fewer bits than a
/// full pointer. The access-after-free behavior makes it safe to read
/// from T-s even after they have been recycled, since it is guaranteed
/// that the memory won't have been returned to the OS and unmapped
/// (the algorithm must still use a mechanism to validate that the read
/// was correct, but it doesn't have to worry about page faults), and if
/// the elements use internal sequence numbers it can be guaranteed that
/// there won't be an ABA match due to the element being overwritten with
/// a different type that has the same bit pattern.
///
/// The object lifecycle strategy is controlled by the Traits parameter.
/// One strategy, implemented by IndexedMemPoolTraitsEagerRecycle, is to
/// construct objects when they are allocated from the pool and destroy
/// them when they are recycled. In this mode allocIndex and allocElem
/// have emplace-like semantics. In another strategy, implemented by
/// IndexedMemPoolTraitsLazyRecycle, objects are default-constructed the
/// first time they are removed from the pool, and deleted when the pool
/// itself is deleted. By default the first mode is used for non-trivial
/// T, and the second is used for trivial T. Clients can customize the
/// object lifecycle by providing their own Traits implementation.
/// See IndexedMemPoolTraits for a Traits example.
///
/// IMPORTANT: Space for extra elements is allocated to account for those
/// that are inaccessible because they are in other local lists, so the
/// actual number of items that can be allocated ranges from capacity to
/// capacity + (NumLocalLists_-1)*LocalListLimit_. This is important if
/// you are trying to maximize the capacity of the pool while constraining
/// the bit size of the resulting pointers, because the pointers will
/// actually range up to the boosted capacity. See maxIndexForCapacity
/// and capacityForMaxIndex.
///
/// To avoid contention, NumLocalLists_ free lists of limited (less than
/// or equal to LocalListLimit_) size are maintained, and each thread
/// retrieves and returns entries from its associated local list. If the
/// local list becomes too large then elements are placed in bulk in a
/// global free list. This allows items to be efficiently recirculated
/// from consumers to producers. AccessSpreader is used to access the
/// local lists, so there is no performance advantage to having more
/// local lists than L1 caches.
///
/// The pool mmap-s the entire necessary address space when the pool is
/// constructed, but delays element construction. This means that only
/// elements that are actually returned to the caller get paged into the
/// process's resident set (RSS).
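///
/// A minimal usage sketch (the element type, capacity, and values below are
/// illustrative, not part of the API):
///
///   IndexedMemPool<uint64_t> pool(1000); // capacity of at least 1000
///   uint32_t idx = pool.allocIndex();    // 0 means no element was available
///   if (idx != 0) {
///     pool[idx] = 42;                    // access the element by index
///     assert(pool.locateElem(&pool[idx]) == idx);
///     pool.recycleIndex(idx);            // return it to the pool
///   }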
template <
    typename T,
    uint32_t NumLocalLists_ = 32,
    uint32_t LocalListLimit_ = 200,
    template <typename> class Atom = std::atomic,
    typename Traits = IndexedMemPoolTraits<T>>
struct IndexedMemPool : boost::noncopyable {
  typedef T value_type;

  typedef std::unique_ptr<T, detail::IndexedMemPoolRecycler<IndexedMemPool>>
      UniquePtr;

  static_assert(LocalListLimit_ <= 255, "LocalListLimit must fit in 8 bits");
  enum {
    NumLocalLists = NumLocalLists_,
    LocalListLimit = LocalListLimit_,
  };

  static_assert(
      std::is_nothrow_default_constructible<Atom<uint32_t>>::value,
      "Atom must be nothrow default constructible");

  // these are public because clients may need to reason about the number
  // of bits required to hold indices from a pool, given its capacity

  static constexpr uint32_t maxIndexForCapacity(uint32_t capacity) {
    // index of std::numeric_limits<uint32_t>::max() is reserved for
    // isAllocated tracking
    return uint32_t(std::min(
        uint64_t(capacity) + (NumLocalLists - 1) * LocalListLimit,
        uint64_t(std::numeric_limits<uint32_t>::max() - 1)));
  }

  static constexpr uint32_t capacityForMaxIndex(uint32_t maxIndex) {
    return maxIndex - (NumLocalLists - 1) * LocalListLimit;
  }
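
  // Rough worked example (assuming the default NumLocalLists_ = 32 and
  // LocalListLimit_ = 200): maxIndexForCapacity(10000) is
  // 10000 + 31 * 200 = 16200, so indices from such a pool fit in 14 bits,
  // and capacityForMaxIndex(16200) recovers 10000.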

  /// Constructs a pool that can allocate at least _capacity_ elements,
  /// even if all the local lists are full
  explicit IndexedMemPool(uint32_t capacity)
      : actualCapacity_(maxIndexForCapacity(capacity)),
        size_(0),
        globalHead_(TaggedPtr{}) {
    const size_t needed = sizeof(Slot) * (actualCapacity_ + 1);
    size_t pagesize = size_t(sysconf(_SC_PAGESIZE));
    mmapLength_ = ((needed - 1) & ~(pagesize - 1)) + pagesize;
    assert(needed <= mmapLength_ && mmapLength_ < needed + pagesize);
    assert((mmapLength_ % pagesize) == 0);

    slots_ = static_cast<Slot*>(mmap(
        nullptr,
        mmapLength_,
        PROT_READ | PROT_WRITE,
        MAP_PRIVATE | MAP_ANONYMOUS,
        -1,
        0));
    if (slots_ == MAP_FAILED) {
      assert(errno == ENOMEM);
      throw std::bad_alloc();
    }
  }

  /// Destroys all of the contained elements
  ~IndexedMemPool() {
    using A = Atom<uint32_t>;
    for (uint32_t i = maxAllocatedIndex(); i > 0; --i) {
      Traits::cleanup(&slots_[i].elem);
      slots_[i].localNext.~A();
      slots_[i].globalNext.~A();
    }
    munmap(slots_, mmapLength_);
  }

  /// Returns a lower bound on the number of elements that may be
  /// simultaneously allocated and not yet recycled. Because of the
  /// local lists it is possible that more elements than this are returned
  /// successfully
  uint32_t capacity() {
    return capacityForMaxIndex(actualCapacity_);
  }

  /// Returns the maximum index of elements ever allocated in this pool
  /// including elements that have been recycled.
  uint32_t maxAllocatedIndex() const {
    // Take the minimum since it is possible that size_ > actualCapacity_.
    // This can happen if there are multiple concurrent requests
    // when size_ == actualCapacity_ - 1.
    return std::min(uint32_t(size_), uint32_t(actualCapacity_));
  }

  /// Finds a slot with a non-zero index, emplaces a T there if we're
  /// using the eager recycle lifecycle mode, and returns the index,
  /// or returns 0 if no elements are available. Passes a pointer to
  /// the element to Traits::onAllocate before the slot is marked as
  /// allocated.
  template <typename... Args>
  uint32_t allocIndex(Args&&... args) {
    auto idx = localPop(localHead());
    if (idx != 0) {
      Slot& s = slot(idx);
      Traits::onAllocate(&s.elem, std::forward<Args>(args)...);
      markAllocated(s);
    }
    return idx;
  }

  /// If an element is available, returns a std::unique_ptr to it that will
  /// recycle the element to the pool when it is reclaimed, otherwise returns
  /// a null (falsy) std::unique_ptr. Passes a pointer to the element to
  /// Traits::onAllocate before the slot is marked as allocated.
  template <typename... Args>
  UniquePtr allocElem(Args&&... args) {
    auto idx = allocIndex(std::forward<Args>(args)...);
    T* ptr = idx == 0 ? nullptr : &slot(idx).elem;
    return UniquePtr(ptr, typename UniquePtr::deleter_type(this));
  }
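
  // Illustrative sketch of emplace-style allocation with the eager-recycle
  // default for non-trivial types (Widget and doWork() are hypothetical):
  //
  //   IndexedMemPool<Widget> pool(100);
  //   if (auto p = pool.allocElem(42)) {  // 42 is forwarded to Widget's ctor
  //     p->doWork();
  //   }                                   // recycled when p is destroyed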

  /// Gives up ownership previously granted by allocIndex() or allocElem()
  void recycleIndex(uint32_t idx) {
    assert(isAllocated(idx));
    localPush(localHead(), idx);
  }

  /// Provides access to the pooled element referenced by idx
  T& operator[](uint32_t idx) {
    return slot(idx).elem;
  }

  /// Provides access to the pooled element referenced by idx
  const T& operator[](uint32_t idx) const {
    return slot(idx).elem;
  }

  /// If elem == &pool[idx], then pool.locateElem(elem) == idx. Also,
  /// pool.locateElem(nullptr) == 0
  uint32_t locateElem(const T* elem) const {
    if (!elem) {
      return 0;
    }

    static_assert(std::is_standard_layout<Slot>::value, "offsetof needs POD");

    auto slot = reinterpret_cast<const Slot*>(
        reinterpret_cast<const char*>(elem) - offsetof(Slot, elem));
    auto rv = uint32_t(slot - slots_);

    // this assert also tests that rv is in range
    assert(elem == &(*this)[rv]);
    return rv;
  }

  /// Returns true iff idx has been allocated (via allocIndex or allocElem)
  /// and not yet recycled via recycleIndex
  bool isAllocated(uint32_t idx) const {
    return slot(idx).localNext.load(std::memory_order_acquire) == uint32_t(-1);
  }

 private:
  ///////////// types

  struct Slot {
    T elem;
    Atom<uint32_t> localNext;
    Atom<uint32_t> globalNext;

    Slot() : localNext{}, globalNext{} {}
  };

  struct TaggedPtr {
    uint32_t idx;

    // size is bottom 8 bits, tag in top 24. g++'s code generation for
    // bitfields seems to depend on the phase of the moon, plus we can
    // do better because we can rely on other checks to avoid masking
    uint32_t tagAndSize;

    enum : uint32_t {
      SizeBits = 8,
      SizeMask = (1U << SizeBits) - 1,
      TagIncr = 1U << SizeBits,
    };

    uint32_t size() const {
      return tagAndSize & SizeMask;
    }

    TaggedPtr withSize(uint32_t repl) const {
      assert(repl <= LocalListLimit);
      return TaggedPtr{idx, (tagAndSize & ~SizeMask) | repl};
    }

    TaggedPtr withSizeIncr() const {
      assert(size() < LocalListLimit);
      return TaggedPtr{idx, tagAndSize + 1};
    }

    TaggedPtr withSizeDecr() const {
      assert(size() > 0);
      return TaggedPtr{idx, tagAndSize - 1};
    }

    TaggedPtr withIdx(uint32_t repl) const {
      return TaggedPtr{repl, tagAndSize + TagIncr};
    }

    TaggedPtr withEmpty() const {
      return withIdx(0).withSize(0);
    }
  };
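
  // Illustrative example of the packing above: tagAndSize == 0x00000203
  // encodes size() == 3 with tag 2; withIdx() adds TagIncr (0x100) so that
  // reuse of the same idx is distinguishable by the CAS loops below
  // (ABA avoidance).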

  struct alignas(hardware_destructive_interference_size) LocalList {
    AtomicStruct<TaggedPtr, Atom> head;

    LocalList() : head(TaggedPtr{}) {}
  };

  ////////// fields

  /// the number of bytes allocated from mmap, which is a multiple of
  /// the page size of the machine
  size_t mmapLength_;

  /// the actual number of slots that we will allocate, to guarantee
  /// that we will satisfy the capacity requested at construction time.
  /// They will be numbered 1..actualCapacity_ (note the 1-based counting),
  /// and occupy slots_[1..actualCapacity_].
  uint32_t actualCapacity_;

  /// this records the number of slots that have actually been constructed.
  /// To allow use of atomic ++ instead of CAS, we let this overflow.
  /// The actual number of constructed elements is min(actualCapacity_,
  /// size_)
  Atom<uint32_t> size_;

  /// raw storage, only 1..min(size_,actualCapacity_) (inclusive) are
  /// actually constructed. Note that slots_[0] is not constructed or used
  alignas(hardware_destructive_interference_size) Slot* slots_;

  /// use AccessSpreader to find your list. We use stripes instead of
  /// thread-local to avoid the need to grow or shrink on thread start
  /// or join. These are heads of lists chained with localNext
  LocalList local_[NumLocalLists];

  /// this is the head of a list of nodes chained by globalNext, each of
  /// which is itself the head of a list chained by localNext
  alignas(hardware_destructive_interference_size)
      AtomicStruct<TaggedPtr, Atom> globalHead_;

  ///////////// private methods

  uint32_t slotIndex(uint32_t idx) const {
    assert(
        0 < idx && idx <= actualCapacity_ &&
        idx <= size_.load(std::memory_order_acquire));
    return idx;
  }

  Slot& slot(uint32_t idx) {
    return slots_[slotIndex(idx)];
  }

  const Slot& slot(uint32_t idx) const {
    return slots_[slotIndex(idx)];
  }

  // localHead references a full list chained by localNext. s should
  // reference slot(localHead); it is passed as a micro-optimization
  void globalPush(Slot& s, uint32_t localHead) {
    while (true) {
      TaggedPtr gh = globalHead_.load(std::memory_order_acquire);
      s.globalNext.store(gh.idx, std::memory_order_relaxed);
      if (globalHead_.compare_exchange_strong(gh, gh.withIdx(localHead))) {
        // success
        return;
      }
    }
  }

  // idx references a single node
  void localPush(AtomicStruct<TaggedPtr, Atom>& head, uint32_t idx) {
    Slot& s = slot(idx);
    TaggedPtr h = head.load(std::memory_order_acquire);
    while (true) {
      s.localNext.store(h.idx, std::memory_order_release);
      Traits::onRecycle(&slot(idx).elem);

      if (h.size() == LocalListLimit) {
        // push will overflow local list, steal it instead
        if (head.compare_exchange_strong(h, h.withEmpty())) {
          // steal was successful, put everything in the global list
          globalPush(s, idx);
          return;
        }
      } else {
        // local list has space
        if (head.compare_exchange_strong(h, h.withIdx(idx).withSizeIncr())) {
          // success
          return;
        }
      }
      // h was updated by failing CAS
    }
  }

  // returns 0 if empty
  uint32_t globalPop() {
    while (true) {
      TaggedPtr gh = globalHead_.load(std::memory_order_acquire);
      if (gh.idx == 0 ||
          globalHead_.compare_exchange_strong(
              gh,
              gh.withIdx(
                  slot(gh.idx).globalNext.load(std::memory_order_relaxed)))) {
        // global list is empty, or pop was successful
        return gh.idx;
      }
    }
  }

  // returns 0 if allocation failed
  uint32_t localPop(AtomicStruct<TaggedPtr, Atom>& head) {
    while (true) {
      TaggedPtr h = head.load(std::memory_order_acquire);
      if (h.idx != 0) {
        // local list is non-empty, try to pop
        Slot& s = slot(h.idx);
        auto next = s.localNext.load(std::memory_order_relaxed);
        if (head.compare_exchange_strong(h, h.withIdx(next).withSizeDecr())) {
          // success
          return h.idx;
        }
        continue;
      }

      uint32_t idx = globalPop();
      if (idx == 0) {
        // global list is empty, allocate and construct new slot
        if (size_.load(std::memory_order_relaxed) >= actualCapacity_ ||
            (idx = ++size_) > actualCapacity_) {
          // allocation failed
          return 0;
        }
        Slot& s = slot(idx);
        // Atom is enforced above to be nothrow-default-constructible
        // As an optimization, use default-initialization (no parens) rather
        // than direct-initialization (with parens): these locations are
        // stored-to before they are loaded-from
        new (&s.localNext) Atom<uint32_t>;
        new (&s.globalNext) Atom<uint32_t>;
        Traits::initialize(&s.elem);
        return idx;
      }

      Slot& s = slot(idx);
      auto next = s.localNext.load(std::memory_order_relaxed);
      if (head.compare_exchange_strong(
              h, h.withIdx(next).withSize(LocalListLimit))) {
        // global list moved to local list, keep head for us
        return idx;
      }
      // local bulk push failed, return idx to the global list and try again
      globalPush(s, idx);
    }
  }

  AtomicStruct<TaggedPtr, Atom>& localHead() {
    auto stripe = AccessSpreader<Atom>::current(NumLocalLists);
    return local_[stripe].head;
  }

  void markAllocated(Slot& slot) {
    slot.localNext.store(uint32_t(-1), std::memory_order_release);
  }

 public:
  static constexpr std::size_t kSlotSize = sizeof(Slot);
};

namespace detail {

/// This is a stateful Deleter functor that allows std::unique_ptr to recycle
/// elements back to the IndexedMemPool they were allocated from, by keeping
/// track of the associated pool. See IndexedMemPool::allocElem.
template <typename Pool>
struct IndexedMemPoolRecycler {
  Pool* pool;

  explicit IndexedMemPoolRecycler(Pool* pool) : pool(pool) {}

  IndexedMemPoolRecycler(const IndexedMemPoolRecycler<Pool>& rhs) = default;
  IndexedMemPoolRecycler& operator=(const IndexedMemPoolRecycler<Pool>& rhs) =
      default;

  void operator()(typename Pool::value_type* elem) const {
    pool->recycleIndex(pool->locateElem(elem));
  }
};

} // namespace detail

} // namespace folly

FOLLY_POP_WARNING