/*
 * Copyright 2014-present Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <assert.h>
#include <errno.h>
#include <stdint.h>

#include <algorithm>
#include <atomic>
#include <cstddef>
#include <limits>
#include <memory>
#include <new>
#include <type_traits>
#include <utility>

#include <boost/noncopyable.hpp>
#include <folly/Portability.h>
#include <folly/concurrency/CacheLocality.h>
#include <folly/portability/SysMman.h>
#include <folly/portability/Unistd.h>
#include <folly/synchronization/AtomicStruct.h>

// Ignore shadowing warnings within this file, so includers can use -Wshadow.
FOLLY_PUSH_WARNING
FOLLY_GNU_DISABLE_WARNING("-Wshadow")

namespace folly {

namespace detail {
template <typename Pool>
struct IndexedMemPoolRecycler;
} // namespace detail

template <
    typename T,
    bool EagerRecycleWhenTrivial = false,
    bool EagerRecycleWhenNotTrivial = true>
struct IndexedMemPoolTraits {
  static constexpr bool eagerRecycle() {
    return std::is_trivial<T>::value ? EagerRecycleWhenTrivial
                                     : EagerRecycleWhenNotTrivial;
  }

  /// Called when the element pointed to by ptr is allocated for the
  /// first time.
  static void initialize(T* ptr) {
    if (!eagerRecycle()) {
      new (ptr) T();
    }
  }

  /// Called when the element pointed to by ptr is freed at pool
  /// destruction time.
  static void cleanup(T* ptr) {
    if (!eagerRecycle()) {
      ptr->~T();
    }
  }

  /// Called when the element is allocated with the arguments forwarded from
  /// IndexedMemPool::allocElem.
  template <typename... Args>
  static void onAllocate(T* ptr, Args&&... args) {
    static_assert(
        sizeof...(Args) == 0 || eagerRecycle(),
        "emplace-style allocation requires eager recycle, "
        "which is defaulted only for non-trivial types");
    if (eagerRecycle()) {
      new (ptr) T(std::forward<Args>(args)...);
    }
  }

  /// Called when the element is recycled.
  static void onRecycle(T* ptr) {
    if (eagerRecycle()) {
      ptr->~T();
    }
  }
};

/// IndexedMemPool traits that implement the lazy lifecycle strategy. In this
/// strategy, elements are default-constructed the first time they are
/// allocated, and destroyed when the pool itself is destroyed.
template <typename T>
using IndexedMemPoolTraitsLazyRecycle = IndexedMemPoolTraits<T, false, false>;

/// IndexedMemPool traits that implement the eager lifecycle strategy. In this
/// strategy, elements are constructed when they are allocated from the pool
/// and destroyed when recycled.
template <typename T>
using IndexedMemPoolTraitsEagerRecycle = IndexedMemPoolTraits<T, true, true>;
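
// For illustration only, a sketch of overriding the default lifecycle by
// passing an explicit Traits argument (MyWidget is a hypothetical
// default-constructible type; the other arguments repeat the defaults):
//
//   using LazyWidgetPool = IndexedMemPool<
//       MyWidget, 32, 200, std::atomic,
//       IndexedMemPoolTraitsLazyRecycle<MyWidget>>;
//
//   using EagerWidgetPool = IndexedMemPool<
//       MyWidget, 32, 200, std::atomic,
//       IndexedMemPoolTraitsEagerRecycle<MyWidget>>;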

/// Instances of IndexedMemPool dynamically allocate and then pool their
/// element type (T), returning 4-byte integer indices that can be passed
/// to the pool's operator[] method to access or obtain pointers to the
/// actual elements. The memory backing items returned from the pool
/// will always be readable, even if items have been returned to the pool.
/// These two features are useful for lock-free algorithms. The indexing
/// behavior makes it easy to build tagged pointer-like things, since
/// a large number of elements can be managed using fewer bits than a
/// full pointer. The access-after-free behavior makes it safe to read
/// from T-s even after they have been recycled, since it is guaranteed
/// that the memory won't have been returned to the OS and unmapped
/// (the algorithm must still use a mechanism to validate that the read
/// was correct, but it doesn't have to worry about page faults), and if
/// the elements use internal sequence numbers it can be guaranteed that
/// there won't be an ABA match due to the element being overwritten with
/// a different type that has the same bit pattern.
///
/// The object lifecycle strategy is controlled by the Traits parameter.
/// One strategy, implemented by IndexedMemPoolTraitsEagerRecycle, is to
/// construct objects when they are allocated from the pool and destroy
/// them when they are recycled. In this mode allocIndex and allocElem
/// have emplace-like semantics. In another strategy, implemented by
/// IndexedMemPoolTraitsLazyRecycle, objects are default-constructed the
/// first time they are removed from the pool, and deleted when the pool
/// itself is deleted. By default the first mode is used for non-trivial
/// T, and the second is used for trivial T. Clients can customize the
/// object lifecycle by providing their own Traits implementation.
/// See IndexedMemPoolTraits for a Traits example.
///
/// IMPORTANT: Space for extra elements is allocated to account for those
/// that are inaccessible because they are in other local lists, so the
/// actual number of items that can be allocated ranges from capacity to
/// capacity + (NumLocalLists_-1)*LocalListLimit_. This is important if
/// you are trying to maximize the capacity of the pool while constraining
/// the bit size of the resulting pointers, because the pointers will
/// actually range up to the boosted capacity. See maxIndexForCapacity
/// and capacityForMaxIndex.
///
/// To avoid contention, NumLocalLists_ free lists of limited (less than
/// or equal to LocalListLimit_) size are maintained, and each thread
/// retrieves and returns entries from its associated local list. If the
/// local list becomes too large then elements are placed in bulk in a
/// global free list. This allows items to be efficiently recirculated
/// from consumers to producers. AccessSpreader is used to access the
/// local lists, so there is no performance advantage to having more
/// local lists than L1 caches.
///
/// The pool mmap-s the entire necessary address space when the pool is
/// constructed, but delays element construction. This means that only
/// elements that are actually returned to the caller get paged into the
/// process's resident set (RSS).
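///
/// A minimal usage sketch (the element type, capacity, and values below are
/// illustrative, not part of the API):
///
///   IndexedMemPool<uint64_t> pool(1000); // capacity of at least 1000
///   uint32_t idx = pool.allocIndex();    // 0 means no element was available
///   if (idx != 0) {
///     pool[idx] = 42;                    // access the element by index
///     assert(pool.locateElem(&pool[idx]) == idx);
///     pool.recycleIndex(idx);            // return it to the pool
///   }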
template <
    typename T,
    uint32_t NumLocalLists_ = 32,
    uint32_t LocalListLimit_ = 200,
    template <typename> class Atom = std::atomic,
    typename Traits = IndexedMemPoolTraits<T>>
struct IndexedMemPool : boost::noncopyable {
  typedef T value_type;

  typedef std::unique_ptr<T, detail::IndexedMemPoolRecycler<IndexedMemPool>>
      UniquePtr;

  static_assert(LocalListLimit_ <= 255, "LocalListLimit must fit in 8 bits");
  enum {
    NumLocalLists = NumLocalLists_,
    LocalListLimit = LocalListLimit_,
  };

  static_assert(
      std::is_nothrow_default_constructible<Atom<uint32_t>>::value,
      "Atom must be nothrow default constructible");

  // these are public because clients may need to reason about the number
  // of bits required to hold indices from a pool, given its capacity

  static constexpr uint32_t maxIndexForCapacity(uint32_t capacity) {
    // index of std::numeric_limits<uint32_t>::max() is reserved for
    // isAllocated tracking
    return uint32_t(std::min(
        uint64_t(capacity) + (NumLocalLists - 1) * LocalListLimit,
        uint64_t(std::numeric_limits<uint32_t>::max() - 1)));
  }

  static constexpr uint32_t capacityForMaxIndex(uint32_t maxIndex) {
    return maxIndex - (NumLocalLists - 1) * LocalListLimit;
  }
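
  // Rough worked example (assuming the default NumLocalLists_ = 32 and
  // LocalListLimit_ = 200): maxIndexForCapacity(10000) is
  // 10000 + 31 * 200 = 16200, so indices from such a pool fit in 14 bits,
  // and capacityForMaxIndex(16200) recovers 10000.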

  /// Constructs a pool that can allocate at least _capacity_ elements,
  /// even if all the local lists are full
  explicit IndexedMemPool(uint32_t capacity)
      : actualCapacity_(maxIndexForCapacity(capacity)),
        size_(0),
        globalHead_(TaggedPtr{}) {
    const size_t needed = sizeof(Slot) * (actualCapacity_ + 1);
    size_t pagesize = size_t(sysconf(_SC_PAGESIZE));
    mmapLength_ = ((needed - 1) & ~(pagesize - 1)) + pagesize;
    assert(needed <= mmapLength_ && mmapLength_ < needed + pagesize);
    assert((mmapLength_ % pagesize) == 0);

    slots_ = static_cast<Slot*>(mmap(
        nullptr,
        mmapLength_,
        PROT_READ | PROT_WRITE,
        MAP_PRIVATE | MAP_ANONYMOUS,
        -1,
        0));
    if (slots_ == MAP_FAILED) {
      assert(errno == ENOMEM);
      throw std::bad_alloc();
    }
  }

  /// Destroys all of the contained elements
  ~IndexedMemPool() {
    using A = Atom<uint32_t>;
    for (uint32_t i = maxAllocatedIndex(); i > 0; --i) {
      Traits::cleanup(&slots_[i].elem);
      slots_[i].localNext.~A();
      slots_[i].globalNext.~A();
    }
    munmap(slots_, mmapLength_);
  }

  /// Returns a lower bound on the number of elements that may be
  /// simultaneously allocated and not yet recycled. Because of the
  /// local lists it is possible that more elements than this are returned
  /// successfully
  uint32_t capacity() {
    return capacityForMaxIndex(actualCapacity_);
  }

  /// Returns the maximum index of elements ever allocated in this pool
  /// including elements that have been recycled.
  uint32_t maxAllocatedIndex() const {
    // Take the minimum since it is possible that size_ > actualCapacity_.
    // This can happen if there are multiple concurrent requests
    // when size_ == actualCapacity_ - 1.
    return std::min(uint32_t(size_), uint32_t(actualCapacity_));
  }

  /// Finds a slot with a non-zero index, emplaces a T there if we're
  /// using the eager recycle lifecycle mode, and returns the index,
  /// or returns 0 if no elements are available. Passes a pointer to
  /// the element to Traits::onAllocate before the slot is marked as
  /// allocated.
  template <typename... Args>
  uint32_t allocIndex(Args&&... args) {
    auto idx = localPop(localHead());
    if (idx != 0) {
      Slot& s = slot(idx);
      Traits::onAllocate(&s.elem, std::forward<Args>(args)...);
      markAllocated(s);
    }
    return idx;
  }

  /// If an element is available, returns a std::unique_ptr to it that will
  /// recycle the element to the pool when it is reclaimed, otherwise returns
  /// a null (falsy) std::unique_ptr. Passes a pointer to the element to
  /// Traits::onAllocate before the slot is marked as allocated.
  template <typename... Args>
  UniquePtr allocElem(Args&&... args) {
    auto idx = allocIndex(std::forward<Args>(args)...);
    T* ptr = idx == 0 ? nullptr : &slot(idx).elem;
    return UniquePtr(ptr, typename UniquePtr::deleter_type(this));
  }
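
  // Illustrative sketch of emplace-style allocation with the eager-recycle
  // default for non-trivial types (Widget and doWork() are hypothetical):
  //
  //   IndexedMemPool<Widget> pool(100);
  //   if (auto p = pool.allocElem(42)) {  // 42 is forwarded to Widget's ctor
  //     p->doWork();
  //   }                                   // recycled when p is destroyed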

  /// Gives up ownership previously granted by allocIndex() or allocElem()
  void recycleIndex(uint32_t idx) {
    assert(isAllocated(idx));
    localPush(localHead(), idx);
  }

  /// Provides access to the pooled element referenced by idx
  T& operator[](uint32_t idx) {
    return slot(idx).elem;
  }

  /// Provides access to the pooled element referenced by idx
  const T& operator[](uint32_t idx) const {
    return slot(idx).elem;
  }

  /// If elem == &pool[idx], then pool.locateElem(elem) == idx. Also,
  /// pool.locateElem(nullptr) == 0
  uint32_t locateElem(const T* elem) const {
    if (!elem) {
      return 0;
    }

    static_assert(std::is_standard_layout<Slot>::value, "offsetof needs POD");

    auto slot = reinterpret_cast<const Slot*>(
        reinterpret_cast<const char*>(elem) - offsetof(Slot, elem));
    auto rv = uint32_t(slot - slots_);

    // this assert also tests that rv is in range
    assert(elem == &(*this)[rv]);
    return rv;
  }

  /// Returns true iff idx has been allocated (via allocIndex or allocElem)
  /// and not yet recycled via recycleIndex
  bool isAllocated(uint32_t idx) const {
    return slot(idx).localNext.load(std::memory_order_acquire) == uint32_t(-1);
  }

 private:
  ///////////// types

  struct Slot {
    T elem;
    Atom<uint32_t> localNext;
    Atom<uint32_t> globalNext;

    Slot() : localNext{}, globalNext{} {}
  };

  struct TaggedPtr {
    uint32_t idx;

    // size is bottom 8 bits, tag in top 24. g++'s code generation for
    // bitfields seems to depend on the phase of the moon, plus we can
    // do better because we can rely on other checks to avoid masking
    uint32_t tagAndSize;

    enum : uint32_t {
      SizeBits = 8,
      SizeMask = (1U << SizeBits) - 1,
      TagIncr = 1U << SizeBits,
    };

    uint32_t size() const {
      return tagAndSize & SizeMask;
    }

    TaggedPtr withSize(uint32_t repl) const {
      assert(repl <= LocalListLimit);
      return TaggedPtr{idx, (tagAndSize & ~SizeMask) | repl};
    }

    TaggedPtr withSizeIncr() const {
      assert(size() < LocalListLimit);
      return TaggedPtr{idx, tagAndSize + 1};
    }

    TaggedPtr withSizeDecr() const {
      assert(size() > 0);
      return TaggedPtr{idx, tagAndSize - 1};
    }

    TaggedPtr withIdx(uint32_t repl) const {
      return TaggedPtr{repl, tagAndSize + TagIncr};
    }

    TaggedPtr withEmpty() const {
      return withIdx(0).withSize(0);
    }
  };
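
  // Illustrative example of the packing above: tagAndSize == 0x00000203
  // encodes size() == 3 with tag 2; withIdx() adds TagIncr (0x100) so that
  // reuse of the same idx is distinguishable by the CAS loops below
  // (ABA avoidance).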

  struct alignas(hardware_destructive_interference_size) LocalList {
    AtomicStruct<TaggedPtr, Atom> head;

    LocalList() : head(TaggedPtr{}) {}
  };

  ////////// fields

  /// the number of bytes allocated from mmap, which is a multiple of
  /// the page size of the machine
  size_t mmapLength_;

  /// the actual number of slots that we will allocate, to guarantee
  /// that we will satisfy the capacity requested at construction time.
  /// They will be numbered 1..actualCapacity_ (note the 1-based counting),
  /// and occupy slots_[1..actualCapacity_].
  uint32_t actualCapacity_;

  /// this records the number of slots that have actually been constructed.
  /// To allow use of atomic ++ instead of CAS, we let this overflow.
  /// The actual number of constructed elements is min(actualCapacity_,
  /// size_)
  Atom<uint32_t> size_;

  /// raw storage, only 1..min(size_,actualCapacity_) (inclusive) are
  /// actually constructed. Note that slots_[0] is not constructed or used
  alignas(hardware_destructive_interference_size) Slot* slots_;

  /// use AccessSpreader to find your list. We use stripes instead of
  /// thread-local to avoid the need to grow or shrink on thread start
  /// or join. These are heads of lists chained with localNext
  LocalList local_[NumLocalLists];

  /// this is the head of a list of nodes chained by globalNext, each of
  /// which is itself the head of a list chained by localNext
  alignas(hardware_destructive_interference_size)
      AtomicStruct<TaggedPtr, Atom> globalHead_;

  ///////////// private methods

  uint32_t slotIndex(uint32_t idx) const {
    assert(
        0 < idx && idx <= actualCapacity_ &&
        idx <= size_.load(std::memory_order_acquire));
    return idx;
  }

  Slot& slot(uint32_t idx) {
    return slots_[slotIndex(idx)];
  }

  const Slot& slot(uint32_t idx) const {
    return slots_[slotIndex(idx)];
  }

  // localHead references a full list chained by localNext. s should
  // reference slot(localHead); it is passed as a micro-optimization
  void globalPush(Slot& s, uint32_t localHead) {
    while (true) {
      TaggedPtr gh = globalHead_.load(std::memory_order_acquire);
      s.globalNext.store(gh.idx, std::memory_order_relaxed);
      if (globalHead_.compare_exchange_strong(gh, gh.withIdx(localHead))) {
        // success
        return;
      }
    }
  }

  // idx references a single node
  void localPush(AtomicStruct<TaggedPtr, Atom>& head, uint32_t idx) {
    Slot& s = slot(idx);
    TaggedPtr h = head.load(std::memory_order_acquire);
    while (true) {
      s.localNext.store(h.idx, std::memory_order_release);
      Traits::onRecycle(&slot(idx).elem);

      if (h.size() == LocalListLimit) {
        // push will overflow local list, steal it instead
        if (head.compare_exchange_strong(h, h.withEmpty())) {
          // steal was successful, put everything in the global list
          globalPush(s, idx);
          return;
        }
      } else {
        // local list has space
        if (head.compare_exchange_strong(h, h.withIdx(idx).withSizeIncr())) {
          // success
          return;
        }
      }
      // h was updated by failing CAS
    }
  }

  // returns 0 if empty
  uint32_t globalPop() {
    while (true) {
      TaggedPtr gh = globalHead_.load(std::memory_order_acquire);
      if (gh.idx == 0 ||
          globalHead_.compare_exchange_strong(
              gh,
              gh.withIdx(
                  slot(gh.idx).globalNext.load(std::memory_order_relaxed)))) {
        // global list is empty, or pop was successful
        return gh.idx;
      }
    }
  }

  // returns 0 if allocation failed
  uint32_t localPop(AtomicStruct<TaggedPtr, Atom>& head) {
    while (true) {
      TaggedPtr h = head.load(std::memory_order_acquire);
      if (h.idx != 0) {
        // local list is non-empty, try to pop
        Slot& s = slot(h.idx);
        auto next = s.localNext.load(std::memory_order_relaxed);
        if (head.compare_exchange_strong(h, h.withIdx(next).withSizeDecr())) {
          // success
          return h.idx;
        }
        continue;
      }

      uint32_t idx = globalPop();
      if (idx == 0) {
        // global list is empty, allocate and construct new slot
        if (size_.load(std::memory_order_relaxed) >= actualCapacity_ ||
            (idx = ++size_) > actualCapacity_) {
          // allocation failed
          return 0;
        }
        Slot& s = slot(idx);
        // Atom is enforced above to be nothrow-default-constructible
        // As an optimization, use default-initialization (no parens) rather
        // than direct-initialization (with parens): these locations are
        // stored-to before they are loaded-from
        new (&s.localNext) Atom<uint32_t>;
        new (&s.globalNext) Atom<uint32_t>;
        Traits::initialize(&s.elem);
        return idx;
      }

      Slot& s = slot(idx);
      auto next = s.localNext.load(std::memory_order_relaxed);
      if (head.compare_exchange_strong(
              h, h.withIdx(next).withSize(LocalListLimit))) {
        // global list moved to local list, keep head for us
        return idx;
      }
      // local bulk push failed, return idx to the global list and try again
      globalPush(s, idx);
    }
  }

  AtomicStruct<TaggedPtr, Atom>& localHead() {
    auto stripe = AccessSpreader<Atom>::current(NumLocalLists);
    return local_[stripe].head;
  }

  void markAllocated(Slot& slot) {
    slot.localNext.store(uint32_t(-1), std::memory_order_release);
  }

 public:
  static constexpr std::size_t kSlotSize = sizeof(Slot);
};

namespace detail {

/// This is a stateful Deleter functor that allows std::unique_ptr to recycle
/// elements back to the IndexedMemPool they were allocated from, by keeping
/// track of the associated pool. See IndexedMemPool::allocElem.
template <typename Pool>
struct IndexedMemPoolRecycler {
  Pool* pool;

  explicit IndexedMemPoolRecycler(Pool* pool) : pool(pool) {}

  IndexedMemPoolRecycler(const IndexedMemPoolRecycler<Pool>& rhs) = default;
  IndexedMemPoolRecycler& operator=(const IndexedMemPoolRecycler<Pool>& rhs) =
      default;

  void operator()(typename Pool::value_type* elem) const {
    pool->recycleIndex(pool->locateElem(elem));
  }
};

} // namespace detail

} // namespace folly

FOLLY_POP_WARNING