1 | /* |
2 | * Copyright 2014-present Facebook, Inc. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | |
17 | #pragma once |
18 | |
19 | #include <assert.h> |
20 | #include <errno.h> |
21 | #include <stdint.h> |
22 | |
#include <algorithm>
#include <atomic>
#include <cstddef>
#include <limits>
#include <memory>
#include <new>
#include <type_traits>
#include <utility>
24 | |
25 | #include <boost/noncopyable.hpp> |
26 | #include <folly/Portability.h> |
27 | #include <folly/concurrency/CacheLocality.h> |
28 | #include <folly/portability/SysMman.h> |
29 | #include <folly/portability/Unistd.h> |
30 | #include <folly/synchronization/AtomicStruct.h> |
31 | |
32 | // Ignore shadowing warnings within this file, so includers can use -Wshadow. |
33 | FOLLY_PUSH_WARNING |
FOLLY_GNU_DISABLE_WARNING("-Wshadow")
35 | |
36 | namespace folly { |
37 | |
38 | namespace detail { |
39 | template <typename Pool> |
40 | struct IndexedMemPoolRecycler; |
41 | } // namespace detail |
42 | |
43 | template < |
44 | typename T, |
45 | bool EagerRecycleWhenTrivial = false, |
46 | bool EagerRecycleWhenNotTrivial = true> |
47 | struct IndexedMemPoolTraits { |
48 | static constexpr bool eagerRecycle() { |
49 | return std::is_trivial<T>::value ? EagerRecycleWhenTrivial |
50 | : EagerRecycleWhenNotTrivial; |
51 | } |
52 | |
53 | /// Called when the element pointed to by ptr is allocated for the |
54 | /// first time. |
55 | static void initialize(T* ptr) { |
56 | if (!eagerRecycle()) { |
57 | new (ptr) T(); |
58 | } |
59 | } |
60 | |
  /// Called when the element pointed to by ptr is freed at pool
  /// destruction time.
63 | static void cleanup(T* ptr) { |
64 | if (!eagerRecycle()) { |
65 | ptr->~T(); |
66 | } |
67 | } |
68 | |
69 | /// Called when the element is allocated with the arguments forwarded from |
70 | /// IndexedMemPool::allocElem. |
71 | template <typename... Args> |
72 | static void onAllocate(T* ptr, Args&&... args) { |
73 | static_assert( |
74 | sizeof...(Args) == 0 || eagerRecycle(), |
75 | "emplace-style allocation requires eager recycle, " |
76 | "which is defaulted only for non-trivial types" ); |
77 | if (eagerRecycle()) { |
78 | new (ptr) T(std::forward<Args>(args)...); |
79 | } |
80 | } |
81 | |
82 | /// Called when the element is recycled. |
83 | static void onRecycle(T* ptr) { |
84 | if (eagerRecycle()) { |
85 | ptr->~T(); |
86 | } |
87 | } |
88 | }; |
89 | |
/// IndexedMemPool traits that implement the lazy lifecycle strategy. In this
/// strategy, elements are default-constructed the first time they are
/// allocated, and destroyed when the pool itself is destroyed.
93 | template <typename T> |
94 | using IndexedMemPoolTraitsLazyRecycle = IndexedMemPoolTraits<T, false, false>; |
95 | |
/// IndexedMemPool traits that implement the eager lifecycle strategy. In this
/// strategy, elements are constructed when they are allocated from the pool
/// and destroyed when recycled.
99 | template <typename T> |
100 | using IndexedMemPoolTraitsEagerRecycle = IndexedMemPoolTraits<T, true, true>; |
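
// A minimal sketch of selecting a lifecycle strategy explicitly (illustrative
// only; `MyObj` is a hypothetical element type, and the other template
// arguments shown are the defaults):
//
//   // Construct on allocation, destroy on recycle:
//   folly::IndexedMemPool<MyObj, 32, 200, std::atomic,
//                         folly::IndexedMemPoolTraitsEagerRecycle<MyObj>>
//       eagerPool(100);
//
//   // Default-construct lazily on first allocation, destroy with the pool:
//   folly::IndexedMemPool<MyObj, 32, 200, std::atomic,
//                         folly::IndexedMemPoolTraitsLazyRecycle<MyObj>>
//       lazyPool(100);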
101 | |
102 | /// Instances of IndexedMemPool dynamically allocate and then pool their |
103 | /// element type (T), returning 4-byte integer indices that can be passed |
104 | /// to the pool's operator[] method to access or obtain pointers to the |
105 | /// actual elements. The memory backing items returned from the pool |
106 | /// will always be readable, even if items have been returned to the pool. |
107 | /// These two features are useful for lock-free algorithms. The indexing |
108 | /// behavior makes it easy to build tagged pointer-like-things, since |
109 | /// a large number of elements can be managed using fewer bits than a |
110 | /// full pointer. The access-after-free behavior makes it safe to read |
111 | /// from T-s even after they have been recycled, since it is guaranteed |
112 | /// that the memory won't have been returned to the OS and unmapped |
113 | /// (the algorithm must still use a mechanism to validate that the read |
114 | /// was correct, but it doesn't have to worry about page faults), and if |
115 | /// the elements use internal sequence numbers it can be guaranteed that |
116 | /// there won't be an ABA match due to the element being overwritten with |
117 | /// a different type that has the same bit pattern. |
118 | /// |
119 | /// The object lifecycle strategy is controlled by the Traits parameter. |
120 | /// One strategy, implemented by IndexedMemPoolTraitsEagerRecycle, is to |
121 | /// construct objects when they are allocated from the pool and destroy |
122 | /// them when they are recycled. In this mode allocIndex and allocElem |
123 | /// have emplace-like semantics. In another strategy, implemented by |
124 | /// IndexedMemPoolTraitsLazyRecycle, objects are default-constructed the |
125 | /// first time they are removed from the pool, and deleted when the pool |
126 | /// itself is deleted. By default the first mode is used for non-trivial |
127 | /// T, and the second is used for trivial T. Clients can customize the |
128 | /// object lifecycle by providing their own Traits implementation. |
129 | /// See IndexedMemPoolTraits for a Traits example. |
130 | /// |
131 | /// IMPORTANT: Space for extra elements is allocated to account for those |
132 | /// that are inaccessible because they are in other local lists, so the |
133 | /// actual number of items that can be allocated ranges from capacity to |
134 | /// capacity + (NumLocalLists_-1)*LocalListLimit_. This is important if |
135 | /// you are trying to maximize the capacity of the pool while constraining |
136 | /// the bit size of the resulting pointers, because the pointers will |
137 | /// actually range up to the boosted capacity. See maxIndexForCapacity |
138 | /// and capacityForMaxIndex. |
139 | /// |
140 | /// To avoid contention, NumLocalLists_ free lists of limited (less than |
141 | /// or equal to LocalListLimit_) size are maintained, and each thread |
142 | /// retrieves and returns entries from its associated local list. If the |
143 | /// local list becomes too large then elements are placed in bulk in a |
144 | /// global free list. This allows items to be efficiently recirculated |
145 | /// from consumers to producers. AccessSpreader is used to access the |
146 | /// local lists, so there is no performance advantage to having more |
147 | /// local lists than L1 caches. |
148 | /// |
149 | /// The pool mmap-s the entire necessary address space when the pool is |
150 | /// constructed, but delays element construction. This means that only |
151 | /// elements that are actually returned to the caller get paged into the |
152 | /// process's resident set (RSS). |
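///
/// A minimal usage sketch (illustrative only; `Widget` is a hypothetical
/// element type, and the default template parameters are assumed):
///
///   folly::IndexedMemPool<Widget> pool(1000);
///   uint32_t idx = pool.allocIndex();   // 0 means no element was available
///   if (idx != 0) {
///     Widget& w = pool[idx];            // access the element by index
///     // ... use w ...
///     pool.recycleIndex(idx);           // return the element to the pool
///   }
///   // RAII alternative: the unique_ptr's deleter recycles the element.
///   if (auto ptr = pool.allocElem()) {
///     // ... use *ptr ...
///   }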
153 | template < |
154 | typename T, |
155 | uint32_t NumLocalLists_ = 32, |
156 | uint32_t LocalListLimit_ = 200, |
157 | template <typename> class Atom = std::atomic, |
158 | typename Traits = IndexedMemPoolTraits<T>> |
159 | struct IndexedMemPool : boost::noncopyable { |
160 | typedef T value_type; |
161 | |
162 | typedef std::unique_ptr<T, detail::IndexedMemPoolRecycler<IndexedMemPool>> |
163 | UniquePtr; |
164 | |
  static_assert(LocalListLimit_ <= 255, "LocalListLimit must fit in 8 bits");
166 | enum { |
167 | NumLocalLists = NumLocalLists_, |
168 | LocalListLimit = LocalListLimit_, |
169 | }; |
170 | |
171 | static_assert( |
172 | std::is_nothrow_default_constructible<Atom<uint32_t>>::value, |
173 | "Atom must be nothrow default constructible" ); |
174 | |
175 | // these are public because clients may need to reason about the number |
176 | // of bits required to hold indices from a pool, given its capacity |
177 | |
178 | static constexpr uint32_t maxIndexForCapacity(uint32_t capacity) { |
179 | // index of std::numeric_limits<uint32_t>::max() is reserved for isAllocated |
180 | // tracking |
181 | return uint32_t(std::min( |
182 | uint64_t(capacity) + (NumLocalLists - 1) * LocalListLimit, |
183 | uint64_t(std::numeric_limits<uint32_t>::max() - 1))); |
184 | } |
185 | |
186 | static constexpr uint32_t capacityForMaxIndex(uint32_t maxIndex) { |
187 | return maxIndex - (NumLocalLists - 1) * LocalListLimit; |
188 | } |
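
  // For example (illustrative arithmetic only): with the default
  // NumLocalLists = 32 and LocalListLimit = 200, a requested capacity of
  // 10000 gives maxIndexForCapacity(10000) == 10000 + 31 * 200 == 16200,
  // so every index handed out by such a pool fits in 14 bits.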
189 | |
190 | /// Constructs a pool that can allocate at least _capacity_ elements, |
191 | /// even if all the local lists are full |
192 | explicit IndexedMemPool(uint32_t capacity) |
193 | : actualCapacity_(maxIndexForCapacity(capacity)), |
194 | size_(0), |
195 | globalHead_(TaggedPtr{}) { |
196 | const size_t needed = sizeof(Slot) * (actualCapacity_ + 1); |
197 | size_t pagesize = size_t(sysconf(_SC_PAGESIZE)); |
198 | mmapLength_ = ((needed - 1) & ~(pagesize - 1)) + pagesize; |
199 | assert(needed <= mmapLength_ && mmapLength_ < needed + pagesize); |
200 | assert((mmapLength_ % pagesize) == 0); |
201 | |
202 | slots_ = static_cast<Slot*>(mmap( |
203 | nullptr, |
204 | mmapLength_, |
205 | PROT_READ | PROT_WRITE, |
206 | MAP_PRIVATE | MAP_ANONYMOUS, |
207 | -1, |
208 | 0)); |
209 | if (slots_ == MAP_FAILED) { |
210 | assert(errno == ENOMEM); |
211 | throw std::bad_alloc(); |
212 | } |
213 | } |
214 | |
215 | /// Destroys all of the contained elements |
216 | ~IndexedMemPool() { |
217 | using A = Atom<uint32_t>; |
218 | for (uint32_t i = maxAllocatedIndex(); i > 0; --i) { |
219 | Traits::cleanup(&slots_[i].elem); |
220 | slots_[i].localNext.~A(); |
221 | slots_[i].globalNext.~A(); |
222 | } |
223 | munmap(slots_, mmapLength_); |
224 | } |
225 | |
  /// Returns a lower bound on the number of elements that may be
  /// simultaneously allocated and not yet recycled. Because of the
  /// local lists it is possible that more elements than this can be
  /// allocated successfully.
230 | uint32_t capacity() { |
231 | return capacityForMaxIndex(actualCapacity_); |
232 | } |
233 | |
234 | /// Returns the maximum index of elements ever allocated in this pool |
235 | /// including elements that have been recycled. |
236 | uint32_t maxAllocatedIndex() const { |
237 | // Take the minimum since it is possible that size_ > actualCapacity_. |
238 | // This can happen if there are multiple concurrent requests |
239 | // when size_ == actualCapacity_ - 1. |
240 | return std::min(uint32_t(size_), uint32_t(actualCapacity_)); |
241 | } |
242 | |
243 | /// Finds a slot with a non-zero index, emplaces a T there if we're |
244 | /// using the eager recycle lifecycle mode, and returns the index, |
245 | /// or returns 0 if no elements are available. Passes a pointer to |
246 | /// the element to Traits::onAllocate before the slot is marked as |
247 | /// allocated. |
248 | template <typename... Args> |
249 | uint32_t allocIndex(Args&&... args) { |
250 | auto idx = localPop(localHead()); |
251 | if (idx != 0) { |
252 | Slot& s = slot(idx); |
253 | Traits::onAllocate(&s.elem, std::forward<Args>(args)...); |
254 | markAllocated(s); |
255 | } |
256 | return idx; |
257 | } |
258 | |
259 | /// If an element is available, returns a std::unique_ptr to it that will |
260 | /// recycle the element to the pool when it is reclaimed, otherwise returns |
261 | /// a null (falsy) std::unique_ptr. Passes a pointer to the element to |
262 | /// Traits::onAllocate before the slot is marked as allocated. |
263 | template <typename... Args> |
264 | UniquePtr allocElem(Args&&... args) { |
265 | auto idx = allocIndex(std::forward<Args>(args)...); |
266 | T* ptr = idx == 0 ? nullptr : &slot(idx).elem; |
267 | return UniquePtr(ptr, typename UniquePtr::deleter_type(this)); |
268 | } |
269 | |
  /// Gives up ownership previously granted by allocIndex()
271 | void recycleIndex(uint32_t idx) { |
272 | assert(isAllocated(idx)); |
273 | localPush(localHead(), idx); |
274 | } |
275 | |
276 | /// Provides access to the pooled element referenced by idx |
277 | T& operator[](uint32_t idx) { |
278 | return slot(idx).elem; |
279 | } |
280 | |
281 | /// Provides access to the pooled element referenced by idx |
282 | const T& operator[](uint32_t idx) const { |
283 | return slot(idx).elem; |
284 | } |
285 | |
286 | /// If elem == &pool[idx], then pool.locateElem(elem) == idx. Also, |
287 | /// pool.locateElem(nullptr) == 0 |
288 | uint32_t locateElem(const T* elem) const { |
289 | if (!elem) { |
290 | return 0; |
291 | } |
292 | |
    static_assert(std::is_standard_layout<Slot>::value, "offsetof needs POD");
294 | |
295 | auto slot = reinterpret_cast<const Slot*>( |
296 | reinterpret_cast<const char*>(elem) - offsetof(Slot, elem)); |
297 | auto rv = uint32_t(slot - slots_); |
298 | |
299 | // this assert also tests that rv is in range |
300 | assert(elem == &(*this)[rv]); |
301 | return rv; |
302 | } |
303 | |
  /// Returns true iff idx has been allocated via allocIndex()/allocElem()
  /// and not yet passed to recycleIndex()
305 | bool isAllocated(uint32_t idx) const { |
306 | return slot(idx).localNext.load(std::memory_order_acquire) == uint32_t(-1); |
307 | } |
308 | |
309 | private: |
310 | ///////////// types |
311 | |
312 | struct Slot { |
313 | T elem; |
314 | Atom<uint32_t> localNext; |
315 | Atom<uint32_t> globalNext; |
316 | |
317 | Slot() : localNext{}, globalNext{} {} |
318 | }; |
319 | |
320 | struct TaggedPtr { |
321 | uint32_t idx; |
322 | |
323 | // size is bottom 8 bits, tag in top 24. g++'s code generation for |
324 | // bitfields seems to depend on the phase of the moon, plus we can |
325 | // do better because we can rely on other checks to avoid masking |
326 | uint32_t tagAndSize; |
327 | |
328 | enum : uint32_t { |
329 | SizeBits = 8, |
330 | SizeMask = (1U << SizeBits) - 1, |
331 | TagIncr = 1U << SizeBits, |
332 | }; |
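
    // Layout example (illustrative only): a list of size 3 whose tag has
    // been bumped 5 times stores tagAndSize == (5 << SizeBits) | 3 == 0x503.
    // withIdx() adds TagIncr (0x100), advancing the tag to defeat ABA while
    // leaving the size bits untouched.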
333 | |
334 | uint32_t size() const { |
335 | return tagAndSize & SizeMask; |
336 | } |
337 | |
338 | TaggedPtr withSize(uint32_t repl) const { |
339 | assert(repl <= LocalListLimit); |
340 | return TaggedPtr{idx, (tagAndSize & ~SizeMask) | repl}; |
341 | } |
342 | |
343 | TaggedPtr withSizeIncr() const { |
344 | assert(size() < LocalListLimit); |
345 | return TaggedPtr{idx, tagAndSize + 1}; |
346 | } |
347 | |
348 | TaggedPtr withSizeDecr() const { |
349 | assert(size() > 0); |
350 | return TaggedPtr{idx, tagAndSize - 1}; |
351 | } |
352 | |
353 | TaggedPtr withIdx(uint32_t repl) const { |
354 | return TaggedPtr{repl, tagAndSize + TagIncr}; |
355 | } |
356 | |
357 | TaggedPtr withEmpty() const { |
358 | return withIdx(0).withSize(0); |
359 | } |
360 | }; |
361 | |
362 | struct alignas(hardware_destructive_interference_size) LocalList { |
363 | AtomicStruct<TaggedPtr, Atom> head; |
364 | |
365 | LocalList() : head(TaggedPtr{}) {} |
366 | }; |
367 | |
368 | ////////// fields |
369 | |
370 | /// the number of bytes allocated from mmap, which is a multiple of |
371 | /// the page size of the machine |
372 | size_t mmapLength_; |
373 | |
374 | /// the actual number of slots that we will allocate, to guarantee |
375 | /// that we will satisfy the capacity requested at construction time. |
376 | /// They will be numbered 1..actualCapacity_ (note the 1-based counting), |
377 | /// and occupy slots_[1..actualCapacity_]. |
378 | uint32_t actualCapacity_; |
379 | |
380 | /// this records the number of slots that have actually been constructed. |
381 | /// To allow use of atomic ++ instead of CAS, we let this overflow. |
382 | /// The actual number of constructed elements is min(actualCapacity_, |
383 | /// size_) |
384 | Atom<uint32_t> size_; |
385 | |
386 | /// raw storage, only 1..min(size_,actualCapacity_) (inclusive) are |
387 | /// actually constructed. Note that slots_[0] is not constructed or used |
388 | alignas(hardware_destructive_interference_size) Slot* slots_; |
389 | |
390 | /// use AccessSpreader to find your list. We use stripes instead of |
391 | /// thread-local to avoid the need to grow or shrink on thread start |
392 | /// or join. These are heads of lists chained with localNext |
393 | LocalList local_[NumLocalLists]; |
394 | |
  /// this is the head of a list of nodes chained by globalNext; each of
  /// those nodes is itself the head of a list chained by localNext
397 | alignas(hardware_destructive_interference_size) |
398 | AtomicStruct<TaggedPtr, Atom> globalHead_; |
399 | |
400 | ///////////// private methods |
401 | |
402 | uint32_t slotIndex(uint32_t idx) const { |
403 | assert( |
404 | 0 < idx && idx <= actualCapacity_ && |
405 | idx <= size_.load(std::memory_order_acquire)); |
406 | return idx; |
407 | } |
408 | |
409 | Slot& slot(uint32_t idx) { |
410 | return slots_[slotIndex(idx)]; |
411 | } |
412 | |
413 | const Slot& slot(uint32_t idx) const { |
414 | return slots_[slotIndex(idx)]; |
415 | } |
416 | |
  // localHead references a full list chained by localNext. s should
  // reference slot(localHead); it is passed as a micro-optimization
419 | void globalPush(Slot& s, uint32_t localHead) { |
420 | while (true) { |
421 | TaggedPtr gh = globalHead_.load(std::memory_order_acquire); |
422 | s.globalNext.store(gh.idx, std::memory_order_relaxed); |
423 | if (globalHead_.compare_exchange_strong(gh, gh.withIdx(localHead))) { |
424 | // success |
425 | return; |
426 | } |
427 | } |
428 | } |
429 | |
430 | // idx references a single node |
  void localPush(AtomicStruct<TaggedPtr, Atom>& head, uint32_t idx) {
    Slot& s = slot(idx);
    TaggedPtr h = head.load(std::memory_order_acquire);
    bool recycled = false;
    while (true) {
      s.localNext.store(h.idx, std::memory_order_release);
      // run the recycle hook at most once, even if the CAS below fails and
      // the loop retries
      if (!recycled) {
        Traits::onRecycle(&slot(idx).elem);
        recycled = true;
      }

438 | if (h.size() == LocalListLimit) { |
439 | // push will overflow local list, steal it instead |
440 | if (head.compare_exchange_strong(h, h.withEmpty())) { |
441 | // steal was successful, put everything in the global list |
442 | globalPush(s, idx); |
443 | return; |
444 | } |
445 | } else { |
446 | // local list has space |
447 | if (head.compare_exchange_strong(h, h.withIdx(idx).withSizeIncr())) { |
448 | // success |
449 | return; |
450 | } |
451 | } |
452 | // h was updated by failing CAS |
453 | } |
454 | } |
455 | |
456 | // returns 0 if empty |
457 | uint32_t globalPop() { |
458 | while (true) { |
459 | TaggedPtr gh = globalHead_.load(std::memory_order_acquire); |
460 | if (gh.idx == 0 || |
461 | globalHead_.compare_exchange_strong( |
462 | gh, |
463 | gh.withIdx( |
464 | slot(gh.idx).globalNext.load(std::memory_order_relaxed)))) { |
465 | // global list is empty, or pop was successful |
466 | return gh.idx; |
467 | } |
468 | } |
469 | } |
470 | |
471 | // returns 0 if allocation failed |
472 | uint32_t localPop(AtomicStruct<TaggedPtr, Atom>& head) { |
473 | while (true) { |
474 | TaggedPtr h = head.load(std::memory_order_acquire); |
475 | if (h.idx != 0) { |
476 | // local list is non-empty, try to pop |
477 | Slot& s = slot(h.idx); |
478 | auto next = s.localNext.load(std::memory_order_relaxed); |
479 | if (head.compare_exchange_strong(h, h.withIdx(next).withSizeDecr())) { |
480 | // success |
481 | return h.idx; |
482 | } |
483 | continue; |
484 | } |
485 | |
486 | uint32_t idx = globalPop(); |
487 | if (idx == 0) { |
488 | // global list is empty, allocate and construct new slot |
489 | if (size_.load(std::memory_order_relaxed) >= actualCapacity_ || |
490 | (idx = ++size_) > actualCapacity_) { |
491 | // allocation failed |
492 | return 0; |
493 | } |
494 | Slot& s = slot(idx); |
495 | // Atom is enforced above to be nothrow-default-constructible |
496 | // As an optimization, use default-initialization (no parens) rather |
497 | // than direct-initialization (with parens): these locations are |
498 | // stored-to before they are loaded-from |
499 | new (&s.localNext) Atom<uint32_t>; |
500 | new (&s.globalNext) Atom<uint32_t>; |
501 | Traits::initialize(&s.elem); |
502 | return idx; |
503 | } |
504 | |
505 | Slot& s = slot(idx); |
506 | auto next = s.localNext.load(std::memory_order_relaxed); |
507 | if (head.compare_exchange_strong( |
508 | h, h.withIdx(next).withSize(LocalListLimit))) { |
509 | // global list moved to local list, keep head for us |
510 | return idx; |
511 | } |
512 | // local bulk push failed, return idx to the global list and try again |
513 | globalPush(s, idx); |
514 | } |
515 | } |
516 | |
517 | AtomicStruct<TaggedPtr, Atom>& localHead() { |
518 | auto stripe = AccessSpreader<Atom>::current(NumLocalLists); |
519 | return local_[stripe].head; |
520 | } |
521 | |
522 | void markAllocated(Slot& slot) { |
523 | slot.localNext.store(uint32_t(-1), std::memory_order_release); |
524 | } |
525 | |
526 | public: |
527 | static constexpr std::size_t kSlotSize = sizeof(Slot); |
528 | }; |
529 | |
530 | namespace detail { |
531 | |
532 | /// This is a stateful Deleter functor, which allows std::unique_ptr |
533 | /// to track elements allocated from an IndexedMemPool by tracking the |
534 | /// associated pool. See IndexedMemPool::allocElem. |
535 | template <typename Pool> |
536 | struct IndexedMemPoolRecycler { |
537 | Pool* pool; |
538 | |
539 | explicit IndexedMemPoolRecycler(Pool* pool) : pool(pool) {} |
540 | |
541 | IndexedMemPoolRecycler(const IndexedMemPoolRecycler<Pool>& rhs) = default; |
542 | IndexedMemPoolRecycler& operator=(const IndexedMemPoolRecycler<Pool>& rhs) = |
543 | default; |
544 | |
545 | void operator()(typename Pool::value_type* elem) const { |
546 | pool->recycleIndex(pool->locateElem(elem)); |
547 | } |
548 | }; |
549 | |
550 | } // namespace detail |
551 | |
552 | } // namespace folly |
553 | |
554 | FOLLY_POP_WARNING |
555 | |