| 1 | // Copyright (c) Microsoft Corporation. All rights reserved. | 
| 2 | // Licensed under the MIT license. | 
| 3 |  | 
| 4 | #pragma once | 
| 5 |  | 
| 6 | #include <atomic> | 
#include <cassert>
#include <cstdint>
| 8 | #ifdef _DEBUG | 
| 9 | #include <cstring> | 
| 10 | #endif | 
| 11 |  | 
| 12 | #include "status.h" | 
| 13 | #include "thread.h" | 
| 14 |  | 
| 15 | /// A fast allocator intended for mostly-FIFO workloads (e.g., allocating contexts for file-I/O | 
| 16 | /// callbacks). Each thread allocates by bumping the tail of its current segment; when it fills a | 
| 17 | /// segment, it malloc()s a new one. Any thread frees by decrementing the allocation's segment's | 
| 18 | /// ref count; when a (filled) segment's ref count reaches 0, we free() it. So long as the workload | 
| 19 | /// is mostly FIFO, we don't leak memory. | 
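///
/// A minimal usage sketch (illustrative only; uses the global lss_allocator instance declared at
/// the bottom of this header):
///
///   void example() {
///     void* context = FASTER::core::lss_allocator.Allocate(48);  // bumps this thread's segment
///     if(context) {
///       // ... hand the block to an async I/O callback; any thread may then free it ...
///       FASTER::core::lss_allocator.Free(context);  // decrements the owning segment's ref count
///     }
///   }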
| 20 |  | 
| 21 | namespace FASTER { | 
| 22 | namespace core { | 
| 23 |  | 
| 24 | /// Internal classes and structures. | 
| 25 | namespace lss_memory { | 
| 26 |  | 
/// Size of each segment (in bytes). (In experiments, a segment size of 16,000 worked well on
/// Windows, while 8,000 worked well on Linux.)
| 29 | #ifdef _WIN32 | 
| 30 | static constexpr uint32_t kSegmentSize = 16000; | 
| 31 | #else | 
| 32 | static constexpr uint32_t kSegmentSize = 8000; | 
| 33 | #endif | 
| 34 |  | 
| 35 | /// Preserving Windows malloc() behavior, all LSS allocations are aligned to 16 bytes. | 
| 36 | static constexpr uint32_t kBaseAlignment = 16; | 
| 37 |  | 
| 38 | /// Header, prepended to all allocated blocks; used to find the ref count variable, to decrement it | 
| 39 | /// when the block is freed. (The allocation size isn't needed, since LSS allocations are | 
| 40 | /// essentially stack allocations; but _DEBUG mode includes it for the benefit of the caller.) | 
| 41 | #ifdef _DEBUG | 
| 42 | struct alignas(8) Header { | 
| 43 |   Header(uint32_t size_, uint32_t offset_) | 
| 44 |     : offset{ offset_ } | 
| 45 |     , size{ size_ } { | 
| 46 |   } | 
| 47 |  | 
| 48 |   /// Offset from the head of the segment allocator's buffer to the memory block. | 
| 49 |   uint32_t offset; | 
| 50 |  | 
| 51 |   /// Size of the memory block. | 
| 52 |   uint32_t size; | 
| 53 | }; | 
static_assert(sizeof(Header) == 8, "Header is not 8 bytes!");
#else
struct Header {
  Header(uint16_t offset_)
    : offset{ offset_ } {
  }

  /// Offset from the head of the segment allocator's buffer to the memory block.
  uint16_t offset;
};
static_assert(sizeof(Header) == 2, "Header is not 2 bytes!");
| 65 | #endif | 
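
/// The Header is what lets Free() map a block pointer back to its owning SegmentAllocator (and
/// so to the ref count). A sketch of the pointer arithmetic, assuming offset is measured exactly
/// as documented above (the real logic lives in lss_allocator.cc):
///
///   Header* header = reinterpret_cast<Header*>(bytes) - 1;     // the Header precedes the block
///   uint8_t* buffer = static_cast<uint8_t*>(bytes) - header->offset;
///   SegmentAllocator* segment = reinterpret_cast<SegmentAllocator*>(
///       buffer - SegmentAllocator::kBufferOffset);             // then: segment->Free(bytes)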
| 66 |  | 
| 67 | class ThreadAllocator; | 
| 68 |  | 
| 69 | class SegmentState { | 
| 70 |  public: | 
| 71 |   SegmentState() | 
| 72 |     : control{ 0 } { | 
| 73 |   } | 
| 74 |  | 
| 75 |   SegmentState(uint64_t control_) | 
| 76 |     : control{ control_ } { | 
| 77 |   } | 
| 78 |  | 
| 79 |   SegmentState(uint32_t allocations_, uint32_t frees_) | 
| 80 |     : frees{ frees_ } | 
| 81 |     , allocations{ allocations_ } { | 
| 82 |   } | 
| 83 |  | 
  union {
    struct {
      /// Count of memory blocks freed inside this segment. Incremented on each free. Frees can
      /// take place on any thread.
      uint32_t frees;
      /// If this segment is sealed, then the count of memory blocks allocated inside this
      /// segment. Otherwise, zero.
      uint32_t allocations;
    };
    /// 64-bit control field, used so that threads can read the allocation count atomically at
    /// the same time they increment the free count atomically.
    std::atomic<uint64_t> control;
  };
| 97 | }; | 
static_assert(sizeof(SegmentState) == 8, "sizeof(SegmentState) != 8");
static_assert(kSegmentSize < UINT16_MAX / 2, "kSegmentSize too large for offset size!");
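
/// Packing both counters into one 64-bit control word makes a free a single atomic RMW:
/// incrementing the low 32 bits bumps frees, and the returned snapshot also carries the sealed
/// allocation count. A sketch of how SegmentAllocator::Free() can use it (illustrative; assumes
/// the little-endian layout the union above implies):
///
///   SegmentState old_state{ state.control.fetch_add(1) };  // ++frees, snapshot both counts
///   if(old_state.allocations > 0 &&                        // the segment has been sealed, and
///       old_state.frees + 1 == old_state.allocations) {    // this was its last live block
///     // the segment itself can now be free()d
///   }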
| 100 |  | 
| 101 | /// Allocation takes place inside segments. When a segment is no longer needed, we add it to the | 
| 102 | /// garbage list. | 
| 103 | class SegmentAllocator { | 
| 104 |  public: | 
  /// Offset from the head of the class to the head of its buffer field.
| 106 | #ifdef _DEBUG | 
| 107 |   static constexpr uint32_t kBufferOffset = 8; | 
| 108 | #else | 
| 109 |   static constexpr uint32_t kBufferOffset = 14; | 
| 110 | #endif | 
| 111 |  | 
| 112 |   /// Initialize the segment allocator and allocate the segment. | 
| 113 |   SegmentAllocator() | 
| 114 |     : state{} { | 
| 115 | #ifdef _DEBUG | 
| 116 |     // Debug LSS memory codes: | 
| 117 |     //  - 0xBA - initialized, not allocated. | 
| 118 |     std::memset(buffer, 0xBA, kSegmentSize); | 
| 119 | #endif | 
| 120 |   } | 
| 121 |  | 
  /// Free the specified memory block. The block must be inside this segment! Also frees the
  /// segment if it is sealed and this was its last live block.
  void Free(void* bytes);
| 125 |  | 
  /// Seal the segment--no more blocks will be allocated inside this segment. Also frees the
  /// segment if all of its blocks have already been freed.
  void Seal(uint32_t blocks_allocated);
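
  /// Sealing can likewise be a single atomic RMW on the control word. A hedged sketch, not
  /// necessarily the exact implementation:
  ///
  ///   SegmentState old_state{ state.control.fetch_add(
  ///       static_cast<uint64_t>(blocks_allocated) << 32) };  // publish the allocation count
  ///   if(old_state.frees == blocks_allocated) {
  ///     // every block was freed before the seal: the segment can be free()d immediately
  ///   }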
| 129 |  | 
| 130 |  private: | 
  /// Decrement the active references count, effectively freeing one allocation. Also frees the
  /// segment if (1) it is sealed and (2) its active references count is now zero.
  void Free();
| 135 |  | 
| 136 |  public: | 
| 137 |   /// Segment allocator state (8 bytes). | 
| 138 |   SegmentState state; | 
| 139 |  | 
  /// Padding, as needed, so that the first user allocation, at buffer[sizeof(Header)], is
  /// 16-byte aligned.
  /// (In _DEBUG builds, sizeof(Header) == 8, so we require no padding; in release builds,
  /// sizeof(Header) == 2, so we require 6 bytes of padding.)
| 144 |  private: | 
#ifndef _DEBUG
  uint8_t padding_[6];
#endif
| 149 |  | 
| 150 |  public: | 
  /// This segment's memory. (In both builds, kBufferOffset + sizeof(Header) == 16, so the first
  /// allocation's contents start at 0 (mod 16), as desired.)
| 153 |   uint8_t buffer[kSegmentSize]; | 
| 154 | }; | 
| 155 |  | 
| 156 | /// Allocator for a single thread. Allocates only; frees are directed by the global allocator | 
| 157 | /// object directly to the relevant segment allocator. | 
| 158 | class alignas(64) ThreadAllocator { | 
| 159 |  public: | 
| 160 |   static constexpr uint32_t kCacheLineSize = 64; | 
| 161 |  | 
| 162 |   /// Initialize the thread allocator. The real work happens lazily, when Allocate() is called for | 
| 163 |   /// the first time. | 
| 164 |   ThreadAllocator() | 
| 165 |     : segment_allocator_{ nullptr } | 
| 166 |     , segment_offset_{ 0 } | 
| 167 |     , allocations_{ 0 } { | 
| 168 |   } | 
| 169 |  | 
| 170 |   /// Allocate a memory block of the specified size < kSegmentSize. If allocation fails, returns | 
| 171 |   /// nullptr. | 
| 172 |   void* Allocate(uint32_t size); | 
| 173 |   void* AllocateAligned(uint32_t size, uint32_t offset); | 
| 174 |  | 
| 175 |  private: | 
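  /// Reserve block_size bytes at the tail of the active segment and return the offset at which
  /// the reservation starts; the caller is responsible for noticing when the reservation runs
  /// past the end of the segment.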
| 176 |   inline uint32_t Reserve(uint32_t block_size) { | 
| 177 |     assert(block_size <= kSegmentSize); | 
| 178 |     ++allocations_; | 
| 179 |     uint32_t result = segment_offset_; | 
| 180 |     assert(result <= kSegmentSize); | 
| 181 |     segment_offset_ += block_size; | 
| 182 |     return result; | 
| 183 |   } | 
| 184 |  | 
| 185 |   /// Segment inside which each thread's new allocations occur (pointer, 8 bytes). | 
| 186 |   SegmentAllocator* segment_allocator_; | 
| 187 |  | 
| 188 |   /// Offset, into the active segment, of the next allocation. | 
| 189 |   uint32_t segment_offset_; | 
| 190 |  | 
| 191 |   /// Number of blocks allocated inside the active segment. | 
| 192 |   uint32_t allocations_; | 
| 193 | }; | 
static_assert(sizeof(ThreadAllocator) == 64, "sizeof(ThreadAllocator) != 64.");
| 195 |  | 
| 196 | } // namespace lss_memory | 
| 197 |  | 
| 198 | /// The LSS allocator allocates memory from a log-structured store, but does not perform garbage | 
| 199 | /// collection. Memory is allocated from segments; each segment is freed only after all of its | 
| 200 | /// allocations have been freed. This means that if a single allocation inside a segment is still | 
| 201 | /// alive, the entire segment is still alive. | 
| 202 | /// The LSS allocator works well in the case where memory usage is almost FIFO. In that case, all | 
| 203 | /// of the segment's allocations will eventually be freed, so the segment will be freed. The LSS | 
| 204 | /// allocator is intended to replace the (synchronous) function call stack, for asynchronous | 
| 205 | /// continuations. | 
| 206 | class LssAllocator { | 
| 207 |  public: | 
  /// Maximum number of threads supported. For each possible thread, we reserve a 64-byte
  /// ThreadAllocator; so the memory required is 64 * kMaxThreadCount bytes. For each actual
  /// thread, we reserve a full SegmentAllocator, of size approximately kSegmentSize.
| 211 |   static constexpr size_t kMaxThreadCount = Thread::kMaxNumThreads; | 
| 212 |  | 
| 213 |   /// Size of each segment (in bytes). | 
| 214 |   static constexpr uint32_t kSegmentSize = lss_memory::kSegmentSize; | 
| 215 |  | 
| 216 |   /// Preserving Windows malloc() behavior, all LSS allocations are aligned to 16 bytes. | 
| 217 |   static constexpr uint32_t kBaseAlignment = lss_memory::kBaseAlignment; | 
| 218 |  | 
| 219 |   /// Initialize the LSS allocator. The real work happens lazily, when a thread calls Allocate() | 
| 220 |   /// for the first time. | 
| 221 |   LssAllocator() { | 
| 222 |     for(size_t idx = 0; idx < kMaxThreadCount; ++idx) { | 
| 223 |       thread_allocators_[idx] = lss_memory::ThreadAllocator{}; | 
| 224 |     } | 
| 225 |   } | 
| 226 |  | 
| 227 |   /// Allocate a memory block of the specified size. Note that size must be < kSegmentSize, since | 
| 228 |   /// the allocation will take place inside a segment. The Allocate() code is ultimately single- | 
| 229 |   /// threaded, since we maintain a separate ThreadAllocator per thread, each with its own | 
| 230 |   /// SegmentAllocator. If allocation fails, returns nullptr. | 
| 231 |   void* Allocate(uint32_t size); | 
| 232 |   void* AllocateAligned(uint32_t size, uint32_t alignment); | 
| 233 |  | 
  /// Free the specified memory block. The Free() code is thread-safe, since the Free() request
  /// is always directed to the SegmentAllocator that originally allocated the block--regardless
  /// of which thread it is issued from.
| 237 |   void Free(void* bytes); | 
| 238 |  | 
| 239 |  private: | 
| 240 |   /// To reduce contention (and avoid needing atomic primitives in the allocation path), we | 
| 241 |   /// maintain a unique allocator per thread. | 
| 242 |   lss_memory::ThreadAllocator thread_allocators_[kMaxThreadCount]; | 
| 243 | }; | 
| 244 |  | 
| 245 | /// The global LSS allocator instance. | 
| 246 | extern LssAllocator lss_allocator; | 
| 247 |  | 
| 248 | } | 
| 249 | } // namespace FASTER::core | 
| 250 |  |