// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.

#pragma once

#include <atomic>
#include <cassert>
#include <cstdint>
#ifdef _DEBUG
#include <cstring>
#endif

#include "status.h"
#include "thread.h"

/// A fast allocator intended for mostly-FIFO workloads (e.g., allocating contexts for file-I/O
/// callbacks). Each thread allocates by bumping the tail of its current segment; when it fills a
/// segment, it malloc()s a new one. Any thread can free: a free decrements the ref count of the
/// segment that owns the allocation, and when a (filled) segment's ref count reaches 0, we
/// free() it. So long as the workload is mostly FIFO, we don't leak memory.
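///
/// For example (an illustrative sketch only; IoContext, IssueRead, and the callback
/// signature below are hypothetical, not part of this codebase), a caller might
/// allocate a per-I/O context when issuing a read and free it inside the completion
/// callback; since I/Os complete in roughly the order they were issued, the workload
/// is mostly FIFO:
///
///   // Hypothetical usage; lss_allocator is the global instance declared below.
///   auto* context = new(lss_allocator.Allocate(sizeof(IoContext))) IoContext{};
///   IssueRead(file, buffer, context, [](IoContext* context) {
///     context->~IoContext();
///     lss_allocator.Free(context);
///   });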

namespace FASTER {
namespace core {

/// Internal classes and structures.
namespace lss_memory {

/// Size of each segment (in bytes). (In experiments, a segment size of 16,000 worked well on
/// Windows, while 8,000 worked well on Linux.)
#ifdef _WIN32
static constexpr uint32_t kSegmentSize = 16000;
#else
static constexpr uint32_t kSegmentSize = 8000;
#endif

/// Preserving Windows malloc() behavior, all LSS allocations are aligned to 16 bytes.
static constexpr uint32_t kBaseAlignment = 16;

/// Header, prepended to all allocated blocks; used to find the ref count variable, to decrement it
/// when the block is freed. (The allocation size isn't needed, since LSS allocations are
/// essentially stack allocations; but _DEBUG mode includes it for the benefit of the caller.)
#ifdef _DEBUG
struct alignas(8) Header {
  Header(uint32_t size_, uint32_t offset_)
    : offset{ offset_ }
    , size{ size_ } {
  }

  /// Offset from the head of the segment allocator's buffer to the memory block.
  uint32_t offset;

  /// Size of the memory block.
  uint32_t size;
};
static_assert(sizeof(Header) == 8, "Header is not 8 bytes!");
#else
struct Header {
  Header(uint16_t offset_)
    : offset{ offset_ } {
  }

  /// Offset from the head of the segment allocator's buffer to the memory block.
  uint16_t offset;
};
static_assert(sizeof(Header) == 2, "Header is not 2 bytes!");
#endif

class ThreadAllocator;

class SegmentState {
 public:
  SegmentState()
    : control{ 0 } {
  }

  SegmentState(uint64_t control_)
    : control{ control_ } {
  }

  SegmentState(uint32_t allocations_, uint32_t frees_)
    : frees{ frees_ }
    , allocations{ allocations_ } {
  }

  union {
    struct {
      /// Count of memory blocks freed inside this segment. Incremented on each free. Frees can
      /// take place on any thread.
      uint32_t frees;
      /// If this segment is sealed, then the count of memory blocks allocated inside this
      /// segment. Otherwise, zero.
      uint32_t allocations;
    };
    /// 64-bit control field, used so that threads can read the allocation count atomically at
    /// the same time they increment the free count atomically.
    std::atomic<uint64_t> control;
  };
};
static_assert(sizeof(SegmentState) == 8, "sizeof(SegmentState) != 8");
static_assert(kSegmentSize < UINT16_MAX / 2, "kSegmentSize too large for offset size!");
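
/// For example, a free can bump the free count and observe the (sealed) allocation
/// count in one atomic step, by adding 1 to the 64-bit control word (a sketch under
/// assumptions: the real logic lives in lss_allocator.cc, and treating +1 on control
/// as ++frees assumes frees occupies the word's low-order half, as it does on
/// little-endian machines):
///
///   SegmentState new_state{ state.control.fetch_add(1) + 1 };  // ++frees, atomically
///   if(new_state.allocations > 0 && new_state.frees == new_state.allocations) {
///     // The segment was sealed, and this was its last outstanding block: free() it.
///   }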

/// Allocation takes place inside segments. When a segment is no longer needed, we add it to the
/// garbage list.
class SegmentAllocator {
 public:
  /// Offset from the head of the class to the head of its buffer field.
#ifdef _DEBUG
  static constexpr uint32_t kBufferOffset = 8;
#else
  static constexpr uint32_t kBufferOffset = 14;
#endif

  /// Initialize the segment allocator and allocate the segment.
  SegmentAllocator()
    : state{} {
#ifdef _DEBUG
    // Debug LSS memory codes:
    //  - 0xBA - initialized, not allocated.
    std::memset(buffer, 0xBA, kSegmentSize);
#endif
  }

  /// Free the specified memory block. The block must be inside this segment! Frees the segment
  /// itself if the segment is sealed and this was its last outstanding block.
  void Free(void* bytes);

  /// Seal the segment--no more blocks will be allocated inside this segment. Frees the segment
  /// if all of its blocks have already been freed.
  void Seal(uint32_t blocks_allocated);

 private:
  /// Decrement the active references count, effectively freeing one allocation. Also frees the
  /// segment if (1) it is sealed and (2) its active references count is now zero.
  void Free();

 public:
  /// Segment allocator state (8 bytes).
  SegmentState state;

  /// Padding, as needed, so that the first user allocation, at buffer[sizeof(Header)], is 16-byte
  /// aligned.
  /// (In _DEBUG builds, sizeof(Header) == 8, so we require 0 bytes of padding; in release builds,
  /// sizeof(Header) == 2, so we require 6 bytes of padding.)
 private:
#ifndef _DEBUG
  uint8_t padding_[6];
#endif

 public:
  /// This segment's memory. (The first allocation's Header ends at offset kBufferOffset +
  /// sizeof(Header) == 16 from the head of the segment, in both builds, so the allocation's
  /// contents start at 0 (mod 16), as desired.)
  uint8_t buffer[kSegmentSize];
};
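
/// How a free locates the segment that owns a raw block pointer (a sketch under
/// assumptions; the actual implementation lives in lss_allocator.cc): the Header sits
/// immediately before the block, and its offset field walks back to the head of the
/// segment's buffer, which in turn sits kBufferOffset bytes into the segment object.
///
///   void LssAllocator::Free(void* bytes) {
///     lss_memory::Header* header = reinterpret_cast<lss_memory::Header*>(bytes) - 1;
///     uint8_t* buffer = reinterpret_cast<uint8_t*>(bytes) - header->offset;
///     auto* segment = reinterpret_cast<lss_memory::SegmentAllocator*>(
///         buffer - lss_memory::SegmentAllocator::kBufferOffset);
///     segment->Free(bytes);
///   }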

/// Allocator for a single thread. Allocates only; frees are routed by the global allocator
/// object directly to the relevant segment allocator.
class alignas(64) ThreadAllocator {
 public:
  static constexpr uint32_t kCacheLineSize = 64;

  /// Initialize the thread allocator. The real work happens lazily, when Allocate() is called for
  /// the first time.
  ThreadAllocator()
    : segment_allocator_{ nullptr }
    , segment_offset_{ 0 }
    , allocations_{ 0 } {
  }

  /// Allocate a memory block of the specified size (which must be less than kSegmentSize). If
  /// allocation fails, returns nullptr.
  void* Allocate(uint32_t size);
  void* AllocateAligned(uint32_t size, uint32_t alignment);

 private:
  /// Reserve block_size bytes at the tail of the active segment; returns the offset at which the
  /// reservation starts. (The caller checks whether the reservation overran the segment.)
  inline uint32_t Reserve(uint32_t block_size) {
    assert(block_size <= kSegmentSize);
    ++allocations_;
    uint32_t result = segment_offset_;
    assert(result <= kSegmentSize);
    segment_offset_ += block_size;
    return result;
  }

  /// Segment inside which this thread's new allocations occur (pointer, 8 bytes).
  SegmentAllocator* segment_allocator_;

  /// Offset, into the active segment, of the next allocation.
  uint32_t segment_offset_;

  /// Number of blocks allocated inside the active segment.
  uint32_t allocations_;
};
static_assert(sizeof(ThreadAllocator) == 64, "sizeof(ThreadAllocator) != 64.");
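
/// A sketch of the Allocate() fast path, under assumptions (the real implementation is
/// in lss_allocator.cc; lazy segment creation, the slow path, and the _DEBUG header
/// fields are elided):
///
///   void* ThreadAllocator::Allocate(uint32_t size) {
///     // Round header + payload up to a multiple of kBaseAlignment, so every payload
///     // begins at 0 (mod 16) within the segment.
///     uint32_t block_size = (static_cast<uint32_t>(sizeof(Header)) + size +
///                            kBaseAlignment - 1) & ~(kBaseAlignment - 1);
///     uint32_t offset = Reserve(block_size);
///     if(offset + block_size <= kSegmentSize) {
///       uint8_t* block = segment_allocator_->buffer + offset;
///       new(block) Header{ /* the block's offset within the buffer (and size, in _DEBUG) */ };
///       return block + sizeof(Header);
///     }
///     // Slow path: seal the now-full segment, malloc() a fresh one, and retry.
///   }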

} // namespace lss_memory

/// The LSS allocator allocates memory from a log-structured store, but does not perform garbage
/// collection. Memory is allocated from segments; each segment is freed only after all of its
/// allocations have been freed. This means that if a single allocation inside a segment is still
/// alive, the entire segment is still alive.
/// The LSS allocator works well in the case where memory usage is almost FIFO. In that case, all
/// of the segment's allocations will eventually be freed, so the segment will be freed. The LSS
/// allocator is intended to replace the (synchronous) function call stack, for asynchronous
/// continuations.
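///
/// For example (illustrative only): if three blocks are allocated from the same
/// segment and only two are freed, all kSegmentSize bytes of the (sealed) segment
/// stay resident until the straggler is freed as well:
///
///   void* a = lss_allocator.Allocate(64);
///   void* b = lss_allocator.Allocate(64);
///   void* c = lss_allocator.Allocate(64);
///   lss_allocator.Free(a);
///   lss_allocator.Free(c);  // segment still resident: b is outstanding
///   lss_allocator.Free(b);  // last free; a sealed segment is reclaimed here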
class LssAllocator {
 public:
  /// Maximum number of threads supported. For each possible thread, we reserve a 64-byte
  /// (cache-line-sized) ThreadAllocator; so the memory required is 64 * kMaxThreadCount bytes.
  /// For each actual thread, we reserve a full SegmentAllocator, of size approximately
  /// kSegmentSize.
  static constexpr size_t kMaxThreadCount = Thread::kMaxNumThreads;

  /// Size of each segment (in bytes).
  static constexpr uint32_t kSegmentSize = lss_memory::kSegmentSize;

  /// Preserving Windows malloc() behavior, all LSS allocations are aligned to 16 bytes.
  static constexpr uint32_t kBaseAlignment = lss_memory::kBaseAlignment;

  /// Initialize the LSS allocator. The real work happens lazily, when a thread calls Allocate()
  /// for the first time.
  LssAllocator() {
    for(size_t idx = 0; idx < kMaxThreadCount; ++idx) {
      thread_allocators_[idx] = lss_memory::ThreadAllocator{};
    }
  }

  /// Allocate a memory block of the specified size. Note that size must be less than
  /// kSegmentSize, since the allocation will take place inside a segment. The Allocate() code is
  /// ultimately single-threaded, since we maintain a separate ThreadAllocator per thread, each
  /// with its own SegmentAllocator. If allocation fails, returns nullptr.
  void* Allocate(uint32_t size);
  void* AllocateAligned(uint32_t size, uint32_t alignment);

  /// Free the specified memory block. The Free() code is thread-safe, since the Free() request is
  /// always directed to the SegmentAllocator that originally allocated the block--regardless of
  /// the thread it is issued from.
  void Free(void* bytes);

 private:
  /// To reduce contention (and avoid needing atomic primitives in the allocation path), we
  /// maintain a unique allocator per thread.
  lss_memory::ThreadAllocator thread_allocators_[kMaxThreadCount];
};

/// The global LSS allocator instance.
extern LssAllocator lss_allocator;

} // namespace core
} // namespace FASTER