#pragma once

#include <string.h>

#ifdef NDEBUG
    #define ALLOCATOR_ASLR 0
#else
    #define ALLOCATOR_ASLR 1
#endif

#include <pcg_random.hpp>
#include <Common/thread_local_rng.h>

#if !defined(__APPLE__) && !defined(__FreeBSD__)
#include <malloc.h>
#endif

#include <cstdlib>
#include <cstdint>
#include <algorithm>
#include <random>
#include <sys/mman.h>

#include <Core/Defines.h>
#ifdef THREAD_SANITIZER
    /// Thread sanitizer does not intercept mremap. The usage of mremap will lead to false positives.
    #define DISABLE_MREMAP 1
#endif
#include <common/mremap.h>

#include <Common/MemoryTracker.h>
#include <Common/Exception.h>
#include <Common/formatReadable.h>

#include <Common/Allocator_fwd.h>


/// Required for older Darwin builds that lack the definition of MAP_ANONYMOUS.
#ifndef MAP_ANONYMOUS
#define MAP_ANONYMOUS MAP_ANON
#endif

/**
 * Many modern allocators (for example, tcmalloc) do not use mremap for
 * realloc, even for large enough chunks of memory, although mremap would
 * improve performance and reduce memory consumption during realloc.
 * To fix this, we do mremap manually if the chunk of memory is large enough.
 * The threshold (64 MB) is chosen quite large, since changing the address
 * space is very slow, especially in the case of a large number of threads. We
 * expect that the sequence of operations mmap / do something / mremap can only
 * be performed about 1000 times per second.
 *
 * P.S. This is also required because tcmalloc cannot allocate a chunk of
 * memory greater than 16 GB.
 */
#ifdef NDEBUG
    static constexpr size_t MMAP_THRESHOLD = 64 * (1ULL << 20);
#else
    /**
      * In debug builds, use a small mmap threshold to reproduce more memory
      * stomping bugs. Along with ASLR, it will hopefully detect more issues than
      * ASan. The program may fail due to the limit on the number of memory mappings.
      */
    static constexpr size_t MMAP_THRESHOLD = 4096;
#endif

static constexpr size_t MMAP_MIN_ALIGNMENT = 4096;
static constexpr size_t MALLOC_MIN_ALIGNMENT = 8;
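
/// Illustrative dispatch (a sketch, not part of the interface; the sizes are
/// hypothetical release-build examples, not special values):
///
///     alloc(1 MiB)                    -> malloc / calloc / posix_memalign
///     alloc(128 MiB)                  -> mmap
///     realloc(buf, 128 MiB, 256 MiB)  -> mremap
///     realloc(buf, 1 MiB, 128 MiB)    -> alloc new + memcpy + free old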

namespace DB
{
namespace ErrorCodes
{
    extern const int BAD_ARGUMENTS;
    extern const int CANNOT_ALLOCATE_MEMORY;
    extern const int CANNOT_MUNMAP;
    extern const int CANNOT_MREMAP;
}
}

/** Responsible for allocating / freeing memory. Used, for example, in PODArray, Arena.
  * Also used in hash tables.
  * The interface differs from std::allocator in:
  * - the presence of the `realloc` method, which uses mremap for large chunks of memory;
  * - passing the size into the `free` method;
  * - the presence of the `alignment` argument;
  * - the possibility of zeroing memory (used in hash tables);
  * - a random hint address for mmap;
  * - an mmap threshold that controls when mmap is used instead of malloc.
  */
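/// A minimal usage sketch (illustrative; the template arguments below are
/// chosen only for the example):
///
///     Allocator<false, false> allocator;
///     void * buf = allocator.alloc(1024);
///     buf = allocator.realloc(buf, 1024, 2048);  /// data may move to a new address
///     allocator.free(buf, 2048);                 /// the size must be passed back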
template <bool clear_memory_, bool mmap_populate>
class Allocator
{
public:
    /// Allocate memory range.
    void * alloc(size_t size, size_t alignment = 0)
    {
        CurrentMemoryTracker::alloc(size);
        return allocNoTrack(size, alignment);
    }

    /// Free memory range.
    void free(void * buf, size_t size)
    {
        freeNoTrack(buf, size);
        CurrentMemoryTracker::free(size);
    }

    /** Enlarge memory range.
      * Data from the old range is moved to the beginning of the new range.
      * The address of the memory range may change.
      */
    void * realloc(void * buf, size_t old_size, size_t new_size, size_t alignment = 0)
    {
        if (old_size == new_size)
        {
            /// Nothing to do.
            /// Note that it's not possible to change the alignment while doing realloc.
        }
        else if (old_size < MMAP_THRESHOLD && new_size < MMAP_THRESHOLD
            && alignment <= MALLOC_MIN_ALIGNMENT)
        {
            /// Resize malloc'd memory region with no special alignment requirement.
            CurrentMemoryTracker::realloc(old_size, new_size);

            void * new_buf = ::realloc(buf, new_size);
            if (nullptr == new_buf)
                DB::throwFromErrno("Allocator: Cannot realloc from " + formatReadableSizeWithBinarySuffix(old_size) + " to " + formatReadableSizeWithBinarySuffix(new_size) + ".", DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY);

            buf = new_buf;
            if constexpr (clear_memory)
                if (new_size > old_size)
                    memset(reinterpret_cast<char *>(buf) + old_size, 0, new_size - old_size);
        }
        else if (old_size >= MMAP_THRESHOLD && new_size >= MMAP_THRESHOLD)
        {
            /// Resize mmap'd memory region.
            CurrentMemoryTracker::realloc(old_size, new_size);

            // On Apple and FreeBSD, a self-implemented mremap is used (see common/mremap.h).
            buf = clickhouse_mremap(buf, old_size, new_size, MREMAP_MAYMOVE,
                PROT_READ | PROT_WRITE, mmap_flags, -1, 0);
            if (MAP_FAILED == buf)
                DB::throwFromErrno("Allocator: Cannot mremap memory chunk from " + formatReadableSizeWithBinarySuffix(old_size) + " to " + formatReadableSizeWithBinarySuffix(new_size) + ".", DB::ErrorCodes::CANNOT_MREMAP);

            /// No need for zero-fill: mmap of anonymous memory guarantees zero-initialized pages.
        }
        else if (new_size < MMAP_THRESHOLD)
        {
            /// Small allocations that require a copy. Assume there's enough memory in the system. Call CurrentMemoryTracker once.
            CurrentMemoryTracker::realloc(old_size, new_size);

            void * new_buf = allocNoTrack(new_size, alignment);
            memcpy(new_buf, buf, std::min(old_size, new_size));
            freeNoTrack(buf, old_size);
            buf = new_buf;
        }
        else
        {
            /// Big allocations that require a copy. MemoryTracker is called inside the 'alloc' and 'free' methods.

            void * new_buf = alloc(new_size, alignment);
            memcpy(new_buf, buf, std::min(old_size, new_size));
            free(buf, old_size);
            buf = new_buf;
        }

        return buf;
    }

protected:
    static constexpr size_t getStackThreshold()
    {
        return 0;
    }

    static constexpr bool clear_memory = clear_memory_;

    // Freshly mmapped pages are copy-on-write references to a global zero page.
    // On the first write, a page fault occurs, and an actual writable page is
    // allocated. If we are going to use this memory soon, such as when resizing
    // hash tables, it makes sense to pre-fault the pages by passing
    // MAP_POPULATE to mmap(). This takes some time, but should be faster
    // overall than having a hot loop interrupted by page faults.
    // It is only supported on Linux.
    static constexpr int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS
#if defined(OS_LINUX)
        | (mmap_populate ? MAP_POPULATE : 0)
#endif
        ;
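
    /// Illustrative sketch: on Linux with mmap_populate = true, the mapping
    /// request issued by allocNoTrack() below is equivalent to
    ///
    ///     mmap(hint, size, PROT_READ | PROT_WRITE,
    ///          MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);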

private:
    void * allocNoTrack(size_t size, size_t alignment)
    {
        void * buf;

        if (size >= MMAP_THRESHOLD)
        {
            if (alignment > MMAP_MIN_ALIGNMENT)
                throw DB::Exception("Too large alignment " + formatReadableSizeWithBinarySuffix(alignment) + ": more than page size when allocating "
                    + formatReadableSizeWithBinarySuffix(size) + ".", DB::ErrorCodes::BAD_ARGUMENTS);

            buf = mmap(getMmapHint(), size, PROT_READ | PROT_WRITE,
                mmap_flags, -1, 0);
            if (MAP_FAILED == buf)
                DB::throwFromErrno("Allocator: Cannot mmap " + formatReadableSizeWithBinarySuffix(size) + ".", DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY);

            /// No need for zero-fill: mmap of anonymous memory guarantees zero-initialized pages.
        }
        else
        {
            if (alignment <= MALLOC_MIN_ALIGNMENT)
            {
                if constexpr (clear_memory)
                    buf = ::calloc(size, 1);
                else
                    buf = ::malloc(size);

                if (nullptr == buf)
                    DB::throwFromErrno("Allocator: Cannot malloc " + formatReadableSizeWithBinarySuffix(size) + ".", DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY);
            }
            else
            {
                buf = nullptr;
                int res = posix_memalign(&buf, alignment, size);

                if (0 != res)
                    DB::throwFromErrno("Cannot allocate memory (posix_memalign) " + formatReadableSizeWithBinarySuffix(size) + ".", DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY, res);

                if constexpr (clear_memory)
                    memset(buf, 0, size);
            }
        }
        return buf;
    }

    void freeNoTrack(void * buf, size_t size)
    {
        if (size >= MMAP_THRESHOLD)
        {
            if (0 != munmap(buf, size))
                DB::throwFromErrno("Allocator: Cannot munmap " + formatReadableSizeWithBinarySuffix(size) + ".", DB::ErrorCodes::CANNOT_MUNMAP);
        }
        else
        {
            ::free(buf);
        }
    }

#ifndef NDEBUG
    /// In debug builds, request mmap() at random addresses (a kind of ASLR), to
    /// reproduce more memory stomping bugs. Note that Linux doesn't do it by
    /// default. This may lead to worse TLB performance.
    void * getMmapHint()
    {
        return reinterpret_cast<void *>(std::uniform_int_distribution<intptr_t>(0x100000000000UL, 0x700000000000UL)(thread_local_rng));
    }
#else
    void * getMmapHint()
    {
        return nullptr;
    }
#endif
};

/** When using AllocatorWithStackMemory located on the stack,
  * GCC 4.9 mistakenly assumes that we can call `free` on a pointer to the stack.
  * In fact, the combination of conditions inside AllocatorWithStackMemory does not allow this.
  */
#if !defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfree-nonheap-object"
#endif

/** Allocator with optimization to place small memory ranges in automatic memory.
  */
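/// A minimal usage sketch (the template arguments below are hypothetical and
/// chosen only for the example):
///
///     AllocatorWithStackMemory<Allocator<false, false>, 64, 8> allocator;
///     void * buf = allocator.alloc(32);       /// size <= N: served from stack_memory
///     buf = allocator.realloc(buf, 32, 128);  /// no longer fits: copied to the heap
///     allocator.free(buf, 128);               /// size > N: delegated to Base::free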
template <typename Base, size_t N, size_t Alignment>
class AllocatorWithStackMemory : private Base
{
private:
    alignas(Alignment) char stack_memory[N];

public:
    /// Do not use boost::noncopyable to avoid the warning about the direct base
    /// being inaccessible due to ambiguity when derived classes are also
    /// noncopyable (-Winaccessible-base).
    AllocatorWithStackMemory(const AllocatorWithStackMemory &) = delete;
    AllocatorWithStackMemory & operator = (const AllocatorWithStackMemory &) = delete;
    AllocatorWithStackMemory() = default;
    ~AllocatorWithStackMemory() = default;

    void * alloc(size_t size)
    {
        if (size <= N)
        {
            if constexpr (Base::clear_memory)
                memset(stack_memory, 0, N);
            return stack_memory;
        }

        return Base::alloc(size, Alignment);
    }

    void free(void * buf, size_t size)
    {
        if (size > N)
            Base::free(buf, size);
    }

    void * realloc(void * buf, size_t old_size, size_t new_size)
    {
        /// Was in stack_memory, will remain there.
        if (new_size <= N)
            return buf;

        /// Already was big enough to not fit in stack_memory.
        if (old_size > N)
            return Base::realloc(buf, old_size, new_size, Alignment);

        /// Was in stack memory, but now will not fit there.
        void * new_buf = Base::alloc(new_size, Alignment);
        memcpy(new_buf, buf, old_size);
        return new_buf;
    }
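
    /// Illustrative transitions for N = 64 (a sketch; the sizes are hypothetical):
    ///     realloc(buf, 16, 32)    -> stays in stack_memory, pointer unchanged
    ///     realloc(buf, 16, 128)   -> copied from stack_memory to the heap
    ///     realloc(buf, 128, 256)  -> delegated to Base::realloc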

protected:
    static constexpr size_t getStackThreshold()
    {
        return N;
    }
};


#if !defined(__clang__)
#pragma GCC diagnostic pop
#endif
