1 | // Copyright 2009-2021 Intel Corporation |
2 | // SPDX-License-Identifier: Apache-2.0 |
3 | |
4 | #pragma once |
5 | |
6 | #include "default.h" |
7 | #include "device.h" |
8 | #include "scene.h" |
9 | #include "primref.h" |
10 | |
11 | #if defined(APPLE) && defined(__aarch64__) |
12 | #include <mutex> |
13 | #endif |
14 | |
15 | namespace embree |
16 | { |
17 | class FastAllocator |
18 | { |
19 | /*! maximum supported alignment */ |
20 | static const size_t maxAlignment = 64; |
21 | |
    /*! maximum allocation size */
#define maxAllocationSize size_t(2*1024*1024-maxAlignment)

    /* default settings */
    //static const size_t defaultBlockSize = 4096;
27 | |
28 | static const size_t MAX_THREAD_USED_BLOCK_SLOTS = 8; |
29 | |
30 | public: |
31 | |
32 | struct ThreadLocal2; |
33 | enum AllocationType { ALIGNED_MALLOC, EMBREE_OS_MALLOC, SHARED, ANY_TYPE }; |
34 | |
35 | /*! Per thread structure holding the current memory block. */ |
36 | struct __aligned(64) ThreadLocal |
37 | { |
38 | ALIGNED_CLASS_(64); |
39 | public: |
40 | |
41 | /*! Constructor for usage with ThreadLocalData */ |
42 | __forceinline ThreadLocal (ThreadLocal2* parent) |
43 | : parent(parent), ptr(nullptr), cur(0), end(0), allocBlockSize(0), bytesUsed(0), bytesWasted(0) {} |
44 | |
45 | /*! initialize allocator */ |
46 | void init(FastAllocator* alloc) |
47 | { |
48 | ptr = nullptr; |
49 | cur = end = 0; |
50 | bytesUsed = 0; |
51 | bytesWasted = 0; |
52 | allocBlockSize = 0; |
53 | if (alloc) allocBlockSize = alloc->defaultBlockSize; |
54 | } |
55 | |
      /* Allocate aligned memory from the thread's memory block. */
57 | __forceinline void* malloc(FastAllocator* alloc, size_t bytes, size_t align = 16) |
58 | { |
        /* bind the thread local allocator to the proper FastAllocator */
60 | parent->bind(alloc); |
61 | |
62 | assert(align <= maxAlignment); |
63 | bytesUsed += bytes; |
64 | |
65 | /* try to allocate in local block */ |
66 | size_t ofs = (align - cur) & (align-1); |
67 | cur += bytes + ofs; |
68 | if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; } |
69 | cur -= bytes + ofs; |
70 | |
71 | /* if allocation is too large allocate with parent allocator */ |
72 | if (4*bytes > allocBlockSize) { |
73 | return alloc->malloc(bytes,maxAlignment,false); |
74 | } |
75 | |
76 | /* get new partial block if allocation failed */ |
77 | size_t blockSize = allocBlockSize; |
78 | ptr = (char*) alloc->malloc(blockSize,maxAlignment,true); |
79 | bytesWasted += end-cur; |
80 | cur = 0; end = blockSize; |
81 | |
82 | /* retry allocation */ |
83 | ofs = (align - cur) & (align-1); |
84 | cur += bytes + ofs; |
85 | if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; } |
86 | cur -= bytes + ofs; |
87 | |
88 | /* get new full block if allocation failed */ |
89 | blockSize = allocBlockSize; |
90 | ptr = (char*) alloc->malloc(blockSize,maxAlignment,false); |
91 | bytesWasted += end-cur; |
92 | cur = 0; end = blockSize; |
93 | |
94 | /* retry allocation */ |
95 | ofs = (align - cur) & (align-1); |
96 | cur += bytes + ofs; |
97 | if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; } |
98 | cur -= bytes + ofs; |
99 | |
100 | /* should never happen as large allocations get handled specially above */ |
101 | assert(false); |
102 | return nullptr; |
103 | } |
104 | |
105 | |
106 | /*! returns amount of used bytes */ |
107 | __forceinline size_t getUsedBytes() const { return bytesUsed; } |
108 | |
109 | /*! returns amount of free bytes */ |
110 | __forceinline size_t getFreeBytes() const { return end-cur; } |
111 | |
112 | /*! returns amount of wasted bytes */ |
113 | __forceinline size_t getWastedBytes() const { return bytesWasted; } |
114 | |
115 | private: |
116 | ThreadLocal2* parent; |
117 | char* ptr; //!< pointer to memory block |
118 | size_t cur; //!< current location of the allocator |
119 | size_t end; //!< end of the memory block |
120 | size_t allocBlockSize; //!< block size for allocations |
121 | size_t bytesUsed; //!< number of total bytes allocated |
122 | size_t bytesWasted; //!< number of bytes wasted |
123 | }; |
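    /* Worked example of the bump-pointer arithmetic in ThreadLocal::malloc
     * above (illustrative values, not taken from a real build): with cur = 70
     * and align = 16, ofs = (16 - 70) & 15 = 10, so the allocation starts at
     * offset 80 (the next 16-byte boundary) and the 10 skipped bytes are
     * accounted as bytesWasted.
     *
     *   size_t cur = 70, align = 16, bytes = 24;
     *   size_t ofs   = (align - cur) & (align - 1); // 10
     *   size_t start = cur + ofs;                   // 80, 16-byte aligned
     *   cur = start + bytes;                        // 104, new bump offset
     */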
124 | |
125 | /*! Two thread local structures. */ |
126 | struct __aligned(64) ThreadLocal2 |
127 | { |
128 | ALIGNED_CLASS_(64); |
129 | public: |
130 | |
131 | __forceinline ThreadLocal2() |
132 | : alloc(nullptr), alloc0(this), alloc1(this) {} |
133 | |
134 | /*! bind to fast allocator */ |
135 | __forceinline void bind(FastAllocator* alloc_i) |
136 | { |
137 | assert(alloc_i); |
138 | if (alloc.load() == alloc_i) return; |
139 | #if defined(APPLE) && defined(__aarch64__) |
140 | std::scoped_lock lock(mutex); |
141 | #else |
142 | Lock<SpinLock> lock(mutex); |
143 | #endif |
144 | //if (alloc.load() == alloc_i) return; // not required as only one thread calls bind |
145 | if (alloc.load()) { |
146 | alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes(); |
147 | alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes(); |
148 | alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes(); |
149 | } |
150 | alloc0.init(alloc_i); |
151 | alloc1.init(alloc_i); |
152 | alloc.store(alloc_i); |
153 | alloc_i->join(this); |
154 | } |
155 | |
      /*! unbind from fast allocator */
157 | void unbind(FastAllocator* alloc_i) |
158 | { |
159 | assert(alloc_i); |
160 | if (alloc.load() != alloc_i) return; |
161 | #if defined(APPLE) && defined(__aarch64__) |
162 | std::scoped_lock lock(mutex); |
163 | #else |
164 | Lock<SpinLock> lock(mutex); |
165 | #endif |
166 | if (alloc.load() != alloc_i) return; // required as a different thread calls unbind |
167 | alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes(); |
168 | alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes(); |
169 | alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes(); |
170 | alloc0.init(nullptr); |
171 | alloc1.init(nullptr); |
172 | alloc.store(nullptr); |
173 | } |
174 | |
175 | public: |
176 | #if defined(APPLE) && defined(__aarch64__) |
177 | std::mutex mutex; |
178 | #else |
179 | SpinLock mutex; //!< required as unbind is called from other threads |
180 | #endif |
181 | std::atomic<FastAllocator*> alloc; //!< parent allocator |
182 | ThreadLocal alloc0; |
183 | ThreadLocal alloc1; |
184 | }; |
185 | |
186 | FastAllocator (Device* device, bool osAllocation) |
187 | : device(device), slotMask(0), usedBlocks(nullptr), freeBlocks(nullptr), use_single_mode(false), defaultBlockSize(PAGE_SIZE), estimatedSize(0), |
188 | growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? EMBREE_OS_MALLOC : ALIGNED_MALLOC), |
189 | primrefarray(device,0) |
190 | { |
191 | for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) |
192 | { |
193 | threadUsedBlocks[i] = nullptr; |
194 | threadBlocks[i] = nullptr; |
195 | assert(!slotMutex[i].isLocked()); |
196 | } |
197 | } |
198 | |
199 | ~FastAllocator () { |
200 | clear(); |
201 | } |
202 | |
203 | /*! returns the device attached to this allocator */ |
204 | Device* getDevice() { |
205 | return device; |
206 | } |
207 | |
208 | void share(mvector<PrimRef>& primrefarray_i) { |
209 | primrefarray = std::move(primrefarray_i); |
210 | } |
211 | |
212 | void unshare(mvector<PrimRef>& primrefarray_o) |
213 | { |
214 | reset(); // this removes blocks that are allocated inside the shared primref array |
215 | primrefarray_o = std::move(primrefarray); |
216 | } |
217 | |
218 | /*! returns first fast thread local allocator */ |
219 | __forceinline ThreadLocal* _threadLocal() { |
220 | return &threadLocal2()->alloc0; |
221 | } |
222 | |
223 | void setOSallocation(bool flag) |
224 | { |
225 | atype = flag ? EMBREE_OS_MALLOC : ALIGNED_MALLOC; |
226 | } |
227 | |
228 | private: |
229 | |
230 | /*! returns both fast thread local allocators */ |
231 | __forceinline ThreadLocal2* threadLocal2() |
232 | { |
233 | ThreadLocal2* alloc = thread_local_allocator2; |
234 | if (alloc == nullptr) { |
235 | thread_local_allocator2 = alloc = new ThreadLocal2; |
236 | #if defined(APPLE) && defined(__aarch64__) |
237 | std::scoped_lock lock(s_thread_local_allocators_lock); |
238 | #else |
239 | Lock<SpinLock> lock(s_thread_local_allocators_lock); |
240 | #endif |
241 | s_thread_local_allocators.push_back(make_unique(alloc)); |
242 | } |
243 | return alloc; |
244 | } |
245 | |
246 | public: |
247 | |
248 | __forceinline void join(ThreadLocal2* alloc) |
249 | { |
250 | #if defined(APPLE) && defined(__aarch64__) |
      std::scoped_lock lock(thread_local_allocators_lock);
252 | #else |
253 | Lock<SpinLock> lock(thread_local_allocators_lock); |
254 | #endif |
255 | thread_local_allocators.push_back(alloc); |
256 | } |
257 | |
258 | public: |
259 | |
260 | struct CachedAllocator |
261 | { |
262 | __forceinline CachedAllocator(void* ptr) |
263 | : alloc(nullptr), talloc0(nullptr), talloc1(nullptr) |
264 | { |
265 | assert(ptr == nullptr); |
266 | } |
267 | |
268 | __forceinline CachedAllocator(FastAllocator* alloc, ThreadLocal2* talloc) |
269 | : alloc(alloc), talloc0(&talloc->alloc0), talloc1(alloc->use_single_mode ? &talloc->alloc0 : &talloc->alloc1) {} |
270 | |
271 | __forceinline operator bool () const { |
272 | return alloc != nullptr; |
273 | } |
274 | |
275 | __forceinline void* operator() (size_t bytes, size_t align = 16) const { |
276 | return talloc0->malloc(alloc,bytes,align); |
277 | } |
278 | |
279 | __forceinline void* malloc0 (size_t bytes, size_t align = 16) const { |
280 | return talloc0->malloc(alloc,bytes,align); |
281 | } |
282 | |
283 | __forceinline void* malloc1 (size_t bytes, size_t align = 16) const { |
284 | return talloc1->malloc(alloc,bytes,align); |
285 | } |
286 | |
287 | public: |
288 | FastAllocator* alloc; |
289 | ThreadLocal* talloc0; |
290 | ThreadLocal* talloc1; |
291 | }; |
292 | |
293 | __forceinline CachedAllocator getCachedAllocator() { |
294 | return CachedAllocator(this,threadLocal2()); |
295 | } |
296 | |
297 | /*! Builder interface to create thread local allocator */ |
298 | struct Create |
299 | { |
300 | public: |
301 | __forceinline Create (FastAllocator* allocator) : allocator(allocator) {} |
302 | __forceinline CachedAllocator operator() () const { return allocator->getCachedAllocator(); } |
303 | |
304 | private: |
305 | FastAllocator* allocator; |
306 | }; |
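    /* Illustrative usage sketch (hypothetical builder code, not defined in
     * this header): a build task typically obtains one CachedAllocator via
     * the Create functor or getCachedAllocator() and routes node and leaf
     * allocations through the two thread local allocators; NodeTy and LeafTy
     * are placeholder types.
     *
     *   FastAllocator::CachedAllocator alloc = fastAllocator->getCachedAllocator();
     *   void* node = alloc.malloc0(sizeof(NodeTy), 64); // e.g. inner nodes
     *   void* leaf = alloc.malloc1(sizeof(LeafTy), 16); // e.g. leaves
     */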
307 | |
308 | void internal_fix_used_blocks() |
309 | { |
310 | /* move thread local blocks to global block list */ |
311 | for (size_t i = 0; i < MAX_THREAD_USED_BLOCK_SLOTS; i++) |
312 | { |
313 | while (threadBlocks[i].load() != nullptr) { |
314 | Block* nextUsedBlock = threadBlocks[i].load()->next; |
315 | threadBlocks[i].load()->next = usedBlocks.load(); |
316 | usedBlocks = threadBlocks[i].load(); |
317 | threadBlocks[i] = nextUsedBlock; |
318 | } |
319 | threadBlocks[i] = nullptr; |
320 | } |
321 | } |
322 | |
323 | static const size_t threadLocalAllocOverhead = 20; //! 20 means 5% parallel allocation overhead through unfilled thread local blocks |
324 | static const size_t mainAllocOverheadStatic = 20; //! 20 means 5% allocation overhead through unfilled main alloc blocks |
    static const size_t mainAllocOverheadDynamic = 8;  //! 8 means 12.5% allocation overhead through unfilled main alloc blocks
326 | |
327 | /* calculates a single threaded threshold for the builders such |
328 | * that for small scenes the overhead of partly allocated blocks |
329 | * per thread is low */ |
330 | size_t fixSingleThreadThreshold(size_t branchingFactor, size_t defaultThreshold, size_t numPrimitives, size_t bytesEstimated) |
331 | { |
332 | if (numPrimitives == 0 || bytesEstimated == 0) |
333 | return defaultThreshold; |
334 | |
335 | /* calculate block size in bytes to fulfill threadLocalAllocOverhead constraint */ |
336 | const size_t single_mode_factor = use_single_mode ? 1 : 2; |
337 | const size_t threadCount = TaskScheduler::threadCount(); |
338 | const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSize; |
339 | |
      /* if we do not have to limit number of threads use optimal threshold */
341 | if ( (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount) |
342 | return defaultThreshold; |
343 | |
344 | /* otherwise limit number of threads by calculating proper single thread threshold */ |
345 | else { |
346 | double bytesPerPrimitive = double(bytesEstimated)/double(numPrimitives); |
347 | return size_t(ceil(branchingFactor*singleThreadBytes/bytesPerPrimitive)); |
348 | } |
349 | } |
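    /* Worked example (illustrative numbers): with use_single_mode = false,
     * defaultBlockSize = 4160 and threadLocalAllocOverhead = 20, each thread
     * may leave up to singleThreadBytes = 2*20*4160 = 166400 bytes in partly
     * filled blocks. For bytesEstimated = 1000000 only about
     * 1000000/166400 ~= 7 threads stay within the 5% overhead budget, so on a
     * 16-thread machine the single thread threshold is raised to roughly
     * ceil(branchingFactor*166400/bytesPerPrimitive) primitives instead of
     * returning defaultThreshold. */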
350 | |
351 | __forceinline size_t alignSize(size_t i) { |
352 | return (i+127)/128*128; |
353 | } |
354 | |
355 | /*! initializes the grow size */ |
356 | __forceinline void initGrowSizeAndNumSlots(size_t bytesEstimated, bool fast) |
357 | { |
358 | /* we do not need single thread local allocator mode */ |
359 | use_single_mode = false; |
360 | |
361 | /* calculate growSize such that at most mainAllocationOverhead gets wasted when a block stays unused */ |
362 | size_t mainAllocOverhead = fast ? mainAllocOverheadDynamic : mainAllocOverheadStatic; |
363 | size_t blockSize = alignSize(bytesEstimated/mainAllocOverhead); |
364 | growSize = maxGrowSize = clamp(blockSize,size_t(1024),maxAllocationSize); |
365 | |
366 | /* if we reached the maxAllocationSize for growSize, we can |
367 | * increase the number of allocation slots by still guaranteeing |
368 | * the mainAllocationOverhead */ |
369 | slotMask = 0x0; |
370 | |
371 | if (MAX_THREAD_USED_BLOCK_SLOTS >= 2 && bytesEstimated > 2*mainAllocOverhead*growSize) slotMask = 0x1; |
372 | if (MAX_THREAD_USED_BLOCK_SLOTS >= 4 && bytesEstimated > 4*mainAllocOverhead*growSize) slotMask = 0x3; |
373 | if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 8*mainAllocOverhead*growSize) slotMask = 0x7; |
374 | if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 16*mainAllocOverhead*growSize) { growSize *= 2; } /* if the overhead is tiny, double the growSize */ |
375 | |
376 | /* set the thread local alloc block size */ |
377 | size_t defaultBlockSizeSwitch = PAGE_SIZE+maxAlignment; |
378 | |
379 | /* for sufficiently large scene we can increase the defaultBlockSize over the defaultBlockSizeSwitch size */ |
#if 0 // we do not do this as a block size of 4160 is for some reason best for KNL
381 | const size_t threadCount = TaskScheduler::threadCount(); |
382 | const size_t single_mode_factor = use_single_mode ? 1 : 2; |
383 | const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSizeSwitch; |
      if ((bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
385 | defaultBlockSize = min(max(defaultBlockSizeSwitch,bytesEstimated/(single_mode_factor*threadLocalAllocOverhead*threadCount)),growSize); |
386 | |
387 | /* otherwise we grow the defaultBlockSize up to defaultBlockSizeSwitch */ |
388 | else |
389 | #endif |
390 | defaultBlockSize = clamp(blockSize,size_t(1024),defaultBlockSizeSwitch); |
391 | |
392 | if (bytesEstimated == 0) { |
393 | maxGrowSize = maxAllocationSize; // special mode if builder cannot estimate tree size |
394 | defaultBlockSize = defaultBlockSizeSwitch; |
395 | } |
396 | log2_grow_size_scale = 0; |
397 | |
398 | if (device->alloc_main_block_size != 0) growSize = device->alloc_main_block_size; |
399 | if (device->alloc_num_main_slots >= 1 ) slotMask = 0x0; |
400 | if (device->alloc_num_main_slots >= 2 ) slotMask = 0x1; |
401 | if (device->alloc_num_main_slots >= 4 ) slotMask = 0x3; |
402 | if (device->alloc_num_main_slots >= 8 ) slotMask = 0x7; |
403 | if (device->alloc_thread_block_size != 0) defaultBlockSize = device->alloc_thread_block_size; |
404 | if (device->alloc_single_thread_alloc != -1) use_single_mode = device->alloc_single_thread_alloc; |
405 | } |
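    /* Worked example (illustrative, assuming PAGE_SIZE = 4096): a static
     * build with bytesEstimated = 100 MB uses mainAllocOverheadStatic = 20,
     * so blockSize = alignSize(100 MB/20) ~= 5 MB, which is clamped to
     * growSize = maxGrowSize = maxAllocationSize (~2 MB). Because
     * 100 MB > 2*20*growSize (~80 MB) but not > 4*20*growSize (~160 MB),
     * slotMask becomes 0x1 (two main allocation slots), and the thread local
     * defaultBlockSize is clamped to defaultBlockSizeSwitch = 4160 bytes. */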
406 | |
407 | /*! initializes the allocator */ |
408 | void init(size_t bytesAllocate, size_t bytesReserve, size_t bytesEstimate) |
409 | { |
410 | internal_fix_used_blocks(); |
411 | /* distribute the allocation to multiple thread block slots */ |
412 | slotMask = MAX_THREAD_USED_BLOCK_SLOTS-1; // FIXME: remove |
413 | if (usedBlocks.load() || freeBlocks.load()) { reset(); return; } |
414 | if (bytesReserve == 0) bytesReserve = bytesAllocate; |
415 | freeBlocks = Block::create(device,bytesAllocate,bytesReserve,nullptr,atype); |
416 | estimatedSize = bytesEstimate; |
417 | initGrowSizeAndNumSlots(bytesEstimate,true); |
418 | } |
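    /* Illustrative call sequence (hypothetical builder setup, not prescribed
     * by this header): a builder that can estimate its memory consumption
     * initializes the allocator once before the parallel build and releases
     * temporary state afterwards.
     *
     *   FastAllocator alloc(device, false);                     // aligned-malloc mode
     *   alloc.init(bytesAllocate, bytesReserve, bytesEstimate);
     *   // ... parallel build allocating via getCachedAllocator() ...
     *   alloc.cleanup();                                        // unbind thread locals
     */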
419 | |
420 | /*! initializes the allocator */ |
421 | void init_estimate(size_t bytesEstimate) |
422 | { |
423 | internal_fix_used_blocks(); |
424 | if (usedBlocks.load() || freeBlocks.load()) { reset(); return; } |
425 | /* single allocator mode ? */ |
426 | estimatedSize = bytesEstimate; |
      initGrowSizeAndNumSlots(bytesEstimate,false);
430 | } |
431 | |
432 | /*! frees state not required after build */ |
433 | __forceinline void cleanup() |
434 | { |
435 | internal_fix_used_blocks(); |
436 | |
437 | /* unbind all thread local allocators */ |
438 | for (auto alloc : thread_local_allocators) alloc->unbind(this); |
439 | thread_local_allocators.clear(); |
440 | } |
441 | |
442 | /*! resets the allocator, memory blocks get reused */ |
443 | void reset () |
444 | { |
445 | internal_fix_used_blocks(); |
446 | |
447 | bytesUsed.store(0); |
448 | bytesFree.store(0); |
449 | bytesWasted.store(0); |
450 | |
      /* reset all used blocks and move them to the beginning of the free block list */
452 | while (usedBlocks.load() != nullptr) { |
453 | usedBlocks.load()->reset_block(); |
454 | Block* nextUsedBlock = usedBlocks.load()->next; |
455 | usedBlocks.load()->next = freeBlocks.load(); |
456 | freeBlocks = usedBlocks.load(); |
457 | usedBlocks = nextUsedBlock; |
458 | } |
459 | |
460 | /* remove all shared blocks as they are re-added during build */ |
461 | freeBlocks.store(Block::remove_shared_blocks(freeBlocks.load())); |
462 | |
463 | for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) |
464 | { |
465 | threadUsedBlocks[i] = nullptr; |
466 | threadBlocks[i] = nullptr; |
467 | } |
468 | |
469 | /* unbind all thread local allocators */ |
470 | for (auto alloc : thread_local_allocators) alloc->unbind(this); |
471 | thread_local_allocators.clear(); |
472 | } |
473 | |
474 | /*! frees all allocated memory */ |
475 | __forceinline void clear() |
476 | { |
477 | cleanup(); |
478 | bytesUsed.store(0); |
479 | bytesFree.store(0); |
480 | bytesWasted.store(0); |
481 | if (usedBlocks.load() != nullptr) usedBlocks.load()->clear_list(device); usedBlocks = nullptr; |
482 | if (freeBlocks.load() != nullptr) freeBlocks.load()->clear_list(device); freeBlocks = nullptr; |
483 | for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) { |
484 | threadUsedBlocks[i] = nullptr; |
485 | threadBlocks[i] = nullptr; |
486 | } |
487 | primrefarray.clear(); |
488 | } |
489 | |
490 | __forceinline size_t incGrowSizeScale() |
491 | { |
492 | size_t scale = log2_grow_size_scale.fetch_add(1)+1; |
493 | return size_t(1) << min(size_t(16),scale); |
494 | } |
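    /* Example of the resulting growth (illustrative): once the free list is
     * exhausted, malloc() below creates global blocks of size
     * min(growSize*incGrowSizeScale(), maxGrowSize), so the requested size
     * scales by 2, 4, 8, ... (the scale is capped at 2^16) until maxGrowSize
     * bounds it. */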
495 | |
496 | /*! thread safe allocation of memory */ |
497 | void* malloc(size_t& bytes, size_t align, bool partial) |
498 | { |
499 | assert(align <= maxAlignment); |
500 | |
501 | while (true) |
502 | { |
503 | /* allocate using current block */ |
504 | size_t threadID = TaskScheduler::threadID(); |
505 | size_t slot = threadID & slotMask; |
506 | Block* myUsedBlocks = threadUsedBlocks[slot]; |
507 | if (myUsedBlocks) { |
508 | void* ptr = myUsedBlocks->malloc(device,bytes,align,partial); |
509 | if (ptr) return ptr; |
510 | } |
511 | |
512 | /* throw error if allocation is too large */ |
513 | if (bytes > maxAllocationSize) |
514 | throw_RTCError(RTC_ERROR_UNKNOWN,"allocation is too large" ); |
515 | |
516 | /* parallel block creation in case of no freeBlocks, avoids single global mutex */ |
517 | if (likely(freeBlocks.load() == nullptr)) |
518 | { |
519 | #if defined(APPLE) && defined(__aarch64__) |
520 | std::scoped_lock lock(slotMutex[slot]); |
521 | #else |
522 | Lock<SpinLock> lock(slotMutex[slot]); |
523 | #endif |
524 | if (myUsedBlocks == threadUsedBlocks[slot]) { |
525 | const size_t alignedBytes = (bytes+(align-1)) & ~(align-1); |
526 | const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes); |
527 | assert(allocSize >= bytes); |
528 | threadBlocks[slot] = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,threadBlocks[slot],atype); // FIXME: a large allocation might throw away a block here! |
529 | // FIXME: a direct allocation should allocate inside the block here, and not in the next loop! a different thread could do some allocation and make the large allocation fail. |
530 | } |
531 | continue; |
532 | } |
533 | |
534 | /* if this fails allocate new block */ |
535 | { |
536 | #if defined(APPLE) && defined(__aarch64__) |
537 | std::scoped_lock lock(mutex); |
538 | #else |
539 | Lock<SpinLock> lock(mutex); |
540 | #endif |
541 | if (myUsedBlocks == threadUsedBlocks[slot]) |
542 | { |
543 | if (freeBlocks.load() != nullptr) { |
544 | Block* nextFreeBlock = freeBlocks.load()->next; |
545 | freeBlocks.load()->next = usedBlocks; |
546 | __memory_barrier(); |
547 | usedBlocks = freeBlocks.load(); |
548 | threadUsedBlocks[slot] = freeBlocks.load(); |
549 | freeBlocks = nextFreeBlock; |
550 | } else { |
551 | const size_t allocSize = min(growSize*incGrowSizeScale(),maxGrowSize); |
552 | usedBlocks = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,usedBlocks,atype); // FIXME: a large allocation should get delivered directly, like above! |
553 | } |
554 | } |
555 | } |
556 | } |
557 | } |
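    /* Example of the slot distribution used above (illustrative): with
     * slotMask = 0x3, thread IDs 0..7 map to slots 0,1,2,3,0,1,2,3, so up to
     * four threads can create and fill thread blocks concurrently under
     * separate slot locks; only when free blocks exist does allocation fall
     * back to the single global mutex. */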
558 | |
559 | /*! add new block */ |
560 | void addBlock(void* ptr, ssize_t bytes) |
561 | { |
562 | #if defined(APPLE) && defined(__aarch64__) |
563 | std::scoped_lock lock(mutex); |
564 | #else |
565 | Lock<SpinLock> lock(mutex); |
566 | #endif |
      const size_t sizeof_Header = offsetof(Block,data[0]);
568 | void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1)); |
569 | size_t ofs = (size_t) aptr - (size_t) ptr; |
570 | bytes -= ofs; |
571 | if (bytes < 4096) return; // ignore empty or very small blocks |
572 | freeBlocks = new (aptr) Block(SHARED,bytes-sizeof_Header,bytes-sizeof_Header,freeBlocks,ofs); |
573 | } |
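    /* Illustrative use of addBlock (hypothetical buffer names): memory owned
     * elsewhere can be donated to the allocator as a SHARED block; reset()
     * later strips such blocks again via Block::remove_shared_blocks().
     *
     *   alloc.addBlock(externalBuffer, externalBufferBytes);
     */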
574 | |
    /* special allocation, used only by the morton builder and only a single time for each build */
576 | void* specialAlloc(size_t bytes) |
577 | { |
578 | assert(freeBlocks.load() != nullptr && freeBlocks.load()->getBlockAllocatedBytes() >= bytes); |
579 | return freeBlocks.load()->ptr(); |
580 | } |
581 | |
582 | struct Statistics |
583 | { |
584 | Statistics () |
585 | : bytesUsed(0), bytesFree(0), bytesWasted(0) {} |
586 | |
587 | Statistics (size_t bytesUsed, size_t bytesFree, size_t bytesWasted) |
588 | : bytesUsed(bytesUsed), bytesFree(bytesFree), bytesWasted(bytesWasted) {} |
589 | |
590 | Statistics (FastAllocator* alloc, AllocationType atype, bool huge_pages = false) |
591 | : bytesUsed(0), bytesFree(0), bytesWasted(0) |
592 | { |
593 | Block* usedBlocks = alloc->usedBlocks.load(); |
594 | Block* freeBlocks = alloc->freeBlocks.load(); |
595 | if (usedBlocks) bytesUsed += usedBlocks->getUsedBytes(atype,huge_pages); |
596 | if (freeBlocks) bytesFree += freeBlocks->getAllocatedBytes(atype,huge_pages); |
597 | if (usedBlocks) bytesFree += usedBlocks->getFreeBytes(atype,huge_pages); |
598 | if (freeBlocks) bytesWasted += freeBlocks->getWastedBytes(atype,huge_pages); |
599 | if (usedBlocks) bytesWasted += usedBlocks->getWastedBytes(atype,huge_pages); |
600 | } |
601 | |
602 | std::string str(size_t numPrimitives) |
603 | { |
604 | std::stringstream str; |
605 | str.setf(std::ios::fixed, std::ios::floatfield); |
606 | str << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, " |
607 | << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, " |
608 | << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, " |
609 | << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesAllocatedTotal() << " MB, " |
610 | << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesAllocatedTotal())/double(numPrimitives); |
611 | return str.str(); |
612 | } |
613 | |
614 | friend Statistics operator+ ( const Statistics& a, const Statistics& b) |
615 | { |
616 | return Statistics(a.bytesUsed+b.bytesUsed, |
617 | a.bytesFree+b.bytesFree, |
618 | a.bytesWasted+b.bytesWasted); |
619 | } |
620 | |
621 | size_t bytesAllocatedTotal() const { |
622 | return bytesUsed + bytesFree + bytesWasted; |
623 | } |
624 | |
625 | public: |
626 | size_t bytesUsed; |
627 | size_t bytesFree; |
628 | size_t bytesWasted; |
629 | }; |
630 | |
631 | Statistics getStatistics(AllocationType atype, bool huge_pages = false) { |
632 | return Statistics(this,atype,huge_pages); |
633 | } |
634 | |
635 | size_t getUsedBytes() { |
636 | return bytesUsed; |
637 | } |
638 | |
639 | size_t getWastedBytes() { |
640 | return bytesWasted; |
641 | } |
642 | |
643 | struct AllStatistics |
644 | { |
645 | AllStatistics (FastAllocator* alloc) |
646 | |
647 | : bytesUsed(alloc->bytesUsed), |
648 | bytesFree(alloc->bytesFree), |
649 | bytesWasted(alloc->bytesWasted), |
650 | stat_all(alloc,ANY_TYPE), |
651 | stat_malloc(alloc,ALIGNED_MALLOC), |
652 | stat_4K(alloc,EMBREE_OS_MALLOC,false), |
653 | stat_2M(alloc,EMBREE_OS_MALLOC,true), |
654 | stat_shared(alloc,SHARED) {} |
655 | |
656 | AllStatistics (size_t bytesUsed, |
657 | size_t bytesFree, |
658 | size_t bytesWasted, |
659 | Statistics stat_all, |
660 | Statistics stat_malloc, |
661 | Statistics stat_4K, |
662 | Statistics stat_2M, |
663 | Statistics stat_shared) |
664 | |
665 | : bytesUsed(bytesUsed), |
666 | bytesFree(bytesFree), |
667 | bytesWasted(bytesWasted), |
668 | stat_all(stat_all), |
669 | stat_malloc(stat_malloc), |
670 | stat_4K(stat_4K), |
671 | stat_2M(stat_2M), |
672 | stat_shared(stat_shared) {} |
673 | |
674 | friend AllStatistics operator+ (const AllStatistics& a, const AllStatistics& b) |
675 | { |
676 | return AllStatistics(a.bytesUsed+b.bytesUsed, |
677 | a.bytesFree+b.bytesFree, |
678 | a.bytesWasted+b.bytesWasted, |
679 | a.stat_all + b.stat_all, |
680 | a.stat_malloc + b.stat_malloc, |
681 | a.stat_4K + b.stat_4K, |
682 | a.stat_2M + b.stat_2M, |
683 | a.stat_shared + b.stat_shared); |
684 | } |
685 | |
686 | void print(size_t numPrimitives) |
687 | { |
688 | std::stringstream str0; |
689 | str0.setf(std::ios::fixed, std::ios::floatfield); |
690 | str0 << " alloc : " |
691 | << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, " |
692 | << " " |
693 | << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed)/double(numPrimitives); |
694 | std::cout << str0.str() << std::endl; |
695 | |
696 | std::stringstream str1; |
697 | str1.setf(std::ios::fixed, std::ios::floatfield); |
698 | str1 << " alloc : " |
699 | << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, " |
700 | << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, " |
701 | << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, " |
702 | << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*(bytesUsed+bytesFree+bytesWasted) << " MB, " |
703 | << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed+bytesFree+bytesWasted)/double(numPrimitives); |
704 | std::cout << str1.str() << std::endl; |
705 | |
706 | std::cout << " total : " << stat_all.str(numPrimitives) << std::endl; |
707 | std::cout << " 4K : " << stat_4K.str(numPrimitives) << std::endl; |
708 | std::cout << " 2M : " << stat_2M.str(numPrimitives) << std::endl; |
709 | std::cout << " malloc: " << stat_malloc.str(numPrimitives) << std::endl; |
710 | std::cout << " shared: " << stat_shared.str(numPrimitives) << std::endl; |
711 | } |
712 | |
713 | private: |
714 | size_t bytesUsed; |
715 | size_t bytesFree; |
716 | size_t bytesWasted; |
717 | Statistics stat_all; |
718 | Statistics stat_malloc; |
719 | Statistics stat_4K; |
720 | Statistics stat_2M; |
721 | Statistics stat_shared; |
722 | }; |
723 | |
724 | void print_blocks() |
725 | { |
726 | std::cout << " estimatedSize = " << estimatedSize << ", slotMask = " << slotMask << ", use_single_mode = " << use_single_mode << ", maxGrowSize = " << maxGrowSize << ", defaultBlockSize = " << defaultBlockSize << std::endl; |
727 | |
728 | std::cout << " used blocks = " ; |
729 | if (usedBlocks.load() != nullptr) usedBlocks.load()->print_list(); |
730 | std::cout << "[END]" << std::endl; |
731 | |
732 | std::cout << " free blocks = " ; |
733 | if (freeBlocks.load() != nullptr) freeBlocks.load()->print_list(); |
734 | std::cout << "[END]" << std::endl; |
735 | } |
736 | |
737 | private: |
738 | |
739 | struct Block |
740 | { |
741 | static Block* create(MemoryMonitorInterface* device, size_t bytesAllocate, size_t bytesReserve, Block* next, AllocationType atype) |
742 | { |
743 | /* We avoid using os_malloc for small blocks as this could |
744 | * cause a risk of fragmenting the virtual address space and |
745 | * reach the limit of vm.max_map_count = 65k under Linux. */ |
746 | if (atype == EMBREE_OS_MALLOC && bytesAllocate < maxAllocationSize) |
747 | atype = ALIGNED_MALLOC; |
748 | |
749 | /* we need to additionally allocate some header */ |
        const size_t sizeof_Header = offsetof(Block,data[0]);
751 | bytesAllocate = sizeof_Header+bytesAllocate; |
752 | bytesReserve = sizeof_Header+bytesReserve; |
753 | |
        /* consume full 4k pages when using os_malloc */
755 | if (atype == EMBREE_OS_MALLOC) { |
756 | bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1)); |
757 | bytesReserve = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1)); |
758 | } |
759 | |
760 | /* either use alignedMalloc or os_malloc */ |
761 | void *ptr = nullptr; |
762 | if (atype == ALIGNED_MALLOC) |
763 | { |
764 | /* special handling for default block size */ |
765 | if (bytesAllocate == (2*PAGE_SIZE_2M)) |
766 | { |
767 | const size_t alignment = maxAlignment; |
768 | if (device) device->memoryMonitor(bytesAllocate+alignment,false); |
769 | ptr = alignedMalloc(bytesAllocate,alignment); |
770 | |
771 | /* give hint to transparently convert these pages to 2MB pages */ |
772 | const size_t ptr_aligned_begin = ((size_t)ptr) & ~size_t(PAGE_SIZE_2M-1); |
773 | os_advise((void*)(ptr_aligned_begin + 0),PAGE_SIZE_2M); // may fail if no memory mapped before block |
774 | os_advise((void*)(ptr_aligned_begin + 1*PAGE_SIZE_2M),PAGE_SIZE_2M); |
775 | os_advise((void*)(ptr_aligned_begin + 2*PAGE_SIZE_2M),PAGE_SIZE_2M); // may fail if no memory mapped after block |
776 | |
777 | return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment); |
778 | } |
779 | else |
780 | { |
781 | const size_t alignment = maxAlignment; |
782 | if (device) device->memoryMonitor(bytesAllocate+alignment,false); |
783 | ptr = alignedMalloc(bytesAllocate,alignment); |
784 | return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment); |
785 | } |
786 | } |
787 | else if (atype == EMBREE_OS_MALLOC) |
788 | { |
789 | if (device) device->memoryMonitor(bytesAllocate,false); |
790 | bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages); |
791 | return new (ptr) Block(EMBREE_OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages); |
792 | } |
793 | else |
794 | assert(false); |
795 | |
796 | return NULL; |
797 | } |
798 | |
799 | Block (AllocationType atype, size_t bytesAllocate, size_t bytesReserve, Block* next, size_t wasted, bool huge_pages = false) |
800 | : cur(0), allocEnd(bytesAllocate), reserveEnd(bytesReserve), next(next), wasted(wasted), atype(atype), huge_pages(huge_pages) |
801 | { |
802 | assert((((size_t)&data[0]) & (maxAlignment-1)) == 0); |
803 | } |
804 | |
805 | static Block* remove_shared_blocks(Block* head) |
806 | { |
807 | Block** prev_next = &head; |
808 | for (Block* block = head; block; block = block->next) { |
809 | if (block->atype == SHARED) *prev_next = block->next; |
810 | else prev_next = &block->next; |
811 | } |
812 | return head; |
813 | } |
814 | |
815 | void clear_list(MemoryMonitorInterface* device) |
816 | { |
817 | Block* block = this; |
818 | while (block) { |
819 | Block* next = block->next; |
820 | block->clear_block(device); |
821 | block = next; |
822 | } |
823 | } |
824 | |
825 | void clear_block (MemoryMonitorInterface* device) |
826 | { |
        const size_t sizeof_Header = offsetof(Block,data[0]);
828 | const ssize_t sizeof_Alloced = wasted+sizeof_Header+getBlockAllocatedBytes(); |
829 | |
830 | if (atype == ALIGNED_MALLOC) { |
831 | alignedFree(this); |
832 | if (device) device->memoryMonitor(-sizeof_Alloced,true); |
833 | } |
834 | |
835 | else if (atype == EMBREE_OS_MALLOC) { |
836 | size_t sizeof_This = sizeof_Header+reserveEnd; |
837 | os_free(this,sizeof_This,huge_pages); |
838 | if (device) device->memoryMonitor(-sizeof_Alloced,true); |
839 | } |
840 | |
841 | else /* if (atype == SHARED) */ { |
842 | } |
843 | } |
844 | |
845 | void* malloc(MemoryMonitorInterface* device, size_t& bytes_in, size_t align, bool partial) |
846 | { |
847 | size_t bytes = bytes_in; |
848 | assert(align <= maxAlignment); |
849 | bytes = (bytes+(align-1)) & ~(align-1); |
850 | if (unlikely(cur+bytes > reserveEnd && !partial)) return nullptr; |
851 | const size_t i = cur.fetch_add(bytes); |
852 | if (unlikely(i+bytes > reserveEnd && !partial)) return nullptr; |
853 | if (unlikely(i > reserveEnd)) return nullptr; |
854 | bytes_in = bytes = min(bytes,reserveEnd-i); |
855 | |
856 | if (i+bytes > allocEnd) { |
857 | if (device) device->memoryMonitor(i+bytes-max(i,allocEnd),true); |
858 | } |
859 | return &data[i]; |
860 | } |
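      /* Worked example of the lock-free bump allocation above (illustrative):
       * two threads requesting 64 bytes each with partial = true perform
       * cur.fetch_add(64) concurrently and receive disjoint offsets i and
       * i+64 without taking a lock. If a partial request crosses reserveEnd,
       * bytes_in is truncated to the remaining reserveEnd-i bytes, so the
       * caller gets a smaller piece instead of a failure. */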
861 | |
862 | void* ptr() { |
863 | return &data[cur]; |
864 | } |
865 | |
866 | void reset_block () |
867 | { |
868 | allocEnd = max(allocEnd,(size_t)cur); |
869 | cur = 0; |
870 | } |
871 | |
872 | size_t getBlockUsedBytes() const { |
873 | return min(size_t(cur),reserveEnd); |
874 | } |
875 | |
876 | size_t getBlockFreeBytes() const { |
877 | return getBlockAllocatedBytes() - getBlockUsedBytes(); |
878 | } |
879 | |
880 | size_t getBlockAllocatedBytes() const { |
881 | return min(max(allocEnd,size_t(cur)),reserveEnd); |
882 | } |
883 | |
884 | size_t getBlockWastedBytes() const { |
        const size_t sizeof_Header = offsetof(Block,data[0]);
886 | return sizeof_Header + wasted; |
887 | } |
888 | |
889 | size_t getBlockReservedBytes() const { |
890 | return reserveEnd; |
891 | } |
892 | |
893 | bool hasType(AllocationType atype_i, bool huge_pages_i) const |
894 | { |
895 | if (atype_i == ANY_TYPE ) return true; |
896 | else if (atype == EMBREE_OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages; |
897 | else return atype_i == atype; |
898 | } |
899 | |
900 | size_t getUsedBytes(AllocationType atype, bool huge_pages = false) const { |
901 | size_t bytes = 0; |
902 | for (const Block* block = this; block; block = block->next) { |
903 | if (!block->hasType(atype,huge_pages)) continue; |
904 | bytes += block->getBlockUsedBytes(); |
905 | } |
906 | return bytes; |
907 | } |
908 | |
909 | size_t getFreeBytes(AllocationType atype, bool huge_pages = false) const { |
910 | size_t bytes = 0; |
911 | for (const Block* block = this; block; block = block->next) { |
912 | if (!block->hasType(atype,huge_pages)) continue; |
913 | bytes += block->getBlockFreeBytes(); |
914 | } |
915 | return bytes; |
916 | } |
917 | |
918 | size_t getWastedBytes(AllocationType atype, bool huge_pages = false) const { |
919 | size_t bytes = 0; |
920 | for (const Block* block = this; block; block = block->next) { |
921 | if (!block->hasType(atype,huge_pages)) continue; |
922 | bytes += block->getBlockWastedBytes(); |
923 | } |
924 | return bytes; |
925 | } |
926 | |
927 | size_t getAllocatedBytes(AllocationType atype, bool huge_pages = false) const { |
928 | size_t bytes = 0; |
929 | for (const Block* block = this; block; block = block->next) { |
930 | if (!block->hasType(atype,huge_pages)) continue; |
931 | bytes += block->getBlockAllocatedBytes(); |
932 | } |
933 | return bytes; |
934 | } |
935 | |
936 | void print_list () |
937 | { |
938 | for (const Block* block = this; block; block = block->next) |
939 | block->print_block(); |
940 | } |
941 | |
942 | void print_block() const |
943 | { |
944 | if (atype == ALIGNED_MALLOC) std::cout << "A" ; |
945 | else if (atype == EMBREE_OS_MALLOC) std::cout << "O" ; |
946 | else if (atype == SHARED) std::cout << "S" ; |
947 | if (huge_pages) std::cout << "H" ; |
948 | size_t bytesUsed = getBlockUsedBytes(); |
949 | size_t bytesFree = getBlockFreeBytes(); |
950 | size_t bytesWasted = getBlockWastedBytes(); |
951 | std::cout << "[" << bytesUsed << ", " << bytesFree << ", " << bytesWasted << "] " ; |
952 | } |
953 | |
954 | public: |
955 | std::atomic<size_t> cur; //!< current location of the allocator |
956 | std::atomic<size_t> allocEnd; //!< end of the allocated memory region |
957 | std::atomic<size_t> reserveEnd; //!< end of the reserved memory region |
958 | Block* next; //!< pointer to next block in list |
959 | size_t wasted; //!< amount of memory wasted through block alignment |
960 | AllocationType atype; //!< allocation mode of the block |
961 | bool huge_pages; //!< whether the block uses huge pages |
962 | char align[maxAlignment-5*sizeof(size_t)-sizeof(AllocationType)-sizeof(bool)]; //!< align data to maxAlignment |
963 | char data[1]; //!< here starts memory to use for allocations |
964 | }; |
965 | |
966 | private: |
967 | Device* device; |
968 | SpinLock mutex; |
969 | size_t slotMask; |
970 | std::atomic<Block*> threadUsedBlocks[MAX_THREAD_USED_BLOCK_SLOTS]; |
971 | std::atomic<Block*> usedBlocks; |
972 | std::atomic<Block*> freeBlocks; |
973 | |
974 | std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS]; |
975 | #if defined(APPLE) && defined(__aarch64__) |
976 | std::mutex slotMutex[MAX_THREAD_USED_BLOCK_SLOTS]; |
977 | #else |
978 | PaddedSpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS]; |
979 | #endif |
980 | |
981 | bool use_single_mode; |
982 | size_t defaultBlockSize; |
983 | size_t estimatedSize; |
984 | size_t growSize; |
985 | size_t maxGrowSize; |
986 | std::atomic<size_t> log2_grow_size_scale; //!< log2 of scaling factor for grow size // FIXME: remove |
987 | std::atomic<size_t> bytesUsed; |
988 | std::atomic<size_t> bytesFree; |
989 | std::atomic<size_t> bytesWasted; |
990 | static __thread ThreadLocal2* thread_local_allocator2; |
991 | static SpinLock s_thread_local_allocators_lock; |
992 | static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators; |
993 | #if defined(APPLE) && defined(__aarch64__) |
994 | std::mutex thread_local_allocators_lock; |
995 | #else |
996 | SpinLock thread_local_allocators_lock; |
997 | #endif |
998 | std::vector<ThreadLocal2*> thread_local_allocators; |
999 | AllocationType atype; |
1000 | mvector<PrimRef> primrefarray; //!< primrefarray used to allocate nodes |
1001 | }; |
1002 | } |
1003 | |