// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "default.h"
#include "device.h"
#include "scene.h"
#include "primref.h"

#if defined(APPLE) && defined(__aarch64__)
#include <mutex>
#endif

namespace embree
{
  class FastAllocator
  {
    /*! maximum supported alignment */
    static const size_t maxAlignment = 64;

    /* default settings */
    //static const size_t defaultBlockSize = 4096;

    /*! maximum allocation size */
#define maxAllocationSize size_t(2*1024*1024-maxAlignment)

    static const size_t MAX_THREAD_USED_BLOCK_SLOTS = 8;

  public:

    struct ThreadLocal2;
    enum AllocationType { ALIGNED_MALLOC, EMBREE_OS_MALLOC, SHARED, ANY_TYPE };

    /*! Per thread structure holding the current memory block. */
    struct __aligned(64) ThreadLocal
    {
      ALIGNED_CLASS_(64);
    public:

      /*! Constructor for usage with ThreadLocalData */
      __forceinline ThreadLocal (ThreadLocal2* parent)
        : parent(parent), ptr(nullptr), cur(0), end(0), allocBlockSize(0), bytesUsed(0), bytesWasted(0) {}

      /*! initialize allocator */
      void init(FastAllocator* alloc)
      {
        ptr = nullptr;
        cur = end = 0;
        bytesUsed = 0;
        bytesWasted = 0;
        allocBlockSize = 0;
        if (alloc) allocBlockSize = alloc->defaultBlockSize;
      }

      /* Allocate aligned memory from the thread's memory block. */
      __forceinline void* malloc(FastAllocator* alloc, size_t bytes, size_t align = 16)
      {
        /* bind the thread local allocator to the proper FastAllocator */
        parent->bind(alloc);

        assert(align <= maxAlignment);
        bytesUsed += bytes;

        /* try to allocate in local block */
        size_t ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* if allocation is too large allocate with parent allocator */
        if (4*bytes > allocBlockSize) {
          return alloc->malloc(bytes,maxAlignment,false);
        }

        /* get new partial block if allocation failed */
        size_t blockSize = allocBlockSize;
        ptr = (char*) alloc->malloc(blockSize,maxAlignment,true);
        bytesWasted += end-cur;
        cur = 0; end = blockSize;

        /* retry allocation */
        ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* get new full block if allocation failed */
        blockSize = allocBlockSize;
        ptr = (char*) alloc->malloc(blockSize,maxAlignment,false);
        bytesWasted += end-cur;
        cur = 0; end = blockSize;

        /* retry allocation */
        ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* should never happen as large allocations get handled specially above */
        assert(false);
        return nullptr;
      }
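
      /* Worked example (illustrative only, not part of the original code): with
       * cur == 13 and align == 16, ofs = (16 - 13) & 15 = 3, so the allocation is
       * placed at offset 16 and the 3 skipped bytes are accounted as bytesWasted.
       * A request with 4*bytes > allocBlockSize bypasses the local block entirely
       * and is served directly by FastAllocator::malloc with maxAlignment. */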

      /*! returns amount of used bytes */
      __forceinline size_t getUsedBytes() const { return bytesUsed; }

      /*! returns amount of free bytes */
      __forceinline size_t getFreeBytes() const { return end-cur; }

      /*! returns amount of wasted bytes */
      __forceinline size_t getWastedBytes() const { return bytesWasted; }

    private:
      ThreadLocal2* parent;
      char*  ptr;            //!< pointer to memory block
      size_t cur;            //!< current location of the allocator
      size_t end;            //!< end of the memory block
      size_t allocBlockSize; //!< block size for allocations
      size_t bytesUsed;      //!< number of total bytes allocated
      size_t bytesWasted;    //!< number of bytes wasted
    };

    /*! Two thread local structures. */
    struct __aligned(64) ThreadLocal2
    {
      ALIGNED_CLASS_(64);
    public:

      __forceinline ThreadLocal2()
        : alloc(nullptr), alloc0(this), alloc1(this) {}

      /*! bind to fast allocator */
      __forceinline void bind(FastAllocator* alloc_i)
      {
        assert(alloc_i);
        if (alloc.load() == alloc_i) return;
#if defined(APPLE) && defined(__aarch64__)
        std::scoped_lock lock(mutex);
#else
        Lock<SpinLock> lock(mutex);
#endif
        //if (alloc.load() == alloc_i) return; // not required as only one thread calls bind
        if (alloc.load()) {
          alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes();
          alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes();
          alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
        }
        alloc0.init(alloc_i);
        alloc1.init(alloc_i);
        alloc.store(alloc_i);
        alloc_i->join(this);
      }
      /*! unbind from fast allocator */
      void unbind(FastAllocator* alloc_i)
      {
        assert(alloc_i);
        if (alloc.load() != alloc_i) return;
#if defined(APPLE) && defined(__aarch64__)
        std::scoped_lock lock(mutex);
#else
        Lock<SpinLock> lock(mutex);
#endif
        if (alloc.load() != alloc_i) return; // required as a different thread calls unbind
        alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes();
        alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes();
        alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
        alloc0.init(nullptr);
        alloc1.init(nullptr);
        alloc.store(nullptr);
      }

    public:
#if defined(APPLE) && defined(__aarch64__)
      std::mutex mutex;
#else
      SpinLock mutex; //!< required as unbind is called from other threads
#endif
      std::atomic<FastAllocator*> alloc;  //!< parent allocator
      ThreadLocal alloc0;
      ThreadLocal alloc1;
    };
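
    /* Each thread owns a single ThreadLocal2 instance holding two bump allocators,
     * alloc0 and alloc1, which builders can use as two separate allocation streams
     * (e.g. nodes vs. leaf data; see CachedAllocator below). bind() first flushes the
     * per-thread statistics of a previously bound FastAllocator back into that
     * allocator, then re-initializes both ThreadLocal allocators for the new one and
     * registers this ThreadLocal2 via join(). */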

    FastAllocator (Device* device, bool osAllocation)
      : device(device), slotMask(0), usedBlocks(nullptr), freeBlocks(nullptr), use_single_mode(false), defaultBlockSize(PAGE_SIZE), estimatedSize(0),
        growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? EMBREE_OS_MALLOC : ALIGNED_MALLOC),
        primrefarray(device,0)
    {
      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
        assert(!slotMutex[i].isLocked());
      }
    }

    ~FastAllocator () {
      clear();
    }

    /*! returns the device attached to this allocator */
    Device* getDevice() {
      return device;
    }

    void share(mvector<PrimRef>& primrefarray_i) {
      primrefarray = std::move(primrefarray_i);
    }

    void unshare(mvector<PrimRef>& primrefarray_o)
    {
      reset(); // this removes blocks that are allocated inside the shared primref array
      primrefarray_o = std::move(primrefarray);
    }

    /*! returns first fast thread local allocator */
    __forceinline ThreadLocal* _threadLocal() {
      return &threadLocal2()->alloc0;
    }

    void setOSallocation(bool flag)
    {
      atype = flag ? EMBREE_OS_MALLOC : ALIGNED_MALLOC;
    }

  private:

    /*! returns both fast thread local allocators */
    __forceinline ThreadLocal2* threadLocal2()
    {
      ThreadLocal2* alloc = thread_local_allocator2;
      if (alloc == nullptr) {
        thread_local_allocator2 = alloc = new ThreadLocal2;
#if defined(APPLE) && defined(__aarch64__)
        std::scoped_lock lock(s_thread_local_allocators_lock);
#else
        Lock<SpinLock> lock(s_thread_local_allocators_lock);
#endif
        s_thread_local_allocators.push_back(make_unique(alloc));
      }
      return alloc;
    }

  public:

    __forceinline void join(ThreadLocal2* alloc)
    {
#if defined(APPLE) && defined(__aarch64__)
      std::scoped_lock lock(thread_local_allocators_lock);
#else
      Lock<SpinLock> lock(thread_local_allocators_lock);
#endif
      thread_local_allocators.push_back(alloc);
    }

  public:

    struct CachedAllocator
    {
      __forceinline CachedAllocator(void* ptr)
        : alloc(nullptr), talloc0(nullptr), talloc1(nullptr)
      {
        assert(ptr == nullptr);
      }

      __forceinline CachedAllocator(FastAllocator* alloc, ThreadLocal2* talloc)
        : alloc(alloc), talloc0(&talloc->alloc0), talloc1(alloc->use_single_mode ? &talloc->alloc0 : &talloc->alloc1) {}

      __forceinline operator bool () const {
        return alloc != nullptr;
      }

      __forceinline void* operator() (size_t bytes, size_t align = 16) const {
        return talloc0->malloc(alloc,bytes,align);
      }

      __forceinline void* malloc0 (size_t bytes, size_t align = 16) const {
        return talloc0->malloc(alloc,bytes,align);
      }

      __forceinline void* malloc1 (size_t bytes, size_t align = 16) const {
        return talloc1->malloc(alloc,bytes,align);
      }

    public:
      FastAllocator* alloc;
      ThreadLocal* talloc0;
      ThreadLocal* talloc1;
    };

    __forceinline CachedAllocator getCachedAllocator() {
      return CachedAllocator(this,threadLocal2());
    }

    /*! Builder interface to create thread local allocator */
    struct Create
    {
    public:
      __forceinline Create (FastAllocator* allocator) : allocator(allocator) {}
      __forceinline CachedAllocator operator() () const { return allocator->getCachedAllocator(); }

    private:
      FastAllocator* allocator;
    };
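
    /* Usage sketch (illustrative only; the builder code and byte counts below are
     * hypothetical, not taken from this file):
     *
     *   FastAllocator::Create create(&allocator);
     *   FastAllocator::CachedAllocator alloc = create(); // cached per worker thread
     *   void* node = alloc.malloc0(64, 64);              // first allocation stream
     *   void* leaf = alloc.malloc1(128);                 // second allocation stream
     *
     * When use_single_mode is set, malloc1 is served by the same ThreadLocal as
     * malloc0, so both streams share one block. */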

    void internal_fix_used_blocks()
    {
      /* move thread local blocks to global block list */
      for (size_t i = 0; i < MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        while (threadBlocks[i].load() != nullptr) {
          Block* nextUsedBlock = threadBlocks[i].load()->next;
          threadBlocks[i].load()->next = usedBlocks.load();
          usedBlocks = threadBlocks[i].load();
          threadBlocks[i] = nextUsedBlock;
        }
        threadBlocks[i] = nullptr;
      }
    }

    static const size_t threadLocalAllocOverhead = 20; //! 20 means 5% parallel allocation overhead through unfilled thread local blocks
    static const size_t mainAllocOverheadStatic  = 20; //! 20 means 5% allocation overhead through unfilled main alloc blocks
    static const size_t mainAllocOverheadDynamic = 8;  //! 8 means 12.5% allocation overhead through unfilled main alloc blocks

    /* calculates a single threaded threshold for the builders such
     * that for small scenes the overhead of partly allocated blocks
     * per thread is low */
    size_t fixSingleThreadThreshold(size_t branchingFactor, size_t defaultThreshold, size_t numPrimitives, size_t bytesEstimated)
    {
      if (numPrimitives == 0 || bytesEstimated == 0)
        return defaultThreshold;

      /* calculate block size in bytes to fulfill threadLocalAllocOverhead constraint */
      const size_t single_mode_factor = use_single_mode ? 1 : 2;
      const size_t threadCount = TaskScheduler::threadCount();
      const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSize;

      /* if we do not have to limit number of threads use optimal threshold */
      if ( (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
        return defaultThreshold;

      /* otherwise limit number of threads by calculating proper single thread threshold */
      else {
        double bytesPerPrimitive = double(bytesEstimated)/double(numPrimitives);
        return size_t(ceil(branchingFactor*singleThreadBytes/bytesPerPrimitive));
      }
    }
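
    /* Worked example (illustrative numbers, not taken from this file): with
     * use_single_mode == false, threadLocalAllocOverhead == 20 and
     * defaultBlockSize == 4096, singleThreadBytes = 2*20*4096 = 163840. For
     * bytesEstimated = 1 MB, numPrimitives = 10000, branchingFactor = 2 and 16
     * worker threads, ceil(1048576/163840) = 7 < 16, so the thread count gets
     * limited: bytesPerPrimitive = 104.8576 and the returned threshold is
     * ceil(2*163840/104.8576) = 3125 primitives. */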

    __forceinline size_t alignSize(size_t i) {
      return (i+127)/128*128;
    }
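
    /* For example, alignSize(4100) = (4100+127)/128*128 = 4224, i.e. sizes are
     * rounded up to the next multiple of 128 bytes. */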

    /*! initializes the grow size */
    __forceinline void initGrowSizeAndNumSlots(size_t bytesEstimated, bool fast)
    {
      /* we do not need single thread local allocator mode */
      use_single_mode = false;

      /* calculate growSize such that at most mainAllocOverhead gets wasted when a block stays unused */
      size_t mainAllocOverhead = fast ? mainAllocOverheadDynamic : mainAllocOverheadStatic;
      size_t blockSize = alignSize(bytesEstimated/mainAllocOverhead);
      growSize = maxGrowSize = clamp(blockSize,size_t(1024),maxAllocationSize);

      /* if we reached the maxAllocationSize for growSize, we can
       * increase the number of allocation slots while still guaranteeing
       * the mainAllocOverhead */
      slotMask = 0x0;

      if (MAX_THREAD_USED_BLOCK_SLOTS >= 2 && bytesEstimated > 2*mainAllocOverhead*growSize) slotMask = 0x1;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 4 && bytesEstimated > 4*mainAllocOverhead*growSize) slotMask = 0x3;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 8*mainAllocOverhead*growSize) slotMask = 0x7;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 16*mainAllocOverhead*growSize) { growSize *= 2; } /* if the overhead is tiny, double the growSize */

      /* set the thread local alloc block size */
      size_t defaultBlockSizeSwitch = PAGE_SIZE+maxAlignment;

      /* for sufficiently large scenes we can increase the defaultBlockSize over the defaultBlockSizeSwitch size */
#if 0 // we do not do this as a block size of 4160 is for some reason best for KNL
      const size_t threadCount = TaskScheduler::threadCount();
      const size_t single_mode_factor = use_single_mode ? 1 : 2;
      const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSizeSwitch;
      if ((bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
        defaultBlockSize = min(max(defaultBlockSizeSwitch,bytesEstimated/(single_mode_factor*threadLocalAllocOverhead*threadCount)),growSize);

      /* otherwise we grow the defaultBlockSize up to defaultBlockSizeSwitch */
      else
#endif
        defaultBlockSize = clamp(blockSize,size_t(1024),defaultBlockSizeSwitch);

      if (bytesEstimated == 0) {
        maxGrowSize = maxAllocationSize; // special mode if builder cannot estimate tree size
        defaultBlockSize = defaultBlockSizeSwitch;
      }
      log2_grow_size_scale = 0;

      if (device->alloc_main_block_size != 0) growSize = device->alloc_main_block_size;
      if (device->alloc_num_main_slots >= 1 ) slotMask = 0x0;
      if (device->alloc_num_main_slots >= 2 ) slotMask = 0x1;
      if (device->alloc_num_main_slots >= 4 ) slotMask = 0x3;
      if (device->alloc_num_main_slots >= 8 ) slotMask = 0x7;
      if (device->alloc_thread_block_size != 0) defaultBlockSize = device->alloc_thread_block_size;
      if (device->alloc_single_thread_alloc != -1) use_single_mode = device->alloc_single_thread_alloc;
    }
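
    /* Worked example (illustrative, assuming PAGE_SIZE == 4096): for
     * bytesEstimated = 256 MB and fast == false, mainAllocOverhead = 20 and
     * blockSize = alignSize(268435456/20) = 13421824, which gets clamped to
     * maxAllocationSize, so growSize = maxGrowSize ~= 2 MB. Since 256 MB exceeds
     * 4*20*growSize but not 8*20*growSize, slotMask becomes 0x3 (four block slots),
     * and defaultBlockSize is clamped to defaultBlockSizeSwitch = 4160 bytes. */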

    /*! initializes the allocator */
    void init(size_t bytesAllocate, size_t bytesReserve, size_t bytesEstimate)
    {
      internal_fix_used_blocks();
      /* distribute the allocation to multiple thread block slots */
      slotMask = MAX_THREAD_USED_BLOCK_SLOTS-1; // FIXME: remove
      if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }
      if (bytesReserve == 0) bytesReserve = bytesAllocate;
      freeBlocks = Block::create(device,bytesAllocate,bytesReserve,nullptr,atype);
      estimatedSize = bytesEstimate;
      initGrowSizeAndNumSlots(bytesEstimate,true);
    }
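
    /* Typical call sequence (a sketch only; the byte counts are hypothetical):
     *
     *   FastAllocator alloc(device, false);
     *   alloc.init(1024*1024, 16*1024*1024, bytesEstimate); // allocate 1 MB, reserve 16 MB
     *   // ... run the builder, allocating via getCachedAllocator() ...
     *   alloc.cleanup();                                    // unbind thread local allocators
     *
     * Passing bytesReserve == 0 makes the reserve equal to bytesAllocate. */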

    /*! initializes the allocator */
    void init_estimate(size_t bytesEstimate)
    {
      internal_fix_used_blocks();
      if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }
      /* single allocator mode ? */
      estimatedSize = bytesEstimate;
      initGrowSizeAndNumSlots(bytesEstimate,false);
    }

    /*! frees state not required after build */
    __forceinline void cleanup()
    {
      internal_fix_used_blocks();

      /* unbind all thread local allocators */
      for (auto alloc : thread_local_allocators) alloc->unbind(this);
      thread_local_allocators.clear();
    }

    /*! resets the allocator, memory blocks get reused */
    void reset ()
    {
      internal_fix_used_blocks();

      bytesUsed.store(0);
      bytesFree.store(0);
      bytesWasted.store(0);

      /* reset all used blocks and move them to begin of free block list */
      while (usedBlocks.load() != nullptr) {
        usedBlocks.load()->reset_block();
        Block* nextUsedBlock = usedBlocks.load()->next;
        usedBlocks.load()->next = freeBlocks.load();
        freeBlocks = usedBlocks.load();
        usedBlocks = nextUsedBlock;
      }

      /* remove all shared blocks as they are re-added during build */
      freeBlocks.store(Block::remove_shared_blocks(freeBlocks.load()));

      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
      }

      /* unbind all thread local allocators */
      for (auto alloc : thread_local_allocators) alloc->unbind(this);
      thread_local_allocators.clear();
    }

    /*! frees all allocated memory */
    __forceinline void clear()
    {
      cleanup();
      bytesUsed.store(0);
      bytesFree.store(0);
      bytesWasted.store(0);
      if (usedBlocks.load() != nullptr) usedBlocks.load()->clear_list(device); usedBlocks = nullptr;
      if (freeBlocks.load() != nullptr) freeBlocks.load()->clear_list(device); freeBlocks = nullptr;
      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
      }
      primrefarray.clear();
    }

    __forceinline size_t incGrowSizeScale()
    {
      size_t scale = log2_grow_size_scale.fetch_add(1)+1;
      return size_t(1) << min(size_t(16),scale);
    }
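
    /* Successive calls return 2, 4, 8, ..., capped at 2^16. In malloc() below this
     * makes freshly created blocks grow geometrically, min(growSize*2,maxGrowSize),
     * min(growSize*4,maxGrowSize), ..., so a badly underestimated scene still only
     * needs a small number of extra blocks. */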

    /*! thread safe allocation of memory */
    void* malloc(size_t& bytes, size_t align, bool partial)
    {
      assert(align <= maxAlignment);

      while (true)
      {
        /* allocate using current block */
        size_t threadID = TaskScheduler::threadID();
        size_t slot = threadID & slotMask;
        Block* myUsedBlocks = threadUsedBlocks[slot];
        if (myUsedBlocks) {
          void* ptr = myUsedBlocks->malloc(device,bytes,align,partial);
          if (ptr) return ptr;
        }

        /* throw error if allocation is too large */
        if (bytes > maxAllocationSize)
          throw_RTCError(RTC_ERROR_UNKNOWN,"allocation is too large");

        /* parallel block creation in case of no freeBlocks, avoids single global mutex */
        if (likely(freeBlocks.load() == nullptr))
        {
#if defined(APPLE) && defined(__aarch64__)
          std::scoped_lock lock(slotMutex[slot]);
#else
          Lock<SpinLock> lock(slotMutex[slot]);
#endif
          if (myUsedBlocks == threadUsedBlocks[slot]) {
            const size_t alignedBytes = (bytes+(align-1)) & ~(align-1);
            const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes);
            assert(allocSize >= bytes);
            threadBlocks[slot] = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,threadBlocks[slot],atype); // FIXME: a large allocation might throw away a block here!
            // FIXME: a direct allocation should allocate inside the block here, and not in the next loop! a different thread could do some allocation and make the large allocation fail.
          }
          continue;
        }

        /* if this fails allocate new block */
        {
#if defined(APPLE) && defined(__aarch64__)
          std::scoped_lock lock(mutex);
#else
          Lock<SpinLock> lock(mutex);
#endif
          if (myUsedBlocks == threadUsedBlocks[slot])
          {
            if (freeBlocks.load() != nullptr) {
              Block* nextFreeBlock = freeBlocks.load()->next;
              freeBlocks.load()->next = usedBlocks;
              __memory_barrier();
              usedBlocks = freeBlocks.load();
              threadUsedBlocks[slot] = freeBlocks.load();
              freeBlocks = nextFreeBlock;
            } else {
              const size_t allocSize = min(growSize*incGrowSizeScale(),maxGrowSize);
              usedBlocks = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,usedBlocks,atype); // FIXME: a large allocation should get delivered directly, like above!
            }
          }
        }
      }
    }
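
    /* The slot index is computed as threadID & slotMask, so with e.g. slotMask == 0x3
     * the worker threads are spread over four independent block slots, each guarded by
     * its own slotMutex; the global mutex is only taken when a block is recycled from
     * the freeBlocks list or a new block is appended to usedBlocks. */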

    /*! add new block */
    void addBlock(void* ptr, ssize_t bytes)
    {
#if defined(APPLE) && defined(__aarch64__)
      std::scoped_lock lock(mutex);
#else
      Lock<SpinLock> lock(mutex);
#endif
      const size_t sizeof_Header = offsetof(Block,data[0]);
      void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1));
      size_t ofs = (size_t) aptr - (size_t) ptr;
      bytes -= ofs;
      if (bytes < 4096) return; // ignore empty or very small blocks
      freeBlocks = new (aptr) Block(SHARED,bytes-sizeof_Header,bytes-sizeof_Header,freeBlocks,ofs);
    }
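
    /* Example (illustrative addresses): a user buffer starting at address 0x1008 is
     * rounded up to the 64-byte aligned address 0x1040, so ofs = 56 bytes count as
     * wasted. After subtracting ofs, buffers with fewer than 4096 usable bytes are
     * ignored; otherwise the remainder (minus the Block header) is prepended to
     * freeBlocks as a SHARED block, which reset() later removes again. */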

    /* special allocation, used only by the Morton builder, a single time per build */
    void* specialAlloc(size_t bytes)
    {
      assert(freeBlocks.load() != nullptr && freeBlocks.load()->getBlockAllocatedBytes() >= bytes);
      return freeBlocks.load()->ptr();
    }

    struct Statistics
    {
      Statistics ()
        : bytesUsed(0), bytesFree(0), bytesWasted(0) {}

      Statistics (size_t bytesUsed, size_t bytesFree, size_t bytesWasted)
        : bytesUsed(bytesUsed), bytesFree(bytesFree), bytesWasted(bytesWasted) {}

      Statistics (FastAllocator* alloc, AllocationType atype, bool huge_pages = false)
        : bytesUsed(0), bytesFree(0), bytesWasted(0)
      {
        Block* usedBlocks = alloc->usedBlocks.load();
        Block* freeBlocks = alloc->freeBlocks.load();
        if (usedBlocks) bytesUsed += usedBlocks->getUsedBytes(atype,huge_pages);
        if (freeBlocks) bytesFree += freeBlocks->getAllocatedBytes(atype,huge_pages);
        if (usedBlocks) bytesFree += usedBlocks->getFreeBytes(atype,huge_pages);
        if (freeBlocks) bytesWasted += freeBlocks->getWastedBytes(atype,huge_pages);
        if (usedBlocks) bytesWasted += usedBlocks->getWastedBytes(atype,huge_pages);
      }

      std::string str(size_t numPrimitives)
      {
        std::stringstream str;
        str.setf(std::ios::fixed, std::ios::floatfield);
        str << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
            << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "
            << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "
            << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesAllocatedTotal() << " MB, "
            << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesAllocatedTotal())/double(numPrimitives);
        return str.str();
      }

      friend Statistics operator+ ( const Statistics& a, const Statistics& b)
      {
        return Statistics(a.bytesUsed+b.bytesUsed,
                          a.bytesFree+b.bytesFree,
                          a.bytesWasted+b.bytesWasted);
      }

      size_t bytesAllocatedTotal() const {
        return bytesUsed + bytesFree + bytesWasted;
      }

    public:
      size_t bytesUsed;
      size_t bytesFree;
      size_t bytesWasted;
    };

    Statistics getStatistics(AllocationType atype, bool huge_pages = false) {
      return Statistics(this,atype,huge_pages);
    }

    size_t getUsedBytes() {
      return bytesUsed;
    }

    size_t getWastedBytes() {
      return bytesWasted;
    }

    struct AllStatistics
    {
      AllStatistics (FastAllocator* alloc)
        : bytesUsed(alloc->bytesUsed),
          bytesFree(alloc->bytesFree),
          bytesWasted(alloc->bytesWasted),
          stat_all(alloc,ANY_TYPE),
          stat_malloc(alloc,ALIGNED_MALLOC),
          stat_4K(alloc,EMBREE_OS_MALLOC,false),
          stat_2M(alloc,EMBREE_OS_MALLOC,true),
          stat_shared(alloc,SHARED) {}

      AllStatistics (size_t bytesUsed,
                     size_t bytesFree,
                     size_t bytesWasted,
                     Statistics stat_all,
                     Statistics stat_malloc,
                     Statistics stat_4K,
                     Statistics stat_2M,
                     Statistics stat_shared)
        : bytesUsed(bytesUsed),
          bytesFree(bytesFree),
          bytesWasted(bytesWasted),
          stat_all(stat_all),
          stat_malloc(stat_malloc),
          stat_4K(stat_4K),
          stat_2M(stat_2M),
          stat_shared(stat_shared) {}

      friend AllStatistics operator+ (const AllStatistics& a, const AllStatistics& b)
      {
        return AllStatistics(a.bytesUsed+b.bytesUsed,
                             a.bytesFree+b.bytesFree,
                             a.bytesWasted+b.bytesWasted,
                             a.stat_all + b.stat_all,
                             a.stat_malloc + b.stat_malloc,
                             a.stat_4K + b.stat_4K,
                             a.stat_2M + b.stat_2M,
                             a.stat_shared + b.stat_shared);
      }

      void print(size_t numPrimitives)
      {
        std::stringstream str0;
        str0.setf(std::ios::fixed, std::ios::floatfield);
        str0 << " alloc : "
             << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
             << " "
             << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed)/double(numPrimitives);
        std::cout << str0.str() << std::endl;

        std::stringstream str1;
        str1.setf(std::ios::fixed, std::ios::floatfield);
        str1 << " alloc : "
             << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
             << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "
             << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "
             << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*(bytesUsed+bytesFree+bytesWasted) << " MB, "
             << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed+bytesFree+bytesWasted)/double(numPrimitives);
        std::cout << str1.str() << std::endl;

        std::cout << " total : " << stat_all.str(numPrimitives) << std::endl;
        std::cout << " 4K : " << stat_4K.str(numPrimitives) << std::endl;
        std::cout << " 2M : " << stat_2M.str(numPrimitives) << std::endl;
        std::cout << " malloc: " << stat_malloc.str(numPrimitives) << std::endl;
        std::cout << " shared: " << stat_shared.str(numPrimitives) << std::endl;
      }

    private:
      size_t bytesUsed;
      size_t bytesFree;
      size_t bytesWasted;
      Statistics stat_all;
      Statistics stat_malloc;
      Statistics stat_4K;
      Statistics stat_2M;
      Statistics stat_shared;
    };

    void print_blocks()
    {
      std::cout << " estimatedSize = " << estimatedSize << ", slotMask = " << slotMask << ", use_single_mode = " << use_single_mode << ", maxGrowSize = " << maxGrowSize << ", defaultBlockSize = " << defaultBlockSize << std::endl;

      std::cout << " used blocks = ";
      if (usedBlocks.load() != nullptr) usedBlocks.load()->print_list();
      std::cout << "[END]" << std::endl;

      std::cout << " free blocks = ";
      if (freeBlocks.load() != nullptr) freeBlocks.load()->print_list();
      std::cout << "[END]" << std::endl;
    }

  private:

    struct Block
    {
      static Block* create(MemoryMonitorInterface* device, size_t bytesAllocate, size_t bytesReserve, Block* next, AllocationType atype)
      {
        /* We avoid using os_malloc for small blocks as this could
         * cause a risk of fragmenting the virtual address space and
         * reach the limit of vm.max_map_count = 65k under Linux. */
        if (atype == EMBREE_OS_MALLOC && bytesAllocate < maxAllocationSize)
          atype = ALIGNED_MALLOC;

        /* we need to additionally allocate some header */
        const size_t sizeof_Header = offsetof(Block,data[0]);
        bytesAllocate = sizeof_Header+bytesAllocate;
        bytesReserve  = sizeof_Header+bytesReserve;

        /* consume full 4k pages when using os_malloc */
        if (atype == EMBREE_OS_MALLOC) {
          bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1));
          bytesReserve  = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1));
        }

        /* either use alignedMalloc or os_malloc */
        void *ptr = nullptr;
        if (atype == ALIGNED_MALLOC)
        {
          /* special handling for default block size */
          if (bytesAllocate == (2*PAGE_SIZE_2M))
          {
            const size_t alignment = maxAlignment;
            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
            ptr = alignedMalloc(bytesAllocate,alignment);

            /* give hint to transparently convert these pages to 2MB pages */
            const size_t ptr_aligned_begin = ((size_t)ptr) & ~size_t(PAGE_SIZE_2M-1);
            os_advise((void*)(ptr_aligned_begin +              0),PAGE_SIZE_2M); // may fail if no memory mapped before block
            os_advise((void*)(ptr_aligned_begin + 1*PAGE_SIZE_2M),PAGE_SIZE_2M);
            os_advise((void*)(ptr_aligned_begin + 2*PAGE_SIZE_2M),PAGE_SIZE_2M); // may fail if no memory mapped after block

            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
          }
          else
          {
            const size_t alignment = maxAlignment;
            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
            ptr = alignedMalloc(bytesAllocate,alignment);
            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
          }
        }
        else if (atype == EMBREE_OS_MALLOC)
        {
          if (device) device->memoryMonitor(bytesAllocate,false);
          bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages);
          return new (ptr) Block(EMBREE_OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages);
        }
        else
          assert(false);

        return NULL;
      }

      Block (AllocationType atype, size_t bytesAllocate, size_t bytesReserve, Block* next, size_t wasted, bool huge_pages = false)
        : cur(0), allocEnd(bytesAllocate), reserveEnd(bytesReserve), next(next), wasted(wasted), atype(atype), huge_pages(huge_pages)
      {
        assert((((size_t)&data[0]) & (maxAlignment-1)) == 0);
      }

      static Block* remove_shared_blocks(Block* head)
      {
        Block** prev_next = &head;
        for (Block* block = head; block; block = block->next) {
          if (block->atype == SHARED) *prev_next = block->next;
          else prev_next = &block->next;
        }
        return head;
      }

      void clear_list(MemoryMonitorInterface* device)
      {
        Block* block = this;
        while (block) {
          Block* next = block->next;
          block->clear_block(device);
          block = next;
        }
      }

      void clear_block (MemoryMonitorInterface* device)
      {
        const size_t sizeof_Header = offsetof(Block,data[0]);
        const ssize_t sizeof_Alloced = wasted+sizeof_Header+getBlockAllocatedBytes();

        if (atype == ALIGNED_MALLOC) {
          alignedFree(this);
          if (device) device->memoryMonitor(-sizeof_Alloced,true);
        }

        else if (atype == EMBREE_OS_MALLOC) {
          size_t sizeof_This = sizeof_Header+reserveEnd;
          os_free(this,sizeof_This,huge_pages);
          if (device) device->memoryMonitor(-sizeof_Alloced,true);
        }

        else /* if (atype == SHARED) */ {
        }
      }

      void* malloc(MemoryMonitorInterface* device, size_t& bytes_in, size_t align, bool partial)
      {
        size_t bytes = bytes_in;
        assert(align <= maxAlignment);
        bytes = (bytes+(align-1)) & ~(align-1);
        if (unlikely(cur+bytes > reserveEnd && !partial)) return nullptr;
        const size_t i = cur.fetch_add(bytes);
        if (unlikely(i+bytes > reserveEnd && !partial)) return nullptr;
        if (unlikely(i > reserveEnd)) return nullptr;
        bytes_in = bytes = min(bytes,reserveEnd-i);

        if (i+bytes > allocEnd) {
          if (device) device->memoryMonitor(i+bytes-max(i,allocEnd),true);
        }
        return &data[i];
      }
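
      /* Note on the partial flag (derived from the code above): when partial is true a
       * request that no longer fits completely is still served with whatever remains in
       * the block, and bytes_in is shrunk to the granted size; ThreadLocal::malloc uses
       * this to fill the tail of a block before requesting a full new one. The
       * memoryMonitor call reports only the bytes past allocEnd, i.e. usage that grows
       * into the reserved, not yet accounted part of the block. */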

      void* ptr() {
        return &data[cur];
      }

      void reset_block ()
      {
        allocEnd = max(allocEnd,(size_t)cur);
        cur = 0;
      }

      size_t getBlockUsedBytes() const {
        return min(size_t(cur),reserveEnd);
      }

      size_t getBlockFreeBytes() const {
        return getBlockAllocatedBytes() - getBlockUsedBytes();
      }

      size_t getBlockAllocatedBytes() const {
        return min(max(allocEnd,size_t(cur)),reserveEnd);
      }

      size_t getBlockWastedBytes() const {
        const size_t sizeof_Header = offsetof(Block,data[0]);
        return sizeof_Header + wasted;
      }

      size_t getBlockReservedBytes() const {
        return reserveEnd;
      }

      bool hasType(AllocationType atype_i, bool huge_pages_i) const
      {
        if      (atype_i == ANY_TYPE) return true;
        else if (atype == EMBREE_OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages;
        else return atype_i == atype;
      }

      size_t getUsedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockUsedBytes();
        }
        return bytes;
      }

      size_t getFreeBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockFreeBytes();
        }
        return bytes;
      }

      size_t getWastedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockWastedBytes();
        }
        return bytes;
      }

      size_t getAllocatedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockAllocatedBytes();
        }
        return bytes;
      }

      void print_list ()
      {
        for (const Block* block = this; block; block = block->next)
          block->print_block();
      }

      void print_block() const
      {
        if (atype == ALIGNED_MALLOC) std::cout << "A";
        else if (atype == EMBREE_OS_MALLOC) std::cout << "O";
        else if (atype == SHARED) std::cout << "S";
        if (huge_pages) std::cout << "H";
        size_t bytesUsed = getBlockUsedBytes();
        size_t bytesFree = getBlockFreeBytes();
        size_t bytesWasted = getBlockWastedBytes();
        std::cout << "[" << bytesUsed << ", " << bytesFree << ", " << bytesWasted << "] ";
      }

    public:
      std::atomic<size_t> cur;        //!< current location of the allocator
      std::atomic<size_t> allocEnd;   //!< end of the allocated memory region
      std::atomic<size_t> reserveEnd; //!< end of the reserved memory region
      Block* next;                    //!< pointer to next block in list
      size_t wasted;                  //!< amount of memory wasted through block alignment
      AllocationType atype;           //!< allocation mode of the block
      bool huge_pages;                //!< whether the block uses huge pages
      char align[maxAlignment-5*sizeof(size_t)-sizeof(AllocationType)-sizeof(bool)]; //!< align data to maxAlignment
      char data[1];                   //!< here starts memory to use for allocations
    };

  private:
    Device* device;
    SpinLock mutex;
    size_t slotMask;
    std::atomic<Block*> threadUsedBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
    std::atomic<Block*> usedBlocks;
    std::atomic<Block*> freeBlocks;

    std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
#if defined(APPLE) && defined(__aarch64__)
    std::mutex slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
#else
    PaddedSpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
#endif

    bool use_single_mode;
    size_t defaultBlockSize;
    size_t estimatedSize;
    size_t growSize;
    size_t maxGrowSize;
    std::atomic<size_t> log2_grow_size_scale; //!< log2 of scaling factor for grow size // FIXME: remove
    std::atomic<size_t> bytesUsed;
    std::atomic<size_t> bytesFree;
    std::atomic<size_t> bytesWasted;
    static __thread ThreadLocal2* thread_local_allocator2;
    static SpinLock s_thread_local_allocators_lock;
    static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators;
#if defined(APPLE) && defined(__aarch64__)
    std::mutex thread_local_allocators_lock;
#else
    SpinLock thread_local_allocators_lock;
#endif
    std::vector<ThreadLocal2*> thread_local_allocators;
    AllocationType atype;
    mvector<PrimRef> primrefarray; //!< primrefarray used to allocate nodes
  };
}