1 | // Copyright 2009-2021 Intel Corporation |
2 | // SPDX-License-Identifier: Apache-2.0 |
3 | |
4 | #pragma once |
5 | |
6 | #include "default.h" |
7 | #include "device.h" |
8 | #include "scene.h" |
9 | #include "primref.h" |
10 | |
11 | #if defined(APPLE) && defined(__aarch64__) |
12 | #include <mutex> |
13 | #endif |
14 | |
15 | namespace embree |
16 | { |
17 | class FastAllocator |
18 | { |
19 | /*! maximum supported alignment */ |
20 | static const size_t maxAlignment = 64; |
21 | |
    /*! maximum allocation size */
#define maxAllocationSize size_t(2*1024*1024-maxAlignment)

    /* default settings */
    //static const size_t defaultBlockSize = 4096;
27 | |
28 | static const size_t MAX_THREAD_USED_BLOCK_SLOTS = 8; |
29 | |
30 | public: |
31 | |
32 | struct ThreadLocal2; |
33 | enum AllocationType { ALIGNED_MALLOC, EMBREE_OS_MALLOC, SHARED, ANY_TYPE }; |
34 | |
35 | /*! Per thread structure holding the current memory block. */ |
36 | struct __aligned(64) ThreadLocal |
37 | { |
38 | ALIGNED_CLASS_(64); |
39 | public: |
40 | |
41 | /*! Constructor for usage with ThreadLocalData */ |
42 | __forceinline ThreadLocal (ThreadLocal2* parent) |
43 | : parent(parent), ptr(nullptr), cur(0), end(0), allocBlockSize(0), bytesUsed(0), bytesWasted(0) {} |
44 | |
45 | /*! initialize allocator */ |
46 | void init(FastAllocator* alloc) |
47 | { |
48 | ptr = nullptr; |
49 | cur = end = 0; |
50 | bytesUsed = 0; |
51 | bytesWasted = 0; |
52 | allocBlockSize = 0; |
53 | if (alloc) allocBlockSize = alloc->defaultBlockSize; |
54 | } |
55 | |
      /* Allocate aligned memory from the thread's memory block. */
57 | __forceinline void* malloc(FastAllocator* alloc, size_t bytes, size_t align = 16) |
58 | { |
        /* bind the thread local allocator to the proper FastAllocator */
60 | parent->bind(alloc); |
61 | |
62 | assert(align <= maxAlignment); |
63 | bytesUsed += bytes; |
64 | |
65 | /* try to allocate in local block */ |
66 | size_t ofs = (align - cur) & (align-1); |
67 | cur += bytes + ofs; |
68 | if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; } |
69 | cur -= bytes + ofs; |
70 | |
71 | /* if allocation is too large allocate with parent allocator */ |
72 | if (4*bytes > allocBlockSize) { |
73 | return alloc->malloc(bytes,maxAlignment,false); |
74 | } |
75 | |
76 | /* get new partial block if allocation failed */ |
77 | size_t blockSize = allocBlockSize; |
78 | ptr = (char*) alloc->malloc(blockSize,maxAlignment,true); |
79 | bytesWasted += end-cur; |
80 | cur = 0; end = blockSize; |
81 | |
82 | /* retry allocation */ |
83 | ofs = (align - cur) & (align-1); |
84 | cur += bytes + ofs; |
85 | if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; } |
86 | cur -= bytes + ofs; |
87 | |
88 | /* get new full block if allocation failed */ |
89 | blockSize = allocBlockSize; |
90 | ptr = (char*) alloc->malloc(blockSize,maxAlignment,false); |
91 | bytesWasted += end-cur; |
92 | cur = 0; end = blockSize; |
93 | |
94 | /* retry allocation */ |
95 | ofs = (align - cur) & (align-1); |
96 | cur += bytes + ofs; |
97 | if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; } |
98 | cur -= bytes + ofs; |
99 | |
100 | /* should never happen as large allocations get handled specially above */ |
101 | assert(false); |
102 | return nullptr; |
103 | } |
104 | |
105 | |
106 | /*! returns amount of used bytes */ |
107 | __forceinline size_t getUsedBytes() const { return bytesUsed; } |
108 | |
109 | /*! returns amount of free bytes */ |
110 | __forceinline size_t getFreeBytes() const { return end-cur; } |
111 | |
112 | /*! returns amount of wasted bytes */ |
113 | __forceinline size_t getWastedBytes() const { return bytesWasted; } |
114 | |
115 | private: |
116 | ThreadLocal2* parent; |
117 | char* ptr; //!< pointer to memory block |
118 | size_t cur; //!< current location of the allocator |
119 | size_t end; //!< end of the memory block |
120 | size_t allocBlockSize; //!< block size for allocations |
121 | size_t bytesUsed; //!< number of total bytes allocated |
122 | size_t bytesWasted; //!< number of bytes wasted |
123 | }; |
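    /* Worked example of the bump-pointer arithmetic in ThreadLocal::malloc
     * above (illustrative values, not taken from a real build): with cur = 70
     * and align = 16, ofs = (16 - 70) & 15 = 10, so the allocation starts at
     * offset 80 (the next 16-byte boundary) and the 10 skipped bytes are
     * accounted as bytesWasted.
     *
     *   size_t cur = 70, align = 16, bytes = 24;
     *   size_t ofs   = (align - cur) & (align - 1); // 10
     *   size_t start = cur + ofs;                   // 80, 16-byte aligned
     *   cur = start + bytes;                        // 104, new bump offset
     */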
124 | |
125 | /*! Two thread local structures. */ |
126 | struct __aligned(64) ThreadLocal2 |
127 | { |
128 | ALIGNED_CLASS_(64); |
129 | public: |
130 | |
131 | __forceinline ThreadLocal2() |
132 | : alloc(nullptr), alloc0(this), alloc1(this) {} |
133 | |
134 | /*! bind to fast allocator */ |
135 | __forceinline void bind(FastAllocator* alloc_i) |
136 | { |
137 | assert(alloc_i); |
138 | if (alloc.load() == alloc_i) return; |
139 | #if defined(APPLE) && defined(__aarch64__) |
140 | std::scoped_lock lock(mutex); |
141 | #else |
142 | Lock<SpinLock> lock(mutex); |
143 | #endif |
144 | //if (alloc.load() == alloc_i) return; // not required as only one thread calls bind |
145 | if (alloc.load()) { |
146 | alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes(); |
147 | alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes(); |
148 | alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes(); |
149 | } |
150 | alloc0.init(alloc_i); |
151 | alloc1.init(alloc_i); |
152 | alloc.store(alloc_i); |
153 | alloc_i->join(this); |
154 | } |
155 | |
      /*! unbind from fast allocator */
157 | void unbind(FastAllocator* alloc_i) |
158 | { |
159 | assert(alloc_i); |
160 | if (alloc.load() != alloc_i) return; |
161 | #if defined(APPLE) && defined(__aarch64__) |
162 | std::scoped_lock lock(mutex); |
163 | #else |
164 | Lock<SpinLock> lock(mutex); |
165 | #endif |
166 | if (alloc.load() != alloc_i) return; // required as a different thread calls unbind |
167 | alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes(); |
168 | alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes(); |
169 | alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes(); |
170 | alloc0.init(nullptr); |
171 | alloc1.init(nullptr); |
172 | alloc.store(nullptr); |
173 | } |
174 | |
175 | public: |
176 | #if defined(APPLE) && defined(__aarch64__) |
177 | std::mutex mutex; |
178 | #else |
179 | SpinLock mutex; //!< required as unbind is called from other threads |
180 | #endif |
181 | std::atomic<FastAllocator*> alloc; //!< parent allocator |
182 | ThreadLocal alloc0; |
183 | ThreadLocal alloc1; |
184 | }; |
185 | |
186 | FastAllocator (Device* device, bool osAllocation) |
187 | : device(device), slotMask(0), usedBlocks(nullptr), freeBlocks(nullptr), use_single_mode(false), defaultBlockSize(PAGE_SIZE), estimatedSize(0), |
188 | growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? EMBREE_OS_MALLOC : ALIGNED_MALLOC), |
189 | primrefarray(device,0) |
190 | { |
191 | for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) |
192 | { |
193 | threadUsedBlocks[i] = nullptr; |
194 | threadBlocks[i] = nullptr; |
195 | assert(!slotMutex[i].isLocked()); |
196 | } |
197 | } |
198 | |
199 | ~FastAllocator () { |
200 | clear(); |
201 | } |
202 | |
203 | /*! returns the device attached to this allocator */ |
204 | Device* getDevice() { |
205 | return device; |
206 | } |
207 | |
208 | void share(mvector<PrimRef>& primrefarray_i) { |
209 | primrefarray = std::move(primrefarray_i); |
210 | } |
211 | |
212 | void unshare(mvector<PrimRef>& primrefarray_o) |
213 | { |
214 | reset(); // this removes blocks that are allocated inside the shared primref array |
215 | primrefarray_o = std::move(primrefarray); |
216 | } |
217 | |
218 | /*! returns first fast thread local allocator */ |
219 | __forceinline ThreadLocal* _threadLocal() { |
220 | return &threadLocal2()->alloc0; |
221 | } |
222 | |
223 | void setOSallocation(bool flag) |
224 | { |
225 | atype = flag ? EMBREE_OS_MALLOC : ALIGNED_MALLOC; |
226 | } |
227 | |
228 | private: |
229 | |
230 | /*! returns both fast thread local allocators */ |
231 | __forceinline ThreadLocal2* threadLocal2() |
232 | { |
233 | ThreadLocal2* alloc = thread_local_allocator2; |
234 | if (alloc == nullptr) { |
235 | thread_local_allocator2 = alloc = new ThreadLocal2; |
236 | #if defined(APPLE) && defined(__aarch64__) |
237 | std::scoped_lock lock(s_thread_local_allocators_lock); |
238 | #else |
239 | Lock<SpinLock> lock(s_thread_local_allocators_lock); |
240 | #endif |
241 | s_thread_local_allocators.push_back(make_unique(alloc)); |
242 | } |
243 | return alloc; |
244 | } |
245 | |
246 | public: |
247 | |
248 | __forceinline void join(ThreadLocal2* alloc) |
249 | { |
250 | #if defined(APPLE) && defined(__aarch64__) |
      std::scoped_lock lock(thread_local_allocators_lock);
252 | #else |
253 | Lock<SpinLock> lock(thread_local_allocators_lock); |
254 | #endif |
255 | thread_local_allocators.push_back(alloc); |
256 | } |
257 | |
258 | public: |
259 | |
260 | struct CachedAllocator |
261 | { |
262 | __forceinline CachedAllocator(void* ptr) |
263 | : alloc(nullptr), talloc0(nullptr), talloc1(nullptr) |
264 | { |
265 | assert(ptr == nullptr); |
266 | } |
267 | |
268 | __forceinline CachedAllocator(FastAllocator* alloc, ThreadLocal2* talloc) |
269 | : alloc(alloc), talloc0(&talloc->alloc0), talloc1(alloc->use_single_mode ? &talloc->alloc0 : &talloc->alloc1) {} |
270 | |
271 | __forceinline operator bool () const { |
272 | return alloc != nullptr; |
273 | } |
274 | |
275 | __forceinline void* operator() (size_t bytes, size_t align = 16) const { |
276 | return talloc0->malloc(alloc,bytes,align); |
277 | } |
278 | |
279 | __forceinline void* malloc0 (size_t bytes, size_t align = 16) const { |
280 | return talloc0->malloc(alloc,bytes,align); |
281 | } |
282 | |
283 | __forceinline void* malloc1 (size_t bytes, size_t align = 16) const { |
284 | return talloc1->malloc(alloc,bytes,align); |
285 | } |
286 | |
287 | public: |
288 | FastAllocator* alloc; |
289 | ThreadLocal* talloc0; |
290 | ThreadLocal* talloc1; |
291 | }; |
292 | |
293 | __forceinline CachedAllocator getCachedAllocator() { |
294 | return CachedAllocator(this,threadLocal2()); |
295 | } |
296 | |
297 | /*! Builder interface to create thread local allocator */ |
298 | struct Create |
299 | { |
300 | public: |
301 | __forceinline Create (FastAllocator* allocator) : allocator(allocator) {} |
302 | __forceinline CachedAllocator operator() () const { return allocator->getCachedAllocator(); } |
303 | |
304 | private: |
305 | FastAllocator* allocator; |
306 | }; |
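    /* Illustrative usage sketch (hypothetical builder code, not defined in
     * this header): a build task typically obtains one CachedAllocator via
     * the Create functor or getCachedAllocator() and routes node and leaf
     * allocations through the two thread local allocators; NodeTy and LeafTy
     * are placeholder types.
     *
     *   FastAllocator::CachedAllocator alloc = fastAllocator->getCachedAllocator();
     *   void* node = alloc.malloc0(sizeof(NodeTy), 64); // e.g. inner nodes
     *   void* leaf = alloc.malloc1(sizeof(LeafTy), 16); // e.g. leaves
     */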
307 | |
308 | void internal_fix_used_blocks() |
309 | { |
310 | /* move thread local blocks to global block list */ |
311 | for (size_t i = 0; i < MAX_THREAD_USED_BLOCK_SLOTS; i++) |
312 | { |
313 | while (threadBlocks[i].load() != nullptr) { |
314 | Block* nextUsedBlock = threadBlocks[i].load()->next; |
315 | threadBlocks[i].load()->next = usedBlocks.load(); |
316 | usedBlocks = threadBlocks[i].load(); |
317 | threadBlocks[i] = nextUsedBlock; |
318 | } |
319 | threadBlocks[i] = nullptr; |
320 | } |
321 | } |
322 | |
323 | static const size_t threadLocalAllocOverhead = 20; //! 20 means 5% parallel allocation overhead through unfilled thread local blocks |
324 | static const size_t mainAllocOverheadStatic = 20; //! 20 means 5% allocation overhead through unfilled main alloc blocks |
    static const size_t mainAllocOverheadDynamic = 8;  //! 8 means 12.5% allocation overhead through unfilled main alloc blocks
326 | |
327 | /* calculates a single threaded threshold for the builders such |
328 | * that for small scenes the overhead of partly allocated blocks |
329 | * per thread is low */ |
330 | size_t fixSingleThreadThreshold(size_t branchingFactor, size_t defaultThreshold, size_t numPrimitives, size_t bytesEstimated) |
331 | { |
332 | if (numPrimitives == 0 || bytesEstimated == 0) |
333 | return defaultThreshold; |
334 | |
335 | /* calculate block size in bytes to fulfill threadLocalAllocOverhead constraint */ |
336 | const size_t single_mode_factor = use_single_mode ? 1 : 2; |
337 | const size_t threadCount = TaskScheduler::threadCount(); |
338 | const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSize; |
339 | |
      /* if we do not have to limit number of threads use optimal threshold */
341 | if ( (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount) |
342 | return defaultThreshold; |
343 | |
344 | /* otherwise limit number of threads by calculating proper single thread threshold */ |
345 | else { |
346 | double bytesPerPrimitive = double(bytesEstimated)/double(numPrimitives); |
347 | return size_t(ceil(branchingFactor*singleThreadBytes/bytesPerPrimitive)); |
348 | } |
349 | } |
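    /* Worked example (illustrative numbers): with use_single_mode = false,
     * defaultBlockSize = 4160 and threadLocalAllocOverhead = 20, each thread
     * may leave up to singleThreadBytes = 2*20*4160 = 166400 bytes in partly
     * filled blocks. For bytesEstimated = 1000000 only about
     * 1000000/166400 ~= 7 threads stay within the 5% overhead budget, so on a
     * 16-thread machine the single thread threshold is raised to roughly
     * ceil(branchingFactor*166400/bytesPerPrimitive) primitives instead of
     * returning defaultThreshold. */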
350 | |
351 | __forceinline size_t alignSize(size_t i) { |
352 | return (i+127)/128*128; |
353 | } |
354 | |
355 | /*! initializes the grow size */ |
356 | __forceinline void initGrowSizeAndNumSlots(size_t bytesEstimated, bool fast) |
357 | { |
358 | /* we do not need single thread local allocator mode */ |
359 | use_single_mode = false; |
360 | |
361 | /* calculate growSize such that at most mainAllocationOverhead gets wasted when a block stays unused */ |
362 | size_t mainAllocOverhead = fast ? mainAllocOverheadDynamic : mainAllocOverheadStatic; |
363 | size_t blockSize = alignSize(bytesEstimated/mainAllocOverhead); |
364 | growSize = maxGrowSize = clamp(blockSize,size_t(1024),maxAllocationSize); |
365 | |
366 | /* if we reached the maxAllocationSize for growSize, we can |
367 | * increase the number of allocation slots by still guaranteeing |
368 | * the mainAllocationOverhead */ |
369 | slotMask = 0x0; |
370 | |
371 | if (MAX_THREAD_USED_BLOCK_SLOTS >= 2 && bytesEstimated > 2*mainAllocOverhead*growSize) slotMask = 0x1; |
372 | if (MAX_THREAD_USED_BLOCK_SLOTS >= 4 && bytesEstimated > 4*mainAllocOverhead*growSize) slotMask = 0x3; |
373 | if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 8*mainAllocOverhead*growSize) slotMask = 0x7; |
374 | if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 16*mainAllocOverhead*growSize) { growSize *= 2; } /* if the overhead is tiny, double the growSize */ |
375 | |
376 | /* set the thread local alloc block size */ |
377 | size_t defaultBlockSizeSwitch = PAGE_SIZE+maxAlignment; |
378 | |
379 | /* for sufficiently large scene we can increase the defaultBlockSize over the defaultBlockSizeSwitch size */ |
#if 0 // we do not do this as a block size of 4160 is for some reason best for KNL
381 | const size_t threadCount = TaskScheduler::threadCount(); |
382 | const size_t single_mode_factor = use_single_mode ? 1 : 2; |
383 | const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSizeSwitch; |
      if ((bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
385 | defaultBlockSize = min(max(defaultBlockSizeSwitch,bytesEstimated/(single_mode_factor*threadLocalAllocOverhead*threadCount)),growSize); |
386 | |
387 | /* otherwise we grow the defaultBlockSize up to defaultBlockSizeSwitch */ |
388 | else |
389 | #endif |
390 | defaultBlockSize = clamp(blockSize,size_t(1024),defaultBlockSizeSwitch); |
391 | |
392 | if (bytesEstimated == 0) { |
393 | maxGrowSize = maxAllocationSize; // special mode if builder cannot estimate tree size |
394 | defaultBlockSize = defaultBlockSizeSwitch; |
395 | } |
396 | log2_grow_size_scale = 0; |
397 | |
398 | if (device->alloc_main_block_size != 0) growSize = device->alloc_main_block_size; |
399 | if (device->alloc_num_main_slots >= 1 ) slotMask = 0x0; |
400 | if (device->alloc_num_main_slots >= 2 ) slotMask = 0x1; |
401 | if (device->alloc_num_main_slots >= 4 ) slotMask = 0x3; |
402 | if (device->alloc_num_main_slots >= 8 ) slotMask = 0x7; |
403 | if (device->alloc_thread_block_size != 0) defaultBlockSize = device->alloc_thread_block_size; |
404 | if (device->alloc_single_thread_alloc != -1) use_single_mode = device->alloc_single_thread_alloc; |
405 | } |
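    /* Worked example (illustrative, assuming PAGE_SIZE = 4096): a static
     * build with bytesEstimated = 100 MB uses mainAllocOverheadStatic = 20,
     * so blockSize = alignSize(100 MB/20) ~= 5 MB, which is clamped to
     * growSize = maxGrowSize = maxAllocationSize (~2 MB). Because
     * 100 MB > 2*20*growSize (~80 MB) but not > 4*20*growSize (~160 MB),
     * slotMask becomes 0x1 (two main allocation slots), and the thread local
     * defaultBlockSize is clamped to defaultBlockSizeSwitch = 4160 bytes. */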
406 | |
407 | /*! initializes the allocator */ |
408 | void init(size_t bytesAllocate, size_t bytesReserve, size_t bytesEstimate) |
409 | { |
410 | internal_fix_used_blocks(); |
411 | /* distribute the allocation to multiple thread block slots */ |
412 | slotMask = MAX_THREAD_USED_BLOCK_SLOTS-1; // FIXME: remove |
413 | if (usedBlocks.load() || freeBlocks.load()) { reset(); return; } |
414 | if (bytesReserve == 0) bytesReserve = bytesAllocate; |
415 | freeBlocks = Block::create(device,bytesAllocate,bytesReserve,nullptr,atype); |
416 | estimatedSize = bytesEstimate; |
417 | initGrowSizeAndNumSlots(bytesEstimate,true); |
418 | } |
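    /* Illustrative call sequence (hypothetical builder setup, not prescribed
     * by this header): a builder that can estimate its memory consumption
     * initializes the allocator once before the parallel build and releases
     * temporary state afterwards.
     *
     *   FastAllocator alloc(device, false);                     // aligned-malloc mode
     *   alloc.init(bytesAllocate, bytesReserve, bytesEstimate);
     *   // ... parallel build allocating via getCachedAllocator() ...
     *   alloc.cleanup();                                        // unbind thread locals
     */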
419 | |
420 | /*! initializes the allocator */ |
421 | void init_estimate(size_t bytesEstimate) |
422 | { |
423 | internal_fix_used_blocks(); |
424 | if (usedBlocks.load() || freeBlocks.load()) { reset(); return; } |
425 | /* single allocator mode ? */ |
426 | estimatedSize = bytesEstimate; |
      initGrowSizeAndNumSlots(bytesEstimate,false);
430 | } |
431 | |
432 | /*! frees state not required after build */ |
433 | __forceinline void cleanup() |
434 | { |
435 | internal_fix_used_blocks(); |
436 | |
437 | /* unbind all thread local allocators */ |
438 | for (auto alloc : thread_local_allocators) alloc->unbind(this); |
439 | thread_local_allocators.clear(); |
440 | } |
441 | |
442 | /*! resets the allocator, memory blocks get reused */ |
443 | void reset () |
444 | { |
445 | internal_fix_used_blocks(); |
446 | |
447 | bytesUsed.store(0); |
448 | bytesFree.store(0); |
449 | bytesWasted.store(0); |
450 | |
      /* reset all used blocks and move them to the beginning of the free block list */
452 | while (usedBlocks.load() != nullptr) { |
453 | usedBlocks.load()->reset_block(); |
454 | Block* nextUsedBlock = usedBlocks.load()->next; |
455 | usedBlocks.load()->next = freeBlocks.load(); |
456 | freeBlocks = usedBlocks.load(); |
457 | usedBlocks = nextUsedBlock; |
458 | } |
459 | |
460 | /* remove all shared blocks as they are re-added during build */ |
461 | freeBlocks.store(Block::remove_shared_blocks(freeBlocks.load())); |
462 | |
463 | for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) |
464 | { |
465 | threadUsedBlocks[i] = nullptr; |
466 | threadBlocks[i] = nullptr; |
467 | } |
468 | |
469 | /* unbind all thread local allocators */ |
470 | for (auto alloc : thread_local_allocators) alloc->unbind(this); |
471 | thread_local_allocators.clear(); |
472 | } |
473 | |
474 | /*! frees all allocated memory */ |
475 | __forceinline void clear() |
476 | { |
477 | cleanup(); |
478 | bytesUsed.store(0); |
479 | bytesFree.store(0); |
480 | bytesWasted.store(0); |
481 | if (usedBlocks.load() != nullptr) usedBlocks.load()->clear_list(device); usedBlocks = nullptr; |
482 | if (freeBlocks.load() != nullptr) freeBlocks.load()->clear_list(device); freeBlocks = nullptr; |
483 | for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) { |
484 | threadUsedBlocks[i] = nullptr; |
485 | threadBlocks[i] = nullptr; |
486 | } |
487 | primrefarray.clear(); |
488 | } |
489 | |
490 | __forceinline size_t incGrowSizeScale() |
491 | { |
492 | size_t scale = log2_grow_size_scale.fetch_add(1)+1; |
493 | return size_t(1) << min(size_t(16),scale); |
494 | } |
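    /* Example of the resulting growth (illustrative): once the free list is
     * exhausted, malloc() below creates global blocks of size
     * min(growSize*incGrowSizeScale(), maxGrowSize), so the requested size
     * scales by 2, 4, 8, ... (the scale is capped at 2^16) until maxGrowSize
     * bounds it. */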
495 | |
496 | /*! thread safe allocation of memory */ |
497 | void* malloc(size_t& bytes, size_t align, bool partial) |
498 | { |
499 | assert(align <= maxAlignment); |
500 | |
501 | while (true) |
502 | { |
503 | /* allocate using current block */ |
504 | size_t threadID = TaskScheduler::threadID(); |
505 | size_t slot = threadID & slotMask; |
506 | Block* myUsedBlocks = threadUsedBlocks[slot]; |
507 | if (myUsedBlocks) { |
508 | void* ptr = myUsedBlocks->malloc(device,bytes,align,partial); |
509 | if (ptr) return ptr; |
510 | } |
511 | |
512 | /* throw error if allocation is too large */ |
513 | if (bytes > maxAllocationSize) |
514 | throw_RTCError(RTC_ERROR_UNKNOWN,"allocation is too large" ); |
515 | |
516 | /* parallel block creation in case of no freeBlocks, avoids single global mutex */ |
517 | if (likely(freeBlocks.load() == nullptr)) |
518 | { |
519 | #if defined(APPLE) && defined(__aarch64__) |
520 | std::scoped_lock lock(slotMutex[slot]); |
521 | #else |
522 | Lock<SpinLock> lock(slotMutex[slot]); |
523 | #endif |
524 | if (myUsedBlocks == threadUsedBlocks[slot]) { |
525 | const size_t alignedBytes = (bytes+(align-1)) & ~(align-1); |
526 | const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes); |
527 | assert(allocSize >= bytes); |
528 | threadBlocks[slot] = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,threadBlocks[slot],atype); // FIXME: a large allocation might throw away a block here! |
529 | // FIXME: a direct allocation should allocate inside the block here, and not in the next loop! a different thread could do some allocation and make the large allocation fail. |
530 | } |
531 | continue; |
532 | } |
533 | |
534 | /* if this fails allocate new block */ |
535 | { |
536 | #if defined(APPLE) && defined(__aarch64__) |
537 | std::scoped_lock lock(mutex); |
538 | #else |
539 | Lock<SpinLock> lock(mutex); |
540 | #endif |
541 | if (myUsedBlocks == threadUsedBlocks[slot]) |
542 | { |
543 | if (freeBlocks.load() != nullptr) { |
544 | Block* nextFreeBlock = freeBlocks.load()->next; |
545 | freeBlocks.load()->next = usedBlocks; |
546 | __memory_barrier(); |
547 | usedBlocks = freeBlocks.load(); |
548 | threadUsedBlocks[slot] = freeBlocks.load(); |
549 | freeBlocks = nextFreeBlock; |
550 | } else { |
551 | const size_t allocSize = min(growSize*incGrowSizeScale(),maxGrowSize); |
552 | usedBlocks = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,usedBlocks,atype); // FIXME: a large allocation should get delivered directly, like above! |
553 | } |
554 | } |
555 | } |
556 | } |
557 | } |
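    /* Example of the slot distribution used above (illustrative): with
     * slotMask = 0x3, thread IDs 0..7 map to slots 0,1,2,3,0,1,2,3, so up to
     * four threads can create and fill thread blocks concurrently under
     * separate slot locks; only when free blocks exist does allocation fall
     * back to the single global mutex. */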
558 | |
559 | /*! add new block */ |
560 | void addBlock(void* ptr, ssize_t bytes) |
561 | { |
562 | #if defined(APPLE) && defined(__aarch64__) |
563 | std::scoped_lock lock(mutex); |
564 | #else |
565 | Lock<SpinLock> lock(mutex); |
566 | #endif |
      const size_t sizeof_Header = offsetof(Block,data[0]);
568 | void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1)); |
569 | size_t ofs = (size_t) aptr - (size_t) ptr; |
570 | bytes -= ofs; |
571 | if (bytes < 4096) return; // ignore empty or very small blocks |
572 | freeBlocks = new (aptr) Block(SHARED,bytes-sizeof_Header,bytes-sizeof_Header,freeBlocks,ofs); |
573 | } |
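    /* Illustrative use of addBlock (hypothetical buffer names): memory owned
     * elsewhere can be donated to the allocator as a SHARED block; reset()
     * later strips such blocks again via Block::remove_shared_blocks().
     *
     *   alloc.addBlock(externalBuffer, externalBufferBytes);
     */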
574 | |
    /* special allocation, used only by the morton builder and only a single time for each build */
576 | void* specialAlloc(size_t bytes) |
577 | { |
578 | assert(freeBlocks.load() != nullptr && freeBlocks.load()->getBlockAllocatedBytes() >= bytes); |
579 | return freeBlocks.load()->ptr(); |
580 | } |
581 | |
582 | struct Statistics |
583 | { |
584 | Statistics () |
585 | : bytesUsed(0), bytesFree(0), bytesWasted(0) {} |
586 | |
587 | Statistics (size_t bytesUsed, size_t bytesFree, size_t bytesWasted) |
588 | : bytesUsed(bytesUsed), bytesFree(bytesFree), bytesWasted(bytesWasted) {} |
589 | |
590 | Statistics (FastAllocator* alloc, AllocationType atype, bool huge_pages = false) |
591 | : bytesUsed(0), bytesFree(0), bytesWasted(0) |
592 | { |
593 | Block* usedBlocks = alloc->usedBlocks.load(); |
594 | Block* freeBlocks = alloc->freeBlocks.load(); |
595 | if (usedBlocks) bytesUsed += usedBlocks->getUsedBytes(atype,huge_pages); |
596 | if (freeBlocks) bytesFree += freeBlocks->getAllocatedBytes(atype,huge_pages); |
597 | if (usedBlocks) bytesFree += usedBlocks->getFreeBytes(atype,huge_pages); |
598 | if (freeBlocks) bytesWasted += freeBlocks->getWastedBytes(atype,huge_pages); |
599 | if (usedBlocks) bytesWasted += usedBlocks->getWastedBytes(atype,huge_pages); |
600 | } |
601 | |
602 | std::string str(size_t numPrimitives) |
603 | { |
604 | std::stringstream str; |
605 | str.setf(std::ios::fixed, std::ios::floatfield); |
606 | str << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, " |
607 | << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, " |
608 | << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, " |
609 | << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesAllocatedTotal() << " MB, " |
610 | << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesAllocatedTotal())/double(numPrimitives); |
611 | return str.str(); |
612 | } |
613 | |
614 | friend Statistics operator+ ( const Statistics& a, const Statistics& b) |
615 | { |
616 | return Statistics(a.bytesUsed+b.bytesUsed, |
617 | a.bytesFree+b.bytesFree, |
618 | a.bytesWasted+b.bytesWasted); |
619 | } |
620 | |
621 | size_t bytesAllocatedTotal() const { |
622 | return bytesUsed + bytesFree + bytesWasted; |
623 | } |
624 | |
625 | public: |
626 | size_t bytesUsed; |
627 | size_t bytesFree; |
628 | size_t bytesWasted; |
629 | }; |
630 | |
631 | Statistics getStatistics(AllocationType atype, bool huge_pages = false) { |
632 | return Statistics(this,atype,huge_pages); |
633 | } |
634 | |
635 | size_t getUsedBytes() { |
636 | return bytesUsed; |
637 | } |
638 | |
639 | size_t getWastedBytes() { |
640 | return bytesWasted; |
641 | } |
642 | |
643 | struct AllStatistics |
644 | { |
645 | AllStatistics (FastAllocator* alloc) |
646 | |
647 | : bytesUsed(alloc->bytesUsed), |
648 | bytesFree(alloc->bytesFree), |
649 | bytesWasted(alloc->bytesWasted), |
650 | stat_all(alloc,ANY_TYPE), |
651 | stat_malloc(alloc,ALIGNED_MALLOC), |
652 | stat_4K(alloc,EMBREE_OS_MALLOC,false), |
653 | stat_2M(alloc,EMBREE_OS_MALLOC,true), |
654 | stat_shared(alloc,SHARED) {} |
655 | |
656 | AllStatistics (size_t bytesUsed, |
657 | size_t bytesFree, |
658 | size_t bytesWasted, |
659 | Statistics stat_all, |
660 | Statistics stat_malloc, |
661 | Statistics stat_4K, |
662 | Statistics stat_2M, |
663 | Statistics stat_shared) |
664 | |
665 | : bytesUsed(bytesUsed), |
666 | bytesFree(bytesFree), |
667 | bytesWasted(bytesWasted), |
668 | stat_all(stat_all), |
669 | stat_malloc(stat_malloc), |
670 | stat_4K(stat_4K), |
671 | stat_2M(stat_2M), |
672 | stat_shared(stat_shared) {} |
673 | |
674 | friend AllStatistics operator+ (const AllStatistics& a, const AllStatistics& b) |
675 | { |
676 | return AllStatistics(a.bytesUsed+b.bytesUsed, |
677 | a.bytesFree+b.bytesFree, |
678 | a.bytesWasted+b.bytesWasted, |
679 | a.stat_all + b.stat_all, |
680 | a.stat_malloc + b.stat_malloc, |
681 | a.stat_4K + b.stat_4K, |
682 | a.stat_2M + b.stat_2M, |
683 | a.stat_shared + b.stat_shared); |
684 | } |
685 | |
686 | void print(size_t numPrimitives) |
687 | { |
688 | std::stringstream str0; |
689 | str0.setf(std::ios::fixed, std::ios::floatfield); |
690 | str0 << " alloc : " |
691 | << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, " |
692 | << " " |
693 | << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed)/double(numPrimitives); |
694 | std::cout << str0.str() << std::endl; |
695 | |
696 | std::stringstream str1; |
697 | str1.setf(std::ios::fixed, std::ios::floatfield); |
698 | str1 << " alloc : " |
699 | << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, " |
700 | << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, " |
701 | << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, " |
702 | << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*(bytesUsed+bytesFree+bytesWasted) << " MB, " |
703 | << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed+bytesFree+bytesWasted)/double(numPrimitives); |
704 | std::cout << str1.str() << std::endl; |
705 | |
706 | std::cout << " total : " << stat_all.str(numPrimitives) << std::endl; |
707 | std::cout << " 4K : " << stat_4K.str(numPrimitives) << std::endl; |
708 | std::cout << " 2M : " << stat_2M.str(numPrimitives) << std::endl; |
709 | std::cout << " malloc: " << stat_malloc.str(numPrimitives) << std::endl; |
710 | std::cout << " shared: " << stat_shared.str(numPrimitives) << std::endl; |
711 | } |
712 | |
713 | private: |
714 | size_t bytesUsed; |
715 | size_t bytesFree; |
716 | size_t bytesWasted; |
717 | Statistics stat_all; |
718 | Statistics stat_malloc; |
719 | Statistics stat_4K; |
720 | Statistics stat_2M; |
721 | Statistics stat_shared; |
722 | }; |
723 | |
724 | void print_blocks() |
725 | { |
726 | std::cout << " estimatedSize = " << estimatedSize << ", slotMask = " << slotMask << ", use_single_mode = " << use_single_mode << ", maxGrowSize = " << maxGrowSize << ", defaultBlockSize = " << defaultBlockSize << std::endl; |
727 | |
728 | std::cout << " used blocks = " ; |
729 | if (usedBlocks.load() != nullptr) usedBlocks.load()->print_list(); |
730 | std::cout << "[END]" << std::endl; |
731 | |
732 | std::cout << " free blocks = " ; |
733 | if (freeBlocks.load() != nullptr) freeBlocks.load()->print_list(); |
734 | std::cout << "[END]" << std::endl; |
735 | } |
736 | |
737 | private: |
738 | |
739 | struct Block |
740 | { |
741 | static Block* create(MemoryMonitorInterface* device, size_t bytesAllocate, size_t bytesReserve, Block* next, AllocationType atype) |
742 | { |
743 | /* We avoid using os_malloc for small blocks as this could |
744 | * cause a risk of fragmenting the virtual address space and |
745 | * reach the limit of vm.max_map_count = 65k under Linux. */ |
746 | if (atype == EMBREE_OS_MALLOC && bytesAllocate < maxAllocationSize) |
747 | atype = ALIGNED_MALLOC; |
748 | |
749 | /* we need to additionally allocate some header */ |
        const size_t sizeof_Header = offsetof(Block,data[0]);
751 | bytesAllocate = sizeof_Header+bytesAllocate; |
752 | bytesReserve = sizeof_Header+bytesReserve; |
753 | |
        /* consume full 4k pages when using os_malloc */
755 | if (atype == EMBREE_OS_MALLOC) { |
756 | bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1)); |
757 | bytesReserve = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1)); |
758 | } |
759 | |
760 | /* either use alignedMalloc or os_malloc */ |
761 | void *ptr = nullptr; |
762 | if (atype == ALIGNED_MALLOC) |
763 | { |
764 | /* special handling for default block size */ |
765 | if (bytesAllocate == (2*PAGE_SIZE_2M)) |
766 | { |
767 | const size_t alignment = maxAlignment; |
768 | if (device) device->memoryMonitor(bytesAllocate+alignment,false); |
769 | ptr = alignedMalloc(bytesAllocate,alignment); |
770 | |
771 | /* give hint to transparently convert these pages to 2MB pages */ |
772 | const size_t ptr_aligned_begin = ((size_t)ptr) & ~size_t(PAGE_SIZE_2M-1); |
773 | os_advise((void*)(ptr_aligned_begin + 0),PAGE_SIZE_2M); // may fail if no memory mapped before block |
774 | os_advise((void*)(ptr_aligned_begin + 1*PAGE_SIZE_2M),PAGE_SIZE_2M); |
775 | os_advise((void*)(ptr_aligned_begin + 2*PAGE_SIZE_2M),PAGE_SIZE_2M); // may fail if no memory mapped after block |
776 | |
777 | return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment); |
778 | } |
779 | else |
780 | { |
781 | const size_t alignment = maxAlignment; |
782 | if (device) device->memoryMonitor(bytesAllocate+alignment,false); |
783 | ptr = alignedMalloc(bytesAllocate,alignment); |
784 | return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment); |
785 | } |
786 | } |
787 | else if (atype == EMBREE_OS_MALLOC) |
788 | { |
789 | if (device) device->memoryMonitor(bytesAllocate,false); |
790 | bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages); |
791 | return new (ptr) Block(EMBREE_OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages); |
792 | } |
793 | else |
794 | assert(false); |
795 | |
796 | return NULL; |
797 | } |
798 | |
799 | Block (AllocationType atype, size_t bytesAllocate, size_t bytesReserve, Block* next, size_t wasted, bool huge_pages = false) |
800 | : cur(0), allocEnd(bytesAllocate), reserveEnd(bytesReserve), next(next), wasted(wasted), atype(atype), huge_pages(huge_pages) |
801 | { |
802 | assert((((size_t)&data[0]) & (maxAlignment-1)) == 0); |
803 | } |
804 | |
805 | static Block* remove_shared_blocks(Block* head) |
806 | { |
807 | Block** prev_next = &head; |
808 | for (Block* block = head; block; block = block->next) { |
809 | if (block->atype == SHARED) *prev_next = block->next; |
810 | else prev_next = &block->next; |
811 | } |
812 | return head; |
813 | } |
814 | |
815 | void clear_list(MemoryMonitorInterface* device) |
816 | { |
817 | Block* block = this; |
818 | while (block) { |
819 | Block* next = block->next; |
820 | block->clear_block(device); |
821 | block = next; |
822 | } |
823 | } |
824 | |
825 | void clear_block (MemoryMonitorInterface* device) |
826 | { |
        const size_t sizeof_Header = offsetof(Block,data[0]);
828 | const ssize_t sizeof_Alloced = wasted+sizeof_Header+getBlockAllocatedBytes(); |
829 | |
830 | if (atype == ALIGNED_MALLOC) { |
831 | alignedFree(this); |
832 | if (device) device->memoryMonitor(-sizeof_Alloced,true); |
833 | } |
834 | |
835 | else if (atype == EMBREE_OS_MALLOC) { |
836 | size_t sizeof_This = sizeof_Header+reserveEnd; |
837 | os_free(this,sizeof_This,huge_pages); |
838 | if (device) device->memoryMonitor(-sizeof_Alloced,true); |
839 | } |
840 | |
841 | else /* if (atype == SHARED) */ { |
842 | } |
843 | } |
844 | |
845 | void* malloc(MemoryMonitorInterface* device, size_t& bytes_in, size_t align, bool partial) |
846 | { |
847 | size_t bytes = bytes_in; |
848 | assert(align <= maxAlignment); |
849 | bytes = (bytes+(align-1)) & ~(align-1); |
850 | if (unlikely(cur+bytes > reserveEnd && !partial)) return nullptr; |
851 | const size_t i = cur.fetch_add(bytes); |
852 | if (unlikely(i+bytes > reserveEnd && !partial)) return nullptr; |
853 | if (unlikely(i > reserveEnd)) return nullptr; |
854 | bytes_in = bytes = min(bytes,reserveEnd-i); |
855 | |
856 | if (i+bytes > allocEnd) { |
857 | if (device) device->memoryMonitor(i+bytes-max(i,allocEnd),true); |
858 | } |
859 | return &data[i]; |
860 | } |
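      /* Worked example of the lock-free bump allocation above (illustrative):
       * two threads requesting 64 bytes each with partial = true perform
       * cur.fetch_add(64) concurrently and receive disjoint offsets i and
       * i+64 without taking a lock. If a partial request crosses reserveEnd,
       * bytes_in is truncated to the remaining reserveEnd-i bytes, so the
       * caller gets a smaller piece instead of a failure. */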
861 | |
862 | void* ptr() { |
863 | return &data[cur]; |
864 | } |
865 | |
866 | void reset_block () |
867 | { |
868 | allocEnd = max(allocEnd,(size_t)cur); |
869 | cur = 0; |
870 | } |
871 | |
872 | size_t getBlockUsedBytes() const { |
873 | return min(size_t(cur),reserveEnd); |
874 | } |
875 | |
876 | size_t getBlockFreeBytes() const { |
877 | return getBlockAllocatedBytes() - getBlockUsedBytes(); |
878 | } |
879 | |
880 | size_t getBlockAllocatedBytes() const { |
881 | return min(max(allocEnd,size_t(cur)),reserveEnd); |
882 | } |
883 | |
884 | size_t getBlockWastedBytes() const { |
        const size_t sizeof_Header = offsetof(Block,data[0]);
886 | return sizeof_Header + wasted; |
887 | } |
888 | |
889 | size_t getBlockReservedBytes() const { |
890 | return reserveEnd; |
891 | } |
892 | |
893 | bool hasType(AllocationType atype_i, bool huge_pages_i) const |
894 | { |
895 | if (atype_i == ANY_TYPE ) return true; |
896 | else if (atype == EMBREE_OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages; |
897 | else return atype_i == atype; |
898 | } |
899 | |
900 | size_t getUsedBytes(AllocationType atype, bool huge_pages = false) const { |
901 | size_t bytes = 0; |
902 | for (const Block* block = this; block; block = block->next) { |
903 | if (!block->hasType(atype,huge_pages)) continue; |
904 | bytes += block->getBlockUsedBytes(); |
905 | } |
906 | return bytes; |
907 | } |
908 | |
909 | size_t getFreeBytes(AllocationType atype, bool huge_pages = false) const { |
910 | size_t bytes = 0; |
911 | for (const Block* block = this; block; block = block->next) { |
912 | if (!block->hasType(atype,huge_pages)) continue; |
913 | bytes += block->getBlockFreeBytes(); |
914 | } |
915 | return bytes; |
916 | } |
917 | |
918 | size_t getWastedBytes(AllocationType atype, bool huge_pages = false) const { |
919 | size_t bytes = 0; |
920 | for (const Block* block = this; block; block = block->next) { |
921 | if (!block->hasType(atype,huge_pages)) continue; |
922 | bytes += block->getBlockWastedBytes(); |
923 | } |
924 | return bytes; |
925 | } |
926 | |
927 | size_t getAllocatedBytes(AllocationType atype, bool huge_pages = false) const { |
928 | size_t bytes = 0; |
929 | for (const Block* block = this; block; block = block->next) { |
930 | if (!block->hasType(atype,huge_pages)) continue; |
931 | bytes += block->getBlockAllocatedBytes(); |
932 | } |
933 | return bytes; |
934 | } |
935 | |
936 | void print_list () |
937 | { |
938 | for (const Block* block = this; block; block = block->next) |
939 | block->print_block(); |
940 | } |
941 | |
942 | void print_block() const |
943 | { |
944 | if (atype == ALIGNED_MALLOC) std::cout << "A" ; |
945 | else if (atype == EMBREE_OS_MALLOC) std::cout << "O" ; |
946 | else if (atype == SHARED) std::cout << "S" ; |
947 | if (huge_pages) std::cout << "H" ; |
948 | size_t bytesUsed = getBlockUsedBytes(); |
949 | size_t bytesFree = getBlockFreeBytes(); |
950 | size_t bytesWasted = getBlockWastedBytes(); |
951 | std::cout << "[" << bytesUsed << ", " << bytesFree << ", " << bytesWasted << "] " ; |
952 | } |
953 | |
954 | public: |
955 | std::atomic<size_t> cur; //!< current location of the allocator |
956 | std::atomic<size_t> allocEnd; //!< end of the allocated memory region |
957 | std::atomic<size_t> reserveEnd; //!< end of the reserved memory region |
958 | Block* next; //!< pointer to next block in list |
959 | size_t wasted; //!< amount of memory wasted through block alignment |
960 | AllocationType atype; //!< allocation mode of the block |
961 | bool huge_pages; //!< whether the block uses huge pages |
962 | char align[maxAlignment-5*sizeof(size_t)-sizeof(AllocationType)-sizeof(bool)]; //!< align data to maxAlignment |
963 | char data[1]; //!< here starts memory to use for allocations |
964 | }; |
965 | |
966 | private: |
967 | Device* device; |
968 | SpinLock mutex; |
969 | size_t slotMask; |
970 | std::atomic<Block*> threadUsedBlocks[MAX_THREAD_USED_BLOCK_SLOTS]; |
971 | std::atomic<Block*> usedBlocks; |
972 | std::atomic<Block*> freeBlocks; |
973 | |
974 | std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS]; |
975 | #if defined(APPLE) && defined(__aarch64__) |
976 | std::mutex slotMutex[MAX_THREAD_USED_BLOCK_SLOTS]; |
977 | #else |
978 | PaddedSpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS]; |
979 | #endif |
980 | |
981 | bool use_single_mode; |
982 | size_t defaultBlockSize; |
983 | size_t estimatedSize; |
984 | size_t growSize; |
985 | size_t maxGrowSize; |
986 | std::atomic<size_t> log2_grow_size_scale; //!< log2 of scaling factor for grow size // FIXME: remove |
987 | std::atomic<size_t> bytesUsed; |
988 | std::atomic<size_t> bytesFree; |
989 | std::atomic<size_t> bytesWasted; |
990 | static __thread ThreadLocal2* thread_local_allocator2; |
991 | static SpinLock s_thread_local_allocators_lock; |
992 | static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators; |
993 | #if defined(APPLE) && defined(__aarch64__) |
994 | std::mutex thread_local_allocators_lock; |
995 | #else |
996 | SpinLock thread_local_allocators_lock; |
997 | #endif |
998 | std::vector<ThreadLocal2*> thread_local_allocators; |
999 | AllocationType atype; |
1000 | mvector<PrimRef> primrefarray; //!< primrefarray used to allocate nodes |
1001 | }; |
1002 | } |
1003 | |