// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "../common/default.h"

/* force a complete cache invalidation when running out of allocation space */
#define FORCE_SIMPLE_FLUSH 0

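/* threshold for the per-thread lock counter: lockThreadLoop() treats a previous
   counter value >= THREAD_BLOCK_ATOMIC_ADD as a failed lock and waits until the
   synchronization phase is over */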
#define THREAD_BLOCK_ATOMIC_ADD 4

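/* note: CACHE_STATS(x) discards its argument in both branches, so the counters
   in SharedTessellationCacheStats are only updated if this macro is changed to
   expand to x */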
#if defined(DEBUG)
#define CACHE_STATS(x)
#else
#define CACHE_STATS(x)
#endif

namespace embree
{
  class SharedTessellationCacheStats
  {
  public:
    /* stats */
    static std::atomic<size_t> cache_accesses;
    static std::atomic<size_t> cache_hits;
    static std::atomic<size_t> cache_misses;
    static std::atomic<size_t> cache_flushes;
    static size_t cache_num_patches;
    __aligned(64) static SpinLock mtx;

    /* print stats for debugging */
    static void printStats();
    static void clearStats();
  };

  void resizeTessellationCache(size_t new_size);
  void resetTessellationCache();

  ////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////

  struct __aligned(64) ThreadWorkState
  {
    ALIGNED_STRUCT_(64);

    std::atomic<size_t> counter;
    ThreadWorkState* next;
    bool allocated;

    __forceinline ThreadWorkState(bool allocated = false)
      : counter(0), next(nullptr), allocated(allocated)
    {
      assert( ((size_t)this % 64) == 0 );
    }
  };

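  /* Shared lazy tessellation cache: a single large allocation arena that is
     consumed in 64-byte blocks and logically divided into NUM_CACHE_SEGMENTS
     segments. Cached entries are stamped with a time value and become invalid
     once the cache time has advanced more than NUM_CACHE_SEGMENTS-1 steps past
     that stamp (see validCacheIndex() below). */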
  class __aligned(64) SharedLazyTessellationCache
  {
  public:

    static const size_t NUM_CACHE_SEGMENTS = 8;
    static const size_t NUM_PREALLOC_THREAD_WORK_STATES = 512;
    static const size_t COMMIT_INDEX_SHIFT = 32+8;
#if defined(__64BIT__)
    static const size_t REF_TAG_MASK = 0xffffffffff;
#else
    static const size_t REF_TAG_MASK = 0x7FFFFFFF;
#endif
    static const size_t MAX_TESSELLATION_CACHE_SIZE = REF_TAG_MASK+1;
    static const size_t BLOCK_SIZE = 64;

    /*! per-thread tessellation cache work state */
    static __thread ThreadWorkState* init_t_state;
    static ThreadWorkState* current_t_state;

    static __forceinline ThreadWorkState *threadState()
    {
      if (unlikely(!init_t_state))
        /* sets init_t_state; can't return the pointer due to a macOS ICC bug */
        SharedLazyTessellationCache::sharedLazyTessellationCache.getNextRenderThreadWorkState();
      return init_t_state;
    }

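    /* 64-bit cache tag: the lower bits (masked by REF_TAG_MASK) store the offset
       of the cached data relative to the cache base pointer, the upper bits
       (starting at COMMIT_INDEX_SHIFT) store the combined time/commit index */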
    struct Tag
    {
      __forceinline Tag() : data(0) {}

      __forceinline Tag(void* ptr, size_t combinedTime) {
        init(ptr,combinedTime);
      }

      __forceinline Tag(size_t ptr, size_t combinedTime) {
        init((void*)ptr,combinedTime);
      }

      __forceinline void init(void* ptr, size_t combinedTime)
      {
        if (ptr == nullptr) {
          data = 0;
          return;
        }
        int64_t new_root_ref = (int64_t) ptr;
        new_root_ref -= (int64_t)SharedLazyTessellationCache::sharedLazyTessellationCache.getDataPtr();
        assert( new_root_ref <= (int64_t)REF_TAG_MASK );
        new_root_ref |= (int64_t)combinedTime << COMMIT_INDEX_SHIFT;
        data = new_root_ref;
      }

      __forceinline int64_t get() const { return data.load(); }
      __forceinline void set( int64_t v ) { data.store(v); }
      __forceinline void reset() { data.store(0); }

    private:
      atomic<int64_t> data;
    };

    static __forceinline size_t extractCommitIndex(const int64_t v) { return v >> SharedLazyTessellationCache::COMMIT_INDEX_SHIFT; }

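    /* a cache entry pairs a tag with a per-entry spin lock that serializes
       construction of the cached data in the templated lookup() below */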
    struct CacheEntry
    {
      Tag tag;
      SpinLock mutex;
    };

  private:

    float *data;
    bool hugepages;
    size_t size;
    size_t maxBlocks;
    ThreadWorkState *threadWorkState;

    __aligned(64) std::atomic<size_t> localTime;
    __aligned(64) std::atomic<size_t> next_block;
    __aligned(64) SpinLock reset_state;
    __aligned(64) SpinLock linkedlist_mtx;
    __aligned(64) std::atomic<size_t> switch_block_threshold;
    __aligned(64) std::atomic<size_t> numRenderThreads;

  public:

    SharedLazyTessellationCache();
    ~SharedLazyTessellationCache();

    void getNextRenderThreadWorkState();

    __forceinline size_t maxAllocSize() const {
      return switch_block_threshold;
    }

    __forceinline size_t getCurrentIndex() { return localTime.load(); }
    __forceinline void addCurrentIndex(const size_t i=1) { localTime.fetch_add(i); }

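    /* cache time used for tag validation: combines the cache-local time
       (advanced via addCurrentIndex()) with the externally supplied globalTime,
       scaled by NUM_CACHE_SEGMENTS */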
    __forceinline size_t getTime(const size_t globalTime) {
      return localTime.load()+NUM_CACHE_SEGMENTS*globalTime;
    }

    __forceinline size_t lockThread (ThreadWorkState *const t_state, const ssize_t plus=1) { return t_state->counter.fetch_add(plus); }
    __forceinline size_t unlockThread(ThreadWorkState *const t_state, const ssize_t plus=-1) { assert(isLocked(t_state)); return t_state->counter.fetch_add(plus); }

    __forceinline bool isLocked(ThreadWorkState *const t_state) { return t_state->counter.load() != 0; }

    static __forceinline void lock () { sharedLazyTessellationCache.lockThread(threadState()); }
    static __forceinline void unlock() { sharedLazyTessellationCache.unlockThread(threadState()); }
    static __forceinline bool isLocked() { return sharedLazyTessellationCache.isLocked(threadState()); }
    static __forceinline size_t getState() { return threadState()->counter.load(); }
    static __forceinline void lockThreadLoop() { sharedLazyTessellationCache.lockThreadLoop(threadState()); }

    static __forceinline size_t getTCacheTime(const size_t globalTime) {
      return sharedLazyTessellationCache.getTime(globalTime);
    }

    /* per-thread lock */
    __forceinline void lockThreadLoop (ThreadWorkState *const t_state)
    {
      while(1)
      {
        size_t lock = SharedLazyTessellationCache::sharedLazyTessellationCache.lockThread(t_state,1);
        if (unlikely(lock >= THREAD_BLOCK_ATOMIC_ADD))
        {
          /* lock failed, wait until the sync phase is over */
          sharedLazyTessellationCache.unlockThread(t_state,-1);
          sharedLazyTessellationCache.waitForUsersLessEqual(t_state,0);
        }
        else
          break;
      }
    }

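    /* decodes the tag into an absolute pointer and checks that its commit index
       is still within the valid time window; returns nullptr on a cache miss */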
    static __forceinline void* lookup(CacheEntry& entry, size_t globalTime)
    {
      const int64_t subdiv_patch_root_ref = entry.tag.get();
      CACHE_STATS(SharedTessellationCacheStats::cache_accesses++);

      if (likely(subdiv_patch_root_ref != 0))
      {
        const size_t subdiv_patch_root = (subdiv_patch_root_ref & REF_TAG_MASK) + (size_t)sharedLazyTessellationCache.getDataPtr();
        const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);

        if (likely( sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime) ))
        {
          CACHE_STATS(SharedTessellationCacheStats::cache_hits++);
          return (void*) subdiv_patch_root;
        }
      }
      CACHE_STATS(SharedTessellationCacheStats::cache_misses++);
      return nullptr;
    }

    template<typename Constructor>
    static __forceinline auto lookup (CacheEntry& entry, size_t globalTime, const Constructor constructor, const bool before=false) -> decltype(constructor())
    {
      ThreadWorkState *t_state = SharedLazyTessellationCache::threadState();

      while (true)
      {
        sharedLazyTessellationCache.lockThreadLoop(t_state);
        void* patch = SharedLazyTessellationCache::lookup(entry,globalTime);
        if (patch) return (decltype(constructor())) patch;

        if (entry.mutex.try_lock())
        {
          if (!validTag(entry.tag,globalTime))
          {
            auto timeBefore = sharedLazyTessellationCache.getTime(globalTime);
            auto ret = constructor(); // thread is locked here!
            assert(ret); /* the constructor should never return nullptr */
            auto timeAfter = sharedLazyTessellationCache.getTime(globalTime);
            auto time = before ? timeBefore : timeAfter;
            __memory_barrier();
            entry.tag = SharedLazyTessellationCache::Tag(ret,time);
            __memory_barrier();
            entry.mutex.unlock();
            return ret;
          }
          entry.mutex.unlock();
        }
        SharedLazyTessellationCache::sharedLazyTessellationCache.unlockThread(t_state);
      }
    }
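    /* The templated lookup() returns with the calling thread's per-thread lock
       (acquired via lockThreadLoop()) still held; the caller releases it, e.g.
       via unlock(), once it is done with the returned data.

       Illustrative usage sketch (not part of this header; 'MyPatch', 'entry',
       'bytes' and 'args' are placeholders):

         MyPatch* patch = SharedLazyTessellationCache::lookup(entry, globalTime, [&] () {
           void* mem = SharedLazyTessellationCache::malloc(bytes); // allocate cache blocks
           return new (mem) MyPatch(args);                         // build data while locked
         });
         ...                                                       // use patch
         SharedLazyTessellationCache::unlock();                    // release per-thread lock
    */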

    /* a cache index is valid as long as the cache time has not advanced more
       than NUM_CACHE_SEGMENTS-1 steps past it (or, with FORCE_SIMPLE_FLUSH,
       only if it matches the current time exactly) */
    __forceinline bool validCacheIndex(const size_t i, const size_t globalTime)
    {
#if FORCE_SIMPLE_FLUSH == 1
      return i == getTime(globalTime);
#else
      return i+(NUM_CACHE_SEGMENTS-1) >= getTime(globalTime);
#endif
    }

    static __forceinline bool validTime(const size_t oldtime, const size_t newTime)
    {
      return oldtime+(NUM_CACHE_SEGMENTS-1) >= newTime;
    }

    static __forceinline bool validTag(const Tag& tag, size_t globalTime)
    {
      const int64_t subdiv_patch_root_ref = tag.get();
      if (subdiv_patch_root_ref == 0) return false;
      const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);
      return sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime);
    }

    void waitForUsersLessEqual(ThreadWorkState *const t_state,
                               const unsigned int users);

    __forceinline size_t alloc(const size_t blocks)
    {
      if (unlikely(blocks >= switch_block_threshold))
        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"allocation exceeds size of tessellation cache segment");

      assert(blocks < switch_block_threshold);
      size_t index = next_block.fetch_add(blocks);
      if (unlikely(index + blocks >= switch_block_threshold)) return (size_t)-1;
      return index;
    }

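    /* allocates the requested number of bytes in 64-byte blocks; if the current
       cache segment is exhausted the thread temporarily drops its lock, triggers
       allocNextSegment(), reacquires the lock and retries */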
    static __forceinline void* malloc(const size_t bytes)
    {
      size_t block_index = (size_t)-1;
      ThreadWorkState *const t_state = threadState();
      while (true)
      {
        block_index = sharedLazyTessellationCache.alloc((bytes+BLOCK_SIZE-1)/BLOCK_SIZE);
        if (block_index == (size_t)-1)
        {
          sharedLazyTessellationCache.unlockThread(t_state);
          sharedLazyTessellationCache.allocNextSegment();
          sharedLazyTessellationCache.lockThread(t_state);
          continue;
        }
        break;
      }
      return sharedLazyTessellationCache.getBlockPtr(block_index);
    }

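    /* translates a block index into a pointer; 'data' is a float array, so one
       64-byte block corresponds to 16 floats, hence the factor of 16 */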
    __forceinline void *getBlockPtr(const size_t block_index)
    {
      assert(block_index < maxBlocks);
      assert(data);
      assert(block_index*16 <= size);
      return (void*)&data[block_index*16];
    }

    __forceinline void* getDataPtr() { return data; }
    __forceinline size_t getNumUsedBytes() { return next_block * BLOCK_SIZE; }
    __forceinline size_t getMaxBlocks() { return maxBlocks; }
    __forceinline size_t getSize() { return size; }

    void allocNextSegment();
    void realloc(const size_t newSize);

    void reset();

    static SharedLazyTessellationCache sharedLazyTessellationCache;
  };
}