// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "../common/default.h"

/* force a complete cache invalidation when running out of allocation space */
#define FORCE_SIMPLE_FLUSH 0

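/* per-thread lock counter value at or above which lockThreadLoop() treats the
   cache as being synchronized/flushed and backs off (see lockThreadLoop() and
   waitForUsersLessEqual() below) */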
#define THREAD_BLOCK_ATOMIC_ADD 4

#if defined(DEBUG)
#define CACHE_STATS(x) x
#else
#define CACHE_STATS(x)
#endif

namespace embree
{
  class SharedTessellationCacheStats
  {
  public:
    /* stats */
    static std::atomic<size_t> cache_accesses;
    static std::atomic<size_t> cache_hits;
    static std::atomic<size_t> cache_misses;
    static std::atomic<size_t> cache_flushes;
    static size_t cache_num_patches;
    __aligned(64) static SpinLock mtx;

    /* print stats for debugging */
    static void printStats();
    static void clearStats();
  };

  void resizeTessellationCache(size_t new_size);
  void resetTessellationCache();

  ////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////

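  /* per-thread work state: an atomic counter that, while non-zero, marks the
     owning thread as an active user of the tessellation cache (see
     SharedLazyTessellationCache::lockThread); states are chained via 'next'
     into a list of all registered render threads */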
  struct __aligned(64) ThreadWorkState
  {
    ALIGNED_STRUCT_(64);

    std::atomic<size_t> counter;
    ThreadWorkState* next;
    bool allocated;

    __forceinline ThreadWorkState(bool allocated = false)
      : counter(0), next(nullptr), allocated(allocated)
    {
      assert( ((size_t)this % 64) == 0 );
    }
  };

  class __aligned(64) SharedLazyTessellationCache
  {
  public:

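    /* A cache reference is a single 64-bit value: the lower bits (REF_TAG_MASK)
       hold the byte offset of the cached data relative to the cache base pointer,
       and the bits from COMMIT_INDEX_SHIFT upwards hold the combined time (commit
       index) at which the entry was built. The cache is divided into
       NUM_CACHE_SEGMENTS segments; an entry stays valid as long as its commit
       index is recent enough (see validCacheIndex). */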
    static const size_t NUM_CACHE_SEGMENTS = 8;
    static const size_t NUM_PREALLOC_THREAD_WORK_STATES = 512;
    static const size_t COMMIT_INDEX_SHIFT = 32+8;
#if defined(__64BIT__)
    static const size_t REF_TAG_MASK = 0xffffffffff;
#else
    static const size_t REF_TAG_MASK = 0x7FFFFFFF;
#endif
    static const size_t MAX_TESSELLATION_CACHE_SIZE = REF_TAG_MASK+1;
    static const size_t BLOCK_SIZE = 64;


    /*! Per thread tessellation ref cache */
    static __thread ThreadWorkState* init_t_state;
    static ThreadWorkState* current_t_state;

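    /* returns the calling thread's work state, registering the thread on first use */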
    static __forceinline ThreadWorkState *threadState()
    {
      if (unlikely(!init_t_state))
        /* sets init_t_state; cannot return the pointer directly due to a macOS ICC bug */
        SharedLazyTessellationCache::sharedLazyTessellationCache.getNextRenderThreadWorkState();
      return init_t_state;
    }

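    /* 64-bit cache tag: packs the offset of the cached data relative to the cache
       base pointer (lower bits) with the combined time at which it was built
       (upper bits, see COMMIT_INDEX_SHIFT); a value of 0 denotes an empty entry */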
    struct Tag
    {
      __forceinline Tag() : data(0) {}

      __forceinline Tag(void* ptr, size_t combinedTime) {
        init(ptr,combinedTime);
      }

      __forceinline Tag(size_t ptr, size_t combinedTime) {
        init((void*)ptr,combinedTime);
      }

      __forceinline void init(void* ptr, size_t combinedTime)
      {
        if (ptr == nullptr) {
          data = 0;
          return;
        }
        int64_t new_root_ref = (int64_t) ptr;
        new_root_ref -= (int64_t)SharedLazyTessellationCache::sharedLazyTessellationCache.getDataPtr();
        assert( new_root_ref <= (int64_t)REF_TAG_MASK );
        new_root_ref |= (int64_t)combinedTime << COMMIT_INDEX_SHIFT;
        data = new_root_ref;
      }

      __forceinline int64_t get() const { return data.load(); }
      __forceinline void set( int64_t v ) { data.store(v); }
      __forceinline void reset() { data.store(0); }

    private:
      atomic<int64_t> data;
    };

    /* extracts the commit index (combined time) from a tag value */
    static __forceinline size_t extractCommitIndex(const int64_t v) { return v >> SharedLazyTessellationCache::COMMIT_INDEX_SHIFT; }

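    /* a cache entry: the atomic tag plus a spin lock that serializes
       construction of the entry's data on a cache miss */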
    struct CacheEntry
    {
      Tag tag;
      SpinLock mutex;
    };

  private:

    float *data;                      // cache memory
    bool hugepages;                   // true if the cache memory is backed by huge pages
    size_t size;                      // size of the cache memory in bytes
    size_t maxBlocks;                 // number of 64-byte blocks the cache can hold
    ThreadWorkState *threadWorkState; // pre-allocated thread work states

    __aligned(64) std::atomic<size_t> localTime;              // cache-local time (see getTime/addCurrentIndex)
    __aligned(64) std::atomic<size_t> next_block;             // next free block in the current segment
    __aligned(64) SpinLock reset_state;
    __aligned(64) SpinLock linkedlist_mtx;
    __aligned(64) std::atomic<size_t> switch_block_threshold; // block index at which the current segment ends
    __aligned(64) std::atomic<size_t> numRenderThreads;


  public:


    SharedLazyTessellationCache();
    ~SharedLazyTessellationCache();

    void getNextRenderThreadWorkState();

    __forceinline size_t maxAllocSize() const {
      return switch_block_threshold;
    }

    __forceinline size_t getCurrentIndex() { return localTime.load(); }
    __forceinline void addCurrentIndex(const size_t i=1) { localTime.fetch_add(i); }

    __forceinline size_t getTime(const size_t globalTime) {
      return localTime.load()+NUM_CACHE_SEGMENTS*globalTime;
    }


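    /* per-thread reference counting: lockThread increments the calling thread's
       counter and unlockThread decrements it again; while the counter is non-zero
       the thread is considered an active cache user and the cache data it reads
       must not be flushed from under it */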
    __forceinline size_t lockThread (ThreadWorkState *const t_state, const ssize_t plus=1) { return t_state->counter.fetch_add(plus); }
    __forceinline size_t unlockThread(ThreadWorkState *const t_state, const ssize_t plus=-1) { assert(isLocked(t_state)); return t_state->counter.fetch_add(plus); }

    __forceinline bool isLocked(ThreadWorkState *const t_state) { return t_state->counter.load() != 0; }

    static __forceinline void lock () { sharedLazyTessellationCache.lockThread(threadState()); }
    static __forceinline void unlock() { sharedLazyTessellationCache.unlockThread(threadState()); }
    static __forceinline bool isLocked() { return sharedLazyTessellationCache.isLocked(threadState()); }
    static __forceinline size_t getState() { return threadState()->counter.load(); }
    static __forceinline void lockThreadLoop() { sharedLazyTessellationCache.lockThreadLoop(threadState()); }

    static __forceinline size_t getTCacheTime(const size_t globalTime) {
      return sharedLazyTessellationCache.getTime(globalTime);
    }

    /* per thread lock */
    __forceinline void lockThreadLoop (ThreadWorkState *const t_state)
    {
      while(1)
      {
        size_t lock = SharedLazyTessellationCache::sharedLazyTessellationCache.lockThread(t_state,1);
        if (unlikely(lock >= THREAD_BLOCK_ATOMIC_ADD))
        {
          /* lock failed, wait until the sync phase is over */
          sharedLazyTessellationCache.unlockThread(t_state,-1);
          sharedLazyTessellationCache.waitForUsersLessEqual(t_state,0);
        }
        else
          break;
      }
    }

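    /* decodes the entry's tag into an absolute pointer and returns it if its
       commit index is still valid for the given time, otherwise returns nullptr;
       the caller must hold its per-thread lock while using the returned pointer */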
    static __forceinline void* lookup(CacheEntry& entry, size_t globalTime)
    {
      const int64_t subdiv_patch_root_ref = entry.tag.get();
      CACHE_STATS(SharedTessellationCacheStats::cache_accesses++);

      if (likely(subdiv_patch_root_ref != 0))
      {
        const size_t subdiv_patch_root = (subdiv_patch_root_ref & REF_TAG_MASK) + (size_t)sharedLazyTessellationCache.getDataPtr();
        const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);

        if (likely( sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime) ))
        {
          CACHE_STATS(SharedTessellationCacheStats::cache_hits++);
          return (void*) subdiv_patch_root;
        }
      }
      CACHE_STATS(SharedTessellationCacheStats::cache_misses++);
      return nullptr;
    }

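    /* lookup that builds the entry on a miss: the fast path re-checks the tag under
       the per-thread lock; on a miss the per-entry mutex is taken, constructor() is
       run to build the data, and a new tag is published. On success the function
       returns with the per-thread lock still held; the caller releases it via unlock(). */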
    template<typename Constructor>
    static __forceinline auto lookup (CacheEntry& entry, size_t globalTime, const Constructor constructor, const bool before=false) -> decltype(constructor())
    {
      ThreadWorkState *t_state = SharedLazyTessellationCache::threadState();

      while (true)
      {
        sharedLazyTessellationCache.lockThreadLoop(t_state);
        void* patch = SharedLazyTessellationCache::lookup(entry,globalTime);
        if (patch) return (decltype(constructor())) patch;

        if (entry.mutex.try_lock())
        {
          if (!validTag(entry.tag,globalTime))
          {
            auto timeBefore = sharedLazyTessellationCache.getTime(globalTime);
            /* the constructor should never return nullptr */
            auto ret = constructor(); // thread is locked here!
            assert(ret);
            auto timeAfter = sharedLazyTessellationCache.getTime(globalTime);
            auto time = before ? timeBefore : timeAfter;
            __memory_barrier();
            entry.tag = SharedLazyTessellationCache::Tag(ret,time);
            __memory_barrier();
            entry.mutex.unlock();
            return ret;
          }
          entry.mutex.unlock();
        }
        SharedLazyTessellationCache::sharedLazyTessellationCache.unlockThread(t_state);
      }
    }
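
    /* Illustrative usage sketch (not part of this interface; 'Patch' and
       'buildPatch' are hypothetical): callers typically pass a constructor
       lambda that allocates from the cache and builds the data in place, e.g.

         Patch* p = SharedLazyTessellationCache::lookup(entry, globalTime, [&] () {
           void* mem = SharedLazyTessellationCache::malloc(sizeof(Patch));
           return buildPatch(mem); // must not return nullptr
         });
         ... use p ...
         SharedLazyTessellationCache::unlock();
    */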

    /* checks whether a commit index is still valid for the given time; with
       FORCE_SIMPLE_FLUSH the entry must come from the current time slot,
       otherwise it may be up to NUM_CACHE_SEGMENTS-1 slots old */
    __forceinline bool validCacheIndex(const size_t i, const size_t globalTime)
    {
#if FORCE_SIMPLE_FLUSH == 1
      return i == getTime(globalTime);
#else
      return i+(NUM_CACHE_SEGMENTS-1) >= getTime(globalTime);
#endif
    }

    static __forceinline bool validTime(const size_t oldtime, const size_t newTime)
    {
      return oldtime+(NUM_CACHE_SEGMENTS-1) >= newTime;
    }


    static __forceinline bool validTag(const Tag& tag, size_t globalTime)
    {
      const int64_t subdiv_patch_root_ref = tag.get();
      if (subdiv_patch_root_ref == 0) return false;
      const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);
      return sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime);
    }

    void waitForUsersLessEqual(ThreadWorkState *const t_state,
                               const unsigned int users);

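    /* allocates 'blocks' 64-byte blocks from the current segment by bumping
       next_block; returns (size_t)-1 if the segment is exhausted so that the
       caller can switch to the next segment */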
    __forceinline size_t alloc(const size_t blocks)
    {
      if (unlikely(blocks >= switch_block_threshold))
        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"allocation exceeds size of tessellation cache segment");

      assert(blocks < switch_block_threshold);
      size_t index = next_block.fetch_add(blocks);
      if (unlikely(index + blocks >= switch_block_threshold)) return (size_t)-1;
      return index;
    }

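    /* allocates 'bytes' rounded up to whole blocks; when the current segment is
       exhausted the per-thread lock is dropped, the next segment is allocated,
       the lock is re-acquired, and the allocation is retried */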
    static __forceinline void* malloc(const size_t bytes)
    {
      size_t block_index = (size_t)-1;
      ThreadWorkState *const t_state = threadState();
      while (true)
      {
        block_index = sharedLazyTessellationCache.alloc((bytes+BLOCK_SIZE-1)/BLOCK_SIZE);
        if (block_index == (size_t)-1)
        {
          sharedLazyTessellationCache.unlockThread(t_state);
          sharedLazyTessellationCache.allocNextSegment();
          sharedLazyTessellationCache.lockThread(t_state);
          continue;
        }
        break;
      }
      return sharedLazyTessellationCache.getBlockPtr(block_index);
    }

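    /* translates a block index into a pointer; 'data' is a float array, so one
       64-byte block corresponds to 16 floats */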
    __forceinline void *getBlockPtr(const size_t block_index)
    {
      assert(block_index < maxBlocks);
      assert(data);
      assert(block_index*16 <= size);
      return (void*)&data[block_index*16];
    }

    __forceinline void* getDataPtr() { return data; }
    __forceinline size_t getNumUsedBytes() { return next_block * BLOCK_SIZE; }
    __forceinline size_t getMaxBlocks() { return maxBlocks; }
    __forceinline size_t getSize() { return size; }

    void allocNextSegment();
    void realloc(const size_t newSize);

    void reset();

    static SharedLazyTessellationCache sharedLazyTessellationCache;
  };
}