// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "../common/default.h"

/* force a complete cache invalidation when running out of allocation space */
#define FORCE_SIMPLE_FLUSH 0

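/* value added to a thread's lock counter to announce a pending cache flush;
   a render thread whose lock attempt reaches this value backs off until the flush is done */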
#define THREAD_BLOCK_ATOMIC_ADD 4

/* collect cache statistics in debug builds only */
#if defined(DEBUG)
#define CACHE_STATS(x) x
#else
#define CACHE_STATS(x)
#endif

namespace embree
{
  class SharedTessellationCacheStats
  {
  public:
    /* stats */
    static std::atomic<size_t> cache_accesses;
    static std::atomic<size_t> cache_hits;
    static std::atomic<size_t> cache_misses;
    static std::atomic<size_t> cache_flushes;
    static size_t cache_num_patches;
    __aligned(64) static SpinLock mtx;

    /* print stats for debugging */
    static void printStats();
    static void clearStats();
  };

  void resizeTessellationCache(size_t new_size);
  void resetTessellationCache();

  ////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////

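  /* per-thread lock state: 'counter' acts as the thread's lock/reference count on the cache
     (0 = not locked); states are chained through 'next' and are either taken from a
     preallocated pool or heap-allocated ('allocated' = true) */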
  struct __aligned(64) ThreadWorkState
  {
    ALIGNED_STRUCT_(64);

    std::atomic<size_t> counter;
    ThreadWorkState* next;
    bool allocated;

    __forceinline ThreadWorkState(bool allocated = false)
      : counter(0), next(nullptr), allocated(allocated)
    {
      assert( ((size_t)this % 64) == 0 );
    }
  };

  class __aligned(64) SharedLazyTessellationCache
  {
  public:

    static const size_t NUM_CACHE_SEGMENTS = 8;
    static const size_t NUM_PREALLOC_THREAD_WORK_STATES = 512;
    static const size_t COMMIT_INDEX_SHIFT = 32+8;
#if defined(__64BIT__)
    static const size_t REF_TAG_MASK = 0xffffffffff;
#else
    static const size_t REF_TAG_MASK = 0x7FFFFFFF;
#endif
    static const size_t MAX_TESSELLATION_CACHE_SIZE = REF_TAG_MASK+1;
    static const size_t BLOCK_SIZE = 64;

    /*! Per thread tessellation ref cache */
    static __thread ThreadWorkState* init_t_state;
    static ThreadWorkState* current_t_state;

    static __forceinline ThreadWorkState *threadState()
    {
      if (unlikely(!init_t_state))
        /* sets init_t_state, can't return pointer due to macosx icc bug */
        SharedLazyTessellationCache::sharedLazyTessellationCache.getNextRenderThreadWorkState();
      return init_t_state;
    }

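    /* A Tag packs a reference to data inside the cache into a single 64-bit value:
       the lower bits (REF_TAG_MASK) hold the byte offset of the data relative to the
       cache base pointer, the bits above COMMIT_INDEX_SHIFT hold the combined time
       stamp under which the entry was built; a value of 0 marks an empty entry. */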
    struct Tag
    {
      __forceinline Tag() : data(0) {}

      __forceinline Tag(void* ptr, size_t combinedTime) {
        init(ptr,combinedTime);
      }

      __forceinline Tag(size_t ptr, size_t combinedTime) {
        init((void*)ptr,combinedTime);
      }

      __forceinline void init(void* ptr, size_t combinedTime)
      {
        if (ptr == nullptr) {
          data = 0;
          return;
        }
        int64_t new_root_ref = (int64_t) ptr;
        new_root_ref -= (int64_t)SharedLazyTessellationCache::sharedLazyTessellationCache.getDataPtr();
        assert( new_root_ref <= (int64_t)REF_TAG_MASK );
        new_root_ref |= (int64_t)combinedTime << COMMIT_INDEX_SHIFT;
        data = new_root_ref;
      }

      __forceinline int64_t get() const { return data.load(); }
      __forceinline void set( int64_t v ) { data.store(v); }
      __forceinline void reset() { data.store(0); }

    private:
      atomic<int64_t> data;
    };

    static __forceinline size_t extractCommitIndex(const int64_t v) { return v >> SharedLazyTessellationCache::COMMIT_INDEX_SHIFT; }

    struct CacheEntry
    {
      Tag tag;
      SpinLock mutex;
    };

  private:

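    /* cache state (rough overview): 'data'/'size'/'maxBlocks'/'hugepages' describe the memory
       arena backing the cache, 'threadWorkState' points to the preallocated per-thread lock
       states, 'localTime' is the current commit/time index, 'next_block' is the bump-allocation
       cursor in 64-byte blocks, and 'switch_block_threshold' marks where the current cache
       segment ends */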
    float *data;
    bool hugepages;
    size_t size;
    size_t maxBlocks;
    ThreadWorkState *threadWorkState;

    __aligned(64) std::atomic<size_t> localTime;
    __aligned(64) std::atomic<size_t> next_block;
    __aligned(64) SpinLock reset_state;
    __aligned(64) SpinLock linkedlist_mtx;
    __aligned(64) std::atomic<size_t> switch_block_threshold;
    __aligned(64) std::atomic<size_t> numRenderThreads;

  public:

    SharedLazyTessellationCache();
    ~SharedLazyTessellationCache();

    void getNextRenderThreadWorkState();

    __forceinline size_t maxAllocSize() const {
      return switch_block_threshold;
    }

    __forceinline size_t getCurrentIndex() { return localTime.load(); }
    __forceinline void addCurrentIndex(const size_t i=1) { localTime.fetch_add(i); }

    __forceinline size_t getTime(const size_t globalTime) {
      return localTime.load()+NUM_CACHE_SEGMENTS*globalTime;
    }

    __forceinline size_t lockThread  (ThreadWorkState *const t_state, const ssize_t plus=1)  { return t_state->counter.fetch_add(plus); }
    __forceinline size_t unlockThread(ThreadWorkState *const t_state, const ssize_t plus=-1) { assert(isLocked(t_state)); return t_state->counter.fetch_add(plus); }

    __forceinline bool isLocked(ThreadWorkState *const t_state) { return t_state->counter.load() != 0; }

    static __forceinline void lock  () { sharedLazyTessellationCache.lockThread(threadState()); }
    static __forceinline void unlock() { sharedLazyTessellationCache.unlockThread(threadState()); }
    static __forceinline bool isLocked() { return sharedLazyTessellationCache.isLocked(threadState()); }
    static __forceinline size_t getState() { return threadState()->counter.load(); }
    static __forceinline void lockThreadLoop() { sharedLazyTessellationCache.lockThreadLoop(threadState()); }

    static __forceinline size_t getTCacheTime(const size_t globalTime) {
      return sharedLazyTessellationCache.getTime(globalTime);
    }

    /* per-thread lock: spins until the lock is acquired and backs off while a cache flush is in progress */
    __forceinline void lockThreadLoop (ThreadWorkState *const t_state)
    {
      while(1)
      {
        size_t lock = SharedLazyTessellationCache::sharedLazyTessellationCache.lockThread(t_state,1);
        if (unlikely(lock >= THREAD_BLOCK_ATOMIC_ADD))
        {
          /* lock failed; wait until the sync phase is over */
          sharedLazyTessellationCache.unlockThread(t_state,-1);
          sharedLazyTessellationCache.waitForUsersLessEqual(t_state,0);
        }
        else
          break;
      }
    }

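    /* looks up a cache entry: returns a pointer to the cached data if the entry's tag is set and
       its time stamp is still within the valid range, nullptr otherwise */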
    static __forceinline void* lookup(CacheEntry& entry, size_t globalTime)
    {
      const int64_t subdiv_patch_root_ref = entry.tag.get();
      CACHE_STATS(SharedTessellationCacheStats::cache_accesses++);

      if (likely(subdiv_patch_root_ref != 0))
      {
        const size_t subdiv_patch_root = (subdiv_patch_root_ref & REF_TAG_MASK) + (size_t)sharedLazyTessellationCache.getDataPtr();
        const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);

        if (likely( sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime) ))
        {
          CACHE_STATS(SharedTessellationCacheStats::cache_hits++);
          return (void*) subdiv_patch_root;
        }
      }
      CACHE_STATS(SharedTessellationCacheStats::cache_misses++);
      return nullptr;
    }

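    /* Looks up 'entry' and lazily builds it on a miss: the calling thread's cache lock is taken
       (and is still held when this function returns, so the caller must call unlock() when done),
       the per-entry mutex serializes construction, and the tag is re-published with the current
       time stamp. Illustrative usage sketch; 'buildPatchData' and 'bytes' are hypothetical:

         void* leaf = SharedLazyTessellationCache::lookup(entry, globalTime, [&] () -> void* {
           void* mem = SharedLazyTessellationCache::malloc(bytes); // allocate inside the cache
           return buildPatchData(mem);                             // fill the allocated blocks
         });
         ...
         SharedLazyTessellationCache::unlock(); // release the per-thread lock taken by lookup()
    */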
    template<typename Constructor>
    static __forceinline auto lookup (CacheEntry& entry, size_t globalTime, const Constructor constructor, const bool before=false) -> decltype(constructor())
    {
      ThreadWorkState *t_state = SharedLazyTessellationCache::threadState();

      while (true)
      {
        sharedLazyTessellationCache.lockThreadLoop(t_state);
        void* patch = SharedLazyTessellationCache::lookup(entry,globalTime);
        if (patch) return (decltype(constructor())) patch;

        if (entry.mutex.try_lock())
        {
          if (!validTag(entry.tag,globalTime))
          {
            auto timeBefore = sharedLazyTessellationCache.getTime(globalTime);
            auto ret = constructor(); // executed while the thread lock is held
            assert(ret);              // the constructor must never return nullptr
            auto timeAfter = sharedLazyTessellationCache.getTime(globalTime);
            auto time = before ? timeBefore : timeAfter;
            __memory_barrier();
            entry.tag = SharedLazyTessellationCache::Tag(ret,time);
            __memory_barrier();
            entry.mutex.unlock();
            return ret;
          }
          entry.mutex.unlock();
        }
        SharedLazyTessellationCache::sharedLazyTessellationCache.unlockThread(t_state);
      }
    }

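    /* a cache index is valid while it falls within the last NUM_CACHE_SEGMENTS time steps
       (or matches the current time exactly when FORCE_SIMPLE_FLUSH is enabled) */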
    __forceinline bool validCacheIndex(const size_t i, const size_t globalTime)
    {
#if FORCE_SIMPLE_FLUSH == 1
      return i == getTime(globalTime);
#else
      return i+(NUM_CACHE_SEGMENTS-1) >= getTime(globalTime);
#endif
    }

    static __forceinline bool validTime(const size_t oldtime, const size_t newTime)
    {
      return oldtime+(NUM_CACHE_SEGMENTS-1) >= newTime;
    }

    static __forceinline bool validTag(const Tag& tag, size_t globalTime)
    {
      const int64_t subdiv_patch_root_ref = tag.get();
      if (subdiv_patch_root_ref == 0) return false;
      const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);
      return sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime);
    }

    void waitForUsersLessEqual(ThreadWorkState *const t_state,
                               const unsigned int users);

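    /* bump-allocates 'blocks' 64-byte blocks from the current cache segment; returns (size_t)-1
       when the segment is exhausted and throws if the request can never fit into one segment */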
    __forceinline size_t alloc(const size_t blocks)
    {
      if (unlikely(blocks >= switch_block_threshold))
        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"allocation exceeds size of tessellation cache segment");

      assert(blocks < switch_block_threshold);
      size_t index = next_block.fetch_add(blocks);
      if (unlikely(index + blocks >= switch_block_threshold)) return (size_t)-1;
      return index;
    }

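    /* allocates 'bytes' (rounded up to whole 64-byte blocks) inside the cache; when the current
       segment is full, the thread lock is dropped, the cache switches to the next segment and
       the allocation is retried */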
    static __forceinline void* malloc(const size_t bytes)
    {
      size_t block_index = -1;
      ThreadWorkState *const t_state = threadState();
      while (true)
      {
        block_index = sharedLazyTessellationCache.alloc((bytes+BLOCK_SIZE-1)/BLOCK_SIZE);
        if (block_index == (size_t)-1)
        {
          sharedLazyTessellationCache.unlockThread(t_state);
          sharedLazyTessellationCache.allocNextSegment();
          sharedLazyTessellationCache.lockThread(t_state);
          continue;
        }
        break;
      }
      return sharedLazyTessellationCache.getBlockPtr(block_index);
    }

    __forceinline void *getBlockPtr(const size_t block_index)
    {
      assert(block_index < maxBlocks);
      assert(data);
      assert(block_index*16 <= size);
      return (void*)&data[block_index*16]; /* 'data' is a float*, so one 64-byte block equals 16 floats */
    }

    __forceinline void*  getDataPtr()      { return data; }
    __forceinline size_t getNumUsedBytes() { return next_block * BLOCK_SIZE; }
    __forceinline size_t getMaxBlocks()    { return maxBlocks; }
    __forceinline size_t getSize()         { return size; }

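    /* cache maintenance, implemented out of line: switch allocation to the next segment,
       resize the cache memory, and reset the whole cache */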
    void allocNextSegment();
    void realloc(const size_t newSize);

    void reset();

    static SharedLazyTessellationCache sharedLazyTessellationCache;
  };
}