| 1 | // Copyright 2009-2021 Intel Corporation |
| 2 | // SPDX-License-Identifier: Apache-2.0 |
| 3 | |
| 4 | #pragma once |
| 5 | |
| 6 | #include "../common/builder.h" |
| 7 | #include "../../common/algorithms/parallel_reduce.h" |
| 8 | |
| 9 | namespace embree |
| 10 | { |
| 11 | namespace isa |
| 12 | { |
| 13 | struct BVHBuilderMorton |
| 14 | { |
| 15 | static const size_t MAX_BRANCHING_FACTOR = 8; //!< maximum supported BVH branching factor |
| 16 | static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree of we are that many levels before the maximum tree depth |
| 17 | |
| 18 | /*! settings for morton builder */ |
| 19 | struct Settings |
| 20 | { |
| 21 | /*! default settings */ |
| 22 | Settings () |
| 23 | : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024) {} |
| 24 | |
| 25 | /*! initialize settings from API settings */ |
| 26 | Settings (const RTCBuildArguments& settings) |
| 27 | : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024) |
| 28 | { |
| 29 | if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor; |
| 30 | if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth )) maxDepth = settings.maxDepth; |
| 31 | if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize )) minLeafSize = settings.minLeafSize; |
| 32 | if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize )) maxLeafSize = settings.maxLeafSize; |
| 33 | |
| 34 | minLeafSize = min(minLeafSize,maxLeafSize); |
| 35 | } |
| 36 | |
| 37 | Settings (size_t branchingFactor, size_t maxDepth, size_t minLeafSize, size_t maxLeafSize, size_t singleThreadThreshold) |
| 38 | : branchingFactor(branchingFactor), maxDepth(maxDepth), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), singleThreadThreshold(singleThreadThreshold) |
| 39 | { |
| 40 | minLeafSize = min(minLeafSize,maxLeafSize); |
| 41 | } |
| 42 | |
| 43 | public: |
| 44 | size_t branchingFactor; //!< branching factor of BVH to build |
| 45 | size_t maxDepth; //!< maximum depth of BVH to build |
| 46 | size_t minLeafSize; //!< minimum size of a leaf |
| 47 | size_t maxLeafSize; //!< maximum size of a leaf |
| 48 | size_t singleThreadThreshold; //!< threshold when we switch to single threaded build |
| 49 | }; |
| 50 | |
| 51 | /*! Build primitive consisting of morton code and primitive ID. */ |
| 52 | struct __aligned(8) BuildPrim |
| 53 | { |
| 54 | union { |
| 55 | struct { |
| 56 | unsigned int code; //!< morton code |
| 57 | unsigned int index; //!< i'th primitive |
| 58 | }; |
| 59 | uint64_t t; |
| 60 | }; |
| 61 | |
| 62 | /*! interface for radix sort */ |
| 63 | __forceinline operator unsigned() const { return code; } |
| 64 | |
| 65 | /*! interface for standard sort */ |
| 66 | __forceinline bool operator<(const BuildPrim &m) const { return code < m.code; } |
| 67 | }; |
| 68 | |
| 69 | /*! maps bounding box to morton code */ |
| 70 | struct MortonCodeMapping |
| 71 | { |
| 72 | static const size_t LATTICE_BITS_PER_DIM = 10; |
| 73 | static const size_t LATTICE_SIZE_PER_DIM = size_t(1) << LATTICE_BITS_PER_DIM; |
| 74 | |
| 75 | vfloat4 base; |
| 76 | vfloat4 scale; |
| 77 | |
| 78 | __forceinline MortonCodeMapping(const BBox3fa& bounds) |
| 79 | { |
| 80 | base = (vfloat4)bounds.lower; |
| 81 | const vfloat4 diag = (vfloat4)bounds.upper - (vfloat4)bounds.lower; |
| 82 | scale = select(diag > vfloat4(1E-19f), rcp(diag) * vfloat4(LATTICE_SIZE_PER_DIM * 0.99f),vfloat4(0.0f)); |
| 83 | } |
| 84 | |
| 85 | __forceinline const vint4 bin (const BBox3fa& box) const |
| 86 | { |
| 87 | const vfloat4 lower = (vfloat4)box.lower; |
| 88 | const vfloat4 upper = (vfloat4)box.upper; |
| 89 | const vfloat4 centroid = lower+upper; |
| 90 | return vint4((centroid-base)*scale); |
| 91 | } |
| 92 | |
| 93 | __forceinline unsigned int code (const BBox3fa& box) const |
| 94 | { |
| 95 | const vint4 binID = bin(box); |
| 96 | const unsigned int x = extract<0>(binID); |
| 97 | const unsigned int y = extract<1>(binID); |
| 98 | const unsigned int z = extract<2>(binID); |
| 99 | const unsigned int xyz = bitInterleave(x,y,z); |
| 100 | return xyz; |
| 101 | } |
| 102 | }; |
| 103 | |
| 104 | #if defined (__AVX2__) |
| 105 | |
| 106 | /*! for AVX2 there is a fast scalar bitInterleave */ |
| 107 | struct MortonCodeGenerator |
| 108 | { |
| 109 | __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest) |
| 110 | : mapping(mapping), dest(dest) {} |
| 111 | |
| 112 | __forceinline void operator() (const BBox3fa& b, const unsigned index) |
| 113 | { |
| 114 | dest->index = index; |
| 115 | dest->code = mapping.code(b); |
| 116 | dest++; |
| 117 | } |
| 118 | |
| 119 | public: |
| 120 | const MortonCodeMapping mapping; |
| 121 | BuildPrim* dest; |
| 122 | size_t currentID; |
| 123 | }; |
| 124 | |
| 125 | #else |
| 126 | |
| 127 | /*! before AVX2 is it better to use the SSE version of bitInterleave */ |
| 128 | struct MortonCodeGenerator |
| 129 | { |
| 130 | __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest) |
| 131 | : mapping(mapping), dest(dest), currentID(0), slots(0), ax(0), ay(0), az(0), ai(0) {} |
| 132 | |
| 133 | __forceinline ~MortonCodeGenerator() |
| 134 | { |
| 135 | if (slots != 0) |
| 136 | { |
| 137 | const vint4 code = bitInterleave(ax,ay,az); |
| 138 | for (size_t i=0; i<slots; i++) { |
| 139 | dest[currentID-slots+i].index = ai[i]; |
| 140 | dest[currentID-slots+i].code = code[i]; |
| 141 | } |
| 142 | } |
| 143 | } |
| 144 | |
| 145 | __forceinline void operator() (const BBox3fa& b, const unsigned index) |
| 146 | { |
| 147 | const vint4 binID = mapping.bin(b); |
| 148 | ax[slots] = extract<0>(binID); |
| 149 | ay[slots] = extract<1>(binID); |
| 150 | az[slots] = extract<2>(binID); |
| 151 | ai[slots] = index; |
| 152 | slots++; |
| 153 | currentID++; |
| 154 | |
| 155 | if (slots == 4) |
| 156 | { |
| 157 | const vint4 code = bitInterleave(ax,ay,az); |
| 158 | vint4::storeu(&dest[currentID-4],unpacklo(code,ai)); |
| 159 | vint4::storeu(&dest[currentID-2],unpackhi(code,ai)); |
| 160 | slots = 0; |
| 161 | } |
| 162 | } |
| 163 | |
| 164 | public: |
| 165 | const MortonCodeMapping mapping; |
| 166 | BuildPrim* dest; |
| 167 | size_t currentID; |
| 168 | size_t slots; |
| 169 | vint4 ax, ay, az, ai; |
| 170 | }; |
| 171 | |
| 172 | #endif |
| 173 | |
| 174 | template< |
| 175 | typename ReductionTy, |
| 176 | typename Allocator, |
| 177 | typename CreateAllocator, |
| 178 | typename CreateNodeFunc, |
| 179 | typename SetNodeBoundsFunc, |
| 180 | typename CreateLeafFunc, |
| 181 | typename CalculateBounds, |
| 182 | typename ProgressMonitor> |
| 183 | |
| 184 | class BuilderT : private Settings |
| 185 | { |
| 186 | ALIGNED_CLASS_(16); |
| 187 | |
| 188 | public: |
| 189 | |
| 190 | BuilderT (CreateAllocator& createAllocator, |
| 191 | CreateNodeFunc& createNode, |
| 192 | SetNodeBoundsFunc& setBounds, |
| 193 | CreateLeafFunc& createLeaf, |
| 194 | CalculateBounds& calculateBounds, |
| 195 | ProgressMonitor& progressMonitor, |
| 196 | const Settings& settings) |
| 197 | |
| 198 | : Settings(settings), |
| 199 | createAllocator(createAllocator), |
| 200 | createNode(createNode), |
| 201 | setBounds(setBounds), |
| 202 | createLeaf(createLeaf), |
| 203 | calculateBounds(calculateBounds), |
| 204 | progressMonitor(progressMonitor), |
| 205 | morton(nullptr) {} |
| 206 | |
| 207 | ReductionTy createLargeLeaf(size_t depth, const range<unsigned>& current, Allocator alloc) |
| 208 | { |
| 209 | /* this should never occur but is a fatal error */ |
| 210 | if (depth > maxDepth) |
| 211 | throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached" ); |
| 212 | |
| 213 | /* create leaf for few primitives */ |
| 214 | if (current.size() <= maxLeafSize) |
| 215 | return createLeaf(current,alloc); |
| 216 | |
| 217 | /* fill all children by always splitting the largest one */ |
| 218 | range<unsigned> children[MAX_BRANCHING_FACTOR]; |
| 219 | size_t numChildren = 1; |
| 220 | children[0] = current; |
| 221 | |
| 222 | do { |
| 223 | |
| 224 | /* find best child with largest number of primitives */ |
| 225 | size_t bestChild = -1; |
| 226 | size_t bestSize = 0; |
| 227 | for (size_t i=0; i<numChildren; i++) |
| 228 | { |
| 229 | /* ignore leaves as they cannot get split */ |
| 230 | if (children[i].size() <= maxLeafSize) |
| 231 | continue; |
| 232 | |
| 233 | /* remember child with largest size */ |
| 234 | if (children[i].size() > bestSize) { |
| 235 | bestSize = children[i].size(); |
| 236 | bestChild = i; |
| 237 | } |
| 238 | } |
| 239 | if (bestChild == size_t(-1)) break; |
| 240 | |
| 241 | /*! split best child into left and right child */ |
| 242 | auto split = children[bestChild].split(); |
| 243 | |
| 244 | /* add new children left and right */ |
| 245 | children[bestChild] = children[numChildren-1]; |
| 246 | children[numChildren-1] = split.first; |
| 247 | children[numChildren+0] = split.second; |
| 248 | numChildren++; |
| 249 | |
| 250 | } while (numChildren < branchingFactor); |
| 251 | |
| 252 | /* create node */ |
| 253 | auto node = createNode(alloc,numChildren); |
| 254 | |
| 255 | /* recurse into each child */ |
| 256 | ReductionTy bounds[MAX_BRANCHING_FACTOR]; |
| 257 | for (size_t i=0; i<numChildren; i++) |
| 258 | bounds[i] = createLargeLeaf(depth+1,children[i],alloc); |
| 259 | |
| 260 | return setBounds(node,bounds,numChildren); |
| 261 | } |
| 262 | |
| 263 | /*! recreates morton codes when reaching a region where all codes are identical */ |
| 264 | __noinline void recreateMortonCodes(const range<unsigned>& current) const |
| 265 | { |
| 266 | /* fast path for small ranges */ |
| 267 | if (likely(current.size() < 1024)) |
| 268 | { |
| 269 | /*! recalculate centroid bounds */ |
| 270 | BBox3fa centBounds(empty); |
| 271 | for (size_t i=current.begin(); i<current.end(); i++) |
| 272 | centBounds.extend(center2(calculateBounds(morton[i]))); |
| 273 | |
| 274 | /* recalculate morton codes */ |
| 275 | MortonCodeMapping mapping(centBounds); |
| 276 | for (size_t i=current.begin(); i<current.end(); i++) |
| 277 | morton[i].code = mapping.code(calculateBounds(morton[i])); |
| 278 | |
| 279 | /* sort morton codes */ |
| 280 | std::sort(morton+current.begin(),morton+current.end()); |
| 281 | } |
| 282 | else |
| 283 | { |
| 284 | /*! recalculate centroid bounds */ |
| 285 | auto calculateCentBounds = [&] ( const range<unsigned>& r ) { |
| 286 | BBox3fa centBounds = empty; |
| 287 | for (size_t i=r.begin(); i<r.end(); i++) |
| 288 | centBounds.extend(center2(calculateBounds(morton[i]))); |
| 289 | return centBounds; |
| 290 | }; |
| 291 | const BBox3fa centBounds = parallel_reduce(current.begin(), current.end(), unsigned(1024), |
| 292 | BBox3fa(empty), calculateCentBounds, BBox3fa::merge); |
| 293 | |
| 294 | /* recalculate morton codes */ |
| 295 | MortonCodeMapping mapping(centBounds); |
| 296 | parallel_for(current.begin(), current.end(), unsigned(1024), [&] ( const range<unsigned>& r ) { |
| 297 | for (size_t i=r.begin(); i<r.end(); i++) { |
| 298 | morton[i].code = mapping.code(calculateBounds(morton[i])); |
| 299 | } |
| 300 | }); |
| 301 | |
| 302 | /*! sort morton codes */ |
| 303 | #if defined(TASKING_TBB) |
| 304 | tbb::parallel_sort(morton+current.begin(),morton+current.end()); |
| 305 | #else |
| 306 | radixsort32(morton+current.begin(),current.size()); |
| 307 | #endif |
| 308 | } |
| 309 | } |
| 310 | |
| 311 | __forceinline void split(const range<unsigned>& current, range<unsigned>& left, range<unsigned>& right) const |
| 312 | { |
| 313 | const unsigned int code_start = morton[current.begin()].code; |
| 314 | const unsigned int code_end = morton[current.end()-1].code; |
| 315 | unsigned int bitpos = lzcnt(code_start^code_end); |
| 316 | |
| 317 | /* if all items mapped to same morton code, then re-create new morton codes for the items */ |
| 318 | if (unlikely(bitpos == 32)) |
| 319 | { |
| 320 | recreateMortonCodes(current); |
| 321 | const unsigned int code_start = morton[current.begin()].code; |
| 322 | const unsigned int code_end = morton[current.end()-1].code; |
| 323 | bitpos = lzcnt(code_start^code_end); |
| 324 | |
| 325 | /* if the morton code is still the same, goto fall back split */ |
| 326 | if (unlikely(bitpos == 32)) { |
| 327 | current.split(left,right); |
| 328 | return; |
| 329 | } |
| 330 | } |
| 331 | |
| 332 | /* split the items at the topmost different morton code bit */ |
| 333 | const unsigned int bitpos_diff = 31-bitpos; |
| 334 | const unsigned int bitmask = 1 << bitpos_diff; |
| 335 | |
| 336 | /* find location where bit differs using binary search */ |
| 337 | unsigned begin = current.begin(); |
| 338 | unsigned end = current.end(); |
| 339 | while (begin + 1 != end) { |
| 340 | const unsigned mid = (begin+end)/2; |
| 341 | const unsigned bit = morton[mid].code & bitmask; |
| 342 | if (bit == 0) begin = mid; else end = mid; |
| 343 | } |
| 344 | unsigned center = end; |
| 345 | #if defined(DEBUG) |
| 346 | for (unsigned int i=begin; i<center; i++) assert((morton[i].code & bitmask) == 0); |
| 347 | for (unsigned int i=center; i<end; i++) assert((morton[i].code & bitmask) == bitmask); |
| 348 | #endif |
| 349 | |
| 350 | left = make_range(current.begin(),center); |
| 351 | right = make_range(center,current.end()); |
| 352 | } |
| 353 | |
| 354 | ReductionTy recurse(size_t depth, const range<unsigned>& current, Allocator alloc, bool toplevel) |
| 355 | { |
| 356 | /* get thread local allocator */ |
| 357 | if (!alloc) |
| 358 | alloc = createAllocator(); |
| 359 | |
| 360 | /* call memory monitor function to signal progress */ |
| 361 | if (toplevel && current.size() <= singleThreadThreshold) |
| 362 | progressMonitor(current.size()); |
| 363 | |
| 364 | /* create leaf node */ |
| 365 | if (unlikely(depth+MIN_LARGE_LEAF_LEVELS >= maxDepth || current.size() <= minLeafSize)) |
| 366 | return createLargeLeaf(depth,current,alloc); |
| 367 | |
| 368 | /* fill all children by always splitting the one with the largest surface area */ |
| 369 | range<unsigned> children[MAX_BRANCHING_FACTOR]; |
| 370 | split(current,children[0],children[1]); |
| 371 | size_t numChildren = 2; |
| 372 | |
| 373 | while (numChildren < branchingFactor) |
| 374 | { |
| 375 | /* find best child with largest number of primitives */ |
| 376 | int bestChild = -1; |
| 377 | unsigned bestItems = 0; |
| 378 | for (unsigned int i=0; i<numChildren; i++) |
| 379 | { |
| 380 | /* ignore leaves as they cannot get split */ |
| 381 | if (children[i].size() <= minLeafSize) |
| 382 | continue; |
| 383 | |
| 384 | /* remember child with largest area */ |
| 385 | if (children[i].size() > bestItems) { |
| 386 | bestItems = children[i].size(); |
| 387 | bestChild = i; |
| 388 | } |
| 389 | } |
| 390 | if (bestChild == -1) break; |
| 391 | |
| 392 | /*! split best child into left and right child */ |
| 393 | range<unsigned> left, right; |
| 394 | split(children[bestChild],left,right); |
| 395 | |
| 396 | /* add new children left and right */ |
| 397 | children[bestChild] = children[numChildren-1]; |
| 398 | children[numChildren-1] = left; |
| 399 | children[numChildren+0] = right; |
| 400 | numChildren++; |
| 401 | } |
| 402 | |
| 403 | /* create leaf node if no split is possible */ |
| 404 | if (unlikely(numChildren == 1)) |
| 405 | return createLeaf(current,alloc); |
| 406 | |
| 407 | /* allocate node */ |
| 408 | auto node = createNode(alloc,numChildren); |
| 409 | |
| 410 | /* process top parts of tree parallel */ |
| 411 | ReductionTy bounds[MAX_BRANCHING_FACTOR]; |
| 412 | if (current.size() > singleThreadThreshold) |
| 413 | { |
| 414 | /*! parallel_for is faster than spawning sub-tasks */ |
| 415 | parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) { |
| 416 | for (size_t i=r.begin(); i<r.end(); i++) { |
| 417 | bounds[i] = recurse(depth+1,children[i],nullptr,true); |
| 418 | _mm_mfence(); // to allow non-temporal stores during build |
| 419 | } |
| 420 | }); |
| 421 | } |
| 422 | |
| 423 | /* finish tree sequentially */ |
| 424 | else |
| 425 | { |
| 426 | for (size_t i=0; i<numChildren; i++) |
| 427 | bounds[i] = recurse(depth+1,children[i],alloc,false); |
| 428 | } |
| 429 | |
| 430 | return setBounds(node,bounds,numChildren); |
| 431 | } |
| 432 | |
| 433 | /* build function */ |
| 434 | ReductionTy build(BuildPrim* src, BuildPrim* tmp, size_t numPrimitives) |
| 435 | { |
| 436 | /* sort morton codes */ |
| 437 | morton = src; |
| 438 | radix_sort_u32(src,tmp,numPrimitives,singleThreadThreshold); |
| 439 | |
| 440 | /* build BVH */ |
| 441 | const ReductionTy root = recurse(1, range<unsigned>(0,(unsigned)numPrimitives), nullptr, true); |
| 442 | _mm_mfence(); // to allow non-temporal stores during build |
| 443 | return root; |
| 444 | } |
| 445 | |
| 446 | public: |
| 447 | CreateAllocator& createAllocator; |
| 448 | CreateNodeFunc& createNode; |
| 449 | SetNodeBoundsFunc& setBounds; |
| 450 | CreateLeafFunc& createLeaf; |
| 451 | CalculateBounds& calculateBounds; |
| 452 | ProgressMonitor& progressMonitor; |
| 453 | |
| 454 | public: |
| 455 | BuildPrim* morton; |
| 456 | }; |
| 457 | |
| 458 | |
| 459 | template< |
| 460 | typename ReductionTy, |
| 461 | typename CreateAllocFunc, |
| 462 | typename CreateNodeFunc, |
| 463 | typename SetBoundsFunc, |
| 464 | typename CreateLeafFunc, |
| 465 | typename CalculateBoundsFunc, |
| 466 | typename ProgressMonitor> |
| 467 | |
| 468 | static ReductionTy build(CreateAllocFunc createAllocator, |
| 469 | CreateNodeFunc createNode, |
| 470 | SetBoundsFunc setBounds, |
| 471 | CreateLeafFunc createLeaf, |
| 472 | CalculateBoundsFunc calculateBounds, |
| 473 | ProgressMonitor progressMonitor, |
| 474 | BuildPrim* src, |
| 475 | BuildPrim* tmp, |
| 476 | size_t numPrimitives, |
| 477 | const Settings& settings) |
| 478 | { |
| 479 | typedef BuilderT< |
| 480 | ReductionTy, |
| 481 | decltype(createAllocator()), |
| 482 | CreateAllocFunc, |
| 483 | CreateNodeFunc, |
| 484 | SetBoundsFunc, |
| 485 | CreateLeafFunc, |
| 486 | CalculateBoundsFunc, |
| 487 | ProgressMonitor> Builder; |
| 488 | |
| 489 | Builder builder(createAllocator, |
| 490 | createNode, |
| 491 | setBounds, |
| 492 | createLeaf, |
| 493 | calculateBounds, |
| 494 | progressMonitor, |
| 495 | settings); |
| 496 | |
| 497 | return builder.build(src,tmp,numPrimitives); |
| 498 | } |
| 499 | }; |
| 500 | } |
| 501 | } |
| 502 | |