1// Copyright 2009-2021 Intel Corporation
2// SPDX-License-Identifier: Apache-2.0
3
4#pragma once
5
6#include "../common/builder.h"
7#include "../../common/algorithms/parallel_reduce.h"
8
9namespace embree
10{
11 namespace isa
12 {
13 struct BVHBuilderMorton
14 {
15 static const size_t MAX_BRANCHING_FACTOR = 8; //!< maximum supported BVH branching factor
16 static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree of we are that many levels before the maximum tree depth
17
18 /*! settings for morton builder */
19 struct Settings
20 {
21 /*! default settings */
22 Settings ()
23 : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024) {}
24
25 /*! initialize settings from API settings */
26 Settings (const RTCBuildArguments& settings)
27 : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024)
28 {
29 if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor;
30 if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth )) maxDepth = settings.maxDepth;
31 if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize )) minLeafSize = settings.minLeafSize;
32 if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize )) maxLeafSize = settings.maxLeafSize;
33
34 minLeafSize = min(minLeafSize,maxLeafSize);
35 }
36
37 Settings (size_t branchingFactor, size_t maxDepth, size_t minLeafSize, size_t maxLeafSize, size_t singleThreadThreshold)
38 : branchingFactor(branchingFactor), maxDepth(maxDepth), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), singleThreadThreshold(singleThreadThreshold)
39 {
40 minLeafSize = min(minLeafSize,maxLeafSize);
41 }
42
43 public:
44 size_t branchingFactor; //!< branching factor of BVH to build
45 size_t maxDepth; //!< maximum depth of BVH to build
46 size_t minLeafSize; //!< minimum size of a leaf
47 size_t maxLeafSize; //!< maximum size of a leaf
48 size_t singleThreadThreshold; //!< threshold when we switch to single threaded build
49 };
50
51 /*! Build primitive consisting of morton code and primitive ID. */
52 struct __aligned(8) BuildPrim
53 {
54 union {
55 struct {
56 unsigned int code; //!< morton code
57 unsigned int index; //!< i'th primitive
58 };
59 uint64_t t;
60 };
61
62 /*! interface for radix sort */
63 __forceinline operator unsigned() const { return code; }
64
65 /*! interface for standard sort */
66 __forceinline bool operator<(const BuildPrim &m) const { return code < m.code; }
67 };
68
69 /*! maps bounding box to morton code */
70 struct MortonCodeMapping
71 {
72 static const size_t LATTICE_BITS_PER_DIM = 10;
73 static const size_t LATTICE_SIZE_PER_DIM = size_t(1) << LATTICE_BITS_PER_DIM;
74
75 vfloat4 base;
76 vfloat4 scale;
77
78 __forceinline MortonCodeMapping(const BBox3fa& bounds)
79 {
80 base = (vfloat4)bounds.lower;
81 const vfloat4 diag = (vfloat4)bounds.upper - (vfloat4)bounds.lower;
82 scale = select(diag > vfloat4(1E-19f), rcp(diag) * vfloat4(LATTICE_SIZE_PER_DIM * 0.99f),vfloat4(0.0f));
83 }
84
85 __forceinline const vint4 bin (const BBox3fa& box) const
86 {
87 const vfloat4 lower = (vfloat4)box.lower;
88 const vfloat4 upper = (vfloat4)box.upper;
89 const vfloat4 centroid = lower+upper;
90 return vint4((centroid-base)*scale);
91 }
92
93 __forceinline unsigned int code (const BBox3fa& box) const
94 {
95 const vint4 binID = bin(box);
96 const unsigned int x = extract<0>(binID);
97 const unsigned int y = extract<1>(binID);
98 const unsigned int z = extract<2>(binID);
99 const unsigned int xyz = bitInterleave(x,y,z);
100 return xyz;
101 }
102 };
103
104#if defined (__AVX2__)
105
106 /*! for AVX2 there is a fast scalar bitInterleave */
107 struct MortonCodeGenerator
108 {
109 __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest)
110 : mapping(mapping), dest(dest) {}
111
112 __forceinline void operator() (const BBox3fa& b, const unsigned index)
113 {
114 dest->index = index;
115 dest->code = mapping.code(b);
116 dest++;
117 }
118
119 public:
120 const MortonCodeMapping mapping;
121 BuildPrim* dest;
122 size_t currentID;
123 };
124
125#else
126
127 /*! before AVX2 is it better to use the SSE version of bitInterleave */
128 struct MortonCodeGenerator
129 {
130 __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest)
131 : mapping(mapping), dest(dest), currentID(0), slots(0), ax(0), ay(0), az(0), ai(0) {}
132
133 __forceinline ~MortonCodeGenerator()
134 {
135 if (slots != 0)
136 {
137 const vint4 code = bitInterleave(ax,ay,az);
138 for (size_t i=0; i<slots; i++) {
139 dest[currentID-slots+i].index = ai[i];
140 dest[currentID-slots+i].code = code[i];
141 }
142 }
143 }
144
145 __forceinline void operator() (const BBox3fa& b, const unsigned index)
146 {
147 const vint4 binID = mapping.bin(b);
148 ax[slots] = extract<0>(binID);
149 ay[slots] = extract<1>(binID);
150 az[slots] = extract<2>(binID);
151 ai[slots] = index;
152 slots++;
153 currentID++;
154
155 if (slots == 4)
156 {
157 const vint4 code = bitInterleave(ax,ay,az);
158 vint4::storeu(&dest[currentID-4],unpacklo(code,ai));
159 vint4::storeu(&dest[currentID-2],unpackhi(code,ai));
160 slots = 0;
161 }
162 }
163
164 public:
165 const MortonCodeMapping mapping;
166 BuildPrim* dest;
167 size_t currentID;
168 size_t slots;
169 vint4 ax, ay, az, ai;
170 };
171
172#endif
173
174 template<
175 typename ReductionTy,
176 typename Allocator,
177 typename CreateAllocator,
178 typename CreateNodeFunc,
179 typename SetNodeBoundsFunc,
180 typename CreateLeafFunc,
181 typename CalculateBounds,
182 typename ProgressMonitor>
183
184 class BuilderT : private Settings
185 {
186 ALIGNED_CLASS_(16);
187
188 public:
189
190 BuilderT (CreateAllocator& createAllocator,
191 CreateNodeFunc& createNode,
192 SetNodeBoundsFunc& setBounds,
193 CreateLeafFunc& createLeaf,
194 CalculateBounds& calculateBounds,
195 ProgressMonitor& progressMonitor,
196 const Settings& settings)
197
198 : Settings(settings),
199 createAllocator(createAllocator),
200 createNode(createNode),
201 setBounds(setBounds),
202 createLeaf(createLeaf),
203 calculateBounds(calculateBounds),
204 progressMonitor(progressMonitor),
205 morton(nullptr) {}
206
207 ReductionTy createLargeLeaf(size_t depth, const range<unsigned>& current, Allocator alloc)
208 {
209 /* this should never occur but is a fatal error */
210 if (depth > maxDepth)
211 throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
212
213 /* create leaf for few primitives */
214 if (current.size() <= maxLeafSize)
215 return createLeaf(current,alloc);
216
217 /* fill all children by always splitting the largest one */
218 range<unsigned> children[MAX_BRANCHING_FACTOR];
219 size_t numChildren = 1;
220 children[0] = current;
221
222 do {
223
224 /* find best child with largest number of primitives */
225 size_t bestChild = -1;
226 size_t bestSize = 0;
227 for (size_t i=0; i<numChildren; i++)
228 {
229 /* ignore leaves as they cannot get split */
230 if (children[i].size() <= maxLeafSize)
231 continue;
232
233 /* remember child with largest size */
234 if (children[i].size() > bestSize) {
235 bestSize = children[i].size();
236 bestChild = i;
237 }
238 }
239 if (bestChild == size_t(-1)) break;
240
241 /*! split best child into left and right child */
242 auto split = children[bestChild].split();
243
244 /* add new children left and right */
245 children[bestChild] = children[numChildren-1];
246 children[numChildren-1] = split.first;
247 children[numChildren+0] = split.second;
248 numChildren++;
249
250 } while (numChildren < branchingFactor);
251
252 /* create node */
253 auto node = createNode(alloc,numChildren);
254
255 /* recurse into each child */
256 ReductionTy bounds[MAX_BRANCHING_FACTOR];
257 for (size_t i=0; i<numChildren; i++)
258 bounds[i] = createLargeLeaf(depth+1,children[i],alloc);
259
260 return setBounds(node,bounds,numChildren);
261 }
262
263 /*! recreates morton codes when reaching a region where all codes are identical */
264 __noinline void recreateMortonCodes(const range<unsigned>& current) const
265 {
266 /* fast path for small ranges */
267 if (likely(current.size() < 1024))
268 {
269 /*! recalculate centroid bounds */
270 BBox3fa centBounds(empty);
271 for (size_t i=current.begin(); i<current.end(); i++)
272 centBounds.extend(center2(calculateBounds(morton[i])));
273
274 /* recalculate morton codes */
275 MortonCodeMapping mapping(centBounds);
276 for (size_t i=current.begin(); i<current.end(); i++)
277 morton[i].code = mapping.code(calculateBounds(morton[i]));
278
279 /* sort morton codes */
280 std::sort(morton+current.begin(),morton+current.end());
281 }
282 else
283 {
284 /*! recalculate centroid bounds */
285 auto calculateCentBounds = [&] ( const range<unsigned>& r ) {
286 BBox3fa centBounds = empty;
287 for (size_t i=r.begin(); i<r.end(); i++)
288 centBounds.extend(center2(calculateBounds(morton[i])));
289 return centBounds;
290 };
291 const BBox3fa centBounds = parallel_reduce(current.begin(), current.end(), unsigned(1024),
292 BBox3fa(empty), calculateCentBounds, BBox3fa::merge);
293
294 /* recalculate morton codes */
295 MortonCodeMapping mapping(centBounds);
296 parallel_for(current.begin(), current.end(), unsigned(1024), [&] ( const range<unsigned>& r ) {
297 for (size_t i=r.begin(); i<r.end(); i++) {
298 morton[i].code = mapping.code(calculateBounds(morton[i]));
299 }
300 });
301
302 /*! sort morton codes */
303#if defined(TASKING_TBB)
304 tbb::parallel_sort(morton+current.begin(),morton+current.end());
305#else
306 radixsort32(morton+current.begin(),current.size());
307#endif
308 }
309 }
310
311 __forceinline void split(const range<unsigned>& current, range<unsigned>& left, range<unsigned>& right) const
312 {
313 const unsigned int code_start = morton[current.begin()].code;
314 const unsigned int code_end = morton[current.end()-1].code;
315 unsigned int bitpos = lzcnt(code_start^code_end);
316
317 /* if all items mapped to same morton code, then re-create new morton codes for the items */
318 if (unlikely(bitpos == 32))
319 {
320 recreateMortonCodes(current);
321 const unsigned int code_start = morton[current.begin()].code;
322 const unsigned int code_end = morton[current.end()-1].code;
323 bitpos = lzcnt(code_start^code_end);
324
325 /* if the morton code is still the same, goto fall back split */
326 if (unlikely(bitpos == 32)) {
327 current.split(left,right);
328 return;
329 }
330 }
331
332 /* split the items at the topmost different morton code bit */
333 const unsigned int bitpos_diff = 31-bitpos;
334 const unsigned int bitmask = 1 << bitpos_diff;
335
336 /* find location where bit differs using binary search */
337 unsigned begin = current.begin();
338 unsigned end = current.end();
339 while (begin + 1 != end) {
340 const unsigned mid = (begin+end)/2;
341 const unsigned bit = morton[mid].code & bitmask;
342 if (bit == 0) begin = mid; else end = mid;
343 }
344 unsigned center = end;
345#if defined(DEBUG)
346 for (unsigned int i=begin; i<center; i++) assert((morton[i].code & bitmask) == 0);
347 for (unsigned int i=center; i<end; i++) assert((morton[i].code & bitmask) == bitmask);
348#endif
349
350 left = make_range(current.begin(),center);
351 right = make_range(center,current.end());
352 }
353
354 ReductionTy recurse(size_t depth, const range<unsigned>& current, Allocator alloc, bool toplevel)
355 {
356 /* get thread local allocator */
357 if (!alloc)
358 alloc = createAllocator();
359
360 /* call memory monitor function to signal progress */
361 if (toplevel && current.size() <= singleThreadThreshold)
362 progressMonitor(current.size());
363
364 /* create leaf node */
365 if (unlikely(depth+MIN_LARGE_LEAF_LEVELS >= maxDepth || current.size() <= minLeafSize))
366 return createLargeLeaf(depth,current,alloc);
367
368 /* fill all children by always splitting the one with the largest surface area */
369 range<unsigned> children[MAX_BRANCHING_FACTOR];
370 split(current,children[0],children[1]);
371 size_t numChildren = 2;
372
373 while (numChildren < branchingFactor)
374 {
375 /* find best child with largest number of primitives */
376 int bestChild = -1;
377 unsigned bestItems = 0;
378 for (unsigned int i=0; i<numChildren; i++)
379 {
380 /* ignore leaves as they cannot get split */
381 if (children[i].size() <= minLeafSize)
382 continue;
383
384 /* remember child with largest area */
385 if (children[i].size() > bestItems) {
386 bestItems = children[i].size();
387 bestChild = i;
388 }
389 }
390 if (bestChild == -1) break;
391
392 /*! split best child into left and right child */
393 range<unsigned> left, right;
394 split(children[bestChild],left,right);
395
396 /* add new children left and right */
397 children[bestChild] = children[numChildren-1];
398 children[numChildren-1] = left;
399 children[numChildren+0] = right;
400 numChildren++;
401 }
402
403 /* create leaf node if no split is possible */
404 if (unlikely(numChildren == 1))
405 return createLeaf(current,alloc);
406
407 /* allocate node */
408 auto node = createNode(alloc,numChildren);
409
410 /* process top parts of tree parallel */
411 ReductionTy bounds[MAX_BRANCHING_FACTOR];
412 if (current.size() > singleThreadThreshold)
413 {
414 /*! parallel_for is faster than spawning sub-tasks */
415 parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) {
416 for (size_t i=r.begin(); i<r.end(); i++) {
417 bounds[i] = recurse(depth+1,children[i],nullptr,true);
418 _mm_mfence(); // to allow non-temporal stores during build
419 }
420 });
421 }
422
423 /* finish tree sequentially */
424 else
425 {
426 for (size_t i=0; i<numChildren; i++)
427 bounds[i] = recurse(depth+1,children[i],alloc,false);
428 }
429
430 return setBounds(node,bounds,numChildren);
431 }
432
433 /* build function */
434 ReductionTy build(BuildPrim* src, BuildPrim* tmp, size_t numPrimitives)
435 {
436 /* sort morton codes */
437 morton = src;
438 radix_sort_u32(src,tmp,numPrimitives,singleThreadThreshold);
439
440 /* build BVH */
441 const ReductionTy root = recurse(1, range<unsigned>(0,(unsigned)numPrimitives), nullptr, true);
442 _mm_mfence(); // to allow non-temporal stores during build
443 return root;
444 }
445
446 public:
447 CreateAllocator& createAllocator;
448 CreateNodeFunc& createNode;
449 SetNodeBoundsFunc& setBounds;
450 CreateLeafFunc& createLeaf;
451 CalculateBounds& calculateBounds;
452 ProgressMonitor& progressMonitor;
453
454 public:
455 BuildPrim* morton;
456 };
457
458
459 template<
460 typename ReductionTy,
461 typename CreateAllocFunc,
462 typename CreateNodeFunc,
463 typename SetBoundsFunc,
464 typename CreateLeafFunc,
465 typename CalculateBoundsFunc,
466 typename ProgressMonitor>
467
468 static ReductionTy build(CreateAllocFunc createAllocator,
469 CreateNodeFunc createNode,
470 SetBoundsFunc setBounds,
471 CreateLeafFunc createLeaf,
472 CalculateBoundsFunc calculateBounds,
473 ProgressMonitor progressMonitor,
474 BuildPrim* src,
475 BuildPrim* tmp,
476 size_t numPrimitives,
477 const Settings& settings)
478 {
479 typedef BuilderT<
480 ReductionTy,
481 decltype(createAllocator()),
482 CreateAllocFunc,
483 CreateNodeFunc,
484 SetBoundsFunc,
485 CreateLeafFunc,
486 CalculateBoundsFunc,
487 ProgressMonitor> Builder;
488
489 Builder builder(createAllocator,
490 createNode,
491 setBounds,
492 createLeaf,
493 calculateBounds,
494 progressMonitor,
495 settings);
496
497 return builder.build(src,tmp,numPrimitives);
498 }
499 };
500 }
501}
502