1// Copyright 2009-2021 Intel Corporation
2// SPDX-License-Identifier: Apache-2.0
3
4#pragma once
5
6#include "bvh.h"
7#include "node_intersector1.h"
8#include "../common/stack_item.h"
9
10#define NEW_SORTING_CODE 1
11
12namespace embree
13{
14 namespace isa
15 {
16 /*! BVH regular node traversal for single rays. */
17 template<int N, int types>
18 class BVHNNodeTraverser1Hit;
19
20#if defined(__AVX512VL__) // SKX
21
22 template<int N>
23 __forceinline void isort_update(vint<N> &dist, const vint<N> &d)
24 {
25 const vint<N> dist_shift = align_shift_right<N-1>(dist,dist);
26 const vboolf<N> m_geq = d >= dist;
27 const vboolf<N> m_geq_shift = m_geq << 1;
28 dist = select(m_geq,d,dist);
29 dist = select(m_geq_shift,dist_shift,dist);
30 }
31
32 template<int N>
33 __forceinline void isort_quick_update(vint<N> &dist, const vint<N> &d) {
34 dist = align_shift_right<N-1>(dist,permute(d,vint<N>(zero)));
35 }
36
37 __forceinline size_t permuteExtract(const vint8& index, const vllong4& n0, const vllong4& n1) {
38 return toScalar(permutex2var((__m256i)index,n0,n1));
39 }
40
41 __forceinline float permuteExtract(const vint8& index, const vfloat8& n) {
42 return toScalar(permute(n,index));
43 }
44
45#endif
46
47 /* Specialization for BVH4. */
48 template<int types>
49 class BVHNNodeTraverser1Hit<4, types>
50 {
51 typedef BVH4 BVH;
52 typedef BVH4::NodeRef NodeRef;
53 typedef BVH4::BaseNode BaseNode;
54
55
56 public:
57 /* Traverses a node with at least one hit child. Optimized for finding the closest hit (intersection). */
58 static __forceinline void traverseClosestHit(NodeRef& cur,
59 size_t mask,
60 const vfloat4& tNear,
61 StackItemT<NodeRef>*& stackPtr,
62 StackItemT<NodeRef>* stackEnd)
63 {
64 assert(mask != 0);
65 const BaseNode* node = cur.baseNode();
66
67 /*! one child is hit, continue with that child */
68 size_t r = bscf(mask);
69 cur = node->child(r);
70 BVH::prefetch(cur,types);
71 if (likely(mask == 0)) {
72 assert(cur != BVH::emptyNode);
73 return;
74 }
75
76 /*! two children are hit, push far child, and continue with closer child */
77 NodeRef c0 = cur;
78 const unsigned int d0 = ((unsigned int*)&tNear)[r];
79 r = bscf(mask);
80 NodeRef c1 = node->child(r);
81 BVH::prefetch(c1,types);
82 const unsigned int d1 = ((unsigned int*)&tNear)[r];
83 assert(c0 != BVH::emptyNode);
84 assert(c1 != BVH::emptyNode);
85 if (likely(mask == 0)) {
86 assert(stackPtr < stackEnd);
87 if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; return; }
88 else { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; return; }
89 }
90
91#if NEW_SORTING_CODE == 1
92 vint4 s0((size_t)c0,(size_t)d0);
93 vint4 s1((size_t)c1,(size_t)d1);
94 r = bscf(mask);
95 NodeRef c2 = node->child(r); BVH::prefetch(c2,types); unsigned int d2 = ((unsigned int*)&tNear)[r];
96 vint4 s2((size_t)c2,(size_t)d2);
97 /* 3 hits */
98 if (likely(mask == 0)) {
99 StackItemT<NodeRef>::sort3(s0,s1,s2);
100 *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1;
101 cur = toSizeT(s2);
102 stackPtr+=2;
103 return;
104 }
105 r = bscf(mask);
106 NodeRef c3 = node->child(r); BVH::prefetch(c3,types); unsigned int d3 = ((unsigned int*)&tNear)[r];
107 vint4 s3((size_t)c3,(size_t)d3);
108 /* 4 hits */
109 StackItemT<NodeRef>::sort4(s0,s1,s2,s3);
110 *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2;
111 cur = toSizeT(s3);
112 stackPtr+=3;
113#else
114 /*! Here starts the slow path for 3 or 4 hit children. We push
115 * all nodes onto the stack to sort them there. */
116 assert(stackPtr < stackEnd);
117 stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++;
118 assert(stackPtr < stackEnd);
119 stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++;
120
121 /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */
122 assert(stackPtr < stackEnd);
123 r = bscf(mask);
124 NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
125 assert(c != BVH::emptyNode);
126 if (likely(mask == 0)) {
127 sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]);
128 cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
129 return;
130 }
131
132 /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */
133 assert(stackPtr < stackEnd);
134 r = bscf(mask);
135 c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
136 assert(c != BVH::emptyNode);
137 sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]);
138 cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
139#endif
140 }
141
142 /* Traverses a node with at least one hit child. Optimized for finding any hit (occlusion). */
143 static __forceinline void traverseAnyHit(NodeRef& cur,
144 size_t mask,
145 const vfloat4& tNear,
146 NodeRef*& stackPtr,
147 NodeRef* stackEnd)
148 {
149 const BaseNode* node = cur.baseNode();
150
151 /*! one child is hit, continue with that child */
152 size_t r = bscf(mask);
153 cur = node->child(r);
154 BVH::prefetch(cur,types);
155
156 /* simpler in sequence traversal order */
157 assert(cur != BVH::emptyNode);
158 if (likely(mask == 0)) return;
159 assert(stackPtr < stackEnd);
160 *stackPtr = cur; stackPtr++;
161
162 for (; ;)
163 {
164 r = bscf(mask);
165 cur = node->child(r); BVH::prefetch(cur,types);
166 assert(cur != BVH::emptyNode);
167 if (likely(mask == 0)) return;
168 assert(stackPtr < stackEnd);
169 *stackPtr = cur; stackPtr++;
170 }
171 }
172 };
173
174 /* Specialization for BVH8. */
175 template<int types>
176 class BVHNNodeTraverser1Hit<8, types>
177 {
178 typedef BVH8 BVH;
179 typedef BVH8::NodeRef NodeRef;
180 typedef BVH8::BaseNode BaseNode;
181
182#if defined(__AVX512VL__)
183 template<class NodeRef, class BaseNode>
184 static __forceinline void traverseClosestHitAVX512VL8(NodeRef& cur,
185 size_t mask,
186 const vfloat8& tNear,
187 StackItemT<NodeRef>*& stackPtr,
188 StackItemT<NodeRef>* stackEnd)
189 {
190 assert(mask != 0);
191 const BaseNode* node = cur.baseNode();
192 const vllong4 n0 = vllong4::loadu((vllong4*)&node->children[0]);
193 const vllong4 n1 = vllong4::loadu((vllong4*)&node->children[4]);
194 vint8 distance_i = (asInt(tNear) & 0xfffffff8) | vint8(step);
195 distance_i = vint8::compact((int)mask,distance_i,distance_i);
196 cur = permuteExtract(distance_i,n0,n1);
197 BVH::prefetch(cur,types);
198
199 mask &= mask-1;
200 if (likely(mask == 0)) return;
201
202 /* 2 hits: order A0 B0 */
203 const vint8 d0(distance_i);
204 const vint8 d1(shuffle<1>(distance_i));
205 cur = permuteExtract(d1,n0,n1);
206 BVH::prefetch(cur,types);
207
208 const vint8 dist_A0 = min(d0, d1);
209 const vint8 dist_B0 = max(d0, d1);
210 assert(dist_A0[0] < dist_B0[0]);
211
212 mask &= mask-1;
213 if (likely(mask == 0)) {
214 cur = permuteExtract(dist_A0,n0,n1);
215 stackPtr[0].ptr = permuteExtract(dist_B0,n0,n1);
216 *(float*)&stackPtr[0].dist = permuteExtract(dist_B0,tNear);
217 stackPtr++;
218 return;
219 }
220
221 /* 3 hits: order A1 B1 C1 */
222
223 const vint8 d2(shuffle<2>(distance_i));
224 cur = permuteExtract(d2,n0,n1);
225 BVH::prefetch(cur,types);
226
227 const vint8 dist_A1 = min(dist_A0,d2);
228 const vint8 dist_tmp_B1 = max(dist_A0,d2);
229 const vint8 dist_B1 = min(dist_B0,dist_tmp_B1);
230 const vint8 dist_C1 = max(dist_B0,dist_tmp_B1);
231 assert(dist_A1[0] < dist_B1[0]);
232 assert(dist_B1[0] < dist_C1[0]);
233
234 mask &= mask-1;
235 if (likely(mask == 0)) {
236 cur = permuteExtract(dist_A1,n0,n1);
237 stackPtr[0].ptr = permuteExtract(dist_C1,n0,n1);
238 *(float*)&stackPtr[0].dist = permuteExtract(dist_C1,tNear);
239 stackPtr[1].ptr = permuteExtract(dist_B1,n0,n1);
240 *(float*)&stackPtr[1].dist = permuteExtract(dist_B1,tNear);
241 stackPtr+=2;
242 return;
243 }
244
245 /* 4 hits: order A2 B2 C2 D2 */
246
247 const vint8 d3(shuffle<3>(distance_i));
248 cur = permuteExtract(d3,n0,n1);
249 BVH::prefetch(cur,types);
250
251 const vint8 dist_A2 = min(dist_A1,d3);
252 const vint8 dist_tmp_B2 = max(dist_A1,d3);
253 const vint8 dist_B2 = min(dist_B1,dist_tmp_B2);
254 const vint8 dist_tmp_C2 = max(dist_B1,dist_tmp_B2);
255 const vint8 dist_C2 = min(dist_C1,dist_tmp_C2);
256 const vint8 dist_D2 = max(dist_C1,dist_tmp_C2);
257 assert(dist_A2[0] < dist_B2[0]);
258 assert(dist_B2[0] < dist_C2[0]);
259 assert(dist_C2[0] < dist_D2[0]);
260
261 mask &= mask-1;
262 if (likely(mask == 0)) {
263 cur = permuteExtract(dist_A2,n0,n1);
264 stackPtr[0].ptr = permuteExtract(dist_D2,n0,n1);
265 *(float*)&stackPtr[0].dist = permuteExtract(dist_D2,tNear);
266 stackPtr[1].ptr = permuteExtract(dist_C2,n0,n1);
267 *(float*)&stackPtr[1].dist = permuteExtract(dist_C2,tNear);
268 stackPtr[2].ptr = permuteExtract(dist_B2,n0,n1);
269 *(float*)&stackPtr[2].dist = permuteExtract(dist_B2,tNear);
270 stackPtr+=3;
271 return;
272 }
273
274 /* >=5 hits: reverse to descending order for writing to stack */
275
276 distance_i = align_shift_right<3>(distance_i,distance_i);
277 const size_t hits = 4 + popcnt(mask);
278 vint8 dist(INT_MIN); // this will work with -0.0f (0x80000000) as distance, isort_update uses >= to insert
279
280 isort_quick_update<8>(dist,dist_A2);
281 isort_quick_update<8>(dist,dist_B2);
282 isort_quick_update<8>(dist,dist_C2);
283 isort_quick_update<8>(dist,dist_D2);
284
285 do {
286
287 distance_i = align_shift_right<1>(distance_i,distance_i);
288 cur = permuteExtract(distance_i,n0,n1);
289 BVH::prefetch(cur,types);
290 const vint8 new_dist(permute(distance_i,vint8(zero)));
291 mask &= mask-1;
292 isort_update<8>(dist,new_dist);
293
294 } while(mask);
295
296 for (size_t i=0; i<7; i++)
297 assert(dist[i+0]>=dist[i+1]);
298
299 for (size_t i=0;i<hits-1;i++)
300 {
301 stackPtr->ptr = permuteExtract(dist,n0,n1);
302 *(float*)&stackPtr->dist = permuteExtract(dist,tNear);
303 dist = align_shift_right<1>(dist,dist);
304 stackPtr++;
305 }
306 cur = permuteExtract(dist,n0,n1);
307 }
308#endif
309
310 public:
311 static __forceinline void traverseClosestHit(NodeRef& cur,
312 size_t mask,
313 const vfloat8& tNear,
314 StackItemT<NodeRef>*& stackPtr,
315 StackItemT<NodeRef>* stackEnd)
316 {
317 assert(mask != 0);
318#if defined(__AVX512VL__)
319 traverseClosestHitAVX512VL8<NodeRef,BaseNode>(cur,mask,tNear,stackPtr,stackEnd);
320#else
321
322 const BaseNode* node = cur.baseNode();
323
324 /*! one child is hit, continue with that child */
325 size_t r = bscf(mask);
326 cur = node->child(r);
327 BVH::prefetch(cur,types);
328 if (likely(mask == 0)) {
329 assert(cur != BVH::emptyNode);
330 return;
331 }
332
333 /*! two children are hit, push far child, and continue with closer child */
334 NodeRef c0 = cur;
335 const unsigned int d0 = ((unsigned int*)&tNear)[r];
336 r = bscf(mask);
337 NodeRef c1 = node->child(r);
338 BVH::prefetch(c1,types);
339 const unsigned int d1 = ((unsigned int*)&tNear)[r];
340
341 assert(c0 != BVH::emptyNode);
342 assert(c1 != BVH::emptyNode);
343 if (likely(mask == 0)) {
344 assert(stackPtr < stackEnd);
345 if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; return; }
346 else { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; return; }
347 }
348#if NEW_SORTING_CODE == 1
349 vint4 s0((size_t)c0,(size_t)d0);
350 vint4 s1((size_t)c1,(size_t)d1);
351
352 r = bscf(mask);
353 NodeRef c2 = node->child(r); BVH::prefetch(c2,types); unsigned int d2 = ((unsigned int*)&tNear)[r];
354 vint4 s2((size_t)c2,(size_t)d2);
355 /* 3 hits */
356 if (likely(mask == 0)) {
357 StackItemT<NodeRef>::sort3(s0,s1,s2);
358 *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1;
359 cur = toSizeT(s2);
360 stackPtr+=2;
361 return;
362 }
363 r = bscf(mask);
364 NodeRef c3 = node->child(r); BVH::prefetch(c3,types); unsigned int d3 = ((unsigned int*)&tNear)[r];
365 vint4 s3((size_t)c3,(size_t)d3);
366 /* 4 hits */
367 if (likely(mask == 0)) {
368 StackItemT<NodeRef>::sort4(s0,s1,s2,s3);
369 *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2;
370 cur = toSizeT(s3);
371 stackPtr+=3;
372 return;
373 }
374 *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2; *(vint4*)&stackPtr[3] = s3;
375 /*! fallback case if more than 4 children are hit */
376 StackItemT<NodeRef>* stackFirst = stackPtr;
377 stackPtr+=4;
378 while (1)
379 {
380 assert(stackPtr < stackEnd);
381 r = bscf(mask);
382 NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = *(unsigned int*)&tNear[r];
383 const vint4 s((size_t)c,(size_t)d);
384 *(vint4*)stackPtr++ = s;
385 assert(c != BVH::emptyNode);
386 if (unlikely(mask == 0)) break;
387 }
388 sort(stackFirst,stackPtr);
389 cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
390#else
391 /*! Here starts the slow path for 3 or 4 hit children. We push
392 * all nodes onto the stack to sort them there. */
393 assert(stackPtr < stackEnd);
394 stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++;
395 assert(stackPtr < stackEnd);
396 stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++;
397
398 /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */
399 assert(stackPtr < stackEnd);
400 r = bscf(mask);
401 NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
402 assert(c != BVH::emptyNode);
403 if (likely(mask == 0)) {
404 sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]);
405 cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
406 return;
407 }
408
409 /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */
410 assert(stackPtr < stackEnd);
411 r = bscf(mask);
412 c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
413 assert(c != BVH::emptyNode);
414 if (likely(mask == 0)) {
415 sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]);
416 cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
417 return;
418 }
419 /*! fallback case if more than 4 children are hit */
420 StackItemT<NodeRef>* stackFirst = stackPtr-4;
421 while (1)
422 {
423 assert(stackPtr < stackEnd);
424 r = bscf(mask);
425 c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
426 assert(c != BVH::emptyNode);
427 if (unlikely(mask == 0)) break;
428 }
429 sort(stackFirst,stackPtr);
430 cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
431#endif
432#endif
433 }
434
435 static __forceinline void traverseAnyHit(NodeRef& cur,
436 size_t mask,
437 const vfloat8& tNear,
438 NodeRef*& stackPtr,
439 NodeRef* stackEnd)
440 {
441 const BaseNode* node = cur.baseNode();
442
443 /*! one child is hit, continue with that child */
444 size_t r = bscf(mask);
445 cur = node->child(r);
446 BVH::prefetch(cur,types);
447
448 /* simpler in sequence traversal order */
449 assert(cur != BVH::emptyNode);
450 if (likely(mask == 0)) return;
451 assert(stackPtr < stackEnd);
452 *stackPtr = cur; stackPtr++;
453
454 for (; ;)
455 {
456 r = bscf(mask);
457 cur = node->child(r); BVH::prefetch(cur,types);
458 assert(cur != BVH::emptyNode);
459 if (likely(mask == 0)) return;
460 assert(stackPtr < stackEnd);
461 *stackPtr = cur; stackPtr++;
462 }
463 }
464 };
465 }
466}
467