// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "node_intersector_packet_stream.h"
#include "node_intersector_frustum.h"
#include "bvh_traverser_stream.h"

namespace embree
{
  namespace isa
  {
    /*! BVH ray stream intersector. */
    template<int N, int types, bool robust, typename PrimitiveIntersector>
    class BVHNIntersectorStream
    {
      /* shortcuts for frequently used types */
      template<int K> using PrimitiveIntersectorK = typename PrimitiveIntersector::template Type<K>;
      template<int K> using PrimitiveK = typename PrimitiveIntersectorK<K>::PrimitiveK;
      typedef BVHN<N> BVH;
      typedef typename BVH::NodeRef NodeRef;
      typedef typename BVH::BaseNode BaseNode;
      typedef typename BVH::AABBNode AABBNode;
      typedef typename BVH::AABBNodeMB AABBNodeMB;

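      /*! Initializes the K-wide traversal packets from the input ray packets, reduces their
       *  origins, reciprocal directions, and near/far distances into a shared frustum, sets
       *  commonOctant if all valid rays point into the same direction octant, and returns
       *  the bitmask of active rays. */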
      template<int K>
      __forceinline static size_t initPacketsAndFrustum(RayK<K>** inputPackets, size_t numOctantRays,
                                                        TravRayKStream<K, robust>* packets, Frustum<robust>& frustum, bool& commonOctant)
      {
        const size_t numPackets = (numOctantRays+K-1)/K;

        Vec3vf<K> tmp_min_rdir(pos_inf);
        Vec3vf<K> tmp_max_rdir(neg_inf);
        Vec3vf<K> tmp_min_org(pos_inf);
        Vec3vf<K> tmp_max_org(neg_inf);
        vfloat<K> tmp_min_dist(pos_inf);
        vfloat<K> tmp_max_dist(neg_inf);

        size_t m_active = 0;
        for (size_t i = 0; i < numPackets; i++)
        {
          const vfloat<K> tnear = inputPackets[i]->tnear();
          const vfloat<K> tfar  = inputPackets[i]->tfar;
          vbool<K> m_valid = (tnear <= tfar) & (tnear >= 0.0f);

#if defined(EMBREE_IGNORE_INVALID_RAYS)
          m_valid &= inputPackets[i]->valid();
#endif

          m_active |= (size_t)movemask(m_valid) << (i*K);

          vfloat<K> packet_min_dist = max(tnear, 0.0f);
          vfloat<K> packet_max_dist = select(m_valid, tfar, neg_inf);
          tmp_min_dist = min(tmp_min_dist, packet_min_dist);
          tmp_max_dist = max(tmp_max_dist, packet_max_dist);

          const Vec3vf<K>& org = inputPackets[i]->org;
          const Vec3vf<K>& dir = inputPackets[i]->dir;

          new (&packets[i]) TravRayKStream<K, robust>(org, dir, packet_min_dist, packet_max_dist);

          tmp_min_rdir = min(tmp_min_rdir, select(m_valid, packets[i].rdir, Vec3vf<K>(pos_inf)));
          tmp_max_rdir = max(tmp_max_rdir, select(m_valid, packets[i].rdir, Vec3vf<K>(neg_inf)));
          tmp_min_org  = min(tmp_min_org , select(m_valid, org, Vec3vf<K>(pos_inf)));
          tmp_max_org  = max(tmp_max_org , select(m_valid, org, Vec3vf<K>(neg_inf)));
        }

        m_active &= (numOctantRays == (8 * sizeof(size_t))) ? (size_t)-1 : (((size_t)1 << numOctantRays)-1);

        const Vec3fa reduced_min_rdir(reduce_min(tmp_min_rdir.x),
                                      reduce_min(tmp_min_rdir.y),
                                      reduce_min(tmp_min_rdir.z));

        const Vec3fa reduced_max_rdir(reduce_max(tmp_max_rdir.x),
                                      reduce_max(tmp_max_rdir.y),
                                      reduce_max(tmp_max_rdir.z));

        const Vec3fa reduced_min_origin(reduce_min(tmp_min_org.x),
                                        reduce_min(tmp_min_org.y),
                                        reduce_min(tmp_min_org.z));

        const Vec3fa reduced_max_origin(reduce_max(tmp_max_org.x),
                                        reduce_max(tmp_max_org.y),
                                        reduce_max(tmp_max_org.z));

        commonOctant =
          (reduced_max_rdir.x < 0.0f || reduced_min_rdir.x >= 0.0f) &&
          (reduced_max_rdir.y < 0.0f || reduced_min_rdir.y >= 0.0f) &&
          (reduced_max_rdir.z < 0.0f || reduced_min_rdir.z >= 0.0f);

        const float frustum_min_dist = reduce_min(tmp_min_dist);
        const float frustum_max_dist = reduce_max(tmp_max_dist);

        frustum.init(reduced_min_origin, reduced_max_origin,
                     reduced_min_rdir, reduced_max_rdir,
                     frustum_min_dist, frustum_max_dist,
                     N);

        return m_active;
      }

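      /*! Tests all packets that may contain active rays against one child box of an AABB
       *  node and returns the per-ray hit mask for that box. */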
      template<int K>
      __forceinline static size_t intersectAABBNodePacket(size_t m_active,
                                                          const TravRayKStream<K,robust>* packets,
                                                          const AABBNode* __restrict__ node,
                                                          size_t boxID,
                                                          const NearFarPrecalculations& nf)
      {
        assert(m_active);
        const size_t startPacketID = bsf(m_active) / K;
        const size_t endPacketID   = bsr(m_active) / K;
        size_t m_trav_active = 0;
        for (size_t i = startPacketID; i <= endPacketID; i++)
        {
          const size_t m_hit = intersectNodeK<N>(node, boxID, packets[i], nf);
          m_trav_active |= m_hit << (i*K);
        }
        return m_trav_active;
      }

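      /*! Coherent traversal of a single node: culls the children against the shared frustum
       *  and the first active ray, then refines the per-ray masks in maskK for children where
       *  the two tests disagree. Returns the mask of children that remain to be traversed. */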
      template<int K>
      __forceinline static size_t traverseCoherentStream(size_t m_active,
                                                          TravRayKStream<K, robust>* packets,
                                                          const AABBNode* __restrict__ node,
                                                          const Frustum<robust>& frustum,
                                                          size_t* maskK,
                                                          vfloat<N>& dist)
      {
        size_t m_node_hit = intersectNodeFrustum<N>(node, frustum, dist);
        const size_t first_index    = bsf(m_active);
        const size_t first_packetID = first_index / K;
        const size_t first_rayID    = first_index % K;
        size_t m_first_hit = intersectNode1<N>(node, packets[first_packetID], first_rayID, frustum.nf);

        /* this makes traversal independent of the ordering of rays */
        size_t m_node = m_node_hit ^ m_first_hit;
        while (unlikely(m_node))
        {
          const size_t boxID = bscf(m_node);
          const size_t m_current = m_active & intersectAABBNodePacket(m_active, packets, node, boxID, frustum.nf);
          m_node_hit ^= m_current ? (size_t)0 : ((size_t)1 << boxID);
          maskK[boxID] = m_current;
        }
        return m_node_hit;
      }

      // TODO: explicit 16-wide path for KNL
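      /*! Incoherent traversal step (fast path): intersects each active ray individually with
       *  all children of the node and accumulates per child a bitmask of the rays that hit
       *  it, using the precomputed shiftTable. */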
      template<int K>
      __forceinline static vint<N> traverseIncoherentStream(size_t m_active,
                                                             TravRayKStreamFast<K>* __restrict__ packets,
                                                             const AABBNode* __restrict__ node,
                                                             const NearFarPrecalculations& nf,
                                                             const int shiftTable[32])
      {
        const vfloat<N> bminX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX));
        const vfloat<N> bminY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY));
        const vfloat<N> bminZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ));
        const vfloat<N> bmaxX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX));
        const vfloat<N> bmaxY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
        const vfloat<N> bmaxZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
        assert(m_active);
        vint<N> vmask(zero);
        do
        {
          STAT3(shadow.trav_nodes,1,1,1);
          const size_t rayID = bscf(m_active);
          assert(rayID < MAX_INTERNAL_STREAM_SIZE);
          TravRayKStream<K,robust> &p = packets[rayID / K];
          const size_t i = rayID % K;
          const vint<N> bitmask(shiftTable[rayID]);

#if defined(__aarch64__)
          const vfloat<N> tNearX = madd(bminX, p.rdir.x[i], p.neg_org_rdir.x[i]);
          const vfloat<N> tNearY = madd(bminY, p.rdir.y[i], p.neg_org_rdir.y[i]);
          const vfloat<N> tNearZ = madd(bminZ, p.rdir.z[i], p.neg_org_rdir.z[i]);
          const vfloat<N> tFarX  = madd(bmaxX, p.rdir.x[i], p.neg_org_rdir.x[i]);
          const vfloat<N> tFarY  = madd(bmaxY, p.rdir.y[i], p.neg_org_rdir.y[i]);
          const vfloat<N> tFarZ  = madd(bmaxZ, p.rdir.z[i], p.neg_org_rdir.z[i]);
#else
          const vfloat<N> tNearX = msub(bminX, p.rdir.x[i], p.org_rdir.x[i]);
          const vfloat<N> tNearY = msub(bminY, p.rdir.y[i], p.org_rdir.y[i]);
          const vfloat<N> tNearZ = msub(bminZ, p.rdir.z[i], p.org_rdir.z[i]);
          const vfloat<N> tFarX  = msub(bmaxX, p.rdir.x[i], p.org_rdir.x[i]);
          const vfloat<N> tFarY  = msub(bmaxY, p.rdir.y[i], p.org_rdir.y[i]);
          const vfloat<N> tFarZ  = msub(bmaxZ, p.rdir.z[i], p.org_rdir.z[i]);
#endif

          const vfloat<N> tNear = maxi(tNearX, tNearY, tNearZ, vfloat<N>(p.tnear[i]));
          const vfloat<N> tFar  = mini(tFarX , tFarY , tFarZ , vfloat<N>(p.tfar[i]));

          const vbool<N> hit_mask = tNear <= tFar;
#if defined(__AVX2__)
          vmask = vmask | (bitmask & vint<N>(hit_mask));
#else
          vmask = select(hit_mask, vmask | bitmask, vmask);
#endif
        } while(m_active);
        return vmask;
      }

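      /*! Incoherent traversal step (robust variant): like the fast path, but recomputes the
       *  slab distances as (box - org) * rdir and widens the accepted [tNear,tFar] interval
       *  by a few ulp so the test stays conservative. */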
      template<int K>
      __forceinline static vint<N> traverseIncoherentStream(size_t m_active,
                                                             TravRayKStreamRobust<K>* __restrict__ packets,
                                                             const AABBNode* __restrict__ node,
                                                             const NearFarPrecalculations& nf,
                                                             const int shiftTable[32])
      {
        const vfloat<N> bminX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX));
        const vfloat<N> bminY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY));
        const vfloat<N> bminZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ));
        const vfloat<N> bmaxX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX));
        const vfloat<N> bmaxY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
        const vfloat<N> bmaxZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
        assert(m_active);
        vint<N> vmask(zero);
        do
        {
          STAT3(shadow.trav_nodes,1,1,1);
          const size_t rayID = bscf(m_active);
          assert(rayID < MAX_INTERNAL_STREAM_SIZE);
          TravRayKStream<K,robust> &p = packets[rayID / K];
          const size_t i = rayID % K;
          const vint<N> bitmask(shiftTable[rayID]);
          const vfloat<N> tNearX = (bminX - p.org.x[i]) * p.rdir.x[i];
          const vfloat<N> tNearY = (bminY - p.org.y[i]) * p.rdir.y[i];
          const vfloat<N> tNearZ = (bminZ - p.org.z[i]) * p.rdir.z[i];
          const vfloat<N> tFarX  = (bmaxX - p.org.x[i]) * p.rdir.x[i];
          const vfloat<N> tFarY  = (bmaxY - p.org.y[i]) * p.rdir.y[i];
          const vfloat<N> tFarZ  = (bmaxZ - p.org.z[i]) * p.rdir.z[i];
          const vfloat<N> tNear  = maxi(tNearX, tNearY, tNearZ, vfloat<N>(p.tnear[i]));
          const vfloat<N> tFar   = mini(tFarX , tFarY , tFarZ , vfloat<N>(p.tfar[i]));
          const float round_down = 1.0f-2.0f*float(ulp);
          const float round_up   = 1.0f+2.0f*float(ulp);
          const vbool<N> hit_mask = round_down*tNear <= round_up*tFar;
#if defined(__AVX2__)
          vmask = vmask | (bitmask & vint<N>(hit_mask));
#else
          vmask = select(hit_mask, vmask | bitmask, vmask);
#endif
        } while(m_active);
        return vmask;
      }

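      /*! stack size for single-ray traversal */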
      static const size_t stackSizeSingle = 1+(N-1)*BVH::maxDepth;

    public:
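      /*! intersects/occludes a stream of rays with the BVH */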
      static void intersect(Accel::Intersectors* This, RayHitN** inputRays, size_t numRays, IntersectContext* context);
      static void occluded (Accel::Intersectors* This, RayN** inputRays, size_t numRays, IntersectContext* context);

    private:
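      /*! internal traversal variants specialized for coherent and incoherent ray streams */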
      template<int K>
      static void intersectCoherent(Accel::Intersectors* This, RayHitK<K>** inputRays, size_t numRays, IntersectContext* context);

      template<int K>
      static void occludedCoherent(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context);

      template<int K>
      static void occludedIncoherent(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context);
    };

    /*! BVH ray stream intersector with direct fallback to packets. */
    template<int N>
    class BVHNIntersectorStreamPacketFallback
    {
    public:
      static void intersect(Accel::Intersectors* This, RayHitN** inputRays, size_t numRays, IntersectContext* context);
      static void occluded (Accel::Intersectors* This, RayN** inputRays, size_t numRays, IntersectContext* context);

    private:
      template<int K>
      static void intersectK(Accel::Intersectors* This, RayHitK<K>** inputRays, size_t numRays, IntersectContext* context);

      template<int K>
      static void occludedK(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context);
    };
  }
}