1 | // Copyright 2009-2021 Intel Corporation |
2 | // SPDX-License-Identifier: Apache-2.0 |
3 | |
4 | #pragma once |
5 | |
6 | #include "node_intersector_packet_stream.h" |
7 | #include "node_intersector_frustum.h" |
8 | #include "bvh_traverser_stream.h" |
9 | |
10 | namespace embree |
11 | { |
12 | namespace isa |
13 | { |
14 | /*! BVH ray stream intersector. */ |
15 | template<int N, int types, bool robust, typename PrimitiveIntersector> |
16 | class BVHNIntersectorStream |
17 | { |
18 | /* shortcuts for frequently used types */ |
19 | template<int K> using PrimitiveIntersectorK = typename PrimitiveIntersector::template Type<K>; |
20 | template<int K> using PrimitiveK = typename PrimitiveIntersectorK<K>::PrimitiveK; |
21 | typedef BVHN<N> BVH; |
22 | typedef typename BVH::NodeRef NodeRef; |
23 | typedef typename BVH::BaseNode BaseNode; |
24 | typedef typename BVH::AABBNode AABBNode; |
25 | typedef typename BVH::AABBNodeMB AABBNodeMB; |
26 | |
27 | template<int K> |
28 | __forceinline static size_t initPacketsAndFrustum(RayK<K>** inputPackets, size_t numOctantRays, |
29 | TravRayKStream<K, robust>* packets, Frustum<robust>& frustum, bool& commonOctant) |
30 | { |
31 | const size_t numPackets = (numOctantRays+K-1)/K; |
32 | |
33 | Vec3vf<K> tmp_min_rdir(pos_inf); |
34 | Vec3vf<K> tmp_max_rdir(neg_inf); |
35 | Vec3vf<K> tmp_min_org(pos_inf); |
36 | Vec3vf<K> tmp_max_org(neg_inf); |
37 | vfloat<K> tmp_min_dist(pos_inf); |
38 | vfloat<K> tmp_max_dist(neg_inf); |
39 | |
40 | size_t m_active = 0; |
41 | for (size_t i = 0; i < numPackets; i++) |
42 | { |
43 | const vfloat<K> tnear = inputPackets[i]->tnear(); |
44 | const vfloat<K> tfar = inputPackets[i]->tfar; |
45 | vbool<K> m_valid = (tnear <= tfar) & (tnear >= 0.0f); |
46 | |
47 | #if defined(EMBREE_IGNORE_INVALID_RAYS) |
48 | m_valid &= inputPackets[i]->valid(); |
49 | #endif |
50 | |
51 | m_active |= (size_t)movemask(m_valid) << (i*K); |
52 | |
53 | vfloat<K> packet_min_dist = max(tnear, 0.0f); |
54 | vfloat<K> packet_max_dist = select(m_valid, tfar, neg_inf); |
55 | tmp_min_dist = min(tmp_min_dist, packet_min_dist); |
56 | tmp_max_dist = max(tmp_max_dist, packet_max_dist); |
57 | |
58 | const Vec3vf<K>& org = inputPackets[i]->org; |
59 | const Vec3vf<K>& dir = inputPackets[i]->dir; |
60 | |
61 | new (&packets[i]) TravRayKStream<K, robust>(org, dir, packet_min_dist, packet_max_dist); |
62 | |
63 | tmp_min_rdir = min(tmp_min_rdir, select(m_valid, packets[i].rdir, Vec3vf<K>(pos_inf))); |
64 | tmp_max_rdir = max(tmp_max_rdir, select(m_valid, packets[i].rdir, Vec3vf<K>(neg_inf))); |
65 | tmp_min_org = min(tmp_min_org , select(m_valid,org , Vec3vf<K>(pos_inf))); |
66 | tmp_max_org = max(tmp_max_org , select(m_valid,org , Vec3vf<K>(neg_inf))); |
67 | } |
68 | |
69 | m_active &= (numOctantRays == (8 * sizeof(size_t))) ? (size_t)-1 : (((size_t)1 << numOctantRays)-1); |
70 | |
71 | |
72 | const Vec3fa reduced_min_rdir(reduce_min(tmp_min_rdir.x), |
73 | reduce_min(tmp_min_rdir.y), |
74 | reduce_min(tmp_min_rdir.z)); |
75 | |
76 | const Vec3fa reduced_max_rdir(reduce_max(tmp_max_rdir.x), |
77 | reduce_max(tmp_max_rdir.y), |
78 | reduce_max(tmp_max_rdir.z)); |
79 | |
80 | const Vec3fa reduced_min_origin(reduce_min(tmp_min_org.x), |
81 | reduce_min(tmp_min_org.y), |
82 | reduce_min(tmp_min_org.z)); |
83 | |
84 | const Vec3fa reduced_max_origin(reduce_max(tmp_max_org.x), |
85 | reduce_max(tmp_max_org.y), |
86 | reduce_max(tmp_max_org.z)); |
87 | |
88 | commonOctant = |
89 | (reduced_max_rdir.x < 0.0f || reduced_min_rdir.x >= 0.0f) && |
90 | (reduced_max_rdir.y < 0.0f || reduced_min_rdir.y >= 0.0f) && |
91 | (reduced_max_rdir.z < 0.0f || reduced_min_rdir.z >= 0.0f); |
92 | |
93 | const float frustum_min_dist = reduce_min(tmp_min_dist); |
94 | const float frustum_max_dist = reduce_max(tmp_max_dist); |
95 | |
96 | frustum.init(reduced_min_origin, reduced_max_origin, |
97 | reduced_min_rdir, reduced_max_rdir, |
98 | frustum_min_dist, frustum_max_dist, |
99 | N); |
100 | |
101 | return m_active; |
102 | } |
103 | |
104 | template<int K> |
105 | __forceinline static size_t intersectAABBNodePacket(size_t m_active, |
106 | const TravRayKStream<K,robust>* packets, |
107 | const AABBNode* __restrict__ node, |
108 | size_t boxID, |
109 | const NearFarPrecalculations& nf) |
110 | { |
111 | assert(m_active); |
112 | const size_t startPacketID = bsf(m_active) / K; |
113 | const size_t endPacketID = bsr(m_active) / K; |
114 | size_t m_trav_active = 0; |
115 | for (size_t i = startPacketID; i <= endPacketID; i++) |
116 | { |
117 | const size_t m_hit = intersectNodeK<N>(node, boxID, packets[i], nf); |
118 | m_trav_active |= m_hit << (i*K); |
119 | } |
120 | return m_trav_active; |
121 | } |
122 | |
123 | template<int K> |
124 | __forceinline static size_t traverseCoherentStream(size_t m_active, |
125 | TravRayKStream<K, robust>* packets, |
126 | const AABBNode* __restrict__ node, |
127 | const Frustum<robust>& frustum, |
128 | size_t* maskK, |
129 | vfloat<N>& dist) |
130 | { |
131 | size_t m_node_hit = intersectNodeFrustum<N>(node, frustum, dist); |
132 | const size_t first_index = bsf(m_active); |
133 | const size_t first_packetID = first_index / K; |
134 | const size_t first_rayID = first_index % K; |
135 | size_t m_first_hit = intersectNode1<N>(node, packets[first_packetID], first_rayID, frustum.nf); |
136 | |
137 | /* this make traversal independent of the ordering of rays */ |
138 | size_t m_node = m_node_hit ^ m_first_hit; |
139 | while (unlikely(m_node)) |
140 | { |
141 | const size_t boxID = bscf(m_node); |
142 | const size_t m_current = m_active & intersectAABBNodePacket(m_active, packets, node, boxID, frustum.nf); |
143 | m_node_hit ^= m_current ? (size_t)0 : ((size_t)1 << boxID); |
144 | maskK[boxID] = m_current; |
145 | } |
146 | return m_node_hit; |
147 | } |
148 | |
149 | // TODO: explicit 16-wide path for KNL |
150 | template<int K> |
151 | __forceinline static vint<N> traverseIncoherentStream(size_t m_active, |
152 | TravRayKStreamFast<K>* __restrict__ packets, |
153 | const AABBNode* __restrict__ node, |
154 | const NearFarPrecalculations& nf, |
155 | const int shiftTable[32]) |
156 | { |
157 | const vfloat<N> bminX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX)); |
158 | const vfloat<N> bminY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY)); |
159 | const vfloat<N> bminZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ)); |
160 | const vfloat<N> bmaxX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX)); |
161 | const vfloat<N> bmaxY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY)); |
162 | const vfloat<N> bmaxZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ)); |
163 | assert(m_active); |
164 | vint<N> vmask(zero); |
165 | do |
166 | { |
167 | STAT3(shadow.trav_nodes,1,1,1); |
168 | const size_t rayID = bscf(m_active); |
169 | assert(rayID < MAX_INTERNAL_STREAM_SIZE); |
170 | TravRayKStream<K,robust> &p = packets[rayID / K]; |
171 | const size_t i = rayID % K; |
172 | const vint<N> bitmask(shiftTable[rayID]); |
173 | |
174 | #if defined (__aarch64__) |
175 | const vfloat<N> tNearX = madd(bminX, p.rdir.x[i], p.neg_org_rdir.x[i]); |
176 | const vfloat<N> tNearY = madd(bminY, p.rdir.y[i], p.neg_org_rdir.y[i]); |
177 | const vfloat<N> tNearZ = madd(bminZ, p.rdir.z[i], p.neg_org_rdir.z[i]); |
178 | const vfloat<N> tFarX = madd(bmaxX, p.rdir.x[i], p.neg_org_rdir.x[i]); |
179 | const vfloat<N> tFarY = madd(bmaxY, p.rdir.y[i], p.neg_org_rdir.y[i]); |
180 | const vfloat<N> tFarZ = madd(bmaxZ, p.rdir.z[i], p.neg_org_rdir.z[i]); |
181 | #else |
182 | const vfloat<N> tNearX = msub(bminX, p.rdir.x[i], p.org_rdir.x[i]); |
183 | const vfloat<N> tNearY = msub(bminY, p.rdir.y[i], p.org_rdir.y[i]); |
184 | const vfloat<N> tNearZ = msub(bminZ, p.rdir.z[i], p.org_rdir.z[i]); |
185 | const vfloat<N> tFarX = msub(bmaxX, p.rdir.x[i], p.org_rdir.x[i]); |
186 | const vfloat<N> tFarY = msub(bmaxY, p.rdir.y[i], p.org_rdir.y[i]); |
187 | const vfloat<N> tFarZ = msub(bmaxZ, p.rdir.z[i], p.org_rdir.z[i]); |
188 | #endif |
189 | |
190 | const vfloat<N> tNear = maxi(tNearX, tNearY, tNearZ, vfloat<N>(p.tnear[i])); |
191 | const vfloat<N> tFar = mini(tFarX , tFarY , tFarZ, vfloat<N>(p.tfar[i])); |
192 | |
193 | const vbool<N> hit_mask = tNear <= tFar; |
194 | #if defined(__AVX2__) |
195 | vmask = vmask | (bitmask & vint<N>(hit_mask)); |
196 | #else |
197 | vmask = select(hit_mask, vmask | bitmask, vmask); |
198 | #endif |
199 | } while(m_active); |
200 | return vmask; |
201 | } |
202 | |
203 | template<int K> |
204 | __forceinline static vint<N> traverseIncoherentStream(size_t m_active, |
205 | TravRayKStreamRobust<K>* __restrict__ packets, |
206 | const AABBNode* __restrict__ node, |
207 | const NearFarPrecalculations& nf, |
208 | const int shiftTable[32]) |
209 | { |
210 | const vfloat<N> bminX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX)); |
211 | const vfloat<N> bminY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY)); |
212 | const vfloat<N> bminZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ)); |
213 | const vfloat<N> bmaxX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX)); |
214 | const vfloat<N> bmaxY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY)); |
215 | const vfloat<N> bmaxZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ)); |
216 | assert(m_active); |
217 | vint<N> vmask(zero); |
218 | do |
219 | { |
220 | STAT3(shadow.trav_nodes,1,1,1); |
221 | const size_t rayID = bscf(m_active); |
222 | assert(rayID < MAX_INTERNAL_STREAM_SIZE); |
223 | TravRayKStream<K,robust> &p = packets[rayID / K]; |
224 | const size_t i = rayID % K; |
225 | const vint<N> bitmask(shiftTable[rayID]); |
226 | const vfloat<N> tNearX = (bminX - p.org.x[i]) * p.rdir.x[i]; |
227 | const vfloat<N> tNearY = (bminY - p.org.y[i]) * p.rdir.y[i]; |
228 | const vfloat<N> tNearZ = (bminZ - p.org.z[i]) * p.rdir.z[i]; |
229 | const vfloat<N> tFarX = (bmaxX - p.org.x[i]) * p.rdir.x[i]; |
230 | const vfloat<N> tFarY = (bmaxY - p.org.y[i]) * p.rdir.y[i]; |
231 | const vfloat<N> tFarZ = (bmaxZ - p.org.z[i]) * p.rdir.z[i]; |
232 | const vfloat<N> tNear = maxi(tNearX, tNearY, tNearZ, vfloat<N>(p.tnear[i])); |
233 | const vfloat<N> tFar = mini(tFarX , tFarY , tFarZ, vfloat<N>(p.tfar[i])); |
234 | const float round_down = 1.0f-2.0f*float(ulp); |
235 | const float round_up = 1.0f+2.0f*float(ulp); |
236 | const vbool<N> hit_mask = round_down*tNear <= round_up*tFar; |
237 | #if defined(__AVX2__) |
238 | vmask = vmask | (bitmask & vint<N>(hit_mask)); |
239 | #else |
240 | vmask = select(hit_mask, vmask | bitmask, vmask); |
241 | #endif |
242 | } while(m_active); |
243 | return vmask; |
244 | } |
245 | |
246 | |
247 | static const size_t stackSizeSingle = 1+(N-1)*BVH::maxDepth; |
248 | |
249 | public: |
250 | static void intersect(Accel::Intersectors* This, RayHitN** inputRays, size_t numRays, IntersectContext* context); |
251 | static void occluded (Accel::Intersectors* This, RayN** inputRays, size_t numRays, IntersectContext* context); |
252 | |
253 | private: |
254 | template<int K> |
255 | static void intersectCoherent(Accel::Intersectors* This, RayHitK<K>** inputRays, size_t numRays, IntersectContext* context); |
256 | |
257 | template<int K> |
258 | static void occludedCoherent(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context); |
259 | |
260 | template<int K> |
261 | static void occludedIncoherent(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context); |
262 | }; |
263 | |
264 | |
265 | /*! BVH ray stream intersector with direct fallback to packets. */ |
266 | template<int N> |
267 | class BVHNIntersectorStreamPacketFallback |
268 | { |
269 | public: |
270 | static void intersect(Accel::Intersectors* This, RayHitN** inputRays, size_t numRays, IntersectContext* context); |
271 | static void occluded (Accel::Intersectors* This, RayN** inputRays, size_t numRays, IntersectContext* context); |
272 | |
273 | private: |
274 | template<int K> |
275 | static void intersectK(Accel::Intersectors* This, RayHitK<K>** inputRays, size_t numRays, IntersectContext* context); |
276 | |
277 | template<int K> |
278 | static void occludedK(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context); |
279 | }; |
280 | } |
281 | } |
282 | |