| 1 | // Copyright 2009-2021 Intel Corporation |
| 2 | // SPDX-License-Identifier: Apache-2.0 |
| 3 | |
| 4 | #pragma once |
| 5 | |
| 6 | #include "node_intersector_packet_stream.h" |
| 7 | #include "node_intersector_frustum.h" |
| 8 | #include "bvh_traverser_stream.h" |
| 9 | |
| 10 | namespace embree |
| 11 | { |
| 12 | namespace isa |
| 13 | { |
| 14 | /*! BVH ray stream intersector. */ |
| 15 | template<int N, int types, bool robust, typename PrimitiveIntersector> |
| 16 | class BVHNIntersectorStream |
| 17 | { |
| 18 | /* shortcuts for frequently used types */ |
| 19 | template<int K> using PrimitiveIntersectorK = typename PrimitiveIntersector::template Type<K>; |
| 20 | template<int K> using PrimitiveK = typename PrimitiveIntersectorK<K>::PrimitiveK; |
| 21 | typedef BVHN<N> BVH; |
| 22 | typedef typename BVH::NodeRef NodeRef; |
| 23 | typedef typename BVH::BaseNode BaseNode; |
| 24 | typedef typename BVH::AABBNode AABBNode; |
| 25 | typedef typename BVH::AABBNodeMB AABBNodeMB; |
| 26 | |
| 27 | template<int K> |
| 28 | __forceinline static size_t initPacketsAndFrustum(RayK<K>** inputPackets, size_t numOctantRays, |
| 29 | TravRayKStream<K, robust>* packets, Frustum<robust>& frustum, bool& commonOctant) |
| 30 | { |
| 31 | const size_t numPackets = (numOctantRays+K-1)/K; |
| 32 | |
| 33 | Vec3vf<K> tmp_min_rdir(pos_inf); |
| 34 | Vec3vf<K> tmp_max_rdir(neg_inf); |
| 35 | Vec3vf<K> tmp_min_org(pos_inf); |
| 36 | Vec3vf<K> tmp_max_org(neg_inf); |
| 37 | vfloat<K> tmp_min_dist(pos_inf); |
| 38 | vfloat<K> tmp_max_dist(neg_inf); |
| 39 | |
| 40 | size_t m_active = 0; |
| 41 | for (size_t i = 0; i < numPackets; i++) |
| 42 | { |
| 43 | const vfloat<K> tnear = inputPackets[i]->tnear(); |
| 44 | const vfloat<K> tfar = inputPackets[i]->tfar; |
| 45 | vbool<K> m_valid = (tnear <= tfar) & (tnear >= 0.0f); |
| 46 | |
| 47 | #if defined(EMBREE_IGNORE_INVALID_RAYS) |
| 48 | m_valid &= inputPackets[i]->valid(); |
| 49 | #endif |
| 50 | |
| 51 | m_active |= (size_t)movemask(m_valid) << (i*K); |
| 52 | |
| 53 | vfloat<K> packet_min_dist = max(tnear, 0.0f); |
| 54 | vfloat<K> packet_max_dist = select(m_valid, tfar, neg_inf); |
| 55 | tmp_min_dist = min(tmp_min_dist, packet_min_dist); |
| 56 | tmp_max_dist = max(tmp_max_dist, packet_max_dist); |
| 57 | |
| 58 | const Vec3vf<K>& org = inputPackets[i]->org; |
| 59 | const Vec3vf<K>& dir = inputPackets[i]->dir; |
| 60 | |
| 61 | new (&packets[i]) TravRayKStream<K, robust>(org, dir, packet_min_dist, packet_max_dist); |
| 62 | |
| 63 | tmp_min_rdir = min(tmp_min_rdir, select(m_valid, packets[i].rdir, Vec3vf<K>(pos_inf))); |
| 64 | tmp_max_rdir = max(tmp_max_rdir, select(m_valid, packets[i].rdir, Vec3vf<K>(neg_inf))); |
| 65 | tmp_min_org = min(tmp_min_org , select(m_valid,org , Vec3vf<K>(pos_inf))); |
| 66 | tmp_max_org = max(tmp_max_org , select(m_valid,org , Vec3vf<K>(neg_inf))); |
| 67 | } |
| 68 | |
| 69 | m_active &= (numOctantRays == (8 * sizeof(size_t))) ? (size_t)-1 : (((size_t)1 << numOctantRays)-1); |
| 70 | |
| 71 | |
| 72 | const Vec3fa reduced_min_rdir(reduce_min(tmp_min_rdir.x), |
| 73 | reduce_min(tmp_min_rdir.y), |
| 74 | reduce_min(tmp_min_rdir.z)); |
| 75 | |
| 76 | const Vec3fa reduced_max_rdir(reduce_max(tmp_max_rdir.x), |
| 77 | reduce_max(tmp_max_rdir.y), |
| 78 | reduce_max(tmp_max_rdir.z)); |
| 79 | |
| 80 | const Vec3fa reduced_min_origin(reduce_min(tmp_min_org.x), |
| 81 | reduce_min(tmp_min_org.y), |
| 82 | reduce_min(tmp_min_org.z)); |
| 83 | |
| 84 | const Vec3fa reduced_max_origin(reduce_max(tmp_max_org.x), |
| 85 | reduce_max(tmp_max_org.y), |
| 86 | reduce_max(tmp_max_org.z)); |
| 87 | |
| 88 | commonOctant = |
| 89 | (reduced_max_rdir.x < 0.0f || reduced_min_rdir.x >= 0.0f) && |
| 90 | (reduced_max_rdir.y < 0.0f || reduced_min_rdir.y >= 0.0f) && |
| 91 | (reduced_max_rdir.z < 0.0f || reduced_min_rdir.z >= 0.0f); |
| 92 | |
| 93 | const float frustum_min_dist = reduce_min(tmp_min_dist); |
| 94 | const float frustum_max_dist = reduce_max(tmp_max_dist); |
| 95 | |
| 96 | frustum.init(reduced_min_origin, reduced_max_origin, |
| 97 | reduced_min_rdir, reduced_max_rdir, |
| 98 | frustum_min_dist, frustum_max_dist, |
| 99 | N); |
| 100 | |
| 101 | return m_active; |
| 102 | } |
| 103 | |
| 104 | template<int K> |
| 105 | __forceinline static size_t intersectAABBNodePacket(size_t m_active, |
| 106 | const TravRayKStream<K,robust>* packets, |
| 107 | const AABBNode* __restrict__ node, |
| 108 | size_t boxID, |
| 109 | const NearFarPrecalculations& nf) |
| 110 | { |
| 111 | assert(m_active); |
| 112 | const size_t startPacketID = bsf(m_active) / K; |
| 113 | const size_t endPacketID = bsr(m_active) / K; |
| 114 | size_t m_trav_active = 0; |
| 115 | for (size_t i = startPacketID; i <= endPacketID; i++) |
| 116 | { |
| 117 | const size_t m_hit = intersectNodeK<N>(node, boxID, packets[i], nf); |
| 118 | m_trav_active |= m_hit << (i*K); |
| 119 | } |
| 120 | return m_trav_active; |
| 121 | } |
| 122 | |
| 123 | template<int K> |
| 124 | __forceinline static size_t traverseCoherentStream(size_t m_active, |
| 125 | TravRayKStream<K, robust>* packets, |
| 126 | const AABBNode* __restrict__ node, |
| 127 | const Frustum<robust>& frustum, |
| 128 | size_t* maskK, |
| 129 | vfloat<N>& dist) |
| 130 | { |
| 131 | size_t m_node_hit = intersectNodeFrustum<N>(node, frustum, dist); |
| 132 | const size_t first_index = bsf(m_active); |
| 133 | const size_t first_packetID = first_index / K; |
| 134 | const size_t first_rayID = first_index % K; |
| 135 | size_t m_first_hit = intersectNode1<N>(node, packets[first_packetID], first_rayID, frustum.nf); |
| 136 | |
| 137 | /* this make traversal independent of the ordering of rays */ |
| 138 | size_t m_node = m_node_hit ^ m_first_hit; |
| 139 | while (unlikely(m_node)) |
| 140 | { |
| 141 | const size_t boxID = bscf(m_node); |
| 142 | const size_t m_current = m_active & intersectAABBNodePacket(m_active, packets, node, boxID, frustum.nf); |
| 143 | m_node_hit ^= m_current ? (size_t)0 : ((size_t)1 << boxID); |
| 144 | maskK[boxID] = m_current; |
| 145 | } |
| 146 | return m_node_hit; |
| 147 | } |
| 148 | |
// TODO: explicit 16-wide path for KNL
/*! Tests every active ray of the stream, one at a time, against all N
 *  child boxes of a node (variant taking TravRayKStreamFast packets).
 *
 *  \param m_active   bit mask of the rays to test; must be non-zero. Ray r
 *                    is lane r % K of packets[r / K].
 *  \param packets    traversal packets holding the per-ray data
 *  \param node       node whose child bounds are tested
 *  \param nf         byte offsets, relative to node->lower_x, of the bound
 *                    arrays used as near and far plane for each axis
 *  \param shiftTable per-ray value that is ORed into the result on a hit
 *                    (presumably shiftTable[r] == 1 << r -- confirm against
 *                    the callers)
 *  \return one integer per child: the OR of shiftTable[r] over all tested
 *          rays r for which tNear <= tFar holds for that child */
template<int K>
__forceinline static vint<N> traverseIncoherentStream(size_t m_active,
                                                      TravRayKStreamFast<K>* __restrict__ packets,
                                                      const AABBNode* __restrict__ node,
                                                      const NearFarPrecalculations& nf,
                                                      const int shiftTable[32])
{
  /* load the bounds of all N children once; the near/far arrays are
     addressed through the byte offsets stored in nf */
  const vfloat<N> bminX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX));
  const vfloat<N> bminY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY));
  const vfloat<N> bminZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ));
  const vfloat<N> bmaxX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX));
  const vfloat<N> bmaxY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
  const vfloat<N> bmaxZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
  assert(m_active);
  vint<N> vmask(zero);
  do
  {
    STAT3(shadow.trav_nodes,1,1,1);
    /* take the next ray index out of m_active; the loop ends once
       m_active becomes zero, so bscf has to clear the returned bit */
    const size_t rayID = bscf(m_active);
    assert(rayID < MAX_INTERNAL_STREAM_SIZE);
    TravRayKStream<K,robust> &p = packets[rayID / K];
    const size_t i = rayID % K;
    const vint<N> bitmask(shiftTable[rayID]);

    /* per-axis distances to the near and far planes of all N children;
       the aarch64 path uses madd with neg_org_rdir where the generic path
       uses msub with org_rdir */
#if defined (__aarch64__)
    const vfloat<N> tNearX = madd(bminX, p.rdir.x[i], p.neg_org_rdir.x[i]);
    const vfloat<N> tNearY = madd(bminY, p.rdir.y[i], p.neg_org_rdir.y[i]);
    const vfloat<N> tNearZ = madd(bminZ, p.rdir.z[i], p.neg_org_rdir.z[i]);
    const vfloat<N> tFarX = madd(bmaxX, p.rdir.x[i], p.neg_org_rdir.x[i]);
    const vfloat<N> tFarY = madd(bmaxY, p.rdir.y[i], p.neg_org_rdir.y[i]);
    const vfloat<N> tFarZ = madd(bmaxZ, p.rdir.z[i], p.neg_org_rdir.z[i]);
#else
    const vfloat<N> tNearX = msub(bminX, p.rdir.x[i], p.org_rdir.x[i]);
    const vfloat<N> tNearY = msub(bminY, p.rdir.y[i], p.org_rdir.y[i]);
    const vfloat<N> tNearZ = msub(bminZ, p.rdir.z[i], p.org_rdir.z[i]);
    const vfloat<N> tFarX = msub(bmaxX, p.rdir.x[i], p.org_rdir.x[i]);
    const vfloat<N> tFarY = msub(bmaxY, p.rdir.y[i], p.org_rdir.y[i]);
    const vfloat<N> tFarZ = msub(bmaxZ, p.rdir.z[i], p.org_rdir.z[i]);
#endif

    /* clip the slab interval against the ray's own [tnear, tfar] */
    const vfloat<N> tNear = maxi(tNearX, tNearY, tNearZ, vfloat<N>(p.tnear[i]));
    const vfloat<N> tFar = mini(tFarX , tFarY , tFarZ, vfloat<N>(p.tfar[i]));

    /* a child is hit if the clipped interval is non-empty */
    const vbool<N> hit_mask = tNear <= tFar;
    /* OR this ray's bitmask into the lanes of the children it hits.
       NOTE(review): the AVX2 form relies on vint<N>(hit_mask) being
       all-ones in hit lanes -- confirm */
#if defined(__AVX2__)
    vmask = vmask | (bitmask & vint<N>(hit_mask));
#else
    vmask = select(hit_mask, vmask | bitmask, vmask);
#endif
  } while(m_active);
  return vmask;
}

| 202 | |
/*! Tests every active ray of the stream, one at a time, against all N
 *  child boxes of a node (variant taking TravRayKStreamRobust packets).
 *  Distances are computed as (bound - org) * rdir and both interval ends
 *  are scaled by a factor close to one before they are compared.
 *
 *  \param m_active   bit mask of the rays to test; must be non-zero. Ray r
 *                    is lane r % K of packets[r / K].
 *  \param packets    traversal packets holding the per-ray data
 *  \param node       node whose child bounds are tested
 *  \param nf         byte offsets, relative to node->lower_x, of the bound
 *                    arrays used as near and far plane for each axis
 *  \param shiftTable per-ray value that is ORed into the result on a hit
 *                    (presumably shiftTable[r] == 1 << r -- confirm against
 *                    the callers)
 *  \return one integer per child: the OR of shiftTable[r] over all tested
 *          rays r that pass the scaled interval test for that child */
template<int K>
__forceinline static vint<N> traverseIncoherentStream(size_t m_active,
                                                      TravRayKStreamRobust<K>* __restrict__ packets,
                                                      const AABBNode* __restrict__ node,
                                                      const NearFarPrecalculations& nf,
                                                      const int shiftTable[32])
{
  /* load the bounds of all N children once; the near/far arrays are
     addressed through the byte offsets stored in nf */
  const vfloat<N> bminX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX));
  const vfloat<N> bminY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY));
  const vfloat<N> bminZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ));
  const vfloat<N> bmaxX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX));
  const vfloat<N> bmaxY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
  const vfloat<N> bmaxZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
  assert(m_active);
  vint<N> vmask(zero);
  do
  {
    STAT3(shadow.trav_nodes,1,1,1);
    /* take the next ray index out of m_active; the loop ends once
       m_active becomes zero, so bscf has to clear the returned bit */
    const size_t rayID = bscf(m_active);
    assert(rayID < MAX_INTERNAL_STREAM_SIZE);
    TravRayKStream<K,robust> &p = packets[rayID / K];
    const size_t i = rayID % K;
    const vint<N> bitmask(shiftTable[rayID]);
    /* per-axis distances to the near and far planes of all N children,
       computed without a fused multiply-add */
    const vfloat<N> tNearX = (bminX - p.org.x[i]) * p.rdir.x[i];
    const vfloat<N> tNearY = (bminY - p.org.y[i]) * p.rdir.y[i];
    const vfloat<N> tNearZ = (bminZ - p.org.z[i]) * p.rdir.z[i];
    const vfloat<N> tFarX = (bmaxX - p.org.x[i]) * p.rdir.x[i];
    const vfloat<N> tFarY = (bmaxY - p.org.y[i]) * p.rdir.y[i];
    const vfloat<N> tFarZ = (bmaxZ - p.org.z[i]) * p.rdir.z[i];
    /* clip the slab interval against the ray's own [tnear, tfar] */
    const vfloat<N> tNear = maxi(tNearX, tNearY, tNearZ, vfloat<N>(p.tnear[i]));
    const vfloat<N> tFar = mini(tFarX , tFarY , tFarZ, vfloat<N>(p.tfar[i]));
    /* scale tNear by (1 - 2 ulp) and tFar by (1 + 2 ulp) before comparing;
       presumably this keeps the test conservative under rounding error */
    const float round_down = 1.0f-2.0f*float(ulp);
    const float round_up = 1.0f+2.0f*float(ulp);
    const vbool<N> hit_mask = round_down*tNear <= round_up*tFar;
    /* OR this ray's bitmask into the lanes of the children it hits.
       NOTE(review): the AVX2 form relies on vint<N>(hit_mask) being
       all-ones in hit lanes -- confirm */
#if defined(__AVX2__)
    vmask = vmask | (bitmask & vint<N>(hit_mask));
#else
    vmask = select(hit_mask, vmask | bitmask, vmask);
#endif
  } while(m_active);
  return vmask;
}

| 245 | |
| 246 | |
/* 1 + (N-1) entries per BVH level up to BVH::maxDepth; presumably the
   capacity of a single-ray traversal stack (not used in this header) */
static const size_t stackSizeSingle = 1+(N-1)*BVH::maxDepth;

public:
/* stream entry points; only declared here, the definitions are not part
   of this header */
static void intersect(Accel::Intersectors* This, RayHitN** inputRays, size_t numRays, IntersectContext* context);
static void occluded (Accel::Intersectors* This, RayN** inputRays, size_t numRays, IntersectContext* context);

private:
/* K-wide implementations taking RayHitK<K> / RayK<K> packets; judging by
   their names they cover the coherent and incoherent stream cases */
template<int K>
static void intersectCoherent(Accel::Intersectors* This, RayHitK<K>** inputRays, size_t numRays, IntersectContext* context);

template<int K>
static void occludedCoherent(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context);

template<int K>
static void occludedIncoherent(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context);
| 262 | }; |
| 263 | |
| 264 | |
/*! BVH ray stream intersector with direct fallback to packets.
 *  Declares the same intersect/occluded entry points as
 *  BVHNIntersectorStream; the definitions are not part of this header. */
template<int N>
class BVHNIntersectorStreamPacketFallback
{
public:
  /* stream entry points taking RayHitN / RayN streams */
  static void intersect(Accel::Intersectors* This, RayHitN** inputRays, size_t numRays, IntersectContext* context);
  static void occluded (Accel::Intersectors* This, RayN** inputRays, size_t numRays, IntersectContext* context);

private:
  /* K-wide implementations taking RayHitK<K> / RayK<K> packets */
  template<int K>
  static void intersectK(Accel::Intersectors* This, RayHitK<K>** inputRays, size_t numRays, IntersectContext* context);

  template<int K>
  static void occludedK(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context);
};
| 280 | } |
| 281 | } |
| 282 | |