| 1 | // Copyright 2009-2021 Intel Corporation |
| 2 | // SPDX-License-Identifier: Apache-2.0 |
| 3 | |
| 4 | #pragma once |
| 5 | |
| 6 | #include "bvh_node_base.h" |
| 7 | |
| 8 | namespace embree |
| 9 | { |
| 10 | /*! BVHN Quantized Node */ |
| 11 | template<int N> |
| 12 | struct __aligned(8) QuantizedBaseNode_t |
| 13 | { |
| 14 | typedef unsigned char T; |
| 15 | static const T MIN_QUAN = 0; |
| 16 | static const T MAX_QUAN = 255; |
| 17 | |
| 18 | /*! Clears the node. */ |
| 19 | __forceinline void clear() { |
| 20 | for (size_t i=0; i<N; i++) lower_x[i] = lower_y[i] = lower_z[i] = MAX_QUAN; |
| 21 | for (size_t i=0; i<N; i++) upper_x[i] = upper_y[i] = upper_z[i] = MIN_QUAN; |
| 22 | } |
| 23 | |
| 24 | /*! Returns bounds of specified child. */ |
| 25 | __forceinline BBox3fa bounds(size_t i) const |
| 26 | { |
| 27 | assert(i < N); |
| 28 | const Vec3fa lower(madd(scale.x,(float)lower_x[i],start.x), |
| 29 | madd(scale.y,(float)lower_y[i],start.y), |
| 30 | madd(scale.z,(float)lower_z[i],start.z)); |
| 31 | const Vec3fa upper(madd(scale.x,(float)upper_x[i],start.x), |
| 32 | madd(scale.y,(float)upper_y[i],start.y), |
| 33 | madd(scale.z,(float)upper_z[i],start.z)); |
| 34 | return BBox3fa(lower,upper); |
| 35 | } |
| 36 | |
| 37 | /*! Returns extent of bounds of specified child. */ |
| 38 | __forceinline Vec3fa extent(size_t i) const { |
| 39 | return bounds(i).size(); |
| 40 | } |
| 41 | |
| 42 | static __forceinline void init_dim(const vfloat<N> &lower, |
| 43 | const vfloat<N> &upper, |
| 44 | T lower_quant[N], |
| 45 | T upper_quant[N], |
| 46 | float &start, |
| 47 | float &scale) |
| 48 | { |
| 49 | /* quantize bounds */ |
| 50 | const vbool<N> m_valid = lower != vfloat<N>(pos_inf); |
| 51 | const float minF = reduce_min(lower); |
| 52 | const float maxF = reduce_max(upper); |
| 53 | float diff = (1.0f+2.0f*float(ulp))*(maxF - minF); |
| 54 | float decode_scale = diff / float(MAX_QUAN); |
| 55 | if (decode_scale == 0.0f) decode_scale = 2.0f*FLT_MIN; // result may have been flushed to zero |
| 56 | assert(madd(decode_scale,float(MAX_QUAN),minF) >= maxF); |
| 57 | const float encode_scale = diff > 0 ? (float(MAX_QUAN) / diff) : 0.0f; |
| 58 | vint<N> ilower = max(vint<N>(floor((lower - vfloat<N>(minF))*vfloat<N>(encode_scale))),MIN_QUAN); |
| 59 | vint<N> iupper = min(vint<N>(ceil ((upper - vfloat<N>(minF))*vfloat<N>(encode_scale))),MAX_QUAN); |
| 60 | |
| 61 | /* lower/upper correction */ |
| 62 | vbool<N> m_lower_correction = (madd(vfloat<N>(ilower),decode_scale,minF)) > lower; |
| 63 | vbool<N> m_upper_correction = (madd(vfloat<N>(iupper),decode_scale,minF)) < upper; |
| 64 | ilower = max(select(m_lower_correction,ilower-1,ilower),MIN_QUAN); |
| 65 | iupper = min(select(m_upper_correction,iupper+1,iupper),MAX_QUAN); |
| 66 | |
| 67 | /* disable invalid lanes */ |
| 68 | ilower = select(m_valid,ilower,MAX_QUAN); |
| 69 | iupper = select(m_valid,iupper,MIN_QUAN); |
| 70 | |
| 71 | /* store as uchar to memory */ |
| 72 | vint<N>::store(lower_quant,ilower); |
| 73 | vint<N>::store(upper_quant,iupper); |
| 74 | start = minF; |
| 75 | scale = decode_scale; |
| 76 | |
| 77 | #if defined(DEBUG) |
| 78 | vfloat<N> extract_lower( vint<N>::loadu(lower_quant) ); |
| 79 | vfloat<N> extract_upper( vint<N>::loadu(upper_quant) ); |
| 80 | vfloat<N> final_extract_lower = madd(extract_lower,decode_scale,minF); |
| 81 | vfloat<N> final_extract_upper = madd(extract_upper,decode_scale,minF); |
| 82 | assert( (movemask(final_extract_lower <= lower ) & movemask(m_valid)) == movemask(m_valid)); |
| 83 | assert( (movemask(final_extract_upper >= upper ) & movemask(m_valid)) == movemask(m_valid)); |
| 84 | #endif |
| 85 | } |
| 86 | |
| 87 | __forceinline void init_dim(AABBNode_t<NodeRefPtr<N>,N>& node) |
| 88 | { |
| 89 | init_dim(node.lower_x,node.upper_x,lower_x,upper_x,start.x,scale.x); |
| 90 | init_dim(node.lower_y,node.upper_y,lower_y,upper_y,start.y,scale.y); |
| 91 | init_dim(node.lower_z,node.upper_z,lower_z,upper_z,start.z,scale.z); |
| 92 | } |
| 93 | |
| 94 | __forceinline vbool<N> validMask() const { return vint<N>::loadu(lower_x) <= vint<N>::loadu(upper_x); } |
| 95 | |
| 96 | #if defined(__AVX512F__) // KNL |
| 97 | __forceinline vbool16 validMask16() const { return le(0xff,vint<16>::loadu(lower_x),vint<16>::loadu(upper_x)); } |
| 98 | #endif |
| 99 | __forceinline vfloat<N> dequantizeLowerX() const { return madd(vfloat<N>(vint<N>::loadu(lower_x)),scale.x,vfloat<N>(start.x)); } |
| 100 | |
| 101 | __forceinline vfloat<N> dequantizeUpperX() const { return madd(vfloat<N>(vint<N>::loadu(upper_x)),scale.x,vfloat<N>(start.x)); } |
| 102 | |
| 103 | __forceinline vfloat<N> dequantizeLowerY() const { return madd(vfloat<N>(vint<N>::loadu(lower_y)),scale.y,vfloat<N>(start.y)); } |
| 104 | |
| 105 | __forceinline vfloat<N> dequantizeUpperY() const { return madd(vfloat<N>(vint<N>::loadu(upper_y)),scale.y,vfloat<N>(start.y)); } |
| 106 | |
| 107 | __forceinline vfloat<N> dequantizeLowerZ() const { return madd(vfloat<N>(vint<N>::loadu(lower_z)),scale.z,vfloat<N>(start.z)); } |
| 108 | |
| 109 | __forceinline vfloat<N> dequantizeUpperZ() const { return madd(vfloat<N>(vint<N>::loadu(upper_z)),scale.z,vfloat<N>(start.z)); } |
| 110 | |
| 111 | template <int M> |
| 112 | __forceinline vfloat<M> dequantize(const size_t offset) const { return vfloat<M>(vint<M>::loadu(all_planes+offset)); } |
| 113 | |
| 114 | #if defined(__AVX512F__) |
| 115 | __forceinline vfloat16 dequantizeLowerUpperX(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_x),p)),scale.x,vfloat16(start.x)); } |
| 116 | __forceinline vfloat16 dequantizeLowerUpperY(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_y),p)),scale.y,vfloat16(start.y)); } |
| 117 | __forceinline vfloat16 dequantizeLowerUpperZ(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_z),p)),scale.z,vfloat16(start.z)); } |
| 118 | #endif |
| 119 | |
| 120 | union { |
| 121 | struct { |
| 122 | T lower_x[N]; //!< 8bit discretized X dimension of lower bounds of all N children |
| 123 | T upper_x[N]; //!< 8bit discretized X dimension of upper bounds of all N children |
| 124 | T lower_y[N]; //!< 8bit discretized Y dimension of lower bounds of all N children |
| 125 | T upper_y[N]; //!< 8bit discretized Y dimension of upper bounds of all N children |
| 126 | T lower_z[N]; //!< 8bit discretized Z dimension of lower bounds of all N children |
| 127 | T upper_z[N]; //!< 8bit discretized Z dimension of upper bounds of all N children |
| 128 | }; |
| 129 | T all_planes[6*N]; |
| 130 | }; |
| 131 | |
| 132 | Vec3f start; |
| 133 | Vec3f scale; |
| 134 | |
| 135 | friend embree_ostream operator<<(embree_ostream o, const QuantizedBaseNode_t& n) |
| 136 | { |
| 137 | o << "QuantizedBaseNode { " << embree_endl; |
| 138 | o << " start " << n.start << embree_endl; |
| 139 | o << " scale " << n.scale << embree_endl; |
| 140 | o << " lower_x " << vuint<N>::loadu(n.lower_x) << embree_endl; |
| 141 | o << " upper_x " << vuint<N>::loadu(n.upper_x) << embree_endl; |
| 142 | o << " lower_y " << vuint<N>::loadu(n.lower_y) << embree_endl; |
| 143 | o << " upper_y " << vuint<N>::loadu(n.upper_y) << embree_endl; |
| 144 | o << " lower_z " << vuint<N>::loadu(n.lower_z) << embree_endl; |
| 145 | o << " upper_z " << vuint<N>::loadu(n.upper_z) << embree_endl; |
| 146 | o << "}" << embree_endl; |
| 147 | return o; |
| 148 | } |
| 149 | |
| 150 | }; |
| 151 | |
| 152 | template<typename NodeRef, int N> |
| 153 | struct __aligned(8) QuantizedNode_t : public BaseNode_t<NodeRef, N>, QuantizedBaseNode_t<N> |
| 154 | { |
| 155 | using BaseNode_t<NodeRef,N>::children; |
| 156 | using QuantizedBaseNode_t<N>::lower_x; |
| 157 | using QuantizedBaseNode_t<N>::upper_x; |
| 158 | using QuantizedBaseNode_t<N>::lower_y; |
| 159 | using QuantizedBaseNode_t<N>::upper_y; |
| 160 | using QuantizedBaseNode_t<N>::lower_z; |
| 161 | using QuantizedBaseNode_t<N>::upper_z; |
| 162 | using QuantizedBaseNode_t<N>::start; |
| 163 | using QuantizedBaseNode_t<N>::scale; |
| 164 | using QuantizedBaseNode_t<N>::init_dim; |
| 165 | |
| 166 | __forceinline void setRef(size_t i, const NodeRef& ref) { |
| 167 | assert(i < N); |
| 168 | children[i] = ref; |
| 169 | } |
| 170 | |
| 171 | struct Create2 |
| 172 | { |
| 173 | template<typename BuildRecord> |
| 174 | __forceinline NodeRef operator() (BuildRecord* children, const size_t n, const FastAllocator::CachedAllocator& alloc) const |
| 175 | { |
| 176 | __aligned(64) AABBNode_t<NodeRef,N> node; |
| 177 | node.clear(); |
| 178 | for (size_t i=0; i<n; i++) { |
| 179 | node.setBounds(i,children[i].bounds()); |
| 180 | } |
| 181 | QuantizedNode_t *qnode = (QuantizedNode_t*) alloc.malloc0(sizeof(QuantizedNode_t), NodeRef::byteAlignment); |
| 182 | qnode->init(node); |
| 183 | |
| 184 | return (size_t)qnode | NodeRef::tyQuantizedNode; |
| 185 | } |
| 186 | }; |
| 187 | |
| 188 | struct Set2 |
| 189 | { |
| 190 | template<typename BuildRecord> |
| 191 | __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const |
| 192 | { |
| 193 | #if defined(DEBUG) |
| 194 | // check that empty children are only at the end of the child list |
| 195 | bool emptyChild = false; |
| 196 | for (size_t i=0; i<num; i++) { |
| 197 | emptyChild |= (children[i] == NodeRef::emptyNode); |
| 198 | assert(emptyChild == (children[i] == NodeRef::emptyNode)); |
| 199 | } |
| 200 | #endif |
| 201 | QuantizedNode_t* node = ref.quantizedNode(); |
| 202 | for (size_t i=0; i<num; i++) node->setRef(i,children[i]); |
| 203 | return ref; |
| 204 | } |
| 205 | }; |
| 206 | |
| 207 | __forceinline void init(AABBNode_t<NodeRef,N>& node) |
| 208 | { |
| 209 | for (size_t i=0;i<N;i++) children[i] = NodeRef::emptyNode; |
| 210 | init_dim(node); |
| 211 | } |
| 212 | |
| 213 | }; |
| 214 | |
| 215 | /*! BVHN Quantized Node */ |
| 216 | template<int N> |
| 217 | struct __aligned(8) QuantizedBaseNodeMB_t |
| 218 | { |
| 219 | QuantizedBaseNode_t<N> node0; |
| 220 | QuantizedBaseNode_t<N> node1; |
| 221 | |
| 222 | /*! Clears the node. */ |
| 223 | __forceinline void clear() { |
| 224 | node0.clear(); |
| 225 | node1.clear(); |
| 226 | } |
| 227 | |
| 228 | /*! Returns bounds of specified child. */ |
| 229 | __forceinline BBox3fa bounds(size_t i) const |
| 230 | { |
| 231 | assert(i < N); |
| 232 | BBox3fa bounds0 = node0.bounds(i); |
| 233 | BBox3fa bounds1 = node1.bounds(i); |
| 234 | bounds0.extend(bounds1); |
| 235 | return bounds0; |
| 236 | } |
| 237 | |
| 238 | /*! Returns extent of bounds of specified child. */ |
| 239 | __forceinline Vec3fa extent(size_t i) const { |
| 240 | return bounds(i).size(); |
| 241 | } |
| 242 | |
| 243 | __forceinline vbool<N> validMask() const { return node0.validMask(); } |
| 244 | |
| 245 | template<typename T> |
| 246 | __forceinline vfloat<N> dequantizeLowerX(const T t) const { return lerp(node0.dequantizeLowerX(),node1.dequantizeLowerX(),t); } |
| 247 | template<typename T> |
| 248 | __forceinline vfloat<N> dequantizeUpperX(const T t) const { return lerp(node0.dequantizeUpperX(),node1.dequantizeUpperX(),t); } |
| 249 | template<typename T> |
| 250 | __forceinline vfloat<N> dequantizeLowerY(const T t) const { return lerp(node0.dequantizeLowerY(),node1.dequantizeLowerY(),t); } |
| 251 | template<typename T> |
| 252 | __forceinline vfloat<N> dequantizeUpperY(const T t) const { return lerp(node0.dequantizeUpperY(),node1.dequantizeUpperY(),t); } |
| 253 | template<typename T> |
| 254 | __forceinline vfloat<N> dequantizeLowerZ(const T t) const { return lerp(node0.dequantizeLowerZ(),node1.dequantizeLowerZ(),t); } |
| 255 | template<typename T> |
| 256 | __forceinline vfloat<N> dequantizeUpperZ(const T t) const { return lerp(node0.dequantizeUpperZ(),node1.dequantizeUpperZ(),t); } |
| 257 | |
| 258 | |
| 259 | template<int M> |
| 260 | __forceinline vfloat<M> dequantizeLowerX(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerX()[i]),vfloat<M>(node1.dequantizeLowerX()[i]),t); } |
| 261 | template<int M> |
| 262 | __forceinline vfloat<M> dequantizeUpperX(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperX()[i]),vfloat<M>(node1.dequantizeUpperX()[i]),t); } |
| 263 | template<int M> |
| 264 | __forceinline vfloat<M> dequantizeLowerY(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerY()[i]),vfloat<M>(node1.dequantizeLowerY()[i]),t); } |
| 265 | template<int M> |
| 266 | __forceinline vfloat<M> dequantizeUpperY(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperY()[i]),vfloat<M>(node1.dequantizeUpperY()[i]),t); } |
| 267 | template<int M> |
| 268 | __forceinline vfloat<M> dequantizeLowerZ(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerZ()[i]),vfloat<M>(node1.dequantizeLowerZ()[i]),t); } |
| 269 | template<int M> |
| 270 | __forceinline vfloat<M> dequantizeUpperZ(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperZ()[i]),vfloat<M>(node1.dequantizeUpperZ()[i]),t); } |
| 271 | |
| 272 | }; |
| 273 | } |
| 274 | |