// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "../sys/alloc.h"
#include "math.h"
#include "../simd/sse.h"

namespace embree
{
  ////////////////////////////////////////////////////////////////////////////////
  /// SSE Vec2fa Type
  ////////////////////////////////////////////////////////////////////////////////

  struct __aligned(16) Vec2fa
  {
    ALIGNED_STRUCT_(16);

    typedef float Scalar;
    enum { N = 2 };
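    // The SSE register holds four floats; az and aw are padding, only x and y
    // carry the 2D value.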
    union {
      __m128 m128;
      struct { float x,y,az,aw; };
    };

    ////////////////////////////////////////////////////////////////////////////////
    /// Constructors, Assignment & Cast Operators
    ////////////////////////////////////////////////////////////////////////////////

    __forceinline Vec2fa( ) {}
    __forceinline Vec2fa( const __m128 a ) : m128(a) {}

    __forceinline Vec2fa ( const Vec2<float>& other ) { x = other.x; y = other.y; }
    __forceinline Vec2fa& operator =( const Vec2<float>& other ) { x = other.x; y = other.y; return *this; }

    __forceinline Vec2fa ( const Vec2fa& other ) { m128 = other.m128; }
    __forceinline Vec2fa& operator =( const Vec2fa& other ) { m128 = other.m128; return *this; }

    __forceinline explicit Vec2fa( const float a ) : m128(_mm_set1_ps(a)) {}
    __forceinline Vec2fa( const float x, const float y) : m128(_mm_set_ps(y, y, y, x)) {}

    __forceinline explicit Vec2fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}

    __forceinline operator const __m128&() const { return m128; }
    __forceinline operator       __m128&()       { return m128; }

    ////////////////////////////////////////////////////////////////////////////////
    /// Loads and Stores
    ////////////////////////////////////////////////////////////////////////////////

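    // load/loadu read four floats but mask the upper two lanes to zero, so the
    // padding lanes never pick up garbage from memory.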
    static __forceinline Vec2fa load( const void* const a ) {
      return Vec2fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1))));
    }

    static __forceinline Vec2fa loadu( const void* const a ) {
      return Vec2fa(_mm_and_ps(_mm_loadu_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1))));
    }

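    // storeu writes the full 16-byte register, including the padding lanes;
    // the destination must have room for four floats.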
    static __forceinline void storeu ( void* ptr, const Vec2fa& v ) {
      _mm_storeu_ps((float*)ptr,v);
    }

    ////////////////////////////////////////////////////////////////////////////////
    /// Constants
    ////////////////////////////////////////////////////////////////////////////////

    __forceinline Vec2fa( ZeroTy   ) : m128(_mm_setzero_ps()) {}
    __forceinline Vec2fa( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}
    __forceinline Vec2fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
    __forceinline Vec2fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}

    ////////////////////////////////////////////////////////////////////////////////
    /// Array Access
    ////////////////////////////////////////////////////////////////////////////////

    __forceinline const float& operator []( const size_t index ) const { assert(index < 2); return (&x)[index]; }
    __forceinline       float& operator []( const size_t index )       { assert(index < 2); return (&x)[index]; }
  };

  ////////////////////////////////////////////////////////////////////////////////
  /// Unary Operators
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline Vec2fa operator +( const Vec2fa& a ) { return a; }
  __forceinline Vec2fa operator -( const Vec2fa& a ) {
    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
    return _mm_xor_ps(a.m128, mask);
  }
  __forceinline Vec2fa abs  ( const Vec2fa& a ) {
    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
    return _mm_and_ps(a.m128, mask);
  }
  __forceinline Vec2fa sign ( const Vec2fa& a ) {
    return blendv_ps(Vec2fa(one), -Vec2fa(one), _mm_cmplt_ps (a,Vec2fa(zero)));
  }

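  // Reciprocal: start from the hardware estimate and refine it with
  // Newton-Raphson (two steps on AArch64, one step on x86).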
  __forceinline Vec2fa rcp  ( const Vec2fa& a )
  {
#if defined(__aarch64__)
    __m128 reciprocal = _mm_rcp_ps(a.m128);
    reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
    reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
    return (const Vec2fa)reciprocal;
#else
#if defined(__AVX512VL__)
    const Vec2fa r = _mm_rcp14_ps(a.m128);
#else
    const Vec2fa r = _mm_rcp_ps(a.m128);
#endif

#if defined(__AVX2__)
    const Vec2fa h_n = _mm_fnmadd_ps(a, r, vfloat4(1.0));  // First, compute 1 - a * r (which will be very close to 0)
    const Vec2fa res = _mm_fmadd_ps(r, h_n, r);            // Then compute r + r * h_n
#else
    const Vec2fa h_n = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a, r));  // First, compute 1 - a * r (which will be very close to 0)
    const Vec2fa res = _mm_add_ps(r,_mm_mul_ps(r, h_n));             // Then compute r + r * h_n
#endif

    return res;
#endif //defined(__aarch64__)
  }

  __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); }
  __forceinline Vec2fa sqr  ( const Vec2fa& a ) { return _mm_mul_ps(a,a); }

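  // Reciprocal square root: hardware estimate followed by Newton-Raphson
  // refinement (two steps on AArch64, one step on x86).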
  __forceinline Vec2fa rsqrt( const Vec2fa& a )
  {
#if defined(__aarch64__)
    __m128 r = _mm_rsqrt_ps(a.m128);
    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
    return r;
#else

#if defined(__AVX512VL__)
    __m128 r = _mm_rsqrt14_ps(a.m128);
#else
    __m128 r = _mm_rsqrt_ps(a.m128);
#endif
    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));

#endif
  }

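  // zero_fix replaces components whose magnitude is below min_rcp_input by
  // min_rcp_input, so rcp_safe never takes the reciprocal of (near) zero.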
  __forceinline Vec2fa zero_fix(const Vec2fa& a) {
    return blendv_ps(a, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));
  }
  __forceinline Vec2fa rcp_safe(const Vec2fa& a) {
    return rcp(zero_fix(a));
  }
  __forceinline Vec2fa log ( const Vec2fa& a ) {
    return Vec2fa(logf(a.x),logf(a.y));
  }

  __forceinline Vec2fa exp ( const Vec2fa& a ) {
    return Vec2fa(expf(a.x),expf(a.y));
  }

  ////////////////////////////////////////////////////////////////////////////////
  /// Binary Operators
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline Vec2fa operator +( const Vec2fa& a, const Vec2fa& b ) { return _mm_add_ps(a.m128, b.m128); }
  __forceinline Vec2fa operator -( const Vec2fa& a, const Vec2fa& b ) { return _mm_sub_ps(a.m128, b.m128); }
  __forceinline Vec2fa operator *( const Vec2fa& a, const Vec2fa& b ) { return _mm_mul_ps(a.m128, b.m128); }
  __forceinline Vec2fa operator *( const Vec2fa& a, const float b ) { return a * Vec2fa(b); }
  __forceinline Vec2fa operator *( const float a, const Vec2fa& b ) { return Vec2fa(a) * b; }
  __forceinline Vec2fa operator /( const Vec2fa& a, const Vec2fa& b ) { return _mm_div_ps(a.m128,b.m128); }
  __forceinline Vec2fa operator /( const Vec2fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }
  __forceinline Vec2fa operator /( const float a, const Vec2fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }

  __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); }
  __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); }

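  // mini/maxi compare the raw IEEE-754 bit patterns with integer min/max;
  // this ordering is only guaranteed to match float ordering for
  // non-negative inputs.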
#if defined(__aarch64__) || defined(__SSE4_1__)
  __forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) {
    const vint4 ai = _mm_castps_si128(a);
    const vint4 bi = _mm_castps_si128(b);
    const vint4 ci = _mm_min_epi32(ai,bi);
    return _mm_castsi128_ps(ci);
  }
#endif

#if defined(__aarch64__) || defined(__SSE4_1__)
  __forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) {
    const vint4 ai = _mm_castps_si128(a);
    const vint4 bi = _mm_castps_si128(b);
    const vint4 ci = _mm_max_epi32(ai,bi);
    return _mm_castsi128_ps(ci);
  }
#endif

  __forceinline Vec2fa pow ( const Vec2fa& a, const float& b ) {
    return Vec2fa(powf(a.x,b),powf(a.y,b));
  }

  ////////////////////////////////////////////////////////////////////////////////
  /// Ternary Operators
  ////////////////////////////////////////////////////////////////////////////////

#if defined(__AVX2__)
  __forceinline Vec2fa madd  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmadd_ps(a,b,c); }
  __forceinline Vec2fa msub  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmsub_ps(a,b,c); }
  __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmadd_ps(a,b,c); }
  __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmsub_ps(a,b,c); }
#else
  __forceinline Vec2fa madd  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b+c; }
  __forceinline Vec2fa msub  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b-c; }
  __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b+c; }
  __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b-c; }
#endif

  __forceinline Vec2fa madd  ( const float a, const Vec2fa& b, const Vec2fa& c) { return madd(Vec2fa(a),b,c); }
  __forceinline Vec2fa msub  ( const float a, const Vec2fa& b, const Vec2fa& c) { return msub(Vec2fa(a),b,c); }
  __forceinline Vec2fa nmadd ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmadd(Vec2fa(a),b,c); }
  __forceinline Vec2fa nmsub ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmsub(Vec2fa(a),b,c); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Assignment Operators
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline Vec2fa& operator +=( Vec2fa& a, const Vec2fa& b ) { return a = a + b; }
  __forceinline Vec2fa& operator -=( Vec2fa& a, const Vec2fa& b ) { return a = a - b; }
  __forceinline Vec2fa& operator *=( Vec2fa& a, const Vec2fa& b ) { return a = a * b; }
  __forceinline Vec2fa& operator *=( Vec2fa& a, const float   b ) { return a = a * b; }
  __forceinline Vec2fa& operator /=( Vec2fa& a, const Vec2fa& b ) { return a = a / b; }
  __forceinline Vec2fa& operator /=( Vec2fa& a, const float   b ) { return a = a / b; }

  ////////////////////////////////////////////////////////////////////////////////
  /// Reductions
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline float reduce_add(const Vec2fa& v) { return v.x+v.y; }
  __forceinline float reduce_mul(const Vec2fa& v) { return v.x*v.y; }
  __forceinline float reduce_min(const Vec2fa& v) { return min(v.x,v.y); }
  __forceinline float reduce_max(const Vec2fa& v) { return max(v.x,v.y); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Comparison Operators
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline bool operator ==( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 3) == 3; }
  __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 3) != 0; }

  ////////////////////////////////////////////////////////////////////////////////
  /// Euclidean Space Operators
  ////////////////////////////////////////////////////////////////////////////////

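  // 2D dot product. The SSE4.1 path uses DPPS with mask 0x3F: multiply lanes
  // 0 and 1, sum them, and broadcast the result to all lanes.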
#if defined(__SSE4_1__)
  __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) {
    return _mm_cvtss_f32(_mm_dp_ps(a,b,0x3F));
  }
#else
  __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) {
    return reduce_add(a*b);
  }
#endif

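  // 2D "cross" returns the perpendicular vector, i.e. a rotated by 90 degrees
  // counter-clockwise.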
  __forceinline Vec2fa cross ( const Vec2fa& a ) {
    return Vec2fa(-a.y,a.x);
  }

  __forceinline float sqr_length ( const Vec2fa& a ) { return dot(a,a); }
  __forceinline float rcp_length ( const Vec2fa& a ) { return rsqrt(dot(a,a)); }
  __forceinline float rcp_length2( const Vec2fa& a ) { return rcp(dot(a,a)); }
  __forceinline float length ( const Vec2fa& a ) { return sqrt(dot(a,a)); }
  __forceinline Vec2fa normalize( const Vec2fa& a ) { return a*rsqrt(dot(a,a)); }
  __forceinline float distance ( const Vec2fa& a, const Vec2fa& b ) { return length(a-b); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Select
  ////////////////////////////////////////////////////////////////////////////////

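  // select builds an all-ones or all-zeros mask from the bool and blends
  // per lane between the two inputs.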
  __forceinline Vec2fa select( bool s, const Vec2fa& t, const Vec2fa& f ) {
    __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();
    return blendv_ps(f, t, mask);
  }

  __forceinline Vec2fa lerp(const Vec2fa& v0, const Vec2fa& v1, const float t) {
    return madd(1.0f-t,v0,t*v1);
  }

  __forceinline int maxDim ( const Vec2fa& a )
  {
    const Vec2fa b = abs(a);
    if (b.x > b.y) return 0;
    else return 1;
  }

  ////////////////////////////////////////////////////////////////////////////////
  /// Rounding Functions
  ////////////////////////////////////////////////////////////////////////////////

#if defined(__aarch64__)
  //__forceinline Vec2fa trunc(const Vec2fa& a) { return vrndq_f32(a); }
  __forceinline Vec2fa floor(const Vec2fa& a) { return vrndmq_f32(a); }
  __forceinline Vec2fa ceil (const Vec2fa& a) { return vrndpq_f32(a); }
#elif defined (__SSE4_1__)
  //__forceinline Vec2fa trunc( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
  __forceinline Vec2fa floor( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); }
  __forceinline Vec2fa ceil ( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); }
#else
  //__forceinline Vec2fa trunc( const Vec2fa& a ) { return Vec2fa(truncf(a.x),truncf(a.y)); }
  __forceinline Vec2fa floor( const Vec2fa& a ) { return Vec2fa(floorf(a.x),floorf(a.y)); }
  __forceinline Vec2fa ceil ( const Vec2fa& a ) { return Vec2fa(ceilf (a.x),ceilf (a.y)); }
#endif

  ////////////////////////////////////////////////////////////////////////////////
  /// Output Operators
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2fa& a) {
    return cout << "(" << a.x << ", " << a.y << ")";
  }

  typedef Vec2fa Vec2fa_t;
}