1// Copyright 2009-2021 Intel Corporation
2// SPDX-License-Identifier: Apache-2.0
3
4#pragma once
5
6#include "../sys/alloc.h"
7#include "math.h"
8#include "../simd/sse.h"
9
10namespace embree
11{
12 ////////////////////////////////////////////////////////////////////////////////
13 /// SSE Vec3fa Type
14 ////////////////////////////////////////////////////////////////////////////////
15
  /*! Three-component float vector backed by one 16-byte aligned __m128.
   *  Lanes 0..2 are exposed as x, y, z. Lane 3 has no named member in this
   *  type; its content depends on how the value was produced (some
   *  constructors zero it, others broadcast a scalar into it). */
  struct __aligned(16) Vec3fa
  {
    ALIGNED_STRUCT_(16);

    typedef float Scalar;
    enum { N = 3 };
    union {
      __m128 m128;              // full four-lane register view
      struct { float x,y,z; };  // named view of lanes 0..2
    };

    ////////////////////////////////////////////////////////////////////////////////
    /// Constructors, Assignment & Cast Operators
    ////////////////////////////////////////////////////////////////////////////////

    /*! default constructor: empty body, the lanes are left uninitialized */
    __forceinline Vec3fa( ) {}
    /*! wraps an existing register; all four lanes are taken over unchanged */
    __forceinline Vec3fa( const __m128 a ) : m128(a) {}

    /*! builds (x, y, z, 0) from a generic Vec3<float> */
    __forceinline Vec3fa ( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); }
    //__forceinline Vec3fa& operator =( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; }

    /*! copy construction / assignment: copies the whole register, lane 3 included */
    __forceinline Vec3fa ( const Vec3fa& other ) { m128 = other.m128; }
    __forceinline Vec3fa& operator =( const Vec3fa& other ) { m128 = other.m128; return *this; }

    /*! broadcasts a into all four lanes (lane 3 also receives a) */
    __forceinline explicit Vec3fa( const float a ) : m128(_mm_set1_ps(a)) {}
    /*! builds (x, y, z, 0) */
    __forceinline Vec3fa( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {}

    /*! converts four packed 32-bit integers to float with _mm_cvtepi32_ps */
    __forceinline explicit Vec3fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}

    /*! explicit casts; the integer casts convert the lanes with _mm_cvtps_epi32 */
    __forceinline explicit operator const vfloat4() const { return vfloat4(m128); }
    __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); }
    __forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); }
    __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); }

    //__forceinline operator const __m128&() const { return m128; }
    //__forceinline operator __m128&() { return m128; }

    ////////////////////////////////////////////////////////////////////////////////
    /// Loads and Stores
    ////////////////////////////////////////////////////////////////////////////////

    /*! Loads 16 bytes from a with the aligned load _mm_load_ps and forces
     *  lane 3 to zero. The aarch64 path writes the lane directly, the other
     *  path masks it out with an AND. */
    static __forceinline Vec3fa load( const void* const a ) {
#if defined(__aarch64__)
      __m128 t = _mm_load_ps((float*)a);
      t[3] = 0.0f;
      return Vec3fa(t);
#else
      return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))));
#endif
    }

    /*! Loads 16 bytes from a with the unaligned load _mm_loadu_ps.
     *  Unlike load(), lane 3 is NOT cleared and keeps whatever was in memory. */
    static __forceinline Vec3fa loadu( const void* const a ) {
      return Vec3fa(_mm_loadu_ps((float*)a));
    }

    /*! Stores all four lanes (16 bytes) of v to ptr with the unaligned store. */
    static __forceinline void storeu ( void* ptr, const Vec3fa& v ) {
      _mm_storeu_ps((float*)ptr,v.m128);
    }

    ////////////////////////////////////////////////////////////////////////////////
    /// Constants
    ////////////////////////////////////////////////////////////////////////////////

    /*! tag constructors; every one of them fills all four lanes with the constant */
    __forceinline Vec3fa( ZeroTy ) : m128(_mm_setzero_ps()) {}
    __forceinline Vec3fa( OneTy ) : m128(_mm_set1_ps(1.0f)) {}
    __forceinline Vec3fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
    __forceinline Vec3fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}

    ////////////////////////////////////////////////////////////////////////////////
    /// Array Access
    ////////////////////////////////////////////////////////////////////////////////

    /*! component access by index; index must be 0, 1 or 2 (checked by assert only) */
    __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
    __forceinline float& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; }
  };
91
92 ////////////////////////////////////////////////////////////////////////////////
93 /// Unary Operators
94 ////////////////////////////////////////////////////////////////////////////////
95
96 __forceinline Vec3fa operator +( const Vec3fa& a ) { return a; }
97 __forceinline Vec3fa operator -( const Vec3fa& a ) {
98#if defined(__aarch64__)
99 return vnegq_f32(a.m128);
100#else
101 const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
102 return _mm_xor_ps(a.m128, mask);
103#endif
104 }
105 __forceinline Vec3fa abs ( const Vec3fa& a ) {
106#if defined(__aarch64__)
107 return _mm_abs_ps(a.m128);
108#else
109 const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
110 return _mm_and_ps(a.m128, mask);
111#endif
112 }
113 __forceinline Vec3fa sign ( const Vec3fa& a ) {
114 return blendv_ps(Vec3fa(one).m128, (-Vec3fa(one)).m128, _mm_cmplt_ps (a.m128,Vec3fa(zero).m128));
115 }
116
117 __forceinline Vec3fa rcp ( const Vec3fa& a )
118 {
119#if defined(__aarch64__)
120 return vdivq_f32(vdupq_n_f32(1.0f),a.m128);
121#else
122
123#if defined(__AVX512VL__)
124 const Vec3fa r = _mm_rcp14_ps(a.m128);
125#else
126 const Vec3fa r = _mm_rcp_ps(a.m128);
127#endif
128
129#if defined(__AVX2__)
130 const Vec3fa h_n = _mm_fnmadd_ps(a.m128, r.m128, vfloat4(1.0)); // First, compute 1 - a * r (which will be very close to 0)
131 const Vec3fa res = _mm_fmadd_ps(r.m128, h_n.m128, r.m128); // Then compute r + r * h_n
132#else
133 const Vec3fa h_n = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a.m128, r.m128)); // First, compute 1 - a * r (which will be very close to 0)
134 const Vec3fa res = _mm_add_ps(r.m128,_mm_mul_ps(r.m128, h_n.m128)); // Then compute r + r * h_n
135#endif
136
137 return res;
138#endif //defined(__aarch64__)
139 }
140
141 __forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.m128); }
142 __forceinline Vec3fa sqr ( const Vec3fa& a ) { return _mm_mul_ps(a.m128,a.m128); }
143
144 __forceinline Vec3fa rsqrt( const Vec3fa& a )
145 {
146#if defined(__aarch64__)
147 __m128 r = _mm_rsqrt_ps(a.m128);
148 r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
149 r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
150 return r;
151#else
152
153#if defined(__AVX512VL__)
154 __m128 r = _mm_rsqrt14_ps(a.m128);
155#else
156 __m128 r = _mm_rsqrt_ps(a.m128);
157#endif
158 return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
159#endif
160 }
161
162 __forceinline Vec3fa zero_fix(const Vec3fa& a) {
163 return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));
164 }
165 __forceinline Vec3fa rcp_safe(const Vec3fa& a) {
166 return rcp(zero_fix(a));
167 }
168 __forceinline Vec3fa log ( const Vec3fa& a ) {
169 return Vec3fa(logf(a.x),logf(a.y),logf(a.z));
170 }
171
172 __forceinline Vec3fa exp ( const Vec3fa& a ) {
173 return Vec3fa(expf(a.x),expf(a.y),expf(a.z));
174 }
175
176 ////////////////////////////////////////////////////////////////////////////////
177 /// Binary Operators
178 ////////////////////////////////////////////////////////////////////////////////
179
180 __forceinline Vec3fa operator +( const Vec3fa& a, const Vec3fa& b ) { return _mm_add_ps(a.m128, b.m128); }
181 __forceinline Vec3fa operator -( const Vec3fa& a, const Vec3fa& b ) { return _mm_sub_ps(a.m128, b.m128); }
182 __forceinline Vec3fa operator *( const Vec3fa& a, const Vec3fa& b ) { return _mm_mul_ps(a.m128, b.m128); }
183 __forceinline Vec3fa operator *( const Vec3fa& a, const float b ) { return a * Vec3fa(b); }
184 __forceinline Vec3fa operator *( const float a, const Vec3fa& b ) { return Vec3fa(a) * b; }
185 __forceinline Vec3fa operator /( const Vec3fa& a, const Vec3fa& b ) { return _mm_div_ps(a.m128,b.m128); }
186 __forceinline Vec3fa operator /( const Vec3fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }
187 __forceinline Vec3fa operator /( const float a, const Vec3fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }
188
189 __forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); }
190 __forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); }
191
#if defined(__aarch64__) || defined(__SSE4_1__)
  /*! Minimum computed on the raw bit patterns: both operands are
   *  reinterpreted as packed signed 32-bit integers, compared with
   *  _mm_min_epi32, and the result is reinterpreted back as float. */
  __forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b)
  {
    const vint4 lhs_bits = _mm_castps_si128(a.m128);
    const vint4 rhs_bits = _mm_castps_si128(b.m128);
    const vint4 smaller  = _mm_min_epi32(lhs_bits, rhs_bits);
    return _mm_castsi128_ps(smaller);
  }

  /*! Maximum computed on the raw bit patterns: same scheme as mini(),
   *  using _mm_max_epi32. */
  __forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b)
  {
    const vint4 lhs_bits = _mm_castps_si128(a.m128);
    const vint4 rhs_bits = _mm_castps_si128(b.m128);
    const vint4 larger   = _mm_max_epi32(lhs_bits, rhs_bits);
    return _mm_castsi128_ps(larger);
  }
#endif
209
210 __forceinline Vec3fa pow ( const Vec3fa& a, const float& b ) {
211 return Vec3fa(powf(a.x,b),powf(a.y,b),powf(a.z,b));
212 }
213
214 ////////////////////////////////////////////////////////////////////////////////
215 /// Ternary Operators
216 ////////////////////////////////////////////////////////////////////////////////
217
218#if defined(__AVX2__) || defined(__ARM_NEON)
219 __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); }
220 __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); }
221 __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); }
222 __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); }
223#else
224 __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; }
225 __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;}
226 __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; }
227 __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; }
228#endif
229
230 __forceinline Vec3fa madd ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); }
231 __forceinline Vec3fa msub ( const float a, const Vec3fa& b, const Vec3fa& c) { return msub(Vec3fa(a),b,c); }
232 __forceinline Vec3fa nmadd ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmadd(Vec3fa(a),b,c); }
233 __forceinline Vec3fa nmsub ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmsub(Vec3fa(a),b,c); }
234
235 ////////////////////////////////////////////////////////////////////////////////
236 /// Assignment Operators
237 ////////////////////////////////////////////////////////////////////////////////
238
239 __forceinline Vec3fa& operator +=( Vec3fa& a, const Vec3fa& b ) { return a = a + b; }
240 __forceinline Vec3fa& operator -=( Vec3fa& a, const Vec3fa& b ) { return a = a - b; }
241 __forceinline Vec3fa& operator *=( Vec3fa& a, const Vec3fa& b ) { return a = a * b; }
242 __forceinline Vec3fa& operator *=( Vec3fa& a, const float b ) { return a = a * b; }
243 __forceinline Vec3fa& operator /=( Vec3fa& a, const Vec3fa& b ) { return a = a / b; }
244 __forceinline Vec3fa& operator /=( Vec3fa& a, const float b ) { return a = a / b; }
245
246 ////////////////////////////////////////////////////////////////////////////////
247 /// Reductions
248 ////////////////////////////////////////////////////////////////////////////////
249#if defined(__aarch64__)
250 __forceinline float reduce_add(const Vec3fa& v) {
251 float32x4_t t = v.m128;
252 t[3] = 0.0f;
253 return vaddvq_f32(t);
254 }
255
256 __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }
257 __forceinline float reduce_min(const Vec3fa& v) {
258 float32x4_t t = v.m128;
259 t[3] = t[2];
260 return vminvq_f32(t);
261 }
262 __forceinline float reduce_max(const Vec3fa& v) {
263 float32x4_t t = v.m128;
264 t[3] = t[2];
265 return vmaxvq_f32(t);
266 }
267#else
268 __forceinline float reduce_add(const Vec3fa& v) {
269 const vfloat4 a(v.m128);
270 const vfloat4 b = shuffle<1>(a);
271 const vfloat4 c = shuffle<2>(a);
272 return _mm_cvtss_f32(a+b+c);
273 }
274
275 __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }
276 __forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); }
277 __forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); }
278#endif
279
280 ////////////////////////////////////////////////////////////////////////////////
281 /// Comparison Operators
282 ////////////////////////////////////////////////////////////////////////////////
283
284 __forceinline bool operator ==( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; }
285 __forceinline bool operator !=( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; }
286
287 __forceinline Vec3ba eq_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpeq_ps (a.m128, b.m128); }
288 __forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); }
289 __forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.m128, b.m128); }
290 __forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); }
291 #if defined(__aarch64__)
292 __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpgt_ps (a.m128, b.m128); }
293 __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpge_ps (a.m128, b.m128); }
294#else
295 __forceinline Vec3ba gt_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnle_ps(a.m128, b.m128); }
296 __forceinline Vec3ba ge_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnlt_ps(a.m128, b.m128); }
297#endif
298
299 __forceinline bool isvalid ( const Vec3fa& v ) {
300 return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE)));
301 }
302
303 __forceinline bool is_finite ( const Vec3fa& a ) {
304 return all(ge_mask(a,Vec3fa(-FLT_MAX)) & le_mask(a,Vec3fa(+FLT_MAX)));
305 }
306
307 __forceinline bool isvalid4 ( const Vec3fa& v ) {
308 return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE)));
309 }
310
311 __forceinline bool is_finite4 ( const Vec3fa& a ) {
312 return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX)));
313 }
314
315 ////////////////////////////////////////////////////////////////////////////////
316 /// Euclidean Space Operators
317 ////////////////////////////////////////////////////////////////////////////////
318
319#if defined(__SSE4_1__)
320 __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) {
321 return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F));
322 }
323#else
324 __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) {
325 return reduce_add(a*b);
326 }
327#endif
328
329 __forceinline Vec3fa cross ( const Vec3fa& a, const Vec3fa& b )
330 {
331 vfloat4 a0 = vfloat4(a.m128);
332 vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128));
333 vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128));
334 vfloat4 b1 = vfloat4(b.m128);
335 return Vec3fa(shuffle<1,2,0,3>(msub(a0,b0,a1*b1)));
336 }
337
338 __forceinline float sqr_length ( const Vec3fa& a ) { return dot(a,a); }
339 __forceinline float rcp_length ( const Vec3fa& a ) { return rsqrt(dot(a,a)); }
340 __forceinline float rcp_length2( const Vec3fa& a ) { return rcp(dot(a,a)); }
341 __forceinline float length ( const Vec3fa& a ) { return sqrt(dot(a,a)); }
342 __forceinline Vec3fa normalize( const Vec3fa& a ) { return a*rsqrt(dot(a,a)); }
343 __forceinline float distance ( const Vec3fa& a, const Vec3fa& b ) { return length(a-b); }
344 __forceinline float halfArea ( const Vec3fa& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); }
345 __forceinline float area ( const Vec3fa& d ) { return 2.0f*halfArea(d); }
346
347 __forceinline Vec3fa normalize_safe( const Vec3fa& a ) {
348 const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d);
349 }
350
351 /*! differentiated normalization */
352 __forceinline Vec3fa dnormalize(const Vec3fa& p, const Vec3fa& dp)
353 {
354 const float pp = dot(p,p);
355 const float pdp = dot(p,dp);
356 return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp);
357 }
358
359 ////////////////////////////////////////////////////////////////////////////////
360 /// Select
361 ////////////////////////////////////////////////////////////////////////////////
362
363 __forceinline Vec3fa select( bool s, const Vec3fa& t, const Vec3fa& f ) {
364 __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();
365 return blendv_ps(f.m128, t.m128, mask);
366 }
367
368 __forceinline Vec3fa select( const Vec3ba& s, const Vec3fa& t, const Vec3fa& f ) {
369 return blendv_ps(f.m128, t.m128, s);
370 }
371
372 __forceinline Vec3fa lerp(const Vec3fa& v0, const Vec3fa& v1, const float t) {
373 return madd(1.0f-t,v0,t*v1);
374 }
375
376 __forceinline int maxDim ( const Vec3fa& a )
377 {
378 const Vec3fa b = abs(a);
379 if (b.x > b.y) {
380 if (b.x > b.z) return 0; else return 2;
381 } else {
382 if (b.y > b.z) return 1; else return 2;
383 }
384 }
385
386 ////////////////////////////////////////////////////////////////////////////////
387 /// Rounding Functions
388 ////////////////////////////////////////////////////////////////////////////////
389
390#if defined(__aarch64__)
391 __forceinline Vec3fa floor(const Vec3fa& a) { return vrndmq_f32(a.m128); }
392 __forceinline Vec3fa ceil (const Vec3fa& a) { return vrndpq_f32(a.m128); }
393 __forceinline Vec3fa trunc(const Vec3fa& a) { return vrndq_f32(a.m128); }
394#elif defined (__SSE4_1__)
395 __forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); }
396 __forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); }
397 __forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); }
398#else
399 __forceinline Vec3fa trunc( const Vec3fa& a ) { return Vec3fa(truncf(a.x),truncf(a.y),truncf(a.z)); }
400 __forceinline Vec3fa floor( const Vec3fa& a ) { return Vec3fa(floorf(a.x),floorf(a.y),floorf(a.z)); }
401 __forceinline Vec3fa ceil ( const Vec3fa& a ) { return Vec3fa(ceilf (a.x),ceilf (a.y),ceilf (a.z)); }
402#endif
403
404 ////////////////////////////////////////////////////////////////////////////////
405 /// Output Operators
406 ////////////////////////////////////////////////////////////////////////////////
407
408 __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fa& a) {
409 return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
410 }
411
412 typedef Vec3fa Vec3fa_t;
413
414
415 ////////////////////////////////////////////////////////////////////////////////
416 /// SSE Vec3fx Type
417 ////////////////////////////////////////////////////////////////////////////////
418
  /*! Three-component float vector with an extra payload in lane 3.
   *  Backed by one 16-byte aligned __m128: lanes 0..2 are x, y, z and lane 3
   *  can be viewed as int a, unsigned u or float w through the inner union. */
  struct __aligned(16) Vec3fx
  {
    ALIGNED_STRUCT_(16);

    typedef float Scalar;
    enum { N = 3 };
    union {
      __m128 m128;  // full four-lane register view
      struct { float x,y,z; union { int a; unsigned u; float w; }; };  // lanes 0..2 plus three views of lane 3
    };

    ////////////////////////////////////////////////////////////////////////////////
    /// Constructors, Assignment & Cast Operators
    ////////////////////////////////////////////////////////////////////////////////

    /*! default constructor: empty body, the lanes are left uninitialized */
    __forceinline Vec3fx( ) {}
    /*! wraps an existing register; all four lanes are taken over unchanged */
    __forceinline Vec3fx( const __m128 a ) : m128(a) {}

    /*! explicit construction from Vec3fa and implicit conversion back; both copy the whole register */
    __forceinline explicit Vec3fx(const Vec3fa& v) : m128(v.m128) {}
    __forceinline operator Vec3fa () const { return Vec3fa(m128); }

    /*! builds (x, y, z, 0) from a generic Vec3<float> */
    __forceinline explicit Vec3fx ( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); }
    //__forceinline Vec3fx& operator =( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; }

    /*! copy construction / assignment: copies the whole register, lane 3 included */
    __forceinline Vec3fx ( const Vec3fx& other ) { m128 = other.m128; }

    __forceinline Vec3fx& operator =( const Vec3fx& other ) { m128 = other.m128; return *this; }

    /*! broadcasts a into all four lanes */
    __forceinline explicit Vec3fx( const float a ) : m128(_mm_set1_ps(a)) {}
    /*! builds (x, y, z, 0) */
    __forceinline Vec3fx( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {}

    /*! copies other and then overwrites lane 3 with the given int / unsigned / float payload */
    __forceinline Vec3fx( const Vec3fa& other, const int a1) { m128 = other.m128; a = a1; }
    __forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; }
    __forceinline Vec3fx( const Vec3fa& other, const float w1) {
#if defined (__aarch64__)
      m128 = other.m128; m128[3] = w1;
#elif defined (__SSE4_1__)
      // immediate 3 << 4 selects lane 3 of the destination for the inserted value
      m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4);
#else
      // mask is set for lanes 0..2: those lanes come from other, lane 3 from the broadcast w1
      const vint4 mask(-1,-1,-1,0);
      m128 = select(vboolf4(_mm_castsi128_ps(mask)),vfloat4(other.m128),vfloat4(w1));
#endif
    }
    //__forceinline Vec3fx( const float x, const float y, const float z, const int a) : x(x), y(y), z(z), a(a) {} // not working properly!
    //__forceinline Vec3fx( const float x, const float y, const float z, const unsigned a) : x(x), y(y), z(z), u(a) {} // not working properly!
    /*! builds (x, y, z, w) */
    __forceinline Vec3fx( const float x, const float y, const float z, const float w) : m128(_mm_set_ps(w, z, y, x)) {}

    //__forceinline explicit Vec3fx( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}

    /*! explicit casts; the integer casts convert the lanes with _mm_cvtps_epi32 */
    __forceinline explicit operator const vfloat4() const { return vfloat4(m128); }
    __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); }
    __forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); }
    __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); }

    //__forceinline operator const __m128&() const { return m128; }
    //__forceinline operator __m128&() { return m128; }

    ////////////////////////////////////////////////////////////////////////////////
    /// Loads and Stores
    ////////////////////////////////////////////////////////////////////////////////

    /*! Loads 16 bytes from a with the aligned load _mm_load_ps and clears
     *  lane 3 with an AND mask (so the payload lane is zero after load). */
    static __forceinline Vec3fx load( const void* const a ) {
      return Vec3fx(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))));
    }

    /*! Loads 16 bytes from a with the unaligned load; lane 3 is kept as read. */
    static __forceinline Vec3fx loadu( const void* const a ) {
      return Vec3fx(_mm_loadu_ps((float*)a));
    }

    /*! Stores all four lanes (16 bytes) of v to ptr with the unaligned store. */
    static __forceinline void storeu ( void* ptr, const Vec3fx& v ) {
      _mm_storeu_ps((float*)ptr,v.m128);
    }

    ////////////////////////////////////////////////////////////////////////////////
    /// Constants
    ////////////////////////////////////////////////////////////////////////////////

    /*! tag constructors; every one of them fills all four lanes with the constant */
    __forceinline Vec3fx( ZeroTy ) : m128(_mm_setzero_ps()) {}
    __forceinline Vec3fx( OneTy ) : m128(_mm_set1_ps(1.0f)) {}
    __forceinline Vec3fx( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
    __forceinline Vec3fx( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}

    ////////////////////////////////////////////////////////////////////////////////
    /// Array Access
    ////////////////////////////////////////////////////////////////////////////////

    /*! component access by index; index must be 0, 1 or 2 (checked by assert only),
     *  so lane 3 is not reachable through operator[] */
    __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
    __forceinline float& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; }
  };
508
509 ////////////////////////////////////////////////////////////////////////////////
510 /// Unary Operators
511 ////////////////////////////////////////////////////////////////////////////////
512
513 __forceinline Vec3fx operator +( const Vec3fx& a ) { return a; }
514 __forceinline Vec3fx operator -( const Vec3fx& a ) {
515 const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
516 return _mm_xor_ps(a.m128, mask);
517 }
518 __forceinline Vec3fx abs ( const Vec3fx& a ) {
519 const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
520 return _mm_and_ps(a.m128, mask);
521 }
522 __forceinline Vec3fx sign ( const Vec3fx& a ) {
523 return blendv_ps(Vec3fx(one).m128, (-Vec3fx(one)).m128, _mm_cmplt_ps (a.m128,Vec3fx(zero).m128));
524 }
525
526 __forceinline Vec3fx rcp ( const Vec3fx& a )
527 {
528#if defined(__AVX512VL__)
529 const Vec3fx r = _mm_rcp14_ps(a.m128);
530#else
531 const Vec3fx r = _mm_rcp_ps(a.m128);
532#endif
533
534#if defined(__AVX2__)
535 const Vec3fx res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f)));
536#else
537 const Vec3fx res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128)));
538 //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
539#endif
540
541 return res;
542 }
543
544 __forceinline Vec3fx sqrt ( const Vec3fx& a ) { return _mm_sqrt_ps(a.m128); }
545 __forceinline Vec3fx sqr ( const Vec3fx& a ) { return _mm_mul_ps(a.m128,a.m128); }
546
547 __forceinline Vec3fx rsqrt( const Vec3fx& a )
548 {
549#if defined(__AVX512VL__)
550 __m128 r = _mm_rsqrt14_ps(a.m128);
551#else
552 __m128 r = _mm_rsqrt_ps(a.m128);
553#endif
554 return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
555 }
556
557 __forceinline Vec3fx zero_fix(const Vec3fx& a) {
558 return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));
559 }
560 __forceinline Vec3fx rcp_safe(const Vec3fx& a) {
561 return rcp(zero_fix(a));
562 }
563 __forceinline Vec3fx log ( const Vec3fx& a ) {
564 return Vec3fx(logf(a.x),logf(a.y),logf(a.z));
565 }
566
567 __forceinline Vec3fx exp ( const Vec3fx& a ) {
568 return Vec3fx(expf(a.x),expf(a.y),expf(a.z));
569 }
570
571 ////////////////////////////////////////////////////////////////////////////////
572 /// Binary Operators
573 ////////////////////////////////////////////////////////////////////////////////
574
575 __forceinline Vec3fx operator +( const Vec3fx& a, const Vec3fx& b ) { return _mm_add_ps(a.m128, b.m128); }
576 __forceinline Vec3fx operator -( const Vec3fx& a, const Vec3fx& b ) { return _mm_sub_ps(a.m128, b.m128); }
577 __forceinline Vec3fx operator *( const Vec3fx& a, const Vec3fx& b ) { return _mm_mul_ps(a.m128, b.m128); }
578 __forceinline Vec3fx operator *( const Vec3fx& a, const float b ) { return a * Vec3fx(b); }
579 __forceinline Vec3fx operator *( const float a, const Vec3fx& b ) { return Vec3fx(a) * b; }
580 __forceinline Vec3fx operator /( const Vec3fx& a, const Vec3fx& b ) { return _mm_div_ps(a.m128,b.m128); }
581 __forceinline Vec3fx operator /( const Vec3fx& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }
582 __forceinline Vec3fx operator /( const float a, const Vec3fx& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }
583
584 __forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.m128,b.m128); }
585 __forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.m128,b.m128); }
586
#if defined(__SSE4_1__) || defined(__aarch64__)
  /*! Minimum computed on the raw bit patterns: both operands are
   *  reinterpreted as packed signed 32-bit integers, compared with
   *  _mm_min_epi32, and the result is reinterpreted back as float. */
  __forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b)
  {
    const vint4 lhs_bits = _mm_castps_si128(a.m128);
    const vint4 rhs_bits = _mm_castps_si128(b.m128);
    const vint4 smaller  = _mm_min_epi32(lhs_bits, rhs_bits);
    return _mm_castsi128_ps(smaller);
  }

  /*! Maximum computed on the raw bit patterns: same scheme as mini(),
   *  using _mm_max_epi32. */
  __forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b)
  {
    const vint4 lhs_bits = _mm_castps_si128(a.m128);
    const vint4 rhs_bits = _mm_castps_si128(b.m128);
    const vint4 larger   = _mm_max_epi32(lhs_bits, rhs_bits);
    return _mm_castsi128_ps(larger);
  }
#endif
604
605 __forceinline Vec3fx pow ( const Vec3fx& a, const float& b ) {
606 return Vec3fx(powf(a.x,b),powf(a.y,b),powf(a.z,b));
607 }
608
609 ////////////////////////////////////////////////////////////////////////////////
610 /// Ternary Operators
611 ////////////////////////////////////////////////////////////////////////////////
612
613#if defined(__AVX2__)
614 __forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); }
615 __forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); }
616 __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); }
617 __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); }
618#else
619 __forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b+c; }
620 __forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b-c; }
621 __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b+c;}
622 __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b-c; }
623#endif
624
625 __forceinline Vec3fx madd ( const float a, const Vec3fx& b, const Vec3fx& c) { return madd(Vec3fx(a),b,c); }
626 __forceinline Vec3fx msub ( const float a, const Vec3fx& b, const Vec3fx& c) { return msub(Vec3fx(a),b,c); }
627 __forceinline Vec3fx nmadd ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmadd(Vec3fx(a),b,c); }
628 __forceinline Vec3fx nmsub ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmsub(Vec3fx(a),b,c); }
629
630 ////////////////////////////////////////////////////////////////////////////////
631 /// Assignment Operators
632 ////////////////////////////////////////////////////////////////////////////////
633
634 __forceinline Vec3fx& operator +=( Vec3fx& a, const Vec3fx& b ) { return a = a + b; }
635 __forceinline Vec3fx& operator -=( Vec3fx& a, const Vec3fx& b ) { return a = a - b; }
636 __forceinline Vec3fx& operator *=( Vec3fx& a, const Vec3fx& b ) { return a = a * b; }
637 __forceinline Vec3fx& operator *=( Vec3fx& a, const float b ) { return a = a * b; }
638 __forceinline Vec3fx& operator /=( Vec3fx& a, const Vec3fx& b ) { return a = a / b; }
639 __forceinline Vec3fx& operator /=( Vec3fx& a, const float b ) { return a = a / b; }
640
641 ////////////////////////////////////////////////////////////////////////////////
642 /// Reductions
643 ////////////////////////////////////////////////////////////////////////////////
644
645 __forceinline float reduce_add(const Vec3fx& v) {
646 const vfloat4 a(v.m128);
647 const vfloat4 b = shuffle<1>(a);
648 const vfloat4 c = shuffle<2>(a);
649 return _mm_cvtss_f32(a+b+c);
650 }
651
652 __forceinline float reduce_mul(const Vec3fx& v) { return v.x*v.y*v.z; }
653 __forceinline float reduce_min(const Vec3fx& v) { return min(v.x,v.y,v.z); }
654 __forceinline float reduce_max(const Vec3fx& v) { return max(v.x,v.y,v.z); }
655
656 ////////////////////////////////////////////////////////////////////////////////
657 /// Comparison Operators
658 ////////////////////////////////////////////////////////////////////////////////
659
660 __forceinline bool operator ==( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; }
661 __forceinline bool operator !=( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; }
662
663 __forceinline Vec3ba eq_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpeq_ps (a.m128, b.m128); }
664 __forceinline Vec3ba neq_mask(const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpneq_ps(a.m128, b.m128); }
665 __forceinline Vec3ba lt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmplt_ps (a.m128, b.m128); }
666 __forceinline Vec3ba le_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmple_ps (a.m128, b.m128); }
667 __forceinline Vec3ba gt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnle_ps(a.m128, b.m128); }
668 __forceinline Vec3ba ge_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); }
669
670 __forceinline bool isvalid ( const Vec3fx& v ) {
671 return all(gt_mask(v,Vec3fx(-FLT_LARGE)) & lt_mask(v,Vec3fx(+FLT_LARGE)));
672 }
673
674 __forceinline bool is_finite ( const Vec3fx& a ) {
675 return all(ge_mask(a,Vec3fx(-FLT_MAX)) & le_mask(a,Vec3fx(+FLT_MAX)));
676 }
677
678 __forceinline bool isvalid4 ( const Vec3fx& v ) {
679 return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE)));
680 }
681
682 __forceinline bool is_finite4 ( const Vec3fx& a ) {
683 return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX)));
684 }
685
686 ////////////////////////////////////////////////////////////////////////////////
687 /// Euclidean Space Operators
688 ////////////////////////////////////////////////////////////////////////////////
689
690#if defined(__SSE4_1__)
691 __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) {
692 return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F));
693 }
694#else
695 __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) {
696 return reduce_add(a*b);
697 }
698#endif
699
700 __forceinline Vec3fx cross ( const Vec3fx& a, const Vec3fx& b )
701 {
702 vfloat4 a0 = vfloat4(a.m128);
703 vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128));
704 vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128));
705 vfloat4 b1 = vfloat4(b.m128);
706 return Vec3fx(shuffle<1,2,0,3>(msub(a0,b0,a1*b1)));
707 }
708
709 __forceinline float sqr_length ( const Vec3fx& a ) { return dot(a,a); }
710 __forceinline float rcp_length ( const Vec3fx& a ) { return rsqrt(dot(a,a)); }
711 __forceinline float rcp_length2( const Vec3fx& a ) { return rcp(dot(a,a)); }
712 __forceinline float length ( const Vec3fx& a ) { return sqrt(dot(a,a)); }
713 __forceinline Vec3fx normalize( const Vec3fx& a ) { return a*rsqrt(dot(a,a)); }
714 __forceinline float distance ( const Vec3fx& a, const Vec3fx& b ) { return length(a-b); }
715 __forceinline float halfArea ( const Vec3fx& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); }
716 __forceinline float area ( const Vec3fx& d ) { return 2.0f*halfArea(d); }
717
718 __forceinline Vec3fx normalize_safe( const Vec3fx& a ) {
719 const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d);
720 }
721
722 /*! differentiated normalization */
723 __forceinline Vec3fx dnormalize(const Vec3fx& p, const Vec3fx& dp)
724 {
725 const float pp = dot(p,p);
726 const float pdp = dot(p,dp);
727 return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp);
728 }
729
730 ////////////////////////////////////////////////////////////////////////////////
731 /// Select
732 ////////////////////////////////////////////////////////////////////////////////
733
734 __forceinline Vec3fx select( bool s, const Vec3fx& t, const Vec3fx& f ) {
735 __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();
736 return blendv_ps(f.m128, t.m128, mask);
737 }
738
739 __forceinline Vec3fx select( const Vec3ba& s, const Vec3fx& t, const Vec3fx& f ) {
740 return blendv_ps(f.m128, t.m128, s);
741 }
742
743 __forceinline Vec3fx lerp(const Vec3fx& v0, const Vec3fx& v1, const float t) {
744 return madd(1.0f-t,v0,t*v1);
745 }
746
747 __forceinline int maxDim ( const Vec3fx& a )
748 {
749 const Vec3fx b = abs(a);
750 if (b.x > b.y) {
751 if (b.x > b.z) return 0; else return 2;
752 } else {
753 if (b.y > b.z) return 1; else return 2;
754 }
755 }
756
757 ////////////////////////////////////////////////////////////////////////////////
758 /// Rounding Functions
759 ////////////////////////////////////////////////////////////////////////////////
760
761#if defined(__aarch64__)
762 __forceinline Vec3fx trunc(const Vec3fx& a) { return vrndq_f32(a.m128); }
763 __forceinline Vec3fx floor(const Vec3fx& a) { return vrndmq_f32(a.m128); }
764 __forceinline Vec3fx ceil (const Vec3fx& a) { return vrndpq_f32(a.m128); }
765#elif defined (__SSE4_1__)
766 __forceinline Vec3fx trunc( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); }
767 __forceinline Vec3fx floor( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); }
768 __forceinline Vec3fx ceil ( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); }
769#else
770 __forceinline Vec3fx trunc( const Vec3fx& a ) { return Vec3fx(truncf(a.x),truncf(a.y),truncf(a.z)); }
771 __forceinline Vec3fx floor( const Vec3fx& a ) { return Vec3fx(floorf(a.x),floorf(a.y),floorf(a.z)); }
772 __forceinline Vec3fx ceil ( const Vec3fx& a ) { return Vec3fx(ceilf (a.x),ceilf (a.y),ceilf (a.z)); }
773#endif
774
775 ////////////////////////////////////////////////////////////////////////////////
776 /// Output Operators
777 ////////////////////////////////////////////////////////////////////////////////
778
779 __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fx& a) {
780 return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
781 }
782
783
784 typedef Vec3fx Vec3ff;
785}
786