1// Copyright 2009-2021 Intel Corporation
2// SPDX-License-Identifier: Apache-2.0
3
4#pragma once
5
6#include "../sys/alloc.h"
7#include "math.h"
8#include "../simd/sse.h"
9
10namespace embree
11{
12 ////////////////////////////////////////////////////////////////////////////////
13 /// SSE Vec2fa Type
14 ////////////////////////////////////////////////////////////////////////////////
15
16 struct __aligned(16) Vec2fa
17 {
18 ALIGNED_STRUCT_(16);
19
20 typedef float Scalar;
21 enum { N = 2 };
22 union {
23 __m128 m128;
24 struct { float x,y,az,aw; };
25 };
26
27 ////////////////////////////////////////////////////////////////////////////////
28 /// Constructors, Assignment & Cast Operators
29 ////////////////////////////////////////////////////////////////////////////////
30
31 __forceinline Vec2fa( ) {}
32 __forceinline Vec2fa( const __m128 a ) : m128(a) {}
33
34 __forceinline Vec2fa ( const Vec2<float>& other ) { x = other.x; y = other.y; }
35 __forceinline Vec2fa& operator =( const Vec2<float>& other ) { x = other.x; y = other.y; return *this; }
36
37 __forceinline Vec2fa ( const Vec2fa& other ) { m128 = other.m128; }
38 __forceinline Vec2fa& operator =( const Vec2fa& other ) { m128 = other.m128; return *this; }
39
40 __forceinline explicit Vec2fa( const float a ) : m128(_mm_set1_ps(a)) {}
41 __forceinline Vec2fa( const float x, const float y) : m128(_mm_set_ps(y, y, y, x)) {}
42
43 __forceinline explicit Vec2fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}
44
45 __forceinline operator const __m128&() const { return m128; }
46 __forceinline operator __m128&() { return m128; }
47
48 ////////////////////////////////////////////////////////////////////////////////
49 /// Loads and Stores
50 ////////////////////////////////////////////////////////////////////////////////
51
52 static __forceinline Vec2fa load( const void* const a ) {
53 return Vec2fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1))));
54 }
55
56 static __forceinline Vec2fa loadu( const void* const a ) {
57 return Vec2fa(_mm_and_ps(_mm_loadu_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1))));
58 }
59
60 static __forceinline void storeu ( void* ptr, const Vec2fa& v ) {
61 _mm_storeu_ps((float*)ptr,v);
62 }
63
64 ////////////////////////////////////////////////////////////////////////////////
65 /// Constants
66 ////////////////////////////////////////////////////////////////////////////////
67
68 __forceinline Vec2fa( ZeroTy ) : m128(_mm_setzero_ps()) {}
69 __forceinline Vec2fa( OneTy ) : m128(_mm_set1_ps(1.0f)) {}
70 __forceinline Vec2fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
71 __forceinline Vec2fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
72
73 ////////////////////////////////////////////////////////////////////////////////
74 /// Array Access
75 ////////////////////////////////////////////////////////////////////////////////
76
77 __forceinline const float& operator []( const size_t index ) const { assert(index < 2); return (&x)[index]; }
78 __forceinline float& operator []( const size_t index ) { assert(index < 2); return (&x)[index]; }
79 };
80
81 ////////////////////////////////////////////////////////////////////////////////
82 /// Unary Operators
83 ////////////////////////////////////////////////////////////////////////////////
84
85 __forceinline Vec2fa operator +( const Vec2fa& a ) { return a; }
86 __forceinline Vec2fa operator -( const Vec2fa& a ) {
87 const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
88 return _mm_xor_ps(a.m128, mask);
89 }
90 __forceinline Vec2fa abs ( const Vec2fa& a ) {
91 const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
92 return _mm_and_ps(a.m128, mask);
93 }
94 __forceinline Vec2fa sign ( const Vec2fa& a ) {
95 return blendv_ps(Vec2fa(one), -Vec2fa(one), _mm_cmplt_ps (a,Vec2fa(zero)));
96 }
97
98 __forceinline Vec2fa rcp ( const Vec2fa& a )
99 {
100#if defined(__aarch64__)
101 __m128 reciprocal = _mm_rcp_ps(a.m128);
102 reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
103 reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
104 return (const Vec2fa)reciprocal;
105#else
106#if defined(__AVX512VL__)
107 const Vec2fa r = _mm_rcp14_ps(a.m128);
108#else
109 const Vec2fa r = _mm_rcp_ps(a.m128);
110#endif
111
112#if defined(__AVX2__)
113 const Vec2fa h_n = _mm_fnmadd_ps(a, r, vfloat4(1.0)); // First, compute 1 - a * r (which will be very close to 0)
114 const Vec2fa res = _mm_fmadd_ps(r, h_n, r); // Then compute r + r * h_n
115#else
116 const Vec2fa h_n = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a, r)); // First, compute 1 - a * r (which will be very close to 0)
117 const Vec2fa res = _mm_add_ps(r,_mm_mul_ps(r, h_n)); // Then compute r + r * h_n
118#endif
119
120 return res;
121#endif //defined(__aarch64__)
122 }
123
124 __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); }
125 __forceinline Vec2fa sqr ( const Vec2fa& a ) { return _mm_mul_ps(a,a); }
126
127 __forceinline Vec2fa rsqrt( const Vec2fa& a )
128 {
129#if defined(__aarch64__)
130 __m128 r = _mm_rsqrt_ps(a.m128);
131 r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
132 r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
133 return r;
134#else
135
136#if defined(__AVX512VL__)
137 __m128 r = _mm_rsqrt14_ps(a.m128);
138#else
139 __m128 r = _mm_rsqrt_ps(a.m128);
140#endif
141 return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
142
143#endif
144 }
145
146 __forceinline Vec2fa zero_fix(const Vec2fa& a) {
147 return blendv_ps(a, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));
148 }
149 __forceinline Vec2fa rcp_safe(const Vec2fa& a) {
150 return rcp(zero_fix(a));
151 }
152 __forceinline Vec2fa log ( const Vec2fa& a ) {
153 return Vec2fa(logf(a.x),logf(a.y));
154 }
155
156 __forceinline Vec2fa exp ( const Vec2fa& a ) {
157 return Vec2fa(expf(a.x),expf(a.y));
158 }
159
160 ////////////////////////////////////////////////////////////////////////////////
161 /// Binary Operators
162 ////////////////////////////////////////////////////////////////////////////////
163
164 __forceinline Vec2fa operator +( const Vec2fa& a, const Vec2fa& b ) { return _mm_add_ps(a.m128, b.m128); }
165 __forceinline Vec2fa operator -( const Vec2fa& a, const Vec2fa& b ) { return _mm_sub_ps(a.m128, b.m128); }
166 __forceinline Vec2fa operator *( const Vec2fa& a, const Vec2fa& b ) { return _mm_mul_ps(a.m128, b.m128); }
167 __forceinline Vec2fa operator *( const Vec2fa& a, const float b ) { return a * Vec2fa(b); }
168 __forceinline Vec2fa operator *( const float a, const Vec2fa& b ) { return Vec2fa(a) * b; }
169 __forceinline Vec2fa operator /( const Vec2fa& a, const Vec2fa& b ) { return _mm_div_ps(a.m128,b.m128); }
170 __forceinline Vec2fa operator /( const Vec2fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }
171 __forceinline Vec2fa operator /( const float a, const Vec2fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }
172
173 __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); }
174 __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); }
175
#if defined(__aarch64__) || defined(__SSE4_1__)
// Lane-wise minimum computed on the raw bit patterns as signed 32-bit integers.
// NOTE(review): integer ordering of float bits matches float ordering only for
// non-negative values - presumably callers rely on that; confirm.
__forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b)
{
  const vint4 lhs_bits = _mm_castps_si128(a.m128);
  const vint4 rhs_bits = _mm_castps_si128(b.m128);
  const vint4 smaller  = _mm_min_epi32(lhs_bits, rhs_bits);
  return Vec2fa(_mm_castsi128_ps(smaller));
}
#endif
184
#if defined(__aarch64__) || defined(__SSE4_1__)
// Lane-wise maximum computed on the raw bit patterns as signed 32-bit integers.
// NOTE(review): integer ordering of float bits matches float ordering only for
// non-negative values - presumably callers rely on that; confirm.
__forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b)
{
  const vint4 lhs_bits = _mm_castps_si128(a.m128);
  const vint4 rhs_bits = _mm_castps_si128(b.m128);
  const vint4 larger   = _mm_max_epi32(lhs_bits, rhs_bits);
  return Vec2fa(_mm_castsi128_ps(larger));
}
#endif
193
194 __forceinline Vec2fa pow ( const Vec2fa& a, const float& b ) {
195 return Vec2fa(powf(a.x,b),powf(a.y,b));
196 }
197
198 ////////////////////////////////////////////////////////////////////////////////
199 /// Ternary Operators
200 ////////////////////////////////////////////////////////////////////////////////
201
202#if defined(__AVX2__)
203 __forceinline Vec2fa madd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmadd_ps(a,b,c); }
204 __forceinline Vec2fa msub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmsub_ps(a,b,c); }
205 __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmadd_ps(a,b,c); }
206 __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmsub_ps(a,b,c); }
207#else
208 __forceinline Vec2fa madd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b+c; }
209 __forceinline Vec2fa msub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b-c; }
210 __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b+c;}
211 __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b-c; }
212#endif
213
214 __forceinline Vec2fa madd ( const float a, const Vec2fa& b, const Vec2fa& c) { return madd(Vec2fa(a),b,c); }
215 __forceinline Vec2fa msub ( const float a, const Vec2fa& b, const Vec2fa& c) { return msub(Vec2fa(a),b,c); }
216 __forceinline Vec2fa nmadd ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmadd(Vec2fa(a),b,c); }
217 __forceinline Vec2fa nmsub ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmsub(Vec2fa(a),b,c); }
218
219 ////////////////////////////////////////////////////////////////////////////////
220 /// Assignment Operators
221 ////////////////////////////////////////////////////////////////////////////////
222
223 __forceinline Vec2fa& operator +=( Vec2fa& a, const Vec2fa& b ) { return a = a + b; }
224 __forceinline Vec2fa& operator -=( Vec2fa& a, const Vec2fa& b ) { return a = a - b; }
225 __forceinline Vec2fa& operator *=( Vec2fa& a, const Vec2fa& b ) { return a = a * b; }
226 __forceinline Vec2fa& operator *=( Vec2fa& a, const float b ) { return a = a * b; }
227 __forceinline Vec2fa& operator /=( Vec2fa& a, const Vec2fa& b ) { return a = a / b; }
228 __forceinline Vec2fa& operator /=( Vec2fa& a, const float b ) { return a = a / b; }
229
230 ////////////////////////////////////////////////////////////////////////////////
231 /// Reductions
232 ////////////////////////////////////////////////////////////////////////////////
233
234 __forceinline float reduce_add(const Vec2fa& v) { return v.x+v.y; }
235 __forceinline float reduce_mul(const Vec2fa& v) { return v.x*v.y; }
236 __forceinline float reduce_min(const Vec2fa& v) { return min(v.x,v.y); }
237 __forceinline float reduce_max(const Vec2fa& v) { return max(v.x,v.y); }
238
239 ////////////////////////////////////////////////////////////////////////////////
240 /// Comparison Operators
241 ////////////////////////////////////////////////////////////////////////////////
242
243 __forceinline bool operator ==( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 3) == 3; }
244 __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 3) != 0; }
245
246 ////////////////////////////////////////////////////////////////////////////////
247 /// Euclidean Space Operators
248 ////////////////////////////////////////////////////////////////////////////////
249
250#if defined(__SSE4_1__)
251 __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) {
252 return _mm_cvtss_f32(_mm_dp_ps(a,b,0x3F));
253 }
254#else
255 __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) {
256 return reduce_add(a*b);
257 }
258#endif
259
260 __forceinline Vec2fa cross ( const Vec2fa& a ) {
261 return Vec2fa(-a.y,a.x);
262 }
263
264 __forceinline float sqr_length ( const Vec2fa& a ) { return dot(a,a); }
265 __forceinline float rcp_length ( const Vec2fa& a ) { return rsqrt(dot(a,a)); }
266 __forceinline float rcp_length2( const Vec2fa& a ) { return rcp(dot(a,a)); }
267 __forceinline float length ( const Vec2fa& a ) { return sqrt(dot(a,a)); }
268 __forceinline Vec2fa normalize( const Vec2fa& a ) { return a*rsqrt(dot(a,a)); }
269 __forceinline float distance ( const Vec2fa& a, const Vec2fa& b ) { return length(a-b); }
270
271 ////////////////////////////////////////////////////////////////////////////////
272 /// Select
273 ////////////////////////////////////////////////////////////////////////////////
274
275 __forceinline Vec2fa select( bool s, const Vec2fa& t, const Vec2fa& f ) {
276 __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();
277 return blendv_ps(f, t, mask);
278 }
279
280 __forceinline Vec2fa lerp(const Vec2fa& v0, const Vec2fa& v1, const float t) {
281 return madd(1.0f-t,v0,t*v1);
282 }
283
284 __forceinline int maxDim ( const Vec2fa& a )
285 {
286 const Vec2fa b = abs(a);
287 if (b.x > b.y) return 0;
288 else return 1;
289 }
290
291 ////////////////////////////////////////////////////////////////////////////////
292 /// Rounding Functions
293 ////////////////////////////////////////////////////////////////////////////////
294
295#if defined(__aarch64__)
296 //__forceinline Vec2fa trunc(const Vec2fa& a) { return vrndq_f32(a); }
297 __forceinline Vec2fa floor(const Vec2fa& a) { return vrndmq_f32(a); }
298 __forceinline Vec2fa ceil (const Vec2fa& a) { return vrndpq_f32(a); }
299#elif defined (__SSE4_1__)
300 //__forceinline Vec2fa trunc( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
301 __forceinline Vec2fa floor( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); }
302 __forceinline Vec2fa ceil ( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); }
303#else
304 //__forceinline Vec2fa trunc( const Vec2fa& a ) { return Vec2fa(truncf(a.x),truncf(a.y),truncf(a.z)); }
305 __forceinline Vec2fa floor( const Vec2fa& a ) { return Vec2fa(floorf(a.x),floorf(a.y)); }
306 __forceinline Vec2fa ceil ( const Vec2fa& a ) { return Vec2fa(ceilf (a.x),ceilf (a.y)); }
307#endif
308
309 ////////////////////////////////////////////////////////////////////////////////
310 /// Output Operators
311 ////////////////////////////////////////////////////////////////////////////////
312
313 __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2fa& a) {
314 return cout << "(" << a.x << ", " << a.y << ")";
315 }
316
317 typedef Vec2fa Vec2fa_t;
318}
319