// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "../sys/alloc.h"
#include "math.h"
#include "../simd/sse.h"

namespace embree
{
  ////////////////////////////////////////////////////////////////////////////////
  /// SSE Vec2fa Type
  ////////////////////////////////////////////////////////////////////////////////

  struct __aligned(16) Vec2fa
  {
    ALIGNED_STRUCT_(16);

    typedef float Scalar;
    enum { N = 2 };
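    // The vector occupies a full SSE register; az and aw are padding lanes
    // that keep the struct 16 bytes wide and 16-byte aligned.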
    union {
      __m128 m128;
      struct { float x,y,az,aw; };
    };

    ////////////////////////////////////////////////////////////////////////////////
    /// Constructors, Assignment & Cast Operators
    ////////////////////////////////////////////////////////////////////////////////

    __forceinline Vec2fa( ) {}
    __forceinline Vec2fa( const __m128 a ) : m128(a) {}

    __forceinline Vec2fa            ( const Vec2<float>& other ) { x = other.x; y = other.y; }
    __forceinline Vec2fa& operator =( const Vec2<float>& other ) { x = other.x; y = other.y; return *this; }

    __forceinline Vec2fa            ( const Vec2fa& other ) { m128 = other.m128; }
    __forceinline Vec2fa& operator =( const Vec2fa& other ) { m128 = other.m128; return *this; }

    __forceinline explicit Vec2fa( const float a ) : m128(_mm_set1_ps(a)) {}
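    // _mm_set_ps lists lanes from highest to lowest, so the low two lanes hold
    // x and y; the padding lanes are filled with y.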
    __forceinline Vec2fa( const float x, const float y) : m128(_mm_set_ps(y, y, y, x)) {}

    __forceinline explicit Vec2fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}

    __forceinline operator const __m128&() const { return m128; }
    __forceinline operator       __m128&()       { return m128; }

    ////////////////////////////////////////////////////////////////////////////////
    /// Loads and Stores
    ////////////////////////////////////////////////////////////////////////////////

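    // Both loads read a full 16 bytes and mask off the upper two lanes so the
    // padding holds well-defined values; load() additionally requires a
    // 16-byte aligned pointer.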
    static __forceinline Vec2fa load( const void* const a ) {
      return Vec2fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1))));
    }

    static __forceinline Vec2fa loadu( const void* const a ) {
      return Vec2fa(_mm_and_ps(_mm_loadu_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1))));
    }

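    // Note: writes all 16 bytes, so the destination must have room for four floats.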
    static __forceinline void storeu ( void* ptr, const Vec2fa& v ) {
      _mm_storeu_ps((float*)ptr,v);
    }

    ////////////////////////////////////////////////////////////////////////////////
    /// Constants
    ////////////////////////////////////////////////////////////////////////////////

    __forceinline Vec2fa( ZeroTy   ) : m128(_mm_setzero_ps()) {}
    __forceinline Vec2fa( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}
    __forceinline Vec2fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
    __forceinline Vec2fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}

    ////////////////////////////////////////////////////////////////////////////////
    /// Array Access
    ////////////////////////////////////////////////////////////////////////////////

    __forceinline const float& operator []( const size_t index ) const { assert(index < 2); return (&x)[index]; }
    __forceinline       float& operator []( const size_t index )       { assert(index < 2); return (&x)[index]; }
  };

  ////////////////////////////////////////////////////////////////////////////////
  /// Unary Operators
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline Vec2fa operator +( const Vec2fa& a ) { return a; }
  __forceinline Vec2fa operator -( const Vec2fa& a ) {
    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
    return _mm_xor_ps(a.m128, mask);
  }
  __forceinline Vec2fa abs  ( const Vec2fa& a ) {
    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
    return _mm_and_ps(a.m128, mask);
  }
  __forceinline Vec2fa sign ( const Vec2fa& a ) {
    return blendv_ps(Vec2fa(one), -Vec2fa(one), _mm_cmplt_ps (a,Vec2fa(zero)));
  }

  __forceinline Vec2fa rcp  ( const Vec2fa& a )
  {
#if defined(__aarch64__)
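    // Two Newton-Raphson refinement steps: vrecpsq_f32(a, r) computes 2 - a*r,
    // and r' = r * (2 - a*r) roughly doubles the accurate bits per step.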
    __m128 reciprocal = _mm_rcp_ps(a.m128);
    reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
    reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
    return Vec2fa(reciprocal);
#else
#if defined(__AVX512VL__)
    const Vec2fa r = _mm_rcp14_ps(a.m128);
#else
    const Vec2fa r = _mm_rcp_ps(a.m128);
#endif

#if defined(__AVX2__)
    const Vec2fa h_n = _mm_fnmadd_ps(a, r, vfloat4(1.0f));           // First, compute 1 - a * r (which will be very close to 0)
    const Vec2fa res = _mm_fmadd_ps(r, h_n, r);                      // Then compute r + r * h_n
#else
    const Vec2fa h_n = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a, r));  // First, compute 1 - a * r (which will be very close to 0)
    const Vec2fa res = _mm_add_ps(r,_mm_mul_ps(r, h_n));             // Then compute r + r * h_n
#endif

    return res;
#endif //defined(__aarch64__)
  }

  __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); }
  __forceinline Vec2fa sqr  ( const Vec2fa& a ) { return _mm_mul_ps(a,a); }

  __forceinline Vec2fa rsqrt( const Vec2fa& a )
  {
#if defined(__aarch64__)
    __m128 r = _mm_rsqrt_ps(a.m128);
    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
    return r;
#else

#if defined(__AVX512VL__)
    __m128 r = _mm_rsqrt14_ps(a.m128);
#else
    __m128 r = _mm_rsqrt_ps(a.m128);
#endif
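    // One Newton-Raphson step: r' = 1.5*r - 0.5*a*r^3, which roughly doubles
    // the accurate bits of the hardware estimate.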
    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));

#endif
  }

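  // Clamps components whose magnitude is below min_rcp_input, so the
  // reciprocal in rcp_safe() cannot blow up on (near-)zero input.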
  __forceinline Vec2fa zero_fix(const Vec2fa& a) {
    return blendv_ps(a, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));
  }
  __forceinline Vec2fa rcp_safe(const Vec2fa& a) {
    return rcp(zero_fix(a));
  }
  __forceinline Vec2fa log ( const Vec2fa& a ) {
    return Vec2fa(logf(a.x),logf(a.y));
  }

  __forceinline Vec2fa exp ( const Vec2fa& a ) {
    return Vec2fa(expf(a.x),expf(a.y));
  }

  ////////////////////////////////////////////////////////////////////////////////
  /// Binary Operators
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline Vec2fa operator +( const Vec2fa& a, const Vec2fa& b ) { return _mm_add_ps(a.m128, b.m128); }
  __forceinline Vec2fa operator -( const Vec2fa& a, const Vec2fa& b ) { return _mm_sub_ps(a.m128, b.m128); }
  __forceinline Vec2fa operator *( const Vec2fa& a, const Vec2fa& b ) { return _mm_mul_ps(a.m128, b.m128); }
  __forceinline Vec2fa operator *( const Vec2fa& a, const float b ) { return a * Vec2fa(b); }
  __forceinline Vec2fa operator *( const float a, const Vec2fa& b ) { return Vec2fa(a) * b; }
  __forceinline Vec2fa operator /( const Vec2fa& a, const Vec2fa& b ) { return _mm_div_ps(a.m128,b.m128); }
  __forceinline Vec2fa operator /( const Vec2fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }
  __forceinline Vec2fa operator /( const float a, const Vec2fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }

  __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); }
  __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); }

#if defined(__aarch64__) || defined(__SSE4_1__)
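  // Integer min on the float bit patterns: exact for non-negative inputs,
  // whose IEEE-754 encodings order the same way as signed integers.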
  __forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) {
    const vint4 ai = _mm_castps_si128(a);
    const vint4 bi = _mm_castps_si128(b);
    const vint4 ci = _mm_min_epi32(ai,bi);
    return _mm_castsi128_ps(ci);
  }
#endif

#if defined(__aarch64__) || defined(__SSE4_1__)
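  // Same bit-pattern trick as mini(); exact for non-negative inputs.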
  __forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) {
    const vint4 ai = _mm_castps_si128(a);
    const vint4 bi = _mm_castps_si128(b);
    const vint4 ci = _mm_max_epi32(ai,bi);
    return _mm_castsi128_ps(ci);
  }
#endif

  __forceinline Vec2fa pow ( const Vec2fa& a, const float& b ) {
    return Vec2fa(powf(a.x,b),powf(a.y,b));
  }

  ////////////////////////////////////////////////////////////////////////////////
  /// Ternary Operators
  ////////////////////////////////////////////////////////////////////////////////

#if defined(__AVX2__)
  __forceinline Vec2fa madd  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmadd_ps(a,b,c); }
  __forceinline Vec2fa msub  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmsub_ps(a,b,c); }
  __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmadd_ps(a,b,c); }
  __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmsub_ps(a,b,c); }
#else
  __forceinline Vec2fa madd  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b+c; }
  __forceinline Vec2fa msub  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b-c; }
  __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b+c; }
  __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b-c; }
#endif

  __forceinline Vec2fa madd  ( const float a, const Vec2fa& b, const Vec2fa& c) { return madd(Vec2fa(a),b,c); }
  __forceinline Vec2fa msub  ( const float a, const Vec2fa& b, const Vec2fa& c) { return msub(Vec2fa(a),b,c); }
  __forceinline Vec2fa nmadd ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmadd(Vec2fa(a),b,c); }
  __forceinline Vec2fa nmsub ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmsub(Vec2fa(a),b,c); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Assignment Operators
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline Vec2fa& operator +=( Vec2fa& a, const Vec2fa& b ) { return a = a + b; }
  __forceinline Vec2fa& operator -=( Vec2fa& a, const Vec2fa& b ) { return a = a - b; }
  __forceinline Vec2fa& operator *=( Vec2fa& a, const Vec2fa& b ) { return a = a * b; }
  __forceinline Vec2fa& operator *=( Vec2fa& a, const float   b ) { return a = a * b; }
  __forceinline Vec2fa& operator /=( Vec2fa& a, const Vec2fa& b ) { return a = a / b; }
  __forceinline Vec2fa& operator /=( Vec2fa& a, const float   b ) { return a = a / b; }

  ////////////////////////////////////////////////////////////////////////////////
  /// Reductions
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline float reduce_add(const Vec2fa& v) { return v.x+v.y; }
  __forceinline float reduce_mul(const Vec2fa& v) { return v.x*v.y; }
  __forceinline float reduce_min(const Vec2fa& v) { return min(v.x,v.y); }
  __forceinline float reduce_max(const Vec2fa& v) { return max(v.x,v.y); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Comparison Operators
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline bool operator ==( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 3) == 3; }
  __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 3) != 0; }

  ////////////////////////////////////////////////////////////////////////////////
  /// Euclidean Space Operators
  ////////////////////////////////////////////////////////////////////////////////

#if defined(__SSE4_1__)
  __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) {
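    // 0x3F: the high nibble (0x3) multiplies only the lower two lanes,
    // the low nibble (0xF) broadcasts the sum to all four lanes.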
    return _mm_cvtss_f32(_mm_dp_ps(a,b,0x3F));
  }
#else
  __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) {
    return reduce_add(a*b);
  }
#endif

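  // 2D analog of the cross product: the counter-clockwise perpendicular of a.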
  __forceinline Vec2fa cross ( const Vec2fa& a ) {
    return Vec2fa(-a.y,a.x);
  }

  __forceinline float  sqr_length ( const Vec2fa& a )                  { return dot(a,a); }
  __forceinline float  rcp_length ( const Vec2fa& a )                  { return rsqrt(dot(a,a)); }
  __forceinline float  rcp_length2( const Vec2fa& a )                  { return rcp(dot(a,a)); }
  __forceinline float  length     ( const Vec2fa& a )                  { return sqrt(dot(a,a)); }
  __forceinline Vec2fa normalize  ( const Vec2fa& a )                  { return a*rsqrt(dot(a,a)); }
  __forceinline float  distance   ( const Vec2fa& a, const Vec2fa& b ) { return length(a-b); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Select
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline Vec2fa select( bool s, const Vec2fa& t, const Vec2fa& f ) {
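    // Expand the bool into an all-ones or all-zeros lane mask, then blend.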
    __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();
    return blendv_ps(f, t, mask);
  }

  __forceinline Vec2fa lerp(const Vec2fa& v0, const Vec2fa& v1, const float t) {
    return madd(1.0f-t,v0,t*v1);
  }

  __forceinline int maxDim ( const Vec2fa& a )
  {
    const Vec2fa b = abs(a);
    if (b.x > b.y) return 0;
    else return 1;
  }

  ////////////////////////////////////////////////////////////////////////////////
  /// Rounding Functions
  ////////////////////////////////////////////////////////////////////////////////

#if defined(__aarch64__)
  //__forceinline Vec2fa trunc(const Vec2fa& a) { return vrndq_f32(a); }
  __forceinline Vec2fa floor(const Vec2fa& a) { return vrndmq_f32(a); }
  __forceinline Vec2fa ceil (const Vec2fa& a) { return vrndpq_f32(a); }
#elif defined (__SSE4_1__)
  //__forceinline Vec2fa trunc( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_ZERO); }
  __forceinline Vec2fa floor( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); }
  __forceinline Vec2fa ceil ( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); }
#else
  //__forceinline Vec2fa trunc( const Vec2fa& a ) { return Vec2fa(truncf(a.x),truncf(a.y)); }
  __forceinline Vec2fa floor( const Vec2fa& a ) { return Vec2fa(floorf(a.x),floorf(a.y)); }
  __forceinline Vec2fa ceil ( const Vec2fa& a ) { return Vec2fa(ceilf (a.x),ceilf (a.y)); }
#endif

  ////////////////////////////////////////////////////////////////////////////////
  /// Output Operators
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2fa& a) {
    return cout << "(" << a.x << ", " << a.y << ")";
  }
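
  // Illustrative usage sketch (hypothetical values, not part of the API):
  //   Vec2fa a(1.0f, 2.0f), b(3.0f, 4.0f);
  //   float  d = dot(a, b);         // 1*3 + 2*4 = 11
  //   Vec2fa n = normalize(b - a);  // unit vector pointing from a towards b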

  typedef Vec2fa Vec2fa_t;
}