1 | // Copyright 2009-2021 Intel Corporation |
2 | // SPDX-License-Identifier: Apache-2.0 |
3 | |
4 | #pragma once |
5 | |
6 | #include "../sys/alloc.h" |
7 | #include "math.h" |
8 | #include "../simd/sse.h" |
9 | |
10 | namespace embree |
11 | { |
12 | //////////////////////////////////////////////////////////////////////////////// |
13 | /// SSE Vec3fa Type |
14 | //////////////////////////////////////////////////////////////////////////////// |
15 | |
16 | struct __aligned(16) Vec3fa |
17 | { |
18 | ALIGNED_STRUCT_(16); |
19 | |
20 | typedef float Scalar; |
21 | enum { N = 3 }; |
22 | union { |
23 | __m128 m128; |
24 | struct { float x,y,z; }; |
25 | }; |
26 | |
27 | //////////////////////////////////////////////////////////////////////////////// |
28 | /// Constructors, Assignment & Cast Operators |
29 | //////////////////////////////////////////////////////////////////////////////// |
30 | |
31 | __forceinline Vec3fa( ) {} |
32 | __forceinline Vec3fa( const __m128 a ) : m128(a) {} |
33 | |
34 | __forceinline Vec3fa ( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); } |
35 | //__forceinline Vec3fa& operator =( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; } |
36 | |
37 | __forceinline Vec3fa ( const Vec3fa& other ) { m128 = other.m128; } |
38 | __forceinline Vec3fa& operator =( const Vec3fa& other ) { m128 = other.m128; return *this; } |
39 | |
40 | __forceinline explicit Vec3fa( const float a ) : m128(_mm_set1_ps(a)) {} |
41 | __forceinline Vec3fa( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {} |
42 | |
43 | __forceinline explicit Vec3fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} |
44 | |
45 | __forceinline explicit operator const vfloat4() const { return vfloat4(m128); } |
46 | __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); } |
47 | __forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); } |
48 | __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); } |
49 | |
50 | //__forceinline operator const __m128&() const { return m128; } |
51 | //__forceinline operator __m128&() { return m128; } |
52 | |
53 | //////////////////////////////////////////////////////////////////////////////// |
54 | /// Loads and Stores |
55 | //////////////////////////////////////////////////////////////////////////////// |
56 | |
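    // Note: load() masks off the fourth lane so w is zero afterwards, while
    // loadu() and storeu() always touch a full 16 bytes; callers must ensure
    // that four floats are accessible at the given address even though only
    // three carry data.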
57 | static __forceinline Vec3fa load( const void* const a ) { |
58 | #if defined(__aarch64__) |
59 | __m128 t = _mm_load_ps((float*)a); |
60 | t[3] = 0.0f; |
61 | return Vec3fa(t); |
62 | #else |
63 | return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)))); |
64 | #endif |
65 | } |
66 | |
67 | static __forceinline Vec3fa loadu( const void* const a ) { |
68 | return Vec3fa(_mm_loadu_ps((float*)a)); |
69 | } |
70 | |
71 | static __forceinline void storeu ( void* ptr, const Vec3fa& v ) { |
72 | _mm_storeu_ps((float*)ptr,v.m128); |
73 | } |
74 | |
75 | //////////////////////////////////////////////////////////////////////////////// |
76 | /// Constants |
77 | //////////////////////////////////////////////////////////////////////////////// |
78 | |
79 | __forceinline Vec3fa( ZeroTy ) : m128(_mm_setzero_ps()) {} |
80 | __forceinline Vec3fa( OneTy ) : m128(_mm_set1_ps(1.0f)) {} |
81 | __forceinline Vec3fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} |
82 | __forceinline Vec3fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} |
83 | |
84 | //////////////////////////////////////////////////////////////////////////////// |
85 | /// Array Access |
86 | //////////////////////////////////////////////////////////////////////////////// |
87 | |
88 | __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } |
89 | __forceinline float& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } |
90 | }; |
91 | |
92 | //////////////////////////////////////////////////////////////////////////////// |
93 | /// Unary Operators |
94 | //////////////////////////////////////////////////////////////////////////////// |
95 | |
96 | __forceinline Vec3fa operator +( const Vec3fa& a ) { return a; } |
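  // Negation flips the IEEE-754 sign bit and abs() clears it: a single
  // bitwise operation on the whole register, padding lane included.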
97 | __forceinline Vec3fa operator -( const Vec3fa& a ) { |
98 | #if defined(__aarch64__) |
99 | return vnegq_f32(a.m128); |
100 | #else |
101 | const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); |
102 | return _mm_xor_ps(a.m128, mask); |
103 | #endif |
104 | } |
105 | __forceinline Vec3fa abs ( const Vec3fa& a ) { |
106 | #if defined(__aarch64__) |
107 | return _mm_abs_ps(a.m128); |
108 | #else |
109 | const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); |
110 | return _mm_and_ps(a.m128, mask); |
111 | #endif |
112 | } |
113 | __forceinline Vec3fa sign ( const Vec3fa& a ) { |
114 | return blendv_ps(Vec3fa(one).m128, (-Vec3fa(one)).m128, _mm_cmplt_ps (a.m128,Vec3fa(zero).m128)); |
115 | } |
116 | |
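  // rcp() refines the hardware reciprocal estimate (roughly 12 bits of
  // precision, 14 with AVX-512VL) with one Newton-Raphson step:
  // r' = r + r*(1 - a*r).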
117 | __forceinline Vec3fa rcp ( const Vec3fa& a ) |
118 | { |
119 | #if defined(__aarch64__) |
120 | return vdivq_f32(vdupq_n_f32(1.0f),a.m128); |
121 | #else |
122 | |
123 | #if defined(__AVX512VL__) |
124 | const Vec3fa r = _mm_rcp14_ps(a.m128); |
125 | #else |
126 | const Vec3fa r = _mm_rcp_ps(a.m128); |
127 | #endif |
128 | |
129 | #if defined(__AVX2__) |
    const Vec3fa h_n = _mm_fnmadd_ps(a.m128, r.m128, vfloat4(1.0f)); // First, compute 1 - a * r (which will be very close to 0)
131 | const Vec3fa res = _mm_fmadd_ps(r.m128, h_n.m128, r.m128); // Then compute r + r * h_n |
132 | #else |
133 | const Vec3fa h_n = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a.m128, r.m128)); // First, compute 1 - a * r (which will be very close to 0) |
134 | const Vec3fa res = _mm_add_ps(r.m128,_mm_mul_ps(r.m128, h_n.m128)); // Then compute r + r * h_n |
135 | #endif |
136 | |
137 | return res; |
138 | #endif //defined(__aarch64__) |
139 | } |
140 | |
141 | __forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.m128); } |
142 | __forceinline Vec3fa sqr ( const Vec3fa& a ) { return _mm_mul_ps(a.m128,a.m128); } |
143 | |
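  // rsqrt() refines the hardware estimate with one Newton-Raphson step,
  // r' = r*(1.5 - 0.5*a*r*r), expanded below as 1.5*r - 0.5*a*r^3.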
144 | __forceinline Vec3fa rsqrt( const Vec3fa& a ) |
145 | { |
146 | #if defined(__aarch64__) |
147 | __m128 r = _mm_rsqrt_ps(a.m128); |
148 | r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); |
149 | r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); |
150 | return r; |
151 | #else |
152 | |
153 | #if defined(__AVX512VL__) |
154 | __m128 r = _mm_rsqrt14_ps(a.m128); |
155 | #else |
156 | __m128 r = _mm_rsqrt_ps(a.m128); |
157 | #endif |
158 | return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); |
159 | #endif |
160 | } |
161 | |
162 | __forceinline Vec3fa zero_fix(const Vec3fa& a) { |
163 | return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input))); |
164 | } |
165 | __forceinline Vec3fa rcp_safe(const Vec3fa& a) { |
166 | return rcp(zero_fix(a)); |
167 | } |
168 | __forceinline Vec3fa log ( const Vec3fa& a ) { |
169 | return Vec3fa(logf(a.x),logf(a.y),logf(a.z)); |
170 | } |
171 | |
172 | __forceinline Vec3fa exp ( const Vec3fa& a ) { |
173 | return Vec3fa(expf(a.x),expf(a.y),expf(a.z)); |
174 | } |
175 | |
176 | //////////////////////////////////////////////////////////////////////////////// |
177 | /// Binary Operators |
178 | //////////////////////////////////////////////////////////////////////////////// |
179 | |
180 | __forceinline Vec3fa operator +( const Vec3fa& a, const Vec3fa& b ) { return _mm_add_ps(a.m128, b.m128); } |
181 | __forceinline Vec3fa operator -( const Vec3fa& a, const Vec3fa& b ) { return _mm_sub_ps(a.m128, b.m128); } |
182 | __forceinline Vec3fa operator *( const Vec3fa& a, const Vec3fa& b ) { return _mm_mul_ps(a.m128, b.m128); } |
183 | __forceinline Vec3fa operator *( const Vec3fa& a, const float b ) { return a * Vec3fa(b); } |
184 | __forceinline Vec3fa operator *( const float a, const Vec3fa& b ) { return Vec3fa(a) * b; } |
185 | __forceinline Vec3fa operator /( const Vec3fa& a, const Vec3fa& b ) { return _mm_div_ps(a.m128,b.m128); } |
186 | __forceinline Vec3fa operator /( const Vec3fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); } |
187 | __forceinline Vec3fa operator /( const float a, const Vec3fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); } |
188 | |
189 | __forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); } |
190 | __forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); } |
191 | |
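  // mini()/maxi() compare the raw bit patterns as signed integers, which
  // coincides with the floating-point ordering only for non-negative values;
  // they are cheaper than min()/max() but assume neither input is negative.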
192 | #if defined(__aarch64__) || defined(__SSE4_1__) |
193 | __forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) { |
194 | const vint4 ai = _mm_castps_si128(a.m128); |
195 | const vint4 bi = _mm_castps_si128(b.m128); |
196 | const vint4 ci = _mm_min_epi32(ai,bi); |
197 | return _mm_castsi128_ps(ci); |
198 | } |
199 | #endif |
200 | |
201 | #if defined(__aarch64__) || defined(__SSE4_1__) |
202 | __forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) { |
203 | const vint4 ai = _mm_castps_si128(a.m128); |
204 | const vint4 bi = _mm_castps_si128(b.m128); |
205 | const vint4 ci = _mm_max_epi32(ai,bi); |
206 | return _mm_castsi128_ps(ci); |
207 | } |
208 | #endif |
209 | |
210 | __forceinline Vec3fa pow ( const Vec3fa& a, const float& b ) { |
211 | return Vec3fa(powf(a.x,b),powf(a.y,b),powf(a.z,b)); |
212 | } |
213 | |
214 | //////////////////////////////////////////////////////////////////////////////// |
215 | /// Ternary Operators |
216 | //////////////////////////////////////////////////////////////////////////////// |
217 | |
218 | #if defined(__AVX2__) || defined(__ARM_NEON) |
219 | __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); } |
220 | __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); } |
221 | __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); } |
222 | __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); } |
223 | #else |
224 | __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; } |
225 | __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;} |
226 | __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; } |
227 | __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; } |
228 | #endif |
229 | |
230 | __forceinline Vec3fa madd ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); } |
231 | __forceinline Vec3fa msub ( const float a, const Vec3fa& b, const Vec3fa& c) { return msub(Vec3fa(a),b,c); } |
232 | __forceinline Vec3fa nmadd ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmadd(Vec3fa(a),b,c); } |
233 | __forceinline Vec3fa nmsub ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmsub(Vec3fa(a),b,c); } |
234 | |
235 | //////////////////////////////////////////////////////////////////////////////// |
236 | /// Assignment Operators |
237 | //////////////////////////////////////////////////////////////////////////////// |
238 | |
239 | __forceinline Vec3fa& operator +=( Vec3fa& a, const Vec3fa& b ) { return a = a + b; } |
240 | __forceinline Vec3fa& operator -=( Vec3fa& a, const Vec3fa& b ) { return a = a - b; } |
241 | __forceinline Vec3fa& operator *=( Vec3fa& a, const Vec3fa& b ) { return a = a * b; } |
242 | __forceinline Vec3fa& operator *=( Vec3fa& a, const float b ) { return a = a * b; } |
243 | __forceinline Vec3fa& operator /=( Vec3fa& a, const Vec3fa& b ) { return a = a / b; } |
244 | __forceinline Vec3fa& operator /=( Vec3fa& a, const float b ) { return a = a / b; } |
245 | |
246 | //////////////////////////////////////////////////////////////////////////////// |
247 | /// Reductions |
248 | //////////////////////////////////////////////////////////////////////////////// |
249 | #if defined(__aarch64__) |
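  // The fourth lane is not guaranteed to be zero (e.g. after loadu), so it
  // is cleared before the additive reduction and duplicated from lane 2
  // before the min/max reductions.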
250 | __forceinline float reduce_add(const Vec3fa& v) { |
251 | float32x4_t t = v.m128; |
252 | t[3] = 0.0f; |
253 | return vaddvq_f32(t); |
254 | } |
255 | |
256 | __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; } |
257 | __forceinline float reduce_min(const Vec3fa& v) { |
258 | float32x4_t t = v.m128; |
259 | t[3] = t[2]; |
260 | return vminvq_f32(t); |
261 | } |
262 | __forceinline float reduce_max(const Vec3fa& v) { |
263 | float32x4_t t = v.m128; |
264 | t[3] = t[2]; |
265 | return vmaxvq_f32(t); |
266 | } |
267 | #else |
268 | __forceinline float reduce_add(const Vec3fa& v) { |
269 | const vfloat4 a(v.m128); |
270 | const vfloat4 b = shuffle<1>(a); |
271 | const vfloat4 c = shuffle<2>(a); |
272 | return _mm_cvtss_f32(a+b+c); |
273 | } |
274 | |
275 | __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; } |
276 | __forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); } |
277 | __forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); } |
278 | #endif |
279 | |
280 | //////////////////////////////////////////////////////////////////////////////// |
281 | /// Comparison Operators |
282 | //////////////////////////////////////////////////////////////////////////////// |
283 | |
284 | __forceinline bool operator ==( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; } |
285 | __forceinline bool operator !=( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; } |
286 | |
287 | __forceinline Vec3ba eq_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpeq_ps (a.m128, b.m128); } |
288 | __forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); } |
289 | __forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.m128, b.m128); } |
290 | __forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); } |
291 | #if defined(__aarch64__) |
292 | __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpgt_ps (a.m128, b.m128); } |
293 | __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpge_ps (a.m128, b.m128); } |
294 | #else |
295 | __forceinline Vec3ba gt_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnle_ps(a.m128, b.m128); } |
296 | __forceinline Vec3ba ge_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnlt_ps(a.m128, b.m128); } |
297 | #endif |
298 | |
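  // isvalid()/is_finite() test only the x, y and z lanes; the *4 variants
  // further below also require the fourth lane to pass, which matters when
  // it carries data (see Vec3fx).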
299 | __forceinline bool isvalid ( const Vec3fa& v ) { |
300 | return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE))); |
301 | } |
302 | |
303 | __forceinline bool is_finite ( const Vec3fa& a ) { |
304 | return all(ge_mask(a,Vec3fa(-FLT_MAX)) & le_mask(a,Vec3fa(+FLT_MAX))); |
305 | } |
306 | |
307 | __forceinline bool isvalid4 ( const Vec3fa& v ) { |
308 | return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE))); |
309 | } |
310 | |
311 | __forceinline bool is_finite4 ( const Vec3fa& a ) { |
312 | return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX))); |
313 | } |
314 | |
315 | //////////////////////////////////////////////////////////////////////////////// |
316 | /// Euclidean Space Operators |
317 | //////////////////////////////////////////////////////////////////////////////// |
318 | |
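  // The 0x7F immediate selects x, y and z as inputs to the dot product
  // (upper nibble) and broadcasts the sum to all four lanes (lower nibble),
  // so a stale value in the padding lane cannot leak into the result.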
319 | #if defined(__SSE4_1__) |
320 | __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) { |
321 | return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F)); |
322 | } |
323 | #else |
324 | __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) { |
325 | return reduce_add(a*b); |
326 | } |
327 | #endif |
328 | |
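  // Shuffle-based cross product: with the rotation s = <1,2,0,3>,
  // cross(a,b) = s(a*s(b) - s(a)*b), which needs three shuffles instead of
  // the four of the textbook a.yzx*b.zxy - a.zxy*b.yzx form.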
329 | __forceinline Vec3fa cross ( const Vec3fa& a, const Vec3fa& b ) |
330 | { |
331 | vfloat4 a0 = vfloat4(a.m128); |
332 | vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128)); |
333 | vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128)); |
334 | vfloat4 b1 = vfloat4(b.m128); |
335 | return Vec3fa(shuffle<1,2,0,3>(msub(a0,b0,a1*b1))); |
336 | } |
337 | |
338 | __forceinline float sqr_length ( const Vec3fa& a ) { return dot(a,a); } |
339 | __forceinline float rcp_length ( const Vec3fa& a ) { return rsqrt(dot(a,a)); } |
340 | __forceinline float rcp_length2( const Vec3fa& a ) { return rcp(dot(a,a)); } |
341 | __forceinline float length ( const Vec3fa& a ) { return sqrt(dot(a,a)); } |
342 | __forceinline Vec3fa normalize( const Vec3fa& a ) { return a*rsqrt(dot(a,a)); } |
343 | __forceinline float distance ( const Vec3fa& a, const Vec3fa& b ) { return length(a-b); } |
  __forceinline float halfArea ( const Vec3fa& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); } // = d.x*d.y + d.x*d.z + d.y*d.z, half the surface area of a box with extents d
345 | __forceinline float area ( const Vec3fa& d ) { return 2.0f*halfArea(d); } |
346 | |
347 | __forceinline Vec3fa normalize_safe( const Vec3fa& a ) { |
348 | const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d); |
349 | } |
350 | |
351 | /*! differentiated normalization */ |
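  // d/dt (p/|p|) = (dp*(p.p) - p*(p.dp)) / (p.p)^(3/2)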
352 | __forceinline Vec3fa dnormalize(const Vec3fa& p, const Vec3fa& dp) |
353 | { |
354 | const float pp = dot(p,p); |
355 | const float pdp = dot(p,dp); |
356 | return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp); |
357 | } |
358 | |
359 | //////////////////////////////////////////////////////////////////////////////// |
360 | /// Select |
361 | //////////////////////////////////////////////////////////////////////////////// |
362 | |
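  // A scalar bool is turned into a full SIMD mask: comparing zero with
  // itself for equality yields all ones in every lane.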
363 | __forceinline Vec3fa select( bool s, const Vec3fa& t, const Vec3fa& f ) { |
364 | __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); |
365 | return blendv_ps(f.m128, t.m128, mask); |
366 | } |
367 | |
368 | __forceinline Vec3fa select( const Vec3ba& s, const Vec3fa& t, const Vec3fa& f ) { |
369 | return blendv_ps(f.m128, t.m128, s); |
370 | } |
371 | |
372 | __forceinline Vec3fa lerp(const Vec3fa& v0, const Vec3fa& v1, const float t) { |
373 | return madd(1.0f-t,v0,t*v1); |
374 | } |
375 | |
376 | __forceinline int maxDim ( const Vec3fa& a ) |
377 | { |
378 | const Vec3fa b = abs(a); |
379 | if (b.x > b.y) { |
380 | if (b.x > b.z) return 0; else return 2; |
381 | } else { |
382 | if (b.y > b.z) return 1; else return 2; |
383 | } |
384 | } |
385 | |
386 | //////////////////////////////////////////////////////////////////////////////// |
387 | /// Rounding Functions |
388 | //////////////////////////////////////////////////////////////////////////////// |
389 | |
390 | #if defined(__aarch64__) |
391 | __forceinline Vec3fa floor(const Vec3fa& a) { return vrndmq_f32(a.m128); } |
392 | __forceinline Vec3fa ceil (const Vec3fa& a) { return vrndpq_f32(a.m128); } |
393 | __forceinline Vec3fa trunc(const Vec3fa& a) { return vrndq_f32(a.m128); } |
394 | #elif defined (__SSE4_1__) |
  __forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_ZERO); } // round toward zero, consistent with the truncf() fallback
396 | __forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); } |
397 | __forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); } |
398 | #else |
399 | __forceinline Vec3fa trunc( const Vec3fa& a ) { return Vec3fa(truncf(a.x),truncf(a.y),truncf(a.z)); } |
400 | __forceinline Vec3fa floor( const Vec3fa& a ) { return Vec3fa(floorf(a.x),floorf(a.y),floorf(a.z)); } |
401 | __forceinline Vec3fa ceil ( const Vec3fa& a ) { return Vec3fa(ceilf (a.x),ceilf (a.y),ceilf (a.z)); } |
402 | #endif |
403 | |
404 | //////////////////////////////////////////////////////////////////////////////// |
405 | /// Output Operators |
406 | //////////////////////////////////////////////////////////////////////////////// |
407 | |
408 | __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fa& a) { |
409 | return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")" ; |
410 | } |
411 | |
412 | typedef Vec3fa Vec3fa_t; |
413 | |
414 | |
415 | //////////////////////////////////////////////////////////////////////////////// |
416 | /// SSE Vec3fx Type |
417 | //////////////////////////////////////////////////////////////////////////////// |
418 | |
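  /*! Vec3fx is layout-compatible with Vec3fa, but additionally exposes the
   *  fourth SSE lane, aliased below as int a, unsigned u or float w. */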
419 | struct __aligned(16) Vec3fx |
420 | { |
421 | ALIGNED_STRUCT_(16); |
422 | |
423 | typedef float Scalar; |
424 | enum { N = 3 }; |
425 | union { |
426 | __m128 m128; |
427 | struct { float x,y,z; union { int a; unsigned u; float w; }; }; |
428 | }; |
429 | |
430 | //////////////////////////////////////////////////////////////////////////////// |
431 | /// Constructors, Assignment & Cast Operators |
432 | //////////////////////////////////////////////////////////////////////////////// |
433 | |
434 | __forceinline Vec3fx( ) {} |
435 | __forceinline Vec3fx( const __m128 a ) : m128(a) {} |
436 | |
437 | __forceinline explicit Vec3fx(const Vec3fa& v) : m128(v.m128) {} |
438 | __forceinline operator Vec3fa () const { return Vec3fa(m128); } |
439 | |
440 | __forceinline explicit Vec3fx ( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); } |
441 | //__forceinline Vec3fx& operator =( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; } |
442 | |
443 | __forceinline Vec3fx ( const Vec3fx& other ) { m128 = other.m128; } |
444 | |
445 | __forceinline Vec3fx& operator =( const Vec3fx& other ) { m128 = other.m128; return *this; } |
446 | |
447 | __forceinline explicit Vec3fx( const float a ) : m128(_mm_set1_ps(a)) {} |
448 | __forceinline Vec3fx( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {} |
449 | |
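    // The following constructors copy x, y, z from a Vec3fa and additionally
    // initialize the fourth lane as an int, an unsigned or a float.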
450 | __forceinline Vec3fx( const Vec3fa& other, const int a1) { m128 = other.m128; a = a1; } |
451 | __forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; } |
452 | __forceinline Vec3fx( const Vec3fa& other, const float w1) { |
453 | #if defined (__aarch64__) |
454 | m128 = other.m128; m128[3] = w1; |
455 | #elif defined (__SSE4_1__) |
456 | m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4); |
457 | #else |
458 | const vint4 mask(-1,-1,-1,0); |
459 | m128 = select(vboolf4(_mm_castsi128_ps(mask)),vfloat4(other.m128),vfloat4(w1)); |
460 | #endif |
461 | } |
462 | //__forceinline Vec3fx( const float x, const float y, const float z, const int a) : x(x), y(y), z(z), a(a) {} // not working properly! |
463 | //__forceinline Vec3fx( const float x, const float y, const float z, const unsigned a) : x(x), y(y), z(z), u(a) {} // not working properly! |
464 | __forceinline Vec3fx( const float x, const float y, const float z, const float w) : m128(_mm_set_ps(w, z, y, x)) {} |
465 | |
466 | //__forceinline explicit Vec3fx( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} |
467 | |
468 | __forceinline explicit operator const vfloat4() const { return vfloat4(m128); } |
469 | __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); } |
470 | __forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); } |
471 | __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); } |
472 | |
473 | //__forceinline operator const __m128&() const { return m128; } |
474 | //__forceinline operator __m128&() { return m128; } |
475 | |
476 | //////////////////////////////////////////////////////////////////////////////// |
477 | /// Loads and Stores |
478 | //////////////////////////////////////////////////////////////////////////////// |
479 | |
480 | static __forceinline Vec3fx load( const void* const a ) { |
481 | return Vec3fx(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)))); |
482 | } |
483 | |
484 | static __forceinline Vec3fx loadu( const void* const a ) { |
485 | return Vec3fx(_mm_loadu_ps((float*)a)); |
486 | } |
487 | |
488 | static __forceinline void storeu ( void* ptr, const Vec3fx& v ) { |
489 | _mm_storeu_ps((float*)ptr,v.m128); |
490 | } |
491 | |
492 | //////////////////////////////////////////////////////////////////////////////// |
493 | /// Constants |
494 | //////////////////////////////////////////////////////////////////////////////// |
495 | |
496 | __forceinline Vec3fx( ZeroTy ) : m128(_mm_setzero_ps()) {} |
497 | __forceinline Vec3fx( OneTy ) : m128(_mm_set1_ps(1.0f)) {} |
498 | __forceinline Vec3fx( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} |
499 | __forceinline Vec3fx( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} |
500 | |
501 | //////////////////////////////////////////////////////////////////////////////// |
502 | /// Array Access |
503 | //////////////////////////////////////////////////////////////////////////////// |
504 | |
505 | __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } |
506 | __forceinline float& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } |
507 | }; |
508 | |
509 | //////////////////////////////////////////////////////////////////////////////// |
510 | /// Unary Operators |
511 | //////////////////////////////////////////////////////////////////////////////// |
512 | |
513 | __forceinline Vec3fx operator +( const Vec3fx& a ) { return a; } |
514 | __forceinline Vec3fx operator -( const Vec3fx& a ) { |
515 | const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); |
516 | return _mm_xor_ps(a.m128, mask); |
517 | } |
518 | __forceinline Vec3fx abs ( const Vec3fx& a ) { |
519 | const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); |
520 | return _mm_and_ps(a.m128, mask); |
521 | } |
522 | __forceinline Vec3fx sign ( const Vec3fx& a ) { |
523 | return blendv_ps(Vec3fx(one).m128, (-Vec3fx(one)).m128, _mm_cmplt_ps (a.m128,Vec3fx(zero).m128)); |
524 | } |
525 | |
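  // Same Newton-Raphson refinement as the Vec3fa rcp() above, written in the
  // algebraically equivalent form r' = r*(2 - a*r).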
526 | __forceinline Vec3fx rcp ( const Vec3fx& a ) |
527 | { |
528 | #if defined(__AVX512VL__) |
529 | const Vec3fx r = _mm_rcp14_ps(a.m128); |
530 | #else |
531 | const Vec3fx r = _mm_rcp_ps(a.m128); |
532 | #endif |
533 | |
534 | #if defined(__AVX2__) |
535 | const Vec3fx res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f))); |
536 | #else |
537 | const Vec3fx res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128))); |
538 | //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); |
539 | #endif |
540 | |
541 | return res; |
542 | } |
543 | |
544 | __forceinline Vec3fx sqrt ( const Vec3fx& a ) { return _mm_sqrt_ps(a.m128); } |
545 | __forceinline Vec3fx sqr ( const Vec3fx& a ) { return _mm_mul_ps(a.m128,a.m128); } |
546 | |
547 | __forceinline Vec3fx rsqrt( const Vec3fx& a ) |
548 | { |
549 | #if defined(__AVX512VL__) |
550 | __m128 r = _mm_rsqrt14_ps(a.m128); |
551 | #else |
552 | __m128 r = _mm_rsqrt_ps(a.m128); |
553 | #endif |
554 | return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); |
555 | } |
556 | |
557 | __forceinline Vec3fx zero_fix(const Vec3fx& a) { |
558 | return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input))); |
559 | } |
560 | __forceinline Vec3fx rcp_safe(const Vec3fx& a) { |
561 | return rcp(zero_fix(a)); |
562 | } |
563 | __forceinline Vec3fx log ( const Vec3fx& a ) { |
564 | return Vec3fx(logf(a.x),logf(a.y),logf(a.z)); |
565 | } |
566 | |
567 | __forceinline Vec3fx exp ( const Vec3fx& a ) { |
568 | return Vec3fx(expf(a.x),expf(a.y),expf(a.z)); |
569 | } |
570 | |
571 | //////////////////////////////////////////////////////////////////////////////// |
572 | /// Binary Operators |
573 | //////////////////////////////////////////////////////////////////////////////// |
574 | |
575 | __forceinline Vec3fx operator +( const Vec3fx& a, const Vec3fx& b ) { return _mm_add_ps(a.m128, b.m128); } |
576 | __forceinline Vec3fx operator -( const Vec3fx& a, const Vec3fx& b ) { return _mm_sub_ps(a.m128, b.m128); } |
577 | __forceinline Vec3fx operator *( const Vec3fx& a, const Vec3fx& b ) { return _mm_mul_ps(a.m128, b.m128); } |
578 | __forceinline Vec3fx operator *( const Vec3fx& a, const float b ) { return a * Vec3fx(b); } |
579 | __forceinline Vec3fx operator *( const float a, const Vec3fx& b ) { return Vec3fx(a) * b; } |
580 | __forceinline Vec3fx operator /( const Vec3fx& a, const Vec3fx& b ) { return _mm_div_ps(a.m128,b.m128); } |
581 | __forceinline Vec3fx operator /( const Vec3fx& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); } |
582 | __forceinline Vec3fx operator /( const float a, const Vec3fx& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); } |
583 | |
584 | __forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.m128,b.m128); } |
585 | __forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.m128,b.m128); } |
586 | |
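  // As for Vec3fa, mini()/maxi() compare bit patterns as signed integers and
  // are therefore only valid for non-negative inputs.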
587 | #if defined(__SSE4_1__) || defined(__aarch64__) |
588 | __forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) { |
589 | const vint4 ai = _mm_castps_si128(a.m128); |
590 | const vint4 bi = _mm_castps_si128(b.m128); |
591 | const vint4 ci = _mm_min_epi32(ai,bi); |
592 | return _mm_castsi128_ps(ci); |
593 | } |
594 | #endif |
595 | |
596 | #if defined(__SSE4_1__) || defined(__aarch64__) |
597 | __forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) { |
598 | const vint4 ai = _mm_castps_si128(a.m128); |
599 | const vint4 bi = _mm_castps_si128(b.m128); |
600 | const vint4 ci = _mm_max_epi32(ai,bi); |
601 | return _mm_castsi128_ps(ci); |
602 | } |
603 | #endif |
604 | |
605 | __forceinline Vec3fx pow ( const Vec3fx& a, const float& b ) { |
606 | return Vec3fx(powf(a.x,b),powf(a.y,b),powf(a.z,b)); |
607 | } |
608 | |
609 | //////////////////////////////////////////////////////////////////////////////// |
610 | /// Ternary Operators |
611 | //////////////////////////////////////////////////////////////////////////////// |
612 | |
613 | #if defined(__AVX2__) |
614 | __forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); } |
615 | __forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); } |
616 | __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); } |
617 | __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); } |
618 | #else |
619 | __forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b+c; } |
620 | __forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b-c; } |
621 | __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b+c;} |
622 | __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b-c; } |
623 | #endif |
624 | |
625 | __forceinline Vec3fx madd ( const float a, const Vec3fx& b, const Vec3fx& c) { return madd(Vec3fx(a),b,c); } |
626 | __forceinline Vec3fx msub ( const float a, const Vec3fx& b, const Vec3fx& c) { return msub(Vec3fx(a),b,c); } |
627 | __forceinline Vec3fx nmadd ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmadd(Vec3fx(a),b,c); } |
628 | __forceinline Vec3fx nmsub ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmsub(Vec3fx(a),b,c); } |
629 | |
630 | //////////////////////////////////////////////////////////////////////////////// |
631 | /// Assignment Operators |
632 | //////////////////////////////////////////////////////////////////////////////// |
633 | |
634 | __forceinline Vec3fx& operator +=( Vec3fx& a, const Vec3fx& b ) { return a = a + b; } |
635 | __forceinline Vec3fx& operator -=( Vec3fx& a, const Vec3fx& b ) { return a = a - b; } |
636 | __forceinline Vec3fx& operator *=( Vec3fx& a, const Vec3fx& b ) { return a = a * b; } |
637 | __forceinline Vec3fx& operator *=( Vec3fx& a, const float b ) { return a = a * b; } |
638 | __forceinline Vec3fx& operator /=( Vec3fx& a, const Vec3fx& b ) { return a = a / b; } |
639 | __forceinline Vec3fx& operator /=( Vec3fx& a, const float b ) { return a = a / b; } |
640 | |
641 | //////////////////////////////////////////////////////////////////////////////// |
642 | /// Reductions |
643 | //////////////////////////////////////////////////////////////////////////////// |
644 | |
645 | __forceinline float reduce_add(const Vec3fx& v) { |
646 | const vfloat4 a(v.m128); |
647 | const vfloat4 b = shuffle<1>(a); |
648 | const vfloat4 c = shuffle<2>(a); |
649 | return _mm_cvtss_f32(a+b+c); |
650 | } |
651 | |
652 | __forceinline float reduce_mul(const Vec3fx& v) { return v.x*v.y*v.z; } |
653 | __forceinline float reduce_min(const Vec3fx& v) { return min(v.x,v.y,v.z); } |
654 | __forceinline float reduce_max(const Vec3fx& v) { return max(v.x,v.y,v.z); } |
655 | |
656 | //////////////////////////////////////////////////////////////////////////////// |
657 | /// Comparison Operators |
658 | //////////////////////////////////////////////////////////////////////////////// |
659 | |
660 | __forceinline bool operator ==( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; } |
661 | __forceinline bool operator !=( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; } |
662 | |
663 | __forceinline Vec3ba eq_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpeq_ps (a.m128, b.m128); } |
664 | __forceinline Vec3ba neq_mask(const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpneq_ps(a.m128, b.m128); } |
665 | __forceinline Vec3ba lt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmplt_ps (a.m128, b.m128); } |
666 | __forceinline Vec3ba le_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmple_ps (a.m128, b.m128); } |
667 | __forceinline Vec3ba gt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnle_ps(a.m128, b.m128); } |
668 | __forceinline Vec3ba ge_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); } |
669 | |
670 | __forceinline bool isvalid ( const Vec3fx& v ) { |
671 | return all(gt_mask(v,Vec3fx(-FLT_LARGE)) & lt_mask(v,Vec3fx(+FLT_LARGE))); |
672 | } |
673 | |
674 | __forceinline bool is_finite ( const Vec3fx& a ) { |
675 | return all(ge_mask(a,Vec3fx(-FLT_MAX)) & le_mask(a,Vec3fx(+FLT_MAX))); |
676 | } |
677 | |
678 | __forceinline bool isvalid4 ( const Vec3fx& v ) { |
679 | return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE))); |
680 | } |
681 | |
682 | __forceinline bool is_finite4 ( const Vec3fx& a ) { |
683 | return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX))); |
684 | } |
685 | |
686 | //////////////////////////////////////////////////////////////////////////////// |
687 | /// Euclidean Space Operators |
688 | //////////////////////////////////////////////////////////////////////////////// |
689 | |
690 | #if defined(__SSE4_1__) |
691 | __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) { |
692 | return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F)); |
693 | } |
694 | #else |
695 | __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) { |
696 | return reduce_add(a*b); |
697 | } |
698 | #endif |
699 | |
700 | __forceinline Vec3fx cross ( const Vec3fx& a, const Vec3fx& b ) |
701 | { |
702 | vfloat4 a0 = vfloat4(a.m128); |
703 | vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128)); |
704 | vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128)); |
705 | vfloat4 b1 = vfloat4(b.m128); |
706 | return Vec3fx(shuffle<1,2,0,3>(msub(a0,b0,a1*b1))); |
707 | } |
708 | |
709 | __forceinline float sqr_length ( const Vec3fx& a ) { return dot(a,a); } |
710 | __forceinline float rcp_length ( const Vec3fx& a ) { return rsqrt(dot(a,a)); } |
711 | __forceinline float rcp_length2( const Vec3fx& a ) { return rcp(dot(a,a)); } |
712 | __forceinline float length ( const Vec3fx& a ) { return sqrt(dot(a,a)); } |
713 | __forceinline Vec3fx normalize( const Vec3fx& a ) { return a*rsqrt(dot(a,a)); } |
714 | __forceinline float distance ( const Vec3fx& a, const Vec3fx& b ) { return length(a-b); } |
715 | __forceinline float halfArea ( const Vec3fx& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); } |
716 | __forceinline float area ( const Vec3fx& d ) { return 2.0f*halfArea(d); } |
717 | |
718 | __forceinline Vec3fx normalize_safe( const Vec3fx& a ) { |
719 | const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d); |
720 | } |
721 | |
722 | /*! differentiated normalization */ |
723 | __forceinline Vec3fx dnormalize(const Vec3fx& p, const Vec3fx& dp) |
724 | { |
725 | const float pp = dot(p,p); |
726 | const float pdp = dot(p,dp); |
727 | return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp); |
728 | } |
729 | |
730 | //////////////////////////////////////////////////////////////////////////////// |
731 | /// Select |
732 | //////////////////////////////////////////////////////////////////////////////// |
733 | |
734 | __forceinline Vec3fx select( bool s, const Vec3fx& t, const Vec3fx& f ) { |
735 | __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); |
736 | return blendv_ps(f.m128, t.m128, mask); |
737 | } |
738 | |
739 | __forceinline Vec3fx select( const Vec3ba& s, const Vec3fx& t, const Vec3fx& f ) { |
740 | return blendv_ps(f.m128, t.m128, s); |
741 | } |
742 | |
743 | __forceinline Vec3fx lerp(const Vec3fx& v0, const Vec3fx& v1, const float t) { |
744 | return madd(1.0f-t,v0,t*v1); |
745 | } |
746 | |
747 | __forceinline int maxDim ( const Vec3fx& a ) |
748 | { |
749 | const Vec3fx b = abs(a); |
750 | if (b.x > b.y) { |
751 | if (b.x > b.z) return 0; else return 2; |
752 | } else { |
753 | if (b.y > b.z) return 1; else return 2; |
754 | } |
755 | } |
756 | |
757 | //////////////////////////////////////////////////////////////////////////////// |
758 | /// Rounding Functions |
759 | //////////////////////////////////////////////////////////////////////////////// |
760 | |
761 | #if defined(__aarch64__) |
762 | __forceinline Vec3fx trunc(const Vec3fx& a) { return vrndq_f32(a.m128); } |
763 | __forceinline Vec3fx floor(const Vec3fx& a) { return vrndmq_f32(a.m128); } |
764 | __forceinline Vec3fx ceil (const Vec3fx& a) { return vrndpq_f32(a.m128); } |
765 | #elif defined (__SSE4_1__) |
  __forceinline Vec3fx trunc( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_ZERO); } // round toward zero, consistent with the truncf() fallback
767 | __forceinline Vec3fx floor( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); } |
768 | __forceinline Vec3fx ceil ( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); } |
769 | #else |
770 | __forceinline Vec3fx trunc( const Vec3fx& a ) { return Vec3fx(truncf(a.x),truncf(a.y),truncf(a.z)); } |
771 | __forceinline Vec3fx floor( const Vec3fx& a ) { return Vec3fx(floorf(a.x),floorf(a.y),floorf(a.z)); } |
772 | __forceinline Vec3fx ceil ( const Vec3fx& a ) { return Vec3fx(ceilf (a.x),ceilf (a.y),ceilf (a.z)); } |
773 | #endif |
774 | |
775 | //////////////////////////////////////////////////////////////////////////////// |
776 | /// Output Operators |
777 | //////////////////////////////////////////////////////////////////////////////// |
778 | |
779 | __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fx& a) { |
780 | return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")" ; |
781 | } |
782 | |
783 | |
784 | typedef Vec3fx Vec3ff; |
785 | } |
786 | |