1 | // This file is part of Eigen, a lightweight C++ template library |
2 | // for linear algebra. |
3 | // |
4 | // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr> |
5 | // |
6 | // This Source Code Form is subject to the terms of the Mozilla |
7 | // Public License v. 2.0. If a copy of the MPL was not distributed |
8 | // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. |
9 | |
10 | #ifndef EIGEN_PACKET_MATH_SSE_H |
11 | #define EIGEN_PACKET_MATH_SSE_H |
12 | |
13 | namespace Eigen { |
14 | |
15 | namespace internal { |
16 | |
17 | #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD |
18 | #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 |
19 | #endif |
20 | |
21 | #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS |
22 | #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) |
23 | #endif |
24 | |
25 | #ifdef __FMA__ |
26 | #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD |
27 | #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 |
28 | #endif |
29 | #endif |
30 | |
31 | #if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004)) || EIGEN_OS_QNX |
// With GCC's default ABI version, __m128 and __m256 are the same type, and therefore we cannot
// have overloads for both types without a linking error.
// One solution is to increase the ABI version using -fabi-version=4 (or greater).
// Otherwise, we work around this inconvenience by wrapping the 128-bit types in the following
// helper structure:
37 | template<typename T> |
38 | struct eigen_packet_wrapper |
39 | { |
40 | EIGEN_ALWAYS_INLINE operator T&() { return m_val; } |
41 | EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; } |
42 | EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {} |
43 | EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {} |
44 | EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) { |
45 | m_val = v; |
46 | return *this; |
47 | } |
48 | |
49 | T m_val; |
50 | }; |
51 | typedef eigen_packet_wrapper<__m128> Packet4f; |
52 | typedef eigen_packet_wrapper<__m128i> Packet4i; |
53 | typedef eigen_packet_wrapper<__m128d> Packet2d; |
54 | #else |
55 | typedef __m128 Packet4f; |
56 | typedef __m128i Packet4i; |
57 | typedef __m128d Packet2d; |
58 | #endif |
59 | |
60 | template<> struct is_arithmetic<__m128> { enum { value = true }; }; |
61 | template<> struct is_arithmetic<__m128i> { enum { value = true }; }; |
62 | template<> struct is_arithmetic<__m128d> { enum { value = true }; }; |
63 | |
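// The swizzle1 macros permute the lanes of a single register (e.g.
// vec4f_swizzle1(v,3,2,1,0) reverses the four floats of v), while the swizzle2
// macros take their first two lanes from 'a' and their last two from 'b':
// vec4f_swizzle2(a,b,p,q,r,s) == {a[p], a[q], b[r], b[s]}.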
64 | #define vec4f_swizzle1(v,p,q,r,s) \ |
65 | (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p))))) |
66 | |
67 | #define vec4i_swizzle1(v,p,q,r,s) \ |
68 | (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p)))) |
69 | |
70 | #define vec2d_swizzle1(v,p,q) \ |
71 | (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2))))) |
72 | |
73 | #define vec4f_swizzle2(a,b,p,q,r,s) \ |
74 | (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p)))) |
75 | |
76 | #define vec4i_swizzle2(a,b,p,q,r,s) \ |
77 | (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p)))))) |
78 | |
79 | #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ |
80 | const Packet4f p4f_##NAME = pset1<Packet4f>(X) |
81 | |
82 | #define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \ |
83 | const Packet2d p2d_##NAME = pset1<Packet2d>(X) |
84 | |
85 | #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ |
86 | const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1<Packet4i>(X)) |
87 | |
88 | #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ |
89 | const Packet4i p4i_##NAME = pset1<Packet4i>(X) |
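// For instance, _EIGEN_DECLARE_CONST_Packet4i(ONE, 1) would declare a local
// constant 'p4i_ONE' equal to {1,1,1,1} (the name 'ONE' is just illustrative).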
90 | |
91 | |
92 | // Use the packet_traits defined in AVX/PacketMath.h instead if we're going |
93 | // to leverage AVX instructions. |
94 | #ifndef EIGEN_VECTORIZE_AVX |
95 | template<> struct packet_traits<float> : default_packet_traits |
96 | { |
97 | typedef Packet4f type; |
98 | typedef Packet4f half; |
99 | enum { |
100 | Vectorizable = 1, |
101 | AlignedOnScalar = 1, |
102 | size=4, |
103 | HasHalfPacket = 0, |
104 | |
105 | HasDiv = 1, |
106 | HasSin = EIGEN_FAST_MATH, |
107 | HasCos = EIGEN_FAST_MATH, |
108 | HasLog = 1, |
109 | HasExp = 1, |
110 | HasSqrt = 1, |
111 | HasRsqrt = 1, |
112 | HasTanh = EIGEN_FAST_MATH, |
113 | HasBlend = 1 |
114 | |
115 | #ifdef EIGEN_VECTORIZE_SSE4_1 |
116 | , |
117 | HasRound = 1, |
118 | HasFloor = 1, |
119 | HasCeil = 1 |
120 | #endif |
121 | }; |
122 | }; |
123 | template<> struct packet_traits<double> : default_packet_traits |
124 | { |
125 | typedef Packet2d type; |
126 | typedef Packet2d half; |
127 | enum { |
128 | Vectorizable = 1, |
129 | AlignedOnScalar = 1, |
130 | size=2, |
131 | HasHalfPacket = 0, |
132 | |
133 | HasDiv = 1, |
134 | HasExp = 1, |
135 | HasSqrt = 1, |
136 | HasRsqrt = 1, |
137 | HasBlend = 1 |
138 | |
139 | #ifdef EIGEN_VECTORIZE_SSE4_1 |
140 | , |
141 | HasRound = 1, |
142 | HasFloor = 1, |
143 | HasCeil = 1 |
144 | #endif |
145 | }; |
146 | }; |
147 | #endif |
148 | template<> struct packet_traits<int> : default_packet_traits |
149 | { |
150 | typedef Packet4i type; |
151 | typedef Packet4i half; |
152 | enum { |
153 | Vectorizable = 1, |
154 | AlignedOnScalar = 1, |
155 | size=4, |
156 | |
157 | HasBlend = 1 |
158 | }; |
159 | }; |
160 | |
161 | template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; |
162 | template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; |
163 | template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; |
164 | |
165 | #ifndef EIGEN_VECTORIZE_AVX |
166 | template<> struct scalar_div_cost<float,true> { enum { value = 7 }; }; |
167 | template<> struct scalar_div_cost<double,true> { enum { value = 8 }; }; |
168 | #endif |
169 | |
170 | #if EIGEN_COMP_MSVC==1500 |
// Workaround for an MSVC 9 internal compiler error.
// TODO: It has been detected with win64 (amd64) builds, so check whether it also happens in 32-bit+SSE mode.
// TODO: Check whether a better fix exists, such as adding a pset0() function (it crashed on pset1(0)).
174 | template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps(from,from,from,from); } |
175 | template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); } |
176 | template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set_epi32(from,from,from,from); } |
177 | #else |
178 | template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps1(from); } |
179 | template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); } |
180 | template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); } |
181 | #endif |
182 | |
// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
// However, using intrinsics for pset1 makes gcc generate poor code in some cases (see bug 203).
// Using inline assembly is not an option either, because then gcc fails to properly reorder the instructions.
186 | // Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply. |
187 | // Also note that with AVX, we want it to generate a vbroadcastss. |
188 | #if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__) |
189 | template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) { |
190 | return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0); |
191 | } |
192 | #endif |
193 | |
194 | template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); } |
195 | template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); } |
196 | template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); } |
197 | |
198 | template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); } |
199 | template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); } |
200 | template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); } |
201 | |
202 | template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); } |
203 | template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); } |
204 | template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); } |
205 | |
206 | template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) |
207 | { |
208 | const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000)); |
209 | return _mm_xor_ps(a,mask); |
210 | } |
211 | template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) |
212 | { |
213 | const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000)); |
214 | return _mm_xor_pd(a,mask); |
215 | } |
216 | template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) |
217 | { |
218 | return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a); |
219 | } |
220 | |
221 | template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } |
222 | template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } |
223 | template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } |
224 | |
225 | template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_mul_ps(a,b); } |
226 | template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); } |
227 | template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) |
228 | { |
229 | #ifdef EIGEN_VECTORIZE_SSE4_1 |
230 | return _mm_mullo_epi32(a,b); |
231 | #else |
232 | // this version is slightly faster than 4 scalar products |
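  // _mm_mul_epu32 multiplies the even-indexed 32-bit lanes into 64-bit products;
  // the odd lanes are handled by first swapping them into even positions, and the
  // final swizzles gather the low 32 bits of each product back into the original
  // lane order (the low 32 bits are identical for signed and unsigned multiplies).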
233 | return vec4i_swizzle1( |
234 | vec4i_swizzle2( |
235 | _mm_mul_epu32(a,b), |
236 | _mm_mul_epu32(vec4i_swizzle1(a,1,0,3,2), |
237 | vec4i_swizzle1(b,1,0,3,2)), |
238 | 0,2,0,2), |
239 | 0,2,1,3); |
240 | #endif |
241 | } |
242 | |
243 | template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); } |
244 | template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); } |
245 | |
// For some odd reason, pmadd has to be overloaded for packets of integers.
247 | template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); } |
248 | #ifdef __FMA__ |
249 | template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); } |
250 | template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); } |
251 | #endif |
252 | |
253 | template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); } |
254 | template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); } |
255 | template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) |
256 | { |
257 | #ifdef EIGEN_VECTORIZE_SSE4_1 |
258 | return _mm_min_epi32(a,b); |
259 | #else |
  // After some benchmarking, this version *is* faster than a scalar implementation.
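  // Branchless select: (mask & a) | (~mask & b) keeps a where a<b and b elsewhere.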
261 | Packet4i mask = _mm_cmplt_epi32(a,b); |
262 | return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b)); |
263 | #endif |
264 | } |
265 | |
266 | template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); } |
267 | template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); } |
268 | template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) |
269 | { |
270 | #ifdef EIGEN_VECTORIZE_SSE4_1 |
271 | return _mm_max_epi32(a,b); |
272 | #else |
  // After some benchmarking, this version *is* faster than a scalar implementation.
274 | Packet4i mask = _mm_cmpgt_epi32(a,b); |
275 | return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b)); |
276 | #endif |
277 | } |
278 | |
279 | #ifdef EIGEN_VECTORIZE_SSE4_1 |
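// Rounding mode 0 is _MM_FROUND_TO_NEAREST_INT, i.e. round to nearest with ties to even.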
280 | template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, 0); } |
281 | template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, 0); } |
282 | |
283 | template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return _mm_ceil_ps(a); } |
284 | template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return _mm_ceil_pd(a); } |
285 | |
286 | template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return _mm_floor_ps(a); } |
287 | template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return _mm_floor_pd(a); } |
288 | #endif |
289 | |
290 | template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } |
291 | template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); } |
292 | template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); } |
293 | |
294 | template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); } |
295 | template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); } |
296 | template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); } |
297 | |
298 | template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); } |
299 | template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); } |
300 | template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); } |
301 | |
302 | template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); } |
303 | template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); } |
304 | template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); } |
305 | |
306 | template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); } |
307 | template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } |
308 | template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); } |
309 | |
310 | #if EIGEN_COMP_MSVC |
311 | template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { |
312 | EIGEN_DEBUG_UNALIGNED_LOAD |
313 | #if (EIGEN_COMP_MSVC==1600) |
  // NOTE Some versions of MSVC10 generate bad code when using _mm_loadu_ps
  // (i.e., they do not emit an unaligned load!).
316 | __m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from)); |
317 | res = _mm_loadh_pi(res, (const __m64*)(from+2)); |
318 | return res; |
319 | #else |
320 | return _mm_loadu_ps(from); |
321 | #endif |
322 | } |
323 | #else |
// NOTE: with the code below, the MSVC compiler crashes!
325 | |
326 | template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) |
327 | { |
328 | EIGEN_DEBUG_UNALIGNED_LOAD |
329 | return _mm_loadu_ps(from); |
330 | } |
331 | #endif |
332 | |
333 | template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) |
334 | { |
335 | EIGEN_DEBUG_UNALIGNED_LOAD |
336 | return _mm_loadu_pd(from); |
337 | } |
338 | template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) |
339 | { |
340 | EIGEN_DEBUG_UNALIGNED_LOAD |
341 | return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); |
342 | } |
343 | |
344 | |
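// ploaddup loads the first size/2 scalars and duplicates each of them, e.g.
// ploaddup<Packet4f>(from) yields {from[0], from[0], from[1], from[1]}.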
345 | template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) |
346 | { |
347 | return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1); |
348 | } |
349 | template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) |
350 | { return pset1<Packet2d>(from[0]); } |
351 | template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) |
352 | { |
353 | Packet4i tmp; |
354 | tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from)); |
355 | return vec4i_swizzle1(tmp, 0, 0, 1, 1); |
356 | } |
357 | |
358 | template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); } |
359 | template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); } |
360 | template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); } |
361 | |
362 | template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); } |
363 | template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); } |
364 | template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); } |
365 | |
366 | template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) |
367 | { |
368 | return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); |
369 | } |
370 | template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) |
371 | { |
372 | return _mm_set_pd(from[1*stride], from[0*stride]); |
373 | } |
374 | template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride) |
375 | { |
376 | return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); |
377 | } |
378 | |
379 | template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) |
380 | { |
381 | to[stride*0] = _mm_cvtss_f32(from); |
382 | to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1)); |
383 | to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2)); |
384 | to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3)); |
385 | } |
386 | template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) |
387 | { |
388 | to[stride*0] = _mm_cvtsd_f64(from); |
389 | to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1)); |
390 | } |
391 | template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) |
392 | { |
393 | to[stride*0] = _mm_cvtsi128_si32(from); |
394 | to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1)); |
395 | to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)); |
396 | to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)); |
397 | } |
398 | |
399 | // some compilers might be tempted to perform multiple moves instead of using a vector path. |
400 | template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a) |
401 | { |
402 | Packet4f pa = _mm_set_ss(a); |
403 | pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0))); |
404 | } |
405 | // some compilers might be tempted to perform multiple moves instead of using a vector path. |
406 | template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a) |
407 | { |
408 | Packet2d pa = _mm_set_sd(a); |
409 | pstore(to, Packet2d(vec2d_swizzle1(pa,0,0))); |
410 | } |
411 | |
412 | #if EIGEN_COMP_PGI |
413 | typedef const void * SsePrefetchPtrType; |
414 | #else |
415 | typedef const char * SsePrefetchPtrType; |
416 | #endif |
417 | |
418 | #ifndef EIGEN_VECTORIZE_AVX |
419 | template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } |
420 | template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } |
421 | template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } |
422 | #endif |
423 | |
424 | #if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64 |
// The temporary variable fixes an internal compilation error in VS <= 2008 and a wrong-result bug in VS 2010.
// Direct access to the struct members fixed bug #62.
427 | template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; } |
428 | template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; } |
429 | template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; } |
430 | #elif EIGEN_COMP_MSVC_STRICT |
// The temporary variable fixes an internal compilation error in VS <= 2008 and a wrong-result bug in VS 2010.
432 | template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; } |
433 | template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; } |
434 | template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; } |
435 | #else |
436 | template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); } |
437 | template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); } |
438 | template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); } |
439 | #endif |
440 | |
441 | template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) |
442 | { return _mm_shuffle_ps(a,a,0x1B); } |
443 | template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) |
444 | { return _mm_shuffle_pd(a,a,0x1); } |
445 | template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) |
446 | { return _mm_shuffle_epi32(a,0x1B); } |
447 | |
448 | template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) |
449 | { |
450 | const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF)); |
451 | return _mm_and_ps(a,mask); |
452 | } |
453 | template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) |
454 | { |
455 | const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF)); |
456 | return _mm_and_pd(a,mask); |
457 | } |
458 | template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) |
459 | { |
460 | #ifdef EIGEN_VECTORIZE_SSSE3 |
461 | return _mm_abs_epi32(a); |
462 | #else |
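  // Branchless abs: aux is 0 for non-negative lanes and -1 for negative ones,
  // so (a ^ aux) - aux negates exactly the negative lanes.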
463 | Packet4i aux = _mm_srai_epi32(a,31); |
464 | return _mm_sub_epi32(_mm_xor_si128(a,aux),aux); |
465 | #endif |
466 | } |
467 | |
468 | // with AVX, the default implementations based on pload1 are faster |
469 | #ifndef __AVX__ |
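// pbroadcast4 reads four consecutive scalars and broadcasts each of them into its
// own packet: a0 = {a[0],...}, a1 = {a[1],...}, a2 = {a[2],...}, a3 = {a[3],...}.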
470 | template<> EIGEN_STRONG_INLINE void |
471 | pbroadcast4<Packet4f>(const float *a, |
472 | Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) |
473 | { |
474 | a3 = pload<Packet4f>(a); |
475 | a0 = vec4f_swizzle1(a3, 0,0,0,0); |
476 | a1 = vec4f_swizzle1(a3, 1,1,1,1); |
477 | a2 = vec4f_swizzle1(a3, 2,2,2,2); |
478 | a3 = vec4f_swizzle1(a3, 3,3,3,3); |
479 | } |
480 | template<> EIGEN_STRONG_INLINE void |
481 | pbroadcast4<Packet2d>(const double *a, |
482 | Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) |
483 | { |
484 | #ifdef EIGEN_VECTORIZE_SSE3 |
485 | a0 = _mm_loaddup_pd(a+0); |
486 | a1 = _mm_loaddup_pd(a+1); |
487 | a2 = _mm_loaddup_pd(a+2); |
488 | a3 = _mm_loaddup_pd(a+3); |
489 | #else |
490 | a1 = pload<Packet2d>(a); |
491 | a0 = vec2d_swizzle1(a1, 0,0); |
492 | a1 = vec2d_swizzle1(a1, 1,1); |
493 | a3 = pload<Packet2d>(a+2); |
494 | a2 = vec2d_swizzle1(a3, 0,0); |
495 | a3 = vec2d_swizzle1(a3, 1,1); |
496 | #endif |
497 | } |
498 | #endif |
499 | |
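// punpackp broadcasts each lane of vecs[0] into a full packet, i.e. vecs[k] becomes
// {vecs[0][k], ...}; vecs[0] is overwritten last since it is also the source.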
500 | EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs) |
501 | { |
502 | vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55)); |
503 | vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA)); |
504 | vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF)); |
505 | vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00)); |
506 | } |
507 | |
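// preduxp reduces an array of packets at once: lane i of the result is the
// horizontal sum of vecs[i].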
508 | #ifdef EIGEN_VECTORIZE_SSE3 |
509 | template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs) |
510 | { |
511 | return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3])); |
512 | } |
513 | |
514 | template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs) |
515 | { |
516 | return _mm_hadd_pd(vecs[0], vecs[1]); |
517 | } |
518 | |
519 | #else |
520 | template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs) |
521 | { |
522 | Packet4f tmp0, tmp1, tmp2; |
523 | tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]); |
524 | tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]); |
525 | tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]); |
526 | tmp0 = _mm_add_ps(tmp0, tmp1); |
527 | tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]); |
528 | tmp1 = _mm_add_ps(tmp1, tmp2); |
529 | tmp2 = _mm_movehl_ps(tmp1, tmp0); |
530 | tmp0 = _mm_movelh_ps(tmp0, tmp1); |
531 | return _mm_add_ps(tmp0, tmp2); |
532 | } |
533 | |
534 | template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs) |
535 | { |
536 | return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1])); |
537 | } |
538 | #endif // SSE3 |
539 | |
540 | template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) |
541 | { |
  // Disable the SSE3 _mm_hadd_ps path, which is extremely slow on all existing Intel
  // architectures (from Nehalem to Haswell).
544 | // #ifdef EIGEN_VECTORIZE_SSE3 |
545 | // Packet4f tmp = _mm_add_ps(a, vec4f_swizzle1(a,2,3,2,3)); |
546 | // return pfirst<Packet4f>(_mm_hadd_ps(tmp, tmp)); |
547 | // #else |
548 | Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a)); |
549 | return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); |
550 | // #endif |
551 | } |
552 | |
553 | template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) |
554 | { |
  // Disable the SSE3 _mm_hadd_pd path, which is extremely slow on all existing Intel
  // architectures (from Nehalem to Haswell).
557 | // #ifdef EIGEN_VECTORIZE_SSE3 |
558 | // return pfirst<Packet2d>(_mm_hadd_pd(a, a)); |
559 | // #else |
560 | return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a,a))); |
561 | // #endif |
562 | } |
563 | |
564 | #ifdef EIGEN_VECTORIZE_SSSE3 |
565 | template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs) |
566 | { |
567 | return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3])); |
568 | } |
569 | template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) |
570 | { |
571 | Packet4i tmp0 = _mm_hadd_epi32(a,a); |
572 | return pfirst<Packet4i>(_mm_hadd_epi32(tmp0,tmp0)); |
573 | } |
574 | #else |
575 | template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) |
576 | { |
577 | Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a)); |
578 | return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)); |
579 | } |
580 | |
581 | template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs) |
582 | { |
583 | Packet4i tmp0, tmp1, tmp2; |
584 | tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]); |
585 | tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]); |
586 | tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]); |
587 | tmp0 = _mm_add_epi32(tmp0, tmp1); |
588 | tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]); |
589 | tmp1 = _mm_add_epi32(tmp1, tmp2); |
590 | tmp2 = _mm_unpacklo_epi64(tmp0, tmp1); |
591 | tmp0 = _mm_unpackhi_epi64(tmp0, tmp1); |
592 | return _mm_add_epi32(tmp0, tmp2); |
593 | } |
594 | #endif |
595 | // Other reduction functions: |
596 | |
597 | // mul |
598 | template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) |
599 | { |
600 | Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a)); |
601 | return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); |
602 | } |
603 | template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) |
604 | { |
605 | return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a,a))); |
606 | } |
607 | template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) |
608 | { |
  // After some experiments, it seems this is the fastest way to implement it
  // for GCC (e.g., reusing pmul is very slow!).
  // TODO: try calling _mm_mul_epu32 directly.
612 | EIGEN_ALIGN16 int aux[4]; |
613 | pstore(aux, a); |
  return (aux[0] * aux[1]) * (aux[2] * aux[3]);
615 | } |
616 | |
617 | // min |
618 | template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) |
619 | { |
620 | Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a)); |
621 | return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); |
622 | } |
623 | template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) |
624 | { |
625 | return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a,a))); |
626 | } |
627 | template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) |
628 | { |
629 | #ifdef EIGEN_VECTORIZE_SSE4_1 |
630 | Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2))); |
631 | return pfirst<Packet4i>(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1))); |
632 | #else |
  // After some experiments, it seems this is the fastest way to implement it
  // for GCC (e.g., it does not like using std::min after the pstore!).
635 | EIGEN_ALIGN16 int aux[4]; |
636 | pstore(aux, a); |
637 | int aux0 = aux[0]<aux[1] ? aux[0] : aux[1]; |
638 | int aux2 = aux[2]<aux[3] ? aux[2] : aux[3]; |
639 | return aux0<aux2 ? aux0 : aux2; |
640 | #endif // EIGEN_VECTORIZE_SSE4_1 |
641 | } |
642 | |
643 | // max |
644 | template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) |
645 | { |
646 | Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a)); |
647 | return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); |
648 | } |
649 | template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) |
650 | { |
651 | return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a,a))); |
652 | } |
653 | template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) |
654 | { |
655 | #ifdef EIGEN_VECTORIZE_SSE4_1 |
656 | Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2))); |
657 | return pfirst<Packet4i>(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1))); |
658 | #else |
  // After some experiments, it seems this is the fastest way to implement it
  // for GCC (e.g., it does not like using std::max after the pstore!).
661 | EIGEN_ALIGN16 int aux[4]; |
662 | pstore(aux, a); |
663 | int aux0 = aux[0]>aux[1] ? aux[0] : aux[1]; |
664 | int aux2 = aux[2]>aux[3] ? aux[2] : aux[3]; |
665 | return aux0>aux2 ? aux0 : aux2; |
666 | #endif // EIGEN_VECTORIZE_SSE4_1 |
667 | } |
668 | |
669 | #if EIGEN_COMP_GNUC |
670 | // template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) |
671 | // { |
672 | // Packet4f res = b; |
673 | // asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c)); |
674 | // return res; |
675 | // } |
676 | // EIGEN_STRONG_INLINE Packet4i _mm_alignr_epi8(const Packet4i& a, const Packet4i& b, const int i) |
677 | // { |
678 | // Packet4i res = a; |
679 | // asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i)); |
680 | // return res; |
681 | // } |
682 | #endif |
683 | |
684 | #ifdef EIGEN_VECTORIZE_SSSE3 |
685 | // SSSE3 versions |
686 | template<int Offset> |
687 | struct palign_impl<Offset,Packet4f> |
688 | { |
689 | static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) |
690 | { |
691 | if (Offset!=0) |
692 | first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4)); |
693 | } |
694 | }; |
695 | |
696 | template<int Offset> |
697 | struct palign_impl<Offset,Packet4i> |
698 | { |
699 | static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) |
700 | { |
701 | if (Offset!=0) |
702 | first = _mm_alignr_epi8(second,first, Offset*4); |
703 | } |
704 | }; |
705 | |
706 | template<int Offset> |
707 | struct palign_impl<Offset,Packet2d> |
708 | { |
709 | static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) |
710 | { |
711 | if (Offset==1) |
712 | first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8)); |
713 | } |
714 | }; |
715 | #else |
716 | // SSE2 versions |
717 | template<int Offset> |
718 | struct palign_impl<Offset,Packet4f> |
719 | { |
720 | static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) |
721 | { |
722 | if (Offset==1) |
723 | { |
724 | first = _mm_move_ss(first,second); |
725 | first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39)); |
726 | } |
727 | else if (Offset==2) |
728 | { |
729 | first = _mm_movehl_ps(first,first); |
730 | first = _mm_movelh_ps(first,second); |
731 | } |
732 | else if (Offset==3) |
733 | { |
734 | first = _mm_move_ss(first,second); |
735 | first = _mm_shuffle_ps(first,second,0x93); |
736 | } |
737 | } |
738 | }; |
739 | |
740 | template<int Offset> |
741 | struct palign_impl<Offset,Packet4i> |
742 | { |
743 | static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) |
744 | { |
745 | if (Offset==1) |
746 | { |
747 | first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second))); |
748 | first = _mm_shuffle_epi32(first,0x39); |
749 | } |
750 | else if (Offset==2) |
751 | { |
752 | first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first))); |
753 | first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second))); |
754 | } |
755 | else if (Offset==3) |
756 | { |
757 | first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second))); |
758 | first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93)); |
759 | } |
760 | } |
761 | }; |
762 | |
763 | template<int Offset> |
764 | struct palign_impl<Offset,Packet2d> |
765 | { |
766 | static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) |
767 | { |
768 | if (Offset==1) |
769 | { |
770 | first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first))); |
771 | first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second))); |
772 | } |
773 | } |
774 | }; |
775 | #endif |
776 | |
777 | EIGEN_DEVICE_FUNC inline void |
778 | ptranspose(PacketBlock<Packet4f,4>& kernel) { |
779 | _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]); |
780 | } |
781 | |
782 | EIGEN_DEVICE_FUNC inline void |
783 | ptranspose(PacketBlock<Packet2d,2>& kernel) { |
784 | __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]); |
785 | kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]); |
786 | kernel.packet[1] = tmp; |
787 | } |
788 | |
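// 4x4 integer transpose in two interleave stages: the 32-bit unpacks pair up
// rows (0,1) and (2,3), and the 64-bit unpacks then assemble the transposed rows.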
789 | EIGEN_DEVICE_FUNC inline void |
790 | ptranspose(PacketBlock<Packet4i,4>& kernel) { |
791 | __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]); |
792 | __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]); |
793 | __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]); |
794 | __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]); |
795 | |
796 | kernel.packet[0] = _mm_unpacklo_epi64(T0, T1); |
797 | kernel.packet[1] = _mm_unpackhi_epi64(T0, T1); |
798 | kernel.packet[2] = _mm_unpacklo_epi64(T2, T3); |
799 | kernel.packet[3] = _mm_unpackhi_epi64(T2, T3); |
800 | } |
801 | |
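// pblend selects lane i from thenPacket where ifPacket.select[i] is non-zero and
// from elsePacket otherwise. The comparison against zero builds a "false" mask,
// hence the then/else order of the blend arguments below.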
802 | template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { |
803 | const __m128i zero = _mm_setzero_si128(); |
804 | const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); |
805 | __m128i false_mask = _mm_cmpeq_epi32(select, zero); |
806 | #ifdef EIGEN_VECTORIZE_SSE4_1 |
807 | return _mm_blendv_epi8(thenPacket, elsePacket, false_mask); |
808 | #else |
809 | return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket)); |
810 | #endif |
811 | } |
812 | template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { |
813 | const __m128 zero = _mm_setzero_ps(); |
814 | const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); |
815 | __m128 false_mask = _mm_cmpeq_ps(select, zero); |
816 | #ifdef EIGEN_VECTORIZE_SSE4_1 |
817 | return _mm_blendv_ps(thenPacket, elsePacket, false_mask); |
818 | #else |
819 | return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket)); |
820 | #endif |
821 | } |
822 | template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { |
823 | const __m128d zero = _mm_setzero_pd(); |
824 | const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]); |
825 | __m128d false_mask = _mm_cmpeq_pd(select, zero); |
826 | #ifdef EIGEN_VECTORIZE_SSE4_1 |
827 | return _mm_blendv_pd(thenPacket, elsePacket, false_mask); |
828 | #else |
829 | return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket)); |
830 | #endif |
831 | } |
832 | |
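// pinsertfirst/pinsertlast return a copy of 'a' with its first/last lane replaced
// by the scalar 'b'.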
833 | template<> EIGEN_STRONG_INLINE Packet4f pinsertfirst(const Packet4f& a, float b) |
834 | { |
835 | #ifdef EIGEN_VECTORIZE_SSE4_1 |
836 | return _mm_blend_ps(a,pset1<Packet4f>(b),1); |
837 | #else |
838 | return _mm_move_ss(a, _mm_load_ss(&b)); |
839 | #endif |
840 | } |
841 | |
842 | template<> EIGEN_STRONG_INLINE Packet2d pinsertfirst(const Packet2d& a, double b) |
843 | { |
844 | #ifdef EIGEN_VECTORIZE_SSE4_1 |
845 | return _mm_blend_pd(a,pset1<Packet2d>(b),1); |
846 | #else |
847 | return _mm_move_sd(a, _mm_load_sd(&b)); |
848 | #endif |
849 | } |
850 | |
851 | template<> EIGEN_STRONG_INLINE Packet4f pinsertlast(const Packet4f& a, float b) |
852 | { |
853 | #ifdef EIGEN_VECTORIZE_SSE4_1 |
854 | return _mm_blend_ps(a,pset1<Packet4f>(b),(1<<3)); |
855 | #else |
856 | const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x0,0x0,0x0,0xFFFFFFFF)); |
857 | return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, pset1<Packet4f>(b))); |
858 | #endif |
859 | } |
860 | |
861 | template<> EIGEN_STRONG_INLINE Packet2d pinsertlast(const Packet2d& a, double b) |
862 | { |
863 | #ifdef EIGEN_VECTORIZE_SSE4_1 |
864 | return _mm_blend_pd(a,pset1<Packet2d>(b),(1<<1)); |
865 | #else |
866 | const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x0,0xFFFFFFFF,0xFFFFFFFF)); |
867 | return _mm_or_pd(_mm_andnot_pd(mask, a), _mm_and_pd(mask, pset1<Packet2d>(b))); |
868 | #endif |
869 | } |
870 | |
871 | // Scalar path for pmadd with FMA to ensure consistency with vectorized path. |
872 | #ifdef __FMA__ |
873 | template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) { |
874 | return ::fmaf(a,b,c); |
875 | } |
876 | template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) { |
877 | return ::fma(a,b,c); |
878 | } |
879 | #endif |
880 | |
881 | } // end namespace internal |
882 | |
883 | } // end namespace Eigen |
884 | |
885 | #if EIGEN_COMP_PGI |
886 | // PGI++ does not define the following intrinsics in C++ mode. |
887 | static inline __m128 _mm_castpd_ps (__m128d x) { return reinterpret_cast<__m128&>(x); } |
888 | static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); } |
889 | static inline __m128d _mm_castps_pd (__m128 x) { return reinterpret_cast<__m128d&>(x); } |
890 | static inline __m128i _mm_castps_si128(__m128 x) { return reinterpret_cast<__m128i&>(x); } |
891 | static inline __m128 _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x); } |
892 | static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); } |
893 | #endif |
894 | |
895 | #endif // EIGEN_PACKET_MATH_SSE_H |
896 | |