// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_PACKET_MATH_SSE_H
#define EIGEN_PACKET_MATH_SSE_H

namespace Eigen {

namespace internal {

#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif

#ifdef __FMA__
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1
#endif
#endif

#if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004)) || EIGEN_OS_QNX
// With GCC's default ABI version, __m128 and __m256 are the same type, and therefore we cannot
// have overloads for both types without a linker error.
// One solution is to increase the ABI version using -fabi-version=4 (or greater).
// Otherwise, we work around this inconvenience by wrapping 128-bit types into the following helper
// structure:
template<typename T>
struct eigen_packet_wrapper
{
  EIGEN_ALWAYS_INLINE operator T&() { return m_val; }
  EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; }
  EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {}
  EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {}
  EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) {
    m_val = v;
    return *this;
  }

  T m_val;
};
typedef eigen_packet_wrapper<__m128>  Packet4f;
typedef eigen_packet_wrapper<__m128i> Packet4i;
typedef eigen_packet_wrapper<__m128d> Packet2d;
#else
typedef __m128  Packet4f;
typedef __m128i Packet4i;
typedef __m128d Packet2d;
#endif
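
// Illustrative note (not part of the original source): with either branch above,
// Packet4f/Packet4i/Packet2d convert implicitly to and from the raw SSE types,
// so intrinsics apply directly. A minimal sketch:
//   Packet4f p = _mm_set1_ps(1.f);  // wrap (or alias) a raw __m128
//   __m128   r = _mm_add_ps(p, p);  // converts back to __m128 transparently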

template<> struct is_arithmetic<__m128>  { enum { value = true }; };
template<> struct is_arithmetic<__m128i> { enum { value = true }; };
template<> struct is_arithmetic<__m128d> { enum { value = true }; };

#define vec4f_swizzle1(v,p,q,r,s) \
  (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p)))))

#define vec4i_swizzle1(v,p,q,r,s) \
  (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p))))

#define vec2d_swizzle1(v,p,q) \
  (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2)))))

#define vec4f_swizzle2(a,b,p,q,r,s) \
  (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p))))

#define vec4i_swizzle2(a,b,p,q,r,s) \
  (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p))))))
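
// Illustrative (not part of the original source): vec4f_swizzle1 selects lanes p,q,r,s
// of a single packet. Assuming v = {v0,v1,v2,v3}:
//   vec4f_swizzle1(v,3,2,1,0)  -> {v3,v2,v1,v0}  (reversal, cf. preverse below)
//   vec4f_swizzle1(v,0,0,0,0)  -> {v0,v0,v0,v0}  (broadcast of lane 0)
// The two-operand swizzle2 variants take lanes p,q from a and lanes r,s from b.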

#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
  const Packet4f p4f_##NAME = pset1<Packet4f>(X)

#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
  const Packet2d p2d_##NAME = pset1<Packet2d>(X)

#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
  const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1<Packet4i>(X))

#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
  const Packet4i p4i_##NAME = pset1<Packet4i>(X)


// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
// to leverage AVX instructions.
#ifndef EIGEN_VECTORIZE_AVX
template<> struct packet_traits<float>  : default_packet_traits
{
  typedef Packet4f type;
  typedef Packet4f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=4,
    HasHalfPacket = 0,

    HasDiv  = 1,
    HasSin  = EIGEN_FAST_MATH,
    HasCos  = EIGEN_FAST_MATH,
    HasLog  = 1,
    HasExp  = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasTanh  = EIGEN_FAST_MATH,
    HasBlend = 1

#ifdef EIGEN_VECTORIZE_SSE4_1
    ,
    HasRound = 1,
    HasFloor = 1,
    HasCeil = 1
#endif
  };
};
template<> struct packet_traits<double> : default_packet_traits
{
  typedef Packet2d type;
  typedef Packet2d half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=2,
    HasHalfPacket = 0,

    HasDiv  = 1,
    HasExp  = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasBlend = 1

#ifdef EIGEN_VECTORIZE_SSE4_1
    ,
    HasRound = 1,
    HasFloor = 1,
    HasCeil = 1
#endif
  };
};
#endif
template<> struct packet_traits<int>    : default_packet_traits
{
  typedef Packet4i type;
  typedef Packet4i half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=4,

    HasBlend = 1
  };
};

template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };

#ifndef EIGEN_VECTORIZE_AVX
template<> struct scalar_div_cost<float,true>  { enum { value = 7 }; };
template<> struct scalar_div_cost<double,true> { enum { value = 8 }; };
#endif

#if EIGEN_COMP_MSVC==1500
// Workaround MSVC 9 internal compiler error.
// TODO: It has been detected with win64 builds (amd64), so let's check whether it also happens in 32bits+SSE mode
// TODO: let's check whether a better fix exists, like adding a pset0() function. (it crashed on pset1(0)).
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return _mm_set_ps(from,from,from,from); }
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from) { return _mm_set_epi32(from,from,from,from); }
#else
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return _mm_set_ps1(from); }
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from) { return _mm_set1_epi32(from); }
#endif

// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
// However, using intrinsics for pset1 makes gcc generate poor code in some cases (see bug 203).
// Using inline assembly is not an option either, because then gcc fails to reorder the instructions properly.
// Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply.
// Also note that with AVX, we want it to generate a vbroadcastss.
#if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__)
template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {
  return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0);
}
#endif

template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }

template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
{
  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
  return _mm_xor_ps(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
{
  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000));
  return _mm_xor_pd(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
{
  return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a);
}
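
// Illustrative (not part of the original source): the float/double paths above negate
// by XOR-ing only the IEEE sign bit, which also flips the sign of +/-0.0:
//   pnegate(pset1<Packet4f>(1.5f))  -> {-1.5f,-1.5f,-1.5f,-1.5f}
// The integer path falls back to 0 - a, since two's-complement negation is not a
// single-bit flip.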

template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }

template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_mul_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_mullo_epi32(a,b);
#else
  // this version is slightly faster than 4 scalar products
  return vec4i_swizzle1(
            vec4i_swizzle2(
              _mm_mul_epu32(a,b),
              _mm_mul_epu32(vec4i_swizzle1(a,1,0,3,2),
                            vec4i_swizzle1(b,1,0,3,2)),
              0,2,0,2),
            0,2,1,3);
#endif
}
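
// Illustrative walk-through of the SSE2 fallback above (not part of the original source),
// with a = {a0,a1,a2,a3} and b = {b0,b1,b2,b3}:
//   _mm_mul_epu32(a,b)                      -> 64-bit products a0*b0 and a2*b2
//   _mm_mul_epu32(swizzled a, swizzled b)   -> 64-bit products a1*b1 and a3*b3
//   vec4i_swizzle2(...,0,2,0,2)             -> gathers the four low 32-bit halves
//   vec4i_swizzle1(...,0,2,1,3)             -> restores lane order, giving
// {a0*b0, a1*b1, a2*b2, a3*b3} modulo 2^32.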

template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }

// for some weird reasons, it has to be overloaded for packets of integers
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
#ifdef __FMA__
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
#endif

template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_min_epi32(a,b);
#else
  // after some benchmarking, this version *is* faster than a scalar implementation
  Packet4i mask = _mm_cmplt_epi32(a,b);
  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
#endif
}
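
// Illustrative (not part of the original source): the SSE2 fallback above is the classic
// branchless select,
//   result = (mask & a) | (~mask & b)
// where mask is all-ones in the lanes where a < b; pmax below uses the same idiom with a > b.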

template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_max_epi32(a,b);
#else
  // after some benchmarking, this version *is* faster than a scalar implementation
  Packet4i mask = _mm_cmpgt_epi32(a,b);
  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
#endif
}

#ifdef EIGEN_VECTORIZE_SSE4_1
template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, 0); }
template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, 0); }

template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return _mm_ceil_ps(a); }
template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return _mm_ceil_pd(a); }

template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return _mm_floor_ps(a); }
template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return _mm_floor_pd(a); }
#endif

template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }

#if EIGEN_COMP_MSVC
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float*  from) {
  EIGEN_DEBUG_UNALIGNED_LOAD
#if (EIGEN_COMP_MSVC==1600)
  // NOTE Some versions of MSVC10 generate bad code when using _mm_loadu_ps
  // (i.e., they do not generate an unaligned load!)
  __m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from));
  res = _mm_loadh_pi(res, (const __m64*)(from+2));
  return res;
#else
  return _mm_loadu_ps(from);
#endif
}
#else
// NOTE: with the code below, MSVC's compiler crashes!

template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm_loadu_ps(from);
}
#endif

template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm_loadu_pd(from);
}
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
}


template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
{
  return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1);
}
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*  from)
{ return pset1<Packet2d>(from[0]); }
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
{
  Packet4i tmp;
  tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
  return vec4i_swizzle1(tmp, 0, 0, 1, 1);
}
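
// Illustrative (not part of the original source): ploaddup reads size/2 scalars and
// duplicates each of them, e.g. for float data {x, y, ...}:
//   ploaddup<Packet4f>(data)  -> {x, x, y, y}
//   ploaddup<Packet2d>(data)  -> {x, x}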

template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }

template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }

template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
{
  return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
{
  return _mm_set_pd(from[1*stride], from[0*stride]);
}
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
{
  return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}

template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
{
  to[stride*0] = _mm_cvtss_f32(from);
  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1));
  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2));
  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
{
  to[stride*0] = _mm_cvtsd_f64(from);
  to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
{
  to[stride*0] = _mm_cvtsi128_si32(from);
  to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
  to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
  to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
}
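
// Illustrative usage of the strided accessors above (not part of the original source;
// 'buf' is a hypothetical buffer):
//   float buf[8] = {0,1,2,3,4,5,6,7};
//   Packet4f p = pgather<float, Packet4f>(buf, 2);  // p = {buf[0],buf[2],buf[4],buf[6]}
//   pscatter<float, Packet4f>(buf, p, 2);           // writes the lanes back with stride 2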

// some compilers might be tempted to perform multiple moves instead of using a vector path.
template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
{
  Packet4f pa = _mm_set_ss(a);
  pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0)));
}
// some compilers might be tempted to perform multiple moves instead of using a vector path.
template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
{
  Packet2d pa = _mm_set_sd(a);
  pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
}

#if EIGEN_COMP_PGI
typedef const void * SsePrefetchPtrType;
#else
typedef const char * SsePrefetchPtrType;
#endif

#ifndef EIGEN_VECTORIZE_AVX
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
#endif

#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
// Direct access to the struct members fixed bug #62.
template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; }
template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
#elif EIGEN_COMP_MSVC_STRICT
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; }
template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
#else
template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); }
template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); }
#endif

template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
{ return _mm_shuffle_ps(a,a,0x1B); }
template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
{ return _mm_shuffle_pd(a,a,0x1); }
template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
{ return _mm_shuffle_epi32(a,0x1B); }

template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
{
  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
  return _mm_and_ps(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a)
{
  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
  return _mm_and_pd(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
{
#ifdef EIGEN_VECTORIZE_SSSE3
  return _mm_abs_epi32(a);
#else
  Packet4i aux = _mm_srai_epi32(a,31);
  return _mm_sub_epi32(_mm_xor_si128(a,aux),aux);
#endif
}
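
// Illustrative (not part of the original source): the SSE2 integer fallback above uses
// the classic sign-extension identity. With aux = a >> 31 (arithmetic shift):
//   aux == 0  for a >= 0, so (a ^ 0) - 0    == a
//   aux == -1 for a < 0,  so (a ^ -1) - (-1) == ~a + 1 == -a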

// with AVX, the default implementations based on pload1 are faster
#ifndef __AVX__
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4f>(const float *a,
                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
{
  a3 = pload<Packet4f>(a);
  a0 = vec4f_swizzle1(a3, 0,0,0,0);
  a1 = vec4f_swizzle1(a3, 1,1,1,1);
  a2 = vec4f_swizzle1(a3, 2,2,2,2);
  a3 = vec4f_swizzle1(a3, 3,3,3,3);
}
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet2d>(const double *a,
                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
{
#ifdef EIGEN_VECTORIZE_SSE3
  a0 = _mm_loaddup_pd(a+0);
  a1 = _mm_loaddup_pd(a+1);
  a2 = _mm_loaddup_pd(a+2);
  a3 = _mm_loaddup_pd(a+3);
#else
  a1 = pload<Packet2d>(a);
  a0 = vec2d_swizzle1(a1, 0,0);
  a1 = vec2d_swizzle1(a1, 1,1);
  a3 = pload<Packet2d>(a+2);
  a2 = vec2d_swizzle1(a3, 0,0);
  a3 = vec2d_swizzle1(a3, 1,1);
#endif
}
#endif

EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
{
  vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
  vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
  vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
  vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
}

#ifdef EIGEN_VECTORIZE_SSE3
template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
{
  return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
}

template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
{
  return _mm_hadd_pd(vecs[0], vecs[1]);
}

#else
template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
{
  Packet4f tmp0, tmp1, tmp2;
  tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]);
  tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]);
  tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]);
  tmp0 = _mm_add_ps(tmp0, tmp1);
  tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]);
  tmp1 = _mm_add_ps(tmp1, tmp2);
  tmp2 = _mm_movehl_ps(tmp1, tmp0);
  tmp0 = _mm_movelh_ps(tmp0, tmp1);
  return _mm_add_ps(tmp0, tmp2);
}

template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
{
  return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1]));
}
#endif  // SSE3

template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{
  // Disable the SSE3 _mm_hadd_ps path, which is extremely slow on all existing Intel architectures
  // (from Nehalem to Haswell)
  // #ifdef EIGEN_VECTORIZE_SSE3
  //   Packet4f tmp = _mm_add_ps(a, vec4f_swizzle1(a,2,3,2,3));
  //   return pfirst<Packet4f>(_mm_hadd_ps(tmp, tmp));
  // #else
  Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
  return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
  // #endif
}

template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
{
  // Disable the SSE3 _mm_hadd_pd path, which is extremely slow on all existing Intel architectures
  // (from Nehalem to Haswell)
  // #ifdef EIGEN_VECTORIZE_SSE3
  //   return pfirst<Packet2d>(_mm_hadd_pd(a, a));
  // #else
  return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
  // #endif
}
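
// Illustrative (not part of the original source): predux is the horizontal sum of all
// lanes, folding the packet in halves, e.g. for a = {a0,a1,a2,a3}:
//   tmp = a + movehl(a)  -> {a0+a2, a1+a3, ...}
//   result = (a0+a2) + (a1+a3)
// so predux(pset1<Packet4f>(1.f)) == 4.f.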

#ifdef EIGEN_VECTORIZE_SSSE3
template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
{
  return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));
}
template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
{
  Packet4i tmp0 = _mm_hadd_epi32(a,a);
  return pfirst<Packet4i>(_mm_hadd_epi32(tmp0,tmp0));
}
#else
template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
{
  Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
  return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
}

template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
{
  Packet4i tmp0, tmp1, tmp2;
  tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
  tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
  tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
  tmp0 = _mm_add_epi32(tmp0, tmp1);
  tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
  tmp1 = _mm_add_epi32(tmp1, tmp2);
  tmp2 = _mm_unpacklo_epi64(tmp0, tmp1);
  tmp0 = _mm_unpackhi_epi64(tmp0, tmp1);
  return _mm_add_epi32(tmp0, tmp2);
}
#endif
// Other reduction functions:

// mul
template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
{
  Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
  return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
{
  return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
}
template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
{
  // after some experiments, it seems this is the fastest way to implement it
  // for GCC (e.g., reusing pmul is very slow!)
  // TODO try to call _mm_mul_epu32 directly
  EIGEN_ALIGN16 int aux[4];
  pstore(aux, a);
  return (aux[0] * aux[1]) * (aux[2] * aux[3]);
}

// min
template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
{
  Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
  return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}
template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
{
  return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
}
template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
  return pfirst<Packet4i>(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
#else
  // after some experiments, it seems this is the fastest way to implement it
  // for GCC (e.g., it does not like using std::min after the pstore!)
  EIGEN_ALIGN16 int aux[4];
  pstore(aux, a);
  int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
  int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
  return aux0<aux2 ? aux0 : aux2;
#endif // EIGEN_VECTORIZE_SSE4_1
}

// max
template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
{
  Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
  return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}
template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
{
  return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
}
template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
  return pfirst<Packet4i>(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
#else
  // after some experiments, it seems this is the fastest way to implement it
  // for GCC (e.g., it does not like using std::max after the pstore!)
  EIGEN_ALIGN16 int aux[4];
  pstore(aux, a);
  int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
  int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
  return aux0>aux2 ? aux0 : aux2;
#endif // EIGEN_VECTORIZE_SSE4_1
}

#if EIGEN_COMP_GNUC
// template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f&  a, const Packet4f&  b, const Packet4f&  c)
// {
//   Packet4f res = b;
//   asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c));
//   return res;
// }
// EIGEN_STRONG_INLINE Packet4i _mm_alignr_epi8(const Packet4i&  a, const Packet4i&  b, const int i)
// {
//   Packet4i res = a;
//   asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i));
//   return res;
// }
#endif

#ifdef EIGEN_VECTORIZE_SSSE3
// SSSE3 versions
template<int Offset>
struct palign_impl<Offset,Packet4f>
{
  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
  {
    if (Offset!=0)
      first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4));
  }
};

template<int Offset>
struct palign_impl<Offset,Packet4i>
{
  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
  {
    if (Offset!=0)
      first = _mm_alignr_epi8(second,first, Offset*4);
  }
};

template<int Offset>
struct palign_impl<Offset,Packet2d>
{
  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
  {
    if (Offset==1)
      first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8));
  }
};
#else
// SSE2 versions
template<int Offset>
struct palign_impl<Offset,Packet4f>
{
  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
  {
    if (Offset==1)
    {
      first = _mm_move_ss(first,second);
      first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39));
    }
    else if (Offset==2)
    {
      first = _mm_movehl_ps(first,first);
      first = _mm_movelh_ps(first,second);
    }
    else if (Offset==3)
    {
      first = _mm_move_ss(first,second);
      first = _mm_shuffle_ps(first,second,0x93);
    }
  }
};

template<int Offset>
struct palign_impl<Offset,Packet4i>
{
  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
  {
    if (Offset==1)
    {
      first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
      first = _mm_shuffle_epi32(first,0x39);
    }
    else if (Offset==2)
    {
      first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first)));
      first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
    }
    else if (Offset==3)
    {
      first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
      first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93));
    }
  }
};

template<int Offset>
struct palign_impl<Offset,Packet2d>
{
  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
  {
    if (Offset==1)
    {
      first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first)));
      first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second)));
    }
  }
};
#endif
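
// Illustrative (not part of the original source): both branches implement the same
// contract: palign_impl<Offset>::run concatenates 'first' and 'second' and shifts the
// window by Offset lanes into 'first'. E.g. with first = {f0,f1,f2,f3},
// second = {s0,s1,s2,s3} and Offset == 1, 'first' becomes {f1, f2, f3, s0}.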

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4f,4>& kernel) {
  _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
}

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet2d,2>& kernel) {
  __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
  kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
  kernel.packet[1] = tmp;
}

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4i,4>& kernel) {
  __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
  __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
  __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
  __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);

  kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);
  kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);
  kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
  kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
}
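
// Illustrative (not part of the original source): ptranspose transposes the packets in
// place, viewing them as the rows of a 4x4 (or 2x2) matrix. Afterwards kernel.packet[i]
// holds what was column i, e.g. rows {0,1,2,3}, {4,5,6,7}, {8,9,10,11}, {12,13,14,15}
// become {0,4,8,12}, {1,5,9,13}, {2,6,10,14}, {3,7,11,15}.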

template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
  __m128i false_mask = _mm_cmpeq_epi32(select, zero);
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_blendv_epi8(thenPacket, elsePacket, false_mask);
#else
  return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket));
#endif
}
template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
  const __m128 zero = _mm_setzero_ps();
  const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
  __m128 false_mask = _mm_cmpeq_ps(select, zero);
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_blendv_ps(thenPacket, elsePacket, false_mask);
#else
  return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket));
#endif
}
template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
  const __m128d zero = _mm_setzero_pd();
  const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]);
  __m128d false_mask = _mm_cmpeq_pd(select, zero);
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_blendv_pd(thenPacket, elsePacket, false_mask);
#else
  return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket));
#endif
}
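
// Illustrative (not part of the original source): lane i of the pblend result takes
// thenPacket where ifPacket.select[i] is non-zero and elsePacket otherwise. E.g. with a
// hypothetical selector {1,0,0,1}:
//   pblend(ifPacket, t, e)  -> {t0, e1, e2, t3}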

template<> EIGEN_STRONG_INLINE Packet4f pinsertfirst(const Packet4f& a, float b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_blend_ps(a,pset1<Packet4f>(b),1);
#else
  return _mm_move_ss(a, _mm_load_ss(&b));
#endif
}

template<> EIGEN_STRONG_INLINE Packet2d pinsertfirst(const Packet2d& a, double b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_blend_pd(a,pset1<Packet2d>(b),1);
#else
  return _mm_move_sd(a, _mm_load_sd(&b));
#endif
}

template<> EIGEN_STRONG_INLINE Packet4f pinsertlast(const Packet4f& a, float b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_blend_ps(a,pset1<Packet4f>(b),(1<<3));
#else
  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x0,0x0,0x0,0xFFFFFFFF));
  return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, pset1<Packet4f>(b)));
#endif
}

template<> EIGEN_STRONG_INLINE Packet2d pinsertlast(const Packet2d& a, double b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_blend_pd(a,pset1<Packet2d>(b),(1<<1));
#else
  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x0,0xFFFFFFFF,0xFFFFFFFF));
  return _mm_or_pd(_mm_andnot_pd(mask, a), _mm_and_pd(mask, pset1<Packet2d>(b)));
#endif
}

// Scalar path for pmadd with FMA to ensure consistency with the vectorized path.
#ifdef __FMA__
template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
  return ::fmaf(a,b,c);
}
template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) {
  return ::fma(a,b,c);
}
#endif

} // end namespace internal

} // end namespace Eigen

#if EIGEN_COMP_PGI
// PGI++ does not define the following intrinsics in C++ mode.
static inline __m128  _mm_castpd_ps   (__m128d x) { return reinterpret_cast<__m128&>(x);  }
static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
static inline __m128d _mm_castps_pd   (__m128  x) { return reinterpret_cast<__m128d&>(x); }
static inline __m128i _mm_castps_si128(__m128  x) { return reinterpret_cast<__m128i&>(x); }
static inline __m128  _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x);  }
static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); }
#endif

#endif // EIGEN_PACKET_MATH_SSE_H