1 | // This file is part of Eigen, a lightweight C++ template library |
2 | // for linear algebra. |
3 | // |
4 | // Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr> |
5 | // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com> |
6 | // |
7 | // This Source Code Form is subject to the terms of the Mozilla |
8 | // Public License v. 2.0. If a copy of the MPL was not distributed |
9 | // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. |
10 | |
11 | #ifndef EIGEN_GENERIC_PACKET_MATH_H |
12 | #define EIGEN_GENERIC_PACKET_MATH_H |
13 | |
14 | namespace Eigen { |
15 | |
16 | namespace internal { |
17 | |
18 | /** \internal |
19 | * \file GenericPacketMath.h |
20 | * |
21 | * Default implementation for types not supported by the vectorization. |
22 | * In practice these functions are provided to make easier the writing |
23 | * of generic vectorized code. |
24 | */ |
25 | |
26 | #ifndef EIGEN_DEBUG_ALIGNED_LOAD |
27 | #define EIGEN_DEBUG_ALIGNED_LOAD |
28 | #endif |
29 | |
30 | #ifndef EIGEN_DEBUG_UNALIGNED_LOAD |
31 | #define EIGEN_DEBUG_UNALIGNED_LOAD |
32 | #endif |
33 | |
34 | #ifndef EIGEN_DEBUG_ALIGNED_STORE |
35 | #define EIGEN_DEBUG_ALIGNED_STORE |
36 | #endif |
37 | |
38 | #ifndef EIGEN_DEBUG_UNALIGNED_STORE |
39 | #define EIGEN_DEBUG_UNALIGNED_STORE |
40 | #endif |
41 | |
42 | struct default_packet_traits |
43 | { |
44 | enum { |
45 | HasHalfPacket = 0, |
46 | |
47 | HasAdd = 1, |
48 | HasSub = 1, |
49 | HasMul = 1, |
50 | HasNegate = 1, |
51 | HasAbs = 1, |
52 | HasArg = 0, |
53 | HasAbs2 = 1, |
54 | HasMin = 1, |
55 | HasMax = 1, |
56 | HasConj = 1, |
57 | HasSetLinear = 1, |
58 | HasBlend = 0, |
59 | |
60 | HasDiv = 0, |
61 | HasSqrt = 0, |
62 | HasRsqrt = 0, |
63 | HasExp = 0, |
64 | HasLog = 0, |
65 | HasLog1p = 0, |
66 | HasLog10 = 0, |
67 | HasPow = 0, |
68 | |
69 | HasSin = 0, |
70 | HasCos = 0, |
71 | HasTan = 0, |
72 | HasASin = 0, |
73 | HasACos = 0, |
74 | HasATan = 0, |
75 | HasSinh = 0, |
76 | HasCosh = 0, |
77 | HasTanh = 0, |
78 | HasLGamma = 0, |
79 | HasDiGamma = 0, |
80 | HasZeta = 0, |
81 | HasPolygamma = 0, |
82 | HasErf = 0, |
83 | HasErfc = 0, |
84 | HasIGamma = 0, |
85 | HasIGammac = 0, |
86 | HasBetaInc = 0, |
87 | |
88 | HasRound = 0, |
89 | HasFloor = 0, |
90 | HasCeil = 0, |
91 | |
92 | HasSign = 0 |
93 | }; |
94 | }; |
95 | |
96 | template<typename T> struct packet_traits : default_packet_traits |
97 | { |
98 | typedef T type; |
99 | typedef T half; |
100 | enum { |
101 | Vectorizable = 0, |
102 | size = 1, |
103 | AlignedOnScalar = 0, |
104 | HasHalfPacket = 0 |
105 | }; |
106 | enum { |
107 | HasAdd = 0, |
108 | HasSub = 0, |
109 | HasMul = 0, |
110 | HasNegate = 0, |
111 | HasAbs = 0, |
112 | HasAbs2 = 0, |
113 | HasMin = 0, |
114 | HasMax = 0, |
115 | HasConj = 0, |
116 | HasSetLinear = 0 |
117 | }; |
118 | }; |
119 | |
120 | template<typename T> struct packet_traits<const T> : packet_traits<T> { }; |
121 | |
122 | template <typename Src, typename Tgt> struct type_casting_traits { |
123 | enum { |
124 | VectorizedCast = 0, |
125 | SrcCoeffRatio = 1, |
126 | TgtCoeffRatio = 1 |
127 | }; |
128 | }; |
129 | |
130 | |
131 | /** \internal \returns static_cast<TgtType>(a) (coeff-wise) */ |
132 | template <typename SrcPacket, typename TgtPacket> |
133 | EIGEN_DEVICE_FUNC inline TgtPacket |
134 | pcast(const SrcPacket& a) { |
135 | return static_cast<TgtPacket>(a); |
136 | } |
137 | template <typename SrcPacket, typename TgtPacket> |
138 | EIGEN_DEVICE_FUNC inline TgtPacket |
139 | pcast(const SrcPacket& a, const SrcPacket& /*b*/) { |
140 | return static_cast<TgtPacket>(a); |
141 | } |
142 | |
143 | template <typename SrcPacket, typename TgtPacket> |
144 | EIGEN_DEVICE_FUNC inline TgtPacket |
145 | pcast(const SrcPacket& a, const SrcPacket& /*b*/, const SrcPacket& /*c*/, const SrcPacket& /*d*/) { |
146 | return static_cast<TgtPacket>(a); |
147 | } |
148 | |
149 | /** \internal \returns a + b (coeff-wise) */ |
150 | template<typename Packet> EIGEN_DEVICE_FUNC inline Packet |
151 | padd(const Packet& a, |
152 | const Packet& b) { return a+b; } |
153 | |
154 | /** \internal \returns a - b (coeff-wise) */ |
155 | template<typename Packet> EIGEN_DEVICE_FUNC inline Packet |
156 | psub(const Packet& a, |
157 | const Packet& b) { return a-b; } |
158 | |
159 | /** \internal \returns -a (coeff-wise) */ |
160 | template<typename Packet> EIGEN_DEVICE_FUNC inline Packet |
161 | pnegate(const Packet& a) { return -a; } |
162 | |
163 | /** \internal \returns conj(a) (coeff-wise) */ |
164 | |
165 | template<typename Packet> EIGEN_DEVICE_FUNC inline Packet |
166 | pconj(const Packet& a) { return numext::conj(a); } |
167 | |
168 | /** \internal \returns a * b (coeff-wise) */ |
169 | template<typename Packet> EIGEN_DEVICE_FUNC inline Packet |
170 | pmul(const Packet& a, |
171 | const Packet& b) { return a*b; } |
172 | |
173 | /** \internal \returns a / b (coeff-wise) */ |
174 | template<typename Packet> EIGEN_DEVICE_FUNC inline Packet |
175 | pdiv(const Packet& a, |
176 | const Packet& b) { return a/b; } |
177 | |
178 | /** \internal \returns the min of \a a and \a b (coeff-wise) */ |
179 | template<typename Packet> EIGEN_DEVICE_FUNC inline Packet |
180 | pmin(const Packet& a, |
181 | const Packet& b) { return numext::mini(a, b); } |
182 | |
183 | /** \internal \returns the max of \a a and \a b (coeff-wise) */ |
184 | template<typename Packet> EIGEN_DEVICE_FUNC inline Packet |
185 | pmax(const Packet& a, |
186 | const Packet& b) { return numext::maxi(a, b); } |
187 | |
188 | /** \internal \returns the absolute value of \a a */ |
189 | template<typename Packet> EIGEN_DEVICE_FUNC inline Packet |
190 | pabs(const Packet& a) { using std::abs; return abs(a); } |
191 | |
192 | /** \internal \returns the phase angle of \a a */ |
193 | template<typename Packet> EIGEN_DEVICE_FUNC inline Packet |
194 | parg(const Packet& a) { using numext::arg; return arg(a); } |
195 | |
196 | /** \internal \returns the bitwise and of \a a and \a b */ |
197 | template<typename Packet> EIGEN_DEVICE_FUNC inline Packet |
198 | pand(const Packet& a, const Packet& b) { return a & b; } |
199 | |
200 | /** \internal \returns the bitwise or of \a a and \a b */ |
201 | template<typename Packet> EIGEN_DEVICE_FUNC inline Packet |
202 | por(const Packet& a, const Packet& b) { return a | b; } |
203 | |
204 | /** \internal \returns the bitwise xor of \a a and \a b */ |
205 | template<typename Packet> EIGEN_DEVICE_FUNC inline Packet |
206 | pxor(const Packet& a, const Packet& b) { return a ^ b; } |
207 | |
208 | /** \internal \returns the bitwise andnot of \a a and \a b */ |
209 | template<typename Packet> EIGEN_DEVICE_FUNC inline Packet |
210 | pandnot(const Packet& a, const Packet& b) { return a & (!b); } |
211 | |
212 | /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */ |
213 | template<typename Packet> EIGEN_DEVICE_FUNC inline Packet |
214 | pload(const typename unpacket_traits<Packet>::type* from) { return *from; } |
215 | |
216 | /** \internal \returns a packet version of \a *from, (un-aligned load) */ |
217 | template<typename Packet> EIGEN_DEVICE_FUNC inline Packet |
218 | ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; } |
219 | |
220 | /** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ |
221 | template<typename Packet> EIGEN_DEVICE_FUNC inline Packet |
222 | pset1(const typename unpacket_traits<Packet>::type& a) { return a; } |
223 | |
224 | /** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */ |
225 | template<typename Packet> EIGEN_DEVICE_FUNC inline Packet |
226 | pload1(const typename unpacket_traits<Packet>::type *a) { return pset1<Packet>(*a); } |
227 | |
228 | /** \internal \returns a packet with elements of \a *from duplicated. |
229 | * For instance, for a packet of 8 elements, 4 scalars will be read from \a *from and |
230 | * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]} |
231 | * Currently, this function is only used for scalar * complex products. |
232 | */ |
233 | template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet |
234 | ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; } |
235 | |
236 | /** \internal \returns a packet with elements of \a *from quadrupled. |
237 | * For instance, for a packet of 8 elements, 2 scalars will be read from \a *from and |
238 | * replicated to form: {from[0],from[0],from[0],from[0],from[1],from[1],from[1],from[1]} |
239 | * Currently, this function is only used in matrix products. |
240 | * For packet-size smaller or equal to 4, this function is equivalent to pload1 |
241 | */ |
242 | template<typename Packet> EIGEN_DEVICE_FUNC inline Packet |
243 | ploadquad(const typename unpacket_traits<Packet>::type* from) |
244 | { return pload1<Packet>(from); } |
245 | |
246 | /** \internal equivalent to |
247 | * \code |
248 | * a0 = pload1(a+0); |
249 | * a1 = pload1(a+1); |
250 | * a2 = pload1(a+2); |
251 | * a3 = pload1(a+3); |
252 | * \endcode |
253 | * \sa pset1, pload1, ploaddup, pbroadcast2 |
254 | */ |
255 | template<typename Packet> EIGEN_DEVICE_FUNC |
256 | inline void pbroadcast4(const typename unpacket_traits<Packet>::type *a, |
257 | Packet& a0, Packet& a1, Packet& a2, Packet& a3) |
258 | { |
259 | a0 = pload1<Packet>(a+0); |
260 | a1 = pload1<Packet>(a+1); |
261 | a2 = pload1<Packet>(a+2); |
262 | a3 = pload1<Packet>(a+3); |
263 | } |
264 | |
265 | /** \internal equivalent to |
266 | * \code |
267 | * a0 = pload1(a+0); |
268 | * a1 = pload1(a+1); |
269 | * \endcode |
270 | * \sa pset1, pload1, ploaddup, pbroadcast4 |
271 | */ |
272 | template<typename Packet> EIGEN_DEVICE_FUNC |
273 | inline void pbroadcast2(const typename unpacket_traits<Packet>::type *a, |
274 | Packet& a0, Packet& a1) |
275 | { |
276 | a0 = pload1<Packet>(a+0); |
277 | a1 = pload1<Packet>(a+1); |
278 | } |
279 | |
280 | /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */ |
281 | template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet |
282 | plset(const typename unpacket_traits<Packet>::type& a) { return a; } |
283 | |
284 | /** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */ |
285 | template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(Scalar* to, const Packet& from) |
286 | { (*to) = from; } |
287 | |
288 | /** \internal copy the packet \a from to \a *to, (un-aligned store) */ |
289 | template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from) |
290 | { (*to) = from; } |
291 | |
292 | template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index /*stride*/) |
293 | { return ploadu<Packet>(from); } |
294 | |
295 | template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, Index /*stride*/) |
296 | { pstore(to, from); } |
297 | |
298 | /** \internal tries to do cache prefetching of \a addr */ |
299 | template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr) |
300 | { |
301 | #ifdef __CUDA_ARCH__ |
302 | #if defined(__LP64__) |
303 | // 64-bit pointer operand constraint for inlined asm |
304 | asm(" prefetch.L1 [ %1 ];" : "=l" (addr) : "l" (addr)); |
305 | #else |
306 | // 32-bit pointer operand constraint for inlined asm |
307 | asm(" prefetch.L1 [ %1 ];" : "=r" (addr) : "r" (addr)); |
308 | #endif |
309 | #elif (!EIGEN_COMP_MSVC) && (EIGEN_COMP_GNUC || EIGEN_COMP_CLANG || EIGEN_COMP_ICC) |
310 | __builtin_prefetch(addr); |
311 | #endif |
312 | } |
313 | |
314 | /** \internal \returns the first element of a packet */ |
315 | template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type pfirst(const Packet& a) |
316 | { return a; } |
317 | |
318 | /** \internal \returns a packet where the element i contains the sum of the packet of \a vec[i] */ |
319 | template<typename Packet> EIGEN_DEVICE_FUNC inline Packet |
320 | preduxp(const Packet* vecs) { return vecs[0]; } |
321 | |
322 | /** \internal \returns the sum of the elements of \a a*/ |
323 | template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux(const Packet& a) |
324 | { return a; } |
325 | |
326 | /** \internal \returns the sum of the elements of \a a by block of 4 elements. |
327 | * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7} |
328 | * For packet-size smaller or equal to 4, this boils down to a noop. |
329 | */ |
330 | template<typename Packet> EIGEN_DEVICE_FUNC inline |
331 | typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type |
332 | predux_downto4(const Packet& a) |
333 | { return a; } |
334 | |
335 | /** \internal \returns the product of the elements of \a a*/ |
336 | template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a) |
337 | { return a; } |
338 | |
339 | /** \internal \returns the min of the elements of \a a*/ |
340 | template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a) |
341 | { return a; } |
342 | |
343 | /** \internal \returns the max of the elements of \a a*/ |
344 | template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(const Packet& a) |
345 | { return a; } |
346 | |
347 | /** \internal \returns the reversed elements of \a a*/ |
348 | template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a) |
349 | { return a; } |
350 | |
351 | /** \internal \returns \a a with real and imaginary part flipped (for complex type only) */ |
352 | template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a) |
353 | { |
354 | // FIXME: uncomment the following in case we drop the internal imag and real functions. |
355 | // using std::imag; |
356 | // using std::real; |
357 | return Packet(imag(a),real(a)); |
358 | } |
359 | |
360 | /************************** |
361 | * Special math functions |
362 | ***************************/ |
363 | |
364 | /** \internal \returns the sine of \a a (coeff-wise) */ |
365 | template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS |
366 | Packet psin(const Packet& a) { using std::sin; return sin(a); } |
367 | |
368 | /** \internal \returns the cosine of \a a (coeff-wise) */ |
369 | template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS |
370 | Packet pcos(const Packet& a) { using std::cos; return cos(a); } |
371 | |
372 | /** \internal \returns the tan of \a a (coeff-wise) */ |
373 | template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS |
374 | Packet ptan(const Packet& a) { using std::tan; return tan(a); } |
375 | |
376 | /** \internal \returns the arc sine of \a a (coeff-wise) */ |
377 | template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS |
378 | Packet pasin(const Packet& a) { using std::asin; return asin(a); } |
379 | |
380 | /** \internal \returns the arc cosine of \a a (coeff-wise) */ |
381 | template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS |
382 | Packet pacos(const Packet& a) { using std::acos; return acos(a); } |
383 | |
384 | /** \internal \returns the arc tangent of \a a (coeff-wise) */ |
385 | template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS |
386 | Packet patan(const Packet& a) { using std::atan; return atan(a); } |
387 | |
388 | /** \internal \returns the hyperbolic sine of \a a (coeff-wise) */ |
389 | template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS |
390 | Packet psinh(const Packet& a) { using std::sinh; return sinh(a); } |
391 | |
392 | /** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */ |
393 | template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS |
394 | Packet pcosh(const Packet& a) { using std::cosh; return cosh(a); } |
395 | |
396 | /** \internal \returns the hyperbolic tan of \a a (coeff-wise) */ |
397 | template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS |
398 | Packet ptanh(const Packet& a) { using std::tanh; return tanh(a); } |
399 | |
400 | /** \internal \returns the exp of \a a (coeff-wise) */ |
401 | template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS |
402 | Packet pexp(const Packet& a) { using std::exp; return exp(a); } |
403 | |
404 | /** \internal \returns the log of \a a (coeff-wise) */ |
405 | template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS |
406 | Packet plog(const Packet& a) { using std::log; return log(a); } |
407 | |
408 | /** \internal \returns the log1p of \a a (coeff-wise) */ |
409 | template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS |
410 | Packet plog1p(const Packet& a) { return numext::log1p(a); } |
411 | |
412 | /** \internal \returns the log10 of \a a (coeff-wise) */ |
413 | template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS |
414 | Packet plog10(const Packet& a) { using std::log10; return log10(a); } |
415 | |
416 | /** \internal \returns the square-root of \a a (coeff-wise) */ |
417 | template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS |
418 | Packet psqrt(const Packet& a) { using std::sqrt; return sqrt(a); } |
419 | |
420 | /** \internal \returns the reciprocal square-root of \a a (coeff-wise) */ |
421 | template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS |
422 | Packet prsqrt(const Packet& a) { |
423 | return pdiv(pset1<Packet>(1), psqrt(a)); |
424 | } |
425 | |
426 | /** \internal \returns the rounded value of \a a (coeff-wise) */ |
427 | template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS |
428 | Packet pround(const Packet& a) { using numext::round; return round(a); } |
429 | |
430 | /** \internal \returns the floor of \a a (coeff-wise) */ |
431 | template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS |
432 | Packet pfloor(const Packet& a) { using numext::floor; return floor(a); } |
433 | |
434 | /** \internal \returns the ceil of \a a (coeff-wise) */ |
435 | template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS |
436 | Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); } |
437 | |
438 | /*************************************************************************** |
439 | * The following functions might not have to be overwritten for vectorized types |
440 | ***************************************************************************/ |
441 | |
442 | /** \internal copy a packet with constant coeficient \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16 bytes aligned */ |
443 | // NOTE: this function must really be templated on the packet type (think about different packet types for the same scalar type) |
444 | template<typename Packet> |
445 | inline void pstore1(typename unpacket_traits<Packet>::type* to, const typename unpacket_traits<Packet>::type& a) |
446 | { |
447 | pstore(to, pset1<Packet>(a)); |
448 | } |
449 | |
450 | /** \internal \returns a * b + c (coeff-wise) */ |
451 | template<typename Packet> EIGEN_DEVICE_FUNC inline Packet |
452 | pmadd(const Packet& a, |
453 | const Packet& b, |
454 | const Packet& c) |
455 | { return padd(pmul(a, b),c); } |
456 | |
457 | /** \internal \returns a packet version of \a *from. |
458 | * The pointer \a from must be aligned on a \a Alignment bytes boundary. */ |
459 | template<typename Packet, int Alignment> |
460 | EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_traits<Packet>::type* from) |
461 | { |
462 | if(Alignment >= unpacket_traits<Packet>::alignment) |
463 | return pload<Packet>(from); |
464 | else |
465 | return ploadu<Packet>(from); |
466 | } |
467 | |
468 | /** \internal copy the packet \a from to \a *to. |
469 | * The pointer \a from must be aligned on a \a Alignment bytes boundary. */ |
470 | template<typename Scalar, typename Packet, int Alignment> |
471 | EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& from) |
472 | { |
473 | if(Alignment >= unpacket_traits<Packet>::alignment) |
474 | pstore(to, from); |
475 | else |
476 | pstoreu(to, from); |
477 | } |
478 | |
479 | /** \internal \returns a packet version of \a *from. |
480 | * Unlike ploadt, ploadt_ro takes advantage of the read-only memory path on the |
481 | * hardware if available to speedup the loading of data that won't be modified |
482 | * by the current computation. |
483 | */ |
484 | template<typename Packet, int LoadMode> |
485 | EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from) |
486 | { |
487 | return ploadt<Packet, LoadMode>(from); |
488 | } |
489 | |
490 | /** \internal default implementation of palign() allowing partial specialization */ |
491 | template<int Offset,typename PacketType> |
492 | struct palign_impl |
493 | { |
494 | // by default data are aligned, so there is nothing to be done :) |
495 | static inline void run(PacketType&, const PacketType&) {} |
496 | }; |
497 | |
498 | /** \internal update \a first using the concatenation of the packet_size minus \a Offset last elements |
499 | * of \a first and \a Offset first elements of \a second. |
500 | * |
501 | * This function is currently only used to optimize matrix-vector products on unligned matrices. |
502 | * It takes 2 packets that represent a contiguous memory array, and returns a packet starting |
503 | * at the position \a Offset. For instance, for packets of 4 elements, we have: |
504 | * Input: |
505 | * - first = {f0,f1,f2,f3} |
506 | * - second = {s0,s1,s2,s3} |
507 | * Output: |
508 | * - if Offset==0 then {f0,f1,f2,f3} |
509 | * - if Offset==1 then {f1,f2,f3,s0} |
510 | * - if Offset==2 then {f2,f3,s0,s1} |
511 | * - if Offset==3 then {f3,s0,s1,s3} |
512 | */ |
513 | template<int Offset,typename PacketType> |
514 | inline void palign(PacketType& first, const PacketType& second) |
515 | { |
516 | palign_impl<Offset,PacketType>::run(first,second); |
517 | } |
518 | |
519 | /*************************************************************************** |
520 | * Fast complex products (GCC generates a function call which is very slow) |
521 | ***************************************************************************/ |
522 | |
523 | // Eigen+CUDA does not support complexes. |
524 | #ifndef __CUDACC__ |
525 | |
526 | template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b) |
527 | { return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); } |
528 | |
529 | template<> inline std::complex<double> pmul(const std::complex<double>& a, const std::complex<double>& b) |
530 | { return std::complex<double>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); } |
531 | |
532 | #endif |
533 | |
534 | |
535 | /*************************************************************************** |
536 | * PacketBlock, that is a collection of N packets where the number of words |
537 | * in the packet is a multiple of N. |
538 | ***************************************************************************/ |
539 | template <typename Packet,int N=unpacket_traits<Packet>::size> struct PacketBlock { |
540 | Packet packet[N]; |
541 | }; |
542 | |
543 | template<typename Packet> EIGEN_DEVICE_FUNC inline void |
544 | ptranspose(PacketBlock<Packet,1>& /*kernel*/) { |
545 | // Nothing to do in the scalar case, i.e. a 1x1 matrix. |
546 | } |
547 | |
548 | /*************************************************************************** |
549 | * Selector, i.e. vector of N boolean values used to select (i.e. blend) |
550 | * words from 2 packets. |
551 | ***************************************************************************/ |
552 | template <size_t N> struct Selector { |
553 | bool select[N]; |
554 | }; |
555 | |
556 | template<typename Packet> EIGEN_DEVICE_FUNC inline Packet |
557 | pblend(const Selector<unpacket_traits<Packet>::size>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) { |
558 | return ifPacket.select[0] ? thenPacket : elsePacket; |
559 | } |
560 | |
561 | /** \internal \returns \a a with the first coefficient replaced by the scalar b */ |
562 | template<typename Packet> EIGEN_DEVICE_FUNC inline Packet |
563 | pinsertfirst(const Packet& a, typename unpacket_traits<Packet>::type b) |
564 | { |
565 | // Default implementation based on pblend. |
566 | // It must be specialized for higher performance. |
567 | Selector<unpacket_traits<Packet>::size> mask; |
568 | mask.select[0] = true; |
569 | // This for loop should be optimized away by the compiler. |
570 | for(Index i=1; i<unpacket_traits<Packet>::size; ++i) |
571 | mask.select[i] = false; |
572 | return pblend(mask, pset1<Packet>(b), a); |
573 | } |
574 | |
575 | /** \internal \returns \a a with the last coefficient replaced by the scalar b */ |
576 | template<typename Packet> EIGEN_DEVICE_FUNC inline Packet |
577 | pinsertlast(const Packet& a, typename unpacket_traits<Packet>::type b) |
578 | { |
579 | // Default implementation based on pblend. |
580 | // It must be specialized for higher performance. |
581 | Selector<unpacket_traits<Packet>::size> mask; |
582 | // This for loop should be optimized away by the compiler. |
583 | for(Index i=0; i<unpacket_traits<Packet>::size-1; ++i) |
584 | mask.select[i] = false; |
585 | mask.select[unpacket_traits<Packet>::size-1] = true; |
586 | return pblend(mask, pset1<Packet>(b), a); |
587 | } |
588 | |
589 | } // end namespace internal |
590 | |
591 | } // end namespace Eigen |
592 | |
593 | #endif // EIGEN_GENERIC_PACKET_MATH_H |
594 | |