1 | // This file is part of Eigen, a lightweight C++ template library |
2 | // for linear algebra. |
3 | // |
4 | // Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr> |
5 | // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com> |
6 | // |
7 | // This Source Code Form is subject to the terms of the Mozilla |
8 | // Public License v. 2.0. If a copy of the MPL was not distributed |
9 | // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. |
10 | |
11 | #ifndef EIGEN_REDUX_H |
12 | #define EIGEN_REDUX_H |
13 | |
14 | namespace Eigen { |
15 | |
16 | namespace internal { |
17 | |
18 | // TODO |
19 | // * implement other kind of vectorization |
20 | // * factorize code |
21 | |
22 | /*************************************************************************** |
23 | * Part 1 : the logic deciding a strategy for vectorization and unrolling |
24 | ***************************************************************************/ |
25 | |
26 | template<typename Func, typename Derived> |
27 | struct redux_traits |
28 | { |
29 | public: |
30 | typedef typename find_best_packet<typename Derived::Scalar,Derived::SizeAtCompileTime>::type PacketType; |
31 | enum { |
32 | PacketSize = unpacket_traits<PacketType>::size, |
33 | InnerMaxSize = int(Derived::IsRowMajor) |
34 | ? Derived::MaxColsAtCompileTime |
35 | : Derived::MaxRowsAtCompileTime |
36 | }; |
37 | |
38 | enum { |
39 | MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit) |
40 | && (functor_traits<Func>::PacketAccess), |
41 | MayLinearVectorize = bool(MightVectorize) && (int(Derived::Flags)&LinearAccessBit), |
42 | MaySliceVectorize = bool(MightVectorize) && int(InnerMaxSize)>=3*PacketSize |
43 | }; |
44 | |
45 | public: |
46 | enum { |
47 | Traversal = int(MayLinearVectorize) ? int(LinearVectorizedTraversal) |
48 | : int(MaySliceVectorize) ? int(SliceVectorizedTraversal) |
49 | : int(DefaultTraversal) |
50 | }; |
51 | |
52 | public: |
53 | enum { |
54 | Cost = Derived::SizeAtCompileTime == Dynamic ? HugeCost |
55 | : Derived::SizeAtCompileTime * Derived::CoeffReadCost + (Derived::SizeAtCompileTime-1) * functor_traits<Func>::Cost, |
56 | UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize)) |
57 | }; |
58 | |
59 | public: |
60 | enum { |
61 | Unrolling = Cost <= UnrollingLimit ? CompleteUnrolling : NoUnrolling |
62 | }; |
63 | |
64 | #ifdef EIGEN_DEBUG_ASSIGN |
65 | static void debug() |
66 | { |
67 | std::cerr << "Xpr: " << typeid(typename Derived::XprType).name() << std::endl; |
68 | std::cerr.setf(std::ios::hex, std::ios::basefield); |
69 | EIGEN_DEBUG_VAR(Derived::Flags) |
70 | std::cerr.unsetf(std::ios::hex); |
71 | EIGEN_DEBUG_VAR(InnerMaxSize) |
72 | EIGEN_DEBUG_VAR(PacketSize) |
73 | EIGEN_DEBUG_VAR(MightVectorize) |
74 | EIGEN_DEBUG_VAR(MayLinearVectorize) |
75 | EIGEN_DEBUG_VAR(MaySliceVectorize) |
76 | EIGEN_DEBUG_VAR(Traversal) |
77 | EIGEN_DEBUG_VAR(UnrollingLimit) |
78 | EIGEN_DEBUG_VAR(Unrolling) |
79 | std::cerr << std::endl; |
80 | } |
81 | #endif |
82 | }; |
83 | |
84 | /*************************************************************************** |
85 | * Part 2 : unrollers |
86 | ***************************************************************************/ |
87 | |
88 | /*** no vectorization ***/ |
89 | |
90 | template<typename Func, typename Derived, int Start, int Length> |
91 | struct redux_novec_unroller |
92 | { |
93 | enum { |
94 | HalfLength = Length/2 |
95 | }; |
96 | |
97 | typedef typename Derived::Scalar Scalar; |
98 | |
99 | EIGEN_DEVICE_FUNC |
100 | static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func) |
101 | { |
102 | return func(redux_novec_unroller<Func, Derived, Start, HalfLength>::run(mat,func), |
103 | redux_novec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func)); |
104 | } |
105 | }; |
106 | |
107 | template<typename Func, typename Derived, int Start> |
108 | struct redux_novec_unroller<Func, Derived, Start, 1> |
109 | { |
110 | enum { |
111 | outer = Start / Derived::InnerSizeAtCompileTime, |
112 | inner = Start % Derived::InnerSizeAtCompileTime |
113 | }; |
114 | |
115 | typedef typename Derived::Scalar Scalar; |
116 | |
117 | EIGEN_DEVICE_FUNC |
118 | static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&) |
119 | { |
120 | return mat.coeffByOuterInner(outer, inner); |
121 | } |
122 | }; |
123 | |
124 | // This is actually dead code and will never be called. It is required |
125 | // to prevent false warnings regarding failed inlining though |
126 | // for 0 length run() will never be called at all. |
127 | template<typename Func, typename Derived, int Start> |
128 | struct redux_novec_unroller<Func, Derived, Start, 0> |
129 | { |
130 | typedef typename Derived::Scalar Scalar; |
131 | EIGEN_DEVICE_FUNC |
132 | static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); } |
133 | }; |
134 | |
135 | /*** vectorization ***/ |
136 | |
137 | template<typename Func, typename Derived, int Start, int Length> |
138 | struct redux_vec_unroller |
139 | { |
140 | enum { |
141 | PacketSize = redux_traits<Func, Derived>::PacketSize, |
142 | HalfLength = Length/2 |
143 | }; |
144 | |
145 | typedef typename Derived::Scalar Scalar; |
146 | typedef typename redux_traits<Func, Derived>::PacketType PacketScalar; |
147 | |
148 | static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func) |
149 | { |
150 | return func.packetOp( |
151 | redux_vec_unroller<Func, Derived, Start, HalfLength>::run(mat,func), |
152 | redux_vec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func) ); |
153 | } |
154 | }; |
155 | |
156 | template<typename Func, typename Derived, int Start> |
157 | struct redux_vec_unroller<Func, Derived, Start, 1> |
158 | { |
159 | enum { |
160 | index = Start * redux_traits<Func, Derived>::PacketSize, |
161 | outer = index / int(Derived::InnerSizeAtCompileTime), |
162 | inner = index % int(Derived::InnerSizeAtCompileTime), |
163 | alignment = Derived::Alignment |
164 | }; |
165 | |
166 | typedef typename Derived::Scalar Scalar; |
167 | typedef typename redux_traits<Func, Derived>::PacketType PacketScalar; |
168 | |
169 | static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&) |
170 | { |
171 | return mat.template packetByOuterInner<alignment,PacketScalar>(outer, inner); |
172 | } |
173 | }; |
174 | |
175 | /*************************************************************************** |
176 | * Part 3 : implementation of all cases |
177 | ***************************************************************************/ |
178 | |
179 | template<typename Func, typename Derived, |
180 | int Traversal = redux_traits<Func, Derived>::Traversal, |
181 | int Unrolling = redux_traits<Func, Derived>::Unrolling |
182 | > |
183 | struct redux_impl; |
184 | |
185 | template<typename Func, typename Derived> |
186 | struct redux_impl<Func, Derived, DefaultTraversal, NoUnrolling> |
187 | { |
188 | typedef typename Derived::Scalar Scalar; |
189 | EIGEN_DEVICE_FUNC |
190 | static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func) |
191 | { |
192 | eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix" ); |
193 | Scalar res; |
194 | res = mat.coeffByOuterInner(0, 0); |
195 | for(Index i = 1; i < mat.innerSize(); ++i) |
196 | res = func(res, mat.coeffByOuterInner(0, i)); |
197 | for(Index i = 1; i < mat.outerSize(); ++i) |
198 | for(Index j = 0; j < mat.innerSize(); ++j) |
199 | res = func(res, mat.coeffByOuterInner(i, j)); |
200 | return res; |
201 | } |
202 | }; |
203 | |
204 | template<typename Func, typename Derived> |
205 | struct redux_impl<Func,Derived, DefaultTraversal, CompleteUnrolling> |
206 | : public redux_novec_unroller<Func,Derived, 0, Derived::SizeAtCompileTime> |
207 | {}; |
208 | |
209 | template<typename Func, typename Derived> |
210 | struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling> |
211 | { |
212 | typedef typename Derived::Scalar Scalar; |
213 | typedef typename redux_traits<Func, Derived>::PacketType PacketScalar; |
214 | |
215 | static Scalar run(const Derived &mat, const Func& func) |
216 | { |
217 | const Index size = mat.size(); |
218 | |
219 | const Index packetSize = redux_traits<Func, Derived>::PacketSize; |
220 | const int packetAlignment = unpacket_traits<PacketScalar>::alignment; |
221 | enum { |
222 | alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned), |
223 | alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment) |
224 | }; |
225 | const Index alignedStart = internal::first_default_aligned(mat.nestedExpression()); |
226 | const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize); |
227 | const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize); |
228 | const Index alignedEnd2 = alignedStart + alignedSize2; |
229 | const Index alignedEnd = alignedStart + alignedSize; |
230 | Scalar res; |
231 | if(alignedSize) |
232 | { |
233 | PacketScalar packet_res0 = mat.template packet<alignment,PacketScalar>(alignedStart); |
234 | if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop |
235 | { |
236 | PacketScalar packet_res1 = mat.template packet<alignment,PacketScalar>(alignedStart+packetSize); |
237 | for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize) |
238 | { |
239 | packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(index)); |
240 | packet_res1 = func.packetOp(packet_res1, mat.template packet<alignment,PacketScalar>(index+packetSize)); |
241 | } |
242 | |
243 | packet_res0 = func.packetOp(packet_res0,packet_res1); |
244 | if(alignedEnd>alignedEnd2) |
245 | packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(alignedEnd2)); |
246 | } |
247 | res = func.predux(packet_res0); |
248 | |
249 | for(Index index = 0; index < alignedStart; ++index) |
250 | res = func(res,mat.coeff(index)); |
251 | |
252 | for(Index index = alignedEnd; index < size; ++index) |
253 | res = func(res,mat.coeff(index)); |
254 | } |
255 | else // too small to vectorize anything. |
256 | // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize. |
257 | { |
258 | res = mat.coeff(0); |
259 | for(Index index = 1; index < size; ++index) |
260 | res = func(res,mat.coeff(index)); |
261 | } |
262 | |
263 | return res; |
264 | } |
265 | }; |
266 | |
267 | // NOTE: for SliceVectorizedTraversal we simply bypass unrolling |
268 | template<typename Func, typename Derived, int Unrolling> |
269 | struct redux_impl<Func, Derived, SliceVectorizedTraversal, Unrolling> |
270 | { |
271 | typedef typename Derived::Scalar Scalar; |
272 | typedef typename redux_traits<Func, Derived>::PacketType PacketType; |
273 | |
274 | EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func) |
275 | { |
276 | eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix" ); |
277 | const Index innerSize = mat.innerSize(); |
278 | const Index outerSize = mat.outerSize(); |
279 | enum { |
280 | packetSize = redux_traits<Func, Derived>::PacketSize |
281 | }; |
282 | const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize; |
283 | Scalar res; |
284 | if(packetedInnerSize) |
285 | { |
286 | PacketType packet_res = mat.template packet<Unaligned,PacketType>(0,0); |
287 | for(Index j=0; j<outerSize; ++j) |
288 | for(Index i=(j==0?packetSize:0); i<packetedInnerSize; i+=Index(packetSize)) |
289 | packet_res = func.packetOp(packet_res, mat.template packetByOuterInner<Unaligned,PacketType>(j,i)); |
290 | |
291 | res = func.predux(packet_res); |
292 | for(Index j=0; j<outerSize; ++j) |
293 | for(Index i=packetedInnerSize; i<innerSize; ++i) |
294 | res = func(res, mat.coeffByOuterInner(j,i)); |
295 | } |
296 | else // too small to vectorize anything. |
297 | // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize. |
298 | { |
299 | res = redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>::run(mat, func); |
300 | } |
301 | |
302 | return res; |
303 | } |
304 | }; |
305 | |
306 | template<typename Func, typename Derived> |
307 | struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling> |
308 | { |
309 | typedef typename Derived::Scalar Scalar; |
310 | |
311 | typedef typename redux_traits<Func, Derived>::PacketType PacketScalar; |
312 | enum { |
313 | PacketSize = redux_traits<Func, Derived>::PacketSize, |
314 | Size = Derived::SizeAtCompileTime, |
315 | VectorizedSize = (Size / PacketSize) * PacketSize |
316 | }; |
317 | EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func) |
318 | { |
319 | eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix" ); |
320 | if (VectorizedSize > 0) { |
321 | Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func)); |
322 | if (VectorizedSize != Size) |
323 | res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func)); |
324 | return res; |
325 | } |
326 | else { |
327 | return redux_novec_unroller<Func, Derived, 0, Size>::run(mat,func); |
328 | } |
329 | } |
330 | }; |
331 | |
332 | // evaluator adaptor |
333 | template<typename _XprType> |
334 | class redux_evaluator |
335 | { |
336 | public: |
337 | typedef _XprType XprType; |
338 | EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {} |
339 | |
340 | typedef typename XprType::Scalar Scalar; |
341 | typedef typename XprType::CoeffReturnType CoeffReturnType; |
342 | typedef typename XprType::PacketScalar PacketScalar; |
343 | typedef typename XprType::PacketReturnType PacketReturnType; |
344 | |
345 | enum { |
346 | MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime, |
347 | MaxColsAtCompileTime = XprType::MaxColsAtCompileTime, |
348 | // TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator |
349 | Flags = evaluator<XprType>::Flags & ~DirectAccessBit, |
350 | IsRowMajor = XprType::IsRowMajor, |
351 | SizeAtCompileTime = XprType::SizeAtCompileTime, |
352 | InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime, |
353 | CoeffReadCost = evaluator<XprType>::CoeffReadCost, |
354 | Alignment = evaluator<XprType>::Alignment |
355 | }; |
356 | |
357 | EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); } |
358 | EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); } |
359 | EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); } |
360 | EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); } |
361 | EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); } |
362 | |
363 | EIGEN_DEVICE_FUNC |
364 | CoeffReturnType coeff(Index row, Index col) const |
365 | { return m_evaluator.coeff(row, col); } |
366 | |
367 | EIGEN_DEVICE_FUNC |
368 | CoeffReturnType coeff(Index index) const |
369 | { return m_evaluator.coeff(index); } |
370 | |
371 | template<int LoadMode, typename PacketType> |
372 | PacketType packet(Index row, Index col) const |
373 | { return m_evaluator.template packet<LoadMode,PacketType>(row, col); } |
374 | |
375 | template<int LoadMode, typename PacketType> |
376 | PacketType packet(Index index) const |
377 | { return m_evaluator.template packet<LoadMode,PacketType>(index); } |
378 | |
379 | EIGEN_DEVICE_FUNC |
380 | CoeffReturnType coeffByOuterInner(Index outer, Index inner) const |
381 | { return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } |
382 | |
383 | template<int LoadMode, typename PacketType> |
384 | PacketType packetByOuterInner(Index outer, Index inner) const |
385 | { return m_evaluator.template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } |
386 | |
387 | const XprType & nestedExpression() const { return m_xpr; } |
388 | |
389 | protected: |
390 | internal::evaluator<XprType> m_evaluator; |
391 | const XprType &m_xpr; |
392 | }; |
393 | |
394 | } // end namespace internal |
395 | |
396 | /*************************************************************************** |
397 | * Part 4 : public API |
398 | ***************************************************************************/ |
399 | |
400 | |
401 | /** \returns the result of a full redux operation on the whole matrix or vector using \a func |
402 | * |
403 | * The template parameter \a BinaryOp is the type of the functor \a func which must be |
404 | * an associative operator. Both current C++98 and C++11 functor styles are handled. |
405 | * |
406 | * \sa DenseBase::sum(), DenseBase::minCoeff(), DenseBase::maxCoeff(), MatrixBase::colwise(), MatrixBase::rowwise() |
407 | */ |
408 | template<typename Derived> |
409 | template<typename Func> |
410 | EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar |
411 | DenseBase<Derived>::redux(const Func& func) const |
412 | { |
413 | eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix" ); |
414 | |
415 | typedef typename internal::redux_evaluator<Derived> ThisEvaluator; |
416 | ThisEvaluator thisEval(derived()); |
417 | |
418 | return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func); |
419 | } |
420 | |
421 | /** \returns the minimum of all coefficients of \c *this. |
422 | * \warning the result is undefined if \c *this contains NaN. |
423 | */ |
424 | template<typename Derived> |
425 | EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar |
426 | DenseBase<Derived>::minCoeff() const |
427 | { |
428 | return derived().redux(Eigen::internal::scalar_min_op<Scalar,Scalar>()); |
429 | } |
430 | |
431 | /** \returns the maximum of all coefficients of \c *this. |
432 | * \warning the result is undefined if \c *this contains NaN. |
433 | */ |
434 | template<typename Derived> |
435 | EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar |
436 | DenseBase<Derived>::maxCoeff() const |
437 | { |
438 | return derived().redux(Eigen::internal::scalar_max_op<Scalar,Scalar>()); |
439 | } |
440 | |
441 | /** \returns the sum of all coefficients of \c *this |
442 | * |
443 | * If \c *this is empty, then the value 0 is returned. |
444 | * |
445 | * \sa trace(), prod(), mean() |
446 | */ |
447 | template<typename Derived> |
448 | EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar |
449 | DenseBase<Derived>::sum() const |
450 | { |
451 | if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0)) |
452 | return Scalar(0); |
453 | return derived().redux(Eigen::internal::scalar_sum_op<Scalar,Scalar>()); |
454 | } |
455 | |
456 | /** \returns the mean of all coefficients of *this |
457 | * |
458 | * \sa trace(), prod(), sum() |
459 | */ |
460 | template<typename Derived> |
461 | EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar |
462 | DenseBase<Derived>::mean() const |
463 | { |
464 | #ifdef __INTEL_COMPILER |
465 | #pragma warning push |
466 | #pragma warning ( disable : 2259 ) |
467 | #endif |
468 | return Scalar(derived().redux(Eigen::internal::scalar_sum_op<Scalar,Scalar>())) / Scalar(this->size()); |
469 | #ifdef __INTEL_COMPILER |
470 | #pragma warning pop |
471 | #endif |
472 | } |
473 | |
474 | /** \returns the product of all coefficients of *this |
475 | * |
476 | * Example: \include MatrixBase_prod.cpp |
477 | * Output: \verbinclude MatrixBase_prod.out |
478 | * |
479 | * \sa sum(), mean(), trace() |
480 | */ |
481 | template<typename Derived> |
482 | EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar |
483 | DenseBase<Derived>::prod() const |
484 | { |
485 | if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0)) |
486 | return Scalar(1); |
487 | return derived().redux(Eigen::internal::scalar_product_op<Scalar>()); |
488 | } |
489 | |
490 | /** \returns the trace of \c *this, i.e. the sum of the coefficients on the main diagonal. |
491 | * |
492 | * \c *this can be any matrix, not necessarily square. |
493 | * |
494 | * \sa diagonal(), sum() |
495 | */ |
496 | template<typename Derived> |
497 | EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar |
498 | MatrixBase<Derived>::trace() const |
499 | { |
500 | return derived().diagonal().sum(); |
501 | } |
502 | |
503 | } // end namespace Eigen |
504 | |
505 | #endif // EIGEN_REDUX_H |
506 | |