/*  Copyright (C) 2016  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_F_REDUCE_ADD_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_F_REDUCE_ADD_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/extract.h>
#include <simdpp/core/f_add.h>
#include <simdpp/core/move_l.h>     // move4_l<>() is used in the ALTIVEC/MSA branch below
#include <simdpp/core/permute2.h>
#include <simdpp/detail/extract128.h>
#include <simdpp/detail/workarounds.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

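// i_reduce_add() returns the horizontal sum of all lanes of a floating-point
// vector as a scalar. Each overload below handles one native vector width;
// the branch taken is selected at compile time by the SIMDPP_USE_* macros.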
static SIMDPP_INL
float i_reduce_add(const float32x4& a)
{
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    // Scalar fallback: accumulate the lanes one by one.
    float r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#elif SIMDPP_USE_SSE3
    // Two horizontal adds collapse all four lanes into lane 0.
    float32x4 b = a;
    b = _mm_hadd_ps(b.native(), b.native());
    b = _mm_hadd_ps(b.native(), b.native());
    return _mm_cvtss_f32(b.native());
#elif SIMDPP_USE_SSE2
    // Add the high pair onto the low pair, then add the two remaining lanes.
    float32x4 sum2 = _mm_movehl_ps(a.native(), a.native());
    float32x4 sum = add(a, sum2);
    sum = add(sum, permute2<1,0>(sum));
    return _mm_cvtss_f32(sum.native());
#elif SIMDPP_USE_NEON_FLT_SP
    // Pairwise-add the low and high halves, then the two partial sums.
    float32x2_t a2 = vpadd_f32(vget_low_f32(a.native()), vget_high_f32(a.native()));
    a2 = vpadd_f32(a2, a2);
    return vget_lane_f32(a2, 0);
#elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // Shift the vector down by one and then two lanes, adding at each step;
    // lane 0 then holds the total.
    float32x4 b = a;
    b = add(b, move4_l<1>(b));
    b = add(b, move4_l<2>(b));
    return extract<0>(b);
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
float i_reduce_add(const float32x8& a)
{
    // Reduce the two 128-bit halves together: three horizontal adds bring
    // the sum of all eight lanes into lane 0.
    __m128 ah = detail::extract128<1>(a).native();
    __m128 al = detail::extract128<0>(a).native();
    al = _mm_hadd_ps(al, ah);
    al = _mm_hadd_ps(al, al);
    al = _mm_hadd_ps(al, al);
    return _mm_cvtss_f32(al);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
float i_reduce_add(const float32<16>& a)
{
    // Add the two 256-bit halves, then reduce the result with the overload above.
    return i_reduce_add(add(extract256<0>(a), extract256<1>(a)));
}
#endif

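// Vectors wider than the native width are reduced by first adding their
// sub-vectors element-wise and then reducing the single remaining native
// vector with one of the overloads above.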
template<unsigned N>
SIMDPP_INL float i_reduce_add(const float32<N>& a)
{
    float32v r = a.vec(0);
    for (unsigned i = 1; i < a.vec_length; ++i)
        r = add(r, a.vec(i));
    return i_reduce_add(r);
}

// -----------------------------------------------------------------------------

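// The float64 overloads below follow the same pattern: a pairwise horizontal
// add where the ISA provides one (SSE3, NEON64), a shuffle-and-add otherwise,
// and a scalar loop as the portable fallback.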
static SIMDPP_INL
double i_reduce_add(const float64x2& a)
{
#if SIMDPP_USE_SSE3
    // A single horizontal add sums both lanes into lane 0.
    return _mm_cvtsd_f64(_mm_hadd_pd(a.native(), a.native()));
#elif SIMDPP_USE_SSE2
    // Add lane 1 onto lane 0 via a shuffle.
    float64x2 b = add(a, permute2<1,1>(a));
    return _mm_cvtsd_f64(b.native());
#elif SIMDPP_USE_NEON64
    float64x2_t a2 = vpaddq_f64(a.native(), a.native());
    return vgetq_lane_f64(a2, 0);
#elif SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
    float64x2 b = add(a, permute2<1,1>(a));
    return extract<0>(b);
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
    // Scalar fallback: accumulate the lanes one by one.
    double r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
double i_reduce_add(const float64x4& a)
{
    __m128d ah = detail::extract128<1>(a).native();
    __m128d al = detail::extract128<0>(a).native();
    al = _mm_hadd_pd(al, ah);
    al = _mm_hadd_pd(al, al);
    return _mm_cvtsd_f64(al);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
double i_reduce_add(const float64<8>& a)
{
    return i_reduce_add(add(extract256<0>(a), extract256<1>(a)));
}
#endif

template<unsigned N>
SIMDPP_INL double i_reduce_add(const float64<N>& a)
{
    float64v r = a.vec(0);
    for (unsigned i = 1; i < a.vec_length; ++i)
        r = add(r, a.vec(i));
    return i_reduce_add(r);
}
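
// A minimal usage sketch (assuming the public simdpp::reduce_add wrapper in
// simdpp/core/f_reduce_add.h forwards to these overloads; that wrapper is not
// defined in this header):
//
//     simdpp::float32<4> v = simdpp::make_float(1.f, 2.f, 3.f, 4.f);
//     float sum = simdpp::reduce_add(v);   // sum == 10.f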

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif