/*  Copyright (C) 2016  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_ADD_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_ADD_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/extract.h>
#include <simdpp/core/move_l.h>
#include <simdpp/core/make_uint.h>
#include <simdpp/detail/extract128.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {

// forward declarations
template<unsigned N, class E> SIMDPP_INL
int16_t reduce_add(const int8<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
uint16_t reduce_add(const uint8<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
int32_t reduce_add(const int16<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
uint32_t reduce_add(const uint16<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
int32_t reduce_add(const int32<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
uint32_t reduce_add(const uint32<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
int64_t reduce_add(const int64<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
uint64_t reduce_add(const uint64<N,E>& a);

namespace detail {
namespace insn {

static SIMDPP_INL
uint16_t i_reduce_add(const uint8x16& a)
{
#if SIMDPP_USE_NULL
    uint16_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#elif SIMDPP_USE_XOP
    uint16x8 sum = _mm_haddq_epu8(a.native());
    return extract<0>(sum) + extract<4>(sum);
#elif SIMDPP_USE_SSE2
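    // SAD against zero sums the bytes of each 64-bit half. Each sum fits into
    // 16 bits, so the two results end up in 16-bit elements 0 and 4.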
    uint16x8 sum = _mm_sad_epu8(a.native(), _mm_setzero_si128());
    return extract<0>(sum) + extract<4>(sum);
#elif SIMDPP_USE_NEON
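    // pairwise widening additions: 8 -> 16 -> 32 -> 64 bits, then add the low
    // 32 bits of the two 64-bit sums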
    uint16x8 a2 = vpaddlq_u8(a.native());
    uint32x4 a3 = vpaddlq_u16(a2.native());
    uint64x2 a4 = vpaddlq_u32(a3.native());
    a3 = a4;
    uint32x2_t r = vadd_u32(vget_low_u32(a3.native()), vget_high_u32(a3.native()));
    return vget_lane_u32(r, 0);
#elif SIMDPP_USE_ALTIVEC
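    // vec_sum4s adds each group of 4 bytes to the corresponding 32-bit element
    // of the accumulator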
    uint32x4 sum = make_zero();
    sum = vec_sum4s(a.native(), sum.native());
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_MSA
    uint16<8> s16 = __msa_hadd_u_h(a.native(), a.native());
    uint32<4> s32 = __msa_hadd_u_w(s16.native(), s16.native());
    s32 = (uint64<2>) __msa_hadd_u_d(s32.native(), s32.native());
    s32 = add(s32, move4_l<2>(s32));
    return extract<0>(s32);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint16_t i_reduce_add(const uint8x32& a)
{
    uint16x16 sum2 = _mm256_sad_epu8(a.native(), _mm256_setzero_si256()); // four 64-bit sums, i.e. 16-bit elements 0, 4, 8 and 12
    uint16x8 sum = add(detail::extract128<0>(sum2), detail::extract128<1>(sum2));
    return extract<0>(sum) + extract<4>(sum);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint16_t i_reduce_add(const uint8<64>& a)
{
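    // _mm512_sad_epu8 produces eight 64-bit byte sums; reduce them as a
    // uint64 vector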
    uint64<8> sum2 = _mm512_sad_epu8(a.native(), _mm512_setzero_si512());
    return reduce_add(sum2);
}
#endif

template<unsigned N>
SIMDPP_INL uint16_t i_reduce_add(const uint8<N>& a)
{
#if SIMDPP_USE_NULL
    uint16_t r = 0;
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r += a.vec(j).el(i);
        }
    }
    return r;
#elif SIMDPP_USE_AVX512BW
    uint64<8> sum2 = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint64<8> sum = _mm512_sad_epu8(a.vec(j).native(), _mm512_setzero_si512());
        sum2 = add(sum2, sum);
    }
    return reduce_add(sum2);
#elif SIMDPP_USE_AVX2
    uint16x16 r = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint16x16 sum = _mm256_sad_epu8(a.vec(j).native(), _mm256_setzero_si256());
        r = add(r, sum);
    }
    uint16x8 rl = add(detail::extract128<0>(r), detail::extract128<1>(r));
    return extract<0>(rl) + extract<4>(rl);
#elif SIMDPP_USE_SSE2
    uint16x8 r = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
#if SIMDPP_USE_XOP
        uint16x8 sum = _mm_haddq_epu8(a.vec(j).native());
#else
        uint16x8 sum = _mm_sad_epu8(a.vec(j).native(), _mm_setzero_si128());
#endif
        r = add(r, sum);
    }
    return extract<0>(r) + extract<4>(r);
#elif SIMDPP_USE_NEON
    uint16x8 r = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint16x8 sum = vpaddlq_u8(a.vec(j).native());
        r = add(r, sum);
    }
    uint32x4 r2 = vpaddlq_u16(r.native());
    uint64x2 r3 = vpaddlq_u32(r2.native());
    r2 = r3;
    uint32x2_t r4 = vadd_u32(vget_low_u32(r2.native()),
                             vget_high_u32(r2.native()));
    return vget_lane_u32(r4, 0);
#elif SIMDPP_USE_ALTIVEC
    uint32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        sum = vec_sum4s(a.vec(j).native(), sum.native());
    }
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_MSA
    uint16<8> r = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint16x8 sum = __msa_hadd_u_h(a.vec(j).native(), a.vec(j).native());
        r = add(r, sum);
    }
    uint32<4> s32 = __msa_hadd_u_w(r.native(), r.native());
    s32 = (v4u32) __msa_hadd_u_d(s32.native(), s32.native());
    s32 = add(s32, move4_l<2>(s32));
    return extract<0>(s32);
#endif
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
int16_t i_reduce_add(const int8x16& a)
{
#if SIMDPP_USE_NULL
    int16_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#elif SIMDPP_USE_XOP
    uint16x8 sum = _mm_haddq_epi8(a.native());
    return extract<0>(sum) + extract<4>(sum);
#elif SIMDPP_USE_SSE2
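    // there is no signed SAD: bias the elements into the unsigned range by
    // XOR-ing with 0x80, reduce as unsigned and subtract the accumulated bias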
    return i_reduce_add(uint8x16(bit_xor(a, 0x80))) - a.length*0x80;
#elif SIMDPP_USE_NEON
    int16x8 a2 = vpaddlq_s8(a.native());
    int32x4 a3 = vpaddlq_s16(a2.native());
    int64x2 a4 = vpaddlq_s32(a3.native());
    a3 = a4;
    int32x2_t r = vadd_s32(vget_low_s32(a3.native()),
                           vget_high_s32(a3.native()));
    return vget_lane_s32(r, 0);
#elif SIMDPP_USE_ALTIVEC
    int32x4 sum = make_zero();
    sum = vec_sum4s(a.native(), sum.native());
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_MSA
    int16<8> s16 = __msa_hadd_s_h(a.native(), a.native());
    int32<4> s32 = __msa_hadd_s_w(s16.native(), s16.native());
    s32 = (v4i32) __msa_hadd_s_d(s32.native(), s32.native());
    s32 = add(s32, move4_l<2>(s32));
    return extract<0>(s32);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int16_t i_reduce_add(const int8x32& a)
{
    return i_reduce_add(uint8x32(bit_xor(a, 0x80))) - a.length*0x80;
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL int16_t i_reduce_add(const int8<64>& a)
{
    return i_reduce_add(uint8<64>(bit_xor(a, 0x80))) - a.length*0x80;
}
#endif

template<unsigned N>
SIMDPP_INL int16_t i_reduce_add(const int8<N>& a)
{
#if SIMDPP_USE_NULL
    int16_t r = 0;
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r += a.vec(j).el(i);
        }
    }
    return r;
#elif SIMDPP_USE_AVX512BW || SIMDPP_USE_AVX2
    return i_reduce_add(uint8<N>(bit_xor(a, 0x80))) - a.length*0x80;
#elif SIMDPP_USE_XOP
    int16x8 r = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int16x8 sum = _mm_haddq_epi8(a.vec(j).native());
        r = add(r, sum);
    }
    return extract<0>(r) + extract<4>(r);
#elif SIMDPP_USE_SSE2
    return i_reduce_add(uint8<N>(bit_xor(a, 0x80))) - a.length*0x80;
#elif SIMDPP_USE_NEON
    int16x8 r = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int16x8 sum = vpaddlq_s8(a.vec(j).native());
        r = add(r, sum);
    }
    int32x4 r2 = vpaddlq_s16(r.native());
    int64x2 r3 = vpaddlq_s32(r2.native());
    r2 = r3;
    int32x2_t r4 = vadd_s32(vget_low_s32(r2.native()),
                            vget_high_s32(r2.native()));
    return vget_lane_s32(r4, 0);
#elif SIMDPP_USE_ALTIVEC
    int32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        sum = vec_sum4s(a.vec(j).native(), sum.native());
    }
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_MSA
    int16<8> r = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int16x8 sum = __msa_hadd_s_h(a.vec(j).native(), a.vec(j).native());
        r = add(r, sum);
    }
    int32<4> s32 = __msa_hadd_s_w(r.native(), r.native());
    s32 = (v4i32) __msa_hadd_s_d(s32.native(), s32.native());
    s32 = add(s32, move4_l<2>(s32));
    return extract<0>(s32);
#endif
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint32_t i_reduce_add(const uint16x8& a)
{
#if SIMDPP_USE_NULL
    uint32_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#elif SIMDPP_USE_XOP
    uint32x4 sum = _mm_haddq_epu16(a.native()); // 64-bit sums; the low 32 bits are in elements 0 and 2
    sum = add(sum, move4_l<2>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_SSE2
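    // bias the elements into the signed range so that _mm_madd_epi16 (a signed
    // multiply-add) can be used; the bias is added back after the reduction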
    uint16x8 ones = make_uint(1);
    uint16x8 ca = bit_xor(a, 0x8000);
    uint32x4 sum = _mm_madd_epi16(ca.native(), ones.native());
    // phadd is the slower option on Intel processors
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum) + 0x8000 * a.length;
#elif SIMDPP_USE_NEON
    uint32x4 a2 = vpaddlq_u16(a.native());
    uint64x2 a3 = vpaddlq_u32(a2.native());
    a2 = a3;
    uint32x2_t r = vadd_u32(vget_low_u32(a2.native()),
                            vget_high_u32(a2.native()));
    return vget_lane_u32(r, 0);
#elif SIMDPP_USE_ALTIVEC
    int32x4 sum = make_zero();
    int16x8 ca = bit_xor(a, 0x8000);
    sum = vec_sum4s(ca.native(), sum.native());
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum) + 0x8000 * a.length;
#elif SIMDPP_USE_MSA
    uint32<4> s32 = __msa_hadd_u_w(a.native(), a.native());
    s32 = (v4u32) __msa_hadd_u_d(s32.native(), s32.native());
    s32 = add(s32, move4_l<2>(s32));
    return extract<0>(s32);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32_t i_reduce_add(const uint16x16& a)
{
    uint16x16 ones = make_uint(1);
    uint16x16 ca = bit_xor(a, 0x8000);
    uint32x8 sum = _mm256_madd_epi16(ca.native(), ones.native());
    return reduce_add(sum) + 0x8000 * a.length;
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint32_t i_reduce_add(const uint16<32>& a)
{
    uint16<32> ones = make_uint(1);
    uint16<32> ca = bit_xor(a, 0x8000);
    uint32<16> sum = _mm512_madd_epi16(ca.native(), ones.native());
    return reduce_add(sum) + 0x8000 * a.length;
}
#endif

template<unsigned N>
SIMDPP_INL uint32_t i_reduce_add(const uint16<N>& a)
{
#if SIMDPP_USE_NULL
    uint32_t r = 0;
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r += a.vec(j).el(i);
        }
    }
    return r;
#elif SIMDPP_USE_AVX512BW
    uint32<16> sum = make_zero();
    uint16<32> ones = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint16<32> ca = bit_xor(a.vec(j), 0x8000);
        uint32<16> isum = _mm512_madd_epi16(ca.native(), ones.native());
        sum = add(sum, isum);
    }
    return reduce_add(sum) + 0x8000 * a.length;
#elif SIMDPP_USE_AVX2
    uint32x8 sum = make_zero();
    uint16x16 ones = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint16x16 ca = bit_xor(a.vec(j), 0x8000);
        uint32x8 isum = _mm256_madd_epi16(ca.native(), ones.native());
        sum = add(sum, isum);
    }
    return reduce_add(sum) + 0x8000 * a.length;
#elif SIMDPP_USE_XOP
    uint32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32x4 isum = _mm_haddq_epu16(a.vec(j).native());
        sum = add(sum, isum);
    }
    sum = add(sum, move4_l<2>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_SSE2
    uint32x4 sum = make_zero();
    uint16x8 ones = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint16x8 ca = bit_xor(a.vec(j), 0x8000);
        uint32x4 isum = _mm_madd_epi16(ca.native(), ones.native());
        sum = add(sum, isum);
    }
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum) + 0x8000 * a.length;
#elif SIMDPP_USE_NEON
    uint32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32x4 isum = vpaddlq_u16(a.vec(j).native());
        sum = add(sum, isum);
    }
    uint64x2 sum2 = vpaddlq_u32(sum.native());
    sum = sum2;
    uint32x2_t sum3 = vadd_u32(vget_low_u32(sum.native()),
                               vget_high_u32(sum.native()));
    return vget_lane_u32(sum3, 0);
#elif SIMDPP_USE_ALTIVEC
    int32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int16x8 ca = bit_xor(a.vec(j), 0x8000);
        sum = vec_sum4s(ca.native(), sum.native());
    }
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum) + 0x8000 * a.length;
#elif SIMDPP_USE_MSA
    uint32<4> r = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32<4> sum = __msa_hadd_u_w(a.vec(j).native(), a.vec(j).native());
        r = add(r, sum);
    }
    r = (uint64<2>) __msa_hadd_u_d(r.native(), r.native());
    r = add(r, move4_l<2>(r));
    return extract<0>(r);
#endif
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
int32_t i_reduce_add(const int16x8& a)
{
#if SIMDPP_USE_NULL
    int32_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#elif SIMDPP_USE_XOP
    int32x4 sum = _mm_haddq_epi16(a.native()); // 64-bit sums; the low 32 bits are in elements 0 and 2
    sum = add(sum, move4_l<2>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_SSE2
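    // _mm_madd_epi16 with a vector of ones produces pairwise 32-bit sums of
    // the signed 16-bit elements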
    int16x8 ones = make_uint(1);
    int32x4 sum = _mm_madd_epi16(a.native(), ones.native());
    return reduce_add(sum);
#elif SIMDPP_USE_NEON
    int32x4 a2 = vpaddlq_s16(a.native());
    int64x2 a3 = vpaddlq_s32(a2.native());
    a2 = a3;
    int32x2_t r = vadd_s32(vget_low_s32(a2.native()), vget_high_s32(a2.native()));
    return vget_lane_s32(r, 0);
#elif SIMDPP_USE_ALTIVEC
    int32x4 sum = make_zero();
    sum = vec_sum4s(a.native(), sum.native());
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_MSA
    int32<4> s32 = __msa_hadd_s_w(a.native(), a.native());
    s32 = (int64<2>) __msa_hadd_s_d(s32.native(), s32.native());
    s32 = add(s32, move4_l<2>(s32));
    return extract<0>(s32);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int32_t i_reduce_add(const int16x16& a)
{
    int16x16 ones = make_uint(1);
    int32x8 sum = _mm256_madd_epi16(a.native(), ones.native());
    return reduce_add(sum);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL int32_t i_reduce_add(const int16<32>& a)
{
    int16<32> ones = make_uint(1);
    int32<16> sum = _mm512_madd_epi16(a.native(), ones.native());
    return reduce_add(sum);
}
#endif

template<unsigned N>
SIMDPP_INL int32_t i_reduce_add(const int16<N>& a)
{
#if SIMDPP_USE_NULL
    int32_t r = 0;
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r += a.vec(j).el(i);
        }
    }
    return r;
#elif SIMDPP_USE_AVX512BW
    int32<16> sum = make_zero();
    int16<32> ones = make_int(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32<16> isum = _mm512_madd_epi16(a.vec(j).native(), ones.native());
        sum = add(sum, isum);
    }
    return reduce_add(sum);
#elif SIMDPP_USE_AVX2
    int32x8 sum = make_zero();
    int16x16 ones = make_int(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32x8 isum = _mm256_madd_epi16(a.vec(j).native(), ones.native());
        sum = add(sum, isum);
    }
    return reduce_add(sum);
#elif SIMDPP_USE_XOP
    int32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32x4 isum = _mm_haddq_epi16(a.vec(j).native());
        sum = add(sum, isum);
    }
    // _mm_haddq_epi16 computes 64-bit sums, so the 32-bit elements 1 and 3 may
    // be nonzero. Only the low 32 bits of each sum matter for the result.
    sum = add(sum, move4_l<2>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_SSE2
    int32x4 sum = make_zero();
    int16x8 ones = make_int(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32x4 isum = _mm_madd_epi16(a.vec(j).native(), ones.native());
        sum = add(sum, isum);
    }
    return reduce_add(sum);
#elif SIMDPP_USE_NEON
    int32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32x4 isum = vpaddlq_s16(a.vec(j).native());
        sum = add(sum, isum);
    }
    return reduce_add(sum);
#elif SIMDPP_USE_ALTIVEC
    int32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        sum = vec_sum4s(a.vec(j).native(), sum.native());
    }
    return reduce_add(sum);
#elif SIMDPP_USE_MSA
    int32<4> r = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32<4> sum = __msa_hadd_s_w(a.vec(j).native(),
                                      a.vec(j).native());
        r = add(r, sum);
    }
    r = (int64<2>) __msa_hadd_s_d(r.native(), r.native());
    r = add(r, move4_l<2>(r));
    return extract<0>(r);
#endif
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint32_t i_reduce_add(const uint32x4& a)
{
#if SIMDPP_USE_NULL
    uint32_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#elif SIMDPP_USE_MSA
    uint32x4 sum = a;
    sum = (uint64<2>) __msa_hadd_u_d(sum.native(), sum.native());
    sum = add(sum, move4_l<2>(sum));
    return extract<0>(sum);
#else
    uint32x4 sum = a;
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32_t i_reduce_add(const uint32x8& a)
{
    uint32x4 sum = add(detail::extract128<0>(a), detail::extract128<1>(a));
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint32_t i_reduce_add(const uint32<16>& a)
{
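    // add the two 256-bit halves and reduce the resulting 256-bit vector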
    return i_reduce_add(add(extract256<0>(a), extract256<1>(a)));
}
#endif

template<unsigned N>
SIMDPP_INL uint32_t i_reduce_add(const uint32<N>& a)
{
#if SIMDPP_USE_NULL
    uint32_t r = 0;
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r += a.vec(j).el(i);
        }
    }
    return r;
#else
    uint32v sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        sum = add(sum, a.vec(j));
    }
    return i_reduce_add(sum);
#endif
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint64_t i_reduce_add(const uint64x2& a)
{
#if SIMDPP_USE_NULL
    uint64_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#elif SIMDPP_USE_SSE2
    uint64x2 sum = a;
    sum = add(sum, move2_l<1>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_NEON
    uint64x1_t r = vadd_u64(vget_low_u64(a.native()),
                            vget_high_u64(a.native()));
    return vget_lane_u64(r, 0);
#elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    return extract<0>(a) + extract<1>(a);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint64_t i_reduce_add(const uint64x4& a)
{
    uint64x2 sum = add(detail::extract128<0>(a), detail::extract128<1>(a));
    sum = add(sum, move2_l<1>(sum));
    return extract<0>(sum);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint64_t i_reduce_add(const uint64<8>& a)
{
    return i_reduce_add(add(extract256<0>(a), extract256<1>(a)));
}
#endif

template<unsigned N>
SIMDPP_INL uint64_t i_reduce_add(const uint64<N>& a)
{
#if SIMDPP_USE_NULL || (SIMDPP_USE_ALTIVEC && !SIMDPP_USE_VSX_207)
    uint64_t r = 0;
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r += a.vec(j).el(i);
        }
    }
    return r;
#else
    uint64v sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        sum = add(sum, a.vec(j));
    }
    return i_reduce_add(sum);
#endif
}

// -----------------------------------------------------------------------------


} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif