/*  Copyright (C) 2016  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
        http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_ADD_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_ADD_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/extract.h>
#include <simdpp/core/move_l.h>
#include <simdpp/core/make_uint.h>
#include <simdpp/detail/extract128.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {

// forward declarations
template<unsigned N, class E> SIMDPP_INL
int16_t reduce_add(const int8<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
uint16_t reduce_add(const uint8<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
int32_t reduce_add(const int16<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
uint32_t reduce_add(const uint16<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
int32_t reduce_add(const int32<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
uint32_t reduce_add(const uint32<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
int64_t reduce_add(const int64<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
uint64_t reduce_add(const uint64<N,E>& a);

namespace detail {
namespace insn {
static SIMDPP_INL
uint16_t i_reduce_add(const uint8x16& a)
{
#if SIMDPP_USE_NULL
    uint16_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#elif SIMDPP_USE_XOP
    uint16x8 sum = _mm_haddq_epu8(a.native());
    return extract<0>(sum) + extract<4>(sum);
#elif SIMDPP_USE_SSE2
    uint16x8 sum = _mm_sad_epu8(a.native(), _mm_setzero_si128());
    return extract<0>(sum) + extract<4>(sum);
#elif SIMDPP_USE_NEON
    uint16x8 a2 = vpaddlq_u8(a.native());
    uint32x4 a3 = vpaddlq_u16(a2.native());
    uint64x2 a4 = vpaddlq_u32(a3.native());
    a3 = a4;
    uint32x2_t r = vadd_u32(vget_low_u32(a3.native()), vget_high_u32(a3.native()));
    return vget_lane_u32(r, 0);
#elif SIMDPP_USE_ALTIVEC
    uint32x4 sum = make_zero();
    sum = vec_sum4s(a.native(), sum.native());
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_MSA
    uint16<8> s16 = __msa_hadd_u_h(a.native(), a.native());
    uint32<4> s32 = __msa_hadd_u_w(s16.native(), s16.native());
    s32 = (uint64<2>) __msa_hadd_u_d(s32.native(), s32.native());
    s32 = add(s32, move4_l<2>(s32));
    return extract<0>(s32);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint16_t i_reduce_add(const uint8x32& a)
{
    uint16x16 sum2 = _mm256_sad_epu8(a.native(), _mm256_setzero_si256()); // 64-bit sums land in 16-bit elements 0, 4, 8 and 12
    uint16x8 sum = add(detail::extract128<0>(sum2), detail::extract128<1>(sum2));
    return extract<0>(sum) + extract<4>(sum);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint16_t i_reduce_add(const uint8<64>& a)
{
    uint64<8> sum2 = _mm512_sad_epu8(a.native(), _mm512_setzero_si512());
    return reduce_add(sum2);
}
#endif

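// For vectors wider than one native register, partial sums are accumulated
// per register first and the horizontal reduction is performed only once at
// the end.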
template<unsigned N>
SIMDPP_INL uint16_t i_reduce_add(const uint8<N>& a)
{
#if SIMDPP_USE_NULL
    uint16_t r = 0;
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r += a.vec(j).el(i);
        }
    }
    return r;
#elif SIMDPP_USE_AVX512BW
    uint64<8> sum2 = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint64<8> sum = _mm512_sad_epu8(a.vec(j).native(), _mm512_setzero_si512());
        sum2 = add(sum2, sum);
    }
    return reduce_add(sum2);
#elif SIMDPP_USE_AVX2
    uint16x16 r = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint16x16 sum = _mm256_sad_epu8(a.vec(j).native(), _mm256_setzero_si256());
        r = add(r, sum);
    }
    uint16x8 rl = add(detail::extract128<0>(r), detail::extract128<1>(r));
    return extract<0>(rl) + extract<4>(rl);
#elif SIMDPP_USE_SSE2
    uint16x8 r = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
#if SIMDPP_USE_XOP
        uint16x8 sum = _mm_haddq_epu8(a.vec(j).native());
#else
        uint16x8 sum = _mm_sad_epu8(a.vec(j).native(), _mm_setzero_si128());
#endif
        r = add(r, sum);
    }
    return extract<0>(r) + extract<4>(r);
#elif SIMDPP_USE_NEON
    uint16x8 r = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint16x8 sum = vpaddlq_u8(a.vec(j).native());
        r = add(r, sum);
    }
    uint32x4 r2 = vpaddlq_u16(r.native());
    uint64x2 r3 = vpaddlq_u32(r2.native());
    r2 = r3;
    uint32x2_t r4 = vadd_u32(vget_low_u32(r2.native()),
                             vget_high_u32(r2.native()));
    return vget_lane_u32(r4, 0);
#elif SIMDPP_USE_ALTIVEC
    uint32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        sum = vec_sum4s(a.vec(j).native(), sum.native());
    }
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_MSA
    uint16<8> r = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint16x8 sum = __msa_hadd_u_h(a.vec(j).native(), a.vec(j).native());
        r = add(r, sum);
    }
    uint32<4> s32 = __msa_hadd_u_w(r.native(), r.native());
    s32 = (v4u32) __msa_hadd_u_d(s32.native(), s32.native());
    s32 = add(s32, move4_l<2>(s32));
    return extract<0>(s32);
#endif
}

// -----------------------------------------------------------------------------

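// Signed 8-bit elements: where no signed horizontal add is available, the
// input is biased into the unsigned range (xor with 0x80), reduced with the
// unsigned helpers above, and the accumulated bias (0x80 per element) is
// subtracted from the result.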
static SIMDPP_INL
int16_t i_reduce_add(const int8x16& a)
{
#if SIMDPP_USE_NULL
    int16_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#elif SIMDPP_USE_XOP
    uint16x8 sum = _mm_haddq_epi8(a.native());
    return extract<0>(sum) + extract<4>(sum);
#elif SIMDPP_USE_SSE2
    return i_reduce_add(uint8x16(bit_xor(a, 0x80))) - a.length*0x80;
#elif SIMDPP_USE_NEON
    int16x8 a2 = vpaddlq_s8(a.native());
    int32x4 a3 = vpaddlq_s16(a2.native());
    int64x2 a4 = vpaddlq_s32(a3.native());
    a3 = a4;
    int32x2_t r = vadd_s32(vget_low_s32(a3.native()),
                           vget_high_s32(a3.native()));
    return vget_lane_s32(r, 0);
#elif SIMDPP_USE_ALTIVEC
    int32x4 sum = make_zero();
    sum = vec_sum4s(a.native(), sum.native());
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_MSA
    int16<8> s16 = __msa_hadd_s_h(a.native(), a.native());
    int32<4> s32 = __msa_hadd_s_w(s16.native(), s16.native());
    s32 = (v4i32) __msa_hadd_s_d(s32.native(), s32.native());
    s32 = add(s32, move4_l<2>(s32));
    return extract<0>(s32);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int16_t i_reduce_add(const int8x32& a)
{
    return i_reduce_add(uint8x32(bit_xor(a, 0x80))) - a.length*0x80;
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL int16_t i_reduce_add(const int8<64>& a)
{
    return i_reduce_add(uint8<64>(bit_xor(a, 0x80))) - a.length*0x80;
}
#endif

template<unsigned N>
SIMDPP_INL int16_t i_reduce_add(const int8<N>& a)
{
#if SIMDPP_USE_NULL
    int16_t r = 0;
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r += a.vec(j).el(i);
        }
    }
    return r;
#elif SIMDPP_USE_AVX512BW || SIMDPP_USE_AVX2
    return i_reduce_add(uint8<N>(bit_xor(a, 0x80))) - a.length*0x80;
#elif SIMDPP_USE_XOP
    int16x8 r = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int16x8 sum = _mm_haddq_epi8(a.vec(j).native());
        r = add(r, sum);
    }
    return extract<0>(r) + extract<4>(r);
#elif SIMDPP_USE_SSE2
    return i_reduce_add(uint8<N>(bit_xor(a, 0x80))) - a.length*0x80;
#elif SIMDPP_USE_NEON
    int16x8 r = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int16x8 sum = vpaddlq_s8(a.vec(j).native());
        r = add(r, sum);
    }
    int32x4 r2 = vpaddlq_s16(r.native());
    int64x2 r3 = vpaddlq_s32(r2.native());
    r2 = r3;
    int32x2_t r4 = vadd_s32(vget_low_s32(r2.native()),
                            vget_high_s32(r2.native()));
    return vget_lane_s32(r4, 0);
#elif SIMDPP_USE_ALTIVEC
    int32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        sum = vec_sum4s(a.vec(j).native(), sum.native());
    }
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_MSA
    int16<8> r = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int16x8 sum = __msa_hadd_s_h(a.vec(j).native(), a.vec(j).native());
        r = add(r, sum);
    }
    int32<4> s32 = __msa_hadd_s_w(r.native(), r.native());
    s32 = (v4i32) __msa_hadd_s_d(s32.native(), s32.native());
    s32 = add(s32, move4_l<2>(s32));
    return extract<0>(s32);
#endif
}

// -----------------------------------------------------------------------------

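// Unsigned 16-bit elements: the SSE2/AVX paths bias each element by 0x8000
// so that the signed multiply-add (_mm_madd_epi16 and its wider variants)
// with a vector of ones acts as a widening horizontal add; the bias
// (0x8000 per element) is added back to the scalar result.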
static SIMDPP_INL
uint32_t i_reduce_add(const uint16x8& a)
{
#if SIMDPP_USE_NULL
    uint32_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#elif SIMDPP_USE_XOP
    uint32x4 sum = _mm_haddq_epu16(a.native()); // sum in the 0 and 2 elements
    sum = add(sum, move4_l<2>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_SSE2
    uint16x8 ones = make_uint(1);
    uint16x8 ca = bit_xor(a, 0x8000);
    uint32x4 sum = _mm_madd_epi16(ca.native(), ones.native());
    // phadd is a slower option on Intel processors
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum) + 0x8000 * a.length;
#elif SIMDPP_USE_NEON
    uint32x4 a2 = vpaddlq_u16(a.native());
    uint64x2 a3 = vpaddlq_u32(a2.native());
    a2 = a3;
    uint32x2_t r = vadd_u32(vget_low_u32(a2.native()),
                            vget_high_u32(a2.native()));
    return vget_lane_u32(r, 0);
#elif SIMDPP_USE_ALTIVEC
    int32x4 sum = make_zero();
    int16x8 ca = bit_xor(a, 0x8000);
    sum = vec_sum4s(ca.native(), sum.native());
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum) + 0x8000 * a.length;
#elif SIMDPP_USE_MSA
    uint32<4> s32 = __msa_hadd_u_w(a.native(), a.native());
    s32 = (v4u32) __msa_hadd_u_d(s32.native(), s32.native());
    s32 = add(s32, move4_l<2>(s32));
    return extract<0>(s32);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32_t i_reduce_add(const uint16x16& a)
{
    uint16x16 ones = make_uint(1);
    uint16x16 ca = bit_xor(a, 0x8000);
    uint32x8 sum = _mm256_madd_epi16(ca.native(), ones.native());
    return reduce_add(sum) + 0x8000 * a.length;
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint32_t i_reduce_add(const uint16<32>& a)
{
    uint16<32> ones = make_uint(1);
    uint16<32> ca = bit_xor(a, 0x8000);
    uint32<16> sum = _mm512_madd_epi16(ca.native(), ones.native());
    return reduce_add(sum) + 0x8000 * a.length;
}
#endif

template<unsigned N>
SIMDPP_INL uint32_t i_reduce_add(const uint16<N>& a)
{
#if SIMDPP_USE_NULL
    uint32_t r = 0;
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r += a.vec(j).el(i);
        }
    }
    return r;
#elif SIMDPP_USE_AVX512BW
    uint32<16> sum = make_zero();
    uint16<32> ones = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint16<32> ca = bit_xor(a.vec(j), 0x8000);
        uint32<16> isum = _mm512_madd_epi16(ca.native(), ones.native());
        sum = add(sum, isum);
    }
    return reduce_add(sum) + 0x8000 * a.length;
#elif SIMDPP_USE_AVX2
    uint32x8 sum = make_zero();
    uint16x16 ones = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint16x16 ca = bit_xor(a.vec(j), 0x8000);
        uint32x8 isum = _mm256_madd_epi16(ca.native(), ones.native());
        sum = add(sum, isum);
    }
    return reduce_add(sum) + 0x8000 * a.length;
#elif SIMDPP_USE_XOP
    uint32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32x4 isum = _mm_haddq_epu16(a.vec(j).native());
        sum = add(sum, isum);
    }
    sum = add(sum, move4_l<2>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_SSE2
    uint32x4 sum = make_zero();
    uint16x8 ones = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint16x8 ca = bit_xor(a.vec(j), 0x8000);
        uint32x4 isum = _mm_madd_epi16(ca.native(), ones.native());
        sum = add(sum, isum);
    }
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum) + 0x8000 * a.length;
#elif SIMDPP_USE_NEON
    uint32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32x4 isum = vpaddlq_u16(a.vec(j).native());
        sum = add(sum, isum);
    }
    uint64x2 sum2 = vpaddlq_u32(sum.native());
    sum = sum2;
    uint32x2_t sum3 = vadd_u32(vget_low_u32(sum.native()),
                               vget_high_u32(sum.native()));
    return vget_lane_u32(sum3, 0);
#elif SIMDPP_USE_ALTIVEC
    int32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int16x8 ca = bit_xor(a.vec(j), 0x8000);
        sum = vec_sum4s(ca.native(), sum.native());
    }
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum) + 0x8000 * a.length;
#elif SIMDPP_USE_MSA
    uint32<4> r = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32<4> sum = __msa_hadd_u_w(a.vec(j).native(), a.vec(j).native());
        r = add(r, sum);
    }
    r = (uint64<2>) __msa_hadd_u_d(r.native(), r.native());
    r = add(r, move4_l<2>(r));
    return extract<0>(r);
#endif
}

// -----------------------------------------------------------------------------

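// Signed 16-bit elements: the multiply-add with a vector of ones performs
// the widening horizontal add directly, so no bias is needed here.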
static SIMDPP_INL
int32_t i_reduce_add(const int16x8& a)
{
#if SIMDPP_USE_NULL
    int32_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#elif SIMDPP_USE_XOP
    int32x4 sum = _mm_haddq_epi16(a.native()); // sum in the 0 and 2 elements
    sum = add(sum, move4_l<2>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_SSE2
    int16x8 ones = make_uint(1);
    int32x4 sum = _mm_madd_epi16(a.native(), ones.native());
    return reduce_add(sum);
#elif SIMDPP_USE_NEON
    int32x4 a2 = vpaddlq_s16(a.native());
    int64x2 a3 = vpaddlq_s32(a2.native());
    a2 = a3;
    int32x2_t r = vadd_s32(vget_low_s32(a2.native()), vget_high_s32(a2.native()));
    return vget_lane_s32(r, 0);
#elif SIMDPP_USE_ALTIVEC
    int32x4 sum = make_zero();
    sum = vec_sum4s(a.native(), sum.native());
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_MSA
    int32<4> s32 = __msa_hadd_s_w(a.native(), a.native());
    s32 = (int64<2>) __msa_hadd_s_d(s32.native(), s32.native());
    s32 = add(s32, move4_l<2>(s32));
    return extract<0>(s32);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int32_t i_reduce_add(const int16x16& a)
{
    int16x16 ones = make_uint(1);
    int32x8 sum = _mm256_madd_epi16(a.native(), ones.native());
    return reduce_add(sum);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL int32_t i_reduce_add(const int16<32>& a)
{
    int16<32> ones = make_uint(1);
    int32<16> sum = _mm512_madd_epi16(a.native(), ones.native());
    return reduce_add(sum);
}
#endif

template<unsigned N>
SIMDPP_INL int32_t i_reduce_add(const int16<N>& a)
{
#if SIMDPP_USE_NULL
    int32_t r = 0;
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r += a.vec(j).el(i);
        }
    }
    return r;
#elif SIMDPP_USE_AVX512BW
    int32<16> sum = make_zero();
    int16<32> ones = make_int(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32<16> isum = _mm512_madd_epi16(a.vec(j).native(), ones.native());
        sum = add(sum, isum);
    }
    return reduce_add(sum);
#elif SIMDPP_USE_AVX2
    int32x8 sum = make_zero();
    int16x16 ones = make_int(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32x8 isum = _mm256_madd_epi16(a.vec(j).native(), ones.native());
        sum = add(sum, isum);
    }
    return reduce_add(sum);
#elif SIMDPP_USE_XOP
    int32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32x4 isum = _mm_haddq_epi16(a.vec(j).native());
        sum = add(sum, isum);
    }
    // _mm_haddq_epi16 computes 64-bit results.
    // 1 and 3 32-bit elements may be nonzero
    sum = add(sum, move4_l<2>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_SSE2
    int32x4 sum = make_zero();
    int16x8 ones = make_int(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32x4 isum = _mm_madd_epi16(a.vec(j).native(), ones.native());
        sum = add(sum, isum);
    }
    return reduce_add(sum);
#elif SIMDPP_USE_NEON
    int32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32x4 isum = vpaddlq_s16(a.vec(j).native());
        sum = add(sum, isum);
    }
    return reduce_add(sum);
#elif SIMDPP_USE_ALTIVEC
    int32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        sum = vec_sum4s(a.vec(j).native(), sum.native());
    }
    return reduce_add(sum);
#elif SIMDPP_USE_MSA
    int32<4> r = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32<4> sum = __msa_hadd_s_w(a.vec(j).native(),
                                      a.vec(j).native());
        r = add(r, sum);
    }
    r = (int64<2>) __msa_hadd_s_d(r.native(), r.native());
    r = add(r, move4_l<2>(r));
    return extract<0>(r);
#endif
}

// -----------------------------------------------------------------------------

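// 32-bit elements already match the width of the result, so on most targets
// the reduction is a shift-and-add within the register (move4_l) followed by
// an extract of element 0.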
static SIMDPP_INL
uint32_t i_reduce_add(const uint32x4& a)
{
#if SIMDPP_USE_NULL
    uint32_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#elif SIMDPP_USE_MSA
    uint32x4 sum = a;
    sum = (uint64<2>) __msa_hadd_u_d(sum.native(), sum.native());
    sum = add(sum, move4_l<2>(sum));
    return extract<0>(sum);
#else
    uint32x4 sum = a;
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32_t i_reduce_add(const uint32x8& a)
{
    uint32x4 sum = add(detail::extract128<0>(a), detail::extract128<1>(a));
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint32_t i_reduce_add(const uint32<16>& a)
{
    return i_reduce_add(add(extract256<0>(a), extract256<1>(a)));
}
#endif

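// Multi-register 32-bit vectors are first summed lane-wise into one native
// register and then reduced with the helpers above.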
template<unsigned N>
SIMDPP_INL uint32_t i_reduce_add(const uint32<N>& a)
{
#if SIMDPP_USE_NULL
    uint32_t r = 0;
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r += a.vec(j).el(i);
        }
    }
    return r;
#else
    uint32v sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        sum = add(sum, a.vec(j));
    }
    return i_reduce_add(sum);
#endif
}

// -----------------------------------------------------------------------------

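// 64-bit reduction: the two lanes are combined via a shifted copy (SSE2), a
// pairwise add of the low and high halves (NEON), or plain element
// extraction (AltiVec, MSA). Pre-VSX 2.07 AltiVec lacks a 64-bit vector add,
// so the multi-register case falls back to a scalar loop there.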
static SIMDPP_INL
uint64_t i_reduce_add(const uint64x2& a)
{
#if SIMDPP_USE_NULL
    uint64_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#elif SIMDPP_USE_SSE2
    uint64x2 sum = a;
    sum = add(sum, move2_l<1>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_NEON
    uint64x1_t r = vadd_u64(vget_low_u64(a.native()),
                            vget_high_u64(a.native()));
    return vget_lane_u64(r, 0);
#elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    return extract<0>(a) + extract<1>(a);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint64_t i_reduce_add(const uint64x4& a)
{
    uint64x2 sum = add(detail::extract128<0>(a), detail::extract128<1>(a));
    sum = add(sum, move2_l<1>(sum));
    return extract<0>(sum);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint64_t i_reduce_add(const uint64<8>& a)
{
    return i_reduce_add(add(extract256<0>(a), extract256<1>(a)));
}
#endif

template<unsigned N>
SIMDPP_INL uint64_t i_reduce_add(const uint64<N>& a)
{
#if SIMDPP_USE_NULL || (SIMDPP_USE_ALTIVEC && !SIMDPP_USE_VSX_207)
    uint64_t r = 0;
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r += a.vec(j).el(i);
        }
    }
    return r;
#else
    uint64v sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        sum = add(sum, a.vec(j));
    }
    return i_reduce_add(sum);
#endif
}

// -----------------------------------------------------------------------------


} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif
