1/* Copyright (C) 2016 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_F_REDUCE_ADD_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_F_REDUCE_ADD_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/types.h>
16#include <simdpp/core/extract.h>
17#include <simdpp/core/f_add.h>
18#include <simdpp/core/permute2.h>
19#include <simdpp/detail/extract128.h>
20#include <simdpp/detail/workarounds.h>
21
22namespace simdpp {
23namespace SIMDPP_ARCH_NAMESPACE {
24namespace detail {
25namespace insn {
26
27
// Horizontal add: returns the sum of all four single-precision lanes of @a a.
static SIMDPP_INL
float i_reduce_add(const float32x4& a)
{
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    // Scalar fallback: accumulate the elements one by one.
    float r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#elif SIMDPP_USE_SSE3
    // Two HADDPS passes fold 4 lanes -> 2 -> 1; lane 0 then holds the total.
    float32x4 b = a;
    b = _mm_hadd_ps(b.native(), b.native());
    b = _mm_hadd_ps(b.native(), b.native());
    return _mm_cvtss_f32(b.native());
#elif SIMDPP_USE_SSE2
    // No HADDPS before SSE3: MOVHLPS brings the high pair down, a vertical
    // add leaves two partial sums in lanes 0-1, and permute2<1,0> swaps the
    // elements of each pair so the final add puts the total in lane 0.
    float32x4 sum2 = _mm_movehl_ps(a.native(), a.native());
    float32x4 sum = add(a, sum2);
    sum = add(sum, permute2<1,0>(sum));
    return _mm_cvtss_f32(sum.native());
#elif SIMDPP_USE_NEON_FLT_SP
    // vpadd_f32 adds adjacent pairs: first pass folds 4 lanes -> 2,
    // second pass folds 2 -> 1.
    float32x2_t a2 = vpadd_f32(vget_low_f32(a.native()), vget_high_f32(a.native()));
    a2 = vpadd_f32(a2, a2);
    return vget_lane_f32(a2, 0);
#elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // Log-step shift-and-add reduction: after shifting by 1 and then by 2
    // element positions, lane 0 accumulates all four inputs.
    float32x4 b = a;
    b = add(b, move4_l<1>(b));
    b = add(b, move4_l<2>(b));
    return extract<0>(b);
#endif
}
58
#if SIMDPP_USE_AVX
// Horizontal add: returns the sum of all eight single-precision lanes of @a a.
static SIMDPP_INL
float i_reduce_add(const float32x8& a)
{
    __m128 ah = detail::extract128<1>(a).native();
    __m128 al = detail::extract128<0>(a).native();
    // Sum the two 128-bit halves vertically first: ADDPS is cheaper than a
    // third HADDPS (which decodes to two shuffles plus an add), and mirrors
    // the halve-then-reduce approach of the 512-bit overload below.
    al = _mm_add_ps(al, ah);
    // Two HADDPS passes fold the remaining 4 lanes -> 2 -> 1.
    al = _mm_hadd_ps(al, al);
    al = _mm_hadd_ps(al, al);
    return _mm_cvtss_f32(al);
}
#endif
71
#if SIMDPP_USE_AVX512F
// Horizontal add: returns the sum of all 16 single-precision lanes of @a a.
static SIMDPP_INL
float i_reduce_add(const float32<16>& a)
{
    // Fold the two 256-bit halves together, then let the 256-bit overload
    // finish the reduction.
    float32<8> lo = extract256<0>(a);
    float32<8> hi = extract256<1>(a);
    return i_reduce_add(add(lo, hi));
}
#endif
79
80template<unsigned N>
81SIMDPP_INL float i_reduce_add(const float32<N>& a)
82{
83 float32v r = a.vec(0);
84 for (unsigned i = 1; i < a.vec_length; ++i)
85 r = add(r, a.vec(i));
86 return i_reduce_add(r);
87}
88
89// -----------------------------------------------------------------------------
90
// Horizontal add: returns the sum of both double-precision lanes of @a a.
static SIMDPP_INL
double i_reduce_add(const float64x2& a)
{
#if SIMDPP_USE_SSE3
    // A single HADDPD folds the two lanes; lane 0 then holds the total.
    return _mm_cvtsd_f64(_mm_hadd_pd(a.native(), a.native()));
#elif SIMDPP_USE_SSE2
    // No HADDPD before SSE3: permute2<1,1> broadcasts the high lane, so the
    // add leaves the total in lane 0.
    float64x2 b = add(a, permute2<1,1>(a));
    return _mm_cvtsd_f64(b.native());
#elif SIMDPP_USE_NEON64
    // Pairwise add collapses both lanes into each lane of the result.
    float64x2_t a2 = vpaddq_f64(a.native(), a.native());
    return vgetq_lane_f64(a2, 0);
#elif SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
    // Same broadcast-and-add trick as the SSE2 branch, read back portably
    // via extract<0>.
    float64x2 b = add(a, permute2<1,1>(a));
    return extract<0>(b);
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
    // Scalar fallback: accumulate the elements one by one.
    double r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#endif
}
113
#if SIMDPP_USE_AVX
// Horizontal add: returns the sum of all four double-precision lanes of @a a.
static SIMDPP_INL
double i_reduce_add(const float64x4& a)
{
    __m128d ah = detail::extract128<1>(a).native();
    __m128d al = detail::extract128<0>(a).native();
    // Sum the two 128-bit halves vertically first: ADDPD is cheaper than
    // HADDPD (which decodes to two shuffles plus an add) and yields the same
    // two partial sums for the final horizontal fold.
    al = _mm_add_pd(al, ah);
    // One HADDPD folds the remaining two lanes; lane 0 holds the total.
    al = _mm_hadd_pd(al, al);
    return _mm_cvtsd_f64(al);
}
#endif
125
#if SIMDPP_USE_AVX512F
// Horizontal add: returns the sum of all eight double-precision lanes of @a a.
static SIMDPP_INL
double i_reduce_add(const float64<8>& a)
{
    // Fold the two 256-bit halves together, then let the 256-bit overload
    // finish the reduction.
    float64<4> lo = extract256<0>(a);
    float64<4> hi = extract256<1>(a);
    return i_reduce_add(add(lo, hi));
}
#endif
133
134template<unsigned N>
135SIMDPP_INL double i_reduce_add(const float64<N>& a)
136{
137 float64v r = a.vec(0);
138 for (unsigned i = 1; i < a.vec_length; ++i)
139 r = add(r, a.vec(i));
140 return i_reduce_add(r);
141}
142
143} // namespace insn
144} // namespace detail
145} // namespace SIMDPP_ARCH_NAMESPACE
146} // namespace simdpp
147
148#endif
149
150