/*  Copyright (C) 2016  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_F_REDUCE_MUL_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_F_REDUCE_MUL_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/extract.h>
#include <simdpp/core/f_mul.h>
#include <simdpp/core/move_l.h>
#include <simdpp/core/permute2.h>
#include <simdpp/detail/extract128.h>
#include <simdpp/detail/workarounds.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

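// Computes the product of all elements of a 4 x float32 vector.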
static SIMDPP_INL
float i_reduce_mul(const float32x4& a)
{
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    // Scalar fallback: multiply the elements one by one
    float r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r *= a.el(i);
    }
    return r;
#elif SIMDPP_USE_SSE2
    // Multiply the high pair of elements into the low pair, then combine the
    // two remaining partial products
    float32x4 b = _mm_movehl_ps(a.native(), a.native());
    b = mul(a, b);
    b = mul(b, permute2<1,0>(b));
    return _mm_cvtss_f32(b.native());
#elif SIMDPP_USE_NEON_FLT_SP
    // Multiply the low and high 64-bit halves, then the two remaining lanes
    float32x2_t a2 = vmul_f32(vget_low_f32(a.native()), vget_high_f32(a.native()));
    a2 = vmul_f32(a2, vext_f32(a2, a2, 1));
    return vget_lane_f32(a2, 0);
#elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // Combine the vector with itself shifted by one and by two elements so
    // that element 0 accumulates the full product
    float32x4 b = a;
    b = mul(b, move4_l<1>(b));
    b = mul(b, move4_l<2>(b));
    return extract<0>(b);
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
float i_reduce_mul(const float32x8& a)
{
    // Multiply the lower and upper 128-bit halves, then reduce the result
    float32x4 ah = detail::extract128<1>(a);
    float32x4 al = detail::extract128<0>(a);
    al = mul(al, ah);
    return i_reduce_mul(al);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
float i_reduce_mul(const float32<16>& a)
{
    // Multiply the lower and upper 256-bit halves, then reduce the result
    return i_reduce_mul(mul(extract256<0>(a), extract256<1>(a)));
}
#endif

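// Generic case: fold the constituent native vectors together, then reduce the
// single remaining vector.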
template<unsigned N>
SIMDPP_INL float i_reduce_mul(const float32<N>& a)
{
    float32v r = a.vec(0);
    for (unsigned i = 1; i < a.vec_length; ++i)
        r = mul(r, a.vec(i));
    return i_reduce_mul(r);
}

// -----------------------------------------------------------------------------

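// Computes the product of both elements of a 2 x float64 vector.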
static SIMDPP_INL
double i_reduce_mul(const float64x2& a)
{
#if SIMDPP_USE_SSE2
    // Multiply element 0 by element 1 and extract the low element
    float64x2 b = mul(a, permute2<1,0>(a));
    return _mm_cvtsd_f64(b.native());
#elif SIMDPP_USE_NEON64
    // Multiply the two scalar halves of the vector
    float64x1_t a2 = vmul_f64(vget_low_f64(a.native()), vget_high_f64(a.native()));
    return vget_lane_f64(a2, 0);
#elif SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
    // Multiply by the vector with element 1 broadcast, then extract element 0
    float64x2 b = mul(a, permute2<1,1>(a));
    return extract<0>(b);
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
    // Scalar fallback: multiply the elements one by one
    double r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r *= a.el(i);
    }
    return r;
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
double i_reduce_mul(const float64x4& a)
{
    // Multiply the lower and upper 128-bit halves, then reduce the result
    float64x2 ah = detail::extract128<1>(a);
    float64x2 al = detail::extract128<0>(a);
    al = mul(al, ah);
    return i_reduce_mul(al);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
double i_reduce_mul(const float64<8>& a)
{
    // Multiply the lower and upper 256-bit halves, then reduce the result
    return i_reduce_mul(mul(extract256<0>(a), extract256<1>(a)));
}
#endif

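// As above: fold the constituent native vectors together, then reduce the
// single remaining vector.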
template<unsigned N>
SIMDPP_INL double i_reduce_mul(const float64<N>& a)
{
    float64v r = a.vec(0);
    for (unsigned i = 1; i < a.vec_length; ++i)
        r = mul(r, a.vec(i));
    return i_reduce_mul(r);
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif