1 | /* Copyright (C) 2013-2014 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_F_RCP_RH_H |
9 | #define LIBSIMDPP_SIMDPP_DETAIL_INSN_F_RCP_RH_H |
10 | |
11 | #ifndef LIBSIMDPP_SIMD_H |
12 | #error "This file must be included through simd.h" |
13 | #endif |
14 | |
15 | #include <simdpp/types.h> |
16 | #include <simdpp/core/f_sub.h> |
17 | #include <simdpp/core/f_mul.h> |
18 | #include <simdpp/core/make_float.h> |
19 | #if SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC |
20 | #include <cmath> |
21 | #include <simdpp/detail/null/math.h> |
22 | #endif |
23 | #include <simdpp/detail/vector_array_macros.h> |
24 | |
25 | namespace simdpp { |
26 | namespace SIMDPP_ARCH_NAMESPACE { |
27 | namespace detail { |
28 | namespace insn { |
29 | |
30 | |
31 | static SIMDPP_INL |
32 | float32x4 i_rcp_rh(const float32x4& cx, const float32x4& a) |
33 | { |
34 | float32<4> x = cx; |
35 | #if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP |
36 | float32x4 r; |
37 | for (unsigned i = 0; i < a.length; i++) { |
38 | float ix = x.el(i); |
39 | float ia = a.el(i); |
40 | r.el(i) = ix*(2.0f - ix*ia); |
41 | } |
42 | return r; |
43 | #elif SIMDPP_USE_SSE2 || SIMDPP_USE_MSA |
44 | float32x4 r; |
45 | |
46 | r = mul(a, x); |
47 | r = sub(2.0, r); |
48 | x = mul(x, r); |
49 | |
50 | return x; |
51 | #elif SIMDPP_USE_NEON_FLT_SP |
52 | float32x4 r; |
53 | r = vrecpsq_f32(a.native(), x.native()); |
54 | x = mul(x, r); |
55 | |
56 | return x; |
57 | #elif SIMDPP_USE_ALTIVEC |
58 | float32x4 r, c2; |
59 | c2 = make_float(2.0f); |
60 | // -(x*a-c2) |
61 | r = vec_nmsub(x.native(), a.native(), c2.native()); |
62 | x = mul(x, r); |
63 | return x; |
64 | #endif |
65 | } |
66 | |
#if SIMDPP_USE_AVX
/*  Performs one refinement step on a reciprocal estimate (8 x float32).

    Each element of the result equals cx * (2 - cx * a), where cx is the
    current estimate and a holds the values whose reciprocal is being
    approximated. Only compiled when SIMDPP_USE_AVX is enabled.
*/
static SIMDPP_INL
float32x8 i_rcp_rh(const float32x8& cx, const float32x8& a)
{
    float32x8 prod = mul(a, cx);        // a * cx
    float32x8 corr = sub(2.0, prod);    // 2 - a * cx
    float32x8 refined = mul(cx, corr);  // cx * (2 - a * cx)
    return refined;
}
#endif
80 | |
#if SIMDPP_USE_AVX512F
/*  Performs one refinement step on a reciprocal estimate (16 x float32).

    Each element of the result equals cx * (2 - cx * a), where cx is the
    current estimate and a holds the values whose reciprocal is being
    approximated. Only compiled when SIMDPP_USE_AVX512F is enabled.
*/
static SIMDPP_INL
float32<16> i_rcp_rh(const float32<16>& cx, const float32<16>& a)
{
    float32<16> prod = mul(a, cx);        // a * cx
    float32<16> corr = sub(2.0, prod);    // 2 - a * cx
    float32<16> refined = mul(cx, corr);  // cx * (2 - a * cx)
    return refined;
}
#endif
94 | |
95 | template<unsigned N> SIMDPP_INL |
96 | float32<N> i_rcp_rh(const float32<N>& x, const float32<N>& a) |
97 | { |
98 | SIMDPP_VEC_ARRAY_IMPL2(float32<N>, i_rcp_rh, x, a); |
99 | } |
100 | |
101 | |
102 | } // namespace insn |
103 | } // namespace detail |
104 | } // namespace SIMDPP_ARCH_NAMESPACE |
105 | } // namespace simdpp |
106 | |
107 | #endif |
108 | |
109 | |