1 | /* Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_F_FLOOR_H |
9 | #define LIBSIMDPP_SIMDPP_DETAIL_INSN_F_FLOOR_H |
10 | |
11 | #ifndef LIBSIMDPP_SIMD_H |
12 | #error "This file must be included through simd.h" |
13 | #endif |
14 | |
15 | #include <cmath> |
16 | #include <simdpp/types.h> |
17 | #include <simdpp/core/f_abs.h> |
18 | #include <simdpp/core/bit_or.h> |
19 | #include <simdpp/core/blend.h> |
20 | #include <simdpp/core/cmp_eq.h> |
21 | #include <simdpp/core/cmp_gt.h> |
22 | #include <simdpp/core/i_shift_r.h> |
23 | #include <simdpp/core/i_sub.h> |
24 | #include <simdpp/core/to_float32.h> |
25 | #include <simdpp/core/to_int32.h> |
26 | #include <simdpp/detail/vector_array_macros.h> |
27 | |
28 | namespace simdpp { |
29 | namespace SIMDPP_ARCH_NAMESPACE { |
30 | namespace detail { |
31 | namespace insn { |
32 | |
33 | |
34 | static SIMDPP_INL |
35 | float32x4 i_floor(const float32x4& a) |
36 | { |
37 | #if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP |
38 | float32x4 r; |
39 | for (unsigned i = 0; i < a.length; i++) { |
40 | r.el(i) = std::floor(a.el(i)); |
41 | } |
42 | return r; |
43 | #elif SIMDPP_USE_SSE4_1 |
44 | return _mm_floor_ps(a.native()); |
45 | #elif SIMDPP_USE_NEON64 |
46 | return vrndmq_f32(a.native()); |
47 | #elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON_FLT_SP || SIMDPP_USE_MSA |
48 | //check if the value is not too large, or is zero |
49 | float32x4 ba = abs(a); |
50 | mask_float32x4 mask_range = cmp_le(ba, 8388607.0f); |
51 | mask_float32x4 mask_nonzero = cmp_gt(ba, 0); |
52 | mask_float32x4 mask = bit_and(mask_range, mask_nonzero); // takes care of nans and zeros |
53 | |
54 | //calculate the i_floor using trunc |
55 | int32x4 s = shift_r((uint32x4)a, 31); //=1 if a<0 |
56 | float32x4 at = (float32x4) sub((int32x4)a, s); //=nextafter towards +inf, if a<0 |
57 | int32x4 ia = to_int32(at); |
58 | ia = sub(ia, s); |
59 | float32x4 fa = to_float32(ia); |
60 | |
61 | //combine the results |
62 | return blend(fa, a, mask); |
63 | #elif SIMDPP_USE_ALTIVEC |
64 | return vec_floor(a.native()); |
65 | #endif |
66 | } |
67 | |
#if SIMDPP_USE_AVX
static SIMDPP_INL
float32x8 i_floor(const float32x8& a)
{
    // AVX has a dedicated rounding instruction; round 8 floats towards -inf.
    return _mm256_floor_ps(a.native());
}
#endif
75 | |
#if SIMDPP_USE_AVX512F
static SIMDPP_INL
float32<16> i_floor(const float32<16>& a)
{
    // AVX-512F: round 16 floats towards -inf in a single instruction.
    return _mm512_floor_ps(a.native());
}
#endif
83 | |
84 | // ----------------------------------------------------------------------------- |
85 | |
86 | static SIMDPP_INL |
87 | float64x2 i_floor(const float64x2& a) |
88 | { |
89 | #if SIMDPP_USE_SSE4_1 |
90 | return _mm_floor_pd(a.native()); |
91 | #elif SIMDPP_USE_SSE2 || SIMDPP_USE_MSA |
92 | float64x2 af = abs(a); |
93 | // check if the value is not too large or is a nan |
94 | mask_float64x2 mask_range = cmp_le(af, 4503599627370495.0); |
95 | // check if truncate to zero or minus one |
96 | mask_float64x2 mask_1to1 = cmp_lt(af, 1.0); |
97 | |
98 | /* Emulate truncation for numbers not less than 1.0. |
99 | This is implemented by clearing the mantissa in the source number, |
100 | adding 1.0 and subtracting integer 1. The mantissa of the resulting |
101 | number will effectively contain a bit mask defining which bits need to |
102 | be cleared off the source number in order to truncate it. |
103 | */ |
104 | float64x2 clearbits = bit_and(af, 0x7ff0000000000000); // clear the mantissa |
105 | clearbits = add(clearbits, 1.0); |
106 | clearbits = (float64x2) sub(uint64x2(clearbits), 1); |
107 | clearbits = bit_andnot(clearbits, 0xfff0000000000000); // leave only the mantissa |
108 | |
109 | float64x2 a2 = bit_andnot(a, clearbits); // truncate |
110 | |
111 | // check if we need to subtract one (truncated bits when negative) |
112 | mask_float64x2 mask_neg = cmp_lt(a, 0.0); |
113 | mask_float64x2 mask_sub1 = cmp_gt(bit_and(a, clearbits), 0.0); |
114 | mask_sub1 = bit_and(mask_sub1, mask_neg); |
115 | |
116 | // one special case is when 'a' is in the range of (-1.0, 0.0) in which |
117 | // a & clearbits may still yield to zero. Thus this additional check |
118 | mask_sub1 = bit_or(mask_sub1, bit_and(mask_1to1, mask_neg)); |
119 | float64x2 sub1 = make_float(-1.0); |
120 | sub1 = bit_and(sub1, mask_sub1); |
121 | |
122 | a2 = bit_andnot(a, mask_1to1); |
123 | a2 = sub(a2, sub1); |
124 | |
125 | return blend(a2, a, mask_range); |
126 | #elif SIMDPP_USE_NEON64 |
127 | return vrndnq_f64(a.native()); |
128 | #elif SIMDPP_USE_VSX_206 |
129 | return vec_floor(a.native()); |
130 | #elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC |
131 | float64x2 r; |
132 | for (unsigned i = 0; i < r.length; ++i) { |
133 | r.el(i) = std::floor(a.el(i)); |
134 | } |
135 | return r; |
136 | #endif |
137 | } |
138 | |
#if SIMDPP_USE_AVX
static SIMDPP_INL
float64x4 i_floor(const float64x4& a)
{
    // AVX: round 4 doubles towards -inf in a single instruction.
    return _mm256_floor_pd(a.native());
}
#endif
146 | |
#if SIMDPP_USE_AVX512F
static SIMDPP_INL
float64<8> i_floor(const float64<8>& a)
{
    // AVX-512F: round 8 doubles towards -inf in a single instruction.
    return _mm512_floor_pd(a.native());
}
#endif
154 | |
// Generic fallback for wider vector types: applies i_floor to each
// native-width sub-vector of 'a' and assembles the results.
template<class V> SIMDPP_INL
V i_floor(const V& a)
{
    SIMDPP_VEC_ARRAY_IMPL1(V, i_floor, a);
}
160 | |
161 | } // namespace insn |
162 | } // namespace detail |
163 | } // namespace SIMDPP_ARCH_NAMESPACE |
164 | } // namespace simdpp |
165 | |
166 | #endif |
167 | |
168 | |