/*  Copyright (C) 2011-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_F_TRUNC_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_F_TRUNC_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif
#include <cmath>
#include <simdpp/types.h>
#include <simdpp/core/bit_and.h>
#include <simdpp/core/bit_andnot.h>
#include <simdpp/core/blend.h>
#include <simdpp/core/cmp_gt.h>
#include <simdpp/core/cmp_le.h>
#include <simdpp/core/cmp_lt.h>
#include <simdpp/core/f_abs.h>
#include <simdpp/core/f_add.h>
#include <simdpp/core/i_sub.h>
#include <simdpp/core/make_float.h>
#include <simdpp/core/make_uint.h>
#include <simdpp/core/to_float32.h>
#include <simdpp/core/to_float64.h>
#include <simdpp/core/to_int32.h>
#include <simdpp/core/to_int64.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

static SIMDPP_INL
float32x4 i_trunc(const float32x4& a)
{
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    float32x4 r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = std::trunc(a.el(i));
    }
    return r;
#elif SIMDPP_USE_SSE4_1
    return _mm_round_ps(a.native(), 3); // 3 = _MM_FROUND_TO_ZERO (truncate)
#elif SIMDPP_USE_NEON64
    return vrndq_f32(a.native()); // FIXME: on ARMv8 A32 too
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON_FLT_SP || SIMDPP_USE_MSA
    // check that the value is not too large
    float32x4 af = abs(a);
    mask_float32x4 mask_range = cmp_lt(af, 8388608.0f);
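    // 8388608.0f is 2^23: every finite float of at least this magnitude is
    // already an integer (no fraction bits remain in the mantissa), and the
    // round trip through int32 below could overflow for such values.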

    // don't change the sign of negative zero
    mask_float32x4 mask_nonzero = cmp_gt(af, 0);
    mask_float32x4 mask = bit_and(mask_range, mask_nonzero);
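    // Excluding zeros matters because the int32 round trip would turn -0.0
    // into +0.0; the blend below returns the original -0.0 instead.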

    // truncate
    int32x4 ia = to_int32(a);
    float32x4 fa = to_float32(ia);

    // combine the results
    return blend(fa, a, mask); // takes care of NaNs
#elif SIMDPP_USE_ALTIVEC
    return vec_trunc(a.native());
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
float32x8 i_trunc(const float32x8& a)
{
    return _mm256_round_ps(a.native(), 3); // 3 = _MM_FROUND_TO_ZERO (truncate)
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
float32<16> i_trunc(const float32<16>& a)
{
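    // Note on the immediate: VRNDSCALE's imm8 encodes the rounding mode in
    // bits [1:0] and the number of fraction bits to keep in bits [7:4].
    // Truncating to an integer needs zero fraction bits, so only the
    // round-toward-zero and exception-suppression bits are set below.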
    return _mm512_roundscale_ps(a.native(), 0x0B); // _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
float64x2 i_trunc(const float64x2& a)
{
#if SIMDPP_USE_SSE4_1
    return _mm_round_pd(a.native(), 3); // 3 = _MM_FROUND_TO_ZERO (truncate)
#elif SIMDPP_USE_SSE2
    float64x2 af = abs(a);
    // select values small enough to still carry fraction bits; the
    // comparison is also false for NaN
    mask_float64x2 mask_range = cmp_le(af, 4503599627370495.0);
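    // 4503599627370495.0 is 2^52-1; values of greater magnitude are treated
    // as already integral and pass through the blend at the end unchanged,
    // as do NaNs.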
    // select values that truncate to zero
    float64x2 mask_zero = cmp_lt(af, 1.0).eval().unmask();

    /*  Emulate truncation for numbers not less than 1.0.
        This is implemented by clearing the mantissa in the source number,
        adding 1.0 and subtracting integer 1. The mantissa of the resulting
        number will effectively contain a bit mask defining which bits need to
        be cleared off the source number in order to truncate it.
    */
    float64x2 clearbits = bit_and(af, 0x7ff0000000000000); // clear the mantissa
    clearbits = add(clearbits, 1.0);
    clearbits = (float64x2) sub(uint64x2(clearbits), 1);
    clearbits = bit_andnot(clearbits, 0x7ff0000000000000); // leave only the mantissa
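    /*  Worked example (illustrative): af = 2.5 (bits 0x4004000000000000).
        Clearing the mantissa gives 2.0 (0x4000000000000000); adding 1.0
        gives 3.0 (0x4008000000000000); subtracting integer 1 gives
        0x4007ffffffffffff; clearing the exponent field leaves
        0x0007ffffffffffff, which is exactly the fraction-bit mask for any
        value in [2.0, 4.0). Clearing those bits in a yields trunc(2.5) = 2.0.
    */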

    float64x2 a2 = bit_andnot(a, clearbits);
    a2 = bit_andnot(a2, mask_zero);
    return blend(a2, a, mask_range);
#elif SIMDPP_USE_NEON64
    return vrndq_f64(a.native());
#elif SIMDPP_USE_VSX_206
    return vec_trunc(a.native());
#elif SIMDPP_USE_MSA
    // check that the value is not too large
    float64<2> af = abs(a);
    mask_float64<2> mask_range = cmp_le(af, 4503599627370495.0);

    // don't change the sign of negative zero
    mask_float64<2> mask_nonzero = cmp_gt(af, 0);
    mask_float64<2> mask = bit_and(mask_range, mask_nonzero);

    // truncate
    int64<2> ia = to_int64(a);
    float64<2> fa = to_float64(ia);

    // combine the results
    return blend(fa, a, mask); // takes care of NaNs
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
    float64x2 r;
    for (unsigned i = 0; i < r.length; ++i) {
        r.el(i) = std::trunc(a.el(i));
    }
    return r;
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
float64x4 i_trunc(const float64x4& a)
{
    return _mm256_round_pd(a.native(), 3); // 3 = _MM_FROUND_TO_ZERO (truncate)
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
float64<8> i_trunc(const float64<8>& a)
{
    return _mm512_roundscale_pd(a.native(), 0x0B); // _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC
}
#endif

// -----------------------------------------------------------------------------

template<class V> SIMDPP_INL
V i_trunc(const V& a)
{
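    // fallback for wider-than-native vectors: SIMDPP_VEC_ARRAY_IMPL1 applies
    // i_trunc to each native-width sub-vector of V and reassembles the result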
    SIMDPP_VEC_ARRAY_IMPL1(V, i_trunc, a);
}


} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif