1 | /* Copyright (C) 2017 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_CONV_FLOAT_TO_INT32_H |
9 | #define LIBSIMDPP_SIMDPP_DETAIL_INSN_CONV_FLOAT_TO_INT32_H |
10 | |
11 | #ifndef LIBSIMDPP_SIMD_H |
12 | #error "This file must be included through simd.h" |
13 | #endif |
14 | |
15 | #include <simdpp/types.h> |
16 | #include <simdpp/core/i_add.h> |
17 | #include <simdpp/core/f_sub.h> |
18 | #include <simdpp/core/cmp_ge.h> |
19 | #include <simdpp/detail/vector_array_conv_macros.h> |
20 | |
21 | namespace simdpp { |
22 | namespace SIMDPP_ARCH_NAMESPACE { |
23 | namespace detail { |
24 | namespace insn { |
25 | |
26 | // ----------------------------------------------------------------------------- |
27 | |
// Converts 4 single-precision floats to signed 32-bit integers, truncating
// toward zero. Out-of-range behavior varies by architecture (e.g. SSE2
// produces 0x80000000, AltiVec saturates).
static SIMDPP_INL
int32<4> i_to_int32(const float32<4>& a)
{
#if SIMDPP_USE_NULL
    // Scalar reference implementation: per-element C++ float->int32_t
    // conversion (truncation toward zero).
    int32x4 r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = int32_t(a.el(i));
    }
    return r;
#elif SIMDPP_USE_SSE2
    return _mm_cvttps_epi32(a.native()); // CVTTPS2DQ: truncating convert
#elif SIMDPP_USE_NEON && !SIMDPP_USE_NEON_FLT_SP
    // NEON without single-precision FP vector support: bounce through memory
    // and convert each lane on the scalar unit.
    detail::mem_block<float32x4> mf(a);
    detail::mem_block<int32x4> mi;
    mi[0] = int(mf[0]);
    mi[1] = int(mf[1]);
    mi[2] = int(mf[2]);
    mi[3] = int(mf[3]);
    return mi;
#elif SIMDPP_USE_NEON_FLT_SP
    return vcvtq_s32_f32(a.native()); // rounds toward zero
#elif SIMDPP_USE_MSA
    return __msa_ftrunc_s_w(a.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cts(a.native(), 0); // scale factor 0, i.e. plain truncation
#endif
}
55 | |
#if SIMDPP_USE_AVX
// 8-wide truncating float->int32 conversion.
static SIMDPP_INL
int32<8> i_to_int32(const float32<8>& a)
{
#if SIMDPP_USE_AVX2
    return _mm256_cvttps_epi32(a.native());
#else
    // AVX1 provides the 256-bit truncating conversion (VCVTTPS2DQ), but
    // 256-bit integer operations are not available, so the result is split
    // into two 128-bit halves and recombined into the pair-of-halves
    // representation used on this arch.
    __m256i r = _mm256_cvttps_epi32(a.native());
    uint32<4> r1, r2;
    r1 = _mm256_castsi256_si128(r);
    r2 = _mm256_extractf128_si256(r, 1);
    return combine(r1, r2);
#endif
}
#endif
71 | |
#if SIMDPP_USE_AVX512F
// 16-wide truncating float->int32 conversion.
static SIMDPP_INL
int32<16> i_to_int32(const float32<16>& a)
{
    // Single VCVTTPS2DQ over the whole 512-bit register converts all
    // 16 lanes at once, truncating toward zero.
    __m512i truncated = _mm512_cvttps_epi32(a.native());
    return truncated;
}
#endif
79 | |
// Generic N-wide overload: splits the input into the fixed-width vectors
// handled by the overloads above, converts each, and assembles the result.
template<unsigned N> SIMDPP_INL
int32<N> i_to_int32(const float32<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_INSERT(int32<N>, i_to_int32, a)
}
85 | |
86 | // ----------------------------------------------------------------------------- |
87 | |
// Converts 4 single-precision floats to unsigned 32-bit integers, truncating
// toward zero. Out-of-range behavior varies by architecture; see the
// fallback path below for its explicit limits.
static SIMDPP_INL
uint32<4> i_to_uint32(const float32<4>& a)
{
#if SIMDPP_USE_NULL
    // Scalar reference implementation: per-element C++ float->uint32_t
    // conversion.
    uint32x4 r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = uint32_t(a.el(i));
    }
    return r;
#elif SIMDPP_USE_AVX512VL
    return _mm_cvttps_epu32(a.native()); // VCVTTPS2UDQ, 128-bit form
#elif SIMDPP_USE_AVX512F
    // AVX512F only has the 512-bit form of VCVTTPS2UDQ: widen the vector,
    // convert, and keep the low 128 bits. The undefined upper lanes are
    // converted too but discarded.
    __m512 a512 = _mm512_castps128_ps512(a.native());
    return _mm512_castsi512_si128(_mm512_cvttps_epu32(a512));
#elif SIMDPP_USE_NEON && !SIMDPP_USE_NEON_FLT_SP
    // NEON without single-precision FP vector support: bounce through memory
    // and convert each lane on the scalar unit.
    detail::mem_block<float32x4> mf(a);
    detail::mem_block<uint32x4> mi;
    mi[0] = unsigned(mf[0]);
    mi[1] = unsigned(mf[1]);
    mi[2] = unsigned(mf[2]);
    mi[3] = unsigned(mf[3]);
    return mi;
#elif SIMDPP_USE_NEON_FLT_SP
    return vcvtq_u32_f32(a.native());
#elif SIMDPP_USE_MSA
    return __msa_ftrunc_u_w(a.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_ctu(a.native(), 0);
#else
    // Values smaller than 0x80000000 fit into int32, so i_to_int32 (which has
    // a native instruction on all supported architectures) converts them
    // directly. Values in 0x80000000..0xffffffff are biased into
    // -0x80000000..-1 by subtracting 0x100000000 (2^32); after i_to_int32 the
    // resulting two's-complement bit pattern equals the desired unsigned
    // value, so no unbiasing is required. No attempt is made to produce a
    // reliable overflow value for inputs outside the range 0 .. 0xffffffff.
    mask_float32<4> is_large = cmp_ge(a, 0x80000000);
    return uint32<4>( i_to_int32(sub(a, bit_and(is_large, splat<float32<4>>(0x100000000)))) );
#endif
}
127 | |
#if SIMDPP_USE_AVX
// 8-wide truncating float->uint32 conversion.
static SIMDPP_INL
uint32<8> i_to_uint32(const float32<8>& a)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cvttps_epu32(a.native()); // VCVTTPS2UDQ, 256-bit form
#elif SIMDPP_USE_AVX512F
    // Only the 512-bit form of VCVTTPS2UDQ exists at plain AVX512F: widen,
    // convert, keep the low 256 bits.
    __m512 a512 = _mm512_castps256_ps512(a.native());
    return _mm512_castsi512_si256(_mm512_cvttps_epu32(a512));
#else
    // Bias values >= 0x80000000 into int32 range (subtract 2^32) before the
    // signed conversion; the resulting bit pattern equals the unsigned value.
    mask_float32<8> is_large = cmp_ge(a, 0x80000000);
    return uint32<8>( i_to_int32(sub(a, bit_and(is_large, splat<float32<8>>(0x100000000)))) );
#endif
}
#endif
143 | |
#if SIMDPP_USE_AVX512F
// 16-wide truncating float->uint32 conversion.
static SIMDPP_INL
uint32<16> i_to_uint32(const float32<16>& a)
{
    // Single VCVTTPS2UDQ over the full 512-bit register handles all
    // 16 lanes, truncating toward zero.
    __m512i truncated = _mm512_cvttps_epu32(a.native());
    return truncated;
}
#endif
151 | |
// Generic N-wide overload: splits the input into the fixed-width vectors
// handled by the overloads above, converts each, and assembles the result.
template<unsigned N> SIMDPP_INL
uint32<N> i_to_uint32(const float32<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_INSERT(uint32<N>, i_to_uint32, a)
}
157 | |
158 | // ----------------------------------------------------------------------------- |
159 | |
// Converts 4 doubles (stored as two 2-element sub-vectors) to unsigned
// 32-bit integers, truncating toward zero.
static SIMDPP_INL
uint32<4> i_to_uint32(const float64<4>& a)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cvttpd_epu32(a.native()); // VCVTTPD2UDQ: 4 doubles -> 4 uint32
#elif SIMDPP_USE_AVX512F
    // Only the 512-bit form is available: widen, convert (8 doubles ->
    // 8 uint32 in a __m256i), keep the low 128 bits.
    __m512d a512 = _mm512_castpd256_pd512(a.native());
    return _mm256_castsi256_si128(_mm512_cvttpd_epu32(a512));
#elif SIMDPP_USE_NEON64
    uint64x2_t r1, r2;
    r1 = vcvtq_u64_f64(a.vec(0).native());
    r2 = vcvtq_u64_f64(a.vec(1).native());
    // FIXME: saturation
    // NOTE(review): vqmovn_u64 narrows 64->32 with saturation, which differs
    // from the truncation semantics of the other paths for out-of-range
    // values.
    uint32<4> r = vcombine_u32(vqmovn_u64(r1), vqmovn_u64(r2));
    return r;
#elif SIMDPP_USE_VSX_206
    // Convert each half, then gather the valid 32-bit lanes of both halves
    // with unzip4_lo. Presumably vec_ctu on doubles leaves results in
    // alternating word lanes — TODO confirm against the xvcvdpuxws
    // description.
    uint32<4> r, r1, r2;
    r1 = (__vector uint32_t) vec_ctu(a.vec(0).native(), 0);
    r2 = (__vector uint32_t) vec_ctu(a.vec(1).native(), 0);
    r = unzip4_lo(r1, r2);
    return r;
#elif SIMDPP_USE_MSA
    // ftrunc produces 64-bit results; unzip4_lo then selects the
    // even-indexed 32-bit lanes of both halves.
    uint64<2> r1, r2;
    r1 = __msa_ftrunc_u_d(a.vec(0).native());
    r2 = __msa_ftrunc_u_d(a.vec(1).native());
    return unzip4_lo((uint32<4>)r1, (uint32<4>)r2);
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
    // Scalar fallback: convert each of the four elements individually.
    detail::mem_block<uint32x4> r;
    r[0] = uint32_t(a.vec(0).el(0));
    r[1] = uint32_t(a.vec(0).el(1));
    r[2] = uint32_t(a.vec(1).el(0));
    r[3] = uint32_t(a.vec(1).el(1));
    return r;
#else
    return SIMDPP_NOT_IMPLEMENTED1(a);
#endif
}
197 | |
#if SIMDPP_USE_AVX
// 8 doubles -> 8 uint32, truncating toward zero.
static SIMDPP_INL
uint32<8> i_to_uint32(const float64<8>& a)
{
#if SIMDPP_USE_AVX512F
    return _mm512_cvttpd_epu32(a.native()); // VCVTTPD2UDQ: 8 doubles -> 8 uint32
#else
    // No unsigned double->uint32 vector conversion below AVX512F.
    return SIMDPP_NOT_IMPLEMENTED1(a);
#endif
}
#endif
209 | |
#if SIMDPP_USE_AVX512F
// 16 doubles -> 16 uint32: convert each 512-bit half and concatenate.
static SIMDPP_INL
uint32<16> i_to_uint32(const float64<16>& a)
{
    uint32<8> lo = _mm512_cvttpd_epu32(a.vec(0).native());
    uint32<8> hi = _mm512_cvttpd_epu32(a.vec(1).native());
    return combine(lo, hi);
}
#endif
220 | |
// Generic N-wide overload. Doubles occupy twice as many sub-vectors as the
// uint32 result, hence the EXTRACT (narrowing) form of the dispatch macro.
template<unsigned N> SIMDPP_INL
uint32<N> i_to_uint32(const float64<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_EXTRACT(uint32<N>, i_to_uint32, a)
}
226 | |
227 | // ----------------------------------------------------------------------------- |
228 | |
// Converts 4 doubles (stored as two 2-element sub-vectors) to signed
// 32-bit integers, truncating toward zero.
static SIMDPP_INL
int32x4 i_to_int32(const float64x4& a)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cvttpd_epi32(a.native()); // VCVTTPD2DQ: 4 doubles -> 4 int32
#elif SIMDPP_USE_SSE2
    // Each CVTTPD2DQ yields 2 int32 in the low 64 bits of a 128-bit
    // register; zip2_lo on 64-bit lanes places both pairs side by side.
    int32x4 r, r1, r2;
    float64x2 a1, a2;
    split(a, a1, a2);
    r1 = _mm_cvttpd_epi32(a1.native());
    r2 = _mm_cvttpd_epi32(a2.native());
    r = zip2_lo(int64<2>(r1), int64<2>(r2));
    return r;
#elif SIMDPP_USE_NEON64
    int64x2_t r1, r2;
    r1 = vcvtq_s64_f64(a.vec(0).native());
    r2 = vcvtq_s64_f64(a.vec(1).native());
    // FIXME: saturation
    // NOTE(review): vqmovn_s64 narrows 64->32 with saturation, which differs
    // from the truncation semantics of the other paths for out-of-range
    // values.
    int32<4> r = vcombine_s32(vqmovn_s64(r1), vqmovn_s64(r2));
    return r;
#elif SIMDPP_USE_VSX_206
    // Convert each half, then gather the valid 32-bit lanes of both halves
    // with unzip4_lo. Presumably vec_cts on doubles leaves results in
    // alternating word lanes — TODO confirm against the xvcvdpsxws
    // description.
    int32<4> r, r1, r2;
    r1 = (__vector int32_t) vec_cts(a.vec(0).native(), 0);
    r2 = (__vector int32_t) vec_cts(a.vec(1).native(), 0);
    r = unzip4_lo(r1, r2);
    return r;
#elif SIMDPP_USE_MSA
    // ftrunc produces 64-bit results; unzip4_lo then selects the
    // even-indexed 32-bit lanes of both halves.
    int64<2> r1, r2;
    r1 = __msa_ftrunc_s_d(a.vec(0).native());
    r2 = __msa_ftrunc_s_d(a.vec(1).native());
    return unzip4_lo((int32<4>)r1, (int32<4>)r2);
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
    // Scalar fallback: convert each of the four elements individually.
    detail::mem_block<int32x4> r;
    r[0] = int32_t(a.vec(0).el(0));
    r[1] = int32_t(a.vec(0).el(1));
    r[2] = int32_t(a.vec(1).el(0));
    r[3] = int32_t(a.vec(1).el(1));
    return r;
#else
    return SIMDPP_NOT_IMPLEMENTED1(a);
#endif
}
271 | |
#if SIMDPP_USE_AVX
// 8 doubles -> 8 int32, truncating toward zero.
static SIMDPP_INL
int32<8> i_to_int32(const float64<8>& a)
{
#if SIMDPP_USE_AVX512F
    return _mm512_cvttpd_epi32(a.native()); // VCVTTPD2DQ: 8 doubles -> 8 int32
#else
    // Convert each 256-bit half (4 doubles -> 4 int32) and concatenate.
    int32<4> r1, r2;
    r1 = _mm256_cvttpd_epi32(a.vec(0).native());
    r2 = _mm256_cvttpd_epi32(a.vec(1).native());
    return combine(r1, r2);
#endif
}
#endif
286 | |
#if SIMDPP_USE_AVX512F
// 16 doubles -> 16 int32: convert each 512-bit half and concatenate.
static SIMDPP_INL
int32<16> i_to_int32(const float64<16>& a)
{
    int32<8> lo = _mm512_cvttpd_epi32(a.vec(0).native());
    int32<8> hi = _mm512_cvttpd_epi32(a.vec(1).native());
    return combine(lo, hi);
}
#endif
297 | |
// Generic N-wide overload. Doubles occupy twice as many sub-vectors as the
// int32 result, hence the EXTRACT (narrowing) form of the dispatch macro.
template<unsigned N> SIMDPP_INL
int32<N> i_to_int32(const float64<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_EXTRACT(int32<N>, i_to_int32, a)
}
303 | |
304 | |
305 | } // namespace insn |
306 | } // namespace detail |
307 | } // namespace SIMDPP_ARCH_NAMESPACE |
308 | } // namespace simdpp |
309 | |
310 | #endif |
311 | |
312 | |
313 | |