1/* Copyright (C) 2017 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_CONV_FLOAT_TO_INT32_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_CONV_FLOAT_TO_INT32_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/types.h>
16#include <simdpp/core/i_add.h>
17#include <simdpp/core/f_sub.h>
18#include <simdpp/core/cmp_ge.h>
19#include <simdpp/detail/vector_array_conv_macros.h>
20
21namespace simdpp {
22namespace SIMDPP_ARCH_NAMESPACE {
23namespace detail {
24namespace insn {
25
26// -----------------------------------------------------------------------------
27
static SIMDPP_INL
int32<4> i_to_int32(const float32<4>& a)
{
    // Truncating (round-toward-zero) conversion of four single-precision
    // floats to signed 32-bit integers. Exactly one branch below is compiled,
    // selected by the SIMDPP_USE_* configuration macros.
#if SIMDPP_USE_NULL
    // Portable scalar fallback: convert element by element.
    int32x4 r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = int32_t(a.el(i));
    }
    return r;
#elif SIMDPP_USE_SSE2
    // CVTTPS2DQ: truncating conversion in a single instruction.
    return _mm_cvttps_epi32(a.native());
#elif SIMDPP_USE_NEON && !SIMDPP_USE_NEON_FLT_SP
    // NEON without single-precision float vector support: round-trip
    // through memory and convert in scalar code.
    detail::mem_block<float32x4> mf(a);
    detail::mem_block<int32x4> mi;
    mi[0] = int(mf[0]);
    mi[1] = int(mf[1]);
    mi[2] = int(mf[2]);
    mi[3] = int(mf[3]);
    return mi;
#elif SIMDPP_USE_NEON_FLT_SP
    return vcvtq_s32_f32(a.native());
#elif SIMDPP_USE_MSA
    return __msa_ftrunc_s_w(a.native());
#elif SIMDPP_USE_ALTIVEC
    // vec_cts with a scale of 0: convert to signed int without scaling.
    return vec_cts(a.native(), 0);
#endif
}
55
#if SIMDPP_USE_AVX
// Truncating conversion of eight single-precision floats to signed 32-bit
// integers.
static SIMDPP_INL
int32<8> i_to_int32(const float32<8>& a)
{
#if SIMDPP_USE_AVX2
    return _mm256_cvttps_epi32(a.native());
#else
    // AVX1 provides the 256-bit float->int32 truncating conversion, but no
    // 256-bit integer vector support, so split the converted value into two
    // 128-bit halves and recombine them.
    __m256i r = _mm256_cvttps_epi32(a.native());
    // Use signed temporaries to match the signed return type (the original
    // uint32<4> temporaries relied on the implicit uint->int conversion).
    int32<4> r1, r2;
    r1 = _mm256_castsi256_si128(r);
    r2 = _mm256_extractf128_si256(r, 1);
    return combine(r1, r2);
#endif
}
#endif
71
#if SIMDPP_USE_AVX512F
// Truncating conversion of sixteen floats to signed 32-bit integers;
// AVX-512F performs this directly on a full 512-bit register.
static SIMDPP_INL
int32<16> i_to_int32(const float32<16>& a)
{
    const __m512 packed = a.native();
    return _mm512_cvttps_epi32(packed);
}
#endif
79
// Generic overload for wider vectors: applies the single-vector i_to_int32
// to each subvector and assembles the results into an int32<N>.
template<unsigned N> SIMDPP_INL
int32<N> i_to_int32(const float32<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_INSERT(int32<N>, i_to_int32, a)
}
85
86// -----------------------------------------------------------------------------
87
static SIMDPP_INL
uint32<4> i_to_uint32(const float32<4>& a)
{
    // Truncating conversion of four single-precision floats to unsigned
    // 32-bit integers. Exactly one branch is compiled, selected by the
    // SIMDPP_USE_* configuration macros.
#if SIMDPP_USE_NULL
    // Portable scalar fallback.
    uint32x4 r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = uint32_t(a.el(i));
    }
    return r;
#elif SIMDPP_USE_AVX512VL
    // AVX-512VL exposes the unsigned conversion directly on 128-bit vectors.
    return _mm_cvttps_epu32(a.native());
#elif SIMDPP_USE_AVX512F
    // Without AVX-512VL the unsigned conversion only exists at 512-bit width;
    // widen, convert, then take the low 128 bits.
    __m512 a512 = _mm512_castps128_ps512(a.native());
    return _mm512_castsi512_si128(_mm512_cvttps_epu32(a512));
#elif SIMDPP_USE_NEON && !SIMDPP_USE_NEON_FLT_SP
    // NEON without single-precision float vector support: round-trip through
    // memory and convert in scalar code.
    detail::mem_block<float32x4> mf(a);
    detail::mem_block<uint32x4> mi;
    mi[0] = unsigned(mf[0]);
    mi[1] = unsigned(mf[1]);
    mi[2] = unsigned(mf[2]);
    mi[3] = unsigned(mf[3]);
    return mi;
#elif SIMDPP_USE_NEON_FLT_SP
    return vcvtq_u32_f32(a.native());
#elif SIMDPP_USE_MSA
    return __msa_ftrunc_u_w(a.native());
#elif SIMDPP_USE_ALTIVEC
    // vec_ctu with a scale of 0: convert to unsigned int without scaling.
    return vec_ctu(a.native(), 0);
#else
    // No native unsigned conversion is available. Values below 0x80000000 are
    // exactly representable as int32, so i_to_int32 (available as an
    // instruction on all supported architectures) handles them directly.
    // Values in 0x80000000..0xffffffff are biased by subtracting 0x100000000,
    // mapping them to -0x80000000..-1; converting those through i_to_int32
    // yields int32 values whose bit pattern equals the desired uint32 result,
    // so no unbiasing step is required afterwards. No attempt is made to
    // produce a reliable overflow value for inputs outside 0..0xffffffff.
    mask_float32<4> is_large = cmp_ge(a, 0x80000000);
    return uint32<4>( i_to_int32(sub(a, bit_and(is_large, splat<float32<4>>(0x100000000)))) );
#endif
}
127
#if SIMDPP_USE_AVX
static SIMDPP_INL
uint32<8> i_to_uint32(const float32<8>& a)
{
    // Truncating conversion of eight floats to unsigned 32-bit integers.
#if SIMDPP_USE_AVX512VL
    // Direct 256-bit unsigned conversion.
    return _mm256_cvttps_epu32(a.native());
#elif SIMDPP_USE_AVX512F
    // Unsigned conversion only exists at 512-bit width without AVX-512VL;
    // widen, convert, take the low 256 bits.
    __m512 a512 = _mm512_castps256_ps512(a.native());
    return _mm512_castsi512_si256(_mm512_cvttps_epu32(a512));
#else
    // Same bias trick as the 128-bit overload: map values >= 0x80000000 to
    // the negative int32 range whose bit pattern equals the uint32 result,
    // then convert through the signed conversion.
    mask_float32<8> is_large = cmp_ge(a, 0x80000000);
    return uint32<8>( i_to_int32(sub(a, bit_and(is_large, splat<float32<8>>(0x100000000)))) );
#endif
}
#endif
143
#if SIMDPP_USE_AVX512F
// Truncating conversion of sixteen floats to unsigned 32-bit integers;
// AVX-512F supports this directly at full 512-bit width.
static SIMDPP_INL
uint32<16> i_to_uint32(const float32<16>& a)
{
    const __m512 packed = a.native();
    return _mm512_cvttps_epu32(packed);
}
#endif
151
// Generic overload for wider vectors: applies the single-vector i_to_uint32
// to each subvector and assembles the results into a uint32<N>.
template<unsigned N> SIMDPP_INL
uint32<N> i_to_uint32(const float32<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_INSERT(uint32<N>, i_to_uint32, a)
}
157
158// -----------------------------------------------------------------------------
159
static SIMDPP_INL
uint32<4> i_to_uint32(const float64<4>& a)
{
    // Truncating conversion of four doubles (stored as two 128-bit
    // subvectors on most targets) to four unsigned 32-bit integers.
#if SIMDPP_USE_AVX512VL
    // Direct narrowing conversion: 4 doubles -> 4 uint32.
    return _mm256_cvttpd_epu32(a.native());
#elif SIMDPP_USE_AVX512F
    // Without AVX-512VL, the unsigned conversion only exists at 512-bit
    // width; widen, convert, take the low 128 bits.
    __m512d a512 = _mm512_castpd256_pd512(a.native());
    return _mm256_castsi256_si128(_mm512_cvttpd_epu32(a512));
#elif SIMDPP_USE_NEON64
    uint64x2_t r1, r2;
    r1 = vcvtq_u64_f64(a.vec(0).native());
    r2 = vcvtq_u64_f64(a.vec(1).native());
    // FIXME: saturation
    // vqmovn_u64 narrows each 64-bit result to 32 bits with saturation,
    // which differs from the plain truncation other branches perform for
    // out-of-range values.
    uint32<4> r = vcombine_u32(vqmovn_u64(r1), vqmovn_u64(r2));
    return r;
#elif SIMDPP_USE_VSX_206
    // Convert each 2-double half, then gather the 32-bit results from both
    // halves with unzip4_lo.
    uint32<4> r, r1, r2;
    r1 = (__vector uint32_t) vec_ctu(a.vec(0).native(), 0);
    r2 = (__vector uint32_t) vec_ctu(a.vec(1).native(), 0);
    r = unzip4_lo(r1, r2);
    return r;
#elif SIMDPP_USE_MSA
    // Convert each half to 64-bit results, then pick the low 32 bits of each
    // lane via unzip4_lo.
    uint64<2> r1, r2;
    r1 = __msa_ftrunc_u_d(a.vec(0).native());
    r2 = __msa_ftrunc_u_d(a.vec(1).native());
    return unzip4_lo((uint32<4>)r1, (uint32<4>)r2);
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
    // Scalar fallback through a memory block.
    detail::mem_block<uint32x4> r;
    r[0] = uint32_t(a.vec(0).el(0));
    r[1] = uint32_t(a.vec(0).el(1));
    r[2] = uint32_t(a.vec(1).el(0));
    r[3] = uint32_t(a.vec(1).el(1));
    return r;
#else
    return SIMDPP_NOT_IMPLEMENTED1(a);
#endif
}
197
#if SIMDPP_USE_AVX
// Truncating conversion of eight doubles to eight unsigned 32-bit integers.
static SIMDPP_INL
uint32<8> i_to_uint32(const float64<8>& a)
{
#if SIMDPP_USE_AVX512F
    // AVX-512F narrows 8 doubles to 8 uint32 in a single instruction.
    const __m512d packed = a.native();
    return _mm512_cvttpd_epu32(packed);
#else
    // No unsigned double->uint32 conversion is available below AVX-512.
    return SIMDPP_NOT_IMPLEMENTED1(a);
#endif
}
#endif
209
#if SIMDPP_USE_AVX512F
// Truncating conversion of sixteen doubles to sixteen unsigned 32-bit
// integers: convert each 512-bit half (8 doubles -> 8 uint32), then
// concatenate the halves.
static SIMDPP_INL
uint32<16> i_to_uint32(const float64<16>& a)
{
    uint32<8> lo, hi;
    lo = _mm512_cvttpd_epu32(a.vec(0).native());
    hi = _mm512_cvttpd_epu32(a.vec(1).native());
    return combine(lo, hi);
}
#endif
220
// Generic overload for wider vectors: applies the narrower i_to_uint32 to
// extracted subvectors of the float64<N> argument and assembles a uint32<N>.
template<unsigned N> SIMDPP_INL
uint32<N> i_to_uint32(const float64<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_EXTRACT(uint32<N>, i_to_uint32, a)
}
226
227// -----------------------------------------------------------------------------
228
static SIMDPP_INL
int32x4 i_to_int32(const float64x4& a)
{
    // Truncating conversion of four doubles (two 128-bit subvectors on most
    // targets) to four signed 32-bit integers.
#if SIMDPP_USE_AVX512VL
    // Direct narrowing conversion: 4 doubles -> 4 int32.
    return _mm256_cvttpd_epi32(a.native());
#elif SIMDPP_USE_SSE2
    // CVTTPD2DQ converts 2 doubles at a time, leaving the results in the low
    // 64 bits; convert each half and join the two low halves with zip2_lo.
    int32x4 r, r1, r2;
    float64x2 a1, a2;
    split(a, a1, a2);
    r1 = _mm_cvttpd_epi32(a1.native());
    r2 = _mm_cvttpd_epi32(a2.native());
    r = zip2_lo(int64<2>(r1), int64<2>(r2));
    return r;
#elif SIMDPP_USE_NEON64
    int64x2_t r1, r2;
    r1 = vcvtq_s64_f64(a.vec(0).native());
    r2 = vcvtq_s64_f64(a.vec(1).native());
    // FIXME: saturation
    // vqmovn_s64 narrows each 64-bit result to 32 bits with saturation,
    // which differs from the plain truncation other branches perform for
    // out-of-range values.
    int32<4> r = vcombine_s32(vqmovn_s64(r1), vqmovn_s64(r2));
    return r;
#elif SIMDPP_USE_VSX_206
    // Convert each 2-double half, then gather the 32-bit results from both
    // halves with unzip4_lo.
    int32<4> r, r1, r2;
    r1 = (__vector int32_t) vec_cts(a.vec(0).native(), 0);
    r2 = (__vector int32_t) vec_cts(a.vec(1).native(), 0);
    r = unzip4_lo(r1, r2);
    return r;
#elif SIMDPP_USE_MSA
    // Convert each half to 64-bit results, then pick the low 32 bits of each
    // lane via unzip4_lo.
    int64<2> r1, r2;
    r1 = __msa_ftrunc_s_d(a.vec(0).native());
    r2 = __msa_ftrunc_s_d(a.vec(1).native());
    return unzip4_lo((int32<4>)r1, (int32<4>)r2);
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
    // Scalar fallback through a memory block.
    detail::mem_block<int32x4> r;
    r[0] = int32_t(a.vec(0).el(0));
    r[1] = int32_t(a.vec(0).el(1));
    r[2] = int32_t(a.vec(1).el(0));
    r[3] = int32_t(a.vec(1).el(1));
    return r;
#else
    return SIMDPP_NOT_IMPLEMENTED1(a);
#endif
}
271
#if SIMDPP_USE_AVX
// Truncating conversion of eight doubles to eight signed 32-bit integers.
static SIMDPP_INL
int32<8> i_to_int32(const float64<8>& a)
{
#if SIMDPP_USE_AVX512F
    // AVX-512F narrows 8 doubles to 8 int32 in a single instruction.
    return _mm512_cvttpd_epi32(a.native());
#else
    // Convert each 4-double half (4 doubles -> 4 int32) and concatenate.
    int32<4> lo, hi;
    lo = _mm256_cvttpd_epi32(a.vec(0).native());
    hi = _mm256_cvttpd_epi32(a.vec(1).native());
    return combine(lo, hi);
#endif
}
#endif
286
#if SIMDPP_USE_AVX512F
// Truncating conversion of sixteen doubles to sixteen signed 32-bit
// integers: convert each 512-bit half (8 doubles -> 8 int32), then
// concatenate the halves.
static SIMDPP_INL
int32<16> i_to_int32(const float64<16>& a)
{
    int32<8> lo, hi;
    lo = _mm512_cvttpd_epi32(a.vec(0).native());
    hi = _mm512_cvttpd_epi32(a.vec(1).native());
    return combine(lo, hi);
}
#endif
297
// Generic overload for wider vectors: applies the narrower i_to_int32 to
// extracted subvectors of the float64<N> argument and assembles an int32<N>.
template<unsigned N> SIMDPP_INL
int32<N> i_to_int32(const float64<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_EXTRACT(int32<N>, i_to_int32, a)
}
303
304
305} // namespace insn
306} // namespace detail
307} // namespace SIMDPP_ARCH_NAMESPACE
308} // namespace simdpp
309
310#endif
311
312
313