/*  Copyright (C) 2013-2017  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_CONV_ANY_TO_FLOAT32_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_CONV_ANY_TO_FLOAT32_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/combine.h>
#include <simdpp/core/zip_hi.h>
#include <simdpp/core/zip_lo.h>
#include <simdpp/core/insert.h>
#include <simdpp/core/f_mul.h>
#include <simdpp/core/make_shuffle_bytes_mask.h>
#include <simdpp/core/shuffle_bytes16.h>
#include <simdpp/core/detail/subvec_extract.h>
#include <simdpp/detail/mem_block.h>
#include <simdpp/detail/insn/conv_extend_to_int32.h>
#include <simdpp/detail/insn/conv_any_to_float64.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

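// The i_to_float32 overloads below are internal dispatch helpers; user code
// normally reaches them through the public simdpp::to_float32 wrapper
// rather than calling them directly.

// float64 -> float32: each double is narrowed to a single. Depending on
// the ISA this uses a native narrowing instruction (AVX, NEON64, VSX 2.06,
// MSA), converts 128-bit halves separately and packs the results (SSE2),
// or falls back to element-wise conversion through memory (NULL, NEON32,
// ALTIVEC).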
static SIMDPP_INL
float32<4> i_to_float32(const float64<4>& a)
{
#if SIMDPP_USE_AVX
    return _mm256_cvtpd_ps(a.native());
#elif SIMDPP_USE_SSE2
    float32x4 r1, r2;
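    // _mm_cvtpd_ps fills the two low floats of its result and zeroes the
    // two high floats. _mm_movelh_ps then packs the low halves of r1 and
    // r2 into one vector containing all four converted values.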
    r1 = _mm_cvtpd_ps(a.vec(0).native());
    r2 = _mm_cvtpd_ps(a.vec(1).native());
    return _mm_movelh_ps(r1.native(), r2.native());
#elif SIMDPP_USE_NEON64
    float32<4> r;
    r = vcvt_high_f32_f64(vcvt_f32_f64(a.vec(0).native()),
                          a.vec(1).native());
    return r;
#elif SIMDPP_USE_VSX_206
    float32<4> lo, hi;
    uint32<4> shuffle_mask;
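    // xvcvdpsp places each narrowed single in the even word positions
    // (elements 0 and 2) of its result, so selecting elements 0 and 2 of
    // lo and of hi (indices 4 and 6) gathers the four values in order.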
    lo = __builtin_vsx_xvcvdpsp(a.vec(0).native());
    hi = __builtin_vsx_xvcvdpsp(a.vec(1).native());
    shuffle_mask = make_shuffle_bytes16_mask<0,2,4,6>(shuffle_mask);
    return shuffle_bytes16(lo, hi, shuffle_mask);
#elif SIMDPP_USE_MSA
    return __msa_fexdo_w(a.vec(0).native(), a.vec(1).native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC
    detail::mem_block<float32x4> r;
    r[0] = float(a.vec(0).el(0));
    r[1] = float(a.vec(0).el(1));
    r[2] = float(a.vec(1).el(0));
    r[3] = float(a.vec(1).el(1));
    return r;
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
float32<8> i_to_float32(const float64<8>& a)
{
#if SIMDPP_USE_AVX512F
    // Round according to the current MXCSR rounding mode (round-to-nearest
    // by default) so that this path matches the behavior of _mm256_cvtpd_ps
    // used for the narrower vectors.
    return _mm512_cvt_roundpd_ps(a.native(), _MM_FROUND_CUR_DIRECTION);
#else
    float32x4 r1, r2;
    r1 = i_to_float32(a.vec(0));
    r2 = i_to_float32(a.vec(1));
    return combine(r1, r2);
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
float32<16> i_to_float32(const float64<16>& a)
{
    float32<8> r1, r2;
    r1 = i_to_float32(a.vec(0));
    r2 = i_to_float32(a.vec(1));
    return combine(r1, r2);
}
#endif

template<unsigned N> SIMDPP_INL
float32<N> i_to_float32(const float64<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_EXTRACT(float32<N>, i_to_float32, a)
}

// -----------------------------------------------------------------------------
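
// int64 -> float32: AVX512 provides direct conversion instructions. Other
// targets go through float64 first and then narrow, except the NULL
// backend, which converts element by element.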

static SIMDPP_INL
float32<4> i_to_float32(const int64<4>& a)
{
#if SIMDPP_USE_NULL
    float32<4> r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = float(a.vec(i/2).el(i%2));
    }
    return r;
#elif SIMDPP_USE_AVX512VL && SIMDPP_USE_AVX512DQ
    // _mm256_cvtepi64_ps requires both the AVX512VL and AVX512DQ subsets
    return _mm256_cvtepi64_ps(a.native());
#else
    return i_to_float32(i_to_float64(a));
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
float32<8> i_to_float32(const int64<8>& a)
{
#if SIMDPP_USE_AVX512DQ
    return _mm512_cvtepi64_ps(a.native());
#else
    // No direct instruction is available without AVX512DQ; fall back to
    // conversion through float64, as the other widths do.
    return i_to_float32(i_to_float64(a));
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
float32<16> i_to_float32(const int64<16>& a)
{
#if SIMDPP_USE_AVX512DQ
    float32<8> r0 = _mm512_cvtepi64_ps(a.vec(0).native());
    float32<8> r1 = _mm512_cvtepi64_ps(a.vec(1).native());
    return combine(r0, r1);
#else
    return i_to_float32(i_to_float64(a));
#endif
}
#endif

template<unsigned N> SIMDPP_INL
float32<N> i_to_float32(const int64<N>& a)
{
    return i_to_float32(i_to_float64(a));
}

// -----------------------------------------------------------------------------
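
// uint64 -> float32: same strategy as the signed case, using the unsigned
// AVX512DQ instructions where available.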

static SIMDPP_INL
float32<4> i_to_float32(const uint64<4>& a)
{
#if SIMDPP_USE_NULL
    float32<4> r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = float(a.vec(i/2).el(i%2));
    }
    return r;
#elif SIMDPP_USE_AVX512VL && SIMDPP_USE_AVX512DQ
    // _mm256_cvtepu64_ps requires both the AVX512VL and AVX512DQ subsets
    return _mm256_cvtepu64_ps(a.native());
#else
    return i_to_float32(i_to_float64(a));
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
float32<8> i_to_float32(const uint64<8>& a)
{
#if SIMDPP_USE_AVX512DQ
    return _mm512_cvtepu64_ps(a.native());
#else
    // No direct instruction is available without AVX512DQ; fall back to
    // conversion through float64, as the other widths do.
    return i_to_float32(i_to_float64(a));
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
float32<16> i_to_float32(const uint64<16>& a)
{
#if SIMDPP_USE_AVX512DQ
    float32<8> r0 = _mm512_cvtepu64_ps(a.vec(0).native());
    float32<8> r1 = _mm512_cvtepu64_ps(a.vec(1).native());
    return combine(r0, r1);
#else
    return i_to_float32(i_to_float64(a));
#endif
}
#endif

template<unsigned N> SIMDPP_INL
float32<N> i_to_float32(const uint64<N>& a)
{
    return i_to_float32(i_to_float64(a));
}

// -----------------------------------------------------------------------------
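
// int32 -> float32: nearly every ISA has a native signed conversion, so
// this mostly dispatches directly to an intrinsic. NEON without
// single-precision vector FP support converts through memory instead.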

static SIMDPP_INL
float32<4> i_to_float32(const int32<4>& a)
{
#if SIMDPP_USE_NULL
    float32<4> r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = float(a.el(i));
    }
    return r;
#elif SIMDPP_USE_SSE2
    return _mm_cvtepi32_ps(a.native());
#elif SIMDPP_USE_NEON && !SIMDPP_USE_NEON_FLT_SP
    detail::mem_block<int32<4>> mi(a);
    float32<4> r;
    r.el(0) = float(mi[0]);
    r.el(1) = float(mi[1]);
    r.el(2) = float(mi[2]);
    r.el(3) = float(mi[3]);
    return r;
#elif SIMDPP_USE_NEON_FLT_SP
    return vcvtq_f32_s32(a.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_ctf(a.native(), 0);
#elif SIMDPP_USE_MSA
    return __msa_ffint_s_w(a.native());
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
float32x8 i_to_float32(const int32x8& a)
{
#if SIMDPP_USE_AVX2
    return _mm256_cvtepi32_ps(a.native());
#else
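    // Without AVX2 the 8 x 32-bit integer vector is stored as two 128-bit
    // halves. VCVTDQ2PS is a float-domain AVX instruction, so the halves
    // can be glued into one 256-bit register and converted in one step.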
    __m256i a1;
    a1 = _mm256_castsi128_si256(a.vec(0).native());
    a1 = _mm256_insertf128_si256(a1, a.vec(1).native(), 1);
    return _mm256_cvtepi32_ps(a1);
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
float32<16> i_to_float32(const int32<16>& a)
{
    return _mm512_cvtepi32_ps(a.native());
}
#endif

template<unsigned N> SIMDPP_INL
float32<N> i_to_float32(const int32<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_EXTRACT(float32<N>, i_to_float32, a)
}

// -----------------------------------------------------------------------------

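// uint32 -> float32: SSE2 and AVX2 only provide signed int32 conversions,
// so the unsigned case converts as signed and then conditionally adds 2^32
// to fix up inputs that have the top bit set; see the comments below.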
static SIMDPP_INL
float32<4> i_to_float32(const uint32<4>& a)
{
#if SIMDPP_USE_NULL
    float32<4> r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = float(a.el(i));
    }
    return r;
#elif SIMDPP_USE_SSE2
    // true when a is in the range [0x80000000, 0xffffffff]
    mask_float32<4> is_large = mask_float32<4>(cmp_lt(int32<4>(a), 0));

    float32<4> f_a = _mm_cvtepi32_ps(a.native());
    // f_a has the values from the range [0x80000000, 0xffffffff] wrapped
    // around to negative values. Conditionally bias the result by 2^32 to
    // undo the wraparound. Note that the intermediate result has sufficient
    // precision even for large argument values: f_a is least precise around
    // 0x80000000 and becomes more precise towards 0xffffffff, while the
    // final biased result is even less precise throughout this whole range,
    // so the extra rounding step loses nothing.
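    // For example, a = 0xfffffffe converts as signed to f_a = -2.0f
    // exactly; adding 2^32 yields 4294967294.0, which rounds to the nearest
    // representable float, 4294967296.0f, i.e. within one ulp of the exact
    // value.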
    return add(f_a, bit_and(is_large, splat<float32<4>>(0x100000000)));
#elif SIMDPP_USE_NEON && !SIMDPP_USE_NEON_FLT_SP
    detail::mem_block<uint32<4>> mi(a);
    detail::mem_block<float32<4>> mf;
    mf[0] = float(mi[0]);
    mf[1] = float(mi[1]);
    mf[2] = float(mi[2]);
    mf[3] = float(mi[3]);
    return mf;
#elif SIMDPP_USE_NEON_FLT_SP
    return vcvtq_f32_u32(a.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_ctf(a.native(), 0);
#elif SIMDPP_USE_MSA
    return __msa_ffint_u_w(a.native());
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
float32x8 i_to_float32(const uint32x8& a)
{
#if SIMDPP_USE_AVX512F
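    // AVX512F only provides the unsigned conversion at 512-bit width:
    // widen the input, convert, and take the low 256-bit half. The upper
    // half of the widened register is undefined but harmless here.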
    __m512i a512 = _mm512_castsi256_si512(a.native());
    return _mm512_castps512_ps256(_mm512_cvtepu32_ps(a512));
#elif SIMDPP_USE_AVX2
    // true when a is in the range [0x80000000, 0xffffffff]
    mask_float32<8> is_large = mask_float32<8>(cmp_lt(int32<8>(a), 0));

    float32<8> f_a = _mm256_cvtepi32_ps(a.native());
    return add(f_a, bit_and(is_large, splat<float32<8>>(0x100000000)));
#else
    return combine(i_to_float32(a.vec(0)), i_to_float32(a.vec(1)));
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
float32<16> i_to_float32(const uint32<16>& a)
{
    return _mm512_cvtepu32_ps(a.native());
}
#endif

template<unsigned N> SIMDPP_INL
float32<N> i_to_float32(const uint32<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_EXTRACT(float32<N>, i_to_float32, a)
}

// -----------------------------------------------------------------------------
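
// int16/uint16 -> float32: widen to 32-bit integers first. This is exact,
// as every 16-bit integer value is representable in float32.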

template<unsigned N> SIMDPP_INL
float32<N> i_to_float32(const uint16<N>& a)
{
    return i_to_float32(i_to_uint32(a));
}

template<unsigned N> SIMDPP_INL
float32<N> i_to_float32(const int16<N>& a)
{
    return i_to_float32(i_to_int32(a));
}

// -----------------------------------------------------------------------------
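
// int8/uint8 -> float32: same widening approach as for the 16-bit types,
// and likewise exact.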

template<unsigned N> SIMDPP_INL
float32<N> i_to_float32(const uint8<N>& a)
{
    return i_to_float32(i_to_uint32(a));
}

template<unsigned N> SIMDPP_INL
float32<N> i_to_float32(const int8<N>& a)
{
    return i_to_float32(i_to_int32(a));
}

// -----------------------------------------------------------------------------

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif