1 | /* Copyright (C) 2013-2017 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_CONV_ANY_TO_FLOAT32_H |
9 | #define LIBSIMDPP_SIMDPP_DETAIL_INSN_CONV_ANY_TO_FLOAT32_H |
10 | |
11 | #ifndef LIBSIMDPP_SIMD_H |
12 | #error "This file must be included through simd.h" |
13 | #endif |
14 | |
15 | #include <simdpp/types.h> |
16 | #include <simdpp/core/combine.h> |
17 | #include <simdpp/core/zip_hi.h> |
18 | #include <simdpp/core/zip_lo.h> |
19 | #include <simdpp/core/insert.h> |
20 | #include <simdpp/core/f_mul.h> |
21 | #include <simdpp/core/make_shuffle_bytes_mask.h> |
22 | #include <simdpp/core/shuffle_bytes16.h> |
23 | #include <simdpp/core/detail/subvec_extract.h> |
24 | #include <simdpp/detail/mem_block.h> |
25 | #include <simdpp/detail/insn/conv_extend_to_int32.h> |
26 | #include <simdpp/detail/insn/conv_any_to_float64.h> |
27 | |
28 | namespace simdpp { |
29 | namespace SIMDPP_ARCH_NAMESPACE { |
30 | namespace detail { |
31 | namespace insn { |
32 | |
33 | |
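// Illustrative usage sketch (a hedged example, not part of this header):
// the i_to_float32 overloads below are reached through the public
// simdpp::to_float32 wrapper, e.g.
//
//     uint32<4> u = make_uint(1, 2, 3, 4);
//     float32<4> f = to_float32(u); // dispatches to an overload below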
34 | static SIMDPP_INL |
35 | float32<4> i_to_float32(const float64<4>& a) |
36 | { |
37 | #if SIMDPP_USE_AVX |
38 | return _mm256_cvtpd_ps(a.native()); |
39 | #elif SIMDPP_USE_SSE2 |
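    // Each _mm_cvtpd_ps narrows two doubles into the low two lanes of its
    // result (zeroing the upper lanes); _mm_movelh_ps then packs the two
    // low halves into one vector.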
40 | float32x4 r1, r2; |
41 | r1 = _mm_cvtpd_ps(a.vec(0).native()); |
42 | r2 = _mm_cvtpd_ps(a.vec(1).native()); |
43 | return _mm_movelh_ps(r1.native(), r2.native()); |
44 | #elif SIMDPP_USE_NEON64 |
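    // vcvt_f32_f64 narrows the first two doubles into a 64-bit float
    // vector; vcvt_high_f32_f64 places the remaining two into the upper
    // half of the 128-bit result.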
45 | float32<4> r; |
46 | r = vcvt_high_f32_f64(vcvt_f32_f64(a.vec(0).native()), |
47 | a.vec(1).native()); |
48 | return r; |
49 | #elif SIMDPP_USE_VSX_206 |
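    // xvcvdpsp leaves the narrowed floats in the even word positions of
    // each result; the byte shuffle below gathers words 0,2 of lo and
    // words 0,2 of hi into one vector.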
50 | float32<4> lo, hi; |
51 | uint32<4> shuffle_mask; |
52 | lo = __builtin_vsx_xvcvdpsp(a.vec(0).native()); |
53 | hi = __builtin_vsx_xvcvdpsp(a.vec(1).native()); |
54 | shuffle_mask = make_shuffle_bytes16_mask<0,2,4,6>(shuffle_mask); |
55 | return shuffle_bytes16(lo, hi, shuffle_mask); |
56 | #elif SIMDPP_USE_MSA |
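    // fexdo.w narrows the doubles from both source vectors into a single
    // vector of floats.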
57 | return __msa_fexdo_w(a.vec(0).native(), a.vec(1).native()); |
58 | #elif SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC |
59 | detail::mem_block<float32x4> r; |
60 | r[0] = float(a.vec(0).el(0)); |
61 | r[1] = float(a.vec(0).el(1)); |
62 | r[2] = float(a.vec(1).el(0)); |
63 | r[3] = float(a.vec(1).el(1)); |
64 | return r; |
65 | #endif |
66 | } |
67 | |
68 | #if SIMDPP_USE_AVX |
69 | static SIMDPP_INL |
70 | float32<8> i_to_float32(const float64<8>& a) |
71 | { |
72 | #if SIMDPP_USE_AVX512F |
    // Round to nearest: matches the default rounding used by the other paths
    return _mm512_cvt_roundpd_ps(a.native(), (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
74 | #else |
75 | float32x4 r1, r2; |
76 | r1 = i_to_float32(a.vec(0)); |
77 | r2 = i_to_float32(a.vec(1)); |
78 | return combine(r1, r2); |
79 | #endif |
80 | } |
81 | #endif |
82 | |
83 | #if SIMDPP_USE_AVX512F |
84 | static SIMDPP_INL |
85 | float32<16> i_to_float32(const float64<16>& a) |
86 | { |
87 | float32<8> r1, r2; |
88 | r1 = i_to_float32(a.vec(0)); |
89 | r2 = i_to_float32(a.vec(1)); |
90 | return combine(r1, r2); |
91 | } |
92 | #endif |
93 | |
94 | template<unsigned N> SIMDPP_INL |
95 | float32<N> i_to_float32(const float64<N>& a) |
96 | { |
97 | SIMDPP_VEC_ARRAY_IMPL_CONV_EXTRACT(float32<N>, i_to_float32, a) |
98 | } |
99 | |
100 | // ----------------------------------------------------------------------------- |
101 | |
102 | static SIMDPP_INL |
103 | float32<4> i_to_float32(const int64<4>& a) |
104 | { |
105 | #if SIMDPP_USE_NULL |
106 | float32<4> r; |
107 | for (unsigned i = 0; i < a.length; i++) { |
108 | r.el(i) = float(a.vec(i/2).el(i%2)); |
109 | } |
110 | return r; |
#elif SIMDPP_USE_AVX512VL && SIMDPP_USE_AVX512DQ
    // _mm256_cvtepi64_ps requires both AVX512VL and AVX512DQ
    return _mm256_cvtepi64_ps(a.native());
113 | #else |
114 | return i_to_float32(i_to_float64(a)); |
115 | #endif |
116 | } |
117 | |
118 | #if SIMDPP_USE_AVX |
119 | static SIMDPP_INL |
120 | float32<8> i_to_float32(const int64<8>& a) |
121 | { |
122 | #if SIMDPP_USE_AVX512DQ |
123 | return _mm512_cvtepi64_ps(a.native()); |
124 | #else |
    return i_to_float32(i_to_float64(a));
126 | #endif |
127 | } |
128 | #endif |
129 | |
130 | #if SIMDPP_USE_AVX512F |
131 | static SIMDPP_INL |
132 | float32<16> i_to_float32(const int64<16>& a) |
133 | { |
134 | #if SIMDPP_USE_AVX512DQ |
135 | float32<8> r0 = _mm512_cvtepi64_ps(a.vec(0).native()); |
136 | float32<8> r1 = _mm512_cvtepi64_ps(a.vec(1).native()); |
137 | return combine(r0, r1); |
138 | #else |
139 | return i_to_float32(i_to_float64(a)); |
140 | #endif |
141 | } |
142 | #endif |
143 | |
144 | template<unsigned N> SIMDPP_INL |
145 | float32<N> i_to_float32(const int64<N>& a) |
146 | { |
147 | return i_to_float32(i_to_float64(a)); |
148 | } |
149 | |
150 | // ----------------------------------------------------------------------------- |
151 | |
152 | static SIMDPP_INL |
153 | float32<4> i_to_float32(const uint64<4>& a) |
154 | { |
155 | #if SIMDPP_USE_NULL |
156 | float32<4> r; |
157 | for (unsigned i = 0; i < a.length; i++) { |
158 | r.el(i) = float(a.vec(i/2).el(i%2)); |
159 | } |
160 | return r; |
#elif SIMDPP_USE_AVX512VL && SIMDPP_USE_AVX512DQ
    // _mm256_cvtepu64_ps requires both AVX512VL and AVX512DQ
    return _mm256_cvtepu64_ps(a.native());
163 | #else |
164 | return i_to_float32(i_to_float64(a)); |
165 | #endif |
166 | } |
167 | |
168 | #if SIMDPP_USE_AVX |
169 | static SIMDPP_INL |
170 | float32<8> i_to_float32(const uint64<8>& a) |
171 | { |
172 | #if SIMDPP_USE_AVX512DQ |
173 | return _mm512_cvtepu64_ps(a.native()); |
174 | #else |
    return i_to_float32(i_to_float64(a));
176 | #endif |
177 | } |
178 | #endif |
179 | |
180 | #if SIMDPP_USE_AVX512F |
181 | static SIMDPP_INL |
182 | float32<16> i_to_float32(const uint64<16>& a) |
183 | { |
184 | #if SIMDPP_USE_AVX512DQ |
185 | float32<8> r0 = _mm512_cvtepu64_ps(a.vec(0).native()); |
186 | float32<8> r1 = _mm512_cvtepu64_ps(a.vec(1).native()); |
187 | return combine(r0, r1); |
188 | #else |
189 | return i_to_float32(i_to_float64(a)); |
190 | #endif |
191 | } |
192 | #endif |
193 | |
194 | template<unsigned N> SIMDPP_INL |
195 | float32<N> i_to_float32(const uint64<N>& a) |
196 | { |
197 | return i_to_float32(i_to_float64(a)); |
198 | } |
199 | |
200 | // ----------------------------------------------------------------------------- |
201 | |
202 | static SIMDPP_INL |
203 | float32<4> i_to_float32(const int32<4>& a) |
204 | { |
205 | #if SIMDPP_USE_NULL |
206 | float32<4> r; |
207 | for (unsigned i = 0; i < a.length; i++) { |
208 | r.el(i) = float(a.el(i)); |
209 | } |
210 | return r; |
211 | #elif SIMDPP_USE_SSE2 |
212 | return _mm_cvtepi32_ps(a.native()); |
213 | #elif SIMDPP_USE_NEON && !SIMDPP_USE_NEON_FLT_SP |
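    // No hardware single-precision float support: go through memory and
    // convert one scalar at a time.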
214 | detail::mem_block<int32<4>> mi(a); |
215 | float32<4> r; |
216 | r.el(0) = float(mi[0]); |
217 | r.el(1) = float(mi[1]); |
218 | r.el(2) = float(mi[2]); |
219 | r.el(3) = float(mi[3]); |
220 | return r; |
221 | #elif SIMDPP_USE_NEON_FLT_SP |
222 | return vcvtq_f32_s32(a.native()); |
223 | #elif SIMDPP_USE_ALTIVEC |
224 | return vec_ctf(a.native(), 0); |
225 | #elif SIMDPP_USE_MSA |
226 | return __msa_ffint_s_w(a.native()); |
227 | #endif |
228 | } |
229 | |
230 | #if SIMDPP_USE_AVX |
231 | static SIMDPP_INL |
232 | float32x8 i_to_float32(const int32x8& a) |
233 | { |
234 | #if SIMDPP_USE_AVX2 |
235 | return _mm256_cvtepi32_ps(a.native()); |
236 | #else |
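    // AVX1 lacks 256-bit integer instructions, but the 256-bit float
    // conversion (vcvtdq2ps) is available, so reassemble the two 128-bit
    // halves into one 256-bit register first.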
237 | __m256i a1; |
238 | a1 = _mm256_castsi128_si256(a.vec(0).native()); |
239 | a1 = _mm256_insertf128_si256(a1, a.vec(1).native(), 1); |
240 | return _mm256_cvtepi32_ps(a1); |
241 | #endif |
242 | } |
243 | #endif |
244 | |
245 | #if SIMDPP_USE_AVX512F |
246 | static SIMDPP_INL |
247 | float32<16> i_to_float32(const int32<16>& a) |
248 | { |
249 | return _mm512_cvtepi32_ps(a.native()); |
250 | } |
251 | #endif |
252 | |
253 | template<unsigned N> SIMDPP_INL |
254 | float32<N> i_to_float32(const int32<N>& a) |
255 | { |
256 | SIMDPP_VEC_ARRAY_IMPL_CONV_EXTRACT(float32<N>, i_to_float32, a) |
257 | } |
258 | |
259 | // ----------------------------------------------------------------------------- |
260 | |
261 | static SIMDPP_INL |
262 | float32<4> i_to_float32(const uint32<4>& a) |
263 | { |
264 | #if SIMDPP_USE_NULL |
265 | float32<4> r; |
266 | for (unsigned i = 0; i < a.length; i++) { |
267 | r.el(i) = float(a.el(i)); |
268 | } |
269 | return r; |
270 | #elif SIMDPP_USE_SSE2 |
    // true when a is in the range [0x80000000, 0xffffffff]
272 | mask_float32<4> is_large = mask_float32<4>(cmp_lt(int32<4>(a), 0)); |
273 | |
274 | float32<4> f_a = _mm_cvtepi32_ps(a.native()); |
    // f_a has values from the range [0x80000000, 0xffffffff] wrapped around
    // to negative values. Conditionally bias the result by 2^32 to fix that.
    // Note that the result retains sufficient precision even for large
    // argument values: f_a has its lowest precision around 0x80000000 and is
    // exact near 0xffffffff, while the biased result carries the coarser,
    // roughly uniform absolute precision of floats with magnitude near 2^32.
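    // For example, a = 0xfffffff0 converts as the signed value -16, so f_a
    // is -16.0f; adding 2^32 gives 4294967280, which rounds to the nearest
    // representable float, 4294967296.0f.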
281 | return add(f_a, bit_and(is_large, splat<float32<4>>(0x100000000))); |
282 | #elif SIMDPP_USE_NEON && !SIMDPP_USE_NEON_FLT_SP |
283 | detail::mem_block<uint32<4>> mi(a); |
284 | detail::mem_block<float32<4>> mf; |
285 | mf[0] = float(mi[0]); |
286 | mf[1] = float(mi[1]); |
287 | mf[2] = float(mi[2]); |
288 | mf[3] = float(mi[3]); |
289 | return mf; |
290 | #elif SIMDPP_USE_NEON_FLT_SP |
291 | return vcvtq_f32_u32(a.native()); |
292 | #elif SIMDPP_USE_ALTIVEC |
293 | return vec_ctf(a.native(), 0); |
294 | #elif SIMDPP_USE_MSA |
295 | return __msa_ffint_u_w(a.native()); |
296 | #endif |
297 | } |
298 | |
299 | #if SIMDPP_USE_AVX |
300 | static SIMDPP_INL |
301 | float32x8 i_to_float32(const uint32x8& a) |
302 | { |
303 | #if SIMDPP_USE_AVX512F |
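    // Widen to 512 bits, use the native unsigned conversion, then truncate
    // the result back to 256 bits.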
304 | __m512i a512 = _mm512_castsi256_si512(a.native()); |
305 | return _mm512_castps512_ps256(_mm512_cvtepu32_ps(a512)); |
306 | #elif SIMDPP_USE_AVX2 |
    // Same wrap-around bias trick as in the SSE2 uint32<4> path above:
    // true when a is in the range [0x80000000, 0xffffffff]
308 | mask_float32<8> is_large = mask_float32<8>(cmp_lt(int32<8>(a), 0)); |
309 | |
310 | float32<8> f_a = _mm256_cvtepi32_ps(a.native()); |
311 | return add(f_a, bit_and(is_large, splat<float32<8>>(0x100000000))); |
312 | #else |
313 | return combine(i_to_float32(a.vec(0)), i_to_float32(a.vec(1))); |
314 | #endif |
315 | } |
316 | #endif |
317 | |
318 | #if SIMDPP_USE_AVX512F |
319 | static SIMDPP_INL |
320 | float32<16> i_to_float32(const uint32<16>& a) |
321 | { |
322 | return _mm512_cvtepu32_ps(a.native()); |
323 | } |
324 | #endif |
325 | |
326 | template<unsigned N> SIMDPP_INL |
327 | float32<N> i_to_float32(const uint32<N>& a) |
328 | { |
329 | SIMDPP_VEC_ARRAY_IMPL_CONV_EXTRACT(float32<N>, i_to_float32, a) |
330 | } |
331 | |
332 | // ----------------------------------------------------------------------------- |
333 | |
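// Narrow (8- and 16-bit) integers are first widened to 32 bits (see
// conv_extend_to_int32.h) and then converted; float32 represents every
// such value exactly, so no precision is lost.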
334 | template<unsigned N> SIMDPP_INL |
335 | float32<N> i_to_float32(const uint16<N>& a) |
336 | { |
337 | return i_to_float32(i_to_uint32(a)); |
338 | } |
339 | |
340 | template<unsigned N> SIMDPP_INL |
341 | float32<N> i_to_float32(const int16<N>& a) |
342 | { |
343 | return i_to_float32(i_to_int32(a)); |
344 | } |
345 | |
346 | // ----------------------------------------------------------------------------- |
347 | |
348 | template<unsigned N> SIMDPP_INL |
349 | float32<N> i_to_float32(const uint8<N>& a) |
350 | { |
351 | return i_to_float32(i_to_uint32(a)); |
352 | } |
353 | |
354 | template<unsigned N> SIMDPP_INL |
355 | float32<N> i_to_float32(const int8<N>& a) |
356 | { |
357 | return i_to_float32(i_to_int32(a)); |
358 | } |
359 | |
360 | // ----------------------------------------------------------------------------- |
361 | |
362 | } // namespace insn |
363 | } // namespace detail |
364 | } // namespace SIMDPP_ARCH_NAMESPACE |
365 | } // namespace simdpp |
366 | |
367 | #endif |
368 | |