/*  Copyright (C) 2013-2017  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_CONV_ANY_TO_FLOAT64_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_CONV_ANY_TO_FLOAT64_H

#ifndef LIBSIMDPP_SIMD_H
#error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/blend.h>
#include <simdpp/core/cmp_lt.h>
#include <simdpp/core/combine.h>
#include <simdpp/core/f_add.h>
#include <simdpp/core/f_neg.h>
#include <simdpp/core/i_shift_r.h>
#include <simdpp/core/move_l.h>
#include <simdpp/core/zip_lo.h>
#include <simdpp/core/zip_hi.h>
#include <simdpp/core/detail/subvec_insert.h>
#include <simdpp/detail/mem_block.h>
#include <simdpp/detail/insn/conv_extend_to_int32.h>
#include <simdpp/detail/insn/conv_extend_to_int64.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {


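// float32 -> float64
//
// The result occupies twice as many bytes as the source, so a full float32
// vector converts into two float64 vectors' worth of data. AVX and AVX-512
// can widen in a single instruction; the other targets convert the low and
// high halves of the input separately and combine the results, or fall back
// to an element-wise conversion through memory.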
static SIMDPP_INL
float64x4 i_to_float64(const float32x4& a)
{
#if SIMDPP_USE_AVX
    return _mm256_cvtps_pd(a.native());
#elif SIMDPP_USE_SSE2
    float64x2 r1, r2;
    r1 = _mm_cvtps_pd(a.native());
    r2 = _mm_cvtps_pd(move4_l<2>(a).eval().native());
    return combine(r1, r2);
#elif SIMDPP_USE_NEON64
    float64<2> r1, r2;
    r1 = vcvt_f64_f32(vget_low_f32(a.native()));
    r2 = vcvt_high_f64_f32(a.native());
    return combine(r1, r2);
#elif SIMDPP_USE_VSX_206
    float32<4> lo, hi;
#if SIMDPP_BIG_ENDIAN
    lo = zip4_lo(a, (float32<4>) make_zero());
    hi = zip4_hi(a, (float32<4>) make_zero());
#else
    lo = zip4_lo((float32<4>) make_zero(), a);
    hi = zip4_hi((float32<4>) make_zero(), a);
#endif
    float64<2> lo_f, hi_f;
    lo_f = __builtin_vsx_xvcvspdp(lo.native());
    hi_f = __builtin_vsx_xvcvspdp(hi.native());
    return combine(lo_f, hi_f);
#elif SIMDPP_USE_MSA
    float64<2> lo, hi;
    lo = __msa_fexupr_d(a.native());
    hi = __msa_fexupl_d(a.native());
    return combine(lo, hi);
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
    detail::mem_block<float32x4> ax(a);
    float64x4 r;
    r.vec(0).el(0) = double(ax[0]);
    r.vec(0).el(1) = double(ax[1]);
    r.vec(1).el(0) = double(ax[2]);
    r.vec(1).el(1) = double(ax[3]);
    return r;
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
float64<8> i_to_float64(const float32x8& a)
{
#if SIMDPP_USE_AVX512F
    return _mm512_cvtps_pd(a.native());
#else
    float64x4 r1, r2;
    float32x4 a1, a2;
    split(a, a1, a2);
    r1 = _mm256_cvtps_pd(a1.native());
    r2 = _mm256_cvtps_pd(a2.native());
    return combine(r1, r2);
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
float64<16> i_to_float64(const float32<16>& a)
{
    float64<8> r1, r2;
    float32<8> a1, a2;
    split(a, a1, a2);
    r1 = _mm512_cvtps_pd(a1.native());
    r2 = _mm512_cvtps_pd(a2.native());
    return combine(r1, r2);
}
#endif

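// Generic overload: vectors wider than the native width are converted one
// native-width sub-vector at a time. The same dispatch pattern is repeated
// for every source type in this file.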
template<unsigned N> SIMDPP_INL
float64<N> i_to_float64(const float32<N>& a)
{
    float64<N> r;
    for (unsigned i = 0; i < a.vec_length; ++i) {
        detail::subvec_insert(r, i_to_float64(a.vec(i)), i);
    }
    return r;
}

// -----------------------------------------------------------------------------

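// int32 -> float64. Every 32-bit signed integer is exactly representable in a
// double (53-bit significand), so this conversion is exact on all targets.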
static SIMDPP_INL
float64x4 i_to_float64(const int32x4& a)
{
#if SIMDPP_USE_AVX
    return _mm256_cvtepi32_pd(a.native());
#elif SIMDPP_USE_SSE2
    float64x2 r1, r2;
    r1 = _mm_cvtepi32_pd(a.native());
    r2 = _mm_cvtepi32_pd(move4_l<2>(a).eval().native());
    return combine(r1, r2);
#elif SIMDPP_USE_NEON64
    float64<2> r1, r2;
    r1 = vcvtq_f64_s64(vmovl_s32(vget_low_s32(a.native())));
    r2 = vcvtq_f64_s64(vmovl_s32(vget_high_s32(a.native())));
    return combine(r1, r2);
#elif SIMDPP_USE_VSX_206
#if SIMDPP_USE_VSX_207
    int64<4> a64 = i_to_int64(a);
    __vector int64_t b0 = a64.vec(0).native();
    __vector int64_t b1 = a64.vec(1).native();
#else
    int32<4> sign = shift_r<31>(a);
    __vector int64_t b0 = (__vector int64_t) vec_mergeh(a.native(), sign.native());
    __vector int64_t b1 = (__vector int64_t) vec_mergel(a.native(), sign.native());
#endif

    float64<4> r;
    r.vec(0) = vec_ctf(b0, 0);
    r.vec(1) = vec_ctf(b1, 0);
    return r;
#elif SIMDPP_USE_MSA
    int64<4> a64 = i_to_int64(a);
    float64<4> r;
    r.vec(0) = __msa_ffint_s_d(a64.vec(0).native());
    r.vec(1) = __msa_ffint_s_d(a64.vec(1).native());
    return r;
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC
    detail::mem_block<int32x4> ax(a);
    float64x4 r;
    r.vec(0).el(0) = double(ax[0]);
    r.vec(0).el(1) = double(ax[1]);
    r.vec(1).el(0) = double(ax[2]);
    r.vec(1).el(1) = double(ax[3]);
    return r;
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
float64<8> i_to_float64(const int32x8& a)
{
#if SIMDPP_USE_AVX512F
    return _mm512_cvtepi32_pd(a.native());
#else
    float64x4 r1, r2;
    int32x4 a1, a2;
    split(a, a1, a2);
    r1 = _mm256_cvtepi32_pd(a1.native());
    r2 = _mm256_cvtepi32_pd(a2.native());
    return combine(r1, r2);
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
float64<16> i_to_float64(const int32<16>& a)
{
    float64<8> r1, r2;
    r1 = _mm512_cvtepi32_pd(_mm512_castsi512_si256(a.native()));
    r2 = _mm512_cvtepi32_pd(_mm512_extracti64x4_epi64(a.native(), 1));
    return combine(r1, r2);
}
#endif

template<unsigned N> SIMDPP_INL
float64<N> i_to_float64(const int32<N>& a)
{
    float64<N> r;
    for (unsigned i = 0; i < a.vec_length; ++i) {
        detail::subvec_insert(r, i_to_float64(a.vec(i)), i);
    }
    return r;
}

// -----------------------------------------------------------------------------

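// uint32 -> float64. Prior to AVX-512 there is no unsigned 32-bit to double
// conversion on x86, so the SSE/AVX paths convert the value as a signed
// integer and then add 2^32 back to any lane that came out negative, i.e.
// whose source value was >= 0x80000000.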
static SIMDPP_INL
float64<4> i_to_float64(const uint32<4>& a)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cvtepu32_pd(a.native());
#elif SIMDPP_USE_SSE2
    float64<4> f;
#if SIMDPP_USE_AVX
    f = _mm256_cvtepi32_pd(a.native());
#else
    f.vec(0) = _mm_cvtepi32_pd(a.native());
    f.vec(1) = _mm_cvtepi32_pd(move4_l<2>(a).eval().native());
#endif
    // A negative result means the source value was larger than 0x7fffffff and
    // was wrapped by the signed conversion; add 2^32 to correct those lanes.
    mask_float64<4> is_large = cmp_lt(f, 0);
    return blend(add(f, (double)0x100000000), f, is_large);
#elif SIMDPP_USE_NEON64
    float64<2> r1, r2;
    r1 = vcvtq_f64_u64(vmovl_u32(vget_low_u32(a.native())));
    r2 = vcvtq_f64_u64(vmovl_u32(vget_high_u32(a.native())));
    return combine(r1, r2);
#elif SIMDPP_USE_VSX_206
#if SIMDPP_USE_VSX_207
    uint64<4> a64 = i_to_uint64(a);
    __vector uint64_t b0 = a64.vec(0).native();
    __vector uint64_t b1 = a64.vec(1).native();
#else
    uint32<4> zero = make_zero();
    __vector uint64_t b0 = (__vector uint64_t) vec_mergeh(a.native(), zero.native());
    __vector uint64_t b1 = (__vector uint64_t) vec_mergel(a.native(), zero.native());
#endif

    float64<4> r;
    r.vec(0) = vec_ctf(b0, 0);
    r.vec(1) = vec_ctf(b1, 0);
    return r;
#elif SIMDPP_USE_MSA
    uint64<4> a64 = i_to_uint64(a);
    float64<4> r;
    r.vec(0) = __msa_ffint_u_d(a64.vec(0).native());
    r.vec(1) = __msa_ffint_u_d(a64.vec(1).native());
    return r;
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC
    detail::mem_block<uint32<4>> ax(a);
    float64<4> r;
    r.vec(0).el(0) = double(ax[0]);
    r.vec(0).el(1) = double(ax[1]);
    r.vec(1).el(0) = double(ax[2]);
    r.vec(1).el(1) = double(ax[3]);
    return r;
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
float64<8> i_to_float64(const uint32<8>& a)
{
#if SIMDPP_USE_AVX512F
    return _mm512_cvtepu32_pd(a.native());
#else
    uint32<4> a0, a1;
    float64<8> f;
    split(a, a0, a1);

    f.vec(0) = _mm256_cvtepi32_pd(a0.native());
    f.vec(1) = _mm256_cvtepi32_pd(a1.native());

    // A negative result means the source value was larger than 0x7fffffff and
    // was wrapped by the signed conversion; add 2^32 to correct those lanes.
    mask_float64<8> is_large = cmp_lt(f, 0);
    return blend(add(f, (double)0x100000000), f, is_large);
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
float64<16> i_to_float64(const uint32<16>& a)
{
    float64<16> r;
    uint32<8> a0, a1;
    split(a, a0, a1);

    r.vec(0) = _mm512_cvtepu32_pd(a0.native());
    r.vec(1) = _mm512_cvtepu32_pd(a1.native());
    return r;
}
#endif

template<unsigned N> SIMDPP_INL
float64<N> i_to_float64(const uint32<N>& a)
{
    float64<N> r;
    for (unsigned i = 0; i < a.vec_length; ++i) {
        detail::subvec_insert(r, i_to_float64(a.vec(i)), i);
    }
    return r;
}

// -----------------------------------------------------------------------------

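// 8-bit and 16-bit integers are first widened to 32-bit integers and then
// converted through the 32-bit paths above.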
template<unsigned N> SIMDPP_INL
float64<N> i_to_float64(const uint16<N>& a)
{
    return i_to_float64(i_to_uint32(a));
}

template<unsigned N> SIMDPP_INL
float64<N> i_to_float64(const int16<N>& a)
{
    return i_to_float64(i_to_int32(a));
}

// -----------------------------------------------------------------------------

template<unsigned N> SIMDPP_INL
float64<N> i_to_float64(const uint8<N>& a)
{
    return i_to_float64(i_to_uint32(a));
}

template<unsigned N> SIMDPP_INL
float64<N> i_to_float64(const int8<N>& a)
{
    return i_to_float64(i_to_int32(a));
}

// -----------------------------------------------------------------------------

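// int64 -> float64. A native 64-bit integer conversion is only available with
// AVX-512, NEON64, VSX and MSA; other targets either convert element by
// element through memory or report the operation as not implemented.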
static SIMDPP_INL
float64<2> i_to_float64(const int64<2>& a)
{
#if SIMDPP_USE_AVX512VL
    return _mm_cvtepi64_pd(a.native());
#elif SIMDPP_USE_NEON64
    return vcvtq_f64_s64(a.native());
#elif SIMDPP_USE_VSX_207
    return vec_ctf(a.native(), 0);
#elif SIMDPP_USE_VSX_206
    int32<4> a32; a32 = a; // a stores 64-bit values in GPR
    return vec_ctf((__vector int64_t)a32.native(), 0);
#elif SIMDPP_USE_MSA
    return __msa_ffint_s_d(a.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC
    detail::mem_block<int64<2>> ax(a);
    float64<2> r;
    r.el(0) = double(ax[0]);
    r.el(1) = double(ax[1]);
    return r;
#else
    return SIMDPP_NOT_IMPLEMENTED1(a);
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
float64<4> i_to_float64(const int64<4>& a)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cvtepi64_pd(a.native());
#else
    return SIMDPP_NOT_IMPLEMENTED1(a);
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
float64<8> i_to_float64(const int64<8>& a)
{
    return _mm512_cvtepi64_pd(a.native());
}
#endif

template<unsigned N> SIMDPP_INL
float64<N> i_to_float64(const int64<N>& a)
{
    float64<N> r;
    for (unsigned i = 0; i < a.vec_length; ++i) {
        detail::subvec_insert(r, i_to_float64(a.vec(i)), i);
    }
    return r;
}

// -----------------------------------------------------------------------------

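// uint64 -> float64, mirroring the signed 64-bit case above.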
static SIMDPP_INL
float64<2> i_to_float64(const uint64<2>& a)
{
#if SIMDPP_USE_AVX512VL
    return _mm_cvtepu64_pd(a.native());
#elif SIMDPP_USE_NEON64
    return vcvtq_f64_u64(a.native());
#elif SIMDPP_USE_VSX_207
    return vec_ctf(a.native(), 0);
#elif SIMDPP_USE_VSX_206
    uint32<4> a32; a32 = a; // a stores 64-bit values in GPR
    return vec_ctf((__vector uint64_t)a32.native(), 0);
#elif SIMDPP_USE_MSA
    return __msa_ffint_u_d(a.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC
    detail::mem_block<uint64<2>> ax(a);
    float64<2> r;
    r.el(0) = double(ax[0]);
    r.el(1) = double(ax[1]);
    return r;
#else
    return SIMDPP_NOT_IMPLEMENTED1(a);
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
float64<4> i_to_float64(const uint64<4>& a)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cvtepu64_pd(a.native());
#else
    return SIMDPP_NOT_IMPLEMENTED1(a);
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
float64<8> i_to_float64(const uint64<8>& a)
{
    return _mm512_cvtepu64_pd(a.native());
}
#endif

template<unsigned N> SIMDPP_INL
float64<N> i_to_float64(const uint64<N>& a)
{
    float64<N> r;
    for (unsigned i = 0; i < a.vec_length; ++i) {
        detail::subvec_insert(r, i_to_float64(a.vec(i)), i);
    }
    return r;
}

// -----------------------------------------------------------------------------

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif