/*  Copyright (C) 2013-2017  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_CONV_EXTEND_TO_INT64_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_CONV_EXTEND_TO_INT64_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/i_shift_r.h>
#include <simdpp/core/move_l.h>
#include <simdpp/core/zip_lo.h>
#include <simdpp/core/zip_hi.h>
#include <simdpp/core/combine.h>
#include <simdpp/detail/insn/conv_extend_to_int32.h>
#include <simdpp/detail/mem_block.h>
#include <simdpp/detail/vector_array_conv_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

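// Widening conversions to 64-bit integer vectors. Each i_to_uint64 /
// i_to_int64 overload takes an unsigned or signed 8/16/32-bit vector and
// returns a vector with the same element count, zero- or sign-extended to
// 64 bits. One overload is provided per native vector width of the selected
// instruction set; the trailing templates handle wider multi-register types.
//
// Callers are not expected to use these directly; they are normally reached
// through the public to_uint64()/to_int64() wrappers, e.g. (illustrative):
//     uint32<4> v = make_uint(1, 2, 3, 4);
//     uint64<4> w = to_uint64(v); // { 1, 2, 3, 4 } in 64-bit lanes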
// -----------------------------------------------------------------------------

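// uint32 -> uint64, zero extension. AVX2 and SSE4.1 use the PMOVZX family;
// plain SSE2, MSA and VSX 2.07 interleave the elements with a zero vector;
// NEON uses vmovl_u32; the NULL and generic ALTIVEC fall-backs go through
// scalar code or memory.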
static SIMDPP_INL
uint64<4> i_to_uint64(const uint32<4>& a)
{
#if SIMDPP_USE_NULL
    uint64x4 r;
    r.vec(0).el(0) = uint64_t(a.el(0));
    r.vec(0).el(1) = uint64_t(a.el(1));
    r.vec(1).el(0) = uint64_t(a.el(2));
    r.vec(1).el(1) = uint64_t(a.el(3));
    return r;
#elif SIMDPP_USE_AVX2
    return _mm256_cvtepu32_epi64(a.native());
#elif SIMDPP_USE_SSE4_1
    uint64x2 r1, r2;
    r1 = _mm_cvtepu32_epi64(a.native());
    r2 = _mm_cvtepu32_epi64(move4_l<2>(a).eval().native());
    return combine(r1, r2);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_MSA || SIMDPP_USE_VSX_207
    return (uint64x4) combine(zip4_lo(a, (uint32x4) make_zero()),
                              zip4_hi(a, (uint32x4) make_zero()));
#elif SIMDPP_USE_NEON
    uint64x2 r1, r2;
    r1 = vmovl_u32(vget_low_u32(a.native()));
    r2 = vmovl_u32(vget_high_u32(a.native()));
    return combine(r1, r2);
#elif SIMDPP_USE_ALTIVEC
    uint64x4 r;
    mem_block<uint32x4> b = a;
    r.vec(0).el(0) = b[0];
    r.vec(0).el(1) = b[1];
    r.vec(1).el(0) = b[2];
    r.vec(1).el(1) = b[3];
    return r;
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint64<8> i_to_uint64(const uint32<8>& a)
{
    uint32<4> a1, a2;
    uint64<4> r1, r2;
    split(a, a1, a2);
    r1 = _mm256_cvtepu32_epi64(a1.native());
    r2 = _mm256_cvtepu32_epi64(a2.native());
    return combine(r1, r2);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint64<16> i_to_uint64(const uint32<16>& a)
{
    uint32<8> a1, a2;
    uint64<8> r1, r2;
    split(a, a1, a2);
    r1 = _mm512_cvtepu32_epi64(a1.native());
    r2 = _mm512_cvtepu32_epi64(a2.native());
    return combine(r1, r2);
}
#endif

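// Generic case for wider vector types: SIMDPP_VEC_ARRAY_IMPL_CONV_INSERT
// applies the single-vector overloads above to each sub-vector of the
// argument and inserts the results into the destination.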
template<unsigned N> SIMDPP_INL
uint64<N> i_to_uint64(const uint32<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_INSERT(uint64<N>, i_to_uint64, a)
}

// -----------------------------------------------------------------------------

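// uint16 -> uint64, zero extension. AVX512F, AVX2 and SSE4.1 convert the
// low elements directly and shift the input left-by-elements to reach the
// remaining ones; targets without a direct 16 -> 64 widening instruction go
// through i_to_uint32 first; ALTIVEC and NULL fall back to element-wise code.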
static SIMDPP_INL
uint64<8> i_to_uint64(const uint16<8>& a)
{
#if SIMDPP_USE_NULL
    uint64<8> r;
    for (unsigned i = 0; i < 8; i++) {
        r.vec(i/2).el(i%2) = uint64_t(a.vec(0).el(i));
    }
    return r;
#elif SIMDPP_USE_AVX512F
    return _mm512_cvtepu16_epi64(a.native());
#elif SIMDPP_USE_AVX2
    uint64<8> r;
    r.vec(0) = _mm256_cvtepu16_epi64(a.native());
    r.vec(1) = _mm256_cvtepu16_epi64(move8_l<4>(a).eval().native());
    return r;
#elif SIMDPP_USE_SSE4_1
    uint64<8> r;
    r.vec(0) = _mm_cvtepu16_epi64(a.native());
    r.vec(1) = _mm_cvtepu16_epi64(move8_l<2>(a).eval().native());
    r.vec(2) = _mm_cvtepu16_epi64(move8_l<4>(a).eval().native());
    r.vec(3) = _mm_cvtepu16_epi64(move8_l<6>(a).eval().native());
    return r;
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_MSA || SIMDPP_USE_VSX_207
    return i_to_uint64(i_to_uint32(a));
#elif SIMDPP_USE_ALTIVEC
    uint64<8> r;
    mem_block<uint16<8>> b = a;
    for (unsigned i = 0; i < 8; i++) {
        r.vec(i/2).el(i%2) = uint64_t(b[i]);
    }
    return r;
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint64<16> i_to_uint64(const uint16<16>& a)
{
#if SIMDPP_USE_AVX512F
    uint64<16> r;
    uint16<8> a0, a1;
    split(a, a0, a1);
    r.vec(0) = _mm512_cvtepu16_epi64(a0.native());
    r.vec(1) = _mm512_cvtepu16_epi64(a1.native());
    return r;
#else
    uint64<16> r;
    uint16<8> a0, a1;
    split(a, a0, a1);
    r.vec(0) = _mm256_cvtepu16_epi64(a0.native());
    r.vec(1) = _mm256_cvtepu16_epi64(move8_l<4>(a0).eval().native());
    r.vec(2) = _mm256_cvtepu16_epi64(a1.native());
    r.vec(3) = _mm256_cvtepu16_epi64(move8_l<4>(a1).eval().native());
    return r;
#endif
}
#endif

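// With AVX512BW the 32 16-bit elements live in a single 512-bit register
// and must be split; without it uint16<32> is already stored as two 256-bit
// halves, so they are taken directly from vec(0)/vec(1).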
#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint64<32> i_to_uint64(const uint16<32>& a)
{
    uint64<32> r;
    uint16<16> a01, a23;
    uint16<8> a0, a1, a2, a3;
#if SIMDPP_USE_AVX512BW
    split(a, a01, a23);
#else
    a01 = a.vec(0);
    a23 = a.vec(1);
#endif
    split(a01, a0, a1);
    split(a23, a2, a3);

    r.vec(0) = _mm512_cvtepu16_epi64(a0.native());
    r.vec(1) = _mm512_cvtepu16_epi64(a1.native());
    r.vec(2) = _mm512_cvtepu16_epi64(a2.native());
    r.vec(3) = _mm512_cvtepu16_epi64(a3.native());
    return r;
}
#endif

template<unsigned N> SIMDPP_INL
uint64<N> i_to_uint64(const uint16<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_INSERT(uint64<N>, i_to_uint64, a)
}

// -----------------------------------------------------------------------------

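// uint8 -> uint64, zero extension. The PMOVZX-based paths convert the low
// bytes of successively shifted copies of the input; other targets widen to
// 32 bits first or go through memory.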
static SIMDPP_INL
uint64<16> i_to_uint64(const uint8<16>& a)
{
#if SIMDPP_USE_NULL
    uint64<16> r;
    for (unsigned i = 0; i < 16; i++) {
        r.vec(i/2).el(i%2) = uint64_t(a.vec(0).el(i));
    }
    return r;
#elif SIMDPP_USE_AVX512F
    uint64<16> r;
    r.vec(0) = _mm512_cvtepu8_epi64(a.native());
    r.vec(1) = _mm512_cvtepu8_epi64(move16_l<8>(a).eval().native());
    return r;
#elif SIMDPP_USE_AVX2
    uint64<16> r;
    r.vec(0) = _mm256_cvtepu8_epi64(a.native());
    r.vec(1) = _mm256_cvtepu8_epi64(move16_l<4>(a).eval().native());
    r.vec(2) = _mm256_cvtepu8_epi64(move16_l<8>(a).eval().native());
    r.vec(3) = _mm256_cvtepu8_epi64(move16_l<12>(a).eval().native());
    return r;
#elif SIMDPP_USE_SSE4_1
    uint64<16> r;
    r.vec(0) = _mm_cvtepu8_epi64(a.native());
    r.vec(1) = _mm_cvtepu8_epi64(move16_l<2>(a).eval().native());
    r.vec(2) = _mm_cvtepu8_epi64(move16_l<4>(a).eval().native());
    r.vec(3) = _mm_cvtepu8_epi64(move16_l<6>(a).eval().native());
    r.vec(4) = _mm_cvtepu8_epi64(move16_l<8>(a).eval().native());
    r.vec(5) = _mm_cvtepu8_epi64(move16_l<10>(a).eval().native());
    r.vec(6) = _mm_cvtepu8_epi64(move16_l<12>(a).eval().native());
    r.vec(7) = _mm_cvtepu8_epi64(move16_l<14>(a).eval().native());
    return r;
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_MSA || SIMDPP_USE_VSX_207
    return i_to_uint64(i_to_uint32(a));
#elif SIMDPP_USE_ALTIVEC
    uint64<16> r;
    mem_block<uint8<16>> b = a;
    for (unsigned i = 0; i < 16; i++) {
        r.vec(i/2).el(i%2) = uint64_t(b[i]);
    }
    return r;
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint64<32> i_to_uint64(const uint8<32>& a)
{
#if SIMDPP_USE_AVX512F
    uint64<32> r;
    uint8<16> a0, a1;
    split(a, a0, a1);
    r.vec(0) = _mm512_cvtepu8_epi64(a0.native());
    r.vec(1) = _mm512_cvtepu8_epi64(move16_l<8>(a0).eval().native());
    r.vec(2) = _mm512_cvtepu8_epi64(a1.native());
    r.vec(3) = _mm512_cvtepu8_epi64(move16_l<8>(a1).eval().native());
    return r;
#else
    uint64<32> r;
    uint8<16> a0, a1;
    split(a, a0, a1);
    r.vec(0) = _mm256_cvtepu8_epi64(a0.native());
    r.vec(1) = _mm256_cvtepu8_epi64(move16_l<4>(a0).eval().native());
    r.vec(2) = _mm256_cvtepu8_epi64(move16_l<8>(a0).eval().native());
    r.vec(3) = _mm256_cvtepu8_epi64(move16_l<12>(a0).eval().native());
    r.vec(4) = _mm256_cvtepu8_epi64(a1.native());
    r.vec(5) = _mm256_cvtepu8_epi64(move16_l<4>(a1).eval().native());
    r.vec(6) = _mm256_cvtepu8_epi64(move16_l<8>(a1).eval().native());
    r.vec(7) = _mm256_cvtepu8_epi64(move16_l<12>(a1).eval().native());
    return r;
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint64<64> i_to_uint64(const uint8<64>& a)
{
    uint64<64> r;
    uint8<32> a01, a23;
    uint8<16> a0, a1, a2, a3;
#if SIMDPP_USE_AVX512BW
    split(a, a01, a23);
#else
    a01 = a.vec(0);
    a23 = a.vec(1);
#endif
    split(a01, a0, a1);
    split(a23, a2, a3);

    r.vec(0) = _mm512_cvtepu8_epi64(a0.native());
    r.vec(1) = _mm512_cvtepu8_epi64(move16_l<8>(a0).eval().native());
    r.vec(2) = _mm512_cvtepu8_epi64(a1.native());
    r.vec(3) = _mm512_cvtepu8_epi64(move16_l<8>(a1).eval().native());
    r.vec(4) = _mm512_cvtepu8_epi64(a2.native());
    r.vec(5) = _mm512_cvtepu8_epi64(move16_l<8>(a2).eval().native());
    r.vec(6) = _mm512_cvtepu8_epi64(a3.native());
    r.vec(7) = _mm512_cvtepu8_epi64(move16_l<8>(a3).eval().native());
    return r;
}
#endif

template<unsigned N> SIMDPP_INL
uint64<N> i_to_uint64(const uint8<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_INSERT(uint64<N>, i_to_uint64, a)
}

// -----------------------------------------------------------------------------

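// int32 -> int64, sign extension. AVX2 and SSE4.1 use the PMOVSX family;
// plain SSE2, MSA and VSX 2.07 interleave each element with its sign word
// (arithmetic shift right by 31); NEON uses vmovl_s32; NULL and generic
// ALTIVEC fall back to scalar code or memory.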
static SIMDPP_INL
int64<4> i_to_int64(const int32<4>& a)
{
#if SIMDPP_USE_NULL
    int64<4> r;
    r.vec(0).el(0) = int64_t(a.el(0));
    r.vec(0).el(1) = int64_t(a.el(1));
    r.vec(1).el(0) = int64_t(a.el(2));
    r.vec(1).el(1) = int64_t(a.el(3));
    return r;
#elif SIMDPP_USE_AVX2
    return _mm256_cvtepi32_epi64(a.native());
#elif SIMDPP_USE_SSE4_1
    int64x2 r1, r2;
    r1 = _mm_cvtepi32_epi64(a.native());
    r2 = _mm_cvtepi32_epi64(move4_l<2>(a).eval().native());
    return combine(r1, r2);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_MSA || SIMDPP_USE_VSX_207
    int32x4 sign = shift_r<31>(a);
    int64x2 lo, hi;
    lo = zip4_lo(a, sign);
    hi = zip4_hi(a, sign);
    return combine(lo, hi);
#elif SIMDPP_USE_NEON
    int64x2 r1, r2;
    r1 = vmovl_s32(vget_low_s32(a.native()));
    r2 = vmovl_s32(vget_high_s32(a.native()));
    return combine(r1, r2);
#elif SIMDPP_USE_ALTIVEC
    int64x4 r;
    mem_block<int32x4> b = a;
    r.vec(0).el(0) = b[0];
    r.vec(0).el(1) = b[1];
    r.vec(1).el(0) = b[2];
    r.vec(1).el(1) = b[3];
    return r;
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int64<8> i_to_int64(const int32<8>& a)
{
    int32<4> a1, a2;
    int64<4> r1, r2;
    split(a, a1, a2);
    r1 = _mm256_cvtepi32_epi64(a1.native());
    r2 = _mm256_cvtepi32_epi64(a2.native());
    return combine(r1, r2);
}
#endif

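// AVX512F: convert each 256-bit half of the source directly, obtained via
// cast/extract rather than split().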
#if SIMDPP_USE_AVX512F
static SIMDPP_INL
int64<16> i_to_int64(const int32<16>& a)
{
    int64<8> r1, r2;
    r1 = _mm512_cvtepi32_epi64(_mm512_castsi512_si256(a.native()));
    r2 = _mm512_cvtepi32_epi64(_mm512_extracti64x4_epi64(a.native(), 1));
    return combine(r1, r2);
}
#endif

template<unsigned N> SIMDPP_INL
int64<N> i_to_int64(const int32<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_INSERT(int64<N>, i_to_int64, a)
}

// -----------------------------------------------------------------------------

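// int16 -> int64, sign extension. Mirrors the unsigned 16-bit conversion;
// targets without a direct widening instruction go through i_to_int32 first.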
static SIMDPP_INL
int64<8> i_to_int64(const int16<8>& a)
{
#if SIMDPP_USE_NULL
    int64<8> r;
    for (unsigned i = 0; i < 8; i++) {
        r.vec(i/2).el(i%2) = int64_t(a.vec(0).el(i));
    }
    return r;
#elif SIMDPP_USE_AVX512F
    return _mm512_cvtepi16_epi64(a.native());
#elif SIMDPP_USE_AVX2
    int64<8> r;
    r.vec(0) = _mm256_cvtepi16_epi64(a.native());
    r.vec(1) = _mm256_cvtepi16_epi64(move8_l<4>(a).eval().native());
    return r;
#elif SIMDPP_USE_SSE4_1
    int64<8> r;
    r.vec(0) = _mm_cvtepi16_epi64(a.native());
    r.vec(1) = _mm_cvtepi16_epi64(move8_l<2>(a).eval().native());
    r.vec(2) = _mm_cvtepi16_epi64(move8_l<4>(a).eval().native());
    r.vec(3) = _mm_cvtepi16_epi64(move8_l<6>(a).eval().native());
    return r;
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_MSA || SIMDPP_USE_VSX_207
    return i_to_int64(i_to_int32(a));
#elif SIMDPP_USE_ALTIVEC
    int64<8> r;
    mem_block<int16<8>> b = a;
    for (unsigned i = 0; i < 8; i++) {
        r.vec(i/2).el(i%2) = int64_t(b[i]);
    }
    return r;
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int64<16> i_to_int64(const int16<16>& a)
{
#if SIMDPP_USE_AVX512F
    int64<16> r;
    int16<8> a0, a1;
    split(a, a0, a1);
    r.vec(0) = _mm512_cvtepi16_epi64(a0.native());
    r.vec(1) = _mm512_cvtepi16_epi64(a1.native());
    return r;
#else
    int64<16> r;
    int16<8> a0, a1;
    split(a, a0, a1);
    r.vec(0) = _mm256_cvtepi16_epi64(a0.native());
    r.vec(1) = _mm256_cvtepi16_epi64(move8_l<4>(a0).eval().native());
    r.vec(2) = _mm256_cvtepi16_epi64(a1.native());
    r.vec(3) = _mm256_cvtepi16_epi64(move8_l<4>(a1).eval().native());
    return r;
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
int64<32> i_to_int64(const int16<32>& a)
{
    int64<32> r;
    int16<16> a01, a23;
    int16<8> a0, a1, a2, a3;
#if SIMDPP_USE_AVX512BW
    split(a, a01, a23);
#else
    a01 = a.vec(0);
    a23 = a.vec(1);
#endif
    split(a01, a0, a1);
    split(a23, a2, a3);

    r.vec(0) = _mm512_cvtepi16_epi64(a0.native());
    r.vec(1) = _mm512_cvtepi16_epi64(a1.native());
    r.vec(2) = _mm512_cvtepi16_epi64(a2.native());
    r.vec(3) = _mm512_cvtepi16_epi64(a3.native());
    return r;
}
#endif

template<unsigned N> SIMDPP_INL
int64<N> i_to_int64(const int16<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_INSERT(int64<N>, i_to_int64, a)
}

// -----------------------------------------------------------------------------

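// int8 -> int64, sign extension. Mirrors the unsigned 8-bit conversion,
// using the signed PMOVSX variants where available.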
static SIMDPP_INL
int64<16> i_to_int64(const int8<16>& a)
{
#if SIMDPP_USE_NULL
    int64<16> r;
    for (unsigned i = 0; i < 16; i++) {
        r.vec(i/2).el(i%2) = int64_t(a.vec(0).el(i));
    }
    return r;
#elif SIMDPP_USE_AVX512F
    int64<16> r;
    r.vec(0) = _mm512_cvtepi8_epi64(a.native());
    r.vec(1) = _mm512_cvtepi8_epi64(move16_l<8>(a).eval().native());
    return r;
#elif SIMDPP_USE_AVX2
    int64<16> r;
    r.vec(0) = _mm256_cvtepi8_epi64(a.native());
    r.vec(1) = _mm256_cvtepi8_epi64(move16_l<4>(a).eval().native());
    r.vec(2) = _mm256_cvtepi8_epi64(move16_l<8>(a).eval().native());
    r.vec(3) = _mm256_cvtepi8_epi64(move16_l<12>(a).eval().native());
    return r;
#elif SIMDPP_USE_SSE4_1
    int64<16> r;
    r.vec(0) = _mm_cvtepi8_epi64(a.native());
    r.vec(1) = _mm_cvtepi8_epi64(move16_l<2>(a).eval().native());
    r.vec(2) = _mm_cvtepi8_epi64(move16_l<4>(a).eval().native());
    r.vec(3) = _mm_cvtepi8_epi64(move16_l<6>(a).eval().native());
    r.vec(4) = _mm_cvtepi8_epi64(move16_l<8>(a).eval().native());
    r.vec(5) = _mm_cvtepi8_epi64(move16_l<10>(a).eval().native());
    r.vec(6) = _mm_cvtepi8_epi64(move16_l<12>(a).eval().native());
    r.vec(7) = _mm_cvtepi8_epi64(move16_l<14>(a).eval().native());
    return r;
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_MSA || SIMDPP_USE_VSX_207
    return i_to_int64(i_to_int32(a));
#elif SIMDPP_USE_ALTIVEC
    int64<16> r;
    mem_block<int8<16>> b = a;
    for (unsigned i = 0; i < 16; i++) {
        r.vec(i/2).el(i%2) = int64_t(b[i]);
    }
    return r;
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int64<32> i_to_int64(const int8<32>& a)
{
#if SIMDPP_USE_AVX512F
    int64<32> r;
    int8<16> a0, a1;
    split(a, a0, a1);
    r.vec(0) = _mm512_cvtepi8_epi64(a0.native());
    r.vec(1) = _mm512_cvtepi8_epi64(move16_l<8>(a0).eval().native());
    r.vec(2) = _mm512_cvtepi8_epi64(a1.native());
    r.vec(3) = _mm512_cvtepi8_epi64(move16_l<8>(a1).eval().native());
    return r;
#else
    int64<32> r;
    int8<16> a0, a1;
    split(a, a0, a1);
    r.vec(0) = _mm256_cvtepi8_epi64(a0.native());
    r.vec(1) = _mm256_cvtepi8_epi64(move16_l<4>(a0).eval().native());
    r.vec(2) = _mm256_cvtepi8_epi64(move16_l<8>(a0).eval().native());
    r.vec(3) = _mm256_cvtepi8_epi64(move16_l<12>(a0).eval().native());
    r.vec(4) = _mm256_cvtepi8_epi64(a1.native());
    r.vec(5) = _mm256_cvtepi8_epi64(move16_l<4>(a1).eval().native());
    r.vec(6) = _mm256_cvtepi8_epi64(move16_l<8>(a1).eval().native());
    r.vec(7) = _mm256_cvtepi8_epi64(move16_l<12>(a1).eval().native());
    return r;
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
int64<64> i_to_int64(const int8<64>& a)
{
    int64<64> r;
    int8<32> a01, a23;
    int8<16> a0, a1, a2, a3;
#if SIMDPP_USE_AVX512BW
    split(a, a01, a23);
#else
    a01 = a.vec(0);
    a23 = a.vec(1);
#endif
    split(a01, a0, a1);
    split(a23, a2, a3);

    r.vec(0) = _mm512_cvtepi8_epi64(a0.native());
    r.vec(1) = _mm512_cvtepi8_epi64(move16_l<8>(a0).eval().native());
    r.vec(2) = _mm512_cvtepi8_epi64(a1.native());
    r.vec(3) = _mm512_cvtepi8_epi64(move16_l<8>(a1).eval().native());
    r.vec(4) = _mm512_cvtepi8_epi64(a2.native());
    r.vec(5) = _mm512_cvtepi8_epi64(move16_l<8>(a2).eval().native());
    r.vec(6) = _mm512_cvtepi8_epi64(a3.native());
    r.vec(7) = _mm512_cvtepi8_epi64(move16_l<8>(a3).eval().native());
    return r;
}
#endif

template<unsigned N> SIMDPP_INL
int64<N> i_to_int64(const int8<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_INSERT(int64<N>, i_to_int64, a)
}

// -----------------------------------------------------------------------------

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif