/*  Copyright (C) 2013-2017  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_CONV_EXTEND_TO_INT32_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_CONV_EXTEND_TO_INT32_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/combine.h>
#include <simdpp/detail/insn/conv_extend_to_int16.h>
#include <simdpp/core/i_shift_r.h>
#include <simdpp/core/move_l.h>
#include <simdpp/core/zip_hi.h>
#include <simdpp/core/zip_lo.h>
#include <simdpp/core/unzip_lo.h>
#include <simdpp/detail/vector_array_conv_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

// -----------------------------------------------------------------------------

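// Widens each 16-bit element of 'a' to 32 bits with zero extension. AVX2 and
// SSE4.1 provide dedicated zero-extending moves; on SSE2, MSA and
// little-endian Altivec the same effect is obtained by interleaving the
// source with a zero vector, and big-endian Altivec swaps the interleave
// operands to account for the reversed lane order.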
static SIMDPP_INL
uint32<8> i_to_uint32(const uint16<8>& a)
{
#if SIMDPP_USE_NULL
    uint32<8> r;
    for (unsigned i = 0; i < r.length; i++) {
        r.vec(i/4).el(i%4) = uint32_t(a.vec(0).el(i));
    }
    return r;
#elif SIMDPP_USE_AVX2
    return _mm256_cvtepu16_epi32(a.native());
#elif SIMDPP_USE_SSE4_1
    uint32<8> r;
    r.vec(0) = _mm_cvtepu16_epi32(a.native());
    r.vec(1) = _mm_cvtepu16_epi32(move8_l<4>(a).eval().native());
    return r;
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_MSA || (SIMDPP_USE_ALTIVEC && SIMDPP_LITTLE_ENDIAN)
    uint16<8> zero = make_zero();
    uint32<8> r;
    r.vec(0) = zip8_lo(a, zero);
    r.vec(1) = zip8_hi(a, zero);
    return r;
#elif (SIMDPP_USE_ALTIVEC && SIMDPP_BIG_ENDIAN)
    uint16<8> zero = make_zero();
    uint32<8> r;
    r.vec(0) = zip8_lo(zero, a);
    r.vec(1) = zip8_hi(zero, a);
    return r;
#elif SIMDPP_USE_NEON
    uint32<8> r;
    r.vec(0) = vmovl_u16(vget_low_u16(a.vec(0).native()));
    r.vec(1) = vmovl_u16(vget_high_u16(a.vec(0).native()));
    return r;
#endif
}

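// For wider inputs the source is split into native-width halves and each half
// is widened separately; under AVX-512F a single 512-bit zero-extending move
// handles the whole vector.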
#if SIMDPP_USE_AVX2
SIMDPP_INL uint32<16> i_to_uint32(const uint16<16>& a)
{
#if SIMDPP_USE_AVX512F
    return _mm512_cvtepu16_epi32(a.native());
#else
    uint32<16> r;
    uint16<8> a0, a1;
    split(a, a0, a1);
    r.vec(0) = _mm256_cvtepu16_epi32(a0.native());
    r.vec(1) = _mm256_cvtepu16_epi32(a1.native());
    return r;
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint32<32> i_to_uint32(const uint16<32>& a)
{
    uint32<32> r;
    uint16<16> a0, a1;
    split(a, a0, a1);
    r.vec(0) = _mm512_cvtepu16_epi32(a0.native());
    r.vec(1) = _mm512_cvtepu16_epi32(a1.native());
    return r;
}
#endif

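// Generic case for arbitrary vector lengths: the conversion is applied
// piecewise over the source vector array and the partial results are
// inserted into the destination.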
template<unsigned N> SIMDPP_INL
uint32<N> i_to_uint32(const uint16<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_INSERT(uint32<N>, i_to_uint32, a)
}

// -----------------------------------------------------------------------------

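// Widens each 8-bit element of 'a' to 32 bits with zero extension. The SSE4.1
// instruction zero-extends four bytes at a time, so the source is shifted
// across the register with move16_l between conversions; targets without a
// dedicated instruction go through the 8-to-16-bit extension first.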
static SIMDPP_INL
uint32<16> i_to_uint32(const uint8<16>& a)
{
#if SIMDPP_USE_NULL
    uint32<16> r;
    for (unsigned i = 0; i < r.length; i++) {
        r.vec(i/4).el(i%4) = uint32_t(a.vec(0).el(i));
    }
    return r;
#elif SIMDPP_USE_AVX512F
    return _mm512_cvtepu8_epi32(a.native());
#elif SIMDPP_USE_AVX2
    uint32<16> r;
    r.vec(0) = _mm256_cvtepu8_epi32(a.native());
    r.vec(1) = _mm256_cvtepu8_epi32(move16_l<8>(a).eval().native());
    return r;
#elif SIMDPP_USE_SSE4_1
    uint32<16> r;
    r.vec(0) = _mm_cvtepu8_epi32(a.native());
    r.vec(1) = _mm_cvtepu8_epi32(move16_l<4>(a).eval().native());
    r.vec(2) = _mm_cvtepu8_epi32(move16_l<8>(a).eval().native());
    r.vec(3) = _mm_cvtepu8_epi32(move16_l<12>(a).eval().native());
    return r;
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    return i_to_uint32(i_to_uint16(a));
#endif
}

#if SIMDPP_USE_AVX2
SIMDPP_INL uint32<32> i_to_uint32(const uint8<32>& a)
{
#if SIMDPP_USE_AVX512F
    uint32<32> r;
    uint8<16> a0, a1;
    split(a, a0, a1);
    r.vec(0) = _mm512_cvtepu8_epi32(a0.native());
    r.vec(1) = _mm512_cvtepu8_epi32(a1.native());
    return r;
#else
    uint32<32> r;
    uint8<16> a0, a1;
    split(a, a0, a1);
    r.vec(0) = _mm256_cvtepu8_epi32(a0.native());
    r.vec(1) = _mm256_cvtepu8_epi32(move16_l<8>(a0).eval().native());
    r.vec(2) = _mm256_cvtepu8_epi32(a1.native());
    r.vec(3) = _mm256_cvtepu8_epi32(move16_l<8>(a1).eval().native());
    return r;
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint32<64> i_to_uint32(const uint8<64>& a)
{
    uint32<64> r;
    uint8<32> a01, a23;
    uint8<16> a0, a1, a2, a3;
    split(a, a01, a23);
    split(a01, a0, a1);
    split(a23, a2, a3);

    r.vec(0) = _mm512_cvtepu8_epi32(a0.native());
    r.vec(1) = _mm512_cvtepu8_epi32(a1.native());
    r.vec(2) = _mm512_cvtepu8_epi32(a2.native());
    r.vec(3) = _mm512_cvtepu8_epi32(a3.native());
    return r;
}
#endif

template<unsigned N> SIMDPP_INL
uint32<N> i_to_uint32(const uint8<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_INSERT(uint32<N>, i_to_uint32, a)
}

// -----------------------------------------------------------------------------

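// Widens each 16-bit element of 'a' to 32 bits with sign extension. Where no
// dedicated instruction exists, an arithmetic right shift by 15 replicates
// the sign bit across a whole 16-bit lane (0x0000 for non-negative values,
// 0xffff for negative ones), and interleaving the source with these sign
// words yields the sign-extended 32-bit result. For example, -2 (0xfffe)
// paired with its sign word 0xffff forms 0xfffffffe, which is -2 as int32.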
static SIMDPP_INL
int32<8> i_to_int32(const int16<8>& a)
{
#if SIMDPP_USE_NULL
    int32<8> r;
    for (unsigned i = 0; i < r.length; i++) {
        r.vec(i/4).el(i%4) = int32_t(a.vec(0).el(i));
    }
    return r;
#elif SIMDPP_USE_AVX2
    return _mm256_cvtepi16_epi32(a.native());
#elif SIMDPP_USE_SSE4_1
    int32x8 r;
    r.vec(0) = _mm_cvtepi16_epi32(a.native());
    r.vec(1) = _mm_cvtepi16_epi32(move8_l<4>(a).eval().native());
    return r;
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_MSA
    int16x8 sign = shift_r<15>(a);
    int32x4 lo, hi;
    lo = zip8_lo(a, sign);
    hi = zip8_hi(a, sign);
    return combine(lo, hi);
#elif SIMDPP_USE_NEON
    int32x8 r;
    r.vec(0) = vmovl_s16(vget_low_s16(a.vec(0).native()));
    r.vec(1) = vmovl_s16(vget_high_s16(a.vec(0).native()));
    return r;
#elif SIMDPP_USE_ALTIVEC
    int32x4 b0, b1;
    b0 = vec_unpackh((__vector int16_t)a.vec(0).native());
    b1 = vec_unpackl((__vector int16_t)a.vec(0).native());
    return combine(b0, b1);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int32<16> i_to_int32(const int16<16>& a)
{
#if SIMDPP_USE_AVX512F
    return _mm512_cvtepi16_epi32(a.native());
#else
    int32<8> r0, r1;
    int16<8> a0, a1;
    split(a, a0, a1);
    r0 = _mm256_cvtepi16_epi32(a0.native());
    r1 = _mm256_cvtepi16_epi32(a1.native());
    return combine(r0, r1);
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL int32<32> i_to_int32(const int16<32>& a)
{
    int32<16> r0, r1;
    int16<16> a0, a1;
    split(a, a0, a1);
    r0 = _mm512_cvtepi16_epi32(a0.native());
    r1 = _mm512_cvtepi16_epi32(a1.native());
    return combine(r0, r1);
}
#endif

template<unsigned N> SIMDPP_INL
int32<N> i_to_int32(const int16<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_INSERT(int32<N>, i_to_int32, a)
}

// -----------------------------------------------------------------------------

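// Widens each 8-bit element of 'a' to 32 bits with sign extension, following
// the same shift-and-convert layout as the unsigned 8-to-32-bit case above;
// targets without a dedicated instruction sign-extend to 16 bits first.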
static SIMDPP_INL
int32<16> i_to_int32(const int8<16>& a)
{
#if SIMDPP_USE_NULL
    int32<16> r;
    for (unsigned i = 0; i < r.length; i++) {
        r.vec(i/4).el(i%4) = int32_t(a.vec(0).el(i));
    }
    return r;
#elif SIMDPP_USE_AVX512F
    return _mm512_cvtepi8_epi32(a.native());
#elif SIMDPP_USE_AVX2
    int32<16> r;
    r.vec(0) = _mm256_cvtepi8_epi32(a.native());
    r.vec(1) = _mm256_cvtepi8_epi32(move16_l<8>(a).eval().native());
    return r;
#elif SIMDPP_USE_SSE4_1
    int32<16> r;
    r.vec(0) = _mm_cvtepi8_epi32(a.native());
    r.vec(1) = _mm_cvtepi8_epi32(move16_l<4>(a).eval().native());
    r.vec(2) = _mm_cvtepi8_epi32(move16_l<8>(a).eval().native());
    r.vec(3) = _mm_cvtepi8_epi32(move16_l<12>(a).eval().native());
    return r;
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    return i_to_int32(i_to_int16(a));
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int32<32> i_to_int32(const int8<32>& a)
{
#if SIMDPP_USE_AVX512F
    int32<32> r;
    int8<16> a0, a1;
    split(a, a0, a1);
    r.vec(0) = _mm512_cvtepi8_epi32(a0.native());
    r.vec(1) = _mm512_cvtepi8_epi32(a1.native());
    return r;
#else
    int32<32> r;
    int8<16> a0, a1;
    split(a, a0, a1);
    r.vec(0) = _mm256_cvtepi8_epi32(a0.native());
    r.vec(1) = _mm256_cvtepi8_epi32(move16_l<8>(a0).eval().native());
    r.vec(2) = _mm256_cvtepi8_epi32(a1.native());
    r.vec(3) = _mm256_cvtepi8_epi32(move16_l<8>(a1).eval().native());
    return r;
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL int32<64> i_to_int32(const int8<64>& a)
{
    int32<64> r;
    int8<32> a01, a23;
    int8<16> a0, a1, a2, a3;
    split(a, a01, a23);
    split(a01, a0, a1);
    split(a23, a2, a3);

    r.vec(0) = _mm512_cvtepi8_epi32(a0.native());
    r.vec(1) = _mm512_cvtepi8_epi32(a1.native());
    r.vec(2) = _mm512_cvtepi8_epi32(a2.native());
    r.vec(3) = _mm512_cvtepi8_epi32(a3.native());
    return r;
}
#endif

template<unsigned N> SIMDPP_INL
int32<N> i_to_int32(const int8<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_INSERT(int32<N>, i_to_int32, a)
}
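
// Usage sketch (hypothetical, assuming the public to_uint32()/to_int32()
// wrappers dispatch to these implementations):
//
//   uint16<8> a = ...;
//   uint32<8> w = to_uint32(a); // zero-extends each element
//   int16<8>  b = ...;
//   int32<8>  s = to_int32(b);  // sign-extends each element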

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif