/*  Copyright (C) 2013-2017  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_CONV_EXTEND_TO_INT32_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_CONV_EXTEND_TO_INT32_H

#ifndef LIBSIMDPP_SIMD_H
#error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/combine.h>
#include <simdpp/core/split.h>
#include <simdpp/detail/insn/conv_extend_to_int16.h>
#include <simdpp/core/i_shift_r.h>
#include <simdpp/core/move_l.h>
#include <simdpp/core/zip_hi.h>
#include <simdpp/core/zip_lo.h>
#include <simdpp/core/unzip_lo.h>
#include <simdpp/detail/vector_array_conv_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {
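
// Widening conversions to 32-bit integer elements. The i_to_uint32 and
// i_to_int32 helpers below back the public to_uint32() / to_int32()
// conversions, with one overload per supported vector width.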

// -----------------------------------------------------------------------------
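// Zero extension: uint16 -> uint32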

static SIMDPP_INL
uint32<8> i_to_uint32(const uint16<8>& a)
{
#if SIMDPP_USE_NULL
    uint32<8> r;
    for (unsigned i = 0; i < r.length; i++) {
        r.vec(i/4).el(i%4) = uint32_t(a.vec(0).el(i));
    }
    return r;
#elif SIMDPP_USE_AVX2
    return _mm256_cvtepu16_epi32(a.native());
#elif SIMDPP_USE_SSE4_1
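    // _mm_cvtepu16_epi32 zero-extends only the low four lanes, so shift the
    // upper four down with move8_l and convert them separately.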
    uint32<8> r;
    r.vec(0) = _mm_cvtepu16_epi32(a.native());
    r.vec(1) = _mm_cvtepu16_epi32(move8_l<4>(a).eval().native());
    return r;
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_MSA || (SIMDPP_USE_ALTIVEC && SIMDPP_LITTLE_ENDIAN)
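    // Interleaving with zero places a zero word above each data word; on
    // little-endian targets the data word comes first within each 32-bit lane.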
    uint16<8> zero = make_zero();
    uint32<8> r;
    r.vec(0) = zip8_lo(a, zero);
    r.vec(1) = zip8_hi(a, zero);
    return r;
#elif (SIMDPP_USE_ALTIVEC && SIMDPP_BIG_ENDIAN)
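    // On big-endian the zero word must precede the data word, so the zip
    // operand order is reversed.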
    uint16<8> zero = make_zero();
    uint32<8> r;
    r.vec(0) = zip8_lo(zero, a);
    r.vec(1) = zip8_hi(zero, a);
    return r;
#elif SIMDPP_USE_NEON
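    // vmovl_u16 zero-extends four lanes at a time, once per half of the input.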
    uint32<8> r;
    r.vec(0) = vmovl_u16(vget_low_u16(a.vec(0).native()));
    r.vec(1) = vmovl_u16(vget_high_u16(a.vec(0).native()));
    return r;
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32<16> i_to_uint32(const uint16<16>& a)
{
#if SIMDPP_USE_AVX512F
    return _mm512_cvtepu16_epi32(a.native());
#else
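    // Without AVX512F, widen each 128-bit half into its own 256-bit vector.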
    uint32<16> r;
    uint16<8> a0, a1;
    split(a, a0, a1);
    r.vec(0) = _mm256_cvtepu16_epi32(a0.native());
    r.vec(1) = _mm256_cvtepu16_epi32(a1.native());
    return r;
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
uint32<32> i_to_uint32(const uint16<32>& a)
{
    uint32<32> r;
    uint16<16> a0, a1;
    split(a, a0, a1);
    r.vec(0) = _mm512_cvtepu16_epi32(a0.native());
    r.vec(1) = _mm512_cvtepu16_epi32(a1.native());
    return r;
}
#endif

template<unsigned N> SIMDPP_INL
uint32<N> i_to_uint32(const uint16<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_INSERT(uint32<N>, i_to_uint32, a)
}

// -----------------------------------------------------------------------------
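// Zero extension: uint8 -> uint32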

static SIMDPP_INL
uint32<16> i_to_uint32(const uint8<16>& a)
{
#if SIMDPP_USE_NULL
    uint32<16> r;
    for (unsigned i = 0; i < r.length; i++) {
        r.vec(i/4).el(i%4) = uint32_t(a.vec(0).el(i));
    }
    return r;
#elif SIMDPP_USE_AVX512F
    return _mm512_cvtepu8_epi32(a.native());
#elif SIMDPP_USE_AVX2
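    // _mm256_cvtepu8_epi32 reads only the low eight bytes of its 128-bit
    // source; the upper half is shifted down for the second conversion.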
    uint32<16> r;
    r.vec(0) = _mm256_cvtepu8_epi32(a.native());
    r.vec(1) = _mm256_cvtepu8_epi32(move16_l<8>(a).eval().native());
    return r;
#elif SIMDPP_USE_SSE4_1
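    // _mm_cvtepu8_epi32 widens only the low four bytes, so step through the
    // source in 4-byte increments.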
    uint32<16> r;
    r.vec(0) = _mm_cvtepu8_epi32(a.native());
    r.vec(1) = _mm_cvtepu8_epi32(move16_l<4>(a).eval().native());
    r.vec(2) = _mm_cvtepu8_epi32(move16_l<8>(a).eval().native());
    r.vec(3) = _mm_cvtepu8_epi32(move16_l<12>(a).eval().native());
    return r;
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
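    // No direct byte-to-dword widening here; extend to 16 bits first and
    // reuse the 16-bit path above.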
    return i_to_uint32(i_to_uint16(a));
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32<32> i_to_uint32(const uint8<32>& a)
{
#if SIMDPP_USE_AVX512F
    uint32<32> r;
    uint8<16> a0, a1;
    split(a, a0, a1);
    r.vec(0) = _mm512_cvtepu8_epi32(a0.native());
    r.vec(1) = _mm512_cvtepu8_epi32(a1.native());
    return r;
#else
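    // Without AVX512F, each _mm256_cvtepu8_epi32 widens eight bytes of a
    // 128-bit half of the input.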
    uint32<32> r;
    uint8<16> a0, a1;
    split(a, a0, a1);
    r.vec(0) = _mm256_cvtepu8_epi32(a0.native());
    r.vec(1) = _mm256_cvtepu8_epi32(move16_l<8>(a0).eval().native());
    r.vec(2) = _mm256_cvtepu8_epi32(a1.native());
    r.vec(3) = _mm256_cvtepu8_epi32(move16_l<8>(a1).eval().native());
    return r;
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
uint32<64> i_to_uint32(const uint8<64>& a)
{
    uint32<64> r;
    uint8<32> a01, a23;
    uint8<16> a0, a1, a2, a3;
    split(a, a01, a23);
    split(a01, a0, a1);
    split(a23, a2, a3);
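
    // Each 128-bit quarter of the input fills one full 512-bit result vector.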
    r.vec(0) = _mm512_cvtepu8_epi32(a0.native());
    r.vec(1) = _mm512_cvtepu8_epi32(a1.native());
    r.vec(2) = _mm512_cvtepu8_epi32(a2.native());
    r.vec(3) = _mm512_cvtepu8_epi32(a3.native());
    return r;
}
#endif

template<unsigned N> SIMDPP_INL
uint32<N> i_to_uint32(const uint8<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_INSERT(uint32<N>, i_to_uint32, a)
}

// -----------------------------------------------------------------------------
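// Sign extension: int16 -> int32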

static SIMDPP_INL
int32<8> i_to_int32(const int16<8>& a)
{
#if SIMDPP_USE_NULL
    int32<8> r;
    for (unsigned i = 0; i < r.length; i++) {
        r.vec(i/4).el(i%4) = int32_t(a.vec(0).el(i));
    }
    return r;
#elif SIMDPP_USE_AVX2
    return _mm256_cvtepi16_epi32(a.native());
#elif SIMDPP_USE_SSE4_1
    int32x8 r;
    r.vec(0) = _mm_cvtepi16_epi32(a.native());
    r.vec(1) = _mm_cvtepi16_epi32(move8_l<4>(a).eval().native());
    return r;
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_MSA
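    // An arithmetic right shift by 15 yields 0 or -1 in every lane; zipping
    // each lane with its sign word performs the sign extension.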
    int16x8 sign = shift_r<15>(a);
    int32x4 lo, hi;
    lo = zip8_lo(a, sign);
    hi = zip8_hi(a, sign);
    return combine(lo, hi);
#elif SIMDPP_USE_NEON
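    // vmovl_s16 sign-extends four lanes at a time, once per half of the input.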
    int32x8 r;
    r.vec(0) = vmovl_s16(vget_low_s16(a.vec(0).native()));
    r.vec(1) = vmovl_s16(vget_high_s16(a.vec(0).native()));
    return r;
#elif SIMDPP_USE_ALTIVEC
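    // vec_unpackh and vec_unpackl sign-extend the two halves directly.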
    int32x4 b0, b1;
    b0 = vec_unpackh((__vector int16_t)a.vec(0).native());
    b1 = vec_unpackl((__vector int16_t)a.vec(0).native());
    return combine(b0, b1);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int32<16> i_to_int32(const int16<16>& a)
{
#if SIMDPP_USE_AVX512F
    return _mm512_cvtepi16_epi32(a.native());
#else
    int32<8> r0, r1;
    int16<8> a0, a1;
    split(a, a0, a1);
    r0 = _mm256_cvtepi16_epi32(a0.native());
    r1 = _mm256_cvtepi16_epi32(a1.native());
    return combine(r0, r1);
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
int32<32> i_to_int32(const int16<32>& a)
{
    int32<16> r0, r1;
    int16<16> a0, a1;
    split(a, a0, a1);
    r0 = _mm512_cvtepi16_epi32(a0.native());
    r1 = _mm512_cvtepi16_epi32(a1.native());
    return combine(r0, r1);
}
#endif

template<unsigned N> SIMDPP_INL
int32<N> i_to_int32(const int16<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_INSERT(int32<N>, i_to_int32, a)
}

// -----------------------------------------------------------------------------
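// Sign extension: int8 -> int32. Mirrors the unsigned byte path above, using
// the signed widening forms.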

static SIMDPP_INL
int32<16> i_to_int32(const int8<16>& a)
{
#if SIMDPP_USE_NULL
    int32<16> r;
    for (unsigned i = 0; i < r.length; i++) {
        r.vec(i/4).el(i%4) = int32_t(a.vec(0).el(i));
    }
    return r;
#elif SIMDPP_USE_AVX512F
    return _mm512_cvtepi8_epi32(a.native());
#elif SIMDPP_USE_AVX2
    int32<16> r;
    r.vec(0) = _mm256_cvtepi8_epi32(a.native());
    r.vec(1) = _mm256_cvtepi8_epi32(move16_l<8>(a).eval().native());
    return r;
#elif SIMDPP_USE_SSE4_1
    int32<16> r;
    r.vec(0) = _mm_cvtepi8_epi32(a.native());
    r.vec(1) = _mm_cvtepi8_epi32(move16_l<4>(a).eval().native());
    r.vec(2) = _mm_cvtepi8_epi32(move16_l<8>(a).eval().native());
    r.vec(3) = _mm_cvtepi8_epi32(move16_l<12>(a).eval().native());
    return r;
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
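    // Sign-extend to 16 bits first and reuse the 16-bit path above.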
    return i_to_int32(i_to_int16(a));
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int32<32> i_to_int32(const int8<32>& a)
{
#if SIMDPP_USE_AVX512F
    int32<32> r;
    int8<16> a0, a1;
    split(a, a0, a1);
    r.vec(0) = _mm512_cvtepi8_epi32(a0.native());
    r.vec(1) = _mm512_cvtepi8_epi32(a1.native());
    return r;
#else
    int32<32> r;
    int8<16> a0, a1;
    split(a, a0, a1);
    r.vec(0) = _mm256_cvtepi8_epi32(a0.native());
    r.vec(1) = _mm256_cvtepi8_epi32(move16_l<8>(a0).eval().native());
    r.vec(2) = _mm256_cvtepi8_epi32(a1.native());
    r.vec(3) = _mm256_cvtepi8_epi32(move16_l<8>(a1).eval().native());
    return r;
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
int32<64> i_to_int32(const int8<64>& a)
{
    int32<64> r;
    int8<32> a01, a23;
    int8<16> a0, a1, a2, a3;
    split(a, a01, a23);
    split(a01, a0, a1);
    split(a23, a2, a3);

    r.vec(0) = _mm512_cvtepi8_epi32(a0.native());
    r.vec(1) = _mm512_cvtepi8_epi32(a1.native());
    r.vec(2) = _mm512_cvtepi8_epi32(a2.native());
    r.vec(3) = _mm512_cvtepi8_epi32(a3.native());
    return r;
}
#endif

template<unsigned N> SIMDPP_INL
int32<N> i_to_int32(const int8<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_INSERT(int32<N>, i_to_int32, a)
}


} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif