/*  Copyright (C) 2013-2017  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_CONV_SHRINK_TO_INT8_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_CONV_SHRINK_TO_INT8_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/make_shuffle_bytes_mask.h>
#include <simdpp/core/permute4.h>
#include <simdpp/core/permute_bytes16.h>
#include <simdpp/core/shuffle4x2.h>
#include <simdpp/core/unzip_lo.h>
#include <simdpp/detail/insn/conv_shrink_to_int16.h>
#include <simdpp/detail/insn/conv_shrink_to_int32.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

// -----------------------------------------------------------------------------

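// Narrows each 16-bit element to 8 bits by keeping its least significant byte.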
SIMDPP_INL uint8<16> i_to_uint8(const uint16<16>& a)
{
#if SIMDPP_USE_NULL
    uint8<16> r;
    for (unsigned i = 0; i < 16; i++) {
        r.el(i) = uint8_t(a.vec(i/8).el(i%8));
    }
    return r;
#elif SIMDPP_USE_AVX512VL
    return _mm256_cvtepi16_epi8(a.native());
#elif SIMDPP_USE_SSSE3
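    // Move the low byte of each 16-bit element into the bottom half of its
    // 128-bit vector, then merge the bottom halves of the two vectors.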
    uint8<32> perm_mask = make_shuffle_bytes16_mask<0,2,4,6,8,10,12,14,
                                                    0,0,0,0,0,0,0,0>(perm_mask);
    uint8<32> a8;
    uint64<4> a64;
    a8 = a;
    a64 = permute_bytes16(a8, perm_mask);
#if SIMDPP_USE_AVX2
    a64 = permute4<0,2,0,2>(a64);
    return _mm256_castsi256_si128(a64.native());
#else
    return (uint8<16>) zip2_lo(a64.vec(0), a64.vec(1));
#endif
#elif SIMDPP_USE_NEON64
    uint8x8_t low = vmovn_u16(a.vec(0).native());
    return vmovn_high_u16(low, a.vec(1).native());
#elif SIMDPP_USE_NEON
    uint8x8_t low = vmovn_u16(a.vec(0).native());
    uint8x8_t high = vmovn_u16(a.vec(1).native());
    return vcombine_u8(low, high);
#elif SIMDPP_USE_ALTIVEC
    return vec_pack(a.vec(0).native(), a.vec(1).native());
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_MSA
    uint8<16> r1, r2;
    r1 = a.vec(0);
    r2 = a.vec(1);
    return unzip16_lo(r1, r2);
#endif
}

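// Same conversion for 32 elements. Only compiled when AVX2 or wider is available.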
#if SIMDPP_USE_AVX2
SIMDPP_INL uint8<32> i_to_uint8(const uint16<32>& a)
{
#if SIMDPP_USE_AVX512BW
    return _mm512_cvtepi16_epi8(a.native());
#else
    uint8<32> perm_mask = make_shuffle_bytes16_mask<0,2,4,6,8,10,12,14,
                                                    0,0,0,0,0,0,0,0>(perm_mask);
    uint8<32> a8_0, a8_1;
    uint64<4> a64_0, a64_1;
    a8_0 = a.vec(0);
    a8_1 = a.vec(1);
    a64_0 = permute_bytes16(a8_0, perm_mask);
    a64_1 = permute_bytes16(a8_1, perm_mask);
    return (uint8<32>) shuffle4x2<0,2,4,6>(a64_0, a64_1);
#endif
}
#endif

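// 64-element input: requires AVX-512BW; the source is two native 512-bit vectors.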
#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint8<64> i_to_uint8(const uint16<64>& a)
{
    uint8<32> r1 = _mm512_cvtepi16_epi8(a.vec(0).native());
    uint8<32> r2 = _mm512_cvtepi16_epi8(a.vec(1).native());
    return combine(r1, r2);
}
#endif

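// Generic case: longer vectors are processed in terms of the fixed-size
// overloads above.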
template<unsigned N> SIMDPP_INL
uint8<N> i_to_uint8(const uint16<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_EXTRACT(uint8<N>, i_to_uint8, a)
}

// -----------------------------------------------------------------------------

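// Narrows each 32-bit element to 8 bits by keeping its least significant byte.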
SIMDPP_INL uint8<16> i_to_uint8(const uint32<16>& a)
{
#if SIMDPP_USE_NULL
    uint8<16> r;
    for (unsigned i = 0; i < 16; i++) {
        r.el(i) = uint8_t(a.vec(i/4).el(i%4));
    }
    return r;
#elif SIMDPP_USE_AVX512F
    return _mm512_cvtepi32_epi8(a.native());
#elif SIMDPP_USE_SSSE3
    uint8<64> perm_mask = make_shuffle_bytes16_mask<0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0>(perm_mask);
    uint8<64> a8;
    uint32<16> a32;
    a8 = a;
    a32 = permute_bytes16(a8, perm_mask);

    uint32<4> b0, b1, b2, b3;
#if SIMDPP_USE_AVX2
    split(a32.vec(0), b0, b1);
    split(a32.vec(1), b2, b3);
#else
    b0 = a32.vec(0);
    b1 = a32.vec(1);
    b2 = a32.vec(2);
    b3 = a32.vec(3);
#endif
    uint64<2> r0, r1;
    r0 = zip4_lo(b0, b1);
    r1 = zip4_lo(b2, b3);
    return (uint8<16>) zip2_lo(r0, r1);
#else
    uint16<16> a16 = i_to_uint16(a);
    return i_to_uint8(a16);
#endif
}

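// 32-element input, for AVX2 and wider targets.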
#if SIMDPP_USE_AVX2
SIMDPP_INL uint8<32> i_to_uint8(const uint32<32>& a)
{
#if SIMDPP_USE_AVX512F
    uint8<16> r0 = _mm512_cvtepi32_epi8(a.vec(0).native());
    uint8<16> r1 = _mm512_cvtepi32_epi8(a.vec(1).native());
    return combine(r0, r1);
#else
    uint8<32> perm_mask = make_shuffle_bytes16_mask<0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0>(perm_mask);
    uint8<32> a8_0, a8_1, a8_2, a8_3;
    uint32<8> a32_0, a32_1, a32_2, a32_3;
    uint64<4> a64_0, a64_1;
    uint32<4> b32_0, b32_1, c32_0, c32_1;
    a8_0 = a.vec(0);
    a8_1 = a.vec(1);
    a8_2 = a.vec(2);
    a8_3 = a.vec(3);
    a32_0 = permute_bytes16(a8_0, perm_mask);
    a32_1 = permute_bytes16(a8_1, perm_mask);
    a32_2 = permute_bytes16(a8_2, perm_mask);
    a32_3 = permute_bytes16(a8_3, perm_mask);
    a64_0 = zip4_lo(a32_0, a32_1);
    a64_1 = zip4_lo(a32_2, a32_3);
    a32_0 = zip2_lo(a64_0, a64_1);
    split(a32_0, b32_0, b32_1);
    c32_0 = unzip4_lo(b32_0, b32_1);
    c32_1 = unzip4_hi(b32_0, b32_1);
    return (uint8<32>) combine(c32_0, c32_1);
#endif
}
#endif

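// 64-element input: each 512-bit source vector narrows to one 128-bit result.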
#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint8<64> i_to_uint8(const uint32<64>& a)
{
    uint8<16> r0 = _mm512_cvtepi32_epi8(a.vec(0).native());
    uint8<16> r1 = _mm512_cvtepi32_epi8(a.vec(1).native());
    uint8<16> r2 = _mm512_cvtepi32_epi8(a.vec(2).native());
    uint8<16> r3 = _mm512_cvtepi32_epi8(a.vec(3).native());
    return combine(combine(r0, r1), combine(r2, r3));
}
#endif

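// Generic case for longer vectors.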
template<unsigned N> SIMDPP_INL
uint8<N> i_to_uint8(const uint32<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_EXTRACT(uint8<N>, i_to_uint8, a)
}

// -----------------------------------------------------------------------------

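// Narrows each 64-bit element to 8 bits by keeping its least significant byte.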
SIMDPP_INL uint8<16> i_to_uint8(const uint64<16>& a)
{
#if SIMDPP_USE_NULL
    uint8<16> r;
    for (unsigned i = 0; i < 16; i++) {
        r.el(i) = uint8_t(a.vec(i/2).el(i%2));
    }
    return r;
#elif SIMDPP_USE_AVX512F
    __m128i r0 = _mm512_cvtepi64_epi8(a.vec(0).native());
    __m128i r1 = _mm512_cvtepi64_epi8(a.vec(1).native());
    return _mm_unpacklo_epi64(r0, r1);
#elif SIMDPP_USE_AVX2
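    // Gather the low byte of each 64-bit element using in-lane byte shuffles,
    // zip the partial results together and fix up the final element order with
    // one more 16-bit shuffle.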
    uint8<32> perm_mask = make_shuffle_bytes16_mask<0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0>(perm_mask);
    uint16<16> a16_0, a16_1, a16_2, a16_3;
    uint32<8> a32_0, a32_1;
    uint64<4> a64_0;
    uint16<8> b16;

    a16_0 = permute_bytes16((uint8<32>) a.vec(0), perm_mask);
    a16_1 = permute_bytes16((uint8<32>) a.vec(1), perm_mask);
    a16_2 = permute_bytes16((uint8<32>) a.vec(2), perm_mask);
    a16_3 = permute_bytes16((uint8<32>) a.vec(3), perm_mask);

    a32_0 = zip8_lo(a16_0, a16_1);
    a32_1 = zip8_lo(a16_2, a16_3);

    a64_0 = zip4_lo(a32_0, a32_1);
    a64_0 = permute4<0,2,0,2>(a64_0);

    b16 = _mm256_castsi256_si128(a64_0.native());

    uint16<8> perm_mask2 = make_shuffle_bytes16_mask<0,4,1,5,2,6,3,7>(perm_mask2);
    b16 = permute_bytes16(b16, perm_mask2);
    return (uint8<16>) b16;
#else
    // TODO: SSSE3
    uint32<16> a32 = i_to_uint32(a);
    return i_to_uint8(a32);
#endif
}

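// 32-element input, for AVX2 and wider targets.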
#if SIMDPP_USE_AVX2
SIMDPP_INL uint8<32> i_to_uint8(const uint64<32>& a)
{
#if SIMDPP_USE_AVX512F
    __m128i r0 = _mm512_cvtepi64_epi8(a.vec(0).native());
    __m128i r1 = _mm512_cvtepi64_epi8(a.vec(1).native());
    __m128i r2 = _mm512_cvtepi64_epi8(a.vec(2).native());
    __m128i r3 = _mm512_cvtepi64_epi8(a.vec(3).native());
    uint8<16> r01 = _mm_unpacklo_epi64(r0, r1);
    uint8<16> r23 = _mm_unpacklo_epi64(r2, r3);
    return combine(r01, r23);
#else
    uint8<32> perm_mask = make_shuffle_bytes16_mask<0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0>(perm_mask);
    uint16<16> a16_0, a16_1, a16_2, a16_3, a16_4, a16_5, a16_6, a16_7;
    uint32<8> a32_0, a32_1, a32_2, a32_3;
    uint64<4> a64_0, a64_1;

    a16_0 = permute_bytes16((uint8<32>) a.vec(0), perm_mask);
    a16_1 = permute_bytes16((uint8<32>) a.vec(1), perm_mask);
    a16_2 = permute_bytes16((uint8<32>) a.vec(2), perm_mask);
    a16_3 = permute_bytes16((uint8<32>) a.vec(3), perm_mask);
    a16_4 = permute_bytes16((uint8<32>) a.vec(4), perm_mask);
    a16_5 = permute_bytes16((uint8<32>) a.vec(5), perm_mask);
    a16_6 = permute_bytes16((uint8<32>) a.vec(6), perm_mask);
    a16_7 = permute_bytes16((uint8<32>) a.vec(7), perm_mask);

    a32_0 = zip8_lo(a16_0, a16_1);
    a32_1 = zip8_lo(a16_2, a16_3);
    a32_2 = zip8_lo(a16_4, a16_5);
    a32_3 = zip8_lo(a16_6, a16_7);

    a64_0 = zip4_lo(a32_0, a32_1);
    a64_1 = zip4_lo(a32_2, a32_3);
    a64_0 = zip2_lo(a64_0, a64_1);
    a16_0 = permute4<0,2,1,3>(a64_0);

    uint16<16> perm_mask2 = make_shuffle_bytes16_mask<0,4,1,5,2,6,3,7>(perm_mask2);
    a16_0 = permute_bytes16(a16_0, perm_mask2);
    return (uint8<32>) a16_0;
#endif
}
#endif

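// 64-element input: each 512-bit source vector contributes eight result bytes.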
#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint8<64> i_to_uint8(const uint64<64>& a)
{
    __m128i r0 = _mm512_cvtepi64_epi8(a.vec(0).native());
    __m128i r1 = _mm512_cvtepi64_epi8(a.vec(1).native());
    __m128i r2 = _mm512_cvtepi64_epi8(a.vec(2).native());
    __m128i r3 = _mm512_cvtepi64_epi8(a.vec(3).native());
    __m128i r4 = _mm512_cvtepi64_epi8(a.vec(4).native());
    __m128i r5 = _mm512_cvtepi64_epi8(a.vec(5).native());
    __m128i r6 = _mm512_cvtepi64_epi8(a.vec(6).native());
    __m128i r7 = _mm512_cvtepi64_epi8(a.vec(7).native());
    uint8<16> r01 = _mm_unpacklo_epi64(r0, r1);
    uint8<16> r23 = _mm_unpacklo_epi64(r2, r3);
    uint8<16> r45 = _mm_unpacklo_epi64(r4, r5);
    uint8<16> r67 = _mm_unpacklo_epi64(r6, r7);
    return combine(combine(r01, r23), combine(r45, r67));
}
#endif

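// Generic case for longer vectors.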
template<unsigned N> SIMDPP_INL
uint8<N> i_to_uint8(const uint64<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_EXTRACT(uint8<N>, i_to_uint8, a)
}

// -----------------------------------------------------------------------------

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif