/*  Copyright (C) 2013-2017  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_CONV_SHRINK_TO_INT8_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_CONV_SHRINK_TO_INT8_H

#ifndef LIBSIMDPP_SIMD_H
#error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/combine.h>
#include <simdpp/core/make_shuffle_bytes_mask.h>
#include <simdpp/core/permute4.h>
#include <simdpp/core/permute_bytes16.h>
#include <simdpp/core/shuffle4x2.h>
#include <simdpp/core/split.h>
#include <simdpp/core/unzip_hi.h>
#include <simdpp/core/unzip_lo.h>
#include <simdpp/core/zip_lo.h>
#include <simdpp/detail/insn/conv_shrink_to_int16.h>
#include <simdpp/detail/insn/conv_shrink_to_int32.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

// -----------------------------------------------------------------------------
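// Narrowing conversions from 16-bit to 8-bit element vectors. Each element is
// truncated to its low 8 bits; no saturation is performed.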

SIMDPP_INL uint8<16> i_to_uint8(const uint16<16>& a)
{
#if SIMDPP_USE_NULL
    uint8<16> r;
    for (unsigned i = 0; i < 16; i++) {
        r.el(i) = uint8_t(a.vec(i/8).el(i%8));
    }
    return r;
#elif SIMDPP_USE_AVX512VL
    return _mm256_cvtepi16_epi8(a.native());
#elif SIMDPP_USE_SSSE3
    uint8<32> perm_mask = make_shuffle_bytes16_mask<0,2,4,6,8,10,12,14,
                                                    0,0,0,0,0,0,0,0>(perm_mask);
    uint8<32> a8;
    uint64<4> a64;
    a8 = a;
    a64 = permute_bytes16(a8, perm_mask);
#if SIMDPP_USE_AVX2
    a64 = permute4<0,2,0,2>(a64);
    return _mm256_castsi256_si128(a64.native());
#else
    return (uint8<16>) zip2_lo(a64.vec(0), a64.vec(1));
#endif
#elif SIMDPP_USE_NEON64
    uint8x8_t low = vmovn_u16(a.vec(0).native());
    return vmovn_high_u16(low, a.vec(1).native());
#elif SIMDPP_USE_NEON
    uint8x8_t low = vmovn_u16(a.vec(0).native());
    uint8x8_t high = vmovn_u16(a.vec(1).native());
    return vcombine_u8(low, high);
#elif SIMDPP_USE_ALTIVEC
    return vec_pack(a.vec(0).native(), a.vec(1).native());
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_MSA
    uint8<16> r1, r2;
    r1 = a.vec(0);
    r2 = a.vec(1);
    return unzip16_lo(r1, r2);
#endif
}

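// 32-element overload. With AVX-512BW a whole 512-bit vector is narrowed at
// once; otherwise the low byte of each 16-bit element is gathered into the
// low half of every 128-bit lane and the lanes are collected with shuffle4x2.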
#if SIMDPP_USE_AVX2
SIMDPP_INL uint8<32> i_to_uint8(const uint16<32>& a)
{
#if SIMDPP_USE_AVX512BW
    return _mm512_cvtepi16_epi8(a.native());
#else
    uint8<32> perm_mask = make_shuffle_bytes16_mask<0,2,4,6,8,10,12,14,
                                                    0,0,0,0,0,0,0,0>(perm_mask);
    uint8<32> a8_0, a8_1;
    uint64<4> a64_0, a64_1;
    a8_0 = a.vec(0);
    a8_1 = a.vec(1);
    a64_0 = permute_bytes16(a8_0, perm_mask);
    a64_1 = permute_bytes16(a8_1, perm_mask);
    return (uint8<32>) shuffle4x2<0,2,4,6>(a64_0, a64_1);
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint8<64> i_to_uint8(const uint16<64>& a)
{
    uint8<32> r1 = _mm512_cvtepi16_epi8(a.vec(0).native());
    uint8<32> r2 = _mm512_cvtepi16_epi8(a.vec(1).native());
    return combine(r1, r2);
}
#endif

template<unsigned N> SIMDPP_INL
uint8<N> i_to_uint8(const uint16<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_EXTRACT(uint8<N>, i_to_uint8, a)
}

// -----------------------------------------------------------------------------
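// Narrowing conversions from 32-bit to 8-bit element vectors. Each element is
// truncated to its low 8 bits.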

SIMDPP_INL uint8<16> i_to_uint8(const uint32<16>& a)
{
#if SIMDPP_USE_NULL
    uint8<16> r;
    for (unsigned i = 0; i < 16; i++) {
        r.el(i) = uint8_t(a.vec(i/4).el(i%4));
    }
    return r;
#elif SIMDPP_USE_AVX512F
    return _mm512_cvtepi32_epi8(a.native());
#elif SIMDPP_USE_SSSE3
    uint8<64> perm_mask = make_shuffle_bytes16_mask<0,4,8,12,0,0,0,0,
                                                    0,0,0,0,0,0,0,0>(perm_mask);
    uint8<64> a8;
    uint32<16> a32;
    a8 = a;
    a32 = permute_bytes16(a8, perm_mask);

    uint32<4> b0, b1, b2, b3;
#if SIMDPP_USE_AVX2
    split(a32.vec(0), b0, b1);
    split(a32.vec(1), b2, b3);
#else
    b0 = a32.vec(0);
    b1 = a32.vec(1);
    b2 = a32.vec(2);
    b3 = a32.vec(3);
#endif
    uint64<2> r0, r1;
    r0 = zip4_lo(b0, b1);
    r1 = zip4_lo(b2, b3);
    return (uint8<16>) zip2_lo(r0, r1);
#else
    uint16<16> a16 = i_to_uint16(a);
    return i_to_uint8(a16);
#endif
}

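// 32-element overload. With AVX-512F each 512-bit vector is narrowed
// directly; the plain AVX2 path gathers the low byte of every 32-bit element
// per 128-bit lane and merges the pieces with zip/unzip steps.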
#if SIMDPP_USE_AVX2
SIMDPP_INL uint8<32> i_to_uint8(const uint32<32>& a)
{
#if SIMDPP_USE_AVX512F
    uint8<16> r0 = _mm512_cvtepi32_epi8(a.vec(0).native());
    uint8<16> r1 = _mm512_cvtepi32_epi8(a.vec(1).native());
    return combine(r0, r1);
#else
    uint8<32> perm_mask = make_shuffle_bytes16_mask<0,4,8,12,0,0,0,0,
                                                    0,0,0,0,0,0,0,0>(perm_mask);
    uint8<32> a8_0, a8_1, a8_2, a8_3;
    uint32<8> a32_0, a32_1, a32_2, a32_3;
    uint64<4> a64_0, a64_1;
    uint32<4> b32_0, b32_1, c32_0, c32_1;
    a8_0 = a.vec(0);
    a8_1 = a.vec(1);
    a8_2 = a.vec(2);
    a8_3 = a.vec(3);
    a32_0 = permute_bytes16(a8_0, perm_mask);
    a32_1 = permute_bytes16(a8_1, perm_mask);
    a32_2 = permute_bytes16(a8_2, perm_mask);
    a32_3 = permute_bytes16(a8_3, perm_mask);
    a64_0 = zip4_lo(a32_0, a32_1);
    a64_1 = zip4_lo(a32_2, a32_3);
    a32_0 = zip2_lo(a64_0, a64_1);
    split(a32_0, b32_0, b32_1);
    c32_0 = unzip4_lo(b32_0, b32_1);
    c32_1 = unzip4_hi(b32_0, b32_1);
    return (uint8<32>) combine(c32_0, c32_1);
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint8<64> i_to_uint8(const uint32<64>& a)
{
    uint8<16> r0 = _mm512_cvtepi32_epi8(a.vec(0).native());
    uint8<16> r1 = _mm512_cvtepi32_epi8(a.vec(1).native());
    uint8<16> r2 = _mm512_cvtepi32_epi8(a.vec(2).native());
    uint8<16> r3 = _mm512_cvtepi32_epi8(a.vec(3).native());
    return combine(combine(r0, r1), combine(r2, r3));
}
#endif

template<unsigned N> SIMDPP_INL
uint8<N> i_to_uint8(const uint32<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_EXTRACT(uint8<N>, i_to_uint8, a)
}

// -----------------------------------------------------------------------------
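// Narrowing conversions from 64-bit to 8-bit element vectors. Each element is
// truncated to its low 8 bits.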

SIMDPP_INL uint8<16> i_to_uint8(const uint64<16>& a)
{
#if SIMDPP_USE_NULL
    uint8<16> r;
    for (unsigned i = 0; i < 16; i++) {
        r.el(i) = uint8_t(a.vec(i/2).el(i%2));
    }
    return r;
#elif SIMDPP_USE_AVX512F
    __m128i r0 = _mm512_cvtepi64_epi8(a.vec(0).native());
    __m128i r1 = _mm512_cvtepi64_epi8(a.vec(1).native());
    return _mm_unpacklo_epi64(r0, r1);
#elif SIMDPP_USE_AVX2
    uint8<32> perm_mask = make_shuffle_bytes16_mask<0,8,0,0,0,0,0,0,
                                                    0,0,0,0,0,0,0,0>(perm_mask);
    uint16<16> a16_0, a16_1, a16_2, a16_3;
    uint32<8> a32_0, a32_1;
    uint64<4> a64_0;
    uint16<8> b16;

    a16_0 = permute_bytes16((uint8<32>) a.vec(0), perm_mask);
    a16_1 = permute_bytes16((uint8<32>) a.vec(1), perm_mask);
    a16_2 = permute_bytes16((uint8<32>) a.vec(2), perm_mask);
    a16_3 = permute_bytes16((uint8<32>) a.vec(3), perm_mask);

    a32_0 = zip8_lo(a16_0, a16_1);
    a32_1 = zip8_lo(a16_2, a16_3);

    a64_0 = zip4_lo(a32_0, a32_1);
    a64_0 = permute4<0,2,0,2>(a64_0);

    b16 = _mm256_castsi256_si128(a64_0.native());

    uint16<8> perm_mask2 = make_shuffle_bytes16_mask<0,4,1,5,2,6,3,7>(perm_mask2);
    b16 = permute_bytes16(b16, perm_mask2);
    return (uint8<16>) b16;
#else
    // TODO: SSSE3
    uint32<16> a32 = i_to_uint32(a);
    return i_to_uint8(a32);
#endif
}

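// 32-element overload. With AVX-512F each 512-bit vector narrows to 8 bytes
// which are then recombined; the plain AVX2 path gathers the low byte of each
// 64-bit element per 128-bit lane and merges the pieces with zip and permute
// steps.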
#if SIMDPP_USE_AVX2
SIMDPP_INL uint8<32> i_to_uint8(const uint64<32>& a)
{
#if SIMDPP_USE_AVX512F
    __m128i r0 = _mm512_cvtepi64_epi8(a.vec(0).native());
    __m128i r1 = _mm512_cvtepi64_epi8(a.vec(1).native());
    __m128i r2 = _mm512_cvtepi64_epi8(a.vec(2).native());
    __m128i r3 = _mm512_cvtepi64_epi8(a.vec(3).native());
    uint8<16> r01 = _mm_unpacklo_epi64(r0, r1);
    uint8<16> r23 = _mm_unpacklo_epi64(r2, r3);
    return combine(r01, r23);
#else
    uint8<32> perm_mask = make_shuffle_bytes16_mask<0,8,0,0,0,0,0,0,
                                                    0,0,0,0,0,0,0,0>(perm_mask);
    uint16<16> a16_0, a16_1, a16_2, a16_3, a16_4, a16_5, a16_6, a16_7;
    uint32<8> a32_0, a32_1, a32_2, a32_3;
    uint64<4> a64_0, a64_1;

    a16_0 = permute_bytes16((uint8<32>) a.vec(0), perm_mask);
    a16_1 = permute_bytes16((uint8<32>) a.vec(1), perm_mask);
    a16_2 = permute_bytes16((uint8<32>) a.vec(2), perm_mask);
    a16_3 = permute_bytes16((uint8<32>) a.vec(3), perm_mask);
    a16_4 = permute_bytes16((uint8<32>) a.vec(4), perm_mask);
    a16_5 = permute_bytes16((uint8<32>) a.vec(5), perm_mask);
    a16_6 = permute_bytes16((uint8<32>) a.vec(6), perm_mask);
    a16_7 = permute_bytes16((uint8<32>) a.vec(7), perm_mask);

    a32_0 = zip8_lo(a16_0, a16_1);
    a32_1 = zip8_lo(a16_2, a16_3);
    a32_2 = zip8_lo(a16_4, a16_5);
    a32_3 = zip8_lo(a16_6, a16_7);

    a64_0 = zip4_lo(a32_0, a32_1);
    a64_1 = zip4_lo(a32_2, a32_3);
    a64_0 = zip2_lo(a64_0, a64_1);
    a16_0 = permute4<0,2,1,3>(a64_0);

    uint16<16> perm_mask2 = make_shuffle_bytes16_mask<0,4,1,5,2,6,3,7>(perm_mask2);
    a16_0 = permute_bytes16(a16_0, perm_mask2);
    return (uint8<32>) a16_0;
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint8<64> i_to_uint8(const uint64<64>& a)
{
    __m128i r0 = _mm512_cvtepi64_epi8(a.vec(0).native());
    __m128i r1 = _mm512_cvtepi64_epi8(a.vec(1).native());
    __m128i r2 = _mm512_cvtepi64_epi8(a.vec(2).native());
    __m128i r3 = _mm512_cvtepi64_epi8(a.vec(3).native());
    __m128i r4 = _mm512_cvtepi64_epi8(a.vec(4).native());
    __m128i r5 = _mm512_cvtepi64_epi8(a.vec(5).native());
    __m128i r6 = _mm512_cvtepi64_epi8(a.vec(6).native());
    __m128i r7 = _mm512_cvtepi64_epi8(a.vec(7).native());
    uint8<16> r01 = _mm_unpacklo_epi64(r0, r1);
    uint8<16> r23 = _mm_unpacklo_epi64(r2, r3);
    uint8<16> r45 = _mm_unpacklo_epi64(r4, r5);
    uint8<16> r67 = _mm_unpacklo_epi64(r6, r7);
    return combine(combine(r01, r23), combine(r45, r67));
}
#endif

template<unsigned N> SIMDPP_INL
uint8<N> i_to_uint8(const uint64<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_EXTRACT(uint8<N>, i_to_uint8, a)
}

// -----------------------------------------------------------------------------

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif
| 313 | |
| 314 | |
| 315 | |