/*  Copyright (C) 2013-2017  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_CONV_SHRINK_TO_INT16_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_CONV_SHRINK_TO_INT16_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/permute4.h>
#include <simdpp/core/unzip_lo.h>
#include <simdpp/detail/insn/conv_shrink_to_int32.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {


// -----------------------------------------------------------------------------
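
// Shrinking conversion uint32 -> uint16. Each 32-bit element is narrowed to its
// low 16 bits (plain truncation, no saturation), as the SIMDPP_USE_NULL
// reference loop below makes explicit.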

SIMDPP_INL uint16<8> i_to_uint16(const uint32<8>& a)
{
#if SIMDPP_USE_NULL
    // Reference implementation: truncate each 32-bit element to its low 16 bits.
    uint16<8> r;
    for (unsigned i = 0; i < 8; i++) {
        r.el(i) = uint16_t(a.vec(i/4).el(i%4));
    }
    return r;
#elif SIMDPP_USE_AVX512VL
    // Single-instruction truncating narrow (VPMOVDW).
    return _mm256_cvtepi32_epi16(a.native());
#elif SIMDPP_USE_SSSE3
    // Gather the low 16-bit half of each 32-bit element within each 128-bit lane.
    uint16<16> perm_mask = make_shuffle_bytes16_mask<0,2,4,6,0,0,0,0>(perm_mask);
    uint16<16> a16;
    uint64<4> a64;
    a16 = a;
    a64 = permute_bytes16(a16, perm_mask);
#if SIMDPP_USE_AVX2
    // The packed results sit in 64-bit lane 0 of each 128-bit half; bring them
    // together into the low 128 bits.
    a64 = permute4<0,2,0,2>(a64);
    return _mm256_castsi256_si128(a64.native());
#else
    return (uint16<8>) zip2_lo(a64.vec(0), a64.vec(1));
#endif
#elif SIMDPP_USE_NEON64
    // Narrow the low half, then narrow directly into the high half of the result.
    uint16x4_t low = vmovn_u32(a.vec(0).native());
    return vmovn_high_u32(low, a.vec(1).native());
#elif SIMDPP_USE_NEON
    uint16x4_t low = vmovn_u32(a.vec(0).native());
    uint16x4_t high = vmovn_u32(a.vec(1).native());
    return vcombine_u16(low, high);
#elif SIMDPP_USE_ALTIVEC
    // vec_pack performs a modulo (truncating) 32 -> 16 pack.
    return vec_pack(a.vec(0).native(), a.vec(1).native());
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_MSA
    // Reinterpret as 16-bit lanes and keep the even-indexed lanes of each vector.
    uint16<8> r1, r2;
    r1 = a.vec(0);
    r2 = a.vec(1);
    return unzip8_lo(r1, r2);
#endif
}
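
// Note on the SSSE3 path above: make_shuffle_bytes16_mask<0,2,4,6,0,0,0,0>
// builds a byte-shuffle mask that, within each 128-bit lane, selects 16-bit
// lanes 0, 2, 4 and 6. On x86's little-endian layout these are exactly the low
// halves of 32-bit elements 0..3, so after permute_bytes16 each 128-bit lane
// holds its four narrowed values packed into its lowest 64 bits:
//
//     32-bit elements:   e0      e1      e2      e3
//     16-bit lanes:      0  1    2  3    4  5    6  7
//     selected lanes:    0       2       4       6      -> e0, e1, e2, e3 mod 2^16
//
// The subsequent permute4 / zip2_lo steps only move those 64-bit chunks next to
// each other; they do not change any element values.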

#if SIMDPP_USE_AVX2
SIMDPP_INL uint16<16> i_to_uint16(const uint32<16>& a)
{
#if SIMDPP_USE_AVX512BW
    return _mm512_cvtepi32_epi16(a.native());
#elif SIMDPP_USE_AVX512F
    uint16<32> perm_mask = make_shuffle_bytes16_mask<0,2,4,6,0,0,0,0>(perm_mask);
    uint64<8> a64;
    uint64<4> a64_0, a64_1;
    a64 = permute_bytes16((uint16<32>) a, perm_mask);
    split(a64, a64_0, a64_1);
    a64_0 = zip2_lo(a64_0, a64_1);
    a64_0 = permute4<0,2,1,3>(a64_0);
    return (uint16<16>) a64_0;
#else
    uint16<16> perm_mask = make_shuffle_bytes16_mask<0,2,4,6,0,0,0,0>(perm_mask);
    uint64<4> a64_0, a64_1;
    a64_0 = permute_bytes16((uint16<16>) a.vec(0), perm_mask);
    a64_1 = permute_bytes16((uint16<16>) a.vec(1), perm_mask);
    a64_0 = zip2_lo(a64_0, a64_1);
    a64_0 = permute4<0,2,1,3>(a64_0);
    return (uint16<16>) a64_0;
#endif
}
#endif
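
// The non-AVX512BW paths above follow the same pattern as the uint32<8> case:
// permute_bytes16 does the actual narrowing within each 128-bit lane, and the
// following zip2_lo / permute4 steps only reorder the packed 64-bit chunks into
// a contiguous result.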

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint16<32> i_to_uint16(const uint32<32>& a)
{
    uint16<16> r1 = _mm512_cvtepi32_epi16(a.vec(0).native());
    uint16<16> r2 = _mm512_cvtepi32_epi16(a.vec(1).native());
    return combine(r1, r2);
}
#endif

template<unsigned N> SIMDPP_INL
uint16<N> i_to_uint16(const uint32<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_EXTRACT(uint16<N>, i_to_uint16, a)
}
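
// For larger N, uint32<N> and uint16<N> are arrays of native vectors.
// SIMDPP_VEC_ARRAY_IMPL_CONV_EXTRACT (defined elsewhere in detail/) presumably
// applies one of the fixed-size overloads above to successive slices of 'a',
// one slice per destination sub-vector.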

// -----------------------------------------------------------------------------
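
// Shrinking conversion uint64 -> uint16: each 64-bit element is truncated to
// its low 16 bits, mirroring the uint32 -> uint16 case above.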

SIMDPP_INL uint16<8> i_to_uint16(const uint64<8>& a)
{
#if SIMDPP_USE_NULL
    // Reference implementation: truncate each 64-bit element to its low 16 bits.
    uint16<8> r;
    for (unsigned i = 0; i < 8; i++) {
        r.el(i) = uint16_t(a.vec(i/2).el(i%2));
    }
    return r;
#elif SIMDPP_USE_AVX512F
    // Single-instruction truncating narrow (VPMOVQW).
    return _mm512_cvtepi64_epi16(a.native());
#elif SIMDPP_USE_AVX2
    // Gather the low 16-bit half of each 64-bit element within each 128-bit
    // lane, then collapse the packed 32-bit chunks with zips and permutes.
    uint16<16> perm_mask = make_shuffle_bytes16_mask<0,4,0,0,0,0,0,0>(perm_mask);
    uint32<8> a32_0, a32_1;
    uint64<4> a64_0;
    uint32<4> b32;
    a32_0 = permute_bytes16((uint16<16>) a.vec(0), perm_mask);
    a32_1 = permute_bytes16((uint16<16>) a.vec(1), perm_mask);
    a64_0 = zip4_lo(a32_0, a32_1);
    a32_0 = permute4<0,2,1,3>(a64_0);
    b32 = _mm256_castsi256_si128(a32_0.native());
    b32 = permute4<0,2,1,3>(b32);
    return (uint16<8>) b32;
#elif SIMDPP_USE_SSSE3
    // Same idea with four 128-bit source vectors.
    uint16<8> perm_mask = make_shuffle_bytes16_mask<0,4,0,0,0,0,0,0>(perm_mask);
    uint32<4> a32_0, a32_1, a32_2, a32_3;
    uint64<2> a64_0, a64_1;
    a32_0 = permute_bytes16((uint16<8>) a.vec(0), perm_mask);
    a32_1 = permute_bytes16((uint16<8>) a.vec(1), perm_mask);
    a32_2 = permute_bytes16((uint16<8>) a.vec(2), perm_mask);
    a32_3 = permute_bytes16((uint16<8>) a.vec(3), perm_mask);
    a64_0 = zip4_lo(a32_0, a32_1);
    a64_1 = zip4_lo(a32_2, a32_3);
    a64_0 = zip2_lo(a64_0, a64_1);
    return (uint16<8>) a64_0;
#else
    // No suitable shuffles: narrow to 32 bits first, then reuse the 32 -> 16 path.
    uint32<8> a32 = i_to_uint32(a);
    return i_to_uint16(a32);
#endif
}
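
// Note on the AVX2/SSSE3 paths above: make_shuffle_bytes16_mask<0,4,0,...>
// selects 16-bit lanes 0 and 4 within each 128-bit lane, i.e. the low 16-bit
// half of each 64-bit element on x86's little-endian layout, so each 128-bit
// lane ends up with its two narrowed values in its lowest 32 bits. The zip4_lo /
// zip2_lo / permute4 steps then gather those 32-bit chunks together.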

#if SIMDPP_USE_AVX2
SIMDPP_INL uint16<16> i_to_uint16(const uint64<16>& a)
{
#if SIMDPP_USE_AVX512F
    uint16<8> r0 = _mm512_cvtepi64_epi16(a.vec(0).native());
    uint16<8> r1 = _mm512_cvtepi64_epi16(a.vec(1).native());
    return combine(r0, r1);
#else
    uint16<16> perm_mask = make_shuffle_bytes16_mask<0,4,0,0,0,0,0,0>(perm_mask);
    uint32<8> a32_0, a32_1, a32_2, a32_3;
    uint64<4> a64_0, a64_1;
    a32_0 = permute_bytes16((uint16<16>) a.vec(0), perm_mask);
    a32_1 = permute_bytes16((uint16<16>) a.vec(1), perm_mask);
    a32_2 = permute_bytes16((uint16<16>) a.vec(2), perm_mask);
    a32_3 = permute_bytes16((uint16<16>) a.vec(3), perm_mask);
    a64_0 = zip4_lo(a32_0, a32_1);
    a64_1 = zip4_lo(a32_2, a32_3);
    a64_0 = zip2_lo(a64_0, a64_1);
    a32_0 = permute4<0,2,1,3>(a64_0);
    a32_0 = permute4<0,2,1,3>(a32_0);
    return (uint16<16>) a32_0;
#endif
}
#endif
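
// In the non-AVX512F path above, two permute4 passes are needed: the first
// operates on 64-bit lanes (the argument is a uint64<4>), the second on 32-bit
// lanes, because the preceding zips leave the packed results interleaved at
// both granularities.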

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint16<32> i_to_uint16(const uint64<32>& a)
{
    uint16<8> r0 = _mm512_cvtepi64_epi16(a.vec(0).native());
    uint16<8> r1 = _mm512_cvtepi64_epi16(a.vec(1).native());
    uint16<8> r2 = _mm512_cvtepi64_epi16(a.vec(2).native());
    uint16<8> r3 = _mm512_cvtepi64_epi16(a.vec(3).native());
    return combine(combine(r0, r1), combine(r2, r3));
}
#endif

template<unsigned N> SIMDPP_INL
uint16<N> i_to_uint16(const uint64<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_EXTRACT(uint16<N>, i_to_uint16, a)
}
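
// These i_to_uint16 routines are internal: user code reaches them through the
// public conversion wrappers in simdpp/core/, which dispatch into this
// detail::insn namespace. A minimal usage sketch, assuming the public wrapper
// is named to_uint16 (values are illustrative only):
//
//     simdpp::uint32<8> v = simdpp::make_uint(0x10001, 2, 3, 4, 5, 6, 7, 8);
//     simdpp::uint16<8> w = simdpp::to_uint16(v);
//     // simdpp::extract<0>(w) == 1, since 0x10001 is truncated to 16 bits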

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif