/* Copyright (C) 2017 Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_CORE_I_SHIFT_L_V_H
#define LIBSIMDPP_SIMDPP_CORE_I_SHIFT_L_V_H

#ifndef LIBSIMDPP_SIMD_H
#error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/null/math.h>
#include <simdpp/detail/insn/i_shift.h>
#include <simdpp/detail/shuffle/shuffle_mask.h>
#include <simdpp/core/i_mul.h>
#include <simdpp/core/permute_bytes16.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {
// emulates 8-bit variable shift using 16-bit variable shift
template<class U8> SIMDPP_INL
U8 v_emul_shift_l_v8_using_v16(const U8& a, const U8& count)
{
    using U16 = typename same_width<U8>::u16;

    U16 a16; a16 = a;
    U16 c16; c16 = count;

    // Process the two 8-bit halves of each 16-bit element separately. The
    // high half can be shifted in place, because bits shifted out of it
    // simply fall off the top of the 16-bit element.
    U16 select_mask = make_uint(0xff00);
    U16 a_lo = a16;
    U16 a_hi = bit_and(a16, select_mask);
    U16 c_lo = bit_andnot(c16, select_mask);
    U16 c_hi = shift_r<8>(c16);
    a_lo = shift_l(a_lo, c_lo);
    a_hi = shift_l(a_hi, c_hi);
    // clear the bits that the low half shifted into the high byte
    a_lo = bit_andnot(a_lo, select_mask);

    a16 = bit_or(a_lo, a_hi);
    return (U8) a16;
}
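
// For reference, both byte-shift emulations in this file compute the
// following per-lane operation (a minimal scalar sketch; the lane count N
// and shift counts in the range [0, 7] are assumptions of the illustration):
//
//     for (unsigned i = 0; i < N; ++i)
//         r[i] = uint8_t(a[i] << count[i]);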

// emulates 8-bit variable shift using permute_bytes16 and 16-bit multiplication
template<class U8> SIMDPP_INL
U8 v_emul_shift_l_v8_using_mul(const U8& a, const U8& count)
{
    using U16 = typename same_width<U8>::u16;

    // Variable shift is implemented by obtaining 1 << countN for each element
    // of count and then multiplying the corresponding element of a by that
    // number. The implementation is complicated by the fact that only 16-bit
    // multiplication is available.
    U8 mulshift_mask = make_uint(0x01, 0x02, 0x04, 0x08,
                                 0x10, 0x20, 0x40, 0x80);
    U16 mulshift = (U16) permute_bytes16(mulshift_mask, count);

    U16 a16; a16 = a;
    U16 a16_lo, a16_hi, mulshift_lo, mulshift_hi;
    U16 select_mask = make_uint(0x00ff);

    // Move the element values to the high byte of the 16-bit elements and the
    // shift values to the low byte. The products then have the low byte
    // clear, which helps composing the result back into a single vector.
    a16_lo = shift_l<8>(a16);
    mulshift_lo = bit_and(mulshift, select_mask);
    a16_hi = bit_andnot(a16, select_mask);
    mulshift_hi = shift_r<8>(mulshift);

    a16_lo = mul_lo(a16_lo, mulshift_lo);
    a16_hi = mul_lo(a16_hi, mulshift_hi);

    a16_lo = shift_r<8>(a16_lo);
    a16 = bit_or(a16_lo, a16_hi);
    return (U8) a16;
}
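
// The lookup-and-multiply idea above, written out in scalar form
// (illustration only; as in the vector code, the shift count is assumed to
// be in the range [0, 7]):
//
//     uint8_t mulshift = uint8_t(1) << count; // done via permute_bytes16
//     uint8_t r = uint8_t(a * mulshift);      // equals a << count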

static SIMDPP_INL
uint8<16> i_shift_l_v(const uint8<16>& a, const uint8<16>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_l_v(a, count);
#elif SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return v_emul_shift_l_v8_using_v16(a, count);
#elif SIMDPP_USE_SSSE3
    return v_emul_shift_l_v8_using_mul(a, count);
#elif SIMDPP_USE_NEON
    return vshlq_u8(a.native(), vreinterpretq_s8_u8(count.native()));
#elif SIMDPP_USE_ALTIVEC
    return vec_sl(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_sll_b((v16i8)a.native(), (v16i8)count.native());
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, count);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint8<32> i_shift_l_v(const uint8<32>& a, const uint8<32>& count)
{
#if SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return v_emul_shift_l_v8_using_v16(a, count);
#else
    return v_emul_shift_l_v8_using_mul(a, count);
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
uint8<64> i_shift_l_v(const uint8<64>& a, const uint8<64>& count)
{
    return v_emul_shift_l_v8_using_v16(a, count);
}
#endif

// -----------------------------------------------------------------------------

// emulates 16-bit variable shift using permute_bytes16 and 16-bit multiplication
template<class U16> SIMDPP_INL
U16 v_emul_shift_l_v16_using_mul(const U16& a, const U16& count)
{
    using U8 = typename same_width<U16>::u8;

    // Variable shift is implemented by obtaining 1 << countN for each element
    // of count and then multiplying the corresponding element of a by that
    // number. The implementation is complicated by the fact that
    // permute_bytes16 permutes 8-bit elements instead of the 16-bit elements
    // that would be optimal in this case.
    U8 mulshift_mask = make_uint(0x01, 0x02, 0x04, 0x08,
                                 0x10, 0x20, 0x40, 0x80,
                                 0x00, 0x00, 0x00, 0x00,
                                 0x00, 0x00, 0x00, 0x00);
    // Place the shift count into both bytes of each 16-bit element, then
    // toggle bit 3 of the count in the high byte. As a result, for counts in
    // [0, 7] the high byte selects zeros from mulshift_mask, and for counts
    // in [8, 15] the low byte selects zeros while the high byte selects
    // 1 << (count - 8). Either way the 16-bit element becomes 1 << count.
    U16 qcount = bit_or(count, shift_l<8>(count));
    qcount = bit_xor(qcount, 0x0800);
    U16 mulshift = (U16) permute_bytes16(mulshift_mask, (U8) qcount);
    return mul_lo(a, mulshift);
}
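
// Worked example of the bit toggle above, for count = 3 and count = 11
// (values chosen for illustration):
//
//     count = 3:  low byte looks up index 3           -> 0x08
//                 high byte looks up index 3 ^ 8 = 11 -> 0x00
//                 mulshift = 0x0008 == 1 << 3
//     count = 11: low byte looks up index 11          -> 0x00
//                 high byte looks up index 11 ^ 8 = 3 -> 0x08
//                 mulshift = 0x0800 == 1 << 11
//
// In both cases mul_lo(a, mulshift) produces a << count.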

static SIMDPP_INL
uint16<8> i_shift_l_v(const uint16<8>& a, const uint16<8>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_l_v(a, count);
#elif SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return _mm_sllv_epi16(a.native(), count.native());
#elif SIMDPP_USE_SSSE3
    return v_emul_shift_l_v16_using_mul(a, count);
#elif SIMDPP_USE_NEON
    return vshlq_u16(a.native(), vreinterpretq_s16_u16(count.native()));
#elif SIMDPP_USE_ALTIVEC
    return vec_sl(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_sll_h((v8i16)a.native(), (v8i16)count.native());
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, count);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint16<16> i_shift_l_v(const uint16<16>& a, const uint16<16>& count)
{
#if SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return _mm256_sllv_epi16(a.native(), count.native());
#else
    return v_emul_shift_l_v16_using_mul(a, count);
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint16<32> i_shift_l_v(const uint16<32>& a, const uint16<32>& count)
{
    return _mm512_sllv_epi16(a.native(), count.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint32<4> i_shift_l_v(const uint32<4>& a, const uint32<4>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_l_v(a, count);
#elif SIMDPP_USE_AVX2
    return _mm_sllv_epi32(a.native(), count.native());
#elif SIMDPP_USE_SSE2
    // SSE2 only provides _mm_sll_epi32, which shifts all elements by the
    // count stored in the low 64 bits of its second operand. Build four
    // vectors that each carry one element's count in the low 64 bits, shift
    // by each of them, and blend the correct element out of each result.
    uint32<4> count0 = count;
#if SIMDPP_USE_SSE4_1
    uint32<4> zero = make_zero();
    count0 = _mm_blend_epi16(count0.native(), zero.native(), 0xcc);
#else
    uint32<4> mask = make_uint(0xffffffff, 0, 0xffffffff, 0);
    count0 = bit_and(count0, mask);
#endif
    uint32<4> count1 = _mm_srli_epi64(count.native(), 32);
    uint32<4> count2 = _mm_srli_si128(count0.native(), 8);
    uint32<4> count3 = _mm_srli_si128(count.native(), 12);

    __m128i a0 = _mm_sll_epi32(a.native(), count0.native());
    __m128i a1 = _mm_sll_epi32(a.native(), count1.native());
    __m128i a2 = _mm_sll_epi32(a.native(), count2.native());
    __m128i a3 = _mm_sll_epi32(a.native(), count3.native());
    // combine the results: element 0 from a0, element 1 from a1, element 2
    // from a2 and element 3 from a3
#if SIMDPP_USE_SSE4_1
    a0 = _mm_blend_epi16(a0, a1, 0x0c);
    a2 = _mm_blend_epi16(a2, a3, 0xc0);
    a0 = _mm_blend_epi16(a0, a2, 0xf0);
#else
    __m128 f0 = _mm_shuffle_ps(_mm_castsi128_ps(a0),
                               _mm_castsi128_ps(a1),
                               SIMDPP_SHUFFLE_MASK_4x4(0, 0, 1, 1));
    __m128 f1 = _mm_shuffle_ps(_mm_castsi128_ps(a2),
                               _mm_castsi128_ps(a3),
                               SIMDPP_SHUFFLE_MASK_4x4(2, 2, 3, 3));
    f0 = _mm_shuffle_ps(f0, f1, SIMDPP_SHUFFLE_MASK_4x4(0, 2, 0, 2));
    a0 = _mm_castps_si128(f0);
#endif
    return a0;
#elif SIMDPP_USE_NEON
    return vshlq_u32(a.native(), vreinterpretq_s32_u32(count.native()));
#elif SIMDPP_USE_ALTIVEC
    return vec_sl(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_sll_w((v4i32)a.native(), (v4i32)count.native());
#endif
}
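
// Usage sketch (hypothetical values): each 32-bit lane of the first operand
// is shifted left by the count in the corresponding lane of the second.
//
//     uint32<4> v = make_uint(1, 1, 1, 1);
//     uint32<4> c = make_uint(0, 1, 2, 3);
//     uint32<4> r = i_shift_l_v(v, c); // r = { 1, 2, 4, 8 }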

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32<8> i_shift_l_v(const uint32<8>& a, const uint32<8>& count)
{
    return _mm256_sllv_epi32(a.native(), count.native());
}
#endif

#if SIMDPP_USE_AVX512F
SIMDPP_INL uint32<16> i_shift_l_v(const uint32<16>& a, const uint32<16>& count)
{
    return _mm512_sllv_epi32(a.native(), count.native());
}
#endif
// -----------------------------------------------------------------------------

// generic case: apply the operation to each native-width sub-vector of a
// wider vector
template<class V, class U> SIMDPP_INL
V i_shift_l_v(const V& a, const U& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(V, i_shift_l_v, a, b);
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif