i_shift_r_v.h source code [bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/detail/insn/i_shift_r_v.h]

1	/*  Copyright (C) 2017  Povilas Kanapickas <povilas@radix.lt>
2
3	Distributed under the Boost Software License, Version 1.0.
4	(See accompanying file LICENSE_1_0.txt or copy at
5	http://www.boost.org/LICENSE_1_0.txt)
6	*/
7
8	#ifndef LIBSIMDPP_SIMDPP_CORE_I_SHIFT_R_V_H
9	#define LIBSIMDPP_SIMDPP_CORE_I_SHIFT_R_V_H
10
11	#ifndef LIBSIMDPP_SIMD_H
12	#error "This file must be included through simd.h"
13	#endif
14
15	#include <simdpp/types.h>
16	#include <simdpp/detail/null/math.h>
17	#include <simdpp/detail/insn/i_shift.h>
18	#include <simdpp/detail/shuffle/shuffle_mask.h>
19	#include <simdpp/core/i_neg.h>
20	#include <simdpp/core/i_mul.h>
21	#include <simdpp/core/permute_bytes16.h>
22	#include <simdpp/detail/vector_array_macros.h>
23
24	namespace simdpp {
25	namespace SIMDPP_ARCH_NAMESPACE {
26	namespace detail {
27	namespace insn {
28
29	// emulates 8-bit variable shift using 16-bit variable shift
30	template<class U8> SIMDPP_INL
31	U8 v_emul_shift_r_u8_using_v16(const U8& a, const U8& count)
32	{
33	using U16 = typename same_width<U8>::u16;
34
35	U16 a16; a16 = a;
36	U16 c16; c16 = count;
37
38	U16 select_mask = make_uint(`0x00ff`);
39	U16 a_lo = bit_and(a16, select_mask);
40	U16 a_hi = a16;
41	U16 c_lo = bit_and(c16, select_mask);
42	U16 c_hi = shift_r<`8`>(c16);
43	a_lo = shift_r(a_lo, c_lo);
44	a_hi = shift_r(a_hi, c_hi);
45	a_hi = bit_andnot(a_hi, select_mask);
46
47	a16 = bit_or(a_lo, a_hi);
48	return (U8) a16;
49	}
50
51	// emulates 8-bit variable shift using permute_bytes16 and 16-bit multiplication
52	template<class U8> SIMDPP_INL
53	U8 v_emul_shift_r_u8_using_mul(const U8& a, const U8& count)
54	{
55	using U16 = typename same_width<U8>::u16;
56
57	// Variable shift is implemented by reusing shifter in 16-bit unsigned
58	// multiplication. The result is obtained by computing 1 << (8-countN)
59	// for each element from a, multiplying each element by that number and
60	// selecting the high half of the result.
61	U8 mulshift_mask = make_uint(`0x80`, `0x40`, `0x20`, `0x10`,
62	`0x08`, `0x04`, `0x02`, `0x01`,
63	`0x00`, `0x00`, `0x00`, `0x00`,
64	`0x00`, `0x00`, `0x00`, `0x00`);
65	U16 mulshift = (U16) permute_bytes16(mulshift_mask, count);
66	U16 a16; a16 = a;
67	U16 a16_lo, a16_hi, mulshift_lo, mulshift_hi;
68	U16 select_mask = make_uint(`0x00ff`);
69
70	// Move the element values to the high byte of the 16-bit elements and the
71	// shift values to the low 9 bits. The 9-th bit is needed because in order
72	// to shift by 0 the element values need to be multiplied by 0x100.
73	// The results will have the high byte clear which will help composing the
74	// result back to a single vector.
75	a16_lo = shift_l<`8`>(a16);
76	mulshift_lo = bit_and(mulshift, select_mask);
77	mulshift_lo = shift_l<`1`>(mulshift_lo);
78	a16_hi = bit_andnot(a16, select_mask);
79	mulshift_hi = shift_l<`1`>(shift_r<`8`>(mulshift));
80
81	a16_lo = mul_hi(a16_lo, mulshift_lo);
82	a16_hi = mul_hi(a16_hi, mulshift_hi);
83
84	a16_hi = shift_l<`8`>(a16_hi);
85	a16 = bit_or(a16_lo, a16_hi);
86	return (U8) a16;
87	}
88
89	static SIMDPP_INL
90	uint8<`16`> i_shift_r_v(const uint8<`16`>& a, const uint8<`16`>& count)
91	{
92	#if SIMDPP_USE_NULL
93	return detail::null::shift_r_v(a, count);
94	#elif SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
95	return v_emul_shift_r_u8_using_v16(a, count);
96	#elif SIMDPP_USE_SSSE3
97	return v_emul_shift_r_u8_using_mul(a, count);
98	#elif SIMDPP_USE_NEON
99	int8<`16`> qcount = neg((int8<`16`>)count);
100	return vshlq_u8(a.native(), qcount.native());
101	#elif SIMDPP_USE_ALTIVEC
102	return vec_sr(a.native(), count.native());
103	#elif SIMDPP_USE_MSA
104	return (v16u8) __msa_srl_b((v16i8)a.native(), (v16i8)count.native());
105	#else
106	return SIMDPP_NOT_IMPLEMENTED2(a, count);
107	#endif
108	}
109
110	#if SIMDPP_USE_AVX2
111	static SIMDPP_INL
112	uint8<`32`> i_shift_r_v(const uint8<`32`>& a, const uint8<`32`>& count)
113	{
114	#if SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
115	return v_emul_shift_r_u8_using_v16(a, count);
116	#else
117	return v_emul_shift_r_u8_using_mul(a, count);
118	#endif
119	}
120	#endif
121
122	#if SIMDPP_USE_AVX512BW
123	static SIMDPP_INL
124	uint8<`64`> i_shift_r_v(const uint8<`64`>& a, const uint8<`64`>& count)
125	{
126	return v_emul_shift_r_u8_using_v16(a, count);
127	}
128	#endif
129
130	// -----------------------------------------------------------------------------
131
132	// emulates 8-bit variable shift using 16-bit variable shift
133	template<class I8, class U8> SIMDPP_INL
134	I8 v_emul_shift_r_i8_using_v16(const I8& a, const U8& count)
135	{
136	using I16 = typename same_width<I8>::i16;
137	using U16 = typename same_width<I8>::u16;
138
139	U16 a16; a16 = a;
140	U16 c16; c16 = count;
141
142	U16 select_mask = make_uint(`0x00ff`);
143	U16 a_lo = shift_l<`8`>(a16);
144	U16 a_hi = a16;
145	U16 c_lo = bit_and(c16, select_mask);
146	U16 c_hi = shift_r<`8`>(c16);
147	a_lo = shift_r((I16)a_lo, c_lo);
148	a_hi = shift_r((I16)a_hi, c_hi);
149
150	a_lo = shift_r<`8`>(a_lo);
151	a_hi = bit_andnot(a_hi, select_mask);
152	a16 = bit_or(a_lo, a_hi);
153
154	return (I8) a16;
155	}
156
157	template<class I8, class U8> SIMDPP_INL
158	I8 v_emul_shift_r_i8_using_mul(const I8& a, const U8& count)
159	{
160	using U16 = typename same_width<U8>::u16;
161	using I16 = typename same_width<U8>::i16;
162
163	// Variable shift is implemented by reusing shifter in 16-bit signed
164	// multiplication. The result is obtained by computing 1 << (8-countN)
165	// for each element from a, multiplying each element by that number and
166	// selecting the high half of the result.
167	U8 mulshift_mask = make_uint(`0x80`, `0x40`, `0x20`, `0x10`,
168	`0x08`, `0x04`, `0x02`, `0x01`,
169	`0x00`, `0x00`, `0x00`, `0x00`,
170	`0x00`, `0x00`, `0x00`, `0x00`);
171	U16 mulshift = (U16) permute_bytes16(mulshift_mask, count);
172	U16 a16; a16 = a;
173	U16 a16_lo, a16_hi, mulshift_lo, mulshift_hi;
174	U16 select_mask = make_uint(`0x00ff`);
175
176	// Move the element values to the high byte of the 16-bit elements and the
177	// shift values to the low 9 bits. The 9-th bit is needed because in order
178	// to shift by 0 the element values need to be multiplied by 0x100.
179	// Note that the results may have nonzero high byte because this is signed
180	// multiplication.
181	a16_lo = shift_l<`8`>(a16);
182	mulshift_lo = bit_and(mulshift, select_mask);
183	mulshift_lo = shift_l<`1`>(mulshift_lo);
184	a16_hi = bit_andnot(a16, select_mask);
185	mulshift_hi = shift_l<`1`>(shift_r<`8`>(mulshift));
186
187	a16_lo = mul_hi((I16)a16_lo, (I16)mulshift_lo);
188	a16_hi = mul_hi((I16)a16_hi, (I16)mulshift_hi);
189
190	a16_hi = shift_l<`8`>(a16_hi);
191	a16_lo = bit_and(a16_lo, select_mask);
192	a16 = bit_or(a16_lo, a16_hi);
193	return (U8) a16;
194	}
195
196	static SIMDPP_INL
197	int8<`16`> i_shift_r_v(const int8<`16`>& a, const uint8<`16`>& count)
198	{
199	#if SIMDPP_USE_NULL
200	return detail::null::shift_r_v(a, count);
201	#elif SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
202	return v_emul_shift_r_i8_using_v16(a, count);
203	#elif SIMDPP_USE_SSSE3
204	return v_emul_shift_r_i8_using_mul(a, count);
205	#elif SIMDPP_USE_NEON
206	int8<`16`> qcount = neg((int8<`16`>)count);
207	return vshlq_s8(a.native(), qcount.native());
208	#elif SIMDPP_USE_ALTIVEC
209	return vec_sra(a.native(), count.native());
210	#elif SIMDPP_USE_MSA
211	return __msa_sra_b(a.native(), (v16i8) count.native());
212	#else
213	return SIMDPP_NOT_IMPLEMENTED2(a, count);
214	#endif
215	}
216
217	#if SIMDPP_USE_AVX2
218	static SIMDPP_INL
219	int8<`32`> i_shift_r_v(const int8<`32`>& a, const uint8<`32`>& count)
220	{
221	#if SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
222	return v_emul_shift_r_i8_using_v16(a, count);
223	#else
224	return v_emul_shift_r_i8_using_mul(a, count);
225	#endif
226	}
227	#endif
228
229	#if SIMDPP_USE_AVX512BW
230	static SIMDPP_INL
231	int8<`64`> i_shift_r_v(const int8<`64`>& a, const uint8<`64`>& count)
232	{
233	return v_emul_shift_r_i8_using_v16(a, count);
234	}
235	#endif
236
237	// -----------------------------------------------------------------------------
238
239	// emulates 16-bit variable shift using permute_bytes16 and 16-bit multiplication
240	template<class U16>
241	U16 v_emul_shift_r_u16_using_mul(const U16& a, const U16& count)
242	{
243	using U8 = typename same_width<U16>::u8;
244	using M16 = typename U16::mask_vector_type;
245	// Variable shift is implemented by reusing shifter in 16-bit unsigned
246	// multiplication. The result is obtained by computing 1 << (16-countN-1)
247	// for each element from a, multiplying each element by that number and
248	// selecting the high half of the result. Note that the highest shift
249	// available when using 16-bit multiplication is 15, which needs to be
250	// worked around by extra instructions.
251	M16 is_same = cmp_eq(count, `0`);
252	M16 is_zero = cmp_gt(count, `15`);
253
254	U8 mulshift_mask = make_uint(`0x00`, `0x80`, `0x40`, `0x20`,
255	`0x10`, `0x08`, `0x04`, `0x02`,
256	`0x01`, `0x00`, `0x00`, `0x00`,
257	`0x00`, `0x00`, `0x00`, `0x00`);
258
259	// permute_bytes16 permutes 8-bit elements instead of 16 which would be
260	// optimal in this case. We need to construct the selector in special way
261	// for 8-bit permutation.
262	// The 4-th is toggled bit so that the high byte takes zeros from the
263	// mulshift mask when the shift count is higher than 8.
264	U16 qcount = bit_or(count, shift_l<`8`>(count));
265	qcount = bit_xor(qcount, `0x0008`);
266
267	U16 mulshift = (U16) permute_bytes16(mulshift_mask, (U8) qcount);
268	U16 res = mul_hi(a, mulshift);
269	res = blend(a, res, is_same);
270	res = bit_andnot(res, is_zero);
271	return res;
272	}
273
274	static SIMDPP_INL
275	uint16<`8`> i_shift_r_v(const uint16<`8`>& a, const uint16<`8`>& count)
276	{
277	#if SIMDPP_USE_NULL
278	return detail::null::shift_r_v(a, count);
279	#elif SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
280	return _mm_srlv_epi16(a.native(), count.native());
281	#elif SIMDPP_USE_SSSE3
282	return v_emul_shift_r_u16_using_mul(a, count);
283	#elif SIMDPP_USE_NEON
284	int16<`8`> qcount = neg((int16<`8`>)count);
285	return vshlq_u16(a.native(), qcount.native());
286	#elif SIMDPP_USE_ALTIVEC
287	return vec_sr(a.native(), count.native());
288	#elif SIMDPP_USE_MSA
289	return (v8u16) __msa_srl_h((v8i16)a.native(), (v8i16)count.native());
290	#else
291	return SIMDPP_NOT_IMPLEMENTED2(a, count);
292	#endif
293	}
294
295	#if SIMDPP_USE_AVX2
296	static SIMDPP_INL
297	uint16<`16`> i_shift_r_v(const uint16<`16`>& a, const uint16<`16`>& count)
298	{
299	#if SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
300	return _mm256_srlv_epi16(a.native(), count.native());
301	#else
302	return v_emul_shift_r_u16_using_mul(a, count);
303	#endif
304	}
305	#endif
306
307	#if SIMDPP_USE_AVX512BW
308	SIMDPP_INL uint16<`32`> i_shift_r_v(const uint16<`32`>& a, const uint16<`32`>& count)
309	{
310	return _mm512_srlv_epi16(a.native(), count.native());
311	}
312	#endif
313
314	// -----------------------------------------------------------------------------
315
316	static SIMDPP_INL
317	int16<`8`> i_shift_r_v(const int16<`8`>& a, const uint16<`8`>& count)
318	{
319	#if SIMDPP_USE_NULL
320	return detail::null::shift_r_v(a, count);
321	#elif SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
322	return _mm_srav_epi16(a.native(), count.native());
323	#elif SIMDPP_USE_AVX512BW
324	__m512i a512 = _mm512_castsi128_si512(a.native());
325	__m512i count512 = _mm512_castsi128_si512(count.native());
326	return _mm512_castsi512_si128(_mm512_srav_epi16(a512, count512));
327	#elif SIMDPP_USE_NEON
328	int16<`8`> qcount = neg((int16<`8`>)count);
329	return vshlq_s16(a.native(), qcount.native());
330	#elif SIMDPP_USE_ALTIVEC
331	return vec_sra(a.native(), count.native());
332	#elif SIMDPP_USE_MSA
333	return __msa_sra_h(a.native(), (v8i16) count.native());
334	#else
335	return SIMDPP_NOT_IMPLEMENTED2(a, count);
336	#endif
337	}
338
339	#if SIMDPP_USE_AVX2
340	static SIMDPP_INL
341	int16<`16`> i_shift_r_v(const int16<`16`>& a, const uint16<`16`>& count)
342	{
343	#if SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
344	return _mm256_srav_epi16(a.native(), count.native());
345	#elif SIMDPP_USE_AVX512BW
346	__m512i a512 = _mm512_castsi256_si512(a.native());
347	__m512i count512 = _mm512_castsi256_si512(count.native());
348	return _mm512_castsi512_si256(_mm512_srav_epi16(a512, count512));
349	#else
350	return SIMDPP_NOT_IMPLEMENTED2(a, count);
351	#endif
352	}
353	#endif
354
355	#if SIMDPP_USE_AVX512BW
356	SIMDPP_INL int16<`32`> i_shift_r_v(const int16<`32`>& a, const uint16<`32`>& count)
357	{
358	return _mm512_srav_epi16(a.native(), count.native());
359	}
360	#endif
361
362	// -----------------------------------------------------------------------------
363
364	static SIMDPP_INL
365	uint32<`4`> i_shift_r_v(const uint32<`4`>& a, const uint32<`4`>& count)
366	{
367	#if SIMDPP_USE_NULL
368	return detail::null::shift_r_v(a, count);
369	#elif SIMDPP_USE_AVX2
370	return _mm_srlv_epi32(a.native(), count.native());
371	#elif SIMDPP_USE_SSE2
372	uint32<`4`> count0 = count;
373	#if SIMDPP_USE_SSE4_1
374	uint32<`4`> zero = make_zero();
375	count0 = _mm_blend_epi16(count0.native(), zero.native(), `0xcc`);
376	#else
377	uint32<`4`> mask = make_uint(`0xffffffff`, `0`, `0xffffffff`, `0`);
378	count0 = bit_and(count0, mask);
379	#endif
380	uint32<`4`> count1 = _mm_srli_epi64(count.native(), `32`);
381	uint32<`4`> count2 = _mm_srli_si128(count0.native(), `8`);
382	uint32<`4`> count3 = _mm_srli_si128(count.native(), `12`);
383
384	__m128i a0 = _mm_srl_epi32(a.native(), count0.native());
385	__m128i a1 = _mm_srl_epi32(a.native(), count1.native());
386	__m128i a2 = _mm_srl_epi32(a.native(), count2.native());
387	__m128i a3 = _mm_srl_epi32(a.native(), count3.native());
388	#if SIMDPP_USE_SSE4_1
389	a0 = _mm_blend_epi16(a0, a1, `0x0c`);
390	a2 = _mm_blend_epi16(a2, a3, `0xc0`);
391	a0 = _mm_blend_epi16(a0, a2, `0xf0`);
392	#else
393	__m128 f0 = _mm_shuffle_ps(_mm_castsi128_ps(a0),
394	_mm_castsi128_ps(a1),
395	SIMDPP_SHUFFLE_MASK_4x4(`0`, `0`, `1`, `1`));
396	__m128 f1 = _mm_shuffle_ps(_mm_castsi128_ps(a2),
397	_mm_castsi128_ps(a3),
398	SIMDPP_SHUFFLE_MASK_4x4(`2`, `2`, `3`, `3`));
399	f0 = _mm_shuffle_ps(f0, f1, SIMDPP_SHUFFLE_MASK_4x4(`0`, `2`, `0`, `2`));
400	a0 = _mm_castps_si128(f0);
401	#endif
402	return a0;
403	#elif SIMDPP_USE_NEON
404	int32<`4`> qcount = neg((int32<`4`>)count);
405	return vshlq_u32(a.native(), qcount.native());
406	#elif SIMDPP_USE_ALTIVEC
407	return vec_sr(a.native(), count.native());
408	#elif SIMDPP_USE_MSA
409	return (v4u32) __msa_srl_w((v4i32)a.native(), (v4i32)count.native());
410	#endif
411	}
412
413	#if SIMDPP_USE_AVX2
414	static SIMDPP_INL
415	uint32<`8`> i_shift_r_v(const uint32<`8`>& a, const uint32<`8`>& count)
416	{
417	return _mm256_srlv_epi32(a.native(), count.native());
418	}
419	#endif
420
421	#if SIMDPP_USE_AVX512F
422	static SIMDPP_INL
423	uint32<`16`> i_shift_r_v(const uint32<`16`>& a, const uint32<`16`>& count)
424	{
425	return _mm512_srlv_epi32(a.native(), count.native());
426	}
427	#endif
428
429	// -----------------------------------------------------------------------------
430
431	static SIMDPP_INL
432	int32<`4`> i_shift_r_v(const int32<`4`>& a, const uint32<`4`>& count)
433	{
434	#if SIMDPP_USE_NULL
435	return detail::null::shift_r_v(a, count);
436	#elif SIMDPP_USE_AVX2
437	return _mm_srav_epi32(a.native(), count.native());
438	#elif SIMDPP_USE_SSE2
439	uint32<`4`> count0 = count;
440	#if SIMDPP_USE_SSE4_1
441	uint32<`4`> zero = make_zero();
442	count0 = _mm_blend_epi16(count0.native(), zero.native(), `0xcc`);
443	#else
444	uint32<`4`> mask = make_uint(`0xffffffff`, `0`, `0xffffffff`, `0`);
445	count0 = bit_and(count0, mask);
446	#endif
447	uint32<`4`> count1 = _mm_srli_epi64(count.native(), `32`);
448	uint32<`4`> count2 = _mm_srli_si128(count0.native(), `8`);
449	uint32<`4`> count3 = _mm_srli_si128(count.native(), `12`);
450
451	__m128i a0 = _mm_sra_epi32(a.native(), count0.native());
452	__m128i a1 = _mm_sra_epi32(a.native(), count1.native());
453	__m128i a2 = _mm_sra_epi32(a.native(), count2.native());
454	__m128i a3 = _mm_sra_epi32(a.native(), count3.native());
455	#if SIMDPP_USE_SSE4_1
456	a0 = _mm_blend_epi16(a0, a1, `0x0c`);
457	a2 = _mm_blend_epi16(a2, a3, `0xc0`);
458	a0 = _mm_blend_epi16(a0, a2, `0xf0`);
459	#else
460	__m128 f0 = _mm_shuffle_ps(_mm_castsi128_ps(a0),
461	_mm_castsi128_ps(a1),
462	SIMDPP_SHUFFLE_MASK_4x4(`0`, `0`, `1`, `1`));
463	__m128 f1 = _mm_shuffle_ps(_mm_castsi128_ps(a2),
464	_mm_castsi128_ps(a3),
465	SIMDPP_SHUFFLE_MASK_4x4(`2`, `2`, `3`, `3`));
466	f0 = _mm_shuffle_ps(f0, f1, SIMDPP_SHUFFLE_MASK_4x4(`0`, `2`, `0`, `2`));
467	a0 = _mm_castps_si128(f0);
468	#endif
469	return a0;
470	#elif SIMDPP_USE_NEON
471	int32<`4`> qcount = neg((int32<`4`>)count);
472	return vshlq_s32(a.native(), qcount.native());
473	#elif SIMDPP_USE_ALTIVEC
474	return vec_sra(a.native(), count.native());
475	#elif SIMDPP_USE_MSA
476	return __msa_sra_w(a.native(), (v4i32)count.native());
477	#endif
478	}
479
480	#if SIMDPP_USE_AVX2
481	static SIMDPP_INL
482	int32<`8`> i_shift_r_v(const int32<`8`>& a, const uint32<`8`>& count)
483	{
484	return _mm256_srav_epi32(a.native(), count.native());
485	}
486	#endif
487
488	#if SIMDPP_USE_AVX512F
489	static SIMDPP_INL
490	int32<`16`> i_shift_r_v(const int32<`16`>& a, const uint32<`16`>& count)
491	{
492	return _mm512_srav_epi32(a.native(), count.native());
493	}
494	#endif
495
496	// -----------------------------------------------------------------------------
497
498	template<class V, class U> SIMDPP_INL
499	V i_shift_r_v(const V& a, const U& b)
500	{
501	SIMDPP_VEC_ARRAY_IMPL2(V, i_shift_r_v, a, b);
502	}
503
504	} // namespace insn
505	} // namespace detail
506	} // namespace SIMDPP_ARCH_NAMESPACE
507	} // namespace simdpp
508
509	#endif
510

Browse the source code of bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/detail/insn/i_shift_r_v.h