/*  Copyright (C) 2017  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_CORE_I_SHIFT_R_V_H
#define LIBSIMDPP_SIMDPP_CORE_I_SHIFT_R_V_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/null/math.h>
#include <simdpp/detail/insn/i_shift.h>
#include <simdpp/detail/shuffle/shuffle_mask.h>
#include <simdpp/core/i_neg.h>
#include <simdpp/core/i_mul.h>
#include <simdpp/core/permute_bytes16.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {
// emulates 8-bit variable shift using 16-bit variable shift
template<class U8> SIMDPP_INL
U8 v_emul_shift_r_u8_using_v16(const U8& a, const U8& count)
{
    using U16 = typename same_width<U8>::u16;

    U16 a16; a16 = a;
    U16 c16; c16 = count;

    U16 select_mask = make_uint(0x00ff);
    U16 a_lo = bit_and(a16, select_mask);
    U16 a_hi = a16;
    U16 c_lo = bit_and(c16, select_mask);
    U16 c_hi = shift_r<8>(c16);
    a_lo = shift_r(a_lo, c_lo);
    a_hi = shift_r(a_hi, c_hi);
    a_hi = bit_andnot(a_hi, select_mask);

    a16 = bit_or(a_lo, a_hi);
    return (U8) a16;
}
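
// For reference, a scalar sketch of the technique above (illustrative only):
// each 16-bit lane packs two 8-bit elements; the low element is shifted
// within the low byte, the high element is shifted in place, and bits that
// cross into the low byte are masked off:
//
//   uint16_t lo = (a16 & 0x00ff) >> (c16 & 0x00ff); // low element
//   uint16_t hi = (a16 >> (c16 >> 8)) & 0xff00;     // high element
//   uint16_t r  = lo | hi;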

// emulates 8-bit variable shift using permute_bytes16 and 16-bit multiplication
template<class U8> SIMDPP_INL
U8 v_emul_shift_r_u8_using_mul(const U8& a, const U8& count)
{
    using U16 = typename same_width<U8>::u16;

    // Variable shift is implemented by reusing the shifter of the 16-bit
    // unsigned multiplier. The result is obtained by computing
    // 1 << (8-countN) for each element of a, multiplying each element by
    // that number and selecting the high half of the result.
    U8 mulshift_mask = make_uint(0x80, 0x40, 0x20, 0x10,
                                 0x08, 0x04, 0x02, 0x01,
                                 0x00, 0x00, 0x00, 0x00,
                                 0x00, 0x00, 0x00, 0x00);
    U16 mulshift = (U16) permute_bytes16(mulshift_mask, count);
    U16 a16; a16 = a;
    U16 a16_lo, a16_hi, mulshift_lo, mulshift_hi;
    U16 select_mask = make_uint(0x00ff);

    // Move the element values to the high byte of the 16-bit elements and
    // the shift values to the low 9 bits. The 9-th bit is needed because
    // shifting by 0 requires multiplying the element values by 0x100.
    // The results have the high byte clear, which helps compose the result
    // back into a single vector.
    a16_lo = shift_l<8>(a16);
    mulshift_lo = bit_and(mulshift, select_mask);
    mulshift_lo = shift_l<1>(mulshift_lo);
    a16_hi = bit_andnot(a16, select_mask);
    mulshift_hi = shift_l<1>(shift_r<8>(mulshift));

    a16_lo = mul_hi(a16_lo, mulshift_lo);
    a16_hi = mul_hi(a16_hi, mulshift_hi);

    a16_hi = shift_l<8>(a16_hi);
    a16 = bit_or(a16_lo, a16_hi);
    return (U8) a16;
}
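
// Per-element arithmetic of the function above, assuming count in [0, 7]:
// the lookup yields m = 0x80 >> count, so
//
//   mul_hi(a << 8, m << 1) == (a * 0x100 * (0x100 >> count)) >> 16
//                          == a >> count
//
// while for count >= 8 the lookup yields m = 0 and the result is 0.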

static SIMDPP_INL
uint8<16> i_shift_r_v(const uint8<16>& a, const uint8<16>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_r_v(a, count);
#elif SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return v_emul_shift_r_u8_using_v16(a, count);
#elif SIMDPP_USE_SSSE3
    return v_emul_shift_r_u8_using_mul(a, count);
#elif SIMDPP_USE_NEON
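    // NEON has no right shift by vector; vshlq shifts left by a signed
    // per-lane count, so shifting by the negated count shifts right.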
    int8<16> qcount = neg((int8<16>)count);
    return vshlq_u8(a.native(), qcount.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_sr(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_srl_b((v16i8)a.native(), (v16i8)count.native());
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, count);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint8<32> i_shift_r_v(const uint8<32>& a, const uint8<32>& count)
{
#if SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return v_emul_shift_r_u8_using_v16(a, count);
#else
    return v_emul_shift_r_u8_using_mul(a, count);
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
uint8<64> i_shift_r_v(const uint8<64>& a, const uint8<64>& count)
{
    return v_emul_shift_r_u8_using_v16(a, count);
}
#endif

// -----------------------------------------------------------------------------

// emulates 8-bit variable shift using 16-bit variable shift
template<class I8, class U8> SIMDPP_INL
I8 v_emul_shift_r_i8_using_v16(const I8& a, const U8& count)
{
    using I16 = typename same_width<I8>::i16;
    using U16 = typename same_width<I8>::u16;

    U16 a16; a16 = a;
    U16 c16; c16 = count;

    U16 select_mask = make_uint(0x00ff);
    U16 a_lo = shift_l<8>(a16);
    U16 a_hi = a16;
    U16 c_lo = bit_and(c16, select_mask);
    U16 c_hi = shift_r<8>(c16);
    a_lo = shift_r((I16)a_lo, c_lo);
    a_hi = shift_r((I16)a_hi, c_hi);

    a_lo = shift_r<8>(a_lo);
    a_hi = bit_andnot(a_hi, select_mask);
    a16 = bit_or(a_lo, a_hi);

    return (I8) a16;
}
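
// A scalar sketch of the signed variant above (illustrative only): the low
// element is raised into the high byte so the 16-bit arithmetic shift sees
// its sign bit, then brought back down with a logical shift:
//
//   uint16_t lo = (uint16_t)((int16_t)(a16 << 8) >> (c16 & 0x00ff)) >> 8;
//   uint16_t hi = (uint16_t)((int16_t)a16 >> (c16 >> 8)) & 0xff00;
//   uint16_t r  = lo | hi;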

template<class I8, class U8> SIMDPP_INL
I8 v_emul_shift_r_i8_using_mul(const I8& a, const U8& count)
{
    using U16 = typename same_width<U8>::u16;
    using I16 = typename same_width<U8>::i16;

    // Variable shift is implemented by reusing the shifter of the 16-bit
    // signed multiplier. The result is obtained by computing
    // 1 << (8-countN) for each element of a, multiplying each element by
    // that number and selecting the high half of the result.
    U8 mulshift_mask = make_uint(0x80, 0x40, 0x20, 0x10,
                                 0x08, 0x04, 0x02, 0x01,
                                 0x00, 0x00, 0x00, 0x00,
                                 0x00, 0x00, 0x00, 0x00);
    U16 mulshift = (U16) permute_bytes16(mulshift_mask, count);
    U16 a16; a16 = a;
    U16 a16_lo, a16_hi, mulshift_lo, mulshift_hi;
    U16 select_mask = make_uint(0x00ff);

    // Move the element values to the high byte of the 16-bit elements and
    // the shift values to the low 9 bits. The 9-th bit is needed because
    // shifting by 0 requires multiplying the element values by 0x100.
    // Note that the results may have a nonzero high byte because this is a
    // signed multiplication.
    a16_lo = shift_l<8>(a16);
    mulshift_lo = bit_and(mulshift, select_mask);
    mulshift_lo = shift_l<1>(mulshift_lo);
    a16_hi = bit_andnot(a16, select_mask);
    mulshift_hi = shift_l<1>(shift_r<8>(mulshift));

    a16_lo = mul_hi((I16)a16_lo, (I16)mulshift_lo);
    a16_hi = mul_hi((I16)a16_hi, (I16)mulshift_hi);

    a16_hi = shift_l<8>(a16_hi);
    a16_lo = bit_and(a16_lo, select_mask);
    a16 = bit_or(a16_lo, a16_hi);
    return (I8) a16;
}
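
// The arithmetic mirrors the unsigned variant, but the signed mul_hi
// sign-extends its result, so the low-element product may carry a nonzero
// high byte; the bit_and with select_mask above strips it before the halves
// are recombined.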

static SIMDPP_INL
int8<16> i_shift_r_v(const int8<16>& a, const uint8<16>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_r_v(a, count);
#elif SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return v_emul_shift_r_i8_using_v16(a, count);
#elif SIMDPP_USE_SSSE3
    return v_emul_shift_r_i8_using_mul(a, count);
#elif SIMDPP_USE_NEON
    int8<16> qcount = neg((int8<16>)count);
    return vshlq_s8(a.native(), qcount.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_sra(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return __msa_sra_b(a.native(), (v16i8) count.native());
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, count);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int8<32> i_shift_r_v(const int8<32>& a, const uint8<32>& count)
{
#if SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return v_emul_shift_r_i8_using_v16(a, count);
#else
    return v_emul_shift_r_i8_using_mul(a, count);
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
int8<64> i_shift_r_v(const int8<64>& a, const uint8<64>& count)
{
    return v_emul_shift_r_i8_using_v16(a, count);
}
#endif

// -----------------------------------------------------------------------------

// emulates 16-bit variable shift using permute_bytes16 and 16-bit multiplication
template<class U16> SIMDPP_INL
U16 v_emul_shift_r_u16_using_mul(const U16& a, const U16& count)
{
    using U8 = typename same_width<U16>::u8;
    using M16 = typename U16::mask_vector_type;
    // Variable shift is implemented by reusing the shifter of the 16-bit
    // unsigned multiplier. The result is obtained by computing
    // 1 << (16-countN) for each element of a, multiplying each element by
    // that number and selecting the high half of the result. Note that the
    // highest shift available when using 16-bit multiplication is 15, which
    // needs to be worked around by extra instructions.
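    // Two fixups are applied after the multiplication: count == 0 would
    // need the unrepresentable multiplier 1 << 16 (the lookup below yields
    // 1 instead, hence is_same), and count > 15 must yield zero (hence
    // is_zero).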
    M16 is_same = cmp_eq(count, 0);
    M16 is_zero = cmp_gt(count, 15);

    U8 mulshift_mask = make_uint(0x00, 0x80, 0x40, 0x20,
                                 0x10, 0x08, 0x04, 0x02,
                                 0x01, 0x00, 0x00, 0x00,
                                 0x00, 0x00, 0x00, 0x00);

    // permute_bytes16 permutes 8-bit elements, whereas a 16-bit permutation
    // would be optimal in this case, so the selector must be constructed
    // specially for the 8-bit permutation.
    // The count is replicated into both selector bytes and the 4-th bit of
    // the low byte is toggled, so that the byte which must not contribute
    // to the 16-bit multiplier takes zeros from the mulshift mask.
    U16 qcount = bit_or(count, shift_l<8>(count));
    qcount = bit_xor(qcount, 0x0008);

    U16 mulshift = (U16) permute_bytes16(mulshift_mask, (U8) qcount);
    U16 res = mul_hi(a, mulshift);
    res = blend(a, res, is_same);
    res = bit_andnot(res, is_zero);
    return res;
}
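
// A worked example for the function above (illustrative only): for count 3
// the selectors pick mulshift == 0x2000 == 1 << 13, and
//
//   mul_hi(a, 1 << 13) == (a << 13) >> 16 == a >> 3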

static SIMDPP_INL
uint16<8> i_shift_r_v(const uint16<8>& a, const uint16<8>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_r_v(a, count);
#elif SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return _mm_srlv_epi16(a.native(), count.native());
#elif SIMDPP_USE_SSSE3
    return v_emul_shift_r_u16_using_mul(a, count);
#elif SIMDPP_USE_NEON
    int16<8> qcount = neg((int16<8>)count);
    return vshlq_u16(a.native(), qcount.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_sr(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_srl_h((v8i16)a.native(), (v8i16)count.native());
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, count);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint16<16> i_shift_r_v(const uint16<16>& a, const uint16<16>& count)
{
#if SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return _mm256_srlv_epi16(a.native(), count.native());
#else
    return v_emul_shift_r_u16_using_mul(a, count);
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint16<32> i_shift_r_v(const uint16<32>& a, const uint16<32>& count)
{
    return _mm512_srlv_epi16(a.native(), count.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
int16<8> i_shift_r_v(const int16<8>& a, const uint16<8>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_r_v(a, count);
#elif SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return _mm_srav_epi16(a.native(), count.native());
#elif SIMDPP_USE_AVX512BW
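    // Without AVX512VL only the 512-bit form of the instruction exists;
    // widen the operands to 512 bits, shift, and take the low 128 bits.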
    __m512i a512 = _mm512_castsi128_si512(a.native());
    __m512i count512 = _mm512_castsi128_si512(count.native());
    return _mm512_castsi512_si128(_mm512_srav_epi16(a512, count512));
#elif SIMDPP_USE_NEON
    int16<8> qcount = neg((int16<8>)count);
    return vshlq_s16(a.native(), qcount.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_sra(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return __msa_sra_h(a.native(), (v8i16) count.native());
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, count);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int16<16> i_shift_r_v(const int16<16>& a, const uint16<16>& count)
{
#if SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return _mm256_srav_epi16(a.native(), count.native());
#elif SIMDPP_USE_AVX512BW
    __m512i a512 = _mm512_castsi256_si512(a.native());
    __m512i count512 = _mm512_castsi256_si512(count.native());
    return _mm512_castsi512_si256(_mm512_srav_epi16(a512, count512));
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, count);
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL int16<32> i_shift_r_v(const int16<32>& a, const uint16<32>& count)
{
    return _mm512_srav_epi16(a.native(), count.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint32<4> i_shift_r_v(const uint32<4>& a, const uint32<4>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_r_v(a, count);
#elif SIMDPP_USE_AVX2
    return _mm_srlv_epi32(a.native(), count.native());
#elif SIMDPP_USE_SSE2
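    // SSE2..SSE4.1 have no per-lane variable shift. _mm_srl_epi32 shifts
    // all lanes by the count held in the low 64 bits of its second operand,
    // so four count vectors are built, each holding one lane's count there;
    // four full-vector shifts are done and lane i is blended from shift i.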
    uint32<4> count0 = count;
#if SIMDPP_USE_SSE4_1
    uint32<4> zero = make_zero();
    count0 = _mm_blend_epi16(count0.native(), zero.native(), 0xcc);
#else
    uint32<4> mask = make_uint(0xffffffff, 0, 0xffffffff, 0);
    count0 = bit_and(count0, mask);
#endif
    uint32<4> count1 = _mm_srli_epi64(count.native(), 32);
    uint32<4> count2 = _mm_srli_si128(count0.native(), 8);
    uint32<4> count3 = _mm_srli_si128(count.native(), 12);

    __m128i a0 = _mm_srl_epi32(a.native(), count0.native());
    __m128i a1 = _mm_srl_epi32(a.native(), count1.native());
    __m128i a2 = _mm_srl_epi32(a.native(), count2.native());
    __m128i a3 = _mm_srl_epi32(a.native(), count3.native());
#if SIMDPP_USE_SSE4_1
    a0 = _mm_blend_epi16(a0, a1, 0x0c);
    a2 = _mm_blend_epi16(a2, a3, 0xc0);
    a0 = _mm_blend_epi16(a0, a2, 0xf0);
#else
    __m128 f0 = _mm_shuffle_ps(_mm_castsi128_ps(a0),
                               _mm_castsi128_ps(a1),
                               SIMDPP_SHUFFLE_MASK_4x4(0, 0, 1, 1));
    __m128 f1 = _mm_shuffle_ps(_mm_castsi128_ps(a2),
                               _mm_castsi128_ps(a3),
                               SIMDPP_SHUFFLE_MASK_4x4(2, 2, 3, 3));
    f0 = _mm_shuffle_ps(f0, f1, SIMDPP_SHUFFLE_MASK_4x4(0, 2, 0, 2));
    a0 = _mm_castps_si128(f0);
#endif
    return a0;
#elif SIMDPP_USE_NEON
    int32<4> qcount = neg((int32<4>)count);
    return vshlq_u32(a.native(), qcount.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_sr(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_srl_w((v4i32)a.native(), (v4i32)count.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32<8> i_shift_r_v(const uint32<8>& a, const uint32<8>& count)
{
    return _mm256_srlv_epi32(a.native(), count.native());
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint32<16> i_shift_r_v(const uint32<16>& a, const uint32<16>& count)
{
    return _mm512_srlv_epi32(a.native(), count.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
int32<4> i_shift_r_v(const int32<4>& a, const uint32<4>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_r_v(a, count);
#elif SIMDPP_USE_AVX2
    return _mm_srav_epi32(a.native(), count.native());
#elif SIMDPP_USE_SSE2
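    // Same lane-by-lane decomposition as in the unsigned 32-bit overload
    // above, but using arithmetic shifts (_mm_sra_epi32).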
    uint32<4> count0 = count;
#if SIMDPP_USE_SSE4_1
    uint32<4> zero = make_zero();
    count0 = _mm_blend_epi16(count0.native(), zero.native(), 0xcc);
#else
    uint32<4> mask = make_uint(0xffffffff, 0, 0xffffffff, 0);
    count0 = bit_and(count0, mask);
#endif
    uint32<4> count1 = _mm_srli_epi64(count.native(), 32);
    uint32<4> count2 = _mm_srli_si128(count0.native(), 8);
    uint32<4> count3 = _mm_srli_si128(count.native(), 12);

    __m128i a0 = _mm_sra_epi32(a.native(), count0.native());
    __m128i a1 = _mm_sra_epi32(a.native(), count1.native());
    __m128i a2 = _mm_sra_epi32(a.native(), count2.native());
    __m128i a3 = _mm_sra_epi32(a.native(), count3.native());
#if SIMDPP_USE_SSE4_1
    a0 = _mm_blend_epi16(a0, a1, 0x0c);
    a2 = _mm_blend_epi16(a2, a3, 0xc0);
    a0 = _mm_blend_epi16(a0, a2, 0xf0);
#else
    __m128 f0 = _mm_shuffle_ps(_mm_castsi128_ps(a0),
                               _mm_castsi128_ps(a1),
                               SIMDPP_SHUFFLE_MASK_4x4(0, 0, 1, 1));
    __m128 f1 = _mm_shuffle_ps(_mm_castsi128_ps(a2),
                               _mm_castsi128_ps(a3),
                               SIMDPP_SHUFFLE_MASK_4x4(2, 2, 3, 3));
    f0 = _mm_shuffle_ps(f0, f1, SIMDPP_SHUFFLE_MASK_4x4(0, 2, 0, 2));
    a0 = _mm_castps_si128(f0);
#endif
    return a0;
#elif SIMDPP_USE_NEON
    int32<4> qcount = neg((int32<4>)count);
    return vshlq_s32(a.native(), qcount.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_sra(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return __msa_sra_w(a.native(), (v4i32)count.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int32<8> i_shift_r_v(const int32<8>& a, const uint32<8>& count)
{
    return _mm256_srav_epi32(a.native(), count.native());
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
int32<16> i_shift_r_v(const int32<16>& a, const uint32<16>& count)
{
    return _mm512_srav_epi32(a.native(), count.native());
}
#endif

// -----------------------------------------------------------------------------

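// Generic fallback: a vector wider than the native width is processed as an
// array of native-width sub-vectors, each dispatched to one of the overloads
// above.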
template<class V, class U> SIMDPP_INL
V i_shift_r_v(const V& a, const U& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(V, i_shift_r_v, a, b);
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif