/*  Copyright (C) 2017  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_CORE_I_SHIFT_L_V_H
#define LIBSIMDPP_SIMDPP_CORE_I_SHIFT_L_V_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/null/math.h>
#include <simdpp/detail/insn/i_shift.h>
#include <simdpp/detail/shuffle/shuffle_mask.h>
#include <simdpp/core/i_mul.h>
#include <simdpp/core/permute_bytes16.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

// emulates 8-bit variable shift using 16-bit variable shift
template<class U8> SIMDPP_INL
U8 v_emul_shift_l_v8_using_v16(const U8& a, const U8& count)
{
    using U16 = typename same_width<U8>::u16;

    U16 a16; a16 = a;
    U16 c16; c16 = count;

    // Process the low and high 8-bit halves of each 16-bit element
    // separately. The 16-bit variable shift takes its count from the whole
    // 16-bit element, so the relevant count is moved into the low byte.
    U16 select_mask = make_uint(0xff00);
    U16 a_lo = a16;
    U16 a_hi = bit_and(a16, select_mask);
    U16 c_lo = bit_andnot(c16, select_mask);
    U16 c_hi = shift_r<8>(c16);
    a_lo = shift_l(a_lo, c_lo);
    a_hi = shift_l(a_hi, c_hi);
    // clear the bits that the low half shifted into the high byte
    a_lo = bit_andnot(a_lo, select_mask);

    a16 = bit_or(a_lo, a_hi);
    return (U8) a16;
}
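
// Worked example of the emulation above for one 16-bit element (illustrative
// values): a16 = 0x8121, c16 = 0x0302, i.e. 0x81 << 3 and 0x21 << 2.
//   a_lo: 0x8121 << 2 = 0x0484; clearing the high byte leaves 0x0084
//   a_hi: 0x8100 << 3 = 0x0800
//   result: 0x0800 | 0x0084 = 0x0884, which packs 0x08 and 0x84 as expected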

// emulates 8-bit variable shift using permute_bytes16 and 16-bit multiplication
template<class U8> SIMDPP_INL
U8 v_emul_shift_l_v8_using_mul(const U8& a, const U8& count)
{
    using U16 = typename same_width<U8>::u16;

    // Variable shift is implemented by obtaining 1 << countN for each element
    // from a and then multiplying each element by that number. The
    // implementation is complicated by the fact that only 16-bit
    // multiplication is available. make_uint repeats the given 8-byte pattern
    // across the vector, so permute_bytes16 looks up 1 << countN for each
    // byte of count.
    U8 mulshift_mask = make_uint(0x01, 0x02, 0x04, 0x08,
                                 0x10, 0x20, 0x40, 0x80);
    U16 mulshift = (U16) permute_bytes16(mulshift_mask, count);

    U16 a16; a16 = a;
    U16 a16_lo, a16_hi, mulshift_lo, mulshift_hi;
    U16 select_mask = make_uint(0x00ff);

    // Move the element values to the high byte of the 16-bit elements and the
    // shift values to the low byte. The results will have the low byte clear,
    // which will help composing the result back into a single vector.
    a16_lo = shift_l<8>(a16);
    mulshift_lo = bit_and(mulshift, select_mask);
    a16_hi = bit_andnot(a16, select_mask);
    mulshift_hi = shift_r<8>(mulshift);

    a16_lo = mul_lo(a16_lo, mulshift_lo);
    a16_hi = mul_lo(a16_hi, mulshift_hi);

    a16_lo = shift_r<8>(a16_lo);
    a16 = bit_or(a16_lo, a16_hi);
    return (U8) a16;
}
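
// Worked example of the multiplication trick above for one byte (illustrative
// values): a = 0x21, count = 2, so mulshift = 1 << 2 = 0x04.
//   low path: (0x21 << 8) * 0x04 = 0x8400 (mod 2^16); >> 8 yields 0x84
// Bits shifted beyond bit 7 of the element leave the 16-bit product entirely,
// so they are discarded exactly as an 8-bit shift requires.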

static SIMDPP_INL
uint8<16> i_shift_l_v(const uint8<16>& a, const uint8<16>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_l_v(a, count);
#elif SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return v_emul_shift_l_v8_using_v16(a, count);
#elif SIMDPP_USE_SSSE3
    return v_emul_shift_l_v8_using_mul(a, count);
#elif SIMDPP_USE_NEON
    return vshlq_u8(a.native(), vreinterpretq_s8_u8(count.native()));
#elif SIMDPP_USE_ALTIVEC
    return vec_sl(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_sll_b((v16i8)a.native(), (v16i8)count.native());
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, count);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint8<32> i_shift_l_v(const uint8<32>& a, const uint8<32>& count)
{
#if SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return v_emul_shift_l_v8_using_v16(a, count);
#else
    return v_emul_shift_l_v8_using_mul(a, count);
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
uint8<64> i_shift_l_v(const uint8<64>& a, const uint8<64>& count)
{
    return v_emul_shift_l_v8_using_v16(a, count);
}
#endif

// -----------------------------------------------------------------------------

// emulates 16-bit variable shift using permute_bytes16 and 16-bit multiplication
template<class U16> SIMDPP_INL
U16 v_emul_shift_l_v16_using_mul(const U16& a, const U16& count)
{
    using U8 = typename same_width<U16>::u8;

    // Variable shift is implemented by obtaining 1 << countN for each element
    // from a and then multiplying each element by that number. The
    // implementation is complicated by the fact that permute_bytes16 permutes
    // 8-bit elements instead of 16-bit ones, which would be optimal in this
    // case.
    U8 mulshift_mask = make_uint(0x01, 0x02, 0x04, 0x08,
                                 0x10, 0x20, 0x40, 0x80,
                                 0x00, 0x00, 0x00, 0x00,
                                 0x00, 0x00, 0x00, 0x00);
    // duplicate the shift count into both bytes of each 16-bit element
    U16 qcount = bit_or(count, shift_l<8>(count));

    // Toggle the 4-th bit of the high copy of each count so that the byte of
    // mulshift that must be zero selects the zero half of mulshift_mask: the
    // high result byte is zero for counts below 8, and the low result byte is
    // zero for counts of 8 or more.
    qcount = bit_xor(qcount, 0x0800);
    U16 mulshift = (U16) permute_bytes16(mulshift_mask, (U8) qcount);
    return mul_lo(a, mulshift);
}
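
// Worked example of the count handling above (illustrative values):
//   count = 3:  qcount = 0x0303 ^ 0x0800 = 0x0b03. Byte index 0x03 selects
//               0x08, index 0x0b selects 0x00, so mulshift = 0x0008 = 1 << 3.
//   count = 10: qcount = 0x0a0a ^ 0x0800 = 0x020a. Byte index 0x0a selects
//               0x00, index 0x02 selects 0x04, so mulshift = 0x0400 = 1 << 10.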

static SIMDPP_INL
uint16<8> i_shift_l_v(const uint16<8>& a, const uint16<8>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_l_v(a, count);
#elif SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return _mm_sllv_epi16(a.native(), count.native());
#elif SIMDPP_USE_SSSE3
    return v_emul_shift_l_v16_using_mul(a, count);
#elif SIMDPP_USE_NEON
    return vshlq_u16(a.native(), vreinterpretq_s16_u16(count.native()));
#elif SIMDPP_USE_ALTIVEC
    return vec_sl(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_sll_h((v8i16)a.native(), (v8i16)count.native());
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, count);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint16<16> i_shift_l_v(const uint16<16>& a, const uint16<16>& count)
{
#if SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return _mm256_sllv_epi16(a.native(), count.native());
#else
    return v_emul_shift_l_v16_using_mul(a, count);
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint16<32> i_shift_l_v(const uint16<32>& a, const uint16<32>& count)
{
    return _mm512_sllv_epi16(a.native(), count.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint32<4> i_shift_l_v(const uint32<4>& a, const uint32<4>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_l_v(a, count);
#elif SIMDPP_USE_AVX2
    return _mm_sllv_epi32(a.native(), count.native());
#elif SIMDPP_USE_SSE2
    // SSE2 has no per-element variable shift. _mm_sll_epi32 shifts all
    // elements by the single count stored in the low 64 bits of its second
    // operand, so isolate each element's count into a separate vector.
    // Zeroing the counts of elements 1 and 3 leaves only element 0's count
    // in the low 64 bits of count0; count2 reuses it for element 2.
    uint32<4> count0 = count;
#if SIMDPP_USE_SSE4_1
    uint32<4> zero = make_zero();
    count0 = _mm_blend_epi16(count0.native(), zero.native(), 0xcc);
#else
    uint32<4> mask = make_uint(0xffffffff, 0, 0xffffffff, 0);
    count0 = bit_and(count0, mask);
#endif
    uint32<4> count1 = _mm_srli_epi64(count.native(), 32);
    uint32<4> count2 = _mm_srli_si128(count0.native(), 8);
    uint32<4> count3 = _mm_srli_si128(count.native(), 12);

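    // count0..count3 now hold the shift counts of elements 0..3 in their low
    // 64 bits. Each _mm_sll_epi32 below shifts all four lanes by one of these
    // counts; the blend/shuffle step afterwards keeps lane N of the vector
    // that was shifted by element N's count.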
    __m128i a0 = _mm_sll_epi32(a.native(), count0.native());
    __m128i a1 = _mm_sll_epi32(a.native(), count1.native());
    __m128i a2 = _mm_sll_epi32(a.native(), count2.native());
    __m128i a3 = _mm_sll_epi32(a.native(), count3.native());
#if SIMDPP_USE_SSE4_1
    a0 = _mm_blend_epi16(a0, a1, 0x0c);
    a2 = _mm_blend_epi16(a2, a3, 0xc0);
    a0 = _mm_blend_epi16(a0, a2, 0xf0);
#else
    __m128 f0 = _mm_shuffle_ps(_mm_castsi128_ps(a0),
                               _mm_castsi128_ps(a1),
                               SIMDPP_SHUFFLE_MASK_4x4(0, 0, 1, 1));
    __m128 f1 = _mm_shuffle_ps(_mm_castsi128_ps(a2),
                               _mm_castsi128_ps(a3),
                               SIMDPP_SHUFFLE_MASK_4x4(2, 2, 3, 3));
    f0 = _mm_shuffle_ps(f0, f1, SIMDPP_SHUFFLE_MASK_4x4(0, 2, 0, 2));
    a0 = _mm_castps_si128(f0);
#endif
    return a0;
#elif SIMDPP_USE_NEON
    return vshlq_u32(a.native(), vreinterpretq_s32_u32(count.native()));
#elif SIMDPP_USE_ALTIVEC
    return vec_sl(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_sll_w((v4i32)a.native(), (v4i32)count.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32<8> i_shift_l_v(const uint32<8>& a, const uint32<8>& count)
{
    return _mm256_sllv_epi32(a.native(), count.native());
}
#endif

#if SIMDPP_USE_AVX512F
SIMDPP_INL uint32<16> i_shift_l_v(const uint32<16>& a, const uint32<16>& count)
{
    return _mm512_sllv_epi32(a.native(), count.native());
}
#endif

// -----------------------------------------------------------------------------

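// Splits wider-than-native vectors into their native-width parts and applies
// i_shift_l_v to each part (SIMDPP_VEC_ARRAY_IMPL2 expands to that loop).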
template<class V, class U> SIMDPP_INL
V i_shift_l_v(const V& a, const U& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(V, i_shift_l_v, a, b);
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif