/*  Copyright (C) 2017  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_CORE_I_SHIFT_L_V_H
#define LIBSIMDPP_SIMDPP_CORE_I_SHIFT_L_V_H

#ifndef LIBSIMDPP_SIMD_H
#error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/null/math.h>
#include <simdpp/detail/insn/i_shift.h>
#include <simdpp/detail/shuffle/shuffle_mask.h>
#include <simdpp/core/i_mul.h>
#include <simdpp/core/permute_bytes16.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

// emulates 8-bit variable shift using 16-bit variable shift
template<class U8> SIMDPP_INL
U8 v_emul_shift_l_v8_using_v16(const U8& a, const U8& count)
{
    using U16 = typename same_width<U8>::u16;

    U16 a16; a16 = a;
    U16 c16; c16 = count;

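    // The 16-bit lanes are processed as two interleaved sets of 8-bit
    // elements. The low bytes are shifted in place and any bits that cross
    // into the neighboring high byte are cleared afterwards; the high bytes
    // are isolated first and shifted by the count taken from the high byte,
    // with excess bits simply falling off the top of the 16-bit lane.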
    U16 select_mask = make_uint(0xff00);
    U16 a_lo = a16;
    U16 a_hi = bit_and(a16, select_mask);
    U16 c_lo = bit_andnot(c16, select_mask);
    U16 c_hi = shift_r<8>(c16);
    a_lo = shift_l(a_lo, c_lo);
    a_hi = shift_l(a_hi, c_hi);
    a_lo = bit_andnot(a_lo, select_mask);

    a16 = bit_or(a_lo, a_hi);
    return (U8) a16;
}

// emulates 8-bit variable shift using permute_bytes16 and 16-bit multiplication
template<class U8> SIMDPP_INL
U8 v_emul_shift_l_v8_using_mul(const U8& a, const U8& count)
{
    using U16 = typename same_width<U8>::u16;

    // Variable shift is implemented by obtaining 1 << countN for each element
    // from a and then multiplying each element by that number. The
    // implementation is complicated by the fact that only 16-bit
    // multiplication is available.
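    // For example, an element whose count is 3 selects 0x08 from
    // mulshift_mask below, and multiplying by 8 is equivalent to shifting
    // left by 3.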
    U8 mulshift_mask = make_uint(0x01, 0x02, 0x04, 0x08,
                                 0x10, 0x20, 0x40, 0x80);
    U16 mulshift = (U16) permute_bytes16(mulshift_mask, count);

    U16 a16; a16 = a;
    U16 a16_lo, a16_hi, mulshift_lo, mulshift_hi;
    U16 select_mask = make_uint(0x00ff);

    // Move the element values to the high byte of the 16-bit elements and the
    // shift values to the low byte. The results will have the low byte clear,
    // which helps when composing the result back into a single vector.
    a16_lo = shift_l<8>(a16);
    mulshift_lo = bit_and(mulshift, select_mask);
    a16_hi = bit_andnot(a16, select_mask);
    mulshift_hi = shift_r<8>(mulshift);

    a16_lo = mul_lo(a16_lo, mulshift_lo);
    a16_hi = mul_lo(a16_hi, mulshift_hi);

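    // Both products end up in the high byte of each 16-bit lane with the low
    // byte clear: move the products of the low elements back down and merge
    // the two halves with a plain OR.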
    a16_lo = shift_r<8>(a16_lo);
    a16 = bit_or(a16_lo, a16_hi);
    return (U8) a16;
}

static SIMDPP_INL
uint8<16> i_shift_l_v(const uint8<16>& a, const uint8<16>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_l_v(a, count);
#elif SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
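    // With AVX512BW+VL, 16-bit variable shifts (_mm_sllv_epi16) are a single
    // native instruction, so the 16-bit emulation is used.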
    return v_emul_shift_l_v8_using_v16(a, count);
#elif SIMDPP_USE_SSSE3
    return v_emul_shift_l_v8_using_mul(a, count);
#elif SIMDPP_USE_NEON
    return vshlq_u8(a.native(), vreinterpretq_s8_u8(count.native()));
#elif SIMDPP_USE_ALTIVEC
    return vec_sl(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_sll_b((v16i8)a.native(), (v16i8)count.native());
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, count);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint8<32> i_shift_l_v(const uint8<32>& a, const uint8<32>& count)
{
#if SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return v_emul_shift_l_v8_using_v16(a, count);
#else
    return v_emul_shift_l_v8_using_mul(a, count);
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
uint8<64> i_shift_l_v(const uint8<64>& a, const uint8<64>& count)
{
    return v_emul_shift_l_v8_using_v16(a, count);
}
#endif

// -----------------------------------------------------------------------------

// emulates 16-bit variable shift using permute_bytes16 and 16-bit multiplication
template<class U16> SIMDPP_INL
U16 v_emul_shift_l_v16_using_mul(const U16& a, const U16& count)
{
    using U8 = typename same_width<U16>::u8;

    // Variable shift is implemented by obtaining 1 << countN for each element
    // from a and then multiplying each element by that number. The
    // implementation is complicated by the fact that permute_bytes16 permutes
    // 8-bit elements rather than the 16-bit elements that would be optimal
    // in this case.
    U8 mulshift_mask = make_uint(0x01, 0x02, 0x04, 0x08,
                                 0x10, 0x20, 0x40, 0x80,
                                 0x00, 0x00, 0x00, 0x00,
                                 0x00, 0x00, 0x00, 0x00);
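    // Duplicate the shift count into both bytes of each 16-bit lane so that
    // the low and the high byte of mulshift can be looked up with a single
    // permute.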
    U16 qcount = bit_or(count, shift_l<8>(count));

    // Toggle bit 3 of the count in the high byte so that the high byte
    // selects zeros from the mulshift mask when the shift count is less
    // than 8, and selects 1 << (count - 8) otherwise.
    qcount = bit_xor(qcount, 0x0800);
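    // For example, count == 3 gives qcount == 0x0B03: the low byte selects
    // 0x08 and the high byte selects 0x00, i.e. mulshift == 0x0008 == 1 << 3.
    // count == 10 gives qcount == 0x020A: the low byte selects 0x00 and the
    // high byte selects 0x04, i.e. mulshift == 0x0400 == 1 << 10.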
    U16 mulshift = (U16) permute_bytes16(mulshift_mask, (U8) qcount);
    return mul_lo(a, mulshift);
}

static SIMDPP_INL
uint16<8> i_shift_l_v(const uint16<8>& a, const uint16<8>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_l_v(a, count);
#elif SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return _mm_sllv_epi16(a.native(), count.native());
#elif SIMDPP_USE_SSSE3
    return v_emul_shift_l_v16_using_mul(a, count);
#elif SIMDPP_USE_NEON
    return vshlq_u16(a.native(), vreinterpretq_s16_u16(count.native()));
#elif SIMDPP_USE_ALTIVEC
    return vec_sl(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_sll_h((v8i16)a.native(), (v8i16)count.native());
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, count);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint16<16> i_shift_l_v(const uint16<16>& a, const uint16<16>& count)
{
#if SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return _mm256_sllv_epi16(a.native(), count.native());
#else
    return v_emul_shift_l_v16_using_mul(a, count);
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint16<32> i_shift_l_v(const uint16<32>& a, const uint16<32>& count)
{
    return _mm512_sllv_epi16(a.native(), count.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint32<4> i_shift_l_v(const uint32<4>& a, const uint32<4>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_l_v(a, count);
#elif SIMDPP_USE_AVX2
    return _mm_sllv_epi32(a.native(), count.native());
#elif SIMDPP_USE_SSE2
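    // SSE2-SSE4.1 have no per-element 32-bit shift. _mm_sll_epi32 shifts all
    // elements by the count held in the low 64 bits of its second operand,
    // so a is shifted four times, once with each element's count moved into
    // that position, and the per-element results are blended together.
    // Element 1 is cleared in count0 because the count is read as a 64-bit
    // value.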
    uint32<4> count0 = count;
#if SIMDPP_USE_SSE4_1
    uint32<4> zero = make_zero();
    count0 = _mm_blend_epi16(count0.native(), zero.native(), 0xcc);
#else
    uint32<4> mask = make_uint(0xffffffff, 0, 0xffffffff, 0);
    count0 = bit_and(count0, mask);
#endif
    uint32<4> count1 = _mm_srli_epi64(count.native(), 32);
    uint32<4> count2 = _mm_srli_si128(count0.native(), 8);
    uint32<4> count3 = _mm_srli_si128(count.native(), 12);

    __m128i a0 = _mm_sll_epi32(a.native(), count0.native());
    __m128i a1 = _mm_sll_epi32(a.native(), count1.native());
    __m128i a2 = _mm_sll_epi32(a.native(), count2.native());
    __m128i a3 = _mm_sll_epi32(a.native(), count3.native());
#if SIMDPP_USE_SSE4_1
    a0 = _mm_blend_epi16(a0, a1, 0x0c);
    a2 = _mm_blend_epi16(a2, a3, 0xc0);
    a0 = _mm_blend_epi16(a0, a2, 0xf0);
#else
    __m128 f0 = _mm_shuffle_ps(_mm_castsi128_ps(a0),
                               _mm_castsi128_ps(a1),
                               SIMDPP_SHUFFLE_MASK_4x4(0, 0, 1, 1));
    __m128 f1 = _mm_shuffle_ps(_mm_castsi128_ps(a2),
                               _mm_castsi128_ps(a3),
                               SIMDPP_SHUFFLE_MASK_4x4(2, 2, 3, 3));
    f0 = _mm_shuffle_ps(f0, f1, SIMDPP_SHUFFLE_MASK_4x4(0, 2, 0, 2));
    a0 = _mm_castps_si128(f0);
#endif
    return a0;
#elif SIMDPP_USE_NEON
    return vshlq_u32(a.native(), vreinterpretq_s32_u32(count.native()));
#elif SIMDPP_USE_ALTIVEC
    return vec_sl(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_sll_w((v4i32)a.native(), (v4i32)count.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32<8> i_shift_l_v(const uint32<8>& a, const uint32<8>& count)
{
    return _mm256_sllv_epi32(a.native(), count.native());
}
#endif

#if SIMDPP_USE_AVX512F
SIMDPP_INL uint32<16> i_shift_l_v(const uint32<16>& a, const uint32<16>& count)
{
    return _mm512_sllv_epi32(a.native(), count.native());
}
#endif

// -----------------------------------------------------------------------------

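// Non-native vector widths are handled by applying i_shift_l_v to each
// native-width sub-vector of the operands.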
template<class V, class U> SIMDPP_INL
V i_shift_l_v(const V& a, const U& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(V, i_shift_l_v, a, b);
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif