/*  Copyright (C) 2011-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_SHIFT_L_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_SHIFT_L_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/not_implemented.h>
#include <simdpp/core/bit_and.h>
#include <simdpp/core/bit_andnot.h>
#include <simdpp/core/set_splat.h>
#include <simdpp/detail/insn/i_shift.h>
#include <simdpp/detail/null/math.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

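// Dynamic (run-time count) left shifts. x86 SIMD has no 8-bit shift
// instructions, so the uint8 variants emulate the shift with a 16-bit shift:
// the vector is shifted as uint16 lanes and the bits that leak from each low
// byte into the adjacent high byte are cleared with a mask. For example, for
// count == 3 the AVX2 path computes mask1 = (0x00ff >> 5) << 8 == 0x0700 and
// bit_andnot() clears exactly those three leaked bits.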
static SIMDPP_INL
uint8x16 i_shift_l(const uint8x16& a, unsigned count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_l(a, count);
#elif SIMDPP_USE_AVX2
    uint16x8 mask, a16;
    uint16_t mask1 = (0x00ff >> (8-count)) << 8;

    a16 = a;
    mask = splat(mask1);
    a16 = shift_l(a16, count);
    a16 = bit_andnot(a16, mask);
    return uint8x16(a16);
#elif SIMDPP_USE_SSE2
    uint16x8 mask, a16;
    mask = make_ones();
    mask = shift_r(mask, 16-count);
    mask = shift_l(mask, 8);

    a16 = a;
    a16 = shift_l(a16, count);
    a16 = bit_andnot(a16, mask);
    return uint8x16(a16);
#elif SIMDPP_USE_NEON
    int8x16 shift = splat(count);
    return vshlq_u8(a.native(), shift.native());
#elif SIMDPP_USE_ALTIVEC
    uint8x16 shift = splat(count);
    return vec_sl(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    int8x16 shift = splat(count);
    return (v16u8) __msa_sll_b((v16i8) a.native(), shift.native());
#endif
}

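// The 256- and 512-bit uint8 overloads repeat the same 16-bit shift plus
// carry-bit masking, just on wider registers.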
#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint8x32 i_shift_l(const uint8x32& a, unsigned count)
{
    uint16x16 mask, a16;
    uint16_t mask1 = (0x00ff >> (8-count)) << 8;

    a16 = a;
    mask = splat(mask1);
    a16 = shift_l(a16, count);
    a16 = bit_andnot(a16, mask);
    return uint8<32>(a16);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint8<64> i_shift_l(const uint8<64>& a, unsigned count)
{
    uint16<32> mask, a16;
    uint16_t mask1 = (0x00ff >> (8-count)) << 8;

    a16 = a;
    mask = splat(mask1);
    a16 = shift_l(a16, count);
    a16 = bit_andnot(a16, mask);
    return uint8<64>(a16);
}
#endif

// -----------------------------------------------------------------------------

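// 16-, 32- and 64-bit elements have native shift instructions on the
// supported ISAs (except 64-bit on NULL and pre-VSX 2.07 Altivec, handled
// below). On SSE2/AVX2/AVX-512 the run-time count is passed in the low bits
// of an XMM register via _mm_cvtsi32_si128().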
static SIMDPP_INL
uint16x8 i_shift_l(const uint16x8& a, unsigned count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_l(a, count);
#elif SIMDPP_USE_SSE2
    return _mm_sll_epi16(a.native(), _mm_cvtsi32_si128(count));
#elif SIMDPP_USE_NEON
    int16x8 shift = splat(count);
    return vshlq_u16(a.native(), shift.native());
#elif SIMDPP_USE_ALTIVEC
    uint16x8 shift = splat(count);
    return vec_sl(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    int16x8 shift = splat(count);
    return (v8u16) __msa_sll_h((v8i16) a.native(), shift.native());
#endif
}

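// SIMDPP_WORKAROUND_AVX2_SHIFT_INTRINSICS: on compilers whose 256-bit
// _mm256_sll_epi* intrinsics are unreliable, the shift instruction is
// emitted directly through inline assembly instead.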
#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint16x16 i_shift_l(const uint16x16& a, unsigned count)
{
#if SIMDPP_WORKAROUND_AVX2_SHIFT_INTRINSICS
    __m256i r = a.native();
    __m128i x = _mm_cvtsi32_si128(count);
    __asm("vpsllw %1, %2, %0" : "=x"(r) : "x"(x), "x"(r));
    return r;
#else
    return _mm256_sll_epi16(a.native(), _mm_cvtsi32_si128(count));
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint16<32> i_shift_l(const uint16<32>& a, unsigned count)
{
    return _mm512_sll_epi16(a.native(), _mm_cvtsi32_si128(count));
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint32x4 i_shift_l(const uint32x4& a, unsigned count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_l(a, count);
#elif SIMDPP_USE_SSE2
    return _mm_sll_epi32(a.native(), _mm_cvtsi32_si128(count));
#elif SIMDPP_USE_NEON
    int32x4 shift = splat(count);
    return vshlq_u32(a.native(), shift.native());
#elif SIMDPP_USE_ALTIVEC
    uint32x4 shift = splat(count);
    return vec_sl(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    int32x4 shift = splat(count);
    return (v4u32) __msa_sll_w((v4i32) a.native(), shift.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32x8 i_shift_l(const uint32x8& a, unsigned count)
{
#if SIMDPP_WORKAROUND_AVX2_SHIFT_INTRINSICS
    __m256i r = a.native();
    __m128i x = _mm_cvtsi32_si128(count);
    __asm("vpslld %1, %2, %0" : "=x"(r) : "x"(x), "x"(r));
    return r;
#else
    return _mm256_sll_epi32(a.native(), _mm_cvtsi32_si128(count));
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint32<16> i_shift_l(const uint32<16>& a, unsigned count)
{
    return _mm512_sll_epi32(a.native(), _mm_cvtsi32_si128(count));
}
#endif

// -----------------------------------------------------------------------------

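// 64-bit elements: NULL and pre-VSX 2.07 Altivec have no native 64-bit
// shift, so those configurations fall back to the scalar null implementation.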
static SIMDPP_INL
uint64x2 i_shift_l(const uint64x2& a, unsigned count)
{
#if SIMDPP_USE_SSE2
    return _mm_sll_epi64(a.native(), _mm_cvtsi32_si128(count));
#elif SIMDPP_USE_NEON
    int64x2 shift = splat(count);
    return vshlq_u64(a.native(), shift.native());
#elif SIMDPP_USE_VSX_207
    uint64x2 shift = splat(count);
    return vec_sl(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    int32x4 shift = splat(count);
    return (v2u64) __msa_sll_d((v2i64) a.native(), (v2i64) shift.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    return detail::null::shift_l(a, count);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint64x4 i_shift_l(const uint64x4& a, unsigned count)
{
#if SIMDPP_WORKAROUND_AVX2_SHIFT_INTRINSICS
    __m256i r = a.native();
    __m128i x = _mm_cvtsi32_si128(count);
    __asm("vpsllq %1, %2, %0" : "=x"(r) : "x"(x), "x"(r));
    return r;
#else
    return _mm256_sll_epi64(a.native(), _mm_cvtsi32_si128(count));
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint64<8> i_shift_l(const uint64<8>& a, unsigned count)
{
    return _mm512_sll_epi64(a.native(), _mm_cvtsi32_si128(count));
}
#endif

// -----------------------------------------------------------------------------

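// Generic overload for vectors wider than the native register width: the
// shift is applied to each native-sized sub-vector in turn via
// SIMDPP_VEC_ARRAY_IMPL2S.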
template<class V> SIMDPP_INL
V i_shift_l(const V& a, unsigned count)
{
    SIMDPP_VEC_ARRAY_IMPL2S(V, i_shift_l, a, count);
}

// -----------------------------------------------------------------------------

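// Static (compile-time count) left shifts. Taking the count as a template
// parameter lets the immediate-operand forms (_mm_slli_*, vshlq_n_*,
// __msa_slli_*) be used. sse_shift_l_8 handles 8-bit elements on x86: it
// first clears the top 'count' bits of every byte (mask1 = 0xff >> count),
// so that the subsequent 16-bit shift cannot carry bits across byte
// boundaries.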
template<unsigned count, unsigned N> SIMDPP_INL
uint8<N> sse_shift_l_8(const uint8<N>& a)
{
    uint8_t mask1 = 0xff >> count;
    uint8<N> mask = make_uint(mask1);

    uint16<N/2> a16 = (uint16<N/2>) bit_and(a, mask);
    a16 = shift_l<count>(a16);

    return uint8<N>(a16);
}

template<unsigned count> SIMDPP_INL
uint8x16 i_shift_l(const uint8x16& a)
{
    static_assert(count < 8, "Shift out of bounds");
#if SIMDPP_USE_NULL
    return i_shift_l(a, count);
#elif SIMDPP_USE_SSE2
    return sse_shift_l_8<count>(a);
#elif SIMDPP_USE_NEON
    return vshlq_n_u8(a.native(), count);
#elif SIMDPP_USE_ALTIVEC
    uint8x16 shift = make_uint(count);
    return vec_sl(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_slli_b((v16i8) a.native(), count);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned count> SIMDPP_INL
uint8<32> i_shift_l(const uint8<32>& a)
{
    static_assert(count < 8, "Shift out of bounds");
    return sse_shift_l_8<count>(a);
}
#endif

#if SIMDPP_USE_AVX512BW
template<unsigned count> SIMDPP_INL
uint8<64> i_shift_l(const uint8<64>& a)
{
    static_assert(count < 8, "Shift out of bounds");
    return sse_shift_l_8<count>(a);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned count> SIMDPP_INL
uint16x8 i_shift_l(const uint16x8& a)
{
    static_assert(count < 16, "Shift out of bounds");
#if SIMDPP_USE_NULL
    return i_shift_l(a, count);
#elif SIMDPP_USE_SSE2
    return _mm_slli_epi16(a.native(), count);
#elif SIMDPP_USE_NEON
    return vshlq_n_u16(a.native(), count);
#elif SIMDPP_USE_ALTIVEC
    uint16x8 shift = make_uint(count);
    return vec_sl(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_slli_h((v8i16) a.native(), count);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned count> SIMDPP_INL
uint16x16 i_shift_l(const uint16x16& a)
{
    static_assert(count < 16, "Shift out of bounds");
    return _mm256_slli_epi16(a.native(), count);
}
#endif

#if SIMDPP_USE_AVX512BW
template<unsigned count> SIMDPP_INL
uint16<32> i_shift_l(const uint16<32>& a)
{
    static_assert(count < 16, "Shift out of bounds");
    return _mm512_slli_epi16(a.native(), count);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned count> SIMDPP_INL
uint32x4 i_shift_l(const uint32x4& a)
{
    static_assert(count < 32, "Shift out of bounds");
#if SIMDPP_USE_NULL
    return i_shift_l(a, count);
#elif SIMDPP_USE_SSE2
    return _mm_slli_epi32(a.native(), count);
#elif SIMDPP_USE_NEON
    return vshlq_n_u32(a.native(), count);
#elif SIMDPP_USE_ALTIVEC
    uint32x4 shift = make_uint(count);
    return vec_sl(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_slli_w((v4i32) a.native(), count);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned count> SIMDPP_INL
uint32x8 i_shift_l(const uint32x8& a)
{
    static_assert(count < 32, "Shift out of bounds");
    return _mm256_slli_epi32(a.native(), count);
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned count> SIMDPP_INL
uint32<16> i_shift_l(const uint32<16>& a)
{
    static_assert(count < 32, "Shift out of bounds");
    return _mm512_slli_epi32(a.native(), count);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned count> SIMDPP_INL
uint64x2 i_shift_l(const uint64x2& a)
{
    static_assert(count < 64, "Shift out of bounds");
#if SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    return i_shift_l(a, count);
#elif SIMDPP_USE_SSE2
    return _mm_slli_epi64(a.native(), count);
#elif SIMDPP_USE_NEON
    return vshlq_n_u64(a.native(), count);
#elif SIMDPP_USE_MSA
    return (v2u64) __msa_slli_d((v2i64) a.native(), count);
#else
    return SIMDPP_NOT_IMPLEMENTED1(a);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned count> SIMDPP_INL
uint64x4 i_shift_l(const uint64x4& a)
{
    static_assert(count < 64, "Shift out of bounds");
    return _mm256_slli_epi64(a.native(), count);
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned count> SIMDPP_INL
uint64<8> i_shift_l(const uint64<8>& a)
{
    static_assert(count < 64, "Shift out of bounds");
    return _mm512_slli_epi64(a.native(), count);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned count, class V> SIMDPP_INL
V i_shift_l(const V& a)
{
    SIMDPP_VEC_ARRAY_IMPL1(V, i_shift_l<count>, a);
}

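// Dispatch helper: when no_shift is true, run() returns its argument
// unchanged instead of instantiating i_shift_l<count>, so callers can handle
// a zero shift count without generating a shift.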
template<bool no_shift>
struct i_shift_l_wrapper {
    template<unsigned count, class V>
    static SIMDPP_INL V run(const V& arg) { return i_shift_l<count>(arg); }
};
template<>
struct i_shift_l_wrapper<true> {
    template<unsigned count, class V>
    static SIMDPP_INL V run(const V& arg) { return arg; }
};

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif
