/*  Copyright (C) 2011-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_SHIFT_R_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_SHIFT_R_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/not_implemented.h>
#include <simdpp/core/bit_and.h>
#include <simdpp/core/bit_andnot.h>
#include <simdpp/core/bit_or.h>
#include <simdpp/core/i_add.h>
#include <simdpp/core/i_sub.h>
#include <simdpp/core/splat.h>
#include <simdpp/core/set_splat.h>
#include <simdpp/core/permute4.h>
#include <simdpp/core/shuffle2.h>
#include <simdpp/detail/insn/i_shift.h>
#include <simdpp/detail/null/math.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

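// Overloads taking a runtime shift count come first; the template overloads
// for counts known at compile time follow in the second half of the file.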
static SIMDPP_INL
int8x16 i_shift_r(const int8x16& a, unsigned count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_r(a, count);
#elif SIMDPP_USE_SSE2
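    // SSE2 has no 8-bit shifts; emulate with 16-bit arithmetic shifts. 'lo'
    // handles the low byte of each 16-bit lane: move it into the high half,
    // shift arithmetically, then shift back down, which also zeroes the upper
    // half. 'hi' handles the high byte: shifting right by 8+count leaves the
    // sign-extended result in the low half, and shifting left by 8 moves it
    // back while clearing the low half. OR-ing merges the two.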
    uint16x8 hi, lo;
    lo = hi = a;

    lo = shift_l<8>(lo);
    lo = shift_r(int16x8(lo), count);
    lo = shift_r<8>(lo);

    hi = shift_r(int16x8(hi), 8+count);
    hi = shift_l<8>(hi);
    return (int8<16>) bit_or(lo, hi); // higher part of lo is already clear
#elif SIMDPP_USE_NEON
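    // NEON lacks a variable right-shift instruction; vshlq shifts right when
    // given a negated, splatted count.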
    int8x16 shift = splat(-int(count));
    return vshlq_s8(a.native(), shift.native());
#elif SIMDPP_USE_ALTIVEC
    uint8x16 shift = splat(count);
    return vec_sra(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    int8x16 shift = splat(count);
    return __msa_sra_b(a.native(), shift.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int8x32 i_shift_r(const int8x32& a, unsigned count)
{
    uint16x16 hi, lo;
    lo = hi = a;

    lo = shift_l<8>(lo);
    lo = shift_r(int16x16(lo), count);
    lo = shift_r<8>(lo);

    hi = shift_r(int16x16(hi), 8+count);
    hi = shift_l<8>(hi);
    return (int8<32>) bit_or(lo, hi); // higher part of lo is already clear
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL int8<64> i_shift_r(const int8<64>& a, unsigned count)
{
    uint16<32> hi, lo;
    lo = hi = a;

    lo = shift_l<8>(lo);
    lo = shift_r(int16<32>(lo), count);
    lo = shift_r<8>(lo);

    hi = shift_r(int16<32>(hi), 8+count);
    hi = shift_l<8>(hi);
    return (int8<64>) bit_or(lo, hi); // higher part of lo is already clear
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint8x16 i_shift_r(const uint8x16& a, unsigned count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_r(a, count);
#elif SIMDPP_USE_SSE2
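    // After a 16-bit logical shift, the top 'count' bits of each low byte
    // hold bits spilled in from the high byte. Build a mask selecting exactly
    // those bits and clear them.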
    uint16x8 mask, a16;
    mask = make_ones();
    mask = shift_l(mask, 16-count);
    mask = shift_r<8>(mask);

    a16 = a;
    a16 = shift_r(a16, count);
    a16 = bit_andnot(a16, mask);
    return uint8x16(a16);
#elif SIMDPP_USE_NEON
    int8x16 shift = splat(-int(count));
    return vshlq_u8(a.native(), shift.native());
#elif SIMDPP_USE_ALTIVEC
    uint8x16 shift = splat(count);
    return vec_sr(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    int8x16 shift = splat(count);
    return (v16u8) __msa_srl_b((v16i8) a.native(), shift.native());
#endif
}

#if SIMDPP_USE_AVX2
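// Same spill-mask idea as the SSE2 byte shift above, but the mask is computed
// in scalar code and splatted rather than built with vector shifts. The
// AVX-512 version below follows the same pattern.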
static SIMDPP_INL
uint8x32 i_shift_r(const uint8x32& a, unsigned count)
{
    unsigned shift = 8 - count;
    uint16_t mask1 = (0xff >> shift) << shift;
    uint16x16 mask, a16;
    mask = splat(mask1);

    a16 = a;
    a16 = shift_r(a16, count);
    a16 = bit_andnot(a16, mask);
    return uint8x32(a16);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint8<64> i_shift_r(const uint8<64>& a, unsigned count)
{
    unsigned shift = 8 - count;
    uint16_t mask1 = (0xff >> shift) << shift;
    uint16<32> mask, a16;
    mask = splat(mask1);

    a16 = a;
    a16 = shift_r(a16, count);
    a16 = bit_andnot(a16, mask);
    return uint8<64>(a16);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
int16x8 i_shift_r(const int16x8& a, unsigned count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_r(a, count);
#elif SIMDPP_USE_SSE2
    return _mm_sra_epi16(a.native(), _mm_cvtsi32_si128(count));
#elif SIMDPP_USE_NEON
    int16x8 shift = splat(-int(count));
    return vshlq_s16(a.native(), shift.native());
#elif SIMDPP_USE_ALTIVEC
    uint16x8 shift = splat(count);
    return vec_sra(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    int16x8 shift = splat(count);
    return __msa_sra_h(a.native(), shift.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int16x16 i_shift_r(const int16x16& a, unsigned count)
{
#if SIMDPP_WORKAROUND_AVX2_SHIFT_INTRINSICS
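    // The workaround macro indicates compilers whose AVX2 variable-shift
    // intrinsics are unreliable; emit the instruction directly via inline
    // asm instead. The other AVX2 overloads below use the same pattern.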
    __m256i r = a.native();
    __m128i x = _mm_cvtsi32_si128(count);
    __asm("vpsraw %1, %2, %0" : "=x"(r) : "x"(x), "x"(r));
    return r;
#else
    return _mm256_sra_epi16(a.native(), _mm_cvtsi32_si128(count));
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL int16<32> i_shift_r(const int16<32>& a, unsigned count)
{
    return _mm512_sra_epi16(a.native(), _mm_cvtsi32_si128(count));
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint16x8 i_shift_r(const uint16x8& a, unsigned count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_r(a, count);
#elif SIMDPP_USE_SSE2
    return _mm_srli_epi16(a.native(), count);
#elif SIMDPP_USE_NEON
    int16x8 shift = splat(-int(count));
    return vshlq_u16(a.native(), shift.native());
#elif SIMDPP_USE_ALTIVEC
    uint16x8 shift = splat(count);
    return vec_sr(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    int16x8 shift = splat(count);
    return (v8u16) __msa_srl_h((v8i16) a.native(), shift.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint16x16 i_shift_r(const uint16x16& a, unsigned count)
{
#if SIMDPP_WORKAROUND_AVX2_SHIFT_INTRINSICS
    __m256i r = a.native();
    __m128i x = _mm_cvtsi32_si128(count);
    __asm("vpsrlw %1, %2, %0" : "=x"(r) : "x"(x), "x"(r));
    return r;
#else
    return _mm256_srl_epi16(a.native(), _mm_cvtsi32_si128(count));
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint16<32> i_shift_r(const uint16<32>& a, unsigned count)
{
    return _mm512_srl_epi16(a.native(), _mm_cvtsi32_si128(count));
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
int32x4 i_shift_r(const int32x4& a, unsigned count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_r(a, count);
#elif SIMDPP_USE_SSE2
    return _mm_sra_epi32(a.native(), _mm_cvtsi32_si128(count));
#elif SIMDPP_USE_NEON
    int32x4 shift = splat(-int(count));
    return vshlq_s32(a.native(), shift.native());
#elif SIMDPP_USE_ALTIVEC
    uint32x4 shift = splat(count);
    return vec_sra(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    int32x4 shift = splat(count);
    return __msa_sra_w(a.native(), shift.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int32x8 i_shift_r(const int32x8& a, unsigned count)
{
#if SIMDPP_WORKAROUND_AVX2_SHIFT_INTRINSICS
    __m256i r = a.native();
    __m128i x = _mm_cvtsi32_si128(count);
    __asm("vpsrad %1, %2, %0" : "=x"(r) : "x"(x), "x"(r));
    return r;
#else
    return _mm256_sra_epi32(a.native(), _mm_cvtsi32_si128(count));
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
int32<16> i_shift_r(const int32<16>& a, unsigned count)
{
    return _mm512_sra_epi32(a.native(), _mm_cvtsi32_si128(count));
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint32x4 i_shift_r(const uint32x4& a, unsigned count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_r(a, count);
#elif SIMDPP_USE_SSE2
    return _mm_srl_epi32(a.native(), _mm_cvtsi32_si128(count));
#elif SIMDPP_USE_NEON
    int32x4 shift = splat(-int(count));
    return vshlq_u32(a.native(), shift.native());
#elif SIMDPP_USE_ALTIVEC
    uint32x4 shift = splat(count);
    return vec_sr(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    int32x4 shift = splat(count);
    return (v4u32) __msa_srl_w((v4i32) a.native(), shift.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32x8 i_shift_r(const uint32x8& a, unsigned count)
{
#if SIMDPP_WORKAROUND_AVX2_SHIFT_INTRINSICS
    __m256i r = a.native();
    __m128i x = _mm_cvtsi32_si128(count);
    __asm("vpsrld %1, %2, %0" : "=x"(r) : "x"(x), "x"(r));
    return r;
#else
    return _mm256_srl_epi32(a.native(), _mm_cvtsi32_si128(count));
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint32<16> i_shift_r(const uint32<16>& a, unsigned count)
{
    return _mm512_srl_epi32(a.native(), _mm_cvtsi32_si128(count));
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint64x2 i_shift_r(const uint64x2& a, unsigned count)
{
#if SIMDPP_USE_SSE2
    return _mm_srl_epi64(a.native(), _mm_cvtsi32_si128(count));
#elif SIMDPP_USE_NEON
    int64x2 shift = splat(-int(count));
    return vshlq_u64(a.native(), shift.native());
#elif SIMDPP_USE_VSX_207
    uint64x2 shift = splat(count);
    return vec_sr(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    int32x4 shift = splat(count);
    return (v2u64) __msa_srl_d((v2i64) a.native(), (v2i64) shift.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    return detail::null::shift_r(a, count);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint64x4 i_shift_r(const uint64x4& a, unsigned count)
{
#if SIMDPP_WORKAROUND_AVX2_SHIFT_INTRINSICS
    __m256i r = a.native();
    __m128i x = _mm_cvtsi32_si128(count);
    __asm("vpsrlq %1, %2, %0" : "=x"(r) : "x"(x), "x"(r));
    return r;
#else
    return _mm256_srl_epi64(a.native(), _mm_cvtsi32_si128(count));
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint64<8> i_shift_r(const uint64<8>& a, unsigned count)
{
    return _mm512_srl_epi64(a.native(), _mm_cvtsi32_si128(count));
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
int64x2 i_shift_r(const int64x2& a, unsigned count)
{
#if SIMDPP_USE_SSE2
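    // SSE2 has no 64-bit arithmetic shift. Add a bias of 2^63: the logical
    // shift of (a + bias) equals sra(a, count) + (bias >> count), so
    // subtracting the shifted bias yields the arithmetic result.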
    uint64<2> ret, bias;
    bias = make_uint(0x8000000000000000);
    ret = shift_r(add(uint64<2>(a), bias), count);
    ret = sub(ret, shift_r(bias, count));
    return (int64<2>) ret;
#elif SIMDPP_USE_NEON
    int64x2 shift = splat(-int(count));
    return vshlq_s64(a.native(), shift.native());
#elif SIMDPP_USE_VSX_207
    uint64x2 shift = splat(count);
    return vec_sra(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    int32x4 shift = splat(count);
    return __msa_sra_d(a.native(), (v2i64) shift.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    return detail::null::shift_r(a, count);
#endif
}

#if SIMDPP_USE_AVX2
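// Same bias trick as the 128-bit version above.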
static SIMDPP_INL
int64x4 i_shift_r(const int64x4& a, unsigned count)
{
    uint64<4> ret, bias;
    bias = make_uint(0x8000000000000000);
    ret = shift_r(add(uint64<4>(a), bias), count);
    ret = sub(ret, shift_r(bias, count));
    return (int64<4>) ret;
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
int64<8> i_shift_r(const int64<8>& a, unsigned count)
{
    return _mm512_sra_epi64(a.native(), _mm_cvtsi32_si128(count));
}
#endif

// -----------------------------------------------------------------------------

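// Generic fallback: apply the runtime-count shift to each register of a
// multi-register vector.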
template<class V> SIMDPP_INL
V i_shift_r(const V& a, unsigned count)
{
    SIMDPP_VEC_ARRAY_IMPL2S(V, i_shift_r, a, count);
}


// -----------------------------------------------------------------------------

template<unsigned count, unsigned N> SIMDPP_INL
uint8<N> sse_shift_r_u8(const uint8<N>& a);


template<unsigned count> SIMDPP_INL
int8x16 i_shift_r(const int8x16& a)
{
    static_assert(count < 8, "Shift out of bounds");
#if SIMDPP_USE_NULL
    return i_shift_r(a, count);
#elif SIMDPP_USE_SSE2
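    // Same 16-bit emulation as the runtime-count version, with the count
    // folded into the immediate forms of the shifts.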
    uint16<8> hi, lo;
    lo = hi = a;

    lo = shift_l<8>(lo);
    lo = shift_r<count>(int16<8>(lo));
    lo = shift_r<8>(lo);

    hi = shift_r<8+count>(int16<8>(hi));
    hi = shift_l<8>(hi);
    return (int8<16>) bit_or(lo, hi); // higher part of lo is already clear
#elif SIMDPP_USE_NEON
    return vshrq_n_s8(a.native(), count);
#elif SIMDPP_USE_ALTIVEC
    uint8x16 shift = make_uint(count);
    return vec_sra(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    return __msa_srai_b(a.native(), count);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned count> SIMDPP_INL
int8x32 i_shift_r(const int8x32& a)
{
    static_assert(count < 8, "Shift out of bounds");
    uint16<16> hi, lo;
    lo = hi = a;

    lo = shift_l<8>(lo);
    lo = shift_r<count>(int16<16>(lo));
    lo = shift_r<8>(lo);

    hi = shift_r<8+count>(int16<16>(hi));
    hi = shift_l<8>(hi);
    return (int8<32>) bit_or(lo, hi); // higher part of lo is already clear
}
#endif

#if SIMDPP_USE_AVX512BW
template<unsigned count> SIMDPP_INL
int8<64> i_shift_r(const int8<64>& a)
{
    static_assert(count < 8, "Shift out of bounds");
    uint16<32> hi, lo;
    lo = hi = a;

    lo = shift_l<8>(lo);
    lo = shift_r<count>(int16<32>(lo));
    lo = shift_r<8>(lo);

    hi = shift_r<8+count>(int16<32>(hi));
    hi = shift_l<8>(hi);
    return (int8<64>) bit_or(lo, hi); // higher part of lo is already clear
}
#endif

// -----------------------------------------------------------------------------

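// Immediate-count unsigned byte shift for the SSE family: clearing the low
// 'count' bits of every byte beforehand guarantees that nothing crosses byte
// boundaries during the 16-bit shift, so no post-shift mask is needed.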
template<unsigned count, unsigned N> SIMDPP_INL
uint8<N> sse_shift_r_u8(const uint8<N>& a)
{
    uint8_t mask1 = (0xff << count) & 0xff;
    uint8<N> mask = make_uint(mask1);

    uint16<N/2> a16 = (uint16<N/2>) bit_and(a, mask);
    a16 = shift_r<count>(a16);

    return uint8<N>(a16);
}

template<unsigned count> SIMDPP_INL
uint8x16 i_shift_r(const uint8x16& a)
{
    static_assert(count < 8, "Shift out of bounds");
#if SIMDPP_USE_NULL
    return i_shift_r(a, count);
#elif SIMDPP_USE_SSE2
    return sse_shift_r_u8<count>(a);
#elif SIMDPP_USE_NEON
    return vshrq_n_u8(a.native(), count);
#elif SIMDPP_USE_ALTIVEC
    uint8x16 shift = make_uint(count);
    return vec_sr(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_srli_b((v16i8) a.native(), count);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned count> SIMDPP_INL
uint8x32 i_shift_r(const uint8x32& a)
{
    static_assert(count < 8, "Shift out of bounds");
    return sse_shift_r_u8<count>(a);
}
#endif

#if SIMDPP_USE_AVX512BW
template<unsigned count> SIMDPP_INL
uint8<64> i_shift_r(const uint8<64>& a)
{
    static_assert(count < 8, "Shift out of bounds");
    return sse_shift_r_u8<count>(a);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned count> SIMDPP_INL
int16x8 i_shift_r(const int16x8& a)
{
    static_assert(count < 16, "Shift out of bounds");
#if SIMDPP_USE_NULL
    return detail::null::shift_r(a, count);
#elif SIMDPP_USE_SSE2
    return _mm_srai_epi16(a.native(), count);
#elif SIMDPP_USE_NEON
    return vshrq_n_s16(a.native(), count);
#elif SIMDPP_USE_ALTIVEC
    uint16x8 shift = make_uint(count);
    return vec_sra(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    return __msa_srai_h(a.native(), count);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned count> SIMDPP_INL
int16x16 i_shift_r(const int16x16& a)
{
    static_assert(count < 16, "Shift out of bounds");
    return _mm256_srai_epi16(a.native(), count);
}
#endif

#if SIMDPP_USE_AVX512BW
template<unsigned count> SIMDPP_INL
int16<32> i_shift_r(const int16<32>& a)
{
    static_assert(count < 16, "Shift out of bounds");
    return _mm512_srai_epi16(a.native(), count);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned count> SIMDPP_INL
uint16x8 i_shift_r(const uint16x8& a)
{
    static_assert(count < 16, "Shift out of bounds");
#if SIMDPP_USE_NULL
    return i_shift_r(a, count);
#elif SIMDPP_USE_SSE2
    return _mm_srli_epi16(a.native(), count);
#elif SIMDPP_USE_NEON
    return vshrq_n_u16(a.native(), count);
#elif SIMDPP_USE_ALTIVEC
    uint16x8 shift = make_uint(count);
    return vec_sr(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_srli_h((v8i16) a.native(), count);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned count> SIMDPP_INL
uint16x16 i_shift_r(const uint16x16& a)
{
    static_assert(count < 16, "Shift out of bounds");
    return _mm256_srli_epi16(a.native(), count);
}
#endif

#if SIMDPP_USE_AVX512BW
template<unsigned count> SIMDPP_INL
uint16<32> i_shift_r(const uint16<32>& a)
{
    static_assert(count < 16, "Shift out of bounds");
    return _mm512_srli_epi16(a.native(), count);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned count> SIMDPP_INL
int32x4 i_shift_r(const int32x4& a)
{
    static_assert(count < 32, "Shift out of bounds");
#if SIMDPP_USE_NULL
    return i_shift_r(a, count);
#elif SIMDPP_USE_SSE2
    return _mm_srai_epi32(a.native(), count);
#elif SIMDPP_USE_NEON
    return vshrq_n_s32(a.native(), count);
#elif SIMDPP_USE_ALTIVEC
    uint32x4 shift = make_uint(count);
    return vec_sra(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    return __msa_srai_w(a.native(), count);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned count> SIMDPP_INL
int32x8 i_shift_r(const int32x8& a)
{
    static_assert(count < 32, "Shift out of bounds");
    return _mm256_srai_epi32(a.native(), count);
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned count> SIMDPP_INL
int32<16> i_shift_r(const int32<16>& a)
{
    static_assert(count < 32, "Shift out of bounds");
    return _mm512_srai_epi32(a.native(), count);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned count> SIMDPP_INL
uint32x4 i_shift_r(const uint32x4& a)
{
    static_assert(count < 32, "Shift out of bounds");
#if SIMDPP_USE_NULL
    return i_shift_r(a, count);
#elif SIMDPP_USE_SSE2
    return _mm_srli_epi32(a.native(), count);
#elif SIMDPP_USE_NEON
    return vshrq_n_u32(a.native(), count);
#elif SIMDPP_USE_ALTIVEC
    uint32x4 shift = make_uint(count);
    return vec_sr(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_srli_w((v4i32) a.native(), count);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned count> SIMDPP_INL
uint32x8 i_shift_r(const uint32x8& a)
{
    static_assert(count < 32, "Shift out of bounds");
    return _mm256_srli_epi32(a.native(), count);
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned count> SIMDPP_INL
uint32<16> i_shift_r(const uint32<16>& a)
{
    static_assert(count < 32, "Shift out of bounds");
    return _mm512_srli_epi32(a.native(), count);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned count> SIMDPP_INL
int64x2 i_shift_r(const int64x2& a)
{
    static_assert(count < 64, "Shift out of bounds");
#if SIMDPP_USE_NEON
    return vshrq_n_s64(a.native(), count);
#elif SIMDPP_USE_VSX_207
    uint64x2 shift = splat(count);
    return vec_sra(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    return __msa_srai_d(a.native(), count);
#elif SIMDPP_USE_NULL || SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC
    return i_shift_r(a, count);
#else
    return SIMDPP_NOT_IMPLEMENTED_TEMPLATE1(int64<count>, a);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned count> SIMDPP_INL
int64x4 i_shift_r(const int64x4& a)
{
    return i_shift_r(a, count);
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned count> SIMDPP_INL
int64<8> i_shift_r(const int64<8>& a)
{
    static_assert(count < 64, "Shift out of bounds");
    return _mm512_srai_epi64(a.native(), count);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned count> SIMDPP_INL
uint64x2 i_shift_r(const uint64x2& a)
{
    static_assert(count < 64, "Shift out of bounds");
#if SIMDPP_USE_SSE2
    return _mm_srli_epi64(a.native(), count);
#elif SIMDPP_USE_NEON
    return vshrq_n_u64(a.native(), count);
#elif SIMDPP_USE_VSX_207
    uint64x2 shift = splat(count);
    return vec_sr(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    return (v2u64) __msa_srli_d((v2i64) a.native(), count);
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    return i_shift_r(a, count);
#else
    return SIMDPP_NOT_IMPLEMENTED_TEMPLATE1(int64<count>, a);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned count> SIMDPP_INL
uint64x4 i_shift_r(const uint64x4& a)
{
    static_assert(count < 64, "Shift out of bounds");
    return _mm256_srli_epi64(a.native(), count);
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned count> SIMDPP_INL
uint64<8> i_shift_r(const uint64<8>& a)
{
    static_assert(count < 64, "Shift out of bounds");
    return _mm512_srli_epi64(a.native(), count);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned count, class V> SIMDPP_INL
V i_shift_r(const V& a)
{
    static_assert(count < 64, "Shift out of bounds");
    SIMDPP_VEC_ARRAY_IMPL1(V, i_shift_r<count>, a);
}

// -----------------------------------------------------------------------------

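// Compile-time dispatcher: the no_shift specialization returns the argument
// unchanged for count == 0, so i_shift_r<0> is never instantiated. This
// matters on targets such as NEON, where the immediate-shift intrinsics
// (vshrq_n_*) require a count of at least 1.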
template<bool no_shift>
struct i_shift_r_wrapper {
    template<unsigned count, class V>
    static SIMDPP_INL V run(const V& arg) { return i_shift_r<count>(arg); }
};
template<>
struct i_shift_r_wrapper<true> {
    template<unsigned count, class V>
    static SIMDPP_INL V run(const V& arg) { return arg; }
};

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif