/*  Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_SHIFT_R_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_SHIFT_R_H

#ifndef LIBSIMDPP_SIMD_H
#error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/not_implemented.h>
#include <simdpp/core/bit_and.h>
#include <simdpp/core/bit_andnot.h>
#include <simdpp/core/bit_or.h>
#include <simdpp/core/i_add.h>
#include <simdpp/core/i_sub.h>
#include <simdpp/core/splat.h>
#include <simdpp/core/set_splat.h>
#include <simdpp/core/permute4.h>
#include <simdpp/core/shuffle2.h>
#include <simdpp/detail/insn/i_shift.h>
#include <simdpp/detail/null/math.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

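// Dynamic-count arithmetic right shift of signed 8-bit elements. SSE2 has no
// 8-bit shift instructions, so the SSE2 path below emulates them with 16-bit
// shifts and recombines the two bytes of each 16-bit lane. A rough scalar
// sketch of one lane (illustrative only, not part of the library API):
//
//     uint16_t lo = uint16_t(int16_t(uint16_t(lane << 8)) >> count) >> 8;
//     uint16_t hi = uint16_t(int16_t(lane) >> (8 + count)) << 8;
//     uint16_t result = hi | lo;
//
// The NEON path relies on vshlq_s8 shifting right when given a negative count.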
static SIMDPP_INL
int8x16 i_shift_r(const int8x16& a, unsigned count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_r(a, count);
#elif SIMDPP_USE_SSE2
    uint16x8 hi, lo;
    lo = hi = a;

    lo = shift_l<8>(lo);
    lo = shift_r(int16x8(lo), count);
    lo = shift_r<8>(lo);

    hi = shift_r(int16x8(hi), 8+count);
    hi = shift_l<8>(hi);
    return (int8<16>) bit_or(lo, hi); // higher part of lo is already clear
#elif SIMDPP_USE_NEON
    int8x16 shift = splat(-int(count));
    return vshlq_s8(a.native(), shift.native());
#elif SIMDPP_USE_ALTIVEC
    uint8x16 shift = splat(count);
    return vec_sra(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    int8x16 shift = splat(count);
    return __msa_sra_b(a.native(), shift.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int8x32 i_shift_r(const int8x32& a, unsigned count)
{
    uint16x16 hi, lo;
    lo = hi = a;

    lo = shift_l<8>(lo);
    lo = shift_r(int16x16(lo), count);
    lo = shift_r<8>(lo);

    hi = shift_r(int16x16(hi), 8+count);
    hi = shift_l<8>(hi);
    return (int8<32>) bit_or(lo, hi); // higher part of lo is already clear
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL int8<64> i_shift_r(const int8<64>& a, unsigned count)
{
    uint16<32> hi, lo;
    lo = hi = a;

    lo = shift_l<8>(lo);
    lo = shift_r(int16<32>(lo), count);
    lo = shift_r<8>(lo);

    hi = shift_r(int16<32>(hi), 8+count);
    hi = shift_l<8>(hi);
    return (int8<64>) bit_or(lo, hi); // higher part of lo is already clear
}
#endif

// -----------------------------------------------------------------------------

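// Dynamic-count logical right shift of unsigned 8-bit elements. The SSE2 path
// shifts whole 16-bit lanes; only the low byte of each lane is contaminated,
// in its top `count` bits, by bits leaking in from the high byte. The mask
// built below (ones shifted left by 16-count, then right by 8) selects exactly
// those bits, and bit_andnot clears them.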
static SIMDPP_INL
uint8x16 i_shift_r(const uint8x16& a, unsigned count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_r(a, count);
#elif SIMDPP_USE_SSE2
    uint16x8 mask, a16;
    mask = make_ones();
    mask = shift_l(mask, 16-count);
    mask = shift_r<8>(mask);

    a16 = a;
    a16 = shift_r(a16, count);
    a16 = bit_andnot(a16, mask);
    return uint8x16(a16);
#elif SIMDPP_USE_NEON
    int8x16 shift = splat(-int(count));
    return vshlq_u8(a.native(), shift.native());
#elif SIMDPP_USE_ALTIVEC
    uint8x16 shift = splat(count);
    return vec_sr(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    int8x16 shift = splat(count);
    return (v16u8) __msa_srl_b((v16i8) a.native(), shift.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint8x32 i_shift_r(const uint8x32& a, unsigned count)
{
    unsigned shift = 8 - count;
    uint16_t mask1 = (0xff >> shift) << shift;
    uint16x16 mask, a16;
    mask = splat(mask1);

    a16 = a;
    a16 = shift_r(a16, count);
    a16 = bit_andnot(a16, mask);
    return uint8x32(a16);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint8<64> i_shift_r(const uint8<64>& a, unsigned count)
{
    unsigned shift = 8 - count;
    uint16_t mask1 = (0xff >> shift) << shift;
    uint16<32> mask, a16;
    mask = splat(mask1);

    a16 = a;
    a16 = shift_r(a16, count);
    a16 = bit_andnot(a16, mask);
    return uint8<64>(a16);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
int16x8 i_shift_r(const int16x8& a, unsigned count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_r(a, count);
#elif SIMDPP_USE_SSE2
    return _mm_sra_epi16(a.native(), _mm_cvtsi32_si128(count));
#elif SIMDPP_USE_NEON
    int16x8 shift = splat(-int(count));
    return vshlq_s16(a.native(), shift.native());
#elif SIMDPP_USE_ALTIVEC
    uint16x8 shift = splat(count);
    return vec_sra(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    int16x8 shift = splat(count);
    return __msa_sra_h(a.native(), shift.native());
#endif
}

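// Several AVX2 overloads below have an alternative code path: when
// SIMDPP_WORKAROUND_AVX2_SHIFT_INTRINSICS is defined, the variable-count shift
// instruction is emitted directly through inline assembly rather than the
// _mm256_*_epi* intrinsics. This is presumably a workaround for compilers that
// mishandle these intrinsics with a non-immediate count; the asm form takes
// the count in the low bits of an XMM register, matching what
// _mm_cvtsi32_si128 produces.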
#if SIMDPP_USE_AVX2
static SIMDPP_INL
int16x16 i_shift_r(const int16x16& a, unsigned count)
{
#if SIMDPP_WORKAROUND_AVX2_SHIFT_INTRINSICS
    __m256i r = a.native();
    __m128i x = _mm_cvtsi32_si128(count);
    __asm("vpsraw %1, %2, %0" : "=x" (r) : "x" (x), "x" (r));
    return r;
#else
    return _mm256_sra_epi16(a.native(), _mm_cvtsi32_si128(count));
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL int16<32> i_shift_r(const int16<32>& a, unsigned count)
{
    return _mm512_sra_epi16(a.native(), _mm_cvtsi32_si128(count));
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint16x8 i_shift_r(const uint16x8& a, unsigned count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_r(a, count);
#elif SIMDPP_USE_SSE2
    return _mm_srli_epi16(a.native(), count);
#elif SIMDPP_USE_NEON
    int16x8 shift = splat(-int(count));
    return vshlq_u16(a.native(), shift.native());
#elif SIMDPP_USE_ALTIVEC
    uint16x8 shift = splat(count);
    return vec_sr(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    int16x8 shift = splat(count);
    return (v8u16) __msa_srl_h((v8i16) a.native(), shift.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint16x16 i_shift_r(const uint16x16& a, unsigned count)
{
#if SIMDPP_WORKAROUND_AVX2_SHIFT_INTRINSICS
    __m256i r = a.native();
    __m128i x = _mm_cvtsi32_si128(count);
    __asm("vpsrlw %1, %2, %0" : "=x" (r) : "x" (x), "x" (r));
    return r;
#else
    return _mm256_srl_epi16(a.native(), _mm_cvtsi32_si128(count));
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint16<32> i_shift_r(const uint16<32>& a, unsigned count)
{
    return _mm512_srl_epi16(a.native(), _mm_cvtsi32_si128(count));
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
int32x4 i_shift_r(const int32x4& a, unsigned count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_r(a, count);
#elif SIMDPP_USE_SSE2
    return _mm_sra_epi32(a.native(), _mm_cvtsi32_si128(count));
#elif SIMDPP_USE_NEON
    int32x4 shift = splat(-int(count));
    return vshlq_s32(a.native(), shift.native());
#elif SIMDPP_USE_ALTIVEC
    uint32x4 shift = splat(count);
    return vec_sra(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    int32x4 shift = splat(count);
    return __msa_sra_w(a.native(), shift.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int32x8 i_shift_r(const int32x8& a, unsigned count)
{
#if SIMDPP_WORKAROUND_AVX2_SHIFT_INTRINSICS
    __m256i r = a.native();
    __m128i x = _mm_cvtsi32_si128(count);
    __asm("vpsrad %1, %2, %0" : "=x" (r) : "x" (x), "x" (r));
    return r;
#else
    return _mm256_sra_epi32(a.native(), _mm_cvtsi32_si128(count));
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
int32<16> i_shift_r(const int32<16>& a, unsigned count)
{
    return _mm512_sra_epi32(a.native(), _mm_cvtsi32_si128(count));
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint32x4 i_shift_r(const uint32x4& a, unsigned count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_r(a, count);
#elif SIMDPP_USE_SSE2
    return _mm_srl_epi32(a.native(), _mm_cvtsi32_si128(count));
#elif SIMDPP_USE_NEON
    int32x4 shift = splat(-int(count));
    return vshlq_u32(a.native(), shift.native());
#elif SIMDPP_USE_ALTIVEC
    uint32x4 shift = splat(count);
    return vec_sr(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    int32x4 shift = splat(count);
    return (v4u32) __msa_srl_w((v4i32) a.native(), shift.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32x8 i_shift_r(const uint32x8& a, unsigned count)
{
#if SIMDPP_WORKAROUND_AVX2_SHIFT_INTRINSICS
    __m256i r = a.native();
    __m128i x = _mm_cvtsi32_si128(count);
    __asm("vpsrld %1, %2, %0" : "=x" (r) : "x" (x), "x" (r));
    return r;
#else
    return _mm256_srl_epi32(a.native(), _mm_cvtsi32_si128(count));
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint32<16> i_shift_r(const uint32<16>& a, unsigned count)
{
    return _mm512_srl_epi32(a.native(), _mm_cvtsi32_si128(count));
}
#endif

// -----------------------------------------------------------------------------

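// 64-bit element shifts. AltiVec gained vector 64-bit shifts only with
// VSX 2.07 (POWER8), so the plain-AltiVec branches below fall back to the
// scalar reference implementation in detail::null.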
static SIMDPP_INL
uint64x2 i_shift_r(const uint64x2& a, unsigned count)
{
#if SIMDPP_USE_SSE2
    return _mm_srl_epi64(a.native(), _mm_cvtsi32_si128(count));
#elif SIMDPP_USE_NEON
    int64x2 shift = splat(-int(count));
    return vshlq_u64(a.native(), shift.native());
#elif SIMDPP_USE_VSX_207
    uint64x2 shift = splat(count);
    return vec_sr(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    int32x4 shift = splat(count);
    return (v2u64) __msa_srl_d((v2i64) a.native(), (v2i64) shift.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    return detail::null::shift_r(a, count);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint64x4 i_shift_r(const uint64x4& a, unsigned count)
{
#if SIMDPP_WORKAROUND_AVX2_SHIFT_INTRINSICS
    __m256i r = a.native();
    __m128i x = _mm_cvtsi32_si128(count);
    __asm("vpsrlq %1, %2, %0" : "=x" (r) : "x" (x), "x" (r));
    return r;
#else
    return _mm256_srl_epi64(a.native(), _mm_cvtsi32_si128(count));
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint64<8> i_shift_r(const uint64<8>& a, unsigned count)
{
    return _mm512_srl_epi64(a.native(), _mm_cvtsi32_si128(count));
}
#endif

// -----------------------------------------------------------------------------

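// Dynamic-count arithmetic right shift of signed 64-bit elements. SSE2 and
// AVX2 provide only logical 64-bit shifts, so those paths shift a biased
// unsigned value instead, using the identity (valid for 0 <= n < 64):
//
//     (x >> n)_arithmetic == ((x + 2^63) >> n)_logical - (2^63 >> n)
//
// Adding the bias 2^63 maps the signed range onto the unsigned range, and
// subtracting the logically shifted bias removes the offset it introduced.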
static SIMDPP_INL
int64x2 i_shift_r(const int64x2& a, unsigned count)
{
#if SIMDPP_USE_SSE2
    uint64<2> ret, bias;
    bias = make_uint(0x8000000000000000);
    ret = shift_r(add(uint64<2>(a), bias), count);
    ret = sub(ret, shift_r(bias, count));
    return (int64<2>) ret;
#elif SIMDPP_USE_NEON
    int64x2 shift = splat(-int(count));
    return vshlq_s64(a.native(), shift.native());
#elif SIMDPP_USE_VSX_207
    uint64x2 shift = splat(count);
    return vec_sra(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    int32x4 shift = splat(count);
    return __msa_sra_d(a.native(), (v2i64) shift.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    return detail::null::shift_r(a, count);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int64x4 i_shift_r(const int64x4& a, unsigned count)
{
    uint64<4> ret, bias;
    bias = make_uint(0x8000000000000000);
    ret = shift_r(add(uint64<4>(a), bias), count);
    ret = sub(ret, shift_r(bias, count));
    return (int64<4>) ret;
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
int64<8> i_shift_r(const int64<8>& a, unsigned count)
{
    return _mm512_sra_epi64(a.native(), _mm_cvtsi32_si128(count));
}
#endif

// -----------------------------------------------------------------------------

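// Generic fallback for vector types wider than the native register width:
// SIMDPP_VEC_ARRAY_IMPL2S applies i_shift_r to each native-width sub-vector of
// `a`, passing the scalar `count` through unchanged.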
template<class V> SIMDPP_INL
V i_shift_r(const V& a, unsigned count)
{
    SIMDPP_VEC_ARRAY_IMPL2S(V, i_shift_r, a, count);
}


// -----------------------------------------------------------------------------

template<unsigned count, unsigned N> SIMDPP_INL
uint8<N> shift_r_u8(const uint8<N>& a);


template<unsigned count> SIMDPP_INL
int8x16 i_shift_r(const int8x16& a)
{
    static_assert(count < 8, "Shift out of bounds");
#if SIMDPP_USE_NULL
    return i_shift_r(a, count);
#elif SIMDPP_USE_SSE2
    uint16<8> hi, lo;
    lo = hi = a;

    lo = shift_l<8>(lo);
    lo = shift_r<count>(int16<8>(lo));
    lo = shift_r<8>(lo);

    hi = shift_r<8+count>(int16<8>(hi));
    hi = shift_l<8>(hi);
    return (int8<16>) bit_or(lo, hi); // higher part of lo is already clear
#elif SIMDPP_USE_NEON
    return vshrq_n_s8(a.native(), count);
#elif SIMDPP_USE_ALTIVEC
    uint8x16 shift = make_uint(count);
    return vec_sra(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    return __msa_srai_b(a.native(), count);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned count> SIMDPP_INL
int8x32 i_shift_r(const int8x32& a)
{
    static_assert(count < 8, "Shift out of bounds");
    uint16<16> hi, lo;
    lo = hi = a;

    lo = shift_l<8>(lo);
    lo = shift_r<count>(int16<16>(lo));
    lo = shift_r<8>(lo);

    hi = shift_r<8+count>(int16<16>(hi));
    hi = shift_l<8>(hi);
    return (int8<32>) bit_or(lo, hi); // higher part of lo is already clear
}
#endif

#if SIMDPP_USE_AVX512BW
template<unsigned count> SIMDPP_INL
int8<64> i_shift_r(const int8<64>& a)
{
    static_assert(count < 8, "Shift out of bounds");
    uint16<32> hi, lo;
    lo = hi = a;

    lo = shift_l<8>(lo);
    lo = shift_r<count>(int16<32>(lo));
    lo = shift_r<8>(lo);

    hi = shift_r<8+count>(int16<32>(hi));
    hi = shift_l<8>(hi);
    return (int8<64>) bit_or(lo, hi); // higher part of lo is already clear
}
#endif

// -----------------------------------------------------------------------------

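// Helper for the immediate-count unsigned 8-bit shifts on SSE-family targets.
// The low `count` bits of every byte are cleared first; they are the only bits
// that could leak into the neighbouring byte, so the following 16-bit shift
// then yields the correct per-byte result.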
template<unsigned count, unsigned N> SIMDPP_INL
uint8<N> sse_shift_r_u8(const uint8<N>& a)
{
    uint8_t mask1 = (0xff << count) & 0xff;
    uint8<N> mask = make_uint(mask1);

    uint16<N/2> a16 = (uint16<N/2>) bit_and(a, mask);
    a16 = shift_r<count>(a16);

    return uint8<N>(a16);
}

template<unsigned count> SIMDPP_INL
uint8x16 i_shift_r(const uint8x16& a)
{
    static_assert(count < 8, "Shift out of bounds");
#if SIMDPP_USE_NULL
    return i_shift_r(a, count);
#elif SIMDPP_USE_SSE2
    return sse_shift_r_u8<count>(a);
#elif SIMDPP_USE_NEON
    return vshrq_n_u8(a.native(), count);
#elif SIMDPP_USE_ALTIVEC
    uint8x16 shift = make_uint(count);
    return vec_sr(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_srli_b((v16i8) a.native(), count);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned count> SIMDPP_INL
uint8x32 i_shift_r(const uint8x32& a)
{
    static_assert(count < 8, "Shift out of bounds");
    return sse_shift_r_u8<count>(a);
}
#endif

#if SIMDPP_USE_AVX512BW
template<unsigned count> SIMDPP_INL
uint8<64> i_shift_r(const uint8<64>& a)
{
    static_assert(count < 8, "Shift out of bounds");
    return sse_shift_r_u8<count>(a);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned count> SIMDPP_INL
int16x8 i_shift_r(const int16x8& a)
{
    static_assert(count < 16, "Shift out of bounds");
#if SIMDPP_USE_NULL
    return detail::null::shift_r(a, count);
#elif SIMDPP_USE_SSE2
    return _mm_srai_epi16(a.native(), count);
#elif SIMDPP_USE_NEON
    return vshrq_n_s16(a.native(), count);
#elif SIMDPP_USE_ALTIVEC
    uint16x8 shift = make_uint(count);
    return vec_sra(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    return __msa_srai_h(a.native(), count);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned count> SIMDPP_INL
int16x16 i_shift_r(const int16x16& a)
{
    static_assert(count < 16, "Shift out of bounds");
    return _mm256_srai_epi16(a.native(), count);
}
#endif

#if SIMDPP_USE_AVX512BW
template<unsigned count> SIMDPP_INL
int16<32> i_shift_r(const int16<32>& a)
{
    static_assert(count < 16, "Shift out of bounds");
    return _mm512_srai_epi16(a.native(), count);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned count> SIMDPP_INL
uint16x8 i_shift_r(const uint16x8& a)
{
    static_assert(count < 16, "Shift out of bounds");
#if SIMDPP_USE_NULL
    return i_shift_r(a, count);
#elif SIMDPP_USE_SSE2
    return _mm_srli_epi16(a.native(), count);
#elif SIMDPP_USE_NEON
    return vshrq_n_u16(a.native(), count);
#elif SIMDPP_USE_ALTIVEC
    uint16x8 shift = make_uint(count);
    return vec_sr(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_srli_h((v8i16) a.native(), count);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned count> SIMDPP_INL
uint16x16 i_shift_r(const uint16x16& a)
{
    static_assert(count < 16, "Shift out of bounds");
    return _mm256_srli_epi16(a.native(), count);
}
#endif

#if SIMDPP_USE_AVX512BW
template<unsigned count> SIMDPP_INL
uint16<32> i_shift_r(const uint16<32>& a)
{
    static_assert(count < 16, "Shift out of bounds");
    return _mm512_srli_epi16(a.native(), count);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned count> SIMDPP_INL
int32x4 i_shift_r(const int32x4& a)
{
    static_assert(count < 32, "Shift out of bounds");
#if SIMDPP_USE_NULL
    return i_shift_r(a, count);
#elif SIMDPP_USE_SSE2
    return _mm_srai_epi32(a.native(), count);
#elif SIMDPP_USE_NEON
    return vshrq_n_s32(a.native(), count);
#elif SIMDPP_USE_ALTIVEC
    uint32x4 shift = make_uint(count);
    return vec_sra(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    return __msa_srai_w(a.native(), count);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned count> SIMDPP_INL
int32x8 i_shift_r(const int32x8& a)
{
    static_assert(count < 32, "Shift out of bounds");
    return _mm256_srai_epi32(a.native(), count);
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned count> SIMDPP_INL
int32<16> i_shift_r(const int32<16>& a)
{
    static_assert(count < 32, "Shift out of bounds");
    return _mm512_srai_epi32(a.native(), count);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned count> SIMDPP_INL
uint32x4 i_shift_r(const uint32x4& a)
{
    static_assert(count < 32, "Shift out of bounds");
#if SIMDPP_USE_NULL
    return i_shift_r(a, count);
#elif SIMDPP_USE_SSE2
    return _mm_srli_epi32(a.native(), count);
#elif SIMDPP_USE_NEON
    return vshrq_n_u32(a.native(), count);
#elif SIMDPP_USE_ALTIVEC
    uint32x4 shift = make_uint(count);
    return vec_sr(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_srli_w((v4i32) a.native(), count);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned count> SIMDPP_INL
uint32x8 i_shift_r(const uint32x8& a)
{
    static_assert(count < 32, "Shift out of bounds");
    return _mm256_srli_epi32(a.native(), count);
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned count> SIMDPP_INL
uint32<16> i_shift_r(const uint32<16>& a)
{
    static_assert(count < 32, "Shift out of bounds");
    return _mm512_srli_epi32(a.native(), count);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned count> SIMDPP_INL
int64x2 i_shift_r(const int64x2& a)
{
    static_assert(count < 64, "Shift out of bounds");
#if SIMDPP_USE_NEON
    return vshrq_n_s64(a.native(), count);
#elif SIMDPP_USE_VSX_207
    uint64x2 shift = splat(count);
    return vec_sra(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    return __msa_srai_d(a.native(), count);
#elif SIMDPP_USE_NULL || SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC
    return i_shift_r(a, count);
#else
    return SIMDPP_NOT_IMPLEMENTED_TEMPLATE1(int64<count>, a);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned count> SIMDPP_INL
int64x4 i_shift_r(const int64x4& a)
{
    return i_shift_r(a, count);
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned count> SIMDPP_INL
int64<8> i_shift_r(const int64<8>& a)
{
    static_assert(count < 64, "Shift out of bounds");
    return _mm512_srai_epi64(a.native(), count);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned count> SIMDPP_INL
uint64x2 i_shift_r(const uint64x2& a)
{
    static_assert(count < 64, "Shift out of bounds");
#if SIMDPP_USE_SSE2
    return _mm_srli_epi64(a.native(), count);
#elif SIMDPP_USE_NEON
    return vshrq_n_u64(a.native(), count);
#elif SIMDPP_USE_VSX_207
    uint64x2 shift = splat(count);
    return vec_sr(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    return (v2u64) __msa_srli_d((v2i64) a.native(), count);
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    return i_shift_r(a, count);
#else
    return SIMDPP_NOT_IMPLEMENTED_TEMPLATE1(int64<count>, a);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned count> SIMDPP_INL
uint64x4 i_shift_r(const uint64x4& a)
{
    static_assert(count < 64, "Shift out of bounds");
    return _mm256_srli_epi64(a.native(), count);
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned count> SIMDPP_INL
uint64<8> i_shift_r(const uint64<8>& a)
{
    static_assert(count < 64, "Shift out of bounds");
    return _mm512_srli_epi64(a.native(), count);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned count, class V> SIMDPP_INL
V i_shift_r(const V& a)
{
    static_assert(count < 64, "Shift out of bounds");
    SIMDPP_VEC_ARRAY_IMPL1(V, i_shift_r<count>, a);
}

// -----------------------------------------------------------------------------

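// Dispatch helper for the public shift_r implementation: the no_shift == true
// specialization returns its argument unchanged, so a zero count never reaches
// the immediate-count overloads above (some of the underlying instructions,
// e.g. the NEON vshrq_n_* forms, do not accept an immediate of 0). A
// hypothetical caller would look roughly like:
//
//     template<unsigned count, class V>
//     V shift_r_impl(const V& a)
//     {
//         return i_shift_r_wrapper<count == 0>::template run<count>(a);
//     }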
template<bool no_shift>
struct i_shift_r_wrapper {
    template<unsigned count, class V>
    static SIMDPP_INL V run(const V& arg) { return i_shift_r<count>(arg); }
};
template<>
struct i_shift_r_wrapper<true> {
    template<unsigned count, class V>
    static SIMDPP_INL V run(const V& arg) { return arg; }
};

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif
