1 | /* Copyright (C) 2017 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_SIMDPP_CORE_I_SHIFT_R_V_H |
9 | #define LIBSIMDPP_SIMDPP_CORE_I_SHIFT_R_V_H |
10 | |
11 | #ifndef LIBSIMDPP_SIMD_H |
12 | #error "This file must be included through simd.h" |
13 | #endif |
14 | |
15 | #include <simdpp/types.h> |
16 | #include <simdpp/detail/null/math.h> |
17 | #include <simdpp/detail/insn/i_shift.h> |
18 | #include <simdpp/detail/shuffle/shuffle_mask.h> |
19 | #include <simdpp/core/i_neg.h> |
20 | #include <simdpp/core/i_mul.h> |
21 | #include <simdpp/core/permute_bytes16.h> |
22 | #include <simdpp/detail/vector_array_macros.h> |
23 | |
24 | namespace simdpp { |
25 | namespace SIMDPP_ARCH_NAMESPACE { |
26 | namespace detail { |
27 | namespace insn { |
28 | |
29 | // emulates 8-bit variable shift using 16-bit variable shift |
30 | template<class U8> SIMDPP_INL |
31 | U8 v_emul_shift_r_u8_using_v16(const U8& a, const U8& count) |
32 | { |
33 | using U16 = typename same_width<U8>::u16; |
34 | |
35 | U16 a16; a16 = a; |
36 | U16 c16; c16 = count; |
37 | |
38 | U16 select_mask = make_uint(0x00ff); |
39 | U16 a_lo = bit_and(a16, select_mask); |
40 | U16 a_hi = a16; |
41 | U16 c_lo = bit_and(c16, select_mask); |
42 | U16 c_hi = shift_r<8>(c16); |
43 | a_lo = shift_r(a_lo, c_lo); |
44 | a_hi = shift_r(a_hi, c_hi); |
45 | a_hi = bit_andnot(a_hi, select_mask); |
46 | |
47 | a16 = bit_or(a_lo, a_hi); |
48 | return (U8) a16; |
49 | } |
50 | |
51 | // emulates 8-bit variable shift using permute_bytes16 and 16-bit multiplication |
52 | template<class U8> SIMDPP_INL |
53 | U8 v_emul_shift_r_u8_using_mul(const U8& a, const U8& count) |
54 | { |
55 | using U16 = typename same_width<U8>::u16; |
56 | |
    // Variable shift is implemented by reusing the shifter in the 16-bit
    // unsigned multiplier. The result is obtained by computing 1 << (8-countN)
    // for each element of count, multiplying each element of a by that number
    // and selecting the high half of the result.
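    // For example, for countN = 3 the mask below yields 0x10; shift_l<1> turns
    // that into 0x20 == 1 << (8-3), and mul_hi(aN << 8, 0x20) ==
    // (aN * 0x2000) >> 16 == aN >> 3.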
61 | U8 mulshift_mask = make_uint(0x80, 0x40, 0x20, 0x10, |
62 | 0x08, 0x04, 0x02, 0x01, |
63 | 0x00, 0x00, 0x00, 0x00, |
64 | 0x00, 0x00, 0x00, 0x00); |
65 | U16 mulshift = (U16) permute_bytes16(mulshift_mask, count); |
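    // The low byte of each mulshift element holds the shift constant for the
    // even (low) 8-bit element and the high byte the constant for the odd
    // (high) element, so the two halves are multiplied separately.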
66 | U16 a16; a16 = a; |
67 | U16 a16_lo, a16_hi, mulshift_lo, mulshift_hi; |
68 | U16 select_mask = make_uint(0x00ff); |
69 | |
    // Move the element values to the high byte of the 16-bit elements and the
    // shift values to the low 9 bits. The 9-th bit is needed because shifting
    // by 0 requires multiplying the element values by 0x100.
    // The results have the high byte clear, which makes it easy to compose the
    // result back into a single vector.
75 | a16_lo = shift_l<8>(a16); |
76 | mulshift_lo = bit_and(mulshift, select_mask); |
77 | mulshift_lo = shift_l<1>(mulshift_lo); |
78 | a16_hi = bit_andnot(a16, select_mask); |
79 | mulshift_hi = shift_l<1>(shift_r<8>(mulshift)); |
80 | |
81 | a16_lo = mul_hi(a16_lo, mulshift_lo); |
82 | a16_hi = mul_hi(a16_hi, mulshift_hi); |
83 | |
84 | a16_hi = shift_l<8>(a16_hi); |
85 | a16 = bit_or(a16_lo, a16_hi); |
86 | return (U8) a16; |
87 | } |
88 | |
89 | static SIMDPP_INL |
90 | uint8<16> i_shift_r_v(const uint8<16>& a, const uint8<16>& count) |
91 | { |
92 | #if SIMDPP_USE_NULL |
93 | return detail::null::shift_r_v(a, count); |
94 | #elif SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL |
95 | return v_emul_shift_r_u8_using_v16(a, count); |
96 | #elif SIMDPP_USE_SSSE3 |
97 | return v_emul_shift_r_u8_using_mul(a, count); |
98 | #elif SIMDPP_USE_NEON |
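    // NEON has no right shift by a vector of counts; shift left by the negated
    // counts instead (vshl shifts right when the count is negative).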
99 | int8<16> qcount = neg((int8<16>)count); |
100 | return vshlq_u8(a.native(), qcount.native()); |
101 | #elif SIMDPP_USE_ALTIVEC |
102 | return vec_sr(a.native(), count.native()); |
103 | #elif SIMDPP_USE_MSA |
104 | return (v16u8) __msa_srl_b((v16i8)a.native(), (v16i8)count.native()); |
105 | #else |
106 | return SIMDPP_NOT_IMPLEMENTED2(a, count); |
107 | #endif |
108 | } |
109 | |
110 | #if SIMDPP_USE_AVX2 |
111 | static SIMDPP_INL |
112 | uint8<32> i_shift_r_v(const uint8<32>& a, const uint8<32>& count) |
113 | { |
114 | #if SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL |
115 | return v_emul_shift_r_u8_using_v16(a, count); |
116 | #else |
117 | return v_emul_shift_r_u8_using_mul(a, count); |
118 | #endif |
119 | } |
120 | #endif |
121 | |
122 | #if SIMDPP_USE_AVX512BW |
123 | static SIMDPP_INL |
124 | uint8<64> i_shift_r_v(const uint8<64>& a, const uint8<64>& count) |
125 | { |
126 | return v_emul_shift_r_u8_using_v16(a, count); |
127 | } |
128 | #endif |
129 | |
130 | // ----------------------------------------------------------------------------- |
131 | |
// emulates 8-bit signed variable shift using 16-bit variable shift
133 | template<class I8, class U8> SIMDPP_INL |
134 | I8 v_emul_shift_r_i8_using_v16(const I8& a, const U8& count) |
135 | { |
136 | using I16 = typename same_width<I8>::i16; |
137 | using U16 = typename same_width<I8>::u16; |
138 | |
139 | U16 a16; a16 = a; |
140 | U16 c16; c16 = count; |
141 | |
142 | U16 select_mask = make_uint(0x00ff); |
143 | U16 a_lo = shift_l<8>(a16); |
144 | U16 a_hi = a16; |
145 | U16 c_lo = bit_and(c16, select_mask); |
146 | U16 c_hi = shift_r<8>(c16); |
147 | a_lo = shift_r((I16)a_lo, c_lo); |
148 | a_hi = shift_r((I16)a_hi, c_hi); |
149 | |
150 | a_lo = shift_r<8>(a_lo); |
151 | a_hi = bit_andnot(a_hi, select_mask); |
152 | a16 = bit_or(a_lo, a_hi); |
153 | |
154 | return (I8) a16; |
155 | } |
156 | |
// emulates 8-bit signed variable shift using permute_bytes16 and 16-bit
// multiplication
template<class I8, class U8> SIMDPP_INL
158 | I8 v_emul_shift_r_i8_using_mul(const I8& a, const U8& count) |
159 | { |
160 | using U16 = typename same_width<U8>::u16; |
161 | using I16 = typename same_width<U8>::i16; |
162 | |
    // Variable shift is implemented by reusing the shifter in the 16-bit
    // signed multiplier. The result is obtained by computing 1 << (8-countN)
    // for each element of count, multiplying each element of a by that number
    // and selecting the high half of the result.
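    // For example, for aN = -4 and countN = 1 the multiplier becomes 0x80 after
    // shift_l<1>, and the signed mul_hi((-4) << 8, 0x80) ==
    // (-1024 * 128) >> 16 == -2, which equals -4 >> 1.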
167 | U8 mulshift_mask = make_uint(0x80, 0x40, 0x20, 0x10, |
168 | 0x08, 0x04, 0x02, 0x01, |
169 | 0x00, 0x00, 0x00, 0x00, |
170 | 0x00, 0x00, 0x00, 0x00); |
171 | U16 mulshift = (U16) permute_bytes16(mulshift_mask, count); |
172 | U16 a16; a16 = a; |
173 | U16 a16_lo, a16_hi, mulshift_lo, mulshift_hi; |
174 | U16 select_mask = make_uint(0x00ff); |
175 | |
    // Move the element values to the high byte of the 16-bit elements and the
    // shift values to the low 9 bits. The 9-th bit is needed because shifting
    // by 0 requires multiplying the element values by 0x100.
    // Note that the results may have a nonzero high byte because the
    // multiplication is signed; it is discarded before the halves are combined.
181 | a16_lo = shift_l<8>(a16); |
182 | mulshift_lo = bit_and(mulshift, select_mask); |
183 | mulshift_lo = shift_l<1>(mulshift_lo); |
184 | a16_hi = bit_andnot(a16, select_mask); |
185 | mulshift_hi = shift_l<1>(shift_r<8>(mulshift)); |
186 | |
187 | a16_lo = mul_hi((I16)a16_lo, (I16)mulshift_lo); |
188 | a16_hi = mul_hi((I16)a16_hi, (I16)mulshift_hi); |
189 | |
190 | a16_hi = shift_l<8>(a16_hi); |
191 | a16_lo = bit_and(a16_lo, select_mask); |
192 | a16 = bit_or(a16_lo, a16_hi); |
    return (I8) a16;
194 | } |
195 | |
196 | static SIMDPP_INL |
197 | int8<16> i_shift_r_v(const int8<16>& a, const uint8<16>& count) |
198 | { |
199 | #if SIMDPP_USE_NULL |
200 | return detail::null::shift_r_v(a, count); |
201 | #elif SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL |
202 | return v_emul_shift_r_i8_using_v16(a, count); |
203 | #elif SIMDPP_USE_SSSE3 |
204 | return v_emul_shift_r_i8_using_mul(a, count); |
205 | #elif SIMDPP_USE_NEON |
206 | int8<16> qcount = neg((int8<16>)count); |
207 | return vshlq_s8(a.native(), qcount.native()); |
208 | #elif SIMDPP_USE_ALTIVEC |
209 | return vec_sra(a.native(), count.native()); |
210 | #elif SIMDPP_USE_MSA |
211 | return __msa_sra_b(a.native(), (v16i8) count.native()); |
212 | #else |
213 | return SIMDPP_NOT_IMPLEMENTED2(a, count); |
214 | #endif |
215 | } |
216 | |
217 | #if SIMDPP_USE_AVX2 |
218 | static SIMDPP_INL |
219 | int8<32> i_shift_r_v(const int8<32>& a, const uint8<32>& count) |
220 | { |
221 | #if SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL |
222 | return v_emul_shift_r_i8_using_v16(a, count); |
223 | #else |
224 | return v_emul_shift_r_i8_using_mul(a, count); |
225 | #endif |
226 | } |
227 | #endif |
228 | |
229 | #if SIMDPP_USE_AVX512BW |
230 | static SIMDPP_INL |
231 | int8<64> i_shift_r_v(const int8<64>& a, const uint8<64>& count) |
232 | { |
233 | return v_emul_shift_r_i8_using_v16(a, count); |
234 | } |
235 | #endif |
236 | |
237 | // ----------------------------------------------------------------------------- |
238 | |
239 | // emulates 16-bit variable shift using permute_bytes16 and 16-bit multiplication |
template<class U16> SIMDPP_INL
241 | U16 v_emul_shift_r_u16_using_mul(const U16& a, const U16& count) |
242 | { |
243 | using U8 = typename same_width<U16>::u8; |
244 | using M16 = typename U16::mask_vector_type; |
    // Variable shift is implemented by reusing the shifter in the 16-bit
    // unsigned multiplier. The result is obtained by computing 1 << (16-countN)
    // for each element of count, multiplying each element of a by that number
    // and selecting the high half of the result. This only covers shift counts
    // 1 to 15; shifts by 0 and by more than 15 are fixed up with the extra
    // blend and mask instructions below.
251 | M16 is_same = cmp_eq(count, 0); |
252 | M16 is_zero = cmp_gt(count, 15); |
253 | |
254 | U8 mulshift_mask = make_uint(0x00, 0x80, 0x40, 0x20, |
255 | 0x10, 0x08, 0x04, 0x02, |
256 | 0x01, 0x00, 0x00, 0x00, |
257 | 0x00, 0x00, 0x00, 0x00); |
258 | |
    // permute_bytes16 permutes 8-bit elements instead of the 16-bit elements
    // that would be optimal in this case, so the selector needs to be
    // constructed in a special way for the 8-bit permutation.
    // The 4-th bit of the low selector byte is toggled so that for shift counts
    // below 8 the low byte of the multiplier picks zeros from the mulshift
    // mask; for counts above 8 the high byte picks zeros on its own.
264 | U16 qcount = bit_or(count, shift_l<8>(count)); |
265 | qcount = bit_xor(qcount, 0x0008); |
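    // For example, for countN = 3 the selector lane is 0x0303 ^ 0x0008 ==
    // 0x030b: byte 0x0b selects 0x00 and byte 0x03 selects 0x20 from
    // mulshift_mask, so mulshift == 0x2000 == 1 << (16-3) and mul_hi yields
    // aN >> 3.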
266 | |
267 | U16 mulshift = (U16) permute_bytes16(mulshift_mask, (U8) qcount); |
268 | U16 res = mul_hi(a, mulshift); |
269 | res = blend(a, res, is_same); |
270 | res = bit_andnot(res, is_zero); |
271 | return res; |
272 | } |
273 | |
274 | static SIMDPP_INL |
275 | uint16<8> i_shift_r_v(const uint16<8>& a, const uint16<8>& count) |
276 | { |
277 | #if SIMDPP_USE_NULL |
278 | return detail::null::shift_r_v(a, count); |
279 | #elif SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL |
280 | return _mm_srlv_epi16(a.native(), count.native()); |
281 | #elif SIMDPP_USE_SSSE3 |
282 | return v_emul_shift_r_u16_using_mul(a, count); |
283 | #elif SIMDPP_USE_NEON |
284 | int16<8> qcount = neg((int16<8>)count); |
285 | return vshlq_u16(a.native(), qcount.native()); |
286 | #elif SIMDPP_USE_ALTIVEC |
287 | return vec_sr(a.native(), count.native()); |
288 | #elif SIMDPP_USE_MSA |
289 | return (v8u16) __msa_srl_h((v8i16)a.native(), (v8i16)count.native()); |
290 | #else |
291 | return SIMDPP_NOT_IMPLEMENTED2(a, count); |
292 | #endif |
293 | } |
294 | |
295 | #if SIMDPP_USE_AVX2 |
296 | static SIMDPP_INL |
297 | uint16<16> i_shift_r_v(const uint16<16>& a, const uint16<16>& count) |
298 | { |
299 | #if SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL |
300 | return _mm256_srlv_epi16(a.native(), count.native()); |
301 | #else |
302 | return v_emul_shift_r_u16_using_mul(a, count); |
303 | #endif |
304 | } |
305 | #endif |
306 | |
307 | #if SIMDPP_USE_AVX512BW |
308 | SIMDPP_INL uint16<32> i_shift_r_v(const uint16<32>& a, const uint16<32>& count) |
309 | { |
310 | return _mm512_srlv_epi16(a.native(), count.native()); |
311 | } |
312 | #endif |
313 | |
314 | // ----------------------------------------------------------------------------- |
315 | |
316 | static SIMDPP_INL |
317 | int16<8> i_shift_r_v(const int16<8>& a, const uint16<8>& count) |
318 | { |
319 | #if SIMDPP_USE_NULL |
320 | return detail::null::shift_r_v(a, count); |
321 | #elif SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL |
322 | return _mm_srav_epi16(a.native(), count.native()); |
323 | #elif SIMDPP_USE_AVX512BW |
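    // Without AVX512VL only 512-bit vectors support the variable 16-bit shift,
    // so widen to 512 bits, shift and narrow back.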
324 | __m512i a512 = _mm512_castsi128_si512(a.native()); |
325 | __m512i count512 = _mm512_castsi128_si512(count.native()); |
326 | return _mm512_castsi512_si128(_mm512_srav_epi16(a512, count512)); |
327 | #elif SIMDPP_USE_NEON |
328 | int16<8> qcount = neg((int16<8>)count); |
329 | return vshlq_s16(a.native(), qcount.native()); |
330 | #elif SIMDPP_USE_ALTIVEC |
331 | return vec_sra(a.native(), count.native()); |
332 | #elif SIMDPP_USE_MSA |
333 | return __msa_sra_h(a.native(), (v8i16) count.native()); |
334 | #else |
335 | return SIMDPP_NOT_IMPLEMENTED2(a, count); |
336 | #endif |
337 | } |
338 | |
339 | #if SIMDPP_USE_AVX2 |
340 | static SIMDPP_INL |
341 | int16<16> i_shift_r_v(const int16<16>& a, const uint16<16>& count) |
342 | { |
343 | #if SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL |
344 | return _mm256_srav_epi16(a.native(), count.native()); |
345 | #elif SIMDPP_USE_AVX512BW |
346 | __m512i a512 = _mm512_castsi256_si512(a.native()); |
347 | __m512i count512 = _mm512_castsi256_si512(count.native()); |
348 | return _mm512_castsi512_si256(_mm512_srav_epi16(a512, count512)); |
349 | #else |
350 | return SIMDPP_NOT_IMPLEMENTED2(a, count); |
351 | #endif |
352 | } |
353 | #endif |
354 | |
355 | #if SIMDPP_USE_AVX512BW |
356 | SIMDPP_INL int16<32> i_shift_r_v(const int16<32>& a, const uint16<32>& count) |
357 | { |
358 | return _mm512_srav_epi16(a.native(), count.native()); |
359 | } |
360 | #endif |
361 | |
362 | // ----------------------------------------------------------------------------- |
363 | |
364 | static SIMDPP_INL |
365 | uint32<4> i_shift_r_v(const uint32<4>& a, const uint32<4>& count) |
366 | { |
367 | #if SIMDPP_USE_NULL |
368 | return detail::null::shift_r_v(a, count); |
369 | #elif SIMDPP_USE_AVX2 |
370 | return _mm_srlv_epi32(a.native(), count.native()); |
371 | #elif SIMDPP_USE_SSE2 |
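    // There is no per-element variable 32-bit shift before AVX2. Shift the
    // whole vector by each of the four counts using _mm_srl_epi32, which takes
    // its shift amount from the low 64 bits of the count operand, and blend
    // the per-element results together.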
372 | uint32<4> count0 = count; |
373 | #if SIMDPP_USE_SSE4_1 |
374 | uint32<4> zero = make_zero(); |
375 | count0 = _mm_blend_epi16(count0.native(), zero.native(), 0xcc); |
376 | #else |
377 | uint32<4> mask = make_uint(0xffffffff, 0, 0xffffffff, 0); |
378 | count0 = bit_and(count0, mask); |
379 | #endif |
380 | uint32<4> count1 = _mm_srli_epi64(count.native(), 32); |
381 | uint32<4> count2 = _mm_srli_si128(count0.native(), 8); |
382 | uint32<4> count3 = _mm_srli_si128(count.native(), 12); |
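    // The low 64 bits of count0..count3 now hold the shift amounts of elements
    // 0, 1, 2 and 3 respectively.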
383 | |
384 | __m128i a0 = _mm_srl_epi32(a.native(), count0.native()); |
385 | __m128i a1 = _mm_srl_epi32(a.native(), count1.native()); |
386 | __m128i a2 = _mm_srl_epi32(a.native(), count2.native()); |
387 | __m128i a3 = _mm_srl_epi32(a.native(), count3.native()); |
388 | #if SIMDPP_USE_SSE4_1 |
389 | a0 = _mm_blend_epi16(a0, a1, 0x0c); |
390 | a2 = _mm_blend_epi16(a2, a3, 0xc0); |
391 | a0 = _mm_blend_epi16(a0, a2, 0xf0); |
392 | #else |
393 | __m128 f0 = _mm_shuffle_ps(_mm_castsi128_ps(a0), |
394 | _mm_castsi128_ps(a1), |
395 | SIMDPP_SHUFFLE_MASK_4x4(0, 0, 1, 1)); |
396 | __m128 f1 = _mm_shuffle_ps(_mm_castsi128_ps(a2), |
397 | _mm_castsi128_ps(a3), |
398 | SIMDPP_SHUFFLE_MASK_4x4(2, 2, 3, 3)); |
399 | f0 = _mm_shuffle_ps(f0, f1, SIMDPP_SHUFFLE_MASK_4x4(0, 2, 0, 2)); |
400 | a0 = _mm_castps_si128(f0); |
401 | #endif |
402 | return a0; |
403 | #elif SIMDPP_USE_NEON |
404 | int32<4> qcount = neg((int32<4>)count); |
405 | return vshlq_u32(a.native(), qcount.native()); |
406 | #elif SIMDPP_USE_ALTIVEC |
407 | return vec_sr(a.native(), count.native()); |
408 | #elif SIMDPP_USE_MSA |
409 | return (v4u32) __msa_srl_w((v4i32)a.native(), (v4i32)count.native()); |
410 | #endif |
411 | } |
412 | |
413 | #if SIMDPP_USE_AVX2 |
414 | static SIMDPP_INL |
415 | uint32<8> i_shift_r_v(const uint32<8>& a, const uint32<8>& count) |
416 | { |
417 | return _mm256_srlv_epi32(a.native(), count.native()); |
418 | } |
419 | #endif |
420 | |
421 | #if SIMDPP_USE_AVX512F |
422 | static SIMDPP_INL |
423 | uint32<16> i_shift_r_v(const uint32<16>& a, const uint32<16>& count) |
424 | { |
425 | return _mm512_srlv_epi32(a.native(), count.native()); |
426 | } |
427 | #endif |
428 | |
429 | // ----------------------------------------------------------------------------- |
430 | |
431 | static SIMDPP_INL |
432 | int32<4> i_shift_r_v(const int32<4>& a, const uint32<4>& count) |
433 | { |
434 | #if SIMDPP_USE_NULL |
435 | return detail::null::shift_r_v(a, count); |
436 | #elif SIMDPP_USE_AVX2 |
437 | return _mm_srav_epi32(a.native(), count.native()); |
438 | #elif SIMDPP_USE_SSE2 |
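    // Same approach as in the unsigned uint32<4> overload above, with
    // _mm_sra_epi32 performing the arithmetic shifts.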
439 | uint32<4> count0 = count; |
440 | #if SIMDPP_USE_SSE4_1 |
441 | uint32<4> zero = make_zero(); |
442 | count0 = _mm_blend_epi16(count0.native(), zero.native(), 0xcc); |
443 | #else |
444 | uint32<4> mask = make_uint(0xffffffff, 0, 0xffffffff, 0); |
445 | count0 = bit_and(count0, mask); |
446 | #endif |
447 | uint32<4> count1 = _mm_srli_epi64(count.native(), 32); |
448 | uint32<4> count2 = _mm_srli_si128(count0.native(), 8); |
449 | uint32<4> count3 = _mm_srli_si128(count.native(), 12); |
450 | |
451 | __m128i a0 = _mm_sra_epi32(a.native(), count0.native()); |
452 | __m128i a1 = _mm_sra_epi32(a.native(), count1.native()); |
453 | __m128i a2 = _mm_sra_epi32(a.native(), count2.native()); |
454 | __m128i a3 = _mm_sra_epi32(a.native(), count3.native()); |
455 | #if SIMDPP_USE_SSE4_1 |
456 | a0 = _mm_blend_epi16(a0, a1, 0x0c); |
457 | a2 = _mm_blend_epi16(a2, a3, 0xc0); |
458 | a0 = _mm_blend_epi16(a0, a2, 0xf0); |
459 | #else |
460 | __m128 f0 = _mm_shuffle_ps(_mm_castsi128_ps(a0), |
461 | _mm_castsi128_ps(a1), |
462 | SIMDPP_SHUFFLE_MASK_4x4(0, 0, 1, 1)); |
463 | __m128 f1 = _mm_shuffle_ps(_mm_castsi128_ps(a2), |
464 | _mm_castsi128_ps(a3), |
465 | SIMDPP_SHUFFLE_MASK_4x4(2, 2, 3, 3)); |
466 | f0 = _mm_shuffle_ps(f0, f1, SIMDPP_SHUFFLE_MASK_4x4(0, 2, 0, 2)); |
467 | a0 = _mm_castps_si128(f0); |
468 | #endif |
469 | return a0; |
470 | #elif SIMDPP_USE_NEON |
471 | int32<4> qcount = neg((int32<4>)count); |
472 | return vshlq_s32(a.native(), qcount.native()); |
473 | #elif SIMDPP_USE_ALTIVEC |
474 | return vec_sra(a.native(), count.native()); |
475 | #elif SIMDPP_USE_MSA |
476 | return __msa_sra_w(a.native(), (v4i32)count.native()); |
477 | #endif |
478 | } |
479 | |
480 | #if SIMDPP_USE_AVX2 |
481 | static SIMDPP_INL |
482 | int32<8> i_shift_r_v(const int32<8>& a, const uint32<8>& count) |
483 | { |
484 | return _mm256_srav_epi32(a.native(), count.native()); |
485 | } |
486 | #endif |
487 | |
488 | #if SIMDPP_USE_AVX512F |
489 | static SIMDPP_INL |
490 | int32<16> i_shift_r_v(const int32<16>& a, const uint32<16>& count) |
491 | { |
492 | return _mm512_srav_epi32(a.native(), count.native()); |
493 | } |
494 | #endif |
495 | |
496 | // ----------------------------------------------------------------------------- |
497 | |
498 | template<class V, class U> SIMDPP_INL |
499 | V i_shift_r_v(const V& a, const U& b) |
500 | { |
501 | SIMDPP_VEC_ARRAY_IMPL2(V, i_shift_r_v, a, b); |
502 | } |
503 | |
504 | } // namespace insn |
505 | } // namespace detail |
506 | } // namespace SIMDPP_ARCH_NAMESPACE |
507 | } // namespace simdpp |
508 | |
509 | #endif |
510 | |