1/* Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_MOVE_R_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_MOVE_R_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/types.h>
16#include <simdpp/detail/shuffle/shuffle_mask.h>
17#include <simdpp/detail/vector_array_macros.h>
18
19namespace simdpp {
20namespace SIMDPP_ARCH_NAMESPACE {
21namespace detail {
22namespace insn {
23
24
// Moves the bytes of @a a towards higher positions by @a shift bytes,
// filling the vacated low bytes with zero (a 128-bit byte-wise shift).
// shift must be in [0, 16]; the boundary values 0 and 16 are normally
// routed through i_move16_r_wrapper below because some backends reject
// zero- or full-width shift selectors.
template<unsigned shift> SIMDPP_INL
uint8x16 i_move16_r(const uint8x16& a)
{
    static_assert(shift <= 16, "Selector out of range");

#if SIMDPP_USE_NULL
    return detail::null::move_n_r<shift>(a);
#elif SIMDPP_USE_SSE2
    return _mm_slli_si128(a.native(), shift);
#elif SIMDPP_USE_NEON
    // vext takes bytes (16-shift)..15 of zero followed by bytes
    // 0..(15-shift) of a, i.e. `shift` zero bytes then a's low bytes.
    uint8x16 zero = make_zero();
    return vextq_u8(zero.native(), a.native(), 16-shift);
#elif SIMDPP_USE_ALTIVEC
    // return align<16-shift>((uint8x16) make_zero(), a);
    // vec_sld_biendian accounts for the differing element order between
    // big- and little-endian PowerPC targets.
    return vec_sld_biendian<16-shift>((uint8<16>)make_zero(), a);
#elif SIMDPP_USE_MSA
    uint8x16 zero = make_zero();
    return (v16u8) __msa_sldi_b((v16i8)a.native(), (v16i8)zero.native(), 16-shift);
#endif
}
45
#if SIMDPP_USE_AVX2
// 256-bit variant: _mm256_slli_si256 byte-shifts each 128-bit lane
// independently, which matches the per-128-bit-vector semantics used
// throughout this file.
template<unsigned shift> SIMDPP_INL
uint8x32 i_move16_r(const uint8x32& a)
{
    static_assert(shift <= 16, "Selector out of range");
    return _mm256_slli_si256(a.native(), shift);
}
#endif
54
#if SIMDPP_USE_AVX512BW
// 512-bit variant: _mm512_bslli_epi128 byte-shifts each of the four
// 128-bit lanes independently.
template<unsigned shift> SIMDPP_INL
uint8<64> i_move16_r(const uint8<64>& a)
{
    static_assert(shift <= 16, "Selector out of range");
    return _mm512_bslli_epi128(a.native(), shift);
}
#endif
63
// Generic fallback for wider vectors: applies the shift to each native
// sub-vector of the vector array separately.
template<unsigned shift, unsigned N> SIMDPP_INL
uint8<N> i_move16_r(const uint8<N>& a)
{
    static_assert(shift <= 16, "Selector out of range");
    SIMDPP_VEC_ARRAY_IMPL1(uint8<N>, i_move16_r<shift>, a);
}
70
71// -----------------------------------------------------------------------------
72
73template<unsigned shift> SIMDPP_INL
74uint16<8> i_move8_r(const uint16<8>& a)
75{
76#if SIMDPP_USE_NULL
77 return detail::null::move_n_r<shift>(a);
78#else
79 return (uint16<8>) i_move16_r<shift*2>(uint8<16>(a));
80#endif
81}
82
#if SIMDPP_USE_AVX2
// 256-bit variant: per-lane byte shift; each 16-bit element is 2 bytes.
template<unsigned shift> SIMDPP_INL
uint16<16> i_move8_r(const uint16<16>& a)
{
    static_assert(shift <= 8, "Selector out of range");
    return _mm256_slli_si256(a.native(), shift*2);
}
#endif
91
#if SIMDPP_USE_AVX512BW
// 512-bit variant: per-lane byte shift; each 16-bit element is 2 bytes.
template<unsigned shift> SIMDPP_INL
uint16<32> i_move8_r(const uint16<32>& a)
{
    static_assert(shift <= 8, "Selector out of range");
    return _mm512_bslli_epi128(a.native(), shift*2);
}
#endif
100
101template<unsigned shift, unsigned N> SIMDPP_INL
102uint16<N> i_move8_r(const uint16<N>& a)
103{
104 SIMDPP_VEC_ARRAY_IMPL1(uint16<N>, i_move8_r<shift>, a);
105}
106
107// -----------------------------------------------------------------------------
108
109template<unsigned shift> SIMDPP_INL
110uint32<4> i_move4_r(const uint32<4>& a)
111{
112#if SIMDPP_USE_NULL
113 return detail::null::move_n_r<shift>(a);
114#else
115 return (uint32<4>) i_move16_r<shift*4>(uint8<16>(a));
116#endif
117}
118
#if SIMDPP_USE_AVX2
// 256-bit variant: per-lane byte shift; each 32-bit element is 4 bytes.
template<unsigned shift> SIMDPP_INL
uint32<8> i_move4_r(const uint32<8>& a)
{
    static_assert(shift <= 4, "Selector out of range");
    return _mm256_slli_si256(a.native(), shift*4);
}
#endif
127
#if SIMDPP_USE_AVX512F
// AVX512F has no 512-bit byte-wise shift without AVX512BW, so the move is
// implemented as a zero-masked in-lane shuffle: the shuffle relocates
// elements to higher positions and the mask zeroes the lowest `shift`
// positions of each 4-element lane (mask bit clear => element zeroed).
template<unsigned shift> SIMDPP_INL
uint32<16> i_move4_r(const uint32<16>& a)
{
    static_assert(shift <= 4, "Selector out of range");
    switch (shift) {
    default:
    case 0: return a;
    case 1: return _mm512_maskz_shuffle_epi32(0xeeee, a.native(),
                                              _MM_PERM_ENUM(_MM_SHUFFLE(2, 1, 0, 0)));
    case 2: return _mm512_maskz_shuffle_epi32(0xcccc, a.native(),
                                              _MM_PERM_ENUM(_MM_SHUFFLE(1, 0, 0, 0)));
    case 3: return _mm512_maskz_shuffle_epi32(0x8888, a.native(),
                                              _MM_PERM_ENUM(_MM_SHUFFLE(0, 0, 0, 0)));
    case 4: return make_zero();
    }
}
#endif
146
147template<unsigned shift, unsigned N> SIMDPP_INL
148uint32<N> i_move4_r(const uint32<N>& a)
149{
150 SIMDPP_VEC_ARRAY_IMPL1(uint32<N>, i_move4_r<shift>, a);
151}
152
153// -----------------------------------------------------------------------------
154
155template<unsigned shift> SIMDPP_INL
156uint64<2> i_move2_r(const uint64<2>& a)
157{
158#if SIMDPP_USE_NULL || (SIMDPP_USE_ALTIVEC && !SIMDPP_USE_VSX_207)
159 return detail::null::move_n_r<shift>(a);
160#else
161 return (uint64<2>) i_move16_r<shift*8>(uint8<16>(a));
162#endif
163}
164
#if SIMDPP_USE_AVX2
// 256-bit variant: per-lane byte shift; each 64-bit element is 8 bytes.
template<unsigned shift> SIMDPP_INL
uint64<4> i_move2_r(const uint64<4>& a)
{
    static_assert(shift <= 2, "Selector out of range");
    return _mm256_slli_si256(a.native(), shift*8);
}
#endif
173
#if SIMDPP_USE_AVX512F
// 512-bit variant, expressed through the 32-bit element move (each 64-bit
// element spans two 32-bit elements).
template<unsigned shift> SIMDPP_INL
uint64<8> i_move2_r(const uint64<8>& a)
{
    // A 128-bit lane holds two 64-bit elements, so the valid selector range
    // is [0, 2] — matching the other i_move2_r overloads and the
    // i_move2_r_wrapper<2> full-width specialization. The previous limit of
    // 4 wrongly accepted shift == 3 and 4, which only failed later inside
    // i_move4_r with a misleading diagnostic.
    static_assert(shift <= 2, "Selector out of range");
    return (uint64<8>) i_move4_r<shift*2>(uint32<16>(a));
}
#endif
182
183template<unsigned shift, unsigned N> SIMDPP_INL
184uint64<N> i_move2_r(const uint64<N>& a)
185{
186 SIMDPP_VEC_ARRAY_IMPL1(uint64<N>, i_move2_r<shift>, a);
187}
188
189// -----------------------------------------------------------------------------
190
191template<unsigned shift> SIMDPP_INL
192float32<4> i_move4_r(const float32<4>& a)
193{
194#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
195 return detail::null::move_n_r<shift>(a);
196#else
197 return (float32<4>) i_move16_r<shift*4>(uint8<16>(a));
198#endif
199}
200
#if SIMDPP_USE_AVX
// 256-bit variant: reinterpret as bytes and delegate to the byte-wise move.
template<unsigned shift> SIMDPP_INL
float32<8> i_move4_r(const float32<8>& a)
{
    static_assert(shift <= 4, "Selector out of range");
    return (float32<8>) i_move16_r<shift*4>(uint8<32>(a));
}
#endif
209
#if SIMDPP_USE_AVX512F
// Zero-masked in-lane shuffle, mirroring the uint32<16> overload: the
// shuffle relocates elements to higher positions and the mask zeroes the
// lowest `shift` positions of each 4-element lane.
template<unsigned shift> SIMDPP_INL
float32<16> i_move4_r(const float32<16>& a)
{
    static_assert(shift <= 4, "Selector out of range");
    switch (shift) {
    default:
    case 0: return a;
    case 1: return _mm512_maskz_shuffle_ps(0xeeee, a.native(), a.native(),
                                           _MM_SHUFFLE(2, 1, 0, 0));
    case 2: return _mm512_maskz_shuffle_ps(0xcccc, a.native(), a.native(),
                                           _MM_SHUFFLE(1, 0, 0, 0));
    case 3: return _mm512_maskz_shuffle_ps(0x8888, a.native(), a.native(),
                                           _MM_SHUFFLE(0, 0, 0, 0));
    case 4: return make_zero();
    }
}
#endif
228
229template<unsigned shift, unsigned N> SIMDPP_INL
230float32<N> i_move4_r(const float32<N>& a)
231{
232 SIMDPP_VEC_ARRAY_IMPL1(float32<N>, i_move4_r<shift>, a);
233}
234
235// -----------------------------------------------------------------------------
236
237template<unsigned shift> SIMDPP_INL
238float64<2> i_move2_r(const float64<2>& a)
239{
240#if SIMDPP_USE_SSE2 || SIMDPP_USE_NEON64 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
241 return (float64<2>) i_move16_r<shift*8>(uint8<16>(a));
242#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC
243 return detail::null::move_n_r<shift>(a);
244#endif
245}
246
#if SIMDPP_USE_AVX
// 256-bit variant: reinterpret as bytes and delegate to the byte-wise move.
template<unsigned shift> SIMDPP_INL
float64<4> i_move2_r(const float64<4>& a)
{
    static_assert(shift <= 2, "Selector out of range");
    return (float64<4>) i_move16_r<shift*8>(uint8<32>(a));
}
#endif
255
#if SIMDPP_USE_AVX512F
// Zero-masked in-lane shuffle: for shift == 1 the mask 0xaa keeps only the
// odd (upper) element of each 2-element lane, into which the shuffle
// broadcasts element 0; the even elements are zeroed.
template<unsigned shift> SIMDPP_INL
float64<8> i_move2_r(const float64<8>& a)
{
    static_assert(shift <= 2, "Selector out of range");
    switch (shift) {
    default:
    case 0: return a;
    case 1: return _mm512_maskz_shuffle_pd(0xaa, a.native(), a.native(), SIMDPP_SHUFFLE_MASK_2x2_4(0, 0));
    case 2: return make_zero();
    }
}
#endif
269
270template<unsigned shift, unsigned N> SIMDPP_INL
271float64<N> i_move2_r(const float64<N>& a)
272{
273 SIMDPP_VEC_ARRAY_IMPL1(float64<N>, i_move2_r<shift>, a);
274}
275
276// -----------------------------------------------------------------------------
// Certain compilers don't like zero or full vector width moves. The templates
// below offer a workaround.
279
// Dispatcher that keeps i_move2_r from being instantiated with a zero- or
// full-width selector, which some compilers reject.
template<unsigned count>
struct i_move2_r_wrapper {
    template<class V>
    static SIMDPP_INL V run(const V& arg) { return i_move2_r<count>(arg); }
};
// count == 0: no move — return the argument unchanged.
template<>
struct i_move2_r_wrapper<0> {
    template<class V>
    static SIMDPP_INL V run(const V& arg) { return arg; }
};
// count == 2: every element is shifted out — result is all-zero.
template<>
struct i_move2_r_wrapper<2> {
    template<class V>
    static SIMDPP_INL V run(const V&) { return (V) make_zero(); }
};
295
// Dispatcher that keeps i_move4_r from being instantiated with a zero- or
// full-width selector, which some compilers reject.
template<unsigned count>
struct i_move4_r_wrapper {
    template<class V>
    static SIMDPP_INL V run(const V& arg) { return i_move4_r<count>(arg); }
};
// count == 0: no move — return the argument unchanged.
template<>
struct i_move4_r_wrapper<0> {
    template<class V>
    static SIMDPP_INL V run(const V& arg) { return arg; }
};
// count == 4: every element is shifted out — result is all-zero.
template<>
struct i_move4_r_wrapper<4> {
    template<class V>
    static SIMDPP_INL V run(const V&) { return (V) make_zero(); }
};
311
// Dispatcher that keeps i_move8_r from being instantiated with a zero- or
// full-width selector, which some compilers reject.
template<unsigned count>
struct i_move8_r_wrapper {
    template<class V>
    static SIMDPP_INL V run(const V& arg) { return i_move8_r<count>(arg); }
};
// count == 0: no move — return the argument unchanged.
template<>
struct i_move8_r_wrapper<0> {
    template<class V>
    static SIMDPP_INL V run(const V& arg) { return arg; }
};
// count == 8: every element is shifted out — result is all-zero.
template<>
struct i_move8_r_wrapper<8> {
    template<class V>
    static SIMDPP_INL V run(const V&) { return (V) make_zero(); }
};
327
// Dispatcher that keeps i_move16_r from being instantiated with a zero- or
// full-width selector, which some compilers reject.
template<unsigned count>
struct i_move16_r_wrapper {
    template<class V>
    static SIMDPP_INL V run(const V& arg) { return i_move16_r<count>(arg); }
};
// count == 0: no move — return the argument unchanged.
template<>
struct i_move16_r_wrapper<0> {
    template<class V>
    static SIMDPP_INL V run(const V& arg) { return arg; }
};
// count == 16: every byte is shifted out — result is all-zero.
template<>
struct i_move16_r_wrapper<16> {
    template<class V>
    static SIMDPP_INL V run(const V&) { return (V) make_zero(); }
};
343
344} // namespace insn
345} // namespace detail
346} // namespace SIMDPP_ARCH_NAMESPACE
347} // namespace simdpp
348
349#endif
350
351