1/* Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_MOVE_L_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_MOVE_L_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/types.h>
16#include <simdpp/core/permute4.h>
17#include <simdpp/detail/null/shuffle.h>
18#include <simdpp/detail/shuffle/shuffle_mask.h>
19#include <simdpp/detail/vector_array_macros.h>
20
21namespace simdpp {
22namespace SIMDPP_ARCH_NAMESPACE {
23namespace detail {
24namespace insn {
25
26#if SIMDPP_USE_ALTIVEC
27template<unsigned shift> SIMDPP_INL
28uint8<16> vec_sld_biendian(const uint8<16>& lower, const uint8<16>& upper)
29{
30#if SIMDPP_BIG_ENDIAN
31 return vec_sld(lower.native(), upper.native(), shift);
32#else
33 // by default GCC adjusts vec_sld element order to match endianness of the target
34 return vec_sld(upper.native(), lower.native(), 16 - shift);
35#endif
36}
37#endif
38
39template<unsigned shift> SIMDPP_INL
40uint8x16 i_move16_l(const uint8x16& a)
41{
42 static_assert(shift <= 16, "Selector out of range");
43#if SIMDPP_USE_NULL
44 return detail::null::move_n_l<shift>(a);
45#elif SIMDPP_USE_SSE2
46 return _mm_srli_si128(a.native(), shift);
47#elif SIMDPP_USE_NEON
48 uint8x16 z = make_zero();
49 return vextq_u8(a.native(), z.native(), shift);
50#elif SIMDPP_USE_ALTIVEC
51 // return align<shift>(a, (uint8x16) make_zero());
52 return vec_sld_biendian<shift>((uint8<16>)a, (uint8<16>)make_zero());
53#elif SIMDPP_USE_MSA
54 uint8x16 zero = make_zero();
55 return (v16u8) __msa_sldi_b((v16i8)zero.native(), (v16i8)a.native(), shift);
56#endif
57}
58
59#if SIMDPP_USE_AVX2
60template<unsigned shift> SIMDPP_INL
61uint8x32 i_move16_l(const uint8x32& a)
62{
63 static_assert(shift <= 16, "Selector out of range");
64 return _mm256_srli_si256(a.native(), shift);
65}
66#endif
67
68#if SIMDPP_USE_AVX512BW
69template<unsigned shift> SIMDPP_INL
70uint8<64> i_move16_l(const uint8<64>& a)
71{
72 static_assert(shift <= 16, "Selector out of range");
73 return _mm512_bsrli_epi128(a.native(), shift);
74}
75#endif
76
77template<unsigned shift, unsigned N> SIMDPP_INL
78uint8<N> i_move16_l(const uint8<N>& a)
79{
80 static_assert(shift <= 16, "Selector out of range");
81 SIMDPP_VEC_ARRAY_IMPL1(uint8<N>, i_move16_l<shift>, a);
82}
83
84// -----------------------------------------------------------------------------
85
86template<unsigned shift> SIMDPP_INL
87uint16<8> i_move8_l(const uint16<8>& a)
88{
89#if SIMDPP_USE_NULL
90 return detail::null::move_n_l<shift>(a);
91#else
92 return (uint16<8>) i_move16_l<shift*2>(uint8<16>(a));
93#endif
94}
95
96#if SIMDPP_USE_AVX2
97template<unsigned shift> SIMDPP_INL
98uint16<16> i_move8_l(const uint16<16>& a)
99{
100 static_assert(shift <= 8, "Selector out of range");
101 return _mm256_srli_si256(a.native(), shift*2);
102}
103#endif
104
105#if SIMDPP_USE_AVX512BW
106template<unsigned shift> SIMDPP_INL
107uint16<32> i_move8_l(const uint16<32>& a)
108{
109 static_assert(shift <= 8, "Selector out of range");
110 return _mm512_bsrli_epi128(a.native(), shift*2);
111}
112#endif
113
114template<unsigned shift, unsigned N> SIMDPP_INL
115uint16<N> i_move8_l(const uint16<N>& a)
116{
117 SIMDPP_VEC_ARRAY_IMPL1(uint16<N>, i_move8_l<shift>, a);
118}
119
120// -----------------------------------------------------------------------------
121
122template<unsigned shift> SIMDPP_INL
123uint32<4> i_move4_l(const uint32<4>& a)
124{
125#if SIMDPP_USE_NULL
126 return detail::null::move_n_l<shift>(a);
127#else
128 return (uint32<4>) i_move16_l<shift*4>(uint8<16>(a));
129#endif
130}
131
132#if SIMDPP_USE_AVX2
133template<unsigned shift> SIMDPP_INL
134uint32<8> i_move4_l(const uint32<8>& a)
135{
136 static_assert(shift <= 4, "Selector out of range");
137 return _mm256_srli_si256(a.native(), shift*4);
138}
139#endif
140
141#if SIMDPP_USE_AVX512F
142template<unsigned shift> SIMDPP_INL
143uint32<16> i_move4_l(const uint32<16>& a)
144{
145 static_assert(shift <= 4, "Selector out of range");
146 switch (shift) {
147 default:
148 case 0: return a;
149 case 1: return _mm512_maskz_shuffle_epi32(0x7777, a.native(),
150 _MM_PERM_ENUM(_MM_SHUFFLE(3, 3, 2, 1)));
151 case 2: return _mm512_maskz_shuffle_epi32(0x3333, a.native(),
152 _MM_PERM_ENUM(_MM_SHUFFLE(3, 3, 3, 2)));
153 case 3: return _mm512_maskz_shuffle_epi32(0x1111, a.native(),
154 _MM_PERM_ENUM(_MM_SHUFFLE(3, 3, 3, 3)));
155 case 4: return make_zero();
156 }
157}
158#endif
159
160template<unsigned shift, unsigned N> SIMDPP_INL
161uint32<N> i_move4_l(const uint32<N>& a)
162{
163 SIMDPP_VEC_ARRAY_IMPL1(uint32<N>, i_move4_l<shift>, a);
164}
165
166// -----------------------------------------------------------------------------
167
168template<unsigned shift> SIMDPP_INL
169uint64<2> i_move2_l(const uint64<2>& a)
170{
171#if SIMDPP_USE_NULL || (SIMDPP_USE_ALTIVEC && !SIMDPP_USE_VSX_207)
172 return detail::null::move_n_l<shift>(a);
173#else
174 return (uint64<2>) i_move16_l<shift*8>(uint8<16>(a));
175#endif
176}
177
178#if SIMDPP_USE_AVX2
179template<unsigned shift> SIMDPP_INL
180uint64<4> i_move2_l(const uint64<4>& a)
181{
182 static_assert(shift <= 2, "Selector out of range");
183 return _mm256_srli_si256(a.native(), shift*8);
184}
185#endif
186
187#if SIMDPP_USE_AVX512F
188template<unsigned shift> SIMDPP_INL
189uint64<8> i_move2_l(const uint64<8>& a)
190{
191 static_assert(shift <= 4, "Selector out of range");
192 return (uint64<8>) i_move4_l<shift*2>(uint32<16>(a));
193}
194#endif
195
196template<unsigned shift, unsigned N> SIMDPP_INL
197uint64<N> i_move2_l(const uint64<N>& a)
198{
199 SIMDPP_VEC_ARRAY_IMPL1(uint64<N>, i_move2_l<shift>, a);
200}
201
202// -----------------------------------------------------------------------------
203
204template<unsigned shift> SIMDPP_INL
205float32<4> i_move4_l(const float32<4>& a)
206{
207#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
208 return detail::null::move_n_l<shift>(a);
209#else
210 return (float32<4>) i_move16_l<shift*4>(uint8<16>(a));
211#endif
212}
213
214#if SIMDPP_USE_AVX
215template<unsigned shift> SIMDPP_INL
216float32<8> i_move4_l(const float32<8>& a)
217{
218 static_assert(shift <= 4, "Selector out of range");
219 return (float32<8>) i_move16_l<shift*4>(uint8<32>(a));
220}
221#endif
222
223#if SIMDPP_USE_AVX512F
224template<unsigned shift> SIMDPP_INL
225float32<16> i_move4_l(const float32<16>& a)
226{
227 static_assert(shift <= 4, "Selector out of range");
228 switch (shift) {
229 default:
230 case 0: return a;
231 case 1: return _mm512_maskz_shuffle_ps(0x7777, a.native(), a.native(),
232 _MM_SHUFFLE(3, 3, 2, 1));
233 case 2: return _mm512_maskz_shuffle_ps(0x3333, a.native(), a.native(),
234 _MM_SHUFFLE(3, 3, 3, 2));
235 case 3: return _mm512_maskz_shuffle_ps(0x1111, a.native(), a.native(),
236 _MM_SHUFFLE(3, 3, 3, 3));
237 case 4: return make_zero();
238 }
239}
240#endif
241
242template<unsigned shift, unsigned N> SIMDPP_INL
243float32<N> i_move4_l(const float32<N>& a)
244{
245 SIMDPP_VEC_ARRAY_IMPL1(float32<N>, i_move4_l<shift>, a);
246}
247
248// -----------------------------------------------------------------------------
249
250template<unsigned shift> SIMDPP_INL
251float64<2> i_move2_l(const float64<2>& a)
252{
253#if SIMDPP_USE_SSE2 || SIMDPP_USE_NEON64 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
254 return (float64<2>) i_move16_l<shift*8>(uint8<16>(a));
255#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
256 return detail::null::move_n_l<shift>(a);
257#endif
258}
259
260#if SIMDPP_USE_AVX
261template<unsigned shift> SIMDPP_INL
262float64<4> i_move2_l(const float64<4>& a)
263{
264 static_assert(shift <= 2, "Selector out of range");
265 return (float64<4>) i_move16_l<shift*8>(uint8<32>(a));
266}
267#endif
268
269#if SIMDPP_USE_AVX512F
270template<unsigned shift> SIMDPP_INL
271float64<8> i_move2_l(const float64<8>& a)
272{
273 static_assert(shift <= 2, "Selector out of range");
274 switch (shift) {
275 default:
276 case 0: return a;
277 case 1: return _mm512_maskz_shuffle_pd(0x55, a.native(), a.native(),
278 SIMDPP_SHUFFLE_MASK_2x2_4(1, 1));
279 case 2: return make_zero();
280 }
281}
282#endif
283
284template<unsigned shift, unsigned N> SIMDPP_INL
285float64<N> i_move2_l(const float64<N>& a)
286{
287 SIMDPP_VEC_ARRAY_IMPL1(float64<N>, i_move2_l<shift>, a);
288}
289
290// -----------------------------------------------------------------------------
// Certain compilers don't like zero or full vector width moves. The templates
// below offer a workaround.
293
294template<unsigned count>
295struct i_move2_l_wrapper {
296 template<class V>
297 static SIMDPP_INL V run(const V& arg) { return i_move2_l<count>(arg); }
298};
299template<>
300struct i_move2_l_wrapper<0> {
301 template<class V>
302 static SIMDPP_INL V run(const V& arg) { return arg; }
303};
304template<>
305struct i_move2_l_wrapper<2> {
306 template<class V>
307 static SIMDPP_INL V run(const V&) { return (V) make_zero(); }
308};
309
310template<unsigned count>
311struct i_move4_l_wrapper {
312 template<class V>
313 static SIMDPP_INL V run(const V& arg) { return i_move4_l<count>(arg); }
314};
315template<>
316struct i_move4_l_wrapper<0> {
317 template<class V>
318 static SIMDPP_INL V run(const V& arg) { return arg; }
319};
320template<>
321struct i_move4_l_wrapper<4> {
322 template<class V>
323 static SIMDPP_INL V run(const V&) { return (V) make_zero(); }
324};
325
326template<unsigned count>
327struct i_move8_l_wrapper {
328 template<class V>
329 static SIMDPP_INL V run(const V& arg) { return i_move8_l<count>(arg); }
330};
331template<>
332struct i_move8_l_wrapper<0> {
333 template<class V>
334 static SIMDPP_INL V run(const V& arg) { return arg; }
335};
336template<>
337struct i_move8_l_wrapper<8> {
338 template<class V>
339 static SIMDPP_INL V run(const V&) { return (V) make_zero(); }
340};
341
342template<unsigned count>
343struct i_move16_l_wrapper {
344 template<class V>
345 static SIMDPP_INL V run(const V& arg) { return i_move16_l<count>(arg); }
346};
347template<>
348struct i_move16_l_wrapper<0> {
349 template<class V>
350 static SIMDPP_INL V run(const V& arg) { return arg; }
351};
352template<>
353struct i_move16_l_wrapper<16> {
354 template<class V>
355 static SIMDPP_INL V run(const V&) { return (V) make_zero(); }
356};
357
358} // namespace insn
359} // namespace detail
360} // namespace SIMDPP_ARCH_NAMESPACE
361} // namespace simdpp
362
363#endif
364
365