/*  Copyright (C) 2011-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_MOVE_R_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_MOVE_R_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/shuffle/shuffle_mask.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

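// Moves the bytes of each 128-bit lane towards the higher element positions
// by @a shift positions and fills the vacated bytes with zeros:
// r[i] = (i < shift) ? 0 : a[i-shift]. The wrapper templates at the end of
// this file handle the boundary cases shift == 0 and shift == 16, which some
// of the intrinsics used below do not accept.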
template<unsigned shift> SIMDPP_INL
uint8x16 i_move16_r(const uint8x16& a)
{
    static_assert(shift <= 16, "Selector out of range");

#if SIMDPP_USE_NULL
    return detail::null::move_n_r<shift>(a);
#elif SIMDPP_USE_SSE2
    return _mm_slli_si128(a.native(), shift);
#elif SIMDPP_USE_NEON
    uint8x16 zero = make_zero();
    return vextq_u8(zero.native(), a.native(), 16-shift);
#elif SIMDPP_USE_ALTIVEC
    // equivalent to: align<16-shift>((uint8x16) make_zero(), a);
    return vec_sld_biendian<16-shift>((uint8<16>) make_zero(), a);
#elif SIMDPP_USE_MSA
    uint8x16 zero = make_zero();
    return (v16u8) __msa_sldi_b((v16i8) a.native(), (v16i8) zero.native(), 16-shift);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned shift> SIMDPP_INL
uint8x32 i_move16_r(const uint8x32& a)
{
    static_assert(shift <= 16, "Selector out of range");
    return _mm256_slli_si256(a.native(), shift);
}
#endif

#if SIMDPP_USE_AVX512BW
template<unsigned shift> SIMDPP_INL
uint8<64> i_move16_r(const uint8<64>& a)
{
    static_assert(shift <= 16, "Selector out of range");
    return _mm512_bslli_epi128(a.native(), shift);
}
#endif

template<unsigned shift, unsigned N> SIMDPP_INL
uint8<N> i_move16_r(const uint8<N>& a)
{
    static_assert(shift <= 16, "Selector out of range");
    SIMDPP_VEC_ARRAY_IMPL1(uint8<N>, i_move16_r<shift>, a);
}

// -----------------------------------------------------------------------------

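// 16-bit element variant of the move: the element count is converted to a
// byte count (shift*2).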
template<unsigned shift> SIMDPP_INL
uint16<8> i_move8_r(const uint16<8>& a)
{
    static_assert(shift <= 8, "Selector out of range");

#if SIMDPP_USE_NULL
    return detail::null::move_n_r<shift>(a);
#else
    return (uint16<8>) i_move16_r<shift*2>(uint8<16>(a));
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned shift> SIMDPP_INL
uint16<16> i_move8_r(const uint16<16>& a)
{
    static_assert(shift <= 8, "Selector out of range");
    return _mm256_slli_si256(a.native(), shift*2);
}
#endif

#if SIMDPP_USE_AVX512BW
template<unsigned shift> SIMDPP_INL
uint16<32> i_move8_r(const uint16<32>& a)
{
    static_assert(shift <= 8, "Selector out of range");
    return _mm512_bslli_epi128(a.native(), shift*2);
}
#endif

template<unsigned shift, unsigned N> SIMDPP_INL
uint16<N> i_move8_r(const uint16<N>& a)
{
    static_assert(shift <= 8, "Selector out of range");
    SIMDPP_VEC_ARRAY_IMPL1(uint16<N>, i_move8_r<shift>, a);
}

// -----------------------------------------------------------------------------

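// 32-bit element variant: the element count is converted to a byte count
// (shift*4), except on AVX-512, which uses a masked in-lane shuffle.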
template<unsigned shift> SIMDPP_INL
uint32<4> i_move4_r(const uint32<4>& a)
{
    static_assert(shift <= 4, "Selector out of range");

#if SIMDPP_USE_NULL
    return detail::null::move_n_r<shift>(a);
#else
    return (uint32<4>) i_move16_r<shift*4>(uint8<16>(a));
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned shift> SIMDPP_INL
uint32<8> i_move4_r(const uint32<8>& a)
{
    static_assert(shift <= 4, "Selector out of range");
    return _mm256_slli_si256(a.native(), shift*4);
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned shift> SIMDPP_INL
uint32<16> i_move4_r(const uint32<16>& a)
{
    static_assert(shift <= 4, "Selector out of range");
    switch (shift) {
    default:
    case 0: return a;
    case 1: return _mm512_maskz_shuffle_epi32(0xeeee, a.native(),
                                              _MM_PERM_ENUM(_MM_SHUFFLE(2, 1, 0, 0)));
    case 2: return _mm512_maskz_shuffle_epi32(0xcccc, a.native(),
                                              _MM_PERM_ENUM(_MM_SHUFFLE(1, 0, 0, 0)));
    case 3: return _mm512_maskz_shuffle_epi32(0x8888, a.native(),
                                              _MM_PERM_ENUM(_MM_SHUFFLE(0, 0, 0, 0)));
    case 4: return make_zero();
    }
}
#endif

template<unsigned shift, unsigned N> SIMDPP_INL
uint32<N> i_move4_r(const uint32<N>& a)
{
    static_assert(shift <= 4, "Selector out of range");
    SIMDPP_VEC_ARRAY_IMPL1(uint32<N>, i_move4_r<shift>, a);
}

// -----------------------------------------------------------------------------

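// 64-bit element variant: the element count is converted to a byte count
// (shift*8); the AVX-512 overload reuses the 32-bit move.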
template<unsigned shift> SIMDPP_INL
uint64<2> i_move2_r(const uint64<2>& a)
{
    static_assert(shift <= 2, "Selector out of range");

#if SIMDPP_USE_NULL || (SIMDPP_USE_ALTIVEC && !SIMDPP_USE_VSX_207)
    return detail::null::move_n_r<shift>(a);
#else
    return (uint64<2>) i_move16_r<shift*8>(uint8<16>(a));
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned shift> SIMDPP_INL
uint64<4> i_move2_r(const uint64<4>& a)
{
    static_assert(shift <= 2, "Selector out of range");
    return _mm256_slli_si256(a.native(), shift*8);
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned shift> SIMDPP_INL
uint64<8> i_move2_r(const uint64<8>& a)
{
    static_assert(shift <= 2, "Selector out of range");
    return (uint64<8>) i_move4_r<shift*2>(uint32<16>(a));
}
#endif

template<unsigned shift, unsigned N> SIMDPP_INL
uint64<N> i_move2_r(const uint64<N>& a)
{
    static_assert(shift <= 2, "Selector out of range");
    SIMDPP_VEC_ARRAY_IMPL1(uint64<N>, i_move2_r<shift>, a);
}

// -----------------------------------------------------------------------------

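// float32 variant: bit-casts to bytes and reuses the integer move where
// possible; the AVX-512 overload uses a masked shuffle instead.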
template<unsigned shift> SIMDPP_INL
float32<4> i_move4_r(const float32<4>& a)
{
    static_assert(shift <= 4, "Selector out of range");

#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    return detail::null::move_n_r<shift>(a);
#else
    return (float32<4>) i_move16_r<shift*4>(uint8<16>(a));
#endif
}

#if SIMDPP_USE_AVX
template<unsigned shift> SIMDPP_INL
float32<8> i_move4_r(const float32<8>& a)
{
    static_assert(shift <= 4, "Selector out of range");
    return (float32<8>) i_move16_r<shift*4>(uint8<32>(a));
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned shift> SIMDPP_INL
float32<16> i_move4_r(const float32<16>& a)
{
    static_assert(shift <= 4, "Selector out of range");
    switch (shift) {
    default:
    case 0: return a;
    case 1: return _mm512_maskz_shuffle_ps(0xeeee, a.native(), a.native(),
                                           _MM_SHUFFLE(2, 1, 0, 0));
    case 2: return _mm512_maskz_shuffle_ps(0xcccc, a.native(), a.native(),
                                           _MM_SHUFFLE(1, 0, 0, 0));
    case 3: return _mm512_maskz_shuffle_ps(0x8888, a.native(), a.native(),
                                           _MM_SHUFFLE(0, 0, 0, 0));
    case 4: return make_zero();
    }
}
#endif

template<unsigned shift, unsigned N> SIMDPP_INL
float32<N> i_move4_r(const float32<N>& a)
{
    static_assert(shift <= 4, "Selector out of range");
    SIMDPP_VEC_ARRAY_IMPL1(float32<N>, i_move4_r<shift>, a);
}

// -----------------------------------------------------------------------------

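// float64 variant: bit-casts to bytes on targets with native 64-bit
// floating-point vectors; the AVX-512 overload uses a masked shuffle.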
template<unsigned shift> SIMDPP_INL
float64<2> i_move2_r(const float64<2>& a)
{
    static_assert(shift <= 2, "Selector out of range");

#if SIMDPP_USE_SSE2 || SIMDPP_USE_NEON64 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
    return (float64<2>) i_move16_r<shift*8>(uint8<16>(a));
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC
    return detail::null::move_n_r<shift>(a);
#endif
}

#if SIMDPP_USE_AVX
template<unsigned shift> SIMDPP_INL
float64<4> i_move2_r(const float64<4>& a)
{
    static_assert(shift <= 2, "Selector out of range");
    return (float64<4>) i_move16_r<shift*8>(uint8<32>(a));
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned shift> SIMDPP_INL
float64<8> i_move2_r(const float64<8>& a)
{
    static_assert(shift <= 2, "Selector out of range");
    switch (shift) {
    default:
    case 0: return a;
    case 1: return _mm512_maskz_shuffle_pd(0xaa, a.native(), a.native(),
                                           SIMDPP_SHUFFLE_MASK_2x2_4(0, 0));
    case 2: return make_zero();
    }
}
#endif

template<unsigned shift, unsigned N> SIMDPP_INL
float64<N> i_move2_r(const float64<N>& a)
{
    static_assert(shift <= 2, "Selector out of range");
    SIMDPP_VEC_ARRAY_IMPL1(float64<N>, i_move2_r<shift>, a);
}

// -----------------------------------------------------------------------------

// Certain compilers don't accept moves by zero or by the full vector width.
// The templates below offer a workaround: those two shift counts are handled
// by specializations that never instantiate the move implementations.

template<unsigned count>
struct i_move2_r_wrapper {
    template<class V>
    static SIMDPP_INL V run(const V& arg) { return i_move2_r<count>(arg); }
};
template<>
struct i_move2_r_wrapper<0> {
    template<class V>
    static SIMDPP_INL V run(const V& arg) { return arg; }
};
template<>
struct i_move2_r_wrapper<2> {
    template<class V>
    static SIMDPP_INL V run(const V&) { return (V) make_zero(); }
};

template<unsigned count>
struct i_move4_r_wrapper {
    template<class V>
    static SIMDPP_INL V run(const V& arg) { return i_move4_r<count>(arg); }
};
template<>
struct i_move4_r_wrapper<0> {
    template<class V>
    static SIMDPP_INL V run(const V& arg) { return arg; }
};
template<>
struct i_move4_r_wrapper<4> {
    template<class V>
    static SIMDPP_INL V run(const V&) { return (V) make_zero(); }
};

template<unsigned count>
struct i_move8_r_wrapper {
    template<class V>
    static SIMDPP_INL V run(const V& arg) { return i_move8_r<count>(arg); }
};
template<>
struct i_move8_r_wrapper<0> {
    template<class V>
    static SIMDPP_INL V run(const V& arg) { return arg; }
};
template<>
struct i_move8_r_wrapper<8> {
    template<class V>
    static SIMDPP_INL V run(const V&) { return (V) make_zero(); }
};

template<unsigned count>
struct i_move16_r_wrapper {
    template<class V>
    static SIMDPP_INL V run(const V& arg) { return i_move16_r<count>(arg); }
};
template<>
struct i_move16_r_wrapper<0> {
    template<class V>
    static SIMDPP_INL V run(const V& arg) { return arg; }
};
template<>
struct i_move16_r_wrapper<16> {
    template<class V>
    static SIMDPP_INL V run(const V&) { return (V) make_zero(); }
};
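
// A minimal usage sketch (hypothetical caller, not part of this header):
// dispatching through the wrapper keeps the count == 0 and full-width
// specializations from ever instantiating the intrinsic-based overloads.
//
//     template<unsigned count> SIMDPP_INL
//     uint8x16 shift_in_zero_bytes(const uint8x16& a)
//     {
//         return i_move16_r_wrapper<count>::run(a); // count in [0, 16]
//     }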

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif