1 | /* Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_MOVE_L_H |
9 | #define LIBSIMDPP_SIMDPP_DETAIL_INSN_MOVE_L_H |
10 | |
11 | #ifndef LIBSIMDPP_SIMD_H |
12 | #error "This file must be included through simd.h" |
13 | #endif |
14 | |
15 | #include <simdpp/types.h> |
16 | #include <simdpp/core/permute4.h> |
17 | #include <simdpp/detail/null/shuffle.h> |
18 | #include <simdpp/detail/shuffle/shuffle_mask.h> |
19 | #include <simdpp/detail/vector_array_macros.h> |
20 | |
21 | namespace simdpp { |
22 | namespace SIMDPP_ARCH_NAMESPACE { |
23 | namespace detail { |
24 | namespace insn { |
25 | |
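// The i_moveN_l family shifts vector elements towards element 0 ("left" in
// libsimdpp terms) by a compile-time element count, shifting in zeros at the
// high end. For vectors wider than 128 bits the operation applies to each
// 128-bit sub-vector independently, matching the per-lane behavior of the
// intrinsics used below. Informally, for a 16-byte vector:
//
//   r = i_move16_l<shift>(a)  =>  r[i] = a[i + shift]  for i <  16 - shift
//                                 r[i] = 0             for i >= 16 - shift
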
26 | #if SIMDPP_USE_ALTIVEC |
27 | template<unsigned shift> SIMDPP_INL |
28 | uint8<16> vec_sld_biendian(const uint8<16>& lower, const uint8<16>& upper) |
29 | { |
30 | #if SIMDPP_BIG_ENDIAN |
31 | return vec_sld(lower.native(), upper.native(), shift); |
32 | #else |
    // By default GCC adjusts vec_sld element order to match the endianness of
    // the target, so on little-endian targets the operands must be swapped
    // and the shift amount inverted.
    return vec_sld(upper.native(), lower.native(), 16 - shift);
35 | #endif |
36 | } |
37 | #endif |
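
// A worked example of the endian adjustment above (informal): with shift == 3
// on a big-endian target, vec_sld(lower, upper, 3) yields the bytes
// { lower[3..15], upper[0..2] }. On a little-endian target the same result is
// produced by vec_sld(upper, lower, 16 - 3), hence the swapped operands and
// inverted shift amount.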
38 | |
39 | template<unsigned shift> SIMDPP_INL |
40 | uint8x16 i_move16_l(const uint8x16& a) |
41 | { |
    static_assert(shift <= 16, "Selector out of range");
43 | #if SIMDPP_USE_NULL |
44 | return detail::null::move_n_l<shift>(a); |
45 | #elif SIMDPP_USE_SSE2 |
46 | return _mm_srli_si128(a.native(), shift); |
47 | #elif SIMDPP_USE_NEON |
48 | uint8x16 z = make_zero(); |
49 | return vextq_u8(a.native(), z.native(), shift); |
50 | #elif SIMDPP_USE_ALTIVEC |
51 | // return align<shift>(a, (uint8x16) make_zero()); |
52 | return vec_sld_biendian<shift>((uint8<16>)a, (uint8<16>)make_zero()); |
53 | #elif SIMDPP_USE_MSA |
54 | uint8x16 zero = make_zero(); |
55 | return (v16u8) __msa_sldi_b((v16i8)zero.native(), (v16i8)a.native(), shift); |
56 | #endif |
57 | } |
58 | |
59 | #if SIMDPP_USE_AVX2 |
60 | template<unsigned shift> SIMDPP_INL |
61 | uint8x32 i_move16_l(const uint8x32& a) |
62 | { |
    static_assert(shift <= 16, "Selector out of range");
64 | return _mm256_srli_si256(a.native(), shift); |
65 | } |
66 | #endif |
67 | |
68 | #if SIMDPP_USE_AVX512BW |
69 | template<unsigned shift> SIMDPP_INL |
70 | uint8<64> i_move16_l(const uint8<64>& a) |
71 | { |
    static_assert(shift <= 16, "Selector out of range");
73 | return _mm512_bsrli_epi128(a.native(), shift); |
74 | } |
75 | #endif |
76 | |
77 | template<unsigned shift, unsigned N> SIMDPP_INL |
78 | uint8<N> i_move16_l(const uint8<N>& a) |
79 | { |
    static_assert(shift <= 16, "Selector out of range");
81 | SIMDPP_VEC_ARRAY_IMPL1(uint8<N>, i_move16_l<shift>, a); |
82 | } |
83 | |
84 | // ----------------------------------------------------------------------------- |
85 | |
86 | template<unsigned shift> SIMDPP_INL |
87 | uint16<8> i_move8_l(const uint16<8>& a) |
88 | { |
89 | #if SIMDPP_USE_NULL |
90 | return detail::null::move_n_l<shift>(a); |
91 | #else |
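    // Each 16-bit element spans two bytes, so an element shift of 'shift'
    // equals a byte shift of shift*2. The 32-bit and 64-bit variants below
    // use the same reduction with shift*4 and shift*8.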
92 | return (uint16<8>) i_move16_l<shift*2>(uint8<16>(a)); |
93 | #endif |
94 | } |
95 | |
96 | #if SIMDPP_USE_AVX2 |
97 | template<unsigned shift> SIMDPP_INL |
98 | uint16<16> i_move8_l(const uint16<16>& a) |
99 | { |
    static_assert(shift <= 8, "Selector out of range");
101 | return _mm256_srli_si256(a.native(), shift*2); |
102 | } |
103 | #endif |
104 | |
105 | #if SIMDPP_USE_AVX512BW |
106 | template<unsigned shift> SIMDPP_INL |
107 | uint16<32> i_move8_l(const uint16<32>& a) |
108 | { |
    static_assert(shift <= 8, "Selector out of range");
110 | return _mm512_bsrli_epi128(a.native(), shift*2); |
111 | } |
112 | #endif |
113 | |
114 | template<unsigned shift, unsigned N> SIMDPP_INL |
115 | uint16<N> i_move8_l(const uint16<N>& a) |
116 | { |
117 | SIMDPP_VEC_ARRAY_IMPL1(uint16<N>, i_move8_l<shift>, a); |
118 | } |
119 | |
120 | // ----------------------------------------------------------------------------- |
121 | |
122 | template<unsigned shift> SIMDPP_INL |
123 | uint32<4> i_move4_l(const uint32<4>& a) |
124 | { |
125 | #if SIMDPP_USE_NULL |
126 | return detail::null::move_n_l<shift>(a); |
127 | #else |
128 | return (uint32<4>) i_move16_l<shift*4>(uint8<16>(a)); |
129 | #endif |
130 | } |
131 | |
132 | #if SIMDPP_USE_AVX2 |
133 | template<unsigned shift> SIMDPP_INL |
134 | uint32<8> i_move4_l(const uint32<8>& a) |
135 | { |
    static_assert(shift <= 4, "Selector out of range");
137 | return _mm256_srli_si256(a.native(), shift*4); |
138 | } |
139 | #endif |
140 | |
141 | #if SIMDPP_USE_AVX512F |
142 | template<unsigned shift> SIMDPP_INL |
143 | uint32<16> i_move4_l(const uint32<16>& a) |
144 | { |
    static_assert(shift <= 4, "Selector out of range");
146 | switch (shift) { |
147 | default: |
148 | case 0: return a; |
149 | case 1: return _mm512_maskz_shuffle_epi32(0x7777, a.native(), |
150 | _MM_PERM_ENUM(_MM_SHUFFLE(3, 3, 2, 1))); |
151 | case 2: return _mm512_maskz_shuffle_epi32(0x3333, a.native(), |
152 | _MM_PERM_ENUM(_MM_SHUFFLE(3, 3, 3, 2))); |
153 | case 3: return _mm512_maskz_shuffle_epi32(0x1111, a.native(), |
154 | _MM_PERM_ENUM(_MM_SHUFFLE(3, 3, 3, 3))); |
155 | case 4: return make_zero(); |
156 | } |
157 | } |
158 | #endif |
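
// Note on the shift == 1 case above: _MM_SHUFFLE(3, 3, 2, 1) makes element i
// of each 128-bit lane take the value of element i + 1 (element 3 is a
// don't-care), and the zeroing mask 0x7777 then clears element 3 of each of
// the four lanes, yielding { a1, a2, a3, 0 } per lane. The shift == 2 and
// shift == 3 cases follow the same pattern with fewer surviving elements.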
159 | |
160 | template<unsigned shift, unsigned N> SIMDPP_INL |
161 | uint32<N> i_move4_l(const uint32<N>& a) |
162 | { |
163 | SIMDPP_VEC_ARRAY_IMPL1(uint32<N>, i_move4_l<shift>, a); |
164 | } |
165 | |
166 | // ----------------------------------------------------------------------------- |
167 | |
168 | template<unsigned shift> SIMDPP_INL |
169 | uint64<2> i_move2_l(const uint64<2>& a) |
170 | { |
171 | #if SIMDPP_USE_NULL || (SIMDPP_USE_ALTIVEC && !SIMDPP_USE_VSX_207) |
172 | return detail::null::move_n_l<shift>(a); |
173 | #else |
174 | return (uint64<2>) i_move16_l<shift*8>(uint8<16>(a)); |
175 | #endif |
176 | } |
177 | |
178 | #if SIMDPP_USE_AVX2 |
179 | template<unsigned shift> SIMDPP_INL |
180 | uint64<4> i_move2_l(const uint64<4>& a) |
181 | { |
    static_assert(shift <= 2, "Selector out of range");
183 | return _mm256_srli_si256(a.native(), shift*8); |
184 | } |
185 | #endif |
186 | |
187 | #if SIMDPP_USE_AVX512F |
188 | template<unsigned shift> SIMDPP_INL |
189 | uint64<8> i_move2_l(const uint64<8>& a) |
190 | { |
    static_assert(shift <= 2, "Selector out of range");
192 | return (uint64<8>) i_move4_l<shift*2>(uint32<16>(a)); |
193 | } |
194 | #endif |
195 | |
196 | template<unsigned shift, unsigned N> SIMDPP_INL |
197 | uint64<N> i_move2_l(const uint64<N>& a) |
198 | { |
199 | SIMDPP_VEC_ARRAY_IMPL1(uint64<N>, i_move2_l<shift>, a); |
200 | } |
201 | |
202 | // ----------------------------------------------------------------------------- |
203 | |
204 | template<unsigned shift> SIMDPP_INL |
205 | float32<4> i_move4_l(const float32<4>& a) |
206 | { |
207 | #if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP |
208 | return detail::null::move_n_l<shift>(a); |
209 | #else |
210 | return (float32<4>) i_move16_l<shift*4>(uint8<16>(a)); |
211 | #endif |
212 | } |
213 | |
214 | #if SIMDPP_USE_AVX |
215 | template<unsigned shift> SIMDPP_INL |
216 | float32<8> i_move4_l(const float32<8>& a) |
217 | { |
    static_assert(shift <= 4, "Selector out of range");
219 | return (float32<8>) i_move16_l<shift*4>(uint8<32>(a)); |
220 | } |
221 | #endif |
222 | |
223 | #if SIMDPP_USE_AVX512F |
224 | template<unsigned shift> SIMDPP_INL |
225 | float32<16> i_move4_l(const float32<16>& a) |
226 | { |
    static_assert(shift <= 4, "Selector out of range");
228 | switch (shift) { |
229 | default: |
230 | case 0: return a; |
231 | case 1: return _mm512_maskz_shuffle_ps(0x7777, a.native(), a.native(), |
232 | _MM_SHUFFLE(3, 3, 2, 1)); |
233 | case 2: return _mm512_maskz_shuffle_ps(0x3333, a.native(), a.native(), |
234 | _MM_SHUFFLE(3, 3, 3, 2)); |
235 | case 3: return _mm512_maskz_shuffle_ps(0x1111, a.native(), a.native(), |
236 | _MM_SHUFFLE(3, 3, 3, 3)); |
237 | case 4: return make_zero(); |
238 | } |
239 | } |
240 | #endif |
241 | |
242 | template<unsigned shift, unsigned N> SIMDPP_INL |
243 | float32<N> i_move4_l(const float32<N>& a) |
244 | { |
245 | SIMDPP_VEC_ARRAY_IMPL1(float32<N>, i_move4_l<shift>, a); |
246 | } |
247 | |
248 | // ----------------------------------------------------------------------------- |
249 | |
250 | template<unsigned shift> SIMDPP_INL |
251 | float64<2> i_move2_l(const float64<2>& a) |
252 | { |
253 | #if SIMDPP_USE_SSE2 || SIMDPP_USE_NEON64 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA |
254 | return (float64<2>) i_move16_l<shift*8>(uint8<16>(a)); |
255 | #elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC |
256 | return detail::null::move_n_l<shift>(a); |
257 | #endif |
258 | } |
259 | |
260 | #if SIMDPP_USE_AVX |
261 | template<unsigned shift> SIMDPP_INL |
262 | float64<4> i_move2_l(const float64<4>& a) |
263 | { |
    static_assert(shift <= 2, "Selector out of range");
265 | return (float64<4>) i_move16_l<shift*8>(uint8<32>(a)); |
266 | } |
267 | #endif |
268 | |
269 | #if SIMDPP_USE_AVX512F |
270 | template<unsigned shift> SIMDPP_INL |
271 | float64<8> i_move2_l(const float64<8>& a) |
272 | { |
    static_assert(shift <= 2, "Selector out of range");
274 | switch (shift) { |
275 | default: |
276 | case 0: return a; |
277 | case 1: return _mm512_maskz_shuffle_pd(0x55, a.native(), a.native(), |
278 | SIMDPP_SHUFFLE_MASK_2x2_4(1, 1)); |
279 | case 2: return make_zero(); |
280 | } |
281 | } |
282 | #endif |
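
// In the shift == 1 case above the shuffle selects element 1 of each
// two-element lane for both positions, and the zeroing mask 0x55 keeps only
// the lower position, yielding { a1, 0 } per 128-bit lane.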
283 | |
284 | template<unsigned shift, unsigned N> SIMDPP_INL |
285 | float64<N> i_move2_l(const float64<N>& a) |
286 | { |
287 | SIMDPP_VEC_ARRAY_IMPL1(float64<N>, i_move2_l<shift>, a); |
288 | } |
289 | |
290 | // ----------------------------------------------------------------------------- |
// Certain compilers reject moves by zero elements or by the full vector
// width. The wrapper templates below offer a workaround.
293 | |
294 | template<unsigned count> |
295 | struct i_move2_l_wrapper { |
296 | template<class V> |
297 | static SIMDPP_INL V run(const V& arg) { return i_move2_l<count>(arg); } |
298 | }; |
299 | template<> |
300 | struct i_move2_l_wrapper<0> { |
301 | template<class V> |
302 | static SIMDPP_INL V run(const V& arg) { return arg; } |
303 | }; |
304 | template<> |
305 | struct i_move2_l_wrapper<2> { |
306 | template<class V> |
307 | static SIMDPP_INL V run(const V&) { return (V) make_zero(); } |
308 | }; |
309 | |
310 | template<unsigned count> |
311 | struct i_move4_l_wrapper { |
312 | template<class V> |
313 | static SIMDPP_INL V run(const V& arg) { return i_move4_l<count>(arg); } |
314 | }; |
315 | template<> |
316 | struct i_move4_l_wrapper<0> { |
317 | template<class V> |
318 | static SIMDPP_INL V run(const V& arg) { return arg; } |
319 | }; |
320 | template<> |
321 | struct i_move4_l_wrapper<4> { |
322 | template<class V> |
323 | static SIMDPP_INL V run(const V&) { return (V) make_zero(); } |
324 | }; |
325 | |
326 | template<unsigned count> |
327 | struct i_move8_l_wrapper { |
328 | template<class V> |
329 | static SIMDPP_INL V run(const V& arg) { return i_move8_l<count>(arg); } |
330 | }; |
331 | template<> |
332 | struct i_move8_l_wrapper<0> { |
333 | template<class V> |
334 | static SIMDPP_INL V run(const V& arg) { return arg; } |
335 | }; |
336 | template<> |
337 | struct i_move8_l_wrapper<8> { |
338 | template<class V> |
339 | static SIMDPP_INL V run(const V&) { return (V) make_zero(); } |
340 | }; |
341 | |
342 | template<unsigned count> |
343 | struct i_move16_l_wrapper { |
344 | template<class V> |
345 | static SIMDPP_INL V run(const V& arg) { return i_move16_l<count>(arg); } |
346 | }; |
347 | template<> |
348 | struct i_move16_l_wrapper<0> { |
349 | template<class V> |
350 | static SIMDPP_INL V run(const V& arg) { return arg; } |
351 | }; |
352 | template<> |
353 | struct i_move16_l_wrapper<16> { |
354 | template<class V> |
355 | static SIMDPP_INL V run(const V&) { return (V) make_zero(); } |
356 | }; |
357 | |
358 | } // namespace insn |
359 | } // namespace detail |
360 | } // namespace SIMDPP_ARCH_NAMESPACE |
361 | } // namespace simdpp |
362 | |
363 | #endif |