1 | /* Copyright (C) 2014 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_SHUFFLE2x2_H |
9 | #define LIBSIMDPP_SIMDPP_DETAIL_INSN_SHUFFLE2x2_H |
10 | |
11 | #ifndef LIBSIMDPP_SIMD_H |
12 | #error "This file must be included through simd.h" |
13 | #endif |
14 | |
15 | #include <simdpp/types.h> |
16 | #include <simdpp/core/make_shuffle_bytes_mask.h> |
17 | #include <simdpp/core/shuffle_bytes16.h> |
18 | #include <simdpp/detail/not_implemented.h> |
19 | #include <simdpp/detail/shuffle/neon_int32x4.h> |
20 | #include <simdpp/detail/shuffle/neon_int64x2.h> |
21 | #include <simdpp/detail/shuffle/shuffle_mask.h> |
22 | #include <simdpp/detail/vector_array_macros.h> |
23 | |
24 | namespace simdpp { |
25 | namespace SIMDPP_ARCH_NAMESPACE { |
26 | namespace detail { |
27 | namespace insn { |
28 | |
29 | |
30 | // ----------------------------------------------------------------------------- |
31 | // float32 |
32 | |
33 | template<unsigned s0, unsigned s1> SIMDPP_INL |
34 | float32<4> i_shuffle2x2(const float32<4>& a, const float32<4>& b) |
35 | { |
36 | static_assert(s0 < 4 && s1 < 4, "Selector out of range" ); |
37 | #if SIMDPP_USE_NULL |
38 | float32<4> r; |
39 | r.el(0) = s0 < 2 ? a.el(s0) : b.el(s0-2); |
40 | r.el(1) = s1 < 2 ? a.el(s1) : b.el(s1-2); |
41 | r.el(2) = s0 < 2 ? a.el(s0+2) : b.el(s0); |
42 | r.el(3) = s1 < 2 ? a.el(s1+2) : b.el(s1); |
43 | return r; |
44 | #elif SIMDPP_USE_SSE2 |
45 | if (s0 < 2 && s1 < 2) { |
46 | return _mm_shuffle_ps(a.native(), a.native(), |
47 | SIMDPP_SHUFFLE_MASK_4x4(s0,s1,s0+2,s1+2)); |
48 | } else if (s0 >= 2 && s1 >= 2) { |
49 | return _mm_shuffle_ps(b.native(), b.native(), |
50 | SIMDPP_SHUFFLE_MASK_4x4(s0-2,s1-2,s0,s1)); |
51 | #if SIMDPP_USE_SSE4_1 |
52 | } else if (s0 == 0 && s1 == 3) { |
53 | return _mm_blend_ps(a.native(), b.native(), 0xa); |
54 | } else if (s0 == 2 && s1 == 1) { |
55 | return _mm_blend_ps(b.native(), a.native(), 0xa); |
56 | #endif |
57 | } else if (s0 < 2) { // s1 >= 2 |
58 | __m128 t = _mm_shuffle_ps(a.native(), b.native(), |
59 | SIMDPP_SHUFFLE_MASK_4x4(s0,s0+1,s1-2,s1-1)); |
60 | return _mm_shuffle_ps(t, t, _MM_SHUFFLE(3,1,2,0)); |
61 | } else { // s0 >= 2, s1 < 2 |
62 | __m128 t = _mm_shuffle_ps(b.native(), a.native(), |
63 | SIMDPP_SHUFFLE_MASK_4x4(s1,s1+1,s0-2,s0-1)); |
64 | return _mm_shuffle_ps(t, t, _MM_SHUFFLE(3,1,2,0)); |
65 | } |
66 | #elif SIMDPP_USE_NEON |
67 | return (float32<4>) detail::neon_shuffle_int32x4::shuffle2x2<s0,s1>(float32<4>(a), float32<4>(b)); |
68 | #elif SIMDPP_USE_ALTIVEC |
69 | uint32<4> mask = make_shuffle_bytes16_mask<s0, s1>(mask); |
70 | return shuffle_bytes16(a, b, mask); |
71 | #elif SIMDPP_USE_MSA |
72 | const unsigned q0 = s0 < 2 ? s0 : s0 + 2; |
73 | const unsigned q1 = s1 < 2 ? s1 : s1 + 2; |
74 | uint32<4> mask = make_uint(q0,q1,q0+2,q1+2); |
75 | return (v4f32) __msa_vshf_w((v4i32)mask.native(), |
76 | (v4i32)b.native(), |
77 | (v4i32)a.native()); |
78 | #else |
79 | return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(int64<s0+4>, a, b); |
80 | #endif |
81 | } |
82 | |
#if SIMDPP_USE_AVX
/** AVX overload for float32<8>.

    Applies the same selection as the float32<4> overload (selectors 0..1
    pick from @a a, 2..3 pick from @a b) using 256-bit intrinsics that operate
    within each 128-bit lane.
*/
template<unsigned s0, unsigned s1> SIMDPP_INL
float32<8> i_shuffle2x2(const float32<8>& a, const float32<8>& b)
{
    static_assert(s0 < 4 && s1 < 4, "Selector out of range" );
    if (s0 < 2 && s1 < 2) {
        // Both selectors address a.
        return _mm256_permute_ps(a.native(), SIMDPP_SHUFFLE_MASK_4x4(s0,s1,s0+2,s1+2));
    } else if (s0 >= 2 && s1 >= 2) {
        // Both selectors address b.
        return _mm256_permute_ps(b.native(), SIMDPP_SHUFFLE_MASK_4x4(s0-2,s1-2,s0,s1));
    } else if (s0 == 0 && s1 == 3) {
        // Even elements from a, odd elements from b.
        return _mm256_blend_ps(a.native(), b.native(), 0xaa);
    } else if (s0 == 2 && s1 == 1) {
        // Even elements from b, odd elements from a.
        return _mm256_blend_ps(b.native(), a.native(), 0xaa);
    } else if (s0 < 2) { // s1 >= 2
        // Per lane: t = [ a[s0], a[s0+2], b[s1-2], b[s1] ], then interleave
        // into [ t0, t2, t1, t3 ].
        __m256 t = _mm256_shuffle_ps(a.native(), b.native(),
                                     SIMDPP_SHUFFLE_MASK_4x4(s0,s0+2,s1-2,s1));
        return _mm256_permute_ps(t, _MM_SHUFFLE(3,1,2,0));
    } else { // s0 >= 2, s1 < 2
        // Per lane: t = [ b[s0-2], b[s0], a[s1], a[s1+2] ].
        __m256 t = _mm256_shuffle_ps(b.native(), a.native(),
                                     SIMDPP_SHUFFLE_MASK_4x4(s0-2,s0,s1,s1+2));
        return _mm256_permute_ps(t, _MM_SHUFFLE(3,1,2,0));
    }
}
#endif
107 | |
#if SIMDPP_USE_AVX512F
/** AVX-512F overload for float32<16>.

    Applies the same selection as the float32<4> overload (selectors 0..1
    pick from @a a, 2..3 pick from @a b) using 512-bit intrinsics that operate
    within each 128-bit lane.
*/
template<unsigned s0, unsigned s1> SIMDPP_INL
float32<16> i_shuffle2x2(const float32<16>& a, const float32<16>& b)
{
    static_assert(s0 < 4 && s1 < 4, "Selector out of range" );
    if (s0 < 2 && s1 < 2) {
        // Both selectors address a.
        return _mm512_permute_ps(a.native(), SIMDPP_SHUFFLE_MASK_4x4(s0,s1,s0+2,s1+2));
    } else if (s0 >= 2 && s1 >= 2) {
        // Both selectors address b.
        return _mm512_permute_ps(b.native(), SIMDPP_SHUFFLE_MASK_4x4(s0-2,s1-2,s0,s1));
    } else if (s0 == 0 && s1 == 3) {
        // Set mask bits take the element from the last operand:
        // even elements from a, odd elements from b.
        return _mm512_mask_blend_ps(0xaaaa, a.native(), b.native());
    } else if (s0 == 2 && s1 == 1) {
        // Even elements from b, odd elements from a.
        return _mm512_mask_blend_ps(0xaaaa, b.native(), a.native());
    } else if (s0 < 2) { // s1 >= 2
        // Per lane: t = [ a[s0], a[s0+2], b[s1-2], b[s1] ], then interleave
        // into [ t0, t2, t1, t3 ].
        __m512 t = _mm512_shuffle_ps(a.native(), b.native(),
                                     SIMDPP_SHUFFLE_MASK_4x4(s0,s0+2,s1-2,s1));
        return _mm512_permute_ps(t, _MM_SHUFFLE(3,1,2,0));
    } else { // s0 >= 2, s1 < 2
        // Per lane: t = [ b[s0-2], b[s0], a[s1], a[s1+2] ].
        __m512 t = _mm512_shuffle_ps(b.native(), a.native(),
                                     SIMDPP_SHUFFLE_MASK_4x4(s0-2,s0,s1,s1+2));
        return _mm512_permute_ps(t, _MM_SHUFFLE(3,1,2,0));
    }
}
#endif
132 | |
// Generic float32<N> overload: forwards a, b and the i_shuffle2x2<s0,s1>
// operation to SIMDPP_VEC_ARRAY_IMPL2 with float32<N> as the result type.
// NOTE(review): presumably the macro applies the operation to each native
// sub-vector of a and b and returns the combined result -- confirm against
// simdpp/detail/vector_array_macros.h.
template<unsigned s0, unsigned s1, unsigned N> SIMDPP_INL
float32<N> i_shuffle2x2(const float32<N>& a, const float32<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(float32<N>, (i_shuffle2x2<s0,s1>), a, b);
}
138 | |
139 | // float64 |
140 | |
141 | |
142 | template<unsigned s0, unsigned s1> SIMDPP_INL |
143 | float64<2> i_shuffle2x2(const float64<2>& a, const float64<2>& b) |
144 | { |
145 | static_assert(s0 < 4 && s1 < 4, "Selector out of range" ); |
146 | #if SIMDPP_USE_SSE2 |
147 | if (s0 < 2 && s1 < 2) { |
148 | return _mm_shuffle_pd(a.native(), a.native(), |
149 | SIMDPP_SHUFFLE_MASK_2x2(s0, s1)); |
150 | } else if (s0 >= 2 && s1 >= 2) { |
151 | return _mm_shuffle_pd(b.native(), b.native(), |
152 | SIMDPP_SHUFFLE_MASK_2x2(s0-2,s1-2)); |
153 | #if SIMDPP_USE_SSE4_1 |
154 | } else if (s0 == 0 && s1 == 3) { |
155 | return _mm_blend_pd(a.native(), b.native(), 0x2); |
156 | } else if (s0 == 2 && s1 == 1) { |
157 | return _mm_blend_pd(b.native(), a.native(), 0x2); |
158 | #endif |
159 | } else if (s0 < 2) { // s1 >= 2 |
160 | return _mm_shuffle_pd(a.native(), b.native(), |
161 | SIMDPP_SHUFFLE_MASK_2x2(s0, s1-2)); |
162 | } else { // s0 >= 2, s1 < 2 |
163 | return _mm_shuffle_pd(b.native(), a.native(), |
164 | SIMDPP_SHUFFLE_MASK_2x2(s1, s0-2)); |
165 | } |
166 | #elif SIMDPP_USE_NEON64 |
167 | return (float64<2>)detail::neon_shuffle_int64x2::shuffle2x2<s0, s1>(uint64<2>(a), uint64<2>(b)); |
168 | #elif SIMDPP_USE_VSX_206 |
169 | __vector double da = a.native(), db = b.native(); |
170 | if (s0 < 2 && s1 < 2) { |
171 | return vec_xxpermdi(da, da, SIMDPP_VSX_SHUFFLE_MASK_2x2(s0, s1)); |
172 | } else if (s0 >= 2 && s1 >= 2) { |
173 | return vec_xxpermdi(db, db, SIMDPP_VSX_SHUFFLE_MASK_2x2(s0-2,s1-2)); |
174 | } else if (s0 < 2) { // s1 >= 2 |
175 | return vec_xxpermdi(da, db, SIMDPP_VSX_SHUFFLE_MASK_2x2(s0, s1-2)); |
176 | } else { // s0 >= 2, s1 < 2 |
177 | return vec_xxpermdi(db, da, SIMDPP_VSX_SHUFFLE_MASK_2x2(s1, s0-2)); |
178 | } |
179 | #elif SIMDPP_USE_MSA |
180 | uint64<2> mask = make_uint(s0, s1); |
181 | return (v2f64) __msa_vshf_d((v2i64)mask.native(), |
182 | (v2i64)b.native(), |
183 | (v2i64)a.native()); |
184 | #elif SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC |
185 | float64<2> r; |
186 | r.el(0) = s0 < 2 ? a.el(s0) : b.el(s0-2); |
187 | r.el(1) = s1 < 2 ? a.el(s1) : b.el(s1-2); |
188 | return r; |
189 | #else |
190 | return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(int64<s0+4>, a, b); |
191 | #endif |
192 | } |
193 | |
#if SIMDPP_USE_AVX
/** AVX overload for float64<4>.

    Applies the same selection as the float64<2> overload (selectors 0..1
    pick from @a a, 2..3 pick from @a b) within each 128-bit lane.
*/
template<unsigned s0, unsigned s1> SIMDPP_INL
float64<4> i_shuffle2x2(const float64<4>& a, const float64<4>& b)
{
    static_assert(s0 < 4 && s1 < 4, "Selector out of range" );
    if (s0 < 2 && s1 < 2) {
        // Both selectors address a.
        return _mm256_shuffle_pd(a.native(), a.native(),
                                 SIMDPP_SHUFFLE_MASK_2x2_2(s0,s1));
    } else if (s0 >= 2 && s1 >= 2) {
        // Both selectors address b.
        return _mm256_shuffle_pd(b.native(), b.native(),
                                 SIMDPP_SHUFFLE_MASK_2x2_2(s0-2,s1-2));
    } else if (s0 == 0 && s1 == 3) {
        // Even elements from a, odd elements from b.
        return _mm256_blend_pd(a.native(), b.native(), 0xa);
    } else if (s0 == 2 && s1 == 1) {
        // Even elements from b, odd elements from a.
        return _mm256_blend_pd(b.native(), a.native(), 0xa);
    } else if (s0 < 2) { // s1 >= 2
        // Per lane: [ a[s0], b[s1-2] ]
        return _mm256_shuffle_pd(a.native(), b.native(),
                                 SIMDPP_SHUFFLE_MASK_2x2_2(s0,s1-2));
    } else { // s0 >= 2, s1 < 2
        // Per lane: [ b[s0-2], a[s1] ]. The first mask argument indexes the
        // first operand (b), the second one indexes a.
        return _mm256_shuffle_pd(b.native(), a.native(),
                                 SIMDPP_SHUFFLE_MASK_2x2_2(s0-2,s1));
    }
}
#endif
218 | |
#if SIMDPP_USE_AVX512F
/** AVX-512F overload for float64<8>.

    Applies the same selection as the float64<2> overload (selectors 0..1
    pick from @a a, 2..3 pick from @a b) within each 128-bit lane.
*/
template<unsigned s0, unsigned s1> SIMDPP_INL
float64<8> i_shuffle2x2(const float64<8>& a, const float64<8>& b)
{
    static_assert(s0 < 4 && s1 < 4, "Selector out of range" );
    if (s0 < 2 && s1 < 2) {
        // Both selectors address a.
        return _mm512_shuffle_pd(a.native(), a.native(),
                                 SIMDPP_SHUFFLE_MASK_2x2_4(s0,s1));
    } else if (s0 >= 2 && s1 >= 2) {
        // Both selectors address b.
        return _mm512_shuffle_pd(b.native(), b.native(),
                                 SIMDPP_SHUFFLE_MASK_2x2_4(s0-2,s1-2));
    } else if (s0 == 0 && s1 == 3) {
        // Set mask bits take the element from the last operand:
        // even elements from a, odd elements from b.
        return _mm512_mask_blend_pd(0xaa, a.native(), b.native());
    } else if (s0 == 2 && s1 == 1) {
        // Even elements from b, odd elements from a.
        return _mm512_mask_blend_pd(0xaa, b.native(), a.native());
    } else if (s0 < 2) { // s1 >= 2
        // Per lane: [ a[s0], b[s1-2] ]
        return _mm512_shuffle_pd(a.native(), b.native(),
                                 SIMDPP_SHUFFLE_MASK_2x2_4(s0,s1-2));
    } else { // s0 >= 2, s1 < 2
        // Per lane: [ b[s0-2], a[s1] ]. The first mask argument indexes the
        // first operand (b), the second one indexes a.
        return _mm512_shuffle_pd(b.native(), a.native(),
                                 SIMDPP_SHUFFLE_MASK_2x2_4(s0-2,s1));
    }
}
#endif
243 | |
// Generic float64<N> overload: forwards a, b and the i_shuffle2x2<s0,s1>
// operation to SIMDPP_VEC_ARRAY_IMPL2 with float64<N> as the result type.
// NOTE(review): presumably the macro applies the operation to each native
// sub-vector of a and b and returns the combined result -- confirm against
// simdpp/detail/vector_array_macros.h.
template<unsigned s0, unsigned s1, unsigned N> SIMDPP_INL
float64<N> i_shuffle2x2(const float64<N>& a, const float64<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(float64<N>, (i_shuffle2x2<s0,s1>), a, b);
}
249 | |
250 | // int32 |
251 | |
252 | template<unsigned s0, unsigned s1> SIMDPP_INL |
253 | uint32<4> i_shuffle2x2(const uint32<4>& a, const uint32<4>& b) |
254 | { |
255 | static_assert(s0 < 4 && s1 < 4, "Selector out of range" ); |
256 | #if SIMDPP_USE_NULL |
257 | uint32<4> r; |
258 | r.el(0) = s0 < 2 ? a.el(s0) : b.el(s0-2); |
259 | r.el(1) = s1 < 2 ? a.el(s1) : b.el(s1-2); |
260 | r.el(2) = s0 < 2 ? a.el(s0+2) : b.el(s0); |
261 | r.el(3) = s1 < 2 ? a.el(s1+2) : b.el(s1); |
262 | return r; |
263 | #elif SIMDPP_USE_SSE2 |
264 | if (s0 < 2 && s1 < 2) { |
265 | const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0,s1,s0+2,s1+2); |
266 | return _mm_shuffle_epi32(a.native(), mask); |
267 | } else if (s0 >= 2 && s1 >= 2) { |
268 | const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0-2,s1-2,s0,s1); |
269 | return _mm_shuffle_epi32(b.native(), mask); |
270 | #if SIMDPP_USE_SSE4_1 |
271 | } else if (s0 == 0 && s1 == 3) { |
272 | return _mm_blend_epi16(a.native(), b.native(), 0xcc); |
273 | } else if (s0 == 2 && s1 == 1) { |
274 | return _mm_blend_epi16(b.native(), a.native(), 0xcc); |
275 | #endif |
276 | } else if (s0 < 2) { // s1 >= 2 |
277 | float32<4> fa, fb; fa = a, fb = b; |
278 | __m128 t = _mm_shuffle_ps(fa.native(), fb.native(), |
279 | SIMDPP_SHUFFLE_MASK_4x4(s0,s0+1,s1-2,s1-1)); |
280 | return _mm_castps_si128(_mm_shuffle_ps(t, t, _MM_SHUFFLE(3,1,2,0))); |
281 | } else { // s0 >= 2, s1 < 2 |
282 | float32<4> fa, fb; fa = a, fb = b; |
283 | __m128 t = _mm_shuffle_ps(fb.native(), fa.native(), |
284 | SIMDPP_SHUFFLE_MASK_4x4(s1,s1+1,s0-2,s0-1)); |
285 | return _mm_castps_si128(_mm_shuffle_ps(t, t, _MM_SHUFFLE(3,1,2,0))); |
286 | } |
287 | #elif SIMDPP_USE_NEON |
288 | return detail::neon_shuffle_int32x4::shuffle2x2<s0,s1>(a, b); |
289 | #elif SIMDPP_USE_ALTIVEC |
290 | uint32<4> mask = make_shuffle_bytes16_mask<s0, s1>(mask); |
291 | return shuffle_bytes16(a, b, mask); |
292 | #elif SIMDPP_USE_MSA |
293 | const unsigned q0 = s0 < 2 ? s0 : s0 + 2; |
294 | const unsigned q1 = s1 < 2 ? s1 : s1 + 2; |
295 | uint32<4> mask = make_uint(q0,q1,q0+2,q1+2); |
296 | |
297 | return (v4u32) __msa_vshf_w((v4i32)(v4u32)mask, |
298 | (v4i32)(v4u32)b, |
299 | (v4i32)(v4u32)a); |
300 | #else |
301 | return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(int64<s0+4>, a, b); |
302 | #endif |
303 | } |
304 | |
#if SIMDPP_USE_AVX2
/** AVX2 overload for uint32<8>.

    Applies the same selection as the uint32<4> overload (selectors 0..1 pick
    from @a a, 2..3 pick from @a b) within each 128-bit lane.
*/
template<unsigned s0, unsigned s1> SIMDPP_INL
uint32<8> i_shuffle2x2(const uint32<8>& a, const uint32<8>& b)
{
    static_assert(s0 < 4 && s1 < 4, "Selector out of range" );
    if (s0 < 2 && s1 < 2) {
        // Both selectors address a.
        const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0,s1,s0+2,s1+2);
        return _mm256_shuffle_epi32(a.native(), mask);
    } else if (s0 >= 2 && s1 >= 2) {
        // Both selectors address b.
        const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0-2,s1-2,s0,s1);
        return _mm256_shuffle_epi32(b.native(), mask);
    } else if (s0 == 0 && s1 == 3) {
        // _mm256_blend_epi32 uses one immediate bit per 32-bit element, so
        // all eight bits are needed: 0xaa takes the odd elements of both
        // lanes from the second operand.
        return _mm256_blend_epi32(a.native(), b.native(), 0xaa);
    } else if (s0 == 2 && s1 == 1) {
        return _mm256_blend_epi32(b.native(), a.native(), 0xaa);
    } else if (s0 < 2) { // s1 >= 2
        // Two-source shuffle via the float domain. Per lane:
        // t = [ a[s0], a[s0+2], b[s1-2], b[s1] ], then interleave into
        // [ t0, t2, t1, t3 ].
        float32<8> fa, fb;
        fa = a;
        fb = b;
        __m256 t = _mm256_shuffle_ps(fa.native(), fb.native(),
                                     SIMDPP_SHUFFLE_MASK_4x4(s0,s0+2,s1-2,s1));
        return _mm256_castps_si256(_mm256_permute_ps(t, _MM_SHUFFLE(3,1,2,0)));
    } else { // s0 >= 2, s1 < 2
        // Per lane: t = [ b[s0-2], b[s0], a[s1], a[s1+2] ].
        float32<8> fa, fb;
        fa = a;
        fb = b;
        __m256 t = _mm256_shuffle_ps(fb.native(), fa.native(),
                                     SIMDPP_SHUFFLE_MASK_4x4(s0-2,s0,s1,s1+2));
        return _mm256_castps_si256(_mm256_permute_ps(t, _MM_SHUFFLE(3,1,2,0)));
    }
}
#endif
333 | |
#if SIMDPP_USE_AVX512F
/** AVX-512F overload for uint32<16>.

    Applies the same selection as the uint32<4> overload (selectors 0..1 pick
    from @a a, 2..3 pick from @a b) within each 128-bit lane.
*/
template<unsigned s0, unsigned s1> SIMDPP_INL
uint32<16> i_shuffle2x2(const uint32<16>& a, const uint32<16>& b)
{
    static_assert(s0 < 4 && s1 < 4, "Selector out of range" );
    if (s0 < 2 && s1 < 2) {
        // Both selectors address a.
        const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0,s1,s0+2,s1+2);
        return _mm512_shuffle_epi32(a.native(), _MM_PERM_ENUM(mask));
    } else if (s0 >= 2 && s1 >= 2) {
        // Both selectors address b.
        const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0-2,s1-2,s0,s1);
        return _mm512_shuffle_epi32(b.native(), _MM_PERM_ENUM(mask));
    } else if (s0 == 0 && s1 == 3) {
        // Set mask bits take the element from the last operand:
        // even elements from a, odd elements from b.
        return _mm512_mask_blend_epi32(0xaaaa, a.native(), b.native());
    } else if (s0 == 2 && s1 == 1) {
        // Even elements from b, odd elements from a.
        return _mm512_mask_blend_epi32(0xaaaa, b.native(), a.native());
    } else if (s0 < 2) { // s1 >= 2
        // Two-source shuffle via the float domain. Per lane:
        // t = [ a[s0], a[s0+2], b[s1-2], b[s1] ], then interleave into
        // [ t0, t2, t1, t3 ].
        float32<16> fa, fb;
        fa = a;
        fb = b;
        __m512 t = _mm512_shuffle_ps(fa.native(), fb.native(),
                                     SIMDPP_SHUFFLE_MASK_4x4(s0,s0+2,s1-2,s1));
        return _mm512_castps_si512(_mm512_permute_ps(t, _MM_SHUFFLE(3,1,2,0)));
    } else { // s0 >= 2, s1 < 2
        // Per lane: t = [ b[s0-2], b[s0], a[s1], a[s1+2] ].
        float32<16> fa, fb;
        fa = a;
        fb = b;
        __m512 t = _mm512_shuffle_ps(fb.native(), fa.native(),
                                     SIMDPP_SHUFFLE_MASK_4x4(s0-2,s0,s1,s1+2));
        return _mm512_castps_si512(_mm512_permute_ps(t, _MM_SHUFFLE(3,1,2,0)));
    }
}
#endif
362 | |
// Generic uint32<N> overload: forwards a, b and the i_shuffle2x2<s0,s1>
// operation to SIMDPP_VEC_ARRAY_IMPL2 with uint32<N> as the result type.
// NOTE(review): presumably the macro applies the operation to each native
// sub-vector of a and b and returns the combined result -- confirm against
// simdpp/detail/vector_array_macros.h.
template<unsigned s0, unsigned s1, unsigned N> SIMDPP_INL
uint32<N> i_shuffle2x2(const uint32<N>& a, const uint32<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(uint32<N>, (i_shuffle2x2<s0,s1>), a, b);
}
368 | |
369 | // int64 |
370 | |
371 | template<unsigned s0, unsigned s1> SIMDPP_INL |
372 | uint64<2> i_shuffle2x2(const uint64<2>& a, const uint64<2>& b) |
373 | { |
374 | static_assert(s0 < 4 && s1 < 4, "Selector out of range" ); |
375 | #if SIMDPP_USE_SSE2 |
376 | if (s0 < 2 && s1 < 2) { |
377 | const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0*2, s0*2+1, s1*2, s1*2+1); |
378 | return _mm_shuffle_epi32(a.native(), mask); |
379 | } else if (s0 >= 2 && s1 >= 2) { |
380 | const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0*2, s0*2+1, s1*2, s1*2+1); |
381 | return _mm_shuffle_epi32(b.native(), mask); |
382 | } else if (s0 == 0 && s1 == 2) { |
383 | return _mm_unpacklo_epi64(a.native(), b.native()); |
384 | } else if (s0 == 2 && s1 == 0) { |
385 | return _mm_unpacklo_epi64(b.native(), a.native()); |
386 | } else if (s0 == 1 && s1 == 3) { |
387 | return _mm_unpackhi_epi64(a.native(), b.native()); |
388 | } else if (s0 == 3 && s1 == 1) { |
389 | return _mm_unpackhi_epi64(b.native(), a.native()); |
390 | #if SIMDPP_USE_SSE4_1 |
391 | } else if (s0 == 0 && s1 == 3) { |
392 | return _mm_blend_epi16(a.native(), b.native(), 0xf0); |
393 | } else if (s0 == 2 && s1 == 1) { |
394 | return _mm_blend_epi16(b.native(), a.native(), 0xf0); |
395 | #endif |
396 | #if SIMDPP_USE_SSSE3 |
397 | } else if (s0 == 1 && s1 == 2) { |
398 | return _mm_alignr_epi8(b.native(), a.native(), 8); |
399 | } else if (s0 == 3 && s1 == 0) { |
400 | return _mm_alignr_epi8(a.native(), b.native(), 8); |
401 | #endif |
402 | } else if (s0 < 2) { // s1 >= 2 |
403 | __m128d na = _mm_castsi128_pd(a.native()); |
404 | __m128d nb = _mm_castsi128_pd(b.native()); |
405 | __m128d res = _mm_shuffle_pd(na, nb, SIMDPP_SHUFFLE_MASK_2x2(s0, s1-2)); |
406 | return _mm_castpd_si128(res); |
407 | } else { // s0 >= 2, s1 < 2 |
408 | __m128d na = _mm_castsi128_pd(a.native()); |
409 | __m128d nb = _mm_castsi128_pd(b.native()); |
410 | __m128d res = _mm_shuffle_pd(nb, na, SIMDPP_SHUFFLE_MASK_2x2(s1, s0-2)); |
411 | return _mm_castpd_si128(res); |
412 | } |
413 | #elif SIMDPP_USE_NEON |
414 | return detail::neon_shuffle_int64x2::shuffle2x2<s0, s1>(a, b); |
415 | #elif SIMDPP_USE_VSX_207 |
416 | __vector uint64_t da = a.native(), db = b.native(); |
417 | if (s0 < 2 && s1 < 2) { |
418 | return vec_xxpermdi(da, da, SIMDPP_VSX_SHUFFLE_MASK_2x2(s0, s1)); |
419 | } else if (s0 >= 2 && s1 >= 2) { |
420 | return vec_xxpermdi(db, db, SIMDPP_VSX_SHUFFLE_MASK_2x2(s0-2,s1-2)); |
421 | } else if (s0 < 2) { // s1 >= 2 |
422 | return vec_xxpermdi(da, db, SIMDPP_VSX_SHUFFLE_MASK_2x2(s0, s1-2)); |
423 | } else { // s0 >= 2, s1 < 2 |
424 | return vec_xxpermdi(db, da, SIMDPP_VSX_SHUFFLE_MASK_2x2(s1, s0-2)); |
425 | } |
426 | #elif SIMDPP_USE_MSA |
427 | uint64<2> mask = make_uint(s0, s1); |
428 | |
429 | return (v2u64) __msa_vshf_d((v2i64)mask.native(), |
430 | (v2i64)b.native(), |
431 | (v2i64)a.native()); |
432 | #elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC |
433 | uint64<2> r; |
434 | r.el(0) = s0 < 2 ? a.el(s0) : b.el(s0-2); |
435 | r.el(1) = s1 < 2 ? a.el(s1) : b.el(s1-2); |
436 | return r; |
437 | #else |
438 | return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(int64<s0+4>, a, b); |
439 | #endif |
440 | } |
441 | |
#if SIMDPP_USE_AVX2
/** AVX2 overload for uint64<4>.

    Applies the same selection as the uint64<2> overload (selectors 0..1 pick
    from @a a, 2..3 pick from @a b) within each 128-bit lane. Every one of
    the sixteen selector combinations maps to a single instruction.
*/
template<unsigned s0, unsigned s1> SIMDPP_INL
uint64<4> i_shuffle2x2(const uint64<4>& a, const uint64<4>& b)
{
    static_assert(s0 < 4 && s1 < 4, "Selector out of range" );
    if (s0 < 2 && s1 < 2) {
        // Each 64-bit element is moved as a pair of adjacent 32-bit elements.
        const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0*2, s0*2+1, s1*2, s1*2+1);
        return _mm256_shuffle_epi32(a.native(), mask);
    } else if (s0 >= 2 && s1 >= 2) {
        // Same as above for b; the selectors must first be rebased to 0..1
        // so that every 32-bit index fits the 2-bit mask fields.
        const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4((s0-2)*2, (s0-2)*2+1,
                                                      (s1-2)*2, (s1-2)*2+1);
        return _mm256_shuffle_epi32(b.native(), mask);
    } else if (s0 == 0 && s1 == 2) {
        // Per lane: [ a0, b0 ]
        return _mm256_unpacklo_epi64(a.native(), b.native());
    } else if (s0 == 2 && s1 == 0) {
        // Per lane: [ b0, a0 ]
        return _mm256_unpacklo_epi64(b.native(), a.native());
    } else if (s0 == 1 && s1 == 3) {
        // Per lane: [ a1, b1 ]
        return _mm256_unpackhi_epi64(a.native(), b.native());
    } else if (s0 == 3 && s1 == 1) {
        // Per lane: [ b1, a1 ]
        return _mm256_unpackhi_epi64(b.native(), a.native());
    } else if (s0 == 0 && s1 == 3) {
        // 0xcc takes 32-bit elements 2,3,6,7 (the odd 64-bit elements) from
        // the second operand.
        return _mm256_blend_epi32(a.native(), b.native(), 0xcc);
    } else if (s0 == 2 && s1 == 1) {
        return _mm256_blend_epi32(b.native(), a.native(), 0xcc);
    } else if (s0 == 1 && s1 == 2) {
        // Per lane: [ a1, b0 ]
        return _mm256_alignr_epi8(b.native(), a.native(), 8);
    } else { // if (s0 == 3 && s1 == 0)
        // Per lane: [ b1, a0 ]; the only combination left at this point.
        return _mm256_alignr_epi8(a.native(), b.native(), 8);
    }
}
#endif
472 | |
#if SIMDPP_USE_AVX512F
/** AVX-512F overload for uint64<8>.

    Applies the same selection as the uint64<2> overload (selectors 0..1 pick
    from @a a, 2..3 pick from @a b) within each 128-bit lane.
*/
template<unsigned s0, unsigned s1> SIMDPP_INL
uint64<8> i_shuffle2x2(const uint64<8>& a, const uint64<8>& b)
{
    static_assert(s0 < 4 && s1 < 4, "Selector out of range" );
    if (s0 < 2 && s1 < 2) {
        // _mm512_permutex_epi64 indexes four 64-bit elements per 256-bit
        // block, i.e. two 128-bit lanes at once, hence the +2 for the
        // second lane of each block.
        const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0,s1,s0+2,s1+2);
        return _mm512_permutex_epi64(a.native(), mask);
    } else if (s0 >= 2 && s1 >= 2) {
        const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0-2,s1-2,s0,s1);
        return _mm512_permutex_epi64(b.native(), mask);
    } else if (s0 == 0 && s1 == 2) {
        // Per lane: [ a0, b0 ]
        return _mm512_unpacklo_epi64(a.native(), b.native());
    } else if (s0 == 2 && s1 == 0) {
        // Per lane: [ b0, a0 ]
        return _mm512_unpacklo_epi64(b.native(), a.native());
    } else if (s0 == 1 && s1 == 3) {
        // Per lane: [ a1, b1 ]
        return _mm512_unpackhi_epi64(a.native(), b.native());
    } else if (s0 == 3 && s1 == 1) {
        // Per lane: [ b1, a1 ]
        return _mm512_unpackhi_epi64(b.native(), a.native());
    } else if (s0 == 0 && s1 == 3) {
        // Set mask bits take the element from the last operand:
        // even elements from a, odd elements from b.
        return _mm512_mask_blend_epi64(0xaa, a.native(), b.native());
    } else if (s0 == 2 && s1 == 1) {
        // Even elements from b, odd elements from a.
        return _mm512_mask_blend_epi64(0xaa, b.native(), a.native());
    } else if (s0 < 2) { // s1 >= 2
        // Fallback through the double domain. Per lane: [ a[s0], b[s1-2] ].
        __m512d na = _mm512_castsi512_pd(a.native());
        __m512d nb = _mm512_castsi512_pd(b.native());
        __m512d res = _mm512_shuffle_pd(na, nb, SIMDPP_SHUFFLE_MASK_2x2_4(s0, s1-2));
        return _mm512_castpd_si512(res);
    } else { // s0 >= 2, s1 < 2
        // Per lane: [ b[s0-2], a[s1] ]. The first mask argument indexes the
        // first operand (nb), the second one indexes na.
        __m512d na = _mm512_castsi512_pd(a.native());
        __m512d nb = _mm512_castsi512_pd(b.native());
        __m512d res = _mm512_shuffle_pd(nb, na, SIMDPP_SHUFFLE_MASK_2x2_4(s0-2, s1));
        return _mm512_castpd_si512(res);
    }
    /*  Disabled alternative for the (1,2) and (3,0) combinations, kept for
        reference. The original note attributes this to a GCC bug; those
        combinations are handled by the two _mm512_shuffle_pd branches above.

    } else if (s0 == 1 && s1 == 2) {
        return _mm512_alignr_epi8(b.native(), a.native(), 8);
    } else if (s0 == 3 && s1 == 0) {
        return _mm512_alignr_epi8(a.native(), b.native(), 8);
    }*/
}
#endif
515 | |
// Generic uint64<N> overload: forwards a, b and the i_shuffle2x2<s0,s1>
// operation to SIMDPP_VEC_ARRAY_IMPL2 with uint64<N> as the result type.
// NOTE(review): presumably the macro applies the operation to each native
// sub-vector of a and b and returns the combined result -- confirm against
// simdpp/detail/vector_array_macros.h.
template<unsigned s0, unsigned s1, unsigned N> SIMDPP_INL
uint64<N> i_shuffle2x2(const uint64<N>& a, const uint64<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(uint64<N>, (i_shuffle2x2<s0,s1>), a, b);
}
521 | |
522 | |
523 | } // namespace insn |
524 | } // namespace detail |
525 | } // namespace SIMDPP_ARCH_NAMESPACE |
526 | } // namespace simdpp |
527 | |
528 | #endif |
529 | |
530 | |