/*  Copyright (C) 2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_SHUFFLE2x2_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_SHUFFLE2x2_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/make_shuffle_bytes_mask.h>
#include <simdpp/core/shuffle_bytes16.h>
#include <simdpp/detail/not_implemented.h>
#include <simdpp/detail/shuffle/neon_int32x4.h>
#include <simdpp/detail/shuffle/neon_int64x2.h>
#include <simdpp/detail/shuffle/shuffle_mask.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

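// i_shuffle2x2 selects two elements out of each half of every 128-bit lane
// of the sources. Within a lane, selector values 0..1 pick elements of the
// first source and values 2..3 pick elements of the second source, as
// defined by the scalar (SIMDPP_USE_NULL) fallbacks below. For a single
// float32<4> lane:
//
//     r0 = s0 < 2 ? a[s0]   : b[s0-2]
//     r1 = s1 < 2 ? a[s1]   : b[s1-2]
//     r2 = s0 < 2 ? a[s0+2] : b[s0]
//     r3 = s1 < 2 ? a[s1+2] : b[s1]
//
// e.g. i_shuffle2x2<1,2>(a, b) yields { a1, b0, a3, b2 }.
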
// -----------------------------------------------------------------------------
// float32

template<unsigned s0, unsigned s1> SIMDPP_INL
float32<4> i_shuffle2x2(const float32<4>& a, const float32<4>& b)
{
    static_assert(s0 < 4 && s1 < 4, "Selector out of range");
#if SIMDPP_USE_NULL
    float32<4> r;
    r.el(0) = s0 < 2 ? a.el(s0) : b.el(s0-2);
    r.el(1) = s1 < 2 ? a.el(s1) : b.el(s1-2);
    r.el(2) = s0 < 2 ? a.el(s0+2) : b.el(s0);
    r.el(3) = s1 < 2 ? a.el(s1+2) : b.el(s1);
    return r;
#elif SIMDPP_USE_SSE2
    if (s0 < 2 && s1 < 2) {
        return _mm_shuffle_ps(a.native(), a.native(),
                              SIMDPP_SHUFFLE_MASK_4x4(s0,s1,s0+2,s1+2));
    } else if (s0 >= 2 && s1 >= 2) {
        return _mm_shuffle_ps(b.native(), b.native(),
                              SIMDPP_SHUFFLE_MASK_4x4(s0-2,s1-2,s0,s1));
#if SIMDPP_USE_SSE4_1
    } else if (s0 == 0 && s1 == 3) {
        return _mm_blend_ps(a.native(), b.native(), 0xa);
    } else if (s0 == 2 && s1 == 1) {
        return _mm_blend_ps(b.native(), a.native(), 0xa);
#endif
    } else if (s0 < 2) { // s1 >= 2
        // gather { a[s0], a[s0+2], b[s1-2], b[s1] }, then interleave the pairs
        __m128 t = _mm_shuffle_ps(a.native(), b.native(),
                                  SIMDPP_SHUFFLE_MASK_4x4(s0,s0+2,s1-2,s1));
        return _mm_shuffle_ps(t, t, _MM_SHUFFLE(3,1,2,0));
    } else { // s0 >= 2, s1 < 2
        // gather { b[s0-2], b[s0], a[s1], a[s1+2] }, then interleave the pairs
        __m128 t = _mm_shuffle_ps(b.native(), a.native(),
                                  SIMDPP_SHUFFLE_MASK_4x4(s0-2,s0,s1,s1+2));
        return _mm_shuffle_ps(t, t, _MM_SHUFFLE(3,1,2,0));
    }
#elif SIMDPP_USE_NEON
    return (float32<4>) detail::neon_shuffle_int32x4::shuffle2x2<s0,s1>(float32<4>(a), float32<4>(b));
#elif SIMDPP_USE_ALTIVEC
    uint32<4> mask = make_shuffle_bytes16_mask<s0, s1>(mask);
    return shuffle_bytes16(a, b, mask);
#elif SIMDPP_USE_MSA
    const unsigned q0 = s0 < 2 ? s0 : s0 + 2;
    const unsigned q1 = s1 < 2 ? s1 : s1 + 2;
    uint32<4> mask = make_uint(q0,q1,q0+2,q1+2);
    return (v4f32) __msa_vshf_w((v4i32)mask.native(),
                                (v4i32)b.native(),
                                (v4i32)a.native());
#else
    return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(int64<s0+4>, a, b);
#endif
}

#if SIMDPP_USE_AVX
template<unsigned s0, unsigned s1> SIMDPP_INL
float32<8> i_shuffle2x2(const float32<8>& a, const float32<8>& b)
{
    static_assert(s0 < 4 && s1 < 4, "Selector out of range");
    if (s0 < 2 && s1 < 2) {
        return _mm256_permute_ps(a.native(), SIMDPP_SHUFFLE_MASK_4x4(s0,s1,s0+2,s1+2));
    } else if (s0 >= 2 && s1 >= 2) {
        return _mm256_permute_ps(b.native(), SIMDPP_SHUFFLE_MASK_4x4(s0-2,s1-2,s0,s1));
    } else if (s0 == 0 && s1 == 3) {
        return _mm256_blend_ps(a.native(), b.native(), 0xaa);
    } else if (s0 == 2 && s1 == 1) {
        return _mm256_blend_ps(b.native(), a.native(), 0xaa);
    } else if (s0 < 2) { // s1 >= 2
        __m256 t = _mm256_shuffle_ps(a.native(), b.native(),
                                     SIMDPP_SHUFFLE_MASK_4x4(s0,s0+2,s1-2,s1));
        return _mm256_permute_ps(t, _MM_SHUFFLE(3,1,2,0));
    } else { // s0 >= 2, s1 < 2
        __m256 t = _mm256_shuffle_ps(b.native(), a.native(),
                                     SIMDPP_SHUFFLE_MASK_4x4(s0-2,s0,s1,s1+2));
        return _mm256_permute_ps(t, _MM_SHUFFLE(3,1,2,0));
    }
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned s0, unsigned s1> SIMDPP_INL
float32<16> i_shuffle2x2(const float32<16>& a, const float32<16>& b)
{
    static_assert(s0 < 4 && s1 < 4, "Selector out of range");
    if (s0 < 2 && s1 < 2) {
        return _mm512_permute_ps(a.native(), SIMDPP_SHUFFLE_MASK_4x4(s0,s1,s0+2,s1+2));
    } else if (s0 >= 2 && s1 >= 2) {
        return _mm512_permute_ps(b.native(), SIMDPP_SHUFFLE_MASK_4x4(s0-2,s1-2,s0,s1));
    } else if (s0 == 0 && s1 == 3) {
        return _mm512_mask_blend_ps(0xaaaa, a.native(), b.native());
    } else if (s0 == 2 && s1 == 1) {
        return _mm512_mask_blend_ps(0xaaaa, b.native(), a.native());
    } else if (s0 < 2) { // s1 >= 2
        __m512 t = _mm512_shuffle_ps(a.native(), b.native(),
                                     SIMDPP_SHUFFLE_MASK_4x4(s0,s0+2,s1-2,s1));
        return _mm512_permute_ps(t, _MM_SHUFFLE(3,1,2,0));
    } else { // s0 >= 2, s1 < 2
        __m512 t = _mm512_shuffle_ps(b.native(), a.native(),
                                     SIMDPP_SHUFFLE_MASK_4x4(s0-2,s0,s1,s1+2));
        return _mm512_permute_ps(t, _MM_SHUFFLE(3,1,2,0));
    }
}
#endif

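// Vectors wider than the native SIMD register are shuffled by applying the
// operation to each pair of corresponding native registers (this is what
// SIMDPP_VEC_ARRAY_IMPL2 expands to).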
template<unsigned s0, unsigned s1, unsigned N> SIMDPP_INL
float32<N> i_shuffle2x2(const float32<N>& a, const float32<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(float32<N>, (i_shuffle2x2<s0,s1>), a, b);
}

// float64

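// Same selection rule as above, with two elements per 128-bit lane (see the
// scalar fallback below): r0 = s0 < 2 ? a[s0] : b[s0-2] and
// r1 = s1 < 2 ? a[s1] : b[s1-2].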
template<unsigned s0, unsigned s1> SIMDPP_INL
float64<2> i_shuffle2x2(const float64<2>& a, const float64<2>& b)
{
    static_assert(s0 < 4 && s1 < 4, "Selector out of range");
#if SIMDPP_USE_SSE2
    if (s0 < 2 && s1 < 2) {
        return _mm_shuffle_pd(a.native(), a.native(),
                              SIMDPP_SHUFFLE_MASK_2x2(s0, s1));
    } else if (s0 >= 2 && s1 >= 2) {
        return _mm_shuffle_pd(b.native(), b.native(),
                              SIMDPP_SHUFFLE_MASK_2x2(s0-2,s1-2));
#if SIMDPP_USE_SSE4_1
    } else if (s0 == 0 && s1 == 3) {
        return _mm_blend_pd(a.native(), b.native(), 0x2);
    } else if (s0 == 2 && s1 == 1) {
        return _mm_blend_pd(b.native(), a.native(), 0x2);
#endif
    } else if (s0 < 2) { // s1 >= 2
        return _mm_shuffle_pd(a.native(), b.native(),
                              SIMDPP_SHUFFLE_MASK_2x2(s0, s1-2));
    } else { // s0 >= 2, s1 < 2
        return _mm_shuffle_pd(b.native(), a.native(),
                              SIMDPP_SHUFFLE_MASK_2x2(s0-2, s1));
    }
#elif SIMDPP_USE_NEON64
    return (float64<2>) detail::neon_shuffle_int64x2::shuffle2x2<s0, s1>(uint64<2>(a), uint64<2>(b));
#elif SIMDPP_USE_VSX_206
    __vector double da = a.native(), db = b.native();
    if (s0 < 2 && s1 < 2) {
        return vec_xxpermdi(da, da, SIMDPP_VSX_SHUFFLE_MASK_2x2(s0, s1));
    } else if (s0 >= 2 && s1 >= 2) {
        return vec_xxpermdi(db, db, SIMDPP_VSX_SHUFFLE_MASK_2x2(s0-2,s1-2));
    } else if (s0 < 2) { // s1 >= 2
        return vec_xxpermdi(da, db, SIMDPP_VSX_SHUFFLE_MASK_2x2(s0, s1-2));
    } else { // s0 >= 2, s1 < 2
        return vec_xxpermdi(db, da, SIMDPP_VSX_SHUFFLE_MASK_2x2(s0-2, s1));
    }
#elif SIMDPP_USE_MSA
    uint64<2> mask = make_uint(s0, s1);
    return (v2f64) __msa_vshf_d((v2i64)mask.native(),
                                (v2i64)b.native(),
                                (v2i64)a.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC
    float64<2> r;
    r.el(0) = s0 < 2 ? a.el(s0) : b.el(s0-2);
    r.el(1) = s1 < 2 ? a.el(s1) : b.el(s1-2);
    return r;
#else
    return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(int64<s0+4>, a, b);
#endif
}

#if SIMDPP_USE_AVX
template<unsigned s0, unsigned s1> SIMDPP_INL
float64<4> i_shuffle2x2(const float64<4>& a, const float64<4>& b)
{
    static_assert(s0 < 4 && s1 < 4, "Selector out of range");
    if (s0 < 2 && s1 < 2) {
        return _mm256_shuffle_pd(a.native(), a.native(),
                                 SIMDPP_SHUFFLE_MASK_2x2_2(s0,s1));
    } else if (s0 >= 2 && s1 >= 2) {
        return _mm256_shuffle_pd(b.native(), b.native(),
                                 SIMDPP_SHUFFLE_MASK_2x2_2(s0-2,s1-2));
    } else if (s0 == 0 && s1 == 3) {
        return _mm256_blend_pd(a.native(), b.native(), 0xa);
    } else if (s0 == 2 && s1 == 1) {
        return _mm256_blend_pd(b.native(), a.native(), 0xa);
    } else if (s0 < 2) { // s1 >= 2
        return _mm256_shuffle_pd(a.native(), b.native(),
                                 SIMDPP_SHUFFLE_MASK_2x2_2(s0,s1-2));
    } else { // s0 >= 2, s1 < 2
        return _mm256_shuffle_pd(b.native(), a.native(),
                                 SIMDPP_SHUFFLE_MASK_2x2_2(s0-2,s1));
    }
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned s0, unsigned s1> SIMDPP_INL
float64<8> i_shuffle2x2(const float64<8>& a, const float64<8>& b)
{
    static_assert(s0 < 4 && s1 < 4, "Selector out of range");
    if (s0 < 2 && s1 < 2) {
        return _mm512_shuffle_pd(a.native(), a.native(),
                                 SIMDPP_SHUFFLE_MASK_2x2_4(s0,s1));
    } else if (s0 >= 2 && s1 >= 2) {
        return _mm512_shuffle_pd(b.native(), b.native(),
                                 SIMDPP_SHUFFLE_MASK_2x2_4(s0-2,s1-2));
    } else if (s0 == 0 && s1 == 3) {
        return _mm512_mask_blend_pd(0xaa, a.native(), b.native());
    } else if (s0 == 2 && s1 == 1) {
        return _mm512_mask_blend_pd(0xaa, b.native(), a.native());
    } else if (s0 < 2) { // s1 >= 2
        return _mm512_shuffle_pd(a.native(), b.native(),
                                 SIMDPP_SHUFFLE_MASK_2x2_4(s0,s1-2));
    } else { // s0 >= 2, s1 < 2
        return _mm512_shuffle_pd(b.native(), a.native(),
                                 SIMDPP_SHUFFLE_MASK_2x2_4(s0-2,s1));
    }
}
#endif

template<unsigned s0, unsigned s1, unsigned N> SIMDPP_INL
float64<N> i_shuffle2x2(const float64<N>& a, const float64<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(float64<N>, (i_shuffle2x2<s0,s1>), a, b);
}

// int32

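// Integer shuffles follow the same selection pattern as float32. SSE2 has no
// two-source 32-bit shuffle, so the mixed a/b cases are routed through the
// floating-point domain via _mm_shuffle_ps; this may incur a domain-crossing
// penalty on some microarchitectures, but keeps the instruction count low.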
template<unsigned s0, unsigned s1> SIMDPP_INL
uint32<4> i_shuffle2x2(const uint32<4>& a, const uint32<4>& b)
{
    static_assert(s0 < 4 && s1 < 4, "Selector out of range");
#if SIMDPP_USE_NULL
    uint32<4> r;
    r.el(0) = s0 < 2 ? a.el(s0) : b.el(s0-2);
    r.el(1) = s1 < 2 ? a.el(s1) : b.el(s1-2);
    r.el(2) = s0 < 2 ? a.el(s0+2) : b.el(s0);
    r.el(3) = s1 < 2 ? a.el(s1+2) : b.el(s1);
    return r;
#elif SIMDPP_USE_SSE2
    if (s0 < 2 && s1 < 2) {
        const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0,s1,s0+2,s1+2);
        return _mm_shuffle_epi32(a.native(), mask);
    } else if (s0 >= 2 && s1 >= 2) {
        const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0-2,s1-2,s0,s1);
        return _mm_shuffle_epi32(b.native(), mask);
#if SIMDPP_USE_SSE4_1
    } else if (s0 == 0 && s1 == 3) {
        return _mm_blend_epi16(a.native(), b.native(), 0xcc);
    } else if (s0 == 2 && s1 == 1) {
        return _mm_blend_epi16(b.native(), a.native(), 0xcc);
#endif
    } else if (s0 < 2) { // s1 >= 2
        float32<4> fa, fb; fa = a; fb = b;
        __m128 t = _mm_shuffle_ps(fa.native(), fb.native(),
                                  SIMDPP_SHUFFLE_MASK_4x4(s0,s0+2,s1-2,s1));
        return _mm_castps_si128(_mm_shuffle_ps(t, t, _MM_SHUFFLE(3,1,2,0)));
    } else { // s0 >= 2, s1 < 2
        float32<4> fa, fb; fa = a; fb = b;
        __m128 t = _mm_shuffle_ps(fb.native(), fa.native(),
                                  SIMDPP_SHUFFLE_MASK_4x4(s0-2,s0,s1,s1+2));
        return _mm_castps_si128(_mm_shuffle_ps(t, t, _MM_SHUFFLE(3,1,2,0)));
    }
#elif SIMDPP_USE_NEON
    return detail::neon_shuffle_int32x4::shuffle2x2<s0,s1>(a, b);
#elif SIMDPP_USE_ALTIVEC
    uint32<4> mask = make_shuffle_bytes16_mask<s0, s1>(mask);
    return shuffle_bytes16(a, b, mask);
#elif SIMDPP_USE_MSA
    const unsigned q0 = s0 < 2 ? s0 : s0 + 2;
    const unsigned q1 = s1 < 2 ? s1 : s1 + 2;
    uint32<4> mask = make_uint(q0,q1,q0+2,q1+2);

    return (v4u32) __msa_vshf_w((v4i32)mask.native(),
                                (v4i32)b.native(),
                                (v4i32)a.native());
#else
    return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(int64<s0+4>, a, b);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned s0, unsigned s1> SIMDPP_INL
uint32<8> i_shuffle2x2(const uint32<8>& a, const uint32<8>& b)
{
    static_assert(s0 < 4 && s1 < 4, "Selector out of range");
    if (s0 < 2 && s1 < 2) {
        const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0,s1,s0+2,s1+2);
        return _mm256_shuffle_epi32(a.native(), mask);
    } else if (s0 >= 2 && s1 >= 2) {
        const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0-2,s1-2,s0,s1);
        return _mm256_shuffle_epi32(b.native(), mask);
    } else if (s0 == 0 && s1 == 3) {
        return _mm256_blend_epi32(a.native(), b.native(), 0xaa);
    } else if (s0 == 2 && s1 == 1) {
        return _mm256_blend_epi32(b.native(), a.native(), 0xaa);
    } else if (s0 < 2) { // s1 >= 2
        float32<8> fa, fb; fa = a; fb = b;
        __m256 t = _mm256_shuffle_ps(fa.native(), fb.native(),
                                     SIMDPP_SHUFFLE_MASK_4x4(s0,s0+2,s1-2,s1));
        return _mm256_castps_si256(_mm256_permute_ps(t, _MM_SHUFFLE(3,1,2,0)));
    } else { // s0 >= 2, s1 < 2
        float32<8> fa, fb; fa = a; fb = b;
        __m256 t = _mm256_shuffle_ps(fb.native(), fa.native(),
                                     SIMDPP_SHUFFLE_MASK_4x4(s0-2,s0,s1,s1+2));
        return _mm256_castps_si256(_mm256_permute_ps(t, _MM_SHUFFLE(3,1,2,0)));
    }
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned s0, unsigned s1> SIMDPP_INL
uint32<16> i_shuffle2x2(const uint32<16>& a, const uint32<16>& b)
{
    static_assert(s0 < 4 && s1 < 4, "Selector out of range");
    if (s0 < 2 && s1 < 2) {
        const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0,s1,s0+2,s1+2);
        return _mm512_shuffle_epi32(a.native(), _MM_PERM_ENUM(mask));
    } else if (s0 >= 2 && s1 >= 2) {
        const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0-2,s1-2,s0,s1);
        return _mm512_shuffle_epi32(b.native(), _MM_PERM_ENUM(mask));
    } else if (s0 == 0 && s1 == 3) {
        return _mm512_mask_blend_epi32(0xaaaa, a.native(), b.native());
    } else if (s0 == 2 && s1 == 1) {
        return _mm512_mask_blend_epi32(0xaaaa, b.native(), a.native());
    } else if (s0 < 2) { // s1 >= 2
        float32<16> fa, fb; fa = a; fb = b;
        __m512 t = _mm512_shuffle_ps(fa.native(), fb.native(),
                                     SIMDPP_SHUFFLE_MASK_4x4(s0,s0+2,s1-2,s1));
        return _mm512_castps_si512(_mm512_permute_ps(t, _MM_SHUFFLE(3,1,2,0)));
    } else { // s0 >= 2, s1 < 2
        float32<16> fa, fb; fa = a; fb = b;
        __m512 t = _mm512_shuffle_ps(fb.native(), fa.native(),
                                     SIMDPP_SHUFFLE_MASK_4x4(s0-2,s0,s1,s1+2));
        return _mm512_castps_si512(_mm512_permute_ps(t, _MM_SHUFFLE(3,1,2,0)));
    }
}
#endif

template<unsigned s0, unsigned s1, unsigned N> SIMDPP_INL
uint32<N> i_shuffle2x2(const uint32<N>& a, const uint32<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(uint32<N>, (i_shuffle2x2<s0,s1>), a, b);
}

// int64

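// Many 64-bit selector combinations map onto dedicated instructions:
// <0,2>/<2,0> and <1,3>/<3,1> are unpacklo/unpackhi, and <1,2>/<3,0> are
// byte rotations on SSSE3, e.g. i_shuffle2x2<1,2>(a, b) = { a1, b0 }, which
// is exactly _mm_alignr_epi8(b, a, 8). The remaining mixed cases go through
// the double-precision shuffle domain.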
template<unsigned s0, unsigned s1> SIMDPP_INL
uint64<2> i_shuffle2x2(const uint64<2>& a, const uint64<2>& b)
{
    static_assert(s0 < 4 && s1 < 4, "Selector out of range");
#if SIMDPP_USE_SSE2
    if (s0 < 2 && s1 < 2) {
        const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0*2, s0*2+1, s1*2, s1*2+1);
        return _mm_shuffle_epi32(a.native(), mask);
    } else if (s0 >= 2 && s1 >= 2) {
        const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0*2, s0*2+1, s1*2, s1*2+1);
        return _mm_shuffle_epi32(b.native(), mask);
    } else if (s0 == 0 && s1 == 2) {
        return _mm_unpacklo_epi64(a.native(), b.native());
    } else if (s0 == 2 && s1 == 0) {
        return _mm_unpacklo_epi64(b.native(), a.native());
    } else if (s0 == 1 && s1 == 3) {
        return _mm_unpackhi_epi64(a.native(), b.native());
    } else if (s0 == 3 && s1 == 1) {
        return _mm_unpackhi_epi64(b.native(), a.native());
#if SIMDPP_USE_SSE4_1
    } else if (s0 == 0 && s1 == 3) {
        return _mm_blend_epi16(a.native(), b.native(), 0xf0);
    } else if (s0 == 2 && s1 == 1) {
        return _mm_blend_epi16(b.native(), a.native(), 0xf0);
#endif
#if SIMDPP_USE_SSSE3
    } else if (s0 == 1 && s1 == 2) {
        return _mm_alignr_epi8(b.native(), a.native(), 8);
    } else if (s0 == 3 && s1 == 0) {
        return _mm_alignr_epi8(a.native(), b.native(), 8);
#endif
    } else if (s0 < 2) { // s1 >= 2
        __m128d na = _mm_castsi128_pd(a.native());
        __m128d nb = _mm_castsi128_pd(b.native());
        __m128d res = _mm_shuffle_pd(na, nb, SIMDPP_SHUFFLE_MASK_2x2(s0, s1-2));
        return _mm_castpd_si128(res);
    } else { // s0 >= 2, s1 < 2
        __m128d na = _mm_castsi128_pd(a.native());
        __m128d nb = _mm_castsi128_pd(b.native());
        __m128d res = _mm_shuffle_pd(nb, na, SIMDPP_SHUFFLE_MASK_2x2(s0-2, s1));
        return _mm_castpd_si128(res);
    }
#elif SIMDPP_USE_NEON
    return detail::neon_shuffle_int64x2::shuffle2x2<s0, s1>(a, b);
#elif SIMDPP_USE_VSX_207
    __vector uint64_t da = a.native(), db = b.native();
    if (s0 < 2 && s1 < 2) {
        return vec_xxpermdi(da, da, SIMDPP_VSX_SHUFFLE_MASK_2x2(s0, s1));
    } else if (s0 >= 2 && s1 >= 2) {
        return vec_xxpermdi(db, db, SIMDPP_VSX_SHUFFLE_MASK_2x2(s0-2,s1-2));
    } else if (s0 < 2) { // s1 >= 2
        return vec_xxpermdi(da, db, SIMDPP_VSX_SHUFFLE_MASK_2x2(s0, s1-2));
    } else { // s0 >= 2, s1 < 2
        return vec_xxpermdi(db, da, SIMDPP_VSX_SHUFFLE_MASK_2x2(s0-2, s1));
    }
#elif SIMDPP_USE_MSA
    uint64<2> mask = make_uint(s0, s1);

    return (v2u64) __msa_vshf_d((v2i64)mask.native(),
                                (v2i64)b.native(),
                                (v2i64)a.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    uint64<2> r;
    r.el(0) = s0 < 2 ? a.el(s0) : b.el(s0-2);
    r.el(1) = s1 < 2 ? a.el(s1) : b.el(s1-2);
    return r;
#else
    return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(int64<s0+4>, a, b);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned s0, unsigned s1> SIMDPP_INL
uint64<4> i_shuffle2x2(const uint64<4>& a, const uint64<4>& b)
{
    static_assert(s0 < 4 && s1 < 4, "Selector out of range");
    if (s0 < 2 && s1 < 2) {
        const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0*2, s0*2+1, s1*2, s1*2+1);
        return _mm256_shuffle_epi32(a.native(), mask);
    } else if (s0 >= 2 && s1 >= 2) {
        const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0*2, s0*2+1, s1*2, s1*2+1);
        return _mm256_shuffle_epi32(b.native(), mask);
    } else if (s0 == 0 && s1 == 2) {
        return _mm256_unpacklo_epi64(a.native(), b.native());
    } else if (s0 == 2 && s1 == 0) {
        return _mm256_unpacklo_epi64(b.native(), a.native());
    } else if (s0 == 1 && s1 == 3) {
        return _mm256_unpackhi_epi64(a.native(), b.native());
    } else if (s0 == 3 && s1 == 1) {
        return _mm256_unpackhi_epi64(b.native(), a.native());
    } else if (s0 == 0 && s1 == 3) {
        return _mm256_blend_epi32(a.native(), b.native(), 0xcc);
    } else if (s0 == 2 && s1 == 1) {
        return _mm256_blend_epi32(b.native(), a.native(), 0xcc);
    } else if (s0 == 1 && s1 == 2) {
        return _mm256_alignr_epi8(b.native(), a.native(), 8);
    } else { // if (s0 == 3 && s1 == 0)
        return _mm256_alignr_epi8(a.native(), b.native(), 8);
    }
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned s0, unsigned s1> SIMDPP_INL
uint64<8> i_shuffle2x2(const uint64<8>& a, const uint64<8>& b)
{
    static_assert(s0 < 4 && s1 < 4, "Selector out of range");
    if (s0 < 2 && s1 < 2) {
        const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0,s1,s0+2,s1+2);
        return _mm512_permutex_epi64(a.native(), mask);
    } else if (s0 >= 2 && s1 >= 2) {
        const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0-2,s1-2,s0,s1);
        return _mm512_permutex_epi64(b.native(), mask);
    } else if (s0 == 0 && s1 == 2) {
        return _mm512_unpacklo_epi64(a.native(), b.native());
    } else if (s0 == 2 && s1 == 0) {
        return _mm512_unpacklo_epi64(b.native(), a.native());
    } else if (s0 == 1 && s1 == 3) {
        return _mm512_unpackhi_epi64(a.native(), b.native());
    } else if (s0 == 3 && s1 == 1) {
        return _mm512_unpackhi_epi64(b.native(), a.native());
    } else if (s0 == 0 && s1 == 3) {
        return _mm512_mask_blend_epi64(0xaa, a.native(), b.native());
    } else if (s0 == 2 && s1 == 1) {
        return _mm512_mask_blend_epi64(0xaa, b.native(), a.native());
    } else if (s0 < 2) { // s1 >= 2
        __m512d na = _mm512_castsi512_pd(a.native());
        __m512d nb = _mm512_castsi512_pd(b.native());
        __m512d res = _mm512_shuffle_pd(na, nb, SIMDPP_SHUFFLE_MASK_2x2_4(s0, s1-2));
        return _mm512_castpd_si512(res);
    } else { // s0 >= 2, s1 < 2
        __m512d na = _mm512_castsi512_pd(a.native());
        __m512d nb = _mm512_castsi512_pd(b.native());
        __m512d res = _mm512_shuffle_pd(nb, na, SIMDPP_SHUFFLE_MASK_2x2_4(s0-2, s1));
        return _mm512_castpd_si512(res);
    }
    /* GCC BUG
    } else if (s0 == 1 && s1 == 2) {
        return _mm512_alignr_epi8(b.native(), a.native(), 8);
    } else if (s0 == 3 && s1 == 0) {
        return _mm512_alignr_epi8(a.native(), b.native(), 8);
    }*/
}
#endif

template<unsigned s0, unsigned s1, unsigned N> SIMDPP_INL
uint64<N> i_shuffle2x2(const uint64<N>& a, const uint64<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(uint64<N>, (i_shuffle2x2<s0,s1>), a, b);
}


} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif