/*  Copyright (C) 2011-2012  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_PERMUTE_SSE_FLOAT32_4x2_H
#define LIBSIMDPP_SIMDPP_DETAIL_PERMUTE_SSE_FLOAT32_4x2_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/setup_arch.h>
#include <simdpp/types.h>
#include <simdpp/detail/shuffle/shuffle_mask.h>

#if SIMDPP_USE_SSE2

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace sse_shuffle4x2_float32 {
/*  The code below implements generalized shuffles that select each of the
    four result elements from any position within a pair of float32 vectors.
*/
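
/*  Selector numbering: positions 0..3 refer to elements of the first source
    vector a, positions 4..7 to elements of the second source vector b.
    For example, the selector <0,5,2,7> produces { a0, b1, a2, b3 }.
*/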

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
struct impl_selector {

    // 0 1 2 3
    // 4 5 6 7
    static const bool is1_zip_lo1 = (s0==0 && s1==4 && s2==1 && s3==5);
    static const bool is2_zip_lo2 = (s0==4 && s1==0 && s2==5 && s3==1);
    static const bool is3_zip_hi1 = (s0==2 && s1==6 && s2==3 && s3==7);
    static const bool is4_zip_hi2 = (s0==6 && s1==2 && s2==7 && s3==3);
#if SIMDPP_USE_SSE4_1
    static const bool is5_blend = (s0==0 || s0==4) && (s1==1 || s1==5) &&
                                  (s2==2 || s2==6) && (s3==3 || s3==7);
#else
    static const bool is5_blend = false;
#endif
    static const bool is6_shuffle1 = s0 < 4 && s1 < 4 && s2 >= 4 && s3 >= 4;
    static const bool is7_shuffle2 = s0 >= 4 && s1 >= 4 && s2 < 4 && s3 < 4;
#if SIMDPP_USE_SSE4_1
    static const bool is8_lobl_shuffle1 = s2 >= 4 && s3 >= 4 && s0%4 != s1%4;
    static const bool is9_lobl_shuffle2 = s2 < 4 && s3 < 4 && s0%4 != s1%4;
#else
    static const bool is8_lobl_shuffle1 = false;
    static const bool is9_lobl_shuffle2 = false;
#endif
    static const bool is10_losh_shuffle1 = s2 >= 4 && s3 >= 4;
    static const bool is11_losh_shuffle2 = s2 < 4 && s3 < 4;
#if SIMDPP_USE_SSE4_1
    static const bool is12_hibl_shuffle1 = s0 >= 4 && s1 >= 4 && s2%4 != s3%4;
    static const bool is13_hibl_shuffle2 = s0 < 4 && s1 < 4 && s2%4 != s3%4;
#else
    static const bool is12_hibl_shuffle1 = false;
    static const bool is13_hibl_shuffle2 = false;
#endif
    static const bool is14_hish_shuffle1 = s0 >= 4 && s1 >= 4;
    static const bool is15_hish_shuffle2 = s0 < 4 && s1 < 4;

    static const int impl = is1_zip_lo1 ? 1 :
                            is2_zip_lo2 ? 2 :
                            is3_zip_hi1 ? 3 :
                            is4_zip_hi2 ? 4 :
                            is5_blend ? 5 :
                            is6_shuffle1 ? 6 :
                            is7_shuffle2 ? 7 :
                            is8_lobl_shuffle1 ? 8 :
                            is9_lobl_shuffle2 ? 9 :
                            is10_losh_shuffle1 ? 10 :
                            is11_losh_shuffle2 ? 11 :
                            is12_hibl_shuffle1 ? 12 :
                            is13_hibl_shuffle2 ? 13 :
                            is14_hish_shuffle1 ? 14 :
                            is15_hish_shuffle2 ? 15 : 16;
};
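
// The alternatives are ordered roughly from cheapest to most expensive:
// the single-instruction cases (zip, blend, one SHUFPS) are matched first,
// and the two-instruction fallbacks only apply when the earlier tests fail.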

template<unsigned N> struct shuffle_impl {};

// zip_lo1
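// Single-instruction case <0,4,1,5>: interleaves the low halves of the two
// vectors, e.g. { a0, a1, a2, a3 }, { b0, b1, b2, b3 } -> { a0, b0, a1, b1 }.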
template<> struct shuffle_impl<1> {
    template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
    static float32<4> run(const float32<4>& a, const float32<4>& b)
    {
        return _mm_unpacklo_ps(a.native(), b.native());
    }
#if SIMDPP_USE_AVX
    template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
    static float32<8> run(const float32<8>& a, const float32<8>& b)
    {
        return _mm256_unpacklo_ps(a.native(), b.native());
    }
#endif
#if SIMDPP_USE_AVX512F
    template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
    static float32<16> run(const float32<16>& a, const float32<16>& b)
    {
        return _mm512_unpacklo_ps(a.native(), b.native());
    }
#endif
};

// zip_lo2
template<> struct shuffle_impl<2> {
    template<unsigned, unsigned, unsigned, unsigned, unsigned N> SIMDPP_INL
    static float32<N> run(const float32<N>& a, const float32<N>& b)
    {
        return shuffle_impl<1>::run<0,0,0,0>(b, a);
    }
};

// zip_hi1
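// Single-instruction case <2,6,3,7>: interleaves the high halves,
// e.g. { a0, a1, a2, a3 }, { b0, b1, b2, b3 } -> { a2, b2, a3, b3 }.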
template<> struct shuffle_impl<3> {
    template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
    static float32<4> run(const float32<4>& a, const float32<4>& b)
    {
        return _mm_unpackhi_ps(a.native(), b.native());
    }
#if SIMDPP_USE_AVX
    template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
    static float32<8> run(const float32<8>& a, const float32<8>& b)
    {
        return _mm256_unpackhi_ps(a.native(), b.native());
    }
#endif
#if SIMDPP_USE_AVX512F
    template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
    static float32<16> run(const float32<16>& a, const float32<16>& b)
    {
        return _mm512_unpackhi_ps(a.native(), b.native());
    }
#endif
};

// zip_hi2
template<> struct shuffle_impl<4> {
    template<unsigned, unsigned, unsigned, unsigned, unsigned N> SIMDPP_INL
    static float32<N> run(const float32<N>& a, const float32<N>& b)
    {
        return shuffle_impl<3>::run<0,0,0,0>(b, a);
    }
};

// is5_blend
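// Every selector keeps its lane position (sN is either N or N+4), so a blend
// suffices: sN/4 is 0 when the lane comes from a and 1 when it comes from b,
// and the mask macros pack those bits into the blend mask (the _2 and _4
// variants repeat the 4-bit pattern for each 128-bit lane).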
#if SIMDPP_USE_SSE4_1
template<> struct shuffle_impl<5> {
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<4> run(const float32<4>& a, const float32<4>& b)
    {
        return _mm_blend_ps(a.native(), b.native(),
                            SIMDPP_SHUFFLE_MASK_4x2(s0/4, s1/4, s2/4, s3/4));
    }
#if SIMDPP_USE_AVX
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<8> run(const float32<8>& a, const float32<8>& b)
    {
        return _mm256_blend_ps(a.native(), b.native(),
                               SIMDPP_SHUFFLE_MASK_4x2_2(s0/4, s1/4, s2/4, s3/4));
    }
#endif
#if SIMDPP_USE_AVX512F
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<16> run(const float32<16>& a, const float32<16>& b)
    {
        return _mm512_mask_blend_ps(SIMDPP_SHUFFLE_MASK_4x2_4(s0/4, s1/4, s2/4, s3/4),
                                    a.native(), b.native());
    }
#endif
};
#endif

// is6_shuffle1
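// SHUFPS picks result lanes 0-1 from its first operand and lanes 2-3 from
// its second. Since s0,s1 index a and s2,s3 index b (i.e. are >= 4), a and b
// can be passed directly, with s2,s3 rebased by subtracting 4. Note that
// _MM_SHUFFLE takes its arguments in high-to-low lane order.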
template<> struct shuffle_impl<6> {
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<4> run(const float32<4>& a, const float32<4>& b)
    {
        return _mm_shuffle_ps(a.native(), b.native(),
                              _MM_SHUFFLE(s3-4, s2-4, s1, s0));
    }
#if SIMDPP_USE_AVX
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<8> run(const float32<8>& a, const float32<8>& b)
    {
        return _mm256_shuffle_ps(a.native(), b.native(),
                                 _MM_SHUFFLE(s3-4, s2-4, s1, s0));
    }
#endif
#if SIMDPP_USE_AVX512F
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<16> run(const float32<16>& a, const float32<16>& b)
    {
        return _mm512_shuffle_ps(a.native(), b.native(),
                                 _MM_SHUFFLE(s3-4, s2-4, s1, s0));
    }
#endif
};

// is7_shuffle2
template<> struct shuffle_impl<7> {
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<4> run(const float32<4>& a, const float32<4>& b)
    {
        return _mm_shuffle_ps(b.native(), a.native(),
                              _MM_SHUFFLE(s3, s2, s1-4, s0-4));
    }
#if SIMDPP_USE_AVX
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<8> run(const float32<8>& a, const float32<8>& b)
    {
        return _mm256_shuffle_ps(b.native(), a.native(),
                                 _MM_SHUFFLE(s3, s2, s1-4, s0-4));
    }
#endif
#if SIMDPP_USE_AVX512F
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<16> run(const float32<16>& a, const float32<16>& b)
    {
        return _mm512_shuffle_ps(b.native(), a.native(),
                                 _MM_SHUFFLE(s3, s2, s1-4, s0-4));
    }
#endif
};

// Selects the elements of b whose positions match either s0 or s1; all other
// lanes of the result are taken from a.
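// Worked example: select2_hi<5,2>(a, b) sets only bit 1 of the blend mask
// (because s0==5), producing { a0, b1, a2, a3 }.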
#if SIMDPP_USE_SSE4_1
template<unsigned s0, unsigned s1> SIMDPP_INL
float32<4> select2_hi(const float32<4>& a, const float32<4>& b)
{
    const unsigned mask = (s0==4||s1==4 ? 1 : 0) |
                          (s0==5||s1==5 ? 2 : 0) |
                          (s0==6||s1==6 ? 4 : 0) |
                          (s0==7||s1==7 ? 8 : 0);
    return _mm_blend_ps(a.native(), b.native(), mask);
}
#endif
#if SIMDPP_USE_AVX
template<unsigned s0, unsigned s1> SIMDPP_INL
float32<8> select2_hi(const float32<8>& a, const float32<8>& b)
{
    const unsigned mask = (s0==4||s1==4 ? 1 : 0) |
                          (s0==5||s1==5 ? 2 : 0) |
                          (s0==6||s1==6 ? 4 : 0) |
                          (s0==7||s1==7 ? 8 : 0);
    return _mm256_blend_ps(a.native(), b.native(), mask | mask << 4);
}
#endif
#if SIMDPP_USE_AVX512F
template<unsigned s0, unsigned s1> SIMDPP_INL
float32<16> select2_hi(const float32<16>& a, const float32<16>& b)
{
    const unsigned mask = (s0==4||s1==4 ? 1 : 0) |
                          (s0==5||s1==5 ? 2 : 0) |
                          (s0==6||s1==6 ? 4 : 0) |
                          (s0==7||s1==7 ? 8 : 0);
    const unsigned mask2 = mask | mask << 4 | mask << 8 | mask << 12;
    return _mm512_mask_blend_ps(mask2, a.native(), b.native());
}
#endif

// is8_lobl_shuffle1 = s2 >= 4 && s3 >= 4 && s0%4 != s1%4;
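// Two instructions: the blend moves the requested elements of b into a at
// their in-lane positions (s0%4, s1%4), so the following SHUFPS can take the
// low result half from ab1 and the high half directly from b. This is why
// s0 and s1 must not collide modulo 4: they would need the same lane of ab1.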
template<> struct shuffle_impl<8> {
#if SIMDPP_USE_SSE4_1
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<4> run(const float32<4>& a, const float32<4>& b)
    {
        __m128 ab1 = select2_hi<s0,s1>(a, b).native();
        return _mm_shuffle_ps(ab1, b.native(),
                              _MM_SHUFFLE(s3-4, s2-4, s1%4, s0%4));
    }
#endif
#if SIMDPP_USE_AVX
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<8> run(const float32<8>& a, const float32<8>& b)
    {
        __m256 ab1 = select2_hi<s0,s1>(a, b).native();
        return _mm256_shuffle_ps(ab1, b.native(),
                                 _MM_SHUFFLE(s3-4, s2-4, s1%4, s0%4));
    }
#endif
#if SIMDPP_USE_AVX512F
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<16> run(const float32<16>& a, const float32<16>& b)
    {
        __m512 ab1 = select2_hi<s0,s1>(a, b).native();
        return _mm512_shuffle_ps(ab1, b.native(),
                                 _MM_SHUFFLE(s3-4, s2-4, s1%4, s0%4));
    }
#endif
};

// is9_lobl_shuffle2 = s2 < 4 && s3 < 4 && s0%4 != s1%4;
template<> struct shuffle_impl<9> {
#if SIMDPP_USE_SSE4_1
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<4> run(const float32<4>& a, const float32<4>& b)
    {
        __m128 ab1 = select2_hi<s0,s1>(a, b).native();
        return _mm_shuffle_ps(ab1, a.native(),
                              _MM_SHUFFLE(s3, s2, s1%4, s0%4));
    }
#endif
#if SIMDPP_USE_AVX
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<8> run(const float32<8>& a, const float32<8>& b)
    {
        __m256 ab1 = select2_hi<s0,s1>(a, b).native();
        return _mm256_shuffle_ps(ab1, a.native(),
                                 _MM_SHUFFLE(s3, s2, s1%4, s0%4));
    }
#endif
#if SIMDPP_USE_AVX512F
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<16> run(const float32<16>& a, const float32<16>& b)
    {
        __m512 ab1 = select2_hi<s0,s1>(a, b).native();
        return _mm512_shuffle_ps(ab1, a.native(),
                                 _MM_SHUFFLE(s3, s2, s1%4, s0%4));
    }
#endif
};

// is10_losh_shuffle1 = s2 >= 4 && s3 >= 4;
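// SSE2 fallback for the same shape, without the s0/s1 restriction: the first
// SHUFPS gathers a[s0%4], a[s1%4] into lanes 0-1 and b[s0%4], b[s1%4] into
// lanes 2-3 of ab1, so the second SHUFPS can pick each low result lane from
// either source via the index expressions sN/4?x:y.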
template<> struct shuffle_impl<10> {
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<4> run(const float32<4>& a, const float32<4>& b)
    {
        __m128 ab1 = _mm_shuffle_ps(a.native(), b.native(),
                                    _MM_SHUFFLE(s1%4, s0%4, s1%4, s0%4));
        return _mm_shuffle_ps(ab1, b.native(),
                              _MM_SHUFFLE(s3-4, s2-4, s1/4?3:1, s0/4?2:0));
    }
#if SIMDPP_USE_AVX
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<8> run(const float32<8>& a, const float32<8>& b)
    {
        __m256 ab1 = _mm256_shuffle_ps(a.native(), b.native(),
                                       _MM_SHUFFLE(s1%4, s0%4, s1%4, s0%4));
        return _mm256_shuffle_ps(ab1, b.native(),
                                 _MM_SHUFFLE(s3-4, s2-4, s1/4?3:1, s0/4?2:0));
    }
#endif
#if SIMDPP_USE_AVX512F
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<16> run(const float32<16>& a, const float32<16>& b)
    {
        __m512 ab1 = _mm512_shuffle_ps(a.native(), b.native(),
                                       _MM_SHUFFLE(s1%4, s0%4, s1%4, s0%4));
        return _mm512_shuffle_ps(ab1, b.native(),
                                 _MM_SHUFFLE(s3-4, s2-4, s1/4?3:1, s0/4?2:0));
    }
#endif
};

// is11_losh_shuffle2 = s2 < 4 && s3 < 4;
template<> struct shuffle_impl<11> {
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<4> run(const float32<4>& a, const float32<4>& b)
    {
        __m128 ab1 = _mm_shuffle_ps(a.native(), b.native(),
                                    _MM_SHUFFLE(s1%4, s0%4, s1%4, s0%4));
        return _mm_shuffle_ps(ab1, a.native(),
                              _MM_SHUFFLE(s3, s2, s1/4?3:1, s0/4?2:0));
    }
#if SIMDPP_USE_AVX
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<8> run(const float32<8>& a, const float32<8>& b)
    {
        __m256 ab1 = _mm256_shuffle_ps(a.native(), b.native(),
                                       _MM_SHUFFLE(s1%4, s0%4, s1%4, s0%4));
        return _mm256_shuffle_ps(ab1, a.native(),
                                 _MM_SHUFFLE(s3, s2, s1/4?3:1, s0/4?2:0));
    }
#endif
#if SIMDPP_USE_AVX512F
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<16> run(const float32<16>& a, const float32<16>& b)
    {
        __m512 ab1 = _mm512_shuffle_ps(a.native(), b.native(),
                                       _MM_SHUFFLE(s1%4, s0%4, s1%4, s0%4));
        return _mm512_shuffle_ps(ab1, a.native(),
                                 _MM_SHUFFLE(s3, s2, s1/4?3:1, s0/4?2:0));
    }
#endif
};

// is12_hibl_shuffle1 = s0 >= 4 && s1 >= 4 && s2%4 != s3%4;
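// Mirror of is8 for the high result half: the blend assembles the s2,s3
// candidates in ab2, and SHUFPS takes lanes 0-1 from b and lanes 2-3 from ab2.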
template<> struct shuffle_impl<12> {
#if SIMDPP_USE_SSE4_1
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<4> run(const float32<4>& a, const float32<4>& b)
    {
        __m128 ab2 = select2_hi<s2, s3>(a, b).native();
        return _mm_shuffle_ps(b.native(), ab2,
                              _MM_SHUFFLE(s3%4, s2%4, s1-4, s0-4));
    }
#endif
#if SIMDPP_USE_AVX
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<8> run(const float32<8>& a, const float32<8>& b)
    {
        __m256 ab2 = select2_hi<s2, s3>(a, b).native();
        return _mm256_shuffle_ps(b.native(), ab2,
                                 _MM_SHUFFLE(s3%4, s2%4, s1-4, s0-4));
    }
#endif
#if SIMDPP_USE_AVX512F
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<16> run(const float32<16>& a, const float32<16>& b)
    {
        __m512 ab2 = select2_hi<s2, s3>(a, b).native();
        return _mm512_shuffle_ps(b.native(), ab2,
                                 _MM_SHUFFLE(s3%4, s2%4, s1-4, s0-4));
    }
#endif
};

// is13_hibl_shuffle2 = s0 < 4 && s1 < 4 && s2%4 != s3%4;
template<> struct shuffle_impl<13> {
#if SIMDPP_USE_SSE4_1
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<4> run(const float32<4>& a, const float32<4>& b)
    {
        __m128 ab2 = select2_hi<s2, s3>(a, b).native();
        return _mm_shuffle_ps(a.native(), ab2,
                              _MM_SHUFFLE(s3%4, s2%4, s1, s0));
    }
#endif
#if SIMDPP_USE_AVX
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<8> run(const float32<8>& a, const float32<8>& b)
    {
        __m256 ab2 = select2_hi<s2, s3>(a, b).native();
        return _mm256_shuffle_ps(a.native(), ab2,
                                 _MM_SHUFFLE(s3%4, s2%4, s1, s0));
    }
#endif
#if SIMDPP_USE_AVX512F
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<16> run(const float32<16>& a, const float32<16>& b)
    {
        __m512 ab2 = select2_hi<s2, s3>(a, b).native();
        return _mm512_shuffle_ps(a.native(), ab2,
                                 _MM_SHUFFLE(s3%4, s2%4, s1, s0));
    }
#endif
};

// is14_hish_shuffle1 = s0 >= 4 && s1 >= 4;
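// SSE2 fallback mirroring is10: the first SHUFPS gathers the s2,s3 candidates
// from both sources into ab2, and the second picks each high result lane from
// it while taking lanes 0-1 directly from b.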
template<> struct shuffle_impl<14> {
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<4> run(const float32<4>& a, const float32<4>& b)
    {
        __m128 ab2 = _mm_shuffle_ps(a.native(), b.native(),
                                    _MM_SHUFFLE(s3%4, s2%4, s3%4, s2%4));
        return _mm_shuffle_ps(b.native(), ab2,
                              _MM_SHUFFLE(s3/4?3:1, s2/4?2:0, s1-4, s0-4));
    }
#if SIMDPP_USE_AVX
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<8> run(const float32<8>& a, const float32<8>& b)
    {
        __m256 ab2 = _mm256_shuffle_ps(a.native(), b.native(),
                                       _MM_SHUFFLE(s3%4, s2%4, s3%4, s2%4));
        return _mm256_shuffle_ps(b.native(), ab2,
                                 _MM_SHUFFLE(s3/4?3:1, s2/4?2:0, s1-4, s0-4));
    }
#endif
#if SIMDPP_USE_AVX512F
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<16> run(const float32<16>& a, const float32<16>& b)
    {
        __m512 ab2 = _mm512_shuffle_ps(a.native(), b.native(),
                                       _MM_SHUFFLE(s3%4, s2%4, s3%4, s2%4));
        return _mm512_shuffle_ps(b.native(), ab2,
                                 _MM_SHUFFLE(s3/4?3:1, s2/4?2:0, s1-4, s0-4));
    }
#endif
};

// is15_hish_shuffle2: s0 < 4 && s1 < 4
template<> struct shuffle_impl<15> {
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<4> run(const float32<4>& a, const float32<4>& b)
    {
        __m128 ab2 = _mm_shuffle_ps(a.native(), b.native(),
                                    _MM_SHUFFLE(s3%4, s2%4, s3%4, s2%4));
        return _mm_shuffle_ps(a.native(), ab2,
                              _MM_SHUFFLE(s3/4?3:1, s2/4?2:0, s1, s0));
    }
#if SIMDPP_USE_AVX
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<8> run(const float32<8>& a, const float32<8>& b)
    {
        __m256 ab2 = _mm256_shuffle_ps(a.native(), b.native(),
                                       _MM_SHUFFLE(s3%4, s2%4, s3%4, s2%4));
        return _mm256_shuffle_ps(a.native(), ab2,
                                 _MM_SHUFFLE(s3/4?3:1, s2/4?2:0, s1, s0));
    }
#endif
#if SIMDPP_USE_AVX512F
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<16> run(const float32<16>& a, const float32<16>& b)
    {
        __m512 ab2 = _mm512_shuffle_ps(a.native(), b.native(),
                                       _MM_SHUFFLE(s3%4, s2%4, s3%4, s2%4));
        return _mm512_shuffle_ps(a.native(), ab2,
                                 _MM_SHUFFLE(s3/4?3:1, s2/4?2:0, s1, s0));
    }
#endif
};

// any
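// Generic fallback for arbitrary selectors. With SSE4.1, each source is
// permuted in place and the results are blended. With plain SSE2, two SHUFPS
// gather all four candidates from both sources and a third combines them.
// On AVX512F the merge-masked shuffle performs the permute of b and the
// blend in one instruction.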
template<> struct shuffle_impl<16> {
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<4> run(const float32<4>& a, const float32<4>& b)
    {
#if SIMDPP_USE_SSE4_1
        __m128 ap = _mm_shuffle_ps(a.native(), a.native(),
                                   SIMDPP_SHUFFLE_MASK_4x4(s0%4,s1%4,s2%4,s3%4));
        __m128 bp = _mm_shuffle_ps(b.native(), b.native(),
                                   SIMDPP_SHUFFLE_MASK_4x4(s0%4,s1%4,s2%4,s3%4));
        return _mm_blend_ps(ap, bp, SIMDPP_SHUFFLE_MASK_4x2(s0/4,s1/4,s2/4,s3/4));
#else
        __m128 ab1 = _mm_shuffle_ps(a.native(), b.native(),
                                    _MM_SHUFFLE(s1%4, s0%4, s1%4, s0%4));
        __m128 ab2 = _mm_shuffle_ps(a.native(), b.native(),
                                    _MM_SHUFFLE(s3%4, s2%4, s3%4, s2%4));
        return _mm_shuffle_ps(ab1, ab2, _MM_SHUFFLE(s3/4?3:1, s2/4?2:0, s1/4?3:1, s0/4?2:0));
#endif
    }
#if SIMDPP_USE_AVX
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<8> run(const float32<8>& a, const float32<8>& b)
    {
        __m256 ap = _mm256_shuffle_ps(a.native(), a.native(),
                                      SIMDPP_SHUFFLE_MASK_4x4(s0%4,s1%4,s2%4,s3%4));
        __m256 bp = _mm256_shuffle_ps(b.native(), b.native(),
                                      SIMDPP_SHUFFLE_MASK_4x4(s0%4,s1%4,s2%4,s3%4));
        return _mm256_blend_ps(ap, bp, SIMDPP_SHUFFLE_MASK_4x2_2(s0/4,s1/4,s2/4,s3/4));
    }
#endif
#if SIMDPP_USE_AVX512F
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<16> run(const float32<16>& a, const float32<16>& b)
    {
        __m512 ap = _mm512_shuffle_ps(a.native(), a.native(),
                                      SIMDPP_SHUFFLE_MASK_4x4(s0%4,s1%4,s2%4,s3%4));
        const int mask = SIMDPP_SHUFFLE_MASK_4x2_4(s0/4,s1/4,s2/4,s3/4);
        return _mm512_mask_shuffle_ps(ap, mask, b.native(), b.native(),
                                      SIMDPP_SHUFFLE_MASK_4x4(s0%4,s1%4,s2%4,s3%4));
    }
#endif
};

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N> SIMDPP_INL
float32<N> do_shuffle(const float32<N>& a, const float32<N>& b)
{
    return shuffle_impl<impl_selector<s0, s1, s2, s3>::impl>::template run<s0, s1, s2, s3>(a, b);
}
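
// Usage sketch (illustrative only): selecting { a0, b0, a1, b1 } resolves to
// impl 1 and compiles to a single unpack instruction:
//     float32<4> r = do_shuffle<0,4,1,5>(x, y);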

} // namespace sse_shuffle4x2_float32
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif
#endif
