1 | /* Copyright (C) 2011-2012 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_PERMUTE_SSE_FLOAT32_4x2_H |
9 | #define LIBSIMDPP_SIMDPP_DETAIL_PERMUTE_SSE_FLOAT32_4x2_H |
10 | |
11 | #ifndef LIBSIMDPP_SIMD_H |
12 | #error "This file must be included through simd.h" |
13 | #endif |
14 | |
15 | #include <simdpp/setup_arch.h> |
16 | #include <simdpp/types.h> |
17 | #include <simdpp/detail/shuffle/shuffle_mask.h> |
18 | |
19 | #if SIMDPP_USE_SSE2 |
20 | |
21 | namespace simdpp { |
22 | namespace SIMDPP_ARCH_NAMESPACE { |
23 | namespace detail { |
24 | namespace sse_shuffle4x2_float32 { |
25 | |
/*  The code below implements generalized permutations that pick four elements
    out of two float32 vectors: selector values 0..3 refer to elements of the
    first vector and 4..7 to elements of the second vector.
*/
29 | |
30 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> |
31 | struct impl_selector { |
32 | |
33 | // 0 1 2 3 |
34 | // 4 5 6 7 |
35 | static const bool is1_zip_lo1 = (s0==0 && s1==4 && s2==1 && s3==5); |
36 | static const bool is2_zip_lo2 = (s0==4 && s1==0 && s2==5 && s3==1); |
37 | static const bool is3_zip_hi1 = (s0==2 && s1==6 && s2==3 && s3==7); |
38 | static const bool is4_zip_hi2 = (s0==6 && s1==2 && s2==7 && s3==3); |
39 | #if SIMDPP_USE_SSE4_1 |
40 | static const bool is5_blend = (s0==0 || s0==4) && (s1==1 || s1==5) && |
41 | (s2==2 || s2==6) && (s3==3 || s3==7); |
42 | #else |
43 | static const bool is5_blend = false; |
44 | #endif |
45 | static const bool is6_shuffle1 = s0 < 4 && s1 < 4 && s2 >= 4 && s3 >= 4; |
46 | static const bool is7_shuffle2 = s0 >= 4 && s1 >= 4 && s2 < 4 && s3 < 4; |
47 | #if SIMDPP_USE_SSE4_1 |
48 | static const bool is8_lobl_shuffle1 = s2 >= 4 && s3 >= 4 && s0 != s1; |
49 | static const bool is9_lobl_shuffle2 = s2 < 4 && s3 < 4 && s0 != s1; |
50 | #else |
51 | static const bool is8_lobl_shuffle1 = false; |
52 | static const bool is9_lobl_shuffle2 = false; |
53 | #endif |
54 | static const bool is10_losh_shuffle1 = s2 >= 4 && s3 >= 4; |
55 | static const bool is11_losh_shuffle2 = s2 < 4 && s3 < 4; |
56 | #if SIMDPP_USE_SSE4_1 |
57 | static const bool is12_hibl_shuffle1 = s0 >= 4 && s1 >= 4 && s2 != s3; |
58 | static const bool is13_hibl_shuffle2 = s0 < 4 && s1 < 4 && s2 != s3; |
59 | #else |
60 | static const bool is12_hibl_shuffle1 = false; |
61 | static const bool is13_hibl_shuffle2 = false; |
62 | #endif |
63 | static const bool is14_hish_shuffle1 = s0 >= 4 && s1 >= 4; |
64 | static const bool is15_hish_shuffle2 = s0 < 4 && s1 < 4; |
65 | |
66 | static const int impl = is1_zip_lo1 ? 1 : |
67 | is2_zip_lo2 ? 2 : |
68 | is3_zip_hi1 ? 3 : |
69 | is4_zip_hi2 ? 4 : |
70 | is5_blend ? 5 : |
71 | is6_shuffle1 ? 6 : |
72 | is7_shuffle2 ? 7 : |
73 | is8_lobl_shuffle1 ? 8 : |
74 | is9_lobl_shuffle2 ? 9 : |
75 | is10_losh_shuffle1 ? 10 : |
76 | is11_losh_shuffle2 ? 11 : |
77 | is12_hibl_shuffle1 ? 12 : |
78 | is13_hibl_shuffle2 ? 13 : |
79 | is14_hish_shuffle1 ? 14 : |
80 | is15_hish_shuffle2 ? 15 : 16; |
81 | }; |
82 | |
// Primary template, intentionally empty: only the explicit specializations,
// one per implementation index computed by impl_selector, provide run().
template<unsigned N> struct shuffle_impl {};
84 | |
85 | // zip_lo1 |
86 | template<> struct shuffle_impl<1> { |
87 | template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL |
88 | static float32<4> run(const float32<4>& a, const float32<4>& b) |
89 | { |
90 | return _mm_unpacklo_ps(a.native(), b.native()); |
91 | } |
92 | #if SIMDPP_USE_AVX |
93 | template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL |
94 | static float32<8> run(const float32<8>& a, const float32<8>& b) |
95 | { |
96 | return _mm256_unpacklo_ps(a.native(), b.native()); |
97 | } |
98 | #endif |
99 | #if SIMDPP_USE_AVX512F |
100 | template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL |
101 | static float32<16> run(const float32<16>& a, const float32<16>& b) |
102 | { |
103 | return _mm512_unpacklo_ps(a.native(), b.native()); |
104 | } |
105 | #endif |
106 | }; |
107 | |
108 | // zip_lo2 |
109 | template<> struct shuffle_impl<2> { |
110 | template<unsigned, unsigned, unsigned, unsigned, unsigned N> SIMDPP_INL |
111 | static float32<N> run(const float32<N>& a, const float32<N>& b) |
112 | { |
113 | return shuffle_impl<1>::run<0,0,0,0>(b, a); |
114 | } |
115 | }; |
116 | |
117 | // zip_hi1 |
118 | template<> struct shuffle_impl<3> { |
119 | template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL |
120 | static float32<4> run(const float32<4>& a, const float32<4>& b) |
121 | { |
122 | return _mm_unpackhi_ps(a.native(), b.native()); |
123 | } |
124 | #if SIMDPP_USE_AVX |
125 | template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL |
126 | static float32<8> run(const float32<8>& a, const float32<8>& b) |
127 | { |
128 | return _mm256_unpackhi_ps(a.native(), b.native()); |
129 | } |
130 | #endif |
131 | #if SIMDPP_USE_AVX512F |
132 | template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL |
133 | static float32<16> run(const float32<16>& a, const float32<16>& b) |
134 | { |
135 | return _mm512_unpackhi_ps(a.native(), b.native()); |
136 | } |
137 | #endif |
138 | }; |
139 | |
140 | // zip_hi2 |
141 | template<> struct shuffle_impl<4> { |
142 | template<unsigned, unsigned, unsigned, unsigned, unsigned N> SIMDPP_INL |
143 | static float32<N> run(const float32<N>& a, const float32<N>& b) |
144 | { |
145 | return shuffle_impl<3>::run<0,0,0,0>(b, a); |
146 | } |
147 | }; |
148 | |
// Implementation 5 (blend): every result element keeps its position and only
// the source vector varies, so a single blend suffices. s/4 is 1 for selectors
// that refer to b and 0 for selectors that refer to a.
#if SIMDPP_USE_SSE4_1
template<> struct shuffle_impl<5> {
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<4> run(const float32<4>& a, const float32<4>& b)
    {
        const unsigned from_b = SIMDPP_SHUFFLE_MASK_4x2(s0/4, s1/4, s2/4, s3/4);
        return _mm_blend_ps(a.native(), b.native(), from_b);
    }
#if SIMDPP_USE_AVX
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<8> run(const float32<8>& a, const float32<8>& b)
    {
        const unsigned from_b = SIMDPP_SHUFFLE_MASK_4x2_2(s0/4, s1/4, s2/4, s3/4);
        return _mm256_blend_ps(a.native(), b.native(), from_b);
    }
#endif
#if SIMDPP_USE_AVX512F
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float32<16> run(const float32<16>& a, const float32<16>& b)
    {
        const unsigned from_b = SIMDPP_SHUFFLE_MASK_4x2_4(s0/4, s1/4, s2/4, s3/4);
        // the mask-blend intrinsic takes the mask as its first argument
        return _mm512_mask_blend_ps(from_b, a.native(), b.native());
    }
#endif
};
#endif
176 | |
177 | // is6_shuffle1 |
178 | template<> struct shuffle_impl<6> { |
179 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
180 | static float32<4> run(const float32<4>& a, const float32<4>& b) |
181 | { |
182 | return _mm_shuffle_ps(a.native(), b.native(), |
183 | _MM_SHUFFLE(s3-4, s2-4, s1, s0)); |
184 | } |
185 | #if SIMDPP_USE_AVX |
186 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
187 | static float32<8> run(const float32<8>& a, const float32<8>& b) |
188 | { |
189 | return _mm256_shuffle_ps(a.native(), b.native(), |
190 | _MM_SHUFFLE(s3-4, s2-4, s1, s0)); |
191 | } |
192 | #endif |
193 | #if SIMDPP_USE_AVX512F |
194 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
195 | static float32<16> run(const float32<16>& a, const float32<16>& b) |
196 | { |
197 | return _mm512_shuffle_ps(a.native(), b.native(), |
198 | _MM_SHUFFLE(s3-4, s2-4, s1, s0)); |
199 | } |
200 | #endif |
201 | }; |
202 | |
203 | // is7_shuffle2 |
204 | template<> struct shuffle_impl<7> { |
205 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
206 | static float32<4> run(const float32<4>& a, const float32<4>& b) |
207 | { |
208 | return _mm_shuffle_ps(b.native(), a.native(), |
209 | _MM_SHUFFLE(s3, s2, s1-4, s0-4)); |
210 | } |
211 | #if SIMDPP_USE_AVX |
212 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
213 | static float32<8> run(const float32<8>& a, const float32<8>& b) |
214 | { |
215 | return _mm256_shuffle_ps(b.native(), a.native(), |
216 | _MM_SHUFFLE(s3, s2, s1-4, s0-4)); |
217 | } |
218 | #endif |
219 | #if SIMDPP_USE_AVX512F |
220 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
221 | static float32<16> run(const float32<16>& a, const float32<16>& b) |
222 | { |
223 | return _mm512_shuffle_ps(b.native(), a.native(), |
224 | _MM_SHUFFLE(s3, s2, s1-4, s0-4)); |
225 | } |
226 | #endif |
227 | }; |
228 | |
// Blends a and b position-wise: position p (0..3) of the result is taken from
// b if s0 or s1 equals p+4, and from a otherwise.
#if SIMDPP_USE_SSE4_1
template<unsigned s0, unsigned s1> SIMDPP_INL
float32<4> select2_hi(const float32<4>& a, const float32<4>& b)
{
    const unsigned pos0 = (s0==4 || s1==4) ? 1 : 0;
    const unsigned pos1 = (s0==5 || s1==5) ? 2 : 0;
    const unsigned pos2 = (s0==6 || s1==6) ? 4 : 0;
    const unsigned pos3 = (s0==7 || s1==7) ? 8 : 0;
    const unsigned from_b = pos0 | pos1 | pos2 | pos3;
    return _mm_blend_ps(a.native(), b.native(), from_b);
}
#endif
// 8-element overload: the same 4-bit position mask (bit p set if s0 or s1
// equals p+4) is applied to both groups of four elements.
#if SIMDPP_USE_AVX
template<unsigned s0, unsigned s1> SIMDPP_INL
float32<8> select2_hi(const float32<8>& a, const float32<8>& b)
{
    const unsigned pos0 = (s0==4 || s1==4) ? 1 : 0;
    const unsigned pos1 = (s0==5 || s1==5) ? 2 : 0;
    const unsigned pos2 = (s0==6 || s1==6) ? 4 : 0;
    const unsigned pos3 = (s0==7 || s1==7) ? 8 : 0;
    const unsigned from_b = pos0 | pos1 | pos2 | pos3;
    // replicate the 4-bit mask into the upper nibble
    const unsigned from_b_x2 = from_b | (from_b << 4);
    return _mm256_blend_ps(a.native(), b.native(), from_b_x2);
}
#endif
// 16-element overload: the same 4-bit position mask (bit p set if s0 or s1
// equals p+4) is applied to all four groups of four elements.
#if SIMDPP_USE_AVX512F
template<unsigned s0, unsigned s1> SIMDPP_INL
float32<16> select2_hi(const float32<16>& a, const float32<16>& b)
{
    const unsigned pos0 = (s0==4 || s1==4) ? 1 : 0;
    const unsigned pos1 = (s0==5 || s1==5) ? 2 : 0;
    const unsigned pos2 = (s0==6 || s1==6) ? 4 : 0;
    const unsigned pos3 = (s0==7 || s1==7) ? 8 : 0;
    const unsigned from_b = pos0 | pos1 | pos2 | pos3;
    // replicate the 4-bit mask into all four nibbles of the 16-bit mask
    const unsigned from_b_x4 = from_b | (from_b << 4) | (from_b << 8) | (from_b << 12);
    return _mm512_mask_blend_ps(from_b_x4, a.native(), b.native());
}
#endif
264 | |
265 | // is8_lobl_shuffle1 = s2 >= 4 && s3 >= 4 && s0 != s1; |
266 | template<> struct shuffle_impl<8> { |
267 | #if SIMDPP_USE_SSE4_1 |
268 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
269 | static float32<4> run(const float32<4>& a, const float32<4>& b) |
270 | { |
271 | __m128 ab1 = select2_hi<s0,s1>(a, b).native(); |
272 | return _mm_shuffle_ps(ab1, b.native(), |
273 | _MM_SHUFFLE(s3-4, s2-4, s1%4, s0%4)); |
274 | } |
275 | #endif |
276 | #if SIMDPP_USE_AVX |
277 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
278 | static float32<8> run(const float32<8>& a, const float32<8>& b) |
279 | { |
280 | __m256 ab1 = select2_hi<s0,s1>(a, b).native(); |
281 | return _mm256_shuffle_ps(ab1, b.native(), |
282 | _MM_SHUFFLE(s3-4, s2-4, s1%4, s0%4)); |
283 | } |
284 | #endif |
285 | #if SIMDPP_USE_AVX512F |
286 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
287 | static float32<16> run(const float32<16>& a, const float32<16>& b) |
288 | { |
289 | __m512 ab1 = select2_hi<s0,s1>(a, b).native(); |
290 | return _mm512_shuffle_ps(ab1, b.native(), |
291 | _MM_SHUFFLE(s3-4, s2-4, s1%4, s0%4)); |
292 | } |
293 | #endif |
294 | }; |
295 | |
296 | // is9_lobl_shuffle2 = s2 < 4 && s3 < 4 && s0 != s1; |
297 | template<> struct shuffle_impl<9> { |
298 | #if SIMDPP_USE_SSE4_1 |
299 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
300 | static float32<4> run(const float32<4>& a, const float32<4>& b) |
301 | { |
302 | __m128 ab1 = select2_hi<s0,s1>(a, b).native(); |
303 | return _mm_shuffle_ps(ab1, a.native(), |
304 | _MM_SHUFFLE(s3, s2, s1%4, s0%4)); |
305 | } |
306 | #endif |
307 | #if SIMDPP_USE_AVX |
308 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
309 | static float32<8> run(const float32<8>& a, const float32<8>& b) |
310 | { |
311 | __m256 ab1 = select2_hi<s0,s1>(a, b).native(); |
312 | return _mm256_shuffle_ps(ab1, a.native(), |
313 | _MM_SHUFFLE(s3, s2, s1%4, s0%4)); |
314 | } |
315 | #endif |
316 | #if SIMDPP_USE_AVX512F |
317 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
318 | static float32<16> run(const float32<16>& a, const float32<16>& b) |
319 | { |
320 | __m512 ab1 = select2_hi<s0,s1>(a, b).native(); |
321 | return _mm512_shuffle_ps(ab1, a.native(), |
322 | _MM_SHUFFLE(s3, s2, s1%4, s0%4)); |
323 | } |
324 | #endif |
325 | }; |
326 | |
327 | // is10_losh_shuffle1 = s2 >= 4 && s3 >= 4; |
328 | template<> struct shuffle_impl<10> { |
329 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
330 | static float32<4> run(const float32<4>& a, const float32<4>& b) |
331 | { |
332 | __m128 ab1 = _mm_shuffle_ps(a.native(), b.native(), |
333 | _MM_SHUFFLE(s1%4, s0%4, s1%4, s0%4)); |
334 | return _mm_shuffle_ps(ab1, b.native(), |
335 | _MM_SHUFFLE(s3-4, s2-4, s1/4?3:1, s0/4?2:0)); |
336 | } |
337 | #if SIMDPP_USE_AVX |
338 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
339 | static float32<8> run(const float32<8>& a, const float32<8>& b) |
340 | { |
341 | __m256 ab1 = _mm256_shuffle_ps(a.native(), b.native(), |
342 | _MM_SHUFFLE(s1%4, s0%4, s1%4, s0%4)); |
343 | return _mm256_shuffle_ps(ab1, b.native(), |
344 | _MM_SHUFFLE(s3-4, s2-4, s1/4?3:1, s0/4?2:0)); |
345 | } |
346 | #endif |
347 | #if SIMDPP_USE_AVX512F |
348 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
349 | static float32<16> run(const float32<16>& a, const float32<16>& b) |
350 | { |
351 | __m512 ab1 = _mm512_shuffle_ps(a.native(), b.native(), |
352 | _MM_SHUFFLE(s1%4, s0%4, s1%4, s0%4)); |
353 | return _mm512_shuffle_ps(ab1, b.native(), |
354 | _MM_SHUFFLE(s3-4, s2-4, s1/4?3:1, s0/4?2:0)); |
355 | } |
356 | #endif |
357 | }; |
358 | |
359 | // is11_losh_shuffle2 = s2 < 4 && s3 < 4; |
360 | template<> struct shuffle_impl<11> { |
361 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
362 | static float32<4> run(const float32<4>& a, const float32<4>& b) |
363 | { |
364 | __m128 ab1 = _mm_shuffle_ps(a.native(), b.native(), |
365 | _MM_SHUFFLE(s1%4, s0%4, s1%4, s0%4)); |
366 | return _mm_shuffle_ps(ab1, a.native(), |
367 | _MM_SHUFFLE(s3, s2, s1/4?3:1, s0/4?2:0)); |
368 | } |
369 | #if SIMDPP_USE_AVX |
370 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
371 | static float32<8> run(const float32<8>& a, const float32<8>& b) |
372 | { |
373 | __m256 ab1 = _mm256_shuffle_ps(a.native(), b.native(), |
374 | _MM_SHUFFLE(s1%4, s0%4, s1%4, s0%4)); |
375 | return _mm256_shuffle_ps(ab1, a.native(), |
376 | _MM_SHUFFLE(s3, s2, s1/4?3:1, s0/4?2:0)); |
377 | } |
378 | #endif |
379 | #if SIMDPP_USE_AVX512F |
380 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
381 | static float32<16> run(const float32<16>& a, const float32<16>& b) |
382 | { |
383 | __m512 ab1 = _mm512_shuffle_ps(a.native(), b.native(), |
384 | _MM_SHUFFLE(s1%4, s0%4, s1%4, s0%4)); |
385 | return _mm512_shuffle_ps(ab1, a.native(), |
386 | _MM_SHUFFLE(s3, s2, s1/4?3:1, s0/4?2:0)); |
387 | } |
388 | #endif |
389 | }; |
390 | |
391 | |
392 | // is12_hibl_shuffle1 = s0 >= 4 && s1 >= 4 && s2 != s3; |
393 | template<> struct shuffle_impl<12> { |
394 | #if SIMDPP_USE_SSE4_1 |
395 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
396 | static float32<4> run(const float32<4>& a, const float32<4>& b) |
397 | { |
398 | __m128 ab2 = select2_hi<s2, s3>(a, b).native(); |
399 | return _mm_shuffle_ps(b.native(), ab2, |
400 | _MM_SHUFFLE(s3%4, s2%4, s1-4, s0-4)); |
401 | } |
402 | #endif |
403 | #if SIMDPP_USE_AVX |
404 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
405 | static float32<8> run(const float32<8>& a, const float32<8>& b) |
406 | { |
407 | __m256 ab2 = select2_hi<s2, s3>(a, b).native(); |
408 | return _mm256_shuffle_ps(b.native(), ab2, |
409 | _MM_SHUFFLE(s3%4, s2%4, s1-4, s0-4)); |
410 | } |
411 | #endif |
412 | #if SIMDPP_USE_AVX512F |
413 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
414 | static float32<16> run(const float32<16>& a, const float32<16>& b) |
415 | { |
416 | __m512 ab2 = select2_hi<s2, s3>(a, b).native(); |
417 | return _mm512_shuffle_ps(b.native(), ab2, |
418 | _MM_SHUFFLE(s3%4, s2%4, s1-4, s0-4)); |
419 | } |
420 | #endif |
421 | }; |
422 | |
423 | // is13_hibl_shuffle2 = s0 < 4 && s1 < 4 && s2 != s3; |
424 | template<> struct shuffle_impl<13> { |
425 | #if SIMDPP_USE_SSE4_1 |
426 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
427 | static float32<4> run(const float32<4>& a, const float32<4>& b) |
428 | { |
429 | __m128 ab2 = select2_hi<s2, s3>(a, b).native(); |
430 | return _mm_shuffle_ps(a.native(), ab2, |
431 | _MM_SHUFFLE(s3%4, s2%4, s1, s0)); |
432 | } |
433 | #endif |
434 | #if SIMDPP_USE_AVX |
435 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
436 | static float32<8> run(const float32<8>& a, const float32<8>& b) |
437 | { |
438 | __m256 ab2 = select2_hi<s2, s3>(a, b).native(); |
439 | return _mm256_shuffle_ps(a.native(), ab2, |
440 | _MM_SHUFFLE(s3%4, s2%4, s1, s0)); |
441 | } |
442 | #endif |
443 | #if SIMDPP_USE_AVX512F |
444 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
445 | static float32<16> run(const float32<16>& a, const float32<16>& b) |
446 | { |
447 | __m512 ab2 = select2_hi<s2, s3>(a, b).native(); |
448 | return _mm512_shuffle_ps(a.native(), ab2, |
449 | _MM_SHUFFLE(s3%4, s2%4, s1, s0)); |
450 | } |
451 | #endif |
452 | }; |
453 | |
454 | // is14_hish_shuffle1 = s0 >= 4 && s1 >= 4; |
455 | template<> struct shuffle_impl<14> { |
456 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
457 | static float32<4> run(const float32<4>& a, const float32<4>& b) |
458 | { |
459 | __m128 ab2 = _mm_shuffle_ps(a.native(), b.native(), |
460 | _MM_SHUFFLE(s3%4, s2%4, s3%4, s2%4)); |
461 | return _mm_shuffle_ps(b.native(), ab2, |
462 | _MM_SHUFFLE(s3/4?3:1, s2/4?2:0, s1-4, s0-4)); |
463 | } |
464 | #if SIMDPP_USE_AVX |
465 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
466 | static float32<8> run(const float32<8>& a, const float32<8>& b) |
467 | { |
468 | __m256 ab2 = _mm256_shuffle_ps(a.native(), b.native(), |
469 | _MM_SHUFFLE(s3%4, s2%4, s3%4, s2%4)); |
470 | return _mm256_shuffle_ps(b.native(), ab2, |
471 | _MM_SHUFFLE(s3/4?3:1, s2/4?2:0, s1-4, s0-4)); |
472 | } |
473 | #endif |
474 | #if SIMDPP_USE_AVX512F |
475 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
476 | static float32<16> run(const float32<16>& a, const float32<16>& b) |
477 | { |
478 | __m512 ab2 = _mm512_shuffle_ps(a.native(), b.native(), |
479 | _MM_SHUFFLE(s3%4, s2%4, s3%4, s2%4)); |
480 | return _mm512_shuffle_ps(b.native(), ab2, |
481 | _MM_SHUFFLE(s3/4?3:1, s2/4?2:0, s1-4, s0-4)); |
482 | } |
483 | #endif |
484 | }; |
485 | |
486 | // is15_hish_shuffle2: s0 < 4 && s1 < 4 |
487 | template<> struct shuffle_impl<15> { |
488 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
489 | static float32<4> run(const float32<4>& a, const float32<4>& b) |
490 | { |
491 | __m128 ab2 = _mm_shuffle_ps(a.native(), b.native(), |
492 | _MM_SHUFFLE(s3%4, s2%4, s3%4, s2%4)); |
493 | return _mm_shuffle_ps(a.native(), ab2, |
494 | _MM_SHUFFLE(s3/4?3:1, s2/4?2:0, s1, s0)); |
495 | } |
496 | #if SIMDPP_USE_AVX |
497 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
498 | static float32<8> run(const float32<8>& a, const float32<8>& b) |
499 | { |
500 | __m256 ab2 = _mm256_shuffle_ps(a.native(), b.native(), |
501 | _MM_SHUFFLE(s3%4, s2%4, s3%4, s2%4)); |
502 | return _mm256_shuffle_ps(a.native(), ab2, |
503 | _MM_SHUFFLE(s3/4?3:1, s2/4?2:0, s1, s0)); |
504 | } |
505 | #endif |
506 | #if SIMDPP_USE_AVX512F |
507 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
508 | static float32<16> run(const float32<16>& a, const float32<16>& b) |
509 | { |
510 | __m512 ab2 = _mm512_shuffle_ps(a.native(), b.native(), |
511 | _MM_SHUFFLE(s3%4, s2%4, s3%4, s2%4)); |
512 | return _mm512_shuffle_ps(a.native(), ab2, |
513 | _MM_SHUFFLE(s3/4?3:1, s2/4?2:0, s1, s0)); |
514 | } |
515 | #endif |
516 | }; |
517 | |
518 | // any |
519 | template<> struct shuffle_impl<16> { |
520 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
521 | static float32<4> run(const float32<4>& a, const float32<4>& b) |
522 | { |
523 | #if SIMDPP_USE_SSE4_1 |
524 | __m128 ap = _mm_shuffle_ps(a.native(), a.native(), |
525 | SIMDPP_SHUFFLE_MASK_4x4(s0%4,s1%4,s2%4,s3%4)); |
526 | __m128 bp = _mm_shuffle_ps(b.native(), b.native(), |
527 | SIMDPP_SHUFFLE_MASK_4x4(s0%4,s1%4,s2%4,s3%4)); |
528 | return _mm_blend_ps(ap, bp, SIMDPP_SHUFFLE_MASK_4x2(s0/4,s1/4,s2/4,s3/4)); |
529 | #else |
530 | __m128 ab1 = _mm_shuffle_ps(a.native(), b.native(), |
531 | _MM_SHUFFLE(s1%4, s0%4, s1%4, s0%4)); |
532 | __m128 ab2 = _mm_shuffle_ps(a.native(), b.native(), |
533 | _MM_SHUFFLE(s3%4, s2%4, s3%4, s2%4)); |
534 | return _mm_shuffle_ps(ab1, ab2, _MM_SHUFFLE(s3/4?3:1, s2/4?2:0, s1/4?3:1, s0/4?2:0)); |
535 | #endif |
536 | } |
537 | #if SIMDPP_USE_AVX |
538 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
539 | static float32<8> run(const float32<8>& a, const float32<8>& b) |
540 | { |
541 | __m256 ap = _mm256_shuffle_ps(a.native(), a.native(), |
542 | SIMDPP_SHUFFLE_MASK_4x4(s0%4,s1%4,s2%4,s3%4)); |
543 | __m256 bp = _mm256_shuffle_ps(b.native(), b.native(), |
544 | SIMDPP_SHUFFLE_MASK_4x4(s0%4,s1%4,s2%4,s3%4)); |
545 | return _mm256_blend_ps(ap, bp, SIMDPP_SHUFFLE_MASK_4x2_2(s0/4,s1/4,s2/4,s3/4)); |
546 | } |
547 | #endif |
548 | #if SIMDPP_USE_AVX512F |
549 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
550 | static float32<16> run(const float32<16>& a, const float32<16>& b) |
551 | { |
552 | __m512 ap = _mm512_shuffle_ps(a.native(), a.native(), |
553 | SIMDPP_SHUFFLE_MASK_4x4(s0%4,s1%4,s2%4,s3%4)); |
554 | const int mask = SIMDPP_SHUFFLE_MASK_4x2_4(s0/4,s1/4,s2/4,s3/4); |
555 | return _mm512_mask_shuffle_ps(ap, mask, b.native(), b.native(), |
556 | SIMDPP_SHUFFLE_MASK_4x4(s0%4,s1%4,s2%4,s3%4)); |
557 | } |
558 | #endif |
559 | }; |
560 | |
561 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N> |
562 | float32<N> do_shuffle(const float32<N>& a, const float32<N>& b) |
563 | { |
564 | return shuffle_impl<impl_selector<s0, s1, s2, s3>::impl>::template run<s0, s1, s2, s3>(a, b); |
565 | } |
566 | |
567 | } // namespace sse_shuffle4x2_float32 |
568 | } // namespace detail |
569 | } // namespace SIMDPP_ARCH_NAMESPACE |
570 | } // namespace simdpp |
571 | |
572 | #endif |
573 | #endif |
574 | |