/* Copyright (C) 2014 Povilas Kanapickas <povilas@radix.lt>

   Distributed under the Boost Software License, Version 1.0.
   (See accompanying file LICENSE_1_0.txt or copy at
   http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_PERMUTE_SSE_FLOAT64_4x2_H
#define LIBSIMDPP_SIMDPP_DETAIL_PERMUTE_SSE_FLOAT64_4x2_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/setup_arch.h>
#include <simdpp/types.h>
#include <simdpp/core/combine.h>
#include <simdpp/core/split.h>
#include <simdpp/detail/insn/shuffle2x2.h>
#include <simdpp/detail/shuffle/shuffle_mask.h>

#if SIMDPP_USE_AVX

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace sse_shuffle4x2_float64 {

/*  The code below implements generalized permutations of 4-element sets
    within float64 vectors.
*/
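/*  Selector convention: values 0..3 address elements of the first vector
    (a0..a3) and values 4..7 address elements of the second vector (b0..b3).
    For instance, the selector set <0,4,2,6> requests { a0, b0, a2, b2 }.
*/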

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
struct impl_selector {

    // 0 1 2 3
    // 4 5 6 7
    static const bool is1_zip_lo1 = (s0==0 && s1==4 && s2==2 && s3==6);
    static const bool is2_zip_lo2 = (s0==4 && s1==0 && s2==6 && s3==2);
    static const bool is3_zip_hi1 = (s0==1 && s1==5 && s2==3 && s3==7);
    static const bool is4_zip_hi2 = (s0==5 && s1==1 && s2==7 && s3==3);
    static const bool is5_blend = (s0==0 || s0==4) && (s1==1 || s1==5) &&
                                  (s2==2 || s2==6) && (s3==3 || s3==7);
#if SIMDPP_USE_AVX512F
    static const bool is6_shuffle1 = false;
    static const bool is7_shuffle2 = false;
    static const bool is8_permx2var = true;
#elif SIMDPP_USE_AVX2
    // single shuffle_pd. Other cases handled by zip* and blend
    static const bool is6_shuffle1 = (s0==1 && s1==4 && s2==3 && s3==6);
    static const bool is7_shuffle2 = (s0==4 && s1==1 && s2==6 && s3==3);
    static const bool is8_permx2var = false;
#else
    static const bool is6_shuffle1 = (s0/2==0) && (s1/2==2) && (s2/2==1) && (s3/2==3);
    static const bool is7_shuffle2 = (s0/2==2) && (s1/2==0) && (s2/2==3) && (s3/2==1);
    static const bool is8_permx2var = false;
#endif
#if SIMDPP_USE_AVX2
    static const bool is9_perm_blend = true;
#else
    static const bool is9_perm_blend = false;
#endif
    static const int impl = is1_zip_lo1 ? 1 :
                            is2_zip_lo2 ? 2 :
                            is3_zip_hi1 ? 3 :
                            is4_zip_hi2 ? 4 :
                            is5_blend ? 5 :
                            is6_shuffle1 ? 6 :
                            is7_shuffle2 ? 7 :
                            is8_permx2var ? 8 :
                            is9_perm_blend ? 9 : 10;
};
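
// For example, <0,4,2,6> matches is1_zip_lo1 and dispatches to impl 1, while
// <0,5,2,7> matches only is5_blend and dispatches to impl 5. impl 10 is the
// generic fallback (used only on plain AVX) for selector patterns that have
// no single-instruction implementation.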

template<unsigned N> struct shuffle_impl {};

// zip_lo1
template<> struct shuffle_impl<1> {
    template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
    static float64<4> run(const float64<4>& a, const float64<4>& b)
    {
        return _mm256_unpacklo_pd(a.native(), b.native());
    }
#if SIMDPP_USE_AVX512F
    template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
    static float64<8> run(const float64<8>& a, const float64<8>& b)
    {
        return _mm512_unpacklo_pd(a.native(), b.native());
    }
#endif
};
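
// For instance, run<0,4,2,6> returns { a0, b0, a2, b2 }: unpacklo_pd
// interleaves the low elements of each 128-bit lane of a and b.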

// zip_lo2
template<> struct shuffle_impl<2> {
    template<unsigned, unsigned, unsigned, unsigned, unsigned N> SIMDPP_INL
    static float64<N> run(const float64<N>& a, const float64<N>& b)
    {
        return shuffle_impl<1>::run<0,0,0,0>(b, a);
    }
};

// zip_hi1
template<> struct shuffle_impl<3> {
    template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
    static float64<4> run(const float64<4>& a, const float64<4>& b)
    {
        return _mm256_unpackhi_pd(a.native(), b.native());
    }
#if SIMDPP_USE_AVX512F
    template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
    static float64<8> run(const float64<8>& a, const float64<8>& b)
    {
        return _mm512_unpackhi_pd(a.native(), b.native());
    }
#endif
};

// zip_hi2
template<> struct shuffle_impl<4> {
    template<unsigned, unsigned, unsigned, unsigned, unsigned N> SIMDPP_INL
    static float64<N> run(const float64<N>& a, const float64<N>& b)
    {
        return shuffle_impl<3>::run<0,0,0,0>(b, a);
    }
};

// is5_blend
template<> struct shuffle_impl<5> {
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float64<4> run(const float64<4>& a, const float64<4>& b)
    {
        const unsigned mask = (s0<4 ? 0 : 1) | (s1<4 ? 0 : 2) | (s2<4 ? 0 : 4) | (s3<4 ? 0 : 8);
        return _mm256_blend_pd(a.native(), b.native(), mask);
    }
#if SIMDPP_USE_AVX512F
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float64<8> run(const float64<8>& a, const float64<8>& b)
    {
        const unsigned mask = (s0<4 ? 0 : 1) | (s1<4 ? 0 : 2) | (s2<4 ? 0 : 4) | (s3<4 ? 0 : 8);
        const unsigned mask2 = mask | mask << 4;
        return _mm512_mask_blend_pd(mask2, a.native(), b.native());
    }
#endif
};
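
// For instance, run<0,5,2,7> computes mask = 0b1010, so elements 1 and 3 are
// taken from b, yielding { a0, b1, a2, b3 }.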

// is6_shuffle1 (only AVX-AVX2)
template<> struct shuffle_impl<6> {
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float64<4> run(const float64<4>& a, const float64<4>& b)
    {
        if (s0%2 != s2%2 || s1%2 != s3%2) {
            __m128d a1 = _mm256_castpd256_pd128(a.native());
            __m128d b1 = _mm256_castpd256_pd128(b.native());
            a1 = _mm_shuffle_pd(a1, b1, SIMDPP_SHUFFLE_MASK_2x2(s0, s1-4));
            __m256d t = _mm256_shuffle_pd(a.native(), b.native(),
                                          SIMDPP_SHUFFLE_MASK_2x2_2(s2-2, s3-6));
            t = _mm256_insertf128_pd(t, a1, 0);
            return t;
        } else {
            return _mm256_shuffle_pd(a.native(), b.native(),
                                     SIMDPP_SHUFFLE_MASK_2x2_2(s0, s1-4));
        }
    }
};
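
// Note: SIMDPP_SHUFFLE_MASK_2x2_2 replicates one two-bit selection to both
// 128-bit lanes. The branch above handles selections that differ between the
// lanes modulo 2: the 256-bit shuffle is built for lane 1, and lane 0 is
// recomputed with a separate 128-bit shuffle and reinserted.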

// is7_shuffle2 (only AVX-AVX2)
template<> struct shuffle_impl<7> {
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float64<4> run(const float64<4>& a, const float64<4>& b)
    {
        if (s0%2 != s2%2 || s1%2 != s3%2) {
            __m128d a1 = _mm256_castpd256_pd128(a.native());
            __m128d b1 = _mm256_castpd256_pd128(b.native());
            a1 = _mm_shuffle_pd(b1, a1, SIMDPP_SHUFFLE_MASK_2x2(s0-4, s1));
            __m256d t = _mm256_shuffle_pd(b.native(), a.native(),
                                          SIMDPP_SHUFFLE_MASK_2x2_2(s2-6, s3-2));
            t = _mm256_insertf128_pd(t, a1, 0);
            return t;
        } else {
            return _mm256_shuffle_pd(b.native(), a.native(),
                                     SIMDPP_SHUFFLE_MASK_2x2_2(s0-4, s1));
        }
    }
};
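
// shuffle_impl<7> is shuffle_impl<6> with the operands exchanged: here s0 and
// s2 select from b while s1 and s3 select from a, so the shuffle_pd operands
// and the mask arguments are mirrored relative to shuffle_impl<6>.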

// is8_permx2var
#if SIMDPP_USE_AVX512F
template<> struct shuffle_impl<8> {

    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float64<4> run(const float64<4>& a, const float64<4>& b)
    {
        // in the 512-bit index space the elements of b are addressed by
        // indices 8..11, so selectors 4..7 must be remapped
        const unsigned p0 = s0<4 ? s0 : s0+4;
        const unsigned p1 = s1<4 ? s1 : s1+4;
        const unsigned p2 = s2<4 ? s2 : s2+4;
        const unsigned p3 = s3<4 ? s3 : s3+4;
        uint8<16> mask = make_uint(p0, p1, p2, p3);
        // FIXME: GCC BUG
        // return _mm256_permutex2var_pd(a.native(), _mm256_cvtepi8_epi64(mask.native()), b.native());
        return _mm512_castpd512_pd256(_mm512_permutex2var_pd(_mm512_castpd256_pd512(a.native()),
                                                             _mm512_cvtepi8_epi64(mask.native()),
                                                             _mm512_castpd256_pd512(b.native())));
    }

    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float64<8> run(const float64<8>& a, const float64<8>& b)
    {
        const unsigned p0 = s0<4 ? s0 : s0+4;
        const unsigned p1 = s1<4 ? s1 : s1+4;
        const unsigned p2 = s2<4 ? s2 : s2+4;
        const unsigned p3 = s3<4 ? s3 : s3+4;
        uint8<16> mask = make_uint(p0, p1, p2, p3, p0+4, p1+4, p2+4, p3+4);
        return _mm512_permutex2var_pd(a.native(), _mm512_cvtepi8_epi64(mask.native()), b.native());
    }
};
#endif
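
// For instance, run<2,4,1,7> on float64<4> builds the index vector
// { 2, 8, 1, 11 }: in the permutex2var index space the first operand
// occupies indices 0..7 and the second operand indices 8..15.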

// is9_perm_blend
#if SIMDPP_USE_AVX2
template<> struct shuffle_impl<9> {
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float64<4> run(const float64<4>& a, const float64<4>& b)
    {
        const unsigned bl_mask = (s0<4 ? 0 : 1) | (s1<4 ? 0 : 2) | (s2<4 ? 0 : 4) | (s3<4 ? 0 : 8);
        __m256d ta = _mm256_permute4x64_pd(a.native(), SIMDPP_SHUFFLE_MASK_4x4(s0%4, s1%4, s2%4, s3%4));
        __m256d tb = _mm256_permute4x64_pd(b.native(), SIMDPP_SHUFFLE_MASK_4x4(s0%4, s1%4, s2%4, s3%4));
        return _mm256_blend_pd(ta, tb, bl_mask);
    }
};
#endif
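
// For instance, on AVX2 run<3,0,6,5> permutes both a and b by (3,0,2,1) and
// blends with mask 0b1100, yielding { a3, a0, b2, b1 }.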

template<unsigned s0, unsigned s1, class V> SIMDPP_INL
V i_shuffle_emul_64x4_half(const V& a0, const V& a1, const V& b0, const V& b1)
{
    const V& h0 = s0 < 2 ? a0 :
                  s0 < 4 ? a1 :
                  s0 < 6 ? b0 : b1;
    const V& h1 = s1 < 2 ? a0 :
                  s1 < 4 ? a1 :
                  s1 < 6 ? b0 : b1;
    return insn::i_shuffle2x2<s0%2, s1%2+2>(h0, h1);
}
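
// Each selector addresses one element of the four 128-bit halves a0, a1, b0,
// b1: the half holding that element is bound by reference, and the 2x2
// shuffle picks element s0%2 from h0 and element s1%2 from h1 (in
// i_shuffle2x2, selectors 2..3 address the second operand).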

// any (only on AVX)
template<> struct shuffle_impl<10> {
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float64<4> run(const float64<4>& a, const float64<4>& b)
    {
        float64<2> a0, a1, b0, b1, r0, r1;
        split(a, a0, a1);
        split(b, b0, b1);
        r0 = i_shuffle_emul_64x4_half<s0,s1>(a0, a1, b0, b1);
        r1 = i_shuffle_emul_64x4_half<s2,s3>(a0, a1, b0, b1);
        return combine(r0, r1);
    }
};

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N> SIMDPP_INL
float64<N> do_shuffle(const float64<N>& a, const float64<N>& b)
{
    return shuffle_impl<impl_selector<s0, s1, s2, s3>::impl>::template run<s0, s1, s2, s3>(a, b);
}
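
// Usage sketch (hypothetical direct call; this helper is normally reached
// through the library's shuffle4x2 implementation):
//     float64<4> r = do_shuffle<0,4,2,6>(a, b); // r = { a0, b0, a2, b2 }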

} // namespace sse_shuffle4x2_float64
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif
#endif