/*  Copyright (C) 2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_PERMUTE_SSE_FLOAT64_4x2_H
#define LIBSIMDPP_SIMDPP_DETAIL_PERMUTE_SSE_FLOAT64_4x2_H

#ifndef LIBSIMDPP_SIMD_H
#error "This file must be included through simd.h"
#endif

#include <simdpp/setup_arch.h>
#include <simdpp/types.h>
#include <simdpp/core/combine.h>
#include <simdpp/core/split.h>
#include <simdpp/detail/insn/shuffle2x2.h>
#include <simdpp/detail/shuffle/shuffle_mask.h>

#if SIMDPP_USE_AVX

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace sse_shuffle4x2_float64 {

/*  The code below implements generalized permutations for 4-element sets
    within float64 vectors.
*/
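
/*  For illustration: the selectors s0..s3 pick elements from the
    concatenation of the two source vectors, 0..3 referring to elements of a
    and 4..7 to elements of b. E.g. with a = [a0 a1 a2 a3] and
    b = [b0 b1 b2 b3]:

        do_shuffle<0,4,2,6>(a, b) -> [a0 b0 a2 b2]  (zip_lo pattern)
        do_shuffle<1,5,3,7>(a, b) -> [a1 b1 a3 b3]  (zip_hi pattern)
*/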

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
struct impl_selector {

    // 0 1 2 3
    // 4 5 6 7
    static const bool is1_zip_lo1 = (s0==0 && s1==4 && s2==2 && s3==6);
    static const bool is2_zip_lo2 = (s0==4 && s1==0 && s2==6 && s3==2);
    static const bool is3_zip_hi1 = (s0==1 && s1==5 && s2==3 && s3==7);
    static const bool is4_zip_hi2 = (s0==5 && s1==1 && s2==7 && s3==3);
    static const bool is5_blend = (s0==0 || s0==4) && (s1==1 || s1==5) &&
                                  (s2==2 || s2==6) && (s3==3 || s3==7);
#if SIMDPP_USE_AVX512F
    static const bool is6_shuffle1 = false;
    static const bool is7_shuffle2 = false;
    static const bool is8_permx2var = true;
#elif SIMDPP_USE_AVX2
    // single shuffle_pd. Other cases handled by zip* and blend
    static const bool is6_shuffle1 = (s0==1 && s1==4 && s2==3 && s3==6);
    static const bool is7_shuffle2 = (s0==4 && s1==1 && s2==6 && s3==3);
    static const bool is8_permx2var = false;
#else
    static const bool is6_shuffle1 = (s0/2==0) && (s1/2==2) && (s2/2==1) && (s3/2==3);
    static const bool is7_shuffle2 = (s0/2==2) && (s1/2==0) && (s2/2==3) && (s3/2==1);
    static const bool is8_permx2var = false;
#endif
#if SIMDPP_USE_AVX2
    static const bool is9_perm_blend = true;
#else
    static const bool is9_perm_blend = false;
#endif
    static const int impl = is1_zip_lo1 ? 1 :
                            is2_zip_lo2 ? 2 :
                            is3_zip_hi1 ? 3 :
                            is4_zip_hi2 ? 4 :
                            is5_blend ? 5 :
                            is6_shuffle1 ? 6 :
                            is7_shuffle2 ? 7 :
                            is8_permx2var ? 8 :
                            is9_perm_blend ? 9 : 10;
};
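
// For illustration: <0,5,2,7> satisfies only is5_blend, so
// impl_selector<0,5,2,7>::impl == 5. A pattern that matches none of the
// special cases, e.g. <3,0,5,6>, selects impl == 10 on plain AVX, 9 on AVX2
// and 8 on AVX512F.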

template<unsigned N> struct shuffle_impl {};

// zip_lo1
template<> struct shuffle_impl<1> {
    template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
    static float64<4> run(const float64<4>& a, const float64<4>& b)
    {
        return _mm256_unpacklo_pd(a.native(), b.native());
    }
#if SIMDPP_USE_AVX512F
    template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
    static float64<8> run(const float64<8>& a, const float64<8>& b)
    {
        return _mm512_unpacklo_pd(a.native(), b.native());
    }
#endif
};

// zip_lo2
template<> struct shuffle_impl<2> {
    template<unsigned, unsigned, unsigned, unsigned, unsigned N> SIMDPP_INL
    static float64<N> run(const float64<N>& a, const float64<N>& b)
    {
        return shuffle_impl<1>::run<0,0,0,0>(b, a);
    }
};

// zip_hi1
template<> struct shuffle_impl<3> {
    template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
    static float64<4> run(const float64<4>& a, const float64<4>& b)
    {
        return _mm256_unpackhi_pd(a.native(), b.native());
    }
#if SIMDPP_USE_AVX512F
    template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
    static float64<8> run(const float64<8>& a, const float64<8>& b)
    {
        return _mm512_unpackhi_pd(a.native(), b.native());
    }
#endif
};

// zip_hi2
template<> struct shuffle_impl<4> {
    template<unsigned, unsigned, unsigned, unsigned, unsigned N> SIMDPP_INL
    static float64<N> run(const float64<N>& a, const float64<N>& b)
    {
        return shuffle_impl<3>::run<0,0,0,0>(b, a);
    }
};

// is5_blend
template<> struct shuffle_impl<5> {
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float64<4> run(const float64<4>& a, const float64<4>& b)
    {
        const unsigned mask = (s0<4 ? 0 : 1) | (s1<4 ? 0 : 2) | (s2<4 ? 0 : 4) | (s3<4 ? 0 : 8);
        return _mm256_blend_pd(a.native(), b.native(), mask);
    }
#if SIMDPP_USE_AVX512F
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float64<8> run(const float64<8>& a, const float64<8>& b)
    {
        const unsigned mask = (s0<4 ? 0 : 1) | (s1<4 ? 0 : 2) | (s2<4 ? 0 : 4) | (s3<4 ? 0 : 8);
        const unsigned mask2 = mask | mask << 4;
        return _mm512_mask_blend_pd(mask2, a.native(), b.native());
    }
#endif
};
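
// For illustration: each bit of the blend mask selects the corresponding
// element from b when set, from a otherwise. E.g. for <0,5,2,7> the mask is
// 0b1010, i.e. elements 1 and 3 are taken from b. For float64<8> the same
// 4-element pattern applies to both halves, hence mask | mask << 4.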

// is6_shuffle1 (only AVX-AVX2)
template<> struct shuffle_impl<6> {
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float64<4> run(const float64<4>& a, const float64<4>& b)
    {
        if (s0%2 != s2%2 || s1%2 != s3%2) {
            // the shuffle mask differs between the 128-bit lanes, while
            // SIMDPP_SHUFFLE_MASK_2x2_2 replicates one 2-bit pattern to both;
            // compute the lower lane separately and insert it into the result
            __m128d a1 = _mm256_castpd256_pd128(a.native());
            __m128d b1 = _mm256_castpd256_pd128(b.native());
            a1 = _mm_shuffle_pd(a1, b1, SIMDPP_SHUFFLE_MASK_2x2(s0, s1-4));
            __m256d t = _mm256_shuffle_pd(a.native(), b.native(),
                                          SIMDPP_SHUFFLE_MASK_2x2_2(s2-2, s3-6));
            t = _mm256_insertf128_pd(t, a1, 0);
            return t;
        } else {
            return _mm256_shuffle_pd(a.native(), b.native(),
                                     SIMDPP_SHUFFLE_MASK_2x2_2(s0, s1-4));
        }
    }
};

// is7_shuffle2 (only AVX-AVX2)
template<> struct shuffle_impl<7> {
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float64<4> run(const float64<4>& a, const float64<4>& b)
    {
        if (s0%2 != s2%2 || s1%2 != s3%2) {
            // same lower-lane trick as in shuffle_impl<6>, with the operands
            // swapped: the first mask argument selects within b, the second
            // within a
            __m128d a1 = _mm256_castpd256_pd128(a.native());
            __m128d b1 = _mm256_castpd256_pd128(b.native());
            a1 = _mm_shuffle_pd(b1, a1, SIMDPP_SHUFFLE_MASK_2x2(s0-4, s1));
            __m256d t = _mm256_shuffle_pd(b.native(), a.native(),
                                          SIMDPP_SHUFFLE_MASK_2x2_2(s2-6, s3-2));
            t = _mm256_insertf128_pd(t, a1, 0);
            return t;
        } else {
            return _mm256_shuffle_pd(b.native(), a.native(),
                                     SIMDPP_SHUFFLE_MASK_2x2_2(s0-4, s1));
        }
    }
};

// is8_permx2var
#if SIMDPP_USE_AVX512F
template<> struct shuffle_impl<8> {

    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float64<4> run(const float64<4>& a, const float64<4>& b)
    {
        // FIXME: GCC BUG
        // uint8<16> mask = make_uint(s0, s1, s2, s3);
        // return _mm256_permutex2var_pd(a.native(), _mm256_cvtepi8_epi64(mask.native()), b.native());

        // Work around via the 512-bit intrinsic. There, bit 3 of each index
        // selects the second source vector, so selectors 4..7 (elements of b)
        // must be remapped to 8..11.
        const unsigned p0 = s0<4 ? s0 : s0+4;
        const unsigned p1 = s1<4 ? s1 : s1+4;
        const unsigned p2 = s2<4 ? s2 : s2+4;
        const unsigned p3 = s3<4 ? s3 : s3+4;
        uint8<16> mask = make_uint(p0, p1, p2, p3);
        return _mm512_castpd512_pd256(_mm512_permutex2var_pd(_mm512_castpd256_pd512(a.native()),
                                                             _mm512_cvtepi8_epi64(mask.native()),
                                                             _mm512_castpd256_pd512(b.native())));
    }

    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float64<8> run(const float64<8>& a, const float64<8>& b)
    {
        const unsigned p0 = s0<4 ? s0 : s0+4;
        const unsigned p1 = s1<4 ? s1 : s1+4;
        const unsigned p2 = s2<4 ? s2 : s2+4;
        const unsigned p3 = s3<4 ? s3 : s3+4;
        // the permutation pattern repeats within each 4-element half
        uint8<16> mask = make_uint(p0, p1, p2, p3, p0+4, p1+4, p2+4, p3+4);
        return _mm512_permutex2var_pd(a.native(), _mm512_cvtepi8_epi64(mask.native()), b.native());
    }
};
#endif

// is9_perm_blend
#if SIMDPP_USE_AVX2
template<> struct shuffle_impl<9> {
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float64<4> run(const float64<4>& a, const float64<4>& b)
    {
        const unsigned bl_mask = (s0<4 ? 0 : 1) | (s1<4 ? 0 : 2) | (s2<4 ? 0 : 4) | (s3<4 ? 0 : 8);
        __m256d ta = _mm256_permute4x64_pd(a.native(), SIMDPP_SHUFFLE_MASK_4x4(s0%4, s1%4, s2%4, s3%4));
        __m256d tb = _mm256_permute4x64_pd(b.native(), SIMDPP_SHUFFLE_MASK_4x4(s0%4, s1%4, s2%4, s3%4));
        return _mm256_blend_pd(ta, tb, bl_mask);
    }
};
#endif
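
// For illustration: shuffle_impl<9> permutes each source into the requested
// element order and blends the results. E.g. for <4,1,6,3> both sources are
// permuted with the pattern (0,1,2,3) (a no-op in this case) and the blend
// mask 0b0101 takes elements 0 and 2 from the permuted b.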

template<unsigned s0, unsigned s1, class V> SIMDPP_INL
V i_shuffle_emul_64x4_half(const V& a0, const V& a1, const V& b0, const V& b1)
{
    const V& h0 = s0 < 2 ? a0 :
                  s0 < 4 ? a1 :
                  s0 < 6 ? b0 : b1;
    const V& h1 = s1 < 2 ? a0 :
                  s1 < 4 ? a1 :
                  s1 < 6 ? b0 : b1;
    return insn::i_shuffle2x2<s0%2, s1%2+2>(h0, h1);
}
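
// For illustration: each half of the result depends on at most two of the
// four source halves. E.g. for s0 = 1, s1 = 6: h0 = a0, h1 = b1, and
// i_shuffle2x2<1,2> takes element 1 of h0 and element 0 of h1.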

// any (only on AVX)
template<> struct shuffle_impl<10> {
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float64<4> run(const float64<4>& a, const float64<4>& b)
    {
        float64<2> a0, a1, b0, b1, r0, r1;
        split(a, a0, a1);
        split(b, b0, b1);
        r0 = i_shuffle_emul_64x4_half<s0,s1>(a0, a1, b0, b1);
        r1 = i_shuffle_emul_64x4_half<s2,s3>(a0, a1, b0, b1);
        return combine(r0, r1);
    }
};

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N> SIMDPP_INL
float64<N> do_shuffle(const float64<N>& a, const float64<N>& b)
{
    return shuffle_impl<impl_selector<s0, s1, s2, s3>::impl>::template run<s0, s1, s2, s3>(a, b);
}
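
// Example usage (illustrative; this detail function is normally reached
// through the public shuffle4x2 API rather than called directly):
//
//     float64<4> r = do_shuffle<1,5,3,7>(x, y); // [x1 y1 x3 y3], zip_hi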

} // namespace sse_shuffle4x2_float64
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif
#endif