sse_float64_4x2.h source code [bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/detail/shuffle/sse_float64_4x2.h]

/*  Copyright (C) 2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/
7
8	#ifndef LIBSIMDPP_SIMDPP_DETAIL_PERMUTE_SSE_FLOAT64_4x2_H
9	#define LIBSIMDPP_SIMDPP_DETAIL_PERMUTE_SSE_FLOAT64_4x2_H
10
11	#ifndef LIBSIMDPP_SIMD_H
12	#error "This file must be included through simd.h"
13	#endif
14
15	#include <simdpp/setup_arch.h>
16	#include <simdpp/types.h>
17	#include <simdpp/core/combine.h>
18	#include <simdpp/core/split.h>
19	#include <simdpp/detail/insn/shuffle2x2.h>
20	#include <simdpp/detail/shuffle/shuffle_mask.h>
21
22	#if SIMDPP_USE_AVX
23
24	namespace simdpp {
25	namespace SIMDPP_ARCH_NAMESPACE {
26	namespace detail {
27	namespace sse_shuffle4x2_float64 {
28
/*  The code below implements generalized permutations for 4-element sets
    within float64 vectors.
*/
32
// Compile-time choice of the shuffle_impl specialization that handles the
// selector (s0, s1, s2, s3). The candidate predicates are evaluated in
// numeric order and the first one that holds wins; `impl` is 10 when none
// of them matches.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
struct impl_selector {

    // Selector numbering:
    //   0 1 2 3
    //   4 5 6 7

    // Fixed interleave patterns handled by the zip (unpack) implementations.
    static const bool is1_zip_lo1 = (s0 == 0) && (s1 == 4) && (s2 == 2) && (s3 == 6);
    static const bool is2_zip_lo2 = (s0 == 4) && (s1 == 0) && (s2 == 6) && (s3 == 2);
    static const bool is3_zip_hi1 = (s0 == 1) && (s1 == 5) && (s2 == 3) && (s3 == 7);
    static const bool is4_zip_hi2 = (s0 == 5) && (s1 == 1) && (s2 == 7) && (s3 == 3);

    // Every element keeps its position i; only the source (i or i+4) varies.
    // Clearing bit 2 of the selector maps both i and i+4 onto i.
    static const bool is5_blend = ((s0 & ~4u) == 0) && ((s1 & ~4u) == 1) &&
                                  ((s2 & ~4u) == 2) && ((s3 & ~4u) == 3);

#if SIMDPP_USE_AVX512F
    // With AVX512F everything not matched above goes to the permutex2var path.
    static const bool is6_shuffle1 = false;
    static const bool is7_shuffle2 = false;
    static const bool is8_permx2var = true;
#elif SIMDPP_USE_AVX2
    // Only the exact patterns listed here go to the shuffle_pd
    // implementations on this path.
    // NOTE(review): (4,1,6,3) also satisfies is5_blend, which is tested
    // earlier in the `impl` chain, so impl 7 is never selected here.
    static const bool is6_shuffle1 = (s0 == 1) && (s1 == 4) && (s2 == 3) && (s3 == 6);
    static const bool is7_shuffle2 = (s0 == 4) && (s1 == 1) && (s2 == 6) && (s3 == 3);
    static const bool is8_permx2var = false;
#else
    // s/2 identifies the selector pair {0,1}, {2,3}, {4,5} or {6,7}.
    static const bool is6_shuffle1 = ((s0 >> 1) == 0) && ((s1 >> 1) == 2) &&
                                     ((s2 >> 1) == 1) && ((s3 >> 1) == 3);
    static const bool is7_shuffle2 = ((s0 >> 1) == 2) && ((s1 >> 1) == 0) &&
                                     ((s2 >> 1) == 3) && ((s3 >> 1) == 1);
    static const bool is8_permx2var = false;
#endif

    // The permute-and-blend implementation exists only when AVX2 is enabled.
#if SIMDPP_USE_AVX2
    static const bool is9_perm_blend = true;
#else
    static const bool is9_perm_blend = false;
#endif

    // First matching predicate decides; 10 is the catch-all emulation.
    static const int impl =
        is1_zip_lo1    ? 1 :
        is2_zip_lo2    ? 2 :
        is3_zip_hi1    ? 3 :
        is4_zip_hi2    ? 4 :
        is5_blend      ? 5 :
        is6_shuffle1   ? 6 :
        is7_shuffle2   ? 7 :
        is8_permx2var  ? 8 :
        is9_perm_blend ? 9 : 10;
};
73
// Primary template: intentionally empty. Each supported implementation
// number gets an explicit specialization that provides a static run().
template<unsigned N>
struct shuffle_impl {
};
75
76	// zip_lo1
77	template<> struct shuffle_impl<`1`> {
78	template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
79	static float64<`4`> run(const float64<`4`>& a, const float64<`4`>& b)
80	{
81	return _mm256_unpacklo_pd(a.native(), b.native());
82	}
83	#if SIMDPP_USE_AVX512F
84	template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
85	static float64<`8`> run(const float64<`8`>& a, const float64<`8`>& b)
86	{
87	return _mm512_unpacklo_pd(a.native(), b.native());
88	}
89	#endif
90	};
91
92	// zip_lo2
93	template<> struct shuffle_impl<`2`> {
94	template<unsigned, unsigned, unsigned, unsigned, unsigned N> SIMDPP_INL
95	static float64<N> run(const float64<N>& a, const float64<N>& b)
96	{
97	return shuffle_impl<`1`>::run<`0`,`0`,`0`,`0`>(b, a);
98	}
99	};
100
101	// zip_hi1
102	template<> struct shuffle_impl<`3`> {
103	template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
104	static float64<`4`> run(const float64<`4`>& a, const float64<`4`>& b)
105	{
106	return _mm256_unpackhi_pd(a.native(), b.native());
107	}
108	#if SIMDPP_USE_AVX512F
109	template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
110	static float64<`8`> run(const float64<`8`>& a, const float64<`8`>& b)
111	{
112	return _mm512_unpackhi_pd(a.native(), b.native());
113	}
114	#endif
115	};
116
117	// zip_hi2
118	template<> struct shuffle_impl<`4`> {
119	template<unsigned, unsigned, unsigned, unsigned, unsigned N> SIMDPP_INL
120	static float64<N> run(const float64<N>& a, const float64<N>& b)
121	{
122	return shuffle_impl<`3`>::run<`0`,`0`,`0`,`0`>(b, a);
123	}
124	};
125
126	// is5_blend
127	template<> struct shuffle_impl<`5`> {
128	template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
129	static float64<`4`> run(const float64<`4`>& a, const float64<`4`>& b)
130	{
131	const unsigned mask = (s0<`4` ? `0` : `1`) \| (s1<`4` ? `0` : `2`) \| (s2<`4` ? `0` : `4`) \| (s3<`4` ? `0` : `8`);
132	return _mm256_blend_pd(a.native(), b.native(), mask);
133	}
134	#if SIMDPP_USE_AVX512F
135	template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
136	static float64<`8`> run(const float64<`8`>& a, const float64<`8`>& b)
137	{
138	const unsigned mask = (s0<`4` ? `0` : `1`) \| (s1<`4` ? `0` : `2`) \| (s2<`4` ? `0` : `4`) \| (s3<`4` ? `0` : `8`);
139	const unsigned mask2 = mask \| mask << `4`;
140	return _mm512_mask_blend_pd(mask2, a.native(), b.native());
141	}
142	#endif
143	};
144
145	// is6_shuffle1 (only AVX-AVX2)
146	template<> struct shuffle_impl<`6`> {
147	template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
148	static float64<`4`> run(const float64<`4`>& a, const float64<`4`>& b)
149	{
150	if (s0%`2` != s2%`2` \|\| s1%`2` != s3%`2`) {
151	__m128d a1 = _mm256_castpd256_pd128(a.native());
152	__m128d b1 = _mm256_castpd256_pd128(b.native());
153	a1 = _mm_shuffle_pd(a1, b1, SIMDPP_SHUFFLE_MASK_2x2(s0, s1-`4`));
154	__m256d t = _mm256_shuffle_pd(a.native(), b.native(),
155	SIMDPP_SHUFFLE_MASK_2x2_2(s2-`2`,s3-`6`));
156	t = _mm256_insertf128_pd(t, a1, `0`);
157	return t;
158	} else {
159	return _mm256_shuffle_pd(a.native(), b.native(),
160	SIMDPP_SHUFFLE_MASK_2x2_2(s0, s1-`4`));
161	}
162	}
163	};
164
165	// is7_shuffle2 (only AVX-AVX2)
166	template<> struct shuffle_impl<`7`> {
167	template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
168	static float64<`4`> run(const float64<`4`>& a, const float64<`4`>& b)
169	{
170	if (s0%`2` != s2%`2` \|\| s1%`2` != s3%`2`) {
171	__m128d a1 = _mm256_castpd256_pd128(a.native());
172	__m128d b1 = _mm256_castpd256_pd128(b.native());
173	a1 = _mm_shuffle_pd(b1, a1, SIMDPP_SHUFFLE_MASK_2x2(s1,s0-`4`));
174	__m256d t = _mm256_shuffle_pd(b.native(), a.native(),
175	SIMDPP_SHUFFLE_MASK_2x2_2(s3-`2`,s2-`6`));
176	t = _mm256_insertf128_pd(t, a1, `0`);
177	return t;
178	} else {
179	return _mm256_shuffle_pd(b.native(), a.native(),
180	SIMDPP_SHUFFLE_MASK_2x2_2(s1,s0-`4`));
181	}
182	}
183	};
184
// is8_perm2xvar: arbitrary two-source permutation through
// _mm512_permutex2var_pd. Only available with AVX512F.
#if SIMDPP_USE_AVX512F
template<> struct shuffle_impl<8> {

    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float64<4> run(const float64<4>& a, const float64<4>& b)
    {
        // Both operands are widened to 512 bits, so the 512-bit index
        // encoding applies: bits [2:0] pick the element and bit 3 picks the
        // second source. A selector s >= 4 (element s-4 of b) therefore has
        // to become s+4. Passing it unchanged would address elements 4..7 of
        // the widened `a`, whose contents are undefined after the cast.
        const unsigned p0 = s0<4 ? s0 : s0+4;
        const unsigned p1 = s1<4 ? s1 : s1+4;
        const unsigned p2 = s2<4 ? s2 : s2+4;
        const unsigned p3 = s3<4 ? s3 : s3+4;
        uint8<16> mask = make_uint(p0, p1, p2, p3);
        // The 256-bit _mm256_permutex2var_pd form is deliberately not used;
        // the original code marks it with "FIXME: GCC BUG".
        return _mm512_castpd512_pd256(_mm512_permutex2var_pd(_mm512_castpd256_pd512(a.native()),
                                                             _mm512_cvtepi8_epi64(mask.native()),
                                                             _mm512_castpd256_pd512(b.native())));
    }

    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float64<8> run(const float64<8>& a, const float64<8>& b)
    {
        // Selectors >= 4 refer to b: move them to 8..11 so that bit 3 (source
        // select) is set and bits [2:0] hold the element index within b.
        const unsigned p0 = s0<4 ? s0 : s0+4;
        const unsigned p1 = s1<4 ? s1 : s1+4;
        const unsigned p2 = s2<4 ? s2 : s2+4;
        const unsigned p3 = s3<4 ? s3 : s3+4;
        // Elements 4..7 repeat the same pattern, offset by four positions.
        uint8<16> mask = make_uint(p0, p1, p2, p3, p0+4, p1+4, p2+4, p3+4);
        return _mm512_permutex2var_pd(a.native(), _mm512_cvtepi8_epi64(mask.native()), b.native());
    }
};
#endif
212
// is9_perm_blend: permute a and b identically, then blend the two results.
// Only available with AVX2.
#if SIMDPP_USE_AVX2
template<> struct shuffle_impl<9> {
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static float64<4> run(const float64<4>& a, const float64<4>& b)
    {
        // Element index within its source vector, identical for a and b.
        const unsigned perm_mask = SIMDPP_SHUFFLE_MASK_4x4(s0%4, s1%4, s2%4, s3%4);
        // Bit i is set when result element i has to come from b.
        const unsigned bl_mask = (s0 >= 4 ? 1u : 0u) | (s1 >= 4 ? 2u : 0u) |
                                 (s2 >= 4 ? 4u : 0u) | (s3 >= 4 ? 8u : 0u);

        const __m256d from_a = _mm256_permute4x64_pd(a.native(), perm_mask);
        const __m256d from_b = _mm256_permute4x64_pd(b.native(), perm_mask);
        return _mm256_blend_pd(from_a, from_b, bl_mask);
    }
};
#endif
226
227	template<unsigned s0, unsigned s1, class V>
228	V i_shuffle_emul_64x4_half(const V& a0, const V& a1, const V& b0, const V& b1)
229	{
230	const V& h0 = s0 < `2` ? a0 :
231	s0 < `4` ? a1 :
232	s0 < `6` ? b0 : b1;
233	const V& h1 = s1 < `2` ? a0 :
234	s1 < `4` ? a1 :
235	s1 < `6` ? b0 : b1;
236	return insn::i_shuffle2x2<s0%`2`, s1%`2`+`2`>(h0, h1);
237	}
238
239	// any (only on AVX)
240	template<> struct shuffle_impl<`10`> {
241	template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
242	static float64<`4`> run(const float64<`4`>& a, const float64<`4`>& b)
243	{
244	float64<`2`> a0, a1, b0, b1, r0, r1;
245	split(a, a0, a1);
246	split(b, b0, b1);
247	r0 = i_shuffle_emul_64x4_half<s0,s1>(a0, a1, b0, b1);
248	r1 = i_shuffle_emul_64x4_half<s2,s3>(a0, a1, b0, b1);
249	return combine(r0, r1);
250	}
251	};
252
253	template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N>
254	float64<N> do_shuffle(const float64<N>& a, const float64<N>& b)
255	{
256	return shuffle_impl<impl_selector<s0, s1, s2, s3>::impl>::run<s0, s1, s2, s3>(a, b);
257	}
258
259	} // namespace sse_shuffle4x2_float64
260	} // namespace detail
261	} // namespace SIMDPP_ARCH_NAMESPACE
262	} // namespace simdpp
263
264	#endif
265	#endif
266

Browse the source code of bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/detail/shuffle/sse_float64_4x2.h