sse_int64_4x2.h source code [bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/detail/shuffle/sse_int64_4x2.h]

1	/*  Copyright (C) 2014  Povilas Kanapickas <povilas@radix.lt>
2
3	Distributed under the Boost Software License, Version 1.0.
4	(See accompanying file LICENSE_1_0.txt or copy at
5	http://www.boost.org/LICENSE_1_0.txt)
6	*/
7
8	#ifndef LIBSIMDPP_SIMDPP_DETAIL_PERMUTE_SSE_INT64_4x2_H
9	#define LIBSIMDPP_SIMDPP_DETAIL_PERMUTE_SSE_INT64_4x2_H
10
11	#ifndef LIBSIMDPP_SIMD_H
12	#error "This file must be included through simd.h"
13	#endif
14
15	#include <simdpp/setup_arch.h>
16	#include <simdpp/types.h>
17	#include <simdpp/core/combine.h>
18	#include <simdpp/core/split.h>
19	#include <simdpp/detail/insn/shuffle2x2.h>
20	#include <simdpp/detail/shuffle/shuffle_mask.h>
21
22	#if SIMDPP_USE_AVX2
23
24	namespace simdpp {
25	namespace SIMDPP_ARCH_NAMESPACE {
26	namespace detail {
27	namespace sse_shuffle4x2_int64 {
28
29	/*  The code below implements generalized permutations for 4-element sets
30	    within uint64 vectors.
31	*/
32
/*  Picks, at compile time, which shuffle_impl<N> specialization handles the
    selector tuple (s0, s1, s2, s3).

    Selectors 0..3 refer to elements of the first vector (a), selectors 4..7
    to elements of the second vector (b). The patterns are tested in order and
    the first match wins; `impl` holds the number of the chosen implementation
    (9 is the generic fallback when nothing more specific matches).
*/
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
struct impl_selector {

    // a: 0 1 2 3
    // b: 4 5 6 7
    static const bool is1_zip_lo1 = (s0==0 && s1==4 && s2==2 && s3==6);
    static const bool is2_zip_lo2 = (s0==4 && s1==0 && s2==6 && s3==2);
    static const bool is3_zip_hi1 = (s0==1 && s1==5 && s2==3 && s3==7);
    static const bool is4_zip_hi2 = (s0==5 && s1==1 && s2==7 && s3==3);
    // every element stays in its own position and only the source differs
    static const bool is5_blend = (s0==0 || s0==4) && (s1==1 || s1==5) &&
                                  (s2==2 || s2==6) && (s3==3 || s3==7);
#if SIMDPP_USE_AVX512F
    // with AVX512F everything that is not a zip or a blend is handled by a
    // single two-source variable permute
    static const bool is6_swap1 = false;
    static const bool is7_swap2 = false;
    static const bool is8_permx2var = true;
#else
    static const bool is6_swap1 = (s0==1 && s1==4 && s2==3 && s3==6);
    // swap2 is swap1 with the roles of a and b exchanged, i.e. (5,0,7,2),
    // which is the result shuffle_impl<7> computes. The previous condition
    // (4,1,6,3) is a pure blend that is5_blend always claims first, so
    // implementation 7 could never be selected.
    static const bool is7_swap2 = (s0==5 && s1==0 && s2==7 && s3==2);
    static const bool is8_permx2var = false;
#endif
    static const int impl = is1_zip_lo1 ? 1 :
                            is2_zip_lo2 ? 2 :
                            is3_zip_hi1 ? 3 :
                            is4_zip_hi2 ? 4 :
                            is5_blend ? 5 :
                            is6_swap1 ? 6 :
                            is7_swap2 ? 7 :
                            is8_permx2var ? 8 : 9;
};
62
// Primary template: intentionally empty. Each supported implementation number
// is provided as an explicit specialization that supplies a static run().
template<unsigned N>
struct shuffle_impl {
};
64
65	// zip_lo1
66	template<> struct shuffle_impl<`1`> {
67	template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
68	static uint64<`4`> run(const uint64<`4`>& a, const uint64<`4`>& b)
69	{
70	return _mm256_unpacklo_epi64(a.native(), b.native());
71	}
72	#if SIMDPP_USE_AVX512F
73	template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
74	static uint64<`8`> run(const uint64<`8`>& a, const uint64<`8`>& b)
75	{
76	return _mm512_unpacklo_epi64(a.native(), b.native());
77	}
78	#endif
79	};
80
81	// zip_lo2
82	template<> struct shuffle_impl<`2`> {
83	template<unsigned, unsigned, unsigned, unsigned, unsigned N> SIMDPP_INL
84	static uint64<N> run(const uint64<N>& a, const uint64<N>& b)
85	{
86	return shuffle_impl<`1`>::run<`0`,`0`,`0`,`0`>(b, a);
87	}
88	};
89
90	// zip_hi1
91	template<> struct shuffle_impl<`3`> {
92	template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
93	static uint64<`4`> run(const uint64<`4`>& a, const uint64<`4`>& b)
94	{
95	return _mm256_unpackhi_epi64(a.native(), b.native());
96	}
97	#if SIMDPP_USE_AVX512F
98	template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
99	static uint64<`8`> run(const uint64<`8`>& a, const uint64<`8`>& b)
100	{
101	return _mm512_unpackhi_epi64(a.native(), b.native());
102	}
103	#endif
104	};
105
106	// zip_hi2
107	template<> struct shuffle_impl<`4`> {
108	template<unsigned, unsigned, unsigned, unsigned, unsigned N> SIMDPP_INL
109	static uint64<N> run(const uint64<N>& a, const uint64<N>& b)
110	{
111	return shuffle_impl<`3`>::run<`0`,`0`,`0`,`0`>(b, a);
112	}
113	};
114
115	// is5_blend
116	template<> struct shuffle_impl<`5`> {
117	template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
118	static uint64<`4`> run(const uint64<`4`>& a, const uint64<`4`>& b)
119	{
120	const unsigned mask = (s0<`4` ? `0` : `0x03`) \| (s1<`4` ? `0` : `0x0c`) \|
121	(s2<`4` ? `0` : `0x30`) \| (s3<`4` ? `0` : `0xc0`);
122	return _mm256_blend_epi32(a.native(), b.native(), mask);
123	}
124	#if SIMDPP_USE_AVX512F
125	template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
126	static uint64<`8`> run(const uint64<`8`>& a, const uint64<`8`>& b)
127	{
128	const unsigned mask = (s0<`4` ? `0` : `1`) \| (s1<`4` ? `0` : `2`) \|
129	(s2<`4` ? `0` : `4`) \| (s3<`4` ? `0` : `8`);
130	const unsigned mask2 = mask \| mask << `4`;
131	return _mm512_mask_blend_epi64(mask2, a.native(), b.native());
132	}
133	#endif
134	};
135
136	// is6_swap1 (only AVX2)
137	template<> struct shuffle_impl<`6`> {
138	template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
139	static uint64<`4`> run(const uint64<`4`>& a, const uint64<`4`>& b)
140	{
141	__m256i n = _mm256_blend_epi32(a.native(), b.native(), `0x33`);
142	return _mm256_permute4x64_epi64(n, _MM_SHUFFLE(`2`,`3`,`0`,`1`));
143	}
144	};
145
146	// is7_swap2 (only AVX2)
147	template<> struct shuffle_impl<`7`> {
148	template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
149	static uint64<`4`> run(const uint64<`4`>& a, const uint64<`4`>& b)
150	{
151	__m256i n = _mm256_blend_epi32(a.native(), b.native(), `0xcc`);
152	return _mm256_permute4x64_epi64(n, _MM_SHUFFLE(`2`,`3`,`0`,`1`));
153	}
154	};
155
// is8_permx2var (AVX512F only): arbitrary selectors handled by one two-source
// variable permute. The index vector is built from the compile-time selectors
// as bytes and widened to 64-bit elements.
#if SIMDPP_USE_AVX512F
template<> struct shuffle_impl<8> {

    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static uint64<4> run(const uint64<4>& a, const uint64<4>& b)
    {
#if SIMDPP_USE_AVX512VL
        // 256-bit permute available: the selectors are used as indices
        // unchanged.
        uint8<16> idx_bytes = make_uint(s0, s1, s2, s3);
        __m256i idx = _mm256_cvtepi8_epi64(idx_bytes.native());
        return _mm256_permutex2var_epi64(a.native(), idx, b.native());
#else
        // No 256-bit form: run the 512-bit permute on widened operands and
        // keep the low half. Selectors that refer to b (>= 4) are moved up by
        // four before being used as indices.
        const unsigned i0 = s0 + (s0 < 4 ? 0 : 4);
        const unsigned i1 = s1 + (s1 < 4 ? 0 : 4);
        const unsigned i2 = s2 + (s2 < 4 ? 0 : 4);
        const unsigned i3 = s3 + (s3 < 4 ? 0 : 4);
        uint8<16> idx_bytes = make_uint(i0, i1, i2, i3);
        __m512i idx = _mm512_cvtepi8_epi64(idx_bytes.native());
        __m512i wide_a = _mm512_castsi256_si512(a.native());
        __m512i wide_b = _mm512_castsi256_si512(b.native());
        __m512i permuted = _mm512_permutex2var_epi64(wide_a, idx, wide_b);
        return _mm512_castsi512_si256(permuted);
#endif
    }

    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static uint64<8> run(const uint64<8>& a, const uint64<8>& b)
    {
        // Selectors that refer to b (>= 4) are moved up by four; the indices
        // for the upper four elements are the lower ones plus four.
        const unsigned i0 = s0 + (s0 < 4 ? 0 : 4);
        const unsigned i1 = s1 + (s1 < 4 ? 0 : 4);
        const unsigned i2 = s2 + (s2 < 4 ? 0 : 4);
        const unsigned i3 = s3 + (s3 < 4 ? 0 : 4);
        uint8<16> idx_bytes = make_uint(i0, i1, i2, i3,
                                        i0+4, i1+4, i2+4, i3+4);
        __m512i idx = _mm512_cvtepi8_epi64(idx_bytes.native());
        return _mm512_permutex2var_epi64(a.native(), idx, b.native());
    }
};
#endif
195
196	// any (only on AVX2)
197	template<> struct shuffle_impl<`9`> {
198	template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
199	static uint64<`4`> run(const uint64<`4`>& a, const uint64<`4`>& b)
200	{
201	const unsigned shuf_mask = SIMDPP_SHUFFLE_MASK_4x4(s0%`4`, s1%`4`, s2%`4`, s3%`4`);
202	const unsigned bl_mask = (s0<`4` ? `0` : `0x03`) \| (s1<`4` ? `0` : `0x0c`) \|
203	(s2<`4` ? `0` : `0x30`) \| (s3<`4` ? `0` : `0xc0`);
204	__m256i ta = _mm256_permute4x64_epi64(a.native(), shuf_mask);
205	__m256i tb = _mm256_permute4x64_epi64(b.native(), shuf_mask);
206	return _mm256_blend_epi32(ta, tb, bl_mask);
207	}
208	};
209
210
211	template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N>
212	uint64<N> do_shuffle(const uint64<N>& a, const uint64<N>& b)
213	{
214	const unsigned selector = impl_selector<s0, s1, s2, s3>::impl;
215	return shuffle_impl<selector>::template run<s0, s1, s2, s3>(a, b);
216	}
217
218	} // namespace sse_shuffle4x2_int64
219	} // namespace detail
220	} // namespace SIMDPP_ARCH_NAMESPACE
221	} // namespace simdpp
222
223	#endif
224	#endif
225

Browse the source code of bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/detail/shuffle/sse_int64_4x2.h