sse_int32_4x2.h source code [bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/detail/shuffle/sse_int32_4x2.h]

/*  Copyright (C) 2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/
7
8	#ifndef LIBSIMDPP_SIMDPP_DETAIL_PERMUTE_SSE_INT32_4x2_H
9	#define LIBSIMDPP_SIMDPP_DETAIL_PERMUTE_SSE_INT32_4x2_H
10
11	#ifndef LIBSIMDPP_SIMD_H
12	#error "This file must be included through simd.h"
13	#endif
14
15	#include <simdpp/setup_arch.h>
16	#include <simdpp/types.h>
17	#include <simdpp/core/blend.h>
18	#include <simdpp/detail/shuffle/shuffle_mask.h>
19	#include <simdpp/detail/shuffle/sse_float32_4x2.h>
20
21	#if SIMDPP_USE_SSE2
22
23	namespace simdpp {
24	namespace SIMDPP_ARCH_NAMESPACE {
25	namespace detail {
26	namespace sse_shuffle4x2_int32 {
27
/*  The code below implements generalized permutations over two element sets
    (two source vectors) within uint32 vectors.
*/
31
/*  Picks, at compile time, the shuffle_impl specialization that handles the
    selector quadruple (s0, s1, s2, s3).

    Selector values 0..3 refer to elements of the first source vector and
    4..7 to elements of the second one:

        a: 0 1 2 3
        b: 4 5 6 7

    The patterns are tested in order and the first match wins. `impl` is 7
    (the generic fallback) when no specialised pattern applies. The blend and
    align patterns are only considered when the corresponding instruction set
    (SSE4.1, SSSE3) is enabled.
*/
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
struct impl_selector {

    // 0 1 2 3
    // 4 5 6 7
    static const bool is1_zip_lo1 = (s0==0 && s1==4 && s2==1 && s3==5);
    static const bool is2_zip_lo2 = (s0==4 && s1==0 && s2==5 && s3==1);
    static const bool is3_zip_hi1 = (s0==2 && s1==6 && s2==3 && s3==7);
    static const bool is4_zip_hi2 = (s0==6 && s1==2 && s2==7 && s3==3);
#if SIMDPP_USE_SSE4_1
    // Every element stays in its own position and only the source varies.
    static const bool is5_blend = (s0==0 || s0==4) && (s1==1 || s1==5) &&
                                  (s2==2 || s2==6) && (s3==3 || s3==7);
#else
    static const bool is5_blend = false;
#endif
#if SIMDPP_USE_SSSE3
    // Four consecutive selectors: a contiguous window over a followed by b.
    static const bool is6_align = (s0==s1-1) && (s1==s2-1) && (s2==s3-1);
#else
    static const bool is6_align = false;
#endif

    static const int impl = is1_zip_lo1 ? 1 :
                            is2_zip_lo2 ? 2 :
                            is3_zip_hi1 ? 3 :
                            is4_zip_hi2 ? 4 :
                            is5_blend ? 5 :
                            is6_align ? 6 : 7;
};
60
// Primary template, intentionally empty: the strategy ids produced by
// impl_selector are served by the explicit specializations below, each of
// which supplies the static run() member.
template<unsigned N>
struct shuffle_impl {
};
62
63	// zip_lo1
64	template<> struct shuffle_impl<`1`> {
65	template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
66	static uint32<`4`> run(const uint32<`4`>& a, const uint32<`4`>& b)
67	{
68	return _mm_unpacklo_epi32(a.native(), b.native());
69	}
70	#if SIMDPP_USE_AVX2
71	template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
72	static uint32<`8`> run(const uint32<`8`>& a, const uint32<`8`>& b)
73	{
74	return _mm256_unpacklo_epi32(a.native(), b.native());
75	}
76	#endif
77	#if SIMDPP_USE_AVX512F
78	template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
79	static uint32<`16`> run(const uint32<`16`>& a, const uint32<`16`>& b)
80	{
81	return _mm512_unpacklo_epi32(a.native(), b.native());
82	}
83	#endif
84	};
85
86	// zip_lo2
87	template<> struct shuffle_impl<`2`> {
88	template<unsigned, unsigned, unsigned, unsigned, unsigned N> SIMDPP_INL
89	static uint32<N> run(const uint32<N>& a, const uint32<N>& b)
90	{
91	return shuffle_impl<`1`>::run<`0`,`0`,`0`,`0`>(b, a);
92	}
93	};
94
95	// zip_hi1
96	template<> struct shuffle_impl<`3`> {
97	template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
98	static uint32<`4`> run(const uint32<`4`>& a, const uint32<`4`>& b)
99	{
100	return _mm_unpackhi_epi32(a.native(), b.native());
101	}
102	#if SIMDPP_USE_AVX2
103	template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
104	static uint32<`8`> run(const uint32<`8`>& a, const uint32<`8`>& b)
105	{
106	return _mm256_unpackhi_epi32(a.native(), b.native());
107	}
108	#endif
109	#if SIMDPP_USE_AVX512F
110	template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
111	static uint32<`16`> run(const uint32<`16`>& a, const uint32<`16`>& b)
112	{
113	return _mm512_unpackhi_epi32(a.native(), b.native());
114	}
115	#endif
116	};
117
118	// zip_hi2
119	template<> struct shuffle_impl<`4`> {
120	template<unsigned, unsigned, unsigned, unsigned, unsigned N> SIMDPP_INL
121	static uint32<N> run(const uint32<N>& a, const uint32<N>& b)
122	{
123	return shuffle_impl<`3`>::run<`0`,`0`,`0`,`0`>(b, a);
124	}
125	};
126
127	// is5_blend
128	#if SIMDPP_USE_SSE4_1
129	template<> struct shuffle_impl<`5`> {
130	template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
131	static uint32<`4`> run(const uint32<`4`>& a, const uint32<`4`>& b)
132	{
133	const unsigned mask = (s0<`4` ? `0` : `0x03`) \| (s1<`4` ? `0` : `0x0c`) \|
134	(s2<`4` ? `0` : `0x30`) \| (s3<`4` ? `0` : `0xc0`);
135	return _mm_blend_epi16(a.native(), b.native(), mask);
136	}
137	#if SIMDPP_USE_AVX2
138	template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
139	static uint32<`8`> run(const uint32<`8`>& a, const uint32<`8`>& b)
140	{
141	const unsigned mask = (s0<`4` ? `0` : `1`) \| (s1<`4` ? `0` : `2`) \|
142	(s2<`4` ? `0` : `4`) \| (s3<`4` ? `0` : `8`);
143	return _mm256_blend_epi32(a.native(), b.native(), mask \| mask << `4`);
144	}
145	#endif
146	#if SIMDPP_USE_AVX512F
147	template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
148	static uint32<`16`> run(const uint32<`16`>& a, const uint32<`16`>& b)
149	{
150	const unsigned mask = (s0<`4` ? `0` : `1`) \| (s1<`4` ? `0` : `2`) \|
151	(s2<`4` ? `0` : `4`) \| (s3<`4` ? `0` : `8`);
152	const unsigned mask2 = mask \| mask << `4` \| mask << `8` \| mask << `12`;
153	return _mm512_mask_blend_epi32(mask2, a.native(), b.native());
154	}
155	#endif
156	};
157	#endif
158
159	#if SIMDPP_USE_SSSE3
160	template<> struct shuffle_impl<`6`> {
161	template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
162	static uint32<`4`> run(const uint32<`4`>& a, const uint32<`4`>& b)
163	{
164	return _mm_alignr_epi8(b.native(), a.native(), s0*`4`);
165	}
166	#if SIMDPP_USE_AVX2
167	template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
168	static uint32<`8`> run(const uint32<`8`>& a, const uint32<`8`>& b)
169	{
170	return _mm256_alignr_epi8(b.native(), a.native(), s0*`4`);
171	}
172	#endif
173	#if SIMDPP_USE_AVX512F
174	template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
175	static uint32<`16`> run(const uint32<`16`>& a, const uint32<`16`>& b)
176	{
177	// note that _mm512_alignr_epi32 operates on entire vector
178	__m512i ap = _mm512_alignr_epi32(a.native(), a.native(), s0);
179	const int mask = SIMDPP_SHUFFLE_MASK_4x2_4(s0>`3`, s0>`2`, s0>`1`, s0>`0`);
180	return _mm512_mask_alignr_epi32(ap, mask, b.native(), b.native(), (s0+`12`)%`16`);
181	}
182	#endif
183	};
184	#endif
185
186	template<> struct shuffle_impl<`7`> {
187	template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
188	static uint32<`4`> run(const uint32<`4`>& a, const uint32<`4`>& b)
189	{
190	#if SIMDPP_USE_AVX2
191	const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0%`4`, s1%`4`, s2%`4`, s3%`4`);
192	__m128i pa = _mm_shuffle_epi32(a.native(), mask);
193	__m128i pb = _mm_shuffle_epi32(b.native(), mask);
194	return _mm_blend_epi32(pa, pb, SIMDPP_SHUFFLE_MASK_4x2(s0/`4`,s1/`4`,s2/`4`,s3/`4`));
195	#elif SIMDPP_USE_SSE4_1
196	const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0%`4`, s1%`4`, s2%`4`, s3%`4`);
197	__m128i pa = _mm_shuffle_epi32(a.native(), mask);
198	__m128i pb = _mm_shuffle_epi32(b.native(), mask);
199	return _mm_blend_epi16(pa, pb, SIMDPP_SHUFFLE_MASK_4x4(s0/`4``0x3`,s1/`4``0x3`,s2/`4``0x3`,s3/`4``0x3`));
200	#else
201	__m128 na = _mm_castsi128_ps(a.native());
202	__m128 nb = _mm_castsi128_ps(b.native());
203	__m128 ab1 = _mm_shuffle_ps(na, nb, _MM_SHUFFLE(s1%`4`, s0%`4`, s1%`4`, s0%`4`));
204	__m128 ab2 = _mm_shuffle_ps(na, nb, _MM_SHUFFLE(s3%`4`, s2%`4`, s3%`4`, s2%`4`));
205	float32<`4`> r = _mm_shuffle_ps(ab1, ab2, _MM_SHUFFLE(s3/`4`?`3`:`1`, s2/`4`?`2`:`0`, s1/`4`?`3`:`1`, s0/`4`?`2`:`0`));
206	return uint32<`4`>(r);
207	#endif
208	}
209	#if SIMDPP_USE_AVX2
210	template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
211	static uint32<`8`> run(const uint32<`8`>& a, const uint32<`8`>& b)
212	{
213	const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0%`4`, s1%`4`, s2%`4`, s3%`4`);
214	__m256i pa = _mm256_shuffle_epi32(a.native(), mask);
215	__m256i pb = _mm256_shuffle_epi32(b.native(), mask);
216	return _mm256_blend_epi32(pa, pb, SIMDPP_SHUFFLE_MASK_4x2_2(s0/`4`,s1/`4`,s2/`4`,s3/`4`));
217	}
218	#endif
219	#if SIMDPP_USE_AVX512F
220	template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
221	static uint32<`16`> run(const uint32<`16`>& a, const uint32<`16`>& b)
222	{
223	const unsigned shuf_mask = SIMDPP_SHUFFLE_MASK_4x4(s0%`4`,s1%`4`,s2%`4`,s3%`4`);
224	__m512i ap = _mm512_shuffle_epi32(a.native(), _MM_PERM_ENUM(shuf_mask));
225	const int mask = SIMDPP_SHUFFLE_MASK_4x2_4(s0/`4`,s1/`4`,s2/`4`,s3/`4`);
226	return _mm512_mask_shuffle_epi32(ap, mask, b.native(),
227	_MM_PERM_ENUM(shuf_mask));
228	}
229	#endif
230	};
231
232	template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N>
233	uint32<N> do_shuffle(const uint32<N>& a, const uint32<N>& b)
234	{
235	return shuffle_impl<impl_selector<s0, s1, s2, s3>::impl>::template run<s0, s1, s2, s3>(a, b);
236	}
237
} // namespace sse_shuffle4x2_int32
239	} // namespace detail
240	} // namespace SIMDPP_ARCH_NAMESPACE
241	} // namespace simdpp
242
243	#endif
244	#endif
245

Browse the source code of bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/detail/shuffle/sse_int32_4x2.h