neon_int32x4.h source code [bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/detail/shuffle/neon_int32x4.h]

/*  Copyright (C) 2011-2012  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/
7
8	#ifndef LIBSIMDPP_DETAIL_SHUFFLE_NEON_INT32x4_H
9	#define LIBSIMDPP_DETAIL_SHUFFLE_NEON_INT32x4_H
10	#if SIMDPP_USE_NEON
11
12	#include <type_traits>
13
14	namespace simdpp {
15	namespace SIMDPP_ARCH_NAMESPACE {
16	namespace detail {
17	namespace neon_shuffle_int32x4 {
18
19	/*
20	The code below implements generalized permutations of elements within
21	int32x4 vectors using various shuffling instructions available on NEON.
22	*/
23	using _0 = std::integral_constant<unsigned, `0`>;
24	using _1 = std::integral_constant<unsigned, `1`>;
25	using _2 = std::integral_constant<unsigned, `2`>;
26	using _3 = std::integral_constant<unsigned, `3`>;
27	using T = uint32x4; // full vector
28	using H = uint32x2_t; // half vector
29
30
31	/// Returns the lower/higher part of a vector. Cost: 0
32	SIMDPP_INL H lo(T a) { return vget_low_u32(a.native()); }
33	SIMDPP_INL H hi(T a) { return vget_high_u32(a.native()); }
34
35	/// Cost: 1
36	template<unsigned N> SIMDPP_INL
37	T bcast(T a)
38	{
39	H h = (N < `2`) ? lo(a) : hi(a);
40	return (uint32x4_t) vdupq_lane_u32(h, N % `2`);
41	}
42
43	/// Combines two half vectors. Cost: 0
44	SIMDPP_INL T co(H lo, H hi){ return vcombine_u32(lo, hi); }
45
46	/// Reverses the elements in half-vector or half-vectors in a vector. Cost: 1
47	SIMDPP_INL H rev(H a) { return vrev64_u32(a); }
48	SIMDPP_INL T rev(T a) { return vrev64q_u32(a.native()); }
49
50	/// Duplicates the lower/higher element in the half-vector. Cost: 1
51	SIMDPP_INL H dup_lo(H a) { return vdup_lane_u32(a, `0`); }
52	SIMDPP_INL H dup_hi(H a) { return vdup_lane_u32(a, `1`); }
53
54	/* Shuffles two half-vectors or one vector*
55	Cost: If s0,s1 == 0,3 or 2,1 then 2, otherwise 0-1.
56	*/
57	template<unsigned s0, unsigned s1> SIMDPP_INL
58	H shf2x2(H a, H b)
59	{
60	const unsigned sel = s0*`4` + s1;
61	switch (sel) {
62	default:
63	case `0`: /00/ return dup_lo(a);
64	case `1`: /01/ return a;
65	case `2`: /02/ return vzip_u32(a, b).val[`0`];
66	case `3`: /03/ return rev(vext_u32(b, a, `1`));
67	case `4`: /10/ return rev(a);
68	case `5`: /11/ return dup_hi(a);
69	case `6`: /12/ return vext_u32(a, b, `1`);
70	case `7`: /13/ return vzip_u32(a, b).val[`1`];
71	case `8`: /20/ return vzip_u32(b, a).val[`0`];
72	case `9`: /21/ return rev(vext_u32(a, b, `1`));
73	case `10`: /22/ return dup_lo(b);
74	case `11`: /23/ return b;
75	case `12`: /30/ return vext_u32(b, a, `1`);
76	case `13`: /31/ return vzip_u32(b, a).val[`1`];
77	case `14`: /32/ return rev(b);
78	case `15`: /33/ return dup_hi(b);
79	}
80	}
81
82	// The first element comes from a, the second from b.
83	template<unsigned s0, unsigned s1> SIMDPP_INL
84	H shf_1x2(H a, H b)
85	{
86	const unsigned sel = s0*`2` + s1;
87	switch (sel) {
88	default:
89	case `0`: /00/ return vzip_u32(a, b).val[`0`];;
90	case `1`: /01/ return rev(vext_u32(b, a, `1`));
91	case `2`: /10/ return vext_u32(a, b, `1`);
92	case `3`: /11/ return vzip_u32(a, b).val[`1`];
93	}
94	}
95
96	template<unsigned sel>
97	H sel_lo_hi(T a, T b)
98	{
99	switch (sel) {
100	default:
101	case `0`: return lo(a);
102	case `1`: return hi(a);
103	case `2`: return lo(b);
104	case `3`: return hi(b);
105	}
106	}
107
108	template<unsigned s0, unsigned s1> SIMDPP_INL
109	H shf4x2(T a, T b)
110	{
111	return shf_1x2<s0%`2`, s1%`2`>(sel_lo_hi<s0/`2`>(a, b),
112	sel_lo_hi<s1/`2`>(a, b));
113	}
114
115	// 4-element permutations
116	SIMDPP_INL T perm4(_0,_0,_0,_0, T a) { return bcast<`0`>(a); }
117	SIMDPP_INL T perm4(_1,_1,_1,_1, T a) { return bcast<`1`>(a); }
118	SIMDPP_INL T perm4(_2,_2,_2,_2, T a) { return bcast<`2`>(a); }
119	SIMDPP_INL T perm4(_3,_3,_3,_3, T a) { return bcast<`3`>(a); }
120	SIMDPP_INL T perm4(_1,_0,_3,_2, T a) { return rev(a); }
121
122	template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
123	T perm4(std::integral_constant<unsigned, s0>,
124	std::integral_constant<unsigned, s1>,
125	std::integral_constant<unsigned, s2>,
126	std::integral_constant<unsigned, s3>, const T& a)
127	{
128	return co(shf2x2<s0,s1>(lo(a), hi(a)), shf2x2<s2,s3>(lo(a), hi(a)));
129	}
130
131	template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
132	T permute4(T a)
133	{
134	return perm4(std::integral_constant<unsigned, s0>{},
135	std::integral_constant<unsigned, s1>{},
136	std::integral_constant<unsigned, s2>{},
137	std::integral_constant<unsigned, s3>{}, a);
138	}
139
140	// 2-element shuffle: the first two elements must come from a, the last two -
141	// from b
142	template<unsigned s0, unsigned s1> SIMDPP_INL
143	T shuffle2(T a, T b)
144	{
145	return co(shf2x2<s0,s1>(lo(a), hi(a)), shf2x2<s0,s1>(lo(b), hi(b)));
146	}
147
148	template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
149	T shuffle2(T a, T b)
150	{
151	return co(shf2x2<s0,s1>(lo(a), hi(a)), shf2x2<s2,s3>(lo(b), hi(b)));
152	}
153
154	// 4-element shuffle among two 2-element (sub) vectors
155	template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
156	T shuffle2x2(const T& a, const T& b)
157	{
158	return co(shf2x2<s0,s1>(lo(a), lo(b)), shf2x2<s2,s3>(hi(a), hi(b)));
159	}
160
161	// 8-element shuffle among two 4-element vectors
162	template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
163	T shuffle4x2(const T& a, const T& b)
164	{
165	return co(shf4x2<s0,s1>(a, b), shf4x2<s2,s3>(a, b));
166	}
167
168	} // namespace neon_shuffle_int32x4
169	} // namespace detail
170	} // namespace SIMDPP_ARCH_NAMESPACE
171	} // namespace simdpp
172
173	#endif
174	#endif
175
176

Browse the source code of bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/detail/shuffle/neon_int32x4.h