1 | /* Copyright (C) 2011-2012 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_DETAIL_SHUFFLE_NEON_INT32x4_H |
9 | #define LIBSIMDPP_DETAIL_SHUFFLE_NEON_INT32x4_H |
10 | #if SIMDPP_USE_NEON |
11 | |
12 | #include <type_traits> |
13 | |
14 | namespace simdpp { |
15 | namespace SIMDPP_ARCH_NAMESPACE { |
16 | namespace detail { |
17 | namespace neon_shuffle_int32x4 { |
18 | |
19 | /* |
20 | The code below implements generalized permutations of elements within |
21 | int32x4 vectors using various shuffling instructions available on NEON. |
22 | */ |
// Tag types that carry a selector value (0-3) in the type system. They are
// used to pick perm4() overloads for specific selector combinations.
using _0 = std::integral_constant<unsigned, 0>;
using _1 = std::integral_constant<unsigned, 1>;
using _2 = std::integral_constant<unsigned, 2>;
using _3 = std::integral_constant<unsigned, 3>;
// Shorthand for the vector types handled by the helpers in this namespace.
using T = uint32x4; // full vector (four 32-bit elements)
using H = uint32x2_t; // half vector (two 32-bit elements), native NEON type
29 | |
30 | |
31 | /// Returns the lower/higher part of a vector. Cost: 0 |
32 | SIMDPP_INL H lo(T a) { return vget_low_u32(a.native()); } |
33 | SIMDPP_INL H hi(T a) { return vget_high_u32(a.native()); } |
34 | |
35 | /// Cost: 1 |
36 | template<unsigned N> SIMDPP_INL |
37 | T bcast(T a) |
38 | { |
39 | H h = (N < 2) ? lo(a) : hi(a); |
40 | return (uint32x4_t) vdupq_lane_u32(h, N % 2); |
41 | } |
42 | |
43 | /// Combines two half vectors. Cost: 0 |
44 | SIMDPP_INL T co(H lo, H hi){ return vcombine_u32(lo, hi); } |
45 | |
46 | /// Reverses the elements in half-vector or half-vectors in a vector. Cost: 1 |
47 | SIMDPP_INL H rev(H a) { return vrev64_u32(a); } |
48 | SIMDPP_INL T rev(T a) { return vrev64q_u32(a.native()); } |
49 | |
50 | /// Duplicates the lower/higher element in the half-vector. Cost: 1 |
51 | SIMDPP_INL H dup_lo(H a) { return vdup_lane_u32(a, 0); } |
52 | SIMDPP_INL H dup_hi(H a) { return vdup_lane_u32(a, 1); } |
53 | |
54 | /** Shuffles two half-vectors or one vector |
55 | Cost: If s0,s1 == 0,3 or 2,1 then 2, otherwise 0-1. |
56 | */ |
57 | template<unsigned s0, unsigned s1> SIMDPP_INL |
58 | H shf2x2(H a, H b) |
59 | { |
60 | const unsigned sel = s0*4 + s1; |
61 | switch (sel) { |
62 | default: |
63 | case 0: /*00*/ return dup_lo(a); |
64 | case 1: /*01*/ return a; |
65 | case 2: /*02*/ return vzip_u32(a, b).val[0]; |
66 | case 3: /*03*/ return rev(vext_u32(b, a, 1)); |
67 | case 4: /*10*/ return rev(a); |
68 | case 5: /*11*/ return dup_hi(a); |
69 | case 6: /*12*/ return vext_u32(a, b, 1); |
70 | case 7: /*13*/ return vzip_u32(a, b).val[1]; |
71 | case 8: /*20*/ return vzip_u32(b, a).val[0]; |
72 | case 9: /*21*/ return rev(vext_u32(a, b, 1)); |
73 | case 10: /*22*/ return dup_lo(b); |
74 | case 11: /*23*/ return b; |
75 | case 12: /*30*/ return vext_u32(b, a, 1); |
76 | case 13: /*31*/ return vzip_u32(b, a).val[1]; |
77 | case 14: /*32*/ return rev(b); |
78 | case 15: /*33*/ return dup_hi(b); |
79 | } |
80 | } |
81 | |
82 | // The first element comes from a, the second from b. |
83 | template<unsigned s0, unsigned s1> SIMDPP_INL |
84 | H shf_1x2(H a, H b) |
85 | { |
86 | const unsigned sel = s0*2 + s1; |
87 | switch (sel) { |
88 | default: |
89 | case 0: /*00*/ return vzip_u32(a, b).val[0];; |
90 | case 1: /*01*/ return rev(vext_u32(b, a, 1)); |
91 | case 2: /*10*/ return vext_u32(a, b, 1); |
92 | case 3: /*11*/ return vzip_u32(a, b).val[1]; |
93 | } |
94 | } |
95 | |
96 | template<unsigned sel> |
97 | H sel_lo_hi(T a, T b) |
98 | { |
99 | switch (sel) { |
100 | default: |
101 | case 0: return lo(a); |
102 | case 1: return hi(a); |
103 | case 2: return lo(b); |
104 | case 3: return hi(b); |
105 | } |
106 | } |
107 | |
108 | template<unsigned s0, unsigned s1> SIMDPP_INL |
109 | H shf4x2(T a, T b) |
110 | { |
111 | return shf_1x2<s0%2, s1%2>(sel_lo_hi<s0/2>(a, b), |
112 | sel_lo_hi<s1/2>(a, b)); |
113 | } |
114 | |
115 | // 4-element permutations |
116 | SIMDPP_INL T perm4(_0,_0,_0,_0, T a) { return bcast<0>(a); } |
117 | SIMDPP_INL T perm4(_1,_1,_1,_1, T a) { return bcast<1>(a); } |
118 | SIMDPP_INL T perm4(_2,_2,_2,_2, T a) { return bcast<2>(a); } |
119 | SIMDPP_INL T perm4(_3,_3,_3,_3, T a) { return bcast<3>(a); } |
120 | SIMDPP_INL T perm4(_1,_0,_3,_2, T a) { return rev(a); } |
121 | |
122 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
123 | T perm4(std::integral_constant<unsigned, s0>, |
124 | std::integral_constant<unsigned, s1>, |
125 | std::integral_constant<unsigned, s2>, |
126 | std::integral_constant<unsigned, s3>, const T& a) |
127 | { |
128 | return co(shf2x2<s0,s1>(lo(a), hi(a)), shf2x2<s2,s3>(lo(a), hi(a))); |
129 | } |
130 | |
131 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
132 | T permute4(T a) |
133 | { |
134 | return perm4(std::integral_constant<unsigned, s0>{}, |
135 | std::integral_constant<unsigned, s1>{}, |
136 | std::integral_constant<unsigned, s2>{}, |
137 | std::integral_constant<unsigned, s3>{}, a); |
138 | } |
139 | |
140 | // 2-element shuffle: the first two elements must come from a, the last two - |
141 | // from b |
142 | template<unsigned s0, unsigned s1> SIMDPP_INL |
143 | T shuffle2(T a, T b) |
144 | { |
145 | return co(shf2x2<s0,s1>(lo(a), hi(a)), shf2x2<s0,s1>(lo(b), hi(b))); |
146 | } |
147 | |
148 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
149 | T shuffle2(T a, T b) |
150 | { |
151 | return co(shf2x2<s0,s1>(lo(a), hi(a)), shf2x2<s2,s3>(lo(b), hi(b))); |
152 | } |
153 | |
154 | // 4-element shuffle among two 2-element (sub) vectors |
155 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
156 | T shuffle2x2(const T& a, const T& b) |
157 | { |
158 | return co(shf2x2<s0,s1>(lo(a), lo(b)), shf2x2<s2,s3>(hi(a), hi(b))); |
159 | } |
160 | |
161 | // 8-element shuffle among two 4-element vectors |
162 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
163 | T shuffle4x2(const T& a, const T& b) |
164 | { |
165 | return co(shf4x2<s0,s1>(a, b), shf4x2<s2,s3>(a, b)); |
166 | } |
167 | |
168 | } // namespace neon_shuffle_int32x4 |
169 | } // namespace detail |
170 | } // namespace SIMDPP_ARCH_NAMESPACE |
171 | } // namespace simdpp |
172 | |
173 | #endif |
174 | #endif |
175 | |
176 | |