1 | /* Copyright (C) 2011-2012 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_DETAIL_SHUFFLE_NEON_INT16x8_H |
9 | #define LIBSIMDPP_DETAIL_SHUFFLE_NEON_INT16x8_H |
10 | #if SIMDPP_USE_NEON |
11 | |
12 | #include <simdpp/core/make_shuffle_bytes_mask.h> |
13 | #include <simdpp/core/permute_bytes16.h> |
14 | #include <type_traits> |
15 | |
16 | namespace simdpp { |
17 | namespace SIMDPP_ARCH_NAMESPACE { |
18 | namespace detail { |
19 | namespace neon_shuffle_int16x8 { |
20 | |
21 | /* |
22 | The code below implements generalized permutations for 4 elements sets |
23 | within int16x8 vectors using various shuffling instructions available on |
24 | NEON. If no straightforward permutation is available, TBL instruction is |
25 | used. |
26 | |
27 | Note: the compiler should optimize all masks into one VMOV #imm instruction |
28 | */ |
29 | |
// Compile-time tag types used to select the perm4 overload matching a given
// selector combination; each _N stands for source element index N.
using _0 = std::integral_constant<unsigned, 0>;
using _1 = std::integral_constant<unsigned, 1>;
using _2 = std::integral_constant<unsigned, 2>;
using _3 = std::integral_constant<unsigned, 3>;
using T = uint16x8; // full vector
using H = uint16x4_t; // half vector
36 | |
37 | /// Cost: 2 |
38 | template<unsigned n> SIMDPP_INL |
39 | T bcast(T a) |
40 | { |
41 | H h1 = vget_low_u16(a.native()); |
42 | H h2 = vget_high_u16(a.native()); |
43 | h1 = vdup_lane_u16(h1, n); |
44 | h2 = vdup_lane_u16(h2, n); |
45 | return vcombine_u16(h1, h2); |
46 | } |
47 | |
48 | /** Selects the elements from two vectors according to selectors. 0 selects |
49 | the value from the first vector, 1 selects from the second |
50 | Cost: 2 |
51 | */ |
52 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
53 | T sel(T a, T b) |
54 | { |
55 | const uint64_t um = |
56 | (s0 > 0 ? 0xffffLL : 0) | |
57 | (s1 > 0 ? 0xffffLL << 16 : 0) | |
58 | (s2 > 0 ? 0xffffLL << 32 : 0) | |
59 | (s3 > 0 ? 0xffffLL << 48 : 0); |
60 | uint16x8_t mask = vreinterpretq_u16_u64(vmovq_n_u64(um)); |
61 | |
62 | return vbslq_u16(mask, b.native(), a.native()); |
63 | } |
64 | |
/** Within each 4-element set moves the elements to the left or right. The
    shifted-in values are undefined.
    Cost: 1
*/
template<unsigned shift> SIMDPP_INL
T mov_r(T a)
{
    // A 64-bit left bit-shift moves 16-bit lanes towards higher lane indices
    // within each 4-element set, zeroing the vacated lanes.
    // NOTE(review): shift must be in [0,3] so that shift*16 stays a valid
    // immediate for vshlq_n_u64; callers in this file only use shift == 1.
    return vreinterpretq_u16_u64(vshlq_n_u64(vreinterpretq_u64_u16(a.native()), shift*16));
}
74 | |
// Mirror of mov_r: a 64-bit right bit-shift moves 16-bit lanes towards lower
// lane indices within each 4-element set, zeroing the vacated lanes.
// NOTE(review): vshrq_n_u64 requires a nonzero immediate, so shift must be in
// [1,3]; callers in this file only use shift == 1.
template<unsigned shift> SIMDPP_INL
T mov_l(T a)
{
    return vreinterpretq_u16_u64(vshrq_n_u64(vreinterpretq_u64_u16(a.native()), shift*16));
}
80 | |
/// Within each 4-element set: r0 = a3; r1 = a2; r2 = a1; r3 = a0; - 3210
/// Cost: 1
SIMDPP_INL T rev41(T a)
{
    // vrev64q_u16 reverses the order of the 16-bit lanes inside each 64-bit
    // half, which is exactly a full reversal of each 4-element set.
    return vrev64q_u16(a.native());
}
87 | |
/// Within each 4-element set: r0 = a2; r1 = a3; r2 = a0; r3 = a1; - 2301
/// Cost: 1
SIMDPP_INL T rev42(T a)
{
    // Viewing each half as two 32-bit values and swapping them (vrev64q_u32)
    // exchanges the element pairs (0,1) and (2,3) without reordering within
    // a pair.
    return vreinterpretq_u16_u32(vrev64q_u32(vreinterpretq_u32_u16(a.native())));
}
94 | |
/// Within each 4-element set: r0 = a1; r1 = a0; r2 = a3; r3 = a2; - 1032
/// Cost: 1
SIMDPP_INL T rev21(T a)
{
    // vrev32q_u16 swaps the two 16-bit lanes inside every 32-bit word, i.e.
    // swaps adjacent element pairs.
    return vrev32q_u16(a.native());
}
101 | |
102 | /// Within each 4-element set: r0 = a0; r1 = a0; r2 = a2; r3 = a2; - 0022 |
103 | /// Cost: 2 |
104 | SIMDPP_INL T dup_lo(T a) |
105 | { |
106 | T b = a; |
107 | return vtrnq_u16(a.native(), b.native()).val[0]; |
108 | } |
109 | |
110 | /// Within each 4-element set: r0 = a1; r1 = a1; r2 = a3; r3 = a3; - 1133 |
111 | /// Cost: 2 |
112 | SIMDPP_INL T dup_hi(T a) |
113 | { |
114 | T b = a; |
115 | return vtrnq_u16(a.native(), b.native()).val[1]; |
116 | } |
117 | |
118 | /// Within each 4-element set: r0 = a0; r1 = a1; r2 = a0; r3 = a1; - 0101 |
119 | /// Cost: 2 |
120 | SIMDPP_INL T dup2_lo(T a) |
121 | { |
122 | uint32x4_t i; |
123 | uint32x2_t lo, hi; |
124 | i = vreinterpretq_u32_u16(a.native()); |
125 | lo = vget_low_u32(i); |
126 | hi = vget_high_u32(i); |
127 | lo = vdup_lane_u32(lo, 0); |
128 | hi = vdup_lane_u32(hi, 0); |
129 | return vreinterpretq_u16_u32(vcombine_u32(lo, hi)); |
130 | } |
131 | |
132 | /// Within each 4-element set: r0 = a2; r1 = a3; r2 = a2; r3 = a3; - 2323 |
133 | /// Cost: 2 |
134 | SIMDPP_INL T dup2_hi(T a) |
135 | { |
136 | uint32x4_t i; |
137 | uint32x2_t lo, hi; |
138 | i = vreinterpretq_u32_u16(a.native()); |
139 | lo = vget_low_u32(i); |
140 | hi = vget_high_u32(i); |
141 | lo = vdup_lane_u32(lo, 1); |
142 | hi = vdup_lane_u32(hi, 1); |
143 | return vreinterpretq_u16_u32(vcombine_u32(lo, hi)); |
144 | } |
145 | |
146 | /// Within each 4-element set: r0 = a0; r1 = a0; r2 = a1; r3 = a1; - 0011 |
147 | /// Cost: 3 |
148 | SIMDPP_INL T dup_unpack_lo(T a) |
149 | { |
150 | H lo = vget_low_u16(a.native()); |
151 | H hi = vget_high_u16(a.native()); |
152 | H rlo = vzip_u16(lo, lo).val[0]; |
153 | H rhi = vzip_u16(hi, hi).val[0]; |
154 | return vcombine_u16(rlo, rhi); |
155 | } |
156 | |
157 | /// Within each 4-element set: r0 = a2; r1 = a2; r2 = a3; r3 = a3; - 2233 |
158 | /// Cost: 3 |
159 | SIMDPP_INL T dup_unpack_hi(T a) |
160 | { |
161 | H lo = vget_low_u16(a.native()); |
162 | H hi = vget_high_u16(a.native()); |
163 | H rlo = vzip_u16(lo, lo).val[1]; |
164 | H rhi = vzip_u16(hi, hi).val[1]; |
165 | return vcombine_u16(rlo, rhi); |
166 | } |
167 | |
168 | /// Swaps the n1-th element with the element in n2-th position. The values of |
169 | /// other elements are undefined. Cost: 1 |
170 | template<unsigned n1, unsigned n2> SIMDPP_INL |
171 | T swap1(T a) |
172 | { |
173 | const unsigned sel = n1*4 + n2; |
174 | switch (sel) { |
175 | case 0: //00 |
176 | case 5: //11 |
177 | case 10: //22 |
178 | case 15: //33 |
179 | return a; |
180 | case 1: //01 |
181 | case 4: //10 |
182 | case 11: //23 |
183 | case 14: //32 |
184 | return rev21(a); |
185 | case 2: //02 |
186 | case 8: //20 |
187 | case 7: //13 |
188 | case 13: //31 |
189 | return rev42(a); |
190 | case 3: //03 |
191 | case 12: //30 |
192 | case 6: //12 |
193 | case 9: //21 |
194 | return rev41(a); |
195 | } |
196 | return a; |
197 | } |
198 | |
199 | /// Cost: 3 (includes one 16-byte load) |
200 | template<unsigned N0, unsigned N1, unsigned N2, unsigned N3> SIMDPP_INL |
201 | T fallback(T a) |
202 | { |
203 | T mask; |
204 | mask = make_shuffle_bytes16_mask<N0,N1,N2,N3>(mask); |
205 | return permute_bytes16(a, mask); |
206 | } |
207 | |
/* perm4 overload set: one overload per selector combination (s0,s1,s2,s3),
   dispatched via the integral-constant tag types _0.._3. Each overload
   implements its 4-element permutation with a hand-picked cheap instruction
   sequence built from the helpers above. Combinations that are commented out
   have no known sequence cheaper than the generic TBL fallback and fall
   through to the catch-all template overload below. Do not reorder or edit
   individual lines without re-verifying the permutation they implement. */
static SIMDPP_INL
T perm4(_0,_0,_0,_0, T a) { return bcast<0>(a); }
SIMDPP_INL T perm4(_0,_0,_0,_1, T a) { return sel<0,0,0,1>(bcast<0>(a), swap1<1,3>(a)); }
SIMDPP_INL T perm4(_0,_0,_0,_2, T a) { return sel<0,0,0,1>(bcast<0>(a), swap1<2,3>(a)); }
SIMDPP_INL T perm4(_0,_0,_0,_3, T a) { return sel<1,1,1,0>(a, bcast<0>(a)); }
SIMDPP_INL T perm4(_0,_0,_1,_0, T a) { return sel<0,0,1,0>(bcast<0>(a), swap1<1,2>(a)); }
SIMDPP_INL T perm4(_0,_0,_1,_1, T a) { return dup_unpack_lo(a); }
SIMDPP_INL T perm4(_0,_0,_1,_2, T a) { return sel<0,1,1,1>(a, mov_r<1>(a)); }
SIMDPP_INL T perm4(_0,_0,_1,_3, T a) { return sel<0,1,1,0>(a, mov_r<1>(a)); }
SIMDPP_INL T perm4(_0,_0,_2,_0, T a) { return sel<1,1,0,1>(a, bcast<0>(a));}
//SIMDPP_INL T perm4(_0,_0,_2,_1, T a) { }
SIMDPP_INL T perm4(_0,_0,_2,_2, T a) { return dup_lo(a); }
SIMDPP_INL T perm4(_0,_0,_2,_3, T a) { return sel<0,1,0,0>(a, swap1<0,1>(a)); }
SIMDPP_INL T perm4(_0,_0,_3,_0, T a) { return sel<0,0,1,0>(bcast<0>(a), swap1<3,2>(a)); }
//SIMDPP_INL T perm4(_0,_0,_3,_1, T a) { }
SIMDPP_INL T perm4(_0,_0,_3,_2, T a) { return sel<0,1,1,1>(a, rev21(a)); }
SIMDPP_INL T perm4(_0,_0,_3,_3, T a) { return sel<0,1,1,0>(a, rev21(a)); }
SIMDPP_INL T perm4(_0,_1,_0,_0, T a) { return sel<1,0,1,1>(a, bcast<0>(a)); }
SIMDPP_INL T perm4(_0,_1,_0,_1, T a) { return dup2_lo(a); }
//SIMDPP_INL T perm4(_0,_1,_0,_2, T a) { }
SIMDPP_INL T perm4(_0,_1,_0,_3, T a) { return sel<0,0,1,0>(a, swap1<0,2>(a)); }
SIMDPP_INL T perm4(_0,_1,_1,_0, T a) { return sel<0,0,1,1>(a, rev41(a)); }
SIMDPP_INL T perm4(_0,_1,_1,_1, T a) { return sel<0,0,1,1>(a, bcast<1>(a)); }
SIMDPP_INL T perm4(_0,_1,_1,_2, T a) { return sel<0,0,1,1>(a, mov_r<1>(a)); }
SIMDPP_INL T perm4(_0,_1,_1,_3, T a) { return sel<0,0,1,0>(a, swap1<1,2>(a)); }
SIMDPP_INL T perm4(_0,_1,_2,_0, T a) { return sel<0,0,0,1>(a, swap1<0,3>(a)); }
SIMDPP_INL T perm4(_0,_1,_2,_1, T a) { return sel<0,0,0,1>(a, swap1<1,3>(a)); }
SIMDPP_INL T perm4(_0,_1,_2,_2, T a) { return sel<0,0,0,1>(a, swap1<2,3>(a)); }
SIMDPP_INL T perm4(_0,_1,_2,_3, T a) { return a; }
//SIMDPP_INL T perm4(_0,_1,_3,_0, T a) { }
//SIMDPP_INL T perm4(_0,_1,_3,_1, T a) { }
SIMDPP_INL T perm4(_0,_1,_3,_2, T a) { return sel<0,0,1,1>(a, rev21(a)); }
SIMDPP_INL T perm4(_0,_1,_3,_3, T a) { return sel<0,0,1,0>(a, swap1<3,2>(a)); }
SIMDPP_INL T perm4(_0,_2,_0,_0, T a) { return sel<0,1,0,0>(bcast<0>(a), swap1<2,1>(a)); }
//SIMDPP_INL T perm4(_0,_2,_0,_1, T a) { }
//SIMDPP_INL T perm4(_0,_2,_0,_2, T a) { }
//SIMDPP_INL T perm4(_0,_2,_0,_3, T a) { }
SIMDPP_INL T perm4(_0,_2,_1,_0, T a) { return sel<0,1,1,1>(a, rev41(a)); }
//SIMDPP_INL T perm4(_0,_2,_1,_1, T a) { }
//SIMDPP_INL T perm4(_0,_2,_1,_2, T a) { }
SIMDPP_INL T perm4(_0,_2,_1,_3, T a) { return sel<0,1,1,0>(a, rev41(a)); }
SIMDPP_INL T perm4(_0,_2,_2,_0, T a) { return sel<0,1,0,1>(a, rev41(a)); }
//SIMDPP_INL T perm4(_0,_2,_2,_1, T a) { }
SIMDPP_INL T perm4(_0,_2,_2,_2, T a) { return sel<0,1,1,1>(a, bcast<2>(a)); }
SIMDPP_INL T perm4(_0,_2,_2,_3, T a) { return sel<0,1,0,0>(a, swap1<2,1>(a)); }
//SIMDPP_INL T perm4(_0,_2,_3,_0, T a) { }
//SIMDPP_INL T perm4(_0,_2,_3,_1, T a) { }
//SIMDPP_INL T perm4(_0,_2,_3,_2, T a) { }
SIMDPP_INL T perm4(_0,_2,_3,_3, T a) { return sel<0,1,1,0>(a, mov_l<1>(a)); }
SIMDPP_INL T perm4(_0,_3,_0,_0, T a) { return sel<1,0,1,1>(swap1<3,1>(a), bcast<0>(a)); }
SIMDPP_INL T perm4(_0,_3,_0,_1, T a) { return sel<0,1,1,1>(a, rev42(a)); }
//SIMDPP_INL T perm4(_0,_3,_0,_2, T a) { }
SIMDPP_INL T perm4(_0,_3,_0,_3, T a) { return sel<0,1,1,0>(a, rev42(a)); }
//SIMDPP_INL T perm4(_0,_3,_1,_0, T a) { }
//SIMDPP_INL T perm4(_0,_3,_1,_1, T a) { }
//SIMDPP_INL T perm4(_0,_3,_1,_2, T a) { }
//SIMDPP_INL T perm4(_0,_3,_1,_3, T a) { }
//SIMDPP_INL T perm4(_0,_3,_2,_0, T a) { }
SIMDPP_INL T perm4(_0,_3,_2,_1, T a) { return sel<0,1,0,1>(a, swap1<1,3>(a)); }
//SIMDPP_INL T perm4(_0,_3,_2,_2, T a) { }
SIMDPP_INL T perm4(_0,_3,_2,_3, T a) { return sel<0,1,0,0>(a, swap1<3,1>(a)); }
//SIMDPP_INL T perm4(_0,_3,_3,_0, T a) { }
//SIMDPP_INL T perm4(_0,_3,_3,_1, T a) { }
//SIMDPP_INL T perm4(_0,_3,_3,_2, T a) { }
SIMDPP_INL T perm4(_0,_3,_3,_3, T a) { return sel<0,1,1,1>(a, bcast<3>(a)); }
SIMDPP_INL T perm4(_1,_0,_0,_0, T a) { return sel<0,1,1,1>(swap1<1,0>(a), bcast<0>(a)); }
SIMDPP_INL T perm4(_1,_0,_0,_1, T a) { return rev21(sel<0,0,1,1>(a, rev41(a))); }
//SIMDPP_INL T perm4(_1,_0,_0,_2, T a) { }
//SIMDPP_INL T perm4(_1,_0,_0,_3, T a) { }
SIMDPP_INL T perm4(_1,_0,_1,_0, T a) { return rev21(dup2_lo(a)); }
SIMDPP_INL T perm4(_1,_0,_1,_1, T a) { return sel<1,0,1,1>(swap1<1,0>(a), bcast<1>(a)); }
SIMDPP_INL T perm4(_1,_0,_1,_2, T a) { return sel<0,1,1,1>(swap1<1,0>(a), mov_r<1>(a)); }
//SIMDPP_INL T perm4(_1,_0,_1,_3, T a) { }
//SIMDPP_INL T perm4(_1,_0,_2,_0, T a) { }
//SIMDPP_INL T perm4(_1,_0,_2,_1, T a) { }
SIMDPP_INL T perm4(_1,_0,_2,_2, T a) { return sel<0,0,1,1>(swap1<0,1>(a), bcast<2>(a)); }
SIMDPP_INL T perm4(_1,_0,_2,_3, T a) { return sel<1,1,0,0>(a, swap1<0,1>(a)); }
//SIMDPP_INL T perm4(_1,_0,_3,_0, T a) { }
//SIMDPP_INL T perm4(_1,_0,_3,_1, T a) { }
SIMDPP_INL T perm4(_1,_0,_3,_2, T a) { return rev21(a); }
SIMDPP_INL T perm4(_1,_0,_3,_3, T a) { return sel<0,0,1,1>(swap1<0,1>(a), bcast<3>(a)); }
SIMDPP_INL T perm4(_1,_1,_0,_0, T a) { return rev42(dup_unpack_lo(a)); }
SIMDPP_INL T perm4(_1,_1,_0,_1, T a) { return sel<1,1,0,1>(swap1<0,2>(a), bcast<1>(a));}
//SIMDPP_INL T perm4(_1,_1,_0,_2, T a) { }
//SIMDPP_INL T perm4(_1,_1,_0,_3, T a) { }
SIMDPP_INL T perm4(_1,_1,_1,_0, T a) { return sel<1,1,1,0>(swap1<0,3>(a), bcast<1>(a)); }
SIMDPP_INL T perm4(_1,_1,_1,_1, T a) { return bcast<1>(a); }
SIMDPP_INL T perm4(_1,_1,_1,_2, T a) { return sel<1,1,1,0>(swap1<2,3>(a), bcast<1>(a)); }
SIMDPP_INL T perm4(_1,_1,_1,_3, T a) { return sel<1,1,1,0>(a, bcast<1>(a)); }
//SIMDPP_INL T perm4(_1,_1,_2,_0, T a) { }
SIMDPP_INL T perm4(_1,_1,_2,_1, T a) { return sel<1,1,0,1>(a, bcast<1>(a)); }
SIMDPP_INL T perm4(_1,_1,_2,_2, T a) { return sel<1,0,0,1>(a, rev21(a)); }
SIMDPP_INL T perm4(_1,_1,_2,_3, T a) { return sel<1,0,0,0>(a, swap1<1,0>(a)); }
//SIMDPP_INL T perm4(_1,_1,_3,_0, T a) { }
SIMDPP_INL T perm4(_1,_1,_3,_1, T a) { return sel<1,1,0,1>(swap1<3,2>(a), bcast<1>(a)); }
//SIMDPP_INL T perm4(_1,_1,_3,_2, T a) { }
SIMDPP_INL T perm4(_1,_1,_3,_3, T a) { return dup_hi(a); }
//SIMDPP_INL T perm4(_1,_2,_0,_0, T a) { }
SIMDPP_INL T perm4(_1,_2,_0,_1, T a) { return sel<1,1,0,0>(rev42(a), mov_l<1>(a)); }
//SIMDPP_INL T perm4(_1,_2,_0,_2, T a) { }
//SIMDPP_INL T perm4(_1,_2,_0,_3, T a) { }
SIMDPP_INL T perm4(_1,_2,_1,_0, T a) { a = rev41(a); return sel<1,0,0,0>(a, swap1<2,0>(a)); }
SIMDPP_INL T perm4(_1,_2,_1,_1, T a) { return sel<1,0,1,1>(swap1<2,1>(a), bcast<1>(a)); }
//SIMDPP_INL T perm4(_1,_2,_1,_2, T a) { }
//SIMDPP_INL T perm4(_1,_2,_1,_3, T a) { }
//SIMDPP_INL T perm4(_1,_2,_2,_0, T a) { }
//SIMDPP_INL T perm4(_1,_2,_2,_1, T a) { }
SIMDPP_INL T perm4(_1,_2,_2,_2, T a) { return sel<0,1,1,1>(swap1<1,0>(a), bcast<2>(a)); }
//SIMDPP_INL T perm4(_1,_2,_2,_3, T a) { }
SIMDPP_INL T perm4(_1,_2,_3,_0, T a) { return sel<0,0,0,1>(mov_l<1>(a), swap1<0,3>(a)); }
SIMDPP_INL T perm4(_1,_2,_3,_1, T a) { return sel<0,0,0,1>(mov_l<1>(a), swap1<1,3>(a)); }
SIMDPP_INL T perm4(_1,_2,_3,_2, T a) { return sel<0,0,0,1>(mov_l<1>(a), swap1<2,3>(a)); }
SIMDPP_INL T perm4(_1,_2,_3,_3, T a) { return sel<0,0,0,1>(mov_l<1>(a), a); }
//SIMDPP_INL T perm4(_1,_3,_0,_0, T a) { }
SIMDPP_INL T perm4(_1,_3,_0,_1, T a) { return rev42(sel<0,0,1,0>(a, swap1<1,2>(a))); }
//SIMDPP_INL T perm4(_1,_3,_0,_2, T a) { }
//SIMDPP_INL T perm4(_1,_3,_0,_3, T a) { }
//SIMDPP_INL T perm4(_1,_3,_1,_0, T a) { }
SIMDPP_INL T perm4(_1,_3,_1,_1, T a) { return sel<1,0,1,1>(swap1<3,1>(a), bcast<1>(a)); }
//SIMDPP_INL T perm4(_1,_3,_1,_2, T a) { }
//SIMDPP_INL T perm4(_1,_3,_1,_3, T a) { }
//SIMDPP_INL T perm4(_1,_3,_2,_0, T a) { }
//SIMDPP_INL T perm4(_1,_3,_2,_1, T a) { }
//SIMDPP_INL T perm4(_1,_3,_2,_2, T a) { }
//SIMDPP_INL T perm4(_1,_3,_2,_3, T a) { }
//SIMDPP_INL T perm4(_1,_3,_3,_0, T a) { }
//SIMDPP_INL T perm4(_1,_3,_3,_1, T a) { }
//SIMDPP_INL T perm4(_1,_3,_3,_2, T a) { }
SIMDPP_INL T perm4(_1,_3,_3,_3, T a) { return sel<0,1,1,1>(swap1<1,0>(a), bcast<3>(a)); }
SIMDPP_INL T perm4(_2,_0,_0,_0, T a) { return sel<0,1,1,1>(swap1<2,0>(a), bcast<0>(a)); }
SIMDPP_INL T perm4(_2,_0,_0,_1, T a) { return rev42(sel<0,0,0,1>(a, swap1<0,3>(a))); }
//SIMDPP_INL T perm4(_2,_0,_0,_2, T a) { }
//SIMDPP_INL T perm4(_2,_0,_0,_3, T a) { }
//SIMDPP_INL T perm4(_2,_0,_1,_0, T a) { }
//SIMDPP_INL T perm4(_2,_0,_1,_1, T a) { }
SIMDPP_INL T perm4(_2,_0,_1,_2, T a) { return sel<0,1,1,1>(swap1<2,0>(a), mov_r<1>(a)); }
//SIMDPP_INL T perm4(_2,_0,_1,_3, T a) { }
//SIMDPP_INL T perm4(_2,_0,_2,_0, T a) { }
//SIMDPP_INL T perm4(_2,_0,_2,_1, T a) { }
SIMDPP_INL T perm4(_2,_0,_2,_2, T a) { return sel<1,0,1,1>(swap1<0,1>(a), bcast<2>(a)); }
//SIMDPP_INL T perm4(_2,_0,_2,_3, T a) { }
//SIMDPP_INL T perm4(_2,_0,_3,_0, T a) { }
//SIMDPP_INL T perm4(_2,_0,_3,_1, T a) { }
//SIMDPP_INL T perm4(_2,_0,_3,_2, T a) { }
//SIMDPP_INL T perm4(_2,_0,_3,_3, T a) { }
//SIMDPP_INL T perm4(_2,_1,_0,_0, T a) { }
SIMDPP_INL T perm4(_2,_1,_0,_1, T a) { return rev42(sel<0,0,0,1>(a, swap1<1,3>(a))); }
//SIMDPP_INL T perm4(_2,_1,_0,_2, T a) { }
SIMDPP_INL T perm4(_2,_1,_0,_3, T a) { return sel<1,0,1,0>(a, swap1<0,2>(a)); }
//SIMDPP_INL T perm4(_2,_1,_1,_0, T a) { }
SIMDPP_INL T perm4(_2,_1,_1,_1, T a) { return sel<0,1,1,1>(swap1<2,0>(a), bcast<1>(a)); }
//SIMDPP_INL T perm4(_2,_1,_1,_2, T a) { }
//SIMDPP_INL T perm4(_2,_1,_1,_3, T a) { }
//SIMDPP_INL T perm4(_2,_1,_2,_0, T a) { }
//SIMDPP_INL T perm4(_2,_1,_2,_1, T a) { }
SIMDPP_INL T perm4(_2,_1,_2,_2, T a) { return sel<1,0,1,1>(a, bcast<2>(a)); }
SIMDPP_INL T perm4(_2,_1,_2,_3, T a) { return sel<1,0,0,0>(a, swap1<2,0>(a)); }
//SIMDPP_INL T perm4(_2,_1,_3,_0, T a) { }
//SIMDPP_INL T perm4(_2,_1,_3,_1, T a) { }
//SIMDPP_INL T perm4(_2,_1,_3,_2, T a) { }
//SIMDPP_INL T perm4(_2,_1,_3,_3, T a) { }
SIMDPP_INL T perm4(_2,_2,_0,_0, T a) { return rev41(dup_lo(a)); }
SIMDPP_INL T perm4(_2,_2,_0,_1, T a) { return rev42(sel<0,0,0,1>(a, swap1<2,3>(a))); }
SIMDPP_INL T perm4(_2,_2,_0,_2, T a) { return sel<1,1,0,1>(swap1<0,2>(a), bcast<2>(a)); }
//SIMDPP_INL T perm4(_2,_2,_0,_3, T a) { }
SIMDPP_INL T perm4(_2,_2,_1,_0, T a) { a = rev41(a); return sel<1,0,0,0>(a, swap1<1,0>(a)); }
SIMDPP_INL T perm4(_2,_2,_1,_1, T a) { a = rev41(a); return sel<1,0,0,1>(a, rev21(a)); }
SIMDPP_INL T perm4(_2,_2,_1,_2, T a) { return sel<1,1,0,1>(swap1<1,2>(a), bcast<2>(a)); }
//SIMDPP_INL T perm4(_2,_2,_1,_3, T a) { }
SIMDPP_INL T perm4(_2,_2,_2,_0, T a) { return sel<1,1,1,0>(swap1<0,3>(a), bcast<2>(a)); }
SIMDPP_INL T perm4(_2,_2,_2,_1, T a) { return sel<1,1,1,0>(swap1<1,3>(a), bcast<2>(a)); }
SIMDPP_INL T perm4(_2,_2,_2,_2, T a) { return bcast<2>(a); }
SIMDPP_INL T perm4(_2,_2,_2,_3, T a) { return sel<1,1,1,0>(a, bcast<2>(a)); }
//SIMDPP_INL T perm4(_2,_2,_3,_0, T a) { }
//SIMDPP_INL T perm4(_2,_2,_3,_1, T a) { }
SIMDPP_INL T perm4(_2,_2,_3,_2, T a) { return sel<1,1,0,1>(swap1<3,2>(a), bcast<2>(a)); }
SIMDPP_INL T perm4(_2,_2,_3,_3, T a) { return dup_unpack_hi(a); }
SIMDPP_INL T perm4(_2,_3,_0,_0, T a) { return rev42(sel<0,1,0,0>(a, swap1<0,1>(a))); }
SIMDPP_INL T perm4(_2,_3,_0,_1, T a) { return rev42(a); }
SIMDPP_INL T perm4(_2,_3,_0,_2, T a) { return rev42(sel<0,1,0,0>(a, swap1<2,1>(a))); }
SIMDPP_INL T perm4(_2,_3,_0,_3, T a) { return rev42(sel<0,1,0,0>(a, swap1<3,1>(a))); }
SIMDPP_INL T perm4(_2,_3,_1,_0, T a) { return rev42(sel<1,1,0,0>(a, swap1<1,0>(a))); }
SIMDPP_INL T perm4(_2,_3,_1,_1, T a) { return rev42(sel<1,0,0,0>(a, swap1<1,0>(a))); }
SIMDPP_INL T perm4(_2,_3,_1,_2, T a) { return sel<0,0,1,1>(rev42(a), mov_r<1>(a)); }
//SIMDPP_INL T perm4(_2,_3,_1,_3, T a) { }
//SIMDPP_INL T perm4(_2,_3,_2,_0, T a) { }
SIMDPP_INL T perm4(_2,_3,_2,_1, T a) { return rev42(sel<1,0,0,0>(a, swap1<2,0>(a))); }
SIMDPP_INL T perm4(_2,_3,_2,_2, T a) { return sel<1,0,1,1>(swap1<3,1>(a), bcast<2>(a)); }
SIMDPP_INL T perm4(_2,_3,_2,_3, T a) { return dup2_hi(a); }
//SIMDPP_INL T perm4(_2,_3,_3,_0, T a) { }
SIMDPP_INL T perm4(_2,_3,_3,_1, T a) { return rev42(sel<1,0,0,0>(a, swap1<3,0>(a))); }
SIMDPP_INL T perm4(_2,_3,_3,_2, T a) { return rev21(sel<1,1,0,0>(a, rev41(a))); }
SIMDPP_INL T perm4(_2,_3,_3,_3, T a) { return sel<0,1,1,1>(swap1<2,0>(a), bcast<3>(a)); }
SIMDPP_INL T perm4(_3,_0,_0,_0, T a) { return sel<0,1,1,1>(swap1<3,0>(a), bcast<0>(a)); }
//SIMDPP_INL T perm4(_3,_0,_0,_1, T a) { }
//SIMDPP_INL T perm4(_3,_0,_0,_2, T a) { }
//SIMDPP_INL T perm4(_3,_0,_0,_3, T a) { }
SIMDPP_INL T perm4(_3,_0,_1,_0, T a) { a = rev41(a); return sel<0,1,0,0>(a, swap1<3,1>(a)); }
//SIMDPP_INL T perm4(_3,_0,_1,_1, T a) { }
SIMDPP_INL T perm4(_3,_0,_1,_2, T a) { return sel<0,1,1,1>(swap1<3,0>(a), mov_r<1>(a)); }
//SIMDPP_INL T perm4(_3,_0,_1,_3, T a) { }
//SIMDPP_INL T perm4(_3,_0,_2,_0, T a) { }
//SIMDPP_INL T perm4(_3,_0,_2,_1, T a) { }
//SIMDPP_INL T perm4(_3,_0,_2,_2, T a) { }
//SIMDPP_INL T perm4(_3,_0,_2,_3, T a) { }
//SIMDPP_INL T perm4(_3,_0,_3,_0, T a) { }
//SIMDPP_INL T perm4(_3,_0,_3,_1, T a) { }
//SIMDPP_INL T perm4(_3,_0,_3,_2, T a) { }
SIMDPP_INL T perm4(_3,_0,_3,_3, T a) { return sel<1,0,1,1>(swap1<0,1>(a), bcast<3>(a)); }
//SIMDPP_INL T perm4(_3,_1,_0,_0, T a) { }
//SIMDPP_INL T perm4(_3,_1,_0,_1, T a) { }
//SIMDPP_INL T perm4(_3,_1,_0,_2, T a) { }
//SIMDPP_INL T perm4(_3,_1,_0,_3, T a) { }
SIMDPP_INL T perm4(_3,_1,_1,_0, T a) { a = rev41(a); return sel<0,1,0,0>(a, swap1<2,1>(a)); }
SIMDPP_INL T perm4(_3,_1,_1,_1, T a) { return sel<0,1,1,1>(swap1<3,0>(a), bcast<1>(a)); }
//SIMDPP_INL T perm4(_3,_1,_1,_2, T a) { }
SIMDPP_INL T perm4(_3,_1,_1,_3, T a) { return sel<1,0,1,0>(a, rev41(a)); }
SIMDPP_INL T perm4(_3,_1,_2,_0, T a) { return sel<1,0,0,1>(a, swap1<3,0>(a)); }
//SIMDPP_INL T perm4(_3,_1,_2,_1, T a) { }
//SIMDPP_INL T perm4(_3,_1,_2,_2, T a) { }
SIMDPP_INL T perm4(_3,_1,_2,_3, T a) { return sel<1,0,0,0>(a, swap1<3,0>(a)); }
//SIMDPP_INL T perm4(_3,_1,_3,_0, T a) { }
//SIMDPP_INL T perm4(_3,_1,_3,_1, T a) { }
//SIMDPP_INL T perm4(_3,_1,_3,_2, T a) { }
SIMDPP_INL T perm4(_3,_1,_3,_3, T a) { return sel<1,0,1,1>(a, bcast<3>(a)); }
SIMDPP_INL T perm4(_3,_2,_0,_0, T a) { return rev41(sel<0,1,0,0>(a, swap1<0,1>(a))); }
SIMDPP_INL T perm4(_3,_2,_0,_1, T a) { return rev41(sel<1,1,0,0>(a, swap1<0,1>(a))); }
//SIMDPP_INL T perm4(_3,_2,_0,_2, T a) { }
//SIMDPP_INL T perm4(_3,_2,_0,_3, T a) { }
SIMDPP_INL T perm4(_3,_2,_1,_0, T a) { return rev41(a); }
SIMDPP_INL T perm4(_3,_2,_1,_1, T a) { a = rev41(a); return sel<0,0,0,1>(a, swap1<2,3>(a)); }
SIMDPP_INL T perm4(_3,_2,_1,_2, T a) { a = rev41(a); return sel<0,0,0,1>(a, swap1<1,3>(a)); }
SIMDPP_INL T perm4(_3,_2,_1,_3, T a) { a = rev41(a); return sel<0,0,0,1>(a, swap1<0,3>(a)); }
SIMDPP_INL T perm4(_3,_2,_2,_0, T a) { return rev41(sel<0,1,0,0>(a, swap1<2,1>(a))); }
SIMDPP_INL T perm4(_3,_2,_2,_1, T a) { a = rev41(a); return sel<0,0,1,1>(a, mov_r<1>(a)); }
SIMDPP_INL T perm4(_3,_2,_2,_2, T a) { return sel<0,1,1,1>(swap1<3,0>(a), bcast<2>(a)); }
SIMDPP_INL T perm4(_3,_2,_2,_3, T a) { return sel<1,1,0,0>(a, rev41(a)); }
SIMDPP_INL T perm4(_3,_2,_3,_0, T a) { return rev41(sel<0,1,0,0>(a, swap1<3,1>(a))); }
//SIMDPP_INL T perm4(_3,_2,_3,_1, T a) { }
SIMDPP_INL T perm4(_3,_2,_3,_2, T a) { return rev21(dup2_hi(a)); }
SIMDPP_INL T perm4(_3,_2,_3,_3, T a) { return sel<1,0,1,1>(swap1<2,1>(a), bcast<3>(a)); }
SIMDPP_INL T perm4(_3,_3,_0,_0, T a) { return rev42(sel<0,1,1,0>(a, rev21(a))); }
SIMDPP_INL T perm4(_3,_3,_0,_1, T a) { return rev42(sel<0,0,1,0>(a, swap1<3,2>(a))); }
//SIMDPP_INL T perm4(_3,_3,_0,_2, T a) { }
SIMDPP_INL T perm4(_3,_3,_0,_3, T a) { return sel<1,1,0,1>(swap1<0,2>(a), bcast<3>(a)); }
SIMDPP_INL T perm4(_3,_3,_1,_0, T a) { a = rev41(a); return sel<0,1,0,0>(a, swap1<0,1>(a)); }
SIMDPP_INL T perm4(_3,_3,_1,_1, T a) { return rev41(dup_hi(a)); }
SIMDPP_INL T perm4(_3,_3,_1,_2, T a) { return sel<1,1,0,0>(mov_r<1>(a), bcast<3>(a)); }
SIMDPP_INL T perm4(_3,_3,_1,_3, T a) { return sel<1,1,0,1>(swap1<1,2>(a), bcast<3>(a)); }
SIMDPP_INL T perm4(_3,_3,_2,_0, T a) { a = rev41(a); return sel<0,1,1,0>(a, mov_r<1>(a)); }
SIMDPP_INL T perm4(_3,_3,_2,_1, T a) { a = rev41(a); return sel<0,1,1,1>(a, mov_r<1>(a)); }
SIMDPP_INL T perm4(_3,_3,_2,_2, T a) { return rev41(dup_unpack_hi(a)); }
SIMDPP_INL T perm4(_3,_3,_2,_3, T a) { return sel<1,1,0,1>(a, bcast<3>(a)); }
SIMDPP_INL T perm4(_3,_3,_3,_0, T a) { return sel<1,1,1,0>(swap1<0,3>(a), bcast<3>(a)); }
SIMDPP_INL T perm4(_3,_3,_3,_1, T a) { return sel<1,1,1,0>(swap1<1,3>(a), bcast<3>(a)); }
SIMDPP_INL T perm4(_3,_3,_3,_2, T a) { return sel<1,1,1,0>(swap1<2,3>(a), bcast<3>(a)); }
SIMDPP_INL T perm4(_3,_3,_3,_3, T a) { return bcast<3>(a); }
465 | |
// Catch-all overload: selector combinations with no dedicated cheap sequence
// above (the commented-out cases) match this template and are implemented via
// the TBL-based generic fallback. Overload resolution prefers the exact
// non-template overloads, so this is only chosen when none exists.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
T perm4(std::integral_constant<unsigned, s0>,
        std::integral_constant<unsigned, s1>,
        std::integral_constant<unsigned, s2>,
        std::integral_constant<unsigned, s3>, T a)
{
    return fallback<s0,s1,s2,s3>(a);
}
474 | |
475 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
476 | T permute4(T a) |
477 | { |
478 | return perm4(std::integral_constant<unsigned, s0>{}, |
479 | std::integral_constant<unsigned, s1>{}, |
480 | std::integral_constant<unsigned, s2>{}, |
481 | std::integral_constant<unsigned, s3>{}, a); |
482 | } |
483 | |
484 | } // namespace neon_shuffle_int16x8 |
485 | } // namespace detail |
486 | } // namespace SIMDPP_ARCH_NAMESPACE |
487 | } // namespace simdpp |
488 | |
489 | #endif |
490 | #endif |
491 | |