1/* Copyright (C) 2011-2012 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_DETAIL_SHUFFLE_NEON_INT16x8_H
9#define LIBSIMDPP_DETAIL_SHUFFLE_NEON_INT16x8_H
10#if SIMDPP_USE_NEON
11
12#include <simdpp/core/make_shuffle_bytes_mask.h>
13#include <simdpp/core/permute_bytes16.h>
14#include <type_traits>
15
16namespace simdpp {
17namespace SIMDPP_ARCH_NAMESPACE {
18namespace detail {
19namespace neon_shuffle_int16x8 {
20
/*
    The code below implements generalized permutations of 4-element sets
    within int16x8 vectors using the various shuffling instructions available
    on NEON. If no straightforward permutation is available, the TBL
    instruction is used.

    Note: the compiler should optimize all masks into one VMOV #imm instruction
*/
29
30using _0 = std::integral_constant<unsigned, 0>;
31using _1 = std::integral_constant<unsigned, 1>;
32using _2 = std::integral_constant<unsigned, 2>;
33using _3 = std::integral_constant<unsigned, 3>;
34using T = uint16x8; // full vector
35using H = uint16x4_t; // half vector
36
37/// Cost: 2
38template<unsigned n> SIMDPP_INL
39T bcast(T a)
40{
41 H h1 = vget_low_u16(a.native());
42 H h2 = vget_high_u16(a.native());
43 h1 = vdup_lane_u16(h1, n);
44 h2 = vdup_lane_u16(h2, n);
45 return vcombine_u16(h1, h2);
46}
47
48/** Selects the elements from two vectors according to selectors. 0 selects
49 the value from the first vector, 1 selects from the second
50 Cost: 2
51*/
52template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
53T sel(T a, T b)
54{
55 const uint64_t um =
56 (s0 > 0 ? 0xffffLL : 0) |
57 (s1 > 0 ? 0xffffLL << 16 : 0) |
58 (s2 > 0 ? 0xffffLL << 32 : 0) |
59 (s3 > 0 ? 0xffffLL << 48 : 0);
60 uint16x8_t mask = vreinterpretq_u16_u64(vmovq_n_u64(um));
61
62 return vbslq_u16(mask, b.native(), a.native());
63}
64
65/** Within each 4-element set moves the elements to the left or right. The
66 shifted-in values are undefined.
67 Cost: 1
68*/
69template<unsigned shift> SIMDPP_INL
70T mov_r(T a)
71{
72 return vreinterpretq_u16_u64(vshlq_n_u64(vreinterpretq_u64_u16(a.native()), shift*16));
73}
74
75template<unsigned shift> SIMDPP_INL
76T mov_l(T a)
77{
78 return vreinterpretq_u16_u64(vshrq_n_u64(vreinterpretq_u64_u16(a.native()), shift*16));
79}
80
81/// Within each 4-element set: r0 = a3; r1 = a2; r2 = a1; r3 = a0; - 3210
82/// Cost: 1
83SIMDPP_INL T rev41(T a)
84{
85 return vrev64q_u16(a.native());
86}
87
88/// Within each 4-element set: r0 = a2; r1 = a3; r2 = a0; r3 = a1; - 2301
89/// Cost: 1
90SIMDPP_INL T rev42(T a)
91{
92 return vreinterpretq_u16_u32(vrev64q_u32(vreinterpretq_u32_u16(a.native())));
93}
94
95/// Within each 4-element set: r0 = a1; r1 = a0; r2 = a3; r3 = a2; - 1032
96/// Cost: 1
97SIMDPP_INL T rev21(T a)
98{
99 return vrev32q_u16(a.native());
100}
101
102/// Within each 4-element set: r0 = a0; r1 = a0; r2 = a2; r3 = a2; - 0022
103/// Cost: 2
104SIMDPP_INL T dup_lo(T a)
105{
106 T b = a;
107 return vtrnq_u16(a.native(), b.native()).val[0];
108}
109
110/// Within each 4-element set: r0 = a1; r1 = a1; r2 = a3; r3 = a3; - 1133
111/// Cost: 2
112SIMDPP_INL T dup_hi(T a)
113{
114 T b = a;
115 return vtrnq_u16(a.native(), b.native()).val[1];
116}
117
118/// Within each 4-element set: r0 = a0; r1 = a1; r2 = a0; r3 = a1; - 0101
119/// Cost: 2
120SIMDPP_INL T dup2_lo(T a)
121{
122 uint32x4_t i;
123 uint32x2_t lo, hi;
124 i = vreinterpretq_u32_u16(a.native());
125 lo = vget_low_u32(i);
126 hi = vget_high_u32(i);
127 lo = vdup_lane_u32(lo, 0);
128 hi = vdup_lane_u32(hi, 0);
129 return vreinterpretq_u16_u32(vcombine_u32(lo, hi));
130}
131
132/// Within each 4-element set: r0 = a2; r1 = a3; r2 = a2; r3 = a3; - 2323
133/// Cost: 2
134SIMDPP_INL T dup2_hi(T a)
135{
136 uint32x4_t i;
137 uint32x2_t lo, hi;
138 i = vreinterpretq_u32_u16(a.native());
139 lo = vget_low_u32(i);
140 hi = vget_high_u32(i);
141 lo = vdup_lane_u32(lo, 1);
142 hi = vdup_lane_u32(hi, 1);
143 return vreinterpretq_u16_u32(vcombine_u32(lo, hi));
144}
145
146/// Within each 4-element set: r0 = a0; r1 = a0; r2 = a1; r3 = a1; - 0011
147/// Cost: 3
148SIMDPP_INL T dup_unpack_lo(T a)
149{
150 H lo = vget_low_u16(a.native());
151 H hi = vget_high_u16(a.native());
152 H rlo = vzip_u16(lo, lo).val[0];
153 H rhi = vzip_u16(hi, hi).val[0];
154 return vcombine_u16(rlo, rhi);
155}
156
157/// Within each 4-element set: r0 = a2; r1 = a2; r2 = a3; r3 = a3; - 2233
158/// Cost: 3
159SIMDPP_INL T dup_unpack_hi(T a)
160{
161 H lo = vget_low_u16(a.native());
162 H hi = vget_high_u16(a.native());
163 H rlo = vzip_u16(lo, lo).val[1];
164 H rhi = vzip_u16(hi, hi).val[1];
165 return vcombine_u16(rlo, rhi);
166}
167
168/// Swaps the n1-th element with the element in n2-th position. The values of
169/// other elements are undefined. Cost: 1
170template<unsigned n1, unsigned n2> SIMDPP_INL
171T swap1(T a)
172{
173 const unsigned sel = n1*4 + n2;
174 switch (sel) {
175 case 0: //00
176 case 5: //11
177 case 10: //22
178 case 15: //33
179 return a;
180 case 1: //01
181 case 4: //10
182 case 11: //23
183 case 14: //32
184 return rev21(a);
185 case 2: //02
186 case 8: //20
187 case 7: //13
188 case 13: //31
189 return rev42(a);
190 case 3: //03
191 case 12: //30
192 case 6: //12
193 case 9: //21
194 return rev41(a);
195 }
196 return a;
197}
198
199/// Cost: 3 (includes one 16-byte load)
200template<unsigned N0, unsigned N1, unsigned N2, unsigned N3> SIMDPP_INL
201T fallback(T a)
202{
203 T mask;
204 mask = make_shuffle_bytes16_mask<N0,N1,N2,N3>(mask);
205 return permute_bytes16(a, mask);
206}
207
208static SIMDPP_INL
209T perm4(_0,_0,_0,_0, T a) { return bcast<0>(a); }
210SIMDPP_INL T perm4(_0,_0,_0,_1, T a) { return sel<0,0,0,1>(bcast<0>(a), swap1<1,3>(a)); }
211SIMDPP_INL T perm4(_0,_0,_0,_2, T a) { return sel<0,0,0,1>(bcast<0>(a), swap1<2,3>(a)); }
212SIMDPP_INL T perm4(_0,_0,_0,_3, T a) { return sel<1,1,1,0>(a, bcast<0>(a)); }
213SIMDPP_INL T perm4(_0,_0,_1,_0, T a) { return sel<0,0,1,0>(bcast<0>(a), swap1<1,2>(a)); }
214SIMDPP_INL T perm4(_0,_0,_1,_1, T a) { return dup_unpack_lo(a); }
215SIMDPP_INL T perm4(_0,_0,_1,_2, T a) { return sel<0,1,1,1>(a, mov_r<1>(a)); }
216SIMDPP_INL T perm4(_0,_0,_1,_3, T a) { return sel<0,1,1,0>(a, mov_r<1>(a)); }
217SIMDPP_INL T perm4(_0,_0,_2,_0, T a) { return sel<1,1,0,1>(a, bcast<0>(a));}
218//SIMDPP_INL T perm4(_0,_0,_2,_1, T a) { }
219SIMDPP_INL T perm4(_0,_0,_2,_2, T a) { return dup_lo(a); }
220SIMDPP_INL T perm4(_0,_0,_2,_3, T a) { return sel<0,1,0,0>(a, swap1<0,1>(a)); }
221SIMDPP_INL T perm4(_0,_0,_3,_0, T a) { return sel<0,0,1,0>(bcast<0>(a), swap1<3,2>(a)); }
222//SIMDPP_INL T perm4(_0,_0,_3,_1, T a) { }
223SIMDPP_INL T perm4(_0,_0,_3,_2, T a) { return sel<0,1,1,1>(a, rev21(a)); }
224SIMDPP_INL T perm4(_0,_0,_3,_3, T a) { return sel<0,1,1,0>(a, rev21(a)); }
225SIMDPP_INL T perm4(_0,_1,_0,_0, T a) { return sel<1,0,1,1>(a, bcast<0>(a)); }
226SIMDPP_INL T perm4(_0,_1,_0,_1, T a) { return dup2_lo(a); }
227//SIMDPP_INL T perm4(_0,_1,_0,_2, T a) { }
228SIMDPP_INL T perm4(_0,_1,_0,_3, T a) { return sel<0,0,1,0>(a, swap1<0,2>(a)); }
229SIMDPP_INL T perm4(_0,_1,_1,_0, T a) { return sel<0,0,1,1>(a, rev41(a)); }
230SIMDPP_INL T perm4(_0,_1,_1,_1, T a) { return sel<0,0,1,1>(a, bcast<1>(a)); }
231SIMDPP_INL T perm4(_0,_1,_1,_2, T a) { return sel<0,0,1,1>(a, mov_r<1>(a)); }
232SIMDPP_INL T perm4(_0,_1,_1,_3, T a) { return sel<0,0,1,0>(a, swap1<1,2>(a)); }
233SIMDPP_INL T perm4(_0,_1,_2,_0, T a) { return sel<0,0,0,1>(a, swap1<0,3>(a)); }
234SIMDPP_INL T perm4(_0,_1,_2,_1, T a) { return sel<0,0,0,1>(a, swap1<1,3>(a)); }
235SIMDPP_INL T perm4(_0,_1,_2,_2, T a) { return sel<0,0,0,1>(a, swap1<2,3>(a)); }
236SIMDPP_INL T perm4(_0,_1,_2,_3, T a) { return a; }
237//SIMDPP_INL T perm4(_0,_1,_3,_0, T a) { }
238//SIMDPP_INL T perm4(_0,_1,_3,_1, T a) { }
239SIMDPP_INL T perm4(_0,_1,_3,_2, T a) { return sel<0,0,1,1>(a, rev21(a)); }
240SIMDPP_INL T perm4(_0,_1,_3,_3, T a) { return sel<0,0,1,0>(a, swap1<3,2>(a)); }
241SIMDPP_INL T perm4(_0,_2,_0,_0, T a) { return sel<0,1,0,0>(bcast<0>(a), swap1<2,1>(a)); }
242//SIMDPP_INL T perm4(_0,_2,_0,_1, T a) { }
243//SIMDPP_INL T perm4(_0,_2,_0,_2, T a) { }
244//SIMDPP_INL T perm4(_0,_2,_0,_3, T a) { }
245SIMDPP_INL T perm4(_0,_2,_1,_0, T a) { return sel<0,1,1,1>(a, rev41(a)); }
246//SIMDPP_INL T perm4(_0,_2,_1,_1, T a) { }
247//SIMDPP_INL T perm4(_0,_2,_1,_2, T a) { }
248SIMDPP_INL T perm4(_0,_2,_1,_3, T a) { return sel<0,1,1,0>(a, rev41(a)); }
249SIMDPP_INL T perm4(_0,_2,_2,_0, T a) { return sel<0,1,0,1>(a, rev41(a)); }
250//SIMDPP_INL T perm4(_0,_2,_2,_1, T a) { }
251SIMDPP_INL T perm4(_0,_2,_2,_2, T a) { return sel<0,1,1,1>(a, bcast<2>(a)); }
252SIMDPP_INL T perm4(_0,_2,_2,_3, T a) { return sel<0,1,0,0>(a, swap1<2,1>(a)); }
253//SIMDPP_INL T perm4(_0,_2,_3,_0, T a) { }
254//SIMDPP_INL T perm4(_0,_2,_3,_1, T a) { }
255//SIMDPP_INL T perm4(_0,_2,_3,_2, T a) { }
256SIMDPP_INL T perm4(_0,_2,_3,_3, T a) { return sel<0,1,1,0>(a, mov_l<1>(a)); }
257SIMDPP_INL T perm4(_0,_3,_0,_0, T a) { return sel<1,0,1,1>(swap1<3,1>(a), bcast<0>(a)); }
258SIMDPP_INL T perm4(_0,_3,_0,_1, T a) { return sel<0,1,1,1>(a, rev42(a)); }
259//SIMDPP_INL T perm4(_0,_3,_0,_2, T a) { }
260SIMDPP_INL T perm4(_0,_3,_0,_3, T a) { return sel<0,1,1,0>(a, rev42(a)); }
261//SIMDPP_INL T perm4(_0,_3,_1,_0, T a) { }
262//SIMDPP_INL T perm4(_0,_3,_1,_1, T a) { }
263//SIMDPP_INL T perm4(_0,_3,_1,_2, T a) { }
264//SIMDPP_INL T perm4(_0,_3,_1,_3, T a) { }
265//SIMDPP_INL T perm4(_0,_3,_2,_0, T a) { }
266SIMDPP_INL T perm4(_0,_3,_2,_1, T a) { return sel<0,1,0,1>(a, swap1<1,3>(a)); }
267//SIMDPP_INL T perm4(_0,_3,_2,_2, T a) { }
268SIMDPP_INL T perm4(_0,_3,_2,_3, T a) { return sel<0,1,0,0>(a, swap1<3,1>(a)); }
269//SIMDPP_INL T perm4(_0,_3,_3,_0, T a) { }
270//SIMDPP_INL T perm4(_0,_3,_3,_1, T a) { }
271//SIMDPP_INL T perm4(_0,_3,_3,_2, T a) { }
272SIMDPP_INL T perm4(_0,_3,_3,_3, T a) { return sel<0,1,1,1>(a, bcast<3>(a)); }
273SIMDPP_INL T perm4(_1,_0,_0,_0, T a) { return sel<0,1,1,1>(swap1<1,0>(a), bcast<0>(a)); }
274SIMDPP_INL T perm4(_1,_0,_0,_1, T a) { return rev21(sel<0,0,1,1>(a, rev41(a))); }
275//SIMDPP_INL T perm4(_1,_0,_0,_2, T a) { }
276//SIMDPP_INL T perm4(_1,_0,_0,_3, T a) { }
277SIMDPP_INL T perm4(_1,_0,_1,_0, T a) { return rev21(dup2_lo(a)); }
278SIMDPP_INL T perm4(_1,_0,_1,_1, T a) { return sel<1,0,1,1>(swap1<1,0>(a), bcast<1>(a)); }
279SIMDPP_INL T perm4(_1,_0,_1,_2, T a) { return sel<0,1,1,1>(swap1<1,0>(a), mov_r<1>(a)); }
280//SIMDPP_INL T perm4(_1,_0,_1,_3, T a) { }
281//SIMDPP_INL T perm4(_1,_0,_2,_0, T a) { }
282//SIMDPP_INL T perm4(_1,_0,_2,_1, T a) { }
283SIMDPP_INL T perm4(_1,_0,_2,_2, T a) { return sel<0,0,1,1>(swap1<0,1>(a), bcast<2>(a)); }
284SIMDPP_INL T perm4(_1,_0,_2,_3, T a) { return sel<1,1,0,0>(a, swap1<0,1>(a)); }
285//SIMDPP_INL T perm4(_1,_0,_3,_0, T a) { }
286//SIMDPP_INL T perm4(_1,_0,_3,_1, T a) { }
287SIMDPP_INL T perm4(_1,_0,_3,_2, T a) { return rev21(a); }
288SIMDPP_INL T perm4(_1,_0,_3,_3, T a) { return sel<0,0,1,1>(swap1<0,1>(a), bcast<3>(a)); }
289SIMDPP_INL T perm4(_1,_1,_0,_0, T a) { return rev42(dup_unpack_lo(a)); }
290SIMDPP_INL T perm4(_1,_1,_0,_1, T a) { return sel<1,1,0,1>(swap1<0,2>(a), bcast<1>(a));}
291//SIMDPP_INL T perm4(_1,_1,_0,_2, T a) { }
292//SIMDPP_INL T perm4(_1,_1,_0,_3, T a) { }
293SIMDPP_INL T perm4(_1,_1,_1,_0, T a) { return sel<1,1,1,0>(swap1<0,3>(a), bcast<1>(a)); }
294SIMDPP_INL T perm4(_1,_1,_1,_1, T a) { return bcast<1>(a); }
295SIMDPP_INL T perm4(_1,_1,_1,_2, T a) { return sel<1,1,1,0>(swap1<2,3>(a), bcast<1>(a)); }
296SIMDPP_INL T perm4(_1,_1,_1,_3, T a) { return sel<1,1,1,0>(a, bcast<1>(a)); }
297//SIMDPP_INL T perm4(_1,_1,_2,_0, T a) { }
298SIMDPP_INL T perm4(_1,_1,_2,_1, T a) { return sel<1,1,0,1>(a, bcast<1>(a)); }
299SIMDPP_INL T perm4(_1,_1,_2,_2, T a) { return sel<1,0,0,1>(a, rev21(a)); }
300SIMDPP_INL T perm4(_1,_1,_2,_3, T a) { return sel<1,0,0,0>(a, swap1<1,0>(a)); }
301//SIMDPP_INL T perm4(_1,_1,_3,_0, T a) { }
302SIMDPP_INL T perm4(_1,_1,_3,_1, T a) { return sel<1,1,0,1>(swap1<3,2>(a), bcast<1>(a)); }
303//SIMDPP_INL T perm4(_1,_1,_3,_2, T a) { }
304SIMDPP_INL T perm4(_1,_1,_3,_3, T a) { return dup_hi(a); }
305//SIMDPP_INL T perm4(_1,_2,_0,_0, T a) { }
306SIMDPP_INL T perm4(_1,_2,_0,_1, T a) { return sel<1,1,0,0>(rev42(a), mov_l<1>(a)); }
307//SIMDPP_INL T perm4(_1,_2,_0,_2, T a) { }
308//SIMDPP_INL T perm4(_1,_2,_0,_3, T a) { }
309SIMDPP_INL T perm4(_1,_2,_1,_0, T a) { a = rev41(a); return sel<1,0,0,0>(a, swap1<2,0>(a)); }
310SIMDPP_INL T perm4(_1,_2,_1,_1, T a) { return sel<1,0,1,1>(swap1<2,1>(a), bcast<1>(a)); }
311//SIMDPP_INL T perm4(_1,_2,_1,_2, T a) { }
312//SIMDPP_INL T perm4(_1,_2,_1,_3, T a) { }
313//SIMDPP_INL T perm4(_1,_2,_2,_0, T a) { }
314//SIMDPP_INL T perm4(_1,_2,_2,_1, T a) { }
315SIMDPP_INL T perm4(_1,_2,_2,_2, T a) { return sel<0,1,1,1>(swap1<1,0>(a), bcast<2>(a)); }
316//SIMDPP_INL T perm4(_1,_2,_2,_3, T a) { }
317SIMDPP_INL T perm4(_1,_2,_3,_0, T a) { return sel<0,0,0,1>(mov_l<1>(a), swap1<0,3>(a)); }
318SIMDPP_INL T perm4(_1,_2,_3,_1, T a) { return sel<0,0,0,1>(mov_l<1>(a), swap1<1,3>(a)); }
319SIMDPP_INL T perm4(_1,_2,_3,_2, T a) { return sel<0,0,0,1>(mov_l<1>(a), swap1<2,3>(a)); }
320SIMDPP_INL T perm4(_1,_2,_3,_3, T a) { return sel<0,0,0,1>(mov_l<1>(a), a); }
321//SIMDPP_INL T perm4(_1,_3,_0,_0, T a) { }
322SIMDPP_INL T perm4(_1,_3,_0,_1, T a) { return rev42(sel<0,0,1,0>(a, swap1<1,2>(a))); }
323//SIMDPP_INL T perm4(_1,_3,_0,_2, T a) { }
324//SIMDPP_INL T perm4(_1,_3,_0,_3, T a) { }
325//SIMDPP_INL T perm4(_1,_3,_1,_0, T a) { }
326SIMDPP_INL T perm4(_1,_3,_1,_1, T a) { return sel<1,0,1,1>(swap1<3,1>(a), bcast<1>(a)); }
327//SIMDPP_INL T perm4(_1,_3,_1,_2, T a) { }
328//SIMDPP_INL T perm4(_1,_3,_1,_3, T a) { }
329//SIMDPP_INL T perm4(_1,_3,_2,_0, T a) { }
330//SIMDPP_INL T perm4(_1,_3,_2,_1, T a) { }
331//SIMDPP_INL T perm4(_1,_3,_2,_2, T a) { }
332//SIMDPP_INL T perm4(_1,_3,_2,_3, T a) { }
333//SIMDPP_INL T perm4(_1,_3,_3,_0, T a) { }
334//SIMDPP_INL T perm4(_1,_3,_3,_1, T a) { }
335//SIMDPP_INL T perm4(_1,_3,_3,_2, T a) { }
336SIMDPP_INL T perm4(_1,_3,_3,_3, T a) { return sel<0,1,1,1>(swap1<1,0>(a), bcast<3>(a)); }
337SIMDPP_INL T perm4(_2,_0,_0,_0, T a) { return sel<0,1,1,1>(swap1<2,0>(a), bcast<0>(a)); }
338SIMDPP_INL T perm4(_2,_0,_0,_1, T a) { return rev42(sel<0,0,0,1>(a, swap1<0,3>(a))); }
339//SIMDPP_INL T perm4(_2,_0,_0,_2, T a) { }
340//SIMDPP_INL T perm4(_2,_0,_0,_3, T a) { }
341//SIMDPP_INL T perm4(_2,_0,_1,_0, T a) { }
342//SIMDPP_INL T perm4(_2,_0,_1,_1, T a) { }
343SIMDPP_INL T perm4(_2,_0,_1,_2, T a) { return sel<0,1,1,1>(swap1<2,0>(a), mov_r<1>(a)); }
344//SIMDPP_INL T perm4(_2,_0,_1,_3, T a) { }
345//SIMDPP_INL T perm4(_2,_0,_2,_0, T a) { }
346//SIMDPP_INL T perm4(_2,_0,_2,_1, T a) { }
347SIMDPP_INL T perm4(_2,_0,_2,_2, T a) { return sel<1,0,1,1>(swap1<0,1>(a), bcast<2>(a)); }
348//SIMDPP_INL T perm4(_2,_0,_2,_3, T a) { }
349//SIMDPP_INL T perm4(_2,_0,_3,_0, T a) { }
350//SIMDPP_INL T perm4(_2,_0,_3,_1, T a) { }
351//SIMDPP_INL T perm4(_2,_0,_3,_2, T a) { }
352//SIMDPP_INL T perm4(_2,_0,_3,_3, T a) { }
353//SIMDPP_INL T perm4(_2,_1,_0,_0, T a) { }
354SIMDPP_INL T perm4(_2,_1,_0,_1, T a) { return rev42(sel<0,0,0,1>(a, swap1<1,3>(a))); }
355//SIMDPP_INL T perm4(_2,_1,_0,_2, T a) { }
356SIMDPP_INL T perm4(_2,_1,_0,_3, T a) { return sel<1,0,1,0>(a, swap1<0,2>(a)); }
357//SIMDPP_INL T perm4(_2,_1,_1,_0, T a) { }
358SIMDPP_INL T perm4(_2,_1,_1,_1, T a) { return sel<0,1,1,1>(swap1<2,0>(a), bcast<1>(a)); }
359//SIMDPP_INL T perm4(_2,_1,_1,_2, T a) { }
360//SIMDPP_INL T perm4(_2,_1,_1,_3, T a) { }
361//SIMDPP_INL T perm4(_2,_1,_2,_0, T a) { }
362//SIMDPP_INL T perm4(_2,_1,_2,_1, T a) { }
363SIMDPP_INL T perm4(_2,_1,_2,_2, T a) { return sel<1,0,1,1>(a, bcast<2>(a)); }
364SIMDPP_INL T perm4(_2,_1,_2,_3, T a) { return sel<1,0,0,0>(a, swap1<2,0>(a)); }
365//SIMDPP_INL T perm4(_2,_1,_3,_0, T a) { }
366//SIMDPP_INL T perm4(_2,_1,_3,_1, T a) { }
367//SIMDPP_INL T perm4(_2,_1,_3,_2, T a) { }
368//SIMDPP_INL T perm4(_2,_1,_3,_3, T a) { }
369SIMDPP_INL T perm4(_2,_2,_0,_0, T a) { return rev41(dup_lo(a)); }
370SIMDPP_INL T perm4(_2,_2,_0,_1, T a) { return rev42(sel<0,0,0,1>(a, swap1<2,3>(a))); }
371SIMDPP_INL T perm4(_2,_2,_0,_2, T a) { return sel<1,1,0,1>(swap1<0,2>(a), bcast<2>(a)); }
372//SIMDPP_INL T perm4(_2,_2,_0,_3, T a) { }
373SIMDPP_INL T perm4(_2,_2,_1,_0, T a) { a = rev41(a); return sel<1,0,0,0>(a, swap1<1,0>(a)); }
374SIMDPP_INL T perm4(_2,_2,_1,_1, T a) { a = rev41(a); return sel<1,0,0,1>(a, rev21(a)); }
375SIMDPP_INL T perm4(_2,_2,_1,_2, T a) { return sel<1,1,0,1>(swap1<1,2>(a), bcast<2>(a)); }
376//SIMDPP_INL T perm4(_2,_2,_1,_3, T a) { }
377SIMDPP_INL T perm4(_2,_2,_2,_0, T a) { return sel<1,1,1,0>(swap1<0,3>(a), bcast<2>(a)); }
378SIMDPP_INL T perm4(_2,_2,_2,_1, T a) { return sel<1,1,1,0>(swap1<1,3>(a), bcast<2>(a)); }
379SIMDPP_INL T perm4(_2,_2,_2,_2, T a) { return bcast<2>(a); }
380SIMDPP_INL T perm4(_2,_2,_2,_3, T a) { return sel<1,1,1,0>(a, bcast<2>(a)); }
381//SIMDPP_INL T perm4(_2,_2,_3,_0, T a) { }
382//SIMDPP_INL T perm4(_2,_2,_3,_1, T a) { }
383SIMDPP_INL T perm4(_2,_2,_3,_2, T a) { return sel<1,1,0,1>(swap1<3,2>(a), bcast<2>(a)); }
384SIMDPP_INL T perm4(_2,_2,_3,_3, T a) { return dup_unpack_hi(a); }
385SIMDPP_INL T perm4(_2,_3,_0,_0, T a) { return rev42(sel<0,1,0,0>(a, swap1<0,1>(a))); }
386SIMDPP_INL T perm4(_2,_3,_0,_1, T a) { return rev42(a); }
387SIMDPP_INL T perm4(_2,_3,_0,_2, T a) { return rev42(sel<0,1,0,0>(a, swap1<2,1>(a))); }
388SIMDPP_INL T perm4(_2,_3,_0,_3, T a) { return rev42(sel<0,1,0,0>(a, swap1<3,1>(a))); }
389SIMDPP_INL T perm4(_2,_3,_1,_0, T a) { return rev42(sel<1,1,0,0>(a, swap1<1,0>(a))); }
390SIMDPP_INL T perm4(_2,_3,_1,_1, T a) { return rev42(sel<1,0,0,0>(a, swap1<1,0>(a))); }
391SIMDPP_INL T perm4(_2,_3,_1,_2, T a) { return sel<0,0,1,1>(rev42(a), mov_r<1>(a)); }
392//SIMDPP_INL T perm4(_2,_3,_1,_3, T a) { }
393//SIMDPP_INL T perm4(_2,_3,_2,_0, T a) { }
394SIMDPP_INL T perm4(_2,_3,_2,_1, T a) { return rev42(sel<1,0,0,0>(a, swap1<2,0>(a))); }
395SIMDPP_INL T perm4(_2,_3,_2,_2, T a) { return sel<1,0,1,1>(swap1<3,1>(a), bcast<2>(a)); }
396SIMDPP_INL T perm4(_2,_3,_2,_3, T a) { return dup2_hi(a); }
397//SIMDPP_INL T perm4(_2,_3,_3,_0, T a) { }
398SIMDPP_INL T perm4(_2,_3,_3,_1, T a) { return rev42(sel<1,0,0,0>(a, swap1<3,0>(a))); }
399SIMDPP_INL T perm4(_2,_3,_3,_2, T a) { return rev21(sel<1,1,0,0>(a, rev41(a))); }
400SIMDPP_INL T perm4(_2,_3,_3,_3, T a) { return sel<0,1,1,1>(swap1<2,0>(a), bcast<3>(a)); }
401SIMDPP_INL T perm4(_3,_0,_0,_0, T a) { return sel<0,1,1,1>(swap1<3,0>(a), bcast<0>(a)); }
402//SIMDPP_INL T perm4(_3,_0,_0,_1, T a) { }
403//SIMDPP_INL T perm4(_3,_0,_0,_2, T a) { }
404//SIMDPP_INL T perm4(_3,_0,_0,_3, T a) { }
405SIMDPP_INL T perm4(_3,_0,_1,_0, T a) { a = rev41(a); return sel<0,1,0,0>(a, swap1<3,1>(a)); }
406//SIMDPP_INL T perm4(_3,_0,_1,_1, T a) { }
407SIMDPP_INL T perm4(_3,_0,_1,_2, T a) { return sel<0,1,1,1>(swap1<3,0>(a), mov_r<1>(a)); }
408//SIMDPP_INL T perm4(_3,_0,_1,_3, T a) { }
409//SIMDPP_INL T perm4(_3,_0,_2,_0, T a) { }
410//SIMDPP_INL T perm4(_3,_0,_2,_1, T a) { }
411//SIMDPP_INL T perm4(_3,_0,_2,_2, T a) { }
412//SIMDPP_INL T perm4(_3,_0,_2,_3, T a) { }
413//SIMDPP_INL T perm4(_3,_0,_3,_0, T a) { }
414//SIMDPP_INL T perm4(_3,_0,_3,_1, T a) { }
415//SIMDPP_INL T perm4(_3,_0,_3,_2, T a) { }
416SIMDPP_INL T perm4(_3,_0,_3,_3, T a) { return sel<1,0,1,1>(swap1<0,1>(a), bcast<3>(a)); }
417//SIMDPP_INL T perm4(_3,_1,_0,_0, T a) { }
418//SIMDPP_INL T perm4(_3,_1,_0,_1, T a) { }
419//SIMDPP_INL T perm4(_3,_1,_0,_2, T a) { }
420//SIMDPP_INL T perm4(_3,_1,_0,_3, T a) { }
421SIMDPP_INL T perm4(_3,_1,_1,_0, T a) { a = rev41(a); return sel<0,1,0,0>(a, swap1<2,1>(a)); }
422SIMDPP_INL T perm4(_3,_1,_1,_1, T a) { return sel<0,1,1,1>(swap1<3,0>(a), bcast<1>(a)); }
423//SIMDPP_INL T perm4(_3,_1,_1,_2, T a) { }
424SIMDPP_INL T perm4(_3,_1,_1,_3, T a) { return sel<1,0,1,0>(a, rev41(a)); }
425SIMDPP_INL T perm4(_3,_1,_2,_0, T a) { return sel<1,0,0,1>(a, swap1<3,0>(a)); }
426//SIMDPP_INL T perm4(_3,_1,_2,_1, T a) { }
427//SIMDPP_INL T perm4(_3,_1,_2,_2, T a) { }
428SIMDPP_INL T perm4(_3,_1,_2,_3, T a) { return sel<1,0,0,0>(a, swap1<3,0>(a)); }
429//SIMDPP_INL T perm4(_3,_1,_3,_0, T a) { }
430//SIMDPP_INL T perm4(_3,_1,_3,_1, T a) { }
431//SIMDPP_INL T perm4(_3,_1,_3,_2, T a) { }
432SIMDPP_INL T perm4(_3,_1,_3,_3, T a) { return sel<1,0,1,1>(a, bcast<3>(a)); }
433SIMDPP_INL T perm4(_3,_2,_0,_0, T a) { return rev41(sel<0,1,0,0>(a, swap1<0,1>(a))); }
434SIMDPP_INL T perm4(_3,_2,_0,_1, T a) { return rev41(sel<1,1,0,0>(a, swap1<0,1>(a))); }
435//SIMDPP_INL T perm4(_3,_2,_0,_2, T a) { }
436//SIMDPP_INL T perm4(_3,_2,_0,_3, T a) { }
437SIMDPP_INL T perm4(_3,_2,_1,_0, T a) { return rev41(a); }
438SIMDPP_INL T perm4(_3,_2,_1,_1, T a) { a = rev41(a); return sel<0,0,0,1>(a, swap1<2,3>(a)); }
439SIMDPP_INL T perm4(_3,_2,_1,_2, T a) { a = rev41(a); return sel<0,0,0,1>(a, swap1<1,3>(a)); }
440SIMDPP_INL T perm4(_3,_2,_1,_3, T a) { a = rev41(a); return sel<0,0,0,1>(a, swap1<0,3>(a)); }
441SIMDPP_INL T perm4(_3,_2,_2,_0, T a) { return rev41(sel<0,1,0,0>(a, swap1<2,1>(a))); }
442SIMDPP_INL T perm4(_3,_2,_2,_1, T a) { a = rev41(a); return sel<0,0,1,1>(a, mov_r<1>(a)); }
443SIMDPP_INL T perm4(_3,_2,_2,_2, T a) { return sel<0,1,1,1>(swap1<3,0>(a), bcast<2>(a)); }
444SIMDPP_INL T perm4(_3,_2,_2,_3, T a) { return sel<1,1,0,0>(a, rev41(a)); }
445SIMDPP_INL T perm4(_3,_2,_3,_0, T a) { return rev41(sel<0,1,0,0>(a, swap1<3,1>(a))); }
446//SIMDPP_INL T perm4(_3,_2,_3,_1, T a) { }
447SIMDPP_INL T perm4(_3,_2,_3,_2, T a) { return rev21(dup2_hi(a)); }
448SIMDPP_INL T perm4(_3,_2,_3,_3, T a) { return sel<1,0,1,1>(swap1<2,1>(a), bcast<3>(a)); }
449SIMDPP_INL T perm4(_3,_3,_0,_0, T a) { return rev42(sel<0,1,1,0>(a, rev21(a))); }
450SIMDPP_INL T perm4(_3,_3,_0,_1, T a) { return rev42(sel<0,0,1,0>(a, swap1<3,2>(a))); }
451//SIMDPP_INL T perm4(_3,_3,_0,_2, T a) { }
452SIMDPP_INL T perm4(_3,_3,_0,_3, T a) { return sel<1,1,0,1>(swap1<0,2>(a), bcast<3>(a)); }
453SIMDPP_INL T perm4(_3,_3,_1,_0, T a) { a = rev41(a); return sel<0,1,0,0>(a, swap1<0,1>(a)); }
454SIMDPP_INL T perm4(_3,_3,_1,_1, T a) { return rev41(dup_hi(a)); }
455SIMDPP_INL T perm4(_3,_3,_1,_2, T a) { return sel<1,1,0,0>(mov_r<1>(a), bcast<3>(a)); }
456SIMDPP_INL T perm4(_3,_3,_1,_3, T a) { return sel<1,1,0,1>(swap1<1,2>(a), bcast<3>(a)); }
457SIMDPP_INL T perm4(_3,_3,_2,_0, T a) { a = rev41(a); return sel<0,1,1,0>(a, mov_r<1>(a)); }
458SIMDPP_INL T perm4(_3,_3,_2,_1, T a) { a = rev41(a); return sel<0,1,1,1>(a, mov_r<1>(a)); }
459SIMDPP_INL T perm4(_3,_3,_2,_2, T a) { return rev41(dup_unpack_hi(a)); }
460SIMDPP_INL T perm4(_3,_3,_2,_3, T a) { return sel<1,1,0,1>(a, bcast<3>(a)); }
461SIMDPP_INL T perm4(_3,_3,_3,_0, T a) { return sel<1,1,1,0>(swap1<0,3>(a), bcast<3>(a)); }
462SIMDPP_INL T perm4(_3,_3,_3,_1, T a) { return sel<1,1,1,0>(swap1<1,3>(a), bcast<3>(a)); }
463SIMDPP_INL T perm4(_3,_3,_3,_2, T a) { return sel<1,1,1,0>(swap1<2,3>(a), bcast<3>(a)); }
464SIMDPP_INL T perm4(_3,_3,_3,_3, T a) { return bcast<3>(a); }
465
466template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
467T perm4(std::integral_constant<unsigned, s0>,
468 std::integral_constant<unsigned, s1>,
469 std::integral_constant<unsigned, s2>,
470 std::integral_constant<unsigned, s3>, T a)
471{
472 return fallback<s0,s1,s2,s3>(a);
473}
474
475template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
476T permute4(T a)
477{
478 return perm4(std::integral_constant<unsigned, s0>{},
479 std::integral_constant<unsigned, s1>{},
480 std::integral_constant<unsigned, s2>{},
481 std::integral_constant<unsigned, s3>{}, a);
482}
483
484} // namespace neon_shuffle_int16x8
485} // namespace detail
486} // namespace SIMDPP_ARCH_NAMESPACE
487} // namespace simdpp
488
489#endif
490#endif
491