1/* Copyright (C) 2011-2012 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_DETAIL_SHUFFLE_NEON_INT32x4_H
9#define LIBSIMDPP_DETAIL_SHUFFLE_NEON_INT32x4_H
10#if SIMDPP_USE_NEON
11
12#include <type_traits>
13
14namespace simdpp {
15namespace SIMDPP_ARCH_NAMESPACE {
16namespace detail {
17namespace neon_shuffle_int32x4 {
18
19/*
20 The code below implements generalized permutations of elements within
21 int32x4 vectors using various shuffling instructions available on NEON.
22*/
23using _0 = std::integral_constant<unsigned, 0>;
24using _1 = std::integral_constant<unsigned, 1>;
25using _2 = std::integral_constant<unsigned, 2>;
26using _3 = std::integral_constant<unsigned, 3>;
27using T = uint32x4; // full vector
28using H = uint32x2_t; // half vector
29
30
31/// Returns the lower/higher part of a vector. Cost: 0
32SIMDPP_INL H lo(T a) { return vget_low_u32(a.native()); }
33SIMDPP_INL H hi(T a) { return vget_high_u32(a.native()); }
34
35/// Cost: 1
36template<unsigned N> SIMDPP_INL
37T bcast(T a)
38{
39 H h = (N < 2) ? lo(a) : hi(a);
40 return (uint32x4_t) vdupq_lane_u32(h, N % 2);
41}
42
43/// Combines two half vectors. Cost: 0
44SIMDPP_INL T co(H lo, H hi){ return vcombine_u32(lo, hi); }
45
46/// Reverses the elements in half-vector or half-vectors in a vector. Cost: 1
47SIMDPP_INL H rev(H a) { return vrev64_u32(a); }
48SIMDPP_INL T rev(T a) { return vrev64q_u32(a.native()); }
49
50/// Duplicates the lower/higher element in the half-vector. Cost: 1
51SIMDPP_INL H dup_lo(H a) { return vdup_lane_u32(a, 0); }
52SIMDPP_INL H dup_hi(H a) { return vdup_lane_u32(a, 1); }
53
54/** Shuffles two half-vectors or one vector
55 Cost: If s0,s1 == 0,3 or 2,1 then 2, otherwise 0-1.
56*/
57template<unsigned s0, unsigned s1> SIMDPP_INL
58H shf2x2(H a, H b)
59{
60 const unsigned sel = s0*4 + s1;
61 switch (sel) {
62 default:
63 case 0: /*00*/ return dup_lo(a);
64 case 1: /*01*/ return a;
65 case 2: /*02*/ return vzip_u32(a, b).val[0];
66 case 3: /*03*/ return rev(vext_u32(b, a, 1));
67 case 4: /*10*/ return rev(a);
68 case 5: /*11*/ return dup_hi(a);
69 case 6: /*12*/ return vext_u32(a, b, 1);
70 case 7: /*13*/ return vzip_u32(a, b).val[1];
71 case 8: /*20*/ return vzip_u32(b, a).val[0];
72 case 9: /*21*/ return rev(vext_u32(a, b, 1));
73 case 10: /*22*/ return dup_lo(b);
74 case 11: /*23*/ return b;
75 case 12: /*30*/ return vext_u32(b, a, 1);
76 case 13: /*31*/ return vzip_u32(b, a).val[1];
77 case 14: /*32*/ return rev(b);
78 case 15: /*33*/ return dup_hi(b);
79 }
80}
81
82// The first element comes from a, the second from b.
83template<unsigned s0, unsigned s1> SIMDPP_INL
84H shf_1x2(H a, H b)
85{
86 const unsigned sel = s0*2 + s1;
87 switch (sel) {
88 default:
89 case 0: /*00*/ return vzip_u32(a, b).val[0];;
90 case 1: /*01*/ return rev(vext_u32(b, a, 1));
91 case 2: /*10*/ return vext_u32(a, b, 1);
92 case 3: /*11*/ return vzip_u32(a, b).val[1];
93 }
94}
95
96template<unsigned sel>
97H sel_lo_hi(T a, T b)
98{
99 switch (sel) {
100 default:
101 case 0: return lo(a);
102 case 1: return hi(a);
103 case 2: return lo(b);
104 case 3: return hi(b);
105 }
106}
107
108template<unsigned s0, unsigned s1> SIMDPP_INL
109H shf4x2(T a, T b)
110{
111 return shf_1x2<s0%2, s1%2>(sel_lo_hi<s0/2>(a, b),
112 sel_lo_hi<s1/2>(a, b));
113}
114
115// 4-element permutations
116SIMDPP_INL T perm4(_0,_0,_0,_0, T a) { return bcast<0>(a); }
117SIMDPP_INL T perm4(_1,_1,_1,_1, T a) { return bcast<1>(a); }
118SIMDPP_INL T perm4(_2,_2,_2,_2, T a) { return bcast<2>(a); }
119SIMDPP_INL T perm4(_3,_3,_3,_3, T a) { return bcast<3>(a); }
120SIMDPP_INL T perm4(_1,_0,_3,_2, T a) { return rev(a); }
121
122template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
123T perm4(std::integral_constant<unsigned, s0>,
124 std::integral_constant<unsigned, s1>,
125 std::integral_constant<unsigned, s2>,
126 std::integral_constant<unsigned, s3>, const T& a)
127{
128 return co(shf2x2<s0,s1>(lo(a), hi(a)), shf2x2<s2,s3>(lo(a), hi(a)));
129}
130
131template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
132T permute4(T a)
133{
134 return perm4(std::integral_constant<unsigned, s0>{},
135 std::integral_constant<unsigned, s1>{},
136 std::integral_constant<unsigned, s2>{},
137 std::integral_constant<unsigned, s3>{}, a);
138}
139
140// 2-element shuffle: the first two elements must come from a, the last two -
141// from b
142template<unsigned s0, unsigned s1> SIMDPP_INL
143T shuffle2(T a, T b)
144{
145 return co(shf2x2<s0,s1>(lo(a), hi(a)), shf2x2<s0,s1>(lo(b), hi(b)));
146}
147
148template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
149T shuffle2(T a, T b)
150{
151 return co(shf2x2<s0,s1>(lo(a), hi(a)), shf2x2<s2,s3>(lo(b), hi(b)));
152}
153
154// 4-element shuffle among two 2-element (sub) vectors
155template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
156T shuffle2x2(const T& a, const T& b)
157{
158 return co(shf2x2<s0,s1>(lo(a), lo(b)), shf2x2<s2,s3>(hi(a), hi(b)));
159}
160
161// 8-element shuffle among two 4-element vectors
162template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
163T shuffle4x2(const T& a, const T& b)
164{
165 return co(shf4x2<s0,s1>(a, b), shf4x2<s2,s3>(a, b));
166}
167
168} // namespace neon_shuffle_int32x4
169} // namespace detail
170} // namespace SIMDPP_ARCH_NAMESPACE
171} // namespace simdpp
172
173#endif
174#endif
175
176