1 | /* Copyright (C) 2011-2012 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_DETAIL_SHUFFLE_NEON_INT64x2_H |
9 | #define LIBSIMDPP_DETAIL_SHUFFLE_NEON_INT64x2_H |
10 | #if SIMDPP_USE_NEON |
11 | |
12 | #include <type_traits> |
13 | |
14 | namespace simdpp { |
15 | namespace SIMDPP_ARCH_NAMESPACE { |
16 | namespace detail { |
17 | namespace neon_shuffle_int64x2 { |
18 | |
19 | #if SIMDPP_USE_NEON32 |
20 | |
21 | /* |
22 | The code below implements generalized permutations of elements within |
23 | int64x2 vectors using half-vector move instructions available on NEON. |
24 | */ |
using T = uint64x2; // full vector (two 64-bit elements)
using H = uint64x1_t; // half vector (a single 64-bit element), as produced by lo()/hi()
27 | |
28 | |
29 | /// Returns the lower/higher part of a vector. Cost: 0 |
30 | SIMDPP_INL H lo(T a) { return vget_low_u64(a.native()); } |
31 | SIMDPP_INL H hi(T a) { return vget_high_u64(a.native()); } |
32 | |
33 | /// Combines two half vectors. Cost: 0 |
34 | SIMDPP_INL T co(H lo, H hi) { return vcombine_u64(lo, hi); } |
35 | |
36 | // 2-element permutation |
37 | template<unsigned s0, unsigned s1> SIMDPP_INL |
38 | T permute2(T a) |
39 | { |
40 | const unsigned sel = s0*2 + s1; |
41 | switch (sel) { |
42 | default: |
43 | case 0: /*00*/ return co(lo(a), lo(a)); |
44 | case 1: /*01*/ return a; |
45 | case 2: /*10*/ return co(hi(a), lo(a)); |
46 | case 3: /*11*/ return co(hi(a), hi(a)); |
47 | } |
48 | } |
49 | |
50 | // 2-element shuffle: the first element must come from a, the second - from b |
51 | template<unsigned s0, unsigned s1> SIMDPP_INL |
52 | T shuffle1(T a, T b) |
53 | { |
54 | const unsigned sel = s0*2 + s1; |
55 | switch (sel) { |
56 | default: |
57 | case 0: /*00*/ return co(lo(a), lo(b)); |
58 | case 1: /*01*/ return co(lo(a), hi(b)); |
59 | case 2: /*10*/ return co(hi(a), lo(b)); |
60 | case 3: /*11*/ return co(hi(a), hi(b)); |
61 | } |
62 | } |
63 | |
64 | template<unsigned s0, unsigned s1> SIMDPP_INL |
65 | T shuffle2x2(const T& a, const T& b) |
66 | { |
67 | const unsigned sel = s0*4 + s1; |
68 | switch (sel) { |
69 | default: |
70 | case 0: /*00*/ return co(lo(a), lo(a)); |
71 | case 1: /*01*/ return a; |
72 | case 2: /*02*/ return co(lo(a), lo(b)); |
73 | case 3: /*03*/ return co(lo(a), hi(b)); |
74 | case 4: /*10*/ return co(hi(a), lo(a)); |
75 | case 5: /*11*/ return co(hi(a), hi(a)); |
76 | case 6: /*12*/ return co(hi(a), lo(b)); |
77 | case 7: /*13*/ return co(hi(a), hi(b)); |
78 | case 8: /*20*/ return co(lo(b), lo(a)); |
79 | case 9: /*21*/ return co(lo(b), hi(a)); |
80 | case 10: /*22*/ return co(lo(b), lo(b)); |
81 | case 11: /*23*/ return b; |
82 | case 12: /*30*/ return co(hi(b), lo(a)); |
83 | case 13: /*31*/ return co(hi(b), hi(a)); |
84 | case 14: /*32*/ return co(hi(b), lo(b)); |
85 | case 15: /*33*/ return co(hi(b), hi(b)); |
86 | } |
87 | } |
88 | |
89 | #else // SIMDPP_USE_NEON64 |
90 | |
using T = uint64x2; // full vector (two 64-bit elements)
92 | |
93 | // Moves the high half of b onto high half of a |
94 | SIMDPP_INL T move_hi(const T& a, const T& b) |
95 | { |
96 | T mask = make_uint(0xffffffffffffffff, 0x0); |
97 | return vbslq_u64(mask.native(), a.native(), b.native()); |
98 | } |
99 | |
100 | // 2-element permutation |
101 | template<unsigned s0, unsigned s1> SIMDPP_INL |
102 | T permute2(const T& a) |
103 | { |
104 | const unsigned sel = s0*2 + s1; |
105 | switch (sel) { |
106 | default: |
107 | case 0: /*00*/ return vzip1q_u64(a.native(), a.native()); |
108 | case 1: /*01*/ return a; |
109 | case 2: /*10*/ return vextq_u64(a.native(), a.native(), 1); |
110 | case 3: /*11*/ return vzip2q_u64(a.native(), a.native()); |
111 | } |
112 | } |
113 | |
114 | // 2-element shuffle: the first element must come from a, the second - from b |
115 | template<unsigned s0, unsigned s1> SIMDPP_INL |
116 | T shuffle1(const T& a, const T& b) |
117 | { |
118 | const unsigned sel = s0*2 + s1; |
119 | switch (sel) { |
120 | default: |
121 | case 0: /*00*/ return vzip1q_u64(a.native(), b.native()); |
122 | case 1: /*01*/ return move_hi(a, b); |
123 | case 2: /*10*/ return vextq_u64(a.native(), b.native(), 1); |
124 | case 3: /*11*/ return vzip2q_u64(a.native(), b.native()); |
125 | } |
126 | } |
127 | |
128 | template<unsigned s0, unsigned s1> SIMDPP_INL |
129 | T shuffle2x2(const T& a, const T& b) |
130 | { |
131 | const unsigned sel = s0*4 + s1; |
132 | switch (sel) { |
133 | default: |
134 | case 0: /*00*/ return vzip1q_u64(a.native(), a.native()); |
135 | case 1: /*01*/ return a; |
136 | case 2: /*02*/ return vzip1q_u64(a.native(), b.native()); |
137 | case 3: /*03*/ return move_hi(a, b); |
138 | case 4: /*10*/ return vextq_u64(a.native(), a.native(), 1); |
139 | case 5: /*11*/ return vzip2q_u64(a.native(), a.native()); |
140 | case 6: /*12*/ return vextq_u64(a.native(), b.native(), 1); |
141 | case 7: /*13*/ return vzip2q_u64(a.native(), b.native()); |
142 | case 8: /*20*/ return vzip1q_u64(b.native(), a.native()); |
143 | case 9: /*21*/ return move_hi(b, a); |
144 | case 10: /*22*/ return vzip1q_u64(b.native(), b.native()); |
145 | case 11: /*23*/ return b; |
146 | case 12: /*30*/ return vextq_u64(b.native(), a.native(), 1); |
147 | case 13: /*31*/ return vzip2q_u64(b.native(), a.native()); |
148 | case 14: /*32*/ return vextq_u64(b.native(), b.native(), 1); |
149 | case 15: /*33*/ return vzip2q_u64(b.native(), b.native()); |
150 | } |
151 | } |
152 | |
153 | #endif |
154 | |
155 | } // namespace neon_shuffle_int64x2 |
156 | } // namespace detail |
157 | } // namespace SIMDPP_ARCH_NAMESPACE |
158 | } // namespace simdpp |
159 | |
160 | #endif |
161 | #endif |
162 | |
163 | |