1/* Copyright (C) 2011-2012 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_DETAIL_SHUFFLE_NEON_INT64x2_H
9#define LIBSIMDPP_DETAIL_SHUFFLE_NEON_INT64x2_H
10#if SIMDPP_USE_NEON
11
12#include <type_traits>
13
14namespace simdpp {
15namespace SIMDPP_ARCH_NAMESPACE {
16namespace detail {
17namespace neon_shuffle_int64x2 {
18
19#if SIMDPP_USE_NEON32
20
21/*
22 The code below implements generalized permutations of elements within
23 int64x2 vectors using half-vector move instructions available on NEON.
24*/
25using T = uint64x2; // full vector
26using H = uint64x1_t; // half vector
27
28
29/// Returns the lower/higher part of a vector. Cost: 0
30SIMDPP_INL H lo(T a) { return vget_low_u64(a.native()); }
31SIMDPP_INL H hi(T a) { return vget_high_u64(a.native()); }
32
33/// Combines two half vectors. Cost: 0
34SIMDPP_INL T co(H lo, H hi) { return vcombine_u64(lo, hi); }
35
36// 2-element permutation
37template<unsigned s0, unsigned s1> SIMDPP_INL
38T permute2(T a)
39{
40 const unsigned sel = s0*2 + s1;
41 switch (sel) {
42 default:
43 case 0: /*00*/ return co(lo(a), lo(a));
44 case 1: /*01*/ return a;
45 case 2: /*10*/ return co(hi(a), lo(a));
46 case 3: /*11*/ return co(hi(a), hi(a));
47 }
48}
49
50// 2-element shuffle: the first element must come from a, the second - from b
51template<unsigned s0, unsigned s1> SIMDPP_INL
52T shuffle1(T a, T b)
53{
54 const unsigned sel = s0*2 + s1;
55 switch (sel) {
56 default:
57 case 0: /*00*/ return co(lo(a), lo(b));
58 case 1: /*01*/ return co(lo(a), hi(b));
59 case 2: /*10*/ return co(hi(a), lo(b));
60 case 3: /*11*/ return co(hi(a), hi(b));
61 }
62}
63
64template<unsigned s0, unsigned s1> SIMDPP_INL
65T shuffle2x2(const T& a, const T& b)
66{
67 const unsigned sel = s0*4 + s1;
68 switch (sel) {
69 default:
70 case 0: /*00*/ return co(lo(a), lo(a));
71 case 1: /*01*/ return a;
72 case 2: /*02*/ return co(lo(a), lo(b));
73 case 3: /*03*/ return co(lo(a), hi(b));
74 case 4: /*10*/ return co(hi(a), lo(a));
75 case 5: /*11*/ return co(hi(a), hi(a));
76 case 6: /*12*/ return co(hi(a), lo(b));
77 case 7: /*13*/ return co(hi(a), hi(b));
78 case 8: /*20*/ return co(lo(b), lo(a));
79 case 9: /*21*/ return co(lo(b), hi(a));
80 case 10: /*22*/ return co(lo(b), lo(b));
81 case 11: /*23*/ return b;
82 case 12: /*30*/ return co(hi(b), lo(a));
83 case 13: /*31*/ return co(hi(b), hi(a));
84 case 14: /*32*/ return co(hi(b), lo(b));
85 case 15: /*33*/ return co(hi(b), hi(b));
86 }
87}
88
89#else // SIMDPP_USE_NEON64
90
91using T = uint64x2; // full vector
92
93// Moves the high half of b onto high half of a
94SIMDPP_INL T move_hi(const T& a, const T& b)
95{
96 T mask = make_uint(0xffffffffffffffff, 0x0);
97 return vbslq_u64(mask.native(), a.native(), b.native());
98}
99
100// 2-element permutation
101template<unsigned s0, unsigned s1> SIMDPP_INL
102T permute2(const T& a)
103{
104 const unsigned sel = s0*2 + s1;
105 switch (sel) {
106 default:
107 case 0: /*00*/ return vzip1q_u64(a.native(), a.native());
108 case 1: /*01*/ return a;
109 case 2: /*10*/ return vextq_u64(a.native(), a.native(), 1);
110 case 3: /*11*/ return vzip2q_u64(a.native(), a.native());
111 }
112}
113
114// 2-element shuffle: the first element must come from a, the second - from b
115template<unsigned s0, unsigned s1> SIMDPP_INL
116T shuffle1(const T& a, const T& b)
117{
118 const unsigned sel = s0*2 + s1;
119 switch (sel) {
120 default:
121 case 0: /*00*/ return vzip1q_u64(a.native(), b.native());
122 case 1: /*01*/ return move_hi(a, b);
123 case 2: /*10*/ return vextq_u64(a.native(), b.native(), 1);
124 case 3: /*11*/ return vzip2q_u64(a.native(), b.native());
125 }
126}
127
128template<unsigned s0, unsigned s1> SIMDPP_INL
129T shuffle2x2(const T& a, const T& b)
130{
131 const unsigned sel = s0*4 + s1;
132 switch (sel) {
133 default:
134 case 0: /*00*/ return vzip1q_u64(a.native(), a.native());
135 case 1: /*01*/ return a;
136 case 2: /*02*/ return vzip1q_u64(a.native(), b.native());
137 case 3: /*03*/ return move_hi(a, b);
138 case 4: /*10*/ return vextq_u64(a.native(), a.native(), 1);
139 case 5: /*11*/ return vzip2q_u64(a.native(), a.native());
140 case 6: /*12*/ return vextq_u64(a.native(), b.native(), 1);
141 case 7: /*13*/ return vzip2q_u64(a.native(), b.native());
142 case 8: /*20*/ return vzip1q_u64(b.native(), a.native());
143 case 9: /*21*/ return move_hi(b, a);
144 case 10: /*22*/ return vzip1q_u64(b.native(), b.native());
145 case 11: /*23*/ return b;
146 case 12: /*30*/ return vextq_u64(b.native(), a.native(), 1);
147 case 13: /*31*/ return vzip2q_u64(b.native(), a.native());
148 case 14: /*32*/ return vextq_u64(b.native(), b.native(), 1);
149 case 15: /*33*/ return vzip2q_u64(b.native(), b.native());
150 }
151}
152
153#endif
154
155} // namespace neon_shuffle_int64x2
156} // namespace detail
157} // namespace SIMDPP_ARCH_NAMESPACE
158} // namespace simdpp
159
160#endif
161#endif
162
163