| 1 | /* Copyright (C) 2011-2012 Povilas Kanapickas <povilas@radix.lt> |
| 2 | |
| 3 | Distributed under the Boost Software License, Version 1.0. |
| 4 | (See accompanying file LICENSE_1_0.txt or copy at |
| 5 | http://www.boost.org/LICENSE_1_0.txt) |
| 6 | */ |
| 7 | |
| 8 | #ifndef LIBSIMDPP_DETAIL_SHUFFLE_NEON_INT64x2_H |
| 9 | #define LIBSIMDPP_DETAIL_SHUFFLE_NEON_INT64x2_H |
| 10 | #if SIMDPP_USE_NEON |
| 11 | |
| 12 | #include <type_traits> |
| 13 | |
| 14 | namespace simdpp { |
| 15 | namespace SIMDPP_ARCH_NAMESPACE { |
| 16 | namespace detail { |
| 17 | namespace neon_shuffle_int64x2 { |
| 18 | |
| 19 | #if SIMDPP_USE_NEON32 |
| 20 | |
| 21 | /* |
| 22 | The code below implements generalized permutations of elements within |
| 23 | int64x2 vectors using half-vector move instructions available on NEON. |
| 24 | */ |
using T = uint64x2; // full vector (both 64-bit elements)
using H = uint64x1_t; // native half vector (a single 64-bit element)
| 27 | |
| 28 | |
| 29 | /// Returns the lower/higher part of a vector. Cost: 0 |
| 30 | SIMDPP_INL H lo(T a) { return vget_low_u64(a.native()); } |
| 31 | SIMDPP_INL H hi(T a) { return vget_high_u64(a.native()); } |
| 32 | |
| 33 | /// Combines two half vectors. Cost: 0 |
| 34 | SIMDPP_INL T co(H lo, H hi) { return vcombine_u64(lo, hi); } |
| 35 | |
| 36 | // 2-element permutation |
| 37 | template<unsigned s0, unsigned s1> SIMDPP_INL |
| 38 | T permute2(T a) |
| 39 | { |
| 40 | const unsigned sel = s0*2 + s1; |
| 41 | switch (sel) { |
| 42 | default: |
| 43 | case 0: /*00*/ return co(lo(a), lo(a)); |
| 44 | case 1: /*01*/ return a; |
| 45 | case 2: /*10*/ return co(hi(a), lo(a)); |
| 46 | case 3: /*11*/ return co(hi(a), hi(a)); |
| 47 | } |
| 48 | } |
| 49 | |
| 50 | // 2-element shuffle: the first element must come from a, the second - from b |
| 51 | template<unsigned s0, unsigned s1> SIMDPP_INL |
| 52 | T shuffle1(T a, T b) |
| 53 | { |
| 54 | const unsigned sel = s0*2 + s1; |
| 55 | switch (sel) { |
| 56 | default: |
| 57 | case 0: /*00*/ return co(lo(a), lo(b)); |
| 58 | case 1: /*01*/ return co(lo(a), hi(b)); |
| 59 | case 2: /*10*/ return co(hi(a), lo(b)); |
| 60 | case 3: /*11*/ return co(hi(a), hi(b)); |
| 61 | } |
| 62 | } |
| 63 | |
| 64 | template<unsigned s0, unsigned s1> SIMDPP_INL |
| 65 | T shuffle2x2(const T& a, const T& b) |
| 66 | { |
| 67 | const unsigned sel = s0*4 + s1; |
| 68 | switch (sel) { |
| 69 | default: |
| 70 | case 0: /*00*/ return co(lo(a), lo(a)); |
| 71 | case 1: /*01*/ return a; |
| 72 | case 2: /*02*/ return co(lo(a), lo(b)); |
| 73 | case 3: /*03*/ return co(lo(a), hi(b)); |
| 74 | case 4: /*10*/ return co(hi(a), lo(a)); |
| 75 | case 5: /*11*/ return co(hi(a), hi(a)); |
| 76 | case 6: /*12*/ return co(hi(a), lo(b)); |
| 77 | case 7: /*13*/ return co(hi(a), hi(b)); |
| 78 | case 8: /*20*/ return co(lo(b), lo(a)); |
| 79 | case 9: /*21*/ return co(lo(b), hi(a)); |
| 80 | case 10: /*22*/ return co(lo(b), lo(b)); |
| 81 | case 11: /*23*/ return b; |
| 82 | case 12: /*30*/ return co(hi(b), lo(a)); |
| 83 | case 13: /*31*/ return co(hi(b), hi(a)); |
| 84 | case 14: /*32*/ return co(hi(b), lo(b)); |
| 85 | case 15: /*33*/ return co(hi(b), hi(b)); |
| 86 | } |
| 87 | } |
| 88 | |
| 89 | #else // SIMDPP_USE_NEON64 |
| 90 | |
using T = uint64x2; // full vector (both 64-bit elements)
| 92 | |
| 93 | // Moves the high half of b onto high half of a |
| 94 | SIMDPP_INL T move_hi(const T& a, const T& b) |
| 95 | { |
| 96 | T mask = make_uint(0xffffffffffffffff, 0x0); |
| 97 | return vbslq_u64(mask.native(), a.native(), b.native()); |
| 98 | } |
| 99 | |
| 100 | // 2-element permutation |
| 101 | template<unsigned s0, unsigned s1> SIMDPP_INL |
| 102 | T permute2(const T& a) |
| 103 | { |
| 104 | const unsigned sel = s0*2 + s1; |
| 105 | switch (sel) { |
| 106 | default: |
| 107 | case 0: /*00*/ return vzip1q_u64(a.native(), a.native()); |
| 108 | case 1: /*01*/ return a; |
| 109 | case 2: /*10*/ return vextq_u64(a.native(), a.native(), 1); |
| 110 | case 3: /*11*/ return vzip2q_u64(a.native(), a.native()); |
| 111 | } |
| 112 | } |
| 113 | |
| 114 | // 2-element shuffle: the first element must come from a, the second - from b |
| 115 | template<unsigned s0, unsigned s1> SIMDPP_INL |
| 116 | T shuffle1(const T& a, const T& b) |
| 117 | { |
| 118 | const unsigned sel = s0*2 + s1; |
| 119 | switch (sel) { |
| 120 | default: |
| 121 | case 0: /*00*/ return vzip1q_u64(a.native(), b.native()); |
| 122 | case 1: /*01*/ return move_hi(a, b); |
| 123 | case 2: /*10*/ return vextq_u64(a.native(), b.native(), 1); |
| 124 | case 3: /*11*/ return vzip2q_u64(a.native(), b.native()); |
| 125 | } |
| 126 | } |
| 127 | |
| 128 | template<unsigned s0, unsigned s1> SIMDPP_INL |
| 129 | T shuffle2x2(const T& a, const T& b) |
| 130 | { |
| 131 | const unsigned sel = s0*4 + s1; |
| 132 | switch (sel) { |
| 133 | default: |
| 134 | case 0: /*00*/ return vzip1q_u64(a.native(), a.native()); |
| 135 | case 1: /*01*/ return a; |
| 136 | case 2: /*02*/ return vzip1q_u64(a.native(), b.native()); |
| 137 | case 3: /*03*/ return move_hi(a, b); |
| 138 | case 4: /*10*/ return vextq_u64(a.native(), a.native(), 1); |
| 139 | case 5: /*11*/ return vzip2q_u64(a.native(), a.native()); |
| 140 | case 6: /*12*/ return vextq_u64(a.native(), b.native(), 1); |
| 141 | case 7: /*13*/ return vzip2q_u64(a.native(), b.native()); |
| 142 | case 8: /*20*/ return vzip1q_u64(b.native(), a.native()); |
| 143 | case 9: /*21*/ return move_hi(b, a); |
| 144 | case 10: /*22*/ return vzip1q_u64(b.native(), b.native()); |
| 145 | case 11: /*23*/ return b; |
| 146 | case 12: /*30*/ return vextq_u64(b.native(), a.native(), 1); |
| 147 | case 13: /*31*/ return vzip2q_u64(b.native(), a.native()); |
| 148 | case 14: /*32*/ return vextq_u64(b.native(), b.native(), 1); |
| 149 | case 15: /*33*/ return vzip2q_u64(b.native(), b.native()); |
| 150 | } |
| 151 | } |
| 152 | |
| 153 | #endif |
| 154 | |
| 155 | } // namespace neon_shuffle_int64x2 |
| 156 | } // namespace detail |
| 157 | } // namespace SIMDPP_ARCH_NAMESPACE |
| 158 | } // namespace simdpp |
| 159 | |
| 160 | #endif |
| 161 | #endif |
| 162 | |
| 163 | |