| 1 | /* Copyright (C) 2011-2012 Povilas Kanapickas <povilas@radix.lt> |
| 2 | |
| 3 | Distributed under the Boost Software License, Version 1.0. |
| 4 | (See accompanying file LICENSE_1_0.txt or copy at |
| 5 | http://www.boost.org/LICENSE_1_0.txt) |
| 6 | */ |
| 7 | |
| 8 | #ifndef LIBSIMDPP_DETAIL_SHUFFLE_NEON_INT32x4_H |
| 9 | #define LIBSIMDPP_DETAIL_SHUFFLE_NEON_INT32x4_H |
| 10 | #if SIMDPP_USE_NEON |
| 11 | |
| 12 | #include <type_traits> |
| 13 | |
| 14 | namespace simdpp { |
| 15 | namespace SIMDPP_ARCH_NAMESPACE { |
| 16 | namespace detail { |
| 17 | namespace neon_shuffle_int32x4 { |
| 18 | |
| 19 | /* |
| 20 | The code below implements generalized permutations of elements within |
| 21 | int32x4 vectors using various shuffling instructions available on NEON. |
| 22 | */ |
// Tag types that carry an element index as a compile-time constant. The
// perm4() overloads below take them as parameters, so a particular selector
// combination can pick a dedicated overload.
using _0 = std::integral_constant<unsigned, 0>;
using _1 = std::integral_constant<unsigned, 1>;
using _2 = std::integral_constant<unsigned, 2>;
using _3 = std::integral_constant<unsigned, 3>;
using T = uint32x4; // full vector (wrapper type; the NEON value is reached via native())
using H = uint32x2_t; // half vector (NEON type holding two 32-bit elements)
| 29 | |
| 30 | |
| 31 | /// Returns the lower/higher part of a vector. Cost: 0 |
| 32 | SIMDPP_INL H lo(T a) { return vget_low_u32(a.native()); } |
| 33 | SIMDPP_INL H hi(T a) { return vget_high_u32(a.native()); } |
| 34 | |
| 35 | /// Cost: 1 |
| 36 | template<unsigned N> SIMDPP_INL |
| 37 | T bcast(T a) |
| 38 | { |
| 39 | H h = (N < 2) ? lo(a) : hi(a); |
| 40 | return (uint32x4_t) vdupq_lane_u32(h, N % 2); |
| 41 | } |
| 42 | |
| 43 | /// Combines two half vectors. Cost: 0 |
| 44 | SIMDPP_INL T co(H lo, H hi){ return vcombine_u32(lo, hi); } |
| 45 | |
| 46 | /// Reverses the elements in half-vector or half-vectors in a vector. Cost: 1 |
| 47 | SIMDPP_INL H rev(H a) { return vrev64_u32(a); } |
| 48 | SIMDPP_INL T rev(T a) { return vrev64q_u32(a.native()); } |
| 49 | |
| 50 | /// Duplicates the lower/higher element in the half-vector. Cost: 1 |
| 51 | SIMDPP_INL H dup_lo(H a) { return vdup_lane_u32(a, 0); } |
| 52 | SIMDPP_INL H dup_hi(H a) { return vdup_lane_u32(a, 1); } |
| 53 | |
| 54 | /** Shuffles two half-vectors or one vector |
| 55 | Cost: If s0,s1 == 0,3 or 2,1 then 2, otherwise 0-1. |
| 56 | */ |
| 57 | template<unsigned s0, unsigned s1> SIMDPP_INL |
| 58 | H shf2x2(H a, H b) |
| 59 | { |
| 60 | const unsigned sel = s0*4 + s1; |
| 61 | switch (sel) { |
| 62 | default: |
| 63 | case 0: /*00*/ return dup_lo(a); |
| 64 | case 1: /*01*/ return a; |
| 65 | case 2: /*02*/ return vzip_u32(a, b).val[0]; |
| 66 | case 3: /*03*/ return rev(vext_u32(b, a, 1)); |
| 67 | case 4: /*10*/ return rev(a); |
| 68 | case 5: /*11*/ return dup_hi(a); |
| 69 | case 6: /*12*/ return vext_u32(a, b, 1); |
| 70 | case 7: /*13*/ return vzip_u32(a, b).val[1]; |
| 71 | case 8: /*20*/ return vzip_u32(b, a).val[0]; |
| 72 | case 9: /*21*/ return rev(vext_u32(a, b, 1)); |
| 73 | case 10: /*22*/ return dup_lo(b); |
| 74 | case 11: /*23*/ return b; |
| 75 | case 12: /*30*/ return vext_u32(b, a, 1); |
| 76 | case 13: /*31*/ return vzip_u32(b, a).val[1]; |
| 77 | case 14: /*32*/ return rev(b); |
| 78 | case 15: /*33*/ return dup_hi(b); |
| 79 | } |
| 80 | } |
| 81 | |
| 82 | // The first element comes from a, the second from b. |
| 83 | template<unsigned s0, unsigned s1> SIMDPP_INL |
| 84 | H shf_1x2(H a, H b) |
| 85 | { |
| 86 | const unsigned sel = s0*2 + s1; |
| 87 | switch (sel) { |
| 88 | default: |
| 89 | case 0: /*00*/ return vzip_u32(a, b).val[0];; |
| 90 | case 1: /*01*/ return rev(vext_u32(b, a, 1)); |
| 91 | case 2: /*10*/ return vext_u32(a, b, 1); |
| 92 | case 3: /*11*/ return vzip_u32(a, b).val[1]; |
| 93 | } |
| 94 | } |
| 95 | |
| 96 | template<unsigned sel> |
| 97 | H sel_lo_hi(T a, T b) |
| 98 | { |
| 99 | switch (sel) { |
| 100 | default: |
| 101 | case 0: return lo(a); |
| 102 | case 1: return hi(a); |
| 103 | case 2: return lo(b); |
| 104 | case 3: return hi(b); |
| 105 | } |
| 106 | } |
| 107 | |
| 108 | template<unsigned s0, unsigned s1> SIMDPP_INL |
| 109 | H shf4x2(T a, T b) |
| 110 | { |
| 111 | return shf_1x2<s0%2, s1%2>(sel_lo_hi<s0/2>(a, b), |
| 112 | sel_lo_hi<s1/2>(a, b)); |
| 113 | } |
| 114 | |
| 115 | // 4-element permutations |
| 116 | SIMDPP_INL T perm4(_0,_0,_0,_0, T a) { return bcast<0>(a); } |
| 117 | SIMDPP_INL T perm4(_1,_1,_1,_1, T a) { return bcast<1>(a); } |
| 118 | SIMDPP_INL T perm4(_2,_2,_2,_2, T a) { return bcast<2>(a); } |
| 119 | SIMDPP_INL T perm4(_3,_3,_3,_3, T a) { return bcast<3>(a); } |
| 120 | SIMDPP_INL T perm4(_1,_0,_3,_2, T a) { return rev(a); } |
| 121 | |
| 122 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
| 123 | T perm4(std::integral_constant<unsigned, s0>, |
| 124 | std::integral_constant<unsigned, s1>, |
| 125 | std::integral_constant<unsigned, s2>, |
| 126 | std::integral_constant<unsigned, s3>, const T& a) |
| 127 | { |
| 128 | return co(shf2x2<s0,s1>(lo(a), hi(a)), shf2x2<s2,s3>(lo(a), hi(a))); |
| 129 | } |
| 130 | |
| 131 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
| 132 | T permute4(T a) |
| 133 | { |
| 134 | return perm4(std::integral_constant<unsigned, s0>{}, |
| 135 | std::integral_constant<unsigned, s1>{}, |
| 136 | std::integral_constant<unsigned, s2>{}, |
| 137 | std::integral_constant<unsigned, s3>{}, a); |
| 138 | } |
| 139 | |
| 140 | // 2-element shuffle: the first two elements must come from a, the last two - |
| 141 | // from b |
| 142 | template<unsigned s0, unsigned s1> SIMDPP_INL |
| 143 | T shuffle2(T a, T b) |
| 144 | { |
| 145 | return co(shf2x2<s0,s1>(lo(a), hi(a)), shf2x2<s0,s1>(lo(b), hi(b))); |
| 146 | } |
| 147 | |
| 148 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
| 149 | T shuffle2(T a, T b) |
| 150 | { |
| 151 | return co(shf2x2<s0,s1>(lo(a), hi(a)), shf2x2<s2,s3>(lo(b), hi(b))); |
| 152 | } |
| 153 | |
| 154 | // 4-element shuffle among two 2-element (sub) vectors |
| 155 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
| 156 | T shuffle2x2(const T& a, const T& b) |
| 157 | { |
| 158 | return co(shf2x2<s0,s1>(lo(a), lo(b)), shf2x2<s2,s3>(hi(a), hi(b))); |
| 159 | } |
| 160 | |
| 161 | // 8-element shuffle among two 4-element vectors |
| 162 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
| 163 | T shuffle4x2(const T& a, const T& b) |
| 164 | { |
| 165 | return co(shf4x2<s0,s1>(a, b), shf4x2<s2,s3>(a, b)); |
| 166 | } |
| 167 | |
| 168 | } // namespace neon_shuffle_int32x4 |
| 169 | } // namespace detail |
| 170 | } // namespace SIMDPP_ARCH_NAMESPACE |
| 171 | } // namespace simdpp |
| 172 | |
| 173 | #endif |
| 174 | #endif |
| 175 | |
| 176 | |