| 1 | /* Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt> |
| 2 | |
| 3 | Distributed under the Boost Software License, Version 1.0. |
| 4 | (See accompanying file LICENSE_1_0.txt or copy at |
| 5 | http://www.boost.org/LICENSE_1_0.txt) |
| 6 | */ |
| 7 | |
| 8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_SHUFFLE4x2_H |
| 9 | #define LIBSIMDPP_SIMDPP_DETAIL_INSN_SHUFFLE4x2_H |
| 10 | |
| 11 | #ifndef LIBSIMDPP_SIMD_H |
| 12 | #error "This file must be included through simd.h" |
| 13 | #endif |
| 14 | |
| 15 | #include <simdpp/types.h> |
| 16 | #include <simdpp/core/make_shuffle_bytes_mask.h> |
| 17 | #include <simdpp/core/shuffle_bytes16.h> |
| 18 | #include <simdpp/detail/insn/shuffle2x2.h> |
| 19 | #include <simdpp/detail/shuffle/sse_float32_4x2.h> |
| 20 | #include <simdpp/detail/shuffle/sse_float64_4x2.h> |
| 21 | #include <simdpp/detail/shuffle/sse_int32_4x2.h> |
| 22 | #include <simdpp/detail/shuffle/sse_int64_4x2.h> |
| 23 | #include <simdpp/detail/shuffle/neon_int32x4.h> |
| 24 | #include <simdpp/detail/not_implemented.h> |
| 25 | #include <simdpp/detail/vector_array_macros.h> |
| 26 | |
| 27 | namespace simdpp { |
| 28 | namespace SIMDPP_ARCH_NAMESPACE { |
| 29 | namespace detail { |
| 30 | namespace insn { |
| 31 | |
| 32 | // ----------------------------------------------------------------------------- |
| 33 | // emulates 64x4 shuffle on architectures with 128-bit vectors |
| 34 | |
| 35 | template<unsigned s0, unsigned s1, class V> |
| 36 | V i_shuffle_emul_64x4_half(const V& a0, const V& a1, const V& b0, const V& b1) |
| 37 | { |
| 38 | const V& h0 = s0 < 2 ? a0 : |
| 39 | s0 < 4 ? a1 : |
| 40 | s0 < 6 ? b0 : b1; |
| 41 | const V& h1 = s1 < 2 ? a0 : |
| 42 | s1 < 4 ? a1 : |
| 43 | s1 < 6 ? b0 : b1; |
| 44 | return i_shuffle2x2<s0%2, s1%2+2>(h0, h1); |
| 45 | } |
| 46 | |
| 47 | // ----------------------------------------------------------------------------- |
| 48 | // float32 |
| 49 | |
| 50 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
| 51 | float32<4> i_shuffle4x2(const float32<4>& a, const float32<4>& b) |
| 52 | { |
| 53 | static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range" ); |
| 54 | #if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP |
| 55 | float32<4> r; |
| 56 | r.el(0) = s0 < 4 ? a.el(s0) : b.el(s0-4); |
| 57 | r.el(1) = s1 < 4 ? a.el(s1) : b.el(s1-4); |
| 58 | r.el(2) = s2 < 4 ? a.el(s2) : b.el(s2-4); |
| 59 | r.el(3) = s3 < 4 ? a.el(s3) : b.el(s3-4); |
| 60 | return r; |
| 61 | #elif SIMDPP_USE_SSE2 |
| 62 | return sse_shuffle4x2_float32::do_shuffle<s0, s1, s2, s3>(a, b); |
| 63 | #elif SIMDPP_USE_NEON_FLT_SP |
| 64 | return (float32<4>)detail::neon_shuffle_int32x4::shuffle4x2<s0, s1, s2, s3>(uint32<4>(a), uint32<4>(b)); |
| 65 | #elif SIMDPP_USE_ALTIVEC |
| 66 | uint32<4> mask = make_shuffle_bytes16_mask<s0, s1, s2, s3>(mask); |
| 67 | return shuffle_bytes16(a, b, mask); |
| 68 | #elif SIMDPP_USE_MSA |
| 69 | uint32<4> mask = make_uint(s0,s1,s2,s3); |
| 70 | return (v4f32) __msa_vshf_w((v4i32) mask.native(), |
| 71 | (v4i32) b.native(), |
| 72 | (v4i32) a.native()); |
| 73 | #else |
| 74 | return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(int64<s0+4>, a, b); |
| 75 | #endif |
| 76 | } |
| 77 | |
#if SIMDPP_USE_AVX
// float32<8> overload, compiled only when SIMDPP_USE_AVX is enabled.
// Rejects selectors >= 8 at compile time and forwards both operands to
// sse_shuffle4x2_float32::do_shuffle.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
float32<8> i_shuffle4x2(const float32<8>& a, const float32<8>& b)
{
    static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range" );
    return sse_shuffle4x2_float32::do_shuffle<s0, s1, s2, s3>(a, b);
}
#endif
| 86 | |
#if SIMDPP_USE_AVX512F
// float32<16> overload, compiled only when SIMDPP_USE_AVX512F is enabled.
// Rejects selectors >= 8 at compile time and forwards both operands to
// sse_shuffle4x2_float32::do_shuffle.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
float32<16> i_shuffle4x2(const float32<16>& a, const float32<16>& b)
{
    static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range" );
    return sse_shuffle4x2_float32::do_shuffle<s0, s1, s2, s3>(a, b);
}
#endif
| 95 | |
// Generic float32<N> overload. All work is delegated to the
// SIMDPP_VEC_ARRAY_IMPL2 macro, which receives the result type, the
// i_shuffle4x2<s0,s1,s2,s3> operation and both operands.
// NOTE(review): the macro presumably applies the operation to each native
// sub-vector pair of a and b and returns the assembled result - confirm
// against simdpp/detail/vector_array_macros.h.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N> SIMDPP_INL
float32<N> i_shuffle4x2(const float32<N>& a, const float32<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(float32<N>, (i_shuffle4x2<s0,s1,s2,s3>), a, b);
}
| 101 | |
| 102 | // ----------------------------------------------------------------------------- |
| 103 | // float64 |
| 104 | |
#if SIMDPP_USE_AVX
// float64<4> overload, compiled only when SIMDPP_USE_AVX is enabled.
// Rejects selectors >= 8 at compile time and forwards both operands to
// sse_shuffle4x2_float64::do_shuffle.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
float64<4> i_shuffle4x2(const float64<4>& a, const float64<4>& b)
{
    static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range" );
    return sse_shuffle4x2_float64::do_shuffle<s0, s1, s2, s3>(a, b);
}
#endif
| 113 | |
#if SIMDPP_USE_AVX512F
// float64<8> overload, compiled only when SIMDPP_USE_AVX512F is enabled.
// Rejects selectors >= 8 at compile time and forwards both operands to
// sse_shuffle4x2_float64::do_shuffle.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
float64<8> i_shuffle4x2(const float64<8>& a, const float64<8>& b)
{
    static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range" );
    return sse_shuffle4x2_float64::do_shuffle<s0, s1, s2, s3>(a, b);
}
#endif
| 122 | |
| 123 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N> SIMDPP_INL |
| 124 | float64<N> i_shuffle4x2(const float64<N>& a, const float64<N>& b) |
| 125 | { |
| 126 | #if SIMDPP_USE_AVX |
| 127 | SIMDPP_VEC_ARRAY_IMPL2(float64<N>, (i_shuffle4x2<s0,s1,s2,s3>), a, b); |
| 128 | #else |
| 129 | float64<N> r; |
| 130 | for (unsigned i = 0; i < float64<N>::vec_length; i+=2) { |
| 131 | r.vec(i*2) = i_shuffle_emul_64x4_half<s0,s1>(a.vec(i*2), a.vec(i*2+1), |
| 132 | b.vec(i*2), b.vec(i*2+1)); |
| 133 | r.vec(i*2+1) = i_shuffle_emul_64x4_half<s2,s3>(a.vec(i*2), a.vec(i*2+1), |
| 134 | b.vec(i*2), b.vec(i*2+1)); |
| 135 | } |
| 136 | return r; |
| 137 | #endif |
| 138 | } |
| 139 | |
// -----------------------------------------------------------------------------
// int32
| 141 | |
| 142 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
| 143 | uint32<4> i_shuffle4x2(const uint32<4>& a, const uint32<4>& b) |
| 144 | { |
| 145 | static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range" ); |
| 146 | #if SIMDPP_USE_NULL |
| 147 | uint32<4> r; |
| 148 | r.el(0) = s0 < 4 ? a.el(s0) : b.el(s0-4); |
| 149 | r.el(1) = s1 < 4 ? a.el(s1) : b.el(s1-4); |
| 150 | r.el(2) = s2 < 4 ? a.el(s2) : b.el(s2-4); |
| 151 | r.el(3) = s3 < 4 ? a.el(s3) : b.el(s3-4); |
| 152 | return r; |
| 153 | #elif SIMDPP_USE_SSE2 |
| 154 | return sse_shuffle4x2_int32::do_shuffle<s0, s1, s2, s3>(a, b); |
| 155 | #elif SIMDPP_USE_NEON |
| 156 | return detail::neon_shuffle_int32x4::shuffle4x2<s0, s1, s2, s3>(a, b); |
| 157 | #elif SIMDPP_USE_ALTIVEC |
| 158 | uint32<4> mask = make_shuffle_bytes16_mask<s0, s1, s2, s3>(mask); |
| 159 | return shuffle_bytes16(a, b, mask); |
| 160 | #elif SIMDPP_USE_MSA |
| 161 | uint32<4> mask = make_uint(s0,s1,s2,s3); |
| 162 | return (v4u32) __msa_vshf_w((v4i32) mask.native(), |
| 163 | (v4i32) b.native(), |
| 164 | (v4i32) a.native()); |
| 165 | #else |
| 166 | return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(int64<s0+4>, a, b); |
| 167 | #endif |
| 168 | } |
| 169 | |
#if SIMDPP_USE_AVX2
// uint32<8> overload, compiled only when SIMDPP_USE_AVX2 is enabled.
// Rejects selectors >= 8 at compile time and forwards both operands to
// sse_shuffle4x2_int32::do_shuffle.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
uint32<8> i_shuffle4x2(const uint32<8>& a, const uint32<8>& b)
{
    static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range" );
    return sse_shuffle4x2_int32::do_shuffle<s0, s1, s2, s3>(a, b);
}
#endif
| 178 | |
#if SIMDPP_USE_AVX512F
// uint32<16> overload, compiled only when SIMDPP_USE_AVX512F is enabled.
// Rejects selectors >= 8 at compile time and forwards both operands to
// sse_shuffle4x2_int32::do_shuffle.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
uint32<16> i_shuffle4x2(const uint32<16>& a, const uint32<16>& b)
{
    static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range" );
    return sse_shuffle4x2_int32::do_shuffle<s0, s1, s2, s3>(a, b);
}
#endif
| 187 | |
// Generic uint32<N> overload. All work is delegated to the
// SIMDPP_VEC_ARRAY_IMPL2 macro, which receives the result type, the
// i_shuffle4x2<s0,s1,s2,s3> operation and both operands.
// NOTE(review): the macro presumably applies the operation to each native
// sub-vector pair of a and b and returns the assembled result - confirm
// against simdpp/detail/vector_array_macros.h.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N> SIMDPP_INL
uint32<N> i_shuffle4x2(const uint32<N>& a, const uint32<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(uint32<N>, (i_shuffle4x2<s0,s1,s2,s3>), a, b);
}
| 193 | |
| 194 | // ----------------------------------------------------------------------------- |
| 195 | // int64 |
| 196 | |
#if SIMDPP_USE_AVX2
// uint64<4> overload, compiled only when SIMDPP_USE_AVX2 is enabled.
// Rejects selectors >= 8 at compile time and forwards both operands to
// sse_shuffle4x2_int64::do_shuffle.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
uint64<4> i_shuffle4x2(const uint64<4>& a, const uint64<4>& b)
{
    static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range" );
    return sse_shuffle4x2_int64::do_shuffle<s0, s1, s2, s3>(a, b);
}
#endif
| 205 | |
#if SIMDPP_USE_AVX512F
// uint64<8> overload, compiled only when SIMDPP_USE_AVX512F is enabled.
// Rejects selectors >= 8 at compile time and forwards both operands to
// sse_shuffle4x2_int64::do_shuffle.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
uint64<8> i_shuffle4x2(const uint64<8>& a, const uint64<8>& b)
{
    static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range" );
    return sse_shuffle4x2_int64::do_shuffle<s0, s1, s2, s3>(a, b);
}
#endif
| 214 | |
| 215 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N> SIMDPP_INL |
| 216 | uint64<N> i_shuffle4x2(const uint64<N>& a, const uint64<N>& b) |
| 217 | { |
| 218 | #if SIMDPP_USE_AVX2 |
| 219 | SIMDPP_VEC_ARRAY_IMPL2(uint64<N>, (i_shuffle4x2<s0,s1,s2,s3>), a, b); |
| 220 | #else |
| 221 | uint64<N> r; |
| 222 | for (unsigned i = 0; i < uint64<N>::vec_length; i+=2) { |
| 223 | r.vec(i*2) = i_shuffle_emul_64x4_half<s0,s1>(a.vec(i*2), a.vec(i*2+1), |
| 224 | b.vec(i*2), b.vec(i*2+1)); |
| 225 | r.vec(i*2+1) = i_shuffle_emul_64x4_half<s2,s3>(a.vec(i*2), a.vec(i*2+1), |
| 226 | b.vec(i*2), b.vec(i*2+1)); |
| 227 | } |
| 228 | return r; |
| 229 | #endif |
| 230 | } |
| 231 | |
| 232 | |
| 233 | } // namespace insn |
| 234 | } // namespace detail |
| 235 | } // namespace SIMDPP_ARCH_NAMESPACE |
| 236 | } // namespace simdpp |
| 237 | |
| 238 | #endif |
| 239 | |
| 240 | |