/*  Copyright (C) 2013-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_SHUFFLE_BYTES16_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_SHUFFLE_BYTES16_H

#ifndef LIBSIMDPP_SIMD_H
#error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/bit_or.h>
#include <simdpp/core/blend.h>
#include <simdpp/core/cmp_lt.h>
#include <simdpp/core/i_add.h>
#include <simdpp/detail/not_implemented.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

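// Selects bytes from the 32-byte concatenation of a and b. In each mask
// byte, bits 0-3 give the byte index within the selected vector and bit 4
// selects the vector (clear: a, set: b); for example, a mask byte of 0x13
// selects b.el(3). Mask bytes with any of bits 5-7 set produce different
// results on different targets (the implementations below disagree for such
// values), so callers must keep those bits clear.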
static SIMDPP_INL
uint8x16 i_shuffle_bytes16(const uint8x16& a, const uint8x16& b, const uint8x16& mask)
{
#if SIMDPP_USE_NULL
    uint8x16 r;

    for (unsigned i = 0; i < 16; i++) {
        unsigned j = mask.el(i) & 0x0f;
        bool which = mask.el(i) < 0x10;
        r.el(i) = which ? a.el(j) : b.el(j);
    }
    return r;
#elif SIMDPP_USE_XOP
    return _mm_perm_epi8(a.native(), b.native(), mask.native());
#elif SIMDPP_USE_AVX
    // It's not advantageous to use _mm_blendv_epi8 on pre-AVX machines
    // because it takes the same number of cycles as the alternative, and
    // the non-VEX encoding requires the selection mask to be in the xmm0
    // register.
    int16x8 sel, ai, bi, r;
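    // Shifting left by 3 moves bit 4 of each mask byte (the a/b selector)
    // into bit 7, which is the bit _mm_blendv_epi8 examines. The 16-bit
    // shift also moves bits across the byte boundary within each lane, but
    // that is harmless because bits 5-7 of each mask byte must be clear.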
    sel = _mm_slli_epi16(mask.native(), 3);

    ai = _mm_shuffle_epi8(a.native(), mask.native());
    bi = _mm_shuffle_epi8(b.native(), mask.native());
    r = _mm_blendv_epi8(ai.native(), bi.native(), sel.native());
    return (uint8<16>) r;
#elif SIMDPP_USE_SSSE3
#if defined(__clang__) && (__clang_major__ == 3) && (__clang_minor__ >= 6) && (__clang_minor__ <= 7)
    // Clang 3.6 and 3.7 miscompile certain cases of constant masks when the
    // values are visible to the compiler for constant folding. Fortunately,
    // the overhead of the workaround is very small.
    uint8<16> ai, bi;
    mask_int8<16> select_a = cmp_lt((int8<16>) mask, 0x10);
    ai = _mm_shuffle_epi8(a.native(), mask.native());
    bi = _mm_shuffle_epi8(b.native(), mask.native());
    return blend(ai, bi, select_a);
#else
    uint8x16 m1, m2, ai, bi;
    // Set bit 7 (the PSHUFB "zero this element" flag) in every mask byte
    // that refers to the other vector: after the additions below, m1 zeroes
    // the elements that come from b, while m2 zeroes the elements that come
    // from a and maps b's indices 0x10..0x1f down to 0..15.
    m1 = add(mask, 0x70);
    m2 = add(mask, 0xf0);
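    // For example, a mask byte of 0x03 yields m1 = 0x73 (selects a.el(3))
    // and m2 = 0xf3 (zeroed), whereas a mask byte of 0x12 yields m1 = 0x82
    // (zeroed) and m2 = 0x02 (selects b.el(2)).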

    ai = _mm_shuffle_epi8(a.native(), m1.native());
    bi = _mm_shuffle_epi8(b.native(), m2.native());
    return bit_or(ai, bi);
#endif
#elif SIMDPP_USE_NEON32
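    // vtbl4_u8 looks up a table of four 8-byte rows, so the 32-byte
    // concatenation of a and b maps directly onto the mask's 0..31 index
    // range.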
    uint8x8x4_t table = {{vget_low_u8(a.native()), vget_high_u8(a.native()),
                          vget_low_u8(b.native()), vget_high_u8(b.native())}};
    uint8x8_t lo = vtbl4_u8(table, vget_low_u8(mask.native()));
    uint8x8_t hi = vtbl4_u8(table, vget_high_u8(mask.native()));
    return vcombine_u8(lo, hi);
#elif SIMDPP_USE_NEON64
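    // On AArch64, vqtbl2q_u8 looks up all 16 indices in the 32-byte {a, b}
    // table with a single TBL instruction.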
    uint8x16x2_t table;
    table.val[0] = a.native();
    table.val[1] = b.native();
    return vqtbl2q_u8(table, mask.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_perm(a.native(), b.native(), mask.native());
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_vshf_b((v16i8) mask.native(),
                                (v16i8) b.native(),
                                (v16i8) a.native());
#else
    return SIMDPP_NOT_IMPLEMENTED3(a, b, mask);
#endif
}
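
// A usage sketch (hypothetical values; user code reaches this through the
// public shuffle_bytes16 wrapper rather than by calling i_shuffle_bytes16
// directly):
//
//     uint8<16> a = make_uint(0, 1);       // 0, 1, 0, 1, ...
//     uint8<16> b = make_uint(2, 3);       // 2, 3, 2, 3, ...
//     uint8<16> m = make_uint(0x00, 0x11); // a.el(0), b.el(1), ...
//     uint8<16> r = shuffle_bytes16(a, b, m); // 0, 3, 0, 3, ...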

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint8x32 i_shuffle_bytes16(const uint8x32& a, const uint8x32& b, const uint8x32& mask)
{
    int16x16 sel, ai, bi, r;
    sel = mask;
    sel = _mm256_slli_epi16(sel.native(), 3); // move bit 4 into bit 7 for blendv; bits 5-7 of each mask byte are already clear

    ai = _mm256_shuffle_epi8(a.native(), mask.native());
    bi = _mm256_shuffle_epi8(b.native(), mask.native());
    r = _mm256_blendv_epi8(ai.native(), bi.native(), sel.native());
    return (uint8<32>) r;
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint8<64> i_shuffle_bytes16(const uint8<64>& a, const uint8<64>& b, const uint8<64>& mask)
{
    uint8<64> sel_mask, ai, bi, r;
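    // test_epi8_mask sets a bit in the k-register for every byte of `mask`
    // that has bit 4 set, i.e. for the elements that must come from b.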
    sel_mask = make_uint(0x10);
    __mmask64 sel = _mm512_test_epi8_mask(mask.native(), sel_mask.native());

    ai = _mm512_shuffle_epi8(a.native(), mask.native());
    bi = _mm512_shuffle_epi8(b.native(), mask.native());
    r = _mm512_mask_blend_epi8(sel, ai.native(), bi.native());
    return r;
}
#endif

template<unsigned N> SIMDPP_INL
uint8<N> i_shuffle_bytes16(const uint8<N>& a, const uint8<N>& b, const uint8<N>& mask)
{
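    // Process vectors wider than the native width by applying the overloads
    // above to each native-width part; note that every 16-byte block is
    // shuffled independently.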
    SIMDPP_VEC_ARRAY_IMPL3(uint8<N>, i_shuffle_bytes16, a, b, mask);
}

template<unsigned N> SIMDPP_INL
uint16<N> i_shuffle_bytes16(const uint16<N>& a, const uint16<N>& b, const uint16<N>& mask)
{
    return (uint16<N>) i_shuffle_bytes16(uint8<N*2>(a), uint8<N*2>(b), uint8<N*2>(mask));
}
template<unsigned N> SIMDPP_INL
uint32<N> i_shuffle_bytes16(const uint32<N>& a, const uint32<N>& b, const uint32<N>& mask)
{
    return (uint32<N>) i_shuffle_bytes16(uint8<N*4>(a), uint8<N*4>(b), uint8<N*4>(mask));
}
template<unsigned N> SIMDPP_INL
uint64<N> i_shuffle_bytes16(const uint64<N>& a, const uint64<N>& b, const uint64<N>& mask)
{
    return (uint64<N>) i_shuffle_bytes16(uint8<N*8>(a), uint8<N*8>(b), uint8<N*8>(mask));
}
template<unsigned N> SIMDPP_INL
float32<N> i_shuffle_bytes16(const float32<N>& a, const float32<N>& b, const uint32<N>& mask)
{
    return float32<N>(i_shuffle_bytes16(uint32<N>(a), uint32<N>(b), mask));
}
template<unsigned N> SIMDPP_INL
float64<N> i_shuffle_bytes16(const float64<N>& a, const float64<N>& b, const uint64<N>& mask)
{
    return float64<N>(i_shuffle_bytes16(uint64<N>(a), uint64<N>(b), mask));
}


} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif