1 | /* Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_PERMUTE4_H |
9 | #define LIBSIMDPP_SIMDPP_DETAIL_INSN_PERMUTE4_H |
10 | |
11 | #ifndef LIBSIMDPP_SIMD_H |
12 | #error "This file must be included through simd.h" |
13 | #endif |
14 | |
15 | #include <simdpp/types.h> |
16 | #include <simdpp/core/make_shuffle_bytes_mask.h> |
17 | #include <simdpp/core/permute_bytes16.h> |
18 | #include <simdpp/detail/null/shuffle.h> |
19 | #include <simdpp/detail/shuffle/neon_int16x8.h> |
20 | #include <simdpp/detail/shuffle/neon_int32x4.h> |
21 | #include <simdpp/detail/shuffle/neon_int64x2.h> |
22 | #include <simdpp/detail/shuffle/shuffle_mask.h> |
23 | #include <simdpp/detail/vector_array_macros.h> |
24 | |
25 | namespace simdpp { |
26 | namespace SIMDPP_ARCH_NAMESPACE { |
27 | namespace detail { |
28 | namespace insn { |
29 | |
// Forward declarations. The 64-bit i_permute4 overloads further down call
// permute_emul on some targets; its definition cannot be included from here
// because of a circular dependency, so only the signatures are declared.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
uint64x4 permute_emul(const uint64x4& a);
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
float64x4 permute_emul(const float64x4& a);
35 | |
36 | // ---- |
37 | |
38 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
39 | uint16x8 i_permute4(const uint16x8& a) |
40 | { |
41 | static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range" ); |
42 | #if SIMDPP_USE_NULL |
43 | return detail::null::permute<s0,s1,s2,s3>(a); |
44 | #elif SIMDPP_USE_SSE2 |
45 | uint16<8> b = a; |
46 | b = _mm_shufflelo_epi16(b.native(), SIMDPP_SHUFFLE_MASK_4x4(s0, s1, s2, s3)); |
47 | b = _mm_shufflehi_epi16(b.native(), SIMDPP_SHUFFLE_MASK_4x4(s0, s1, s2, s3)); |
48 | return b; |
49 | #elif SIMDPP_USE_NEON |
50 | return detail::neon_shuffle_int16x8::permute4<s0,s1,s2,s3>(a); |
51 | #elif SIMDPP_USE_ALTIVEC |
52 | // TODO optimize |
53 | uint16x8 mask = make_shuffle_bytes16_mask<s0,s1,s2,s3>(mask); |
54 | return permute_bytes16(a, mask); |
55 | #elif SIMDPP_USE_MSA |
56 | return (v8u16) __msa_shf_h((v8i16) a.native(), SIMDPP_SHUFFLE_MASK_4x4(s0,s1,s2,s3)); |
57 | #endif |
58 | } |
59 | |
#if SIMDPP_USE_AVX2
// AVX2 overload for 16 x 16-bit elements. Selectors s0..s3 must each be in
// the range [0, 3].
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
uint16x16 i_permute4(const uint16x16& a)
{
    static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4,
                  "Selector out of range" );
    // Low-half shuffle first; its result is the input of the high-half
    // shuffle. Both use the same immediate.
    uint16<16> shuffled = a;
    shuffled = _mm256_shufflehi_epi16(
            _mm256_shufflelo_epi16(shuffled.native(),
                                   SIMDPP_SHUFFLE_MASK_4x4(s0, s1, s2, s3)),
            SIMDPP_SHUFFLE_MASK_4x4(s0, s1, s2, s3));
    return shuffled;
}
#endif
71 | |
#if SIMDPP_USE_AVX512BW
// AVX-512BW overload for 32 x 16-bit elements. Selectors s0..s3 must each be
// in the range [0, 3].
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
uint16<32> i_permute4(const uint16<32>& a)
{
    static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4,
                  "Selector out of range" );
    // Low-half shuffle first; its result is the input of the high-half
    // shuffle. Both use the same immediate.
    uint16<32> shuffled = a;
    shuffled = _mm512_shufflehi_epi16(
            _mm512_shufflelo_epi16(shuffled.native(),
                                   SIMDPP_SHUFFLE_MASK_4x4(s0, s1, s2, s3)),
            SIMDPP_SHUFFLE_MASK_4x4(s0, s1, s2, s3));
    return shuffled;
}
#endif
83 | |
84 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N> SIMDPP_INL |
85 | uint16<N> i_permute4(const uint16<N>& a) |
86 | { |
87 | static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range" ); |
88 | SIMDPP_VEC_ARRAY_IMPL1(uint16<N>, (i_permute4<s0,s1,s2,s3>), a); |
89 | } |
90 | |
91 | // ----------------------------------------------------------------------------- |
92 | |
93 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
94 | uint32x4 i_permute4(const uint32x4& a) |
95 | { |
96 | static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range" ); |
97 | #if SIMDPP_USE_NULL |
98 | return detail::null::permute<s0,s1,s2,s3>(a); |
99 | #elif SIMDPP_USE_SSE2 |
100 | return _mm_shuffle_epi32(a.native(), _MM_SHUFFLE(s3, s2, s1, s0)); |
101 | #elif SIMDPP_USE_NEON |
102 | return detail::neon_shuffle_int32x4::permute4<s0,s1,s2,s3>(a); |
103 | #elif SIMDPP_USE_ALTIVEC |
104 | // TODO optimize |
105 | uint32x4 mask = make_shuffle_bytes16_mask<s0,s1,s2,s3>(mask); |
106 | return permute_bytes16(a, mask); |
107 | #elif SIMDPP_USE_MSA |
108 | return (v4u32) __msa_shf_w((v4i32) a.native(), SIMDPP_SHUFFLE_MASK_4x4(s0,s1,s2,s3)); |
109 | #endif |
110 | } |
111 | |
#if SIMDPP_USE_AVX2
// AVX2 overload for 8 x 32-bit elements. Selectors s0..s3 must each be in the
// range [0, 3].
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
uint32x8 i_permute4(const uint32x8& a)
{
    static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4,
                  "Selector out of range" );
    // _MM_SHUFFLE takes the selector for the highest element first.
    uint32x8 shuffled = _mm256_shuffle_epi32(a.native(),
                                             _MM_SHUFFLE(s3, s2, s1, s0));
    return shuffled;
}
#endif
120 | |
#if SIMDPP_USE_AVX512F
// AVX-512F overload for 16 x 32-bit elements. Selectors s0..s3 must each be
// in the range [0, 3].
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
uint32<16> i_permute4(const uint32<16>& a)
{
    static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4,
                  "Selector out of range" );
    // The 512-bit shuffle expects its immediate as _MM_PERM_ENUM, hence the
    // conversion of the _MM_SHUFFLE value.
    uint32<16> shuffled = _mm512_shuffle_epi32(
            a.native(), _MM_PERM_ENUM(_MM_SHUFFLE(s3, s2, s1, s0)));
    return shuffled;
}
#endif
129 | |
130 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N> SIMDPP_INL |
131 | uint32<N> i_permute4(const uint32<N>& a) |
132 | { |
133 | static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range" ); |
134 | SIMDPP_VEC_ARRAY_IMPL1(uint32<N>, (i_permute4<s0,s1,s2,s3>), a); |
135 | } |
136 | |
137 | // ----------------------------------------------------------------------------- |
138 | |
139 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
140 | float32x4 i_permute4(const float32x4& a) |
141 | { |
142 | static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range" ); |
143 | #if SIMDPP_USE_NULL |
144 | return detail::null::permute<s0,s1,s2,s3>(a); |
145 | #elif SIMDPP_USE_SSE2 |
146 | return _mm_shuffle_ps(a.native(), a.native(), _MM_SHUFFLE(s3, s2, s1, s0)); |
147 | #elif SIMDPP_USE_NEON |
148 | return float32x4(detail::neon_shuffle_int32x4::permute4<s0,s1,s2,s3>(int32x4(a))); |
149 | #elif SIMDPP_USE_ALTIVEC |
150 | // TODO optimize |
151 | uint32x4 mask = make_shuffle_bytes16_mask<s0,s1,s2,s3>(mask); |
152 | return permute_bytes16(a, mask); |
153 | #elif SIMDPP_USE_MSA |
154 | return (v4f32) __msa_shf_w((v4i32) a.native(), SIMDPP_SHUFFLE_MASK_4x4(s0,s1,s2,s3)); |
155 | #endif |
156 | } |
157 | |
#if SIMDPP_USE_AVX
// AVX overload for 8 x 32-bit floating-point elements. Selectors s0..s3 must
// each be in the range [0, 3].
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
float32x8 i_permute4(const float32x8& a)
{
    static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4,
                  "Selector out of range" );
    // Both shuffle operands are 'a', so every selected element comes from it.
    float32x8 shuffled = _mm256_shuffle_ps(a.native(), a.native(),
                                           _MM_SHUFFLE(s3, s2, s1, s0));
    return shuffled;
}
#endif
166 | |
#if SIMDPP_USE_AVX512F
// AVX-512F overload for 16 x 32-bit floating-point elements. Selectors s0..s3
// must each be in the range [0, 3].
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
float32<16> i_permute4(const float32<16>& a)
{
    static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4,
                  "Selector out of range" );
    // Both shuffle operands are 'a', so every selected element comes from it.
    float32<16> shuffled = _mm512_shuffle_ps(a.native(), a.native(),
                                             _MM_SHUFFLE(s3, s2, s1, s0));
    return shuffled;
}
#endif
175 | |
176 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N> SIMDPP_INL |
177 | float32<N> i_permute4(const float32<N>& a) |
178 | { |
179 | static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range" ); |
180 | SIMDPP_VEC_ARRAY_IMPL1(float32<N>, (i_permute4<s0,s1,s2,s3>), a); |
181 | } |
182 | |
183 | // ----------------------------------------------------------------------------- |
184 | |
185 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
186 | uint64x4 i_permute4(const uint64x4& a) |
187 | { |
188 | static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range" ); |
189 | #if SIMDPP_USE_AVX2 |
190 | return _mm256_permute4x64_epi64(a.native(), _MM_SHUFFLE(s3, s2, s1, s0)); |
191 | #elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA |
192 | return permute_emul<s0,s1,s2,s3>(a); |
193 | #elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC |
194 | uint64x4 r; |
195 | r.vec(0).el(0) = a.vec(s0/2).el(s0%2); |
196 | r.vec(0).el(1) = a.vec(s1/2).el(s1%2); |
197 | r.vec(1).el(0) = a.vec(s2/2).el(s2%2); |
198 | r.vec(1).el(1) = a.vec(s3/2).el(s3%2); |
199 | return r; |
200 | #endif |
201 | } |
202 | |
#if SIMDPP_USE_AVX512F
// AVX-512F overload for 8 x 64-bit elements. Selectors s0..s3 must each be in
// the range [0, 3].
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
uint64<8> i_permute4(const uint64<8>& a)
{
    static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4,
                  "Selector out of range" );
    // _MM_SHUFFLE takes the selector for the highest element first.
    uint64<8> shuffled = _mm512_permutex_epi64(a.native(),
                                               _MM_SHUFFLE(s3, s2, s1, s0));
    return shuffled;
}
#endif
211 | |
212 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N> SIMDPP_INL |
213 | uint64<N> i_permute4(const uint64<N>& a) |
214 | { |
215 | static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range" ); |
216 | SIMDPP_VEC_ARRAY_IMPL1(uint64<N>, (i_permute4<s0,s1,s2,s3>), a); |
217 | } |
218 | |
219 | // ----------------------------------------------------------------------------- |
220 | |
221 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
222 | float64x4 i_permute4(const float64x4& a) |
223 | { |
224 | static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range" ); |
225 | #if SIMDPP_USE_AVX2 |
226 | return _mm256_permute4x64_pd(a.native(), _MM_SHUFFLE(s3, s2, s1, s0)); |
227 | #elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON64 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA |
228 | return permute_emul<s0,s1,s2,s3>(a); |
229 | #elif SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC |
230 | float64x4 r; |
231 | r.vec(0).el(0) = a.vec(s0/2).el(s0%2); |
232 | r.vec(0).el(1) = a.vec(s1/2).el(s1%2); |
233 | r.vec(1).el(0) = a.vec(s2/2).el(s2%2); |
234 | r.vec(1).el(1) = a.vec(s3/2).el(s3%2); |
235 | return r; |
236 | #endif |
237 | } |
238 | |
#if SIMDPP_USE_AVX512F
// AVX-512F overload for 8 x 64-bit floating-point elements. Selectors s0..s3
// must each be in the range [0, 3].
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
float64<8> i_permute4(const float64<8>& a)
{
    static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4,
                  "Selector out of range" );
    // _MM_SHUFFLE takes the selector for the highest element first.
    float64<8> shuffled = _mm512_permutex_pd(a.native(),
                                             _MM_SHUFFLE(s3, s2, s1, s0));
    return shuffled;
}
#endif
247 | |
248 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N> SIMDPP_INL |
249 | float64<N> i_permute4(const float64<N>& a) |
250 | { |
251 | static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range" ); |
252 | SIMDPP_VEC_ARRAY_IMPL1(float64<N>, (i_permute4<s0,s1,s2,s3>), a); |
253 | } |
254 | |
255 | } // namespace insn |
256 | } // namespace detail |
257 | } // namespace SIMDPP_ARCH_NAMESPACE |
258 | } // namespace simdpp |
259 | |
260 | #endif |
261 | |
262 | |