1/* Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_PERMUTE4_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_PERMUTE4_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/types.h>
16#include <simdpp/core/make_shuffle_bytes_mask.h>
17#include <simdpp/core/permute_bytes16.h>
18#include <simdpp/detail/null/shuffle.h>
19#include <simdpp/detail/shuffle/neon_int16x8.h>
20#include <simdpp/detail/shuffle/neon_int32x4.h>
21#include <simdpp/detail/shuffle/neon_int64x2.h>
22#include <simdpp/detail/shuffle/shuffle_mask.h>
23#include <simdpp/detail/vector_array_macros.h>
24
25namespace simdpp {
26namespace SIMDPP_ARCH_NAMESPACE {
27namespace detail {
28namespace insn {
29
// Forward declarations due to circular dependencies. The 64-bit i_permute4
// overloads below call permute_emul on some targets; only the declarations are
// needed here, the definitions are not part of this header.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
uint64x4 permute_emul(const uint64x4& a);
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
float64x4 permute_emul(const float64x4& a);
35
36// ----
37
38template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
39uint16x8 i_permute4(const uint16x8& a)
40{
41 static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range");
42#if SIMDPP_USE_NULL
43 return detail::null::permute<s0,s1,s2,s3>(a);
44#elif SIMDPP_USE_SSE2
45 uint16<8> b = a;
46 b = _mm_shufflelo_epi16(b.native(), SIMDPP_SHUFFLE_MASK_4x4(s0, s1, s2, s3));
47 b = _mm_shufflehi_epi16(b.native(), SIMDPP_SHUFFLE_MASK_4x4(s0, s1, s2, s3));
48 return b;
49#elif SIMDPP_USE_NEON
50 return detail::neon_shuffle_int16x8::permute4<s0,s1,s2,s3>(a);
51#elif SIMDPP_USE_ALTIVEC
52 // TODO optimize
53 uint16x8 mask = make_shuffle_bytes16_mask<s0,s1,s2,s3>(mask);
54 return permute_bytes16(a, mask);
55#elif SIMDPP_USE_MSA
56 return (v8u16) __msa_shf_h((v8i16) a.native(), SIMDPP_SHUFFLE_MASK_4x4(s0,s1,s2,s3));
57#endif
58}
59
#if SIMDPP_USE_AVX2
// AVX2 overload for sixteen 16-bit elements. The low-half and high-half
// shuffles are issued one after the other with the same immediate built from
// s0..s3 (each selector must be less than 4).
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
uint16x16 i_permute4(const uint16x16& a)
{
    static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range");
    const uint16<16> lo_permuted =
        _mm256_shufflelo_epi16(a.native(), SIMDPP_SHUFFLE_MASK_4x4(s0, s1, s2, s3));
    return _mm256_shufflehi_epi16(lo_permuted.native(), SIMDPP_SHUFFLE_MASK_4x4(s0, s1, s2, s3));
}
#endif
71
#if SIMDPP_USE_AVX512BW
// AVX-512BW overload for thirty-two 16-bit elements. The low-half and
// high-half shuffles are issued one after the other with the same immediate
// built from s0..s3 (each selector must be less than 4).
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
uint16<32> i_permute4(const uint16<32>& a)
{
    static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range");
    const uint16<32> lo_permuted =
        _mm512_shufflelo_epi16(a.native(), SIMDPP_SHUFFLE_MASK_4x4(s0, s1, s2, s3));
    return _mm512_shufflehi_epi16(lo_permuted.native(), SIMDPP_SHUFFLE_MASK_4x4(s0, s1, s2, s3));
}
#endif
83
// Overload for 16-bit vectors that is generic over the element count N. It
// validates the selectors and delegates to i_permute4<s0,s1,s2,s3> through
// SIMDPP_VEC_ARRAY_IMPL1.
// NOTE(review): the macro comes from vector_array_macros.h and is not visible
// here; presumably it applies the function to each sub-vector of `a` and
// returns the assembled uint16<N> - confirm.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N> SIMDPP_INL
uint16<N> i_permute4(const uint16<N>& a)
{
    static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range");
    SIMDPP_VEC_ARRAY_IMPL1(uint16<N>, (i_permute4<s0,s1,s2,s3>), a);
}
90
91// -----------------------------------------------------------------------------
92
93template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
94uint32x4 i_permute4(const uint32x4& a)
95{
96 static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range");
97#if SIMDPP_USE_NULL
98 return detail::null::permute<s0,s1,s2,s3>(a);
99#elif SIMDPP_USE_SSE2
100 return _mm_shuffle_epi32(a.native(), _MM_SHUFFLE(s3, s2, s1, s0));
101#elif SIMDPP_USE_NEON
102 return detail::neon_shuffle_int32x4::permute4<s0,s1,s2,s3>(a);
103#elif SIMDPP_USE_ALTIVEC
104 // TODO optimize
105 uint32x4 mask = make_shuffle_bytes16_mask<s0,s1,s2,s3>(mask);
106 return permute_bytes16(a, mask);
107#elif SIMDPP_USE_MSA
108 return (v4u32) __msa_shf_w((v4i32) a.native(), SIMDPP_SHUFFLE_MASK_4x4(s0,s1,s2,s3));
109#endif
110}
111
#if SIMDPP_USE_AVX2
// AVX2 overload for eight 32-bit elements; a single shuffle instruction is
// issued with an immediate built from s0..s3 (each must be less than 4).
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
uint32x8 i_permute4(const uint32x8& a)
{
    static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range");
    // _MM_SHUFFLE takes its arguments highest selector first.
    const __m256i shuffled = _mm256_shuffle_epi32(a.native(), _MM_SHUFFLE(s3, s2, s1, s0));
    return shuffled;
}
#endif
120
#if SIMDPP_USE_AVX512F
// AVX-512F overload for sixteen 32-bit elements; a single shuffle instruction
// is issued with an immediate built from s0..s3 (each must be less than 4).
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
uint32<16> i_permute4(const uint32<16>& a)
{
    static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range");
    // The immediate is wrapped in _MM_PERM_ENUM for this intrinsic.
    const __m512i shuffled =
        _mm512_shuffle_epi32(a.native(), _MM_PERM_ENUM(_MM_SHUFFLE(s3, s2, s1, s0)));
    return shuffled;
}
#endif
129
// Overload for 32-bit integer vectors that is generic over the element count
// N. It validates the selectors and delegates to i_permute4<s0,s1,s2,s3>
// through SIMDPP_VEC_ARRAY_IMPL1.
// NOTE(review): the macro comes from vector_array_macros.h and is not visible
// here; presumably it applies the function to each sub-vector of `a` and
// returns the assembled uint32<N> - confirm.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N> SIMDPP_INL
uint32<N> i_permute4(const uint32<N>& a)
{
    static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range");
    SIMDPP_VEC_ARRAY_IMPL1(uint32<N>, (i_permute4<s0,s1,s2,s3>), a);
}
136
137// -----------------------------------------------------------------------------
138
139template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
140float32x4 i_permute4(const float32x4& a)
141{
142 static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range");
143#if SIMDPP_USE_NULL
144 return detail::null::permute<s0,s1,s2,s3>(a);
145#elif SIMDPP_USE_SSE2
146 return _mm_shuffle_ps(a.native(), a.native(), _MM_SHUFFLE(s3, s2, s1, s0));
147#elif SIMDPP_USE_NEON
148 return float32x4(detail::neon_shuffle_int32x4::permute4<s0,s1,s2,s3>(int32x4(a)));
149#elif SIMDPP_USE_ALTIVEC
150 // TODO optimize
151 uint32x4 mask = make_shuffle_bytes16_mask<s0,s1,s2,s3>(mask);
152 return permute_bytes16(a, mask);
153#elif SIMDPP_USE_MSA
154 return (v4f32) __msa_shf_w((v4i32) a.native(), SIMDPP_SHUFFLE_MASK_4x4(s0,s1,s2,s3));
155#endif
156}
157
#if SIMDPP_USE_AVX
// AVX overload for eight 32-bit floating-point elements; a single shuffle
// instruction is issued with an immediate built from s0..s3 (each must be less
// than 4).
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
float32x8 i_permute4(const float32x8& a)
{
    static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range");
    // The same register supplies both operands of the two-source shuffle.
    const __m256 src = a.native();
    return _mm256_shuffle_ps(src, src, _MM_SHUFFLE(s3, s2, s1, s0));
}
#endif
166
#if SIMDPP_USE_AVX512F
// AVX-512F overload for sixteen 32-bit floating-point elements; a single
// shuffle instruction is issued with an immediate built from s0..s3 (each must
// be less than 4).
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
float32<16> i_permute4(const float32<16>& a)
{
    static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range");
    // The same register supplies both operands of the two-source shuffle.
    const __m512 src = a.native();
    return _mm512_shuffle_ps(src, src, _MM_SHUFFLE(s3, s2, s1, s0));
}
#endif
175
// Overload for 32-bit floating-point vectors that is generic over the element
// count N. It validates the selectors and delegates to
// i_permute4<s0,s1,s2,s3> through SIMDPP_VEC_ARRAY_IMPL1.
// NOTE(review): the macro comes from vector_array_macros.h and is not visible
// here; presumably it applies the function to each sub-vector of `a` and
// returns the assembled float32<N> - confirm.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N> SIMDPP_INL
float32<N> i_permute4(const float32<N>& a)
{
    static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range");
    SIMDPP_VEC_ARRAY_IMPL1(float32<N>, (i_permute4<s0,s1,s2,s3>), a);
}
182
183// -----------------------------------------------------------------------------
184
185template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
186uint64x4 i_permute4(const uint64x4& a)
187{
188 static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range");
189#if SIMDPP_USE_AVX2
190 return _mm256_permute4x64_epi64(a.native(), _MM_SHUFFLE(s3, s2, s1, s0));
191#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
192 return permute_emul<s0,s1,s2,s3>(a);
193#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
194 uint64x4 r;
195 r.vec(0).el(0) = a.vec(s0/2).el(s0%2);
196 r.vec(0).el(1) = a.vec(s1/2).el(s1%2);
197 r.vec(1).el(0) = a.vec(s2/2).el(s2%2);
198 r.vec(1).el(1) = a.vec(s3/2).el(s3%2);
199 return r;
200#endif
201}
202
#if SIMDPP_USE_AVX512F
// AVX-512F overload for eight 64-bit elements; a single permute instruction is
// issued with an immediate built from s0..s3 (each must be less than 4).
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
uint64<8> i_permute4(const uint64<8>& a)
{
    static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range");
    // _MM_SHUFFLE takes its arguments highest selector first.
    const __m512i permuted = _mm512_permutex_epi64(a.native(), _MM_SHUFFLE(s3, s2, s1, s0));
    return permuted;
}
#endif
211
// Overload for 64-bit integer vectors that is generic over the element count
// N. It validates the selectors and delegates to i_permute4<s0,s1,s2,s3>
// through SIMDPP_VEC_ARRAY_IMPL1.
// NOTE(review): the macro comes from vector_array_macros.h and is not visible
// here; presumably it applies the function to each sub-vector of `a` and
// returns the assembled uint64<N> - confirm.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N> SIMDPP_INL
uint64<N> i_permute4(const uint64<N>& a)
{
    static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range");
    SIMDPP_VEC_ARRAY_IMPL1(uint64<N>, (i_permute4<s0,s1,s2,s3>), a);
}
218
219// -----------------------------------------------------------------------------
220
221template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
222float64x4 i_permute4(const float64x4& a)
223{
224 static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range");
225#if SIMDPP_USE_AVX2
226 return _mm256_permute4x64_pd(a.native(), _MM_SHUFFLE(s3, s2, s1, s0));
227#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON64 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
228 return permute_emul<s0,s1,s2,s3>(a);
229#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC
230 float64x4 r;
231 r.vec(0).el(0) = a.vec(s0/2).el(s0%2);
232 r.vec(0).el(1) = a.vec(s1/2).el(s1%2);
233 r.vec(1).el(0) = a.vec(s2/2).el(s2%2);
234 r.vec(1).el(1) = a.vec(s3/2).el(s3%2);
235 return r;
236#endif
237}
238
#if SIMDPP_USE_AVX512F
// AVX-512F overload for eight 64-bit floating-point elements; a single permute
// instruction is issued with an immediate built from s0..s3 (each must be less
// than 4).
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
float64<8> i_permute4(const float64<8>& a)
{
    static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range");
    // _MM_SHUFFLE takes its arguments highest selector first.
    const __m512d permuted = _mm512_permutex_pd(a.native(), _MM_SHUFFLE(s3, s2, s1, s0));
    return permuted;
}
#endif
247
// Overload for 64-bit floating-point vectors that is generic over the element
// count N. It validates the selectors and delegates to
// i_permute4<s0,s1,s2,s3> through SIMDPP_VEC_ARRAY_IMPL1.
// NOTE(review): the macro comes from vector_array_macros.h and is not visible
// here; presumably it applies the function to each sub-vector of `a` and
// returns the assembled float64<N> - confirm.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N> SIMDPP_INL
float64<N> i_permute4(const float64<N>& a)
{
    static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range");
    SIMDPP_VEC_ARRAY_IMPL1(float64<N>, (i_permute4<s0,s1,s2,s3>), a);
}
254
255} // namespace insn
256} // namespace detail
257} // namespace SIMDPP_ARCH_NAMESPACE
258} // namespace simdpp
259
260#endif
261
262