1 | /* Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_SHUFFLE4x2_H |
9 | #define LIBSIMDPP_SIMDPP_DETAIL_INSN_SHUFFLE4x2_H |
10 | |
11 | #ifndef LIBSIMDPP_SIMD_H |
12 | #error "This file must be included through simd.h" |
13 | #endif |
14 | |
15 | #include <simdpp/types.h> |
16 | #include <simdpp/core/make_shuffle_bytes_mask.h> |
17 | #include <simdpp/core/shuffle_bytes16.h> |
18 | #include <simdpp/detail/insn/shuffle2x2.h> |
19 | #include <simdpp/detail/shuffle/sse_float32_4x2.h> |
20 | #include <simdpp/detail/shuffle/sse_float64_4x2.h> |
21 | #include <simdpp/detail/shuffle/sse_int32_4x2.h> |
22 | #include <simdpp/detail/shuffle/sse_int64_4x2.h> |
23 | #include <simdpp/detail/shuffle/neon_int32x4.h> |
24 | #include <simdpp/detail/not_implemented.h> |
25 | #include <simdpp/detail/vector_array_macros.h> |
26 | |
27 | namespace simdpp { |
28 | namespace SIMDPP_ARCH_NAMESPACE { |
29 | namespace detail { |
30 | namespace insn { |
31 | |
32 | // ----------------------------------------------------------------------------- |
33 | // emulates 64x4 shuffle on architectures with 128-bit vectors |
34 | |
35 | template<unsigned s0, unsigned s1, class V> |
36 | V i_shuffle_emul_64x4_half(const V& a0, const V& a1, const V& b0, const V& b1) |
37 | { |
38 | const V& h0 = s0 < 2 ? a0 : |
39 | s0 < 4 ? a1 : |
40 | s0 < 6 ? b0 : b1; |
41 | const V& h1 = s1 < 2 ? a0 : |
42 | s1 < 4 ? a1 : |
43 | s1 < 6 ? b0 : b1; |
44 | return i_shuffle2x2<s0%2, s1%2+2>(h0, h1); |
45 | } |
46 | |
47 | // ----------------------------------------------------------------------------- |
48 | // float32 |
49 | |
50 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
51 | float32<4> i_shuffle4x2(const float32<4>& a, const float32<4>& b) |
52 | { |
53 | static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range" ); |
54 | #if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP |
55 | float32<4> r; |
56 | r.el(0) = s0 < 4 ? a.el(s0) : b.el(s0-4); |
57 | r.el(1) = s1 < 4 ? a.el(s1) : b.el(s1-4); |
58 | r.el(2) = s2 < 4 ? a.el(s2) : b.el(s2-4); |
59 | r.el(3) = s3 < 4 ? a.el(s3) : b.el(s3-4); |
60 | return r; |
61 | #elif SIMDPP_USE_SSE2 |
62 | return sse_shuffle4x2_float32::do_shuffle<s0, s1, s2, s3>(a, b); |
63 | #elif SIMDPP_USE_NEON_FLT_SP |
64 | return (float32<4>)detail::neon_shuffle_int32x4::shuffle4x2<s0, s1, s2, s3>(uint32<4>(a), uint32<4>(b)); |
65 | #elif SIMDPP_USE_ALTIVEC |
66 | uint32<4> mask = make_shuffle_bytes16_mask<s0, s1, s2, s3>(mask); |
67 | return shuffle_bytes16(a, b, mask); |
68 | #elif SIMDPP_USE_MSA |
69 | uint32<4> mask = make_uint(s0,s1,s2,s3); |
70 | return (v4f32) __msa_vshf_w((v4i32) mask.native(), |
71 | (v4i32) b.native(), |
72 | (v4i32) a.native()); |
73 | #else |
74 | return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(int64<s0+4>, a, b); |
75 | #endif |
76 | } |
77 | |
#if SIMDPP_USE_AVX
// Eight-element float32 overload, compiled only when SIMDPP_USE_AVX is set.
// Checks the selectors at compile time and forwards to
// sse_shuffle4x2_float32::do_shuffle, the same helper the four-element SSE2
// path uses.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
SIMDPP_INL float32<8> i_shuffle4x2(const float32<8>& a, const float32<8>& b)
{
    static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range" );
    float32<8> shuffled = sse_shuffle4x2_float32::do_shuffle<s0, s1, s2, s3>(a, b);
    return shuffled;
}
#endif
86 | |
#if SIMDPP_USE_AVX512F
// Sixteen-element float32 overload, compiled only when SIMDPP_USE_AVX512F is
// set. Checks the selectors at compile time and forwards to
// sse_shuffle4x2_float32::do_shuffle.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
SIMDPP_INL float32<16> i_shuffle4x2(const float32<16>& a, const float32<16>& b)
{
    static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range" );
    float32<16> shuffled = sse_shuffle4x2_float32::do_shuffle<s0, s1, s2, s3>(a, b);
    return shuffled;
}
#endif
95 | |
96 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N> SIMDPP_INL |
97 | float32<N> i_shuffle4x2(const float32<N>& a, const float32<N>& b) |
98 | { |
99 | SIMDPP_VEC_ARRAY_IMPL2(float32<N>, (i_shuffle4x2<s0,s1,s2,s3>), a, b); |
100 | } |
101 | |
102 | // ----------------------------------------------------------------------------- |
103 | // float64 |
104 | |
#if SIMDPP_USE_AVX
// Four-element float64 overload, compiled only when SIMDPP_USE_AVX is set.
// Checks the selectors at compile time and forwards to
// sse_shuffle4x2_float64::do_shuffle.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
SIMDPP_INL float64<4> i_shuffle4x2(const float64<4>& a, const float64<4>& b)
{
    static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range" );
    float64<4> shuffled = sse_shuffle4x2_float64::do_shuffle<s0, s1, s2, s3>(a, b);
    return shuffled;
}
#endif
113 | |
#if SIMDPP_USE_AVX512F
// Eight-element float64 overload, compiled only when SIMDPP_USE_AVX512F is
// set. Checks the selectors at compile time and forwards to
// sse_shuffle4x2_float64::do_shuffle.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
SIMDPP_INL float64<8> i_shuffle4x2(const float64<8>& a, const float64<8>& b)
{
    static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range" );
    float64<8> shuffled = sse_shuffle4x2_float64::do_shuffle<s0, s1, s2, s3>(a, b);
    return shuffled;
}
#endif
122 | |
123 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N> SIMDPP_INL |
124 | float64<N> i_shuffle4x2(const float64<N>& a, const float64<N>& b) |
125 | { |
126 | #if SIMDPP_USE_AVX |
127 | SIMDPP_VEC_ARRAY_IMPL2(float64<N>, (i_shuffle4x2<s0,s1,s2,s3>), a, b); |
128 | #else |
129 | float64<N> r; |
130 | for (unsigned i = 0; i < float64<N>::vec_length; i+=2) { |
131 | r.vec(i*2) = i_shuffle_emul_64x4_half<s0,s1>(a.vec(i*2), a.vec(i*2+1), |
132 | b.vec(i*2), b.vec(i*2+1)); |
133 | r.vec(i*2+1) = i_shuffle_emul_64x4_half<s2,s3>(a.vec(i*2), a.vec(i*2+1), |
134 | b.vec(i*2), b.vec(i*2+1)); |
135 | } |
136 | return r; |
137 | #endif |
138 | } |
139 | |
// -----------------------------------------------------------------------------
// int32

142 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
143 | uint32<4> i_shuffle4x2(const uint32<4>& a, const uint32<4>& b) |
144 | { |
145 | static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range" ); |
146 | #if SIMDPP_USE_NULL |
147 | uint32<4> r; |
148 | r.el(0) = s0 < 4 ? a.el(s0) : b.el(s0-4); |
149 | r.el(1) = s1 < 4 ? a.el(s1) : b.el(s1-4); |
150 | r.el(2) = s2 < 4 ? a.el(s2) : b.el(s2-4); |
151 | r.el(3) = s3 < 4 ? a.el(s3) : b.el(s3-4); |
152 | return r; |
153 | #elif SIMDPP_USE_SSE2 |
154 | return sse_shuffle4x2_int32::do_shuffle<s0, s1, s2, s3>(a, b); |
155 | #elif SIMDPP_USE_NEON |
156 | return detail::neon_shuffle_int32x4::shuffle4x2<s0, s1, s2, s3>(a, b); |
157 | #elif SIMDPP_USE_ALTIVEC |
158 | uint32<4> mask = make_shuffle_bytes16_mask<s0, s1, s2, s3>(mask); |
159 | return shuffle_bytes16(a, b, mask); |
160 | #elif SIMDPP_USE_MSA |
161 | uint32<4> mask = make_uint(s0,s1,s2,s3); |
162 | return (v4u32) __msa_vshf_w((v4i32) mask.native(), |
163 | (v4i32) b.native(), |
164 | (v4i32) a.native()); |
165 | #else |
166 | return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(int64<s0+4>, a, b); |
167 | #endif |
168 | } |
169 | |
#if SIMDPP_USE_AVX2
// Eight-element uint32 overload, compiled only when SIMDPP_USE_AVX2 is set.
// Checks the selectors at compile time and forwards to
// sse_shuffle4x2_int32::do_shuffle, the same helper the four-element SSE2
// path uses.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
SIMDPP_INL uint32<8> i_shuffle4x2(const uint32<8>& a, const uint32<8>& b)
{
    static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range" );
    uint32<8> shuffled = sse_shuffle4x2_int32::do_shuffle<s0, s1, s2, s3>(a, b);
    return shuffled;
}
#endif
178 | |
#if SIMDPP_USE_AVX512F
// Sixteen-element uint32 overload, compiled only when SIMDPP_USE_AVX512F is
// set. Checks the selectors at compile time and forwards to
// sse_shuffle4x2_int32::do_shuffle.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
SIMDPP_INL uint32<16> i_shuffle4x2(const uint32<16>& a, const uint32<16>& b)
{
    static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range" );
    uint32<16> shuffled = sse_shuffle4x2_int32::do_shuffle<s0, s1, s2, s3>(a, b);
    return shuffled;
}
#endif
187 | |
188 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N> SIMDPP_INL |
189 | uint32<N> i_shuffle4x2(const uint32<N>& a, const uint32<N>& b) |
190 | { |
191 | SIMDPP_VEC_ARRAY_IMPL2(uint32<N>, (i_shuffle4x2<s0,s1,s2,s3>), a, b); |
192 | } |
193 | |
194 | // ----------------------------------------------------------------------------- |
195 | // int64 |
196 | |
#if SIMDPP_USE_AVX2
// Four-element uint64 overload, compiled only when SIMDPP_USE_AVX2 is set.
// Checks the selectors at compile time and forwards to
// sse_shuffle4x2_int64::do_shuffle.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
SIMDPP_INL uint64<4> i_shuffle4x2(const uint64<4>& a, const uint64<4>& b)
{
    static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range" );
    uint64<4> shuffled = sse_shuffle4x2_int64::do_shuffle<s0, s1, s2, s3>(a, b);
    return shuffled;
}
#endif
205 | |
#if SIMDPP_USE_AVX512F
// Eight-element uint64 overload, compiled only when SIMDPP_USE_AVX512F is
// set. Checks the selectors at compile time and forwards to
// sse_shuffle4x2_int64::do_shuffle.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
SIMDPP_INL uint64<8> i_shuffle4x2(const uint64<8>& a, const uint64<8>& b)
{
    static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range" );
    uint64<8> shuffled = sse_shuffle4x2_int64::do_shuffle<s0, s1, s2, s3>(a, b);
    return shuffled;
}
#endif
214 | |
215 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N> SIMDPP_INL |
216 | uint64<N> i_shuffle4x2(const uint64<N>& a, const uint64<N>& b) |
217 | { |
218 | #if SIMDPP_USE_AVX2 |
219 | SIMDPP_VEC_ARRAY_IMPL2(uint64<N>, (i_shuffle4x2<s0,s1,s2,s3>), a, b); |
220 | #else |
221 | uint64<N> r; |
222 | for (unsigned i = 0; i < uint64<N>::vec_length; i+=2) { |
223 | r.vec(i*2) = i_shuffle_emul_64x4_half<s0,s1>(a.vec(i*2), a.vec(i*2+1), |
224 | b.vec(i*2), b.vec(i*2+1)); |
225 | r.vec(i*2+1) = i_shuffle_emul_64x4_half<s2,s3>(a.vec(i*2), a.vec(i*2+1), |
226 | b.vec(i*2), b.vec(i*2+1)); |
227 | } |
228 | return r; |
229 | #endif |
230 | } |
231 | |
232 | |
233 | } // namespace insn |
234 | } // namespace detail |
235 | } // namespace SIMDPP_ARCH_NAMESPACE |
236 | } // namespace simdpp |
237 | |
238 | #endif |
239 | |
240 | |