1/* Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_SHUFFLE4x2_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_SHUFFLE4x2_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/types.h>
16#include <simdpp/core/make_shuffle_bytes_mask.h>
17#include <simdpp/core/shuffle_bytes16.h>
18#include <simdpp/detail/insn/shuffle2x2.h>
19#include <simdpp/detail/shuffle/sse_float32_4x2.h>
20#include <simdpp/detail/shuffle/sse_float64_4x2.h>
21#include <simdpp/detail/shuffle/sse_int32_4x2.h>
22#include <simdpp/detail/shuffle/sse_int64_4x2.h>
23#include <simdpp/detail/shuffle/neon_int32x4.h>
24#include <simdpp/detail/not_implemented.h>
25#include <simdpp/detail/vector_array_macros.h>
26
27namespace simdpp {
28namespace SIMDPP_ARCH_NAMESPACE {
29namespace detail {
30namespace insn {
31
32// -----------------------------------------------------------------------------
33// emulates 64x4 shuffle on architectures with 128-bit vectors
34
35template<unsigned s0, unsigned s1, class V>
36V i_shuffle_emul_64x4_half(const V& a0, const V& a1, const V& b0, const V& b1)
37{
38 const V& h0 = s0 < 2 ? a0 :
39 s0 < 4 ? a1 :
40 s0 < 6 ? b0 : b1;
41 const V& h1 = s1 < 2 ? a0 :
42 s1 < 4 ? a1 :
43 s1 < 6 ? b0 : b1;
44 return i_shuffle2x2<s0%2, s1%2+2>(h0, h1);
45}
46
47// -----------------------------------------------------------------------------
48// float32
49
50template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
51float32<4> i_shuffle4x2(const float32<4>& a, const float32<4>& b)
52{
53 static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range");
54#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
55 float32<4> r;
56 r.el(0) = s0 < 4 ? a.el(s0) : b.el(s0-4);
57 r.el(1) = s1 < 4 ? a.el(s1) : b.el(s1-4);
58 r.el(2) = s2 < 4 ? a.el(s2) : b.el(s2-4);
59 r.el(3) = s3 < 4 ? a.el(s3) : b.el(s3-4);
60 return r;
61#elif SIMDPP_USE_SSE2
62 return sse_shuffle4x2_float32::do_shuffle<s0, s1, s2, s3>(a, b);
63#elif SIMDPP_USE_NEON_FLT_SP
64 return (float32<4>)detail::neon_shuffle_int32x4::shuffle4x2<s0, s1, s2, s3>(uint32<4>(a), uint32<4>(b));
65#elif SIMDPP_USE_ALTIVEC
66 uint32<4> mask = make_shuffle_bytes16_mask<s0, s1, s2, s3>(mask);
67 return shuffle_bytes16(a, b, mask);
68#elif SIMDPP_USE_MSA
69 uint32<4> mask = make_uint(s0,s1,s2,s3);
70 return (v4f32) __msa_vshf_w((v4i32) mask.native(),
71 (v4i32) b.native(),
72 (v4i32) a.native());
73#else
74 return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(int64<s0+4>, a, b);
75#endif
76}
77
#if SIMDPP_USE_AVX
// 8-element float32 overload, available when AVX is enabled. The work is
// delegated to sse_shuffle4x2_float32::do_shuffle with the same selectors.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
SIMDPP_INL float32<8> i_shuffle4x2(const float32<8>& a, const float32<8>& b)
{
    static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range");
    float32<8> result = sse_shuffle4x2_float32::do_shuffle<s0, s1, s2, s3>(a, b);
    return result;
}
#endif
86
#if SIMDPP_USE_AVX512F
// 16-element float32 overload, available when AVX512F is enabled. The work is
// delegated to sse_shuffle4x2_float32::do_shuffle with the same selectors.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
SIMDPP_INL float32<16> i_shuffle4x2(const float32<16>& a, const float32<16>& b)
{
    static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range");
    float32<16> result = sse_shuffle4x2_float32::do_shuffle<s0, s1, s2, s3>(a, b);
    return result;
}
#endif
95
// Generic-width float32 overload, used for any N that has no dedicated
// overload above. It hands the per-vector i_shuffle4x2<s0,s1,s2,s3> to
// SIMDPP_VEC_ARRAY_IMPL2 together with both operands.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N> SIMDPP_INL
float32<N> i_shuffle4x2(const float32<N>& a, const float32<N>& b)
{
    // NOTE(review): the macro comes from vector_array_macros.h (not visible
    // here); presumably it applies the operation to each native sub-vector
    // of a and b and returns the assembled float32<N> -- confirm.
    SIMDPP_VEC_ARRAY_IMPL2(float32<N>, (i_shuffle4x2<s0,s1,s2,s3>), a, b);
}
101
102// -----------------------------------------------------------------------------
103// float64
104
#if SIMDPP_USE_AVX
// 4-element float64 overload, available when AVX is enabled. The work is
// delegated to sse_shuffle4x2_float64::do_shuffle with the same selectors.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
SIMDPP_INL float64<4> i_shuffle4x2(const float64<4>& a, const float64<4>& b)
{
    static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range");
    float64<4> result = sse_shuffle4x2_float64::do_shuffle<s0, s1, s2, s3>(a, b);
    return result;
}
#endif
113
#if SIMDPP_USE_AVX512F
// 8-element float64 overload, available when AVX512F is enabled. The work is
// delegated to sse_shuffle4x2_float64::do_shuffle with the same selectors.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
SIMDPP_INL float64<8> i_shuffle4x2(const float64<8>& a, const float64<8>& b)
{
    static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range");
    float64<8> result = sse_shuffle4x2_float64::do_shuffle<s0, s1, s2, s3>(a, b);
    return result;
}
#endif
122
123template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N> SIMDPP_INL
124float64<N> i_shuffle4x2(const float64<N>& a, const float64<N>& b)
125{
126#if SIMDPP_USE_AVX
127 SIMDPP_VEC_ARRAY_IMPL2(float64<N>, (i_shuffle4x2<s0,s1,s2,s3>), a, b);
128#else
129 float64<N> r;
130 for (unsigned i = 0; i < float64<N>::vec_length; i+=2) {
131 r.vec(i*2) = i_shuffle_emul_64x4_half<s0,s1>(a.vec(i*2), a.vec(i*2+1),
132 b.vec(i*2), b.vec(i*2+1));
133 r.vec(i*2+1) = i_shuffle_emul_64x4_half<s2,s3>(a.vec(i*2), a.vec(i*2+1),
134 b.vec(i*2), b.vec(i*2+1));
135 }
136 return r;
137#endif
138}
139
// -----------------------------------------------------------------------------
// int32
141
142template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
143uint32<4> i_shuffle4x2(const uint32<4>& a, const uint32<4>& b)
144{
145 static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range");
146#if SIMDPP_USE_NULL
147 uint32<4> r;
148 r.el(0) = s0 < 4 ? a.el(s0) : b.el(s0-4);
149 r.el(1) = s1 < 4 ? a.el(s1) : b.el(s1-4);
150 r.el(2) = s2 < 4 ? a.el(s2) : b.el(s2-4);
151 r.el(3) = s3 < 4 ? a.el(s3) : b.el(s3-4);
152 return r;
153#elif SIMDPP_USE_SSE2
154 return sse_shuffle4x2_int32::do_shuffle<s0, s1, s2, s3>(a, b);
155#elif SIMDPP_USE_NEON
156 return detail::neon_shuffle_int32x4::shuffle4x2<s0, s1, s2, s3>(a, b);
157#elif SIMDPP_USE_ALTIVEC
158 uint32<4> mask = make_shuffle_bytes16_mask<s0, s1, s2, s3>(mask);
159 return shuffle_bytes16(a, b, mask);
160#elif SIMDPP_USE_MSA
161 uint32<4> mask = make_uint(s0,s1,s2,s3);
162 return (v4u32) __msa_vshf_w((v4i32) mask.native(),
163 (v4i32) b.native(),
164 (v4i32) a.native());
165#else
166 return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(int64<s0+4>, a, b);
167#endif
168}
169
#if SIMDPP_USE_AVX2
// 8-element uint32 overload, available when AVX2 is enabled. The work is
// delegated to sse_shuffle4x2_int32::do_shuffle with the same selectors.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
SIMDPP_INL uint32<8> i_shuffle4x2(const uint32<8>& a, const uint32<8>& b)
{
    static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range");
    uint32<8> result = sse_shuffle4x2_int32::do_shuffle<s0, s1, s2, s3>(a, b);
    return result;
}
#endif
178
#if SIMDPP_USE_AVX512F
// 16-element uint32 overload, available when AVX512F is enabled. The work is
// delegated to sse_shuffle4x2_int32::do_shuffle with the same selectors.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
SIMDPP_INL uint32<16> i_shuffle4x2(const uint32<16>& a, const uint32<16>& b)
{
    static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range");
    uint32<16> result = sse_shuffle4x2_int32::do_shuffle<s0, s1, s2, s3>(a, b);
    return result;
}
#endif
187
// Generic-width uint32 overload, used for any N that has no dedicated
// overload above. It hands the per-vector i_shuffle4x2<s0,s1,s2,s3> to
// SIMDPP_VEC_ARRAY_IMPL2 together with both operands.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N> SIMDPP_INL
uint32<N> i_shuffle4x2(const uint32<N>& a, const uint32<N>& b)
{
    // NOTE(review): the macro comes from vector_array_macros.h (not visible
    // here); presumably it applies the operation to each native sub-vector
    // of a and b and returns the assembled uint32<N> -- confirm.
    SIMDPP_VEC_ARRAY_IMPL2(uint32<N>, (i_shuffle4x2<s0,s1,s2,s3>), a, b);
}
193
194// -----------------------------------------------------------------------------
195// int64
196
#if SIMDPP_USE_AVX2
// 4-element uint64 overload, available when AVX2 is enabled. The work is
// delegated to sse_shuffle4x2_int64::do_shuffle with the same selectors.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
SIMDPP_INL uint64<4> i_shuffle4x2(const uint64<4>& a, const uint64<4>& b)
{
    static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range");
    uint64<4> result = sse_shuffle4x2_int64::do_shuffle<s0, s1, s2, s3>(a, b);
    return result;
}
#endif
205
#if SIMDPP_USE_AVX512F
// 8-element uint64 overload, available when AVX512F is enabled. The work is
// delegated to sse_shuffle4x2_int64::do_shuffle with the same selectors.
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
SIMDPP_INL uint64<8> i_shuffle4x2(const uint64<8>& a, const uint64<8>& b)
{
    static_assert(s0 < 8 && s1 < 8 && s2 < 8 && s3 < 8, "Selector out of range");
    uint64<8> result = sse_shuffle4x2_int64::do_shuffle<s0, s1, s2, s3>(a, b);
    return result;
}
#endif
214
215template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N> SIMDPP_INL
216uint64<N> i_shuffle4x2(const uint64<N>& a, const uint64<N>& b)
217{
218#if SIMDPP_USE_AVX2
219 SIMDPP_VEC_ARRAY_IMPL2(uint64<N>, (i_shuffle4x2<s0,s1,s2,s3>), a, b);
220#else
221 uint64<N> r;
222 for (unsigned i = 0; i < uint64<N>::vec_length; i+=2) {
223 r.vec(i*2) = i_shuffle_emul_64x4_half<s0,s1>(a.vec(i*2), a.vec(i*2+1),
224 b.vec(i*2), b.vec(i*2+1));
225 r.vec(i*2+1) = i_shuffle_emul_64x4_half<s2,s3>(a.vec(i*2), a.vec(i*2+1),
226 b.vec(i*2), b.vec(i*2+1));
227 }
228 return r;
229#endif
230}
231
232
233} // namespace insn
234} // namespace detail
235} // namespace SIMDPP_ARCH_NAMESPACE
236} // namespace simdpp
237
238#endif
239
240