1/* Copyright (C) 2012-2014 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_SHUFFLE128_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_SHUFFLE128_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/types.h>
16
17namespace simdpp {
18namespace SIMDPP_ARCH_NAMESPACE {
19namespace detail {
20
21/** Shuffles 128 bit parts within the vectors.
22
23@code
24 switch(s0):
25 case 0: r[0..127] = a[0..127]
26 case 1: r[0..127] = a[128..255]
27 case 2: r[0..127] = a[256..383]
28 case 3: r[0..127] = a[384..511]
29
30 switch(s1):
31 case 0: r[128..255] = a[0..127]
32 case 1: r[128..255] = a[128..255]
33 case 2: r[128..255] = a[256..383]
34 case 3: r[128..255] = a[384..511]
35
36 switch(s2):
37 case 0: r[256..383] = b[0..127]
38 case 1: r[256..383] = b[128..255]
39 case 2: r[256..383] = b[256..383]
40 case 3: r[256..383] = b[384..511]
41
42 switch(s3):
43 case 0: r[384..511] = b[0..127]
44 case 1: r[384..511] = b[128..255]
45 case 2: r[384..511] = b[256..383]
46 case 3: r[384..511] = b[384..511]
47@endcode
48*/
49#if SIMDPP_USE_AVX512BW
50template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
51uint8<64> shuffle2_128(const uint8<64>& a, const uint8<64>& b)
52{
53 static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range");
54 return _mm512_shuffle_i32x4(a.native(), b.native(), (s3<<6) + (s2<<4) + (s1<<2) + s0);
55}
56
57template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
58uint16<32> shuffle2_128(const uint16<32>& a, const uint16<32>& b)
59{
60 static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range");
61 return _mm512_shuffle_i32x4(a.native(), b.native(), (s3<<6) + (s2<<4) + (s1<<2) + s0);
62}
63#endif
64#if SIMDPP_USE_AVX512F
65template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
66uint32<16> shuffle2_128(const uint32<16>& a, const uint32<16>& b)
67{
68 static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range");
69 return _mm512_shuffle_i32x4(a.native(), b.native(), (s3<<6) + (s2<<4) + (s1<<2) + s0);
70}
71template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
72uint64<8> shuffle2_128(const uint64<8>& a, const uint64<8>& b)
73{
74 static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range");
75 return _mm512_shuffle_i64x2(a.native(), b.native(), (s3<<6) + (s2<<4) + (s1<<2) + s0);
76}
77template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
78float32<16> shuffle2_128(const float32<16>& a, const float32<16>& b)
79{
80 static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range");
81 return _mm512_shuffle_f32x4(a.native(), b.native(), (s3<<6) + (s2<<4) + (s1<<2) + s0);
82}
83template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
84float64<8> shuffle2_128(const float64<8>& a, const float64<8>& b)
85{
86 static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range");
87 return _mm512_shuffle_f64x2(a.native(), b.native(), (s3<<6) + (s2<<4) + (s1<<2) + s0);
88}
89#endif
90
91/** Shuffles 128 bit parts within the vectors.
92
    For vectors larger than 256 bits the same operation is applied to each
    256-bit element.
95
96@code
97 switch(s0):
98 case 0: r[0..127] = a[0..127]
99 case 1: r[0..127] = a[128..255]
100
101 switch(s1):
102 case 0: r[128..255] = b[0..127]
103 case 1: r[128..255] = b[128..255]
104@endcode
105*/
106template<unsigned s0, unsigned s1> SIMDPP_INL
107uint8x32 shuffle1_128(const uint8x32& a, const uint8x32& b)
108{
109 static_assert(s0 < 2 && s1 < 2, "Selector out of range");
110#if SIMDPP_USE_AVX2
111 return _mm256_permute2x128_si256(a.native(), b.native(), ((s1+2)<<4) + s0);
112#else
113 uint8x32 r;
114 r.vec(0) = a.vec(s0);
115 r.vec(1) = b.vec(s1);
116 return r;
117#endif
118}
119template<unsigned s0, unsigned s1> SIMDPP_INL
120uint16x16 shuffle1_128(const uint16x16& a, const uint16x16& b) { return (uint16x16)shuffle1_128<s0,s1>(uint8x32(a), uint8x32(b)); }
121template<unsigned s0, unsigned s1> SIMDPP_INL
122uint32x8 shuffle1_128(const uint32x8& a, const uint32x8& b) { return (uint32x8)shuffle1_128<s0,s1>(uint8x32(a), uint8x32(b)); }
123template<unsigned s0, unsigned s1> SIMDPP_INL
124uint64x4 shuffle1_128(const uint64x4& a, const uint64x4& b) { return (uint64x4)shuffle1_128<s0,s1>(uint8x32(a), uint8x32(b)); }
125
126template<unsigned s0, unsigned s1> SIMDPP_INL
127float32x8 shuffle1_128(const float32x8& a, const float32x8& b)
128{
129 static_assert(s0 < 2 && s1 < 2, "Selector out of range");
130#if SIMDPP_USE_AVX
131 return _mm256_permute2f128_ps(a.native(), b.native(), ((s1+2)<<4) + s0);
132#else
133 float32x8 r;
134 r.vec(0) = a.vec(s0);
135 r.vec(1) = b.vec(s1);
136 return r;
137#endif
138}
139template<unsigned s0, unsigned s1> SIMDPP_INL
140float64x4 shuffle1_128(const float64x4& a, const float64x4& b)
141{
142 static_assert(s0 < 2 && s1 < 2, "Selector out of range");
143#if SIMDPP_USE_AVX
144 return _mm256_permute2f128_pd(a.native(), b.native(), ((s1+2)<<4) + s0);
145#else
146 float64x4 r;
147 r.vec(0) = a.vec(s0);
148 r.vec(1) = b.vec(s1);
149 return r;
150#endif
151}
152
153#if SIMDPP_USE_AVX512F
154template<unsigned s0, unsigned s1> SIMDPP_INL
155uint32<16> shuffle1_128(const uint32<16>& a, const uint32<16>& b)
156{
157 static_assert(s0 < 2 && s1 < 2, "Selector out of range");
158 return shuffle2_128<s0,s1,s0+2,s1+2>(a, b);
159}
160
161template<unsigned s0, unsigned s1> SIMDPP_INL
162uint64<8> shuffle1_128(const uint64<8>& a, const uint64<8>& b)
163{
164 static_assert(s0 < 2 && s1 < 2, "Selector out of range");
165 return shuffle2_128<s0,s1,s0+2,s1+2>(a, b);
166}
167
168template<unsigned s0, unsigned s1> SIMDPP_INL
169float32<16> shuffle1_128(const float32<16>& a, const float32<16>& b)
170{
171 static_assert(s0 < 2 && s1 < 2, "Selector out of range");
172 return shuffle2_128<s0,s1,s0+2,s1+2>(a, b);
173}
174
175template<unsigned s0, unsigned s1> SIMDPP_INL
176float64<8> shuffle1_128(const float64<8>& a, const float64<8>& b)
177{
178 static_assert(s0 < 2 && s1 < 2, "Selector out of range");
179 return shuffle2_128<s0,s1,s0+2,s1+2>(a, b);
180}
181#endif
182
183#if SIMDPP_USE_AVX512F
184template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, class V> SIMDPP_INL
185V permute4_128(const V& a)
186{
187 return shuffle2_128<s0,s1,s2,s3>(a, a);
188}
189#endif
190
191} // namespace detail
192} // namespace SIMDPP_ARCH_NAMESPACE
193} // namespace simdpp
194
195#endif
196
197