1 | /* Copyright (C) 2012-2014 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_SHUFFLE128_H |
9 | #define LIBSIMDPP_SIMDPP_DETAIL_INSN_SHUFFLE128_H |
10 | |
11 | #ifndef LIBSIMDPP_SIMD_H |
12 | #error "This file must be included through simd.h" |
13 | #endif |
14 | |
15 | #include <simdpp/types.h> |
16 | |
17 | namespace simdpp { |
18 | namespace SIMDPP_ARCH_NAMESPACE { |
19 | namespace detail { |
20 | |
21 | /** Shuffles 128 bit parts within the vectors. |
22 | |
23 | @code |
24 | switch(s0): |
25 | case 0: r[0..127] = a[0..127] |
26 | case 1: r[0..127] = a[128..255] |
27 | case 2: r[0..127] = a[256..383] |
28 | case 3: r[0..127] = a[384..511] |
29 | |
30 | switch(s1): |
31 | case 0: r[128..255] = a[0..127] |
32 | case 1: r[128..255] = a[128..255] |
33 | case 2: r[128..255] = a[256..383] |
34 | case 3: r[128..255] = a[384..511] |
35 | |
36 | switch(s2): |
37 | case 0: r[256..383] = b[0..127] |
38 | case 1: r[256..383] = b[128..255] |
39 | case 2: r[256..383] = b[256..383] |
40 | case 3: r[256..383] = b[384..511] |
41 | |
42 | switch(s3): |
43 | case 0: r[384..511] = b[0..127] |
44 | case 1: r[384..511] = b[128..255] |
45 | case 2: r[384..511] = b[256..383] |
46 | case 3: r[384..511] = b[384..511] |
47 | @endcode |
48 | */ |
49 | #if SIMDPP_USE_AVX512BW |
50 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
51 | uint8<64> shuffle2_128(const uint8<64>& a, const uint8<64>& b) |
52 | { |
53 | static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range" ); |
54 | return _mm512_shuffle_i32x4(a.native(), b.native(), (s3<<6) + (s2<<4) + (s1<<2) + s0); |
55 | } |
56 | |
57 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
58 | uint16<32> shuffle2_128(const uint16<32>& a, const uint16<32>& b) |
59 | { |
60 | static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range" ); |
61 | return _mm512_shuffle_i32x4(a.native(), b.native(), (s3<<6) + (s2<<4) + (s1<<2) + s0); |
62 | } |
63 | #endif |
64 | #if SIMDPP_USE_AVX512F |
65 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
66 | uint32<16> shuffle2_128(const uint32<16>& a, const uint32<16>& b) |
67 | { |
68 | static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range" ); |
69 | return _mm512_shuffle_i32x4(a.native(), b.native(), (s3<<6) + (s2<<4) + (s1<<2) + s0); |
70 | } |
71 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
72 | uint64<8> shuffle2_128(const uint64<8>& a, const uint64<8>& b) |
73 | { |
74 | static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range" ); |
75 | return _mm512_shuffle_i64x2(a.native(), b.native(), (s3<<6) + (s2<<4) + (s1<<2) + s0); |
76 | } |
77 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
78 | float32<16> shuffle2_128(const float32<16>& a, const float32<16>& b) |
79 | { |
80 | static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range" ); |
81 | return _mm512_shuffle_f32x4(a.native(), b.native(), (s3<<6) + (s2<<4) + (s1<<2) + s0); |
82 | } |
83 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
84 | float64<8> shuffle2_128(const float64<8>& a, const float64<8>& b) |
85 | { |
86 | static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range" ); |
87 | return _mm512_shuffle_f64x2(a.native(), b.native(), (s3<<6) + (s2<<4) + (s1<<2) + s0); |
88 | } |
89 | #endif |
90 | |
91 | /** Shuffles 128 bit parts within the vectors. |
92 | |
For vectors larger than 256 bits, the same operation is applied to each
256-bit element.
95 | |
96 | @code |
97 | switch(s0): |
98 | case 0: r[0..127] = a[0..127] |
99 | case 1: r[0..127] = a[128..255] |
100 | |
101 | switch(s1): |
102 | case 0: r[128..255] = b[0..127] |
103 | case 1: r[128..255] = b[128..255] |
104 | @endcode |
105 | */ |
106 | template<unsigned s0, unsigned s1> SIMDPP_INL |
107 | uint8x32 shuffle1_128(const uint8x32& a, const uint8x32& b) |
108 | { |
109 | static_assert(s0 < 2 && s1 < 2, "Selector out of range" ); |
110 | #if SIMDPP_USE_AVX2 |
111 | return _mm256_permute2x128_si256(a.native(), b.native(), ((s1+2)<<4) + s0); |
112 | #else |
113 | uint8x32 r; |
114 | r.vec(0) = a.vec(s0); |
115 | r.vec(1) = b.vec(s1); |
116 | return r; |
117 | #endif |
118 | } |
119 | template<unsigned s0, unsigned s1> SIMDPP_INL |
120 | uint16x16 shuffle1_128(const uint16x16& a, const uint16x16& b) { return (uint16x16)shuffle1_128<s0,s1>(uint8x32(a), uint8x32(b)); } |
121 | template<unsigned s0, unsigned s1> SIMDPP_INL |
122 | uint32x8 shuffle1_128(const uint32x8& a, const uint32x8& b) { return (uint32x8)shuffle1_128<s0,s1>(uint8x32(a), uint8x32(b)); } |
123 | template<unsigned s0, unsigned s1> SIMDPP_INL |
124 | uint64x4 shuffle1_128(const uint64x4& a, const uint64x4& b) { return (uint64x4)shuffle1_128<s0,s1>(uint8x32(a), uint8x32(b)); } |
125 | |
126 | template<unsigned s0, unsigned s1> SIMDPP_INL |
127 | float32x8 shuffle1_128(const float32x8& a, const float32x8& b) |
128 | { |
129 | static_assert(s0 < 2 && s1 < 2, "Selector out of range" ); |
130 | #if SIMDPP_USE_AVX |
131 | return _mm256_permute2f128_ps(a.native(), b.native(), ((s1+2)<<4) + s0); |
132 | #else |
133 | float32x8 r; |
134 | r.vec(0) = a.vec(s0); |
135 | r.vec(1) = b.vec(s1); |
136 | return r; |
137 | #endif |
138 | } |
139 | template<unsigned s0, unsigned s1> SIMDPP_INL |
140 | float64x4 shuffle1_128(const float64x4& a, const float64x4& b) |
141 | { |
142 | static_assert(s0 < 2 && s1 < 2, "Selector out of range" ); |
143 | #if SIMDPP_USE_AVX |
144 | return _mm256_permute2f128_pd(a.native(), b.native(), ((s1+2)<<4) + s0); |
145 | #else |
146 | float64x4 r; |
147 | r.vec(0) = a.vec(s0); |
148 | r.vec(1) = b.vec(s1); |
149 | return r; |
150 | #endif |
151 | } |
152 | |
153 | #if SIMDPP_USE_AVX512F |
154 | template<unsigned s0, unsigned s1> SIMDPP_INL |
155 | uint32<16> shuffle1_128(const uint32<16>& a, const uint32<16>& b) |
156 | { |
157 | static_assert(s0 < 2 && s1 < 2, "Selector out of range" ); |
158 | return shuffle2_128<s0,s1,s0+2,s1+2>(a, b); |
159 | } |
160 | |
161 | template<unsigned s0, unsigned s1> SIMDPP_INL |
162 | uint64<8> shuffle1_128(const uint64<8>& a, const uint64<8>& b) |
163 | { |
164 | static_assert(s0 < 2 && s1 < 2, "Selector out of range" ); |
165 | return shuffle2_128<s0,s1,s0+2,s1+2>(a, b); |
166 | } |
167 | |
168 | template<unsigned s0, unsigned s1> SIMDPP_INL |
169 | float32<16> shuffle1_128(const float32<16>& a, const float32<16>& b) |
170 | { |
171 | static_assert(s0 < 2 && s1 < 2, "Selector out of range" ); |
172 | return shuffle2_128<s0,s1,s0+2,s1+2>(a, b); |
173 | } |
174 | |
175 | template<unsigned s0, unsigned s1> SIMDPP_INL |
176 | float64<8> shuffle1_128(const float64<8>& a, const float64<8>& b) |
177 | { |
178 | static_assert(s0 < 2 && s1 < 2, "Selector out of range" ); |
179 | return shuffle2_128<s0,s1,s0+2,s1+2>(a, b); |
180 | } |
181 | #endif |
182 | |
183 | #if SIMDPP_USE_AVX512F |
184 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, class V> SIMDPP_INL |
185 | V permute4_128(const V& a) |
186 | { |
187 | return shuffle2_128<s0,s1,s2,s3>(a, a); |
188 | } |
189 | #endif |
190 | |
191 | } // namespace detail |
192 | } // namespace SIMDPP_ARCH_NAMESPACE |
193 | } // namespace simdpp |
194 | |
195 | #endif |
196 | |
197 | |