/* Copyright (C) 2014 Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_PERMUTE_SSE_INT32_4x2_H
#define LIBSIMDPP_SIMDPP_DETAIL_PERMUTE_SSE_INT32_4x2_H

#ifndef LIBSIMDPP_SIMD_H
#error "This file must be included through simd.h"
#endif

#include <simdpp/setup_arch.h>
#include <simdpp/types.h>
#include <simdpp/core/blend.h>
#include <simdpp/detail/shuffle/shuffle_mask.h>
#include <simdpp/detail/shuffle/sse_float32_4x2.h>

#if SIMDPP_USE_SSE2

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace sse_shuffle4x2_int32 {

/*  The code below implements generalized permutations that select four
    elements from a pair of uint32 vectors. Each selector s0..s3 is in the
    range [0..7]: values 0..3 select elements of the first vector, values
    4..7 select elements of the second one. On vectors wider than 128 bits
    the permutation is applied to each 128-bit lane independently.
*/
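/*  For example, within each 128-bit lane, do_shuffle<0,4,1,5>(a, b) yields
    { a0, b0, a1, b1 } (the zip_lo pattern) and do_shuffle<0,5,2,7>(a, b)
    yields { a0, b1, a2, b3 } (an element-wise blend).
*/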

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
struct impl_selector {

    // Selector numbering within each 128-bit lane:
    //   first vector (a):  0 1 2 3
    //   second vector (b): 4 5 6 7
    static const bool is1_zip_lo1 = (s0==0 && s1==4 && s2==1 && s3==5);
    static const bool is2_zip_lo2 = (s0==4 && s1==0 && s2==5 && s3==1);
    static const bool is3_zip_hi1 = (s0==2 && s1==6 && s2==3 && s3==7);
    static const bool is4_zip_hi2 = (s0==6 && s1==2 && s2==7 && s3==3);
#if SIMDPP_USE_SSE4_1
    static const bool is5_blend = (s0==0 || s0==4) && (s1==1 || s1==5) &&
                                  (s2==2 || s2==6) && (s3==3 || s3==7);
#else
    static const bool is5_blend = false;
#endif
#if SIMDPP_USE_SSSE3
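    // The selectors are unsigned, so when s1, s2 or s3 is 0 the subtraction
    // below wraps around and the comparison is simply false.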
    static const bool is6_align = (s0==s1-1) && (s1==s2-1) && (s2==s3-1);
#else
    static const bool is6_align = false;
#endif

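    // Pick the cheapest matching implementation; impl 7 is the generic
    // fallback that handles any selector combination.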
    static const int impl = is1_zip_lo1 ? 1 :
                            is2_zip_lo2 ? 2 :
                            is3_zip_hi1 ? 3 :
                            is4_zip_hi2 ? 4 :
                            is5_blend ? 5 :
                            is6_align ? 6 : 7;
};

template<unsigned N> struct shuffle_impl {};

// zip_lo1: within each 128-bit lane, r = { a0, b0, a1, b1 }
template<> struct shuffle_impl<1> {
    template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
    static uint32<4> run(const uint32<4>& a, const uint32<4>& b)
    {
        return _mm_unpacklo_epi32(a.native(), b.native());
    }
#if SIMDPP_USE_AVX2
    template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
    static uint32<8> run(const uint32<8>& a, const uint32<8>& b)
    {
        return _mm256_unpacklo_epi32(a.native(), b.native());
    }
#endif
#if SIMDPP_USE_AVX512F
    template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
    static uint32<16> run(const uint32<16>& a, const uint32<16>& b)
    {
        return _mm512_unpacklo_epi32(a.native(), b.native());
    }
#endif
};

// zip_lo2: within each 128-bit lane, r = { b0, a0, b1, a1 }
template<> struct shuffle_impl<2> {
    template<unsigned, unsigned, unsigned, unsigned, unsigned N> SIMDPP_INL
    static uint32<N> run(const uint32<N>& a, const uint32<N>& b)
    {
        return shuffle_impl<1>::run<0,0,0,0>(b, a);
    }
};

// zip_hi1: within each 128-bit lane, r = { a2, b2, a3, b3 }
template<> struct shuffle_impl<3> {
    template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
    static uint32<4> run(const uint32<4>& a, const uint32<4>& b)
    {
        return _mm_unpackhi_epi32(a.native(), b.native());
    }
#if SIMDPP_USE_AVX2
    template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
    static uint32<8> run(const uint32<8>& a, const uint32<8>& b)
    {
        return _mm256_unpackhi_epi32(a.native(), b.native());
    }
#endif
#if SIMDPP_USE_AVX512F
    template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
    static uint32<16> run(const uint32<16>& a, const uint32<16>& b)
    {
        return _mm512_unpackhi_epi32(a.native(), b.native());
    }
#endif
};

// zip_hi2: within each 128-bit lane, r = { b2, a2, b3, a3 }
template<> struct shuffle_impl<4> {
    template<unsigned, unsigned, unsigned, unsigned, unsigned N> SIMDPP_INL
    static uint32<N> run(const uint32<N>& a, const uint32<N>& b)
    {
        return shuffle_impl<3>::run<0,0,0,0>(b, a);
    }
};

// blend: each result element is taken from either a or b at its own position
#if SIMDPP_USE_SSE4_1
template<> struct shuffle_impl<5> {
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static uint32<4> run(const uint32<4>& a, const uint32<4>& b)
    {
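        // _mm_blend_epi16 selects 16-bit units, so each 32-bit element
        // takes two adjacent mask bits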
        const unsigned mask = (s0<4 ? 0 : 0x03) | (s1<4 ? 0 : 0x0c) |
                              (s2<4 ? 0 : 0x30) | (s3<4 ? 0 : 0xc0);
        return _mm_blend_epi16(a.native(), b.native(), mask);
    }
#if SIMDPP_USE_AVX2
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static uint32<8> run(const uint32<8>& a, const uint32<8>& b)
    {
        const unsigned mask = (s0<4 ? 0 : 1) | (s1<4 ? 0 : 2) |
                              (s2<4 ? 0 : 4) | (s3<4 ? 0 : 8);
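        // _mm256_blend_epi32 uses one mask bit per element; replicate the
        // 4-bit lane mask to both 128-bit lanes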
        return _mm256_blend_epi32(a.native(), b.native(), mask | mask << 4);
    }
#endif
#if SIMDPP_USE_AVX512F
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static uint32<16> run(const uint32<16>& a, const uint32<16>& b)
    {
        const unsigned mask = (s0<4 ? 0 : 1) | (s1<4 ? 0 : 2) |
                              (s2<4 ? 0 : 4) | (s3<4 ? 0 : 8);
        const unsigned mask2 = mask | mask << 4 | mask << 8 | mask << 12;
        return _mm512_mask_blend_epi32(mask2, a.native(), b.native());
    }
#endif
};
#endif

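// align: the selectors are consecutive, i.e. s0+0, s0+1, s0+2, s0+3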
#if SIMDPP_USE_SSSE3
template<> struct shuffle_impl<6> {
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static uint32<4> run(const uint32<4>& a, const uint32<4>& b)
    {
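        // the shift amount of _mm_alignr_epi8 is in bytes: s0 elements of
        // 4 bytes each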
        return _mm_alignr_epi8(b.native(), a.native(), s0*4);
    }
#if SIMDPP_USE_AVX2
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static uint32<8> run(const uint32<8>& a, const uint32<8>& b)
    {
        return _mm256_alignr_epi8(b.native(), a.native(), s0*4);
    }
#endif
#if SIMDPP_USE_AVX512F
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static uint32<16> run(const uint32<16>& a, const uint32<16>& b)
    {
        // note that _mm512_alignr_epi32 rotates across the entire 512-bit
        // vector rather than within each 128-bit lane, so the per-lane
        // align is emulated by rotating a and b separately and merging
        // them with a per-lane element mask
        __m512i ap = _mm512_alignr_epi32(a.native(), a.native(), s0);
        const int mask = SIMDPP_SHUFFLE_MASK_4x2_4(s0>3, s0>2, s0>1, s0>0);
        return _mm512_mask_alignr_epi32(ap, mask, b.native(), b.native(), (s0+12)%16);
    }
#endif
};
#endif

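// generic fallback: permute both vectors into place, then blend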
template<> struct shuffle_impl<7> {
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static uint32<4> run(const uint32<4>& a, const uint32<4>& b)
    {
#if SIMDPP_USE_AVX2
        const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0%4, s1%4, s2%4, s3%4);
        __m128i pa = _mm_shuffle_epi32(a.native(), mask);
        __m128i pb = _mm_shuffle_epi32(b.native(), mask);
        return _mm_blend_epi32(pa, pb, SIMDPP_SHUFFLE_MASK_4x2(s0/4,s1/4,s2/4,s3/4));
#elif SIMDPP_USE_SSE4_1
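        // there is no 32-bit blend before AVX2; _mm_blend_epi16 takes two
        // mask bits per 32-bit element, hence the 0x3 multiplier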
        const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0%4, s1%4, s2%4, s3%4);
        __m128i pa = _mm_shuffle_epi32(a.native(), mask);
        __m128i pb = _mm_shuffle_epi32(b.native(), mask);
        return _mm_blend_epi16(pa, pb, SIMDPP_SHUFFLE_MASK_4x4(s0/4*0x3,s1/4*0x3,s2/4*0x3,s3/4*0x3));
#else
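        // plain SSE2: gather the candidate element pairs from a and b with
        // two shuffles, then select the correct source for each position
        // with a third shuffle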
        __m128 na = _mm_castsi128_ps(a.native());
        __m128 nb = _mm_castsi128_ps(b.native());
        __m128 ab1 = _mm_shuffle_ps(na, nb, _MM_SHUFFLE(s1%4, s0%4, s1%4, s0%4));
        __m128 ab2 = _mm_shuffle_ps(na, nb, _MM_SHUFFLE(s3%4, s2%4, s3%4, s2%4));
        float32<4> r = _mm_shuffle_ps(ab1, ab2, _MM_SHUFFLE(s3/4?3:1, s2/4?2:0, s1/4?3:1, s0/4?2:0));
        return uint32<4>(r);
#endif
    }
#if SIMDPP_USE_AVX2
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static uint32<8> run(const uint32<8>& a, const uint32<8>& b)
    {
        const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0%4, s1%4, s2%4, s3%4);
        __m256i pa = _mm256_shuffle_epi32(a.native(), mask);
        __m256i pb = _mm256_shuffle_epi32(b.native(), mask);
        return _mm256_blend_epi32(pa, pb, SIMDPP_SHUFFLE_MASK_4x2_2(s0/4,s1/4,s2/4,s3/4));
    }
#endif
#if SIMDPP_USE_AVX512F
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static uint32<16> run(const uint32<16>& a, const uint32<16>& b)
    {
        const unsigned shuf_mask = SIMDPP_SHUFFLE_MASK_4x4(s0%4,s1%4,s2%4,s3%4);
        __m512i ap = _mm512_shuffle_epi32(a.native(), _MM_PERM_ENUM(shuf_mask));
        const int mask = SIMDPP_SHUFFLE_MASK_4x2_4(s0/4,s1/4,s2/4,s3/4);
        return _mm512_mask_shuffle_epi32(ap, mask, b.native(),
                                         _MM_PERM_ENUM(shuf_mask));
    }
#endif
};

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N> SIMDPP_INL
uint32<N> do_shuffle(const uint32<N>& a, const uint32<N>& b)
{
    return shuffle_impl<impl_selector<s0, s1, s2, s3>::impl>::template run<s0, s1, s2, s3>(a, b);
}

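/*  Usage sketch (illustrative only):

        uint32<4> r = do_shuffle<0,4,1,5>(a, b); // r = { a0, b0, a1, b1 }

    The selector pattern above matches is1_zip_lo1, so the call dispatches
    to shuffle_impl<1> and compiles to a single unpacklo instruction.
*/
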
} // namespace sse_shuffle4x2_int32
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif // SIMDPP_USE_SSE2
#endif // LIBSIMDPP_SIMDPP_DETAIL_PERMUTE_SSE_INT32_4x2_H