/*  Copyright (C) 2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_PERMUTE_SSE_INT32_4x2_H
#define LIBSIMDPP_SIMDPP_DETAIL_PERMUTE_SSE_INT32_4x2_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/setup_arch.h>
#include <simdpp/types.h>
#include <simdpp/core/blend.h>
#include <simdpp/detail/shuffle/shuffle_mask.h>
#include <simdpp/detail/shuffle/sse_float32_4x2.h>

#if SIMDPP_USE_SSE2

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace sse_shuffle4x2_int32 {

/*  The code below implements generalized permutations that select 4 elements
    from a pair of uint32 vectors. Selector values 0..3 refer to the elements
    of the first source vector, values 4..7 to the elements of the second.
*/
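/*  For example, do_shuffle<0,4,1,5>(a, b) returns { a0, b0, a1, b1 }. The
    impl_selector below recognizes this as the zip_lo pattern, so the shuffle
    compiles to a single unpack instruction instead of a generic
    shuffle-and-blend sequence.
*/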

template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
struct impl_selector {

    // Positions within the concatenation of the two source vectors:
    //  first vector:  0 1 2 3
    //  second vector: 4 5 6 7
    static const bool is1_zip_lo1 = (s0==0 && s1==4 && s2==1 && s3==5);
    static const bool is2_zip_lo2 = (s0==4 && s1==0 && s2==5 && s3==1);
    static const bool is3_zip_hi1 = (s0==2 && s1==6 && s2==3 && s3==7);
    static const bool is4_zip_hi2 = (s0==6 && s1==2 && s2==7 && s3==3);
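    // Blend: each result element stays in its position and only the source
    // vector differs, so a single blend instruction suffices.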
#if SIMDPP_USE_SSE4_1
    static const bool is5_blend = (s0==0 || s0==4) && (s1==1 || s1==5) &&
                                  (s2==2 || s2==6) && (s3==3 || s3==7);
#else
    static const bool is5_blend = false;
#endif
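    // Align: the selected elements are consecutive within the concatenation
    // of the two vectors and can be extracted with a single palignr.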
#if SIMDPP_USE_SSSE3
    static const bool is6_align = (s0==s1-1) && (s1==s2-1) && (s2==s3-1);
#else
    static const bool is6_align = false;
#endif

    static const int impl = is1_zip_lo1 ? 1 :
                            is2_zip_lo2 ? 2 :
                            is3_zip_hi1 ? 3 :
                            is4_zip_hi2 ? 4 :
                            is5_blend ? 5 :
                            is6_align ? 6 : 7;
};

template<unsigned N> struct shuffle_impl {};

// zip_lo1
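// Interleaves the low halves of the two sources: { a0, b0, a1, b1 }.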
template<> struct shuffle_impl<1> {
    template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
    static uint32<4> run(const uint32<4>& a, const uint32<4>& b)
    {
        return _mm_unpacklo_epi32(a.native(), b.native());
    }
#if SIMDPP_USE_AVX2
    template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
    static uint32<8> run(const uint32<8>& a, const uint32<8>& b)
    {
        return _mm256_unpacklo_epi32(a.native(), b.native());
    }
#endif
#if SIMDPP_USE_AVX512F
    template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
    static uint32<16> run(const uint32<16>& a, const uint32<16>& b)
    {
        return _mm512_unpacklo_epi32(a.native(), b.native());
    }
#endif
};

// zip_lo2
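// Same as zip_lo1 with the sources swapped: { b0, a0, b1, a1 }. The selector
// values are unused by shuffle_impl<1>::run, so zeros are passed through.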
template<> struct shuffle_impl<2> {
    template<unsigned, unsigned, unsigned, unsigned, unsigned N> SIMDPP_INL
    static uint32<N> run(const uint32<N>& a, const uint32<N>& b)
    {
        return shuffle_impl<1>::run<0,0,0,0>(b, a);
    }
};

// zip_hi1
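// Interleaves the high halves of the two sources: { a2, b2, a3, b3 }.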
template<> struct shuffle_impl<3> {
    template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
    static uint32<4> run(const uint32<4>& a, const uint32<4>& b)
    {
        return _mm_unpackhi_epi32(a.native(), b.native());
    }
#if SIMDPP_USE_AVX2
    template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
    static uint32<8> run(const uint32<8>& a, const uint32<8>& b)
    {
        return _mm256_unpackhi_epi32(a.native(), b.native());
    }
#endif
#if SIMDPP_USE_AVX512F
    template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
    static uint32<16> run(const uint32<16>& a, const uint32<16>& b)
    {
        return _mm512_unpackhi_epi32(a.native(), b.native());
    }
#endif
};

// zip_hi2
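// Same as zip_hi1 with the sources swapped: { b2, a2, b3, a3 }.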
template<> struct shuffle_impl<4> {
    template<unsigned, unsigned, unsigned, unsigned, unsigned N> SIMDPP_INL
    static uint32<N> run(const uint32<N>& a, const uint32<N>& b)
    {
        return shuffle_impl<3>::run<0,0,0,0>(b, a);
    }
};

// is5_blend
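// Each result element keeps its position; the immediate mask selects the
// source. _mm_blend_epi16 works on 16-bit lanes, so every 32-bit element
// takes two mask bits; the 32-bit blends take one bit per element, with the
// 4-bit pattern repeated for each 128-bit lane of the wider vectors.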
#if SIMDPP_USE_SSE4_1
template<> struct shuffle_impl<5> {
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static uint32<4> run(const uint32<4>& a, const uint32<4>& b)
    {
        const unsigned mask = (s0<4 ? 0 : 0x03) | (s1<4 ? 0 : 0x0c) |
                              (s2<4 ? 0 : 0x30) | (s3<4 ? 0 : 0xc0);
        return _mm_blend_epi16(a.native(), b.native(), mask);
    }
#if SIMDPP_USE_AVX2
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static uint32<8> run(const uint32<8>& a, const uint32<8>& b)
    {
        const unsigned mask = (s0<4 ? 0 : 1) | (s1<4 ? 0 : 2) |
                              (s2<4 ? 0 : 4) | (s3<4 ? 0 : 8);
        return _mm256_blend_epi32(a.native(), b.native(), mask | mask << 4);
    }
#endif
#if SIMDPP_USE_AVX512F
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static uint32<16> run(const uint32<16>& a, const uint32<16>& b)
    {
        const unsigned mask = (s0<4 ? 0 : 1) | (s1<4 ? 0 : 2) |
                              (s2<4 ? 0 : 4) | (s3<4 ? 0 : 8);
        const unsigned mask2 = mask | mask << 4 | mask << 8 | mask << 12;
        return _mm512_mask_blend_epi32(mask2, a.native(), b.native());
    }
#endif
};
#endif

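// is6_align
// palignr shifts the concatenation b:a (with a in the low half) right by a
// byte count, so a run of consecutive selector values starting at s0 becomes
// a single shift by s0*4 bytes. _mm256_alignr_epi8 shifts within each 128-bit
// lane, which matches the per-lane semantics of this shuffle.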
#if SIMDPP_USE_SSSE3
template<> struct shuffle_impl<6> {
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static uint32<4> run(const uint32<4>& a, const uint32<4>& b)
    {
        return _mm_alignr_epi8(b.native(), a.native(), s0*4);
    }
#if SIMDPP_USE_AVX2
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static uint32<8> run(const uint32<8>& a, const uint32<8>& b)
    {
        return _mm256_alignr_epi8(b.native(), a.native(), s0*4);
    }
#endif
#if SIMDPP_USE_AVX512F
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static uint32<16> run(const uint32<16>& a, const uint32<16>& b)
    {
        // note that _mm512_alignr_epi32 operates on the entire vector
        __m512i ap = _mm512_alignr_epi32(a.native(), a.native(), s0);
        const int mask = SIMDPP_SHUFFLE_MASK_4x2_4(s0>3, s0>2, s0>1, s0>0);
        return _mm512_mask_alignr_epi32(ap, mask, b.native(), b.native(), (s0+12)%16);
    }
#endif
};
#endif

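// Generic case: permute each source into the requested element order, then
// blend the two permuted vectors together.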
template<> struct shuffle_impl<7> {
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static uint32<4> run(const uint32<4>& a, const uint32<4>& b)
    {
#if SIMDPP_USE_AVX2
        const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0%4, s1%4, s2%4, s3%4);
        __m128i pa = _mm_shuffle_epi32(a.native(), mask);
        __m128i pb = _mm_shuffle_epi32(b.native(), mask);
        return _mm_blend_epi32(pa, pb, SIMDPP_SHUFFLE_MASK_4x2(s0/4,s1/4,s2/4,s3/4));
#elif SIMDPP_USE_SSE4_1
        const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0%4, s1%4, s2%4, s3%4);
        __m128i pa = _mm_shuffle_epi32(a.native(), mask);
        __m128i pb = _mm_shuffle_epi32(b.native(), mask);
        return _mm_blend_epi16(pa, pb, SIMDPP_SHUFFLE_MASK_4x4(s0/4*0x3,s1/4*0x3,s2/4*0x3,s3/4*0x3));
#else
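        // SSE2 lacks an integer blend; use shuffle_ps instead. ab1 collects
        // the candidate elements for the two low result positions from both
        // sources, ab2 does the same for the two high positions, and the
        // final shuffle selects the correct source for each position.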
        __m128 na = _mm_castsi128_ps(a.native());
        __m128 nb = _mm_castsi128_ps(b.native());
        __m128 ab1 = _mm_shuffle_ps(na, nb, _MM_SHUFFLE(s1%4, s0%4, s1%4, s0%4));
        __m128 ab2 = _mm_shuffle_ps(na, nb, _MM_SHUFFLE(s3%4, s2%4, s3%4, s2%4));
        float32<4> r = _mm_shuffle_ps(ab1, ab2, _MM_SHUFFLE(s3/4?3:1, s2/4?2:0, s1/4?3:1, s0/4?2:0));
        return uint32<4>(r);
#endif
    }
#if SIMDPP_USE_AVX2
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static uint32<8> run(const uint32<8>& a, const uint32<8>& b)
    {
        const unsigned mask = SIMDPP_SHUFFLE_MASK_4x4(s0%4, s1%4, s2%4, s3%4);
        __m256i pa = _mm256_shuffle_epi32(a.native(), mask);
        __m256i pb = _mm256_shuffle_epi32(b.native(), mask);
        return _mm256_blend_epi32(pa, pb, SIMDPP_SHUFFLE_MASK_4x2_2(s0/4,s1/4,s2/4,s3/4));
    }
#endif
#if SIMDPP_USE_AVX512F
    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static uint32<16> run(const uint32<16>& a, const uint32<16>& b)
    {
        const unsigned shuf_mask = SIMDPP_SHUFFLE_MASK_4x4(s0%4,s1%4,s2%4,s3%4);
        __m512i ap = _mm512_shuffle_epi32(a.native(), _MM_PERM_ENUM(shuf_mask));
        const int mask = SIMDPP_SHUFFLE_MASK_4x2_4(s0/4,s1/4,s2/4,s3/4);
        return _mm512_mask_shuffle_epi32(ap, mask, b.native(),
                                         _MM_PERM_ENUM(shuf_mask));
    }
#endif
};

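// Dispatches to the cheapest implementation that can produce the requested
// element order; the selection is done entirely at compile time.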
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N>
uint32<N> do_shuffle(const uint32<N>& a, const uint32<N>& b)
{
    return shuffle_impl<impl_selector<s0, s1, s2, s3>::impl>::template run<s0, s1, s2, s3>(a, b);
}

} // namespace sse_shuffle4x2_int32
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif
#endif