1 | /* Copyright (C) 2014 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_PERMUTE_SSE_INT64_4x2_H |
9 | #define LIBSIMDPP_SIMDPP_DETAIL_PERMUTE_SSE_INT64_4x2_H |
10 | |
11 | #ifndef LIBSIMDPP_SIMD_H |
12 | #error "This file must be included through simd.h" |
13 | #endif |
14 | |
15 | #include <simdpp/setup_arch.h> |
16 | #include <simdpp/types.h> |
17 | #include <simdpp/core/combine.h> |
18 | #include <simdpp/core/split.h> |
19 | #include <simdpp/detail/insn/shuffle2x2.h> |
20 | #include <simdpp/detail/shuffle/shuffle_mask.h> |
21 | |
22 | #if SIMDPP_USE_AVX2 |
23 | |
24 | namespace simdpp { |
25 | namespace SIMDPP_ARCH_NAMESPACE { |
26 | namespace detail { |
27 | namespace sse_shuffle4x2_int64 { |
28 | |
/*  The code below implements generalized permutations for sets of 4 elements
    within uint64 vectors.
*/
32 | |
/*  Compile-time classifier that picks the implementation used for a 4x2
    shuffle of 64-bit elements.

    Each selector s0..s3 names the source of one result element within a
    group of four elements. Values 0..3 refer to the first vector (a) and
    values 4..7 refer to the second vector (b):

        a: 0 1 2 3
        b: 4 5 6 7

    `impl` is the index N of the shuffle_impl<N> specialization to use. The
    patterns are tested in ascending order, so the first match wins.
*/
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
struct impl_selector {

    // { a0, b0, a2, b2 } and the same with a and b exchanged
    static const bool is1_zip_lo1 = (s0==0 && s1==4 && s2==2 && s3==6);
    static const bool is2_zip_lo2 = (s0==4 && s1==0 && s2==6 && s3==2);
    // { a1, b1, a3, b3 } and the same with a and b exchanged
    static const bool is3_zip_hi1 = (s0==1 && s1==5 && s2==3 && s3==7);
    static const bool is4_zip_hi2 = (s0==5 && s1==1 && s2==7 && s3==3);
    // every element stays in its position and only the source vector varies
    static const bool is5_blend = (s0==0 || s0==4) && (s1==1 || s1==5) &&
                                  (s2==2 || s2==6) && (s3==3 || s3==7);
#if SIMDPP_USE_AVX512F
    // With AVX-512F every remaining pattern goes to the permutex2var
    // implementation, so the swap patterns are disabled.
    static const bool is6_swap1 = false;
    static const bool is7_swap2 = false;
    static const bool is8_permx2var = true;
#else
    // { a1, b0, a3, b2 }
    static const bool is6_swap1 = (s0==1 && s1==4 && s2==3 && s3==6);
    // { b1, a0, b3, a2 }: swap1 with a and b exchanged, which is what
    // shuffle_impl<7> computes. The pattern (4,1,6,3) must not be tested
    // here: it is a plain blend and is5_blend always claims it first, which
    // would leave implementation 7 unreachable.
    static const bool is7_swap2 = (s0==5 && s1==0 && s2==7 && s3==2);
    static const bool is8_permx2var = false;
#endif
    // Index of the first matching implementation; 9 is the generic fallback.
    static const int impl = is1_zip_lo1 ? 1 :
                            is2_zip_lo2 ? 2 :
                            is3_zip_hi1 ? 3 :
                            is4_zip_hi2 ? 4 :
                            is5_blend ? 5 :
                            is6_swap1 ? 6 :
                            is7_swap2 ? 7 :
                            is8_permx2var ? 8 : 9;
};
62 | |
// Primary template, deliberately empty: it has no run() member. The usable
// implementations are the explicit specializations, indexed by the value of
// impl_selector<...>::impl.
template<unsigned N> struct shuffle_impl {};
64 | |
65 | // zip_lo1 |
66 | template<> struct shuffle_impl<1> { |
67 | template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL |
68 | static uint64<4> run(const uint64<4>& a, const uint64<4>& b) |
69 | { |
70 | return _mm256_unpacklo_epi64(a.native(), b.native()); |
71 | } |
72 | #if SIMDPP_USE_AVX512F |
73 | template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL |
74 | static uint64<8> run(const uint64<8>& a, const uint64<8>& b) |
75 | { |
76 | return _mm512_unpacklo_epi64(a.native(), b.native()); |
77 | } |
78 | #endif |
79 | }; |
80 | |
81 | // zip_lo2 |
82 | template<> struct shuffle_impl<2> { |
83 | template<unsigned, unsigned, unsigned, unsigned, unsigned N> SIMDPP_INL |
84 | static uint64<N> run(const uint64<N>& a, const uint64<N>& b) |
85 | { |
86 | return shuffle_impl<1>::run<0,0,0,0>(b, a); |
87 | } |
88 | }; |
89 | |
90 | // zip_hi1 |
91 | template<> struct shuffle_impl<3> { |
92 | template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL |
93 | static uint64<4> run(const uint64<4>& a, const uint64<4>& b) |
94 | { |
95 | return _mm256_unpackhi_epi64(a.native(), b.native()); |
96 | } |
97 | #if SIMDPP_USE_AVX512F |
98 | template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL |
99 | static uint64<8> run(const uint64<8>& a, const uint64<8>& b) |
100 | { |
101 | return _mm512_unpackhi_epi64(a.native(), b.native()); |
102 | } |
103 | #endif |
104 | }; |
105 | |
106 | // zip_hi2 |
107 | template<> struct shuffle_impl<4> { |
108 | template<unsigned, unsigned, unsigned, unsigned, unsigned N> SIMDPP_INL |
109 | static uint64<N> run(const uint64<N>& a, const uint64<N>& b) |
110 | { |
111 | return shuffle_impl<3>::run<0,0,0,0>(b, a); |
112 | } |
113 | }; |
114 | |
115 | // is5_blend |
116 | template<> struct shuffle_impl<5> { |
117 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
118 | static uint64<4> run(const uint64<4>& a, const uint64<4>& b) |
119 | { |
120 | const unsigned mask = (s0<4 ? 0 : 0x03) | (s1<4 ? 0 : 0x0c) | |
121 | (s2<4 ? 0 : 0x30) | (s3<4 ? 0 : 0xc0); |
122 | return _mm256_blend_epi32(a.native(), b.native(), mask); |
123 | } |
124 | #if SIMDPP_USE_AVX512F |
125 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
126 | static uint64<8> run(const uint64<8>& a, const uint64<8>& b) |
127 | { |
128 | const unsigned mask = (s0<4 ? 0 : 1) | (s1<4 ? 0 : 2) | |
129 | (s2<4 ? 0 : 4) | (s3<4 ? 0 : 8); |
130 | const unsigned mask2 = mask | mask << 4; |
131 | return _mm512_mask_blend_epi64(mask2, a.native(), b.native()); |
132 | } |
133 | #endif |
134 | }; |
135 | |
136 | // is6_swap1 (only AVX2) |
137 | template<> struct shuffle_impl<6> { |
138 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
139 | static uint64<4> run(const uint64<4>& a, const uint64<4>& b) |
140 | { |
141 | __m256i n = _mm256_blend_epi32(a.native(), b.native(), 0x33); |
142 | return _mm256_permute4x64_epi64(n, _MM_SHUFFLE(2,3,0,1)); |
143 | } |
144 | }; |
145 | |
146 | // is7_swap2 (only AVX2) |
147 | template<> struct shuffle_impl<7> { |
148 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
149 | static uint64<4> run(const uint64<4>& a, const uint64<4>& b) |
150 | { |
151 | __m256i n = _mm256_blend_epi32(a.native(), b.native(), 0xcc); |
152 | return _mm256_permute4x64_epi64(n, _MM_SHUFFLE(2,3,0,1)); |
153 | } |
154 | }; |
155 | |
// Implementation 8 (is8_permx2var, AVX-512F only): arbitrary selection from
// a and b with a single two-source variable permute.
#if SIMDPP_USE_AVX512F
template<> struct shuffle_impl<8> {

    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static uint64<4> run(const uint64<4>& a, const uint64<4>& b)
    {
#if SIMDPP_USE_AVX512VL
        // The 256-bit permute takes the selectors as they are.
        uint8<16> idx_bytes = make_uint(s0, s1, s2, s3);
        const __m256i idx = _mm256_cvtepi8_epi64(idx_bytes.native());
        return _mm256_permutex2var_epi64(a.native(), idx, b.native());
#else
        // Without AVX512VL the operands are widened to 512 bits. Selectors
        // that refer to b (4..7) are moved to 8..11, the indices at which the
        // second source of the 512-bit permute starts.
        const unsigned i0 = s0 >= 4 ? s0+4 : s0;
        const unsigned i1 = s1 >= 4 ? s1+4 : s1;
        const unsigned i2 = s2 >= 4 ? s2+4 : s2;
        const unsigned i3 = s3 >= 4 ? s3+4 : s3;
        uint8<16> idx_bytes = make_uint(i0, i1, i2, i3);
        const __m512i wide_a = _mm512_castsi256_si512(a.native());
        const __m512i wide_b = _mm512_castsi256_si512(b.native());
        const __m512i idx = _mm512_cvtepi8_epi64(idx_bytes.native());
        const __m512i wide_res = _mm512_permutex2var_epi64(wide_a, idx, wide_b);
        // Only the lower 256 bits are returned.
        return _mm512_castsi512_si256(wide_res);
#endif
    }

    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static uint64<8> run(const uint64<8>& a, const uint64<8>& b)
    {
        // Indices for the first group of four elements; selectors that refer
        // to b (4..7) are moved to 8..11.
        const unsigned i0 = s0 >= 4 ? s0+4 : s0;
        const unsigned i1 = s1 >= 4 ? s1+4 : s1;
        const unsigned i2 = s2 >= 4 ? s2+4 : s2;
        const unsigned i3 = s3 >= 4 ? s3+4 : s3;
        // The second group uses the same pattern shifted by four elements.
        uint8<16> idx_bytes = make_uint(i0, i1, i2, i3,
                                        i0+4, i1+4, i2+4, i3+4);
        const __m512i idx = _mm512_cvtepi8_epi64(idx_bytes.native());
        return _mm512_permutex2var_epi64(a.native(), idx, b.native());
    }
};
#endif
195 | |
196 | // any (only on AVX2) |
197 | template<> struct shuffle_impl<9> { |
198 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL |
199 | static uint64<4> run(const uint64<4>& a, const uint64<4>& b) |
200 | { |
201 | const unsigned shuf_mask = SIMDPP_SHUFFLE_MASK_4x4(s0%4, s1%4, s2%4, s3%4); |
202 | const unsigned bl_mask = (s0<4 ? 0 : 0x03) | (s1<4 ? 0 : 0x0c) | |
203 | (s2<4 ? 0 : 0x30) | (s3<4 ? 0 : 0xc0); |
204 | __m256i ta = _mm256_permute4x64_epi64(a.native(), shuf_mask); |
205 | __m256i tb = _mm256_permute4x64_epi64(b.native(), shuf_mask); |
206 | return _mm256_blend_epi32(ta, tb, bl_mask); |
207 | } |
208 | }; |
209 | |
210 | |
211 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N> |
212 | uint64<N> do_shuffle(const uint64<N>& a, const uint64<N>& b) |
213 | { |
214 | const unsigned selector = impl_selector<s0, s1, s2, s3>::impl; |
215 | return shuffle_impl<selector>::template run<s0, s1, s2, s3>(a, b); |
216 | } |
217 | |
218 | } // namespace sse_shuffle4x2_int64 |
219 | } // namespace detail |
220 | } // namespace SIMDPP_ARCH_NAMESPACE |
221 | } // namespace simdpp |
222 | |
223 | #endif |
224 | #endif |
225 | |