1/* Copyright (C) 2014 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_PERMUTE_SSE_INT64_4x2_H
9#define LIBSIMDPP_SIMDPP_DETAIL_PERMUTE_SSE_INT64_4x2_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/setup_arch.h>
16#include <simdpp/types.h>
17#include <simdpp/core/combine.h>
18#include <simdpp/core/split.h>
19#include <simdpp/detail/insn/shuffle2x2.h>
20#include <simdpp/detail/shuffle/shuffle_mask.h>
21
22#if SIMDPP_USE_AVX2
23
24namespace simdpp {
25namespace SIMDPP_ARCH_NAMESPACE {
26namespace detail {
27namespace sse_shuffle4x2_int64 {
28
/* The code below implements generalized permutations of 4-element sets
   taken from a pair of uint64 vectors.
*/
32
/*  Compile-time choice of the shuffle_impl<N> specialization that handles the
    permutation described by s0..s3.

    Each selector picks one element out of a 4-element group of the first
    vector (values 0..3) or of the second vector (values 4..7):

        a: 0 1 2 3
        b: 4 5 6 7

    The patterns are tested in the order of the `impl` chain; the first one
    that matches wins. `impl` is the N to instantiate shuffle_impl<N> with.
*/
template<unsigned s0, unsigned s1, unsigned s2, unsigned s3>
struct impl_selector {

    // 0 1 2 3
    // 4 5 6 7
    static const bool is1_zip_lo1 = (s0==0 && s1==4 && s2==2 && s3==6);
    static const bool is2_zip_lo2 = (s0==4 && s1==0 && s2==6 && s3==2);
    static const bool is3_zip_hi1 = (s0==1 && s1==5 && s2==3 && s3==7);
    static const bool is4_zip_hi2 = (s0==5 && s1==1 && s2==7 && s3==3);

    // every element stays in its position and only the source vector varies
    static const bool is5_blend = (s0==0 || s0==4) && (s1==1 || s1==5) &&
                                  (s2==2 || s2==6) && (s3==3 || s3==7);
#if SIMDPP_USE_AVX512F
    // with AVX-512F a single two-source permute covers all remaining patterns
    static const bool is6_swap1 = false;
    static const bool is7_swap2 = false;
    static const bool is8_permx2var = true;
#else
    // blend followed by a swap of neighbouring elements: { a1, b0, a3, b2 }
    static const bool is6_swap1 = (s0==1 && s1==4 && s2==3 && s3==6);
    // Mirror of swap1: { b1, a0, b3, a2 }. This is what shuffle_impl<7>
    // produces. The pattern { 4, 1, 6, 3 } must not be tested here: it is a
    // plain blend that is5_blend already catches, which would leave
    // shuffle_impl<7> unreachable and push { 5, 0, 7, 2 } to the slower
    // generic implementation.
    static const bool is7_swap2 = (s0==5 && s1==0 && s2==7 && s3==2);
    static const bool is8_permx2var = false;
#endif
    static const int impl = is1_zip_lo1 ? 1 :
                            is2_zip_lo2 ? 2 :
                            is3_zip_hi1 ? 3 :
                            is4_zip_hi2 ? 4 :
                            is5_blend ? 5 :
                            is6_swap1 ? 6 :
                            is7_swap2 ? 7 :
                            is8_permx2var ? 8 : 9;
};
62
// Primary template, deliberately left empty: only the explicit
// specializations of shuffle_impl provide a run() member.
template<unsigned N>
struct shuffle_impl {
};
64
65// zip_lo1
66template<> struct shuffle_impl<1> {
67 template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
68 static uint64<4> run(const uint64<4>& a, const uint64<4>& b)
69 {
70 return _mm256_unpacklo_epi64(a.native(), b.native());
71 }
72#if SIMDPP_USE_AVX512F
73 template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
74 static uint64<8> run(const uint64<8>& a, const uint64<8>& b)
75 {
76 return _mm512_unpacklo_epi64(a.native(), b.native());
77 }
78#endif
79};
80
81// zip_lo2
82template<> struct shuffle_impl<2> {
83 template<unsigned, unsigned, unsigned, unsigned, unsigned N> SIMDPP_INL
84 static uint64<N> run(const uint64<N>& a, const uint64<N>& b)
85 {
86 return shuffle_impl<1>::run<0,0,0,0>(b, a);
87 }
88};
89
90// zip_hi1
91template<> struct shuffle_impl<3> {
92 template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
93 static uint64<4> run(const uint64<4>& a, const uint64<4>& b)
94 {
95 return _mm256_unpackhi_epi64(a.native(), b.native());
96 }
97#if SIMDPP_USE_AVX512F
98 template<unsigned, unsigned, unsigned, unsigned> SIMDPP_INL
99 static uint64<8> run(const uint64<8>& a, const uint64<8>& b)
100 {
101 return _mm512_unpackhi_epi64(a.native(), b.native());
102 }
103#endif
104};
105
106// zip_hi2
107template<> struct shuffle_impl<4> {
108 template<unsigned, unsigned, unsigned, unsigned, unsigned N> SIMDPP_INL
109 static uint64<N> run(const uint64<N>& a, const uint64<N>& b)
110 {
111 return shuffle_impl<3>::run<0,0,0,0>(b, a);
112 }
113};
114
115// is5_blend
116template<> struct shuffle_impl<5> {
117 template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
118 static uint64<4> run(const uint64<4>& a, const uint64<4>& b)
119 {
120 const unsigned mask = (s0<4 ? 0 : 0x03) | (s1<4 ? 0 : 0x0c) |
121 (s2<4 ? 0 : 0x30) | (s3<4 ? 0 : 0xc0);
122 return _mm256_blend_epi32(a.native(), b.native(), mask);
123 }
124#if SIMDPP_USE_AVX512F
125 template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
126 static uint64<8> run(const uint64<8>& a, const uint64<8>& b)
127 {
128 const unsigned mask = (s0<4 ? 0 : 1) | (s1<4 ? 0 : 2) |
129 (s2<4 ? 0 : 4) | (s3<4 ? 0 : 8);
130 const unsigned mask2 = mask | mask << 4;
131 return _mm512_mask_blend_epi64(mask2, a.native(), b.native());
132 }
133#endif
134};
135
136// is6_swap1 (only AVX2)
137template<> struct shuffle_impl<6> {
138 template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
139 static uint64<4> run(const uint64<4>& a, const uint64<4>& b)
140 {
141 __m256i n = _mm256_blend_epi32(a.native(), b.native(), 0x33);
142 return _mm256_permute4x64_epi64(n, _MM_SHUFFLE(2,3,0,1));
143 }
144};
145
146// is7_swap2 (only AVX2)
147template<> struct shuffle_impl<7> {
148 template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
149 static uint64<4> run(const uint64<4>& a, const uint64<4>& b)
150 {
151 __m256i n = _mm256_blend_epi32(a.native(), b.native(), 0xcc);
152 return _mm256_permute4x64_epi64(n, _MM_SHUFFLE(2,3,0,1));
153 }
154};
155
// is8_permx2var: arbitrary pattern done with a single two-source variable
// permute. Only available with AVX-512F.
#if SIMDPP_USE_AVX512F
template<>
struct shuffle_impl<8> {

    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static uint64<4> run(const uint64<4>& a, const uint64<4>& b)
    {
#if SIMDPP_USE_AVX512VL
        // The 256-bit permute addresses a as 0..3 and b as 4..7, so the
        // selectors can be used as indices unchanged.
        uint8<16> idx = make_uint(s0, s1, s2, s3);
        return _mm256_permutex2var_epi64(a.native(),
                                         _mm256_cvtepi8_epi64(idx.native()),
                                         b.native());
#else
        // Without AVX-512VL the operands are widened to 512 bits. There the
        // second source starts at index 8, hence selectors 4..7 move to 8..11.
        const unsigned i0 = s0 >= 4 ? s0 + 4 : s0;
        const unsigned i1 = s1 >= 4 ? s1 + 4 : s1;
        const unsigned i2 = s2 >= 4 ? s2 + 4 : s2;
        const unsigned i3 = s3 >= 4 ? s3 + 4 : s3;
        uint8<16> idx = make_uint(i0, i1, i2, i3);
        __m512i wide = _mm512_permutex2var_epi64(_mm512_castsi256_si512(a.native()),
                                                 _mm512_cvtepi8_epi64(idx.native()),
                                                 _mm512_castsi256_si512(b.native()));
        // only the lower 256 bits carry the result
        return _mm512_castsi512_si256(wide);
#endif
    }

    template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
    static uint64<8> run(const uint64<8>& a, const uint64<8>& b)
    {
        // Indices for the lower 4-element group: 0..3 address a, 8..11 address b.
        const unsigned i0 = s0 >= 4 ? s0 + 4 : s0;
        const unsigned i1 = s1 >= 4 ? s1 + 4 : s1;
        const unsigned i2 = s2 >= 4 ? s2 + 4 : s2;
        const unsigned i3 = s3 >= 4 ? s3 + 4 : s3;
        // The upper group uses the same pattern shifted by four elements.
        uint8<16> idx = make_uint(i0, i1, i2, i3, i0+4, i1+4, i2+4, i3+4);
        return _mm512_permutex2var_epi64(a.native(),
                                         _mm512_cvtepi8_epi64(idx.native()),
                                         b.native());
    }
};
#endif
195
196// any (only on AVX2)
197template<> struct shuffle_impl<9> {
198 template<unsigned s0, unsigned s1, unsigned s2, unsigned s3> SIMDPP_INL
199 static uint64<4> run(const uint64<4>& a, const uint64<4>& b)
200 {
201 const unsigned shuf_mask = SIMDPP_SHUFFLE_MASK_4x4(s0%4, s1%4, s2%4, s3%4);
202 const unsigned bl_mask = (s0<4 ? 0 : 0x03) | (s1<4 ? 0 : 0x0c) |
203 (s2<4 ? 0 : 0x30) | (s3<4 ? 0 : 0xc0);
204 __m256i ta = _mm256_permute4x64_epi64(a.native(), shuf_mask);
205 __m256i tb = _mm256_permute4x64_epi64(b.native(), shuf_mask);
206 return _mm256_blend_epi32(ta, tb, bl_mask);
207 }
208};
209
210
211template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, unsigned N>
212uint64<N> do_shuffle(const uint64<N>& a, const uint64<N>& b)
213{
214 const unsigned selector = impl_selector<s0, s1, s2, s3>::impl;
215 return shuffle_impl<selector>::template run<s0, s1, s2, s3>(a, b);
216}
217
218} // namespace sse_shuffle4x2_int64
219} // namespace detail
220} // namespace SIMDPP_ARCH_NAMESPACE
221} // namespace simdpp
222
223#endif
224#endif
225