1/* Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_UNZIP_LO_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_UNZIP_LO_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/types.h>
16#include <simdpp/core/bit_and.h>
17#include <simdpp/core/shuffle2.h>
18#include <simdpp/detail/insn/zip_lo.h>
19#include <simdpp/detail/null/shuffle.h>
20
21namespace simdpp {
22namespace SIMDPP_ARCH_NAMESPACE {
23namespace detail {
24namespace insn {
25
26
27static SIMDPP_INL
28uint8x16 i_unzip16_lo(const uint8x16& ca, const uint8x16& cb)
29{
30 uint8<16> a = ca, b = cb;
31#if SIMDPP_USE_NULL
32 return detail::null::unzip16_lo(a, b);
33#elif SIMDPP_USE_SSE2
34 uint16x8 mask, r;
35 mask = make_ones();
36 mask = _mm_srli_epi16(mask.native(), 8);
37 a = bit_and(a, mask);
38 b = bit_and(b, mask);
39 r = _mm_packus_epi16(a.native(), b.native());
40 return (uint8x16)r;
41#elif SIMDPP_USE_NEON
42 return vuzpq_u8(a.native(), b.native()).val[0];
43#elif SIMDPP_USE_ALTIVEC
44#if SIMDPP_BIG_ENDIAN
45 uint8x16 mask = make_shuffle_bytes16_mask<0,2,4,6,8,10,12,14,
46 16,18,20,22,24,26,28,30>(mask);
47 return shuffle_bytes16(a, b, mask);
48#else
49 return vec_pack((__vector uint16_t) a.native(),
50 (__vector uint16_t) b.native());
51#endif
52#elif SIMDPP_USE_MSA
53 return (v16u8) __msa_pckev_b((v16i8) b.native(), (v16i8) a.native());
54#endif
55}
56
#if SIMDPP_USE_AVX2
/** 32-byte (AVX2) overload: masks away the odd bytes of both inputs and packs
    the remaining 16-bit elements down to bytes with _mm256_packus_epi16.
*/
static SIMDPP_INL
uint8x32 i_unzip16_lo(const uint8x32& ca, const uint8x32& cb)
{
    uint8<32> even_a = ca, even_b = cb;
    uint16x16 low_byte_mask, packed;

    // 0x00ff in every 16-bit element.
    low_byte_mask = make_ones();
    low_byte_mask = _mm256_srli_epi16(low_byte_mask.native(), 8);

    // After masking every 16-bit element fits into a byte, so the saturating
    // pack acts as a plain truncation.
    even_a = bit_and(even_a, low_byte_mask);
    even_b = bit_and(even_b, low_byte_mask);

    packed = _mm256_packus_epi16(even_a.native(), even_b.native());
    return uint8x32(packed);
}
#endif
71
#if SIMDPP_USE_AVX512BW
/** 64-byte (AVX512BW) overload: masks away the odd bytes of both inputs and
    packs the remaining 16-bit elements down to bytes with _mm512_packus_epi16.
*/
SIMDPP_INL uint8<64> i_unzip16_lo(const uint8<64>& ca, const uint8<64>& cb)
{
    uint8<64> even_a = ca, even_b = cb;
    uint16<32> low_byte_mask, packed;

    // 0x00ff in every 16-bit element.
    low_byte_mask = make_ones();
    low_byte_mask = _mm512_srli_epi16(low_byte_mask.native(), 8);

    // Masked elements are within 0..255, so the saturating pack never clamps.
    even_a = bit_and(even_a, low_byte_mask);
    even_b = bit_and(even_b, low_byte_mask);

    packed = _mm512_packus_epi16(even_a.native(), even_b.native());
    return uint8<64>(packed);
}
#endif
85
86template<unsigned N> SIMDPP_INL
87uint8<N> i_unzip16_lo(const uint8<N>& a, const uint8<N>& b)
88{
89 SIMDPP_VEC_ARRAY_IMPL2(uint8<N>, i_unzip16_lo, a, b);
90}
91
92// -----------------------------------------------------------------------------
93
94static SIMDPP_INL
95uint16x8 i_unzip8_lo(const uint16x8& ca, const uint16x8& cb)
96{
97 uint16<8> a = ca, b = cb;
98#if SIMDPP_USE_NULL
99 return detail::null::unzip8_lo(a, b);
100#elif SIMDPP_USE_SSE4_1
101 uint32x4 mask, r;
102 mask = make_ones();
103 mask = _mm_srli_epi32(mask.native(), 16);
104 a = bit_and(a, mask);
105 b = bit_and(b, mask);
106 r = _mm_packus_epi32(a.native(), b.native());
107 return uint16x8(r);
108#elif SIMDPP_USE_SSE2
109 uint32x4 r;
110 a = _mm_slli_epi32(a.native(), 16);
111 b = _mm_slli_epi32(b.native(), 16);
112 a = _mm_srai_epi32(a.native(), 16);
113 b = _mm_srai_epi32(b.native(), 16);
114 r = _mm_packs_epi32(a.native(), b.native());
115 return uint16x8(r);
116#elif SIMDPP_USE_NEON
117 return vuzpq_u16(a.native(), b.native()).val[0];
118#elif SIMDPP_USE_ALTIVEC
119#if SIMDPP_BIG_ENDIAN
120 uint16x8 mask = make_shuffle_bytes16_mask<0,2,4,6,8,10,12,14>(mask);
121 return shuffle_bytes16(a, b, mask);
122#else
123 return vec_pack((__vector uint32_t) a.native(),
124 (__vector uint32_t) b.native());
125#endif
126#elif SIMDPP_USE_MSA
127 return (v8u16) __msa_pckev_h((v8i16) b.native(), (v8i16) a.native());
128#endif
129}
130
#if SIMDPP_USE_AVX2
/** 16-element (AVX2) overload: masks away the odd 16-bit elements of both
    inputs and packs the remaining 32-bit elements down to 16 bits with
    _mm256_packus_epi32.
*/
static SIMDPP_INL
uint16x16 i_unzip8_lo(const uint16x16& ca, const uint16x16& cb)
{
    uint16<16> even_a = ca, even_b = cb;
    uint32x8 low_half_mask, packed;

    // 0x0000ffff in every 32-bit element.
    low_half_mask = make_ones();
    low_half_mask = _mm256_srli_epi32(low_half_mask.native(), 16);

    // Masked elements are within 0..65535, so the saturating pack never clamps.
    even_a = bit_and(even_a, low_half_mask);
    even_b = bit_and(even_b, low_half_mask);

    packed = _mm256_packus_epi32(even_a.native(), even_b.native());
    return uint16x16(packed);
}
#endif
145
#if SIMDPP_USE_AVX512BW
/** 32-element (AVX512BW) overload: masks away the odd 16-bit elements of both
    inputs and packs the remaining 32-bit elements down to 16 bits with
    _mm512_packus_epi32.
*/
SIMDPP_INL uint16<32> i_unzip8_lo(const uint16<32>& ca, const uint16<32>& cb)
{
    uint16<32> even_a = ca, even_b = cb;
    uint32<16> low_half_mask, packed;

    // 0x0000ffff in every 32-bit element.
    low_half_mask = make_ones();
    low_half_mask = _mm512_srli_epi32(low_half_mask.native(), 16);

    // Masked elements are within 0..65535, so the saturating pack never clamps.
    even_a = bit_and(even_a, low_half_mask);
    even_b = bit_and(even_b, low_half_mask);

    packed = _mm512_packus_epi32(even_a.native(), even_b.native());
    return uint16<32>(packed);
}
#endif
159
160template<unsigned N> SIMDPP_INL
161uint16<N> i_unzip8_lo(const uint16<N>& a, const uint16<N>& b)
162{
163 SIMDPP_VEC_ARRAY_IMPL2(uint16<N>, i_unzip8_lo, a, b);
164}
165
166// -----------------------------------------------------------------------------
167
168static SIMDPP_INL
169uint32x4 i_unzip4_lo(const uint32x4& a, const uint32x4& b)
170{
171#if SIMDPP_USE_NULL
172 return detail::null::unzip4_lo(a, b);
173#elif SIMDPP_USE_SSE2
174 return shuffle2<0,2,0,2>(a,b);
175#elif SIMDPP_USE_NEON
176 return vuzpq_u32(a.native(), b.native()).val[0];
177#elif SIMDPP_USE_ALTIVEC
178 uint32x4 mask = make_shuffle_bytes16_mask<0,2,4,6>(mask);
179 return shuffle_bytes16(a, b, mask);
180#elif SIMDPP_USE_MSA
181 return (v4u32) __msa_pckev_w((v4i32) b.native(), (v4i32) a.native());
182#endif
183}
184
#if SIMDPP_USE_AVX2
/** 8-element (AVX2) overload for 32-bit integers; implemented as a single
    shuffle2 with selectors 0,2 for @a a and 0,2 for @a b.
*/
static SIMDPP_INL
uint32x8 i_unzip4_lo(const uint32x8& a, const uint32x8& b)
{
    return shuffle2<0,2,0,2>(a, b);
}
#endif
192
#if SIMDPP_USE_AVX512F
/** 16-element (AVX512F) overload for 32-bit integers; implemented as a single
    shuffle2 with selectors 0,2 for @a a and 0,2 for @a b.
*/
static SIMDPP_INL
uint32<16> i_unzip4_lo(const uint32<16>& a, const uint32<16>& b)
{
    return shuffle2<0,2,0,2>(a, b);
}
#endif
200
201template<unsigned N> SIMDPP_INL
202uint32<N> i_unzip4_lo(const uint32<N>& a, const uint32<N>& b)
203{
204 SIMDPP_VEC_ARRAY_IMPL2(uint32<N>, i_unzip4_lo, a, b);
205}
206
207// -----------------------------------------------------------------------------
208
209template<unsigned N> SIMDPP_INL
210uint64<N> i_unzip2_lo(const uint64<N>& a, const uint64<N>& b)
211{
212 return zip2_lo(a, b);
213}
214
215// -----------------------------------------------------------------------------
216
217static SIMDPP_INL
218float32x4 i_unzip4_lo(const float32x4& a, const float32x4& b)
219{
220#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
221 return detail::null::unzip4_lo(a, b);
222#elif SIMDPP_USE_SSE2
223 return shuffle2<0,2,0,2>(a,b);
224#elif SIMDPP_USE_NEON
225 return vuzpq_f32(a.native(), b.native()).val[0];
226#elif SIMDPP_USE_ALTIVEC
227 uint32x4 mask = make_shuffle_bytes16_mask<0,2,4,6>(mask);
228 return shuffle_bytes16(a, b, mask);
229#elif SIMDPP_USE_MSA
230 return (v4f32) __msa_pckev_w((v4i32) b.native(), (v4i32) a.native());
231#endif
232}
233
#if SIMDPP_USE_AVX
/** 8-element (AVX) overload for single-precision floats; implemented as a
    single shuffle2 with selectors 0,2 for @a a and 0,2 for @a b.
*/
static SIMDPP_INL
float32x8 i_unzip4_lo(const float32x8& a, const float32x8& b)
{
    return shuffle2<0,2,0,2>(a, b);
}
#endif
241
#if SIMDPP_USE_AVX512F
/** 16-element (AVX512F) overload for single-precision floats; implemented as
    a single shuffle2 with selectors 0,2 for @a a and 0,2 for @a b.
*/
static SIMDPP_INL
float32<16> i_unzip4_lo(const float32<16>& a, const float32<16>& b)
{
    return shuffle2<0,2,0,2>(a, b);
}
#endif
249
250template<unsigned N> SIMDPP_INL
251float32<N> i_unzip4_lo(const float32<N>& a, const float32<N>& b)
252{
253 SIMDPP_VEC_ARRAY_IMPL2(float32<N>, i_unzip4_lo, a, b);
254}
255
256// -----------------------------------------------------------------------------
257
258template<unsigned N> SIMDPP_INL
259float64<N> i_unzip2_lo(const float64<N>& a, const float64<N>& b)
260{
261 return i_zip2_lo(a, b);
262}
263
264} // namespace insn
265} // namespace detail
266} // namespace SIMDPP_ARCH_NAMESPACE
267} // namespace simdpp
268
269#endif
270
271