/*  Copyright (C) 2011-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_ZIP_LO_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_ZIP_LO_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/shuffle_bytes16.h>
#include <simdpp/detail/null/shuffle.h>
#include <simdpp/detail/neon/shuffle.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {
static SIMDPP_INL
uint8x16 i_zip16_lo(const uint8x16& a, const uint8x16& b)
{
#if SIMDPP_USE_NULL
    return detail::null::zip16_lo(a, b);
#elif SIMDPP_USE_SSE2
    return _mm_unpacklo_epi8(a.native(), b.native());
#elif SIMDPP_USE_NEON
    // vzipq produces both the interleaved lower and higher halves. When
    // zip_lo and zip_hi are used on the same arguments, the compiler
    // merges the duplicate vzip instructions into one.
    return vzipq_u8(a.native(), b.native()).val[0];
#elif SIMDPP_USE_ALTIVEC
    return vec_mergeh(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_ilvr_b((v16i8)b.native(), (v16i8)a.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint8x32 i_zip16_lo(const uint8x32& a, const uint8x32& b)
{
    return _mm256_unpacklo_epi8(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint8<64> i_zip16_lo(const uint8<64>& a, const uint8<64>& b)
{
    return _mm512_unpacklo_epi8(a.native(), b.native());
}
#endif

template<unsigned N> SIMDPP_INL
uint8<N> i_zip16_lo(const uint8<N>& a, const uint8<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(uint8<N>, i_zip16_lo, a, b)
}

// -----------------------------------------------------------------------------

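// Interleaves the 16-bit elements of the lower halves of a and b:
//   r = [ a0, b0, a1, b1, a2, b2, a3, b3 ]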
static SIMDPP_INL
uint16x8 i_zip8_lo(const uint16x8& a, const uint16x8& b)
{
#if SIMDPP_USE_NULL
    return detail::null::zip8_lo(a, b);
#elif SIMDPP_USE_SSE2
    return _mm_unpacklo_epi16(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vzipq_u16(a.native(), b.native()).val[0];
#elif SIMDPP_USE_ALTIVEC
    return vec_mergeh(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_ilvr_h((v8i16)b.native(), (v8i16)a.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint16x16 i_zip8_lo(const uint16x16& a, const uint16x16& b)
{
    return _mm256_unpacklo_epi16(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint16<32> i_zip8_lo(const uint16<32>& a, const uint16<32>& b)
{
    return _mm512_unpacklo_epi16(a.native(), b.native());
}
#endif

template<unsigned N> SIMDPP_INL
uint16<N> i_zip8_lo(const uint16<N>& a, const uint16<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(uint16<N>, i_zip8_lo, a, b)
}

// -----------------------------------------------------------------------------

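// Interleaves the 32-bit elements of the lower halves of a and b:
//   r = [ a0, b0, a1, b1 ]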
static SIMDPP_INL
uint32x4 i_zip4_lo(const uint32x4& a, const uint32x4& b)
{
#if SIMDPP_USE_NULL
    return detail::null::zip4_lo(a, b);
#elif SIMDPP_USE_SSE2
    return _mm_unpacklo_epi32(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vzipq_u32(a.native(), b.native()).val[0];
#elif SIMDPP_USE_ALTIVEC
    return vec_mergeh(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_ilvr_w((v4i32)b.native(), (v4i32)a.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32x8 i_zip4_lo(const uint32x8& a, const uint32x8& b)
{
    return _mm256_unpacklo_epi32(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint32<16> i_zip4_lo(const uint32<16>& a, const uint32<16>& b)
{
    return _mm512_unpacklo_epi32(a.native(), b.native());
}
#endif

template<unsigned N> SIMDPP_INL
uint32<N> i_zip4_lo(const uint32<N>& a, const uint32<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(uint32<N>, i_zip4_lo, a, b)
}

// -----------------------------------------------------------------------------

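// Interleaves the lower 64-bit elements of a and b:
//   r = [ a0, b0 ]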
static SIMDPP_INL
uint64x2 i_zip2_lo(const uint64x2& a, const uint64x2& b)
{
#if SIMDPP_USE_SSE2
    return _mm_unpacklo_epi64(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return neon::zip2_lo(a, b);
#elif SIMDPP_USE_VSX_207
    return vec_mergeh(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v2u64) __msa_ilvr_d((v2i64) b.native(), (v2i64) a.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    return detail::null::zip2_lo(a, b);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint64x4 i_zip2_lo(const uint64x4& a, const uint64x4& b)
{
    return _mm256_unpacklo_epi64(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint64<8> i_zip2_lo(const uint64<8>& a, const uint64<8>& b)
{
    return _mm512_unpacklo_epi64(a.native(), b.native());
}
#endif

template<unsigned N> SIMDPP_INL
uint64<N> i_zip2_lo(const uint64<N>& a, const uint64<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(uint64<N>, i_zip2_lo, a, b)
}

// -----------------------------------------------------------------------------

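// Interleaves the 32-bit floating-point elements of the lower halves of
// a and b:
//   r = [ a0, b0, a1, b1 ]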
static SIMDPP_INL
float32x4 i_zip4_lo(const float32x4& a, const float32x4& b)
{
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    return detail::null::zip4_lo(a, b);
#elif SIMDPP_USE_SSE2
    return _mm_unpacklo_ps(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vzipq_f32(a.native(), b.native()).val[0];
#elif SIMDPP_USE_ALTIVEC
    return vec_mergeh(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v4f32) __msa_ilvr_w((v4i32) b.native(), (v4i32) a.native());
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
float32x8 i_zip4_lo(const float32x8& a, const float32x8& b)
{
    return _mm256_unpacklo_ps(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
float32<16> i_zip4_lo(const float32<16>& a, const float32<16>& b)
{
    return _mm512_unpacklo_ps(a.native(), b.native());
}
#endif

template<unsigned N> SIMDPP_INL
float32<N> i_zip4_lo(const float32<N>& a, const float32<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(float32<N>, i_zip4_lo, a, b)
}

// -----------------------------------------------------------------------------

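// Interleaves the lower 64-bit floating-point elements of a and b:
//   r = [ a0, b0 ]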
static SIMDPP_INL
float64x2 i_zip2_lo(const float64x2& a, const float64x2& b)
{
#if SIMDPP_USE_SSE2
    return _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(a.native()),
                                       _mm_castpd_ps(b.native())));
#elif SIMDPP_USE_NEON64
    return vtrn1q_f64(a.native(), b.native());
#elif SIMDPP_USE_VSX_206
    return (__vector double) vec_mergeh((__vector uint64_t)a.native(),
                                        (__vector uint64_t)b.native());
#elif SIMDPP_USE_MSA
    return (v2f64) __msa_ilvr_d((v2i64) b.native(), (v2i64) a.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC || SIMDPP_USE_NEON
    return detail::null::zip2_lo(a, b);
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
float64x4 i_zip2_lo(const float64x4& a, const float64x4& b)
{
    return _mm256_unpacklo_pd(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
float64<8> i_zip2_lo(const float64<8>& a, const float64<8>& b)
{
    return _mm512_unpacklo_pd(a.native(), b.native());
}
#endif

template<unsigned N> SIMDPP_INL
float64<N> i_zip2_lo(const float64<N>& a, const float64<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(float64<N>, i_zip2_lo, a, b)
}


} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif