/*  Copyright (C) 2013-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_MEM_UNPACK_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_MEM_UNPACK_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/width.h>
#include <simdpp/detail/insn/shuffle128.h>
#include <simdpp/detail/insn/zip128.h>
#include <simdpp/core/align.h>
#include <simdpp/core/splat_n.h>
#include <simdpp/core/make_shuffle_bytes_mask.h>
#include <simdpp/core/shuffle1.h>
#include <simdpp/core/shuffle2.h>
#include <simdpp/core/transpose.h>
#include <simdpp/core/unzip_hi.h>
#include <simdpp/core/unzip_lo.h>
#include <simdpp/core/zip_hi.h>
#include <simdpp/core/zip_lo.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

/** Concatenates @a a and @a b and stores the elements of the resulting array
    as follows:
     * every (2n)-th element is stored to @a a
     * every (2n+1)-th element is stored to @a b

    n = [0, <number of elements in vector> - 1]
*/
template<class V> SIMDPP_INL
void mem_unpack2(any_vec<16,V>& qa, any_vec<16,V>& qb)
{
    V a = qa.wrapped();
    V b = qb.wrapped();

    qa.wrapped() = unzip128_lo(a, b);
    qb.wrapped() = unzip128_hi(a, b);
}

template<class V> SIMDPP_INL
void mem_unpack2(any_vec<32,V>& qa, any_vec<32,V>& qb)
{
    V a = qa.wrapped();
    V b = qb.wrapped();

    V c1 = shuffle1_128<0,0>(a, b);
    V c2 = shuffle1_128<1,1>(a, b);
    qa.wrapped() = unzip128_lo(c1, c2);
    qb.wrapped() = unzip128_hi(c1, c2);
}

#if SIMDPP_USE_AVX512F
template<class V> SIMDPP_INL
void mem_unpack2(any_vec<64,V>& qa, any_vec<64,V>& qb)
{
    V a = qa.wrapped();
    V b = qb.wrapped();

    V c1 = shuffle2_128<0,2,0,2>(a, b);
    V c2 = shuffle2_128<1,3,1,3>(a, b);
    qa.wrapped() = unzip128_lo(c1, c2);
    qb.wrapped() = unzip128_hi(c1, c2);
}
#endif
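
/*  Usage sketch (illustrative only; not compiled as part of this header).
    Assuming a target where 16-byte vectors are available and a hypothetical
    pointer p to bytes stored interleaved as x0,y0,x1,y1,...:

        uint8<16> a = load(p);      // [ x0,y0,x1,y1, ..., x7,y7   ]
        uint8<16> b = load(p + 16); // [ x8,y8,x9,y9, ..., x15,y15 ]
        mem_unpack2(a, b);
        // a == [ x0 .. x15 ], b == [ y0 .. y15 ]
*/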

/** Generic implementation of mem_unpack3. The 128-bit lanes are processed
    independently.
*/
template<class T> SIMDPP_INL
void v_mem_unpack3_impl8_128(T& a, T& b, T& c)
{
#if SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // [a0, b0, c0, a1, b1, c1, a2, b2, c2, a3, b3, c3, a4, b4, c4, a5 ]
    // [b5, c5, a6, b6, c6, a7, b7, c7, a8, b8, c8, a9, b9, c9, a10,b10]
    // [c10,a11,b11,c11,a12,b12,c12,a13,b13,c13,a14,b14,c14,a15,b15,c15]
    T mask1 = make_shuffle_bytes16_mask< 1, 4, 7, 10, 13,16+0,16+3,16+6,
                                        16+9,16+12,16+15, 2, 5, 8, 11, 14>(mask1);
    T a1, b1, c1;
    a1 = shuffle_bytes16(c, a, mask1);
    b1 = shuffle_bytes16(a, b, mask1);
    c1 = shuffle_bytes16(b, c, mask1);
    // [a11,a12,a13,a14,a15,a0, a1, a2, a3, a4, a5, b11,b12,b13,b14,b15]
    // [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10,c0, c1, c2, c3, c4 ]
    // [c5, c6, c7, c8, c9, c10,c11,c12,c13,c14,c15,a6, a7, a8, a9, a10]
    T a2, b2, c2;
    T mask2 = make_uint(0xff);
    mask2 = move16_l<5>(mask2);

    a2 = blend(a1, c1, mask2);
    b2 = blend(b1, a1, mask2);
    c2 = blend(c1, b1, mask2);
    // [a11..a15,a0..a10]
    // [b0..b15]
    // [c5..c15,c0..c4]
    a = align16<5>(a2, a2);
    b = b2;
    c = align16<11>(c2, c2);
#else
    typename same_width<T>::u8 t0, t1, t2, t3;
    t0 = a;
    t1 = align16<12>(a, b);
    t2 = align16<8>(b, c);
    t3 = move16_l<4>(c);
    // [a0, b0, c0, a1, b1, c1, a2, b2, c2, a3, b3, c3, ...]
    // [a4, b4, c4, a5, b5, c5, a6, b6, c6, a7, b7, c7, ...]
    // [a8, b8, c8, a9, b9, c9, a10,b10,c10,a11,b11,c11, ...]
    // [a12,b12,c12,a13,b13,c13,a14,b14,c14,a15,b15,c15, ...]
    typename same_width<T>::u16 b0, b1, b2, b3;
    b0 = zip16_lo(t0, t1);
    b1 = zip16_lo(t2, t3);
    b2 = zip16_hi(t0, t1);
    b3 = zip16_hi(t2, t3);
    // [a0, a4, b0, b4, c0, c4, a1, a5, b1, b5, c1, c5, a2, a6, b2, b6 ]
    // [a8, a12,b8, b12,c8, c12,a9, a13,b9, b13,c9, c13,a10,a14,b10,b14]
    // [c2, c6, a3, a7, b3, b7, c3, c7, ... ]
    // [c10,c14,a11,a15,b11,b15,c11,c15,... ]
    typename same_width<T>::u8 u0, u1, u2;
    u0 = zip8_lo(b0, b1);
    u1 = zip8_hi(b0, b1);
    u2 = zip8_lo(b2, b3);
    // [a0, a4, a8, a12,b0, b4, b8, b12, c0, c4, c8, c12, a1, a5, a9, a13 ]
    // [b1, b5, b9, b13,c1, c5, c9, c13, a2, a6, a10,a14, b2, b6, b10,b14 ]
    // [c2, c6, c10,c14,a3, a7, a11,a15, b3, b7, b11,b15, c3, c7, c11,c15 ]
    t0 = u0;
    t1 = align16<12>(u0, u1);
    t2 = align16<8>(u1, u2);
    t3 = move16_l<4>(u2);
    // [a0, a4, a8, a12,b0, b4, b8, b12, c0, c4, c8, c12, ...]
    // [a1, a5, a9, a13,b1, b5, b9, b13, c1, c5, c9, c13, ...]
    // [a2, a6, a10,a14,b2, b6, b10,b14, c2, c6, c10,c14, ...]
    // [a3, a7, a11,a15,b3, b7, b11,b15, c3, c7, c11,c15, ...]
    b0 = zip16_lo(t0, t1);
    b1 = zip16_lo(t2, t3);
    b2 = zip16_hi(t0, t1);
    b3 = zip16_hi(t2, t3);
    // [a0, a1, a4, a5, a8, a9, a12,a13,b0, b1, b4, b5, b8, b9, b12,b13 ]
    // [a2, a3, a6, a7, a10,a11,a14,a15,b2, b3, b6, b7, b10,b11,b14,b15 ]
    // [c0, c1, c4, c5, c8, c9, c12,c13, ... ]
    // [c2, c3, c6, c7, c10,c11,c14,c15, ... ]
    a = zip8_lo(b0, b1);
    b = zip8_hi(b0, b1);
    c = zip8_lo(b2, b3);
#endif
}

template<class T> SIMDPP_INL
void v_mem_unpack3_impl16_128(T& a, T& b, T& c)
{
#if SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // [a0,b0,c0,a1,b1,c1,a2,b2]
    // [c2,a3,b3,c3,a4,b4,c4,a5]
    // [b5,c5,a6,b6,c6,a7,b7,c7]
    T mask1 = make_shuffle_bytes16_mask<0,3,6,8+1,8+4,8+7,8+2,8+5>(mask1);
    T a1, b1, c1;
    a1 = shuffle_bytes16(a, b, mask1);
    c1 = shuffle_bytes16(b, c, mask1);
    b1 = shuffle_bytes16(c, a, mask1);
    // [a0,a1,a2,a3,a4,a5,b3,b4]
    // [c2,c3,c4,c5,c6,c7,a6,a7]
    // [b5,b6,b7,b0,b1,b2,c0,c1]
    T a2, b2, c2;
    T mask2 = make_uint(0xffff);
    mask2 = move8_l<2>(mask2);

    a2 = blend(a1, c1, mask2);
    b2 = blend(b1, a1, mask2);
    c2 = blend(c1, b1, mask2);
    // [a0..a7]
    // [b5..b7,b0..b4]
    // [c2..c7,c0,c1]
    a = a2;
    b = align8<3>(b2, b2);
    c = align8<6>(c2, c2);
#else
    T t0, t1, t2, t3;
    t0 = a;
    t1 = align8<6>(a, b);
    t2 = align8<4>(b, c);
    t3 = move8_l<2>(c);
    // [a0,b0,c0,a1,b1,c1, ... ]
    // [a2,b2,c2,a3,b3,c3, ... ]
    // [a4,b4,c4,a5,b5,c5, ... ]
    // [a6,b6,c6,a7,b7,c7, ... ]
    typename same_width<T>::u32 b0, b1, b2, b3;
    b0 = zip8_lo(t0, t1);
    b1 = zip8_lo(t2, t3);
    b2 = zip8_hi(t0, t1);
    b3 = zip8_hi(t2, t3);
    // [a0,a2,b0,b2,c0,c2,a1,a3]
    // [a4,a6,b4,b6,c4,c6,a5,a7]
    // [b1,b3,c1,c3, ... ]
    // [b5,b7,c5,c7, ... ]
    typename same_width<T>::u64 c0, c1, c2;
    c0 = zip4_lo(b0, b1);
    c1 = zip4_hi(b0, b1);
    c2 = zip4_lo(b2, b3);
    // [a0,a2,a4,a6,b0,b2,b4,b6]
    // [c0,c2,c4,c6,a1,a3,a5,a7]
    // [b1,b3,b5,b7,c1,c3,c5,c7]
    t0 = c0;
    t1 = shuffle1<1,0>(c0, c1);
    t2 = splat2<1>(c1);
    t3 = c2;
    // [a0,a2,a4,a6,b0,b2,b4,b6]
    // [b0,b2,b4,b6,c0,c2,c4,c6]
    // [a1,a3,a5,a7,a1,a3,a5,a7]
    // [b1,b3,b5,b7,c1,c3,c5,c7]
    a = zip8_lo(t0, t2);
    b = zip8_lo(t1, t3);
    c = zip8_hi(t1, t3);
#endif
}

template<class T> SIMDPP_INL
void v_mem_unpack3_impl32_128(T& a, T& b, T& c)
{
#if SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    using U = typename T::uint_vector_type;

    // [a0,b0,c0,a1]
    // [b1,c1,a2,b2]
    // [c2,a3,b3,c3]
    U mask1 = make_shuffle_bytes16_mask<0,3,4+2,4+1>(mask1);
    T a1, b1, c1;
    a1 = shuffle_bytes16(a, b, mask1);
    b1 = shuffle_bytes16(b, c, mask1);
    c1 = shuffle_bytes16(c, a, mask1);
    // [a0,a1,a2,c1]
    // [b1,b2,b3,a3]
    // [c2,c3,c0,b0]
    T a2, b2, c2;
    U mask2 = make_uint(0xffffffff);
    mask2 = move4_l<1>(mask2);

    a2 = blend(a1, b1, mask2);
    b2 = blend(b1, c1, mask2);
    c2 = blend(c1, a1, mask2);
    // [a0,a1,a2,a3]
    // [b1,b2,b3,b0]
    // [c2,c3,c0,c1]
    a = a2;
    b = align4<3>(b2, b2);
    c = align4<2>(c2, c2);
#else
    T t11, t12, t21, t22, t31, t32;
    // [a0,b0,c0,a1]
    // [b1,c1,a2,b2]
    // [c2,a3,b3,c3]
    t11 = a;
    t12 = shuffle2<0,1,2,3>(c, b);
    t21 = shuffle2<0,1,0,1>(a, b);
    t22 = shuffle2<2,3,2,3>(b, c);
    t31 = shuffle2<2,3,0,1>(a, b);
    t32 = c;
    // [a0,b0,c0,a1]
    // [c2,a3,a2,b2]
    // [a0,b0,b1,c1]
    // [a2,b2,b3,c3]
    // [c0,a1,b1,c1]
    // [c2,a3,b3,c3]
    a = shuffle2<0,3,2,1>(t11, t12);
    b = shuffle2<1,2,1,2>(t21, t22);
    c = shuffle2<0,3,0,3>(t31, t32);
#endif
}

template<class T> SIMDPP_INL
void v_mem_unpack3_impl64_128(T& a, T& b, T& c)
{
    T d0, d1, d2;
    d0 = shuffle1<0,1>(a, b);
    d1 = shuffle1<1,0>(a, c);
    d2 = shuffle1<0,1>(b, c);
    a = d0; b = d1; c = d2;
}

template<class V> SIMDPP_INL
void v_mem_unpack3_shuffle128(any_vec<16,V>& qa, any_vec<16,V>& qb, any_vec<16,V>& qc)
{
    (void) qa; (void) qb; (void) qc;
}

template<class V> SIMDPP_INL
void v_mem_unpack3_shuffle128(any_vec<32,V>& qa, any_vec<32,V>& qb, any_vec<32,V>& qc)
{
    // shuffle the vectors so that the lower halves contain the first 3 128-bit
    // items (a and lower half of b) and the higher halves contain the rest

    V a0, b0, c0, a1, b1, c1;

    a0 = qa.wrapped(); b0 = qb.wrapped(); c0 = qc.wrapped();

    a1 = shuffle1_128<0,1>(a0, b0);
    b1 = shuffle1_128<1,0>(a0, c0);
    c1 = shuffle1_128<0,1>(b0, c0);

    qa.wrapped() = a1; qb.wrapped() = b1; qc.wrapped() = c1;
}

#if SIMDPP_USE_AVX512F
template<class V> SIMDPP_INL
void v_mem_unpack3_shuffle128(any_vec<64,V>& qa, any_vec<64,V>& qb, any_vec<64,V>& qc)
{
    V a, b, c; // TODO: optimize. Using full-vector shuffle may be faster
    a = qa.wrapped(); b = qb.wrapped(); c = qc.wrapped();

    V t11, t12, t21, t22, t31, t32;
    // [a0,b0,c0,a1]
    // [b1,c1,a2,b2]
    // [c2,a3,b3,c3]
    t11 = a;
    t12 = shuffle2_128<0,1,2,3>(c, b);
    t21 = shuffle2_128<0,1,0,1>(a, b);
    t22 = shuffle2_128<2,3,2,3>(b, c);
    t31 = shuffle2_128<2,3,0,1>(a, b);
    t32 = c;
    // [a0,b0,c0,a1]
    // [c2,a3,a2,b2]
    // [a0,b0,b1,c1]
    // [a2,b2,b3,c3]
    // [c0,a1,b1,c1]
    // [c2,a3,b3,c3]
    a = shuffle2_128<0,3,2,1>(t11, t12);
    b = shuffle2_128<1,2,1,2>(t21, t22);
    c = shuffle2_128<0,3,0,3>(t31, t32);

    qa.wrapped() = a; qb.wrapped() = b; qc.wrapped() = c;
}
#endif

/** Concatenates @a a, @a b and @a c and stores the elements of the resulting
    array as follows:
     * every (3n)-th element is stored to @a a
     * every (3n+1)-th element is stored to @a b
     * every (3n+2)-th element is stored to @a c

    n = [0, <number of elements in vector> - 1]
*/
template<unsigned N> SIMDPP_INL
void mem_unpack3(uint8<N>& a, uint8<N>& b, uint8<N>& c)
{
    v_mem_unpack3_shuffle128(a, b, c);
    v_mem_unpack3_impl8_128(a, b, c);
}

template<unsigned N> SIMDPP_INL
void mem_unpack3(uint16<N>& a, uint16<N>& b, uint16<N>& c)
{
    v_mem_unpack3_shuffle128(a, b, c);
    v_mem_unpack3_impl16_128(a, b, c);
}

template<unsigned N> SIMDPP_INL
void mem_unpack3(uint32<N>& a, uint32<N>& b, uint32<N>& c)
{
    v_mem_unpack3_shuffle128(a, b, c);
    v_mem_unpack3_impl32_128(a, b, c);
}

template<unsigned N> SIMDPP_INL
void mem_unpack3(uint64<N>& a, uint64<N>& b, uint64<N>& c)
{
    v_mem_unpack3_shuffle128(a, b, c);
    v_mem_unpack3_impl64_128(a, b, c);
}

template<unsigned N> SIMDPP_INL
void mem_unpack3(float32<N>& a, float32<N>& b, float32<N>& c)
{
    v_mem_unpack3_shuffle128(a, b, c);
    v_mem_unpack3_impl32_128(a, b, c);
}

template<unsigned N> SIMDPP_INL
void mem_unpack3(float64<N>& a, float64<N>& b, float64<N>& c)
{
    v_mem_unpack3_shuffle128(a, b, c);
    v_mem_unpack3_impl64_128(a, b, c);
}
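
/*  Usage sketch (illustrative only; not compiled as part of this header):
    de-interleaving 16 packed RGB pixels, where p is a hypothetical pointer
    to 48 bytes laid out as r0,g0,b0,r1,g1,b1,...

        uint8<16> r = load(p);      // [ r0,g0,b0, r1,g1,b1, ..., r5  ]
        uint8<16> g = load(p + 16); // [ g5,b5, r6,g6,b6,    ..., g10 ]
        uint8<16> b = load(p + 32); // [ b10, r11,g11,b11,   ..., b15 ]
        mem_unpack3(r, g, b);
        // r == [ r0 .. r15 ], g == [ g0 .. g15 ], b == [ b0 .. b15 ]
*/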

/** Generic implementation of mem_unpack4. The 256-bit version applies 128-bit
    operations to each half of each vector separately.
*/
template<class T> SIMDPP_INL
void v_mem_unpack4_impl8_128(T& a, T& b, T& c, T& d)
{
#if SIMDPP_USE_SSSE3 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // TODO: optimize for Altivec and MSA
    typename same_width<T>::u32 b0, b1, b2, b3;
    b0 = transpose_inplace(a);
    b1 = transpose_inplace(b);
    b2 = transpose_inplace(c);
    b3 = transpose_inplace(d);

    transpose4(b0, b1, b2, b3);
    a = b0; b = b1; c = b2; d = b3;
#else
    // [a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3 ]
    // [a4, b4, c4, d4, a5, b5, c5, d5, a6, b6, c6, d6, a7, b7, c7, d7 ]
    // [a8, b8, c8, d8, a9, b9, c9, d9, a10,b10,c10,d10,a11,b11,c11,d11]
    // [a12,b12,c12,d12,a13,b13,c13,d13,a14,b14,c14,d14,a15,b15,c15,d15]
    T b0, b1, b2, b3, c0, c1, c2, c3;
    b0 = zip16_lo(a, b);
    b1 = zip16_hi(a, b);
    b2 = zip16_lo(c, d);
    b3 = zip16_hi(c, d);
    // [a0, a4, b0, b4, c0, c4, d0, d4, a1, a5, b1, b5, c1, c5, d1, d5 ]
    // [a2, a6, b2, b6, c2, c6, d2, d6, a3, a7, b3, b7, c3, c7, d3, d7 ]
    // [a8, a12,b8, b12,c8, c12,d8, d12,a9, a13,b9, b13,c9, c13,d9, d13]
    // [a10,a14,b10,b14,c10,c14,d10,d14,a11,a15,b11,b15,c11,c15,d11,d15]
    c0 = zip16_lo(b0, b1);
    c1 = zip16_hi(b0, b1);
    c2 = zip16_lo(b2, b3);
    c3 = zip16_hi(b2, b3);
    // [a0, a2, a4, a6, b0, b2, b4, b6, c0, c2, c4, c6, d0, d2, d4, d6 ]
    // [a1, a3, a5, a7, b1, b3, b5, b7, c1, c3, c5, c7, d1, d3, d5, d7 ]
    // [a8, a10,a12,a14,b8, b10,b12,b14,c8, c10,c12,c14,d8, d10,d12,d14]
    // [a9, a11,a13,a15,b9, b11,b13,b15,c9, c11,c13,c15,d9, d11,d13,d15]
    typename same_width<T>::u64 d0, d1, d2, d3;
    d0 = zip16_lo(c0, c1);
    d1 = zip16_hi(c0, c1);
    d2 = zip16_lo(c2, c3);
    d3 = zip16_hi(c2, c3);
    // [a0 .. a7,  b0 .. b7  ]
    // [c0 .. c7,  d0 .. d7  ]
    // [a8 .. a15, b8 .. b15 ]
    // [c8 .. c15, d8 .. d15 ]
    a = zip2_lo(d0, d2);
    b = zip2_hi(d0, d2);
    c = zip2_lo(d1, d3);
    d = zip2_hi(d1, d3);
#endif
}

template<class T> SIMDPP_INL
void v_mem_unpack4_impl16_128(T& a, T& b, T& c, T& d)
{
    // [a0,b0,c0,d0,a1,b1,c1,d1]
    // [a2,b2,c2,d2,a3,b3,c3,d3]
    // [a4,b4,c4,d4,a5,b5,c5,d5]
    // [a6,b6,c6,d6,a7,b7,c7,d7]
    typename same_width<T>::u16 t0, t1, t2, t3;
    t0 = zip8_lo(a, b);
    t1 = zip8_hi(a, b);
    t2 = zip8_lo(c, d);
    t3 = zip8_hi(c, d);
    // [a0,a2,b0,b2,c0,c2,d0,d2]
    // [a1,a3,b1,b3,c1,c3,d1,d3]
    // [a4,a6,b4,b6,c4,c6,d4,d6]
    // [a5,a7,b5,b7,c5,c7,d5,d7]
    typename same_width<T>::u64 u0, u1, u2, u3;
    u0 = zip8_lo(t0, t1);
    u1 = zip8_hi(t0, t1);
    u2 = zip8_lo(t2, t3);
    u3 = zip8_hi(t2, t3);
    // [a0,a1,a2,a3,b0,b1,b2,b3]
    // [c0,c1,c2,c3,d0,d1,d2,d3]
    // [a4,a5,a6,a7,b4,b5,b6,b7]
    // [c4,c5,c6,c7,d4,d5,d6,d7]
    a = zip2_lo(u0, u2);
    b = zip2_hi(u0, u2);
    c = zip2_lo(u1, u3);
    d = zip2_hi(u1, u3);
}

template<class T> SIMDPP_INL
void v_mem_unpack4_impl32_128(T& a, T& b, T& c, T& d)
{
    transpose4(a, b, c, d);
}

template<class T> SIMDPP_INL
void v_mem_unpack4_impl64_128(T& a, T& b, T& c, T& d)
{
    transpose2(a, c);
    transpose2(b, d);
    T t;
    t = b;
    b = c;
    c = t;
}

template<class V> SIMDPP_INL
void v_mem_unpack4_shuffle128(any_vec<16,V>& qa, any_vec<16,V>& qb,
                              any_vec<16,V>& qc, any_vec<16,V>& qd)
{
    (void) qa; (void) qb; (void) qc; (void) qd;
}

template<class V> SIMDPP_INL
void v_mem_unpack4_shuffle128(any_vec<32,V>& qa, any_vec<32,V>& qb,
                              any_vec<32,V>& qc, any_vec<32,V>& qd)
{
    V a0, b0, c0, d0, a1, b1, c1, d1;

    a0 = qa.wrapped(); b0 = qb.wrapped(); c0 = qc.wrapped(); d0 = qd.wrapped();

    a1 = shuffle1_128<0,0>(a0, c0);
    b1 = shuffle1_128<1,1>(a0, c0);
    c1 = shuffle1_128<0,0>(b0, d0);
    d1 = shuffle1_128<1,1>(b0, d0);

    qa.wrapped() = a1; qb.wrapped() = b1; qc.wrapped() = c1; qd.wrapped() = d1;
}

#if SIMDPP_USE_AVX512F
template<class V> SIMDPP_INL
void v_mem_unpack4_shuffle128(any_vec<64,V>& qa, any_vec<64,V>& qb,
                              any_vec<64,V>& qc, any_vec<64,V>& qd)
{
    V a, b, c, d; // TODO: optimize. Using full-vector shuffle/permute will be faster

    a = qa.wrapped(); b = qb.wrapped(); c = qc.wrapped(); d = qd.wrapped();

    V t1, t2, t3, t4;
    // [a0,a1,a2,a3]
    // [b0,b1,b2,b3]
    // [c0,c1,c2,c3]
    // [d0,d1,d2,d3]
    t1 = shuffle2_128<0,2,0,2>(a, b);
    t2 = shuffle2_128<1,3,1,3>(a, b);
    t3 = shuffle2_128<0,2,0,2>(c, d);
    t4 = shuffle2_128<1,3,1,3>(c, d);
    // [a0,a2,b0,b2]
    // [a1,a3,b1,b3]
    // [c0,c2,d0,d2]
    // [c1,c3,d1,d3]
    a = shuffle2_128<0,2,0,2>(t1, t3);
    b = shuffle2_128<0,2,0,2>(t2, t4);
    c = shuffle2_128<1,3,1,3>(t1, t3);
    d = shuffle2_128<1,3,1,3>(t2, t4);
    // [a0,b0,c0,d0]
    // [a1,b1,c1,d1]
    // [a2,b2,c2,d2]
    // [a3,b3,c3,d3]

    qa.wrapped() = a; qb.wrapped() = b; qc.wrapped() = c; qd.wrapped() = d;
}
#endif

/** Concatenates @a a, @a b, @a c and @a d and stores the elements of the
    resulting array as follows:
     * every (4n)-th element is stored to @a a
     * every (4n+1)-th element is stored to @a b
     * every (4n+2)-th element is stored to @a c
     * every (4n+3)-th element is stored to @a d

    n = [0, <number of elements in vector> - 1]
*/
// @icost{SSE2, SSE3, 16}
// @icost{SSSE3, SSE4.1, 12}
template<unsigned N> SIMDPP_INL
void mem_unpack4(uint8<N>& a, uint8<N>& b, uint8<N>& c, uint8<N>& d)
{
    v_mem_unpack4_shuffle128(a, b, c, d);
    v_mem_unpack4_impl8_128(a, b, c, d);
}

template<unsigned N> SIMDPP_INL
void mem_unpack4(uint16<N>& a, uint16<N>& b, uint16<N>& c, uint16<N>& d)
{
    v_mem_unpack4_shuffle128(a, b, c, d);
    v_mem_unpack4_impl16_128(a, b, c, d);
}

template<unsigned N> SIMDPP_INL
void mem_unpack4(uint32<N>& a, uint32<N>& b, uint32<N>& c, uint32<N>& d)
{
    v_mem_unpack4_shuffle128(a, b, c, d);
    v_mem_unpack4_impl32_128(a, b, c, d);
}

template<unsigned N> SIMDPP_INL
void mem_unpack4(uint64<N>& a, uint64<N>& b, uint64<N>& c, uint64<N>& d)
{
    v_mem_unpack4_shuffle128(a, b, c, d);
    v_mem_unpack4_impl64_128(a, b, c, d);
}

template<unsigned N> SIMDPP_INL
void mem_unpack4(float32<N>& a, float32<N>& b, float32<N>& c, float32<N>& d)
{
    v_mem_unpack4_shuffle128(a, b, c, d);
    v_mem_unpack4_impl32_128(a, b, c, d);
}

template<unsigned N> SIMDPP_INL
void mem_unpack4(float64<N>& a, float64<N>& b, float64<N>& c, float64<N>& d)
{
    v_mem_unpack4_shuffle128(a, b, c, d);
    v_mem_unpack4_impl64_128(a, b, c, d);
}
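
/*  Usage sketch (illustrative only; not compiled as part of this header):
    de-interleaving 16 packed RGBA pixels, where p is a hypothetical pointer
    to 64 bytes laid out as r0,g0,b0,a0,r1,g1,b1,a1,...

        uint8<16> r = load(p);      // [ r0,g0,b0,a0, ..., r3,g3,b3,a3 ]
        uint8<16> g = load(p + 16); // [ r4,g4,b4,a4, ..., r7,g7,b7,a7 ]
        uint8<16> b = load(p + 32);
        uint8<16> a = load(p + 48);
        mem_unpack4(r, g, b, a);
        // r == [ r0 .. r15 ], g == [ g0 .. g15 ],
        // b == [ b0 .. b15 ], a == [ a0 .. a15 ]
*/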

/** Concatenates the given vectors and stores the elements of the resulting
    array as follows:
     * every (3n)-th element of the first 48 elements is stored to @a a
     * every (3n+1)-th element of the first 48 elements is stored to @a b
     * every (3n+2)-th element of the first 48 elements is stored to @a c
     * every (3n)-th element of the last 48 elements is stored to @a d
     * every (3n+1)-th element of the last 48 elements is stored to @a e
     * every (3n+2)-th element of the last 48 elements is stored to @a f

    n = [0, <number of elements in vector> - 1]
*/
static SIMDPP_INL
void mem_unpack6(uint8x16& a, uint8x16& b, uint8x16& c,
                 uint8x16& d, uint8x16& e, uint8x16& f)
{
    uint8x16 t0, t1, t2, t3, t4, t5;
    t0 = zip16_lo(a, d);
    t1 = zip16_hi(a, d);
    t2 = zip16_lo(b, e);
    t3 = zip16_hi(b, e);
    t4 = zip16_lo(c, f);
    t5 = zip16_hi(c, f);

    uint8x16 u0, u1, u2, u3, u4, u5;
    u0 = zip16_lo(t0, t3);
    u1 = zip16_hi(t0, t3);
    u2 = zip16_lo(t1, t4);
    u3 = zip16_hi(t1, t4);
    u4 = zip16_lo(t2, t5);
    u5 = zip16_hi(t2, t5);

    t0 = zip16_lo(u0, u3);
    t1 = zip16_hi(u0, u3);
    t2 = zip16_lo(u1, u4);
    t3 = zip16_hi(u1, u4);
    t4 = zip16_lo(u2, u5);
    t5 = zip16_hi(u2, u5);

    u0 = zip16_lo(t0, t3);
    u1 = zip16_hi(t0, t3);
    u2 = zip16_lo(t1, t4);
    u3 = zip16_hi(t1, t4);
    u4 = zip16_lo(t2, t5);
    u5 = zip16_hi(t2, t5);

    t0 = zip16_lo(u0, u3);
    t1 = zip16_hi(u0, u3);
    t2 = zip16_lo(u1, u4);
    t3 = zip16_hi(u1, u4);
    t4 = zip16_lo(u2, u5);
    t5 = zip16_hi(u2, u5);

    a = zip16_lo(t0, t3);
    b = zip16_hi(t0, t3);
    c = zip16_lo(t1, t4);
    d = zip16_hi(t1, t4);
    e = zip16_lo(t2, t5);
    f = zip16_hi(t2, t5);
}

static SIMDPP_INL
void mem_unpack6(uint16x8& a, uint16x8& b, uint16x8& c,
                 uint16x8& d, uint16x8& e, uint16x8& f)
{
    uint16x8 t0, t1, t2, t3, t4, t5;
    t0 = zip8_lo(a, d);
    t1 = zip8_hi(a, d);
    t2 = zip8_lo(b, e);
    t3 = zip8_hi(b, e);
    t4 = zip8_lo(c, f);
    t5 = zip8_hi(c, f);

    uint16x8 u0, u1, u2, u3, u4, u5;
    u0 = zip8_lo(t0, t3);
    u1 = zip8_hi(t0, t3);
    u2 = zip8_lo(t1, t4);
    u3 = zip8_hi(t1, t4);
    u4 = zip8_lo(t2, t5);
    u5 = zip8_hi(t2, t5);

    a = zip8_lo(u0, u3);
    b = zip8_hi(u0, u3);
    c = zip8_lo(u1, u4);
    d = zip8_hi(u1, u4);
    e = zip8_lo(u2, u5);
    f = zip8_hi(u2, u5);
}
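
/*  Usage sketch (illustrative only; not compiled as part of this header).
    mem_unpack6 performs two independent 3-way de-interleaves: one over the
    data held in the first three vectors, one over the data held in the last
    three. For 32 packed RGB pixels loaded into hypothetical vectors v0..v5
    (16 consecutive bytes each, starting at byte offsets 0, 16, ..., 80):

        mem_unpack6(v0, v1, v2, v3, v4, v5);
        // v0 == [ r0  .. r15 ], v1 == [ g0  .. g15 ], v2 == [ b0  .. b15 ]
        // v3 == [ r16 .. r31 ], v4 == [ g16 .. g31 ], v5 == [ b16 .. b31 ]
*/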

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif