/*  Copyright (C) 2013-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_MEM_PACK_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_MEM_PACK_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/width.h>
#include <simdpp/detail/insn/shuffle128.h>
#include <simdpp/core/bit_andnot.h>
#include <simdpp/core/bit_or.h>
#include <simdpp/core/move_l.h>
#include <simdpp/core/move_r.h>
#include <simdpp/core/permute4.h>
#include <simdpp/core/shuffle2.h>
#include <simdpp/core/transpose.h>
#include <simdpp/core/zip_hi.h>
#include <simdpp/core/zip_lo.h>
#include <simdpp/detail/insn/zip128.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {
/** Interleaves the elements of @a a and @a b in such a way that:
     * every (2n)-th element comes from @a a
     * every (2n+1)-th element comes from @a b

    n = [0, <number of elements in vector> - 1]
*/
template<class V> SIMDPP_INL
void mem_pack2(any_vec<16,V>& qa, any_vec<16,V>& qb)
{
    V a = qa.wrapped();
    V b = qb.wrapped();

    qa.wrapped() = zip128_lo(a, b);
    qb.wrapped() = zip128_hi(a, b);
}

template<class V> SIMDPP_INL
void mem_pack2(any_vec<32,V>& qa, any_vec<32,V>& qb)
{
    V a = qa.wrapped();
    V b = qb.wrapped();

    V c1, c2;
    c1 = zip128_lo(a, b);
    c2 = zip128_hi(a, b);
    qa.wrapped() = shuffle1_128<0,0>(c1, c2);
    qb.wrapped() = shuffle1_128<1,1>(c1, c2);
}

#if SIMDPP_USE_AVX512F || SIMDPP_USE_AVX512BW
template<class V> SIMDPP_INL
void mem_pack2(any_vec<64,V>& qa, any_vec<64,V>& qb)
{
    V a = qa.wrapped();
    V b = qb.wrapped();

    V c1, c2, d1, d2;
    c1 = zip128_lo(a, b);
    c2 = zip128_hi(a, b);
    d1 = shuffle2_128<0,1,0,1>(c1, c2);
    d2 = shuffle2_128<2,3,2,3>(c1, c2);
    qa.wrapped() = permute4_128<0,2,1,3>(d1); // FIXME: optimize
    qb.wrapped() = permute4_128<0,2,1,3>(d2);
}
#endif
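
/*  Illustrative result of mem_pack2 for 16-byte vectors (a sketch of a typical
    call site; assumed usage, not a documented guarantee of this internal
    header):

        uint8<16> a = ...;  // a0, a1, ..., a15
        uint8<16> b = ...;  // b0, b1, ..., b15
        mem_pack2(a, b);
        // a == [ a0, b0, a1, b1, ..., a7,  b7  ]
        // b == [ a8, b8, a9, b9, ..., a15, b15 ]
        // storing a followed by b yields the fully interleaved stream
*/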

/** Generic implementation of mem_pack3. The 256-bit version applies 128-bit
    operations to each half of each vector separately.
*/
template<class T> SIMDPP_INL
void v_mem_pack3_impl8_128(T& a, T& b, T& c)
{
#if SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    using U = typename T::uint_vector_type;

    T a1, b1, c1;
    a1 = align16<11>(a, a);
    b1 = b;
    c1 = align16<5>(c, c);

    // [a11..a15,a0..a10]
    // [b0..b15]
    // [c5..c15,c0..c4]
    U mask1 = make_uint(0xff);
    mask1 = move16_l<5>(mask1);

    T a2, b2, c2;
    a2 = blend(a1, b1, mask1);
    b2 = blend(b1, c1, mask1);
    c2 = blend(c1, a1, mask1);
    // [a11,a12,a13,a14,a15,a0, a1, a2, a3, a4, a5, b11,b12,b13,b14,b15]
    // [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10,c0, c1, c2, c3, c4 ]
    // [c5, c6, c7, c8, c9, c10,c11,c12,c13,c14,c15,a6, a7, a8, a9, a10]
    U mask2 = make_shuffle_bytes16_mask<5, 16+0, 16+11,
                                        6, 16+1, 16+12,
                                        7, 16+2, 16+13,
                                        8, 16+3, 16+14,
                                        9, 16+4, 16+15,
                                        10>(mask2);
    a = shuffle_bytes16(a2, b2, mask2);
    b = shuffle_bytes16(b2, c2, mask2);
    c = shuffle_bytes16(c2, a2, mask2);

    // [a0, b0, c0, a1, b1, c1, a2, b2, c2, a3, b3, c3, a4, b4, c4, a5 ]
    // [b5, c5, a6, b6, c6, a7, b7, c7, a8, b8, c8, a9, b9, c9, a10,b10]
    // [c10,a11,b11,c11,a12,b12,c12,a13,b13,c13,a14,b14,c14,a15,b15,c15]
#else
    // either uint16x8 or uint16x16, other entries likewise
    using w_b16 = typename same_width<T>::u16;
    using w_b32 = typename same_width<T>::u32;
    using w_b8 = T;

    w_b16 t0, t1, t2, t3;
    t0 = zip16_lo(a, b);
    t1 = zip16_hi(a, b);
    t2 = zip16_lo(c, (w_b8) make_zero());
    t3 = zip16_hi(c, (w_b8) make_zero());

    w_b8 u0, u1, u2, u3;
    u0 = zip8_lo(t0, t2);
    u1 = zip8_hi(t0, t2);
    u2 = zip8_lo(t1, t3);
    u3 = zip8_hi(t1, t3);

    // [a0, b0, c0, 0, a1, b1, c1, 0, a2, b2, c2, 0, a3, b3, c3, 0]
    // [a4, b4, c4, 0, a5, b5, c5, 0, a6, b6, c6, 0, a7, b7, c7, 0]
    // [a8, b8, c8, 0, a9, b9, c9, 0, a10,b10,c10,0, a11,b11,c11,0]
    // [a12,b12,c12,0, a13,b13,c13,0, a14,b14,c14,0, a15,b15,c15,0]
#if SIMDPP_USE_SSSE3
    // it's not worth using 4 different index vectors to shuffle the vectors
    // into place so that only bit_or is needed later
    w_b8 idx = make_uint(0, 1, 2, 4, 5, 6, 8, 9,
                         10, 12, 13, 14, 0xff, 0xff, 0xff, 0xff);
    u0 = permute_bytes16(u0, idx);
    u1 = permute_bytes16(u1, idx);
    u2 = permute_bytes16(u2, idx);
    u3 = permute_bytes16(u3, idx);
#else
    using w_u64 = typename same_width<T>::u64;

    // the following is still faster than a non-SIMD implementation
    w_b8 mask1 = make_uint(0xff, 0xff, 0xff, 0, 0, 0, 0, 0,
                           0xff, 0xff, 0xff, 0, 0, 0, 0, 0);
    w_u64 w0, w1, w2, w3;
    w0 = u0; w1 = u1; w2 = u2; w3 = u3;
    w0 = shift_r(w0, 8);
    w1 = shift_r(w1, 8);
    w2 = shift_r(w2, 8);
    w3 = shift_r(w3, 8);

    u0 = blend(u0, w0, mask1);
    u1 = blend(u1, w1, mask1);
    u2 = blend(u2, w2, mask1);
    u3 = blend(u3, w3, mask1);

    w_b8 mask2 = make_uint(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0,
                           0, 0, 0, 0, 0, 0, 0, 0);
    w_b8 x0, x1, x2, x3;
    x0 = move16_l<2>(u0);
    x1 = move16_l<2>(u1);
    x2 = move16_l<2>(u2);
    x3 = move16_l<2>(u3);

    u0 = blend(u0, x0, mask2);
    u1 = blend(u1, x1, mask2);
    u2 = blend(u2, x2, mask2);
    u3 = blend(u3, x3, mask2);
#endif
    // [a0, b0, c0, a1, b1, c1, a2, b2, c2, a3, b3, c3, 0,0,0,0]
    // [a4, b4, c4, a5, b5, c5, a6, b6, c6, a7, b7, c7, 0,0,0,0]
    // [a8, b8, c8, a9, b9, c9, a10,b10,c10,a11,b11,c11,0,0,0,0]
    // [a12,b12,c12,a13,b13,c13,a14,b14,c14,a15,b15,c15,0,0,0,0]
    w_b32 k0, k1, k2, k3, l0, l3;
    k0 = u0;
    k1 = u1;
    k2 = u2;
    k3 = u3;
    l0 = move4_r<3>(k1);
    l3 = move4_l<2>(k2);
    k3 = move4_r<1>(k3);
    a = bit_or(k0, l0);
    b = shuffle2<1,2,0,1>(k1, k2);
    c = bit_or(k3, l3);
#endif
}

template<class T> SIMDPP_INL
void v_mem_pack3_impl16_128(T& a, T& b, T& c)
{
#if SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    using U = typename T::uint_vector_type;

    // [a0..a7]
    // [b0..b7]
    // [c0..c7]
    T a1, b1, c1;
    a1 = a;
    b1 = align8<5>(b, b);
    c1 = align8<2>(c, c);

    // [a0..a7]
    // [b5..b7,b0..b4]
    // [c2..c7,c0,c1]
    T a2, b2, c2;
    U mask2 = make_uint(0xffff);
    mask2 = move8_l<2>(mask2);

    a2 = blend(a1, b1, mask2);
    b2 = blend(b1, c1, mask2);
    c2 = blend(c1, a1, mask2);

    // [a0,a1,a2,a3,a4,a5,b3,b4]
    // [b5,b6,b7,b0,b1,b2,c0,c1]
    // [c2,c3,c4,c5,c6,c7,a6,a7]
    U mask1 = make_shuffle_bytes16_mask<0, 8+3, 8+6,
                                        1, 8+4, 8+7,
                                        2, 8+5>(mask1);
    a = shuffle_bytes16(a2, b2, mask1);
    b = shuffle_bytes16(c2, a2, mask1);
    c = shuffle_bytes16(b2, c2, mask1);

    // [a0,b0,c0,a1,b1,c1,a2,b2]
    // [c2,a3,b3,c3,a4,b4,c4,a5]
    // [b5,c5,a6,b6,c6,a7,b7,c7]

#else
    // either uint16x8 or uint16x16, other entries likewise
    using w_b16 = T;
    using w_b32 = typename same_width<T>::u32;

    w_b32 t0, t1, t2, t3;
    t0 = zip8_lo(a, b);
    t1 = zip8_hi(a, b);
    t2 = zip8_lo(c, (w_b16) make_zero());
    t3 = zip8_hi(c, (w_b16) make_zero());

    w_b16 u0, u1, u2, u3;
    u0 = zip4_lo(t0, t2);
    u1 = zip4_hi(t0, t2);
    u2 = zip4_lo(t1, t3);
    u3 = zip4_hi(t1, t3);

    // [a0, b0, c0, 0, a1, b1, c1, 0 ]
    // [a2, b2, c2, 0, a3, b3, c3, 0 ]
    // [a4, b4, c4, 0, a5, b5, c5, 0 ]
    // [a6, b6, c6, 0, a7, b7, c7, 0 ]

#if SIMDPP_USE_SSSE3
    // it's not worth using 4 different index vectors to shuffle the vectors
    // into place so that only bit_or is needed later
    w_b16 idx = make_shuffle_bytes16_mask<0,1,2,4,5,6,-1,-1>(idx);
    u0 = permute_bytes16(u0, idx);
    u1 = permute_bytes16(u1, idx);
    u2 = permute_bytes16(u2, idx);
    u3 = permute_bytes16(u3, idx);

#else
    // the following is still faster than a non-SIMD implementation
    w_b16 mask2 = make_uint(0xffff, 0xffff, 0xffff, 0,
                            0, 0, 0, 0);
    u0 = blend(u0, move8_l<1>(u0), mask2);
    u1 = blend(u1, move8_l<1>(u1), mask2);
    u2 = blend(u2, move8_l<1>(u2), mask2);
    u3 = blend(u3, move8_l<1>(u3), mask2);
#endif
    // [a0, b0, c0, a1, b1, c1, 0, 0]
    // [a2, b2, c2, a3, b3, c3, 0, 0]
    // [a4, b4, c4, a5, b5, c5, 0, 0]
    // [a6, b6, c6, a7, b7, c7, 0, 0]
    w_b32 k0, k1, k2, k3, l0, l3;
    k0 = u0;
    k1 = u1;
    k2 = u2;
    k3 = u3;
    l0 = move4_r<3>(k1);
    l3 = move4_l<2>(k2);
    k3 = move4_r<1>(k3);
    a = bit_or(k0, l0);
    b = shuffle2<1,2,0,1>(k1, k2);
    c = bit_or(k3, l3);
#endif
}

template<class T> SIMDPP_INL
void v_mem_pack3_impl32_128(T& a, T& b, T& c)
{
#if SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    using U = typename T::uint_vector_type;

    // [a0,a1,a2,a3]
    // [b0,b1,b2,b3]
    // [c0,c1,c2,c3]
    T a1, b1, c1;
    a1 = a;
    b1 = align4<1>(b, b);
    c1 = align4<2>(c, c);

    // [a0,a1,a2,a3]
    // [b1,b2,b3,b0]
    // [c2,c3,c0,c1]
    T a2, b2, c2;
    U mask2 = make_uint(0xffffffff);
    mask2 = move4_l<1>(mask2);

    a2 = blend(a1, c1, mask2);
    b2 = blend(b1, a1, mask2);
    c2 = blend(c1, b1, mask2);
    // [a0,a1,a2,c1]
    // [b1,b2,b3,a3]
    // [c2,c3,c0,b0]
    U mask1 = make_shuffle_bytes16_mask<0,4+3,4+2,1>(mask1);
    a = shuffle_bytes16(a2, c2, mask1);
    b = shuffle_bytes16(b2, a2, mask1);
    c = shuffle_bytes16(c2, b2, mask1);
    // [a0,b0,c0,a1]
    // [b1,c1,a2,b2]
    // [c2,a3,b3,c3]
#else
    T t0, t1, t2;
    t0 = shuffle2<0,2,0,2>(a, b);
    t1 = shuffle2<0,2,1,3>(c, a);
    t2 = shuffle2<1,3,1,3>(b, c);
    // [a0,a2,b0,b2]
    // [c0,c2,a1,a3]
    // [b1,b3,c1,c3]
    t0 = permute4<0,2,1,3>(t0);
    t1 = permute4<0,2,1,3>(t1);
    t2 = permute4<0,2,1,3>(t2);
    // [a0,b0,a2,b2]
    // [c0,a1,c2,a3]
    // [b1,c1,b3,c3]
    a = shuffle2<0,1,0,1>(t0, t1);
    b = shuffle2<0,1,2,3>(t2, t0);
    c = shuffle2<2,3,2,3>(t1, t2);
#endif
}

template<class T> SIMDPP_INL
void v_mem_pack3_impl64_128(T& a, T& b, T& c)
{
    T d0, d1, d2;
    d0 = shuffle1<0,0>(a, b);
    d1 = shuffle1<0,1>(c, a);
    d2 = shuffle1<1,1>(b, c);
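    // [a0,b0]
    // [c0,a1]
    // [b1,c1]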
    a = d0; b = d1; c = d2;
}

template<class V> SIMDPP_INL
void v_mem_pack3_shuffle128(any_vec<16,V>& qa, any_vec<16,V>& qb, any_vec<16,V>& qc)
{
    (void) qa; (void) qb; (void) qc;
}

template<class V> SIMDPP_INL
void v_mem_pack3_shuffle128(any_vec<32,V>& qa, any_vec<32,V>& qb, any_vec<32,V>& qc)
{
    // Shuffle the 128-bit halves so that the first three 128-bit blocks of the
    // interleaved output end up in qa and the lower half of qb, and the
    // remaining three blocks end up in the upper half of qb and in qc.

    V a0, b0, c0, a1, b1, c1;

    a0 = qa.wrapped(); b0 = qb.wrapped(); c0 = qc.wrapped();

    a1 = shuffle1_128<0,0>(a0, b0);
    b1 = shuffle1_128<0,1>(c0, a0);
    c1 = shuffle1_128<1,1>(b0, c0);

    qa.wrapped() = a1; qb.wrapped() = b1; qc.wrapped() = c1;
}

#if SIMDPP_USE_AVX512F
template<class V> SIMDPP_INL
void v_mem_pack3_shuffle128(any_vec<64,V>& qa, any_vec<64,V>& qb, any_vec<64,V>& qc)
{
    V a, b, c; // TODO: optimize. Using full-vector shuffle may be faster
    a = qa.wrapped(); b = qb.wrapped(); c = qc.wrapped();

    V t0, t1, t2;
    t0 = shuffle2_128<0,2,0,2>(a, b);
    t1 = shuffle2_128<0,2,1,3>(c, a);
    t2 = shuffle2_128<1,3,1,3>(b, c);
    // [a0,a2,b0,b2]
    // [c0,c2,a1,a3]
    // [b1,b3,c1,c3]
    t0 = permute4_128<0,2,1,3>(t0);
    t1 = permute4_128<0,2,1,3>(t1);
    t2 = permute4_128<0,2,1,3>(t2);
    // [a0,b0,a2,b2]
    // [c0,a1,c2,a3]
    // [b1,c1,b3,c3]
    a = shuffle2_128<0,1,0,1>(t0, t1);
    b = shuffle2_128<0,1,2,3>(t2, t0);
    c = shuffle2_128<2,3,2,3>(t1, t2);

    qa.wrapped() = a; qb.wrapped() = b; qc.wrapped() = c;
}
#endif

/** Interleaves the elements of @a a, @a b and @a c in such a way that:
     * every (3n)-th element comes from @a a
     * every (3n+1)-th element comes from @a b
     * every (3n+2)-th element comes from @a c

    n = [0, <number of elements in vector> - 1]
*/
template<unsigned N> SIMDPP_INL
void mem_pack3(uint8<N>& a, uint8<N>& b, uint8<N>& c)
{
    v_mem_pack3_impl8_128(a, b, c);
    v_mem_pack3_shuffle128(a, b, c);
}

template<unsigned N> SIMDPP_INL
void mem_pack3(uint16<N>& a, uint16<N>& b, uint16<N>& c)
{
    v_mem_pack3_impl16_128(a, b, c);
    v_mem_pack3_shuffle128(a, b, c);
}

template<unsigned N> SIMDPP_INL
void mem_pack3(uint32<N>& a, uint32<N>& b, uint32<N>& c)
{
    v_mem_pack3_impl32_128(a, b, c);
    v_mem_pack3_shuffle128(a, b, c);
}

template<unsigned N> SIMDPP_INL
void mem_pack3(uint64<N>& a, uint64<N>& b, uint64<N>& c)
{
    v_mem_pack3_impl64_128(a, b, c);
    v_mem_pack3_shuffle128(a, b, c);
}

template<unsigned N> SIMDPP_INL
void mem_pack3(float32<N>& a, float32<N>& b, float32<N>& c)
{
    v_mem_pack3_impl32_128(a, b, c);
    v_mem_pack3_shuffle128(a, b, c);
}

template<unsigned N> SIMDPP_INL
void mem_pack3(float64<N>& a, float64<N>& b, float64<N>& c)
{
    v_mem_pack3_impl64_128(a, b, c);
    v_mem_pack3_shuffle128(a, b, c);
}
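
/*  Illustrative use of mem_pack3 (a sketch of a typical call site, assumed
    usage rather than a documented guarantee): packing three planar 8-bit
    channels, e.g. R, G and B, into interleaved form before a store.

        uint8<16> r = ...;  // r0, r1, ..., r15
        uint8<16> g = ...;  // g0, g1, ..., g15
        uint8<16> b = ...;  // b0, b1, ..., b15
        mem_pack3(r, g, b);
        // r, g and b now hold, respectively, the first, second and third
        // 16-byte block of the interleaved stream r0,g0,b0,r1,g1,b1,...
        // storing r, g and b consecutively yields the fully interleaved data
*/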

/** Generic implementation of mem_pack4. The 256-bit version applies 128-bit
    operations to each half of each vector separately.
*/
template<class T> SIMDPP_INL
void v_mem_pack4_impl8_128(T& a, T& b, T& c, T& d)
{
    // either uint16x8 or uint16x16, other entries likewise
#if SIMDPP_USE_SSSE3 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // TODO: optimize for altivec
    using w_b32 = typename same_width<T>::u32;

    w_b32 b0, b1, b2, b3;
    b0 = a; b1 = b; b2 = c; b3 = d;
    transpose4(b0, b1, b2, b3);
    a = b0; b = b1; c = b2; d = b3;

    a = transpose_inplace(a);
    b = transpose_inplace(b);
    c = transpose_inplace(c);
    d = transpose_inplace(d);
#else
    using w_b8 = T;
    using w_b16 = typename same_width<T>::u16;
    using w_b64 = typename same_width<T>::u64;

    w_b8 e0, e1, e2, e3;
    w_b64 d0, d1, d2, d3;
    d0 = a; d1 = b; d2 = c; d3 = d;
    e0 = zip2_lo(d0, d2);
    e1 = zip2_lo(d1, d3);
    e2 = zip2_hi(d0, d2);
    e3 = zip2_hi(d1, d3);
    // [a0 .. a7,  c0 .. c7  ]
    // [b0 .. b7,  d0 .. d7  ]
    // [a8 .. a15, c8 .. c15 ]
    // [b8 .. b15, d8 .. d15 ]
    w_b16 f0, f1, f2, f3;
    f0 = zip16_lo(e0, e1);
    f1 = zip16_hi(e0, e1);
    f2 = zip16_lo(e2, e3);
    f3 = zip16_hi(e2, e3);
    // [a0, b0, a1, b1, a2, b2, a3, b3, a4, b4, a5, b5, a6, b6, a7, b7 ]
    // [c0, d0, c1, d1, c2, d2, c3, d3, c4, d4, c5, d5, c6, d6, c7, d7 ]
    // [a8, b8, a9, b9, a10,b10,a11,b11,a12,b12,a13,b13,a14,b14,a15,b15]
    // [c8, d8, c9, d9, c10,d10,c11,d11,c12,d12,c13,d13,c14,d14,c15,d15]
    a = zip8_lo(f0, f1);
    b = zip8_hi(f0, f1);
    c = zip8_lo(f2, f3);
    d = zip8_hi(f2, f3);
#endif
}

template<class T> SIMDPP_INL
void v_mem_pack4_impl16_128(T& a, T& b, T& c, T& d)
{
    using w_b16 = T;
    using w_b32 = typename same_width<T>::u32;
    using w_b64 = typename same_width<T>::u64;

    w_b16 e0, e1, e2, e3;
    w_b64 d0, d1, d2, d3;
    d0 = a; d1 = b; d2 = c; d3 = d;
    e0 = zip2_lo(d0, d2);
    e1 = zip2_lo(d1, d3);
    e2 = zip2_hi(d0, d2);
    e3 = zip2_hi(d1, d3);
    // [a0,a1,a2,a3,c0,c1,c2,c3]
    // [b0,b1,b2,b3,d0,d1,d2,d3]
    // [a4,a5,a6,a7,c4,c5,c6,c7]
    // [b4,b5,b6,b7,d4,d5,d6,d7]
    w_b32 f0, f1, f2, f3;
    f0 = zip8_lo(e0, e1);
    f1 = zip8_hi(e0, e1);
    f2 = zip8_lo(e2, e3);
    f3 = zip8_hi(e2, e3);
    // [a0,b0,a1,b1,a2,b2,a3,b3]
    // [c0,d0,c1,d1,c2,d2,c3,d3]
    // [a4,b4,a5,b5,a6,b6,a7,b7]
    // [c4,d4,c5,d5,c6,d6,c7,d7]
    a = zip4_lo(f0, f1);
    b = zip4_hi(f0, f1);
    c = zip4_lo(f2, f3);
    d = zip4_hi(f2, f3);
}

template<class T> SIMDPP_INL
void v_mem_pack4_impl32_128(T& a, T& b, T& c, T& d)
{
    transpose4(a, b, c, d);
}

template<class T> SIMDPP_INL
void v_mem_pack4_impl64_128(T& a, T& b, T& c, T& d)
{
    transpose2(a, b);
    transpose2(c, d);
    T t;
    t = b;
    b = c;
    c = t;
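    // the transposes produced [a0,b0],[a1,b1],[c0,d0],[c1,d1]; swapping b and c
    // gives the final [a0,b0],[c0,d0],[a1,b1],[c1,d1] order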
}

template<class V> SIMDPP_INL
void v_mem_pack4_shuffle128(any_vec<16,V>& qa, any_vec<16,V>& qb,
                            any_vec<16,V>& qc, any_vec<16,V>& qd)
{
    (void) qa; (void) qb; (void) qc; (void) qd;
}

template<class V> SIMDPP_INL
void v_mem_pack4_shuffle128(any_vec<32,V>& qa, any_vec<32,V>& qb,
                            any_vec<32,V>& qc, any_vec<32,V>& qd)
{
    // Shuffle the 128-bit halves so that the first two output vectors contain
    // the first four 128-bit blocks of the interleaved data and the last two
    // contain the rest.
    V a0, b0, c0, d0, a1, b1, c1, d1;

    a0 = qa.wrapped(); b0 = qb.wrapped(); c0 = qc.wrapped(); d0 = qd.wrapped();

    a1 = shuffle1_128<0,0>(a0, b0);
    b1 = shuffle1_128<0,0>(c0, d0);
    c1 = shuffle1_128<1,1>(a0, b0);
    d1 = shuffle1_128<1,1>(c0, d0);

    qa.wrapped() = a1; qb.wrapped() = b1; qc.wrapped() = c1; qd.wrapped() = d1;
}

#if SIMDPP_USE_AVX512F
template<class V> SIMDPP_INL
void v_mem_pack4_shuffle128(any_vec<64,V>& qa, any_vec<64,V>& qb,
                            any_vec<64,V>& qc, any_vec<64,V>& qd)
{
    V a, b, c, d; // TODO: optimize. Using full-vector shuffle/permute will be faster

    a = qa.wrapped(); b = qb.wrapped(); c = qc.wrapped(); d = qd.wrapped();

    V t1, t2, t3, t4;
    // [a0,a1,a2,a3]
    // [b0,b1,b2,b3]
    // [c0,c1,c2,c3]
    // [d0,d1,d2,d3]
    t1 = shuffle2_128<0,2,0,2>(a, b);
    t2 = shuffle2_128<1,3,1,3>(a, b);
    t3 = shuffle2_128<0,2,0,2>(c, d);
    t4 = shuffle2_128<1,3,1,3>(c, d);
    // [a0,a2,b0,b2]
    // [a1,a3,b1,b3]
    // [c0,c2,d0,d2]
    // [c1,c3,d1,d3]
    a = shuffle2_128<0,2,0,2>(t1, t3);
    b = shuffle2_128<0,2,0,2>(t2, t4);
    c = shuffle2_128<1,3,1,3>(t1, t3);
    d = shuffle2_128<1,3,1,3>(t2, t4);
    // [a0,b0,c0,d0]
    // [a1,b1,c1,d1]
    // [a2,b2,c2,d2]
    // [a3,b3,c3,d3]

    qa.wrapped() = a; qb.wrapped() = b; qc.wrapped() = c; qd.wrapped() = d;
}
#endif

/** Interleaves the elements of @a a, @a b, @a c and @a d in such a way that:
     * every (4n)-th element comes from @a a
     * every (4n+1)-th element comes from @a b
     * every (4n+2)-th element comes from @a c
     * every (4n+3)-th element comes from @a d

    n = [0, <number of elements in vector> - 1]
*/
template<unsigned N> SIMDPP_INL
void mem_pack4(uint8<N>& a, uint8<N>& b, uint8<N>& c, uint8<N>& d)
{
    v_mem_pack4_impl8_128(a, b, c, d);
    v_mem_pack4_shuffle128(a, b, c, d);
}

template<unsigned N> SIMDPP_INL
void mem_pack4(uint16<N>& a, uint16<N>& b, uint16<N>& c, uint16<N>& d)
{
    v_mem_pack4_impl16_128(a, b, c, d);
    v_mem_pack4_shuffle128(a, b, c, d);
}

template<unsigned N> SIMDPP_INL
void mem_pack4(uint32<N>& a, uint32<N>& b, uint32<N>& c, uint32<N>& d)
{
    v_mem_pack4_impl32_128(a, b, c, d);
    v_mem_pack4_shuffle128(a, b, c, d);
}

template<unsigned N> SIMDPP_INL
void mem_pack4(uint64<N>& a, uint64<N>& b, uint64<N>& c, uint64<N>& d)
{
    v_mem_pack4_impl64_128(a, b, c, d);
    v_mem_pack4_shuffle128(a, b, c, d);
}

template<unsigned N> SIMDPP_INL
void mem_pack4(float32<N>& a, float32<N>& b, float32<N>& c, float32<N>& d)
{
    v_mem_pack4_impl32_128(a, b, c, d);
    v_mem_pack4_shuffle128(a, b, c, d);
}

template<unsigned N> SIMDPP_INL
void mem_pack4(float64<N>& a, float64<N>& b, float64<N>& c, float64<N>& d)
{
    v_mem_pack4_impl64_128(a, b, c, d);
    v_mem_pack4_shuffle128(a, b, c, d);
}
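
/*  Illustrative use of mem_pack4 (a sketch of a typical call site, assumed
    usage rather than a documented guarantee): packing four planar 8-bit
    channels, e.g. R, G, B and A, into interleaved form before a store.

        uint8<16> r = ...;  uint8<16> g = ...;
        uint8<16> b = ...;  uint8<16> a = ...;
        mem_pack4(r, g, b, a);
        // r, g, b and a now hold, respectively, the first, second, third and
        // fourth 16-byte block of the interleaved stream r0,g0,b0,a0,r1,g1,...
        // storing them consecutively yields the fully interleaved data
*/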

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif