/* Copyright (C) 2013-2014 Povilas Kanapickas <povilas@radix.lt>

   Distributed under the Boost Software License, Version 1.0.
   (See accompanying file LICENSE_1_0.txt or copy at
   http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_MEM_PACK_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_MEM_PACK_H

#ifndef LIBSIMDPP_SIMD_H
#error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/width.h>
#include <simdpp/detail/insn/shuffle128.h>
#include <simdpp/core/bit_andnot.h>
#include <simdpp/core/bit_or.h>
#include <simdpp/core/move_l.h>
#include <simdpp/core/move_r.h>
#include <simdpp/core/permute4.h>
#include <simdpp/core/shuffle2.h>
#include <simdpp/core/transpose.h>
#include <simdpp/core/zip_hi.h>
#include <simdpp/core/zip_lo.h>
#include <simdpp/detail/insn/zip128.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {
/** Interleaves the elements of @a a and @a b in such a way that:
     * every (2n)-th element comes from @a a
     * every (2n+1)-th element comes from @a b

    n = [0, <number of elements in vector> - 1]
*/
template<class V> SIMDPP_INL
void mem_pack2(any_vec<16,V>& qa, any_vec<16,V>& qb)
{
    V a = qa.wrapped();
    V b = qb.wrapped();

    qa.wrapped() = zip128_lo(a, b);
    qb.wrapped() = zip128_hi(a, b);
}

template<class V> SIMDPP_INL
void mem_pack2(any_vec<32,V>& qa, any_vec<32,V>& qb)
{
    V a = qa.wrapped();
    V b = qb.wrapped();

    V c1, c2;
    c1 = zip128_lo(a, b);
    c2 = zip128_hi(a, b);
    qa.wrapped() = shuffle1_128<0,0>(c1, c2);
    qb.wrapped() = shuffle1_128<1,1>(c1, c2);
}
#if SIMDPP_USE_AVX512F || SIMDPP_USE_AVX512BW
template<class V> SIMDPP_INL
void mem_pack2(any_vec<64,V>& qa, any_vec<64,V>& qb)
{
    V a = qa.wrapped();
    V b = qb.wrapped();

    V c1, c2, d1, d2;
    c1 = zip128_lo(a, b);
    c2 = zip128_hi(a, b);
    d1 = shuffle2_128<0,1,0,1>(c1, c2);
    d2 = shuffle2_128<2,3,2,3>(c1, c2);
    qa.wrapped() = permute4_128<0,2,1,3>(d1); // FIXME: optimize
    qb.wrapped() = permute4_128<0,2,1,3>(d2);
}
#endif
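
// Illustrative usage sketch. The concrete vector type below is an assumption
// made for the example; these helpers are internal and are normally invoked
// by the packed-store code paths:
//
//     uint8<16> a, b;        // a = [a0 .. a15], b = [b0 .. b15]
//     mem_pack2(a, b);
//     // a = [a0, b0, a1, b1, ..., a7, b7 ]
//     // b = [a8, b8, a9, b9, ..., a15,b15]
//
// Storing a and b back-to-back then produces the fully interleaved stream.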

/** Generic implementation of mem_pack3. The 256-bit version applies 128-bit
    operations to each half of each vector separately.
*/
template<class T> SIMDPP_INL
void v_mem_pack3_impl8_128(T& a, T& b, T& c)
{
#if SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    using U = typename T::uint_vector_type;

    T a1, b1, c1;
    a1 = align16<11>(a, a);
    b1 = b;
    c1 = align16<5>(c, c);

    // [a11..a15,a0..a10]
    // [b0..b15]
    // [c5..c15,c0..c4]
    U mask1 = make_uint(0xff);
    mask1 = move16_l<5>(mask1);

    T a2, b2, c2;
    a2 = blend(a1, b1, mask1);
    b2 = blend(b1, c1, mask1);
    c2 = blend(c1, a1, mask1);
    // [a11,a12,a13,a14,a15,a0, a1, a2, a3, a4, a5, b11,b12,b13,b14,b15]
    // [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10,c0, c1, c2, c3, c4 ]
    // [c5, c6, c7, c8, c9, c10,c11,c12,c13,c14,c15,a6, a7, a8, a9, a10]
    U mask2 = make_shuffle_bytes16_mask<5, 16+0, 16+11,
                                        6, 16+1, 16+12,
                                        7, 16+2, 16+13,
                                        8, 16+3, 16+14,
                                        9, 16+4, 16+15,
                                        10>(mask2);
    a = shuffle_bytes16(a2, b2, mask2);
    b = shuffle_bytes16(b2, c2, mask2);
    c = shuffle_bytes16(c2, a2, mask2);

    // [a0, b0, c0, a1, b1, c1, a2, b2, c2, a3, b3, c3, a4, b4, c4, a5 ]
    // [b5, c5, a6, b6, c6, a7, b7, c7, a8, b8, c8, a9, b9, c9, a10,b10]
    // [c10,a11,b11,c11,a12,b12,c12,a13,b13,c13,a14,b14,c14,a15,b15,c15]
#else
    // either uint16x8 or uint16x16, other entries likewise
    using w_b16 = typename same_width<T>::u16;
    using w_b32 = typename same_width<T>::u32;
    using w_b8 = T;

    w_b16 t0, t1, t2, t3;
    t0 = zip16_lo(a, b);
    t1 = zip16_hi(a, b);
    t2 = zip16_lo(c, (w_b8) make_zero());
    t3 = zip16_hi(c, (w_b8) make_zero());

    w_b8 u0, u1, u2, u3;
    u0 = zip8_lo(t0, t2);
    u1 = zip8_hi(t0, t2);
    u2 = zip8_lo(t1, t3);
    u3 = zip8_hi(t1, t3);

    // [a0, b0, c0, 0, a1, b1, c1, 0, a2, b2, c2, 0, a3, b3, c3, 0]
    // [a4, b4, c4, 0, a5, b5, c5, 0, a6, b6, c6, 0, a7, b7, c7, 0]
    // [a8, b8, c8, 0, a9, b9, c9, 0, a10,b10,c10,0, a11,b11,c11,0]
    // [a12,b12,c12,0, a13,b13,c13,0, a14,b14,c14,0, a15,b15,c15,0]
#if SIMDPP_USE_SSSE3
    // it's not worth using 4 different index vectors to shuffle the vectors
    // fully into place and then combine them with bit_or only
    w_b8 idx = make_uint(0, 1, 2, 4, 5, 6, 8, 9,
                         10, 12, 13, 14, 0xff, 0xff, 0xff, 0xff);
    u0 = permute_bytes16(u0, idx);
    u1 = permute_bytes16(u1, idx);
    u2 = permute_bytes16(u2, idx);
    u3 = permute_bytes16(u3, idx);
#else
    using w_u64 = typename same_width<T>::u64;

    // the following is still faster than a non-SIMD implementation
    w_b8 mask1 = make_uint(0xff, 0xff, 0xff, 0, 0, 0, 0, 0,
                           0xff, 0xff, 0xff, 0, 0, 0, 0, 0);
    w_u64 w0, w1, w2, w3;
    w0 = u0; w1 = u1; w2 = u2; w3 = u3;
    w0 = shift_r(w0, 8);
    w1 = shift_r(w1, 8);
    w2 = shift_r(w2, 8);
    w3 = shift_r(w3, 8);

    u0 = blend(u0, w0, mask1);
    u1 = blend(u1, w1, mask1);
    u2 = blend(u2, w2, mask1);
    u3 = blend(u3, w3, mask1);

    w_b8 mask2 = make_uint(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0,
                           0, 0, 0, 0, 0, 0, 0, 0);
    w_b8 x0, x1, x2, x3;
    x0 = move16_l<2>(u0);
    x1 = move16_l<2>(u1);
    x2 = move16_l<2>(u2);
    x3 = move16_l<2>(u3);

    u0 = blend(u0, x0, mask2);
    u1 = blend(u1, x1, mask2);
    u2 = blend(u2, x2, mask2);
    u3 = blend(u3, x3, mask2);
#endif
    // [a0, b0, c0, a1, b1, c1, a2, b2, c2, a3, b3, c3, 0,0,0,0]
    // [a4, b4, c4, a5, b5, c5, a6, b6, c6, a7, b7, c7, 0,0,0,0]
    // [a8, b8, c8, a9, b9, c9, a10,b10,c10,a11,b11,c11,0,0,0,0]
    // [a12,b12,c12,a13,b13,c13,a14,b14,c14,a15,b15,c15,0,0,0,0]
    w_b32 k0, k1, k2, k3, l0, l3;
    k0 = u0;
    k1 = u1;
    k2 = u2;
    k3 = u3;
    l0 = move4_r<3>(k1);
    l3 = move4_l<2>(k2);
    k3 = move4_r<1>(k3);
    a = bit_or(k0, l0);
    b = shuffle2<1,2,0,1>(k1, k2);
    c = bit_or(k3, l3);
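    // resulting layout (matches the ALTIVEC/MSA branch above):
    // [a0, b0, c0, a1, b1, c1, a2, b2, c2, a3, b3, c3, a4, b4, c4, a5 ]
    // [b5, c5, a6, b6, c6, a7, b7, c7, a8, b8, c8, a9, b9, c9, a10,b10]
    // [c10,a11,b11,c11,a12,b12,c12,a13,b13,c13,a14,b14,c14,a15,b15,c15]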
#endif
}

template<class T> SIMDPP_INL
void v_mem_pack3_impl16_128(T& a, T& b, T& c)
{
#if SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    using U = typename T::uint_vector_type;

    // [a0..a7]
    // [b0..b7]
    // [c0..c7]
    T a1, b1, c1;
    a1 = a;
    b1 = align8<5>(b, b);
    c1 = align8<2>(c, c);

    // [a0..a7]
    // [b5..b7,b0..b4]
    // [c2..c7,c0,c1]
    T a2, b2, c2;
    U mask2 = make_uint(0xffff);
    mask2 = move8_l<2>(mask2);

    a2 = blend(a1, b1, mask2);
    b2 = blend(b1, c1, mask2);
    c2 = blend(c1, a1, mask2);

    // [a0,a1,a2,a3,a4,a5,b3,b4]
    // [b5,b6,b7,b0,b1,b2,c0,c1]
    // [c2,c3,c4,c5,c6,c7,a6,a7]
    U mask1 = make_shuffle_bytes16_mask<0, 8+3, 8+6,
                                        1, 8+4, 8+7,
                                        2, 8+5>(mask1);
    a = shuffle_bytes16(a2, b2, mask1);
    b = shuffle_bytes16(c2, a2, mask1);
    c = shuffle_bytes16(b2, c2, mask1);

    // [a0,b0,c0,a1,b1,c1,a2,b2]
    // [c2,a3,b3,c3,a4,b4,c4,a5]
    // [b5,c5,a6,b6,c6,a7,b7,c7]

#else
    // either uint16x8 or uint16x16, other entries likewise
    using w_b16 = T;
    using w_b32 = typename same_width<T>::u32;

    w_b32 t0, t1, t2, t3;
    t0 = zip8_lo(a, b);
    t1 = zip8_hi(a, b);
    t2 = zip8_lo(c, (w_b16) make_zero());
    t3 = zip8_hi(c, (w_b16) make_zero());

    w_b16 u0, u1, u2, u3;
    u0 = zip4_lo(t0, t2);
    u1 = zip4_hi(t0, t2);
    u2 = zip4_lo(t1, t3);
    u3 = zip4_hi(t1, t3);

    // [a0, b0, c0, 0, a1, b1, c1, 0 ]
    // [a2, b2, c2, 0, a3, b3, c3, 0 ]
    // [a4, b4, c4, 0, a5, b5, c5, 0 ]
    // [a6, b6, c6, 0, a7, b7, c7, 0 ]

#if SIMDPP_USE_SSSE3
    // it's not worth using 4 different index vectors to shuffle the vectors
    // fully into place and then combine them with bit_or only
    w_b16 idx = make_shuffle_bytes16_mask<0,1,2,4,5,6,-1,-1>(idx);
    u0 = permute_bytes16(u0, idx);
    u1 = permute_bytes16(u1, idx);
    u2 = permute_bytes16(u2, idx);
    u3 = permute_bytes16(u3, idx);

#else
    // the following is still faster than a non-SIMD implementation
    w_b16 mask2 = make_uint(0xffff, 0xffff, 0xffff, 0,
                            0, 0, 0, 0);
    u0 = blend(u0, move8_l<1>(u0), mask2);
    u1 = blend(u1, move8_l<1>(u1), mask2);
    u2 = blend(u2, move8_l<1>(u2), mask2);
    u3 = blend(u3, move8_l<1>(u3), mask2);
#endif
    // [a0, b0, c0, a1, b1, c1, 0, 0]
    // [a2, b2, c2, a3, b3, c3, 0, 0]
    // [a4, b4, c4, a5, b5, c5, 0, 0]
    // [a6, b6, c6, a7, b7, c7, 0, 0]
    w_b32 k0, k1, k2, k3, l0, l3;
    k0 = u0;
    k1 = u1;
    k2 = u2;
    k3 = u3;
    l0 = move4_r<3>(k1);
    l3 = move4_l<2>(k2);
    k3 = move4_r<1>(k3);
    a = bit_or(k0, l0);
    b = shuffle2<1,2,0,1>(k1, k2);
    c = bit_or(k3, l3);
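    // resulting layout (matches the ALTIVEC/MSA branch above):
    // [a0,b0,c0,a1,b1,c1,a2,b2]
    // [c2,a3,b3,c3,a4,b4,c4,a5]
    // [b5,c5,a6,b6,c6,a7,b7,c7]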
#endif
}

template<class T> SIMDPP_INL
void v_mem_pack3_impl32_128(T& a, T& b, T& c)
{
#if SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    using U = typename T::uint_vector_type;

    // [a0,a1,a2,a3]
    // [b0,b1,b2,b3]
    // [c0,c1,c2,c3]
    T a1, b1, c1;
    a1 = a;
    b1 = align4<1>(b, b);
    c1 = align4<2>(c, c);

    // [a0,a1,a2,a3]
    // [b1,b2,b3,b0]
    // [c2,c3,c0,c1]
    T a2, b2, c2;
    U mask2 = make_uint(0xffffffff);
    mask2 = move4_l<1>(mask2);

    a2 = blend(a1, c1, mask2);
    b2 = blend(b1, a1, mask2);
    c2 = blend(c1, b1, mask2);
    // [a0,a1,a2,c1]
    // [b1,b2,b3,a3]
    // [c2,c3,c0,b0]
    U mask1 = make_shuffle_bytes16_mask<0,4+3,4+2,1>(mask1);
    a = shuffle_bytes16(a2, c2, mask1);
    b = shuffle_bytes16(b2, a2, mask1);
    c = shuffle_bytes16(c2, b2, mask1);
    // [a0,b0,c0,a1]
    // [b1,c1,a2,b2]
    // [c2,a3,b3,c3]
#else
    T t0, t1, t2;
    t0 = shuffle2<0,2,0,2>(a, b);
    t1 = shuffle2<0,2,1,3>(c, a);
    t2 = shuffle2<1,3,1,3>(b, c);
    // [a0,a2,b0,b2]
    // [c0,c2,a1,a3]
    // [b1,b3,c1,c3]
    t0 = permute4<0,2,1,3>(t0);
    t1 = permute4<0,2,1,3>(t1);
    t2 = permute4<0,2,1,3>(t2);
    // [a0,b0,a2,b2]
    // [c0,a1,c2,a3]
    // [b1,c1,b3,c3]
    a = shuffle2<0,1,0,1>(t0, t1);
    b = shuffle2<0,1,2,3>(t2, t0);
    c = shuffle2<2,3,2,3>(t1, t2);
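    // resulting layout (same as the ALTIVEC/MSA branch above):
    // [a0,b0,c0,a1]
    // [b1,c1,a2,b2]
    // [c2,a3,b3,c3]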
#endif
}

template<class T> SIMDPP_INL
void v_mem_pack3_impl64_128(T& a, T& b, T& c)
{
    T d0, d1, d2;
    d0 = shuffle1<0,0>(a, b);
    d1 = shuffle1<0,1>(c, a);
    d2 = shuffle1<1,1>(b, c);
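    // d0 = [a0,b0]  d1 = [c0,a1]  d2 = [b1,c1]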
    a = d0; b = d1; c = d2;
}

template<class V> SIMDPP_INL
void v_mem_pack3_shuffle128(any_vec<16,V>& qa, any_vec<16,V>& qb, any_vec<16,V>& qc)
{
    (void) qa; (void) qb; (void) qc;
}

template<class V> SIMDPP_INL
void v_mem_pack3_shuffle128(any_vec<32,V>& qa, any_vec<32,V>& qb, any_vec<32,V>& qc)
{
    // shuffle the vectors so that the lower halves of the three inputs end up
    // in the first three 128-bit slots (qa and the lower half of qb) and the
    // upper halves end up in the remaining slots

    V a0, b0, c0, a1, b1, c1;

    a0 = qa.wrapped(); b0 = qb.wrapped(); c0 = qc.wrapped();

    a1 = shuffle1_128<0,0>(a0, b0);
    b1 = shuffle1_128<0,1>(c0, a0);
    c1 = shuffle1_128<1,1>(b0, c0);

    qa.wrapped() = a1; qb.wrapped() = b1; qc.wrapped() = c1;
}

#if SIMDPP_USE_AVX512F
template<class V> SIMDPP_INL
void v_mem_pack3_shuffle128(any_vec<64,V>& qa, any_vec<64,V>& qb, any_vec<64,V>& qc)
{
    V a, b, c; // TODO: optimize. Using full-vector shuffle may be faster
    a = qa.wrapped(); b = qb.wrapped(); c = qc.wrapped();

    V t0, t1, t2;
    t0 = shuffle2_128<0,2,0,2>(a, b);
    t1 = shuffle2_128<0,2,1,3>(c, a);
    t2 = shuffle2_128<1,3,1,3>(b, c);
    // [a0,a2,b0,b2]
    // [c0,c2,a1,a3]
    // [b1,b3,c1,c3]
    t0 = permute4_128<0,2,1,3>(t0);
    t1 = permute4_128<0,2,1,3>(t1);
    t2 = permute4_128<0,2,1,3>(t2);
    // [a0,b0,a2,b2]
    // [c0,a1,c2,a3]
    // [b1,c1,b3,c3]
    a = shuffle2_128<0,1,0,1>(t0, t1);
    b = shuffle2_128<0,1,2,3>(t2, t0);
    c = shuffle2_128<2,3,2,3>(t1, t2);
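    // resulting 128-bit lane layout:
    // [a0,b0,c0,a1]
    // [b1,c1,a2,b2]
    // [c2,a3,b3,c3]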

    qa.wrapped() = a; qb.wrapped() = b; qc.wrapped() = c;
}
#endif

/** Interleaves the elements of @a a, @a b and @a c in such a way that:
     * every (3n)-th element comes from @a a
     * every (3n+1)-th element comes from @a b
     * every (3n+2)-th element comes from @a c

    n = [0, <number of elements in vector> - 1]
*/
template<unsigned N> SIMDPP_INL
void mem_pack3(uint8<N>& a, uint8<N>& b, uint8<N>& c)
{
    v_mem_pack3_impl8_128(a, b, c);
    v_mem_pack3_shuffle128(a, b, c);
}

template<unsigned N> SIMDPP_INL
void mem_pack3(uint16<N>& a, uint16<N>& b, uint16<N>& c)
{
    v_mem_pack3_impl16_128(a, b, c);
    v_mem_pack3_shuffle128(a, b, c);
}

template<unsigned N> SIMDPP_INL
void mem_pack3(uint32<N>& a, uint32<N>& b, uint32<N>& c)
{
    v_mem_pack3_impl32_128(a, b, c);
    v_mem_pack3_shuffle128(a, b, c);
}

template<unsigned N> SIMDPP_INL
void mem_pack3(uint64<N>& a, uint64<N>& b, uint64<N>& c)
{
    v_mem_pack3_impl64_128(a, b, c);
    v_mem_pack3_shuffle128(a, b, c);
}

template<unsigned N> SIMDPP_INL
void mem_pack3(float32<N>& a, float32<N>& b, float32<N>& c)
{
    v_mem_pack3_impl32_128(a, b, c);
    v_mem_pack3_shuffle128(a, b, c);
}

template<unsigned N> SIMDPP_INL
void mem_pack3(float64<N>& a, float64<N>& b, float64<N>& c)
{
    v_mem_pack3_impl64_128(a, b, c);
    v_mem_pack3_shuffle128(a, b, c);
}
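
// Illustrative usage sketch (the concrete vector type is an assumption made
// for the example):
//
//     uint32<4> a, b, c;     // a = [a0..a3], b = [b0..b3], c = [c0..c3]
//     mem_pack3(a, b, c);
//     // a = [a0, b0, c0, a1]
//     // b = [b1, c1, a2, b2]
//     // c = [c2, a3, b3, c3]
//
// Reading a, b and c back-to-back yields a0,b0,c0,a1,b1,c1,... as documented
// above.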

/** Generic implementation of mem_pack4. The 256-bit version applies 128-bit
    operations to each half of each vector separately.
*/
template<class T> SIMDPP_INL
void v_mem_pack4_impl8_128(T& a, T& b, T& c, T& d)
{
    // either uint16x8 or uint16x16, other entries likewise
#if SIMDPP_USE_SSSE3 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // TODO: optimize for altivec
    using w_b32 = typename same_width<T>::u32;

    w_b32 b0, b1, b2, b3;
    b0 = a; b1 = b; b2 = c; b3 = d;
    transpose4(b0, b1, b2, b3);
    a = b0; b = b1; c = b2; d = b3;

    a = transpose_inplace(a);
    b = transpose_inplace(b);
    c = transpose_inplace(c);
    d = transpose_inplace(d);
#else
    using w_b8 = T;
    using w_b16 = typename same_width<T>::u16;
    using w_b64 = typename same_width<T>::u64;

    w_b8 e0, e1, e2, e3;
    w_b64 d0, d1, d2, d3;
    d0 = a; d1 = b; d2 = c; d3 = d;
    e0 = zip2_lo(d0, d2);
    e1 = zip2_lo(d1, d3);
    e2 = zip2_hi(d0, d2);
    e3 = zip2_hi(d1, d3);
    // [a0 .. a7,  c0 .. c7 ]
    // [b0 .. b7,  d0 .. d7 ]
    // [a8 .. a15, c8 .. c15]
    // [b8 .. b15, d8 .. d15]
    w_b16 f0, f1, f2, f3;
    f0 = zip16_lo(e0, e1);
    f1 = zip16_hi(e0, e1);
    f2 = zip16_lo(e2, e3);
    f3 = zip16_hi(e2, e3);
    // [a0, b0, a1, b1, a2, b2, a3, b3, a4, b4, a5, b5, a6, b6, a7, b7 ]
    // [c0, d0, c1, d1, c2, d2, c3, d3, c4, d4, c5, d5, c6, d6, c7, d7 ]
    // [a8, b8, a9, b9, a10,b10,a11,b11,a12,b12,a13,b13,a14,b14,a15,b15]
    // [c8, d8, c9, d9, c10,d10,c11,d11,c12,d12,c13,d13,c14,d14,c15,d15]
    a = zip8_lo(f0, f1);
    b = zip8_hi(f0, f1);
    c = zip8_lo(f2, f3);
    d = zip8_hi(f2, f3);
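    // [a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3 ]
    // [a4, b4, c4, d4, a5, b5, c5, d5, a6, b6, c6, d6, a7, b7, c7, d7 ]
    // [a8, b8, c8, d8, a9, b9, c9, d9, a10,b10,c10,d10,a11,b11,c11,d11]
    // [a12,b12,c12,d12,a13,b13,c13,d13,a14,b14,c14,d14,a15,b15,c15,d15]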
#endif
}

template<class T> SIMDPP_INL
void v_mem_pack4_impl16_128(T& a, T& b, T& c, T& d)
{
    using w_b16 = T;
    using w_b32 = typename same_width<T>::u32;
    using w_b64 = typename same_width<T>::u64;

    w_b16 e0, e1, e2, e3;
    w_b64 d0, d1, d2, d3;
    d0 = a; d1 = b; d2 = c; d3 = d;
    e0 = zip2_lo(d0, d2);
    e1 = zip2_lo(d1, d3);
    e2 = zip2_hi(d0, d2);
    e3 = zip2_hi(d1, d3);
    // [a0,a1,a2,a3,c0,c1,c2,c3]
    // [b0,b1,b2,b3,d0,d1,d2,d3]
    // [a4,a5,a6,a7,c4,c5,c6,c7]
    // [b4,b5,b6,b7,d4,d5,d6,d7]
    w_b32 f0, f1, f2, f3;
    f0 = zip8_lo(e0, e1);
    f1 = zip8_hi(e0, e1);
    f2 = zip8_lo(e2, e3);
    f3 = zip8_hi(e2, e3);
    // [a0,b0,a1,b1,a2,b2,a3,b3]
    // [c0,d0,c1,d1,c2,d2,c3,d3]
    // [a4,b4,a5,b5,a6,b6,a7,b7]
    // [c4,d4,c5,d5,c6,d6,c7,d7]
    a = zip4_lo(f0, f1);
    b = zip4_hi(f0, f1);
    c = zip4_lo(f2, f3);
    d = zip4_hi(f2, f3);
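    // [a0,b0,c0,d0,a1,b1,c1,d1]
    // [a2,b2,c2,d2,a3,b3,c3,d3]
    // [a4,b4,c4,d4,a5,b5,c5,d5]
    // [a6,b6,c6,d6,a7,b7,c7,d7]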
}

template<class T> SIMDPP_INL
void v_mem_pack4_impl32_128(T& a, T& b, T& c, T& d)
{
    transpose4(a, b, c, d);
}

template<class T> SIMDPP_INL
void v_mem_pack4_impl64_128(T& a, T& b, T& c, T& d)
{
    transpose2(a, b);
    transpose2(c, d);
    T t;
    t = b;
    b = c;
    c = t;
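    // a = [a0,b0]  b = [c0,d0]  c = [a1,b1]  d = [c1,d1]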
}

template<class V> SIMDPP_INL
void v_mem_pack4_shuffle128(any_vec<16,V>& qa, any_vec<16,V>& qb,
                            any_vec<16,V>& qc, any_vec<16,V>& qd)
{
    (void) qa; (void) qb; (void) qc; (void) qd;
}

template<class V> SIMDPP_INL
void v_mem_pack4_shuffle128(any_vec<32,V>& qa, any_vec<32,V>& qb,
                            any_vec<32,V>& qc, any_vec<32,V>& qd)
{
    // shuffle the vectors so that the lower halves of the four inputs end up
    // in the first four 128-bit slots and the upper halves in the remaining
    // slots
    V a0, b0, c0, d0, a1, b1, c1, d1;

    a0 = qa.wrapped(); b0 = qb.wrapped(); c0 = qc.wrapped(); d0 = qd.wrapped();

    a1 = shuffle1_128<0,0>(a0, b0);
    b1 = shuffle1_128<0,0>(c0, d0);
    c1 = shuffle1_128<1,1>(a0, b0);
    d1 = shuffle1_128<1,1>(c0, d0);

    qa.wrapped() = a1; qb.wrapped() = b1; qc.wrapped() = c1; qd.wrapped() = d1;
}

#if SIMDPP_USE_AVX512F
template<class V> SIMDPP_INL
void v_mem_pack4_shuffle128(any_vec<64,V>& qa, any_vec<64,V>& qb,
                            any_vec<64,V>& qc, any_vec<64,V>& qd)
{
    V a, b, c, d; // TODO: optimize. Using full-vector shuffle/permute may be faster

    a = qa.wrapped(); b = qb.wrapped(); c = qc.wrapped(); d = qd.wrapped();

    V t1, t2, t3, t4;
    // [a0,a1,a2,a3]
    // [b0,b1,b2,b3]
    // [c0,c1,c2,c3]
    // [d0,d1,d2,d3]
    t1 = shuffle2_128<0,2,0,2>(a, b);
    t2 = shuffle2_128<1,3,1,3>(a, b);
    t3 = shuffle2_128<0,2,0,2>(c, d);
    t4 = shuffle2_128<1,3,1,3>(c, d);
    // [a0,a2,b0,b2]
    // [a1,a3,b1,b3]
    // [c0,c2,d0,d2]
    // [c1,c3,d1,d3]
    a = shuffle2_128<0,2,0,2>(t1, t3);
    b = shuffle2_128<0,2,0,2>(t2, t4);
    c = shuffle2_128<1,3,1,3>(t1, t3);
    d = shuffle2_128<1,3,1,3>(t2, t4);
    // [a0,b0,c0,d0]
    // [a1,b1,c1,d1]
    // [a2,b2,c2,d2]
    // [a3,b3,c3,d3]

    qa.wrapped() = a; qb.wrapped() = b; qc.wrapped() = c; qd.wrapped() = d;
}
#endif

/** Interleaves the elements of @a a, @a b, @a c and @a d in such a way that:
     * every (4n)-th element comes from @a a
     * every (4n+1)-th element comes from @a b
     * every (4n+2)-th element comes from @a c
     * every (4n+3)-th element comes from @a d

    n = [0, <number of elements in vector> - 1]
*/
template<unsigned N> SIMDPP_INL
void mem_pack4(uint8<N>& a, uint8<N>& b, uint8<N>& c, uint8<N>& d)
{
    v_mem_pack4_impl8_128(a, b, c, d);
    v_mem_pack4_shuffle128(a, b, c, d);
}

template<unsigned N> SIMDPP_INL
void mem_pack4(uint16<N>& a, uint16<N>& b, uint16<N>& c, uint16<N>& d)
{
    v_mem_pack4_impl16_128(a, b, c, d);
    v_mem_pack4_shuffle128(a, b, c, d);
}

template<unsigned N> SIMDPP_INL
void mem_pack4(uint32<N>& a, uint32<N>& b, uint32<N>& c, uint32<N>& d)
{
    v_mem_pack4_impl32_128(a, b, c, d);
    v_mem_pack4_shuffle128(a, b, c, d);
}

template<unsigned N> SIMDPP_INL
void mem_pack4(uint64<N>& a, uint64<N>& b, uint64<N>& c, uint64<N>& d)
{
    v_mem_pack4_impl64_128(a, b, c, d);
    v_mem_pack4_shuffle128(a, b, c, d);
}

template<unsigned N> SIMDPP_INL
void mem_pack4(float32<N>& a, float32<N>& b, float32<N>& c, float32<N>& d)
{
    v_mem_pack4_impl32_128(a, b, c, d);
    v_mem_pack4_shuffle128(a, b, c, d);
}

template<unsigned N> SIMDPP_INL
void mem_pack4(float64<N>& a, float64<N>& b, float64<N>& c, float64<N>& d)
{
    v_mem_pack4_impl64_128(a, b, c, d);
    v_mem_pack4_shuffle128(a, b, c, d);
}
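
// Illustrative usage sketch (the concrete vector type is an assumption made
// for the example):
//
//     uint32<4> a, b, c, d;  // a = [a0..a3], b = [b0..b3], ...
//     mem_pack4(a, b, c, d);
//     // a = [a0, b0, c0, d0]
//     // b = [a1, b1, c1, d1]
//     // c = [a2, b2, c2, d2]
//     // d = [a3, b3, c3, d3]
//
// For four 4-element vectors this is a 4x4 transpose, so consecutive stores
// of a..d produce the interleaved stream a0,b0,c0,d0,a1,...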

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif