1 | /* Copyright (C) 2013-2014 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_MEM_UNPACK_H |
9 | #define LIBSIMDPP_SIMDPP_DETAIL_INSN_MEM_UNPACK_H |
10 | |
11 | #ifndef LIBSIMDPP_SIMD_H |
12 | #error "This file must be included through simd.h" |
13 | #endif |
14 | |
15 | #include <simdpp/types.h> |
16 | #include <simdpp/detail/width.h> |
17 | #include <simdpp/detail/insn/shuffle128.h> |
18 | #include <simdpp/detail/insn/zip128.h> |
19 | #include <simdpp/core/align.h> |
20 | #include <simdpp/core/splat_n.h> |
21 | #include <simdpp/core/make_shuffle_bytes_mask.h> |
22 | #include <simdpp/core/shuffle1.h> |
23 | #include <simdpp/core/shuffle2.h> |
24 | #include <simdpp/core/transpose.h> |
25 | #include <simdpp/core/unzip_hi.h> |
26 | #include <simdpp/core/unzip_lo.h> |
27 | #include <simdpp/core/zip_hi.h> |
28 | #include <simdpp/core/zip_lo.h> |
29 | |
30 | namespace simdpp { |
31 | namespace SIMDPP_ARCH_NAMESPACE { |
32 | namespace detail { |
33 | namespace insn { |
34 | |
35 | /** Concatenates @a a and @a b and stores the elements of the resulting array |
36 | as follows: |
37 | * every (2n)-th element is stored to @a a |
38 | * every (2n+1)-th element is stored to @a b |
39 | |
40 | n = [0, <number of elements in vector> - 1] |
41 | */ |
42 | template<class V> SIMDPP_INL |
43 | void mem_unpack2(any_vec<16,V>& qa, any_vec<16,V>& qb) |
44 | { |
45 | V a = qa.wrapped(); |
46 | V b = qb.wrapped(); |
47 | |
48 | qa.wrapped() = unzip128_lo(a, b); |
49 | qb.wrapped() = unzip128_hi(a, b); |
50 | } |
51 | |
52 | template<class V> SIMDPP_INL |
53 | void mem_unpack2(any_vec<32,V>& qa, any_vec<32,V>& qb) |
54 | { |
55 | V a = qa.wrapped(); |
56 | V b = qb.wrapped(); |
57 | |
58 | V c1 = shuffle1_128<0,0>(a, b); |
59 | V c2 = shuffle1_128<1,1>(a, b); |
60 | qa.wrapped() = unzip128_lo(c1, c2); |
61 | qb.wrapped() = unzip128_hi(c1, c2); |
62 | } |
63 | |
64 | #if SIMDPP_USE_AVX512F |
65 | template<class V> SIMDPP_INL |
66 | void mem_unpack2(any_vec<64,V>& qa, any_vec<64,V>& qb) |
67 | { |
68 | V a = qa.wrapped(); |
69 | V b = qb.wrapped(); |
70 | |
71 | V c1 = shuffle2_128<0,2,0,2>(a, b); |
72 | V c2 = shuffle2_128<1,3,1,3>(a, b); |
73 | qa.wrapped() = unzip128_lo(c1, c2); |
74 | qb.wrapped() = unzip128_hi(c1, c2); |
75 | } |
76 | #endif |
77 | |
/** Generic implementation of mem_unpack3. The 128-bit lanes are processed
    independently.
*/
81 | template<class T> SIMDPP_INL |
82 | void v_mem_unpack3_impl8_128(T& a, T& b, T& c) |
83 | { |
84 | #if SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA |
85 | // [a0, b0, c0, a1, b1, c1, a2, b2, c2, a3, b3, c3, a4, b4, c4, a5 ] |
86 | // [b5, c5, a6, b6, c6, a7, b7, c7, a8, b8, c8, a9, b9, c9, a10,b10] |
87 | // [c10,a11,b11,c11,a12,b12,c12,a13,b13,c13,a14,b14,c14,a15,b15,c15] |
88 | T mask1 = make_shuffle_bytes16_mask< 1, 4, 7, 10, 13,16+0,16+3,16+6, |
89 | 16+9,16+12,16+15, 2, 5, 8, 11, 14>(mask1); |
90 | T a1, b1, c1; |
91 | a1 = shuffle_bytes16(c, a, mask1); |
92 | b1 = shuffle_bytes16(a, b, mask1); |
93 | c1 = shuffle_bytes16(b, c, mask1); |
94 | // [a11,a12,a13,a14,a15,a0, a1, a2, a3, a4, a5, b11,b12,b13,b14,b15] |
95 | // [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10,c0, c1, c2, c3, c4 ] |
96 | // [c5, c6, c7, c8, c9, c10,c11,c12,c13,c14,c15,a6, a7, a8, a9, a10] |
97 | T a2, b2, c2; |
98 | T mask2 = make_uint(0xff); |
99 | mask2 = move16_l<5>(mask2); |
100 | |
101 | a2 = blend(a1, c1, mask2); |
102 | b2 = blend(b1, a1, mask2); |
103 | c2 = blend(c1, b1, mask2); |
104 | // [a11..a15,a0..a10] |
105 | // [b0..b15] |
// [c5..c15,c0..c4]
107 | a = align16<5>(a2, a2); |
108 | b = b2; |
109 | c = align16<11>(c2, c2); |
110 | #else |
111 | typename same_width<T>::u8 t0, t1, t2, t3; |
112 | t0 = a; |
113 | t1 = align16<12>(a, b); |
114 | t2 = align16<8>(b, c); |
115 | t3 = move16_l<4>(c); |
116 | // [a0, b0, c0, a1, b1, c1, a2, b2, c2, a3, b3, c3, ...] |
117 | // [a4, b4, c4, a5, b5, c5, a6, b6, c6, a7, b7, c7, ...] |
118 | // [a8, b8, c8, a9, b9, c9, a10,b10,c10,a11,b11,c11, ...] |
119 | // [a12,b12,c12,a13,b13,c13,a14,b14,c14,a15,b15,c15, ...] |
120 | typename same_width<T>::u16 b0, b1, b2, b3; |
121 | b0 = zip16_lo(t0, t1); |
122 | b1 = zip16_lo(t2, t3); |
123 | b2 = zip16_hi(t0, t1); |
124 | b3 = zip16_hi(t2, t3); |
125 | // [a0, a4, b0, b4, c0, c4, a1, a5, b1, b5, c1, c5, a2, a6, b2, b6 ] |
// [a8, a12,b8, b12,c8, c12,a9, a13,b9, b13,c9, c13,a10,a14,b10,b14]
127 | // [c2, c6, a3, a7, b3, b7, c3, c7, ... ] |
128 | // [c10,c14,a11,a15,b11,b15,c11,c15,... ] |
129 | typename same_width<T>::u8 u0, u1, u2; |
130 | u0 = zip8_lo(b0, b1); |
131 | u1 = zip8_hi(b0, b1); |
132 | u2 = zip8_lo(b2, b3); |
133 | // [a0, a4, a8, a12,b0, b4, b8, b12, c0, c4, c8, c12, a1, a5, a9, a13 ] |
134 | // [b1, b5, b9, b13,c1, c5, c9, c13, a2, a6, a10,a14, b2, b6, b10,b14 ] |
135 | // [c2, c6, c10,c14,a3, a7, a11,a15, b3, b7, b11,b15, c3, c7, c11,c15 ] |
136 | t0 = u0; |
137 | t1 = align16<12>(u0, u1); |
138 | t2 = align16<8>(u1, u2); |
139 | t3 = move16_l<4>(u2); |
140 | // [a0, a4, a8, a12,b0, b4, b8, b12, c0, c4, c8, c12, ...] |
141 | // [a1, a5, a9, a13,b1, b5, b9, b13, c1, c5, c9, c13, ...] |
// [a2, a6, a10,a14,b2, b6, b10,b14, c2, c6, c10,c14, ...]
// [a3, a7, a11,a15,b3, b7, b11,b15, c3, c7, c11,c15, ...]
144 | b0 = zip16_lo(t0, t1); |
145 | b1 = zip16_lo(t2, t3); |
146 | b2 = zip16_hi(t0, t1); |
147 | b3 = zip16_hi(t2, t3); |
148 | // [a0, a1, a4, a5, a8, a9, a12,a13,b0, b1, b4, b5, b8, b9, b12,b13 ] |
149 | // [a2, a3, a6, a7, a10,a11,a14,a15,b2, b3, b6, b7, b10,b11,b14,b15 ] |
150 | // [c0, c1, c4, c5, c8, c9, c12,c13, ... ] |
151 | // [c2, c3, c6, c7, c10,c11,c14,c15, ... ] |
152 | a = zip8_lo(b0, b1); |
153 | b = zip8_hi(b0, b1); |
154 | c = zip8_lo(b2, b3); |
155 | #endif |
156 | } |
157 | |
158 | template<class T> SIMDPP_INL |
159 | void v_mem_unpack3_impl16_128(T& a, T& b, T& c) |
160 | { |
161 | #if SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA |
162 | // [a0,b0,c0,a1,b1,c1,a2,b2] |
163 | // [c2,a3,b3,c3,a4,b4,c4,a5] |
164 | // [b5,c5,a6,b6,c6,a7,b7,c7] |
165 | T mask1 = make_shuffle_bytes16_mask<0,3,6,8+1,8+4,8+7,8+2,8+5>(mask1); |
166 | T a1, b1, c1; |
167 | a1 = shuffle_bytes16(a, b, mask1); |
168 | c1 = shuffle_bytes16(b, c, mask1); |
169 | b1 = shuffle_bytes16(c, a, mask1); |
170 | // [a0,a1,a2,a3,a4,a5,b3,b4] |
171 | // [c2,c3,c4,c5,c6,c7,a6,a7] |
172 | // [b5,b6,b7,b0,b1,b2,c0,c1] |
173 | T a2, b2, c2; |
174 | T mask2 = make_uint(0xffff); |
175 | mask2 = move8_l<2>(mask2); |
176 | |
177 | a2 = blend(a1, c1, mask2); |
178 | b2 = blend(b1, a1, mask2); |
179 | c2 = blend(c1, b1, mask2); |
180 | // [a0..a7] |
181 | // [b5..b7,b0..b4] |
182 | // [c2..c7,c0,c1] |
183 | a = a2; |
184 | b = align8<3>(b2, b2); |
185 | c = align8<6>(c2, c2); |
186 | #else |
187 | T t0, t1, t2, t3; |
188 | t0 = a; |
189 | t1 = align8<6>(a, b); |
190 | t2 = align8<4>(b, c); |
191 | t3 = move8_l<2>(c); |
192 | // [a0,b0,c0,a1,b1,c1, ... ] |
193 | // [a2,b2,c2,a3,b3,c3, ... ] |
194 | // [a4,b4,c4,a5,b5,c5, ... ] |
195 | // [a6,b6,c6,a7,b7,c7, ... ] |
196 | typename same_width<T>::u32 b0, b1, b2, b3; |
197 | b0 = zip8_lo(t0, t1); |
198 | b1 = zip8_lo(t2, t3); |
199 | b2 = zip8_hi(t0, t1); |
200 | b3 = zip8_hi(t2, t3); |
201 | // [a0,a2,b0,b2,c0,c2,a1,a3] |
202 | // [a4,a6,b4,b6,c4,c6,a5,a7] |
203 | // [b1,b3,c1,c3, ... ] |
204 | // [b5,b7,c5,c7, ... ] |
205 | typename same_width<T>::u64 c0, c1, c2; |
206 | c0 = zip4_lo(b0, b1); |
207 | c1 = zip4_hi(b0, b1); |
208 | c2 = zip4_lo(b2, b3); |
209 | // [a0,a2,a4,a6,b0,b2,b4,b6] |
210 | // [c0,c2,c4,c6,a1,a3,a5,a7] |
211 | // [b1,b3,b5,b7,c1,c3,c5,c7] |
212 | t0 = c0; |
213 | t1 = shuffle1<1,0>(c0, c1); |
214 | t2 = splat2<1>(c1); |
215 | t3 = c2; |
216 | // [a0,a2,a4,a6,b0,b2,b4,b6] |
217 | // [b0,b2,b4,b6,c0,c2,c4,c6] |
218 | // [a1,a3,a5,a7,a1,a3,a5,a7] |
219 | // [b1,b3,b5,b7,c1,c3,c5,c7] |
220 | a = zip8_lo(t0, t2); |
221 | b = zip8_lo(t1, t3); |
222 | c = zip8_hi(t1, t3); |
223 | #endif |
224 | } |
225 | |
226 | template<class T> SIMDPP_INL |
227 | void v_mem_unpack3_impl32_128(T& a, T& b, T& c) |
228 | { |
229 | #if SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA |
230 | using U = typename T::uint_vector_type; |
231 | |
232 | // [a0,b0,c0,a1] |
233 | // [b1,c1,a2,b2] |
234 | // [c2,a3,b3,c3] |
235 | U mask1 = make_shuffle_bytes16_mask<0,3,4+2,4+1>(mask1); |
236 | T a1, b1, c1; |
237 | a1 = shuffle_bytes16(a, b, mask1); |
238 | b1 = shuffle_bytes16(b, c, mask1); |
239 | c1 = shuffle_bytes16(c, a, mask1); |
240 | // [a0,a1,a2,c1] |
241 | // [b1,b2,b3,a3] |
242 | // [c2,c3,c0,b0] |
243 | T a2, b2, c2; |
244 | U mask2 = make_uint(0xffffffff); |
245 | mask2 = move4_l<1>(mask2); |
246 | |
247 | a2 = blend(a1, b1, mask2); |
248 | b2 = blend(b1, c1, mask2); |
249 | c2 = blend(c1, a1, mask2); |
250 | // [a0,a1,a2,a3] |
251 | // [b1,b2,b3,b0] |
252 | // [c2,c3,c0,c1] |
253 | a = a2; |
254 | b = align4<3>(b2, b2); |
255 | c = align4<2>(c2, c2); |
256 | #else |
257 | T t11, t12, t21, t22, t31, t32; |
258 | // [a0,b0,c0,a1] |
259 | // [b1,c1,a2,b2] |
260 | // [c2,a3,b3,c3] |
261 | t11 = a; |
262 | t12 = shuffle2<0,1,2,3>(c, b); |
263 | t21 = shuffle2<0,1,0,1>(a, b); |
264 | t22 = shuffle2<2,3,2,3>(b, c); |
265 | t31 = shuffle2<2,3,0,1>(a, b); |
266 | t32 = c; |
267 | // [a0,b0,c0,a1] |
268 | // [c2,a3,a2,b2] |
269 | // [a0,b0,b1,c1] |
270 | // [a2,b2,b3,c3] |
271 | // [c0,a1,b1,c1] |
272 | // [c2,a3,b3,c3] |
273 | a = shuffle2<0,3,2,1>(t11, t12); |
274 | b = shuffle2<1,2,1,2>(t21, t22); |
275 | c = shuffle2<0,3,0,3>(t31, t32); |
276 | #endif |
277 | } |
278 | |
279 | template<class T> SIMDPP_INL |
280 | void v_mem_unpack3_impl64_128(T& a, T& b, T& c) |
281 | { |
282 | T d0, d1, d2; |
283 | d0 = shuffle1<0,1>(a, b); |
284 | d1 = shuffle1<1,0>(a, c); |
285 | d2 = shuffle1<0,1>(b, c); |
286 | a = d0; b = d1; c = d2; |
287 | } |
288 | |
289 | template<class V> SIMDPP_INL |
290 | void v_mem_unpack3_shuffle128(any_vec<16,V>& qa, any_vec<16,V>& qb, any_vec<16,V>& qc) |
291 | { |
// nothing to do: a single 128-bit vector has no lanes to rearrange
(void) qa; (void) qb; (void) qc;
293 | } |
294 | |
295 | template<class V> SIMDPP_INL |
296 | void v_mem_unpack3_shuffle128(any_vec<32,V>& qa, any_vec<32,V>& qb, any_vec<32,V>& qc) |
297 | { |
// Shuffle the vectors so that the lower halves contain the first three
// 128-bit blocks of the packed data (all of a and the lower half of b)
// and the upper halves contain the remaining three blocks.
300 | |
301 | V a0, b0, c0, a1, b1, c1; |
302 | |
303 | a0 = qa.wrapped(); b0 = qb.wrapped(); c0 = qc.wrapped(); |
304 | |
305 | a1 = shuffle1_128<0,1>(a0, b0); |
306 | b1 = shuffle1_128<1,0>(a0, c0); |
307 | c1 = shuffle1_128<0,1>(b0, c0); |
308 | |
309 | qa.wrapped() = a1; qb.wrapped() = b1; qc.wrapped() = c1; |
310 | } |
311 | |
312 | #if SIMDPP_USE_AVX512F |
313 | template<class V> SIMDPP_INL |
314 | void v_mem_unpack3_shuffle128(any_vec<64,V>& qa, any_vec<64,V>& qb, any_vec<64,V>& qc) |
315 | { |
316 | V a, b, c; // TODO: optimize. Using full-vector shuffle may be faster |
317 | a = qa.wrapped(); b = qb.wrapped(); c = qc.wrapped(); |
318 | |
319 | V t11, t12, t21, t22, t31, t32; |
320 | // [a0,b0,c0,a1] |
321 | // [b1,c1,a2,b2] |
322 | // [c2,a3,b3,c3] |
323 | t11 = a; |
324 | t12 = shuffle2_128<0,1,2,3>(c, b); |
325 | t21 = shuffle2_128<0,1,0,1>(a, b); |
326 | t22 = shuffle2_128<2,3,2,3>(b, c); |
327 | t31 = shuffle2_128<2,3,0,1>(a, b); |
328 | t32 = c; |
329 | // [a0,b0,c0,a1] |
330 | // [c2,a3,a2,b2] |
331 | // [a0,b0,b1,c1] |
332 | // [a2,b2,b3,c3] |
333 | // [c0,a1,b1,c1] |
334 | // [c2,a3,b3,c3] |
335 | a = shuffle2_128<0,3,2,1>(t11, t12); |
336 | b = shuffle2_128<1,2,1,2>(t21, t22); |
337 | c = shuffle2_128<0,3,0,3>(t31, t32); |
338 | |
339 | qa.wrapped() = a; qb.wrapped() = b; qc.wrapped() = c; |
340 | } |
341 | #endif |
342 | |
343 | /** Concatenates @a a, @a b and @a c and stores the elements of the resulting |
344 | array as follows: |
345 | * every (3n)-th element is stored to @a a |
346 | * every (3n+1)-th element is stored to @a b |
347 | * every (3n+2)-th element is stored to @a c |
348 | |
349 | n = [0, <number of elements in vector> - 1] |
350 | */ |
351 | template<unsigned N> SIMDPP_INL |
352 | void mem_unpack3(uint8<N>& a, uint8<N>& b, uint8<N>& c) |
353 | { |
354 | v_mem_unpack3_shuffle128(a, b, c); |
355 | v_mem_unpack3_impl8_128(a, b, c); |
356 | } |
357 | |
358 | template<unsigned N> SIMDPP_INL |
359 | void mem_unpack3(uint16<N>& a, uint16<N>& b, uint16<N>& c) |
360 | { |
361 | v_mem_unpack3_shuffle128(a, b, c); |
362 | v_mem_unpack3_impl16_128(a, b, c); |
363 | } |
364 | |
365 | template<unsigned N> SIMDPP_INL |
366 | void mem_unpack3(uint32<N>& a, uint32<N>& b, uint32<N>& c) |
367 | { |
368 | v_mem_unpack3_shuffle128(a, b, c); |
369 | v_mem_unpack3_impl32_128(a, b, c); |
370 | } |
371 | |
372 | template<unsigned N> SIMDPP_INL |
373 | void mem_unpack3(uint64<N>& a, uint64<N>& b, uint64<N>& c) |
374 | { |
375 | v_mem_unpack3_shuffle128(a, b, c); |
376 | v_mem_unpack3_impl64_128(a, b, c); |
377 | } |
378 | |
379 | template<unsigned N> SIMDPP_INL |
380 | void mem_unpack3(float32<N>& a, float32<N>& b, float32<N>& c) |
381 | { |
382 | v_mem_unpack3_shuffle128(a, b, c); |
383 | v_mem_unpack3_impl32_128(a, b, c); |
384 | } |
385 | |
386 | template<unsigned N> SIMDPP_INL |
387 | void mem_unpack3(float64<N>& a, float64<N>& b, float64<N>& c) |
388 | { |
389 | v_mem_unpack3_shuffle128(a, b, c); |
390 | v_mem_unpack3_impl64_128(a, b, c); |
391 | } |
392 | |
393 | /** Generic implementation of mem_unpack4. The 256-bit version applies 128-bit |
394 | operations to each half of each vector separately. |
395 | */ |
396 | template<class T> SIMDPP_INL |
397 | void v_mem_unpack4_impl8_128(T& a, T& b, T& c, T& d) |
398 | { |
399 | #if SIMDPP_USE_SSSE3 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA |
400 | // TODO: optimize for Altivec and MSA |
401 | typename same_width<T>::u32 b0, b1, b2, b3; |
402 | b0 = transpose_inplace(a); |
403 | b1 = transpose_inplace(b); |
404 | b2 = transpose_inplace(c); |
405 | b3 = transpose_inplace(d); |
406 | |
407 | transpose4(b0, b1, b2, b3); |
408 | a = b0; b = b1; c = b2; d = b3; |
409 | #else |
410 | // [a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3 ] |
411 | // [a4, b4, c4, d4, a5, b5, c5, d5, a6, b6, c6, d6, a7, b7, c7, d7 ] |
412 | // [a8, b8, c8, d8, a9, b9, c9, d9, a10,b10,c10,d10,a11,b11,c11,d11] |
413 | // [a12,b12,c12,d12,a13,b13,c13,d13,a14,b14,c14,d14,a15,b15,c15,d15] |
414 | T b0, b1, b2, b3, c0, c1, c2, c3; |
415 | b0 = zip16_lo(a, b); |
416 | b1 = zip16_hi(a, b); |
417 | b2 = zip16_lo(c, d); |
418 | b3 = zip16_hi(c, d); |
419 | // [a0, a4, b0, b4, c0, c4, d0, d4, a1, a5, b1, b5, c1, c5, d1, d5 ] |
420 | // [a2, a6, b2, b6, c2, c6, d2, d6, a3, a7, b3, b7, c3, c7, d3, d7 ] |
421 | // [a8, a12,b8, b12,c8, c12,d8, d12,a9, a13,b9, b13,c9, c13,d9, d13] |
422 | // [a10,a14,b10,b14,c10,c14,d10,d14,a11,a15,b11,b15,c11,c15,d11,d15] |
423 | c0 = zip16_lo(b0, b1); |
424 | c1 = zip16_hi(b0, b1); |
425 | c2 = zip16_lo(b2, b3); |
426 | c3 = zip16_hi(b2, b3); |
427 | // [a0, a2, a4, a6, b0, b2, b4, b6, c0, c2, c4, c6, d0, d2, d4, d6 ] |
428 | // [a1, a3, a5, a7, b1, b3, b5, b7, c1, c3, c5, c7, d1, d3, d5, d7 ] |
429 | // [a8, a10,a12,a14,b8, b10,b12,b14,c8, c10,c12,c14,d8, d10,d12,d14] |
430 | // [a9, a11,a13,a15,b9, b11,b13,b15,c9, c11,c13,c15,d9, d11,d13,d15] |
431 | typename same_width<T>::u64 d0, d1, d2, d3; |
432 | d0 = zip16_lo(c0, c1); |
433 | d1 = zip16_hi(c0, c1); |
434 | d2 = zip16_lo(c2, c3); |
435 | d3 = zip16_hi(c2, c3); |
436 | // [a0 .. a7, b0 .. b7 ] |
437 | // [c0 .. c7, d0 .. d7 ] |
438 | // [a8 .. a15, b8 .. b15 ] |
// [c8 .. c15, d8 .. d15 ]
440 | a = zip2_lo(d0, d2); |
441 | b = zip2_hi(d0, d2); |
442 | c = zip2_lo(d1, d3); |
443 | d = zip2_hi(d1, d3); |
444 | #endif |
445 | } |
446 | |
447 | template<class T> SIMDPP_INL |
448 | void v_mem_unpack4_impl16_128(T& a, T& b, T& c, T& d) |
449 | { |
450 | // [a0,b0,c0,d0,a1,b1,c1,d1] |
451 | // [a2,b2,c2,d2,a3,b3,c3,d3] |
452 | // [a4,b4,c4,d4,a5,b5,c5,d5] |
453 | // [a6,b6,c6,d6,a7,b7,c7,d7] |
454 | typename same_width<T>::u16 t0, t1, t2, t3; |
455 | t0 = zip8_lo(a, b); |
456 | t1 = zip8_hi(a, b); |
457 | t2 = zip8_lo(c, d); |
458 | t3 = zip8_hi(c, d); |
459 | // [a0,a2,b0,b2,c0,c2,d0,d2] |
460 | // [a1,a3,b1,b3,c1,c3,d1,d3] |
461 | // [a4,a6,b4,b6,c4,c6,d4,d6] |
462 | // [a5,a7,b5,b7,c5,c7,d5,d7] |
463 | typename same_width<T>::u64 u0, u1, u2, u3; |
464 | u0 = zip8_lo(t0, t1); |
465 | u1 = zip8_hi(t0, t1); |
466 | u2 = zip8_lo(t2, t3); |
467 | u3 = zip8_hi(t2, t3); |
468 | // [a0,a1,a2,a3,b0,b1,b2,b3] |
469 | // [c0,c1,c2,c3,d0,d1,d2,d3] |
470 | // [a4,a5,a6,a7,b4,b5,b6,b7] |
471 | // [c4,c5,c6,c7,d4,d5,d6,d7] |
472 | a = zip2_lo(u0, u2); |
473 | b = zip2_hi(u0, u2); |
474 | c = zip2_lo(u1, u3); |
475 | d = zip2_hi(u1, u3); |
476 | } |
477 | |
478 | template<class T> SIMDPP_INL |
479 | void v_mem_unpack4_impl32_128(T& a, T& b, T& c, T& d) |
480 | { |
481 | transpose4(a, b, c, d); |
482 | } |
483 | |
484 | template<class T> SIMDPP_INL |
485 | void v_mem_unpack4_impl64_128(T& a, T& b, T& c, T& d) |
486 | { |
487 | transpose2(a, c); |
488 | transpose2(b, d); |
489 | T t; |
490 | t = b; |
491 | b = c; |
492 | c = t; |
493 | } |
494 | |
495 | template<class V> SIMDPP_INL |
496 | void v_mem_unpack4_shuffle128(any_vec<16,V>& qa, any_vec<16,V>& qb, |
497 | any_vec<16,V>& qc, any_vec<16,V>& qd) |
498 | { |
// nothing to do: a single 128-bit vector has no lanes to rearrange
(void) qa; (void) qb; (void) qc; (void) qd;
500 | } |
501 | |
502 | template<class V> SIMDPP_INL |
503 | void v_mem_unpack4_shuffle128(any_vec<32,V>& qa, any_vec<32,V>& qb, |
504 | any_vec<32,V>& qc, any_vec<32,V>& qd) |
505 | { |
506 | V a0, b0, c0, d0, a1, b1, c1, d1; |
507 | |
508 | a0 = qa.wrapped(); b0 = qb.wrapped(); c0 = qc.wrapped(); d0 = qd.wrapped(); |
509 | |
510 | a1 = shuffle1_128<0,0>(a0, c0); |
511 | b1 = shuffle1_128<1,1>(a0, c0); |
512 | c1 = shuffle1_128<0,0>(b0, d0); |
513 | d1 = shuffle1_128<1,1>(b0, d0); |
514 | |
515 | qa.wrapped() = a1; qb.wrapped() = b1; qc.wrapped() = c1; qd.wrapped() = d1; |
516 | } |
517 | |
518 | #if SIMDPP_USE_AVX512F |
519 | template<class V> SIMDPP_INL |
520 | void v_mem_unpack4_shuffle128(any_vec<64,V>& qa, any_vec<64,V>& qb, |
521 | any_vec<64,V>& qc, any_vec<64,V>& qd) |
522 | { |
523 | V a, b, c, d; // TODO: optimize. Using full-vector shuffle/permute will be faster |
524 | |
525 | a = qa.wrapped(); b = qb.wrapped(); c = qc.wrapped(); d = qd.wrapped(); |
526 | |
527 | V t1, t2, t3, t4; |
528 | // [a0,a1,a2,a3] |
529 | // [b0,b1,b2,b3] |
530 | // [c0,c1,c2,c3] |
531 | // [d0,d1,d2,d3] |
532 | t1 = shuffle2_128<0,2,0,2>(a, b); |
533 | t2 = shuffle2_128<1,3,1,3>(a, b); |
534 | t3 = shuffle2_128<0,2,0,2>(c, d); |
535 | t4 = shuffle2_128<1,3,1,3>(c, d); |
536 | // [a0,a2,b0,b2] |
537 | // [a1,a3,b1,b3] |
538 | // [c0,c2,d0,d2] |
539 | // [c1,c3,d1,d3] |
540 | a = shuffle2_128<0,2,0,2>(t1, t3); |
541 | b = shuffle2_128<0,2,0,2>(t2, t4); |
542 | c = shuffle2_128<1,3,1,3>(t1, t3); |
543 | d = shuffle2_128<1,3,1,3>(t2, t4); |
544 | // [a0,b0,c0,d0] |
545 | // [a1,b1,c1,d1] |
546 | // [a2,b2,c2,d2] |
547 | // [a3,b3,c3,d3] |
548 | |
549 | qa.wrapped() = a; qb.wrapped() = b; qc.wrapped() = c; qd.wrapped() = d; |
550 | } |
551 | #endif |
552 | |
553 | /** Concatenates @a a, @a b, @a c and @a d and stores the elements of the |
554 | resulting array as follows: |
555 | * every (4n)-th element is stored to @a a |
556 | * every (4n+1)-th element is stored to @a b |
557 | * every (4n+2)-th element is stored to @a c |
558 | * every (4n+3)-th element is stored to @a d |
559 | |
560 | n = [0, <number of elements in vector> - 1] |
561 | */ |
562 | // @icost{SSE2, SSE3, 16} |
563 | // @icost{SSSE3, SSE4.1, 12} |
564 | template<unsigned N> SIMDPP_INL |
565 | void mem_unpack4(uint8<N>& a, uint8<N>& b, uint8<N>& c, uint8<N>& d) |
566 | { |
567 | v_mem_unpack4_shuffle128(a, b, c, d); |
568 | v_mem_unpack4_impl8_128(a, b, c, d); |
569 | } |
570 | |
571 | template<unsigned N> SIMDPP_INL |
572 | void mem_unpack4(uint16<N>& a, uint16<N>& b, uint16<N>& c, uint16<N>& d) |
573 | { |
574 | v_mem_unpack4_shuffle128(a, b, c, d); |
575 | v_mem_unpack4_impl16_128(a, b, c, d); |
576 | } |
577 | |
578 | template<unsigned N> SIMDPP_INL |
579 | void mem_unpack4(uint32<N>& a, uint32<N>& b, uint32<N>& c, uint32<N>& d) |
580 | { |
581 | v_mem_unpack4_shuffle128(a, b, c, d); |
582 | v_mem_unpack4_impl32_128(a, b, c, d); |
583 | } |
584 | |
585 | template<unsigned N> SIMDPP_INL |
586 | void mem_unpack4(uint64<N>& a, uint64<N>& b, uint64<N>& c, uint64<N>& d) |
587 | { |
588 | v_mem_unpack4_shuffle128(a, b, c, d); |
589 | v_mem_unpack4_impl64_128(a, b, c, d); |
590 | } |
591 | |
592 | template<unsigned N> SIMDPP_INL |
593 | void mem_unpack4(float32<N>& a, float32<N>& b, float32<N>& c, float32<N>& d) |
594 | { |
595 | v_mem_unpack4_shuffle128(a, b, c, d); |
596 | v_mem_unpack4_impl32_128(a, b, c, d); |
597 | } |
598 | |
599 | template<unsigned N> SIMDPP_INL |
600 | void mem_unpack4(float64<N>& a, float64<N>& b, float64<N>& c, float64<N>& d) |
601 | { |
602 | v_mem_unpack4_shuffle128(a, b, c, d); |
603 | v_mem_unpack4_impl64_128(a, b, c, d); |
604 | } |
605 | |
606 | /** Concatenates the given vectors and stores the elements of the resulting |
607 | array as follows: |
608 | * every (3n)-th element of the first 48 elements is stored to @a a |
609 | * every (3n+1)-th element of the first 48 elements is stored to @a b |
610 | * every (3n+2)-th element of the first 48 elements is stored to @a c |
611 | * every (3n)-th element of the last 48 elements is stored to @a d |
612 | * every (3n+1)-th element of the last 48 elements is stored to @a e |
* every (3n+2)-th element of the last 48 elements is stored to @a f
614 | |
615 | n = [0, <number of elements in vector> - 1] |
616 | */ |
617 | static SIMDPP_INL |
618 | void mem_unpack6(uint8x16& a, uint8x16& b, uint8x16& c, |
619 | uint8x16& d, uint8x16& e, uint8x16& f) |
620 | { |
621 | uint8x16 t0, t1, t2, t3, t4, t5; |
622 | t0 = zip16_lo(a, d); |
623 | t1 = zip16_hi(a, d); |
624 | t2 = zip16_lo(b, e); |
625 | t3 = zip16_hi(b, e); |
626 | t4 = zip16_lo(c, f); |
627 | t5 = zip16_hi(c, f); |
628 | |
629 | uint8x16 u0, u1, u2, u3, u4, u5; |
630 | u0 = zip16_lo(t0, t3); |
631 | u1 = zip16_hi(t0, t3); |
632 | u2 = zip16_lo(t1, t4); |
633 | u3 = zip16_hi(t1, t4); |
634 | u4 = zip16_lo(t2, t5); |
635 | u5 = zip16_hi(t2, t5); |
636 | |
637 | t0 = zip16_lo(u0, u3); |
638 | t1 = zip16_hi(u0, u3); |
639 | t2 = zip16_lo(u1, u4); |
640 | t3 = zip16_hi(u1, u4); |
641 | t4 = zip16_lo(u2, u5); |
642 | t5 = zip16_hi(u2, u5); |
643 | |
644 | u0 = zip16_lo(t0, t3); |
645 | u1 = zip16_hi(t0, t3); |
646 | u2 = zip16_lo(t1, t4); |
647 | u3 = zip16_hi(t1, t4); |
648 | u4 = zip16_lo(t2, t5); |
649 | u5 = zip16_hi(t2, t5); |
650 | |
651 | t0 = zip16_lo(u0, u3); |
652 | t1 = zip16_hi(u0, u3); |
653 | t2 = zip16_lo(u1, u4); |
654 | t3 = zip16_hi(u1, u4); |
655 | t4 = zip16_lo(u2, u5); |
656 | t5 = zip16_hi(u2, u5); |
657 | |
658 | a = zip16_lo(t0, t3); |
659 | b = zip16_hi(t0, t3); |
660 | c = zip16_lo(t1, t4); |
661 | d = zip16_hi(t1, t4); |
662 | e = zip16_lo(t2, t5); |
663 | f = zip16_hi(t2, t5); |
664 | } |
665 | |
666 | static SIMDPP_INL |
667 | void mem_unpack6(uint16x8& a, uint16x8& b, uint16x8& c, |
668 | uint16x8& d, uint16x8& e, uint16x8& f) |
669 | { |
670 | uint16x8 t0, t1, t2, t3, t4, t5; |
671 | t0 = zip8_lo(a, d); |
672 | t1 = zip8_hi(a, d); |
673 | t2 = zip8_lo(b, e); |
674 | t3 = zip8_hi(b, e); |
675 | t4 = zip8_lo(c, f); |
676 | t5 = zip8_hi(c, f); |
677 | |
678 | uint16x8 u0, u1, u2, u3, u4, u5; |
679 | u0 = zip8_lo(t0, t3); |
680 | u1 = zip8_hi(t0, t3); |
681 | u2 = zip8_lo(t1, t4); |
682 | u3 = zip8_hi(t1, t4); |
683 | u4 = zip8_lo(t2, t5); |
684 | u5 = zip8_hi(t2, t5); |
685 | |
686 | a = zip8_lo(u0, u3); |
687 | b = zip8_hi(u0, u3); |
688 | c = zip8_lo(u1, u4); |
689 | d = zip8_hi(u1, u4); |
690 | e = zip8_lo(u2, u5); |
691 | f = zip8_hi(u2, u5); |
692 | } |
693 | |
694 | } // namespace insn |
695 | } // namespace detail |
696 | } // namespace SIMDPP_ARCH_NAMESPACE |
697 | } // namespace simdpp |
698 | |
699 | #endif |
700 | |
701 | |