1/* Copyright (C) 2012-2014 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_TRANSPOSE_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_TRANSPOSE_H
10
11#include <simdpp/types.h>
12#include <simdpp/detail/not_implemented.h>
13#include <simdpp/detail/width.h>
14#include <simdpp/core/permute_bytes16.h>
15#include <simdpp/core/zip_lo.h>
16#include <simdpp/core/zip_hi.h>
17#include <simdpp/detail/null/transpose.h>
18#include <simdpp/detail/neon/shuffle.h>
19#include <simdpp/detail/vector_array_macros.h>
20
21namespace simdpp {
22namespace SIMDPP_ARCH_NAMESPACE {
23namespace detail {
24namespace insn {
25
26
// Forward declarations of the zip-based 4-row transpose helpers implemented at
// the bottom of this file. The template parameters name the successively wider
// element types used for the intermediate zip results (e.g. V8 = 8-bit row
// type, V16/V32 = 16/32-bit intermediate types).
template<class V8, class V16, class V32> SIMDPP_INL
void v_sse_transpose8x4(V8& a0, V8& a1, V8& a2, V8& a3);
template<class V16, class V32, class V64> SIMDPP_INL
void v_sse_transpose16x4(V16& a0, V16& a1, V16& a2, V16& a3);
template<class V, class D> SIMDPP_INL
void v_sse_transpose32x4(V& a0, V& a1, V& a2, V& a3);
33
/** Transposes eight 2x2 8-bit matrices within two int8x16 vectors

    @code
    r0 = [ a0_0; a1_0 ; ... ; a0_14; a1_14 ]
    r1 = [ a0_1; a1_1 ; ... ; a0_15; a1_15 ]
    @endcode

    @par 128-bit version:
    @icost{SSE2-AVX2, 4}
    @icost{ALTIVEC, 2-4}

    @par 256-bit version:
    @icost{SSE2-AVX, 8}
    @icost{AVX2, 4}
    @icost{ALTIVEC, 4-6}

    The lower and higher 128-bit halves are processed as if 128-bit instruction
    was applied to each of them separately.
*/
static SIMDPP_INL
void i_transpose2(uint8x16& a0, uint8x16& a1)
{
#if SIMDPP_USE_NULL
    detail::null::transpose2(a0, a1);
#elif SIMDPP_USE_NEON
    // vtrnq_u8 returns the even-indexed element pairs in val[0] and the
    // odd-indexed pairs in val[1]
    auto r = vtrnq_u8(a0.native(), a1.native());
    a0 = r.val[0];
    a1 = r.val[1];
#elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // The mask argument only supplies the vector type for deduction; its
    // value is not read before being assigned.
    uint8x16 m0 = make_shuffle_bytes16_mask<0,16+0, 2,16+2, 4,16+4, 6,16+6,
                                            8,16+8, 10,16+10, 12,16+12, 14,16+14>(m0);
    uint8x16 m1 = make_shuffle_bytes16_mask<1,16+1, 3,16+3, 5,16+5, 7,16+7,
                                            9,16+9, 11,16+11, 13,16+13, 15,16+15>(m1);
    // NOTE(review): b0/b1 are declared uint16x8 although they hold 8-bit
    // shuffle results; relies on same-width implicit conversion — confirm
    // intentional.
    uint16x8 b0, b1;
    b0 = shuffle_bytes16(a0, a1, m0);
    b1 = shuffle_bytes16(a0, a1, m1);
    a0 = b0; a1 = b1;
#else
    SIMDPP_NOT_IMPLEMENTED2(a0, a1);
#endif
}
75
/** Helper function.

    @code
    r = [a0,a4,a8,a12,a1,a5,a9,a13,a2,a6,a10,a14,a3,a7,a11,a15]
    @endcode

    The 256-bit version applies the 128 bit operation to the two halves.

    Needs SSSE3
*/
static SIMDPP_INL
uint8x16 transpose_inplace(const uint8x16& a)
{
#if SIMDPP_USE_SSSE3 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // the compiler will take this out of any loops automatically
    // index pattern gathers bytes with stride 4, i.e. transposes the vector
    // viewed as a 4x4 byte matrix
    uint8x16 idx = make_uint(0, 4, 8, 12, 1, 5, 9, 13,
                             2, 6, 10,14, 3, 7, 11,15);
    return permute_bytes16(a, idx);
#else
    return SIMDPP_NOT_IMPLEMENTED1(a);
#endif
}
98
// 256-bit variant: transposes the 4x4 byte matrix within each 128-bit lane.
static SIMDPP_INL
uint8x32 transpose_inplace(const uint8x32& a)
{
#if SIMDPP_USE_AVX2
    // permute_bytes16 operates within 128-bit lanes, so a single 16-byte
    // index pattern covers both halves
    uint8x32 idx = make_uint(0, 4, 8, 12, 1, 5, 9, 13,
                             2, 6, 10,14, 3, 7, 11,15);
    return permute_bytes16(a, idx);
#elif SIMDPP_USE_SSSE3 || SIMDPP_USE_ALTIVEC
    SIMDPP_VEC_ARRAY_IMPL1(uint8x32, transpose_inplace, a);
#else
    return SIMDPP_NOT_IMPLEMENTED1(a);
#endif
}
112
#if SIMDPP_USE_AVX512BW
// 512-bit variant: same per-128-bit-lane 4x4 byte transpose as above.
static SIMDPP_INL
uint8<64> transpose_inplace(const uint8<64>& a)
{
    uint8<64> idx = make_uint(0, 4, 8, 12, 1, 5, 9, 13,
                              2, 6, 10,14, 3, 7, 11,15);
    return permute_bytes16(a, idx);
}
#endif
122
// Transposes four 2x2 16-bit matrices within two uint16x8 vectors:
// a0 receives the even-indexed element pairs, a1 the odd-indexed pairs.
static SIMDPP_INL
void i_transpose2(uint16x8& a0, uint16x8& a1)
{
#if SIMDPP_USE_NULL
    detail::null::transpose2(a0, a1);
#elif SIMDPP_USE_SSE2
    // interleave the inputs, then gather even/odd 32-bit pairs back
    uint32x4 b0, b1;
    b0 = zip8_lo(a0, a1);
    b1 = zip8_hi(a0, a1);
    a0 = shuffle2<0,2,0,2>(b0, b1);
    a1 = shuffle2<1,3,1,3>(b0, b1);
#elif SIMDPP_USE_NEON
    auto r = vtrnq_u16(a0.native(), a1.native());
    a0 = r.val[0];
    a1 = r.val[1];
#elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // mask argument only supplies the type; its value is not read
    uint16x8 m0 = make_shuffle_bytes16_mask<0,8+0, 2,8+2, 4,8+4, 6,8+6>(m0);
    uint16x8 m1 = make_shuffle_bytes16_mask<1,8+1, 3,8+3, 5,8+5, 7,8+7>(m1);
    uint16x8 b0, b1;
    b0 = shuffle_bytes16(a0, a1, m0);
    b1 = shuffle_bytes16(a0, a1, m1);
    a0 = b0; a1 = b1;
#endif
}
147
#if SIMDPP_USE_AVX2
// 2x2 16-bit transpose of two 256-bit vectors; each 128-bit lane is handled
// independently, mirroring the SSE2 path of the 128-bit overload.
static SIMDPP_INL
void i_transpose2(uint16x16& a0, uint16x16& a1)
{
    uint32x8 zl = zip8_lo(a0, a1);
    uint32x8 zh = zip8_hi(a0, a1);
    a0 = shuffle2<0,2,0,2>(zl, zh);
    a1 = shuffle2<1,3,1,3>(zl, zh);
}
#endif
159
#if SIMDPP_USE_AVX512BW
// 2x2 16-bit transpose of two 512-bit vectors, per 128-bit lane.
SIMDPP_INL void i_transpose2(uint16<32>& a0, uint16<32>& a1)
{
    uint32<16> zl = zip8_lo(a0, a1);
    uint32<16> zh = zip8_hi(a0, a1);
    a0 = shuffle2<0,2,0,2>(zl, zh);
    a1 = shuffle2<1,3,1,3>(zl, zh);
}
#endif
170
// Generic fallback for vector lengths wider than the native width:
// applies the transpose to each corresponding pair of sub-vectors.
template<unsigned N> SIMDPP_INL
void i_transpose2(uint16<N>& a0, uint16<N>& a1)
{
    SIMDPP_VEC_ARRAY_IMPL_REF2(uint16<N>, i_transpose2, a0, a1);
}
176
177// -----------------------------------------------------------------------------
178
// Transposes two 2x2 32-bit matrices within two uint32x4 vectors:
// a0 receives the even-indexed element pairs, a1 the odd-indexed pairs.
static SIMDPP_INL
void i_transpose2(uint32x4& a0, uint32x4& a1)
{
#if SIMDPP_USE_NULL
    detail::null::transpose2(a0, a1);
#elif SIMDPP_USE_SSE2
    // interleave to 64-bit pairs, then re-interleave the pairs
    uint64x2 b0, b1;
    b0 = zip4_lo(a0, a1);
    b1 = zip4_hi(a0, a1);
    a0 = zip2_lo(b0, b1);
    a1 = zip2_hi(b0, b1);
#elif SIMDPP_USE_NEON
    auto r = vtrnq_u32(a0.native(), a1.native());
    a0 = r.val[0];
    a1 = r.val[1];
#elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // mask argument only supplies the type; its value is not read
    uint32x4 m0 = make_shuffle_bytes16_mask<0,4+0, 2,4+2>(m0);
    uint32x4 m1 = make_shuffle_bytes16_mask<1,4+1, 3,4+3>(m1);
    uint32x4 b0, b1;
    b0 = shuffle_bytes16(a0, a1, m0);
    b1 = shuffle_bytes16(a0, a1, m1);
    a0 = b0; a1 = b1;
#endif
}
203
#if SIMDPP_USE_AVX2
// 2x2 32-bit transpose of two 256-bit vectors, per 128-bit lane.
static SIMDPP_INL
void i_transpose2(uint32x8& a0, uint32x8& a1)
{
    uint64x4 zl = zip4_lo(a0, a1);
    uint64x4 zh = zip4_hi(a0, a1);
    a0 = zip2_lo(zl, zh);
    a1 = zip2_hi(zl, zh);
}
#endif
215
#if SIMDPP_USE_AVX512F
// 2x2 32-bit transpose of two 512-bit vectors, per 128-bit lane.
static SIMDPP_INL
void i_transpose2(uint32<16>& a0, uint32<16>& a1)
{
    uint64<8> zl = zip4_lo(a0, a1);
    uint64<8> zh = zip4_hi(a0, a1);
    a0 = zip2_lo(zl, zh);
    a1 = zip2_hi(zl, zh);
}
#endif
227
// Generic fallback for wider-than-native uint32 vectors: per-sub-vector pairs.
template<unsigned N> SIMDPP_INL
void i_transpose2(uint32<N>& a0, uint32<N>& a1)
{
    SIMDPP_VEC_ARRAY_IMPL_REF2(uint32<N>, i_transpose2, a0, a1);
}
233
234// -----------------------------------------------------------------------------
235
// Transposes one 2x2 64-bit matrix held in two uint64x2 vectors.
static SIMDPP_INL
void i_transpose2(uint64x2& a0, uint64x2& a1)
{
#if SIMDPP_USE_SSE2 || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
    uint64x2 b0;
    b0 = zip2_lo(a0, a1);
    a1 = zip2_hi(a0, a1);
    a0 = b0;
#elif SIMDPP_USE_NEON
    neon::transpose2(a0, a1);
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    // element-wise fallback on targets without the needed 64-bit shuffles
    detail::null::transpose2(a0, a1);
#endif
}
250
#if SIMDPP_USE_AVX2
// 2x2 64-bit transpose within each 128-bit lane of two 256-bit vectors.
static SIMDPP_INL
void i_transpose2(uint64x4& a0, uint64x4& a1)
{
    uint64x4 lo = zip2_lo(a0, a1);
    a1 = zip2_hi(a0, a1);
    a0 = lo;
}
#endif
261
#if SIMDPP_USE_AVX512F
// 2x2 64-bit transpose within each 128-bit lane of two 512-bit vectors.
static SIMDPP_INL
void i_transpose2(uint64<8>& a0, uint64<8>& a1)
{
    uint64<8> lo = zip2_lo(a0, a1);
    a1 = zip2_hi(a0, a1);
    a0 = lo;
}
#endif
272
// Generic fallback for wider-than-native uint64 vectors: per-sub-vector pairs.
template<unsigned N> SIMDPP_INL
void i_transpose2(uint64<N>& a0, uint64<N>& a1)
{
    SIMDPP_VEC_ARRAY_IMPL_REF2(uint64<N>, i_transpose2, a0, a1);
}
278
279// -----------------------------------------------------------------------------
280
// Transposes two 2x2 float32 matrices within two float32x4 vectors.
static SIMDPP_INL
void i_transpose2(float32x4& a0, float32x4& a1)
{
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    detail::null::transpose2(a0, a1);
#elif SIMDPP_USE_SSE2
    // same zip sequence as the uint32 overload; bit_cast moves between the
    // float32/float64 views without changing the bit pattern
    float64x2 b0, b1;
    b0 = bit_cast<float64x2>(zip4_lo(a0, a1));
    b1 = bit_cast<float64x2>(zip4_hi(a0, a1));
    a0 = bit_cast<float32x4>(zip2_lo(b0, b1));
    a1 = bit_cast<float32x4>(zip2_hi(b0, b1));
#elif SIMDPP_USE_NEON
    auto r = vtrnq_f32(a0.native(), a1.native());
    a0 = r.val[0];
    a1 = r.val[1];
#elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // mask argument only supplies the type; its value is not read
    uint32x4 m0 = make_shuffle_bytes16_mask<0,4+0, 2,4+2>(m0);
    uint32x4 m1 = make_shuffle_bytes16_mask<1,4+1, 3,4+3>(m1);
    float32x4 b0, b1;
    b0 = shuffle_bytes16(a0, a1, m0);
    b1 = shuffle_bytes16(a0, a1, m1);
    a0 = b0; a1 = b1;
#endif
}
305
#if SIMDPP_USE_AVX
// 2x2 float32 transpose of two 256-bit vectors, per 128-bit lane.
static SIMDPP_INL
void i_transpose2(float32x8& a0, float32x8& a1)
{
    float64x4 zl = zip4_lo(a0, a1);
    float64x4 zh = zip4_hi(a0, a1);
    a0 = zip2_lo(zl, zh);
    a1 = zip2_hi(zl, zh);
}
#endif
317
#if SIMDPP_USE_AVX512F
// 2x2 float32 transpose of two 512-bit vectors, per 128-bit lane.
static SIMDPP_INL
void i_transpose2(float32<16>& a0, float32<16>& a1)
{
    float64<8> zl = zip4_lo(a0, a1);
    float64<8> zh = zip4_hi(a0, a1);
    a0 = zip2_lo(zl, zh);
    a1 = zip2_hi(zl, zh);
}
#endif
329
// Generic fallback for wider-than-native float32 vectors: per-sub-vector pairs.
template<unsigned N> SIMDPP_INL
void i_transpose2(float32<N>& a0, float32<N>& a1)
{
    SIMDPP_VEC_ARRAY_IMPL_REF2(float32<N>, i_transpose2, a0, a1);
}
335
336// -----------------------------------------------------------------------------
337
// Transposes one 2x2 float64 matrix held in two float64x2 vectors.
static SIMDPP_INL
void i_transpose2(float64x2& a0, float64x2& a1)
{
#if SIMDPP_USE_SSE2 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
    float64x2 b0;
    b0 = zip2_lo(a0, a1);
    a1 = zip2_hi(a0, a1);
    a0 = b0;
#elif SIMDPP_USE_NEON64
    // reuse the 64-bit integer transpose; the bit pattern is preserved
    uint64x2 b0, b1;
    b0 = a0; b1 = a1;
    i_transpose2(b0, b1);
    a0 = b0; a1 = b1;
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
    detail::null::transpose2(a0, a1);
#endif
}
355
#if SIMDPP_USE_AVX
// 2x2 float64 transpose within each 128-bit lane of two 256-bit vectors.
static SIMDPP_INL
void i_transpose2(float64x4& a0, float64x4& a1)
{
    float64x4 lo = zip2_lo(a0, a1);
    a1 = zip2_hi(a0, a1);
    a0 = lo;
}
#endif
366
#if SIMDPP_USE_AVX512F
// 2x2 float64 transpose within each 128-bit lane of two 512-bit vectors.
static SIMDPP_INL
void i_transpose2(float64<8>& a0, float64<8>& a1)
{
    float64<8> lo = zip2_lo(a0, a1);
    a1 = zip2_hi(a0, a1);
    a0 = lo;
}
#endif
377
// Generic fallback for wider-than-native float64 vectors: per-sub-vector pairs.
template<unsigned N> SIMDPP_INL
void i_transpose2(float64<N>& a0, float64<N>& a1)
{
    SIMDPP_VEC_ARRAY_IMPL_REF2(float64<N>, i_transpose2, a0, a1);
}
383
384// -----------------------------------------------------------------------------
385
// Forward declarations: the 8-bit 4-row transpose below delegates to these
// 32-bit overloads for its final permutation step.
static SIMDPP_INL
void i_transpose4(uint32x4& a0, uint32x4& a1,
                  uint32x4& a2, uint32x4& a3);

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_transpose4(uint32x8& a0, uint32x8& a1,
                  uint32x8& a2, uint32x8& a3);
#endif
395
// Transposes four 4x4 8-bit matrices held in the rows a0..a3 (one matrix per
// 4-byte column group).
static SIMDPP_INL
void i_transpose4(uint8x16& a0, uint8x16& a1,
                  uint8x16& a2, uint8x16& a3)
{
    // [a0,a1,a2,a3 ... ]
    // [b0,b1,b2,b3 ... ]
    // [c0,c1,c2,c3 ... ]
    // [d0,d1,d2,d3 ... ]
#if SIMDPP_USE_NULL
    detail::null::transpose4(a0, a1, a2, a3);
#elif SIMDPP_USE_SSE2
    v_sse_transpose8x4<uint8<16>, uint16<8>, uint32<4>>(a0, a1, a2, a3);
#elif SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // two rounds of pairwise transposes: first at 8-bit, then at 16-bit
    // granularity, which together realize the 4x4 transpose
    uint16x8 b0, b1, b2, b3;
    i_transpose2(a0, a1); // 8-bit transpose
    i_transpose2(a2, a3);
    b0 = a0; b1 = a1; b2 = a2; b3 = a3;
    i_transpose2(b0, b2); // 16-bit transpose
    i_transpose2(b1, b3);
    a0 = b0; a1 = b1; a2 = b2; a3 = b3;
#endif
}
418
419
#if SIMDPP_USE_AVX2
// 256-bit 8-bit 4-row transpose; delegates to the generic zip-based helper.
static SIMDPP_INL
void i_transpose4(uint8x32& a0, uint8x32& a1,
                  uint8x32& a2, uint8x32& a3)
{
    v_sse_transpose8x4<uint8<32>, uint16<16>, uint32<8>>(a0, a1, a2, a3);
}
#endif
428
#if SIMDPP_USE_AVX512BW
// 512-bit 8-bit 4-row transpose; delegates to the generic zip-based helper.
static SIMDPP_INL
void i_transpose4(uint8<64>& a0, uint8<64>& a1,
                  uint8<64>& a2, uint8<64>& a3)
{
    v_sse_transpose8x4<uint8<64>, uint16<32>, uint32<16>>(a0, a1, a2, a3);
}
#endif
437
// Generic fallback for wider-than-native uint8 vectors: per-sub-vector groups.
template<unsigned N> SIMDPP_INL
void i_transpose4(uint8<N>& a0, uint8<N>& a1, uint8<N>& a2, uint8<N>& a3)
{
    SIMDPP_VEC_ARRAY_IMPL_REF4(uint8<N>, i_transpose4, a0, a1, a2, a3);
}
443
444// -----------------------------------------------------------------------------
445
// Transposes two 4x4 16-bit matrices held in the rows a0..a3.
static SIMDPP_INL
void i_transpose4(uint16x8& a0, uint16x8& a1,
                  uint16x8& a2, uint16x8& a3)
{
#if SIMDPP_USE_NULL
    detail::null::transpose4(a0, a1, a2, a3);
#elif SIMDPP_USE_SSE2
    v_sse_transpose16x4<uint16<8>, uint32<4>, uint64<2>>(a0, a1, a2, a3);
#elif SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // two rounds of pairwise transposes: 16-bit, then 32-bit granularity
    uint32x4 b0, b1, b2, b3;
    i_transpose2(a0, a1); // 16-bit transpose
    i_transpose2(a2, a3);
    b0 = a0; b1 = a1; b2 = a2; b3 = a3;
    i_transpose2(b0, b2); // 32-bit transpose
    i_transpose2(b1, b3);
    a0 = b0; a1 = b1; a2 = b2; a3 = b3;
#endif
}
464
#if SIMDPP_USE_AVX2
// 256-bit 16-bit 4-row transpose; delegates to the generic zip-based helper.
static SIMDPP_INL
void i_transpose4(uint16x16& a0, uint16x16& a1,
                  uint16x16& a2, uint16x16& a3)
{
    v_sse_transpose16x4<uint16<16>, uint32<8>, uint64<4>>(a0, a1, a2, a3);
}
#endif
473
#if SIMDPP_USE_AVX512BW
// 512-bit 16-bit 4-row transpose; delegates to the generic zip-based helper.
// Guarded by AVX512BW (not AVX2): uint16<32> is only a native vector with
// AVX-512BW, matching the guard of i_transpose2(uint16<32>) above and of the
// uint8<64> overload. Without AVX512BW the SIMDPP_VEC_ARRAY fallback below
// handles this width.
SIMDPP_INL void i_transpose4(uint16<32>& a0, uint16<32>& a1,
                             uint16<32>& a2, uint16<32>& a3)
{
    v_sse_transpose16x4<uint16<32>, uint32<16>, uint64<8>>(a0, a1, a2, a3);
}
#endif
481
// Generic fallback for wider-than-native uint16 vectors: per-sub-vector groups.
template<unsigned N> SIMDPP_INL
void i_transpose4(uint16<N>& a0, uint16<N>& a1, uint16<N>& a2, uint16<N>& a3)
{
    SIMDPP_VEC_ARRAY_IMPL_REF4(uint16<N>, i_transpose4, a0, a1, a2, a3);
}
487
488// -----------------------------------------------------------------------------
489
// Transposes the 4x4 32-bit matrix whose rows are a0..a3.
static SIMDPP_INL
void i_transpose4(uint32x4& a0, uint32x4& a1,
                  uint32x4& a2, uint32x4& a3)
{
#if SIMDPP_USE_NULL
    detail::null::transpose4(a0, a1, a2, a3);
#elif SIMDPP_USE_SSE2
    v_sse_transpose32x4<uint32<4>, uint64<2>>(a0, a1, a2, a3);
#elif SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // two rounds of pairwise transposes: 32-bit, then 64-bit granularity
    uint64x2 b0, b1, b2, b3;
    i_transpose2(a0, a1); // 32-bit transpose
    i_transpose2(a2, a3);
    b0 = a0; b1 = a1; b2 = a2; b3 = a3;
    i_transpose2(b0, b2); // 64-bit transpose
    i_transpose2(b1, b3);
    a0 = b0; a1 = b1; a2 = b2; a3 = b3;
#endif
}
508
#if SIMDPP_USE_AVX2
// 256-bit 32-bit 4-row transpose; delegates to the generic zip-based helper.
static SIMDPP_INL
void i_transpose4(uint32x8& a0, uint32x8& a1,
                  uint32x8& a2, uint32x8& a3)
{
    v_sse_transpose32x4<uint32<8>, uint64<4>>(a0, a1, a2, a3);
}
#endif
517
#if SIMDPP_USE_AVX512F
// 512-bit 32-bit 4-row transpose; delegates to the generic zip-based helper.
// Guarded by AVX512F (not AVX2): uint32<16> is only a native vector with
// AVX-512F, matching the guard of i_transpose2(uint32<16>) above. Without
// AVX512F the SIMDPP_VEC_ARRAY fallback below handles this width.
static SIMDPP_INL
void i_transpose4(uint32<16>& a0, uint32<16>& a1,
                  uint32<16>& a2, uint32<16>& a3)
{
    v_sse_transpose32x4<uint32<16>, uint64<8>>(a0, a1, a2, a3);
}
#endif
526
// Generic fallback for wider-than-native uint32 vectors: per-sub-vector groups.
template<unsigned N> SIMDPP_INL
void i_transpose4(uint32<N>& a0, uint32<N>& a1, uint32<N>& a2, uint32<N>& a3)
{
    SIMDPP_VEC_ARRAY_IMPL_REF4(uint32<N>, i_transpose4, a0, a1, a2, a3);
}
532
533// -----------------------------------------------------------------------------
534
// Transposes the 4x4 float32 matrix whose rows are a0..a3.
static SIMDPP_INL
void i_transpose4(float32x4& a0, float32x4& a1,
                  float32x4& a2, float32x4& a3)
{
#if SIMDPP_USE_SSE2
    v_sse_transpose32x4<float32<4>, float64<2>>(a0, a1, a2, a3);
#else
    // reuse the integer transpose through bit-preserving conversions
    uint32x4 b0, b1, b2, b3;
    b0 = a0; b1 = a1; b2 = a2; b3 = a3;
    i_transpose4(b0, b1, b2, b3);
    a0 = b0; a1 = b1; a2 = b2; a3 = b3;
#endif
}
548
#if SIMDPP_USE_AVX
// 256-bit float32 4-row transpose; delegates to the generic zip-based helper.
static SIMDPP_INL
void i_transpose4(float32x8& a0, float32x8& a1,
                  float32x8& a2, float32x8& a3)
{
    v_sse_transpose32x4<float32<8>, float64<4>>(a0, a1, a2, a3);
}
#endif
557
#if SIMDPP_USE_AVX512F
// 512-bit float32 4-row transpose; delegates to the generic zip-based helper.
static SIMDPP_INL
void i_transpose4(float32<16>& a0, float32<16>& a1,
                  float32<16>& a2, float32<16>& a3)
{
    v_sse_transpose32x4<float32<16>, float64<8>>(a0, a1, a2, a3);
}
#endif
566
// Generic fallback for wider-than-native float32 vectors: per-sub-vector groups.
template<unsigned N> SIMDPP_INL
void i_transpose4(float32<N>& a0, float32<N>& a1, float32<N>& a2, float32<N>& a3)
{
    SIMDPP_VEC_ARRAY_IMPL_REF4(float32<N>, i_transpose4, a0, a1, a2, a3);
}
572
573// -----------------------------------------------------------------------------
574
// Transposes the 4x4 matrices of 32-bit elements held in rows a0..a3 (one
// matrix per 128-bit lane) using two rounds of zips. V is the row vector
// type; D is the matching 64-bit-element type for the intermediate results.
template<class V, class D> SIMDPP_INL
void v_sse_transpose32x4(V& a0, V& a1, V& a2, V& a3)
{
    D b0, b1, b2, b3;
    // [a0,a1,a2,a3]
    // [b0,b1,b2,b3]
    // [c0,c1,c2,c3]
    // [d0,d1,d2,d3]
    b0 = zip4_lo(a0, a1);
    b1 = zip4_hi(a0, a1);
    b2 = zip4_lo(a2, a3);
    b3 = zip4_hi(a2, a3);
    // [a0,b0,a1,b1]
    // [a2,b2,a3,b3]
    // [c0,d0,c1,d1]
    // [c2,d2,c3,d3]
    a0 = zip2_lo(b0, b2);
    a1 = zip2_hi(b0, b2);
    a2 = zip2_lo(b1, b3);
    a3 = zip2_hi(b1, b3);
}
596
// Transposes 4-row groups of 16-bit elements using three rounds of zips.
// The trailing layout comments show the final element order: each output row
// holds two transposed 4-element columns (e.g. a0 = [..0 elements, ..4
// elements]). V16 is the row type; V32/V64 are the intermediate types.
template<class V16, class V32, class V64> SIMDPP_INL
void v_sse_transpose16x4(V16& a0, V16& a1, V16& a2, V16& a3)
{
    V32 b0, b1, b2, b3;
    V64 c0, c1, c2, c3;
    b0 = zip8_lo(a0, a1);
    b1 = zip8_hi(a0, a1);
    b2 = zip8_lo(a2, a3);
    b3 = zip8_hi(a2, a3);
    // [a0,b0,a1,b1,a2,b2,a3,b3]
    // [a4,b4,a5,b5,a6,b6,a7,b7]
    // [c0,d0,c1,d1,c2,d2,c3,d3]
    // [c4,d4,c5,d5,c6,d6,c7,d7]
    c0 = zip4_lo(b0, b2);
    c1 = zip4_hi(b0, b2);
    c2 = zip4_lo(b1, b3);
    c3 = zip4_hi(b1, b3);
    // [a0,b0,c0,d0,a1,b1,c1,d1]
    // [a2,b2,c2,d2,a3,b3,c3,d3]
    // [a4,b4,c4,d4,a5,b5,c5,d5]
    // [a6,b6,c6,d6,a7,b7,c7,d7]
    a0 = zip2_lo(c0, c2);
    a1 = zip2_hi(c0, c2);
    a2 = zip2_lo(c1, c3);
    a3 = zip2_hi(c1, c3);
    // [a0,b0,c0,d0,a4,b4,c4,d4]
    // [a1,b1,c1,d1,a5,b5,c5,d5]
    // [a2,b2,c2,d2,a6,b6,c6,d6]
    // [a3,b3,c3,d3,a7,b7,c7,d7]
}
627
// Transposes 4-row groups of 8-bit elements: two rounds of byte/word zips
// gather 4-byte [a..d]k groups into 32-bit elements, then the 32-bit 4-row
// transpose puts each group into its final row. V8 is the row type; V16/V32
// are the intermediate types.
template<class V8, class V16, class V32> SIMDPP_INL
void v_sse_transpose8x4(V8& a0, V8& a1, V8& a2, V8& a3)
{
    V16 b0, b1, b2, b3;
    b0 = zip16_lo(a0, a1);
    b1 = zip16_lo(a2, a3);
    b2 = zip16_hi(a0, a1);
    b3 = zip16_hi(a2, a3);
    // [a0,b0,a1,b1,a2,b2,a3,b3 ... b7]
    // [c0,d0,c1,d1,c2,d2,c3,d3 ... d7]
    // [a8 ... b15]
    // [c8 ... d15]
    V32 c0, c1, c2, c3;
    c0 = zip8_lo(b0, b1);
    c1 = zip8_hi(b0, b1);
    c2 = zip8_lo(b2, b3);
    c3 = zip8_hi(b2, b3);
    // [a0,b0,c0,d0,[a..d]1, [a..d]2, [a..d]3]
    // [[a..d]4, [a..d]5, [a..d]6, [a..d]7]
    // [[a..d]8, [a..d]9, [a..d]10, [a..d]11]
    // [[a..d]12, [a..d]13,[a..d]14, [a..d]15]
    i_transpose4(c0, c1, c2, c3); // 32-bit transpose
    a0 = c0;
    a1 = c1;
    a2 = c2;
    a3 = c3;
}
655
656
657} // namespace insn
658} // namespace detail
659} // namespace SIMDPP_ARCH_NAMESPACE
660} // namespace simdpp
661
662#endif
663