transpose.h source code [bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/core/transpose.h]

1	/*  Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt>
2
3	Distributed under the Boost Software License, Version 1.0.
4	(See accompanying file LICENSE_1_0.txt or copy at
5	http://www.boost.org/LICENSE_1_0.txt)
6	*/
7
8	#ifndef LIBSIMDPP_SIMDPP_CORE_TRANSPOSE_H
9	#define LIBSIMDPP_SIMDPP_CORE_TRANSPOSE_H
10
11	#ifndef LIBSIMDPP_SIMD_H
12	#error "This file must be included through simd.h"
13	#endif
14
15	#include <simdpp/types.h>
16	#include <simdpp/core/make_shuffle_bytes_mask.h>
17	#include <simdpp/core/bit_and.h>
18	#include <simdpp/core/shuffle2.h>
19	#include <simdpp/detail/insn/transpose.h>
20	#include <simdpp/detail/neon/shuffle.h>
21	#include <simdpp/detail/null/transpose.h>
22
23	namespace simdpp {
24	namespace SIMDPP_ARCH_NAMESPACE {
25
26	/* Transposes four 2x2 16-bit matrices within two int16x8 vectors*
27
28	Mask or expression vectors are not supported.
29
30	@code
31	r0 = [ a0_0; a1_0 ; ... ; a0_6; a1_6 ]
32	r1 = [ a0_1; a1_1 ; ... ; a0_7; a0_7 ]
33	@endcode
34
35	@par 128-bit version:
36	@icost{SSE2-AVX2, 4}
37	@icost{ALTIVEC, 2-4}
38
39	@par 256-bit version:
40	The lower and higher 128-bit halves are processed as if 128-bit instruction
41	was applied to each of them separately.
42
43	@icost{SSE2-AVX, 8}
44	@icost{AVX2, 4}
45	@icost{NEON, 2}
46	@icost{ALTIVEC, 4-6}
47	*/
48	template<unsigned N, class V> SIMDPP_INL
49	void transpose2(any_int16<N,V>& a0, any_int16<N,V>& a1)
50	{
51	static_assert(!is_mask<V>::value, "Mask vectors are not supported");
52	static_assert(is_value_vector<V>::value, "Expression vectors are not supported");
53	uint16<N> qa0 = a0.wrapped();
54	uint16<N> qa1 = a1.wrapped();
55	detail::insn::i_transpose2(qa0, qa1);
56	a0.wrapped() = qa0;
57	a1.wrapped() = qa1;
58	}
59
60	/* Transposes two 2x2 32-bit matrices within two int32x4 vectors*
61
62	@code
63	r0 = [ a0_0; a1_0 ; a0_2; a1_2 ]
64	r1 = [ a0_1; a1_1 ; a1_3; a0_3 ]
65	@endcode
66
67	@par 128-bit version:
68	@icost{SSE2-AVX2, 4}
69	@icost{ALTIVEC, 2-4}
70
71	@par 256-bit version:
72	The lower and higher 128-bit halves are processed as if 128-bit instruction
73	was applied to each of them separately.
74
75	@icost{SSE2-AVX, 8}
76	@icost{AVX2, 4}
77	@icost{NEON, 2}
78	@icost{ALTIVEC, 4-6}
79	*/
80	template<unsigned N, class V> SIMDPP_INL
81	void transpose2(any_int32<N,V>& a0, any_int32<N,V>& a1)
82	{
83	static_assert(!is_mask<V>::value, "Mask vectors are not supported");
84	static_assert(is_value_vector<V>::value, "Expression vectors are not supported");
85	uint32<N> qa0 = a0.wrapped();
86	uint32<N> qa1 = a1.wrapped();
87	detail::insn::i_transpose2(qa0, qa1);
88	a0.wrapped() = qa0;
89	a1.wrapped() = qa1;
90	}
91
92	/* Transposes a 2x2 64-bit matrix within two int64x2 vectors*
93
94	@code
95	r0 = [ a0_0; a1_0 ]
96	r1 = [ a0_1; a1_1 ]
97	@endcode
98
99	@par 128-bit version:
100	@icost{SSE2-AVX2, 2}
101	@icost{ALTIVEC, 2-4}
102
103	@par 256-bit version:
104	The lower and higher 128-bit halves are processed as if 128-bit instruction
105	was applied to each of them separately.
106
107	@icost{SSE2-AVX, 4}
108	@icost{AVX2, 2}
109	@icost{NEON, 2}
110	@icost{ALTIVEC, 4-6}
111	*/
112	template<unsigned N, class V> SIMDPP_INL
113	void transpose2(any_int64<N,V>& a0, any_int64<N,V>& a1)
114	{
115	static_assert(!is_mask<V>::value, "Mask vectors are not supported");
116	static_assert(is_value_vector<V>::value, "Expression vectors are not supported");
117	uint64<N> qa0 = a0.wrapped();
118	uint64<N> qa1 = a1.wrapped();
119	detail::insn::i_transpose2(qa0, qa1);
120	a0.wrapped() = qa0;
121	a1.wrapped() = qa1;
122	}
123
124	/* Transposes two 2x2 32-bit matrices within two float32x4 vectors*
125
126	@code
127	r0 = [ a0_0; a1_0 ; a0_2; a1_2 ]
128	r1 = [ a0_1; a1_1 ; a0_3; a0_3 ]
129	@endcode
130
131	@par 128-bit version:
132	@icost{SSE2-AVX2, 4}
133	@icost{ALTIVEC, 2-4}
134
135	@par 256-bit version:
136	The lower and higher 128-bit halves are processed as if 128-bit instruction
137	was applied to each of them separately.
138
139	@icost{SSE2-SSE4.1, 8}
140	@icost{AVX-AVX2, 4}
141	@icost{ALTIVEC, 4-6}
142	@icost{NEON, 2}
143	*/
144	template<unsigned N> SIMDPP_INL
145	void transpose2(float32<N>& a0, float32<N>& a1)
146	{
147	detail::insn::i_transpose2(a0, a1);
148	}
149
150	/* Transposes a 2x2 64-bit matrix within two int64x2 vectors*
151
152	@code
153	r0 = [ a0_0; a1_0 ]
154	r1 = [ a0_1; a1_1 ]
155	@endcode
156
157	@par 128-bit version:
158	@icost{SSE2-AVX2, 2}
159	@novec{NEON, ALTIVEC}
160
161	@par 256-bit version:
162	The lower and higher 128-bit halves are processed as if 128-bit instruction
163	was applied to each of them separately.
164
165	@icost{SSE2-SSE4.1, 4}
166	@icost{AVX-AVX2, 2}
167	@novec{NEON, ALTIVEC}
168	*/
169	template<unsigned N> SIMDPP_INL
170	void transpose2(float64<N>& a0, float64<N>& a1)
171	{
172	detail::insn::i_transpose2(a0, a1);
173	}
174
175	/* Transposes four 4x4 8-bit matrix within four int8x16 vectors*
176
177	Mask or expression vectors are not supported.
178
179	@code
180	r0 = [ a0_0; a1_0; a2_0; a3_0 ; a0_4; a1_4; a2_4; a3_4 ...]
181	r1 = [ a0_1; a1_1; a2_1; a3_1 ; a0_5; a1_5; a2_5; a3_5 ...]
182	r2 = [ a0_2; a1_2; a2_2; a3_2 ; a0_6; a1_6; a2_6; a3_6 ...]
183	r3 = [ a0_3; a1_3; a2_3; a3_3 ; a0_7; a1_7; a2_7; a3_7 ...]
184	@endcode
185
186	@par 128-bit version:
187	@icost{SSE2-AVX2, 16}
188	@icost{NEON, 4}
189	@icost{ALTIVEC, 8-12}
190
191	@par 256-bit version:
192	The lower and higher 128-bit halves are processed as if 128-bit instruction
193	was applied to each of them separately.
194
195	@icost{SSE2-AVX, 32}
196	@icost{AVX2, 16}
197	@icost{NEON, 8}
198	@icost{ALTIVEC, 16-20}
199	*/
200	template<unsigned N, class V> SIMDPP_INL
201	void transpose4(any_int8<N,V>& a0, any_int8<N,V>& a1,
202	any_int8<N,V>& a2, any_int8<N,V>& a3)
203	{
204	static_assert(!is_mask<V>::value, "Mask vectors are not supported");
205	static_assert(is_value_vector<V>::value, "Expression vectors are not supported");
206	uint8<N> qa0, qa1, qa2, qa3;
207	qa0 = a0.wrapped(); qa1 = a1.wrapped(); qa2 = a2.wrapped(); qa3 = a3.wrapped();
208	detail::insn::i_transpose4(qa0, qa1, qa2, qa3);
209	a0.wrapped() = qa0; a1.wrapped() = qa1; a2.wrapped() = qa2; a3.wrapped() = qa3;
210	}
211
212	/* Transposes two 4x4 16-bit matrices within four int16x8 vectors*
213
214	Mask or expression vectors are not supported.
215
216	@code
217	r0 = [ a0_0; a1_0; a2_0; a3_0 ; a0_4; a1_4; a2_4; a3_4 ]
218	r1 = [ a0_1; a1_1; a2_1; a3_1 ; a0_5; a1_5; a2_5; a3_5 ]
219	r2 = [ a0_2; a1_2; a2_2; a3_2 ; a0_6; a1_6; a2_6; a3_6 ]
220	r3 = [ a0_3; a1_3; a2_3; a3_3 ; a0_7; a1_7; a2_7; a3_7 ]
221	@endcode
222
223	@par 128-bit version:
224	@icost{SSE2-AVX2, 12}
225	@icost{NEON, 4}
226	@icost{ALTIVEC, 8-12}
227
228	@par 256-bit version:
229	The lower and higher 128-bit halves are processed as if 128-bit instruction
230	was applied to each of them separately.
231
232	@icost{SSE2-AVX, 24}
233	@icost{AVX2, 12}
234	@icost{NEON, 8}
235	@icost{ALTIVEC, 16-20}
236	*/
237	template<unsigned N, class V> SIMDPP_INL
238	void transpose4(any_int16<N,V>& a0, any_int16<N,V>& a1,
239	any_int16<N,V>& a2, any_int16<N,V>& a3)
240	{
241	static_assert(!is_mask<V>::value, "Mask vectors are not supported");
242	static_assert(is_value_vector<V>::value, "Expression vectors are not supported");
243	uint16<N> qa0, qa1, qa2, qa3;
244	qa0 = a0.wrapped(); qa1 = a1.wrapped(); qa2 = a2.wrapped(); qa3 = a3.wrapped();
245	detail::insn::i_transpose4(qa0, qa1, qa2, qa3);
246	a0.wrapped() = qa0; a1.wrapped() = qa1; a2.wrapped() = qa2; a3.wrapped() = qa3;
247	}
248
249	/* Transposes a 4x4 32-bit matrix within four int32x4 vectors*
250
251	Mask or expression vectors are not supported.
252
253	@code
254	r0 = [ a0_0; a1_0; a2_0; a3_0 ]
255	r1 = [ a0_1; a1_1; a2_1; a3_1 ]
256	r2 = [ a0_2; a1_2; a2_2; a3_2 ]
257	r3 = [ a0_3; a1_3; a2_3; a3_3 ]
258	@endcode
259
260	@par 128-bit version:
261	@icost{SSE2-AVX2, 12}
262	@icost{NEON, 4}
263	@icost{ALTIVEC, 8-12}
264
265	@par 256-bit version:
266	@icost{SSE2-AVX, 24}
267	@icost{AVX2, 12}
268	@icost{NEON, 8}
269	@icost{ALTIVEC, 16-20}
270
271	The lower and higher 128-bit halves are processed as if 128-bit instruction
272	was applied to each of them separately.
273	*/
274	template<unsigned N, class V> SIMDPP_INL
275	void transpose4(any_int32<N,V>& a0, any_int32<N,V>& a1,
276	any_int32<N,V>& a2, any_int32<N,V>& a3)
277	{
278	static_assert(!is_mask<V>::value, "Mask vectors are not supported");
279	static_assert(is_value_vector<V>::value, "Expression vectors are not supported");
280	uint32<N> qa0, qa1, qa2, qa3;
281	qa0 = a0.wrapped(); qa1 = a1.wrapped(); qa2 = a2.wrapped(); qa3 = a3.wrapped();
282	detail::insn::i_transpose4(qa0, qa1, qa2, qa3);
283	a0.wrapped() = qa0; a1.wrapped() = qa1; a2.wrapped() = qa2; a3.wrapped() = qa3;
284	}
285
286	/* Transposes 4x4 32-bit matrix within four float32x4 vectors*
287
288	@code
289	r0 = [ a0_0; a1_0; a2_0; a3_0 ]
290	r1 = [ a0_1; a1_1; a2_1; a3_1 ]
291	r2 = [ a0_2; a1_2; a2_2; a3_2 ]
292	r3 = [ a0_3; a1_3; a2_3; a3_3 ]
293	@endcode
294
295	@par 128-bit version:
296	@icost{SSE2-AVX2, 12}
297	@icost{NEON, 4}
298	@icost{ALTIVEC, 8-12}
299
300	@par 256-bit version:
301	@icost{SSE2-SSE4.1, 24}
302	@icost{AVX-AVX2, 12}
303	@icost{NEON, 8}
304	@icost{ALTIVEC, 16-20}
305
306	The lower and higher 128-bit halves are processed as if 128-bit instruction
307	was applied to each of them separately.
308	*/
309	template<unsigned N> SIMDPP_INL
310	void transpose4(float32<N>& a0, float32<N>& a1,
311	float32<N>& a2, float32<N>& a3)
312	{
313	detail::insn::i_transpose4(a0, a1, a2, a3);
314	}
315
316	} // namespace SIMDPP_ARCH_NAMESPACE
317	} // namespace simdpp
318
319	#endif
320
321

Browse the source code of bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/core/transpose.h