1/* Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_CORE_TRANSPOSE_H
9#define LIBSIMDPP_SIMDPP_CORE_TRANSPOSE_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/types.h>
16#include <simdpp/core/make_shuffle_bytes_mask.h>
17#include <simdpp/core/bit_and.h>
18#include <simdpp/core/shuffle2.h>
19#include <simdpp/detail/insn/transpose.h>
20#include <simdpp/detail/neon/shuffle.h>
21#include <simdpp/detail/null/transpose.h>
22
23namespace simdpp {
24namespace SIMDPP_ARCH_NAMESPACE {
25
26/** Transposes four 2x2 16-bit matrices within two int16x8 vectors
27
28 Mask or expression vectors are not supported.
29
30 @code
31 r0 = [ a0_0; a1_0 ; ... ; a0_6; a1_6 ]
32 r1 = [ a0_1; a1_1 ; ... ; a0_7; a1_7 ]
33 @endcode
34
35 @par 128-bit version:
36 @icost{SSE2-AVX2, 4}
37 @icost{ALTIVEC, 2-4}
38
39 @par 256-bit version:
40 The lower and higher 128-bit halves are processed as if 128-bit instruction
41 was applied to each of them separately.
42
43 @icost{SSE2-AVX, 8}
44 @icost{AVX2, 4}
45 @icost{NEON, 2}
46 @icost{ALTIVEC, 4-6}
47*/
48template<unsigned N, class V> SIMDPP_INL
49void transpose2(any_int16<N,V>& a0, any_int16<N,V>& a1)
50{
51 static_assert(!is_mask<V>::value, "Mask vectors are not supported");
52 static_assert(is_value_vector<V>::value, "Expression vectors are not supported");
53 uint16<N> qa0 = a0.wrapped();
54 uint16<N> qa1 = a1.wrapped();
55 detail::insn::i_transpose2(qa0, qa1);
56 a0.wrapped() = qa0;
57 a1.wrapped() = qa1;
58}
59
60/** Transposes two 2x2 32-bit matrices within two int32x4 vectors
61
62 @code
63 r0 = [ a0_0; a1_0 ; a0_2; a1_2 ]
64 r1 = [ a0_1; a1_1 ; a0_3; a1_3 ]
65 @endcode
66
67 @par 128-bit version:
68 @icost{SSE2-AVX2, 4}
69 @icost{ALTIVEC, 2-4}
70
71 @par 256-bit version:
72 The lower and higher 128-bit halves are processed as if 128-bit instruction
73 was applied to each of them separately.
74
75 @icost{SSE2-AVX, 8}
76 @icost{AVX2, 4}
77 @icost{NEON, 2}
78 @icost{ALTIVEC, 4-6}
79*/
80template<unsigned N, class V> SIMDPP_INL
81void transpose2(any_int32<N,V>& a0, any_int32<N,V>& a1)
82{
83 static_assert(!is_mask<V>::value, "Mask vectors are not supported");
84 static_assert(is_value_vector<V>::value, "Expression vectors are not supported");
85 uint32<N> qa0 = a0.wrapped();
86 uint32<N> qa1 = a1.wrapped();
87 detail::insn::i_transpose2(qa0, qa1);
88 a0.wrapped() = qa0;
89 a1.wrapped() = qa1;
90}
91
92/** Transposes a 2x2 64-bit matrix within two int64x2 vectors
93
94 @code
95 r0 = [ a0_0; a1_0 ]
96 r1 = [ a0_1; a1_1 ]
97 @endcode
98
99 @par 128-bit version:
100 @icost{SSE2-AVX2, 2}
101 @icost{ALTIVEC, 2-4}
102
103 @par 256-bit version:
104 The lower and higher 128-bit halves are processed as if 128-bit instruction
105 was applied to each of them separately.
106
107 @icost{SSE2-AVX, 4}
108 @icost{AVX2, 2}
109 @icost{NEON, 2}
110 @icost{ALTIVEC, 4-6}
111*/
112template<unsigned N, class V> SIMDPP_INL
113void transpose2(any_int64<N,V>& a0, any_int64<N,V>& a1)
114{
115 static_assert(!is_mask<V>::value, "Mask vectors are not supported");
116 static_assert(is_value_vector<V>::value, "Expression vectors are not supported");
117 uint64<N> qa0 = a0.wrapped();
118 uint64<N> qa1 = a1.wrapped();
119 detail::insn::i_transpose2(qa0, qa1);
120 a0.wrapped() = qa0;
121 a1.wrapped() = qa1;
122}
123
124/** Transposes two 2x2 32-bit matrices within two float32x4 vectors
125
126 @code
127 r0 = [ a0_0; a1_0 ; a0_2; a1_2 ]
128 r1 = [ a0_1; a1_1 ; a0_3; a1_3 ]
129 @endcode
130
131 @par 128-bit version:
132 @icost{SSE2-AVX2, 4}
133 @icost{ALTIVEC, 2-4}
134
135 @par 256-bit version:
136 The lower and higher 128-bit halves are processed as if 128-bit instruction
137 was applied to each of them separately.
138
139 @icost{SSE2-SSE4.1, 8}
140 @icost{AVX-AVX2, 4}
141 @icost{ALTIVEC, 4-6}
142 @icost{NEON, 2}
143*/
144template<unsigned N> SIMDPP_INL
145void transpose2(float32<N>& a0, float32<N>& a1)
146{
147 detail::insn::i_transpose2(a0, a1);
148}
149
150/** Transposes a 2x2 64-bit matrix within two float64x2 vectors
151
152 @code
153 r0 = [ a0_0; a1_0 ]
154 r1 = [ a0_1; a1_1 ]
155 @endcode
156
157 @par 128-bit version:
158 @icost{SSE2-AVX2, 2}
159 @novec{NEON, ALTIVEC}
160
161 @par 256-bit version:
162 The lower and higher 128-bit halves are processed as if 128-bit instruction
163 was applied to each of them separately.
164
165 @icost{SSE2-SSE4.1, 4}
166 @icost{AVX-AVX2, 2}
167 @novec{NEON, ALTIVEC}
168*/
169template<unsigned N> SIMDPP_INL
170void transpose2(float64<N>& a0, float64<N>& a1)
171{
172 detail::insn::i_transpose2(a0, a1);
173}
174
175/** Transposes four 4x4 8-bit matrices within four int8x16 vectors
176
177 Mask or expression vectors are not supported.
178
179 @code
180 r0 = [ a0_0; a1_0; a2_0; a3_0 ; a0_4; a1_4; a2_4; a3_4 ...]
181 r1 = [ a0_1; a1_1; a2_1; a3_1 ; a0_5; a1_5; a2_5; a3_5 ...]
182 r2 = [ a0_2; a1_2; a2_2; a3_2 ; a0_6; a1_6; a2_6; a3_6 ...]
183 r3 = [ a0_3; a1_3; a2_3; a3_3 ; a0_7; a1_7; a2_7; a3_7 ...]
184 @endcode
185
186 @par 128-bit version:
187 @icost{SSE2-AVX2, 16}
188 @icost{NEON, 4}
189 @icost{ALTIVEC, 8-12}
190
191 @par 256-bit version:
192 The lower and higher 128-bit halves are processed as if 128-bit instruction
193 was applied to each of them separately.
194
195 @icost{SSE2-AVX, 32}
196 @icost{AVX2, 16}
197 @icost{NEON, 8}
198 @icost{ALTIVEC, 16-20}
199*/
200template<unsigned N, class V> SIMDPP_INL
201void transpose4(any_int8<N,V>& a0, any_int8<N,V>& a1,
202 any_int8<N,V>& a2, any_int8<N,V>& a3)
203{
204 static_assert(!is_mask<V>::value, "Mask vectors are not supported");
205 static_assert(is_value_vector<V>::value, "Expression vectors are not supported");
206 uint8<N> qa0, qa1, qa2, qa3;
207 qa0 = a0.wrapped(); qa1 = a1.wrapped(); qa2 = a2.wrapped(); qa3 = a3.wrapped();
208 detail::insn::i_transpose4(qa0, qa1, qa2, qa3);
209 a0.wrapped() = qa0; a1.wrapped() = qa1; a2.wrapped() = qa2; a3.wrapped() = qa3;
210}
211
212/** Transposes two 4x4 16-bit matrices within four int16x8 vectors
213
214 Mask or expression vectors are not supported.
215
216 @code
217 r0 = [ a0_0; a1_0; a2_0; a3_0 ; a0_4; a1_4; a2_4; a3_4 ]
218 r1 = [ a0_1; a1_1; a2_1; a3_1 ; a0_5; a1_5; a2_5; a3_5 ]
219 r2 = [ a0_2; a1_2; a2_2; a3_2 ; a0_6; a1_6; a2_6; a3_6 ]
220 r3 = [ a0_3; a1_3; a2_3; a3_3 ; a0_7; a1_7; a2_7; a3_7 ]
221 @endcode
222
223 @par 128-bit version:
224 @icost{SSE2-AVX2, 12}
225 @icost{NEON, 4}
226 @icost{ALTIVEC, 8-12}
227
228 @par 256-bit version:
229 The lower and higher 128-bit halves are processed as if 128-bit instruction
230 was applied to each of them separately.
231
232 @icost{SSE2-AVX, 24}
233 @icost{AVX2, 12}
234 @icost{NEON, 8}
235 @icost{ALTIVEC, 16-20}
236*/
237template<unsigned N, class V> SIMDPP_INL
238void transpose4(any_int16<N,V>& a0, any_int16<N,V>& a1,
239 any_int16<N,V>& a2, any_int16<N,V>& a3)
240{
241 static_assert(!is_mask<V>::value, "Mask vectors are not supported");
242 static_assert(is_value_vector<V>::value, "Expression vectors are not supported");
243 uint16<N> qa0, qa1, qa2, qa3;
244 qa0 = a0.wrapped(); qa1 = a1.wrapped(); qa2 = a2.wrapped(); qa3 = a3.wrapped();
245 detail::insn::i_transpose4(qa0, qa1, qa2, qa3);
246 a0.wrapped() = qa0; a1.wrapped() = qa1; a2.wrapped() = qa2; a3.wrapped() = qa3;
247}
248
249/** Transposes a 4x4 32-bit matrix within four int32x4 vectors
250
251 Mask or expression vectors are not supported.
252
253 @code
254 r0 = [ a0_0; a1_0; a2_0; a3_0 ]
255 r1 = [ a0_1; a1_1; a2_1; a3_1 ]
256 r2 = [ a0_2; a1_2; a2_2; a3_2 ]
257 r3 = [ a0_3; a1_3; a2_3; a3_3 ]
258 @endcode
259
260 @par 128-bit version:
261 @icost{SSE2-AVX2, 12}
262 @icost{NEON, 4}
263 @icost{ALTIVEC, 8-12}
264
265 @par 256-bit version:
266 @icost{SSE2-AVX, 24}
267 @icost{AVX2, 12}
268 @icost{NEON, 8}
269 @icost{ALTIVEC, 16-20}
270
271 The lower and higher 128-bit halves are processed as if 128-bit instruction
272 was applied to each of them separately.
273*/
274template<unsigned N, class V> SIMDPP_INL
275void transpose4(any_int32<N,V>& a0, any_int32<N,V>& a1,
276 any_int32<N,V>& a2, any_int32<N,V>& a3)
277{
278 static_assert(!is_mask<V>::value, "Mask vectors are not supported");
279 static_assert(is_value_vector<V>::value, "Expression vectors are not supported");
280 uint32<N> qa0, qa1, qa2, qa3;
281 qa0 = a0.wrapped(); qa1 = a1.wrapped(); qa2 = a2.wrapped(); qa3 = a3.wrapped();
282 detail::insn::i_transpose4(qa0, qa1, qa2, qa3);
283 a0.wrapped() = qa0; a1.wrapped() = qa1; a2.wrapped() = qa2; a3.wrapped() = qa3;
284}
285
286/** Transposes a 4x4 32-bit matrix within four float32x4 vectors
287
288 @code
289 r0 = [ a0_0; a1_0; a2_0; a3_0 ]
290 r1 = [ a0_1; a1_1; a2_1; a3_1 ]
291 r2 = [ a0_2; a1_2; a2_2; a3_2 ]
292 r3 = [ a0_3; a1_3; a2_3; a3_3 ]
293 @endcode
294
295 @par 128-bit version:
296 @icost{SSE2-AVX2, 12}
297 @icost{NEON, 4}
298 @icost{ALTIVEC, 8-12}
299
300 @par 256-bit version:
301 @icost{SSE2-SSE4.1, 24}
302 @icost{AVX-AVX2, 12}
303 @icost{NEON, 8}
304 @icost{ALTIVEC, 16-20}
305
306 The lower and higher 128-bit halves are processed as if 128-bit instruction
307 was applied to each of them separately.
308*/
309template<unsigned N> SIMDPP_INL
310void transpose4(float32<N>& a0, float32<N>& a1,
311 float32<N>& a2, float32<N>& a3)
312{
313 detail::insn::i_transpose4(a0, a1, a2, a3);
314}
315
316} // namespace SIMDPP_ARCH_NAMESPACE
317} // namespace simdpp
318
319#endif
320
321