1 | /* Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_SIMDPP_CORE_TRANSPOSE_H |
9 | #define LIBSIMDPP_SIMDPP_CORE_TRANSPOSE_H |
10 | |
11 | #ifndef LIBSIMDPP_SIMD_H |
12 | #error "This file must be included through simd.h" |
13 | #endif |
14 | |
15 | #include <simdpp/types.h> |
16 | #include <simdpp/core/make_shuffle_bytes_mask.h> |
17 | #include <simdpp/core/bit_and.h> |
18 | #include <simdpp/core/shuffle2.h> |
19 | #include <simdpp/detail/insn/transpose.h> |
20 | #include <simdpp/detail/neon/shuffle.h> |
21 | #include <simdpp/detail/null/transpose.h> |
22 | |
23 | namespace simdpp { |
24 | namespace SIMDPP_ARCH_NAMESPACE { |
25 | |
26 | /** Transposes four 2x2 16-bit matrices within two int16x8 vectors |
27 | |
28 | Mask or expression vectors are not supported. |
29 | |
30 | @code |
31 | r0 = [ a0_0; a1_0 ; ... ; a0_6; a1_6 ] |
32 | r1 = [ a0_1; a1_1 ; ... ; a0_7; a1_7 ] |
33 | @endcode |
34 | |
35 | @par 128-bit version: |
36 | @icost{SSE2-AVX2, 4} |
37 | @icost{ALTIVEC, 2-4} |
38 | |
39 | @par 256-bit version: |
40 | The lower and higher 128-bit halves are processed as if 128-bit instruction |
41 | was applied to each of them separately. |
42 | |
43 | @icost{SSE2-AVX, 8} |
44 | @icost{AVX2, 4} |
45 | @icost{NEON, 2} |
46 | @icost{ALTIVEC, 4-6} |
47 | */ |
48 | template<unsigned N, class V> SIMDPP_INL |
49 | void transpose2(any_int16<N,V>& a0, any_int16<N,V>& a1) |
50 | { |
51 | static_assert(!is_mask<V>::value, "Mask vectors are not supported" ); |
52 | static_assert(is_value_vector<V>::value, "Expression vectors are not supported" ); |
53 | uint16<N> qa0 = a0.wrapped(); |
54 | uint16<N> qa1 = a1.wrapped(); |
55 | detail::insn::i_transpose2(qa0, qa1); |
56 | a0.wrapped() = qa0; |
57 | a1.wrapped() = qa1; |
58 | } |
59 | |
60 | /** Transposes two 2x2 32-bit matrices within two int32x4 vectors |
61 | |
62 | @code |
63 | r0 = [ a0_0; a1_0 ; a0_2; a1_2 ] |
64 | r1 = [ a0_1; a1_1 ; a0_3; a1_3 ] |
65 | @endcode |
66 | |
67 | @par 128-bit version: |
68 | @icost{SSE2-AVX2, 4} |
69 | @icost{ALTIVEC, 2-4} |
70 | |
71 | @par 256-bit version: |
72 | The lower and higher 128-bit halves are processed as if 128-bit instruction |
73 | was applied to each of them separately. |
74 | |
75 | @icost{SSE2-AVX, 8} |
76 | @icost{AVX2, 4} |
77 | @icost{NEON, 2} |
78 | @icost{ALTIVEC, 4-6} |
79 | */ |
80 | template<unsigned N, class V> SIMDPP_INL |
81 | void transpose2(any_int32<N,V>& a0, any_int32<N,V>& a1) |
82 | { |
83 | static_assert(!is_mask<V>::value, "Mask vectors are not supported" ); |
84 | static_assert(is_value_vector<V>::value, "Expression vectors are not supported" ); |
85 | uint32<N> qa0 = a0.wrapped(); |
86 | uint32<N> qa1 = a1.wrapped(); |
87 | detail::insn::i_transpose2(qa0, qa1); |
88 | a0.wrapped() = qa0; |
89 | a1.wrapped() = qa1; |
90 | } |
91 | |
92 | /** Transposes a 2x2 64-bit matrix within two int64x2 vectors |
93 | |
94 | @code |
95 | r0 = [ a0_0; a1_0 ] |
96 | r1 = [ a0_1; a1_1 ] |
97 | @endcode |
98 | |
99 | @par 128-bit version: |
100 | @icost{SSE2-AVX2, 2} |
101 | @icost{ALTIVEC, 2-4} |
102 | |
103 | @par 256-bit version: |
104 | The lower and higher 128-bit halves are processed as if 128-bit instruction |
105 | was applied to each of them separately. |
106 | |
107 | @icost{SSE2-AVX, 4} |
108 | @icost{AVX2, 2} |
109 | @icost{NEON, 2} |
110 | @icost{ALTIVEC, 4-6} |
111 | */ |
112 | template<unsigned N, class V> SIMDPP_INL |
113 | void transpose2(any_int64<N,V>& a0, any_int64<N,V>& a1) |
114 | { |
115 | static_assert(!is_mask<V>::value, "Mask vectors are not supported" ); |
116 | static_assert(is_value_vector<V>::value, "Expression vectors are not supported" ); |
117 | uint64<N> qa0 = a0.wrapped(); |
118 | uint64<N> qa1 = a1.wrapped(); |
119 | detail::insn::i_transpose2(qa0, qa1); |
120 | a0.wrapped() = qa0; |
121 | a1.wrapped() = qa1; |
122 | } |
123 | |
124 | /** Transposes two 2x2 32-bit matrices within two float32x4 vectors |
125 | |
126 | @code |
127 | r0 = [ a0_0; a1_0 ; a0_2; a1_2 ] |
128 | r1 = [ a0_1; a1_1 ; a0_3; a1_3 ] |
129 | @endcode |
130 | |
131 | @par 128-bit version: |
132 | @icost{SSE2-AVX2, 4} |
133 | @icost{ALTIVEC, 2-4} |
134 | |
135 | @par 256-bit version: |
136 | The lower and higher 128-bit halves are processed as if 128-bit instruction |
137 | was applied to each of them separately. |
138 | |
139 | @icost{SSE2-SSE4.1, 8} |
140 | @icost{AVX-AVX2, 4} |
141 | @icost{ALTIVEC, 4-6} |
142 | @icost{NEON, 2} |
143 | */ |
144 | template<unsigned N> SIMDPP_INL |
145 | void transpose2(float32<N>& a0, float32<N>& a1) |
146 | { |
147 | detail::insn::i_transpose2(a0, a1); |
148 | } |
149 | |
150 | /** Transposes a 2x2 64-bit matrix within two float64x2 vectors |
151 | |
152 | @code |
153 | r0 = [ a0_0; a1_0 ] |
154 | r1 = [ a0_1; a1_1 ] |
155 | @endcode |
156 | |
157 | @par 128-bit version: |
158 | @icost{SSE2-AVX2, 2} |
159 | @novec{NEON, ALTIVEC} |
160 | |
161 | @par 256-bit version: |
162 | The lower and higher 128-bit halves are processed as if 128-bit instruction |
163 | was applied to each of them separately. |
164 | |
165 | @icost{SSE2-SSE4.1, 4} |
166 | @icost{AVX-AVX2, 2} |
167 | @novec{NEON, ALTIVEC} |
168 | */ |
169 | template<unsigned N> SIMDPP_INL |
170 | void transpose2(float64<N>& a0, float64<N>& a1) |
171 | { |
172 | detail::insn::i_transpose2(a0, a1); |
173 | } |
174 | |
175 | /** Transposes four 4x4 8-bit matrices within four int8x16 vectors |
176 | |
177 | Mask or expression vectors are not supported. |
178 | |
179 | @code |
180 | r0 = [ a0_0; a1_0; a2_0; a3_0 ; a0_4; a1_4; a2_4; a3_4 ...] |
181 | r1 = [ a0_1; a1_1; a2_1; a3_1 ; a0_5; a1_5; a2_5; a3_5 ...] |
182 | r2 = [ a0_2; a1_2; a2_2; a3_2 ; a0_6; a1_6; a2_6; a3_6 ...] |
183 | r3 = [ a0_3; a1_3; a2_3; a3_3 ; a0_7; a1_7; a2_7; a3_7 ...] |
184 | @endcode |
185 | |
186 | @par 128-bit version: |
187 | @icost{SSE2-AVX2, 16} |
188 | @icost{NEON, 4} |
189 | @icost{ALTIVEC, 8-12} |
190 | |
191 | @par 256-bit version: |
192 | The lower and higher 128-bit halves are processed as if 128-bit instruction |
193 | was applied to each of them separately. |
194 | |
195 | @icost{SSE2-AVX, 32} |
196 | @icost{AVX2, 16} |
197 | @icost{NEON, 8} |
198 | @icost{ALTIVEC, 16-20} |
199 | */ |
200 | template<unsigned N, class V> SIMDPP_INL |
201 | void transpose4(any_int8<N,V>& a0, any_int8<N,V>& a1, |
202 | any_int8<N,V>& a2, any_int8<N,V>& a3) |
203 | { |
204 | static_assert(!is_mask<V>::value, "Mask vectors are not supported" ); |
205 | static_assert(is_value_vector<V>::value, "Expression vectors are not supported" ); |
206 | uint8<N> qa0, qa1, qa2, qa3; |
207 | qa0 = a0.wrapped(); qa1 = a1.wrapped(); qa2 = a2.wrapped(); qa3 = a3.wrapped(); |
208 | detail::insn::i_transpose4(qa0, qa1, qa2, qa3); |
209 | a0.wrapped() = qa0; a1.wrapped() = qa1; a2.wrapped() = qa2; a3.wrapped() = qa3; |
210 | } |
211 | |
212 | /** Transposes two 4x4 16-bit matrices within four int16x8 vectors |
213 | |
214 | Mask or expression vectors are not supported. |
215 | |
216 | @code |
217 | r0 = [ a0_0; a1_0; a2_0; a3_0 ; a0_4; a1_4; a2_4; a3_4 ] |
218 | r1 = [ a0_1; a1_1; a2_1; a3_1 ; a0_5; a1_5; a2_5; a3_5 ] |
219 | r2 = [ a0_2; a1_2; a2_2; a3_2 ; a0_6; a1_6; a2_6; a3_6 ] |
220 | r3 = [ a0_3; a1_3; a2_3; a3_3 ; a0_7; a1_7; a2_7; a3_7 ] |
221 | @endcode |
222 | |
223 | @par 128-bit version: |
224 | @icost{SSE2-AVX2, 12} |
225 | @icost{NEON, 4} |
226 | @icost{ALTIVEC, 8-12} |
227 | |
228 | @par 256-bit version: |
229 | The lower and higher 128-bit halves are processed as if 128-bit instruction |
230 | was applied to each of them separately. |
231 | |
232 | @icost{SSE2-AVX, 24} |
233 | @icost{AVX2, 12} |
234 | @icost{NEON, 8} |
235 | @icost{ALTIVEC, 16-20} |
236 | */ |
237 | template<unsigned N, class V> SIMDPP_INL |
238 | void transpose4(any_int16<N,V>& a0, any_int16<N,V>& a1, |
239 | any_int16<N,V>& a2, any_int16<N,V>& a3) |
240 | { |
241 | static_assert(!is_mask<V>::value, "Mask vectors are not supported" ); |
242 | static_assert(is_value_vector<V>::value, "Expression vectors are not supported" ); |
243 | uint16<N> qa0, qa1, qa2, qa3; |
244 | qa0 = a0.wrapped(); qa1 = a1.wrapped(); qa2 = a2.wrapped(); qa3 = a3.wrapped(); |
245 | detail::insn::i_transpose4(qa0, qa1, qa2, qa3); |
246 | a0.wrapped() = qa0; a1.wrapped() = qa1; a2.wrapped() = qa2; a3.wrapped() = qa3; |
247 | } |
248 | |
249 | /** Transposes a 4x4 32-bit matrix within four int32x4 vectors |
250 | |
251 | Mask or expression vectors are not supported. |
252 | |
253 | @code |
254 | r0 = [ a0_0; a1_0; a2_0; a3_0 ] |
255 | r1 = [ a0_1; a1_1; a2_1; a3_1 ] |
256 | r2 = [ a0_2; a1_2; a2_2; a3_2 ] |
257 | r3 = [ a0_3; a1_3; a2_3; a3_3 ] |
258 | @endcode |
259 | |
260 | @par 128-bit version: |
261 | @icost{SSE2-AVX2, 12} |
262 | @icost{NEON, 4} |
263 | @icost{ALTIVEC, 8-12} |
264 | |
265 | @par 256-bit version: |
266 | @icost{SSE2-AVX, 24} |
267 | @icost{AVX2, 12} |
268 | @icost{NEON, 8} |
269 | @icost{ALTIVEC, 16-20} |
270 | |
271 | The lower and higher 128-bit halves are processed as if 128-bit instruction |
272 | was applied to each of them separately. |
273 | */ |
274 | template<unsigned N, class V> SIMDPP_INL |
275 | void transpose4(any_int32<N,V>& a0, any_int32<N,V>& a1, |
276 | any_int32<N,V>& a2, any_int32<N,V>& a3) |
277 | { |
278 | static_assert(!is_mask<V>::value, "Mask vectors are not supported" ); |
279 | static_assert(is_value_vector<V>::value, "Expression vectors are not supported" ); |
280 | uint32<N> qa0, qa1, qa2, qa3; |
281 | qa0 = a0.wrapped(); qa1 = a1.wrapped(); qa2 = a2.wrapped(); qa3 = a3.wrapped(); |
282 | detail::insn::i_transpose4(qa0, qa1, qa2, qa3); |
283 | a0.wrapped() = qa0; a1.wrapped() = qa1; a2.wrapped() = qa2; a3.wrapped() = qa3; |
284 | } |
285 | |
286 | /** Transposes 4x4 32-bit matrix within four float32x4 vectors |
287 | |
288 | @code |
289 | r0 = [ a0_0; a1_0; a2_0; a3_0 ] |
290 | r1 = [ a0_1; a1_1; a2_1; a3_1 ] |
291 | r2 = [ a0_2; a1_2; a2_2; a3_2 ] |
292 | r3 = [ a0_3; a1_3; a2_3; a3_3 ] |
293 | @endcode |
294 | |
295 | @par 128-bit version: |
296 | @icost{SSE2-AVX2, 12} |
297 | @icost{NEON, 4} |
298 | @icost{ALTIVEC, 8-12} |
299 | |
300 | @par 256-bit version: |
301 | @icost{SSE2-SSE4.1, 24} |
302 | @icost{AVX-AVX2, 12} |
303 | @icost{NEON, 8} |
304 | @icost{ALTIVEC, 16-20} |
305 | |
306 | The lower and higher 128-bit halves are processed as if 128-bit instruction |
307 | was applied to each of them separately. |
308 | */ |
309 | template<unsigned N> SIMDPP_INL |
310 | void transpose4(float32<N>& a0, float32<N>& a1, |
311 | float32<N>& a2, float32<N>& a3) |
312 | { |
313 | detail::insn::i_transpose4(a0, a1, a2, a3); |
314 | } |
315 | |
316 | } // namespace SIMDPP_ARCH_NAMESPACE |
317 | } // namespace simdpp |
318 | |
319 | #endif |
320 | |
321 | |