1 | /* Copyright (C) 2012-2014 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_TRANSPOSE_H |
9 | #define LIBSIMDPP_SIMDPP_DETAIL_INSN_TRANSPOSE_H |
10 | |
11 | #include <simdpp/types.h> |
12 | #include <simdpp/detail/not_implemented.h> |
13 | #include <simdpp/detail/width.h> |
14 | #include <simdpp/core/permute_bytes16.h> |
15 | #include <simdpp/core/zip_lo.h> |
16 | #include <simdpp/core/zip_hi.h> |
17 | #include <simdpp/detail/null/transpose.h> |
18 | #include <simdpp/detail/neon/shuffle.h> |
19 | #include <simdpp/detail/vector_array_macros.h> |
20 | |
21 | namespace simdpp { |
22 | namespace SIMDPP_ARCH_NAMESPACE { |
23 | namespace detail { |
24 | namespace insn { |
25 | |
26 | |
// Forward declarations of the zip/unpack-based 4-row transpose helpers
// defined at the bottom of this file. The template parameters name vectors
// of progressively wider element types over the same register width
// (e.g. V8 = 8-bit elements, V16 = 16-bit, ...), used as intermediates.
template<class V8, class V16, class V32> SIMDPP_INL
void v_sse_transpose8x4(V8& a0, V8& a1, V8& a2, V8& a3);
template<class V16, class V32, class V64> SIMDPP_INL
void v_sse_transpose16x4(V16& a0, V16& a1, V16& a2, V16& a3);
template<class V, class D> SIMDPP_INL
void v_sse_transpose32x4(V& a0, V& a1, V& a2, V& a3);
33 | |
/** Transposes eight 2x2 8-bit matrices within two int8x16 vectors

    @code
    r0 = [ a0_0; a1_0 ; ... ; a0_14; a1_14 ]
    r1 = [ a0_1; a1_1 ; ... ; a0_15; a1_15 ]
    @endcode

    @par 128-bit version:
    @icost{SSE2-AVX2, 4}
    @icost{ALTIVEC, 2-4}

    @par 256-bit version:
    @icost{SSE2-AVX, 8}
    @icost{AVX2, 4}
    @icost{ALTIVEC, 4-6}

    The lower and higher 128-bit halves are processed as if 128-bit instruction
    was applied to each of them separately.
*/
static SIMDPP_INL
void i_transpose2(uint8x16& a0, uint8x16& a1)
{
#if SIMDPP_USE_NULL
    detail::null::transpose2(a0, a1);
#elif SIMDPP_USE_NEON
    // vtrnq_u8 performs the 2x2 element transpose in a single instruction,
    // returning both result vectors at once
    auto r = vtrnq_u8(a0.native(), a1.native());
    a0 = r.val[0];
    a1 = r.val[1];
#elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // m0 gathers the even-indexed bytes of both inputs, m1 the odd-indexed
    // ones (indices 16+ select from the second vector). The m0/m1 argument
    // only carries the result type; its value is not read.
    uint8x16 m0 = make_shuffle_bytes16_mask<0,16+0, 2,16+2, 4,16+4, 6,16+6,
                                            8,16+8, 10,16+10, 12,16+12, 14,16+14>(m0);
    uint8x16 m1 = make_shuffle_bytes16_mask<1,16+1, 3,16+3, 5,16+5, 7,16+7,
                                            9,16+9, 11,16+11, 13,16+13, 15,16+15>(m1);
    uint16x8 b0, b1;
    b0 = shuffle_bytes16(a0, a1, m0);
    b1 = shuffle_bytes16(a0, a1, m1);
    a0 = b0; a1 = b1;
#else
    // No 8-bit implementation on plain SSE2
    SIMDPP_NOT_IMPLEMENTED2(a0, a1);
#endif
}
75 | |
/** Helper function.

    @code
    r = [a0,a4,a8,a12,a1,a5,a9,a13,a2,a6,a10,a14,a3,a7,a11,a15]
    @endcode

    The 256-bit version applies the 128 bit operation to the two halves.

    Needs SSSE3
*/
static SIMDPP_INL
uint8x16 transpose_inplace(const uint8x16& a)
{
#if SIMDPP_USE_SSSE3 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // the compiler will take this out of any loops automatically
    // idx gathers every 4th byte: a 4x4 transpose of the bytes of one vector
    uint8x16 idx = make_uint(0, 4, 8, 12, 1, 5, 9, 13,
                             2, 6, 10,14, 3, 7, 11,15);
    return permute_bytes16(a, idx);
#else
    return SIMDPP_NOT_IMPLEMENTED1(a);
#endif
}
98 | |
99 | static SIMDPP_INL |
100 | uint8x32 transpose_inplace(const uint8x32& a) |
101 | { |
102 | #if SIMDPP_USE_AVX2 |
103 | uint8x32 idx = make_uint(0, 4, 8, 12, 1, 5, 9, 13, |
104 | 2, 6, 10,14, 3, 7, 11,15); |
105 | return permute_bytes16(a, idx); |
106 | #elif SIMDPP_USE_SSSE3 || SIMDPP_USE_ALTIVEC |
107 | SIMDPP_VEC_ARRAY_IMPL1(uint8x32, transpose_inplace, a); |
108 | #else |
109 | return SIMDPP_NOT_IMPLEMENTED1(a); |
110 | #endif |
111 | } |
112 | |
#if SIMDPP_USE_AVX512BW
// 512-bit overload: same per-16-byte-lane byte transpose as the 128-bit
// version, done with a single permute over the whole register.
static SIMDPP_INL
uint8<64> transpose_inplace(const uint8<64>& a)
{
    uint8<64> idx = make_uint(0, 4, 8, 12, 1, 5, 9, 13,
                              2, 6, 10,14, 3, 7, 11,15);
    return permute_bytes16(a, idx);
}
#endif
122 | |
/** Transposes four 2x2 16-bit matrices within two uint16x8 vectors:
    even-indexed element pairs are gathered into a0, odd-indexed into a1.
*/
static SIMDPP_INL
void i_transpose2(uint16x8& a0, uint16x8& a1)
{
#if SIMDPP_USE_NULL
    detail::null::transpose2(a0, a1);
#elif SIMDPP_USE_SSE2
    // Interleave the 16-bit elements, then pick even/odd 32-bit pairs
    uint32x4 b0, b1;
    b0 = zip8_lo(a0, a1);
    b1 = zip8_hi(a0, a1);
    a0 = shuffle2<0,2,0,2>(b0, b1);
    a1 = shuffle2<1,3,1,3>(b0, b1);
#elif SIMDPP_USE_NEON
    auto r = vtrnq_u16(a0.native(), a1.native());
    a0 = r.val[0];
    a1 = r.val[1];
#elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // m0 selects even-indexed elements of both inputs, m1 the odd-indexed
    // ones; the m0/m1 argument only supplies the result type
    uint16x8 m0 = make_shuffle_bytes16_mask<0,8+0, 2,8+2, 4,8+4, 6,8+6>(m0);
    uint16x8 m1 = make_shuffle_bytes16_mask<1,8+1, 3,8+3, 5,8+5, 7,8+7>(m1);
    uint16x8 b0, b1;
    b0 = shuffle_bytes16(a0, a1, m0);
    b1 = shuffle_bytes16(a0, a1, m1);
    a0 = b0; a1 = b1;
#endif
}
147 | |
#if SIMDPP_USE_AVX2
// 256-bit overload: same zip + shuffle scheme as the 128-bit SSE2 path,
// applied per 128-bit lane.
static SIMDPP_INL
void i_transpose2(uint16x16& a0, uint16x16& a1)
{
    uint32x8 b0, b1;
    b0 = zip8_lo(a0, a1);
    b1 = zip8_hi(a0, a1);
    a0 = shuffle2<0,2,0,2>(b0, b1);
    a1 = shuffle2<1,3,1,3>(b0, b1);
}
#endif
159 | |
#if SIMDPP_USE_AVX512BW
// 512-bit overload; AVX-512BW is required for native 16-bit element vectors.
SIMDPP_INL void i_transpose2(uint16<32>& a0, uint16<32>& a1)
{
    uint32<16> b0, b1;
    b0 = zip8_lo(a0, a1);
    b1 = zip8_hi(a0, a1);
    a0 = shuffle2<0,2,0,2>(b0, b1);
    a1 = shuffle2<1,3,1,3>(b0, b1);
}
#endif
170 | |
// Generic fallback for vector widths emulated as arrays of native vectors:
// applies i_transpose2 to each pair of sub-vectors.
template<unsigned N> SIMDPP_INL
void i_transpose2(uint16<N>& a0, uint16<N>& a1)
{
    SIMDPP_VEC_ARRAY_IMPL_REF2(uint16<N>, i_transpose2, a0, a1);
}
176 | |
177 | // ----------------------------------------------------------------------------- |
178 | |
/** Transposes two 2x2 32-bit matrices within two uint32x4 vectors:
    even-indexed element pairs go to a0, odd-indexed pairs to a1.
*/
static SIMDPP_INL
void i_transpose2(uint32x4& a0, uint32x4& a1)
{
#if SIMDPP_USE_NULL
    detail::null::transpose2(a0, a1);
#elif SIMDPP_USE_SSE2
    // Interleave 32-bit elements, then re-gather them as 64-bit pairs
    uint64x2 b0, b1;
    b0 = zip4_lo(a0, a1);
    b1 = zip4_hi(a0, a1);
    a0 = zip2_lo(b0, b1);
    a1 = zip2_hi(b0, b1);
#elif SIMDPP_USE_NEON
    auto r = vtrnq_u32(a0.native(), a1.native());
    a0 = r.val[0];
    a1 = r.val[1];
#elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // m0 selects even-indexed elements, m1 odd-indexed; the m0/m1 argument
    // only supplies the result type
    uint32x4 m0 = make_shuffle_bytes16_mask<0,4+0, 2,4+2>(m0);
    uint32x4 m1 = make_shuffle_bytes16_mask<1,4+1, 3,4+3>(m1);
    uint32x4 b0, b1;
    b0 = shuffle_bytes16(a0, a1, m0);
    b1 = shuffle_bytes16(a0, a1, m1);
    a0 = b0; a1 = b1;
#endif
}
203 | |
#if SIMDPP_USE_AVX2
// 256-bit overload: same zip scheme as the SSE2 path, per 128-bit lane.
static SIMDPP_INL
void i_transpose2(uint32x8& a0, uint32x8& a1)
{
    uint64x4 b0, b1;
    b0 = zip4_lo(a0, a1);
    b1 = zip4_hi(a0, a1);
    a0 = zip2_lo(b0, b1);
    a1 = zip2_hi(b0, b1);
}
#endif
215 | |
#if SIMDPP_USE_AVX512F
// 512-bit overload; AVX-512F provides native 32-bit element vectors.
static SIMDPP_INL
void i_transpose2(uint32<16>& a0, uint32<16>& a1)
{
    uint64<8> b0, b1;
    b0 = zip4_lo(a0, a1);
    b1 = zip4_hi(a0, a1);
    a0 = zip2_lo(b0, b1);
    a1 = zip2_hi(b0, b1);
}
#endif
227 | |
// Generic fallback for emulated widths: per-sub-vector i_transpose2.
template<unsigned N> SIMDPP_INL
void i_transpose2(uint32<N>& a0, uint32<N>& a1)
{
    SIMDPP_VEC_ARRAY_IMPL_REF2(uint32<N>, i_transpose2, a0, a1);
}
233 | |
234 | // ----------------------------------------------------------------------------- |
235 | |
/** Transposes one 2x2 64-bit matrix held in two uint64x2 vectors:
    a0 receives [a0_0, a1_0], a1 receives [a0_1, a1_1].
*/
static SIMDPP_INL
void i_transpose2(uint64x2& a0, uint64x2& a1)
{
#if SIMDPP_USE_SSE2 || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
    // b0 is needed because a0 is consumed before it is overwritten
    uint64x2 b0;
    b0 = zip2_lo(a0, a1);
    a1 = zip2_hi(a0, a1);
    a0 = b0;
#elif SIMDPP_USE_NEON
    neon::transpose2(a0, a1);
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    // pre-VSX ALTIVEC has no 64-bit vector support; use scalar emulation
    detail::null::transpose2(a0, a1);
#endif
}
250 | |
#if SIMDPP_USE_AVX2
// 256-bit overload: 64-bit zip per 128-bit lane.
static SIMDPP_INL
void i_transpose2(uint64x4& a0, uint64x4& a1)
{
    uint64x4 b0;
    b0 = zip2_lo(a0, a1);
    a1 = zip2_hi(a0, a1);
    a0 = b0;
}
#endif
261 | |
#if SIMDPP_USE_AVX512F
// 512-bit overload; AVX-512F provides native 64-bit element vectors.
static SIMDPP_INL
void i_transpose2(uint64<8>& a0, uint64<8>& a1)
{
    uint64<8> b0;
    b0 = zip2_lo(a0, a1);
    a1 = zip2_hi(a0, a1);
    a0 = b0;
}
#endif
272 | |
// Generic fallback for emulated widths: per-sub-vector i_transpose2.
template<unsigned N> SIMDPP_INL
void i_transpose2(uint64<N>& a0, uint64<N>& a1)
{
    SIMDPP_VEC_ARRAY_IMPL_REF2(uint64<N>, i_transpose2, a0, a1);
}
278 | |
279 | // ----------------------------------------------------------------------------- |
280 | |
/** Float variant of the 32-bit 2x2 transpose; same element movement as the
    uint32x4 overload.
*/
static SIMDPP_INL
void i_transpose2(float32x4& a0, float32x4& a1)
{
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    detail::null::transpose2(a0, a1);
#elif SIMDPP_USE_SSE2
    // Same zip scheme as the integer version; bit_cast moves between the
    // float32/float64 views without changing any bits
    float64x2 b0, b1;
    b0 = bit_cast<float64x2>(zip4_lo(a0, a1));
    b1 = bit_cast<float64x2>(zip4_hi(a0, a1));
    a0 = bit_cast<float32x4>(zip2_lo(b0, b1));
    a1 = bit_cast<float32x4>(zip2_hi(b0, b1));
#elif SIMDPP_USE_NEON
    auto r = vtrnq_f32(a0.native(), a1.native());
    a0 = r.val[0];
    a1 = r.val[1];
#elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // m0 selects even-indexed elements, m1 odd-indexed; the m0/m1 argument
    // only supplies the result type
    uint32x4 m0 = make_shuffle_bytes16_mask<0,4+0, 2,4+2>(m0);
    uint32x4 m1 = make_shuffle_bytes16_mask<1,4+1, 3,4+3>(m1);
    float32x4 b0, b1;
    b0 = shuffle_bytes16(a0, a1, m0);
    b1 = shuffle_bytes16(a0, a1, m1);
    a0 = b0; a1 = b1;
#endif
}
305 | |
#if SIMDPP_USE_AVX
// 256-bit float overload: zip scheme per 128-bit lane.
static SIMDPP_INL
void i_transpose2(float32x8& a0, float32x8& a1)
{
    float64x4 b0, b1;
    b0 = zip4_lo(a0, a1);
    b1 = zip4_hi(a0, a1);
    a0 = zip2_lo(b0, b1);
    a1 = zip2_hi(b0, b1);
}
#endif
317 | |
#if SIMDPP_USE_AVX512F
// 512-bit float overload.
static SIMDPP_INL
void i_transpose2(float32<16>& a0, float32<16>& a1)
{
    float64<8> b0, b1;
    b0 = zip4_lo(a0, a1);
    b1 = zip4_hi(a0, a1);
    a0 = zip2_lo(b0, b1);
    a1 = zip2_hi(b0, b1);
}
#endif
329 | |
// Generic fallback for emulated widths: per-sub-vector i_transpose2.
template<unsigned N> SIMDPP_INL
void i_transpose2(float32<N>& a0, float32<N>& a1)
{
    SIMDPP_VEC_ARRAY_IMPL_REF2(float32<N>, i_transpose2, a0, a1);
}
335 | |
336 | // ----------------------------------------------------------------------------- |
337 | |
/** Transposes one 2x2 64-bit float matrix held in two float64x2 vectors. */
static SIMDPP_INL
void i_transpose2(float64x2& a0, float64x2& a1)
{
#if SIMDPP_USE_SSE2 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
    float64x2 b0;
    b0 = zip2_lo(a0, a1);
    a1 = zip2_hi(a0, a1);
    a0 = b0;
#elif SIMDPP_USE_NEON64
    // Reuse the uint64x2 implementation; the round-trip through uint64x2
    // is a bitwise reinterpretation, not a value conversion
    uint64x2 b0, b1;
    b0 = a0; b1 = a1;
    i_transpose2(b0, b1);
    a0 = b0; a1 = b1;
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
    // no native f64 vectors on these targets; scalar emulation
    detail::null::transpose2(a0, a1);
#endif
}
355 | |
#if SIMDPP_USE_AVX
// 256-bit overload: 64-bit zip per 128-bit lane.
static SIMDPP_INL
void i_transpose2(float64x4& a0, float64x4& a1)
{
    float64x4 b0;
    b0 = zip2_lo(a0, a1);
    a1 = zip2_hi(a0, a1);
    a0 = b0;
}
#endif
366 | |
#if SIMDPP_USE_AVX512F
// 512-bit overload.
static SIMDPP_INL
void i_transpose2(float64<8>& a0, float64<8>& a1)
{
    float64<8> b0;
    b0 = zip2_lo(a0, a1);
    a1 = zip2_hi(a0, a1);
    a0 = b0;
}
#endif
377 | |
// Generic fallback for emulated widths: per-sub-vector i_transpose2.
template<unsigned N> SIMDPP_INL
void i_transpose2(float64<N>& a0, float64<N>& a1)
{
    SIMDPP_VEC_ARRAY_IMPL_REF2(float64<N>, i_transpose2, a0, a1);
}
383 | |
384 | // ----------------------------------------------------------------------------- |
385 | |
// Forward declarations: the uint8/uint16 4x4 transposes below may route
// through the 32-bit i_transpose4, which is defined later in this file.
static SIMDPP_INL
void i_transpose4(uint32x4& a0, uint32x4& a1,
                  uint32x4& a2, uint32x4& a3);

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_transpose4(uint32x8& a0, uint32x8& a1,
                  uint32x8& a2, uint32x8& a3);
#endif
395 | |
/** Transposes four 4x4 8-bit matrices within four uint8x16 vectors: after
    the call, a0 holds the elements originally at index 0 of each row, etc.
*/
static SIMDPP_INL
void i_transpose4(uint8x16& a0, uint8x16& a1,
                  uint8x16& a2, uint8x16& a3)
{
    // [a0,a1,a2,a3 ... ]
    // [b0,b1,b2,b3 ... ]
    // [c0,c1,c2,c3 ... ]
    // [d0,d1,d2,d3 ... ]
#if SIMDPP_USE_NULL
    detail::null::transpose4(a0, a1, a2, a3);
#elif SIMDPP_USE_SSE2
    v_sse_transpose8x4<uint8<16>, uint16<8>, uint32<4>>(a0, a1, a2, a3);
#elif SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // Two rounds of 2x2 transposes: first at 8-bit granularity within row
    // pairs, then at 16-bit granularity across the pairs
    uint16x8 b0, b1, b2, b3;
    i_transpose2(a0, a1); // 8-bit transpose
    i_transpose2(a2, a3);
    b0 = a0; b1 = a1; b2 = a2; b3 = a3;
    i_transpose2(b0, b2); // 16-bit transpose
    i_transpose2(b1, b3);
    a0 = b0; a1 = b1; a2 = b2; a3 = b3;
#endif
}
418 | |
419 | |
#if SIMDPP_USE_AVX2
// 256-bit overload: zip-based transpose applied per 128-bit lane.
static SIMDPP_INL
void i_transpose4(uint8x32& a0, uint8x32& a1,
                  uint8x32& a2, uint8x32& a3)
{
    v_sse_transpose8x4<uint8<32>, uint16<16>, uint32<8>>(a0, a1, a2, a3);
}
#endif
428 | |
#if SIMDPP_USE_AVX512BW
// 512-bit overload; AVX-512BW is required for native 8-bit element vectors.
static SIMDPP_INL
void i_transpose4(uint8<64>& a0, uint8<64>& a1,
                  uint8<64>& a2, uint8<64>& a3)
{
    v_sse_transpose8x4<uint8<64>, uint16<32>, uint32<16>>(a0, a1, a2, a3);
}
#endif
437 | |
// Generic fallback for emulated widths: per-sub-vector i_transpose4.
template<unsigned N> SIMDPP_INL
void i_transpose4(uint8<N>& a0, uint8<N>& a1, uint8<N>& a2, uint8<N>& a3)
{
    SIMDPP_VEC_ARRAY_IMPL_REF4(uint8<N>, i_transpose4, a0, a1, a2, a3);
}
443 | |
444 | // ----------------------------------------------------------------------------- |
445 | |
/** Transposes two 4x4 16-bit matrices within four uint16x8 vectors. */
static SIMDPP_INL
void i_transpose4(uint16x8& a0, uint16x8& a1,
                  uint16x8& a2, uint16x8& a3)
{
#if SIMDPP_USE_NULL
    detail::null::transpose4(a0, a1, a2, a3);
#elif SIMDPP_USE_SSE2
    v_sse_transpose16x4<uint16<8>, uint32<4>, uint64<2>>(a0, a1, a2, a3);
#elif SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // Two rounds of 2x2 transposes: 16-bit within row pairs, then 32-bit
    // across the pairs
    uint32x4 b0, b1, b2, b3;
    i_transpose2(a0, a1); // 16-bit transpose
    i_transpose2(a2, a3);
    b0 = a0; b1 = a1; b2 = a2; b3 = a3;
    i_transpose2(b0, b2); // 32-bit transpose
    i_transpose2(b1, b3);
    a0 = b0; a1 = b1; a2 = b2; a3 = b3;
#endif
}
464 | |
#if SIMDPP_USE_AVX2
// 256-bit overload: zip-based transpose applied per 128-bit lane.
static SIMDPP_INL
void i_transpose4(uint16x16& a0, uint16x16& a1,
                  uint16x16& a2, uint16x16& a3)
{
    v_sse_transpose16x4<uint16<16>, uint32<8>, uint64<4>>(a0, a1, a2, a3);
}
#endif
473 | |
#if SIMDPP_USE_AVX512BW
// 512-bit overload. Guard fixed from SIMDPP_USE_AVX2: uint16<32> is a
// native vector only under AVX-512BW (cf. the i_transpose2(uint16<32>)
// overload above). With the AVX2 guard this overload was also selected on
// plain-AVX2 builds, bypassing the generic vector-array implementation
// that handles the emulated uint16<32> type.
SIMDPP_INL void i_transpose4(uint16<32>& a0, uint16<32>& a1,
                             uint16<32>& a2, uint16<32>& a3)
{
    v_sse_transpose16x4<uint16<32>, uint32<16>, uint64<8>>(a0, a1, a2, a3);
}
#endif
481 | |
// Generic fallback for emulated widths: per-sub-vector i_transpose4.
template<unsigned N> SIMDPP_INL
void i_transpose4(uint16<N>& a0, uint16<N>& a1, uint16<N>& a2, uint16<N>& a3)
{
    SIMDPP_VEC_ARRAY_IMPL_REF4(uint16<N>, i_transpose4, a0, a1, a2, a3);
}
487 | |
488 | // ----------------------------------------------------------------------------- |
489 | |
/** Transposes a 4x4 32-bit matrix held in four uint32x4 vectors. */
static SIMDPP_INL
void i_transpose4(uint32x4& a0, uint32x4& a1,
                  uint32x4& a2, uint32x4& a3)
{
#if SIMDPP_USE_NULL
    detail::null::transpose4(a0, a1, a2, a3);
#elif SIMDPP_USE_SSE2
    v_sse_transpose32x4<uint32<4>, uint64<2>>(a0, a1, a2, a3);
#elif SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // Two rounds of 2x2 transposes: 32-bit within row pairs, then 64-bit
    // across the pairs
    uint64x2 b0, b1, b2, b3;
    i_transpose2(a0, a1); // 32-bit transpose
    i_transpose2(a2, a3);
    b0 = a0; b1 = a1; b2 = a2; b3 = a3;
    i_transpose2(b0, b2); // 64-bit transpose
    i_transpose2(b1, b3);
    a0 = b0; a1 = b1; a2 = b2; a3 = b3;
#endif
}
508 | |
#if SIMDPP_USE_AVX2
// 256-bit overload: zip-based transpose applied per 128-bit lane.
static SIMDPP_INL
void i_transpose4(uint32x8& a0, uint32x8& a1,
                  uint32x8& a2, uint32x8& a3)
{
    v_sse_transpose32x4<uint32<8>, uint64<4>>(a0, a1, a2, a3);
}
#endif
517 | |
#if SIMDPP_USE_AVX512F
// 512-bit overload. Guard fixed from SIMDPP_USE_AVX2: uint32<16> is a
// native vector only under AVX-512F, matching the guards on the other
// 512-bit uint32/uint64/float32 overloads in this file. With the AVX2
// guard this overload was selected on plain-AVX2 builds, bypassing the
// generic vector-array implementation for the emulated uint32<16> type.
static SIMDPP_INL
void i_transpose4(uint32<16>& a0, uint32<16>& a1,
                  uint32<16>& a2, uint32<16>& a3)
{
    v_sse_transpose32x4<uint32<16>, uint64<8>>(a0, a1, a2, a3);
}
#endif
526 | |
// Generic fallback for emulated widths: per-sub-vector i_transpose4.
template<unsigned N> SIMDPP_INL
void i_transpose4(uint32<N>& a0, uint32<N>& a1, uint32<N>& a2, uint32<N>& a3)
{
    SIMDPP_VEC_ARRAY_IMPL_REF4(uint32<N>, i_transpose4, a0, a1, a2, a3);
}
532 | |
533 | // ----------------------------------------------------------------------------- |
534 | |
/** Transposes a 4x4 32-bit float matrix held in four float32x4 vectors. */
static SIMDPP_INL
void i_transpose4(float32x4& a0, float32x4& a1,
                  float32x4& a2, float32x4& a3)
{
#if SIMDPP_USE_SSE2
    v_sse_transpose32x4<float32<4>, float64<2>>(a0, a1, a2, a3);
#else
    // Reuse the uint32x4 implementation; the round-trip through uint32x4
    // is a bitwise reinterpretation, not a value conversion
    uint32x4 b0, b1, b2, b3;
    b0 = a0; b1 = a1; b2 = a2; b3 = a3;
    i_transpose4(b0, b1, b2, b3);
    a0 = b0; a1 = b1; a2 = b2; a3 = b3;
#endif
}
548 | |
#if SIMDPP_USE_AVX
// 256-bit float overload: zip-based transpose applied per 128-bit lane.
static SIMDPP_INL
void i_transpose4(float32x8& a0, float32x8& a1,
                  float32x8& a2, float32x8& a3)
{
    v_sse_transpose32x4<float32<8>, float64<4>>(a0, a1, a2, a3);
}
#endif
557 | |
#if SIMDPP_USE_AVX512F
// 512-bit float overload.
static SIMDPP_INL
void i_transpose4(float32<16>& a0, float32<16>& a1,
                  float32<16>& a2, float32<16>& a3)
{
    v_sse_transpose32x4<float32<16>, float64<8>>(a0, a1, a2, a3);
}
#endif
566 | |
// Generic fallback for emulated widths: per-sub-vector i_transpose4.
template<unsigned N> SIMDPP_INL
void i_transpose4(float32<N>& a0, float32<N>& a1, float32<N>& a2, float32<N>& a3)
{
    SIMDPP_VEC_ARRAY_IMPL_REF4(float32<N>, i_transpose4, a0, a1, a2, a3);
}
572 | |
573 | // ----------------------------------------------------------------------------- |
574 | |
/** 4x4 transpose of 32-bit elements using zip (unpack) operations.
    V is the 32-bit element vector type, D the same-width 64-bit type used
    for the intermediate pair view. Each comment shows the row contents
    after the preceding step.
*/
template<class V, class D> SIMDPP_INL
void v_sse_transpose32x4(V& a0, V& a1, V& a2, V& a3)
{
    D b0, b1, b2, b3;
    // [a0,a1,a2,a3]
    // [b0,b1,b2,b3]
    // [c0,c1,c2,c3]
    // [d0,d1,d2,d3]
    b0 = zip4_lo(a0, a1);
    b1 = zip4_hi(a0, a1);
    b2 = zip4_lo(a2, a3);
    b3 = zip4_hi(a2, a3);
    // [a0,b0,a1,b1]
    // [a2,b2,a3,b3]
    // [c0,d0,c1,d1]
    // [c2,d2,c3,d3]
    a0 = zip2_lo(b0, b2);
    a1 = zip2_hi(b0, b2);
    a2 = zip2_lo(b1, b3);
    a3 = zip2_hi(b1, b3);
}
596 | |
/** Transposes four rows of 16-bit elements in 4-column groups using three
    rounds of zip operations. V16/V32/V64 are same-width vector types with
    16/32/64-bit elements. Each comment shows the rows after the step above.
*/
template<class V16, class V32, class V64> SIMDPP_INL
void v_sse_transpose16x4(V16& a0, V16& a1, V16& a2, V16& a3)
{
    V32 b0, b1, b2, b3;
    V64 c0, c1, c2, c3;
    b0 = zip8_lo(a0, a1);
    b1 = zip8_hi(a0, a1);
    b2 = zip8_lo(a2, a3);
    b3 = zip8_hi(a2, a3);
    // [a0,b0,a1,b1,a2,b2,a3,b3]
    // [a4,b4,a5,b5,a6,b6,a7,b7]
    // [c0,d0,c1,d1,c2,d2,c3,d3]
    // [c4,d4,c5,d5,c6,d6,c7,d7]
    c0 = zip4_lo(b0, b2);
    c1 = zip4_hi(b0, b2);
    c2 = zip4_lo(b1, b3);
    c3 = zip4_hi(b1, b3);
    // [a0,b0,c0,d0,a1,b1,c1,d1]
    // [a2,b2,c2,d2,a3,b3,c3,d3]
    // [a4,b4,c4,d4,a5,b5,c5,d5]
    // [a6,b6,c6,d6,a7,b7,c7,d7]
    a0 = zip2_lo(c0, c2);
    a1 = zip2_hi(c0, c2);
    a2 = zip2_lo(c1, c3);
    a3 = zip2_hi(c1, c3);
    // [a0,b0,c0,d0,a4,b4,c4,d4]
    // [a1,b1,c1,d1,a5,b5,c5,d5]
    // [a2,b2,c2,d2,a6,b6,c6,d6]
    // [a3,b3,c3,d3,a7,b7,c7,d7]
}
627 | |
/** Transposes four rows of 8-bit elements in 4-column groups: two rounds of
    byte/word zips gather [a..d] quadruples into 32-bit units, then the
    32-bit i_transpose4 puts the quadruples into their final rows.
    V8/V16/V32 are same-width vector types with 8/16/32-bit elements.
*/
template<class V8, class V16, class V32> SIMDPP_INL
void v_sse_transpose8x4(V8& a0, V8& a1, V8& a2, V8& a3)
{
    V16 b0, b1, b2, b3;
    b0 = zip16_lo(a0, a1);
    b1 = zip16_lo(a2, a3);
    b2 = zip16_hi(a0, a1);
    b3 = zip16_hi(a2, a3);
    // [a0,b0,a1,b1,a2,b2,a3,b3 ... b7]
    // [c0,d0,c1,d1,c2,d2,c3,d3 ... d7]
    // [a8 ... b15]
    // [c8 ... d15]
    V32 c0, c1, c2, c3;
    c0 = zip8_lo(b0, b1);
    c1 = zip8_hi(b0, b1);
    c2 = zip8_lo(b2, b3);
    c3 = zip8_hi(b2, b3);
    // [a0,b0,c0,d0,[a..d]1, [a..d]2, [a..d]3]
    // [[a..d]4, [a..d]5, [a..d]6, [a..d]7]
    // [[a..d]8, [a..d]9, [a..d]10, [a..d]11]
    // [[a..d]12, [a..d]13,[a..d]14, [a..d]15]
    i_transpose4(c0, c1, c2, c3); // 32-bit transpose
    a0 = c0;
    a1 = c1;
    a2 = c2;
    a3 = c3;
}
655 | |
656 | |
657 | } // namespace insn |
658 | } // namespace detail |
659 | } // namespace SIMDPP_ARCH_NAMESPACE |
660 | } // namespace simdpp |
661 | |
662 | #endif |
663 | |