1/* Copyright (C) 2012-2014 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_TRANSPOSE_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_TRANSPOSE_H
10
11#include <simdpp/types.h>
12#include <simdpp/detail/not_implemented.h>
13#include <simdpp/detail/width.h>
14#include <simdpp/core/permute_bytes16.h>
15#include <simdpp/core/zip_lo.h>
16#include <simdpp/core/zip_hi.h>
17#include <simdpp/detail/null/transpose.h>
18#include <simdpp/detail/neon/shuffle.h>
19#include <simdpp/detail/vector_array_macros.h>
20
21namespace simdpp {
22namespace SIMDPP_ARCH_NAMESPACE {
23namespace detail {
24namespace insn {
25
26
// Forward declarations of the zip-based 4-row transpose helpers implemented at
// the bottom of this file. The template parameters name the successively wider
// element types used for the intermediate zip results (e.g. V8 = 8-bit row
// type, V16/V32 = 16/32-bit intermediate types).
template<class V8, class V16, class V32> SIMDPP_INL
void v_sse_transpose8x4(V8& a0, V8& a1, V8& a2, V8& a3);
template<class V16, class V32, class V64> SIMDPP_INL
void v_sse_transpose16x4(V16& a0, V16& a1, V16& a2, V16& a3);
template<class V, class D> SIMDPP_INL
void v_sse_transpose32x4(V& a0, V& a1, V& a2, V& a3);
33
/** Transposes eight 2x2 8-bit matrices within two int8x16 vectors

    @code
    r0 = [ a0_0; a1_0 ; ... ; a0_14; a1_14 ]
    r1 = [ a0_1; a1_1 ; ... ; a0_15; a1_15 ]
    @endcode

    @par 128-bit version:
    @icost{SSE2-AVX2, 4}
    @icost{ALTIVEC, 2-4}

    @par 256-bit version:
    @icost{SSE2-AVX, 8}
    @icost{AVX2, 4}
    @icost{ALTIVEC, 4-6}

    The lower and higher 128-bit halves are processed as if 128-bit instruction
    was applied to each of them separately.
*/
static SIMDPP_INL
void i_transpose2(uint8x16& a0, uint8x16& a1)
{
#if SIMDPP_USE_NULL
    detail::null::transpose2(a0, a1);
#elif SIMDPP_USE_NEON
    // vtrnq_u8 returns the even-indexed element pairs in val[0] and the
    // odd-indexed pairs in val[1]
    auto r = vtrnq_u8(a0.native(), a1.native());
    a0 = r.val[0];
    a1 = r.val[1];
#elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // The mask argument only supplies the vector type for deduction; its
    // value is not read before being assigned.
    uint8x16 m0 = make_shuffle_bytes16_mask<0,16+0, 2,16+2, 4,16+4, 6,16+6,
                                            8,16+8, 10,16+10, 12,16+12, 14,16+14>(m0);
    uint8x16 m1 = make_shuffle_bytes16_mask<1,16+1, 3,16+3, 5,16+5, 7,16+7,
                                            9,16+9, 11,16+11, 13,16+13, 15,16+15>(m1);
    // NOTE(review): b0/b1 are declared uint16x8 although they hold 8-bit
    // shuffle results; relies on same-width implicit conversion — confirm
    // intentional.
    uint16x8 b0, b1;
    b0 = shuffle_bytes16(a0, a1, m0);
    b1 = shuffle_bytes16(a0, a1, m1);
    a0 = b0; a1 = b1;
#else
    SIMDPP_NOT_IMPLEMENTED2(a0, a1);
#endif
}
75
/** Helper function.

    @code
    r = [a0,a4,a8,a12,a1,a5,a9,a13,a2,a6,a10,a14,a3,a7,a11,a15]
    @endcode

    The 256-bit version applies the 128 bit operation to the two halves.

    Needs SSSE3
*/
static SIMDPP_INL
uint8x16 transpose_inplace(const uint8x16& a)
{
#if SIMDPP_USE_SSSE3 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // the compiler will take this out of any loops automatically
    // index pattern gathers bytes with stride 4, i.e. transposes the vector
    // viewed as a 4x4 byte matrix
    uint8x16 idx = make_uint(0, 4, 8, 12, 1, 5, 9, 13,
                             2, 6, 10,14, 3, 7, 11,15);
    return permute_bytes16(a, idx);
#else
    return SIMDPP_NOT_IMPLEMENTED1(a);
#endif
}
98
// 256-bit variant: transposes the 4x4 byte matrix within each 128-bit lane.
static SIMDPP_INL
uint8x32 transpose_inplace(const uint8x32& a)
{
#if SIMDPP_USE_AVX2
    // permute_bytes16 operates within 128-bit lanes, so a single 16-byte
    // index pattern covers both halves
    uint8x32 idx = make_uint(0, 4, 8, 12, 1, 5, 9, 13,
                             2, 6, 10,14, 3, 7, 11,15);
    return permute_bytes16(a, idx);
#elif SIMDPP_USE_SSSE3 || SIMDPP_USE_ALTIVEC
    SIMDPP_VEC_ARRAY_IMPL1(uint8x32, transpose_inplace, a);
#else
    return SIMDPP_NOT_IMPLEMENTED1(a);
#endif
}
112
#if SIMDPP_USE_AVX512BW
// 512-bit variant: same per-128-bit-lane 4x4 byte transpose as above.
static SIMDPP_INL
uint8<64> transpose_inplace(const uint8<64>& a)
{
    uint8<64> idx = make_uint(0, 4, 8, 12, 1, 5, 9, 13,
                              2, 6, 10,14, 3, 7, 11,15);
    return permute_bytes16(a, idx);
}
#endif
122
// Transposes four 2x2 16-bit matrices within two uint16x8 vectors:
// a0 receives the even-indexed element pairs, a1 the odd-indexed pairs.
static SIMDPP_INL
void i_transpose2(uint16x8& a0, uint16x8& a1)
{
#if SIMDPP_USE_NULL
    detail::null::transpose2(a0, a1);
#elif SIMDPP_USE_SSE2
    // interleave the inputs, then gather even/odd 32-bit pairs back
    uint32x4 b0, b1;
    b0 = zip8_lo(a0, a1);
    b1 = zip8_hi(a0, a1);
    a0 = shuffle2<0,2,0,2>(b0, b1);
    a1 = shuffle2<1,3,1,3>(b0, b1);
#elif SIMDPP_USE_NEON
    auto r = vtrnq_u16(a0.native(), a1.native());
    a0 = r.val[0];
    a1 = r.val[1];
#elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // mask argument only supplies the type; its value is not read
    uint16x8 m0 = make_shuffle_bytes16_mask<0,8+0, 2,8+2, 4,8+4, 6,8+6>(m0);
    uint16x8 m1 = make_shuffle_bytes16_mask<1,8+1, 3,8+3, 5,8+5, 7,8+7>(m1);
    uint16x8 b0, b1;
    b0 = shuffle_bytes16(a0, a1, m0);
    b1 = shuffle_bytes16(a0, a1, m1);
    a0 = b0; a1 = b1;
#endif
}
147
#if SIMDPP_USE_AVX2
// 2x2 16-bit transpose of two 256-bit vectors; each 128-bit lane is handled
// independently, mirroring the SSE2 path of the 128-bit overload.
static SIMDPP_INL
void i_transpose2(uint16x16& a0, uint16x16& a1)
{
    uint32x8 zl = zip8_lo(a0, a1);
    uint32x8 zh = zip8_hi(a0, a1);
    a0 = shuffle2<0,2,0,2>(zl, zh);
    a1 = shuffle2<1,3,1,3>(zl, zh);
}
#endif
159
#if SIMDPP_USE_AVX512BW
// 2x2 16-bit transpose of two 512-bit vectors, per 128-bit lane.
SIMDPP_INL void i_transpose2(uint16<32>& a0, uint16<32>& a1)
{
    uint32<16> zl = zip8_lo(a0, a1);
    uint32<16> zh = zip8_hi(a0, a1);
    a0 = shuffle2<0,2,0,2>(zl, zh);
    a1 = shuffle2<1,3,1,3>(zl, zh);
}
#endif
170
// Generic fallback for vector lengths wider than the native width:
// applies the transpose to each corresponding pair of sub-vectors.
template<unsigned N> SIMDPP_INL
void i_transpose2(uint16<N>& a0, uint16<N>& a1)
{
    SIMDPP_VEC_ARRAY_IMPL_REF2(uint16<N>, i_transpose2, a0, a1);
}
176
177// -----------------------------------------------------------------------------
178
// Transposes two 2x2 32-bit matrices within two uint32x4 vectors:
// a0 receives the even-indexed element pairs, a1 the odd-indexed pairs.
static SIMDPP_INL
void i_transpose2(uint32x4& a0, uint32x4& a1)
{
#if SIMDPP_USE_NULL
    detail::null::transpose2(a0, a1);
#elif SIMDPP_USE_SSE2
    // interleave to 64-bit pairs, then re-interleave the pairs
    uint64x2 b0, b1;
    b0 = zip4_lo(a0, a1);
    b1 = zip4_hi(a0, a1);
    a0 = zip2_lo(b0, b1);
    a1 = zip2_hi(b0, b1);
#elif SIMDPP_USE_NEON
    auto r = vtrnq_u32(a0.native(), a1.native());
    a0 = r.val[0];
    a1 = r.val[1];
#elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // mask argument only supplies the type; its value is not read
    uint32x4 m0 = make_shuffle_bytes16_mask<0,4+0, 2,4+2>(m0);
    uint32x4 m1 = make_shuffle_bytes16_mask<1,4+1, 3,4+3>(m1);
    uint32x4 b0, b1;
    b0 = shuffle_bytes16(a0, a1, m0);
    b1 = shuffle_bytes16(a0, a1, m1);
    a0 = b0; a1 = b1;
#endif
}
203
#if SIMDPP_USE_AVX2
// 2x2 32-bit transpose of two 256-bit vectors, per 128-bit lane.
static SIMDPP_INL
void i_transpose2(uint32x8& a0, uint32x8& a1)
{
    uint64x4 zl = zip4_lo(a0, a1);
    uint64x4 zh = zip4_hi(a0, a1);
    a0 = zip2_lo(zl, zh);
    a1 = zip2_hi(zl, zh);
}
#endif
215
#if SIMDPP_USE_AVX512F
// 2x2 32-bit transpose of two 512-bit vectors, per 128-bit lane.
static SIMDPP_INL
void i_transpose2(uint32<16>& a0, uint32<16>& a1)
{
    uint64<8> zl = zip4_lo(a0, a1);
    uint64<8> zh = zip4_hi(a0, a1);
    a0 = zip2_lo(zl, zh);
    a1 = zip2_hi(zl, zh);
}
#endif
227
// Generic fallback for wider-than-native uint32 vectors: per-sub-vector pairs.
template<unsigned N> SIMDPP_INL
void i_transpose2(uint32<N>& a0, uint32<N>& a1)
{
    SIMDPP_VEC_ARRAY_IMPL_REF2(uint32<N>, i_transpose2, a0, a1);
}
233
234// -----------------------------------------------------------------------------
235
// Transposes one 2x2 64-bit matrix held in two uint64x2 vectors.
static SIMDPP_INL
void i_transpose2(uint64x2& a0, uint64x2& a1)
{
#if SIMDPP_USE_SSE2 || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
    uint64x2 b0;
    b0 = zip2_lo(a0, a1);
    a1 = zip2_hi(a0, a1);
    a0 = b0;
#elif SIMDPP_USE_NEON
    neon::transpose2(a0, a1);
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    // element-wise fallback on targets without the needed 64-bit shuffles
    detail::null::transpose2(a0, a1);
#endif
}
250
#if SIMDPP_USE_AVX2
// 2x2 64-bit transpose within each 128-bit lane of two 256-bit vectors.
static SIMDPP_INL
void i_transpose2(uint64x4& a0, uint64x4& a1)
{
    uint64x4 lo = zip2_lo(a0, a1);
    a1 = zip2_hi(a0, a1);
    a0 = lo;
}
#endif
261
#if SIMDPP_USE_AVX512F
// 2x2 64-bit transpose within each 128-bit lane of two 512-bit vectors.
static SIMDPP_INL
void i_transpose2(uint64<8>& a0, uint64<8>& a1)
{
    uint64<8> lo = zip2_lo(a0, a1);
    a1 = zip2_hi(a0, a1);
    a0 = lo;
}
#endif
272
// Generic fallback for wider-than-native uint64 vectors: per-sub-vector pairs.
template<unsigned N> SIMDPP_INL
void i_transpose2(uint64<N>& a0, uint64<N>& a1)
{
    SIMDPP_VEC_ARRAY_IMPL_REF2(uint64<N>, i_transpose2, a0, a1);
}
278
279// -----------------------------------------------------------------------------
280
// Transposes two 2x2 float32 matrices within two float32x4 vectors.
static SIMDPP_INL
void i_transpose2(float32x4& a0, float32x4& a1)
{
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    detail::null::transpose2(a0, a1);
#elif SIMDPP_USE_SSE2
    // same zip sequence as the uint32 overload; bit_cast moves between the
    // float32/float64 views without changing the bit pattern
    float64x2 b0, b1;
    b0 = bit_cast<float64x2>(zip4_lo(a0, a1));
    b1 = bit_cast<float64x2>(zip4_hi(a0, a1));
    a0 = bit_cast<float32x4>(zip2_lo(b0, b1));
    a1 = bit_cast<float32x4>(zip2_hi(b0, b1));
#elif SIMDPP_USE_NEON
    auto r = vtrnq_f32(a0.native(), a1.native());
    a0 = r.val[0];
    a1 = r.val[1];
#elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // mask argument only supplies the type; its value is not read
    uint32x4 m0 = make_shuffle_bytes16_mask<0,4+0, 2,4+2>(m0);
    uint32x4 m1 = make_shuffle_bytes16_mask<1,4+1, 3,4+3>(m1);
    float32x4 b0, b1;
    b0 = shuffle_bytes16(a0, a1, m0);
    b1 = shuffle_bytes16(a0, a1, m1);
    a0 = b0; a1 = b1;
#endif
}
305
#if SIMDPP_USE_AVX
// 2x2 float32 transpose of two 256-bit vectors, per 128-bit lane.
static SIMDPP_INL
void i_transpose2(float32x8& a0, float32x8& a1)
{
    float64x4 zl = zip4_lo(a0, a1);
    float64x4 zh = zip4_hi(a0, a1);
    a0 = zip2_lo(zl, zh);
    a1 = zip2_hi(zl, zh);
}
#endif
317
#if SIMDPP_USE_AVX512F
// 2x2 float32 transpose of two 512-bit vectors, per 128-bit lane.
static SIMDPP_INL
void i_transpose2(float32<16>& a0, float32<16>& a1)
{
    float64<8> zl = zip4_lo(a0, a1);
    float64<8> zh = zip4_hi(a0, a1);
    a0 = zip2_lo(zl, zh);
    a1 = zip2_hi(zl, zh);
}
#endif
329
// Generic fallback for wider-than-native float32 vectors: per-sub-vector pairs.
template<unsigned N> SIMDPP_INL
void i_transpose2(float32<N>& a0, float32<N>& a1)
{
    SIMDPP_VEC_ARRAY_IMPL_REF2(float32<N>, i_transpose2, a0, a1);
}
335
336// -----------------------------------------------------------------------------
337
// Transposes one 2x2 float64 matrix held in two float64x2 vectors.
static SIMDPP_INL
void i_transpose2(float64x2& a0, float64x2& a1)
{
#if SIMDPP_USE_SSE2 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
    float64x2 b0;
    b0 = zip2_lo(a0, a1);
    a1 = zip2_hi(a0, a1);
    a0 = b0;
#elif SIMDPP_USE_NEON64
    // reuse the 64-bit integer transpose; the bit pattern is preserved
    uint64x2 b0, b1;
    b0 = a0; b1 = a1;
    i_transpose2(b0, b1);
    a0 = b0; a1 = b1;
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
    detail::null::transpose2(a0, a1);
#endif
}
355
#if SIMDPP_USE_AVX
// 2x2 float64 transpose within each 128-bit lane of two 256-bit vectors.
static SIMDPP_INL
void i_transpose2(float64x4& a0, float64x4& a1)
{
    float64x4 lo = zip2_lo(a0, a1);
    a1 = zip2_hi(a0, a1);
    a0 = lo;
}
#endif
366
#if SIMDPP_USE_AVX512F
// 2x2 float64 transpose within each 128-bit lane of two 512-bit vectors.
static SIMDPP_INL
void i_transpose2(float64<8>& a0, float64<8>& a1)
{
    float64<8> lo = zip2_lo(a0, a1);
    a1 = zip2_hi(a0, a1);
    a0 = lo;
}
#endif
377
// Generic fallback for wider-than-native float64 vectors: per-sub-vector pairs.
template<unsigned N> SIMDPP_INL
void i_transpose2(float64<N>& a0, float64<N>& a1)
{
    SIMDPP_VEC_ARRAY_IMPL_REF2(float64<N>, i_transpose2, a0, a1);
}
383
384// -----------------------------------------------------------------------------
385
// Forward declarations: the 8-bit 4-row transpose below delegates to these
// 32-bit overloads for its final permutation step.
static SIMDPP_INL
void i_transpose4(uint32x4& a0, uint32x4& a1,
                  uint32x4& a2, uint32x4& a3);

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_transpose4(uint32x8& a0, uint32x8& a1,
                  uint32x8& a2, uint32x8& a3);
#endif
395
// Transposes four 4x4 8-bit matrices held in the rows a0..a3 (one matrix per
// 4-byte column group).
static SIMDPP_INL
void i_transpose4(uint8x16& a0, uint8x16& a1,
                  uint8x16& a2, uint8x16& a3)
{
    // [a0,a1,a2,a3 ... ]
    // [b0,b1,b2,b3 ... ]
    // [c0,c1,c2,c3 ... ]
    // [d0,d1,d2,d3 ... ]
#if SIMDPP_USE_NULL
    detail::null::transpose4(a0, a1, a2, a3);
#elif SIMDPP_USE_SSE2
    v_sse_transpose8x4<uint8<16>, uint16<8>, uint32<4>>(a0, a1, a2, a3);
#elif SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // two rounds of pairwise transposes: first at 8-bit, then at 16-bit
    // granularity, which together realize the 4x4 transpose
    uint16x8 b0, b1, b2, b3;
    i_transpose2(a0, a1); // 8-bit transpose
    i_transpose2(a2, a3);
    b0 = a0; b1 = a1; b2 = a2; b3 = a3;
    i_transpose2(b0, b2); // 16-bit transpose
    i_transpose2(b1, b3);
    a0 = b0; a1 = b1; a2 = b2; a3 = b3;
#endif
}
418
419
#if SIMDPP_USE_AVX2
// 256-bit 8-bit 4-row transpose; delegates to the generic zip-based helper.
static SIMDPP_INL
void i_transpose4(uint8x32& a0, uint8x32& a1,
                  uint8x32& a2, uint8x32& a3)
{
    v_sse_transpose8x4<uint8<32>, uint16<16>, uint32<8>>(a0, a1, a2, a3);
}
#endif
428
#if SIMDPP_USE_AVX512BW
// 512-bit 8-bit 4-row transpose; delegates to the generic zip-based helper.
static SIMDPP_INL
void i_transpose4(uint8<64>& a0, uint8<64>& a1,
                  uint8<64>& a2, uint8<64>& a3)
{
    v_sse_transpose8x4<uint8<64>, uint16<32>, uint32<16>>(a0, a1, a2, a3);
}
#endif
437
// Generic fallback for wider-than-native uint8 vectors: per-sub-vector groups.
template<unsigned N> SIMDPP_INL
void i_transpose4(uint8<N>& a0, uint8<N>& a1, uint8<N>& a2, uint8<N>& a3)
{
    SIMDPP_VEC_ARRAY_IMPL_REF4(uint8<N>, i_transpose4, a0, a1, a2, a3);
}
443
444// -----------------------------------------------------------------------------
445
// Transposes two 4x4 16-bit matrices held in the rows a0..a3.
static SIMDPP_INL
void i_transpose4(uint16x8& a0, uint16x8& a1,
                  uint16x8& a2, uint16x8& a3)
{
#if SIMDPP_USE_NULL
    detail::null::transpose4(a0, a1, a2, a3);
#elif SIMDPP_USE_SSE2
    v_sse_transpose16x4<uint16<8>, uint32<4>, uint64<2>>(a0, a1, a2, a3);
#elif SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // two rounds of pairwise transposes: 16-bit, then 32-bit granularity
    uint32x4 b0, b1, b2, b3;
    i_transpose2(a0, a1); // 16-bit transpose
    i_transpose2(a2, a3);
    b0 = a0; b1 = a1; b2 = a2; b3 = a3;
    i_transpose2(b0, b2); // 32-bit transpose
    i_transpose2(b1, b3);
    a0 = b0; a1 = b1; a2 = b2; a3 = b3;
#endif
}
464
#if SIMDPP_USE_AVX2
// 256-bit 16-bit 4-row transpose; delegates to the generic zip-based helper.
static SIMDPP_INL
void i_transpose4(uint16x16& a0, uint16x16& a1,
                  uint16x16& a2, uint16x16& a3)
{
    v_sse_transpose16x4<uint16<16>, uint32<8>, uint64<4>>(a0, a1, a2, a3);
}
#endif
473
#if SIMDPP_USE_AVX512BW
// 512-bit 16-bit 4-row transpose; delegates to the generic zip-based helper.
// Guarded by AVX512BW (not AVX2): uint16<32> is only a native vector with
// AVX-512BW, matching the guard of i_transpose2(uint16<32>) above and of the
// uint8<64> overload. Without AVX512BW the SIMDPP_VEC_ARRAY fallback below
// handles this width.
SIMDPP_INL void i_transpose4(uint16<32>& a0, uint16<32>& a1,
                             uint16<32>& a2, uint16<32>& a3)
{
    v_sse_transpose16x4<uint16<32>, uint32<16>, uint64<8>>(a0, a1, a2, a3);
}
#endif
481
// Generic fallback for wider-than-native uint16 vectors: per-sub-vector groups.
template<unsigned N> SIMDPP_INL
void i_transpose4(uint16<N>& a0, uint16<N>& a1, uint16<N>& a2, uint16<N>& a3)
{
    SIMDPP_VEC_ARRAY_IMPL_REF4(uint16<N>, i_transpose4, a0, a1, a2, a3);
}
487
488// -----------------------------------------------------------------------------
489
// Transposes the 4x4 32-bit matrix whose rows are a0..a3.
static SIMDPP_INL
void i_transpose4(uint32x4& a0, uint32x4& a1,
                  uint32x4& a2, uint32x4& a3)
{
#if SIMDPP_USE_NULL
    detail::null::transpose4(a0, a1, a2, a3);
#elif SIMDPP_USE_SSE2
    v_sse_transpose32x4<uint32<4>, uint64<2>>(a0, a1, a2, a3);
#elif SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // two rounds of pairwise transposes: 32-bit, then 64-bit granularity
    uint64x2 b0, b1, b2, b3;
    i_transpose2(a0, a1); // 32-bit transpose
    i_transpose2(a2, a3);
    b0 = a0; b1 = a1; b2 = a2; b3 = a3;
    i_transpose2(b0, b2); // 64-bit transpose
    i_transpose2(b1, b3);
    a0 = b0; a1 = b1; a2 = b2; a3 = b3;
#endif
}
508
#if SIMDPP_USE_AVX2
// 256-bit 32-bit 4-row transpose; delegates to the generic zip-based helper.
static SIMDPP_INL
void i_transpose4(uint32x8& a0, uint32x8& a1,
                  uint32x8& a2, uint32x8& a3)
{
    v_sse_transpose32x4<uint32<8>, uint64<4>>(a0, a1, a2, a3);
}
#endif
517
#if SIMDPP_USE_AVX512F
// 512-bit 32-bit 4-row transpose; delegates to the generic zip-based helper.
// Guarded by AVX512F (not AVX2): uint32<16> is only a native vector with
// AVX-512F, matching the guard of i_transpose2(uint32<16>) above. Without
// AVX512F the SIMDPP_VEC_ARRAY fallback below handles this width.
static SIMDPP_INL
void i_transpose4(uint32<16>& a0, uint32<16>& a1,
                  uint32<16>& a2, uint32<16>& a3)
{
    v_sse_transpose32x4<uint32<16>, uint64<8>>(a0, a1, a2, a3);
}
#endif
526
// Generic fallback for wider-than-native uint32 vectors: per-sub-vector groups.
template<unsigned N> SIMDPP_INL
void i_transpose4(uint32<N>& a0, uint32<N>& a1, uint32<N>& a2, uint32<N>& a3)
{
    SIMDPP_VEC_ARRAY_IMPL_REF4(uint32<N>, i_transpose4, a0, a1, a2, a3);
}
532
533// -----------------------------------------------------------------------------
534
// Transposes the 4x4 float32 matrix whose rows are a0..a3.
static SIMDPP_INL
void i_transpose4(float32x4& a0, float32x4& a1,
                  float32x4& a2, float32x4& a3)
{
#if SIMDPP_USE_SSE2
    v_sse_transpose32x4<float32<4>, float64<2>>(a0, a1, a2, a3);
#else
    // reuse the integer transpose through bit-preserving conversions
    uint32x4 b0, b1, b2, b3;
    b0 = a0; b1 = a1; b2 = a2; b3 = a3;
    i_transpose4(b0, b1, b2, b3);
    a0 = b0; a1 = b1; a2 = b2; a3 = b3;
#endif
}
548
#if SIMDPP_USE_AVX
// 256-bit float32 4-row transpose; delegates to the generic zip-based helper.
static SIMDPP_INL
void i_transpose4(float32x8& a0, float32x8& a1,
                  float32x8& a2, float32x8& a3)
{
    v_sse_transpose32x4<float32<8>, float64<4>>(a0, a1, a2, a3);
}
#endif
557
#if SIMDPP_USE_AVX512F
// 512-bit float32 4-row transpose; delegates to the generic zip-based helper.
static SIMDPP_INL
void i_transpose4(float32<16>& a0, float32<16>& a1,
                  float32<16>& a2, float32<16>& a3)
{
    v_sse_transpose32x4<float32<16>, float64<8>>(a0, a1, a2, a3);
}
#endif
566
// Generic fallback for wider-than-native float32 vectors: per-sub-vector groups.
template<unsigned N> SIMDPP_INL
void i_transpose4(float32<N>& a0, float32<N>& a1, float32<N>& a2, float32<N>& a3)
{
    SIMDPP_VEC_ARRAY_IMPL_REF4(float32<N>, i_transpose4, a0, a1, a2, a3);
}
572
573// -----------------------------------------------------------------------------
574
// Transposes the 4x4 matrices of 32-bit elements held in rows a0..a3 (one
// matrix per 128-bit lane) using two rounds of zips. V is the row vector
// type; D is the matching 64-bit-element type for the intermediate results.
template<class V, class D> SIMDPP_INL
void v_sse_transpose32x4(V& a0, V& a1, V& a2, V& a3)
{
    D b0, b1, b2, b3;
    // [a0,a1,a2,a3]
    // [b0,b1,b2,b3]
    // [c0,c1,c2,c3]
    // [d0,d1,d2,d3]
    b0 = zip4_lo(a0, a1);
    b1 = zip4_hi(a0, a1);
    b2 = zip4_lo(a2, a3);
    b3 = zip4_hi(a2, a3);
    // [a0,b0,a1,b1]
    // [a2,b2,a3,b3]
    // [c0,d0,c1,d1]
    // [c2,d2,c3,d3]
    a0 = zip2_lo(b0, b2);
    a1 = zip2_hi(b0, b2);
    a2 = zip2_lo(b1, b3);
    a3 = zip2_hi(b1, b3);
}
596
// Transposes 4-row groups of 16-bit elements using three rounds of zips.
// The trailing layout comments show the final element order: each output row
// holds two transposed 4-element columns (e.g. a0 = [..0 elements, ..4
// elements]). V16 is the row type; V32/V64 are the intermediate types.
template<class V16, class V32, class V64> SIMDPP_INL
void v_sse_transpose16x4(V16& a0, V16& a1, V16& a2, V16& a3)
{
    V32 b0, b1, b2, b3;
    V64 c0, c1, c2, c3;
    b0 = zip8_lo(a0, a1);
    b1 = zip8_hi(a0, a1);
    b2 = zip8_lo(a2, a3);
    b3 = zip8_hi(a2, a3);
    // [a0,b0,a1,b1,a2,b2,a3,b3]
    // [a4,b4,a5,b5,a6,b6,a7,b7]
    // [c0,d0,c1,d1,c2,d2,c3,d3]
    // [c4,d4,c5,d5,c6,d6,c7,d7]
    c0 = zip4_lo(b0, b2);
    c1 = zip4_hi(b0, b2);
    c2 = zip4_lo(b1, b3);
    c3 = zip4_hi(b1, b3);
    // [a0,b0,c0,d0,a1,b1,c1,d1]
    // [a2,b2,c2,d2,a3,b3,c3,d3]
    // [a4,b4,c4,d4,a5,b5,c5,d5]
    // [a6,b6,c6,d6,a7,b7,c7,d7]
    a0 = zip2_lo(c0, c2);
    a1 = zip2_hi(c0, c2);
    a2 = zip2_lo(c1, c3);
    a3 = zip2_hi(c1, c3);
    // [a0,b0,c0,d0,a4,b4,c4,d4]
    // [a1,b1,c1,d1,a5,b5,c5,d5]
    // [a2,b2,c2,d2,a6,b6,c6,d6]
    // [a3,b3,c3,d3,a7,b7,c7,d7]
}
627
// Transposes 4-row groups of 8-bit elements: two rounds of byte/word zips
// gather 4-byte [a..d]k groups into 32-bit elements, then the 32-bit 4-row
// transpose puts each group into its final row. V8 is the row type; V16/V32
// are the intermediate types.
template<class V8, class V16, class V32> SIMDPP_INL
void v_sse_transpose8x4(V8& a0, V8& a1, V8& a2, V8& a3)
{
    V16 b0, b1, b2, b3;
    b0 = zip16_lo(a0, a1);
    b1 = zip16_lo(a2, a3);
    b2 = zip16_hi(a0, a1);
    b3 = zip16_hi(a2, a3);
    // [a0,b0,a1,b1,a2,b2,a3,b3 ... b7]
    // [c0,d0,c1,d1,c2,d2,c3,d3 ... d7]
    // [a8 ... b15]
    // [c8 ... d15]
    V32 c0, c1, c2, c3;
    c0 = zip8_lo(b0, b1);
    c1 = zip8_hi(b0, b1);
    c2 = zip8_lo(b2, b3);
    c3 = zip8_hi(b2, b3);
    // [a0,b0,c0,d0,[a..d]1, [a..d]2, [a..d]3]
    // [[a..d]4, [a..d]5, [a..d]6, [a..d]7]
    // [[a..d]8, [a..d]9, [a..d]10, [a..d]11]
    // [[a..d]12, [a..d]13,[a..d]14, [a..d]15]
    i_transpose4(c0, c1, c2, c3); // 32-bit transpose
    a0 = c0;
    a1 = c1;
    a2 = c2;
    a3 = c3;
}
655
656
657} // namespace insn
658} // namespace detail
659} // namespace SIMDPP_ARCH_NAMESPACE
660} // namespace simdpp
661
662#endif
663