| 1 | /* Copyright (C) 2012-2014 Povilas Kanapickas <povilas@radix.lt> |
| 2 | |
| 3 | Distributed under the Boost Software License, Version 1.0. |
| 4 | (See accompanying file LICENSE_1_0.txt or copy at |
| 5 | http://www.boost.org/LICENSE_1_0.txt) |
| 6 | */ |
| 7 | |
| 8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_TRANSPOSE_H |
| 9 | #define LIBSIMDPP_SIMDPP_DETAIL_INSN_TRANSPOSE_H |
| 10 | |
| 11 | #include <simdpp/types.h> |
| 12 | #include <simdpp/detail/not_implemented.h> |
| 13 | #include <simdpp/detail/width.h> |
| 14 | #include <simdpp/core/permute_bytes16.h> |
| 15 | #include <simdpp/core/zip_lo.h> |
| 16 | #include <simdpp/core/zip_hi.h> |
| 17 | #include <simdpp/detail/null/transpose.h> |
| 18 | #include <simdpp/detail/neon/shuffle.h> |
| 19 | #include <simdpp/detail/vector_array_macros.h> |
| 20 | |
| 21 | namespace simdpp { |
| 22 | namespace SIMDPP_ARCH_NAMESPACE { |
| 23 | namespace detail { |
| 24 | namespace insn { |
| 25 | |
| 26 | |
// Forward declarations of the zip-based 4x4 transpose helpers defined at the
// bottom of this file. Each transposes four vectors of the given element
// width using progressively wider zip (interleave) operations.
template<class V8, class V16, class V32> SIMDPP_INL
void v_sse_transpose8x4(V8& a0, V8& a1, V8& a2, V8& a3);
template<class V16, class V32, class V64> SIMDPP_INL
void v_sse_transpose16x4(V16& a0, V16& a1, V16& a2, V16& a3);
template<class V, class D> SIMDPP_INL
void v_sse_transpose32x4(V& a0, V& a1, V& a2, V& a3);
| 33 | |
| 34 | /** Transposes eight 2x2 8-bit matrices within two int8x16 vectors |
| 35 | |
| 36 | @code |
| 37 | r0 = [ a0_0; a1_0 ; ... ; a0_14; a1_14 ] |
    r1 = [ a0_1; a1_1 ; ... ; a0_15; a1_15 ]
| 39 | @endcode |
| 40 | |
| 41 | @par 128-bit version: |
| 42 | @icost{SSE2-AVX2, 4} |
| 43 | @icost{ALTIVEC, 2-4} |
| 44 | |
| 45 | @par 256-bit version: |
| 46 | @icost{SSE2-AVX, 8} |
| 47 | @icost{AVX2, 4} |
| 48 | @icost{ALTIVEC, 4-6} |
| 49 | |
| 50 | The lower and higher 128-bit halves are processed as if 128-bit instruction |
| 51 | was applied to each of them separately. |
| 52 | */ |
static SIMDPP_INL
void i_transpose2(uint8x16& a0, uint8x16& a1)
{
#if SIMDPP_USE_NULL
    // Scalar reference implementation
    detail::null::transpose2(a0, a1);
#elif SIMDPP_USE_NEON
    // vtrnq_u8 performs the 2x2 byte transposes directly
    auto r = vtrnq_u8(a0.native(), a1.native());
    a0 = r.val[0];
    a1 = r.val[1];
#elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // m0 gathers the even-indexed bytes of both vectors, m1 the odd-indexed
    // ones. Passing the mask variable to itself only deduces the result type.
    uint8x16 m0 = make_shuffle_bytes16_mask<0,16+0, 2,16+2, 4,16+4, 6,16+6,
                                            8,16+8, 10,16+10, 12,16+12, 14,16+14>(m0);
    uint8x16 m1 = make_shuffle_bytes16_mask<1,16+1, 3,16+3, 5,16+5, 7,16+7,
                                            9,16+9, 11,16+11, 13,16+13, 15,16+15>(m1);
    uint16x8 b0, b1;
    b0 = shuffle_bytes16(a0, a1, m0);
    b1 = shuffle_bytes16(a0, a1, m1);
    a0 = b0; a1 = b1;
#else
    SIMDPP_NOT_IMPLEMENTED2(a0, a1);
#endif
}
| 75 | |
| 76 | /** Helper function. |
| 77 | |
| 78 | @code |
| 79 | r = [a0,a4,a8,a12,a1,a5,a9,a13,a2,a6,a10,a14,a3,a7,a11,a15] |
| 80 | @endcode |
| 81 | |
| 82 | The 256-bit version applies the 128 bit operation to the two halves. |
| 83 | |
    Needs SSSE3 on x86; also implemented for ALTIVEC and MSA.
| 85 | */ |
static SIMDPP_INL
uint8x16 transpose_inplace(const uint8x16& a)
{
#if SIMDPP_USE_SSSE3 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // Single byte permute rearranges the vector column-major within the
    // 16-byte lane. The compiler will take this out of any loops automatically.
    uint8x16 idx = make_uint(0, 4, 8, 12, 1, 5, 9, 13,
                             2, 6, 10,14, 3, 7, 11,15);
    return permute_bytes16(a, idx);
#else
    return SIMDPP_NOT_IMPLEMENTED1(a);
#endif
}
| 98 | |
// 256-bit overload of transpose_inplace (see the 128-bit version above).
static SIMDPP_INL
uint8x32 transpose_inplace(const uint8x32& a)
{
#if SIMDPP_USE_AVX2
    // make_uint replicates the 16 indices into both 128-bit lanes
    uint8x32 idx = make_uint(0, 4, 8, 12, 1, 5, 9, 13,
                             2, 6, 10,14, 3, 7, 11,15);
    return permute_bytes16(a, idx);
#elif SIMDPP_USE_SSSE3 || SIMDPP_USE_ALTIVEC
    // Emulated vector: apply the 128-bit version to each sub-vector
    SIMDPP_VEC_ARRAY_IMPL1(uint8x32, transpose_inplace, a);
#else
    return SIMDPP_NOT_IMPLEMENTED1(a);
#endif
}
| 112 | |
#if SIMDPP_USE_AVX512BW
// 512-bit overload of transpose_inplace (see the 128-bit version above).
static SIMDPP_INL
uint8<64> transpose_inplace(const uint8<64>& a)
{
    // Within every 16-byte lane, gather bytes in column-major order:
    // [0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15]
    uint8<64> perm = make_uint(0, 4, 8, 12, 1, 5, 9, 13,
                               2, 6, 10,14, 3, 7, 11,15);
    return permute_bytes16(a, perm);
}
#endif
| 122 | |
// Transposes four 2x2 16-bit matrices within two uint16x8 vectors.
static SIMDPP_INL
void i_transpose2(uint16x8& a0, uint16x8& a1)
{
#if SIMDPP_USE_NULL
    detail::null::transpose2(a0, a1);
#elif SIMDPP_USE_SSE2
    // Interleave into 32-bit pairs, then pull even/odd pairs apart
    uint32x4 b0, b1;
    b0 = zip8_lo(a0, a1);
    b1 = zip8_hi(a0, a1);
    a0 = shuffle2<0,2,0,2>(b0, b1);
    a1 = shuffle2<1,3,1,3>(b0, b1);
#elif SIMDPP_USE_NEON
    auto r = vtrnq_u16(a0.native(), a1.native());
    a0 = r.val[0];
    a1 = r.val[1];
#elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // m0 gathers even-indexed elements, m1 odd-indexed ones; passing the
    // mask variable to itself only deduces the result type
    uint16x8 m0 = make_shuffle_bytes16_mask<0,8+0, 2,8+2, 4,8+4, 6,8+6>(m0);
    uint16x8 m1 = make_shuffle_bytes16_mask<1,8+1, 3,8+3, 5,8+5, 7,8+7>(m1);
    uint16x8 b0, b1;
    b0 = shuffle_bytes16(a0, a1, m0);
    b1 = shuffle_bytes16(a0, a1, m1);
    a0 = b0; a1 = b1;
#endif
}
| 147 | |
#if SIMDPP_USE_AVX2
// 256-bit version: same zip-then-deinterleave scheme as the SSE2 path above.
static SIMDPP_INL
void i_transpose2(uint16x16& a0, uint16x16& a1)
{
    // Interleave 16-bit lanes into 32-bit pairs, then separate the even and
    // odd pairs back into the two result rows.
    uint32x8 lo, hi;
    lo = zip8_lo(a0, a1);
    hi = zip8_hi(a0, a1);
    a0 = shuffle2<0,2,0,2>(lo, hi);
    a1 = shuffle2<1,3,1,3>(lo, hi);
}
#endif
| 159 | |
#if SIMDPP_USE_AVX512BW
// 512-bit version: same zip-then-deinterleave scheme as the SSE2 path above.
SIMDPP_INL void i_transpose2(uint16<32>& a0, uint16<32>& a1)
{
    // Interleave 16-bit lanes into 32-bit pairs, then separate the even and
    // odd pairs back into the two result rows.
    uint32<16> lo, hi;
    lo = zip8_lo(a0, a1);
    hi = zip8_hi(a0, a1);
    a0 = shuffle2<0,2,0,2>(lo, hi);
    a1 = shuffle2<1,3,1,3>(lo, hi);
}
#endif
| 170 | |
// Generic fallback: apply i_transpose2 to each sub-vector of an emulated
// wider vector.
template<unsigned N> SIMDPP_INL
void i_transpose2(uint16<N>& a0, uint16<N>& a1)
{
    SIMDPP_VEC_ARRAY_IMPL_REF2(uint16<N>, i_transpose2, a0, a1);
}
| 176 | |
| 177 | // ----------------------------------------------------------------------------- |
| 178 | |
// Transposes two 2x2 32-bit matrices within two uint32x4 vectors.
static SIMDPP_INL
void i_transpose2(uint32x4& a0, uint32x4& a1)
{
#if SIMDPP_USE_NULL
    detail::null::transpose2(a0, a1);
#elif SIMDPP_USE_SSE2
    // Interleave into 64-bit pairs, then deinterleave the pairs
    uint64x2 b0, b1;
    b0 = zip4_lo(a0, a1);
    b1 = zip4_hi(a0, a1);
    a0 = zip2_lo(b0, b1);
    a1 = zip2_hi(b0, b1);
#elif SIMDPP_USE_NEON
    auto r = vtrnq_u32(a0.native(), a1.native());
    a0 = r.val[0];
    a1 = r.val[1];
#elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // m0 gathers even-indexed elements, m1 odd-indexed ones; passing the
    // mask variable to itself only deduces the result type
    uint32x4 m0 = make_shuffle_bytes16_mask<0,4+0, 2,4+2>(m0);
    uint32x4 m1 = make_shuffle_bytes16_mask<1,4+1, 3,4+3>(m1);
    uint32x4 b0, b1;
    b0 = shuffle_bytes16(a0, a1, m0);
    b1 = shuffle_bytes16(a0, a1, m1);
    a0 = b0; a1 = b1;
#endif
}
| 203 | |
#if SIMDPP_USE_AVX2
// 256-bit version: same zip-based scheme as the SSE2 path above.
static SIMDPP_INL
void i_transpose2(uint32x8& a0, uint32x8& a1)
{
    // Pair up 32-bit lanes into 64-bit units, then deinterleave the pairs.
    uint64x4 lo, hi;
    lo = zip4_lo(a0, a1);
    hi = zip4_hi(a0, a1);
    a0 = zip2_lo(lo, hi);
    a1 = zip2_hi(lo, hi);
}
#endif
| 215 | |
#if SIMDPP_USE_AVX512F
// 512-bit version: same zip-based scheme as the SSE2 path above.
static SIMDPP_INL
void i_transpose2(uint32<16>& a0, uint32<16>& a1)
{
    // Pair up 32-bit lanes into 64-bit units, then deinterleave the pairs.
    uint64<8> lo, hi;
    lo = zip4_lo(a0, a1);
    hi = zip4_hi(a0, a1);
    a0 = zip2_lo(lo, hi);
    a1 = zip2_hi(lo, hi);
}
#endif
| 227 | |
// Generic fallback: apply i_transpose2 to each sub-vector of an emulated
// wider vector.
template<unsigned N> SIMDPP_INL
void i_transpose2(uint32<N>& a0, uint32<N>& a1)
{
    SIMDPP_VEC_ARRAY_IMPL_REF2(uint32<N>, i_transpose2, a0, a1);
}
| 233 | |
| 234 | // ----------------------------------------------------------------------------- |
| 235 | |
// Transposes a single 2x2 64-bit matrix held in two uint64x2 vectors.
static SIMDPP_INL
void i_transpose2(uint64x2& a0, uint64x2& a1)
{
#if SIMDPP_USE_SSE2 || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
    // A 2x2 64-bit transpose is just an interleave of the low/high halves
    uint64x2 b0;
    b0 = zip2_lo(a0, a1);
    a1 = zip2_hi(a0, a1);
    a0 = b0;
#elif SIMDPP_USE_NEON
    neon::transpose2(a0, a1);
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    // No suitable 64-bit shuffles on pre-VSX Altivec; scalar fallback
    detail::null::transpose2(a0, a1);
#endif
}
| 250 | |
#if SIMDPP_USE_AVX2
// 256-bit version: a 2x2 64-bit transpose is a plain low/high interleave.
static SIMDPP_INL
void i_transpose2(uint64x4& a0, uint64x4& a1)
{
    uint64x4 lo;
    lo = zip2_lo(a0, a1);
    a1 = zip2_hi(a0, a1);
    a0 = lo;
}
#endif
| 261 | |
#if SIMDPP_USE_AVX512F
// 512-bit version: a 2x2 64-bit transpose is a plain low/high interleave.
static SIMDPP_INL
void i_transpose2(uint64<8>& a0, uint64<8>& a1)
{
    uint64<8> lo;
    lo = zip2_lo(a0, a1);
    a1 = zip2_hi(a0, a1);
    a0 = lo;
}
#endif
| 272 | |
// Generic fallback: apply i_transpose2 to each sub-vector of an emulated
// wider vector.
template<unsigned N> SIMDPP_INL
void i_transpose2(uint64<N>& a0, uint64<N>& a1)
{
    SIMDPP_VEC_ARRAY_IMPL_REF2(uint64<N>, i_transpose2, a0, a1);
}
| 278 | |
| 279 | // ----------------------------------------------------------------------------- |
| 280 | |
// Transposes two 2x2 float32 matrices within two float32x4 vectors.
static SIMDPP_INL
void i_transpose2(float32x4& a0, float32x4& a1)
{
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    detail::null::transpose2(a0, a1);
#elif SIMDPP_USE_SSE2
    // Same zip scheme as the uint32 version; bit_cast moves between the
    // float32 and float64 views without changing any bits
    float64x2 b0, b1;
    b0 = bit_cast<float64x2>(zip4_lo(a0, a1));
    b1 = bit_cast<float64x2>(zip4_hi(a0, a1));
    a0 = bit_cast<float32x4>(zip2_lo(b0, b1));
    a1 = bit_cast<float32x4>(zip2_hi(b0, b1));
#elif SIMDPP_USE_NEON
    auto r = vtrnq_f32(a0.native(), a1.native());
    a0 = r.val[0];
    a1 = r.val[1];
#elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // m0 gathers even-indexed elements, m1 odd-indexed ones; passing the
    // mask variable to itself only deduces the result type
    uint32x4 m0 = make_shuffle_bytes16_mask<0,4+0, 2,4+2>(m0);
    uint32x4 m1 = make_shuffle_bytes16_mask<1,4+1, 3,4+3>(m1);
    float32x4 b0, b1;
    b0 = shuffle_bytes16(a0, a1, m0);
    b1 = shuffle_bytes16(a0, a1, m1);
    a0 = b0; a1 = b1;
#endif
}
| 305 | |
#if SIMDPP_USE_AVX
// 256-bit version: same zip-based scheme as the SSE2 path above.
static SIMDPP_INL
void i_transpose2(float32x8& a0, float32x8& a1)
{
    // Interleave into 64-bit pairs, then deinterleave the pairs.
    float64x4 lo, hi;
    lo = zip4_lo(a0, a1);
    hi = zip4_hi(a0, a1);
    a0 = zip2_lo(lo, hi);
    a1 = zip2_hi(lo, hi);
}
#endif
| 317 | |
#if SIMDPP_USE_AVX512F
// 512-bit version: same zip-based scheme as the SSE2 path above.
static SIMDPP_INL
void i_transpose2(float32<16>& a0, float32<16>& a1)
{
    // Interleave into 64-bit pairs, then deinterleave the pairs.
    float64<8> lo, hi;
    lo = zip4_lo(a0, a1);
    hi = zip4_hi(a0, a1);
    a0 = zip2_lo(lo, hi);
    a1 = zip2_hi(lo, hi);
}
#endif
| 329 | |
// Generic fallback: apply i_transpose2 to each sub-vector of an emulated
// wider vector.
template<unsigned N> SIMDPP_INL
void i_transpose2(float32<N>& a0, float32<N>& a1)
{
    SIMDPP_VEC_ARRAY_IMPL_REF2(float32<N>, i_transpose2, a0, a1);
}
| 335 | |
| 336 | // ----------------------------------------------------------------------------- |
| 337 | |
// Transposes a single 2x2 float64 matrix held in two float64x2 vectors.
static SIMDPP_INL
void i_transpose2(float64x2& a0, float64x2& a1)
{
#if SIMDPP_USE_SSE2 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
    // A 2x2 64-bit transpose is just an interleave of the low/high halves
    float64x2 b0;
    b0 = zip2_lo(a0, a1);
    a1 = zip2_hi(a0, a1);
    a0 = b0;
#elif SIMDPP_USE_NEON64
    // Reuse the 64-bit integer transpose; element movement is type-agnostic
    uint64x2 b0, b1;
    b0 = a0; b1 = a1;
    i_transpose2(b0, b1);
    a0 = b0; a1 = b1;
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
    detail::null::transpose2(a0, a1);
#endif
}
| 355 | |
#if SIMDPP_USE_AVX
// 256-bit version: a 2x2 64-bit transpose is a plain low/high interleave.
static SIMDPP_INL
void i_transpose2(float64x4& a0, float64x4& a1)
{
    float64x4 lo;
    lo = zip2_lo(a0, a1);
    a1 = zip2_hi(a0, a1);
    a0 = lo;
}
#endif
| 366 | |
#if SIMDPP_USE_AVX512F
// 512-bit version: a 2x2 64-bit transpose is a plain low/high interleave.
static SIMDPP_INL
void i_transpose2(float64<8>& a0, float64<8>& a1)
{
    float64<8> lo;
    lo = zip2_lo(a0, a1);
    a1 = zip2_hi(a0, a1);
    a0 = lo;
}
#endif
| 377 | |
// Generic fallback: apply i_transpose2 to each sub-vector of an emulated
// wider vector.
template<unsigned N> SIMDPP_INL
void i_transpose2(float64<N>& a0, float64<N>& a1)
{
    SIMDPP_VEC_ARRAY_IMPL_REF2(float64<N>, i_transpose2, a0, a1);
}
| 383 | |
| 384 | // ----------------------------------------------------------------------------- |
| 385 | |
// Forward declarations: the 8-bit and 16-bit 4x4 transposes below ultimately
// delegate to the 32-bit versions defined later in this file.
static SIMDPP_INL
void i_transpose4(uint32x4& a0, uint32x4& a1,
                  uint32x4& a2, uint32x4& a3);

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_transpose4(uint32x8& a0, uint32x8& a1,
                  uint32x8& a2, uint32x8& a3);
#endif
| 395 | |
// Transposes four 4x4 matrices of 8-bit elements held in four vectors.
static SIMDPP_INL
void i_transpose4(uint8x16& a0, uint8x16& a1,
                  uint8x16& a2, uint8x16& a3)
{
    // [a0,a1,a2,a3 ... ]
    // [b0,b1,b2,b3 ... ]
    // [c0,c1,c2,c3 ... ]
    // [d0,d1,d2,d3 ... ]
#if SIMDPP_USE_NULL
    detail::null::transpose4(a0, a1, a2, a3);
#elif SIMDPP_USE_SSE2
    v_sse_transpose8x4<uint8<16>, uint16<8>, uint32<4>>(a0, a1, a2, a3);
#elif SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // Two rounds of 2x2 transposes: first at 8-bit, then at 16-bit granularity
    uint16x8 b0, b1, b2, b3;
    i_transpose2(a0, a1); // 8-bit transpose
    i_transpose2(a2, a3);
    b0 = a0; b1 = a1; b2 = a2; b3 = a3;
    i_transpose2(b0, b2); // 16-bit transpose
    i_transpose2(b1, b3);
    a0 = b0; a1 = b1; a2 = b2; a3 = b3;
#endif
}
| 418 | |
| 419 | |
#if SIMDPP_USE_AVX2
// 256-bit version: delegate to the generic zip-based helper
static SIMDPP_INL
void i_transpose4(uint8x32& a0, uint8x32& a1,
                  uint8x32& a2, uint8x32& a3)
{
    v_sse_transpose8x4<uint8<32>, uint16<16>, uint32<8>>(a0, a1, a2, a3);
}
#endif
| 428 | |
#if SIMDPP_USE_AVX512BW
// 512-bit version: delegate to the generic zip-based helper
static SIMDPP_INL
void i_transpose4(uint8<64>& a0, uint8<64>& a1,
                  uint8<64>& a2, uint8<64>& a3)
{
    v_sse_transpose8x4<uint8<64>, uint16<32>, uint32<16>>(a0, a1, a2, a3);
}
#endif
| 437 | |
// Generic fallback: apply i_transpose4 to each sub-vector of an emulated
// wider vector.
template<unsigned N> SIMDPP_INL
void i_transpose4(uint8<N>& a0, uint8<N>& a1, uint8<N>& a2, uint8<N>& a3)
{
    SIMDPP_VEC_ARRAY_IMPL_REF4(uint8<N>, i_transpose4, a0, a1, a2, a3);
}
| 443 | |
| 444 | // ----------------------------------------------------------------------------- |
| 445 | |
// Transposes two 4x4 matrices of 16-bit elements held in four vectors.
static SIMDPP_INL
void i_transpose4(uint16x8& a0, uint16x8& a1,
                  uint16x8& a2, uint16x8& a3)
{
#if SIMDPP_USE_NULL
    detail::null::transpose4(a0, a1, a2, a3);
#elif SIMDPP_USE_SSE2
    v_sse_transpose16x4<uint16<8>, uint32<4>, uint64<2>>(a0, a1, a2, a3);
#elif SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // Two rounds of 2x2 transposes: first at 16-bit, then at 32-bit granularity
    uint32x4 b0, b1, b2, b3;
    i_transpose2(a0, a1); // 16-bit transpose
    i_transpose2(a2, a3);
    b0 = a0; b1 = a1; b2 = a2; b3 = a3;
    i_transpose2(b0, b2); // 32-bit transpose
    i_transpose2(b1, b3);
    a0 = b0; a1 = b1; a2 = b2; a3 = b3;
#endif
}
| 464 | |
#if SIMDPP_USE_AVX2
// 256-bit version: delegate to the generic zip-based helper
static SIMDPP_INL
void i_transpose4(uint16x16& a0, uint16x16& a1,
                  uint16x16& a2, uint16x16& a3)
{
    v_sse_transpose16x4<uint16<16>, uint32<8>, uint64<4>>(a0, a1, a2, a3);
}
#endif
| 473 | |
// 512-bit version. uint16<32> is a native vector only with AVX-512BW, which
// is also the guard used by i_transpose2(uint16<32>) above; the previous
// SIMDPP_USE_AVX2 guard exposed this overload on AVX2-only builds, where
// uint16<32> is emulated and should be handled by the generic per-sub-vector
// template below instead.
#if SIMDPP_USE_AVX512BW
SIMDPP_INL void i_transpose4(uint16<32>& a0, uint16<32>& a1,
                             uint16<32>& a2, uint16<32>& a3)
{
    v_sse_transpose16x4<uint16<32>, uint32<16>, uint64<8>>(a0, a1, a2, a3);
}
#endif
| 481 | |
// Generic fallback: apply i_transpose4 to each sub-vector of an emulated
// wider vector.
template<unsigned N> SIMDPP_INL
void i_transpose4(uint16<N>& a0, uint16<N>& a1, uint16<N>& a2, uint16<N>& a3)
{
    SIMDPP_VEC_ARRAY_IMPL_REF4(uint16<N>, i_transpose4, a0, a1, a2, a3);
}
| 487 | |
| 488 | // ----------------------------------------------------------------------------- |
| 489 | |
// Transposes a 4x4 matrix of 32-bit elements held in four vectors.
static SIMDPP_INL
void i_transpose4(uint32x4& a0, uint32x4& a1,
                  uint32x4& a2, uint32x4& a3)
{
#if SIMDPP_USE_NULL
    detail::null::transpose4(a0, a1, a2, a3);
#elif SIMDPP_USE_SSE2
    v_sse_transpose32x4<uint32<4>, uint64<2>>(a0, a1, a2, a3);
#elif SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // Two rounds of 2x2 transposes: first at 32-bit, then at 64-bit granularity
    uint64x2 b0, b1, b2, b3;
    i_transpose2(a0, a1); // 32-bit transpose
    i_transpose2(a2, a3);
    b0 = a0; b1 = a1; b2 = a2; b3 = a3;
    i_transpose2(b0, b2); // 64-bit transpose
    i_transpose2(b1, b3);
    a0 = b0; a1 = b1; a2 = b2; a3 = b3;
#endif
}
| 508 | |
#if SIMDPP_USE_AVX2
// 256-bit version: delegate to the generic zip-based helper
static SIMDPP_INL
void i_transpose4(uint32x8& a0, uint32x8& a1,
                  uint32x8& a2, uint32x8& a3)
{
    v_sse_transpose32x4<uint32<8>, uint64<4>>(a0, a1, a2, a3);
}
#endif
| 517 | |
// 512-bit version. uint32<16> is a native vector only with AVX-512F, which
// is also the guard used by i_transpose2(uint32<16>) and
// i_transpose4(float32<16>); the previous SIMDPP_USE_AVX2 guard exposed this
// overload on AVX2-only builds, where uint32<16> is emulated and should be
// handled by the generic per-sub-vector template below instead.
#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_transpose4(uint32<16>& a0, uint32<16>& a1,
                  uint32<16>& a2, uint32<16>& a3)
{
    v_sse_transpose32x4<uint32<16>, uint64<8>>(a0, a1, a2, a3);
}
#endif
| 526 | |
// Generic fallback: apply i_transpose4 to each sub-vector of an emulated
// wider vector.
template<unsigned N> SIMDPP_INL
void i_transpose4(uint32<N>& a0, uint32<N>& a1, uint32<N>& a2, uint32<N>& a3)
{
    SIMDPP_VEC_ARRAY_IMPL_REF4(uint32<N>, i_transpose4, a0, a1, a2, a3);
}
| 532 | |
| 533 | // ----------------------------------------------------------------------------- |
| 534 | |
// Transposes a 4x4 matrix of float32 elements held in four vectors.
static SIMDPP_INL
void i_transpose4(float32x4& a0, float32x4& a1,
                  float32x4& a2, float32x4& a3)
{
#if SIMDPP_USE_SSE2
    v_sse_transpose32x4<float32<4>, float64<2>>(a0, a1, a2, a3);
#else
    // Reuse the integer implementation; a transpose only moves elements
    uint32x4 b0, b1, b2, b3;
    b0 = a0; b1 = a1; b2 = a2; b3 = a3;
    i_transpose4(b0, b1, b2, b3);
    a0 = b0; a1 = b1; a2 = b2; a3 = b3;
#endif
}
| 548 | |
#if SIMDPP_USE_AVX
// 256-bit version: delegate to the generic zip-based helper
static SIMDPP_INL
void i_transpose4(float32x8& a0, float32x8& a1,
                  float32x8& a2, float32x8& a3)
{
    v_sse_transpose32x4<float32<8>, float64<4>>(a0, a1, a2, a3);
}
#endif
| 557 | |
#if SIMDPP_USE_AVX512F
// 512-bit version: delegate to the generic zip-based helper
static SIMDPP_INL
void i_transpose4(float32<16>& a0, float32<16>& a1,
                  float32<16>& a2, float32<16>& a3)
{
    v_sse_transpose32x4<float32<16>, float64<8>>(a0, a1, a2, a3);
}
#endif
| 566 | |
// Generic fallback: apply i_transpose4 to each sub-vector of an emulated
// wider vector.
template<unsigned N> SIMDPP_INL
void i_transpose4(float32<N>& a0, float32<N>& a1, float32<N>& a2, float32<N>& a3)
{
    SIMDPP_VEC_ARRAY_IMPL_REF4(float32<N>, i_transpose4, a0, a1, a2, a3);
}
| 572 | |
| 573 | // ----------------------------------------------------------------------------- |
| 574 | |
| 575 | template<class V, class D> SIMDPP_INL |
| 576 | void v_sse_transpose32x4(V& a0, V& a1, V& a2, V& a3) |
| 577 | { |
| 578 | D b0, b1, b2, b3; |
| 579 | // [a0,a1,a2,a3] |
| 580 | // [b0,b1,b2,b3] |
| 581 | // [c0,c1,c2,c3] |
| 582 | // [d0,d1,d2,d3] |
| 583 | b0 = zip4_lo(a0, a1); |
| 584 | b1 = zip4_hi(a0, a1); |
| 585 | b2 = zip4_lo(a2, a3); |
| 586 | b3 = zip4_hi(a2, a3); |
| 587 | // [a0,b0,a1,b1] |
| 588 | // [a2,b2,a3,b3] |
| 589 | // [c0,d0,c1,d1] |
| 590 | // [c2,d2,c3,d3] |
| 591 | a0 = zip2_lo(b0, b2); |
| 592 | a1 = zip2_hi(b0, b2); |
| 593 | a2 = zip2_lo(b1, b3); |
| 594 | a3 = zip2_hi(b1, b3); |
| 595 | } |
| 596 | |
| 597 | template<class V16, class V32, class V64> SIMDPP_INL |
| 598 | void v_sse_transpose16x4(V16& a0, V16& a1, V16& a2, V16& a3) |
| 599 | { |
| 600 | V32 b0, b1, b2, b3; |
| 601 | V64 c0, c1, c2, c3; |
| 602 | b0 = zip8_lo(a0, a1); |
| 603 | b1 = zip8_hi(a0, a1); |
| 604 | b2 = zip8_lo(a2, a3); |
| 605 | b3 = zip8_hi(a2, a3); |
| 606 | // [a0,b0,a1,b1,a2,b2,a3,b3] |
| 607 | // [a4,b4,a5,b5,a6,b6,a7,b7] |
| 608 | // [c0,d0,c1,d1,c2,d2,c3,d3] |
| 609 | // [c4,d4,c5,d5,c6,d6,c7,d7] |
| 610 | c0 = zip4_lo(b0, b2); |
| 611 | c1 = zip4_hi(b0, b2); |
| 612 | c2 = zip4_lo(b1, b3); |
| 613 | c3 = zip4_hi(b1, b3); |
| 614 | // [a0,b0,c0,d0,a1,b1,c1,d1] |
| 615 | // [a2,b2,c2,d2,a3,b3,c3,d3] |
| 616 | // [a4,b4,c4,d4,a5,b5,c5,d5] |
| 617 | // [a6,b6,c6,d6,a7,b7,c7,d7] |
| 618 | a0 = zip2_lo(c0, c2); |
| 619 | a1 = zip2_hi(c0, c2); |
| 620 | a2 = zip2_lo(c1, c3); |
| 621 | a3 = zip2_hi(c1, c3); |
| 622 | // [a0,b0,c0,d0,a4,b4,c4,d4] |
| 623 | // [a1,b1,c1,d1,a5,b5,c5,d5] |
| 624 | // [a2,b2,c2,d2,a6,b6,c6,d6] |
| 625 | // [a3,b3,c3,d3,a7,b7,c7,d7] |
| 626 | } |
| 627 | |
| 628 | template<class V8, class V16, class V32> SIMDPP_INL |
| 629 | void v_sse_transpose8x4(V8& a0, V8& a1, V8& a2, V8& a3) |
| 630 | { |
| 631 | V16 b0, b1, b2, b3; |
| 632 | b0 = zip16_lo(a0, a1); |
| 633 | b1 = zip16_lo(a2, a3); |
| 634 | b2 = zip16_hi(a0, a1); |
| 635 | b3 = zip16_hi(a2, a3); |
| 636 | // [a0,b0,a1,b1,a2,b2,a3,b3 ... b7] |
| 637 | // [c0,d0,c1,d1,c2,d2,c3,d3 ... d7] |
| 638 | // [a8 ... b15] |
| 639 | // [c8 ... d15] |
| 640 | V32 c0, c1, c2, c3; |
| 641 | c0 = zip8_lo(b0, b1); |
| 642 | c1 = zip8_hi(b0, b1); |
| 643 | c2 = zip8_lo(b2, b3); |
| 644 | c3 = zip8_hi(b2, b3); |
| 645 | // [a0,b0,c0,d0,[a..d]1, [a..d]2, [a..d]3] |
| 646 | // [[a..d]4, [a..d]5, [a..d]6, [a..d]7] |
| 647 | // [[a..d]8, [a..d]9, [a..d]10, [a..d]11] |
| 648 | // [[a..d]12, [a..d]13,[a..d]14, [a..d]15] |
| 649 | i_transpose4(c0, c1, c2, c3); // 32-bit transpose |
| 650 | a0 = c0; |
| 651 | a1 = c1; |
| 652 | a2 = c2; |
| 653 | a3 = c3; |
| 654 | } |
| 655 | |
| 656 | |
| 657 | } // namespace insn |
| 658 | } // namespace detail |
| 659 | } // namespace SIMDPP_ARCH_NAMESPACE |
| 660 | } // namespace simdpp |
| 661 | |
| 662 | #endif |
| 663 | |