/* Copyright (C) 2013-2014 Povilas Kanapickas <povilas@radix.lt>

   Distributed under the Boost Software License, Version 1.0.
   (See accompanying file LICENSE_1_0.txt or copy at
   http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_MEM_UNPACK_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_MEM_UNPACK_H

#ifndef LIBSIMDPP_SIMD_H
#error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/width.h>
#include <simdpp/detail/insn/shuffle128.h>
#include <simdpp/detail/insn/zip128.h>
#include <simdpp/core/align.h>
#include <simdpp/core/splat_n.h>
#include <simdpp/core/make_shuffle_bytes_mask.h>
#include <simdpp/core/shuffle1.h>
#include <simdpp/core/shuffle2.h>
#include <simdpp/core/transpose.h>
#include <simdpp/core/unzip_hi.h>
#include <simdpp/core/unzip_lo.h>
#include <simdpp/core/zip_hi.h>
#include <simdpp/core/zip_lo.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

/** Concatenates @a a and @a b and stores the elements of the resulting array
    as follows:
     * every (2n)-th element is stored to @a a
     * every (2n+1)-th element is stored to @a b

    n = [0, <number of elements in vector> - 1]
*/
template<class V> SIMDPP_INL
void mem_unpack2(any_vec<16,V>& qa, any_vec<16,V>& qb)
{
    V a = qa.wrapped();
    V b = qb.wrapped();

    qa.wrapped() = unzip128_lo(a, b);
    qb.wrapped() = unzip128_hi(a, b);
}
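
// For illustration only (not part of the interface): with uint16<8> inputs
// holding the concatenation [x0 .. x15], i.e.
//   a = [x0, x1, x2, x3, x4, x5, x6, x7 ]
//   b = [x8, x9, x10,x11,x12,x13,x14,x15]
// mem_unpack2 leaves
//   a = [x0, x2, x4, x6, x8, x10,x12,x14]
//   b = [x1, x3, x5, x7, x9, x11,x13,x15]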

template<class V> SIMDPP_INL
void mem_unpack2(any_vec<32,V>& qa, any_vec<32,V>& qb)
{
    V a = qa.wrapped();
    V b = qb.wrapped();

    V c1 = shuffle1_128<0,0>(a, b);
    V c2 = shuffle1_128<1,1>(a, b);
    qa.wrapped() = unzip128_lo(c1, c2);
    qb.wrapped() = unzip128_hi(c1, c2);
}

#if SIMDPP_USE_AVX512F
template<class V> SIMDPP_INL
void mem_unpack2(any_vec<64,V>& qa, any_vec<64,V>& qb)
{
    V a = qa.wrapped();
    V b = qb.wrapped();

    V c1 = shuffle2_128<0,2,0,2>(a, b);
    V c2 = shuffle2_128<1,3,1,3>(a, b);
    qa.wrapped() = unzip128_lo(c1, c2);
    qb.wrapped() = unzip128_hi(c1, c2);
}
#endif

/** Generic implementation of mem_unpack3. The 128-bit lanes are processed
    independently.
*/
template<class T> SIMDPP_INL
void v_mem_unpack3_impl8_128(T& a, T& b, T& c)
{
#if SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // [a0, b0, c0, a1, b1, c1, a2, b2, c2, a3, b3, c3, a4, b4, c4, a5 ]
    // [b5, c5, a6, b6, c6, a7, b7, c7, a8, b8, c8, a9, b9, c9, a10,b10]
    // [c10,a11,b11,c11,a12,b12,c12,a13,b13,c13,a14,b14,c14,a15,b15,c15]
    T mask1 = make_shuffle_bytes16_mask<1, 4, 7, 10, 13, 16+0, 16+3, 16+6,
                                        16+9, 16+12, 16+15, 2, 5, 8, 11, 14>(mask1);
    T a1, b1, c1;
    a1 = shuffle_bytes16(c, a, mask1);
    b1 = shuffle_bytes16(a, b, mask1);
    c1 = shuffle_bytes16(b, c, mask1);
    // [a11,a12,a13,a14,a15,a0, a1, a2, a3, a4, a5, b11,b12,b13,b14,b15]
    // [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10,c0, c1, c2, c3, c4 ]
    // [c5, c6, c7, c8, c9, c10,c11,c12,c13,c14,c15,a6, a7, a8, a9, a10]
    T a2, b2, c2;
    T mask2 = make_uint(0xff);
    mask2 = move16_l<5>(mask2);

    a2 = blend(a1, c1, mask2);
    b2 = blend(b1, a1, mask2);
    c2 = blend(c1, b1, mask2);
    // [a11..a15,a0..a10]
    // [b0..b15]
    // [c5..c15,c0..c4]
    a = align16<5>(a2, a2);
    b = b2;
    c = align16<11>(c2, c2);
#else
    typename same_width<T>::u8 t0, t1, t2, t3;
    t0 = a;
    t1 = align16<12>(a, b);
    t2 = align16<8>(b, c);
    t3 = move16_l<4>(c);
    // [a0, b0, c0, a1, b1, c1, a2, b2, c2, a3, b3, c3, ...]
    // [a4, b4, c4, a5, b5, c5, a6, b6, c6, a7, b7, c7, ...]
    // [a8, b8, c8, a9, b9, c9, a10,b10,c10,a11,b11,c11, ...]
    // [a12,b12,c12,a13,b13,c13,a14,b14,c14,a15,b15,c15, ...]
    typename same_width<T>::u16 b0, b1, b2, b3;
    b0 = zip16_lo(t0, t1);
    b1 = zip16_lo(t2, t3);
    b2 = zip16_hi(t0, t1);
    b3 = zip16_hi(t2, t3);
    // [a0, a4, b0, b4, c0, c4, a1, a5, b1, b5, c1, c5, a2, a6, b2, b6 ]
    // [a8, a12,b8, b12,c8, c12,a9, a13,b9, b13,c9, c13,a10,a14,b10,b14]
    // [c2, c6, a3, a7, b3, b7, c3, c7, ... ]
    // [c10,c14,a11,a15,b11,b15,c11,c15,... ]
    typename same_width<T>::u8 u0, u1, u2;
    u0 = zip8_lo(b0, b1);
    u1 = zip8_hi(b0, b1);
    u2 = zip8_lo(b2, b3);
    // [a0, a4, a8, a12,b0, b4, b8, b12, c0, c4, c8, c12, a1, a5, a9, a13 ]
    // [b1, b5, b9, b13,c1, c5, c9, c13, a2, a6, a10,a14, b2, b6, b10,b14 ]
    // [c2, c6, c10,c14,a3, a7, a11,a15, b3, b7, b11,b15, c3, c7, c11,c15 ]
    t0 = u0;
    t1 = align16<12>(u0, u1);
    t2 = align16<8>(u1, u2);
    t3 = move16_l<4>(u2);
    // [a0, a4, a8, a12,b0, b4, b8, b12, c0, c4, c8, c12, ...]
    // [a1, a5, a9, a13,b1, b5, b9, b13, c1, c5, c9, c13, ...]
    // [a2, a6, a10,a14,b2, b6, b10,b14, c2, c6, c10,c14, ...]
    // [a3, a7, a11,a15,b3, b7, b11,b15, c3, c7, c11,c15, ...]
    b0 = zip16_lo(t0, t1);
    b1 = zip16_lo(t2, t3);
    b2 = zip16_hi(t0, t1);
    b3 = zip16_hi(t2, t3);
    // [a0, a1, a4, a5, a8, a9, a12,a13,b0, b1, b4, b5, b8, b9, b12,b13 ]
    // [a2, a3, a6, a7, a10,a11,a14,a15,b2, b3, b6, b7, b10,b11,b14,b15 ]
    // [c0, c1, c4, c5, c8, c9, c12,c13, ... ]
    // [c2, c3, c6, c7, c10,c11,c14,c15, ... ]
    a = zip8_lo(b0, b1);
    b = zip8_hi(b0, b1);
    c = zip8_lo(b2, b3);
#endif
}

template<class T> SIMDPP_INL
void v_mem_unpack3_impl16_128(T& a, T& b, T& c)
{
#if SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // [a0,b0,c0,a1,b1,c1,a2,b2]
    // [c2,a3,b3,c3,a4,b4,c4,a5]
    // [b5,c5,a6,b6,c6,a7,b7,c7]
    T mask1 = make_shuffle_bytes16_mask<0,3,6,8+1,8+4,8+7,8+2,8+5>(mask1);
    T a1, b1, c1;
    a1 = shuffle_bytes16(a, b, mask1);
    c1 = shuffle_bytes16(b, c, mask1);
    b1 = shuffle_bytes16(c, a, mask1);
    // [a0,a1,a2,a3,a4,a5,b3,b4]
    // [c2,c3,c4,c5,c6,c7,a6,a7]
    // [b5,b6,b7,b0,b1,b2,c0,c1]
    T a2, b2, c2;
    T mask2 = make_uint(0xffff);
    mask2 = move8_l<2>(mask2);

    a2 = blend(a1, c1, mask2);
    b2 = blend(b1, a1, mask2);
    c2 = blend(c1, b1, mask2);
    // [a0..a7]
    // [b5..b7,b0..b4]
    // [c2..c7,c0,c1]
    a = a2;
    b = align8<3>(b2, b2);
    c = align8<6>(c2, c2);
#else
    T t0, t1, t2, t3;
    t0 = a;
    t1 = align8<6>(a, b);
    t2 = align8<4>(b, c);
    t3 = move8_l<2>(c);
    // [a0,b0,c0,a1,b1,c1, ... ]
    // [a2,b2,c2,a3,b3,c3, ... ]
    // [a4,b4,c4,a5,b5,c5, ... ]
    // [a6,b6,c6,a7,b7,c7, ... ]
    typename same_width<T>::u32 b0, b1, b2, b3;
    b0 = zip8_lo(t0, t1);
    b1 = zip8_lo(t2, t3);
    b2 = zip8_hi(t0, t1);
    b3 = zip8_hi(t2, t3);
    // [a0,a2,b0,b2,c0,c2,a1,a3]
    // [a4,a6,b4,b6,c4,c6,a5,a7]
    // [b1,b3,c1,c3, ... ]
    // [b5,b7,c5,c7, ... ]
    typename same_width<T>::u64 c0, c1, c2;
    c0 = zip4_lo(b0, b1);
    c1 = zip4_hi(b0, b1);
    c2 = zip4_lo(b2, b3);
    // [a0,a2,a4,a6,b0,b2,b4,b6]
    // [c0,c2,c4,c6,a1,a3,a5,a7]
    // [b1,b3,b5,b7,c1,c3,c5,c7]
    t0 = c0;
    t1 = shuffle1<1,0>(c0, c1);
    t2 = splat2<1>(c1);
    t3 = c2;
    // [a0,a2,a4,a6,b0,b2,b4,b6]
    // [b0,b2,b4,b6,c0,c2,c4,c6]
    // [a1,a3,a5,a7,a1,a3,a5,a7]
    // [b1,b3,b5,b7,c1,c3,c5,c7]
    a = zip8_lo(t0, t2);
    b = zip8_lo(t1, t3);
    c = zip8_hi(t1, t3);
#endif
}

template<class T> SIMDPP_INL
void v_mem_unpack3_impl32_128(T& a, T& b, T& c)
{
#if SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    using U = typename T::uint_vector_type;

    // [a0,b0,c0,a1]
    // [b1,c1,a2,b2]
    // [c2,a3,b3,c3]
    U mask1 = make_shuffle_bytes16_mask<0,3,4+2,4+1>(mask1);
    T a1, b1, c1;
    a1 = shuffle_bytes16(a, b, mask1);
    b1 = shuffle_bytes16(b, c, mask1);
    c1 = shuffle_bytes16(c, a, mask1);
    // [a0,a1,a2,c1]
    // [b1,b2,b3,a3]
    // [c2,c3,c0,b0]
    T a2, b2, c2;
    U mask2 = make_uint(0xffffffff);
    mask2 = move4_l<1>(mask2);

    a2 = blend(a1, b1, mask2);
    b2 = blend(b1, c1, mask2);
    c2 = blend(c1, a1, mask2);
    // [a0,a1,a2,a3]
    // [b1,b2,b3,b0]
    // [c2,c3,c0,c1]
    a = a2;
    b = align4<3>(b2, b2);
    c = align4<2>(c2, c2);
#else
    T t11, t12, t21, t22, t31, t32;
    // [a0,b0,c0,a1]
    // [b1,c1,a2,b2]
    // [c2,a3,b3,c3]
    t11 = a;
    t12 = shuffle2<0,1,2,3>(c, b);
    t21 = shuffle2<0,1,0,1>(a, b);
    t22 = shuffle2<2,3,2,3>(b, c);
    t31 = shuffle2<2,3,0,1>(a, b);
    t32 = c;
    // [a0,b0,c0,a1]
    // [c2,a3,a2,b2]
    // [a0,b0,b1,c1]
    // [a2,b2,b3,c3]
    // [c0,a1,b1,c1]
    // [c2,a3,b3,c3]
    a = shuffle2<0,3,2,1>(t11, t12);
    b = shuffle2<1,2,1,2>(t21, t22);
    c = shuffle2<0,3,0,3>(t31, t32);
#endif
}

template<class T> SIMDPP_INL
void v_mem_unpack3_impl64_128(T& a, T& b, T& c)
{
    T d0, d1, d2;
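    // With 64-bit elements each 128-bit vector holds two values. If the
    // concatenation of a, b, c is [x0 .. x5], then on input
    // a = [x0,x1], b = [x2,x3], c = [x4,x5].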
    d0 = shuffle1<0,1>(a, b);
    d1 = shuffle1<1,0>(a, c);
    d2 = shuffle1<0,1>(b, c);
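    // d0 = [x0,x3], d1 = [x1,x4], d2 = [x2,x5], i.e. a 3-way deinterleave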
    a = d0; b = d1; c = d2;
}

template<class V> SIMDPP_INL
void v_mem_unpack3_shuffle128(any_vec<16,V>& qa, any_vec<16,V>& qb, any_vec<16,V>& qc)
{
    (void) qa; (void) qb; (void) qc;
}

template<class V> SIMDPP_INL
void v_mem_unpack3_shuffle128(any_vec<32,V>& qa, any_vec<32,V>& qb, any_vec<32,V>& qc)
{
    // Shuffle the vectors so that the lower halves of the three results
    // contain the first three 128-bit blocks of the concatenation (all of a
    // and the lower half of b), and the upper halves contain the rest.

    V a0, b0, c0, a1, b1, c1;

    a0 = qa.wrapped(); b0 = qb.wrapped(); c0 = qc.wrapped();

    a1 = shuffle1_128<0,1>(a0, b0);
    b1 = shuffle1_128<1,0>(a0, c0);
    c1 = shuffle1_128<0,1>(b0, c0);
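    // with [x0 .. x5] denoting the lane-wise concatenation of a, b, c
    // (a0 = [x0,x1], b0 = [x2,x3], c0 = [x4,x5]), the lanes are now
    // a1 = [x0,x3], b1 = [x1,x4], c1 = [x2,x5]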

    qa.wrapped() = a1; qb.wrapped() = b1; qc.wrapped() = c1;
}

#if SIMDPP_USE_AVX512F
template<class V> SIMDPP_INL
void v_mem_unpack3_shuffle128(any_vec<64,V>& qa, any_vec<64,V>& qb, any_vec<64,V>& qc)
{
    V a, b, c; // TODO: optimize. Using full-vector shuffle may be faster
    a = qa.wrapped(); b = qb.wrapped(); c = qc.wrapped();

    V t11, t12, t21, t22, t31, t32;
    // [a0,b0,c0,a1]
    // [b1,c1,a2,b2]
    // [c2,a3,b3,c3]
    t11 = a;
    t12 = shuffle2_128<0,1,2,3>(c, b);
    t21 = shuffle2_128<0,1,0,1>(a, b);
    t22 = shuffle2_128<2,3,2,3>(b, c);
    t31 = shuffle2_128<2,3,0,1>(a, b);
    t32 = c;
    // [a0,b0,c0,a1]
    // [c2,a3,a2,b2]
    // [a0,b0,b1,c1]
    // [a2,b2,b3,c3]
    // [c0,a1,b1,c1]
    // [c2,a3,b3,c3]
    a = shuffle2_128<0,3,2,1>(t11, t12);
    b = shuffle2_128<1,2,1,2>(t21, t22);
    c = shuffle2_128<0,3,0,3>(t31, t32);

    qa.wrapped() = a; qb.wrapped() = b; qc.wrapped() = c;
}
#endif

/** Concatenates @a a, @a b and @a c and stores the elements of the resulting
    array as follows:
     * every (3n)-th element is stored to @a a
     * every (3n+1)-th element is stored to @a b
     * every (3n+2)-th element is stored to @a c

    n = [0, <number of elements in vector> - 1]
*/
template<unsigned N> SIMDPP_INL
void mem_unpack3(uint8<N>& a, uint8<N>& b, uint8<N>& c)
{
    v_mem_unpack3_shuffle128(a, b, c);
    v_mem_unpack3_impl8_128(a, b, c);
}

template<unsigned N> SIMDPP_INL
void mem_unpack3(uint16<N>& a, uint16<N>& b, uint16<N>& c)
{
    v_mem_unpack3_shuffle128(a, b, c);
    v_mem_unpack3_impl16_128(a, b, c);
}

template<unsigned N> SIMDPP_INL
void mem_unpack3(uint32<N>& a, uint32<N>& b, uint32<N>& c)
{
    v_mem_unpack3_shuffle128(a, b, c);
    v_mem_unpack3_impl32_128(a, b, c);
}

template<unsigned N> SIMDPP_INL
void mem_unpack3(uint64<N>& a, uint64<N>& b, uint64<N>& c)
{
    v_mem_unpack3_shuffle128(a, b, c);
    v_mem_unpack3_impl64_128(a, b, c);
}

template<unsigned N> SIMDPP_INL
void mem_unpack3(float32<N>& a, float32<N>& b, float32<N>& c)
{
    v_mem_unpack3_shuffle128(a, b, c);
    v_mem_unpack3_impl32_128(a, b, c);
}

template<unsigned N> SIMDPP_INL
void mem_unpack3(float64<N>& a, float64<N>& b, float64<N>& c)
{
    v_mem_unpack3_shuffle128(a, b, c);
    v_mem_unpack3_impl64_128(a, b, c);
}
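
/*  Usage sketch (illustrative only; the pointer p, its alignment and the
    packed-RGB layout are assumptions of this example, not part of the
    interface):

        uint8<16> r, g, b;
        r = load(p);        // [r0,g0,b0,r1,g1,b1, ...]
        g = load(p + 16);
        b = load(p + 32);
        mem_unpack3(r, g, b);
        // r = [r0 .. r15], g = [g0 .. g15], b = [b0 .. b15]
*/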

/** Generic implementation of mem_unpack4. The 256-bit version applies 128-bit
    operations to each half of each vector separately.
*/
template<class T> SIMDPP_INL
void v_mem_unpack4_impl8_128(T& a, T& b, T& c, T& d)
{
#if SIMDPP_USE_SSSE3 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // TODO: optimize for Altivec and MSA
    typename same_width<T>::u32 b0, b1, b2, b3;
    b0 = transpose_inplace(a);
    b1 = transpose_inplace(b);
    b2 = transpose_inplace(c);
    b3 = transpose_inplace(d);

    transpose4(b0, b1, b2, b3);
    a = b0; b = b1; c = b2; d = b3;
#else
    // [a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3 ]
    // [a4, b4, c4, d4, a5, b5, c5, d5, a6, b6, c6, d6, a7, b7, c7, d7 ]
    // [a8, b8, c8, d8, a9, b9, c9, d9, a10,b10,c10,d10,a11,b11,c11,d11]
    // [a12,b12,c12,d12,a13,b13,c13,d13,a14,b14,c14,d14,a15,b15,c15,d15]
    T b0, b1, b2, b3, c0, c1, c2, c3;
    b0 = zip16_lo(a, b);
    b1 = zip16_hi(a, b);
    b2 = zip16_lo(c, d);
    b3 = zip16_hi(c, d);
    // [a0, a4, b0, b4, c0, c4, d0, d4, a1, a5, b1, b5, c1, c5, d1, d5 ]
    // [a2, a6, b2, b6, c2, c6, d2, d6, a3, a7, b3, b7, c3, c7, d3, d7 ]
    // [a8, a12,b8, b12,c8, c12,d8, d12,a9, a13,b9, b13,c9, c13,d9, d13]
    // [a10,a14,b10,b14,c10,c14,d10,d14,a11,a15,b11,b15,c11,c15,d11,d15]
    c0 = zip16_lo(b0, b1);
    c1 = zip16_hi(b0, b1);
    c2 = zip16_lo(b2, b3);
    c3 = zip16_hi(b2, b3);
    // [a0, a2, a4, a6, b0, b2, b4, b6, c0, c2, c4, c6, d0, d2, d4, d6 ]
    // [a1, a3, a5, a7, b1, b3, b5, b7, c1, c3, c5, c7, d1, d3, d5, d7 ]
    // [a8, a10,a12,a14,b8, b10,b12,b14,c8, c10,c12,c14,d8, d10,d12,d14]
    // [a9, a11,a13,a15,b9, b11,b13,b15,c9, c11,c13,c15,d9, d11,d13,d15]
    typename same_width<T>::u64 d0, d1, d2, d3;
    d0 = zip16_lo(c0, c1);
    d1 = zip16_hi(c0, c1);
    d2 = zip16_lo(c2, c3);
    d3 = zip16_hi(c2, c3);
    // [a0 .. a7, b0 .. b7 ]
    // [c0 .. c7, d0 .. d7 ]
    // [a8 .. a15, b8 .. b15 ]
    // [c8 .. c15, d8 .. d15 ]
    a = zip2_lo(d0, d2);
    b = zip2_hi(d0, d2);
    c = zip2_lo(d1, d3);
    d = zip2_hi(d1, d3);
#endif
}

template<class T> SIMDPP_INL
void v_mem_unpack4_impl16_128(T& a, T& b, T& c, T& d)
{
    // [a0,b0,c0,d0,a1,b1,c1,d1]
    // [a2,b2,c2,d2,a3,b3,c3,d3]
    // [a4,b4,c4,d4,a5,b5,c5,d5]
    // [a6,b6,c6,d6,a7,b7,c7,d7]
    typename same_width<T>::u16 t0, t1, t2, t3;
    t0 = zip8_lo(a, b);
    t1 = zip8_hi(a, b);
    t2 = zip8_lo(c, d);
    t3 = zip8_hi(c, d);
    // [a0,a2,b0,b2,c0,c2,d0,d2]
    // [a1,a3,b1,b3,c1,c3,d1,d3]
    // [a4,a6,b4,b6,c4,c6,d4,d6]
    // [a5,a7,b5,b7,c5,c7,d5,d7]
    typename same_width<T>::u64 u0, u1, u2, u3;
    u0 = zip8_lo(t0, t1);
    u1 = zip8_hi(t0, t1);
    u2 = zip8_lo(t2, t3);
    u3 = zip8_hi(t2, t3);
    // [a0,a1,a2,a3,b0,b1,b2,b3]
    // [c0,c1,c2,c3,d0,d1,d2,d3]
    // [a4,a5,a6,a7,b4,b5,b6,b7]
    // [c4,c5,c6,c7,d4,d5,d6,d7]
    a = zip2_lo(u0, u2);
    b = zip2_hi(u0, u2);
    c = zip2_lo(u1, u3);
    d = zip2_hi(u1, u3);
}

template<class T> SIMDPP_INL
void v_mem_unpack4_impl32_128(T& a, T& b, T& c, T& d)
{
    transpose4(a, b, c, d);
}

template<class T> SIMDPP_INL
void v_mem_unpack4_impl64_128(T& a, T& b, T& c, T& d)
{
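    // With 64-bit elements each 128-bit vector holds two values. If the
    // concatenation of a, b, c, d is [x0 .. x7], then on input
    // a = [x0,x1], b = [x2,x3], c = [x4,x5], d = [x6,x7].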
    transpose2(a, c);
    transpose2(b, d);
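    // a = [x0,x4], c = [x1,x5], b = [x2,x6], d = [x3,x7];
    // swapping b and c below produces the deinterleaved order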
    T t;
    t = b;
    b = c;
    c = t;
}

template<class V> SIMDPP_INL
void v_mem_unpack4_shuffle128(any_vec<16,V>& qa, any_vec<16,V>& qb,
                              any_vec<16,V>& qc, any_vec<16,V>& qd)
{
    (void) qa; (void) qb; (void) qc; (void) qd;
}

template<class V> SIMDPP_INL
void v_mem_unpack4_shuffle128(any_vec<32,V>& qa, any_vec<32,V>& qb,
                              any_vec<32,V>& qc, any_vec<32,V>& qd)
{
    V a0, b0, c0, d0, a1, b1, c1, d1;

    a0 = qa.wrapped(); b0 = qb.wrapped(); c0 = qc.wrapped(); d0 = qd.wrapped();
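    // with [x0 .. x7] denoting the lane-wise concatenation:
    // a0 = [x0,x1], b0 = [x2,x3], c0 = [x4,x5], d0 = [x6,x7]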

    a1 = shuffle1_128<0,0>(a0, c0);
    b1 = shuffle1_128<1,1>(a0, c0);
    c1 = shuffle1_128<0,0>(b0, d0);
    d1 = shuffle1_128<1,1>(b0, d0);
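    // a1 = [x0,x4], b1 = [x1,x5], c1 = [x2,x6], d1 = [x3,x7]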

    qa.wrapped() = a1; qb.wrapped() = b1; qc.wrapped() = c1; qd.wrapped() = d1;
}

#if SIMDPP_USE_AVX512F
template<class V> SIMDPP_INL
void v_mem_unpack4_shuffle128(any_vec<64,V>& qa, any_vec<64,V>& qb,
                              any_vec<64,V>& qc, any_vec<64,V>& qd)
{
    V a, b, c, d; // TODO: optimize. Using full-vector shuffle/permute will be faster

    a = qa.wrapped(); b = qb.wrapped(); c = qc.wrapped(); d = qd.wrapped();

    V t1, t2, t3, t4;
    // [a0,a1,a2,a3]
    // [b0,b1,b2,b3]
    // [c0,c1,c2,c3]
    // [d0,d1,d2,d3]
    t1 = shuffle2_128<0,2,0,2>(a, b);
    t2 = shuffle2_128<1,3,1,3>(a, b);
    t3 = shuffle2_128<0,2,0,2>(c, d);
    t4 = shuffle2_128<1,3,1,3>(c, d);
    // [a0,a2,b0,b2]
    // [a1,a3,b1,b3]
    // [c0,c2,d0,d2]
    // [c1,c3,d1,d3]
    a = shuffle2_128<0,2,0,2>(t1, t3);
    b = shuffle2_128<0,2,0,2>(t2, t4);
    c = shuffle2_128<1,3,1,3>(t1, t3);
    d = shuffle2_128<1,3,1,3>(t2, t4);
    // [a0,b0,c0,d0]
    // [a1,b1,c1,d1]
    // [a2,b2,c2,d2]
    // [a3,b3,c3,d3]

    qa.wrapped() = a; qb.wrapped() = b; qc.wrapped() = c; qd.wrapped() = d;
}
#endif

/** Concatenates @a a, @a b, @a c and @a d and stores the elements of the
    resulting array as follows:
     * every (4n)-th element is stored to @a a
     * every (4n+1)-th element is stored to @a b
     * every (4n+2)-th element is stored to @a c
     * every (4n+3)-th element is stored to @a d

    n = [0, <number of elements in vector> - 1]
*/
// @icost{SSE2, SSE3, 16}
// @icost{SSSE3, SSE4.1, 12}
template<unsigned N> SIMDPP_INL
void mem_unpack4(uint8<N>& a, uint8<N>& b, uint8<N>& c, uint8<N>& d)
{
    v_mem_unpack4_shuffle128(a, b, c, d);
    v_mem_unpack4_impl8_128(a, b, c, d);
}

template<unsigned N> SIMDPP_INL
void mem_unpack4(uint16<N>& a, uint16<N>& b, uint16<N>& c, uint16<N>& d)
{
    v_mem_unpack4_shuffle128(a, b, c, d);
    v_mem_unpack4_impl16_128(a, b, c, d);
}

template<unsigned N> SIMDPP_INL
void mem_unpack4(uint32<N>& a, uint32<N>& b, uint32<N>& c, uint32<N>& d)
{
    v_mem_unpack4_shuffle128(a, b, c, d);
    v_mem_unpack4_impl32_128(a, b, c, d);
}

template<unsigned N> SIMDPP_INL
void mem_unpack4(uint64<N>& a, uint64<N>& b, uint64<N>& c, uint64<N>& d)
{
    v_mem_unpack4_shuffle128(a, b, c, d);
    v_mem_unpack4_impl64_128(a, b, c, d);
}

template<unsigned N> SIMDPP_INL
void mem_unpack4(float32<N>& a, float32<N>& b, float32<N>& c, float32<N>& d)
{
    v_mem_unpack4_shuffle128(a, b, c, d);
    v_mem_unpack4_impl32_128(a, b, c, d);
}

template<unsigned N> SIMDPP_INL
void mem_unpack4(float64<N>& a, float64<N>& b, float64<N>& c, float64<N>& d)
{
    v_mem_unpack4_shuffle128(a, b, c, d);
    v_mem_unpack4_impl64_128(a, b, c, d);
}
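
/*  Usage sketch (illustrative only; the pointer p, its alignment and the
    packed-RGBA layout are assumptions of this example, not part of the
    interface):

        uint8<16> r, g, b, a;
        r = load(p);        // [r0,g0,b0,a0,r1,g1,b1,a1, ...]
        g = load(p + 16);
        b = load(p + 32);
        a = load(p + 48);
        mem_unpack4(r, g, b, a);
        // r = [r0 .. r15], g = [g0 .. g15], b = [b0 .. b15], a = [a0 .. a15]
*/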

/** Concatenates the given vectors and stores the elements of the resulting
    array as follows:
     * every (3n)-th element of the first half of the array is stored to @a a
     * every (3n+1)-th element of the first half of the array is stored to @a b
     * every (3n+2)-th element of the first half of the array is stored to @a c
     * every (3n)-th element of the second half of the array is stored to @a d
     * every (3n+1)-th element of the second half of the array is stored to @a e
     * every (3n+2)-th element of the second half of the array is stored to @a f

    n = [0, <number of elements in vector> - 1]
*/
static SIMDPP_INL
void mem_unpack6(uint8x16& a, uint8x16& b, uint8x16& c,
                 uint8x16& d, uint8x16& e, uint8x16& f)
{
    uint8x16 t0, t1, t2, t3, t4, t5;
    t0 = zip16_lo(a, d);
    t1 = zip16_hi(a, d);
    t2 = zip16_lo(b, e);
    t3 = zip16_hi(b, e);
    t4 = zip16_lo(c, f);
    t5 = zip16_hi(c, f);

    uint8x16 u0, u1, u2, u3, u4, u5;
    u0 = zip16_lo(t0, t3);
    u1 = zip16_hi(t0, t3);
    u2 = zip16_lo(t1, t4);
    u3 = zip16_hi(t1, t4);
    u4 = zip16_lo(t2, t5);
    u5 = zip16_hi(t2, t5);

    t0 = zip16_lo(u0, u3);
    t1 = zip16_hi(u0, u3);
    t2 = zip16_lo(u1, u4);
    t3 = zip16_hi(u1, u4);
    t4 = zip16_lo(u2, u5);
    t5 = zip16_hi(u2, u5);

    u0 = zip16_lo(t0, t3);
    u1 = zip16_hi(t0, t3);
    u2 = zip16_lo(t1, t4);
    u3 = zip16_hi(t1, t4);
    u4 = zip16_lo(t2, t5);
    u5 = zip16_hi(t2, t5);

    t0 = zip16_lo(u0, u3);
    t1 = zip16_hi(u0, u3);
    t2 = zip16_lo(u1, u4);
    t3 = zip16_hi(u1, u4);
    t4 = zip16_lo(u2, u5);
    t5 = zip16_hi(u2, u5);

    a = zip16_lo(t0, t3);
    b = zip16_hi(t0, t3);
    c = zip16_lo(t1, t4);
    d = zip16_hi(t1, t4);
    e = zip16_lo(t2, t5);
    f = zip16_hi(t2, t5);
}

static SIMDPP_INL
void mem_unpack6(uint16x8& a, uint16x8& b, uint16x8& c,
                 uint16x8& d, uint16x8& e, uint16x8& f)
{
    uint16x8 t0, t1, t2, t3, t4, t5;
    t0 = zip8_lo(a, d);
    t1 = zip8_hi(a, d);
    t2 = zip8_lo(b, e);
    t3 = zip8_hi(b, e);
    t4 = zip8_lo(c, f);
    t5 = zip8_hi(c, f);

    uint16x8 u0, u1, u2, u3, u4, u5;
    u0 = zip8_lo(t0, t3);
    u1 = zip8_hi(t0, t3);
    u2 = zip8_lo(t1, t4);
    u3 = zip8_hi(t1, t4);
    u4 = zip8_lo(t2, t5);
    u5 = zip8_hi(t2, t5);

    a = zip8_lo(u0, u3);
    b = zip8_hi(u0, u3);
    c = zip8_lo(u1, u4);
    d = zip8_hi(u1, u4);
    e = zip8_lo(u2, u5);
    f = zip8_hi(u2, u5);
}
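
// For illustration only (not part of the interface): with uint16x8 inputs
// holding the concatenation [x0 .. x47], mem_unpack6 leaves
//   a = [x0,x3, ..,x21]    b = [x1,x4, ..,x22]    c = [x2,x5, ..,x23]
//   d = [x24,x27, ..,x45]  e = [x25,x28, ..,x46]  f = [x26,x29, ..,x47]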

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif