1 | /* Copyright (C) 2013-2014 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_MEM_PACK_H |
9 | #define LIBSIMDPP_SIMDPP_DETAIL_INSN_MEM_PACK_H |
10 | |
11 | #ifndef LIBSIMDPP_SIMD_H |
12 | #error "This file must be included through simd.h" |
13 | #endif |
14 | |
15 | #include <simdpp/types.h> |
16 | #include <simdpp/detail/width.h> |
17 | #include <simdpp/detail/insn/shuffle128.h> |
18 | #include <simdpp/core/bit_andnot.h> |
19 | #include <simdpp/core/bit_or.h> |
20 | #include <simdpp/core/move_l.h> |
21 | #include <simdpp/core/move_r.h> |
22 | #include <simdpp/core/permute4.h> |
23 | #include <simdpp/core/shuffle2.h> |
24 | #include <simdpp/core/transpose.h> |
25 | #include <simdpp/core/zip_hi.h> |
26 | #include <simdpp/core/zip_lo.h> |
27 | #include <simdpp/detail/insn/zip128.h> |
28 | |
29 | namespace simdpp { |
30 | namespace SIMDPP_ARCH_NAMESPACE { |
31 | namespace detail { |
32 | namespace insn { |
33 | |
/** Interleaves the elements of @a a and @a b in such a way that:
35 | * every (2n)-th element comes from @a a |
36 | * every (2n+1)-th element comes from @a b |
37 | |
38 | n = [0, <number of elements in vector> - 1] |
39 | */ |
40 | template<class V> SIMDPP_INL |
41 | void mem_pack2(any_vec<16,V>& qa, any_vec<16,V>& qb) |
42 | { |
43 | V a = qa.wrapped(); |
44 | V b = qb.wrapped(); |
45 | |
46 | qa.wrapped() = zip128_lo(a, b); |
47 | qb.wrapped() = zip128_hi(a, b); |
48 | } |
49 | |
50 | template<class V> SIMDPP_INL |
51 | void mem_pack2(any_vec<32,V>& qa, any_vec<32,V>& qb) |
52 | { |
53 | V a = qa.wrapped(); |
54 | V b = qb.wrapped(); |
55 | |
56 | V c1, c2; |
57 | c1 = zip128_lo(a, b); |
58 | c2 = zip128_hi(a, b); |
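    // with lo_i/hi_i denoting the zipped low/high halves of the i-th
    // 128-bit lanes of a and b: c1 = [lo0,lo1], c2 = [hi0,hi1]; the
    // shuffles below reorder these to [lo0,hi0] and [lo1,hi1]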
59 | qa.wrapped() = shuffle1_128<0,0>(c1, c2); |
60 | qb.wrapped() = shuffle1_128<1,1>(c1, c2); |
61 | } |
62 | |
63 | #if SIMDPP_USE_AVX512F || SIMDPP_USE_AVX512BW |
64 | template<class V> SIMDPP_INL |
65 | void mem_pack2(any_vec<64,V>& qa, any_vec<64,V>& qb) |
66 | { |
67 | V a = qa.wrapped(); |
68 | V b = qb.wrapped(); |
69 | |
70 | V c1, c2, d1, d2; |
71 | c1 = zip128_lo(a, b); |
72 | c2 = zip128_hi(a, b); |
73 | d1 = shuffle2_128<0,1,0,1>(c1, c2); |
74 | d2 = shuffle2_128<2,3,2,3>(c1, c2); |
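    // with lo_i/hi_i denoting the zipped low/high halves of the i-th
    // 128-bit lanes of a and b: d1 = [lo0,lo1,hi0,hi1], d2 = [lo2,lo3,hi2,hi3];
    // the permutes below reorder these to [lo0,hi0,lo1,hi1] and
    // [lo2,hi2,lo3,hi3]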
75 | qa.wrapped() = permute4_128<0,2,1,3>(d1); // FIXME: optimize |
76 | qb.wrapped() = permute4_128<0,2,1,3>(d2); |
77 | } |
78 | #endif |
79 | |
80 | /** Generic implementation of mem_pack3. The 256-bit version applies 128-bit |
81 | operations to each half of each vector separately. |
82 | */ |
83 | template<class T> SIMDPP_INL |
84 | void v_mem_pack3_impl8_128(T& a, T& b, T& c) |
85 | { |
86 | #if SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA |
87 | using U = typename T::uint_vector_type; |
88 | |
89 | T a1, b1, c1; |
90 | a1 = align16<11>(a, a); |
91 | b1 = b; |
92 | c1 = align16<5>(c, c); |
93 | |
94 | // [a11..a15,a0..a10] |
95 | // [b0..b15] |
96 | // [c5..c15,c0..c4] |
97 | U mask1 = make_uint(0xff); |
98 | mask1 = move16_l<5>(mask1); |
99 | |
100 | T a2, b2, c2; |
101 | a2 = blend(a1, b1, mask1); |
102 | b2 = blend(b1, c1, mask1); |
103 | c2 = blend(c1, a1, mask1); |
104 | // [a11,a12,a13,a14,a15,a0, a1, a2, a3, a4, a5, b11,b12,b13,b14,b15] |
105 | // [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10,c0, c1, c2, c3, c4 ] |
106 | // [c5, c6, c7, c8, c9, c10,c11,c12,c13,c14,c15,a6, a7, a8, a9, a10] |
107 | U mask2 = make_shuffle_bytes16_mask<5, 16+0, 16+11, |
108 | 6, 16+1, 16+12, |
109 | 7, 16+2, 16+13, |
110 | 8, 16+3, 16+14, |
111 | 9, 16+4, 16+15, |
112 | 10>(mask2); |
113 | a = shuffle_bytes16(a2, b2, mask2); |
114 | b = shuffle_bytes16(b2, c2, mask2); |
115 | c = shuffle_bytes16(c2, a2, mask2); |
116 | |
117 | // [a0, b0, c0, a1, b1, c1, a2, b2, c2, a3, b3, c3, a4, b4, c4, a5 ] |
118 | // [b5, c5, a6, b6, c6, a7, b7, c7, a8, b8, c8, a9, b9, c9, a10,b10] |
119 | // [c10,a11,b11,c11,a12,b12,c12,a13,b13,c13,a14,b14,c14,a15,b15,c15] |
120 | #else |
121 | // either uint16x8 or uint16x16, other entries likewise |
122 | using w_b16 = typename same_width<T>::u16; |
123 | using w_b32 = typename same_width<T>::u32; |
124 | using w_b8 = T; |
125 | |
126 | w_b16 t0, t1, t2, t3; |
127 | t0 = zip16_lo(a, b); |
128 | t1 = zip16_hi(a, b); |
129 | t2 = zip16_lo(c, (w_b8) make_zero()); |
130 | t3 = zip16_hi(c, (w_b8) make_zero()); |
131 | |
132 | w_b8 u0, u1, u2, u3; |
133 | u0 = zip8_lo(t0, t2); |
134 | u1 = zip8_hi(t0, t2); |
135 | u2 = zip8_lo(t1, t3); |
136 | u3 = zip8_hi(t1, t3); |
137 | |
138 | // [a0, b0, c0, 0, a1, b1, c1, 0, a2, b2, c2, 0, a3, b3, c3, 0] |
139 | // [a4, b4, c4, 0, a5, b5, c5, 0, a6, b6, c6, 0, a7, b7, c7, 0] |
140 | // [a8, b8, c8, 0, a9, b9, c9, 0, a10,b10,c10,0, a11,b11,c11,0] |
141 | // [a12,b12,c12,0, a13,b13,c13,0, a14,b14,c14,0, a15,b15,c15,0] |
142 | #if SIMDPP_USE_SSSE3 |
    // it's not worth using 4 different index vectors to shuffle the vectors
    // into place so that only bit_or is needed to combine them later
145 | w_b8 idx = make_uint(0, 1, 2, 4, 5, 6, 8, 9, |
146 | 10, 12, 13, 14, 0xff, 0xff, 0xff, 0xff); |
147 | u0 = permute_bytes16(u0, idx); |
148 | u1 = permute_bytes16(u1, idx); |
149 | u2 = permute_bytes16(u2, idx); |
150 | u3 = permute_bytes16(u3, idx); |
151 | #else |
152 | using w_u64 = typename same_width<T>::u64; |
153 | |
    // the following is still faster than a non-SIMD implementation
155 | w_b8 mask1 = make_uint(0xff, 0xff, 0xff, 0, 0, 0, 0, 0, |
156 | 0xff, 0xff, 0xff, 0, 0, 0, 0, 0); |
157 | w_u64 w0, w1, w2, w3; |
158 | w0 = u0; w1 = u1; w2 = u2; w3 = u3; |
159 | w0 = shift_r(w0, 8); |
160 | w1 = shift_r(w1, 8); |
161 | w2 = shift_r(w2, 8); |
162 | w3 = shift_r(w3, 8); |
163 | |
164 | u0 = blend(u0, w0, mask1); |
165 | u1 = blend(u1, w1, mask1); |
166 | u2 = blend(u2, w2, mask1); |
167 | u3 = blend(u3, w3, mask1); |
168 | |
169 | w_b8 mask2 = make_uint(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, |
170 | 0, 0, 0, 0, 0, 0, 0, 0); |
171 | w_b8 x0, x1, x2, x3; |
172 | x0 = move16_l<2>(u0); |
173 | x1 = move16_l<2>(u1); |
174 | x2 = move16_l<2>(u2); |
175 | x3 = move16_l<2>(u3); |
176 | |
177 | u0 = blend(u0, x0, mask2); |
178 | u1 = blend(u1, x1, mask2); |
179 | u2 = blend(u2, x2, mask2); |
180 | u3 = blend(u3, x3, mask2); |
181 | #endif |
182 | // [a0, b0, c0, a1, b1, c1, a2, b2, c2, a3, b3, c3, 0,0,0,0] |
183 | // [a4, b4, c4, a5, b5, c5, a6, b6, c6, a7, b7, c7, 0,0,0,0] |
184 | // [a8, b8, c8, a9, b9, c9, a10,b10,c10,a11,b11,c11,0,0,0,0] |
185 | // [a12,b12,c12,a13,b13,c13,a14,b14,c14,a15,b15,c15,0,0,0,0] |
186 | w_b32 k0, k1, k2, k3, l0, l3; |
187 | k0 = u0; |
188 | k1 = u1; |
189 | k2 = u2; |
190 | k3 = u3; |
191 | l0 = move4_r<3>(k1); |
192 | l3 = move4_l<2>(k2); |
193 | k3 = move4_r<1>(k3); |
194 | a = bit_or(k0, l0); |
195 | b = shuffle2<1,2,0,1>(k1, k2); |
196 | c = bit_or(k3, l3); |
197 | #endif |
198 | } |
199 | |
200 | template<class T> SIMDPP_INL |
201 | void v_mem_pack3_impl16_128(T& a, T& b, T& c) |
202 | { |
203 | #if SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA |
204 | using U = typename T::uint_vector_type; |
205 | |
206 | // [a0..a7] |
207 | // [b0..b7] |
208 | // [c0..c7] |
209 | T a1, b1, c1; |
210 | a1 = a; |
211 | b1 = align8<5>(b, b); |
212 | c1 = align8<2>(c, c); |
213 | |
214 | // [a0..a7] |
215 | // [b5..b7,b0..b4] |
216 | // [c2..c7,c0,c1] |
217 | T a2, b2, c2; |
218 | U mask2 = make_uint(0xffff); |
219 | mask2 = move8_l<2>(mask2); |
220 | |
221 | a2 = blend(a1, b1, mask2); |
222 | b2 = blend(b1, c1, mask2); |
223 | c2 = blend(c1, a1, mask2); |
224 | |
225 | // [a0,a1,a2,a3,a4,a5,b3,b4] |
226 | // [b5,b6,b7,b0,b1,b2,c0,c1] |
227 | // [c2,c3,c4,c5,c6,c7,a6,a7] |
228 | U mask1 = make_shuffle_bytes16_mask<0, 8+3, 8+6, |
229 | 1, 8+4, 8+7, |
230 | 2, 8+5>(mask1); |
231 | a = shuffle_bytes16(a2, b2, mask1); |
232 | b = shuffle_bytes16(c2, a2, mask1); |
233 | c = shuffle_bytes16(b2, c2, mask1); |
234 | |
235 | // [a0,b0,c0,a1,b1,c1,a2,b2] |
236 | // [c2,a3,b3,c3,a4,b4,c4,a5] |
237 | // [b5,c5,a6,b6,c6,a7,b7,c7] |
238 | |
239 | #else |
    // either uint16x8 or uint16x16, other entries likewise
241 | using w_b16 = T; |
242 | using w_b32 = typename same_width<T>::u32; |
243 | |
244 | w_b32 t0, t1, t2, t3; |
245 | t0 = zip8_lo(a, b); |
246 | t1 = zip8_hi(a, b); |
247 | t2 = zip8_lo(c, (w_b16) make_zero()); |
248 | t3 = zip8_hi(c, (w_b16) make_zero()); |
249 | |
250 | w_b16 u0, u1, u2, u3; |
251 | u0 = zip4_lo(t0, t2); |
252 | u1 = zip4_hi(t0, t2); |
253 | u2 = zip4_lo(t1, t3); |
254 | u3 = zip4_hi(t1, t3); |
255 | |
256 | // [a0, b0, c0, 0, a1, b1, c1, 0 ] |
257 | // [a2, b2, c2, 0, a3, b3, c3, 0 ] |
258 | // [a4, b4, c4, 0, a5, b5, c5, 0 ] |
259 | // [a6, b6, c6, 0, a7, b7, c7, 0 ] |
260 | |
261 | #if SIMDPP_USE_SSSE3 |
    // it's not worth using 4 different index vectors to shuffle the vectors
    // into place so that only bit_or is needed to combine them later
264 | w_b16 idx = make_shuffle_bytes16_mask<0,1,2,4,5,6,-1,-1>(idx); |
265 | u0 = permute_bytes16(u0, idx); |
266 | u1 = permute_bytes16(u1, idx); |
267 | u2 = permute_bytes16(u2, idx); |
268 | u3 = permute_bytes16(u3, idx); |
269 | |
270 | #else |
    // the following is still faster than a non-SIMD implementation
272 | w_b16 mask2 = make_uint(0xffff, 0xffff, 0xffff, 0, |
273 | 0, 0, 0, 0); |
274 | u0 = blend(u0, move8_l<1>(u0), mask2); |
275 | u1 = blend(u1, move8_l<1>(u1), mask2); |
276 | u2 = blend(u2, move8_l<1>(u2), mask2); |
277 | u3 = blend(u3, move8_l<1>(u3), mask2); |
278 | #endif |
279 | // [a0, b0, c0, a1, b1, c1, 0, 0] |
280 | // [a2, b2, c2, a3, b3, c3, 0, 0] |
281 | // [a4, b4, c4, a5, b5, c5, 0, 0] |
282 | // [a6, b6, c6, a7, b7, c7, 0, 0] |
283 | w_b32 k0, k1, k2, k3, l0, l3; |
284 | k0 = u0; |
285 | k1 = u1; |
286 | k2 = u2; |
287 | k3 = u3; |
288 | l0 = move4_r<3>(k1); |
289 | l3 = move4_l<2>(k2); |
290 | k3 = move4_r<1>(k3); |
291 | a = bit_or(k0, l0); |
292 | b = shuffle2<1,2,0,1>(k1, k2); |
293 | c = bit_or(k3, l3); |
294 | #endif |
295 | } |
296 | |
297 | template<class T> SIMDPP_INL |
298 | void v_mem_pack3_impl32_128(T& a, T& b, T& c) |
299 | { |
300 | #if SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA |
301 | using U = typename T::uint_vector_type; |
302 | |
303 | // [a0,a1,a2,a3] |
304 | // [b0,b1,b2,b3] |
305 | // [c0,c1,c2,c3] |
306 | T a1, b1, c1; |
307 | a1 = a; |
308 | b1 = align4<1>(b, b); |
309 | c1 = align4<2>(c, c); |
310 | |
311 | // [a0,a1,a2,a3] |
312 | // [b1,b2,b3,b0] |
313 | // [c2,c3,c0,c1] |
314 | T a2, b2, c2; |
315 | U mask2 = make_uint(0xffffffff); |
316 | mask2 = move4_l<1>(mask2); |
317 | |
318 | a2 = blend(a1, c1, mask2); |
319 | b2 = blend(b1, a1, mask2); |
320 | c2 = blend(c1, b1, mask2); |
321 | // [a0,a1,a2,c1] |
322 | // [b1,b2,b3,a3] |
323 | // [c2,c3,c0,b0] |
324 | U mask1 = make_shuffle_bytes16_mask<0,4+3,4+2,1>(mask1); |
325 | a = shuffle_bytes16(a2, c2, mask1); |
326 | b = shuffle_bytes16(b2, a2, mask1); |
327 | c = shuffle_bytes16(c2, b2, mask1); |
328 | // [a0,b0,c0,a1] |
329 | // [b1,c1,a2,b2] |
330 | // [c2,a3,b3,c3] |
331 | #else |
332 | T t0, t1, t2; |
333 | t0 = shuffle2<0,2,0,2>(a, b); |
334 | t1 = shuffle2<0,2,1,3>(c, a); |
335 | t2 = shuffle2<1,3,1,3>(b, c); |
336 | // [a0,a2,b0,b2] |
337 | // [c0,c2,a1,a3] |
338 | // [b1,b3,c1,c3] |
339 | t0 = permute4<0,2,1,3>(t0); |
340 | t1 = permute4<0,2,1,3>(t1); |
341 | t2 = permute4<0,2,1,3>(t2); |
342 | // [a0,b0,a2,b2] |
343 | // [c0,a1,c2,a3] |
344 | // [b1,c1,b3,c3] |
345 | a = shuffle2<0,1,0,1>(t0, t1); |
346 | b = shuffle2<0,1,2,3>(t2, t0); |
347 | c = shuffle2<2,3,2,3>(t1, t2); |
348 | #endif |
349 | } |
350 | |
351 | template<class T> SIMDPP_INL |
352 | void v_mem_pack3_impl64_128(T& a, T& b, T& c) |
353 | { |
354 | T d0, d1, d2; |
355 | d0 = shuffle1<0,0>(a, b); |
356 | d1 = shuffle1<0,1>(c, a); |
357 | d2 = shuffle1<1,1>(b, c); |
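    // d0 = [a0,b0], d1 = [c0,a1], d2 = [b1,c1]; stored sequentially these
    // yield the interleaved order a0,b0,c0,a1,b1,c1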
358 | a = d0; b = d1; c = d2; |
359 | } |
360 | |
361 | template<class V> SIMDPP_INL |
362 | void v_mem_pack3_shuffle128(any_vec<16,V>& qa, any_vec<16,V>& qb, any_vec<16,V>& qc) |
363 | { |
364 | (void) qa; (void) qb; (void) qc; |
365 | } |
366 | |
367 | template<class V> SIMDPP_INL |
368 | void v_mem_pack3_shuffle128(any_vec<32,V>& qa, any_vec<32,V>& qb, any_vec<32,V>& qc) |
369 | { |
    // shuffle the vectors so that a and the lower half of b contain the
    // first three 128-bit items of the packed stream, and the upper half
    // of b and c contain the rest
372 | |
373 | V a0, b0, c0, a1, b1, c1; |
374 | |
375 | a0 = qa.wrapped(); b0 = qb.wrapped(); c0 = qc.wrapped(); |
376 | |
377 | a1 = shuffle1_128<0,0>(a0, b0); |
378 | b1 = shuffle1_128<0,1>(c0, a0); |
379 | c1 = shuffle1_128<1,1>(b0, c0); |
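    // after the impl functions, the packed data for source lane i occupies
    // a0.lane_i, b0.lane_i and c0.lane_i, in that order; thus
    // a1 = [a0.lane0, b0.lane0], b1 = [c0.lane0, a0.lane1] and
    // c1 = [b0.lane1, c0.lane1] store the lane-0 stream followed by the
    // lane-1 stream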
380 | |
381 | qa.wrapped() = a1; qb.wrapped() = b1; qc.wrapped() = c1; |
382 | } |
383 | |
384 | #if SIMDPP_USE_AVX512F |
385 | template<class V> SIMDPP_INL |
386 | void v_mem_pack3_shuffle128(any_vec<64,V>& qa, any_vec<64,V>& qb, any_vec<64,V>& qc) |
387 | { |
388 | V a, b, c; // TODO: optimize. Using full-vector shuffle may be faster |
389 | a = qa.wrapped(); b = qb.wrapped(); c = qc.wrapped(); |
390 | |
391 | V t0, t1, t2; |
392 | t0 = shuffle2_128<0,2,0,2>(a, b); |
393 | t1 = shuffle2_128<0,2,1,3>(c, a); |
394 | t2 = shuffle2_128<1,3,1,3>(b, c); |
395 | // [a0,a2,b0,b2] |
396 | // [c0,c2,a1,a3] |
397 | // [b1,b3,c1,c3] |
398 | t0 = permute4_128<0,2,1,3>(t0); |
399 | t1 = permute4_128<0,2,1,3>(t1); |
400 | t2 = permute4_128<0,2,1,3>(t2); |
401 | // [a0,b0,a2,b2] |
402 | // [c0,a1,c2,a3] |
403 | // [b1,c1,b3,c3] |
404 | a = shuffle2_128<0,1,0,1>(t0, t1); |
405 | b = shuffle2_128<0,1,2,3>(t2, t0); |
406 | c = shuffle2_128<2,3,2,3>(t1, t2); |
407 | |
408 | qa.wrapped() = a; qb.wrapped() = b; qc.wrapped() = c; |
409 | } |
410 | #endif |
411 | |
/** Interleaves the elements of @a a, @a b and @a c in such a way that:
413 | * every (3n)-th element comes from @a a |
414 | * every (3n+1)-th element comes from @a b |
415 | * every (3n+2)-th element comes from @a c |
416 | |
417 | n = [0, <number of elements in vector> - 1] |
418 | */ |
419 | template<unsigned N> SIMDPP_INL |
420 | void mem_pack3(uint8<N>& a, uint8<N>& b, uint8<N>& c) |
421 | { |
422 | v_mem_pack3_impl8_128(a, b, c); |
423 | v_mem_pack3_shuffle128(a, b, c); |
424 | } |
425 | |
426 | template<unsigned N> SIMDPP_INL |
427 | void mem_pack3(uint16<N>& a, uint16<N>& b, uint16<N>& c) |
428 | { |
429 | v_mem_pack3_impl16_128(a, b, c); |
430 | v_mem_pack3_shuffle128(a, b, c); |
431 | } |
432 | |
433 | template<unsigned N> SIMDPP_INL |
434 | void mem_pack3(uint32<N>& a, uint32<N>& b, uint32<N>& c) |
435 | { |
436 | v_mem_pack3_impl32_128(a, b, c); |
437 | v_mem_pack3_shuffle128(a, b, c); |
438 | } |
439 | |
440 | template<unsigned N> SIMDPP_INL |
441 | void mem_pack3(uint64<N>& a, uint64<N>& b, uint64<N>& c) |
442 | { |
443 | v_mem_pack3_impl64_128(a, b, c); |
444 | v_mem_pack3_shuffle128(a, b, c); |
445 | } |
446 | |
447 | template<unsigned N> SIMDPP_INL |
448 | void mem_pack3(float32<N>& a, float32<N>& b, float32<N>& c) |
449 | { |
450 | v_mem_pack3_impl32_128(a, b, c); |
451 | v_mem_pack3_shuffle128(a, b, c); |
452 | } |
453 | |
454 | template<unsigned N> SIMDPP_INL |
455 | void mem_pack3(float64<N>& a, float64<N>& b, float64<N>& c) |
456 | { |
457 | v_mem_pack3_impl64_128(a, b, c); |
458 | v_mem_pack3_shuffle128(a, b, c); |
459 | } |
460 | |
461 | /** Generic implementation of mem_pack4. The 256-bit version applies 128-bit |
462 | operations to each half of each vector separately. |
463 | */ |
464 | template<class T> SIMDPP_INL |
465 | void v_mem_pack4_impl8_128(T& a, T& b, T& c, T& d) |
466 | { |
    // T is either uint8x16 or uint8x32; the same_width aliases below match
    // its width
468 | #if SIMDPP_USE_SSSE3 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA |
469 | // TODO: optimize for altivec |
470 | using w_b32 = typename same_width<T>::u32; |
471 | |
472 | w_b32 b0, b1, b2, b3; |
473 | b0 = a; b1 = b; b2 = c; b3 = d; |
474 | transpose4(b0, b1, b2, b3); |
475 | a = b0; b = b1; c = b2; d = b3; |
476 | |
477 | a = transpose_inplace(a); |
478 | b = transpose_inplace(b); |
479 | c = transpose_inplace(c); |
480 | d = transpose_inplace(d); |
481 | #else |
482 | using w_b8 = T; |
483 | using w_b16 = typename same_width<T>::u16; |
484 | using w_b64 = typename same_width<T>::u64; |
485 | |
486 | w_b8 e0, e1, e2, e3; |
487 | w_b64 d0, d1, d2, d3; |
488 | d0 = a; d1 = b; d2 = c; d3 = d; |
489 | e0 = zip2_lo(d0, d2); |
490 | e1 = zip2_lo(d1, d3); |
491 | e2 = zip2_hi(d0, d2); |
492 | e3 = zip2_hi(d1, d3); |
493 | // [a0 .. a7, c0 .. c7 ] |
494 | // [b0 .. b7, d0 .. d7 ] |
495 | // [a8 .. a15, c8 .. c15 ] |
496 | // [b8 .. b15, d8 .. d15 ] |
497 | w_b16 f0, f1, f2, f3; |
498 | f0 = zip16_lo(e0, e1); |
499 | f1 = zip16_hi(e0, e1); |
500 | f2 = zip16_lo(e2, e3); |
501 | f3 = zip16_hi(e2, e3); |
502 | // [a0, b0, a1, b1, a2, b2, a3, b3, a4, b4, a5, b5, a6, b6, a7, b7 ] |
503 | // [c0, d0, c1, d1, c2, d2, c3, d3, c4, d4, c5, d5, c6, d6, c7, d7 ] |
504 | // [a8, b8, a9, b9, a10,b10,a11,b11,a12,b12,a13,b13,a14,b14,a15,b15] |
505 | // [c8, d8, c9, d9, c10,d10,c11,d11,c12,d12,c13,d13,c14,d14,c15,d15] |
506 | a = zip8_lo(f0, f1); |
507 | b = zip8_hi(f0, f1); |
508 | c = zip8_lo(f2, f3); |
509 | d = zip8_hi(f2, f3); |
510 | #endif |
511 | } |
512 | |
513 | template<class T> SIMDPP_INL |
514 | void v_mem_pack4_impl16_128(T& a, T& b, T& c, T& d) |
515 | { |
516 | using w_b16 = T; |
517 | using w_b32 = typename same_width<T>::u32; |
518 | using w_b64 = typename same_width<T>::u64; |
519 | |
520 | w_b16 e0, e1, e2, e3; |
521 | w_b64 d0, d1, d2, d3; |
522 | d0 = a; d1 = b; d2 = c; d3 = d; |
523 | e0 = zip2_lo(d0, d2); |
524 | e1 = zip2_lo(d1, d3); |
525 | e2 = zip2_hi(d0, d2); |
526 | e3 = zip2_hi(d1, d3); |
527 | // [a0,a1,a2,a3,c0,c1,c2,c3] |
528 | // [b0,b1,b2,b3,d0,d1,d2,d3] |
529 | // [a4,a5,a6,a7,c4,c5,c6,c7] |
530 | // [b4,b5,b6,b7,d4,d5,d6,d7] |
531 | w_b32 f0, f1, f2, f3; |
532 | f0 = zip8_lo(e0, e1); |
533 | f1 = zip8_hi(e0, e1); |
534 | f2 = zip8_lo(e2, e3); |
535 | f3 = zip8_hi(e2, e3); |
536 | // [a0,b0,a1,b1,a2,b2,a3,b3] |
537 | // [c0,d0,c1,d1,c2,d2,c3,d3] |
538 | // [a4,b4,a5,b5,a6,b6,a7,b7] |
539 | // [c4,d4,c5,d5,c6,d6,c7,d7] |
540 | a = zip4_lo(f0, f1); |
541 | b = zip4_hi(f0, f1); |
542 | c = zip4_lo(f2, f3); |
543 | d = zip4_hi(f2, f3); |
544 | } |
545 | |
546 | template<class T> SIMDPP_INL |
547 | void v_mem_pack4_impl32_128(T& a, T& b, T& c, T& d) |
548 | { |
549 | transpose4(a, b, c, d); |
550 | } |
551 | |
552 | template<class T> SIMDPP_INL |
553 | void v_mem_pack4_impl64_128(T& a, T& b, T& c, T& d) |
554 | { |
555 | transpose2(a, b); |
556 | transpose2(c, d); |
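    // now a = [a0,b0], b = [a1,b1], c = [c0,d0], d = [c1,d1]; swapping b
    // and c below yields the interleaved store order a0,b0,c0,d0,a1,b1,c1,d1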
557 | T t; |
558 | t = b; |
559 | b = c; |
560 | c = t; |
561 | } |
562 | |
563 | template<class V> SIMDPP_INL |
564 | void v_mem_pack4_shuffle128(any_vec<16,V>& qa, any_vec<16,V>& qb, |
565 | any_vec<16,V>& qc, any_vec<16,V>& qd) |
566 | { |
567 | (void) qa; (void) qb; (void) qc; (void) qd; |
568 | } |
569 | |
570 | template<class V> SIMDPP_INL |
571 | void v_mem_pack4_shuffle128(any_vec<32,V>& qa, any_vec<32,V>& qb, |
572 | any_vec<32,V>& qc, any_vec<32,V>& qd) |
573 | { |
    // shuffle the vectors so that a and b contain the first four 128-bit
    // items of the packed stream and c and d contain the rest
576 | V a0, b0, c0, d0, a1, b1, c1, d1; |
577 | |
578 | a0 = qa.wrapped(); b0 = qb.wrapped(); c0 = qc.wrapped(); d0 = qd.wrapped(); |
579 | |
580 | a1 = shuffle1_128<0,0>(a0, b0); |
581 | b1 = shuffle1_128<0,0>(c0, d0); |
582 | c1 = shuffle1_128<1,1>(a0, b0); |
583 | d1 = shuffle1_128<1,1>(c0, d0); |
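    // a1 = [a0.lane0, b0.lane0], b1 = [c0.lane0, d0.lane0],
    // c1 = [a0.lane1, b0.lane1], d1 = [c0.lane1, d0.lane1]; sequential
    // stores emit the lane-0 packed stream followed by the lane-1 stream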
584 | |
585 | qa.wrapped() = a1; qb.wrapped() = b1; qc.wrapped() = c1; qd.wrapped() = d1; |
586 | } |
587 | |
588 | #if SIMDPP_USE_AVX512F |
589 | template<class V> SIMDPP_INL |
590 | void v_mem_pack4_shuffle128(any_vec<64,V>& qa, any_vec<64,V>& qb, |
591 | any_vec<64,V>& qc, any_vec<64,V>& qd) |
592 | { |
593 | V a, b, c, d; // TODO: optimize. Using full-vector shuffle/permute will be faster |
594 | |
595 | a = qa.wrapped(); b = qb.wrapped(); c = qc.wrapped(); d = qd.wrapped(); |
596 | |
597 | V t1, t2, t3, t4; |
598 | // [a0,a1,a2,a3] |
599 | // [b0,b1,b2,b3] |
600 | // [c0,c1,c2,c3] |
601 | // [d0,d1,d2,d3] |
602 | t1 = shuffle2_128<0,2,0,2>(a, b); |
603 | t2 = shuffle2_128<1,3,1,3>(a, b); |
604 | t3 = shuffle2_128<0,2,0,2>(c, d); |
605 | t4 = shuffle2_128<1,3,1,3>(c, d); |
606 | // [a0,a2,b0,b2] |
607 | // [a1,a3,b1,b3] |
608 | // [c0,c2,d0,d2] |
609 | // [c1,c3,d1,d3] |
610 | a = shuffle2_128<0,2,0,2>(t1, t3); |
611 | b = shuffle2_128<0,2,0,2>(t2, t4); |
612 | c = shuffle2_128<1,3,1,3>(t1, t3); |
613 | d = shuffle2_128<1,3,1,3>(t2, t4); |
614 | // [a0,b0,c0,d0] |
615 | // [a1,b1,c1,d1] |
616 | // [a2,b2,c2,d2] |
617 | // [a3,b3,c3,d3] |
618 | |
619 | |
620 | qa.wrapped() = a; qb.wrapped() = b; qc.wrapped() = c; qd.wrapped() = d; |
621 | } |
622 | #endif |
623 | |
/** Interleaves the elements of @a a, @a b, @a c and @a d in such a way that:
625 | * every (4n)-th element comes from @a a |
626 | * every (4n+1)-th element comes from @a b |
627 | * every (4n+2)-th element comes from @a c |
628 | * every (4n+3)-th element comes from @a d |
629 | |
630 | n = [0, <number of elements in vector> - 1] |
631 | */ |
632 | template<unsigned N> SIMDPP_INL |
633 | void mem_pack4(uint8<N>& a, uint8<N>& b, uint8<N>& c, uint8<N>& d) |
634 | { |
635 | v_mem_pack4_impl8_128(a, b, c, d); |
636 | v_mem_pack4_shuffle128(a, b, c, d); |
637 | } |
638 | |
639 | template<unsigned N> SIMDPP_INL |
640 | void mem_pack4(uint16<N>& a, uint16<N>& b, uint16<N>& c, uint16<N>& d) |
641 | { |
642 | v_mem_pack4_impl16_128(a, b, c, d); |
643 | v_mem_pack4_shuffle128(a, b, c, d); |
644 | } |
645 | |
646 | template<unsigned N> SIMDPP_INL |
647 | void mem_pack4(uint32<N>& a, uint32<N>& b, uint32<N>& c, uint32<N>& d) |
648 | { |
649 | v_mem_pack4_impl32_128(a, b, c, d); |
650 | v_mem_pack4_shuffle128(a, b, c, d); |
651 | } |
652 | |
653 | template<unsigned N> SIMDPP_INL |
654 | void mem_pack4(uint64<N>& a, uint64<N>& b, uint64<N>& c, uint64<N>& d) |
655 | { |
656 | v_mem_pack4_impl64_128(a, b, c, d); |
657 | v_mem_pack4_shuffle128(a, b, c, d); |
658 | } |
659 | |
660 | template<unsigned N> SIMDPP_INL |
661 | void mem_pack4(float32<N>& a, float32<N>& b, float32<N>& c, float32<N>& d) |
662 | { |
663 | v_mem_pack4_impl32_128(a, b, c, d); |
664 | v_mem_pack4_shuffle128(a, b, c, d); |
665 | } |
666 | |
667 | template<unsigned N> SIMDPP_INL |
668 | void mem_pack4(float64<N>& a, float64<N>& b, float64<N>& c, float64<N>& d) |
669 | { |
670 | v_mem_pack4_impl64_128(a, b, c, d); |
671 | v_mem_pack4_shuffle128(a, b, c, d); |
672 | } |
673 | |
674 | } // namespace insn |
675 | } // namespace detail |
676 | } // namespace SIMDPP_ARCH_NAMESPACE |
677 | } // namespace simdpp |
678 | |
679 | #endif |
680 | |
681 | |