1 | /* Copyright (C) 2013-2014 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_MEM_UNPACK_H |
9 | #define LIBSIMDPP_SIMDPP_DETAIL_INSN_MEM_UNPACK_H |
10 | |
11 | #ifndef LIBSIMDPP_SIMD_H |
12 | #error "This file must be included through simd.h" |
13 | #endif |
14 | |
15 | #include <simdpp/types.h> |
16 | #include <simdpp/detail/width.h> |
17 | #include <simdpp/detail/insn/shuffle128.h> |
18 | #include <simdpp/detail/insn/zip128.h> |
19 | #include <simdpp/core/align.h> |
20 | #include <simdpp/core/splat_n.h> |
21 | #include <simdpp/core/make_shuffle_bytes_mask.h> |
22 | #include <simdpp/core/shuffle1.h> |
23 | #include <simdpp/core/shuffle2.h> |
24 | #include <simdpp/core/transpose.h> |
25 | #include <simdpp/core/unzip_hi.h> |
26 | #include <simdpp/core/unzip_lo.h> |
27 | #include <simdpp/core/zip_hi.h> |
28 | #include <simdpp/core/zip_lo.h> |
29 | |
30 | namespace simdpp { |
31 | namespace SIMDPP_ARCH_NAMESPACE { |
32 | namespace detail { |
33 | namespace insn { |
34 | |
35 | /** Concatenates @a a and @a b and stores the elements of the resulting array |
36 | as follows: |
37 | * every (2n)-th element is stored to @a a |
38 | * every (2n+1)-th element is stored to @a b |
39 | |
40 | n = [0, <number of elements in vector> - 1] |
41 | */ |
42 | template<class V> SIMDPP_INL |
43 | void mem_unpack2(any_vec<16,V>& qa, any_vec<16,V>& qb) |
44 | { |
45 | V a = qa.wrapped(); |
46 | V b = qb.wrapped(); |
47 | |
48 | qa.wrapped() = unzip128_lo(a, b); |
49 | qb.wrapped() = unzip128_hi(a, b); |
50 | } |
51 | |
52 | template<class V> SIMDPP_INL |
53 | void mem_unpack2(any_vec<32,V>& qa, any_vec<32,V>& qb) |
54 | { |
55 | V a = qa.wrapped(); |
56 | V b = qb.wrapped(); |
57 | |
58 | V c1 = shuffle1_128<0,0>(a, b); |
59 | V c2 = shuffle1_128<1,1>(a, b); |
60 | qa.wrapped() = unzip128_lo(c1, c2); |
61 | qb.wrapped() = unzip128_hi(c1, c2); |
62 | } |
63 | |
64 | #if SIMDPP_USE_AVX512F |
65 | template<class V> SIMDPP_INL |
66 | void mem_unpack2(any_vec<64,V>& qa, any_vec<64,V>& qb) |
67 | { |
68 | V a = qa.wrapped(); |
69 | V b = qb.wrapped(); |
70 | |
71 | V c1 = shuffle2_128<0,2,0,2>(a, b); |
72 | V c2 = shuffle2_128<1,3,1,3>(a, b); |
73 | qa.wrapped() = unzip128_lo(c1, c2); |
74 | qb.wrapped() = unzip128_hi(c1, c2); |
75 | } |
76 | #endif |
77 | |
/** Generic implementation of mem_unpack3. The 128-bit lanes are processed
    independently.
*/
81 | template<class T> SIMDPP_INL |
82 | void v_mem_unpack3_impl8_128(T& a, T& b, T& c) |
83 | { |
84 | #if SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA |
85 | // [a0, b0, c0, a1, b1, c1, a2, b2, c2, a3, b3, c3, a4, b4, c4, a5 ] |
86 | // [b5, c5, a6, b6, c6, a7, b7, c7, a8, b8, c8, a9, b9, c9, a10,b10] |
87 | // [c10,a11,b11,c11,a12,b12,c12,a13,b13,c13,a14,b14,c14,a15,b15,c15] |
88 | T mask1 = make_shuffle_bytes16_mask< 1, 4, 7, 10, 13,16+0,16+3,16+6, |
89 | 16+9,16+12,16+15, 2, 5, 8, 11, 14>(mask1); |
90 | T a1, b1, c1; |
91 | a1 = shuffle_bytes16(c, a, mask1); |
92 | b1 = shuffle_bytes16(a, b, mask1); |
93 | c1 = shuffle_bytes16(b, c, mask1); |
94 | // [a11,a12,a13,a14,a15,a0, a1, a2, a3, a4, a5, b11,b12,b13,b14,b15] |
95 | // [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10,c0, c1, c2, c3, c4 ] |
96 | // [c5, c6, c7, c8, c9, c10,c11,c12,c13,c14,c15,a6, a7, a8, a9, a10] |
97 | T a2, b2, c2; |
98 | T mask2 = make_uint(0xff); |
99 | mask2 = move16_l<5>(mask2); |
100 | |
101 | a2 = blend(a1, c1, mask2); |
102 | b2 = blend(b1, a1, mask2); |
103 | c2 = blend(c1, b1, mask2); |
104 | // [a11..a15,a0..a10] |
105 | // [b0..b15] |
// [c5..c15,c0..c4]
107 | a = align16<5>(a2, a2); |
108 | b = b2; |
109 | c = align16<11>(c2, c2); |
110 | #else |
111 | typename same_width<T>::u8 t0, t1, t2, t3; |
112 | t0 = a; |
113 | t1 = align16<12>(a, b); |
114 | t2 = align16<8>(b, c); |
115 | t3 = move16_l<4>(c); |
116 | // [a0, b0, c0, a1, b1, c1, a2, b2, c2, a3, b3, c3, ...] |
117 | // [a4, b4, c4, a5, b5, c5, a6, b6, c6, a7, b7, c7, ...] |
118 | // [a8, b8, c8, a9, b9, c9, a10,b10,c10,a11,b11,c11, ...] |
119 | // [a12,b12,c12,a13,b13,c13,a14,b14,c14,a15,b15,c15, ...] |
120 | typename same_width<T>::u16 b0, b1, b2, b3; |
121 | b0 = zip16_lo(t0, t1); |
122 | b1 = zip16_lo(t2, t3); |
123 | b2 = zip16_hi(t0, t1); |
124 | b3 = zip16_hi(t2, t3); |
125 | // [a0, a4, b0, b4, c0, c4, a1, a5, b1, b5, c1, c5, a2, a6, b2, b6 ] |
// [a8, a12,b8, b12,c8, c12,a9, a13,b9, b13,c9, c13,a10,a14,b10,b14]
127 | // [c2, c6, a3, a7, b3, b7, c3, c7, ... ] |
128 | // [c10,c14,a11,a15,b11,b15,c11,c15,... ] |
129 | typename same_width<T>::u8 u0, u1, u2; |
130 | u0 = zip8_lo(b0, b1); |
131 | u1 = zip8_hi(b0, b1); |
132 | u2 = zip8_lo(b2, b3); |
133 | // [a0, a4, a8, a12,b0, b4, b8, b12, c0, c4, c8, c12, a1, a5, a9, a13 ] |
134 | // [b1, b5, b9, b13,c1, c5, c9, c13, a2, a6, a10,a14, b2, b6, b10,b14 ] |
135 | // [c2, c6, c10,c14,a3, a7, a11,a15, b3, b7, b11,b15, c3, c7, c11,c15 ] |
136 | t0 = u0; |
137 | t1 = align16<12>(u0, u1); |
138 | t2 = align16<8>(u1, u2); |
139 | t3 = move16_l<4>(u2); |
140 | // [a0, a4, a8, a12,b0, b4, b8, b12, c0, c4, c8, c12, ...] |
141 | // [a1, a5, a9, a13,b1, b5, b9, b13, c1, c5, c9, c13, ...] |
// [a2, a6, a10,a14,b2, b6, b10,b14, c2, c6, c10,c14, ...]
// [a3, a7, a11,a15,b3, b7, b11,b15, c3, c7, c11,c15, ...]
144 | b0 = zip16_lo(t0, t1); |
145 | b1 = zip16_lo(t2, t3); |
146 | b2 = zip16_hi(t0, t1); |
147 | b3 = zip16_hi(t2, t3); |
148 | // [a0, a1, a4, a5, a8, a9, a12,a13,b0, b1, b4, b5, b8, b9, b12,b13 ] |
149 | // [a2, a3, a6, a7, a10,a11,a14,a15,b2, b3, b6, b7, b10,b11,b14,b15 ] |
150 | // [c0, c1, c4, c5, c8, c9, c12,c13, ... ] |
151 | // [c2, c3, c6, c7, c10,c11,c14,c15, ... ] |
152 | a = zip8_lo(b0, b1); |
153 | b = zip8_hi(b0, b1); |
154 | c = zip8_lo(b2, b3); |
155 | #endif |
156 | } |
157 | |
158 | template<class T> SIMDPP_INL |
159 | void v_mem_unpack3_impl16_128(T& a, T& b, T& c) |
160 | { |
161 | #if SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA |
162 | // [a0,b0,c0,a1,b1,c1,a2,b2] |
163 | // [c2,a3,b3,c3,a4,b4,c4,a5] |
164 | // [b5,c5,a6,b6,c6,a7,b7,c7] |
165 | T mask1 = make_shuffle_bytes16_mask<0,3,6,8+1,8+4,8+7,8+2,8+5>(mask1); |
166 | T a1, b1, c1; |
167 | a1 = shuffle_bytes16(a, b, mask1); |
168 | c1 = shuffle_bytes16(b, c, mask1); |
169 | b1 = shuffle_bytes16(c, a, mask1); |
170 | // [a0,a1,a2,a3,a4,a5,b3,b4] |
171 | // [c2,c3,c4,c5,c6,c7,a6,a7] |
172 | // [b5,b6,b7,b0,b1,b2,c0,c1] |
173 | T a2, b2, c2; |
174 | T mask2 = make_uint(0xffff); |
175 | mask2 = move8_l<2>(mask2); |
176 | |
177 | a2 = blend(a1, c1, mask2); |
178 | b2 = blend(b1, a1, mask2); |
179 | c2 = blend(c1, b1, mask2); |
180 | // [a0..a7] |
181 | // [b5..b7,b0..b4] |
182 | // [c2..c7,c0,c1] |
183 | a = a2; |
184 | b = align8<3>(b2, b2); |
185 | c = align8<6>(c2, c2); |
186 | #else |
187 | T t0, t1, t2, t3; |
188 | t0 = a; |
189 | t1 = align8<6>(a, b); |
190 | t2 = align8<4>(b, c); |
191 | t3 = move8_l<2>(c); |
192 | // [a0,b0,c0,a1,b1,c1, ... ] |
193 | // [a2,b2,c2,a3,b3,c3, ... ] |
194 | // [a4,b4,c4,a5,b5,c5, ... ] |
195 | // [a6,b6,c6,a7,b7,c7, ... ] |
196 | typename same_width<T>::u32 b0, b1, b2, b3; |
197 | b0 = zip8_lo(t0, t1); |
198 | b1 = zip8_lo(t2, t3); |
199 | b2 = zip8_hi(t0, t1); |
200 | b3 = zip8_hi(t2, t3); |
201 | // [a0,a2,b0,b2,c0,c2,a1,a3] |
202 | // [a4,a6,b4,b6,c4,c6,a5,a7] |
203 | // [b1,b3,c1,c3, ... ] |
204 | // [b5,b7,c5,c7, ... ] |
205 | typename same_width<T>::u64 c0, c1, c2; |
206 | c0 = zip4_lo(b0, b1); |
207 | c1 = zip4_hi(b0, b1); |
208 | c2 = zip4_lo(b2, b3); |
209 | // [a0,a2,a4,a6,b0,b2,b4,b6] |
210 | // [c0,c2,c4,c6,a1,a3,a5,a7] |
211 | // [b1,b3,b5,b7,c1,c3,c5,c7] |
212 | t0 = c0; |
213 | t1 = shuffle1<1,0>(c0, c1); |
214 | t2 = splat2<1>(c1); |
215 | t3 = c2; |
216 | // [a0,a2,a4,a6,b0,b2,b4,b6] |
217 | // [b0,b2,b4,b6,c0,c2,c4,c6] |
218 | // [a1,a3,a5,a7,a1,a3,a5,a7] |
219 | // [b1,b3,b5,b7,c1,c3,c5,c7] |
220 | a = zip8_lo(t0, t2); |
221 | b = zip8_lo(t1, t3); |
222 | c = zip8_hi(t1, t3); |
223 | #endif |
224 | } |
225 | |
226 | template<class T> SIMDPP_INL |
227 | void v_mem_unpack3_impl32_128(T& a, T& b, T& c) |
228 | { |
229 | #if SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA |
230 | using U = typename T::uint_vector_type; |
231 | |
232 | // [a0,b0,c0,a1] |
233 | // [b1,c1,a2,b2] |
234 | // [c2,a3,b3,c3] |
235 | U mask1 = make_shuffle_bytes16_mask<0,3,4+2,4+1>(mask1); |
236 | T a1, b1, c1; |
237 | a1 = shuffle_bytes16(a, b, mask1); |
238 | b1 = shuffle_bytes16(b, c, mask1); |
239 | c1 = shuffle_bytes16(c, a, mask1); |
240 | // [a0,a1,a2,c1] |
241 | // [b1,b2,b3,a3] |
242 | // [c2,c3,c0,b0] |
243 | T a2, b2, c2; |
244 | U mask2 = make_uint(0xffffffff); |
245 | mask2 = move4_l<1>(mask2); |
246 | |
247 | a2 = blend(a1, b1, mask2); |
248 | b2 = blend(b1, c1, mask2); |
249 | c2 = blend(c1, a1, mask2); |
250 | // [a0,a1,a2,a3] |
251 | // [b1,b2,b3,b0] |
252 | // [c2,c3,c0,c1] |
253 | a = a2; |
254 | b = align4<3>(b2, b2); |
255 | c = align4<2>(c2, c2); |
256 | #else |
257 | T t11, t12, t21, t22, t31, t32; |
258 | // [a0,b0,c0,a1] |
259 | // [b1,c1,a2,b2] |
260 | // [c2,a3,b3,c3] |
261 | t11 = a; |
262 | t12 = shuffle2<0,1,2,3>(c, b); |
263 | t21 = shuffle2<0,1,0,1>(a, b); |
264 | t22 = shuffle2<2,3,2,3>(b, c); |
265 | t31 = shuffle2<2,3,0,1>(a, b); |
266 | t32 = c; |
267 | // [a0,b0,c0,a1] |
268 | // [c2,a3,a2,b2] |
269 | // [a0,b0,b1,c1] |
270 | // [a2,b2,b3,c3] |
271 | // [c0,a1,b1,c1] |
272 | // [c2,a3,b3,c3] |
273 | a = shuffle2<0,3,2,1>(t11, t12); |
274 | b = shuffle2<1,2,1,2>(t21, t22); |
275 | c = shuffle2<0,3,0,3>(t31, t32); |
276 | #endif |
277 | } |
278 | |
279 | template<class T> SIMDPP_INL |
280 | void v_mem_unpack3_impl64_128(T& a, T& b, T& c) |
281 | { |
282 | T d0, d1, d2; |
283 | d0 = shuffle1<0,1>(a, b); |
284 | d1 = shuffle1<1,0>(a, c); |
285 | d2 = shuffle1<0,1>(b, c); |
286 | a = d0; b = d1; c = d2; |
287 | } |
288 | |
289 | template<class V> SIMDPP_INL |
290 | void v_mem_unpack3_shuffle128(any_vec<16,V>& qa, any_vec<16,V>& qb, any_vec<16,V>& qc) |
291 | { |
// nothing to do: a single 128-bit vector has no lanes to rearrange
(void) qa; (void) qb; (void) qc;
293 | } |
294 | |
295 | template<class V> SIMDPP_INL |
296 | void v_mem_unpack3_shuffle128(any_vec<32,V>& qa, any_vec<32,V>& qb, any_vec<32,V>& qc) |
297 | { |
// Shuffle the vectors so that the lower halves contain the first three
// 128-bit blocks of the packed data (all of a and the lower half of b)
// and the upper halves contain the remaining three blocks.
300 | |
301 | V a0, b0, c0, a1, b1, c1; |
302 | |
303 | a0 = qa.wrapped(); b0 = qb.wrapped(); c0 = qc.wrapped(); |
304 | |
305 | a1 = shuffle1_128<0,1>(a0, b0); |
306 | b1 = shuffle1_128<1,0>(a0, c0); |
307 | c1 = shuffle1_128<0,1>(b0, c0); |
308 | |
309 | qa.wrapped() = a1; qb.wrapped() = b1; qc.wrapped() = c1; |
310 | } |
311 | |
312 | #if SIMDPP_USE_AVX512F |
313 | template<class V> SIMDPP_INL |
314 | void v_mem_unpack3_shuffle128(any_vec<64,V>& qa, any_vec<64,V>& qb, any_vec<64,V>& qc) |
315 | { |
316 | V a, b, c; // TODO: optimize. Using full-vector shuffle may be faster |
317 | a = qa.wrapped(); b = qb.wrapped(); c = qc.wrapped(); |
318 | |
319 | V t11, t12, t21, t22, t31, t32; |
320 | // [a0,b0,c0,a1] |
321 | // [b1,c1,a2,b2] |
322 | // [c2,a3,b3,c3] |
323 | t11 = a; |
324 | t12 = shuffle2_128<0,1,2,3>(c, b); |
325 | t21 = shuffle2_128<0,1,0,1>(a, b); |
326 | t22 = shuffle2_128<2,3,2,3>(b, c); |
327 | t31 = shuffle2_128<2,3,0,1>(a, b); |
328 | t32 = c; |
329 | // [a0,b0,c0,a1] |
330 | // [c2,a3,a2,b2] |
331 | // [a0,b0,b1,c1] |
332 | // [a2,b2,b3,c3] |
333 | // [c0,a1,b1,c1] |
334 | // [c2,a3,b3,c3] |
335 | a = shuffle2_128<0,3,2,1>(t11, t12); |
336 | b = shuffle2_128<1,2,1,2>(t21, t22); |
337 | c = shuffle2_128<0,3,0,3>(t31, t32); |
338 | |
339 | qa.wrapped() = a; qb.wrapped() = b; qc.wrapped() = c; |
340 | } |
341 | #endif |
342 | |
343 | /** Concatenates @a a, @a b and @a c and stores the elements of the resulting |
344 | array as follows: |
345 | * every (3n)-th element is stored to @a a |
346 | * every (3n+1)-th element is stored to @a b |
347 | * every (3n+2)-th element is stored to @a c |
348 | |
349 | n = [0, <number of elements in vector> - 1] |
350 | */ |
351 | template<unsigned N> SIMDPP_INL |
352 | void mem_unpack3(uint8<N>& a, uint8<N>& b, uint8<N>& c) |
353 | { |
354 | v_mem_unpack3_shuffle128(a, b, c); |
355 | v_mem_unpack3_impl8_128(a, b, c); |
356 | } |
357 | |
358 | template<unsigned N> SIMDPP_INL |
359 | void mem_unpack3(uint16<N>& a, uint16<N>& b, uint16<N>& c) |
360 | { |
361 | v_mem_unpack3_shuffle128(a, b, c); |
362 | v_mem_unpack3_impl16_128(a, b, c); |
363 | } |
364 | |
365 | template<unsigned N> SIMDPP_INL |
366 | void mem_unpack3(uint32<N>& a, uint32<N>& b, uint32<N>& c) |
367 | { |
368 | v_mem_unpack3_shuffle128(a, b, c); |
369 | v_mem_unpack3_impl32_128(a, b, c); |
370 | } |
371 | |
372 | template<unsigned N> SIMDPP_INL |
373 | void mem_unpack3(uint64<N>& a, uint64<N>& b, uint64<N>& c) |
374 | { |
375 | v_mem_unpack3_shuffle128(a, b, c); |
376 | v_mem_unpack3_impl64_128(a, b, c); |
377 | } |
378 | |
379 | template<unsigned N> SIMDPP_INL |
380 | void mem_unpack3(float32<N>& a, float32<N>& b, float32<N>& c) |
381 | { |
382 | v_mem_unpack3_shuffle128(a, b, c); |
383 | v_mem_unpack3_impl32_128(a, b, c); |
384 | } |
385 | |
386 | template<unsigned N> SIMDPP_INL |
387 | void mem_unpack3(float64<N>& a, float64<N>& b, float64<N>& c) |
388 | { |
389 | v_mem_unpack3_shuffle128(a, b, c); |
390 | v_mem_unpack3_impl64_128(a, b, c); |
391 | } |
392 | |
393 | /** Generic implementation of mem_unpack4. The 256-bit version applies 128-bit |
394 | operations to each half of each vector separately. |
395 | */ |
396 | template<class T> SIMDPP_INL |
397 | void v_mem_unpack4_impl8_128(T& a, T& b, T& c, T& d) |
398 | { |
399 | #if SIMDPP_USE_SSSE3 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA |
400 | // TODO: optimize for Altivec and MSA |
401 | typename same_width<T>::u32 b0, b1, b2, b3; |
402 | b0 = transpose_inplace(a); |
403 | b1 = transpose_inplace(b); |
404 | b2 = transpose_inplace(c); |
405 | b3 = transpose_inplace(d); |
406 | |
407 | transpose4(b0, b1, b2, b3); |
408 | a = b0; b = b1; c = b2; d = b3; |
409 | #else |
410 | // [a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3 ] |
411 | // [a4, b4, c4, d4, a5, b5, c5, d5, a6, b6, c6, d6, a7, b7, c7, d7 ] |
412 | // [a8, b8, c8, d8, a9, b9, c9, d9, a10,b10,c10,d10,a11,b11,c11,d11] |
413 | // [a12,b12,c12,d12,a13,b13,c13,d13,a14,b14,c14,d14,a15,b15,c15,d15] |
414 | T b0, b1, b2, b3, c0, c1, c2, c3; |
415 | b0 = zip16_lo(a, b); |
416 | b1 = zip16_hi(a, b); |
417 | b2 = zip16_lo(c, d); |
418 | b3 = zip16_hi(c, d); |
419 | // [a0, a4, b0, b4, c0, c4, d0, d4, a1, a5, b1, b5, c1, c5, d1, d5 ] |
420 | // [a2, a6, b2, b6, c2, c6, d2, d6, a3, a7, b3, b7, c3, c7, d3, d7 ] |
421 | // [a8, a12,b8, b12,c8, c12,d8, d12,a9, a13,b9, b13,c9, c13,d9, d13] |
422 | // [a10,a14,b10,b14,c10,c14,d10,d14,a11,a15,b11,b15,c11,c15,d11,d15] |
423 | c0 = zip16_lo(b0, b1); |
424 | c1 = zip16_hi(b0, b1); |
425 | c2 = zip16_lo(b2, b3); |
426 | c3 = zip16_hi(b2, b3); |
427 | // [a0, a2, a4, a6, b0, b2, b4, b6, c0, c2, c4, c6, d0, d2, d4, d6 ] |
428 | // [a1, a3, a5, a7, b1, b3, b5, b7, c1, c3, c5, c7, d1, d3, d5, d7 ] |
429 | // [a8, a10,a12,a14,b8, b10,b12,b14,c8, c10,c12,c14,d8, d10,d12,d14] |
430 | // [a9, a11,a13,a15,b9, b11,b13,b15,c9, c11,c13,c15,d9, d11,d13,d15] |
431 | typename same_width<T>::u64 d0, d1, d2, d3; |
432 | d0 = zip16_lo(c0, c1); |
433 | d1 = zip16_hi(c0, c1); |
434 | d2 = zip16_lo(c2, c3); |
435 | d3 = zip16_hi(c2, c3); |
436 | // [a0 .. a7, b0 .. b7 ] |
437 | // [c0 .. c7, d0 .. d7 ] |
438 | // [a8 .. a15, b8 .. b15 ] |
// [c8 .. c15, d8 .. d15 ]
440 | a = zip2_lo(d0, d2); |
441 | b = zip2_hi(d0, d2); |
442 | c = zip2_lo(d1, d3); |
443 | d = zip2_hi(d1, d3); |
444 | #endif |
445 | } |
446 | |
447 | template<class T> SIMDPP_INL |
448 | void v_mem_unpack4_impl16_128(T& a, T& b, T& c, T& d) |
449 | { |
450 | // [a0,b0,c0,d0,a1,b1,c1,d1] |
451 | // [a2,b2,c2,d2,a3,b3,c3,d3] |
452 | // [a4,b4,c4,d4,a5,b5,c5,d5] |
453 | // [a6,b6,c6,d6,a7,b7,c7,d7] |
454 | typename same_width<T>::u16 t0, t1, t2, t3; |
455 | t0 = zip8_lo(a, b); |
456 | t1 = zip8_hi(a, b); |
457 | t2 = zip8_lo(c, d); |
458 | t3 = zip8_hi(c, d); |
459 | // [a0,a2,b0,b2,c0,c2,d0,d2] |
460 | // [a1,a3,b1,b3,c1,c3,d1,d3] |
461 | // [a4,a6,b4,b6,c4,c6,d4,d6] |
462 | // [a5,a7,b5,b7,c5,c7,d5,d7] |
463 | typename same_width<T>::u64 u0, u1, u2, u3; |
464 | u0 = zip8_lo(t0, t1); |
465 | u1 = zip8_hi(t0, t1); |
466 | u2 = zip8_lo(t2, t3); |
467 | u3 = zip8_hi(t2, t3); |
468 | // [a0,a1,a2,a3,b0,b1,b2,b3] |
469 | // [c0,c1,c2,c3,d0,d1,d2,d3] |
470 | // [a4,a5,a6,a7,b4,b5,b6,b7] |
471 | // [c4,c5,c6,c7,d4,d5,d6,d7] |
472 | a = zip2_lo(u0, u2); |
473 | b = zip2_hi(u0, u2); |
474 | c = zip2_lo(u1, u3); |
475 | d = zip2_hi(u1, u3); |
476 | } |
477 | |
478 | template<class T> SIMDPP_INL |
479 | void v_mem_unpack4_impl32_128(T& a, T& b, T& c, T& d) |
480 | { |
481 | transpose4(a, b, c, d); |
482 | } |
483 | |
484 | template<class T> SIMDPP_INL |
485 | void v_mem_unpack4_impl64_128(T& a, T& b, T& c, T& d) |
486 | { |
487 | transpose2(a, c); |
488 | transpose2(b, d); |
489 | T t; |
490 | t = b; |
491 | b = c; |
492 | c = t; |
493 | } |
494 | |
495 | template<class V> SIMDPP_INL |
496 | void v_mem_unpack4_shuffle128(any_vec<16,V>& qa, any_vec<16,V>& qb, |
497 | any_vec<16,V>& qc, any_vec<16,V>& qd) |
498 | { |
// nothing to do: a single 128-bit vector has no lanes to rearrange
(void) qa; (void) qb; (void) qc; (void) qd;
500 | } |
501 | |
502 | template<class V> SIMDPP_INL |
503 | void v_mem_unpack4_shuffle128(any_vec<32,V>& qa, any_vec<32,V>& qb, |
504 | any_vec<32,V>& qc, any_vec<32,V>& qd) |
505 | { |
506 | V a0, b0, c0, d0, a1, b1, c1, d1; |
507 | |
508 | a0 = qa.wrapped(); b0 = qb.wrapped(); c0 = qc.wrapped(); d0 = qd.wrapped(); |
509 | |
510 | a1 = shuffle1_128<0,0>(a0, c0); |
511 | b1 = shuffle1_128<1,1>(a0, c0); |
512 | c1 = shuffle1_128<0,0>(b0, d0); |
513 | d1 = shuffle1_128<1,1>(b0, d0); |
514 | |
515 | qa.wrapped() = a1; qb.wrapped() = b1; qc.wrapped() = c1; qd.wrapped() = d1; |
516 | } |
517 | |
518 | #if SIMDPP_USE_AVX512F |
519 | template<class V> SIMDPP_INL |
520 | void v_mem_unpack4_shuffle128(any_vec<64,V>& qa, any_vec<64,V>& qb, |
521 | any_vec<64,V>& qc, any_vec<64,V>& qd) |
522 | { |
523 | V a, b, c, d; // TODO: optimize. Using full-vector shuffle/permute will be faster |
524 | |
525 | a = qa.wrapped(); b = qb.wrapped(); c = qc.wrapped(); d = qd.wrapped(); |
526 | |
527 | V t1, t2, t3, t4; |
528 | // [a0,a1,a2,a3] |
529 | // [b0,b1,b2,b3] |
530 | // [c0,c1,c2,c3] |
531 | // [d0,d1,d2,d3] |
532 | t1 = shuffle2_128<0,2,0,2>(a, b); |
533 | t2 = shuffle2_128<1,3,1,3>(a, b); |
534 | t3 = shuffle2_128<0,2,0,2>(c, d); |
535 | t4 = shuffle2_128<1,3,1,3>(c, d); |
536 | // [a0,a2,b0,b2] |
537 | // [a1,a3,b1,b3] |
538 | // [c0,c2,d0,d2] |
539 | // [c1,c3,d1,d3] |
540 | a = shuffle2_128<0,2,0,2>(t1, t3); |
541 | b = shuffle2_128<0,2,0,2>(t2, t4); |
542 | c = shuffle2_128<1,3,1,3>(t1, t3); |
543 | d = shuffle2_128<1,3,1,3>(t2, t4); |
544 | // [a0,b0,c0,d0] |
545 | // [a1,b1,c1,d1] |
546 | // [a2,b2,c2,d2] |
547 | // [a3,b3,c3,d3] |
548 | |
549 | qa.wrapped() = a; qb.wrapped() = b; qc.wrapped() = c; qd.wrapped() = d; |
550 | } |
551 | #endif |
552 | |
553 | /** Concatenates @a a, @a b, @a c and @a d and stores the elements of the |
554 | resulting array as follows: |
555 | * every (4n)-th element is stored to @a a |
556 | * every (4n+1)-th element is stored to @a b |
557 | * every (4n+2)-th element is stored to @a c |
558 | * every (4n+3)-th element is stored to @a d |
559 | |
560 | n = [0, <number of elements in vector> - 1] |
561 | */ |
562 | // @icost{SSE2, SSE3, 16} |
563 | // @icost{SSSE3, SSE4.1, 12} |
564 | template<unsigned N> SIMDPP_INL |
565 | void mem_unpack4(uint8<N>& a, uint8<N>& b, uint8<N>& c, uint8<N>& d) |
566 | { |
567 | v_mem_unpack4_shuffle128(a, b, c, d); |
568 | v_mem_unpack4_impl8_128(a, b, c, d); |
569 | } |
570 | |
571 | template<unsigned N> SIMDPP_INL |
572 | void mem_unpack4(uint16<N>& a, uint16<N>& b, uint16<N>& c, uint16<N>& d) |
573 | { |
574 | v_mem_unpack4_shuffle128(a, b, c, d); |
575 | v_mem_unpack4_impl16_128(a, b, c, d); |
576 | } |
577 | |
578 | template<unsigned N> SIMDPP_INL |
579 | void mem_unpack4(uint32<N>& a, uint32<N>& b, uint32<N>& c, uint32<N>& d) |
580 | { |
581 | v_mem_unpack4_shuffle128(a, b, c, d); |
582 | v_mem_unpack4_impl32_128(a, b, c, d); |
583 | } |
584 | |
585 | template<unsigned N> SIMDPP_INL |
586 | void mem_unpack4(uint64<N>& a, uint64<N>& b, uint64<N>& c, uint64<N>& d) |
587 | { |
588 | v_mem_unpack4_shuffle128(a, b, c, d); |
589 | v_mem_unpack4_impl64_128(a, b, c, d); |
590 | } |
591 | |
592 | template<unsigned N> SIMDPP_INL |
593 | void mem_unpack4(float32<N>& a, float32<N>& b, float32<N>& c, float32<N>& d) |
594 | { |
595 | v_mem_unpack4_shuffle128(a, b, c, d); |
596 | v_mem_unpack4_impl32_128(a, b, c, d); |
597 | } |
598 | |
599 | template<unsigned N> SIMDPP_INL |
600 | void mem_unpack4(float64<N>& a, float64<N>& b, float64<N>& c, float64<N>& d) |
601 | { |
602 | v_mem_unpack4_shuffle128(a, b, c, d); |
603 | v_mem_unpack4_impl64_128(a, b, c, d); |
604 | } |
605 | |
606 | /** Concatenates the given vectors and stores the elements of the resulting |
607 | array as follows: |
608 | * every (3n)-th element of the first 48 elements is stored to @a a |
609 | * every (3n+1)-th element of the first 48 elements is stored to @a b |
610 | * every (3n+2)-th element of the first 48 elements is stored to @a c |
611 | * every (3n)-th element of the last 48 elements is stored to @a d |
612 | * every (3n+1)-th element of the last 48 elements is stored to @a e |
* every (3n+2)-th element of the last 48 elements is stored to @a f
614 | |
615 | n = [0, <number of elements in vector> - 1] |
616 | */ |
617 | static SIMDPP_INL |
618 | void mem_unpack6(uint8x16& a, uint8x16& b, uint8x16& c, |
619 | uint8x16& d, uint8x16& e, uint8x16& f) |
620 | { |
621 | uint8x16 t0, t1, t2, t3, t4, t5; |
622 | t0 = zip16_lo(a, d); |
623 | t1 = zip16_hi(a, d); |
624 | t2 = zip16_lo(b, e); |
625 | t3 = zip16_hi(b, e); |
626 | t4 = zip16_lo(c, f); |
627 | t5 = zip16_hi(c, f); |
628 | |
629 | uint8x16 u0, u1, u2, u3, u4, u5; |
630 | u0 = zip16_lo(t0, t3); |
631 | u1 = zip16_hi(t0, t3); |
632 | u2 = zip16_lo(t1, t4); |
633 | u3 = zip16_hi(t1, t4); |
634 | u4 = zip16_lo(t2, t5); |
635 | u5 = zip16_hi(t2, t5); |
636 | |
637 | t0 = zip16_lo(u0, u3); |
638 | t1 = zip16_hi(u0, u3); |
639 | t2 = zip16_lo(u1, u4); |
640 | t3 = zip16_hi(u1, u4); |
641 | t4 = zip16_lo(u2, u5); |
642 | t5 = zip16_hi(u2, u5); |
643 | |
644 | u0 = zip16_lo(t0, t3); |
645 | u1 = zip16_hi(t0, t3); |
646 | u2 = zip16_lo(t1, t4); |
647 | u3 = zip16_hi(t1, t4); |
648 | u4 = zip16_lo(t2, t5); |
649 | u5 = zip16_hi(t2, t5); |
650 | |
651 | t0 = zip16_lo(u0, u3); |
652 | t1 = zip16_hi(u0, u3); |
653 | t2 = zip16_lo(u1, u4); |
654 | t3 = zip16_hi(u1, u4); |
655 | t4 = zip16_lo(u2, u5); |
656 | t5 = zip16_hi(u2, u5); |
657 | |
658 | a = zip16_lo(t0, t3); |
659 | b = zip16_hi(t0, t3); |
660 | c = zip16_lo(t1, t4); |
661 | d = zip16_hi(t1, t4); |
662 | e = zip16_lo(t2, t5); |
663 | f = zip16_hi(t2, t5); |
664 | } |
665 | |
666 | static SIMDPP_INL |
667 | void mem_unpack6(uint16x8& a, uint16x8& b, uint16x8& c, |
668 | uint16x8& d, uint16x8& e, uint16x8& f) |
669 | { |
670 | uint16x8 t0, t1, t2, t3, t4, t5; |
671 | t0 = zip8_lo(a, d); |
672 | t1 = zip8_hi(a, d); |
673 | t2 = zip8_lo(b, e); |
674 | t3 = zip8_hi(b, e); |
675 | t4 = zip8_lo(c, f); |
676 | t5 = zip8_hi(c, f); |
677 | |
678 | uint16x8 u0, u1, u2, u3, u4, u5; |
679 | u0 = zip8_lo(t0, t3); |
680 | u1 = zip8_hi(t0, t3); |
681 | u2 = zip8_lo(t1, t4); |
682 | u3 = zip8_hi(t1, t4); |
683 | u4 = zip8_lo(t2, t5); |
684 | u5 = zip8_hi(t2, t5); |
685 | |
686 | a = zip8_lo(u0, u3); |
687 | b = zip8_hi(u0, u3); |
688 | c = zip8_lo(u1, u4); |
689 | d = zip8_hi(u1, u4); |
690 | e = zip8_lo(u2, u5); |
691 | f = zip8_hi(u2, u5); |
692 | } |
693 | |
694 | } // namespace insn |
695 | } // namespace detail |
696 | } // namespace SIMDPP_ARCH_NAMESPACE |
697 | } // namespace simdpp |
698 | |
699 | #endif |
700 | |
701 | |