/* Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt>

   Distributed under the Boost Software License, Version 1.0.
   (See accompanying file LICENSE_1_0.txt or copy at
   http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_ALIGN_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_ALIGN_H

#ifndef LIBSIMDPP_SIMD_H
#error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/bit_or.h>
#include <simdpp/core/move_l.h>
#include <simdpp/core/move_r.h>
#include <simdpp/core/permute4.h>
#include <simdpp/core/shuffle2x2.h>
#include <simdpp/core/shuffle4x2.h>
#include <simdpp/detail/shuffle/shuffle_mask.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

// base 8x16 implementation
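// i_align16 extracts a 16-byte vector from the concatenation {lower, upper}:
// element i of the result is lower[i + shift] when i + shift < 16 and
// upper[i + shift - 16] otherwise. shift must be in the range [0, 16].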
template<unsigned shift> SIMDPP_INL
uint8x16 i_align16(const uint8x16& clower, const uint8x16& cupper)
{
    uint8x16 lower = clower, upper = cupper;
#if SIMDPP_USE_NULL
    uint8x16 r;
    // use int to disable warnings wrt. comparison result always being true/false
    for (int i = 0; i < (int)(16-shift); i++) {
        r.el(i) = lower.el(i + shift);
    }
    for (unsigned i = 16-shift; i < 16; i++) {
        r.el(i) = upper.el(i - 16 + shift);
    }
    return r;
#elif SIMDPP_USE_SSSE3
    return _mm_alignr_epi8(upper.native(), lower.native(), shift);
#elif SIMDPP_USE_SSE2
    uint8x16 a;
    lower = move16_l<shift>(lower);
    upper = move16_r<16-shift>(upper);
    a = bit_or(upper, lower);
    return a;
#elif SIMDPP_USE_NEON
    if (shift == 0)
        return lower;
    if (shift == 16)
        return upper;
    return vextq_u8(lower.native(), upper.native(), shift % 16);
#elif SIMDPP_USE_ALTIVEC
    return vec_sld_biendian<shift>(lower, upper);
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_sld_b((v16i8)upper.native(),
                               (v16i8)lower.native(), shift);
#endif
}

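// For wider vectors the per-lane semantics are preserved: the AVX2 and
// AVX-512BW alignr instructions align each 128-bit lane independently, which
// matches applying the 128-bit implementation to every lane.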
#if SIMDPP_USE_AVX2
template<unsigned shift> SIMDPP_INL
uint8x32 i_align16(const uint8x32& lower, const uint8x32& upper)
{
    return _mm256_alignr_epi8(upper.native(), lower.native(), shift);
}
#endif

#if SIMDPP_USE_AVX512BW
template<unsigned shift> SIMDPP_INL
uint8<64> i_align16(const uint8<64>& lower, const uint8<64>& upper)
{
    return _mm512_alignr_epi8(upper.native(), lower.native(), shift);
}
#endif

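// Arbitrary-width vectors: apply the operation to each native-width
// sub-vector. The same pattern is used for the other element widths below.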
template<unsigned shift, unsigned N> SIMDPP_INL
uint8<N> i_align16(const uint8<N>& lower, const uint8<N>& upper)
{
    SIMDPP_VEC_ARRAY_IMPL2(uint8<N>, i_align16<shift>, lower, upper);
}

// -----------------------------------------------------------------------------

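// i_align8: the same operation in units of 16-bit elements; shift must be in
// the range [0, 8]. Non-NULL backends reuse the byte implementation with the
// shift scaled to bytes.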
template<unsigned shift> SIMDPP_INL
uint16<8> i_align8(const uint16<8>& lower, const uint16<8>& upper)
{
#if SIMDPP_USE_NULL
    uint16<8> r;
    // use int to disable warnings wrt. comparison result always being true/false
    for (int i = 0; i < (int)(8-shift); i++) {
        r.el(i) = lower.el(i + shift);
    }
    for (unsigned i = 8-shift; i < 8; i++) {
        r.el(i) = upper.el(i - 8 + shift);
    }
    return r;
#else
    return uint16<8>(i_align16<shift*2>(uint8<16>(lower),
                                        uint8<16>(upper)));
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned shift> SIMDPP_INL
uint16<16> i_align8(const uint16<16>& lower, const uint16<16>& upper)
{
    return _mm256_alignr_epi8(upper.native(), lower.native(), shift*2);
}
#endif

#if SIMDPP_USE_AVX512BW
template<unsigned shift> SIMDPP_INL
uint16<32> i_align8(const uint16<32>& lower, const uint16<32>& upper)
{
    return _mm512_alignr_epi8(upper.native(), lower.native(), shift*2);
}
#endif

template<unsigned shift, unsigned N> SIMDPP_INL
uint16<N> i_align8(const uint16<N>& lower, const uint16<N>& upper)
{
    SIMDPP_VEC_ARRAY_IMPL2(uint16<N>, i_align8<shift>, lower, upper);
}

// -----------------------------------------------------------------------------

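// i_align4: the same operation in units of 32-bit elements; shift must be in
// the range [0, 4].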
template<unsigned shift> SIMDPP_INL
uint32x4 i_align4(const uint32x4& lower, const uint32x4& upper)
{
#if SIMDPP_USE_NULL
    uint32x4 r;
    // use int to disable warnings wrt. comparison result always being true/false
    for (int i = 0; i < (int)(4-shift); i++) {
        r.el(i) = lower.el(i + shift);
    }
    for (unsigned i = 4-shift; i < 4; i++) {
        r.el(i) = upper.el(i - 4 + shift);
    }
    return r;
#elif SIMDPP_USE_SSE2
    switch (shift) {
    default:
    case 0: return lower;
#if SIMDPP_USE_SSSE3
    case 1:
    case 2:
    case 3: return _mm_alignr_epi8(upper.native(), lower.native(), shift*4);
#else
    case 2: return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(lower.native()),
                                                   _mm_castsi128_ps(upper.native()),
                                                   SIMDPP_SHUFFLE_MASK_4x4(2,3,0,1)));
    case 1:
    case 3: return bit_or(move4_l<shift>(lower),
                          move4_r<4-shift>(upper));
#endif
    case 4: return upper;
    }
#elif SIMDPP_USE_NEON
    if (shift == 0)
        return lower;
    if (shift == 4)
        return upper;
    return vextq_u32(lower.native(), upper.native(), shift % 4);
#elif SIMDPP_USE_ALTIVEC
    return (uint32<4>) vec_sld_biendian<shift*4>((uint8<16>)lower, (uint8<16>)upper);
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_sld_b((v16i8)upper.native(),
                               (v16i8)lower.native(), shift*4);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned shift> SIMDPP_INL
uint32<8> i_align4(const uint32<8>& lower, const uint32<8>& upper)
{
    return _mm256_alignr_epi8(upper.native(), lower.native(), shift*4);
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned shift> SIMDPP_INL
uint32<16> i_align4(const uint32<16>& lower, const uint32<16>& upper)
{
    // _mm512_alignr_epi32 shifts across the entire 512-bit vector rather than
    // within each 128-bit lane, so per-lane alignment is done with shuffle4x2
    switch (shift) {
    default:
    case 0: return lower;
    case 1: return shuffle4x2<1,2,3,4>(lower, upper);
    case 2: return shuffle4x2<2,3,4,5>(lower, upper);
    case 3: return shuffle4x2<3,4,5,6>(lower, upper);
    case 4: return upper;
    }
}
#endif

template<unsigned shift, unsigned N> SIMDPP_INL
uint32<N> i_align4(const uint32<N>& lower, const uint32<N>& upper)
{
    SIMDPP_VEC_ARRAY_IMPL2(uint32<N>, i_align4<shift>, lower, upper);
}

// -----------------------------------------------------------------------------

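// i_align2: the same operation in units of 64-bit elements; shift must be in
// the range [0, 2].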
template<unsigned shift> SIMDPP_INL
uint64x2 i_align2(const uint64x2& lower, const uint64x2& upper)
{
#if SIMDPP_USE_SSE2
    switch (shift) {
    default:
    case 0: return lower;
    case 1: return _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(lower.native()),
                                                   _mm_castsi128_pd(upper.native()),
                                                   SIMDPP_SHUFFLE_MASK_2x2(1,0)));
    case 2: return upper;
    }
#elif SIMDPP_USE_NEON
    if (shift == 0)
        return lower;
    if (shift == 2)
        return upper;
    return vextq_u64(lower.native(), upper.native(), shift % 2);
#elif SIMDPP_USE_VSX_207
    return (uint64<2>) vec_sld_biendian<shift*8>((uint8<16>) lower,
                                                 (uint8<16>) upper);
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    uint64x2 r;
    // use int to disable warnings wrt. comparison result always being true/false
    for (int i = 0; i < (int)(2-shift); i++) {
        r.el(i) = lower.el(i + shift);
    }
    for (unsigned i = 2-shift; i < 2; i++) {
        r.el(i) = upper.el(i - 2 + shift);
    }
    return r;
#elif SIMDPP_USE_MSA
    return (v2u64) __msa_sld_b((v16i8) upper.native(),
                               (v16i8) lower.native(), shift*8);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned shift> SIMDPP_INL
uint64<4> i_align2(const uint64<4>& lower, const uint64<4>& upper)
{
    return _mm256_alignr_epi8(upper.native(), lower.native(), shift*8);
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned shift> SIMDPP_INL
uint64<8> i_align2(const uint64<8>& lower, const uint64<8>& upper)
{
    switch (shift) {
    default:
    case 0: return lower;
    case 1: return shuffle2x2<1,2>(lower, upper);
    case 2: return upper;
    }
}
#endif

template<unsigned shift, unsigned N> SIMDPP_INL
uint64<N> i_align2(const uint64<N>& lower, const uint64<N>& upper)
{
    SIMDPP_VEC_ARRAY_IMPL2(uint64<N>, i_align2<shift>, lower, upper);
}

// -----------------------------------------------------------------------------

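// Floating-point overloads mirror the integer implementations above: i_align4
// operates on 32-bit elements, i_align2 on 64-bit elements.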
template<unsigned shift> SIMDPP_INL
float32x4 i_align4(const float32x4& lower, const float32x4& upper)
{
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    float32x4 r;
    // use int to disable warnings wrt. comparison result always being true/false
    for (int i = 0; i < (int)(4-shift); i++) {
        r.el(i) = lower.el(i + shift);
    }
    for (unsigned i = 4-shift; i < 4; i++) {
        r.el(i) = upper.el(i - 4 + shift);
    }
    return r;
#elif SIMDPP_USE_SSE2
    switch (shift) {
    default:
    case 0: return lower;
#if SIMDPP_USE_SSSE3
    case 1:
    case 3: {
        __m128i res = _mm_alignr_epi8(_mm_castps_si128(upper.native()),
                                      _mm_castps_si128(lower.native()), shift*4);
        return _mm_castsi128_ps(res);
    }
#else
    case 1:
    case 3: return bit_or(move4_l<shift>(lower),
                          move4_r<4-shift>(upper));
#endif
    case 2: return _mm_shuffle_ps(lower.native(), upper.native(),
                                  SIMDPP_SHUFFLE_MASK_4x4(2,3,0,1));
    case 4: return upper;
    }
#elif SIMDPP_USE_NEON_FLT_SP
    if (shift == 0)
        return lower;
    if (shift == 4)
        return upper;
    return vextq_f32(lower.native(), upper.native(), shift % 4);
#elif SIMDPP_USE_ALTIVEC
    return (float32<4>) vec_sld_biendian<shift*4>((uint8<16>)lower, (uint8<16>)upper);
#elif SIMDPP_USE_MSA
    return (v4f32) __msa_sld_b((v16i8)upper.native(),
                               (v16i8)lower.native(), shift*4);
#endif
}

#if SIMDPP_USE_AVX
template<unsigned shift> SIMDPP_INL
float32<8> i_align4(const float32<8>& lower, const float32<8>& upper)
{
    switch (shift) {
    default:
    case 0: return lower;
#if SIMDPP_USE_AVX2
    case 1:
    case 3: {
        __m256i res = _mm256_alignr_epi8(_mm256_castps_si256(upper.native()),
                                         _mm256_castps_si256(lower.native()), shift*4);
        return _mm256_castsi256_ps(res);
    }
#else
    case 1: return shuffle4x2<1,2,3,4>(lower, upper);
    case 3: return shuffle4x2<3,4,5,6>(lower, upper);
#endif
    case 2: return _mm256_shuffle_ps(lower.native(), upper.native(),
                                     SIMDPP_SHUFFLE_MASK_4x4(2,3,0,1));
    case 4: return upper;
    }
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned shift> SIMDPP_INL
float32<16> i_align4(const float32<16>& lower, const float32<16>& upper)
{
    switch (shift) {
    default:
    case 0: return lower;
    case 1: return shuffle4x2<1,2,3,4>(lower, upper);
    case 2: return _mm512_shuffle_ps(lower.native(), upper.native(),
                                     SIMDPP_SHUFFLE_MASK_4x4(2,3,0,1));
    case 3: return shuffle4x2<3,4,5,6>(lower, upper);
    case 4: return upper;
    }
}
#endif

template<unsigned shift, unsigned N> SIMDPP_INL
float32<N> i_align4(const float32<N>& lower, const float32<N>& upper)
{
    SIMDPP_VEC_ARRAY_IMPL2(float32<N>, i_align4<shift>, lower, upper);
}

// -----------------------------------------------------------------------------

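// i_align2 (float64): shift is in the range [0, 2]. Backends without native
// 64-bit floating-point vectors fall back to the element-wise loop.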
template<unsigned shift> SIMDPP_INL
float64x2 i_align2(const float64x2& lower, const float64x2& upper)
{
#if SIMDPP_USE_SSE2
    switch (shift) {
    default:
    case 0: return lower;
    case 1: return _mm_shuffle_pd(lower.native(), upper.native(),
                                  SIMDPP_SHUFFLE_MASK_2x2(1, 0));
    case 2: return upper;
    }
#elif SIMDPP_USE_NEON64
    if (shift == 0)
        return lower;
    if (shift == 2)
        return upper;
    return vextq_f64(lower.native(), upper.native(), shift % 2);
#elif SIMDPP_USE_VSX_206
    return (float64<2>) vec_sld_biendian<shift*8>((uint8<16>)lower,
                                                  (uint8<16>)upper);
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC || SIMDPP_USE_NEON32
    float64x2 r;
    // use int to disable warnings wrt. comparison result always being true/false
    for (int i = 0; i < (int)(2-shift); i++) {
        r.el(i) = lower.el(i + shift);
    }
    for (unsigned i = 2-shift; i < 2; i++) {
        r.el(i) = upper.el(i - 2 + shift);
    }
    return r;
#elif SIMDPP_USE_MSA
    return (v2f64) __msa_sld_b((v16i8) upper.native(),
                               (v16i8) lower.native(), shift*8);
#else
    return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(float64<shift+4>, lower, upper);
#endif
}

#if SIMDPP_USE_AVX
template<unsigned shift> SIMDPP_INL
float64<4> i_align2(const float64<4>& lower, const float64<4>& upper)
{
    switch (shift) {
    default:
    case 0: return lower;
    case 1: return _mm256_shuffle_pd(lower.native(), upper.native(),
                                     SIMDPP_SHUFFLE_MASK_2x2_2(1, 0));
    case 2: return upper;
    }
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned shift> SIMDPP_INL
float64<8> i_align2(const float64<8>& lower, const float64<8>& upper)
{
    switch (shift) {
    default:
    case 0: return lower;
    case 1: return _mm512_shuffle_pd(lower.native(), upper.native(),
                                     SIMDPP_SHUFFLE_MASK_2x2_4(1, 0));
    case 2: return upper;
    }
}
#endif

template<unsigned shift, unsigned N> SIMDPP_INL
float64<N> i_align2(const float64<N>& lower, const float64<N>& upper)
{
    SIMDPP_VEC_ARRAY_IMPL2(float64<N>, i_align2<shift>, lower, upper);
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif