/*  Copyright (C) 2011-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_ALIGN_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_ALIGN_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/bit_or.h>
#include <simdpp/core/move_l.h>
#include <simdpp/core/move_r.h>
#include <simdpp/core/permute4.h>
#include <simdpp/core/shuffle2x2.h>
#include <simdpp/core/shuffle4x2.h>
#include <simdpp/detail/shuffle/shuffle_mask.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

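// The i_align* helpers extract a full-width vector from the concatenation
// { lower, upper }, starting 'shift' elements into 'lower'. On vectors wider
// than 16 bytes the operation is applied to each 16-byte block separately.
// An illustrative sketch of the semantics (not code from this header):
//
//     uint8x16 lower = ...; // {  0,  1, ..., 15 }
//     uint8x16 upper = ...; // { 16, 17, ..., 31 }
//     uint8x16 r = i_align16<3>(lower, upper); // { 3, 4, ..., 18 }
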
// base 8x16 implementation
template<unsigned shift> SIMDPP_INL
uint8x16 i_align16(const uint8x16& clower, const uint8x16& cupper)
{
    uint8x16 lower = clower, upper = cupper;
#if SIMDPP_USE_NULL
    uint8x16 r;
    // use int to avoid warnings about the comparison being always true/false
    for (int i = 0; i < (int)(16-shift); i++) {
        r.el(i) = lower.el(i + shift);
    }
    for (unsigned i = 16-shift; i < 16; i++) {
        r.el(i) = upper.el(i - 16 + shift);
    }
    return r;
#elif SIMDPP_USE_SSSE3
    return _mm_alignr_epi8(upper.native(), lower.native(), shift);
#elif SIMDPP_USE_SSE2
    // no byte-wise alignr before SSSE3: shift each source into place and
    // combine the two halves with a bitwise OR
    uint8x16 a;
    lower = move16_l<shift>(lower);
    upper = move16_r<16-shift>(upper);
    a = bit_or(upper, lower);
    return a;
#elif SIMDPP_USE_NEON
    if (shift == 0)
        return lower;
    if (shift == 16)
        return upper;
    return vextq_u8(lower.native(), upper.native(), shift % 16);
#elif SIMDPP_USE_ALTIVEC
    return vec_sld_biendian<shift>(lower, upper);
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_sld_b((v16i8)upper.native(),
                               (v16i8)lower.native(), shift);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned shift> SIMDPP_INL
uint8x32 i_align16(const uint8x32& lower, const uint8x32& upper)
{
    return _mm256_alignr_epi8(upper.native(), lower.native(), shift);
}
#endif

#if SIMDPP_USE_AVX512BW
template<unsigned shift> SIMDPP_INL
uint8<64> i_align16(const uint8<64>& lower, const uint8<64>& upper)
{
    return _mm512_alignr_epi8(upper.native(), lower.native(), shift);
}
#endif
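
// Note: _mm256_alignr_epi8 and _mm512_alignr_epi8 concatenate and shift
// within each 128-bit lane independently, which is exactly the block-wise
// behavior needed here: every 16-byte segment of { lower, upper } is
// aligned on its own.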

template<unsigned shift, unsigned N> SIMDPP_INL
uint8<N> i_align16(const uint8<N>& lower, const uint8<N>& upper)
{
    SIMDPP_VEC_ARRAY_IMPL2(uint8<N>, i_align16<shift>, lower, upper);
}
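
// A rough sketch of what SIMDPP_VEC_ARRAY_IMPL2 expands to (illustration
// only; see detail/vector_array_macros.h for the real definition):
//
//     uint8<N> r;
//     for (unsigned i = 0; i < uint8<N>::vec_length; ++i)
//         r.vec(i) = i_align16<shift>(lower.vec(i), upper.vec(i));
//     return r;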

// -----------------------------------------------------------------------------

template<unsigned shift> SIMDPP_INL
uint16<8> i_align8(const uint16<8>& lower, const uint16<8>& upper)
{
#if SIMDPP_USE_NULL
    uint16<8> r;
    // use int to avoid warnings about the comparison being always true/false
    for (int i = 0; i < (int)(8-shift); i++) {
        r.el(i) = lower.el(i + shift);
    }
    for (unsigned i = 8-shift; i < 8; i++) {
        r.el(i) = upper.el(i - 8 + shift);
    }
    return r;
#else
    // reuse the byte-wise implementation with the shift scaled to bytes
    return uint16<8>(i_align16<shift*2>(uint8<16>(lower),
                                        uint8<16>(upper)));
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned shift> SIMDPP_INL
uint16<16> i_align8(const uint16<16>& lower, const uint16<16>& upper)
{
    return _mm256_alignr_epi8(upper.native(), lower.native(), shift*2);
}
#endif

#if SIMDPP_USE_AVX512BW
template<unsigned shift> SIMDPP_INL
uint16<32> i_align8(const uint16<32>& lower, const uint16<32>& upper)
{
    return _mm512_alignr_epi8(upper.native(), lower.native(), shift*2);
}
#endif

template<unsigned shift, unsigned N> SIMDPP_INL
uint16<N> i_align8(const uint16<N>& lower, const uint16<N>& upper)
{
    SIMDPP_VEC_ARRAY_IMPL2(uint16<N>, i_align8<shift>, lower, upper);
}

// -----------------------------------------------------------------------------

template<unsigned shift> SIMDPP_INL
uint32x4 i_align4(const uint32x4& lower, const uint32x4& upper)
{
#if SIMDPP_USE_NULL
    uint32x4 r;
    // use int to avoid warnings about the comparison being always true/false
    for (int i = 0; i < (int)(4-shift); i++) {
        r.el(i) = lower.el(i + shift);
    }
    for (unsigned i = 4-shift; i < 4; i++) {
        r.el(i) = upper.el(i - 4 + shift);
    }
    return r;
#elif SIMDPP_USE_SSE2
    switch (shift) {
    default:
    case 0: return lower;
#if SIMDPP_USE_SSSE3
    case 1:
    case 2:
    case 3: return _mm_alignr_epi8(upper.native(), lower.native(), shift*4);
#else
    // without SSSE3, shift == 2 maps to a single shuffle, while the odd
    // shifts need two element moves combined with a bitwise OR
    case 2: return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(lower.native()),
                                                   _mm_castsi128_ps(upper.native()),
                                                   SIMDPP_SHUFFLE_MASK_4x4(2,3,0,1)));
    case 1:
    case 3: return bit_or(move4_l<shift>(lower),
                          move4_r<4-shift>(upper));
#endif
    case 4: return upper;
    }
#elif SIMDPP_USE_NEON
    if (shift == 0)
        return lower;
    if (shift == 4)
        return upper;
    // modulo keeps the element index valid even when the dead shift == 4
    // branch is instantiated
    return vextq_u32(lower.native(), upper.native(), shift % 4);
#elif SIMDPP_USE_ALTIVEC
    return (uint32<4>) vec_sld_biendian<shift*4>((uint8<16>)lower, (uint8<16>)upper);
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_sld_b((v16i8)upper.native(),
                               (v16i8)lower.native(), shift*4);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned shift> SIMDPP_INL
uint32<8> i_align4(const uint32<8>& lower, const uint32<8>& upper)
{
    return _mm256_alignr_epi8(upper.native(), lower.native(), shift*4);
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned shift> SIMDPP_INL
uint32<16> i_align4(const uint32<16>& lower, const uint32<16>& upper)
{
    // _mm512_alignr_epi32 shifts across the entire 512-bit vector rather
    // than within each 128-bit lane, so shuffle4x2 is used instead
    switch (shift) {
    default:
    case 0: return lower;
    case 1: return shuffle4x2<1,2,3,4>(lower, upper);
    case 2: return shuffle4x2<2,3,4,5>(lower, upper);
    case 3: return shuffle4x2<3,4,5,6>(lower, upper);
    case 4: return upper;
    }
}
#endif
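
// Sketch of the distinction mentioned above (illustration only):
// _mm512_alignr_epi32(b.native(), a.native(), 1) would yield
// { a1, ..., a15, b0 }, i.e. the shift crosses 128-bit lane boundaries,
// whereas i_align4<1> must yield { a1, a2, a3, b0 } independently within
// every 128-bit lane.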

template<unsigned shift, unsigned N> SIMDPP_INL
uint32<N> i_align4(const uint32<N>& lower, const uint32<N>& upper)
{
    SIMDPP_VEC_ARRAY_IMPL2(uint32<N>, i_align4<shift>, lower, upper);
}

// -----------------------------------------------------------------------------

template<unsigned shift> SIMDPP_INL
uint64x2 i_align2(const uint64x2& lower, const uint64x2& upper)
{
#if SIMDPP_USE_SSE2
    switch (shift) {
    default:
    case 0: return lower;
    case 1: return _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(lower.native()),
                                                   _mm_castsi128_pd(upper.native()),
                                                   SIMDPP_SHUFFLE_MASK_2x2(1,0)));
    case 2: return upper;
    }
#elif SIMDPP_USE_NEON
    if (shift == 0)
        return lower;
    if (shift == 2)
        return upper;
    return vextq_u64(lower.native(), upper.native(), shift % 2);
#elif SIMDPP_USE_VSX_207
    return (uint64<2>) vec_sld_biendian<shift*8>((uint8<16>) lower,
                                                 (uint8<16>) upper);
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    uint64x2 r;
    // use int to avoid warnings about the comparison being always true/false
    for (int i = 0; i < (int)(2-shift); i++) {
        r.el(i) = lower.el(i + shift);
    }
    for (unsigned i = 2-shift; i < 2; i++) {
        r.el(i) = upper.el(i - 2 + shift);
    }
    return r;
#elif SIMDPP_USE_MSA
    return (v2u64) __msa_sld_b((v16i8) upper.native(),
                               (v16i8) lower.native(), shift*8);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned shift> SIMDPP_INL
uint64<4> i_align2(const uint64<4>& lower, const uint64<4>& upper)
{
    return _mm256_alignr_epi8(upper.native(), lower.native(), shift*8);
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned shift> SIMDPP_INL
uint64<8> i_align2(const uint64<8>& lower, const uint64<8>& upper)
{
    switch (shift) {
    default:
    case 0: return lower;
    case 1: return shuffle2x2<1,2>(lower, upper);
    case 2: return upper;
    }
}
#endif

template<unsigned shift, unsigned N> SIMDPP_INL
uint64<N> i_align2(const uint64<N>& lower, const uint64<N>& upper)
{
    SIMDPP_VEC_ARRAY_IMPL2(uint64<N>, i_align2<shift>, lower, upper);
}

// -----------------------------------------------------------------------------

template<unsigned shift> SIMDPP_INL
float32x4 i_align4(const float32x4& lower, const float32x4& upper)
{
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    float32x4 r;
    // use int to avoid warnings about the comparison being always true/false
    for (int i = 0; i < (int)(4-shift); i++) {
        r.el(i) = lower.el(i + shift);
    }
    for (unsigned i = 4-shift; i < 4; i++) {
        r.el(i) = upper.el(i - 4 + shift);
    }
    return r;
#elif SIMDPP_USE_SSE2
    switch (shift) {
    default:
    case 0: return lower;
#if SIMDPP_USE_SSSE3
    case 1:
    case 3: {
        __m128i res = _mm_alignr_epi8(_mm_castps_si128(upper.native()),
                                      _mm_castps_si128(lower.native()), shift*4);
        return _mm_castsi128_ps(res);
    }
#else
    case 1:
    case 3: return bit_or(move4_l<shift>(lower),
                          move4_r<4-shift>(upper));
#endif
    // shift == 2 stays in the floating-point domain with a single shuffle
    case 2: return _mm_shuffle_ps(lower.native(), upper.native(),
                                  SIMDPP_SHUFFLE_MASK_4x4(2,3,0,1));
    case 4: return upper;
    }
#elif SIMDPP_USE_NEON_FLT_SP
    if (shift == 0)
        return lower;
    if (shift == 4)
        return upper;
    // modulo keeps the element index valid even when the dead shift == 4
    // branch is instantiated
    return vextq_f32(lower.native(), upper.native(), shift % 4);
#elif SIMDPP_USE_ALTIVEC
    return (float32<4>) vec_sld_biendian<shift*4>((uint8<16>)lower, (uint8<16>)upper);
#elif SIMDPP_USE_MSA
    return (v4f32) __msa_sld_b((v16i8)upper.native(),
                               (v16i8)lower.native(), shift*4);
#endif
}

#if SIMDPP_USE_AVX
template<unsigned shift> SIMDPP_INL
float32<8> i_align4(const float32<8>& lower, const float32<8>& upper)
{
    switch (shift) {
    default:
    case 0: return lower;
#if SIMDPP_USE_AVX2
    case 1:
    case 3: {
        __m256i res = _mm256_alignr_epi8(_mm256_castps_si256(upper.native()),
                                         _mm256_castps_si256(lower.native()), shift*4);
        return _mm256_castsi256_ps(res);
    }
#else
    case 1: return shuffle4x2<1,2,3,4>(lower, upper);
    case 3: return shuffle4x2<3,4,5,6>(lower, upper);
#endif
    case 2: return _mm256_shuffle_ps(lower.native(), upper.native(),
                                     SIMDPP_SHUFFLE_MASK_4x4(2,3,0,1));
    case 4: return upper;
    }
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned shift> SIMDPP_INL
float32<16> i_align4(const float32<16>& lower, const float32<16>& upper)
{
    switch (shift) {
    default:
    case 0: return lower;
    case 1: return shuffle4x2<1,2,3,4>(lower, upper);
    case 2: return _mm512_shuffle_ps(lower.native(), upper.native(),
                                     SIMDPP_SHUFFLE_MASK_4x4(2,3,0,1));
    case 3: return shuffle4x2<3,4,5,6>(lower, upper);
    case 4: return upper;
    }
}
#endif

template<unsigned shift, unsigned N> SIMDPP_INL
float32<N> i_align4(const float32<N>& lower, const float32<N>& upper)
{
    SIMDPP_VEC_ARRAY_IMPL2(float32<N>, i_align4<shift>, lower, upper);
}

// -----------------------------------------------------------------------------

template<unsigned shift> SIMDPP_INL
float64x2 i_align2(const float64x2& lower, const float64x2& upper)
{
#if SIMDPP_USE_SSE2
    switch (shift) {
    default:
    case 0: return lower;
    case 1: return _mm_shuffle_pd(lower.native(), upper.native(),
                                  SIMDPP_SHUFFLE_MASK_2x2(1, 0));
    case 2: return upper;
    }
#elif SIMDPP_USE_NEON64
    if (shift == 0)
        return lower;
    if (shift == 2)
        return upper;
    // modulo keeps the element index valid even when the dead shift == 2
    // branch is instantiated
    return vextq_f64(lower.native(), upper.native(), shift % 2);
#elif SIMDPP_USE_VSX_206
    return (float64<2>) vec_sld_biendian<shift*8>((uint8<16>)lower,
                                                  (uint8<16>)upper);
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC || SIMDPP_USE_NEON32
    float64x2 r;
    // use int to avoid warnings about the comparison being always true/false
    for (int i = 0; i < (int)(2-shift); i++) {
        r.el(i) = lower.el(i + shift);
    }
    for (unsigned i = 2-shift; i < 2; i++) {
        r.el(i) = upper.el(i - 2 + shift);
    }
    return r;
#elif SIMDPP_USE_MSA
    return (v2f64) __msa_sld_b((v16i8) upper.native(),
                               (v16i8) lower.native(), shift*8);
#else
    return SIMDPP_NOT_IMPLEMENTED_TEMPLATE2(float64<shift+4>, lower, upper);
#endif
}

#if SIMDPP_USE_AVX
template<unsigned shift> SIMDPP_INL
float64<4> i_align2(const float64<4>& lower, const float64<4>& upper)
{
    switch (shift) {
    default:
    case 0: return lower;
    case 1: return _mm256_shuffle_pd(lower.native(), upper.native(),
                                     SIMDPP_SHUFFLE_MASK_2x2_2(1, 0));
    case 2: return upper;
    }
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned shift> SIMDPP_INL
float64<8> i_align2(const float64<8>& lower, const float64<8>& upper)
{
    switch (shift) {
    default:
    case 0: return lower;
    case 1: return _mm512_shuffle_pd(lower.native(), upper.native(),
                                     SIMDPP_SHUFFLE_MASK_2x2_4(1, 0));
    case 2: return upper;
    }
}
#endif

template<unsigned shift, unsigned N> SIMDPP_INL
float64<N> i_align2(const float64<N>& lower, const float64<N>& upper)
{
    SIMDPP_VEC_ARRAY_IMPL2(float64<N>, i_align2<shift>, lower, upper);
}
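
// A possible usage sketch (illustration only; the public align* wrappers
// that forward to these helpers are typically used to form sliding windows
// over consecutive loads):
//
//     uint32<4> a = load(p);           // elements 0..3
//     uint32<4> b = load(p + 4);       // elements 4..7
//     uint32<4> w = i_align4<1>(a, b); // elements 1..4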

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif