1/* Copyright (C) 2013-2014 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_CORE_ALIGN_H
9#define LIBSIMDPP_SIMDPP_CORE_ALIGN_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/types.h>
16#include <simdpp/detail/insn/align.h>
17#include <simdpp/detail/get_expr.h>
18
19namespace simdpp {
20namespace SIMDPP_ARCH_NAMESPACE {
21
22
23/** Extracts a int8x16 vector from two concatenated int8x16 vectors
24
25 @code
26 shift: pos:| 0 1 . 14 15 |
27 0 r = [ l0 l1 . l14 l15 ]
28 1 r = [ l1 l2 . l15 u0 ]
29 2 r = [ l2 l3 . u0 l1 ]
30 ... .. .. .. ... .. ..
31 15 r = [ l15 u0 . u13 u14 ]
32 16 r = [ u0 u1 . u14 u15 ]
33 @endcode
34
35 @par 128-bit version:
36 @icost{SSE2-SSE3, 3}
37
38 @par 256-bit version:
39 The lower and higher 128-bit halves are processed as if 128-bit instruction
40 was applied to each of them separately.
41
42 @icost{SSE2-SSE3, 6}
43 @icost{SSSE3-AVX, NEON, ALTIVEC, 2}
44*/
45template<unsigned shift, unsigned N, class V1, class V2> SIMDPP_INL
46typename detail::get_expr2_nomask<V1, V2>::empty
47 align16(const any_vec8<N,V1>& lower,
48 const any_vec8<N,V2>& upper)
49{
50 static_assert(shift <= 16, "Shift out of bounds");
51 if (shift == 0) return lower.wrapped().eval();
52 if (shift == 16) return upper.wrapped().eval();
53
54 typename detail::get_expr2_nomask_nosign<V1, V2>::type qlower, qupper;
55 qlower = lower.wrapped().eval();
56 qupper = upper.wrapped().eval();
57 return detail::insn::i_align16<shift>(qlower, qupper);
58}
59
60/** Extracts a int16x8 vector from two concatenated int16x8 vectors
61
62 @code
63 shift: pos:| 0 1 . 6 7 |
64 0 r = [ l0 l1 . l6 l7 ]
65 1 r = [ l1 l2 . l7 u0 ]
66 2 r = [ l2 l3 . u0 l1 ]
67 ... .. .. .. ... .. ..
68 7 r = [ l3 u0 . u5 u6 ]
69 8 r = [ u0 u1 . u6 u7 ]
70 @endcode
71
72 @par 128-bit version:
73 @icost{SSE2-SSE3, 3}
74
75 @par 256-bit version:
76 @icost{SSE2-SSE3, 6}
77 @icost{SSSE3-AVX, NEON, ALTIVEC, 2}
78
79 The all 128-bit sub-vectors are processed as if 128-bit instruction
80 was applied to each of them separately.
81*/
82template<unsigned shift, unsigned N, class V1, class V2> SIMDPP_INL
83typename detail::get_expr2_nomask<V1, V2>::empty
84 align8(const any_vec16<N,V1>& lower,
85 const any_vec16<N,V2>& upper)
86{
87 static_assert(shift <= 8, "Shift out of bounds");
88 if (shift == 0) return lower.wrapped().eval();
89 if (shift == 8) return upper.wrapped().eval();
90
91 typename detail::get_expr2_nomask_nosign<V1, V2>::type qlower, qupper;
92 qlower = lower.wrapped().eval();
93 qupper = upper.wrapped().eval();
94 return detail::insn::i_align8<shift>(qlower, qupper);
95}
96
97/** Extracts a int32x4 vector from two concatenated int32x4 vectors
98
99 @code
100 shift: pos:| 0 1 2 3 |
101 0 r = [ l0 l1 l2 l3 ]
102 1 r = [ l1 l2 l3 u0 ]
103 2 r = [ l2 l3 u0 u1 ]
104 3 r = [ l3 u0 u1 u2 ]
105 4 r = [ u0 u1 u2 u3 ]
106 @endcode
107
108 @par int32
109
110 @par 128-bit version:
111 @icost{SSE2-SSE3, 3}
112
113 @par 256-bit version:
114 The lower and higher 128-bit halves are processed as if 128-bit instruction
115 was applied to each of them separately.
116
117 @icost{SSE2-SSE3, 6}
118 @icost{SSSE3-AVX, NEON, ALTIVEC, 2}
119
120 @par float32
121
122 @par 128-bit version:
123 @icost{SSE2-SSE3, 3}
124
125 @par 256-bit version:
126 The lower and higher 128-bit halves are processed as if 128-bit instruction
127 was applied to each of them separately.
128
129 @icost{SSE2-SSE3, 6}
130 @icost{SSSE3-SSE4.1 NEON, ALTIVEC, 2}
131*/
132template<unsigned shift, unsigned N, class V1, class V2> SIMDPP_INL
133typename detail::get_expr2_nomask<V1, V2>::empty
134 align4(const any_vec32<N,V1>& lower,
135 const any_vec32<N,V2>& upper)
136{
137 static_assert(shift <= 4, "Shift out of bounds");
138 if (shift == 0) return lower.wrapped().eval();
139 if (shift == 4) return upper.wrapped().eval();
140
141 typename detail::get_expr2_nomask_nosign<V1, V2>::type qlower, qupper;
142 qlower = lower.wrapped().eval();
143 qupper = upper.wrapped().eval();
144 return detail::insn::i_align4<shift>(qlower, qupper);
145}
146
147
148/** Extracts a int64x2 vector from two concatenated int64x2 vectors
149
150 @code
151 shift: pos:| 0 1 |
152 0 r = [ l0 l1 ]
153 1 r = [ l1 u0 ]
154 2 r = [ u0 u1 ]
155 @endcode
156
157 @par int64
158
159 @par 128-bit version:
160 @icost{SSE2-SSE3, 3}
161
162 @par 256-bit version:
163 The lower and higher 128-bit halves are processed as if 128-bit instruction
164 was applied to each of them separately.
165
166 @icost{SSE2-SSE3, 6}
167 @icost{SSSE3-AVX, NEON, ALTIVEC, 2}
168
169 @par float64
170
171 @par 128-bit version:
172 @icost{SSE2-SSE3, 3}
173
174 @par 256-bit version:
175 The lower and higher 128-bit halves are processed as if 128-bit instruction
176 was applied to each of them separately.
177
178 @icost{SSE2-SSE3, 6}
179 @icost{SSSE3-SSE4.1 NEON, ALTIVEC, 2}
180*/
181template<unsigned shift, unsigned N, class V1, class V2> SIMDPP_INL
182typename detail::get_expr2_nomask<V1, V2>::empty
183 align2(const any_vec64<N,V1>& lower,
184 const any_vec64<N,V2>& upper)
185{
186 static_assert(shift <= 2, "Shift out of bounds");
187 if (shift == 0) return lower.wrapped().eval();
188 if (shift == 2) return upper.wrapped().eval();
189
190 typename detail::get_expr2_nomask_nosign<V1, V2>::type qlower, qupper;
191 qlower = lower.wrapped().eval();
192 qupper = upper.wrapped().eval();
193 return detail::insn::i_align2<shift>(qlower, qupper);
194}
195
196} // namespace SIMDPP_ARCH_NAMESPACE
197} // namespace simdpp
198
199#endif
200
201