1 | /* Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_PACKED2_H |
9 | #define LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_PACKED2_H |
10 | |
11 | #ifndef LIBSIMDPP_SIMD_H |
12 | #error "This file must be included through simd.h" |
13 | #endif |
14 | |
15 | #include <simdpp/types.h> |
16 | #include <simdpp/detail/align.h> |
17 | #include <simdpp/detail/insn/mem_pack.h> |
18 | #include <simdpp/core/store.h> |
19 | #include <simdpp/detail/null/memory.h> |
20 | |
21 | namespace simdpp { |
22 | namespace SIMDPP_ARCH_NAMESPACE { |
23 | namespace detail { |
24 | namespace insn { |
25 | |
// Forward declarations of the shared helpers that interleave a pair of
// vectors in registers (via mem_pack2) and store the result; the
// definitions are at the bottom of this file, after the per-type
// i_store_packed2 overloads that use them.
template<class V> SIMDPP_INL
void v128_store_pack2(char* p, const V& ca, const V& cb);
template<class V> SIMDPP_INL
void v256_store_pack2(char* p, const V& ca, const V& cb);
template<class V> SIMDPP_INL
void v512_store_pack2(char* p, const V& ca, const V& cb);
33 | |
34 | // ----------------------------------------------------------------------------- |
35 | |
// Stores the elements of 'a' and 'b' 2-way interleaved
// (a[0], b[0], a[1], b[1], ...) into the 32 bytes starting at 'p'.
// 'p' must be 16-byte aligned (asserted via assume_aligned, not checked).
static SIMDPP_INL
void i_store_packed2(char* p, const uint8x16& a, const uint8x16& b)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    // Scalar reference implementation
    detail::null::store_packed2(p, a, b);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // Generic path: interleave in registers, then store
    v128_store_pack2(p, a, b);
#elif SIMDPP_USE_NEON
    // NEON has a native 2-way interleaving store (vst2)
    uint8x16x2_t t;
    t.val[0] = a.native();
    t.val[1] = b.native();
    vst2q_u8(reinterpret_cast<uint8_t*>(p), t);
#endif
}
51 | |
#if SIMDPP_USE_AVX2
// 256-bit variant: delegates to the generic interleave-and-store helper.
static SIMDPP_INL
void i_store_packed2(char* p, const uint8x32& a, const uint8x32& b)
{
    v256_store_pack2(p, a, b);
}
#endif
59 | |
#if SIMDPP_USE_AVX512BW
// 512-bit variant: delegates to the generic interleave-and-store helper.
// NOTE(review): siblings declare these overloads 'static SIMDPP_INL'; this
// one omits 'static' — presumably intentional, but worth confirming.
SIMDPP_INL void i_store_packed2(char* p, const uint8<64>& a, const uint8<64>& b)
{
    v512_store_pack2(p, a, b);
}
#endif
66 | |
67 | // ----------------------------------------------------------------------------- |
68 | |
// Stores the elements of 'a' and 'b' 2-way interleaved
// (a[0], b[0], a[1], b[1], ...) into the 32 bytes starting at 'p'.
// 'p' must be 16-byte aligned (asserted via assume_aligned, not checked).
static SIMDPP_INL
void i_store_packed2(char* p, const uint16x8& a, const uint16x8& b)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    // Scalar reference implementation
    detail::null::store_packed2(p, a, b);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // Generic path: interleave in registers, then store
    v128_store_pack2(p, a, b);
#elif SIMDPP_USE_NEON
    // NEON has a native 2-way interleaving store (vst2)
    uint16x8x2_t t;
    t.val[0] = a.native();
    t.val[1] = b.native();
    vst2q_u16(reinterpret_cast<uint16_t*>(p), t);
#endif
}
84 | |
#if SIMDPP_USE_AVX2
// 256-bit variant: delegates to the generic interleave-and-store helper.
static SIMDPP_INL
void i_store_packed2(char* p, const uint16x16& a, const uint16x16& b)
{
    v256_store_pack2(p, a, b);
}
#endif
92 | |
#if SIMDPP_USE_AVX512BW
// 512-bit variant: delegates to the generic interleave-and-store helper.
// NOTE(review): omits 'static' unlike the sibling overloads — confirm.
SIMDPP_INL void i_store_packed2(char* p, const uint16<32>& a, const uint16<32>& b)
{
    v512_store_pack2(p, a, b);
}
#endif
99 | |
100 | // ----------------------------------------------------------------------------- |
101 | |
// Stores the elements of 'a' and 'b' 2-way interleaved
// (a[0], b[0], a[1], b[1], ...) into the 32 bytes starting at 'p'.
// 'p' must be 16-byte aligned (asserted via assume_aligned, not checked).
static SIMDPP_INL
void i_store_packed2(char* p, const uint32x4& a, const uint32x4& b)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    // Scalar reference implementation
    detail::null::store_packed2(p, a, b);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // Generic path: interleave in registers, then store
    v128_store_pack2(p, a, b);
#elif SIMDPP_USE_NEON
    // NEON has a native 2-way interleaving store (vst2)
    uint32x4x2_t t;
    t.val[0] = a.native();
    t.val[1] = b.native();
    vst2q_u32(reinterpret_cast<uint32_t*>(p), t);
#endif
}
117 | |
#if SIMDPP_USE_AVX2
// 256-bit variant: delegates to the generic interleave-and-store helper.
static SIMDPP_INL
void i_store_packed2(char* p, const uint32x8& a, const uint32x8& b)
{
    v256_store_pack2(p, a, b);
}
#endif
125 | |
#if SIMDPP_USE_AVX512F
// 512-bit variant: delegates to the generic interleave-and-store helper.
static SIMDPP_INL
void i_store_packed2(char* p, const uint32<16>& a, const uint32<16>& b)
{
    v512_store_pack2(p, a, b);
}
#endif
133 | |
134 | // ----------------------------------------------------------------------------- |
135 | |
136 | static SIMDPP_INL |
137 | void i_store_packed2(char* p, const uint64x2& a, const uint64x2& b) |
138 | { |
139 | #if SIMDPP_USE_NEON64 |
140 | uint64x2x2_t t; |
141 | t.val[0] = a.native(); |
142 | t.val[1] = b.native(); |
143 | vst2q_u64(reinterpret_cast<uint64_t*>(p), t); |
144 | #elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA |
145 | v128_store_pack2(p, a, b); |
146 | #elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC |
147 | detail::null::store_packed2(p, a, b); |
148 | #endif |
149 | } |
150 | |
#if SIMDPP_USE_AVX2
// 256-bit variant: delegates to the generic interleave-and-store helper.
static SIMDPP_INL
void i_store_packed2(char* p, const uint64x4& a, const uint64x4& b)
{
    v256_store_pack2(p, a, b);
}
#endif
158 | |
#if SIMDPP_USE_AVX512F
// 512-bit variant: delegates to the generic interleave-and-store helper.
static SIMDPP_INL
void i_store_packed2(char* p, const uint64<8>& a, const uint64<8>& b)
{
    v512_store_pack2(p, a, b);
}
#endif
166 | |
167 | // ----------------------------------------------------------------------------- |
168 | |
// Stores the elements of 'a' and 'b' 2-way interleaved
// (a[0], b[0], a[1], b[1], ...) into the 32 bytes starting at 'p'.
// 'p' must be 16-byte aligned (asserted via assume_aligned, not checked).
static SIMDPP_INL
void i_store_packed2(char* p, const float32x4& a, const float32x4& b)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    // Scalar fallback; also used when NEON lacks single-precision FP support
    detail::null::store_packed2(p, a, b);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // Generic path: interleave in registers, then store
    v128_store_pack2(p, a, b);
#elif SIMDPP_USE_NEON
    // NEON has a native 2-way interleaving store (vst2)
    float32x4x2_t t;
    t.val[0] = a.native();
    t.val[1] = b.native();
    vst2q_f32(reinterpret_cast<float*>(p), t);
#endif
}
184 | |
#if SIMDPP_USE_AVX
// 256-bit variant: delegates to the generic interleave-and-store helper.
static SIMDPP_INL
void i_store_packed2(char* p, const float32x8& a, const float32x8& b)
{
    v256_store_pack2(p, a, b);
}
#endif
192 | |
#if SIMDPP_USE_AVX512F
// 512-bit variant: delegates to the generic interleave-and-store helper.
static SIMDPP_INL
void i_store_packed2(char* p, const float32<16>& a, const float32<16>& b)
{
    v512_store_pack2(p, a, b);
}
#endif
200 | |
201 | // ----------------------------------------------------------------------------- |
202 | |
203 | static SIMDPP_INL |
204 | void i_store_packed2(char* p, const float64x2& a, const float64x2& b) |
205 | { |
206 | #if SIMDPP_USE_NEON64 |
207 | float64x2x2_t t; |
208 | t.val[0] = a.native(); |
209 | t.val[1] = b.native(); |
210 | vst2q_f64(reinterpret_cast<double*>(p), t); |
211 | #elif SIMDPP_USE_SSE2 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA |
212 | v128_store_pack2(p, a, b); |
213 | #elif SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC |
214 | detail::null::store_packed2(p, a, b); |
215 | #endif |
216 | } |
217 | |
#if SIMDPP_USE_AVX
// 256-bit variant: delegates to the generic interleave-and-store helper.
static SIMDPP_INL
void i_store_packed2(char* p, const float64x4& a, const float64x4& b)
{
    v256_store_pack2(p, a, b);
}
#endif
225 | |
#if SIMDPP_USE_AVX512F
// 512-bit variant: delegates to the generic interleave-and-store helper.
static SIMDPP_INL
void i_store_packed2(char* p, const float64<8>& a, const float64<8>& b)
{
    v512_store_pack2(p, a, b);
}
#endif
233 | |
234 | // ----------------------------------------------------------------------------- |
235 | |
236 | template<class V> SIMDPP_INL |
237 | void v128_store_pack2(char* p, const V& ca, const V& cb) |
238 | { |
239 | p = detail::assume_aligned(p, 32); |
240 | V a = ca, b = cb; |
241 | mem_pack2(a, b); |
242 | i_store(p, a); |
243 | i_store(p + 16, b); |
244 | } |
245 | |
246 | template<class V> SIMDPP_INL |
247 | void v256_store_pack2(char* p, const V& ca, const V& cb) |
248 | { |
249 | p = detail::assume_aligned(p, 32); |
250 | V a = ca, b = cb; |
251 | mem_pack2(a, b); |
252 | i_store(p, a); |
253 | i_store(p + 32, b); |
254 | } |
255 | |
256 | template<class V> SIMDPP_INL |
257 | void v512_store_pack2(char* p, const V& ca, const V& cb) |
258 | { |
259 | p = detail::assume_aligned(p, 32); |
260 | V a = ca, b = cb; |
261 | mem_pack2(a, b); |
262 | i_store(p, a); |
263 | i_store(p + 64, b); |
264 | } |
265 | |
266 | |
267 | template<class V> SIMDPP_INL |
268 | void i_store_packed2(char* p, const V& ca, const V& cb) |
269 | { |
270 | const unsigned veclen = V::base_vector_type::length_bytes; |
271 | typename detail::remove_sign<V>::type a = ca, b = cb; |
272 | |
273 | p = detail::assume_aligned(p, veclen); |
274 | for (unsigned i = 0; i < V::vec_length; ++i) { |
275 | i_store_packed2(p, a.vec(i), b.vec(i)); |
276 | p += veclen*2; |
277 | } |
278 | } |
279 | |
280 | } // namespace insn |
281 | } // namespace detail |
282 | } // namespace SIMDPP_ARCH_NAMESPACE |
283 | } // namespace simdpp |
284 | |
285 | #endif |
286 | |