1 | /* Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_PACKED4_H |
9 | #define LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_PACKED4_H |
10 | |
11 | #ifndef LIBSIMDPP_SIMD_H |
12 | #error "This file must be included through simd.h" |
13 | #endif |
14 | |
15 | #include <simdpp/types.h> |
16 | #include <simdpp/detail/align.h> |
17 | #include <simdpp/detail/insn/mem_pack.h> |
18 | #include <simdpp/core/store.h> |
19 | #include <simdpp/detail/null/memory.h> |
20 | |
21 | namespace simdpp { |
22 | namespace SIMDPP_ARCH_NAMESPACE { |
23 | namespace detail { |
24 | namespace insn { |
25 | |
26 | |
// Forward declarations of the generic interleave-and-store helpers that are
// defined at the bottom of this file. The numeric prefix is the vector width
// in bits; all of them write four vectors' worth of bytes starting at p.
template<class V> SIMDPP_INL
void v128_store_pack4(char* p, const V& ca, const V& cb, const V& cc, const V& dd);
template<class V> SIMDPP_INL
void v256_store_pack4(char* p, const V& ca, const V& cb, const V& cc, const V& dd);
template<class V> SIMDPP_INL
void v512_store_pack4(char* p, const V& ca, const V& cb, const V& cc, const V& dd);
34 | |
35 | // ----------------------------------------------------------------------------- |
36 | |
// Stores the elements of four uint8x16 vectors to p in interleaved order:
// a[0], b[0], c[0], d[0], a[1], b[1], ... . p must be 16-byte aligned.
static SIMDPP_INL
void i_store_packed4(char* p,
                     const uint8x16& a, const uint8x16& b,
                     const uint8x16& c, const uint8x16& d)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    detail::null::store_packed4(p, a, b, c, d);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    v128_store_pack4(p, a, b, c, d);
#elif SIMDPP_USE_NEON
    // NEON has a dedicated 4-way interleaving store instruction
    uint8x16x4_t t;
    t.val[0] = a.native();
    t.val[1] = b.native();
    t.val[2] = c.native();
    t.val[3] = d.native();
    vst4q_u8(reinterpret_cast<uint8_t*>(p), t);
#endif
}
56 | |
#if SIMDPP_USE_AVX2
// 256-bit uint8 overload: forwards to the generic 256-bit pack-and-store path.
static SIMDPP_INL
void i_store_packed4(char* p,
                     const uint8x32& a, const uint8x32& b,
                     const uint8x32& c, const uint8x32& d)
{
    v256_store_pack4(p, a, b, c, d);
}
#endif
66 | |
#if SIMDPP_USE_AVX512BW
// 512-bit uint8 overload: forwards to the generic 512-bit pack-and-store path.
static SIMDPP_INL
void i_store_packed4(char* p,
                     const uint8<64>& a, const uint8<64>& b,
                     const uint8<64>& c, const uint8<64>& d)
{
    v512_store_pack4(p, a, b, c, d);
}
#endif
76 | |
77 | // ----------------------------------------------------------------------------- |
78 | |
// Stores the elements of four uint16x8 vectors to p in interleaved order:
// a[0], b[0], c[0], d[0], a[1], b[1], ... . p must be 16-byte aligned.
static SIMDPP_INL
void i_store_packed4(char* p,
                     const uint16x8& a, const uint16x8& b,
                     const uint16x8& c, const uint16x8& d)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    detail::null::store_packed4(p, a, b, c, d);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    v128_store_pack4(p, a, b, c, d);
#elif SIMDPP_USE_NEON
    // NEON has a dedicated 4-way interleaving store instruction
    uint16x8x4_t t;
    t.val[0] = a.native();
    t.val[1] = b.native();
    t.val[2] = c.native();
    t.val[3] = d.native();
    vst4q_u16(reinterpret_cast<uint16_t*>(p), t);
#endif
}
98 | |
#if SIMDPP_USE_AVX2
// 256-bit uint16 overload: forwards to the generic 256-bit pack-and-store path.
static SIMDPP_INL
void i_store_packed4(char* p,
                     const uint16x16& a, const uint16x16& b,
                     const uint16x16& c, const uint16x16& d)
{
    v256_store_pack4(p, a, b, c, d);
}
#endif
108 | |
#if SIMDPP_USE_AVX512BW
// 512-bit uint16 overload: forwards to the generic 512-bit pack-and-store path.
static SIMDPP_INL
void i_store_packed4(char* p,
                     const uint16<32>& a, const uint16<32>& b,
                     const uint16<32>& c, const uint16<32>& d)
{
    v512_store_pack4(p, a, b, c, d);
}
#endif
118 | |
119 | // ----------------------------------------------------------------------------- |
120 | |
// Stores the elements of four uint32x4 vectors to p in interleaved order:
// a[0], b[0], c[0], d[0], a[1], b[1], ... . p must be 16-byte aligned.
static SIMDPP_INL
void i_store_packed4(char* p,
                     const uint32x4& a, const uint32x4& b,
                     const uint32x4& c, const uint32x4& d)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    detail::null::store_packed4(p, a, b, c, d);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    v128_store_pack4(p, a, b, c, d);
#elif SIMDPP_USE_NEON
    // NEON has a dedicated 4-way interleaving store instruction
    uint32x4x4_t t;
    t.val[0] = a.native();
    t.val[1] = b.native();
    t.val[2] = c.native();
    t.val[3] = d.native();
    vst4q_u32(reinterpret_cast<uint32_t*>(p), t);
#endif
}
140 | |
#if SIMDPP_USE_AVX2
// 256-bit uint32 overload: forwards to the generic 256-bit pack-and-store path.
static SIMDPP_INL
void i_store_packed4(char* p,
                     const uint32x8& a, const uint32x8& b,
                     const uint32x8& c, const uint32x8& d)
{
    v256_store_pack4(p, a, b, c, d);
}
#endif
150 | |
#if SIMDPP_USE_AVX512F
// 512-bit uint32 overload: forwards to the generic 512-bit pack-and-store path.
static SIMDPP_INL
void i_store_packed4(char* p,
                     const uint32<16>& a, const uint32<16>& b,
                     const uint32<16>& c, const uint32<16>& d)
{
    v512_store_pack4(p, a, b, c, d);
}
#endif
160 | |
161 | // ----------------------------------------------------------------------------- |
162 | |
163 | static SIMDPP_INL |
164 | void i_store_packed4(char* p, |
165 | const uint64x2& a, const uint64x2& b, |
166 | const uint64x2& c, const uint64x2& d) |
167 | { |
168 | #if SIMDPP_USE_NEON64 |
169 | uint64x2x4_t t; |
170 | t.val[0] = a.native(); |
171 | t.val[1] = b.native(); |
172 | t.val[2] = c.native(); |
173 | t.val[3] = d.native(); |
174 | vst4q_u64(reinterpret_cast<uint64_t*>(p), t); |
175 | #elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA |
176 | v128_store_pack4(p, a, b, c, d); |
177 | #elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC |
178 | detail::null::store_packed4(p, a, b, c, d); |
179 | #endif |
180 | } |
181 | |
#if SIMDPP_USE_AVX2
// 256-bit uint64 overload: forwards to the generic 256-bit pack-and-store path.
static SIMDPP_INL
void i_store_packed4(char* p,
                     const uint64x4& a, const uint64x4& b,
                     const uint64x4& c, const uint64x4& d)
{
    v256_store_pack4(p, a, b, c, d);
}
#endif
191 | |
#if SIMDPP_USE_AVX512F
// 512-bit uint64 overload: forwards to the generic 512-bit pack-and-store path.
static SIMDPP_INL
void i_store_packed4(char* p,
                     const uint64<8>& a, const uint64<8>& b,
                     const uint64<8>& c, const uint64<8>& d)
{
    v512_store_pack4(p, a, b, c, d);
}
#endif
201 | |
202 | // ----------------------------------------------------------------------------- |
203 | |
// Stores the elements of four float32x4 vectors to p in interleaved order:
// a[0], b[0], c[0], d[0], a[1], b[1], ... . p must be 16-byte aligned.
static SIMDPP_INL
void i_store_packed4(char* p,
                     const float32x4& a, const float32x4& b,
                     const float32x4& c, const float32x4& d)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    // No usable single-precision vector unit; fall back to the scalar path
    detail::null::store_packed4(p, a, b, c, d);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    v128_store_pack4(p, a, b, c, d);
#elif SIMDPP_USE_NEON
    // NEON has a dedicated 4-way interleaving store instruction
    float32x4x4_t t;
    t.val[0] = a.native();
    t.val[1] = b.native();
    t.val[2] = c.native();
    t.val[3] = d.native();
    vst4q_f32(reinterpret_cast<float*>(p), t);
#endif
}
223 | |
#if SIMDPP_USE_AVX
// 256-bit float32 overload: forwards to the generic 256-bit pack-and-store path.
static SIMDPP_INL
void i_store_packed4(char* p,
                     const float32x8& a, const float32x8& b,
                     const float32x8& c, const float32x8& d)
{
    v256_store_pack4(p, a, b, c, d);
}
#endif
233 | |
#if SIMDPP_USE_AVX512F
// 512-bit float32 overload: forwards to the generic 512-bit pack-and-store path.
static SIMDPP_INL
void i_store_packed4(char* p,
                     const float32<16>& a, const float32<16>& b,
                     const float32<16>& c, const float32<16>& d)
{
    v512_store_pack4(p, a, b, c, d);
}
#endif
243 | |
244 | // ----------------------------------------------------------------------------- |
245 | |
// Stores the elements of four float64x2 vectors to p in interleaved order:
// a[0], b[0], c[0], d[0], a[1], b[1], ... . p must be 16-byte aligned.
static SIMDPP_INL
void i_store_packed4(char* p,
                     const float64x2& a, const float64x2& b,
                     const float64x2& c, const float64x2& d)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_SSE2 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
    v128_store_pack4(p, a, b, c, d);
#elif SIMDPP_USE_NEON64
    // AArch64 has a dedicated 4-way interleaving store instruction
    float64x2x4_t t;
    t.val[0] = a.native();
    t.val[1] = b.native();
    t.val[2] = c.native();
    t.val[3] = d.native();
    vst4q_f64(reinterpret_cast<double*>(p), t);
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC
    // No native double-precision vectors here; fall back to the scalar path
    detail::null::store_packed4(p, a, b, c, d);
#endif
}
265 | |
#if SIMDPP_USE_AVX
// 256-bit float64 overload: forwards to the generic 256-bit pack-and-store path.
static SIMDPP_INL
void i_store_packed4(char* p,
                     const float64x4& a, const float64x4& b,
                     const float64x4& c, const float64x4& d)
{
    v256_store_pack4(p, a, b, c, d);
}
#endif
275 | |
#if SIMDPP_USE_AVX512F
// 512-bit float64 overload: forwards to the generic 512-bit pack-and-store path.
static SIMDPP_INL
void i_store_packed4(char* p,
                     const float64<8>& a, const float64<8>& b,
                     const float64<8>& c, const float64<8>& d)
{
    v512_store_pack4(p, a, b, c, d);
}
#endif
285 | |
286 | // ----------------------------------------------------------------------------- |
287 | |
288 | template<class V> SIMDPP_INL |
289 | void v128_store_pack4(char* p, const V& ca, const V& cb, const V& cc, const V& dd) |
290 | { |
291 | p = detail::assume_aligned(p, 16); |
292 | V a = ca, b = cb, c = cc, d = dd; |
293 | mem_pack4(a, b, c, d); |
294 | i_store(p, a); |
295 | i_store(p + 16, b); |
296 | i_store(p + 32, c); |
297 | i_store(p + 48, d); |
298 | } |
299 | |
300 | template<class V> SIMDPP_INL |
301 | void v256_store_pack4(char* p, const V& ca, const V& cb, const V& cc, const V& dd) |
302 | { |
303 | p = detail::assume_aligned(p, 32); |
304 | V a = ca, b = cb, c = cc, d = dd; |
305 | mem_pack4(a, b, c, d); |
306 | i_store(p, a); |
307 | i_store(p + 32, b); |
308 | i_store(p + 64, c); |
309 | i_store(p + 96, d); |
310 | } |
311 | |
312 | template<class V> SIMDPP_INL |
313 | void v512_store_pack4(char* p, const V& ca, const V& cb, const V& cc, const V& dd) |
314 | { |
315 | p = detail::assume_aligned(p, 64); |
316 | V a = ca, b = cb, c = cc, d = dd; |
317 | mem_pack4(a, b, c, d); |
318 | i_store(p, a); |
319 | i_store(p + 64, b); |
320 | i_store(p + 128, c); |
321 | i_store(p + 192, d); |
322 | } |
323 | |
324 | template<class V> SIMDPP_INL |
325 | void i_store_packed4(char* p, const V& ca, const V& cb, const V& cc, const V& dd) |
326 | { |
327 | const unsigned veclen = V::base_vector_type::length_bytes; |
328 | typename detail::remove_sign<V>::type a = ca, b = cb, c = cc, d = dd; |
329 | |
330 | p = detail::assume_aligned(p, veclen); |
331 | for (unsigned i = 0; i < V::vec_length; ++i) { |
332 | i_store_packed4(p, a.vec(i), b.vec(i), c.vec(i), d.vec(i)); |
333 | p += veclen*4; |
334 | } |
335 | } |
336 | |
337 | } // namespace insn |
338 | } // namespace detail |
339 | } // namespace SIMDPP_ARCH_NAMESPACE |
340 | } // namespace simdpp |
341 | |
342 | #endif |
343 | |