1/* Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_PACKED4_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_PACKED4_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/types.h>
16#include <simdpp/detail/align.h>
17#include <simdpp/detail/insn/mem_pack.h>
18#include <simdpp/core/store.h>
19#include <simdpp/detail/null/memory.h>
20
21namespace simdpp {
22namespace SIMDPP_ARCH_NAMESPACE {
23namespace detail {
24namespace insn {
25
26
27// collect some boilerplate
28template<class V> SIMDPP_INL
29void v128_store_pack4(char* p, const V& ca, const V& cb, const V& cc, const V& dd);
30template<class V> SIMDPP_INL
31void v256_store_pack4(char* p, const V& ca, const V& cb, const V& cc, const V& dd);
32template<class V> SIMDPP_INL
33void v512_store_pack4(char* p, const V& ca, const V& cb, const V& cc, const V& dd);
34
35// -----------------------------------------------------------------------------
36
// Stores four uint8x16 vectors to 16-byte aligned memory in interleaved
// order: p gets a[0], b[0], c[0], d[0], a[1], b[1], ... (64 bytes total).
static SIMDPP_INL
void i_store_packed4(char* p,
                     const uint8x16& a, const uint8x16& b,
                     const uint8x16& c, const uint8x16& d)
{
    p = detail::assume_aligned(p, 16); // contract: p must be 16-byte aligned
#if SIMDPP_USE_NULL
    // Scalar reference implementation
    detail::null::store_packed4(p, a, b, c, d);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // Generic path: shuffle in registers, then store contiguously
    v128_store_pack4(p, a, b, c, d);
#elif SIMDPP_USE_NEON
    // NEON has a dedicated 4-way interleaving store instruction
    uint8x16x4_t t;
    t.val[0] = a.native();
    t.val[1] = b.native();
    t.val[2] = c.native();
    t.val[3] = d.native();
    vst4q_u8(reinterpret_cast<uint8_t*>(p), t);
#endif
}
56
#if SIMDPP_USE_AVX2
// 256-bit uint8 variant: delegates to the generic 256-bit pack-and-store.
static SIMDPP_INL
void i_store_packed4(char* p,
                     const uint8x32& a, const uint8x32& b,
                     const uint8x32& c, const uint8x32& d)
{
    v256_store_pack4(p, a, b, c, d);
}
#endif
66
#if SIMDPP_USE_AVX512BW
// 512-bit uint8 variant (needs AVX512BW for 8-bit element shuffles).
static SIMDPP_INL
void i_store_packed4(char* p,
                     const uint8<64>& a, const uint8<64>& b,
                     const uint8<64>& c, const uint8<64>& d)
{
    v512_store_pack4(p, a, b, c, d);
}
#endif
76
77// -----------------------------------------------------------------------------
78
// Stores four uint16x8 vectors to 16-byte aligned memory in interleaved
// order: p gets a[0], b[0], c[0], d[0], a[1], ... (64 bytes total).
static SIMDPP_INL
void i_store_packed4(char* p,
                     const uint16x8& a, const uint16x8& b,
                     const uint16x8& c, const uint16x8& d)
{
    p = detail::assume_aligned(p, 16); // contract: p must be 16-byte aligned
#if SIMDPP_USE_NULL
    // Scalar reference implementation
    detail::null::store_packed4(p, a, b, c, d);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // Generic path: shuffle in registers, then store contiguously
    v128_store_pack4(p, a, b, c, d);
#elif SIMDPP_USE_NEON
    // NEON has a dedicated 4-way interleaving store instruction
    uint16x8x4_t t;
    t.val[0] = a.native();
    t.val[1] = b.native();
    t.val[2] = c.native();
    t.val[3] = d.native();
    vst4q_u16(reinterpret_cast<uint16_t*>(p), t);
#endif
}
98
#if SIMDPP_USE_AVX2
// 256-bit uint16 variant: delegates to the generic 256-bit pack-and-store.
static SIMDPP_INL
void i_store_packed4(char* p,
                     const uint16x16& a, const uint16x16& b,
                     const uint16x16& c, const uint16x16& d)
{
    v256_store_pack4(p, a, b, c, d);
}
#endif
108
#if SIMDPP_USE_AVX512BW
// 512-bit uint16 variant (needs AVX512BW for 16-bit element shuffles).
static SIMDPP_INL
void i_store_packed4(char* p,
                     const uint16<32>& a, const uint16<32>& b,
                     const uint16<32>& c, const uint16<32>& d)
{
    v512_store_pack4(p, a, b, c, d);
}
#endif
118
119// -----------------------------------------------------------------------------
120
// Stores four uint32x4 vectors to 16-byte aligned memory in interleaved
// order: p gets a[0], b[0], c[0], d[0], a[1], ... (64 bytes total).
static SIMDPP_INL
void i_store_packed4(char* p,
                     const uint32x4& a, const uint32x4& b,
                     const uint32x4& c, const uint32x4& d)
{
    p = detail::assume_aligned(p, 16); // contract: p must be 16-byte aligned
#if SIMDPP_USE_NULL
    // Scalar reference implementation
    detail::null::store_packed4(p, a, b, c, d);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // Generic path: shuffle in registers, then store contiguously
    v128_store_pack4(p, a, b, c, d);
#elif SIMDPP_USE_NEON
    // NEON has a dedicated 4-way interleaving store instruction
    uint32x4x4_t t;
    t.val[0] = a.native();
    t.val[1] = b.native();
    t.val[2] = c.native();
    t.val[3] = d.native();
    vst4q_u32(reinterpret_cast<uint32_t*>(p), t);
#endif
}
140
#if SIMDPP_USE_AVX2
// 256-bit uint32 variant: delegates to the generic 256-bit pack-and-store.
static SIMDPP_INL
void i_store_packed4(char* p,
                     const uint32x8& a, const uint32x8& b,
                     const uint32x8& c, const uint32x8& d)
{
    v256_store_pack4(p, a, b, c, d);
}
#endif
150
#if SIMDPP_USE_AVX512F
// 512-bit uint32 variant (AVX512F suffices for 32-bit element shuffles).
static SIMDPP_INL
void i_store_packed4(char* p,
                     const uint32<16>& a, const uint32<16>& b,
                     const uint32<16>& c, const uint32<16>& d)
{
    v512_store_pack4(p, a, b, c, d);
}
#endif
160
161// -----------------------------------------------------------------------------
162
163static SIMDPP_INL
164void i_store_packed4(char* p,
165 const uint64x2& a, const uint64x2& b,
166 const uint64x2& c, const uint64x2& d)
167{
168#if SIMDPP_USE_NEON64
169 uint64x2x4_t t;
170 t.val[0] = a.native();
171 t.val[1] = b.native();
172 t.val[2] = c.native();
173 t.val[3] = d.native();
174 vst4q_u64(reinterpret_cast<uint64_t*>(p), t);
175#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
176 v128_store_pack4(p, a, b, c, d);
177#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
178 detail::null::store_packed4(p, a, b, c, d);
179#endif
180}
181
#if SIMDPP_USE_AVX2
// 256-bit uint64 variant: delegates to the generic 256-bit pack-and-store.
static SIMDPP_INL
void i_store_packed4(char* p,
                     const uint64x4& a, const uint64x4& b,
                     const uint64x4& c, const uint64x4& d)
{
    v256_store_pack4(p, a, b, c, d);
}
#endif
191
#if SIMDPP_USE_AVX512F
// 512-bit uint64 variant (AVX512F suffices for 64-bit element shuffles).
static SIMDPP_INL
void i_store_packed4(char* p,
                     const uint64<8>& a, const uint64<8>& b,
                     const uint64<8>& c, const uint64<8>& d)
{
    v512_store_pack4(p, a, b, c, d);
}
#endif
201
202// -----------------------------------------------------------------------------
203
// Stores four float32x4 vectors to 16-byte aligned memory in interleaved
// order: p gets a[0], b[0], c[0], d[0], a[1], ... (64 bytes total).
static SIMDPP_INL
void i_store_packed4(char* p,
                     const float32x4& a, const float32x4& b,
                     const float32x4& c, const float32x4& d)
{
    p = detail::assume_aligned(p, 16); // contract: p must be 16-byte aligned
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    // Scalar path; also used when NEON lacks single-precision FP support
    detail::null::store_packed4(p, a, b, c, d);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // Generic path: shuffle in registers, then store contiguously
    v128_store_pack4(p, a, b, c, d);
#elif SIMDPP_USE_NEON
    // NEON has a dedicated 4-way interleaving store instruction
    float32x4x4_t t;
    t.val[0] = a.native();
    t.val[1] = b.native();
    t.val[2] = c.native();
    t.val[3] = d.native();
    vst4q_f32(reinterpret_cast<float*>(p), t);
#endif
}
223
#if SIMDPP_USE_AVX
// 256-bit float32 variant (AVX provides 256-bit FP ops; AVX2 not required).
static SIMDPP_INL
void i_store_packed4(char* p,
                     const float32x8& a, const float32x8& b,
                     const float32x8& c, const float32x8& d)
{
    v256_store_pack4(p, a, b, c, d);
}
#endif
233
#if SIMDPP_USE_AVX512F
// 512-bit float32 variant: delegates to the generic 512-bit pack-and-store.
static SIMDPP_INL
void i_store_packed4(char* p,
                     const float32<16>& a, const float32<16>& b,
                     const float32<16>& c, const float32<16>& d)
{
    v512_store_pack4(p, a, b, c, d);
}
#endif
243
244// -----------------------------------------------------------------------------
245
// Stores four float64x2 vectors to 16-byte aligned memory in interleaved
// order: p gets a[0], b[0], c[0], d[0], a[1], ... (64 bytes total).
static SIMDPP_INL
void i_store_packed4(char* p,
                     const float64x2& a, const float64x2& b,
                     const float64x2& c, const float64x2& d)
{
    p = detail::assume_aligned(p, 16); // contract: p must be 16-byte aligned
#if SIMDPP_USE_SSE2 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
    // Generic path: shuffle in registers, then store contiguously
    v128_store_pack4(p, a, b, c, d);
#elif SIMDPP_USE_NEON64
    // AArch64 has a dedicated 4-way interleaving store instruction
    float64x2x4_t t;
    t.val[0] = a.native();
    t.val[1] = b.native();
    t.val[2] = c.native();
    t.val[3] = d.native();
    vst4q_f64(reinterpret_cast<double*>(p), t);
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC
    // Scalar fallback: these targets lack native float64 vectors
    detail::null::store_packed4(p, a, b, c, d);
#endif
}
265
#if SIMDPP_USE_AVX
// 256-bit float64 variant (AVX provides 256-bit FP ops; AVX2 not required).
static SIMDPP_INL
void i_store_packed4(char* p,
                     const float64x4& a, const float64x4& b,
                     const float64x4& c, const float64x4& d)
{
    v256_store_pack4(p, a, b, c, d);
}
#endif
275
#if SIMDPP_USE_AVX512F
// 512-bit float64 variant: delegates to the generic 512-bit pack-and-store.
static SIMDPP_INL
void i_store_packed4(char* p,
                     const float64<8>& a, const float64<8>& b,
                     const float64<8>& c, const float64<8>& d)
{
    v512_store_pack4(p, a, b, c, d);
}
#endif
285
286// -----------------------------------------------------------------------------
287
288template<class V> SIMDPP_INL
289void v128_store_pack4(char* p, const V& ca, const V& cb, const V& cc, const V& dd)
290{
291 p = detail::assume_aligned(p, 16);
292 V a = ca, b = cb, c = cc, d = dd;
293 mem_pack4(a, b, c, d);
294 i_store(p, a);
295 i_store(p + 16, b);
296 i_store(p + 32, c);
297 i_store(p + 48, d);
298}
299
300template<class V> SIMDPP_INL
301void v256_store_pack4(char* p, const V& ca, const V& cb, const V& cc, const V& dd)
302{
303 p = detail::assume_aligned(p, 32);
304 V a = ca, b = cb, c = cc, d = dd;
305 mem_pack4(a, b, c, d);
306 i_store(p, a);
307 i_store(p + 32, b);
308 i_store(p + 64, c);
309 i_store(p + 96, d);
310}
311
312template<class V> SIMDPP_INL
313void v512_store_pack4(char* p, const V& ca, const V& cb, const V& cc, const V& dd)
314{
315 p = detail::assume_aligned(p, 64);
316 V a = ca, b = cb, c = cc, d = dd;
317 mem_pack4(a, b, c, d);
318 i_store(p, a);
319 i_store(p + 64, b);
320 i_store(p + 128, c);
321 i_store(p + 192, d);
322}
323
324template<class V> SIMDPP_INL
325void i_store_packed4(char* p, const V& ca, const V& cb, const V& cc, const V& dd)
326{
327 const unsigned veclen = V::base_vector_type::length_bytes;
328 typename detail::remove_sign<V>::type a = ca, b = cb, c = cc, d = dd;
329
330 p = detail::assume_aligned(p, veclen);
331 for (unsigned i = 0; i < V::vec_length; ++i) {
332 i_store_packed4(p, a.vec(i), b.vec(i), c.vec(i), d.vec(i));
333 p += veclen*4;
334 }
335}
336
337} // namespace insn
338} // namespace detail
339} // namespace SIMDPP_ARCH_NAMESPACE
340} // namespace simdpp
341
342#endif
343