1/* Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_PACKED3_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_PACKED3_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/types.h>
16#include <simdpp/detail/align.h>
17#include <simdpp/detail/insn/mem_pack.h>
18#include <simdpp/core/store.h>
19#include <simdpp/detail/null/memory.h>
20
21namespace simdpp {
22namespace SIMDPP_ARCH_NAMESPACE {
23namespace detail {
24namespace insn {
25
26// collect some boilerplate
27template<class V> SIMDPP_INL
28void v128_store_pack3(char* p, const V& ca, const V& cb, const V& cc);
29template<class V> SIMDPP_INL
30void v256_store_pack3(char* p, const V& ca, const V& cb, const V& cc);
31template<class V> SIMDPP_INL
32void v512_store_pack3(char* p, const V& ca, const V& cb, const V& cc);
33
34// -----------------------------------------------------------------------------
35
36static SIMDPP_INL
37void i_store_packed3(char* p, const uint8x16& a, const uint8x16& b, const uint8x16& c)
38{
39 p = detail::assume_aligned(p, 16);
40#if SIMDPP_USE_NULL
41 detail::null::store_packed3(p, a, b, c);
42#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
43 v128_store_pack3(p, a, b, c);
44#elif SIMDPP_USE_NEON
45 uint8x16x3_t t;
46 t.val[0] = a.native();
47 t.val[1] = b.native();
48 t.val[2] = c.native();
49 vst3q_u8(reinterpret_cast<uint8_t*>(p), t);
50#endif
51}
52
#if SIMDPP_USE_AVX2
// uint8x32 overload, only compiled when SIMDPP_USE_AVX2 is set.
// Forwards its arguments unchanged to v256_store_pack3.
static SIMDPP_INL
void i_store_packed3(char* p,
                     const uint8x32& a, const uint8x32& b, const uint8x32& c)
{
    v256_store_pack3(p, a, b, c);
}
#endif
60
#if SIMDPP_USE_AVX512BW
// uint8<64> overload, only compiled when SIMDPP_USE_AVX512BW is set.
// Forwards its arguments unchanged to v512_store_pack3.
static SIMDPP_INL
void i_store_packed3(char* p,
                     const uint8<64>& a, const uint8<64>& b, const uint8<64>& c)
{
    v512_store_pack3(p, a, b, c);
}
#endif
68
69// -----------------------------------------------------------------------------
70
71static SIMDPP_INL
72void i_store_packed3(char* p, const uint16x8& a, const uint16x8& b, const uint16x8& c)
73{
74 p = detail::assume_aligned(p, 16);
75#if SIMDPP_USE_NULL
76 detail::null::store_packed3(p, a, b, c);
77#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
78 v128_store_pack3(p, a, b, c);
79#elif SIMDPP_USE_NEON
80 uint16x8x3_t t;
81 t.val[0] = a.native();
82 t.val[1] = b.native();
83 t.val[2] = c.native();
84 vst3q_u16(reinterpret_cast<uint16_t*>(p), t);
85#endif
86}
87
#if SIMDPP_USE_AVX2
// uint16x16 overload, only compiled when SIMDPP_USE_AVX2 is set.
// Forwards its arguments unchanged to v256_store_pack3.
static SIMDPP_INL
void i_store_packed3(char* p,
                     const uint16x16& a, const uint16x16& b, const uint16x16& c)
{
    v256_store_pack3(p, a, b, c);
}
#endif
95
#if SIMDPP_USE_AVX512BW
// uint16<32> overload, only compiled when SIMDPP_USE_AVX512BW is set.
// Forwards its arguments unchanged to v512_store_pack3.
static SIMDPP_INL
void i_store_packed3(char* p,
                     const uint16<32>& a, const uint16<32>& b, const uint16<32>& c)
{
    v512_store_pack3(p, a, b, c);
}
#endif
103
104// -----------------------------------------------------------------------------
105
106static SIMDPP_INL
107void i_store_packed3(char* p, const uint32x4& a, const uint32x4& b, const uint32x4& c)
108{
109 p = detail::assume_aligned(p, 16);
110#if SIMDPP_USE_NULL
111 detail::null::store_packed3(p, a, b, c);
112#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
113 v128_store_pack3(p, a, b, c);
114#elif SIMDPP_USE_NEON
115 uint32x4x3_t t;
116 t.val[0] = a.native();
117 t.val[1] = b.native();
118 t.val[2] = c.native();
119 vst3q_u32(reinterpret_cast<uint32_t*>(p), t);
120#endif
121}
122
#if SIMDPP_USE_AVX2
// uint32x8 overload, only compiled when SIMDPP_USE_AVX2 is set.
// Forwards its arguments unchanged to v256_store_pack3.
static SIMDPP_INL
void i_store_packed3(char* p,
                     const uint32x8& a, const uint32x8& b, const uint32x8& c)
{
    v256_store_pack3(p, a, b, c);
}
#endif
130
#if SIMDPP_USE_AVX512F
// uint32<16> overload, only compiled when SIMDPP_USE_AVX512F is set.
// Forwards its arguments unchanged to v512_store_pack3.
static SIMDPP_INL
void i_store_packed3(char* p,
                     const uint32<16>& a, const uint32<16>& b, const uint32<16>& c)
{
    v512_store_pack3(p, a, b, c);
}
#endif
138
139// -----------------------------------------------------------------------------
140
141static SIMDPP_INL
142void i_store_packed3(char* p, const uint64x2& a, const uint64x2& b, const uint64x2& c)
143{
144 p = detail::assume_aligned(p, 16);
145#if SIMDPP_USE_SSE2 || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
146 v128_store_pack3(p, a, b, c);
147#elif SIMDPP_USE_NEON32
148 uint64_t* q = reinterpret_cast<uint64_t*>(p);
149 uint64x1x2_t t1, t2, t3;
150 t1.val[0] = vget_low_u64(a.native()); t1.val[1] = vget_low_u64(b.native());
151 t2.val[0] = vget_low_u64(c.native()); t2.val[1] = vget_high_u64(a.native());
152 t3.val[0] = vget_high_u64(b.native()); t3.val[1] = vget_high_u64(c.native());
153
154 vst2_u64(q, t1);
155 vst2_u64(q+2, t2);
156 vst2_u64(q+4, t3);
157#elif SIMDPP_USE_NEON64
158 uint64x2x3_t t;
159 t.val[0] = a.native();
160 t.val[1] = b.native();
161 t.val[2] = c.native();
162 vst3q_u64(reinterpret_cast<uint64_t*>(p), t);
163#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
164 detail::null::store_packed3(p, a, b, c);
165#endif
166}
167
#if SIMDPP_USE_AVX2
// uint64x4 overload, only compiled when SIMDPP_USE_AVX2 is set.
// Forwards its arguments unchanged to v256_store_pack3.
static SIMDPP_INL
void i_store_packed3(char* p,
                     const uint64x4& a, const uint64x4& b, const uint64x4& c)
{
    v256_store_pack3(p, a, b, c);
}
#endif
175
#if SIMDPP_USE_AVX512F
// uint64<8> overload, only compiled when SIMDPP_USE_AVX512F is set.
// Forwards its arguments unchanged to v512_store_pack3.
static SIMDPP_INL
void i_store_packed3(char* p,
                     const uint64<8>& a, const uint64<8>& b, const uint64<8>& c)
{
    v512_store_pack3(p, a, b, c);
}
#endif
183
184// -----------------------------------------------------------------------------
185
186static SIMDPP_INL
187void i_store_packed3(char* p, const float32x4& a, const float32x4& b, const float32x4& c)
188{
189 p = detail::assume_aligned(p, 16);
190#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
191 detail::null::store_packed3(p, a, b, c);
192#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
193 v128_store_pack3(p, a, b, c);
194#elif SIMDPP_USE_NEON
195 float32x4x3_t t;
196 t.val[0] = a.native();
197 t.val[1] = b.native();
198 t.val[2] = c.native();
199 vst3q_f32(reinterpret_cast<float*>(p), t);
200#endif
201}
202
#if SIMDPP_USE_AVX
// float32x8 overload, only compiled when SIMDPP_USE_AVX is set.
// Forwards its arguments unchanged to v256_store_pack3.
static SIMDPP_INL
void i_store_packed3(char* p,
                     const float32x8& a, const float32x8& b, const float32x8& c)
{
    v256_store_pack3(p, a, b, c);
}
#endif
210
#if SIMDPP_USE_AVX512F
// float32<16> overload, only compiled when SIMDPP_USE_AVX512F is set.
// Forwards its arguments unchanged to v512_store_pack3.
static SIMDPP_INL
void i_store_packed3(char* p,
                     const float32<16>& a, const float32<16>& b, const float32<16>& c)
{
    v512_store_pack3(p, a, b, c);
}
#endif
218
219// -----------------------------------------------------------------------------
220
221static SIMDPP_INL
222void i_store_packed3(char* p, const float64x2& a, const float64x2& b, const float64x2& c)
223{
224 p = detail::assume_aligned(p, 16);
225#if SIMDPP_USE_SSE2 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
226 v128_store_pack3(p, a, b, c);
227#elif SIMDPP_USE_NEON64
228 float64x2x3_t t;
229 t.val[0] = a.native();
230 t.val[1] = b.native();
231 t.val[2] = c.native();
232 vst3q_f64(reinterpret_cast<double*>(p), t);
233#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC || SIMDPP_USE_NEON
234 detail::null::store_packed3(p, a, b, c);
235#endif
236}
237
#if SIMDPP_USE_AVX
// float64x4 overload, only compiled when SIMDPP_USE_AVX is set.
// Forwards its arguments unchanged to v256_store_pack3.
static SIMDPP_INL
void i_store_packed3(char* p,
                     const float64x4& a, const float64x4& b, const float64x4& c)
{
    v256_store_pack3(p, a, b, c);
}
#endif
245
#if SIMDPP_USE_AVX512F
// float64<8> overload, only compiled when SIMDPP_USE_AVX512F is set.
// Forwards its arguments unchanged to v512_store_pack3.
static SIMDPP_INL
void i_store_packed3(char* p,
                     const float64<8>& a, const float64<8>& b, const float64<8>& c)
{
    v512_store_pack3(p, a, b, c);
}
#endif
253
254// -----------------------------------------------------------------------------
255
256template<class V> SIMDPP_INL
257void v128_store_pack3(char* p, const V& ca, const V& cb, const V& cc)
258{
259 p = detail::assume_aligned(p, 16);
260 V a = ca, b = cb, c = cc;
261 mem_pack3(a, b, c);
262 i_store(p, a);
263 i_store(p + 16, b);
264 i_store(p + 32, c);
265}
266
267template<class V> SIMDPP_INL
268void v256_store_pack3(char* p, const V& ca, const V& cb, const V& cc)
269{
270 p = detail::assume_aligned(p, 32);
271 V a = ca, b = cb, c = cc;
272 mem_pack3(a, b, c);
273 i_store(p, a);
274 i_store(p + 32, b);
275 i_store(p + 64, c);
276}
277
278template<class V> SIMDPP_INL
279void v512_store_pack3(char* p, const V& ca, const V& cb, const V& cc)
280{
281 p = detail::assume_aligned(p, 64);
282 V a = ca, b = cb, c = cc;
283 mem_pack3(a, b, c);
284 i_store(p, a);
285 i_store(p + 64, b);
286 i_store(p + 128, c);
287}
288
289template<class V> SIMDPP_INL
290void i_store_packed3(char* p, const V& ca, const V& cb, const V& cc)
291{
292 const unsigned veclen = V::base_vector_type::length_bytes;
293 typename detail::remove_sign<V>::type a = ca, b = cb, c = cc;
294
295 p = detail::assume_aligned(p, veclen);
296 for (unsigned i = 0; i < V::vec_length; ++i) {
297 i_store_packed3(p, a.vec(i), b.vec(i), c.vec(i));
298 p += veclen*3;
299 }
300}
301
302} // namespace insn
303} // namespace detail
304} // namespace SIMDPP_ARCH_NAMESPACE
305} // namespace simdpp
306
307#endif
308