/*  Copyright (C) 2013-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_LOAD_PACKED3_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_LOAD_PACKED3_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/align.h>
#include <simdpp/detail/insn/mem_unpack.h>
#include <simdpp/core/load.h>
#include <simdpp/core/transpose.h>
#include <simdpp/detail/null/memory.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

// Forward declarations of the helpers that collect the common load-and-unpack
// boilerplate; their definitions are at the end of this file.
template<class V> SIMDPP_INL
void v128_load_packed3(V& a, V& b, V& c, const char* p);
template<class V> SIMDPP_INL
void v256_load_packed3(V& a, V& b, V& c, const char* p);
template<class V> SIMDPP_INL
void v512_load_packed3(V& a, V& b, V& c, const char* p);

// -----------------------------------------------------------------------------

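// Each i_load_packed3 overload reads three vectors' worth of interleaved
// elements {a0, b0, c0, a1, b1, c1, ...} starting at p and de-interleaves them
// into a, b and c. NEON provides this directly via the vld3q_* intrinsics; the
// other ISAs go through the v128/v256/v512 helpers, which load contiguously
// and shuffle with mem_unpack3. The null backend falls back to scalar code.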
static SIMDPP_INL
void i_load_packed3(uint8x16& a, uint8x16& b, uint8x16& c, const char* p)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    detail::null::load_packed3(a, b, c, p);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    v128_load_packed3(a, b, c, p);
#elif SIMDPP_USE_NEON
    auto r = vld3q_u8(reinterpret_cast<const uint8_t*>(p));
    a = r.val[0];
    b = r.val[1];
    c = r.val[2];
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_load_packed3(uint8x32& a, uint8x32& b, uint8x32& c, const char* p)
{
    v256_load_packed3(a, b, c, p);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL void i_load_packed3(uint8<64>& a, uint8<64>& b, uint8<64>& c, const char* p)
{
    v512_load_packed3(a, b, c, p);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_load_packed3(uint16x8& a, uint16x8& b, uint16x8& c,
                    const char* p)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    detail::null::load_packed3(a, b, c, p);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    v128_load_packed3(a, b, c, p);
#elif SIMDPP_USE_NEON
    auto r = vld3q_u16(reinterpret_cast<const uint16_t*>(p));
    a = r.val[0];
    b = r.val[1];
    c = r.val[2];
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_load_packed3(uint16x16& a, uint16x16& b, uint16x16& c,
                    const char* p)
{
    v256_load_packed3(a, b, c, p);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL void i_load_packed3(uint16<32>& a, uint16<32>& b, uint16<32>& c,
                               const char* p)
{
    v512_load_packed3(a, b, c, p);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_load_packed3(uint32x4& a, uint32x4& b, uint32x4& c, const char* p)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    detail::null::load_packed3(a, b, c, p);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    v128_load_packed3(a, b, c, p);
#elif SIMDPP_USE_NEON
    auto r = vld3q_u32(reinterpret_cast<const uint32_t*>(p));
    a = r.val[0];
    b = r.val[1];
    c = r.val[2];
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_load_packed3(uint32x8& a, uint32x8& b, uint32x8& c, const char* p)
{
    v256_load_packed3(a, b, c, p);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_load_packed3(uint32<16>& a, uint32<16>& b, uint32<16>& c, const char* p)
{
    v512_load_packed3(a, b, c, p);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_load_packed3(uint64x2& a, uint64x2& b, uint64x2& c, const char* p)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_SSE2 || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
    v128_load_packed3(a, b, c, p);
#elif SIMDPP_USE_NEON64
    auto r = vld3q_u64(reinterpret_cast<const uint64_t*>(p));
    a = r.val[0];
    b = r.val[1];
    c = r.val[2];
#elif SIMDPP_USE_NEON32
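    // 32-bit NEON has no vld3q_u64. Load the three contiguous vectors
    // {a0,b0}, {c0,a1}, {b1,c1} and recombine their 64-bit halves into
    // {a0,a1}, {b0,b1}, {c0,c1}.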
    uint64x2 a0, b0, c0;
    a0 = load(p);
    b0 = load(p+16);
    c0 = load(p+32);

    uint64x1_t al, bl, cl, ah, bh, ch;
    al = vget_low_u64(a0.native());
    ah = vget_high_u64(a0.native());
    bl = vget_low_u64(b0.native());
    bh = vget_high_u64(b0.native());
    cl = vget_low_u64(c0.native());
    ch = vget_high_u64(c0.native());
    a = vcombine_u64(al, bh);
    b = vcombine_u64(ah, cl);
    c = vcombine_u64(bl, ch);
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    detail::null::load_packed3(a, b, c, p);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_load_packed3(uint64x4& a, uint64x4& b, uint64x4& c, const char* p)
{
    v256_load_packed3(a, b, c, p);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_load_packed3(uint64<8>& a, uint64<8>& b, uint64<8>& c,
                    const char* p)
{
    v512_load_packed3(a, b, c, p);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_load_packed3(float32x4& a, float32x4& b, float32x4& c, const char* p)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    detail::null::load_packed3(a, b, c, p);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    v128_load_packed3(a, b, c, p);
#elif SIMDPP_USE_NEON
    auto r = vld3q_f32(reinterpret_cast<const float*>(p));
    a = r.val[0];
    b = r.val[1];
    c = r.val[2];
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
void i_load_packed3(float32x8& a, float32x8& b, float32x8& c, const char* p)
{
    v256_load_packed3(a, b, c, p);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_load_packed3(float32<16>& a, float32<16>& b, float32<16>& c,
                    const char* p)
{
    v512_load_packed3(a, b, c, p);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_load_packed3(float64x2& a, float64x2& b, float64x2& c, const char* p)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NEON64
    auto r = vld3q_f64(reinterpret_cast<const double*>(p));
    a = r.val[0];
    b = r.val[1];
    c = r.val[2];
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
    v128_load_packed3(a, b, c, p);
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
    detail::null::load_packed3(a, b, c, p);
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
void i_load_packed3(float64x4& a, float64x4& b, float64x4& c,
                    const char* p)
{
    v256_load_packed3(a, b, c, p);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_load_packed3(float64<8>& a, float64<8>& b, float64<8>& c,
                    const char* p)
{
    v512_load_packed3(a, b, c, p);
}
#endif

// -----------------------------------------------------------------------------

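// The helpers below assume that memory at p holds interleaved triples
// {a0, b0, c0, a1, b1, c1, ...}: three full vectors are loaded from
// consecutive addresses and mem_unpack3 performs the de-interleaving
// shuffle in registers.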
template<class V> SIMDPP_INL
void v128_load_packed3(V& a, V& b, V& c, const char* p)
{
    p = detail::assume_aligned(p, 16);
    a = load(p);
    b = load(p + 16);
    c = load(p + 32);
    mem_unpack3(a, b, c);
}

template<class V> SIMDPP_INL
void v256_load_packed3(V& a, V& b, V& c, const char* p)
{
    p = detail::assume_aligned(p, 32);
    a = load(p);
    b = load(p + 32);
    c = load(p + 64);
    mem_unpack3(a, b, c);
}

template<class V> SIMDPP_INL
void v512_load_packed3(V& a, V& b, V& c, const char* p)
{
    p = detail::assume_aligned(p, 64);
    a = load(p);
    b = load(p + 64);
    c = load(p + 128);
    mem_unpack3(a, b, c);
}

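// Generic case: V may consist of several native vectors. Each native
// sub-vector triple is de-interleaved from its own block of veclen*3 bytes,
// and the blocks for consecutive sub-vectors follow each other in memory.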
template<class V> SIMDPP_INL
void i_load_packed3(V& a, V& b, V& c, const char* p)
{
    const unsigned veclen = V::base_vector_type::length_bytes;

    p = detail::assume_aligned(p, veclen);
    for (unsigned i = 0; i < V::vec_length; ++i) {
        i_load_packed3(a.vec(i), b.vec(i), c.vec(i), p);
        p += veclen*3;
    }
}
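
// Usage sketch (not part of this header): the public load_packed3 front-end in
// simdpp/core is expected to forward to i_load_packed3. Assuming that wrapper,
// de-interleaving an RGB byte stream could look like this:
//
//     alignas(16) uint8_t rgb[48] = { /* r0,g0,b0, r1,g1,b1, ... */ };
//     simdpp::uint8x16 r, g, b;
//     simdpp::load_packed3(r, g, b, rgb); // r = {r0..}, g = {g0..}, b = {b0..}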

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif