1/* Copyright (C) 2013-2014 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_LOAD_PACKED4_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_LOAD_PACKED4_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/types.h>
16#include <simdpp/detail/insn/mem_unpack.h>
17#include <simdpp/core/load.h>
18#include <simdpp/core/transpose.h>
19#include <simdpp/detail/null/memory.h>
20
21namespace simdpp {
22namespace SIMDPP_ARCH_NAMESPACE {
23namespace detail {
24namespace insn {
25
26
27// collect some boilerplate
28template<class V> SIMDPP_INL
29void v128_load_packed4(V& a, V& b, V& c, V& d, const char* p);
30template<class V> SIMDPP_INL
31void v256_load_packed4(V& a, V& b, V& c, V& d, const char* p);
32template<class V> SIMDPP_INL
33void v512_load_packed4(V& a, V& b, V& c, V& d, const char* p);
34
35// -----------------------------------------------------------------------------
36
37static SIMDPP_INL
38void i_load_packed4(uint8x16& a, uint8x16& b, uint8x16& c, uint8x16& d,
39 const char* p)
40{
41 p = detail::assume_aligned(p, 16);
42#if SIMDPP_USE_NULL
43 detail::null::load_packed4(a, b, c, d, p);
44#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
45 v128_load_packed4(a, b, c, d, p);
46#elif SIMDPP_USE_NEON
47 auto r = vld4q_u8(reinterpret_cast<const uint8_t*>(p));
48 a = r.val[0];
49 b = r.val[1];
50 c = r.val[2];
51 d = r.val[3];
52#endif
53}
54
#if SIMDPP_USE_AVX2
// 256-bit uint8 overload (AVX2 only): forwards to the generic 256-bit helper.
static SIMDPP_INL
void i_load_packed4(uint8x32& a, uint8x32& b,
                    uint8x32& c, uint8x32& d,
                    const char* p)
{
    v256_load_packed4(a, b, c, d, p);
}
#endif
63
#if SIMDPP_USE_AVX512BW
// 512-bit uint8 overload (AVX512BW only): forwards to the generic 512-bit
// helper.
static SIMDPP_INL
void i_load_packed4(uint8<64>& a, uint8<64>& b,
                    uint8<64>& c, uint8<64>& d,
                    const char* p)
{
    v512_load_packed4(a, b, c, d, p);
}
#endif
72
73// -----------------------------------------------------------------------------
74
75static SIMDPP_INL
76void i_load_packed4(uint16x8& a, uint16x8& b, uint16x8& c, uint16x8& d,
77 const char* p)
78{
79 p = detail::assume_aligned(p, 16);
80#if SIMDPP_USE_NULL
81 detail::null::load_packed4(a, b, c, d, p);
82#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
83 v128_load_packed4(a, b, c, d, p);
84#elif SIMDPP_USE_NEON
85 auto r = vld4q_u16(reinterpret_cast<const uint16_t*>(p));
86 a = r.val[0];
87 b = r.val[1];
88 c = r.val[2];
89 d = r.val[3];
90#endif
91}
92
#if SIMDPP_USE_AVX2
// 256-bit uint16 overload (AVX2 only): forwards to the generic 256-bit helper.
static SIMDPP_INL
void i_load_packed4(uint16x16& a, uint16x16& b,
                    uint16x16& c, uint16x16& d,
                    const char* p)
{
    v256_load_packed4(a, b, c, d, p);
}
#endif
101
#if SIMDPP_USE_AVX512BW
// 512-bit uint16 overload (AVX512BW only): forwards to the generic 512-bit
// helper.
static SIMDPP_INL
void i_load_packed4(uint16<32>& a, uint16<32>& b,
                    uint16<32>& c, uint16<32>& d,
                    const char* p)
{
    v512_load_packed4(a, b, c, d, p);
}
#endif
110
111// -----------------------------------------------------------------------------
112
113static SIMDPP_INL
114void i_load_packed4(uint32x4& a, uint32x4& b, uint32x4& c, uint32x4& d,
115 const char* p)
116{
117 p = detail::assume_aligned(p, 16);
118#if SIMDPP_USE_NULL
119 detail::null::load_packed4(a, b, c, d, p);
120#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
121 v128_load_packed4(a, b, c, d, p);
122#elif SIMDPP_USE_NEON
123 auto r = vld4q_u32(reinterpret_cast<const uint32_t*>(p));
124 a = r.val[0];
125 b = r.val[1];
126 c = r.val[2];
127 d = r.val[3];
128#endif
129}
130
#if SIMDPP_USE_AVX2
// 256-bit uint32 overload (AVX2 only): forwards to the generic 256-bit helper.
static SIMDPP_INL
void i_load_packed4(uint32x8& a, uint32x8& b,
                    uint32x8& c, uint32x8& d,
                    const char* p)
{
    v256_load_packed4(a, b, c, d, p);
}
#endif
139
#if SIMDPP_USE_AVX512F
// 512-bit uint32 overload (AVX512F only): forwards to the generic 512-bit
// helper.
static SIMDPP_INL
void i_load_packed4(uint32<16>& a, uint32<16>& b,
                    uint32<16>& c, uint32<16>& d,
                    const char* p)
{
    v512_load_packed4(a, b, c, d, p);
}
#endif
148
149// -----------------------------------------------------------------------------
150
151static SIMDPP_INL
152void i_load_packed4(uint64x2& a, uint64x2& b, uint64x2& c, uint64x2& d,
153 const char* p)
154{
155#if SIMDPP_USE_NEON64
156 auto r = vld4q_u64(reinterpret_cast<const uint64_t*>(p));
157 a = r.val[0];
158 b = r.val[1];
159 c = r.val[2];
160 d = r.val[3];
161#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
162 v128_load_packed4(a, b, c, d, p);
163#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
164 detail::null::load_packed4(a, b, c, d, p);
165#endif
166}
167
#if SIMDPP_USE_AVX2
// 256-bit uint64 overload (AVX2 only): forwards to the generic 256-bit helper.
static SIMDPP_INL
void i_load_packed4(uint64x4& a, uint64x4& b,
                    uint64x4& c, uint64x4& d,
                    const char* p)
{
    v256_load_packed4(a, b, c, d, p);
}
#endif
176
#if SIMDPP_USE_AVX512F
// 512-bit uint64 overload (AVX512F only): forwards to the generic 512-bit
// helper.
static SIMDPP_INL
void i_load_packed4(uint64<8>& a, uint64<8>& b,
                    uint64<8>& c, uint64<8>& d,
                    const char* p)
{
    v512_load_packed4(a, b, c, d, p);
}
#endif
185
186// -----------------------------------------------------------------------------
187
188static SIMDPP_INL
189void i_load_packed4(float32x4& a, float32x4& b, float32x4& c, float32x4& d,
190 const char* p)
191{
192 p = detail::assume_aligned(p, 16);
193#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
194 detail::null::load_packed4(a, b, c, d, p);
195#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
196 v128_load_packed4(a, b, c, d, p);
197#elif SIMDPP_USE_NEON
198 auto r = vld4q_f32(reinterpret_cast<const float*>(p));
199 a = r.val[0];
200 b = r.val[1];
201 c = r.val[2];
202 d = r.val[3];
203#endif
204}
205
#if SIMDPP_USE_AVX
// 256-bit float32 overload (AVX only): forwards to the generic 256-bit helper.
static SIMDPP_INL
void i_load_packed4(float32x8& a, float32x8& b,
                    float32x8& c, float32x8& d,
                    const char* p)
{
    v256_load_packed4(a, b, c, d, p);
}
#endif
214
#if SIMDPP_USE_AVX512F
// 512-bit float32 overload (AVX512F only): forwards to the generic 512-bit
// helper.
static SIMDPP_INL
void i_load_packed4(float32<16>& a, float32<16>& b,
                    float32<16>& c, float32<16>& d,
                    const char* p)
{
    v512_load_packed4(a, b, c, d, p);
}
#endif
223
224// -----------------------------------------------------------------------------
225
226static SIMDPP_INL
227void i_load_packed4(float64x2& a, float64x2& b, float64x2& c, float64x2& d,
228 const char* p)
229{
230 p = detail::assume_aligned(p, 16);
231#if SIMDPP_USE_SSE2 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
232 v128_load_packed4(a, b, c, d, p);
233#elif SIMDPP_USE_NEON64
234 auto r = vld4q_f64(reinterpret_cast<const double*>(p));
235 a = r.val[0];
236 b = r.val[1];
237 c = r.val[2];
238 d = r.val[3];
239#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC
240 detail::null::load_packed4(a, b, c, d, p);
241#endif
242}
243
#if SIMDPP_USE_AVX
// 256-bit float64 overload (AVX only): forwards to the generic 256-bit helper.
static SIMDPP_INL
void i_load_packed4(float64x4& a, float64x4& b,
                    float64x4& c, float64x4& d,
                    const char* p)
{
    v256_load_packed4(a, b, c, d, p);
}
#endif
252
#if SIMDPP_USE_AVX512F
// 512-bit float64 overload (AVX512F only): forwards to the generic 512-bit
// helper.
static SIMDPP_INL
void i_load_packed4(float64<8>& a, float64<8>& b,
                    float64<8>& c, float64<8>& d,
                    const char* p)
{
    v512_load_packed4(a, b, c, d, p);
}
#endif
261
262// -----------------------------------------------------------------------------
263
264template<class V> SIMDPP_INL
265void v128_load_packed4(V& a, V& b, V& c, V& d, const char* p)
266{
267 p = detail::assume_aligned(p, 16);
268 a = load(p);
269 b = load(p + 16);
270 c = load(p + 32);
271 d = load(p + 48);
272 mem_unpack4(a, b, c, d);
273}
274
275template<class V> SIMDPP_INL
276void v256_load_packed4(V& a, V& b, V& c, V& d, const char* p)
277{
278 p = detail::assume_aligned(p, 32);
279 a = load(p);
280 b = load(p + 32);
281 c = load(p + 64);
282 d = load(p + 96);
283 mem_unpack4(a, b, c, d);
284}
285
286template<class V> SIMDPP_INL
287void v512_load_packed4(V& a, V& b, V& c, V& d, const char* p)
288{
289 p = detail::assume_aligned(p, 64);
290 a = load(p);
291 b = load(p + 64);
292 c = load(p + 128);
293 d = load(p + 192);
294 mem_unpack4(a, b, c, d);
295}
296
297template<class V> SIMDPP_INL
298void i_load_packed4(V& a, V& b, V& c, V& d, const char* p)
299{
300 const unsigned veclen = V::base_vector_type::length_bytes;
301
302 p = detail::assume_aligned(p, veclen);
303 for (unsigned i = 0; i < V::vec_length; ++i) {
304 i_load_packed4(a.vec(i), b.vec(i), c.vec(i), d.vec(i), p);
305 p += veclen*4;
306 }
307}
308
309
310} // namespace insn
311} // namespace detail
312} // namespace SIMDPP_ARCH_NAMESPACE
313} // namespace simdpp
314
315#endif
316
317