/*  Copyright (C) 2013-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_LOAD_PACKED4_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_LOAD_PACKED4_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/insn/mem_unpack.h>
#include <simdpp/core/load.h>
#include <simdpp/core/transpose.h>
#include <simdpp/detail/null/memory.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {


// collect some boilerplate
template<class V> SIMDPP_INL
void v128_load_packed4(V& a, V& b, V& c, V& d, const char* p);
template<class V> SIMDPP_INL
void v256_load_packed4(V& a, V& b, V& c, V& d, const char* p);
template<class V> SIMDPP_INL
void v512_load_packed4(V& a, V& b, V& c, V& d, const char* p);

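// The helpers declared above are defined at the end of this file. Each loads
// four consecutive full-width vectors starting at p and de-interleaves them
// in registers via mem_unpack4().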
// -----------------------------------------------------------------------------

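// Loads 64 bytes of 4-way interleaved data { a0,b0,c0,d0, a1,b1,c1,d1, ... }
// and de-interleaves it so that a receives a0..a15, b receives b0..b15, and
// so on. On NEON the vld4q family performs the de-interleaving load directly;
// the other targets load contiguously and shuffle in registers.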
static SIMDPP_INL
void i_load_packed4(uint8x16& a, uint8x16& b, uint8x16& c, uint8x16& d,
                    const char* p)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    detail::null::load_packed4(a, b, c, d, p);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    v128_load_packed4(a, b, c, d, p);
#elif SIMDPP_USE_NEON
    auto r = vld4q_u8(reinterpret_cast<const uint8_t*>(p));
    a = r.val[0];
    b = r.val[1];
    c = r.val[2];
    d = r.val[3];
#endif
}

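// x86 has no de-interleaving load instruction, so the wider AVX2 and AVX-512
// overloads below reuse the generic load-then-shuffle helpers.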
#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_load_packed4(uint8x32& a, uint8x32& b, uint8x32& c, uint8x32& d,
                    const char* p)
{
    v256_load_packed4(a, b, c, d, p);
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
void i_load_packed4(uint8<64>& a, uint8<64>& b, uint8<64>& c, uint8<64>& d,
                    const char* p)
{
    v512_load_packed4(a, b, c, d, p);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_load_packed4(uint16x8& a, uint16x8& b, uint16x8& c, uint16x8& d,
                    const char* p)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    detail::null::load_packed4(a, b, c, d, p);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    v128_load_packed4(a, b, c, d, p);
#elif SIMDPP_USE_NEON
    auto r = vld4q_u16(reinterpret_cast<const uint16_t*>(p));
    a = r.val[0];
    b = r.val[1];
    c = r.val[2];
    d = r.val[3];
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_load_packed4(uint16x16& a, uint16x16& b, uint16x16& c, uint16x16& d,
                    const char* p)
{
    v256_load_packed4(a, b, c, d, p);
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
void i_load_packed4(uint16<32>& a, uint16<32>& b, uint16<32>& c, uint16<32>& d,
                    const char* p)
{
    v512_load_packed4(a, b, c, d, p);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_load_packed4(uint32x4& a, uint32x4& b, uint32x4& c, uint32x4& d,
                    const char* p)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    detail::null::load_packed4(a, b, c, d, p);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    v128_load_packed4(a, b, c, d, p);
#elif SIMDPP_USE_NEON
    auto r = vld4q_u32(reinterpret_cast<const uint32_t*>(p));
    a = r.val[0];
    b = r.val[1];
    c = r.val[2];
    d = r.val[3];
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_load_packed4(uint32x8& a, uint32x8& b, uint32x8& c, uint32x8& d,
                    const char* p)
{
    v256_load_packed4(a, b, c, d, p);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_load_packed4(uint32<16>& a, uint32<16>& b, uint32<16>& c, uint32<16>& d,
                    const char* p)
{
    v512_load_packed4(a, b, c, d, p);
}
#endif

// -----------------------------------------------------------------------------

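// 64-bit elements: vld4q_u64 requires AArch64 (NEON64). 32-bit NEON uses the
// generic v128 path, and AltiVec without VSX 2.07 falls back to the scalar
// implementation.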
static SIMDPP_INL
void i_load_packed4(uint64x2& a, uint64x2& b, uint64x2& c, uint64x2& d,
                    const char* p)
{
#if SIMDPP_USE_NEON64
    auto r = vld4q_u64(reinterpret_cast<const uint64_t*>(p));
    a = r.val[0];
    b = r.val[1];
    c = r.val[2];
    d = r.val[3];
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
    v128_load_packed4(a, b, c, d, p);
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    detail::null::load_packed4(a, b, c, d, p);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_load_packed4(uint64x4& a, uint64x4& b, uint64x4& c, uint64x4& d,
                    const char* p)
{
    v256_load_packed4(a, b, c, d, p);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_load_packed4(uint64<8>& a, uint64<8>& b, uint64<8>& c, uint64<8>& d,
                    const char* p)
{
    v512_load_packed4(a, b, c, d, p);
}
#endif

// -----------------------------------------------------------------------------

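// Single-precision floats: NEON builds configured without vector
// single-precision support (SIMDPP_USE_NEON_NO_FLT_SP) use the scalar
// fallback; the other targets mirror the integer overloads above.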
static SIMDPP_INL
void i_load_packed4(float32x4& a, float32x4& b, float32x4& c, float32x4& d,
                    const char* p)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    detail::null::load_packed4(a, b, c, d, p);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    v128_load_packed4(a, b, c, d, p);
#elif SIMDPP_USE_NEON
    auto r = vld4q_f32(reinterpret_cast<const float*>(p));
    a = r.val[0];
    b = r.val[1];
    c = r.val[2];
    d = r.val[3];
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
void i_load_packed4(float32x8& a, float32x8& b, float32x8& c, float32x8& d,
                    const char* p)
{
    v256_load_packed4(a, b, c, d, p);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_load_packed4(float32<16>& a, float32<16>& b, float32<16>& c, float32<16>& d,
                    const char* p)
{
    v512_load_packed4(a, b, c, d, p);
}
#endif

// -----------------------------------------------------------------------------

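// Double-precision floats: vld4q_f64 requires AArch64. SSE2, VSX 2.06 and MSA
// use the generic v128 helper; 32-bit NEON and plain AltiVec have no native
// f64 vectors and fall back to the scalar implementation.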
static SIMDPP_INL
void i_load_packed4(float64x2& a, float64x2& b, float64x2& c, float64x2& d,
                    const char* p)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_SSE2 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
    v128_load_packed4(a, b, c, d, p);
#elif SIMDPP_USE_NEON64
    auto r = vld4q_f64(reinterpret_cast<const double*>(p));
    a = r.val[0];
    b = r.val[1];
    c = r.val[2];
    d = r.val[3];
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC
    detail::null::load_packed4(a, b, c, d, p);
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
void i_load_packed4(float64x4& a, float64x4& b, float64x4& c, float64x4& d,
                    const char* p)
{
    v256_load_packed4(a, b, c, d, p);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_load_packed4(float64<8>& a, float64<8>& b, float64<8>& c, float64<8>& d,
                    const char* p)
{
    v512_load_packed4(a, b, c, d, p);
}
#endif

// -----------------------------------------------------------------------------

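// Generic fallback: load four consecutive vectors spanning the packed block
// and de-interleave them in registers. In the 128-bit case the 64 bytes at p
// hold elements in the order { a0,b0,c0,d0, a1,b1,c1,d1, ... }; after
// mem_unpack4() each output vector holds one of the four de-interleaved
// streams.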
template<class V> SIMDPP_INL
void v128_load_packed4(V& a, V& b, V& c, V& d, const char* p)
{
    p = detail::assume_aligned(p, 16);
    a = load(p);
    b = load(p + 16);
    c = load(p + 32);
    d = load(p + 48);
    mem_unpack4(a, b, c, d);
}

template<class V> SIMDPP_INL
void v256_load_packed4(V& a, V& b, V& c, V& d, const char* p)
{
    p = detail::assume_aligned(p, 32);
    a = load(p);
    b = load(p + 32);
    c = load(p + 64);
    d = load(p + 96);
    mem_unpack4(a, b, c, d);
}

template<class V> SIMDPP_INL
void v512_load_packed4(V& a, V& b, V& c, V& d, const char* p)
{
    p = detail::assume_aligned(p, 64);
    a = load(p);
    b = load(p + 64);
    c = load(p + 128);
    d = load(p + 192);
    mem_unpack4(a, b, c, d);
}

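// Vectors wider than the native width are stored as several base vectors:
// de-interleave one base vector per output per iteration, advancing p by
// four native vectors each time.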
template<class V> SIMDPP_INL
void i_load_packed4(V& a, V& b, V& c, V& d, const char* p)
{
    const unsigned veclen = V::base_vector_type::length_bytes;

    p = detail::assume_aligned(p, veclen);
    for (unsigned i = 0; i < V::vec_length; ++i) {
        i_load_packed4(a.vec(i), b.vec(i), c.vec(i), d.vec(i), p);
        p += veclen*4;
    }
}
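
// Usage sketch (illustrative only, not part of this header): user code
// reaches these overloads through the public simdpp::load_packed4 wrapper
// declared in <simdpp/core/load_packed4.h>. De-interleaving an RGBA byte
// stream would look roughly like:
//
//     simdpp::uint8<16> r, g, b, a;
//     simdpp::load_packed4(r, g, b, a, rgba);  // rgba points to 64
//                                              // interleaved bytes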

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif
