load_packed3.h source code [bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/detail/insn/load_packed3.h]

1	/*  Copyright (C) 2013-2014  Povilas Kanapickas <povilas@radix.lt>
2
3	Distributed under the Boost Software License, Version 1.0.
4	(See accompanying file LICENSE_1_0.txt or copy at
5	http://www.boost.org/LICENSE_1_0.txt)
6	*/
7
8	#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_LOAD_PACKED3_H
9	#define LIBSIMDPP_SIMDPP_DETAIL_INSN_LOAD_PACKED3_H
10
11	#ifndef LIBSIMDPP_SIMD_H
12	#error "This file must be included through simd.h"
13	#endif
14
15	#include <simdpp/types.h>
16	#include <simdpp/detail/align.h>
17	#include <simdpp/detail/insn/mem_unpack.h>
18	#include <simdpp/core/load.h>
19	#include <simdpp/core/transpose.h>
20	#include <simdpp/detail/null/memory.h>
21
22	namespace simdpp {
23	namespace SIMDPP_ARCH_NAMESPACE {
24	namespace detail {
25	namespace insn {
26
27
28	// collect some boilerplate
29	template<class V> SIMDPP_INL
30	void v128_load_packed3(V& a, V& b, V& c, const char* p);
31	template<class V> SIMDPP_INL
32	void v256_load_packed3(V& a, V& b, V& c, const char* p);
33	template<class V> SIMDPP_INL
34	void v512_load_packed3(V& a, V& b, V& c, const char* p);
35
36	// -----------------------------------------------------------------------------
37
38	static SIMDPP_INL
39	void i_load_packed3(uint8x16& a, uint8x16& b, uint8x16& c, const char* p)
40	{
41	p = detail::assume_aligned(p, `16`);
42	#if SIMDPP_USE_NULL
43	detail::null::load_packed3(a, b, c, p);
44	#elif SIMDPP_USE_SSE2 \|\| SIMDPP_USE_ALTIVEC \|\| SIMDPP_USE_MSA
45	v128_load_packed3(a, b, c, p);
46	#elif SIMDPP_USE_NEON
47	auto r = vld3q_u8(reinterpret_cast<const uint8_t*>(p));
48	a = r.val[`0`];
49	b = r.val[`1`];
50	c = r.val[`2`];
51	#endif
52	}
53
#if SIMDPP_USE_AVX2
// uint8x32 overload, available only when AVX2 is enabled.
// Simply forwards to the 256-bit helper.
static SIMDPP_INL
void i_load_packed3(uint8x32& a,
                    uint8x32& b,
                    uint8x32& c,
                    const char* p)
{
    v256_load_packed3(a, b, c, p);
}
#endif
61
#if SIMDPP_USE_AVX512BW
// uint8<64> overload, available only when AVX512BW is enabled.
// Forwards to the 512-bit helper.
//
// Declared `static` so that its linkage matches every other non-template
// i_load_packed3 overload in this header.
static SIMDPP_INL
void i_load_packed3(uint8<64>& a, uint8<64>& b, uint8<64>& c, const char* p)
{
    v512_load_packed3(a, b, c, p);
}
#endif
68
69	// -----------------------------------------------------------------------------
70
71	static SIMDPP_INL
72	void i_load_packed3(uint16x8& a, uint16x8& b, uint16x8& c,
73	const char* p)
74	{
75	p = detail::assume_aligned(p, `16`);
76	#if SIMDPP_USE_NULL
77	detail::null::load_packed3(a, b, c, p);
78	#elif SIMDPP_USE_SSE2 \|\| SIMDPP_USE_ALTIVEC \|\| SIMDPP_USE_MSA
79	v128_load_packed3(a, b, c, p);
80	#elif SIMDPP_USE_NEON
81	auto r = vld3q_u16(reinterpret_cast<const uint16_t*>(p));
82	a = r.val[`0`];
83	b = r.val[`1`];
84	c = r.val[`2`];
85	#endif
86	}
87
#if SIMDPP_USE_AVX2
// uint16x16 overload, available only when AVX2 is enabled.
// Simply forwards to the 256-bit helper.
static SIMDPP_INL
void i_load_packed3(uint16x16& a, uint16x16& b, uint16x16& c, const char* p)
{
    v256_load_packed3(a, b, c, p);
}
#endif
96
#if SIMDPP_USE_AVX512BW
// uint16<32> overload, available only when AVX512BW is enabled.
// Forwards to the 512-bit helper.
//
// Declared `static` so that its linkage matches every other non-template
// i_load_packed3 overload in this header.
static SIMDPP_INL
void i_load_packed3(uint16<32>& a, uint16<32>& b, uint16<32>& c,
                    const char* p)
{
    v512_load_packed3(a, b, c, p);
}
#endif
104
105	// -----------------------------------------------------------------------------
106
107	static SIMDPP_INL
108	void i_load_packed3(uint32x4& a, uint32x4& b, uint32x4&c, const char* p)
109	{
110	p = detail::assume_aligned(p, `16`);
111	#if SIMDPP_USE_NULL
112	detail::null::load_packed3(a, b, c, p);
113	#elif SIMDPP_USE_SSE2 \|\| SIMDPP_USE_ALTIVEC \|\| SIMDPP_USE_MSA
114	v128_load_packed3(a, b, c, p);
115	#elif SIMDPP_USE_NEON
116	auto r = vld3q_u32(reinterpret_cast<const uint32_t*>(p));
117	a = r.val[`0`];
118	b = r.val[`1`];
119	c = r.val[`2`];
120	#endif
121	}
122
#if SIMDPP_USE_AVX2
// uint32x8 overload, available only when AVX2 is enabled.
// Simply forwards to the 256-bit helper.
static SIMDPP_INL
void i_load_packed3(uint32x8& a,
                    uint32x8& b,
                    uint32x8& c,
                    const char* p)
{
    v256_load_packed3(a, b, c, p);
}
#endif
130
#if SIMDPP_USE_AVX512F
// uint32<16> overload, available only when AVX512F is enabled.
// Simply forwards to the 512-bit helper.
static SIMDPP_INL
void i_load_packed3(uint32<16>& a,
                    uint32<16>& b,
                    uint32<16>& c,
                    const char* p)
{
    v512_load_packed3(a, b, c, p);
}
#endif
138
139	// -----------------------------------------------------------------------------
140
141	static SIMDPP_INL
142	void i_load_packed3(uint64x2& a, uint64x2& b, uint64x2& c, const char* p)
143	{
144	p = detail::assume_aligned(p, `16`);
145	#if SIMDPP_USE_SSE2 \|\| SIMDPP_USE_VSX_207 \|\| SIMDPP_USE_MSA
146	v128_load_packed3(a, b, c, p);
147	#elif SIMDPP_USE_NEON64
148	auto r = vld3q_u64(reinterpret_cast<const uint64_t*>(p));
149	a = r.val[`0`];
150	b = r.val[`1`];
151	c = r.val[`2`];
152	#elif SIMDPP_USE_NEON32
153	uint64x2 a0, b0, c0;
154	a0 = load(p);
155	b0 = load(p+`16`);
156	c0 = load(p+`32`);
157
158	int64x1_t al, bl, cl, ah, bh, ch;
159	al = vget_low_u64(a0.native());
160	ah = vget_high_u64(a0.native());
161	bl = vget_low_u64(b0.native());
162	bh = vget_high_u64(b0.native());
163	cl = vget_low_u64(c0.native());
164	ch = vget_high_u64(c0.native());
165	a = vcombine_u64(al, bh);
166	b = vcombine_u64(ah, cl);
167	c = vcombine_u64(bl, ch);
168	#elif SIMDPP_USE_NULL \|\| SIMDPP_USE_ALTIVEC
169	detail::null::load_packed3(a, b, c, p);
170	#endif
171	}
172
#if SIMDPP_USE_AVX2
// uint64x4 overload, available only when AVX2 is enabled.
// Simply forwards to the 256-bit helper.
static SIMDPP_INL
void i_load_packed3(uint64x4& a,
                    uint64x4& b,
                    uint64x4& c,
                    const char* p)
{
    v256_load_packed3(a, b, c, p);
}
#endif
180
#if SIMDPP_USE_AVX512F
// uint64<8> overload, available only when AVX512F is enabled.
// Simply forwards to the 512-bit helper.
static SIMDPP_INL
void i_load_packed3(uint64<8>& a, uint64<8>& b, uint64<8>& c, const char* p)
{
    v512_load_packed3(a, b, c, p);
}
#endif
189
190	// -----------------------------------------------------------------------------
191
192	static SIMDPP_INL
193	void i_load_packed3(float32x4& a, float32x4& b, float32x4& c, const char* p)
194	{
195	p = detail::assume_aligned(p, `16`);
196	#if SIMDPP_USE_NULL \|\| SIMDPP_USE_NEON_NO_FLT_SP
197	detail::null::load_packed3(a, b, c, p);
198	#elif SIMDPP_USE_SSE2 \|\| SIMDPP_USE_ALTIVEC \|\| SIMDPP_USE_MSA
199	v128_load_packed3(a, b, c, p);
200	#elif SIMDPP_USE_NEON
201	auto r = vld3q_f32(reinterpret_cast<const float*>(p));
202	a = r.val[`0`];
203	b = r.val[`1`];
204	c = r.val[`2`];
205	#endif
206	}
207
#if SIMDPP_USE_AVX
// float32x8 overload, available only when AVX is enabled.
// Simply forwards to the 256-bit helper.
static SIMDPP_INL
void i_load_packed3(float32x8& a,
                    float32x8& b,
                    float32x8& c,
                    const char* p)
{
    v256_load_packed3(a, b, c, p);
}
#endif
215
#if SIMDPP_USE_AVX512F
// float32<16> overload, available only when AVX512F is enabled.
// Simply forwards to the 512-bit helper.
static SIMDPP_INL
void i_load_packed3(float32<16>& a, float32<16>& b, float32<16>& c, const char* p)
{
    v512_load_packed3(a, b, c, p);
}
#endif
224
225	// -----------------------------------------------------------------------------
226
227	static SIMDPP_INL
228	void i_load_packed3(float64x2& a, float64x2& b, float64x2& c, const char* p)
229	{
230	p = detail::assume_aligned(p, `16`);
231	#if SIMDPP_USE_NEON64
232	auto r = vld3q_f64(reinterpret_cast<const double*>(p));
233	a = r.val[`0`];
234	b = r.val[`1`];
235	c = r.val[`2`];
236	#elif SIMDPP_USE_SSE2 \|\| SIMDPP_USE_VSX_206 \|\| SIMDPP_USE_MSA
237	v128_load_packed3(a, b, c, p);
238	#elif SIMDPP_USE_NULL \|\| SIMDPP_USE_NEON32 \|\| SIMDPP_USE_ALTIVEC
239	detail::null::load_packed3(a, b, c, p);
240	#endif
241	}
242
#if SIMDPP_USE_AVX
// float64x4 overload, available only when AVX is enabled.
// Simply forwards to the 256-bit helper.
static SIMDPP_INL
void i_load_packed3(float64x4& a, float64x4& b, float64x4& c, const char* p)
{
    v256_load_packed3(a, b, c, p);
}
#endif
251
#if SIMDPP_USE_AVX512F
// float64<8> overload, available only when AVX512F is enabled.
// Simply forwards to the 512-bit helper.
static SIMDPP_INL
void i_load_packed3(float64<8>& a, float64<8>& b, float64<8>& c, const char* p)
{
    v512_load_packed3(a, b, c, p);
}
#endif
260
261	// -----------------------------------------------------------------------------
262
263	template<class V> SIMDPP_INL
264	void v128_load_packed3(V& a, V& b, V& c, const char* p)
265	{
266	p = detail::assume_aligned(p, `16`);
267	a = load(p);
268	b = load(p + `16`);
269	c = load(p + `32`);
270	mem_unpack3(a, b, c);
271	}
272
273	template<class V> SIMDPP_INL
274	void v256_load_packed3(V& a, V& b, V& c, const char* p)
275	{
276	p = detail::assume_aligned(p, `32`);
277	a = load(p);
278	b = load(p + `32`);
279	c = load(p + `64`);
280	mem_unpack3(a, b, c);
281	}
282
283	template<class V> SIMDPP_INL
284	void v512_load_packed3(V& a, V& b, V& c, const char* p)
285	{
286	p = detail::assume_aligned(p, `64`);
287	a = load(p);
288	b = load(p + `64`);
289	c = load(p + `128`);
290	mem_unpack3(a, b, c);
291	}
292
293
294	template<class V> SIMDPP_INL
295	void i_load_packed3(V& a, V& b, V& c, const char* p)
296	{
297	const unsigned veclen = V::base_vector_type::length_bytes;
298
299	p = detail::assume_aligned(p, veclen);
300	for (unsigned i = `0`; i < V::vec_length; ++i) {
301	i_load_packed3(a.vec(i), b.vec(i), c.vec(i), p);
302	p += veclen*`3`;
303	}
304	}
305
306	} // namespace insn
307	} // namespace detail
308	} // namespace SIMDPP_ARCH_NAMESPACE
309	} // namespace simdpp
310
311	#endif
312
313

Browse the source code of bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/detail/insn/load_packed3.h