// load_u.h source code [bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/detail/insn/load_u.h]

1	/*  Copyright (C) 2011-2014  Povilas Kanapickas <povilas@radix.lt>
2
3	Distributed under the Boost Software License, Version 1.0.
4	(See accompanying file LICENSE_1_0.txt or copy at
5	http://www.boost.org/LICENSE_1_0.txt)
6	*/
7
8	#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_LOAD_U_H
9	#define LIBSIMDPP_SIMDPP_DETAIL_INSN_LOAD_U_H
10
11	#ifndef LIBSIMDPP_SIMD_H
12	#error "This file must be included through simd.h"
13	#endif
14
15	#include <simdpp/types.h>
16	#include <simdpp/core/transpose.h>
17	#include <simdpp/detail/align.h>
18	#include <simdpp/detail/not_implemented.h>
19	#include <simdpp/detail/insn/mem_unpack.h>
20	#include <simdpp/detail/null/memory.h>
21
22	namespace simdpp {
23	namespace SIMDPP_ARCH_NAMESPACE {
24	namespace detail {
25	namespace insn {
26
27	// -----------------------------------------------------------------------------
28
29	// Each integer type is handled separately because higher alignment guarantees
30	// offer better performance on e.g. ARM. Note, we don't use LDDQU on SSE,
31	// because it has usage restrictions and offers improved performance only on
32	// Pentium 4 era processors.
33	static SIMDPP_INL
34	void i_load_u(uint8x16& a, const char* p)
35	{
36	#if SIMDPP_USE_NULL
37	detail::null::load(a, p);
38	#elif SIMDPP_USE_SSE2
39	a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
40	#elif SIMDPP_USE_NEON
41	a = vld1q_u8(reinterpret_cast<const uint8_t*>(p));
42	#elif SIMDPP_USE_VSX_206
43	const uint8_t* q = reinterpret_cast<const uint8_t*>(p);
44	a = vec_vsx_ld(`0`, q);
45	#elif SIMDPP_USE_ALTIVEC
46	const uint8_t* q = reinterpret_cast<const uint8_t*>(p);
47	uint8x16 l1, l2, mask;
48	l1 = vec_ld(`0`, q);
49	l2 = vec_ld(`16`, q);
50	#pragma GCC diagnostic push
51	#pragma GCC diagnostic ignored "-Wdeprecated"
52	mask = vec_lvsl(`0`, q);
53	#pragma GCC diagnostic pop
54	a = vec_perm(l1.native(), l2.native(), mask.native());
55	#elif SIMDPP_USE_MSA
56	a = (v16u8) __msa_ld_b(p, `0`);
57	#endif
58	}
59
60	static SIMDPP_INL
61	void i_load_u(uint16x8& a, const char* p)
62	{
63	#if SIMDPP_USE_NULL
64	detail::null::load(a, p);
65	#elif SIMDPP_USE_SSE2 \|\| SIMDPP_USE_ALTIVEC
66	uint8x16 b;
67	i_load_u(b, p);
68	a = b;
69	#elif SIMDPP_USE_NEON
70	a = vld1q_u16(reinterpret_cast<const uint16_t*>(p));
71	#elif SIMDPP_USE_MSA
72	a = (v8u16) __msa_ld_h(p, `0`);
73	#endif
74	}
75
76	static SIMDPP_INL
77	void i_load_u(uint32x4& a, const char* p)
78	{
79	#if SIMDPP_USE_NULL
80	detail::null::load(a, p);
81	#elif SIMDPP_USE_VSX_206
82	a = vec_vsx_ld(`0`, reinterpret_cast<const uint32_t*>(p));
83	#elif SIMDPP_USE_SSE2 \|\| SIMDPP_USE_ALTIVEC
84	uint8x16 b;
85	i_load_u(b, p);
86	a = b;
87	#elif SIMDPP_USE_NEON
88	a = vld1q_u32(reinterpret_cast<const uint32_t*>(p));
89	#elif SIMDPP_USE_MSA
90	a = (v4u32) __msa_ld_w(p, `0`);
91	#endif
92	}
93
94	static SIMDPP_INL
95	void i_load_u(uint64x2& a, const char* p)
96	{
97	#if SIMDPP_USE_NULL
98	detail::null::load(a, p);
99	#elif SIMDPP_USE_SSE2
100	uint8x16 b;
101	i_load_u(b, p);
102	a = b;
103	#elif SIMDPP_USE_VSX_207
104	#if SIMDPP_64_BITS
105	a = (__vector uint64_t) vec_vsx_ld(`0`, reinterpret_cast<const uint64_t*>(p));
106	#else
107	// BUG: GCC does not support vec_vsx_ld in 32-bit mode even when
108	// VSX 2.07 is enabled
109	uint8x16 r;
110	i_load_u(r, p);
111	a = r;
112	#endif
113	#elif SIMDPP_USE_ALTIVEC
114	detail::null::load(a, p);
115	#elif SIMDPP_USE_NEON
116	a = vld1q_u64(reinterpret_cast<const uint64_t*>(p));
117	#elif SIMDPP_USE_MSA
118	a = (v2u64) __msa_ld_d(p, `0`);
119	#endif
120	}
121
122	static SIMDPP_INL
123	void i_load_u(float32x4& a, const char* p)
124	{
125	const float* q = reinterpret_cast<const float*>(p);
126	(void) q;
127	#if SIMDPP_USE_NULL \|\| SIMDPP_USE_NEON_NO_FLT_SP
128	detail::null::load(a, p);
129	#elif SIMDPP_USE_SSE2
130	a = _mm_loadu_ps(q);
131	#elif SIMDPP_USE_NEON
132	a = vld1q_f32(q);
133	#elif SIMDPP_USE_VSX_206
134	a = vec_vsx_ld(`0`, q);
135	#elif SIMDPP_USE_ALTIVEC
136	uint32x4 b; (void) q;
137	i_load_u(b, p);
138	a = b;
139	#elif SIMDPP_USE_MSA
140	a = (v4f32) __msa_ld_w(q, `0`);
141	#endif
142	}
143
144	static SIMDPP_INL
145	void i_load_u(float64x2& a, const char* p)
146	{
147	const double* q = reinterpret_cast<const double*>(p);
148	(void) q;
149	#if SIMDPP_USE_SSE2
150	a = _mm_loadu_pd(q);
151	#elif SIMDPP_USE_NEON64
152	a = vld1q_f64(q);
153	#elif SIMDPP_USE_VSX_206
154	a = vec_vsx_ld(`0`, q);
155	#elif SIMDPP_USE_MSA
156	a = (v2f64) __msa_ld_d(q, `0`);
157	#elif SIMDPP_USE_NULL \|\| SIMDPP_USE_ALTIVEC \|\| SIMDPP_USE_NEON
158	detail::null::load(a, p);
159	#else
160	SIMDPP_NOT_IMPLEMENTED2(a, p);
161	#endif
162	}
163
#if SIMDPP_USE_AVX2
// 256-bit integer vectors: every element width uses the same
// _mm256_loadu_si256 load.

/// Unaligned load of a uint8x32 vector from `p` into `a`.
static SIMDPP_INL
void i_load_u(uint8x32& a, const char* p)
{
    const __m256i* src = reinterpret_cast<const __m256i*>(p);
    a = _mm256_loadu_si256(src);
}

/// Unaligned load of a uint16x16 vector from `p` into `a`.
static SIMDPP_INL
void i_load_u(uint16x16& a, const char* p)
{
    const __m256i* src = reinterpret_cast<const __m256i*>(p);
    a = _mm256_loadu_si256(src);
}

/// Unaligned load of a uint32x8 vector from `p` into `a`.
static SIMDPP_INL
void i_load_u(uint32x8& a, const char* p)
{
    const __m256i* src = reinterpret_cast<const __m256i*>(p);
    a = _mm256_loadu_si256(src);
}

/// Unaligned load of a uint64x4 vector from `p` into `a`.
static SIMDPP_INL
void i_load_u(uint64x4& a, const char* p)
{
    const __m256i* src = reinterpret_cast<const __m256i*>(p);
    a = _mm256_loadu_si256(src);
}
#endif
#if SIMDPP_USE_AVX
/// Unaligned load of a float32x8 vector from `p` into `a`.
static SIMDPP_INL
void i_load_u(float32x8& a, const char* p)
{
    const float* src = reinterpret_cast<const float*>(p);
    a = _mm256_loadu_ps(src);
}

/// Unaligned load of a float64x4 vector from `p` into `a`.
static SIMDPP_INL
void i_load_u(float64x4& a, const char* p)
{
    const double* src = reinterpret_cast<const double*>(p);
    a = _mm256_loadu_pd(src);
}
#endif
198
#if __INTEL_COMPILER && SIMDPP_USE_AVX && !SIMDPP_USE_AVX512F
// BUG: Certain versions of ICC don't like vectors larger than native vector
// (e.g. float32<16> and float64<8>) on AVX and AVX2. Two xmm vmovaps aligned
// loads are emitted for each 32-byte load even though the argument is clearly
// unaligned (e.g. p + 1). The code below results in the same output except
// that correct vmovups unaligned load instructions are used.

/// Fills each 32-byte sub-vector of `a` from two explicit 16-byte unaligned
/// loads, advancing `p` by 32 bytes per sub-vector.
template<unsigned N> SIMDPP_INL
void i_load_u(float32<N>& a, const char* p)
{
    for (unsigned i = 0; i < float32<N>::vec_length; ++i, p += 32) {
        const __m128 lower = _mm_loadu_ps(reinterpret_cast<const float*>(p));
        const __m128 upper = _mm_loadu_ps(reinterpret_cast<const float*>(p + 16));
        a.vec(i) = _mm256_insertf128_ps(_mm256_castps128_ps256(lower), upper, 1);
    }
}

/// Fills each 32-byte sub-vector of `a` from two explicit 16-byte unaligned
/// loads, advancing `p` by 32 bytes per sub-vector.
template<unsigned N> SIMDPP_INL
void i_load_u(float64<N>& a, const char* p)
{
    for (unsigned i = 0; i < float64<N>::vec_length; ++i, p += 32) {
        const __m128d lower = _mm_loadu_pd(reinterpret_cast<const double*>(p));
        const __m128d upper = _mm_loadu_pd(reinterpret_cast<const double*>(p + 16));
        a.vec(i) = _mm256_insertf128_pd(_mm256_castpd128_pd256(lower), upper, 1);
    }
}
#endif
229
#if __INTEL_COMPILER && SIMDPP_USE_AVX2 && !SIMDPP_USE_AVX512BW
/// Fills each 32-byte sub-vector of `a` from two explicit 16-byte unaligned
/// loads, advancing `p` by 32 bytes per sub-vector.
template<unsigned N> SIMDPP_INL
void i_load_u(uint8<N>& a, const char* p)
{
    for (unsigned i = 0; i < uint8<N>::vec_length; ++i, p += 32) {
        const __m128i lower = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
        const __m128i upper = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + 16));
        a.vec(i) = _mm256_inserti128_si256(_mm256_castsi128_si256(lower), upper, 1);
    }
}

/// Fills each 32-byte sub-vector of `a` from two explicit 16-byte unaligned
/// loads, advancing `p` by 32 bytes per sub-vector.
template<unsigned N> SIMDPP_INL
void i_load_u(uint16<N>& a, const char* p)
{
    for (unsigned i = 0; i < uint16<N>::vec_length; ++i, p += 32) {
        const __m128i lower = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
        const __m128i upper = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + 16));
        a.vec(i) = _mm256_inserti128_si256(_mm256_castsi128_si256(lower), upper, 1);
    }
}
#endif
255
#if __INTEL_COMPILER && SIMDPP_USE_AVX2 && !SIMDPP_USE_AVX512F
/// Fills each 32-byte sub-vector of `a` from two explicit 16-byte unaligned
/// loads, advancing `p` by 32 bytes per sub-vector.
template<unsigned N> SIMDPP_INL
void i_load_u(uint32<N>& a, const char* p)
{
    for (unsigned i = 0; i < uint32<N>::vec_length; ++i, p += 32) {
        const __m128i lower = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
        const __m128i upper = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + 16));
        a.vec(i) = _mm256_inserti128_si256(_mm256_castsi128_si256(lower), upper, 1);
    }
}

/// Fills each 32-byte sub-vector of `a` from two explicit 16-byte unaligned
/// loads, advancing `p` by 32 bytes per sub-vector.
template<unsigned N> SIMDPP_INL
void i_load_u(uint64<N>& a, const char* p)
{
    for (unsigned i = 0; i < uint64<N>::vec_length; ++i, p += 32) {
        const __m128i lower = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
        const __m128i upper = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + 16));
        a.vec(i) = _mm256_inserti128_si256(_mm256_castsi128_si256(lower), upper, 1);
    }
}
#endif
281
#if SIMDPP_USE_AVX512BW
/// Unaligned load of a uint8<64> vector from `p` into `a`.
SIMDPP_INL void i_load_u(uint8<64>& a, const char* p)
{
    const char* src = p;
    a = _mm512_loadu_si512(src);
}

/// Unaligned load of a uint16<32> vector from `p` into `a`.
SIMDPP_INL void i_load_u(uint16<32>& a, const char* p)
{
    const char* src = p;
    a = _mm512_loadu_si512(src);
}
#endif
292
#if SIMDPP_USE_AVX512F
/// Unaligned load of a uint32<16> vector from `p` into `a`.
static SIMDPP_INL
void i_load_u(uint32<16>& a, const char* p)
{
    const char* src = p;
    a = _mm512_loadu_si512(src);
}

/// Unaligned load of a uint64<8> vector from `p` into `a`.
static SIMDPP_INL
void i_load_u(uint64<8>& a, const char* p)
{
    const char* src = p;
    a = _mm512_loadu_si512(src);
}

/// Unaligned load of a float32<16> vector from `p` into `a`.
static SIMDPP_INL
void i_load_u(float32<16>& a, const char* p)
{
    const float* src = reinterpret_cast<const float*>(p);
    a = _mm512_loadu_ps(src);
}

/// Unaligned load of a float64<8> vector from `p` into `a`.
static SIMDPP_INL
void i_load_u(float64<8>& a, const char* p)
{
    const double* src = reinterpret_cast<const double*>(p);
    a = _mm512_loadu_pd(src);
}
#endif
315
316	// -----------------------------------------------------------------------------
317
318	template<class V> SIMDPP_INL
319	void i_load_u(V& a, const char* p)
320	{
321	const unsigned veclen = V::base_vector_type::length_bytes;
322	for (unsigned i = `0`; i < V::vec_length; ++i) {
323	i_load_u(a.vec(i), p);
324	p += veclen;
325	}
326	}
327
328	template<class V>
329	V i_load_u_any(const char* p)
330	{
331	typename detail::remove_sign<V>::type r;
332	i_load_u(r, p);
333	return V(r);
334	}
335
336	} // namespace insn
337
338	template<class V> SIMDPP_INL
339	void construct_eval(V& v, const expr_vec_load_u& e)
340	{
341	v = insn::i_load_u_any<V>(e.a);
342	}
343
344	} // namespace detail
345	} // namespace SIMDPP_ARCH_NAMESPACE
346	} // namespace simdpp
347
348	#endif
349
350

// Browse the source code of bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/detail/insn/load_u.h