/*  Copyright (C) 2011-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_LOAD_U_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_LOAD_U_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/transpose.h>
#include <simdpp/detail/align.h>
#include <simdpp/detail/not_implemented.h>
#include <simdpp/detail/insn/mem_unpack.h>
#include <simdpp/detail/null/memory.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

// -----------------------------------------------------------------------------

// Each integer type is handled separately because higher alignment guarantees
// offer better performance on e.g. ARM. Note that we don't use LDDQU on SSE,
// because it has usage restrictions and improves performance only on
// Pentium 4 era processors.
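//
// A minimal usage sketch (assuming the public simdpp::load_u wrapper, which
// dispatches to the i_load_u overloads below):
//
//     uint8_t buf[17];                                // arbitrary alignment
//     simdpp::uint8<16> v = simdpp::load_u(buf + 1);  // unaligned pointer is fine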
static SIMDPP_INL
void i_load_u(uint8x16& a, const char* p)
{
#if SIMDPP_USE_NULL
    detail::null::load(a, p);
#elif SIMDPP_USE_SSE2
    a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
#elif SIMDPP_USE_NEON
    a = vld1q_u8(reinterpret_cast<const uint8_t*>(p));
#elif SIMDPP_USE_VSX_206
    const uint8_t* q = reinterpret_cast<const uint8_t*>(p);
    a = vec_vsx_ld(0, q);
#elif SIMDPP_USE_ALTIVEC
    const uint8_t* q = reinterpret_cast<const uint8_t*>(p);
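    // vec_ld ignores the low four bits of the address, so the unaligned load
    // is emulated with two aligned loads spanning the requested 16 bytes plus
    // a permute. vec_lvsl produces the shuffle mask that selects those bytes;
    // it is deprecated on newer compilers, hence the diagnostic pragmas below.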
    uint8x16 l1, l2, mask;
    l1 = vec_ld(0, q);
    l2 = vec_ld(16, q);
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated"
    mask = vec_lvsl(0, q);
#pragma GCC diagnostic pop
    a = vec_perm(l1.native(), l2.native(), mask.native());
#elif SIMDPP_USE_MSA
    a = (v16u8) __msa_ld_b(p, 0);
#endif
}

static SIMDPP_INL
void i_load_u(uint16x8& a, const char* p)
{
#if SIMDPP_USE_NULL
    detail::null::load(a, p);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC
    uint8x16 b;
    i_load_u(b, p);
    a = b;
#elif SIMDPP_USE_NEON
    a = vld1q_u16(reinterpret_cast<const uint16_t*>(p));
#elif SIMDPP_USE_MSA
    a = (v8u16) __msa_ld_h(p, 0);
#endif
}

static SIMDPP_INL
void i_load_u(uint32x4& a, const char* p)
{
#if SIMDPP_USE_NULL
    detail::null::load(a, p);
#elif SIMDPP_USE_VSX_206
    a = vec_vsx_ld(0, reinterpret_cast<const uint32_t*>(p));
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC
    uint8x16 b;
    i_load_u(b, p);
    a = b;
#elif SIMDPP_USE_NEON
    a = vld1q_u32(reinterpret_cast<const uint32_t*>(p));
#elif SIMDPP_USE_MSA
    a = (v4u32) __msa_ld_w(p, 0);
#endif
}

static SIMDPP_INL
void i_load_u(uint64x2& a, const char* p)
{
#if SIMDPP_USE_NULL
    detail::null::load(a, p);
#elif SIMDPP_USE_SSE2
    uint8x16 b;
    i_load_u(b, p);
    a = b;
#elif SIMDPP_USE_VSX_207
#if SIMDPP_64_BITS
    a = (__vector uint64_t) vec_vsx_ld(0, reinterpret_cast<const uint64_t*>(p));
#else
    // BUG: GCC does not support vec_vsx_ld in 32-bit mode even when
    // VSX 2.07 is enabled
    uint8x16 r;
    i_load_u(r, p);
    a = r;
#endif
#elif SIMDPP_USE_ALTIVEC
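    // Pre-VSX Altivec has no 64-bit integer elements; fall back to scalar loads.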
    detail::null::load(a, p);
#elif SIMDPP_USE_NEON
    a = vld1q_u64(reinterpret_cast<const uint64_t*>(p));
#elif SIMDPP_USE_MSA
    a = (v2u64) __msa_ld_d(p, 0);
#endif
}

static SIMDPP_INL
void i_load_u(float32x4& a, const char* p)
{
    const float* q = reinterpret_cast<const float*>(p);
    (void) q;
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    detail::null::load(a, p);
#elif SIMDPP_USE_SSE2
    a = _mm_loadu_ps(q);
#elif SIMDPP_USE_NEON
    a = vld1q_f32(q);
#elif SIMDPP_USE_VSX_206
    a = vec_vsx_ld(0, q);
#elif SIMDPP_USE_ALTIVEC
    uint32x4 b;
    i_load_u(b, p);
    a = b;
#elif SIMDPP_USE_MSA
    a = (v4f32) __msa_ld_w(q, 0);
#endif
}

static SIMDPP_INL
void i_load_u(float64x2& a, const char* p)
{
    const double* q = reinterpret_cast<const double*>(p);
    (void) q;
#if SIMDPP_USE_SSE2
    a = _mm_loadu_pd(q);
#elif SIMDPP_USE_NEON64
    a = vld1q_f64(q);
#elif SIMDPP_USE_VSX_206
    a = vec_vsx_ld(0, q);
#elif SIMDPP_USE_MSA
    a = (v2f64) __msa_ld_d(q, 0);
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC || SIMDPP_USE_NEON
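    // None of these targets (NULL backend, pre-VSX Altivec, 32-bit NEON) has
    // native 64-bit floating-point vectors; fall back to scalar loads.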
    detail::null::load(a, p);
#else
    SIMDPP_NOT_IMPLEMENTED2(a, p);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_load_u(uint8x32& a, const char* p)
{
    a = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
}
static SIMDPP_INL
void i_load_u(uint16x16& a, const char* p)
{
    a = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
}
static SIMDPP_INL
void i_load_u(uint32x8& a, const char* p)
{
    a = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
}
static SIMDPP_INL
void i_load_u(uint64x4& a, const char* p)
{
    a = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
}
#endif
#if SIMDPP_USE_AVX
static SIMDPP_INL
void i_load_u(float32x8& a, const char* p)
{
    a = _mm256_loadu_ps(reinterpret_cast<const float*>(p));
}
static SIMDPP_INL
void i_load_u(float64x4& a, const char* p)
{
    a = _mm256_loadu_pd(reinterpret_cast<const double*>(p));
}
#endif

#if __INTEL_COMPILER && SIMDPP_USE_AVX && !SIMDPP_USE_AVX512F
// BUG: Certain versions of ICC mishandle vectors that are larger than the
// native vector (e.g. float32<16> and float64<8>) on AVX and AVX2. Two xmm
// vmovaps aligned loads are emitted for each 32-byte load even though the
// argument is clearly unaligned (e.g. p + 1). The code below results in the
// same output except that the correct vmovups unaligned load instructions
// are used.
template<unsigned N> SIMDPP_INL
void i_load_u(float32<N>& a, const char* p)
{
    for (unsigned i = 0; i < float32<N>::vec_length; ++i) {
        __m128 lo, hi;
        lo = _mm_loadu_ps(reinterpret_cast<const float*>(p));
        hi = _mm_loadu_ps(reinterpret_cast<const float*>(p + 16));
        a.vec(i) = _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
        p += 32;
    }
}

template<unsigned N> SIMDPP_INL
void i_load_u(float64<N>& a, const char* p)
{
    for (unsigned i = 0; i < float64<N>::vec_length; ++i) {
        __m128d lo, hi;
        lo = _mm_loadu_pd(reinterpret_cast<const double*>(p));
        hi = _mm_loadu_pd(reinterpret_cast<const double*>(p + 16));
        a.vec(i) = _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1);
        p += 32;
    }
}
#endif

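// The integer overloads below are guarded separately: under AVX-512BW 512-bit
// uint8/uint16 vectors are native (so the workaround no longer applies to
// them), while AVX-512F already makes the 32/64-bit element types native.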
#if __INTEL_COMPILER && SIMDPP_USE_AVX2 && !SIMDPP_USE_AVX512BW
template<unsigned N> SIMDPP_INL
void i_load_u(uint8<N>& a, const char* p)
{
    for (unsigned i = 0; i < uint8<N>::vec_length; ++i) {
        __m128i lo, hi;
        lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
        hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + 16));
        a.vec(i) = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
        p += 32;
    }
}

template<unsigned N> SIMDPP_INL
void i_load_u(uint16<N>& a, const char* p)
{
    for (unsigned i = 0; i < uint16<N>::vec_length; ++i) {
        __m128i lo, hi;
        lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
        hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + 16));
        a.vec(i) = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
        p += 32;
    }
}
#endif

#if __INTEL_COMPILER && SIMDPP_USE_AVX2 && !SIMDPP_USE_AVX512F
template<unsigned N> SIMDPP_INL
void i_load_u(uint32<N>& a, const char* p)
{
    for (unsigned i = 0; i < uint32<N>::vec_length; ++i) {
        __m128i lo, hi;
        lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
        hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + 16));
        a.vec(i) = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
        p += 32;
    }
}

template<unsigned N> SIMDPP_INL
void i_load_u(uint64<N>& a, const char* p)
{
    for (unsigned i = 0; i < uint64<N>::vec_length; ++i) {
        __m128i lo, hi;
        lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
        hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + 16));
        a.vec(i) = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
        p += 32;
    }
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL void i_load_u(uint8<64>& a, const char* p)
{
    a = _mm512_loadu_si512(p);
}
SIMDPP_INL void i_load_u(uint16<32>& a, const char* p)
{
    a = _mm512_loadu_si512(p);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_load_u(uint32<16>& a, const char* p)
{
    a = _mm512_loadu_si512(p);
}
static SIMDPP_INL
void i_load_u(uint64<8>& a, const char* p)
{
    a = _mm512_loadu_si512(p);
}
static SIMDPP_INL
void i_load_u(float32<16>& a, const char* p)
{
    a = _mm512_loadu_ps(reinterpret_cast<const float*>(p));
}
static SIMDPP_INL
void i_load_u(float64<8>& a, const char* p)
{
    a = _mm512_loadu_pd(reinterpret_cast<const double*>(p));
}
#endif

// -----------------------------------------------------------------------------

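// Generic fallback: a vector wider than the native width is stored as an
// array of base vectors, so e.g. uint8<64> on a 16-byte target decomposes
// into four 16-byte unaligned loads at p, p + 16, p + 32 and p + 48.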
template<class V> SIMDPP_INL
void i_load_u(V& a, const char* p)
{
    const unsigned veclen = V::base_vector_type::length_bytes;
    for (unsigned i = 0; i < V::vec_length; ++i) {
        i_load_u(a.vec(i), p);
        p += veclen;
    }
}

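// Signed vectors are loaded through the equivalent unsigned type, since the
// overloads above are defined only for unsigned and floating-point elements.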
template<class V>
V i_load_u_any(const char* p)
{
    typename detail::remove_sign<V>::type r;
    i_load_u(r, p);
    return V(r);
}

} // namespace insn

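// Evaluates an unaligned-load expression node; the expression template
// machinery calls this when the result of load_u() is assigned to a vector.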
template<class V> SIMDPP_INL
void construct_eval(V& v, const expr_vec_load_u& e)
{
    v = insn::i_load_u_any<V>(e.a);
}

} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif
