1 | /* Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_LOAD_U_H |
9 | #define LIBSIMDPP_SIMDPP_DETAIL_INSN_LOAD_U_H |
10 | |
11 | #ifndef LIBSIMDPP_SIMD_H |
12 | #error "This file must be included through simd.h" |
13 | #endif |
14 | |
15 | #include <simdpp/types.h> |
16 | #include <simdpp/core/transpose.h> |
17 | #include <simdpp/detail/align.h> |
18 | #include <simdpp/detail/not_implemented.h> |
19 | #include <simdpp/detail/insn/mem_unpack.h> |
20 | #include <simdpp/detail/null/memory.h> |
21 | |
22 | namespace simdpp { |
23 | namespace SIMDPP_ARCH_NAMESPACE { |
24 | namespace detail { |
25 | namespace insn { |
26 | |
27 | // ----------------------------------------------------------------------------- |
28 | |
// Each integer type is handled separately because higher alignment guarantees
// offer better performance on some architectures, e.g. ARM. Note that we do
// not use LDDQU on SSE: it has usage restrictions and improves performance
// only on Pentium 4-era processors.
33 | static SIMDPP_INL |
34 | void i_load_u(uint8x16& a, const char* p) |
35 | { |
36 | #if SIMDPP_USE_NULL |
37 | detail::null::load(a, p); |
38 | #elif SIMDPP_USE_SSE2 |
39 | a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p)); |
40 | #elif SIMDPP_USE_NEON |
41 | a = vld1q_u8(reinterpret_cast<const uint8_t*>(p)); |
42 | #elif SIMDPP_USE_VSX_206 |
43 | const uint8_t* q = reinterpret_cast<const uint8_t*>(p); |
44 | a = vec_vsx_ld(0, q); |
45 | #elif SIMDPP_USE_ALTIVEC |
46 | const uint8_t* q = reinterpret_cast<const uint8_t*>(p); |
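    // No unaligned load instruction is available. Load the two aligned
    // 16-byte blocks that span the data and combine them with a byte permute;
    // vec_lvsl derives the shuffle mask from the misalignment of the address.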
47 | uint8x16 l1, l2, mask; |
48 | l1 = vec_ld(0, q); |
49 | l2 = vec_ld(16, q); |
50 | #pragma GCC diagnostic push |
51 | #pragma GCC diagnostic ignored "-Wdeprecated" |
52 | mask = vec_lvsl(0, q); |
53 | #pragma GCC diagnostic pop |
54 | a = vec_perm(l1.native(), l2.native(), mask.native()); |
55 | #elif SIMDPP_USE_MSA |
56 | a = (v16u8) __msa_ld_b(p, 0); |
57 | #endif |
58 | } |
59 | |
60 | static SIMDPP_INL |
61 | void i_load_u(uint16x8& a, const char* p) |
62 | { |
63 | #if SIMDPP_USE_NULL |
64 | detail::null::load(a, p); |
65 | #elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC |
66 | uint8x16 b; |
67 | i_load_u(b, p); |
68 | a = b; |
69 | #elif SIMDPP_USE_NEON |
70 | a = vld1q_u16(reinterpret_cast<const uint16_t*>(p)); |
71 | #elif SIMDPP_USE_MSA |
72 | a = (v8u16) __msa_ld_h(p, 0); |
73 | #endif |
74 | } |
75 | |
76 | static SIMDPP_INL |
77 | void i_load_u(uint32x4& a, const char* p) |
78 | { |
79 | #if SIMDPP_USE_NULL |
80 | detail::null::load(a, p); |
81 | #elif SIMDPP_USE_VSX_206 |
82 | a = vec_vsx_ld(0, reinterpret_cast<const uint32_t*>(p)); |
83 | #elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC |
84 | uint8x16 b; |
85 | i_load_u(b, p); |
86 | a = b; |
87 | #elif SIMDPP_USE_NEON |
88 | a = vld1q_u32(reinterpret_cast<const uint32_t*>(p)); |
89 | #elif SIMDPP_USE_MSA |
90 | a = (v4u32) __msa_ld_w(p, 0); |
91 | #endif |
92 | } |
93 | |
94 | static SIMDPP_INL |
95 | void i_load_u(uint64x2& a, const char* p) |
96 | { |
97 | #if SIMDPP_USE_NULL |
98 | detail::null::load(a, p); |
99 | #elif SIMDPP_USE_SSE2 |
100 | uint8x16 b; |
101 | i_load_u(b, p); |
102 | a = b; |
103 | #elif SIMDPP_USE_VSX_207 |
104 | #if SIMDPP_64_BITS |
105 | a = (__vector uint64_t) vec_vsx_ld(0, reinterpret_cast<const uint64_t*>(p)); |
106 | #else |
107 | // BUG: GCC does not support vec_vsx_ld in 32-bit mode even when |
108 | // VSX 2.07 is enabled |
109 | uint8x16 r; |
110 | i_load_u(r, p); |
111 | a = r; |
112 | #endif |
113 | #elif SIMDPP_USE_ALTIVEC |
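    // Plain ALTIVEC has no 64-bit integer element support; fall back to an
    // element-wise load.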
114 | detail::null::load(a, p); |
115 | #elif SIMDPP_USE_NEON |
116 | a = vld1q_u64(reinterpret_cast<const uint64_t*>(p)); |
117 | #elif SIMDPP_USE_MSA |
118 | a = (v2u64) __msa_ld_d(p, 0); |
119 | #endif |
120 | } |
121 | |
122 | static SIMDPP_INL |
123 | void i_load_u(float32x4& a, const char* p) |
124 | { |
125 | const float* q = reinterpret_cast<const float*>(p); |
126 | (void) q; |
127 | #if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP |
128 | detail::null::load(a, p); |
129 | #elif SIMDPP_USE_SSE2 |
130 | a = _mm_loadu_ps(q); |
131 | #elif SIMDPP_USE_NEON |
132 | a = vld1q_f32(q); |
133 | #elif SIMDPP_USE_VSX_206 |
134 | a = vec_vsx_ld(0, q); |
135 | #elif SIMDPP_USE_ALTIVEC |
    uint32x4 b;
137 | i_load_u(b, p); |
138 | a = b; |
139 | #elif SIMDPP_USE_MSA |
140 | a = (v4f32) __msa_ld_w(q, 0); |
141 | #endif |
142 | } |
143 | |
144 | static SIMDPP_INL |
145 | void i_load_u(float64x2& a, const char* p) |
146 | { |
147 | const double* q = reinterpret_cast<const double*>(p); |
148 | (void) q; |
149 | #if SIMDPP_USE_SSE2 |
150 | a = _mm_loadu_pd(q); |
151 | #elif SIMDPP_USE_NEON64 |
152 | a = vld1q_f64(q); |
153 | #elif SIMDPP_USE_VSX_206 |
154 | a = vec_vsx_ld(0, q); |
155 | #elif SIMDPP_USE_MSA |
156 | a = (v2f64) __msa_ld_d(q, 0); |
157 | #elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC || SIMDPP_USE_NEON |
158 | detail::null::load(a, p); |
159 | #else |
160 | SIMDPP_NOT_IMPLEMENTED2(a, p); |
161 | #endif |
162 | } |
163 | |
164 | #if SIMDPP_USE_AVX2 |
165 | static SIMDPP_INL |
166 | void i_load_u(uint8x32& a, const char* p) |
167 | { |
168 | a = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p)); |
169 | } |
170 | static SIMDPP_INL |
171 | void i_load_u(uint16x16& a, const char* p) |
172 | { |
173 | a = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p)); |
174 | } |
175 | static SIMDPP_INL |
176 | void i_load_u(uint32x8& a, const char* p) |
177 | { |
178 | a = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p)); |
179 | } |
180 | static SIMDPP_INL |
181 | void i_load_u(uint64x4& a, const char* p) |
182 | { |
183 | a = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p)); |
184 | } |
185 | #endif |
186 | #if SIMDPP_USE_AVX |
187 | static SIMDPP_INL |
188 | void i_load_u(float32x8& a, const char* p) |
189 | { |
190 | a = _mm256_loadu_ps(reinterpret_cast<const float*>(p)); |
191 | } |
192 | static SIMDPP_INL |
193 | void i_load_u(float64x4& a, const char* p) |
194 | { |
195 | a = _mm256_loadu_pd(reinterpret_cast<const double*>(p)); |
196 | } |
197 | #endif |
198 | |
199 | #if __INTEL_COMPILER && SIMDPP_USE_AVX && !SIMDPP_USE_AVX512F |
// BUG: Certain versions of ICC mishandle vectors wider than the native vector
// (e.g. float32<16> and float64<8>) on AVX and AVX2: two aligned xmm vmovaps
// loads are emitted for each 32-byte load even though the argument is clearly
// unaligned (e.g. p + 1). The code below produces the same result, except
// that the correct vmovups unaligned load instructions are used.
205 | template<unsigned N> SIMDPP_INL |
206 | void i_load_u(float32<N>& a, const char* p) |
207 | { |
208 | for (unsigned i = 0; i < float32<N>::vec_length; ++i) { |
209 | __m128 lo, hi; |
210 | lo = _mm_loadu_ps(reinterpret_cast<const float*>(p)); |
211 | hi = _mm_loadu_ps(reinterpret_cast<const float*>(p + 16)); |
212 | a.vec(i) = _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1); |
213 | p += 32; |
214 | } |
215 | } |
216 | |
217 | template<unsigned N> SIMDPP_INL |
218 | void i_load_u(float64<N>& a, const char* p) |
219 | { |
220 | for (unsigned i = 0; i < float64<N>::vec_length; ++i) { |
221 | __m128d lo, hi; |
222 | lo = _mm_loadu_pd(reinterpret_cast<const double*>(p)); |
223 | hi = _mm_loadu_pd(reinterpret_cast<const double*>(p + 16)); |
224 | a.vec(i) = _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1); |
225 | p += 32; |
226 | } |
227 | } |
228 | #endif |
229 | |
230 | #if __INTEL_COMPILER && SIMDPP_USE_AVX2 && !SIMDPP_USE_AVX512BW |
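// Same ICC workaround as above, for the 8/16-bit integer element types.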
231 | template<unsigned N> SIMDPP_INL |
232 | void i_load_u(uint8<N>& a, const char* p) |
233 | { |
234 | for (unsigned i = 0; i < uint8<N>::vec_length; ++i) { |
235 | __m128i lo, hi; |
236 | lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p)); |
237 | hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + 16)); |
238 | a.vec(i) = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1); |
239 | p += 32; |
240 | } |
241 | } |
242 | |
243 | template<unsigned N> SIMDPP_INL |
244 | void i_load_u(uint16<N>& a, const char* p) |
245 | { |
246 | for (unsigned i = 0; i < uint16<N>::vec_length; ++i) { |
247 | __m128i lo, hi; |
248 | lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p)); |
249 | hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + 16)); |
250 | a.vec(i) = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1); |
251 | p += 32; |
252 | } |
253 | } |
254 | #endif |
255 | |
256 | #if __INTEL_COMPILER && SIMDPP_USE_AVX2 && !SIMDPP_USE_AVX512F |
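// Same ICC workaround as above, for the 32/64-bit integer element types.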
257 | template<unsigned N> SIMDPP_INL |
258 | void i_load_u(uint32<N>& a, const char* p) |
259 | { |
260 | for (unsigned i = 0; i < uint32<N>::vec_length; ++i) { |
261 | __m128i lo, hi; |
262 | lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p)); |
263 | hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + 16)); |
264 | a.vec(i) = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1); |
265 | p += 32; |
266 | } |
267 | } |
268 | |
269 | template<unsigned N> SIMDPP_INL |
270 | void i_load_u(uint64<N>& a, const char* p) |
271 | { |
272 | for (unsigned i = 0; i < uint64<N>::vec_length; ++i) { |
273 | __m128i lo, hi; |
274 | lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p)); |
275 | hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + 16)); |
276 | a.vec(i) = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1); |
277 | p += 32; |
278 | } |
279 | } |
280 | #endif |
281 | |
282 | #if SIMDPP_USE_AVX512BW |
283 | SIMDPP_INL void i_load_u(uint8<64>& a, const char* p) |
284 | { |
285 | a = _mm512_loadu_si512(p); |
286 | } |
287 | SIMDPP_INL void i_load_u(uint16<32>& a, const char* p) |
288 | { |
289 | a = _mm512_loadu_si512(p); |
290 | } |
291 | #endif |
292 | |
293 | #if SIMDPP_USE_AVX512F |
294 | static SIMDPP_INL |
295 | void i_load_u(uint32<16>& a, const char* p) |
296 | { |
297 | a = _mm512_loadu_si512(p); |
298 | } |
299 | static SIMDPP_INL |
300 | void i_load_u(uint64<8>& a, const char* p) |
301 | { |
302 | a = _mm512_loadu_si512(p); |
303 | } |
304 | static SIMDPP_INL |
305 | void i_load_u(float32<16>& a, const char* p) |
306 | { |
307 | a = _mm512_loadu_ps(reinterpret_cast<const float*>(p)); |
308 | } |
309 | static SIMDPP_INL |
310 | void i_load_u(float64<8>& a, const char* p) |
311 | { |
312 | a = _mm512_loadu_pd(reinterpret_cast<const double*>(p)); |
313 | } |
314 | #endif |
315 | |
316 | // ----------------------------------------------------------------------------- |
317 | |
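// Generic fallback: vectors wider than the native vector are loaded one
// native-width sub-vector at a time.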
318 | template<class V> SIMDPP_INL |
319 | void i_load_u(V& a, const char* p) |
320 | { |
321 | const unsigned veclen = V::base_vector_type::length_bytes; |
322 | for (unsigned i = 0; i < V::vec_length; ++i) { |
323 | i_load_u(a.vec(i), p); |
324 | p += veclen; |
325 | } |
326 | } |
327 | |
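// Loads into the type with the sign removed (e.g. int32 -> uint32), so only
// the unsigned integer and floating-point overloads above are needed; the
// result is converted back to V on return.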
328 | template<class V> |
329 | V i_load_u_any(const char* p) |
330 | { |
331 | typename detail::remove_sign<V>::type r; |
332 | i_load_u(r, p); |
333 | return V(r); |
334 | } |
335 | |
336 | } // namespace insn |
337 | |
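// Entry point from the expression framework: assigning the result of the
// public load_u() to a vector evaluates the expr_vec_load_u node here, e.g.
//
//     uint8_t buf[17];
//     uint8<16> v = load_u(buf + 1); // unaligned address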
338 | template<class V> SIMDPP_INL |
339 | void construct_eval(V& v, const expr_vec_load_u& e) |
340 | { |
341 | v = insn::i_load_u_any<V>(e.a); |
342 | } |
343 | |
344 | } // namespace detail |
345 | } // namespace SIMDPP_ARCH_NAMESPACE |
346 | } // namespace simdpp |
347 | |
348 | #endif |