/*  Copyright (C) 2013-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_LAST_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_LAST_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/align.h>
#include <simdpp/core/blend.h>
#include <simdpp/core/cmp_gt.h>
#include <simdpp/core/load.h>
#include <simdpp/core/load_u.h>
#include <simdpp/core/move_l.h>
#include <simdpp/core/store.h>
#include <simdpp/core/zip_hi.h> // for zip2_hi, used in the float64x2 path below
#include <simdpp/detail/neon/memory_store.h>
#include <simdpp/detail/null/memory.h>
#include <simdpp/detail/extract128.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_store_last(char* p, const uint8x16& a, unsigned n)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    detail::null::store_last(p, a, n);
#elif SIMDPP_USE_ALTIVEC && SIMDPP_BIG_ENDIAN
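    // vec_lvsl(n, NULL) yields the byte sequence {n, n+1, ..., n+15}, so the
    // comparison below sets the mask exactly in the last n byte positions;
    // blend() then takes those bytes from a and the rest from memory.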
    uint8x16 mask = vec_lvsl(n, (const uint8_t*)NULL);
    mask = cmp_gt(mask, 0x0f);
    uint8x16 b = load(p);
    b = blend(a, b, mask);
    store(p, b);
#elif SIMDPP_USE_ALTIVEC && SIMDPP_LITTLE_ENDIAN
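    // vec_sro shifts the all-ones vector by n bytes; blend() then keeps the
    // previous memory contents in the first 16-n bytes and takes the last n
    // bytes from a.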
    uint8<16> mask = make_ones();
    uint8<16> shift = vec_splats((unsigned char)(n << 3));
    mask = vec_sro(mask.native(), shift.native());

    uint8x16 b = load(p);
    b = blend(b, a, mask);
    store(p, b);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON
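    // Sliding-window mask: loading 16 bytes at offset n yields 0x00 in the
    // first 16-n positions and 0xff in the last n, i.e. a blend mask that
    // keeps the memory contents except for the last n bytes, which come from a.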
    static const uint8_t mask_d[32] = { 0,0,0,0,0,0,0,0,
                                        0,0,0,0,0,0,0,0,
                                        0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                        0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff };

    uint8x16 mask = load_u(mask_d + n);
    uint8x16 b = load(p);
    b = blend(a, b, mask);
    store(p, b);
#elif SIMDPP_USE_MSA
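    // MSA: use a byte slide to build the same 0x00/0xff blend mask as in the
    // SSE2/NEON path, with ones only in the last n byte positions.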
    int8x16 mask = make_ones();
    int8x16 zero = make_zero();
    mask = __msa_sld_b(mask.native(), zero.native(), n);
    uint8x16 b = load(p);
    b = blend(a, b, mask);
    store(p, b);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_store_last(char* p, const uint8x32& a, unsigned n)
{
    p = detail::assume_aligned(p, 32);
    static const uint8_t mask_d[64] = { 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
                                        0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
                                        0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                        0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                        0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                        0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff };
    uint8x32 mask = load_u(mask_d + n);
    uint8x32 b = load(p);
    b = blend(a, b, mask);
    store(p, b);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL void i_store_last(char* p, const uint8<64>& a, unsigned n)
{
    static const uint8_t mask_d[128] = { 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
                                         0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
                                         0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
                                         0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
                                         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff };

    uint8<64> mask = load_u(mask_d + n);
    uint8<64> b = load(p);
    b = blend(a, b, mask);
    store(p, b);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_store_last(char* p, const uint16x8& a, unsigned n)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    detail::null::store_last(p, a, n);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
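    // Delegate to the byte variant: storing the last n 16-bit elements is the
    // same as storing the last 2*n bytes.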
    i_store_last(p, (uint8x16)a, n*2);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_store_last(char* p, const uint16x16& a, unsigned n)
{
    i_store_last(p, uint8x32(a), n*2);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL void i_store_last(char* p, const uint16<32>& a, unsigned n)
{
    i_store_last(p, uint8<64>(a), n*2);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_store_last(char* p, const uint32x4& a, unsigned n)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    detail::null::store_last(p, a, n);
#elif SIMDPP_USE_AVX2
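    // AVX2 maskstore writes only the lanes whose mask element has its sign bit
    // set, so no read-modify-write of the destination is needed.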
    static const int32_t mask_d[8] = { 0, 0, 0, 0, -1, -1, -1, -1 };
    uint32x4 mask = load_u(mask_d + n);
    _mm_maskstore_epi32(reinterpret_cast<int*>(p), mask.native(), a.native());
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    i_store_last(p, (uint8x16)a, n*4);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_store_last(char* p, const uint32x8& a, unsigned n)
{
    static const int32_t mask_d[16] = { 0, 0, 0, 0, 0, 0, 0, 0,
                                        -1, -1, -1, -1, -1, -1, -1, -1 };
    uint32<8> mask = load_u(mask_d + n);
    _mm256_maskstore_epi32(reinterpret_cast<int*>(p), mask.native(), a.native());
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_store_last(char* p, const uint32<16>& a, unsigned n)
{
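    // Select the top n lanes of the 16-lane write mask; the shifted value is
    // truncated when converted to __mmask16.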
    _mm512_mask_store_epi32(p, 0xffff << (16-n), a.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_store_last(char* p, const uint64x2& a, unsigned n)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_SSE2
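    // For a two-element vector the only partial store is n == 1: shift element 1
    // into the low lane and store it as the upper 64 bits of the destination.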
    if (n == 1) {
        uint64x2 b = move2_l<1>(a);
        _mm_store_sd(reinterpret_cast<double*>(p+8), _mm_castsi128_pd(b.native()));
    }
#elif SIMDPP_USE_NEON
    if (n == 1) {
        neon::store_lane<1,1>(p+8, a);
    }
#elif SIMDPP_USE_VSX_207
    if (n == 1) {
        uint64_t* q = reinterpret_cast<uint64_t*>(p) + 1;
        *q = vec_extract(a.native(), 1);
    }
#elif SIMDPP_USE_MSA
    i_store_last(p, uint8<16>(a), n*8);
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    detail::null::store_last(p, a, n);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_store_last(char* p, const uint64x4& a, unsigned n)
{
    static const int64_t mask_d[8] = { 0, 0, 0, 0, -1, -1, -1, -1 };
    uint64<4> mask = load_u(mask_d + n);
#if __INTEL_COMPILER
    _mm256_maskstore_epi64(reinterpret_cast<__int64*>(p), mask.native(), a.native());
#else
    _mm256_maskstore_epi64(reinterpret_cast<long long*>(p), mask.native(), a.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_store_last(char* p, const uint64<8>& a, unsigned n)
{
    _mm512_mask_store_epi64(p, 0xff << (8-n), a.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_store_last(char* p, const float32x4& ca, unsigned n)
{
    float32<4> a = ca;
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC || SIMDPP_USE_NEON_FLT_SP || SIMDPP_USE_MSA
    i_store_last(p, uint32x4(a), n);
#elif SIMDPP_USE_AVX && !SIMDPP_USE_AMD
    static const int32_t mask_d[8] = { 0, 0, 0, 0, -1, -1, -1, -1 };
    float32x4 mask = load_u(mask_d + n);
    _mm_maskstore_ps(reinterpret_cast<float*>(p),
                     _mm_castps_si128(mask.native()), a.native());
#elif SIMDPP_USE_SSE2
    static const int32_t mask_d[8] = { 0, 0, 0, 0, -1, -1, -1, -1 };
    float32x4 mask = load_u(mask_d + n);
    float32x4 old = load(p);
    a = blend(a, old, mask);
    store(p, a);
#elif SIMDPP_USE_NEON
    // NEON without SIMD single-precision support (integer NEON + scalar VFP):
    // store the requested lanes one at a time, starting from the last lane.
    if (n < 1) return;
    neon::store_lane<3,1>(p+12, a);
    if (n < 2) return;
    neon::store_lane<2,1>(p+8, a);
    if (n < 3) return;
    neon::store_lane<1,1>(p+4, a);
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
void i_store_last(char* p, const float32x8& ca, unsigned n)
{
    float32<8> a = ca;
    static const int32_t mask_d[16] = { 0, 0, 0, 0, 0, 0, 0, 0,
                                        -1, -1, -1, -1, -1, -1, -1, -1 };

    float32x8 mask = load_u(mask_d + n);
#if !SIMDPP_USE_AMD
    _mm256_maskstore_ps(reinterpret_cast<float*>(p),
                        _mm256_castps_si256(mask.native()), a.native());
#else
    float32x8 old = load(p);
    a = blend(a, old, mask);
    store(p, a);
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_store_last(char* p, const float32<16>& a, unsigned n)
{
    _mm512_mask_store_ps(p, 0xffff << (16-n), a.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_store_last(char* p, const float64x2& a, unsigned n)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_SSE2
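    // zip2_hi(a, a) broadcasts the upper element to the low lane so that
    // _mm_store_sd can write it to the last position.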
    if (n == 1) {
        float64x2 b = zip2_hi(a, a);
        _mm_store_sd(reinterpret_cast<double*>(p)+1, b.native());
    }
#elif SIMDPP_USE_NEON64
    if (n == 1) {
        vst1_f64(reinterpret_cast<double*>(p)+1, vget_high_f64(a.native()));
    }
#elif SIMDPP_USE_VSX_206
    if (n == 1) {
        *(reinterpret_cast<double*>(p)+1) = vec_extract(a.native(), 1);
    }
#elif SIMDPP_USE_MSA
    i_store_last(p, uint64x2(a), n);
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
    detail::null::store_last(p, a, n);
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
void i_store_last(char* p, const float64x4& a, unsigned n)
{
    static const int64_t mask_d[8] = { 0, 0, 0, 0, -1, -1, -1, -1 };
    float64x4 mask = load_u(mask_d + n);

#if !SIMDPP_USE_AMD
    _mm256_maskstore_pd(reinterpret_cast<double*>(p),
                        _mm256_castpd_si256(mask.native()), a.native());
#else
    float64x4 old = load(p);
    a = blend(a, old, mask);
    store(p, a);
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_store_last(char* p, const float64<8>& a, unsigned n)
{
    _mm512_mask_store_pd(p, 0xff << (8-n), a.native());
}
#endif

// -----------------------------------------------------------------------------

template<class V> SIMDPP_INL
void i_store_last(char* p, const V& ca, unsigned n)
{
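    // Generic multi-vector case: skip the sub-vectors that hold none of the
    // last n elements, partially store the sub-vector that straddles the
    // boundary, then store the remaining sub-vectors in full.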
    const unsigned veclen = V::base_vector_type::length_bytes;

    typename detail::remove_sign<V>::type a = ca;
    p = detail::assume_aligned(p, veclen);
    unsigned el_to_skip = V::length - n;

    unsigned n_empty_vec = el_to_skip / V::base_vector_type::length;
    unsigned mid_vec_store_count = n % V::base_vector_type::length;
    unsigned curr_vec = 0;

    p += n_empty_vec * veclen;
    curr_vec += n_empty_vec;
    if (mid_vec_store_count > 0) {
        i_store_last(p, a.vec(curr_vec), mid_vec_store_count);
        p += veclen;
        curr_vec++;
    }

    for (; curr_vec < V::vec_length; ++curr_vec) {
        i_store(p, a.vec(curr_vec));
        p += veclen;
    }
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif
