/*  Copyright (C) 2013-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_FIRST_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_FIRST_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/align.h>
#include <simdpp/core/blend.h>
#include <simdpp/core/cmp_lt.h>
#include <simdpp/core/load.h>
#include <simdpp/core/load_u.h>
#include <simdpp/core/store.h>
#include <simdpp/detail/neon/memory_store.h>
#include <simdpp/detail/null/memory.h>
#include <simdpp/detail/extract128.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

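// All partial stores below follow the same pattern: build a byte mask whose
// first n elements are all-ones, then either hand it to a masked-store
// instruction or perform a load/blend/store read-modify-write of the
// destination. The vector masks are obtained by an unaligned load from a
// static table holding veclen 0xff bytes followed by veclen zero bytes:
// loading at offset (veclen - n) yields exactly n leading 0xff bytes (e.g.
// for a 16-byte vector and n == 5, the load at mask_d + 11 picks up five
// 0xff bytes followed by eleven zeros). On big-endian Altivec the mask is
// instead derived from vec_lvsr, whose result contains values below 0x10
// exactly in the first n byte positions.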
static SIMDPP_INL
void i_store_first(char* p, const uint8x16& a, unsigned n)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    detail::null::store_first(p, a, n);
#elif SIMDPP_USE_ALTIVEC && SIMDPP_BIG_ENDIAN
    uint8x16 mask = vec_lvsr(n, (const uint8_t*)NULL);
    mask = cmp_lt(mask, 0x10);
    uint8x16 b = load(p);
    b = blend(a, b, mask);
    store(p, b);
#elif SIMDPP_USE_ALTIVEC && SIMDPP_LITTLE_ENDIAN
    uint8<16> mask = make_ones();
    uint8<16> shift = vec_splats((unsigned char)(n << 3));
    mask = vec_slo(mask.native(), shift.native());

    uint8x16 b = load(p);
    b = blend(b, a, mask);
    store(p, b);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_MSA
    // for MSA we can't use __msa_sld_b because it does not work with shift==16
    static const uint8_t mask_d[32] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                       0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                       0,0,0,0,0,0,0,0,
                                       0,0,0,0,0,0,0,0};

    uint8x16 mask = load_u(mask_d + 16 - n);
    uint8x16 b = load(p);
    b = blend(a, b, mask);
    store(p, b);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_store_first(char* p, const uint8x32& a, unsigned n)
{
    static const uint8_t mask_d[64] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                       0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                       0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                       0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                       0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
                                       0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0};

    uint8x32 mask = load_u(mask_d + 32 - n);
    uint8x32 b = load(p);
    b = blend(a, b, mask);
    store(p, b);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL void i_store_first(char* p, const uint8<64>& a, unsigned n)
{
    static const uint8_t mask_d[128] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                        0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                        0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                        0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                        0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                        0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                        0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                        0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                        0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
                                        0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
                                        0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
                                        0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0};

    uint8<64> mask = load_u(mask_d + 64 - n);
    uint8<64> b = load(p);
    b = blend(a, b, mask);
    store(p, b);
}
#endif

// -----------------------------------------------------------------------------

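// Partial stores of wider elements reuse the 8-bit implementation: storing
// the first n 16-bit (or 32-bit) elements is the same as storing the first
// n*2 (or n*4) bytes, so these overloads just reinterpret the vector and
// scale the element count.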
static SIMDPP_INL
void i_store_first(char* p, const uint16x8& a, unsigned n)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    detail::null::store_first(p, a, n);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    i_store_first(p, (uint8x16)a, n*2);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_store_first(char* p, const uint16x16& a, unsigned n)
{
    i_store_first(p, uint8x32(a), n*2);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL void i_store_first(char* p, const uint16<32>& a, unsigned n)
{
    i_store_first(p, uint8<64>(a), n*2);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_store_first(char* p, const uint32x4& a, unsigned n)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    detail::null::store_first(p, a, n);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    i_store_first(p, (uint8x16)a, n*4);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_store_first(char* p, const uint32x8& a, unsigned n)
{
    static const int32_t mask_d[16] = { -1, -1, -1, -1, -1, -1, -1, -1,
                                         0,  0,  0,  0,  0,  0,  0,  0 };
    uint32<8> mask = load_u(mask_d + 8-n);
    _mm256_maskstore_epi32(reinterpret_cast<int*>(p), mask.native(), a.native());
}
#endif

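// With AVX-512 the write mask can be held in a mask register instead of a
// vector: 0xffff >> (16-n) has exactly the low n bits set, so the masked
// store below writes only the first n 32-bit lanes and leaves the rest of
// the destination untouched (e.g. n == 3 yields the k-mask 0x0007).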
#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_store_first(char* p, const uint32<16>& a, unsigned n)
{
    _mm512_mask_store_epi32(p, 0xffff >> (16-n), a.native());
}
#endif

// -----------------------------------------------------------------------------

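// 64-bit element vectors of length two only need to handle n == 1 here:
// n == 0 stores nothing, and full vectors are written with a regular store()
// by the generic driver at the end of this file, so a single scalar store of
// the low element is sufficient.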
static SIMDPP_INL
void i_store_first(char* p, const uint64x2& a, unsigned n)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_SSE2
    if (n == 1) {
        _mm_store_sd(reinterpret_cast<double*>(p), _mm_castsi128_pd(a.native()));
    }
#elif SIMDPP_USE_NEON
    if (n == 1) {
        neon::store_lane<0,1>(p, a);
    }
#elif SIMDPP_USE_VSX_207
    if (n == 1) {
        uint64_t* q = reinterpret_cast<uint64_t*>(p);
        *q = vec_extract(a.native(), 0);
    }
#elif SIMDPP_USE_MSA
    i_store_first(p, uint8<16>(a), n*8);
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    detail::null::store_first(p, a, n);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_store_first(char* p, const uint64x4& a, unsigned n)
{
    static const int64_t mask_d[8] = { -1, -1, -1, -1, 0, 0, 0, 0 };
    uint64<4> mask = load_u(mask_d + 4-n);
#if __INTEL_COMPILER
    _mm256_maskstore_epi64(reinterpret_cast<__int64*>(p), mask.native(), a.native());
#else
    _mm256_maskstore_epi64(reinterpret_cast<long long*>(p), mask.native(), a.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_store_first(char* p, const uint64<8>& a, unsigned n)
{
    _mm512_mask_store_epi64(p, 0xff >> (8-n), a.native());
}
#endif

// -----------------------------------------------------------------------------

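// Three strategies are used for single-precision floats below: AVX performs a
// true masked store, plain SSE2 falls back to the same load/blend/store
// read-modify-write as the integer vectors, and NEON writes the requested
// lanes one at a time.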
static SIMDPP_INL
void i_store_first(char* p, const float32x4& a, unsigned n)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC || SIMDPP_USE_NEON_NO_FLT_SP || SIMDPP_USE_MSA
    i_store_first(p, int32x4(a), n);
#elif SIMDPP_USE_AVX && !SIMDPP_USE_AMD
    static const int32_t mask_d[8] = { -1, -1, -1, -1, 0, 0, 0, 0 };
    float32x4 mask = load_u(mask_d + 4-n);
    _mm_maskstore_ps(reinterpret_cast<float*>(p), _mm_castps_si128(mask.native()), a.native());
#elif SIMDPP_USE_SSE2
    static const int32_t mask_d[8] = { -1, -1, -1, -1, 0, 0, 0, 0 };
    float32x4 mask = load_u(mask_d + 4-n);
    float32x4 b = load(p);
    b = blend(a, b, mask);
    store(p, b);
#elif SIMDPP_USE_NEON
    // + VFP
    if (n < 1) return;
    neon::store_lane<0,1>(p, a);
    if (n < 2) return;
    neon::store_lane<1,1>(p+4, a);
    if (n < 3) return;
    neon::store_lane<2,1>(p+8, a);
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
void i_store_first(char* p, const float32x8& a, unsigned n)
{
    static const int32_t mask_d[16] = { -1, -1, -1, -1, -1, -1, -1, -1,
                                         0,  0,  0,  0,  0,  0,  0,  0 };

    float32x8 mask = load_u(mask_d + 8-n);
#if !SIMDPP_USE_AMD
    _mm256_maskstore_ps(reinterpret_cast<float*>(p), _mm256_castps_si256(mask.native()), a.native());
#else
    float32x8 b = load(p);
    b = blend(a, b, mask);
    store(p, b);
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_store_first(char* p, const float32<16>& a, unsigned n)
{
    _mm512_mask_store_ps(p, 0xffff >> (16-n), a.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_store_first(char* p, const float64x2& a, unsigned n)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_SSE2
    if (n == 1) {
        _mm_store_sd(reinterpret_cast<double*>(p), a.native());
    }
#elif SIMDPP_USE_NEON64
    if (n == 1) {
        vst1_f64(reinterpret_cast<double*>(p), vget_low_f64(a.native()));
    }
#elif SIMDPP_USE_VSX_206
    if (n == 1) {
        double* q = reinterpret_cast<double*>(p);
        *q = vec_extract(a.native(), 0);
    }
#elif SIMDPP_USE_MSA
    i_store_first(p, (uint64x2)a, n);
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
    detail::null::store_first(p, a, n);
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
void i_store_first(char* p, const float64x4& a, unsigned n)
{
    static const int64_t mask_d[8] = { -1, -1, -1, -1, 0, 0, 0, 0 };
    float64x4 mask = load_u(mask_d + 4-n);
#if !SIMDPP_USE_AMD
    _mm256_maskstore_pd(reinterpret_cast<double*>(p), _mm256_castpd_si256(mask.native()), a.native());
#else
    float64x4 b = load(p);
    b = blend(a, b, mask);
    store(p, b);
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_store_first(char* p, const float64<8>& a, unsigned n)
{
    _mm512_mask_store_pd(p, 0xff >> (8-n), a.native());
}
#endif

// -----------------------------------------------------------------------------

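// Generic case for vectors that consist of more than one native register:
// every sub-vector completely covered by the first n elements is written with
// an ordinary aligned store, and the single partially covered sub-vector, if
// any, is passed to one of the overloads above. For example, storing the
// first 11 elements of a uint32<16> on a 128-bit instruction set writes two
// full uint32<4> sub-vectors and then calls i_store_first with a count of 3.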
template<class V> SIMDPP_INL
void i_store_first(char* p, const V& ca, unsigned n)
{
    const unsigned veclen = V::base_vector_type::length_bytes;

    typename detail::remove_sign<V>::type a = ca;
    p = detail::assume_aligned(p, veclen);

    unsigned n_full_vec = n / V::base_vector_type::length;
    unsigned mid_vec_fill_count = n % V::base_vector_type::length;
    unsigned curr_vec = 0;

    for (; curr_vec < n_full_vec; ++curr_vec) {
        store(p, a.vec(curr_vec));
        p += veclen;
    }

    if (mid_vec_fill_count > 0) {
        i_store_first(p, a.vec(curr_vec), mid_vec_fill_count);
    }
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif
