store_first.h source code [bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/detail/insn/store_first.h]

1	/*  Copyright (C) 2013-2014  Povilas Kanapickas <povilas@radix.lt>
2
3	Distributed under the Boost Software License, Version 1.0.
4	(See accompanying file LICENSE_1_0.txt or copy at
5	http://www.boost.org/LICENSE_1_0.txt)
6	*/
7
8	#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_FIRST_H
9	#define LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_FIRST_H
10
11	#ifndef LIBSIMDPP_SIMD_H
12	#error "This file must be included through simd.h"
13	#endif
14
15	#include <simdpp/types.h>
16	#include <simdpp/detail/align.h>
17	#include <simdpp/core/blend.h>
18	#include <simdpp/core/cmp_gt.h>
19	#include <simdpp/core/load.h>
20	#include <simdpp/core/load_u.h>
21	#include <simdpp/core/store.h>
22	#include <simdpp/detail/neon/memory_store.h>
23	#include <simdpp/detail/null/memory.h>
24	#include <simdpp/detail/extract128.h>
25
26	namespace simdpp {
27	namespace SIMDPP_ARCH_NAMESPACE {
28	namespace detail {
29	namespace insn {
30
31	static SIMDPP_INL
32	void i_store_first(char* p, const uint8x16& a, unsigned n)
33	{
34	p = detail::assume_aligned(p, `16`);
35	#if SIMDPP_USE_NULL
36	detail::null::store_first(p, a, n);
37	#elif SIMDPP_USE_ALTIVEC && SIMDPP_BIG_ENDIAN
38	uint8x16 mask = vec_lvsr(n, (const uint8_t*)NULL);
39	mask = cmp_lt(mask, `0x10`);
40	uint8x16 b = load(p);
41	b = blend(a, b, mask);
42	store(p, b);
43	#elif SIMDPP_USE_ALTIVEC && SIMDPP_LITTLE_ENDIAN
44	uint8<`16`> mask = make_ones();
45	uint8<`16`> shift = vec_splats((unsigned char)(n << `3`));
46	mask = vec_slo(mask.native(), shift.native());
47
48	uint8x16 b = load(p);
49	b = blend(b, a, mask);
50	store(p, b);
51	#elif SIMDPP_USE_SSE2 \|\| SIMDPP_USE_NEON \|\| SIMDPP_USE_MSA
52	// for MSA we can't use __msa_sld_b because it does not work with shift==16
53	static const uint8_t mask_d[`32`] = {`0xff`,`0xff`,`0xff`,`0xff`,`0xff`,`0xff`,`0xff`,`0xff`,
54	`0xff`,`0xff`,`0xff`,`0xff`,`0xff`,`0xff`,`0xff`,`0xff`,
55	`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`,
56	`0`,`0`,`0`,`0`,`0`,`0`,`0`,`0`};
57
58	uint8x16 mask = load_u(mask_d + `16` - n);
59	uint8x16 b = load(p);
60	b = blend(a, b, mask);
61	store(p, b);
62	#endif
63	}
64
#if SIMDPP_USE_AVX2
// AVX2 variant for 32-byte vectors: stores the first n bytes of a to p by
// loading the current destination, blending a over it under a byte mask, and
// storing the merged vector back.
static SIMDPP_INL
void i_store_first(char* p, const uint8x32& a, unsigned n)
{
    // 32 x 0xff followed by 32 x 0; a 32-byte window starting at (32 - n)
    // begins with n bytes of 0xff.
    static const uint8_t mask_table[64] = {
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0
    };

    const uint8x32 select = load_u(mask_table + 32 - n);
    const uint8x32 previous = load(p);
    uint8x32 merged = blend(a, previous, select);
    store(p, merged);
}
#endif
82
#if SIMDPP_USE_AVX512BW
// AVX-512BW variant for 64-byte vectors: stores the first n bytes of a to p
// via load / blend / store, using a byte mask read from a sliding table.
SIMDPP_INL void i_store_first(char* p, const uint8<64>& a, unsigned n)
{
    // 64 x 0xff followed by 64 x 0; a 64-byte window starting at (64 - n)
    // begins with n bytes of 0xff.
    static const uint8_t mask_table[128] = {
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0
    };

    const uint8<64> select = load_u(mask_table + 64 - n);
    const uint8<64> previous = load(p);
    uint8<64> merged = blend(a, previous, select);
    store(p, merged);
}
#endif
105
106	// -----------------------------------------------------------------------------
107
108	static SIMDPP_INL
109	void i_store_first(char* p, const uint16x8& a, unsigned n)
110	{
111	p = detail::assume_aligned(p, `16`);
112	#if SIMDPP_USE_NULL
113	detail::null::store_first(p, a, n);
114	#elif SIMDPP_USE_SSE2 \|\| SIMDPP_USE_NEON \|\| SIMDPP_USE_ALTIVEC \|\| SIMDPP_USE_MSA
115	i_store_first(p, (uint8x16)a, n*`2`);
116	#endif
117	}
118
#if SIMDPP_USE_AVX2
// AVX2: stores the first n 16-bit elements of a by delegating to the uint8x32
// overload with a byte count of 2 * n.
static SIMDPP_INL
void i_store_first(char* p, const uint16x16& a, unsigned n)
{
    const unsigned byte_count = n * 2;
    i_store_first(p, uint8x32(a), byte_count);
}
#endif
126
#if SIMDPP_USE_AVX512BW
// AVX-512BW: stores the first n 16-bit elements of a by delegating to the
// uint8<64> overload with a byte count of 2 * n.
SIMDPP_INL void i_store_first(char* p, const uint16<32>& a, unsigned n)
{
    const unsigned byte_count = n * 2;
    i_store_first(p, uint8<64>(a), byte_count);
}
#endif
133
134	// -----------------------------------------------------------------------------
135
136	static SIMDPP_INL
137	void i_store_first(char* p, const uint32x4& a, unsigned n)
138	{
139	p = detail::assume_aligned(p, `16`);
140	#if SIMDPP_USE_NULL
141	detail::null::store_first(p, a, n);
142	#elif SIMDPP_USE_SSE2 \|\| SIMDPP_USE_NEON \|\| SIMDPP_USE_ALTIVEC \|\| SIMDPP_USE_MSA
143	i_store_first(p, (uint8x16)a, n*`4`);
144	#endif
145	}
146
#if SIMDPP_USE_AVX2
// AVX2: stores the first n 32-bit elements of a using a masked store, so no
// read-modify-write of the destination is performed here.
static SIMDPP_INL
void i_store_first(char* p, const uint32x8& a, unsigned n)
{
    // Eight all-ones lanes followed by eight zero lanes; the 8-lane window at
    // (8 - n) starts with n enabled lanes.
    static const int32_t lane_table[16] = {
        -1, -1, -1, -1, -1, -1, -1, -1,
         0,  0,  0,  0,  0,  0,  0,  0
    };
    const uint32<8> lane_mask = load_u(lane_table + 8 - n);
    int* const dst = reinterpret_cast<int*>(p);
    _mm256_maskstore_epi32(dst, lane_mask.native(), a.native());
}
#endif
157
#if SIMDPP_USE_AVX512F
// AVX-512F: stores the first n 32-bit elements of a with a masked store whose
// write mask has the low n bits set.
static SIMDPP_INL
void i_store_first(char* p, const uint32<16>& a, unsigned n)
{
    // 16 one-bits shifted right by (16 - n) leaves exactly n low bits set.
    const int lane_bits = 0xffff >> (16 - n);
    _mm512_mask_store_epi32(p, lane_bits, a.native());
}
#endif
165
166	// -----------------------------------------------------------------------------
167
168	static SIMDPP_INL
169	void i_store_first(char* p, const uint64x2& a, unsigned n)
170	{
171	p = detail::assume_aligned(p, `16`);
172	#if SIMDPP_USE_SSE2
173	if (n == `1`) {
174	_mm_store_sd(reinterpret_cast<double*>(p), _mm_castsi128_pd(a.native()));
175	}
176	#elif SIMDPP_USE_NEON
177	if (n == `1`) {
178	neon::store_lane<`0`,`1`>(p, a);
179	}
180	#elif SIMDPP_USE_VSX_207
181	if (n == `1`) {
182	uint64_t* q = reinterpret_cast<uint64_t*>(p);
183	*q = vec_extract(a.native(), `0`);
184	}
185	#elif SIMDPP_USE_MSA
186	i_store_first(p, uint8<`16`>(a), n*`8`);
187	#elif SIMDPP_USE_NULL \|\| SIMDPP_USE_ALTIVEC
188	detail::null::store_first(p, a, n);
189	#endif
190	}
191
#if SIMDPP_USE_AVX2
// AVX2: stores the first n 64-bit elements of a using a masked store.
static SIMDPP_INL
void i_store_first(char* p, const uint64x4& a, unsigned n)
{
    // Four all-ones lanes followed by four zero lanes; the 4-lane window at
    // (4 - n) starts with n enabled lanes.
    static const int64_t lane_table[8] = { -1, -1, -1, -1, 0, 0, 0, 0 };
    const uint64<4> lane_mask = load_u(lane_table + 4 - n);

    // The intrinsic's pointer parameter type differs under the Intel compiler.
#if __INTEL_COMPILER
    typedef __int64 dst_elem;
#else
    typedef long long dst_elem;
#endif
    _mm256_maskstore_epi64(reinterpret_cast<dst_elem*>(p), lane_mask.native(), a.native());
}
#endif
205
#if SIMDPP_USE_AVX512F
// AVX-512F: stores the first n 64-bit elements of a with a masked store whose
// write mask has the low n bits set.
static SIMDPP_INL
void i_store_first(char* p, const uint64<8>& a, unsigned n)
{
    // 8 one-bits shifted right by (8 - n) leaves exactly n low bits set.
    const int lane_bits = 0xff >> (8 - n);
    _mm512_mask_store_epi64(p, lane_bits, a.native());
}
#endif
213
214	// -----------------------------------------------------------------------------
215
216	static SIMDPP_INL
217	void i_store_first(char* p, const float32x4& a, unsigned n)
218	{
219	p = detail::assume_aligned(p, `16`);
220	#if SIMDPP_USE_NULL \|\| SIMDPP_USE_ALTIVEC \|\| SIMDPP_USE_NEON_NO_FLT_SP \|\| SIMDPP_USE_MSA
221	i_store_first(p, int32x4(a), n);
222	#elif SIMDPP_USE_AVX && !SIMDPP_USE_AMD
223	static const int32_t mask_d[`8`] = { -`1`, -`1`, -`1`, -`1`, `0`, `0`, `0`, `0` };
224	float32x4 mask = load_u(mask_d + `4`-n);
225	_mm_maskstore_ps(reinterpret_cast<float*>(p), _mm_castps_si128(mask.native()), a.native());
226	#elif SIMDPP_USE_SSE2
227	static const int32_t mask_d[`8`] = { -`1`, -`1`, -`1`, -`1`, `0`, `0`, `0`, `0` };
228	float32x4 mask = load_u(mask_d + `4`-n);
229	float32x4 b = load(p);
230	b = blend(a, b, mask);
231	store(p, b);
232	#elif SIMDPP_USE_NEON
233	// + VFP
234	if (n < `1`) return;
235	neon::store_lane<`0`,`1`>(p, a);
236	if (n < `2`) return;
237	neon::store_lane<`1`,`1`>(p+`4`, a);
238	if (n < `3`) return;
239	neon::store_lane<`2`,`1`>(p+`8`, a);
240	#endif
241	}
242
#if SIMDPP_USE_AVX
// AVX: stores the first n single-precision elements of the 8-lane vector a
// to p.
//
// On non-AMD targets a masked store is used. When SIMDPP_USE_AMD is set the
// destination is instead loaded, blended with a under the lane mask, and
// stored back.
static SIMDPP_INL
void i_store_first(char* p, const float32x8& a, unsigned n)
{
    // Eight all-ones lanes followed by eight zero lanes; the 8-lane window at
    // (8 - n) starts with n enabled lanes.
    static const int32_t mask_d[16] = { -1, -1, -1, -1, -1, -1, -1, -1,
                                         0,  0,  0,  0,  0,  0,  0,  0 };

    float32x8 mask = load_u(mask_d + 8-n);
#if !SIMDPP_USE_AMD
    _mm256_maskstore_ps(reinterpret_cast<float*>(p), _mm256_castps_si256(mask.native()), a.native());
#else
    float32x8 b = load(p);
    b = blend(a, b, mask);
    // Fixed: the original wrote to an undeclared identifier `v`; the
    // destination pointer is `p`.
    store(p, b);
#endif
}
#endif
260
#if SIMDPP_USE_AVX512F
// AVX-512F: stores the first n single-precision elements of a with a masked
// store whose write mask has the low n bits set.
static SIMDPP_INL
void i_store_first(char* p, const float32<16>& a, unsigned n)
{
    // 16 one-bits shifted right by (16 - n) leaves exactly n low bits set.
    const int lane_bits = 0xffff >> (16 - n);
    _mm512_mask_store_ps(p, lane_bits, a.native());
}
#endif
268
269	// -----------------------------------------------------------------------------
270
271	static SIMDPP_INL
272	void i_store_first(char* p, const float64x2& a, unsigned n)
273	{
274	p = detail::assume_aligned(p, `16`);
275	#if SIMDPP_USE_SSE2
276	if (n == `1`) {
277	_mm_store_sd(reinterpret_cast<double*>(p), a.native());
278	}
279	#elif SIMDPP_USE_NEON64
280	if (n == `1`) {
281	vst1_f64(reinterpret_cast<double*>(p), vget_low_f64(a.native()));
282	}
283	#elif SIMDPP_USE_VSX_206
284	if (n == `1`) {
285	double* q = reinterpret_cast<double*>(p);
286	*q = vec_extract(a.native(), `0`);
287	}
288	#elif SIMDPP_USE_MSA
289	i_store_first(p, (uint64x2)a, n);
290	#elif SIMDPP_USE_NULL \|\| SIMDPP_USE_NEON32 \|\| SIMDPP_USE_ALTIVEC
291	detail::null::store_first(p, a, n);
292	#endif
293	}
294
#if SIMDPP_USE_AVX
// AVX: stores the first n double-precision elements of the 4-lane vector a
// to p.
//
// On non-AMD targets a masked store is used. When SIMDPP_USE_AMD is set the
// destination is instead loaded, blended with a under the lane mask, and
// stored back.
static SIMDPP_INL
void i_store_first(char* p, const float64x4& a, unsigned n)
{
    // Four all-ones lanes followed by four zero lanes; the 4-lane window at
    // (4 - n) starts with n enabled lanes. Only indices 0..7 are ever read,
    // so the table holds exactly 8 entries (the original declared 16).
    static const int64_t mask_d[8] = { -1, -1, -1, -1, 0, 0, 0, 0 };
    float64x4 mask = load_u(mask_d + 4-n);
#if !SIMDPP_USE_AMD
    _mm256_maskstore_pd(reinterpret_cast<double*>(p), _mm256_castpd_si256(mask.native()), a.native());
#else
    float64x4 b = load(p);
    b = blend(a, b, mask);
    // Fixed: the original wrote to an undeclared identifier `v`; the
    // destination pointer is `p`.
    store(p, b);
#endif
}
#endif
310
#if SIMDPP_USE_AVX512F
// AVX-512F: stores the first n double-precision elements of a with a masked
// store whose write mask has the low n bits set.
static SIMDPP_INL
void i_store_first(char* p, const float64<8>& a, unsigned n)
{
    // 8 one-bits shifted right by (8 - n) leaves exactly n low bits set.
    const int lane_bits = 0xff >> (8 - n);
    _mm512_mask_store_pd(p, lane_bits, a.native());
}
#endif
318
319	// -----------------------------------------------------------------------------
320
321	template<class V> SIMDPP_INL
322	void i_store_first(char* p, const V& ca, unsigned n)
323	{
324	const unsigned veclen = V::base_vector_type::length_bytes;
325
326	typename detail::remove_sign<V>::type a = ca;
327	p = detail::assume_aligned(p, veclen);
328
329	unsigned n_full_vec = n / V::base_vector_type::length;
330	unsigned mid_vec_fill_count = n % V::base_vector_type::length;
331	unsigned curr_vec = `0`;
332
333	for (; curr_vec < n_full_vec; ++curr_vec) {
334	store(p, a.vec(curr_vec));
335	p += veclen;
336	}
337
338	if (mid_vec_fill_count > `0`) {
339	i_store_first(p, a.vec(curr_vec), mid_vec_fill_count);
340	}
341	}
342
343	} // namespace insn
344	} // namespace detail
345	} // namespace SIMDPP_ARCH_NAMESPACE
346	} // namespace simdpp
347
348	#endif
349

Browse the source code of bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/detail/insn/store_first.h