/*  Copyright (C) 2013-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_LAST_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_LAST_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/align.h>
#include <simdpp/core/blend.h>
#include <simdpp/core/cmp_gt.h>
#include <simdpp/core/load.h>
#include <simdpp/core/load_u.h>
#include <simdpp/core/move_l.h>
#include <simdpp/core/store.h>
#include <simdpp/detail/neon/memory_store.h>
#include <simdpp/detail/null/memory.h>
#include <simdpp/detail/extract128.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

// -----------------------------------------------------------------------------

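// Each i_store_last(p, a, n) overload stores the last n elements of the
// vector a to the tail of the vector-sized block at p, leaving the memory in
// front of them with its previous contents. Most implementations do this by
// loading the whole block, blending in the last n elements and storing the
// whole block back, so p must point to a complete, properly aligned
// vector-sized block.
//
// A hypothetical usage sketch (names as in the public libsimdpp API):
//
//     SIMDPP_ALIGN(16) uint32_t buf[4] = { 9, 9, 9, 9 };
//     uint32<4> v = make_uint(1, 2, 3, 4);
//     insn::i_store_last(reinterpret_cast<char*>(buf), v, 3);
//     // buf is now { 9, 2, 3, 4 }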
static SIMDPP_INL
void i_store_last(char* p, const uint8x16& a, unsigned n)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    detail::null::store_last(p, a, n);
#elif SIMDPP_USE_ALTIVEC && SIMDPP_BIG_ENDIAN
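    // vec_lvsl(n, NULL) yields the byte sequence { n, n+1, ..., n+15 }, so
    // comparing it against 0x0f sets the mask exactly in the lanes with
    // index i >= 16-n, i.e. the last n lanes, which blend() then takes
    // from a.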
    uint8x16 mask = vec_lvsl(n, (const uint8_t*)NULL);
    mask = cmp_gt(mask, 0x0f);
    uint8x16 b = load(p);
    b = blend(a, b, mask);
    store(p, b);
#elif SIMDPP_USE_ALTIVEC && SIMDPP_LITTLE_ENDIAN
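    // vec_sro takes its byte-shift count from bits 1:4 of the low-order
    // byte of the shift vector, hence n << 3. Shifting the all-ones vector
    // right by n bytes clears the register's top n bytes; on little-endian
    // those are the last n lanes, so the blend below keeps the first 16-n
    // lanes from memory and the last n lanes from a.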
    uint8<16> mask = make_ones();
    uint8<16> shift = vec_splats((unsigned char)(n << 3));
    mask = vec_sro(mask.native(), shift.native());

    uint8x16 b = load(p);
    b = blend(b, a, mask);
    store(p, b);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON
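    // Sliding-window mask: loading 16 bytes at offset n within mask_d
    // yields a mask whose last n bytes are 0xff, selecting exactly the
    // last n lanes of a in the blend.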
    static const uint8_t mask_d[32] = {0,0,0,0,0,0,0,0,
                                       0,0,0,0,0,0,0,0,
                                       0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                       0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};

    uint8x16 mask = load_u(mask_d + n);
    uint8x16 b = load(p);
    b = blend(a, b, mask);
    store(p, b);
#elif SIMDPP_USE_MSA
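    // Slide the all-ones vector against zeros so that the resulting mask
    // (as in the branches above) is all-ones in exactly the last n lanes.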
    int8x16 mask = make_ones();
    int8x16 zero = make_zero();
    mask = __msa_sld_b(mask.native(), zero.native(), n);
    uint8x16 b = load(p);
    b = blend(a, b, mask);
    store(p, b);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_store_last(char* p, const uint8x32& a, unsigned n)
{
    p = detail::assume_aligned(p, 32);
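    // Same sliding-window mask as the 16-byte version, widened to 32 lanes:
    // loading at mask_d + n puts 0xff in exactly the last n bytes.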
    static const uint8_t mask_d[64] = {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
                                       0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
                                       0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                       0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                       0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                       0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};
    uint8x32 mask = load_u(mask_d + n);
    uint8x32 b = load(p);
    b = blend(a, b, mask);
    store(p, b);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL void i_store_last(char* p, const uint8<64>& a, unsigned n)
{
    static const uint8_t mask_d[128] = {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
                                        0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
                                        0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
                                        0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
                                        0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                        0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                        0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                        0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                        0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                        0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                        0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                        0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff };

    uint8<64> mask = load_u(mask_d + n);
    uint8<64> b = load(p);
    b = blend(a, b, mask);
    store(p, b);
}
#endif


// -----------------------------------------------------------------------------

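// 16-bit elements occupy two contiguous bytes, so storing the last n 16-bit
// elements is identical to storing the last 2*n bytes; these overloads just
// reinterpret the vector and delegate to the 8-bit versions.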
static SIMDPP_INL
void i_store_last(char* p, const uint16x8& a, unsigned n)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    detail::null::store_last(p, a, n);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    i_store_last(p, (uint8x16)a, n*2);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_store_last(char* p, const uint16x16& a, unsigned n)
{
    i_store_last(p, uint8x32(a), n*2);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL void i_store_last(char* p, const uint16<32>& a, unsigned n)
{
    i_store_last(p, uint8<64>(a), n*2);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_store_last(char* p, const uint32x4& a, unsigned n)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    detail::null::store_last(p, a, n);
#elif SIMDPP_USE_AVX2
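    // _mm_maskstore_epi32 writes only the lanes whose mask element has the
    // sign bit set, so unlike the load/blend/store fallback it never touches
    // the memory in front of the last n elements.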
    static const int32_t mask_d[8] = {0, 0, 0, 0, -1, -1, -1, -1};
    uint32x4 mask = load_u(mask_d + n);
    _mm_maskstore_epi32(reinterpret_cast<int*>(p), mask.native(), a.native());
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    i_store_last(p, (uint8x16)a, n*4);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_store_last(char* p, const uint32x8& a, unsigned n)
{
    static const int32_t mask_d[16] = {0, 0, 0, 0, 0, 0, 0, 0,
                                       -1, -1, -1, -1, -1, -1, -1, -1};
    uint32<8> mask = load_u(mask_d + n);
    _mm256_maskstore_epi32(reinterpret_cast<int*>(p), mask.native(), a.native());
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_store_last(char* p, const uint32<16>& a, unsigned n)
{
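    // Bit i of the __mmask16 selects lane i, so 0xffff << (16-n) enables
    // exactly the last n lanes; the masked store leaves the other lanes'
    // memory untouched.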
    _mm512_mask_store_epi32(p, 0xffff << (16-n), a.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_store_last(char* p, const uint64x2& a, unsigned n)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_SSE2
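    // With two elements the only partial store is n == 1 (n == 0 stores
    // nothing), so that is all the branches below need to handle.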
    if (n == 1) {
        uint64x2 b = move2_l<1>(a);
        _mm_store_sd(reinterpret_cast<double*>(p+8), _mm_castsi128_pd(b.native()));
    }
#elif SIMDPP_USE_NEON
    if (n == 1) {
        neon::store_lane<1,1>(p+8, a);
    }
#elif SIMDPP_USE_VSX_207
    if (n == 1) {
        uint64_t* q = reinterpret_cast<uint64_t*>(p) + 1;
        *q = vec_extract(a.native(), 1);
    }
#elif SIMDPP_USE_MSA
    i_store_last(p, uint8<16>(a), n*8);
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    detail::null::store_last(p, a, n);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_store_last(char* p, const uint64x4& a, unsigned n)
{
    static const int64_t mask_d[8] = { 0, 0, 0, 0, -1, -1, -1, -1 };
    uint64<4> mask = load_u(mask_d + n);
#if __INTEL_COMPILER
    _mm256_maskstore_epi64(reinterpret_cast<__int64*>(p), mask.native(), a.native());
#else
    _mm256_maskstore_epi64(reinterpret_cast<long long*>(p), mask.native(), a.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_store_last(char* p, const uint64<8>& a, unsigned n)
{
    _mm512_mask_store_epi64(p, 0xff << (8-n), a.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_store_last(char* p, const float32x4& ca, unsigned n)
{
    float32<4> a = ca;
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC || SIMDPP_USE_NEON_FLT_SP || SIMDPP_USE_MSA
    i_store_last(p, uint32x4(a), n);
#elif SIMDPP_USE_AVX && !SIMDPP_USE_AMD
    static const int32_t mask_d[8] = { 0, 0, 0, 0, -1, -1, -1, -1 };
    float32x4 mask = load_u(mask_d + n);
    _mm_maskstore_ps(reinterpret_cast<float*>(p),
                     _mm_castps_si128(mask.native()), a.native());
#elif SIMDPP_USE_SSE2
    static const int32_t mask_d[8] = { 0, 0, 0, 0, -1, -1, -1, -1 };
    float32x4 mask = load_u(mask_d + n);
    float32x4 old = load(p);
    a = blend(a, old, mask);
    store(p, a);
#elif SIMDPP_USE_NEON
    // NEON without single-precision float support (VFP only): store the
    // requested lanes one at a time, highest lane first, falling through
    // so that larger n stores progressively more lanes.
    if (n < 1) return;
    neon::store_lane<3,1>(p+12, a);
    if (n < 2) return;
    neon::store_lane<2,1>(p+8, a);
    if (n < 3) return;
    neon::store_lane<1,1>(p+4, a);
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
void i_store_last(char* p, const float32x8& ca, unsigned n)
{
    float32<8> a = ca;
    static const int32_t mask_d[16] = { 0, 0, 0, 0, 0, 0, 0, 0,
                                        -1, -1, -1, -1, -1, -1, -1, -1 };

    float32x8 mask = load_u(mask_d + n);
#if !SIMDPP_USE_AMD
    _mm256_maskstore_ps(reinterpret_cast<float*>(p),
                        _mm256_castps_si256(mask.native()), a.native());
#else
    float32x8 old = load(p);
    a = blend(a, old, mask);
    store(p, a);
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_store_last(char* p, const float32<16>& a, unsigned n)
{
    _mm512_mask_store_ps(p, 0xffff << (16-n), a.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_store_last(char* p, const float64x2& a, unsigned n)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_SSE2
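    // zip2_hi(a, a) duplicates the upper element into lane 0 so that plain
    // _mm_store_sd can write it to the upper half of the block.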
    if (n == 1) {
        float64x2 b = zip2_hi(a, a);
        _mm_store_sd(reinterpret_cast<double*>(p)+1, b.native());
    }
#elif SIMDPP_USE_NEON64
    if (n == 1) {
        vst1_f64(reinterpret_cast<double*>(p)+1, vget_high_f64(a.native()));
    }
#elif SIMDPP_USE_VSX_206
    if (n == 1) {
        *(reinterpret_cast<double*>(p)+1) = vec_extract(a.native(), 1);
    }
#elif SIMDPP_USE_MSA
    i_store_last(p, uint64x2(a), n);
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
    detail::null::store_last(p, a, n);
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
void i_store_last(char* p, const float64x4& ca, unsigned n)
{
    float64<4> a = ca;
    static const int64_t mask_d[8] = { 0, 0, 0, 0, -1, -1, -1, -1 };
    float64x4 mask = load_u(mask_d + n);

#if !SIMDPP_USE_AMD
    _mm256_maskstore_pd(reinterpret_cast<double*>(p),
                        _mm256_castpd_si256(mask.native()), a.native());
#else
    float64x4 old = load(p);
    a = blend(a, old, mask);
    store(p, a);
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_store_last(char* p, const float64<8>& a, unsigned n)
{
    _mm512_mask_store_pd(p, 0xff << (8-n), a.native());
}
#endif

// -----------------------------------------------------------------------------

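// Generic case for vectors composed of several native sub-vectors: skip the
// sub-vectors that contain no stored elements, perform a partial
// i_store_last on the sub-vector that straddles the boundary, then store
// the remaining sub-vectors in full.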
template<class V> SIMDPP_INL
void i_store_last(char* p, const V& ca, unsigned n)
{
    const unsigned veclen = V::base_vector_type::length_bytes;

    typename detail::remove_sign<V>::type a = ca;
    p = detail::assume_aligned(p, veclen);
    unsigned el_to_skip = V::length - n;

    unsigned n_empty_vec = el_to_skip / V::base_vector_type::length;
    // Number of elements to store from the sub-vector that straddles the
    // boundary; zero when the boundary falls between sub-vectors.
    unsigned mid_vec_fill_count = n % V::base_vector_type::length;
    unsigned curr_vec = 0;

    p += n_empty_vec * veclen;
    curr_vec += n_empty_vec;
    if (mid_vec_fill_count > 0) {
        i_store_last(p, a.vec(curr_vec), mid_vec_fill_count);
        p += veclen;
        curr_vec++;
    }

    for (; curr_vec < V::vec_length; ++curr_vec) {
        i_store(p, a.vec(curr_vec));
        p += veclen;
    }
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif