store_packed4.h source code [bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/detail/insn/store_packed4.h]

/*  Copyright (C) 2011-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/
7
8	#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_PACKED4_H
9	#define LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_PACKED4_H
10
11	#ifndef LIBSIMDPP_SIMD_H
12	#error "This file must be included through simd.h"
13	#endif
14
15	#include <simdpp/types.h>
16	#include <simdpp/detail/align.h>
17	#include <simdpp/detail/insn/mem_pack.h>
18	#include <simdpp/core/store.h>
19	#include <simdpp/detail/null/memory.h>
20
21	namespace simdpp {
22	namespace SIMDPP_ARCH_NAMESPACE {
23	namespace detail {
24	namespace insn {
25
26
27	// collect some boilerplate
28	template<class V> SIMDPP_INL
29	void v128_store_pack4(char* p, const V& ca, const V& cb, const V& cc, const V& dd);
30	template<class V> SIMDPP_INL
31	void v256_store_pack4(char* p, const V& ca, const V& cb, const V& cc, const V& dd);
32	template<class V> SIMDPP_INL
33	void v512_store_pack4(char* p, const V& ca, const V& cb, const V& cc, const V& dd);
34
35	// -----------------------------------------------------------------------------
36
37	static SIMDPP_INL
38	void i_store_packed4(char* p,
39	const uint8x16& a, const uint8x16& b,
40	const uint8x16& c, const uint8x16& d)
41	{
42	p = detail::assume_aligned(p, `16`);
43	#if SIMDPP_USE_NULL
44	detail::null::store_packed4(p, a, b, c, d);
45	#elif SIMDPP_USE_SSE2 \|\| SIMDPP_USE_ALTIVEC \|\| SIMDPP_USE_MSA
46	v128_store_pack4(p, a, b, c, d);
47	#elif SIMDPP_USE_NEON
48	uint8x16x4_t t;
49	t.val[`0`] = a.native();
50	t.val[`1`] = b.native();
51	t.val[`2`] = c.native();
52	t.val[`3`] = d.native();
53	vst4q_u8(reinterpret_cast<uint8_t*>(p), t);
54	#endif
55	}
56
#if SIMDPP_USE_AVX2
/** 256-bit uint8 overload; compiled only when AVX2 is enabled.
    Forwards all arguments unchanged to v256_store_pack4(). */
static SIMDPP_INL void i_store_packed4(char* p,
                                       const uint8x32& a,
                                       const uint8x32& b,
                                       const uint8x32& c,
                                       const uint8x32& d)
{
    v256_store_pack4(p, a, b, c, d);
}
#endif
66
#if SIMDPP_USE_AVX512BW
/** 512-bit uint8 overload; compiled only when AVX512BW is enabled.
    Forwards all arguments unchanged to v512_store_pack4(). */
static SIMDPP_INL
void i_store_packed4(char* p,
                     const uint8<64>& a, const uint8<64>& b,
                     const uint8<64>& c, const uint8<64>& d)
{
    v512_store_pack4(p, a, b, c, d);
}
#endif
76
77	// -----------------------------------------------------------------------------
78
79	static SIMDPP_INL
80	void i_store_packed4(char* p,
81	const uint16x8& a, const uint16x8& b,
82	const uint16x8& c, const uint16x8& d)
83	{
84	p = detail::assume_aligned(p, `16`);
85	#if SIMDPP_USE_NULL
86	detail::null::store_packed4(p, a, b, c, d);
87	#elif SIMDPP_USE_SSE2 \|\| SIMDPP_USE_ALTIVEC \|\| SIMDPP_USE_MSA
88	v128_store_pack4(p, a, b, c, d);
89	#elif SIMDPP_USE_NEON
90	uint16x8x4_t t;
91	t.val[`0`] = a.native();
92	t.val[`1`] = b.native();
93	t.val[`2`] = c.native();
94	t.val[`3`] = d.native();
95	vst4q_u16(reinterpret_cast<uint16_t*>(p), t);
96	#endif
97	}
98
#if SIMDPP_USE_AVX2
/** 256-bit uint16 overload; compiled only when AVX2 is enabled.
    Forwards all arguments unchanged to v256_store_pack4(). */
static SIMDPP_INL void i_store_packed4(char* p,
                                       const uint16x16& a,
                                       const uint16x16& b,
                                       const uint16x16& c,
                                       const uint16x16& d)
{
    v256_store_pack4(p, a, b, c, d);
}
#endif
108
#if SIMDPP_USE_AVX512BW
/** 512-bit uint16 overload; compiled only when AVX512BW is enabled.
    Forwards all arguments unchanged to v512_store_pack4(). */
static SIMDPP_INL
void i_store_packed4(char* p,
                     const uint16<32>& a, const uint16<32>& b,
                     const uint16<32>& c, const uint16<32>& d)
{
    v512_store_pack4(p, a, b, c, d);
}
#endif
118
119	// -----------------------------------------------------------------------------
120
121	static SIMDPP_INL
122	void i_store_packed4(char* p,
123	const uint32x4& a, const uint32x4& b,
124	const uint32x4& c, const uint32x4& d)
125	{
126	p = detail::assume_aligned(p, `16`);
127	#if SIMDPP_USE_NULL
128	detail::null::store_packed4(p, a, b, c, d);
129	#elif SIMDPP_USE_SSE2 \|\| SIMDPP_USE_ALTIVEC \|\| SIMDPP_USE_MSA
130	v128_store_pack4(p, a, b, c, d);
131	#elif SIMDPP_USE_NEON
132	uint32x4x4_t t;
133	t.val[`0`] = a.native();
134	t.val[`1`] = b.native();
135	t.val[`2`] = c.native();
136	t.val[`3`] = d.native();
137	vst4q_u32(reinterpret_cast<uint32_t*>(p), t);
138	#endif
139	}
140
#if SIMDPP_USE_AVX2
/** 256-bit uint32 overload; compiled only when AVX2 is enabled.
    Forwards all arguments unchanged to v256_store_pack4(). */
static SIMDPP_INL void i_store_packed4(char* p,
                                       const uint32x8& a,
                                       const uint32x8& b,
                                       const uint32x8& c,
                                       const uint32x8& d)
{
    v256_store_pack4(p, a, b, c, d);
}
#endif
150
#if SIMDPP_USE_AVX512F
/** 512-bit uint32 overload; compiled only when AVX512F is enabled.
    Forwards all arguments unchanged to v512_store_pack4(). */
static SIMDPP_INL
void i_store_packed4(char* p,
                     const uint32<16>& a, const uint32<16>& b,
                     const uint32<16>& c, const uint32<16>& d)
{
    v512_store_pack4(p, a, b, c, d);
}
#endif
160
161	// -----------------------------------------------------------------------------
162
163	static SIMDPP_INL
164	void i_store_packed4(char* p,
165	const uint64x2& a, const uint64x2& b,
166	const uint64x2& c, const uint64x2& d)
167	{
168	#if SIMDPP_USE_NEON64
169	uint64x2x4_t t;
170	t.val[`0`] = a.native();
171	t.val[`1`] = b.native();
172	t.val[`2`] = c.native();
173	t.val[`3`] = d.native();
174	vst4q_u64(reinterpret_cast<uint64_t*>(p), t);
175	#elif SIMDPP_USE_SSE2 \|\| SIMDPP_USE_NEON \|\| SIMDPP_USE_VSX_207 \|\| SIMDPP_USE_MSA
176	v128_store_pack4(p, a, b, c, d);
177	#elif SIMDPP_USE_NULL \|\| SIMDPP_USE_ALTIVEC
178	detail::null::store_packed4(p, a, b, c, d);
179	#endif
180	}
181
#if SIMDPP_USE_AVX2
/** 256-bit uint64 overload; compiled only when AVX2 is enabled.
    Forwards all arguments unchanged to v256_store_pack4(). */
static SIMDPP_INL void i_store_packed4(char* p,
                                       const uint64x4& a,
                                       const uint64x4& b,
                                       const uint64x4& c,
                                       const uint64x4& d)
{
    v256_store_pack4(p, a, b, c, d);
}
#endif
191
#if SIMDPP_USE_AVX512F
/** 512-bit uint64 overload; compiled only when AVX512F is enabled.
    Forwards all arguments unchanged to v512_store_pack4(). */
static SIMDPP_INL
void i_store_packed4(char* p,
                     const uint64<8>& a, const uint64<8>& b,
                     const uint64<8>& c, const uint64<8>& d)
{
    v512_store_pack4(p, a, b, c, d);
}
#endif
201
202	// -----------------------------------------------------------------------------
203
204	static SIMDPP_INL
205	void i_store_packed4(char* p,
206	const float32x4& a, const float32x4& b,
207	const float32x4& c, const float32x4& d)
208	{
209	p = detail::assume_aligned(p, `16`);
210	#if SIMDPP_USE_NULL \|\| SIMDPP_USE_NEON_NO_FLT_SP
211	detail::null::store_packed4(p, a, b, c, d);
212	#elif SIMDPP_USE_SSE2 \|\| SIMDPP_USE_ALTIVEC \|\| SIMDPP_USE_MSA
213	v128_store_pack4(p, a, b, c, d);
214	#elif SIMDPP_USE_NEON
215	float32x4x4_t t;
216	t.val[`0`] = a.native();
217	t.val[`1`] = b.native();
218	t.val[`2`] = c.native();
219	t.val[`3`] = d.native();
220	vst4q_f32(reinterpret_cast<float*>(p), t);
221	#endif
222	}
223
#if SIMDPP_USE_AVX
/** 256-bit float32 overload; compiled only when AVX is enabled.
    Forwards all arguments unchanged to v256_store_pack4(). */
static SIMDPP_INL void i_store_packed4(char* p,
                                       const float32x8& a,
                                       const float32x8& b,
                                       const float32x8& c,
                                       const float32x8& d)
{
    v256_store_pack4(p, a, b, c, d);
}
#endif
233
#if SIMDPP_USE_AVX512F
/** 512-bit float32 overload; compiled only when AVX512F is enabled.
    Forwards all arguments unchanged to v512_store_pack4(). */
static SIMDPP_INL
void i_store_packed4(char* p,
                     const float32<16>& a, const float32<16>& b,
                     const float32<16>& c, const float32<16>& d)
{
    v512_store_pack4(p, a, b, c, d);
}
#endif
243
244	// -----------------------------------------------------------------------------
245
246	static SIMDPP_INL
247	void i_store_packed4(char* p,
248	const float64x2& a, const float64x2& b,
249	const float64x2& c, const float64x2& d)
250	{
251	p = detail::assume_aligned(p, `16`);
252	#if SIMDPP_USE_SSE2 \|\| SIMDPP_USE_VSX_206 \|\| SIMDPP_USE_MSA
253	v128_store_pack4(p, a, b, c, d);
254	#elif SIMDPP_USE_NEON64
255	float64x2x4_t t;
256	t.val[`0`] = a.native();
257	t.val[`1`] = b.native();
258	t.val[`2`] = c.native();
259	t.val[`3`] = d.native();
260	vst4q_f64(reinterpret_cast<double*>(p), t);
261	#elif SIMDPP_USE_NULL \|\| SIMDPP_USE_NEON \|\| SIMDPP_USE_ALTIVEC
262	detail::null::store_packed4(p, a, b, c, d);
263	#endif
264	}
265
#if SIMDPP_USE_AVX
/** 256-bit float64 overload; compiled only when AVX is enabled.
    Forwards all arguments unchanged to v256_store_pack4(). */
static SIMDPP_INL void i_store_packed4(char* p,
                                       const float64x4& a,
                                       const float64x4& b,
                                       const float64x4& c,
                                       const float64x4& d)
{
    v256_store_pack4(p, a, b, c, d);
}
#endif
275
#if SIMDPP_USE_AVX512F
/** 512-bit float64 overload; compiled only when AVX512F is enabled.
    Forwards all arguments unchanged to v512_store_pack4(). */
static SIMDPP_INL
void i_store_packed4(char* p,
                     const float64<8>& a, const float64<8>& b,
                     const float64<8>& c, const float64<8>& d)
{
    v512_store_pack4(p, a, b, c, d);
}
#endif
285
286	// -----------------------------------------------------------------------------
287
288	template<class V> SIMDPP_INL
289	void v128_store_pack4(char* p, const V& ca, const V& cb, const V& cc, const V& dd)
290	{
291	p = detail::assume_aligned(p, `16`);
292	V a = ca, b = cb, c = cc, d = dd;
293	mem_pack4(a, b, c, d);
294	i_store(p, a);
295	i_store(p + `16`, b);
296	i_store(p + `32`, c);
297	i_store(p + `48`, d);
298	}
299
300	template<class V> SIMDPP_INL
301	void v256_store_pack4(char* p, const V& ca, const V& cb, const V& cc, const V& dd)
302	{
303	p = detail::assume_aligned(p, `32`);
304	V a = ca, b = cb, c = cc, d = dd;
305	mem_pack4(a, b, c, d);
306	i_store(p, a);
307	i_store(p + `32`, b);
308	i_store(p + `64`, c);
309	i_store(p + `96`, d);
310	}
311
312	template<class V> SIMDPP_INL
313	void v512_store_pack4(char* p, const V& ca, const V& cb, const V& cc, const V& dd)
314	{
315	p = detail::assume_aligned(p, `64`);
316	V a = ca, b = cb, c = cc, d = dd;
317	mem_pack4(a, b, c, d);
318	i_store(p, a);
319	i_store(p + `64`, b);
320	i_store(p + `128`, c);
321	i_store(p + `192`, d);
322	}
323
324	template<class V> SIMDPP_INL
325	void i_store_packed4(char* p, const V& ca, const V& cb, const V& cc, const V& dd)
326	{
327	const unsigned veclen = V::base_vector_type::length_bytes;
328	typename detail::remove_sign<V>::type a = ca, b = cb, c = cc, d = dd;
329
330	p = detail::assume_aligned(p, veclen);
331	for (unsigned i = `0`; i < V::vec_length; ++i) {
332	i_store_packed4(p, a.vec(i), b.vec(i), c.vec(i), d.vec(i));
333	p += veclen*`4`;
334	}
335	}
336
337	} // namespace insn
338	} // namespace detail
339	} // namespace SIMDPP_ARCH_NAMESPACE
340	} // namespace simdpp
341
342	#endif
343

Browse the source code of bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/detail/insn/store_packed4.h