splat_n.h source code [bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/detail/insn/splat_n.h]

/*  Copyright (C) 2013-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/
7
8	#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_BROADCAST_H
9	#define LIBSIMDPP_SIMDPP_DETAIL_INSN_BROADCAST_H
10
11	#ifndef LIBSIMDPP_SIMD_H
12	#error "This file must be included through simd.h"
13	#endif
14
15	#include <simdpp/types.h>
16	#include <simdpp/core/bit_or.h>
17	#include <simdpp/core/permute2.h>
18	#include <simdpp/core/permute4.h>
19	#include <simdpp/core/permute_bytes16.h>
20	#include <simdpp/core/zip_lo.h>
21	#include <simdpp/core/zip_hi.h>
22	#include <simdpp/detail/null/shuffle.h>
23	#include <simdpp/detail/shuffle/shuffle_mask.h>
24	#include <simdpp/detail/vector_array_macros.h>
25
26	namespace simdpp {
27	namespace SIMDPP_ARCH_NAMESPACE {
28	namespace detail {
29	namespace insn {
30
31	// forward declarations
32	template<unsigned s> SIMDPP_INL
33	uint16x8 i_splat8(const uint16x8& a);
34	#if SIMDPP_USE_AVX2
35	template<unsigned s> SIMDPP_INL
36	uint16x16 i_splat8(const uint16x16& a);
37	#endif
38	#if SIMDPP_USE_AVX512BW
39	template<unsigned s> SIMDPP_INL
40	uint16<`32`> i_splat8(const uint16<`32`>& a);
41	#endif
42
43	// -----------------------------------------------------------------------------
44
45	template<unsigned s> SIMDPP_INL
46	uint8x16 i_splat16(const uint8x16& ca)
47	{
48	uint8<`16`> a = ca;
49	static_assert(s < `16`, "Access out of bounds");
50	#if SIMDPP_USE_NULL
51	return detail::null::splat<s>(a);
52	#elif SIMDPP_USE_AVX2
53	a = move16_l<s>(a);
54	return _mm_broadcastb_epi8(a.native());
55	#elif SIMDPP_USE_SSSE3
56	uint8x16 mask = make_shuffle_bytes16_mask<s,s,s,s,s,s,s,s,
57	s,s,s,s,s,s,s,s>(mask);
58	return permute_bytes16(a, mask);
59	#elif SIMDPP_USE_SSE2
60	__m128i n1, n2;
61
62	if (s % `2` == `1`) {
63	n1 = _mm_srli_epi16(a.native(), `8`);
64	n2 = _mm_slli_epi16(n1, `8`);
65	} else {
66	n1 = _mm_slli_epi16(a.native(), `8`);
67	n2 = _mm_srli_epi16(n1, `8`);
68	}
69	uint16x8 b = _mm_or_si128(n1, n2);
70	return (uint8<`16`>) i_splat8<s/`2`>(b);
71	#elif SIMDPP_USE_NEON64
72	return vdupq_laneq_u8(a.native(), s);
73	#elif SIMDPP_USE_NEON
74	if (s < `8`) {
75	uint8x8_t z = vget_low_u8(a.native());
76	return (uint8x16_t) vdupq_lane_u8(z, (s < `8` ? s : `0`));
77	} else {
78	uint8x8_t z = vget_high_u8(a.native());
79	return (uint8x16_t) vdupq_lane_u8(z, (s < `8` ? `0` : s-`8`));
80	}
81	#elif SIMDPP_USE_ALTIVEC
82	return vec_splat(a.native(), s);
83	#elif SIMDPP_USE_MSA
84	return (v16u8) __msa_splat_b((v16i8) a.native(), s);
85	#endif
86	}
87
#if SIMDPP_USE_AVX2
// AVX2 overload of i_splat16 for 32-element 8-bit vectors.
//
// s must be below 16 (checked by static_assert).
// NOTE(review): since s is limited to 16, the broadcast is presumably applied
// to each 128-bit half independently -- confirm against zip16_lo/zip16_hi.
template<unsigned s> SIMDPP_INL
uint8x32 i_splat16(const uint8x32& a)
{
    static_assert(s < 16, "Access out of bounds");
    // Interleaving the vector with itself duplicates every byte, so each
    // 16-bit element of b holds two copies of one source byte. The low zip is
    // used for s < 8 and the high zip otherwise; 16-bit element s%8 is then
    // splatted.
    uint16x16 b; b = s < 8 ? zip16_lo(a, a) : zip16_hi(a, a);
    return (uint8x32) i_splat8<s%8>(b);
}
#endif
97
#if SIMDPP_USE_AVX512BW
// AVX512BW overload of i_splat16 for 64-element 8-bit vectors.
//
// s must be below 16 (checked by static_assert).
// NOTE(review): since s is limited to 16, the broadcast is presumably applied
// to each 128-bit lane independently -- confirm against zip16_lo/zip16_hi.
template<unsigned s> SIMDPP_INL
uint8<64> i_splat16(const uint8<64>& a)
{
    static_assert(s < 16, "Access out of bounds");
    // Interleaving the vector with itself duplicates every byte, so each
    // 16-bit element of b holds two copies of one source byte. The low zip is
    // used for s < 8 and the high zip otherwise; 16-bit element s%8 is then
    // splatted.
    uint16<32> b;
    b = s < 8 ? zip16_lo(a, a) : zip16_hi(a, a);
    return (uint8<64>) i_splat8<s%8>(b);
}
#endif
108
109	template<unsigned s, unsigned N> SIMDPP_INL
110	uint8<N> i_splat16(const uint8<N>& a)
111	{
112	static_assert(s < `16`, "Access out of bounds");
113	SIMDPP_VEC_ARRAY_IMPL1(uint8<N>, i_splat16<s>, a);
114	}
115
116	// -----------------------------------------------------------------------------
117
118	template<unsigned s> SIMDPP_INL
119	uint16x8 i_splat8(const uint16x8& a)
120	{
121	static_assert(s < `8`, "Access out of bounds");
122	#if SIMDPP_USE_NULL
123	return detail::null::splat<s>(a);
124	#elif SIMDPP_USE_AVX2
125	uint16<`8`> b = move8_l<s>(a);
126	return _mm_broadcastw_epi16(b.native());
127	#elif SIMDPP_USE_SSSE3
128	uint16x8 mask = make_shuffle_bytes16_mask<s,s,s,s,s,s,s,s>(mask);
129	return permute_bytes16(a, mask);
130	#elif SIMDPP_USE_SSE2
131	// s2 is needed because static_assert fires in branch we don't use
132	uint64x2 b;
133	if (s < `4`) {
134	const unsigned s2 = s < `4` ? s : s-`4`;
135	b = _mm_shufflelo_epi16(a.native(), _MM_SHUFFLE(s2,s2,s2,s2));
136	return (uint16<`8`>) permute2<`0`,`0`>(b);
137	} else {
138	const unsigned s2 = s < `4` ? s : s-`4`;
139	b = _mm_shufflehi_epi16(a.native(), _MM_SHUFFLE(s2,s2,s2,s2));
140	return (uint16<`8`>) permute2<`1`,`1`>(b);
141	}
142	#elif SIMDPP_USE_NEON64
143	return vdupq_laneq_u16(a.native(), s);
144	#elif SIMDPP_USE_NEON
145	if (s < `4`) {
146	uint16x4_t z = vget_low_u16(a.native());
147	return (uint16x8_t) vdupq_lane_u16(z, (s < `4` ? s : `0`));
148	} else {
149	uint16x4_t z = vget_high_u16(a.native());
150	return (uint16x8_t) vdupq_lane_u16(z, (s < `4` ? `0` : s-`4`));
151	}
152	#elif SIMDPP_USE_ALTIVEC
153	return vec_splat(a.native(), s);
154	#elif SIMDPP_USE_MSA
155	return (v8u16) __msa_splat_h((v8i16) a.native(), s);
156	#endif
157	}
158
#if SIMDPP_USE_AVX2
// AVX2 overload of i_splat8 for 16-element 16-bit vectors.
//
// s must be below 8 (checked by static_assert).
// NOTE(review): the shuffles and permute2 presumably act on each 128-bit half
// independently -- confirm against permute2 for uint64x4.
template<unsigned s> SIMDPP_INL
uint16x16 i_splat8(const uint16x16& a)
{
    static_assert(s < 8, "Access out of bounds");
    if (s < 4) {
        // q is clamped so the shuffle mask stays in range even when this
        // branch is compiled for s >= 4.
        const unsigned q = (s < 4) ? s : 0;
        uint64x4 h = _mm256_shufflelo_epi16(a.native(), SIMDPP_SHUFFLE_MASK_4x4(q, q, q, q));
        h = permute2<0,0>(h);
        return uint16x16(h);
    } else {
        // q is clamped so the shuffle mask stays in range even when this
        // branch is compiled for s < 4.
        const unsigned q = (s < 4) ? 0 : s - 4;
        uint64x4 h = _mm256_shufflehi_epi16(a.native(), SIMDPP_SHUFFLE_MASK_4x4(q, q, q, q));
        h = permute2<1,1>(h);
        return uint16x16(h);
    }
}
#endif
177
#if SIMDPP_USE_AVX512BW
// AVX512BW overload of i_splat8 for 32-element 16-bit vectors.
//
// s must be below 8 (checked by static_assert).
// NOTE(review): the shuffles and permute2 presumably act on each 128-bit lane
// independently -- confirm against permute2 for uint64<8>.
template<unsigned s> SIMDPP_INL
uint16<32> i_splat8(const uint16<32>& a)
{
    static_assert(s < 8, "Access out of bounds");
    uint64<8> r;
    if (s < 4) {
        // q is clamped so the shuffle mask stays in range even when this
        // branch is compiled for s >= 4.
        const unsigned q = (s < 4) ? s : 0;
        r = _mm512_shufflelo_epi16(a.native(), SIMDPP_SHUFFLE_MASK_4x4(q, q, q, q));
        r = permute2<0,0>(r);
    } else {
        // q is clamped so the shuffle mask stays in range even when this
        // branch is compiled for s < 4.
        const unsigned q = (s < 4) ? 0 : s - 4;
        r = _mm512_shufflehi_epi16(a.native(), SIMDPP_SHUFFLE_MASK_4x4(q, q, q, q));
        r = permute2<1,1>(r);
    }
    return uint16<32>(r);
}
#endif
196
197	template<unsigned s, unsigned N> SIMDPP_INL
198	uint16<N> i_splat8(const uint16<N>& a)
199	{
200	static_assert(s < `8`, "Access out of bounds");
201	SIMDPP_VEC_ARRAY_IMPL1(uint16<N>, i_splat8<s>, a);
202	}
203
204	// -----------------------------------------------------------------------------
205
206	template<unsigned s> SIMDPP_INL
207	uint32x4 i_splat4(const uint32x4& a)
208	{
209	static_assert(s < `4`, "Access out of bounds");
210	#if SIMDPP_USE_NULL
211	return detail::null::splat<s>(a);
212	#elif SIMDPP_USE_SSE2
213	return permute4<s,s,s,s>(a);
214	#elif SIMDPP_USE_NEON64
215	return vdupq_laneq_u32(a.native(), s);
216	#elif SIMDPP_USE_NEON
217	if (s < `2`) {
218	uint32x2_t z = vget_low_u32(a.native());
219	// Clang implements vdupq_lane_u32 as a macro, thus we must never
220	// supply it with s>=2, even if we know the branch will never be executed
221	return (uint32x4_t) vdupq_lane_u32(z, (s < `2` ? s : `0`));
222	} else {
223	uint32x2_t z = vget_high_u32(a.native());
224	return (uint32x4_t) vdupq_lane_u32(z, (s < `2` ? `0` : s-`2`));
225	}
226	#elif SIMDPP_USE_ALTIVEC
227	return vec_splat(a.native(), s);
228	#elif SIMDPP_USE_MSA
229	return (v4u32) __msa_splat_w((v4i32) a.native(), s);
230	#endif
231	}
232
#if SIMDPP_USE_AVX2
// AVX2 overload of i_splat4 for 8-element 32-bit vectors.
//
// s must be below 4 (checked by static_assert). The broadcast is expressed as
// permute4 with all four selectors equal to s.
template<unsigned s> SIMDPP_INL
uint32x8 i_splat4(const uint32x8& a)
{
    static_assert(s < 4, "Access out of bounds");
    return permute4<s,s,s,s>(a);
}
#endif
241
#if SIMDPP_USE_AVX512F
// AVX512F overload of i_splat4 for 16-element 32-bit vectors.
//
// s must be below 4 (checked by static_assert). The broadcast is expressed as
// permute4 with all four selectors equal to s.
template<unsigned s> SIMDPP_INL
uint32<16> i_splat4(const uint32<16>& a)
{
    static_assert(s < 4, "Access out of bounds");
    return permute4<s,s,s,s>(a);
}
#endif
250
251	template<unsigned s, unsigned N> SIMDPP_INL
252	uint32<N> i_splat4(const uint32<N>& a)
253	{
254	static_assert(s < `4`, "Access out of bounds");
255	SIMDPP_VEC_ARRAY_IMPL1(uint32<N>, i_splat4<s>, a);
256	}
257
258	// -----------------------------------------------------------------------------
259
260	template<unsigned s> SIMDPP_INL
261	uint64x2 i_splat2(const uint64x2& a)
262	{
263	static_assert(s < `2`, "Access out of bounds");
264	#if SIMDPP_USE_SSE2
265	if (s == `0`) {
266	return permute2<`0`,`0`>(a);
267	} else {
268	return permute2<`1`,`1`>(a);
269	}
270	#elif SIMDPP_USE_NEON64
271	return vdupq_laneq_u64(a.native(), s);
272	#elif SIMDPP_USE_NEON
273	uint64x1_t z;
274	if (s == `0`) {
275	z = vget_low_u64(a.native());
276	} else {
277	z = vget_high_u64(a.native());
278	}
279	return (uint64x2_t) vdupq_lane_u64(z, `0`);
280	#elif SIMDPP_USE_VSX_207
281	return vec_splat(a.native(), s);
282	#elif SIMDPP_USE_MSA
283	return (v2u64) __msa_splat_d((v2i64) a.native(), s);
284	#elif SIMDPP_USE_NULL \|\| SIMDPP_USE_ALTIVEC
285	return detail::null::splat<s>(a);
286	#endif
287	}
288
#if SIMDPP_USE_AVX2
// AVX2 overload of i_splat2 for 4-element 64-bit vectors.
//
// s must be below 2 (checked by static_assert). The broadcast is expressed as
// permute2 with both selectors equal to s.
template<unsigned s> SIMDPP_INL
uint64x4 i_splat2(const uint64x4& a)
{
    static_assert(s < 2, "Access out of bounds");
    return permute2<s,s>(a);
}
#endif
297
#if SIMDPP_USE_AVX512F
// AVX512F overload of i_splat2 for 8-element 64-bit vectors.
//
// s must be below 2 (checked by static_assert). The broadcast is expressed as
// permute2 with both selectors equal to s.
template<unsigned s> SIMDPP_INL
uint64<8> i_splat2(const uint64<8>& a)
{
    static_assert(s < 2, "Access out of bounds");
    return permute2<s,s>(a);
}
#endif
306
307	template<unsigned s, unsigned N> SIMDPP_INL
308	uint64<N> i_splat2(const uint64<N>& a)
309	{
310	static_assert(s < `2`, "Access out of bounds");
311	SIMDPP_VEC_ARRAY_IMPL1(uint64<N>, i_splat2<s>, a);
312	}
313
314	// -----------------------------------------------------------------------------
315
316	template<unsigned s> SIMDPP_INL
317	float32x4 i_splat4(const float32x4& a)
318	{
319	static_assert(s < `4`, "Access out of bounds");
320	#if SIMDPP_USE_NULL \|\| SIMDPP_USE_NEON_NO_FLT_SP
321	return detail::null::splat<s>(a);
322	#elif SIMDPP_USE_SSE2
323	return permute4<s,s,s,s>(a);
324	#elif SIMDPP_USE_NEON64
325	return vdupq_laneq_f32(a.native(), s);
326	#elif SIMDPP_USE_NEON
327	if (s < `2`) {
328	float32x2_t z = vget_low_f32(a.native());
329	// Clang implements vdupq_lane_f32 as a macro, thus we must never
330	// supply it with s>=2, even if we know the branch will never be executed
331	return (float32x4_t) vdupq_lane_f32(z, (s < `2` ? s : `0`));
332	} else {
333	float32x2_t z = vget_high_f32(a.native());
334	return (float32x4_t) vdupq_lane_f32(z, (s < `2` ? `0` : s-`2`) );
335	}
336	#elif SIMDPP_USE_MSA
337	return (v4f32) __msa_splat_w((v4i32) a.native(), s);
338	#elif SIMDPP_USE_ALTIVEC
339	return vec_splat(a.native(), s);
340	#endif
341	}
342
#if SIMDPP_USE_AVX
// AVX overload of i_splat4 for 8-element 32-bit floating-point vectors.
//
// s must be below 4 (checked by static_assert). The broadcast is expressed as
// permute4 with all four selectors equal to s.
template<unsigned s> SIMDPP_INL
float32x8 i_splat4(const float32x8& a)
{
    static_assert(s < 4, "Access out of bounds");
    return permute4<s,s,s,s>(a);
}
#endif
351
#if SIMDPP_USE_AVX512F
// AVX512F overload of i_splat4 for 16-element 32-bit floating-point vectors.
//
// s must be below 4 (checked by static_assert). The broadcast is expressed as
// permute4 with all four selectors equal to s.
template<unsigned s> SIMDPP_INL
float32<16> i_splat4(const float32<16>& a)
{
    static_assert(s < 4, "Access out of bounds");
    return permute4<s,s,s,s>(a);
}
#endif
360
361	template<unsigned s, unsigned N> SIMDPP_INL
362	float32<N> i_splat4(const float32<N>& a)
363	{
364	static_assert(s < `4`, "Access out of bounds");
365	SIMDPP_VEC_ARRAY_IMPL1(float32<N>, i_splat4<s>, a);
366	}
367
368	// -----------------------------------------------------------------------------
369
370	template<unsigned s> SIMDPP_INL
371	float64x2 i_splat2(const float64x2& a)
372	{
373	static_assert(s < `2`, "Access out of bounds");
374	#if SIMDPP_USE_SSE2
375	return permute2<s,s>(a);
376	#elif SIMDPP_USE_NEON64
377	return vdupq_laneq_f64(a.native(), s);
378	#elif SIMDPP_USE_VSX_206
379	return vec_splat(a.native(), s);
380	#elif SIMDPP_USE_MSA
381	return (v2f64) __msa_splat_d((v2i64) a.native(), s);
382	#elif SIMDPP_USE_NULL \|\| SIMDPP_USE_ALTIVEC \|\| SIMDPP_USE_NEON
383	return detail::null::splat<s>(a);
384	#endif
385	}
386
#if SIMDPP_USE_AVX
// AVX overload of i_splat2 for 4-element 64-bit floating-point vectors.
//
// s must be below 2 (checked by static_assert). The broadcast is expressed as
// permute2 with both selectors equal to s.
template<unsigned s> SIMDPP_INL
float64x4 i_splat2(const float64x4& a)
{
    static_assert(s < 2, "Access out of bounds");
    return permute2<s,s>(a);
}
#endif
395
#if SIMDPP_USE_AVX512F
// AVX512F overload of i_splat2 for 8-element 64-bit floating-point vectors.
//
// s must be below 2 (checked by static_assert). The broadcast is expressed as
// permute2 with both selectors equal to s.
template<unsigned s> SIMDPP_INL
float64<8> i_splat2(const float64<8>& a)
{
    static_assert(s < 2, "Access out of bounds");
    return permute2<s,s>(a);
}
#endif
404
405	template<unsigned s, unsigned N> SIMDPP_INL
406	float64<N> i_splat2(const float64<N>& a)
407	{
408	static_assert(s < `2`, "Access out of bounds");
409	SIMDPP_VEC_ARRAY_IMPL1(float64<N>, i_splat2<s>, a);
410	}
411
412
413	} // namespace insn
414	} // namespace detail
415	} // namespace SIMDPP_ARCH_NAMESPACE
416	} // namespace simdpp
417
418	#endif
419
420

Browse the source code of bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/detail/insn/splat_n.h