| 1 | /* Copyright (C) 2013-2014 Povilas Kanapickas <povilas@radix.lt> |
| 2 | |
| 3 | Distributed under the Boost Software License, Version 1.0. |
| 4 | (See accompanying file LICENSE_1_0.txt or copy at |
| 5 | http://www.boost.org/LICENSE_1_0.txt) |
| 6 | */ |
| 7 | |
| 8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_SHUFFLE_BYTES16_H |
| 9 | #define LIBSIMDPP_SIMDPP_DETAIL_INSN_SHUFFLE_BYTES16_H |
| 10 | |
| 11 | #ifndef LIBSIMDPP_SIMD_H |
| 12 | #error "This file must be included through simd.h" |
| 13 | #endif |
| 14 | |
| 15 | #include <simdpp/types.h> |
| 16 | #include <simdpp/core/bit_or.h> |
| 17 | #include <simdpp/core/blend.h> |
| 18 | #include <simdpp/core/cmp_lt.h> |
| 19 | #include <simdpp/core/i_add.h> |
| 20 | #include <simdpp/detail/not_implemented.h> |
| 21 | |
| 22 | namespace simdpp { |
| 23 | namespace SIMDPP_ARCH_NAMESPACE { |
| 24 | namespace detail { |
| 25 | namespace insn { |
| 26 | |
| 27 | |
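/*  Selects bytes from the 32-byte concatenation of two 16-byte vectors.
    Each mask byte picks one result byte: values 0x00-0x0f select the
    corresponding byte of a, values 0x10-0x1f select a byte of b (see the
    reference loop in the SIMDPP_USE_NULL branch below). The result for mask
    values outside 0x00-0x1f differs between instruction sets, so portable
    code must not rely on it.

    A minimal usage sketch (hypothetical values; assumes the public
    shuffle_bytes16 wrapper that dispatches to this function):

        uint8<16> a, b; // input vectors, assumed initialized
        // interleave the low halves of a and b, equivalent to zip16_lo
        uint8<16> mask = make_uint(0x00,0x10, 0x01,0x11, 0x02,0x12, 0x03,0x13,
                                   0x04,0x14, 0x05,0x15, 0x06,0x16, 0x07,0x17);
        uint8<16> r = shuffle_bytes16(a, b, mask);
*/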
| 28 | static SIMDPP_INL |
| 29 | uint8x16 i_shuffle_bytes16(const uint8x16& a, const uint8x16& b, const uint8x16& mask) |
| 30 | { |
| 31 | #if SIMDPP_USE_NULL |
| 32 | uint8x16 r; |
| 33 | |
    for (unsigned i = 0; i < 16; i++) {
        unsigned j = mask.el(i) & 0x0f; // byte index within the source vector
        bool which = mask.el(i) < 0x10; // true selects a, false selects b
        r.el(i) = which ? a.el(j) : b.el(j);
    }
| 39 | return r; |
| 40 | #elif SIMDPP_USE_XOP |
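    // XOP's vpperm selects each result byte from the 32-byte concatenation
    // of a and b, matching the reference semantics directly.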
| 41 | return _mm_perm_epi8(a.native(), b.native(), mask.native()); |
| 42 | #elif SIMDPP_USE_AVX |
    // It's not advantageous to use _mm_blendv_epi8 on pre-AVX machines
    // because it takes the same number of cycles as the alternative, but
    // the non-VEX encoding forces the selector mask into the xmm0 register.
| 46 | int16x8 sel, ai, bi, r; |
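    // Shift the a/b selector bit (0x10) into the sign bit of each byte;
    // _mm_blendv_epi8 chooses b's shuffle result wherever that bit is set.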
| 47 | sel = _mm_slli_epi16(mask.native(), 3); |
| 48 | |
| 49 | ai = _mm_shuffle_epi8(a.native(), mask.native()); |
| 50 | bi = _mm_shuffle_epi8(b.native(), mask.native()); |
| 51 | r = _mm_blendv_epi8(ai.native(), bi.native(), sel.native()); |
| 52 | return (uint8<16>) r; |
| 53 | #elif SIMDPP_USE_SSSE3 |
#if (defined(__clang__) && (__clang_major__ == 3) && (__clang_minor__ >= 6) && (__clang_minor__ <= 7))
    // Clang 3.6 and 3.7 miscompile certain cases of a constant mask when the
    // values are available for the compiler to collapse. Fortunately, the
    // overhead of the workaround is very small.
| 58 | uint8<16> ai, bi; |
| 59 | mask_int8<16> select_a = cmp_lt((int8<16>) mask, 0x10); |
| 60 | ai = _mm_shuffle_epi8(a.native(), mask.native()); |
| 61 | bi = _mm_shuffle_epi8(b.native(), mask.native()); |
| 62 | return blend(ai, bi, select_a); |
| 63 | #else |
    uint8x16 m1, m2, ai, bi;
    // Setting bit 7 of a pshufb index zeroes the corresponding result byte.
    // Adding 0x70 sets bit 7 for mask values >= 0x10 (elements that must come
    // from b), so ai keeps only the bytes selected from a. Adding 0xf0 wraps
    // values 0x10..0x1f into 0x00..0x0f and sets bit 7 for values below 0x10,
    // so bi keeps only the bytes selected from b.
    m1 = add(mask, 0x70);
    m2 = add(mask, 0xf0);
| 68 | |
| 69 | ai = _mm_shuffle_epi8(a.native(), m1.native()); |
| 70 | bi = _mm_shuffle_epi8(b.native(), m2.native()); |
| 71 | return bit_or(ai, bi); |
| 72 | #endif |
| 73 | #elif SIMDPP_USE_NEON32 |
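    // vtbl4 performs a lookup into a 32-byte table held in four 8-byte
    // vectors; indices 0x00-0x1f address {a, b} and out-of-range indices
    // return zero.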
| 74 | uint8x8x4_t table = {{vget_low_u8(a.native()), vget_high_u8(a.native()), |
| 75 | vget_low_u8(b.native()), vget_high_u8(b.native())}}; |
| 76 | uint8x8_t lo = vtbl4_u8(table, vget_low_u8(mask.native())); |
| 77 | uint8x8_t hi = vtbl4_u8(table, vget_high_u8(mask.native())); |
| 78 | return vcombine_u8(lo, hi); |
| 79 | #elif SIMDPP_USE_NEON64 |
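    // vqtbl2q_u8 indexes the 32-byte table {a, b} directly; out-of-range
    // indices return zero.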
| 80 | uint8x16x2_t table; |
| 81 | table.val[0] = a.native(); |
| 82 | table.val[1] = b.native(); |
| 83 | return vqtbl2q_u8(table, mask.native()); |
| 84 | #elif SIMDPP_USE_ALTIVEC |
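    // vec_perm implements these semantics directly, using the low five bits
    // of each mask byte.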
| 85 | return vec_perm(a.native(), b.native(), mask.native()); |
| 86 | #elif SIMDPP_USE_MSA |
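    // vshf.b picks each byte from the concatenation of the two data
    // operands: indices 0-15 address the third operand, 16-31 the second,
    // hence a is passed last.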
| 87 | return (v16u8) __msa_vshf_b((v16i8)mask.native(), |
| 88 | (v16i8)b.native(), |
| 89 | (v16i8)a.native()); |
| 90 | #else |
| 91 | return SIMDPP_NOT_IMPLEMENTED3(a, b, mask); |
| 92 | #endif |
| 93 | } |
| 94 | |
| 95 | #if SIMDPP_USE_AVX2 |
| 96 | static SIMDPP_INL |
| 97 | uint8x32 i_shuffle_bytes16(const uint8x32& a, const uint8x32& b, const uint8x32& mask) |
| 98 | { |
| 99 | int16x16 sel, ai, bi, r; |
| 100 | sel = mask; |
    // Move the a/b selector bit (0x10) into the sign bit of each byte, which
    // is what blendv examines; the top 3 bits of a valid mask byte are
    // already clear.
    sel = _mm256_slli_epi16(sel.native(), 3);
| 102 | |
| 103 | ai = _mm256_shuffle_epi8(a.native(), mask.native()); |
| 104 | bi = _mm256_shuffle_epi8(b.native(), mask.native()); |
| 105 | r = _mm256_blendv_epi8(ai.native(), bi.native(), sel.native()); |
| 106 | return (uint8<32>) r; |
| 107 | } |
| 108 | #endif |
| 109 | |
| 110 | #if SIMDPP_USE_AVX512BW |
| 111 | SIMDPP_INL uint8<64> i_shuffle_bytes16(const uint8<64>& a, const uint8<64>& b, const uint8<64>& mask) |
| 112 | { |
| 113 | uint8<64> sel_mask, ai, bi, r; |
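    // Bit 4 (0x10) of each mask byte selects between a and b;
    // _mm512_test_epi8_mask turns it into a per-byte predicate mask for the
    // byte blend below.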
| 114 | sel_mask = make_uint(0x10); |
| 115 | __mmask64 sel = _mm512_test_epi8_mask(mask.native(), sel_mask.native()); |
| 116 | |
| 117 | ai = _mm512_shuffle_epi8(a.native(), mask.native()); |
| 118 | bi = _mm512_shuffle_epi8(b.native(), mask.native()); |
| 119 | r = _mm512_mask_blend_epi8(sel, ai.native(), bi.native()); |
| 120 | return r; |
| 121 | } |
| 122 | #endif |
| 123 | |
| 124 | template<unsigned N> SIMDPP_INL |
| 125 | uint8<N> i_shuffle_bytes16(const uint8<N>& a, const uint8<N>& b, const uint8<N>& mask) |
| 126 | { |
| 127 | SIMDPP_VEC_ARRAY_IMPL3(uint8<N>, i_shuffle_bytes16, a, b, mask); |
| 128 | } |
| 129 | |
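// shuffle_bytes16 operates at byte granularity, so shuffles of wider element
// types simply reinterpret both the data and the mask as bytes and forward.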
| 130 | template<unsigned N> SIMDPP_INL |
| 131 | uint16<N> i_shuffle_bytes16(const uint16<N>& a, const uint16<N>& b, const uint16<N>& mask) |
| 132 | { |
| 133 | return (uint16<N>) i_shuffle_bytes16(uint8<N*2>(a), uint8<N*2>(b), uint8<N*2>(mask)); |
| 134 | } |
| 135 | template<unsigned N> SIMDPP_INL |
| 136 | uint32<N> i_shuffle_bytes16(const uint32<N>& a, const uint32<N>& b, const uint32<N>& mask) |
| 137 | { |
| 138 | return (uint32<N>) i_shuffle_bytes16(uint8<N*4>(a), uint8<N*4>(b), uint8<N*4>(mask)); |
| 139 | } |
| 140 | template<unsigned N> SIMDPP_INL |
| 141 | uint64<N> i_shuffle_bytes16(const uint64<N>& a, const uint64<N>& b, const uint64<N>& mask) |
| 142 | { |
| 143 | return (uint64<N>) i_shuffle_bytes16(uint8<N*8>(a), uint8<N*8>(b), uint8<N*8>(mask)); |
| 144 | } |
| 145 | template<unsigned N> SIMDPP_INL |
| 146 | float32<N> i_shuffle_bytes16(const float32<N>& a, const float32<N>& b, const uint32<N>& mask) |
| 147 | { |
| 148 | return float32<N>(i_shuffle_bytes16(uint32<N>(a), uint32<N>(b), mask)); |
| 149 | } |
| 150 | template<unsigned N> SIMDPP_INL |
| 151 | float64<N> i_shuffle_bytes16(const float64<N>& a, const float64<N>& b, const uint64<N>& mask) |
| 152 | { |
| 153 | return float64<N>(i_shuffle_bytes16(uint64<N>(a), uint64<N>(b), mask)); |
| 154 | } |
| 155 | |
| 156 | |
| 157 | } // namespace insn |
| 158 | } // namespace detail |
| 159 | } // namespace SIMDPP_ARCH_NAMESPACE |
| 160 | } // namespace simdpp |
| 161 | |
| 162 | #endif |
| 163 | |
| 164 | |