/*  Copyright (C) 2017  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_POPCNT_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_POPCNT_H

#ifndef LIBSIMDPP_SIMD_H
#error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/bit_and.h>
#include <simdpp/core/extract.h>
#include <simdpp/core/i_add.h>
#include <simdpp/core/i_shift_r.h>
#include <simdpp/core/i_sub.h>
#include <simdpp/core/i_mul.h>
#include <simdpp/core/insert.h>
#include <simdpp/detail/null/bitwise.h>
#include <simdpp/detail/width.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

template<class V> SIMDPP_INL
V v_emul_popcnt_u8(const V& a)
{
    // We use 16-bit operations because SSE/AVX provides no 8-bit shifts.
    // On other architectures this makes no difference.
    using w_b16 = typename same_width<V>::u16;

    w_b16 p = (w_b16)a;
    w_b16 m55 = splat(0x5555);
    w_b16 m33 = splat(0x3333);
    w_b16 m0f = splat(0x0f0f);

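    // Classic SWAR reduction: bit counts are accumulated per 2-bit pair,
    // then per nibble, then per byte. For example, for the byte 0xb7
    // (0b10110111, popcount 6):
    //   after step 1: 0b01100110 (pair counts 1,2,1,2)
    //   after step 2: 0b00110011 (nibble counts 3,3)
    //   after step 3: 0b00000110 (byte count 6)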
    p = sub(p, bit_and(shift_r<1>(p), m55));
    p = add(bit_and(p, m33), bit_and(shift_r<2>(p), m33));
    p = bit_and(add(p, shift_r<4>(p)), m0f);
    return (V) p;
}

static SIMDPP_INL
uint8<16> i_popcnt(const uint8<16>& a)
{
#if SIMDPP_USE_NULL
    uint8<16> r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = detail::null::el_popcnt8(a.el(i));
    }
    return r;
#elif SIMDPP_USE_NEON
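    // NEON has a native per-byte population count instruction.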
    return vcntq_u8(a.native());
#elif SIMDPP_USE_VSX_207
    return vec_vpopcnt(a.native());
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_pcnt_b((v16i8) a.native());
#else
    return v_emul_popcnt_u8(a);
#endif
}

#if SIMDPP_USE_AVX2
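// AVX2 has no per-byte popcount instruction, so the SWAR emulation is used
// on the whole 256-bit register.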
static SIMDPP_INL
uint8<32> i_popcnt(const uint8<32>& a)
{
    return v_emul_popcnt_u8(a);
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
uint8<64> i_popcnt(const uint8<64>& a)
{
    return v_emul_popcnt_u8(a);
}
#endif

// -----------------------------------------------------------------------------

template<class V> SIMDPP_INL
V v_emul_popcnt_u16(const V& a)
{
    V p = a;
    V m55 = splat(0x5555);
    V m33 = splat(0x3333);
    V m0f = splat(0x0f0f);
    V res_mask = splat(0x00ff);
    p = sub(p, bit_and(shift_r<1>(p), m55));
    p = add(bit_and(p, m33), bit_and(shift_r<2>(p), m33));
    p = bit_and(add(p, shift_r<4>(p)), m0f);
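    // Fold the two per-byte counts into the low byte of each 16-bit lane.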
    p = add(p, shift_r<8>(p));
    p = bit_and(p, res_mask);
    return p;
}

static SIMDPP_INL
uint16<8> i_popcnt(const uint16<8>& a)
{
#if SIMDPP_USE_NULL
    uint16<8> r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = detail::null::el_popcnt16(a.el(i));
    }
    return r;
#elif SIMDPP_USE_NEON
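    // Count bits per byte, then sum adjacent byte counts into 16-bit lanes
    // with a widening pairwise add.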
    uint8x16_t p8 = vcntq_u8(vreinterpretq_u8_u16(a.native()));
    return vpaddlq_u8(p8);
#elif SIMDPP_USE_VSX_207
    return vec_vpopcnt(a.native());
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_pcnt_h((v8i16) a.native());
#else
    return v_emul_popcnt_u16(a);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint16<16> i_popcnt(const uint16<16>& a)
{
    return v_emul_popcnt_u16(a);
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
uint16<32> i_popcnt(const uint16<32>& a)
{
    return v_emul_popcnt_u16(a);
}
#endif

// -----------------------------------------------------------------------------

template<class V> SIMDPP_INL
V v_emul_popcnt_u32(const V& a)
{
    V p = a;
    V m55 = splat(0x55555555);
    V m33 = splat(0x33333333);
    V m0f = splat(0x0f0f0f0f);

    p = sub(p, bit_and(shift_r<1>(p), m55));
    p = add(bit_and(p, m33), bit_and(shift_r<2>(p), m33));
    p = bit_and(add(p, shift_r<4>(p)), m0f);
#if SIMDPP_USE_SSE4_1 || SIMDPP_USE_NEON || SIMDPP_USE_MSA
    V m01 = splat(0x01010101);
    // Rather than doing 2 adds + 2 shifts, multiply by 0x01010101: this sums
    // all four byte counts into the most significant byte, which the shift
    // then moves down.
    p = shift_r<24>(mul_lo(p, m01));
#else
    V res_mask = splat(0x000000ff);
    p = add(p, shift_r<8>(p));
    p = add(p, shift_r<16>(p));
    p = bit_and(p, res_mask);
#endif
    return p;
}

static SIMDPP_INL
uint32<4> i_popcnt(const uint32<4>& a)
{
#if SIMDPP_USE_NULL
    uint32<4> r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = detail::null::el_popcnt32(a.el(i));
    }
    return r;
#elif SIMDPP_USE_X86_POPCNT_INSN
    // Scalar popcnt on each element is slightly faster than the vectorized
    // emulation.
    unsigned a0 = _mm_popcnt_u32(extract<0>(a));
    unsigned a1 = _mm_popcnt_u32(extract<1>(a));
    unsigned a2 = _mm_popcnt_u32(extract<2>(a));
    unsigned a3 = _mm_popcnt_u32(extract<3>(a));
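    // Each count fits into 16 bits. _mm_cvtsi32_si128 zeroes the upper lanes,
    // so writing the counts to the even 16-bit positions fills the low half
    // of each 32-bit lane while the odd positions stay zero.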
    uint16<8> r = _mm_cvtsi32_si128(a0);
    r = insert<2>(r, a1);
    r = insert<4>(r, a2);
    r = insert<6>(r, a3);
    return (uint32<4>) r;
#elif SIMDPP_USE_NEON
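    // Per-byte counts, then two widening pairwise adds:
    // bytes -> 16-bit lanes -> 32-bit lanes.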
    uint8x16_t p8 = vcntq_u8(vreinterpretq_u8_u32(a.native()));
    uint16x8_t p16 = vpaddlq_u8(p8);
    return vpaddlq_u16(p16);
#elif SIMDPP_USE_VSX_207
    return vec_vpopcnt(a.native());
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_pcnt_w((v4i32) a.native());
#else
    return v_emul_popcnt_u32(a);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32<8> i_popcnt(const uint32<8>& a)
{
    return v_emul_popcnt_u32(a);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint32<16> i_popcnt(const uint32<16>& a)
{
    // TODO: support AVX512VPOPCNTDQ
    return v_emul_popcnt_u32(a);
}
#endif

// -----------------------------------------------------------------------------

template<class V> SIMDPP_INL
V v_emul_popcnt_u64(const V& a)
{
    V p = a;
    V m55 = splat(0x5555555555555555);
    V m33 = splat(0x3333333333333333);
    V m0f = splat(0x0f0f0f0f0f0f0f0f);
    V res_mask = splat(0x00000000000000ff);
    p = sub(p, bit_and(shift_r<1>(p), m55));
    p = add(bit_and(p, m33), bit_and(shift_r<2>(p), m33));
    p = bit_and(add(p, shift_r<4>(p)), m0f);
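    // Fold the eight per-byte counts into the low byte of each 64-bit lane.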
    p = add(p, shift_r<8>(p));
    p = add(p, shift_r<16>(p));
    p = add(p, shift_r<32>(p));
    p = bit_and(p, res_mask);
    return p;
}

static SIMDPP_INL
uint64<2> i_popcnt(const uint64<2>& a)
{
#if SIMDPP_USE_NULL
    uint64<2> r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = detail::null::el_popcnt64(a.el(i));
    }
    return r;
#elif SIMDPP_USE_X86_POPCNT_INSN
    unsigned a0, a1;
#if SIMDPP_64_BITS
    a0 = _mm_popcnt_u64(extract<0>(a));
    a1 = _mm_popcnt_u64(extract<1>(a));
#else
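    // On 32-bit targets there is no 64-bit scalar popcnt; count each 32-bit
    // half separately and sum the two counts.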
    uint32<4> a32; a32 = a;
    a0 = _mm_popcnt_u32(extract<0>(a32));
    a0 += _mm_popcnt_u32(extract<1>(a32));
    a1 = _mm_popcnt_u32(extract<2>(a32));
    a1 += _mm_popcnt_u32(extract<3>(a32));
#endif
    uint16<8> r = _mm_cvtsi32_si128(a0);
    r = insert<4>(r, a1);
    return (uint64<2>) r;
#elif SIMDPP_USE_SSE2
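    // _mm_sad_epu8 against zero sums the eight per-byte counts within each
    // 64-bit half of the register.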
    uint8<16> p8 = v_emul_popcnt_u8((uint8<16>) a);
    return _mm_sad_epu8(p8.native(), _mm_setzero_si128());
#elif SIMDPP_USE_NEON
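    // Three widening pairwise adds accumulate the per-byte counts up to
    // 64-bit lanes.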
    uint8x16_t p8 = vcntq_u8(vreinterpretq_u8_u64(a.native()));
    uint16x8_t p16 = vpaddlq_u8(p8);
    uint32x4_t p32 = vpaddlq_u16(p16);
    return vpaddlq_u32(p32);
#elif SIMDPP_USE_VSX_207
    return vec_vpopcnt(a.native());
#elif SIMDPP_USE_MSA
    return (v2u64) __msa_pcnt_d((v2i64) a.native());
#else
    return v_emul_popcnt_u64(a);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint64<4> i_popcnt(const uint64<4>& a)
{
#if SIMDPP_USE_X86_POPCNT_INSN && SIMDPP_64_BITS
    uint64<2> a0, a1;
    split(a, a0, a1);
    a0 = i_popcnt(a0);
    a1 = i_popcnt(a1);
    return combine(a0, a1);
#else
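    // Without 64-bit scalar popcnt, compute per-byte counts and sum them
    // within each 64-bit lane using SAD against zero.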
    uint8<32> p8 = v_emul_popcnt_u8((uint8<32>) a);
    return _mm256_sad_epu8(p8.native(), _mm256_setzero_si256());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint64<8> i_popcnt(const uint64<8>& a)
{
    // TODO: support AVX512VPOPCNTDQ
#if SIMDPP_USE_AVX512BW
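    // Per-byte counts summed within each 64-bit lane via SAD against zero.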
    uint8<64> p8 = v_emul_popcnt_u8((uint8<64>) a);
    return _mm512_sad_epu8(p8.native(), _mm512_setzero_si512());
#else
    return v_emul_popcnt_u64(a);
#endif
}
#endif

// -----------------------------------------------------------------------------

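// Generic case: apply i_popcnt to each register of a multi-register vector.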
template<class V> SIMDPP_INL
V i_popcnt(const V& a)
{
    SIMDPP_VEC_ARRAY_IMPL1(V, i_popcnt, a)
}
} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif