/*  Copyright (C) 2017  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_POPCNT_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_POPCNT_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/bit_and.h>
#include <simdpp/core/extract.h>
#include <simdpp/core/i_add.h>
#include <simdpp/core/i_shift_r.h>
#include <simdpp/core/i_sub.h>
#include <simdpp/core/i_mul.h>
#include <simdpp/core/insert.h>
#include <simdpp/detail/null/bitwise.h>
#include <simdpp/detail/width.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

template<class V> SIMDPP_INL
V v_emul_popcnt_u8(const V& a)
{
    // We're using 16-bit ops because no 8-bit shift is available on SSE/AVX.
    // The masks below clear any bits that the wider shifts carry across byte
    // boundaries, so the result is still a correct per-byte count. On other
    // architectures there's no difference.
    using w_b16 = typename same_width<V>::u16;

    w_b16 p = (w_b16)a;
    w_b16 m55 = splat(0x5555);
    w_b16 m33 = splat(0x3333);
    w_b16 m0f = splat(0x0f0f);

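    // Standard SWAR population count: form 2-bit sums of adjacent bit pairs,
    // then 4-bit sums of adjacent 2-bit sums, then fold the two nibbles of
    // each byte together to get a per-byte count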
    p = sub(p, bit_and(shift_r<1>(p), m55));
    p = add(bit_and(p, m33), bit_and(shift_r<2>(p), m33));
    p = bit_and(add(p, shift_r<4>(p)), m0f);
    return (V) p;
}

static SIMDPP_INL
uint8<16> i_popcnt(const uint8<16>& a)
{
#if SIMDPP_USE_NULL
    uint8<16> r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = detail::null::el_popcnt8(a.el(i));
    }
    return r;
#elif SIMDPP_USE_NEON
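    // NEON has a native per-byte population count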
    return vcntq_u8(a.native());
#elif SIMDPP_USE_VSX_207
    return vec_vpopcnt(a.native());
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_pcnt_b((v16i8) a.native());
#else
    return v_emul_popcnt_u8(a);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint8<32> i_popcnt(const uint8<32>& a)
{
    return v_emul_popcnt_u8(a);
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
uint8<64> i_popcnt(const uint8<64>& a)
{
    return v_emul_popcnt_u8(a);
}
#endif

// -----------------------------------------------------------------------------

template<class V> SIMDPP_INL
V v_emul_popcnt_u16(const V& a)
{
    V p = a;
    V m55 = splat(0x5555);
    V m33 = splat(0x3333);
    V m0f = splat(0x0f0f);
    V res_mask = splat(0x00ff);

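    // Same SWAR steps as in the 8-bit version, with one extra fold that adds
    // the two byte counts of each 16-bit element together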
    p = sub(p, bit_and(shift_r<1>(p), m55));
    p = add(bit_and(p, m33), bit_and(shift_r<2>(p), m33));
    p = bit_and(add(p, shift_r<4>(p)), m0f);
    p = add(p, shift_r<8>(p));
    p = bit_and(p, res_mask);
    return p;
}

static SIMDPP_INL
uint16<8> i_popcnt(const uint16<8>& a)
{
#if SIMDPP_USE_NULL
    uint16<8> r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = detail::null::el_popcnt16(a.el(i));
    }
    return r;
#elif SIMDPP_USE_NEON
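    // Count bits per byte, then widen and add adjacent byte counts pairwise
    // to get per-u16 counts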
    uint8x16_t p8 = vcntq_u8(vreinterpretq_u8_u16(a.native()));
    return vpaddlq_u8(p8);
#elif SIMDPP_USE_VSX_207
    return vec_vpopcnt(a.native());
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_pcnt_h((v8i16) a.native());
#else
    return v_emul_popcnt_u16(a);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint16<16> i_popcnt(const uint16<16>& a)
{
    return v_emul_popcnt_u16(a);
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
uint16<32> i_popcnt(const uint16<32>& a)
{
    return v_emul_popcnt_u16(a);
}
#endif

// -----------------------------------------------------------------------------

template<class V> SIMDPP_INL
V v_emul_popcnt_u32(const V& a)
{
    V p = a;
    V m55 = splat(0x55555555);
    V m33 = splat(0x33333333);
    V m0f = splat(0x0f0f0f0f);

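    // The usual SWAR steps leave a per-byte count in the low nibble of each
    // byte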
    p = sub(p, bit_and(shift_r<1>(p), m55));
    p = add(bit_and(p, m33), bit_and(shift_r<2>(p), m33));
    p = bit_and(add(p, shift_r<4>(p)), m0f);
#if SIMDPP_USE_SSE4_1 || SIMDPP_USE_NEON || SIMDPP_USE_MSA
    V m01 = splat(0x01010101);
    // Rather than doing 2 adds + 2 shifts we can do 1 mul + 1 shift:
    // multiplying by 0x01010101 accumulates the sum of all four byte counts
    // into the top byte, which the shift then moves down
    p = shift_r<24>(mul_lo(p, m01));
#else
    V res_mask = splat(0x000000ff);
    p = add(p, shift_r<8>(p));
    p = add(p, shift_r<16>(p));
    p = bit_and(p, res_mask);
#endif
    return p;
}

static SIMDPP_INL
uint32<4> i_popcnt(const uint32<4>& a)
{
#if SIMDPP_USE_NULL
    uint32<4> r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = detail::null::el_popcnt32(a.el(i));
    }
    return r;
#elif SIMDPP_USE_X86_POPCNT_INSN
    // using the scalar POPCNT instruction on each lane is slightly faster
    // than the vectorized version
    unsigned a0 = _mm_popcnt_u32(extract<0>(a));
    unsigned a1 = _mm_popcnt_u32(extract<1>(a));
    unsigned a2 = _mm_popcnt_u32(extract<2>(a));
    unsigned a3 = _mm_popcnt_u32(extract<3>(a));
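    // Each count fits into 16 bits, so we can assemble the result with 16-bit
    // inserts: lanes 2, 4 and 6 of a uint16<8> are the low halves of 32-bit
    // lanes 1, 2 and 3. _mm_cvtsi32_si128 zeroes the remaining lanes.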
    uint16<8> r = _mm_cvtsi32_si128(a0);
    r = insert<2>(r, a1);
    r = insert<4>(r, a2);
    r = insert<6>(r, a3);
    return (uint32<4>) r;
#elif SIMDPP_USE_NEON
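    // Count bits per byte, then reduce to per-u32 counts with widening
    // pairwise adds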
    uint8x16_t p8 = vcntq_u8(vreinterpretq_u8_u32(a.native()));
    uint16x8_t p16 = vpaddlq_u8(p8);
    return vpaddlq_u16(p16);
#elif SIMDPP_USE_VSX_207
    return vec_vpopcnt(a.native());
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_pcnt_w((v4i32) a.native());
#else
    return v_emul_popcnt_u32(a);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32<8> i_popcnt(const uint32<8>& a)
{
    return v_emul_popcnt_u32(a);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint32<16> i_popcnt(const uint32<16>& a)
{
    // TODO: support AVX512VPOPCNTDQ
    return v_emul_popcnt_u32(a);
}
#endif

// -----------------------------------------------------------------------------

template<class V> SIMDPP_INL
V v_emul_popcnt_u64(const V& a)
{
    V p = a;
    V m55 = splat(0x5555555555555555);
    V m33 = splat(0x3333333333333333);
    V m0f = splat(0x0f0f0f0f0f0f0f0f);
    V res_mask = splat(0x00000000000000ff);

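    // Same SWAR steps as above, followed by three folds that accumulate the
    // eight byte counts of each 64-bit element into the lowest byte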
    p = sub(p, bit_and(shift_r<1>(p), m55));
    p = add(bit_and(p, m33), bit_and(shift_r<2>(p), m33));
    p = bit_and(add(p, shift_r<4>(p)), m0f);
    p = add(p, shift_r<8>(p));
    p = add(p, shift_r<16>(p));
    p = add(p, shift_r<32>(p));
    p = bit_and(p, res_mask);
    return p;
}

static SIMDPP_INL
uint64<2> i_popcnt(const uint64<2>& a)
{
#if SIMDPP_USE_NULL
    uint64<2> r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = detail::null::el_popcnt64(a.el(i));
    }
    return r;
#elif SIMDPP_USE_X86_POPCNT_INSN
    unsigned a0, a1;
#if SIMDPP_64_BITS
    a0 = _mm_popcnt_u64(extract<0>(a));
    a1 = _mm_popcnt_u64(extract<1>(a));
#else
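    // 32-bit targets have no _mm_popcnt_u64; sum the counts of the two
    // 32-bit halves of each 64-bit lane instead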
    uint32<4> a32; a32 = a;
    a0 = _mm_popcnt_u32(extract<0>(a32));
    a0 += _mm_popcnt_u32(extract<1>(a32));
    a1 = _mm_popcnt_u32(extract<2>(a32));
    a1 += _mm_popcnt_u32(extract<3>(a32));
#endif
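    // insert<4> fills the 16-bit lane that forms the low half of the second
    // 64-bit lane; _mm_cvtsi32_si128 already zeroed everything else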
    uint16<8> r = _mm_cvtsi32_si128(a0);
    r = insert<4>(r, a1);
    return (uint64<2>) r;
#elif SIMDPP_USE_SSE2
    uint8<16> p8 = v_emul_popcnt_u8((uint8<16>) a);
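    // A sum of absolute differences against zero adds up the eight byte
    // counts within each 64-bit half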
    return _mm_sad_epu8(p8.native(), _mm_setzero_si128());
#elif SIMDPP_USE_NEON
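    // Count bits per byte, then reduce to per-u64 counts with widening
    // pairwise adds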
    uint8x16_t p8 = vcntq_u8(vreinterpretq_u8_u64(a.native()));
    uint16x8_t p16 = vpaddlq_u8(p8);
    uint32x4_t p32 = vpaddlq_u16(p16);
    return vpaddlq_u32(p32);
#elif SIMDPP_USE_VSX_207
    return vec_vpopcnt(a.native());
#elif SIMDPP_USE_MSA
    return (v2u64) __msa_pcnt_d((v2i64) a.native());
#else
    return v_emul_popcnt_u64(a);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint64<4> i_popcnt(const uint64<4>& a)
{
#if SIMDPP_USE_X86_POPCNT_INSN && SIMDPP_64_BITS
    uint64<2> a0, a1;
    split(a, a0, a1);
    a0 = i_popcnt(a0);
    a1 = i_popcnt(a1);
    return combine(a0, a1);
#else
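    // Emulate a per-byte popcount, then sum the byte counts within each
    // 64-bit half via a sum of absolute differences against zero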
    uint8<32> p8 = v_emul_popcnt_u8((uint8<32>) a);
    return _mm256_sad_epu8(p8.native(), _mm256_setzero_si256());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint64<8> i_popcnt(const uint64<8>& a)
{
    // TODO: support AVX512VPOPCNTDQ
#if SIMDPP_USE_AVX512BW
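    // With AVX512BW the byte-popcount plus SAD approach works on the full
    // 512-bit vector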
    uint8<64> p8 = v_emul_popcnt_u8((uint8<64>) a);
    return _mm512_sad_epu8(p8.native(), _mm512_setzero_si512());
#else
    return v_emul_popcnt_u64(a);
#endif
}
#endif

// -----------------------------------------------------------------------------

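// Generic fallback for wider vector types: apply i_popcnt to each
// native-width sub-vector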
template<class V> SIMDPP_INL
V i_popcnt(const V& a)
{
    SIMDPP_VEC_ARRAY_IMPL1(V, i_popcnt, a)
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif