/*  Copyright (C) 2017  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_POPCNT_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_POPCNT_H

#ifndef LIBSIMDPP_SIMD_H
#error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/bit_and.h>
#include <simdpp/core/extract.h>
#include <simdpp/core/i_add.h>
#include <simdpp/core/i_shift_r.h>
#include <simdpp/core/i_sub.h>
#include <simdpp/core/i_mul.h>
#include <simdpp/core/insert.h>
#include <simdpp/detail/null/bitwise.h>
#include <simdpp/detail/width.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

template<class V> SIMDPP_INL
V v_emul_popcnt_u8(const V& a)
{
    // We use 16-bit operations because SSE/AVX provides no 8-bit shifts.
    // On other architectures this makes no difference.
    using w_b16 = typename same_width<V>::u16;

    w_b16 p = (w_b16)a;
    w_b16 m55 = splat(0x5555);
    w_b16 m33 = splat(0x3333);
    w_b16 m0f = splat(0x0f0f);

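    // Classic SWAR reduction: bit counts are accumulated per 2-bit pair,
    // then per nibble, then per byte. For example, for the byte 0xb7
    // (0b10110111, popcount 6):
    //   after step 1: 0b01100110 (pair counts 1,2,1,2)
    //   after step 2: 0b00110011 (nibble counts 3,3)
    //   after step 3: 0b00000110 (byte count 6)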
    p = sub(p, bit_and(shift_r<1>(p), m55));
    p = add(bit_and(p, m33), bit_and(shift_r<2>(p), m33));
    p = bit_and(add(p, shift_r<4>(p)), m0f);
    return (V) p;
}

static SIMDPP_INL
uint8<16> i_popcnt(const uint8<16>& a)
{
#if SIMDPP_USE_NULL
    uint8<16> r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = detail::null::el_popcnt8(a.el(i));
    }
    return r;
#elif SIMDPP_USE_NEON
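    // NEON has a native per-byte population count instruction.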
    return vcntq_u8(a.native());
#elif SIMDPP_USE_VSX_207
    return vec_vpopcnt(a.native());
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_pcnt_b((v16i8) a.native());
#else
    return v_emul_popcnt_u8(a);
#endif
}

#if SIMDPP_USE_AVX2
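// AVX2 has no per-byte popcount instruction, so the SWAR emulation is used
// on the whole 256-bit register.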
static SIMDPP_INL
uint8<32> i_popcnt(const uint8<32>& a)
{
    return v_emul_popcnt_u8(a);
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
uint8<64> i_popcnt(const uint8<64>& a)
{
    return v_emul_popcnt_u8(a);
}
#endif

// -----------------------------------------------------------------------------

template<class V> SIMDPP_INL
V v_emul_popcnt_u16(const V& a)
{
    V p = a;
    V m55 = splat(0x5555);
    V m33 = splat(0x3333);
    V m0f = splat(0x0f0f);
    V res_mask = splat(0x00ff);
    p = sub(p, bit_and(shift_r<1>(p), m55));
    p = add(bit_and(p, m33), bit_and(shift_r<2>(p), m33));
    p = bit_and(add(p, shift_r<4>(p)), m0f);
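    // Fold the two per-byte counts into the low byte of each 16-bit lane.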
    p = add(p, shift_r<8>(p));
    p = bit_and(p, res_mask);
    return p;
}

static SIMDPP_INL
uint16<8> i_popcnt(const uint16<8>& a)
{
#if SIMDPP_USE_NULL
    uint16<8> r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = detail::null::el_popcnt16(a.el(i));
    }
    return r;
#elif SIMDPP_USE_NEON
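    // Count bits per byte, then sum adjacent byte counts into 16-bit lanes
    // with a widening pairwise add.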
    uint8x16_t p8 = vcntq_u8(vreinterpretq_u8_u16(a.native()));
    return vpaddlq_u8(p8);
#elif SIMDPP_USE_VSX_207
    return vec_vpopcnt(a.native());
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_pcnt_h((v8i16) a.native());
#else
    return v_emul_popcnt_u16(a);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint16<16> i_popcnt(const uint16<16>& a)
{
    return v_emul_popcnt_u16(a);
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
uint16<32> i_popcnt(const uint16<32>& a)
{
    return v_emul_popcnt_u16(a);
}
#endif

// -----------------------------------------------------------------------------

template<class V> SIMDPP_INL
V v_emul_popcnt_u32(const V& a)
{
    V p = a;
    V m55 = splat(0x55555555);
    V m33 = splat(0x33333333);
    V m0f = splat(0x0f0f0f0f);

    p = sub(p, bit_and(shift_r<1>(p), m55));
    p = add(bit_and(p, m33), bit_and(shift_r<2>(p), m33));
    p = bit_and(add(p, shift_r<4>(p)), m0f);
#if SIMDPP_USE_SSE4_1 || SIMDPP_USE_NEON || SIMDPP_USE_MSA
    V m01 = splat(0x01010101);
    // Rather than doing 2 adds + 2 shifts, multiply by 0x01010101: this sums
    // all four byte counts into the most significant byte, which the shift
    // then moves down.
    p = shift_r<24>(mul_lo(p, m01));
#else
    V res_mask = splat(0x000000ff);
    p = add(p, shift_r<8>(p));
    p = add(p, shift_r<16>(p));
    p = bit_and(p, res_mask);
#endif
    return p;
}

static SIMDPP_INL
uint32<4> i_popcnt(const uint32<4>& a)
{
#if SIMDPP_USE_NULL
    uint32<4> r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = detail::null::el_popcnt32(a.el(i));
    }
    return r;
#elif SIMDPP_USE_X86_POPCNT_INSN
    // Scalar popcnt on each element is slightly faster than the vectorized
    // emulation.
    unsigned a0 = _mm_popcnt_u32(extract<0>(a));
    unsigned a1 = _mm_popcnt_u32(extract<1>(a));
    unsigned a2 = _mm_popcnt_u32(extract<2>(a));
    unsigned a3 = _mm_popcnt_u32(extract<3>(a));
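    // Each count fits into 16 bits. _mm_cvtsi32_si128 zeroes the upper lanes,
    // so writing the counts to the even 16-bit positions fills the low half
    // of each 32-bit lane while the odd positions stay zero.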
    uint16<8> r = _mm_cvtsi32_si128(a0);
    r = insert<2>(r, a1);
    r = insert<4>(r, a2);
    r = insert<6>(r, a3);
    return (uint32<4>) r;
#elif SIMDPP_USE_NEON
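    // Per-byte counts, then two widening pairwise adds:
    // bytes -> 16-bit lanes -> 32-bit lanes.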
    uint8x16_t p8 = vcntq_u8(vreinterpretq_u8_u32(a.native()));
    uint16x8_t p16 = vpaddlq_u8(p8);
    return vpaddlq_u16(p16);
#elif SIMDPP_USE_VSX_207
    return vec_vpopcnt(a.native());
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_pcnt_w((v4i32) a.native());
#else
    return v_emul_popcnt_u32(a);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32<8> i_popcnt(const uint32<8>& a)
{
    return v_emul_popcnt_u32(a);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint32<16> i_popcnt(const uint32<16>& a)
{
    // TODO: support AVX512VPOPCNTDQ
    return v_emul_popcnt_u32(a);
}
#endif

// -----------------------------------------------------------------------------

template<class V> SIMDPP_INL
V v_emul_popcnt_u64(const V& a)
{
    V p = a;
    V m55 = splat(0x5555555555555555);
    V m33 = splat(0x3333333333333333);
    V m0f = splat(0x0f0f0f0f0f0f0f0f);
    V res_mask = splat(0x00000000000000ff);
    p = sub(p, bit_and(shift_r<1>(p), m55));
    p = add(bit_and(p, m33), bit_and(shift_r<2>(p), m33));
    p = bit_and(add(p, shift_r<4>(p)), m0f);
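    // Fold the eight per-byte counts into the low byte of each 64-bit lane.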
    p = add(p, shift_r<8>(p));
    p = add(p, shift_r<16>(p));
    p = add(p, shift_r<32>(p));
    p = bit_and(p, res_mask);
    return p;
}

static SIMDPP_INL
uint64<2> i_popcnt(const uint64<2>& a)
{
#if SIMDPP_USE_NULL
    uint64<2> r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = detail::null::el_popcnt64(a.el(i));
    }
    return r;
#elif SIMDPP_USE_X86_POPCNT_INSN
    unsigned a0, a1;
#if SIMDPP_64_BITS
    a0 = _mm_popcnt_u64(extract<0>(a));
    a1 = _mm_popcnt_u64(extract<1>(a));
#else
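    // On 32-bit targets there is no 64-bit scalar popcnt; count each 32-bit
    // half separately and sum the two counts.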
    uint32<4> a32; a32 = a;
    a0 = _mm_popcnt_u32(extract<0>(a32));
    a0 += _mm_popcnt_u32(extract<1>(a32));
    a1 = _mm_popcnt_u32(extract<2>(a32));
    a1 += _mm_popcnt_u32(extract<3>(a32));
#endif
    uint16<8> r = _mm_cvtsi32_si128(a0);
    r = insert<4>(r, a1);
    return (uint64<2>) r;
#elif SIMDPP_USE_SSE2
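    // _mm_sad_epu8 against zero sums the eight per-byte counts within each
    // 64-bit half of the register.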
    uint8<16> p8 = v_emul_popcnt_u8((uint8<16>) a);
    return _mm_sad_epu8(p8.native(), _mm_setzero_si128());
#elif SIMDPP_USE_NEON
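    // Three widening pairwise adds accumulate the per-byte counts up to
    // 64-bit lanes.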
    uint8x16_t p8 = vcntq_u8(vreinterpretq_u8_u64(a.native()));
    uint16x8_t p16 = vpaddlq_u8(p8);
    uint32x4_t p32 = vpaddlq_u16(p16);
    return vpaddlq_u32(p32);
#elif SIMDPP_USE_VSX_207
    return vec_vpopcnt(a.native());
#elif SIMDPP_USE_MSA
    return (v2u64) __msa_pcnt_d((v2i64) a.native());
#else
    return v_emul_popcnt_u64(a);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint64<4> i_popcnt(const uint64<4>& a)
{
#if SIMDPP_USE_X86_POPCNT_INSN && SIMDPP_64_BITS
    uint64<2> a0, a1;
    split(a, a0, a1);
    a0 = i_popcnt(a0);
    a1 = i_popcnt(a1);
    return combine(a0, a1);
#else
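    // Without 64-bit scalar popcnt, compute per-byte counts and sum them
    // within each 64-bit lane using SAD against zero.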
    uint8<32> p8 = v_emul_popcnt_u8((uint8<32>) a);
    return _mm256_sad_epu8(p8.native(), _mm256_setzero_si256());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint64<8> i_popcnt(const uint64<8>& a)
{
    // TODO: support AVX512VPOPCNTDQ
#if SIMDPP_USE_AVX512BW
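    // Per-byte counts summed within each 64-bit lane via SAD against zero.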
    uint8<64> p8 = v_emul_popcnt_u8((uint8<64>) a);
    return _mm512_sad_epu8(p8.native(), _mm512_setzero_si512());
#else
    return v_emul_popcnt_u64(a);
#endif
}
#endif

// -----------------------------------------------------------------------------

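// Generic case: apply i_popcnt to each register of a multi-register vector.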
template<class V> SIMDPP_INL
V i_popcnt(const V& a)
{
    SIMDPP_VEC_ARRAY_IMPL1(V, i_popcnt, a)
}
} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif