1 | /* Copyright (C) 2017 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_POPCNT_H |
9 | #define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_POPCNT_H |
10 | |
11 | #ifndef LIBSIMDPP_SIMD_H |
12 | #error "This file must be included through simd.h" |
13 | #endif |
14 | |
15 | #include <simdpp/types.h> |
16 | #include <simdpp/detail/null/bitwise.h> |
17 | #include <simdpp/core/i_popcnt.h> |
18 | #include <simdpp/core/i_reduce_add.h> |
19 | |
20 | namespace simdpp { |
21 | namespace SIMDPP_ARCH_NAMESPACE { |
22 | namespace detail { |
23 | namespace insn { |
24 | |
25 | static SIMDPP_INL |
26 | uint32_t i_reduce_popcnt(const uint32<4>& a) |
27 | { |
28 | #if SIMDPP_USE_NULL |
29 | uint32_t r = 0; |
30 | for (unsigned i = 0; i < a.length; i++) { |
31 | r += detail::null::el_popcnt32(a.el(i)); |
32 | } |
33 | return r; |
34 | #elif SIMDPP_USE_X86_POPCNT_INSN |
35 | uint32_t r = 0; |
36 | #if SIMDPP_64_BITS |
37 | uint64<2> a64; a64 = a; |
38 | r += _mm_popcnt_u64(extract<0>(a64)); |
39 | r += _mm_popcnt_u64(extract<1>(a64)); |
40 | #else |
41 | r += _mm_popcnt_u32(extract<0>(a)); |
42 | r += _mm_popcnt_u32(extract<1>(a)); |
43 | r += _mm_popcnt_u32(extract<2>(a)); |
44 | r += _mm_popcnt_u32(extract<3>(a)); |
45 | #endif |
46 | return r; |
47 | #elif SIMDPP_USE_NEON |
48 | uint8<16> r = vcntq_u8(vreinterpretq_u8_u32(a.native())); |
49 | return reduce_add(r); |
50 | #elif SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA |
51 | uint64<2> a64; a64 = a; |
52 | a64 = popcnt(a64); |
53 | return reduce_add(a64); |
54 | #elif SIMDPP_USE_SSE2 |
55 | uint64<2> r = popcnt((uint64<2>)a); |
56 | return (uint32_t) reduce_add(r); |
57 | #else |
58 | uint32<4> r = popcnt(a); |
59 | return reduce_add(r); |
60 | #endif |
61 | } |
62 | |
#if SIMDPP_USE_AVX2
/*  Returns the total number of set bits in the eight 32-bit elements of @a a.

    Without the scalar POPCNT instruction on a 64-bit target, the vector is
    counted as 64-bit elements and the counts are summed. Otherwise it is
    split into two uint32<4> halves, which are handled by the uint32<4>
    overload.
*/
static SIMDPP_INL
uint32_t i_reduce_popcnt(const uint32<8>& a)
{
#if !(SIMDPP_USE_X86_POPCNT_INSN && SIMDPP_64_BITS)
    uint64<4> counts = popcnt(static_cast<uint64<4> >(a));
    return static_cast<uint32_t>(reduce_add(counts));
#else
    uint32<4> first_half, second_half;
    split(a, first_half, second_half);
    return i_reduce_popcnt(first_half) + i_reduce_popcnt(second_half);
#endif
}
#endif
77 | |
#if SIMDPP_USE_AVX512F
/*  Returns the total number of set bits in the sixteen 32-bit elements of
    @a a.

    With the scalar POPCNT instruction on a 64-bit target, the vector is split
    into two uint32<8> halves, which are handled by the uint32<8> overload.
    Otherwise it is counted as 64-bit elements and the counts are summed. The
    vector is 16 x 32 = 512 bits wide, so the total is at most 512 and the
    explicit narrowing to uint32_t cannot lose information.
*/
static SIMDPP_INL
uint32_t i_reduce_popcnt(const uint32<16>& a)
{
#if SIMDPP_USE_X86_POPCNT_INSN && SIMDPP_64_BITS
    uint32<8> a0, a1;
    split(a, a0, a1);
    return i_reduce_popcnt(a0) + i_reduce_popcnt(a1);
#else
    // TODO: support AVX512VPOPCNTDQ
    uint64<8> r = popcnt((uint64<8>)a);
    // Narrow explicitly, as the SSE2 and AVX2 paths already do.
    return static_cast<uint32_t>(reduce_add(r));
#endif
}
#endif
93 | |
94 | template<unsigned N> SIMDPP_INL |
95 | uint32_t i_reduce_popcnt(const uint32<N>& a) |
96 | { |
97 | uint32_t r = 0; |
98 | for (unsigned j = 0; j < a.vec_length; ++j) { |
99 | r += i_reduce_popcnt(a.vec(j)); |
100 | } |
101 | return r; |
102 | } |
103 | |
104 | // ----------------------------------------------------------------------------- |
105 | |
106 | } // namespace insn |
107 | } // namespace detail |
108 | } // namespace SIMDPP_ARCH_NAMESPACE |
109 | } // namespace simdpp |
110 | |
111 | #endif |
112 | |
113 | |