1 | /* Copyright (C) 2016 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_OR_H |
9 | #define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_OR_H |
10 | |
11 | #ifndef LIBSIMDPP_SIMD_H |
12 | #error "This file must be included through simd.h" |
13 | #endif |
14 | |
15 | #include <simdpp/types.h> |
16 | #include <simdpp/core/bit_or.h> |
17 | #include <simdpp/core/extract.h> |
18 | #include <simdpp/core/move_l.h> |
19 | #include <simdpp/core/make_uint.h> |
20 | |
21 | namespace simdpp { |
22 | namespace SIMDPP_ARCH_NAMESPACE { |
23 | namespace detail { |
24 | namespace insn { |
25 | |
26 | static SIMDPP_INL |
27 | uint8_t i_reduce_or(const uint8x16& a) |
28 | { |
29 | #if SIMDPP_USE_NULL |
30 | uint8_t r = a.el(0); |
31 | for (unsigned i = 1; i < a.length; i++) { |
32 | r |= a.el(i); |
33 | } |
34 | return r; |
35 | #elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA |
36 | uint8x16 r = bit_or(a, move16_l<8>(a)); |
37 | r = bit_or(r, move16_l<4>(r)); |
38 | r = bit_or(r, move16_l<2>(r)); |
39 | r = bit_or(r, move16_l<1>(r)); |
40 | return extract<0>(r); |
41 | #endif |
42 | } |
43 | |
44 | #if SIMDPP_USE_AVX2 |
45 | static SIMDPP_INL |
46 | uint8_t i_reduce_or(const uint8x32& a) |
47 | { |
48 | uint8x16 r = detail::extract128<0>(a); |
49 | r = bit_or(r, detail::extract128<1>(a)); |
50 | return i_reduce_or(r); |
51 | } |
52 | #endif |
53 | |
54 | #if SIMDPP_USE_AVX512BW |
55 | SIMDPP_INL uint8_t i_reduce_or(const uint8<64>& a) |
56 | { |
57 | uint8<32> r = detail::extract256<0>(a); |
58 | r = bit_or(r, detail::extract256<1>(a)); |
59 | return i_reduce_or(r); |
60 | } |
61 | #endif |
62 | |
63 | template<unsigned N> |
64 | SIMDPP_INL uint8_t i_reduce_or(const uint8<N>& a) |
65 | { |
66 | #if SIMDPP_USE_NULL |
67 | uint8_t r = 0; |
68 | for (unsigned j = 0; j < a.vec_length; ++j) { |
69 | for (unsigned i = 0; i < a.base_length; i++) { |
70 | r |= a.vec(j).el(i); |
71 | } |
72 | } |
73 | return r; |
74 | #else |
75 | uint8v r = a.vec(0); |
76 | for (unsigned j = 1; j < a.vec_length; ++j) { |
77 | r = bit_or(r, a.vec(j)); |
78 | } |
79 | return i_reduce_or(r); |
80 | #endif |
81 | } |
82 | |
83 | // ----------------------------------------------------------------------------- |
84 | |
85 | static SIMDPP_INL |
86 | uint16_t i_reduce_or(const uint16x8& a) |
87 | { |
88 | #if SIMDPP_USE_NULL |
89 | uint16_t r = a.el(0); |
90 | for (unsigned i = 0; i < a.length; i++) { |
91 | r |= a.el(i); |
92 | } |
93 | return r; |
94 | #elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA |
95 | uint16x8 r = bit_or(a, move8_l<4>(a)); |
96 | r = bit_or(r, move8_l<2>(r)); |
97 | r = bit_or(r, move8_l<1>(r)); |
98 | return extract<0>(r); |
99 | #endif |
100 | } |
101 | |
102 | #if SIMDPP_USE_AVX2 |
103 | static SIMDPP_INL |
104 | uint16_t i_reduce_or(const uint16x16& a) |
105 | { |
106 | uint16x8 r = detail::extract128<0>(a); |
107 | r = bit_or(r, detail::extract128<1>(a)); |
108 | return i_reduce_or(r); |
109 | } |
110 | #endif |
111 | |
112 | #if SIMDPP_USE_AVX512BW |
113 | SIMDPP_INL uint16_t i_reduce_or(const uint16<32>& a) |
114 | { |
115 | uint16<16> r = detail::extract256<0>(a); |
116 | r = bit_or(r, detail::extract256<1>(a)); |
117 | return i_reduce_or(r); |
118 | } |
119 | #endif |
120 | |
121 | template<unsigned N> |
122 | SIMDPP_INL uint16_t i_reduce_or(const uint16<N>& a) |
123 | { |
124 | #if SIMDPP_USE_NULL |
125 | uint16_t r = 0; |
126 | for (unsigned j = 0; j < a.vec_length; ++j) { |
127 | for (unsigned i = 0; i < a.base_length; i++) { |
128 | r |= a.vec(j).el(i); |
129 | } |
130 | } |
131 | return r; |
132 | #else |
133 | uint16v r = a.vec(0); |
134 | for (unsigned j = 1; j < a.vec_length; ++j) { |
135 | r = bit_or(r, a.vec(j)); |
136 | } |
137 | return i_reduce_or(r); |
138 | #endif |
139 | } |
140 | |
141 | // ----------------------------------------------------------------------------- |
142 | |
143 | static SIMDPP_INL |
144 | uint32_t i_reduce_or(const uint32x4& a) |
145 | { |
146 | #if SIMDPP_USE_NULL |
147 | uint32_t r = a.el(0); |
148 | for (unsigned i = 0; i < a.length; i++) { |
149 | r |= a.el(i); |
150 | } |
151 | return r; |
152 | #elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA |
153 | uint32x4 r = bit_or(a, move4_l<2>(a)); |
154 | r = bit_or(r, move4_l<1>(r)); |
155 | return extract<0>(r); |
156 | #endif |
157 | } |
158 | |
159 | #if SIMDPP_USE_AVX2 |
160 | static SIMDPP_INL |
161 | uint32_t i_reduce_or(const uint32x8& a) |
162 | { |
163 | uint32x4 r = detail::extract128<0>(a); |
164 | r = bit_or(r, detail::extract128<1>(a)); |
165 | r = bit_or(r, move4_l<2>(r)); |
166 | r = bit_or(r, move4_l<1>(r)); |
167 | return extract<0>(r); |
168 | } |
169 | #endif |
170 | |
171 | #if SIMDPP_USE_AVX512F |
172 | static SIMDPP_INL |
173 | uint32_t i_reduce_or(const uint32<16>& a) |
174 | { |
175 | return i_reduce_or(bit_or(extract256<0>(a), extract256<1>(a))); |
176 | } |
177 | #endif |
178 | |
179 | template<unsigned N> |
180 | SIMDPP_INL uint32_t i_reduce_or(const uint32<N>& a) |
181 | { |
182 | #if SIMDPP_USE_NULL |
183 | uint32_t r = 0; |
184 | for (unsigned j = 0; j < a.vec_length; ++j) { |
185 | for (unsigned i = 0; i < a.base_length; i++) { |
186 | r |= a.vec(j).el(i); |
187 | } |
188 | } |
189 | return r; |
190 | #else |
191 | uint32v r = a.vec(0); |
192 | for (unsigned j = 1; j < a.vec_length; ++j) { |
193 | r = bit_or(r, a.vec(j)); |
194 | } |
195 | return i_reduce_or(r); |
196 | #endif |
197 | } |
198 | |
199 | // ----------------------------------------------------------------------------- |
200 | |
201 | static SIMDPP_INL |
202 | uint64_t i_reduce_or(const uint64x2& a) |
203 | { |
204 | #if SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA |
205 | uint64x2 r = bit_or(a, move2_l<1>(a)); |
206 | return extract<0>(r); |
207 | #elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC |
208 | uint64_t r = a.el(0); |
209 | for (unsigned i = 0; i < a.length; i++) { |
210 | r |= a.el(i); |
211 | } |
212 | return r; |
213 | #endif |
214 | } |
215 | |
216 | #if SIMDPP_USE_AVX2 |
217 | static SIMDPP_INL |
218 | uint64_t i_reduce_or(const uint64x4& a) |
219 | { |
220 | uint64x2 r = detail::extract128<0>(a); |
221 | r = bit_or(r, detail::extract128<1>(a)); |
222 | r = bit_or(r, move2_l<1>(r)); |
223 | return extract<0>(r); |
224 | } |
225 | #endif |
226 | |
227 | #if SIMDPP_USE_AVX512F |
228 | static SIMDPP_INL |
229 | uint64_t i_reduce_or(const uint64<8>& a) |
230 | { |
231 | return i_reduce_or(bit_or(extract256<0>(a), extract256<1>(a))); |
232 | } |
233 | #endif |
234 | |
235 | template<unsigned N> |
236 | SIMDPP_INL uint64_t i_reduce_or(const uint64<N>& a) |
237 | { |
238 | #if SIMDPP_USE_NULL |
239 | uint64_t r = 0; |
240 | for (unsigned j = 0; j < a.vec_length; ++j) { |
241 | for (unsigned i = 0; i < a.base_length; i++) { |
242 | r |= a.vec(j).el(i); |
243 | } |
244 | } |
245 | return r; |
246 | #else |
247 | uint64v r = a.vec(0); |
248 | for (unsigned j = 1; j < a.vec_length; ++j) { |
249 | r = bit_or(r, a.vec(j)); |
250 | } |
251 | return i_reduce_or(r); |
252 | #endif |
253 | } |
254 | |
255 | // ----------------------------------------------------------------------------- |
256 | |
257 | } // namespace insn |
258 | } // namespace detail |
259 | } // namespace SIMDPP_ARCH_NAMESPACE |
260 | } // namespace simdpp |
261 | |
262 | #endif |
263 | |
264 | |