/*  Copyright (C) 2016  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_MUL_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_MUL_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/extract.h>
#include <simdpp/core/f_mul.h>
#include <simdpp/core/i_mul.h>
#include <simdpp/core/move_l.h>
#include <simdpp/core/make_uint.h>
#include <simdpp/core/to_int32.h>
#include <simdpp/detail/mem_block.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {

// forward declarations
template<unsigned N, class E> SIMDPP_INL
int32_t reduce_mul(const int16<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
uint32_t reduce_mul(const uint16<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
int32_t reduce_mul(const int32<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
uint32_t reduce_mul(const uint32<N,E>& a);

namespace detail {
namespace insn {

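// The functions below multiply together all elements of the vector argument.
// 16-bit inputs are first combined into 32-bit products, so the value
// returned is the product of all elements modulo 2^32.
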
static SIMDPP_INL
uint32_t i_reduce_mul(const uint16x8& a)
{
#if SIMDPP_USE_NULL
    uint32_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r *= a.el(i);
    }
    return r;
#elif SIMDPP_USE_SSE2
    uint32x4 ca = (uint32x4) a;
    // shift data zeroing out bits
    uint32x4 al = shift_r<16>(ca);
    uint32x4 ah = shift_l<16>(ca);
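    // Each 32-bit lane of 'ca' holds two adjacent 16-bit elements. In 'al' the
    // high element has been moved into the low half, in 'ah' the low element
    // has been moved into the high half; the vacated halves are zero.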
    // due to zeroing bits in previous steps, the result can be OR'ed
    uint32x4 lo = _mm_mullo_epi16(ca.native(), al.native());
    uint32x4 hi = _mm_mulhi_epu16(ca.native(), ah.native());
    uint32x4 r = bit_or(lo, hi);
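    // each 32-bit lane now holds the full product of its pair of 16-bit
    // elements; fold the four lanes into lane 0 (modulo 2^32)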
    r = _mm_mul_epu32(r.native(), move4_l<1>(r).eval().native());
    r = _mm_mul_epu32(r.native(), move4_l<2>(r).eval().native());
    return extract<0>(r);
#elif SIMDPP_USE_NEON
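    // multiply the low four elements by the high four, widening to 32 bits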
    uint32x4 r = vmull_u16(vget_low_u16(a.native()),
                           vget_high_u16(a.native()));
    r = mul_lo(r, move4_l<2>(r));
    r = mul_lo(r, move4_l<1>(r));
    return extract<0>(r);
#elif SIMDPP_USE_ALTIVEC
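    // vec_mule multiplies the even-indexed elements of its operands; pairing
    // the vector with itself shifted by one element yields the four products
    // of adjacent pairs, widened to 32 bits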
    uint16x8 a2 = move8_l<1>(a);
    uint32x4 r = vec_mule(a2.native(), a.native());
    mem_block<uint32x4> b = r;
    return b[0] * b[1] * b[2] * b[3];
#elif SIMDPP_USE_MSA
    uint32<8> a32 = to_uint32(a);
    uint32<4> r = mul_lo(a32.vec(0), a32.vec(1));
    r = mul_lo(r, move4_l<2>(r));
    r = mul_lo(r, move4_l<1>(r));
    return extract<0>(r);
#endif
}
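
// Usage sketch (hypothetical values), assuming the public simdpp::reduce_mul
// wrapper dispatches to these implementations:
//   uint16<8> v = make_uint(1, 2, 3, 4, 5, 6, 7, 8);
//   uint32_t p = reduce_mul(v); // 40320, i.e. 8! modulo 2^32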

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32_t i_reduce_mul(const uint16x16& a)
{
    uint32x8 ca = (uint32x8) a;
    // shift data zeroing out bits
    uint32x8 al = shift_r<16>(ca);
    uint32x8 ah = shift_l<16>(ca);
    // due to zeroing bits in previous steps, the result can be OR'ed
    uint32x8 lo = _mm256_mullo_epi16(ca.native(), al.native());
    uint32x8 hi = _mm256_mulhi_epu16(ca.native(), ah.native());
    uint32x8 r = bit_or(lo, hi);
    r = _mm256_mul_epu32(r.native(), move4_l<1>(r).eval().native());
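    // combine the two 128-bit halves and finish the reduction in an SSE register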
    uint32x4 r2 = _mm_mul_epu32(detail::extract128<0>(r).native(),
                                detail::extract128<1>(r).native());
    r2 = _mm_mul_epu32(r2.native(), move4_l<2>(r2).eval().native());
    return extract<0>(r2);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint32_t i_reduce_mul(const uint16<32>& a)
{
    uint32<16> ca = (uint32<16>) a;
    // shift data zeroing out bits
    uint32<16> al = shift_r<16>(ca);
    uint32<16> ah = shift_l<16>(ca);
    // due to zeroing bits in previous steps, the result can be OR'ed
    uint32<16> lo = _mm512_mullo_epi16(ca.native(), al.native());
    uint32<16> hi = _mm512_mulhi_epu16(ca.native(), ah.native());
    uint32<16> r = bit_or(lo, hi);
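    // the 16 per-pair products are folded by the 32-bit reduce_mul overload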
    return reduce_mul(r);
}
#endif

template<unsigned N>
SIMDPP_INL uint32_t i_reduce_mul(const uint16<N>& a)
{
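    // Vectors spanning multiple native registers: combine per-register
    // results, then perform a single final reduction.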
#if SIMDPP_USE_NULL
    uint32_t r = 1;
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r *= a.vec(j).el(i);
        }
    }
    return r;
#elif SIMDPP_USE_AVX512BW
    uint32<16> prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32<16> ca = (uint32<16>) a.vec(j);
        // shift data zeroing out bits
        uint32<16> al = shift_r<16>(ca);
        uint32<16> ah = shift_l<16>(ca);
        // due to zeroing bits in previous steps, the result can be OR'ed
        uint32<16> lo = _mm512_mullo_epi16(ca.native(), al.native());
        uint32<16> hi = _mm512_mulhi_epu16(ca.native(), ah.native());
        uint32<16> iprod = bit_or(lo, hi);
        iprod = _mm512_mul_epu32(iprod.native(), move4_l<1>(iprod).eval().native());
        prod = _mm512_mul_epu32(prod.native(), iprod.native());
    }
    return reduce_mul(prod);
#elif SIMDPP_USE_AVX2
    uint32x8 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32x8 ca = (uint32x8) a.vec(j);
        // shift data zeroing out bits
        uint32x8 al = shift_r<16>(ca);
        uint32x8 ah = shift_l<16>(ca);
        // due to zeroing bits in previous steps, the result can be OR'ed
        uint32x8 lo = _mm256_mullo_epi16(ca.native(), al.native());
        uint32x8 hi = _mm256_mulhi_epu16(ca.native(), ah.native());
        uint32x8 iprod = bit_or(lo, hi);
        iprod = _mm256_mul_epu32(iprod.native(), move4_l<1>(iprod).eval().native());
        prod = _mm256_mul_epu32(prod.native(), iprod.native());
    }
    uint32x4 r2 = _mm_mul_epu32(detail::extract128<0>(prod).native(),
                                detail::extract128<1>(prod).native());
    r2 = _mm_mul_epu32(r2.native(), move4_l<2>(r2).eval().native());
    return extract<0>(r2);
#elif SIMDPP_USE_SSE2
    uint32x4 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32x4 ca = (uint32x4) a.vec(j);
        // shift data zeroing out bits
        uint32x4 al = shift_r<16>(ca);
        uint32x4 ah = shift_l<16>(ca);
        // due to zeroing bits in previous steps, the result can be OR'ed
        uint32x4 lo = _mm_mullo_epi16(ca.native(), al.native());
        uint32x4 hi = _mm_mulhi_epu16(ca.native(), ah.native());
        uint32x4 r = bit_or(lo, hi);
        r = _mm_mul_epu32(r.native(), move4_l<1>(r).eval().native());
        prod = _mm_mul_epu32(prod.native(), r.native());
    }
    prod = _mm_mul_epu32(prod.native(), move4_l<2>(prod).eval().native());
    return extract<0>(prod);
#elif SIMDPP_USE_NEON
    uint32x4 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32x4 r = vmull_u16(vget_low_u16(a.vec(j).native()),
                               vget_high_u16(a.vec(j).native()));
        prod = mul_lo(prod, r);
    }
    prod = mul_lo(prod, move4_l<2>(prod));
    prod = mul_lo(prod, move4_l<1>(prod));
    return extract<0>(prod);
#elif SIMDPP_USE_ALTIVEC
    uint32_t r = i_reduce_mul(a.vec(0));
    for (unsigned j = 1; j < a.vec_length; ++j) {
        r *= i_reduce_mul(a.vec(j));
    }
    return r;
#elif SIMDPP_USE_MSA
    uint32x4 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32<8> a32 = to_uint32(a.vec(j));
        prod = mul_lo(prod, mul_lo(a32.vec(0), a32.vec(1)));
    }
    prod = mul_lo(prod, move4_l<2>(prod));
    prod = mul_lo(prod, move4_l<1>(prod));
    return extract<0>(prod);
#endif
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
int32_t i_reduce_mul(const int16x8& a)
{
#if SIMDPP_USE_NULL
    int32_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r *= a.el(i);
    }
    return r;
#elif SIMDPP_USE_SSE2
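    // same trick as the unsigned case; mullo is sign-agnostic and the signed
    // high-half multiply supplies the upper 16 bits of each two's-complement
    // product, so each 32-bit lane again holds a full pairwise product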
    uint32x4 ca = (uint32x4) a;
    // shift data zeroing out bits
    uint32x4 al = shift_r<16>(ca);
    uint32x4 ah = shift_l<16>(ca);
    // due to zeroing bits in previous steps, the result can be OR'ed
    uint32x4 lo = _mm_mullo_epi16(ca.native(), al.native());
    uint32x4 hi = _mm_mulhi_epi16(ca.native(), ah.native());
    uint32x4 r = bit_or(lo, hi);
    r = _mm_mul_epu32(r.native(), move4_l<1>(r).eval().native());
    r = _mm_mul_epu32(r.native(), move4_l<2>(r).eval().native());
    return extract<0>(r);
#elif SIMDPP_USE_NEON
    int32x4 r = vmull_s16(vget_low_s16(a.native()), vget_high_s16(a.native()));
    r = mul_lo(r, move4_l<2>(r));
    r = mul_lo(r, move4_l<1>(r));
    return extract<0>(r);
#elif SIMDPP_USE_ALTIVEC
    int16x8 a2 = move8_l<1>(a);
    int32x4 r = vec_mule(a2.native(), a.native());
    mem_block<int32x4> b = r;
    return b[0] * b[1] * b[2] * b[3];
#elif SIMDPP_USE_MSA
    int32<8> a32 = to_int32(a);
    int32<4> r = mul_lo(a32.vec(0), a32.vec(1));
    r = mul_lo(r, move4_l<2>(r));
    r = mul_lo(r, move4_l<1>(r));
    return extract<0>(r);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int32_t i_reduce_mul(const int16x16& a)
{
    uint32x8 ca = (uint32x8) a;
    // shift data zeroing out bits
    uint32x8 al = shift_r<16>(ca);
    uint32x8 ah = shift_l<16>(ca);
    // due to zeroing bits in previous steps, the result can be OR'ed
    uint32x8 lo = _mm256_mullo_epi16(ca.native(), al.native());
    uint32x8 hi = _mm256_mulhi_epi16(ca.native(), ah.native());
    uint32x8 r = bit_or(lo, hi);
    r = _mm256_mul_epu32(r.native(), move4_l<1>(r).eval().native());
    uint32x4 r2 = _mm_mul_epu32(detail::extract128<0>(r).native(),
                                detail::extract128<1>(r).native());
    r2 = _mm_mul_epu32(r2.native(), move4_l<2>(r2).eval().native());
    return extract<0>(r2);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL int32_t i_reduce_mul(const int16<32>& a)
{
    uint32<16> ca = (uint32<16>) a;
    // shift data zeroing out bits
    uint32<16> al = shift_r<16>(ca);
    uint32<16> ah = shift_l<16>(ca);
    // due to zeroing bits in previous steps, the result can be OR'ed
    uint32<16> lo = _mm512_mullo_epi16(ca.native(), al.native());
    uint32<16> hi = _mm512_mulhi_epi16(ca.native(), ah.native());
    uint32<16> r = bit_or(lo, hi);
    return reduce_mul(r);
}
#endif

template<unsigned N>
SIMDPP_INL int32_t i_reduce_mul(const int16<N>& a)
{
#if SIMDPP_USE_NULL
    uint32_t r = 1;
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r *= a.vec(j).el(i);
        }
    }
    return r;
#elif SIMDPP_USE_AVX512BW
    uint32<16> prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32<16> ca = (uint32<16>) a.vec(j);
        // shift data zeroing out bits
        uint32<16> al = shift_r<16>(ca);
        uint32<16> ah = shift_l<16>(ca);
        // due to zeroing bits in previous steps, the result can be OR'ed
        uint32<16> lo = _mm512_mullo_epi16(ca.native(), al.native());
        uint32<16> hi = _mm512_mulhi_epi16(ca.native(), ah.native());
        uint32<16> iprod = bit_or(lo, hi);
        iprod = _mm512_mul_epu32(iprod.native(), move4_l<1>(iprod).eval().native());
        prod = _mm512_mul_epu32(prod.native(), iprod.native());
    }
    return reduce_mul(prod);
#elif SIMDPP_USE_AVX2
    uint32x8 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32x8 ca = (uint32x8) a.vec(j);
        // shift data zeroing out bits
        uint32x8 al = shift_r<16>(ca);
        uint32x8 ah = shift_l<16>(ca);
        // due to zeroing bits in previous steps, the result can be OR'ed
        uint32x8 lo = _mm256_mullo_epi16(ca.native(), al.native());
        uint32x8 hi = _mm256_mulhi_epi16(ca.native(), ah.native());
        uint32x8 iprod = bit_or(lo, hi);
        iprod = _mm256_mul_epu32(iprod.native(), move4_l<1>(iprod).eval().native());
        prod = _mm256_mul_epu32(prod.native(), iprod.native());
    }
    uint32x4 r2 = _mm_mul_epu32(detail::extract128<0>(prod).native(),
                                detail::extract128<1>(prod).native());
    r2 = _mm_mul_epu32(r2.native(), move4_l<2>(r2).eval().native());
    return extract<0>(r2);
#elif SIMDPP_USE_SSE2
    uint32x4 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32x4 ca = (uint32x4) a.vec(j);
        // shift data zeroing out bits
        uint32x4 al = shift_r<16>(ca);
        uint32x4 ah = shift_l<16>(ca);
        // due to zeroing bits in previous steps, the result can be OR'ed
        uint32x4 lo = _mm_mullo_epi16(ca.native(), al.native());
        uint32x4 hi = _mm_mulhi_epi16(ca.native(), ah.native());
        uint32x4 r = bit_or(lo, hi);
        r = _mm_mul_epu32(r.native(), move4_l<1>(r).eval().native());
        prod = _mm_mul_epu32(prod.native(), r.native());
    }
    prod = _mm_mul_epu32(prod.native(), move4_l<2>(prod).eval().native());
    return extract<0>(prod);
#elif SIMDPP_USE_NEON
    int32x4 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32x4 r = vmull_s16(vget_low_s16(a.vec(j).native()),
                              vget_high_s16(a.vec(j).native()));
        prod = mul_lo(prod, r);
    }
    prod = mul_lo(prod, move4_l<2>(prod));
    prod = mul_lo(prod, move4_l<1>(prod));
    return extract<0>(prod);
#elif SIMDPP_USE_ALTIVEC
    int32_t r = i_reduce_mul(a.vec(0));
    for (unsigned j = 1; j < a.vec_length; ++j) {
        r *= i_reduce_mul(a.vec(j));
    }
    return r;
#elif SIMDPP_USE_MSA
    int32x4 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32<8> a32 = to_int32(a.vec(j));
        prod = mul_lo(prod, mul_lo(a32.vec(0), a32.vec(1)));
    }
    prod = mul_lo(prod, move4_l<2>(prod));
    prod = mul_lo(prod, move4_l<1>(prod));
    return extract<0>(prod);
#endif
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint32_t i_reduce_mul(const uint32x4& a)
{
#if SIMDPP_USE_NULL
    uint32_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r *= a.el(i);
    }
    return r;
#elif SIMDPP_USE_SSE2
    uint32x4 r = _mm_mul_epu32(a.native(), move4_l<1>(a).eval().native());
    r = _mm_mul_epu32(r.native(), move4_l<2>(r).eval().native());
    return extract<0>(r);
#elif SIMDPP_USE_NEON || SIMDPP_USE_MSA
    uint32x4 r = a;
    r = mul_lo(r, move4_l<2>(r));
    r = mul_lo(r, move4_l<1>(r));
    return extract<0>(r);
#elif SIMDPP_USE_ALTIVEC
    mem_block<uint32x4> b = a;
    return b[0] * b[1] * b[2] * b[3];
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32_t i_reduce_mul(const uint32x8& a)
{
    uint32x8 ra = _mm256_mul_epu32(a.native(), move4_l<1>(a).eval().native());
    uint32x4 r = _mm_mul_epu32(detail::extract128<0>(ra).native(),
                               detail::extract128<1>(ra).native());
    r = _mm_mul_epu32(r.native(), move4_l<2>(r).eval().native());
    return extract<0>(r);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint32_t i_reduce_mul(const uint32<16>& a)
{
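    // multiply the two 256-bit halves element-wise and reduce the resulting
    // 8-element vector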
    return i_reduce_mul(mul_lo(extract256<0>(a), extract256<1>(a)));
}
#endif

template<unsigned N>
SIMDPP_INL uint32_t i_reduce_mul(const uint32<N>& a)
{
#if SIMDPP_USE_NULL
    uint32_t r = 1;
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r *= a.vec(j).el(i);
        }
    }
    return r;
#elif SIMDPP_USE_AVX2
    uint32x8 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32x8 ai = a.vec(j);
        uint32x8 ra = _mm256_mul_epu32(ai.native(), move4_l<1>(ai).eval().native());
        prod = _mm256_mul_epu32(prod.native(), ra.native());
    }
    uint32x4 r = _mm_mul_epu32(detail::extract128<0>(prod).native(),
                               detail::extract128<1>(prod).native());
    r = _mm_mul_epu32(r.native(), move4_l<2>(r).eval().native());
    return extract<0>(r);
#elif SIMDPP_USE_SSE2
    uint32x4 r = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32x4 ai = a.vec(j);
        uint32x4 ra = _mm_mul_epu32(ai.native(), move4_l<1>(ai).eval().native());
        r = _mm_mul_epu32(r.native(), ra.native());
    }
    r = _mm_mul_epu32(r.native(), move4_l<2>(r).eval().native());
    return extract<0>(r);
#elif SIMDPP_USE_NEON || SIMDPP_USE_MSA
    uint32x4 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        prod = mul_lo(prod, a.vec(j));
    }
    prod = mul_lo(prod, move4_l<2>(prod));
    prod = mul_lo(prod, move4_l<1>(prod));
    return extract<0>(prod);
#elif SIMDPP_USE_ALTIVEC
    uint32_t r = i_reduce_mul(a.vec(0));
    for (unsigned j = 1; j < a.vec_length; ++j) {
        r *= i_reduce_mul(a.vec(j));
    }
    return r;
#endif
}

// -----------------------------------------------------------------------------


} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif