/*  Copyright (C) 2016  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_MUL_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_MUL_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/extract.h>
#include <simdpp/core/f_mul.h>
#include <simdpp/core/i_mul.h>
#include <simdpp/core/move_l.h>
#include <simdpp/core/make_uint.h>
#include <simdpp/core/to_int32.h>
#include <simdpp/detail/mem_block.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {

// forward declarations
template<unsigned N, class E> SIMDPP_INL
int32_t reduce_mul(const int16<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
uint32_t reduce_mul(const uint16<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
int32_t reduce_mul(const int32<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
uint32_t reduce_mul(const uint32<N,E>& a);

namespace detail {
namespace insn {

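// The functions below multiply together all elements of the vector argument.
// 16-bit inputs are first combined into 32-bit products, so the value
// returned is the product of all elements modulo 2^32.
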
static SIMDPP_INL
uint32_t i_reduce_mul(const uint16x8& a)
{
#if SIMDPP_USE_NULL
    uint32_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r *= a.el(i);
    }
    return r;
#elif SIMDPP_USE_SSE2
    uint32x4 ca = (uint32x4) a;
    // shift data zeroing out bits
    uint32x4 al = shift_r<16>(ca);
    uint32x4 ah = shift_l<16>(ca);
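    // Each 32-bit lane of 'ca' holds two adjacent 16-bit elements. In 'al' the
    // high element has been moved into the low half, in 'ah' the low element
    // has been moved into the high half; the vacated halves are zero.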
    // due to zeroing bits in previous steps, the result can be OR'ed
    uint32x4 lo = _mm_mullo_epi16(ca.native(), al.native());
    uint32x4 hi = _mm_mulhi_epu16(ca.native(), ah.native());
    uint32x4 r = bit_or(lo, hi);
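    // each 32-bit lane now holds the full product of its pair of 16-bit
    // elements; fold the four lanes into lane 0 (modulo 2^32)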
    r = _mm_mul_epu32(r.native(), move4_l<1>(r).eval().native());
    r = _mm_mul_epu32(r.native(), move4_l<2>(r).eval().native());
    return extract<0>(r);
#elif SIMDPP_USE_NEON
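    // multiply the low four elements by the high four, widening to 32 bits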
    uint32x4 r = vmull_u16(vget_low_u16(a.native()),
                           vget_high_u16(a.native()));
    r = mul_lo(r, move4_l<2>(r));
    r = mul_lo(r, move4_l<1>(r));
    return extract<0>(r);
#elif SIMDPP_USE_ALTIVEC
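    // vec_mule multiplies the even-indexed elements of its operands; pairing
    // the vector with itself shifted by one element yields the four products
    // of adjacent pairs, widened to 32 bits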
    uint16x8 a2 = move8_l<1>(a);
    uint32x4 r = vec_mule(a2.native(), a.native());
    mem_block<uint32x4> b = r;
    return b[0] * b[1] * b[2] * b[3];
#elif SIMDPP_USE_MSA
    uint32<8> a32 = to_uint32(a);
    uint32<4> r = mul_lo(a32.vec(0), a32.vec(1));
    r = mul_lo(r, move4_l<2>(r));
    r = mul_lo(r, move4_l<1>(r));
    return extract<0>(r);
#endif
}
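
// Usage sketch (hypothetical values), assuming the public simdpp::reduce_mul
// wrapper dispatches to these implementations:
//   uint16<8> v = make_uint(1, 2, 3, 4, 5, 6, 7, 8);
//   uint32_t p = reduce_mul(v); // 40320, i.e. 8! modulo 2^32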

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32_t i_reduce_mul(const uint16x16& a)
{
    uint32x8 ca = (uint32x8) a;
    // shift data zeroing out bits
    uint32x8 al = shift_r<16>(ca);
    uint32x8 ah = shift_l<16>(ca);
    // due to zeroing bits in previous steps, the result can be OR'ed
    uint32x8 lo = _mm256_mullo_epi16(ca.native(), al.native());
    uint32x8 hi = _mm256_mulhi_epu16(ca.native(), ah.native());
    uint32x8 r = bit_or(lo, hi);
    r = _mm256_mul_epu32(r.native(), move4_l<1>(r).eval().native());
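    // combine the two 128-bit halves and finish the reduction in an SSE register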
    uint32x4 r2 = _mm_mul_epu32(detail::extract128<0>(r).native(),
                                detail::extract128<1>(r).native());
    r2 = _mm_mul_epu32(r2.native(), move4_l<2>(r2).eval().native());
    return extract<0>(r2);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint32_t i_reduce_mul(const uint16<32>& a)
{
    uint32<16> ca = (uint32<16>) a;
    // shift data zeroing out bits
    uint32<16> al = shift_r<16>(ca);
    uint32<16> ah = shift_l<16>(ca);
    // due to zeroing bits in previous steps, the result can be OR'ed
    uint32<16> lo = _mm512_mullo_epi16(ca.native(), al.native());
    uint32<16> hi = _mm512_mulhi_epu16(ca.native(), ah.native());
    uint32<16> r = bit_or(lo, hi);
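    // the 16 per-pair products are folded by the 32-bit reduce_mul overload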
    return reduce_mul(r);
}
#endif

template<unsigned N>
SIMDPP_INL uint32_t i_reduce_mul(const uint16<N>& a)
{
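    // Vectors spanning multiple native registers: combine per-register
    // results, then perform a single final reduction.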
#if SIMDPP_USE_NULL
    uint32_t r = 1;
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r *= a.vec(j).el(i);
        }
    }
    return r;
#elif SIMDPP_USE_AVX512BW
    uint32<16> prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32<16> ca = (uint32<16>) a.vec(j);
        // shift data zeroing out bits
        uint32<16> al = shift_r<16>(ca);
        uint32<16> ah = shift_l<16>(ca);
        // due to zeroing bits in previous steps, the result can be OR'ed
        uint32<16> lo = _mm512_mullo_epi16(ca.native(), al.native());
        uint32<16> hi = _mm512_mulhi_epu16(ca.native(), ah.native());
        uint32<16> iprod = bit_or(lo, hi);
        iprod = _mm512_mul_epu32(iprod.native(), move4_l<1>(iprod).eval().native());
        prod = _mm512_mul_epu32(prod.native(), iprod.native());
    }
    return reduce_mul(prod);
#elif SIMDPP_USE_AVX2
    uint32x8 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32x8 ca = (uint32x8) a.vec(j);
        // shift data zeroing out bits
        uint32x8 al = shift_r<16>(ca);
        uint32x8 ah = shift_l<16>(ca);
        // due to zeroing bits in previous steps, the result can be OR'ed
        uint32x8 lo = _mm256_mullo_epi16(ca.native(), al.native());
        uint32x8 hi = _mm256_mulhi_epu16(ca.native(), ah.native());
        uint32x8 iprod = bit_or(lo, hi);
        iprod = _mm256_mul_epu32(iprod.native(), move4_l<1>(iprod).eval().native());
        prod = _mm256_mul_epu32(prod.native(), iprod.native());
    }
    uint32x4 r2 = _mm_mul_epu32(detail::extract128<0>(prod).native(),
                                detail::extract128<1>(prod).native());
    r2 = _mm_mul_epu32(r2.native(), move4_l<2>(r2).eval().native());
    return extract<0>(r2);
#elif SIMDPP_USE_SSE2
    uint32x4 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32x4 ca = (uint32x4) a.vec(j);
        // shift data zeroing out bits
        uint32x4 al = shift_r<16>(ca);
        uint32x4 ah = shift_l<16>(ca);
        // due to zeroing bits in previous steps, the result can be OR'ed
        uint32x4 lo = _mm_mullo_epi16(ca.native(), al.native());
        uint32x4 hi = _mm_mulhi_epu16(ca.native(), ah.native());
        uint32x4 r = bit_or(lo, hi);
        r = _mm_mul_epu32(r.native(), move4_l<1>(r).eval().native());
        prod = _mm_mul_epu32(prod.native(), r.native());
    }
    prod = _mm_mul_epu32(prod.native(), move4_l<2>(prod).eval().native());
    return extract<0>(prod);
#elif SIMDPP_USE_NEON
    uint32x4 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32x4 r = vmull_u16(vget_low_u16(a.vec(j).native()),
                               vget_high_u16(a.vec(j).native()));
        prod = mul_lo(prod, r);
    }
    prod = mul_lo(prod, move4_l<2>(prod));
    prod = mul_lo(prod, move4_l<1>(prod));
    return extract<0>(prod);
#elif SIMDPP_USE_ALTIVEC
    uint32_t r = i_reduce_mul(a.vec(0));
    for (unsigned j = 1; j < a.vec_length; ++j) {
        r *= i_reduce_mul(a.vec(j));
    }
    return r;
#elif SIMDPP_USE_MSA
    uint32x4 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32<8> a32 = to_uint32(a.vec(j));
        prod = mul_lo(prod, mul_lo(a32.vec(0), a32.vec(1)));
    }
    prod = mul_lo(prod, move4_l<2>(prod));
    prod = mul_lo(prod, move4_l<1>(prod));
    return extract<0>(prod);
#endif
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
int32_t i_reduce_mul(const int16x8& a)
{
#if SIMDPP_USE_NULL
    int32_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r *= a.el(i);
    }
    return r;
#elif SIMDPP_USE_SSE2
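    // same trick as the unsigned case; mullo is sign-agnostic and the signed
    // high-half multiply supplies the upper 16 bits of each two's-complement
    // product, so each 32-bit lane again holds a full pairwise product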
    uint32x4 ca = (uint32x4) a;
    // shift data zeroing out bits
    uint32x4 al = shift_r<16>(ca);
    uint32x4 ah = shift_l<16>(ca);
    // due to zeroing bits in previous steps, the result can be OR'ed
    uint32x4 lo = _mm_mullo_epi16(ca.native(), al.native());
    uint32x4 hi = _mm_mulhi_epi16(ca.native(), ah.native());
    uint32x4 r = bit_or(lo, hi);
    r = _mm_mul_epu32(r.native(), move4_l<1>(r).eval().native());
    r = _mm_mul_epu32(r.native(), move4_l<2>(r).eval().native());
    return extract<0>(r);
#elif SIMDPP_USE_NEON
    int32x4 r = vmull_s16(vget_low_s16(a.native()), vget_high_s16(a.native()));
    r = mul_lo(r, move4_l<2>(r));
    r = mul_lo(r, move4_l<1>(r));
    return extract<0>(r);
#elif SIMDPP_USE_ALTIVEC
    int16x8 a2 = move8_l<1>(a);
    int32x4 r = vec_mule(a2.native(), a.native());
    mem_block<int32x4> b = r;
    return b[0] * b[1] * b[2] * b[3];
#elif SIMDPP_USE_MSA
    int32<8> a32 = to_int32(a);
    int32<4> r = mul_lo(a32.vec(0), a32.vec(1));
    r = mul_lo(r, move4_l<2>(r));
    r = mul_lo(r, move4_l<1>(r));
    return extract<0>(r);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int32_t i_reduce_mul(const int16x16& a)
{
    uint32x8 ca = (uint32x8) a;
    // shift data zeroing out bits
    uint32x8 al = shift_r<16>(ca);
    uint32x8 ah = shift_l<16>(ca);
    // due to zeroing bits in previous steps, the result can be OR'ed
    uint32x8 lo = _mm256_mullo_epi16(ca.native(), al.native());
    uint32x8 hi = _mm256_mulhi_epi16(ca.native(), ah.native());
    uint32x8 r = bit_or(lo, hi);
    r = _mm256_mul_epu32(r.native(), move4_l<1>(r).eval().native());
    uint32x4 r2 = _mm_mul_epu32(detail::extract128<0>(r).native(),
                                detail::extract128<1>(r).native());
    r2 = _mm_mul_epu32(r2.native(), move4_l<2>(r2).eval().native());
    return extract<0>(r2);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL int32_t i_reduce_mul(const int16<32>& a)
{
    uint32<16> ca = (uint32<16>) a;
    // shift data zeroing out bits
    uint32<16> al = shift_r<16>(ca);
    uint32<16> ah = shift_l<16>(ca);
    // due to zeroing bits in previous steps, the result can be OR'ed
    uint32<16> lo = _mm512_mullo_epi16(ca.native(), al.native());
    uint32<16> hi = _mm512_mulhi_epi16(ca.native(), ah.native());
    uint32<16> r = bit_or(lo, hi);
    return reduce_mul(r);
}
#endif

template<unsigned N>
SIMDPP_INL int32_t i_reduce_mul(const int16<N>& a)
{
#if SIMDPP_USE_NULL
    uint32_t r = 1;
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r *= a.vec(j).el(i);
        }
    }
    return r;
#elif SIMDPP_USE_AVX512BW
    uint32<16> prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32<16> ca = (uint32<16>) a.vec(j);
        // shift data zeroing out bits
        uint32<16> al = shift_r<16>(ca);
        uint32<16> ah = shift_l<16>(ca);
        // due to zeroing bits in previous steps, the result can be OR'ed
        uint32<16> lo = _mm512_mullo_epi16(ca.native(), al.native());
        uint32<16> hi = _mm512_mulhi_epi16(ca.native(), ah.native());
        uint32<16> iprod = bit_or(lo, hi);
        iprod = _mm512_mul_epu32(iprod.native(), move4_l<1>(iprod).eval().native());
        prod = _mm512_mul_epu32(prod.native(), iprod.native());
    }
    return reduce_mul(prod);
#elif SIMDPP_USE_AVX2
    uint32x8 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32x8 ca = (uint32x8) a.vec(j);
        // shift data zeroing out bits
        uint32x8 al = shift_r<16>(ca);
        uint32x8 ah = shift_l<16>(ca);
        // due to zeroing bits in previous steps, the result can be OR'ed
        uint32x8 lo = _mm256_mullo_epi16(ca.native(), al.native());
        uint32x8 hi = _mm256_mulhi_epi16(ca.native(), ah.native());
        uint32x8 iprod = bit_or(lo, hi);
        iprod = _mm256_mul_epu32(iprod.native(), move4_l<1>(iprod).eval().native());
        prod = _mm256_mul_epu32(prod.native(), iprod.native());
    }
    uint32x4 r2 = _mm_mul_epu32(detail::extract128<0>(prod).native(),
                                detail::extract128<1>(prod).native());
    r2 = _mm_mul_epu32(r2.native(), move4_l<2>(r2).eval().native());
    return extract<0>(r2);
#elif SIMDPP_USE_SSE2
    uint32x4 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32x4 ca = (uint32x4) a.vec(j);
        // shift data zeroing out bits
        uint32x4 al = shift_r<16>(ca);
        uint32x4 ah = shift_l<16>(ca);
        // due to zeroing bits in previous steps, the result can be OR'ed
        uint32x4 lo = _mm_mullo_epi16(ca.native(), al.native());
        uint32x4 hi = _mm_mulhi_epi16(ca.native(), ah.native());
        uint32x4 r = bit_or(lo, hi);
        r = _mm_mul_epu32(r.native(), move4_l<1>(r).eval().native());
        prod = _mm_mul_epu32(prod.native(), r.native());
    }
    prod = _mm_mul_epu32(prod.native(), move4_l<2>(prod).eval().native());
    return extract<0>(prod);
#elif SIMDPP_USE_NEON
    int32x4 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32x4 r = vmull_s16(vget_low_s16(a.vec(j).native()),
                              vget_high_s16(a.vec(j).native()));
        prod = mul_lo(prod, r);
    }
    prod = mul_lo(prod, move4_l<2>(prod));
    prod = mul_lo(prod, move4_l<1>(prod));
    return extract<0>(prod);
#elif SIMDPP_USE_ALTIVEC
    int32_t r = i_reduce_mul(a.vec(0));
    for (unsigned j = 1; j < a.vec_length; ++j) {
        r *= i_reduce_mul(a.vec(j));
    }
    return r;
#elif SIMDPP_USE_MSA
    int32x4 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32<8> a32 = to_int32(a.vec(j));
        prod = mul_lo(prod, mul_lo(a32.vec(0), a32.vec(1)));
    }
    prod = mul_lo(prod, move4_l<2>(prod));
    prod = mul_lo(prod, move4_l<1>(prod));
    return extract<0>(prod);
#endif
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint32_t i_reduce_mul(const uint32x4& a)
{
#if SIMDPP_USE_NULL
    uint32_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r *= a.el(i);
    }
    return r;
#elif SIMDPP_USE_SSE2
    uint32x4 r = _mm_mul_epu32(a.native(), move4_l<1>(a).eval().native());
    r = _mm_mul_epu32(r.native(), move4_l<2>(r).eval().native());
    return extract<0>(r);
#elif SIMDPP_USE_NEON || SIMDPP_USE_MSA
    uint32x4 r = a;
    r = mul_lo(r, move4_l<2>(r));
    r = mul_lo(r, move4_l<1>(r));
    return extract<0>(r);
#elif SIMDPP_USE_ALTIVEC
    mem_block<uint32x4> b = a;
    return b[0] * b[1] * b[2] * b[3];
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32_t i_reduce_mul(const uint32x8& a)
{
    uint32x8 ra = _mm256_mul_epu32(a.native(), move4_l<1>(a).eval().native());
    uint32x4 r = _mm_mul_epu32(detail::extract128<0>(ra).native(),
                               detail::extract128<1>(ra).native());
    r = _mm_mul_epu32(r.native(), move4_l<2>(r).eval().native());
    return extract<0>(r);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint32_t i_reduce_mul(const uint32<16>& a)
{
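    // multiply the two 256-bit halves element-wise and reduce the resulting
    // 8-element vector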
    return i_reduce_mul(mul_lo(extract256<0>(a), extract256<1>(a)));
}
#endif

template<unsigned N>
SIMDPP_INL uint32_t i_reduce_mul(const uint32<N>& a)
{
#if SIMDPP_USE_NULL
    uint32_t r = 1;
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r *= a.vec(j).el(i);
        }
    }
    return r;
#elif SIMDPP_USE_AVX2
    uint32x8 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32x8 ai = a.vec(j);
        uint32x8 ra = _mm256_mul_epu32(ai.native(), move4_l<1>(ai).eval().native());
        prod = _mm256_mul_epu32(prod.native(), ra.native());
    }
    uint32x4 r = _mm_mul_epu32(detail::extract128<0>(prod).native(),
                               detail::extract128<1>(prod).native());
    r = _mm_mul_epu32(r.native(), move4_l<2>(r).eval().native());
    return extract<0>(r);
#elif SIMDPP_USE_SSE2
    uint32x4 r = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32x4 ai = a.vec(j);
        uint32x4 ra = _mm_mul_epu32(ai.native(), move4_l<1>(ai).eval().native());
        r = _mm_mul_epu32(r.native(), ra.native());
    }
    r = _mm_mul_epu32(r.native(), move4_l<2>(r).eval().native());
    return extract<0>(r);
#elif SIMDPP_USE_NEON || SIMDPP_USE_MSA
    uint32x4 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        prod = mul_lo(prod, a.vec(j));
    }
    prod = mul_lo(prod, move4_l<2>(prod));
    prod = mul_lo(prod, move4_l<1>(prod));
    return extract<0>(prod);
#elif SIMDPP_USE_ALTIVEC
    uint32_t r = i_reduce_mul(a.vec(0));
    for (unsigned j = 1; j < a.vec_length; ++j) {
        r *= i_reduce_mul(a.vec(j));
    }
    return r;
#endif
}

// -----------------------------------------------------------------------------


} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif