i_mul_lo.h source code [bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/detail/insn/i_mul_lo.h]

1	/*  Copyright (C) 2013-2017  Povilas Kanapickas <povilas@radix.lt>
2
3	Distributed under the Boost Software License, Version 1.0.
4	(See accompanying file LICENSE_1_0.txt or copy at
5	http://www.boost.org/LICENSE_1_0.txt)
6	*/
7
8	#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_MUL_LO_H
9	#define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_MUL_LO_H
10
11	#ifndef LIBSIMDPP_SIMD_H
12	#error "This file must be included through simd.h"
13	#endif
14
15	#include <simdpp/types.h>
16	#include <simdpp/core/i_mull.h>
17	#include <simdpp/core/move_l.h>
18	#include <simdpp/core/permute4.h>
19	#include <simdpp/core/shuffle2.h>
20	#include <simdpp/detail/null/math.h>
21	#include <simdpp/detail/vector_array_macros.h>
22
23	namespace simdpp {
24	namespace SIMDPP_ARCH_NAMESPACE {
25	namespace detail {
26	namespace insn {
27
28	static SIMDPP_INL
29	uint16<`8`> i_mul_lo(const uint16<`8`>& a, const uint16<`8`>& b)
30	{
31	#if SIMDPP_USE_NULL
32	return detail::null::mul(a, b);
33	#elif SIMDPP_USE_SSE2
34	return _mm_mullo_epi16(a.native(), b.native());
35	#elif SIMDPP_USE_NEON
36	return vmulq_u16(a.native(), b.native());
37	#elif SIMDPP_USE_ALTIVEC
38	return vec_mladd(a.native(), b.native(),
39	((uint16x8) make_zero()).native());
40	#elif SIMDPP_USE_MSA
41	return (v8u16) __msa_mulv_h((v8i16) a.native(), (v8i16) b.native());
42	#endif
43	}
44
#if SIMDPP_USE_AVX2
/** Lane-wise multiplication of two vectors of sixteen unsigned 16-bit
    integers, keeping the low 16 bits of each product. Only available when
    SIMDPP_USE_AVX2 is enabled.

    @param a first factor
    @param b second factor
    @return vector of the per-lane products, truncated to 16 bits
*/
static SIMDPP_INL
uint16<16> i_mul_lo(const uint16<16>& a, const uint16<16>& b)
{
    return _mm256_mullo_epi16(a.native(), b.native());
}
#endif
52
#if SIMDPP_USE_AVX512BW
/** Lane-wise multiplication of two vectors of thirty-two unsigned 16-bit
    integers, keeping the low 16 bits of each product. Only available when
    SIMDPP_USE_AVX512BW is enabled.

    @param a first factor
    @param b second factor
    @return vector of the per-lane products, truncated to 16 bits
*/
static SIMDPP_INL
uint16<32> i_mul_lo(const uint16<32>& a, const uint16<32>& b)
{
    return _mm512_mullo_epi16(a.native(), b.native());
}
#endif
60
61	// -----------------------------------------------------------------------------
62
63	static SIMDPP_INL
64	uint32<`4`> i_mul_lo(const uint32<`4`>& a, const uint32<`4`>& b)
65	{
66	#if SIMDPP_USE_NULL
67	return detail::null::mul(a, b);
68	#elif SIMDPP_USE_SSE4_1
69	return _mm_mullo_epi32(a.native(), b.native());
70	#elif SIMDPP_USE_SSE2
71	uint32x4 a1, b1, r;
72	a1 = move4_l<`1`>(a);
73	b1 = move4_l<`1`>(b);
74	r = _mm_mul_epu32(a.native(), b.native());
75	a1 = _mm_mul_epu32(a1.native(), b1.native());
76	r = shuffle2<`0`,`2`,`0`,`2`>(r, a1); // moves to FP domain, additional latency unavoidable
77	r = permute4<`0`,`2`,`1`,`3`>(r);
78	return r;
79	#elif SIMDPP_USE_NEON
80	return vmulq_u32(a.native(), b.native());
81	#elif SIMDPP_USE_VSX_207
82	#if __GNUC__
83	// BUG: GCC does not have support for vmuluwm yet
84	__vector uint32_t va = a.native(), vb = b.native();
85	__vector uint32_t vr;
86	asm("vmuluwm %0, %1, %2" : "=v"(vr) : "v"(va), "v"(vb));
87	return vr;
88	#else
89	return vec_vmuluwm(a.native(), b.native());
90	#endif
91	#elif SIMDPP_USE_ALTIVEC
92	// implement in terms of 16-bit multiplies
93	// ah al*
94	// bh bl
95	// ======
96	// + (albl) <- l_ab*
97	//+ (ahbl) <- h_ab*
98	//+ (albh) <- h_ba*
99
100	uint16<`8`> ra, rb; ra = a, rb = b;
101	#if SIMDPP_BIG_ENDIAN
102	uint16<`8`> sa = move8_r<`1`>(ra);
103	uint16<`8`> sb = move8_r<`1`>(rb);
104
105	uint32<`4`> l_ab = vec_mulo(ra.native(), rb.native());
106	uint32<`4`> h_ab = vec_mulo(ra.native(), sb.native());
107	uint32<`4`> h_ba = vec_mulo(sa.native(), rb.native());
108	#else
109	uint16<`8`> sa = move8_l<`1`>(ra);
110	uint16<`8`> sb = move8_l<`1`>(rb);
111
112	uint32<`4`> l_ab = vec_mule(ra.native(), rb.native());
113	uint32<`4`> h_ab = vec_mule(ra.native(), sb.native());
114	uint32<`4`> h_ba = vec_mule(sa.native(), rb.native());
115	#endif
116
117	h_ab = shift_l<`16`>(add(h_ab, h_ba));
118	h_ab = add(h_ab, l_ab);
119	return h_ab;
120	#elif SIMDPP_USE_MSA
121	return (v4u32) __msa_mulv_w((v4i32) a.native(), (v4i32) b.native());
122	#endif
123	}
124
#if SIMDPP_USE_AVX2
/** Lane-wise multiplication of two vectors of eight unsigned 32-bit integers,
    keeping the low 32 bits of each product. Only available when
    SIMDPP_USE_AVX2 is enabled.

    @param a first factor
    @param b second factor
    @return vector of the per-lane products, truncated to 32 bits
*/
static SIMDPP_INL
uint32<8> i_mul_lo(const uint32<8>& a, const uint32<8>& b)
{
    return _mm256_mullo_epi32(a.native(), b.native());
}
#endif
132
#if SIMDPP_USE_AVX512F
/** Lane-wise multiplication of two vectors of sixteen unsigned 32-bit
    integers, keeping the low 32 bits of each product. Only available when
    SIMDPP_USE_AVX512F is enabled.

    @param a first factor
    @param b second factor
    @return vector of the per-lane products, truncated to 32 bits
*/
static SIMDPP_INL
uint32<16> i_mul_lo(const uint32<16>& a, const uint32<16>& b)
{
    return _mm512_mullo_epi32(a.native(), b.native());
}
#endif
140
141	// -----------------------------------------------------------------------------
142
143	template<class V> SIMDPP_INL
144	V i_mul_lo(const V& a, const V& b)
145	{
146	SIMDPP_VEC_ARRAY_IMPL2(V, i_mul_lo, a, b)
147	}
148
149	} // namespace insn
150	} // namespace detail
151	} // namespace SIMDPP_ARCH_NAMESPACE
152	} // namespace simdpp
153
154	#endif
155
156

Browse the source code of bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/detail/insn/i_mul_lo.h