/*  Copyright (C) 2013-2017  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_MULL_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_MULL_H

#ifndef LIBSIMDPP_SIMD_H
#error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/mem_block.h>
#include <simdpp/detail/not_implemented.h>
#include <simdpp/core/detail/subvec_insert.h>
#include <simdpp/core/to_int32.h>
#include <simdpp/core/to_int64.h>
#include <simdpp/core/i_mul.h>
#include <simdpp/core/combine.h>
#include <simdpp/core/zip_hi.h>
#include <simdpp/core/zip_lo.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

/*  Note: widening integer multiplication instructions differ significantly
    between instruction sets. The main difference is which half of the
    elements is selected for multiplication, so any portable abstraction
    incurs some overhead.

    - SSE2-SSE4.1, AVX2, ALTIVEC and VSX provide only instructions with
      interfaces similar to mul_lo and mul_hi. The result vectors must be
      interleaved to obtain contiguous result values. Multiplying two vectors
      always incurs an overhead of at least two interleaving instructions.

    - AVX512F (without AVX512BW) provides only 32-bit integer support.
      Widening multiplication can be done only with PMULDQ, which multiplies
      the low 32-bit element of each 64-bit lane and produces widened
      results. Multiplying two whole vectors always incurs an overhead of at
      least two shifts or interleaving instructions.

    - NEON and NEONv2 provide instructions that multiply the elements of
      either the lower or the upper half of two 128-bit vectors. No
      additional overhead is incurred to obtain contiguous result values.

    The abstraction below uses the NEON model. No additional overhead is
    incurred on SSE/AVX and NEON. On ALTIVEC, on average a single additional
    permute instruction is needed per vector multiplication.
*/
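
//  Semantics sketch (illustration only): every i_mull overload below returns
//  the full-width product of each element pair, so for 16-bit inputs element
//  i of the result is int32_t(a.el(i)) * int32_t(b.el(i)) and no wrap-around
//  can occur. Assuming the public simdpp::mull wrapper, which dispatches to
//  these overloads, a typical use looks roughly like:
//
//      uint16<8> a = make_uint(40000), b = make_uint(3);
//      uint32<8> r = mull(a, b);   // every element holds 120000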

static SIMDPP_INL
int32<8> i_mull(const int16<8>& a, const int16<8>& b)
{
#if SIMDPP_USE_NULL
    int32x8 r;
    for (unsigned i = 0; i < 8; i++) {
        r.vec(i/4).el(i%4) = int32_t(a.el(i)) * b.el(i);
    }
    return r;
#elif SIMDPP_USE_SSE2
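    // _mm_mullo_epi16 and _mm_mulhi_epi16 produce the low and high 16-bit
    // halves of each 16x16->32 product in separate vectors; zipping them
    // back together restores the full products in element order.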
    int16x8 lo = _mm_mullo_epi16(a.native(), b.native());
    int16x8 hi = _mm_mulhi_epi16(a.native(), b.native());
    return (int32x8) combine(zip8_lo(lo, hi), zip8_hi(lo, hi));
#elif SIMDPP_USE_NEON
    int32x4 lo = vmull_s16(vget_low_s16(a.native()), vget_low_s16(b.native()));
    int32x4 hi = vmull_s16(vget_high_s16(a.native()), vget_high_s16(b.native()));
    return combine(lo, hi);
#elif SIMDPP_USE_ALTIVEC
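    // vec_mule and vec_mulo multiply the even- and odd-indexed element pairs
    // respectively, so the two result vectors must be re-interleaved to get
    // the products back into element order.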
    int32x4 lo = vec_mule(a.native(), b.native());
    int32x4 hi = vec_mulo(a.native(), b.native());
    return combine(zip4_lo(lo, hi), zip4_hi(lo, hi));
#elif SIMDPP_USE_MSA
    int32<8> a32 = to_int32(a);
    int32<8> b32 = to_int32(b);
    a32.vec(0) = __msa_mulv_w(a32.vec(0).native(), b32.vec(0).native());
    a32.vec(1) = __msa_mulv_w(a32.vec(1).native(), b32.vec(1).native());
    return a32;
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int32<16> i_mull(const int16<16>& a, const int16<16>& b)
{
    int16<16> as, bs, lo, hi;
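    // AVX2 zip instructions interleave within each 128-bit lane only.
    // Pre-permuting the 64-bit quarters into 0,2,1,3 order makes the
    // lane-local zips below emit the widened products in contiguous order.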
    as = _mm256_permute4x64_epi64(a.native(), _MM_SHUFFLE(3,1,2,0));
    bs = _mm256_permute4x64_epi64(b.native(), _MM_SHUFFLE(3,1,2,0));
    lo = _mm256_mullo_epi16(as.native(), bs.native());
    hi = _mm256_mulhi_epi16(as.native(), bs.native());
    return (int32<16>) combine(zip8_lo(lo, hi), zip8_hi(lo, hi));
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
int32<32> i_mull(const int16<32>& a, const int16<32>& b)
{
    int16<32> as, bs, lo, hi;
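    // As in the AVX2 version, the zips interleave within 128-bit lanes only.
    // The cross-lane permute below arranges the 64-bit chunks so that the
    // lane-local zips produce contiguous results.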
    int64<8> idx = make_uint(0, 4, 1, 5, 2, 6, 3, 7);
    as = _mm512_permutexvar_epi64(idx.native(), a.native());
    bs = _mm512_permutexvar_epi64(idx.native(), b.native());
    lo = _mm512_mullo_epi16(as.native(), bs.native());
    hi = _mm512_mulhi_epi16(as.native(), bs.native());
    return (int32<32>) combine(zip8_lo(lo, hi), zip8_hi(lo, hi));
}
#endif

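// For vector types wider than the natively supported width, multiply
// subvector by subvector and insert the widened results into place.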
template<unsigned N> SIMDPP_INL
int32<N> i_mull(const int16<N>& a, const int16<N>& b)
{
    int32<N> r;
    for (unsigned i = 0; i < a.vec_length; ++i) {
        detail::subvec_insert(r, mull(a.vec(i), b.vec(i)).eval(), i);
    }
    return r;
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint32<8> i_mull(const uint16<8>& a, const uint16<8>& b)
{
#if SIMDPP_USE_NULL
    uint32x8 r;
    for (unsigned i = 0; i < 8; i++) {
        r.vec(i/4).el(i%4) = uint32_t(a.el(i)) * b.el(i);
    }
    return r;
#elif SIMDPP_USE_SSE2
    int16x8 lo = _mm_mullo_epi16(a.native(), b.native());
    int16x8 hi = _mm_mulhi_epu16(a.native(), b.native());
    return (uint32x8) combine(zip8_lo(lo, hi), zip8_hi(lo, hi));
#elif SIMDPP_USE_NEON
    uint32x4 lo = vmull_u16(vget_low_u16(a.native()), vget_low_u16(b.native()));
    uint32x4 hi = vmull_u16(vget_high_u16(a.native()), vget_high_u16(b.native()));
    return combine(lo, hi);
#elif SIMDPP_USE_ALTIVEC
    uint32x4 lo = vec_mule(a.native(), b.native());
    uint32x4 hi = vec_mulo(a.native(), b.native());
    return combine(zip4_lo(lo, hi), zip4_hi(lo, hi));
#elif SIMDPP_USE_MSA
    int32<8> a32 = (int32<8>) to_uint32(a);
    int32<8> b32 = (int32<8>) to_uint32(b);
    a32.vec(0) = __msa_mulv_w(a32.vec(0).native(), b32.vec(0).native());
    a32.vec(1) = __msa_mulv_w(a32.vec(1).native(), b32.vec(1).native());
    return uint32<8>(a32);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32<16> i_mull(const uint16<16>& a, const uint16<16>& b)
{
    uint16<16> as, bs, lo, hi;
    as = _mm256_permute4x64_epi64(a.native(), _MM_SHUFFLE(3,1,2,0));
    bs = _mm256_permute4x64_epi64(b.native(), _MM_SHUFFLE(3,1,2,0));
    lo = _mm256_mullo_epi16(as.native(), bs.native());
    hi = _mm256_mulhi_epu16(as.native(), bs.native());
    return (uint32<16>) combine(zip8_lo(lo, hi), zip8_hi(lo, hi));
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
uint32<32> i_mull(const uint16<32>& a, const uint16<32>& b)
{
    uint16<32> as, bs, lo, hi;
    uint64<8> idx = make_uint(0, 4, 1, 5, 2, 6, 3, 7);
    as = _mm512_permutexvar_epi64(idx.native(), a.native());
    bs = _mm512_permutexvar_epi64(idx.native(), b.native());
    lo = _mm512_mullo_epi16(as.native(), bs.native());
    hi = _mm512_mulhi_epu16(as.native(), bs.native());
    return (uint32<32>) combine(zip8_lo(lo, hi), zip8_hi(lo, hi));
}
#endif

template<unsigned N> SIMDPP_INL
uint32<N> i_mull(const uint16<N>& a, const uint16<N>& b)
{
    uint32<N> r;
    for (unsigned i = 0; i < a.vec_length; ++i) {
        detail::subvec_insert(r, mull(a.vec(i), b.vec(i)).eval(), i);
    }
    return r;
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
int64<4> i_mull(const int32<4>& a, const int32<4>& b)
{
#if SIMDPP_USE_NULL
    int64x4 r;
    r.vec(0).el(0) = int64_t(a.el(0)) * b.el(0);
    r.vec(0).el(1) = int64_t(a.el(1)) * b.el(1);
    r.vec(1).el(0) = int64_t(a.el(2)) * b.el(2);
    r.vec(1).el(1) = int64_t(a.el(3)) * b.el(3);
    return r;
#elif SIMDPP_USE_SSE4_1
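    // PMULDQ multiplies only the low 32-bit element of each 64-bit lane.
    // Zipping each vector with itself duplicates every element, so the two
    // multiplications below see the operands in element order.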
    int32x4 al, ah, bl, bh;
    int64x2 rl, rh;
    al = zip4_lo(a, a);
    bl = zip4_lo(b, b);
    ah = zip4_hi(a, a);
    bh = zip4_hi(b, b);
    rl = _mm_mul_epi32(al.native(), bl.native());
    rh = _mm_mul_epi32(ah.native(), bh.native());
    return combine(rl, rh);
#elif SIMDPP_USE_NEON
    int64x2 lo = vmull_s32(vget_low_s32(a.native()), vget_low_s32(b.native()));
    int64x2 hi = vmull_s32(vget_high_s32(a.native()), vget_high_s32(b.native()));
    return combine(lo, hi);
#elif SIMDPP_USE_VSX_207
#if defined(__GNUC__) && (__GNUC__ < 8)
    // BUG: GCC 7 and earlier don't implement the 32-bit widening multiply
    // intrinsics, so fall back to inline assembly.
    __vector int32_t va = a.native(), vb = b.native();
    __vector int64_t vlo, vhi;
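    // vmulesw/vmulosw multiply the even- and odd-indexed element pairs, but
    // element numbering is reversed between big- and little-endian, so the
    // roles of the two results are swapped accordingly.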
#if SIMDPP_BIG_ENDIAN
    asm("vmulesw %0, %1, %2" : "=v" (vlo) : "v" (va), "v" (vb));
    asm("vmulosw %0, %1, %2" : "=v" (vhi) : "v" (va), "v" (vb));
#else
    asm("vmulosw %0, %1, %2" : "=v" (vlo) : "v" (va), "v" (vb));
    asm("vmulesw %0, %1, %2" : "=v" (vhi) : "v" (va), "v" (vb));
#endif
    int64<2> lo = vlo, hi = vhi;
#else
    int64x2 lo = vec_vmulesw(a.native(), b.native());
    int64x2 hi = vec_vmulosw(a.native(), b.native());
#endif
    return combine(zip2_lo(lo, hi), zip2_hi(lo, hi));
#elif SIMDPP_USE_MSA
    int64<4> a64 = to_int64(a);
    int64<4> b64 = to_int64(b);
    a64.vec(0) = __msa_mulv_d(a64.vec(0).native(), b64.vec(0).native());
    a64.vec(1) = __msa_mulv_d(a64.vec(1).native(), b64.vec(1).native());
    return a64;
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, b);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int64<8> i_mull(const int32<8>& a, const int32<8>& b)
{
    int32x4 al, ah, bl, bh;
    int64x4 rl, rh;
    split(a, al, ah);
    split(b, bl, bh);

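    // Sign-extending each half to 64-bit elements places every operand in
    // the low 32 bits of a 64-bit lane, which is exactly what
    // _mm256_mul_epi32 reads.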
    rl = _mm256_mul_epi32(to_int64(al).eval().native(), to_int64(bl).eval().native());
    rh = _mm256_mul_epi32(to_int64(ah).eval().native(), to_int64(bh).eval().native());
    return combine(rl, rh);
}
#endif

template<unsigned N> SIMDPP_INL
int64<N> i_mull(const int32<N>& a, const int32<N>& b)
{
    int64<N> r;
    for (unsigned i = 0; i < a.vec_length; ++i) {
        detail::subvec_insert(r, mull(a.vec(i), b.vec(i)).eval(), i);
    }
    return r;
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint64<4> i_mull(const uint32<4>& a, const uint32<4>& b)
{
#if SIMDPP_USE_NULL
    uint64x4 r;
    r.vec(0).el(0) = uint64_t(a.el(0)) * b.el(0);
    r.vec(0).el(1) = uint64_t(a.el(1)) * b.el(1);
    r.vec(1).el(0) = uint64_t(a.el(2)) * b.el(2);
    r.vec(1).el(1) = uint64_t(a.el(3)) * b.el(3);
    return r;
#elif SIMDPP_USE_SSE2
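    // Same approach as the signed SSE4.1 path above: duplicate the elements
    // via zips so that PMULUDQ, which reads the low 32-bit element of each
    // 64-bit lane, sees the operands in element order.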
    uint32x4 al, ah, bl, bh;
    uint64x2 rl, rh;
    al = zip4_lo(a, a);
    bl = zip4_lo(b, b);
    ah = zip4_hi(a, a);
    bh = zip4_hi(b, b);
    rl = _mm_mul_epu32(al.native(), bl.native());
    rh = _mm_mul_epu32(ah.native(), bh.native());
    return combine(rl, rh);
#elif SIMDPP_USE_NEON
    uint64x2 lo = vmull_u32(vget_low_u32(a.native()), vget_low_u32(b.native()));
    uint64x2 hi = vmull_u32(vget_high_u32(a.native()), vget_high_u32(b.native()));
    return combine(lo, hi);
#elif SIMDPP_USE_VSX_207
#if defined(__GNUC__) && (__GNUC__ < 8)
    // BUG: GCC 7 and earlier don't implement the 32-bit widening multiply
    // intrinsics, so fall back to inline assembly. The even/odd variants are
    // swapped on little-endian, as in the signed version above.
    __vector uint32_t va = a.native(), vb = b.native();
    __vector uint64_t vlo, vhi;
#if SIMDPP_BIG_ENDIAN
    asm("vmuleuw %0, %1, %2" : "=v" (vlo) : "v" (va), "v" (vb));
    asm("vmulouw %0, %1, %2" : "=v" (vhi) : "v" (va), "v" (vb));
#else
    asm("vmulouw %0, %1, %2" : "=v" (vlo) : "v" (va), "v" (vb));
    asm("vmuleuw %0, %1, %2" : "=v" (vhi) : "v" (va), "v" (vb));
#endif
    uint64<2> lo = vlo, hi = vhi;
#else
    uint64x2 lo = vec_vmuleuw(a.native(), b.native());
    uint64x2 hi = vec_vmulouw(a.native(), b.native());
#endif
    return combine(zip2_lo(lo, hi), zip2_hi(lo, hi));
#elif SIMDPP_USE_ALTIVEC
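    // Pre-2.07 ALTIVEC has no 32-bit widening multiply, so compute the
    // products in scalar code through a memory round-trip.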
    mem_block<uint32<4>> ba = a;
    mem_block<uint32<4>> bb = b;
    uint64x4 r;
    r.vec(0).el(0) = (uint64_t) ba[0] * bb[0];
    r.vec(0).el(1) = (uint64_t) ba[1] * bb[1];
    r.vec(1).el(0) = (uint64_t) ba[2] * bb[2];
    r.vec(1).el(1) = (uint64_t) ba[3] * bb[3];
    return r;
#elif SIMDPP_USE_MSA
    int64<4> a64 = (int64<4>) to_uint64(a);
    int64<4> b64 = (int64<4>) to_uint64(b);
    a64.vec(0) = __msa_mulv_d(a64.vec(0).native(), b64.vec(0).native());
    a64.vec(1) = __msa_mulv_d(a64.vec(1).native(), b64.vec(1).native());
    return (uint64<4>) a64;
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint64<8> i_mull(const uint32<8>& a, const uint32<8>& b)
{
    uint32x4 al, ah, bl, bh;
    uint64x4 rl, rh;

    split(a, al, ah);
    split(b, bl, bh);

    rl = _mm256_mul_epu32(to_uint64(al).eval().native(), to_uint64(bl).eval().native());
    rh = _mm256_mul_epu32(to_uint64(ah).eval().native(), to_uint64(bh).eval().native());

    return combine(rl, rh);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint64<16> i_mull(const uint32<16>& a, const uint32<16>& b)
{
    uint32<8> al, ah, bl, bh;
    uint64<8> rl, rh;

    split(a, al, ah);
    split(b, bl, bh);

    rl = _mm512_mul_epu32(to_uint64(al).eval().native(), to_uint64(bl).eval().native());
    rh = _mm512_mul_epu32(to_uint64(ah).eval().native(), to_uint64(bh).eval().native());

    return combine(rl, rh);
}
#endif

template<unsigned N> SIMDPP_INL
uint64<N> i_mull(const uint32<N>& a, const uint32<N>& b)
{
    uint64<N> r;
    for (unsigned i = 0; i < a.vec_length; ++i) {
        detail::subvec_insert(r, mull(a.vec(i), b.vec(i)).eval(), i);
    }
    return r;
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif