i_mull.h source code [bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/core/i_mull.h]

/*  Copyright (C) 2013-2014  Povilas Kanapickas <povilas@radix.lt>
2
3	Distributed under the Boost Software License, Version 1.0.
4	(See accompanying file LICENSE_1_0.txt or copy at
5	http://www.boost.org/LICENSE_1_0.txt)
6	*/
7
8	#ifndef LIBSIMDPP_SIMDPP_CORE_I_MULL_H
9	#define LIBSIMDPP_SIMDPP_CORE_I_MULL_H
10
11	#ifndef LIBSIMDPP_SIMD_H
12	#error "This file must be included through simd.h"
13	#endif
14
15	#include <simdpp/types.h>
16	#include <simdpp/detail/expr/i_mull.h>
17	#include <simdpp/core/detail/scalar_arg_impl.h>
18
19	namespace simdpp {
20	namespace SIMDPP_ARCH_NAMESPACE {
21
22
/*  Note: widening integer multiplication instructions are very different among
24	instruction sets. The main difference is in which half of the elements are
25	selected for multiplication. Trying to abstract this incurs definite
26	overhead.
27
28	- SSE2-SSE4.1 and AVX2 provide only instructions with interfaces similar
29	to mul_lo and mul_hi. The result vectors must be interleaved to obtain
30	contiguous result values. Multiplying 2 vectors always incurs
31	overhead of at least two interleaving instructions.
32
33	- AVX512 only provides 32-bit integer support. Widening multiplication
34	can be done only by using PMULDQ, which takes odd elements and produces
35	widened multiplication results. Multiplication of two whole vectors
36	always incurs overhead of at least two shifts or interleaving
37	instructions.
38
39	- NEON, NEONv2 provide instructions that take elements of either the lower
40	or higher halves of two 128-bit vectors and multiply them. No
41	additional overhead is incurred to obtain contiguous result values.
42
- ALTIVEC has multiply odd and multiply even instructions. No additional
44	overhead is incurred to obtain contiguous result values.
45
46	The abstraction below uses the NEON model. No additional overhead is
47	incurred on SSE/AVX and NEON. On ALTIVEC, a single additional permute
48	instruction is needed for each vector multiplication on average.
49	*/
50
51	/* Multiplies signed 16-bit values and expands the results to 32 bits.*
52
53	@par 128-bit version:
54	@code
55	r0 = a0 b0*
56	...
57	rN = aN bN*
58	@endcode
59
60	@icost{SSE2-AVX, ALTIVEC, 2-3}
61
62	@par 256-bit version:
63
64	@icost{SSE2-AVX, ALTIVEC, 4-6}
65	@icost{AVX2, NEON, 2-3}
66	*/
67	template<unsigned N, class E1, class E2> SIMDPP_INL
68	int32<N, expr_mull<int16<N,E1>,
69	int16<N,E2>>> mull(const int16<N,E1>& a, const int16<N,E2>& b)
70	{
71	return { { a, b } };
72	}
73
74	SIMDPP_SCALAR_ARG_IMPL_EXPR(mull, expr_mull, int32, int16)
75
76	/* Multiplies unsigned 16-bit values and expands the results to 32 bits.*
77
78	@par 128-bit version:
79	@code
80	r0 = a0 b0*
81	...
82	rN = aN bN*
83	@endcode
84
85	@icost{SSE2-AVX2, ALTIVEC, 2-3}
86
87	@par 256-bit version:
88	@icost{SSE2-AVX, ALTIVEC, 4-6}
89	@icost{AVX2, 2-3}
90	@icost{NEON, 2}
91	*/
92	template<unsigned N, class E1, class E2> SIMDPP_INL
93	uint32<N, expr_mull<uint16<N,E1>,
94	uint16<N,E2>>> mull(const uint16<N,E1>& a, const uint16<N,E2>& b)
95	{
96	return { { a, b } };
97	}
98
99	SIMDPP_SCALAR_ARG_IMPL_EXPR(mull, expr_mull, uint32, uint16)
100
101	/* Multiplies signed 32-bit values in and expands the results to 64 bits.*
102
103	@code
104	r0 = a0 b0*
105	...
106	rN = aN bN*
107	@endcode
108	@par 128-bit version:
109	@icost{SSE4.1-AVX, 3}
110	@unimp{SSE2-SSSE3, ALTIVEC}
111
112	@par 256-bit version:
113	@icost{SSE4.1-AVX, 6}
114	@icost{AVX2, 3}
115	@icost{NEON, 2}
116	@unimp{SSE2-SSSE3, ALTIVEC}
117	*/
118	template<unsigned N, class E1, class E2> SIMDPP_INL
119	int64<N, expr_mull<int32<N,E1>,
120	int32<N,E2>>> mull(const int32<N,E1>& a, const int32<N,E2>& b)
121	{
122	return { { a, b } };
123	}
124
125	SIMDPP_SCALAR_ARG_IMPL_EXPR(mull, expr_mull, int64, int32)
126
127	/* Multiplies unsigned 32-bit values in the lower halves of the vectors and*
128	expands the results to 64 bits.
129
130	@par 128-bit version:
131	@code
132	r0 = a0 b0*
133	r1 = a1 b1*
134	@endcode
135	@icost{SSE2-AVX, 3}
136	@unimp{ALTIVEC}
137
138	@icost{SSE2-AVX, 6}
139	@icost{AVX2, 3}
140	@icost{NEON, 2}
141	@unimp{ALTIVEC}
142	*/
143	template<unsigned N, class E1, class E2> SIMDPP_INL
144	uint64<N, expr_mull<uint32<N,E1>,
145	uint32<N,E2>>> mull(const uint32<N,E1>& a, const uint32<N,E2>& b)
146	{
147	return { { a, b } };
148	}
149
150	SIMDPP_SCALAR_ARG_IMPL_EXPR(mull, expr_mull, uint64, uint32)
151
152	} // namespace SIMDPP_ARCH_NAMESPACE
153	} // namespace simdpp
154
155	#endif
156
157

Browse the source code of bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/core/i_mull.h