1 | /* Copyright (C) 2013-2014 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_SIMDPP_CORE_I_MULL_H |
9 | #define LIBSIMDPP_SIMDPP_CORE_I_MULL_H |
10 | |
11 | #ifndef LIBSIMDPP_SIMD_H |
12 | #error "This file must be included through simd.h" |
13 | #endif |
14 | |
15 | #include <simdpp/types.h> |
16 | #include <simdpp/detail/expr/i_mull.h> |
17 | #include <simdpp/core/detail/scalar_arg_impl.h> |
18 | |
19 | namespace simdpp { |
20 | namespace SIMDPP_ARCH_NAMESPACE { |
21 | |
22 | |
23 | /* Note: widening integer multiplication instructions are very different among |
24 | instruction sets. The main difference is in which half of the elements are |
25 | selected for multiplication. Trying to abstract this incurs definite |
26 | overhead. |
27 | |
28 | - SSE2-SSE4.1 and AVX2 provide only instructions with interfaces similar |
29 | to mul_lo and mul_hi. The result vectors must be interleaved to obtain |
30 | contiguous result values. Multiplying 2 vectors always incurs |
31 | overhead of at least two interleaving instructions. |
32 | |
33 | - AVX512 only provides 32-bit integer support. Widening multiplication |
34 | can be done only by using PMULDQ, which takes odd elements and produces |
35 | widened multiplication results. Multiplication of two whole vectors |
36 | always incurs overhead of at least two shifts or interleaving |
37 | instructions. |
38 | |
39 | - NEON, NEONv2 provide instructions that take elements of either the lower |
40 | or higher halves of two 128-bit vectors and multiply them. No |
41 | additional overhead is incurred to obtain contiguous result values. |
42 | |
43 | - ALTIVEC has multiply odd and multiply even instructions. No additional |
44 | overhead is incurred to obtain contiguous result values. |
45 | |
46 | The abstraction below uses the NEON model. No additional overhead is |
47 | incurred on SSE/AVX and NEON. On ALTIVEC, a single additional permute |
48 | instruction is needed for each vector multiplication on average. |
49 | */ |
50 | |
51 | /** Multiplies signed 16-bit values and expands the results to 32 bits. |
52 | |
53 | @par 128-bit version: |
54 | @code |
55 | r0 = a0 * b0 |
56 | ... |
57 | rN = aN * bN |
58 | @endcode |
59 | |
60 | @icost{SSE2-AVX, ALTIVEC, 2-3} |
61 | |
62 | @par 256-bit version: |
63 | |
64 | @icost{SSE2-AVX, ALTIVEC, 4-6} |
65 | @icost{AVX2, NEON, 2-3} |
66 | */ |
67 | template<unsigned N, class E1, class E2> SIMDPP_INL |
68 | int32<N, expr_mull<int16<N,E1>, |
69 | int16<N,E2>>> mull(const int16<N,E1>& a, const int16<N,E2>& b) |
70 | { |
71 | return { { a, b } }; |
72 | } |
73 | |
74 | SIMDPP_SCALAR_ARG_IMPL_EXPR(mull, expr_mull, int32, int16) |
75 | |
76 | /** Multiplies unsigned 16-bit values and expands the results to 32 bits. |
77 | |
78 | @par 128-bit version: |
79 | @code |
80 | r0 = a0 * b0 |
81 | ... |
82 | rN = aN * bN |
83 | @endcode |
84 | |
85 | @icost{SSE2-AVX2, ALTIVEC, 2-3} |
86 | |
87 | @par 256-bit version: |
88 | @icost{SSE2-AVX, ALTIVEC, 4-6} |
89 | @icost{AVX2, 2-3} |
90 | @icost{NEON, 2} |
91 | */ |
92 | template<unsigned N, class E1, class E2> SIMDPP_INL |
93 | uint32<N, expr_mull<uint16<N,E1>, |
94 | uint16<N,E2>>> mull(const uint16<N,E1>& a, const uint16<N,E2>& b) |
95 | { |
96 | return { { a, b } }; |
97 | } |
98 | |
99 | SIMDPP_SCALAR_ARG_IMPL_EXPR(mull, expr_mull, uint32, uint16) |
100 | |
101 | /** Multiplies signed 32-bit values in and expands the results to 64 bits. |
102 | |
103 | @code |
104 | r0 = a0 * b0 |
105 | ... |
106 | rN = aN * bN |
107 | @endcode |
108 | @par 128-bit version: |
109 | @icost{SSE4.1-AVX, 3} |
110 | @unimp{SSE2-SSSE3, ALTIVEC} |
111 | |
112 | @par 256-bit version: |
113 | @icost{SSE4.1-AVX, 6} |
114 | @icost{AVX2, 3} |
115 | @icost{NEON, 2} |
116 | @unimp{SSE2-SSSE3, ALTIVEC} |
117 | */ |
118 | template<unsigned N, class E1, class E2> SIMDPP_INL |
119 | int64<N, expr_mull<int32<N,E1>, |
120 | int32<N,E2>>> mull(const int32<N,E1>& a, const int32<N,E2>& b) |
121 | { |
122 | return { { a, b } }; |
123 | } |
124 | |
125 | SIMDPP_SCALAR_ARG_IMPL_EXPR(mull, expr_mull, int64, int32) |
126 | |
127 | /** Multiplies unsigned 32-bit values in the lower halves of the vectors and |
128 | expands the results to 64 bits. |
129 | |
130 | @par 128-bit version: |
131 | @code |
132 | r0 = a0 * b0 |
133 | r1 = a1 * b1 |
134 | @endcode |
135 | @icost{SSE2-AVX, 3} |
136 | @unimp{ALTIVEC} |
137 | |
138 | @icost{SSE2-AVX, 6} |
139 | @icost{AVX2, 3} |
140 | @icost{NEON, 2} |
141 | @unimp{ALTIVEC} |
142 | */ |
143 | template<unsigned N, class E1, class E2> SIMDPP_INL |
144 | uint64<N, expr_mull<uint32<N,E1>, |
145 | uint32<N,E2>>> mull(const uint32<N,E1>& a, const uint32<N,E2>& b) |
146 | { |
147 | return { { a, b } }; |
148 | } |
149 | |
150 | SIMDPP_SCALAR_ARG_IMPL_EXPR(mull, expr_mull, uint64, uint32) |
151 | |
152 | } // namespace SIMDPP_ARCH_NAMESPACE |
153 | } // namespace simdpp |
154 | |
155 | #endif |
156 | |
157 | |