/*  Copyright (C) 2013-2017  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_MULL_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_MULL_H

#ifndef LIBSIMDPP_SIMD_H
#error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/mem_block.h>
#include <simdpp/detail/not_implemented.h>
#include <simdpp/core/detail/subvec_insert.h>
#include <simdpp/core/to_int32.h>
#include <simdpp/core/to_int64.h>
#include <simdpp/core/i_mul.h>
#include <simdpp/core/combine.h>
#include <simdpp/core/zip_hi.h>
#include <simdpp/core/zip_lo.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

/*  Note: widening integer multiplication instructions are very different among
    instruction sets. The main difference is in which half of the elements is
    selected for multiplication. Abstracting over this necessarily incurs some
    overhead.

    - SSE2-SSE4.1, AVX2, ALTIVEC and VSX provide only instructions with
      interfaces similar to mul_lo and mul_hi. The result vectors must be
      interleaved to obtain contiguous result values. Multiplying two vectors
      always incurs an overhead of at least two interleaving instructions.

    - AVX512 provides widening multiplication only for 32-bit integers, via
      PMULDQ, which multiplies the low 32-bit element of each 64-bit lane and
      produces widened results. Multiplying two whole vectors always incurs an
      overhead of at least two shifts or interleaving instructions.

    - NEON and NEONv2 provide instructions that take the elements of either the
      lower or the higher halves of two 128-bit vectors and multiply them. No
      additional overhead is incurred to obtain contiguous result values.

    The abstraction below uses the NEON model. No additional overhead is
    incurred on SSE/AVX and NEON. On ALTIVEC, a single additional permute
    instruction is needed for each vector multiplication on average.
*/
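
/*  As an illustrative sketch (not the exact code below), the SSE2 path of the
    16 -> 32-bit case follows the pattern

        lo = _mm_mullo_epi16(a, b);   // low 16 bits of each 32-bit product
        hi = _mm_mulhi_epi16(a, b);   // high 16 bits of each 32-bit product
        r  = zip(lo, hi);             // interleave into contiguous products

    where "zip" stands for the zip8_lo/zip8_hi + combine sequence used in
    i_mull below, i.e. one multiply pair plus the interleaving overhead
    described above. The NEON path instead maps directly onto vmull_s16 for
    each 64-bit half of the input vectors.
*/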

static SIMDPP_INL
int32<8> i_mull(const int16<8>& a, const int16<8>& b)
{
#if SIMDPP_USE_NULL
    int32x8 r;
    for (unsigned i = 0; i < 8; i++) {
        r.vec(i/4).el(i%4) = int32_t(a.el(i)) * b.el(i);
    }
    return r;
#elif SIMDPP_USE_SSE2
    int16x8 lo = _mm_mullo_epi16(a.native(), b.native());
    int16x8 hi = _mm_mulhi_epi16(a.native(), b.native());
    return (int32x8) combine(zip8_lo(lo, hi), zip8_hi(lo, hi));
#elif SIMDPP_USE_NEON
    int32x4 lo = vmull_s16(vget_low_s16(a.native()), vget_low_s16(b.native()));
    int32x4 hi = vmull_s16(vget_high_s16(a.native()), vget_high_s16(b.native()));
    return combine(lo, hi);
#elif SIMDPP_USE_ALTIVEC
    int32x4 lo = vec_mule(a.native(), b.native());
    int32x4 hi = vec_mulo(a.native(), b.native());
    return combine(zip4_lo(lo, hi), zip4_hi(lo, hi));
#elif SIMDPP_USE_MSA
    int32<8> a32 = to_int32(a);
    int32<8> b32 = to_int32(b);
    a32.vec(0) = __msa_mulv_w(a32.vec(0).native(), b32.vec(0).native());
    a32.vec(1) = __msa_mulv_w(a32.vec(1).native(), b32.vec(1).native());
    return a32;
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int32<16> i_mull(const int16<16>& a, const int16<16>& b)
{
    int16<16> as, bs, lo, hi;
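    // Pre-arrange the 64-bit lanes so that the 128-bit in-lane interleave of
    // the low/high product halves below yields results in the original
    // element order.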
    as = _mm256_permute4x64_epi64(a.native(), _MM_SHUFFLE(3,1,2,0));
    bs = _mm256_permute4x64_epi64(b.native(), _MM_SHUFFLE(3,1,2,0));
    lo = _mm256_mullo_epi16(as.native(), bs.native());
    hi = _mm256_mulhi_epi16(as.native(), bs.native());
    return (int32<16>) combine(zip8_lo(lo, hi), zip8_hi(lo, hi));
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
int32<32> i_mull(const int16<32>& a, const int16<32>& b)
{
    int16<32> as, bs, lo, hi;
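    // The 64-bit lane permutation with idx serves the same purpose as the
    // AVX2 permute4x64 above: it pre-arranges lanes so that the in-lane
    // interleave of the product halves produces correctly ordered results.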
    int64<8> idx = make_uint(0, 4, 1, 5, 2, 6, 3, 7);
    as = _mm512_permutexvar_epi64(idx.native(), a.native());
    bs = _mm512_permutexvar_epi64(idx.native(), b.native());
    lo = _mm512_mullo_epi16(as.native(), bs.native());
    hi = _mm512_mulhi_epi16(as.native(), bs.native());
    return (int32<32>) combine(zip8_lo(lo, hi), zip8_hi(lo, hi));
}
#endif

template<unsigned N> SIMDPP_INL
int32<N> i_mull(const int16<N>& a, const int16<N>& b)
{
    int32<N> r;
    for (unsigned i = 0; i < a.vec_length; ++i) {
        detail::subvec_insert(r, mull(a.vec(i), b.vec(i)).eval(), i);
    }
    return r;
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint32<8> i_mull(const uint16<8>& a, const uint16<8>& b)
{
#if SIMDPP_USE_NULL
    uint32x8 r;
    for (unsigned i = 0; i < 8; i++) {
        r.vec(i/4).el(i%4) = uint32_t(a.el(i)) * b.el(i);
    }
    return r;
#elif SIMDPP_USE_SSE2
    uint16x8 lo = _mm_mullo_epi16(a.native(), b.native());
    uint16x8 hi = _mm_mulhi_epu16(a.native(), b.native());
    return (uint32x8) combine(zip8_lo(lo, hi), zip8_hi(lo, hi));
#elif SIMDPP_USE_NEON
    uint32x4 lo = vmull_u16(vget_low_u16(a.native()), vget_low_u16(b.native()));
    uint32x4 hi = vmull_u16(vget_high_u16(a.native()), vget_high_u16(b.native()));
    return combine(lo, hi);
#elif SIMDPP_USE_ALTIVEC
    uint32x4 lo = vec_mule(a.native(), b.native());
    uint32x4 hi = vec_mulo(a.native(), b.native());
    return combine(zip4_lo(lo, hi), zip4_hi(lo, hi));
#elif SIMDPP_USE_MSA
    int32<8> a32 = (int32<8>) to_uint32(a);
    int32<8> b32 = (int32<8>) to_uint32(b);
    a32.vec(0) = __msa_mulv_w(a32.vec(0).native(), b32.vec(0).native());
    a32.vec(1) = __msa_mulv_w(a32.vec(1).native(), b32.vec(1).native());
    return uint32<8>(a32);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32<16> i_mull(const uint16<16>& a, const uint16<16>& b)
{
    uint16<16> as, bs, lo, hi;
    as = _mm256_permute4x64_epi64(a.native(), _MM_SHUFFLE(3,1,2,0));
    bs = _mm256_permute4x64_epi64(b.native(), _MM_SHUFFLE(3,1,2,0));
    lo = _mm256_mullo_epi16(as.native(), bs.native());
    hi = _mm256_mulhi_epu16(as.native(), bs.native());
    return (uint32<16>) combine(zip8_lo(lo, hi), zip8_hi(lo, hi));
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
uint32<32> i_mull(const uint16<32>& a, const uint16<32>& b)
{
    uint16<32> as, bs, lo, hi;
    uint64<8> idx = make_uint(0, 4, 1, 5, 2, 6, 3, 7);
    as = _mm512_permutexvar_epi64(idx.native(), a.native());
    bs = _mm512_permutexvar_epi64(idx.native(), b.native());
    lo = _mm512_mullo_epi16(as.native(), bs.native());
    hi = _mm512_mulhi_epu16(as.native(), bs.native());
    return (uint32<32>) combine(zip8_lo(lo, hi), zip8_hi(lo, hi));
}
#endif

template<unsigned N> SIMDPP_INL
uint32<N> i_mull(const uint16<N>& a, const uint16<N>& b)
{
    uint32<N> r;
    for (unsigned i = 0; i < a.vec_length; ++i) {
        detail::subvec_insert(r, mull(a.vec(i), b.vec(i)).eval(), i);
    }
    return r;
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
int64<4> i_mull(const int32<4>& a, const int32<4>& b)
{
#if SIMDPP_USE_NULL
    int64x4 r;
    r.vec(0).el(0) = int64_t(a.el(0)) * b.el(0);
    r.vec(0).el(1) = int64_t(a.el(1)) * b.el(1);
    r.vec(1).el(0) = int64_t(a.el(2)) * b.el(2);
    r.vec(1).el(1) = int64_t(a.el(3)) * b.el(3);
    return r;
#elif SIMDPP_USE_SSE4_1
    int32x4 al, ah, bl, bh;
    int64x2 rl, rh;
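    // zip4_lo/zip4_hi duplicate each element so that the low 32-bit half of
    // every 64-bit lane holds a value to multiply; _mm_mul_epi32 then yields
    // two full 64-bit products per vector.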
    al = zip4_lo(a, a);
    bl = zip4_lo(b, b);
    ah = zip4_hi(a, a);
    bh = zip4_hi(b, b);
    rl = _mm_mul_epi32(al.native(), bl.native());
    rh = _mm_mul_epi32(ah.native(), bh.native());
    return combine(rl, rh);
#elif SIMDPP_USE_NEON
    int64x2 lo = vmull_s32(vget_low_s32(a.native()), vget_low_s32(b.native()));
    int64x2 hi = vmull_s32(vget_high_s32(a.native()), vget_high_s32(b.native()));
    return combine(lo, hi);
#elif SIMDPP_USE_VSX_207
#if defined(__GNUC__) && (__GNUC__ < 8)
    // BUG: GCC 7 and earlier don't implement the 32-bit widening integer
    // multiplication intrinsics, so inline assembly is used instead.
    __vector int32_t va = a.native(), vb = b.native();
    __vector int64_t vlo, vhi;
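    // vmulesw/vmulosw multiply the even/odd-indexed elements as numbered by
    // the hardware; on little-endian that numbering is reversed relative to
    // the library's element order, so the instruction pair is swapped.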
#if SIMDPP_BIG_ENDIAN
    asm("vmulesw %0, %1, %2" : "=v" (vlo) : "v" (va), "v" (vb));
    asm("vmulosw %0, %1, %2" : "=v" (vhi) : "v" (va), "v" (vb));
#else
    asm("vmulosw %0, %1, %2" : "=v" (vlo) : "v" (va), "v" (vb));
    asm("vmulesw %0, %1, %2" : "=v" (vhi) : "v" (va), "v" (vb));
#endif
    int64<2> lo = vlo, hi = vhi;
#else
    int64x2 lo = vec_vmulesw(a.native(), b.native());
    int64x2 hi = vec_vmulosw(a.native(), b.native());
#endif
    return combine(zip2_lo(lo, hi), zip2_hi(lo, hi));
#elif SIMDPP_USE_MSA
    int64<4> a64 = to_int64(a);
    int64<4> b64 = to_int64(b);
    a64.vec(0) = __msa_mulv_d(a64.vec(0).native(), b64.vec(0).native());
    a64.vec(1) = __msa_mulv_d(a64.vec(1).native(), b64.vec(1).native());
    return a64;
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, b);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int64<8> i_mull(const int32<8>& a, const int32<8>& b)
{
    int32x4 al, ah, bl, bh;
    int64x4 rl, rh;
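    // Split into 128-bit halves, sign-extend each half to 64-bit lanes, and
    // multiply with _mm256_mul_epi32, which uses the low 32 bits of each
    // 64-bit lane.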
    split(a, al, ah);
    split(b, bl, bh);

    rl = _mm256_mul_epi32(to_int64(al).eval().native(), to_int64(bl).eval().native());
    rh = _mm256_mul_epi32(to_int64(ah).eval().native(), to_int64(bh).eval().native());
    return combine(rl, rh);
}
#endif

template<unsigned N> SIMDPP_INL
int64<N> i_mull(const int32<N>& a, const int32<N>& b)
{
    int64<N> r;
    for (unsigned i = 0; i < a.vec_length; ++i) {
        detail::subvec_insert(r, mull(a.vec(i), b.vec(i)).eval(), i);
    }
    return r;
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint64<4> i_mull(const uint32<4>& a, const uint32<4>& b)
{
#if SIMDPP_USE_NULL
    uint64x4 r;
    r.vec(0).el(0) = uint64_t(a.el(0)) * b.el(0);
    r.vec(0).el(1) = uint64_t(a.el(1)) * b.el(1);
    r.vec(1).el(0) = uint64_t(a.el(2)) * b.el(2);
    r.vec(1).el(1) = uint64_t(a.el(3)) * b.el(3);
    return r;
#elif SIMDPP_USE_SSE2
    uint32x4 al, ah, bl, bh;
    uint64x2 rl, rh;
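    // Same idea as the signed SSE4.1 path above, except _mm_mul_epu32 is
    // already available in SSE2: duplicate elements so the low 32-bit half of
    // each 64-bit lane holds the operand, then multiply.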
    al = zip4_lo(a, a);
    bl = zip4_lo(b, b);
    ah = zip4_hi(a, a);
    bh = zip4_hi(b, b);
    rl = _mm_mul_epu32(al.native(), bl.native());
    rh = _mm_mul_epu32(ah.native(), bh.native());
    return combine(rl, rh);
#elif SIMDPP_USE_NEON
    uint64x2 lo = vmull_u32(vget_low_u32(a.native()), vget_low_u32(b.native()));
    uint64x2 hi = vmull_u32(vget_high_u32(a.native()), vget_high_u32(b.native()));
    return combine(lo, hi);
#elif SIMDPP_USE_VSX_207
#if defined(__GNUC__) && (__GNUC__ < 8)
    // BUG: GCC 7 and earlier don't implement the 32-bit widening integer
    // multiplication intrinsics, so inline assembly is used instead.
    __vector uint32_t va = a.native(), vb = b.native();
    __vector uint64_t vlo, vhi;
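    // Same even/odd instruction swap as in the signed version above.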
#if SIMDPP_BIG_ENDIAN
    asm("vmuleuw %0, %1, %2" : "=v" (vlo) : "v" (va), "v" (vb));
    asm("vmulouw %0, %1, %2" : "=v" (vhi) : "v" (va), "v" (vb));
#else
    asm("vmulouw %0, %1, %2" : "=v" (vlo) : "v" (va), "v" (vb));
    asm("vmuleuw %0, %1, %2" : "=v" (vhi) : "v" (va), "v" (vb));
#endif
    uint64<2> lo = vlo, hi = vhi;
#else
    uint64x2 lo = vec_vmuleuw(a.native(), b.native());
    uint64x2 hi = vec_vmulouw(a.native(), b.native());
#endif
    return combine(zip2_lo(lo, hi), zip2_hi(lo, hi));
#elif SIMDPP_USE_ALTIVEC
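    // Pre-2.07 ALTIVEC has no 32x32 -> 64-bit widening multiply instruction,
    // so fall back to scalar multiplication through a memory block.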
    mem_block<uint32<4>> ba = a;
    mem_block<uint32<4>> bb = b;
    uint64x4 r;
    r.vec(0).el(0) = (uint64_t) ba[0] * bb[0];
    r.vec(0).el(1) = (uint64_t) ba[1] * bb[1];
    r.vec(1).el(0) = (uint64_t) ba[2] * bb[2];
    r.vec(1).el(1) = (uint64_t) ba[3] * bb[3];
    return r;
#elif SIMDPP_USE_MSA
    int64<4> a64 = (int64<4>) to_uint64(a);
    int64<4> b64 = (int64<4>) to_uint64(b);
    a64.vec(0) = __msa_mulv_d(a64.vec(0).native(), b64.vec(0).native());
    a64.vec(1) = __msa_mulv_d(a64.vec(1).native(), b64.vec(1).native());
    return (uint64<4>) a64;
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint64<8> i_mull(const uint32<8>& a, const uint32<8>& b)
{
    uint32x4 al, ah, bl, bh;
    uint64x4 rl, rh;

    split(a, al, ah);
    split(b, bl, bh);

    rl = _mm256_mul_epu32(to_uint64(al).eval().native(), to_uint64(bl).eval().native());
    rh = _mm256_mul_epu32(to_uint64(ah).eval().native(), to_uint64(bh).eval().native());

    return combine(rl, rh);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint64<16> i_mull(const uint32<16>& a, const uint32<16>& b)
{
    uint32<8> al, ah, bl, bh;
    uint64<8> rl, rh;
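    // Same approach as the AVX2 path above: widen each 256-bit half to 64-bit
    // lanes and multiply the low 32 bits of each lane.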

    split(a, al, ah);
    split(b, bl, bh);

    rl = _mm512_mul_epu32(to_uint64(al).eval().native(), to_uint64(bl).eval().native());
    rh = _mm512_mul_epu32(to_uint64(ah).eval().native(), to_uint64(bh).eval().native());

    return combine(rl, rh);
}
#endif

template<unsigned N> SIMDPP_INL
uint64<N> i_mull(const uint32<N>& a, const uint32<N>& b)
{
    uint64<N> r;
    for (unsigned i = 0; i < a.vec_length; ++i) {
        detail::subvec_insert(r, mull(a.vec(i), b.vec(i)).eval(), i);
    }
    return r;
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif