/*  Copyright (C) 2013-2017  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_MULL_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_MULL_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif
#include <simdpp/types.h>
#include <simdpp/detail/mem_block.h>
#include <simdpp/detail/not_implemented.h>
#include <simdpp/core/detail/subvec_insert.h>
#include <simdpp/core/to_int32.h>
#include <simdpp/core/to_int64.h>
#include <simdpp/core/i_mul.h>
#include <simdpp/core/combine.h>
#include <simdpp/core/make_uint.h>
#include <simdpp/core/split.h>
#include <simdpp/core/zip_hi.h>
#include <simdpp/core/zip_lo.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

/*  Note: widening integer multiplication instructions differ significantly
    between instruction sets. The main difference is in which half of the
    elements is selected for multiplication. Abstracting over this difference
    inevitably incurs some overhead on certain instruction sets.

    - SSE2-SSE4.1, AVX2, ALTIVEC and VSX provide only instructions with
      interfaces similar to mul_lo and mul_hi. The result vectors must be
      interleaved to obtain contiguous result values. Multiplying two vectors
      always incurs an overhead of at least two interleaving instructions.

    - Base AVX512F only provides 32-bit integer support. Widening
      multiplication can be done only by using PMULDQ, which multiplies every
      other element and produces widened results. Multiplying two whole
      vectors always incurs an overhead of at least two shifts or
      interleaving instructions.

    - NEON and NEONv2 provide instructions that take elements of either the
      lower or the higher halves of two 128-bit vectors and multiply them.
      No additional overhead is incurred to obtain contiguous result values.

    The abstraction below uses the NEON model (see the illustrative sketch
    after this comment). No additional overhead is incurred on SSE/AVX and
    NEON. On ALTIVEC, a single additional permute instruction is needed for
    each vector multiplication on average.
*/
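
/*  Illustrative sketch only (not part of the implementation): on every
    target, i_mull behaves as if each result element were the exact product of
    the corresponding input elements, widened to twice the element width. For
    the int16<8> overload this corresponds to the NULL reference loop used
    below:

        for (unsigned i = 0; i < 8; i++)
            r.vec(i/4).el(i%4) = int32_t(a.el(i)) * b.el(i);

    Results are stored contiguously and in element order, following the NEON
    model described above.
*/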

static SIMDPP_INL
int32<8> i_mull(const int16<8>& a, const int16<8>& b)
{
#if SIMDPP_USE_NULL
    int32x8 r;
    for (unsigned i = 0; i < 8; i++) {
        r.vec(i/4).el(i%4) = int32_t(a.el(i)) * b.el(i);
    }
    return r;
#elif SIMDPP_USE_SSE2
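    // _mm_mullo_epi16 and _mm_mulhi_epi16 return the low and high 16-bit
    // halves of each 32-bit product; interleaving the two vectors reassembles
    // the contiguous widened results.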
    int16x8 lo = _mm_mullo_epi16(a.native(), b.native());
    int16x8 hi = _mm_mulhi_epi16(a.native(), b.native());
    return (int32x8) combine(zip8_lo(lo, hi), zip8_hi(lo, hi));
#elif SIMDPP_USE_NEON
    int32x4 lo = vmull_s16(vget_low_s16(a.native()), vget_low_s16(b.native()));
    int32x4 hi = vmull_s16(vget_high_s16(a.native()), vget_high_s16(b.native()));
    return combine(lo, hi);
#elif SIMDPP_USE_ALTIVEC
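    // vec_mule and vec_mulo each multiply every other 16-bit element;
    // interleaving the two result vectors restores contiguous element order.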
    int32x4 lo = vec_mule(a.native(), b.native());
    int32x4 hi = vec_mulo(a.native(), b.native());
    return combine(zip4_lo(lo, hi), zip4_hi(lo, hi));
#elif SIMDPP_USE_MSA
    int32<8> a32 = to_int32(a);
    int32<8> b32 = to_int32(b);
    a32.vec(0) = __msa_mulv_w(a32.vec(0).native(), b32.vec(0).native());
    a32.vec(1) = __msa_mulv_w(a32.vec(1).native(), b32.vec(1).native());
    return a32;
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int32<16> i_mull(const int16<16>& a, const int16<16>& b)
{
    int16<16> as, bs, lo, hi;
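    // zip8_lo/zip8_hi interleave within 128-bit lanes on AVX2, so the 64-bit
    // quarters of the inputs are pre-shuffled to make the lane-wise
    // interleave produce results in element order.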
    as = _mm256_permute4x64_epi64(a.native(), _MM_SHUFFLE(3,1,2,0));
    bs = _mm256_permute4x64_epi64(b.native(), _MM_SHUFFLE(3,1,2,0));
    lo = _mm256_mullo_epi16(as.native(), bs.native());
    hi = _mm256_mulhi_epi16(as.native(), bs.native());
    return (int32<16>) combine(zip8_lo(lo, hi), zip8_hi(lo, hi));
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
int32<32> i_mull(const int16<32>& a, const int16<32>& b)
{
    int16<32> as, bs, lo, hi;
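    // Same approach as the AVX2 path: pre-shuffle the 64-bit quarters so that
    // the in-lane interleave of the low/high product halves yields results in
    // element order.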
    int64<8> idx = make_uint(0, 4, 1, 5, 2, 6, 3, 7);
    as = _mm512_permutexvar_epi64(idx.native(), a.native());
    bs = _mm512_permutexvar_epi64(idx.native(), b.native());
    lo = _mm512_mullo_epi16(as.native(), bs.native());
    hi = _mm512_mulhi_epi16(as.native(), bs.native());
    return (int32<32>) combine(zip8_lo(lo, hi), zip8_hi(lo, hi));
}
#endif

template<unsigned N> SIMDPP_INL
int32<N> i_mull(const int16<N>& a, const int16<N>& b)
{
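    // Generic fallback: multiply each native-width subvector separately and
    // insert the widened results into the destination.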
    int32<N> r;
    for (unsigned i = 0; i < a.vec_length; ++i) {
        detail::subvec_insert(r, mull(a.vec(i), b.vec(i)).eval(), i);
    }
    return r;
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint32<8> i_mull(const uint16<8>& a, const uint16<8>& b)
{
#if SIMDPP_USE_NULL
    uint32x8 r;
    for (unsigned i = 0; i < 8; i++) {
        r.vec(i/4).el(i%4) = uint32_t(a.el(i)) * b.el(i);
    }
    return r;
#elif SIMDPP_USE_SSE2
    uint16x8 lo = _mm_mullo_epi16(a.native(), b.native());
    uint16x8 hi = _mm_mulhi_epu16(a.native(), b.native());
    return (uint32x8) combine(zip8_lo(lo, hi), zip8_hi(lo, hi));
#elif SIMDPP_USE_NEON
    uint32x4 lo = vmull_u16(vget_low_u16(a.native()), vget_low_u16(b.native()));
    uint32x4 hi = vmull_u16(vget_high_u16(a.native()), vget_high_u16(b.native()));
    return combine(lo, hi);
#elif SIMDPP_USE_ALTIVEC
    uint32x4 lo = vec_mule(a.native(), b.native());
    uint32x4 hi = vec_mulo(a.native(), b.native());
    return combine(zip4_lo(lo, hi), zip4_hi(lo, hi));
#elif SIMDPP_USE_MSA
    int32<8> a32 = (int32<8>) to_uint32(a);
    int32<8> b32 = (int32<8>) to_uint32(b);
    a32.vec(0) = __msa_mulv_w(a32.vec(0).native(), b32.vec(0).native());
    a32.vec(1) = __msa_mulv_w(a32.vec(1).native(), b32.vec(1).native());
    return uint32<8>(a32);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32<16> i_mull(const uint16<16>& a, const uint16<16>& b)
{
    uint16<16> as, bs, lo, hi;
    as = _mm256_permute4x64_epi64(a.native(), _MM_SHUFFLE(3,1,2,0));
    bs = _mm256_permute4x64_epi64(b.native(), _MM_SHUFFLE(3,1,2,0));
    lo = _mm256_mullo_epi16(as.native(), bs.native());
    hi = _mm256_mulhi_epu16(as.native(), bs.native());
    return (uint32<16>) combine(zip8_lo(lo, hi), zip8_hi(lo, hi));
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
uint32<32> i_mull(const uint16<32>& a, const uint16<32>& b)
{
    uint16<32> as, bs, lo, hi;
    uint64<8> idx = make_uint(0, 4, 1, 5, 2, 6, 3, 7);
    as = _mm512_permutexvar_epi64(idx.native(), a.native());
    bs = _mm512_permutexvar_epi64(idx.native(), b.native());
    lo = _mm512_mullo_epi16(as.native(), bs.native());
    hi = _mm512_mulhi_epu16(as.native(), bs.native());
    return (uint32<32>) combine(zip8_lo(lo, hi), zip8_hi(lo, hi));
}
#endif

template<unsigned N> SIMDPP_INL
uint32<N> i_mull(const uint16<N>& a, const uint16<N>& b)
{
    uint32<N> r;
    for (unsigned i = 0; i < a.vec_length; ++i) {
        detail::subvec_insert(r, mull(a.vec(i), b.vec(i)).eval(), i);
    }
    return r;
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
int64<4> i_mull(const int32<4>& a, const int32<4>& b)
{
#if SIMDPP_USE_NULL
    int64x4 r;
    r.vec(0).el(0) = int64_t(a.el(0)) * b.el(0);
    r.vec(0).el(1) = int64_t(a.el(1)) * b.el(1);
    r.vec(1).el(0) = int64_t(a.el(2)) * b.el(2);
    r.vec(1).el(1) = int64_t(a.el(3)) * b.el(3);
    return r;
#elif SIMDPP_USE_SSE4_1
    int32x4 al, ah, bl, bh;
    int64x2 rl, rh;
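    // _mm_mul_epi32 multiplies the low 32-bit element of each 64-bit lane;
    // duplicating the inputs via zip4_lo/zip4_hi places every source element
    // into such a position.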
    al = zip4_lo(a, a);
    bl = zip4_lo(b, b);
    ah = zip4_hi(a, a);
    bh = zip4_hi(b, b);
    rl = _mm_mul_epi32(al.native(), bl.native());
    rh = _mm_mul_epi32(ah.native(), bh.native());
    return combine(rl, rh);
#elif SIMDPP_USE_NEON
    int64x2 lo = vmull_s32(vget_low_s32(a.native()), vget_low_s32(b.native()));
    int64x2 hi = vmull_s32(vget_high_s32(a.native()), vget_high_s32(b.native()));
    return combine(lo, hi);
#elif SIMDPP_USE_VSX_207
#if defined(__GNUC__) && (__GNUC__ < 8)
    // BUG: GCC 7 and earlier don't implement 32-bit widening integer
    // multiplication; use inline assembly instead.
    __vector int32_t va = a.native(), vb = b.native();
    __vector int64_t vlo, vhi;
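    // Vector element numbering differs between big- and little-endian VSX,
    // so the even/odd widening multiplies swap roles depending on endianness.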
#if SIMDPP_BIG_ENDIAN
    asm("vmulesw %0, %1, %2" : "=v"(vlo) : "v"(va), "v"(vb));
    asm("vmulosw %0, %1, %2" : "=v"(vhi) : "v"(va), "v"(vb));
#else
    asm("vmulosw %0, %1, %2" : "=v"(vlo) : "v"(va), "v"(vb));
    asm("vmulesw %0, %1, %2" : "=v"(vhi) : "v"(va), "v"(vb));
#endif
    int64<2> lo = vlo, hi = vhi;
#else
    int64x2 lo = vec_vmulesw(a.native(), b.native());
    int64x2 hi = vec_vmulosw(a.native(), b.native());
#endif
    return combine(zip2_lo(lo, hi), zip2_hi(lo, hi));
#elif SIMDPP_USE_MSA
    int64<4> a64 = to_int64(a);
    int64<4> b64 = to_int64(b);
    a64.vec(0) = __msa_mulv_d(a64.vec(0).native(), b64.vec(0).native());
    a64.vec(1) = __msa_mulv_d(a64.vec(1).native(), b64.vec(1).native());
    return a64;
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, b);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int64<8> i_mull(const int32<8>& a, const int32<8>& b)
{
    int32x4 al, ah, bl, bh;
    int64x4 rl, rh;
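    // Sign-extend each 128-bit half to 64-bit lanes; _mm256_mul_epi32 then
    // multiplies the low 32 bits of every 64-bit lane, yielding full 64-bit
    // products.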
    split(a, al, ah);
    split(b, bl, bh);

    rl = _mm256_mul_epi32(to_int64(al).eval().native(), to_int64(bl).eval().native());
    rh = _mm256_mul_epi32(to_int64(ah).eval().native(), to_int64(bh).eval().native());
    return combine(rl, rh);
}
#endif

template<unsigned N> SIMDPP_INL
int64<N> i_mull(const int32<N>& a, const int32<N>& b)
{
    int64<N> r;
    for (unsigned i = 0; i < a.vec_length; ++i) {
        detail::subvec_insert(r, mull(a.vec(i), b.vec(i)).eval(), i);
    }
    return r;
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint64<4> i_mull(const uint32<4>& a, const uint32<4>& b)
{
#if SIMDPP_USE_NULL
    uint64x4 r;
    r.vec(0).el(0) = uint64_t(a.el(0)) * b.el(0);
    r.vec(0).el(1) = uint64_t(a.el(1)) * b.el(1);
    r.vec(1).el(0) = uint64_t(a.el(2)) * b.el(2);
    r.vec(1).el(1) = uint64_t(a.el(3)) * b.el(3);
    return r;
#elif SIMDPP_USE_SSE2
    uint32x4 al, ah, bl, bh;
    uint64x2 rl, rh;
    al = zip4_lo(a, a);
    bl = zip4_lo(b, b);
    ah = zip4_hi(a, a);
    bh = zip4_hi(b, b);
    rl = _mm_mul_epu32(al.native(), bl.native());
    rh = _mm_mul_epu32(ah.native(), bh.native());
    return combine(rl, rh);
#elif SIMDPP_USE_NEON
    uint64x2 lo = vmull_u32(vget_low_u32(a.native()), vget_low_u32(b.native()));
    uint64x2 hi = vmull_u32(vget_high_u32(a.native()), vget_high_u32(b.native()));
    return combine(lo, hi);
#elif SIMDPP_USE_VSX_207
#if defined(__GNUC__) && (__GNUC__ < 8)
    // BUG: GCC 7 and earlier don't implement 32-bit widening integer
    // multiplication; use inline assembly instead.
    __vector uint32_t va = a.native(), vb = b.native();
    __vector uint64_t vlo, vhi;
#if SIMDPP_BIG_ENDIAN
    asm("vmuleuw %0, %1, %2" : "=v"(vlo) : "v"(va), "v"(vb));
    asm("vmulouw %0, %1, %2" : "=v"(vhi) : "v"(va), "v"(vb));
#else
    asm("vmulouw %0, %1, %2" : "=v"(vlo) : "v"(va), "v"(vb));
    asm("vmuleuw %0, %1, %2" : "=v"(vhi) : "v"(va), "v"(vb));
#endif
    uint64<2> lo = vlo, hi = vhi;
#else
    uint64x2 lo = vec_vmuleuw(a.native(), b.native());
    uint64x2 hi = vec_vmulouw(a.native(), b.native());
#endif
    return combine(zip2_lo(lo, hi), zip2_hi(lo, hi));
#elif SIMDPP_USE_ALTIVEC
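    // Pre-VSX ALTIVEC has no 32x32 -> 64-bit widening multiply, so fall back
    // to scalar multiplication through memory.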
    mem_block<uint32<4>> ba = a;
    mem_block<uint32<4>> bb = b;
    uint64x4 r;
    r.vec(0).el(0) = (uint64_t) ba[0] * bb[0];
    r.vec(0).el(1) = (uint64_t) ba[1] * bb[1];
    r.vec(1).el(0) = (uint64_t) ba[2] * bb[2];
    r.vec(1).el(1) = (uint64_t) ba[3] * bb[3];
    return r;
#elif SIMDPP_USE_MSA
    int64<4> a64 = (int64<4>) to_uint64(a);
    int64<4> b64 = (int64<4>) to_uint64(b);
    a64.vec(0) = __msa_mulv_d(a64.vec(0).native(), b64.vec(0).native());
    a64.vec(1) = __msa_mulv_d(a64.vec(1).native(), b64.vec(1).native());
    return (uint64<4>) a64;
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint64<8> i_mull(const uint32<8>& a, const uint32<8>& b)
{
    uint32x4 al, ah, bl, bh;
    uint64x4 rl, rh;

    split(a, al, ah);
    split(b, bl, bh);

    rl = _mm256_mul_epu32(to_uint64(al).eval().native(), to_uint64(bl).eval().native());
    rh = _mm256_mul_epu32(to_uint64(ah).eval().native(), to_uint64(bh).eval().native());

    return combine(rl, rh);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint64<16> i_mull(const uint32<16>& a, const uint32<16>& b)
{
    uint32<8> al, ah, bl, bh;
    uint64<8> rl, rh;

    split(a, al, ah);
    split(b, bl, bh);

    rl = _mm512_mul_epu32(to_uint64(al).eval().native(), to_uint64(bl).eval().native());
    rh = _mm512_mul_epu32(to_uint64(ah).eval().native(), to_uint64(bh).eval().native());

    return combine(rl, rh);
}
#endif

template<unsigned N> SIMDPP_INL
uint64<N> i_mull(const uint32<N>& a, const uint32<N>& b)
{
    uint64<N> r;
    for (unsigned i = 0; i < a.vec_length; ++i) {
        detail::subvec_insert(r, mull(a.vec(i), b.vec(i)).eval(), i);
    }
    return r;
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif