/*  Copyright (C) 2013-2017  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_MULL_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_MULL_H

#ifndef LIBSIMDPP_SIMD_H
#error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/mem_block.h>
#include <simdpp/detail/not_implemented.h>
#include <simdpp/core/detail/subvec_insert.h>
#include <simdpp/core/to_int32.h>
#include <simdpp/core/to_int64.h>
#include <simdpp/core/i_mul.h>
#include <simdpp/core/combine.h>
#include <simdpp/core/zip_hi.h>
#include <simdpp/core/zip_lo.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

/*  Note: widening integer multiplication instructions differ significantly
    between instruction sets. The main difference is which half of the
    elements is selected for multiplication, so any portable abstraction
    incurs some overhead.

    - SSE2-SSE4.1, AVX2, ALTIVEC and VSX provide only instructions with
      interfaces similar to mul_lo and mul_hi. The result vectors must be
      interleaved to obtain contiguous result values. Multiplying two vectors
      always incurs an overhead of at least two interleaving instructions.

    - AVX512F (without AVX512BW) provides only 32-bit integer support.
      Widening multiplication can be done only with PMULDQ, which multiplies
      the low 32-bit element of each 64-bit lane and produces widened
      results. Multiplying two whole vectors always incurs an overhead of at
      least two shifts or interleaving instructions.

    - NEON and NEONv2 provide instructions that multiply the elements of
      either the lower or the upper half of two 128-bit vectors. No
      additional overhead is incurred to obtain contiguous result values.

    The abstraction below uses the NEON model. No additional overhead is
    incurred on SSE/AVX and NEON. On ALTIVEC, on average a single additional
    permute instruction is needed per vector multiplication.
*/
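
//  Semantics sketch (illustration only): every i_mull overload below returns
//  the full-width product of each element pair, so for 16-bit inputs element
//  i of the result is int32_t(a.el(i)) * int32_t(b.el(i)) and no wrap-around
//  can occur. Assuming the public simdpp::mull wrapper, which dispatches to
//  these overloads, a typical use looks roughly like:
//
//      uint16<8> a = make_uint(40000), b = make_uint(3);
//      uint32<8> r = mull(a, b);   // every element holds 120000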

static SIMDPP_INL
int32<8> i_mull(const int16<8>& a, const int16<8>& b)
{
#if SIMDPP_USE_NULL
    int32x8 r;
    for (unsigned i = 0; i < 8; i++) {
        r.vec(i/4).el(i%4) = int32_t(a.el(i)) * b.el(i);
    }
    return r;
#elif SIMDPP_USE_SSE2
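    // _mm_mullo_epi16 and _mm_mulhi_epi16 produce the low and high 16-bit
    // halves of each 16x16->32 product in separate vectors; zipping them
    // back together restores the full products in element order.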
    int16x8 lo = _mm_mullo_epi16(a.native(), b.native());
    int16x8 hi = _mm_mulhi_epi16(a.native(), b.native());
    return (int32x8) combine(zip8_lo(lo, hi), zip8_hi(lo, hi));
#elif SIMDPP_USE_NEON
    int32x4 lo = vmull_s16(vget_low_s16(a.native()), vget_low_s16(b.native()));
    int32x4 hi = vmull_s16(vget_high_s16(a.native()), vget_high_s16(b.native()));
    return combine(lo, hi);
#elif SIMDPP_USE_ALTIVEC
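    // vec_mule and vec_mulo multiply the even- and odd-indexed element pairs
    // respectively, so the two result vectors must be re-interleaved to get
    // the products back into element order.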
    int32x4 lo = vec_mule(a.native(), b.native());
    int32x4 hi = vec_mulo(a.native(), b.native());
    return combine(zip4_lo(lo, hi), zip4_hi(lo, hi));
#elif SIMDPP_USE_MSA
    int32<8> a32 = to_int32(a);
    int32<8> b32 = to_int32(b);
    a32.vec(0) = __msa_mulv_w(a32.vec(0).native(), b32.vec(0).native());
    a32.vec(1) = __msa_mulv_w(a32.vec(1).native(), b32.vec(1).native());
    return a32;
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int32<16> i_mull(const int16<16>& a, const int16<16>& b)
{
    int16<16> as, bs, lo, hi;
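    // AVX2 zip instructions interleave within each 128-bit lane only.
    // Pre-permuting the 64-bit quarters into 0,2,1,3 order makes the
    // lane-local zips below emit the widened products in contiguous order.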
    as = _mm256_permute4x64_epi64(a.native(), _MM_SHUFFLE(3,1,2,0));
    bs = _mm256_permute4x64_epi64(b.native(), _MM_SHUFFLE(3,1,2,0));
    lo = _mm256_mullo_epi16(as.native(), bs.native());
    hi = _mm256_mulhi_epi16(as.native(), bs.native());
    return (int32<16>) combine(zip8_lo(lo, hi), zip8_hi(lo, hi));
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
int32<32> i_mull(const int16<32>& a, const int16<32>& b)
{
    int16<32> as, bs, lo, hi;
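    // As in the AVX2 version, the zips interleave within 128-bit lanes only.
    // The cross-lane permute below arranges the 64-bit chunks so that the
    // lane-local zips produce contiguous results.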
    int64<8> idx = make_uint(0, 4, 1, 5, 2, 6, 3, 7);
    as = _mm512_permutexvar_epi64(idx.native(), a.native());
    bs = _mm512_permutexvar_epi64(idx.native(), b.native());
    lo = _mm512_mullo_epi16(as.native(), bs.native());
    hi = _mm512_mulhi_epi16(as.native(), bs.native());
    return (int32<32>) combine(zip8_lo(lo, hi), zip8_hi(lo, hi));
}
#endif

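// For vector types wider than the natively supported width, multiply
// subvector by subvector and insert the widened results into place.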
template<unsigned N> SIMDPP_INL
int32<N> i_mull(const int16<N>& a, const int16<N>& b)
{
    int32<N> r;
    for (unsigned i = 0; i < a.vec_length; ++i) {
        detail::subvec_insert(r, mull(a.vec(i), b.vec(i)).eval(), i);
    }
    return r;
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint32<8> i_mull(const uint16<8>& a, const uint16<8>& b)
{
#if SIMDPP_USE_NULL
    uint32x8 r;
    for (unsigned i = 0; i < 8; i++) {
        r.vec(i/4).el(i%4) = uint32_t(a.el(i)) * b.el(i);
    }
    return r;
#elif SIMDPP_USE_SSE2
    int16x8 lo = _mm_mullo_epi16(a.native(), b.native());
    int16x8 hi = _mm_mulhi_epu16(a.native(), b.native());
    return (uint32x8) combine(zip8_lo(lo, hi), zip8_hi(lo, hi));
#elif SIMDPP_USE_NEON
    uint32x4 lo = vmull_u16(vget_low_u16(a.native()), vget_low_u16(b.native()));
    uint32x4 hi = vmull_u16(vget_high_u16(a.native()), vget_high_u16(b.native()));
    return combine(lo, hi);
#elif SIMDPP_USE_ALTIVEC
    uint32x4 lo = vec_mule(a.native(), b.native());
    uint32x4 hi = vec_mulo(a.native(), b.native());
    return combine(zip4_lo(lo, hi), zip4_hi(lo, hi));
#elif SIMDPP_USE_MSA
    int32<8> a32 = (int32<8>) to_uint32(a);
    int32<8> b32 = (int32<8>) to_uint32(b);
    a32.vec(0) = __msa_mulv_w(a32.vec(0).native(), b32.vec(0).native());
    a32.vec(1) = __msa_mulv_w(a32.vec(1).native(), b32.vec(1).native());
    return uint32<8>(a32);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32<16> i_mull(const uint16<16>& a, const uint16<16>& b)
{
    uint16<16> as, bs, lo, hi;
    as = _mm256_permute4x64_epi64(a.native(), _MM_SHUFFLE(3,1,2,0));
    bs = _mm256_permute4x64_epi64(b.native(), _MM_SHUFFLE(3,1,2,0));
    lo = _mm256_mullo_epi16(as.native(), bs.native());
    hi = _mm256_mulhi_epu16(as.native(), bs.native());
    return (uint32<16>) combine(zip8_lo(lo, hi), zip8_hi(lo, hi));
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
uint32<32> i_mull(const uint16<32>& a, const uint16<32>& b)
{
    uint16<32> as, bs, lo, hi;
    uint64<8> idx = make_uint(0, 4, 1, 5, 2, 6, 3, 7);
    as = _mm512_permutexvar_epi64(idx.native(), a.native());
    bs = _mm512_permutexvar_epi64(idx.native(), b.native());
    lo = _mm512_mullo_epi16(as.native(), bs.native());
    hi = _mm512_mulhi_epu16(as.native(), bs.native());
    return (uint32<32>) combine(zip8_lo(lo, hi), zip8_hi(lo, hi));
}
#endif

template<unsigned N> SIMDPP_INL
uint32<N> i_mull(const uint16<N>& a, const uint16<N>& b)
{
    uint32<N> r;
    for (unsigned i = 0; i < a.vec_length; ++i) {
        detail::subvec_insert(r, mull(a.vec(i), b.vec(i)).eval(), i);
    }
    return r;
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
int64<4> i_mull(const int32<4>& a, const int32<4>& b)
{
#if SIMDPP_USE_NULL
    int64x4 r;
    r.vec(0).el(0) = int64_t(a.el(0)) * b.el(0);
    r.vec(0).el(1) = int64_t(a.el(1)) * b.el(1);
    r.vec(1).el(0) = int64_t(a.el(2)) * b.el(2);
    r.vec(1).el(1) = int64_t(a.el(3)) * b.el(3);
    return r;
#elif SIMDPP_USE_SSE4_1
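    // PMULDQ multiplies only the low 32-bit element of each 64-bit lane.
    // Zipping each vector with itself duplicates every element, so the two
    // multiplications below see the operands in element order.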
    int32x4 al, ah, bl, bh;
    int64x2 rl, rh;
    al = zip4_lo(a, a);
    bl = zip4_lo(b, b);
    ah = zip4_hi(a, a);
    bh = zip4_hi(b, b);
    rl = _mm_mul_epi32(al.native(), bl.native());
    rh = _mm_mul_epi32(ah.native(), bh.native());
    return combine(rl, rh);
#elif SIMDPP_USE_NEON
    int64x2 lo = vmull_s32(vget_low_s32(a.native()), vget_low_s32(b.native()));
    int64x2 hi = vmull_s32(vget_high_s32(a.native()), vget_high_s32(b.native()));
    return combine(lo, hi);
#elif SIMDPP_USE_VSX_207
#if defined(__GNUC__) && (__GNUC__ < 8)
    // BUG: GCC 7 and earlier don't implement the 32-bit widening multiply
    // intrinsics, so fall back to inline assembly.
    __vector int32_t va = a.native(), vb = b.native();
    __vector int64_t vlo, vhi;
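    // vmulesw/vmulosw multiply the even- and odd-indexed element pairs, but
    // element numbering is reversed between big- and little-endian, so the
    // roles of the two results are swapped accordingly.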
#if SIMDPP_BIG_ENDIAN
    asm("vmulesw %0, %1, %2" : "=v" (vlo) : "v" (va), "v" (vb));
    asm("vmulosw %0, %1, %2" : "=v" (vhi) : "v" (va), "v" (vb));
#else
    asm("vmulosw %0, %1, %2" : "=v" (vlo) : "v" (va), "v" (vb));
    asm("vmulesw %0, %1, %2" : "=v" (vhi) : "v" (va), "v" (vb));
#endif
    int64<2> lo = vlo, hi = vhi;
#else
    int64x2 lo = vec_vmulesw(a.native(), b.native());
    int64x2 hi = vec_vmulosw(a.native(), b.native());
#endif
    return combine(zip2_lo(lo, hi), zip2_hi(lo, hi));
#elif SIMDPP_USE_MSA
    int64<4> a64 = to_int64(a);
    int64<4> b64 = to_int64(b);
    a64.vec(0) = __msa_mulv_d(a64.vec(0).native(), b64.vec(0).native());
    a64.vec(1) = __msa_mulv_d(a64.vec(1).native(), b64.vec(1).native());
    return a64;
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, b);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int64<8> i_mull(const int32<8>& a, const int32<8>& b)
{
    int32x4 al, ah, bl, bh;
    int64x4 rl, rh;
    split(a, al, ah);
    split(b, bl, bh);

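    // Sign-extending each half to 64-bit elements places every operand in
    // the low 32 bits of a 64-bit lane, which is exactly what
    // _mm256_mul_epi32 reads.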
    rl = _mm256_mul_epi32(to_int64(al).eval().native(), to_int64(bl).eval().native());
    rh = _mm256_mul_epi32(to_int64(ah).eval().native(), to_int64(bh).eval().native());
    return combine(rl, rh);
}
#endif

template<unsigned N> SIMDPP_INL
int64<N> i_mull(const int32<N>& a, const int32<N>& b)
{
    int64<N> r;
    for (unsigned i = 0; i < a.vec_length; ++i) {
        detail::subvec_insert(r, mull(a.vec(i), b.vec(i)).eval(), i);
    }
    return r;
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint64<4> i_mull(const uint32<4>& a, const uint32<4>& b)
{
#if SIMDPP_USE_NULL
    uint64x4 r;
    r.vec(0).el(0) = uint64_t(a.el(0)) * b.el(0);
    r.vec(0).el(1) = uint64_t(a.el(1)) * b.el(1);
    r.vec(1).el(0) = uint64_t(a.el(2)) * b.el(2);
    r.vec(1).el(1) = uint64_t(a.el(3)) * b.el(3);
    return r;
#elif SIMDPP_USE_SSE2
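    // Same approach as the signed SSE4.1 path above: duplicate the elements
    // via zips so that PMULUDQ, which reads the low 32-bit element of each
    // 64-bit lane, sees the operands in element order.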
    uint32x4 al, ah, bl, bh;
    uint64x2 rl, rh;
    al = zip4_lo(a, a);
    bl = zip4_lo(b, b);
    ah = zip4_hi(a, a);
    bh = zip4_hi(b, b);
    rl = _mm_mul_epu32(al.native(), bl.native());
    rh = _mm_mul_epu32(ah.native(), bh.native());
    return combine(rl, rh);
#elif SIMDPP_USE_NEON
    uint64x2 lo = vmull_u32(vget_low_u32(a.native()), vget_low_u32(b.native()));
    uint64x2 hi = vmull_u32(vget_high_u32(a.native()), vget_high_u32(b.native()));
    return combine(lo, hi);
#elif SIMDPP_USE_VSX_207
#if defined(__GNUC__) && (__GNUC__ < 8)
    // BUG: GCC 7 and earlier don't implement the 32-bit widening multiply
    // intrinsics, so fall back to inline assembly. The even/odd variants are
    // swapped on little-endian, as in the signed version above.
    __vector uint32_t va = a.native(), vb = b.native();
    __vector uint64_t vlo, vhi;
#if SIMDPP_BIG_ENDIAN
    asm("vmuleuw %0, %1, %2" : "=v" (vlo) : "v" (va), "v" (vb));
    asm("vmulouw %0, %1, %2" : "=v" (vhi) : "v" (va), "v" (vb));
#else
    asm("vmulouw %0, %1, %2" : "=v" (vlo) : "v" (va), "v" (vb));
    asm("vmuleuw %0, %1, %2" : "=v" (vhi) : "v" (va), "v" (vb));
#endif
    uint64<2> lo = vlo, hi = vhi;
#else
    uint64x2 lo = vec_vmuleuw(a.native(), b.native());
    uint64x2 hi = vec_vmulouw(a.native(), b.native());
#endif
    return combine(zip2_lo(lo, hi), zip2_hi(lo, hi));
#elif SIMDPP_USE_ALTIVEC
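    // Pre-2.07 ALTIVEC has no 32-bit widening multiply, so compute the
    // products in scalar code through a memory round-trip.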
    mem_block<uint32<4>> ba = a;
    mem_block<uint32<4>> bb = b;
    uint64x4 r;
    r.vec(0).el(0) = (uint64_t) ba[0] * bb[0];
    r.vec(0).el(1) = (uint64_t) ba[1] * bb[1];
    r.vec(1).el(0) = (uint64_t) ba[2] * bb[2];
    r.vec(1).el(1) = (uint64_t) ba[3] * bb[3];
    return r;
#elif SIMDPP_USE_MSA
    int64<4> a64 = (int64<4>) to_uint64(a);
    int64<4> b64 = (int64<4>) to_uint64(b);
    a64.vec(0) = __msa_mulv_d(a64.vec(0).native(), b64.vec(0).native());
    a64.vec(1) = __msa_mulv_d(a64.vec(1).native(), b64.vec(1).native());
    return (uint64<4>) a64;
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint64<8> i_mull(const uint32<8>& a, const uint32<8>& b)
{
    uint32x4 al, ah, bl, bh;
    uint64x4 rl, rh;

    split(a, al, ah);
    split(b, bl, bh);

    rl = _mm256_mul_epu32(to_uint64(al).eval().native(), to_uint64(bl).eval().native());
    rh = _mm256_mul_epu32(to_uint64(ah).eval().native(), to_uint64(bh).eval().native());

    return combine(rl, rh);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint64<16> i_mull(const uint32<16>& a, const uint32<16>& b)
{
    uint32<8> al, ah, bl, bh;
    uint64<8> rl, rh;

    split(a, al, ah);
    split(b, bl, bh);

    rl = _mm512_mul_epu32(to_uint64(al).eval().native(), to_uint64(bl).eval().native());
    rh = _mm512_mul_epu32(to_uint64(ah).eval().native(), to_uint64(bh).eval().native());

    return combine(rl, rh);
}
#endif

template<unsigned N> SIMDPP_INL
uint64<N> i_mull(const uint32<N>& a, const uint32<N>& b)
{
    uint64<N> r;
    for (unsigned i = 0; i < a.vec_length; ++i) {
        detail::subvec_insert(r, mull(a.vec(i), b.vec(i)).eval(), i);
    }
    return r;
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif