1/* Copyright (C) 2013-2017 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_MUL_LO_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_MUL_LO_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/types.h>
16#include <simdpp/core/i_mull.h>
17#include <simdpp/core/move_l.h>
18#include <simdpp/core/permute4.h>
19#include <simdpp/core/shuffle2.h>
20#include <simdpp/detail/null/math.h>
21#include <simdpp/detail/vector_array_macros.h>
22
23namespace simdpp {
24namespace SIMDPP_ARCH_NAMESPACE {
25namespace detail {
26namespace insn {
27
28static SIMDPP_INL
29uint16<8> i_mul_lo(const uint16<8>& a, const uint16<8>& b)
30{
31#if SIMDPP_USE_NULL
32 return detail::null::mul(a, b);
33#elif SIMDPP_USE_SSE2
34 return _mm_mullo_epi16(a.native(), b.native());
35#elif SIMDPP_USE_NEON
36 return vmulq_u16(a.native(), b.native());
37#elif SIMDPP_USE_ALTIVEC
38 return vec_mladd(a.native(), b.native(),
39 ((uint16x8) make_zero()).native());
40#elif SIMDPP_USE_MSA
41 return (v8u16) __msa_mulv_h((v8i16) a.native(), (v8i16) b.native());
42#endif
43}
44
45#if SIMDPP_USE_AVX2
46static SIMDPP_INL
47uint16<16> i_mul_lo(const uint16<16>& a, const uint16<16>& b)
48{
49 return _mm256_mullo_epi16(a.native(), b.native());
50}
51#endif
52
53#if SIMDPP_USE_AVX512BW
54static SIMDPP_INL
55uint16<32> i_mul_lo(const uint16<32>& a, const uint16<32>& b)
56{
57 return _mm512_mullo_epi16(a.native(), b.native());
58}
59#endif
60
61// -----------------------------------------------------------------------------
62
63static SIMDPP_INL
64uint32<4> i_mul_lo(const uint32<4>& a, const uint32<4>& b)
65{
66#if SIMDPP_USE_NULL
67 return detail::null::mul(a, b);
68#elif SIMDPP_USE_SSE4_1
69 return _mm_mullo_epi32(a.native(), b.native());
70#elif SIMDPP_USE_SSE2
71 uint32x4 a1, b1, r;
72 a1 = move4_l<1>(a);
73 b1 = move4_l<1>(b);
74 r = _mm_mul_epu32(a.native(), b.native());
75 a1 = _mm_mul_epu32(a1.native(), b1.native());
76 r = shuffle2<0,2,0,2>(r, a1); // moves to FP domain, additional latency unavoidable
77 r = permute4<0,2,1,3>(r);
78 return r;
79#elif SIMDPP_USE_NEON
80 return vmulq_u32(a.native(), b.native());
81#elif SIMDPP_USE_VSX_207
82#if __GNUC__
83 // BUG: GCC does not have support for vmuluwm yet
84 __vector uint32_t va = a.native(), vb = b.native();
85 __vector uint32_t vr;
86 asm("vmuluwm %0, %1, %2" : "=v"(vr) : "v"(va), "v"(vb));
87 return vr;
88#else
89 return vec_vmuluwm(a.native(), b.native());
90#endif
91#elif SIMDPP_USE_ALTIVEC
92 // implement in terms of 16-bit multiplies
93 // * ah al
94 // bh bl
95 // ======
96 // + (al*bl) <- l_ab
97 //+ (ah*bl) <- h_ab
98 //+ (al*bh) <- h_ba
99
100 uint16<8> ra, rb; ra = a, rb = b;
101#if SIMDPP_BIG_ENDIAN
102 uint16<8> sa = move8_r<1>(ra);
103 uint16<8> sb = move8_r<1>(rb);
104
105 uint32<4> l_ab = vec_mulo(ra.native(), rb.native());
106 uint32<4> h_ab = vec_mulo(ra.native(), sb.native());
107 uint32<4> h_ba = vec_mulo(sa.native(), rb.native());
108#else
109 uint16<8> sa = move8_l<1>(ra);
110 uint16<8> sb = move8_l<1>(rb);
111
112 uint32<4> l_ab = vec_mule(ra.native(), rb.native());
113 uint32<4> h_ab = vec_mule(ra.native(), sb.native());
114 uint32<4> h_ba = vec_mule(sa.native(), rb.native());
115#endif
116
117 h_ab = shift_l<16>(add(h_ab, h_ba));
118 h_ab = add(h_ab, l_ab);
119 return h_ab;
120#elif SIMDPP_USE_MSA
121 return (v4u32) __msa_mulv_w((v4i32) a.native(), (v4i32) b.native());
122#endif
123}
124
125#if SIMDPP_USE_AVX2
126static SIMDPP_INL
127uint32<8> i_mul_lo(const uint32<8>& a, const uint32<8>& b)
128{
129 return _mm256_mullo_epi32(a.native(), b.native());
130}
131#endif
132
133#if SIMDPP_USE_AVX512F
134static SIMDPP_INL
135uint32<16> i_mul_lo(const uint32<16>& a, const uint32<16>& b)
136{
137 return _mm512_mullo_epi32(a.native(), b.native());
138}
139#endif
140
141// -----------------------------------------------------------------------------
142
143template<class V> SIMDPP_INL
144V i_mul_lo(const V& a, const V& b)
145{
146 SIMDPP_VEC_ARRAY_IMPL2(V, i_mul_lo, a, b)
147}
148
149} // namespace insn
150} // namespace detail
151} // namespace SIMDPP_ARCH_NAMESPACE
152} // namespace simdpp
153
154#endif
155
156