/* Copyright (C) 2013-2017 Povilas Kanapickas <povilas@radix.lt>

   Distributed under the Boost Software License, Version 1.0.
   (See accompanying file LICENSE_1_0.txt or copy at
   http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_MUL_LO_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_MUL_LO_H

#ifndef LIBSIMDPP_SIMD_H
#error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/i_mull.h>
#include <simdpp/core/move_l.h>
#include <simdpp/core/permute4.h>
#include <simdpp/core/shuffle2.h>
#include <simdpp/detail/null/math.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

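// i_mul_lo computes an element-wise multiply that keeps only the low half of
// each product: r[i] = (a[i] * b[i]) mod 2^N for N-bit elements. Callers
// normally reach it through the public simdpp::mul_lo wrapper; a minimal
// usage sketch (variable names are illustrative):
//
//     uint16<8> r = mul_lo(a, b); // r[i] = (a[i] * b[i]) & 0xffff
//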
static SIMDPP_INL
uint16<8> i_mul_lo(const uint16<8>& a, const uint16<8>& b)
{
#if SIMDPP_USE_NULL
    return detail::null::mul(a, b);
#elif SIMDPP_USE_SSE2
    return _mm_mullo_epi16(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vmulq_u16(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_mladd(a.native(), b.native(),
                     ((uint16x8) make_zero()).native());
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_mulv_h((v8i16) a.native(), (v8i16) b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint16<16> i_mul_lo(const uint16<16>& a, const uint16<16>& b)
{
    return _mm256_mullo_epi16(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
uint16<32> i_mul_lo(const uint16<32>& a, const uint16<32>& b)
{
    return _mm512_mullo_epi16(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint32<4> i_mul_lo(const uint32<4>& a, const uint32<4>& b)
{
#if SIMDPP_USE_NULL
    return detail::null::mul(a, b);
#elif SIMDPP_USE_SSE4_1
    return _mm_mullo_epi32(a.native(), b.native());
#elif SIMDPP_USE_SSE2
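    // SSE2 has no 32-bit low multiply. _mm_mul_epu32 produces full 64-bit
    // products, but only for the even (0 and 2) lanes. Multiply the even
    // lanes, shift both operands down by one lane and multiply again to
    // cover the odd lanes, then gather the low 32 bits of all four products.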
    uint32x4 a1, b1, r;
    a1 = move4_l<1>(a);
    b1 = move4_l<1>(b);
    r = _mm_mul_epu32(a.native(), b.native());
    a1 = _mm_mul_epu32(a1.native(), b1.native());
    r = shuffle2<0,2,0,2>(r, a1); // moves to FP domain, additional latency unavoidable
    r = permute4<0,2,1,3>(r);
    return r;
#elif SIMDPP_USE_NEON
    return vmulq_u32(a.native(), b.native());
#elif SIMDPP_USE_VSX_207
#if __GNUC__
    // BUG: GCC does not provide the vec_vmuluwm intrinsic yet, so emit the
    // instruction directly
    __vector uint32_t va = a.native(), vb = b.native();
    __vector uint32_t vr;
    asm("vmuluwm %0, %1, %2" : "=v" (vr) : "v" (va), "v" (vb));
    return vr;
#else
    return vec_vmuluwm(a.native(), b.native());
#endif
#elif SIMDPP_USE_ALTIVEC
    // implement in terms of 16-bit multiplies:
    //            ah al
    //        *   bh bl
    //        ==========
    //          (al*bl)        <- l_ab
    //        + (ah*bl) << 16  <- h_ab
    //        + (al*bh) << 16  <- h_ba

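    // A scalar model of the decomposition below (illustrative only, not part
    // of the library):
    //
    //     uint32_t mul_lo_scalar(uint32_t a, uint32_t b)
    //     {
    //         uint32_t al = a & 0xffff, ah = a >> 16;
    //         uint32_t bl = b & 0xffff, bh = b >> 16;
    //         return al*bl + ((ah*bl + al*bh) << 16); // low 32 bits of a*b
    //     }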
    uint16<8> ra, rb; ra = a, rb = b;
#if SIMDPP_BIG_ENDIAN
    uint16<8> sa = move8_r<1>(ra);
    uint16<8> sb = move8_r<1>(rb);

    uint32<4> l_ab = vec_mulo(ra.native(), rb.native());
    uint32<4> h_ab = vec_mulo(ra.native(), sb.native());
    uint32<4> h_ba = vec_mulo(sa.native(), rb.native());
#else
    uint16<8> sa = move8_l<1>(ra);
    uint16<8> sb = move8_l<1>(rb);

    uint32<4> l_ab = vec_mule(ra.native(), rb.native());
    uint32<4> h_ab = vec_mule(ra.native(), sb.native());
    uint32<4> h_ba = vec_mule(sa.native(), rb.native());
#endif

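    // ah*bl + al*bh contributes bits 16..47 of the full product; shifting the
    // sum left by 16 keeps exactly the part that lands in the low 32 bits.
    // Adding al*bl then completes the mod-2^32 result.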
    h_ab = shift_l<16>(add(h_ab, h_ba));
    h_ab = add(h_ab, l_ab);
    return h_ab;
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_mulv_w((v4i32) a.native(), (v4i32) b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32<8> i_mul_lo(const uint32<8>& a, const uint32<8>& b)
{
    return _mm256_mullo_epi32(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint32<16> i_mul_lo(const uint32<16>& a, const uint32<16>& b)
{
    return _mm512_mullo_epi32(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

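// Generic fallback: vectors wider than the native width are represented as
// arrays of native-width vectors, so apply i_mul_lo to each pair of
// sub-vectors in turn.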
template<class V> SIMDPP_INL
V i_mul_lo(const V& a, const V& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(V, i_mul_lo, a, b)
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif