1/* Copyright (C) 2013-2017 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_MUL_HI_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_MUL_HI_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/types.h>
16#include <simdpp/core/i_mull.h>
17#include <simdpp/core/unzip_hi.h>
18#include <simdpp/detail/null/math.h>
19#include <simdpp/detail/vector_array_macros.h>
20
21namespace simdpp {
22namespace SIMDPP_ARCH_NAMESPACE {
23namespace detail {
24namespace insn {
25
26static SIMDPP_INL
27int16<8> i_mul_hi(const int16<8>& a, const int16<8>& b)
28{
29#if SIMDPP_USE_NULL
30 uint16<8> r;
31 for (unsigned i = 0; i < a.length; i++) {
32 r.el(i) = (int32_t(a.el(i)) * b.el(i)) >> 16;
33 }
34 return r;
35#elif SIMDPP_USE_SSE2
36 return _mm_mulhi_epi16(a.native(), b.native());
37#elif SIMDPP_USE_NEON
38 int32x4 lo = vmull_s16(vget_low_s16(a.native()), vget_low_s16(b.native()));
39 int32x4 hi = vmull_s16(vget_high_s16(a.native()), vget_high_s16(b.native()));
40 return unzip8_hi(int16x8(lo), int16x8(hi));
41#elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
42#if SIMDPP_BIG_ENDIAN
43 int16<16> ab;
44 ab = mull(a, b);
45 return unzip8_lo(ab.vec(0), ab.vec(1));
46#else
47 int16<16> ab;
48 ab = mull(a, b);
49 return unzip8_hi(ab.vec(0), ab.vec(1));
50#endif
51#endif
52}
53
#if SIMDPP_USE_AVX2
/** AVX2 overload for sixteen signed 16-bit lanes.

    Forwards to _mm256_mulhi_epi16 and returns its result, i.e. the upper
    16 bits of each signed 16x16-bit product.
*/
static SIMDPP_INL
int16<16> i_mul_hi(const int16<16>& a, const int16<16>& b)
{
    int16<16> high_halves = _mm256_mulhi_epi16(a.native(), b.native());
    return high_halves;
}
#endif
61
#if SIMDPP_USE_AVX512BW
/** AVX-512BW overload for thirty-two signed 16-bit lanes.

    Forwards to _mm512_mulhi_epi16 and returns its result, i.e. the upper
    16 bits of each signed 16x16-bit product.
*/
static SIMDPP_INL
int16<32> i_mul_hi(const int16<32>& a, const int16<32>& b)
{
    int16<32> high_halves = _mm512_mulhi_epi16(a.native(), b.native());
    return high_halves;
}
#endif
69
70// -----------------------------------------------------------------------------
71
72static SIMDPP_INL
73uint16<8> i_mul_hi(const uint16<8>& a, const uint16<8>& b)
74{
75#if SIMDPP_USE_NULL
76 uint16<8> r;
77 for (unsigned i = 0; i < a.length; i++) {
78 r.el(i) = (uint32_t(a.el(i)) * b.el(i)) >> 16;
79 }
80 return r;
81#elif SIMDPP_USE_SSE2
82 return _mm_mulhi_epu16(a.native(), b.native());
83#elif SIMDPP_USE_NEON
84 uint32x4 lo = vmull_u16(vget_low_u16(a.native()), vget_low_u16(b.native()));
85 uint32x4 hi = vmull_u16(vget_high_u16(a.native()), vget_high_u16(b.native()));
86 return unzip8_hi(uint16x8(lo), uint16x8(hi));
87#elif SIMDPP_USE_ALTIVEC && SIMDPP_BIG_ENDIAN
88 uint16<16> ab;
89 ab = mull(a, b);
90 return unzip8_lo(ab.vec(0), ab.vec(1));
91#elif (SIMDPP_USE_ALTIVEC && SIMDPP_LITTLE_ENDIAN) || SIMDPP_USE_MSA
92 uint16<16> ab;
93 ab = mull(a, b);
94 return unzip8_hi(ab.vec(0), ab.vec(1));
95#endif
96}
97
#if SIMDPP_USE_AVX2
/** AVX2 overload for sixteen unsigned 16-bit lanes.

    Forwards to _mm256_mulhi_epu16 and returns its result, i.e. the upper
    16 bits of each unsigned 16x16-bit product.
*/
static SIMDPP_INL
uint16<16> i_mul_hi(const uint16<16>& a, const uint16<16>& b)
{
    uint16<16> high_halves = _mm256_mulhi_epu16(a.native(), b.native());
    return high_halves;
}
#endif
105
#if SIMDPP_USE_AVX512BW
/** AVX-512BW overload for thirty-two unsigned 16-bit lanes.

    Forwards to _mm512_mulhi_epu16 and returns its result, i.e. the upper
    16 bits of each unsigned 16x16-bit product.
*/
static SIMDPP_INL
uint16<32> i_mul_hi(const uint16<32>& a, const uint16<32>& b)
{
    uint16<32> high_halves = _mm512_mulhi_epu16(a.native(), b.native());
    return high_halves;
}
#endif
113
114// -----------------------------------------------------------------------------
115
116template<class V> SIMDPP_INL
117V i_mul_hi(const V& a, const V& b)
118{
119 SIMDPP_VEC_ARRAY_IMPL2(V, i_mul_hi, a, b)
120}
121
122} // namespace insn
123} // namespace detail
124} // namespace SIMDPP_ARCH_NAMESPACE
125} // namespace simdpp
126
127#endif
128
129