1/* Copyright (C) 2013-2014 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_CORE_I_MULL_H
9#define LIBSIMDPP_SIMDPP_CORE_I_MULL_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/types.h>
16#include <simdpp/detail/expr/i_mull.h>
17#include <simdpp/core/detail/scalar_arg_impl.h>
18
19namespace simdpp {
20namespace SIMDPP_ARCH_NAMESPACE {
21
22
23/* Note: widening integer multiplication instructions are very different among
24 instruction sets. The main difference is in which half of the elements are
25 selected for multiplication. Trying to abstract this incurs definite
26 overhead.
27
28 - SSE2-SSE4.1 and AVX2 provide only instructions with interfaces similar
29 to mul_lo and mul_hi. The result vectors must be interleaved to obtain
30 contiguous result values. Multiplying 2 vectors always incurs
31 overhead of at least two interleaving instructions.
32
33 - AVX512 only provides 32-bit integer support. Widening multiplication
34 can be done only by using PMULDQ, which takes odd elements and produces
35 widened multiplication results. Multiplication of two whole vectors
36 always incurs overhead of at least two shifts or interleaving
37 instructions.
38
39 - NEON, NEONv2 provide instructions that take elements of either the lower
40 or higher halves of two 128-bit vectors and multiply them. No
41 additional overhead is incurred to obtain contiguous result values.
42
 - ALTIVEC has multiply odd and multiply even instructions. No additional
44 overhead is incurred to obtain contiguous result values.
45
46 The abstraction below uses the NEON model. No additional overhead is
47 incurred on SSE/AVX and NEON. On ALTIVEC, a single additional permute
48 instruction is needed for each vector multiplication on average.
49*/
50
51/** Multiplies signed 16-bit values and expands the results to 32 bits.
52
53 @par 128-bit version:
54 @code
55 r0 = a0 * b0
56 ...
57 rN = aN * bN
58 @endcode
59
60 @icost{SSE2-AVX, ALTIVEC, 2-3}
61
62 @par 256-bit version:
63
64 @icost{SSE2-AVX, ALTIVEC, 4-6}
65 @icost{AVX2, NEON, 2-3}
66*/
67template<unsigned N, class E1, class E2> SIMDPP_INL
68int32<N, expr_mull<int16<N,E1>,
69 int16<N,E2>>> mull(const int16<N,E1>& a, const int16<N,E2>& b)
70{
71 return { { a, b } };
72}
73
74SIMDPP_SCALAR_ARG_IMPL_EXPR(mull, expr_mull, int32, int16)
75
76/** Multiplies unsigned 16-bit values and expands the results to 32 bits.
77
78 @par 128-bit version:
79 @code
80 r0 = a0 * b0
81 ...
82 rN = aN * bN
83 @endcode
84
85 @icost{SSE2-AVX2, ALTIVEC, 2-3}
86
87 @par 256-bit version:
88 @icost{SSE2-AVX, ALTIVEC, 4-6}
89 @icost{AVX2, 2-3}
90 @icost{NEON, 2}
91*/
92template<unsigned N, class E1, class E2> SIMDPP_INL
93uint32<N, expr_mull<uint16<N,E1>,
94 uint16<N,E2>>> mull(const uint16<N,E1>& a, const uint16<N,E2>& b)
95{
96 return { { a, b } };
97}
98
99SIMDPP_SCALAR_ARG_IMPL_EXPR(mull, expr_mull, uint32, uint16)
100
101/** Multiplies signed 32-bit values in and expands the results to 64 bits.
102
103 @code
104 r0 = a0 * b0
105 ...
106 rN = aN * bN
107 @endcode
108 @par 128-bit version:
109 @icost{SSE4.1-AVX, 3}
110 @unimp{SSE2-SSSE3, ALTIVEC}
111
112 @par 256-bit version:
113 @icost{SSE4.1-AVX, 6}
114 @icost{AVX2, 3}
115 @icost{NEON, 2}
116 @unimp{SSE2-SSSE3, ALTIVEC}
117*/
118template<unsigned N, class E1, class E2> SIMDPP_INL
119int64<N, expr_mull<int32<N,E1>,
120 int32<N,E2>>> mull(const int32<N,E1>& a, const int32<N,E2>& b)
121{
122 return { { a, b } };
123}
124
125SIMDPP_SCALAR_ARG_IMPL_EXPR(mull, expr_mull, int64, int32)
126
127/** Multiplies unsigned 32-bit values in the lower halves of the vectors and
128 expands the results to 64 bits.
129
130 @par 128-bit version:
131 @code
132 r0 = a0 * b0
133 r1 = a1 * b1
134 @endcode
135 @icost{SSE2-AVX, 3}
136 @unimp{ALTIVEC}
137
138 @icost{SSE2-AVX, 6}
139 @icost{AVX2, 3}
140 @icost{NEON, 2}
141 @unimp{ALTIVEC}
142*/
143template<unsigned N, class E1, class E2> SIMDPP_INL
144uint64<N, expr_mull<uint32<N,E1>,
145 uint32<N,E2>>> mull(const uint32<N,E1>& a, const uint32<N,E2>& b)
146{
147 return { { a, b } };
148}
149
150SIMDPP_SCALAR_ARG_IMPL_EXPR(mull, expr_mull, uint64, uint32)
151
152} // namespace SIMDPP_ARCH_NAMESPACE
153} // namespace simdpp
154
155#endif
156
157