/*  Copyright (C) 2012-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_AVG_TRUNC_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_AVG_TRUNC_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/bit_xor.h>
#include <simdpp/core/bit_and.h>
#include <simdpp/core/i_add.h>
#include <simdpp/core/i_shift_r.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

template<class V> SIMDPP_INL V v_emul_avg_trunc(const V& a, const V& b);
template<class V> SIMDPP_INL V v_emul_avg_trunc_i8(const V& a, const V& b);
template<class V> SIMDPP_INL V v_emul_avg_trunc_i16(const V& a, const V& b);
template<class V> SIMDPP_INL V v_emul_avg_trunc_i32(const V& a, const V& b);
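
// Truncating average: r = (a + b) >> 1, with the intermediate sum computed
// at full precision. NEON (vhaddq_*) and MSA (__msa_ave_*) implement this
// directly. SSE2 and AltiVec only provide rounding averages (pavgb/pavgw and
// vec_avg compute (a + b + 1) >> 1), so those backends, and the wider
// AVX2/AVX-512 vectors, use the bitwise emulation defined at the end of
// this file.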

static SIMDPP_INL
uint8x16 i_avg_trunc(const uint8x16& a, const uint8x16& b)
{
#if SIMDPP_USE_NULL
    uint8x16 r;
    for (unsigned i = 0; i < a.length; i++) {
        // Widen before adding so the sum cannot overflow; the same widening
        // is used in the scalar fallbacks for the other element types below.
        r.el(i) = (uint16_t(a.el(i)) + b.el(i)) >> 1;
    }
    return r;
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC
    return v_emul_avg_trunc(a, b);
#elif SIMDPP_USE_NEON
    return vhaddq_u8(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return __msa_ave_u_b(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint8x32 i_avg_trunc(const uint8x32& a, const uint8x32& b)
{
    return v_emul_avg_trunc(a, b);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint8<64> i_avg_trunc(const uint8<64>& a, const uint8<64>& b)
{
    return v_emul_avg_trunc(a, b);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
int8x16 i_avg_trunc(const int8x16& a, const int8x16& b)
{
#if SIMDPP_USE_NULL
    int8x16 r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = (int16_t(a.el(i)) + b.el(i)) >> 1;
    }
    return r;
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC
    return v_emul_avg_trunc_i8(a, b);
#elif SIMDPP_USE_NEON
    return vhaddq_s8(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return __msa_ave_s_b(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int8x32 i_avg_trunc(const int8x32& a, const int8x32& b)
{
    return v_emul_avg_trunc_i8(a, b);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL int8<64> i_avg_trunc(const int8<64>& a, const int8<64>& b)
{
    return v_emul_avg_trunc_i8(a, b);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint16x8 i_avg_trunc(const uint16x8& a, const uint16x8& b)
{
#if SIMDPP_USE_NULL
    uint16x8 r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = (uint32_t(a.el(i)) + b.el(i)) >> 1;
    }
    return r;
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC
    return v_emul_avg_trunc(a, b);
#elif SIMDPP_USE_NEON
    return vhaddq_u16(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return __msa_ave_u_h(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint16x16 i_avg_trunc(const uint16x16& a, const uint16x16& b)
{
    return v_emul_avg_trunc(a, b);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint16<32> i_avg_trunc(const uint16<32>& a, const uint16<32>& b)
{
    return v_emul_avg_trunc(a, b);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
int16x8 i_avg_trunc(const int16x8& a, const int16x8& b)
{
#if SIMDPP_USE_NULL
    int16x8 r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = (int32_t(a.el(i)) + b.el(i)) >> 1;
    }
    return r;
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC
    return v_emul_avg_trunc_i16(a, b);
#elif SIMDPP_USE_NEON
    return vhaddq_s16(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return __msa_ave_s_h(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int16x16 i_avg_trunc(const int16x16& a, const int16x16& b)
{
    return v_emul_avg_trunc_i16(a, b);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL int16<32> i_avg_trunc(const int16<32>& a, const int16<32>& b)
{
    return v_emul_avg_trunc_i16(a, b);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint32x4 i_avg_trunc(const uint32x4& a, const uint32x4& b)
{
#if SIMDPP_USE_NULL
    uint32x4 r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = (uint64_t(a.el(i)) + b.el(i)) >> 1;
    }
    return r;
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC
    return v_emul_avg_trunc(a, b);
#elif SIMDPP_USE_NEON
    return vhaddq_u32(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return __msa_ave_u_w(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32x8 i_avg_trunc(const uint32x8& a, const uint32x8& b)
{
    return v_emul_avg_trunc(a, b);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint32<16> i_avg_trunc(const uint32<16>& a, const uint32<16>& b)
{
    return v_emul_avg_trunc(a, b);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
int32x4 i_avg_trunc(const int32x4& a, const int32x4& b)
{
#if SIMDPP_USE_NULL
    int32x4 r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = (int64_t(a.el(i)) + b.el(i)) >> 1;
    }
    return r;
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_ALTIVEC
    return v_emul_avg_trunc_i32(a, b);
#elif SIMDPP_USE_NEON
    return vhaddq_s32(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return __msa_ave_s_w(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int32x8 i_avg_trunc(const int32x8& a, const int32x8& b)
{
    return v_emul_avg_trunc_i32(a, b);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
int32<16> i_avg_trunc(const int32<16>& a, const int32<16>& b)
{
    return v_emul_avg_trunc_i32(a, b);
}
#endif

// -----------------------------------------------------------------------------

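// Vectors wider than the native register size are handled by splitting them
// into native-width sub-vectors: SIMDPP_VEC_ARRAY_IMPL2 applies the
// i_avg_trunc overloads above to each pair of sub-vectors in turn.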
template<class V> SIMDPP_INL
V i_avg_trunc(const V& a, const V& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(V, i_avg_trunc, a, b);
}

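// Overflow-free truncating average. The sum a + b splits into carry and
// non-carry parts: a + b == ((a & b) << 1) + (a ^ b), so
// (a + b) >> 1 == (a & b) + ((a ^ b) >> 1) without widening the element type.
// For example, a = 3, b = 5: (3 & 5) + ((3 ^ 5) >> 1) == 1 + 3 == 4.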
template<class V> SIMDPP_INL
V v_emul_avg_trunc(const V& a, const V& b)
{
    // (x & y) + ((x ^ y) >> 1)
    V x1 = bit_and(a, b);
    V x2 = bit_xor(a, b);
    return add(x1, shift_r<1>(x2));
}

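// The signed variants below flip the sign bit to map the operands into the
// unsigned domain (XOR with the bias equals adding it modulo 2^N and
// preserves order), take the unsigned truncating average, then flip the
// sign bit back. The shared bias cancels: avg(a + B, b + B) == avg(a, b) + B.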
template<class V> SIMDPP_INL
V v_emul_avg_trunc_i8(const V& a, const V& b)
{
    typename V::uint_vector_type a2, b2, r, bias;
    bias = make_uint(0x80);

    a2 = bit_xor(a, bias); // add the bias
    b2 = bit_xor(b, bias); // add the bias
    r = v_emul_avg_trunc(a2, b2); // unsigned truncating average
    r = bit_xor(r, bias); // subtract the bias
    return r;
}

template<class V> SIMDPP_INL
V v_emul_avg_trunc_i16(const V& a, const V& b)
{
    typename V::uint_vector_type a2, b2, r, bias;
    bias = make_uint(0x8000);

    a2 = bit_xor(a, bias); // add the bias
    b2 = bit_xor(b, bias); // add the bias
    r = v_emul_avg_trunc(a2, b2); // unsigned truncating average
    r = bit_xor(r, bias); // subtract the bias
    return r;
}

template<class V> SIMDPP_INL
V v_emul_avg_trunc_i32(const V& a, const V& b)
{
    typename V::uint_vector_type a2, b2, r, bias;
    bias = make_uint(0x80000000);

    a2 = bit_xor(a, bias); // add the bias
    b2 = bit_xor(b, bias); // add the bias
    r = v_emul_avg_trunc(a2, b2); // unsigned truncating average
    r = bit_xor(r, bias); // subtract the bias
    return r;
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp
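
// Usage sketch (illustrative only; assumes the public simdpp::avg_trunc
// wrapper forwards to the i_avg_trunc overloads above):
//
//   #include <simdpp/simd.h>
//   using namespace simdpp;
//   uint8<16> a = make_uint(200);
//   uint8<16> b = make_uint(101);
//   uint8<16> r = avg_trunc(a, b); // each element: (200 + 101) >> 1 == 150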

#endif