/*  Copyright (C) 2011-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_AVG_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_AVG_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/bit_xor.h>
#include <simdpp/core/bit_and.h>
#include <simdpp/core/i_add.h>
#include <simdpp/core/i_shift_r.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

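// Forward declarations: the 32-bit i_avg overloads below call these helpers,
// which are defined at the end of this file.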
template<class V> SIMDPP_INL
V v_emul_avg_u32(const V& a, const V& b);
template<class V> SIMDPP_INL
V v_emul_avg_i32(const V& a, const V& b);

static SIMDPP_INL
uint8x16 i_avg(const uint8x16& a, const uint8x16& b)
{
#if SIMDPP_USE_NULL
    uint8x16 r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = (uint16_t(a.el(i)) + b.el(i) + 1) >> 1;
    }
    return r;
#elif SIMDPP_USE_SSE2
    return _mm_avg_epu8(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vrhaddq_u8(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_avg(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return __msa_aver_u_b(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint8x32 i_avg(const uint8x32& a, const uint8x32& b)
{
    return _mm256_avg_epu8(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint8<64> i_avg(const uint8<64>& a, const uint8<64>& b)
{
    return _mm512_avg_epu8(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

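// The x86 average instructions (PAVGB/PAVGW) handle unsigned elements only, so
// the signed overloads below map onto the unsigned ones: XOR-ing with the sign
// bit (0x80 for bytes) shifts the signed range onto the unsigned range, the
// unsigned rounding average is taken, and the same XOR undoes the bias.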
static SIMDPP_INL
int8x16 i_avg(const int8x16& a, const int8x16& b)
{
#if SIMDPP_USE_NULL
    int8x16 r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = (int16_t(a.el(i)) + b.el(i) + 1) >> 1;
    }
    return r;
#elif SIMDPP_USE_SSE2
    uint8x16 a2, b2, bias, r;
    bias = make_uint(0x80);
    a2 = bit_xor(a, bias); // add
    b2 = bit_xor(b, bias); // add
    r = i_avg(a2, b2); // unsigned
    r = bit_xor(r, bias); // sub
    return r;
#elif SIMDPP_USE_NEON
    return vrhaddq_s8(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_avg(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return __msa_aver_s_b(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int8x32 i_avg(const int8x32& a, const int8x32& b)
{
    uint8x32 a2, b2, bias, r;
    bias = make_uint(0x80);
    a2 = bit_xor(a, bias); // add
    b2 = bit_xor(b, bias); // add
    r = i_avg(a2, b2); // unsigned
    r = bit_xor(r, bias); // sub
    return r;
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL int8<64> i_avg(const int8<64>& a, const int8<64>& b)
{
    uint8<64> a2, b2, bias, r;
    bias = make_uint(0x80);
    a2 = bit_xor(a, bias); // add
    b2 = bit_xor(b, bias); // add
    r = i_avg(a2, b2); // unsigned
    r = bit_xor(r, bias); // sub
    return r;
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint16x8 i_avg(const uint16x8& a, const uint16x8& b)
{
#if SIMDPP_USE_NULL
    uint16x8 r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = (uint32_t(a.el(i)) + b.el(i) + 1) >> 1;
    }
    return r;
#elif SIMDPP_USE_SSE2
    return _mm_avg_epu16(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vrhaddq_u16(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_avg(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return __msa_aver_u_h(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint16x16 i_avg(const uint16x16& a, const uint16x16& b)
{
    return _mm256_avg_epu16(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint16<32> i_avg(const uint16<32>& a, const uint16<32>& b)
{
    return _mm512_avg_epu16(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

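// 16-bit signed averages use the same sign-bias mapping as the 8-bit case,
// with the bias 0x8000.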
static SIMDPP_INL
int16x8 i_avg(const int16x8& a, const int16x8& b)
{
#if SIMDPP_USE_NULL
    int16x8 r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = (int32_t(a.el(i)) + b.el(i) + 1) >> 1;
    }
    return r;
#elif SIMDPP_USE_SSE2
    uint16x8 a2, b2, r;
    a2 = bit_xor(a, 0x8000); // add
    b2 = bit_xor(b, 0x8000); // add
    r = i_avg(a2, b2); // unsigned
    r = bit_xor(r, 0x8000); // sub
    return r;
#elif SIMDPP_USE_NEON
    return vrhaddq_s16(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_avg(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return __msa_aver_s_h(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int16x16 i_avg(const int16x16& a, const int16x16& b)
{
    uint16x16 a2, b2, r;
    a2 = bit_xor(a, 0x8000); // add
    b2 = bit_xor(b, 0x8000); // add
    r = i_avg(a2, b2); // unsigned
    r = bit_xor(r, 0x8000); // sub
    return r;
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL int16<32> i_avg(const int16<32>& a, const int16<32>& b)
{
    uint16<32> a2, b2, r;
    a2 = bit_xor(a, 0x8000); // add
    b2 = bit_xor(b, 0x8000); // add
    r = i_avg(a2, b2); // unsigned
    r = bit_xor(r, 0x8000); // sub
    return r;
}
#endif

// -----------------------------------------------------------------------------

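// x86 has no packed 32-bit average instruction (PAVGB/PAVGW cover only 8- and
// 16-bit lanes), so the SSE2/AVX2/AVX-512F paths fall back to the generic
// emulation helpers defined at the end of this file.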
static SIMDPP_INL
uint32x4 i_avg(const uint32x4& a, const uint32x4& b)
{
#if SIMDPP_USE_NULL
    uint32x4 r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = (uint64_t(a.el(i)) + b.el(i) + 1) >> 1;
    }
    return r;
#elif SIMDPP_USE_SSE2
    return v_emul_avg_u32(a, b);
#elif SIMDPP_USE_NEON
    return vrhaddq_u32(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_avg(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return __msa_aver_u_w(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32x8 i_avg(const uint32x8& a, const uint32x8& b)
{
    return v_emul_avg_u32(a, b);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint32<16> i_avg(const uint32<16>& a, const uint32<16>& b)
{
    return v_emul_avg_u32(a, b);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
int32x4 i_avg(const int32x4& a, const int32x4& b)
{
#if SIMDPP_USE_NULL
    int32x4 r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = (int64_t(a.el(i)) + b.el(i) + 1) >> 1;
    }
    return r;
#elif SIMDPP_USE_SSE2
    return v_emul_avg_i32(a, b);
#elif SIMDPP_USE_NEON
    return vrhaddq_s32(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_avg(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return __msa_aver_s_w(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int32x8 i_avg(const int32x8& a, const int32x8& b)
{
    return v_emul_avg_i32(a, b);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
int32<16> i_avg(const int32<16>& a, const int32<16>& b)
{
    return v_emul_avg_i32(a, b);
}
#endif

// -----------------------------------------------------------------------------

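// Vectors wider than the native width are processed by splitting them into
// native-width parts and applying the overloads above to each part.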
template<class V> SIMDPP_INL
V i_avg(const V& a, const V& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(V, i_avg, a, b);
}

// generic implementations

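// The unsigned rounding average is computed without widening the element type.
// Since a + b == ((a & b) << 1) + (a ^ b), it follows that
// (a + b + 1) >> 1 == (a & b) + ((a ^ b) >> 1) + ((a ^ b) & 1),
// and the right-hand side never exceeds max(a, b), so it cannot overflow.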
template<class V> SIMDPP_INL
V v_emul_avg_u32(const V& a, const V& b)
{
    // (a & b) + ((a ^ b) >> 1) + ((a ^ b) & 1)
    V x1, x2, round;
    x1 = bit_and(a, b);
    x2 = bit_xor(a, b);
    round = bit_and(x2, 1);
    x1 = add(x1, shift_r<1>(x2));
    x1 = add(x1, round);
    return x1;
}

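// The signed 32-bit average reuses the unsigned helper through the same
// sign-bias mapping as above, with the bias 0x80000000.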
template<class V> SIMDPP_INL
V v_emul_avg_i32(const V& a, const V& b)
{
    using VI = typename V::uint_vector_type;
    VI a2, b2, r;
    a2 = bit_xor(a, 0x80000000); // add
    b2 = bit_xor(b, 0x80000000); // add
    r = v_emul_avg_u32(a2, b2); // unsigned
    r = bit_xor(r, 0x80000000); // sub
    return r;
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif