yuv_neon.c source code [Skia/third_party/externals/libwebp/src/dsp/yuv_neon.c]

1	// Copyright 2017 Google Inc. All Rights Reserved.
2	//
3	// Use of this source code is governed by a BSD-style license
4	// that can be found in the COPYING file in the root of the source
5	// tree. An additional intellectual property rights grant can be found
6	// in the file PATENTS. All contributing project authors may
7	// be found in the AUTHORS file in the root of the source tree.
8	// -----------------------------------------------------------------------------
9	//
10	// YUV->RGB conversion functions
11	//
12	// Author: Skal (pascal.massimino@gmail.com)
13
14	#include "src/dsp/yuv.h"
15
16	#if defined(WEBP_USE_NEON)
17
18	#include <assert.h>
19	#include <stdlib.h>
20
21	#include "src/dsp/neon.h"
22
23	//-----------------------------------------------------------------------------
24
25	static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R,
26	const uint8x8_t G,
27	const uint8x8_t B) {
28	const uint16x8_t r = vmovl_u8(R);
29	const uint16x8_t g = vmovl_u8(G);
30	const uint16x8_t b = vmovl_u8(B);
31	const uint16x4_t r_lo = vget_low_u16(r);
32	const uint16x4_t r_hi = vget_high_u16(r);
33	const uint16x4_t g_lo = vget_low_u16(g);
34	const uint16x4_t g_hi = vget_high_u16(g);
35	const uint16x4_t b_lo = vget_low_u16(b);
36	const uint16x4_t b_hi = vget_high_u16(b);
37	const uint32x4_t tmp0_lo = vmull_n_u16( r_lo, `16839u`);
38	const uint32x4_t tmp0_hi = vmull_n_u16( r_hi, `16839u`);
39	const uint32x4_t tmp1_lo = vmlal_n_u16(tmp0_lo, g_lo, `33059u`);
40	const uint32x4_t tmp1_hi = vmlal_n_u16(tmp0_hi, g_hi, `33059u`);
41	const uint32x4_t tmp2_lo = vmlal_n_u16(tmp1_lo, b_lo, `6420u`);
42	const uint32x4_t tmp2_hi = vmlal_n_u16(tmp1_hi, b_hi, `6420u`);
43	const uint16x8_t Y1 = vcombine_u16(vrshrn_n_u32(tmp2_lo, `16`),
44	vrshrn_n_u32(tmp2_hi, `16`));
45	const uint16x8_t Y2 = vaddq_u16(Y1, vdupq_n_u16(`16`));
46	return vqmovn_u16(Y2);
47	}
48
49	static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) {
50	int i;
51	for (i = `0`; i + `8` <= width; i += `8`, rgb += `3` * `8`) {
52	const uint8x8x3_t RGB = vld3_u8(rgb);
53	const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[`0`], RGB.val[`1`], RGB.val[`2`]);
54	vst1_u8(y + i, Y);
55	}
56	for (; i < width; ++i, rgb += `3`) { // left-over
57	y[i] = VP8RGBToY(rgb[`0`], rgb[`1`], rgb[`2`], YUV_HALF);
58	}
59	}
60
61	static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) {
62	int i;
63	for (i = `0`; i + `8` <= width; i += `8`, bgr += `3` * `8`) {
64	const uint8x8x3_t BGR = vld3_u8(bgr);
65	const uint8x8_t Y = ConvertRGBToY_NEON(BGR.val[`2`], BGR.val[`1`], BGR.val[`0`]);
66	vst1_u8(y + i, Y);
67	}
68	for (; i < width; ++i, bgr += `3`) { // left-over
69	y[i] = VP8RGBToY(bgr[`2`], bgr[`1`], bgr[`0`], YUV_HALF);
70	}
71	}
72
73	static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) {
74	int i;
75	for (i = `0`; i + `8` <= width; i += `8`) {
76	const uint8x8x4_t RGB = vld4_u8((const uint8_t*)&argb[i]);
77	const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[`2`], RGB.val[`1`], RGB.val[`0`]);
78	vst1_u8(y + i, Y);
79	}
80	for (; i < width; ++i) { // left-over
81	const uint32_t p = argb[i];
82	y[i] = VP8RGBToY((p >> `16`) & `0xff`, (p >> `8`) & `0xff`, (p >> `0`) & `0xff`,
83	YUV_HALF);
84	}
85	}
86
87	//-----------------------------------------------------------------------------
88
// Helpers for the weighted sum
//   DST_s16 = [(C0 * r + C1 * g + C2 * b) >> 16] + CST
// evaluated by MULTIPLY_16b.
//
// MULTIPLY_16b_PREAMBLE declares the six locals (r_lo .. b_hi) that
// MULTIPLY_16b reads: the low and high halves of the uint16x8_t vectors
// r, g and b, reinterpreted as signed 16-bit lanes. The last declaration has
// no trailing ';' -- the caller supplies it.
// NOTE(review): the reinterpretation is only value-preserving for inputs
// below 32768 -- confirm callers stay in that range.
#define MULTIPLY_16b_PREAMBLE(r, g, b)                           \
  const int16x4_t r_lo = vreinterpret_s16_u16(vget_low_u16(r));  \
  const int16x4_t r_hi = vreinterpret_s16_u16(vget_high_u16(r)); \
  const int16x4_t g_lo = vreinterpret_s16_u16(vget_low_u16(g));  \
  const int16x4_t g_hi = vreinterpret_s16_u16(vget_high_u16(g)); \
  const int16x4_t b_lo = vreinterpret_s16_u16(vget_low_u16(b));  \
  const int16x4_t b_hi = vreinterpret_s16_u16(vget_high_u16(b))
97
// Computes DST_s16 = [(C0 * r + C1 * g + C2 * b) >> 16] + CST on 8 lanes.
// Expects r_lo/r_hi, g_lo/g_hi and b_lo/b_hi (int16x4_t) to be in scope, as
// declared by MULTIPLY_16b_PREAMBLE. The products are accumulated in 32 bits;
// the '>> 16' is a truncating narrowing shift (vshrn), and CST is added with
// wrapping 16-bit arithmetic. DST_s16 must be an assignable int16x8_t.
#define MULTIPLY_16b(C0, C1, C2, CST, DST_s16) do {              \
  const int32x4_t tmp0_lo = vmull_n_s16(r_lo, C0);               \
  const int32x4_t tmp0_hi = vmull_n_s16(r_hi, C0);               \
  const int32x4_t tmp1_lo = vmlal_n_s16(tmp0_lo, g_lo, C1);      \
  const int32x4_t tmp1_hi = vmlal_n_s16(tmp0_hi, g_hi, C1);      \
  const int32x4_t tmp2_lo = vmlal_n_s16(tmp1_lo, b_lo, C2);      \
  const int32x4_t tmp2_hi = vmlal_n_s16(tmp1_hi, b_hi, C2);      \
  const int16x8_t tmp3 = vcombine_s16(vshrn_n_s32(tmp2_lo, 16),  \
                                      vshrn_n_s32(tmp2_hi, 16)); \
  DST_s16 = vaddq_s16(tmp3, vdupq_n_s16(CST));                   \
} while (0)
109
// Computes the U and V weighted sums of the uint16x8_t vectors r, g, b:
//   U_DST = [(-9719 * r - 19081 * g + 28800 * b) >> 16] + (128 << SHIFT)
//   V_DST = [(28800 * r - 24116 * g -  4684 * b) >> 16] + (128 << SHIFT)
// U_DST and V_DST are int16x8_t; the callers in this file then shift them
// right by SHIFT (with rounding and saturation) to obtain the final bytes.
// This needs to be a macro, since (128 << SHIFT) needs to be an immediate.
#define CONVERT_RGB_TO_UV(r, g, b, SHIFT, U_DST, V_DST) do {     \
  MULTIPLY_16b_PREAMBLE(r, g, b);                                \
  MULTIPLY_16b(-9719, -19081, 28800, 128 << SHIFT, U_DST);       \
  MULTIPLY_16b(28800, -24116, -4684, 128 << SHIFT, V_DST);       \
} while (0)
116
117	static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb,
118	uint8_t* u, uint8_t* v, int width) {
119	int i;
120	for (i = `0`; i + `8` <= width; i += `8`, rgb += `4` * `8`) {
121	const uint16x8x4_t RGB = vld4q_u16((const uint16_t*)rgb);
122	int16x8_t U, V;
123	CONVERT_RGB_TO_UV(RGB.val[`0`], RGB.val[`1`], RGB.val[`2`], `2`, U, V);
124	vst1_u8(u + i, vqrshrun_n_s16(U, `2`));
125	vst1_u8(v + i, vqrshrun_n_s16(V, `2`));
126	}
127	for (; i < width; i += `1`, rgb += `4`) {
128	const int r = rgb[`0`], g = rgb[`1`], b = rgb[`2`];
129	u[i] = VP8RGBToU(r, g, b, YUV_HALF << `2`);
130	v[i] = VP8RGBToV(r, g, b, YUV_HALF << `2`);
131	}
132	}
133
134	static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v,
135	int src_width, int do_store) {
136	int i;
137	for (i = `0`; i + `16` <= src_width; i += `16`, u += `8`, v += `8`) {
138	const uint8x16x4_t RGB = vld4q_u8((const uint8_t*)&argb[i]);
139	const uint16x8_t R = vpaddlq_u8(RGB.val[`2`]); // pair-wise adds
140	const uint16x8_t G = vpaddlq_u8(RGB.val[`1`]);
141	const uint16x8_t B = vpaddlq_u8(RGB.val[`0`]);
142	int16x8_t U_tmp, V_tmp;
143	CONVERT_RGB_TO_UV(R, G, B, `1`, U_tmp, V_tmp);
144	{
145	const uint8x8_t U = vqrshrun_n_s16(U_tmp, `1`);
146	const uint8x8_t V = vqrshrun_n_s16(V_tmp, `1`);
147	if (do_store) {
148	vst1_u8(u, U);
149	vst1_u8(v, V);
150	} else {
151	const uint8x8_t prev_u = vld1_u8(u);
152	const uint8x8_t prev_v = vld1_u8(v);
153	vst1_u8(u, vrhadd_u8(U, prev_u));
154	vst1_u8(v, vrhadd_u8(V, prev_v));
155	}
156	}
157	}
158	if (i < src_width) { // left-over
159	WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
160	}
161	}
162
163
164	//------------------------------------------------------------------------------
165
166	extern void WebPInitConvertARGBToYUVNEON(void);
167
168	WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVNEON(void) {
169	WebPConvertRGB24ToY = ConvertRGB24ToY_NEON;
170	WebPConvertBGR24ToY = ConvertBGR24ToY_NEON;
171	WebPConvertARGBToY = ConvertARGBToY_NEON;
172	WebPConvertARGBToUV = ConvertARGBToUV_NEON;
173	WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_NEON;
174	}
175
176	//------------------------------------------------------------------------------
177
#define MAX_Y ((1 << 10) - 1)    // 10b precision over 16b-arithmetic
// Clamps 'v' to the range [0, MAX_Y] and returns it as a 16-bit value.
static uint16_t clip_y_NEON(int v) {
  return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
}
182
183	static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
184	uint16_t* dst, int len) {
185	int i;
186	const int16x8_t zero = vdupq_n_s16(`0`);
187	const int16x8_t max = vdupq_n_s16(MAX_Y);
188	uint64x2_t sum = vdupq_n_u64(`0`);
189	uint64_t diff;
190
191	for (i = `0`; i + `8` <= len; i += `8`) {
192	const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
193	const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
194	const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
195	const int16x8_t D = vsubq_s16(A, B); // diff_y
196	const int16x8_t F = vaddq_s16(C, D); // new_y
197	const uint16x8_t H =
198	vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
199	const int16x8_t I = vabsq_s16(D); // abs(diff_y)
200	vst1q_u16(dst + i, H);
201	sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
202	}
203	diff = vgetq_lane_u64(sum, `0`) + vgetq_lane_u64(sum, `1`);
204	for (; i < len; ++i) {
205	const int diff_y = ref[i] - src[i];
206	const int new_y = (int)(dst[i]) + diff_y;
207	dst[i] = clip_y_NEON(new_y);
208	diff += (uint64_t)(abs(diff_y));
209	}
210	return diff;
211	}
212
213	static void SharpYUVUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
214	int16_t* dst, int len) {
215	int i;
216	for (i = `0`; i + `8` <= len; i += `8`) {
217	const int16x8_t A = vld1q_s16(ref + i);
218	const int16x8_t B = vld1q_s16(src + i);
219	const int16x8_t C = vld1q_s16(dst + i);
220	const int16x8_t D = vsubq_s16(A, B); // diff_uv
221	const int16x8_t E = vaddq_s16(C, D); // new_uv
222	vst1q_s16(dst + i, E);
223	}
224	for (; i < len; ++i) {
225	const int diff_uv = ref[i] - src[i];
226	dst[i] += diff_uv;
227	}
228	}
229
230	static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
231	const uint16_t* best_y, uint16_t* out) {
232	int i;
233	const int16x8_t max = vdupq_n_s16(MAX_Y);
234	const int16x8_t zero = vdupq_n_s16(`0`);
235	for (i = `0`; i + `8` <= len; i += `8`) {
236	const int16x8_t a0 = vld1q_s16(A + i + `0`);
237	const int16x8_t a1 = vld1q_s16(A + i + `1`);
238	const int16x8_t b0 = vld1q_s16(B + i + `0`);
239	const int16x8_t b1 = vld1q_s16(B + i + `1`);
240	const int16x8_t a0b1 = vaddq_s16(a0, b1);
241	const int16x8_t a1b0 = vaddq_s16(a1, b0);
242	const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0); // A0+A1+B0+B1
243	const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1); // 2(A0+B1)*
244	const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0); // 2(A1+B0)*
245	const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), `3`);
246	const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), `3`);
247	const int16x8_t d0 = vaddq_s16(c1, a0);
248	const int16x8_t d1 = vaddq_s16(c0, a1);
249	const int16x8_t e0 = vrshrq_n_s16(d0, `1`);
250	const int16x8_t e1 = vrshrq_n_s16(d1, `1`);
251	const int16x8x2_t f = vzipq_s16(e0, e1);
252	const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + `2` * i + `0`));
253	const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + `2` * i + `8`));
254	const int16x8_t h0 = vaddq_s16(g0, f.val[`0`]);
255	const int16x8_t h1 = vaddq_s16(g1, f.val[`1`]);
256	const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
257	const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
258	vst1q_u16(out + `2` * i + `0`, vreinterpretq_u16_s16(i0));
259	vst1q_u16(out + `2` * i + `8`, vreinterpretq_u16_s16(i1));
260	}
261	for (; i < len; ++i) {
262	const int a0b1 = A[i + `0`] + B[i + `1`];
263	const int a1b0 = A[i + `1`] + B[i + `0`];
264	const int a0a1b0b1 = a0b1 + a1b0 + `8`;
265	const int v0 = (`8` * A[i + `0`] + `2` * a1b0 + a0a1b0b1) >> `4`;
266	const int v1 = (`8` * A[i + `1`] + `2` * a0b1 + a0a1b0b1) >> `4`;
267	out[`2` * i + `0`] = clip_y_NEON(best_y[`2` * i + `0`] + v0);
268	out[`2` * i + `1`] = clip_y_NEON(best_y[`2` * i + `1`] + v1);
269	}
270	}
271	#undef MAX_Y
272
273	//------------------------------------------------------------------------------
274
275	extern void WebPInitSharpYUVNEON(void);
276
277	WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVNEON(void) {
278	WebPSharpYUVUpdateY = SharpYUVUpdateY_NEON;
279	WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_NEON;
280	WebPSharpYUVFilterRow = SharpYUVFilterRow_NEON;
281	}
282
283	#else // !WEBP_USE_NEON
284
285	WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVNEON)
286	WEBP_DSP_INIT_STUB(WebPInitSharpYUVNEON)
287
288	#endif // WEBP_USE_NEON
289

Browse the source code of Skia/third_party/externals/libwebp/src/dsp/yuv_neon.c