yuv_neon.c source code [Godot/thirdparty/libwebp/src/dsp/yuv_neon.c]

1	// Copyright 2017 Google Inc. All Rights Reserved.
2	//
3	// Use of this source code is governed by a BSD-style license
4	// that can be found in the COPYING file in the root of the source
5	// tree. An additional intellectual property rights grant can be found
6	// in the file PATENTS. All contributing project authors may
7	// be found in the AUTHORS file in the root of the source tree.
8	// -----------------------------------------------------------------------------
9	//
10	// YUV->RGB conversion functions
11	//
12	// Author: Skal (pascal.massimino@gmail.com)
13
14	#include "src/dsp/yuv.h"
15
16	#if defined(WEBP_USE_NEON)
17
18	#include <assert.h>
19	#include <stdlib.h>
20
21	#include "src/dsp/neon.h"
22
23	//-----------------------------------------------------------------------------
24
25	static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R,
26	const uint8x8_t G,
27	const uint8x8_t B) {
28	const uint16x8_t r = vmovl_u8(R);
29	const uint16x8_t g = vmovl_u8(G);
30	const uint16x8_t b = vmovl_u8(B);
31	const uint16x4_t r_lo = vget_low_u16(r);
32	const uint16x4_t r_hi = vget_high_u16(r);
33	const uint16x4_t g_lo = vget_low_u16(g);
34	const uint16x4_t g_hi = vget_high_u16(g);
35	const uint16x4_t b_lo = vget_low_u16(b);
36	const uint16x4_t b_hi = vget_high_u16(b);
37	const uint32x4_t tmp0_lo = vmull_n_u16( r_lo, `16839u`);
38	const uint32x4_t tmp0_hi = vmull_n_u16( r_hi, `16839u`);
39	const uint32x4_t tmp1_lo = vmlal_n_u16(tmp0_lo, g_lo, `33059u`);
40	const uint32x4_t tmp1_hi = vmlal_n_u16(tmp0_hi, g_hi, `33059u`);
41	const uint32x4_t tmp2_lo = vmlal_n_u16(tmp1_lo, b_lo, `6420u`);
42	const uint32x4_t tmp2_hi = vmlal_n_u16(tmp1_hi, b_hi, `6420u`);
43	const uint16x8_t Y1 = vcombine_u16(vrshrn_n_u32(tmp2_lo, `16`),
44	vrshrn_n_u32(tmp2_hi, `16`));
45	const uint16x8_t Y2 = vaddq_u16(Y1, vdupq_n_u16(`16`));
46	return vqmovn_u16(Y2);
47	}
48
49	static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) {
50	int i;
51	for (i = `0`; i + `8` <= width; i += `8`, rgb += `3` * `8`) {
52	const uint8x8x3_t RGB = vld3_u8(rgb);
53	const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[`0`], RGB.val[`1`], RGB.val[`2`]);
54	vst1_u8(y + i, Y);
55	}
56	for (; i < width; ++i, rgb += `3`) { // left-over
57	y[i] = VP8RGBToY(rgb[`0`], rgb[`1`], rgb[`2`], YUV_HALF);
58	}
59	}
60
61	static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) {
62	int i;
63	for (i = `0`; i + `8` <= width; i += `8`, bgr += `3` * `8`) {
64	const uint8x8x3_t BGR = vld3_u8(bgr);
65	const uint8x8_t Y = ConvertRGBToY_NEON(BGR.val[`2`], BGR.val[`1`], BGR.val[`0`]);
66	vst1_u8(y + i, Y);
67	}
68	for (; i < width; ++i, bgr += `3`) { // left-over
69	y[i] = VP8RGBToY(bgr[`2`], bgr[`1`], bgr[`0`], YUV_HALF);
70	}
71	}
72
73	static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) {
74	int i;
75	for (i = `0`; i + `8` <= width; i += `8`) {
76	const uint8x8x4_t RGB = vld4_u8((const uint8_t*)&argb[i]);
77	const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[`2`], RGB.val[`1`], RGB.val[`0`]);
78	vst1_u8(y + i, Y);
79	}
80	for (; i < width; ++i) { // left-over
81	const uint32_t p = argb[i];
82	y[i] = VP8RGBToY((p >> `16`) & `0xff`, (p >> `8`) & `0xff`, (p >> `0`) & `0xff`,
83	YUV_HALF);
84	}
85	}
86
87	//-----------------------------------------------------------------------------
88
89	// computes: DST_s16 = [(C0 * r + C1 * g + C2 * b) >> 16] + CST
// Declares the signed 16-bit low and high halves (r_lo, r_hi, g_lo, g_hi,
// b_lo, b_hi) of three uint16x8_t channel vectors. MULTIPLY_16b reads these
// names, so this preamble must be expanded in the same scope before it.
// The last declaration has no trailing semicolon: the caller supplies it.
#define MULTIPLY_16b_PREAMBLE(r, g, b)                             \
  const int16x4_t r_lo = vreinterpret_s16_u16(vget_low_u16(r));    \
  const int16x4_t g_lo = vreinterpret_s16_u16(vget_low_u16(g));    \
  const int16x4_t b_lo = vreinterpret_s16_u16(vget_low_u16(b));    \
  const int16x4_t r_hi = vreinterpret_s16_u16(vget_high_u16(r));   \
  const int16x4_t g_hi = vreinterpret_s16_u16(vget_high_u16(g));   \
  const int16x4_t b_hi = vreinterpret_s16_u16(vget_high_u16(b))
97
// Per-lane weighted sum over the r/g/b halves declared by
// MULTIPLY_16b_PREAMBLE. Accumulates C0 * r + C1 * g + C2 * b in 32 bits,
// shifts right by 16 (no rounding) while narrowing to 16 bits, adds CST and
// assigns the eight signed 16-bit results to DST_s16.
#define MULTIPLY_16b(C0, C1, C2, CST, DST_s16) do {                        \
  const int32x4_t prod_r_lo = vmull_n_s16(r_lo, C0);                       \
  const int32x4_t sum_rg_lo = vmlal_n_s16(prod_r_lo, g_lo, C1);            \
  const int32x4_t sum_rgb_lo = vmlal_n_s16(sum_rg_lo, b_lo, C2);           \
  const int32x4_t prod_r_hi = vmull_n_s16(r_hi, C0);                       \
  const int32x4_t sum_rg_hi = vmlal_n_s16(prod_r_hi, g_hi, C1);            \
  const int32x4_t sum_rgb_hi = vmlal_n_s16(sum_rg_hi, b_hi, C2);           \
  const int16x8_t narrowed = vcombine_s16(vshrn_n_s32(sum_rgb_lo, 16),     \
                                          vshrn_n_s32(sum_rgb_hi, 16));    \
  DST_s16 = vaddq_s16(narrowed, vdupq_n_s16(CST));                         \
} while (0)
109
// Computes, per lane, from three uint16x8_t channel vectors:
//   U_DST = [(-9719 * r - 19081 * g + 28800 * b) >> 16] + (128 << SHIFT)
//   V_DST = [(28800 * r - 24116 * g -  4684 * b) >> 16] + (128 << SHIFT)
// U_DST and V_DST must be int16x8_t lvalues.
// This needs to be a macro, since (128 << SHIFT) needs to be an immediate.
// SHIFT is parenthesized so that an expression argument cannot alter the
// precedence of the shift.
#define CONVERT_RGB_TO_UV(r, g, b, SHIFT, U_DST, V_DST) do {          \
  MULTIPLY_16b_PREAMBLE(r, g, b);                                      \
  MULTIPLY_16b(-9719, -19081, 28800, 128 << (SHIFT), U_DST);           \
  MULTIPLY_16b(28800, -24116, -4684, 128 << (SHIFT), V_DST);           \
} while (0)
116
117	static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb,
118	uint8_t* u, uint8_t* v, int width) {
119	int i;
120	for (i = `0`; i + `8` <= width; i += `8`, rgb += `4` * `8`) {
121	const uint16x8x4_t RGB = vld4q_u16((const uint16_t*)rgb);
122	int16x8_t U, V;
123	CONVERT_RGB_TO_UV(RGB.val[`0`], RGB.val[`1`], RGB.val[`2`], `2`, U, V);
124	vst1_u8(u + i, vqrshrun_n_s16(U, `2`));
125	vst1_u8(v + i, vqrshrun_n_s16(V, `2`));
126	}
127	for (; i < width; i += `1`, rgb += `4`) {
128	const int r = rgb[`0`], g = rgb[`1`], b = rgb[`2`];
129	u[i] = VP8RGBToU(r, g, b, YUV_HALF << `2`);
130	v[i] = VP8RGBToV(r, g, b, YUV_HALF << `2`);
131	}
132	}
133
134	static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v,
135	int src_width, int do_store) {
136	int i;
137	for (i = `0`; i + `16` <= src_width; i += `16`, u += `8`, v += `8`) {
138	const uint8x16x4_t RGB = vld4q_u8((const uint8_t*)&argb[i]);
139	const uint16x8_t R = vpaddlq_u8(RGB.val[`2`]); // pair-wise adds
140	const uint16x8_t G = vpaddlq_u8(RGB.val[`1`]);
141	const uint16x8_t B = vpaddlq_u8(RGB.val[`0`]);
142	int16x8_t U_tmp, V_tmp;
143	CONVERT_RGB_TO_UV(R, G, B, `1`, U_tmp, V_tmp);
144	{
145	const uint8x8_t U = vqrshrun_n_s16(U_tmp, `1`);
146	const uint8x8_t V = vqrshrun_n_s16(V_tmp, `1`);
147	if (do_store) {
148	vst1_u8(u, U);
149	vst1_u8(v, V);
150	} else {
151	const uint8x8_t prev_u = vld1_u8(u);
152	const uint8x8_t prev_v = vld1_u8(v);
153	vst1_u8(u, vrhadd_u8(U, prev_u));
154	vst1_u8(v, vrhadd_u8(V, prev_v));
155	}
156	}
157	}
158	if (i < src_width) { // left-over
159	WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
160	}
161	}
162
163
164	//------------------------------------------------------------------------------
165
166	extern void WebPInitConvertARGBToYUVNEON(void);
167
168	WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVNEON(void) {
169	WebPConvertRGB24ToY = ConvertRGB24ToY_NEON;
170	WebPConvertBGR24ToY = ConvertBGR24ToY_NEON;
171	WebPConvertARGBToY = ConvertARGBToY_NEON;
172	WebPConvertARGBToUV = ConvertARGBToUV_NEON;
173	WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_NEON;
174	}
175
176	#else // !WEBP_USE_NEON
177
// Non-NEON build. NOTE(review): WEBP_DSP_INIT_STUB is defined elsewhere;
// presumably it emits an empty WebPInitConvertARGBToYUVNEON() so the symbol
// still exists when WEBP_USE_NEON is not defined -- confirm.
178	WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVNEON)
179
180	#endif // WEBP_USE_NEON
181

Browse the source code of Godot/thirdparty/libwebp/src/dsp/yuv_neon.c