1// Copyright 2014 Google Inc. All Rights Reserved.
2//
3// Use of this source code is governed by a BSD-style license
4// that can be found in the COPYING file in the root of the source
5// tree. An additional intellectual property rights grant can be found
6// in the file PATENTS. All contributing project authors may
7// be found in the AUTHORS file in the root of the source tree.
8// -----------------------------------------------------------------------------
9//
10// YUV->RGB conversion functions
11//
12// Author: Skal (pascal.massimino@gmail.com)
13
14#include "src/dsp/yuv.h"
15
16#if defined(WEBP_USE_SSE2)
17
18#include <stdlib.h>
19#include <emmintrin.h>
20
21#include "src/dsp/common_sse2.h"
22#include "src/utils/utils.h"
23
24//-----------------------------------------------------------------------------
25// Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
26
// These constants are 14b fixed-point version of ITU-R BT.601 constants.
// R = (19077 * y + 26149 * v - 14234) >> 6
// G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6
// B = (19077 * y + 33050 * u - 17685) >> 6
//
// Converts eight Y/U/V samples to eight 16b R/G/B samples. The inputs are
// expected in the *high* byte of each 16b lane (see Load_HI_16_SSE2()), so
// _mm_mulhi_epu16() against a 14b constant yields the product descaled by 2b.
static void ConvertYUV444ToRGB_SSE2(const __m128i* const Y0,
                                    const __m128i* const U0,
                                    const __m128i* const V0,
                                    __m128i* const R,
                                    __m128i* const G,
                                    __m128i* const B) {
  const __m128i k19077 = _mm_set1_epi16(19077);
  const __m128i k26149 = _mm_set1_epi16(26149);
  const __m128i k14234 = _mm_set1_epi16(14234);
  // 33050 doesn't fit in a signed short: only use this with unsigned arithmetic
  const __m128i k33050 = _mm_set1_epi16((short)33050);
  const __m128i k17685 = _mm_set1_epi16(17685);
  const __m128i k6419 = _mm_set1_epi16(6419);
  const __m128i k13320 = _mm_set1_epi16(13320);
  const __m128i k8708 = _mm_set1_epi16(8708);

  const __m128i Y1 = _mm_mulhi_epu16(*Y0, k19077);  // common luma term

  // R channel: Y1 + 26149 * v - 14234.
  const __m128i R0 = _mm_mulhi_epu16(*V0, k26149);
  const __m128i R1 = _mm_sub_epi16(Y1, k14234);
  const __m128i R2 = _mm_add_epi16(R1, R0);

  // G channel: Y1 - 6419 * u - 13320 * v + 8708.
  const __m128i G0 = _mm_mulhi_epu16(*U0, k6419);
  const __m128i G1 = _mm_mulhi_epu16(*V0, k13320);
  const __m128i G2 = _mm_add_epi16(Y1, k8708);
  const __m128i G3 = _mm_add_epi16(G0, G1);
  const __m128i G4 = _mm_sub_epi16(G2, G3);

  // be careful with the saturated *unsigned* arithmetic here!
  // B channel: Y1 + 33050 * u - 17685, kept non-negative by saturation.
  const __m128i B0 = _mm_mulhi_epu16(*U0, k33050);
  const __m128i B1 = _mm_adds_epu16(B0, Y1);
  const __m128i B2 = _mm_subs_epu16(B1, k17685);

  // use logical shift for B2, which can be larger than 32767
  *R = _mm_srai_epi16(R2, 6);   // range: [-14234, 30815]
  *G = _mm_srai_epi16(G4, 6);   // range: [-10953, 27710]
  *B = _mm_srli_epi16(B2, 6);   // range: [0, 34238]
}
69
70// Load the bytes into the *upper* part of 16b words. That's "<< 8", basically.
71static WEBP_INLINE __m128i Load_HI_16_SSE2(const uint8_t* src) {
72 const __m128i zero = _mm_setzero_si128();
73 return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src));
74}
75
76// Load and replicate the U/V samples
77static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) {
78 const __m128i zero = _mm_setzero_si128();
79 const __m128i tmp0 = _mm_cvtsi32_si128(WebPMemToInt32(src));
80 const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
81 return _mm_unpacklo_epi16(tmp1, tmp1); // replicate samples
82}
83
84// Convert 32 samples of YUV444 to R/G/B
85static void YUV444ToRGB_SSE2(const uint8_t* const y,
86 const uint8_t* const u,
87 const uint8_t* const v,
88 __m128i* const R, __m128i* const G,
89 __m128i* const B) {
90 const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_HI_16_SSE2(u),
91 V0 = Load_HI_16_SSE2(v);
92 ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B);
93}
94
95// Convert 32 samples of YUV420 to R/G/B
96static void YUV420ToRGB_SSE2(const uint8_t* const y,
97 const uint8_t* const u,
98 const uint8_t* const v,
99 __m128i* const R, __m128i* const G,
100 __m128i* const B) {
101 const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_UV_HI_8_SSE2(u),
102 V0 = Load_UV_HI_8_SSE2(v);
103 ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B);
104}
105
106// Pack R/G/B/A results into 32b output.
107static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R,
108 const __m128i* const G,
109 const __m128i* const B,
110 const __m128i* const A,
111 uint8_t* const dst) {
112 const __m128i rb = _mm_packus_epi16(*R, *B);
113 const __m128i ga = _mm_packus_epi16(*G, *A);
114 const __m128i rg = _mm_unpacklo_epi8(rb, ga);
115 const __m128i ba = _mm_unpackhi_epi8(rb, ga);
116 const __m128i RGBA_lo = _mm_unpacklo_epi16(rg, ba);
117 const __m128i RGBA_hi = _mm_unpackhi_epi16(rg, ba);
118 _mm_storeu_si128((__m128i*)(dst + 0), RGBA_lo);
119 _mm_storeu_si128((__m128i*)(dst + 16), RGBA_hi);
120}
121
122// Pack R/G/B/A results into 16b output.
123static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R,
124 const __m128i* const G,
125 const __m128i* const B,
126 const __m128i* const A,
127 uint8_t* const dst) {
128#if (WEBP_SWAP_16BIT_CSP == 0)
129 const __m128i rg0 = _mm_packus_epi16(*R, *G);
130 const __m128i ba0 = _mm_packus_epi16(*B, *A);
131#else
132 const __m128i rg0 = _mm_packus_epi16(*B, *A);
133 const __m128i ba0 = _mm_packus_epi16(*R, *G);
134#endif
135 const __m128i mask_0xf0 = _mm_set1_epi8((char)0xf0);
136 const __m128i rb1 = _mm_unpacklo_epi8(rg0, ba0); // rbrbrbrbrb...
137 const __m128i ga1 = _mm_unpackhi_epi8(rg0, ba0); // gagagagaga...
138 const __m128i rb2 = _mm_and_si128(rb1, mask_0xf0);
139 const __m128i ga2 = _mm_srli_epi16(_mm_and_si128(ga1, mask_0xf0), 4);
140 const __m128i rgba4444 = _mm_or_si128(rb2, ga2);
141 _mm_storeu_si128((__m128i*)dst, rgba4444);
142}
143
144// Pack R/G/B results into 16b output.
145static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R,
146 const __m128i* const G,
147 const __m128i* const B,
148 uint8_t* const dst) {
149 const __m128i r0 = _mm_packus_epi16(*R, *R);
150 const __m128i g0 = _mm_packus_epi16(*G, *G);
151 const __m128i b0 = _mm_packus_epi16(*B, *B);
152 const __m128i r1 = _mm_and_si128(r0, _mm_set1_epi8((char)0xf8));
153 const __m128i b1 = _mm_and_si128(_mm_srli_epi16(b0, 3), _mm_set1_epi8(0x1f));
154 const __m128i g1 =
155 _mm_srli_epi16(_mm_and_si128(g0, _mm_set1_epi8((char)0xe0)), 5);
156 const __m128i g2 = _mm_slli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0x1c)), 3);
157 const __m128i rg = _mm_or_si128(r1, g1);
158 const __m128i gb = _mm_or_si128(g2, b1);
159#if (WEBP_SWAP_16BIT_CSP == 0)
160 const __m128i rgb565 = _mm_unpacklo_epi8(rg, gb);
161#else
162 const __m128i rgb565 = _mm_unpacklo_epi8(gb, rg);
163#endif
164 _mm_storeu_si128((__m128i*)dst, rgb565);
165}
166
167// Pack the planar buffers
168// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
169// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
170static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
171 __m128i* const in2, __m128i* const in3,
172 __m128i* const in4, __m128i* const in5,
173 uint8_t* const rgb) {
174 // The input is 6 registers of sixteen 8b but for the sake of explanation,
175 // let's take 6 registers of four 8b values.
176 // To pack, we will keep taking one every two 8b integer and move it
177 // around as follows:
178 // Input:
179 // r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7
180 // Split the 6 registers in two sets of 3 registers: the first set as the even
181 // 8b bytes, the second the odd ones:
182 // r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7
183 // Repeat the same permutations twice more:
184 // r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7
185 // r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
186 VP8PlanarTo24b_SSE2(in0, in1, in2, in3, in4, in5);
187
188 _mm_storeu_si128((__m128i*)(rgb + 0), *in0);
189 _mm_storeu_si128((__m128i*)(rgb + 16), *in1);
190 _mm_storeu_si128((__m128i*)(rgb + 32), *in2);
191 _mm_storeu_si128((__m128i*)(rgb + 48), *in3);
192 _mm_storeu_si128((__m128i*)(rgb + 64), *in4);
193 _mm_storeu_si128((__m128i*)(rgb + 80), *in5);
194}
195
// Convert 32 YUV444 pixels to RGBA (alpha forced to 255).
void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                         uint8_t* dst) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int i;
  for (i = 0; i < 32; i += 8) {
    __m128i R, G, B;
    YUV444ToRGB_SSE2(y + i, u + i, v + i, &R, &G, &B);
    PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst + 4 * i);
  }
}
206
// Convert 32 YUV444 pixels to BGRA (alpha forced to 255).
void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                         uint8_t* dst) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int i;
  for (i = 0; i < 32; i += 8) {
    __m128i R, G, B;
    YUV444ToRGB_SSE2(y + i, u + i, v + i, &R, &G, &B);
    PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst + 4 * i);
  }
}
217
// Convert 32 YUV444 pixels to ARGB (alpha forced to 255).
void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                         uint8_t* dst) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int i;
  for (i = 0; i < 32; i += 8) {
    __m128i R, G, B;
    YUV444ToRGB_SSE2(y + i, u + i, v + i, &R, &G, &B);
    PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst + 4 * i);
  }
}
228
// Convert 32 YUV444 pixels to RGBA4444 (alpha forced to 255).
void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
                             const uint8_t* v, uint8_t* dst) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int i;
  for (i = 0; i < 32; i += 8) {
    __m128i R, G, B;
    YUV444ToRGB_SSE2(y + i, u + i, v + i, &R, &G, &B);
    PackAndStore4444_SSE2(&R, &G, &B, &kAlpha, dst + 2 * i);
  }
}
239
// Convert 32 YUV444 pixels to RGB565.
void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                           uint8_t* dst) {
  int i;
  for (i = 0; i < 32; i += 8) {
    __m128i R, G, B;
    YUV444ToRGB_SSE2(y + i, u + i, v + i, &R, &G, &B);
    PackAndStore565_SSE2(&R, &G, &B, dst + 2 * i);
  }
}
249
// Convert 32 YUV444 pixels to packed RGB (3 bytes per pixel).
void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                        uint8_t* dst) {
  __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;

  // Convert 8 pixels at a time.
  YUV444ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
  YUV444ToRGB_SSE2(y + 8, u + 8, v + 8, &R1, &G1, &B1);
  YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2);
  YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3);

  {
    // Saturate to 8b, planar order: RRRR... GGGG... BBBB...
    __m128i p0 = _mm_packus_epi16(R0, R1);
    __m128i p1 = _mm_packus_epi16(R2, R3);
    __m128i p2 = _mm_packus_epi16(G0, G1);
    __m128i p3 = _mm_packus_epi16(G2, G3);
    __m128i p4 = _mm_packus_epi16(B0, B1);
    __m128i p5 = _mm_packus_epi16(B2, B3);
    // Interleave to RGBRGB... and store.
    PlanarTo24b_SSE2(&p0, &p1, &p2, &p3, &p4, &p5, dst);
  }
}
271
// Convert 32 YUV444 pixels to packed BGR (3 bytes per pixel).
void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                        uint8_t* dst) {
  __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
  __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;

  // Convert 8 pixels at a time.
  YUV444ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
  YUV444ToRGB_SSE2(y + 8, u + 8, v + 8, &R1, &G1, &B1);
  YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2);
  YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3);

  // Cast to 8b and store as BBBBGGGGRRRR.
  bgr0 = _mm_packus_epi16(B0, B1);
  bgr1 = _mm_packus_epi16(B2, B3);
  bgr2 = _mm_packus_epi16(G0, G1);
  bgr3 = _mm_packus_epi16(G2, G3);
  bgr4 = _mm_packus_epi16(R0, R1);
  bgr5 = _mm_packus_epi16(R2, R3);

  // Pack as BGRBGRBGRBGR.
  PlanarTo24b_SSE2(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
}
293
294//-----------------------------------------------------------------------------
295// Arbitrary-length row conversion functions
296
// Convert a row of `len` YUV420 pixels to RGBA (alpha = 255).
static void YuvToRgbaRow_SSE2(const uint8_t* y,
                              const uint8_t* u, const uint8_t* v,
                              uint8_t* dst, int len) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n = 0;
  // SIMD path: 8 pixels (4 chroma samples) per iteration.
  while (n + 8 <= len) {
    __m128i R, G, B;
    YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
    PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst);
    y += 8;
    u += 4;
    v += 4;
    dst += 32;
    n += 8;
  }
  // Scalar tail: chroma advances every other luma sample.
  while (n < len) {
    VP8YuvToRgba(y[0], u[0], v[0], dst);
    dst += 4;
    y += 1;
    u += (n & 1);
    v += (n & 1);
    ++n;
  }
}
318
// Convert a row of `len` YUV420 pixels to BGRA (alpha = 255).
static void YuvToBgraRow_SSE2(const uint8_t* y,
                              const uint8_t* u, const uint8_t* v,
                              uint8_t* dst, int len) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n = 0;
  // SIMD path: 8 pixels (4 chroma samples) per iteration.
  while (n + 8 <= len) {
    __m128i R, G, B;
    YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
    PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst);
    y += 8;
    u += 4;
    v += 4;
    dst += 32;
    n += 8;
  }
  // Scalar tail: chroma advances every other luma sample.
  while (n < len) {
    VP8YuvToBgra(y[0], u[0], v[0], dst);
    dst += 4;
    y += 1;
    u += (n & 1);
    v += (n & 1);
    ++n;
  }
}
340
// Convert a row of `len` YUV420 pixels to ARGB (alpha = 255).
static void YuvToArgbRow_SSE2(const uint8_t* y,
                              const uint8_t* u, const uint8_t* v,
                              uint8_t* dst, int len) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n = 0;
  // SIMD path: 8 pixels (4 chroma samples) per iteration.
  while (n + 8 <= len) {
    __m128i R, G, B;
    YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
    PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst);
    y += 8;
    u += 4;
    v += 4;
    dst += 32;
    n += 8;
  }
  // Scalar tail: chroma advances every other luma sample.
  while (n < len) {
    VP8YuvToArgb(y[0], u[0], v[0], dst);
    dst += 4;
    y += 1;
    u += (n & 1);
    v += (n & 1);
    ++n;
  }
}
362
// Convert a row of `len` YUV420 pixels to packed RGB (3 bytes per pixel).
static void YuvToRgbRow_SSE2(const uint8_t* y,
                             const uint8_t* u, const uint8_t* v,
                             uint8_t* dst, int len) {
  int n = 0;
  // SIMD path: 32 pixels (16 chroma samples) per iteration.
  while (n + 32 <= len) {
    __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
    __m128i p0, p1, p2, p3, p4, p5;

    YUV420ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
    YUV420ToRGB_SSE2(y + 8, u + 4, v + 4, &R1, &G1, &B1);
    YUV420ToRGB_SSE2(y + 16, u + 8, v + 8, &R2, &G2, &B2);
    YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3);

    // Saturate to 8b, planar order: RRRR... GGGG... BBBB...
    p0 = _mm_packus_epi16(R0, R1);
    p1 = _mm_packus_epi16(R2, R3);
    p2 = _mm_packus_epi16(G0, G1);
    p3 = _mm_packus_epi16(G2, G3);
    p4 = _mm_packus_epi16(B0, B1);
    p5 = _mm_packus_epi16(B2, B3);

    // Interleave to RGBRGB... and store 96 bytes.
    PlanarTo24b_SSE2(&p0, &p1, &p2, &p3, &p4, &p5, dst);

    y += 32;
    u += 16;
    v += 16;
    dst += 32 * 3;
    n += 32;
  }
  // Scalar tail: chroma advances every other luma sample.
  while (n < len) {
    VP8YuvToRgb(y[0], u[0], v[0], dst);
    dst += 3;
    y += 1;
    u += (n & 1);
    v += (n & 1);
    ++n;
  }
}
399
// Convert a row of `len` YUV420 pixels to packed BGR (3 bytes per pixel).
static void YuvToBgrRow_SSE2(const uint8_t* y,
                             const uint8_t* u, const uint8_t* v,
                             uint8_t* dst, int len) {
  int n = 0;
  // SIMD path: 32 pixels (16 chroma samples) per iteration.
  while (n + 32 <= len) {
    __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
    __m128i p0, p1, p2, p3, p4, p5;

    YUV420ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
    YUV420ToRGB_SSE2(y + 8, u + 4, v + 4, &R1, &G1, &B1);
    YUV420ToRGB_SSE2(y + 16, u + 8, v + 8, &R2, &G2, &B2);
    YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3);

    // Saturate to 8b, planar order: BBBB... GGGG... RRRR...
    p0 = _mm_packus_epi16(B0, B1);
    p1 = _mm_packus_epi16(B2, B3);
    p2 = _mm_packus_epi16(G0, G1);
    p3 = _mm_packus_epi16(G2, G3);
    p4 = _mm_packus_epi16(R0, R1);
    p5 = _mm_packus_epi16(R2, R3);

    // Interleave to BGRBGR... and store 96 bytes.
    PlanarTo24b_SSE2(&p0, &p1, &p2, &p3, &p4, &p5, dst);

    y += 32;
    u += 16;
    v += 16;
    dst += 32 * 3;
    n += 32;
  }
  // Scalar tail: chroma advances every other luma sample.
  while (n < len) {
    VP8YuvToBgr(y[0], u[0], v[0], dst);
    dst += 3;
    y += 1;
    u += (n & 1);
    v += (n & 1);
    ++n;
  }
}
436
437//------------------------------------------------------------------------------
438// Entry point
439
440extern void WebPInitSamplersSSE2(void);
441
442WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE2(void) {
443 WebPSamplers[MODE_RGB] = YuvToRgbRow_SSE2;
444 WebPSamplers[MODE_RGBA] = YuvToRgbaRow_SSE2;
445 WebPSamplers[MODE_BGR] = YuvToBgrRow_SSE2;
446 WebPSamplers[MODE_BGRA] = YuvToBgraRow_SSE2;
447 WebPSamplers[MODE_ARGB] = YuvToArgbRow_SSE2;
448}
449
450//------------------------------------------------------------------------------
451// RGB24/32 -> YUV converters
452
// Load eight 16b-words from *src.
#define LOAD_16(src) _mm_loadu_si128((const __m128i*)(src))
// Store eight 16b-words into *dst.
#define STORE_16(V, dst) _mm_storeu_si128((__m128i*)(dst), (V))
457
458// Function that inserts a value of the second half of the in buffer in between
459// every two char of the first half.
460static WEBP_INLINE void RGB24PackedToPlanarHelper_SSE2(
461 const __m128i* const in /*in[6]*/, __m128i* const out /*out[6]*/) {
462 out[0] = _mm_unpacklo_epi8(in[0], in[3]);
463 out[1] = _mm_unpackhi_epi8(in[0], in[3]);
464 out[2] = _mm_unpacklo_epi8(in[1], in[4]);
465 out[3] = _mm_unpackhi_epi8(in[1], in[4]);
466 out[4] = _mm_unpacklo_epi8(in[2], in[5]);
467 out[5] = _mm_unpackhi_epi8(in[2], in[5]);
468}
469
470// Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers:
471// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
472// Similar to PlanarTo24bHelper(), but in reverse order.
473static WEBP_INLINE void RGB24PackedToPlanar_SSE2(
474 const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
475 __m128i tmp[6];
476 tmp[0] = _mm_loadu_si128((const __m128i*)(rgb + 0));
477 tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16));
478 tmp[2] = _mm_loadu_si128((const __m128i*)(rgb + 32));
479 tmp[3] = _mm_loadu_si128((const __m128i*)(rgb + 48));
480 tmp[4] = _mm_loadu_si128((const __m128i*)(rgb + 64));
481 tmp[5] = _mm_loadu_si128((const __m128i*)(rgb + 80));
482
483 RGB24PackedToPlanarHelper_SSE2(tmp, out);
484 RGB24PackedToPlanarHelper_SSE2(out, tmp);
485 RGB24PackedToPlanarHelper_SSE2(tmp, out);
486 RGB24PackedToPlanarHelper_SSE2(out, tmp);
487 RGB24PackedToPlanarHelper_SSE2(tmp, out);
488}
489
490// Convert 8 packed ARGB to r[], g[], b[]
491static WEBP_INLINE void RGB32PackedToPlanar_SSE2(const uint32_t* const argb,
492 __m128i* const rgb /*in[6]*/) {
493 const __m128i zero = _mm_setzero_si128();
494 __m128i a0 = LOAD_16(argb + 0);
495 __m128i a1 = LOAD_16(argb + 4);
496 __m128i a2 = LOAD_16(argb + 8);
497 __m128i a3 = LOAD_16(argb + 12);
498 VP8L32bToPlanar_SSE2(&a0, &a1, &a2, &a3);
499 rgb[0] = _mm_unpacklo_epi8(a1, zero);
500 rgb[1] = _mm_unpackhi_epi8(a1, zero);
501 rgb[2] = _mm_unpacklo_epi8(a2, zero);
502 rgb[3] = _mm_unpackhi_epi8(a2, zero);
503 rgb[4] = _mm_unpacklo_epi8(a3, zero);
504 rgb[5] = _mm_unpackhi_epi8(a3, zero);
505}
506
// This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX.
// It's a macro and not a function because we need to pass immediate values to
// _mm_srai_epi32().
#define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB, \
                  ROUNDER, DESCALE_FIX, OUT) do { \
  const __m128i V0_lo = _mm_madd_epi16(RG_LO, MULT_RG); \
  const __m128i V0_hi = _mm_madd_epi16(RG_HI, MULT_RG); \
  const __m128i V1_lo = _mm_madd_epi16(GB_LO, MULT_GB); \
  const __m128i V1_hi = _mm_madd_epi16(GB_HI, MULT_GB); \
  const __m128i V2_lo = _mm_add_epi32(V0_lo, V1_lo); \
  const __m128i V2_hi = _mm_add_epi32(V0_hi, V1_hi); \
  const __m128i V3_lo = _mm_add_epi32(V2_lo, ROUNDER); \
  const __m128i V3_hi = _mm_add_epi32(V2_hi, ROUNDER); \
  const __m128i V5_lo = _mm_srai_epi32(V3_lo, DESCALE_FIX); \
  const __m128i V5_hi = _mm_srai_epi32(V3_hi, DESCALE_FIX); \
  (OUT) = _mm_packs_epi32(V5_lo, V5_hi); \
} while (0)

// Builds a vector of four repeated (A, B) 16b pairs, for use as a
// coefficient operand to _mm_madd_epi16().
#define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))
// Convert eight 16b R/G/B samples into eight 16b luma samples:
//   Y = (16839 * r + 33059 * g + 6420 * b + rounder) >> YUV_FIX,
// with the +16 luma offset folded into the rounder. 33059 exceeds the signed
// 16b range of _mm_madd_epi16() coefficients, so 16384 * g is split off into
// the GB product.
static WEBP_INLINE void ConvertRGBToY_SSE2(const __m128i* const R,
                                           const __m128i* const G,
                                           const __m128i* const B,
                                           __m128i* const Y) {
  const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384);
  const __m128i kGB_y = MK_CST_16(16384, 6420);
  const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF);

  // Interleave the channels pairwise for _mm_madd_epi16().
  const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
  const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
  const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
  const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y);
}
540
// Convert eight 16b R/G/B samples into eight 16b U and V samples:
//   U = (-9719 * r - 19081 * g + 28800 * b + rounder) >> (YUV_FIX + 2)
//   V = (28800 * r - 24116 * g -  4684 * b + rounder) >> (YUV_FIX + 2)
// The extra descaling by 2 and the pre-shifted rounder indicate the inputs
// are expected at 4x scale (e.g. the doubled pair-sums produced by
// HorizontalAddPack_SSE2()) — see callers.
static WEBP_INLINE void ConvertRGBToUV_SSE2(const __m128i* const R,
                                            const __m128i* const G,
                                            const __m128i* const B,
                                            __m128i* const U,
                                            __m128i* const V) {
  const __m128i kRG_u = MK_CST_16(-9719, -19081);
  const __m128i kGB_u = MK_CST_16(0, 28800);
  const __m128i kRG_v = MK_CST_16(28800, 0);
  const __m128i kGB_v = MK_CST_16(-24116, -4684);
  const __m128i kHALF_UV = _mm_set1_epi32(((128 << YUV_FIX) + YUV_HALF) << 2);

  // Interleave the channels pairwise for _mm_madd_epi16().
  const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
  const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
  const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
  const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_u, kGB_u,
            kHALF_UV, YUV_FIX + 2, *U);
  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_v, kGB_v,
            kHALF_UV, YUV_FIX + 2, *V);
}
561
562#undef MK_CST_16
563#undef TRANSFORM
564
565static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) {
566 const int max_width = width & ~31;
567 int i;
568 for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
569 __m128i rgb_plane[6];
570 int j;
571
572 RGB24PackedToPlanar_SSE2(rgb, rgb_plane);
573
574 for (j = 0; j < 2; ++j, i += 16) {
575 const __m128i zero = _mm_setzero_si128();
576 __m128i r, g, b, Y0, Y1;
577
578 // Convert to 16-bit Y.
579 r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero);
580 g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero);
581 b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero);
582 ConvertRGBToY_SSE2(&r, &g, &b, &Y0);
583
584 // Convert to 16-bit Y.
585 r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero);
586 g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero);
587 b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero);
588 ConvertRGBToY_SSE2(&r, &g, &b, &Y1);
589
590 // Cast to 8-bit and store.
591 STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
592 }
593 }
594 for (; i < width; ++i, rgb += 3) { // left-over
595 y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
596 }
597}
598
599static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) {
600 const int max_width = width & ~31;
601 int i;
602 for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
603 __m128i bgr_plane[6];
604 int j;
605
606 RGB24PackedToPlanar_SSE2(bgr, bgr_plane);
607
608 for (j = 0; j < 2; ++j, i += 16) {
609 const __m128i zero = _mm_setzero_si128();
610 __m128i r, g, b, Y0, Y1;
611
612 // Convert to 16-bit Y.
613 b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero);
614 g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero);
615 r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero);
616 ConvertRGBToY_SSE2(&r, &g, &b, &Y0);
617
618 // Convert to 16-bit Y.
619 b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero);
620 g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero);
621 r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero);
622 ConvertRGBToY_SSE2(&r, &g, &b, &Y1);
623
624 // Cast to 8-bit and store.
625 STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
626 }
627 }
628 for (; i < width; ++i, bgr += 3) { // left-over
629 y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
630 }
631}
632
633static void ConvertARGBToY_SSE2(const uint32_t* argb, uint8_t* y, int width) {
634 const int max_width = width & ~15;
635 int i;
636 for (i = 0; i < max_width; i += 16) {
637 __m128i Y0, Y1, rgb[6];
638 RGB32PackedToPlanar_SSE2(&argb[i], rgb);
639 ConvertRGBToY_SSE2(&rgb[0], &rgb[2], &rgb[4], &Y0);
640 ConvertRGBToY_SSE2(&rgb[1], &rgb[3], &rgb[5], &Y1);
641 STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
642 }
643 for (; i < width; ++i) { // left-over
644 const uint32_t p = argb[i];
645 y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff,
646 YUV_HALF);
647 }
648}
649
// Pairwise horizontal add of 16b lanes, doubled, packed back to 16b with
// signed saturation.
// in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ...
static void HorizontalAddPack_SSE2(const __m128i* const A,
                                   const __m128i* const B,
                                   __m128i* const out) {
  const __m128i two = _mm_set1_epi16(2);
  const __m128i sum_a = _mm_madd_epi16(*A, two);  // 32b lanes: 2*a0 + 2*a1 ...
  const __m128i sum_b = _mm_madd_epi16(*B, two);
  *out = _mm_packs_epi32(sum_a, sum_b);
}
660
661static void ConvertARGBToUV_SSE2(const uint32_t* argb,
662 uint8_t* u, uint8_t* v,
663 int src_width, int do_store) {
664 const int max_width = src_width & ~31;
665 int i;
666 for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
667 __m128i rgb[6], U0, V0, U1, V1;
668 RGB32PackedToPlanar_SSE2(&argb[i], rgb);
669 HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]);
670 HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]);
671 HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]);
672 ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
673
674 RGB32PackedToPlanar_SSE2(&argb[i + 16], rgb);
675 HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]);
676 HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]);
677 HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]);
678 ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
679
680 U0 = _mm_packus_epi16(U0, U1);
681 V0 = _mm_packus_epi16(V0, V1);
682 if (!do_store) {
683 const __m128i prev_u = LOAD_16(u);
684 const __m128i prev_v = LOAD_16(v);
685 U0 = _mm_avg_epu8(U0, prev_u);
686 V0 = _mm_avg_epu8(V0, prev_v);
687 }
688 STORE_16(U0, u);
689 STORE_16(V0, v);
690 }
691 if (i < src_width) { // left-over
692 WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
693 }
694}
695
// De-interleave 8 packed pixels of four 16b values each (32 values total)
// into planar r[], g[], b[]; the fourth ('x') channel is discarded.
static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2(
    const uint16_t* const rgbx,
    __m128i* const r, __m128i* const g, __m128i* const b) {
  const __m128i in0 = LOAD_16(rgbx +  0);  // r0 | g0 | b0 |x| r1 | g1 | b1 |x
  const __m128i in1 = LOAD_16(rgbx +  8);  // r2 | g2 | b2 |x| r3 | g3 | b3 |x
  const __m128i in2 = LOAD_16(rgbx + 16);  // r4 | ...
  const __m128i in3 = LOAD_16(rgbx + 24);  // r6 | ...
  // column-wise transpose, done as two rounds of 16b interleaving followed
  // by a final 64b gather of each channel's halves.
  const __m128i A0 = _mm_unpacklo_epi16(in0, in1);
  const __m128i A1 = _mm_unpackhi_epi16(in0, in1);
  const __m128i A2 = _mm_unpacklo_epi16(in2, in3);
  const __m128i A3 = _mm_unpackhi_epi16(in2, in3);
  const __m128i B0 = _mm_unpacklo_epi16(A0, A1);  // r0 r1 r2 r3 | g0 g1 ..
  const __m128i B1 = _mm_unpackhi_epi16(A0, A1);  // b0 b1 b2 b3 | x x x x
  const __m128i B2 = _mm_unpacklo_epi16(A2, A3);  // r4 r5 r6 r7 | g4 g5 ..
  const __m128i B3 = _mm_unpackhi_epi16(A2, A3);  // b4 b5 b6 b7 | x x x x
  *r = _mm_unpacklo_epi64(B0, B2);  // r0 ... r7
  *g = _mm_unpackhi_epi64(B0, B2);  // g0 ... g7
  *b = _mm_unpacklo_epi64(B1, B3);  // b0 ... b7 (the 'x' halves are dropped)
}
717
// Convert a row of 16b RGBX quadruplets to 8b U/V, 16 output values per pass
// (each pass consumes 16 quadruplets, i.e. 64 uint16 values).
static void ConvertRGBA32ToUV_SSE2(const uint16_t* rgb,
                                   uint8_t* u, uint8_t* v, int width) {
  const int max_width = width & ~15;
  int n;
  for (n = 0; n < max_width; n += 16, rgb += 2 * 32, u += 16, v += 16) {
    __m128i r, g, b, U0, V0, U1, V1;
    RGBA32PackedToPlanar_16b_SSE2(rgb + 0, &r, &g, &b);
    ConvertRGBToUV_SSE2(&r, &g, &b, &U0, &V0);
    RGBA32PackedToPlanar_16b_SSE2(rgb + 32, &r, &g, &b);
    ConvertRGBToUV_SSE2(&r, &g, &b, &U1, &V1);
    STORE_16(_mm_packus_epi16(U0, U1), u);
    STORE_16(_mm_packus_epi16(V0, V1), v);
  }
  if (max_width < width) {  // left-over, scalar fallback
    WebPConvertRGBA32ToUV_C(rgb, u, v, width - max_width);
  }
}
738
739//------------------------------------------------------------------------------
740
741extern void WebPInitConvertARGBToYUVSSE2(void);
742
743WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {
744 WebPConvertARGBToY = ConvertARGBToY_SSE2;
745 WebPConvertARGBToUV = ConvertARGBToUV_SSE2;
746
747 WebPConvertRGB24ToY = ConvertRGB24ToY_SSE2;
748 WebPConvertBGR24ToY = ConvertBGR24ToY_SSE2;
749
750 WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE2;
751}
752
753#else // !WEBP_USE_SSE2
754
755WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2)
756WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2)
757
758#endif // WEBP_USE_SSE2
759