yuv_sse41.c source code [Godot/thirdparty/libwebp/src/dsp/yuv_sse41.c]

1	// Copyright 2014 Google Inc. All Rights Reserved.
2	//
3	// Use of this source code is governed by a BSD-style license
4	// that can be found in the COPYING file in the root of the source
5	// tree. An additional intellectual property rights grant can be found
6	// in the file PATENTS. All contributing project authors may
7	// be found in the AUTHORS file in the root of the source tree.
8	// -----------------------------------------------------------------------------
9	//
10	// YUV->RGB conversion functions
11	//
12	// Author: Skal (pascal.massimino@gmail.com)
13
14	#include "src/dsp/yuv.h"
15
16	#if defined(WEBP_USE_SSE41)
17
18	#include <stdlib.h>
19	#include <smmintrin.h>
20
21	#include "src/dsp/common_sse41.h"
22	#include "src/utils/utils.h"
23
24	//-----------------------------------------------------------------------------
25	// Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
26
// These constants are 14b fixed-point version of ITU-R BT.601 constants.
// R = (19077 * y             + 26149 * v - 14234) >> 6
// G = (19077 * y -  6419 * u - 13320 * v +  8708) >> 6
// B = (19077 * y + 33050 * u             - 17685) >> 6
//
// Converts eight YUV444 samples (one per 16b lane) to 16b R/G/B values.
//   Y0, U0, V0: inputs. The loaders in this file place each 8b sample in the
//               upper byte of its lane, so that _mm_mulhi_epu16() (which keeps
//               the high 16 bits of the product) yields
//               '(sample * constant) >> 8'.
//   R, G, B:    outputs, one 16b value per lane. Nothing is clamped to
//               [0, 255] here; the callers in this file saturate the result
//               with _mm_packus_epi16().
static void ConvertYUV444ToRGB_SSE41(const __m128i* const Y0,
                                     const __m128i* const U0,
                                     const __m128i* const V0,
                                     __m128i* const R,
                                     __m128i* const G,
                                     __m128i* const B) {
  const __m128i k19077 = _mm_set1_epi16(19077);
  const __m128i k26149 = _mm_set1_epi16(26149);
  const __m128i k14234 = _mm_set1_epi16(14234);
  // 33050 doesn't fit in a signed short: only use this with unsigned arithmetic
  const __m128i k33050 = _mm_set1_epi16((short)33050);
  const __m128i k17685 = _mm_set1_epi16(17685);
  const __m128i k6419  = _mm_set1_epi16(6419);
  const __m128i k13320 = _mm_set1_epi16(13320);
  const __m128i k8708  = _mm_set1_epi16(8708);

  // Luma term shared by the three channels.
  const __m128i Y1 = _mm_mulhi_epu16(*Y0, k19077);

  const __m128i R0 = _mm_mulhi_epu16(*V0, k26149);
  const __m128i R1 = _mm_sub_epi16(Y1, k14234);
  const __m128i R2 = _mm_add_epi16(R1, R0);

  const __m128i G0 = _mm_mulhi_epu16(*U0, k6419);
  const __m128i G1 = _mm_mulhi_epu16(*V0, k13320);
  const __m128i G2 = _mm_add_epi16(Y1, k8708);
  const __m128i G3 = _mm_add_epi16(G0, G1);
  const __m128i G4 = _mm_sub_epi16(G2, G3);

  // be careful with the saturated *unsigned* arithmetic here!
  const __m128i B0 = _mm_mulhi_epu16(*U0, k33050);
  const __m128i B1 = _mm_adds_epu16(B0, Y1);
  const __m128i B2 = _mm_subs_epu16(B1, k17685);

  // use logical shift for B2, which can be larger than 32767
  *R = _mm_srai_epi16(R2, 6);   // range: [-14234, 30815]
  *G = _mm_srai_epi16(G4, 6);   // range: [-10953, 27710]
  *B = _mm_srli_epi16(B2, 6);   // range: [0, 34238]
}
69
70	// Load the bytes into the upper* part of 16b words. That's "<< 8", basically.*
71	static WEBP_INLINE __m128i Load_HI_16_SSE41(const uint8_t* src) {
72	const __m128i zero = _mm_setzero_si128();
73	return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src));
74	}
75
76	// Load and replicate the U/V samples
77	static WEBP_INLINE __m128i Load_UV_HI_8_SSE41(const uint8_t* src) {
78	const __m128i zero = _mm_setzero_si128();
79	const __m128i tmp0 = _mm_cvtsi32_si128(WebPMemToInt32(src));
80	const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
81	return _mm_unpacklo_epi16(tmp1, tmp1); // replicate samples
82	}
83
84	// Convert 32 samples of YUV444 to R/G/B
85	static void YUV444ToRGB_SSE41(const uint8_t* const y,
86	const uint8_t* const u,
87	const uint8_t* const v,
88	__m128i* const R, __m128i* const G,
89	__m128i* const B) {
90	const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_HI_16_SSE41(u),
91	V0 = Load_HI_16_SSE41(v);
92	ConvertYUV444ToRGB_SSE41(&Y0, &U0, &V0, R, G, B);
93	}
94
95	// Convert 32 samples of YUV420 to R/G/B
96	static void YUV420ToRGB_SSE41(const uint8_t* const y,
97	const uint8_t* const u,
98	const uint8_t* const v,
99	__m128i* const R, __m128i* const G,
100	__m128i* const B) {
101	const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_UV_HI_8_SSE41(u),
102	V0 = Load_UV_HI_8_SSE41(v);
103	ConvertYUV444ToRGB_SSE41(&Y0, &U0, &V0, R, G, B);
104	}
105
106	// Pack the planar buffers
107	// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
108	// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
109	static WEBP_INLINE void PlanarTo24b_SSE41(
110	__m128i* const in0, __m128i* const in1, __m128i* const in2,
111	__m128i* const in3, __m128i* const in4, __m128i* const in5,
112	uint8_t* const rgb) {
113	// The input is 6 registers of sixteen 8b but for the sake of explanation,
114	// let's take 6 registers of four 8b values.
115	// To pack, we will keep taking one every two 8b integer and move it
116	// around as follows:
117	// Input:
118	// r0r1r2r3 \| r4r5r6r7 \| g0g1g2g3 \| g4g5g6g7 \| b0b1b2b3 \| b4b5b6b7
119	// Split the 6 registers in two sets of 3 registers: the first set as the even
120	// 8b bytes, the second the odd ones:
121	// r0r2r4r6 \| g0g2g4g6 \| b0b2b4b6 \| r1r3r5r7 \| g1g3g5g7 \| b1b3b5b7
122	// Repeat the same permutations twice more:
123	// r0r4g0g4 \| b0b4r1r5 \| g1g5b1b5 \| r2r6g2g6 \| b2b6r3r7 \| g3g7b3b7
124	// r0g0b0r1 \| g1b1r2g2 \| b2r3g3b3 \| r4g4b4r5 \| g5b5r6g6 \| b6r7g7b7
125	VP8PlanarTo24b_SSE41(in0, in1, in2, in3, in4, in5);
126
127	_mm_storeu_si128((__m128i)(rgb + `0`), in0);
128	_mm_storeu_si128((__m128i)(rgb + `16`), in1);
129	_mm_storeu_si128((__m128i)(rgb + `32`), in2);
130	_mm_storeu_si128((__m128i)(rgb + `48`), in3);
131	_mm_storeu_si128((__m128i)(rgb + `64`), in4);
132	_mm_storeu_si128((__m128i)(rgb + `80`), in5);
133	}
134
135	void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
136	uint8_t* dst) {
137	__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
138	__m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
139
140	YUV444ToRGB_SSE41(y + `0`, u + `0`, v + `0`, &R0, &G0, &B0);
141	YUV444ToRGB_SSE41(y + `8`, u + `8`, v + `8`, &R1, &G1, &B1);
142	YUV444ToRGB_SSE41(y + `16`, u + `16`, v + `16`, &R2, &G2, &B2);
143	YUV444ToRGB_SSE41(y + `24`, u + `24`, v + `24`, &R3, &G3, &B3);
144
145	// Cast to 8b and store as RRRRGGGGBBBB.
146	rgb0 = _mm_packus_epi16(R0, R1);
147	rgb1 = _mm_packus_epi16(R2, R3);
148	rgb2 = _mm_packus_epi16(G0, G1);
149	rgb3 = _mm_packus_epi16(G2, G3);
150	rgb4 = _mm_packus_epi16(B0, B1);
151	rgb5 = _mm_packus_epi16(B2, B3);
152
153	// Pack as RGBRGBRGBRGB.
154	PlanarTo24b_SSE41(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
155	}
156
157	void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
158	uint8_t* dst) {
159	__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
160	__m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
161
162	YUV444ToRGB_SSE41(y + `0`, u + `0`, v + `0`, &R0, &G0, &B0);
163	YUV444ToRGB_SSE41(y + `8`, u + `8`, v + `8`, &R1, &G1, &B1);
164	YUV444ToRGB_SSE41(y + `16`, u + `16`, v + `16`, &R2, &G2, &B2);
165	YUV444ToRGB_SSE41(y + `24`, u + `24`, v + `24`, &R3, &G3, &B3);
166
167	// Cast to 8b and store as BBBBGGGGRRRR.
168	bgr0 = _mm_packus_epi16(B0, B1);
169	bgr1 = _mm_packus_epi16(B2, B3);
170	bgr2 = _mm_packus_epi16(G0, G1);
171	bgr3 = _mm_packus_epi16(G2, G3);
172	bgr4 = _mm_packus_epi16(R0, R1);
173	bgr5= _mm_packus_epi16(R2, R3);
174
175	// Pack as BGRBGRBGRBGR.
176	PlanarTo24b_SSE41(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
177	}
178
179	//-----------------------------------------------------------------------------
180	// Arbitrary-length row conversion functions
181
182	static void YuvToRgbRow_SSE41(const uint8_t* y,
183	const uint8_t* u, const uint8_t* v,
184	uint8_t* dst, int len) {
185	int n;
186	for (n = `0`; n + `32` <= len; n += `32`, dst += `32` * `3`) {
187	__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
188	__m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
189
190	YUV420ToRGB_SSE41(y + `0`, u + `0`, v + `0`, &R0, &G0, &B0);
191	YUV420ToRGB_SSE41(y + `8`, u + `4`, v + `4`, &R1, &G1, &B1);
192	YUV420ToRGB_SSE41(y + `16`, u + `8`, v + `8`, &R2, &G2, &B2);
193	YUV420ToRGB_SSE41(y + `24`, u + `12`, v + `12`, &R3, &G3, &B3);
194
195	// Cast to 8b and store as RRRRGGGGBBBB.
196	rgb0 = _mm_packus_epi16(R0, R1);
197	rgb1 = _mm_packus_epi16(R2, R3);
198	rgb2 = _mm_packus_epi16(G0, G1);
199	rgb3 = _mm_packus_epi16(G2, G3);
200	rgb4 = _mm_packus_epi16(B0, B1);
201	rgb5 = _mm_packus_epi16(B2, B3);
202
203	// Pack as RGBRGBRGBRGB.
204	PlanarTo24b_SSE41(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
205
206	y += `32`;
207	u += `16`;
208	v += `16`;
209	}
210	for (; n < len; ++n) { // Finish off
211	VP8YuvToRgb(y[`0`], u[`0`], v[`0`], dst);
212	dst += `3`;
213	y += `1`;
214	u += (n & `1`);
215	v += (n & `1`);
216	}
217	}
218
219	static void YuvToBgrRow_SSE41(const uint8_t* y,
220	const uint8_t* u, const uint8_t* v,
221	uint8_t* dst, int len) {
222	int n;
223	for (n = `0`; n + `32` <= len; n += `32`, dst += `32` * `3`) {
224	__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
225	__m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
226
227	YUV420ToRGB_SSE41(y + `0`, u + `0`, v + `0`, &R0, &G0, &B0);
228	YUV420ToRGB_SSE41(y + `8`, u + `4`, v + `4`, &R1, &G1, &B1);
229	YUV420ToRGB_SSE41(y + `16`, u + `8`, v + `8`, &R2, &G2, &B2);
230	YUV420ToRGB_SSE41(y + `24`, u + `12`, v + `12`, &R3, &G3, &B3);
231
232	// Cast to 8b and store as BBBBGGGGRRRR.
233	bgr0 = _mm_packus_epi16(B0, B1);
234	bgr1 = _mm_packus_epi16(B2, B3);
235	bgr2 = _mm_packus_epi16(G0, G1);
236	bgr3 = _mm_packus_epi16(G2, G3);
237	bgr4 = _mm_packus_epi16(R0, R1);
238	bgr5 = _mm_packus_epi16(R2, R3);
239
240	// Pack as BGRBGRBGRBGR.
241	PlanarTo24b_SSE41(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
242
243	y += `32`;
244	u += `16`;
245	v += `16`;
246	}
247	for (; n < len; ++n) { // Finish off
248	VP8YuvToBgr(y[`0`], u[`0`], v[`0`], dst);
249	dst += `3`;
250	y += `1`;
251	u += (n & `1`);
252	v += (n & `1`);
253	}
254	}
255
256	//------------------------------------------------------------------------------
257	// Entry point
258
259	extern void WebPInitSamplersSSE41(void);
260
261	WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE41(void) {
262	WebPSamplers[MODE_RGB] = YuvToRgbRow_SSE41;
263	WebPSamplers[MODE_BGR] = YuvToBgrRow_SSE41;
264	}
265
266	//------------------------------------------------------------------------------
267	// RGB24/32 -> YUV converters
268
// Load eight 16b-words from *src.
270	#define LOAD_16(src) _mm_loadu_si128((const __m128i*)(src))
// Store eight 16b-words into *dst.
272	#define STORE_16(V, dst) _mm_storeu_si128((__m128i*)(dst), (V))
273
274	#define WEBP_SSE41_SHUFF(OUT) do { \
275	const __m128i tmp0 = _mm_shuffle_epi8(A0, shuff0); \
276	const __m128i tmp1 = _mm_shuffle_epi8(A1, shuff1); \
277	const __m128i tmp2 = _mm_shuffle_epi8(A2, shuff2); \
278	const __m128i tmp3 = _mm_shuffle_epi8(A3, shuff0); \
279	const __m128i tmp4 = _mm_shuffle_epi8(A4, shuff1); \
280	const __m128i tmp5 = _mm_shuffle_epi8(A5, shuff2); \
281	\
282	/* OR everything to get one channel */ \
283	const __m128i tmp6 = _mm_or_si128(tmp0, tmp1); \
284	const __m128i tmp7 = _mm_or_si128(tmp3, tmp4); \
285	out[OUT + 0] = _mm_or_si128(tmp6, tmp2); \
286	out[OUT + 1] = _mm_or_si128(tmp7, tmp5); \
287	} while (0);
288
289	// Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers:
290	// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
291	// Similar to PlanarTo24bHelper(), but in reverse order.
292	static WEBP_INLINE void RGB24PackedToPlanar_SSE41(
293	const uint8_t* const rgb, __m128i* const out /out[6]/) {
294	const __m128i A0 = _mm_loadu_si128((const __m128i*)(rgb + `0`));
295	const __m128i A1 = _mm_loadu_si128((const __m128i*)(rgb + `16`));
296	const __m128i A2 = _mm_loadu_si128((const __m128i*)(rgb + `32`));
297	const __m128i A3 = _mm_loadu_si128((const __m128i*)(rgb + `48`));
298	const __m128i A4 = _mm_loadu_si128((const __m128i*)(rgb + `64`));
299	const __m128i A5 = _mm_loadu_si128((const __m128i*)(rgb + `80`));
300
301	// Compute RR.
302	{
303	const __m128i shuff0 = _mm_set_epi8(
304	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `15`, `12`, `9`, `6`, `3`, `0`);
305	const __m128i shuff1 = _mm_set_epi8(
306	-`1`, -`1`, -`1`, -`1`, -`1`, `14`, `11`, `8`, `5`, `2`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`);
307	const __m128i shuff2 = _mm_set_epi8(
308	`13`, `10`, `7`, `4`, `1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`);
309	WEBP_SSE41_SHUFF(`0`)
310	}
311	// Compute GG.
312	{
313	const __m128i shuff0 = _mm_set_epi8(
314	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `13`, `10`, `7`, `4`, `1`);
315	const __m128i shuff1 = _mm_set_epi8(
316	-`1`, -`1`, -`1`, -`1`, -`1`, `15`, `12`, `9`, `6`, `3`, `0`, -`1`, -`1`, -`1`, -`1`, -`1`);
317	const __m128i shuff2 = _mm_set_epi8(
318	`14`, `11`, `8`, `5`, `2`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`);
319	WEBP_SSE41_SHUFF(`2`)
320	}
321	// Compute BB.
322	{
323	const __m128i shuff0 = _mm_set_epi8(
324	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `14`, `11`, `8`, `5`, `2`);
325	const __m128i shuff1 = _mm_set_epi8(
326	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `13`, `10`, `7`, `4`, `1`, -`1`, -`1`, -`1`, -`1`, -`1`);
327	const __m128i shuff2 = _mm_set_epi8(
328	`15`, `12`, `9`, `6`, `3`, `0`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`);
329	WEBP_SSE41_SHUFF(`4`)
330	}
331	}
332
333	#undef WEBP_SSE41_SHUFF
334
335	// Convert 8 packed ARGB to r[], g[], b[]
336	static WEBP_INLINE void RGB32PackedToPlanar_SSE41(
337	const uint32_t* const argb, __m128i* const rgb /in[6]/) {
338	const __m128i zero = _mm_setzero_si128();
339	__m128i a0 = LOAD_16(argb + `0`);
340	__m128i a1 = LOAD_16(argb + `4`);
341	__m128i a2 = LOAD_16(argb + `8`);
342	__m128i a3 = LOAD_16(argb + `12`);
343	VP8L32bToPlanar_SSE41(&a0, &a1, &a2, &a3);
344	rgb[`0`] = _mm_unpacklo_epi8(a1, zero);
345	rgb[`1`] = _mm_unpackhi_epi8(a1, zero);
346	rgb[`2`] = _mm_unpacklo_epi8(a2, zero);
347	rgb[`3`] = _mm_unpackhi_epi8(a2, zero);
348	rgb[`4`] = _mm_unpacklo_epi8(a3, zero);
349	rgb[`5`] = _mm_unpackhi_epi8(a3, zero);
350	}
351
// This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX
// It's a macro and not a function because the shift count passed to
// srai_epi32 needs to be an immediate value.
//   RG_LO, RG_HI, GB_LO, GB_HI: registers of interleaved 16b pairs; each pair
//       is multiplied by the matching pair of MULT_RG / MULT_GB and summed
//       into one 32b lane by _mm_madd_epi16().
//   ROUNDER:     32b value added to every lane before the shift.
//   DESCALE_FIX: arithmetic right-shift count.
//   OUT:         lvalue receiving the eight results, packed to 16b with signed
//                saturation (the LO results first, then the HI ones).
#define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB,     \
                  ROUNDER, DESCALE_FIX, OUT) do {                   \
  const __m128i V0_lo = _mm_madd_epi16((RG_LO), (MULT_RG));         \
  const __m128i V0_hi = _mm_madd_epi16((RG_HI), (MULT_RG));         \
  const __m128i V1_lo = _mm_madd_epi16((GB_LO), (MULT_GB));         \
  const __m128i V1_hi = _mm_madd_epi16((GB_HI), (MULT_GB));         \
  const __m128i V2_lo = _mm_add_epi32(V0_lo, V1_lo);                \
  const __m128i V2_hi = _mm_add_epi32(V0_hi, V1_hi);                \
  const __m128i V3_lo = _mm_add_epi32(V2_lo, (ROUNDER));            \
  const __m128i V3_hi = _mm_add_epi32(V2_hi, (ROUNDER));            \
  const __m128i V5_lo = _mm_srai_epi32(V3_lo, (DESCALE_FIX));       \
  const __m128i V5_hi = _mm_srai_epi32(V3_hi, (DESCALE_FIX));       \
  (OUT) = _mm_packs_epi32(V5_lo, V5_hi);                            \
} while (0)
369
370	#define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))
371	static WEBP_INLINE void ConvertRGBToY_SSE41(const __m128i* const R,
372	const __m128i* const G,
373	const __m128i* const B,
374	__m128i* const Y) {
375	const __m128i kRG_y = MK_CST_16(`16839`, `33059` - `16384`);
376	const __m128i kGB_y = MK_CST_16(`16384`, `6420`);
377	const __m128i kHALF_Y = _mm_set1_epi32((`16` << YUV_FIX) + YUV_HALF);
378
379	const __m128i RG_lo = _mm_unpacklo_epi16(R, G);
380	const __m128i RG_hi = _mm_unpackhi_epi16(R, G);
381	const __m128i GB_lo = _mm_unpacklo_epi16(G, B);
382	const __m128i GB_hi = _mm_unpackhi_epi16(G, B);
383	TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y);
384	}
385
386	static WEBP_INLINE void ConvertRGBToUV_SSE41(const __m128i* const R,
387	const __m128i* const G,
388	const __m128i* const B,
389	__m128i* const U,
390	__m128i* const V) {
391	const __m128i kRG_u = MK_CST_16(-`9719`, -`19081`);
392	const __m128i kGB_u = MK_CST_16(`0`, `28800`);
393	const __m128i kRG_v = MK_CST_16(`28800`, `0`);
394	const __m128i kGB_v = MK_CST_16(-`24116`, -`4684`);
395	const __m128i kHALF_UV = _mm_set1_epi32(((`128` << YUV_FIX) + YUV_HALF) << `2`);
396
397	const __m128i RG_lo = _mm_unpacklo_epi16(R, G);
398	const __m128i RG_hi = _mm_unpackhi_epi16(R, G);
399	const __m128i GB_lo = _mm_unpacklo_epi16(G, B);
400	const __m128i GB_hi = _mm_unpackhi_epi16(G, B);
401	TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_u, kGB_u,
402	kHALF_UV, YUV_FIX + `2`, *U);
403	TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_v, kGB_v,
404	kHALF_UV, YUV_FIX + `2`, *V);
405	}
406
407	#undef MK_CST_16
408	#undef TRANSFORM
409
410	static void ConvertRGB24ToY_SSE41(const uint8_t* rgb, uint8_t* y, int width) {
411	const int max_width = width & ~`31`;
412	int i;
413	for (i = `0`; i < max_width; rgb += `3` * `16` * `2`) {
414	__m128i rgb_plane[`6`];
415	int j;
416
417	RGB24PackedToPlanar_SSE41(rgb, rgb_plane);
418
419	for (j = `0`; j < `2`; ++j, i += `16`) {
420	const __m128i zero = _mm_setzero_si128();
421	__m128i r, g, b, Y0, Y1;
422
423	// Convert to 16-bit Y.
424	r = _mm_unpacklo_epi8(rgb_plane[`0` + j], zero);
425	g = _mm_unpacklo_epi8(rgb_plane[`2` + j], zero);
426	b = _mm_unpacklo_epi8(rgb_plane[`4` + j], zero);
427	ConvertRGBToY_SSE41(&r, &g, &b, &Y0);
428
429	// Convert to 16-bit Y.
430	r = _mm_unpackhi_epi8(rgb_plane[`0` + j], zero);
431	g = _mm_unpackhi_epi8(rgb_plane[`2` + j], zero);
432	b = _mm_unpackhi_epi8(rgb_plane[`4` + j], zero);
433	ConvertRGBToY_SSE41(&r, &g, &b, &Y1);
434
435	// Cast to 8-bit and store.
436	STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
437	}
438	}
439	for (; i < width; ++i, rgb += `3`) { // left-over
440	y[i] = VP8RGBToY(rgb[`0`], rgb[`1`], rgb[`2`], YUV_HALF);
441	}
442	}
443
444	static void ConvertBGR24ToY_SSE41(const uint8_t* bgr, uint8_t* y, int width) {
445	const int max_width = width & ~`31`;
446	int i;
447	for (i = `0`; i < max_width; bgr += `3` * `16` * `2`) {
448	__m128i bgr_plane[`6`];
449	int j;
450
451	RGB24PackedToPlanar_SSE41(bgr, bgr_plane);
452
453	for (j = `0`; j < `2`; ++j, i += `16`) {
454	const __m128i zero = _mm_setzero_si128();
455	__m128i r, g, b, Y0, Y1;
456
457	// Convert to 16-bit Y.
458	b = _mm_unpacklo_epi8(bgr_plane[`0` + j], zero);
459	g = _mm_unpacklo_epi8(bgr_plane[`2` + j], zero);
460	r = _mm_unpacklo_epi8(bgr_plane[`4` + j], zero);
461	ConvertRGBToY_SSE41(&r, &g, &b, &Y0);
462
463	// Convert to 16-bit Y.
464	b = _mm_unpackhi_epi8(bgr_plane[`0` + j], zero);
465	g = _mm_unpackhi_epi8(bgr_plane[`2` + j], zero);
466	r = _mm_unpackhi_epi8(bgr_plane[`4` + j], zero);
467	ConvertRGBToY_SSE41(&r, &g, &b, &Y1);
468
469	// Cast to 8-bit and store.
470	STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
471	}
472	}
473	for (; i < width; ++i, bgr += `3`) { // left-over
474	y[i] = VP8RGBToY(bgr[`2`], bgr[`1`], bgr[`0`], YUV_HALF);
475	}
476	}
477
478	static void ConvertARGBToY_SSE41(const uint32_t* argb, uint8_t* y, int width) {
479	const int max_width = width & ~`15`;
480	int i;
481	for (i = `0`; i < max_width; i += `16`) {
482	__m128i Y0, Y1, rgb[`6`];
483	RGB32PackedToPlanar_SSE41(&argb[i], rgb);
484	ConvertRGBToY_SSE41(&rgb[`0`], &rgb[`2`], &rgb[`4`], &Y0);
485	ConvertRGBToY_SSE41(&rgb[`1`], &rgb[`3`], &rgb[`5`], &Y1);
486	STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
487	}
488	for (; i < width; ++i) { // left-over
489	const uint32_t p = argb[i];
490	y[i] = VP8RGBToY((p >> `16`) & `0xff`, (p >> `8`) & `0xff`, (p >> `0`) & `0xff`,
491	YUV_HALF);
492	}
493	}
494
// Horizontal add (doubled) of two 16b values, result is 16b.
// in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ...
// The four sums computed from *A fill the low half of *out and the four sums
// computed from *B fill the high half (packed with signed saturation).
// 'out' may point to the same register as 'A' or 'B': both inputs are read
// before *out is written.
static void HorizontalAddPack_SSE41(const __m128i* const A,
                                    const __m128i* const B,
                                    __m128i* const out) {
  const __m128i k2 = _mm_set1_epi16(2);
  // madd multiplies each 16b lane by 2 and adds adjacent pairs into 32b lanes.
  const __m128i C = _mm_madd_epi16(*A, k2);
  const __m128i D = _mm_madd_epi16(*B, k2);
  *out = _mm_packs_epi32(C, D);
}
505
506	static void ConvertARGBToUV_SSE41(const uint32_t* argb,
507	uint8_t* u, uint8_t* v,
508	int src_width, int do_store) {
509	const int max_width = src_width & ~`31`;
510	int i;
511	for (i = `0`; i < max_width; i += `32`, u += `16`, v += `16`) {
512	__m128i rgb[`6`], U0, V0, U1, V1;
513	RGB32PackedToPlanar_SSE41(&argb[i], rgb);
514	HorizontalAddPack_SSE41(&rgb[`0`], &rgb[`1`], &rgb[`0`]);
515	HorizontalAddPack_SSE41(&rgb[`2`], &rgb[`3`], &rgb[`2`]);
516	HorizontalAddPack_SSE41(&rgb[`4`], &rgb[`5`], &rgb[`4`]);
517	ConvertRGBToUV_SSE41(&rgb[`0`], &rgb[`2`], &rgb[`4`], &U0, &V0);
518
519	RGB32PackedToPlanar_SSE41(&argb[i + `16`], rgb);
520	HorizontalAddPack_SSE41(&rgb[`0`], &rgb[`1`], &rgb[`0`]);
521	HorizontalAddPack_SSE41(&rgb[`2`], &rgb[`3`], &rgb[`2`]);
522	HorizontalAddPack_SSE41(&rgb[`4`], &rgb[`5`], &rgb[`4`]);
523	ConvertRGBToUV_SSE41(&rgb[`0`], &rgb[`2`], &rgb[`4`], &U1, &V1);
524
525	U0 = _mm_packus_epi16(U0, U1);
526	V0 = _mm_packus_epi16(V0, V1);
527	if (!do_store) {
528	const __m128i prev_u = LOAD_16(u);
529	const __m128i prev_v = LOAD_16(v);
530	U0 = _mm_avg_epu8(U0, prev_u);
531	V0 = _mm_avg_epu8(V0, prev_v);
532	}
533	STORE_16(U0, u);
534	STORE_16(V0, v);
535	}
536	if (i < src_width) { // left-over
537	WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
538	}
539	}
540
541	// Convert 16 packed ARGB 16b-values to r[], g[], b[]
542	static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41(
543	const uint16_t* const rgbx,
544	__m128i* const r, __m128i* const g, __m128i* const b) {
545	const __m128i in0 = LOAD_16(rgbx + `0`); // r0 \| g0 \| b0 \|x\| r1 \| g1 \| b1 \|x
546	const __m128i in1 = LOAD_16(rgbx + `8`); // r2 \| g2 \| b2 \|x\| r3 \| g3 \| b3 \|x
547	const __m128i in2 = LOAD_16(rgbx + `16`); // r4 \| ...
548	const __m128i in3 = LOAD_16(rgbx + `24`); // r6 \| ...
549	// aarrggbb as 16-bit.
550	const __m128i shuff0 =
551	_mm_set_epi8(-`1`, -`1`, -`1`, -`1`, `13`, `12`, `5`, `4`, `11`, `10`, `3`, `2`, `9`, `8`, `1`, `0`);
552	const __m128i shuff1 =
553	_mm_set_epi8(`13`, `12`, `5`, `4`, -`1`, -`1`, -`1`, -`1`, `11`, `10`, `3`, `2`, `9`, `8`, `1`, `0`);
554	const __m128i A0 = _mm_shuffle_epi8(in0, shuff0);
555	const __m128i A1 = _mm_shuffle_epi8(in1, shuff1);
556	const __m128i A2 = _mm_shuffle_epi8(in2, shuff0);
557	const __m128i A3 = _mm_shuffle_epi8(in3, shuff1);
558	// R0R1G0G1
559	// B0B1****
560	// R2R3G2G3
561	// B2B3****
562	// (OR is used to free port 5 for the unpack)
563	const __m128i B0 = _mm_unpacklo_epi32(A0, A1);
564	const __m128i B1 = _mm_or_si128(A0, A1);
565	const __m128i B2 = _mm_unpacklo_epi32(A2, A3);
566	const __m128i B3 = _mm_or_si128(A2, A3);
567	// Gather the channels.
568	*r = _mm_unpacklo_epi64(B0, B2);
569	*g = _mm_unpackhi_epi64(B0, B2);
570	*b = _mm_unpackhi_epi64(B1, B3);
571	}
572
573	static void ConvertRGBA32ToUV_SSE41(const uint16_t* rgb,
574	uint8_t* u, uint8_t* v, int width) {
575	const int max_width = width & ~`15`;
576	const uint16_t* const last_rgb = rgb + `4` * max_width;
577	while (rgb < last_rgb) {
578	__m128i r, g, b, U0, V0, U1, V1;
579	RGBA32PackedToPlanar_16b_SSE41(rgb + `0`, &r, &g, &b);
580	ConvertRGBToUV_SSE41(&r, &g, &b, &U0, &V0);
581	RGBA32PackedToPlanar_16b_SSE41(rgb + `32`, &r, &g, &b);
582	ConvertRGBToUV_SSE41(&r, &g, &b, &U1, &V1);
583	STORE_16(_mm_packus_epi16(U0, U1), u);
584	STORE_16(_mm_packus_epi16(V0, V1), v);
585	u += `16`;
586	v += `16`;
587	rgb += `2` * `32`;
588	}
589	if (max_width < width) { // left-over
590	WebPConvertRGBA32ToUV_C(rgb, u, v, width - max_width);
591	}
592	}
593
594	//------------------------------------------------------------------------------
595
596	extern void WebPInitConvertARGBToYUVSSE41(void);
597
598	WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE41(void) {
599	WebPConvertARGBToY = ConvertARGBToY_SSE41;
600	WebPConvertARGBToUV = ConvertARGBToUV_SSE41;
601
602	WebPConvertRGB24ToY = ConvertRGB24ToY_SSE41;
603	WebPConvertBGR24ToY = ConvertBGR24ToY_SSE41;
604
605	WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE41;
606	}
607
608	//------------------------------------------------------------------------------
609
610	#else // !WEBP_USE_SSE41
611
612	WEBP_DSP_INIT_STUB(WebPInitSamplersSSE41)
613	WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE41)
614
615	#endif // WEBP_USE_SSE41
616

Browse the source code of Godot/thirdparty/libwebp/src/dsp/yuv_sse41.c