yuv_sse2.c source code [engine/third_party/libwebp/src/dsp/yuv_sse2.c]

1	// Copyright 2014 Google Inc. All Rights Reserved.
2	//
3	// Use of this source code is governed by a BSD-style license
4	// that can be found in the COPYING file in the root of the source
5	// tree. An additional intellectual property rights grant can be found
6	// in the file PATENTS. All contributing project authors may
7	// be found in the AUTHORS file in the root of the source tree.
8	// -----------------------------------------------------------------------------
9	//
10	// YUV->RGB conversion functions
11	//
12	// Author: Skal (pascal.massimino@gmail.com)
13
14	#include "./yuv.h"
15
16	#if defined(WEBP_USE_SSE2)
17
18	#include "./common_sse2.h"
19	#include <stdlib.h>
20	#include <emmintrin.h>
21
22	//-----------------------------------------------------------------------------
23	// Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
24
// These constants are 14b fixed-point version of ITU-R BT.601 constants.
// R = (19077 * y             + 26149 * v - 14234) >> 6
// G = (19077 * y -  6419 * u - 13320 * v +  8708) >> 6
// B = (19077 * y + 33050 * u             - 17685) >> 6
//
// Y0/U0/V0 hold eight samples each, stored in the upper byte of 16b words
// (i.e. "sample << 8"). The eight 16b results are written to *R, *G and *B;
// they are not clamped here.
static void ConvertYUV444ToRGB(const __m128i* const Y0,
                               const __m128i* const U0,
                               const __m128i* const V0,
                               __m128i* const R,
                               __m128i* const G,
                               __m128i* const B) {
  const __m128i k19077 = _mm_set1_epi16(19077);
  const __m128i k26149 = _mm_set1_epi16(26149);
  const __m128i k14234 = _mm_set1_epi16(14234);
  // 33050 doesn't fit in a signed short: only use this with unsigned arithmetic
  const __m128i k33050 = _mm_set1_epi16((short)33050);
  const __m128i k17685 = _mm_set1_epi16(17685);
  const __m128i k6419  = _mm_set1_epi16(6419);
  const __m128i k13320 = _mm_set1_epi16(13320);
  const __m128i k8708  = _mm_set1_epi16(8708);

  const __m128i Y1 = _mm_mulhi_epu16(*Y0, k19077);

  const __m128i R0 = _mm_mulhi_epu16(*V0, k26149);
  const __m128i R1 = _mm_sub_epi16(Y1, k14234);
  const __m128i R2 = _mm_add_epi16(R1, R0);

  const __m128i G0 = _mm_mulhi_epu16(*U0, k6419);
  const __m128i G1 = _mm_mulhi_epu16(*V0, k13320);
  const __m128i G2 = _mm_add_epi16(Y1, k8708);
  const __m128i G3 = _mm_add_epi16(G0, G1);
  const __m128i G4 = _mm_sub_epi16(G2, G3);

  // be careful with the saturated *unsigned* arithmetic here!
  const __m128i B0 = _mm_mulhi_epu16(*U0, k33050);
  const __m128i B1 = _mm_adds_epu16(B0, Y1);
  const __m128i B2 = _mm_subs_epu16(B1, k17685);

  // use logical shift for B2, which can be larger than 32767
  *R = _mm_srai_epi16(R2, 6);   // range: [-14234, 30815]
  *G = _mm_srai_epi16(G4, 6);   // range: [-10953, 27710]
  *B = _mm_srli_epi16(B2, 6);   // range: [0, 34238]
}
67
68	// Load the bytes into the upper* part of 16b words. That's "<< 8", basically.*
69	static WEBP_INLINE __m128i Load_HI_16(const uint8_t* src) {
70	const __m128i zero = _mm_setzero_si128();
71	return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src));
72	}
73
74	// Load and replicate the U/V samples
75	static WEBP_INLINE __m128i Load_UV_HI_8(const uint8_t* src) {
76	const __m128i zero = _mm_setzero_si128();
77	const __m128i tmp0 = _mm_cvtsi32_si128((const* uint32_t*)src);
78	const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
79	return _mm_unpacklo_epi16(tmp1, tmp1); // replicate samples
80	}
81
82	// Convert 32 samples of YUV444 to R/G/B
83	static void YUV444ToRGB(const uint8_t* const y,
84	const uint8_t* const u,
85	const uint8_t* const v,
86	__m128i* const R, __m128i* const G, __m128i* const B) {
87	const __m128i Y0 = Load_HI_16(y), U0 = Load_HI_16(u), V0 = Load_HI_16(v);
88	ConvertYUV444ToRGB(&Y0, &U0, &V0, R, G, B);
89	}
90
91	// Convert 32 samples of YUV420 to R/G/B
92	static void YUV420ToRGB(const uint8_t* const y,
93	const uint8_t* const u,
94	const uint8_t* const v,
95	__m128i* const R, __m128i* const G, __m128i* const B) {
96	const __m128i Y0 = Load_HI_16(y), U0 = Load_UV_HI_8(u), V0 = Load_UV_HI_8(v);
97	ConvertYUV444ToRGB(&Y0, &U0, &V0, R, G, B);
98	}
99
100	// Pack R/G/B/A results into 32b output.
101	static WEBP_INLINE void PackAndStore4(const __m128i* const R,
102	const __m128i* const G,
103	const __m128i* const B,
104	const __m128i* const A,
105	uint8_t* const dst) {
106	const __m128i rb = _mm_packus_epi16(R, B);
107	const __m128i ga = _mm_packus_epi16(G, A);
108	const __m128i rg = _mm_unpacklo_epi8(rb, ga);
109	const __m128i ba = _mm_unpackhi_epi8(rb, ga);
110	const __m128i RGBA_lo = _mm_unpacklo_epi16(rg, ba);
111	const __m128i RGBA_hi = _mm_unpackhi_epi16(rg, ba);
112	_mm_storeu_si128((__m128i*)(dst + `0`), RGBA_lo);
113	_mm_storeu_si128((__m128i*)(dst + `16`), RGBA_hi);
114	}
115
116	// Pack R/G/B/A results into 16b output.
117	static WEBP_INLINE void PackAndStore4444(const __m128i* const R,
118	const __m128i* const G,
119	const __m128i* const B,
120	const __m128i* const A,
121	uint8_t* const dst) {
122	#if !defined(WEBP_SWAP_16BIT_CSP)
123	const __m128i rg0 = _mm_packus_epi16(R, G);
124	const __m128i ba0 = _mm_packus_epi16(B, A);
125	#else
126	const __m128i rg0 = _mm_packus_epi16(B, A);
127	const __m128i ba0 = _mm_packus_epi16(R, G);
128	#endif
129	const __m128i mask_0xf0 = _mm_set1_epi8(`0xf0`);
130	const __m128i rb1 = _mm_unpacklo_epi8(rg0, ba0); // rbrbrbrbrb...
131	const __m128i ga1 = _mm_unpackhi_epi8(rg0, ba0); // gagagagaga...
132	const __m128i rb2 = _mm_and_si128(rb1, mask_0xf0);
133	const __m128i ga2 = _mm_srli_epi16(_mm_and_si128(ga1, mask_0xf0), `4`);
134	const __m128i rgba4444 = _mm_or_si128(rb2, ga2);
135	_mm_storeu_si128((__m128i*)dst, rgba4444);
136	}
137
138	// Pack R/G/B results into 16b output.
139	static WEBP_INLINE void PackAndStore565(const __m128i* const R,
140	const __m128i* const G,
141	const __m128i* const B,
142	uint8_t* const dst) {
143	const __m128i r0 = _mm_packus_epi16(R, R);
144	const __m128i g0 = _mm_packus_epi16(G, G);
145	const __m128i b0 = _mm_packus_epi16(B, B);
146	const __m128i r1 = _mm_and_si128(r0, _mm_set1_epi8(`0xf8`));
147	const __m128i b1 = _mm_and_si128(_mm_srli_epi16(b0, `3`), _mm_set1_epi8(`0x1f`));
148	const __m128i g1 = _mm_srli_epi16(_mm_and_si128(g0, _mm_set1_epi8(`0xe0`)), `5`);
149	const __m128i g2 = _mm_slli_epi16(_mm_and_si128(g0, _mm_set1_epi8(`0x1c`)), `3`);
150	const __m128i rg = _mm_or_si128(r1, g1);
151	const __m128i gb = _mm_or_si128(g2, b1);
152	#if !defined(WEBP_SWAP_16BIT_CSP)
153	const __m128i rgb565 = _mm_unpacklo_epi8(rg, gb);
154	#else
155	const __m128i rgb565 = _mm_unpacklo_epi8(gb, rg);
156	#endif
157	_mm_storeu_si128((__m128i*)dst, rgb565);
158	}
159
160	// Pack the planar buffers
161	// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
162	// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
163	static WEBP_INLINE void PlanarTo24b(__m128i* const in0, __m128i* const in1,
164	__m128i* const in2, __m128i* const in3,
165	__m128i* const in4, __m128i* const in5,
166	uint8_t* const rgb) {
167	// The input is 6 registers of sixteen 8b but for the sake of explanation,
168	// let's take 6 registers of four 8b values.
169	// To pack, we will keep taking one every two 8b integer and move it
170	// around as follows:
171	// Input:
172	// r0r1r2r3 \| r4r5r6r7 \| g0g1g2g3 \| g4g5g6g7 \| b0b1b2b3 \| b4b5b6b7
173	// Split the 6 registers in two sets of 3 registers: the first set as the even
174	// 8b bytes, the second the odd ones:
175	// r0r2r4r6 \| g0g2g4g6 \| b0b2b4b6 \| r1r3r5r7 \| g1g3g5g7 \| b1b3b5b7
176	// Repeat the same permutations twice more:
177	// r0r4g0g4 \| b0b4r1r5 \| g1g5b1b5 \| r2r6g2g6 \| b2b6r3r7 \| g3g7b3b7
178	// r0g0b0r1 \| g1b1r2g2 \| b2r3g3b3 \| r4g4b4r5 \| g5b5r6g6 \| b6r7g7b7
179	VP8PlanarTo24b(in0, in1, in2, in3, in4, in5);
180
181	_mm_storeu_si128((__m128i)(rgb + `0`), in0);
182	_mm_storeu_si128((__m128i)(rgb + `16`), in1);
183	_mm_storeu_si128((__m128i)(rgb + `32`), in2);
184	_mm_storeu_si128((__m128i)(rgb + `48`), in3);
185	_mm_storeu_si128((__m128i)(rgb + `64`), in4);
186	_mm_storeu_si128((__m128i)(rgb + `80`), in5);
187	}
188
189	void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
190	uint8_t* dst) {
191	const __m128i kAlpha = _mm_set1_epi16(`255`);
192	int n;
193	for (n = `0`; n < `32`; n += `8`, dst += `32`) {
194	__m128i R, G, B;
195	YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
196	PackAndStore4(&R, &G, &B, &kAlpha, dst);
197	}
198	}
199
200	void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
201	uint8_t* dst) {
202	const __m128i kAlpha = _mm_set1_epi16(`255`);
203	int n;
204	for (n = `0`; n < `32`; n += `8`, dst += `32`) {
205	__m128i R, G, B;
206	YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
207	PackAndStore4(&B, &G, &R, &kAlpha, dst);
208	}
209	}
210
211	void VP8YuvToArgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
212	uint8_t* dst) {
213	const __m128i kAlpha = _mm_set1_epi16(`255`);
214	int n;
215	for (n = `0`; n < `32`; n += `8`, dst += `32`) {
216	__m128i R, G, B;
217	YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
218	PackAndStore4(&kAlpha, &R, &G, &B, dst);
219	}
220	}
221
222	void VP8YuvToRgba444432(const uint8_t* y, const uint8_t* u, const uint8_t* v,
223	uint8_t* dst) {
224	const __m128i kAlpha = _mm_set1_epi16(`255`);
225	int n;
226	for (n = `0`; n < `32`; n += `8`, dst += `16`) {
227	__m128i R, G, B;
228	YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
229	PackAndStore4444(&R, &G, &B, &kAlpha, dst);
230	}
231	}
232
233	void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v,
234	uint8_t* dst) {
235	int n;
236	for (n = `0`; n < `32`; n += `8`, dst += `16`) {
237	__m128i R, G, B;
238	YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
239	PackAndStore565(&R, &G, &B, dst);
240	}
241	}
242
243	void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
244	uint8_t* dst) {
245	__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
246	__m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
247
248	YUV444ToRGB(y + `0`, u + `0`, v + `0`, &R0, &G0, &B0);
249	YUV444ToRGB(y + `8`, u + `8`, v + `8`, &R1, &G1, &B1);
250	YUV444ToRGB(y + `16`, u + `16`, v + `16`, &R2, &G2, &B2);
251	YUV444ToRGB(y + `24`, u + `24`, v + `24`, &R3, &G3, &B3);
252
253	// Cast to 8b and store as RRRRGGGGBBBB.
254	rgb0 = _mm_packus_epi16(R0, R1);
255	rgb1 = _mm_packus_epi16(R2, R3);
256	rgb2 = _mm_packus_epi16(G0, G1);
257	rgb3 = _mm_packus_epi16(G2, G3);
258	rgb4 = _mm_packus_epi16(B0, B1);
259	rgb5 = _mm_packus_epi16(B2, B3);
260
261	// Pack as RGBRGBRGBRGB.
262	PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
263	}
264
265	void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
266	uint8_t* dst) {
267	__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
268	__m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
269
270	YUV444ToRGB(y + `0`, u + `0`, v + `0`, &R0, &G0, &B0);
271	YUV444ToRGB(y + `8`, u + `8`, v + `8`, &R1, &G1, &B1);
272	YUV444ToRGB(y + `16`, u + `16`, v + `16`, &R2, &G2, &B2);
273	YUV444ToRGB(y + `24`, u + `24`, v + `24`, &R3, &G3, &B3);
274
275	// Cast to 8b and store as BBBBGGGGRRRR.
276	bgr0 = _mm_packus_epi16(B0, B1);
277	bgr1 = _mm_packus_epi16(B2, B3);
278	bgr2 = _mm_packus_epi16(G0, G1);
279	bgr3 = _mm_packus_epi16(G2, G3);
280	bgr4 = _mm_packus_epi16(R0, R1);
281	bgr5= _mm_packus_epi16(R2, R3);
282
283	// Pack as BGRBGRBGRBGR.
284	PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
285	}
286
287	//-----------------------------------------------------------------------------
288	// Arbitrary-length row conversion functions
289
290	static void YuvToRgbaRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
291	uint8_t* dst, int len) {
292	const __m128i kAlpha = _mm_set1_epi16(`255`);
293	int n;
294	for (n = `0`; n + `8` <= len; n += `8`, dst += `32`) {
295	__m128i R, G, B;
296	YUV420ToRGB(y, u, v, &R, &G, &B);
297	PackAndStore4(&R, &G, &B, &kAlpha, dst);
298	y += `8`;
299	u += `4`;
300	v += `4`;
301	}
302	for (; n < len; ++n) { // Finish off
303	VP8YuvToRgba(y[`0`], u[`0`], v[`0`], dst);
304	dst += `4`;
305	y += `1`;
306	u += (n & `1`);
307	v += (n & `1`);
308	}
309	}
310
311	static void YuvToBgraRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
312	uint8_t* dst, int len) {
313	const __m128i kAlpha = _mm_set1_epi16(`255`);
314	int n;
315	for (n = `0`; n + `8` <= len; n += `8`, dst += `32`) {
316	__m128i R, G, B;
317	YUV420ToRGB(y, u, v, &R, &G, &B);
318	PackAndStore4(&B, &G, &R, &kAlpha, dst);
319	y += `8`;
320	u += `4`;
321	v += `4`;
322	}
323	for (; n < len; ++n) { // Finish off
324	VP8YuvToBgra(y[`0`], u[`0`], v[`0`], dst);
325	dst += `4`;
326	y += `1`;
327	u += (n & `1`);
328	v += (n & `1`);
329	}
330	}
331
332	static void YuvToArgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
333	uint8_t* dst, int len) {
334	const __m128i kAlpha = _mm_set1_epi16(`255`);
335	int n;
336	for (n = `0`; n + `8` <= len; n += `8`, dst += `32`) {
337	__m128i R, G, B;
338	YUV420ToRGB(y, u, v, &R, &G, &B);
339	PackAndStore4(&kAlpha, &R, &G, &B, dst);
340	y += `8`;
341	u += `4`;
342	v += `4`;
343	}
344	for (; n < len; ++n) { // Finish off
345	VP8YuvToArgb(y[`0`], u[`0`], v[`0`], dst);
346	dst += `4`;
347	y += `1`;
348	u += (n & `1`);
349	v += (n & `1`);
350	}
351	}
352
353	static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
354	uint8_t* dst, int len) {
355	int n;
356	for (n = `0`; n + `32` <= len; n += `32`, dst += `32` * `3`) {
357	__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
358	__m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
359
360	YUV420ToRGB(y + `0`, u + `0`, v + `0`, &R0, &G0, &B0);
361	YUV420ToRGB(y + `8`, u + `4`, v + `4`, &R1, &G1, &B1);
362	YUV420ToRGB(y + `16`, u + `8`, v + `8`, &R2, &G2, &B2);
363	YUV420ToRGB(y + `24`, u + `12`, v + `12`, &R3, &G3, &B3);
364
365	// Cast to 8b and store as RRRRGGGGBBBB.
366	rgb0 = _mm_packus_epi16(R0, R1);
367	rgb1 = _mm_packus_epi16(R2, R3);
368	rgb2 = _mm_packus_epi16(G0, G1);
369	rgb3 = _mm_packus_epi16(G2, G3);
370	rgb4 = _mm_packus_epi16(B0, B1);
371	rgb5 = _mm_packus_epi16(B2, B3);
372
373	// Pack as RGBRGBRGBRGB.
374	PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
375
376	y += `32`;
377	u += `16`;
378	v += `16`;
379	}
380	for (; n < len; ++n) { // Finish off
381	VP8YuvToRgb(y[`0`], u[`0`], v[`0`], dst);
382	dst += `3`;
383	y += `1`;
384	u += (n & `1`);
385	v += (n & `1`);
386	}
387	}
388
389	static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
390	uint8_t* dst, int len) {
391	int n;
392	for (n = `0`; n + `32` <= len; n += `32`, dst += `32` * `3`) {
393	__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
394	__m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
395
396	YUV420ToRGB(y + `0`, u + `0`, v + `0`, &R0, &G0, &B0);
397	YUV420ToRGB(y + `8`, u + `4`, v + `4`, &R1, &G1, &B1);
398	YUV420ToRGB(y + `16`, u + `8`, v + `8`, &R2, &G2, &B2);
399	YUV420ToRGB(y + `24`, u + `12`, v + `12`, &R3, &G3, &B3);
400
401	// Cast to 8b and store as BBBBGGGGRRRR.
402	bgr0 = _mm_packus_epi16(B0, B1);
403	bgr1 = _mm_packus_epi16(B2, B3);
404	bgr2 = _mm_packus_epi16(G0, G1);
405	bgr3 = _mm_packus_epi16(G2, G3);
406	bgr4 = _mm_packus_epi16(R0, R1);
407	bgr5 = _mm_packus_epi16(R2, R3);
408
409	// Pack as BGRBGRBGRBGR.
410	PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
411
412	y += `32`;
413	u += `16`;
414	v += `16`;
415	}
416	for (; n < len; ++n) { // Finish off
417	VP8YuvToBgr(y[`0`], u[`0`], v[`0`], dst);
418	dst += `3`;
419	y += `1`;
420	u += (n & `1`);
421	v += (n & `1`);
422	}
423	}
424
425	//------------------------------------------------------------------------------
426	// Entry point
427
428	extern void WebPInitSamplersSSE2(void);
429
430	WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE2(void) {
431	WebPSamplers[MODE_RGB] = YuvToRgbRow;
432	WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
433	WebPSamplers[MODE_BGR] = YuvToBgrRow;
434	WebPSamplers[MODE_BGRA] = YuvToBgraRow;
435	WebPSamplers[MODE_ARGB] = YuvToArgbRow;
436	}
437
438	//------------------------------------------------------------------------------
439	// RGB24/32 -> YUV converters
440
441	// Load eight 16b-words from src.
442	#define LOAD_16(src) _mm_loadu_si128((const __m128i*)(src))
443	// Store eight 16b-words into dst.
444	#define STORE_16(V, dst) _mm_storeu_si128((__m128i*)(dst), (V))
445
446	// Function that inserts a value of the second half of the in buffer in between
447	// every two char of the first half.
448	static WEBP_INLINE void RGB24PackedToPlanarHelper(
449	const __m128i* const in /in[6]/, __m128i* const out /out[6]/) {
450	out[`0`] = _mm_unpacklo_epi8(in[`0`], in[`3`]);
451	out[`1`] = _mm_unpackhi_epi8(in[`0`], in[`3`]);
452	out[`2`] = _mm_unpacklo_epi8(in[`1`], in[`4`]);
453	out[`3`] = _mm_unpackhi_epi8(in[`1`], in[`4`]);
454	out[`4`] = _mm_unpacklo_epi8(in[`2`], in[`5`]);
455	out[`5`] = _mm_unpackhi_epi8(in[`2`], in[`5`]);
456	}
457
458	// Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers:
459	// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
460	// Similar to PlanarTo24bHelper(), but in reverse order.
461	static WEBP_INLINE void RGB24PackedToPlanar(const uint8_t* const rgb,
462	__m128i* const out /out[6]/) {
463	__m128i tmp[`6`];
464	tmp[`0`] = _mm_loadu_si128((const __m128i*)(rgb + `0`));
465	tmp[`1`] = _mm_loadu_si128((const __m128i*)(rgb + `16`));
466	tmp[`2`] = _mm_loadu_si128((const __m128i*)(rgb + `32`));
467	tmp[`3`] = _mm_loadu_si128((const __m128i*)(rgb + `48`));
468	tmp[`4`] = _mm_loadu_si128((const __m128i*)(rgb + `64`));
469	tmp[`5`] = _mm_loadu_si128((const __m128i*)(rgb + `80`));
470
471	RGB24PackedToPlanarHelper(tmp, out);
472	RGB24PackedToPlanarHelper(out, tmp);
473	RGB24PackedToPlanarHelper(tmp, out);
474	RGB24PackedToPlanarHelper(out, tmp);
475	RGB24PackedToPlanarHelper(tmp, out);
476	}
477
478	// Convert 8 packed ARGB to r[], g[], b[]
479	static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb,
480	__m128i* const rgb /in[6]/) {
481	const __m128i zero = _mm_setzero_si128();
482	__m128i a0 = LOAD_16(argb + `0`);
483	__m128i a1 = LOAD_16(argb + `4`);
484	__m128i a2 = LOAD_16(argb + `8`);
485	__m128i a3 = LOAD_16(argb + `12`);
486	VP8L32bToPlanar(&a0, &a1, &a2, &a3);
487	rgb[`0`] = _mm_unpacklo_epi8(a1, zero);
488	rgb[`1`] = _mm_unpackhi_epi8(a1, zero);
489	rgb[`2`] = _mm_unpacklo_epi8(a2, zero);
490	rgb[`3`] = _mm_unpackhi_epi8(a2, zero);
491	rgb[`4`] = _mm_unpacklo_epi8(a3, zero);
492	rgb[`5`] = _mm_unpackhi_epi8(a3, zero);
493	}
494
// This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX
// on interleaved 16b pairs, for a low and a high half, and packs the two
// 32b results (with signed saturation) into eight 16b values stored in OUT.
// It's a macro and not a function because we need to use immediate values with
// srai_epi32, e.g.
#define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB,              \
                  ROUNDER, DESCALE_FIX, OUT) do {                            \
  const __m128i rg_term_lo = _mm_madd_epi16(RG_LO, MULT_RG);                 \
  const __m128i rg_term_hi = _mm_madd_epi16(RG_HI, MULT_RG);                 \
  const __m128i gb_term_lo = _mm_madd_epi16(GB_LO, MULT_GB);                 \
  const __m128i gb_term_hi = _mm_madd_epi16(GB_HI, MULT_GB);                 \
  const __m128i acc_lo = _mm_add_epi32(rg_term_lo, gb_term_lo);              \
  const __m128i acc_hi = _mm_add_epi32(rg_term_hi, gb_term_hi);              \
  const __m128i rounded_lo = _mm_add_epi32(acc_lo, ROUNDER);                 \
  const __m128i rounded_hi = _mm_add_epi32(acc_hi, ROUNDER);                 \
  const __m128i scaled_lo = _mm_srai_epi32(rounded_lo, DESCALE_FIX);         \
  const __m128i scaled_hi = _mm_srai_epi32(rounded_hi, DESCALE_FIX);         \
  (OUT) = _mm_packs_epi32(scaled_lo, scaled_hi);                             \
} while (0)
512
513	#define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))
514	static WEBP_INLINE void ConvertRGBToY(const __m128i* const R,
515	const __m128i* const G,
516	const __m128i* const B,
517	__m128i* const Y) {
518	const __m128i kRG_y = MK_CST_16(`16839`, `33059` - `16384`);
519	const __m128i kGB_y = MK_CST_16(`16384`, `6420`);
520	const __m128i kHALF_Y = _mm_set1_epi32((`16` << YUV_FIX) + YUV_HALF);
521
522	const __m128i RG_lo = _mm_unpacklo_epi16(R, G);
523	const __m128i RG_hi = _mm_unpackhi_epi16(R, G);
524	const __m128i GB_lo = _mm_unpacklo_epi16(G, B);
525	const __m128i GB_hi = _mm_unpackhi_epi16(G, B);
526	TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y);
527	}
528
529	static WEBP_INLINE void ConvertRGBToUV(const __m128i* const R,
530	const __m128i* const G,
531	const __m128i* const B,
532	__m128i* const U, __m128i* const V) {
533	const __m128i kRG_u = MK_CST_16(-`9719`, -`19081`);
534	const __m128i kGB_u = MK_CST_16(`0`, `28800`);
535	const __m128i kRG_v = MK_CST_16(`28800`, `0`);
536	const __m128i kGB_v = MK_CST_16(-`24116`, -`4684`);
537	const __m128i kHALF_UV = _mm_set1_epi32(((`128` << YUV_FIX) + YUV_HALF) << `2`);
538
539	const __m128i RG_lo = _mm_unpacklo_epi16(R, G);
540	const __m128i RG_hi = _mm_unpackhi_epi16(R, G);
541	const __m128i GB_lo = _mm_unpacklo_epi16(G, B);
542	const __m128i GB_hi = _mm_unpackhi_epi16(G, B);
543	TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_u, kGB_u,
544	kHALF_UV, YUV_FIX + `2`, *U);
545	TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_v, kGB_v,
546	kHALF_UV, YUV_FIX + `2`, *V);
547	}
548
549	#undef MK_CST_16
550	#undef TRANSFORM
551
552	static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) {
553	const int max_width = width & ~`31`;
554	int i;
555	for (i = `0`; i < max_width; rgb += `3` * `16` * `2`) {
556	__m128i rgb_plane[`6`];
557	int j;
558
559	RGB24PackedToPlanar(rgb, rgb_plane);
560
561	for (j = `0`; j < `2`; ++j, i += `16`) {
562	const __m128i zero = _mm_setzero_si128();
563	__m128i r, g, b, Y0, Y1;
564
565	// Convert to 16-bit Y.
566	r = _mm_unpacklo_epi8(rgb_plane[`0` + j], zero);
567	g = _mm_unpacklo_epi8(rgb_plane[`2` + j], zero);
568	b = _mm_unpacklo_epi8(rgb_plane[`4` + j], zero);
569	ConvertRGBToY(&r, &g, &b, &Y0);
570
571	// Convert to 16-bit Y.
572	r = _mm_unpackhi_epi8(rgb_plane[`0` + j], zero);
573	g = _mm_unpackhi_epi8(rgb_plane[`2` + j], zero);
574	b = _mm_unpackhi_epi8(rgb_plane[`4` + j], zero);
575	ConvertRGBToY(&r, &g, &b, &Y1);
576
577	// Cast to 8-bit and store.
578	STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
579	}
580	}
581	for (; i < width; ++i, rgb += `3`) { // left-over
582	y[i] = VP8RGBToY(rgb[`0`], rgb[`1`], rgb[`2`], YUV_HALF);
583	}
584	}
585
586	static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
587	const int max_width = width & ~`31`;
588	int i;
589	for (i = `0`; i < max_width; bgr += `3` * `16` * `2`) {
590	__m128i bgr_plane[`6`];
591	int j;
592
593	RGB24PackedToPlanar(bgr, bgr_plane);
594
595	for (j = `0`; j < `2`; ++j, i += `16`) {
596	const __m128i zero = _mm_setzero_si128();
597	__m128i r, g, b, Y0, Y1;
598
599	// Convert to 16-bit Y.
600	b = _mm_unpacklo_epi8(bgr_plane[`0` + j], zero);
601	g = _mm_unpacklo_epi8(bgr_plane[`2` + j], zero);
602	r = _mm_unpacklo_epi8(bgr_plane[`4` + j], zero);
603	ConvertRGBToY(&r, &g, &b, &Y0);
604
605	// Convert to 16-bit Y.
606	b = _mm_unpackhi_epi8(bgr_plane[`0` + j], zero);
607	g = _mm_unpackhi_epi8(bgr_plane[`2` + j], zero);
608	r = _mm_unpackhi_epi8(bgr_plane[`4` + j], zero);
609	ConvertRGBToY(&r, &g, &b, &Y1);
610
611	// Cast to 8-bit and store.
612	STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
613	}
614	}
615	for (; i < width; ++i, bgr += `3`) { // left-over
616	y[i] = VP8RGBToY(bgr[`2`], bgr[`1`], bgr[`0`], YUV_HALF);
617	}
618	}
619
620	static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {
621	const int max_width = width & ~`15`;
622	int i;
623	for (i = `0`; i < max_width; i += `16`) {
624	__m128i Y0, Y1, rgb[`6`];
625	RGB32PackedToPlanar(&argb[i], rgb);
626	ConvertRGBToY(&rgb[`0`], &rgb[`2`], &rgb[`4`], &Y0);
627	ConvertRGBToY(&rgb[`1`], &rgb[`3`], &rgb[`5`], &Y1);
628	STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
629	}
630	for (; i < width; ++i) { // left-over
631	const uint32_t p = argb[i];
632	y[i] = VP8RGBToY((p >> `16`) & `0xff`, (p >> `8`) & `0xff`, (p >> `0`) & `0xff`,
633	YUV_HALF);
634	}
635	}
636
// Horizontal add (doubled) of two 16b values, result is 16b.
// in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ...
// The four sums from *A fill the low half of *out and the four sums from *B
// fill the high half (packed with signed saturation). 'out' may alias A or B
// since both inputs are fully read before *out is written.
static void HorizontalAddPack(const __m128i* const A, const __m128i* const B,
                              __m128i* const out) {
  const __m128i k2 = _mm_set1_epi16(2);
  const __m128i C = _mm_madd_epi16(*A, k2);
  const __m128i D = _mm_madd_epi16(*B, k2);
  *out = _mm_packs_epi32(C, D);
}
646
647	static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v,
648	int src_width, int do_store) {
649	const int max_width = src_width & ~`31`;
650	int i;
651	for (i = `0`; i < max_width; i += `32`, u += `16`, v += `16`) {
652	__m128i rgb[`6`], U0, V0, U1, V1;
653	RGB32PackedToPlanar(&argb[i], rgb);
654	HorizontalAddPack(&rgb[`0`], &rgb[`1`], &rgb[`0`]);
655	HorizontalAddPack(&rgb[`2`], &rgb[`3`], &rgb[`2`]);
656	HorizontalAddPack(&rgb[`4`], &rgb[`5`], &rgb[`4`]);
657	ConvertRGBToUV(&rgb[`0`], &rgb[`2`], &rgb[`4`], &U0, &V0);
658
659	RGB32PackedToPlanar(&argb[i + `16`], rgb);
660	HorizontalAddPack(&rgb[`0`], &rgb[`1`], &rgb[`0`]);
661	HorizontalAddPack(&rgb[`2`], &rgb[`3`], &rgb[`2`]);
662	HorizontalAddPack(&rgb[`4`], &rgb[`5`], &rgb[`4`]);
663	ConvertRGBToUV(&rgb[`0`], &rgb[`2`], &rgb[`4`], &U1, &V1);
664
665	U0 = _mm_packus_epi16(U0, U1);
666	V0 = _mm_packus_epi16(V0, V1);
667	if (!do_store) {
668	const __m128i prev_u = LOAD_16(u);
669	const __m128i prev_v = LOAD_16(v);
670	U0 = _mm_avg_epu8(U0, prev_u);
671	V0 = _mm_avg_epu8(V0, prev_v);
672	}
673	STORE_16(U0, u);
674	STORE_16(V0, v);
675	}
676	if (i < src_width) { // left-over
677	WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
678	}
679	}
680
681	// Convert 16 packed ARGB 16b-values to r[], g[], b[]
682	static WEBP_INLINE void RGBA32PackedToPlanar_16b(const uint16_t* const rgbx,
683	__m128i* const r,
684	__m128i* const g,
685	__m128i* const b) {
686	const __m128i in0 = LOAD_16(rgbx + `0`); // r0 \| g0 \| b0 \|x\| r1 \| g1 \| b1 \|x
687	const __m128i in1 = LOAD_16(rgbx + `8`); // r2 \| g2 \| b2 \|x\| r3 \| g3 \| b3 \|x
688	const __m128i in2 = LOAD_16(rgbx + `16`); // r4 \| ...
689	const __m128i in3 = LOAD_16(rgbx + `24`); // r6 \| ...
690	// column-wise transpose
691	const __m128i A0 = _mm_unpacklo_epi16(in0, in1);
692	const __m128i A1 = _mm_unpackhi_epi16(in0, in1);
693	const __m128i A2 = _mm_unpacklo_epi16(in2, in3);
694	const __m128i A3 = _mm_unpackhi_epi16(in2, in3);
695	const __m128i B0 = _mm_unpacklo_epi16(A0, A1); // r0 r1 r2 r3 \| g0 g1 ..
696	const __m128i B1 = _mm_unpackhi_epi16(A0, A1); // b0 b1 b2 b3 \| x x x x
697	const __m128i B2 = _mm_unpacklo_epi16(A2, A3); // r4 r5 r6 r7 \| g4 g5 ..
698	const __m128i B3 = _mm_unpackhi_epi16(A2, A3); // b4 b5 b6 b7 \| x x x x
699	*r = _mm_unpacklo_epi64(B0, B2);
700	*g = _mm_unpackhi_epi64(B0, B2);
701	*b = _mm_unpacklo_epi64(B1, B3);
702	}
703
704	static void ConvertRGBA32ToUV(const uint16_t* rgb,
705	uint8_t* u, uint8_t* v, int width) {
706	const int max_width = width & ~`15`;
707	const uint16_t* const last_rgb = rgb + `4` * max_width;
708	while (rgb < last_rgb) {
709	__m128i r, g, b, U0, V0, U1, V1;
710	RGBA32PackedToPlanar_16b(rgb + `0`, &r, &g, &b);
711	ConvertRGBToUV(&r, &g, &b, &U0, &V0);
712	RGBA32PackedToPlanar_16b(rgb + `32`, &r, &g, &b);
713	ConvertRGBToUV(&r, &g, &b, &U1, &V1);
714	STORE_16(_mm_packus_epi16(U0, U1), u);
715	STORE_16(_mm_packus_epi16(V0, V1), v);
716	u += `16`;
717	v += `16`;
718	rgb += `2` * `32`;
719	}
720	if (max_width < width) { // left-over
721	WebPConvertRGBA32ToUV_C(rgb, u, v, width - max_width);
722	}
723	}
724
725	//------------------------------------------------------------------------------
726
727	extern void WebPInitConvertARGBToYUVSSE2(void);
728
729	WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {
730	WebPConvertARGBToY = ConvertARGBToY;
731	WebPConvertARGBToUV = ConvertARGBToUV;
732
733	WebPConvertRGB24ToY = ConvertRGB24ToY;
734	WebPConvertBGR24ToY = ConvertBGR24ToY;
735
736	WebPConvertRGBA32ToUV = ConvertRGBA32ToUV;
737	}
738
739	//------------------------------------------------------------------------------
740
#define MAX_Y ((1 << 10) - 1)    // 10b precision over 16b-arithmetic

// Clamps 'v' to the [0, MAX_Y] range.
static uint16_t clip_y(int v) {
  return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
}
745
746	static uint64_t SharpYUVUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
747	uint16_t* dst, int len) {
748	uint64_t diff = `0`;
749	uint32_t tmp[`4`];
750	int i;
751	const __m128i zero = _mm_setzero_si128();
752	const __m128i max = _mm_set1_epi16(MAX_Y);
753	const __m128i one = _mm_set1_epi16(`1`);
754	__m128i sum = zero;
755
756	for (i = `0`; i + `8` <= len; i += `8`) {
757	const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
758	const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
759	const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
760	const __m128i D = _mm_sub_epi16(A, B); // diff_y
761	const __m128i E = _mm_cmpgt_epi16(zero, D); // sign (-1 or 0)
762	const __m128i F = _mm_add_epi16(C, D); // new_y
763	const __m128i G = _mm_or_si128(E, one); // -1 or 1
764	const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);
765	const __m128i I = _mm_madd_epi16(D, G); // sum(abs(...))
766	_mm_storeu_si128((__m128i*)(dst + i), H);
767	sum = _mm_add_epi32(sum, I);
768	}
769	_mm_storeu_si128((__m128i*)tmp, sum);
770	diff = tmp[`3`] + tmp[`2`] + tmp[`1`] + tmp[`0`];
771	for (; i < len; ++i) {
772	const int diff_y = ref[i] - src[i];
773	const int new_y = (int)dst[i] + diff_y;
774	dst[i] = clip_y(new_y);
775	diff += (uint64_t)abs(diff_y);
776	}
777	return diff;
778	}
779
// For each of the 'len' samples: dst[i] += ref[i] - src[i], using wrapping
// 16b arithmetic in the SSE2 part (8 samples per iteration).
static void SharpYUVUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
                                   int16_t* dst, int len) {
  int i;
  for (i = 0; i + 8 <= len; i += 8) {
    const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
    const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
    const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
    const __m128i D = _mm_sub_epi16(A, B);   // diff_uv
    const __m128i E = _mm_add_epi16(C, D);   // new_uv
    _mm_storeu_si128((__m128i*)(dst + i), E);
  }
  for (; i < len; ++i) {   // left-over
    const int diff_uv = ref[i] - src[i];
    dst[i] += diff_uv;
  }
}
796
797	static void SharpYUVFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
798	const uint16_t* best_y, uint16_t* out) {
799	int i;
800	const __m128i kCst8 = _mm_set1_epi16(`8`);
801	const __m128i max = _mm_set1_epi16(MAX_Y);
802	const __m128i zero = _mm_setzero_si128();
803	for (i = `0`; i + `8` <= len; i += `8`) {
804	const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + `0`));
805	const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + `1`));
806	const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + `0`));
807	const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + `1`));
808	const __m128i a0b1 = _mm_add_epi16(a0, b1);
809	const __m128i a1b0 = _mm_add_epi16(a1, b0);
810	const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0); // A0+A1+B0+B1
811	const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
812	const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1); // 2(A0+B1)*
813	const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0); // 2(A1+B0)*
814	const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), `3`);
815	const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), `3`);
816	const __m128i d0 = _mm_add_epi16(c1, a0);
817	const __m128i d1 = _mm_add_epi16(c0, a1);
818	const __m128i e0 = _mm_srai_epi16(d0, `1`);
819	const __m128i e1 = _mm_srai_epi16(d1, `1`);
820	const __m128i f0 = _mm_unpacklo_epi16(e0, e1);
821	const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
822	const __m128i g0 = _mm_loadu_si128((const __m128i)(best_y + `2` i + `0`));
823	const __m128i g1 = _mm_loadu_si128((const __m128i)(best_y + `2` i + `8`));
824	const __m128i h0 = _mm_add_epi16(g0, f0);
825	const __m128i h1 = _mm_add_epi16(g1, f1);
826	const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);
827	const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
828	_mm_storeu_si128((__m128i)(out + `2` i + `0`), i0);
829	_mm_storeu_si128((__m128i)(out + `2` i + `8`), i1);
830	}
831	for (; i < len; ++i) {
832	// (9 A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =*
833	// = (8 A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4*
834	// We reuse the common sub-expressions.
835	const int a0b1 = A[i + `0`] + B[i + `1`];
836	const int a1b0 = A[i + `1`] + B[i + `0`];
837	const int a0a1b0b1 = a0b1 + a1b0 + `8`;
838	const int v0 = (`8` * A[i + `0`] + `2` * a1b0 + a0a1b0b1) >> `4`;
839	const int v1 = (`8` * A[i + `1`] + `2` * a0b1 + a0a1b0b1) >> `4`;
840	out[`2` * i + `0`] = clip_y(best_y[`2` * i + `0`] + v0);
841	out[`2` * i + `1`] = clip_y(best_y[`2` * i + `1`] + v1);
842	}
843	}
844
845	#undef MAX_Y
846
847	//------------------------------------------------------------------------------
848
849	extern void WebPInitSharpYUVSSE2(void);
850
851	WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVSSE2(void) {
852	WebPSharpYUVUpdateY = SharpYUVUpdateY_SSE2;
853	WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_SSE2;
854	WebPSharpYUVFilterRow = SharpYUVFilterRow_SSE2;
855	}
856
857	#else // !WEBP_USE_SSE2
858
859	WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2)
860	WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2)
861	WEBP_DSP_INIT_STUB(WebPInitSharpYUVSSE2)
862
863	#endif // WEBP_USE_SSE2
864

Browse the source code of engine/third_party/libwebp/src/dsp/yuv_sse2.c