yuv_sse2.c source code [Skia/third_party/externals/libwebp/src/dsp/yuv_sse2.c]

1	// Copyright 2014 Google Inc. All Rights Reserved.
2	//
3	// Use of this source code is governed by a BSD-style license
4	// that can be found in the COPYING file in the root of the source
5	// tree. An additional intellectual property rights grant can be found
6	// in the file PATENTS. All contributing project authors may
7	// be found in the AUTHORS file in the root of the source tree.
8	// -----------------------------------------------------------------------------
9	//
10	// YUV->RGB conversion functions
11	//
12	// Author: Skal (pascal.massimino@gmail.com)
13
14	#include "src/dsp/yuv.h"
15
16	#if defined(WEBP_USE_SSE2)
17
18	#include "src/dsp/common_sse2.h"
19	#include <stdlib.h>
20	#include <emmintrin.h>
21
22	//-----------------------------------------------------------------------------
23	// Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
24
// These constants are 14b fixed-point version of ITU-R BT.601 constants.
// R = (19077 * y             + 26149 * v - 14234) >> 6
// G = (19077 * y -  6419 * u - 13320 * v +  8708) >> 6
// B = (19077 * y + 33050 * u             - 17685) >> 6
//
// Converts eight Y/U/V samples, each stored in the upper byte of a 16b lane
// (see Load_HI_16_SSE2), into eight 16b R, G and B values. The results are
// NOT clamped to [0, 255]; callers saturate them with _mm_packus_epi16().
static void ConvertYUV444ToRGB_SSE2(const __m128i* const Y0,
                                    const __m128i* const U0,
                                    const __m128i* const V0,
                                    __m128i* const R,
                                    __m128i* const G,
                                    __m128i* const B) {
  const __m128i k19077 = _mm_set1_epi16(19077);
  const __m128i k26149 = _mm_set1_epi16(26149);
  const __m128i k14234 = _mm_set1_epi16(14234);
  // 33050 doesn't fit in a signed short: only use this with unsigned arithmetic
  const __m128i k33050 = _mm_set1_epi16((short)33050);
  const __m128i k17685 = _mm_set1_epi16(17685);
  const __m128i k6419  = _mm_set1_epi16(6419);
  const __m128i k13320 = _mm_set1_epi16(13320);
  const __m128i k8708  = _mm_set1_epi16(8708);

  // Luma contribution, shared by the three channels.
  const __m128i Y1 = _mm_mulhi_epu16(*Y0, k19077);

  const __m128i R0 = _mm_mulhi_epu16(*V0, k26149);
  const __m128i R1 = _mm_sub_epi16(Y1, k14234);
  const __m128i R2 = _mm_add_epi16(R1, R0);

  const __m128i G0 = _mm_mulhi_epu16(*U0, k6419);
  const __m128i G1 = _mm_mulhi_epu16(*V0, k13320);
  const __m128i G2 = _mm_add_epi16(Y1, k8708);
  const __m128i G3 = _mm_add_epi16(G0, G1);
  const __m128i G4 = _mm_sub_epi16(G2, G3);

  // be careful with the saturated *unsigned* arithmetic here!
  const __m128i B0 = _mm_mulhi_epu16(*U0, k33050);
  const __m128i B1 = _mm_adds_epu16(B0, Y1);
  const __m128i B2 = _mm_subs_epu16(B1, k17685);

  // use logical shift for B2, which can be larger than 32767
  *R = _mm_srai_epi16(R2, 6);   // range: [-14234, 30815]
  *G = _mm_srai_epi16(G4, 6);   // range: [-10953, 27710]
  *B = _mm_srli_epi16(B2, 6);   // range: [0, 34238]
}
67
68	// Load the bytes into the upper* part of 16b words. That's "<< 8", basically.*
69	static WEBP_INLINE __m128i Load_HI_16_SSE2(const uint8_t* src) {
70	const __m128i zero = _mm_setzero_si128();
71	return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src));
72	}
73
74	// Load and replicate the U/V samples
75	static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) {
76	const __m128i zero = _mm_setzero_si128();
77	const __m128i tmp0 = _mm_cvtsi32_si128((const* uint32_t*)src);
78	const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
79	return _mm_unpacklo_epi16(tmp1, tmp1); // replicate samples
80	}
81
82	// Convert 32 samples of YUV444 to R/G/B
83	static void YUV444ToRGB_SSE2(const uint8_t* const y,
84	const uint8_t* const u,
85	const uint8_t* const v,
86	__m128i* const R, __m128i* const G,
87	__m128i* const B) {
88	const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_HI_16_SSE2(u),
89	V0 = Load_HI_16_SSE2(v);
90	ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B);
91	}
92
93	// Convert 32 samples of YUV420 to R/G/B
94	static void YUV420ToRGB_SSE2(const uint8_t* const y,
95	const uint8_t* const u,
96	const uint8_t* const v,
97	__m128i* const R, __m128i* const G,
98	__m128i* const B) {
99	const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_UV_HI_8_SSE2(u),
100	V0 = Load_UV_HI_8_SSE2(v);
101	ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B);
102	}
103
104	// Pack R/G/B/A results into 32b output.
105	static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R,
106	const __m128i* const G,
107	const __m128i* const B,
108	const __m128i* const A,
109	uint8_t* const dst) {
110	const __m128i rb = _mm_packus_epi16(R, B);
111	const __m128i ga = _mm_packus_epi16(G, A);
112	const __m128i rg = _mm_unpacklo_epi8(rb, ga);
113	const __m128i ba = _mm_unpackhi_epi8(rb, ga);
114	const __m128i RGBA_lo = _mm_unpacklo_epi16(rg, ba);
115	const __m128i RGBA_hi = _mm_unpackhi_epi16(rg, ba);
116	_mm_storeu_si128((__m128i*)(dst + `0`), RGBA_lo);
117	_mm_storeu_si128((__m128i*)(dst + `16`), RGBA_hi);
118	}
119
120	// Pack R/G/B/A results into 16b output.
121	static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R,
122	const __m128i* const G,
123	const __m128i* const B,
124	const __m128i* const A,
125	uint8_t* const dst) {
126	#if (WEBP_SWAP_16BIT_CSP == 0)
127	const __m128i rg0 = _mm_packus_epi16(R, G);
128	const __m128i ba0 = _mm_packus_epi16(B, A);
129	#else
130	const __m128i rg0 = _mm_packus_epi16(B, A);
131	const __m128i ba0 = _mm_packus_epi16(R, G);
132	#endif
133	const __m128i mask_0xf0 = _mm_set1_epi8(`0xf0`);
134	const __m128i rb1 = _mm_unpacklo_epi8(rg0, ba0); // rbrbrbrbrb...
135	const __m128i ga1 = _mm_unpackhi_epi8(rg0, ba0); // gagagagaga...
136	const __m128i rb2 = _mm_and_si128(rb1, mask_0xf0);
137	const __m128i ga2 = _mm_srli_epi16(_mm_and_si128(ga1, mask_0xf0), `4`);
138	const __m128i rgba4444 = _mm_or_si128(rb2, ga2);
139	_mm_storeu_si128((__m128i*)dst, rgba4444);
140	}
141
142	// Pack R/G/B results into 16b output.
143	static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R,
144	const __m128i* const G,
145	const __m128i* const B,
146	uint8_t* const dst) {
147	const __m128i r0 = _mm_packus_epi16(R, R);
148	const __m128i g0 = _mm_packus_epi16(G, G);
149	const __m128i b0 = _mm_packus_epi16(B, B);
150	const __m128i r1 = _mm_and_si128(r0, _mm_set1_epi8(`0xf8`));
151	const __m128i b1 = _mm_and_si128(_mm_srli_epi16(b0, `3`), _mm_set1_epi8(`0x1f`));
152	const __m128i g1 = _mm_srli_epi16(_mm_and_si128(g0, _mm_set1_epi8(`0xe0`)), `5`);
153	const __m128i g2 = _mm_slli_epi16(_mm_and_si128(g0, _mm_set1_epi8(`0x1c`)), `3`);
154	const __m128i rg = _mm_or_si128(r1, g1);
155	const __m128i gb = _mm_or_si128(g2, b1);
156	#if (WEBP_SWAP_16BIT_CSP == 0)
157	const __m128i rgb565 = _mm_unpacklo_epi8(rg, gb);
158	#else
159	const __m128i rgb565 = _mm_unpacklo_epi8(gb, rg);
160	#endif
161	_mm_storeu_si128((__m128i*)dst, rgb565);
162	}
163
164	// Pack the planar buffers
165	// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
166	// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
167	static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
168	__m128i* const in2, __m128i* const in3,
169	__m128i* const in4, __m128i* const in5,
170	uint8_t* const rgb) {
171	// The input is 6 registers of sixteen 8b but for the sake of explanation,
172	// let's take 6 registers of four 8b values.
173	// To pack, we will keep taking one every two 8b integer and move it
174	// around as follows:
175	// Input:
176	// r0r1r2r3 \| r4r5r6r7 \| g0g1g2g3 \| g4g5g6g7 \| b0b1b2b3 \| b4b5b6b7
177	// Split the 6 registers in two sets of 3 registers: the first set as the even
178	// 8b bytes, the second the odd ones:
179	// r0r2r4r6 \| g0g2g4g6 \| b0b2b4b6 \| r1r3r5r7 \| g1g3g5g7 \| b1b3b5b7
180	// Repeat the same permutations twice more:
181	// r0r4g0g4 \| b0b4r1r5 \| g1g5b1b5 \| r2r6g2g6 \| b2b6r3r7 \| g3g7b3b7
182	// r0g0b0r1 \| g1b1r2g2 \| b2r3g3b3 \| r4g4b4r5 \| g5b5r6g6 \| b6r7g7b7
183	VP8PlanarTo24b_SSE2(in0, in1, in2, in3, in4, in5);
184
185	_mm_storeu_si128((__m128i)(rgb + `0`), in0);
186	_mm_storeu_si128((__m128i)(rgb + `16`), in1);
187	_mm_storeu_si128((__m128i)(rgb + `32`), in2);
188	_mm_storeu_si128((__m128i)(rgb + `48`), in3);
189	_mm_storeu_si128((__m128i)(rgb + `64`), in4);
190	_mm_storeu_si128((__m128i)(rgb + `80`), in5);
191	}
192
// Convert 32 YUV444 pixels to RGBA (alpha forced to 255).
// Reads 32 bytes from each of 'y', 'u', 'v' and writes 128 bytes to 'dst'.
void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                         uint8_t* dst) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n;
  for (n = 0; n < 32; n += 8, dst += 32) {
    __m128i R, G, B;
    YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
    PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst);
  }
}
203
// Convert 32 YUV444 pixels to BGRA (alpha forced to 255).
// Reads 32 bytes from each of 'y', 'u', 'v' and writes 128 bytes to 'dst'.
void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                         uint8_t* dst) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n;
  for (n = 0; n < 32; n += 8, dst += 32) {
    __m128i R, G, B;
    YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
    // Same packer as RGBA, with the R and B channels swapped.
    PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst);
  }
}
214
// Convert 32 YUV444 pixels to ARGB (alpha forced to 255).
// Reads 32 bytes from each of 'y', 'u', 'v' and writes 128 bytes to 'dst'.
void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                         uint8_t* dst) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n;
  for (n = 0; n < 32; n += 8, dst += 32) {
    __m128i R, G, B;
    YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
    // Alpha goes first in the byte order.
    PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst);
  }
}
225
// Convert 32 YUV444 pixels to RGBA4444 (alpha forced to its maximum).
// Reads 32 bytes from each of 'y', 'u', 'v' and writes 64 bytes to 'dst'.
void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
                             const uint8_t* v, uint8_t* dst) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n;
  for (n = 0; n < 32; n += 8, dst += 16) {
    __m128i R, G, B;
    YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
    PackAndStore4444_SSE2(&R, &G, &B, &kAlpha, dst);
  }
}
236
// Convert 32 YUV444 pixels to RGB565.
// Reads 32 bytes from each of 'y', 'u', 'v' and writes 64 bytes to 'dst'.
void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                           uint8_t* dst) {
  int n;
  for (n = 0; n < 32; n += 8, dst += 16) {
    __m128i R, G, B;
    YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
    PackAndStore565_SSE2(&R, &G, &B, dst);
  }
}
246
// Convert 32 YUV444 pixels to packed 24b RGB.
// Reads 32 bytes from each of 'y', 'u', 'v' and writes 96 bytes to 'dst'.
void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                        uint8_t* dst) {
  __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
  __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;

  YUV444ToRGB_SSE2(y +  0, u +  0, v +  0, &R0, &G0, &B0);
  YUV444ToRGB_SSE2(y +  8, u +  8, v +  8, &R1, &G1, &B1);
  YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2);
  YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3);

  // Cast to 8b and store as RRRRGGGGBBBB.
  rgb0 = _mm_packus_epi16(R0, R1);
  rgb1 = _mm_packus_epi16(R2, R3);
  rgb2 = _mm_packus_epi16(G0, G1);
  rgb3 = _mm_packus_epi16(G2, G3);
  rgb4 = _mm_packus_epi16(B0, B1);
  rgb5 = _mm_packus_epi16(B2, B3);

  // Pack as RGBRGBRGBRGB.
  PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
}
268
// Convert 32 YUV444 pixels to packed 24b BGR.
// Reads 32 bytes from each of 'y', 'u', 'v' and writes 96 bytes to 'dst'.
void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                        uint8_t* dst) {
  __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
  __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;

  YUV444ToRGB_SSE2(y +  0, u +  0, v +  0, &R0, &G0, &B0);
  YUV444ToRGB_SSE2(y +  8, u +  8, v +  8, &R1, &G1, &B1);
  YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2);
  YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3);

  // Cast to 8b and store as BBBBGGGGRRRR.
  bgr0 = _mm_packus_epi16(B0, B1);
  bgr1 = _mm_packus_epi16(B2, B3);
  bgr2 = _mm_packus_epi16(G0, G1);
  bgr3 = _mm_packus_epi16(G2, G3);
  bgr4 = _mm_packus_epi16(R0, R1);
  bgr5 = _mm_packus_epi16(R2, R3);

  // Pack as BGRBGRBGRBGR.
  PlanarTo24b_SSE2(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
}
290
291	//-----------------------------------------------------------------------------
292	// Arbitrary-length row conversion functions
293
// Convert a row of 'len' YUV420 pixels to RGBA (alpha = 255).
// Groups of 8 pixels go through the SSE2 path (8 luma bytes, 4 u/v bytes,
// 32 output bytes); the remaining pixels use the scalar VP8YuvToRgba().
static void YuvToRgbaRow_SSE2(const uint8_t* y,
                              const uint8_t* u, const uint8_t* v,
                              uint8_t* dst, int len) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n;
  for (n = 0; n + 8 <= len; n += 8, dst += 32) {
    __m128i R, G, B;
    YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
    PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst);
    y += 8;
    u += 4;
    v += 4;
  }
  for (; n < len; ++n) {   // Finish off
    VP8YuvToRgba(y[0], u[0], v[0], dst);
    dst += 4;
    y += 1;
    // Chroma advances only after every odd pixel (one u/v pair per 2 pixels).
    u += (n & 1);
    v += (n & 1);
  }
}
315
// Convert a row of 'len' YUV420 pixels to BGRA (alpha = 255).
// Groups of 8 pixels go through the SSE2 path (8 luma bytes, 4 u/v bytes,
// 32 output bytes); the remaining pixels use the scalar VP8YuvToBgra().
static void YuvToBgraRow_SSE2(const uint8_t* y,
                              const uint8_t* u, const uint8_t* v,
                              uint8_t* dst, int len) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n;
  for (n = 0; n + 8 <= len; n += 8, dst += 32) {
    __m128i R, G, B;
    YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
    PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst);
    y += 8;
    u += 4;
    v += 4;
  }
  for (; n < len; ++n) {   // Finish off
    VP8YuvToBgra(y[0], u[0], v[0], dst);
    dst += 4;
    y += 1;
    // Chroma advances only after every odd pixel (one u/v pair per 2 pixels).
    u += (n & 1);
    v += (n & 1);
  }
}
337
// Convert a row of 'len' YUV420 pixels to ARGB (alpha = 255).
// Groups of 8 pixels go through the SSE2 path (8 luma bytes, 4 u/v bytes,
// 32 output bytes); the remaining pixels use the scalar VP8YuvToArgb().
static void YuvToArgbRow_SSE2(const uint8_t* y,
                              const uint8_t* u, const uint8_t* v,
                              uint8_t* dst, int len) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n;
  for (n = 0; n + 8 <= len; n += 8, dst += 32) {
    __m128i R, G, B;
    YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
    PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst);
    y += 8;
    u += 4;
    v += 4;
  }
  for (; n < len; ++n) {   // Finish off
    VP8YuvToArgb(y[0], u[0], v[0], dst);
    dst += 4;
    y += 1;
    // Chroma advances only after every odd pixel (one u/v pair per 2 pixels).
    u += (n & 1);
    v += (n & 1);
  }
}
359
// Convert a row of 'len' YUV420 pixels to packed 24b RGB.
// Groups of 32 pixels go through the SSE2 path (32 luma bytes, 16 u/v bytes,
// 96 output bytes); the remaining pixels use the scalar VP8YuvToRgb().
static void YuvToRgbRow_SSE2(const uint8_t* y,
                             const uint8_t* u, const uint8_t* v,
                             uint8_t* dst, int len) {
  int n;
  for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
    __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
    __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;

    YUV420ToRGB_SSE2(y +  0, u +  0, v +  0, &R0, &G0, &B0);
    YUV420ToRGB_SSE2(y +  8, u +  4, v +  4, &R1, &G1, &B1);
    YUV420ToRGB_SSE2(y + 16, u +  8, v +  8, &R2, &G2, &B2);
    YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3);

    // Cast to 8b and store as RRRRGGGGBBBB.
    rgb0 = _mm_packus_epi16(R0, R1);
    rgb1 = _mm_packus_epi16(R2, R3);
    rgb2 = _mm_packus_epi16(G0, G1);
    rgb3 = _mm_packus_epi16(G2, G3);
    rgb4 = _mm_packus_epi16(B0, B1);
    rgb5 = _mm_packus_epi16(B2, B3);

    // Pack as RGBRGBRGBRGB.
    PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);

    y += 32;
    u += 16;
    v += 16;
  }
  for (; n < len; ++n) {   // Finish off
    VP8YuvToRgb(y[0], u[0], v[0], dst);
    dst += 3;
    y += 1;
    // Chroma advances only after every odd pixel (one u/v pair per 2 pixels).
    u += (n & 1);
    v += (n & 1);
  }
}
396
// Convert a row of 'len' YUV420 pixels to packed 24b BGR.
// Groups of 32 pixels go through the SSE2 path (32 luma bytes, 16 u/v bytes,
// 96 output bytes); the remaining pixels use the scalar VP8YuvToBgr().
static void YuvToBgrRow_SSE2(const uint8_t* y,
                             const uint8_t* u, const uint8_t* v,
                             uint8_t* dst, int len) {
  int n;
  for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
    __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
    __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;

    YUV420ToRGB_SSE2(y +  0, u +  0, v +  0, &R0, &G0, &B0);
    YUV420ToRGB_SSE2(y +  8, u +  4, v +  4, &R1, &G1, &B1);
    YUV420ToRGB_SSE2(y + 16, u +  8, v +  8, &R2, &G2, &B2);
    YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3);

    // Cast to 8b and store as BBBBGGGGRRRR.
    bgr0 = _mm_packus_epi16(B0, B1);
    bgr1 = _mm_packus_epi16(B2, B3);
    bgr2 = _mm_packus_epi16(G0, G1);
    bgr3 = _mm_packus_epi16(G2, G3);
    bgr4 = _mm_packus_epi16(R0, R1);
    bgr5 = _mm_packus_epi16(R2, R3);

    // Pack as BGRBGRBGRBGR.
    PlanarTo24b_SSE2(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);

    y += 32;
    u += 16;
    v += 16;
  }
  for (; n < len; ++n) {   // Finish off
    VP8YuvToBgr(y[0], u[0], v[0], dst);
    dst += 3;
    y += 1;
    // Chroma advances only after every odd pixel (one u/v pair per 2 pixels).
    u += (n & 1);
    v += (n & 1);
  }
}
433
434	//------------------------------------------------------------------------------
435	// Entry point
436
437	extern void WebPInitSamplersSSE2(void);
438
439	WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE2(void) {
440	WebPSamplers[MODE_RGB] = YuvToRgbRow_SSE2;
441	WebPSamplers[MODE_RGBA] = YuvToRgbaRow_SSE2;
442	WebPSamplers[MODE_BGR] = YuvToBgrRow_SSE2;
443	WebPSamplers[MODE_BGRA] = YuvToBgraRow_SSE2;
444	WebPSamplers[MODE_ARGB] = YuvToArgbRow_SSE2;
445	}
446
447	//------------------------------------------------------------------------------
448	// RGB24/32 -> YUV converters
449
// Load eight 16b-words (one 128b register, unaligned) from *src.
#define LOAD_16(src) _mm_loadu_si128((const __m128i*)(src))
// Store eight 16b-words (one 128b register, unaligned) into *dst.
#define STORE_16(V, dst) _mm_storeu_si128((__m128i*)(dst), (V))
454
455	// Function that inserts a value of the second half of the in buffer in between
456	// every two char of the first half.
457	static WEBP_INLINE void RGB24PackedToPlanarHelper_SSE2(
458	const __m128i* const in /in[6]/, __m128i* const out /out[6]/) {
459	out[`0`] = _mm_unpacklo_epi8(in[`0`], in[`3`]);
460	out[`1`] = _mm_unpackhi_epi8(in[`0`], in[`3`]);
461	out[`2`] = _mm_unpacklo_epi8(in[`1`], in[`4`]);
462	out[`3`] = _mm_unpackhi_epi8(in[`1`], in[`4`]);
463	out[`4`] = _mm_unpacklo_epi8(in[`2`], in[`5`]);
464	out[`5`] = _mm_unpackhi_epi8(in[`2`], in[`5`]);
465	}
466
467	// Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers:
468	// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
469	// Similar to PlanarTo24bHelper(), but in reverse order.
470	static WEBP_INLINE void RGB24PackedToPlanar_SSE2(
471	const uint8_t* const rgb, __m128i* const out /out[6]/) {
472	__m128i tmp[`6`];
473	tmp[`0`] = _mm_loadu_si128((const __m128i*)(rgb + `0`));
474	tmp[`1`] = _mm_loadu_si128((const __m128i*)(rgb + `16`));
475	tmp[`2`] = _mm_loadu_si128((const __m128i*)(rgb + `32`));
476	tmp[`3`] = _mm_loadu_si128((const __m128i*)(rgb + `48`));
477	tmp[`4`] = _mm_loadu_si128((const __m128i*)(rgb + `64`));
478	tmp[`5`] = _mm_loadu_si128((const __m128i*)(rgb + `80`));
479
480	RGB24PackedToPlanarHelper_SSE2(tmp, out);
481	RGB24PackedToPlanarHelper_SSE2(out, tmp);
482	RGB24PackedToPlanarHelper_SSE2(tmp, out);
483	RGB24PackedToPlanarHelper_SSE2(out, tmp);
484	RGB24PackedToPlanarHelper_SSE2(tmp, out);
485	}
486
487	// Convert 8 packed ARGB to r[], g[], b[]
488	static WEBP_INLINE void RGB32PackedToPlanar_SSE2(const uint32_t* const argb,
489	__m128i* const rgb /in[6]/) {
490	const __m128i zero = _mm_setzero_si128();
491	__m128i a0 = LOAD_16(argb + `0`);
492	__m128i a1 = LOAD_16(argb + `4`);
493	__m128i a2 = LOAD_16(argb + `8`);
494	__m128i a3 = LOAD_16(argb + `12`);
495	VP8L32bToPlanar_SSE2(&a0, &a1, &a2, &a3);
496	rgb[`0`] = _mm_unpacklo_epi8(a1, zero);
497	rgb[`1`] = _mm_unpackhi_epi8(a1, zero);
498	rgb[`2`] = _mm_unpacklo_epi8(a2, zero);
499	rgb[`3`] = _mm_unpackhi_epi8(a2, zero);
500	rgb[`4`] = _mm_unpacklo_epi8(a3, zero);
501	rgb[`5`] = _mm_unpackhi_epi8(a3, zero);
502	}
503
// This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX
// on interleaved 16b pairs, packing the 32b results back to 16b in OUT.
// It's a macro and not a function because we need to use immediate values with
// srai_epi32, e.g.
#define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB,                \
                  ROUNDER, DESCALE_FIX, OUT) do {                              \
  const __m128i T_rg_lo = _mm_madd_epi16(RG_LO, MULT_RG);                      \
  const __m128i T_rg_hi = _mm_madd_epi16(RG_HI, MULT_RG);                      \
  const __m128i T_gb_lo = _mm_madd_epi16(GB_LO, MULT_GB);                      \
  const __m128i T_gb_hi = _mm_madd_epi16(GB_HI, MULT_GB);                      \
  const __m128i T_sum_lo = _mm_add_epi32(T_rg_lo, T_gb_lo);                    \
  const __m128i T_sum_hi = _mm_add_epi32(T_rg_hi, T_gb_hi);                    \
  const __m128i T_rnd_lo = _mm_add_epi32(T_sum_lo, ROUNDER);                   \
  const __m128i T_rnd_hi = _mm_add_epi32(T_sum_hi, ROUNDER);                   \
  const __m128i T_out_lo = _mm_srai_epi32(T_rnd_lo, DESCALE_FIX);              \
  const __m128i T_out_hi = _mm_srai_epi32(T_rnd_hi, DESCALE_FIX);              \
  (OUT) = _mm_packs_epi32(T_out_lo, T_out_hi);                                 \
} while (0)

// Build a register of eight 16b values alternating A, B, A, B, ...
// (A in the even lanes, B in the odd lanes).
#define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))
523	static WEBP_INLINE void ConvertRGBToY_SSE2(const __m128i* const R,
524	const __m128i* const G,
525	const __m128i* const B,
526	__m128i* const Y) {
527	const __m128i kRG_y = MK_CST_16(`16839`, `33059` - `16384`);
528	const __m128i kGB_y = MK_CST_16(`16384`, `6420`);
529	const __m128i kHALF_Y = _mm_set1_epi32((`16` << YUV_FIX) + YUV_HALF);
530
531	const __m128i RG_lo = _mm_unpacklo_epi16(R, G);
532	const __m128i RG_hi = _mm_unpackhi_epi16(R, G);
533	const __m128i GB_lo = _mm_unpacklo_epi16(G, B);
534	const __m128i GB_hi = _mm_unpackhi_epi16(G, B);
535	TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y);
536	}
537
538	static WEBP_INLINE void ConvertRGBToUV_SSE2(const __m128i* const R,
539	const __m128i* const G,
540	const __m128i* const B,
541	__m128i* const U,
542	__m128i* const V) {
543	const __m128i kRG_u = MK_CST_16(-`9719`, -`19081`);
544	const __m128i kGB_u = MK_CST_16(`0`, `28800`);
545	const __m128i kRG_v = MK_CST_16(`28800`, `0`);
546	const __m128i kGB_v = MK_CST_16(-`24116`, -`4684`);
547	const __m128i kHALF_UV = _mm_set1_epi32(((`128` << YUV_FIX) + YUV_HALF) << `2`);
548
549	const __m128i RG_lo = _mm_unpacklo_epi16(R, G);
550	const __m128i RG_hi = _mm_unpackhi_epi16(R, G);
551	const __m128i GB_lo = _mm_unpacklo_epi16(G, B);
552	const __m128i GB_hi = _mm_unpackhi_epi16(G, B);
553	TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_u, kGB_u,
554	kHALF_UV, YUV_FIX + `2`, *U);
555	TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_v, kGB_v,
556	kHALF_UV, YUV_FIX + `2`, *V);
557	}
558
559	#undef MK_CST_16
560	#undef TRANSFORM
561
562	static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) {
563	const int max_width = width & ~`31`;
564	int i;
565	for (i = `0`; i < max_width; rgb += `3` * `16` * `2`) {
566	__m128i rgb_plane[`6`];
567	int j;
568
569	RGB24PackedToPlanar_SSE2(rgb, rgb_plane);
570
571	for (j = `0`; j < `2`; ++j, i += `16`) {
572	const __m128i zero = _mm_setzero_si128();
573	__m128i r, g, b, Y0, Y1;
574
575	// Convert to 16-bit Y.
576	r = _mm_unpacklo_epi8(rgb_plane[`0` + j], zero);
577	g = _mm_unpacklo_epi8(rgb_plane[`2` + j], zero);
578	b = _mm_unpacklo_epi8(rgb_plane[`4` + j], zero);
579	ConvertRGBToY_SSE2(&r, &g, &b, &Y0);
580
581	// Convert to 16-bit Y.
582	r = _mm_unpackhi_epi8(rgb_plane[`0` + j], zero);
583	g = _mm_unpackhi_epi8(rgb_plane[`2` + j], zero);
584	b = _mm_unpackhi_epi8(rgb_plane[`4` + j], zero);
585	ConvertRGBToY_SSE2(&r, &g, &b, &Y1);
586
587	// Cast to 8-bit and store.
588	STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
589	}
590	}
591	for (; i < width; ++i, rgb += `3`) { // left-over
592	y[i] = VP8RGBToY(rgb[`0`], rgb[`1`], rgb[`2`], YUV_HALF);
593	}
594	}
595
596	static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) {
597	const int max_width = width & ~`31`;
598	int i;
599	for (i = `0`; i < max_width; bgr += `3` * `16` * `2`) {
600	__m128i bgr_plane[`6`];
601	int j;
602
603	RGB24PackedToPlanar_SSE2(bgr, bgr_plane);
604
605	for (j = `0`; j < `2`; ++j, i += `16`) {
606	const __m128i zero = _mm_setzero_si128();
607	__m128i r, g, b, Y0, Y1;
608
609	// Convert to 16-bit Y.
610	b = _mm_unpacklo_epi8(bgr_plane[`0` + j], zero);
611	g = _mm_unpacklo_epi8(bgr_plane[`2` + j], zero);
612	r = _mm_unpacklo_epi8(bgr_plane[`4` + j], zero);
613	ConvertRGBToY_SSE2(&r, &g, &b, &Y0);
614
615	// Convert to 16-bit Y.
616	b = _mm_unpackhi_epi8(bgr_plane[`0` + j], zero);
617	g = _mm_unpackhi_epi8(bgr_plane[`2` + j], zero);
618	r = _mm_unpackhi_epi8(bgr_plane[`4` + j], zero);
619	ConvertRGBToY_SSE2(&r, &g, &b, &Y1);
620
621	// Cast to 8-bit and store.
622	STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
623	}
624	}
625	for (; i < width; ++i, bgr += `3`) { // left-over
626	y[i] = VP8RGBToY(bgr[`2`], bgr[`1`], bgr[`0`], YUV_HALF);
627	}
628	}
629
630	static void ConvertARGBToY_SSE2(const uint32_t* argb, uint8_t* y, int width) {
631	const int max_width = width & ~`15`;
632	int i;
633	for (i = `0`; i < max_width; i += `16`) {
634	__m128i Y0, Y1, rgb[`6`];
635	RGB32PackedToPlanar_SSE2(&argb[i], rgb);
636	ConvertRGBToY_SSE2(&rgb[`0`], &rgb[`2`], &rgb[`4`], &Y0);
637	ConvertRGBToY_SSE2(&rgb[`1`], &rgb[`3`], &rgb[`5`], &Y1);
638	STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
639	}
640	for (; i < width; ++i) { // left-over
641	const uint32_t p = argb[i];
642	y[i] = VP8RGBToY((p >> `16`) & `0xff`, (p >> `8`) & `0xff`, (p >> `0`) & `0xff`,
643	YUV_HALF);
644	}
645	}
646
// Horizontal add (doubled) of two 16b values, result is 16b.
// in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ...
// The four sums from *A fill the low half of *out, those from *B the high
// half (with signed 16b saturation). *out may alias *A or *B.
static void HorizontalAddPack_SSE2(const __m128i* const A,
                                   const __m128i* const B,
                                   __m128i* const out) {
  const __m128i k2 = _mm_set1_epi16(2);
  const __m128i C = _mm_madd_epi16(*A, k2);
  const __m128i D = _mm_madd_epi16(*B, k2);
  *out = _mm_packs_epi32(C, D);
}
657
658	static void ConvertARGBToUV_SSE2(const uint32_t* argb,
659	uint8_t* u, uint8_t* v,
660	int src_width, int do_store) {
661	const int max_width = src_width & ~`31`;
662	int i;
663	for (i = `0`; i < max_width; i += `32`, u += `16`, v += `16`) {
664	__m128i rgb[`6`], U0, V0, U1, V1;
665	RGB32PackedToPlanar_SSE2(&argb[i], rgb);
666	HorizontalAddPack_SSE2(&rgb[`0`], &rgb[`1`], &rgb[`0`]);
667	HorizontalAddPack_SSE2(&rgb[`2`], &rgb[`3`], &rgb[`2`]);
668	HorizontalAddPack_SSE2(&rgb[`4`], &rgb[`5`], &rgb[`4`]);
669	ConvertRGBToUV_SSE2(&rgb[`0`], &rgb[`2`], &rgb[`4`], &U0, &V0);
670
671	RGB32PackedToPlanar_SSE2(&argb[i + `16`], rgb);
672	HorizontalAddPack_SSE2(&rgb[`0`], &rgb[`1`], &rgb[`0`]);
673	HorizontalAddPack_SSE2(&rgb[`2`], &rgb[`3`], &rgb[`2`]);
674	HorizontalAddPack_SSE2(&rgb[`4`], &rgb[`5`], &rgb[`4`]);
675	ConvertRGBToUV_SSE2(&rgb[`0`], &rgb[`2`], &rgb[`4`], &U1, &V1);
676
677	U0 = _mm_packus_epi16(U0, U1);
678	V0 = _mm_packus_epi16(V0, V1);
679	if (!do_store) {
680	const __m128i prev_u = LOAD_16(u);
681	const __m128i prev_v = LOAD_16(v);
682	U0 = _mm_avg_epu8(U0, prev_u);
683	V0 = _mm_avg_epu8(V0, prev_v);
684	}
685	STORE_16(U0, u);
686	STORE_16(V0, v);
687	}
688	if (i < src_width) { // left-over
689	WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
690	}
691	}
692
693	// Convert 16 packed ARGB 16b-values to r[], g[], b[]
694	static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2(
695	const uint16_t* const rgbx,
696	__m128i* const r, __m128i* const g, __m128i* const b) {
697	const __m128i in0 = LOAD_16(rgbx + `0`); // r0 \| g0 \| b0 \|x\| r1 \| g1 \| b1 \|x
698	const __m128i in1 = LOAD_16(rgbx + `8`); // r2 \| g2 \| b2 \|x\| r3 \| g3 \| b3 \|x
699	const __m128i in2 = LOAD_16(rgbx + `16`); // r4 \| ...
700	const __m128i in3 = LOAD_16(rgbx + `24`); // r6 \| ...
701	// column-wise transpose
702	const __m128i A0 = _mm_unpacklo_epi16(in0, in1);
703	const __m128i A1 = _mm_unpackhi_epi16(in0, in1);
704	const __m128i A2 = _mm_unpacklo_epi16(in2, in3);
705	const __m128i A3 = _mm_unpackhi_epi16(in2, in3);
706	const __m128i B0 = _mm_unpacklo_epi16(A0, A1); // r0 r1 r2 r3 \| g0 g1 ..
707	const __m128i B1 = _mm_unpackhi_epi16(A0, A1); // b0 b1 b2 b3 \| x x x x
708	const __m128i B2 = _mm_unpacklo_epi16(A2, A3); // r4 r5 r6 r7 \| g4 g5 ..
709	const __m128i B3 = _mm_unpackhi_epi16(A2, A3); // b4 b5 b6 b7 \| x x x x
710	*r = _mm_unpacklo_epi64(B0, B2);
711	*g = _mm_unpackhi_epi64(B0, B2);
712	*b = _mm_unpacklo_epi64(B1, B3);
713	}
714
// Convert 'width' pixels of four 16b values each (r, g, b, x) to U/V.
// Each SSE2 iteration consumes 16 pixels (64 uint16_t) and produces 16 u and
// 16 v samples; the left-over is delegated to WebPConvertRGBA32ToUV_C().
static void ConvertRGBA32ToUV_SSE2(const uint16_t* rgb,
                                   uint8_t* u, uint8_t* v, int width) {
  const int max_width = width & ~15;
  const uint16_t* const last_rgb = rgb + 4 * max_width;
  while (rgb < last_rgb) {
    __m128i r, g, b, U0, V0, U1, V1;
    RGBA32PackedToPlanar_16b_SSE2(rgb +  0, &r, &g, &b);
    ConvertRGBToUV_SSE2(&r, &g, &b, &U0, &V0);
    RGBA32PackedToPlanar_16b_SSE2(rgb + 32, &r, &g, &b);
    ConvertRGBToUV_SSE2(&r, &g, &b, &U1, &V1);
    STORE_16(_mm_packus_epi16(U0, U1), u);
    STORE_16(_mm_packus_epi16(V0, V1), v);
    u += 16;
    v += 16;
    rgb += 2 * 32;
  }
  if (max_width < width) {  // left-over
    WebPConvertRGBA32ToUV_C(rgb, u, v, width - max_width);
  }
}
735
736	//------------------------------------------------------------------------------
737
738	extern void WebPInitConvertARGBToYUVSSE2(void);
739
740	WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {
741	WebPConvertARGBToY = ConvertARGBToY_SSE2;
742	WebPConvertARGBToUV = ConvertARGBToUV_SSE2;
743
744	WebPConvertRGB24ToY = ConvertRGB24ToY_SSE2;
745	WebPConvertBGR24ToY = ConvertBGR24ToY_SSE2;
746
747	WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE2;
748	}
749
750	//------------------------------------------------------------------------------
751
#define MAX_Y ((1 << 10) - 1)    // 10b precision over 16b-arithmetic
// Clamp 'v' to the [0, MAX_Y] range.
static uint16_t clip_y(int v) {
  return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
}
756
757	static uint64_t SharpYUVUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
758	uint16_t* dst, int len) {
759	uint64_t diff = `0`;
760	uint32_t tmp[`4`];
761	int i;
762	const __m128i zero = _mm_setzero_si128();
763	const __m128i max = _mm_set1_epi16(MAX_Y);
764	const __m128i one = _mm_set1_epi16(`1`);
765	__m128i sum = zero;
766
767	for (i = `0`; i + `8` <= len; i += `8`) {
768	const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
769	const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
770	const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
771	const __m128i D = _mm_sub_epi16(A, B); // diff_y
772	const __m128i E = _mm_cmpgt_epi16(zero, D); // sign (-1 or 0)
773	const __m128i F = _mm_add_epi16(C, D); // new_y
774	const __m128i G = _mm_or_si128(E, one); // -1 or 1
775	const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);
776	const __m128i I = _mm_madd_epi16(D, G); // sum(abs(...))
777	_mm_storeu_si128((__m128i*)(dst + i), H);
778	sum = _mm_add_epi32(sum, I);
779	}
780	_mm_storeu_si128((__m128i*)tmp, sum);
781	diff = tmp[`3`] + tmp[`2`] + tmp[`1`] + tmp[`0`];
782	for (; i < len; ++i) {
783	const int diff_y = ref[i] - src[i];
784	const int new_y = (int)dst[i] + diff_y;
785	dst[i] = clip_y(new_y);
786	diff += (uint64_t)abs(diff_y);
787	}
788	return diff;
789	}
790
// For each of the 'len' samples, add (ref[i] - src[i]) to dst[i].
// No clamping is applied.
static void SharpYUVUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
                                   int16_t* dst, int len) {
  int i;
  for (i = 0; i + 8 <= len; i += 8) {
    const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
    const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
    const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
    const __m128i D = _mm_sub_epi16(A, B);   // diff_uv
    const __m128i E = _mm_add_epi16(C, D);   // new_uv
    _mm_storeu_si128((__m128i*)(dst + i), E);
  }
  for (; i < len; ++i) {   // left-over, scalar
    const int diff_uv = ref[i] - src[i];
    dst[i] += diff_uv;
  }
}
807
808	static void SharpYUVFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
809	const uint16_t* best_y, uint16_t* out) {
810	int i;
811	const __m128i kCst8 = _mm_set1_epi16(`8`);
812	const __m128i max = _mm_set1_epi16(MAX_Y);
813	const __m128i zero = _mm_setzero_si128();
814	for (i = `0`; i + `8` <= len; i += `8`) {
815	const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + `0`));
816	const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + `1`));
817	const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + `0`));
818	const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + `1`));
819	const __m128i a0b1 = _mm_add_epi16(a0, b1);
820	const __m128i a1b0 = _mm_add_epi16(a1, b0);
821	const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0); // A0+A1+B0+B1
822	const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
823	const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1); // 2(A0+B1)*
824	const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0); // 2(A1+B0)*
825	const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), `3`);
826	const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), `3`);
827	const __m128i d0 = _mm_add_epi16(c1, a0);
828	const __m128i d1 = _mm_add_epi16(c0, a1);
829	const __m128i e0 = _mm_srai_epi16(d0, `1`);
830	const __m128i e1 = _mm_srai_epi16(d1, `1`);
831	const __m128i f0 = _mm_unpacklo_epi16(e0, e1);
832	const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
833	const __m128i g0 = _mm_loadu_si128((const __m128i)(best_y + `2` i + `0`));
834	const __m128i g1 = _mm_loadu_si128((const __m128i)(best_y + `2` i + `8`));
835	const __m128i h0 = _mm_add_epi16(g0, f0);
836	const __m128i h1 = _mm_add_epi16(g1, f1);
837	const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);
838	const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
839	_mm_storeu_si128((__m128i)(out + `2` i + `0`), i0);
840	_mm_storeu_si128((__m128i)(out + `2` i + `8`), i1);
841	}
842	for (; i < len; ++i) {
843	// (9 A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =*
844	// = (8 A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4*
845	// We reuse the common sub-expressions.
846	const int a0b1 = A[i + `0`] + B[i + `1`];
847	const int a1b0 = A[i + `1`] + B[i + `0`];
848	const int a0a1b0b1 = a0b1 + a1b0 + `8`;
849	const int v0 = (`8` * A[i + `0`] + `2` * a1b0 + a0a1b0b1) >> `4`;
850	const int v1 = (`8` * A[i + `1`] + `2` * a0b1 + a0a1b0b1) >> `4`;
851	out[`2` * i + `0`] = clip_y(best_y[`2` * i + `0`] + v0);
852	out[`2` * i + `1`] = clip_y(best_y[`2` * i + `1`] + v1);
853	}
854	}
855
856	#undef MAX_Y
857
858	//------------------------------------------------------------------------------
859
860	extern void WebPInitSharpYUVSSE2(void);
861
862	WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVSSE2(void) {
863	WebPSharpYUVUpdateY = SharpYUVUpdateY_SSE2;
864	WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_SSE2;
865	WebPSharpYUVFilterRow = SharpYUVFilterRow_SSE2;
866	}
867
868	#else // !WEBP_USE_SSE2
869
870	WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2)
871	WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2)
872	WEBP_DSP_INIT_STUB(WebPInitSharpYUVSSE2)
873
874	#endif // WEBP_USE_SSE2
875

Browse the source code of Skia/third_party/externals/libwebp/src/dsp/yuv_sse2.c