yuv_sse41.c source code [Skia/third_party/externals/libwebp/src/dsp/yuv_sse41.c]

1	// Copyright 2014 Google Inc. All Rights Reserved.
2	//
3	// Use of this source code is governed by a BSD-style license
4	// that can be found in the COPYING file in the root of the source
5	// tree. An additional intellectual property rights grant can be found
6	// in the file PATENTS. All contributing project authors may
7	// be found in the AUTHORS file in the root of the source tree.
8	// -----------------------------------------------------------------------------
9	//
10	// YUV->RGB conversion functions
11	//
12	// Author: Skal (pascal.massimino@gmail.com)
13
14	#include "src/dsp/yuv.h"
15
16	#if defined(WEBP_USE_SSE41)
17
18	#include "src/dsp/common_sse41.h"
19	#include <stdlib.h>
20	#include <smmintrin.h>
21
22	//-----------------------------------------------------------------------------
23	// Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
24
// These constants are 14b fixed-point version of ITU-R BT.601 constants.
//   R = (19077 * y             + 26149 * v - 14234) >> 6
//   G = (19077 * y -  6419 * u - 13320 * v +  8708) >> 6
//   B = (19077 * y + 33050 * u             - 17685) >> 6
//
// Converts the eight 16b lanes of *Y0, *U0 and *V0 into eight R, G and B
// values, written through the R, G and B pointers. The multiplications keep
// only the high 16 bits of each product (_mm_mulhi_epu16); the callers in this
// file therefore pass samples placed in the upper byte of each 16b lane.
// The results are 16b values that are NOT clamped to [0, 255] here: callers
// saturate them when packing down to 8b.
static void ConvertYUV444ToRGB_SSE41(const __m128i* const Y0,
                                     const __m128i* const U0,
                                     const __m128i* const V0,
                                     __m128i* const R,
                                     __m128i* const G,
                                     __m128i* const B) {
  const __m128i k19077 = _mm_set1_epi16(19077);
  const __m128i k26149 = _mm_set1_epi16(26149);
  const __m128i k14234 = _mm_set1_epi16(14234);
  // 33050 doesn't fit in a signed short: only use this with unsigned arithmetic
  const __m128i k33050 = _mm_set1_epi16((short)33050);
  const __m128i k17685 = _mm_set1_epi16(17685);
  const __m128i k6419  = _mm_set1_epi16(6419);
  const __m128i k13320 = _mm_set1_epi16(13320);
  const __m128i k8708  = _mm_set1_epi16(8708);

  // Luma contribution, shared by the three channels.
  const __m128i Y1 = _mm_mulhi_epu16(*Y0, k19077);

  const __m128i R0 = _mm_mulhi_epu16(*V0, k26149);
  const __m128i R1 = _mm_sub_epi16(Y1, k14234);
  const __m128i R2 = _mm_add_epi16(R1, R0);

  const __m128i G0 = _mm_mulhi_epu16(*U0, k6419);
  const __m128i G1 = _mm_mulhi_epu16(*V0, k13320);
  const __m128i G2 = _mm_add_epi16(Y1, k8708);
  const __m128i G3 = _mm_add_epi16(G0, G1);
  const __m128i G4 = _mm_sub_epi16(G2, G3);

  // be careful with the saturated *unsigned* arithmetic here!
  const __m128i B0 = _mm_mulhi_epu16(*U0, k33050);
  const __m128i B1 = _mm_adds_epu16(B0, Y1);
  const __m128i B2 = _mm_subs_epu16(B1, k17685);

  // use logical shift for B2, which can be larger than 32767
  *R = _mm_srai_epi16(R2, 6);   // range: [-14234, 30815]
  *G = _mm_srai_epi16(G4, 6);   // range: [-10953, 27710]
  *B = _mm_srli_epi16(B2, 6);   // range: [0, 34238]
}
67
68	// Load the bytes into the upper* part of 16b words. That's "<< 8", basically.*
69	static WEBP_INLINE __m128i Load_HI_16_SSE41(const uint8_t* src) {
70	const __m128i zero = _mm_setzero_si128();
71	return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src));
72	}
73
74	// Load and replicate the U/V samples
75	static WEBP_INLINE __m128i Load_UV_HI_8_SSE41(const uint8_t* src) {
76	const __m128i zero = _mm_setzero_si128();
77	const __m128i tmp0 = _mm_cvtsi32_si128((const* uint32_t*)src);
78	const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
79	return _mm_unpacklo_epi16(tmp1, tmp1); // replicate samples
80	}
81
82	// Convert 32 samples of YUV444 to R/G/B
83	static void YUV444ToRGB_SSE41(const uint8_t* const y,
84	const uint8_t* const u,
85	const uint8_t* const v,
86	__m128i* const R, __m128i* const G,
87	__m128i* const B) {
88	const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_HI_16_SSE41(u),
89	V0 = Load_HI_16_SSE41(v);
90	ConvertYUV444ToRGB_SSE41(&Y0, &U0, &V0, R, G, B);
91	}
92
93	// Convert 32 samples of YUV420 to R/G/B
94	static void YUV420ToRGB_SSE41(const uint8_t* const y,
95	const uint8_t* const u,
96	const uint8_t* const v,
97	__m128i* const R, __m128i* const G,
98	__m128i* const B) {
99	const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_UV_HI_8_SSE41(u),
100	V0 = Load_UV_HI_8_SSE41(v);
101	ConvertYUV444ToRGB_SSE41(&Y0, &U0, &V0, R, G, B);
102	}
103
104	// Pack the planar buffers
105	// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
106	// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
107	static WEBP_INLINE void PlanarTo24b_SSE41(
108	__m128i* const in0, __m128i* const in1, __m128i* const in2,
109	__m128i* const in3, __m128i* const in4, __m128i* const in5,
110	uint8_t* const rgb) {
111	// The input is 6 registers of sixteen 8b but for the sake of explanation,
112	// let's take 6 registers of four 8b values.
113	// To pack, we will keep taking one every two 8b integer and move it
114	// around as follows:
115	// Input:
116	// r0r1r2r3 \| r4r5r6r7 \| g0g1g2g3 \| g4g5g6g7 \| b0b1b2b3 \| b4b5b6b7
117	// Split the 6 registers in two sets of 3 registers: the first set as the even
118	// 8b bytes, the second the odd ones:
119	// r0r2r4r6 \| g0g2g4g6 \| b0b2b4b6 \| r1r3r5r7 \| g1g3g5g7 \| b1b3b5b7
120	// Repeat the same permutations twice more:
121	// r0r4g0g4 \| b0b4r1r5 \| g1g5b1b5 \| r2r6g2g6 \| b2b6r3r7 \| g3g7b3b7
122	// r0g0b0r1 \| g1b1r2g2 \| b2r3g3b3 \| r4g4b4r5 \| g5b5r6g6 \| b6r7g7b7
123	VP8PlanarTo24b_SSE41(in0, in1, in2, in3, in4, in5);
124
125	_mm_storeu_si128((__m128i)(rgb + `0`), in0);
126	_mm_storeu_si128((__m128i)(rgb + `16`), in1);
127	_mm_storeu_si128((__m128i)(rgb + `32`), in2);
128	_mm_storeu_si128((__m128i)(rgb + `48`), in3);
129	_mm_storeu_si128((__m128i)(rgb + `64`), in4);
130	_mm_storeu_si128((__m128i)(rgb + `80`), in5);
131	}
132
// Converts 32 YUV444 pixels (32 bytes read from each of 'y', 'u' and 'v') and
// writes them to 'dst' as 32 packed R,G,B triplets (96 bytes).
void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                         uint8_t* dst) {
  __m128i R[4], G[4], B[4];
  __m128i planes[6];
  int k;

  // Four groups of eight pixels each.
  for (k = 0; k < 4; ++k) {
    const int off = 8 * k;
    YUV444ToRGB_SSE41(y + off, u + off, v + off, &R[k], &G[k], &B[k]);
  }

  // Saturate to 8b; the planes are laid out as RRRRGGGGBBBB.
  planes[0] = _mm_packus_epi16(R[0], R[1]);
  planes[1] = _mm_packus_epi16(R[2], R[3]);
  planes[2] = _mm_packus_epi16(G[0], G[1]);
  planes[3] = _mm_packus_epi16(G[2], G[3]);
  planes[4] = _mm_packus_epi16(B[0], B[1]);
  planes[5] = _mm_packus_epi16(B[2], B[3]);

  // Interleave as RGBRGBRGBRGB and store.
  PlanarTo24b_SSE41(&planes[0], &planes[1], &planes[2],
                    &planes[3], &planes[4], &planes[5], dst);
}
154
// Converts 32 YUV444 pixels (32 bytes read from each of 'y', 'u' and 'v') and
// writes them to 'dst' as 32 packed B,G,R triplets (96 bytes).
void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                         uint8_t* dst) {
  __m128i R[4], G[4], B[4];
  __m128i planes[6];
  int k;

  // Four groups of eight pixels each.
  for (k = 0; k < 4; ++k) {
    const int off = 8 * k;
    YUV444ToRGB_SSE41(y + off, u + off, v + off, &R[k], &G[k], &B[k]);
  }

  // Saturate to 8b; the planes are laid out as BBBBGGGGRRRR.
  planes[0] = _mm_packus_epi16(B[0], B[1]);
  planes[1] = _mm_packus_epi16(B[2], B[3]);
  planes[2] = _mm_packus_epi16(G[0], G[1]);
  planes[3] = _mm_packus_epi16(G[2], G[3]);
  planes[4] = _mm_packus_epi16(R[0], R[1]);
  planes[5] = _mm_packus_epi16(R[2], R[3]);

  // Interleave as BGRBGRBGRBGR and store.
  PlanarTo24b_SSE41(&planes[0], &planes[1], &planes[2],
                    &planes[3], &planes[4], &planes[5], dst);
}
176
177	//-----------------------------------------------------------------------------
178	// Arbitrary-length row conversion functions
179
// Converts a row of 'len' YUV420 pixels to packed RGB (3 bytes per pixel) in
// 'dst'. 'u' and 'v' hold one sample for every two luma samples. Full spans of
// 32 pixels go through the SIMD path; the remainder is converted one pixel at
// a time with VP8YuvToRgb().
static void YuvToRgbRow_SSE41(const uint8_t* y,
                              const uint8_t* u, const uint8_t* v,
                              uint8_t* dst, int len) {
  int n = 0;
  while (n + 32 <= len) {
    __m128i R[4], G[4], B[4];
    __m128i planes[6];
    int k;

    // Each call consumes 8 luma samples and 4 samples of each chroma plane.
    for (k = 0; k < 4; ++k) {
      YUV420ToRGB_SSE41(y + 8 * k, u + 4 * k, v + 4 * k, &R[k], &G[k], &B[k]);
    }

    // Saturate to 8b; the planes are laid out as RRRRGGGGBBBB.
    planes[0] = _mm_packus_epi16(R[0], R[1]);
    planes[1] = _mm_packus_epi16(R[2], R[3]);
    planes[2] = _mm_packus_epi16(G[0], G[1]);
    planes[3] = _mm_packus_epi16(G[2], G[3]);
    planes[4] = _mm_packus_epi16(B[0], B[1]);
    planes[5] = _mm_packus_epi16(B[2], B[3]);

    // Interleave as RGBRGBRGBRGB and store.
    PlanarTo24b_SSE41(&planes[0], &planes[1], &planes[2],
                      &planes[3], &planes[4], &planes[5], dst);

    y += 32;
    u += 16;
    v += 16;
    dst += 32 * 3;
    n += 32;
  }
  while (n < len) {   // Finish off
    VP8YuvToRgb(y[0], u[0], v[0], dst);
    dst += 3;
    ++y;
    // Chroma only advances after an odd-indexed pixel.
    if (n & 1) {
      ++u;
      ++v;
    }
    ++n;
  }
}
216
// Converts a row of 'len' YUV420 pixels to packed BGR (3 bytes per pixel) in
// 'dst'. 'u' and 'v' hold one sample for every two luma samples. Full spans of
// 32 pixels go through the SIMD path; the remainder is converted one pixel at
// a time with VP8YuvToBgr().
static void YuvToBgrRow_SSE41(const uint8_t* y,
                              const uint8_t* u, const uint8_t* v,
                              uint8_t* dst, int len) {
  int n = 0;
  while (n + 32 <= len) {
    __m128i R[4], G[4], B[4];
    __m128i planes[6];
    int k;

    // Each call consumes 8 luma samples and 4 samples of each chroma plane.
    for (k = 0; k < 4; ++k) {
      YUV420ToRGB_SSE41(y + 8 * k, u + 4 * k, v + 4 * k, &R[k], &G[k], &B[k]);
    }

    // Saturate to 8b; the planes are laid out as BBBBGGGGRRRR.
    planes[0] = _mm_packus_epi16(B[0], B[1]);
    planes[1] = _mm_packus_epi16(B[2], B[3]);
    planes[2] = _mm_packus_epi16(G[0], G[1]);
    planes[3] = _mm_packus_epi16(G[2], G[3]);
    planes[4] = _mm_packus_epi16(R[0], R[1]);
    planes[5] = _mm_packus_epi16(R[2], R[3]);

    // Interleave as BGRBGRBGRBGR and store.
    PlanarTo24b_SSE41(&planes[0], &planes[1], &planes[2],
                      &planes[3], &planes[4], &planes[5], dst);

    y += 32;
    u += 16;
    v += 16;
    dst += 32 * 3;
    n += 32;
  }
  while (n < len) {   // Finish off
    VP8YuvToBgr(y[0], u[0], v[0], dst);
    dst += 3;
    ++y;
    // Chroma only advances after an odd-indexed pixel.
    if (n & 1) {
      ++u;
      ++v;
    }
    ++n;
  }
}
253
254	//------------------------------------------------------------------------------
255	// Entry point
256
257	extern void WebPInitSamplersSSE41(void);
258
259	WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE41(void) {
260	WebPSamplers[MODE_RGB] = YuvToRgbRow_SSE41;
261	WebPSamplers[MODE_BGR] = YuvToBgrRow_SSE41;
262	}
263
264	//------------------------------------------------------------------------------
265	// RGB24/32 -> YUV converters
266
// Load eight 16b-words (one full 128b register) from 'ptr'; no alignment
// is required.
#define LOAD_16(ptr) _mm_loadu_si128((const __m128i*)(ptr))
// Store eight 16b-words (one full 128b register) to 'ptr'; no alignment
// is required.
#define STORE_16(val, ptr) _mm_storeu_si128((__m128i*)(ptr), (val))
271
272	#define WEBP_SSE41_SHUFF(OUT) do { \
273	const __m128i tmp0 = _mm_shuffle_epi8(A0, shuff0); \
274	const __m128i tmp1 = _mm_shuffle_epi8(A1, shuff1); \
275	const __m128i tmp2 = _mm_shuffle_epi8(A2, shuff2); \
276	const __m128i tmp3 = _mm_shuffle_epi8(A3, shuff0); \
277	const __m128i tmp4 = _mm_shuffle_epi8(A4, shuff1); \
278	const __m128i tmp5 = _mm_shuffle_epi8(A5, shuff2); \
279	\
280	/* OR everything to get one channel */ \
281	const __m128i tmp6 = _mm_or_si128(tmp0, tmp1); \
282	const __m128i tmp7 = _mm_or_si128(tmp3, tmp4); \
283	out[OUT + 0] = _mm_or_si128(tmp6, tmp2); \
284	out[OUT + 1] = _mm_or_si128(tmp7, tmp5); \
285	} while (0);
286
287	// Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers:
288	// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
289	// Similar to PlanarTo24bHelper(), but in reverse order.
290	static WEBP_INLINE void RGB24PackedToPlanar_SSE41(
291	const uint8_t* const rgb, __m128i* const out /out[6]/) {
292	const __m128i A0 = _mm_loadu_si128((const __m128i*)(rgb + `0`));
293	const __m128i A1 = _mm_loadu_si128((const __m128i*)(rgb + `16`));
294	const __m128i A2 = _mm_loadu_si128((const __m128i*)(rgb + `32`));
295	const __m128i A3 = _mm_loadu_si128((const __m128i*)(rgb + `48`));
296	const __m128i A4 = _mm_loadu_si128((const __m128i*)(rgb + `64`));
297	const __m128i A5 = _mm_loadu_si128((const __m128i*)(rgb + `80`));
298
299	// Compute RR.
300	{
301	const __m128i shuff0 = _mm_set_epi8(
302	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `15`, `12`, `9`, `6`, `3`, `0`);
303	const __m128i shuff1 = _mm_set_epi8(
304	-`1`, -`1`, -`1`, -`1`, -`1`, `14`, `11`, `8`, `5`, `2`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`);
305	const __m128i shuff2 = _mm_set_epi8(
306	`13`, `10`, `7`, `4`, `1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`);
307	WEBP_SSE41_SHUFF(`0`)
308	}
309	// Compute GG.
310	{
311	const __m128i shuff0 = _mm_set_epi8(
312	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `13`, `10`, `7`, `4`, `1`);
313	const __m128i shuff1 = _mm_set_epi8(
314	-`1`, -`1`, -`1`, -`1`, -`1`, `15`, `12`, `9`, `6`, `3`, `0`, -`1`, -`1`, -`1`, -`1`, -`1`);
315	const __m128i shuff2 = _mm_set_epi8(
316	`14`, `11`, `8`, `5`, `2`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`);
317	WEBP_SSE41_SHUFF(`2`)
318	}
319	// Compute BB.
320	{
321	const __m128i shuff0 = _mm_set_epi8(
322	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `14`, `11`, `8`, `5`, `2`);
323	const __m128i shuff1 = _mm_set_epi8(
324	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `13`, `10`, `7`, `4`, `1`, -`1`, -`1`, -`1`, -`1`, -`1`);
325	const __m128i shuff2 = _mm_set_epi8(
326	`15`, `12`, `9`, `6`, `3`, `0`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`);
327	WEBP_SSE41_SHUFF(`4`)
328	}
329	}
330
331	#undef WEBP_SSE41_SHUFF
332
333	// Convert 8 packed ARGB to r[], g[], b[]
334	static WEBP_INLINE void RGB32PackedToPlanar_SSE41(
335	const uint32_t* const argb, __m128i* const rgb /in[6]/) {
336	const __m128i zero = _mm_setzero_si128();
337	__m128i a0 = LOAD_16(argb + `0`);
338	__m128i a1 = LOAD_16(argb + `4`);
339	__m128i a2 = LOAD_16(argb + `8`);
340	__m128i a3 = LOAD_16(argb + `12`);
341	VP8L32bToPlanar_SSE41(&a0, &a1, &a2, &a3);
342	rgb[`0`] = _mm_unpacklo_epi8(a1, zero);
343	rgb[`1`] = _mm_unpackhi_epi8(a1, zero);
344	rgb[`2`] = _mm_unpacklo_epi8(a2, zero);
345	rgb[`3`] = _mm_unpackhi_epi8(a2, zero);
346	rgb[`4`] = _mm_unpacklo_epi8(a3, zero);
347	rgb[`5`] = _mm_unpackhi_epi8(a3, zero);
348	}
349
// This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX
// on 32b intermediates, for a low and a high half, and packs both halves into
// OUT as eight signed-saturated 16b values.
// It's a macro and not a function because we need to use immediate values with
// srai_epi32.
#define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB,             \
                  ROUNDER, DESCALE_FIX, OUT) do {                           \
  const __m128i rg_prod_lo = _mm_madd_epi16(RG_LO, MULT_RG);                \
  const __m128i rg_prod_hi = _mm_madd_epi16(RG_HI, MULT_RG);                \
  const __m128i gb_prod_lo = _mm_madd_epi16(GB_LO, MULT_GB);                \
  const __m128i gb_prod_hi = _mm_madd_epi16(GB_HI, MULT_GB);                \
  const __m128i sum_lo = _mm_add_epi32(rg_prod_lo, gb_prod_lo);             \
  const __m128i sum_hi = _mm_add_epi32(rg_prod_hi, gb_prod_hi);             \
  const __m128i rounded_lo = _mm_add_epi32(sum_lo, ROUNDER);                \
  const __m128i rounded_hi = _mm_add_epi32(sum_hi, ROUNDER);                \
  const __m128i scaled_lo = _mm_srai_epi32(rounded_lo, DESCALE_FIX);        \
  const __m128i scaled_hi = _mm_srai_epi32(rounded_hi, DESCALE_FIX);        \
  (OUT) = _mm_packs_epi32(scaled_lo, scaled_hi);                            \
} while (0)

// Builds a register of eight 16b lanes alternating A, B, A, B, ... starting
// with A in the lowest lane.
#define MK_CST_16(A, B) \
  _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))
369	static WEBP_INLINE void ConvertRGBToY_SSE41(const __m128i* const R,
370	const __m128i* const G,
371	const __m128i* const B,
372	__m128i* const Y) {
373	const __m128i kRG_y = MK_CST_16(`16839`, `33059` - `16384`);
374	const __m128i kGB_y = MK_CST_16(`16384`, `6420`);
375	const __m128i kHALF_Y = _mm_set1_epi32((`16` << YUV_FIX) + YUV_HALF);
376
377	const __m128i RG_lo = _mm_unpacklo_epi16(R, G);
378	const __m128i RG_hi = _mm_unpackhi_epi16(R, G);
379	const __m128i GB_lo = _mm_unpacklo_epi16(G, B);
380	const __m128i GB_hi = _mm_unpackhi_epi16(G, B);
381	TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y);
382	}
383
384	static WEBP_INLINE void ConvertRGBToUV_SSE41(const __m128i* const R,
385	const __m128i* const G,
386	const __m128i* const B,
387	__m128i* const U,
388	__m128i* const V) {
389	const __m128i kRG_u = MK_CST_16(-`9719`, -`19081`);
390	const __m128i kGB_u = MK_CST_16(`0`, `28800`);
391	const __m128i kRG_v = MK_CST_16(`28800`, `0`);
392	const __m128i kGB_v = MK_CST_16(-`24116`, -`4684`);
393	const __m128i kHALF_UV = _mm_set1_epi32(((`128` << YUV_FIX) + YUV_HALF) << `2`);
394
395	const __m128i RG_lo = _mm_unpacklo_epi16(R, G);
396	const __m128i RG_hi = _mm_unpackhi_epi16(R, G);
397	const __m128i GB_lo = _mm_unpacklo_epi16(G, B);
398	const __m128i GB_hi = _mm_unpackhi_epi16(G, B);
399	TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_u, kGB_u,
400	kHALF_UV, YUV_FIX + `2`, *U);
401	TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_v, kGB_v,
402	kHALF_UV, YUV_FIX + `2`, *V);
403	}
404
405	#undef MK_CST_16
406	#undef TRANSFORM
407
408	static void ConvertRGB24ToY_SSE41(const uint8_t* rgb, uint8_t* y, int width) {
409	const int max_width = width & ~`31`;
410	int i;
411	for (i = `0`; i < max_width; rgb += `3` * `16` * `2`) {
412	__m128i rgb_plane[`6`];
413	int j;
414
415	RGB24PackedToPlanar_SSE41(rgb, rgb_plane);
416
417	for (j = `0`; j < `2`; ++j, i += `16`) {
418	const __m128i zero = _mm_setzero_si128();
419	__m128i r, g, b, Y0, Y1;
420
421	// Convert to 16-bit Y.
422	r = _mm_unpacklo_epi8(rgb_plane[`0` + j], zero);
423	g = _mm_unpacklo_epi8(rgb_plane[`2` + j], zero);
424	b = _mm_unpacklo_epi8(rgb_plane[`4` + j], zero);
425	ConvertRGBToY_SSE41(&r, &g, &b, &Y0);
426
427	// Convert to 16-bit Y.
428	r = _mm_unpackhi_epi8(rgb_plane[`0` + j], zero);
429	g = _mm_unpackhi_epi8(rgb_plane[`2` + j], zero);
430	b = _mm_unpackhi_epi8(rgb_plane[`4` + j], zero);
431	ConvertRGBToY_SSE41(&r, &g, &b, &Y1);
432
433	// Cast to 8-bit and store.
434	STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
435	}
436	}
437	for (; i < width; ++i, rgb += `3`) { // left-over
438	y[i] = VP8RGBToY(rgb[`0`], rgb[`1`], rgb[`2`], YUV_HALF);
439	}
440	}
441
442	static void ConvertBGR24ToY_SSE41(const uint8_t* bgr, uint8_t* y, int width) {
443	const int max_width = width & ~`31`;
444	int i;
445	for (i = `0`; i < max_width; bgr += `3` * `16` * `2`) {
446	__m128i bgr_plane[`6`];
447	int j;
448
449	RGB24PackedToPlanar_SSE41(bgr, bgr_plane);
450
451	for (j = `0`; j < `2`; ++j, i += `16`) {
452	const __m128i zero = _mm_setzero_si128();
453	__m128i r, g, b, Y0, Y1;
454
455	// Convert to 16-bit Y.
456	b = _mm_unpacklo_epi8(bgr_plane[`0` + j], zero);
457	g = _mm_unpacklo_epi8(bgr_plane[`2` + j], zero);
458	r = _mm_unpacklo_epi8(bgr_plane[`4` + j], zero);
459	ConvertRGBToY_SSE41(&r, &g, &b, &Y0);
460
461	// Convert to 16-bit Y.
462	b = _mm_unpackhi_epi8(bgr_plane[`0` + j], zero);
463	g = _mm_unpackhi_epi8(bgr_plane[`2` + j], zero);
464	r = _mm_unpackhi_epi8(bgr_plane[`4` + j], zero);
465	ConvertRGBToY_SSE41(&r, &g, &b, &Y1);
466
467	// Cast to 8-bit and store.
468	STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
469	}
470	}
471	for (; i < width; ++i, bgr += `3`) { // left-over
472	y[i] = VP8RGBToY(bgr[`2`], bgr[`1`], bgr[`0`], YUV_HALF);
473	}
474	}
475
476	static void ConvertARGBToY_SSE41(const uint32_t* argb, uint8_t* y, int width) {
477	const int max_width = width & ~`15`;
478	int i;
479	for (i = `0`; i < max_width; i += `16`) {
480	__m128i Y0, Y1, rgb[`6`];
481	RGB32PackedToPlanar_SSE41(&argb[i], rgb);
482	ConvertRGBToY_SSE41(&rgb[`0`], &rgb[`2`], &rgb[`4`], &Y0);
483	ConvertRGBToY_SSE41(&rgb[`1`], &rgb[`3`], &rgb[`5`], &Y1);
484	STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
485	}
486	for (; i < width; ++i) { // left-over
487	const uint32_t p = argb[i];
488	y[i] = VP8RGBToY((p >> `16`) & `0xff`, (p >> `8`) & `0xff`, (p >> `0`) & `0xff`,
489	YUV_HALF);
490	}
491	}
492
// Horizontal add (doubled) of adjacent pairs of 16b values, result is 16b.
// in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ...
// The four pair sums of *A fill the lower half of *out and the four pair sums
// of *B the upper half, with signed saturation to 16b. 'out' may point to the
// same register as 'A' or 'B': both inputs are read before *out is written.
static void HorizontalAddPack_SSE41(const __m128i* const A,
                                    const __m128i* const B,
                                    __m128i* const out) {
  const __m128i twos = _mm_set1_epi16(2);
  // madd multiplies every lane by 2 and adds neighbouring lanes as 32b sums.
  const __m128i pair_sums_a = _mm_madd_epi16(*A, twos);
  const __m128i pair_sums_b = _mm_madd_epi16(*B, twos);
  *out = _mm_packs_epi32(pair_sums_a, pair_sums_b);
}
503
504	static void ConvertARGBToUV_SSE41(const uint32_t* argb,
505	uint8_t* u, uint8_t* v,
506	int src_width, int do_store) {
507	const int max_width = src_width & ~`31`;
508	int i;
509	for (i = `0`; i < max_width; i += `32`, u += `16`, v += `16`) {
510	__m128i rgb[`6`], U0, V0, U1, V1;
511	RGB32PackedToPlanar_SSE41(&argb[i], rgb);
512	HorizontalAddPack_SSE41(&rgb[`0`], &rgb[`1`], &rgb[`0`]);
513	HorizontalAddPack_SSE41(&rgb[`2`], &rgb[`3`], &rgb[`2`]);
514	HorizontalAddPack_SSE41(&rgb[`4`], &rgb[`5`], &rgb[`4`]);
515	ConvertRGBToUV_SSE41(&rgb[`0`], &rgb[`2`], &rgb[`4`], &U0, &V0);
516
517	RGB32PackedToPlanar_SSE41(&argb[i + `16`], rgb);
518	HorizontalAddPack_SSE41(&rgb[`0`], &rgb[`1`], &rgb[`0`]);
519	HorizontalAddPack_SSE41(&rgb[`2`], &rgb[`3`], &rgb[`2`]);
520	HorizontalAddPack_SSE41(&rgb[`4`], &rgb[`5`], &rgb[`4`]);
521	ConvertRGBToUV_SSE41(&rgb[`0`], &rgb[`2`], &rgb[`4`], &U1, &V1);
522
523	U0 = _mm_packus_epi16(U0, U1);
524	V0 = _mm_packus_epi16(V0, V1);
525	if (!do_store) {
526	const __m128i prev_u = LOAD_16(u);
527	const __m128i prev_v = LOAD_16(v);
528	U0 = _mm_avg_epu8(U0, prev_u);
529	V0 = _mm_avg_epu8(V0, prev_v);
530	}
531	STORE_16(U0, u);
532	STORE_16(V0, v);
533	}
534	if (i < src_width) { // left-over
535	WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
536	}
537	}
538
539	// Convert 16 packed ARGB 16b-values to r[], g[], b[]
540	static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41(
541	const uint16_t* const rgbx,
542	__m128i* const r, __m128i* const g, __m128i* const b) {
543	const __m128i in0 = LOAD_16(rgbx + `0`); // r0 \| g0 \| b0 \|x\| r1 \| g1 \| b1 \|x
544	const __m128i in1 = LOAD_16(rgbx + `8`); // r2 \| g2 \| b2 \|x\| r3 \| g3 \| b3 \|x
545	const __m128i in2 = LOAD_16(rgbx + `16`); // r4 \| ...
546	const __m128i in3 = LOAD_16(rgbx + `24`); // r6 \| ...
547	// aarrggbb as 16-bit.
548	const __m128i shuff0 =
549	_mm_set_epi8(-`1`, -`1`, -`1`, -`1`, `13`, `12`, `5`, `4`, `11`, `10`, `3`, `2`, `9`, `8`, `1`, `0`);
550	const __m128i shuff1 =
551	_mm_set_epi8(`13`, `12`, `5`, `4`, -`1`, -`1`, -`1`, -`1`, `11`, `10`, `3`, `2`, `9`, `8`, `1`, `0`);
552	const __m128i A0 = _mm_shuffle_epi8(in0, shuff0);
553	const __m128i A1 = _mm_shuffle_epi8(in1, shuff1);
554	const __m128i A2 = _mm_shuffle_epi8(in2, shuff0);
555	const __m128i A3 = _mm_shuffle_epi8(in3, shuff1);
556	// R0R1G0G1
557	// B0B1****
558	// R2R3G2G3
559	// B2B3****
560	// (OR is used to free port 5 for the unpack)
561	const __m128i B0 = _mm_unpacklo_epi32(A0, A1);
562	const __m128i B1 = _mm_or_si128(A0, A1);
563	const __m128i B2 = _mm_unpacklo_epi32(A2, A3);
564	const __m128i B3 = _mm_or_si128(A2, A3);
565	// Gather the channels.
566	*r = _mm_unpacklo_epi64(B0, B2);
567	*g = _mm_unpackhi_epi64(B0, B2);
568	*b = _mm_unpackhi_epi64(B1, B3);
569	}
570
// Computes 'width' U and V samples from 'width' pixels stored as four 16b
// values each in 'rgb'. Spans of 16 pixels use the SIMD path (two batches of
// eight pixels); the remainder is delegated to WebPConvertRGBA32ToUV_C().
static void ConvertRGBA32ToUV_SSE41(const uint16_t* rgb,
                                    uint8_t* u, uint8_t* v, int width) {
  const int max_width = width & ~15;
  const uint16_t* const simd_end = rgb + 4 * max_width;
  for (; rgb < simd_end; rgb += 2 * 32, u += 16, v += 16) {
    __m128i r, g, b, u_lo, v_lo, u_hi, v_hi;
    // First eight pixels (32 values).
    RGBA32PackedToPlanar_16b_SSE41(rgb + 0, &r, &g, &b);
    ConvertRGBToUV_SSE41(&r, &g, &b, &u_lo, &v_lo);
    // Next eight pixels.
    RGBA32PackedToPlanar_16b_SSE41(rgb + 32, &r, &g, &b);
    ConvertRGBToUV_SSE41(&r, &g, &b, &u_hi, &v_hi);
    // Saturate to 8b and store 16 samples per plane.
    STORE_16(_mm_packus_epi16(u_lo, u_hi), u);
    STORE_16(_mm_packus_epi16(v_lo, v_hi), v);
  }
  if (max_width < width) {  // left-over
    WebPConvertRGBA32ToUV_C(rgb, u, v, width - max_width);
  }
}
591
592	//------------------------------------------------------------------------------
593
594	extern void WebPInitConvertARGBToYUVSSE41(void);
595
596	WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE41(void) {
597	WebPConvertARGBToY = ConvertARGBToY_SSE41;
598	WebPConvertARGBToUV = ConvertARGBToUV_SSE41;
599
600	WebPConvertRGB24ToY = ConvertRGB24ToY_SSE41;
601	WebPConvertBGR24ToY = ConvertBGR24ToY_SSE41;
602
603	WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE41;
604	}
605
606	//------------------------------------------------------------------------------
607
608	#else // !WEBP_USE_SSE41
609
610	WEBP_DSP_INIT_STUB(WebPInitSamplersSSE41)
611	WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE41)
612
613	#endif // WEBP_USE_SSE41
614

Browse the source code of Skia/third_party/externals/libwebp/src/dsp/yuv_sse41.c