1// Copyright 2014 Google Inc. All Rights Reserved.
2//
3// Use of this source code is governed by a BSD-style license
4// that can be found in the COPYING file in the root of the source
5// tree. An additional intellectual property rights grant can be found
6// in the file PATENTS. All contributing project authors may
7// be found in the AUTHORS file in the root of the source tree.
8// -----------------------------------------------------------------------------
9//
10// YUV->RGB conversion functions
11//
12// Author: Skal (pascal.massimino@gmail.com)
13
14#include "src/dsp/yuv.h"
15
16#if defined(WEBP_USE_SSE2)
17
18#include "src/dsp/common_sse2.h"
19#include <stdlib.h>
20#include <emmintrin.h>
21
22//-----------------------------------------------------------------------------
23// Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
24
// Fixed-point conversion of eight YUV samples to R/G/B, using 14b versions of
// the ITU-R BT.601 constants:
//   R = (19077 * y             + 26149 * v - 14234) >> 6
//   G = (19077 * y -  6419 * u - 13320 * v +  8708) >> 6
//   B = (19077 * y + 33050 * u             - 17685) >> 6
// Products are taken with _mm_mulhi_epu16(), i.e. only the upper 16 bits of
// each 16x16 multiply are kept. The results are written as 16b lanes; no
// clamping to 8b is done here.
static void ConvertYUV444ToRGB_SSE2(const __m128i* const Y0,
                                    const __m128i* const U0,
                                    const __m128i* const V0,
                                    __m128i* const R,
                                    __m128i* const G,
                                    __m128i* const B) {
  // Luma contribution, shared by the three channels.
  const __m128i luma = _mm_mulhi_epu16(*Y0, _mm_set1_epi16(19077));

  // Red: luma - 14234 + 26149 * v.
  const __m128i r_chroma = _mm_mulhi_epu16(*V0, _mm_set1_epi16(26149));
  const __m128i r_biased = _mm_sub_epi16(luma, _mm_set1_epi16(14234));
  const __m128i r_full = _mm_add_epi16(r_biased, r_chroma);

  // Green: luma + 8708 - (6419 * u + 13320 * v).
  const __m128i g_from_u = _mm_mulhi_epu16(*U0, _mm_set1_epi16(6419));
  const __m128i g_from_v = _mm_mulhi_epu16(*V0, _mm_set1_epi16(13320));
  const __m128i g_biased = _mm_add_epi16(luma, _mm_set1_epi16(8708));
  const __m128i g_chroma = _mm_add_epi16(g_from_u, g_from_v);
  const __m128i g_full = _mm_sub_epi16(g_biased, g_chroma);

  // Blue: 33050 doesn't fit in a signed short, so the constant is only used
  // with unsigned operations, and the sum/difference use *saturating unsigned*
  // arithmetic.
  const __m128i b_chroma =
      _mm_mulhi_epu16(*U0, _mm_set1_epi16((short)33050));
  const __m128i b_sum = _mm_adds_epu16(b_chroma, luma);
  const __m128i b_full = _mm_subs_epu16(b_sum, _mm_set1_epi16(17685));

  *R = _mm_srai_epi16(r_full, 6);
  *G = _mm_srai_epi16(g_full, 6);
  // Logical shift for blue: the unsigned value can be larger than 32767.
  *B = _mm_srli_epi16(b_full, 6);
}
67
68// Load the bytes into the *upper* part of 16b words. That's "<< 8", basically.
69static WEBP_INLINE __m128i Load_HI_16_SSE2(const uint8_t* src) {
70 const __m128i zero = _mm_setzero_si128();
71 return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src));
72}
73
74// Load and replicate the U/V samples
75static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) {
76 const __m128i zero = _mm_setzero_si128();
77 const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src);
78 const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
79 return _mm_unpacklo_epi16(tmp1, tmp1); // replicate samples
80}
81
82// Convert 32 samples of YUV444 to R/G/B
83static void YUV444ToRGB_SSE2(const uint8_t* const y,
84 const uint8_t* const u,
85 const uint8_t* const v,
86 __m128i* const R, __m128i* const G,
87 __m128i* const B) {
88 const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_HI_16_SSE2(u),
89 V0 = Load_HI_16_SSE2(v);
90 ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B);
91}
92
93// Convert 32 samples of YUV420 to R/G/B
94static void YUV420ToRGB_SSE2(const uint8_t* const y,
95 const uint8_t* const u,
96 const uint8_t* const v,
97 __m128i* const R, __m128i* const G,
98 __m128i* const B) {
99 const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_UV_HI_8_SSE2(u),
100 V0 = Load_UV_HI_8_SSE2(v);
101 ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B);
102}
103
104// Pack R/G/B/A results into 32b output.
105static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R,
106 const __m128i* const G,
107 const __m128i* const B,
108 const __m128i* const A,
109 uint8_t* const dst) {
110 const __m128i rb = _mm_packus_epi16(*R, *B);
111 const __m128i ga = _mm_packus_epi16(*G, *A);
112 const __m128i rg = _mm_unpacklo_epi8(rb, ga);
113 const __m128i ba = _mm_unpackhi_epi8(rb, ga);
114 const __m128i RGBA_lo = _mm_unpacklo_epi16(rg, ba);
115 const __m128i RGBA_hi = _mm_unpackhi_epi16(rg, ba);
116 _mm_storeu_si128((__m128i*)(dst + 0), RGBA_lo);
117 _mm_storeu_si128((__m128i*)(dst + 16), RGBA_hi);
118}
119
120// Pack R/G/B/A results into 16b output.
121static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R,
122 const __m128i* const G,
123 const __m128i* const B,
124 const __m128i* const A,
125 uint8_t* const dst) {
126#if (WEBP_SWAP_16BIT_CSP == 0)
127 const __m128i rg0 = _mm_packus_epi16(*R, *G);
128 const __m128i ba0 = _mm_packus_epi16(*B, *A);
129#else
130 const __m128i rg0 = _mm_packus_epi16(*B, *A);
131 const __m128i ba0 = _mm_packus_epi16(*R, *G);
132#endif
133 const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);
134 const __m128i rb1 = _mm_unpacklo_epi8(rg0, ba0); // rbrbrbrbrb...
135 const __m128i ga1 = _mm_unpackhi_epi8(rg0, ba0); // gagagagaga...
136 const __m128i rb2 = _mm_and_si128(rb1, mask_0xf0);
137 const __m128i ga2 = _mm_srli_epi16(_mm_and_si128(ga1, mask_0xf0), 4);
138 const __m128i rgba4444 = _mm_or_si128(rb2, ga2);
139 _mm_storeu_si128((__m128i*)dst, rgba4444);
140}
141
142// Pack R/G/B results into 16b output.
143static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R,
144 const __m128i* const G,
145 const __m128i* const B,
146 uint8_t* const dst) {
147 const __m128i r0 = _mm_packus_epi16(*R, *R);
148 const __m128i g0 = _mm_packus_epi16(*G, *G);
149 const __m128i b0 = _mm_packus_epi16(*B, *B);
150 const __m128i r1 = _mm_and_si128(r0, _mm_set1_epi8(0xf8));
151 const __m128i b1 = _mm_and_si128(_mm_srli_epi16(b0, 3), _mm_set1_epi8(0x1f));
152 const __m128i g1 = _mm_srli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0xe0)), 5);
153 const __m128i g2 = _mm_slli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0x1c)), 3);
154 const __m128i rg = _mm_or_si128(r1, g1);
155 const __m128i gb = _mm_or_si128(g2, b1);
156#if (WEBP_SWAP_16BIT_CSP == 0)
157 const __m128i rgb565 = _mm_unpacklo_epi8(rg, gb);
158#else
159 const __m128i rgb565 = _mm_unpacklo_epi8(gb, rg);
160#endif
161 _mm_storeu_si128((__m128i*)dst, rgb565);
162}
163
164// Pack the planar buffers
165// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
166// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
167static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
168 __m128i* const in2, __m128i* const in3,
169 __m128i* const in4, __m128i* const in5,
170 uint8_t* const rgb) {
171 // The input is 6 registers of sixteen 8b but for the sake of explanation,
172 // let's take 6 registers of four 8b values.
173 // To pack, we will keep taking one every two 8b integer and move it
174 // around as follows:
175 // Input:
176 // r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7
177 // Split the 6 registers in two sets of 3 registers: the first set as the even
178 // 8b bytes, the second the odd ones:
179 // r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7
180 // Repeat the same permutations twice more:
181 // r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7
182 // r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
183 VP8PlanarTo24b_SSE2(in0, in1, in2, in3, in4, in5);
184
185 _mm_storeu_si128((__m128i*)(rgb + 0), *in0);
186 _mm_storeu_si128((__m128i*)(rgb + 16), *in1);
187 _mm_storeu_si128((__m128i*)(rgb + 32), *in2);
188 _mm_storeu_si128((__m128i*)(rgb + 48), *in3);
189 _mm_storeu_si128((__m128i*)(rgb + 64), *in4);
190 _mm_storeu_si128((__m128i*)(rgb + 80), *in5);
191}
192
// Converts 32 YUV444 pixels to 32 RGBA pixels (128 bytes written to 'dst'),
// with alpha set to 255.
void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                         uint8_t* dst) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int group;
  // Four groups of eight pixels, each producing 8 * 4 output bytes.
  for (group = 0; group < 4; ++group) {
    const int px = 8 * group;
    __m128i R, G, B;
    YUV444ToRGB_SSE2(y + px, u + px, v + px, &R, &G, &B);
    PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst + 4 * px);
  }
}
203
// Converts 32 YUV444 pixels to 32 BGRA pixels (128 bytes written to 'dst'),
// with alpha set to 255.
void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                         uint8_t* dst) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int group;
  // Four groups of eight pixels, each producing 8 * 4 output bytes.
  for (group = 0; group < 4; ++group) {
    const int px = 8 * group;
    __m128i R, G, B;
    YUV444ToRGB_SSE2(y + px, u + px, v + px, &R, &G, &B);
    PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst + 4 * px);
  }
}
214
// Converts 32 YUV444 pixels to 32 ARGB pixels (128 bytes written to 'dst'),
// with alpha set to 255.
void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                         uint8_t* dst) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int group;
  // Four groups of eight pixels, each producing 8 * 4 output bytes.
  for (group = 0; group < 4; ++group) {
    const int px = 8 * group;
    __m128i R, G, B;
    YUV444ToRGB_SSE2(y + px, u + px, v + px, &R, &G, &B);
    PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst + 4 * px);
  }
}
225
// Converts 32 YUV444 pixels to 32 RGBA4444 pixels (64 bytes written to
// 'dst'), using an alpha of 255 before the reduction to four bits.
void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
                             const uint8_t* v, uint8_t* dst) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int group;
  // Four groups of eight pixels, each producing 8 * 2 output bytes.
  for (group = 0; group < 4; ++group) {
    const int px = 8 * group;
    __m128i R, G, B;
    YUV444ToRGB_SSE2(y + px, u + px, v + px, &R, &G, &B);
    PackAndStore4444_SSE2(&R, &G, &B, &kAlpha, dst + 2 * px);
  }
}
236
// Converts 32 YUV444 pixels to 32 RGB565 pixels (64 bytes written to 'dst').
void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                           uint8_t* dst) {
  int group;
  // Four groups of eight pixels, each producing 8 * 2 output bytes.
  for (group = 0; group < 4; ++group) {
    const int px = 8 * group;
    __m128i R, G, B;
    YUV444ToRGB_SSE2(y + px, u + px, v + px, &R, &G, &B);
    PackAndStore565_SSE2(&R, &G, &B, dst + 2 * px);
  }
}
246
// Converts 32 YUV444 pixels to 32 packed RGB pixels (96 bytes written to
// 'dst').
void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                        uint8_t* dst) {
  __m128i R[4], G[4], B[4];
  __m128i planes[6];
  int k;

  // Four batches of eight pixels.
  for (k = 0; k < 4; ++k) {
    YUV444ToRGB_SSE2(y + 8 * k, u + 8 * k, v + 8 * k, &R[k], &G[k], &B[k]);
  }

  // Saturate to 8b, laid out as RRRRGGGGBBBB.
  planes[0] = _mm_packus_epi16(R[0], R[1]);
  planes[1] = _mm_packus_epi16(R[2], R[3]);
  planes[2] = _mm_packus_epi16(G[0], G[1]);
  planes[3] = _mm_packus_epi16(G[2], G[3]);
  planes[4] = _mm_packus_epi16(B[0], B[1]);
  planes[5] = _mm_packus_epi16(B[2], B[3]);

  // Interleave as RGBRGBRGBRGB and store.
  PlanarTo24b_SSE2(&planes[0], &planes[1], &planes[2],
                   &planes[3], &planes[4], &planes[5], dst);
}
268
// Converts 32 YUV444 pixels to 32 packed BGR pixels (96 bytes written to
// 'dst').
void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                        uint8_t* dst) {
  __m128i R[4], G[4], B[4];
  __m128i planes[6];
  int k;

  // Four batches of eight pixels.
  for (k = 0; k < 4; ++k) {
    YUV444ToRGB_SSE2(y + 8 * k, u + 8 * k, v + 8 * k, &R[k], &G[k], &B[k]);
  }

  // Saturate to 8b, laid out as BBBBGGGGRRRR.
  planes[0] = _mm_packus_epi16(B[0], B[1]);
  planes[1] = _mm_packus_epi16(B[2], B[3]);
  planes[2] = _mm_packus_epi16(G[0], G[1]);
  planes[3] = _mm_packus_epi16(G[2], G[3]);
  planes[4] = _mm_packus_epi16(R[0], R[1]);
  planes[5] = _mm_packus_epi16(R[2], R[3]);

  // Interleave as BGRBGRBGRBGR and store.
  PlanarTo24b_SSE2(&planes[0], &planes[1], &planes[2],
                   &planes[3], &planes[4], &planes[5], dst);
}
290
291//-----------------------------------------------------------------------------
292// Arbitrary-length row conversion functions
293
// Converts a row of 'len' YUV420 pixels to RGBA (4 bytes per pixel in 'dst').
// One u/v sample is consumed for every two luma samples.
static void YuvToRgbaRow_SSE2(const uint8_t* y,
                              const uint8_t* u, const uint8_t* v,
                              uint8_t* dst, int len) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n = 0;
  // Vector part: eight pixels (four chroma samples) per iteration.
  while (n + 8 <= len) {
    __m128i R, G, B;
    YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
    PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst);
    y += 8;
    u += 4;
    v += 4;
    dst += 32;
    n += 8;
  }
  // Scalar tail: chroma advances only after an odd-indexed pixel.
  while (n < len) {
    VP8YuvToRgba(*y, *u, *v, dst);
    dst += 4;
    ++y;
    if (n & 1) {
      ++u;
      ++v;
    }
    ++n;
  }
}
315
// Converts a row of 'len' YUV420 pixels to BGRA (4 bytes per pixel in 'dst').
// One u/v sample is consumed for every two luma samples.
static void YuvToBgraRow_SSE2(const uint8_t* y,
                              const uint8_t* u, const uint8_t* v,
                              uint8_t* dst, int len) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n = 0;
  // Vector part: eight pixels (four chroma samples) per iteration.
  while (n + 8 <= len) {
    __m128i R, G, B;
    YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
    PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst);
    y += 8;
    u += 4;
    v += 4;
    dst += 32;
    n += 8;
  }
  // Scalar tail: chroma advances only after an odd-indexed pixel.
  while (n < len) {
    VP8YuvToBgra(*y, *u, *v, dst);
    dst += 4;
    ++y;
    if (n & 1) {
      ++u;
      ++v;
    }
    ++n;
  }
}
337
// Converts a row of 'len' YUV420 pixels to ARGB (4 bytes per pixel in 'dst').
// One u/v sample is consumed for every two luma samples.
static void YuvToArgbRow_SSE2(const uint8_t* y,
                              const uint8_t* u, const uint8_t* v,
                              uint8_t* dst, int len) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n = 0;
  // Vector part: eight pixels (four chroma samples) per iteration.
  while (n + 8 <= len) {
    __m128i R, G, B;
    YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
    PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst);
    y += 8;
    u += 4;
    v += 4;
    dst += 32;
    n += 8;
  }
  // Scalar tail: chroma advances only after an odd-indexed pixel.
  while (n < len) {
    VP8YuvToArgb(*y, *u, *v, dst);
    dst += 4;
    ++y;
    if (n & 1) {
      ++u;
      ++v;
    }
    ++n;
  }
}
359
// Converts a row of 'len' YUV420 pixels to packed RGB (3 bytes per pixel in
// 'dst'). One u/v sample is consumed for every two luma samples.
static void YuvToRgbRow_SSE2(const uint8_t* y,
                             const uint8_t* u, const uint8_t* v,
                             uint8_t* dst, int len) {
  int n = 0;
  // Vector part: 32 pixels (16 chroma samples) per iteration.
  while (n + 32 <= len) {
    __m128i R[4], G[4], B[4];
    __m128i planes[6];
    int k;

    for (k = 0; k < 4; ++k) {
      YUV420ToRGB_SSE2(y + 8 * k, u + 4 * k, v + 4 * k, &R[k], &G[k], &B[k]);
    }

    // Saturate to 8b, laid out as RRRRGGGGBBBB.
    planes[0] = _mm_packus_epi16(R[0], R[1]);
    planes[1] = _mm_packus_epi16(R[2], R[3]);
    planes[2] = _mm_packus_epi16(G[0], G[1]);
    planes[3] = _mm_packus_epi16(G[2], G[3]);
    planes[4] = _mm_packus_epi16(B[0], B[1]);
    planes[5] = _mm_packus_epi16(B[2], B[3]);

    // Interleave as RGBRGBRGBRGB and store.
    PlanarTo24b_SSE2(&planes[0], &planes[1], &planes[2],
                     &planes[3], &planes[4], &planes[5], dst);

    y += 32;
    u += 16;
    v += 16;
    dst += 32 * 3;
    n += 32;
  }
  // Scalar tail: chroma advances only after an odd-indexed pixel.
  while (n < len) {
    VP8YuvToRgb(*y, *u, *v, dst);
    dst += 3;
    ++y;
    if (n & 1) {
      ++u;
      ++v;
    }
    ++n;
  }
}
396
// Converts a row of 'len' YUV420 pixels to packed BGR (3 bytes per pixel in
// 'dst'). One u/v sample is consumed for every two luma samples.
static void YuvToBgrRow_SSE2(const uint8_t* y,
                             const uint8_t* u, const uint8_t* v,
                             uint8_t* dst, int len) {
  int n = 0;
  // Vector part: 32 pixels (16 chroma samples) per iteration.
  while (n + 32 <= len) {
    __m128i R[4], G[4], B[4];
    __m128i planes[6];
    int k;

    for (k = 0; k < 4; ++k) {
      YUV420ToRGB_SSE2(y + 8 * k, u + 4 * k, v + 4 * k, &R[k], &G[k], &B[k]);
    }

    // Saturate to 8b, laid out as BBBBGGGGRRRR.
    planes[0] = _mm_packus_epi16(B[0], B[1]);
    planes[1] = _mm_packus_epi16(B[2], B[3]);
    planes[2] = _mm_packus_epi16(G[0], G[1]);
    planes[3] = _mm_packus_epi16(G[2], G[3]);
    planes[4] = _mm_packus_epi16(R[0], R[1]);
    planes[5] = _mm_packus_epi16(R[2], R[3]);

    // Interleave as BGRBGRBGRBGR and store.
    PlanarTo24b_SSE2(&planes[0], &planes[1], &planes[2],
                     &planes[3], &planes[4], &planes[5], dst);

    y += 32;
    u += 16;
    v += 16;
    dst += 32 * 3;
    n += 32;
  }
  // Scalar tail: chroma advances only after an odd-indexed pixel.
  while (n < len) {
    VP8YuvToBgr(*y, *u, *v, dst);
    dst += 3;
    ++y;
    if (n & 1) {
      ++u;
      ++v;
    }
    ++n;
  }
}
433
434//------------------------------------------------------------------------------
435// Entry point
436
437extern void WebPInitSamplersSSE2(void);
438
439WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE2(void) {
440 WebPSamplers[MODE_RGB] = YuvToRgbRow_SSE2;
441 WebPSamplers[MODE_RGBA] = YuvToRgbaRow_SSE2;
442 WebPSamplers[MODE_BGR] = YuvToBgrRow_SSE2;
443 WebPSamplers[MODE_BGRA] = YuvToBgraRow_SSE2;
444 WebPSamplers[MODE_ARGB] = YuvToArgbRow_SSE2;
445}
446
447//------------------------------------------------------------------------------
448// RGB24/32 -> YUV converters
449
// Load eight 16b-words (128 bits, no alignment required) from *src.
#define LOAD_16(src) _mm_loadu_si128((const __m128i*)(src))
// Store eight 16b-words (128 bits, no alignment required) into *dst.
#define STORE_16(V, dst) _mm_storeu_si128((__m128i*)(dst), (V))
454
455// Function that inserts a value of the second half of the in buffer in between
456// every two char of the first half.
457static WEBP_INLINE void RGB24PackedToPlanarHelper_SSE2(
458 const __m128i* const in /*in[6]*/, __m128i* const out /*out[6]*/) {
459 out[0] = _mm_unpacklo_epi8(in[0], in[3]);
460 out[1] = _mm_unpackhi_epi8(in[0], in[3]);
461 out[2] = _mm_unpacklo_epi8(in[1], in[4]);
462 out[3] = _mm_unpackhi_epi8(in[1], in[4]);
463 out[4] = _mm_unpacklo_epi8(in[2], in[5]);
464 out[5] = _mm_unpackhi_epi8(in[2], in[5]);
465}
466
467// Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers:
468// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
469// Similar to PlanarTo24bHelper(), but in reverse order.
470static WEBP_INLINE void RGB24PackedToPlanar_SSE2(
471 const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
472 __m128i tmp[6];
473 tmp[0] = _mm_loadu_si128((const __m128i*)(rgb + 0));
474 tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16));
475 tmp[2] = _mm_loadu_si128((const __m128i*)(rgb + 32));
476 tmp[3] = _mm_loadu_si128((const __m128i*)(rgb + 48));
477 tmp[4] = _mm_loadu_si128((const __m128i*)(rgb + 64));
478 tmp[5] = _mm_loadu_si128((const __m128i*)(rgb + 80));
479
480 RGB24PackedToPlanarHelper_SSE2(tmp, out);
481 RGB24PackedToPlanarHelper_SSE2(out, tmp);
482 RGB24PackedToPlanarHelper_SSE2(tmp, out);
483 RGB24PackedToPlanarHelper_SSE2(out, tmp);
484 RGB24PackedToPlanarHelper_SSE2(tmp, out);
485}
486
487// Convert 8 packed ARGB to r[], g[], b[]
488static WEBP_INLINE void RGB32PackedToPlanar_SSE2(const uint32_t* const argb,
489 __m128i* const rgb /*in[6]*/) {
490 const __m128i zero = _mm_setzero_si128();
491 __m128i a0 = LOAD_16(argb + 0);
492 __m128i a1 = LOAD_16(argb + 4);
493 __m128i a2 = LOAD_16(argb + 8);
494 __m128i a3 = LOAD_16(argb + 12);
495 VP8L32bToPlanar_SSE2(&a0, &a1, &a2, &a3);
496 rgb[0] = _mm_unpacklo_epi8(a1, zero);
497 rgb[1] = _mm_unpackhi_epi8(a1, zero);
498 rgb[2] = _mm_unpacklo_epi8(a2, zero);
499 rgb[3] = _mm_unpackhi_epi8(a2, zero);
500 rgb[4] = _mm_unpacklo_epi8(a3, zero);
501 rgb[5] = _mm_unpackhi_epi8(a3, zero);
502}
503
// Computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX on the
// 'lo' and 'hi' halves and packs the two 32b results, with signed saturation,
// into the eight 16b lanes of OUT.
// This is a macro rather than a function because DESCALE_FIX must be an
// immediate value for _mm_srai_epi32().
#define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB,               \
                  ROUNDER, DESCALE_FIX, OUT) do {                              \
  const __m128i tf_rg_lo = _mm_madd_epi16(RG_LO, MULT_RG);                     \
  const __m128i tf_gb_lo = _mm_madd_epi16(GB_LO, MULT_GB);                     \
  const __m128i tf_rg_hi = _mm_madd_epi16(RG_HI, MULT_RG);                     \
  const __m128i tf_gb_hi = _mm_madd_epi16(GB_HI, MULT_GB);                     \
  const __m128i tf_sum_lo =                                                    \
      _mm_add_epi32(_mm_add_epi32(tf_rg_lo, tf_gb_lo), ROUNDER);               \
  const __m128i tf_sum_hi =                                                    \
      _mm_add_epi32(_mm_add_epi32(tf_rg_hi, tf_gb_hi), ROUNDER);               \
  (OUT) = _mm_packs_epi32(_mm_srai_epi32(tf_sum_lo, DESCALE_FIX),              \
                          _mm_srai_epi32(tf_sum_hi, DESCALE_FIX));             \
} while (0)
521
522#define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))
523static WEBP_INLINE void ConvertRGBToY_SSE2(const __m128i* const R,
524 const __m128i* const G,
525 const __m128i* const B,
526 __m128i* const Y) {
527 const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384);
528 const __m128i kGB_y = MK_CST_16(16384, 6420);
529 const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF);
530
531 const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
532 const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
533 const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
534 const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
535 TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y);
536}
537
538static WEBP_INLINE void ConvertRGBToUV_SSE2(const __m128i* const R,
539 const __m128i* const G,
540 const __m128i* const B,
541 __m128i* const U,
542 __m128i* const V) {
543 const __m128i kRG_u = MK_CST_16(-9719, -19081);
544 const __m128i kGB_u = MK_CST_16(0, 28800);
545 const __m128i kRG_v = MK_CST_16(28800, 0);
546 const __m128i kGB_v = MK_CST_16(-24116, -4684);
547 const __m128i kHALF_UV = _mm_set1_epi32(((128 << YUV_FIX) + YUV_HALF) << 2);
548
549 const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
550 const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
551 const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
552 const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
553 TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_u, kGB_u,
554 kHALF_UV, YUV_FIX + 2, *U);
555 TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_v, kGB_v,
556 kHALF_UV, YUV_FIX + 2, *V);
557}
558
559#undef MK_CST_16
560#undef TRANSFORM
561
562static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) {
563 const int max_width = width & ~31;
564 int i;
565 for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
566 __m128i rgb_plane[6];
567 int j;
568
569 RGB24PackedToPlanar_SSE2(rgb, rgb_plane);
570
571 for (j = 0; j < 2; ++j, i += 16) {
572 const __m128i zero = _mm_setzero_si128();
573 __m128i r, g, b, Y0, Y1;
574
575 // Convert to 16-bit Y.
576 r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero);
577 g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero);
578 b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero);
579 ConvertRGBToY_SSE2(&r, &g, &b, &Y0);
580
581 // Convert to 16-bit Y.
582 r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero);
583 g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero);
584 b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero);
585 ConvertRGBToY_SSE2(&r, &g, &b, &Y1);
586
587 // Cast to 8-bit and store.
588 STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
589 }
590 }
591 for (; i < width; ++i, rgb += 3) { // left-over
592 y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
593 }
594}
595
596static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) {
597 const int max_width = width & ~31;
598 int i;
599 for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
600 __m128i bgr_plane[6];
601 int j;
602
603 RGB24PackedToPlanar_SSE2(bgr, bgr_plane);
604
605 for (j = 0; j < 2; ++j, i += 16) {
606 const __m128i zero = _mm_setzero_si128();
607 __m128i r, g, b, Y0, Y1;
608
609 // Convert to 16-bit Y.
610 b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero);
611 g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero);
612 r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero);
613 ConvertRGBToY_SSE2(&r, &g, &b, &Y0);
614
615 // Convert to 16-bit Y.
616 b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero);
617 g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero);
618 r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero);
619 ConvertRGBToY_SSE2(&r, &g, &b, &Y1);
620
621 // Cast to 8-bit and store.
622 STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
623 }
624 }
625 for (; i < width; ++i, bgr += 3) { // left-over
626 y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
627 }
628}
629
630static void ConvertARGBToY_SSE2(const uint32_t* argb, uint8_t* y, int width) {
631 const int max_width = width & ~15;
632 int i;
633 for (i = 0; i < max_width; i += 16) {
634 __m128i Y0, Y1, rgb[6];
635 RGB32PackedToPlanar_SSE2(&argb[i], rgb);
636 ConvertRGBToY_SSE2(&rgb[0], &rgb[2], &rgb[4], &Y0);
637 ConvertRGBToY_SSE2(&rgb[1], &rgb[3], &rgb[5], &Y1);
638 STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
639 }
640 for (; i < width; ++i) { // left-over
641 const uint32_t p = argb[i];
642 y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff,
643 YUV_HALF);
644 }
645}
646
// Adds adjacent pairs of 16b lanes and doubles the sum:
//   in: a0 | a1 | a2 | a3 | ...  ->  out: 2*(a0+a1) | 2*(a2+a3) | ...
// The four results from *A fill the low half of *out, the four from *B the
// high half (packed with signed saturation). 'out' may alias 'A' or 'B': both
// inputs are read before *out is written.
static void HorizontalAddPack_SSE2(const __m128i* const A,
                                   const __m128i* const B,
                                   __m128i* const out) {
  const __m128i twice = _mm_set1_epi16(2);
  // madd with a constant 2 yields 2*x0 + 2*x1 for every pair of lanes.
  const __m128i pairs_a = _mm_madd_epi16(*A, twice);
  const __m128i pairs_b = _mm_madd_epi16(*B, twice);
  *out = _mm_packs_epi32(pairs_a, pairs_b);
}
657
658static void ConvertARGBToUV_SSE2(const uint32_t* argb,
659 uint8_t* u, uint8_t* v,
660 int src_width, int do_store) {
661 const int max_width = src_width & ~31;
662 int i;
663 for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
664 __m128i rgb[6], U0, V0, U1, V1;
665 RGB32PackedToPlanar_SSE2(&argb[i], rgb);
666 HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]);
667 HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]);
668 HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]);
669 ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
670
671 RGB32PackedToPlanar_SSE2(&argb[i + 16], rgb);
672 HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]);
673 HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]);
674 HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]);
675 ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
676
677 U0 = _mm_packus_epi16(U0, U1);
678 V0 = _mm_packus_epi16(V0, V1);
679 if (!do_store) {
680 const __m128i prev_u = LOAD_16(u);
681 const __m128i prev_v = LOAD_16(v);
682 U0 = _mm_avg_epu8(U0, prev_u);
683 V0 = _mm_avg_epu8(V0, prev_v);
684 }
685 STORE_16(U0, u);
686 STORE_16(V0, v);
687 }
688 if (i < src_width) { // left-over
689 WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
690 }
691}
692
693// Convert 16 packed ARGB 16b-values to r[], g[], b[]
694static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2(
695 const uint16_t* const rgbx,
696 __m128i* const r, __m128i* const g, __m128i* const b) {
697 const __m128i in0 = LOAD_16(rgbx + 0); // r0 | g0 | b0 |x| r1 | g1 | b1 |x
698 const __m128i in1 = LOAD_16(rgbx + 8); // r2 | g2 | b2 |x| r3 | g3 | b3 |x
699 const __m128i in2 = LOAD_16(rgbx + 16); // r4 | ...
700 const __m128i in3 = LOAD_16(rgbx + 24); // r6 | ...
701 // column-wise transpose
702 const __m128i A0 = _mm_unpacklo_epi16(in0, in1);
703 const __m128i A1 = _mm_unpackhi_epi16(in0, in1);
704 const __m128i A2 = _mm_unpacklo_epi16(in2, in3);
705 const __m128i A3 = _mm_unpackhi_epi16(in2, in3);
706 const __m128i B0 = _mm_unpacklo_epi16(A0, A1); // r0 r1 r2 r3 | g0 g1 ..
707 const __m128i B1 = _mm_unpackhi_epi16(A0, A1); // b0 b1 b2 b3 | x x x x
708 const __m128i B2 = _mm_unpacklo_epi16(A2, A3); // r4 r5 r6 r7 | g4 g5 ..
709 const __m128i B3 = _mm_unpackhi_epi16(A2, A3); // b4 b5 b6 b7 | x x x x
710 *r = _mm_unpacklo_epi64(B0, B2);
711 *g = _mm_unpackhi_epi64(B0, B2);
712 *b = _mm_unpacklo_epi64(B1, B3);
713}
714
// Converts 'width' pixels of four 16b values each (r, g, b, x) to 8b u/v
// samples, one pair per input pixel. Blocks of 16 pixels use SSE2, the
// remainder is handed to WebPConvertRGBA32ToUV_C().
static void ConvertRGBA32ToUV_SSE2(const uint16_t* rgb,
                                   uint8_t* u, uint8_t* v, int width) {
  const int max_width = width & ~15;
  int x;
  for (x = 0; x < max_width; x += 16) {
    __m128i r, g, b, u_lo, v_lo, u_hi, v_hi;
    // First eight pixels (32 16b values).
    RGBA32PackedToPlanar_16b_SSE2(rgb + 0, &r, &g, &b);
    ConvertRGBToUV_SSE2(&r, &g, &b, &u_lo, &v_lo);
    // Next eight pixels.
    RGBA32PackedToPlanar_16b_SSE2(rgb + 32, &r, &g, &b);
    ConvertRGBToUV_SSE2(&r, &g, &b, &u_hi, &v_hi);
    STORE_16(_mm_packus_epi16(u_lo, u_hi), u);
    STORE_16(_mm_packus_epi16(v_lo, v_hi), v);
    rgb += 2 * 32;
    u += 16;
    v += 16;
  }
  if (max_width < width) {  // left-over
    WebPConvertRGBA32ToUV_C(rgb, u, v, width - max_width);
  }
}
735
736//------------------------------------------------------------------------------
737
738extern void WebPInitConvertARGBToYUVSSE2(void);
739
740WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {
741 WebPConvertARGBToY = ConvertARGBToY_SSE2;
742 WebPConvertARGBToUV = ConvertARGBToUV_SSE2;
743
744 WebPConvertRGB24ToY = ConvertRGB24ToY_SSE2;
745 WebPConvertBGR24ToY = ConvertBGR24ToY_SSE2;
746
747 WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE2;
748}
749
750//------------------------------------------------------------------------------
751
#define MAX_Y ((1 << 10) - 1)    // 10b precision over 16b-arithmetic

// Clamps 'v' to the [0, MAX_Y] range.
static uint16_t clip_y(int v) {
  if (v < 0) {
    return 0;
  }
  if (v > MAX_Y) {
    return MAX_Y;
  }
  return (uint16_t)v;
}
756
757static uint64_t SharpYUVUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
758 uint16_t* dst, int len) {
759 uint64_t diff = 0;
760 uint32_t tmp[4];
761 int i;
762 const __m128i zero = _mm_setzero_si128();
763 const __m128i max = _mm_set1_epi16(MAX_Y);
764 const __m128i one = _mm_set1_epi16(1);
765 __m128i sum = zero;
766
767 for (i = 0; i + 8 <= len; i += 8) {
768 const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
769 const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
770 const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
771 const __m128i D = _mm_sub_epi16(A, B); // diff_y
772 const __m128i E = _mm_cmpgt_epi16(zero, D); // sign (-1 or 0)
773 const __m128i F = _mm_add_epi16(C, D); // new_y
774 const __m128i G = _mm_or_si128(E, one); // -1 or 1
775 const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);
776 const __m128i I = _mm_madd_epi16(D, G); // sum(abs(...))
777 _mm_storeu_si128((__m128i*)(dst + i), H);
778 sum = _mm_add_epi32(sum, I);
779 }
780 _mm_storeu_si128((__m128i*)tmp, sum);
781 diff = tmp[3] + tmp[2] + tmp[1] + tmp[0];
782 for (; i < len; ++i) {
783 const int diff_y = ref[i] - src[i];
784 const int new_y = (int)dst[i] + diff_y;
785 dst[i] = clip_y(new_y);
786 diff += (uint64_t)abs(diff_y);
787 }
788 return diff;
789}
790
// For every i in [0, len): dst[i] += ref[i] - src[i], using 16b arithmetic
// without clamping.
static void SharpYUVUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
                                   int16_t* dst, int len) {
  int i = 0;
  // Vector part: eight samples per iteration.
  while (i + 8 <= len) {
    const __m128i ref_v = _mm_loadu_si128((const __m128i*)(ref + i));
    const __m128i src_v = _mm_loadu_si128((const __m128i*)(src + i));
    const __m128i cur_v = _mm_loadu_si128((const __m128i*)(dst + i));
    const __m128i delta = _mm_sub_epi16(ref_v, src_v);   // diff_uv
    _mm_storeu_si128((__m128i*)(dst + i), _mm_add_epi16(cur_v, delta));
    i += 8;
  }
  // Scalar tail.
  while (i < len) {
    const int diff_uv = ref[i] - src[i];
    dst[i] += diff_uv;
    ++i;
  }
}
807
808static void SharpYUVFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
809 const uint16_t* best_y, uint16_t* out) {
810 int i;
811 const __m128i kCst8 = _mm_set1_epi16(8);
812 const __m128i max = _mm_set1_epi16(MAX_Y);
813 const __m128i zero = _mm_setzero_si128();
814 for (i = 0; i + 8 <= len; i += 8) {
815 const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0));
816 const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1));
817 const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0));
818 const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1));
819 const __m128i a0b1 = _mm_add_epi16(a0, b1);
820 const __m128i a1b0 = _mm_add_epi16(a1, b0);
821 const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0); // A0+A1+B0+B1
822 const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
823 const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1); // 2*(A0+B1)
824 const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0); // 2*(A1+B0)
825 const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);
826 const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);
827 const __m128i d0 = _mm_add_epi16(c1, a0);
828 const __m128i d1 = _mm_add_epi16(c0, a1);
829 const __m128i e0 = _mm_srai_epi16(d0, 1);
830 const __m128i e1 = _mm_srai_epi16(d1, 1);
831 const __m128i f0 = _mm_unpacklo_epi16(e0, e1);
832 const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
833 const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
834 const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8));
835 const __m128i h0 = _mm_add_epi16(g0, f0);
836 const __m128i h1 = _mm_add_epi16(g1, f1);
837 const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);
838 const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
839 _mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0);
840 _mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1);
841 }
842 for (; i < len; ++i) {
843 // (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
844 // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
845 // We reuse the common sub-expressions.
846 const int a0b1 = A[i + 0] + B[i + 1];
847 const int a1b0 = A[i + 1] + B[i + 0];
848 const int a0a1b0b1 = a0b1 + a1b0 + 8;
849 const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
850 const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
851 out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
852 out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
853 }
854}
855
856#undef MAX_Y
857
858//------------------------------------------------------------------------------
859
860extern void WebPInitSharpYUVSSE2(void);
861
862WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVSSE2(void) {
863 WebPSharpYUVUpdateY = SharpYUVUpdateY_SSE2;
864 WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_SSE2;
865 WebPSharpYUVFilterRow = SharpYUVFilterRow_SSE2;
866}
867
868#else // !WEBP_USE_SSE2
869
// WEBP_USE_SSE2 is not defined: the three init entry points of this file are
// emitted through WEBP_DSP_INIT_STUB instead.
// NOTE(review): presumably these expand to empty functions -- confirm against
// the macro's definition.
WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2)
WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2)
WEBP_DSP_INIT_STUB(WebPInitSharpYUVSSE2)
873
874#endif // WEBP_USE_SSE2
875