1// Copyright 2014 Google Inc. All Rights Reserved.
2//
3// Use of this source code is governed by a BSD-style license
4// that can be found in the COPYING file in the root of the source
5// tree. An additional intellectual property rights grant can be found
6// in the file PATENTS. All contributing project authors may
7// be found in the AUTHORS file in the root of the source tree.
8// -----------------------------------------------------------------------------
9//
10// SSE2 variant of methods for lossless decoder
11//
12// Author: Skal (pascal.massimino@gmail.com)
13
14#include "src/dsp/dsp.h"
15
16#if defined(WEBP_USE_SSE2)
17
18#include "src/dsp/common_sse2.h"
19#include "src/dsp/lossless.h"
20#include "src/dsp/lossless_common.h"
21#include <emmintrin.h>
22
23//------------------------------------------------------------------------------
24// Predictor Transform
25
// Returns, for each of the four 8-bit channels packed in the 32-bit inputs,
// c0 + c1 - c2 clamped to [0, 255].
static WEBP_INLINE uint32_t ClampedAddSubtractFull_SSE2(uint32_t c0,
                                                        uint32_t c1,
                                                        uint32_t c2) {
  const __m128i zero = _mm_setzero_si128();
  // Widen every channel to 16 bits so the intermediate result cannot wrap.
  const __m128i p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c0), zero);
  const __m128i p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c1), zero);
  const __m128i p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c2), zero);
  const __m128i wide = _mm_sub_epi16(_mm_add_epi16(p0, p1), p2);
  // The unsigned-saturating pack performs the clamp.
  return (uint32_t)_mm_cvtsi128_si32(_mm_packus_epi16(wide, wide));
}
38
// Per 8-bit channel: with avg = (c0 + c1) >> 1, returns
// avg + (avg - c2) / 2 (division truncating toward zero), clamped to [0, 255].
static WEBP_INLINE uint32_t ClampedAddSubtractHalf_SSE2(uint32_t c0,
                                                        uint32_t c1,
                                                        uint32_t c2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c0), zero);
  const __m128i p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c1), zero);
  const __m128i ref = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c2), zero);
  const __m128i mean = _mm_srli_epi16(_mm_add_epi16(p1, p0), 1);
  const __m128i delta = _mm_sub_epi16(mean, ref);
  // Lanes where ref > mean (negative delta) hold -1: subtracting that adds 1
  // before the arithmetic shift, so the halving rounds toward zero.
  const __m128i is_negative = _mm_cmpgt_epi16(ref, mean);
  const __m128i half = _mm_srai_epi16(_mm_sub_epi16(delta, is_negative), 1);
  const __m128i sum = _mm_add_epi16(mean, half);
  return (uint32_t)_mm_cvtsi128_si32(_mm_packus_epi16(sum, sum));
}
56
// Returns 'a' when the summed per-channel distance |b - c| does not exceed
// the summed distance |a - c|, and 'b' otherwise.
static WEBP_INLINE uint32_t Select_SSE2(uint32_t a, uint32_t b, uint32_t c) {
  int16_t lanes[8];
  int total;
  const __m128i zero = _mm_setzero_si128();
  const __m128i va = _mm_cvtsi32_si128((int)a);
  const __m128i vb = _mm_cvtsi32_si128((int)b);
  const __m128i vc = _mm_cvtsi32_si128((int)c);
  // For unsigned bytes, |x - y| is the OR of the two saturating differences
  // (one of them is always zero).
  const __m128i abs_ac =
      _mm_or_si128(_mm_subs_epu8(va, vc), _mm_subs_epu8(vc, va));
  const __m128i abs_bc =
      _mm_or_si128(_mm_subs_epu8(vb, vc), _mm_subs_epu8(vc, vb));
  // 16-bit lanes of |b - c| - |a - c|; only the four low lanes carry data.
  const __m128i delta = _mm_sub_epi16(_mm_unpacklo_epi8(abs_bc, zero),
                                      _mm_unpacklo_epi8(abs_ac, zero));
  _mm_storeu_si128((__m128i*)lanes, delta);
  total = lanes[0] + lanes[1] + lanes[2] + lanes[3];
  return (total <= 0) ? a : b;
}
79
// Stores in *avg the per-byte average of *a0 and *a1, rounded down.
static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
                                       const __m128i* const a1,
                                       __m128i* const avg) {
  const __m128i x = *a0;
  const __m128i y = *a1;
  // _mm_avg_epu8 rounds up: (x + y + 1) >> 1. Removing the low bit of x ^ y
  // turns it into the truncating average (x + y) >> 1.
  const __m128i rounded_up = _mm_avg_epu8(x, y);
  const __m128i excess = _mm_and_si128(_mm_xor_si128(x, y), _mm_set1_epi8(1));
  *avg = _mm_sub_epi8(rounded_up, excess);
}
89
// Stores in the low 32 bits of *avg the per-channel average of the packed
// pixels a0 and a1, rounded down.
static WEBP_INLINE void Average2_uint32_SSE2(const uint32_t a0,
                                             const uint32_t a1,
                                             __m128i* const avg) {
  const __m128i x = _mm_cvtsi32_si128((int)a0);
  const __m128i y = _mm_cvtsi32_si128((int)a1);
  // (x + y) >> 1 == ((x + y + 1) >> 1) - ((x ^ y) & 1)
  const __m128i rounded_up = _mm_avg_epu8(x, y);
  const __m128i excess = _mm_and_si128(_mm_xor_si128(x, y), _mm_set1_epi8(1));
  *avg = _mm_sub_epi8(rounded_up, excess);
}
101
// Returns the truncating per-channel average of a0 and a1, with each of the
// four channels widened to a 16-bit lane (low four lanes of the result).
static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i wide0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a0), zero);
  const __m128i wide1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a1), zero);
  return _mm_srli_epi16(_mm_add_epi16(wide1, wide0), 1);
}
109
110static WEBP_INLINE uint32_t Average2_SSE2(uint32_t a0, uint32_t a1) {
111 __m128i output;
112 Average2_uint32_SSE2(a0, a1, &output);
113 return (uint32_t)_mm_cvtsi128_si32(output);
114}
115
116static WEBP_INLINE uint32_t Average3_SSE2(uint32_t a0, uint32_t a1,
117 uint32_t a2) {
118 const __m128i zero = _mm_setzero_si128();
119 const __m128i avg1 = Average2_uint32_16_SSE2(a0, a2);
120 const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a1), zero);
121 const __m128i sum = _mm_add_epi16(avg1, A1);
122 const __m128i avg2 = _mm_srli_epi16(sum, 1);
123 const __m128i A2 = _mm_packus_epi16(avg2, avg2);
124 return (uint32_t)_mm_cvtsi128_si32(A2);
125}
126
127static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1,
128 uint32_t a2, uint32_t a3) {
129 const __m128i avg1 = Average2_uint32_16_SSE2(a0, a1);
130 const __m128i avg2 = Average2_uint32_16_SSE2(a2, a3);
131 const __m128i sum = _mm_add_epi16(avg2, avg1);
132 const __m128i avg3 = _mm_srli_epi16(sum, 1);
133 const __m128i A0 = _mm_packus_epi16(avg3, avg3);
134 return (uint32_t)_mm_cvtsi128_si32(A0);
135}
136
// Predictor 5: Average3_SSE2() of the left pixel, top[0] and top[1].
static uint32_t Predictor5_SSE2(const uint32_t* const left,
                                const uint32_t* const top) {
  return Average3_SSE2(left[0], top[0], top[1]);
}
// Predictor 6: Average2_SSE2() of the left pixel and top[-1].
static uint32_t Predictor6_SSE2(const uint32_t* const left,
                                const uint32_t* const top) {
  return Average2_SSE2(left[0], top[-1]);
}
// Predictor 7: Average2_SSE2() of the left pixel and top[0].
static uint32_t Predictor7_SSE2(const uint32_t* const left,
                                const uint32_t* const top) {
  return Average2_SSE2(left[0], top[0]);
}
// Predictor 8: Average2_SSE2() of top[-1] and top[0]; 'left' is not used.
static uint32_t Predictor8_SSE2(const uint32_t* const left,
                                const uint32_t* const top) {
  (void)left;
  return Average2_SSE2(top[-1], top[0]);
}
// Predictor 9: Average2_SSE2() of top[0] and top[1]; 'left' is not used.
static uint32_t Predictor9_SSE2(const uint32_t* const left,
                                const uint32_t* const top) {
  (void)left;
  return Average2_SSE2(top[0], top[1]);
}
// Predictor 10: Average4_SSE2() of the left pixel, top[-1], top[0] and top[1].
static uint32_t Predictor10_SSE2(const uint32_t* const left,
                                 const uint32_t* const top) {
  return Average4_SSE2(left[0], top[-1], top[0], top[1]);
}
// Predictor 11: Select_SSE2() between top[0] and the left pixel, using
// top[-1] as the reference.
static uint32_t Predictor11_SSE2(const uint32_t* const left,
                                 const uint32_t* const top) {
  return Select_SSE2(top[0], left[0], top[-1]);
}
// Predictor 12: ClampedAddSubtractFull_SSE2(), i.e. left + top[0] - top[-1]
// clamped per channel.
static uint32_t Predictor12_SSE2(const uint32_t* const left,
                                 const uint32_t* const top) {
  return ClampedAddSubtractFull_SSE2(left[0], top[0], top[-1]);
}
// Predictor 13: ClampedAddSubtractHalf_SSE2() of the left pixel, top[0] and
// top[-1].
static uint32_t Predictor13_SSE2(const uint32_t* const left,
                                 const uint32_t* const top) {
  return ClampedAddSubtractHalf_SSE2(left[0], top[0], top[-1]);
}
184
185// Batch versions of those functions.
186
187// Predictor0: ARGB_BLACK.
188static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,
189 int num_pixels, uint32_t* out) {
190 int i;
191 const __m128i black = _mm_set1_epi32((int)ARGB_BLACK);
192 for (i = 0; i + 4 <= num_pixels; i += 4) {
193 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
194 const __m128i res = _mm_add_epi8(src, black);
195 _mm_storeu_si128((__m128i*)&out[i], res);
196 }
197 if (i != num_pixels) {
198 VP8LPredictorsAdd_C[0](in + i, NULL, num_pixels - i, out + i);
199 }
200 (void)upper;
201}
202
203// Predictor1: left.
204static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper,
205 int num_pixels, uint32_t* out) {
206 int i;
207 __m128i prev = _mm_set1_epi32((int)out[-1]);
208 for (i = 0; i + 4 <= num_pixels; i += 4) {
209 // a | b | c | d
210 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
211 // 0 | a | b | c
212 const __m128i shift0 = _mm_slli_si128(src, 4);
213 // a | a + b | b + c | c + d
214 const __m128i sum0 = _mm_add_epi8(src, shift0);
215 // 0 | 0 | a | a + b
216 const __m128i shift1 = _mm_slli_si128(sum0, 8);
217 // a | a + b | a + b + c | a + b + c + d
218 const __m128i sum1 = _mm_add_epi8(sum0, shift1);
219 const __m128i res = _mm_add_epi8(sum1, prev);
220 _mm_storeu_si128((__m128i*)&out[i], res);
221 // replicate prev output on the four lanes
222 prev = _mm_shuffle_epi32(res, (3 << 0) | (3 << 2) | (3 << 4) | (3 << 6));
223 }
224 if (i != num_pixels) {
225 VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i);
226 }
227}
228
229// Macro that adds 32-bit integers from IN using mod 256 arithmetic
230// per 8 bit channel.
231#define GENERATE_PREDICTOR_1(X, IN) \
232static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
233 int num_pixels, uint32_t* out) { \
234 int i; \
235 for (i = 0; i + 4 <= num_pixels; i += 4) { \
236 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
237 const __m128i other = _mm_loadu_si128((const __m128i*)&(IN)); \
238 const __m128i res = _mm_add_epi8(src, other); \
239 _mm_storeu_si128((__m128i*)&out[i], res); \
240 } \
241 if (i != num_pixels) { \
242 VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \
243 } \
244}
245
246// Predictor2: Top.
247GENERATE_PREDICTOR_1(2, upper[i])
248// Predictor3: Top-right.
249GENERATE_PREDICTOR_1(3, upper[i + 1])
250// Predictor4: Top-left.
251GENERATE_PREDICTOR_1(4, upper[i - 1])
252#undef GENERATE_PREDICTOR_1
253
254// Due to averages with integers, values cannot be accumulated in parallel for
255// predictors 5 to 7.
256GENERATE_PREDICTOR_ADD(Predictor5_SSE2, PredictorAdd5_SSE2)
257GENERATE_PREDICTOR_ADD(Predictor6_SSE2, PredictorAdd6_SSE2)
258GENERATE_PREDICTOR_ADD(Predictor7_SSE2, PredictorAdd7_SSE2)
259
260#define GENERATE_PREDICTOR_2(X, IN) \
261static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
262 int num_pixels, uint32_t* out) { \
263 int i; \
264 for (i = 0; i + 4 <= num_pixels; i += 4) { \
265 const __m128i Tother = _mm_loadu_si128((const __m128i*)&(IN)); \
266 const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); \
267 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
268 __m128i avg, res; \
269 Average2_m128i(&T, &Tother, &avg); \
270 res = _mm_add_epi8(avg, src); \
271 _mm_storeu_si128((__m128i*)&out[i], res); \
272 } \
273 if (i != num_pixels) { \
274 VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \
275 } \
276}
277// Predictor8: average TL T.
278GENERATE_PREDICTOR_2(8, upper[i - 1])
279// Predictor9: average T TR.
280GENERATE_PREDICTOR_2(9, upper[i + 1])
281#undef GENERATE_PREDICTOR_2
282
283// Predictor10: average of (average of (L,TL), average of (T, TR)).
284#define DO_PRED10(OUT) do { \
285 __m128i avgLTL, avg; \
286 Average2_m128i(&L, &TL, &avgLTL); \
287 Average2_m128i(&avgTTR, &avgLTL, &avg); \
288 L = _mm_add_epi8(avg, src); \
289 out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(L); \
290} while (0)
291
292#define DO_PRED10_SHIFT do { \
293 /* Rotate the pre-computed values for the next iteration.*/ \
294 avgTTR = _mm_srli_si128(avgTTR, 4); \
295 TL = _mm_srli_si128(TL, 4); \
296 src = _mm_srli_si128(src, 4); \
297} while (0)
298
299static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
300 int num_pixels, uint32_t* out) {
301 int i;
302 __m128i L = _mm_cvtsi32_si128((int)out[-1]);
303 for (i = 0; i + 4 <= num_pixels; i += 4) {
304 __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
305 __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
306 const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
307 const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
308 __m128i avgTTR;
309 Average2_m128i(&T, &TR, &avgTTR);
310 DO_PRED10(0);
311 DO_PRED10_SHIFT;
312 DO_PRED10(1);
313 DO_PRED10_SHIFT;
314 DO_PRED10(2);
315 DO_PRED10_SHIFT;
316 DO_PRED10(3);
317 }
318 if (i != num_pixels) {
319 VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
320 }
321}
322#undef DO_PRED10
323#undef DO_PRED10_SHIFT
324
325// Predictor11: select.
326#define DO_PRED11(OUT) do { \
327 const __m128i L_lo = _mm_unpacklo_epi32(L, T); \
328 const __m128i TL_lo = _mm_unpacklo_epi32(TL, T); \
329 const __m128i pb = _mm_sad_epu8(L_lo, TL_lo); /* pb = sum |L-TL|*/ \
330 const __m128i mask = _mm_cmpgt_epi32(pb, pa); \
331 const __m128i A = _mm_and_si128(mask, L); \
332 const __m128i B = _mm_andnot_si128(mask, T); \
333 const __m128i pred = _mm_or_si128(A, B); /* pred = (pa > b)? L : T*/ \
334 L = _mm_add_epi8(src, pred); \
335 out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(L); \
336} while (0)
337
338#define DO_PRED11_SHIFT do { \
339 /* Shift the pre-computed value for the next iteration.*/ \
340 T = _mm_srli_si128(T, 4); \
341 TL = _mm_srli_si128(TL, 4); \
342 src = _mm_srli_si128(src, 4); \
343 pa = _mm_srli_si128(pa, 4); \
344} while (0)
345
346static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
347 int num_pixels, uint32_t* out) {
348 int i;
349 __m128i pa;
350 __m128i L = _mm_cvtsi32_si128((int)out[-1]);
351 for (i = 0; i + 4 <= num_pixels; i += 4) {
352 __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
353 __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
354 __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
355 {
356 // We can unpack with any value on the upper 32 bits, provided it's the
357 // same on both operands (so that their sum of abs diff is zero). Here we
358 // use T.
359 const __m128i T_lo = _mm_unpacklo_epi32(T, T);
360 const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);
361 const __m128i T_hi = _mm_unpackhi_epi32(T, T);
362 const __m128i TL_hi = _mm_unpackhi_epi32(TL, T);
363 const __m128i s_lo = _mm_sad_epu8(T_lo, TL_lo);
364 const __m128i s_hi = _mm_sad_epu8(T_hi, TL_hi);
365 pa = _mm_packs_epi32(s_lo, s_hi); // pa = sum |T-TL|
366 }
367 DO_PRED11(0);
368 DO_PRED11_SHIFT;
369 DO_PRED11(1);
370 DO_PRED11_SHIFT;
371 DO_PRED11(2);
372 DO_PRED11_SHIFT;
373 DO_PRED11(3);
374 }
375 if (i != num_pixels) {
376 VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
377 }
378}
379#undef DO_PRED11
380#undef DO_PRED11_SHIFT
381
382// Predictor12: ClampedAddSubtractFull.
383#define DO_PRED12(DIFF, LANE, OUT) do { \
384 const __m128i all = _mm_add_epi16(L, (DIFF)); \
385 const __m128i alls = _mm_packus_epi16(all, all); \
386 const __m128i res = _mm_add_epi8(src, alls); \
387 out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(res); \
388 L = _mm_unpacklo_epi8(res, zero); \
389} while (0)
390
391#define DO_PRED12_SHIFT(DIFF, LANE) do { \
392 /* Shift the pre-computed value for the next iteration.*/ \
393 if ((LANE) == 0) (DIFF) = _mm_srli_si128((DIFF), 8); \
394 src = _mm_srli_si128(src, 4); \
395} while (0)
396
397static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
398 int num_pixels, uint32_t* out) {
399 int i;
400 const __m128i zero = _mm_setzero_si128();
401 const __m128i L8 = _mm_cvtsi32_si128((int)out[-1]);
402 __m128i L = _mm_unpacklo_epi8(L8, zero);
403 for (i = 0; i + 4 <= num_pixels; i += 4) {
404 // Load 4 pixels at a time.
405 __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
406 const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
407 const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
408 const __m128i T_hi = _mm_unpackhi_epi8(T, zero);
409 const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
410 const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
411 const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero);
412 __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);
413 __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);
414 DO_PRED12(diff_lo, 0, 0);
415 DO_PRED12_SHIFT(diff_lo, 0);
416 DO_PRED12(diff_lo, 1, 1);
417 DO_PRED12_SHIFT(diff_lo, 1);
418 DO_PRED12(diff_hi, 0, 2);
419 DO_PRED12_SHIFT(diff_hi, 0);
420 DO_PRED12(diff_hi, 1, 3);
421 }
422 if (i != num_pixels) {
423 VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i);
424 }
425}
426#undef DO_PRED12
427#undef DO_PRED12_SHIFT
428
429// Due to averages with integers, values cannot be accumulated in parallel for
430// predictors 13.
431GENERATE_PREDICTOR_ADD(Predictor13_SSE2, PredictorAdd13_SSE2)
432
433//------------------------------------------------------------------------------
434// Subtract-Green Transform
435
// Adds the green channel of every pixel to its red and blue channels (mod
// 256), leaving alpha and green unchanged. Four pixels are handled per
// iteration; the remaining ones go through the plain-C version.
static void AddGreenToBlueAndRed_SSE2(const uint32_t* const src, int num_pixels,
                                      uint32_t* dst) {
  int i = 0;
  while (i + 4 <= num_pixels) {
    const __m128i argb = _mm_loadu_si128((const __m128i*)(src + i));
    const __m128i ag = _mm_srli_epi16(argb, 8);  // 0 a 0 g
    // Copy the green 16-bit lane over the alpha lane of each pixel: 0 g 0 g.
    const __m128i gg_lo = _mm_shufflelo_epi16(ag, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i gg = _mm_shufflehi_epi16(gg_lo, _MM_SHUFFLE(2, 2, 0, 0));
    _mm_storeu_si128((__m128i*)(dst + i), _mm_add_epi8(argb, gg));
    i += 4;
  }
  if (i != num_pixels) {
    VP8LAddGreenToBlueAndRed_C(src + i, num_pixels - i, dst + i);
  }
}
452
453//------------------------------------------------------------------------------
454// Color Transform
455
456static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
457 const uint32_t* const src,
458 int num_pixels, uint32_t* dst) {
459// sign-extended multiplying constants, pre-shifted by 5.
460#define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend
461#define MK_CST_16(HI, LO) \
462 _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
463 const __m128i mults_rb = MK_CST_16(CST(green_to_red_), CST(green_to_blue_));
464 const __m128i mults_b2 = MK_CST_16(CST(red_to_blue_), 0);
465#undef MK_CST_16
466#undef CST
467 const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00); // alpha-green masks
468 int i;
469 for (i = 0; i + 4 <= num_pixels; i += 4) {
470 const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb
471 const __m128i A = _mm_and_si128(in, mask_ag); // a 0 g 0
472 const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
473 const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // g0g0
474 const __m128i D = _mm_mulhi_epi16(C, mults_rb); // x dr x db1
475 const __m128i E = _mm_add_epi8(in, D); // x r' x b'
476 const __m128i F = _mm_slli_epi16(E, 8); // r' 0 b' 0
477 const __m128i G = _mm_mulhi_epi16(F, mults_b2); // x db2 0 0
478 const __m128i H = _mm_srli_epi32(G, 8); // 0 x db2 0
479 const __m128i I = _mm_add_epi8(H, F); // r' x b'' 0
480 const __m128i J = _mm_srli_epi16(I, 8); // 0 r' 0 b''
481 const __m128i out = _mm_or_si128(J, A);
482 _mm_storeu_si128((__m128i*)&dst[i], out);
483 }
484 // Fall-back to C-version for left-overs.
485 if (i != num_pixels) {
486 VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
487 }
488}
489
490//------------------------------------------------------------------------------
491// Color-space conversion functions
492
// Converts 32-bit pixels to packed 24-bit RGB, 32 pixels (eight input vectors,
// six output vectors) per iteration. The remaining pixels go through the C
// version.
static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels,
                                  uint8_t* dst) {
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;

  for (; num_pixels >= 32; num_pixels -= 32, in += 8, out += 6) {
    __m128i a0 = _mm_loadu_si128(in + 0);
    __m128i a1 = _mm_loadu_si128(in + 1);
    __m128i a2 = _mm_loadu_si128(in + 2);
    __m128i a3 = _mm_loadu_si128(in + 3);
    __m128i b0 = _mm_loadu_si128(in + 4);
    __m128i b1 = _mm_loadu_si128(in + 5);
    __m128i b2 = _mm_loadu_si128(in + 6);
    __m128i b3 = _mm_loadu_si128(in + 7);
    // De-interleave each group of 16 pixels into one vector per channel.
    VP8L32bToPlanar_SSE2(&a0, &a1, &a2, &a3);
    VP8L32bToPlanar_SSE2(&b0, &b1, &b2, &b3);
    // Now a1/b1 hold red only, a2/b2 green only, and so on: interleave the
    // three color planes as 24-bit RGB.
    VP8PlanarTo24b_SSE2(&a1, &b1, &a2, &b2, &a3, &b3);
    _mm_storeu_si128(out + 0, a1);
    _mm_storeu_si128(out + 1, b1);
    _mm_storeu_si128(out + 2, a2);
    _mm_storeu_si128(out + 3, b2);
    _mm_storeu_si128(out + 4, a3);
    _mm_storeu_si128(out + 5, b3);
  }
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}
528
// Swaps bytes 0 and 2 of every 32-bit pixel (bytes 1 and 3 are kept in
// place), eight pixels per iteration. The remaining pixels go through the C
// version.
static void ConvertBGRAToRGBA_SSE2(const uint32_t* src,
                                   int num_pixels, uint8_t* dst) {
  const __m128i mask_02 = _mm_set1_epi32(0x00ff00ff);  // bytes 0 and 2
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  for (; num_pixels >= 8; num_pixels -= 8, in += 2, out += 2) {
    const __m128i px0 = _mm_loadu_si128(in + 0);
    const __m128i px1 = _mm_loadu_si128(in + 1);
    // Bytes 0 and 2 of each pixel, one per 16-bit lane.
    const __m128i swap0 = _mm_and_si128(px0, mask_02);
    const __m128i swap1 = _mm_and_si128(px1, mask_02);
    // Bytes 1 and 3, which stay where they are.
    const __m128i keep0 = _mm_andnot_si128(mask_02, px0);
    const __m128i keep1 = _mm_andnot_si128(mask_02, px1);
    // Exchange the two 16-bit lanes of every pixel, low half then high half.
    const __m128i lo0 = _mm_shufflelo_epi16(swap0, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i lo1 = _mm_shufflelo_epi16(swap1, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i swapped0 = _mm_shufflehi_epi16(lo0, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i swapped1 = _mm_shufflehi_epi16(lo1, _MM_SHUFFLE(2, 3, 0, 1));
    _mm_storeu_si128(out + 0, _mm_or_si128(swapped0, keep0));
    _mm_storeu_si128(out + 1, _mm_or_si128(swapped1, keep1));
  }
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}
556
// Converts 32-bit pixels to 16-bit pixels made of the upper nibble of each
// channel, eight pixels per iteration. Byte order within each 16-bit output
// depends on WEBP_SWAP_16BIT_CSP. The remaining pixels go through the C
// version.
static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src,
                                       int num_pixels, uint8_t* dst) {
  const __m128i low_nibble = _mm_set1_epi8(0x0f);
  const __m128i high_nibble = _mm_set1_epi8((char)0xf0);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  for (; num_pixels >= 8; num_pixels -= 8, in += 2, ++out) {
    const __m128i px0 = _mm_loadu_si128(in + 0);  // bgra0|bgra1|bgra2|bgra3
    const __m128i px4 = _mm_loadu_si128(in + 1);  // bgra4|bgra5|bgra6|bgra7
    // Three rounds of byte interleaving gather each channel contiguously.
    const __m128i s0l = _mm_unpacklo_epi8(px0, px4);  // b0b4g0g4r0r4a0a4...
    const __m128i s0h = _mm_unpackhi_epi8(px0, px4);  // b2b6g2g6r2r6a2a6...
    const __m128i s1l = _mm_unpacklo_epi8(s0l, s0h);  // b0b2b4b6g0g2g4g6...
    const __m128i s1h = _mm_unpackhi_epi8(s0l, s0h);  // b1b3b5b7g1g3g5g7...
    const __m128i bg = _mm_unpacklo_epi8(s1l, s1h);   // b0...b7 | g0...g7
    const __m128i ra = _mm_unpackhi_epi8(s1l, s1h);   // r0...r7 | a0...a7
    const __m128i ga = _mm_unpackhi_epi64(bg, ra);    // g0...g7 | a0...a7
    const __m128i rb = _mm_unpacklo_epi64(ra, bg);    // r0...r7 | b0...b7
    // Red and blue keep their upper nibble in place; the upper nibble of
    // green and alpha moves down to the lower nibble.
    const __m128i rb_hi = _mm_and_si128(rb, high_nibble);
    const __m128i ga_lo = _mm_and_si128(_mm_srli_epi16(ga, 4), low_nibble);
    const __m128i rg_ba = _mm_or_si128(ga_lo, rb_hi);  // rg0..rg7 | ba0..ba7
    const __m128i ba = _mm_srli_si128(rg_ba, 8);       // ba0..ba7 | 0
#if (WEBP_SWAP_16BIT_CSP == 1)
    const __m128i packed = _mm_unpacklo_epi8(ba, rg_ba);  // barg0...barg7
#else
    const __m128i packed = _mm_unpacklo_epi8(rg_ba, ba);  // rgba0...rgba7
#endif
    _mm_storeu_si128(out, packed);
  }
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}
592
// Converts 32-bit pixels to 16-bit pixels with 5 bits of red, 6 of green and
// 5 of blue, eight pixels per iteration. Byte order within each 16-bit output
// depends on WEBP_SWAP_16BIT_CSP. The remaining pixels go through the C
// version.
static void ConvertBGRAToRGB565_SSE2(const uint32_t* src,
                                     int num_pixels, uint8_t* dst) {
  const __m128i top3 = _mm_set1_epi8((char)0xe0);
  const __m128i top5 = _mm_set1_epi8((char)0xf8);
  const __m128i low3 = _mm_set1_epi8(0x07);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  for (; num_pixels >= 8; num_pixels -= 8, in += 2, ++out) {
    const __m128i px0 = _mm_loadu_si128(in + 0);  // bgra0|bgra1|bgra2|bgra3
    const __m128i px4 = _mm_loadu_si128(in + 1);  // bgra4|bgra5|bgra6|bgra7
    // Three rounds of byte interleaving gather each channel contiguously.
    const __m128i s0l = _mm_unpacklo_epi8(px0, px4);  // b0b4g0g4r0r4a0a4...
    const __m128i s0h = _mm_unpackhi_epi8(px0, px4);  // b2b6g2g6r2r6a2a6...
    const __m128i s1l = _mm_unpacklo_epi8(s0l, s0h);  // b0b2b4b6g0g2g4g6...
    const __m128i s1h = _mm_unpackhi_epi8(s0l, s0h);  // b1b3b5b7g1g3g5g7...
    const __m128i bg = _mm_unpacklo_epi8(s1l, s1h);   // b0...b7 | g0...g7
    const __m128i ra = _mm_unpackhi_epi8(s1l, s1h);   // r0...r7 | a0...a7
    const __m128i ga = _mm_unpackhi_epi64(bg, ra);    // g0...g7 | a0...a7
    const __m128i rb = _mm_unpacklo_epi64(ra, bg);    // r0...r7 | b0...b7
    // Upper 5 bits of red and blue, still in the upper part of each byte.
    const __m128i rb5 = _mm_and_si128(rb, top5);
    // Green bits 7..5 moved to bits 2..0 (only the low 8 bytes are used).
    const __m128i g_upper = _mm_and_si128(_mm_srli_epi16(ga, 5), low3);
    // Green bits 4..2 moved to bits 7..5 (only the low 8 bytes are used).
    const __m128i g_lower = _mm_and_si128(_mm_slli_epi16(ga, 3), top3);
    const __m128i b5 = _mm_srli_si128(rb5, 8);          // blue to the low half
    const __m128i rg = _mm_or_si128(rb5, g_upper);      // red5:green3
    const __m128i gb = _mm_or_si128(_mm_srli_epi16(b5, 3), g_lower);  // g3:b5
#if (WEBP_SWAP_16BIT_CSP == 1)
    const __m128i packed = _mm_unpacklo_epi8(gb, rg);
#else
    const __m128i packed = _mm_unpacklo_epi8(rg, gb);
#endif
    _mm_storeu_si128(out, packed);
  }
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}
633
// Converts 'num_pixels' 32-bit pixels from 'src' into packed 3-byte pixels in
// 'dst' by dropping the top byte of every 32-bit value. Eight pixels are
// converted per iteration; the remaining ones go through the C version.
static void ConvertBGRAToBGR_SSE2(const uint32_t* src,
                                  int num_pixels, uint8_t* dst) {
  const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);
  const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);
  const __m128i* in = (const __m128i*)src;
  // An iteration produces 24 bytes, but its last 8-byte store starts at offset
  // 18 and so touches 26 bytes: at least 9 pixels (27 bytes) must remain for
  // it to stay inside the output. The test is done on the pixel count rather
  // than on 'dst + 26', which would form a pointer past the end of the output
  // buffer (undefined behavior) when fewer than 26 bytes remain.
  while (num_pixels >= 9) {
    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    const __m128i a0l = _mm_and_si128(bgra0, mask_l);   // bgr0|0|bgr0|0
    const __m128i a4l = _mm_and_si128(bgra4, mask_l);   // bgr0|0|bgr0|0
    const __m128i a0h = _mm_and_si128(bgra0, mask_h);   // 0|bgr0|0|bgr0
    const __m128i a4h = _mm_and_si128(bgra4, mask_h);   // 0|bgr0|0|bgr0
    // Slide the odd pixels down one byte so they directly follow the even
    // ones inside each 64-bit half.
    const __m128i b0h = _mm_srli_epi64(a0h, 8);
    const __m128i b4h = _mm_srli_epi64(a4h, 8);
    // Each 64-bit half now holds two packed pixels (6 bytes) and 2 zero bytes.
    const __m128i c0 = _mm_or_si128(a0l, b0h);
    const __m128i c4 = _mm_or_si128(a4l, b4h);
    const __m128i c2 = _mm_srli_si128(c0, 8);
    const __m128i c6 = _mm_srli_si128(c4, 8);
    // Every store writes 8 bytes; the 2 padding bytes are overwritten by the
    // following store (or by the pixels converted after this iteration).
    _mm_storel_epi64((__m128i*)(dst + 0), c0);
    _mm_storel_epi64((__m128i*)(dst + 6), c2);
    _mm_storel_epi64((__m128i*)(dst + 12), c4);
    _mm_storel_epi64((__m128i*)(dst + 18), c6);
    dst += 24;
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, dst);
  }
}
666
667//------------------------------------------------------------------------------
668// Entry point
669
670extern void VP8LDspInitSSE2(void);
671
672WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE2(void) {
673 VP8LPredictors[5] = Predictor5_SSE2;
674 VP8LPredictors[6] = Predictor6_SSE2;
675 VP8LPredictors[7] = Predictor7_SSE2;
676 VP8LPredictors[8] = Predictor8_SSE2;
677 VP8LPredictors[9] = Predictor9_SSE2;
678 VP8LPredictors[10] = Predictor10_SSE2;
679 VP8LPredictors[11] = Predictor11_SSE2;
680 VP8LPredictors[12] = Predictor12_SSE2;
681 VP8LPredictors[13] = Predictor13_SSE2;
682
683 VP8LPredictorsAdd[0] = PredictorAdd0_SSE2;
684 VP8LPredictorsAdd[1] = PredictorAdd1_SSE2;
685 VP8LPredictorsAdd[2] = PredictorAdd2_SSE2;
686 VP8LPredictorsAdd[3] = PredictorAdd3_SSE2;
687 VP8LPredictorsAdd[4] = PredictorAdd4_SSE2;
688 VP8LPredictorsAdd[5] = PredictorAdd5_SSE2;
689 VP8LPredictorsAdd[6] = PredictorAdd6_SSE2;
690 VP8LPredictorsAdd[7] = PredictorAdd7_SSE2;
691 VP8LPredictorsAdd[8] = PredictorAdd8_SSE2;
692 VP8LPredictorsAdd[9] = PredictorAdd9_SSE2;
693 VP8LPredictorsAdd[10] = PredictorAdd10_SSE2;
694 VP8LPredictorsAdd[11] = PredictorAdd11_SSE2;
695 VP8LPredictorsAdd[12] = PredictorAdd12_SSE2;
696 VP8LPredictorsAdd[13] = PredictorAdd13_SSE2;
697
698 VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_SSE2;
699 VP8LTransformColorInverse = TransformColorInverse_SSE2;
700
701 VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE2;
702 VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_SSE2;
703 VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_SSE2;
704 VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_SSE2;
705 VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE2;
706}
707
708#else // !WEBP_USE_SSE2
709
710WEBP_DSP_INIT_STUB(VP8LDspInitSSE2)
711
712#endif // WEBP_USE_SSE2
713