// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Utilities for processing transparent channel.
//
// Author: Skal (pascal.massimino@gmail.com)
| 13 |  | 
|---|
| 14 | #include "./dsp.h" | 
|---|
| 15 |  | 
|---|
| 16 | #if defined(WEBP_USE_SSE2) | 
|---|
| 17 | #include <emmintrin.h> | 
|---|
| 18 |  | 
|---|
| 19 | //------------------------------------------------------------------------------ | 
|---|
| 20 |  | 
|---|
// Copies one alpha byte per pixel from 'alpha' into the first byte of each
// 4-byte quadruplet of 'dst' (the other three bytes are preserved).
// Returns true (1) if any copied alpha value differs from 0xff.
static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
                         int width, int height,
                         uint8_t* dst, int dst_stride) {
  // Running bitwise-AND of every alpha value seen; it stays 0xff only when
  // all samples are fully opaque.
  uint32_t and_acc = 0xff;
  int x, y;
  const __m128i zero = _mm_setzero_si128();
  const __m128i rgb_mask = _mm_set1_epi32(0xffffff00u);  // keeps the RGB bytes
  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
  __m128i and_vec = all_0xff;

  // The vector loop stops one 8-pixel group early: we must be able to access
  // 3 extra bytes after the last written byte 'dst[4 * width - 4]', because
  // we don't know if alpha is the first or the last byte of the quadruplet.
  const int vec_end = (width - 1) & ~7;

  for (y = 0; y < height; ++y) {
    __m128i* dst128 = (__m128i*)dst;
    for (x = 0; x < vec_end; x += 8) {
      // Widen 8 alpha bytes to eight 32-bit lanes (alpha in the low byte).
      const __m128i a8 = _mm_loadl_epi64((const __m128i*)&alpha[x]);
      const __m128i a16 = _mm_unpacklo_epi8(a8, zero);
      const __m128i a32_lo = _mm_unpacklo_epi16(a16, zero);
      const __m128i a32_hi = _mm_unpackhi_epi16(a16, zero);
      // Read 8 destination pixels, clear their low byte, merge the new alpha.
      const __m128i px_lo = _mm_loadu_si128(dst128 + 0);
      const __m128i px_hi = _mm_loadu_si128(dst128 + 1);
      const __m128i rgb_lo = _mm_and_si128(px_lo, rgb_mask);
      const __m128i rgb_hi = _mm_and_si128(px_hi, rgb_mask);
      _mm_storeu_si128(dst128 + 0, _mm_or_si128(rgb_lo, a32_lo));
      _mm_storeu_si128(dst128 + 1, _mm_or_si128(rgb_hi, a32_hi));
      // Fold the 8 raw alpha bytes into the vector accumulator.
      and_vec = _mm_and_si128(and_vec, a8);
      dst128 += 2;
    }
    // Scalar tail for the remaining pixels of the row.
    for (; x < width; ++x) {
      const uint32_t a = alpha[x];
      dst[4 * x] = a;
      and_acc &= a;
    }
    alpha += alpha_stride;
    dst += dst_stride;
  }
  // Merge the vector accumulator into the scalar one: the low 8 bits of the
  // movemask are all set only when the 8 accumulated bytes equal 0xff.
  and_acc &= _mm_movemask_epi8(_mm_cmpeq_epi8(and_vec, all_0xff));
  return (and_acc != 0xff);
}
| 74 |  | 
|---|
// Writes each alpha byte into the second byte (the '<< 8' position — the
// green channel per this function's naming) of the corresponding 32-bit
// destination pixel; all other destination bytes are set to zero.
static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
                                 int width, int height,
                                 uint32_t* dst, int dst_stride) {
  int x, y;
  const __m128i zero = _mm_setzero_si128();
  const int vec_end = width & ~15;   // 16 pixels per SIMD iteration
  for (y = 0; y < height; ++y) {
    for (x = 0; x < vec_end; x += 16) {
      const __m128i a8 = _mm_loadu_si128((const __m128i*)&alpha[x]);
      // Interleaving a zero byte *below* each alpha byte yields 16-bit lanes
      // equal to (alpha << 8); a second zero-interleave widens to 32 bits.
      const __m128i g16_lo = _mm_unpacklo_epi8(zero, a8);
      const __m128i g16_hi = _mm_unpackhi_epi8(zero, a8);
      _mm_storeu_si128((__m128i*)&dst[x +  0], _mm_unpacklo_epi16(g16_lo, zero));
      _mm_storeu_si128((__m128i*)&dst[x +  4], _mm_unpackhi_epi16(g16_lo, zero));
      _mm_storeu_si128((__m128i*)&dst[x +  8], _mm_unpacklo_epi16(g16_hi, zero));
      _mm_storeu_si128((__m128i*)&dst[x + 12], _mm_unpackhi_epi16(g16_hi, zero));
    }
    // Scalar tail.
    for (; x < width; ++x) dst[x] = alpha[x] << 8;
    alpha += alpha_stride;
    dst += dst_stride;
  }
}
| 100 |  | 
|---|
// Extracts the first byte of each 4-byte pixel of 'argb' into the 'alpha'
// plane. Returns true (1) if every extracted value was 0xff (fully opaque).
// NOTE(review): the function name was missing in the source; it is restored
// as 'ExtractAlpha', the identifier assigned to WebPExtractAlpha in
// WebPInitAlphaProcessingSSE2().
static int ExtractAlpha(const uint8_t* argb, int argb_stride,
                        int width, int height,
                        uint8_t* alpha, int alpha_stride) {
  // alpha_and stores an 'and' operation of all the alpha[] values. The final
  // value is not 0xff if any of the alpha[] is not equal to 0xff.
  uint32_t alpha_and = 0xff;
  int i, j;
  const __m128i a_mask = _mm_set1_epi32(0xffu);  // to preserve alpha
  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
  __m128i all_alphas = all_0xff;

  // We must be able to access 3 extra bytes after the last written byte
  // 'src[4 * width - 4]', because we don't know if alpha is the first or the
  // last byte of the quadruplet.
  const int limit = (width - 1) & ~7;

  for (j = 0; j < height; ++j) {
    const __m128i* src = (const __m128i*)argb;
    for (i = 0; i < limit; i += 8) {
      // load 32 argb bytes (8 pixels)
      const __m128i a0 = _mm_loadu_si128(src + 0);
      const __m128i a1 = _mm_loadu_si128(src + 1);
      // keep only the low byte of each 32-bit lane...
      const __m128i b0 = _mm_and_si128(a0, a_mask);
      const __m128i b1 = _mm_and_si128(a1, a_mask);
      // ...then narrow the eight 32-bit values down to eight bytes.
      // (values are <= 0xff, so the signed/unsigned saturations are no-ops)
      const __m128i c0 = _mm_packs_epi32(b0, b1);
      const __m128i d0 = _mm_packus_epi16(c0, c0);
      // store
      _mm_storel_epi64((__m128i*)&alpha[i], d0);
      // accumulate eight alpha 'and' in parallel
      all_alphas = _mm_and_si128(all_alphas, d0);
      src += 2;
    }
    // Scalar tail for the remaining pixels of the row.
    for (; i < width; ++i) {
      const uint32_t alpha_value = argb[4 * i];
      alpha[i] = alpha_value;
      alpha_and &= alpha_value;
    }
    argb += argb_stride;
    alpha += alpha_stride;
  }
  // Combine the eight alpha 'and' into an 8-bit mask.
  alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
  return (alpha_and == 0xff);
}
| 145 |  | 
|---|
//------------------------------------------------------------------------------
// Non-dither premultiplied modes

// Fixed-point division by 255: for a 16-bit value v, v / 255 == (v * 0x8081) >> 23.
#define MULTIPLIER(a)   ((a) * 0x8081)
#define PREMULTIPLY(x, m) (((x) * (m)) >> 23)

// We can't use a 'const int' for the SHUFFLE value, because it has to be an
// immediate in the _mm_shufflexx_epi16() instructions. We really need a macro.
// We use: v / 255 = (v * 0x8081) >> 23, where v = alpha * {r,g,b} is a 16-bit
// value.
#define APPLY_ALPHA(RGBX, SHUFFLE) do {                                  \
  const __m128i px = _mm_loadu_si128((const __m128i*)&(RGBX));           \
  /* widen the 16 bytes (4 pixels) to 16-bit lanes */                    \
  const __m128i w_lo = _mm_unpacklo_epi8(px, zero);                      \
  const __m128i w_hi = _mm_unpackhi_epi8(px, zero);                      \
  /* plant 0xff in the lanes that SHUFFLE maps onto each alpha slot */   \
  const __m128i f_lo = _mm_or_si128(w_lo, kMask);                        \
  const __m128i f_hi = _mm_or_si128(w_hi, kMask);                       \
  /* broadcast each pixel's alpha over its three color lanes */          \
  const __m128i s1_lo = _mm_shufflelo_epi16(f_lo, SHUFFLE);              \
  const __m128i s1_hi = _mm_shufflelo_epi16(f_hi, SHUFFLE);              \
  const __m128i s2_lo = _mm_shufflehi_epi16(s1_lo, SHUFFLE);             \
  const __m128i s2_hi = _mm_shufflehi_epi16(s1_hi, SHUFFLE);             \
  /* s2 = [ff a0 a0 a0][ff a1 a1 a1] */                                  \
  /* alpha * value, then ((* 0x8081) >> 16) >> 7  ==  v * a / 255 */     \
  const __m128i p_lo = _mm_mullo_epi16(s2_lo, w_lo);                     \
  const __m128i p_hi = _mm_mullo_epi16(s2_hi, w_hi);                     \
  const __m128i q_lo = _mm_mulhi_epu16(p_lo, kMult);                     \
  const __m128i q_hi = _mm_mulhi_epu16(p_hi, kMult);                     \
  const __m128i r_lo = _mm_srli_epi16(q_lo, 7);                          \
  const __m128i r_hi = _mm_srli_epi16(q_hi, 7);                          \
  _mm_storeu_si128((__m128i*)&(RGBX), _mm_packus_epi16(r_lo, r_hi));     \
} while (0)

// Premultiplies, in place, each pixel's three color channels by the pixel's
// alpha value. 'alpha_first' selects where alpha lives in the quadruplet:
// byte 0 when set, byte 3 otherwise. The alpha byte itself is preserved.
static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first,
                                    int w, int h, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i kMult = _mm_set1_epi16(0x8081u);
  const __m128i kMask = _mm_set_epi16(0, 0xff, 0xff, 0, 0, 0xff, 0xff, 0);
  const int kSpan = 4;   // pixels per SIMD iteration
  while (h-- > 0) {
    uint32_t* const row = (uint32_t*)rgba;
    int x;
    // The shuffle immediate selects each pixel's alpha lane.
    if (alpha_first) {
      for (x = 0; x + kSpan <= w; x += kSpan) {
        APPLY_ALPHA(row[x], _MM_SHUFFLE(0, 0, 0, 1));
      }
    } else {
      for (x = 0; x + kSpan <= w; x += kSpan) {
        APPLY_ALPHA(row[x], _MM_SHUFFLE(2, 3, 3, 3));
      }
    }
    // Scalar handling of the trailing (w % kSpan) pixels.
    for (; x < w; ++x) {
      uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
      const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
      const uint32_t a = alpha[4 * x];
      if (a != 0xff) {
        const uint32_t mult = MULTIPLIER(a);
        rgb[4 * x + 0] = PREMULTIPLY(rgb[4 * x + 0], mult);
        rgb[4 * x + 1] = PREMULTIPLY(rgb[4 * x + 1], mult);
        rgb[4 * x + 2] = PREMULTIPLY(rgb[4 * x + 2], mult);
      }
    }
    rgba += stride;
  }
}
#undef MULTIPLIER
#undef PREMULTIPLY
| 212 |  | 
|---|
// -----------------------------------------------------------------------------
// Apply alpha value to rows

// Scales the color channels of each 32-bit pixel in 'ptr' by that pixel's
// alpha value, with rounding: v -> (a * v + 128) * 0x0101 >> 16, which
// approximates round(a * v / 255). Only the forward (!inverse) direction has
// an SSE2 fast path (two pixels per iteration); the leftover pixels and the
// whole 'inverse' mode are delegated to WebPMultARGBRowC.
// NOTE(review): which byte of the quadruplet is treated as alpha follows the
// kMask/shuffle pattern below — confirm against WebPMultARGBRowC's layout.
static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) {
  int x = 0;
  if (!inverse) {
    const int kSpan = 2;  // pixels handled per SIMD iteration
    const __m128i zero = _mm_setzero_si128();
    const __m128i k128 = _mm_set1_epi16(128);      // rounding bias
    const __m128i kMult = _mm_set1_epi16(0x0101);  // for the '/255' fixup
    const __m128i kMask = _mm_set_epi16(0, 0xff, 0, 0, 0, 0xff, 0, 0);
    for (x = 0; x + kSpan <= width; x += kSpan) {
      // To compute 'result = (int)(a * x / 255. + .5)', we use:
      //   tmp = a * v + 128, result = (tmp * 0x0101u) >> 16
      const __m128i A0 = _mm_loadl_epi64((const __m128i*)&ptr[x]);
      const __m128i A1 = _mm_unpacklo_epi8(A0, zero);  // 8 channels -> 16-bit lanes
      const __m128i A2 = _mm_or_si128(A1, kMask);      // plant 0xff where the
                                                       // shuffle maps the alpha lane
      const __m128i A3 = _mm_shufflelo_epi16(A2, _MM_SHUFFLE(2, 3, 3, 3));
      const __m128i A4 = _mm_shufflehi_epi16(A3, _MM_SHUFFLE(2, 3, 3, 3));
      // here, A4 = [ff a0 a0 a0][ff a1 a1 a1]
      const __m128i A5 = _mm_mullo_epi16(A4, A1);      // a * v
      const __m128i A6 = _mm_add_epi16(A5, k128);      // + rounding bias
      const __m128i A7 = _mm_mulhi_epu16(A6, kMult);   // (tmp * 0x0101) >> 16
      const __m128i A10 = _mm_packus_epi16(A7, zero);  // narrow back to 8 bytes
      _mm_storel_epi64((__m128i*)&ptr[x], A10);
    }
  }
  // C fallback for the tail (and for the 'inverse' mode entirely).
  width -= x;
  if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
}
| 243 |  | 
|---|
// In-place scaling of a row of 8-bit values by their per-pixel alpha:
//   ptr[x] = (ptr[x] * alpha[x] + 128) * 0x0101 >> 16  (~ round(v * a / 255)).
// Only 'inverse == 0' has an SSE2 fast path; the remaining pixels (and the
// whole 'inverse' mode) are handled by the C fallback WebPMultRowC.
static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
                         int width, int inverse) {
  int x = 0;
  if (!inverse) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i bias = _mm_set1_epi16(128);     // rounding bias
    const __m128i fix = _mm_set1_epi16(0x0101);   // '/255' fixup multiplier
    // Eight pixels per iteration.
    while (x + 8 <= width) {
      const __m128i v8 = _mm_loadl_epi64((__m128i*)&ptr[x]);
      const __m128i a8 = _mm_loadl_epi64((const __m128i*)&alpha[x]);
      const __m128i v16 = _mm_unpacklo_epi8(v8, zero);
      const __m128i a16 = _mm_unpacklo_epi8(a8, zero);
      const __m128i prod = _mm_add_epi16(_mm_mullo_epi16(v16, a16), bias);
      const __m128i scaled = _mm_mulhi_epu16(prod, fix);
      _mm_storel_epi64((__m128i*)&ptr[x], _mm_packus_epi16(scaled, zero));
      x += 8;
    }
  }
  width -= x;
  if (width > 0) WebPMultRowC(ptr + x, alpha + x, width, inverse);
}
| 266 |  | 
|---|
| 267 | //------------------------------------------------------------------------------ | 
|---|
| 268 | // Entry point | 
|---|
| 269 |  | 
|---|
| 270 | extern void WebPInitAlphaProcessingSSE2(void); | 
|---|
| 271 |  | 
|---|
| 272 | WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) { | 
|---|
| 273 | WebPMultARGBRow = MultARGBRow_SSE2; | 
|---|
| 274 | WebPMultRow = MultRow_SSE2; | 
|---|
| 275 | WebPApplyAlphaMultiply = ApplyAlphaMultiply_SSE2; | 
|---|
| 276 | WebPDispatchAlpha = DispatchAlpha; | 
|---|
| 277 | WebPDispatchAlphaToGreen = DispatchAlphaToGreen; | 
|---|
| 278 | WebPExtractAlpha = ExtractAlpha; | 
|---|
| 279 | } | 
|---|
| 280 |  | 
|---|
| 281 | #else  // !WEBP_USE_SSE2 | 
|---|
| 282 |  | 
|---|
| 283 | WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingSSE2) | 
|---|
| 284 |  | 
|---|
| 285 | #endif  // WEBP_USE_SSE2 | 
|---|
| 286 |  | 
|---|