| 1 | // Copyright 2014 Google Inc. All Rights Reserved. |
| 2 | // |
| 3 | // Use of this source code is governed by a BSD-style license |
| 4 | // that can be found in the COPYING file in the root of the source |
| 5 | // tree. An additional intellectual property rights grant can be found |
| 6 | // in the file PATENTS. All contributing project authors may |
| 7 | // be found in the AUTHORS file in the root of the source tree. |
| 8 | // ----------------------------------------------------------------------------- |
| 9 | // |
| 10 | // Utilities for processing transparent channel. |
| 11 | // |
| 12 | // Author: Skal (pascal.massimino@gmail.com) |
| 13 | |
| 14 | #include "src/dsp/dsp.h" |
| 15 | |
| 16 | #if defined(WEBP_USE_SSE2) |
| 17 | #include <emmintrin.h> |
| 18 | |
| 19 | //------------------------------------------------------------------------------ |
| 20 | |
// Copies the planar 'alpha' values into the alpha channel (byte 0 of each
// little-endian 32-bit pixel) of the interleaved 'dst' buffer, preserving the
// other three (RGB) bytes. Returns 1 if any written alpha value is not 0xff
// (i.e. the image has actual transparency), 0 otherwise.
static int DispatchAlpha_SSE2(const uint8_t* WEBP_RESTRICT alpha,
                              int alpha_stride, int width, int height,
                              uint8_t* WEBP_RESTRICT dst, int dst_stride) {
  // alpha_and stores an 'and' operation of all the alpha[] values. The final
  // value is not 0xff if any of the alpha[] is not equal to 0xff.
  uint32_t alpha_and = 0xff;
  int i, j;
  const __m128i zero = _mm_setzero_si128();
  const __m128i rgb_mask = _mm_set1_epi32((int)0xffffff00); // to preserve RGB
  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0, ~0);  // 0xff in low 8 bytes
  __m128i all_alphas = all_0xff;

  // We must be able to access 3 extra bytes after the last written byte
  // 'dst[4 * width - 4]', because we don't know if alpha is the first or the
  // last byte of the quadruplet.
  const int limit = (width - 1) & ~7;

  for (j = 0; j < height; ++j) {
    __m128i* out = (__m128i*)dst;
    for (i = 0; i < limit; i += 8) {
      // load 8 alpha bytes
      const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[i]);
      // widen each alpha byte into the low byte of its own 32-bit lane
      const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
      const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
      const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
      // load 8 dst pixels (32 bytes)
      const __m128i b0_lo = _mm_loadu_si128(out + 0);
      const __m128i b0_hi = _mm_loadu_si128(out + 1);
      // mask dst alpha values
      const __m128i b1_lo = _mm_and_si128(b0_lo, rgb_mask);
      const __m128i b1_hi = _mm_and_si128(b0_hi, rgb_mask);
      // combine
      const __m128i b2_lo = _mm_or_si128(b1_lo, a2_lo);
      const __m128i b2_hi = _mm_or_si128(b1_hi, a2_hi);
      // store
      _mm_storeu_si128(out + 0, b2_lo);
      _mm_storeu_si128(out + 1, b2_hi);
      // accumulate eight alpha 'and' in parallel
      all_alphas = _mm_and_si128(all_alphas, a0);
      out += 2;
    }
    // Scalar tail; also covers the last pixel excluded from 'limit' above.
    for (; i < width; ++i) {
      const uint32_t alpha_value = alpha[i];
      dst[4 * i] = alpha_value;
      alpha_and &= alpha_value;
    }
    alpha += alpha_stride;
    dst += dst_stride;
  }
  // Combine the eight alpha 'and' into a 8-bit mask.
  alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
  return (alpha_and != 0xff);
}
| 74 | |
// Copies the planar 'alpha' values into the green channel (byte 1 of each
// little-endian 32-bit pixel) of 'dst': dst[i] = alpha[i] << 8, with the
// other three bytes cleared to zero.
static void DispatchAlphaToGreen_SSE2(const uint8_t* WEBP_RESTRICT alpha,
                                      int alpha_stride, int width, int height,
                                      uint32_t* WEBP_RESTRICT dst,
                                      int dst_stride) {
  int i, j;
  const __m128i zero = _mm_setzero_si128();
  const int limit = width & ~15;  // largest multiple of 16 <= width
  for (j = 0; j < height; ++j) {
    for (i = 0; i < limit; i += 16) {   // process 16 alpha bytes
      const __m128i a0 = _mm_loadu_si128((const __m128i*)&alpha[i]);
      // Interleaving with 'zero' FIRST places alpha in the high byte of each
      // 16-bit lane, i.e. byte 1 of the eventual 32-bit pixel (the '<< 8').
      const __m128i a1 = _mm_unpacklo_epi8(zero, a0);  // note the 'zero' first!
      const __m128i b1 = _mm_unpackhi_epi8(zero, a0);
      const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
      const __m128i b2_lo = _mm_unpacklo_epi16(b1, zero);
      const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
      const __m128i b2_hi = _mm_unpackhi_epi16(b1, zero);
      _mm_storeu_si128((__m128i*)&dst[i +  0], a2_lo);
      _mm_storeu_si128((__m128i*)&dst[i +  4], a2_hi);
      _mm_storeu_si128((__m128i*)&dst[i +  8], b2_lo);
      _mm_storeu_si128((__m128i*)&dst[i + 12], b2_hi);
    }
    // Scalar tail for the remaining (width - limit) pixels.
    for (; i < width; ++i) dst[i] = alpha[i] << 8;
    alpha += alpha_stride;
    dst += dst_stride;
  }
}
| 101 | |
| 102 | static int (const uint8_t* WEBP_RESTRICT argb, int argb_stride, |
| 103 | int width, int height, |
| 104 | uint8_t* WEBP_RESTRICT alpha, int alpha_stride) { |
| 105 | // alpha_and stores an 'and' operation of all the alpha[] values. The final |
| 106 | // value is not 0xff if any of the alpha[] is not equal to 0xff. |
| 107 | uint32_t alpha_and = 0xff; |
| 108 | int i, j; |
| 109 | const __m128i a_mask = _mm_set1_epi32(0xff); // to preserve alpha |
| 110 | const __m128i all_0xff = _mm_set_epi32(0, 0, ~0, ~0); |
| 111 | __m128i all_alphas = all_0xff; |
| 112 | |
| 113 | // We must be able to access 3 extra bytes after the last written byte |
| 114 | // 'src[4 * width - 4]', because we don't know if alpha is the first or the |
| 115 | // last byte of the quadruplet. |
| 116 | const int limit = (width - 1) & ~7; |
| 117 | |
| 118 | for (j = 0; j < height; ++j) { |
| 119 | const __m128i* src = (const __m128i*)argb; |
| 120 | for (i = 0; i < limit; i += 8) { |
| 121 | // load 32 argb bytes |
| 122 | const __m128i a0 = _mm_loadu_si128(src + 0); |
| 123 | const __m128i a1 = _mm_loadu_si128(src + 1); |
| 124 | const __m128i b0 = _mm_and_si128(a0, a_mask); |
| 125 | const __m128i b1 = _mm_and_si128(a1, a_mask); |
| 126 | const __m128i c0 = _mm_packs_epi32(b0, b1); |
| 127 | const __m128i d0 = _mm_packus_epi16(c0, c0); |
| 128 | // store |
| 129 | _mm_storel_epi64((__m128i*)&alpha[i], d0); |
| 130 | // accumulate eight alpha 'and' in parallel |
| 131 | all_alphas = _mm_and_si128(all_alphas, d0); |
| 132 | src += 2; |
| 133 | } |
| 134 | for (; i < width; ++i) { |
| 135 | const uint32_t alpha_value = argb[4 * i]; |
| 136 | alpha[i] = alpha_value; |
| 137 | alpha_and &= alpha_value; |
| 138 | } |
| 139 | argb += argb_stride; |
| 140 | alpha += alpha_stride; |
| 141 | } |
| 142 | // Combine the eight alpha 'and' into a 8-bit mask. |
| 143 | alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff)); |
| 144 | return (alpha_and == 0xff); |
| 145 | } |
| 146 | |
| 147 | //------------------------------------------------------------------------------ |
| 148 | // Non-dither premultiplied modes |
| 149 | |
// Scalar division-by-255 with the multiplicative trick:
// v / 255 == (v * 0x8081) >> 23 for the 16-bit products used here.
#define MULTIPLIER(a) ((a) * 0x8081)
#define PREMULTIPLY(x, m) (((x) * (m)) >> 23)

// We can't use a 'const int' for the SHUFFLE value, because it has to be an
// immediate in the _mm_shufflexx_epi16() instruction. We really need a macro.
// We use: v / 255 = (v * 0x8081) >> 23, where v = alpha * {r,g,b} is a 16bit
// value.
#define APPLY_ALPHA(RGBX, SHUFFLE) do {                              \
  const __m128i argb0 = _mm_loadu_si128((const __m128i*)&(RGBX));    \
  const __m128i argb1_lo = _mm_unpacklo_epi8(argb0, zero);           \
  const __m128i argb1_hi = _mm_unpackhi_epi8(argb0, zero);           \
  const __m128i alpha0_lo = _mm_or_si128(argb1_lo, kMask);           \
  const __m128i alpha0_hi = _mm_or_si128(argb1_hi, kMask);           \
  const __m128i alpha1_lo = _mm_shufflelo_epi16(alpha0_lo, SHUFFLE); \
  const __m128i alpha1_hi = _mm_shufflelo_epi16(alpha0_hi, SHUFFLE); \
  const __m128i alpha2_lo = _mm_shufflehi_epi16(alpha1_lo, SHUFFLE); \
  const __m128i alpha2_hi = _mm_shufflehi_epi16(alpha1_hi, SHUFFLE); \
  /* alpha2 = [ff a0 a0 a0][ff a1 a1 a1] */                          \
  const __m128i A0_lo = _mm_mullo_epi16(alpha2_lo, argb1_lo);        \
  const __m128i A0_hi = _mm_mullo_epi16(alpha2_hi, argb1_hi);        \
  const __m128i A1_lo = _mm_mulhi_epu16(A0_lo, kMult);               \
  const __m128i A1_hi = _mm_mulhi_epu16(A0_hi, kMult);               \
  const __m128i A2_lo = _mm_srli_epi16(A1_lo, 7);                    \
  const __m128i A2_hi = _mm_srli_epi16(A1_hi, 7);                    \
  const __m128i A3 = _mm_packus_epi16(A2_lo, A2_hi);                 \
  _mm_storeu_si128((__m128i*)&(RGBX), A3);                           \
} while (0)

// In-place premultiplies the RGB channels of 'rgba' by the pixel's alpha:
// c = (c * a) / 255. 'alpha_first' selects ARGB (alpha in byte 0) vs. RGBA
// (alpha in byte 3) layout; the two SHUFFLE immediates broadcast the alpha
// lane accordingly. 'w' pixels per row, 'h' rows, 'stride' bytes per row.
static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first,
                                    int w, int h, int stride) {
  const __m128i zero = _mm_setzero_si128();
  // kMult implements the >>16 half of the (v * 0x8081) >> 23 division trick
  // (the remaining >>7 is done by _mm_srli_epi16 in APPLY_ALPHA).
  const __m128i kMult = _mm_set1_epi16((short)0x8081);
  // kMask forces the alpha lane itself to 0xff so alpha is left unchanged.
  const __m128i kMask = _mm_set_epi16(0, 0xff, 0xff, 0, 0, 0xff, 0xff, 0);
  const int kSpan = 4;  // pixels per SIMD iteration
  while (h-- > 0) {
    uint32_t* const rgbx = (uint32_t*)rgba;
    int i;
    if (!alpha_first) {
      for (i = 0; i + kSpan <= w; i += kSpan) {
        APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(2, 3, 3, 3));
      }
    } else {
      for (i = 0; i + kSpan <= w; i += kSpan) {
        APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 0, 0, 1));
      }
    }
    // Finish with left-overs.
    for (; i < w; ++i) {
      uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
      const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
      const uint32_t a = alpha[4 * i];
      if (a != 0xff) {
        const uint32_t mult = MULTIPLIER(a);
        rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);
        rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);
        rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);
      }
    }
    rgba += stride;
  }
}
#undef MULTIPLIER
#undef PREMULTIPLY
| 213 | |
| 214 | //------------------------------------------------------------------------------ |
| 215 | // Alpha detection |
| 216 | |
// Returns 1 as soon as one of the 'length' bytes of 'src' differs from 0xff,
// and 0 when they are all equal to 0xff (fully opaque alpha plane).
static int HasAlpha8b_SSE2(const uint8_t* src, int length) {
  const __m128i kAllOpaque = _mm_set1_epi8((char)0xff);
  int pos = 0;
  // Compare 16 bytes per iteration against 0xff; any mismatching byte clears
  // a bit in the movemask.
  while (pos + 16 <= length) {
    const __m128i chunk = _mm_loadu_si128((const __m128i*)(src + pos));
    const __m128i eq = _mm_cmpeq_epi8(chunk, kAllOpaque);
    if (_mm_movemask_epi8(eq) != 0xffff) return 1;
    pos += 16;
  }
  // Scalar tail for the remaining (< 16) bytes.
  while (pos < length) {
    if (src[pos] != 0xff) return 1;
    ++pos;
  }
  return 0;
}
| 229 | |
// Returns 1 if any of the 'length' 32-bit pixels of 'src' has an alpha value
// different from 0xff, 0 otherwise. The alpha byte checked is byte 0 of each
// little-endian quadruplet (the byte selected by 'alpha_mask').
static int HasAlpha32b_SSE2(const uint8_t* src, int length) {
  const __m128i alpha_mask = _mm_set1_epi32(0xff);
  const __m128i all_0xff = _mm_set1_epi8((char)0xff);
  int i = 0;
  // We don't know if we can access the last 3 bytes after the last alpha
  // value 'src[4 * length - 4]' (because we don't know if alpha is the first
  // or the last byte of the quadruplet). Hence the '-3' protection below.
  length = length * 4 - 3;   // size in bytes
  // Main loop: 16 pixels (64 bytes) per iteration.
  for (; i + 64 <= length; i += 64) {
    const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i +  0));
    const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16));
    const __m128i a2 = _mm_loadu_si128((const __m128i*)(src + i + 32));
    const __m128i a3 = _mm_loadu_si128((const __m128i*)(src + i + 48));
    // Isolate the alpha bytes and narrow 32-bit -> 8-bit for a single compare.
    const __m128i b0 = _mm_and_si128(a0, alpha_mask);
    const __m128i b1 = _mm_and_si128(a1, alpha_mask);
    const __m128i b2 = _mm_and_si128(a2, alpha_mask);
    const __m128i b3 = _mm_and_si128(a3, alpha_mask);
    const __m128i c0 = _mm_packs_epi32(b0, b1);
    const __m128i c1 = _mm_packs_epi32(b2, b3);
    const __m128i d  = _mm_packus_epi16(c0, c1);
    const __m128i bits = _mm_cmpeq_epi8(d, all_0xff);
    const int mask = _mm_movemask_epi8(bits);
    if (mask != 0xffff) return 1;
  }
  // Secondary loop: 8 pixels (32 bytes) per iteration.
  for (; i + 32 <= length; i += 32) {
    const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i +  0));
    const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16));
    const __m128i b0 = _mm_and_si128(a0, alpha_mask);
    const __m128i b1 = _mm_and_si128(a1, alpha_mask);
    const __m128i c  = _mm_packs_epi32(b0, b1);
    const __m128i d  = _mm_packus_epi16(c, c);
    const __m128i bits = _mm_cmpeq_epi8(d, all_0xff);
    const int mask = _mm_movemask_epi8(bits);
    if (mask != 0xffff) return 1;
  }
  // Scalar tail over the remaining alpha bytes; '<=' (not '<') because
  // 'length' was shrunk by 3 above, so the last alpha byte sits exactly at
  // index 'length - 1 .. length' range boundary (4 * original_length - 4).
  for (; i <= length; i += 4) if (src[i] != 0xff) return 1;
  return 0;
}
| 268 | |
// Replaces, in place, every pixel of 'src' whose alpha (top byte of the
// little-endian 32-bit value) is zero with the given 'color'.
static void AlphaReplace_SSE2(uint32_t* src, int length, uint32_t color) {
  const __m128i fill = _mm_set1_epi32((int)color);
  const __m128i zero = _mm_setzero_si128();
  int x = 0;
  // Vector loop: 8 pixels per iteration.
  for (; x + 8 <= length; x += 8) {
    const __m128i p0 = _mm_loadu_si128((const __m128i*)(src + x + 0));
    const __m128i p1 = _mm_loadu_si128((const __m128i*)(src + x + 4));
    // Arithmetic >>24 leaves 0 only for pixels whose alpha byte is 0.
    const __m128i is_transparent0 = _mm_cmpeq_epi32(_mm_srai_epi32(p0, 24), zero);
    const __m128i is_transparent1 = _mm_cmpeq_epi32(_mm_srai_epi32(p1, 24), zero);
    // Blend: take 'color' where transparent, the original pixel elsewhere.
    const __m128i out0 = _mm_or_si128(_mm_and_si128(is_transparent0, fill),
                                      _mm_andnot_si128(is_transparent0, p0));
    const __m128i out1 = _mm_or_si128(_mm_and_si128(is_transparent1, fill),
                                      _mm_andnot_si128(is_transparent1, p1));
    _mm_storeu_si128((__m128i*)(src + x + 0), out0);
    _mm_storeu_si128((__m128i*)(src + x + 4), out1);
  }
  // Scalar tail.
  for (; x < length; ++x) {
    if ((src[x] >> 24) == 0) src[x] = color;
  }
}
| 289 | |
| 290 | // ----------------------------------------------------------------------------- |
| 291 | // Apply alpha value to rows |
| 292 | |
// Premultiplies a row of 'width' ARGB pixels in place: each color channel is
// multiplied by the pixel's own alpha, normalized by 255 with rounding
// ((a * v + 128) * 0x0101 >> 16). The SSE2 path only handles the
// non-inverse case, two pixels at a time; leftovers and the 'inverse'
// (un-premultiply) case are delegated to WebPMultARGBRow_C.
static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) {
  int x = 0;
  if (!inverse) {
    const int kSpan = 2;  // pixels per iteration
    const __m128i zero = _mm_setzero_si128();
    const __m128i k128 = _mm_set1_epi16(128);
    const __m128i kMult = _mm_set1_epi16(0x0101);
    // kMask forces the alpha lane to 0xff so alpha itself is preserved.
    const __m128i kMask = _mm_set_epi16(0, 0xff, 0, 0, 0, 0xff, 0, 0);
    for (x = 0; x + kSpan <= width; x += kSpan) {
      // To compute 'result = (int)(a * x / 255. + .5)', we use:
      //   tmp = a * v + 128, result = (tmp * 0x0101u) >> 16
      const __m128i A0 = _mm_loadl_epi64((const __m128i*)&ptr[x]);
      const __m128i A1 = _mm_unpacklo_epi8(A0, zero);
      const __m128i A2 = _mm_or_si128(A1, kMask);
      // Broadcast the alpha lane across the other three channels.
      const __m128i A3 = _mm_shufflelo_epi16(A2, _MM_SHUFFLE(2, 3, 3, 3));
      const __m128i A4 = _mm_shufflehi_epi16(A3, _MM_SHUFFLE(2, 3, 3, 3));
      // here, A4 = [ff a0 a0 a0][ff a1 a1 a1]
      const __m128i A5 = _mm_mullo_epi16(A4, A1);
      const __m128i A6 = _mm_add_epi16(A5, k128);
      const __m128i A7 = _mm_mulhi_epu16(A6, kMult);
      const __m128i A10 = _mm_packus_epi16(A7, zero);
      _mm_storel_epi64((__m128i*)&ptr[x], A10);
    }
  }
  // Fall back to the C version for the tail (and the whole row if 'inverse').
  width -= x;
  if (width > 0) WebPMultARGBRow_C(ptr + x, width, inverse);
}
| 320 | |
// Multiplies each byte of 'ptr' by the corresponding 'alpha' byte, normalized
// by 255 with rounding: ptr[x] = (ptr[x] * alpha[x] + 128) * 0x0101 >> 16.
// The SSE2 path handles 8 bytes at a time and only the non-inverse case;
// leftovers and the 'inverse' case are delegated to WebPMultRow_C.
static void MultRow_SSE2(uint8_t* WEBP_RESTRICT const ptr,
                         const uint8_t* WEBP_RESTRICT const alpha,
                         int width, int inverse) {
  int x = 0;
  if (!inverse) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i k128 = _mm_set1_epi16(128);
    const __m128i kMult = _mm_set1_epi16(0x0101);
    for (x = 0; x + 8 <= width; x += 8) {
      const __m128i v0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
      const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[x]);
      // Widen both operands to 16 bits, then apply the (v*a+128)*0x0101>>16
      // division-by-255-with-rounding trick.
      const __m128i v1 = _mm_unpacklo_epi8(v0, zero);
      const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
      const __m128i v2 = _mm_mullo_epi16(v1, a1);
      const __m128i v3 = _mm_add_epi16(v2, k128);
      const __m128i v4 = _mm_mulhi_epu16(v3, kMult);
      const __m128i v5 = _mm_packus_epi16(v4, zero);
      _mm_storel_epi64((__m128i*)&ptr[x], v5);
    }
  }
  // Fall back to the C version for the tail (and the whole row if 'inverse').
  width -= x;
  if (width > 0) WebPMultRow_C(ptr + x, alpha + x, width, inverse);
}
| 344 | |
| 345 | //------------------------------------------------------------------------------ |
| 346 | // Entry point |
| 347 | |
| 348 | extern void WebPInitAlphaProcessingSSE2(void); |
| 349 | |
| 350 | WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) { |
| 351 | WebPMultARGBRow = MultARGBRow_SSE2; |
| 352 | WebPMultRow = MultRow_SSE2; |
| 353 | WebPApplyAlphaMultiply = ApplyAlphaMultiply_SSE2; |
| 354 | WebPDispatchAlpha = DispatchAlpha_SSE2; |
| 355 | WebPDispatchAlphaToGreen = DispatchAlphaToGreen_SSE2; |
| 356 | WebPExtractAlpha = ExtractAlpha_SSE2; |
| 357 | |
| 358 | WebPHasAlpha8b = HasAlpha8b_SSE2; |
| 359 | WebPHasAlpha32b = HasAlpha32b_SSE2; |
| 360 | WebPAlphaReplace = AlphaReplace_SSE2; |
| 361 | } |
| 362 | |
| 363 | #else // !WEBP_USE_SSE2 |
| 364 | |
| 365 | WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingSSE2) |
| 366 | |
| 367 | #endif // WEBP_USE_SSE2 |
| 368 | |