| 1 | // Copyright 2021 Google Inc. All Rights Reserved. |
| 2 | // |
| 3 | // Use of this source code is governed by a BSD-style license |
| 4 | // that can be found in the COPYING file in the root of the source |
| 5 | // tree. An additional intellectual property rights grant can be found |
| 6 | // in the file PATENTS. All contributing project authors may |
| 7 | // be found in the AUTHORS file in the root of the source tree. |
| 8 | // ----------------------------------------------------------------------------- |
| 9 | // |
| 10 | // SSE41 variant of methods for lossless decoder |
| 11 | |
| 12 | #include "src/dsp/dsp.h" |
| 13 | |
| 14 | #if defined(WEBP_USE_SSE41) |
| 15 | |
| 16 | #include "src/dsp/common_sse41.h" |
| 17 | #include "src/dsp/lossless.h" |
| 18 | #include "src/dsp/lossless_common.h" |
| 19 | |
| 20 | //------------------------------------------------------------------------------ |
| 21 | // Color-space conversion functions |
| 22 | |
| 23 | static void TransformColorInverse_SSE41(const VP8LMultipliers* const m, |
| 24 | const uint32_t* const src, |
| 25 | int num_pixels, uint32_t* dst) { |
| 26 | // sign-extended multiplying constants, pre-shifted by 5. |
| 27 | #define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend |
| 28 | const __m128i mults_rb = |
| 29 | _mm_set1_epi32((int)((uint32_t)CST(green_to_red_) << 16 | |
| 30 | (CST(green_to_blue_) & 0xffff))); |
| 31 | const __m128i mults_b2 = _mm_set1_epi32(CST(red_to_blue_)); |
| 32 | #undef CST |
| 33 | const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00); |
| 34 | const __m128i perm1 = _mm_setr_epi8(-1, 1, -1, 1, -1, 5, -1, 5, |
| 35 | -1, 9, -1, 9, -1, 13, -1, 13); |
| 36 | const __m128i perm2 = _mm_setr_epi8(-1, 2, -1, -1, -1, 6, -1, -1, |
| 37 | -1, 10, -1, -1, -1, 14, -1, -1); |
| 38 | int i; |
| 39 | for (i = 0; i + 4 <= num_pixels; i += 4) { |
| 40 | const __m128i A = _mm_loadu_si128((const __m128i*)(src + i)); |
| 41 | const __m128i B = _mm_shuffle_epi8(A, perm1); // argb -> g0g0 |
| 42 | const __m128i C = _mm_mulhi_epi16(B, mults_rb); |
| 43 | const __m128i D = _mm_add_epi8(A, C); |
| 44 | const __m128i E = _mm_shuffle_epi8(D, perm2); |
| 45 | const __m128i F = _mm_mulhi_epi16(E, mults_b2); |
| 46 | const __m128i G = _mm_add_epi8(D, F); |
| 47 | const __m128i out = _mm_blendv_epi8(G, A, mask_ag); |
| 48 | _mm_storeu_si128((__m128i*)&dst[i], out); |
| 49 | } |
| 50 | // Fall-back to C-version for left-overs. |
| 51 | if (i != num_pixels) { |
| 52 | VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i); |
| 53 | } |
| 54 | } |
| 55 | |
| 56 | //------------------------------------------------------------------------------ |
| 57 | |
// Shared inner loop of the 32-bit -> 24-bit pixel converters below.
// The expanding scope must provide:
//   in          (const __m128i*)  read cursor, 4 pixels per vector
//   out         (__m128i*)        write cursor
//   num_pixels  (int)             pixels still to convert
//   perm0..3    (__m128i)         byte-shuffle masks, one per input vector
// While at least 16 pixels remain, four input vectors (64 bytes) are
// shuffled so that each keeps 12 useful bytes, and those are merged into
// three output vectors (48 bytes):
//   1st store = vector0[12 bytes] | vector1[4 bytes]    (blend mask 0xc0)
//   2nd store = vector1[8 bytes]  | vector2[8 bytes]    (blend mask 0xf0)
//   3rd store = vector2[4 bytes]  | vector3[12 bytes]   (blend mask 0xfc)
// On exit 'in', 'out' and 'num_pixels' have been advanced past the converted
// pixels; fewer than 16 pixels are left for the caller to handle.
#define ARGB_TO_RGB_SSE41 do {                                       \
  for (; num_pixels >= 16; num_pixels -= 16, in += 4, out += 3) {    \
    const __m128i quad0 = _mm_loadu_si128(in + 0);                   \
    const __m128i quad1 = _mm_loadu_si128(in + 1);                   \
    const __m128i quad2 = _mm_loadu_si128(in + 2);                   \
    const __m128i quad3 = _mm_loadu_si128(in + 3);                   \
    const __m128i shuf0 = _mm_shuffle_epi8(quad0, perm0);            \
    const __m128i shuf1 = _mm_shuffle_epi8(quad1, perm1);            \
    const __m128i shuf2 = _mm_shuffle_epi8(quad2, perm2);            \
    const __m128i shuf3 = _mm_shuffle_epi8(quad3, perm3);            \
    const __m128i pack0 = _mm_blend_epi16(shuf0, shuf1, 0xc0);       \
    const __m128i pack1 = _mm_blend_epi16(shuf1, shuf2, 0xf0);       \
    const __m128i pack2 = _mm_blend_epi16(shuf2, shuf3, 0xfc);       \
    _mm_storeu_si128(out + 0, pack0);                                \
    _mm_storeu_si128(out + 1, pack1);                                \
    _mm_storeu_si128(out + 2, pack2);                                \
  }                                                                  \
} while (0)
| 79 | |
| 80 | static void ConvertBGRAToRGB_SSE41(const uint32_t* src, int num_pixels, |
| 81 | uint8_t* dst) { |
| 82 | const __m128i* in = (const __m128i*)src; |
| 83 | __m128i* out = (__m128i*)dst; |
| 84 | const __m128i perm0 = _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, |
| 85 | 8, 14, 13, 12, -1, -1, -1, -1); |
| 86 | const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39); |
| 87 | const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e); |
| 88 | const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93); |
| 89 | |
| 90 | ARGB_TO_RGB_SSE41; |
| 91 | |
| 92 | // left-overs |
| 93 | if (num_pixels > 0) { |
| 94 | VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out); |
| 95 | } |
| 96 | } |
| 97 | |
| 98 | static void ConvertBGRAToBGR_SSE41(const uint32_t* src, |
| 99 | int num_pixels, uint8_t* dst) { |
| 100 | const __m128i* in = (const __m128i*)src; |
| 101 | __m128i* out = (__m128i*)dst; |
| 102 | const __m128i perm0 = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, |
| 103 | 12, 13, 14, -1, -1, -1, -1); |
| 104 | const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39); |
| 105 | const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e); |
| 106 | const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93); |
| 107 | |
| 108 | ARGB_TO_RGB_SSE41; |
| 109 | |
| 110 | // left-overs |
| 111 | if (num_pixels > 0) { |
| 112 | VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, (uint8_t*)out); |
| 113 | } |
| 114 | } |
| 115 | |
| 116 | #undef ARGB_TO_RGB_SSE41 |
| 117 | |
| 118 | //------------------------------------------------------------------------------ |
| 119 | // Entry point |
| 120 | |
| 121 | extern void VP8LDspInitSSE41(void); |
| 122 | |
| 123 | WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE41(void) { |
| 124 | VP8LTransformColorInverse = TransformColorInverse_SSE41; |
| 125 | VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE41; |
| 126 | VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE41; |
| 127 | } |
| 128 | |
| 129 | #else // !WEBP_USE_SSE41 |
| 130 | |
// Built without WEBP_USE_SSE41: no SSE4.1 code is compiled from this file.
// NOTE(review): WEBP_DSP_INIT_STUB is defined outside this file; it presumably
// emits a placeholder VP8LDspInitSSE41 so the symbol still exists -- confirm
// against the macro's definition.
WEBP_DSP_INIT_STUB(VP8LDspInitSSE41)
| 132 | |
| 133 | #endif // WEBP_USE_SSE41 |
| 134 | |