| 1 | // Copyright 2017 Google Inc. All Rights Reserved. | 
|---|
| 2 | // | 
|---|
| 3 | // Use of this source code is governed by a BSD-style license | 
|---|
| 4 | // that can be found in the COPYING file in the root of the source | 
|---|
| 5 | // tree. An additional intellectual property rights grant can be found | 
|---|
| 6 | // in the file PATENTS. All contributing project authors may | 
|---|
| 7 | // be found in the AUTHORS file in the root of the source tree. | 
|---|
| 8 | // ----------------------------------------------------------------------------- | 
|---|
| 9 | // | 
|---|
| 10 | // SSE2 version of distortion calculation | 
|---|
| 11 | // | 
|---|
| 12 | // Author: Skal (pascal.massimino@gmail.com) | 
|---|
| 13 |  | 
|---|
| 14 | #include "src/dsp/dsp.h" | 
|---|
| 15 |  | 
|---|
| 16 | #if defined(WEBP_USE_SSE2) | 
|---|
| 17 |  | 
|---|
| 18 | #include <assert.h> | 
|---|
| 19 | #include <emmintrin.h> | 
|---|
| 20 |  | 
|---|
| 21 | #include "src/dsp/common_sse2.h" | 
|---|
| 22 |  | 
|---|
| 23 | #if !defined(WEBP_DISABLE_STATS) | 
|---|
| 24 |  | 
|---|
| 25 | // Helper function | 
|---|
| 26 | static WEBP_INLINE void SubtractAndSquare_SSE2(const __m128i a, const __m128i b, | 
|---|
| 27 | __m128i* const sum) { | 
|---|
| 28 | // take abs(a-b) in 8b | 
|---|
| 29 | const __m128i a_b = _mm_subs_epu8(a, b); | 
|---|
| 30 | const __m128i b_a = _mm_subs_epu8(b, a); | 
|---|
| 31 | const __m128i abs_a_b = _mm_or_si128(a_b, b_a); | 
|---|
| 32 | // zero-extend to 16b | 
|---|
| 33 | const __m128i zero = _mm_setzero_si128(); | 
|---|
| 34 | const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero); | 
|---|
| 35 | const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero); | 
|---|
| 36 | // multiply with self | 
|---|
| 37 | const __m128i sum1 = _mm_madd_epi16(C0, C0); | 
|---|
| 38 | const __m128i sum2 = _mm_madd_epi16(C1, C1); | 
|---|
| 39 | *sum = _mm_add_epi32(sum1, sum2); | 
|---|
| 40 | } | 
|---|
| 41 |  | 
|---|
| 42 | //------------------------------------------------------------------------------ | 
|---|
| 43 | // SSIM / PSNR entry point | 
|---|
| 44 |  | 
|---|
// Returns the sum of squared differences between the first 'len' bytes of
// 'src1' and 'src2'. The sum is accumulated modulo 2^32.
// Inputs of 16 bytes or more are processed with unaligned 16-byte loads; the
// remaining bytes (or the whole input when len < 16) go through the scalar
// loop at the end.
static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
                                   const uint8_t* src2, int len) {
  int i = 0;
  uint32_t sse2 = 0;
  if (len >= 16) {
    // Largest 'i' for which two more 16-byte loads still fit within 'len'.
    const int limit = len - 32;
    // The lanes are read back as unsigned: summing them as int32_t would be
    // signed-overflow undefined behavior once the total exceeds INT32_MAX,
    // whereas uint32_t arithmetic wraps like the return type does.
    uint32_t tmp[4];
    __m128i sum1;
    __m128i sum = _mm_setzero_si128();
    // a0 / b0 always hold one 16-byte chunk that is loaded but not yet summed.
    __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
    __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
    i += 16;
    // Two chunks per iteration: the next loads are issued before the pending
    // chunk is accumulated.
    while (i <= limit) {
      const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
      const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
      __m128i sum2;
      i += 16;
      SubtractAndSquare_SSE2(a0, b0, &sum1);
      sum = _mm_add_epi32(sum, sum1);
      a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
      b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
      i += 16;
      SubtractAndSquare_SSE2(a1, b1, &sum2);
      sum = _mm_add_epi32(sum, sum2);
    }
    // Flush the chunk still pending in a0 / b0.
    SubtractAndSquare_SSE2(a0, b0, &sum1);
    sum = _mm_add_epi32(sum, sum1);
    // Horizontal reduction of the four 32b lanes.
    _mm_storeu_si128((__m128i*)tmp, sum);
    sse2 += tmp[3] + tmp[2] + tmp[1] + tmp[0];
  }

  // Scalar tail: bytes not covered by the 16-byte chunks above.
  for (; i < len; ++i) {
    const int32_t diff = src1[i] - src2[i];
    sse2 += diff * diff;
  }
  return sse2;
}
|---|
| 82 | #endif  // !defined(WEBP_DISABLE_STATS) | 
|---|
| 83 |  | 
|---|
| 84 | #if !defined(WEBP_REDUCE_SIZE) | 
|---|
| 85 |  | 
|---|
// Returns the sum of the eight unsigned 16b lanes of '*m'.
// The upper four lanes are first added onto the lower four with 16b
// (wrapping) arithmetic; the four folded lanes are then summed in 32b.
static uint32_t HorizontalAdd16b_SSE2(const __m128i* const m) {
  uint16_t lanes[8];
  uint32_t total = 0;
  int k;
  // shift by 8 bytes so that lanes 4..7 line up with lanes 0..3
  const __m128i upper = _mm_srli_si128(*m, 8);
  const __m128i folded = _mm_add_epi16(*m, upper);
  _mm_storeu_si128((__m128i*)lanes, folded);
  // only the four low lanes carry the folded result
  for (k = 0; k < 4; ++k) {
    total += lanes[k];
  }
  return total;
}
|---|
| 93 |  | 
|---|
// Returns the sum (modulo 2^32) of the four 32b lanes of '*m'.
static uint32_t HorizontalAdd32b_SSE2(const __m128i* const m) {
  // fold the upper 64 bits onto the lower 64 bits: lanes {0+2, 1+3, ...}
  const __m128i upper_half = _mm_srli_si128(*m, 8);
  const __m128i pair_sums = _mm_add_epi32(*m, upper_half);
  // bring lane 1 down to lane 0 and add: lane 0 now holds the full sum
  const __m128i lane1_down = _mm_srli_si128(pair_sums, 4);
  const __m128i total = _mm_add_epi32(pair_sums, lane1_down);
  return (uint32_t)_mm_cvtsi128_si32(total);
}
|---|
| 100 |  | 
|---|
// Horizontal weights, one per 16b lane. SSIMGet_SSE2() loads all eight
// entries with a single 128-bit load; the trailing 0 cancels the eighth
// byte fetched by each 8-byte row load.
static const uint16_t kWeight[8] = {
  1, 2, 3, 4, 3, 2, 1, 0
};
|---|
| 102 |  | 
|---|
// Accumulates one row of weighted statistics, then moves on to the next row.
// WEIGHT is the vertical weight applied to the row. The expanding scope must
// provide: Wx (horizontal weights, 16b lanes), zero, the 16b accumulators
// xm / ym, the 32b accumulators xxm / xym / yym, and src1 / src2 together
// with stride1 / stride2 (the sources are advanced by their stride).
#define ACCUMULATE_ROW(WEIGHT) do {                                        \
  /* weights of this row: horizontal weights scaled by the row weight */   \
  const __m128i row_weight = _mm_set1_epi16((WEIGHT));                     \
  const __m128i weights = _mm_mullo_epi16(Wx, row_weight);                 \
  /* fetch 8 bytes per source (7 are actually needed), widened to 16b */   \
  const __m128i x16 =                                                      \
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)src1), zero);      \
  const __m128i y16 =                                                      \
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)src2), zero);      \
  /* weighted samples */                                                   \
  const __m128i wx16 = _mm_mullo_epi16(x16, weights);                      \
  const __m128i wy16 = _mm_mullo_epi16(y16, weights);                      \
  /* first-order sums (16b lanes) */                                       \
  xm = _mm_add_epi16(xm, wx16);                                            \
  ym = _mm_add_epi16(ym, wy16);                                            \
  /* second-order sums (32b lanes): w*x*x, w*x*y and w*y*y */              \
  xxm = _mm_add_epi32(xxm, _mm_madd_epi16(x16, wx16));                     \
  xym = _mm_add_epi32(xym, _mm_madd_epi16(x16, wy16));                     \
  yym = _mm_add_epi32(yym, _mm_madd_epi16(y16, wy16));                     \
  /* next row */                                                           \
  src1 += stride1;                                                         \
  src2 += stride2;                                                         \
} while (0)
|---|
| 124 |  | 
|---|
| 125 | static double SSIMGet_SSE2(const uint8_t* src1, int stride1, | 
|---|
| 126 | const uint8_t* src2, int stride2) { | 
|---|
| 127 | VP8DistoStats stats; | 
|---|
| 128 | const __m128i zero = _mm_setzero_si128(); | 
|---|
| 129 | __m128i xm = zero, ym = zero;                // 16b accums | 
|---|
| 130 | __m128i xxm = zero, yym = zero, xym = zero;  // 32b accum | 
|---|
| 131 | const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight); | 
|---|
| 132 | assert(2 * VP8_SSIM_KERNEL + 1 == 7); | 
|---|
| 133 | ACCUMULATE_ROW(1); | 
|---|
| 134 | ACCUMULATE_ROW(2); | 
|---|
| 135 | ACCUMULATE_ROW(3); | 
|---|
| 136 | ACCUMULATE_ROW(4); | 
|---|
| 137 | ACCUMULATE_ROW(3); | 
|---|
| 138 | ACCUMULATE_ROW(2); | 
|---|
| 139 | ACCUMULATE_ROW(1); | 
|---|
| 140 | stats.xm  = HorizontalAdd16b_SSE2(&xm); | 
|---|
| 141 | stats.ym  = HorizontalAdd16b_SSE2(&ym); | 
|---|
| 142 | stats.xxm = HorizontalAdd32b_SSE2(&xxm); | 
|---|
| 143 | stats.xym = HorizontalAdd32b_SSE2(&xym); | 
|---|
| 144 | stats.yym = HorizontalAdd32b_SSE2(&yym); | 
|---|
| 145 | return VP8SSIMFromStats(&stats); | 
|---|
| 146 | } | 
|---|
| 147 |  | 
|---|
| 148 | #endif  // !defined(WEBP_REDUCE_SIZE) | 
|---|
| 149 |  | 
|---|
| 150 | extern void VP8SSIMDspInitSSE2(void); | 
|---|
| 151 |  | 
|---|
| 152 | WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) { | 
|---|
| 153 | #if !defined(WEBP_DISABLE_STATS) | 
|---|
| 154 | VP8AccumulateSSE = AccumulateSSE_SSE2; | 
|---|
| 155 | #endif | 
|---|
| 156 | #if !defined(WEBP_REDUCE_SIZE) | 
|---|
| 157 | VP8SSIMGet = SSIMGet_SSE2; | 
|---|
| 158 | #endif | 
|---|
| 159 | } | 
|---|
| 160 |  | 
|---|
| 161 | #else  // !WEBP_USE_SSE2 | 
|---|
| 162 |  | 
|---|
// Branch taken when WEBP_USE_SSE2 is not defined. WEBP_DSP_INIT_STUB is a
// macro defined outside this file; presumably it emits a placeholder
// definition of VP8SSIMDspInitSSE2() so the symbol still exists -- TODO
// confirm against the macro's definition.
| 163 | WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2) | 
|---|
| 164 |  | 
|---|
| 165 | #endif  // WEBP_USE_SSE2 | 
|---|
| 166 |  | 
|---|