| 1 | // Copyright 2016 Google Inc. All Rights Reserved. | 
|---|
| 2 | // | 
|---|
| 3 | // Use of this source code is governed by a BSD-style license | 
|---|
| 4 | // that can be found in the COPYING file in the root of the source | 
|---|
| 5 | // tree. An additional intellectual property rights grant can be found | 
|---|
| 6 | // in the file PATENTS. All contributing project authors may | 
|---|
| 7 | // be found in the AUTHORS file in the root of the source tree. | 
|---|
| 8 | // ----------------------------------------------------------------------------- | 
|---|
| 9 | // | 
|---|
| 10 | // MSA version of rescaling functions | 
|---|
| 11 | // | 
|---|
| 12 | // Author: Prashant Patil (prashant.patil@imgtec.com) | 
|---|
| 13 |  | 
|---|
| 14 | #include "./dsp.h" | 
|---|
| 15 |  | 
|---|
| 16 | #if defined(WEBP_USE_MSA) | 
|---|
| 17 |  | 
|---|
| 18 | #include <assert.h> | 
|---|
| 19 |  | 
|---|
| 20 | #include "../utils/rescaler_utils.h" | 
|---|
| 21 | #include "./msa_macro.h" | 
|---|
| 22 |  | 
|---|
// Half of WEBP_RESCALER_ONE; added before the right shift in MULT_FIX.
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
// Scalar fixed-point multiply: the 64-bit product of 'x' and 'y' plus ROUNDER,
// shifted right by WEBP_RESCALER_RFIX bits.
// NOTE(review): this rounds to nearest only if
// WEBP_RESCALER_ONE == 1 << WEBP_RESCALER_RFIX -- confirm in rescaler_utils.h.
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
|---|
| 25 |  | 
|---|
// Scales 16 samples held in the four vectors in0..in3 by 'scale', applies a
// shift by 'shift', and packs the results into the v16u8 'dst'.
// Intended as the vector counterpart of the scalar MULT_FIX() used in the
// callers' tail loops.
// NOTE(review): the exact semantics of ILVRL_W2_UW / DOTP_UW2_UD / SRAR_D4_UD /
// PCKEV_B2_UB come from msa_macro.h, which is not visible here -- confirm.
// Requirements at the expansion site:
//  - a variable named 'zero' must be in scope (it is referenced by name);
//  - arguments must not be named tmp0..tmp3, t0..t5 or out0..out3, since the
//    macro declares locals with those names.
#define CALC_MULT_FIX_16(in0, in1, in2, in3, scale, shift, dst) do {  \
  v4u32 tmp0, tmp1, tmp2, tmp3;                                       \
  v16u8 t0, t1, t2, t3, t4, t5;                                       \
  v2u64 out0, out1, out2, out3;                                       \
  ILVRL_W2_UW(zero, in0, tmp0, tmp1);                                 \
  ILVRL_W2_UW(zero, in1, tmp2, tmp3);                                 \
  DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1);                  \
  DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3);                  \
  SRAR_D4_UD(out0, out1, out2, out3, shift);                          \
  PCKEV_B2_UB(out1, out0, out3, out2, t0, t1);                        \
  ILVRL_W2_UW(zero, in2, tmp0, tmp1);                                 \
  ILVRL_W2_UW(zero, in3, tmp2, tmp3);                                 \
  DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1);                  \
  DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3);                  \
  SRAR_D4_UD(out0, out1, out2, out3, shift);                          \
  PCKEV_B2_UB(out1, out0, out3, out2, t2, t3);                        \
  PCKEV_B2_UB(t1, t0, t3, t2, t4, t5);                                \
  dst = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4);                   \
} while (0)
|---|
| 45 |  | 
|---|
// Four-sample variant of the fixed-point scaling: processes the single vector
// 'in0' with 'scale' and 'shift', packs the result with three successive
// __msa_pckev_b steps, and assigns word 0 of the packed vector to 'dst'
// (callers store that word as 4 output bytes).
// NOTE(review): helper macro semantics (ILVRL_W2_UW, DOTP_UW2_UD, SRAR_D2_UD)
// live in msa_macro.h, not visible here -- confirm.
// Requires a variable named 'zero' in scope at the expansion site; arguments
// must not be named tmp0, tmp1, t0, t1, out0 or out1.
#define CALC_MULT_FIX_4(in0, scale, shift, dst) do {  \
  v4u32 tmp0, tmp1;                                   \
  v16i8 t0, t1;                                       \
  v2u64 out0, out1;                                   \
  ILVRL_W2_UW(zero, in0, tmp0, tmp1);                 \
  DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1);  \
  SRAR_D2_UD(out0, out1, shift);                      \
  t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0);       \
  t1 = __msa_pckev_b(t0, t0);                         \
  t0 = __msa_pckev_b(t1, t1);                         \
  dst = __msa_copy_s_w((v4i32)t0, 0);                 \
} while (0)
|---|
| 58 |  | 
|---|
// Scales 16 samples (vectors in0..in3) by 'fyscale' with a shift by 'shift',
// keeping the results as words: the output goes to the four vectors
// dst0..dst3 through PCKEV_W2_UW rather than being packed down to bytes.
// NOTE(review): helper macro semantics come from msa_macro.h, not visible
// here -- confirm.
// Requires a variable named 'zero' in scope at the expansion site; arguments
// must not be named tmp0..tmp3 or out0..out3.
#define CALC_MULT_FIX1_16(in0, in1, in2, in3, fyscale, shift,  \
                          dst0, dst1, dst2, dst3) do {         \
  v4u32 tmp0, tmp1, tmp2, tmp3;                                \
  v2u64 out0, out1, out2, out3;                                \
  ILVRL_W2_UW(zero, in0, tmp0, tmp1);                          \
  ILVRL_W2_UW(zero, in1, tmp2, tmp3);                          \
  DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1);       \
  DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3);       \
  SRAR_D4_UD(out0, out1, out2, out3, shift);                   \
  PCKEV_W2_UW(out1, out0, out3, out2, dst0, dst1);             \
  ILVRL_W2_UW(zero, in2, tmp0, tmp1);                          \
  ILVRL_W2_UW(zero, in3, tmp2, tmp3);                          \
  DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1);       \
  DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3);       \
  SRAR_D4_UD(out0, out1, out2, out3, shift);                   \
  PCKEV_W2_UW(out1, out0, out3, out2, dst2, dst3);             \
} while (0)
|---|
| 76 |  | 
|---|
// Four-sample variant of CALC_MULT_FIX1_16: processes the vector 'in0' with
// 'scale' and 'shift' and assigns the v4u32 result of __msa_pckev_w to 'dst'.
// NOTE(review): helper macro semantics come from msa_macro.h, not visible
// here -- confirm.
// Requires a variable named 'zero' in scope at the expansion site; arguments
// must not be named tmp0, tmp1, out0 or out1.
#define CALC_MULT_FIX1_4(in0, scale, shift, dst) do {    \
  v4u32 tmp0, tmp1;                                      \
  v2u64 out0, out1;                                      \
  ILVRL_W2_UW(zero, in0, tmp0, tmp1);                    \
  DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1);     \
  SRAR_D2_UD(out0, out1, shift);                         \
  dst = (v4u32)__msa_pckev_w((v4i32)out1, (v4i32)out0);  \
} while (0)
|---|
| 85 |  | 
|---|
// Two-stage scaling of 8 sample pairs. in0/in1 are combined with in2/in3
// (ILVRL_W2_UW), multiplied by 'mult' (DOTP_UW2_UD) and shifted by 'shift';
// that intermediate result is then multiplied by 'scale', shifted again and
// packed into dst0/dst1.
// Intended as the vector counterpart of the callers' scalar tail:
//   I = A * frow + B * irow;  J = (I + ROUNDER) >> WEBP_RESCALER_RFIX;
//   v = MULT_FIX(J, fy_scale)
// NOTE(review): this mapping relies on the msa_macro.h helper semantics and on
// 'mult' holding the A/B weights as built by the caller -- confirm.
// Unlike the CALC_MULT_FIX_* / CALC_MULT_FIX1_* macros, 'zero' is not used.
// Arguments must not be named tmp0..tmp3 or out0..out3.
#define CALC_MULT_FIX2_16(in0, in1, in2, in3, mult, scale, shift,  \
                          dst0, dst1) do {                         \
  v4u32 tmp0, tmp1, tmp2, tmp3;                                    \
  v2u64 out0, out1, out2, out3;                                    \
  ILVRL_W2_UW(in0, in2, tmp0, tmp1);                               \
  ILVRL_W2_UW(in1, in3, tmp2, tmp3);                               \
  DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1);                 \
  DOTP_UW2_UD(tmp2, tmp3, mult, mult, out2, out3);                 \
  SRAR_D4_UD(out0, out1, out2, out3, shift);                       \
  DOTP_UW2_UD(out0, out1, scale, scale, out0, out1);               \
  DOTP_UW2_UD(out2, out3, scale, scale, out2, out3);               \
  SRAR_D4_UD(out0, out1, out2, out3, shift);                       \
  PCKEV_B2_UB(out1, out0, out3, out2, dst0, dst1);                 \
} while (0)
|---|
| 100 |  | 
|---|
// Four-sample variant of CALC_MULT_FIX2_16: combines 'in0' with 'in1',
// multiplies by 'mult', shifts, multiplies by 'scale', shifts again, packs the
// result with three __msa_pckev_b steps and assigns word 0 of the packed
// vector to 'dst' (callers store that word as 4 output bytes).
// NOTE(review): helper macro semantics come from msa_macro.h, not visible
// here -- confirm.
// 'zero' is not used. Arguments must not be named tmp0, tmp1, out0, out1, t0
// or t1.
#define CALC_MULT_FIX2_4(in0, in1, mult, scale, shift, dst) do {  \
  v4u32 tmp0, tmp1;                                               \
  v2u64 out0, out1;                                               \
  v16i8 t0, t1;                                                   \
  ILVRL_W2_UW(in0, in1, tmp0, tmp1);                              \
  DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1);                \
  SRAR_D2_UD(out0, out1, shift);                                  \
  DOTP_UW2_UD(out0, out1, scale, scale, out0, out1);              \
  SRAR_D2_UD(out0, out1, shift);                                  \
  t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0);                   \
  t1 = __msa_pckev_b(t0, t0);                                     \
  t0 = __msa_pckev_b(t1, t1);                                     \
  dst = __msa_copy_s_w((v4i32)t0, 0);                             \
} while (0)
|---|
| 115 |  | 
|---|
| 116 | static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst, | 
|---|
| 117 | int length, | 
|---|
| 118 | WebPRescaler* const wrk) { | 
|---|
| 119 | const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale); | 
|---|
| 120 | const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX); | 
|---|
| 121 | const v4i32 zero = { 0 }; | 
|---|
| 122 |  | 
|---|
| 123 | while (length >= 16) { | 
|---|
| 124 | v4u32 src0, src1, src2, src3; | 
|---|
| 125 | v16u8 out; | 
|---|
| 126 | LD_UW4(frow, 4, src0, src1, src2, src3); | 
|---|
| 127 | CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, out); | 
|---|
| 128 | ST_UB(out, dst); | 
|---|
| 129 | length -= 16; | 
|---|
| 130 | frow   += 16; | 
|---|
| 131 | dst    += 16; | 
|---|
| 132 | } | 
|---|
| 133 | if (length > 0) { | 
|---|
| 134 | int x_out; | 
|---|
| 135 | if (length >= 12) { | 
|---|
| 136 | uint32_t val0_m, val1_m, val2_m; | 
|---|
| 137 | v4u32 src0, src1, src2; | 
|---|
| 138 | LD_UW3(frow, 4, src0, src1, src2); | 
|---|
| 139 | CALC_MULT_FIX_4(src0, scale, shift, val0_m); | 
|---|
| 140 | CALC_MULT_FIX_4(src1, scale, shift, val1_m); | 
|---|
| 141 | CALC_MULT_FIX_4(src2, scale, shift, val2_m); | 
|---|
| 142 | SW3(val0_m, val1_m, val2_m, dst, 4); | 
|---|
| 143 | length -= 12; | 
|---|
| 144 | frow   += 12; | 
|---|
| 145 | dst    += 12; | 
|---|
| 146 | } else if (length >= 8) { | 
|---|
| 147 | uint32_t val0_m, val1_m; | 
|---|
| 148 | v4u32 src0, src1; | 
|---|
| 149 | LD_UW2(frow, 4, src0, src1); | 
|---|
| 150 | CALC_MULT_FIX_4(src0, scale, shift, val0_m); | 
|---|
| 151 | CALC_MULT_FIX_4(src1, scale, shift, val1_m); | 
|---|
| 152 | SW2(val0_m, val1_m, dst, 4); | 
|---|
| 153 | length -= 8; | 
|---|
| 154 | frow   += 8; | 
|---|
| 155 | dst    += 8; | 
|---|
| 156 | } else if (length >= 4) { | 
|---|
| 157 | uint32_t val0_m; | 
|---|
| 158 | const v4u32 src0 = LD_UW(frow); | 
|---|
| 159 | CALC_MULT_FIX_4(src0, scale, shift, val0_m); | 
|---|
| 160 | SW(val0_m, dst); | 
|---|
| 161 | length -= 4; | 
|---|
| 162 | frow   += 4; | 
|---|
| 163 | dst    += 4; | 
|---|
| 164 | } | 
|---|
| 165 | for (x_out = 0; x_out < length; ++x_out) { | 
|---|
| 166 | const uint32_t J = frow[x_out]; | 
|---|
| 167 | const int v = (int)MULT_FIX(J, wrk->fy_scale); | 
|---|
| 168 | assert(v >= 0 && v <= 255); | 
|---|
| 169 | dst[x_out] = v; | 
|---|
| 170 | } | 
|---|
| 171 | } | 
|---|
| 172 | } | 
|---|
| 173 |  | 
|---|
| 174 | static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow, | 
|---|
| 175 | uint8_t* dst, int length, | 
|---|
| 176 | WebPRescaler* const wrk) { | 
|---|
| 177 | const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub); | 
|---|
| 178 | const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B); | 
|---|
| 179 | const v4i32 B1 = __msa_fill_w(B); | 
|---|
| 180 | const v4i32 A1 = __msa_fill_w(A); | 
|---|
| 181 | const v4i32 AB = __msa_ilvr_w(A1, B1); | 
|---|
| 182 | const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale); | 
|---|
| 183 | const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX); | 
|---|
| 184 |  | 
|---|
| 185 | while (length >= 16) { | 
|---|
| 186 | v4u32 frow0, frow1, frow2, frow3, irow0, irow1, irow2, irow3; | 
|---|
| 187 | v16u8 t0, t1, t2, t3, t4, t5; | 
|---|
| 188 | LD_UW4(frow, 4, frow0, frow1, frow2, frow3); | 
|---|
| 189 | LD_UW4(irow, 4, irow0, irow1, irow2, irow3); | 
|---|
| 190 | CALC_MULT_FIX2_16(frow0, frow1, irow0, irow1, AB, scale, shift, t0, t1); | 
|---|
| 191 | CALC_MULT_FIX2_16(frow2, frow3, irow2, irow3, AB, scale, shift, t2, t3); | 
|---|
| 192 | PCKEV_B2_UB(t1, t0, t3, t2, t4, t5); | 
|---|
| 193 | t0 = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4); | 
|---|
| 194 | ST_UB(t0, dst); | 
|---|
| 195 | frow   += 16; | 
|---|
| 196 | irow   += 16; | 
|---|
| 197 | dst    += 16; | 
|---|
| 198 | length -= 16; | 
|---|
| 199 | } | 
|---|
| 200 | if (length > 0) { | 
|---|
| 201 | int x_out; | 
|---|
| 202 | if (length >= 12) { | 
|---|
| 203 | uint32_t val0_m, val1_m, val2_m; | 
|---|
| 204 | v4u32 frow0, frow1, frow2, irow0, irow1, irow2; | 
|---|
| 205 | LD_UW3(frow, 4, frow0, frow1, frow2); | 
|---|
| 206 | LD_UW3(irow, 4, irow0, irow1, irow2); | 
|---|
| 207 | CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m); | 
|---|
| 208 | CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m); | 
|---|
| 209 | CALC_MULT_FIX2_4(frow2, irow2, AB, scale, shift, val2_m); | 
|---|
| 210 | SW3(val0_m, val1_m, val2_m, dst, 4); | 
|---|
| 211 | frow   += 12; | 
|---|
| 212 | irow   += 12; | 
|---|
| 213 | dst    += 12; | 
|---|
| 214 | length -= 12; | 
|---|
| 215 | } else if (length >= 8) { | 
|---|
| 216 | uint32_t val0_m, val1_m; | 
|---|
| 217 | v4u32 frow0, frow1, irow0, irow1; | 
|---|
| 218 | LD_UW2(frow, 4, frow0, frow1); | 
|---|
| 219 | LD_UW2(irow, 4, irow0, irow1); | 
|---|
| 220 | CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m); | 
|---|
| 221 | CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m); | 
|---|
| 222 | SW2(val0_m, val1_m, dst, 4); | 
|---|
| 223 | frow   += 4; | 
|---|
| 224 | irow   += 4; | 
|---|
| 225 | dst    += 4; | 
|---|
| 226 | length -= 4; | 
|---|
| 227 | } else if (length >= 4) { | 
|---|
| 228 | uint32_t val0_m; | 
|---|
| 229 | const v4u32 frow0 = LD_UW(frow + 0); | 
|---|
| 230 | const v4u32 irow0 = LD_UW(irow + 0); | 
|---|
| 231 | CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m); | 
|---|
| 232 | SW(val0_m, dst); | 
|---|
| 233 | frow   += 4; | 
|---|
| 234 | irow   += 4; | 
|---|
| 235 | dst    += 4; | 
|---|
| 236 | length -= 4; | 
|---|
| 237 | } | 
|---|
| 238 | for (x_out = 0; x_out < length; ++x_out) { | 
|---|
| 239 | const uint64_t I = (uint64_t)A * frow[x_out] | 
|---|
| 240 | + (uint64_t)B * irow[x_out]; | 
|---|
| 241 | const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX); | 
|---|
| 242 | const int v = (int)MULT_FIX(J, wrk->fy_scale); | 
|---|
| 243 | assert(v >= 0 && v <= 255); | 
|---|
| 244 | dst[x_out] = v; | 
|---|
| 245 | } | 
|---|
| 246 | } | 
|---|
| 247 | } | 
|---|
| 248 |  | 
|---|
| 249 | static void RescalerExportRowExpand(WebPRescaler* const wrk) { | 
|---|
| 250 | uint8_t* dst = wrk->dst; | 
|---|
| 251 | rescaler_t* irow = wrk->irow; | 
|---|
| 252 | const int x_out_max = wrk->dst_width * wrk->num_channels; | 
|---|
| 253 | const rescaler_t* frow = wrk->frow; | 
|---|
| 254 | assert(!WebPRescalerOutputDone(wrk)); | 
|---|
| 255 | assert(wrk->y_accum <= 0); | 
|---|
| 256 | assert(wrk->y_expand); | 
|---|
| 257 | assert(wrk->y_sub != 0); | 
|---|
| 258 | if (wrk->y_accum == 0) { | 
|---|
| 259 | ExportRowExpand_0(frow, dst, x_out_max, wrk); | 
|---|
| 260 | } else { | 
|---|
| 261 | ExportRowExpand_1(frow, irow, dst, x_out_max, wrk); | 
|---|
| 262 | } | 
|---|
| 263 | } | 
|---|
| 264 |  | 
|---|
| 265 | static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow, | 
|---|
| 266 | uint8_t* dst, int length, | 
|---|
| 267 | const uint32_t yscale, | 
|---|
| 268 | WebPRescaler* const wrk) { | 
|---|
| 269 | const v4u32 y_scale = (v4u32)__msa_fill_w(yscale); | 
|---|
| 270 | const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale); | 
|---|
| 271 | const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX); | 
|---|
| 272 | const v4i32 zero = { 0 }; | 
|---|
| 273 |  | 
|---|
| 274 | while (length >= 16) { | 
|---|
| 275 | v4u32 src0, src1, src2, src3, frac0, frac1, frac2, frac3; | 
|---|
| 276 | v16u8 out; | 
|---|
| 277 | LD_UW4(frow, 4, src0, src1, src2, src3); | 
|---|
| 278 | CALC_MULT_FIX1_16(src0, src1, src2, src3, y_scale, shiftval, | 
|---|
| 279 | frac0, frac1, frac2, frac3); | 
|---|
| 280 | LD_UW4(irow, 4, src0, src1, src2, src3); | 
|---|
| 281 | SUB4(src0, frac0, src1, frac1, src2, frac2, src3, frac3, | 
|---|
| 282 | src0, src1, src2, src3); | 
|---|
| 283 | CALC_MULT_FIX_16(src0, src1, src2, src3, fxyscale, shiftval, out); | 
|---|
| 284 | ST_UB(out, dst); | 
|---|
| 285 | ST_UW4(frac0, frac1, frac2, frac3, irow, 4); | 
|---|
| 286 | frow   += 16; | 
|---|
| 287 | irow   += 16; | 
|---|
| 288 | dst    += 16; | 
|---|
| 289 | length -= 16; | 
|---|
| 290 | } | 
|---|
| 291 | if (length > 0) { | 
|---|
| 292 | int x_out; | 
|---|
| 293 | if (length >= 12) { | 
|---|
| 294 | uint32_t val0_m, val1_m, val2_m; | 
|---|
| 295 | v4u32 src0, src1, src2, frac0, frac1, frac2; | 
|---|
| 296 | LD_UW3(frow, 4, src0, src1, src2); | 
|---|
| 297 | CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0); | 
|---|
| 298 | CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1); | 
|---|
| 299 | CALC_MULT_FIX1_4(src2, y_scale, shiftval, frac2); | 
|---|
| 300 | LD_UW3(irow, 4, src0, src1, src2); | 
|---|
| 301 | SUB3(src0, frac0, src1, frac1, src2, frac2, src0, src1, src2); | 
|---|
| 302 | CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m); | 
|---|
| 303 | CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m); | 
|---|
| 304 | CALC_MULT_FIX_4(src2, fxyscale, shiftval, val2_m); | 
|---|
| 305 | SW3(val0_m, val1_m, val2_m, dst, 4); | 
|---|
| 306 | ST_UW3(frac0, frac1, frac2, irow, 4); | 
|---|
| 307 | frow   += 12; | 
|---|
| 308 | irow   += 12; | 
|---|
| 309 | dst    += 12; | 
|---|
| 310 | length -= 12; | 
|---|
| 311 | } else if (length >= 8) { | 
|---|
| 312 | uint32_t val0_m, val1_m; | 
|---|
| 313 | v4u32 src0, src1, frac0, frac1; | 
|---|
| 314 | LD_UW2(frow, 4, src0, src1); | 
|---|
| 315 | CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0); | 
|---|
| 316 | CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1); | 
|---|
| 317 | LD_UW2(irow, 4, src0, src1); | 
|---|
| 318 | SUB2(src0, frac0, src1, frac1, src0, src1); | 
|---|
| 319 | CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m); | 
|---|
| 320 | CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m); | 
|---|
| 321 | SW2(val0_m, val1_m, dst, 4); | 
|---|
| 322 | ST_UW2(frac0, frac1, irow, 4); | 
|---|
| 323 | frow   += 8; | 
|---|
| 324 | irow   += 8; | 
|---|
| 325 | dst    += 8; | 
|---|
| 326 | length -= 8; | 
|---|
| 327 | } else if (length >= 4) { | 
|---|
| 328 | uint32_t val0_m; | 
|---|
| 329 | v4u32 frac0; | 
|---|
| 330 | v4u32 src0 = LD_UW(frow); | 
|---|
| 331 | CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0); | 
|---|
| 332 | src0 = LD_UW(irow); | 
|---|
| 333 | src0 = src0 - frac0; | 
|---|
| 334 | CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m); | 
|---|
| 335 | SW(val0_m, dst); | 
|---|
| 336 | ST_UW(frac0, irow); | 
|---|
| 337 | frow   += 4; | 
|---|
| 338 | irow   += 4; | 
|---|
| 339 | dst    += 4; | 
|---|
| 340 | length -= 4; | 
|---|
| 341 | } | 
|---|
| 342 | for (x_out = 0; x_out < length; ++x_out) { | 
|---|
| 343 | const uint32_t frac = (uint32_t)MULT_FIX(frow[x_out], yscale); | 
|---|
| 344 | const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale); | 
|---|
| 345 | assert(v >= 0 && v <= 255); | 
|---|
| 346 | dst[x_out] = v; | 
|---|
| 347 | irow[x_out] = frac; | 
|---|
| 348 | } | 
|---|
| 349 | } | 
|---|
| 350 | } | 
|---|
| 351 |  | 
|---|
| 352 | static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst, | 
|---|
| 353 | int length, | 
|---|
| 354 | WebPRescaler* const wrk) { | 
|---|
| 355 | const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale); | 
|---|
| 356 | const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX); | 
|---|
| 357 | const v4i32 zero = { 0 }; | 
|---|
| 358 |  | 
|---|
| 359 | while (length >= 16) { | 
|---|
| 360 | v4u32 src0, src1, src2, src3; | 
|---|
| 361 | v16u8 dst0; | 
|---|
| 362 | LD_UW4(irow, 4, src0, src1, src2, src3); | 
|---|
| 363 | CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, dst0); | 
|---|
| 364 | ST_UB(dst0, dst); | 
|---|
| 365 | ST_SW4(zero, zero, zero, zero, irow, 4); | 
|---|
| 366 | length -= 16; | 
|---|
| 367 | irow   += 16; | 
|---|
| 368 | dst    += 16; | 
|---|
| 369 | } | 
|---|
| 370 | if (length > 0) { | 
|---|
| 371 | int x_out; | 
|---|
| 372 | if (length >= 12) { | 
|---|
| 373 | uint32_t val0_m, val1_m, val2_m; | 
|---|
| 374 | v4u32 src0, src1, src2; | 
|---|
| 375 | LD_UW3(irow, 4, src0, src1, src2); | 
|---|
| 376 | CALC_MULT_FIX_4(src0, scale, shift, val0_m); | 
|---|
| 377 | CALC_MULT_FIX_4(src1, scale, shift, val1_m); | 
|---|
| 378 | CALC_MULT_FIX_4(src2, scale, shift, val2_m); | 
|---|
| 379 | SW3(val0_m, val1_m, val2_m, dst, 4); | 
|---|
| 380 | ST_SW3(zero, zero, zero, irow, 4); | 
|---|
| 381 | length -= 12; | 
|---|
| 382 | irow   += 12; | 
|---|
| 383 | dst    += 12; | 
|---|
| 384 | } else if (length >= 8) { | 
|---|
| 385 | uint32_t val0_m, val1_m; | 
|---|
| 386 | v4u32 src0, src1; | 
|---|
| 387 | LD_UW2(irow, 4, src0, src1); | 
|---|
| 388 | CALC_MULT_FIX_4(src0, scale, shift, val0_m); | 
|---|
| 389 | CALC_MULT_FIX_4(src1, scale, shift, val1_m); | 
|---|
| 390 | SW2(val0_m, val1_m, dst, 4); | 
|---|
| 391 | ST_SW2(zero, zero, irow, 4); | 
|---|
| 392 | length -= 8; | 
|---|
| 393 | irow   += 8; | 
|---|
| 394 | dst    += 8; | 
|---|
| 395 | } else if (length >= 4) { | 
|---|
| 396 | uint32_t val0_m; | 
|---|
| 397 | const v4u32 src0 = LD_UW(irow + 0); | 
|---|
| 398 | CALC_MULT_FIX_4(src0, scale, shift, val0_m); | 
|---|
| 399 | SW(val0_m, dst); | 
|---|
| 400 | ST_SW(zero, irow); | 
|---|
| 401 | length -= 4; | 
|---|
| 402 | irow   += 4; | 
|---|
| 403 | dst    += 4; | 
|---|
| 404 | } | 
|---|
| 405 | for (x_out = 0; x_out < length; ++x_out) { | 
|---|
| 406 | const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale); | 
|---|
| 407 | assert(v >= 0 && v <= 255); | 
|---|
| 408 | dst[x_out] = v; | 
|---|
| 409 | irow[x_out] = 0; | 
|---|
| 410 | } | 
|---|
| 411 | } | 
|---|
| 412 | } | 
|---|
| 413 |  | 
|---|
| 414 | static void RescalerExportRowShrink(WebPRescaler* const wrk) { | 
|---|
| 415 | uint8_t* dst = wrk->dst; | 
|---|
| 416 | rescaler_t* irow = wrk->irow; | 
|---|
| 417 | const int x_out_max = wrk->dst_width * wrk->num_channels; | 
|---|
| 418 | const rescaler_t* frow = wrk->frow; | 
|---|
| 419 | const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum); | 
|---|
| 420 | assert(!WebPRescalerOutputDone(wrk)); | 
|---|
| 421 | assert(wrk->y_accum <= 0); | 
|---|
| 422 | assert(!wrk->y_expand); | 
|---|
| 423 | if (yscale) { | 
|---|
| 424 | ExportRowShrink_0(frow, irow, dst, x_out_max, yscale, wrk); | 
|---|
| 425 | } else { | 
|---|
| 426 | ExportRowShrink_1(irow, dst, x_out_max, wrk); | 
|---|
| 427 | } | 
|---|
| 428 | } | 
|---|
| 429 |  | 
|---|
| 430 | //------------------------------------------------------------------------------ | 
|---|
| 431 | // Entry point | 
|---|
| 432 |  | 
|---|
| 433 | extern void WebPRescalerDspInitMSA(void); | 
|---|
| 434 |  | 
|---|
| 435 | WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) { | 
|---|
| 436 | WebPRescalerExportRowExpand = RescalerExportRowExpand; | 
|---|
| 437 | WebPRescalerExportRowShrink = RescalerExportRowShrink; | 
|---|
| 438 | } | 
|---|
| 439 |  | 
|---|
| 440 | #else     // !WEBP_USE_MSA | 
|---|
| 441 |  | 
|---|
// Built when WEBP_USE_MSA is not defined.
// NOTE(review): WEBP_DSP_INIT_STUB presumably defines an empty
// WebPRescalerDspInitMSA() so the symbol still exists -- confirm in dsp.h.
WEBP_DSP_INIT_STUB(WebPRescalerDspInitMSA)
|---|
| 443 |  | 
|---|
| 444 | #endif    // WEBP_USE_MSA | 
|---|
| 445 |  | 
|---|