1// Copyright 2016 Google Inc. All Rights Reserved.
2//
3// Use of this source code is governed by a BSD-style license
4// that can be found in the COPYING file in the root of the source
5// tree. An additional intellectual property rights grant can be found
6// in the file PATENTS. All contributing project authors may
7// be found in the AUTHORS file in the root of the source tree.
8// -----------------------------------------------------------------------------
9//
10// MSA version of rescaling functions
11//
12// Author: Prashant Patil (prashant.patil@imgtec.com)
13
14#include "./dsp.h"
15
16#if defined(WEBP_USE_MSA)
17
18#include <assert.h>
19
20#include "../utils/rescaler_utils.h"
21#include "./msa_macro.h"
22
// Rounding bias: half of WEBP_RESCALER_ONE.
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
// Rounded fixed-point product: the 64-bit product of 'x' and 'y' plus ROUNDER,
// shifted right by WEBP_RESCALER_RFIX bits.
#define MULT_FIX(x, y) \
  ((ROUNDER + (uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX)
25
// Vector counterpart of MULT_FIX for sixteen 32-bit inputs held in in0..in3
// (four words each): every word is multiplied by 'scale', shifted right with
// rounding by 'shift', and the sixteen results are packed as bytes into 'dst'.
// A zero vector named 'zero' must be visible where the macro is expanded.
#define CALC_MULT_FIX_16(in0, in1, in2, in3, scale, shift, dst) do {       \
  v4u32 ext0_m, ext1_m, ext2_m, ext3_m;                                    \
  v2u64 prod0_m, prod1_m, prod2_m, prod3_m;                                \
  v16u8 pk0_m, pk1_m, pk2_m, pk3_m, pk4_m, pk5_m;                          \
  /* First eight inputs (in0, in1). */                                     \
  ILVRL_W2_UW(zero, in0, ext0_m, ext1_m);                                  \
  ILVRL_W2_UW(zero, in1, ext2_m, ext3_m);                                  \
  DOTP_UW2_UD(ext0_m, ext1_m, scale, scale, prod0_m, prod1_m);             \
  DOTP_UW2_UD(ext2_m, ext3_m, scale, scale, prod2_m, prod3_m);             \
  SRAR_D4_UD(prod0_m, prod1_m, prod2_m, prod3_m, shift);                   \
  PCKEV_B2_UB(prod1_m, prod0_m, prod3_m, prod2_m, pk0_m, pk1_m);           \
  /* Last eight inputs (in2, in3); the temporaries are reused. */          \
  ILVRL_W2_UW(zero, in2, ext0_m, ext1_m);                                  \
  ILVRL_W2_UW(zero, in3, ext2_m, ext3_m);                                  \
  DOTP_UW2_UD(ext0_m, ext1_m, scale, scale, prod0_m, prod1_m);             \
  DOTP_UW2_UD(ext2_m, ext3_m, scale, scale, prod2_m, prod3_m);             \
  SRAR_D4_UD(prod0_m, prod1_m, prod2_m, prod3_m, shift);                   \
  PCKEV_B2_UB(prod1_m, prod0_m, prod3_m, prod2_m, pk2_m, pk3_m);           \
  /* Two more even-byte packs gather the sixteen result bytes. */          \
  PCKEV_B2_UB(pk1_m, pk0_m, pk3_m, pk2_m, pk4_m, pk5_m);                   \
  dst = (v16u8)__msa_pckev_b((v16i8)pk5_m, (v16i8)pk4_m);                  \
} while (0)
45
// Four-input variant of CALC_MULT_FIX_16: scales the four words of 'in0' by
// 'scale', applies the rounding shift by 'shift', packs the four results as
// bytes and returns them in 'dst' as one 32-bit scalar (word 0 of the packed
// vector). A zero vector named 'zero' must be visible at the expansion site.
#define CALC_MULT_FIX_4(in0, scale, shift, dst) do {                       \
  v4u32 ext0_m, ext1_m;                                                    \
  v2u64 prod0_m, prod1_m;                                                  \
  v16i8 pk0_m, pk1_m;                                                      \
  ILVRL_W2_UW(zero, in0, ext0_m, ext1_m);                                  \
  DOTP_UW2_UD(ext0_m, ext1_m, scale, scale, prod0_m, prod1_m);             \
  SRAR_D2_UD(prod0_m, prod1_m, shift);                                     \
  pk0_m = __msa_pckev_b((v16i8)prod1_m, (v16i8)prod0_m);                   \
  pk1_m = __msa_pckev_b(pk0_m, pk0_m);                                     \
  pk0_m = __msa_pckev_b(pk1_m, pk1_m);                                     \
  dst = __msa_copy_s_w((v4i32)pk0_m, 0);                                   \
} while (0)
58
// Scales the sixteen words in in0..in3 by 'fyscale' with a rounding shift by
// 'shift' and keeps the results as 32-bit words in dst0..dst3 (no byte
// packing). A zero vector named 'zero' must be visible at the expansion site.
#define CALC_MULT_FIX1_16(in0, in1, in2, in3, fyscale, shift,              \
                          dst0, dst1, dst2, dst3) do {                     \
  v4u32 ext0_m, ext1_m, ext2_m, ext3_m;                                    \
  v2u64 prod0_m, prod1_m, prod2_m, prod3_m;                                \
  /* in0, in1 -> dst0, dst1 */                                             \
  ILVRL_W2_UW(zero, in0, ext0_m, ext1_m);                                  \
  ILVRL_W2_UW(zero, in1, ext2_m, ext3_m);                                  \
  DOTP_UW2_UD(ext0_m, ext1_m, fyscale, fyscale, prod0_m, prod1_m);         \
  DOTP_UW2_UD(ext2_m, ext3_m, fyscale, fyscale, prod2_m, prod3_m);         \
  SRAR_D4_UD(prod0_m, prod1_m, prod2_m, prod3_m, shift);                   \
  PCKEV_W2_UW(prod1_m, prod0_m, prod3_m, prod2_m, dst0, dst1);             \
  /* in2, in3 -> dst2, dst3 */                                             \
  ILVRL_W2_UW(zero, in2, ext0_m, ext1_m);                                  \
  ILVRL_W2_UW(zero, in3, ext2_m, ext3_m);                                  \
  DOTP_UW2_UD(ext0_m, ext1_m, fyscale, fyscale, prod0_m, prod1_m);         \
  DOTP_UW2_UD(ext2_m, ext3_m, fyscale, fyscale, prod2_m, prod3_m);         \
  SRAR_D4_UD(prod0_m, prod1_m, prod2_m, prod3_m, shift);                   \
  PCKEV_W2_UW(prod1_m, prod0_m, prod3_m, prod2_m, dst2, dst3);             \
} while (0)
76
// Four-input variant of CALC_MULT_FIX1_16: scales the four words of 'in0' by
// 'scale' with a rounding shift by 'shift' and stores the four 32-bit results
// in 'dst'. A zero vector named 'zero' must be visible at the expansion site.
#define CALC_MULT_FIX1_4(in0, scale, shift, dst) do {                      \
  v4u32 ext0_m, ext1_m;                                                    \
  v2u64 prod0_m, prod1_m;                                                  \
  ILVRL_W2_UW(zero, in0, ext0_m, ext1_m);                                  \
  DOTP_UW2_UD(ext0_m, ext1_m, scale, scale, prod0_m, prod1_m);             \
  SRAR_D2_UD(prod0_m, prod1_m, shift);                                     \
  dst = (v4u32)__msa_pckev_w((v4i32)prod1_m, (v4i32)prod0_m);              \
} while (0)
85
// Two-stage scaling of eight word pairs. Words of in0 are interleaved with
// words of in2 (and in1 with in3), each pair is combined with the weight
// pairs in 'mult' by a dot product and shifted with rounding by 'shift'; the
// intermediate is then multiplied by 'scale' and shifted again. The results
// are byte-packed into dst0 (from in0/in2) and dst1 (from in1/in3).
#define CALC_MULT_FIX2_16(in0, in1, in2, in3, mult, scale, shift,          \
                          dst0, dst1) do {                                 \
  v4u32 ext0_m, ext1_m, ext2_m, ext3_m;                                    \
  v2u64 prod0_m, prod1_m, prod2_m, prod3_m;                                \
  ILVRL_W2_UW(in0, in2, ext0_m, ext1_m);                                   \
  ILVRL_W2_UW(in1, in3, ext2_m, ext3_m);                                   \
  /* Stage 1: weighted sum of each interleaved pair. */                    \
  DOTP_UW2_UD(ext0_m, ext1_m, mult, mult, prod0_m, prod1_m);               \
  DOTP_UW2_UD(ext2_m, ext3_m, mult, mult, prod2_m, prod3_m);               \
  SRAR_D4_UD(prod0_m, prod1_m, prod2_m, prod3_m, shift);                   \
  /* Stage 2: scale the intermediates in place. */                         \
  DOTP_UW2_UD(prod0_m, prod1_m, scale, scale, prod0_m, prod1_m);           \
  DOTP_UW2_UD(prod2_m, prod3_m, scale, scale, prod2_m, prod3_m);           \
  SRAR_D4_UD(prod0_m, prod1_m, prod2_m, prod3_m, shift);                   \
  PCKEV_B2_UB(prod1_m, prod0_m, prod3_m, prod2_m, dst0, dst1);             \
} while (0)
100
// Four-pair variant of CALC_MULT_FIX2_16: interleaves the words of in0 and
// in1, combines each pair with the weights in 'mult', applies the rounding
// shift, multiplies by 'scale', shifts again, and returns the four result
// bytes in 'dst' as one 32-bit scalar.
#define CALC_MULT_FIX2_4(in0, in1, mult, scale, shift, dst) do {           \
  v4u32 ext0_m, ext1_m;                                                    \
  v2u64 prod0_m, prod1_m;                                                  \
  v16i8 pk0_m, pk1_m;                                                      \
  ILVRL_W2_UW(in0, in1, ext0_m, ext1_m);                                   \
  DOTP_UW2_UD(ext0_m, ext1_m, mult, mult, prod0_m, prod1_m);               \
  SRAR_D2_UD(prod0_m, prod1_m, shift);                                     \
  DOTP_UW2_UD(prod0_m, prod1_m, scale, scale, prod0_m, prod1_m);           \
  SRAR_D2_UD(prod0_m, prod1_m, shift);                                     \
  pk0_m = __msa_pckev_b((v16i8)prod1_m, (v16i8)prod0_m);                   \
  pk1_m = __msa_pckev_b(pk0_m, pk0_m);                                     \
  pk0_m = __msa_pckev_b(pk1_m, pk1_m);                                     \
  dst = __msa_copy_s_w((v4i32)pk0_m, 0);                                   \
} while (0)
115
116static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst,
117 int length,
118 WebPRescaler* const wrk) {
119 const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
120 const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
121 const v4i32 zero = { 0 };
122
123 while (length >= 16) {
124 v4u32 src0, src1, src2, src3;
125 v16u8 out;
126 LD_UW4(frow, 4, src0, src1, src2, src3);
127 CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, out);
128 ST_UB(out, dst);
129 length -= 16;
130 frow += 16;
131 dst += 16;
132 }
133 if (length > 0) {
134 int x_out;
135 if (length >= 12) {
136 uint32_t val0_m, val1_m, val2_m;
137 v4u32 src0, src1, src2;
138 LD_UW3(frow, 4, src0, src1, src2);
139 CALC_MULT_FIX_4(src0, scale, shift, val0_m);
140 CALC_MULT_FIX_4(src1, scale, shift, val1_m);
141 CALC_MULT_FIX_4(src2, scale, shift, val2_m);
142 SW3(val0_m, val1_m, val2_m, dst, 4);
143 length -= 12;
144 frow += 12;
145 dst += 12;
146 } else if (length >= 8) {
147 uint32_t val0_m, val1_m;
148 v4u32 src0, src1;
149 LD_UW2(frow, 4, src0, src1);
150 CALC_MULT_FIX_4(src0, scale, shift, val0_m);
151 CALC_MULT_FIX_4(src1, scale, shift, val1_m);
152 SW2(val0_m, val1_m, dst, 4);
153 length -= 8;
154 frow += 8;
155 dst += 8;
156 } else if (length >= 4) {
157 uint32_t val0_m;
158 const v4u32 src0 = LD_UW(frow);
159 CALC_MULT_FIX_4(src0, scale, shift, val0_m);
160 SW(val0_m, dst);
161 length -= 4;
162 frow += 4;
163 dst += 4;
164 }
165 for (x_out = 0; x_out < length; ++x_out) {
166 const uint32_t J = frow[x_out];
167 const int v = (int)MULT_FIX(J, wrk->fy_scale);
168 assert(v >= 0 && v <= 255);
169 dst[x_out] = v;
170 }
171 }
172}
173
174static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
175 uint8_t* dst, int length,
176 WebPRescaler* const wrk) {
177 const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
178 const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
179 const v4i32 B1 = __msa_fill_w(B);
180 const v4i32 A1 = __msa_fill_w(A);
181 const v4i32 AB = __msa_ilvr_w(A1, B1);
182 const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
183 const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
184
185 while (length >= 16) {
186 v4u32 frow0, frow1, frow2, frow3, irow0, irow1, irow2, irow3;
187 v16u8 t0, t1, t2, t3, t4, t5;
188 LD_UW4(frow, 4, frow0, frow1, frow2, frow3);
189 LD_UW4(irow, 4, irow0, irow1, irow2, irow3);
190 CALC_MULT_FIX2_16(frow0, frow1, irow0, irow1, AB, scale, shift, t0, t1);
191 CALC_MULT_FIX2_16(frow2, frow3, irow2, irow3, AB, scale, shift, t2, t3);
192 PCKEV_B2_UB(t1, t0, t3, t2, t4, t5);
193 t0 = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4);
194 ST_UB(t0, dst);
195 frow += 16;
196 irow += 16;
197 dst += 16;
198 length -= 16;
199 }
200 if (length > 0) {
201 int x_out;
202 if (length >= 12) {
203 uint32_t val0_m, val1_m, val2_m;
204 v4u32 frow0, frow1, frow2, irow0, irow1, irow2;
205 LD_UW3(frow, 4, frow0, frow1, frow2);
206 LD_UW3(irow, 4, irow0, irow1, irow2);
207 CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
208 CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
209 CALC_MULT_FIX2_4(frow2, irow2, AB, scale, shift, val2_m);
210 SW3(val0_m, val1_m, val2_m, dst, 4);
211 frow += 12;
212 irow += 12;
213 dst += 12;
214 length -= 12;
215 } else if (length >= 8) {
216 uint32_t val0_m, val1_m;
217 v4u32 frow0, frow1, irow0, irow1;
218 LD_UW2(frow, 4, frow0, frow1);
219 LD_UW2(irow, 4, irow0, irow1);
220 CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
221 CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
222 SW2(val0_m, val1_m, dst, 4);
223 frow += 4;
224 irow += 4;
225 dst += 4;
226 length -= 4;
227 } else if (length >= 4) {
228 uint32_t val0_m;
229 const v4u32 frow0 = LD_UW(frow + 0);
230 const v4u32 irow0 = LD_UW(irow + 0);
231 CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
232 SW(val0_m, dst);
233 frow += 4;
234 irow += 4;
235 dst += 4;
236 length -= 4;
237 }
238 for (x_out = 0; x_out < length; ++x_out) {
239 const uint64_t I = (uint64_t)A * frow[x_out]
240 + (uint64_t)B * irow[x_out];
241 const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
242 const int v = (int)MULT_FIX(J, wrk->fy_scale);
243 assert(v >= 0 && v <= 255);
244 dst[x_out] = v;
245 }
246 }
247}
248
249static void RescalerExportRowExpand(WebPRescaler* const wrk) {
250 uint8_t* dst = wrk->dst;
251 rescaler_t* irow = wrk->irow;
252 const int x_out_max = wrk->dst_width * wrk->num_channels;
253 const rescaler_t* frow = wrk->frow;
254 assert(!WebPRescalerOutputDone(wrk));
255 assert(wrk->y_accum <= 0);
256 assert(wrk->y_expand);
257 assert(wrk->y_sub != 0);
258 if (wrk->y_accum == 0) {
259 ExportRowExpand_0(frow, dst, x_out_max, wrk);
260 } else {
261 ExportRowExpand_1(frow, irow, dst, x_out_max, wrk);
262 }
263}
264
265static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow,
266 uint8_t* dst, int length,
267 const uint32_t yscale,
268 WebPRescaler* const wrk) {
269 const v4u32 y_scale = (v4u32)__msa_fill_w(yscale);
270 const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale);
271 const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
272 const v4i32 zero = { 0 };
273
274 while (length >= 16) {
275 v4u32 src0, src1, src2, src3, frac0, frac1, frac2, frac3;
276 v16u8 out;
277 LD_UW4(frow, 4, src0, src1, src2, src3);
278 CALC_MULT_FIX1_16(src0, src1, src2, src3, y_scale, shiftval,
279 frac0, frac1, frac2, frac3);
280 LD_UW4(irow, 4, src0, src1, src2, src3);
281 SUB4(src0, frac0, src1, frac1, src2, frac2, src3, frac3,
282 src0, src1, src2, src3);
283 CALC_MULT_FIX_16(src0, src1, src2, src3, fxyscale, shiftval, out);
284 ST_UB(out, dst);
285 ST_UW4(frac0, frac1, frac2, frac3, irow, 4);
286 frow += 16;
287 irow += 16;
288 dst += 16;
289 length -= 16;
290 }
291 if (length > 0) {
292 int x_out;
293 if (length >= 12) {
294 uint32_t val0_m, val1_m, val2_m;
295 v4u32 src0, src1, src2, frac0, frac1, frac2;
296 LD_UW3(frow, 4, src0, src1, src2);
297 CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
298 CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
299 CALC_MULT_FIX1_4(src2, y_scale, shiftval, frac2);
300 LD_UW3(irow, 4, src0, src1, src2);
301 SUB3(src0, frac0, src1, frac1, src2, frac2, src0, src1, src2);
302 CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
303 CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
304 CALC_MULT_FIX_4(src2, fxyscale, shiftval, val2_m);
305 SW3(val0_m, val1_m, val2_m, dst, 4);
306 ST_UW3(frac0, frac1, frac2, irow, 4);
307 frow += 12;
308 irow += 12;
309 dst += 12;
310 length -= 12;
311 } else if (length >= 8) {
312 uint32_t val0_m, val1_m;
313 v4u32 src0, src1, frac0, frac1;
314 LD_UW2(frow, 4, src0, src1);
315 CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
316 CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
317 LD_UW2(irow, 4, src0, src1);
318 SUB2(src0, frac0, src1, frac1, src0, src1);
319 CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
320 CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
321 SW2(val0_m, val1_m, dst, 4);
322 ST_UW2(frac0, frac1, irow, 4);
323 frow += 8;
324 irow += 8;
325 dst += 8;
326 length -= 8;
327 } else if (length >= 4) {
328 uint32_t val0_m;
329 v4u32 frac0;
330 v4u32 src0 = LD_UW(frow);
331 CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
332 src0 = LD_UW(irow);
333 src0 = src0 - frac0;
334 CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
335 SW(val0_m, dst);
336 ST_UW(frac0, irow);
337 frow += 4;
338 irow += 4;
339 dst += 4;
340 length -= 4;
341 }
342 for (x_out = 0; x_out < length; ++x_out) {
343 const uint32_t frac = (uint32_t)MULT_FIX(frow[x_out], yscale);
344 const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
345 assert(v >= 0 && v <= 255);
346 dst[x_out] = v;
347 irow[x_out] = frac;
348 }
349 }
350}
351
352static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
353 int length,
354 WebPRescaler* const wrk) {
355 const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale);
356 const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
357 const v4i32 zero = { 0 };
358
359 while (length >= 16) {
360 v4u32 src0, src1, src2, src3;
361 v16u8 dst0;
362 LD_UW4(irow, 4, src0, src1, src2, src3);
363 CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, dst0);
364 ST_UB(dst0, dst);
365 ST_SW4(zero, zero, zero, zero, irow, 4);
366 length -= 16;
367 irow += 16;
368 dst += 16;
369 }
370 if (length > 0) {
371 int x_out;
372 if (length >= 12) {
373 uint32_t val0_m, val1_m, val2_m;
374 v4u32 src0, src1, src2;
375 LD_UW3(irow, 4, src0, src1, src2);
376 CALC_MULT_FIX_4(src0, scale, shift, val0_m);
377 CALC_MULT_FIX_4(src1, scale, shift, val1_m);
378 CALC_MULT_FIX_4(src2, scale, shift, val2_m);
379 SW3(val0_m, val1_m, val2_m, dst, 4);
380 ST_SW3(zero, zero, zero, irow, 4);
381 length -= 12;
382 irow += 12;
383 dst += 12;
384 } else if (length >= 8) {
385 uint32_t val0_m, val1_m;
386 v4u32 src0, src1;
387 LD_UW2(irow, 4, src0, src1);
388 CALC_MULT_FIX_4(src0, scale, shift, val0_m);
389 CALC_MULT_FIX_4(src1, scale, shift, val1_m);
390 SW2(val0_m, val1_m, dst, 4);
391 ST_SW2(zero, zero, irow, 4);
392 length -= 8;
393 irow += 8;
394 dst += 8;
395 } else if (length >= 4) {
396 uint32_t val0_m;
397 const v4u32 src0 = LD_UW(irow + 0);
398 CALC_MULT_FIX_4(src0, scale, shift, val0_m);
399 SW(val0_m, dst);
400 ST_SW(zero, irow);
401 length -= 4;
402 irow += 4;
403 dst += 4;
404 }
405 for (x_out = 0; x_out < length; ++x_out) {
406 const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale);
407 assert(v >= 0 && v <= 255);
408 dst[x_out] = v;
409 irow[x_out] = 0;
410 }
411 }
412}
413
414static void RescalerExportRowShrink(WebPRescaler* const wrk) {
415 uint8_t* dst = wrk->dst;
416 rescaler_t* irow = wrk->irow;
417 const int x_out_max = wrk->dst_width * wrk->num_channels;
418 const rescaler_t* frow = wrk->frow;
419 const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
420 assert(!WebPRescalerOutputDone(wrk));
421 assert(wrk->y_accum <= 0);
422 assert(!wrk->y_expand);
423 if (yscale) {
424 ExportRowShrink_0(frow, irow, dst, x_out_max, yscale, wrk);
425 } else {
426 ExportRowShrink_1(irow, dst, x_out_max, wrk);
427 }
428}
429
430//------------------------------------------------------------------------------
431// Entry point
432
433extern void WebPRescalerDspInitMSA(void);
434
435WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) {
436 WebPRescalerExportRowExpand = RescalerExportRowExpand;
437 WebPRescalerExportRowShrink = RescalerExportRowShrink;
438}
439
440#else // !WEBP_USE_MSA
441
442WEBP_DSP_INIT_STUB(WebPRescalerDspInitMSA)
443
444#endif // WEBP_USE_MSA
445