1 | // Copyright 2016 Google Inc. All Rights Reserved. |
2 | // |
3 | // Use of this source code is governed by a BSD-style license |
4 | // that can be found in the COPYING file in the root of the source |
5 | // tree. An additional intellectual property rights grant can be found |
6 | // in the file PATENTS. All contributing project authors may |
7 | // be found in the AUTHORS file in the root of the source tree. |
8 | // ----------------------------------------------------------------------------- |
9 | // |
10 | // MSA version of YUV to RGB upsampling functions. |
11 | // |
12 | // Author: Prashant Patil (prashant.patil@imgtec.com) |
13 | |
14 | #include <string.h> |
15 | #include "src/dsp/dsp.h" |
16 | |
17 | #if defined(WEBP_USE_MSA) |
18 | |
19 | #include "src/dsp/msa_macro.h" |
20 | #include "src/dsp/yuv.h" |
21 | |
22 | #ifdef FANCY_UPSAMPLING |
23 | |
// Widens the low 8 bytes of 'in' to 32-bit lanes by interleaving with zeros:
// 'out0' receives lanes 0..3 and 'out1' lanes 4..7. The upper 8 bytes of 'in'
// are not used. Requires a zeroed vector named 'zero' to be visible where the
// macro is expanded.
#define ILVR_UW2(in, out0, out1) do { \
  const v8i16 t0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)in); \
  out0 = (v4u32)__msa_ilvr_h((v8i16)zero, t0); \
  out1 = (v4u32)__msa_ilvl_h((v8i16)zero, t0); \
} while (0)
29 | |
// 16-pixel counterpart of ILVR_UW2: interleaves all 16 bytes of 'in' with
// zeros, first at byte and then at halfword granularity, producing four
// vectors of 32-bit lanes in out0..out3. Requires a zeroed vector named 'zero'
// to be visible where the macro is expanded.
// NOTE(review): ILVRL_B2_UB / ILVRL_H2_UW come from msa_macro.h; presumably
// they emit the right-half result first and the left-half result second.
#define ILVRL_UW4(in, out0, out1, out2, out3) do { \
  v16u8 t0, t1; \
  ILVRL_B2_UB(zero, in, t0, t1); \
  ILVRL_H2_UW(zero, t0, out0, out1); \
  ILVRL_H2_UW(zero, t1, out2, out3); \
} while (0)
36 | |
// Multiplies the sixteen 32-bit lanes held in in0..in3 by (cnst * 256) and
// packs the odd (upper) halfword of every product into out0 (from in0/in1)
// and out1 (from in2/in3). Keeping the upper 16 bits of v * cnst * 256 is the
// same as (v * cnst) >> 8.
// NOTE(review): this is presumably the vector form of the scalar MultHi()
// used further down in this file -- confirm against its definition.
#define MULTHI_16(in0, in1, in2, in3, cnst, out0, out1) do { \
  const v4i32 const0 = (v4i32)__msa_fill_w(cnst * 256); \
  v4u32 temp0, temp1, temp2, temp3; \
  MUL4(in0, const0, in1, const0, in2, const0, in3, const0, \
       temp0, temp1, temp2, temp3); \
  PCKOD_H2_UH(temp1, temp0, temp3, temp2, out0, out1); \
} while (0)
44 | |
// 8-pixel variant of MULTHI_16: multiplies the eight 32-bit lanes in in0/in1
// by (cnst * 256) and packs the odd (upper) halfword of every product into
// 'out0', i.e. (v * cnst) >> 8 for each lane.
#define MULTHI_8(in0, in1, cnst, out0) do { \
  const v4i32 const0 = (v4i32)__msa_fill_w(cnst * 256); \
  v4u32 temp0, temp1; \
  MUL2(in0, const0, in1, const0, temp0, temp1); \
  out0 = (v8u16)__msa_pckod_h((v8i16)temp1, (v8i16)temp0); \
} while (0)
51 | |
// Red channel for 16 pixels. For every 16-bit lane computes
// (y + v - 14234) >> 6 with signed saturating add/subtract, clips the result
// to [0, 255] and packs the sixteen bytes into 'dst'. y0/y1 and v0/v1 are the
// already-scaled luma and V terms (see CALC_RGB16). Mirrors the scalar
// expression for r1 in YuvToRgb().
#define CALC_R16(y0, y1, v0, v1, dst) do { \
  const v8i16 const_a = (v8i16)__msa_fill_h(14234); \
  const v8i16 a0 = __msa_adds_s_h((v8i16)y0, (v8i16)v0); \
  const v8i16 a1 = __msa_adds_s_h((v8i16)y1, (v8i16)v1); \
  v8i16 b0 = __msa_subs_s_h(a0, const_a); \
  v8i16 b1 = __msa_subs_s_h(a1, const_a); \
  SRAI_H2_SH(b0, b1, 6); \
  CLIP_SH2_0_255(b0, b1); \
  dst = (v16u8)__msa_pckev_b((v16i8)b1, (v16i8)b0); \
} while (0)
62 | |
// Red channel for 8 pixels: (y + v - 14234) >> 6 per 16-bit lane with signed
// saturation, clipped to [0, 255]. The same source vector is passed twice to
// the final pack, so the eight result bytes appear in both halves of 'dst'.
#define CALC_R8(y0, v0, dst) do { \
  const v8i16 const_a = (v8i16)__msa_fill_h(14234); \
  const v8i16 a0 = __msa_adds_s_h((v8i16)y0, (v8i16)v0); \
  v8i16 b0 = __msa_subs_s_h(a0, const_a); \
  b0 = SRAI_H(b0, 6); \
  CLIP_SH_0_255(b0); \
  dst = (v16u8)__msa_pckev_b((v16i8)b0, (v16i8)b0); \
} while (0)
71 | |
// Green channel for 16 pixels. For every 16-bit lane computes
// (y - u - v + 8708) >> 6 with signed saturating arithmetic, clips to
// [0, 255] and packs the sixteen bytes into 'dst'. The y/u/v arguments are
// the already-scaled terms (see CALC_RGB16). Mirrors the scalar expression
// for g1 in YuvToRgb().
#define CALC_G16(y0, y1, u0, u1, v0, v1, dst) do { \
  const v8i16 const_a = (v8i16)__msa_fill_h(8708); \
  v8i16 a0 = __msa_subs_s_h((v8i16)y0, (v8i16)u0); \
  v8i16 a1 = __msa_subs_s_h((v8i16)y1, (v8i16)u1); \
  const v8i16 b0 = __msa_subs_s_h(a0, (v8i16)v0); \
  const v8i16 b1 = __msa_subs_s_h(a1, (v8i16)v1); \
  a0 = __msa_adds_s_h(b0, const_a); \
  a1 = __msa_adds_s_h(b1, const_a); \
  SRAI_H2_SH(a0, a1, 6); \
  CLIP_SH2_0_255(a0, a1); \
  dst = (v16u8)__msa_pckev_b((v16i8)a1, (v16i8)a0); \
} while (0)
84 | |
// Green channel for 8 pixels: (y - u - v + 8708) >> 6 per 16-bit lane with
// signed saturation, clipped to [0, 255]. The same source vector is passed
// twice to the final pack, so the eight result bytes appear in both halves
// of 'dst'.
#define CALC_G8(y0, u0, v0, dst) do { \
  const v8i16 const_a = (v8i16)__msa_fill_h(8708); \
  v8i16 a0 = __msa_subs_s_h((v8i16)y0, (v8i16)u0); \
  const v8i16 b0 = __msa_subs_s_h(a0, (v8i16)v0); \
  a0 = __msa_adds_s_h(b0, const_a); \
  a0 = SRAI_H(a0, 6); \
  CLIP_SH_0_255(a0); \
  dst = (v16u8)__msa_pckev_b((v16i8)a0, (v16i8)a0); \
} while (0)
94 | |
// Blue channel for 16 pixels. For every 16-bit lane computes
// (y + u - 17685) >> 6 and packs the sixteen clipped bytes into 'dst'.
// Unlike the red/green macros this one uses *unsigned* saturating add and
// subtract, so a sum below 17685 saturates to 0 instead of going negative.
// Mirrors the scalar expression for b1 in YuvToRgb().
#define CALC_B16(y0, y1, u0, u1, dst) do { \
  const v8u16 const_a = (v8u16)__msa_fill_h(17685); \
  const v8u16 a0 = __msa_adds_u_h((v8u16)y0, u0); \
  const v8u16 a1 = __msa_adds_u_h((v8u16)y1, u1); \
  v8u16 b0 = __msa_subs_u_h(a0, const_a); \
  v8u16 b1 = __msa_subs_u_h(a1, const_a); \
  SRAI_H2_UH(b0, b1, 6); \
  CLIP_UH2_0_255(b0, b1); \
  dst = (v16u8)__msa_pckev_b((v16i8)b1, (v16i8)b0); \
} while (0)
105 | |
// Blue channel for 8 pixels: (y + u - 17685) >> 6 per 16-bit lane using
// unsigned saturating add/subtract, then clipped. The same source vector is
// passed twice to the final pack, so the eight result bytes appear in both
// halves of 'dst'.
#define CALC_B8(y0, u0, dst) do { \
  const v8u16 const_a = (v8u16)__msa_fill_h(17685); \
  const v8u16 a0 = __msa_adds_u_h((v8u16)y0, u0); \
  v8u16 b0 = __msa_subs_u_h(a0, const_a); \
  b0 = SRAI_H(b0, 6); \
  CLIP_UH_0_255(b0); \
  dst = (v16u8)__msa_pckev_b((v16i8)b0, (v16i8)b0); \
} while (0)
114 | |
// Converts 16 pixels: loads one vector from each of the 'y', 'u' and 'v'
// pointers and writes the resulting channel bytes to the R, G and B vectors.
// The coefficients (19077, 26149, 13320, 6419, 33050) are the same ones used
// by the scalar YuvToRgb() in this file.
// Statement order matters: p0..p3 still hold the widened V samples when the
// second V multiply (13320) runs, and likewise hold the widened U samples for
// the second U multiply (33050).
#define CALC_RGB16(y, u, v, R, G, B) do { \
  const v16u8 zero = { 0 }; \
  v8u16 y0, y1, u0, u1, v0, v1; \
  v4u32 p0, p1, p2, p3; \
  const v16u8 in_y = LD_UB(y); \
  const v16u8 in_u = LD_UB(u); \
  const v16u8 in_v = LD_UB(v); \
  ILVRL_UW4(in_y, p0, p1, p2, p3); \
  MULTHI_16(p0, p1, p2, p3, 19077, y0, y1); \
  ILVRL_UW4(in_v, p0, p1, p2, p3); \
  MULTHI_16(p0, p1, p2, p3, 26149, v0, v1); \
  CALC_R16(y0, y1, v0, v1, R); \
  MULTHI_16(p0, p1, p2, p3, 13320, v0, v1); \
  ILVRL_UW4(in_u, p0, p1, p2, p3); \
  MULTHI_16(p0, p1, p2, p3, 6419, u0, u1); \
  CALC_G16(y0, y1, u0, u1, v0, v1, G); \
  MULTHI_16(p0, p1, p2, p3, 33050, u0, u1); \
  CALC_B16(y0, y1, u0, u1, B); \
} while (0)
134 | |
// 8-pixel variant of CALC_RGB16. It still loads one full vector from each of
// the 'y', 'u' and 'v' pointers, but only the low 8 bytes of each are
// converted (ILVR_UW2 ignores the upper half). The eight result bytes of each
// channel are written to R, G and B.
// Statement order matters: p0/p1 are reused between the two V multiplies and
// between the two U multiplies.
#define CALC_RGB8(y, u, v, R, G, B) do { \
  const v16u8 zero = { 0 }; \
  v8u16 y0, u0, v0; \
  v4u32 p0, p1; \
  const v16u8 in_y = LD_UB(y); \
  const v16u8 in_u = LD_UB(u); \
  const v16u8 in_v = LD_UB(v); \
  ILVR_UW2(in_y, p0, p1); \
  MULTHI_8(p0, p1, 19077, y0); \
  ILVR_UW2(in_v, p0, p1); \
  MULTHI_8(p0, p1, 26149, v0); \
  CALC_R8(y0, v0, R); \
  MULTHI_8(p0, p1, 13320, v0); \
  ILVR_UW2(in_u, p0, p1); \
  MULTHI_8(p0, p1, 6419, u0); \
  CALC_G8(y0, u0, v0, G); \
  MULTHI_8(p0, p1, 33050, u0); \
  CALC_B8(y0, u0, B); \
} while (0)
154 | |
// Stores 16 pixels of three channels as interleaved triplets
// (a0[i], a1[i], a2[i]), writing three vectors (48 bytes) at dst, dst + 16
// and dst + 32. a0 and a1 are first interleaved pairwise; each shuffle mask
// then selects those pairs and splices in the matching a2 bytes (mask indices
// 16..31).
#define STORE16_3(a0, a1, a2, dst) do { \
  const v16u8 mask0 = { 0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, \
                        8, 9, 20, 10 }; \
  const v16u8 mask1 = { 0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, \
                        8, 25, 9, 10 }; \
  const v16u8 mask2 = { 26, 0, 1, 27, 2, 3, 28, 4, 5, 29, 6, 7, \
                        30, 8, 9, 31 }; \
  v16u8 out0, out1, out2, tmp0, tmp1, tmp2; \
  ILVRL_B2_UB(a1, a0, tmp0, tmp1); \
  out0 = VSHF_UB(tmp0, a2, mask0); \
  tmp2 = SLDI_UB(tmp1, tmp0, 11); \
  out1 = VSHF_UB(tmp2, a2, mask1); \
  tmp2 = SLDI_UB(tmp1, tmp1, 6); \
  out2 = VSHF_UB(tmp2, a2, mask2); \
  ST_UB(out0, dst + 0); \
  ST_UB(out1, dst + 16); \
  ST_UB(out2, dst + 32); \
} while (0)
173 | |
// Stores 8 pixels of three channels as interleaved triplets: one full vector
// at 'dst' followed by the low 64 bits of a second vector at dst + 16
// (24 bytes in total). Only the first 8 entries of mask1 matter, because just
// the low doubleword of 'out1' is stored.
#define STORE8_3(a0, a1, a2, dst) do { \
  int64_t out_m; \
  const v16u8 mask0 = { 0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, \
                        8, 9, 20, 10 }; \
  const v16u8 mask1 = { 11, 21, 12, 13, 22, 14, 15, 23, \
                        255, 255, 255, 255, 255, 255, 255, 255 }; \
  const v16u8 tmp0 = (v16u8)__msa_ilvr_b((v16i8)a1, (v16i8)a0); \
  v16u8 out0, out1; \
  VSHF_B2_UB(tmp0, a2, tmp0, a2, mask0, mask1, out0, out1); \
  ST_UB(out0, dst); \
  out_m = __msa_copy_s_d((v2i64)out1, 0); \
  SD(out_m, dst + 16); \
} while (0)
187 | |
// Stores 16 pixels of four channels as interleaved quadruplets
// (a0[i], a1[i], a2[i], a3[i]): bytes are interleaved pairwise first, then
// the pairs are interleaved as halfwords. Writes four vectors (64 bytes) at
// dst, dst + 16, dst + 32 and dst + 48.
#define STORE16_4(a0, a1, a2, a3, dst) do { \
  v16u8 tmp0, tmp1, tmp2, tmp3; \
  v16u8 out0, out1, out2, out3; \
  ILVRL_B2_UB(a1, a0, tmp0, tmp1); \
  ILVRL_B2_UB(a3, a2, tmp2, tmp3); \
  ILVRL_H2_UB(tmp2, tmp0, out0, out1); \
  ILVRL_H2_UB(tmp3, tmp1, out2, out3); \
  ST_UB(out0, dst + 0); \
  ST_UB(out1, dst + 16); \
  ST_UB(out2, dst + 32); \
  ST_UB(out3, dst + 48); \
} while (0)
200 | |
// 8-pixel variant of STORE16_4: interleaves the low halves of the four
// channel vectors into quadruplets and writes two vectors (32 bytes) at dst
// and dst + 16.
#define STORE8_4(a0, a1, a2, a3, dst) do { \
  v16u8 tmp0, tmp1, tmp2, tmp3; \
  ILVR_B2_UB(a1, a0, a3, a2, tmp0, tmp1); \
  ILVRL_H2_UB(tmp1, tmp0, tmp2, tmp3); \
  ST_UB(tmp2, dst + 0); \
  ST_UB(tmp3, dst + 16); \
} while (0)
208 | |
// Stores 16 two-byte pixels: interleaves a0 and a1 byte-wise (a0[i] first,
// then a1[i]) and writes two vectors (32 bytes) at dst and dst + 16. Used by
// the 16-bit RGB565 / RGBA4444 paths.
#define STORE2_16(a0, a1, dst) do { \
  v16u8 out0, out1; \
  ILVRL_B2_UB(a1, a0, out0, out1); \
  ST_UB(out0, dst + 0); \
  ST_UB(out1, dst + 16); \
} while (0)
215 | |
// 8-pixel variant of STORE2_16: interleaves the low 8 bytes of a0 and a1 and
// writes a single vector (16 bytes) at dst.
#define STORE2_8(a0, a1, dst) do { \
  const v16u8 out0 = (v16u8)__msa_ilvr_b((v16i8)a1, (v16i8)a0); \
  ST_UB(out0, dst); \
} while (0)
220 | |
// Converts N (8 or 16) pixels to the two-byte RGBA4444 layout and stores them
// at 'dst'. Per pixel: RG = (R & 0xf0) | (G >> 4) and BA = (B & 0xf0) | 0x0f,
// i.e. the alpha nibble is always set to 0xf.
// The macro works on variables named R, G, B, RG, BA, tmp0 and tmp1 that must
// exist at the expansion site. 'out0'/'out1' select which of RG/BA is written
// first; callers choose the order based on WEBP_SWAP_16BIT_CSP.
#define CALC_RGBA4444(y, u, v, out0, out1, N, dst) do { \
  CALC_RGB##N(y, u, v, R, G, B); \
  tmp0 = ANDI_B(R, 0xf0); \
  tmp1 = SRAI_B(G, 4); \
  RG = tmp0 | tmp1; \
  tmp0 = ANDI_B(B, 0xf0); \
  BA = ORI_B(tmp0, 0x0f); \
  STORE2_##N(out0, out1, dst); \
} while (0)
230 | |
// Converts N (8 or 16) pixels to the two-byte RGB565 layout and stores them
// at 'dst'. Per pixel: RG = (R & 0xf8) | (G >> 5) and
// GB = ((G << 3) & 0xe0) | (B >> 3), matching the scalar YuvToRgb565().
// The macro works on variables named R, G, B, RG, GB, tmp0 and tmp1 that must
// exist at the expansion site. 'out0'/'out1' select which of RG/GB is written
// first; callers choose the order based on WEBP_SWAP_16BIT_CSP.
#define CALC_RGB565(y, u, v, out0, out1, N, dst) do { \
  CALC_RGB##N(y, u, v, R, G, B); \
  tmp0 = ANDI_B(R, 0xf8); \
  tmp1 = SRAI_B(G, 5); \
  RG = tmp0 | tmp1; \
  tmp0 = SLLI_B(G, 3); \
  tmp1 = ANDI_B(tmp0, 0xe0); \
  tmp0 = SRAI_B(B, 3); \
  GB = tmp0 | tmp1; \
  STORE2_##N(out0, out1, dst); \
} while (0)
242 | |
243 | static WEBP_INLINE int Clip8(int v) { |
244 | return v < 0 ? 0 : v > 255 ? 255 : v; |
245 | } |
246 | |
// Scalar conversion of one (y, u, v) sample to three bytes written in
// R, G, B order at rgb[0..2]. Used for the edge pixels that the vector paths
// do not cover.
static void YuvToRgb(int y, int u, int v, uint8_t* const rgb) {
  const int luma = MultHi(y, 19077);
  const int red = luma + MultHi(v, 26149) - 14234;
  const int green = luma - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
  const int blue = luma + MultHi(u, 33050) - 17685;
  // Drop the 6 fractional bits, then saturate each channel to a byte.
  rgb[0] = Clip8(red >> 6);
  rgb[1] = Clip8(green >> 6);
  rgb[2] = Clip8(blue >> 6);
}
256 | |
// Scalar conversion of one (y, u, v) sample to three bytes written in
// B, G, R order at bgr[0..2].
static void YuvToBgr(int y, int u, int v, uint8_t* const bgr) {
  const int luma = MultHi(y, 19077);
  const int red = luma + MultHi(v, 26149) - 14234;
  const int green = luma - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
  const int blue = luma + MultHi(u, 33050) - 17685;
  // Drop the 6 fractional bits, then saturate each channel to a byte.
  bgr[0] = Clip8(blue >> 6);
  bgr[1] = Clip8(green >> 6);
  bgr[2] = Clip8(red >> 6);
}
266 | |
267 | #if !defined(WEBP_REDUCE_CSP) |
// Scalar conversion of one (y, u, v) sample to a two-byte RGB565 pixel at
// rgb[0..1]. The byte order of the pair depends on WEBP_SWAP_16BIT_CSP.
static void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
  const int luma = MultHi(y, 19077);
  const int red = Clip8((luma + MultHi(v, 26149) - 14234) >> 6);
  const int green =
      Clip8((luma - MultHi(u, 6419) - MultHi(v, 13320) + 8708) >> 6);
  const int blue = Clip8((luma + MultHi(u, 33050) - 17685) >> 6);
  // 5 bits of red + top 3 bits of green.
  const int red_green = (red & 0xf8) | (green >> 5);
  // Next 3 bits of green + 5 bits of blue.
  const int green_blue = ((green << 3) & 0xe0) | (blue >> 3);
#if (WEBP_SWAP_16BIT_CSP == 1)
  rgb[0] = green_blue;
  rgb[1] = red_green;
#else
  rgb[0] = red_green;
  rgb[1] = green_blue;
#endif
}
286 | |
// Scalar conversion of one (y, u, v) sample to a two-byte RGBA4444 pixel at
// argb[0..1], with the alpha nibble forced to 0xf. The byte order of the pair
// depends on WEBP_SWAP_16BIT_CSP.
static void YuvToRgba4444(int y, int u, int v, uint8_t* const argb) {
  const int luma = MultHi(y, 19077);
  const int red = Clip8((luma + MultHi(v, 26149) - 14234) >> 6);
  const int green =
      Clip8((luma - MultHi(u, 6419) - MultHi(v, 13320) + 8708) >> 6);
  const int blue = Clip8((luma + MultHi(u, 33050) - 17685) >> 6);
  const int red_green = (red & 0xf0) | (green >> 4);
  const int blue_alpha = (blue & 0xf0) | 0x0f;  // low nibble = opaque alpha
#if (WEBP_SWAP_16BIT_CSP == 1)
  argb[0] = blue_alpha;
  argb[1] = red_green;
#else
  argb[0] = red_green;
  argb[1] = blue_alpha;
#endif
}
305 | |
// Scalar conversion of one sample to four bytes in A, R, G, B order; alpha is
// always 0xff.
static void YuvToArgb(uint8_t y, uint8_t u, uint8_t v, uint8_t* const argb) {
  uint8_t* const rgb = argb + 1;  // colour bytes follow the alpha byte
  YuvToRgb(y, u, v, rgb);
  argb[0] = 0xff;
}
310 | #endif // WEBP_REDUCE_CSP |
311 | |
// Scalar conversion of one sample to four bytes in B, G, R, A order; alpha is
// always 0xff.
static void YuvToBgra(uint8_t y, uint8_t u, uint8_t v, uint8_t* const bgra) {
  bgra[3] = 0xff;
  YuvToBgr(y, u, v, bgra);  // fills bgra[0..2] only
}
316 | |
// Scalar conversion of one sample to four bytes in R, G, B, A order; alpha is
// always 0xff.
static void YuvToRgba(uint8_t y, uint8_t u, uint8_t v, uint8_t* const rgba) {
  rgba[3] = 0xff;
  YuvToRgb(y, u, v, rgba);  // fills rgba[0..2] only
}
321 | |
322 | #if !defined(WEBP_REDUCE_CSP) |
323 | static void YuvToRgbLine(const uint8_t* y, const uint8_t* u, |
324 | const uint8_t* v, uint8_t* dst, int length) { |
325 | v16u8 R, G, B; |
326 | while (length >= 16) { |
327 | CALC_RGB16(y, u, v, R, G, B); |
328 | STORE16_3(R, G, B, dst); |
329 | y += 16; |
330 | u += 16; |
331 | v += 16; |
332 | dst += 16 * 3; |
333 | length -= 16; |
334 | } |
335 | if (length > 8) { |
336 | uint8_t temp[3 * 16] = { 0 }; |
337 | memcpy(temp, y, length * sizeof(*temp)); |
338 | CALC_RGB16(temp, u, v, R, G, B); |
339 | STORE16_3(R, G, B, temp); |
340 | memcpy(dst, temp, length * 3 * sizeof(*dst)); |
341 | } else if (length > 0) { |
342 | uint8_t temp[3 * 8] = { 0 }; |
343 | memcpy(temp, y, length * sizeof(*temp)); |
344 | CALC_RGB8(temp, u, v, R, G, B); |
345 | STORE8_3(R, G, B, temp); |
346 | memcpy(dst, temp, length * 3 * sizeof(*dst)); |
347 | } |
348 | } |
349 | |
350 | static void YuvToBgrLine(const uint8_t* y, const uint8_t* u, |
351 | const uint8_t* v, uint8_t* dst, int length) { |
352 | v16u8 R, G, B; |
353 | while (length >= 16) { |
354 | CALC_RGB16(y, u, v, R, G, B); |
355 | STORE16_3(B, G, R, dst); |
356 | y += 16; |
357 | u += 16; |
358 | v += 16; |
359 | dst += 16 * 3; |
360 | length -= 16; |
361 | } |
362 | if (length > 8) { |
363 | uint8_t temp[3 * 16] = { 0 }; |
364 | memcpy(temp, y, length * sizeof(*temp)); |
365 | CALC_RGB16(temp, u, v, R, G, B); |
366 | STORE16_3(B, G, R, temp); |
367 | memcpy(dst, temp, length * 3 * sizeof(*dst)); |
368 | } else if (length > 0) { |
369 | uint8_t temp[3 * 8] = { 0 }; |
370 | memcpy(temp, y, length * sizeof(*temp)); |
371 | CALC_RGB8(temp, u, v, R, G, B); |
372 | STORE8_3(B, G, R, temp); |
373 | memcpy(dst, temp, length * 3 * sizeof(*dst)); |
374 | } |
375 | } |
376 | #endif // WEBP_REDUCE_CSP |
377 | |
378 | static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u, |
379 | const uint8_t* v, uint8_t* dst, int length) { |
380 | v16u8 R, G, B; |
381 | const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL); |
382 | while (length >= 16) { |
383 | CALC_RGB16(y, u, v, R, G, B); |
384 | STORE16_4(R, G, B, A, dst); |
385 | y += 16; |
386 | u += 16; |
387 | v += 16; |
388 | dst += 16 * 4; |
389 | length -= 16; |
390 | } |
391 | if (length > 8) { |
392 | uint8_t temp[4 * 16] = { 0 }; |
393 | memcpy(temp, y, length * sizeof(*temp)); |
394 | CALC_RGB16(&temp[0], u, v, R, G, B); |
395 | STORE16_4(R, G, B, A, temp); |
396 | memcpy(dst, temp, length * 4 * sizeof(*dst)); |
397 | } else if (length > 0) { |
398 | uint8_t temp[4 * 8] = { 0 }; |
399 | memcpy(temp, y, length * sizeof(*temp)); |
400 | CALC_RGB8(temp, u, v, R, G, B); |
401 | STORE8_4(R, G, B, A, temp); |
402 | memcpy(dst, temp, length * 4 * sizeof(*dst)); |
403 | } |
404 | } |
405 | |
406 | static void YuvToBgraLine(const uint8_t* y, const uint8_t* u, |
407 | const uint8_t* v, uint8_t* dst, int length) { |
408 | v16u8 R, G, B; |
409 | const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL); |
410 | while (length >= 16) { |
411 | CALC_RGB16(y, u, v, R, G, B); |
412 | STORE16_4(B, G, R, A, dst); |
413 | y += 16; |
414 | u += 16; |
415 | v += 16; |
416 | dst += 16 * 4; |
417 | length -= 16; |
418 | } |
419 | if (length > 8) { |
420 | uint8_t temp[4 * 16] = { 0 }; |
421 | memcpy(temp, y, length * sizeof(*temp)); |
422 | CALC_RGB16(temp, u, v, R, G, B); |
423 | STORE16_4(B, G, R, A, temp); |
424 | memcpy(dst, temp, length * 4 * sizeof(*dst)); |
425 | } else if (length > 0) { |
426 | uint8_t temp[4 * 8] = { 0 }; |
427 | memcpy(temp, y, length * sizeof(*temp)); |
428 | CALC_RGB8(temp, u, v, R, G, B); |
429 | STORE8_4(B, G, R, A, temp); |
430 | memcpy(dst, temp, length * 4 * sizeof(*dst)); |
431 | } |
432 | } |
433 | |
434 | #if !defined(WEBP_REDUCE_CSP) |
435 | static void YuvToArgbLine(const uint8_t* y, const uint8_t* u, |
436 | const uint8_t* v, uint8_t* dst, int length) { |
437 | v16u8 R, G, B; |
438 | const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL); |
439 | while (length >= 16) { |
440 | CALC_RGB16(y, u, v, R, G, B); |
441 | STORE16_4(A, R, G, B, dst); |
442 | y += 16; |
443 | u += 16; |
444 | v += 16; |
445 | dst += 16 * 4; |
446 | length -= 16; |
447 | } |
448 | if (length > 8) { |
449 | uint8_t temp[4 * 16] = { 0 }; |
450 | memcpy(temp, y, length * sizeof(*temp)); |
451 | CALC_RGB16(temp, u, v, R, G, B); |
452 | STORE16_4(A, R, G, B, temp); |
453 | memcpy(dst, temp, length * 4 * sizeof(*dst)); |
454 | } else if (length > 0) { |
455 | uint8_t temp[4 * 8] = { 0 }; |
456 | memcpy(temp, y, length * sizeof(*temp)); |
457 | CALC_RGB8(temp, u, v, R, G, B); |
458 | STORE8_4(A, R, G, B, temp); |
459 | memcpy(dst, temp, length * 4 * sizeof(*dst)); |
460 | } |
461 | } |
462 | |
463 | static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u, |
464 | const uint8_t* v, uint8_t* dst, int length) { |
465 | v16u8 R, G, B, RG, BA, tmp0, tmp1; |
466 | while (length >= 16) { |
467 | #if (WEBP_SWAP_16BIT_CSP == 1) |
468 | CALC_RGBA4444(y, u, v, BA, RG, 16, dst); |
469 | #else |
470 | CALC_RGBA4444(y, u, v, RG, BA, 16, dst); |
471 | #endif |
472 | y += 16; |
473 | u += 16; |
474 | v += 16; |
475 | dst += 16 * 2; |
476 | length -= 16; |
477 | } |
478 | if (length > 8) { |
479 | uint8_t temp[2 * 16] = { 0 }; |
480 | memcpy(temp, y, length * sizeof(*temp)); |
481 | #if (WEBP_SWAP_16BIT_CSP == 1) |
482 | CALC_RGBA4444(temp, u, v, BA, RG, 16, temp); |
483 | #else |
484 | CALC_RGBA4444(temp, u, v, RG, BA, 16, temp); |
485 | #endif |
486 | memcpy(dst, temp, length * 2 * sizeof(*dst)); |
487 | } else if (length > 0) { |
488 | uint8_t temp[2 * 8] = { 0 }; |
489 | memcpy(temp, y, length * sizeof(*temp)); |
490 | #if (WEBP_SWAP_16BIT_CSP == 1) |
491 | CALC_RGBA4444(temp, u, v, BA, RG, 8, temp); |
492 | #else |
493 | CALC_RGBA4444(temp, u, v, RG, BA, 8, temp); |
494 | #endif |
495 | memcpy(dst, temp, length * 2 * sizeof(*dst)); |
496 | } |
497 | } |
498 | |
499 | static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u, |
500 | const uint8_t* v, uint8_t* dst, int length) { |
501 | v16u8 R, G, B, RG, GB, tmp0, tmp1; |
502 | while (length >= 16) { |
503 | #if (WEBP_SWAP_16BIT_CSP == 1) |
504 | CALC_RGB565(y, u, v, GB, RG, 16, dst); |
505 | #else |
506 | CALC_RGB565(y, u, v, RG, GB, 16, dst); |
507 | #endif |
508 | y += 16; |
509 | u += 16; |
510 | v += 16; |
511 | dst += 16 * 2; |
512 | length -= 16; |
513 | } |
514 | if (length > 8) { |
515 | uint8_t temp[2 * 16] = { 0 }; |
516 | memcpy(temp, y, length * sizeof(*temp)); |
517 | #if (WEBP_SWAP_16BIT_CSP == 1) |
518 | CALC_RGB565(temp, u, v, GB, RG, 16, temp); |
519 | #else |
520 | CALC_RGB565(temp, u, v, RG, GB, 16, temp); |
521 | #endif |
522 | memcpy(dst, temp, length * 2 * sizeof(*dst)); |
523 | } else if (length > 0) { |
524 | uint8_t temp[2 * 8] = { 0 }; |
525 | memcpy(temp, y, length * sizeof(*temp)); |
526 | #if (WEBP_SWAP_16BIT_CSP == 1) |
527 | CALC_RGB565(temp, u, v, GB, RG, 8, temp); |
528 | #else |
529 | CALC_RGB565(temp, u, v, RG, GB, 8, temp); |
530 | #endif |
531 | memcpy(dst, temp, length * 2 * sizeof(*dst)); |
532 | } |
533 | } |
534 | #endif // WEBP_REDUCE_CSP |
535 | |
// Upsamples 16 chroma positions into 32 output samples per row.
// On entry (as used by UPSAMPLE_FUNC): a/b hold the top-row samples at
// offsets i and i + 1, c/d hold the current-row samples at offsets i and
// i + 1. On exit a/b are overwritten with the 32 interleaved results for the
// top row; c/d are overwritten with the bottom-row results only when the
// variable 'pbot_y' at the expansion site is non-NULL, otherwise they are
// left unchanged.
// Everything is done with byte-wise rounding averages; the '& 1' terms are
// corrections subtracted from those averages.
// NOTE(review): this looks like the bit-exact (9a + 3b + 3c + d + 8) >> 4
// scheme, with 'k' approximating (a + b + c + d) / 4 and diag1/diag2 the two
// diagonal blends -- confirm against the reference C implementation.
#define UPSAMPLE_32PIXELS(a, b, c, d) do { \
  v16u8 s = __msa_aver_u_b(a, d); \
  v16u8 t = __msa_aver_u_b(b, c); \
  const v16u8 st = s ^ t; \
  v16u8 ad = a ^ d; \
  v16u8 bc = b ^ c; \
  v16u8 t0 = ad | bc; \
  v16u8 t1 = t0 | st; \
  v16u8 t2 = ANDI_B(t1, 1); \
  v16u8 t3 = __msa_aver_u_b(s, t); \
  const v16u8 k = t3 - t2; \
  v16u8 diag1, diag2; \
  AVER_UB2_UB(t, k, s, k, t0, t1); \
  bc = bc & st; \
  ad = ad & st; \
  t = t ^ k; \
  s = s ^ k; \
  t2 = bc | t; \
  t3 = ad | s; \
  t2 = ANDI_B(t2, 1); \
  t3 = ANDI_B(t3, 1); \
  SUB2(t0, t2, t1, t3, diag1, diag2); \
  AVER_UB2_UB(a, diag1, b, diag2, t0, t1); \
  ILVRL_B2_UB(t1, t0, a, b); \
  if (pbot_y != NULL) { \
    AVER_UB2_UB(c, diag2, d, diag1, t0, t1); \
    ILVRL_B2_UB(t1, t0, c, d); \
  } \
} while (0)
565 | |
// Defines FUNC_NAME(), a fancy-upsampling converter for a pair of output rows.
//   FUNC   - scalar per-pixel converter; FUNC##Line must be the matching
//            vector row converter.
//   XSTEP  - number of output bytes per pixel.
// Parameters of the generated function:
//   top_y / bot_y     - luma rows, 'len' samples each; bot_y may be NULL, in
//                       which case only the top row is produced.
//   top_u / top_v     - chroma rows associated with the top output row.
//   cur_u / cur_v     - chroma rows associated with the bottom output row.
//   top_dst / bot_dst - output rows; bot_dst is only used when bot_y != NULL.
//   len               - number of output pixels per row.
// The first pixel (and the last one when 'len' is even) is converted with the
// scalar FUNC using a vertical 3:1 chroma blend; everything in between goes
// through UPSAMPLE_32PIXELS and FUNC##Line, 32 pixels at a time.
// No chroma sample beyond index (len - 1) >> 1 is read from any source row.
#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y, \
                      const uint8_t* top_u, const uint8_t* top_v, \
                      const uint8_t* cur_u, const uint8_t* cur_v, \
                      uint8_t* top_dst, uint8_t* bot_dst, int len) \
{ \
  int size = (len - 1) >> 1; \
  uint8_t temp_u[64]; \
  uint8_t temp_v[64]; \
  const uint32_t tl_uv = ((top_u[0]) | ((top_v[0]) << 16)); \
  const uint32_t l_uv = ((cur_u[0]) | ((cur_v[0]) << 16)); \
  const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
  const uint8_t* ptop_y = &top_y[1]; \
  uint8_t* ptop_dst = top_dst + (XSTEP); \
  /* Bottom-row cursors stay NULL when there is no bottom row: pointer */ \
  /* arithmetic on NULL is undefined, and UPSAMPLE_32PIXELS relies on */ \
  /* 'pbot_y != NULL' to decide whether to compute the bottom row. */ \
  const uint8_t* pbot_y = (bot_y != NULL) ? &bot_y[1] : NULL; \
  uint8_t* pbot_dst = (bot_y != NULL) ? bot_dst + (XSTEP) : NULL; \
  \
  FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst); \
  if (bot_y != NULL) { \
    const uint32_t uv1 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
    FUNC(bot_y[0], uv1 & 0xff, (uv1 >> 16), bot_dst); \
  } \
  while (size >= 16) { \
    v16u8 tu0, tu1, tv0, tv1, cu0, cu1, cv0, cv1; \
    LD_UB2(top_u, 1, tu0, tu1); \
    LD_UB2(cur_u, 1, cu0, cu1); \
    LD_UB2(top_v, 1, tv0, tv1); \
    LD_UB2(cur_v, 1, cv0, cv1); \
    UPSAMPLE_32PIXELS(tu0, tu1, cu0, cu1); \
    UPSAMPLE_32PIXELS(tv0, tv1, cv0, cv1); \
    ST_UB4(tu0, tu1, cu0, cu1, &temp_u[0], 16); \
    ST_UB4(tv0, tv1, cv0, cv1, &temp_v[0], 16); \
    FUNC##Line(ptop_y, &temp_u[ 0], &temp_v[0], ptop_dst, 32); \
    if (bot_y != NULL) { \
      FUNC##Line(pbot_y, &temp_u[32], &temp_v[32], pbot_dst, 32); \
      pbot_y += 32; \
      pbot_dst += (XSTEP) * 32; \
    } \
    ptop_y += 32; \
    ptop_dst += (XSTEP) * 32; \
    top_u += 16; \
    top_v += 16; \
    cur_u += 16; \
    cur_v += 16; \
    size -= 16; \
  } \
  if (size > 0) { \
    v16u8 tu0, tu1, tv0, tv1, cu0, cu1, cv0, cv1; \
    /* Producing 2 * size pixels needs samples 0..size of each chroma */ \
    /* row. Copy exactly that many so the source rows are never read */ \
    /* past their end; the unused lanes are zero-filled and their */ \
    /* results are discarded by FUNC##Line. */ \
    const size_t num_uv = (size_t)size + 1; \
    memset(temp_u, 0, sizeof(temp_u)); \
    memset(temp_v, 0, sizeof(temp_v)); \
    memcpy(&temp_u[ 0], top_u, num_uv * sizeof(uint8_t)); \
    memcpy(&temp_u[32], cur_u, num_uv * sizeof(uint8_t)); \
    memcpy(&temp_v[ 0], top_v, num_uv * sizeof(uint8_t)); \
    memcpy(&temp_v[32], cur_v, num_uv * sizeof(uint8_t)); \
    LD_UB2(&temp_u[ 0], 1, tu0, tu1); \
    LD_UB2(&temp_u[32], 1, cu0, cu1); \
    LD_UB2(&temp_v[ 0], 1, tv0, tv1); \
    LD_UB2(&temp_v[32], 1, cv0, cv1); \
    UPSAMPLE_32PIXELS(tu0, tu1, cu0, cu1); \
    UPSAMPLE_32PIXELS(tv0, tv1, cv0, cv1); \
    ST_UB4(tu0, tu1, cu0, cu1, &temp_u[0], 16); \
    ST_UB4(tv0, tv1, cv0, cv1, &temp_v[0], 16); \
    FUNC##Line(ptop_y, &temp_u[ 0], &temp_v[0], ptop_dst, size * 2); \
    if (bot_y != NULL) { \
      FUNC##Line(pbot_y, &temp_u[32], &temp_v[32], pbot_dst, size * 2); \
    } \
    top_u += size; \
    top_v += size; \
    cur_u += size; \
    cur_v += size; \
  } \
  /* An even 'len' leaves one last pixel that has no right-hand chroma */ \
  /* neighbour; blend vertically only, as for the first pixel. */ \
  if (!(len & 1)) { \
    const uint32_t t0 = ((top_u[0]) | ((top_v[0]) << 16)); \
    const uint32_t c0 = ((cur_u[0]) | ((cur_v[0]) << 16)); \
    const uint32_t tmp0 = (3 * t0 + c0 + 0x00020002u) >> 2; \
    FUNC(top_y[len - 1], tmp0 & 0xff, (tmp0 >> 16), \
         top_dst + (len - 1) * (XSTEP)); \
    if (bot_y != NULL) { \
      const uint32_t tmp1 = (3 * c0 + t0 + 0x00020002u) >> 2; \
      FUNC(bot_y[len - 1], tmp1 & 0xff, (tmp1 >> 16), \
           bot_dst + (len - 1) * (XSTEP)); \
    } \
  } \
}
648 | |
// Instantiate one line-pair upsampler per output layout. The second argument
// is the scalar per-pixel converter (its "Line" sibling handles the vector
// path) and the third is the number of output bytes per pixel.
UPSAMPLE_FUNC(UpsampleRgbaLinePair, YuvToRgba, 4)
UPSAMPLE_FUNC(UpsampleBgraLinePair, YuvToBgra, 4)
#if !defined(WEBP_REDUCE_CSP)
UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3)
UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3)
UPSAMPLE_FUNC(UpsampleArgbLinePair, YuvToArgb, 4)
UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2)
UPSAMPLE_FUNC(UpsampleRgb565LinePair, YuvToRgb565, 2)
#endif  // WEBP_REDUCE_CSP
658 | |
659 | //------------------------------------------------------------------------------ |
660 | // Entry point |
661 | |
662 | extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */]; |
663 | |
664 | extern void WebPInitUpsamplersMSA(void); |
665 | |
666 | WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMSA(void) { |
667 | WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair; |
668 | WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair; |
669 | WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair; |
670 | WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair; |
671 | #if !defined(WEBP_REDUCE_CSP) |
672 | WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair; |
673 | WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair; |
674 | WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair; |
675 | WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair; |
676 | WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair; |
677 | WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair; |
678 | WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair; |
679 | #endif // WEBP_REDUCE_CSP |
680 | } |
681 | |
682 | #endif // FANCY_UPSAMPLING |
683 | |
684 | #endif // WEBP_USE_MSA |
685 | |
// When either FANCY_UPSAMPLING or WEBP_USE_MSA is not defined, the real
// initializer above is compiled out; WEBP_DSP_INIT_STUB supplies a
// replacement definition of WebPInitUpsamplersMSA in that case.
// NOTE(review): presumably an empty function so that callers still link --
// confirm against the macro's definition in dsp.h.
#if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_MSA))
WEBP_DSP_INIT_STUB(WebPInitUpsamplersMSA)
#endif
689 | |