1 | // Copyright 2016 Google Inc. All Rights Reserved. |
2 | // |
3 | // Use of this source code is governed by a BSD-style license |
4 | // that can be found in the COPYING file in the root of the source |
5 | // tree. An additional intellectual property rights grant can be found |
6 | // in the file PATENTS. All contributing project authors may |
7 | // be found in the AUTHORS file in the root of the source tree. |
8 | // ----------------------------------------------------------------------------- |
9 | // |
10 | // MSA version of YUV to RGB upsampling functions. |
11 | // |
12 | // Author: Prashant Patil (prashant.patil@imgtec.com) |
13 | |
14 | #include <string.h> |
15 | #include "./dsp.h" |
16 | |
17 | #if defined(WEBP_USE_MSA) |
18 | |
19 | #include "./msa_macro.h" |
20 | #include "./yuv.h" |
21 | |
22 | #ifdef FANCY_UPSAMPLING |
23 | |
// Expands the "right" (ilvr) half of the bytes of 'in' into two v4u32
// vectors, out0 and out1, by interleaving with 'zero' twice: bytes become
// halfwords, then halfwords become words.
// 'zero' is not a parameter: it is picked up from the expansion site
// (CALC_RGB8 declares it before using this macro).
#define ILVR_UW2(in, out0, out1) do { \
  const v8i16 halfwords = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)in); \
  out0 = (v4u32)__msa_ilvr_h((v8i16)zero, halfwords); \
  out1 = (v4u32)__msa_ilvl_h((v8i16)zero, halfwords); \
} while (0)
29 | |
// Expands all bytes of 'in' into four v4u32 vectors (out0..out3) by
// interleaving with 'zero' at byte and then halfword granularity.
// 'zero' is not a parameter: it is picked up from the expansion site
// (CALC_RGB16 declares it before using this macro).
#define ILVRL_UW4(in, out0, out1, out2, out3) do { \
  v16u8 right_h, left_h; \
  ILVRL_B2_UB(zero, in, right_h, left_h); \
  ILVRL_H2_UW(zero, right_h, out0, out1); \
  ILVRL_H2_UW(zero, left_h, out2, out3); \
} while (0)
36 | |
// Multiplies the four 32-bit vectors in0..in3 by the scalar constant
// (cnst * 256) and packs the odd halfwords of the products into out0
// (from in0/in1) and out1 (from in2/in3).
// Presumably the vector counterpart of the scalar MultHi(v, cnst) used by
// YuvToRgb() in this file -- TODO confirm against MultHi's definition.
#define MULTHI_16(in0, in1, in2, in3, cnst, out0, out1) do { \
  const v4i32 coeff_q8 = (v4i32)__msa_fill_w(cnst * 256); \
  v4u32 prod0, prod1, prod2, prod3; \
  MUL4(in0, coeff_q8, in1, coeff_q8, in2, coeff_q8, in3, coeff_q8, \
       prod0, prod1, prod2, prod3); \
  PCKOD_H2_UH(prod1, prod0, prod3, prod2, out0, out1); \
} while (0)
44 | |
// Two-vector variant of MULTHI_16: multiplies in0 and in1 by (cnst * 256)
// and packs the odd halfwords of both products into the single vector out0.
#define MULTHI_8(in0, in1, cnst, out0) do { \
  const v4i32 coeff_q8 = (v4i32)__msa_fill_w(cnst * 256); \
  v4u32 prod0, prod1; \
  MUL2(in0, coeff_q8, in1, coeff_q8, prod0, prod1); \
  out0 = (v8u16)__msa_pckod_h((v8i16)prod1, (v8i16)prod0); \
} while (0)
51 | |
// Red channel for 16 pixels: (y + v - 14234) >> 6, clipped to [0, 255], with
// saturating signed 16-bit arithmetic. y0/y1 and v0/v1 are the already scaled
// luma and V terms; the packed bytes are written to 'dst'.
// Uses the same constants as the scalar YuvToRgb() in this file.
#define CALC_R16(y0, y1, v0, v1, dst) do { \
  const v8i16 r_bias = (v8i16)__msa_fill_h(14234); \
  const v8i16 sum_lo = __msa_adds_s_h((v8i16)y0, (v8i16)v0); \
  const v8i16 sum_hi = __msa_adds_s_h((v8i16)y1, (v8i16)v1); \
  v8i16 r_lo = __msa_subs_s_h(sum_lo, r_bias); \
  v8i16 r_hi = __msa_subs_s_h(sum_hi, r_bias); \
  SRAI_H2_SH(r_lo, r_hi, 6); \
  CLIP_SH2_0_255(r_lo, r_hi); \
  dst = (v16u8)__msa_pckev_b((v16i8)r_hi, (v16i8)r_lo); \
} while (0)
62 | |
// Red channel for a single vector of halfwords: (y + v - 14234) >> 6, clipped
// to [0, 255]. The same source vector is packed twice into 'dst'.
#define CALC_R8(y0, v0, dst) do { \
  const v8i16 r_bias = (v8i16)__msa_fill_h(14234); \
  const v8i16 sum = __msa_adds_s_h((v8i16)y0, (v8i16)v0); \
  v8i16 red = __msa_subs_s_h(sum, r_bias); \
  red = SRAI_H(red, 6); \
  CLIP_SH_0_255(red); \
  dst = (v16u8)__msa_pckev_b((v16i8)red, (v16i8)red); \
} while (0)
71 | |
// Green channel for 16 pixels: (y - u - v + 8708) >> 6, clipped to [0, 255],
// with saturating signed 16-bit arithmetic. u0/u1 and v0/v1 are the already
// scaled chroma terms; the packed bytes are written to 'dst'.
// Uses the same constants as the scalar YuvToRgb() in this file.
#define CALC_G16(y0, y1, u0, u1, v0, v1, dst) do { \
  const v8i16 g_bias = (v8i16)__msa_fill_h(8708); \
  const v8i16 y_minus_u_lo = __msa_subs_s_h((v8i16)y0, (v8i16)u0); \
  const v8i16 y_minus_u_hi = __msa_subs_s_h((v8i16)y1, (v8i16)u1); \
  const v8i16 diff_lo = __msa_subs_s_h(y_minus_u_lo, (v8i16)v0); \
  const v8i16 diff_hi = __msa_subs_s_h(y_minus_u_hi, (v8i16)v1); \
  v8i16 g_lo = __msa_adds_s_h(diff_lo, g_bias); \
  v8i16 g_hi = __msa_adds_s_h(diff_hi, g_bias); \
  SRAI_H2_SH(g_lo, g_hi, 6); \
  CLIP_SH2_0_255(g_lo, g_hi); \
  dst = (v16u8)__msa_pckev_b((v16i8)g_hi, (v16i8)g_lo); \
} while (0)
84 | |
// Green channel for a single vector of halfwords: (y - u - v + 8708) >> 6,
// clipped to [0, 255]. The same source vector is packed twice into 'dst'.
#define CALC_G8(y0, u0, v0, dst) do { \
  const v8i16 g_bias = (v8i16)__msa_fill_h(8708); \
  const v8i16 y_minus_u = __msa_subs_s_h((v8i16)y0, (v8i16)u0); \
  const v8i16 diff = __msa_subs_s_h(y_minus_u, (v8i16)v0); \
  v8i16 green = __msa_adds_s_h(diff, g_bias); \
  green = SRAI_H(green, 6); \
  CLIP_SH_0_255(green); \
  dst = (v16u8)__msa_pckev_b((v16i8)green, (v16i8)green); \
} while (0)
94 | |
// Blue channel for 16 pixels: (y + u - 17685) >> 6, clipped to [0, 255].
// Unlike CALC_R16/CALC_G16 this uses the unsigned saturating add/subtract
// intrinsics. The packed bytes are written to 'dst'.
// Uses the same constants as the scalar YuvToRgb() in this file.
#define CALC_B16(y0, y1, u0, u1, dst) do { \
  const v8u16 b_bias = (v8u16)__msa_fill_h(17685); \
  const v8u16 sum_lo = __msa_adds_u_h((v8u16)y0, u0); \
  const v8u16 sum_hi = __msa_adds_u_h((v8u16)y1, u1); \
  v8u16 b_lo = __msa_subs_u_h(sum_lo, b_bias); \
  v8u16 b_hi = __msa_subs_u_h(sum_hi, b_bias); \
  SRAI_H2_UH(b_lo, b_hi, 6); \
  CLIP_UH2_0_255(b_lo, b_hi); \
  dst = (v16u8)__msa_pckev_b((v16i8)b_hi, (v16i8)b_lo); \
} while (0)
105 | |
// Blue channel for a single vector of halfwords: (y + u - 17685) >> 6,
// clipped to [0, 255], using the unsigned saturating intrinsics. The same
// source vector is packed twice into 'dst'.
#define CALC_B8(y0, u0, dst) do { \
  const v8u16 b_bias = (v8u16)__msa_fill_h(17685); \
  const v8u16 sum = __msa_adds_u_h((v8u16)y0, u0); \
  v8u16 blue = __msa_subs_u_h(sum, b_bias); \
  blue = SRAI_H(blue, 6); \
  CLIP_UH_0_255(blue); \
  dst = (v16u8)__msa_pckev_b((v16i8)blue, (v16i8)blue); \
} while (0)
114 | |
// Converts 16 pixels: loads one v16u8 vector from each of the pointers y, u
// and v, and writes the resulting channels to the vectors R, G and B.
// The coefficients (19077, 26149, 13320, 6419, 33050) are the ones used by
// the scalar YuvToRgb() in this file.
// Statement order matters: p0..p3 hold the widened V samples for both the
// 26149 and the 13320 products before being overwritten with the widened U
// samples, and v0/v1 and u0/u1 are recomputed between channel computations.
// Declares the local 'zero' that ILVRL_UW4 picks up by name.
#define CALC_RGB16(y, u, v, R, G, B) do { \
  const v16u8 zero = { 0 }; \
  v8u16 y0, y1, u0, u1, v0, v1; \
  v4u32 p0, p1, p2, p3; \
  const v16u8 in_y = LD_UB(y); \
  const v16u8 in_u = LD_UB(u); \
  const v16u8 in_v = LD_UB(v); \
  ILVRL_UW4(in_y, p0, p1, p2, p3); \
  MULTHI_16(p0, p1, p2, p3, 19077, y0, y1); \
  ILVRL_UW4(in_v, p0, p1, p2, p3); \
  MULTHI_16(p0, p1, p2, p3, 26149, v0, v1); \
  CALC_R16(y0, y1, v0, v1, R); \
  MULTHI_16(p0, p1, p2, p3, 13320, v0, v1); \
  ILVRL_UW4(in_u, p0, p1, p2, p3); \
  MULTHI_16(p0, p1, p2, p3, 6419, u0, u1); \
  CALC_G16(y0, y1, u0, u1, v0, v1, G); \
  MULTHI_16(p0, p1, p2, p3, 33050, u0, u1); \
  CALC_B16(y0, y1, u0, u1, B); \
} while (0)
134 | |
// Narrow variant of CALC_RGB16, used by the *Line functions when 8 or fewer
// pixels remain. It still loads a full v16u8 vector from each of y, u and v,
// but only widens one half of each (ILVR_UW2) before converting.
// Same coefficients and same statement-order constraints as CALC_RGB16:
// p0/p1 hold the V samples for two products before being reused for U.
// Declares the local 'zero' that ILVR_UW2 picks up by name.
#define CALC_RGB8(y, u, v, R, G, B) do { \
  const v16u8 zero = { 0 }; \
  v8u16 y0, u0, v0; \
  v4u32 p0, p1; \
  const v16u8 in_y = LD_UB(y); \
  const v16u8 in_u = LD_UB(u); \
  const v16u8 in_v = LD_UB(v); \
  ILVR_UW2(in_y, p0, p1); \
  MULTHI_8(p0, p1, 19077, y0); \
  ILVR_UW2(in_v, p0, p1); \
  MULTHI_8(p0, p1, 26149, v0); \
  CALC_R8(y0, v0, R); \
  MULTHI_8(p0, p1, 13320, v0); \
  ILVR_UW2(in_u, p0, p1); \
  MULTHI_8(p0, p1, 6419, u0); \
  CALC_G8(y0, u0, v0, G); \
  MULTHI_8(p0, p1, 33050, u0); \
  CALC_B8(y0, u0, B); \
} while (0)
154 | |
// Stores three channel vectors as 3-byte-per-pixel output: writes three
// v16u8 vectors (48 bytes) at dst + 0, dst + 16 and dst + 32.
// a0 and a1 are byte-interleaved first; the three shuffle masks then merge
// bytes of a2 into every third output position (mask indices >= 16 select
// from a2 -- presumably per VSHF_UB's operand convention, confirm in
// msa_macro.h).
// Callers pass (R, G, B) or (B, G, R) to choose the channel order.
#define STORE16_3(a0, a1, a2, dst) do { \
  const v16u8 mask0 = { 0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, \
                        8, 9, 20, 10 }; \
  const v16u8 mask1 = { 0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, \
                        8, 25, 9, 10 }; \
  const v16u8 mask2 = { 26, 0, 1, 27, 2, 3, 28, 4, 5, 29, 6, 7, \
                        30, 8, 9, 31 }; \
  v16u8 out0, out1, out2, tmp0, tmp1, tmp2; \
  ILVRL_B2_UB(a1, a0, tmp0, tmp1); \
  out0 = VSHF_UB(tmp0, a2, mask0); \
  tmp2 = SLDI_UB(tmp1, tmp0, 11); \
  out1 = VSHF_UB(tmp2, a2, mask1); \
  tmp2 = SLDI_UB(tmp1, tmp1, 6); \
  out2 = VSHF_UB(tmp2, a2, mask2); \
  ST_UB(out0, dst + 0); \
  ST_UB(out1, dst + 16); \
  ST_UB(out2, dst + 32); \
} while (0)
173 | |
// 8-pixel variant of STORE16_3: writes 24 bytes to dst, as one v16u8 vector
// at dst followed by one 64-bit word at dst + 16.
// Only the first eight entries of mask1 matter, because just doubleword 0 of
// out1 is extracted and stored; the 255 entries fill lanes that are never
// written out.
#define STORE8_3(a0, a1, a2, dst) do { \
  int64_t out_m; \
  const v16u8 mask0 = { 0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, \
                        8, 9, 20, 10 }; \
  const v16u8 mask1 = { 11, 21, 12, 13, 22, 14, 15, 23, \
                        255, 255, 255, 255, 255, 255, 255, 255 }; \
  const v16u8 tmp0 = (v16u8)__msa_ilvr_b((v16i8)a1, (v16i8)a0); \
  v16u8 out0, out1; \
  VSHF_B2_UB(tmp0, a2, tmp0, a2, mask0, mask1, out0, out1); \
  ST_UB(out0, dst); \
  out_m = __msa_copy_s_d((v2i64)out1, 0); \
  SD(out_m, dst + 16); \
} while (0)
187 | |
// Stores four channel vectors as 4-byte-per-pixel output: a0/a1 and a2/a3
// are byte-interleaved into pairs, the pairs are halfword-interleaved into
// quads, and the four resulting vectors (64 bytes) are written at
// dst + 0, + 16, + 32 and + 48.
// Callers choose the channel order through the argument order.
#define STORE16_4(a0, a1, a2, a3, dst) do { \
  v16u8 pair01_r, pair01_l, pair23_r, pair23_l; \
  v16u8 quad0, quad1, quad2, quad3; \
  ILVRL_B2_UB(a1, a0, pair01_r, pair01_l); \
  ILVRL_B2_UB(a3, a2, pair23_r, pair23_l); \
  ILVRL_H2_UB(pair23_r, pair01_r, quad0, quad1); \
  ILVRL_H2_UB(pair23_l, pair01_l, quad2, quad3); \
  ST_UB(quad0, dst + 0); \
  ST_UB(quad1, dst + 16); \
  ST_UB(quad2, dst + 32); \
  ST_UB(quad3, dst + 48); \
} while (0)
200 | |
// 8-pixel variant of STORE16_4: interleaves only the right halves of the
// four channel vectors and writes two vectors (32 bytes) at dst + 0 and
// dst + 16.
#define STORE8_4(a0, a1, a2, a3, dst) do { \
  v16u8 pair01, pair23, quad_r, quad_l; \
  ILVR_B2_UB(a1, a0, a3, a2, pair01, pair23); \
  ILVRL_H2_UB(pair23, pair01, quad_r, quad_l); \
  ST_UB(quad_r, dst + 0); \
  ST_UB(quad_l, dst + 16); \
} while (0)
208 | |
// Stores two byte vectors as 2-byte-per-pixel output: a0 and a1 are
// byte-interleaved and the two resulting vectors (32 bytes) are written at
// dst + 0 and dst + 16. Used by the 16-bit colorspace macros below.
#define STORE2_16(a0, a1, dst) do { \
  v16u8 pairs_r, pairs_l; \
  ILVRL_B2_UB(a1, a0, pairs_r, pairs_l); \
  ST_UB(pairs_r, dst + 0); \
  ST_UB(pairs_l, dst + 16); \
} while (0)
215 | |
// 8-pixel variant of STORE2_16: byte-interleaves the right halves of a0 and
// a1 and writes the single resulting vector (16 bytes) at dst.
#define STORE2_8(a0, a1, dst) do { \
  const v16u8 pairs = (v16u8)__msa_ilvr_b((v16i8)a1, (v16i8)a0); \
  ST_UB(pairs, dst); \
} while (0)
220 | |
// Converts N (16 or 8) pixels to RGBA4444 and stores them at dst.
// Vector counterpart of the scalar YuvToRgba4444():
//   rg = (r & 0xf0) | (g >> 4);   ba = (b & 0xf0) | 0x0f;
// out0/out1 select which of RG/BA is stored first; callers swap them under
// WEBP_SWAP_16BIT_CSP.
// The variables R, G, B, RG, BA, tmp0 and tmp1 are not parameters: they must
// be declared as v16u8 at the expansion site.
// The green shift uses the explicit logical-shift intrinsic. G holds unsigned
// bytes, so an arithmetic shift (which SRAI_B's name suggests) would
// sign-extend values >= 0x80 and corrupt the red nibble once OR-ed in.
#define CALC_RGBA4444(y, u, v, out0, out1, N, dst) do { \
  CALC_RGB##N(y, u, v, R, G, B); \
  tmp0 = ANDI_B(R, 0xf0); \
  tmp1 = (v16u8)__msa_srli_b((v16i8)G, 4); \
  RG = tmp0 | tmp1; \
  tmp0 = ANDI_B(B, 0xf0); \
  BA = ORI_B(tmp0, 0x0f); \
  STORE2_##N(out0, out1, dst); \
} while (0)
230 | |
// Converts N (16 or 8) pixels to RGB565 and stores them at dst.
// Vector counterpart of the scalar YuvToRgb565():
//   rg = (r & 0xf8) | (g >> 5);   gb = ((g << 3) & 0xe0) | (b >> 3);
// out0/out1 select which of RG/GB is stored first; callers swap them under
// WEBP_SWAP_16BIT_CSP.
// The variables R, G, B, RG, GB, tmp0 and tmp1 are not parameters: they must
// be declared as v16u8 at the expansion site.
// Both right shifts use the explicit logical-shift intrinsic. G and B hold
// unsigned bytes, so an arithmetic shift (which SRAI_B's name suggests) would
// sign-extend values >= 0x80 and set bits that belong to the other channel.
#define CALC_RGB565(y, u, v, out0, out1, N, dst) do { \
  CALC_RGB##N(y, u, v, R, G, B); \
  tmp0 = ANDI_B(R, 0xf8); \
  tmp1 = (v16u8)__msa_srli_b((v16i8)G, 5); \
  RG = tmp0 | tmp1; \
  tmp0 = SLLI_B(G, 3); \
  tmp1 = ANDI_B(tmp0, 0xe0); \
  tmp0 = (v16u8)__msa_srli_b((v16i8)B, 3); \
  GB = tmp0 | tmp1; \
  STORE2_##N(out0, out1, dst); \
} while (0)
242 | |
243 | static WEBP_INLINE int Clip8(int v) { |
244 | return v < 0 ? 0 : v > 255 ? 255 : v; |
245 | } |
246 | |
// Scalar conversion of one pixel; writes three bytes to rgb in R, G, B order.
// Each channel is a fixed-point sum of MultHi() terms and a bias, shifted
// right by 6 and clamped to [0, 255].
static void YuvToRgb(int y, int u, int v, uint8_t* const rgb) {
  const int luma = MultHi(y, 19077);
  const int red = luma + MultHi(v, 26149) - 14234;
  const int green = luma - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
  const int blue = luma + MultHi(u, 33050) - 17685;
  rgb[0] = Clip8(red >> 6);
  rgb[1] = Clip8(green >> 6);
  rgb[2] = Clip8(blue >> 6);
}
256 | |
// Scalar conversion of one pixel; same arithmetic as YuvToRgb() but the
// three bytes are written to bgr in B, G, R order.
static void YuvToBgr(int y, int u, int v, uint8_t* const bgr) {
  const int luma = MultHi(y, 19077);
  const int red = luma + MultHi(v, 26149) - 14234;
  const int green = luma - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
  const int blue = luma + MultHi(u, 33050) - 17685;
  bgr[0] = Clip8(blue >> 6);
  bgr[1] = Clip8(green >> 6);
  bgr[2] = Clip8(red >> 6);
}
266 | |
// Scalar conversion of one pixel to RGB565; writes two bytes to rgb.
// The first byte carries the 5 red bits and the top 3 green bits, the second
// the next 3 green bits and the 5 blue bits; WEBP_SWAP_16BIT_CSP swaps the
// two bytes.
static void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
  const int luma = MultHi(y, 19077);
  const int red = Clip8((luma + MultHi(v, 26149) - 14234) >> 6);
  const int green =
      Clip8((luma - MultHi(u, 6419) - MultHi(v, 13320) + 8708) >> 6);
  const int blue = Clip8((luma + MultHi(u, 33050) - 17685) >> 6);
  const int hi_byte = (red & 0xf8) | (green >> 5);
  const int lo_byte = ((green << 3) & 0xe0) | (blue >> 3);
#ifdef WEBP_SWAP_16BIT_CSP
  rgb[0] = lo_byte;
  rgb[1] = hi_byte;
#else
  rgb[0] = hi_byte;
  rgb[1] = lo_byte;
#endif
}
285 | |
// Scalar conversion of one pixel to RGBA4444; writes two bytes to argb.
// The first byte carries the red and green nibbles, the second the blue
// nibble with the low 4 bits forced to 0xf; WEBP_SWAP_16BIT_CSP swaps the
// two bytes.
static void YuvToRgba4444(int y, int u, int v, uint8_t* const argb) {
  const int luma = MultHi(y, 19077);
  const int red = Clip8((luma + MultHi(v, 26149) - 14234) >> 6);
  const int green =
      Clip8((luma - MultHi(u, 6419) - MultHi(v, 13320) + 8708) >> 6);
  const int blue = Clip8((luma + MultHi(u, 33050) - 17685) >> 6);
  const int red_green = (red & 0xf0) | (green >> 4);
  const int blue_alpha = (blue & 0xf0) | 0x0f;  // low nibble is overwritten
#ifdef WEBP_SWAP_16BIT_CSP
  argb[0] = blue_alpha;
  argb[1] = red_green;
#else
  argb[0] = red_green;
  argb[1] = blue_alpha;
#endif
}
304 | |
// Scalar conversion of one pixel to four bytes: 0xff followed by R, G, B.
static void YuvToArgb(uint8_t y, uint8_t u, uint8_t v, uint8_t* const argb) {
  uint8_t* const rgb = argb + 1;
  argb[0] = 0xff;
  YuvToRgb(y, u, v, rgb);
}
309 | |
// Scalar conversion of one pixel to four bytes: B, G, R followed by 0xff.
static void YuvToBgra(uint8_t y, uint8_t u, uint8_t v, uint8_t* const bgra) {
  YuvToBgr(y, u, v, &bgra[0]);
  bgra[3] = 0xff;
}
314 | |
// Scalar conversion of one pixel to four bytes: R, G, B followed by 0xff.
static void YuvToRgba(uint8_t y, uint8_t u, uint8_t v, uint8_t* const rgba) {
  YuvToRgb(y, u, v, &rgba[0]);
  rgba[3] = 0xff;
}
319 | |
320 | static void YuvToRgbLine(const uint8_t* y, const uint8_t* u, |
321 | const uint8_t* v, uint8_t* dst, int length) { |
322 | v16u8 R, G, B; |
323 | while (length >= 16) { |
324 | CALC_RGB16(y, u, v, R, G, B); |
325 | STORE16_3(R, G, B, dst); |
326 | y += 16; |
327 | u += 16; |
328 | v += 16; |
329 | dst += 16 * 3; |
330 | length -= 16; |
331 | } |
332 | if (length > 8) { |
333 | uint8_t temp[3 * 16] = { 0 }; |
334 | memcpy(temp, y, length * sizeof(*temp)); |
335 | CALC_RGB16(temp, u, v, R, G, B); |
336 | STORE16_3(R, G, B, temp); |
337 | memcpy(dst, temp, length * 3 * sizeof(*dst)); |
338 | } else if (length > 0) { |
339 | uint8_t temp[3 * 8] = { 0 }; |
340 | memcpy(temp, y, length * sizeof(*temp)); |
341 | CALC_RGB8(temp, u, v, R, G, B); |
342 | STORE8_3(R, G, B, temp); |
343 | memcpy(dst, temp, length * 3 * sizeof(*dst)); |
344 | } |
345 | } |
346 | |
347 | static void YuvToBgrLine(const uint8_t* y, const uint8_t* u, |
348 | const uint8_t* v, uint8_t* dst, int length) { |
349 | v16u8 R, G, B; |
350 | while (length >= 16) { |
351 | CALC_RGB16(y, u, v, R, G, B); |
352 | STORE16_3(B, G, R, dst); |
353 | y += 16; |
354 | u += 16; |
355 | v += 16; |
356 | dst += 16 * 3; |
357 | length -= 16; |
358 | } |
359 | if (length > 8) { |
360 | uint8_t temp[3 * 16] = { 0 }; |
361 | memcpy(temp, y, length * sizeof(*temp)); |
362 | CALC_RGB16(temp, u, v, R, G, B); |
363 | STORE16_3(B, G, R, temp); |
364 | memcpy(dst, temp, length * 3 * sizeof(*dst)); |
365 | } else if (length > 0) { |
366 | uint8_t temp[3 * 8] = { 0 }; |
367 | memcpy(temp, y, length * sizeof(*temp)); |
368 | CALC_RGB8(temp, u, v, R, G, B); |
369 | STORE8_3(B, G, R, temp); |
370 | memcpy(dst, temp, length * 3 * sizeof(*dst)); |
371 | } |
372 | } |
373 | |
374 | static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u, |
375 | const uint8_t* v, uint8_t* dst, int length) { |
376 | v16u8 R, G, B; |
377 | const v16u8 A = (v16u8)__msa_ldi_b(0xff); |
378 | while (length >= 16) { |
379 | CALC_RGB16(y, u, v, R, G, B); |
380 | STORE16_4(R, G, B, A, dst); |
381 | y += 16; |
382 | u += 16; |
383 | v += 16; |
384 | dst += 16 * 4; |
385 | length -= 16; |
386 | } |
387 | if (length > 8) { |
388 | uint8_t temp[4 * 16] = { 0 }; |
389 | memcpy(temp, y, length * sizeof(*temp)); |
390 | CALC_RGB16(&temp[0], u, v, R, G, B); |
391 | STORE16_4(R, G, B, A, temp); |
392 | memcpy(dst, temp, length * 4 * sizeof(*dst)); |
393 | } else if (length > 0) { |
394 | uint8_t temp[4 * 8] = { 0 }; |
395 | memcpy(temp, y, length * sizeof(*temp)); |
396 | CALC_RGB8(temp, u, v, R, G, B); |
397 | STORE8_4(R, G, B, A, temp); |
398 | memcpy(dst, temp, length * 4 * sizeof(*dst)); |
399 | } |
400 | } |
401 | |
402 | static void YuvToBgraLine(const uint8_t* y, const uint8_t* u, |
403 | const uint8_t* v, uint8_t* dst, int length) { |
404 | v16u8 R, G, B; |
405 | const v16u8 A = (v16u8)__msa_ldi_b(0xff); |
406 | while (length >= 16) { |
407 | CALC_RGB16(y, u, v, R, G, B); |
408 | STORE16_4(B, G, R, A, dst); |
409 | y += 16; |
410 | u += 16; |
411 | v += 16; |
412 | dst += 16 * 4; |
413 | length -= 16; |
414 | } |
415 | if (length > 8) { |
416 | uint8_t temp[4 * 16] = { 0 }; |
417 | memcpy(temp, y, length * sizeof(*temp)); |
418 | CALC_RGB16(temp, u, v, R, G, B); |
419 | STORE16_4(B, G, R, A, temp); |
420 | memcpy(dst, temp, length * 4 * sizeof(*dst)); |
421 | } else if (length > 0) { |
422 | uint8_t temp[4 * 8] = { 0 }; |
423 | memcpy(temp, y, length * sizeof(*temp)); |
424 | CALC_RGB8(temp, u, v, R, G, B); |
425 | STORE8_4(B, G, R, A, temp); |
426 | memcpy(dst, temp, length * 4 * sizeof(*dst)); |
427 | } |
428 | } |
429 | |
430 | static void YuvToArgbLine(const uint8_t* y, const uint8_t* u, |
431 | const uint8_t* v, uint8_t* dst, int length) { |
432 | v16u8 R, G, B; |
433 | const v16u8 A = (v16u8)__msa_ldi_b(0xff); |
434 | while (length >= 16) { |
435 | CALC_RGB16(y, u, v, R, G, B); |
436 | STORE16_4(A, R, G, B, dst); |
437 | y += 16; |
438 | u += 16; |
439 | v += 16; |
440 | dst += 16 * 4; |
441 | length -= 16; |
442 | } |
443 | if (length > 8) { |
444 | uint8_t temp[4 * 16] = { 0 }; |
445 | memcpy(temp, y, length * sizeof(*temp)); |
446 | CALC_RGB16(temp, u, v, R, G, B); |
447 | STORE16_4(A, R, G, B, temp); |
448 | memcpy(dst, temp, length * 4 * sizeof(*dst)); |
449 | } else if (length > 0) { |
450 | uint8_t temp[4 * 8] = { 0 }; |
451 | memcpy(temp, y, length * sizeof(*temp)); |
452 | CALC_RGB8(temp, u, v, R, G, B); |
453 | STORE8_4(A, R, G, B, temp); |
454 | memcpy(dst, temp, length * 4 * sizeof(*dst)); |
455 | } |
456 | } |
457 | |
458 | static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u, |
459 | const uint8_t* v, uint8_t* dst, int length) { |
460 | v16u8 R, G, B, RG, BA, tmp0, tmp1; |
461 | while (length >= 16) { |
462 | #ifdef WEBP_SWAP_16BIT_CSP |
463 | CALC_RGBA4444(y, u, v, BA, RG, 16, dst); |
464 | #else |
465 | CALC_RGBA4444(y, u, v, RG, BA, 16, dst); |
466 | #endif |
467 | y += 16; |
468 | u += 16; |
469 | v += 16; |
470 | dst += 16 * 2; |
471 | length -= 16; |
472 | } |
473 | if (length > 8) { |
474 | uint8_t temp[2 * 16] = { 0 }; |
475 | memcpy(temp, y, length * sizeof(*temp)); |
476 | #ifdef WEBP_SWAP_16BIT_CSP |
477 | CALC_RGBA4444(temp, u, v, BA, RG, 16, temp); |
478 | #else |
479 | CALC_RGBA4444(temp, u, v, RG, BA, 16, temp); |
480 | #endif |
481 | memcpy(dst, temp, length * 2 * sizeof(*dst)); |
482 | } else if (length > 0) { |
483 | uint8_t temp[2 * 8] = { 0 }; |
484 | memcpy(temp, y, length * sizeof(*temp)); |
485 | #ifdef WEBP_SWAP_16BIT_CSP |
486 | CALC_RGBA4444(temp, u, v, BA, RG, 8, temp); |
487 | #else |
488 | CALC_RGBA4444(temp, u, v, RG, BA, 8, temp); |
489 | #endif |
490 | memcpy(dst, temp, length * 2 * sizeof(*dst)); |
491 | } |
492 | } |
493 | |
494 | static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u, |
495 | const uint8_t* v, uint8_t* dst, int length) { |
496 | v16u8 R, G, B, RG, GB, tmp0, tmp1; |
497 | while (length >= 16) { |
498 | #ifdef WEBP_SWAP_16BIT_CSP |
499 | CALC_RGB565(y, u, v, GB, RG, 16, dst); |
500 | #else |
501 | CALC_RGB565(y, u, v, RG, GB, 16, dst); |
502 | #endif |
503 | y += 16; |
504 | u += 16; |
505 | v += 16; |
506 | dst += 16 * 2; |
507 | length -= 16; |
508 | } |
509 | if (length > 8) { |
510 | uint8_t temp[2 * 16] = { 0 }; |
511 | memcpy(temp, y, length * sizeof(*temp)); |
512 | #ifdef WEBP_SWAP_16BIT_CSP |
513 | CALC_RGB565(temp, u, v, GB, RG, 16, temp); |
514 | #else |
515 | CALC_RGB565(temp, u, v, RG, GB, 16, temp); |
516 | #endif |
517 | memcpy(dst, temp, length * 2 * sizeof(*dst)); |
518 | } else if (length > 0) { |
519 | uint8_t temp[2 * 8] = { 0 }; |
520 | memcpy(temp, y, length * sizeof(*temp)); |
521 | #ifdef WEBP_SWAP_16BIT_CSP |
522 | CALC_RGB565(temp, u, v, GB, RG, 8, temp); |
523 | #else |
524 | CALC_RGB565(temp, u, v, RG, GB, 8, temp); |
525 | #endif |
526 | memcpy(dst, temp, length * 2 * sizeof(*dst)); |
527 | } |
528 | } |
529 | |
// Upsamples one chroma plane for 32 output pixels, in place.
// As used by UPSAMPLE_FUNC, the inputs are four v16u8 vectors:
//   a = top row samples,     b = top row samples shifted by one,
//   c = current row samples, d = current row samples shifted by one.
// On exit a and b hold the interleaved results for the top output row; c and
// d hold those for the bottom row, but only when pbot_y != NULL.
// Everything is done with byte-wide rounding averages (__msa_aver_u_b) plus
// 1-bit corrections derived from XORs of the inputs; diag1 and diag2 are the
// two diagonal blends shared by both output rows.
// NOTE(review): presumably this computes (9*a + 3*b + 3*c + d + 8) >> 4 per
// sample, as the other libwebp fancy upsamplers do -- confirm before relying
// on exact rounding.
// NOTE: 'pbot_y' is not a parameter; it is picked up from the enclosing
// function generated by UPSAMPLE_FUNC.
#define UPSAMPLE_32PIXELS(a, b, c, d) do { \
  v16u8 s = __msa_aver_u_b(a, d); \
  v16u8 t = __msa_aver_u_b(b, c); \
  const v16u8 st = s ^ t; \
  v16u8 ad = a ^ d; \
  v16u8 bc = b ^ c; \
  v16u8 t0 = ad | bc; \
  v16u8 t1 = t0 | st; \
  v16u8 t2 = ANDI_B(t1, 1); \
  v16u8 t3 = __msa_aver_u_b(s, t); \
  const v16u8 k = t3 - t2; \
  v16u8 diag1, diag2; \
  AVER_UB2_UB(t, k, s, k, t0, t1); \
  bc = bc & st; \
  ad = ad & st; \
  t = t ^ k; \
  s = s ^ k; \
  t2 = bc | t; \
  t3 = ad | s; \
  t2 = ANDI_B(t2, 1); \
  t3 = ANDI_B(t3, 1); \
  SUB2(t0, t2, t1, t3, diag1, diag2); \
  AVER_UB2_UB(a, diag1, b, diag2, t0, t1); \
  ILVRL_B2_UB(t1, t0, a, b); \
  if (pbot_y != NULL) { \
    AVER_UB2_UB(c, diag2, d, diag1, t0, t1); \
    ILVRL_B2_UB(t1, t0, c, d); \
  } \
} while (0)
559 | |
// Generates FUNC_NAME(), which upsamples the chroma of two luma rows and
// converts them to the output format of FUNC / FUNC##Line, with XSTEP bytes
// per output pixel.
//   top_y, bot_y     : luma rows; bot_y may be NULL, in which case nothing is
//                      written for the bottom row.
//   top_u, top_v     : chroma row blended at weight 3 for the top output row.
//   cur_u, cur_v     : chroma row blended at weight 3 for the bottom row.
//   top_dst, bot_dst : output rows.
//   len              : number of pixels per row.
// Structure:
//   1. The first pixel of each row is converted with the scalar FUNC, using
//      (3 * near + far + 2) >> 2 on u (low 16 bits) and v (high 16 bits)
//      packed in one 32-bit word.
//   2. (len - 1) >> 1 chroma pairs remain; each group of 16 pairs yields 32
//      pixels through UPSAMPLE_32PIXELS and FUNC##Line.
//   3. A tail of 1..15 pairs is staged through temp_u / temp_v. Only the
//      size + 1 samples that feed the remaining 2 * size pixels are copied;
//      the rest of the scratch is zeroed, so the chroma rows are never read
//      past what the conversion needs.
//   4. For even len, the last pixel is converted with the scalar FUNC.
// NOTE(review): pbot_y and pbot_dst are formed from bot_y / bot_dst before
// the NULL check, and UPSAMPLE_32PIXELS tests pbot_y (not bot_y) against
// NULL.
#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y, \
                      const uint8_t* top_u, const uint8_t* top_v, \
                      const uint8_t* cur_u, const uint8_t* cur_v, \
                      uint8_t* top_dst, uint8_t* bot_dst, int len) \
{ \
  int size = (len - 1) >> 1; \
  uint8_t temp_u[64]; \
  uint8_t temp_v[64]; \
  const uint32_t tl_uv = ((top_u[0]) | ((top_v[0]) << 16)); \
  const uint32_t l_uv = ((cur_u[0]) | ((cur_v[0]) << 16)); \
  const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
  const uint8_t* ptop_y = &top_y[1]; \
  uint8_t *ptop_dst = top_dst + XSTEP; \
  const uint8_t* pbot_y = &bot_y[1]; \
  uint8_t *pbot_dst = bot_dst + XSTEP; \
 \
  FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst); \
  if (bot_y != NULL) { \
    const uint32_t uv1 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
    FUNC(bot_y[0], uv1 & 0xff, (uv1 >> 16), bot_dst); \
  } \
  while (size >= 16) { \
    v16u8 tu0, tu1, tv0, tv1, cu0, cu1, cv0, cv1; \
    LD_UB2(top_u, 1, tu0, tu1); \
    LD_UB2(cur_u, 1, cu0, cu1); \
    LD_UB2(top_v, 1, tv0, tv1); \
    LD_UB2(cur_v, 1, cv0, cv1); \
    UPSAMPLE_32PIXELS(tu0, tu1, cu0, cu1); \
    UPSAMPLE_32PIXELS(tv0, tv1, cv0, cv1); \
    ST_UB4(tu0, tu1, cu0, cu1, &temp_u[0], 16); \
    ST_UB4(tv0, tv1, cv0, cv1, &temp_v[0], 16); \
    FUNC##Line(ptop_y, &temp_u[ 0], &temp_v[0], ptop_dst, 32); \
    if (bot_y != NULL) { \
      FUNC##Line(pbot_y, &temp_u[32], &temp_v[32], pbot_dst, 32); \
    } \
    ptop_y += 32; \
    pbot_y += 32; \
    ptop_dst += XSTEP * 32; \
    pbot_dst += XSTEP * 32; \
    top_u += 16; \
    top_v += 16; \
    cur_u += 16; \
    cur_v += 16; \
    size -= 16; \
  } \
  if (size > 0) { \
    v16u8 tu0, tu1, tv0, tv1, cu0, cu1, cv0, cv1; \
    /* 2 * size output pixels depend on chroma samples 0..size only. */ \
    const size_t num_uv = (size_t)size + 1; \
    memset(temp_u, 0, sizeof(temp_u)); \
    memset(temp_v, 0, sizeof(temp_v)); \
    memcpy(&temp_u[ 0], top_u, num_uv * sizeof(uint8_t)); \
    memcpy(&temp_u[32], cur_u, num_uv * sizeof(uint8_t)); \
    memcpy(&temp_v[ 0], top_v, num_uv * sizeof(uint8_t)); \
    memcpy(&temp_v[32], cur_v, num_uv * sizeof(uint8_t)); \
    LD_UB2(&temp_u[ 0], 1, tu0, tu1); \
    LD_UB2(&temp_u[32], 1, cu0, cu1); \
    LD_UB2(&temp_v[ 0], 1, tv0, tv1); \
    LD_UB2(&temp_v[32], 1, cv0, cv1); \
    UPSAMPLE_32PIXELS(tu0, tu1, cu0, cu1); \
    UPSAMPLE_32PIXELS(tv0, tv1, cv0, cv1); \
    ST_UB4(tu0, tu1, cu0, cu1, &temp_u[0], 16); \
    ST_UB4(tv0, tv1, cv0, cv1, &temp_v[0], 16); \
    FUNC##Line(ptop_y, &temp_u[ 0], &temp_v[0], ptop_dst, size * 2); \
    if (bot_y != NULL) { \
      FUNC##Line(pbot_y, &temp_u[32], &temp_v[32], pbot_dst, size * 2); \
    } \
    top_u += size; \
    top_v += size; \
    cur_u += size; \
    cur_v += size; \
  } \
  if (!(len & 1)) { \
    const uint32_t t0 = ((top_u[0]) | ((top_v[0]) << 16)); \
    const uint32_t c0 = ((cur_u[0]) | ((cur_v[0]) << 16)); \
    const uint32_t tmp0 = (3 * t0 + c0 + 0x00020002u) >> 2; \
    FUNC(top_y[len - 1], tmp0 & 0xff, (tmp0 >> 16), \
         top_dst + (len - 1) * XSTEP); \
    if (bot_y != NULL) { \
      const uint32_t tmp1 = (3 * c0 + t0 + 0x00020002u) >> 2; \
      FUNC(bot_y[len - 1], tmp1 & 0xff, (tmp1 >> 16), \
           bot_dst + (len - 1) * XSTEP); \
    } \
  } \
}
642 | |
// One line-pair upsampler per output format. The second argument names both
// the scalar per-pixel converter (FUNC) and, with "Line" appended, the vector
// row converter; the third is the number of output bytes per pixel.
UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3)
UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3)
UPSAMPLE_FUNC(UpsampleRgbaLinePair, YuvToRgba, 4)
UPSAMPLE_FUNC(UpsampleBgraLinePair, YuvToBgra, 4)
UPSAMPLE_FUNC(UpsampleArgbLinePair, YuvToArgb, 4)
UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2)
UPSAMPLE_FUNC(UpsampleRgb565LinePair, YuvToRgb565, 2)
650 | |
651 | //------------------------------------------------------------------------------ |
652 | // Entry point |
653 | |
654 | extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */]; |
655 | |
656 | extern void WebPInitUpsamplersMSA(void); |
657 | |
658 | WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMSA(void) { |
659 | WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair; |
660 | WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair; |
661 | WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair; |
662 | WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair; |
663 | WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair; |
664 | WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair; |
665 | WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair; |
666 | WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair; |
667 | WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair; |
668 | WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair; |
669 | WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair; |
670 | } |
671 | |
672 | #endif // FANCY_UPSAMPLING |
673 | |
674 | #endif // WEBP_USE_MSA |
675 | |
// When either FANCY_UPSAMPLING or WEBP_USE_MSA is not defined, none of the
// code above is compiled; WEBP_DSP_INIT_STUB is expanded instead so that
// WebPInitUpsamplersMSA still exists (presumably as an empty function --
// see the macro's definition in dsp.h).
#if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_MSA))
WEBP_DSP_INIT_STUB(WebPInitUpsamplersMSA)
#endif
679 | |