1 | // Copyright 2016 Google Inc. All Rights Reserved. |
2 | // |
3 | // Use of this source code is governed by a BSD-style license |
4 | // that can be found in the COPYING file in the root of the source |
5 | // tree. An additional intellectual property rights grant can be found |
6 | // in the file PATENTS. All contributing project authors may |
7 | // be found in the AUTHORS file in the root of the source tree. |
8 | // ----------------------------------------------------------------------------- |
9 | // |
10 | // MSA variant of methods for lossless decoder |
11 | // |
12 | // Author: Prashant Patil (prashant.patil@imgtec.com) |
13 | |
14 | #include "./dsp.h" |
15 | |
16 | #if defined(WEBP_USE_MSA) |
17 | |
18 | #include "./lossless.h" |
19 | #include "./msa_macro.h" |
20 | |
21 | //------------------------------------------------------------------------------ |
22 | // Colorspace conversion functions |
23 | |
// Converts 16 four-byte pixels (four 16-byte vectors read from 'psrc') into
// 16 three-byte pixels (three 16-byte vectors written to 'pdst').
// The byte selection and ordering are defined entirely by the shuffle masks:
// 'm0' builds the first output vector from input vectors 0 and 1, 'm1' the
// second from input vectors 1 and 2, and 'm2' the third from input vectors
// 2 and 3.
#define CONVERT16_BGRA_XXX(psrc, pdst, m0, m1, m2) do {            \
  v16u8 bgra0, bgra1, bgra2, bgra3;                                \
  v16u8 pack0, pack1, pack2;                                       \
  LD_UB4(psrc, 16, bgra0, bgra1, bgra2, bgra3);                    \
  VSHF_B2_UB(bgra0, bgra1, bgra1, bgra2, m0, m1, pack0, pack1);    \
  pack2 = VSHF_UB(bgra2, bgra3, m2);                               \
  ST_UB2(pack0, pack1, pdst, 16);                                  \
  ST_UB(pack2, pdst + 32);                                         \
} while (0)
32 | |
// Converts 12 four-byte pixels (three 16-byte vectors read from 'psrc') into
// 12 three-byte pixels (36 bytes written to 'pdst').
// The first two output vectors are stored whole; of the third shuffle result
// only 32-bit element 0 is extracted and stored at 'pdst + 32'.
// 'm0', 'm1' and 'm2' are the shuffle masks selecting the output bytes.
#define CONVERT12_BGRA_XXX(psrc, pdst, m0, m1, m2) do {            \
  v16u8 bgra0, bgra1, bgra2;                                       \
  v16u8 pack0, pack1, pack2;                                       \
  uint32_t last_word;                                              \
  LD_UB3(psrc, 16, bgra0, bgra1, bgra2);                           \
  VSHF_B2_UB(bgra0, bgra1, bgra1, bgra2, m0, m1, pack0, pack1);    \
  pack2 = VSHF_UB(bgra2, bgra2, m2);                               \
  ST_UB2(pack0, pack1, pdst, 16);                                  \
  last_word = __msa_copy_s_w((v4i32)pack2, 0);                     \
  SW(last_word, pdst + 32);                                        \
} while (0)
43 | |
// Converts 8 four-byte pixels (two 16-byte vectors read from 'psrc') into
// 8 three-byte pixels (24 bytes written to 'pdst').
// The first shuffle result is stored whole; of the second only 64-bit
// element 0 is extracted and stored at 'pdst + 16'.
// Only two vectors are loaded, so the second shuffle uses 'src1' for both of
// its inputs (as the 12- and 4-pixel variants do with their last vector)
// instead of reading a third vector that was never loaded. This relies on the
// first eight entries of 'm1' indexing the first input only, which holds for
// the masks passed by the converters in this file.
#define CONVERT8_BGRA_XXX(psrc, pdst, m0, m1) do {                 \
  uint64_t pix_d;                                                  \
  v16u8 src0, src1, dst0, dst1;                                    \
  LD_UB2(psrc, 16, src0, src1);                                    \
  VSHF_B2_UB(src0, src1, src1, src1, m0, m1, dst0, dst1);          \
  ST_UB(dst0, pdst);                                               \
  pix_d = __msa_copy_s_d((v2i64)dst1, 0);                          \
  SD(pix_d, pdst + 16);                                            \
} while (0)
53 | |
// Converts 4 four-byte pixels (one 16-byte vector read from 'psrc') into
// 4 three-byte pixels (12 bytes written to 'pdst') using shuffle mask 'm'.
// The 12 output bytes are taken from the shuffle result as 64-bit element 0
// followed by 32-bit element 2.
#define CONVERT4_BGRA_XXX(psrc, pdst, m) do {                      \
  const v16u8 bgra = LD_UB(psrc);                                  \
  const v16u8 packed = VSHF_UB(bgra, bgra, m);                     \
  const uint64_t lo8 = __msa_copy_s_d((v2i64)packed, 0);           \
  const uint32_t hi4 = __msa_copy_s_w((v4i32)packed, 2);           \
  SD(lo8, pdst + 0);                                               \
  SW(hi4, pdst + 8);                                               \
} while (0)
62 | |
// Copies the first three bytes of the pixel at 'psrc' to 'pdst' in the same
// order; the fourth source byte is not read.
// All three bytes are read before any is written.
#define CONVERT1_BGRA_BGR(psrc, pdst) do {    \
  const int32_t blue = (psrc)[0];             \
  const int32_t green = (psrc)[1];            \
  const int32_t red = (psrc)[2];              \
  (pdst)[2] = red;                            \
  (pdst)[1] = green;                          \
  (pdst)[0] = blue;                           \
} while (0)
71 | |
// Copies the first three bytes of the pixel at 'psrc' to 'pdst' in reversed
// order (source byte 2 first, source byte 0 last); the fourth source byte is
// not read.
// All three bytes are read before any is written, so 'psrc' and 'pdst' may
// refer to the same location.
#define CONVERT1_BGRA_RGB(psrc, pdst) do {    \
  const int32_t blue = (psrc)[0];             \
  const int32_t green = (psrc)[1];            \
  const int32_t red = (psrc)[2];              \
  (pdst)[2] = blue;                           \
  (pdst)[1] = green;                          \
  (pdst)[0] = red;                            \
} while (0)
80 | |
// Applies the inverse color transform to two pixel vectors ('src0', 'src1')
// and assigns the results to 'dst0' and 'dst1'.
// Steps, performed on both vectors in parallel:
//  1. 'mask0' shuffles each source vector into the multiplicand for the first
//     dot product (with the mask passed by TransformColorInverse this picks
//     byte 1 of every pixel).
//  2. That multiplicand is dot-multiplied with 'c0', the 16-bit results are
//     arithmetically shifted right by 5 and added to the source halfwords.
//  3. Every 32-bit lane of the sum is shifted right by 16, dot-multiplied
//     with 'c1', shifted right by 5 and added to the sum from step 2.
//  4. 'mask1' merges bytes of the original vector with bytes of the sum to
//     form the output.
#define TRANSFORM_COLOR_INVERSE_8(src0, src1, dst0, dst1,                  \
                                  c0, c1, mask0, mask1) do {               \
  v8i16 grn0, grn1;                                                        \
  v8i16 sum0, sum1;                                                        \
  v8i16 rterm0, rterm1;                                                    \
  v4i32 hi0, hi1;                                                          \
  VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, grn0, grn1);            \
  DOTP_SB2_SH(grn0, grn1, c0, c0, sum0, sum1);                             \
  SRAI_H2_SH(sum0, sum1, 5);                                               \
  sum0 = __msa_addv_h(sum0, (v8i16)src0);                                  \
  sum1 = __msa_addv_h(sum1, (v8i16)src1);                                  \
  hi0 = __msa_srli_w((v4i32)sum0, 16);                                     \
  hi1 = __msa_srli_w((v4i32)sum1, 16);                                     \
  DOTP_SB2_SH(hi0, hi1, c1, c1, rterm0, rterm1);                           \
  SRAI_H2_SH(rterm0, rterm1, 5);                                           \
  ADD2(sum0, rterm0, sum1, rterm1, sum0, sum1);                            \
  VSHF_B2_UB(src0, sum0, src1, sum1, mask1, mask1, dst0, dst1);            \
} while (0)
97 | |
// Single-vector counterpart of TRANSFORM_COLOR_INVERSE_8: applies the inverse
// color transform to the pixel vector 'src' and assigns the result to 'dst'.
//  1. 'mask0' shuffles 'src' into the multiplicand of the first dot product.
//  2. The dot product with 'c0' is shifted right by 5 (arithmetic) and added
//     to the source halfwords.
//  3. Every 32-bit lane of that sum is shifted right by 16, dot-multiplied
//     with 'c1', shifted right by 5 and added to the sum from step 2.
//  4. 'mask1' merges bytes of 'src' with bytes of the final sum.
#define TRANSFORM_COLOR_INVERSE_4(src, dst, c0, c1, mask0, mask1) do {     \
  const v16i8 grn = VSHF_SB(src, src, mask0);                              \
  const v8i16 gdelta = SRAI_H(__msa_dotp_s_h(c0, grn), 5);                 \
  const v8i16 partial = __msa_addv_h(gdelta, (v8i16)src);                  \
  const v4i32 hi_w = __msa_srli_w((v4i32)partial, 16);                     \
  const v8i16 rdelta = SRAI_H(__msa_dotp_s_h(c1, (v16i8)hi_w), 5);         \
  const v8i16 total = partial + rdelta;                                    \
  dst = VSHF_UB(src, total, mask1);                                        \
} while (0)
111 | |
112 | static void ConvertBGRAToRGBA(const uint32_t* src, |
113 | int num_pixels, uint8_t* dst) { |
114 | int i; |
115 | const uint8_t* ptemp_src = (const uint8_t*)src; |
116 | uint8_t* ptemp_dst = (uint8_t*)dst; |
117 | v16u8 src0, dst0; |
118 | const v16u8 mask = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15 }; |
119 | |
120 | while (num_pixels >= 8) { |
121 | v16u8 src1, dst1; |
122 | LD_UB2(ptemp_src, 16, src0, src1); |
123 | VSHF_B2_UB(src0, src0, src1, src1, mask, mask, dst0, dst1); |
124 | ST_UB2(dst0, dst1, ptemp_dst, 16); |
125 | ptemp_src += 32; |
126 | ptemp_dst += 32; |
127 | num_pixels -= 8; |
128 | } |
129 | if (num_pixels > 0) { |
130 | if (num_pixels >= 4) { |
131 | src0 = LD_UB(ptemp_src); |
132 | dst0 = VSHF_UB(src0, src0, mask); |
133 | ST_UB(dst0, ptemp_dst); |
134 | ptemp_src += 16; |
135 | ptemp_dst += 16; |
136 | num_pixels -= 4; |
137 | } |
138 | for (i = 0; i < num_pixels; i++) { |
139 | const uint8_t b = ptemp_src[2]; |
140 | const uint8_t g = ptemp_src[1]; |
141 | const uint8_t r = ptemp_src[0]; |
142 | const uint8_t a = ptemp_src[3]; |
143 | ptemp_dst[0] = b; |
144 | ptemp_dst[1] = g; |
145 | ptemp_dst[2] = r; |
146 | ptemp_dst[3] = a; |
147 | ptemp_src += 4; |
148 | ptemp_dst += 4; |
149 | } |
150 | } |
151 | } |
152 | |
153 | static void ConvertBGRAToBGR(const uint32_t* src, |
154 | int num_pixels, uint8_t* dst) { |
155 | const uint8_t* ptemp_src = (const uint8_t*)src; |
156 | uint8_t* ptemp_dst = (uint8_t*)dst; |
157 | const v16u8 mask0 = { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, |
158 | 16, 17, 18, 20 }; |
159 | const v16u8 mask1 = { 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20, |
160 | 21, 22, 24, 25 }; |
161 | const v16u8 mask2 = { 10, 12, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25, |
162 | 26, 28, 29, 30 }; |
163 | |
164 | while (num_pixels >= 16) { |
165 | CONVERT16_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1, mask2); |
166 | ptemp_src += 64; |
167 | ptemp_dst += 48; |
168 | num_pixels -= 16; |
169 | } |
170 | if (num_pixels > 0) { |
171 | if (num_pixels >= 12) { |
172 | CONVERT12_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1, mask2); |
173 | ptemp_src += 48; |
174 | ptemp_dst += 36; |
175 | num_pixels -= 12; |
176 | } else if (num_pixels >= 8) { |
177 | CONVERT8_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1); |
178 | ptemp_src += 32; |
179 | ptemp_dst += 24; |
180 | num_pixels -= 8; |
181 | } else if (num_pixels >= 4) { |
182 | CONVERT4_BGRA_XXX(ptemp_src, ptemp_dst, mask0); |
183 | ptemp_src += 16; |
184 | ptemp_dst += 12; |
185 | num_pixels -= 4; |
186 | } |
187 | if (num_pixels == 3) { |
188 | CONVERT1_BGRA_BGR(ptemp_src + 0, ptemp_dst + 0); |
189 | CONVERT1_BGRA_BGR(ptemp_src + 4, ptemp_dst + 3); |
190 | CONVERT1_BGRA_BGR(ptemp_src + 8, ptemp_dst + 6); |
191 | } else if (num_pixels == 2) { |
192 | CONVERT1_BGRA_BGR(ptemp_src + 0, ptemp_dst + 0); |
193 | CONVERT1_BGRA_BGR(ptemp_src + 4, ptemp_dst + 3); |
194 | } else if (num_pixels == 1) { |
195 | CONVERT1_BGRA_BGR(ptemp_src, ptemp_dst); |
196 | } |
197 | } |
198 | } |
199 | |
200 | static void ConvertBGRAToRGB(const uint32_t* src, |
201 | int num_pixels, uint8_t* dst) { |
202 | const uint8_t* ptemp_src = (const uint8_t*)src; |
203 | uint8_t* ptemp_dst = (uint8_t*)dst; |
204 | const v16u8 mask0 = { 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, |
205 | 18, 17, 16, 22 }; |
206 | const v16u8 mask1 = { 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22, |
207 | 21, 20, 26, 25 }; |
208 | const v16u8 mask2 = { 8, 14, 13, 12, 18, 17, 16, 22, 21, 20, 26, 25, |
209 | 24, 30, 29, 28 }; |
210 | |
211 | while (num_pixels >= 16) { |
212 | CONVERT16_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1, mask2); |
213 | ptemp_src += 64; |
214 | ptemp_dst += 48; |
215 | num_pixels -= 16; |
216 | } |
217 | if (num_pixels) { |
218 | if (num_pixels >= 12) { |
219 | CONVERT12_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1, mask2); |
220 | ptemp_src += 48; |
221 | ptemp_dst += 36; |
222 | num_pixels -= 12; |
223 | } else if (num_pixels >= 8) { |
224 | CONVERT8_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1); |
225 | ptemp_src += 32; |
226 | ptemp_dst += 24; |
227 | num_pixels -= 8; |
228 | } else if (num_pixels >= 4) { |
229 | CONVERT4_BGRA_XXX(ptemp_src, ptemp_dst, mask0); |
230 | ptemp_src += 16; |
231 | ptemp_dst += 12; |
232 | num_pixels -= 4; |
233 | } |
234 | if (num_pixels == 3) { |
235 | CONVERT1_BGRA_RGB(ptemp_src + 0, ptemp_dst + 0); |
236 | CONVERT1_BGRA_RGB(ptemp_src + 4, ptemp_dst + 3); |
237 | CONVERT1_BGRA_RGB(ptemp_src + 8, ptemp_dst + 6); |
238 | } else if (num_pixels == 2) { |
239 | CONVERT1_BGRA_RGB(ptemp_src + 0, ptemp_dst + 0); |
240 | CONVERT1_BGRA_RGB(ptemp_src + 4, ptemp_dst + 3); |
241 | } else if (num_pixels == 1) { |
242 | CONVERT1_BGRA_RGB(ptemp_src, ptemp_dst); |
243 | } |
244 | } |
245 | } |
246 | |
247 | static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels, |
248 | uint32_t* dst) { |
249 | int i; |
250 | const uint8_t* in = (const uint8_t*)src; |
251 | uint8_t* out = (uint8_t*)dst; |
252 | v16u8 src0, dst0, tmp0; |
253 | const v16u8 mask = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, |
254 | 13, 255, 13, 255 }; |
255 | |
256 | while (num_pixels >= 8) { |
257 | v16u8 src1, dst1, tmp1; |
258 | LD_UB2(in, 16, src0, src1); |
259 | VSHF_B2_UB(src0, src1, src1, src0, mask, mask, tmp0, tmp1); |
260 | ADD2(src0, tmp0, src1, tmp1, dst0, dst1); |
261 | ST_UB2(dst0, dst1, out, 16); |
262 | in += 32; |
263 | out += 32; |
264 | num_pixels -= 8; |
265 | } |
266 | if (num_pixels > 0) { |
267 | if (num_pixels >= 4) { |
268 | src0 = LD_UB(in); |
269 | tmp0 = VSHF_UB(src0, src0, mask); |
270 | dst0 = src0 + tmp0; |
271 | ST_UB(dst0, out); |
272 | in += 16; |
273 | out += 16; |
274 | num_pixels -= 4; |
275 | } |
276 | for (i = 0; i < num_pixels; i++) { |
277 | const uint8_t b = in[0]; |
278 | const uint8_t g = in[1]; |
279 | const uint8_t r = in[2]; |
280 | out[0] = (b + g) & 0xff; |
281 | out[1] = g; |
282 | out[2] = (r + g) & 0xff; |
283 | out[4] = in[4]; |
284 | out += 4; |
285 | } |
286 | } |
287 | } |
288 | |
289 | static void TransformColorInverse(const VP8LMultipliers* const m, |
290 | const uint32_t* src, int num_pixels, |
291 | uint32_t* dst) { |
292 | v16u8 src0, dst0; |
293 | const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ | |
294 | (m->green_to_red_ << 16)); |
295 | const v16i8 r2b = (v16i8)__msa_fill_w(m->red_to_blue_); |
296 | const v16u8 mask0 = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, |
297 | 13, 255, 13, 255 }; |
298 | const v16u8 mask1 = { 16, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11, |
299 | 28, 13, 30, 15 }; |
300 | |
301 | while (num_pixels >= 8) { |
302 | v16u8 src1, dst1; |
303 | LD_UB2(src, 4, src0, src1); |
304 | TRANSFORM_COLOR_INVERSE_8(src0, src1, dst0, dst1, g2br, r2b, mask0, mask1); |
305 | ST_UB2(dst0, dst1, dst, 4); |
306 | src += 8; |
307 | dst += 8; |
308 | num_pixels -= 8; |
309 | } |
310 | if (num_pixels > 0) { |
311 | if (num_pixels >= 4) { |
312 | src0 = LD_UB(src); |
313 | TRANSFORM_COLOR_INVERSE_4(src0, dst0, g2br, r2b, mask0, mask1); |
314 | ST_UB(dst0, dst); |
315 | src += 4; |
316 | dst += 4; |
317 | num_pixels -= 4; |
318 | } |
319 | if (num_pixels > 0) { |
320 | src0 = LD_UB(src); |
321 | TRANSFORM_COLOR_INVERSE_4(src0, dst0, g2br, r2b, mask0, mask1); |
322 | if (num_pixels == 3) { |
323 | const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0); |
324 | const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 2); |
325 | SD(pix_d, dst + 0); |
326 | SW(pix_w, dst + 2); |
327 | } else if (num_pixels == 2) { |
328 | const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0); |
329 | SD(pix_d, dst); |
330 | } else { |
331 | const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 0); |
332 | SW(pix_w, dst); |
333 | } |
334 | } |
335 | } |
336 | } |
337 | |
338 | //------------------------------------------------------------------------------ |
339 | // Entry point |
340 | |
341 | extern void VP8LDspInitMSA(void); |
342 | |
343 | WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMSA(void) { |
344 | VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA; |
345 | VP8LConvertBGRAToBGR = ConvertBGRAToBGR; |
346 | VP8LConvertBGRAToRGB = ConvertBGRAToRGB; |
347 | VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed; |
348 | VP8LTransformColorInverse = TransformColorInverse; |
349 | } |
350 | |
351 | #else // !WEBP_USE_MSA |
352 | |
353 | WEBP_DSP_INIT_STUB(VP8LDspInitMSA) |
354 | |
355 | #endif // WEBP_USE_MSA |
356 | |