1 | // Copyright 2014 Google Inc. All Rights Reserved. |
2 | // |
3 | // Use of this source code is governed by a BSD-style license |
4 | // that can be found in the COPYING file in the root of the source |
5 | // tree. An additional intellectual property rights grant can be found |
6 | // in the file PATENTS. All contributing project authors may |
7 | // be found in the AUTHORS file in the root of the source tree. |
8 | // ----------------------------------------------------------------------------- |
9 | // |
10 | // MIPS version of dsp functions |
11 | // |
12 | // Author(s): Djordje Pesut (djordje.pesut@imgtec.com) |
13 | // Jovan Zelincevic (jovan.zelincevic@imgtec.com) |
14 | |
15 | #include "src/dsp/dsp.h" |
16 | |
17 | #if defined(WEBP_USE_MIPS32) |
18 | |
19 | #include "src/dsp/mips_macro.h" |
20 | |
// Fixed-point multipliers used by TransformOne(): every product with one of
// these is followed by an arithmetic shift right by 16, so the effective
// factors are kC1 / 65536 (about 1.3066) and kC2 / 65536 (about 0.5412).
// kC1 is spelled as "one (1 << 16) plus a fractional part".
static const int kC2 = 35468;
static const int kC1 = (1 << 16) + 20091;
23 | |
// Branch-free absolute value.
// 'mask' is 0 for non-negative x and all ones (-1) for negative x (this relies
// on '>>' being an arithmetic shift on a 32-bit int). Adding the mask and then
// xor-ing with it leaves x untouched in the first case and negates it in the
// second.
static WEBP_INLINE int abs_mips32(int x) {
  const int mask = x >> 31;
  return (x + mask) ^ mask;
}
28 | |
29 | // 4 pixels in, 2 pixels out |
30 | static WEBP_INLINE void do_filter2(uint8_t* p, int step) { |
31 | const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step]; |
32 | const int a = 3 * (q0 - p0) + VP8ksclip1[p1 - q1]; |
33 | const int a1 = VP8ksclip2[(a + 4) >> 3]; |
34 | const int a2 = VP8ksclip2[(a + 3) >> 3]; |
35 | p[-step] = VP8kclip1[p0 + a2]; |
36 | p[ 0] = VP8kclip1[q0 - a1]; |
37 | } |
38 | |
39 | // 4 pixels in, 4 pixels out |
40 | static WEBP_INLINE void do_filter4(uint8_t* p, int step) { |
41 | const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step]; |
42 | const int a = 3 * (q0 - p0); |
43 | const int a1 = VP8ksclip2[(a + 4) >> 3]; |
44 | const int a2 = VP8ksclip2[(a + 3) >> 3]; |
45 | const int a3 = (a1 + 1) >> 1; |
46 | p[-2 * step] = VP8kclip1[p1 + a3]; |
47 | p[- step] = VP8kclip1[p0 + a2]; |
48 | p[ 0] = VP8kclip1[q0 - a1]; |
49 | p[ step] = VP8kclip1[q1 - a3]; |
50 | } |
51 | |
52 | // 6 pixels in, 6 pixels out |
53 | static WEBP_INLINE void do_filter6(uint8_t* p, int step) { |
54 | const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step]; |
55 | const int q0 = p[0], q1 = p[step], q2 = p[2 * step]; |
56 | const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]]; |
57 | // a is in [-128,127], a1 in [-27,27], a2 in [-18,18] and a3 in [-9,9] |
58 | const int a1 = (27 * a + 63) >> 7; // eq. to ((3 * a + 7) * 9) >> 7 |
59 | const int a2 = (18 * a + 63) >> 7; // eq. to ((2 * a + 7) * 9) >> 7 |
60 | const int a3 = (9 * a + 63) >> 7; // eq. to ((1 * a + 7) * 9) >> 7 |
61 | p[-3 * step] = VP8kclip1[p2 + a3]; |
62 | p[-2 * step] = VP8kclip1[p1 + a2]; |
63 | p[- step] = VP8kclip1[p0 + a1]; |
64 | p[ 0] = VP8kclip1[q0 - a1]; |
65 | p[ step] = VP8kclip1[q1 - a2]; |
66 | p[ 2 * step] = VP8kclip1[q2 - a3]; |
67 | } |
68 | |
69 | static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) { |
70 | const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step]; |
71 | return (abs_mips32(p1 - p0) > thresh) || (abs_mips32(q1 - q0) > thresh); |
72 | } |
73 | |
74 | static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) { |
75 | const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step]; |
76 | return ((4 * abs_mips32(p0 - q0) + abs_mips32(p1 - q1)) <= t); |
77 | } |
78 | |
79 | static WEBP_INLINE int needs_filter2(const uint8_t* p, |
80 | int step, int t, int it) { |
81 | const int p3 = p[-4 * step], p2 = p[-3 * step]; |
82 | const int p1 = p[-2 * step], p0 = p[-step]; |
83 | const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step]; |
84 | if ((4 * abs_mips32(p0 - q0) + abs_mips32(p1 - q1)) > t) { |
85 | return 0; |
86 | } |
87 | return abs_mips32(p3 - p2) <= it && abs_mips32(p2 - p1) <= it && |
88 | abs_mips32(p1 - p0) <= it && abs_mips32(q3 - q2) <= it && |
89 | abs_mips32(q2 - q1) <= it && abs_mips32(q1 - q0) <= it; |
90 | } |
91 | |
92 | static WEBP_INLINE void FilterLoop26(uint8_t* p, |
93 | int hstride, int vstride, int size, |
94 | int thresh, int ithresh, int hev_thresh) { |
95 | const int thresh2 = 2 * thresh + 1; |
96 | while (size-- > 0) { |
97 | if (needs_filter2(p, hstride, thresh2, ithresh)) { |
98 | if (hev(p, hstride, hev_thresh)) { |
99 | do_filter2(p, hstride); |
100 | } else { |
101 | do_filter6(p, hstride); |
102 | } |
103 | } |
104 | p += vstride; |
105 | } |
106 | } |
107 | |
108 | static WEBP_INLINE void FilterLoop24(uint8_t* p, |
109 | int hstride, int vstride, int size, |
110 | int thresh, int ithresh, int hev_thresh) { |
111 | const int thresh2 = 2 * thresh + 1; |
112 | while (size-- > 0) { |
113 | if (needs_filter2(p, hstride, thresh2, ithresh)) { |
114 | if (hev(p, hstride, hev_thresh)) { |
115 | do_filter2(p, hstride); |
116 | } else { |
117 | do_filter4(p, hstride); |
118 | } |
119 | } |
120 | p += vstride; |
121 | } |
122 | } |
123 | |
124 | // on macroblock edges |
// Filters 16 positions of the edge that starts at 'p': pixels compared across
// the edge are 'stride' bytes apart, and successive positions are one byte
// apart.
static void VFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  const int across_edge = stride;
  const int along_edge = 1;
  FilterLoop26(p, across_edge, along_edge, 16, thresh, ithresh, hev_thresh);
}
129 | |
// Filters 16 positions of the edge that starts at 'p': pixels compared across
// the edge are one byte apart, and successive positions are 'stride' bytes
// apart.
static void HFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  const int across_edge = 1;
  const int along_edge = stride;
  FilterLoop26(p, across_edge, along_edge, 16, thresh, ithresh, hev_thresh);
}
134 | |
135 | // 8-pixels wide variant, for chroma filtering |
// Applies the FilterLoop26() edge filter to 8 positions of the 'u' buffer and
// then of the 'v' buffer. Pixels compared across the edge are 'stride' bytes
// apart; successive positions are one byte apart.
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  const int num_positions = 8;
  FilterLoop26(u, stride, 1, num_positions, thresh, ithresh, hev_thresh);
  FilterLoop26(v, stride, 1, num_positions, thresh, ithresh, hev_thresh);
}
141 | |
// Applies the FilterLoop26() edge filter to 8 positions of the 'u' buffer and
// then of the 'v' buffer. Pixels compared across the edge are one byte apart;
// successive positions are 'stride' bytes apart.
static void HFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  const int num_positions = 8;
  FilterLoop26(u, 1, stride, num_positions, thresh, ithresh, hev_thresh);
  FilterLoop26(v, 1, stride, num_positions, thresh, ithresh, hev_thresh);
}
147 | |
// Applies the FilterLoop24() edge filter to the edge located four rows below
// the start of the 'u' buffer, then to the same edge of the 'v' buffer
// (8 positions each).
static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  const int edge_offset = 4 * stride;
  FilterLoop24(u + edge_offset, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v + edge_offset, stride, 1, 8, thresh, ithresh, hev_thresh);
}
153 | |
// Applies the FilterLoop24() edge filter to the edge located four columns to
// the right of the start of the 'u' buffer, then to the same edge of the 'v'
// buffer (8 positions each).
static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  const int edge_offset = 4;
  FilterLoop24(u + edge_offset, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v + edge_offset, 1, stride, 8, thresh, ithresh, hev_thresh);
}
159 | |
160 | // on three inner edges |
// Applies the FilterLoop24() edge filter to the three edges located 4, 8 and
// 12 rows below 'p' (16 positions each, in that order).
static void VFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  int edge;
  for (edge = 1; edge <= 3; ++edge) {
    uint8_t* const row = p + 4 * edge * stride;
    FilterLoop24(row, stride, 1, 16, thresh, ithresh, hev_thresh);
  }
}
169 | |
// Applies the FilterLoop24() edge filter to the three edges located 4, 8 and
// 12 columns to the right of 'p' (16 positions each, in that order).
static void HFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  int edge;
  for (edge = 1; edge <= 3; ++edge) {
    uint8_t* const column = p + 4 * edge;
    FilterLoop24(column, 1, stride, 16, thresh, ithresh, hev_thresh);
  }
}
178 | |
179 | //------------------------------------------------------------------------------ |
180 | // Simple In-loop filtering (Paragraph 15.2) |
181 | |
// Simple filter over 16 horizontally adjacent positions starting at 'p'.
// For each position, the pixels compared are 'stride' bytes apart; the
// position is rewritten with do_filter2() when needs_filter() accepts it
// (edge limit 2 * thresh + 1).
static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
  const int edge_limit = 2 * thresh + 1;
  int x;
  for (x = 0; x < 16; ++x) {
    uint8_t* const px = p + x;
    if (!needs_filter(px, stride, edge_limit)) {
      continue;
    }
    do_filter2(px, stride);
  }
}
191 | |
// Simple filter over 16 vertically adjacent positions starting at 'p' (one per
// row, 'stride' bytes apart). For each position, the pixels compared are one
// byte apart; the position is rewritten with do_filter2() when needs_filter()
// accepts it (edge limit 2 * thresh + 1).
static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
  const int edge_limit = 2 * thresh + 1;
  int y;
  for (y = 0; y < 16; ++y) {
    uint8_t* const px = p + y * stride;
    if (!needs_filter(px, 1, edge_limit)) {
      continue;
    }
    do_filter2(px, 1);
  }
}
201 | |
// Runs SimpleVFilter16() on the three edges located 4, 8 and 12 rows below
// 'p', in that order.
static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
  int edge;
  for (edge = 1; edge <= 3; ++edge) {
    SimpleVFilter16(p + 4 * edge * stride, stride, thresh);
  }
}
209 | |
// Runs SimpleHFilter16() on the three edges located 4, 8 and 12 columns to the
// right of 'p', in that order.
static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
  int edge;
  for (edge = 1; edge <= 3; ++edge) {
    SimpleHFilter16(p + 4 * edge, stride, thresh);
  }
}
217 | |
// Transforms the 16 coefficients at 'in' and adds the result to the 4x4 block
// of pixels at 'dst' (rows are BPS bytes apart), saturating every pixel to
// [0, 255]. Presumably this is VP8's 4x4 inverse transform -- confirm against
// the plain-C reference implementation.
//
// Both passes apply the same 4-point butterfly to inputs x0, x1, x2, x3:
//   a = x0 + x2
//   b = x0 - x2
//   c = ((x1 * kC2) >> 16) - ((x3 * kC1) >> 16)
//   d = ((x1 * kC1) >> 16) + ((x3 * kC2) >> 16)
// and produce the four outputs  a + d,  b + c,  b - c,  a - d.
//
// First pass: for i = 0..3, the inputs are in[i], in[i + 4], in[i + 8] and
// in[i + 12]. Afterwards the registers hold:
//   a + d : temp8,  temp9,  temp10, temp11   (i = 0, 1, 2, 3)
//   b + c : temp12, temp13, temp14, temp15
//   b - c : temp0,  temp1,  temp2,  temp3
//   a - d : temp4,  temp5,  temp6,  temp7
// Second pass: each line of the table above is one input group (x0..x3 from
// left to right). The rounding constant 4 is added to x0 beforehand, and the
// four outputs, shifted right by 3, are added to one row of 'dst'.
static void TransformOne(const int16_t* in, uint8_t* dst) {
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14;
  int temp15, temp16, temp17, temp18;
  // The cast only drops 'const': the asm below accesses %[in] exclusively
  // through 'lh' loads.
  int16_t* p_in = (int16_t*)in;

  // The loops of both passes are unrolled and merged, which avoids the
  // temporary buffer between the passes and reduces the number of stalls.
  // The fixed-point multiplication is written directly in assembler
  // ('mul' followed by 'sra ..., 16').
  __asm__ volatile(
    // ---- First pass, i = 0: in[0], in[8], in[4], in[12] ----
    "lh %[temp0], 0(%[in]) \n\t"
    "lh %[temp8], 16(%[in]) \n\t"
    "lh %[temp4], 8(%[in]) \n\t"
    "lh %[temp12], 24(%[in]) \n\t"
    // temp16 = a, temp0 = b
    "addu %[temp16], %[temp0], %[temp8] \n\t"
    "subu %[temp0], %[temp0], %[temp8] \n\t"
    "mul %[temp8], %[temp4], %[kC2] \n\t"
    "mul %[temp17], %[temp12], %[kC1] \n\t"
    "mul %[temp4], %[temp4], %[kC1] \n\t"
    "mul %[temp12], %[temp12], %[kC2] \n\t"
    // (loads for i = 1: in[1], in[5], in[9], in[13])
    "lh %[temp1], 2(%[in]) \n\t"
    "lh %[temp5], 10(%[in]) \n\t"
    "lh %[temp9], 18(%[in]) \n\t"
    "lh %[temp13], 26(%[in]) \n\t"
    "sra %[temp8], %[temp8], 16 \n\t"
    "sra %[temp17], %[temp17], 16 \n\t"
    "sra %[temp4], %[temp4], 16 \n\t"
    "sra %[temp12], %[temp12], 16 \n\t"
    // (loads for i = 2: in[2], in[6], in[10], in[14])
    "lh %[temp2], 4(%[in]) \n\t"
    "lh %[temp6], 12(%[in]) \n\t"
    "lh %[temp10], 20(%[in]) \n\t"
    "lh %[temp14], 28(%[in]) \n\t"
    // i = 0: temp17 = c, temp4 = d, then temp8 = a + d, temp4 = a - d
    "subu %[temp17], %[temp8], %[temp17] \n\t"
    "addu %[temp4], %[temp4], %[temp12] \n\t"
    "addu %[temp8], %[temp16], %[temp4] \n\t"
    "subu %[temp4], %[temp16], %[temp4] \n\t"
    // ---- First pass, i = 1: temp16 = a, temp1 = b ----
    "addu %[temp16], %[temp1], %[temp9] \n\t"
    "subu %[temp1], %[temp1], %[temp9] \n\t"
    // (loads for i = 3: in[3], in[7], in[11], in[15])
    "lh %[temp3], 6(%[in]) \n\t"
    "lh %[temp7], 14(%[in]) \n\t"
    "lh %[temp11], 22(%[in]) \n\t"
    "lh %[temp15], 30(%[in]) \n\t"
    // i = 0: temp12 = b + c, temp0 = b - c
    "addu %[temp12], %[temp0], %[temp17] \n\t"
    "subu %[temp0], %[temp0], %[temp17] \n\t"
    "mul %[temp9], %[temp5], %[kC2] \n\t"
    "mul %[temp17], %[temp13], %[kC1] \n\t"
    "mul %[temp5], %[temp5], %[kC1] \n\t"
    "mul %[temp13], %[temp13], %[kC2] \n\t"
    "sra %[temp9], %[temp9], 16 \n\t"
    "sra %[temp17], %[temp17], 16 \n\t"
    "subu %[temp17], %[temp9], %[temp17] \n\t"
    "sra %[temp5], %[temp5], 16 \n\t"
    "sra %[temp13], %[temp13], 16 \n\t"
    "addu %[temp5], %[temp5], %[temp13] \n\t"
    // i = 1: temp13 = b + c, temp1 = b - c
    "addu %[temp13], %[temp1], %[temp17] \n\t"
    "subu %[temp1], %[temp1], %[temp17] \n\t"
    // (products of in[14] for i = 2)
    "mul %[temp17], %[temp14], %[kC1] \n\t"
    "mul %[temp14], %[temp14], %[kC2] \n\t"
    // i = 1: temp9 = a + d, temp5 = a - d
    "addu %[temp9], %[temp16], %[temp5] \n\t"
    "subu %[temp5], %[temp16], %[temp5] \n\t"
    // ---- First pass, i = 2: temp16 = a, temp2 = b ----
    "addu %[temp16], %[temp2], %[temp10] \n\t"
    "subu %[temp2], %[temp2], %[temp10] \n\t"
    "mul %[temp10], %[temp6], %[kC2] \n\t"
    "mul %[temp6], %[temp6], %[kC1] \n\t"
    "sra %[temp17], %[temp17], 16 \n\t"
    "sra %[temp14], %[temp14], 16 \n\t"
    "sra %[temp10], %[temp10], 16 \n\t"
    "sra %[temp6], %[temp6], 16 \n\t"
    "subu %[temp17], %[temp10], %[temp17] \n\t"
    "addu %[temp6], %[temp6], %[temp14] \n\t"
    // i = 2: temp10 = a + d, temp6 = a - d, temp14 = b + c, temp2 = b - c
    "addu %[temp10], %[temp16], %[temp6] \n\t"
    "subu %[temp6], %[temp16], %[temp6] \n\t"
    "addu %[temp14], %[temp2], %[temp17] \n\t"
    "subu %[temp2], %[temp2], %[temp17] \n\t"
    // ---- First pass, i = 3 ----
    "mul %[temp17], %[temp15], %[kC1] \n\t"
    "mul %[temp15], %[temp15], %[kC2] \n\t"
    // temp16 = a, temp3 = b
    "addu %[temp16], %[temp3], %[temp11] \n\t"
    "subu %[temp3], %[temp3], %[temp11] \n\t"
    "mul %[temp11], %[temp7], %[kC2] \n\t"
    "mul %[temp7], %[temp7], %[kC1] \n\t"
    // Rounding constant for the second pass, added to the four i = 0 results.
    "addiu %[temp8], %[temp8], 4 \n\t"
    "addiu %[temp12], %[temp12], 4 \n\t"
    "addiu %[temp0], %[temp0], 4 \n\t"
    "addiu %[temp4], %[temp4], 4 \n\t"
    "sra %[temp17], %[temp17], 16 \n\t"
    "sra %[temp15], %[temp15], 16 \n\t"
    "sra %[temp11], %[temp11], 16 \n\t"
    "sra %[temp7], %[temp7], 16 \n\t"
    "subu %[temp17], %[temp11], %[temp17] \n\t"
    "addu %[temp7], %[temp7], %[temp15] \n\t"
    // i = 3: temp15 = b + c, temp3 = b - c, temp11 = a + d, temp7 = a - d
    "addu %[temp15], %[temp3], %[temp17] \n\t"
    "subu %[temp3], %[temp3], %[temp17] \n\t"
    "addu %[temp11], %[temp16], %[temp7] \n\t"
    "subu %[temp7], %[temp16], %[temp7] \n\t"
    // ---- Second pass, group (temp8, temp9, temp10, temp11) ----
    // Result: temp16 = a, temp8 = b, temp17 = c, temp11 = d.
    "addu %[temp16], %[temp8], %[temp10] \n\t"
    "subu %[temp8], %[temp8], %[temp10] \n\t"
    "mul %[temp10], %[temp9], %[kC2] \n\t"
    "mul %[temp17], %[temp11], %[kC1] \n\t"
    "mul %[temp9], %[temp9], %[kC1] \n\t"
    "mul %[temp11], %[temp11], %[kC2] \n\t"
    "sra %[temp10], %[temp10], 16 \n\t"
    "sra %[temp17], %[temp17], 16 \n\t"
    "sra %[temp9], %[temp9], 16 \n\t"
    "sra %[temp11], %[temp11], 16 \n\t"
    "subu %[temp17], %[temp10], %[temp17] \n\t"
    "addu %[temp11], %[temp9], %[temp11] \n\t"
    // ---- Second pass, group (temp12, temp13, temp14, temp15) ----
    // Result: temp10 = a, temp12 = b, temp9 = c, temp15 = d.
    "addu %[temp10], %[temp12], %[temp14] \n\t"
    "subu %[temp12], %[temp12], %[temp14] \n\t"
    "mul %[temp14], %[temp13], %[kC2] \n\t"
    "mul %[temp9], %[temp15], %[kC1] \n\t"
    "mul %[temp13], %[temp13], %[kC1] \n\t"
    "mul %[temp15], %[temp15], %[kC2] \n\t"
    "sra %[temp14], %[temp14], 16 \n\t"
    "sra %[temp9], %[temp9], 16 \n\t"
    "sra %[temp13], %[temp13], 16 \n\t"
    "sra %[temp15], %[temp15], 16 \n\t"
    "subu %[temp9], %[temp14], %[temp9] \n\t"
    "addu %[temp15], %[temp13], %[temp15] \n\t"
    // ---- Second pass, group (temp0, temp1, temp2, temp3) ----
    // Result: temp14 = a, temp0 = b, temp13 = c, temp3 = d.
    "addu %[temp14], %[temp0], %[temp2] \n\t"
    "subu %[temp0], %[temp0], %[temp2] \n\t"
    "mul %[temp2], %[temp1], %[kC2] \n\t"
    "mul %[temp13], %[temp3], %[kC1] \n\t"
    "mul %[temp1], %[temp1], %[kC1] \n\t"
    "mul %[temp3], %[temp3], %[kC2] \n\t"
    "sra %[temp2], %[temp2], 16 \n\t"
    "sra %[temp13], %[temp13], 16 \n\t"
    "sra %[temp1], %[temp1], 16 \n\t"
    "sra %[temp3], %[temp3], 16 \n\t"
    "subu %[temp13], %[temp2], %[temp13] \n\t"
    "addu %[temp3], %[temp1], %[temp3] \n\t"
    // ---- Second pass, group (temp4, temp5, temp6, temp7) ----
    // Result: temp2 = a, temp4 = b, temp1 = c, temp7 = d.
    "addu %[temp2], %[temp4], %[temp6] \n\t"
    "subu %[temp4], %[temp4], %[temp6] \n\t"
    "mul %[temp6], %[temp5], %[kC2] \n\t"
    "mul %[temp1], %[temp7], %[kC1] \n\t"
    "mul %[temp5], %[temp5], %[kC1] \n\t"
    "mul %[temp7], %[temp7], %[kC2] \n\t"
    "sra %[temp6], %[temp6], 16 \n\t"
    "sra %[temp1], %[temp1], 16 \n\t"
    "sra %[temp5], %[temp5], 16 \n\t"
    "sra %[temp7], %[temp7], 16 \n\t"
    "subu %[temp1], %[temp6], %[temp1] \n\t"
    "addu %[temp7], %[temp5], %[temp7] \n\t"
    // Deltas for dst row 0, each >> 3:
    // temp5 = a + d, temp11 = b + c, temp8 = b - c, temp16 = a - d.
    "addu %[temp5], %[temp16], %[temp11] \n\t"
    "subu %[temp16], %[temp16], %[temp11] \n\t"
    "addu %[temp11], %[temp8], %[temp17] \n\t"
    "subu %[temp8], %[temp8], %[temp17] \n\t"
    "sra %[temp5], %[temp5], 3 \n\t"
    "sra %[temp16], %[temp16], 3 \n\t"
    "sra %[temp11], %[temp11], 3 \n\t"
    "sra %[temp8], %[temp8], 3 \n\t"
    // Deltas for dst row 1, each >> 3:
    // temp17 = a + d, temp15 = b + c, temp12 = b - c, temp10 = a - d.
    "addu %[temp17], %[temp10], %[temp15] \n\t"
    "subu %[temp10], %[temp10], %[temp15] \n\t"
    "addu %[temp15], %[temp12], %[temp9] \n\t"
    "subu %[temp12], %[temp12], %[temp9] \n\t"
    "sra %[temp17], %[temp17], 3 \n\t"
    "sra %[temp10], %[temp10], 3 \n\t"
    "sra %[temp15], %[temp15], 3 \n\t"
    "sra %[temp12], %[temp12], 3 \n\t"
    // Deltas for dst row 2, each >> 3:
    // temp9 = a + d, temp3 = b + c, temp0 = b - c, temp14 = a - d.
    "addu %[temp9], %[temp14], %[temp3] \n\t"
    "subu %[temp14], %[temp14], %[temp3] \n\t"
    "addu %[temp3], %[temp0], %[temp13] \n\t"
    "subu %[temp0], %[temp0], %[temp13] \n\t"
    "sra %[temp9], %[temp9], 3 \n\t"
    "sra %[temp14], %[temp14], 3 \n\t"
    "sra %[temp3], %[temp3], 3 \n\t"
    "sra %[temp0], %[temp0], 3 \n\t"
    // Deltas for dst row 3, each >> 3:
    // temp13 = a + d, temp7 = b + c, temp4 = b - c, temp2 = a - d.
    "addu %[temp13], %[temp2], %[temp7] \n\t"
    "subu %[temp2], %[temp2], %[temp7] \n\t"
    "addu %[temp7], %[temp4], %[temp1] \n\t"
    "subu %[temp4], %[temp4], %[temp1] \n\t"
    "sra %[temp13], %[temp13], 3 \n\t"
    "sra %[temp2], %[temp2], 3 \n\t"
    "sra %[temp7], %[temp7], 3 \n\t"
    "sra %[temp4], %[temp4], 3 \n\t"
    // ---- Add to dst with saturation ----
    // temp6 holds 255 from here on. Clamp pattern used for every pixel value v:
    //   (v >> 8) == 0  -> v is already in [0, 255]: branch over the clamp;
    //   otherwise v is zeroed (xor), and 'movz' replaces that zero with 255
    //   when (v >> 31) == 0, i.e. when v was positive.
    "addiu %[temp6], $zero, 255 \n\t"
    // dst row 0, one pixel at a time.
    "lbu %[temp1], 0+0*" XSTR(BPS) "(%[dst]) \n\t"
    "addu %[temp1], %[temp1], %[temp5] \n\t"
    "sra %[temp5], %[temp1], 8 \n\t"
    "sra %[temp18], %[temp1], 31 \n\t"
    "beqz %[temp5], 1f \n\t"
    "xor %[temp1], %[temp1], %[temp1] \n\t"
    "movz %[temp1], %[temp6], %[temp18] \n\t"
  "1: \n\t"
    "lbu %[temp18], 1+0*" XSTR(BPS) "(%[dst]) \n\t"
    "sb %[temp1], 0+0*" XSTR(BPS) "(%[dst]) \n\t"
    "addu %[temp18], %[temp18], %[temp11] \n\t"
    "sra %[temp11], %[temp18], 8 \n\t"
    "sra %[temp1], %[temp18], 31 \n\t"
    "beqz %[temp11], 2f \n\t"
    "xor %[temp18], %[temp18], %[temp18] \n\t"
    "movz %[temp18], %[temp6], %[temp1] \n\t"
  "2: \n\t"
    "lbu %[temp1], 2+0*" XSTR(BPS) "(%[dst]) \n\t"
    "sb %[temp18], 1+0*" XSTR(BPS) "(%[dst]) \n\t"
    "addu %[temp1], %[temp1], %[temp8] \n\t"
    "sra %[temp8], %[temp1], 8 \n\t"
    "sra %[temp18], %[temp1], 31 \n\t"
    "beqz %[temp8], 3f \n\t"
    "xor %[temp1], %[temp1], %[temp1] \n\t"
    "movz %[temp1], %[temp6], %[temp18] \n\t"
  "3: \n\t"
    "lbu %[temp18], 3+0*" XSTR(BPS) "(%[dst]) \n\t"
    "sb %[temp1], 2+0*" XSTR(BPS) "(%[dst]) \n\t"
    "addu %[temp18], %[temp18], %[temp16] \n\t"
    "sra %[temp16], %[temp18], 8 \n\t"
    "sra %[temp1], %[temp18], 31 \n\t"
    "beqz %[temp16], 4f \n\t"
    "xor %[temp18], %[temp18], %[temp18] \n\t"
    "movz %[temp18], %[temp6], %[temp1] \n\t"
  "4: \n\t"
    "sb %[temp18], 3+0*" XSTR(BPS) "(%[dst]) \n\t"
    // dst row 1: load four pixels, add the deltas, clamp, store.
    "lbu %[temp5], 0+1*" XSTR(BPS) "(%[dst]) \n\t"
    "lbu %[temp8], 1+1*" XSTR(BPS) "(%[dst]) \n\t"
    "lbu %[temp11], 2+1*" XSTR(BPS) "(%[dst]) \n\t"
    "lbu %[temp16], 3+1*" XSTR(BPS) "(%[dst]) \n\t"
    "addu %[temp5], %[temp5], %[temp17] \n\t"
    "addu %[temp8], %[temp8], %[temp15] \n\t"
    "addu %[temp11], %[temp11], %[temp12] \n\t"
    "addu %[temp16], %[temp16], %[temp10] \n\t"
    "sra %[temp18], %[temp5], 8 \n\t"
    "sra %[temp1], %[temp5], 31 \n\t"
    "beqz %[temp18], 5f \n\t"
    "xor %[temp5], %[temp5], %[temp5] \n\t"
    "movz %[temp5], %[temp6], %[temp1] \n\t"
  "5: \n\t"
    "sra %[temp18], %[temp8], 8 \n\t"
    "sra %[temp1], %[temp8], 31 \n\t"
    "beqz %[temp18], 6f \n\t"
    "xor %[temp8], %[temp8], %[temp8] \n\t"
    "movz %[temp8], %[temp6], %[temp1] \n\t"
  "6: \n\t"
    "sra %[temp18], %[temp11], 8 \n\t"
    "sra %[temp1], %[temp11], 31 \n\t"
    "sra %[temp17], %[temp16], 8 \n\t"
    "sra %[temp15], %[temp16], 31 \n\t"
    "beqz %[temp18], 7f \n\t"
    "xor %[temp11], %[temp11], %[temp11] \n\t"
    "movz %[temp11], %[temp6], %[temp1] \n\t"
  "7: \n\t"
    "beqz %[temp17], 8f \n\t"
    "xor %[temp16], %[temp16], %[temp16] \n\t"
    "movz %[temp16], %[temp6], %[temp15] \n\t"
  "8: \n\t"
    "sb %[temp5], 0+1*" XSTR(BPS) "(%[dst]) \n\t"
    "sb %[temp8], 1+1*" XSTR(BPS) "(%[dst]) \n\t"
    "sb %[temp11], 2+1*" XSTR(BPS) "(%[dst]) \n\t"
    "sb %[temp16], 3+1*" XSTR(BPS) "(%[dst]) \n\t"
    // dst row 2: load four pixels, add the deltas, clamp, store.
    "lbu %[temp5], 0+2*" XSTR(BPS) "(%[dst]) \n\t"
    "lbu %[temp8], 1+2*" XSTR(BPS) "(%[dst]) \n\t"
    "lbu %[temp11], 2+2*" XSTR(BPS) "(%[dst]) \n\t"
    "lbu %[temp16], 3+2*" XSTR(BPS) "(%[dst]) \n\t"
    "addu %[temp5], %[temp5], %[temp9] \n\t"
    "addu %[temp8], %[temp8], %[temp3] \n\t"
    "addu %[temp11], %[temp11], %[temp0] \n\t"
    "addu %[temp16], %[temp16], %[temp14] \n\t"
    "sra %[temp18], %[temp5], 8 \n\t"
    "sra %[temp1], %[temp5], 31 \n\t"
    "sra %[temp17], %[temp8], 8 \n\t"
    "sra %[temp15], %[temp8], 31 \n\t"
    "sra %[temp12], %[temp11], 8 \n\t"
    "sra %[temp10], %[temp11], 31 \n\t"
    "sra %[temp9], %[temp16], 8 \n\t"
    "sra %[temp3], %[temp16], 31 \n\t"
    "beqz %[temp18], 9f \n\t"
    "xor %[temp5], %[temp5], %[temp5] \n\t"
    "movz %[temp5], %[temp6], %[temp1] \n\t"
  "9: \n\t"
    "beqz %[temp17], 10f \n\t"
    "xor %[temp8], %[temp8], %[temp8] \n\t"
    "movz %[temp8], %[temp6], %[temp15] \n\t"
  "10: \n\t"
    "beqz %[temp12], 11f \n\t"
    "xor %[temp11], %[temp11], %[temp11] \n\t"
    "movz %[temp11], %[temp6], %[temp10] \n\t"
  "11: \n\t"
    "beqz %[temp9], 12f \n\t"
    "xor %[temp16], %[temp16], %[temp16] \n\t"
    "movz %[temp16], %[temp6], %[temp3] \n\t"
  "12: \n\t"
    "sb %[temp5], 0+2*" XSTR(BPS) "(%[dst]) \n\t"
    "sb %[temp8], 1+2*" XSTR(BPS) "(%[dst]) \n\t"
    "sb %[temp11], 2+2*" XSTR(BPS) "(%[dst]) \n\t"
    "sb %[temp16], 3+2*" XSTR(BPS) "(%[dst]) \n\t"
    // dst row 3: load four pixels, add the deltas, clamp, store.
    "lbu %[temp5], 0+3*" XSTR(BPS) "(%[dst]) \n\t"
    "lbu %[temp8], 1+3*" XSTR(BPS) "(%[dst]) \n\t"
    "lbu %[temp11], 2+3*" XSTR(BPS) "(%[dst]) \n\t"
    "lbu %[temp16], 3+3*" XSTR(BPS) "(%[dst]) \n\t"
    "addu %[temp5], %[temp5], %[temp13] \n\t"
    "addu %[temp8], %[temp8], %[temp7] \n\t"
    "addu %[temp11], %[temp11], %[temp4] \n\t"
    "addu %[temp16], %[temp16], %[temp2] \n\t"
    "sra %[temp18], %[temp5], 8 \n\t"
    "sra %[temp1], %[temp5], 31 \n\t"
    "sra %[temp17], %[temp8], 8 \n\t"
    "sra %[temp15], %[temp8], 31 \n\t"
    "sra %[temp12], %[temp11], 8 \n\t"
    "sra %[temp10], %[temp11], 31 \n\t"
    "sra %[temp9], %[temp16], 8 \n\t"
    "sra %[temp3], %[temp16], 31 \n\t"
    "beqz %[temp18], 13f \n\t"
    "xor %[temp5], %[temp5], %[temp5] \n\t"
    "movz %[temp5], %[temp6], %[temp1] \n\t"
  "13: \n\t"
    "beqz %[temp17], 14f \n\t"
    "xor %[temp8], %[temp8], %[temp8] \n\t"
    "movz %[temp8], %[temp6], %[temp15] \n\t"
  "14: \n\t"
    "beqz %[temp12], 15f \n\t"
    "xor %[temp11], %[temp11], %[temp11] \n\t"
    "movz %[temp11], %[temp6], %[temp10] \n\t"
  "15: \n\t"
    "beqz %[temp9], 16f \n\t"
    "xor %[temp16], %[temp16], %[temp16] \n\t"
    "movz %[temp16], %[temp6], %[temp3] \n\t"
  "16: \n\t"
    "sb %[temp5], 0+3*" XSTR(BPS) "(%[dst]) \n\t"
    "sb %[temp8], 1+3*" XSTR(BPS) "(%[dst]) \n\t"
    "sb %[temp11], 2+3*" XSTR(BPS) "(%[dst]) \n\t"
    "sb %[temp16], 3+3*" XSTR(BPS) "(%[dst]) \n\t"

    // All temporaries are early-clobber outputs ("=&r") so that none of them
    // is allocated to the same register as one of the inputs, which stay live
    // for the whole block.
    : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2),
      [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5),
      [temp6]"=&r" (temp6), [temp7]"=&r" (temp7), [temp8]"=&r" (temp8),
      [temp9]"=&r" (temp9), [temp10]"=&r" (temp10), [temp11]"=&r" (temp11),
      [temp12]"=&r" (temp12), [temp13]"=&r" (temp13), [temp14]"=&r" (temp14),
      [temp15]"=&r" (temp15), [temp16]"=&r" (temp16), [temp17]"=&r" (temp17),
      [temp18]"=&r" (temp18)
    : [in]"r" (p_in), [kC1]"r" (kC1), [kC2]"r" (kC2), [dst]"r" (dst)
    // "memory": dst is written through 'sb'. "hi"/"lo" are declared clobbered
    // as well (presumably because of the 'mul' instructions -- confirm against
    // the MIPS32 instruction set manual).
    : "memory" , "hi" , "lo"
  );
}
550 | |
// Runs TransformOne() on the block at (in, dst) and, when 'do_two' is
// non-zero, also on the next one: 16 coefficients further in 'in' and 4 pixels
// further to the right in 'dst'.
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
  const int num_blocks = do_two ? 2 : 1;
  int n;
  for (n = 0; n < num_blocks; ++n) {
    TransformOne(in + 16 * n, dst + 4 * n);
  }
}
557 | |
558 | //------------------------------------------------------------------------------ |
559 | // Entry point |
560 | |
561 | extern void VP8DspInitMIPS32(void); |
562 | |
563 | WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPS32(void) { |
564 | VP8InitClipTables(); |
565 | |
566 | VP8Transform = TransformTwo; |
567 | |
568 | VP8VFilter16 = VFilter16; |
569 | VP8HFilter16 = HFilter16; |
570 | VP8VFilter8 = VFilter8; |
571 | VP8HFilter8 = HFilter8; |
572 | VP8VFilter16i = VFilter16i; |
573 | VP8HFilter16i = HFilter16i; |
574 | VP8VFilter8i = VFilter8i; |
575 | VP8HFilter8i = HFilter8i; |
576 | |
577 | VP8SimpleVFilter16 = SimpleVFilter16; |
578 | VP8SimpleHFilter16 = SimpleHFilter16; |
579 | VP8SimpleVFilter16i = SimpleVFilter16i; |
580 | VP8SimpleHFilter16i = SimpleHFilter16i; |
581 | } |
582 | |
583 | #else // !WEBP_USE_MIPS32 |
584 | |
// Built without WEBP_USE_MIPS32: none of the code above is compiled.
// WEBP_DSP_INIT_STUB is defined outside this file; presumably it emits a
// placeholder definition of VP8DspInitMIPS32 so that the symbol still exists
// -- confirm in src/dsp/dsp.h.
WEBP_DSP_INIT_STUB(VP8DspInitMIPS32)
586 | |
587 | #endif // WEBP_USE_MIPS32 |
588 | |