1 | // Copyright 2014 Google Inc. All Rights Reserved. |
2 | // |
3 | // Use of this source code is governed by a BSD-style license |
4 | // that can be found in the COPYING file in the root of the source |
5 | // tree. An additional intellectual property rights grant can be found |
6 | // in the file PATENTS. All contributing project authors may |
7 | // be found in the AUTHORS file in the root of the source tree. |
8 | // ----------------------------------------------------------------------------- |
9 | // |
10 | // MIPS version of dsp functions |
11 | // |
12 | // Author(s): Djordje Pesut (djordje.pesut@imgtec.com) |
13 | // Jovan Zelincevic (jovan.zelincevic@imgtec.com) |
14 | |
15 | #include "./dsp.h" |
16 | |
17 | #if defined(WEBP_USE_MIPS_DSP_R2) |
18 | |
19 | #include "./mips_macro.h" |
20 | |
// Fixed-point multipliers for the inverse transform. They are used through
// MUL(), which discards the 16 low bits of the product, so numerically
// kC1 / 65536 ~= 1.30656 and kC2 / 65536 ~= 0.54120.
// kC1 is written as "fraction + (1 << 16)", i.e. its integer part is 1.
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;

// Returns (a * b) >> 16. The multiplication happens in the promoted operand
// type (plain 'int' for the uses in this file), so the caller is responsible
// for keeping the product representable.
#define MUL(a, b) (((a) * (b)) >> 16)
25 | |
// Adds a DC-only residual to the pixels at 'dst'.
//   in:  coefficient array; only in[0] is read here.
//   dst: destination pixels, updated in place (rows addressed with BPS).
// The value added is (in[0] + 4) >> 3, held twice in temp5 (one copy per
// halfword) and passed as every addend of STORE_SAT_SUM_X2.
// NOTE(review): LOAD_WITH_OFFSET_X4, CONVERT_2_BYTES_TO_HALF, STORE_SAT_SUM_X2
// and OUTPUT_EARLY_CLOBBER_REGS_10 come from mips_macro.h, which is not visible
// here. Judging by their names they load four rows of dst, widen bytes to
// halfwords, store the saturated sums, and declare temp1..temp10 as
// early-clobber outputs -- confirm against that header.
static void TransformDC(const int16_t* in, uint8_t* dst) {
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;

  __asm__ volatile (
    LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, dst,
                        0, 0, 0, 0,
                        0, 1, 2, 3,
                        BPS)
    // temp5 = in[0] + 4; duplicate the low halfword into the high halfword,
    // then arithmetic-shift both halfwords right by 3.
    "lh %[temp5], 0(%[in]) \n\t"
    "addiu %[temp5], %[temp5], 4 \n\t"
    "ins %[temp5], %[temp5], 16, 16 \n\t"
    "shra.ph %[temp5], %[temp5], 3 \n\t"
    CONVERT_2_BYTES_TO_HALF(temp6, temp7, temp8, temp9, temp10, temp1, temp2,
                            temp3, temp1, temp2, temp3, temp4)
    STORE_SAT_SUM_X2(temp6, temp7, temp8, temp9, temp10, temp1, temp2, temp3,
                     temp5, temp5, temp5, temp5, temp5, temp5, temp5, temp5,
                     dst, 0, 1, 2, 3, BPS)

    OUTPUT_EARLY_CLOBBER_REGS_10()
    : [in]"r" (in), [dst]"r" (dst)
    : "memory"
  );
}
49 | |
50 | static void TransformAC3(const int16_t* in, uint8_t* dst) { |
51 | const int a = in[0] + 4; |
52 | int c4 = MUL(in[4], kC2); |
53 | const int d4 = MUL(in[4], kC1); |
54 | const int c1 = MUL(in[1], kC2); |
55 | const int d1 = MUL(in[1], kC1); |
56 | int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9; |
57 | int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18; |
58 | |
59 | __asm__ volatile ( |
60 | "ins %[c4], %[d4], 16, 16 \n\t" |
61 | "replv.ph %[temp1], %[a] \n\t" |
62 | "replv.ph %[temp4], %[d1] \n\t" |
63 | ADD_SUB_HALVES(temp2, temp3, temp1, c4) |
64 | "replv.ph %[temp5], %[c1] \n\t" |
65 | SHIFT_R_SUM_X2(temp1, temp6, temp7, temp8, temp2, temp9, temp10, temp4, |
66 | temp2, temp2, temp3, temp3, temp4, temp5, temp4, temp5) |
67 | LOAD_WITH_OFFSET_X4(temp3, temp5, temp11, temp12, dst, |
68 | 0, 0, 0, 0, |
69 | 0, 1, 2, 3, |
70 | BPS) |
71 | CONVERT_2_BYTES_TO_HALF(temp13, temp14, temp3, temp15, temp5, temp16, |
72 | temp11, temp17, temp3, temp5, temp11, temp12) |
73 | PACK_2_HALVES_TO_WORD(temp12, temp18, temp7, temp6, temp1, temp8, temp2, |
74 | temp4, temp7, temp6, temp10, temp9) |
75 | STORE_SAT_SUM_X2(temp13, temp14, temp3, temp15, temp5, temp16, temp11, |
76 | temp17, temp12, temp18, temp1, temp8, temp2, temp4, |
77 | temp7, temp6, dst, 0, 1, 2, 3, BPS) |
78 | |
79 | OUTPUT_EARLY_CLOBBER_REGS_18(), |
80 | [c4]"+&r" (c4) |
81 | : [dst]"r" (dst), [a]"r" (a), [d1]"r" (d1), [d4]"r" (d4), [c1]"r" (c1) |
82 | : "memory" |
83 | ); |
84 | } |
85 | |
// Full inverse transform of one block of 16 coefficients, added to the pixels
// at 'dst' with saturation.
//   in:  16 coefficients (byte offsets 0..30 are read with ulw / LOAD_IN_X2).
//   dst: destination pixels, updated in place (rows addressed with BPS).
// The work is done in two passes; the second one is marked "horizontal" below.
// kC1 and kC2 are passed as register inputs but never named in the literal asm
// text here, so they are presumably consumed inside MUL_SHIFT_SUM; likewise
// the "hi"/"lo" clobbers suggest that macro uses the multiply unit.
// NOTE(review): LOAD_IN_X2, ADD_SUB_HALVES, MUL_SHIFT_SUM, INSERT_HALF_X2,
// SRA_16, SHIFT_R_SUM_X2, PACK_2_HALVES_TO_WORD, LOAD_WITH_OFFSET_X4,
// CONVERT_2_BYTES_TO_HALF, STORE_SAT_SUM_X2 and OUTPUT_EARLY_CLOBBER_REGS_18
// come from mips_macro.h (not visible here) -- confirm details there.
// Registers are reused aggressively; statement order must not be changed.
static void TransformOne(const int16_t* in, uint8_t* dst) {
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;

  __asm__ volatile (
    // First pass. Coefficient pairs are fetched as 32-bit words (two int16
    // values each) from byte offsets 0 and 16, then 4 and 20.
    "ulw %[temp1], 0(%[in]) \n\t"
    "ulw %[temp2], 16(%[in]) \n\t"
    LOAD_IN_X2(temp5, temp6, 24, 26)
    ADD_SUB_HALVES(temp3, temp4, temp1, temp2)
    LOAD_IN_X2(temp1, temp2, 8, 10)
    MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
                  temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
                  temp13, temp11, temp14, temp12)
    INSERT_HALF_X2(temp8, temp7, temp10, temp9)
    "ulw %[temp17], 4(%[in]) \n\t"
    "ulw %[temp18], 20(%[in]) \n\t"
    ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
    ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
    ADD_SUB_HALVES(temp7, temp8, temp17, temp18)
    LOAD_IN_X2(temp17, temp18, 12, 14)
    LOAD_IN_X2(temp9, temp10, 28, 30)
    MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
                  temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
                  temp15, temp4, temp16, temp17)
    INSERT_HALF_X2(temp11, temp12, temp13, temp14)
    ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
    ADD_SUB_HALVES(temp3, temp4, temp7, temp12)

    // horizontal
    SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
    INSERT_HALF_X2(temp1, temp6, temp5, temp2)
    SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
    // temp2 = {4, 4}: constant added to temp1 and temp6 below (presumably the
    // rounding bias for the final right shift).
    "repl.ph %[temp2], 0x4 \n\t"
    INSERT_HALF_X2(temp3, temp8, temp17, temp4)
    "addq.ph %[temp1], %[temp1], %[temp2] \n\t"
    "addq.ph %[temp6], %[temp6], %[temp2] \n\t"
    ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
    ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
    MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
                  temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
                  temp6, temp17, temp8, temp18)
    MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
                  temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
                  temp18, temp12, temp17, temp16)
    INSERT_HALF_X2(temp1, temp3, temp9, temp13)
    INSERT_HALF_X2(temp6, temp8, temp11, temp15)
    SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
                   temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
                   temp6)
    PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
                          temp16, temp11, temp10, temp15, temp14)
    // Fetch the current destination pixels and add the residual.
    LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, dst,
                        0, 0, 0, 0,
                        0, 1, 2, 3,
                        BPS)
    CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
                            temp11, temp10, temp11, temp14, temp15)
    STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
                     temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
                     dst, 0, 1, 2, 3, BPS)

    OUTPUT_EARLY_CLOBBER_REGS_18()
    : [dst]"r" (dst), [in]"r" (in), [kC1]"r" (kC1), [kC2]"r" (kC2)
    : "memory" , "hi" , "lo"
  );
}
152 | |
153 | static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) { |
154 | TransformOne(in, dst); |
155 | if (do_two) { |
156 | TransformOne(in + 16, dst + 4); |
157 | } |
158 | } |
159 | |
// In-loop edge filter that may modify up to three pixels on each side of the
// edge (six stores on the strong path, two on the high-variance path).
//   p:          points at the first pixel after the edge (q0 below).
//   hstride:    byte step between taps across the edge; the eight taps are
//               p3..p0 = p[-4*hstride .. -hstride], q0..q3 = p[0 .. 3*hstride].
//   vstride:    byte step from one filtered position to the next.
//   size:       number of positions. The loop is test-at-bottom, so the body
//               runs at least once.
//   thresh:     edge limit; a position is skipped when
//               4*|p0-q0| + |p1-q1| > 2*thresh + 1.
//   ithresh:    interior limit; a position is skipped when any neighbouring
//               tap difference exceeds it.
//   hev_thresh: if |p1-p0| or |q1-q0| exceeds it, only p0/q0 are modified.
// Final pixel values are read from VP8kclip1, indexed by the (possibly
// negative or > 255) unclamped result -- presumably a clamp-to-[0,255] table
// defined elsewhere.
// ".set noreorder" is in effect: the instruction after each branch (written
// with a leading space) is its delay slot and executes whether or not the
// branch is taken.
static WEBP_INLINE void FilterLoop26(uint8_t* p,
                                     int hstride, int vstride, int size,
                                     int thresh, int ithresh, int hev_thresh) {
  const int thresh2 = 2 * thresh + 1;
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14, temp15;

  __asm__ volatile (
    ".set push \n\t"
    ".set noreorder \n\t"
    "1: \n\t"
    // Build the offsets +-1..+-4 * hstride and load the eight taps:
    //   temp11=p3 temp8=p2 temp9=p1 temp10=p0 | temp7=q0 temp12=q1 temp13=q2
    //   temp14=q3
    "negu %[temp1], %[hstride] \n\t"
    "addiu %[size], %[size], -1 \n\t"
    "sll %[temp2], %[hstride], 1 \n\t"
    "sll %[temp3], %[temp1], 1 \n\t"
    "addu %[temp4], %[temp2], %[hstride] \n\t"
    "addu %[temp5], %[temp3], %[temp1] \n\t"
    "lbu %[temp7], 0(%[p]) \n\t"
    "sll %[temp6], %[temp3], 1 \n\t"
    "lbux %[temp8], %[temp5](%[p]) \n\t"
    "lbux %[temp9], %[temp3](%[p]) \n\t"
    "lbux %[temp10], %[temp1](%[p]) \n\t"
    "lbux %[temp11], %[temp6](%[p]) \n\t"
    "lbux %[temp12], %[hstride](%[p]) \n\t"
    "lbux %[temp13], %[temp2](%[p]) \n\t"
    "lbux %[temp14], %[temp4](%[p]) \n\t"
    // Edge test: skip if 4*|p0-q0| + |p1-q1| > thresh2.
    // Side results kept for later: temp1 = q0-p0, temp6 = 2*(q0-p0),
    // temp2 = p1-q1.
    "subu %[temp1], %[temp10], %[temp7] \n\t"
    "subu %[temp2], %[temp9], %[temp12] \n\t"
    "absq_s.w %[temp3], %[temp1] \n\t"
    "absq_s.w %[temp4], %[temp2] \n\t"
    "negu %[temp1], %[temp1] \n\t"
    "sll %[temp3], %[temp3], 2 \n\t"
    "addu %[temp15], %[temp3], %[temp4] \n\t"
    "subu %[temp3], %[temp15], %[thresh2] \n\t"
    "sll %[temp6], %[temp1], 1 \n\t"
    "bgtz %[temp3], 3f \n\t"
    " subu %[temp4], %[temp11], %[temp8] \n\t"
    // Interior tests: skip if |p3-p2|, |p2-p1|, |p1-p0|, |q3-q2|, |q2-q1| or
    // |q1-q0| exceeds ithresh. Interleaved with them: temp2/temp4 become
    // p1-q1 saturated to a signed 8-bit range (shll_s.w 24 / sra 24), and
    // temp5/temp15 become the two high-variance flags.
    "absq_s.w %[temp4], %[temp4] \n\t"
    "shll_s.w %[temp2], %[temp2], 24 \n\t"
    "subu %[temp4], %[temp4], %[ithresh] \n\t"
    "bgtz %[temp4], 3f \n\t"
    " subu %[temp3], %[temp8], %[temp9] \n\t"
    "absq_s.w %[temp3], %[temp3] \n\t"
    "subu %[temp3], %[temp3], %[ithresh] \n\t"
    "bgtz %[temp3], 3f \n\t"
    " subu %[temp5], %[temp9], %[temp10] \n\t"
    "absq_s.w %[temp3], %[temp5] \n\t"
    "absq_s.w %[temp5], %[temp5] \n\t"
    "subu %[temp3], %[temp3], %[ithresh] \n\t"
    "bgtz %[temp3], 3f \n\t"
    " subu %[temp3], %[temp14], %[temp13] \n\t"
    "absq_s.w %[temp3], %[temp3] \n\t"
    // temp5 = (hev_thresh < |p1-p0|)
    "slt %[temp5], %[hev_thresh], %[temp5] \n\t"
    "subu %[temp3], %[temp3], %[ithresh] \n\t"
    "bgtz %[temp3], 3f \n\t"
    " subu %[temp3], %[temp13], %[temp12] \n\t"
    "absq_s.w %[temp3], %[temp3] \n\t"
    "sra %[temp4], %[temp2], 24 \n\t"
    "subu %[temp3], %[temp3], %[ithresh] \n\t"
    "bgtz %[temp3], 3f \n\t"
    " subu %[temp15], %[temp12], %[temp7] \n\t"
    "absq_s.w %[temp3], %[temp15] \n\t"
    "absq_s.w %[temp15], %[temp15] \n\t"
    "subu %[temp3], %[temp3], %[ithresh] \n\t"
    "bgtz %[temp3], 3f \n\t"
    // temp15 = (hev_thresh < |q1-q0|)
    " slt %[temp15], %[hev_thresh], %[temp15] \n\t"
    // temp5 = a = 3*(q0-p0) + sat8(p1-q1); temp2 = high-variance flag.
    "addu %[temp3], %[temp6], %[temp1] \n\t"
    "or %[temp2], %[temp5], %[temp15] \n\t"
    "addu %[temp5], %[temp4], %[temp3] \n\t"
    "beqz %[temp2], 4f \n\t"
    " shra_r.w %[temp1], %[temp5], 3 \n\t"
    // High-variance path: only p0 and q0 change.
    //   temp1 = (a + 4) >> 3 (rounded shift from the delay slot above),
    //   temp2 = (a + 3) >> 3, both saturated to a signed 5-bit range
    //   (shll_s.w 27 / sra 27). p0 += temp2, q0 -= temp1.
    "addiu %[temp2], %[temp5], 3 \n\t"
    "sra %[temp2], %[temp2], 3 \n\t"
    "shll_s.w %[temp1], %[temp1], 27 \n\t"
    "shll_s.w %[temp2], %[temp2], 27 \n\t"
    "subu %[temp3], %[p], %[hstride] \n\t"
    "sra %[temp1], %[temp1], 27 \n\t"
    "sra %[temp2], %[temp2], 27 \n\t"
    "subu %[temp1], %[temp7], %[temp1] \n\t"
    "addu %[temp2], %[temp10], %[temp2] \n\t"
    "lbux %[temp2], %[temp2](%[VP8kclip1]) \n\t"
    "lbux %[temp1], %[temp1](%[VP8kclip1]) \n\t"
    "sb %[temp2], 0(%[temp3]) \n\t"
    "j 3f \n\t"
    " sb %[temp1], 0(%[p]) \n\t"
    "4: \n\t"
    // Low-variance path: six pixels change.
    //   temp6 = a saturated to a signed 8-bit range,
    //   temp2 = (9*a + 63) >> 7, temp3 = (18*a + 63) >> 7,
    //   temp4 = (27*a + 63) >> 7.
    //   p2 += temp2, p1 += temp3, p0 += temp4,
    //   q0 -= temp4, q1 -= temp3, q2 -= temp2.
    // temp15/temp11/temp14 hold the store addresses p-3h, p-2h, p-h and
    // temp10/temp12 hold p+h, p+2h.
    "shll_s.w %[temp5], %[temp5], 24 \n\t"
    "subu %[temp14], %[p], %[hstride] \n\t"
    "subu %[temp11], %[temp14], %[hstride] \n\t"
    "sra %[temp6], %[temp5], 24 \n\t"
    "sll %[temp1], %[temp6], 3 \n\t"
    "subu %[temp15], %[temp11], %[hstride] \n\t"
    "addu %[temp2], %[temp6], %[temp1] \n\t"
    "sll %[temp3], %[temp2], 1 \n\t"
    "addu %[temp4], %[temp3], %[temp2] \n\t"
    "addiu %[temp2], %[temp2], 63 \n\t"
    "addiu %[temp3], %[temp3], 63 \n\t"
    "addiu %[temp4], %[temp4], 63 \n\t"
    "sra %[temp2], %[temp2], 7 \n\t"
    "sra %[temp3], %[temp3], 7 \n\t"
    "sra %[temp4], %[temp4], 7 \n\t"
    "addu %[temp1], %[temp8], %[temp2] \n\t"
    "addu %[temp5], %[temp9], %[temp3] \n\t"
    "addu %[temp6], %[temp10], %[temp4] \n\t"
    "subu %[temp8], %[temp7], %[temp4] \n\t"
    "subu %[temp7], %[temp12], %[temp3] \n\t"
    "addu %[temp10], %[p], %[hstride] \n\t"
    "subu %[temp9], %[temp13], %[temp2] \n\t"
    "addu %[temp12], %[temp10], %[hstride] \n\t"
    "lbux %[temp2], %[temp1](%[VP8kclip1]) \n\t"
    "lbux %[temp3], %[temp5](%[VP8kclip1]) \n\t"
    "lbux %[temp4], %[temp6](%[VP8kclip1]) \n\t"
    "lbux %[temp5], %[temp8](%[VP8kclip1]) \n\t"
    "lbux %[temp6], %[temp7](%[VP8kclip1]) \n\t"
    "lbux %[temp8], %[temp9](%[VP8kclip1]) \n\t"
    "sb %[temp2], 0(%[temp15]) \n\t"
    "sb %[temp3], 0(%[temp11]) \n\t"
    "sb %[temp4], 0(%[temp14]) \n\t"
    "sb %[temp5], 0(%[p]) \n\t"
    "sb %[temp6], 0(%[temp10]) \n\t"
    "sb %[temp8], 0(%[temp12]) \n\t"
    "3: \n\t"
    // Next position: loop while the (already decremented) size is > 0; the
    // pointer advance sits in the delay slot.
    "bgtz %[size], 1b \n\t"
    " addu %[p], %[p], %[vstride] \n\t"
    ".set pop \n\t"
    : [temp1]"=&r" (temp1), [temp2]"=&r" (temp2),[temp3]"=&r" (temp3),
      [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), [temp6]"=&r" (temp6),
      [temp7]"=&r" (temp7),[temp8]"=&r" (temp8),[temp9]"=&r" (temp9),
      [temp10]"=&r" (temp10),[temp11]"=&r" (temp11),[temp12]"=&r" (temp12),
      [temp13]"=&r" (temp13),[temp14]"=&r" (temp14),[temp15]"=&r" (temp15),
      [size]"+&r" (size), [p]"+&r" (p)
    : [hstride]"r" (hstride), [thresh2]"r" (thresh2),
      [ithresh]"r" (ithresh),[vstride]"r" (vstride), [hev_thresh]"r" (hev_thresh),
      [VP8kclip1]"r" (VP8kclip1)
    : "memory"
  );
}
297 | |
// In-loop edge filter that modifies at most two pixels on each side of the
// edge (four stores on the normal path, two on the high-variance path).
//   p:          points at the first pixel after the edge (q0).
//   hstride:    byte step between taps across the edge; taps are
//               p3..p0 = p[-4*hstride .. -hstride], q0..q3 = p[0 .. 3*hstride].
//   vstride:    byte step from one filtered position to the next.
//   size:       number of positions.
//   thresh:     edge limit; a position is skipped when
//               4*|p0-q0| + |p1-q1| > 2*thresh + 1.
//   ithresh:    interior limit on neighbouring tap differences.
//   hev_thresh: if |p1-p0| or |q1-q0| exceeds it, only p0/q0 are modified.
// Final pixel values are read from VP8kclip1, indexed by the unclamped result
// -- presumably a clamp-to-[0,255] table defined elsewhere.
// ".set noreorder" is in effect: the instruction after each branch (written
// with a leading space) is its delay slot and always executes.
// NOTE(review): the entry guard is "bltz", so only negative sizes are
// rejected; size == 0 would still run the body once. Visible callers pass 8
// or 16.
static WEBP_INLINE void FilterLoop24(uint8_t* p,
                                     int hstride, int vstride, int size,
                                     int thresh, int ithresh, int hev_thresh) {
  int p0, q0, p1, q1, p2, q2, p3, q3;
  int step1, step2, temp1, temp2, temp3, temp4;
  uint8_t* pTemp0;
  uint8_t* pTemp1;
  const int thresh2 = 2 * thresh + 1;

  __asm__ volatile (
    ".set push \n\t"
    ".set noreorder \n\t"
    "bltz %[size], 3f \n\t"
    " nop \n\t"
    "2: \n\t"
    // Load p0, q0, p1, q1 and run the edge test. step1 walks through the
    // negative offsets (-h, -2h, -3h, ...), step2 through the positive ones.
    // Kept for later: temp1 = p0-q0, temp3 = p1-q1.
    "negu %[step1], %[hstride] \n\t"
    "lbu %[q0], 0(%[p]) \n\t"
    "lbux %[p0], %[step1](%[p]) \n\t"
    "subu %[step1], %[step1], %[hstride] \n\t"
    "lbux %[q1], %[hstride](%[p]) \n\t"
    "subu %[temp1], %[p0], %[q0] \n\t"
    "lbux %[p1], %[step1](%[p]) \n\t"
    "addu %[step2], %[hstride], %[hstride] \n\t"
    "absq_s.w %[temp2], %[temp1] \n\t"
    "subu %[temp3], %[p1], %[q1] \n\t"
    "absq_s.w %[temp4], %[temp3] \n\t"
    "sll %[temp2], %[temp2], 2 \n\t"
    "addu %[temp2], %[temp2], %[temp4] \n\t"
    "subu %[temp4], %[temp2], %[thresh2] \n\t"
    "subu %[step1], %[step1], %[hstride] \n\t"
    "bgtz %[temp4], 0f \n\t"
    " lbux %[p2], %[step1](%[p]) \n\t"
    // Load the remaining taps and run the interior tests against ithresh:
    // |p2-p1|, |p3-p2|, |p1-p0|, |q3-q2|, |q2-q1|, |q1-q0|.
    // Interleaved: temp1 becomes q0-p0, later 3*(q0-p0); pTemp0 = p - hstride;
    // p3 and q3 are recycled to hold |p1-p0| and |q1-q0|.
    "subu %[step1], %[step1], %[hstride] \n\t"
    "lbux %[q2], %[step2](%[p]) \n\t"
    "lbux %[p3], %[step1](%[p]) \n\t"
    "subu %[temp4], %[p2], %[p1] \n\t"
    "addu %[step2], %[step2], %[hstride] \n\t"
    "subu %[temp2], %[p3], %[p2] \n\t"
    "absq_s.w %[temp4], %[temp4] \n\t"
    "absq_s.w %[temp2], %[temp2] \n\t"
    "lbux %[q3], %[step2](%[p]) \n\t"
    "subu %[temp4], %[temp4], %[ithresh] \n\t"
    "negu %[temp1], %[temp1] \n\t"
    "bgtz %[temp4], 0f \n\t"
    " subu %[temp2], %[temp2], %[ithresh] \n\t"
    "subu %[p3], %[p1], %[p0] \n\t"
    "bgtz %[temp2], 0f \n\t"
    " absq_s.w %[p3], %[p3] \n\t"
    "subu %[temp4], %[q3], %[q2] \n\t"
    "subu %[pTemp0], %[p], %[hstride] \n\t"
    "absq_s.w %[temp4], %[temp4] \n\t"
    "subu %[temp2], %[p3], %[ithresh] \n\t"
    "sll %[step1], %[temp1], 1 \n\t"
    "bgtz %[temp2], 0f \n\t"
    " subu %[temp4], %[temp4], %[ithresh] \n\t"
    "subu %[temp2], %[q2], %[q1] \n\t"
    "bgtz %[temp4], 0f \n\t"
    " absq_s.w %[temp2], %[temp2] \n\t"
    "subu %[q3], %[q1], %[q0] \n\t"
    "absq_s.w %[q3], %[q3] \n\t"
    "subu %[temp2], %[temp2], %[ithresh] \n\t"
    "addu %[temp1], %[temp1], %[step1] \n\t"
    "bgtz %[temp2], 0f \n\t"
    " subu %[temp4], %[q3], %[ithresh] \n\t"
    // High-variance flags: p3 = (hev_thresh < |p1-p0|),
    // q3 = (hev_thresh < |q1-q0|); branch to 1: if either is set.
    "slt %[p3], %[hev_thresh], %[p3] \n\t"
    "bgtz %[temp4], 0f \n\t"
    " slt %[q3], %[hev_thresh], %[q3] \n\t"
    "or %[q3], %[q3], %[p3] \n\t"
    "bgtz %[q3], 1f \n\t"
    " shra_r.w %[temp2], %[temp1], 3 \n\t"
    // Normal path, with a = 3*(q0-p0):
    //   temp2 = (a + 4) >> 3 (from the delay slot), temp1 = (a + 3) >> 3,
    //   both saturated to a signed 5-bit range; step1 = (temp2 + 1) >> 1.
    //   p0 += temp1, p1 += step1, q0 -= temp2, q1 -= step1.
    "addiu %[temp1], %[temp1], 3 \n\t"
    "sra %[temp1], %[temp1], 3 \n\t"
    "shll_s.w %[temp2], %[temp2], 27 \n\t"
    "shll_s.w %[temp1], %[temp1], 27 \n\t"
    "addu %[pTemp1], %[p], %[hstride] \n\t"
    "sra %[temp2], %[temp2], 27 \n\t"
    "sra %[temp1], %[temp1], 27 \n\t"
    "addiu %[step1], %[temp2], 1 \n\t"
    "sra %[step1], %[step1], 1 \n\t"
    "addu %[p0], %[p0], %[temp1] \n\t"
    "addu %[p1], %[p1], %[step1] \n\t"
    "subu %[q0], %[q0], %[temp2] \n\t"
    "subu %[q1], %[q1], %[step1] \n\t"
    "lbux %[temp2], %[p0](%[VP8kclip1]) \n\t"
    "lbux %[temp3], %[q0](%[VP8kclip1]) \n\t"
    "lbux %[temp4], %[q1](%[VP8kclip1]) \n\t"
    "sb %[temp2], 0(%[pTemp0]) \n\t"
    "lbux %[temp1], %[p1](%[VP8kclip1]) \n\t"
    // pTemp0 moves from p - hstride to p - 2*hstride for the p1 store.
    "subu %[pTemp0], %[pTemp0], %[hstride] \n\t"
    "sb %[temp3], 0(%[p]) \n\t"
    "sb %[temp4], 0(%[pTemp1]) \n\t"
    "j 0f \n\t"
    " sb %[temp1], 0(%[pTemp0]) \n\t"
    "1: \n\t"
    // High-variance path, with a = 3*(q0-p0) + sat8(p1-q1):
    //   temp2 = (a + 4) >> 3, temp1 = (a + 3) >> 3, both saturated to a
    //   signed 5-bit range; p0 += temp1, q0 -= temp2.
    "shll_s.w %[temp3], %[temp3], 24 \n\t"
    "sra %[temp3], %[temp3], 24 \n\t"
    "addu %[temp1], %[temp1], %[temp3] \n\t"
    "shra_r.w %[temp2], %[temp1], 3 \n\t"
    "addiu %[temp1], %[temp1], 3 \n\t"
    "shll_s.w %[temp2], %[temp2], 27 \n\t"
    "sra %[temp1], %[temp1], 3 \n\t"
    "shll_s.w %[temp1], %[temp1], 27 \n\t"
    "sra %[temp2], %[temp2], 27 \n\t"
    "sra %[temp1], %[temp1], 27 \n\t"
    "addu %[p0], %[p0], %[temp1] \n\t"
    "subu %[q0], %[q0], %[temp2] \n\t"
    "lbux %[temp1], %[p0](%[VP8kclip1]) \n\t"
    "lbux %[temp2], %[q0](%[VP8kclip1]) \n\t"
    "sb %[temp2], 0(%[p]) \n\t"
    "sb %[temp1], 0(%[pTemp0]) \n\t"
    "0: \n\t"
    // Next position; the pointer advance sits in the branch delay slot.
    "subu %[size], %[size], 1 \n\t"
    "bgtz %[size], 2b \n\t"
    " addu %[p], %[p], %[vstride] \n\t"
    "3: \n\t"
    ".set pop \n\t"
    : [p0]"=&r" (p0), [q0]"=&r" (q0), [p1]"=&r" (p1), [q1]"=&r" (q1),
      [p2]"=&r" (p2), [q2]"=&r" (q2), [p3]"=&r" (p3), [q3]"=&r" (q3),
      [step2]"=&r" (step2), [step1]"=&r" (step1), [temp1]"=&r" (temp1),
      [temp2]"=&r" (temp2), [temp3]"=&r" (temp3), [temp4]"=&r" (temp4),
      [pTemp0]"=&r" (pTemp0), [pTemp1]"=&r" (pTemp1), [p]"+&r" (p),
      [size]"+&r" (size)
    : [vstride]"r" (vstride), [ithresh]"r" (ithresh),
      [hev_thresh]"r" (hev_thresh), [hstride]"r" (hstride),
      [VP8kclip1]"r" (VP8kclip1), [thresh2]"r" (thresh2)
    : "memory"
  );
}
426 | |
427 | // on macroblock edges |
428 | static void VFilter16(uint8_t* p, int stride, |
429 | int thresh, int ithresh, int hev_thresh) { |
430 | FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh); |
431 | } |
432 | |
433 | static void HFilter16(uint8_t* p, int stride, |
434 | int thresh, int ithresh, int hev_thresh) { |
435 | FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh); |
436 | } |
437 | |
438 | // 8-pixels wide variant, for chroma filtering |
439 | static void VFilter8(uint8_t* u, uint8_t* v, int stride, |
440 | int thresh, int ithresh, int hev_thresh) { |
441 | FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh); |
442 | FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh); |
443 | } |
444 | |
445 | static void HFilter8(uint8_t* u, uint8_t* v, int stride, |
446 | int thresh, int ithresh, int hev_thresh) { |
447 | FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh); |
448 | FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh); |
449 | } |
450 | |
451 | // on three inner edges |
452 | static void VFilter16i(uint8_t* p, int stride, |
453 | int thresh, int ithresh, int hev_thresh) { |
454 | int k; |
455 | for (k = 3; k > 0; --k) { |
456 | p += 4 * stride; |
457 | FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh); |
458 | } |
459 | } |
460 | |
461 | static void HFilter16i(uint8_t* p, int stride, |
462 | int thresh, int ithresh, int hev_thresh) { |
463 | int k; |
464 | for (k = 3; k > 0; --k) { |
465 | p += 4; |
466 | FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh); |
467 | } |
468 | } |
469 | |
470 | static void VFilter8i(uint8_t* u, uint8_t* v, int stride, |
471 | int thresh, int ithresh, int hev_thresh) { |
472 | FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh); |
473 | FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh); |
474 | } |
475 | |
476 | static void HFilter8i(uint8_t* u, uint8_t* v, int stride, |
477 | int thresh, int ithresh, int hev_thresh) { |
478 | FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh); |
479 | FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh); |
480 | } |
481 | |
482 | #undef MUL |
483 | |
484 | //------------------------------------------------------------------------------ |
485 | // Simple In-loop filtering (Paragraph 15.2) |
486 | |
// Simple in-loop filter over 16 consecutive bytes starting at 'p'; the taps of
// each position are 'stride' bytes apart.
//   p:      points at q0 of the first position; p[-stride] is p0,
//           p[-2*stride] is p1 and p[stride] is q1.
//   stride: byte distance between taps.
//   thresh: a position is filtered only when
//           4*|p0-q0| + |p1-q1| <= 2*thresh + 1.
// Only p0 and q0 are rewritten. The local pointer 'p1' tracks p - stride, i.e.
// the address of p0 (the name does not refer to the p1 tap).
// Final values are read from VP8kclip1, indexed by the unclamped result --
// presumably a clamp-to-[0,255] table defined elsewhere.
// ".set noreorder" is in effect: the instruction after each branch (leading
// space) is its delay slot and always executes.
static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
  int i;
  const int thresh2 = 2 * thresh + 1;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
  uint8_t* p1 = p - stride;
  __asm__ volatile (
    ".set push \n\t"
    ".set noreorder \n\t"
    "li %[i], 16 \n\t"
    "0: \n\t"
    // Load taps: temp0 = p1, temp1 = p0, temp2 = q0, temp3 = q1.
    "negu %[temp4], %[stride] \n\t"
    "sll %[temp5], %[temp4], 1 \n\t"
    "lbu %[temp2], 0(%[p]) \n\t"
    "lbux %[temp3], %[stride](%[p]) \n\t"
    "lbux %[temp1], %[temp4](%[p]) \n\t"
    "lbux %[temp0], %[temp5](%[p]) \n\t"
    // temp5 = 4*|p0-q0| + |p1-q1| - thresh2; skip this position if > 0.
    // temp6 = p1-q1 and temp8 = q0-p0 are kept for the filter below.
    "subu %[temp7], %[temp1], %[temp2] \n\t"
    "subu %[temp6], %[temp0], %[temp3] \n\t"
    "absq_s.w %[temp4], %[temp7] \n\t"
    "absq_s.w %[temp5], %[temp6] \n\t"
    "sll %[temp4], %[temp4], 2 \n\t"
    "subu %[temp5], %[temp5], %[thresh2] \n\t"
    "addu %[temp5], %[temp4], %[temp5] \n\t"
    "negu %[temp8], %[temp7] \n\t"
    "bgtz %[temp5], 1f \n\t"
    " addiu %[i], %[i], -1 \n\t"
    // temp3 = a = 3*(q0-p0) + sat8(p1-q1)
    "sll %[temp4], %[temp8], 1 \n\t"
    "shll_s.w %[temp5], %[temp6], 24 \n\t"
    "addu %[temp3], %[temp4], %[temp8] \n\t"
    "sra %[temp5], %[temp5], 24 \n\t"
    "addu %[temp3], %[temp3], %[temp5] \n\t"
    // temp0 = (a + 3) >> 3 and temp4 = (a + 4) >> 3, each saturated to a
    // signed 5-bit range; p0 += temp0, q0 -= temp4.
    "addiu %[temp7], %[temp3], 3 \n\t"
    "sra %[temp7], %[temp7], 3 \n\t"
    "shra_r.w %[temp8], %[temp3], 3 \n\t"
    "shll_s.w %[temp0], %[temp7], 27 \n\t"
    "shll_s.w %[temp4], %[temp8], 27 \n\t"
    "sra %[temp0], %[temp0], 27 \n\t"
    "sra %[temp4], %[temp4], 27 \n\t"
    "addu %[temp7], %[temp1], %[temp0] \n\t"
    "subu %[temp2], %[temp2], %[temp4] \n\t"
    "lbux %[temp3], %[temp7](%[VP8kclip1]) \n\t"
    "lbux %[temp4], %[temp2](%[VP8kclip1]) \n\t"
    "sb %[temp3], 0(%[p1]) \n\t"
    "sb %[temp4], 0(%[p]) \n\t"
    "1: \n\t"
    // Advance both pointers by one byte and loop while i > 0.
    "addiu %[p1], %[p1], 1 \n\t"
    "bgtz %[i], 0b \n\t"
    " addiu %[p], %[p], 1 \n\t"
    " .set pop \n\t"
    : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2),
      [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5),
      [temp6]"=&r" (temp6), [temp7]"=&r" (temp7), [temp8]"=&r" (temp8),
      [p]"+&r" (p), [i]"=&r" (i), [p1]"+&r" (p1)
    : [stride]"r" (stride), [VP8kclip1]"r" (VP8kclip1), [thresh2]"r" (thresh2)
    : "memory"
  );
}
544 | |
// Emits four 'lbu' (load unsigned byte) instructions as one asm string:
// TEMP0 = SRC[A + A1 * BPS]
// TEMP1 = SRC[B + B1 * BPS]
// TEMP2 = SRC[C + C1 * BPS]
// TEMP3 = SRC[D + D1 * BPS]
// TEMPn and SRC are asm operand names. The offsets are pasted textually as
// "A+A1*<BPS>" and evaluated by the assembler, so they must be literals.
// The last line deliberately carries no continuation backslash, so the
// definition ends here regardless of what follows it.
#define LOAD_4_BYTES(TEMP0, TEMP1, TEMP2, TEMP3, \
                     A, A1, B, B1, C, C1, D, D1, SRC) \
  "lbu %[" #TEMP0 "], " #A "+" #A1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
  "lbu %[" #TEMP1 "], " #B "+" #B1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
  "lbu %[" #TEMP2 "], " #C "+" #C1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
  "lbu %[" #TEMP3 "], " #D "+" #D1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"
555 | |
// Simple in-loop filter over 16 positions spaced 'stride' bytes apart; the
// taps of each position are adjacent bytes.
//   p:      points at q0 of the first position; p[-2], p[-1], p[0], p[1] are
//           p1, p0, q0, q1.
//   stride: byte distance between consecutive positions.
//   thresh: a position is filtered only when
//           4*|p0-q0| + |p1-q1| <= 2*thresh + 1.
// Only p[-1] and p[0] are rewritten. Final values are read from VP8kclip1,
// indexed by the unclamped result -- presumably a clamp-to-[0,255] table
// defined elsewhere.
// ".set noreorder" is in effect: the instruction after each branch (leading
// space) is its delay slot and always executes.
static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
  int i;
  const int thresh2 = 2 * thresh + 1;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
  __asm__ volatile (
    ".set push \n\t"
    ".set noreorder \n\t"
    "li %[i], 16 \n\t"
    "0: \n\t"
    // temp0 = p[-2], temp1 = p[-1], temp2 = p[0], temp3 = p[1]
    LOAD_4_BYTES(temp0, temp1, temp2, temp3, -2, 0, -1, 0, 0, 0, 1, 0, p)
    // temp5 = 4*|p0-q0| + |p1-q1| - thresh2; skip this position if > 0.
    // temp6 = p1-q1 and temp8 = q0-p0 are kept for the filter below.
    "subu %[temp7], %[temp1], %[temp2] \n\t"
    "subu %[temp6], %[temp0], %[temp3] \n\t"
    "absq_s.w %[temp4], %[temp7] \n\t"
    "absq_s.w %[temp5], %[temp6] \n\t"
    "sll %[temp4], %[temp4], 2 \n\t"
    "addu %[temp5], %[temp4], %[temp5] \n\t"
    "subu %[temp5], %[temp5], %[thresh2] \n\t"
    "negu %[temp8], %[temp7] \n\t"
    "bgtz %[temp5], 1f \n\t"
    " addiu %[i], %[i], -1 \n\t"
    // temp3 = a = 3*(q0-p0) + sat8(p1-q1)
    "sll %[temp4], %[temp8], 1 \n\t"
    "shll_s.w %[temp5], %[temp6], 24 \n\t"
    "addu %[temp3], %[temp4], %[temp8] \n\t"
    "sra %[temp5], %[temp5], 24 \n\t"
    "addu %[temp3], %[temp3], %[temp5] \n\t"
    // temp0 = (a + 3) >> 3 and temp4 = (a + 4) >> 3, each saturated to a
    // signed 5-bit range; p0 += temp0, q0 -= temp4.
    "addiu %[temp7], %[temp3], 3 \n\t"
    "sra %[temp7], %[temp7], 3 \n\t"
    "shra_r.w %[temp8], %[temp3], 3 \n\t"
    "shll_s.w %[temp0], %[temp7], 27 \n\t"
    "shll_s.w %[temp4], %[temp8], 27 \n\t"
    "sra %[temp0], %[temp0], 27 \n\t"
    "sra %[temp4], %[temp4], 27 \n\t"
    "addu %[temp7], %[temp1], %[temp0] \n\t"
    "subu %[temp2], %[temp2], %[temp4] \n\t"
    "lbux %[temp3], %[temp7](%[VP8kclip1]) \n\t"
    "lbux %[temp4], %[temp2](%[VP8kclip1]) \n\t"
    "sb %[temp3], -1(%[p]) \n\t"
    "sb %[temp4], 0(%[p]) \n\t"
    "1: \n\t"
    // Loop while i > 0; the pointer advance sits in the delay slot.
    "bgtz %[i], 0b \n\t"
    " addu %[p], %[p], %[stride] \n\t"
    ".set pop \n\t"
    : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2),
      [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5),
      [temp6]"=&r" (temp6), [temp7]"=&r" (temp7), [temp8]"=&r" (temp8),
      [p]"+&r" (p), [i]"=&r" (i)
    : [stride]"r" (stride), [VP8kclip1]"r" (VP8kclip1), [thresh2]"r" (thresh2)
    : "memory"
  );
}
606 | |
607 | static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) { |
608 | int k; |
609 | for (k = 3; k > 0; --k) { |
610 | p += 4 * stride; |
611 | SimpleVFilter16(p, stride, thresh); |
612 | } |
613 | } |
614 | |
615 | static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) { |
616 | int k; |
617 | for (k = 3; k > 0; --k) { |
618 | p += 4; |
619 | SimpleHFilter16(p, stride, thresh); |
620 | } |
621 | } |
622 | |
// Emits two 'usw' (unaligned 32-bit store) instructions, 8 bytes in total:
// DST[A * BPS] = TEMP0
// DST[B + C * BPS] = TEMP1
// TEMPn and DST are asm operand names; the offsets are pasted textually and
// evaluated by the assembler, so A, B and C must be literals.
#define STORE_8_BYTES(TEMP0, TEMP1, A, B, C, DST) \
  "usw %[" #TEMP0 "], " #A "*" XSTR(BPS) "(%[" #DST "]) \n\t" \
  "usw %[" #TEMP1 "], " #B "+" #C "*" XSTR(BPS) "(%[" #DST "]) \n\t"
628 | |
// Vertical prediction for a 4x4 block: one smoothed copy of the row above
// 'dst' is written to all four rows.
// Reads the six bytes top[-1..4] (top = dst - BPS) and forms, for i = 0..3,
//   (top[i-1] + 2*top[i] + top[i+1] + 2) >> 2
// NOTE(review): the lane bookkeeping below assumes little-endian byte order
// for the ulw/ulh loads -- confirm for big-endian targets.
static void VE4(uint8_t* dst) { // vertical
  const uint8_t* top = dst - BPS;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
  __asm__ volatile (
    // temp0 = top[-1..2], temp1 = top[3..4]
    "ulw %[temp0], -1(%[top]) \n\t"
    "ulh %[temp1], 3(%[top]) \n\t"
    // Widen bytes to halfword pairs: temp2 = bytes 0..1 of temp0,
    // temp3 = bytes 2..3 of temp0, temp4 = bytes 0..1 of temp1.
    "preceu.ph.qbr %[temp2], %[temp0] \n\t"
    "preceu.ph.qbl %[temp3], %[temp0] \n\t"
    "preceu.ph.qbr %[temp4], %[temp1] \n\t"
    // temp5 / temp6 = the centre taps (pairs shifted by one element).
    "packrl.ph %[temp5], %[temp3], %[temp2] \n\t"
    "packrl.ph %[temp6], %[temp4], %[temp3] \n\t"
    // 2 * centre + both neighbours, then rounded shift right by 2.
    "shll.ph %[temp5], %[temp5], 1 \n\t"
    "shll.ph %[temp6], %[temp6], 1 \n\t"
    "addq.ph %[temp2], %[temp5], %[temp2] \n\t"
    "addq.ph %[temp6], %[temp6], %[temp4] \n\t"
    "addq.ph %[temp2], %[temp2], %[temp3] \n\t"
    "addq.ph %[temp6], %[temp6], %[temp3] \n\t"
    "shra_r.ph %[temp2], %[temp2], 2 \n\t"
    "shra_r.ph %[temp6], %[temp6], 2 \n\t"
    // Narrow the four halfwords back to four bytes in temp4.
    "precr.qb.ph %[temp4], %[temp6], %[temp2] \n\t"
    // Same word to rows 0, 1, 2 and 3.
    STORE_8_BYTES(temp4, temp4, 0, 0, 1, dst)
    STORE_8_BYTES(temp4, temp4, 2, 0, 3, dst)
    : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2),
      [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5),
      [temp6]"=&r" (temp6)
    : [top]"r" (top), [dst]"r" (dst)
    : "memory"
  );
}
658 | |
// DC prediction for a 4x4 block: fills all 16 bytes with the rounded average
// of the four bytes above (dst[-BPS .. -BPS+3]) and the four bytes to the left
// (dst[-1 + k*BPS], k = 0..3), i.e. (sum + 4) >> 3.
static void DC4(uint8_t* dst) { // DC
  int temp0, temp1, temp2, temp3, temp4;
  __asm__ volatile (
    // temp0 = the four bytes of the row above.
    "ulw %[temp0], -1*" XSTR(BPS) "(%[dst]) \n\t"
    // temp1..temp4 = the left-column bytes of rows 0..3.
    LOAD_4_BYTES(temp1, temp2, temp3, temp4, -1, 0, -1, 1, -1, 2, -1, 3, dst)
    // Pack the four left bytes into temp1 so one raddu can sum them.
    "ins %[temp1], %[temp2], 8, 8 \n\t"
    "ins %[temp1], %[temp3], 16, 8 \n\t"
    "ins %[temp1], %[temp4], 24, 8 \n\t"
    // Sum the bytes of each word, add both sums, rounded shift right by 3.
    "raddu.w.qb %[temp0], %[temp0] \n\t"
    "raddu.w.qb %[temp1], %[temp1] \n\t"
    "addu %[temp0], %[temp0], %[temp1] \n\t"
    "shra_r.w %[temp0], %[temp0], 3 \n\t"
    // Replicate the resulting byte into all four byte lanes and store 4 rows.
    "replv.qb %[temp0], %[temp0] \n\t"
    STORE_8_BYTES(temp0, temp0, 0, 0, 1, dst)
    STORE_8_BYTES(temp0, temp0, 2, 0, 3, dst)
    : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2),
      [temp3]"=&r" (temp3), [temp4]"=&r" (temp4)
    : [dst]"r" (dst)
    : "memory"
  );
}
680 | |
// Down-right prediction for a 4x4 block.
// Inputs read: the left column dst[-1 + k*BPS] (k = 0..3), the four bytes
// starting at dst[-1 - BPS] (top-left corner and the next three bytes of the
// row above) and the byte at dst[3 - BPS].
// Neighbouring samples are combined as (a + 2*b + c + 2) >> 2 (shll.ph by 1,
// addq.ph, shra_r.ph by 2) on halfword pairs; rows 3 and 1 are stored first,
// then a new byte is shifted into each word with 'prepend' to produce rows 2
// and 0.
// NOTE(review): lane bookkeeping assumes little-endian byte order for the ulw
// load -- confirm for big-endian targets. Registers are reused heavily;
// statement order matters.
static void RD4(uint8_t* dst) { // Down-right
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8;
  __asm__ volatile (
    // temp0..temp3 = left-column bytes of rows 0..3.
    LOAD_4_BYTES(temp0, temp1, temp2, temp3, -1, 0, -1, 1, -1, 2, -1, 3, dst)
    // temp7 = four bytes starting at the top-left corner.
    "ulw %[temp7], -1-" XSTR(BPS) "(%[dst]) \n\t"
    // Build halfword pairs of adjacent left-column samples
    // (temp1 = {row0,row1}, temp2 = {row1,row2}, temp3 = {row2,row3}) and
    // widen the top bytes (temp5 = low two bytes, temp4 = high two bytes).
    "ins %[temp1], %[temp0], 16, 16 \n\t"
    "preceu.ph.qbr %[temp5], %[temp7] \n\t"
    "ins %[temp2], %[temp1], 16, 16 \n\t"
    "preceu.ph.qbl %[temp4], %[temp7] \n\t"
    "ins %[temp3], %[temp2], 16, 16 \n\t"
    // Three groups of weighted sums accumulate in temp3, temp1 and temp8.
    "shll.ph %[temp2], %[temp2], 1 \n\t"
    "addq.ph %[temp3], %[temp3], %[temp1] \n\t"
    "packrl.ph %[temp6], %[temp5], %[temp1] \n\t"
    "addq.ph %[temp3], %[temp3], %[temp2] \n\t"
    "addq.ph %[temp1], %[temp1], %[temp5] \n\t"
    "shll.ph %[temp6], %[temp6], 1 \n\t"
    "addq.ph %[temp1], %[temp1], %[temp6] \n\t"
    "packrl.ph %[temp0], %[temp4], %[temp5] \n\t"
    "addq.ph %[temp8], %[temp5], %[temp4] \n\t"
    "shra_r.ph %[temp3], %[temp3], 2 \n\t"
    "shll.ph %[temp0], %[temp0], 1 \n\t"
    "shra_r.ph %[temp1], %[temp1], 2 \n\t"
    "addq.ph %[temp8], %[temp0], %[temp8] \n\t"
    // Last output sample: sum of the bytes of temp7 after its upper half is
    // duplicated and its low byte replaced by dst[3 - BPS], rounded >> 2.
    "lbu %[temp5], 3-" XSTR(BPS) "(%[dst]) \n\t"
    "precrq.ph.w %[temp7], %[temp7], %[temp7] \n\t"
    "shra_r.ph %[temp8], %[temp8], 2 \n\t"
    "ins %[temp7], %[temp5], 0, 8 \n\t"
    "precr.qb.ph %[temp2], %[temp1], %[temp3] \n\t"
    "raddu.w.qb %[temp4], %[temp7] \n\t"
    "precr.qb.ph %[temp6], %[temp8], %[temp1] \n\t"
    "shra_r.w %[temp4], %[temp4], 2 \n\t"
    // Row 3 = temp2, row 1 = temp6.
    STORE_8_BYTES(temp2, temp6, 3, 0, 1, dst)
    // Shift each word right by one byte, inserting a new top byte.
    "prepend %[temp2], %[temp8], 8 \n\t"
    "prepend %[temp6], %[temp4], 8 \n\t"
    // Row 2 = temp2, row 0 = temp6.
    STORE_8_BYTES(temp2, temp6, 2, 0, 0, dst)
    : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2),
      [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5),
      [temp6]"=&r" (temp6), [temp7]"=&r" (temp7), [temp8]"=&r" (temp8)
    : [dst]"r" (dst)
    : "memory"
  );
}
724 | |
// Emits two 'ulw' (unaligned 32-bit load) instructions, 8 bytes in total:
// TEMP0 = SRC[A * BPS]
// TEMP1 = SRC[B + C * BPS]
// TEMPn and SRC are asm operand names; the offsets are pasted textually and
// evaluated by the assembler, so A, B and C must be literals.
#define LOAD_8_BYTES(TEMP0, TEMP1, A, B, C, SRC) \
  "ulw %[" #TEMP0 "], " #A "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
  "ulw %[" #TEMP1 "], " #B "+" #C "*" XSTR(BPS) "(%[" #SRC "]) \n\t"
730 | |
// Down-left prediction for a 4x4 block.
// Inputs read: the eight bytes of the row above, dst[-BPS .. -BPS+7].
// Neighbouring samples are combined as (a + 2*b + c + 2) >> 2 on halfword
// pairs; rows 0 and 2 are stored first, then a new byte is shifted into each
// word with 'prepend' to produce rows 1 and 3.
// NOTE(review): lane bookkeeping assumes little-endian byte order for the ulw
// loads -- confirm for big-endian targets. Registers are reused heavily;
// statement order matters.
static void LD4(uint8_t* dst) { // Down-Left
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;
  __asm__ volatile (
    // temp0 = bytes 0..3 of the row above, temp1 = bytes 4..7.
    LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
    // Widen to halfword pairs: temp3/temp2 = low/high pair of temp0,
    // temp4/temp5 = low/high pair of temp1.
    "preceu.ph.qbl %[temp2], %[temp0] \n\t"
    "preceu.ph.qbr %[temp3], %[temp0] \n\t"
    "preceu.ph.qbr %[temp4], %[temp1] \n\t"
    "preceu.ph.qbl %[temp5], %[temp1] \n\t"
    // temp6..temp8 = the centre taps (pairs shifted by one element).
    "packrl.ph %[temp6], %[temp2], %[temp3] \n\t"
    "packrl.ph %[temp7], %[temp4], %[temp2] \n\t"
    "packrl.ph %[temp8], %[temp5], %[temp4] \n\t"
    // Weighted sums accumulate in temp9, temp3 and temp0.
    "shll.ph %[temp6], %[temp6], 1 \n\t"
    "addq.ph %[temp9], %[temp2], %[temp6] \n\t"
    "shll.ph %[temp7], %[temp7], 1 \n\t"
    "addq.ph %[temp9], %[temp9], %[temp3] \n\t"
    "shll.ph %[temp8], %[temp8], 1 \n\t"
    "shra_r.ph %[temp9], %[temp9], 2 \n\t"
    "addq.ph %[temp3], %[temp4], %[temp7] \n\t"
    "addq.ph %[temp0], %[temp5], %[temp8] \n\t"
    "addq.ph %[temp3], %[temp3], %[temp2] \n\t"
    "addq.ph %[temp0], %[temp0], %[temp4] \n\t"
    "shra_r.ph %[temp3], %[temp3], 2 \n\t"
    "shra_r.ph %[temp0], %[temp0], 2 \n\t"
    // Last output sample: temp1 = (2 * (top byte of temp1) + sum of the two
    // samples in temp5), rounded >> 2.
    "srl %[temp1], %[temp1], 24 \n\t"
    "sll %[temp1], %[temp1], 1 \n\t"
    "raddu.w.qb %[temp5], %[temp5] \n\t"
    "precr.qb.ph %[temp9], %[temp3], %[temp9] \n\t"
    "precr.qb.ph %[temp3], %[temp0], %[temp3] \n\t"
    "addu %[temp1], %[temp1], %[temp5] \n\t"
    "shra_r.w %[temp1], %[temp1], 2 \n\t"
    // Row 0 = temp9, row 2 = temp3.
    STORE_8_BYTES(temp9, temp3, 0, 0, 2, dst)
    // Shift each word right by one byte, inserting a new top byte.
    "prepend %[temp9], %[temp0], 8 \n\t"
    "prepend %[temp3], %[temp1], 8 \n\t"
    // Row 1 = temp9, row 3 = temp3.
    STORE_8_BYTES(temp9, temp3, 1, 0, 3, dst)
    : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2),
      [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5),
      [temp6]"=&r" (temp6), [temp7]"=&r" (temp7), [temp8]"=&r" (temp8),
      [temp9]"=&r" (temp9)
    : [dst]"r" (dst)
    : "memory"
  );
}
774 | |
775 | //------------------------------------------------------------------------------ |
776 | // Chroma |
777 | |
778 | static void DC8uv(uint8_t* dst) { // DC |
779 | int temp0, temp1, temp2, temp3, temp4; |
780 | int temp5, temp6, temp7, temp8, temp9; |
781 | __asm__ volatile ( |
782 | LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst) |
783 | LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst) |
784 | LOAD_4_BYTES(temp6, temp7, temp8, temp9, -1, 4, -1, 5, -1, 6, -1, 7, dst) |
785 | "raddu.w.qb %[temp0], %[temp0] \n\t" |
786 | "raddu.w.qb %[temp1], %[temp1] \n\t" |
787 | "addu %[temp2], %[temp2], %[temp3] \n\t" |
788 | "addu %[temp4], %[temp4], %[temp5] \n\t" |
789 | "addu %[temp6], %[temp6], %[temp7] \n\t" |
790 | "addu %[temp8], %[temp8], %[temp9] \n\t" |
791 | "addu %[temp0], %[temp0], %[temp1] \n\t" |
792 | "addu %[temp2], %[temp2], %[temp4] \n\t" |
793 | "addu %[temp6], %[temp6], %[temp8] \n\t" |
794 | "addu %[temp0], %[temp0], %[temp2] \n\t" |
795 | "addu %[temp0], %[temp0], %[temp6] \n\t" |
796 | "shra_r.w %[temp0], %[temp0], 4 \n\t" |
797 | "replv.qb %[temp0], %[temp0] \n\t" |
798 | STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst) |
799 | STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst) |
800 | STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst) |
801 | STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst) |
802 | STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst) |
803 | STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst) |
804 | STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst) |
805 | STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst) |
806 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
807 | [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), |
808 | [temp6]"=&r" (temp6), [temp7]"=&r" (temp7), [temp8]"=&r" (temp8), |
809 | [temp9]"=&r" (temp9) |
810 | : [dst]"r" (dst) |
811 | : "memory" |
812 | ); |
813 | } |
814 | |
815 | static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples |
816 | int temp0, temp1; |
817 | __asm__ volatile ( |
818 | LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst) |
819 | "raddu.w.qb %[temp0], %[temp0] \n\t" |
820 | "raddu.w.qb %[temp1], %[temp1] \n\t" |
821 | "addu %[temp0], %[temp0], %[temp1] \n\t" |
822 | "shra_r.w %[temp0], %[temp0], 3 \n\t" |
823 | "replv.qb %[temp0], %[temp0] \n\t" |
824 | STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst) |
825 | STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst) |
826 | STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst) |
827 | STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst) |
828 | STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst) |
829 | STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst) |
830 | STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst) |
831 | STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst) |
832 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1) |
833 | : [dst]"r" (dst) |
834 | : "memory" |
835 | ); |
836 | } |
837 | |
838 | static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples |
839 | int temp0, temp1, temp2, temp3, temp4; |
840 | int temp5, temp6, temp7, temp8; |
841 | __asm__ volatile ( |
842 | LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst) |
843 | LOAD_4_BYTES(temp6, temp7, temp8, temp1, -1, 4, -1, 5, -1, 6, -1, 7, dst) |
844 | "addu %[temp2], %[temp2], %[temp3] \n\t" |
845 | "addu %[temp4], %[temp4], %[temp5] \n\t" |
846 | "addu %[temp6], %[temp6], %[temp7] \n\t" |
847 | "addu %[temp8], %[temp8], %[temp1] \n\t" |
848 | "addu %[temp2], %[temp2], %[temp4] \n\t" |
849 | "addu %[temp6], %[temp6], %[temp8] \n\t" |
850 | "addu %[temp0], %[temp6], %[temp2] \n\t" |
851 | "shra_r.w %[temp0], %[temp0], 3 \n\t" |
852 | "replv.qb %[temp0], %[temp0] \n\t" |
853 | STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst) |
854 | STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst) |
855 | STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst) |
856 | STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst) |
857 | STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst) |
858 | STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst) |
859 | STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst) |
860 | STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst) |
861 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
862 | [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), |
863 | [temp6]"=&r" (temp6), [temp7]"=&r" (temp7), [temp8]"=&r" (temp8) |
864 | : [dst]"r" (dst) |
865 | : "memory" |
866 | ); |
867 | } |
868 | |
// The load/store asm helper macros are not used past this point.
#undef LOAD_4_BYTES
#undef LOAD_8_BYTES
#undef STORE_8_BYTES
872 | |
// Asm fragment used by CLIP_8B_TO_DST.
// On entry %[temp0] (and %[temp1] when WIDTH == 8) holds four packed bytes
// and %[dst_1] holds a pair of 16-bit offsets. Each byte is widened to
// 16 bits, the offset is added, and the sum is clamped to 0..255 and packed
// back into %[temp0] (and %[temp1]). %[temp2] / %[temp3] are scratch.
// The ".if" lines are assembler conditionals: the %[temp1] / %[temp3] half is
// only assembled when WIDTH == 8.
#define CLIPPING(WIDTH)                                                        \
  /* Widen: upper byte pair -> scratch, lower byte pair stays in place. */     \
  "preceu.ph.qbl %[temp2], %[temp0] \n\t"                                      \
  "preceu.ph.qbr %[temp0], %[temp0] \n\t"                                      \
  ".if " #WIDTH " == 8 \n\t"                                                   \
  "preceu.ph.qbl %[temp3], %[temp1] \n\t"                                      \
  "preceu.ph.qbr %[temp1], %[temp1] \n\t"                                      \
  ".endif \n\t"                                                                \
  /* Add the per-lane offset. */                                               \
  "addu.ph %[temp2], %[temp2], %[dst_1] \n\t"                                  \
  "addu.ph %[temp0], %[temp0], %[dst_1] \n\t"                                  \
  ".if " #WIDTH " == 8 \n\t"                                                   \
  "addu.ph %[temp3], %[temp3], %[dst_1] \n\t"                                  \
  "addu.ph %[temp1], %[temp1], %[dst_1] \n\t"                                  \
  ".endif \n\t"                                                                \
  /* Saturating shift left by 7: sums above 255 pin at the 16-bit maximum. */  \
  "shll_s.ph %[temp2], %[temp2], 7 \n\t"                                       \
  "shll_s.ph %[temp0], %[temp0], 7 \n\t"                                       \
  ".if " #WIDTH " == 8 \n\t"                                                   \
  "shll_s.ph %[temp3], %[temp3], 7 \n\t"                                       \
  "shll_s.ph %[temp1], %[temp1], 7 \n\t"                                       \
  ".endif \n\t"                                                                \
  /* Narrow to bytes with unsigned saturation: negatives become 0. */          \
  "precrqu_s.qb.ph %[temp0], %[temp2], %[temp0] \n\t"                          \
  ".if " #WIDTH " == 8 \n\t"                                                   \
  "precrqu_s.qb.ph %[temp1], %[temp3], %[temp1] \n\t"                          \
  ".endif \n\t"
896 | |
897 | |
// Writes one row of SIZE (4, 8 or 16) bytes at DST: each byte of TOP gets
// (DST[-1] - low 16 bits of top_1) added and is clamped by CLIPPING.
// The asm subtracts top_1 from the duplicated DST[-1] lane-wise, then runs
// CLIPPING on 4 bytes (SIZE < 8), 8 bytes, or 8 + 8 bytes (SIZE == 16).
// NOTE: `top_1` is not a parameter; it must be an int visible at the
// expansion site (CLIP_TO_DST declares it).
#define CLIP_8B_TO_DST(DST, TOP, SIZE) do {                                    \
  const int left_px = (DST)[-1];                                               \
  /* Same value in both 16-bit halves; updated in place by the asm. */         \
  int delta = (left_px << 16) + left_px;                                       \
  int px_a, px_b, wide_a, wide_b;                                              \
  __asm__ volatile (                                                           \
    ".if " #SIZE " < 8 \n\t"                                                   \
    "ulw %[temp0], 0(%[top]) \n\t"                                             \
    "subu.ph %[dst_1], %[dst_1], %[top_1] \n\t"                                \
    CLIPPING(4)                                                                \
    "usw %[temp0], 0(%[dst]) \n\t"                                             \
    ".else \n\t"                                                               \
    "ulw %[temp0], 0(%[top]) \n\t"                                             \
    "ulw %[temp1], 4(%[top]) \n\t"                                             \
    "subu.ph %[dst_1], %[dst_1], %[top_1] \n\t"                                \
    CLIPPING(8)                                                                \
    "usw %[temp0], 0(%[dst]) \n\t"                                             \
    "usw %[temp1], 4(%[dst]) \n\t"                                             \
    ".if " #SIZE " == 16 \n\t"                                                 \
    "ulw %[temp0], 8(%[top]) \n\t"                                             \
    "ulw %[temp1], 12(%[top]) \n\t"                                            \
    CLIPPING(8)                                                                \
    "usw %[temp0], 8(%[dst]) \n\t"                                             \
    "usw %[temp1], 12(%[dst]) \n\t"                                            \
    ".endif \n\t"                                                              \
    ".endif \n\t"                                                              \
    /* Operand names are fixed: CLIPPING refers to them literally. */          \
    : [dst_1]"+&r"(delta), [temp0]"=&r"(px_a), [temp1]"=&r"(px_b),             \
      [temp2]"=&r"(wide_a), [temp3]"=&r"(wide_b)                               \
    : [top_1]"r"(top_1), [top]"r"((TOP)), [dst]"r"((DST))                      \
    : "memory"                                                                 \
  );                                                                           \
} while (0)
928 | |
// Fills SIZE rows starting at DST using CLIP_8B_TO_DST.
// `top` stays fixed on the row just above the initial DST, while DST itself
// is advanced by BPS after every row, so the argument is modified.
// `top_1` holds top[-1] duplicated in both 16-bit halves; it keeps this exact
// name because CLIP_8B_TO_DST picks it up from the enclosing scope.
#define CLIP_TO_DST(DST, SIZE) do {                                            \
  const uint8_t* top = (DST) - BPS;                                            \
  const int top_left = top[-1];                                                \
  const int top_1 = (top_left << 16) + top_left;                               \
  int y = 0;                                                                   \
  while (y < (SIZE)) {                                                         \
    CLIP_8B_TO_DST((DST), top, (SIZE));                                        \
    (DST) += BPS;                                                              \
    ++y;                                                                       \
  }                                                                            \
} while (0)
938 | |
// Defines `static void TrueMotion<N>(uint8_t* PTR)`, whose whole body is
// CLIP_TO_DST applied to an N x N block at PTR.
#define TRUE_MOTION(PTR, N)                                                    \
static void TrueMotion##N(uint8_t* (PTR)) {                                    \
  CLIP_TO_DST((PTR), (N));                                                     \
}

// Instantiate TrueMotion4, TrueMotion8 and TrueMotion16.
TRUE_MOTION(dst, 4)
TRUE_MOTION(dst, 8)
TRUE_MOTION(dst, 16)

// The generator macros are not needed past this point.
#undef TRUE_MOTION
#undef CLIP_TO_DST
#undef CLIP_8B_TO_DST
#undef CLIPPING
952 | |
953 | //------------------------------------------------------------------------------ |
954 | // Entry point |
955 | |
956 | extern void VP8DspInitMIPSdspR2(void); |
957 | |
958 | WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPSdspR2(void) { |
959 | VP8TransformDC = TransformDC; |
960 | VP8TransformAC3 = TransformAC3; |
961 | VP8Transform = TransformTwo; |
962 | |
963 | VP8VFilter16 = VFilter16; |
964 | VP8HFilter16 = HFilter16; |
965 | VP8VFilter8 = VFilter8; |
966 | VP8HFilter8 = HFilter8; |
967 | VP8VFilter16i = VFilter16i; |
968 | VP8HFilter16i = HFilter16i; |
969 | VP8VFilter8i = VFilter8i; |
970 | VP8HFilter8i = HFilter8i; |
971 | VP8SimpleVFilter16 = SimpleVFilter16; |
972 | VP8SimpleHFilter16 = SimpleHFilter16; |
973 | VP8SimpleVFilter16i = SimpleVFilter16i; |
974 | VP8SimpleHFilter16i = SimpleHFilter16i; |
975 | |
976 | VP8PredLuma4[0] = DC4; |
977 | VP8PredLuma4[1] = TrueMotion4; |
978 | VP8PredLuma4[2] = VE4; |
979 | VP8PredLuma4[4] = RD4; |
980 | VP8PredLuma4[6] = LD4; |
981 | |
982 | VP8PredChroma8[0] = DC8uv; |
983 | VP8PredChroma8[1] = TrueMotion8; |
984 | VP8PredChroma8[4] = DC8uvNoTop; |
985 | VP8PredChroma8[5] = DC8uvNoLeft; |
986 | |
987 | VP8PredLuma16[1] = TrueMotion16; |
988 | } |
989 | |
990 | #else // !WEBP_USE_MIPS_DSP_R2 |
991 | |
992 | WEBP_DSP_INIT_STUB(VP8DspInitMIPSdspR2) |
993 | |
994 | #endif // WEBP_USE_MIPS_DSP_R2 |
995 | |