1 | // Copyright 2014 Google Inc. All Rights Reserved. |
2 | // |
3 | // Use of this source code is governed by a BSD-style license |
4 | // that can be found in the COPYING file in the root of the source |
5 | // tree. An additional intellectual property rights grant can be found |
6 | // in the file PATENTS. All contributing project authors may |
7 | // be found in the AUTHORS file in the root of the source tree. |
8 | // ----------------------------------------------------------------------------- |
9 | // |
10 | // MIPS version of speed-critical encoding functions. |
11 | // |
12 | // Author(s): Darko Laus (darko.laus@imgtec.com) |
13 | // Mirko Raus (mirko.raus@imgtec.com) |
14 | |
15 | #include "src/dsp/dsp.h" |
16 | |
17 | #if defined(WEBP_USE_MIPS_DSP_R2) |
18 | |
19 | #include "src/dsp/mips_macro.h" |
20 | #include "src/enc/cost_enc.h" |
21 | #include "src/enc/vp8i_enc.h" |
22 | |
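// Fixed-point (Q16) cosine constants for the VP8 transforms:
//   kC2 = round(sqrt(2) * sin(pi / 8) * 65536) = 35468
//   kC1 = round((sqrt(2) * cos(pi / 8) - 1) * 65536) + (1 << 16)
//       = 20091 + 65536,
// i.e. the full Q16 value of sqrt(2) * cos(pi / 8), so that a single Q16
// multiply (x * kC1) >> 16 computes x + ((x * 20091) >> 16).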
23 | static const int kC1 = 20091 + (1 << 16); |
24 | static const int kC2 = 35468; |
25 | |
26 | // O - output |
27 | // I - input (macro doesn't change it) |
28 | #define ADD_SUB_HALVES_X4(O0, O1, O2, O3, O4, O5, O6, O7, \ |
29 | I0, I1, I2, I3, I4, I5, I6, I7) \ |
30 | "addq.ph %[" #O0 "], %[" #I0 "], %[" #I1 "] \n\t" \ |
31 | "subq.ph %[" #O1 "], %[" #I0 "], %[" #I1 "] \n\t" \ |
32 | "addq.ph %[" #O2 "], %[" #I2 "], %[" #I3 "] \n\t" \ |
33 | "subq.ph %[" #O3 "], %[" #I2 "], %[" #I3 "] \n\t" \ |
34 | "addq.ph %[" #O4 "], %[" #I4 "], %[" #I5 "] \n\t" \ |
35 | "subq.ph %[" #O5 "], %[" #I4 "], %[" #I5 "] \n\t" \ |
36 | "addq.ph %[" #O6 "], %[" #I6 "], %[" #I7 "] \n\t" \ |
37 | "subq.ph %[" #O7 "], %[" #I6 "], %[" #I7 "] \n\t" |
38 | |
39 | // IO - input/output |
40 | #define ABS_X8(IO0, IO1, IO2, IO3, IO4, IO5, IO6, IO7) \ |
41 | "absq_s.ph %[" #IO0 "], %[" #IO0 "] \n\t" \ |
42 | "absq_s.ph %[" #IO1 "], %[" #IO1 "] \n\t" \ |
43 | "absq_s.ph %[" #IO2 "], %[" #IO2 "] \n\t" \ |
44 | "absq_s.ph %[" #IO3 "], %[" #IO3 "] \n\t" \ |
45 | "absq_s.ph %[" #IO4 "], %[" #IO4 "] \n\t" \ |
46 | "absq_s.ph %[" #IO5 "], %[" #IO5 "] \n\t" \ |
47 | "absq_s.ph %[" #IO6 "], %[" #IO6 "] \n\t" \ |
48 | "absq_s.ph %[" #IO7 "], %[" #IO7 "] \n\t" |
49 | |
// dpa.w.ph $ac0, temp0, temp1
// $ac += temp0[31..16] * temp1[31..16] + temp0[15..0] * temp1[15..0]
// dpax.w.ph $ac0, temp0, temp1
// $ac += temp0[31..16] * temp1[15..0] + temp0[15..0] * temp1[31..16]
54 | // O - output |
55 | // I - input (macro doesn't change it) |
56 | #define MUL_HALF(O0, I0, I1, I2, I3, I4, I5, I6, I7, \ |
57 | I8, I9, I10, I11, I12, I13, I14, I15) \ |
58 | "mult $ac0, $zero, $zero \n\t" \ |
59 | "dpa.w.ph $ac0, %[" #I2 "], %[" #I0 "] \n\t" \ |
60 | "dpax.w.ph $ac0, %[" #I5 "], %[" #I6 "] \n\t" \ |
61 | "dpa.w.ph $ac0, %[" #I8 "], %[" #I9 "] \n\t" \ |
62 | "dpax.w.ph $ac0, %[" #I11 "], %[" #I4 "] \n\t" \ |
63 | "dpa.w.ph $ac0, %[" #I12 "], %[" #I7 "] \n\t" \ |
64 | "dpax.w.ph $ac0, %[" #I13 "], %[" #I1 "] \n\t" \ |
65 | "dpa.w.ph $ac0, %[" #I14 "], %[" #I3 "] \n\t" \ |
66 | "dpax.w.ph $ac0, %[" #I15 "], %[" #I10 "] \n\t" \ |
67 | "mflo %[" #O0 "], $ac0 \n\t" |
68 | |
69 | #define OUTPUT_EARLY_CLOBBER_REGS_17() \ |
70 | OUTPUT_EARLY_CLOBBER_REGS_10(), \ |
71 | [temp11]"=&r"(temp11), [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), \ |
72 | [temp14]"=&r"(temp14), [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), \ |
73 | [temp17]"=&r"(temp17) |
74 | |
// macro for one horizontal pass in FTransform
// temp0..temp15 hold tmp[0]..tmp[15]
// A - row index; loads are done at byte offset A * BPS in the src and ref
//     buffers
// TEMP0..TEMP3 - registers for corresponding tmp elements
79 | #define HORIZONTAL_PASS(A, TEMP0, TEMP1, TEMP2, TEMP3) \ |
80 | "lw %[" #TEMP0 "], 0(%[args]) \n\t" \ |
81 | "lw %[" #TEMP1 "], 4(%[args]) \n\t" \ |
82 | "lw %[" #TEMP2 "], " XSTR(BPS) "*" #A "(%[" #TEMP0 "]) \n\t" \ |
83 | "lw %[" #TEMP3 "], " XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \ |
84 | "preceu.ph.qbl %[" #TEMP0 "], %[" #TEMP2 "] \n\t" \ |
85 | "preceu.ph.qbl %[" #TEMP1 "], %[" #TEMP3 "] \n\t" \ |
86 | "preceu.ph.qbr %[" #TEMP2 "], %[" #TEMP2 "] \n\t" \ |
87 | "preceu.ph.qbr %[" #TEMP3 "], %[" #TEMP3 "] \n\t" \ |
88 | "subq.ph %[" #TEMP0 "], %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \ |
89 | "subq.ph %[" #TEMP2 "], %[" #TEMP2 "], %[" #TEMP3 "] \n\t" \ |
90 | "rotr %[" #TEMP0 "], %[" #TEMP0 "], 16 \n\t" \ |
91 | "addq.ph %[" #TEMP1 "], %[" #TEMP2 "], %[" #TEMP0 "] \n\t" \ |
92 | "subq.ph %[" #TEMP3 "], %[" #TEMP2 "], %[" #TEMP0 "] \n\t" \ |
93 | "seh %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \ |
94 | "sra %[temp16], %[" #TEMP1 "], 16 \n\t" \ |
95 | "seh %[temp19], %[" #TEMP3 "] \n\t" \ |
96 | "sra %[" #TEMP3 "], %[" #TEMP3 "], 16 \n\t" \ |
97 | "subu %[" #TEMP2 "], %[" #TEMP0 "], %[temp16] \n\t" \ |
98 | "addu %[" #TEMP0 "], %[" #TEMP0 "], %[temp16] \n\t" \ |
99 | "mul %[temp17], %[temp19], %[c2217] \n\t" \ |
100 | "mul %[temp18], %[" #TEMP3 "], %[c5352] \n\t" \ |
101 | "mul %[" #TEMP1 "], %[temp19], %[c5352] \n\t" \ |
102 | "mul %[temp16], %[" #TEMP3 "], %[c2217] \n\t" \ |
103 | "sll %[" #TEMP2 "], %[" #TEMP2 "], 3 \n\t" \ |
104 | "sll %[" #TEMP0 "], %[" #TEMP0 "], 3 \n\t" \ |
105 | "subu %[" #TEMP3 "], %[temp17], %[temp18] \n\t" \ |
106 | "addu %[" #TEMP1 "], %[temp16], %[" #TEMP1 "] \n\t" \ |
107 | "addiu %[" #TEMP3 "], %[" #TEMP3 "], 937 \n\t" \ |
108 | "addiu %[" #TEMP1 "], %[" #TEMP1 "], 1812 \n\t" \ |
109 | "sra %[" #TEMP3 "], %[" #TEMP3 "], 9 \n\t" \ |
110 | "sra %[" #TEMP1 "], %[" #TEMP1 "], 9 \n\t" |
111 | |
// macro for one vertical pass in FTransform
// temp0..temp15 hold tmp[0]..tmp[15]
114 | // A..D - offsets in bytes to store to out buffer |
115 | // TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements |
116 | #define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12) \ |
117 | "addu %[temp16], %[" #TEMP0 "], %[" #TEMP12 "] \n\t" \ |
118 | "subu %[temp19], %[" #TEMP0 "], %[" #TEMP12 "] \n\t" \ |
119 | "addu %[temp17], %[" #TEMP4 "], %[" #TEMP8 "] \n\t" \ |
120 | "subu %[temp18], %[" #TEMP4 "], %[" #TEMP8 "] \n\t" \ |
121 | "mul %[" #TEMP8 "], %[temp19], %[c2217] \n\t" \ |
122 | "mul %[" #TEMP12 "], %[temp18], %[c2217] \n\t" \ |
123 | "mul %[" #TEMP4 "], %[temp19], %[c5352] \n\t" \ |
124 | "mul %[temp18], %[temp18], %[c5352] \n\t" \ |
125 | "addiu %[temp16], %[temp16], 7 \n\t" \ |
126 | "addu %[" #TEMP0 "], %[temp16], %[temp17] \n\t" \ |
127 | "sra %[" #TEMP0 "], %[" #TEMP0 "], 4 \n\t" \ |
128 | "addu %[" #TEMP12 "], %[" #TEMP12 "], %[" #TEMP4 "] \n\t" \ |
129 | "subu %[" #TEMP4 "], %[temp16], %[temp17] \n\t" \ |
130 | "sra %[" #TEMP4 "], %[" #TEMP4 "], 4 \n\t" \ |
131 | "addiu %[" #TEMP8 "], %[" #TEMP8 "], 30000 \n\t" \ |
132 | "addiu %[" #TEMP12 "], %[" #TEMP12 "], 12000 \n\t" \ |
133 | "addiu %[" #TEMP8 "], %[" #TEMP8 "], 21000 \n\t" \ |
134 | "subu %[" #TEMP8 "], %[" #TEMP8 "], %[temp18] \n\t" \ |
135 | "sra %[" #TEMP12 "], %[" #TEMP12 "], 16 \n\t" \ |
136 | "sra %[" #TEMP8 "], %[" #TEMP8 "], 16 \n\t" \ |
137 | "addiu %[temp16], %[" #TEMP12 "], 1 \n\t" \ |
138 | "movn %[" #TEMP12 "], %[temp16], %[temp19] \n\t" \ |
139 | "sh %[" #TEMP0 "], " #A "(%[temp20]) \n\t" \ |
140 | "sh %[" #TEMP4 "], " #C "(%[temp20]) \n\t" \ |
141 | "sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \ |
142 | "sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t" |
143 | |
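// For reference, a scalar sketch of the forward transform implemented by the
// two macros above (mirroring the plain-C FTransform; note that the 51000
// rounding bias is split into 30000 + 21000 in the assembly because the
// addiu immediate field is only 16 bits wide):
#if 0
static void FTransform_RefSketch(const uint8_t* src, const uint8_t* ref,
                                 int16_t* out) {
  int i;
  int tmp[16];
  for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {   // horizontal pass
    const int d0 = src[0] - ref[0];
    const int d1 = src[1] - ref[1];
    const int d2 = src[2] - ref[2];
    const int d3 = src[3] - ref[3];
    const int a0 = d0 + d3;
    const int a1 = d1 + d2;
    const int a2 = d1 - d2;
    const int a3 = d0 - d3;
    tmp[0 + i * 4] = (a0 + a1) * 8;
    tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9;
    tmp[2 + i * 4] = (a0 - a1) * 8;
    tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 + 937) >> 9;
  }
  for (i = 0; i < 4; ++i) {                           // vertical pass
    const int a0 = tmp[0 + i] + tmp[12 + i];
    const int a1 = tmp[4 + i] + tmp[8 + i];
    const int a2 = tmp[4 + i] - tmp[8 + i];
    const int a3 = tmp[0 + i] - tmp[12 + i];
    out[0 + i]  = (a0 + a1 + 7) >> 4;
    out[4 + i]  = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
    out[8 + i]  = (a0 - a1 + 7) >> 4;
    out[12 + i] = (a3 * 2217 - a2 * 5352 + 51000) >> 16;
  }
}
#endif  // 0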
144 | static void FTransform_MIPSdspR2(const uint8_t* src, const uint8_t* ref, |
145 | int16_t* out) { |
146 | const int c2217 = 2217; |
147 | const int c5352 = 5352; |
148 | int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8; |
149 | int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16; |
150 | int temp17, temp18, temp19, temp20; |
151 | const int* const args[3] = |
152 | { (const int*)src, (const int*)ref, (const int*)out }; |
153 | |
154 | __asm__ volatile ( |
155 | HORIZONTAL_PASS(0, temp0, temp1, temp2, temp3) |
156 | HORIZONTAL_PASS(1, temp4, temp5, temp6, temp7) |
157 | HORIZONTAL_PASS(2, temp8, temp9, temp10, temp11) |
158 | HORIZONTAL_PASS(3, temp12, temp13, temp14, temp15) |
159 | "lw %[temp20], 8(%[args]) \n\t" |
160 | VERTICAL_PASS(0, 8, 16, 24, temp0, temp4, temp8, temp12) |
161 | VERTICAL_PASS(2, 10, 18, 26, temp1, temp5, temp9, temp13) |
162 | VERTICAL_PASS(4, 12, 20, 28, temp2, temp6, temp10, temp14) |
163 | VERTICAL_PASS(6, 14, 22, 30, temp3, temp7, temp11, temp15) |
164 | OUTPUT_EARLY_CLOBBER_REGS_18(), |
165 | [temp0]"=&r" (temp0), [temp19]"=&r" (temp19), [temp20]"=&r" (temp20) |
166 | : [args]"r" (args), [c2217]"r" (c2217), [c5352]"r" (c5352) |
167 | : "memory" , "hi" , "lo" |
168 | ); |
169 | } |
170 | |
171 | #undef VERTICAL_PASS |
172 | #undef HORIZONTAL_PASS |
173 | |
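// A scalar sketch of the inverse transform below, assuming a clip-to-[0, 255]
// helper (kC1/kC2 are the Q16 constants defined at the top of this file):
#if 0
static WEBP_INLINE int MUL(int a, int b) { return (a * b) >> 16; }
static WEBP_INLINE uint8_t Clip8(int v) {
  return (v < 0) ? 0 : (v > 255) ? 255 : (uint8_t)v;
}
static void ITransformOne_RefSketch(const uint8_t* ref, const int16_t* in,
                                    uint8_t* dst) {
  int C[4 * 4];
  int* tmp = C;
  int i;
  for (i = 0; i < 4; ++i) {   // vertical pass
    const int a = in[0] + in[8];
    const int b = in[0] - in[8];
    const int c = MUL(in[4], kC2) - MUL(in[12], kC1);
    const int d = MUL(in[4], kC1) + MUL(in[12], kC2);
    tmp[0] = a + d;
    tmp[1] = b + c;
    tmp[2] = b - c;
    tmp[3] = a - d;
    tmp += 4;
    in++;
  }
  tmp = C;
  for (i = 0; i < 4; ++i) {   // horizontal pass, with (x + 4) >> 3 rounding
    const int dc = tmp[0] + 4;
    const int a = dc + tmp[8];
    const int b = dc - tmp[8];
    const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
    const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
    dst[0] = Clip8(ref[0] + ((a + d) >> 3));
    dst[1] = Clip8(ref[1] + ((b + c) >> 3));
    dst[2] = Clip8(ref[2] + ((b - c) >> 3));
    dst[3] = Clip8(ref[3] + ((a - d) >> 3));
    tmp++;
    dst += BPS;
    ref += BPS;
  }
}
#endif  // 0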
174 | static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, |
175 | uint8_t* dst) { |
176 | int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9; |
177 | int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18; |
178 | |
179 | __asm__ volatile ( |
180 | "ulw %[temp1], 0(%[in]) \n\t" |
181 | "ulw %[temp2], 16(%[in]) \n\t" |
182 | LOAD_IN_X2(temp5, temp6, 24, 26) |
183 | ADD_SUB_HALVES(temp3, temp4, temp1, temp2) |
184 | LOAD_IN_X2(temp1, temp2, 8, 10) |
185 | MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14, |
186 | temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6, |
187 | temp13, temp11, temp14, temp12) |
188 | INSERT_HALF_X2(temp8, temp7, temp10, temp9) |
189 | "ulw %[temp17], 4(%[in]) \n\t" |
190 | "ulw %[temp18], 20(%[in]) \n\t" |
191 | ADD_SUB_HALVES(temp1, temp2, temp3, temp8) |
192 | ADD_SUB_HALVES(temp5, temp6, temp4, temp7) |
193 | ADD_SUB_HALVES(temp7, temp8, temp17, temp18) |
194 | LOAD_IN_X2(temp17, temp18, 12, 14) |
195 | LOAD_IN_X2(temp9, temp10, 28, 30) |
196 | MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17, |
197 | temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10, |
198 | temp15, temp4, temp16, temp17) |
199 | INSERT_HALF_X2(temp11, temp12, temp13, temp14) |
200 | ADD_SUB_HALVES(temp17, temp8, temp8, temp11) |
201 | ADD_SUB_HALVES(temp3, temp4, temp7, temp12) |
202 | |
203 | // horizontal |
204 | SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6) |
205 | INSERT_HALF_X2(temp1, temp6, temp5, temp2) |
206 | SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8) |
207 | "repl.ph %[temp2], 0x4 \n\t" |
208 | INSERT_HALF_X2(temp3, temp8, temp17, temp4) |
209 | "addq.ph %[temp1], %[temp1], %[temp2] \n\t" |
210 | "addq.ph %[temp6], %[temp6], %[temp2] \n\t" |
211 | ADD_SUB_HALVES(temp2, temp4, temp1, temp3) |
212 | ADD_SUB_HALVES(temp5, temp7, temp6, temp8) |
213 | MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18, |
214 | temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15, |
215 | temp6, temp17, temp8, temp18) |
216 | MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16, |
217 | temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14, |
218 | temp18, temp12, temp17, temp16) |
219 | INSERT_HALF_X2(temp1, temp3, temp9, temp13) |
220 | INSERT_HALF_X2(temp6, temp8, temp11, temp15) |
221 | SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15, |
222 | temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8, |
223 | temp6) |
224 | PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13, |
225 | temp16, temp11, temp10, temp15, temp14) |
226 | LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, ref, |
227 | 0, 0, 0, 0, |
228 | 0, 1, 2, 3, |
229 | BPS) |
230 | CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10, |
231 | temp11, temp10, temp11, temp14, temp15) |
232 | STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11, |
233 | temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4, |
234 | dst, 0, 1, 2, 3, BPS) |
235 | |
236 | OUTPUT_EARLY_CLOBBER_REGS_18() |
    : [dst]"r" (dst), [in]"r" (in), [kC1]"r" (kC1), [kC2]"r" (kC2),
      [ref]"r" (ref)
    : "memory", "hi", "lo"
239 | ); |
240 | } |
241 | |
242 | static void ITransform_MIPSdspR2(const uint8_t* ref, const int16_t* in, |
243 | uint8_t* dst, int do_two) { |
244 | ITransformOne(ref, in, dst); |
245 | if (do_two) { |
246 | ITransformOne(ref + 4, in + 16, dst + 4); |
247 | } |
248 | } |
249 | |
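// Disto4x4 returns the absolute difference of the weighted sums of the
// Walsh-Hadamard transform magnitudes of blocks a and b, scaled down by
// 1 << 5. A scalar sketch of the per-block weighted sum (mirroring the
// plain-C TTransform):
#if 0
static int TTransform_RefSketch(const uint8_t* in, const uint16_t* w) {
  int sum = 0;
  int tmp[16];
  int i;
  for (i = 0; i < 4; ++i, in += BPS) {   // horizontal pass
    const int a0 = in[0] + in[2];
    const int a1 = in[1] + in[3];
    const int a2 = in[1] - in[3];
    const int a3 = in[0] - in[2];
    tmp[0 + i * 4] = a0 + a1;
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  for (i = 0; i < 4; ++i, ++w) {         // vertical pass
    const int a0 = tmp[0 + i] + tmp[8 + i];
    const int a1 = tmp[4 + i] + tmp[12 + i];
    const int a2 = tmp[4 + i] - tmp[12 + i];
    const int a3 = tmp[0 + i] - tmp[8 + i];
    sum += w[0]  * abs(a0 + a1);
    sum += w[4]  * abs(a3 + a2);
    sum += w[8]  * abs(a3 - a2);
    sum += w[12] * abs(a0 - a1);
  }
  return sum;  // Disto4x4 = abs(TTransform(b) - TTransform(a)) >> 5
}
#endif  // 0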
250 | static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b, |
251 | const uint16_t* const w) { |
252 | int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9; |
253 | int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17; |
254 | |
255 | __asm__ volatile ( |
256 | LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, a, |
257 | 0, 0, 0, 0, |
258 | 0, 1, 2, 3, |
259 | BPS) |
    CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp9, temp10, temp11,
                            temp12, temp1, temp2, temp3, temp4)
262 | ADD_SUB_HALVES_X4(temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, |
263 | temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12) |
264 | PACK_2_HALVES_TO_WORD(temp9, temp10, temp11, temp12, temp1, temp3, temp5, |
265 | temp7, temp2, temp4, temp6, temp8) |
266 | ADD_SUB_HALVES_X4(temp2, temp4, temp6, temp8, temp9, temp1, temp3, temp10, |
267 | temp1, temp9, temp3, temp10, temp5, temp11, temp7, temp12) |
268 | ADD_SUB_HALVES_X4(temp5, temp11, temp7, temp2, temp9, temp3, temp6, temp12, |
269 | temp2, temp9, temp6, temp3, temp4, temp1, temp8, temp10) |
270 | ADD_SUB_HALVES_X4(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2, |
271 | temp5, temp7, temp11, temp2, temp9, temp6, temp3, temp12) |
272 | ABS_X8(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2) |
273 | LOAD_WITH_OFFSET_X4(temp3, temp6, temp9, temp12, w, |
274 | 0, 4, 8, 12, |
275 | 0, 0, 0, 0, |
276 | 0) |
277 | LOAD_WITH_OFFSET_X4(temp13, temp14, temp15, temp16, w, |
278 | 0, 4, 8, 12, |
279 | 1, 1, 1, 1, |
280 | 16) |
281 | MUL_HALF(temp17, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, |
282 | temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16) |
283 | LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, b, |
284 | 0, 0, 0, 0, |
285 | 0, 1, 2, 3, |
286 | BPS) |
    CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp9, temp10, temp11,
                            temp12, temp1, temp2, temp3, temp4)
289 | ADD_SUB_HALVES_X4(temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, |
290 | temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12) |
291 | PACK_2_HALVES_TO_WORD(temp9, temp10, temp11, temp12, temp1, temp3, temp5, |
292 | temp7, temp2, temp4, temp6, temp8) |
293 | ADD_SUB_HALVES_X4(temp2, temp4, temp6, temp8, temp9, temp1, temp3, temp10, |
294 | temp1, temp9, temp3, temp10, temp5, temp11, temp7, temp12) |
295 | ADD_SUB_HALVES_X4(temp5, temp11, temp7, temp2, temp9, temp3, temp6, temp12, |
296 | temp2, temp9, temp6, temp3, temp4, temp1, temp8, temp10) |
297 | ADD_SUB_HALVES_X4(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2, |
298 | temp5, temp7, temp11, temp2, temp9, temp6, temp3, temp12) |
299 | ABS_X8(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2) |
300 | LOAD_WITH_OFFSET_X4(temp3, temp6, temp9, temp12, w, |
301 | 0, 4, 8, 12, |
302 | 0, 0, 0, 0, |
303 | 0) |
304 | LOAD_WITH_OFFSET_X4(temp13, temp14, temp15, temp16, w, |
305 | 0, 4, 8, 12, |
306 | 1, 1, 1, 1, |
307 | 16) |
308 | MUL_HALF(temp3, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, |
309 | temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16) |
310 | OUTPUT_EARLY_CLOBBER_REGS_17() |
311 | : [a]"r" (a), [b]"r" (b), [w]"r" (w) |
312 | : "memory" , "hi" , "lo" |
313 | ); |
314 | return abs(temp3 - temp17) >> 5; |
315 | } |
316 | |
317 | static int Disto16x16_MIPSdspR2(const uint8_t* const a, |
318 | const uint8_t* const b, |
319 | const uint16_t* const w) { |
320 | int D = 0; |
321 | int x, y; |
322 | for (y = 0; y < 16 * BPS; y += 4 * BPS) { |
323 | for (x = 0; x < 16; x += 4) { |
324 | D += Disto4x4_MIPSdspR2(a + x + y, b + x + y, w); |
325 | } |
326 | } |
327 | return D; |
328 | } |
329 | |
330 | //------------------------------------------------------------------------------ |
331 | // Intra predictions |
332 | |
333 | #define FILL_PART(J, SIZE) \ |
334 | "usw %[value], 0+" #J "*" XSTR(BPS) "(%[dst]) \n\t" \ |
335 | "usw %[value], 4+" #J "*" XSTR(BPS) "(%[dst]) \n\t" \ |
336 | ".if " #SIZE " == 16 \n\t" \ |
337 | "usw %[value], 8+" #J "*" XSTR(BPS) "(%[dst]) \n\t" \ |
338 | "usw %[value], 12+" #J "*" XSTR(BPS) "(%[dst]) \n\t" \ |
339 | ".endif \n\t" |
340 | |
341 | #define FILL_8_OR_16(DST, VALUE, SIZE) do { \ |
342 | int value = (VALUE); \ |
343 | __asm__ volatile ( \ |
344 | "replv.qb %[value], %[value] \n\t" \ |
345 | FILL_PART( 0, SIZE) \ |
346 | FILL_PART( 1, SIZE) \ |
347 | FILL_PART( 2, SIZE) \ |
348 | FILL_PART( 3, SIZE) \ |
349 | FILL_PART( 4, SIZE) \ |
350 | FILL_PART( 5, SIZE) \ |
351 | FILL_PART( 6, SIZE) \ |
352 | FILL_PART( 7, SIZE) \ |
353 | ".if " #SIZE " == 16 \n\t" \ |
354 | FILL_PART( 8, 16) \ |
355 | FILL_PART( 9, 16) \ |
356 | FILL_PART(10, 16) \ |
357 | FILL_PART(11, 16) \ |
358 | FILL_PART(12, 16) \ |
359 | FILL_PART(13, 16) \ |
360 | FILL_PART(14, 16) \ |
361 | FILL_PART(15, 16) \ |
362 | ".endif \n\t" \ |
363 | : [value]"+&r"(value) \ |
364 | : [dst]"r"((DST)) \ |
365 | : "memory" \ |
366 | ); \ |
367 | } while (0) |
368 | |
369 | #define VERTICAL_PRED(DST, TOP, SIZE) \ |
370 | static WEBP_INLINE void VerticalPred##SIZE(uint8_t* (DST), \ |
371 | const uint8_t* (TOP)) { \ |
372 | int j; \ |
373 | if ((TOP)) { \ |
374 | for (j = 0; j < (SIZE); ++j) memcpy((DST) + j * BPS, (TOP), (SIZE)); \ |
375 | } else { \ |
376 | FILL_8_OR_16((DST), 127, (SIZE)); \ |
377 | } \ |
378 | } |
379 | |
380 | VERTICAL_PRED(dst, top, 8) |
381 | VERTICAL_PRED(dst, top, 16) |
382 | |
383 | #undef VERTICAL_PRED |
384 | |
385 | #define HORIZONTAL_PRED(DST, LEFT, SIZE) \ |
386 | static WEBP_INLINE void HorizontalPred##SIZE(uint8_t* (DST), \ |
387 | const uint8_t* (LEFT)) { \ |
388 | if (LEFT) { \ |
389 | int j; \ |
390 | for (j = 0; j < (SIZE); ++j) { \ |
391 | memset((DST) + j * BPS, (LEFT)[j], (SIZE)); \ |
392 | } \ |
393 | } else { \ |
394 | FILL_8_OR_16((DST), 129, (SIZE)); \ |
395 | } \ |
396 | } |
397 | |
398 | HORIZONTAL_PRED(dst, left, 8) |
399 | HORIZONTAL_PRED(dst, left, 16) |
400 | |
401 | #undef HORIZONTAL_PRED |
402 | |
403 | #define CLIPPING() \ |
404 | "preceu.ph.qbl %[temp2], %[temp0] \n\t" \ |
405 | "preceu.ph.qbr %[temp0], %[temp0] \n\t" \ |
406 | "preceu.ph.qbl %[temp3], %[temp1] \n\t" \ |
407 | "preceu.ph.qbr %[temp1], %[temp1] \n\t" \ |
408 | "addu.ph %[temp2], %[temp2], %[leftY_1] \n\t" \ |
409 | "addu.ph %[temp0], %[temp0], %[leftY_1] \n\t" \ |
410 | "addu.ph %[temp3], %[temp3], %[leftY_1] \n\t" \ |
411 | "addu.ph %[temp1], %[temp1], %[leftY_1] \n\t" \ |
412 | "shll_s.ph %[temp2], %[temp2], 7 \n\t" \ |
413 | "shll_s.ph %[temp0], %[temp0], 7 \n\t" \ |
414 | "shll_s.ph %[temp3], %[temp3], 7 \n\t" \ |
415 | "shll_s.ph %[temp1], %[temp1], 7 \n\t" \ |
416 | "precrqu_s.qb.ph %[temp0], %[temp2], %[temp0] \n\t" \ |
417 | "precrqu_s.qb.ph %[temp1], %[temp3], %[temp1] \n\t" |
418 | |
419 | #define CLIP_8B_TO_DST(DST, LEFT, TOP, SIZE) do { \ |
420 | int leftY_1 = ((int)(LEFT)[y] << 16) + (LEFT)[y]; \ |
421 | int temp0, temp1, temp2, temp3; \ |
422 | __asm__ volatile ( \ |
423 | "replv.ph %[leftY_1], %[leftY_1] \n\t" \ |
424 | "ulw %[temp0], 0(%[top]) \n\t" \ |
425 | "ulw %[temp1], 4(%[top]) \n\t" \ |
426 | "subu.ph %[leftY_1], %[leftY_1], %[left_1] \n\t" \ |
427 | CLIPPING() \ |
428 | "usw %[temp0], 0(%[dst]) \n\t" \ |
429 | "usw %[temp1], 4(%[dst]) \n\t" \ |
430 | ".if " #SIZE " == 16 \n\t" \ |
431 | "ulw %[temp0], 8(%[top]) \n\t" \ |
432 | "ulw %[temp1], 12(%[top]) \n\t" \ |
433 | CLIPPING() \ |
434 | "usw %[temp0], 8(%[dst]) \n\t" \ |
435 | "usw %[temp1], 12(%[dst]) \n\t" \ |
436 | ".endif \n\t" \ |
437 | : [leftY_1]"+&r"(leftY_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), \ |
438 | [temp2]"=&r"(temp2), [temp3]"=&r"(temp3) \ |
439 | : [left_1]"r"(left_1), [top]"r"((TOP)), [dst]"r"((DST)) \ |
440 | : "memory" \ |
441 | ); \ |
442 | } while (0) |
443 | |
444 | #define CLIP_TO_DST(DST, LEFT, TOP, SIZE) do { \ |
445 | int y; \ |
446 | const int left_1 = ((int)(LEFT)[-1] << 16) + (LEFT)[-1]; \ |
447 | for (y = 0; y < (SIZE); ++y) { \ |
448 | CLIP_8B_TO_DST((DST), (LEFT), (TOP), (SIZE)); \ |
449 | (DST) += BPS; \ |
450 | } \ |
451 | } while (0) |
452 | |
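// TrueMotion prediction in scalar form (cf. the VP8 spec):
//   dst[x + y * BPS] = clip_8b((LEFT)[y] + (TOP)[x] - (LEFT)[-1])
// CLIP_8B_TO_DST computes one row of this with the saturating DSP
// instructions of the CLIPPING macro above.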
453 | #define TRUE_MOTION(DST, LEFT, TOP, SIZE) \ |
454 | static WEBP_INLINE void TrueMotion##SIZE(uint8_t* (DST), const uint8_t* (LEFT),\ |
455 | const uint8_t* (TOP)) { \ |
456 | if ((LEFT) != NULL) { \ |
457 | if ((TOP) != NULL) { \ |
458 | CLIP_TO_DST((DST), (LEFT), (TOP), (SIZE)); \ |
459 | } else { \ |
460 | HorizontalPred##SIZE((DST), (LEFT)); \ |
461 | } \ |
462 | } else { \ |
463 | /* true motion without left samples (hence: with default 129 value) */ \ |
464 | /* is equivalent to VE prediction where you just copy the top samples. */ \ |
465 | /* Note that if top samples are not available, the default value is */ \ |
466 | /* then 129, and not 127 as in the VerticalPred case. */ \ |
467 | if ((TOP) != NULL) { \ |
468 | VerticalPred##SIZE((DST), (TOP)); \ |
469 | } else { \ |
470 | FILL_8_OR_16((DST), 129, (SIZE)); \ |
471 | } \ |
472 | } \ |
473 | } |
474 | |
475 | TRUE_MOTION(dst, left, top, 8) |
476 | TRUE_MOTION(dst, left, top, 16) |
477 | |
478 | #undef TRUE_MOTION |
479 | #undef CLIP_TO_DST |
480 | #undef CLIP_8B_TO_DST |
481 | #undef CLIPPING |
482 | |
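// DC16 fills the block with (sum(top[0..15]) + sum(left[0..15]) + 16) >> 5.
// When only one of the two edges is available its sum is doubled before the
// rounding shift, and when neither is, the block is filled with 0x80. The
// numbered labels below implement this four-way dispatch (DCMode8 is the
// 8-sample analogue, rounding with (sum + 8) >> 4).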
483 | static WEBP_INLINE void DCMode16(uint8_t* dst, const uint8_t* left, |
484 | const uint8_t* top) { |
485 | int DC, DC1; |
486 | int temp0, temp1, temp2, temp3; |
487 | |
488 | __asm__ volatile( |
489 | "beqz %[top], 2f \n\t" |
490 | LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, top, |
491 | 0, 4, 8, 12, |
492 | 0, 0, 0, 0, |
493 | 0) |
494 | "raddu.w.qb %[temp0], %[temp0] \n\t" |
495 | "raddu.w.qb %[temp1], %[temp1] \n\t" |
496 | "raddu.w.qb %[temp2], %[temp2] \n\t" |
497 | "raddu.w.qb %[temp3], %[temp3] \n\t" |
498 | "addu %[temp0], %[temp0], %[temp1] \n\t" |
499 | "addu %[temp2], %[temp2], %[temp3] \n\t" |
500 | "addu %[DC], %[temp0], %[temp2] \n\t" |
501 | "move %[DC1], %[DC] \n\t" |
502 | "beqz %[left], 1f \n\t" |
503 | LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, left, |
504 | 0, 4, 8, 12, |
505 | 0, 0, 0, 0, |
506 | 0) |
507 | "raddu.w.qb %[temp0], %[temp0] \n\t" |
508 | "raddu.w.qb %[temp1], %[temp1] \n\t" |
509 | "raddu.w.qb %[temp2], %[temp2] \n\t" |
510 | "raddu.w.qb %[temp3], %[temp3] \n\t" |
511 | "addu %[temp0], %[temp0], %[temp1] \n\t" |
512 | "addu %[temp2], %[temp2], %[temp3] \n\t" |
513 | "addu %[DC1], %[temp0], %[temp2] \n\t" |
514 | "1: \n\t" |
515 | "addu %[DC], %[DC], %[DC1] \n\t" |
516 | "j 3f \n\t" |
517 | "2: \n\t" |
518 | "beqz %[left], 4f \n\t" |
519 | LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, left, |
520 | 0, 4, 8, 12, |
521 | 0, 0, 0, 0, |
522 | 0) |
523 | "raddu.w.qb %[temp0], %[temp0] \n\t" |
524 | "raddu.w.qb %[temp1], %[temp1] \n\t" |
525 | "raddu.w.qb %[temp2], %[temp2] \n\t" |
526 | "raddu.w.qb %[temp3], %[temp3] \n\t" |
527 | "addu %[temp0], %[temp0], %[temp1] \n\t" |
528 | "addu %[temp2], %[temp2], %[temp3] \n\t" |
529 | "addu %[DC], %[temp0], %[temp2] \n\t" |
530 | "addu %[DC], %[DC], %[DC] \n\t" |
531 | "3: \n\t" |
532 | "shra_r.w %[DC], %[DC], 5 \n\t" |
533 | "j 5f \n\t" |
534 | "4: \n\t" |
535 | "li %[DC], 0x80 \n\t" |
536 | "5: \n\t" |
537 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [DC]"=&r" (DC), |
538 | [temp2]"=&r" (temp2), [temp3]"=&r" (temp3), [DC1]"=&r" (DC1) |
539 | : [left]"r" (left), [top]"r" (top) |
540 | : "memory" |
541 | ); |
542 | |
543 | FILL_8_OR_16(dst, DC, 16); |
544 | } |
545 | |
546 | static WEBP_INLINE void DCMode8(uint8_t* dst, const uint8_t* left, |
547 | const uint8_t* top) { |
548 | int DC, DC1; |
549 | int temp0, temp1, temp2, temp3; |
550 | |
551 | __asm__ volatile( |
552 | "beqz %[top], 2f \n\t" |
553 | "ulw %[temp0], 0(%[top]) \n\t" |
554 | "ulw %[temp1], 4(%[top]) \n\t" |
555 | "raddu.w.qb %[temp0], %[temp0] \n\t" |
556 | "raddu.w.qb %[temp1], %[temp1] \n\t" |
557 | "addu %[DC], %[temp0], %[temp1] \n\t" |
558 | "move %[DC1], %[DC] \n\t" |
559 | "beqz %[left], 1f \n\t" |
560 | "ulw %[temp2], 0(%[left]) \n\t" |
561 | "ulw %[temp3], 4(%[left]) \n\t" |
562 | "raddu.w.qb %[temp2], %[temp2] \n\t" |
563 | "raddu.w.qb %[temp3], %[temp3] \n\t" |
564 | "addu %[DC1], %[temp2], %[temp3] \n\t" |
565 | "1: \n\t" |
566 | "addu %[DC], %[DC], %[DC1] \n\t" |
567 | "j 3f \n\t" |
568 | "2: \n\t" |
569 | "beqz %[left], 4f \n\t" |
570 | "ulw %[temp2], 0(%[left]) \n\t" |
571 | "ulw %[temp3], 4(%[left]) \n\t" |
572 | "raddu.w.qb %[temp2], %[temp2] \n\t" |
573 | "raddu.w.qb %[temp3], %[temp3] \n\t" |
574 | "addu %[DC], %[temp2], %[temp3] \n\t" |
575 | "addu %[DC], %[DC], %[DC] \n\t" |
576 | "3: \n\t" |
577 | "shra_r.w %[DC], %[DC], 4 \n\t" |
578 | "j 5f \n\t" |
579 | "4: \n\t" |
580 | "li %[DC], 0x80 \n\t" |
581 | "5: \n\t" |
582 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [DC]"=&r" (DC), |
583 | [temp2]"=&r" (temp2), [temp3]"=&r" (temp3), [DC1]"=&r" (DC1) |
584 | : [left]"r" (left), [top]"r" (top) |
585 | : "memory" |
586 | ); |
587 | |
588 | FILL_8_OR_16(dst, DC, 8); |
589 | } |
590 | |
591 | static void DC4(uint8_t* dst, const uint8_t* top) { |
592 | int temp0, temp1; |
593 | __asm__ volatile( |
594 | "ulw %[temp0], 0(%[top]) \n\t" |
595 | "ulw %[temp1], -5(%[top]) \n\t" |
596 | "raddu.w.qb %[temp0], %[temp0] \n\t" |
597 | "raddu.w.qb %[temp1], %[temp1] \n\t" |
598 | "addu %[temp0], %[temp0], %[temp1] \n\t" |
599 | "addiu %[temp0], %[temp0], 4 \n\t" |
600 | "srl %[temp0], %[temp0], 3 \n\t" |
601 | "replv.qb %[temp0], %[temp0] \n\t" |
602 | "usw %[temp0], 0*" XSTR(BPS) "(%[dst]) \n\t" |
603 | "usw %[temp0], 1*" XSTR(BPS) "(%[dst]) \n\t" |
604 | "usw %[temp0], 2*" XSTR(BPS) "(%[dst]) \n\t" |
605 | "usw %[temp0], 3*" XSTR(BPS) "(%[dst]) \n\t" |
606 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1) |
607 | : [top]"r" (top), [dst]"r" (dst) |
608 | : "memory" |
609 | ); |
610 | } |
611 | |
612 | static void TM4(uint8_t* dst, const uint8_t* top) { |
613 | int a10, a32, temp0, temp1, temp2, temp3, temp4, temp5; |
614 | const int c35 = 0xff00ff; |
615 | __asm__ volatile ( |
616 | "lbu %[temp1], 0(%[top]) \n\t" |
617 | "lbu %[a10], 1(%[top]) \n\t" |
618 | "lbu %[temp2], 2(%[top]) \n\t" |
619 | "lbu %[a32], 3(%[top]) \n\t" |
620 | "ulw %[temp0], -5(%[top]) \n\t" |
621 | "lbu %[temp4], -1(%[top]) \n\t" |
622 | "append %[a10], %[temp1], 16 \n\t" |
623 | "append %[a32], %[temp2], 16 \n\t" |
624 | "replv.ph %[temp4], %[temp4] \n\t" |
625 | "shrl.ph %[temp1], %[temp0], 8 \n\t" |
626 | "and %[temp0], %[temp0], %[c35] \n\t" |
627 | "subu.ph %[temp1], %[temp1], %[temp4] \n\t" |
628 | "subu.ph %[temp0], %[temp0], %[temp4] \n\t" |
629 | "srl %[temp2], %[temp1], 16 \n\t" |
630 | "srl %[temp3], %[temp0], 16 \n\t" |
631 | "replv.ph %[temp2], %[temp2] \n\t" |
632 | "replv.ph %[temp3], %[temp3] \n\t" |
633 | "replv.ph %[temp4], %[temp1] \n\t" |
634 | "replv.ph %[temp5], %[temp0] \n\t" |
635 | "addu.ph %[temp0], %[temp3], %[a10] \n\t" |
636 | "addu.ph %[temp1], %[temp3], %[a32] \n\t" |
637 | "addu.ph %[temp3], %[temp2], %[a10] \n\t" |
638 | "addu.ph %[temp2], %[temp2], %[a32] \n\t" |
639 | "shll_s.ph %[temp0], %[temp0], 7 \n\t" |
640 | "shll_s.ph %[temp1], %[temp1], 7 \n\t" |
641 | "shll_s.ph %[temp3], %[temp3], 7 \n\t" |
642 | "shll_s.ph %[temp2], %[temp2], 7 \n\t" |
643 | "precrqu_s.qb.ph %[temp0], %[temp1], %[temp0] \n\t" |
644 | "precrqu_s.qb.ph %[temp1], %[temp2], %[temp3] \n\t" |
645 | "addu.ph %[temp2], %[temp5], %[a10] \n\t" |
646 | "addu.ph %[temp3], %[temp5], %[a32] \n\t" |
647 | "addu.ph %[temp5], %[temp4], %[a10] \n\t" |
648 | "addu.ph %[temp4], %[temp4], %[a32] \n\t" |
649 | "shll_s.ph %[temp2], %[temp2], 7 \n\t" |
650 | "shll_s.ph %[temp3], %[temp3], 7 \n\t" |
651 | "shll_s.ph %[temp4], %[temp4], 7 \n\t" |
652 | "shll_s.ph %[temp5], %[temp5], 7 \n\t" |
653 | "precrqu_s.qb.ph %[temp2], %[temp3], %[temp2] \n\t" |
654 | "precrqu_s.qb.ph %[temp3], %[temp4], %[temp5] \n\t" |
655 | "usw %[temp1], 0*" XSTR(BPS) "(%[dst]) \n\t" |
656 | "usw %[temp0], 1*" XSTR(BPS) "(%[dst]) \n\t" |
657 | "usw %[temp3], 2*" XSTR(BPS) "(%[dst]) \n\t" |
658 | "usw %[temp2], 3*" XSTR(BPS) "(%[dst]) \n\t" |
659 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
660 | [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), |
661 | [a10]"=&r" (a10), [a32]"=&r" (a32) |
662 | : [c35]"r" (c35), [top]"r" (top), [dst]"r" (dst) |
663 | : "memory" |
664 | ); |
665 | } |
666 | |
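// VE4 applies the usual 3-tap smoothing filter
//   (p[-1] + 2 * p[0] + p[1] + 2) >> 2
// over top[-1..4] and replicates the four filtered pixels down the four rows.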
667 | static void VE4(uint8_t* dst, const uint8_t* top) { |
668 | int temp0, temp1, temp2, temp3, temp4, temp5, temp6; |
669 | __asm__ volatile( |
670 | "ulw %[temp0], -1(%[top]) \n\t" |
671 | "ulh %[temp1], 3(%[top]) \n\t" |
672 | "preceu.ph.qbr %[temp2], %[temp0] \n\t" |
673 | "preceu.ph.qbl %[temp3], %[temp0] \n\t" |
674 | "preceu.ph.qbr %[temp4], %[temp1] \n\t" |
675 | "packrl.ph %[temp5], %[temp3], %[temp2] \n\t" |
676 | "packrl.ph %[temp6], %[temp4], %[temp3] \n\t" |
677 | "shll.ph %[temp5], %[temp5], 1 \n\t" |
678 | "shll.ph %[temp6], %[temp6], 1 \n\t" |
679 | "addq.ph %[temp2], %[temp5], %[temp2] \n\t" |
680 | "addq.ph %[temp6], %[temp6], %[temp4] \n\t" |
681 | "addq.ph %[temp2], %[temp2], %[temp3] \n\t" |
682 | "addq.ph %[temp6], %[temp6], %[temp3] \n\t" |
683 | "shra_r.ph %[temp2], %[temp2], 2 \n\t" |
684 | "shra_r.ph %[temp6], %[temp6], 2 \n\t" |
685 | "precr.qb.ph %[temp4], %[temp6], %[temp2] \n\t" |
686 | "usw %[temp4], 0*" XSTR(BPS) "(%[dst]) \n\t" |
687 | "usw %[temp4], 1*" XSTR(BPS) "(%[dst]) \n\t" |
688 | "usw %[temp4], 2*" XSTR(BPS) "(%[dst]) \n\t" |
689 | "usw %[temp4], 3*" XSTR(BPS) "(%[dst]) \n\t" |
690 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
691 | [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), |
692 | [temp6]"=&r" (temp6) |
693 | : [top]"r" (top), [dst]"r" (dst) |
694 | : "memory" |
695 | ); |
696 | } |
697 | |
698 | static void HE4(uint8_t* dst, const uint8_t* top) { |
699 | int temp0, temp1, temp2, temp3, temp4, temp5, temp6; |
700 | __asm__ volatile( |
701 | "ulw %[temp0], -4(%[top]) \n\t" |
702 | "lbu %[temp1], -5(%[top]) \n\t" |
703 | "preceu.ph.qbr %[temp2], %[temp0] \n\t" |
704 | "preceu.ph.qbl %[temp3], %[temp0] \n\t" |
705 | "replv.ph %[temp4], %[temp1] \n\t" |
706 | "packrl.ph %[temp5], %[temp3], %[temp2] \n\t" |
707 | "packrl.ph %[temp6], %[temp2], %[temp4] \n\t" |
708 | "shll.ph %[temp5], %[temp5], 1 \n\t" |
709 | "shll.ph %[temp6], %[temp6], 1 \n\t" |
710 | "addq.ph %[temp3], %[temp3], %[temp5] \n\t" |
711 | "addq.ph %[temp3], %[temp3], %[temp2] \n\t" |
712 | "addq.ph %[temp2], %[temp2], %[temp6] \n\t" |
713 | "addq.ph %[temp2], %[temp2], %[temp4] \n\t" |
714 | "shra_r.ph %[temp3], %[temp3], 2 \n\t" |
715 | "shra_r.ph %[temp2], %[temp2], 2 \n\t" |
716 | "replv.qb %[temp0], %[temp3] \n\t" |
717 | "replv.qb %[temp1], %[temp2] \n\t" |
718 | "srl %[temp3], %[temp3], 16 \n\t" |
719 | "srl %[temp2], %[temp2], 16 \n\t" |
720 | "replv.qb %[temp3], %[temp3] \n\t" |
721 | "replv.qb %[temp2], %[temp2] \n\t" |
722 | "usw %[temp3], 0*" XSTR(BPS) "(%[dst]) \n\t" |
723 | "usw %[temp0], 1*" XSTR(BPS) "(%[dst]) \n\t" |
724 | "usw %[temp2], 2*" XSTR(BPS) "(%[dst]) \n\t" |
725 | "usw %[temp1], 3*" XSTR(BPS) "(%[dst]) \n\t" |
726 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
727 | [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), |
728 | [temp6]"=&r" (temp6) |
729 | : [top]"r" (top), [dst]"r" (dst) |
730 | : "memory" |
731 | ); |
732 | } |
733 | |
734 | static void RD4(uint8_t* dst, const uint8_t* top) { |
735 | int temp0, temp1, temp2, temp3, temp4, temp5; |
736 | int temp6, temp7, temp8, temp9, temp10, temp11; |
737 | __asm__ volatile( |
738 | "ulw %[temp0], -5(%[top]) \n\t" |
739 | "ulw %[temp1], -1(%[top]) \n\t" |
740 | "preceu.ph.qbl %[temp2], %[temp0] \n\t" |
741 | "preceu.ph.qbr %[temp3], %[temp0] \n\t" |
742 | "preceu.ph.qbr %[temp4], %[temp1] \n\t" |
743 | "preceu.ph.qbl %[temp5], %[temp1] \n\t" |
744 | "packrl.ph %[temp6], %[temp2], %[temp3] \n\t" |
745 | "packrl.ph %[temp7], %[temp4], %[temp2] \n\t" |
746 | "packrl.ph %[temp8], %[temp5], %[temp4] \n\t" |
747 | "shll.ph %[temp6], %[temp6], 1 \n\t" |
748 | "addq.ph %[temp9], %[temp2], %[temp6] \n\t" |
749 | "shll.ph %[temp7], %[temp7], 1 \n\t" |
750 | "addq.ph %[temp9], %[temp9], %[temp3] \n\t" |
751 | "shll.ph %[temp8], %[temp8], 1 \n\t" |
752 | "shra_r.ph %[temp9], %[temp9], 2 \n\t" |
753 | "addq.ph %[temp10], %[temp4], %[temp7] \n\t" |
754 | "addq.ph %[temp11], %[temp5], %[temp8] \n\t" |
755 | "addq.ph %[temp10], %[temp10], %[temp2] \n\t" |
756 | "addq.ph %[temp11], %[temp11], %[temp4] \n\t" |
757 | "shra_r.ph %[temp10], %[temp10], 2 \n\t" |
758 | "shra_r.ph %[temp11], %[temp11], 2 \n\t" |
759 | "lbu %[temp0], 3(%[top]) \n\t" |
760 | "lbu %[temp1], 2(%[top]) \n\t" |
761 | "lbu %[temp2], 1(%[top]) \n\t" |
762 | "sll %[temp1], %[temp1], 1 \n\t" |
763 | "addu %[temp0], %[temp0], %[temp1] \n\t" |
764 | "addu %[temp0], %[temp0], %[temp2] \n\t" |
765 | "precr.qb.ph %[temp9], %[temp10], %[temp9] \n\t" |
766 | "shra_r.w %[temp0], %[temp0], 2 \n\t" |
767 | "precr.qb.ph %[temp10], %[temp11], %[temp10] \n\t" |
768 | "usw %[temp9], 3*" XSTR(BPS) "(%[dst]) \n\t" |
769 | "usw %[temp10], 1*" XSTR(BPS) "(%[dst]) \n\t" |
770 | "prepend %[temp9], %[temp11], 8 \n\t" |
771 | "prepend %[temp10], %[temp0], 8 \n\t" |
772 | "usw %[temp9], 2*" XSTR(BPS) "(%[dst]) \n\t" |
773 | "usw %[temp10], 0*" XSTR(BPS) "(%[dst]) \n\t" |
774 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
775 | [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), |
776 | [temp6]"=&r" (temp6), [temp7]"=&r" (temp7), [temp8]"=&r" (temp8), |
777 | [temp9]"=&r" (temp9), [temp10]"=&r" (temp10), [temp11]"=&r" (temp11) |
778 | : [top]"r" (top), [dst]"r" (dst) |
779 | : "memory" |
780 | ); |
781 | } |
782 | |
783 | static void VR4(uint8_t* dst, const uint8_t* top) { |
784 | int temp0, temp1, temp2, temp3, temp4; |
785 | int temp5, temp6, temp7, temp8, temp9; |
786 | __asm__ volatile ( |
787 | "ulw %[temp0], -4(%[top]) \n\t" |
788 | "ulw %[temp1], 0(%[top]) \n\t" |
789 | "preceu.ph.qbl %[temp2], %[temp0] \n\t" |
790 | "preceu.ph.qbr %[temp0], %[temp0] \n\t" |
791 | "preceu.ph.qbla %[temp3], %[temp1] \n\t" |
792 | "preceu.ph.qbra %[temp1], %[temp1] \n\t" |
793 | "packrl.ph %[temp7], %[temp3], %[temp2] \n\t" |
794 | "addqh_r.ph %[temp4], %[temp1], %[temp3] \n\t" |
795 | "move %[temp6], %[temp1] \n\t" |
796 | "append %[temp1], %[temp2], 16 \n\t" |
797 | "shll.ph %[temp9], %[temp6], 1 \n\t" |
798 | "addqh_r.ph %[temp5], %[temp7], %[temp6] \n\t" |
799 | "shll.ph %[temp8], %[temp7], 1 \n\t" |
800 | "addu.ph %[temp3], %[temp7], %[temp3] \n\t" |
801 | "addu.ph %[temp1], %[temp1], %[temp6] \n\t" |
802 | "packrl.ph %[temp7], %[temp2], %[temp0] \n\t" |
803 | "addu.ph %[temp6], %[temp0], %[temp2] \n\t" |
804 | "addu.ph %[temp3], %[temp3], %[temp9] \n\t" |
805 | "addu.ph %[temp1], %[temp1], %[temp8] \n\t" |
806 | "shll.ph %[temp7], %[temp7], 1 \n\t" |
807 | "shra_r.ph %[temp3], %[temp3], 2 \n\t" |
808 | "shra_r.ph %[temp1], %[temp1], 2 \n\t" |
809 | "addu.ph %[temp6], %[temp6], %[temp7] \n\t" |
810 | "shra_r.ph %[temp6], %[temp6], 2 \n\t" |
811 | "precrq.ph.w %[temp8], %[temp4], %[temp5] \n\t" |
812 | "append %[temp4], %[temp5], 16 \n\t" |
813 | "precrq.ph.w %[temp2], %[temp3], %[temp1] \n\t" |
814 | "append %[temp3], %[temp1], 16 \n\t" |
815 | "precr.qb.ph %[temp8], %[temp8], %[temp4] \n\t" |
816 | "precr.qb.ph %[temp3], %[temp2], %[temp3] \n\t" |
817 | "usw %[temp8], 0*" XSTR(BPS) "(%[dst]) \n\t" |
818 | "usw %[temp3], 1*" XSTR(BPS) "(%[dst]) \n\t" |
819 | "append %[temp3], %[temp6], 8 \n\t" |
820 | "srl %[temp6], %[temp6], 16 \n\t" |
821 | "append %[temp8], %[temp6], 8 \n\t" |
822 | "usw %[temp3], 3*" XSTR(BPS) "(%[dst]) \n\t" |
823 | "usw %[temp8], 2*" XSTR(BPS) "(%[dst]) \n\t" |
824 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
825 | [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), |
826 | [temp6]"=&r" (temp6), [temp7]"=&r" (temp7), [temp8]"=&r" (temp8), |
827 | [temp9]"=&r" (temp9) |
828 | : [top]"r" (top), [dst]"r" (dst) |
829 | : "memory" |
830 | ); |
831 | } |
832 | |
833 | static void LD4(uint8_t* dst, const uint8_t* top) { |
834 | int temp0, temp1, temp2, temp3, temp4, temp5; |
835 | int temp6, temp7, temp8, temp9, temp10, temp11; |
836 | __asm__ volatile( |
837 | "ulw %[temp0], 0(%[top]) \n\t" |
838 | "ulw %[temp1], 4(%[top]) \n\t" |
839 | "preceu.ph.qbl %[temp2], %[temp0] \n\t" |
840 | "preceu.ph.qbr %[temp3], %[temp0] \n\t" |
841 | "preceu.ph.qbr %[temp4], %[temp1] \n\t" |
842 | "preceu.ph.qbl %[temp5], %[temp1] \n\t" |
843 | "packrl.ph %[temp6], %[temp2], %[temp3] \n\t" |
844 | "packrl.ph %[temp7], %[temp4], %[temp2] \n\t" |
845 | "packrl.ph %[temp8], %[temp5], %[temp4] \n\t" |
846 | "shll.ph %[temp6], %[temp6], 1 \n\t" |
847 | "addq.ph %[temp9], %[temp2], %[temp6] \n\t" |
848 | "shll.ph %[temp7], %[temp7], 1 \n\t" |
849 | "addq.ph %[temp9], %[temp9], %[temp3] \n\t" |
850 | "shll.ph %[temp8], %[temp8], 1 \n\t" |
851 | "shra_r.ph %[temp9], %[temp9], 2 \n\t" |
852 | "addq.ph %[temp10], %[temp4], %[temp7] \n\t" |
853 | "addq.ph %[temp11], %[temp5], %[temp8] \n\t" |
854 | "addq.ph %[temp10], %[temp10], %[temp2] \n\t" |
855 | "addq.ph %[temp11], %[temp11], %[temp4] \n\t" |
856 | "shra_r.ph %[temp10], %[temp10], 2 \n\t" |
857 | "shra_r.ph %[temp11], %[temp11], 2 \n\t" |
858 | "srl %[temp1], %[temp1], 24 \n\t" |
859 | "sll %[temp1], %[temp1], 1 \n\t" |
860 | "raddu.w.qb %[temp5], %[temp5] \n\t" |
861 | "precr.qb.ph %[temp9], %[temp10], %[temp9] \n\t" |
862 | "precr.qb.ph %[temp10], %[temp11], %[temp10] \n\t" |
863 | "addu %[temp1], %[temp1], %[temp5] \n\t" |
864 | "shra_r.w %[temp1], %[temp1], 2 \n\t" |
865 | "usw %[temp9], 0*" XSTR(BPS) "(%[dst]) \n\t" |
866 | "usw %[temp10], 2*" XSTR(BPS) "(%[dst]) \n\t" |
867 | "prepend %[temp9], %[temp11], 8 \n\t" |
868 | "prepend %[temp10], %[temp1], 8 \n\t" |
869 | "usw %[temp9], 1*" XSTR(BPS) "(%[dst]) \n\t" |
870 | "usw %[temp10], 3*" XSTR(BPS) "(%[dst]) \n\t" |
871 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
872 | [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), |
873 | [temp6]"=&r" (temp6), [temp7]"=&r" (temp7), [temp8]"=&r" (temp8), |
874 | [temp9]"=&r" (temp9), [temp10]"=&r" (temp10), [temp11]"=&r" (temp11) |
875 | : [top]"r" (top), [dst]"r" (dst) |
876 | : "memory" |
877 | ); |
878 | } |
879 | |
880 | static void VL4(uint8_t* dst, const uint8_t* top) { |
881 | int temp0, temp1, temp2, temp3, temp4; |
882 | int temp5, temp6, temp7, temp8, temp9; |
883 | __asm__ volatile ( |
884 | "ulw %[temp0], 0(%[top]) \n\t" |
885 | "ulw %[temp1], 4(%[top]) \n\t" |
886 | "preceu.ph.qbla %[temp2], %[temp0] \n\t" |
887 | "preceu.ph.qbra %[temp0], %[temp0] \n\t" |
888 | "preceu.ph.qbl %[temp3], %[temp1] \n\t" |
889 | "preceu.ph.qbr %[temp1], %[temp1] \n\t" |
890 | "addqh_r.ph %[temp4], %[temp0], %[temp2] \n\t" |
891 | "packrl.ph %[temp7], %[temp1], %[temp0] \n\t" |
892 | "precrq.ph.w %[temp6], %[temp1], %[temp2] \n\t" |
893 | "shll.ph %[temp9], %[temp2], 1 \n\t" |
894 | "addqh_r.ph %[temp5], %[temp7], %[temp2] \n\t" |
895 | "shll.ph %[temp8], %[temp7], 1 \n\t" |
896 | "addu.ph %[temp2], %[temp2], %[temp6] \n\t" |
897 | "addu.ph %[temp0], %[temp0], %[temp7] \n\t" |
898 | "packrl.ph %[temp7], %[temp3], %[temp1] \n\t" |
899 | "addu.ph %[temp6], %[temp1], %[temp3] \n\t" |
900 | "addu.ph %[temp2], %[temp2], %[temp8] \n\t" |
901 | "addu.ph %[temp0], %[temp0], %[temp9] \n\t" |
902 | "shll.ph %[temp7], %[temp7], 1 \n\t" |
903 | "shra_r.ph %[temp2], %[temp2], 2 \n\t" |
904 | "shra_r.ph %[temp0], %[temp0], 2 \n\t" |
905 | "addu.ph %[temp6], %[temp6], %[temp7] \n\t" |
906 | "shra_r.ph %[temp6], %[temp6], 2 \n\t" |
907 | "precrq.ph.w %[temp8], %[temp5], %[temp4] \n\t" |
908 | "append %[temp5], %[temp4], 16 \n\t" |
909 | "precrq.ph.w %[temp3], %[temp2], %[temp0] \n\t" |
910 | "append %[temp2], %[temp0], 16 \n\t" |
911 | "precr.qb.ph %[temp8], %[temp8], %[temp5] \n\t" |
912 | "precr.qb.ph %[temp3], %[temp3], %[temp2] \n\t" |
913 | "usw %[temp8], 0*" XSTR(BPS) "(%[dst]) \n\t" |
914 | "prepend %[temp8], %[temp6], 8 \n\t" |
915 | "usw %[temp3], 1*" XSTR(BPS) "(%[dst]) \n\t" |
916 | "srl %[temp6], %[temp6], 16 \n\t" |
917 | "prepend %[temp3], %[temp6], 8 \n\t" |
918 | "usw %[temp8], 2*" XSTR(BPS) "(%[dst]) \n\t" |
919 | "usw %[temp3], 3*" XSTR(BPS) "(%[dst]) \n\t" |
920 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
921 | [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), |
922 | [temp6]"=&r" (temp6), [temp7]"=&r" (temp7), [temp8]"=&r" (temp8), |
923 | [temp9]"=&r" (temp9) |
924 | : [top]"r" (top), [dst]"r" (dst) |
925 | : "memory" |
926 | ); |
927 | } |
928 | |
929 | static void HD4(uint8_t* dst, const uint8_t* top) { |
930 | int temp0, temp1, temp2, temp3, temp4; |
931 | int temp5, temp6, temp7, temp8, temp9; |
932 | __asm__ volatile ( |
933 | "ulw %[temp0], -5(%[top]) \n\t" |
934 | "ulw %[temp1], -1(%[top]) \n\t" |
935 | "preceu.ph.qbla %[temp2], %[temp0] \n\t" |
936 | "preceu.ph.qbra %[temp0], %[temp0] \n\t" |
937 | "preceu.ph.qbl %[temp3], %[temp1] \n\t" |
938 | "preceu.ph.qbr %[temp1], %[temp1] \n\t" |
939 | "addqh_r.ph %[temp4], %[temp0], %[temp2] \n\t" |
940 | "packrl.ph %[temp7], %[temp1], %[temp0] \n\t" |
941 | "precrq.ph.w %[temp6], %[temp1], %[temp2] \n\t" |
942 | "shll.ph %[temp9], %[temp2], 1 \n\t" |
943 | "addqh_r.ph %[temp5], %[temp7], %[temp2] \n\t" |
944 | "shll.ph %[temp8], %[temp7], 1 \n\t" |
945 | "addu.ph %[temp2], %[temp2], %[temp6] \n\t" |
946 | "addu.ph %[temp0], %[temp0], %[temp7] \n\t" |
947 | "packrl.ph %[temp7], %[temp3], %[temp1] \n\t" |
948 | "addu.ph %[temp6], %[temp1], %[temp3] \n\t" |
949 | "addu.ph %[temp2], %[temp2], %[temp8] \n\t" |
950 | "addu.ph %[temp0], %[temp0], %[temp9] \n\t" |
951 | "shll.ph %[temp7], %[temp7], 1 \n\t" |
952 | "shra_r.ph %[temp2], %[temp2], 2 \n\t" |
953 | "shra_r.ph %[temp0], %[temp0], 2 \n\t" |
954 | "addu.ph %[temp6], %[temp6], %[temp7] \n\t" |
955 | "shra_r.ph %[temp6], %[temp6], 2 \n\t" |
956 | "precrq.ph.w %[temp1], %[temp2], %[temp5] \n\t" |
957 | "precrq.ph.w %[temp3], %[temp0], %[temp4] \n\t" |
958 | "precr.qb.ph %[temp7], %[temp6], %[temp1] \n\t" |
959 | "precr.qb.ph %[temp6], %[temp1], %[temp3] \n\t" |
960 | "usw %[temp7], 0*" XSTR(BPS) "(%[dst]) \n\t" |
961 | "usw %[temp6], 1*" XSTR(BPS) "(%[dst]) \n\t" |
962 | "append %[temp2], %[temp5], 16 \n\t" |
963 | "append %[temp0], %[temp4], 16 \n\t" |
964 | "precr.qb.ph %[temp5], %[temp3], %[temp2] \n\t" |
965 | "precr.qb.ph %[temp4], %[temp2], %[temp0] \n\t" |
966 | "usw %[temp5], 2*" XSTR(BPS) "(%[dst]) \n\t" |
967 | "usw %[temp4], 3*" XSTR(BPS) "(%[dst]) \n\t" |
968 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
969 | [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), |
970 | [temp6]"=&r" (temp6), [temp7]"=&r" (temp7), [temp8]"=&r" (temp8), |
971 | [temp9]"=&r" (temp9) |
972 | : [top]"r" (top), [dst]"r" (dst) |
973 | : "memory" |
974 | ); |
975 | } |
976 | |
977 | static void HU4(uint8_t* dst, const uint8_t* top) { |
978 | int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; |
979 | __asm__ volatile ( |
980 | "ulw %[temp0], -5(%[top]) \n\t" |
981 | "preceu.ph.qbl %[temp1], %[temp0] \n\t" |
982 | "preceu.ph.qbr %[temp2], %[temp0] \n\t" |
983 | "packrl.ph %[temp3], %[temp1], %[temp2] \n\t" |
984 | "replv.qb %[temp7], %[temp2] \n\t" |
985 | "addqh_r.ph %[temp4], %[temp1], %[temp3] \n\t" |
986 | "addqh_r.ph %[temp5], %[temp3], %[temp2] \n\t" |
987 | "shll.ph %[temp6], %[temp3], 1 \n\t" |
988 | "addu.ph %[temp3], %[temp2], %[temp3] \n\t" |
989 | "addu.ph %[temp6], %[temp1], %[temp6] \n\t" |
990 | "shll.ph %[temp0], %[temp2], 1 \n\t" |
991 | "addu.ph %[temp6], %[temp6], %[temp2] \n\t" |
992 | "addu.ph %[temp0], %[temp3], %[temp0] \n\t" |
993 | "shra_r.ph %[temp6], %[temp6], 2 \n\t" |
994 | "shra_r.ph %[temp0], %[temp0], 2 \n\t" |
995 | "packrl.ph %[temp3], %[temp6], %[temp5] \n\t" |
996 | "precrq.ph.w %[temp2], %[temp6], %[temp4] \n\t" |
997 | "append %[temp0], %[temp5], 16 \n\t" |
998 | "precr.qb.ph %[temp3], %[temp3], %[temp2] \n\t" |
999 | "usw %[temp3], 0*" XSTR(BPS) "(%[dst]) \n\t" |
1000 | "precr.qb.ph %[temp1], %[temp7], %[temp0] \n\t" |
1001 | "usw %[temp7], 3*" XSTR(BPS) "(%[dst]) \n\t" |
1002 | "packrl.ph %[temp2], %[temp1], %[temp3] \n\t" |
1003 | "usw %[temp1], 2*" XSTR(BPS) "(%[dst]) \n\t" |
1004 | "usw %[temp2], 1*" XSTR(BPS) "(%[dst]) \n\t" |
1005 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
1006 | [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), |
1007 | [temp6]"=&r" (temp6), [temp7]"=&r" (temp7) |
1008 | : [top]"r" (top), [dst]"r" (dst) |
1009 | : "memory" |
1010 | ); |
1011 | } |
1012 | |
1013 | //------------------------------------------------------------------------------ |
1014 | // Chroma 8x8 prediction (paragraph 12.2) |
1015 | |
1016 | static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left, |
1017 | const uint8_t* top) { |
1018 | // U block |
1019 | DCMode8(C8DC8 + dst, left, top); |
1020 | VerticalPred8(C8VE8 + dst, top); |
1021 | HorizontalPred8(C8HE8 + dst, left); |
1022 | TrueMotion8(C8TM8 + dst, left, top); |
1023 | // V block |
1024 | dst += 8; |
1025 | if (top) top += 8; |
1026 | if (left) left += 16; |
1027 | DCMode8(C8DC8 + dst, left, top); |
1028 | VerticalPred8(C8VE8 + dst, top); |
1029 | HorizontalPred8(C8HE8 + dst, left); |
1030 | TrueMotion8(C8TM8 + dst, left, top); |
1031 | } |
1032 | |
1033 | //------------------------------------------------------------------------------ |
1034 | // luma 16x16 prediction (paragraph 12.3) |
1035 | |
1036 | static void Intra16Preds_MIPSdspR2(uint8_t* dst, |
1037 | const uint8_t* left, const uint8_t* top) { |
1038 | DCMode16(I16DC16 + dst, left, top); |
1039 | VerticalPred16(I16VE16 + dst, top); |
1040 | HorizontalPred16(I16HE16 + dst, left); |
1041 | TrueMotion16(I16TM16 + dst, left, top); |
1042 | } |
1043 | |
// The left samples are at top[-5..-2], the top-left sample is top[-1], the
// top samples are at top[0..3], and the top-right samples are at top[4..7].
1046 | static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) { |
1047 | DC4(I4DC4 + dst, top); |
1048 | TM4(I4TM4 + dst, top); |
1049 | VE4(I4VE4 + dst, top); |
1050 | HE4(I4HE4 + dst, top); |
1051 | RD4(I4RD4 + dst, top); |
1052 | VR4(I4VR4 + dst, top); |
1053 | LD4(I4LD4 + dst, top); |
1054 | VL4(I4VL4 + dst, top); |
1055 | HD4(I4HD4 + dst, top); |
1056 | HU4(I4HU4 + dst, top); |
1057 | } |
1058 | |
1059 | //------------------------------------------------------------------------------ |
1060 | // Metric |
1061 | |
1062 | #if !defined(WORK_AROUND_GCC) |
1063 | |
1064 | #define GET_SSE_INNER(A) \ |
1065 | "lw %[temp0], " #A "(%[a]) \n\t" \ |
1066 | "lw %[temp1], " #A "(%[b]) \n\t" \ |
1067 | "preceu.ph.qbr %[temp2], %[temp0] \n\t" \ |
1068 | "preceu.ph.qbl %[temp0], %[temp0] \n\t" \ |
1069 | "preceu.ph.qbr %[temp3], %[temp1] \n\t" \ |
1070 | "preceu.ph.qbl %[temp1], %[temp1] \n\t" \ |
1071 | "subq.ph %[temp2], %[temp2], %[temp3] \n\t" \ |
1072 | "subq.ph %[temp0], %[temp0], %[temp1] \n\t" \ |
1073 | "dpa.w.ph $ac0, %[temp2], %[temp2] \n\t" \ |
1074 | "dpa.w.ph $ac0, %[temp0], %[temp0] \n\t" |
1075 | |
1076 | #define GET_SSE(A, B, C, D) \ |
1077 | GET_SSE_INNER(A) \ |
1078 | GET_SSE_INNER(B) \ |
1079 | GET_SSE_INNER(C) \ |
1080 | GET_SSE_INNER(D) |
1081 | |
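// Each GET_SSE_INNER accumulates the squared differences of four pixel pairs
// into $ac0 via two dpa.w.ph instructions. In scalar form the whole metric
// is simply (with the block width/height as hypothetical parameters):
#if 0
static int SSE_RefSketch(const uint8_t* a, const uint8_t* b, int w, int h) {
  int sum = 0;
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      const int diff = (int)a[x + y * BPS] - b[x + y * BPS];
      sum += diff * diff;
    }
  }
  return sum;  // SSE16x16: w = h = 16; SSE16x8: w = 16, h = 8; etc.
}
#endif  // 0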
1082 | static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) { |
1083 | int count; |
1084 | int temp0, temp1, temp2, temp3; |
1085 | __asm__ volatile ( |
1086 | "mult $zero, $zero \n\t" |
1087 | GET_SSE( 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS) |
1088 | GET_SSE( 1 * BPS, 4 + 1 * BPS, 8 + 1 * BPS, 12 + 1 * BPS) |
1089 | GET_SSE( 2 * BPS, 4 + 2 * BPS, 8 + 2 * BPS, 12 + 2 * BPS) |
1090 | GET_SSE( 3 * BPS, 4 + 3 * BPS, 8 + 3 * BPS, 12 + 3 * BPS) |
1091 | GET_SSE( 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS) |
1092 | GET_SSE( 5 * BPS, 4 + 5 * BPS, 8 + 5 * BPS, 12 + 5 * BPS) |
1093 | GET_SSE( 6 * BPS, 4 + 6 * BPS, 8 + 6 * BPS, 12 + 6 * BPS) |
1094 | GET_SSE( 7 * BPS, 4 + 7 * BPS, 8 + 7 * BPS, 12 + 7 * BPS) |
1095 | GET_SSE( 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS) |
1096 | GET_SSE( 9 * BPS, 4 + 9 * BPS, 8 + 9 * BPS, 12 + 9 * BPS) |
1097 | GET_SSE(10 * BPS, 4 + 10 * BPS, 8 + 10 * BPS, 12 + 10 * BPS) |
1098 | GET_SSE(11 * BPS, 4 + 11 * BPS, 8 + 11 * BPS, 12 + 11 * BPS) |
1099 | GET_SSE(12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS) |
1100 | GET_SSE(13 * BPS, 4 + 13 * BPS, 8 + 13 * BPS, 12 + 13 * BPS) |
1101 | GET_SSE(14 * BPS, 4 + 14 * BPS, 8 + 14 * BPS, 12 + 14 * BPS) |
1102 | GET_SSE(15 * BPS, 4 + 15 * BPS, 8 + 15 * BPS, 12 + 15 * BPS) |
1103 | "mflo %[count] \n\t" |
1104 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
1105 | [temp3]"=&r" (temp3), [count]"=&r" (count) |
1106 | : [a]"r" (a), [b]"r" (b) |
1107 | : "memory" , "hi" , "lo" |
1108 | ); |
1109 | return count; |
1110 | } |
1111 | |
1112 | static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) { |
1113 | int count; |
1114 | int temp0, temp1, temp2, temp3; |
1115 | __asm__ volatile ( |
1116 | "mult $zero, $zero \n\t" |
1117 | GET_SSE( 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS) |
1118 | GET_SSE( 1 * BPS, 4 + 1 * BPS, 8 + 1 * BPS, 12 + 1 * BPS) |
1119 | GET_SSE( 2 * BPS, 4 + 2 * BPS, 8 + 2 * BPS, 12 + 2 * BPS) |
1120 | GET_SSE( 3 * BPS, 4 + 3 * BPS, 8 + 3 * BPS, 12 + 3 * BPS) |
1121 | GET_SSE( 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS) |
1122 | GET_SSE( 5 * BPS, 4 + 5 * BPS, 8 + 5 * BPS, 12 + 5 * BPS) |
1123 | GET_SSE( 6 * BPS, 4 + 6 * BPS, 8 + 6 * BPS, 12 + 6 * BPS) |
1124 | GET_SSE( 7 * BPS, 4 + 7 * BPS, 8 + 7 * BPS, 12 + 7 * BPS) |
1125 | "mflo %[count] \n\t" |
1126 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
1127 | [temp3]"=&r" (temp3), [count]"=&r" (count) |
1128 | : [a]"r" (a), [b]"r" (b) |
1129 | : "memory" , "hi" , "lo" |
1130 | ); |
1131 | return count; |
1132 | } |
1133 | |
1134 | static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) { |
1135 | int count; |
1136 | int temp0, temp1, temp2, temp3; |
1137 | __asm__ volatile ( |
1138 | "mult $zero, $zero \n\t" |
1139 | GET_SSE(0 * BPS, 4 + 0 * BPS, 1 * BPS, 4 + 1 * BPS) |
1140 | GET_SSE(2 * BPS, 4 + 2 * BPS, 3 * BPS, 4 + 3 * BPS) |
1141 | GET_SSE(4 * BPS, 4 + 4 * BPS, 5 * BPS, 4 + 5 * BPS) |
1142 | GET_SSE(6 * BPS, 4 + 6 * BPS, 7 * BPS, 4 + 7 * BPS) |
1143 | "mflo %[count] \n\t" |
1144 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
1145 | [temp3]"=&r" (temp3), [count]"=&r" (count) |
1146 | : [a]"r" (a), [b]"r" (b) |
1147 | : "memory" , "hi" , "lo" |
1148 | ); |
1149 | return count; |
1150 | } |
1151 | |
1152 | static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) { |
1153 | int count; |
1154 | int temp0, temp1, temp2, temp3; |
1155 | __asm__ volatile ( |
1156 | "mult $zero, $zero \n\t" |
1157 | GET_SSE(0 * BPS, 1 * BPS, 2 * BPS, 3 * BPS) |
1158 | "mflo %[count] \n\t" |
1159 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
1160 | [temp3]"=&r" (temp3), [count]"=&r" (count) |
1161 | : [a]"r" (a), [b]"r" (b) |
1162 | : "memory" , "hi" , "lo" |
1163 | ); |
1164 | return count; |
1165 | } |
1166 | |
1167 | #undef GET_SSE |
1168 | #undef GET_SSE_INNER |
1169 | |
1170 | #endif // !WORK_AROUND_GCC |
1171 | |
1172 | #undef FILL_8_OR_16 |
1173 | #undef FILL_PART |
1174 | #undef OUTPUT_EARLY_CLOBBER_REGS_17 |
1175 | #undef MUL_HALF |
1176 | #undef ABS_X8 |
1177 | #undef ADD_SUB_HALVES_X4 |
1178 | |
1179 | //------------------------------------------------------------------------------ |
1180 | // Quantization |
1181 | // |
1182 | |
// macro for one pass through the for loop in QuantizeBlock, reading 2 values
// at a time
1184 | // QUANTDIV macro inlined |
1185 | // J - offset in bytes (kZigzag[n] * 2) |
1186 | // K - offset in bytes (kZigzag[n] * 4) |
1187 | // N - offset in bytes (n * 2) |
1188 | // N1 - offset in bytes ((n + 1) * 2) |
1189 | #define QUANTIZE_ONE(J, K, N, N1) \ |
1190 | "ulw %[temp1], " #J "(%[ppin]) \n\t" \ |
1191 | "ulw %[temp2], " #J "(%[ppsharpen]) \n\t" \ |
1192 | "lhu %[temp3], " #K "(%[ppzthresh]) \n\t" \ |
1193 | "lhu %[temp6], " #K "+4(%[ppzthresh]) \n\t" \ |
1194 | "absq_s.ph %[temp4], %[temp1] \n\t" \ |
1195 | "ins %[temp3], %[temp6], 16, 16 \n\t" \ |
1196 | "addu.ph %[coeff], %[temp4], %[temp2] \n\t" \ |
1197 | "shra.ph %[sign], %[temp1], 15 \n\t" \ |
1198 | "li %[level], 0x10001 \n\t" \ |
1199 | "cmp.lt.ph %[temp3], %[coeff] \n\t" \ |
1200 | "lhu %[temp1], " #J "(%[ppiq]) \n\t" \ |
1201 | "pick.ph %[temp5], %[level], $0 \n\t" \ |
1202 | "lw %[temp2], " #K "(%[ppbias]) \n\t" \ |
1203 | "beqz %[temp5], 0f \n\t" \ |
1204 | "lhu %[temp3], " #J "(%[ppq]) \n\t" \ |
1205 | "beq %[temp5], %[level], 1f \n\t" \ |
1206 | "andi %[temp5], %[temp5], 0x1 \n\t" \ |
1207 | "andi %[temp4], %[coeff], 0xffff \n\t" \ |
1208 | "beqz %[temp5], 2f \n\t" \ |
1209 | "mul %[level], %[temp4], %[temp1] \n\t" \ |
1210 | "sh $0, " #J "+2(%[ppin]) \n\t" \ |
1211 | "sh $0, " #N1 "(%[pout]) \n\t" \ |
1212 | "addu %[level], %[level], %[temp2] \n\t" \ |
1213 | "sra %[level], %[level], 17 \n\t" \ |
1214 | "slt %[temp4], %[max_level], %[level] \n\t" \ |
1215 | "movn %[level], %[max_level], %[temp4] \n\t" \ |
1216 | "andi %[temp6], %[sign], 0xffff \n\t" \ |
1217 | "xor %[level], %[level], %[temp6] \n\t" \ |
1218 | "subu %[level], %[level], %[temp6] \n\t" \ |
1219 | "mul %[temp5], %[level], %[temp3] \n\t" \ |
1220 | "or %[ret], %[ret], %[level] \n\t" \ |
1221 | "sh %[level], " #N "(%[pout]) \n\t" \ |
1222 | "sh %[temp5], " #J "(%[ppin]) \n\t" \ |
1223 | "j 3f \n\t" \ |
1224 | "2: \n\t" \ |
1225 | "lhu %[temp1], " #J "+2(%[ppiq]) \n\t" \ |
1226 | "srl %[temp5], %[coeff], 16 \n\t" \ |
1227 | "mul %[level], %[temp5], %[temp1] \n\t" \ |
1228 | "lw %[temp2], " #K "+4(%[ppbias]) \n\t" \ |
1229 | "lhu %[temp3], " #J "+2(%[ppq]) \n\t" \ |
1230 | "addu %[level], %[level], %[temp2] \n\t" \ |
1231 | "sra %[level], %[level], 17 \n\t" \ |
1232 | "srl %[temp6], %[sign], 16 \n\t" \ |
1233 | "slt %[temp4], %[max_level], %[level] \n\t" \ |
1234 | "movn %[level], %[max_level], %[temp4] \n\t" \ |
1235 | "xor %[level], %[level], %[temp6] \n\t" \ |
1236 | "subu %[level], %[level], %[temp6] \n\t" \ |
1237 | "mul %[temp5], %[level], %[temp3] \n\t" \ |
1238 | "sh $0, " #J "(%[ppin]) \n\t" \ |
1239 | "sh $0, " #N "(%[pout]) \n\t" \ |
1240 | "or %[ret], %[ret], %[level] \n\t" \ |
1241 | "sh %[temp5], " #J "+2(%[ppin]) \n\t" \ |
1242 | "sh %[level], " #N1 "(%[pout]) \n\t" \ |
1243 | "j 3f \n\t" \ |
1244 | "1: \n\t" \ |
1245 | "lhu %[temp1], " #J "(%[ppiq]) \n\t" \ |
1246 | "lw %[temp2], " #K "(%[ppbias]) \n\t" \ |
1247 | "ulw %[temp3], " #J "(%[ppq]) \n\t" \ |
1248 | "andi %[temp5], %[coeff], 0xffff \n\t" \ |
1249 | "srl %[temp0], %[coeff], 16 \n\t" \ |
1250 | "lhu %[temp6], " #J "+2(%[ppiq]) \n\t" \ |
1251 | "lw %[coeff], " #K "+4(%[ppbias]) \n\t" \ |
1252 | "mul %[level], %[temp5], %[temp1] \n\t" \ |
1253 | "mul %[temp4], %[temp0], %[temp6] \n\t" \ |
1254 | "addu %[level], %[level], %[temp2] \n\t" \ |
1255 | "addu %[temp4], %[temp4], %[coeff] \n\t" \ |
1256 | "precrq.ph.w %[level], %[temp4], %[level] \n\t" \ |
1257 | "shra.ph %[level], %[level], 1 \n\t" \ |
1258 | "cmp.lt.ph %[max_level1],%[level] \n\t" \ |
1259 | "pick.ph %[level], %[max_level], %[level] \n\t" \ |
1260 | "xor %[level], %[level], %[sign] \n\t" \ |
1261 | "subu.ph %[level], %[level], %[sign] \n\t" \ |
1262 | "mul.ph %[temp3], %[level], %[temp3] \n\t" \ |
1263 | "or %[ret], %[ret], %[level] \n\t" \ |
1264 | "sh %[level], " #N "(%[pout]) \n\t" \ |
1265 | "srl %[level], %[level], 16 \n\t" \ |
1266 | "sh %[level], " #N1 "(%[pout]) \n\t" \ |
1267 | "usw %[temp3], " #J "(%[ppin]) \n\t" \ |
1268 | "j 3f \n\t" \ |
1269 | "0: \n\t" \ |
1270 | "sh $0, " #N "(%[pout]) \n\t" \ |
1271 | "sh $0, " #N1 "(%[pout]) \n\t" \ |
1272 | "usw $0, " #J "(%[ppin]) \n\t" \ |
1273 | "3: \n\t" |
1274 | |
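// A scalar sketch of the quantization performed by QUANTIZE_ONE (mirroring
// the plain-C QuantizeBlock; QFIX is 17, matching the 'sra ..., 17' above,
// and kZigzag is the coefficient scan order hardcoded into the J/K/N/N1
// offsets below):
#if 0
static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};
static int QuantizeBlock_RefSketch(int16_t in[16], int16_t out[16],
                                   const VP8Matrix* const mtx) {
  int ret = 0;
  int n;
  for (n = 0; n < 16; ++n) {
    const int j = kZigzag[n];
    const int sign = (in[j] < 0);
    const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    if (coeff > mtx->zthresh_[j]) {
      const uint32_t Q = mtx->q_[j];
      const uint32_t iQ = mtx->iq_[j];
      const uint32_t B = mtx->bias_[j];
      int level = (int)((coeff * iQ + B) >> QFIX);   // QUANTDIV
      if (level > MAX_LEVEL) level = MAX_LEVEL;
      if (sign) level = -level;
      in[j] = level * Q;
      out[n] = level;
      ret |= level;
    } else {
      in[j] = 0;
      out[n] = 0;
    }
  }
  return (ret != 0);
}
#endif  // 0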
1275 | static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16], |
1276 | const VP8Matrix* const mtx) { |
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
1278 | int sign, coeff, level; |
1279 | int max_level = MAX_LEVEL; |
  int max_level1 = (max_level << 16) | max_level;
1281 | int ret = 0; |
1282 | |
1283 | int16_t* ppin = &in[0]; |
1284 | int16_t* pout = &out[0]; |
1285 | const uint16_t* ppsharpen = &mtx->sharpen_[0]; |
1286 | const uint32_t* ppzthresh = &mtx->zthresh_[0]; |
1287 | const uint16_t* ppq = &mtx->q_[0]; |
1288 | const uint16_t* ppiq = &mtx->iq_[0]; |
1289 | const uint32_t* ppbias = &mtx->bias_[0]; |
1290 | |
1291 | __asm__ volatile ( |
1292 | QUANTIZE_ONE( 0, 0, 0, 2) |
1293 | QUANTIZE_ONE( 4, 8, 10, 12) |
1294 | QUANTIZE_ONE( 8, 16, 4, 8) |
1295 | QUANTIZE_ONE(12, 24, 14, 24) |
1296 | QUANTIZE_ONE(16, 32, 6, 16) |
1297 | QUANTIZE_ONE(20, 40, 22, 26) |
1298 | QUANTIZE_ONE(24, 48, 18, 20) |
1299 | QUANTIZE_ONE(28, 56, 28, 30) |
1300 | |
1301 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), |
1302 | [temp2]"=&r" (temp2), [temp3]"=&r" (temp3), |
1303 | [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), |
1304 | [sign]"=&r" (sign), [coeff]"=&r" (coeff), |
1305 | [level]"=&r" (level), [temp6]"=&r" (temp6), [ret]"+&r" (ret) |
1306 | : [ppin]"r" (ppin), [pout]"r" (pout), [max_level1]"r" (max_level1), |
1307 | [ppiq]"r" (ppiq), [max_level]"r" (max_level), |
1308 | [ppbias]"r" (ppbias), [ppzthresh]"r" (ppzthresh), |
1309 | [ppsharpen]"r" (ppsharpen), [ppq]"r" (ppq) |
1310 | : "memory" , "hi" , "lo" |
1311 | ); |
1312 | |
1313 | return (ret != 0); |
1314 | } |
1315 | |
1316 | static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32], |
1317 | const VP8Matrix* const mtx) { |
1318 | int nz; |
1319 | nz = QuantizeBlock_MIPSdspR2(in + 0 * 16, out + 0 * 16, mtx) << 0; |
1320 | nz |= QuantizeBlock_MIPSdspR2(in + 1 * 16, out + 1 * 16, mtx) << 1; |
1321 | return nz; |
1322 | } |
1323 | |
1324 | #undef QUANTIZE_ONE |
1325 | |
// macro for one horizontal pass in FTransformWHT
// temp0..temp7 hold tmp[0]..tmp[15] (two 16-bit elements per register)
// A, B, C, D - offsets in bytes to load from the in buffer
// TEMP0, TEMP1 - registers for the corresponding tmp elements
1330 | #define HORIZONTAL_PASS_WHT(A, B, C, D, TEMP0, TEMP1) \ |
1331 | "lh %[" #TEMP0 "], " #A "(%[in]) \n\t" \ |
1332 | "lh %[" #TEMP1 "], " #B "(%[in]) \n\t" \ |
1333 | "lh %[temp8], " #C "(%[in]) \n\t" \ |
1334 | "lh %[temp9], " #D "(%[in]) \n\t" \ |
1335 | "ins %[" #TEMP1 "], %[" #TEMP0 "], 16, 16 \n\t" \ |
1336 | "ins %[temp9], %[temp8], 16, 16 \n\t" \ |
1337 | "subq.ph %[temp8], %[" #TEMP1 "], %[temp9] \n\t" \ |
1338 | "addq.ph %[temp9], %[" #TEMP1 "], %[temp9] \n\t" \ |
1339 | "precrq.ph.w %[" #TEMP0 "], %[temp8], %[temp9] \n\t" \ |
1340 | "append %[temp8], %[temp9], 16 \n\t" \ |
1341 | "subq.ph %[" #TEMP1 "], %[" #TEMP0 "], %[temp8] \n\t" \ |
1342 | "addq.ph %[" #TEMP0 "], %[" #TEMP0 "], %[temp8] \n\t" \ |
1343 | "rotr %[" #TEMP1 "], %[" #TEMP1 "], 16 \n\t" |
1344 | |
// macro for one vertical pass in FTransformWHT
// temp0..temp7 hold tmp[0]..tmp[15] (two 16-bit elements per register)
// A, B, C, D - offsets in bytes to store to the out buffer
// TEMP0, TEMP2, TEMP4 and TEMP6 - registers for the corresponding tmp elements
1349 | #define VERTICAL_PASS_WHT(A, B, C, D, TEMP0, TEMP2, TEMP4, TEMP6) \ |
1350 | "addq.ph %[temp8], %[" #TEMP0 "], %[" #TEMP4 "] \n\t" \ |
1351 | "addq.ph %[temp9], %[" #TEMP2 "], %[" #TEMP6 "] \n\t" \ |
1352 | "subq.ph %[" #TEMP2 "], %[" #TEMP2 "], %[" #TEMP6 "] \n\t" \ |
1353 | "subq.ph %[" #TEMP6 "], %[" #TEMP0 "], %[" #TEMP4 "] \n\t" \ |
1354 | "addqh.ph %[" #TEMP0 "], %[temp8], %[temp9] \n\t" \ |
1355 | "subqh.ph %[" #TEMP4 "], %[" #TEMP6 "], %[" #TEMP2 "] \n\t" \ |
1356 | "addqh.ph %[" #TEMP2 "], %[" #TEMP2 "], %[" #TEMP6 "] \n\t" \ |
1357 | "subqh.ph %[" #TEMP6 "], %[temp8], %[temp9] \n\t" \ |
1358 | "usw %[" #TEMP0 "], " #A "(%[out]) \n\t" \ |
1359 | "usw %[" #TEMP2 "], " #B "(%[out]) \n\t" \ |
1360 | "usw %[" #TEMP4 "], " #C "(%[out]) \n\t" \ |
1361 | "usw %[" #TEMP6 "], " #D "(%[out]) \n\t" |
1362 | |
1363 | static void FTransformWHT_MIPSdspR2(const int16_t* in, int16_t* out) { |
1364 | int temp0, temp1, temp2, temp3, temp4; |
1365 | int temp5, temp6, temp7, temp8, temp9; |
1366 | |
1367 | __asm__ volatile ( |
1368 | HORIZONTAL_PASS_WHT( 0, 32, 64, 96, temp0, temp1) |
1369 | HORIZONTAL_PASS_WHT(128, 160, 192, 224, temp2, temp3) |
1370 | HORIZONTAL_PASS_WHT(256, 288, 320, 352, temp4, temp5) |
1371 | HORIZONTAL_PASS_WHT(384, 416, 448, 480, temp6, temp7) |
1372 | VERTICAL_PASS_WHT(0, 8, 16, 24, temp0, temp2, temp4, temp6) |
1373 | VERTICAL_PASS_WHT(4, 12, 20, 28, temp1, temp3, temp5, temp7) |
1374 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
1375 | [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), |
1376 | [temp6]"=&r" (temp6), [temp7]"=&r" (temp7), [temp8]"=&r" (temp8), |
1377 | [temp9]"=&r" (temp9) |
1378 | : [in]"r" (in), [out]"r" (out) |
1379 | : "memory" |
1380 | ); |
1381 | } |
1382 | |
1383 | #undef VERTICAL_PASS_WHT |
1384 | #undef HORIZONTAL_PASS_WHT |
1385 | |
// macro for converting coefficients to bins
// processes 8 coefficients at a time
// The clip to MAX_COEFF_THRESH (0x1f) is branchless: 'shll_s.ph 10'
// saturates each halfword at 0x7fff, so the following 'shrl.ph 10' yields
// at most 0x1f, and 'shll.ph 2' then scales the bin index to a byte offset
// into the int distribution[] array.
// A, B, C, D - offsets in bytes to load from the out buffer
1389 | #define CONVERT_COEFFS_TO_BIN(A, B, C, D) \ |
1390 | "ulw %[temp0], " #A "(%[out]) \n\t" \ |
1391 | "ulw %[temp1], " #B "(%[out]) \n\t" \ |
1392 | "ulw %[temp2], " #C "(%[out]) \n\t" \ |
1393 | "ulw %[temp3], " #D "(%[out]) \n\t" \ |
1394 | "absq_s.ph %[temp0], %[temp0] \n\t" \ |
1395 | "absq_s.ph %[temp1], %[temp1] \n\t" \ |
1396 | "absq_s.ph %[temp2], %[temp2] \n\t" \ |
1397 | "absq_s.ph %[temp3], %[temp3] \n\t" \ |
1398 | "shra.ph %[temp0], %[temp0], 3 \n\t" \ |
1399 | "shra.ph %[temp1], %[temp1], 3 \n\t" \ |
1400 | "shra.ph %[temp2], %[temp2], 3 \n\t" \ |
1401 | "shra.ph %[temp3], %[temp3], 3 \n\t" \ |
1402 | "shll_s.ph %[temp0], %[temp0], 10 \n\t" \ |
1403 | "shll_s.ph %[temp1], %[temp1], 10 \n\t" \ |
1404 | "shll_s.ph %[temp2], %[temp2], 10 \n\t" \ |
1405 | "shll_s.ph %[temp3], %[temp3], 10 \n\t" \ |
1406 | "shrl.ph %[temp0], %[temp0], 10 \n\t" \ |
1407 | "shrl.ph %[temp1], %[temp1], 10 \n\t" \ |
1408 | "shrl.ph %[temp2], %[temp2], 10 \n\t" \ |
1409 | "shrl.ph %[temp3], %[temp3], 10 \n\t" \ |
1410 | "shll.ph %[temp0], %[temp0], 2 \n\t" \ |
1411 | "shll.ph %[temp1], %[temp1], 2 \n\t" \ |
1412 | "shll.ph %[temp2], %[temp2], 2 \n\t" \ |
1413 | "shll.ph %[temp3], %[temp3], 2 \n\t" \ |
1414 | "ext %[temp4], %[temp0], 0, 16 \n\t" \ |
1415 | "ext %[temp0], %[temp0], 16, 16 \n\t" \ |
1416 | "addu %[temp4], %[temp4], %[dist] \n\t" \ |
1417 | "addu %[temp0], %[temp0], %[dist] \n\t" \ |
1418 | "ext %[temp5], %[temp1], 0, 16 \n\t" \ |
1419 | "lw %[temp8], 0(%[temp4]) \n\t" \ |
1420 | "ext %[temp1], %[temp1], 16, 16 \n\t" \ |
1421 | "addu %[temp5], %[temp5], %[dist] \n\t" \ |
1422 | "addiu %[temp8], %[temp8], 1 \n\t" \ |
1423 | "sw %[temp8], 0(%[temp4]) \n\t" \ |
1424 | "lw %[temp8], 0(%[temp0]) \n\t" \ |
1425 | "addu %[temp1], %[temp1], %[dist] \n\t" \ |
1426 | "ext %[temp6], %[temp2], 0, 16 \n\t" \ |
1427 | "addiu %[temp8], %[temp8], 1 \n\t" \ |
1428 | "sw %[temp8], 0(%[temp0]) \n\t" \ |
1429 | "lw %[temp8], 0(%[temp5]) \n\t" \ |
1430 | "ext %[temp2], %[temp2], 16, 16 \n\t" \ |
1431 | "addu %[temp6], %[temp6], %[dist] \n\t" \ |
1432 | "addiu %[temp8], %[temp8], 1 \n\t" \ |
1433 | "sw %[temp8], 0(%[temp5]) \n\t" \ |
1434 | "lw %[temp8], 0(%[temp1]) \n\t" \ |
1435 | "addu %[temp2], %[temp2], %[dist] \n\t" \ |
1436 | "ext %[temp7], %[temp3], 0, 16 \n\t" \ |
1437 | "addiu %[temp8], %[temp8], 1 \n\t" \ |
1438 | "sw %[temp8], 0(%[temp1]) \n\t" \ |
1439 | "lw %[temp8], 0(%[temp6]) \n\t" \ |
1440 | "ext %[temp3], %[temp3], 16, 16 \n\t" \ |
1441 | "addu %[temp7], %[temp7], %[dist] \n\t" \ |
1442 | "addiu %[temp8], %[temp8], 1 \n\t" \ |
1443 | "sw %[temp8], 0(%[temp6]) \n\t" \ |
1444 | "lw %[temp8], 0(%[temp2]) \n\t" \ |
1445 | "addu %[temp3], %[temp3], %[dist] \n\t" \ |
1446 | "addiu %[temp8], %[temp8], 1 \n\t" \ |
1447 | "sw %[temp8], 0(%[temp2]) \n\t" \ |
1448 | "lw %[temp8], 0(%[temp7]) \n\t" \ |
1449 | "addiu %[temp8], %[temp8], 1 \n\t" \ |
1450 | "sw %[temp8], 0(%[temp7]) \n\t" \ |
1451 | "lw %[temp8], 0(%[temp3]) \n\t" \ |
1452 | "addiu %[temp8], %[temp8], 1 \n\t" \ |
1453 | "sw %[temp8], 0(%[temp3]) \n\t" |
1454 | |
1455 | static void CollectHistogram_MIPSdspR2(const uint8_t* ref, const uint8_t* pred, |
1456 | int start_block, int end_block, |
1457 | VP8Histogram* const histo) { |
1458 | int j; |
1459 | int distribution[MAX_COEFF_THRESH + 1] = { 0 }; |
1461 | for (j = start_block; j < end_block; ++j) { |
1462 | int16_t out[16]; |
1463 | int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8; |
1464 | |
1465 | VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out); |
1466 | |
1467 | // Convert coefficients to bin. |
1468 | __asm__ volatile ( |
1469 | CONVERT_COEFFS_TO_BIN( 0, 4, 8, 12) |
1470 | CONVERT_COEFFS_TO_BIN(16, 20, 24, 28) |
1471 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
1472 | [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), |
1473 | [temp6]"=&r" (temp6), [temp7]"=&r" (temp7), [temp8]"=&r" (temp8) |
1474 | : [dist]"r" (distribution), [out]"r" (out), [max_coeff]"r" (max_coeff) |
1475 | : "memory" |
1476 | ); |
1477 | } |
1478 | VP8SetHistogramData(distribution, histo); |
1479 | } |
1480 | |
1481 | #undef CONVERT_COEFFS_TO_BIN |
1482 | |
1483 | //------------------------------------------------------------------------------ |
1484 | // Entry point |
1485 | |
1486 | extern void VP8EncDspInitMIPSdspR2(void); |
1487 | |
1488 | WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void) { |
1489 | VP8FTransform = FTransform_MIPSdspR2; |
1490 | VP8FTransformWHT = FTransformWHT_MIPSdspR2; |
1491 | VP8ITransform = ITransform_MIPSdspR2; |
1492 | |
1493 | VP8TDisto4x4 = Disto4x4_MIPSdspR2; |
1494 | VP8TDisto16x16 = Disto16x16_MIPSdspR2; |
1495 | |
1496 | VP8EncPredLuma16 = Intra16Preds_MIPSdspR2; |
1497 | VP8EncPredChroma8 = IntraChromaPreds_MIPSdspR2; |
1498 | VP8EncPredLuma4 = Intra4Preds_MIPSdspR2; |
1499 | |
1500 | #if !defined(WORK_AROUND_GCC) |
1501 | VP8SSE16x16 = SSE16x16_MIPSdspR2; |
1502 | VP8SSE8x8 = SSE8x8_MIPSdspR2; |
1503 | VP8SSE16x8 = SSE16x8_MIPSdspR2; |
1504 | VP8SSE4x4 = SSE4x4_MIPSdspR2; |
1505 | #endif |
1506 | |
1507 | VP8EncQuantizeBlock = QuantizeBlock_MIPSdspR2; |
1508 | VP8EncQuantize2Blocks = Quantize2Blocks_MIPSdspR2; |
1509 | |
1510 | VP8CollectHistogram = CollectHistogram_MIPSdspR2; |
1511 | } |
1512 | |
1513 | #else // !WEBP_USE_MIPS_DSP_R2 |
1514 | |
1515 | WEBP_DSP_INIT_STUB(VP8EncDspInitMIPSdspR2) |
1516 | |
1517 | #endif // WEBP_USE_MIPS_DSP_R2 |
1518 | |