1 | // Copyright 2014 Google Inc. All Rights Reserved. |
2 | // |
3 | // Use of this source code is governed by a BSD-style license |
4 | // that can be found in the COPYING file in the root of the source |
5 | // tree. An additional intellectual property rights grant can be found |
6 | // in the file PATENTS. All contributing project authors may |
7 | // be found in the AUTHORS file in the root of the source tree. |
8 | // ----------------------------------------------------------------------------- |
9 | // |
10 | // MIPS version of speed-critical encoding functions. |
11 | // |
12 | // Author(s): Darko Laus (darko.laus@imgtec.com) |
13 | // Mirko Raus (mirko.raus@imgtec.com) |
14 | |
15 | #include "./dsp.h" |
16 | |
17 | #if defined(WEBP_USE_MIPS_DSP_R2) |
18 | |
19 | #include "./mips_macro.h" |
20 | #include "../enc/cost_enc.h" |
21 | #include "../enc/vp8i_enc.h" |
22 | |
23 | static const int kC1 = 20091 + (1 << 16); |
24 | static const int kC2 = 35468; |
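// Fixed-point versions of the inverse-transform constants:
// 35468 ~= sqrt(2) * sin(pi / 8) * 65536 and
// 20091 ~= (sqrt(2) * cos(pi / 8) - 1) * 65536. The extra (1 << 16) folded
// into kC1 turns the usual "x + ((x * 20091) >> 16)" form into a single
// multiply: (x * kC1) >> 16 ~= x * sqrt(2) * cos(pi / 8).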
25 | |
26 | // O - output |
27 | // I - input (macro doesn't change it) |
28 | #define ADD_SUB_HALVES_X4(O0, O1, O2, O3, O4, O5, O6, O7, \ |
29 | I0, I1, I2, I3, I4, I5, I6, I7) \ |
30 | "addq.ph %[" #O0 "], %[" #I0 "], %[" #I1 "] \n\t" \ |
31 | "subq.ph %[" #O1 "], %[" #I0 "], %[" #I1 "] \n\t" \ |
32 | "addq.ph %[" #O2 "], %[" #I2 "], %[" #I3 "] \n\t" \ |
33 | "subq.ph %[" #O3 "], %[" #I2 "], %[" #I3 "] \n\t" \ |
34 | "addq.ph %[" #O4 "], %[" #I4 "], %[" #I5 "] \n\t" \ |
35 | "subq.ph %[" #O5 "], %[" #I4 "], %[" #I5 "] \n\t" \ |
36 | "addq.ph %[" #O6 "], %[" #I6 "], %[" #I7 "] \n\t" \ |
37 | "subq.ph %[" #O7 "], %[" #I6 "], %[" #I7 "] \n\t" |
38 | |
39 | // IO - input/output |
40 | #define ABS_X8(IO0, IO1, IO2, IO3, IO4, IO5, IO6, IO7) \ |
41 | "absq_s.ph %[" #IO0 "], %[" #IO0 "] \n\t" \ |
42 | "absq_s.ph %[" #IO1 "], %[" #IO1 "] \n\t" \ |
43 | "absq_s.ph %[" #IO2 "], %[" #IO2 "] \n\t" \ |
44 | "absq_s.ph %[" #IO3 "], %[" #IO3 "] \n\t" \ |
45 | "absq_s.ph %[" #IO4 "], %[" #IO4 "] \n\t" \ |
46 | "absq_s.ph %[" #IO5 "], %[" #IO5 "] \n\t" \ |
47 | "absq_s.ph %[" #IO6 "], %[" #IO6 "] \n\t" \ |
48 | "absq_s.ph %[" #IO7 "], %[" #IO7 "] \n\t" |
49 | |
// dpa.w.ph $ac0, temp0, temp1
// $ac += temp0[31..16] * temp1[31..16] + temp0[15..0] * temp1[15..0]
// dpax.w.ph $ac0, temp0, temp1
// $ac += temp0[31..16] * temp1[15..0] + temp0[15..0] * temp1[31..16]
54 | // O - output |
55 | // I - input (macro doesn't change it) |
56 | #define MUL_HALF(O0, I0, I1, I2, I3, I4, I5, I6, I7, \ |
57 | I8, I9, I10, I11, I12, I13, I14, I15) \ |
58 | "mult $ac0, $zero, $zero \n\t" \ |
59 | "dpa.w.ph $ac0, %[" #I2 "], %[" #I0 "] \n\t" \ |
60 | "dpax.w.ph $ac0, %[" #I5 "], %[" #I6 "] \n\t" \ |
61 | "dpa.w.ph $ac0, %[" #I8 "], %[" #I9 "] \n\t" \ |
62 | "dpax.w.ph $ac0, %[" #I11 "], %[" #I4 "] \n\t" \ |
63 | "dpa.w.ph $ac0, %[" #I12 "], %[" #I7 "] \n\t" \ |
64 | "dpax.w.ph $ac0, %[" #I13 "], %[" #I1 "] \n\t" \ |
65 | "dpa.w.ph $ac0, %[" #I14 "], %[" #I3 "] \n\t" \ |
66 | "dpax.w.ph $ac0, %[" #I15 "], %[" #I10 "] \n\t" \ |
67 | "mflo %[" #O0 "], $ac0 \n\t" |
68 | |
69 | #define OUTPUT_EARLY_CLOBBER_REGS_17() \ |
70 | OUTPUT_EARLY_CLOBBER_REGS_10(), \ |
71 | [temp11]"=&r"(temp11), [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), \ |
72 | [temp14]"=&r"(temp14), [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), \ |
73 | [temp17]"=&r"(temp17) |
74 | |
// macro for one horizontal pass in FTransform
// temp0..temp15 hold tmp[0]..tmp[15]
// A - source row index; loads use the byte offset A * BPS into the src and
//     ref buffers
// TEMP0..TEMP3 - registers for corresponding tmp elements
79 | #define HORIZONTAL_PASS(A, TEMP0, TEMP1, TEMP2, TEMP3) \ |
80 | "lw %[" #TEMP0 "], 0(%[args]) \n\t" \ |
81 | "lw %[" #TEMP1 "], 4(%[args]) \n\t" \ |
82 | "lw %[" #TEMP2 "], " XSTR(BPS) "*" #A "(%[" #TEMP0 "]) \n\t" \ |
83 | "lw %[" #TEMP3 "], " XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \ |
84 | "preceu.ph.qbl %[" #TEMP0 "], %[" #TEMP2 "] \n\t" \ |
85 | "preceu.ph.qbl %[" #TEMP1 "], %[" #TEMP3 "] \n\t" \ |
86 | "preceu.ph.qbr %[" #TEMP2 "], %[" #TEMP2 "] \n\t" \ |
87 | "preceu.ph.qbr %[" #TEMP3 "], %[" #TEMP3 "] \n\t" \ |
88 | "subq.ph %[" #TEMP0 "], %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \ |
89 | "subq.ph %[" #TEMP2 "], %[" #TEMP2 "], %[" #TEMP3 "] \n\t" \ |
90 | "rotr %[" #TEMP0 "], %[" #TEMP0 "], 16 \n\t" \ |
91 | "addq.ph %[" #TEMP1 "], %[" #TEMP2 "], %[" #TEMP0 "] \n\t" \ |
92 | "subq.ph %[" #TEMP3 "], %[" #TEMP2 "], %[" #TEMP0 "] \n\t" \ |
93 | "seh %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \ |
94 | "sra %[temp16], %[" #TEMP1 "], 16 \n\t" \ |
95 | "seh %[temp19], %[" #TEMP3 "] \n\t" \ |
96 | "sra %[" #TEMP3 "], %[" #TEMP3 "], 16 \n\t" \ |
97 | "subu %[" #TEMP2 "], %[" #TEMP0 "], %[temp16] \n\t" \ |
98 | "addu %[" #TEMP0 "], %[" #TEMP0 "], %[temp16] \n\t" \ |
99 | "mul %[temp17], %[temp19], %[c2217] \n\t" \ |
100 | "mul %[temp18], %[" #TEMP3 "], %[c5352] \n\t" \ |
101 | "mul %[" #TEMP1 "], %[temp19], %[c5352] \n\t" \ |
102 | "mul %[temp16], %[" #TEMP3 "], %[c2217] \n\t" \ |
103 | "sll %[" #TEMP2 "], %[" #TEMP2 "], 3 \n\t" \ |
104 | "sll %[" #TEMP0 "], %[" #TEMP0 "], 3 \n\t" \ |
105 | "subu %[" #TEMP3 "], %[temp17], %[temp18] \n\t" \ |
106 | "addu %[" #TEMP1 "], %[temp16], %[" #TEMP1 "] \n\t" \ |
107 | "addiu %[" #TEMP3 "], %[" #TEMP3 "], 937 \n\t" \ |
108 | "addiu %[" #TEMP1 "], %[" #TEMP1 "], 1812 \n\t" \ |
109 | "sra %[" #TEMP3 "], %[" #TEMP3 "], 9 \n\t" \ |
110 | "sra %[" #TEMP1 "], %[" #TEMP1 "], 9 \n\t" |
111 | |
// macro for one vertical pass in FTransform
// temp0..temp15 hold tmp[0]..tmp[15]
// A..D - offsets in bytes to store to out buffer
// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
116 | #define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12) \ |
117 | "addu %[temp16], %[" #TEMP0 "], %[" #TEMP12 "] \n\t" \ |
118 | "subu %[temp19], %[" #TEMP0 "], %[" #TEMP12 "] \n\t" \ |
119 | "addu %[temp17], %[" #TEMP4 "], %[" #TEMP8 "] \n\t" \ |
120 | "subu %[temp18], %[" #TEMP4 "], %[" #TEMP8 "] \n\t" \ |
121 | "mul %[" #TEMP8 "], %[temp19], %[c2217] \n\t" \ |
122 | "mul %[" #TEMP12 "], %[temp18], %[c2217] \n\t" \ |
123 | "mul %[" #TEMP4 "], %[temp19], %[c5352] \n\t" \ |
124 | "mul %[temp18], %[temp18], %[c5352] \n\t" \ |
125 | "addiu %[temp16], %[temp16], 7 \n\t" \ |
126 | "addu %[" #TEMP0 "], %[temp16], %[temp17] \n\t" \ |
127 | "sra %[" #TEMP0 "], %[" #TEMP0 "], 4 \n\t" \ |
128 | "addu %[" #TEMP12 "], %[" #TEMP12 "], %[" #TEMP4 "] \n\t" \ |
129 | "subu %[" #TEMP4 "], %[temp16], %[temp17] \n\t" \ |
130 | "sra %[" #TEMP4 "], %[" #TEMP4 "], 4 \n\t" \ |
131 | "addiu %[" #TEMP8 "], %[" #TEMP8 "], 30000 \n\t" \ |
132 | "addiu %[" #TEMP12 "], %[" #TEMP12 "], 12000 \n\t" \ |
133 | "addiu %[" #TEMP8 "], %[" #TEMP8 "], 21000 \n\t" \ |
134 | "subu %[" #TEMP8 "], %[" #TEMP8 "], %[temp18] \n\t" \ |
135 | "sra %[" #TEMP12 "], %[" #TEMP12 "], 16 \n\t" \ |
136 | "sra %[" #TEMP8 "], %[" #TEMP8 "], 16 \n\t" \ |
137 | "addiu %[temp16], %[" #TEMP12 "], 1 \n\t" \ |
138 | "movn %[" #TEMP12 "], %[temp16], %[temp19] \n\t" \ |
139 | "sh %[" #TEMP0 "], " #A "(%[temp20]) \n\t" \ |
140 | "sh %[" #TEMP4 "], " #C "(%[temp20]) \n\t" \ |
141 | "sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \ |
142 | "sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t" |
143 | |
144 | static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { |
145 | const int c2217 = 2217; |
146 | const int c5352 = 5352; |
147 | int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8; |
148 | int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16; |
149 | int temp17, temp18, temp19, temp20; |
150 | const int* const args[3] = |
151 | { (const int*)src, (const int*)ref, (const int*)out }; |
152 | |
153 | __asm__ volatile ( |
154 | HORIZONTAL_PASS(0, temp0, temp1, temp2, temp3) |
155 | HORIZONTAL_PASS(1, temp4, temp5, temp6, temp7) |
156 | HORIZONTAL_PASS(2, temp8, temp9, temp10, temp11) |
157 | HORIZONTAL_PASS(3, temp12, temp13, temp14, temp15) |
158 | "lw %[temp20], 8(%[args]) \n\t" |
159 | VERTICAL_PASS(0, 8, 16, 24, temp0, temp4, temp8, temp12) |
160 | VERTICAL_PASS(2, 10, 18, 26, temp1, temp5, temp9, temp13) |
161 | VERTICAL_PASS(4, 12, 20, 28, temp2, temp6, temp10, temp14) |
162 | VERTICAL_PASS(6, 14, 22, 30, temp3, temp7, temp11, temp15) |
163 | OUTPUT_EARLY_CLOBBER_REGS_18(), |
164 | [temp0]"=&r" (temp0), [temp19]"=&r" (temp19), [temp20]"=&r" (temp20) |
165 | : [args]"r" (args), [c2217]"r" (c2217), [c5352]"r" (c5352) |
166 | : "memory" , "hi" , "lo" |
167 | ); |
168 | } |
169 | |
170 | #undef VERTICAL_PASS |
171 | #undef HORIZONTAL_PASS |
172 | |
173 | static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, |
174 | uint8_t* dst) { |
175 | int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9; |
176 | int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18; |
177 | |
178 | __asm__ volatile ( |
179 | "ulw %[temp1], 0(%[in]) \n\t" |
180 | "ulw %[temp2], 16(%[in]) \n\t" |
181 | LOAD_IN_X2(temp5, temp6, 24, 26) |
182 | ADD_SUB_HALVES(temp3, temp4, temp1, temp2) |
183 | LOAD_IN_X2(temp1, temp2, 8, 10) |
184 | MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14, |
185 | temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6, |
186 | temp13, temp11, temp14, temp12) |
187 | INSERT_HALF_X2(temp8, temp7, temp10, temp9) |
188 | "ulw %[temp17], 4(%[in]) \n\t" |
189 | "ulw %[temp18], 20(%[in]) \n\t" |
190 | ADD_SUB_HALVES(temp1, temp2, temp3, temp8) |
191 | ADD_SUB_HALVES(temp5, temp6, temp4, temp7) |
192 | ADD_SUB_HALVES(temp7, temp8, temp17, temp18) |
193 | LOAD_IN_X2(temp17, temp18, 12, 14) |
194 | LOAD_IN_X2(temp9, temp10, 28, 30) |
195 | MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17, |
196 | temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10, |
197 | temp15, temp4, temp16, temp17) |
198 | INSERT_HALF_X2(temp11, temp12, temp13, temp14) |
199 | ADD_SUB_HALVES(temp17, temp8, temp8, temp11) |
200 | ADD_SUB_HALVES(temp3, temp4, temp7, temp12) |
201 | |
202 | // horizontal |
203 | SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6) |
204 | INSERT_HALF_X2(temp1, temp6, temp5, temp2) |
205 | SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8) |
206 | "repl.ph %[temp2], 0x4 \n\t" |
207 | INSERT_HALF_X2(temp3, temp8, temp17, temp4) |
208 | "addq.ph %[temp1], %[temp1], %[temp2] \n\t" |
209 | "addq.ph %[temp6], %[temp6], %[temp2] \n\t" |
210 | ADD_SUB_HALVES(temp2, temp4, temp1, temp3) |
211 | ADD_SUB_HALVES(temp5, temp7, temp6, temp8) |
212 | MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18, |
213 | temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15, |
214 | temp6, temp17, temp8, temp18) |
215 | MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16, |
216 | temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14, |
217 | temp18, temp12, temp17, temp16) |
218 | INSERT_HALF_X2(temp1, temp3, temp9, temp13) |
219 | INSERT_HALF_X2(temp6, temp8, temp11, temp15) |
220 | SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15, |
221 | temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8, |
222 | temp6) |
223 | PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13, |
224 | temp16, temp11, temp10, temp15, temp14) |
225 | LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, ref, |
226 | 0, 0, 0, 0, |
227 | 0, 1, 2, 3, |
228 | BPS) |
229 | CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10, |
230 | temp11, temp10, temp11, temp14, temp15) |
231 | STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11, |
232 | temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4, |
233 | dst, 0, 1, 2, 3, BPS) |
234 | |
235 | OUTPUT_EARLY_CLOBBER_REGS_18() |
236 | : [dst]"r" (dst), [in]"r" (in), [kC1]"r" (kC1), [kC2]"r" (kC2), [ref]"r" (ref) |
237 | : "memory" , "hi" , "lo" |
238 | ); |
239 | } |
240 | |
241 | static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, |
242 | int do_two) { |
243 | ITransformOne(ref, in, dst); |
244 | if (do_two) { |
245 | ITransformOne(ref + 4, in + 16, dst + 4); |
246 | } |
247 | } |
248 | |
249 | static int Disto4x4(const uint8_t* const a, const uint8_t* const b, |
250 | const uint16_t* const w) { |
251 | int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9; |
252 | int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17; |
253 | |
254 | __asm__ volatile ( |
255 | LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, a, |
256 | 0, 0, 0, 0, |
257 | 0, 1, 2, 3, |
258 | BPS) |
CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp9, temp10, temp11,
260 | temp12, temp1, temp2, temp3, temp4) |
261 | ADD_SUB_HALVES_X4(temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, |
262 | temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12) |
263 | PACK_2_HALVES_TO_WORD(temp9, temp10, temp11, temp12, temp1, temp3, temp5, |
264 | temp7, temp2, temp4, temp6, temp8) |
265 | ADD_SUB_HALVES_X4(temp2, temp4, temp6, temp8, temp9, temp1, temp3, temp10, |
266 | temp1, temp9, temp3, temp10, temp5, temp11, temp7, temp12) |
267 | ADD_SUB_HALVES_X4(temp5, temp11, temp7, temp2, temp9, temp3, temp6, temp12, |
268 | temp2, temp9, temp6, temp3, temp4, temp1, temp8, temp10) |
269 | ADD_SUB_HALVES_X4(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2, |
270 | temp5, temp7, temp11, temp2, temp9, temp6, temp3, temp12) |
271 | ABS_X8(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2) |
272 | LOAD_WITH_OFFSET_X4(temp3, temp6, temp9, temp12, w, |
273 | 0, 4, 8, 12, |
274 | 0, 0, 0, 0, |
275 | 0) |
276 | LOAD_WITH_OFFSET_X4(temp13, temp14, temp15, temp16, w, |
277 | 0, 4, 8, 12, |
278 | 1, 1, 1, 1, |
279 | 16) |
280 | MUL_HALF(temp17, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, |
281 | temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16) |
282 | LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, b, |
283 | 0, 0, 0, 0, |
284 | 0, 1, 2, 3, |
285 | BPS) |
CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp9, temp10, temp11,
287 | temp12, temp1, temp2, temp3, temp4) |
288 | ADD_SUB_HALVES_X4(temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, |
289 | temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12) |
290 | PACK_2_HALVES_TO_WORD(temp9, temp10, temp11, temp12, temp1, temp3, temp5, |
291 | temp7, temp2, temp4, temp6, temp8) |
292 | ADD_SUB_HALVES_X4(temp2, temp4, temp6, temp8, temp9, temp1, temp3, temp10, |
293 | temp1, temp9, temp3, temp10, temp5, temp11, temp7, temp12) |
294 | ADD_SUB_HALVES_X4(temp5, temp11, temp7, temp2, temp9, temp3, temp6, temp12, |
295 | temp2, temp9, temp6, temp3, temp4, temp1, temp8, temp10) |
296 | ADD_SUB_HALVES_X4(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2, |
297 | temp5, temp7, temp11, temp2, temp9, temp6, temp3, temp12) |
298 | ABS_X8(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2) |
299 | LOAD_WITH_OFFSET_X4(temp3, temp6, temp9, temp12, w, |
300 | 0, 4, 8, 12, |
301 | 0, 0, 0, 0, |
302 | 0) |
303 | LOAD_WITH_OFFSET_X4(temp13, temp14, temp15, temp16, w, |
304 | 0, 4, 8, 12, |
305 | 1, 1, 1, 1, |
306 | 16) |
307 | MUL_HALF(temp3, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, |
308 | temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16) |
309 | OUTPUT_EARLY_CLOBBER_REGS_17() |
310 | : [a]"r" (a), [b]"r" (b), [w]"r" (w) |
311 | : "memory" , "hi" , "lo" |
312 | ); |
313 | return abs(temp3 - temp17) >> 5; |
314 | } |
315 | |
316 | static int Disto16x16(const uint8_t* const a, const uint8_t* const b, |
317 | const uint16_t* const w) { |
318 | int D = 0; |
319 | int x, y; |
320 | for (y = 0; y < 16 * BPS; y += 4 * BPS) { |
321 | for (x = 0; x < 16; x += 4) { |
322 | D += Disto4x4(a + x + y, b + x + y, w); |
323 | } |
324 | } |
325 | return D; |
326 | } |
327 | |
328 | //------------------------------------------------------------------------------ |
329 | // Intra predictions |
330 | |
331 | #define FILL_PART(J, SIZE) \ |
332 | "usw %[value], 0+" #J "*" XSTR(BPS) "(%[dst]) \n\t" \ |
333 | "usw %[value], 4+" #J "*" XSTR(BPS) "(%[dst]) \n\t" \ |
334 | ".if " #SIZE " == 16 \n\t" \ |
335 | "usw %[value], 8+" #J "*" XSTR(BPS) "(%[dst]) \n\t" \ |
336 | "usw %[value], 12+" #J "*" XSTR(BPS) "(%[dst]) \n\t" \ |
337 | ".endif \n\t" |
338 | |
339 | #define FILL_8_OR_16(DST, VALUE, SIZE) do { \ |
340 | int value = (VALUE); \ |
341 | __asm__ volatile ( \ |
342 | "replv.qb %[value], %[value] \n\t" \ |
343 | FILL_PART( 0, SIZE) \ |
344 | FILL_PART( 1, SIZE) \ |
345 | FILL_PART( 2, SIZE) \ |
346 | FILL_PART( 3, SIZE) \ |
347 | FILL_PART( 4, SIZE) \ |
348 | FILL_PART( 5, SIZE) \ |
349 | FILL_PART( 6, SIZE) \ |
350 | FILL_PART( 7, SIZE) \ |
351 | ".if " #SIZE " == 16 \n\t" \ |
352 | FILL_PART( 8, 16) \ |
353 | FILL_PART( 9, 16) \ |
354 | FILL_PART(10, 16) \ |
355 | FILL_PART(11, 16) \ |
356 | FILL_PART(12, 16) \ |
357 | FILL_PART(13, 16) \ |
358 | FILL_PART(14, 16) \ |
359 | FILL_PART(15, 16) \ |
360 | ".endif \n\t" \ |
361 | : [value]"+&r"(value) \ |
362 | : [dst]"r"((DST)) \ |
363 | : "memory" \ |
364 | ); \ |
365 | } while (0) |
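
// FILL_8_OR_16 is equivalent to the portable fill, a sketch:
//   for (j = 0; j < (SIZE); ++j) memset((DST) + j * BPS, (VALUE), (SIZE));
// replv.qb broadcasts the byte first so each usw stores four pixels.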
366 | |
367 | #define VERTICAL_PRED(DST, TOP, SIZE) \ |
368 | static WEBP_INLINE void VerticalPred##SIZE(uint8_t* (DST), \ |
369 | const uint8_t* (TOP)) { \ |
370 | int j; \ |
371 | if ((TOP)) { \ |
372 | for (j = 0; j < (SIZE); ++j) memcpy((DST) + j * BPS, (TOP), (SIZE)); \ |
373 | } else { \ |
374 | FILL_8_OR_16((DST), 127, (SIZE)); \ |
375 | } \ |
376 | } |
377 | |
378 | VERTICAL_PRED(dst, top, 8) |
379 | VERTICAL_PRED(dst, top, 16) |
380 | |
381 | #undef VERTICAL_PRED |
382 | |
383 | #define HORIZONTAL_PRED(DST, LEFT, SIZE) \ |
384 | static WEBP_INLINE void HorizontalPred##SIZE(uint8_t* (DST), \ |
385 | const uint8_t* (LEFT)) { \ |
386 | if (LEFT) { \ |
387 | int j; \ |
388 | for (j = 0; j < (SIZE); ++j) { \ |
389 | memset((DST) + j * BPS, (LEFT)[j], (SIZE)); \ |
390 | } \ |
391 | } else { \ |
392 | FILL_8_OR_16((DST), 129, (SIZE)); \ |
393 | } \ |
394 | } |
395 | |
396 | HORIZONTAL_PRED(dst, left, 8) |
397 | HORIZONTAL_PRED(dst, left, 16) |
398 | |
399 | #undef HORIZONTAL_PRED |
400 | |
401 | #define CLIPPING() \ |
402 | "preceu.ph.qbl %[temp2], %[temp0] \n\t" \ |
403 | "preceu.ph.qbr %[temp0], %[temp0] \n\t" \ |
404 | "preceu.ph.qbl %[temp3], %[temp1] \n\t" \ |
405 | "preceu.ph.qbr %[temp1], %[temp1] \n\t" \ |
406 | "addu.ph %[temp2], %[temp2], %[leftY_1] \n\t" \ |
407 | "addu.ph %[temp0], %[temp0], %[leftY_1] \n\t" \ |
408 | "addu.ph %[temp3], %[temp3], %[leftY_1] \n\t" \ |
409 | "addu.ph %[temp1], %[temp1], %[leftY_1] \n\t" \ |
410 | "shll_s.ph %[temp2], %[temp2], 7 \n\t" \ |
411 | "shll_s.ph %[temp0], %[temp0], 7 \n\t" \ |
412 | "shll_s.ph %[temp3], %[temp3], 7 \n\t" \ |
413 | "shll_s.ph %[temp1], %[temp1], 7 \n\t" \ |
414 | "precrqu_s.qb.ph %[temp0], %[temp2], %[temp0] \n\t" \ |
415 | "precrqu_s.qb.ph %[temp1], %[temp3], %[temp1] \n\t" |
416 | |
417 | #define CLIP_8B_TO_DST(DST, LEFT, TOP, SIZE) do { \ |
418 | int leftY_1 = ((int)(LEFT)[y] << 16) + (LEFT)[y]; \ |
419 | int temp0, temp1, temp2, temp3; \ |
420 | __asm__ volatile ( \ |
421 | "replv.ph %[leftY_1], %[leftY_1] \n\t" \ |
422 | "ulw %[temp0], 0(%[top]) \n\t" \ |
423 | "ulw %[temp1], 4(%[top]) \n\t" \ |
424 | "subu.ph %[leftY_1], %[leftY_1], %[left_1] \n\t" \ |
425 | CLIPPING() \ |
426 | "usw %[temp0], 0(%[dst]) \n\t" \ |
427 | "usw %[temp1], 4(%[dst]) \n\t" \ |
428 | ".if " #SIZE " == 16 \n\t" \ |
429 | "ulw %[temp0], 8(%[top]) \n\t" \ |
430 | "ulw %[temp1], 12(%[top]) \n\t" \ |
431 | CLIPPING() \ |
432 | "usw %[temp0], 8(%[dst]) \n\t" \ |
433 | "usw %[temp1], 12(%[dst]) \n\t" \ |
434 | ".endif \n\t" \ |
435 | : [leftY_1]"+&r"(leftY_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), \ |
436 | [temp2]"=&r"(temp2), [temp3]"=&r"(temp3) \ |
437 | : [left_1]"r"(left_1), [top]"r"((TOP)), [dst]"r"((DST)) \ |
438 | : "memory" \ |
439 | ); \ |
440 | } while (0) |
441 | |
442 | #define CLIP_TO_DST(DST, LEFT, TOP, SIZE) do { \ |
443 | int y; \ |
444 | const int left_1 = ((int)(LEFT)[-1] << 16) + (LEFT)[-1]; \ |
445 | for (y = 0; y < (SIZE); ++y) { \ |
446 | CLIP_8B_TO_DST((DST), (LEFT), (TOP), (SIZE)); \ |
447 | (DST) += BPS; \ |
448 | } \ |
449 | } while (0) |
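
// Together these macros implement the TrueMotion rule; a sketch per pixel:
//   dst[x + y * BPS] = clip_8b(top[x] + left[y] - left[-1]);
// the shll_s.ph / precrqu_s.qb.ph pair in CLIPPING performs the saturation
// of the 16-bit sums down to [0, 255].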
450 | |
451 | #define TRUE_MOTION(DST, LEFT, TOP, SIZE) \ |
452 | static WEBP_INLINE void TrueMotion##SIZE(uint8_t* (DST), const uint8_t* (LEFT),\ |
453 | const uint8_t* (TOP)) { \ |
454 | if ((LEFT) != NULL) { \ |
455 | if ((TOP) != NULL) { \ |
456 | CLIP_TO_DST((DST), (LEFT), (TOP), (SIZE)); \ |
457 | } else { \ |
458 | HorizontalPred##SIZE((DST), (LEFT)); \ |
459 | } \ |
460 | } else { \ |
461 | /* true motion without left samples (hence: with default 129 value) */ \ |
462 | /* is equivalent to VE prediction where you just copy the top samples. */ \ |
463 | /* Note that if top samples are not available, the default value is */ \ |
464 | /* then 129, and not 127 as in the VerticalPred case. */ \ |
465 | if ((TOP) != NULL) { \ |
466 | VerticalPred##SIZE((DST), (TOP)); \ |
467 | } else { \ |
468 | FILL_8_OR_16((DST), 129, (SIZE)); \ |
469 | } \ |
470 | } \ |
471 | } |
472 | |
473 | TRUE_MOTION(dst, left, top, 8) |
474 | TRUE_MOTION(dst, left, top, 16) |
475 | |
476 | #undef TRUE_MOTION |
477 | #undef CLIP_TO_DST |
478 | #undef CLIP_8B_TO_DST |
479 | #undef CLIPPING |
480 | |
481 | static WEBP_INLINE void DCMode16(uint8_t* dst, const uint8_t* left, |
482 | const uint8_t* top) { |
483 | int DC, DC1; |
484 | int temp0, temp1, temp2, temp3; |
485 | |
486 | __asm__ volatile( |
487 | "beqz %[top], 2f \n\t" |
488 | LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, top, |
489 | 0, 4, 8, 12, |
490 | 0, 0, 0, 0, |
491 | 0) |
492 | "raddu.w.qb %[temp0], %[temp0] \n\t" |
493 | "raddu.w.qb %[temp1], %[temp1] \n\t" |
494 | "raddu.w.qb %[temp2], %[temp2] \n\t" |
495 | "raddu.w.qb %[temp3], %[temp3] \n\t" |
496 | "addu %[temp0], %[temp0], %[temp1] \n\t" |
497 | "addu %[temp2], %[temp2], %[temp3] \n\t" |
498 | "addu %[DC], %[temp0], %[temp2] \n\t" |
499 | "move %[DC1], %[DC] \n\t" |
500 | "beqz %[left], 1f \n\t" |
501 | LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, left, |
502 | 0, 4, 8, 12, |
503 | 0, 0, 0, 0, |
504 | 0) |
505 | "raddu.w.qb %[temp0], %[temp0] \n\t" |
506 | "raddu.w.qb %[temp1], %[temp1] \n\t" |
507 | "raddu.w.qb %[temp2], %[temp2] \n\t" |
508 | "raddu.w.qb %[temp3], %[temp3] \n\t" |
509 | "addu %[temp0], %[temp0], %[temp1] \n\t" |
510 | "addu %[temp2], %[temp2], %[temp3] \n\t" |
511 | "addu %[DC1], %[temp0], %[temp2] \n\t" |
512 | "1: \n\t" |
513 | "addu %[DC], %[DC], %[DC1] \n\t" |
514 | "j 3f \n\t" |
515 | "2: \n\t" |
516 | "beqz %[left], 4f \n\t" |
517 | LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, left, |
518 | 0, 4, 8, 12, |
519 | 0, 0, 0, 0, |
520 | 0) |
521 | "raddu.w.qb %[temp0], %[temp0] \n\t" |
522 | "raddu.w.qb %[temp1], %[temp1] \n\t" |
523 | "raddu.w.qb %[temp2], %[temp2] \n\t" |
524 | "raddu.w.qb %[temp3], %[temp3] \n\t" |
525 | "addu %[temp0], %[temp0], %[temp1] \n\t" |
526 | "addu %[temp2], %[temp2], %[temp3] \n\t" |
527 | "addu %[DC], %[temp0], %[temp2] \n\t" |
528 | "addu %[DC], %[DC], %[DC] \n\t" |
529 | "3: \n\t" |
530 | "shra_r.w %[DC], %[DC], 5 \n\t" |
531 | "j 5f \n\t" |
532 | "4: \n\t" |
533 | "li %[DC], 0x80 \n\t" |
534 | "5: \n\t" |
535 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [DC]"=&r" (DC), |
536 | [temp2]"=&r" (temp2), [temp3]"=&r" (temp3), [DC1]"=&r" (DC1) |
537 | : [left]"r" (left), [top]"r" (top) |
538 | : "memory" |
539 | ); |
540 | |
541 | FILL_8_OR_16(dst, DC, 16); |
542 | } |
543 | |
544 | static WEBP_INLINE void DCMode8(uint8_t* dst, const uint8_t* left, |
545 | const uint8_t* top) { |
546 | int DC, DC1; |
547 | int temp0, temp1, temp2, temp3; |
548 | |
549 | __asm__ volatile( |
550 | "beqz %[top], 2f \n\t" |
551 | "ulw %[temp0], 0(%[top]) \n\t" |
552 | "ulw %[temp1], 4(%[top]) \n\t" |
553 | "raddu.w.qb %[temp0], %[temp0] \n\t" |
554 | "raddu.w.qb %[temp1], %[temp1] \n\t" |
555 | "addu %[DC], %[temp0], %[temp1] \n\t" |
556 | "move %[DC1], %[DC] \n\t" |
557 | "beqz %[left], 1f \n\t" |
558 | "ulw %[temp2], 0(%[left]) \n\t" |
559 | "ulw %[temp3], 4(%[left]) \n\t" |
560 | "raddu.w.qb %[temp2], %[temp2] \n\t" |
561 | "raddu.w.qb %[temp3], %[temp3] \n\t" |
562 | "addu %[DC1], %[temp2], %[temp3] \n\t" |
563 | "1: \n\t" |
564 | "addu %[DC], %[DC], %[DC1] \n\t" |
565 | "j 3f \n\t" |
566 | "2: \n\t" |
567 | "beqz %[left], 4f \n\t" |
568 | "ulw %[temp2], 0(%[left]) \n\t" |
569 | "ulw %[temp3], 4(%[left]) \n\t" |
570 | "raddu.w.qb %[temp2], %[temp2] \n\t" |
571 | "raddu.w.qb %[temp3], %[temp3] \n\t" |
572 | "addu %[DC], %[temp2], %[temp3] \n\t" |
573 | "addu %[DC], %[DC], %[DC] \n\t" |
574 | "3: \n\t" |
575 | "shra_r.w %[DC], %[DC], 4 \n\t" |
576 | "j 5f \n\t" |
577 | "4: \n\t" |
578 | "li %[DC], 0x80 \n\t" |
579 | "5: \n\t" |
580 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [DC]"=&r" (DC), |
581 | [temp2]"=&r" (temp2), [temp3]"=&r" (temp3), [DC1]"=&r" (DC1) |
582 | : [left]"r" (left), [top]"r" (top) |
583 | : "memory" |
584 | ); |
585 | |
586 | FILL_8_OR_16(dst, DC, 8); |
587 | } |
588 | |
589 | static void DC4(uint8_t* dst, const uint8_t* top) { |
590 | int temp0, temp1; |
591 | __asm__ volatile( |
592 | "ulw %[temp0], 0(%[top]) \n\t" |
593 | "ulw %[temp1], -5(%[top]) \n\t" |
594 | "raddu.w.qb %[temp0], %[temp0] \n\t" |
595 | "raddu.w.qb %[temp1], %[temp1] \n\t" |
596 | "addu %[temp0], %[temp0], %[temp1] \n\t" |
597 | "addiu %[temp0], %[temp0], 4 \n\t" |
598 | "srl %[temp0], %[temp0], 3 \n\t" |
599 | "replv.qb %[temp0], %[temp0] \n\t" |
600 | "usw %[temp0], 0*" XSTR(BPS) "(%[dst]) \n\t" |
601 | "usw %[temp0], 1*" XSTR(BPS) "(%[dst]) \n\t" |
602 | "usw %[temp0], 2*" XSTR(BPS) "(%[dst]) \n\t" |
603 | "usw %[temp0], 3*" XSTR(BPS) "(%[dst]) \n\t" |
604 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1) |
605 | : [top]"r" (top), [dst]"r" (dst) |
606 | : "memory" |
607 | ); |
608 | } |
609 | |
610 | static void TM4(uint8_t* dst, const uint8_t* top) { |
611 | int a10, a32, temp0, temp1, temp2, temp3, temp4, temp5; |
612 | const int c35 = 0xff00ff; |
613 | __asm__ volatile ( |
614 | "lbu %[temp1], 0(%[top]) \n\t" |
615 | "lbu %[a10], 1(%[top]) \n\t" |
616 | "lbu %[temp2], 2(%[top]) \n\t" |
617 | "lbu %[a32], 3(%[top]) \n\t" |
618 | "ulw %[temp0], -5(%[top]) \n\t" |
619 | "lbu %[temp4], -1(%[top]) \n\t" |
620 | "append %[a10], %[temp1], 16 \n\t" |
621 | "append %[a32], %[temp2], 16 \n\t" |
622 | "replv.ph %[temp4], %[temp4] \n\t" |
623 | "shrl.ph %[temp1], %[temp0], 8 \n\t" |
624 | "and %[temp0], %[temp0], %[c35] \n\t" |
625 | "subu.ph %[temp1], %[temp1], %[temp4] \n\t" |
626 | "subu.ph %[temp0], %[temp0], %[temp4] \n\t" |
627 | "srl %[temp2], %[temp1], 16 \n\t" |
628 | "srl %[temp3], %[temp0], 16 \n\t" |
629 | "replv.ph %[temp2], %[temp2] \n\t" |
630 | "replv.ph %[temp3], %[temp3] \n\t" |
631 | "replv.ph %[temp4], %[temp1] \n\t" |
632 | "replv.ph %[temp5], %[temp0] \n\t" |
633 | "addu.ph %[temp0], %[temp3], %[a10] \n\t" |
634 | "addu.ph %[temp1], %[temp3], %[a32] \n\t" |
635 | "addu.ph %[temp3], %[temp2], %[a10] \n\t" |
636 | "addu.ph %[temp2], %[temp2], %[a32] \n\t" |
637 | "shll_s.ph %[temp0], %[temp0], 7 \n\t" |
638 | "shll_s.ph %[temp1], %[temp1], 7 \n\t" |
639 | "shll_s.ph %[temp3], %[temp3], 7 \n\t" |
640 | "shll_s.ph %[temp2], %[temp2], 7 \n\t" |
641 | "precrqu_s.qb.ph %[temp0], %[temp1], %[temp0] \n\t" |
642 | "precrqu_s.qb.ph %[temp1], %[temp2], %[temp3] \n\t" |
643 | "addu.ph %[temp2], %[temp5], %[a10] \n\t" |
644 | "addu.ph %[temp3], %[temp5], %[a32] \n\t" |
645 | "addu.ph %[temp5], %[temp4], %[a10] \n\t" |
646 | "addu.ph %[temp4], %[temp4], %[a32] \n\t" |
647 | "shll_s.ph %[temp2], %[temp2], 7 \n\t" |
648 | "shll_s.ph %[temp3], %[temp3], 7 \n\t" |
649 | "shll_s.ph %[temp4], %[temp4], 7 \n\t" |
650 | "shll_s.ph %[temp5], %[temp5], 7 \n\t" |
651 | "precrqu_s.qb.ph %[temp2], %[temp3], %[temp2] \n\t" |
652 | "precrqu_s.qb.ph %[temp3], %[temp4], %[temp5] \n\t" |
653 | "usw %[temp1], 0*" XSTR(BPS) "(%[dst]) \n\t" |
654 | "usw %[temp0], 1*" XSTR(BPS) "(%[dst]) \n\t" |
655 | "usw %[temp3], 2*" XSTR(BPS) "(%[dst]) \n\t" |
656 | "usw %[temp2], 3*" XSTR(BPS) "(%[dst]) \n\t" |
657 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
658 | [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), |
659 | [a10]"=&r" (a10), [a32]"=&r" (a32) |
660 | : [c35]"r" (c35), [top]"r" (top), [dst]"r" (dst) |
661 | : "memory" |
662 | ); |
663 | } |
664 | |
665 | static void VE4(uint8_t* dst, const uint8_t* top) { |
666 | int temp0, temp1, temp2, temp3, temp4, temp5, temp6; |
667 | __asm__ volatile( |
668 | "ulw %[temp0], -1(%[top]) \n\t" |
669 | "ulh %[temp1], 3(%[top]) \n\t" |
670 | "preceu.ph.qbr %[temp2], %[temp0] \n\t" |
671 | "preceu.ph.qbl %[temp3], %[temp0] \n\t" |
672 | "preceu.ph.qbr %[temp4], %[temp1] \n\t" |
673 | "packrl.ph %[temp5], %[temp3], %[temp2] \n\t" |
674 | "packrl.ph %[temp6], %[temp4], %[temp3] \n\t" |
675 | "shll.ph %[temp5], %[temp5], 1 \n\t" |
676 | "shll.ph %[temp6], %[temp6], 1 \n\t" |
677 | "addq.ph %[temp2], %[temp5], %[temp2] \n\t" |
678 | "addq.ph %[temp6], %[temp6], %[temp4] \n\t" |
679 | "addq.ph %[temp2], %[temp2], %[temp3] \n\t" |
680 | "addq.ph %[temp6], %[temp6], %[temp3] \n\t" |
681 | "shra_r.ph %[temp2], %[temp2], 2 \n\t" |
682 | "shra_r.ph %[temp6], %[temp6], 2 \n\t" |
683 | "precr.qb.ph %[temp4], %[temp6], %[temp2] \n\t" |
684 | "usw %[temp4], 0*" XSTR(BPS) "(%[dst]) \n\t" |
685 | "usw %[temp4], 1*" XSTR(BPS) "(%[dst]) \n\t" |
686 | "usw %[temp4], 2*" XSTR(BPS) "(%[dst]) \n\t" |
687 | "usw %[temp4], 3*" XSTR(BPS) "(%[dst]) \n\t" |
688 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
689 | [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), |
690 | [temp6]"=&r" (temp6) |
691 | : [top]"r" (top), [dst]"r" (dst) |
692 | : "memory" |
693 | ); |
694 | } |
695 | |
696 | static void HE4(uint8_t* dst, const uint8_t* top) { |
697 | int temp0, temp1, temp2, temp3, temp4, temp5, temp6; |
698 | __asm__ volatile( |
699 | "ulw %[temp0], -4(%[top]) \n\t" |
700 | "lbu %[temp1], -5(%[top]) \n\t" |
701 | "preceu.ph.qbr %[temp2], %[temp0] \n\t" |
702 | "preceu.ph.qbl %[temp3], %[temp0] \n\t" |
703 | "replv.ph %[temp4], %[temp1] \n\t" |
704 | "packrl.ph %[temp5], %[temp3], %[temp2] \n\t" |
705 | "packrl.ph %[temp6], %[temp2], %[temp4] \n\t" |
706 | "shll.ph %[temp5], %[temp5], 1 \n\t" |
707 | "shll.ph %[temp6], %[temp6], 1 \n\t" |
708 | "addq.ph %[temp3], %[temp3], %[temp5] \n\t" |
709 | "addq.ph %[temp3], %[temp3], %[temp2] \n\t" |
710 | "addq.ph %[temp2], %[temp2], %[temp6] \n\t" |
711 | "addq.ph %[temp2], %[temp2], %[temp4] \n\t" |
712 | "shra_r.ph %[temp3], %[temp3], 2 \n\t" |
713 | "shra_r.ph %[temp2], %[temp2], 2 \n\t" |
714 | "replv.qb %[temp0], %[temp3] \n\t" |
715 | "replv.qb %[temp1], %[temp2] \n\t" |
716 | "srl %[temp3], %[temp3], 16 \n\t" |
717 | "srl %[temp2], %[temp2], 16 \n\t" |
718 | "replv.qb %[temp3], %[temp3] \n\t" |
719 | "replv.qb %[temp2], %[temp2] \n\t" |
720 | "usw %[temp3], 0*" XSTR(BPS) "(%[dst]) \n\t" |
721 | "usw %[temp0], 1*" XSTR(BPS) "(%[dst]) \n\t" |
722 | "usw %[temp2], 2*" XSTR(BPS) "(%[dst]) \n\t" |
723 | "usw %[temp1], 3*" XSTR(BPS) "(%[dst]) \n\t" |
724 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
725 | [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), |
726 | [temp6]"=&r" (temp6) |
727 | : [top]"r" (top), [dst]"r" (dst) |
728 | : "memory" |
729 | ); |
730 | } |
731 | |
732 | static void RD4(uint8_t* dst, const uint8_t* top) { |
733 | int temp0, temp1, temp2, temp3, temp4, temp5; |
734 | int temp6, temp7, temp8, temp9, temp10, temp11; |
735 | __asm__ volatile( |
736 | "ulw %[temp0], -5(%[top]) \n\t" |
737 | "ulw %[temp1], -1(%[top]) \n\t" |
738 | "preceu.ph.qbl %[temp2], %[temp0] \n\t" |
739 | "preceu.ph.qbr %[temp3], %[temp0] \n\t" |
740 | "preceu.ph.qbr %[temp4], %[temp1] \n\t" |
741 | "preceu.ph.qbl %[temp5], %[temp1] \n\t" |
742 | "packrl.ph %[temp6], %[temp2], %[temp3] \n\t" |
743 | "packrl.ph %[temp7], %[temp4], %[temp2] \n\t" |
744 | "packrl.ph %[temp8], %[temp5], %[temp4] \n\t" |
745 | "shll.ph %[temp6], %[temp6], 1 \n\t" |
746 | "addq.ph %[temp9], %[temp2], %[temp6] \n\t" |
747 | "shll.ph %[temp7], %[temp7], 1 \n\t" |
748 | "addq.ph %[temp9], %[temp9], %[temp3] \n\t" |
749 | "shll.ph %[temp8], %[temp8], 1 \n\t" |
750 | "shra_r.ph %[temp9], %[temp9], 2 \n\t" |
751 | "addq.ph %[temp10], %[temp4], %[temp7] \n\t" |
752 | "addq.ph %[temp11], %[temp5], %[temp8] \n\t" |
753 | "addq.ph %[temp10], %[temp10], %[temp2] \n\t" |
754 | "addq.ph %[temp11], %[temp11], %[temp4] \n\t" |
755 | "shra_r.ph %[temp10], %[temp10], 2 \n\t" |
756 | "shra_r.ph %[temp11], %[temp11], 2 \n\t" |
757 | "lbu %[temp0], 3(%[top]) \n\t" |
758 | "lbu %[temp1], 2(%[top]) \n\t" |
759 | "lbu %[temp2], 1(%[top]) \n\t" |
760 | "sll %[temp1], %[temp1], 1 \n\t" |
761 | "addu %[temp0], %[temp0], %[temp1] \n\t" |
762 | "addu %[temp0], %[temp0], %[temp2] \n\t" |
763 | "precr.qb.ph %[temp9], %[temp10], %[temp9] \n\t" |
764 | "shra_r.w %[temp0], %[temp0], 2 \n\t" |
765 | "precr.qb.ph %[temp10], %[temp11], %[temp10] \n\t" |
766 | "usw %[temp9], 3*" XSTR(BPS) "(%[dst]) \n\t" |
767 | "usw %[temp10], 1*" XSTR(BPS) "(%[dst]) \n\t" |
768 | "prepend %[temp9], %[temp11], 8 \n\t" |
769 | "prepend %[temp10], %[temp0], 8 \n\t" |
770 | "usw %[temp9], 2*" XSTR(BPS) "(%[dst]) \n\t" |
771 | "usw %[temp10], 0*" XSTR(BPS) "(%[dst]) \n\t" |
772 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
773 | [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), |
774 | [temp6]"=&r" (temp6), [temp7]"=&r" (temp7), [temp8]"=&r" (temp8), |
775 | [temp9]"=&r" (temp9), [temp10]"=&r" (temp10), [temp11]"=&r" (temp11) |
776 | : [top]"r" (top), [dst]"r" (dst) |
777 | : "memory" |
778 | ); |
779 | } |
780 | |
781 | static void VR4(uint8_t* dst, const uint8_t* top) { |
782 | int temp0, temp1, temp2, temp3, temp4; |
783 | int temp5, temp6, temp7, temp8, temp9; |
784 | __asm__ volatile ( |
785 | "ulw %[temp0], -4(%[top]) \n\t" |
786 | "ulw %[temp1], 0(%[top]) \n\t" |
787 | "preceu.ph.qbl %[temp2], %[temp0] \n\t" |
788 | "preceu.ph.qbr %[temp0], %[temp0] \n\t" |
789 | "preceu.ph.qbla %[temp3], %[temp1] \n\t" |
790 | "preceu.ph.qbra %[temp1], %[temp1] \n\t" |
791 | "packrl.ph %[temp7], %[temp3], %[temp2] \n\t" |
792 | "addqh_r.ph %[temp4], %[temp1], %[temp3] \n\t" |
793 | "move %[temp6], %[temp1] \n\t" |
794 | "append %[temp1], %[temp2], 16 \n\t" |
795 | "shll.ph %[temp9], %[temp6], 1 \n\t" |
796 | "addqh_r.ph %[temp5], %[temp7], %[temp6] \n\t" |
797 | "shll.ph %[temp8], %[temp7], 1 \n\t" |
798 | "addu.ph %[temp3], %[temp7], %[temp3] \n\t" |
799 | "addu.ph %[temp1], %[temp1], %[temp6] \n\t" |
800 | "packrl.ph %[temp7], %[temp2], %[temp0] \n\t" |
801 | "addu.ph %[temp6], %[temp0], %[temp2] \n\t" |
802 | "addu.ph %[temp3], %[temp3], %[temp9] \n\t" |
803 | "addu.ph %[temp1], %[temp1], %[temp8] \n\t" |
804 | "shll.ph %[temp7], %[temp7], 1 \n\t" |
805 | "shra_r.ph %[temp3], %[temp3], 2 \n\t" |
806 | "shra_r.ph %[temp1], %[temp1], 2 \n\t" |
807 | "addu.ph %[temp6], %[temp6], %[temp7] \n\t" |
808 | "shra_r.ph %[temp6], %[temp6], 2 \n\t" |
809 | "precrq.ph.w %[temp8], %[temp4], %[temp5] \n\t" |
810 | "append %[temp4], %[temp5], 16 \n\t" |
811 | "precrq.ph.w %[temp2], %[temp3], %[temp1] \n\t" |
812 | "append %[temp3], %[temp1], 16 \n\t" |
813 | "precr.qb.ph %[temp8], %[temp8], %[temp4] \n\t" |
814 | "precr.qb.ph %[temp3], %[temp2], %[temp3] \n\t" |
815 | "usw %[temp8], 0*" XSTR(BPS) "(%[dst]) \n\t" |
816 | "usw %[temp3], 1*" XSTR(BPS) "(%[dst]) \n\t" |
817 | "append %[temp3], %[temp6], 8 \n\t" |
818 | "srl %[temp6], %[temp6], 16 \n\t" |
819 | "append %[temp8], %[temp6], 8 \n\t" |
820 | "usw %[temp3], 3*" XSTR(BPS) "(%[dst]) \n\t" |
821 | "usw %[temp8], 2*" XSTR(BPS) "(%[dst]) \n\t" |
822 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
823 | [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), |
824 | [temp6]"=&r" (temp6), [temp7]"=&r" (temp7), [temp8]"=&r" (temp8), |
825 | [temp9]"=&r" (temp9) |
826 | : [top]"r" (top), [dst]"r" (dst) |
827 | : "memory" |
828 | ); |
829 | } |
830 | |
831 | static void LD4(uint8_t* dst, const uint8_t* top) { |
832 | int temp0, temp1, temp2, temp3, temp4, temp5; |
833 | int temp6, temp7, temp8, temp9, temp10, temp11; |
834 | __asm__ volatile( |
835 | "ulw %[temp0], 0(%[top]) \n\t" |
836 | "ulw %[temp1], 4(%[top]) \n\t" |
837 | "preceu.ph.qbl %[temp2], %[temp0] \n\t" |
838 | "preceu.ph.qbr %[temp3], %[temp0] \n\t" |
839 | "preceu.ph.qbr %[temp4], %[temp1] \n\t" |
840 | "preceu.ph.qbl %[temp5], %[temp1] \n\t" |
841 | "packrl.ph %[temp6], %[temp2], %[temp3] \n\t" |
842 | "packrl.ph %[temp7], %[temp4], %[temp2] \n\t" |
843 | "packrl.ph %[temp8], %[temp5], %[temp4] \n\t" |
844 | "shll.ph %[temp6], %[temp6], 1 \n\t" |
845 | "addq.ph %[temp9], %[temp2], %[temp6] \n\t" |
846 | "shll.ph %[temp7], %[temp7], 1 \n\t" |
847 | "addq.ph %[temp9], %[temp9], %[temp3] \n\t" |
848 | "shll.ph %[temp8], %[temp8], 1 \n\t" |
849 | "shra_r.ph %[temp9], %[temp9], 2 \n\t" |
850 | "addq.ph %[temp10], %[temp4], %[temp7] \n\t" |
851 | "addq.ph %[temp11], %[temp5], %[temp8] \n\t" |
852 | "addq.ph %[temp10], %[temp10], %[temp2] \n\t" |
853 | "addq.ph %[temp11], %[temp11], %[temp4] \n\t" |
854 | "shra_r.ph %[temp10], %[temp10], 2 \n\t" |
855 | "shra_r.ph %[temp11], %[temp11], 2 \n\t" |
856 | "srl %[temp1], %[temp1], 24 \n\t" |
857 | "sll %[temp1], %[temp1], 1 \n\t" |
858 | "raddu.w.qb %[temp5], %[temp5] \n\t" |
859 | "precr.qb.ph %[temp9], %[temp10], %[temp9] \n\t" |
860 | "precr.qb.ph %[temp10], %[temp11], %[temp10] \n\t" |
861 | "addu %[temp1], %[temp1], %[temp5] \n\t" |
862 | "shra_r.w %[temp1], %[temp1], 2 \n\t" |
863 | "usw %[temp9], 0*" XSTR(BPS) "(%[dst]) \n\t" |
864 | "usw %[temp10], 2*" XSTR(BPS) "(%[dst]) \n\t" |
865 | "prepend %[temp9], %[temp11], 8 \n\t" |
866 | "prepend %[temp10], %[temp1], 8 \n\t" |
867 | "usw %[temp9], 1*" XSTR(BPS) "(%[dst]) \n\t" |
868 | "usw %[temp10], 3*" XSTR(BPS) "(%[dst]) \n\t" |
869 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
870 | [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), |
871 | [temp6]"=&r" (temp6), [temp7]"=&r" (temp7), [temp8]"=&r" (temp8), |
872 | [temp9]"=&r" (temp9), [temp10]"=&r" (temp10), [temp11]"=&r" (temp11) |
873 | : [top]"r" (top), [dst]"r" (dst) |
874 | : "memory" |
875 | ); |
876 | } |
877 | |
878 | static void VL4(uint8_t* dst, const uint8_t* top) { |
879 | int temp0, temp1, temp2, temp3, temp4; |
880 | int temp5, temp6, temp7, temp8, temp9; |
881 | __asm__ volatile ( |
882 | "ulw %[temp0], 0(%[top]) \n\t" |
883 | "ulw %[temp1], 4(%[top]) \n\t" |
884 | "preceu.ph.qbla %[temp2], %[temp0] \n\t" |
885 | "preceu.ph.qbra %[temp0], %[temp0] \n\t" |
886 | "preceu.ph.qbl %[temp3], %[temp1] \n\t" |
887 | "preceu.ph.qbr %[temp1], %[temp1] \n\t" |
888 | "addqh_r.ph %[temp4], %[temp0], %[temp2] \n\t" |
889 | "packrl.ph %[temp7], %[temp1], %[temp0] \n\t" |
890 | "precrq.ph.w %[temp6], %[temp1], %[temp2] \n\t" |
891 | "shll.ph %[temp9], %[temp2], 1 \n\t" |
892 | "addqh_r.ph %[temp5], %[temp7], %[temp2] \n\t" |
893 | "shll.ph %[temp8], %[temp7], 1 \n\t" |
894 | "addu.ph %[temp2], %[temp2], %[temp6] \n\t" |
895 | "addu.ph %[temp0], %[temp0], %[temp7] \n\t" |
896 | "packrl.ph %[temp7], %[temp3], %[temp1] \n\t" |
897 | "addu.ph %[temp6], %[temp1], %[temp3] \n\t" |
898 | "addu.ph %[temp2], %[temp2], %[temp8] \n\t" |
899 | "addu.ph %[temp0], %[temp0], %[temp9] \n\t" |
900 | "shll.ph %[temp7], %[temp7], 1 \n\t" |
901 | "shra_r.ph %[temp2], %[temp2], 2 \n\t" |
902 | "shra_r.ph %[temp0], %[temp0], 2 \n\t" |
903 | "addu.ph %[temp6], %[temp6], %[temp7] \n\t" |
904 | "shra_r.ph %[temp6], %[temp6], 2 \n\t" |
905 | "precrq.ph.w %[temp8], %[temp5], %[temp4] \n\t" |
906 | "append %[temp5], %[temp4], 16 \n\t" |
907 | "precrq.ph.w %[temp3], %[temp2], %[temp0] \n\t" |
908 | "append %[temp2], %[temp0], 16 \n\t" |
909 | "precr.qb.ph %[temp8], %[temp8], %[temp5] \n\t" |
910 | "precr.qb.ph %[temp3], %[temp3], %[temp2] \n\t" |
911 | "usw %[temp8], 0*" XSTR(BPS) "(%[dst]) \n\t" |
912 | "prepend %[temp8], %[temp6], 8 \n\t" |
913 | "usw %[temp3], 1*" XSTR(BPS) "(%[dst]) \n\t" |
914 | "srl %[temp6], %[temp6], 16 \n\t" |
915 | "prepend %[temp3], %[temp6], 8 \n\t" |
916 | "usw %[temp8], 2*" XSTR(BPS) "(%[dst]) \n\t" |
917 | "usw %[temp3], 3*" XSTR(BPS) "(%[dst]) \n\t" |
918 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
919 | [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), |
920 | [temp6]"=&r" (temp6), [temp7]"=&r" (temp7), [temp8]"=&r" (temp8), |
921 | [temp9]"=&r" (temp9) |
922 | : [top]"r" (top), [dst]"r" (dst) |
923 | : "memory" |
924 | ); |
925 | } |
926 | |
927 | static void HD4(uint8_t* dst, const uint8_t* top) { |
928 | int temp0, temp1, temp2, temp3, temp4; |
929 | int temp5, temp6, temp7, temp8, temp9; |
930 | __asm__ volatile ( |
931 | "ulw %[temp0], -5(%[top]) \n\t" |
932 | "ulw %[temp1], -1(%[top]) \n\t" |
933 | "preceu.ph.qbla %[temp2], %[temp0] \n\t" |
934 | "preceu.ph.qbra %[temp0], %[temp0] \n\t" |
935 | "preceu.ph.qbl %[temp3], %[temp1] \n\t" |
936 | "preceu.ph.qbr %[temp1], %[temp1] \n\t" |
937 | "addqh_r.ph %[temp4], %[temp0], %[temp2] \n\t" |
938 | "packrl.ph %[temp7], %[temp1], %[temp0] \n\t" |
939 | "precrq.ph.w %[temp6], %[temp1], %[temp2] \n\t" |
940 | "shll.ph %[temp9], %[temp2], 1 \n\t" |
941 | "addqh_r.ph %[temp5], %[temp7], %[temp2] \n\t" |
942 | "shll.ph %[temp8], %[temp7], 1 \n\t" |
943 | "addu.ph %[temp2], %[temp2], %[temp6] \n\t" |
944 | "addu.ph %[temp0], %[temp0], %[temp7] \n\t" |
945 | "packrl.ph %[temp7], %[temp3], %[temp1] \n\t" |
946 | "addu.ph %[temp6], %[temp1], %[temp3] \n\t" |
947 | "addu.ph %[temp2], %[temp2], %[temp8] \n\t" |
948 | "addu.ph %[temp0], %[temp0], %[temp9] \n\t" |
949 | "shll.ph %[temp7], %[temp7], 1 \n\t" |
950 | "shra_r.ph %[temp2], %[temp2], 2 \n\t" |
951 | "shra_r.ph %[temp0], %[temp0], 2 \n\t" |
952 | "addu.ph %[temp6], %[temp6], %[temp7] \n\t" |
953 | "shra_r.ph %[temp6], %[temp6], 2 \n\t" |
954 | "precrq.ph.w %[temp1], %[temp2], %[temp5] \n\t" |
955 | "precrq.ph.w %[temp3], %[temp0], %[temp4] \n\t" |
956 | "precr.qb.ph %[temp7], %[temp6], %[temp1] \n\t" |
957 | "precr.qb.ph %[temp6], %[temp1], %[temp3] \n\t" |
958 | "usw %[temp7], 0*" XSTR(BPS) "(%[dst]) \n\t" |
959 | "usw %[temp6], 1*" XSTR(BPS) "(%[dst]) \n\t" |
960 | "append %[temp2], %[temp5], 16 \n\t" |
961 | "append %[temp0], %[temp4], 16 \n\t" |
962 | "precr.qb.ph %[temp5], %[temp3], %[temp2] \n\t" |
963 | "precr.qb.ph %[temp4], %[temp2], %[temp0] \n\t" |
964 | "usw %[temp5], 2*" XSTR(BPS) "(%[dst]) \n\t" |
965 | "usw %[temp4], 3*" XSTR(BPS) "(%[dst]) \n\t" |
966 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
967 | [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), |
968 | [temp6]"=&r" (temp6), [temp7]"=&r" (temp7), [temp8]"=&r" (temp8), |
969 | [temp9]"=&r" (temp9) |
970 | : [top]"r" (top), [dst]"r" (dst) |
971 | : "memory" |
972 | ); |
973 | } |
974 | |
975 | static void HU4(uint8_t* dst, const uint8_t* top) { |
976 | int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; |
977 | __asm__ volatile ( |
978 | "ulw %[temp0], -5(%[top]) \n\t" |
979 | "preceu.ph.qbl %[temp1], %[temp0] \n\t" |
980 | "preceu.ph.qbr %[temp2], %[temp0] \n\t" |
981 | "packrl.ph %[temp3], %[temp1], %[temp2] \n\t" |
982 | "replv.qb %[temp7], %[temp2] \n\t" |
983 | "addqh_r.ph %[temp4], %[temp1], %[temp3] \n\t" |
984 | "addqh_r.ph %[temp5], %[temp3], %[temp2] \n\t" |
985 | "shll.ph %[temp6], %[temp3], 1 \n\t" |
986 | "addu.ph %[temp3], %[temp2], %[temp3] \n\t" |
987 | "addu.ph %[temp6], %[temp1], %[temp6] \n\t" |
988 | "shll.ph %[temp0], %[temp2], 1 \n\t" |
989 | "addu.ph %[temp6], %[temp6], %[temp2] \n\t" |
990 | "addu.ph %[temp0], %[temp3], %[temp0] \n\t" |
991 | "shra_r.ph %[temp6], %[temp6], 2 \n\t" |
992 | "shra_r.ph %[temp0], %[temp0], 2 \n\t" |
993 | "packrl.ph %[temp3], %[temp6], %[temp5] \n\t" |
994 | "precrq.ph.w %[temp2], %[temp6], %[temp4] \n\t" |
995 | "append %[temp0], %[temp5], 16 \n\t" |
996 | "precr.qb.ph %[temp3], %[temp3], %[temp2] \n\t" |
997 | "usw %[temp3], 0*" XSTR(BPS) "(%[dst]) \n\t" |
998 | "precr.qb.ph %[temp1], %[temp7], %[temp0] \n\t" |
999 | "usw %[temp7], 3*" XSTR(BPS) "(%[dst]) \n\t" |
1000 | "packrl.ph %[temp2], %[temp1], %[temp3] \n\t" |
1001 | "usw %[temp1], 2*" XSTR(BPS) "(%[dst]) \n\t" |
1002 | "usw %[temp2], 1*" XSTR(BPS) "(%[dst]) \n\t" |
1003 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
1004 | [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), |
1005 | [temp6]"=&r" (temp6), [temp7]"=&r" (temp7) |
1006 | : [top]"r" (top), [dst]"r" (dst) |
1007 | : "memory" |
1008 | ); |
1009 | } |
1010 | |
1011 | //------------------------------------------------------------------------------ |
1012 | // Chroma 8x8 prediction (paragraph 12.2) |
1013 | |
1014 | static void IntraChromaPreds(uint8_t* dst, const uint8_t* left, |
1015 | const uint8_t* top) { |
1016 | // U block |
1017 | DCMode8(C8DC8 + dst, left, top); |
1018 | VerticalPred8(C8VE8 + dst, top); |
1019 | HorizontalPred8(C8HE8 + dst, left); |
1020 | TrueMotion8(C8TM8 + dst, left, top); |
1021 | // V block |
1022 | dst += 8; |
1023 | if (top) top += 8; |
1024 | if (left) left += 16; |
1025 | DCMode8(C8DC8 + dst, left, top); |
1026 | VerticalPred8(C8VE8 + dst, top); |
1027 | HorizontalPred8(C8HE8 + dst, left); |
1028 | TrueMotion8(C8TM8 + dst, left, top); |
1029 | } |
1030 | |
1031 | //------------------------------------------------------------------------------ |
1032 | // luma 16x16 prediction (paragraph 12.3) |
1033 | |
1034 | static void Intra16Preds(uint8_t* dst, |
1035 | const uint8_t* left, const uint8_t* top) { |
1036 | DCMode16(I16DC16 + dst, left, top); |
1037 | VerticalPred16(I16VE16 + dst, top); |
1038 | HorizontalPred16(I16HE16 + dst, left); |
1039 | TrueMotion16(I16TM16 + dst, left, top); |
1040 | } |
1041 | |
// Left samples are at top[-5..-2], the top-left sample is top[-1], the top
// samples are at top[0..3], and the top-right samples are at top[4..7].
1044 | static void Intra4Preds(uint8_t* dst, const uint8_t* top) { |
1045 | DC4(I4DC4 + dst, top); |
1046 | TM4(I4TM4 + dst, top); |
1047 | VE4(I4VE4 + dst, top); |
1048 | HE4(I4HE4 + dst, top); |
1049 | RD4(I4RD4 + dst, top); |
1050 | VR4(I4VR4 + dst, top); |
1051 | LD4(I4LD4 + dst, top); |
1052 | VL4(I4VL4 + dst, top); |
1053 | HD4(I4HD4 + dst, top); |
1054 | HU4(I4HU4 + dst, top); |
1055 | } |
1056 | |
1057 | //------------------------------------------------------------------------------ |
1058 | // Metric |
1059 | |
1060 | #if !defined(WORK_AROUND_GCC) |
1061 | |
1062 | #define GET_SSE_INNER(A) \ |
1063 | "lw %[temp0], " #A "(%[a]) \n\t" \ |
1064 | "lw %[temp1], " #A "(%[b]) \n\t" \ |
1065 | "preceu.ph.qbr %[temp2], %[temp0] \n\t" \ |
1066 | "preceu.ph.qbl %[temp0], %[temp0] \n\t" \ |
1067 | "preceu.ph.qbr %[temp3], %[temp1] \n\t" \ |
1068 | "preceu.ph.qbl %[temp1], %[temp1] \n\t" \ |
1069 | "subq.ph %[temp2], %[temp2], %[temp3] \n\t" \ |
1070 | "subq.ph %[temp0], %[temp0], %[temp1] \n\t" \ |
1071 | "dpa.w.ph $ac0, %[temp2], %[temp2] \n\t" \ |
1072 | "dpa.w.ph $ac0, %[temp0], %[temp0] \n\t" |
1073 | |
1074 | #define GET_SSE(A, B, C, D) \ |
1075 | GET_SSE_INNER(A) \ |
1076 | GET_SSE_INNER(B) \ |
1077 | GET_SSE_INNER(C) \ |
1078 | GET_SSE_INNER(D) |
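
// Each GET_SSE_INNER accumulates four squared pixel differences into $ac0,
// roughly: ac0 += sum over the 4 bytes at offset A of (a[i] - b[i])^2, so
// one GET_SSE covers 16 pixels.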
1079 | |
1080 | static int SSE16x16(const uint8_t* a, const uint8_t* b) { |
1081 | int count; |
1082 | int temp0, temp1, temp2, temp3; |
1083 | __asm__ volatile ( |
1084 | "mult $zero, $zero \n\t" |
1085 | GET_SSE( 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS) |
1086 | GET_SSE( 1 * BPS, 4 + 1 * BPS, 8 + 1 * BPS, 12 + 1 * BPS) |
1087 | GET_SSE( 2 * BPS, 4 + 2 * BPS, 8 + 2 * BPS, 12 + 2 * BPS) |
1088 | GET_SSE( 3 * BPS, 4 + 3 * BPS, 8 + 3 * BPS, 12 + 3 * BPS) |
1089 | GET_SSE( 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS) |
1090 | GET_SSE( 5 * BPS, 4 + 5 * BPS, 8 + 5 * BPS, 12 + 5 * BPS) |
1091 | GET_SSE( 6 * BPS, 4 + 6 * BPS, 8 + 6 * BPS, 12 + 6 * BPS) |
1092 | GET_SSE( 7 * BPS, 4 + 7 * BPS, 8 + 7 * BPS, 12 + 7 * BPS) |
1093 | GET_SSE( 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS) |
1094 | GET_SSE( 9 * BPS, 4 + 9 * BPS, 8 + 9 * BPS, 12 + 9 * BPS) |
1095 | GET_SSE(10 * BPS, 4 + 10 * BPS, 8 + 10 * BPS, 12 + 10 * BPS) |
1096 | GET_SSE(11 * BPS, 4 + 11 * BPS, 8 + 11 * BPS, 12 + 11 * BPS) |
1097 | GET_SSE(12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS) |
1098 | GET_SSE(13 * BPS, 4 + 13 * BPS, 8 + 13 * BPS, 12 + 13 * BPS) |
1099 | GET_SSE(14 * BPS, 4 + 14 * BPS, 8 + 14 * BPS, 12 + 14 * BPS) |
1100 | GET_SSE(15 * BPS, 4 + 15 * BPS, 8 + 15 * BPS, 12 + 15 * BPS) |
1101 | "mflo %[count] \n\t" |
1102 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
1103 | [temp3]"=&r" (temp3), [count]"=&r" (count) |
1104 | : [a]"r" (a), [b]"r" (b) |
1105 | : "memory" , "hi" , "lo" |
1106 | ); |
1107 | return count; |
1108 | } |
1109 | |
1110 | static int SSE16x8(const uint8_t* a, const uint8_t* b) { |
1111 | int count; |
1112 | int temp0, temp1, temp2, temp3; |
1113 | __asm__ volatile ( |
1114 | "mult $zero, $zero \n\t" |
1115 | GET_SSE( 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS) |
1116 | GET_SSE( 1 * BPS, 4 + 1 * BPS, 8 + 1 * BPS, 12 + 1 * BPS) |
1117 | GET_SSE( 2 * BPS, 4 + 2 * BPS, 8 + 2 * BPS, 12 + 2 * BPS) |
1118 | GET_SSE( 3 * BPS, 4 + 3 * BPS, 8 + 3 * BPS, 12 + 3 * BPS) |
1119 | GET_SSE( 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS) |
1120 | GET_SSE( 5 * BPS, 4 + 5 * BPS, 8 + 5 * BPS, 12 + 5 * BPS) |
1121 | GET_SSE( 6 * BPS, 4 + 6 * BPS, 8 + 6 * BPS, 12 + 6 * BPS) |
1122 | GET_SSE( 7 * BPS, 4 + 7 * BPS, 8 + 7 * BPS, 12 + 7 * BPS) |
1123 | "mflo %[count] \n\t" |
1124 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
1125 | [temp3]"=&r" (temp3), [count]"=&r" (count) |
1126 | : [a]"r" (a), [b]"r" (b) |
1127 | : "memory" , "hi" , "lo" |
1128 | ); |
1129 | return count; |
1130 | } |
1131 | |
1132 | static int SSE8x8(const uint8_t* a, const uint8_t* b) { |
1133 | int count; |
1134 | int temp0, temp1, temp2, temp3; |
1135 | __asm__ volatile ( |
1136 | "mult $zero, $zero \n\t" |
1137 | GET_SSE(0 * BPS, 4 + 0 * BPS, 1 * BPS, 4 + 1 * BPS) |
1138 | GET_SSE(2 * BPS, 4 + 2 * BPS, 3 * BPS, 4 + 3 * BPS) |
1139 | GET_SSE(4 * BPS, 4 + 4 * BPS, 5 * BPS, 4 + 5 * BPS) |
1140 | GET_SSE(6 * BPS, 4 + 6 * BPS, 7 * BPS, 4 + 7 * BPS) |
1141 | "mflo %[count] \n\t" |
1142 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
1143 | [temp3]"=&r" (temp3), [count]"=&r" (count) |
1144 | : [a]"r" (a), [b]"r" (b) |
1145 | : "memory" , "hi" , "lo" |
1146 | ); |
1147 | return count; |
1148 | } |
1149 | |
1150 | static int SSE4x4(const uint8_t* a, const uint8_t* b) { |
1151 | int count; |
1152 | int temp0, temp1, temp2, temp3; |
1153 | __asm__ volatile ( |
1154 | "mult $zero, $zero \n\t" |
1155 | GET_SSE(0 * BPS, 1 * BPS, 2 * BPS, 3 * BPS) |
1156 | "mflo %[count] \n\t" |
1157 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
1158 | [temp3]"=&r" (temp3), [count]"=&r" (count) |
1159 | : [a]"r" (a), [b]"r" (b) |
1160 | : "memory" , "hi" , "lo" |
1161 | ); |
1162 | return count; |
1163 | } |
1164 | |
1165 | #undef GET_SSE |
1166 | #undef GET_SSE_INNER |
1167 | |
1168 | #endif // !WORK_AROUND_GCC |
1169 | |
1170 | #undef FILL_8_OR_16 |
1171 | #undef FILL_PART |
1172 | #undef OUTPUT_EARLY_CLOBBER_REGS_17 |
1173 | #undef MUL_HALF |
1174 | #undef ABS_X8 |
1175 | #undef ADD_SUB_HALVES_X4 |
1176 | |
1177 | //------------------------------------------------------------------------------ |
1178 | // Quantization |
1179 | // |
1180 | |
// macro for one pass through the for loop in QuantizeBlock, reading 2 values
// at a time; the QUANTDIV macro is inlined
// J - offset in bytes (kZigzag[n] * 2)
// K - offset in bytes (kZigzag[n] * 4)
// N - offset in bytes (n * 2)
// N1 - offset in bytes ((n + 1) * 2)
1187 | #define QUANTIZE_ONE(J, K, N, N1) \ |
1188 | "ulw %[temp1], " #J "(%[ppin]) \n\t" \ |
1189 | "ulw %[temp2], " #J "(%[ppsharpen]) \n\t" \ |
1190 | "lhu %[temp3], " #K "(%[ppzthresh]) \n\t" \ |
1191 | "lhu %[temp6], " #K "+4(%[ppzthresh]) \n\t" \ |
1192 | "absq_s.ph %[temp4], %[temp1] \n\t" \ |
1193 | "ins %[temp3], %[temp6], 16, 16 \n\t" \ |
1194 | "addu.ph %[coeff], %[temp4], %[temp2] \n\t" \ |
1195 | "shra.ph %[sign], %[temp1], 15 \n\t" \ |
1196 | "li %[level], 0x10001 \n\t" \ |
1197 | "cmp.lt.ph %[temp3], %[coeff] \n\t" \ |
1198 | "lhu %[temp1], " #J "(%[ppiq]) \n\t" \ |
1199 | "pick.ph %[temp5], %[level], $0 \n\t" \ |
1200 | "lw %[temp2], " #K "(%[ppbias]) \n\t" \ |
1201 | "beqz %[temp5], 0f \n\t" \ |
1202 | "lhu %[temp3], " #J "(%[ppq]) \n\t" \ |
1203 | "beq %[temp5], %[level], 1f \n\t" \ |
1204 | "andi %[temp5], %[temp5], 0x1 \n\t" \ |
1205 | "andi %[temp4], %[coeff], 0xffff \n\t" \ |
1206 | "beqz %[temp5], 2f \n\t" \ |
1207 | "mul %[level], %[temp4], %[temp1] \n\t" \ |
1208 | "sh $0, " #J "+2(%[ppin]) \n\t" \ |
1209 | "sh $0, " #N1 "(%[pout]) \n\t" \ |
1210 | "addu %[level], %[level], %[temp2] \n\t" \ |
1211 | "sra %[level], %[level], 17 \n\t" \ |
1212 | "slt %[temp4], %[max_level], %[level] \n\t" \ |
1213 | "movn %[level], %[max_level], %[temp4] \n\t" \ |
1214 | "andi %[temp6], %[sign], 0xffff \n\t" \ |
1215 | "xor %[level], %[level], %[temp6] \n\t" \ |
1216 | "subu %[level], %[level], %[temp6] \n\t" \ |
1217 | "mul %[temp5], %[level], %[temp3] \n\t" \ |
1218 | "or %[ret], %[ret], %[level] \n\t" \ |
1219 | "sh %[level], " #N "(%[pout]) \n\t" \ |
1220 | "sh %[temp5], " #J "(%[ppin]) \n\t" \ |
1221 | "j 3f \n\t" \ |
1222 | "2: \n\t" \ |
1223 | "lhu %[temp1], " #J "+2(%[ppiq]) \n\t" \ |
1224 | "srl %[temp5], %[coeff], 16 \n\t" \ |
1225 | "mul %[level], %[temp5], %[temp1] \n\t" \ |
1226 | "lw %[temp2], " #K "+4(%[ppbias]) \n\t" \ |
1227 | "lhu %[temp3], " #J "+2(%[ppq]) \n\t" \ |
1228 | "addu %[level], %[level], %[temp2] \n\t" \ |
1229 | "sra %[level], %[level], 17 \n\t" \ |
1230 | "srl %[temp6], %[sign], 16 \n\t" \ |
1231 | "slt %[temp4], %[max_level], %[level] \n\t" \ |
1232 | "movn %[level], %[max_level], %[temp4] \n\t" \ |
1233 | "xor %[level], %[level], %[temp6] \n\t" \ |
1234 | "subu %[level], %[level], %[temp6] \n\t" \ |
1235 | "mul %[temp5], %[level], %[temp3] \n\t" \ |
1236 | "sh $0, " #J "(%[ppin]) \n\t" \ |
1237 | "sh $0, " #N "(%[pout]) \n\t" \ |
1238 | "or %[ret], %[ret], %[level] \n\t" \ |
1239 | "sh %[temp5], " #J "+2(%[ppin]) \n\t" \ |
1240 | "sh %[level], " #N1 "(%[pout]) \n\t" \ |
1241 | "j 3f \n\t" \ |
1242 | "1: \n\t" \ |
1243 | "lhu %[temp1], " #J "(%[ppiq]) \n\t" \ |
1244 | "lw %[temp2], " #K "(%[ppbias]) \n\t" \ |
1245 | "ulw %[temp3], " #J "(%[ppq]) \n\t" \ |
1246 | "andi %[temp5], %[coeff], 0xffff \n\t" \ |
1247 | "srl %[temp0], %[coeff], 16 \n\t" \ |
1248 | "lhu %[temp6], " #J "+2(%[ppiq]) \n\t" \ |
1249 | "lw %[coeff], " #K "+4(%[ppbias]) \n\t" \ |
1250 | "mul %[level], %[temp5], %[temp1] \n\t" \ |
1251 | "mul %[temp4], %[temp0], %[temp6] \n\t" \ |
1252 | "addu %[level], %[level], %[temp2] \n\t" \ |
1253 | "addu %[temp4], %[temp4], %[coeff] \n\t" \ |
1254 | "precrq.ph.w %[level], %[temp4], %[level] \n\t" \ |
1255 | "shra.ph %[level], %[level], 1 \n\t" \ |
1256 | "cmp.lt.ph %[max_level1],%[level] \n\t" \ |
1257 | "pick.ph %[level], %[max_level], %[level] \n\t" \ |
1258 | "xor %[level], %[level], %[sign] \n\t" \ |
1259 | "subu.ph %[level], %[level], %[sign] \n\t" \ |
1260 | "mul.ph %[temp3], %[level], %[temp3] \n\t" \ |
1261 | "or %[ret], %[ret], %[level] \n\t" \ |
1262 | "sh %[level], " #N "(%[pout]) \n\t" \ |
1263 | "srl %[level], %[level], 16 \n\t" \ |
1264 | "sh %[level], " #N1 "(%[pout]) \n\t" \ |
1265 | "usw %[temp3], " #J "(%[ppin]) \n\t" \ |
1266 | "j 3f \n\t" \ |
1267 | "0: \n\t" \ |
1268 | "sh $0, " #N "(%[pout]) \n\t" \ |
1269 | "sh $0, " #N1 "(%[pout]) \n\t" \ |
1270 | "usw $0, " #J "(%[ppin]) \n\t" \ |
1271 | "3: \n\t" |
1272 | |
1273 | static int QuantizeBlock(int16_t in[16], int16_t out[16], |
1274 | const VP8Matrix* const mtx) { |
int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
1276 | int sign, coeff, level; |
1277 | int max_level = MAX_LEVEL; |
1278 | int max_level1 = max_level << 16 | max_level; |
1279 | int ret = 0; |
1280 | |
1281 | int16_t* ppin = &in[0]; |
1282 | int16_t* pout = &out[0]; |
1283 | const uint16_t* ppsharpen = &mtx->sharpen_[0]; |
1284 | const uint32_t* ppzthresh = &mtx->zthresh_[0]; |
1285 | const uint16_t* ppq = &mtx->q_[0]; |
1286 | const uint16_t* ppiq = &mtx->iq_[0]; |
1287 | const uint32_t* ppbias = &mtx->bias_[0]; |
1288 | |
1289 | __asm__ volatile ( |
1290 | QUANTIZE_ONE( 0, 0, 0, 2) |
1291 | QUANTIZE_ONE( 4, 8, 10, 12) |
1292 | QUANTIZE_ONE( 8, 16, 4, 8) |
1293 | QUANTIZE_ONE(12, 24, 14, 24) |
1294 | QUANTIZE_ONE(16, 32, 6, 16) |
1295 | QUANTIZE_ONE(20, 40, 22, 26) |
1296 | QUANTIZE_ONE(24, 48, 18, 20) |
1297 | QUANTIZE_ONE(28, 56, 28, 30) |
1298 | |
1299 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), |
1300 | [temp2]"=&r" (temp2), [temp3]"=&r" (temp3), |
1301 | [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), |
1302 | [sign]"=&r" (sign), [coeff]"=&r" (coeff), |
1303 | [level]"=&r" (level), [temp6]"=&r" (temp6), [ret]"+&r" (ret) |
1304 | : [ppin]"r" (ppin), [pout]"r" (pout), [max_level1]"r" (max_level1), |
1305 | [ppiq]"r" (ppiq), [max_level]"r" (max_level), |
1306 | [ppbias]"r" (ppbias), [ppzthresh]"r" (ppzthresh), |
1307 | [ppsharpen]"r" (ppsharpen), [ppq]"r" (ppq) |
1308 | : "memory" , "hi" , "lo" |
1309 | ); |
1310 | |
1311 | return (ret != 0); |
1312 | } |
1313 | |
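// Quantizes two consecutive 4x4 blocks; bit n of the return value is set iff
// block n produced at least one non-zero level.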
1314 | static int Quantize2Blocks(int16_t in[32], int16_t out[32], |
1315 | const VP8Matrix* const mtx) { |
1316 | int nz; |
1317 | nz = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0; |
1318 | nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1; |
1319 | return nz; |
1320 | } |
1321 | |
1322 | #undef QUANTIZE_ONE |
1323 | |
// macro for one horizontal pass in FTransformWHT
// temp0..temp7 hold tmp[0]..tmp[15], two 16-bit elements packed per register
// A, B, C, D - offsets in bytes to load from the in buffer
// TEMP0, TEMP1 - registers for the corresponding tmp elements
1328 | #define HORIZONTAL_PASS_WHT(A, B, C, D, TEMP0, TEMP1) \ |
1329 | "lh %[" #TEMP0 "], " #A "(%[in]) \n\t" \ |
1330 | "lh %[" #TEMP1 "], " #B "(%[in]) \n\t" \ |
1331 | "lh %[temp8], " #C "(%[in]) \n\t" \ |
1332 | "lh %[temp9], " #D "(%[in]) \n\t" \ |
1333 | "ins %[" #TEMP1 "], %[" #TEMP0 "], 16, 16 \n\t" \ |
1334 | "ins %[temp9], %[temp8], 16, 16 \n\t" \ |
1335 | "subq.ph %[temp8], %[" #TEMP1 "], %[temp9] \n\t" \ |
1336 | "addq.ph %[temp9], %[" #TEMP1 "], %[temp9] \n\t" \ |
1337 | "precrq.ph.w %[" #TEMP0 "], %[temp8], %[temp9] \n\t" \ |
1338 | "append %[temp8], %[temp9], 16 \n\t" \ |
1339 | "subq.ph %[" #TEMP1 "], %[" #TEMP0 "], %[temp8] \n\t" \ |
1340 | "addq.ph %[" #TEMP0 "], %[" #TEMP0 "], %[temp8] \n\t" \ |
1341 | "rotr %[" #TEMP1 "], %[" #TEMP1 "], 16 \n\t" |
1342 | |
// macro for one vertical pass in FTransformWHT
// temp0..temp7 hold tmp[0]..tmp[15], two 16-bit elements packed per register
// A, B, C, D - offsets in bytes to store to the out buffer
// TEMP0, TEMP2, TEMP4 and TEMP6 - registers for the corresponding tmp elements
1347 | #define VERTICAL_PASS_WHT(A, B, C, D, TEMP0, TEMP2, TEMP4, TEMP6) \ |
1348 | "addq.ph %[temp8], %[" #TEMP0 "], %[" #TEMP4 "] \n\t" \ |
1349 | "addq.ph %[temp9], %[" #TEMP2 "], %[" #TEMP6 "] \n\t" \ |
1350 | "subq.ph %[" #TEMP2 "], %[" #TEMP2 "], %[" #TEMP6 "] \n\t" \ |
1351 | "subq.ph %[" #TEMP6 "], %[" #TEMP0 "], %[" #TEMP4 "] \n\t" \ |
1352 | "addqh.ph %[" #TEMP0 "], %[temp8], %[temp9] \n\t" \ |
1353 | "subqh.ph %[" #TEMP4 "], %[" #TEMP6 "], %[" #TEMP2 "] \n\t" \ |
1354 | "addqh.ph %[" #TEMP2 "], %[" #TEMP2 "], %[" #TEMP6 "] \n\t" \ |
1355 | "subqh.ph %[" #TEMP6 "], %[temp8], %[temp9] \n\t" \ |
1356 | "usw %[" #TEMP0 "], " #A "(%[out]) \n\t" \ |
1357 | "usw %[" #TEMP2 "], " #B "(%[out]) \n\t" \ |
1358 | "usw %[" #TEMP4 "], " #C "(%[out]) \n\t" \ |
1359 | "usw %[" #TEMP6 "], " #D "(%[out]) \n\t" |
1360 | |
1361 | static void FTransformWHT(const int16_t* in, int16_t* out) { |
1362 | int temp0, temp1, temp2, temp3, temp4; |
1363 | int temp5, temp6, temp7, temp8, temp9; |
1364 | |
1365 | __asm__ volatile ( |
1366 | HORIZONTAL_PASS_WHT( 0, 32, 64, 96, temp0, temp1) |
1367 | HORIZONTAL_PASS_WHT(128, 160, 192, 224, temp2, temp3) |
1368 | HORIZONTAL_PASS_WHT(256, 288, 320, 352, temp4, temp5) |
1369 | HORIZONTAL_PASS_WHT(384, 416, 448, 480, temp6, temp7) |
1370 | VERTICAL_PASS_WHT(0, 8, 16, 24, temp0, temp2, temp4, temp6) |
1371 | VERTICAL_PASS_WHT(4, 12, 20, 28, temp1, temp3, temp5, temp7) |
1372 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
1373 | [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), |
1374 | [temp6]"=&r" (temp6), [temp7]"=&r" (temp7), [temp8]"=&r" (temp8), |
1375 | [temp9]"=&r" (temp9) |
1376 | : [in]"r" (in), [out]"r" (out) |
1377 | : "memory" |
1378 | ); |
1379 | } |
1380 | |
1381 | #undef VERTICAL_PASS_WHT |
1382 | #undef HORIZONTAL_PASS_WHT |
1383 | |
// macro for converting coefficients to bins
// converts 8 coefficients at a time
// A, B, C, D - offsets in bytes to load from the out buffer
1387 | #define CONVERT_COEFFS_TO_BIN(A, B, C, D) \ |
1388 | "ulw %[temp0], " #A "(%[out]) \n\t" \ |
1389 | "ulw %[temp1], " #B "(%[out]) \n\t" \ |
1390 | "ulw %[temp2], " #C "(%[out]) \n\t" \ |
1391 | "ulw %[temp3], " #D "(%[out]) \n\t" \ |
1392 | "absq_s.ph %[temp0], %[temp0] \n\t" \ |
1393 | "absq_s.ph %[temp1], %[temp1] \n\t" \ |
1394 | "absq_s.ph %[temp2], %[temp2] \n\t" \ |
1395 | "absq_s.ph %[temp3], %[temp3] \n\t" \ |
1396 | "shra.ph %[temp0], %[temp0], 3 \n\t" \ |
1397 | "shra.ph %[temp1], %[temp1], 3 \n\t" \ |
1398 | "shra.ph %[temp2], %[temp2], 3 \n\t" \ |
1399 | "shra.ph %[temp3], %[temp3], 3 \n\t" \ |
1400 | "shll_s.ph %[temp0], %[temp0], 10 \n\t" \ |
1401 | "shll_s.ph %[temp1], %[temp1], 10 \n\t" \ |
1402 | "shll_s.ph %[temp2], %[temp2], 10 \n\t" \ |
1403 | "shll_s.ph %[temp3], %[temp3], 10 \n\t" \ |
1404 | "shrl.ph %[temp0], %[temp0], 10 \n\t" \ |
1405 | "shrl.ph %[temp1], %[temp1], 10 \n\t" \ |
1406 | "shrl.ph %[temp2], %[temp2], 10 \n\t" \ |
1407 | "shrl.ph %[temp3], %[temp3], 10 \n\t" \ |
1408 | "shll.ph %[temp0], %[temp0], 2 \n\t" \ |
1409 | "shll.ph %[temp1], %[temp1], 2 \n\t" \ |
1410 | "shll.ph %[temp2], %[temp2], 2 \n\t" \ |
1411 | "shll.ph %[temp3], %[temp3], 2 \n\t" \ |
1412 | "ext %[temp4], %[temp0], 0, 16 \n\t" \ |
1413 | "ext %[temp0], %[temp0], 16, 16 \n\t" \ |
1414 | "addu %[temp4], %[temp4], %[dist] \n\t" \ |
1415 | "addu %[temp0], %[temp0], %[dist] \n\t" \ |
1416 | "ext %[temp5], %[temp1], 0, 16 \n\t" \ |
1417 | "lw %[temp8], 0(%[temp4]) \n\t" \ |
1418 | "ext %[temp1], %[temp1], 16, 16 \n\t" \ |
1419 | "addu %[temp5], %[temp5], %[dist] \n\t" \ |
1420 | "addiu %[temp8], %[temp8], 1 \n\t" \ |
1421 | "sw %[temp8], 0(%[temp4]) \n\t" \ |
1422 | "lw %[temp8], 0(%[temp0]) \n\t" \ |
1423 | "addu %[temp1], %[temp1], %[dist] \n\t" \ |
1424 | "ext %[temp6], %[temp2], 0, 16 \n\t" \ |
1425 | "addiu %[temp8], %[temp8], 1 \n\t" \ |
1426 | "sw %[temp8], 0(%[temp0]) \n\t" \ |
1427 | "lw %[temp8], 0(%[temp5]) \n\t" \ |
1428 | "ext %[temp2], %[temp2], 16, 16 \n\t" \ |
1429 | "addu %[temp6], %[temp6], %[dist] \n\t" \ |
1430 | "addiu %[temp8], %[temp8], 1 \n\t" \ |
1431 | "sw %[temp8], 0(%[temp5]) \n\t" \ |
1432 | "lw %[temp8], 0(%[temp1]) \n\t" \ |
1433 | "addu %[temp2], %[temp2], %[dist] \n\t" \ |
1434 | "ext %[temp7], %[temp3], 0, 16 \n\t" \ |
1435 | "addiu %[temp8], %[temp8], 1 \n\t" \ |
1436 | "sw %[temp8], 0(%[temp1]) \n\t" \ |
1437 | "lw %[temp8], 0(%[temp6]) \n\t" \ |
1438 | "ext %[temp3], %[temp3], 16, 16 \n\t" \ |
1439 | "addu %[temp7], %[temp7], %[dist] \n\t" \ |
1440 | "addiu %[temp8], %[temp8], 1 \n\t" \ |
1441 | "sw %[temp8], 0(%[temp6]) \n\t" \ |
1442 | "lw %[temp8], 0(%[temp2]) \n\t" \ |
1443 | "addu %[temp3], %[temp3], %[dist] \n\t" \ |
1444 | "addiu %[temp8], %[temp8], 1 \n\t" \ |
1445 | "sw %[temp8], 0(%[temp2]) \n\t" \ |
1446 | "lw %[temp8], 0(%[temp7]) \n\t" \ |
1447 | "addiu %[temp8], %[temp8], 1 \n\t" \ |
1448 | "sw %[temp8], 0(%[temp7]) \n\t" \ |
1449 | "lw %[temp8], 0(%[temp3]) \n\t" \ |
1450 | "addiu %[temp8], %[temp8], 1 \n\t" \ |
1451 | "sw %[temp8], 0(%[temp3]) \n\t" |
1452 | |
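// CONVERT_COEFFS_TO_BIN is a packed version of the generic binning loop
// (a sketch, per coefficient k of the current block):
//   int v = abs(out[k]) >> 3;
//   if (v > MAX_COEFF_THRESH) v = MAX_COEFF_THRESH;
//   ++distribution[v];
// The clamp to MAX_COEFF_THRESH (31) comes from the saturating shll_s.ph by
// 10 followed by shrl.ph by 10; the final shll.ph by 2 scales each bin index
// to a byte offset into the 32-bit distribution[] counters.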
1453 | static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, |
1454 | int start_block, int end_block, |
1455 | VP8Histogram* const histo) { |
1456 | int j; |
1457 | int distribution[MAX_COEFF_THRESH + 1] = { 0 }; |
1459 | for (j = start_block; j < end_block; ++j) { |
1460 | int16_t out[16]; |
1461 | int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8; |
1462 | |
1463 | VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out); |
1464 | |
1465 | // Convert coefficients to bin. |
1466 | __asm__ volatile ( |
1467 | CONVERT_COEFFS_TO_BIN( 0, 4, 8, 12) |
1468 | CONVERT_COEFFS_TO_BIN(16, 20, 24, 28) |
1469 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), |
1470 | [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), |
1471 | [temp6]"=&r" (temp6), [temp7]"=&r" (temp7), [temp8]"=&r" (temp8) |
1472 | : [dist]"r" (distribution), [out]"r" (out), [max_coeff]"r" (max_coeff) |
1473 | : "memory" |
1474 | ); |
1475 | } |
1476 | VP8SetHistogramData(distribution, histo); |
1477 | } |
1478 | |
1479 | #undef CONVERT_COEFFS_TO_BIN |
1480 | |
1481 | //------------------------------------------------------------------------------ |
1482 | // Entry point |
1483 | |
1484 | extern void VP8EncDspInitMIPSdspR2(void); |
1485 | |
1486 | WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void) { |
1487 | VP8FTransform = FTransform; |
1488 | VP8ITransform = ITransform; |
1489 | VP8TDisto4x4 = Disto4x4; |
1490 | VP8TDisto16x16 = Disto16x16; |
1491 | VP8EncPredLuma16 = Intra16Preds; |
1492 | VP8EncPredChroma8 = IntraChromaPreds; |
1493 | VP8EncPredLuma4 = Intra4Preds; |
1494 | #if !defined(WORK_AROUND_GCC) |
1495 | VP8SSE16x16 = SSE16x16; |
1496 | VP8SSE8x8 = SSE8x8; |
1497 | VP8SSE16x8 = SSE16x8; |
1498 | VP8SSE4x4 = SSE4x4; |
1499 | #endif |
1500 | VP8EncQuantizeBlock = QuantizeBlock; |
1501 | VP8EncQuantize2Blocks = Quantize2Blocks; |
1502 | VP8FTransformWHT = FTransformWHT; |
1503 | VP8CollectHistogram = CollectHistogram; |
1504 | } |
1505 | |
1506 | #else // !WEBP_USE_MIPS_DSP_R2 |
1507 | |
1508 | WEBP_DSP_INIT_STUB(VP8EncDspInitMIPSdspR2) |
1509 | |
1510 | #endif // WEBP_USE_MIPS_DSP_R2 |
1511 | |