enc_mips32.c source code [engine/third_party/libwebp/src/dsp/enc_mips32.c]

1	// Copyright 2014 Google Inc. All Rights Reserved.
2	//
3	// Use of this source code is governed by a BSD-style license
4	// that can be found in the COPYING file in the root of the source
5	// tree. An additional intellectual property rights grant can be found
6	// in the file PATENTS. All contributing project authors may
7	// be found in the AUTHORS file in the root of the source tree.
8	// -----------------------------------------------------------------------------
9	//
10	// MIPS version of speed-critical encoding functions.
11	//
12	// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
13	// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
14	// Slobodan Prijic (slobodan.prijic@imgtec.com)
15
16	#include "./dsp.h"
17
18	#if defined(WEBP_USE_MIPS32)
19
20	#include "./mips_macro.h"
21	#include "../enc/vp8i_enc.h"
22	#include "../enc/cost_enc.h"
23
// Fixed-point multipliers for the inverse transform. The assembly below
// multiplies by these constants and then shifts the product right by 16.
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
26
// Macro for one vertical pass in ITransformOne (MUL macro inlined).
// temp0..temp15 hold tmp[0]..tmp[15]; temp20 must hold the address the
// 16-bit coefficients are loaded from.
// A..D         - offsets in bytes of the four coefficients to load
// TEMP4        - scratch register, receives (value at A) + (value at B)
// TEMP0..TEMP3 - registers receiving the four results of this pass
// temp16..temp19 are clobbered as scratch registers.
#define VERTICAL_PASS(A, B, C, D, TEMP4, TEMP0, TEMP1, TEMP2, TEMP3) \
  "lh %[temp16], " #A "(%[temp20]) \n\t" \
  "lh %[temp18], " #B "(%[temp20]) \n\t" \
  "lh %[temp17], " #C "(%[temp20]) \n\t" \
  "lh %[temp19], " #D "(%[temp20]) \n\t" \
  "addu %[" #TEMP4 "], %[temp16], %[temp18] \n\t" \
  "subu %[temp16], %[temp16], %[temp18] \n\t" \
  "mul %[" #TEMP0 "], %[temp17], %[kC2] \n\t" \
  "mul %[temp18], %[temp19], %[kC1] \n\t" \
  "mul %[temp17], %[temp17], %[kC1] \n\t" \
  "mul %[temp19], %[temp19], %[kC2] \n\t" \
  "sra %[" #TEMP0 "], %[" #TEMP0 "], 16 \n\t" \
  "sra %[temp18], %[temp18], 16 \n\t" \
  "sra %[temp17], %[temp17], 16 \n\t" \
  "sra %[temp19], %[temp19], 16 \n\t" \
  "subu %[" #TEMP2 "], %[" #TEMP0 "], %[temp18] \n\t" \
  "addu %[" #TEMP3 "], %[temp17], %[temp19] \n\t" \
  "addu %[" #TEMP0 "], %[" #TEMP4 "], %[" #TEMP3 "] \n\t" \
  "addu %[" #TEMP1 "], %[temp16], %[" #TEMP2 "] \n\t" \
  "subu %[" #TEMP2 "], %[temp16], %[" #TEMP2 "] \n\t" \
  "subu %[" #TEMP3 "], %[" #TEMP4 "], %[" #TEMP3 "] \n\t"
54
// Macro for one horizontal pass in ITransformOne (MUL and STORE macros
// inlined).
// a = clip_8b(a) is replaced with: a = max(a, 0); a = min(a, 255)
// temp0..temp15 hold tmp[0]..tmp[15].
// A - row index; bytes are loaded from and stored to offset BPS * A + {0..3}
// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
// The source row pointer is read from args[0] and the destination row pointer
// from args[2]; temp16..temp20 are clobbered as scratch registers.
#define HORIZONTAL_PASS(A, TEMP0, TEMP4, TEMP8, TEMP12) \
  "addiu %[" #TEMP0 "], %[" #TEMP0 "], 4 \n\t" \
  "addu %[temp16], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
  "subu %[temp17], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
  "mul %[" #TEMP0 "], %[" #TEMP4 "], %[kC2] \n\t" \
  "mul %[" #TEMP8 "], %[" #TEMP12 "], %[kC1] \n\t" \
  "mul %[" #TEMP4 "], %[" #TEMP4 "], %[kC1] \n\t" \
  "mul %[" #TEMP12 "], %[" #TEMP12 "], %[kC2] \n\t" \
  "sra %[" #TEMP0 "], %[" #TEMP0 "], 16 \n\t" \
  "sra %[" #TEMP8 "], %[" #TEMP8 "], 16 \n\t" \
  "sra %[" #TEMP4 "], %[" #TEMP4 "], 16 \n\t" \
  "sra %[" #TEMP12 "], %[" #TEMP12 "], 16 \n\t" \
  "subu %[temp18], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
  "addu %[temp19], %[" #TEMP4 "], %[" #TEMP12 "] \n\t" \
  "addu %[" #TEMP0 "], %[temp16], %[temp19] \n\t" \
  "addu %[" #TEMP4 "], %[temp17], %[temp18] \n\t" \
  "subu %[" #TEMP8 "], %[temp17], %[temp18] \n\t" \
  "subu %[" #TEMP12 "], %[temp16], %[temp19] \n\t" \
  "lw %[temp20], 0(%[args]) \n\t" \
  "sra %[" #TEMP0 "], %[" #TEMP0 "], 3 \n\t" \
  "sra %[" #TEMP4 "], %[" #TEMP4 "], 3 \n\t" \
  "sra %[" #TEMP8 "], %[" #TEMP8 "], 3 \n\t" \
  "sra %[" #TEMP12 "], %[" #TEMP12 "], 3 \n\t" \
  "lbu %[temp16], 0+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
  "lbu %[temp17], 1+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
  "lbu %[temp18], 2+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
  "lbu %[temp19], 3+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
  "addu %[" #TEMP0 "], %[temp16], %[" #TEMP0 "] \n\t" \
  "addu %[" #TEMP4 "], %[temp17], %[" #TEMP4 "] \n\t" \
  "addu %[" #TEMP8 "], %[temp18], %[" #TEMP8 "] \n\t" \
  "addu %[" #TEMP12 "], %[temp19], %[" #TEMP12 "] \n\t" \
  "slt %[temp16], %[" #TEMP0 "], $zero \n\t" \
  "slt %[temp17], %[" #TEMP4 "], $zero \n\t" \
  "slt %[temp18], %[" #TEMP8 "], $zero \n\t" \
  "slt %[temp19], %[" #TEMP12 "], $zero \n\t" \
  "movn %[" #TEMP0 "], $zero, %[temp16] \n\t" \
  "movn %[" #TEMP4 "], $zero, %[temp17] \n\t" \
  "movn %[" #TEMP8 "], $zero, %[temp18] \n\t" \
  "movn %[" #TEMP12 "], $zero, %[temp19] \n\t" \
  "addiu %[temp20], $zero, 255 \n\t" \
  "slt %[temp16], %[" #TEMP0 "], %[temp20] \n\t" \
  "slt %[temp17], %[" #TEMP4 "], %[temp20] \n\t" \
  "slt %[temp18], %[" #TEMP8 "], %[temp20] \n\t" \
  "slt %[temp19], %[" #TEMP12 "], %[temp20] \n\t" \
  "movz %[" #TEMP0 "], %[temp20], %[temp16] \n\t" \
  "movz %[" #TEMP4 "], %[temp20], %[temp17] \n\t" \
  "lw %[temp16], 8(%[args]) \n\t" \
  "movz %[" #TEMP8 "], %[temp20], %[temp18] \n\t" \
  "movz %[" #TEMP12 "], %[temp20], %[temp19] \n\t" \
  "sb %[" #TEMP0 "], 0+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" \
  "sb %[" #TEMP4 "], 1+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" \
  "sb %[" #TEMP8 "], 2+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" \
  "sb %[" #TEMP12 "], 3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"
114
115	// Does one or two inverse transforms.
116	static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
117	uint8_t* dst) {
118	int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
119	int temp7, temp8, temp9, temp10, temp11, temp12, temp13;
120	int temp14, temp15, temp16, temp17, temp18, temp19, temp20;
121	const int* args[`3`] = {(const int)ref, (const* int)in, (const* int*)dst};
122
123	__asm__ volatile(
124	"lw %[temp20], 4(%[args]) \n\t"
125	VERTICAL_PASS(`0`, `16`, `8`, `24`, temp4, temp0, temp1, temp2, temp3)
126	VERTICAL_PASS(`2`, `18`, `10`, `26`, temp8, temp4, temp5, temp6, temp7)
127	VERTICAL_PASS(`4`, `20`, `12`, `28`, temp12, temp8, temp9, temp10, temp11)
128	VERTICAL_PASS(`6`, `22`, `14`, `30`, temp20, temp12, temp13, temp14, temp15)
129
130	HORIZONTAL_PASS(`0`, temp0, temp4, temp8, temp12)
131	HORIZONTAL_PASS(`1`, temp1, temp5, temp9, temp13)
132	HORIZONTAL_PASS(`2`, temp2, temp6, temp10, temp14)
133	HORIZONTAL_PASS(`3`, temp3, temp7, temp11, temp15)
134
135	: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
136	[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
137	[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
138	[temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
139	[temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
140	[temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
141	[temp18]"=&r"(temp18), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
142	: [args]"r"(args), [kC1]"r"(kC1), [kC2]"r"(kC2)
143	: "memory", "hi", "lo"
144	);
145	}
146
// Does one or two inverse transforms. When 'do_two' is non-zero, a second
// transform is applied 4 pixels to the right, using the next 16 coefficients.
static void ITransform(const uint8_t* ref, const int16_t* in,
                       uint8_t* dst, int do_two) {
  ITransformOne(ref, in, dst);
  if (do_two) {
    ITransformOne(ref + 4, in + 16, dst + 4);
  }
}
154
155	#undef VERTICAL_PASS
156	#undef HORIZONTAL_PASS
157
// Macro for one pass through the for loop in QuantizeBlock (QUANTDIV macro
// inlined).
// J - offset in bytes (kZigzag[n] * 2)
// K - offset in bytes (kZigzag[n] * 4)
// N - offset in bytes (n * 2)
// Takes the absolute value of the coefficient at J, adds the sharpen term and
// compares it against the zero-threshold. If it is not above the threshold,
// both the coefficient and the output level are stored as 0. Otherwise the
// level is computed, capped at max_level, given the original sign, and the
// coefficient is overwritten with level * q.
#define QUANTIZE_ONE(J, K, N) \
  "lh %[temp0], " #J "(%[ppin]) \n\t" \
  "lhu %[temp1], " #J "(%[ppsharpen]) \n\t" \
  "lw %[temp2], " #K "(%[ppzthresh]) \n\t" \
  "sra %[sign], %[temp0], 15 \n\t" \
  "xor %[coeff], %[temp0], %[sign] \n\t" \
  "subu %[coeff], %[coeff], %[sign] \n\t" \
  "addu %[coeff], %[coeff], %[temp1] \n\t" \
  "slt %[temp4], %[temp2], %[coeff] \n\t" \
  "addiu %[temp5], $zero, 0 \n\t" \
  "addiu %[level], $zero, 0 \n\t" \
  "beqz %[temp4], 2f \n\t" \
  "lhu %[temp1], " #J "(%[ppiq]) \n\t" \
  "lw %[temp2], " #K "(%[ppbias]) \n\t" \
  "lhu %[temp3], " #J "(%[ppq]) \n\t" \
  "mul %[level], %[coeff], %[temp1] \n\t" \
  "addu %[level], %[level], %[temp2] \n\t" \
  "sra %[level], %[level], 17 \n\t" \
  "slt %[temp4], %[max_level], %[level] \n\t" \
  "movn %[level], %[max_level], %[temp4] \n\t" \
  "xor %[level], %[level], %[sign] \n\t" \
  "subu %[level], %[level], %[sign] \n\t" \
  "mul %[temp5], %[level], %[temp3] \n\t" \
  "2: \n\t" \
  "sh %[temp5], " #J "(%[ppin]) \n\t" \
  "sh %[level], " #N "(%[pout]) \n\t"
189
190	static int QuantizeBlock(int16_t in[`16`], int16_t out[`16`],
191	const VP8Matrix* const mtx) {
192	int temp0, temp1, temp2, temp3, temp4, temp5;
193	int sign, coeff, level, i;
194	int max_level = MAX_LEVEL;
195
196	int16_t* ppin = &in[`0`];
197	int16_t* pout = &out[`0`];
198	const uint16_t* ppsharpen = &mtx->sharpen_[`0`];
199	const uint32_t* ppzthresh = &mtx->zthresh_[`0`];
200	const uint16_t* ppq = &mtx->q_[`0`];
201	const uint16_t* ppiq = &mtx->iq_[`0`];
202	const uint32_t* ppbias = &mtx->bias_[`0`];
203
204	__asm__ volatile(
205	QUANTIZE_ONE( `0`, `0`, `0`)
206	QUANTIZE_ONE( `2`, `4`, `2`)
207	QUANTIZE_ONE( `8`, `16`, `4`)
208	QUANTIZE_ONE(`16`, `32`, `6`)
209	QUANTIZE_ONE(`10`, `20`, `8`)
210	QUANTIZE_ONE( `4`, `8`, `10`)
211	QUANTIZE_ONE( `6`, `12`, `12`)
212	QUANTIZE_ONE(`12`, `24`, `14`)
213	QUANTIZE_ONE(`18`, `36`, `16`)
214	QUANTIZE_ONE(`24`, `48`, `18`)
215	QUANTIZE_ONE(`26`, `52`, `20`)
216	QUANTIZE_ONE(`20`, `40`, `22`)
217	QUANTIZE_ONE(`14`, `28`, `24`)
218	QUANTIZE_ONE(`22`, `44`, `26`)
219	QUANTIZE_ONE(`28`, `56`, `28`)
220	QUANTIZE_ONE(`30`, `60`, `30`)
221
222	: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
223	[temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
224	[temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
225	[sign]"=&r"(sign), [coeff]"=&r"(coeff),
226	[level]"=&r"(level)
227	: [pout]"r"(pout), [ppin]"r"(ppin),
228	[ppiq]"r"(ppiq), [max_level]"r"(max_level),
229	[ppbias]"r"(ppbias), [ppzthresh]"r"(ppzthresh),
230	[ppsharpen]"r"(ppsharpen), [ppq]"r"(ppq)
231	: "memory", "hi", "lo"
232	);
233
234	// moved out from macro to increase possibility for earlier breaking
235	for (i = `15`; i >= `0`; i--) {
236	if (out[i]) return `1`;
237	}
238	return `0`;
239	}
240
241	static int Quantize2Blocks(int16_t in[`32`], int16_t out[`32`],
242	const VP8Matrix* const mtx) {
243	int nz;
244	nz = QuantizeBlock(in + `0` * `16`, out + `0` * `16`, mtx) << `0`;
245	nz \|= QuantizeBlock(in + `1` * `16`, out + `1` * `16`, mtx) << `1`;
246	return nz;
247	}
248
249	#undef QUANTIZE_ONE
250
// Macro for one horizontal pass in Disto4x4 (TTransform).
// Two calls of function TTransform are merged into a single one: the same
// butterfly is applied to one row of 'a' and to the matching row of 'b'.
// A      - row index; bytes are loaded from offset BPS * A + {0..3} of a and b
// E..H   - offsets in bytes to store first results (from a) to tmp buffer
// E1..H1 - offsets in bytes to store second results (from b) to tmp buffer
#define HORIZONTAL_PASS(A, E, F, G, H, E1, F1, G1, H1) \
  "lbu %[temp0], 0+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
  "lbu %[temp1], 1+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
  "lbu %[temp2], 2+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
  "lbu %[temp3], 3+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
  "lbu %[temp4], 0+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
  "lbu %[temp5], 1+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
  "lbu %[temp6], 2+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
  "lbu %[temp7], 3+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
  "addu %[temp8], %[temp0], %[temp2] \n\t" \
  "subu %[temp0], %[temp0], %[temp2] \n\t" \
  "addu %[temp2], %[temp1], %[temp3] \n\t" \
  "subu %[temp1], %[temp1], %[temp3] \n\t" \
  "addu %[temp3], %[temp4], %[temp6] \n\t" \
  "subu %[temp4], %[temp4], %[temp6] \n\t" \
  "addu %[temp6], %[temp5], %[temp7] \n\t" \
  "subu %[temp5], %[temp5], %[temp7] \n\t" \
  "addu %[temp7], %[temp8], %[temp2] \n\t" \
  "subu %[temp2], %[temp8], %[temp2] \n\t" \
  "addu %[temp8], %[temp0], %[temp1] \n\t" \
  "subu %[temp0], %[temp0], %[temp1] \n\t" \
  "addu %[temp1], %[temp3], %[temp6] \n\t" \
  "subu %[temp3], %[temp3], %[temp6] \n\t" \
  "addu %[temp6], %[temp4], %[temp5] \n\t" \
  "subu %[temp4], %[temp4], %[temp5] \n\t" \
  "sw %[temp7], " #E "(%[tmp]) \n\t" \
  "sw %[temp2], " #H "(%[tmp]) \n\t" \
  "sw %[temp8], " #F "(%[tmp]) \n\t" \
  "sw %[temp0], " #G "(%[tmp]) \n\t" \
  "sw %[temp1], " #E1 "(%[tmp]) \n\t" \
  "sw %[temp3], " #H1 "(%[tmp]) \n\t" \
  "sw %[temp6], " #F1 "(%[tmp]) \n\t" \
  "sw %[temp4], " #G1 "(%[tmp]) \n\t"
289
// Macro for one vertical pass in Disto4x4 (TTransform).
// Two calls of function TTransform are merged into a single one. Since only
// one accumulator is available in the mips32r1 instruction set, the second
// call of TTransform is processed first and the first call after it:
//   const int sum1 = TTransform(a, w);
//   const int sum2 = TTransform(b, w);
//   return abs(sum2 - sum1) >> 5;
// (sum2 - sum1) is accumulated in hi/lo with madd (sum2) and msub (sum1).
// A..D   - offsets in bytes to load first results from tmp buffer
// A1..D1 - offsets in bytes to load second results from tmp buffer
// E..H   - offsets in bytes to load from w buffer
#define VERTICAL_PASS(A, B, C, D, A1, B1, C1, D1, E, F, G, H) \
  "lw %[temp0], " #A1 "(%[tmp]) \n\t" \
  "lw %[temp1], " #C1 "(%[tmp]) \n\t" \
  "lw %[temp2], " #B1 "(%[tmp]) \n\t" \
  "lw %[temp3], " #D1 "(%[tmp]) \n\t" \
  "addu %[temp8], %[temp0], %[temp1] \n\t" \
  "subu %[temp0], %[temp0], %[temp1] \n\t" \
  "addu %[temp1], %[temp2], %[temp3] \n\t" \
  "subu %[temp2], %[temp2], %[temp3] \n\t" \
  "addu %[temp3], %[temp8], %[temp1] \n\t" \
  "subu %[temp8], %[temp8], %[temp1] \n\t" \
  "addu %[temp1], %[temp0], %[temp2] \n\t" \
  "subu %[temp0], %[temp0], %[temp2] \n\t" \
  "sra %[temp4], %[temp3], 31 \n\t" \
  "sra %[temp5], %[temp1], 31 \n\t" \
  "sra %[temp6], %[temp0], 31 \n\t" \
  "sra %[temp7], %[temp8], 31 \n\t" \
  "xor %[temp3], %[temp3], %[temp4] \n\t" \
  "xor %[temp1], %[temp1], %[temp5] \n\t" \
  "xor %[temp0], %[temp0], %[temp6] \n\t" \
  "xor %[temp8], %[temp8], %[temp7] \n\t" \
  "subu %[temp3], %[temp3], %[temp4] \n\t" \
  "subu %[temp1], %[temp1], %[temp5] \n\t" \
  "subu %[temp0], %[temp0], %[temp6] \n\t" \
  "subu %[temp8], %[temp8], %[temp7] \n\t" \
  "lhu %[temp4], " #E "(%[w]) \n\t" \
  "lhu %[temp5], " #F "(%[w]) \n\t" \
  "lhu %[temp6], " #G "(%[w]) \n\t" \
  "lhu %[temp7], " #H "(%[w]) \n\t" \
  "madd %[temp4], %[temp3] \n\t" \
  "madd %[temp5], %[temp1] \n\t" \
  "madd %[temp6], %[temp0] \n\t" \
  "madd %[temp7], %[temp8] \n\t" \
  "lw %[temp0], " #A "(%[tmp]) \n\t" \
  "lw %[temp1], " #C "(%[tmp]) \n\t" \
  "lw %[temp2], " #B "(%[tmp]) \n\t" \
  "lw %[temp3], " #D "(%[tmp]) \n\t" \
  "addu %[temp8], %[temp0], %[temp1] \n\t" \
  "subu %[temp0], %[temp0], %[temp1] \n\t" \
  "addu %[temp1], %[temp2], %[temp3] \n\t" \
  "subu %[temp2], %[temp2], %[temp3] \n\t" \
  "addu %[temp3], %[temp8], %[temp1] \n\t" \
  "subu %[temp1], %[temp8], %[temp1] \n\t" \
  "addu %[temp8], %[temp0], %[temp2] \n\t" \
  "subu %[temp0], %[temp0], %[temp2] \n\t" \
  "sra %[temp2], %[temp3], 31 \n\t" \
  "xor %[temp3], %[temp3], %[temp2] \n\t" \
  "subu %[temp3], %[temp3], %[temp2] \n\t" \
  "msub %[temp4], %[temp3] \n\t" \
  "sra %[temp2], %[temp8], 31 \n\t" \
  "sra %[temp3], %[temp0], 31 \n\t" \
  "sra %[temp4], %[temp1], 31 \n\t" \
  "xor %[temp8], %[temp8], %[temp2] \n\t" \
  "xor %[temp0], %[temp0], %[temp3] \n\t" \
  "xor %[temp1], %[temp1], %[temp4] \n\t" \
  "subu %[temp8], %[temp8], %[temp2] \n\t" \
  "subu %[temp0], %[temp0], %[temp3] \n\t" \
  "subu %[temp1], %[temp1], %[temp4] \n\t" \
  "msub %[temp5], %[temp8] \n\t" \
  "msub %[temp6], %[temp0] \n\t" \
  "msub %[temp7], %[temp1] \n\t"
363
364	static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
365	const uint16_t* const w) {
366	int tmp[`32`];
367	int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
368
369	__asm__ volatile(
370	HORIZONTAL_PASS(`0`, `0`, `4`, `8`, `12`, `64`, `68`, `72`, `76`)
371	HORIZONTAL_PASS(`1`, `16`, `20`, `24`, `28`, `80`, `84`, `88`, `92`)
372	HORIZONTAL_PASS(`2`, `32`, `36`, `40`, `44`, `96`, `100`, `104`, `108`)
373	HORIZONTAL_PASS(`3`, `48`, `52`, `56`, `60`, `112`, `116`, `120`, `124`)
374	"mthi $zero \n\t"
375	"mtlo $zero \n\t"
376	VERTICAL_PASS( `0`, `16`, `32`, `48`, `64`, `80`, `96`, `112`, `0`, `8`, `16`, `24`)
377	VERTICAL_PASS( `4`, `20`, `36`, `52`, `68`, `84`, `100`, `116`, `2`, `10`, `18`, `26`)
378	VERTICAL_PASS( `8`, `24`, `40`, `56`, `72`, `88`, `104`, `120`, `4`, `12`, `20`, `28`)
379	VERTICAL_PASS(`12`, `28`, `44`, `60`, `76`, `92`, `108`, `124`, `6`, `14`, `22`, `30`)
380	"mflo %[temp0] \n\t"
381	"sra %[temp1], %[temp0], 31 \n\t"
382	"xor %[temp0], %[temp0], %[temp1] \n\t"
383	"subu %[temp0], %[temp0], %[temp1] \n\t"
384	"sra %[temp0], %[temp0], 5 \n\t"
385
386	: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
387	[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
388	[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
389	: [a]"r"(a), [b]"r"(b), [w]"r"(w), [tmp]"r"(tmp)
390	: "memory", "hi", "lo"
391	);
392
393	return temp0;
394	}
395
396	#undef VERTICAL_PASS
397	#undef HORIZONTAL_PASS
398
399	static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
400	const uint16_t* const w) {
401	int D = `0`;
402	int x, y;
403	for (y = `0`; y < `16` * BPS; y += `4` * BPS) {
404	for (x = `0`; x < `16`; x += `4`) {
405	D += Disto4x4(a + x + y, b + x + y, w);
406	}
407	}
408	return D;
409	}
410
// Macro for one horizontal pass in FTransform.
// temp0..temp15 hold tmp[0]..tmp[15].
// A - row index; bytes are loaded from offset BPS * A + {0..3} of the src and
//     ref buffers, whose addresses are read from args[0] and args[1]
// TEMP0..TEMP3 - registers for corresponding tmp elements
// TEMP1 and TEMP2 temporarily hold the two buffer addresses;
// temp16..temp20 are clobbered as scratch registers.
#define HORIZONTAL_PASS(A, TEMP0, TEMP1, TEMP2, TEMP3) \
  "lw %[" #TEMP1 "], 0(%[args]) \n\t" \
  "lw %[" #TEMP2 "], 4(%[args]) \n\t" \
  "lbu %[temp16], 0+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
  "lbu %[temp17], 0+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
  "lbu %[temp18], 1+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
  "lbu %[temp19], 1+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
  "subu %[temp20], %[temp16], %[temp17] \n\t" \
  "lbu %[temp16], 2+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
  "lbu %[temp17], 2+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
  "subu %[" #TEMP0 "], %[temp18], %[temp19] \n\t" \
  "lbu %[temp18], 3+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
  "lbu %[temp19], 3+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
  "subu %[" #TEMP1 "], %[temp16], %[temp17] \n\t" \
  "subu %[" #TEMP2 "], %[temp18], %[temp19] \n\t" \
  "addu %[" #TEMP3 "], %[temp20], %[" #TEMP2 "] \n\t" \
  "subu %[" #TEMP2 "], %[temp20], %[" #TEMP2 "] \n\t" \
  "addu %[temp20], %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \
  "subu %[" #TEMP0 "], %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \
  "mul %[temp16], %[" #TEMP2 "], %[c5352] \n\t" \
  "mul %[temp17], %[" #TEMP2 "], %[c2217] \n\t" \
  "mul %[temp18], %[" #TEMP0 "], %[c5352] \n\t" \
  "mul %[temp19], %[" #TEMP0 "], %[c2217] \n\t" \
  "addu %[" #TEMP1 "], %[" #TEMP3 "], %[temp20] \n\t" \
  "subu %[temp20], %[" #TEMP3 "], %[temp20] \n\t" \
  "sll %[" #TEMP0 "], %[" #TEMP1 "], 3 \n\t" \
  "sll %[" #TEMP2 "], %[temp20], 3 \n\t" \
  "addiu %[temp16], %[temp16], 1812 \n\t" \
  "addiu %[temp17], %[temp17], 937 \n\t" \
  "addu %[temp16], %[temp16], %[temp19] \n\t" \
  "subu %[temp17], %[temp17], %[temp18] \n\t" \
  "sra %[" #TEMP1 "], %[temp16], 9 \n\t" \
  "sra %[" #TEMP3 "], %[temp17], 9 \n\t"
448
// Macro for one vertical pass in FTransform.
// temp0..temp15 hold tmp[0]..tmp[15]; temp20 must hold the address of the
// output buffer.
// A..D - offsets in bytes to store to out buffer
// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
// The rounding constant 51000 is added to TEMP8 in two steps
// (30000 + 21000); the single constant presumably does not fit addiu's
// signed 16-bit immediate.
// temp16..temp19 are clobbered as scratch registers.
#define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12) \
  "addu %[temp16], %[" #TEMP0 "], %[" #TEMP12 "] \n\t" \
  "subu %[temp19], %[" #TEMP0 "], %[" #TEMP12 "] \n\t" \
  "addu %[temp17], %[" #TEMP4 "], %[" #TEMP8 "] \n\t" \
  "subu %[temp18], %[" #TEMP4 "], %[" #TEMP8 "] \n\t" \
  "mul %[" #TEMP8 "], %[temp19], %[c2217] \n\t" \
  "mul %[" #TEMP12 "], %[temp18], %[c2217] \n\t" \
  "mul %[" #TEMP4 "], %[temp19], %[c5352] \n\t" \
  "mul %[temp18], %[temp18], %[c5352] \n\t" \
  "addiu %[temp16], %[temp16], 7 \n\t" \
  "addu %[" #TEMP0 "], %[temp16], %[temp17] \n\t" \
  "sra %[" #TEMP0 "], %[" #TEMP0 "], 4 \n\t" \
  "addu %[" #TEMP12 "], %[" #TEMP12 "], %[" #TEMP4 "] \n\t" \
  "subu %[" #TEMP4 "], %[temp16], %[temp17] \n\t" \
  "sra %[" #TEMP4 "], %[" #TEMP4 "], 4 \n\t" \
  "addiu %[" #TEMP8 "], %[" #TEMP8 "], 30000 \n\t" \
  "addiu %[" #TEMP12 "], %[" #TEMP12 "], 12000 \n\t" \
  "addiu %[" #TEMP8 "], %[" #TEMP8 "], 21000 \n\t" \
  "subu %[" #TEMP8 "], %[" #TEMP8 "], %[temp18] \n\t" \
  "sra %[" #TEMP12 "], %[" #TEMP12 "], 16 \n\t" \
  "sra %[" #TEMP8 "], %[" #TEMP8 "], 16 \n\t" \
  "addiu %[temp16], %[" #TEMP12 "], 1 \n\t" \
  "movn %[" #TEMP12 "], %[temp16], %[temp19] \n\t" \
  "sh %[" #TEMP0 "], " #A "(%[temp20]) \n\t" \
  "sh %[" #TEMP4 "], " #C "(%[temp20]) \n\t" \
  "sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \
  "sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t"
480
481	static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
482	int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
483	int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
484	int temp17, temp18, temp19, temp20;
485	const int c2217 = `2217`;
486	const int c5352 = `5352`;
487	const int* const args[`3`] =
488	{ (const int)src, (const* int)ref, (const* int*)out };
489
490	__asm__ volatile(
491	HORIZONTAL_PASS(`0`, temp0, temp1, temp2, temp3)
492	HORIZONTAL_PASS(`1`, temp4, temp5, temp6, temp7)
493	HORIZONTAL_PASS(`2`, temp8, temp9, temp10, temp11)
494	HORIZONTAL_PASS(`3`, temp12, temp13, temp14, temp15)
495	"lw %[temp20], 8(%[args]) \n\t"
496	VERTICAL_PASS(`0`, `8`, `16`, `24`, temp0, temp4, temp8, temp12)
497	VERTICAL_PASS(`2`, `10`, `18`, `26`, temp1, temp5, temp9, temp13)
498	VERTICAL_PASS(`4`, `12`, `20`, `28`, temp2, temp6, temp10, temp14)
499	VERTICAL_PASS(`6`, `14`, `22`, `30`, temp3, temp7, temp11, temp15)
500
501	: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
502	[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
503	[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
504	[temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
505	[temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
506	[temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
507	[temp18]"=&r"(temp18), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
508	: [args]"r"(args), [c2217]"r"(c2217), [c5352]"r"(c5352)
509	: "memory", "hi", "lo"
510	);
511	}
512
513	#undef VERTICAL_PASS
514	#undef HORIZONTAL_PASS
515
516	#if !defined(WORK_AROUND_GCC)
517
// Accumulates into hi/lo the squared differences of four byte pairs.
// A..D - offsets in bytes of the four bytes to load from both a and b
// temp0..temp7 are clobbered.
#define GET_SSE_INNER(A, B, C, D) \
  "lbu %[temp0], " #A "(%[a]) \n\t" \
  "lbu %[temp1], " #A "(%[b]) \n\t" \
  "lbu %[temp2], " #B "(%[a]) \n\t" \
  "lbu %[temp3], " #B "(%[b]) \n\t" \
  "lbu %[temp4], " #C "(%[a]) \n\t" \
  "lbu %[temp5], " #C "(%[b]) \n\t" \
  "lbu %[temp6], " #D "(%[a]) \n\t" \
  "lbu %[temp7], " #D "(%[b]) \n\t" \
  "subu %[temp0], %[temp0], %[temp1] \n\t" \
  "subu %[temp2], %[temp2], %[temp3] \n\t" \
  "subu %[temp4], %[temp4], %[temp5] \n\t" \
  "subu %[temp6], %[temp6], %[temp7] \n\t" \
  "madd %[temp0], %[temp0] \n\t" \
  "madd %[temp2], %[temp2] \n\t" \
  "madd %[temp4], %[temp4] \n\t" \
  "madd %[temp6], %[temp6] \n\t"

// Accumulates the squared differences of four groups of four consecutive
// bytes. A..D are the byte offsets of the first byte of each group; the
// "+ 1" .. "+ 3" expressions are stringified as written and left for the
// assembler to evaluate.
#define GET_SSE(A, B, C, D) \
  GET_SSE_INNER(A, A + 1, A + 2, A + 3) \
  GET_SSE_INNER(B, B + 1, B + 2, B + 3) \
  GET_SSE_INNER(C, C + 1, C + 2, C + 3) \
  GET_SSE_INNER(D, D + 1, D + 2, D + 3)
541
542	static int SSE16x16(const uint8_t* a, const uint8_t* b) {
543	int count;
544	int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
545
546	__asm__ volatile(
547	"mult $zero, $zero \n\t"
548
549	GET_SSE( `0` * BPS, `4` + `0` * BPS, `8` + `0` * BPS, `12` + `0` * BPS)
550	GET_SSE( `1` * BPS, `4` + `1` * BPS, `8` + `1` * BPS, `12` + `1` * BPS)
551	GET_SSE( `2` * BPS, `4` + `2` * BPS, `8` + `2` * BPS, `12` + `2` * BPS)
552	GET_SSE( `3` * BPS, `4` + `3` * BPS, `8` + `3` * BPS, `12` + `3` * BPS)
553	GET_SSE( `4` * BPS, `4` + `4` * BPS, `8` + `4` * BPS, `12` + `4` * BPS)
554	GET_SSE( `5` * BPS, `4` + `5` * BPS, `8` + `5` * BPS, `12` + `5` * BPS)
555	GET_SSE( `6` * BPS, `4` + `6` * BPS, `8` + `6` * BPS, `12` + `6` * BPS)
556	GET_SSE( `7` * BPS, `4` + `7` * BPS, `8` + `7` * BPS, `12` + `7` * BPS)
557	GET_SSE( `8` * BPS, `4` + `8` * BPS, `8` + `8` * BPS, `12` + `8` * BPS)
558	GET_SSE( `9` * BPS, `4` + `9` * BPS, `8` + `9` * BPS, `12` + `9` * BPS)
559	GET_SSE(`10` * BPS, `4` + `10` * BPS, `8` + `10` * BPS, `12` + `10` * BPS)
560	GET_SSE(`11` * BPS, `4` + `11` * BPS, `8` + `11` * BPS, `12` + `11` * BPS)
561	GET_SSE(`12` * BPS, `4` + `12` * BPS, `8` + `12` * BPS, `12` + `12` * BPS)
562	GET_SSE(`13` * BPS, `4` + `13` * BPS, `8` + `13` * BPS, `12` + `13` * BPS)
563	GET_SSE(`14` * BPS, `4` + `14` * BPS, `8` + `14` * BPS, `12` + `14` * BPS)
564	GET_SSE(`15` * BPS, `4` + `15` * BPS, `8` + `15` * BPS, `12` + `15` * BPS)
565
566	"mflo %[count] \n\t"
567	: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
568	[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
569	[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
570	: [a]"r"(a), [b]"r"(b)
571	: "memory", "hi", "lo"
572	);
573	return count;
574	}
575
576	static int SSE16x8(const uint8_t* a, const uint8_t* b) {
577	int count;
578	int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
579
580	__asm__ volatile(
581	"mult $zero, $zero \n\t"
582
583	GET_SSE( `0` * BPS, `4` + `0` * BPS, `8` + `0` * BPS, `12` + `0` * BPS)
584	GET_SSE( `1` * BPS, `4` + `1` * BPS, `8` + `1` * BPS, `12` + `1` * BPS)
585	GET_SSE( `2` * BPS, `4` + `2` * BPS, `8` + `2` * BPS, `12` + `2` * BPS)
586	GET_SSE( `3` * BPS, `4` + `3` * BPS, `8` + `3` * BPS, `12` + `3` * BPS)
587	GET_SSE( `4` * BPS, `4` + `4` * BPS, `8` + `4` * BPS, `12` + `4` * BPS)
588	GET_SSE( `5` * BPS, `4` + `5` * BPS, `8` + `5` * BPS, `12` + `5` * BPS)
589	GET_SSE( `6` * BPS, `4` + `6` * BPS, `8` + `6` * BPS, `12` + `6` * BPS)
590	GET_SSE( `7` * BPS, `4` + `7` * BPS, `8` + `7` * BPS, `12` + `7` * BPS)
591
592	"mflo %[count] \n\t"
593	: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
594	[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
595	[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
596	: [a]"r"(a), [b]"r"(b)
597	: "memory", "hi", "lo"
598	);
599	return count;
600	}
601
602	static int SSE8x8(const uint8_t* a, const uint8_t* b) {
603	int count;
604	int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
605
606	__asm__ volatile(
607	"mult $zero, $zero \n\t"
608
609	GET_SSE(`0` * BPS, `4` + `0` * BPS, `1` * BPS, `4` + `1` * BPS)
610	GET_SSE(`2` * BPS, `4` + `2` * BPS, `3` * BPS, `4` + `3` * BPS)
611	GET_SSE(`4` * BPS, `4` + `4` * BPS, `5` * BPS, `4` + `5` * BPS)
612	GET_SSE(`6` * BPS, `4` + `6` * BPS, `7` * BPS, `4` + `7` * BPS)
613
614	"mflo %[count] \n\t"
615	: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
616	[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
617	[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
618	: [a]"r"(a), [b]"r"(b)
619	: "memory", "hi", "lo"
620	);
621	return count;
622	}
623
624	static int SSE4x4(const uint8_t* a, const uint8_t* b) {
625	int count;
626	int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
627
628	__asm__ volatile(
629	"mult $zero, $zero \n\t"
630
631	GET_SSE(`0` * BPS, `1` * BPS, `2` * BPS, `3` * BPS)
632
633	"mflo %[count] \n\t"
634	: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
635	[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
636	[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
637	: [a]"r"(a), [b]"r"(b)
638	: "memory", "hi", "lo"
639	);
640	return count;
641	}
642
643	#undef GET_SSE
644	#undef GET_SSE_INNER
645
646	#endif // !WORK_AROUND_GCC
647
648	//------------------------------------------------------------------------------
649	// Entry point
650
651	extern void VP8EncDspInitMIPS32(void);
652
653	WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPS32(void) {
654	VP8ITransform = ITransform;
655	VP8FTransform = FTransform;
656	VP8EncQuantizeBlock = QuantizeBlock;
657	VP8EncQuantize2Blocks = Quantize2Blocks;
658	VP8TDisto4x4 = Disto4x4;
659	VP8TDisto16x16 = Disto16x16;
660	#if !defined(WORK_AROUND_GCC)
661	VP8SSE16x16 = SSE16x16;
662	VP8SSE8x8 = SSE8x8;
663	VP8SSE16x8 = SSE16x8;
664	VP8SSE4x4 = SSE4x4;
665	#endif
666	}
667
668	#else // !WEBP_USE_MIPS32
669
670	WEBP_DSP_INIT_STUB(VP8EncDspInitMIPS32)
671
672	#endif // WEBP_USE_MIPS32
673

Browse the source code of engine/third_party/libwebp/src/dsp/enc_mips32.c