enc_mips32.c source code [Godot/thirdparty/libwebp/src/dsp/enc_mips32.c]

1	// Copyright 2014 Google Inc. All Rights Reserved.
2	//
3	// Use of this source code is governed by a BSD-style license
4	// that can be found in the COPYING file in the root of the source
5	// tree. An additional intellectual property rights grant can be found
6	// in the file PATENTS. All contributing project authors may
7	// be found in the AUTHORS file in the root of the source tree.
8	// -----------------------------------------------------------------------------
9	//
10	// MIPS version of speed-critical encoding functions.
11	//
12	// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
13	// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
14	// Slobodan Prijic (slobodan.prijic@imgtec.com)
15
16	#include "src/dsp/dsp.h"
17
18	#if defined(WEBP_USE_MIPS32)
19
20	#include "src/dsp/mips_macro.h"
21	#include "src/enc/vp8i_enc.h"
22	#include "src/enc/cost_enc.h"
23
// Integer multipliers used by the ITransformOne passes below. Every product
// with kC1 or kC2 is followed there by an arithmetic shift right by 16.
24	static const int kC1 = `20091` + (`1` << `16`);
25	static const int kC2 = `35468`;
26
27	// macro for one vertical pass in ITransformOne
28	// MUL macro inlined
29	// temp0..temp15 holds tmp[0]..tmp[15]
30	// A..D - offsets in bytes to load from in buffer
31	// TEMP0..TEMP3 - registers for corresponding tmp elements
32	// TEMP4 - temporary register (holds in[A] + in[B] until the final subu)
//
// Expects %[temp20] to hold the base address of the coefficient buffer; the
// four inputs are read as signed halfwords (lh). With
//   a = in[A] + in[B]                     b = in[A] - in[B]
//   c = ((in[C] * kC2) >> 16) - ((in[D] * kC1) >> 16)
//   d = ((in[C] * kC1) >> 16) + ((in[D] * kC2) >> 16)
// the macro leaves TEMP0 = a + d, TEMP1 = b + c, TEMP2 = b - c, TEMP3 = a - d.
// temp16..temp19 are used as scratch. TEMP4 is written only after all four
// loads, so it may alias %[temp20] (the last invocation does exactly that).
// NOTE(review): the four "sra ..., 16" lines end in "\n\n" instead of "\n\t";
// presumably this only emits blank lines in the assembly — confirm before
// normalizing.
33	#define VERTICAL_PASS(A, B, C, D, TEMP4, TEMP0, TEMP1, TEMP2, TEMP3) \
34	"lh %[temp16], " #A "(%[temp20]) \n\t" \
35	"lh %[temp18], " #B "(%[temp20]) \n\t" \
36	"lh %[temp17], " #C "(%[temp20]) \n\t" \
37	"lh %[temp19], " #D "(%[temp20]) \n\t" \
38	"addu %[" #TEMP4 "], %[temp16], %[temp18] \n\t" \
39	"subu %[temp16], %[temp16], %[temp18] \n\t" \
40	"mul %[" #TEMP0 "], %[temp17], %[kC2] \n\t" \
41	"mul %[temp18], %[temp19], %[kC1] \n\t" \
42	"mul %[temp17], %[temp17], %[kC1] \n\t" \
43	"mul %[temp19], %[temp19], %[kC2] \n\t" \
44	"sra %[" #TEMP0 "], %[" #TEMP0 "], 16 \n\n" \
45	"sra %[temp18], %[temp18], 16 \n\n" \
46	"sra %[temp17], %[temp17], 16 \n\n" \
47	"sra %[temp19], %[temp19], 16 \n\n" \
48	"subu %[" #TEMP2 "], %[" #TEMP0 "], %[temp18] \n\t" \
49	"addu %[" #TEMP3 "], %[temp17], %[temp19] \n\t" \
50	"addu %[" #TEMP0 "], %[" #TEMP4 "], %[" #TEMP3 "] \n\t" \
51	"addu %[" #TEMP1 "], %[temp16], %[" #TEMP2 "] \n\t" \
52	"subu %[" #TEMP2 "], %[temp16], %[" #TEMP2 "] \n\t" \
53	"subu %[" #TEMP3 "], %[" #TEMP4 "], %[" #TEMP3 "] \n\t"
54
55	// macro for one horizontal pass in ITransformOne
56	// MUL and STORE macros inlined
57	// a = clip_8b(a) is replaced with: a = max(a, 0); a = min(a, 255)
58	// temp0..temp15 holds tmp[0]..tmp[15]
59	// A - row index; bytes are loaded from ref and stored to dst at BPS * A + 0..3
60	// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
//
// Steps, in order:
//   1. TEMP0 += 4 (added before the later ">> 3").
//   2. Same butterfly as the vertical pass, on TEMP0/TEMP8 (sum/difference)
//      and TEMP4/TEMP12 (kC1/kC2 products shifted right by 16).
//   3. Each of the four results is shifted right by 3.
//   4. The ref pointer is read from args[0] (offset 0) into temp20, four
//      bytes of row A are loaded with lbu and added to the results.
//   5. Results are clamped: negative values become 0 (slt/movn), values that
//      are not below 255 become 255 (slt/movz against temp20 = 255).
//   6. The dst pointer is read from args[2] (offset 8) into temp16 and the
//      four bytes are stored with sb.
// temp16..temp20 are scratch; temp20 is overwritten here.
61	#define HORIZONTAL_PASS(A, TEMP0, TEMP4, TEMP8, TEMP12) \
62	"addiu %[" #TEMP0 "], %[" #TEMP0 "], 4 \n\t" \
63	"addu %[temp16], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
64	"subu %[temp17], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
65	"mul %[" #TEMP0 "], %[" #TEMP4 "], %[kC2] \n\t" \
66	"mul %[" #TEMP8 "], %[" #TEMP12 "], %[kC1] \n\t" \
67	"mul %[" #TEMP4 "], %[" #TEMP4 "], %[kC1] \n\t" \
68	"mul %[" #TEMP12 "], %[" #TEMP12 "], %[kC2] \n\t" \
69	"sra %[" #TEMP0 "], %[" #TEMP0 "], 16 \n\t" \
70	"sra %[" #TEMP8 "], %[" #TEMP8 "], 16 \n\t" \
71	"sra %[" #TEMP4 "], %[" #TEMP4 "], 16 \n\t" \
72	"sra %[" #TEMP12 "], %[" #TEMP12 "], 16 \n\t" \
73	"subu %[temp18], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
74	"addu %[temp19], %[" #TEMP4 "], %[" #TEMP12 "] \n\t" \
75	"addu %[" #TEMP0 "], %[temp16], %[temp19] \n\t" \
76	"addu %[" #TEMP4 "], %[temp17], %[temp18] \n\t" \
77	"subu %[" #TEMP8 "], %[temp17], %[temp18] \n\t" \
78	"subu %[" #TEMP12 "], %[temp16], %[temp19] \n\t" \
79	"lw %[temp20], 0(%[args]) \n\t" \
80	"sra %[" #TEMP0 "], %[" #TEMP0 "], 3 \n\t" \
81	"sra %[" #TEMP4 "], %[" #TEMP4 "], 3 \n\t" \
82	"sra %[" #TEMP8 "], %[" #TEMP8 "], 3 \n\t" \
83	"sra %[" #TEMP12 "], %[" #TEMP12 "], 3 \n\t" \
84	"lbu %[temp16], 0+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
85	"lbu %[temp17], 1+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
86	"lbu %[temp18], 2+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
87	"lbu %[temp19], 3+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
88	"addu %[" #TEMP0 "], %[temp16], %[" #TEMP0 "] \n\t" \
89	"addu %[" #TEMP4 "], %[temp17], %[" #TEMP4 "] \n\t" \
90	"addu %[" #TEMP8 "], %[temp18], %[" #TEMP8 "] \n\t" \
91	"addu %[" #TEMP12 "], %[temp19], %[" #TEMP12 "] \n\t" \
92	"slt %[temp16], %[" #TEMP0 "], $zero \n\t" \
93	"slt %[temp17], %[" #TEMP4 "], $zero \n\t" \
94	"slt %[temp18], %[" #TEMP8 "], $zero \n\t" \
95	"slt %[temp19], %[" #TEMP12 "], $zero \n\t" \
96	"movn %[" #TEMP0 "], $zero, %[temp16] \n\t" \
97	"movn %[" #TEMP4 "], $zero, %[temp17] \n\t" \
98	"movn %[" #TEMP8 "], $zero, %[temp18] \n\t" \
99	"movn %[" #TEMP12 "], $zero, %[temp19] \n\t" \
100	"addiu %[temp20], $zero, 255 \n\t" \
101	"slt %[temp16], %[" #TEMP0 "], %[temp20] \n\t" \
102	"slt %[temp17], %[" #TEMP4 "], %[temp20] \n\t" \
103	"slt %[temp18], %[" #TEMP8 "], %[temp20] \n\t" \
104	"slt %[temp19], %[" #TEMP12 "], %[temp20] \n\t" \
105	"movz %[" #TEMP0 "], %[temp20], %[temp16] \n\t" \
106	"movz %[" #TEMP4 "], %[temp20], %[temp17] \n\t" \
107	"lw %[temp16], 8(%[args]) \n\t" \
108	"movz %[" #TEMP8 "], %[temp20], %[temp18] \n\t" \
109	"movz %[" #TEMP12 "], %[temp20], %[temp19] \n\t" \
110	"sb %[" #TEMP0 "], 0+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" \
111	"sb %[" #TEMP4 "], 1+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" \
112	"sb %[" #TEMP8 "], 2+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" \
113	"sb %[" #TEMP12 "], 3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"
114
115	// Does one inverse transform (ITransform_MIPS32 below runs it once or twice).
//
// ref - prediction pixels; four rows of four bytes are read, rows BPS apart.
// in  - coefficients; sixteen signed halfwords at byte offsets 0..30 are read.
// dst - output pixels; four rows of four bytes are written, rows BPS apart.
//
// The three pointers are handed to the assembly through the 'args' array and
// fetched with "lw" at byte offsets 0, 4 and 8, so the code relies on pointers
// being 4 bytes wide. The four vertical passes fill temp0..temp15 from 'in';
// the four horizontal passes then add 'ref', clamp to [0, 255] and store to
// 'dst'.
116	static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref,
117	const int16_t* in,
118	uint8_t* dst) {
119	int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
120	int temp7, temp8, temp9, temp10, temp11, temp12, temp13;
121	int temp14, temp15, temp16, temp17, temp18, temp19, temp20;
// Every element is cast to 'const int*' so the initializer matches the
// array's element type.
122	const int* args[`3`] = {(const int*)ref, (const int*)in, (const int*)dst};
123
124	__asm__ volatile(
// temp20 = args[1], i.e. the 'in' pointer used as base by VERTICAL_PASS.
125	"lw %[temp20], 4(%[args]) \n\t"
126	VERTICAL_PASS(`0`, `16`, `8`, `24`, temp4, temp0, temp1, temp2, temp3)
127	VERTICAL_PASS(`2`, `18`, `10`, `26`, temp8, temp4, temp5, temp6, temp7)
128	VERTICAL_PASS(`4`, `20`, `12`, `28`, temp12, temp8, temp9, temp10, temp11)
// The last pass uses temp20 itself as scratch; the base pointer is no longer
// needed once this pass has done its loads.
129	VERTICAL_PASS(`6`, `22`, `14`, `30`, temp20, temp12, temp13, temp14, temp15)
130
131	HORIZONTAL_PASS(`0`, temp0, temp4, temp8, temp12)
132	HORIZONTAL_PASS(`1`, temp1, temp5, temp9, temp13)
133	HORIZONTAL_PASS(`2`, temp2, temp6, temp10, temp14)
134	HORIZONTAL_PASS(`3`, temp3, temp7, temp11, temp15)
135
136	: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
137	[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
138	[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
139	[temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
140	[temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
141	[temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
142	[temp18]"=&r"(temp18), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
143	: [args]"r"(args), [kC1]"r"(kC1), [kC2]"r"(kC2)
144	: "memory", "hi", "lo"
145	);
146	}
147
// Runs ITransformOne_MIPS32 on (ref, in, dst). When 'do_two' is non-zero it
// runs it a second time on the neighbouring block: 4 bytes further in 'ref'
// and 'dst', 16 coefficients further in 'in'.
148	static void ITransform_MIPS32(const uint8_t* ref, const int16_t* in,
149	uint8_t* dst, int do_two) {
150	ITransformOne_MIPS32(ref, in, dst);
151	if (do_two) {
152	ITransformOne_MIPS32(ref + `4`, in + `16`, dst + `4`);
153	}
154	}
155
156	#undef VERTICAL_PASS
157	#undef HORIZONTAL_PASS
158
159	// macro for one pass through for loop in QuantizeBlock
160	// QUANTDIV macro inlined
161	// J - offset in bytes (kZigzag[n] * 2)
162	// K - offset in bytes (kZigzag[n] * 4)
163	// N - offset in bytes (n * 2)
//
// J indexes the halfword arrays (ppin, ppsharpen, ppiq, ppq), K the word
// arrays (ppzthresh, ppbias), N the output array (pout).
// Steps:
//   sign  = in[j] >> 15                 (0 or -1)
//   coeff = |in[j]| + sharpen[j]        (xor/subu with sign gives |in[j]|)
//   if (zthresh[j] < coeff) {
//     level = (coeff * iq[j] + bias[j]) >> 17
//     if (max_level < level) level = max_level
//     level = sign applied back to level (xor/subu)
//     temp5 = level * q[j]
//   } else {
//     level = 0; temp5 = 0              (set before the branch)
//   }
//   in[j] = temp5; out[n] = level       (both stored as halfwords)
// The numeric local label "2:" is repeated by every expansion; each "2f"
// refers to the next one.
164	#define QUANTIZE_ONE(J, K, N) \
165	"lh %[temp0], " #J "(%[ppin]) \n\t" \
166	"lhu %[temp1], " #J "(%[ppsharpen]) \n\t" \
167	"lw %[temp2], " #K "(%[ppzthresh]) \n\t" \
168	"sra %[sign], %[temp0], 15 \n\t" \
169	"xor %[coeff], %[temp0], %[sign] \n\t" \
170	"subu %[coeff], %[coeff], %[sign] \n\t" \
171	"addu %[coeff], %[coeff], %[temp1] \n\t" \
172	"slt %[temp4], %[temp2], %[coeff] \n\t" \
173	"addiu %[temp5], $zero, 0 \n\t" \
174	"addiu %[level], $zero, 0 \n\t" \
175	"beqz %[temp4], 2f \n\t" \
176	"lhu %[temp1], " #J "(%[ppiq]) \n\t" \
177	"lw %[temp2], " #K "(%[ppbias]) \n\t" \
178	"lhu %[temp3], " #J "(%[ppq]) \n\t" \
179	"mul %[level], %[coeff], %[temp1] \n\t" \
180	"addu %[level], %[level], %[temp2] \n\t" \
181	"sra %[level], %[level], 17 \n\t" \
182	"slt %[temp4], %[max_level], %[level] \n\t" \
183	"movn %[level], %[max_level], %[temp4] \n\t" \
184	"xor %[level], %[level], %[sign] \n\t" \
185	"subu %[level], %[level], %[sign] \n\t" \
186	"mul %[temp5], %[level], %[temp3] \n\t" \
187	"2: \n\t" \
188	"sh %[temp5], " #J "(%[ppin]) \n\t" \
189	"sh %[level], " #N "(%[pout]) \n\t"
190
// Quantizes the 16 coefficients of one block using the tables in 'mtx'.
//
// in  - coefficients; each visited entry is overwritten with level * q (or 0
//       when the coefficient did not exceed its zthresh entry).
// out - receives the 16 quantized levels, written in the order of the
//       QUANTIZE_ONE invocations below (out byte offsets 0, 2, ..., 30).
// mtx - supplies the sharpen_, zthresh_, q_, iq_ and bias_ tables.
//
// Returns 1 if any out[i] is non-zero, 0 otherwise.
191	static int QuantizeBlock_MIPS32(int16_t in[`16`], int16_t out[`16`],
192	const VP8Matrix* const mtx) {
193	int temp0, temp1, temp2, temp3, temp4, temp5;
194	int sign, coeff, level, i;
195	int max_level = MAX_LEVEL;
196
197	int16_t* ppin = &in[`0`];
198	int16_t* pout = &out[`0`];
199	const uint16_t* ppsharpen = &mtx->sharpen_[`0`];
200	const uint32_t* ppzthresh = &mtx->zthresh_[`0`];
201	const uint16_t* ppq = &mtx->q_[`0`];
202	const uint16_t* ppiq = &mtx->iq_[`0`];
203	const uint32_t* ppbias = &mtx->bias_[`0`];
204
205	__asm__ volatile(
// Arguments are (J, K, N): J = 2 * source index, K = 4 * source index,
// N = 2 * output position. K is always 2 * J.
206	QUANTIZE_ONE( `0`, `0`, `0`)
207	QUANTIZE_ONE( `2`, `4`, `2`)
208	QUANTIZE_ONE( `8`, `16`, `4`)
209	QUANTIZE_ONE(`16`, `32`, `6`)
210	QUANTIZE_ONE(`10`, `20`, `8`)
211	QUANTIZE_ONE( `4`, `8`, `10`)
212	QUANTIZE_ONE( `6`, `12`, `12`)
213	QUANTIZE_ONE(`12`, `24`, `14`)
214	QUANTIZE_ONE(`18`, `36`, `16`)
215	QUANTIZE_ONE(`24`, `48`, `18`)
216	QUANTIZE_ONE(`26`, `52`, `20`)
217	QUANTIZE_ONE(`20`, `40`, `22`)
218	QUANTIZE_ONE(`14`, `28`, `24`)
219	QUANTIZE_ONE(`22`, `44`, `26`)
220	QUANTIZE_ONE(`28`, `56`, `28`)
221	QUANTIZE_ONE(`30`, `60`, `30`)
222
223	: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
224	[temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
225	[temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
226	[sign]"=&r"(sign), [coeff]"=&r"(coeff),
227	[level]"=&r"(level)
228	: [pout]"r"(pout), [ppin]"r"(ppin),
229	[ppiq]"r"(ppiq), [max_level]"r"(max_level),
230	[ppbias]"r"(ppbias), [ppzthresh]"r"(ppzthresh),
231	[ppsharpen]"r"(ppsharpen), [ppq]"r"(ppq)
232	: "memory", "hi", "lo"
233	);
234
235	// moved out from macro to increase possibility for earlier breaking
// Scans from the last output position backwards and stops at the first
// non-zero level.
236	for (i = `15`; i >= `0`; i--) {
237	if (out[i]) return `1`;
238	}
239	return `0`;
240	}
241
// Quantizes two consecutive 16-coefficient blocks with the same matrix.
//
// in, out - 32 entries each; entries 0..15 form the first block and entries
//           16..31 the second.
// Returns a bit mask: bit 0 is the QuantizeBlock_MIPS32 result for the first
// block, bit 1 the result for the second block.
242	static int Quantize2Blocks_MIPS32(int16_t in[`32`], int16_t out[`32`],
243	const VP8Matrix* const mtx) {
244	int nz;
245	nz = QuantizeBlock_MIPS32(in + `0` * `16`, out + `0` * `16`, mtx) << `0`;
// Plain bitwise-OR assignment; the operator must not carry a backslash.
246	nz |= QuantizeBlock_MIPS32(in + `1` * `16`, out + `1` * `16`, mtx) << `1`;
247	return nz;
248	}
249
250	#undef QUANTIZE_ONE
251
252	// macro for one horizontal pass in Disto4x4 (TTransform)
253	// two calls of function TTransform are merged into single one
254	// A - row index; bytes are loaded from a and b at BPS * A + 0..3
255	// E..H - offsets in bytes to store first results to tmp buffer
256	// E1..H1 - offsets in bytes to store second results to tmp buffer
//
// For the four bytes p0..p3 of row A of 'a', with
//   s02 = p0 + p2, d02 = p0 - p2, s13 = p1 + p3, d13 = p1 - p3
// the macro stores (as 32-bit words)
//   tmp[E] = s02 + s13   tmp[F] = d02 + d13
//   tmp[G] = d02 - d13   tmp[H] = s02 - s13
// and the same four values computed from row A of 'b' at E1, F1, G1, H1.
// temp0..temp8 are scratch.
257	#define HORIZONTAL_PASS(A, E, F, G, H, E1, F1, G1, H1) \
258	"lbu %[temp0], 0+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
259	"lbu %[temp1], 1+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
260	"lbu %[temp2], 2+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
261	"lbu %[temp3], 3+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
262	"lbu %[temp4], 0+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
263	"lbu %[temp5], 1+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
264	"lbu %[temp6], 2+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
265	"lbu %[temp7], 3+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
266	"addu %[temp8], %[temp0], %[temp2] \n\t" \
267	"subu %[temp0], %[temp0], %[temp2] \n\t" \
268	"addu %[temp2], %[temp1], %[temp3] \n\t" \
269	"subu %[temp1], %[temp1], %[temp3] \n\t" \
270	"addu %[temp3], %[temp4], %[temp6] \n\t" \
271	"subu %[temp4], %[temp4], %[temp6] \n\t" \
272	"addu %[temp6], %[temp5], %[temp7] \n\t" \
273	"subu %[temp5], %[temp5], %[temp7] \n\t" \
274	"addu %[temp7], %[temp8], %[temp2] \n\t" \
275	"subu %[temp2], %[temp8], %[temp2] \n\t" \
276	"addu %[temp8], %[temp0], %[temp1] \n\t" \
277	"subu %[temp0], %[temp0], %[temp1] \n\t" \
278	"addu %[temp1], %[temp3], %[temp6] \n\t" \
279	"subu %[temp3], %[temp3], %[temp6] \n\t" \
280	"addu %[temp6], %[temp4], %[temp5] \n\t" \
281	"subu %[temp4], %[temp4], %[temp5] \n\t" \
282	"sw %[temp7], " #E "(%[tmp]) \n\t" \
283	"sw %[temp2], " #H "(%[tmp]) \n\t" \
284	"sw %[temp8], " #F "(%[tmp]) \n\t" \
285	"sw %[temp0], " #G "(%[tmp]) \n\t" \
286	"sw %[temp1], " #E1 "(%[tmp]) \n\t" \
287	"sw %[temp3], " #H1 "(%[tmp]) \n\t" \
288	"sw %[temp6], " #F1 "(%[tmp]) \n\t" \
289	"sw %[temp4], " #G1 "(%[tmp]) \n\t"
290
291	// macro for one vertical pass in Disto4x4 (TTransform)
292	// two calls of function TTransform are merged into single one
293	// since only one accu is available in mips32r1 instruction set
294	// first is done second call of function TTransform and after
295	// that first one.
296	// const int sum1 = TTransform(a, w);
297	// const int sum2 = TTransform(b, w);
298	// return abs(sum2 - sum1) >> 5;
299	// (sum2 - sum1) is calculated with madds (sub2) and msubs (sub1)
300	// A..D - offsets in bytes to load first results from tmp buffer
301	// A1..D1 - offsets in bytes to load second results from tmp buffer
302	// E..H - offsets in bytes to load from w buffer
//
// First half: loads tmp[A1], tmp[B1], tmp[C1], tmp[D1], applies the same
// sum/difference butterfly as the horizontal pass, takes the absolute value of
// each of the four results (sra 31 / xor / subu) and accumulates
// w[E..H] * |result| into the hi/lo accumulator with madd.
// Second half: repeats this for tmp[A], tmp[B], tmp[C], tmp[D], but subtracts
// the weighted absolute values from the accumulator with msub, reusing the
// weights still held in temp4..temp7. temp4 is overwritten only after its
// msub has been issued.
// The accumulator must be initialised by the caller. temp0..temp8 are scratch.
303	#define VERTICAL_PASS(A, B, C, D, A1, B1, C1, D1, E, F, G, H) \
304	"lw %[temp0], " #A1 "(%[tmp]) \n\t" \
305	"lw %[temp1], " #C1 "(%[tmp]) \n\t" \
306	"lw %[temp2], " #B1 "(%[tmp]) \n\t" \
307	"lw %[temp3], " #D1 "(%[tmp]) \n\t" \
308	"addu %[temp8], %[temp0], %[temp1] \n\t" \
309	"subu %[temp0], %[temp0], %[temp1] \n\t" \
310	"addu %[temp1], %[temp2], %[temp3] \n\t" \
311	"subu %[temp2], %[temp2], %[temp3] \n\t" \
312	"addu %[temp3], %[temp8], %[temp1] \n\t" \
313	"subu %[temp8], %[temp8], %[temp1] \n\t" \
314	"addu %[temp1], %[temp0], %[temp2] \n\t" \
315	"subu %[temp0], %[temp0], %[temp2] \n\t" \
316	"sra %[temp4], %[temp3], 31 \n\t" \
317	"sra %[temp5], %[temp1], 31 \n\t" \
318	"sra %[temp6], %[temp0], 31 \n\t" \
319	"sra %[temp7], %[temp8], 31 \n\t" \
320	"xor %[temp3], %[temp3], %[temp4] \n\t" \
321	"xor %[temp1], %[temp1], %[temp5] \n\t" \
322	"xor %[temp0], %[temp0], %[temp6] \n\t" \
323	"xor %[temp8], %[temp8], %[temp7] \n\t" \
324	"subu %[temp3], %[temp3], %[temp4] \n\t" \
325	"subu %[temp1], %[temp1], %[temp5] \n\t" \
326	"subu %[temp0], %[temp0], %[temp6] \n\t" \
327	"subu %[temp8], %[temp8], %[temp7] \n\t" \
328	"lhu %[temp4], " #E "(%[w]) \n\t" \
329	"lhu %[temp5], " #F "(%[w]) \n\t" \
330	"lhu %[temp6], " #G "(%[w]) \n\t" \
331	"lhu %[temp7], " #H "(%[w]) \n\t" \
332	"madd %[temp4], %[temp3] \n\t" \
333	"madd %[temp5], %[temp1] \n\t" \
334	"madd %[temp6], %[temp0] \n\t" \
335	"madd %[temp7], %[temp8] \n\t" \
336	"lw %[temp0], " #A "(%[tmp]) \n\t" \
337	"lw %[temp1], " #C "(%[tmp]) \n\t" \
338	"lw %[temp2], " #B "(%[tmp]) \n\t" \
339	"lw %[temp3], " #D "(%[tmp]) \n\t" \
340	"addu %[temp8], %[temp0], %[temp1] \n\t" \
341	"subu %[temp0], %[temp0], %[temp1] \n\t" \
342	"addu %[temp1], %[temp2], %[temp3] \n\t" \
343	"subu %[temp2], %[temp2], %[temp3] \n\t" \
344	"addu %[temp3], %[temp8], %[temp1] \n\t" \
345	"subu %[temp1], %[temp8], %[temp1] \n\t" \
346	"addu %[temp8], %[temp0], %[temp2] \n\t" \
347	"subu %[temp0], %[temp0], %[temp2] \n\t" \
348	"sra %[temp2], %[temp3], 31 \n\t" \
349	"xor %[temp3], %[temp3], %[temp2] \n\t" \
350	"subu %[temp3], %[temp3], %[temp2] \n\t" \
351	"msub %[temp4], %[temp3] \n\t" \
352	"sra %[temp2], %[temp8], 31 \n\t" \
353	"sra %[temp3], %[temp0], 31 \n\t" \
354	"sra %[temp4], %[temp1], 31 \n\t" \
355	"xor %[temp8], %[temp8], %[temp2] \n\t" \
356	"xor %[temp0], %[temp0], %[temp3] \n\t" \
357	"xor %[temp1], %[temp1], %[temp4] \n\t" \
358	"subu %[temp8], %[temp8], %[temp2] \n\t" \
359	"subu %[temp0], %[temp0], %[temp3] \n\t" \
360	"subu %[temp1], %[temp1], %[temp4] \n\t" \
361	"msub %[temp5], %[temp8] \n\t" \
362	"msub %[temp6], %[temp0] \n\t" \
363	"msub %[temp7], %[temp1] \n\t"
364
// Weighted transform-domain distortion between the 4x4 blocks at 'a' and 'b'
// (rows BPS bytes apart), using the sixteen 16-bit weights at 'w'.
//
// The horizontal passes write intermediate values for 'a' to tmp bytes 0..60
// and for 'b' to tmp bytes 64..124. The vertical passes accumulate
// (weighted sum for b) - (weighted sum for a) in hi/lo. The return value is
// the absolute value of the low word of that accumulator, shifted right by 5.
365	static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b,
366	const uint16_t* const w) {
367	int tmp[`32`];
368	int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
369
370	__asm__ volatile(
371	HORIZONTAL_PASS(`0`, `0`, `4`, `8`, `12`, `64`, `68`, `72`, `76`)
372	HORIZONTAL_PASS(`1`, `16`, `20`, `24`, `28`, `80`, `84`, `88`, `92`)
373	HORIZONTAL_PASS(`2`, `32`, `36`, `40`, `44`, `96`, `100`, `104`, `108`)
374	HORIZONTAL_PASS(`3`, `48`, `52`, `56`, `60`, `112`, `116`, `120`, `124`)
// Clear the hi/lo accumulator before the madd/msub sequence.
375	"mthi $zero \n\t"
376	"mtlo $zero \n\t"
377	VERTICAL_PASS( `0`, `16`, `32`, `48`, `64`, `80`, `96`, `112`, `0`, `8`, `16`, `24`)
378	VERTICAL_PASS( `4`, `20`, `36`, `52`, `68`, `84`, `100`, `116`, `2`, `10`, `18`, `26`)
379	VERTICAL_PASS( `8`, `24`, `40`, `56`, `72`, `88`, `104`, `120`, `4`, `12`, `20`, `28`)
380	VERTICAL_PASS(`12`, `28`, `44`, `60`, `76`, `92`, `108`, `124`, `6`, `14`, `22`, `30`)
// temp0 = |lo| >> 5 (absolute value via sra 31 / xor / subu).
381	"mflo %[temp0] \n\t"
382	"sra %[temp1], %[temp0], 31 \n\t"
383	"xor %[temp0], %[temp0], %[temp1] \n\t"
384	"subu %[temp0], %[temp0], %[temp1] \n\t"
385	"sra %[temp0], %[temp0], 5 \n\t"
386
387	: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
388	[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
389	[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
390	: [a]"r"(a), [b]"r"(b), [w]"r"(w), [tmp]"r"(tmp)
391	: "memory", "hi", "lo"
392	);
393
394	return temp0;
395	}
396
397	#undef VERTICAL_PASS
398	#undef HORIZONTAL_PASS
399
// Sums Disto4x4_MIPS32 over the sixteen 4x4 sub-blocks of a 16x16 area:
// 'y' advances by four rows (4 * BPS bytes) and 'x' by four columns. The same
// weight table 'w' is used for every sub-block.
400	static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b,
401	const uint16_t* const w) {
402	int D = `0`;
403	int x, y;
404	for (y = `0`; y < `16` * BPS; y += `4` * BPS) {
405	for (x = `0`; x < `16`; x += `4`) {
406	D += Disto4x4_MIPS32(a + x + y, b + x + y, w);
407	}
408	}
409	return D;
410	}
411
412	// macro for one horizontal pass in FTransform
413	// temp0..temp15 holds tmp[0]..tmp[15]
414	// A - row index; bytes are loaded from src and ref at BPS * A + 0..3
415	// TEMP0..TEMP3 - registers for corresponding tmp elements
//
// TEMP1 and TEMP2 first hold the src pointer (args[0]) and the ref pointer
// (args[1]); they are overwritten once all eight byte loads are done.
// With d_i = src[i] - ref[i] for the four bytes of row A and
//   a0 = d0 + d3, a1 = d1 + d2, a2 = d1 - d2, a3 = d0 - d3
// the macro leaves
//   TEMP0 = (a0 + a1) << 3
//   TEMP1 = (a2 * c2217 + a3 * c5352 + 1812) >> 9
//   TEMP2 = (a0 - a1) << 3
//   TEMP3 = (a3 * c2217 - a2 * c5352 + 937) >> 9
// temp16..temp20 are scratch.
416	#define HORIZONTAL_PASS(A, TEMP0, TEMP1, TEMP2, TEMP3) \
417	"lw %[" #TEMP1 "], 0(%[args]) \n\t" \
418	"lw %[" #TEMP2 "], 4(%[args]) \n\t" \
419	"lbu %[temp16], 0+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
420	"lbu %[temp17], 0+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
421	"lbu %[temp18], 1+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
422	"lbu %[temp19], 1+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
423	"subu %[temp20], %[temp16], %[temp17] \n\t" \
424	"lbu %[temp16], 2+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
425	"lbu %[temp17], 2+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
426	"subu %[" #TEMP0 "], %[temp18], %[temp19] \n\t" \
427	"lbu %[temp18], 3+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
428	"lbu %[temp19], 3+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
429	"subu %[" #TEMP1 "], %[temp16], %[temp17] \n\t" \
430	"subu %[" #TEMP2 "], %[temp18], %[temp19] \n\t" \
431	"addu %[" #TEMP3 "], %[temp20], %[" #TEMP2 "] \n\t" \
432	"subu %[" #TEMP2 "], %[temp20], %[" #TEMP2 "] \n\t" \
433	"addu %[temp20], %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \
434	"subu %[" #TEMP0 "], %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \
435	"mul %[temp16], %[" #TEMP2 "], %[c5352] \n\t" \
436	"mul %[temp17], %[" #TEMP2 "], %[c2217] \n\t" \
437	"mul %[temp18], %[" #TEMP0 "], %[c5352] \n\t" \
438	"mul %[temp19], %[" #TEMP0 "], %[c2217] \n\t" \
439	"addu %[" #TEMP1 "], %[" #TEMP3 "], %[temp20] \n\t" \
440	"subu %[temp20], %[" #TEMP3 "], %[temp20] \n\t" \
441	"sll %[" #TEMP0 "], %[" #TEMP1 "], 3 \n\t" \
442	"sll %[" #TEMP2 "], %[temp20], 3 \n\t" \
443	"addiu %[temp16], %[temp16], 1812 \n\t" \
444	"addiu %[temp17], %[temp17], 937 \n\t" \
445	"addu %[temp16], %[temp16], %[temp19] \n\t" \
446	"subu %[temp17], %[temp17], %[temp18] \n\t" \
447	"sra %[" #TEMP1 "], %[temp16], 9 \n\t" \
448	"sra %[" #TEMP3 "], %[temp17], 9 \n\t"
449
450	// macro for one vertical pass in FTransform
451	// temp0..temp15 holds tmp[0]..tmp[15]
452	// A..D - offsets in bytes to store to out buffer
453	// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
//
// Expects %[temp20] to hold the base address of the output buffer.
// With a0 = TEMP0 + TEMP12, a1 = TEMP4 + TEMP8, a2 = TEMP4 - TEMP8,
// a3 = TEMP0 - TEMP12, the macro stores (as halfwords)
//   out[A] = (a0 + a1 + 7) >> 4
//   out[B] = ((a2 * c2217 + a3 * c5352 + 12000) >> 16) + (a3 != 0)
//   out[C] = (a0 - a1 + 7) >> 4
//   out[D] = (a3 * c2217 - a2 * c5352 + 51000) >> 16
// The 51000 offset is added in two steps (30000 + 21000), presumably because
// an addiu immediate cannot hold 51000. temp16..temp19 are scratch.
454	#define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12) \
455	"addu %[temp16], %[" #TEMP0 "], %[" #TEMP12 "] \n\t" \
456	"subu %[temp19], %[" #TEMP0 "], %[" #TEMP12 "] \n\t" \
457	"addu %[temp17], %[" #TEMP4 "], %[" #TEMP8 "] \n\t" \
458	"subu %[temp18], %[" #TEMP4 "], %[" #TEMP8 "] \n\t" \
459	"mul %[" #TEMP8 "], %[temp19], %[c2217] \n\t" \
460	"mul %[" #TEMP12 "], %[temp18], %[c2217] \n\t" \
461	"mul %[" #TEMP4 "], %[temp19], %[c5352] \n\t" \
462	"mul %[temp18], %[temp18], %[c5352] \n\t" \
463	"addiu %[temp16], %[temp16], 7 \n\t" \
464	"addu %[" #TEMP0 "], %[temp16], %[temp17] \n\t" \
465	"sra %[" #TEMP0 "], %[" #TEMP0 "], 4 \n\t" \
466	"addu %[" #TEMP12 "], %[" #TEMP12 "], %[" #TEMP4 "] \n\t" \
467	"subu %[" #TEMP4 "], %[temp16], %[temp17] \n\t" \
468	"sra %[" #TEMP4 "], %[" #TEMP4 "], 4 \n\t" \
469	"addiu %[" #TEMP8 "], %[" #TEMP8 "], 30000 \n\t" \
470	"addiu %[" #TEMP12 "], %[" #TEMP12 "], 12000 \n\t" \
471	"addiu %[" #TEMP8 "], %[" #TEMP8 "], 21000 \n\t" \
472	"subu %[" #TEMP8 "], %[" #TEMP8 "], %[temp18] \n\t" \
473	"sra %[" #TEMP12 "], %[" #TEMP12 "], 16 \n\t" \
474	"sra %[" #TEMP8 "], %[" #TEMP8 "], 16 \n\t" \
475	"addiu %[temp16], %[" #TEMP12 "], 1 \n\t" \
476	"movn %[" #TEMP12 "], %[temp16], %[temp19] \n\t" \
477	"sh %[" #TEMP0 "], " #A "(%[temp20]) \n\t" \
478	"sh %[" #TEMP4 "], " #C "(%[temp20]) \n\t" \
479	"sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \
480	"sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t"
481
// Forward transform of the 4x4 difference block (src - ref).
//
// src, ref - pixel blocks; four rows of four bytes are read from each, rows
//            BPS apart.
// out      - receives sixteen halfwords at byte offsets 0..30.
//
// The three pointers are handed to the assembly through the 'args' array and
// fetched with "lw" at byte offsets 0, 4 and 8, so the code relies on pointers
// being 4 bytes wide. The four horizontal passes fill temp0..temp15 from the
// pixel differences; the four vertical passes then write 'out'.
482	static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref,
483	int16_t* out) {
484	int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
485	int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
486	int temp17, temp18, temp19, temp20;
487	const int c2217 = `2217`;
488	const int c5352 = `5352`;
// Every element is cast to 'const int*' so the initializer matches the
// array's element type.
489	const int* const args[`3`] =
490	{ (const int*)src, (const int*)ref, (const int*)out };
491
492	__asm__ volatile(
493	HORIZONTAL_PASS(`0`, temp0, temp1, temp2, temp3)
494	HORIZONTAL_PASS(`1`, temp4, temp5, temp6, temp7)
495	HORIZONTAL_PASS(`2`, temp8, temp9, temp10, temp11)
496	HORIZONTAL_PASS(`3`, temp12, temp13, temp14, temp15)
// temp20 = args[2], i.e. the 'out' pointer used as base by VERTICAL_PASS.
497	"lw %[temp20], 8(%[args]) \n\t"
498	VERTICAL_PASS(`0`, `8`, `16`, `24`, temp0, temp4, temp8, temp12)
499	VERTICAL_PASS(`2`, `10`, `18`, `26`, temp1, temp5, temp9, temp13)
500	VERTICAL_PASS(`4`, `12`, `20`, `28`, temp2, temp6, temp10, temp14)
501	VERTICAL_PASS(`6`, `14`, `22`, `30`, temp3, temp7, temp11, temp15)
502
503	: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
504	[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
505	[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
506	[temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
507	[temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
508	[temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
509	[temp18]"=&r"(temp18), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
510	: [args]"r"(args), [c2217]"r"(c2217), [c5352]"r"(c5352)
511	: "memory", "hi", "lo"
512	);
513	}
514
515	#undef VERTICAL_PASS
516	#undef HORIZONTAL_PASS
517
518	#if !defined(WORK_AROUND_GCC)
519
// Accumulates four squared byte differences into the hi/lo accumulator.
// A..D are byte offsets applied to both %[a] and %[b]: for each offset the
// macro loads a[off] and b[off] with lbu, subtracts them and issues
// "madd diff, diff". The accumulator must be initialised by the caller.
// temp0..temp7 are scratch.
520	#define GET_SSE_INNER(A, B, C, D) \
521	"lbu %[temp0], " #A "(%[a]) \n\t" \
522	"lbu %[temp1], " #A "(%[b]) \n\t" \
523	"lbu %[temp2], " #B "(%[a]) \n\t" \
524	"lbu %[temp3], " #B "(%[b]) \n\t" \
525	"lbu %[temp4], " #C "(%[a]) \n\t" \
526	"lbu %[temp5], " #C "(%[b]) \n\t" \
527	"lbu %[temp6], " #D "(%[a]) \n\t" \
528	"lbu %[temp7], " #D "(%[b]) \n\t" \
529	"subu %[temp0], %[temp0], %[temp1] \n\t" \
530	"subu %[temp2], %[temp2], %[temp3] \n\t" \
531	"subu %[temp4], %[temp4], %[temp5] \n\t" \
532	"subu %[temp6], %[temp6], %[temp7] \n\t" \
533	"madd %[temp0], %[temp0] \n\t" \
534	"madd %[temp2], %[temp2] \n\t" \
535	"madd %[temp4], %[temp4] \n\t" \
536	"madd %[temp6], %[temp6] \n\t"
537
// Accumulates sixteen squared differences: four runs of four consecutive
// bytes, starting at byte offsets A, B, C and D.
// The "X + 1" style arguments reach GET_SSE_INNER as token sequences and are
// stringified there; presumably the assembler evaluates the resulting
// offset expressions — confirm.
538	#define GET_SSE(A, B, C, D) \
539	GET_SSE_INNER(A, A + 1, A + 2, A + 3) \
540	GET_SSE_INNER(B, B + 1, B + 2, B + 3) \
541	GET_SSE_INNER(C, C + 1, C + 2, C + 3) \
542	GET_SSE_INNER(D, D + 1, D + 2, D + 3)
543
// Sum of squared differences between 'a' and 'b' over 16 rows of 16 bytes,
// rows BPS bytes apart. Returns the low word of the hi/lo accumulator.
544	static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) {
545	int count;
546	int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
547
548	__asm__ volatile(
// 0 * 0 clears the hi/lo accumulator before the madd sequence.
549	"mult $zero, $zero \n\t"
550
// One GET_SSE per row: columns 0..3, 4..7, 8..11 and 12..15.
551	GET_SSE( `0` * BPS, `4` + `0` * BPS, `8` + `0` * BPS, `12` + `0` * BPS)
552	GET_SSE( `1` * BPS, `4` + `1` * BPS, `8` + `1` * BPS, `12` + `1` * BPS)
553	GET_SSE( `2` * BPS, `4` + `2` * BPS, `8` + `2` * BPS, `12` + `2` * BPS)
554	GET_SSE( `3` * BPS, `4` + `3` * BPS, `8` + `3` * BPS, `12` + `3` * BPS)
555	GET_SSE( `4` * BPS, `4` + `4` * BPS, `8` + `4` * BPS, `12` + `4` * BPS)
556	GET_SSE( `5` * BPS, `4` + `5` * BPS, `8` + `5` * BPS, `12` + `5` * BPS)
557	GET_SSE( `6` * BPS, `4` + `6` * BPS, `8` + `6` * BPS, `12` + `6` * BPS)
558	GET_SSE( `7` * BPS, `4` + `7` * BPS, `8` + `7` * BPS, `12` + `7` * BPS)
559	GET_SSE( `8` * BPS, `4` + `8` * BPS, `8` + `8` * BPS, `12` + `8` * BPS)
560	GET_SSE( `9` * BPS, `4` + `9` * BPS, `8` + `9` * BPS, `12` + `9` * BPS)
561	GET_SSE(`10` * BPS, `4` + `10` * BPS, `8` + `10` * BPS, `12` + `10` * BPS)
562	GET_SSE(`11` * BPS, `4` + `11` * BPS, `8` + `11` * BPS, `12` + `11` * BPS)
563	GET_SSE(`12` * BPS, `4` + `12` * BPS, `8` + `12` * BPS, `12` + `12` * BPS)
564	GET_SSE(`13` * BPS, `4` + `13` * BPS, `8` + `13` * BPS, `12` + `13` * BPS)
565	GET_SSE(`14` * BPS, `4` + `14` * BPS, `8` + `14` * BPS, `12` + `14` * BPS)
566	GET_SSE(`15` * BPS, `4` + `15` * BPS, `8` + `15` * BPS, `12` + `15` * BPS)
567
568	"mflo %[count] \n\t"
569	: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
570	[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
571	[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
572	: [a]"r"(a), [b]"r"(b)
573	: "memory", "hi", "lo"
574	);
575	return count;
576	}
577
// Sum of squared differences between 'a' and 'b' over 8 rows of 16 bytes,
// rows BPS bytes apart. Returns the low word of the hi/lo accumulator.
578	static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) {
579	int count;
580	int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
581
582	__asm__ volatile(
// 0 * 0 clears the hi/lo accumulator before the madd sequence.
583	"mult $zero, $zero \n\t"
584
// One GET_SSE per row: columns 0..3, 4..7, 8..11 and 12..15.
585	GET_SSE( `0` * BPS, `4` + `0` * BPS, `8` + `0` * BPS, `12` + `0` * BPS)
586	GET_SSE( `1` * BPS, `4` + `1` * BPS, `8` + `1` * BPS, `12` + `1` * BPS)
587	GET_SSE( `2` * BPS, `4` + `2` * BPS, `8` + `2` * BPS, `12` + `2` * BPS)
588	GET_SSE( `3` * BPS, `4` + `3` * BPS, `8` + `3` * BPS, `12` + `3` * BPS)
589	GET_SSE( `4` * BPS, `4` + `4` * BPS, `8` + `4` * BPS, `12` + `4` * BPS)
590	GET_SSE( `5` * BPS, `4` + `5` * BPS, `8` + `5` * BPS, `12` + `5` * BPS)
591	GET_SSE( `6` * BPS, `4` + `6` * BPS, `8` + `6` * BPS, `12` + `6` * BPS)
592	GET_SSE( `7` * BPS, `4` + `7` * BPS, `8` + `7` * BPS, `12` + `7` * BPS)
593
594	"mflo %[count] \n\t"
595	: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
596	[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
597	[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
598	: [a]"r"(a), [b]"r"(b)
599	: "memory", "hi", "lo"
600	);
601	return count;
602	}
603
// Sum of squared differences between 'a' and 'b' over 8 rows of 8 bytes,
// rows BPS bytes apart. Returns the low word of the hi/lo accumulator.
604	static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) {
605	int count;
606	int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
607
608	__asm__ volatile(
// 0 * 0 clears the hi/lo accumulator before the madd sequence.
609	"mult $zero, $zero \n\t"
610
// Each GET_SSE covers two rows: columns 0..3 and 4..7 of row r, then of
// row r + 1.
611	GET_SSE(`0` * BPS, `4` + `0` * BPS, `1` * BPS, `4` + `1` * BPS)
612	GET_SSE(`2` * BPS, `4` + `2` * BPS, `3` * BPS, `4` + `3` * BPS)
613	GET_SSE(`4` * BPS, `4` + `4` * BPS, `5` * BPS, `4` + `5` * BPS)
614	GET_SSE(`6` * BPS, `4` + `6` * BPS, `7` * BPS, `4` + `7` * BPS)
615
616	"mflo %[count] \n\t"
617	: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
618	[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
619	[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
620	: [a]"r"(a), [b]"r"(b)
621	: "memory", "hi", "lo"
622	);
623	return count;
624	}
625
// Sum of squared differences between 'a' and 'b' over 4 rows of 4 bytes,
// rows BPS bytes apart. Returns the low word of the hi/lo accumulator.
626	static int SSE4x4_MIPS32(const uint8_t* a, const uint8_t* b) {
627	int count;
628	int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
629
630	__asm__ volatile(
// 0 * 0 clears the hi/lo accumulator before the madd sequence.
631	"mult $zero, $zero \n\t"
632
// A single GET_SSE covers all four rows (four bytes each).
633	GET_SSE(`0` * BPS, `1` * BPS, `2` * BPS, `3` * BPS)
634
635	"mflo %[count] \n\t"
636	: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
637	[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
638	[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
639	: [a]"r"(a), [b]"r"(b)
640	: "memory", "hi", "lo"
641	);
642	return count;
643	}
644
645	#undef GET_SSE
646	#undef GET_SSE_INNER
647
648	#endif // !WORK_AROUND_GCC
649
650	//------------------------------------------------------------------------------
651	// Entry point
652
// Installs the MIPS32 implementations defined in this file into the encoder's
// function pointers (VP8ITransform, VP8FTransform, VP8EncQuantizeBlock,
// VP8EncQuantize2Blocks, VP8TDisto4x4, VP8TDisto16x16). The four SSE pointers
// are assigned only when WORK_AROUND_GCC is not defined, matching the guard
// around the SSE function definitions.
653	extern void VP8EncDspInitMIPS32(void);
654
655	WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPS32(void) {
656	VP8ITransform = ITransform_MIPS32;
657	VP8FTransform = FTransform_MIPS32;
658
659	VP8EncQuantizeBlock = QuantizeBlock_MIPS32;
660	VP8EncQuantize2Blocks = Quantize2Blocks_MIPS32;
661
662	VP8TDisto4x4 = Disto4x4_MIPS32;
663	VP8TDisto16x16 = Disto16x16_MIPS32;
664
665	#if !defined(WORK_AROUND_GCC)
666	VP8SSE16x16 = SSE16x16_MIPS32;
667	VP8SSE8x8 = SSE8x8_MIPS32;
668	VP8SSE16x8 = SSE16x8_MIPS32;
669	VP8SSE4x4 = SSE4x4_MIPS32;
670	#endif
671	}
672
673	#else // !WEBP_USE_MIPS32
674
// Build without WEBP_USE_MIPS32: the init function comes from
// WEBP_DSP_INIT_STUB, which is defined outside this file.
// NOTE(review): presumably it expands to an empty VP8EncDspInitMIPS32 so the
// symbol still exists — confirm against the macro's definition.
675	WEBP_DSP_INIT_STUB(VP8EncDspInitMIPS32)
676
677	#endif // WEBP_USE_MIPS32
678

Browse the source code of Godot/thirdparty/libwebp/src/dsp/enc_mips32.c