// enc_msa.c (engine/third_party/libwebp/src/dsp/enc_msa.c)

1	// Copyright 2016 Google Inc. All Rights Reserved.
2	//
3	// Use of this source code is governed by a BSD-style license
4	// that can be found in the COPYING file in the root of the source
5	// tree. An additional intellectual property rights grant can be found
6	// in the file PATENTS. All contributing project authors may
7	// be found in the AUTHORS file in the root of the source tree.
8	// -----------------------------------------------------------------------------
9	//
10	// MSA version of encoder dsp functions.
11	//
12	// Author: Prashant Patil (prashant.patil@imgtec.com)
13
14	#include "./dsp.h"
15
16	#if defined(WEBP_USE_MSA)
17
18	#include <stdlib.h>
19	#include "./msa_macro.h"
20	#include "../enc/vp8i_enc.h"
21
22	//------------------------------------------------------------------------------
23	// Transforms
24
// One 1-D pass of the 4-point inverse transform on four v4i32 vectors.
// in0..in3 are only read; out0..out3 are written last, by BUTTERFLY_4.
// Both constants are used as 16-bit fixed-point multipliers (each product is
// shifted right by 16). 20091 is applied as ((x * 20091) >> 16) + x, i.e. an
// effective factor of 1 + 20091/65536.
#define IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) do {        \
  const v4i32 k20091_m = __msa_fill_w(20091);                             \
  const v4i32 k35468_m = __msa_fill_w(35468);                             \
  v4i32 even_sum_m, even_diff_m, odd_c_m, odd_d_m;                        \
  v4i32 c_lhs_m = in1 * k35468_m;                                         \
  v4i32 c_rhs_m = in3 * k20091_m;                                         \
  v4i32 d_lhs_m = in1 * k20091_m;                                         \
  v4i32 d_rhs_m = in3 * k35468_m;                                         \
  ADDSUB2(in0, in2, even_sum_m, even_diff_m);                             \
  SRAI_W2_SW(c_lhs_m, c_rhs_m, 16);                                       \
  c_rhs_m += in3;                                                         \
  odd_c_m = c_lhs_m - c_rhs_m;                                            \
  SRAI_W2_SW(d_lhs_m, d_rhs_m, 16);                                       \
  d_lhs_m += in1;                                                         \
  odd_d_m = d_lhs_m + d_rhs_m;                                            \
  BUTTERFLY_4(even_sum_m, even_diff_m, odd_c_m, odd_d_m,                  \
              out0, out1, out2, out3);                                    \
} while (0)
43
44	static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
45	uint8_t* dst) {
46	v8i16 input0, input1;
47	v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
48	v4i32 res0, res1, res2, res3;
49	v16i8 dest0, dest1, dest2, dest3;
50	const v16i8 zero = { `0` };
51
52	LD_SH2(in, `8`, input0, input1);
53	UNPCK_SH_SW(input0, in0, in1);
54	UNPCK_SH_SW(input1, in2, in3);
55	IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
56	TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
57	IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);
58	SRARI_W4_SW(vt0, vt1, vt2, vt3, `3`);
59	TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
60	LD_SB4(ref, BPS, dest0, dest1, dest2, dest3);
61	ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3,
62	res0, res1, res2, res3);
63	ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,
64	res0, res1, res2, res3);
65	ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
66	CLIP_SW4_0_255(res0, res1, res2, res3);
67	PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1);
68	res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1);
69	ST4x4_UB(res0, res0, `3`, `2`, `1`, `0`, dst, BPS);
70	}
71
// Inverse-transforms one 4x4 block and, when 'do_two' is non-zero, also the
// block located 4 pixels to the right (its coefficients start at in + 16).
static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
                       int do_two) {
  ITransformOne(ref, in, dst);
  if (!do_two) return;
  ITransformOne(ref + 4, in + 16, dst + 4);
}
79
80	static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
81	uint64_t out0, out1, out2, out3;
82	uint32_t in0, in1, in2, in3;
83	v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
84	v8i16 t0, t1, t2, t3;
85	v16u8 srcl0, srcl1, src0, src1;
86	const v8i16 mask0 = { `0`, `4`, `8`, `12`, `1`, `5`, `9`, `13` };
87	const v8i16 mask1 = { `3`, `7`, `11`, `15`, `2`, `6`, `10`, `14` };
88	const v8i16 mask2 = { `4`, `0`, `5`, `1`, `6`, `2`, `7`, `3` };
89	const v8i16 mask3 = { `0`, `4`, `1`, `5`, `2`, `6`, `3`, `7` };
90	const v8i16 cnst0 = { `2217`, -`5352`, `2217`, -`5352`, `2217`, -`5352`, `2217`, -`5352` };
91	const v8i16 cnst1 = { `5352`, `2217`, `5352`, `2217`, `5352`, `2217`, `5352`, `2217` };
92
93	LW4(src, BPS, in0, in1, in2, in3);
94	INSERT_W4_UB(in0, in1, in2, in3, src0);
95	LW4(ref, BPS, in0, in1, in2, in3);
96	INSERT_W4_UB(in0, in1, in2, in3, src1);
97	ILVRL_B2_UB(src0, src1, srcl0, srcl1);
98	HSUB_UB2_SH(srcl0, srcl1, t0, t1);
99	VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
100	ADDSUB2(t2, t3, t0, t1);
101	t0 = SRLI_H(t0, `3`);
102	VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
103	tmp0 = __msa_hadd_s_w(t3, t3);
104	tmp2 = __msa_hsub_s_w(t3, t3);
105	FILL_W2_SW(`1812`, `937`, tmp1, tmp3);
106	DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
107	SRAI_W2_SW(tmp1, tmp3, `9`);
108	PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
109	VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
110	ADDSUB2(t2, t3, t0, t1);
111	VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
112	tmp0 = __msa_hadd_s_w(t3, t3);
113	tmp2 = __msa_hsub_s_w(t3, t3);
114	ADDVI_W2_SW(tmp0, `7`, tmp2, `7`, tmp0, tmp2);
115	SRAI_W2_SW(tmp0, tmp2, `4`);
116	FILL_W2_SW(`12000`, `51000`, tmp1, tmp3);
117	DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
118	SRAI_W2_SW(tmp1, tmp3, `16`);
119	UNPCK_R_SH_SW(t1, tmp4);
120	tmp5 = __msa_ceqi_w(tmp4, `0`);
121	tmp4 = (v4i32)__msa_nor_v((v16u8)tmp5, (v16u8)tmp5);
122	tmp5 = __msa_fill_w(`1`);
123	tmp5 = (v4i32)__msa_and_v((v16u8)tmp5, (v16u8)tmp4);
124	tmp1 += tmp5;
125	PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
126	out0 = __msa_copy_s_d((v2i64)t0, `0`);
127	out1 = __msa_copy_s_d((v2i64)t0, `1`);
128	out2 = __msa_copy_s_d((v2i64)t1, `0`);
129	out3 = __msa_copy_s_d((v2i64)t1, `1`);
130	SD4(out0, out1, out2, out3, out, `8`);
131	}
132
133	static void FTransformWHT(const int16_t* in, int16_t* out) {
134	v8i16 in0 = { `0` };
135	v8i16 in1 = { `0` };
136	v8i16 tmp0, tmp1, tmp2, tmp3;
137	v8i16 out0, out1;
138	const v8i16 mask0 = { `0`, `1`, `2`, `3`, `8`, `9`, `10`, `11` };
139	const v8i16 mask1 = { `4`, `5`, `6`, `7`, `12`, `13`, `14`, `15` };
140	const v8i16 mask2 = { `0`, `4`, `8`, `12`, `1`, `5`, `9`, `13` };
141	const v8i16 mask3 = { `3`, `7`, `11`, `15`, `2`, `6`, `10`, `14` };
142
143	in0 = __msa_insert_h(in0, `0`, in[ `0`]);
144	in0 = __msa_insert_h(in0, `1`, in[ `64`]);
145	in0 = __msa_insert_h(in0, `2`, in[`128`]);
146	in0 = __msa_insert_h(in0, `3`, in[`192`]);
147	in0 = __msa_insert_h(in0, `4`, in[ `16`]);
148	in0 = __msa_insert_h(in0, `5`, in[ `80`]);
149	in0 = __msa_insert_h(in0, `6`, in[`144`]);
150	in0 = __msa_insert_h(in0, `7`, in[`208`]);
151	in1 = __msa_insert_h(in1, `0`, in[ `48`]);
152	in1 = __msa_insert_h(in1, `1`, in[`112`]);
153	in1 = __msa_insert_h(in1, `2`, in[`176`]);
154	in1 = __msa_insert_h(in1, `3`, in[`240`]);
155	in1 = __msa_insert_h(in1, `4`, in[ `32`]);
156	in1 = __msa_insert_h(in1, `5`, in[ `96`]);
157	in1 = __msa_insert_h(in1, `6`, in[`160`]);
158	in1 = __msa_insert_h(in1, `7`, in[`224`]);
159	ADDSUB2(in0, in1, tmp0, tmp1);
160	VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
161	ADDSUB2(tmp2, tmp3, tmp0, tmp1);
162	VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
163	ADDSUB2(in0, in1, tmp0, tmp1);
164	VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
165	ADDSUB2(tmp2, tmp3, out0, out1);
166	SRAI_H2_SH(out0, out1, `1`);
167	ST_SH2(out0, out1, out, `8`);
168	}
169
170	static int TTransform(const uint8_t* in, const uint16_t* w) {
171	int sum;
172	uint32_t in0_m, in1_m, in2_m, in3_m;
173	v16i8 src0;
174	v8i16 in0, in1, tmp0, tmp1, tmp2, tmp3;
175	v4i32 dst0, dst1;
176	const v16i8 zero = { `0` };
177	const v8i16 mask0 = { `0`, `1`, `2`, `3`, `8`, `9`, `10`, `11` };
178	const v8i16 mask1 = { `4`, `5`, `6`, `7`, `12`, `13`, `14`, `15` };
179	const v8i16 mask2 = { `0`, `4`, `8`, `12`, `1`, `5`, `9`, `13` };
180	const v8i16 mask3 = { `3`, `7`, `11`, `15`, `2`, `6`, `10`, `14` };
181
182	LW4(in, BPS, in0_m, in1_m, in2_m, in3_m);
183	INSERT_W4_SB(in0_m, in1_m, in2_m, in3_m, src0);
184	ILVRL_B2_SH(zero, src0, tmp0, tmp1);
185	VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
186	ADDSUB2(in0, in1, tmp0, tmp1);
187	VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
188	ADDSUB2(tmp2, tmp3, tmp0, tmp1);
189	VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
190	ADDSUB2(in0, in1, tmp0, tmp1);
191	VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
192	ADDSUB2(tmp2, tmp3, tmp0, tmp1);
193	tmp0 = __msa_add_a_h(tmp0, (v8i16)zero);
194	tmp1 = __msa_add_a_h(tmp1, (v8i16)zero);
195	LD_SH2(w, `8`, tmp2, tmp3);
196	DOTP_SH2_SW(tmp0, tmp1, tmp2, tmp3, dst0, dst1);
197	dst0 = dst0 + dst1;
198	sum = HADD_SW_S32(dst0);
199	return sum;
200	}
201
// Distortion between two 4x4 blocks: absolute difference of their weighted
// transform sums (see TTransform), scaled down by 32.
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
                    const uint16_t* const w) {
  const int weighted_a = TTransform(a, w);
  const int weighted_b = TTransform(b, w);
  const int delta = weighted_b - weighted_a;
  return abs(delta) >> 5;
}
208
209	static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
210	const uint16_t* const w) {
211	int D = `0`;
212	int x, y;
213	for (y = `0`; y < `16` * BPS; y += `4` * BPS) {
214	for (x = `0`; x < `16`; x += `4`) {
215	D += Disto4x4(a + x + y, b + x + y, w);
216	}
217	}
218	return D;
219	}
220
221	//------------------------------------------------------------------------------
222	// Histogram
223
224	static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
225	int start_block, int end_block,
226	VP8Histogram* const histo) {
227	int j;
228	int distribution[MAX_COEFF_THRESH + `1`] = { `0` };
229	for (j = start_block; j < end_block; ++j) {
230	int16_t out[`16`];
231	VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
232	{
233	int k;
234	v8i16 coeff0, coeff1;
235	const v8i16 zero = { `0` };
236	const v8i16 max_coeff_thr = __msa_ldi_h(MAX_COEFF_THRESH);
237	LD_SH2(&out[`0`], `8`, coeff0, coeff1);
238	coeff0 = __msa_add_a_h(coeff0, zero);
239	coeff1 = __msa_add_a_h(coeff1, zero);
240	SRAI_H2_SH(coeff0, coeff1, `3`);
241	coeff0 = __msa_min_s_h(coeff0, max_coeff_thr);
242	coeff1 = __msa_min_s_h(coeff1, max_coeff_thr);
243	ST_SH2(coeff0, coeff1, &out[`0`], `8`);
244	for (k = `0`; k < `16`; ++k) {
245	++distribution[out[k]];
246	}
247	}
248	}
249	VP8SetHistogramData(distribution, histo);
250	}
251
252	//------------------------------------------------------------------------------
253	// Intra predictions
254
255	// luma 4x4 prediction
256
// Helpers for the scalar 4x4 predictors below.
// DST(x, y): pixel at column x, row y of 'dst' (row stride BPS).
// AVG3: rounded 3-tap average with weights 1-2-1; AVG2: rounded 2-tap average.
#define DST(x, y) dst[(y) * BPS + (x)]
#define AVG3(a, b, c) ((2 * (b) + (a) + (c) + 2) >> 2)
#define AVG2(a, b) ((1 + (a) + (b)) >> 1)
260
261	static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical
262	const uint64_t val_m = LD(top - `1`);
263	const v16u8 A = (v16u8)__msa_insert_d((v2i64)A, `0`, val_m);
264	const v16u8 B = SLDI_UB(A, A, `1`);
265	const v16u8 C = SLDI_UB(A, A, `2`);
266	const v16u8 AC = __msa_ave_u_b(A, C);
267	const v16u8 B2 = __msa_ave_u_b(B, B);
268	const v16u8 R = __msa_aver_u_b(AC, B2);
269	const uint32_t out = __msa_copy_s_w((v4i32)R, `0`);
270	SW4(out, out, out, out, dst, BPS);
271	}
272
273	static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
274	const int X = top[-`1`];
275	const int I = top[-`2`];
276	const int J = top[-`3`];
277	const int K = top[-`4`];
278	const int L = top[-`5`];
279	WebPUint32ToMem(dst + `0` * BPS, `0x01010101U` * AVG3(X, I, J));
280	WebPUint32ToMem(dst + `1` * BPS, `0x01010101U` * AVG3(I, J, K));
281	WebPUint32ToMem(dst + `2` * BPS, `0x01010101U` * AVG3(J, K, L));
282	WebPUint32ToMem(dst + `3` * BPS, `0x01010101U` * AVG3(K, L, L));
283	}
284
285	static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
286	uint32_t dc = `4`;
287	int i;
288	for (i = `0`; i < `4`; ++i) dc += top[i] + top[-`5` + i];
289	dc >>= `3`;
290	dc = dc \| (dc << `8`) \| (dc << `16`) \| (dc << `24`);
291	SW4(dc, dc, dc, dc, dst, BPS);
292	}
293
294	static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
295	const uint64_t val_m = LD(top - `5`);
296	const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A1, `0`, val_m);
297	const v16u8 A = (v16u8)__msa_insert_b((v16i8)A1, `8`, top[`3`]);
298	const v16u8 B = SLDI_UB(A, A, `1`);
299	const v16u8 C = SLDI_UB(A, A, `2`);
300	const v16u8 AC = __msa_ave_u_b(A, C);
301	const v16u8 B2 = __msa_ave_u_b(B, B);
302	const v16u8 R0 = __msa_aver_u_b(AC, B2);
303	const v16u8 R1 = SLDI_UB(R0, R0, `1`);
304	const v16u8 R2 = SLDI_UB(R1, R1, `1`);
305	const v16u8 R3 = SLDI_UB(R2, R2, `1`);
306	const uint32_t val0 = __msa_copy_s_w((v4i32)R0, `0`);
307	const uint32_t val1 = __msa_copy_s_w((v4i32)R1, `0`);
308	const uint32_t val2 = __msa_copy_s_w((v4i32)R2, `0`);
309	const uint32_t val3 = __msa_copy_s_w((v4i32)R3, `0`);
310	SW4(val3, val2, val1, val0, dst, BPS);
311	}
312
313	static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {
314	const uint64_t val_m = LD(top);
315	const v16u8 A = (v16u8)__msa_insert_d((v2i64)A, `0`, val_m);
316	const v16u8 B = SLDI_UB(A, A, `1`);
317	const v16u8 C1 = SLDI_UB(A, A, `2`);
318	const v16u8 C = (v16u8)__msa_insert_b((v16i8)C1, `6`, top[`7`]);
319	const v16u8 AC = __msa_ave_u_b(A, C);
320	const v16u8 B2 = __msa_ave_u_b(B, B);
321	const v16u8 R0 = __msa_aver_u_b(AC, B2);
322	const v16u8 R1 = SLDI_UB(R0, R0, `1`);
323	const v16u8 R2 = SLDI_UB(R1, R1, `1`);
324	const v16u8 R3 = SLDI_UB(R2, R2, `1`);
325	const uint32_t val0 = __msa_copy_s_w((v4i32)R0, `0`);
326	const uint32_t val1 = __msa_copy_s_w((v4i32)R1, `0`);
327	const uint32_t val2 = __msa_copy_s_w((v4i32)R2, `0`);
328	const uint32_t val3 = __msa_copy_s_w((v4i32)R3, `0`);
329	SW4(val0, val1, val2, val3, dst, BPS);
330	}
331
332	static WEBP_INLINE void VR4(uint8_t* dst, const uint8_t* top) {
333	const int X = top[-`1`];
334	const int I = top[-`2`];
335	const int J = top[-`3`];
336	const int K = top[-`4`];
337	const int A = top[`0`];
338	const int B = top[`1`];
339	const int C = top[`2`];
340	const int D = top[`3`];
341	DST(`0`, `0`) = DST(`1`, `2`) = AVG2(X, A);
342	DST(`1`, `0`) = DST(`2`, `2`) = AVG2(A, B);
343	DST(`2`, `0`) = DST(`3`, `2`) = AVG2(B, C);
344	DST(`3`, `0`) = AVG2(C, D);
345	DST(`0`, `3`) = AVG3(K, J, I);
346	DST(`0`, `2`) = AVG3(J, I, X);
347	DST(`0`, `1`) = DST(`1`, `3`) = AVG3(I, X, A);
348	DST(`1`, `1`) = DST(`2`, `3`) = AVG3(X, A, B);
349	DST(`2`, `1`) = DST(`3`, `3`) = AVG3(A, B, C);
350	DST(`3`, `1`) = AVG3(B, C, D);
351	}
352
353	static WEBP_INLINE void VL4(uint8_t* dst, const uint8_t* top) {
354	const int A = top[`0`];
355	const int B = top[`1`];
356	const int C = top[`2`];
357	const int D = top[`3`];
358	const int E = top[`4`];
359	const int F = top[`5`];
360	const int G = top[`6`];
361	const int H = top[`7`];
362	DST(`0`, `0`) = AVG2(A, B);
363	DST(`1`, `0`) = DST(`0`, `2`) = AVG2(B, C);
364	DST(`2`, `0`) = DST(`1`, `2`) = AVG2(C, D);
365	DST(`3`, `0`) = DST(`2`, `2`) = AVG2(D, E);
366	DST(`0`, `1`) = AVG3(A, B, C);
367	DST(`1`, `1`) = DST(`0`, `3`) = AVG3(B, C, D);
368	DST(`2`, `1`) = DST(`1`, `3`) = AVG3(C, D, E);
369	DST(`3`, `1`) = DST(`2`, `3`) = AVG3(D, E, F);
370	DST(`3`, `2`) = AVG3(E, F, G);
371	DST(`3`, `3`) = AVG3(F, G, H);
372	}
373
374	static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
375	const int I = top[-`2`];
376	const int J = top[-`3`];
377	const int K = top[-`4`];
378	const int L = top[-`5`];
379	DST(`0`, `0`) = AVG2(I, J);
380	DST(`2`, `0`) = DST(`0`, `1`) = AVG2(J, K);
381	DST(`2`, `1`) = DST(`0`, `2`) = AVG2(K, L);
382	DST(`1`, `0`) = AVG3(I, J, K);
383	DST(`3`, `0`) = DST(`1`, `1`) = AVG3(J, K, L);
384	DST(`3`, `1`) = DST(`1`, `2`) = AVG3(K, L, L);
385	DST(`3`, `2`) = DST(`2`, `2`) =
386	DST(`0`, `3`) = DST(`1`, `3`) = DST(`2`, `3`) = DST(`3`, `3`) = L;
387	}
388
389	static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
390	const int X = top[-`1`];
391	const int I = top[-`2`];
392	const int J = top[-`3`];
393	const int K = top[-`4`];
394	const int L = top[-`5`];
395	const int A = top[`0`];
396	const int B = top[`1`];
397	const int C = top[`2`];
398	DST(`0`, `0`) = DST(`2`, `1`) = AVG2(I, X);
399	DST(`0`, `1`) = DST(`2`, `2`) = AVG2(J, I);
400	DST(`0`, `2`) = DST(`2`, `3`) = AVG2(K, J);
401	DST(`0`, `3`) = AVG2(L, K);
402	DST(`3`, `0`) = AVG3(A, B, C);
403	DST(`2`, `0`) = AVG3(X, A, B);
404	DST(`1`, `0`) = DST(`3`, `1`) = AVG3(I, X, A);
405	DST(`1`, `1`) = DST(`3`, `2`) = AVG3(J, I, X);
406	DST(`1`, `2`) = DST(`3`, `3`) = AVG3(K, J, I);
407	DST(`1`, `3`) = AVG3(L, K, J);
408	}
409
410	static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
411	const v16i8 zero = { `0` };
412	const v8i16 TL = (v8i16)__msa_fill_h(top[-`1`]);
413	const v8i16 L0 = (v8i16)__msa_fill_h(top[-`2`]);
414	const v8i16 L1 = (v8i16)__msa_fill_h(top[-`3`]);
415	const v8i16 L2 = (v8i16)__msa_fill_h(top[-`4`]);
416	const v8i16 L3 = (v8i16)__msa_fill_h(top[-`5`]);
417	const v16u8 T1 = LD_UB(top);
418	const v8i16 T = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);
419	const v8i16 d = T - TL;
420	v8i16 r0, r1, r2, r3;
421	ADD4(d, L0, d, L1, d, L2, d, L3, r0, r1, r2, r3);
422	CLIP_SH4_0_255(r0, r1, r2, r3);
423	PCKEV_ST4x4_UB(r0, r1, r2, r3, dst, BPS);
424	}
425
426	#undef DST
427	#undef AVG3
428	#undef AVG2
429
430	static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
431	DC4(I4DC4 + dst, top);
432	TM4(I4TM4 + dst, top);
433	VE4(I4VE4 + dst, top);
434	HE4(I4HE4 + dst, top);
435	RD4(I4RD4 + dst, top);
436	VR4(I4VR4 + dst, top);
437	LD4(I4LD4 + dst, top);
438	VL4(I4VL4 + dst, top);
439	HD4(I4HD4 + dst, top);
440	HU4(I4HU4 + dst, top);
441	}
442
443	// luma 16x16 prediction
444
// Writes the 16-byte vector 'out' to 16 consecutive rows of 'dst' (row
// stride BPS), as two groups of eight rows.
#define STORE16x16(out, dst) do {                                           \
  ST_UB8(out, out, out, out, out, out, out, out, (dst), BPS);               \
  ST_UB8(out, out, out, out, out, out, out, out, (dst) + 8 * BPS, BPS);     \
} while (0)
449
450	static WEBP_INLINE void VerticalPred16x16(uint8_t* dst, const uint8_t* top) {
451	if (top != NULL) {
452	const v16u8 out = LD_UB(top);
453	STORE16x16(out, dst);
454	} else {
455	const v16u8 out = (v16u8)__msa_fill_b(`0x7f`);
456	STORE16x16(out, dst);
457	}
458	}
459
460	static WEBP_INLINE void HorizontalPred16x16(uint8_t* dst,
461	const uint8_t* left) {
462	if (left != NULL) {
463	int j;
464	for (j = `0`; j < `16`; j += `4`) {
465	const v16u8 L0 = (v16u8)__msa_fill_b(left[`0`]);
466	const v16u8 L1 = (v16u8)__msa_fill_b(left[`1`]);
467	const v16u8 L2 = (v16u8)__msa_fill_b(left[`2`]);
468	const v16u8 L3 = (v16u8)__msa_fill_b(left[`3`]);
469	ST_UB4(L0, L1, L2, L3, dst, BPS);
470	dst += `4` * BPS;
471	left += `4`;
472	}
473	} else {
474	const v16u8 out = (v16u8)__msa_fill_b(`0x81`);
475	STORE16x16(out, dst);
476	}
477	}
478
479	static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left,
480	const uint8_t* top) {
481	if (left != NULL) {
482	if (top != NULL) {
483	int j;
484	v8i16 d1, d2;
485	const v16i8 zero = { `0` };
486	const v8i16 TL = (v8i16)__msa_fill_h(left[-`1`]);
487	const v16u8 T = LD_UB(top);
488	ILVRL_B2_SH(zero, T, d1, d2);
489	SUB2(d1, TL, d2, TL, d1, d2);
490	for (j = `0`; j < `16`; j += `4`) {
491	v16i8 t0, t1, t2, t3;
492	v8i16 r0, r1, r2, r3, r4, r5, r6, r7;
493	const v8i16 L0 = (v8i16)__msa_fill_h(left[j + `0`]);
494	const v8i16 L1 = (v8i16)__msa_fill_h(left[j + `1`]);
495	const v8i16 L2 = (v8i16)__msa_fill_h(left[j + `2`]);
496	const v8i16 L3 = (v8i16)__msa_fill_h(left[j + `3`]);
497	ADD4(d1, L0, d1, L1, d1, L2, d1, L3, r0, r1, r2, r3);
498	ADD4(d2, L0, d2, L1, d2, L2, d2, L3, r4, r5, r6, r7);
499	CLIP_SH4_0_255(r0, r1, r2, r3);
500	CLIP_SH4_0_255(r4, r5, r6, r7);
501	PCKEV_B4_SB(r4, r0, r5, r1, r6, r2, r7, r3, t0, t1, t2, t3);
502	ST_SB4(t0, t1, t2, t3, dst, BPS);
503	dst += `4` * BPS;
504	}
505	} else {
506	HorizontalPred16x16(dst, left);
507	}
508	} else {
509	if (top != NULL) {
510	VerticalPred16x16(dst, top);
511	} else {
512	const v16u8 out = (v16u8)__msa_fill_b(`0x81`);
513	STORE16x16(out, dst);
514	}
515	}
516	}
517
518	static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
519	const uint8_t* top) {
520	int DC;
521	v16u8 out;
522	if (top != NULL && left != NULL) {
523	const v16u8 rtop = LD_UB(top);
524	const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
525	const v16u8 rleft = LD_UB(left);
526	const v8u16 dcleft = __msa_hadd_u_h(rleft, rleft);
527	const v8u16 dctemp = dctop + dcleft;
528	DC = HADD_UH_U32(dctemp);
529	DC = (DC + `16`) >> `5`;
530	} else if (left != NULL) { // left but no top
531	const v16u8 rleft = LD_UB(left);
532	const v8u16 dcleft = __msa_hadd_u_h(rleft, rleft);
533	DC = HADD_UH_U32(dcleft);
534	DC = (DC + DC + `16`) >> `5`;
535	} else if (top != NULL) { // top but no left
536	const v16u8 rtop = LD_UB(top);
537	const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
538	DC = HADD_UH_U32(dctop);
539	DC = (DC + DC + `16`) >> `5`;
540	} else { // no top, no left, nothing.
541	DC = `0x80`;
542	}
543	out = (v16u8)__msa_fill_b(DC);
544	STORE16x16(out, dst);
545	}
546
547	static void Intra16Preds(uint8_t* dst,
548	const uint8_t* left, const uint8_t* top) {
549	DCMode16x16(I16DC16 + dst, left, top);
550	VerticalPred16x16(I16VE16 + dst, top);
551	HorizontalPred16x16(I16HE16 + dst, left);
552	TrueMotion16x16(I16TM16 + dst, left, top);
553	}
554
555	// Chroma 8x8 prediction
556
// Sums the 16 bytes of vector 'in' through a chain of pairwise horizontal
// adds, applies a rounding shift right by 4 (i.e. the rounded mean of 16
// samples), and sets 'out' to a 64-bit word holding that byte replicated
// eight times.
#define CALC_DC8(in, out) do {                                       \
  const v8u16 sum_h_m = __msa_hadd_u_h(in, in);                      \
  const v4u32 sum_w_m = __msa_hadd_u_w(sum_h_m, sum_h_m);            \
  const v2i64 sum_d_m = (v2i64)__msa_hadd_u_d(sum_w_m, sum_w_m);     \
  const v2i64 sum_hi_m = __msa_splati_d(sum_d_m, 1);                 \
  const v2i64 total_m = sum_hi_m + sum_d_m;                          \
  const v16i8 mean_m = (v16i8)__msa_srari_d(total_m, 4);             \
  const v2i64 splat_m = (v2i64)__msa_splati_b(mean_m, 0);            \
  out = __msa_copy_s_d(splat_m, 0);                                  \
} while (0)
567
// Writes the 64-bit word 'out' to 8 consecutive rows of 'dst' (row stride
// BPS), as two groups of four rows.
#define STORE8x8(out, dst) do {                     \
  SD4(out, out, out, out, (dst), BPS);              \
  SD4(out, out, out, out, (dst) + 4 * BPS, BPS);    \
} while (0)
572
573	static WEBP_INLINE void VerticalPred8x8(uint8_t* dst, const uint8_t* top) {
574	if (top != NULL) {
575	const uint64_t out = LD(top);
576	STORE8x8(out, dst);
577	} else {
578	const uint64_t out = `0x7f7f7f7f7f7f7f7fULL`;
579	STORE8x8(out, dst);
580	}
581	}
582
583	static WEBP_INLINE void HorizontalPred8x8(uint8_t* dst, const uint8_t* left) {
584	if (left != NULL) {
585	int j;
586	for (j = `0`; j < `8`; j += `4`) {
587	const v16u8 L0 = (v16u8)__msa_fill_b(left[`0`]);
588	const v16u8 L1 = (v16u8)__msa_fill_b(left[`1`]);
589	const v16u8 L2 = (v16u8)__msa_fill_b(left[`2`]);
590	const v16u8 L3 = (v16u8)__msa_fill_b(left[`3`]);
591	const uint64_t out0 = __msa_copy_s_d((v2i64)L0, `0`);
592	const uint64_t out1 = __msa_copy_s_d((v2i64)L1, `0`);
593	const uint64_t out2 = __msa_copy_s_d((v2i64)L2, `0`);
594	const uint64_t out3 = __msa_copy_s_d((v2i64)L3, `0`);
595	SD4(out0, out1, out2, out3, dst, BPS);
596	dst += `4` * BPS;
597	left += `4`;
598	}
599	} else {
600	const uint64_t out = `0x8181818181818181ULL`;
601	STORE8x8(out, dst);
602	}
603	}
604
605	static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left,
606	const uint8_t* top) {
607	if (left != NULL) {
608	if (top != NULL) {
609	int j;
610	const v8i16 TL = (v8i16)__msa_fill_h(left[-`1`]);
611	const v16u8 T1 = LD_UB(top);
612	const v16i8 zero = { `0` };
613	const v8i16 T = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);
614	const v8i16 d = T - TL;
615	for (j = `0`; j < `8`; j += `4`) {
616	uint64_t out0, out1, out2, out3;
617	v16i8 t0, t1;
618	v8i16 r0 = (v8i16)__msa_fill_h(left[j + `0`]);
619	v8i16 r1 = (v8i16)__msa_fill_h(left[j + `1`]);
620	v8i16 r2 = (v8i16)__msa_fill_h(left[j + `2`]);
621	v8i16 r3 = (v8i16)__msa_fill_h(left[j + `3`]);
622	ADD4(d, r0, d, r1, d, r2, d, r3, r0, r1, r2, r3);
623	CLIP_SH4_0_255(r0, r1, r2, r3);
624	PCKEV_B2_SB(r1, r0, r3, r2, t0, t1);
625	out0 = __msa_copy_s_d((v2i64)t0, `0`);
626	out1 = __msa_copy_s_d((v2i64)t0, `1`);
627	out2 = __msa_copy_s_d((v2i64)t1, `0`);
628	out3 = __msa_copy_s_d((v2i64)t1, `1`);
629	SD4(out0, out1, out2, out3, dst, BPS);
630	dst += `4` * BPS;
631	}
632	} else {
633	HorizontalPred8x8(dst, left);
634	}
635	} else {
636	if (top != NULL) {
637	VerticalPred8x8(dst, top);
638	} else {
639	const uint64_t out = `0x8181818181818181ULL`;
640	STORE8x8(out, dst);
641	}
642	}
643	}
644
645	static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
646	const uint8_t* top) {
647	uint64_t out;
648	v16u8 src;
649	if (top != NULL && left != NULL) {
650	const uint64_t left_m = LD(left);
651	const uint64_t top_m = LD(top);
652	INSERT_D2_UB(left_m, top_m, src);
653	CALC_DC8(src, out);
654	} else if (left != NULL) { // left but no top
655	const uint64_t left_m = LD(left);
656	INSERT_D2_UB(left_m, left_m, src);
657	CALC_DC8(src, out);
658	} else if (top != NULL) { // top but no left
659	const uint64_t top_m = LD(top);
660	INSERT_D2_UB(top_m, top_m, src);
661	CALC_DC8(src, out);
662	} else { // no top, no left, nothing.
663	src = (v16u8)__msa_fill_b(`0x80`);
664	out = __msa_copy_s_d((v2i64)src, `0`);
665	}
666	STORE8x8(out, dst);
667	}
668
669	static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
670	const uint8_t* top) {
671	// U block
672	DCMode8x8(C8DC8 + dst, left, top);
673	VerticalPred8x8(C8VE8 + dst, top);
674	HorizontalPred8x8(C8HE8 + dst, left);
675	TrueMotion8x8(C8TM8 + dst, left, top);
676	// V block
677	dst += `8`;
678	if (top != NULL) top += `8`;
679	if (left != NULL) left += `16`;
680	DCMode8x8(C8DC8 + dst, left, top);
681	VerticalPred8x8(C8VE8 + dst, top);
682	HorizontalPred8x8(C8HE8 + dst, left);
683	TrueMotion8x8(C8TM8 + dst, left, top);
684	}
685
686	//------------------------------------------------------------------------------
687	// Metric
688
// Squared-difference kernel for two pairs of byte vectors. (in0, in1) and
// (in2, in3) are interleaved, turned into 16-bit differences, and each
// difference vector is dotted with itself. out0..out3 are overwritten.
#define PACK_DOTP_UB4_SW(in0, in1, in2, in3, out0, out1, out2, out3) do {  \
  v16u8 ilv_r_m, ilv_l_m;                                                  \
  v8i16 diff_r_m, diff_l_m;                                                \
  ILVRL_B2_UB(in0, in1, ilv_r_m, ilv_l_m);                                 \
  HSUB_UB2_SH(ilv_r_m, ilv_l_m, diff_r_m, diff_l_m);                       \
  DOTP_SH2_SW(diff_r_m, diff_l_m, diff_r_m, diff_l_m, out0, out1);         \
  ILVRL_B2_UB(in2, in3, ilv_r_m, ilv_l_m);                                 \
  HSUB_UB2_SH(ilv_r_m, ilv_l_m, diff_r_m, diff_l_m);                       \
  DOTP_SH2_SW(diff_r_m, diff_l_m, diff_r_m, diff_l_m, out2, out3);         \
} while (0)
699
// Accumulating variant of PACK_DOTP_UB4_SW: the same interleave / subtract
// sequence, but the self dot-products are added into out0..out3, which must
// already hold valid running sums.
#define PACK_DPADD_UB4_SW(in0, in1, in2, in3, out0, out1, out2, out3) do {  \
  v16u8 ilv_r_m, ilv_l_m;                                                   \
  v8i16 diff_r_m, diff_l_m;                                                 \
  ILVRL_B2_UB(in0, in1, ilv_r_m, ilv_l_m);                                  \
  HSUB_UB2_SH(ilv_r_m, ilv_l_m, diff_r_m, diff_l_m);                        \
  DPADD_SH2_SW(diff_r_m, diff_l_m, diff_r_m, diff_l_m, out0, out1);         \
  ILVRL_B2_UB(in2, in3, ilv_r_m, ilv_l_m);                                  \
  HSUB_UB2_SH(ilv_r_m, ilv_l_m, diff_r_m, diff_l_m);                        \
  DPADD_SH2_SW(diff_r_m, diff_l_m, diff_r_m, diff_l_m, out2, out3);         \
} while (0)
710
711	static int SSE16x16(const uint8_t* a, const uint8_t* b) {
712	uint32_t sum;
713	v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
714	v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
715	v4i32 out0, out1, out2, out3;
716
717	LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
718	LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
719	PACK_DOTP_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
720	PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
721	PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
722	PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
723	a += `8` * BPS;
724	b += `8` * BPS;
725	LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
726	LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
727	PACK_DPADD_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
728	PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
729	PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
730	PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
731	out0 += out1;
732	out2 += out3;
733	out0 += out2;
734	sum = HADD_SW_S32(out0);
735	return sum;
736	}
737
738	static int SSE16x8(const uint8_t* a, const uint8_t* b) {
739	uint32_t sum;
740	v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
741	v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
742	v4i32 out0, out1, out2, out3;
743
744	LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
745	LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
746	PACK_DOTP_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
747	PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
748	PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
749	PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
750	out0 += out1;
751	out2 += out3;
752	out0 += out2;
753	sum = HADD_SW_S32(out0);
754	return sum;
755	}
756
757	static int SSE8x8(const uint8_t* a, const uint8_t* b) {
758	uint32_t sum;
759	v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
760	v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
761	v16u8 t0, t1, t2, t3;
762	v4i32 out0, out1, out2, out3;
763
764	LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
765	LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
766	ILVR_B4_UB(src0, src1, src2, src3, ref0, ref1, ref2, ref3, t0, t1, t2, t3);
767	PACK_DOTP_UB4_SW(t0, t2, t1, t3, out0, out1, out2, out3);
768	ILVR_B4_UB(src4, src5, src6, src7, ref4, ref5, ref6, ref7, t0, t1, t2, t3);
769	PACK_DPADD_UB4_SW(t0, t2, t1, t3, out0, out1, out2, out3);
770	out0 += out1;
771	out2 += out3;
772	out0 += out2;
773	sum = HADD_SW_S32(out0);
774	return sum;
775	}
776
777	static int SSE4x4(const uint8_t* a, const uint8_t* b) {
778	uint32_t sum = `0`;
779	uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
780	v16u8 src, ref, tmp0, tmp1;
781	v8i16 diff0, diff1;
782	v4i32 out0, out1;
783
784	LW4(a, BPS, src0, src1, src2, src3);
785	LW4(b, BPS, ref0, ref1, ref2, ref3);
786	INSERT_W4_UB(src0, src1, src2, src3, src);
787	INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
788	ILVRL_B2_UB(src, ref, tmp0, tmp1);
789	HSUB_UB2_SH(tmp0, tmp1, diff0, diff1);
790	DOTP_SH2_SW(diff0, diff1, diff0, diff1, out0, out1);
791	out0 += out1;
792	sum = HADD_SW_S32(out0);
793	return sum;
794	}
795
796	//------------------------------------------------------------------------------
797	// Quantization
798
799	static int QuantizeBlock(int16_t in[`16`], int16_t out[`16`],
800	const VP8Matrix* const mtx) {
801	int sum;
802	v8i16 in0, in1, sh0, sh1, out0, out1;
803	v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1;
804	v4i32 s0, s1, s2, s3, b0, b1, b2, b3, t0, t1, t2, t3;
805	const v8i16 zero = { `0` };
806	const v8i16 zigzag0 = { `0`, `1`, `4`, `8`, `5`, `2`, `3`, `6` };
807	const v8i16 zigzag1 = { `9`, `12`, `13`, `10`, `7`, `11`, `14`, `15` };
808	const v8i16 maxlevel = __msa_fill_h(MAX_LEVEL);
809
810	LD_SH2(&in[`0`], `8`, in0, in1);
811	LD_SH2(&mtx->sharpen_[`0`], `8`, sh0, sh1);
812	tmp4 = __msa_add_a_h(in0, zero);
813	tmp5 = __msa_add_a_h(in1, zero);
814	ILVRL_H2_SH(sh0, tmp4, tmp0, tmp1);
815	ILVRL_H2_SH(sh1, tmp5, tmp2, tmp3);
816	HADD_SH4_SW(tmp0, tmp1, tmp2, tmp3, s0, s1, s2, s3);
817	sign0 = (in0 < zero);
818	sign1 = (in1 < zero); // sign
819	LD_SH2(&mtx->iq_[`0`], `8`, tmp0, tmp1); // iq
820	ILVRL_H2_SW(zero, tmp0, t0, t1);
821	ILVRL_H2_SW(zero, tmp1, t2, t3);
822	LD_SW4(&mtx->bias_[`0`], `4`, b0, b1, b2, b3); // bias
823	MUL4(t0, s0, t1, s1, t2, s2, t3, s3, t0, t1, t2, t3);
824	ADD4(b0, t0, b1, t1, b2, t2, b3, t3, b0, b1, b2, b3);
825	SRAI_W4_SW(b0, b1, b2, b3, `17`);
826	PCKEV_H2_SH(b1, b0, b3, b2, tmp2, tmp3);
827	tmp0 = (tmp2 > maxlevel);
828	tmp1 = (tmp3 > maxlevel);
829	tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)maxlevel, (v16u8)tmp0);
830	tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)maxlevel, (v16u8)tmp1);
831	SUB2(`0`, tmp2, `0`, tmp3, tmp0, tmp1);
832	tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)tmp0, (v16u8)sign0);
833	tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)tmp1, (v16u8)sign1);
834	LD_SW4(&mtx->zthresh_[`0`], `4`, t0, t1, t2, t3); // zthresh
835	t0 = (s0 > t0);
836	t1 = (s1 > t1);
837	t2 = (s2 > t2);
838	t3 = (s3 > t3);
839	PCKEV_H2_SH(t1, t0, t3, t2, tmp0, tmp1);
840	tmp4 = (v8i16)__msa_bmnz_v((v16u8)zero, (v16u8)tmp2, (v16u8)tmp0);
841	tmp5 = (v8i16)__msa_bmnz_v((v16u8)zero, (v16u8)tmp3, (v16u8)tmp1);
842	LD_SH2(&mtx->q_[`0`], `8`, tmp0, tmp1);
843	MUL2(tmp4, tmp0, tmp5, tmp1, in0, in1);
844	VSHF_H2_SH(tmp4, tmp5, tmp4, tmp5, zigzag0, zigzag1, out0, out1);
845	ST_SH2(in0, in1, &in[`0`], `8`);
846	ST_SH2(out0, out1, &out[`0`], `8`);
847	out0 = __msa_add_a_h(out0, out1);
848	sum = HADD_SH_S32(out0);
849	return (sum > `0`);
850	}
851
852	static int Quantize2Blocks(int16_t in[`32`], int16_t out[`32`],
853	const VP8Matrix* const mtx) {
854	int nz;
855	nz = VP8EncQuantizeBlock(in + `0` * `16`, out + `0` * `16`, mtx) << `0`;
856	nz \|= VP8EncQuantizeBlock(in + `1` * `16`, out + `1` * `16`, mtx) << `1`;
857	return nz;
858	}
859
860	//------------------------------------------------------------------------------
861	// Entry point
862
863	extern void VP8EncDspInitMSA(void);
864
865	WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) {
866	VP8ITransform = ITransform;
867	VP8FTransform = FTransform;
868	VP8FTransformWHT = FTransformWHT;
869
870	VP8TDisto4x4 = Disto4x4;
871	VP8TDisto16x16 = Disto16x16;
872	VP8CollectHistogram = CollectHistogram;
873
874	VP8EncPredLuma4 = Intra4Preds;
875	VP8EncPredLuma16 = Intra16Preds;
876	VP8EncPredChroma8 = IntraChromaPreds;
877
878	VP8SSE16x16 = SSE16x16;
879	VP8SSE16x8 = SSE16x8;
880	VP8SSE8x8 = SSE8x8;
881	VP8SSE4x4 = SSE4x4;
882
883	VP8EncQuantizeBlock = QuantizeBlock;
884	VP8EncQuantize2Blocks = Quantize2Blocks;
885	VP8EncQuantizeBlockWHT = QuantizeBlock;
886	}
887
888	#else // !WEBP_USE_MSA
889
890	WEBP_DSP_INIT_STUB(VP8EncDspInitMSA)
891
892	#endif // WEBP_USE_MSA
893

// End of engine/third_party/libwebp/src/dsp/enc_msa.c