enc_msa.c source code [Skia/third_party/externals/libwebp/src/dsp/enc_msa.c]

1	// Copyright 2016 Google Inc. All Rights Reserved.
2	//
3	// Use of this source code is governed by a BSD-style license
4	// that can be found in the COPYING file in the root of the source
5	// tree. An additional intellectual property rights grant can be found
6	// in the file PATENTS. All contributing project authors may
7	// be found in the AUTHORS file in the root of the source tree.
8	// -----------------------------------------------------------------------------
9	//
10	// MSA version of encoder dsp functions.
11	//
12	// Author: Prashant Patil (prashant.patil@imgtec.com)
13
14	#include "src/dsp/dsp.h"
15
16	#if defined(WEBP_USE_MSA)
17
18	#include <stdlib.h>
19	#include "src/dsp/msa_macro.h"
20	#include "src/enc/vp8i_enc.h"
21
22	//------------------------------------------------------------------------------
23	// Transforms
24
// One 1-D pass of the 4-point inverse transform over four v4i32 vectors.
//  - in0/in2 go through ADDSUB2 into a1_m/b1_m.
//  - in1/in3 are multiplied by the constants 35468 and 20091 and shifted
//    right by 16 (SRAI_W2_SW); the 20091 products then get the unscaled
//    input added back, i.e. those terms are x + ((x * 20091) >> 16).
//  - c1_m = (in1 * 35468 >> 16) - (in3 + (in3 * 20091 >> 16))
//    d1_m = (in1 + (in1 * 20091 >> 16)) + (in3 * 35468 >> 16)
//  - BUTTERFLY_4 combines a1_m, b1_m, c1_m, d1_m into out0..out3.
// ADDSUB2, SRAI_W2_SW and BUTTERFLY_4 are defined outside this file.
// The macro declares its own temporaries (the *_m names, cospi8sqrt2minus1
// and sinpi8sqrt2) inside the do/while scope, so callers must not pass
// arguments that use those names.
25	#define IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) do { \
26	v4i32 a1_m, b1_m, c1_m, d1_m; \
27	const v4i32 cospi8sqrt2minus1 = __msa_fill_w(20091); \
28	const v4i32 sinpi8sqrt2 = __msa_fill_w(35468); \
29	v4i32 c_tmp1_m = in1 * sinpi8sqrt2; \
30	v4i32 c_tmp2_m = in3 * cospi8sqrt2minus1; \
31	v4i32 d_tmp1_m = in1 * cospi8sqrt2minus1; \
32	v4i32 d_tmp2_m = in3 * sinpi8sqrt2; \
33	\
34	ADDSUB2(in0, in2, a1_m, b1_m); \
35	SRAI_W2_SW(c_tmp1_m, c_tmp2_m, 16); \
36	c_tmp2_m = c_tmp2_m + in3; \
37	c1_m = c_tmp1_m - c_tmp2_m; \
38	SRAI_W2_SW(d_tmp1_m, d_tmp2_m, 16); \
39	d_tmp1_m = d_tmp1_m + in1; \
40	d1_m = d_tmp1_m + d_tmp2_m; \
41	BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \
42	} while (0)
43
// Runs the two-pass IDCT_1D_W inverse transform on the coefficients at 'in'
// (read as two v8i16 loads), adds the result to the 4x4 pixel block read from
// 'ref' and stores the sum, clipped to [0, 255], as a 4x4 block at 'dst'.
// Rows of 'ref' and 'dst' are BPS bytes apart.
44	static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
45	uint8_t* dst) {
46	v8i16 input0, input1;
47	v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
48	v4i32 res0, res1, res2, res3;
49	v16i8 dest0, dest1, dest2, dest3;
50	const v16i8 zero = { `0` };
51
// Load the coefficients and widen them to 32-bit lanes (UNPCK_SH_SW, per its
// name).
52	LD_SH2(in, `8`, input0, input1);
53	UNPCK_SH_SW(input0, in0, in1);
54	UNPCK_SH_SW(input1, in2, in3);
// First 1-D pass, transpose, second 1-D pass.
55	IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
56	TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
57	IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);
// Shift right by 3 (SRARI presumably rounds -- TODO confirm), then transpose
// back so each vector lines up with one row of 'ref'.
58	SRARI_W4_SW(vt0, vt1, vt2, vt3, `3`);
59	TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
// Load four rows of 'ref' and widen bytes -> halfwords -> words by
// interleaving with zero.
60	LD_SB4(ref, BPS, dest0, dest1, dest2, dest3);
61	ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3,
62	res0, res1, res2, res3);
63	ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,
64	res0, res1, res2, res3);
// Add the residual to the reference, clip, pack back down to bytes and store.
65	ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
66	CLIP_SW4_0_255(res0, res1, res2, res3);
67	PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1);
68	res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1);
69	ST4x4_UB(res0, res0, `3`, `2`, `1`, `0`, dst, BPS);
70	}
71
// Inverse-transforms one 4x4 block, or two side-by-side blocks when 'do_two'
// is non-zero. The second block reads 'ref' and writes 'dst' 4 bytes further
// along, and takes its coefficients from in[16] onwards.
static void ITransform_MSA(const uint8_t* ref, const int16_t* in, uint8_t* dst,
                           int do_two) {
  const int num_blocks = do_two ? 2 : 1;
  int n;
  for (n = 0; n < num_blocks; ++n) {
    ITransformOne(ref + 4 * n, in + 16 * n, dst + 4 * n);
  }
}
79
// Forward transform of the 4x4 block of differences between 'src' and 'ref'
// (rows BPS apart in both). Four 64-bit results are written to 'out' through
// SD4 with a stride of 8 (presumably bytes, i.e. 16 contiguous int16_t values
// -- TODO confirm against SD4's definition).
80	static void FTransform_MSA(const uint8_t* src, const uint8_t* ref,
81	int16_t* out) {
82	uint64_t out0, out1, out2, out3;
83	uint32_t in0, in1, in2, in3;
84	v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
85	v8i16 t0, t1, t2, t3;
86	v16u8 srcl0, srcl1, src0 = { `0` }, src1 = { `0` };
// Halfword shuffle patterns for VSHF_H2_SH; they regroup the lanes so that
// ADDSUB2 / hadd / hsub / dot-product operate on the intended pairs.
87	const v8i16 mask0 = { `0`, `4`, `8`, `12`, `1`, `5`, `9`, `13` };
88	const v8i16 mask1 = { `3`, `7`, `11`, `15`, `2`, `6`, `10`, `14` };
89	const v8i16 mask2 = { `4`, `0`, `5`, `1`, `6`, `2`, `7`, `3` };
90	const v8i16 mask3 = { `0`, `4`, `1`, `5`, `2`, `6`, `3`, `7` };
// Multiplier pairs fed to the dot-product-accumulate steps below.
91	const v8i16 cnst0 = { `2217`, -`5352`, `2217`, -`5352`, `2217`, -`5352`, `2217`, -`5352` };
92	const v8i16 cnst1 = { `5352`, `2217`, `5352`, `2217`, `5352`, `2217`, `5352`, `2217` };
93
// Gather four 32-bit rows of 'src' and of 'ref' into one vector each, then
// interleave them and take the differences (HSUB_UB2_SH, per its name).
94	LW4(src, BPS, in0, in1, in2, in3);
95	INSERT_W4_UB(in0, in1, in2, in3, src0);
96	LW4(ref, BPS, in0, in1, in2, in3);
97	INSERT_W4_UB(in0, in1, in2, in3, src1);
98	ILVRL_B2_UB(src0, src1, srcl0, srcl1);
99	HSUB_UB2_SH(srcl0, srcl1, t0, t1);
// First pass. tmp1/tmp3 start from the constants 1812 and 937, accumulate the
// cnst0/cnst1 dot products and are shifted right by 9.
100	VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
101	ADDSUB2(t2, t3, t0, t1);
// NOTE(review): SRLI_H is defined outside this file; confirm which shift
// direction it expands to on every supported compiler.
102	t0 = SRLI_H(t0, `3`);
103	VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
104	tmp0 = __msa_hadd_s_w(t3, t3);
105	tmp2 = __msa_hsub_s_w(t3, t3);
106	FILL_W2_SW(`1812`, `937`, tmp1, tmp3);
107	DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
108	SRAI_W2_SW(tmp1, tmp3, `9`);
109	PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
// Second pass. The hadd/hsub results get +7 and >> 4; tmp1/tmp3 start from
// the constants 12000 and 51000, accumulate the dot products and get >> 16.
110	VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
111	ADDSUB2(t2, t3, t0, t1);
112	VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
113	tmp0 = __msa_hadd_s_w(t3, t3);
114	tmp2 = __msa_hsub_s_w(t3, t3);
115	ADDVI_W2_SW(tmp0, `7`, tmp2, `7`, tmp0, tmp2);
116	SRAI_W2_SW(tmp0, tmp2, `4`);
117	FILL_W2_SW(`12000`, `51000`, tmp1, tmp3);
118	DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
119	SRAI_W2_SW(tmp1, tmp3, `16`);
// Build tmp5 = 1 in every lane whose widened t1 value is non-zero (compare
// with zero, invert via NOR, mask with 1) and add it to tmp1.
120	UNPCK_R_SH_SW(t1, tmp4);
121	tmp5 = __msa_ceqi_w(tmp4, `0`);
122	tmp4 = (v4i32)__msa_nor_v((v16u8)tmp5, (v16u8)tmp5);
123	tmp5 = __msa_fill_w(`1`);
124	tmp5 = (v4i32)__msa_and_v((v16u8)tmp5, (v16u8)tmp4);
125	tmp1 += tmp5;
// Narrow to halfwords and store both vectors as four 64-bit pieces.
126	PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
127	out0 = __msa_copy_s_d((v2i64)t0, `0`);
128	out1 = __msa_copy_s_d((v2i64)t0, `1`);
129	out2 = __msa_copy_s_d((v2i64)t1, `0`);
130	out3 = __msa_copy_s_d((v2i64)t1, `1`);
131	SD4(out0, out1, out2, out3, out, `8`);
132	}
133
// Gathers 16 values from 'in' at a stride of 16 elements (in[0], in[16], ...,
// in[240]), runs four add/sub stages separated by halfword shuffles, shifts
// every result right by 1 (arithmetic) and stores two v8i16 vectors to 'out'.
134	static void FTransformWHT_MSA(const int16_t* in, int16_t* out) {
135	v8i16 in0 = { `0` };
136	v8i16 in1 = { `0` };
137	v8i16 tmp0, tmp1, tmp2, tmp3;
138	v8i16 out0, out1;
139	const v8i16 mask0 = { `0`, `1`, `2`, `3`, `8`, `9`, `10`, `11` };
140	const v8i16 mask1 = { `4`, `5`, `6`, `7`, `12`, `13`, `14`, `15` };
141	const v8i16 mask2 = { `0`, `4`, `8`, `12`, `1`, `5`, `9`, `13` };
142	const v8i16 mask3 = { `3`, `7`, `11`, `15`, `2`, `6`, `10`, `14` };
143
// in0 receives the strided elements number {0, 4, 8, 12, 1, 5, 9, 13} and in1
// the elements number {3, 7, 11, 15, 2, 6, 10, 14} (element k is in[16 * k]),
// i.e. the input is gathered already permuted for the first ADDSUB2.
144	in0 = __msa_insert_h(in0, `0`, in[ `0`]);
145	in0 = __msa_insert_h(in0, `1`, in[ `64`]);
146	in0 = __msa_insert_h(in0, `2`, in[`128`]);
147	in0 = __msa_insert_h(in0, `3`, in[`192`]);
148	in0 = __msa_insert_h(in0, `4`, in[ `16`]);
149	in0 = __msa_insert_h(in0, `5`, in[ `80`]);
150	in0 = __msa_insert_h(in0, `6`, in[`144`]);
151	in0 = __msa_insert_h(in0, `7`, in[`208`]);
152	in1 = __msa_insert_h(in1, `0`, in[ `48`]);
153	in1 = __msa_insert_h(in1, `1`, in[`112`]);
154	in1 = __msa_insert_h(in1, `2`, in[`176`]);
155	in1 = __msa_insert_h(in1, `3`, in[`240`]);
156	in1 = __msa_insert_h(in1, `4`, in[ `32`]);
157	in1 = __msa_insert_h(in1, `5`, in[ `96`]);
158	in1 = __msa_insert_h(in1, `6`, in[`160`]);
159	in1 = __msa_insert_h(in1, `7`, in[`224`]);
// Four butterfly stages; each shuffle re-pairs the lanes for the next one.
160	ADDSUB2(in0, in1, tmp0, tmp1);
161	VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
162	ADDSUB2(tmp2, tmp3, tmp0, tmp1);
163	VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
164	ADDSUB2(in0, in1, tmp0, tmp1);
165	VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
166	ADDSUB2(tmp2, tmp3, out0, out1);
// Halve the results and store them.
167	SRAI_H2_SH(out0, out1, `1`);
168	ST_SH2(out0, out1, out, `8`);
169	}
170
// Transforms the 4x4 byte block at 'in' (rows BPS apart) with four add/sub
// stages, takes the absolute value of every result and returns the sum of
// those values weighted by the entries loaded from 'w' (two v8i16 loads).
171	static int TTransform_MSA(const uint8_t* in, const uint16_t* w) {
172	int sum;
173	uint32_t in0_m, in1_m, in2_m, in3_m;
174	v16i8 src0 = { `0` };
175	v8i16 in0, in1, tmp0, tmp1, tmp2, tmp3;
176	v4i32 dst0, dst1;
177	const v16i8 zero = { `0` };
178	const v8i16 mask0 = { `0`, `1`, `2`, `3`, `8`, `9`, `10`, `11` };
179	const v8i16 mask1 = { `4`, `5`, `6`, `7`, `12`, `13`, `14`, `15` };
180	const v8i16 mask2 = { `0`, `4`, `8`, `12`, `1`, `5`, `9`, `13` };
181	const v8i16 mask3 = { `3`, `7`, `11`, `15`, `2`, `6`, `10`, `14` };
182
// Load four 32-bit rows and widen the bytes to halfwords by interleaving
// with zero.
183	LW4(in, BPS, in0_m, in1_m, in2_m, in3_m);
184	INSERT_W4_SB(in0_m, in1_m, in2_m, in3_m, src0);
185	ILVRL_B2_SH(zero, src0, tmp0, tmp1);
// Four butterfly stages, alternating between the mask2/mask3 and mask0/mask1
// lane pairings.
186	VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
187	ADDSUB2(in0, in1, tmp0, tmp1);
188	VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
189	ADDSUB2(tmp2, tmp3, tmp0, tmp1);
190	VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
191	ADDSUB2(in0, in1, tmp0, tmp1);
192	VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
193	ADDSUB2(tmp2, tmp3, tmp0, tmp1);
// add_a adds absolute values; with a zero operand it yields |x|.
194	tmp0 = __msa_add_a_h(tmp0, (v8i16)zero);
195	tmp1 = __msa_add_a_h(tmp1, (v8i16)zero);
// Weighted sum: dot product with the weights, then a horizontal add.
196	LD_SH2(w, `8`, tmp2, tmp3);
197	DOTP_SH2_SW(tmp0, tmp1, tmp2, tmp3, dst0, dst1);
198	dst0 = dst0 + dst1;
199	sum = HADD_SW_S32(dst0);
200	return sum;
201	}
202
// Returns the absolute difference between the weighted transform sums of the
// 4x4 blocks 'a' and 'b' (see TTransform_MSA), scaled down by >> 5.
static int Disto4x4_MSA(const uint8_t* const a, const uint8_t* const b,
                        const uint16_t* const w) {
  const int weighted_a = TTransform_MSA(a, w);
  const int weighted_b = TTransform_MSA(b, w);
  const int delta = weighted_b - weighted_a;
  return abs(delta) >> 5;
}
209
210	static int Disto16x16_MSA(const uint8_t* const a, const uint8_t* const b,
211	const uint16_t* const w) {
212	int D = `0`;
213	int x, y;
214	for (y = `0`; y < `16` * BPS; y += `4` * BPS) {
215	for (x = `0`; x < `16`; x += `4`) {
216	D += Disto4x4_MSA(a + x + y, b + x + y, w);
217	}
218	}
219	return D;
220	}
221
222	//------------------------------------------------------------------------------
223	// Histogram
224
// For every block index j in [start_block, end_block), forward-transforms the
// ref/pred pair located at offset VP8DspScan[j] through the VP8FTransform
// function pointer, maps each of the 16 coefficients to
// min(|coeff| >> 3, MAX_COEFF_THRESH) and counts how often each bin occurs.
// The counts are handed to VP8SetHistogramData() together with 'histo'.
225	static void CollectHistogram_MSA(const uint8_t* ref, const uint8_t* pred,
226	int start_block, int end_block,
227	VP8Histogram* const histo) {
228	int j;
229	int distribution[MAX_COEFF_THRESH + `1`] = { `0` };
230	for (j = start_block; j < end_block; ++j) {
231	int16_t out[`16`];
232	VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
233	{
234	int k;
235	v8i16 coeff0, coeff1;
236	const v8i16 zero = { `0` };
237	const v8i16 max_coeff_thr = __msa_ldi_h(MAX_COEFF_THRESH);
238	LD_SH2(&out[`0`], `8`, coeff0, coeff1);
// add_a with zero yields the absolute value of each coefficient.
239	coeff0 = __msa_add_a_h(coeff0, zero);
240	coeff1 = __msa_add_a_h(coeff1, zero);
241	SRAI_H2_SH(coeff0, coeff1, `3`);
// Clamp so that every value is a valid index into 'distribution'.
242	coeff0 = __msa_min_s_h(coeff0, max_coeff_thr);
243	coeff1 = __msa_min_s_h(coeff1, max_coeff_thr);
// The bins are written back over 'out' and counted with scalar code.
244	ST_SH2(coeff0, coeff1, &out[`0`], `8`);
245	for (k = `0`; k < `16`; ++k) {
246	++distribution[out[k]];
247	}
248	}
249	}
250	VP8SetHistogramData(distribution, histo);
251	}
252
253	//------------------------------------------------------------------------------
254	// Intra predictions
255
256	// luma 4x4 prediction
257
// Helpers for the 4x4 luma predictors below.
// DST(x, y): the byte at column x, row y of the block at 'dst' (rows are BPS
// apart); relies on a variable named 'dst' being in scope.
258	#define DST(x, y) dst[(x) + (y) * BPS]
// AVG3: rounded 3-tap average with weights 1, 2, 1.
259	#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
// AVG2: rounded average of two values.
260	#define AVG2(a, b) (((a) + (b) + 1) >> 1)
261
// Vertical 4x4 prediction: computes one 32-bit row from a 3-tap smoothing of
// the eight bytes loaded starting at top - 1 and writes that same row to all
// four rows of 'dst' (rows BPS apart).
262	static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical
263	const v16u8 A1 = { `0` };
264	const uint64_t val_m = LD(top - `1`);
265	const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, `0`, val_m);
// B and C: A slid by 1 and 2 (SLDI_UB presumably slides by whole bytes, giving
// the neighbouring samples -- TODO confirm against msa_macro.h).
266	const v16u8 B = SLDI_UB(A, A, `1`);
267	const v16u8 C = SLDI_UB(A, A, `2`);
// Average of A and C, then rounded average with B (ave(B, B) is just B).
// NOTE(review): presumably intended to equal AVG3(A, B, C) -- confirm.
268	const v16u8 AC = __msa_ave_u_b(A, C);
269	const v16u8 B2 = __msa_ave_u_b(B, B);
270	const v16u8 R = __msa_aver_u_b(AC, B2);
// Only the low 32 bits (four predicted pixels) are used.
271	const uint32_t out = __msa_copy_s_w((v4i32)R, `0`);
272	SW4(out, out, out, out, dst, BPS);
273	}
274
275	static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
276	const int X = top[-`1`];
277	const int I = top[-`2`];
278	const int J = top[-`3`];
279	const int K = top[-`4`];
280	const int L = top[-`5`];
281	WebPUint32ToMem(dst + `0` * BPS, `0x01010101U` * AVG3(X, I, J));
282	WebPUint32ToMem(dst + `1` * BPS, `0x01010101U` * AVG3(I, J, K));
283	WebPUint32ToMem(dst + `2` * BPS, `0x01010101U` * AVG3(J, K, L));
284	WebPUint32ToMem(dst + `3` * BPS, `0x01010101U` * AVG3(K, L, L));
285	}
286
287	static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
288	uint32_t dc = `4`;
289	int i;
290	for (i = `0`; i < `4`; ++i) dc += top[i] + top[-`5` + i];
291	dc >>= `3`;
292	dc = dc \| (dc << `8`) \| (dc << `16`) \| (dc << `24`);
293	SW4(dc, dc, dc, dc, dst, BPS);
294	}
295
// 4x4 prediction built from the eight bytes loaded at top - 5 plus top[3]
// (inserted as byte 8). One smoothed vector R0 is computed; rows 3, 2, 1, 0
// of 'dst' receive the low 32 bits of R0 slid by 0, 1, 2 and 3 respectively.
296	static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
297	const v16u8 A2 = { `0` };
298	const uint64_t val_m = LD(top - `5`);
299	const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A2, `0`, val_m);
300	const v16u8 A = (v16u8)__msa_insert_b((v16i8)A1, `8`, top[`3`]);
// B and C: A slid by 1 and 2 (presumably bytes -- TODO confirm SLDI_UB).
301	const v16u8 B = SLDI_UB(A, A, `1`);
302	const v16u8 C = SLDI_UB(A, A, `2`);
// Average of A and C, then rounded average with B (ave(B, B) is just B).
303	const v16u8 AC = __msa_ave_u_b(A, C);
304	const v16u8 B2 = __msa_ave_u_b(B, B);
305	const v16u8 R0 = __msa_aver_u_b(AC, B2);
306	const v16u8 R1 = SLDI_UB(R0, R0, `1`);
307	const v16u8 R2 = SLDI_UB(R1, R1, `1`);
308	const v16u8 R3 = SLDI_UB(R2, R2, `1`);
309	const uint32_t val0 = __msa_copy_s_w((v4i32)R0, `0`);
310	const uint32_t val1 = __msa_copy_s_w((v4i32)R1, `0`);
311	const uint32_t val2 = __msa_copy_s_w((v4i32)R2, `0`);
312	const uint32_t val3 = __msa_copy_s_w((v4i32)R3, `0`);
// Note the reversed order: val3 goes to the first row, val0 to the last.
313	SW4(val3, val2, val1, val0, dst, BPS);
314	}
315
// 4x4 prediction built from the eight bytes loaded at 'top'. One smoothed
// vector R0 is computed; rows 0, 1, 2, 3 of 'dst' receive the low 32 bits of
// R0 slid by 0, 1, 2 and 3 respectively.
316	static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {
317	const v16u8 A1 = { `0` };
318	const uint64_t val_m = LD(top);
319	const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, `0`, val_m);
// B and C: A slid by 1 and 2 (presumably bytes -- TODO confirm SLDI_UB).
320	const v16u8 B = SLDI_UB(A, A, `1`);
321	const v16u8 C1 = SLDI_UB(A, A, `2`);
// Byte 6 of the slid copy is overwritten with top[7], so the third tap of the
// last smoothed sample repeats top[7].
322	const v16u8 C = (v16u8)__msa_insert_b((v16i8)C1, `6`, top[`7`]);
// Average of A and C, then rounded average with B (ave(B, B) is just B).
323	const v16u8 AC = __msa_ave_u_b(A, C);
324	const v16u8 B2 = __msa_ave_u_b(B, B);
325	const v16u8 R0 = __msa_aver_u_b(AC, B2);
326	const v16u8 R1 = SLDI_UB(R0, R0, `1`);
327	const v16u8 R2 = SLDI_UB(R1, R1, `1`);
328	const v16u8 R3 = SLDI_UB(R2, R2, `1`);
329	const uint32_t val0 = __msa_copy_s_w((v4i32)R0, `0`);
330	const uint32_t val1 = __msa_copy_s_w((v4i32)R1, `0`);
331	const uint32_t val2 = __msa_copy_s_w((v4i32)R2, `0`);
332	const uint32_t val3 = __msa_copy_s_w((v4i32)R3, `0`);
333	SW4(val0, val1, val2, val3, dst, BPS);
334	}
335
336	static WEBP_INLINE void VR4(uint8_t* dst, const uint8_t* top) {
337	const int X = top[-`1`];
338	const int I = top[-`2`];
339	const int J = top[-`3`];
340	const int K = top[-`4`];
341	const int A = top[`0`];
342	const int B = top[`1`];
343	const int C = top[`2`];
344	const int D = top[`3`];
345	DST(`0`, `0`) = DST(`1`, `2`) = AVG2(X, A);
346	DST(`1`, `0`) = DST(`2`, `2`) = AVG2(A, B);
347	DST(`2`, `0`) = DST(`3`, `2`) = AVG2(B, C);
348	DST(`3`, `0`) = AVG2(C, D);
349	DST(`0`, `3`) = AVG3(K, J, I);
350	DST(`0`, `2`) = AVG3(J, I, X);
351	DST(`0`, `1`) = DST(`1`, `3`) = AVG3(I, X, A);
352	DST(`1`, `1`) = DST(`2`, `3`) = AVG3(X, A, B);
353	DST(`2`, `1`) = DST(`3`, `3`) = AVG3(A, B, C);
354	DST(`3`, `1`) = AVG3(B, C, D);
355	}
356
357	static WEBP_INLINE void VL4(uint8_t* dst, const uint8_t* top) {
358	const int A = top[`0`];
359	const int B = top[`1`];
360	const int C = top[`2`];
361	const int D = top[`3`];
362	const int E = top[`4`];
363	const int F = top[`5`];
364	const int G = top[`6`];
365	const int H = top[`7`];
366	DST(`0`, `0`) = AVG2(A, B);
367	DST(`1`, `0`) = DST(`0`, `2`) = AVG2(B, C);
368	DST(`2`, `0`) = DST(`1`, `2`) = AVG2(C, D);
369	DST(`3`, `0`) = DST(`2`, `2`) = AVG2(D, E);
370	DST(`0`, `1`) = AVG3(A, B, C);
371	DST(`1`, `1`) = DST(`0`, `3`) = AVG3(B, C, D);
372	DST(`2`, `1`) = DST(`1`, `3`) = AVG3(C, D, E);
373	DST(`3`, `1`) = DST(`2`, `3`) = AVG3(D, E, F);
374	DST(`3`, `2`) = AVG3(E, F, G);
375	DST(`3`, `3`) = AVG3(F, G, H);
376	}
377
378	static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
379	const int I = top[-`2`];
380	const int J = top[-`3`];
381	const int K = top[-`4`];
382	const int L = top[-`5`];
383	DST(`0`, `0`) = AVG2(I, J);
384	DST(`2`, `0`) = DST(`0`, `1`) = AVG2(J, K);
385	DST(`2`, `1`) = DST(`0`, `2`) = AVG2(K, L);
386	DST(`1`, `0`) = AVG3(I, J, K);
387	DST(`3`, `0`) = DST(`1`, `1`) = AVG3(J, K, L);
388	DST(`3`, `1`) = DST(`1`, `2`) = AVG3(K, L, L);
389	DST(`3`, `2`) = DST(`2`, `2`) =
390	DST(`0`, `3`) = DST(`1`, `3`) = DST(`2`, `3`) = DST(`3`, `3`) = L;
391	}
392
393	static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
394	const int X = top[-`1`];
395	const int I = top[-`2`];
396	const int J = top[-`3`];
397	const int K = top[-`4`];
398	const int L = top[-`5`];
399	const int A = top[`0`];
400	const int B = top[`1`];
401	const int C = top[`2`];
402	DST(`0`, `0`) = DST(`2`, `1`) = AVG2(I, X);
403	DST(`0`, `1`) = DST(`2`, `2`) = AVG2(J, I);
404	DST(`0`, `2`) = DST(`2`, `3`) = AVG2(K, J);
405	DST(`0`, `3`) = AVG2(L, K);
406	DST(`3`, `0`) = AVG3(A, B, C);
407	DST(`2`, `0`) = AVG3(X, A, B);
408	DST(`1`, `0`) = DST(`3`, `1`) = AVG3(I, X, A);
409	DST(`1`, `1`) = DST(`3`, `2`) = AVG3(J, I, X);
410	DST(`1`, `2`) = DST(`3`, `3`) = AVG3(K, J, I);
411	DST(`1`, `3`) = AVG3(L, K, J);
412	}
413
// 4x4 prediction where row i is (top row - top[-1]) + top[-2 - i], clipped
// to [0, 255]. The top row is the low bytes of a full vector load from 'top',
// widened to halfwords.
414	static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
415	const v16i8 zero = { `0` };
// TL is top[-1]; L0..L3 are top[-2]..top[-5], each replicated to every lane.
416	const v8i16 TL = (v8i16)__msa_fill_h(top[-`1`]);
417	const v8i16 L0 = (v8i16)__msa_fill_h(top[-`2`]);
418	const v8i16 L1 = (v8i16)__msa_fill_h(top[-`3`]);
419	const v8i16 L2 = (v8i16)__msa_fill_h(top[-`4`]);
420	const v8i16 L3 = (v8i16)__msa_fill_h(top[-`5`]);
421	const v16u8 T1 = LD_UB(top);
422	const v8i16 T = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);
// d is shared by all four rows; only the added L value differs per row.
423	const v8i16 d = T - TL;
424	v8i16 r0, r1, r2, r3;
425	ADD4(d, L0, d, L1, d, L2, d, L3, r0, r1, r2, r3);
426	CLIP_SH4_0_255(r0, r1, r2, r3);
427	PCKEV_ST4x4_UB(r0, r1, r2, r3, dst, BPS);
428	}
429
430	#undef DST
431	#undef AVG3
432	#undef AVG2
433
434	static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) {
435	DC4(I4DC4 + dst, top);
436	TM4(I4TM4 + dst, top);
437	VE4(I4VE4 + dst, top);
438	HE4(I4HE4 + dst, top);
439	RD4(I4RD4 + dst, top);
440	VR4(I4VR4 + dst, top);
441	LD4(I4LD4 + dst, top);
442	VL4(I4VL4 + dst, top);
443	HD4(I4HD4 + dst, top);
444	HU4(I4HU4 + dst, top);
445	}
446
447	// luma 16x16 prediction
448
// Stores the same vector 'out' to 16 consecutive rows starting at 'dst'
// (rows BPS apart), as two groups of eight stores. 'out' is expanded sixteen
// times, so callers should pass a variable rather than an expression with
// side effects.
449	#define STORE16x16(out, dst) do { \
450	ST_UB8(out, out, out, out, out, out, out, out, dst + 0 * BPS, BPS); \
451	ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS); \
452	} while (0)
453
454	static WEBP_INLINE void VerticalPred16x16(uint8_t* dst, const uint8_t* top) {
455	if (top != NULL) {
456	const v16u8 out = LD_UB(top);
457	STORE16x16(out, dst);
458	} else {
459	const v16u8 out = (v16u8)__msa_fill_b(`0x7f`);
460	STORE16x16(out, dst);
461	}
462	}
463
464	static WEBP_INLINE void HorizontalPred16x16(uint8_t* dst,
465	const uint8_t* left) {
466	if (left != NULL) {
467	int j;
468	for (j = `0`; j < `16`; j += `4`) {
469	const v16u8 L0 = (v16u8)__msa_fill_b(left[`0`]);
470	const v16u8 L1 = (v16u8)__msa_fill_b(left[`1`]);
471	const v16u8 L2 = (v16u8)__msa_fill_b(left[`2`]);
472	const v16u8 L3 = (v16u8)__msa_fill_b(left[`3`]);
473	ST_UB4(L0, L1, L2, L3, dst, BPS);
474	dst += `4` * BPS;
475	left += `4`;
476	}
477	} else {
478	const v16u8 out = (v16u8)__msa_fill_b(`0x81`);
479	STORE16x16(out, dst);
480	}
481	}
482
// 16x16 prediction where row y is (top row - left[-1]) + left[y], clipped to
// [0, 255]. Fallbacks when samples are missing:
//  - left only: HorizontalPred16x16
//  - top only:  VerticalPred16x16
//  - neither:   the block is filled with the byte 0x81
483	static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left,
484	const uint8_t* top) {
485	if (left != NULL) {
486	if (top != NULL) {
487	int j;
488	v8i16 d1, d2;
489	const v16i8 zero = { `0` };
// The top-left sample is read from left[-1].
490	const v8i16 TL = (v8i16)__msa_fill_h(left[-`1`]);
491	const v16u8 T = LD_UB(top);
// d1/d2: the two halves of the top row widened to halfwords, minus TL. They
// are shared by every row.
492	ILVRL_B2_SH(zero, T, d1, d2);
493	SUB2(d1, TL, d2, TL, d1, d2);
// Four rows per iteration.
494	for (j = `0`; j < `16`; j += `4`) {
495	v16i8 t0, t1, t2, t3;
496	v8i16 r0, r1, r2, r3, r4, r5, r6, r7;
497	const v8i16 L0 = (v8i16)__msa_fill_h(left[j + `0`]);
498	const v8i16 L1 = (v8i16)__msa_fill_h(left[j + `1`]);
499	const v8i16 L2 = (v8i16)__msa_fill_h(left[j + `2`]);
500	const v8i16 L3 = (v8i16)__msa_fill_h(left[j + `3`]);
501	ADD4(d1, L0, d1, L1, d1, L2, d1, L3, r0, r1, r2, r3);
502	ADD4(d2, L0, d2, L1, d2, L2, d2, L3, r4, r5, r6, r7);
503	CLIP_SH4_0_255(r0, r1, r2, r3);
504	CLIP_SH4_0_255(r4, r5, r6, r7);
// Pack each pair of half-rows back into one 16-byte row and store.
505	PCKEV_B4_SB(r4, r0, r5, r1, r6, r2, r7, r3, t0, t1, t2, t3);
506	ST_SB4(t0, t1, t2, t3, dst, BPS);
507	dst += `4` * BPS;
508	}
509	} else {
510	HorizontalPred16x16(dst, left);
511	}
512	} else {
513	if (top != NULL) {
514	VerticalPred16x16(dst, top);
515	} else {
// Note: 0x81 here, whereas VerticalPred16x16 would fill 0x7f for a NULL top.
516	const v16u8 out = (v16u8)__msa_fill_b(`0x81`);
517	STORE16x16(out, dst);
518	}
519	}
520	}
521
// DC 16x16 prediction: fills the block at 'dst' with one byte value.
//  - top and left: (sum of 16 top bytes + sum of 16 left bytes + 16) >> 5
//  - only one side: (2 * sum of that side's 16 bytes + 16) >> 5
//  - neither: 0x80
522	static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
523	const uint8_t* top) {
524	int DC;
525	v16u8 out;
526	if (top != NULL && left != NULL) {
// hadd_u_h adds adjacent byte pairs into halfwords; HADD_UH_U32 (per its
// name) then reduces the vector to a scalar sum.
527	const v16u8 rtop = LD_UB(top);
528	const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
529	const v16u8 rleft = LD_UB(left);
530	const v8u16 dcleft = __msa_hadd_u_h(rleft, rleft);
531	const v8u16 dctemp = dctop + dcleft;
532	DC = HADD_UH_U32(dctemp);
533	DC = (DC + `16`) >> `5`;
534	} else if (left != NULL) { // left but no top
535	const v16u8 rleft = LD_UB(left);
536	const v8u16 dcleft = __msa_hadd_u_h(rleft, rleft);
537	DC = HADD_UH_U32(dcleft);
// The single available side is counted twice so the same >> 5 applies.
538	DC = (DC + DC + `16`) >> `5`;
539	} else if (top != NULL) { // top but no left
540	const v16u8 rtop = LD_UB(top);
541	const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
542	DC = HADD_UH_U32(dctop);
543	DC = (DC + DC + `16`) >> `5`;
544	} else { // no top, no left, nothing.
545	DC = `0x80`;
546	}
547	out = (v16u8)__msa_fill_b(DC);
548	STORE16x16(out, dst);
549	}
550
551	static void Intra16Preds_MSA(uint8_t* dst,
552	const uint8_t* left, const uint8_t* top) {
553	DCMode16x16(I16DC16 + dst, left, top);
554	VerticalPred16x16(I16VE16 + dst, top);
555	HorizontalPred16x16(I16HE16 + dst, left);
556	TrueMotion16x16(I16TM16 + dst, left, top);
557	}
558
559	// Chroma 8x8 prediction
560
// Reduces the 16 bytes of vector 'in' to a single DC byte and replicates it.
// Successive horizontal adds (bytes -> halfwords -> words -> doublewords)
// leave two partial sums, which are added together; __msa_srari_d shifts the
// total right by 4 (per the MSA intrinsic this is a rounding shift). Byte 0 of
// the result is then splatted across the vector and the low 64 bits are
// written to 'out', i.e. eight copies of the DC byte.
// The temp0..temp6 temporaries live inside the do/while scope.
561	#define CALC_DC8(in, out) do { \
562	const v8u16 temp0 = __msa_hadd_u_h(in, in); \
563	const v4u32 temp1 = __msa_hadd_u_w(temp0, temp0); \
564	const v2i64 temp2 = (v2i64)__msa_hadd_u_d(temp1, temp1); \
565	const v2i64 temp3 = __msa_splati_d(temp2, 1); \
566	const v2i64 temp4 = temp3 + temp2; \
567	const v16i8 temp5 = (v16i8)__msa_srari_d(temp4, 4); \
568	const v2i64 temp6 = (v2i64)__msa_splati_b(temp5, 0); \
569	out = __msa_copy_s_d(temp6, 0); \
570	} while (0)
571
// Stores the same 64-bit value 'out' to eight consecutive rows starting at
// 'dst' (rows BPS apart), as two groups of four stores. 'out' is expanded
// eight times, so callers should pass a variable.
572	#define STORE8x8(out, dst) do { \
573	SD4(out, out, out, out, dst + 0 * BPS, BPS); \
574	SD4(out, out, out, out, dst + 4 * BPS, BPS); \
575	} while (0)
576
577	static WEBP_INLINE void VerticalPred8x8(uint8_t* dst, const uint8_t* top) {
578	if (top != NULL) {
579	const uint64_t out = LD(top);
580	STORE8x8(out, dst);
581	} else {
582	const uint64_t out = `0x7f7f7f7f7f7f7f7fULL`;
583	STORE8x8(out, dst);
584	}
585	}
586
587	static WEBP_INLINE void HorizontalPred8x8(uint8_t* dst, const uint8_t* left) {
588	if (left != NULL) {
589	int j;
590	for (j = `0`; j < `8`; j += `4`) {
591	const v16u8 L0 = (v16u8)__msa_fill_b(left[`0`]);
592	const v16u8 L1 = (v16u8)__msa_fill_b(left[`1`]);
593	const v16u8 L2 = (v16u8)__msa_fill_b(left[`2`]);
594	const v16u8 L3 = (v16u8)__msa_fill_b(left[`3`]);
595	const uint64_t out0 = __msa_copy_s_d((v2i64)L0, `0`);
596	const uint64_t out1 = __msa_copy_s_d((v2i64)L1, `0`);
597	const uint64_t out2 = __msa_copy_s_d((v2i64)L2, `0`);
598	const uint64_t out3 = __msa_copy_s_d((v2i64)L3, `0`);
599	SD4(out0, out1, out2, out3, dst, BPS);
600	dst += `4` * BPS;
601	left += `4`;
602	}
603	} else {
604	const uint64_t out = `0x8181818181818181ULL`;
605	STORE8x8(out, dst);
606	}
607	}
608
// 8x8 prediction where row y is (top row - left[-1]) + left[y], clipped to
// [0, 255]. Fallbacks when samples are missing:
//  - left only: HorizontalPred8x8
//  - top only:  VerticalPred8x8
//  - neither:   every byte of the block is 0x81
609	static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left,
610	const uint8_t* top) {
611	if (left != NULL) {
612	if (top != NULL) {
613	int j;
// The top-left sample is read from left[-1].
614	const v8i16 TL = (v8i16)__msa_fill_h(left[-`1`]);
// A full vector is loaded from 'top'; only its low 8 bytes are widened and
// used.
615	const v16u8 T1 = LD_UB(top);
616	const v16i8 zero = { `0` };
617	const v8i16 T = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);
618	const v8i16 d = T - TL;
// Four rows per iteration.
619	for (j = `0`; j < `8`; j += `4`) {
620	uint64_t out0, out1, out2, out3;
621	v16i8 t0, t1;
622	v8i16 r0 = (v8i16)__msa_fill_h(left[j + `0`]);
623	v8i16 r1 = (v8i16)__msa_fill_h(left[j + `1`]);
624	v8i16 r2 = (v8i16)__msa_fill_h(left[j + `2`]);
625	v8i16 r3 = (v8i16)__msa_fill_h(left[j + `3`]);
626	ADD4(d, r0, d, r1, d, r2, d, r3, r0, r1, r2, r3);
627	CLIP_SH4_0_255(r0, r1, r2, r3);
// Two 8-byte rows are packed per vector and stored as 64-bit halves.
628	PCKEV_B2_SB(r1, r0, r3, r2, t0, t1);
629	out0 = __msa_copy_s_d((v2i64)t0, `0`);
630	out1 = __msa_copy_s_d((v2i64)t0, `1`);
631	out2 = __msa_copy_s_d((v2i64)t1, `0`);
632	out3 = __msa_copy_s_d((v2i64)t1, `1`);
633	SD4(out0, out1, out2, out3, dst, BPS);
634	dst += `4` * BPS;
635	}
636	} else {
637	HorizontalPred8x8(dst, left);
638	}
639	} else {
640	if (top != NULL) {
641	VerticalPred8x8(dst, top);
642	} else {
// Note: 0x81 here, whereas VerticalPred8x8 would use 0x7f for a NULL top.
643	const uint64_t out = `0x8181818181818181ULL`;
644	STORE8x8(out, dst);
645	}
646	}
647	}
648
// DC 8x8 prediction: fills the block at 'dst' with one byte value computed by
// CALC_DC8 over a 16-byte vector made of 8 left bytes and 8 top bytes. When
// only one side is available its 8 bytes are used for both halves; with
// neither side the value is 0x80.
649	static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
650	const uint8_t* top) {
651	uint64_t out;
652	v16u8 src = { `0` };
653	if (top != NULL && left != NULL) {
654	const uint64_t left_m = LD(left);
655	const uint64_t top_m = LD(top);
656	INSERT_D2_UB(left_m, top_m, src);
657	CALC_DC8(src, out);
658	} else if (left != NULL) { // left but no top
659	const uint64_t left_m = LD(left);
// Duplicating the available side keeps the same 16-sample reduction.
660	INSERT_D2_UB(left_m, left_m, src);
661	CALC_DC8(src, out);
662	} else if (top != NULL) { // top but no left
663	const uint64_t top_m = LD(top);
664	INSERT_D2_UB(top_m, top_m, src);
665	CALC_DC8(src, out);
666	} else { // no top, no left, nothing.
667	src = (v16u8)__msa_fill_b(`0x80`);
668	out = __msa_copy_s_d((v2i64)src, `0`);
669	}
670	STORE8x8(out, dst);
671	}
672
673	static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left,
674	const uint8_t* top) {
675	// U block
676	DCMode8x8(C8DC8 + dst, left, top);
677	VerticalPred8x8(C8VE8 + dst, top);
678	HorizontalPred8x8(C8HE8 + dst, left);
679	TrueMotion8x8(C8TM8 + dst, left, top);
680	// V block
681	dst += `8`;
682	if (top != NULL) top += `8`;
683	if (left != NULL) left += `16`;
684	DCMode8x8(C8DC8 + dst, left, top);
685	VerticalPred8x8(C8VE8 + dst, top);
686	HorizontalPred8x8(C8HE8 + dst, left);
687	TrueMotion8x8(C8TM8 + dst, left, top);
688	}
689
690	//------------------------------------------------------------------------------
691	// Metric
692
// Squared-difference kernel that initialises its outputs.
// For the pair (in0, in1): interleave the bytes (ILVRL_B2_UB), subtract
// horizontally into halfword differences (HSUB_UB2_SH, per its name) and take
// the dot product of the differences with themselves into out0/out1. The
// pair (in2, in3) is handled the same way into out2/out3.
// The macro declares tmp0..tmp3 in its own scope, so callers must not pass
// arguments named tmp0..tmp3.
693	#define PACK_DOTP_UB4_SW(in0, in1, in2, in3, out0, out1, out2, out3) do { \
694	v16u8 tmp0, tmp1; \
695	v8i16 tmp2, tmp3; \
696	ILVRL_B2_UB(in0, in1, tmp0, tmp1); \
697	HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \
698	DOTP_SH2_SW(tmp2, tmp3, tmp2, tmp3, out0, out1); \
699	ILVRL_B2_UB(in2, in3, tmp0, tmp1); \
700	HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \
701	DOTP_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3); \
702	} while (0)
703
// Same sequence as PACK_DOTP_UB4_SW, but uses DPADD_SH2_SW so the squared
// differences are (per the macro name) accumulated into out0..out3 instead of
// overwriting them. The outputs must therefore already hold valid sums.
// The macro declares tmp0..tmp3 in its own scope, so callers must not pass
// arguments named tmp0..tmp3.
704	#define PACK_DPADD_UB4_SW(in0, in1, in2, in3, out0, out1, out2, out3) do { \
705	v16u8 tmp0, tmp1; \
706	v8i16 tmp2, tmp3; \
707	ILVRL_B2_UB(in0, in1, tmp0, tmp1); \
708	HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \
709	DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out0, out1); \
710	ILVRL_B2_UB(in2, in3, tmp0, tmp1); \
711	HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \
712	DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3); \
713	} while (0)
714
// Returns the sum of squared differences between 'a' and 'b' over 16 rows of
// 16 bytes each (rows BPS apart), processed as two batches of eight rows.
715	static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) {
716	uint32_t sum;
717	v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
718	v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
719	v4i32 out0, out1, out2, out3;
720
// Rows 0..7. The first kernel initialises out0..out3, the rest accumulate.
721	LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
722	LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
723	PACK_DOTP_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
724	PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
725	PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
726	PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
// Rows 8..15.
727	a += `8` * BPS;
728	b += `8` * BPS;
729	LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
730	LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
731	PACK_DPADD_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
732	PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
733	PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
734	PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
// Fold the four accumulators together, then reduce to a scalar.
735	out0 += out1;
736	out2 += out3;
737	out0 += out2;
738	sum = HADD_SW_S32(out0);
739	return sum;
740	}
741
// Returns the sum of squared differences between 'a' and 'b' over 8 rows of
// 16 bytes each (rows BPS apart).
742	static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) {
743	uint32_t sum;
744	v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
745	v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
746	v4i32 out0, out1, out2, out3;
747
748	LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
749	LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
// The first kernel initialises out0..out3, the rest accumulate into them.
750	PACK_DOTP_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
751	PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
752	PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
753	PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
// Fold the four accumulators together, then reduce to a scalar.
754	out0 += out1;
755	out2 += out3;
756	out0 += out2;
757	sum = HADD_SW_S32(out0);
758	return sum;
759	}
760
// Returns the sum of squared differences between two 8x8 blocks (rows BPS
// apart). Full 16-byte vectors are loaded for every row, but ILVR_B4_UB
// (presumably interleaving the low 8 bytes of two rows -- TODO confirm) packs
// row pairs so that only 8 bytes per row feed the squared-difference kernels.
761	static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) {
762	uint32_t sum;
763	v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
764	v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
765	v16u8 t0, t1, t2, t3;
766	v4i32 out0, out1, out2, out3;
767
768	LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
769	LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
// Rows 0..3: t0/t1 hold packed 'a' rows, t2/t3 the matching 'b' rows.
770	ILVR_B4_UB(src0, src1, src2, src3, ref0, ref1, ref2, ref3, t0, t1, t2, t3);
771	PACK_DOTP_UB4_SW(t0, t2, t1, t3, out0, out1, out2, out3);
// Rows 4..7, accumulated on top.
772	ILVR_B4_UB(src4, src5, src6, src7, ref4, ref5, ref6, ref7, t0, t1, t2, t3);
773	PACK_DPADD_UB4_SW(t0, t2, t1, t3, out0, out1, out2, out3);
774	out0 += out1;
775	out2 += out3;
776	out0 += out2;
777	sum = HADD_SW_S32(out0);
778	return sum;
779	}
780
// Returns the sum of squared differences between two 4x4 blocks (rows BPS
// apart). Each block is gathered into a single vector from four 32-bit loads.
781	static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) {
782	uint32_t sum = `0`;
783	uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
784	v16u8 src = { `0` }, ref = { `0` }, tmp0, tmp1;
785	v8i16 diff0, diff1;
786	v4i32 out0, out1;
787
788	LW4(a, BPS, src0, src1, src2, src3);
789	LW4(b, BPS, ref0, ref1, ref2, ref3);
790	INSERT_W4_UB(src0, src1, src2, src3, src);
791	INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
// Interleave, take differences, and square-accumulate them via a dot product
// of the differences with themselves.
792	ILVRL_B2_UB(src, ref, tmp0, tmp1);
793	HSUB_UB2_SH(tmp0, tmp1, diff0, diff1);
794	DOTP_SH2_SW(diff0, diff1, diff0, diff1, out0, out1);
795	out0 += out1;
796	sum = HADD_SW_S32(out0);
797	return sum;
798	}
799
800	//------------------------------------------------------------------------------
801	// Quantization
802
// Quantizes the 16 coefficients in 'in' using the tables in 'mtx'.
// Per coefficient: s = |in| + sharpen_, level = (s * iq_ + bias_) >> 17,
// capped at MAX_LEVEL, given the sign of 'in', and forced to zero unless
// s > zthresh_.
// Outputs:
//  - in[]  is overwritten with level * q_ (same order as the input)
//  - out[] receives the levels reordered by the zigzag0/zigzag1 shuffles
// Returns 1 if any stored level is non-zero, 0 otherwise.
803	static int QuantizeBlock_MSA(int16_t in[`16`], int16_t out[`16`],
804	const VP8Matrix* const mtx) {
805	int sum;
806	v8i16 in0, in1, sh0, sh1, out0, out1;
807	v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1;
808	v4i32 s0, s1, s2, s3, b0, b1, b2, b3, t0, t1, t2, t3;
809	const v8i16 zero = { `0` };
810	const v8i16 zigzag0 = { `0`, `1`, `4`, `8`, `5`, `2`, `3`, `6` };
811	const v8i16 zigzag1 = { `9`, `12`, `13`, `10`, `7`, `11`, `14`, `15` };
812	const v8i16 maxlevel = __msa_fill_h(MAX_LEVEL);
813
814	LD_SH2(&in[`0`], `8`, in0, in1);
815	LD_SH2(&mtx->sharpen_[`0`], `8`, sh0, sh1);
// |in| (add_a with zero), then interleave with sharpen_ and add each pair
// into 32-bit lanes: s = |in| + sharpen_.
816	tmp4 = __msa_add_a_h(in0, zero);
817	tmp5 = __msa_add_a_h(in1, zero);
818	ILVRL_H2_SH(sh0, tmp4, tmp0, tmp1);
819	ILVRL_H2_SH(sh1, tmp5, tmp2, tmp3);
820	HADD_SH4_SW(tmp0, tmp1, tmp2, tmp3, s0, s1, s2, s3);
// Lane masks marking the negative inputs; used to restore the sign below.
821	sign0 = (in0 < zero);
822	sign1 = (in1 < zero); // sign
// iq_ is widened to 32 bits by interleaving with zero.
823	LD_SH2(&mtx->iq_[`0`], `8`, tmp0, tmp1); // iq
824	ILVRL_H2_SW(zero, tmp0, t0, t1);
825	ILVRL_H2_SW(zero, tmp1, t2, t3);
826	LD_SW4(&mtx->bias_[`0`], `4`, b0, b1, b2, b3); // bias
827	MUL4(t0, s0, t1, s1, t2, s2, t3, s3, t0, t1, t2, t3);
828	ADD4(b0, t0, b1, t1, b2, t2, b3, t3, b0, b1, b2, b3);
829	SRAI_W4_SW(b0, b1, b2, b3, `17`);
830	PCKEV_H2_SH(b1, b0, b3, b2, tmp2, tmp3);
// Cap at MAX_LEVEL: take 'maxlevel' in the lanes where the level exceeds it.
831	tmp0 = (tmp2 > maxlevel);
832	tmp1 = (tmp3 > maxlevel);
833	tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)maxlevel, (v16u8)tmp0);
834	tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)maxlevel, (v16u8)tmp1);
// Negate and select the negated level where the input was negative.
835	SUB2(zero, tmp2, zero, tmp3, tmp0, tmp1);
836	tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)tmp0, (v16u8)sign0);
837	tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)tmp1, (v16u8)sign1);
// Keep a level only where s > zthresh_; other lanes stay zero.
838	LD_SW4(&mtx->zthresh_[`0`], `4`, t0, t1, t2, t3); // zthresh
839	t0 = (s0 > t0);
840	t1 = (s1 > t1);
841	t2 = (s2 > t2);
842	t3 = (s3 > t3);
843	PCKEV_H2_SH(t1, t0, t3, t2, tmp0, tmp1);
844	tmp4 = (v8i16)__msa_bmnz_v((v16u8)zero, (v16u8)tmp2, (v16u8)tmp0);
845	tmp5 = (v8i16)__msa_bmnz_v((v16u8)zero, (v16u8)tmp3, (v16u8)tmp1);
// Dequantized values (level * q_) go back to 'in'; shuffled levels to 'out'.
846	LD_SH2(&mtx->q_[`0`], `8`, tmp0, tmp1);
847	MUL2(tmp4, tmp0, tmp5, tmp1, in0, in1);
848	VSHF_H2_SH(tmp4, tmp5, tmp4, tmp5, zigzag0, zigzag1, out0, out1);
849	ST_SH2(in0, in1, &in[`0`], `8`);
850	ST_SH2(out0, out1, &out[`0`], `8`);
// Sum of absolute levels; positive exactly when some level is non-zero.
851	out0 = __msa_add_a_h(out0, out1);
852	sum = HADD_SH_S32(out0);
853	return (sum > `0`);
854	}
855
856	static int Quantize2Blocks_MSA(int16_t in[`32`], int16_t out[`32`],
857	const VP8Matrix* const mtx) {
858	int nz;
859	nz = VP8EncQuantizeBlock(in + `0` * `16`, out + `0` * `16`, mtx) << `0`;
860	nz \|= VP8EncQuantizeBlock(in + `1` * `16`, out + `1` * `16`, mtx) << `1`;
861	return nz;
862	}
863
864	//------------------------------------------------------------------------------
865	// Entry point
866
867	extern void VP8EncDspInitMSA(void);
868
869	WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) {
870	VP8ITransform = ITransform_MSA;
871	VP8FTransform = FTransform_MSA;
872	VP8FTransformWHT = FTransformWHT_MSA;
873
874	VP8TDisto4x4 = Disto4x4_MSA;
875	VP8TDisto16x16 = Disto16x16_MSA;
876	VP8CollectHistogram = CollectHistogram_MSA;
877
878	VP8EncPredLuma4 = Intra4Preds_MSA;
879	VP8EncPredLuma16 = Intra16Preds_MSA;
880	VP8EncPredChroma8 = IntraChromaPreds_MSA;
881
882	VP8SSE16x16 = SSE16x16_MSA;
883	VP8SSE16x8 = SSE16x8_MSA;
884	VP8SSE8x8 = SSE8x8_MSA;
885	VP8SSE4x4 = SSE4x4_MSA;
886
887	VP8EncQuantizeBlock = QuantizeBlock_MSA;
888	VP8EncQuantize2Blocks = Quantize2Blocks_MSA;
889	VP8EncQuantizeBlockWHT = QuantizeBlock_MSA;
890	}
891
892	#else // !WEBP_USE_MSA
893
894	WEBP_DSP_INIT_STUB(VP8EncDspInitMSA)
895
896	#endif // WEBP_USE_MSA
897

Browse the source code of Skia/third_party/externals/libwebp/src/dsp/enc_msa.c