dec_neon.c source code [Godot/thirdparty/libwebp/src/dsp/dec_neon.c]

1	// Copyright 2012 Google Inc. All Rights Reserved.
2	//
3	// Use of this source code is governed by a BSD-style license
4	// that can be found in the COPYING file in the root of the source
5	// tree. An additional intellectual property rights grant can be found
6	// in the file PATENTS. All contributing project authors may
7	// be found in the AUTHORS file in the root of the source tree.
8	// -----------------------------------------------------------------------------
9	//
10	// ARM NEON version of dsp functions and loop filtering.
11	//
12	// Authors: Somnath Banerjee (somnath@google.com)
13	// Johann Koenig (johannkoenig@google.com)
14
15	#include "src/dsp/dsp.h"
16
17	#if defined(WEBP_USE_NEON)
18
19	#include "src/dsp/neon.h"
20	#include "src/dec/vp8i_dec.h"
21
22	//------------------------------------------------------------------------------
23	// NxM Loading functions
24
25	#if !defined(WORK_AROUND_GCC)
26
27	// This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
28	// (register alloc, probably). The variants somewhat mitigate the problem, but
29	// not quite. HFilter16i() remains problematic.
30	static WEBP_INLINE uint8x8x4_t Load4x8_NEON(const uint8_t* const src,
31	int stride) {
32	const uint8x8_t zero = vdup_n_u8(`0`);
33	uint8x8x4_t out;
34	INIT_VECTOR4(out, zero, zero, zero, zero);
35	out = vld4_lane_u8(src + `0` * stride, out, `0`);
36	out = vld4_lane_u8(src + `1` * stride, out, `1`);
37	out = vld4_lane_u8(src + `2` * stride, out, `2`);
38	out = vld4_lane_u8(src + `3` * stride, out, `3`);
39	out = vld4_lane_u8(src + `4` * stride, out, `4`);
40	out = vld4_lane_u8(src + `5` * stride, out, `5`);
41	out = vld4_lane_u8(src + `6` * stride, out, `6`);
42	out = vld4_lane_u8(src + `7` * stride, out, `7`);
43	return out;
44	}
45
46	static WEBP_INLINE void Load4x16_NEON(const uint8_t* const src, int stride,
47	uint8x16_t* const p1,
48	uint8x16_t* const p0,
49	uint8x16_t* const q0,
50	uint8x16_t* const q1) {
51	// row0 = p1[0..7]\|p0[0..7]\|q0[0..7]\|q1[0..7]
52	// row8 = p1[8..15]\|p0[8..15]\|q0[8..15]\|q1[8..15]
53	const uint8x8x4_t row0 = Load4x8_NEON(src - `2` + `0` * stride, stride);
54	const uint8x8x4_t row8 = Load4x8_NEON(src - `2` + `8` * stride, stride);
55	*p1 = vcombine_u8(row0.val[`0`], row8.val[`0`]);
56	*p0 = vcombine_u8(row0.val[`1`], row8.val[`1`]);
57	*q0 = vcombine_u8(row0.val[`2`], row8.val[`2`]);
58	*q1 = vcombine_u8(row0.val[`3`], row8.val[`3`]);
59	}
60
61	#else // WORK_AROUND_GCC
62
63	#define LOADQ_LANE_32b(VALUE, LANE) do { \
64	(VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE)); \
65	src += stride; \
66	} while (0)
67
68	static WEBP_INLINE void Load4x16_NEON(const uint8_t* src, int stride,
69	uint8x16_t* const p1,
70	uint8x16_t* const p0,
71	uint8x16_t* const q0,
72	uint8x16_t* const q1) {
73	const uint32x4_t zero = vdupq_n_u32(`0`);
74	uint32x4x4_t in;
75	INIT_VECTOR4(in, zero, zero, zero, zero);
76	src -= `2`;
77	LOADQ_LANE_32b(in.val[`0`], `0`);
78	LOADQ_LANE_32b(in.val[`1`], `0`);
79	LOADQ_LANE_32b(in.val[`2`], `0`);
80	LOADQ_LANE_32b(in.val[`3`], `0`);
81	LOADQ_LANE_32b(in.val[`0`], `1`);
82	LOADQ_LANE_32b(in.val[`1`], `1`);
83	LOADQ_LANE_32b(in.val[`2`], `1`);
84	LOADQ_LANE_32b(in.val[`3`], `1`);
85	LOADQ_LANE_32b(in.val[`0`], `2`);
86	LOADQ_LANE_32b(in.val[`1`], `2`);
87	LOADQ_LANE_32b(in.val[`2`], `2`);
88	LOADQ_LANE_32b(in.val[`3`], `2`);
89	LOADQ_LANE_32b(in.val[`0`], `3`);
90	LOADQ_LANE_32b(in.val[`1`], `3`);
91	LOADQ_LANE_32b(in.val[`2`], `3`);
92	LOADQ_LANE_32b(in.val[`3`], `3`);
93	// Transpose four 4x4 parts:
94	{
95	const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[`0`]),
96	vreinterpretq_u8_u32(in.val[`1`]));
97	const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[`2`]),
98	vreinterpretq_u8_u32(in.val[`3`]));
99	const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[`0`]),
100	vreinterpretq_u16_u8(row23.val[`0`]));
101	const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[`1`]),
102	vreinterpretq_u16_u8(row23.val[`1`]));
103	*p1 = vreinterpretq_u8_u16(row02.val[`0`]);
104	*p0 = vreinterpretq_u8_u16(row13.val[`0`]);
105	*q0 = vreinterpretq_u8_u16(row02.val[`1`]);
106	*q1 = vreinterpretq_u8_u16(row13.val[`1`]);
107	}
108	}
109	#undef LOADQ_LANE_32b
110
111	#endif // !WORK_AROUND_GCC
112
113	static WEBP_INLINE void Load8x16_NEON(
114	const uint8_t* const src, int stride,
115	uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
116	uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
117	uint8x16_t* const q2, uint8x16_t* const q3) {
118	Load4x16_NEON(src - `2`, stride, p3, p2, p1, p0);
119	Load4x16_NEON(src + `2`, stride, q0, q1, q2, q3);
120	}
121
122	static WEBP_INLINE void Load16x4_NEON(const uint8_t* const src, int stride,
123	uint8x16_t* const p1,
124	uint8x16_t* const p0,
125	uint8x16_t* const q0,
126	uint8x16_t* const q1) {
127	p1 = vld1q_u8(src - `2` stride);
128	p0 = vld1q_u8(src - `1` stride);
129	q0 = vld1q_u8(src + `0` stride);
130	q1 = vld1q_u8(src + `1` stride);
131	}
132
133	static WEBP_INLINE void Load16x8_NEON(
134	const uint8_t* const src, int stride,
135	uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
136	uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
137	uint8x16_t* const q2, uint8x16_t* const q3) {
138	Load16x4_NEON(src - `2` * stride, stride, p3, p2, p1, p0);
139	Load16x4_NEON(src + `2` * stride, stride, q0, q1, q2, q3);
140	}
141
142	static WEBP_INLINE void Load8x8x2_NEON(
143	const uint8_t* const u, const uint8_t* const v, int stride,
144	uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
145	uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
146	uint8x16_t* const q2, uint8x16_t* const q3) {
147	// We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
148	// and the v-samples on the higher half.
149	p3 = vcombine_u8(vld1_u8(u - `4` stride), vld1_u8(v - `4` * stride));
150	p2 = vcombine_u8(vld1_u8(u - `3` stride), vld1_u8(v - `3` * stride));
151	p1 = vcombine_u8(vld1_u8(u - `2` stride), vld1_u8(v - `2` * stride));
152	p0 = vcombine_u8(vld1_u8(u - `1` stride), vld1_u8(v - `1` * stride));
153	q0 = vcombine_u8(vld1_u8(u + `0` stride), vld1_u8(v + `0` * stride));
154	q1 = vcombine_u8(vld1_u8(u + `1` stride), vld1_u8(v + `1` * stride));
155	q2 = vcombine_u8(vld1_u8(u + `2` stride), vld1_u8(v + `2` * stride));
156	q3 = vcombine_u8(vld1_u8(u + `3` stride), vld1_u8(v + `3` * stride));
157	}
158
159	#if !defined(WORK_AROUND_GCC)
160
161	#define LOAD_UV_8(ROW) \
162	vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))
163
164	static WEBP_INLINE void Load8x8x2T_NEON(
165	const uint8_t* const u, const uint8_t* const v, int stride,
166	uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
167	uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
168	uint8x16_t* const q2, uint8x16_t* const q3) {
169	// We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
170	// and the v-samples on the higher half.
171	const uint8x16_t row0 = LOAD_UV_8(`0`);
172	const uint8x16_t row1 = LOAD_UV_8(`1`);
173	const uint8x16_t row2 = LOAD_UV_8(`2`);
174	const uint8x16_t row3 = LOAD_UV_8(`3`);
175	const uint8x16_t row4 = LOAD_UV_8(`4`);
176	const uint8x16_t row5 = LOAD_UV_8(`5`);
177	const uint8x16_t row6 = LOAD_UV_8(`6`);
178	const uint8x16_t row7 = LOAD_UV_8(`7`);
179	// Perform two side-by-side 8x8 transposes
180	// u00 u01 u02 u03 u04 u05 u06 u07 \| v00 v01 v02 v03 v04 v05 v06 v07
181	// u10 u11 u12 u13 u14 u15 u16 u17 \| v10 v11 v12 ...
182	// u20 u21 u22 u23 u24 u25 u26 u27 \| v20 v21 ...
183	// u30 u31 u32 u33 u34 u35 u36 u37 \| ...
184	// u40 u41 u42 u43 u44 u45 u46 u47 \| ...
185	// u50 u51 u52 u53 u54 u55 u56 u57 \| ...
186	// u60 u61 u62 u63 u64 u65 u66 u67 \| v60 ...
187	// u70 u71 u72 u73 u74 u75 u76 u77 \| v70 v71 v72 ...
188	const uint8x16x2_t row01 = vtrnq_u8(row0, row1); // u00 u10 u02 u12 ...
189	// u01 u11 u03 u13 ...
190	const uint8x16x2_t row23 = vtrnq_u8(row2, row3); // u20 u30 u22 u32 ...
191	// u21 u31 u23 u33 ...
192	const uint8x16x2_t row45 = vtrnq_u8(row4, row5); // ...
193	const uint8x16x2_t row67 = vtrnq_u8(row6, row7); // ...
194	const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[`0`]),
195	vreinterpretq_u16_u8(row23.val[`0`]));
196	const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[`1`]),
197	vreinterpretq_u16_u8(row23.val[`1`]));
198	const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[`0`]),
199	vreinterpretq_u16_u8(row67.val[`0`]));
200	const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[`1`]),
201	vreinterpretq_u16_u8(row67.val[`1`]));
202	const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[`0`]),
203	vreinterpretq_u32_u16(row46.val[`0`]));
204	const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[`1`]),
205	vreinterpretq_u32_u16(row46.val[`1`]));
206	const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[`0`]),
207	vreinterpretq_u32_u16(row57.val[`0`]));
208	const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[`1`]),
209	vreinterpretq_u32_u16(row57.val[`1`]));
210	*p3 = vreinterpretq_u8_u32(row04.val[`0`]);
211	*p2 = vreinterpretq_u8_u32(row15.val[`0`]);
212	*p1 = vreinterpretq_u8_u32(row26.val[`0`]);
213	*p0 = vreinterpretq_u8_u32(row37.val[`0`]);
214	*q0 = vreinterpretq_u8_u32(row04.val[`1`]);
215	*q1 = vreinterpretq_u8_u32(row15.val[`1`]);
216	*q2 = vreinterpretq_u8_u32(row26.val[`1`]);
217	*q3 = vreinterpretq_u8_u32(row37.val[`1`]);
218	}
219	#undef LOAD_UV_8
220
221	#endif // !WORK_AROUND_GCC
222
223	static WEBP_INLINE void Store2x8_NEON(const uint8x8x2_t v,
224	uint8_t* const dst, int stride) {
225	vst2_lane_u8(dst + `0` * stride, v, `0`);
226	vst2_lane_u8(dst + `1` * stride, v, `1`);
227	vst2_lane_u8(dst + `2` * stride, v, `2`);
228	vst2_lane_u8(dst + `3` * stride, v, `3`);
229	vst2_lane_u8(dst + `4` * stride, v, `4`);
230	vst2_lane_u8(dst + `5` * stride, v, `5`);
231	vst2_lane_u8(dst + `6` * stride, v, `6`);
232	vst2_lane_u8(dst + `7` * stride, v, `7`);
233	}
234
235	static WEBP_INLINE void Store2x16_NEON(const uint8x16_t p0, const uint8x16_t q0,
236	uint8_t* const dst, int stride) {
237	uint8x8x2_t lo, hi;
238	lo.val[`0`] = vget_low_u8(p0);
239	lo.val[`1`] = vget_low_u8(q0);
240	hi.val[`0`] = vget_high_u8(p0);
241	hi.val[`1`] = vget_high_u8(q0);
242	Store2x8_NEON(lo, dst - `1` + `0` * stride, stride);
243	Store2x8_NEON(hi, dst - `1` + `8` * stride, stride);
244	}
245
246	#if !defined(WORK_AROUND_GCC)
247	static WEBP_INLINE void Store4x8_NEON(const uint8x8x4_t v,
248	uint8_t* const dst, int stride) {
249	vst4_lane_u8(dst + `0` * stride, v, `0`);
250	vst4_lane_u8(dst + `1` * stride, v, `1`);
251	vst4_lane_u8(dst + `2` * stride, v, `2`);
252	vst4_lane_u8(dst + `3` * stride, v, `3`);
253	vst4_lane_u8(dst + `4` * stride, v, `4`);
254	vst4_lane_u8(dst + `5` * stride, v, `5`);
255	vst4_lane_u8(dst + `6` * stride, v, `6`);
256	vst4_lane_u8(dst + `7` * stride, v, `7`);
257	}
258
259	static WEBP_INLINE void Store4x16_NEON(const uint8x16_t p1, const uint8x16_t p0,
260	const uint8x16_t q0, const uint8x16_t q1,
261	uint8_t* const dst, int stride) {
262	uint8x8x4_t lo, hi;
263	INIT_VECTOR4(lo,
264	vget_low_u8(p1), vget_low_u8(p0),
265	vget_low_u8(q0), vget_low_u8(q1));
266	INIT_VECTOR4(hi,
267	vget_high_u8(p1), vget_high_u8(p0),
268	vget_high_u8(q0), vget_high_u8(q1));
269	Store4x8_NEON(lo, dst - `2` + `0` * stride, stride);
270	Store4x8_NEON(hi, dst - `2` + `8` * stride, stride);
271	}
272	#endif // !WORK_AROUND_GCC
273
274	static WEBP_INLINE void Store16x2_NEON(const uint8x16_t p0, const uint8x16_t q0,
275	uint8_t* const dst, int stride) {
276	vst1q_u8(dst - stride, p0);
277	vst1q_u8(dst, q0);
278	}
279
280	static WEBP_INLINE void Store16x4_NEON(const uint8x16_t p1, const uint8x16_t p0,
281	const uint8x16_t q0, const uint8x16_t q1,
282	uint8_t* const dst, int stride) {
283	Store16x2_NEON(p1, p0, dst - stride, stride);
284	Store16x2_NEON(q0, q1, dst + stride, stride);
285	}
286
287	static WEBP_INLINE void Store8x2x2_NEON(const uint8x16_t p0,
288	const uint8x16_t q0,
289	uint8_t* const u, uint8_t* const v,
290	int stride) {
291	// p0 and q0 contain the u+v samples packed in low/high halves.
292	vst1_u8(u - stride, vget_low_u8(p0));
293	vst1_u8(u, vget_low_u8(q0));
294	vst1_u8(v - stride, vget_high_u8(p0));
295	vst1_u8(v, vget_high_u8(q0));
296	}
297
298	static WEBP_INLINE void Store8x4x2_NEON(const uint8x16_t p1,
299	const uint8x16_t p0,
300	const uint8x16_t q0,
301	const uint8x16_t q1,
302	uint8_t* const u, uint8_t* const v,
303	int stride) {
304	// The p1...q1 registers contain the u+v samples packed in low/high halves.
305	Store8x2x2_NEON(p1, p0, u - stride, v - stride, stride);
306	Store8x2x2_NEON(q0, q1, u + stride, v + stride, stride);
307	}
308
309	#if !defined(WORK_AROUND_GCC)
310
311	#define STORE6_LANE(DST, VAL0, VAL1, LANE) do { \
312	vst3_lane_u8((DST) - 3, (VAL0), (LANE)); \
313	vst3_lane_u8((DST) + 0, (VAL1), (LANE)); \
314	(DST) += stride; \
315	} while (0)
316
317	static WEBP_INLINE void Store6x8x2_NEON(
318	const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
319	const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
320	uint8_t* u, uint8_t* v, int stride) {
321	uint8x8x3_t u0, u1, v0, v1;
322	INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
323	INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
324	INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));
325	INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));
326	STORE6_LANE(u, u0, u1, `0`);
327	STORE6_LANE(u, u0, u1, `1`);
328	STORE6_LANE(u, u0, u1, `2`);
329	STORE6_LANE(u, u0, u1, `3`);
330	STORE6_LANE(u, u0, u1, `4`);
331	STORE6_LANE(u, u0, u1, `5`);
332	STORE6_LANE(u, u0, u1, `6`);
333	STORE6_LANE(u, u0, u1, `7`);
334	STORE6_LANE(v, v0, v1, `0`);
335	STORE6_LANE(v, v0, v1, `1`);
336	STORE6_LANE(v, v0, v1, `2`);
337	STORE6_LANE(v, v0, v1, `3`);
338	STORE6_LANE(v, v0, v1, `4`);
339	STORE6_LANE(v, v0, v1, `5`);
340	STORE6_LANE(v, v0, v1, `6`);
341	STORE6_LANE(v, v0, v1, `7`);
342	}
343	#undef STORE6_LANE
344
345	static WEBP_INLINE void Store4x8x2_NEON(const uint8x16_t p1,
346	const uint8x16_t p0,
347	const uint8x16_t q0,
348	const uint8x16_t q1,
349	uint8_t* const u, uint8_t* const v,
350	int stride) {
351	uint8x8x4_t u0, v0;
352	INIT_VECTOR4(u0,
353	vget_low_u8(p1), vget_low_u8(p0),
354	vget_low_u8(q0), vget_low_u8(q1));
355	INIT_VECTOR4(v0,
356	vget_high_u8(p1), vget_high_u8(p0),
357	vget_high_u8(q0), vget_high_u8(q1));
358	vst4_lane_u8(u - `2` + `0` * stride, u0, `0`);
359	vst4_lane_u8(u - `2` + `1` * stride, u0, `1`);
360	vst4_lane_u8(u - `2` + `2` * stride, u0, `2`);
361	vst4_lane_u8(u - `2` + `3` * stride, u0, `3`);
362	vst4_lane_u8(u - `2` + `4` * stride, u0, `4`);
363	vst4_lane_u8(u - `2` + `5` * stride, u0, `5`);
364	vst4_lane_u8(u - `2` + `6` * stride, u0, `6`);
365	vst4_lane_u8(u - `2` + `7` * stride, u0, `7`);
366	vst4_lane_u8(v - `2` + `0` * stride, v0, `0`);
367	vst4_lane_u8(v - `2` + `1` * stride, v0, `1`);
368	vst4_lane_u8(v - `2` + `2` * stride, v0, `2`);
369	vst4_lane_u8(v - `2` + `3` * stride, v0, `3`);
370	vst4_lane_u8(v - `2` + `4` * stride, v0, `4`);
371	vst4_lane_u8(v - `2` + `5` * stride, v0, `5`);
372	vst4_lane_u8(v - `2` + `6` * stride, v0, `6`);
373	vst4_lane_u8(v - `2` + `7` * stride, v0, `7`);
374	}
375
376	#endif // !WORK_AROUND_GCC
377
378	// Zero extend 'v' to an int16x8_t.
379	static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint8x8_t v) {
380	return vreinterpretq_s16_u16(vmovl_u8(v));
381	}
382
383	// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
384	// to the corresponding rows of 'dst'.
385	static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
386	const int16x8_t dst01,
387	const int16x8_t dst23) {
388	// Unsigned saturate to 8b.
389	const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
390	const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
391
392	// Store the results.
393	vst1_lane_u32((uint32_t)(dst + `0` BPS), vreinterpret_u32_u8(dst01_u8), `0`);
394	vst1_lane_u32((uint32_t)(dst + `1` BPS), vreinterpret_u32_u8(dst01_u8), `1`);
395	vst1_lane_u32((uint32_t)(dst + `2` BPS), vreinterpret_u32_u8(dst23_u8), `0`);
396	vst1_lane_u32((uint32_t)(dst + `3` BPS), vreinterpret_u32_u8(dst23_u8), `1`);
397	}
398
399	static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
400	const int16x8_t row23,
401	uint8_t* const dst) {
402	uint32x2_t dst01 = vdup_n_u32(`0`);
403	uint32x2_t dst23 = vdup_n_u32(`0`);
404
405	// Load the source pixels.
406	dst01 = vld1_lane_u32((uint32_t)(dst + `0` BPS), dst01, `0`);
407	dst23 = vld1_lane_u32((uint32_t)(dst + `2` BPS), dst23, `0`);
408	dst01 = vld1_lane_u32((uint32_t)(dst + `1` BPS), dst01, `1`);
409	dst23 = vld1_lane_u32((uint32_t)(dst + `3` BPS), dst23, `1`);
410
411	{
412	// Convert to 16b.
413	const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst01));
414	const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst23));
415
416	// Descale with rounding.
417	const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, `3`);
418	const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, `3`);
419	// Add the inverse transform.
420	SaturateAndStore4x4_NEON(dst, out01, out23);
421	}
422	}
423
424	//-----------------------------------------------------------------------------
425	// Simple In-loop filtering (Paragraph 15.2)
426
427	static uint8x16_t NeedsFilter_NEON(const uint8x16_t p1, const uint8x16_t p0,
428	const uint8x16_t q0, const uint8x16_t q1,
429	int thresh) {
430	const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);
431	const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0); // abs(p0-q0)
432	const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1); // abs(p1-q1)
433	const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0); // 2 abs(p0-q0)*
434	const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, `1`); // abs(p1-q1) / 2
435	const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2);
436	const uint8x16_t mask = vcgeq_u8(thresh_v, sum);
437	return mask;
438	}
439
440	static int8x16_t FlipSign_NEON(const uint8x16_t v) {
441	const uint8x16_t sign_bit = vdupq_n_u8(`0x80`);
442	return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
443	}
444
445	static uint8x16_t FlipSignBack_NEON(const int8x16_t v) {
446	const int8x16_t sign_bit = vdupq_n_s8(`0x80`);
447	return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
448	}
449
450	static int8x16_t GetBaseDelta_NEON(const int8x16_t p1, const int8x16_t p0,
451	const int8x16_t q0, const int8x16_t q1) {
452	const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0)
453	const int8x16_t p1_q1 = vqsubq_s8(p1, q1); // (p1-q1)
454	const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0); // (p1-q1) + 1 (q0 - p0)*
455	const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // (p1-q1) + 2 (q0 - p0)*
456	const int8x16_t s3 = vqaddq_s8(q0_p0, s2); // (p1-q1) + 3 (q0 - p0)*
457	return s3;
458	}
459
460	static int8x16_t GetBaseDelta0_NEON(const int8x16_t p0, const int8x16_t q0) {
461	const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0)
462	const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0); // 2 (q0 - p0)*
463	const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // 3 (q0 - p0)*
464	return s2;
465	}
466
467	//------------------------------------------------------------------------------
468
469	static void ApplyFilter2NoFlip_NEON(const int8x16_t p0s, const int8x16_t q0s,
470	const int8x16_t delta,
471	int8x16_t* const op0,
472	int8x16_t* const oq0) {
473	const int8x16_t kCst3 = vdupq_n_s8(`0x03`);
474	const int8x16_t kCst4 = vdupq_n_s8(`0x04`);
475	const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
476	const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
477	const int8x16_t delta3 = vshrq_n_s8(delta_p3, `3`);
478	const int8x16_t delta4 = vshrq_n_s8(delta_p4, `3`);
479	*op0 = vqaddq_s8(p0s, delta3);
480	*oq0 = vqsubq_s8(q0s, delta4);
481	}
482
483	#if defined(WEBP_USE_INTRINSICS)
484
485	static void ApplyFilter2_NEON(const int8x16_t p0s, const int8x16_t q0s,
486	const int8x16_t delta,
487	uint8x16_t* const op0, uint8x16_t* const oq0) {
488	const int8x16_t kCst3 = vdupq_n_s8(`0x03`);
489	const int8x16_t kCst4 = vdupq_n_s8(`0x04`);
490	const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
491	const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
492	const int8x16_t delta3 = vshrq_n_s8(delta_p3, `3`);
493	const int8x16_t delta4 = vshrq_n_s8(delta_p4, `3`);
494	const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
495	const int8x16_t sq0 = vqsubq_s8(q0s, delta4);
496	*op0 = FlipSignBack_NEON(sp0);
497	*oq0 = FlipSignBack_NEON(sq0);
498	}
499
500	static void DoFilter2_NEON(const uint8x16_t p1, const uint8x16_t p0,
501	const uint8x16_t q0, const uint8x16_t q1,
502	const uint8x16_t mask,
503	uint8x16_t* const op0, uint8x16_t* const oq0) {
504	const int8x16_t p1s = FlipSign_NEON(p1);
505	const int8x16_t p0s = FlipSign_NEON(p0);
506	const int8x16_t q0s = FlipSign_NEON(q0);
507	const int8x16_t q1s = FlipSign_NEON(q1);
508	const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
509	const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
510	ApplyFilter2_NEON(p0s, q0s, delta1, op0, oq0);
511	}
512
513	static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
514	uint8x16_t p1, p0, q0, q1, op0, oq0;
515	Load16x4_NEON(p, stride, &p1, &p0, &q0, &q1);
516	{
517	const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
518	DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
519	}
520	Store16x2_NEON(op0, oq0, p, stride);
521	}
522
523	static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
524	uint8x16_t p1, p0, q0, q1, oq0, op0;
525	Load4x16_NEON(p, stride, &p1, &p0, &q0, &q1);
526	{
527	const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
528	DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
529	}
530	Store2x16_NEON(op0, oq0, p, stride);
531	}
532
533	#else
534
// Load/Store vertical edge

// Emits eight vld4.8 single-lane loads filling lanes 0..7 of the D registers
// c1..c4. Even lanes are read through base b1, odd lanes through base b2;
// each load post-increments its base register by 'stride'.
#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \
  "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
  "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
  "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
  "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"

// Emits eight vst2.8 single-lane stores writing lanes 0..7 of the D registers
// c1 and c2 as byte pairs through 'p', post-incrementing 'p' by 'stride'
// after every store.
#define STORE8x2(c1, c2, p, stride) \
  "vst2.8 {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n" \
  "vst2.8 {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n" \
  "vst2.8 {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n" \
  "vst2.8 {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n" \
  "vst2.8 {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n" \
  "vst2.8 {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n" \
  "vst2.8 {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n" \
  "vst2.8 {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"

// Q registers named in the clobber lists of the asm statements below
// (q4-q7 are not listed).
#define QRegs "q0", "q1", "q2", "q3", \
  "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

// XORs registers 'a' and 'b' in place with 's' (the callers pass a register
// holding 0x80 in every byte).
#define FLIP_SIGN_BIT2(a, b, s) \
  "veor " #a "," #a "," #s " \n" \
  "veor " #b "," #b "," #s " \n" \

// Same as FLIP_SIGN_BIT2 for four registers.
#define FLIP_SIGN_BIT4(a, b, c, d, s) \
  FLIP_SIGN_BIT2(a, b, s) \
  FLIP_SIGN_BIT2(c, d, s) \

// Sets 'mask' to all ones in the lanes where
// 2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= thresh. Overwrites q14 and q15.
#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask) \
  "vabd.u8 q15," #p0 "," #q0 " \n" /* abs(p0 - q0) */ \
  "vabd.u8 q14," #p1 "," #q1 " \n" /* abs(p1 - q1) */ \
  "vqadd.u8 q15, q15, q15 \n" /* abs(p0 - q0) * 2 */ \
  "vshr.u8 q14, q14, #1 \n" /* abs(p1 - q1) / 2 */ \
  "vqadd.u8 q15, q15, q14 \n" /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
  "vdup.8 q14, " #thresh " \n" \
  "vcge.u8 " #mask ", q14, q15 \n" /* mask = (thresh >= sum) */

// Leaves (p1 - q1) + 3 * (q0 - p0), computed with saturating signed byte
// arithmetic, in register 'o'. Overwrites q15.
#define GET_BASE_DELTA(p1, p0, q0, q1, o) \
  "vqsub.s8 q15," #q0 "," #p0 " \n" /* (q0 - p0) */ \
  "vqsub.s8 " #o "," #p1 "," #q1 " \n" /* (p1 - q1) */ \
  "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 1 * (q0 - p0) */ \
  "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 2 * (q0 - p0) */ \
  "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 3 * (q0 - p0) */

// Updates p0 += (fl + 3) >> 3 and q0 -= (fl + 4) >> 3 in place on signed
// bytes with saturation. Overwrites q15.
#define DO_SIMPLE_FILTER(p0, q0, fl) \
  "vmov.i8 q15, #0x03 \n" \
  "vqadd.s8 q15, q15, " #fl " \n" /* filter1 = filter + 3 */ \
  "vshr.s8 q15, q15, #3 \n" /* filter1 >> 3 */ \
  "vqadd.s8 " #p0 "," #p0 ", q15 \n" /* p0 += filter1 */ \
  \
  "vmov.i8 q15, #0x04 \n" \
  "vqadd.s8 q15, q15, " #fl " \n" /* filter2 = filter + 4 */ \
  "vshr.s8 q15, q15, #3 \n" /* filter2 >> 3 */ \
  "vqsub.s8 " #q0 "," #q0 ", q15 \n" /* q0 -= filter2 */

// Applies filter on 2 pixels (p0 and q0)
// Uses q9 (mask, then masked delta), q10 (sign bit) and q11 (delta) as
// scratch, in addition to q14/q15 used by the helper macros. On exit p0 and
// q0 hold the filtered unsigned values; p1 and q1 are left sign-flipped.
#define DO_FILTER2(p1, p0, q0, q1, thresh) \
  NEEDS_FILTER(p1, p0, q0, q1, thresh, q9) /* filter mask in q9 */ \
  "vmov.i8 q10, #0x80 \n" /* sign bit */ \
  FLIP_SIGN_BIT4(p1, p0, q0, q1, q10) /* convert to signed value */ \
  GET_BASE_DELTA(p1, p0, q0, q1, q11) /* get filter level */ \
  "vand q9, q9, q11 \n" /* apply filter mask */ \
  DO_SIMPLE_FILTER(p0, q0, q9) /* apply filter */ \
  FLIP_SIGN_BIT2(p0, q0, q10)
603
// Inline-assembly version of the simple filter across the horizontal edge
// just above row 'p', 16 pixels wide. Loads the two rows above 'p' and the
// two rows from 'p' down, runs DO_FILTER2 on them, then stores the filtered
// p0 / q0 rows back to (p - stride) and p.
static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
  __asm__ volatile (
    "sub %[p], %[p], %[stride], lsl #1 \n"  // p -= 2 * stride

    "vld1.u8 {q1}, [%[p]], %[stride] \n"  // p1
    "vld1.u8 {q2}, [%[p]], %[stride] \n"  // p0
    "vld1.u8 {q3}, [%[p]], %[stride] \n"  // q0
    "vld1.u8 {q12}, [%[p]] \n"  // q1

    DO_FILTER2(q1, q2, q3, q12, %[thresh])

    // The three post-incremented loads left p at the q1 row.
    "sub %[p], %[p], %[stride], lsl #1 \n"  // p -= 2 * stride

    "vst1.u8 {q2}, [%[p]], %[stride] \n"  // store op0
    "vst1.u8 {q3}, [%[p]] \n"  // store oq0
    : [p] "+r"(p)
    : [stride] "r"(stride), [thresh] "r"(thresh)
    : "memory", QRegs
  );
}
624
// Inline-assembly version of the simple filter across the vertical edge just
// left of column 'p', 16 rows tall. Gathers the byte columns p[-2..1] with
// two LOAD8x4 sequences (8 rows each), rearranges them into full Q registers,
// runs DO_FILTER2, and scatters the two filtered columns back starting at
// p - 1. Uses r4, r5 and r6 as scratch (listed as clobbers).
static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
  __asm__ volatile (
    "sub r4, %[p], #2 \n"  // base1 = p - 2
    "lsl r6, %[stride], #1 \n"  // r6 = 2 * stride
    "add r5, r4, %[stride] \n"  // base2 = base1 + stride

    // Even rows are read through r4 and odd rows through r5, each advancing
    // by two rows (r6) per load.
    LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
    LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
    "vswp d3, d24 \n"  // p1:q1 p0:q3
    "vswp d5, d26 \n"  // q0:q2 q1:q4
    "vswp q2, q12 \n"  // p1:q1 p0:q2 q0:q3 q1:q4

    DO_FILTER2(q1, q2, q12, q13, %[thresh])

    "sub %[p], %[p], #1 \n"  // p - 1

    // Pair up the halves so that d4/d5 hold rows 0..7 and d24/d25 rows 8..15
    // of (op0, oq0).
    "vswp d5, d24 \n"
    STORE8x2(d4, d5, [%[p]], %[stride])
    STORE8x2(d24, d25, [%[p]], %[stride])

    : [p] "+r"(p)
    : [stride] "r"(stride), [thresh] "r"(thresh)
    : "memory", "r4", "r5", "r6", QRegs
  );
}
650
651	#undef LOAD8x4
652	#undef STORE8x2
653
654	#endif // WEBP_USE_INTRINSICS
655
// Runs SimpleVFilter16_NEON() on the three edges located 4, 8 and 12 rows
// below 'p', from top to bottom.
static void SimpleVFilter16i_NEON(uint8_t* p, int stride, int thresh) {
  int edge;
  for (edge = 1; edge <= 3; ++edge) {
    SimpleVFilter16_NEON(p + 4 * edge * stride, stride, thresh);
  }
}
663
// Runs SimpleHFilter16_NEON() on the three edges located 4, 8 and 12 bytes
// to the right of 'p', from left to right.
static void SimpleHFilter16i_NEON(uint8_t* p, int stride, int thresh) {
  int edge;
  for (edge = 1; edge <= 3; ++edge) {
    SimpleHFilter16_NEON(p + 4 * edge, stride, thresh);
  }
}
671
672	//------------------------------------------------------------------------------
673	// Complex In-loop filtering (Paragraph 15.3)
674
675	static uint8x16_t NeedsHev_NEON(const uint8x16_t p1, const uint8x16_t p0,
676	const uint8x16_t q0, const uint8x16_t q1,
677	int hev_thresh) {
678	const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);
679	const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0)
680	const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0)
681	const uint8x16_t a_max = vmaxq_u8(a_p1_p0, a_q1_q0);
682	const uint8x16_t mask = vcgtq_u8(a_max, hev_thresh_v);
683	return mask;
684	}
685
686	static uint8x16_t NeedsFilter2_NEON(const uint8x16_t p3, const uint8x16_t p2,
687	const uint8x16_t p1, const uint8x16_t p0,
688	const uint8x16_t q0, const uint8x16_t q1,
689	const uint8x16_t q2, const uint8x16_t q3,
690	int ithresh, int thresh) {
691	const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);
692	const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2); // abs(p3 - p2)
693	const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1); // abs(p2 - p1)
694	const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0)
695	const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2); // abs(q3 - q2)
696	const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1); // abs(q2 - q1)
697	const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0)
698	const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1);
699	const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2);
700	const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0);
701	const uint8x16_t max12 = vmaxq_u8(max1, max2);
702	const uint8x16_t max123 = vmaxq_u8(max12, max3);
703	const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);
704	const uint8x16_t mask1 = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
705	const uint8x16_t mask = vandq_u8(mask1, mask2);
706	return mask;
707	}
708
709	// 4-points filter
710
711	static void ApplyFilter4_NEON(
712	const int8x16_t p1, const int8x16_t p0,
713	const int8x16_t q0, const int8x16_t q1,
714	const int8x16_t delta0,
715	uint8x16_t* const op1, uint8x16_t* const op0,
716	uint8x16_t* const oq0, uint8x16_t* const oq1) {
717	const int8x16_t kCst3 = vdupq_n_s8(`0x03`);
718	const int8x16_t kCst4 = vdupq_n_s8(`0x04`);
719	const int8x16_t delta1 = vqaddq_s8(delta0, kCst4);
720	const int8x16_t delta2 = vqaddq_s8(delta0, kCst3);
721	const int8x16_t a1 = vshrq_n_s8(delta1, `3`);
722	const int8x16_t a2 = vshrq_n_s8(delta2, `3`);
723	const int8x16_t a3 = vrshrq_n_s8(a1, `1`); // a3 = (a1 + 1) >> 1
724	op0 = FlipSignBack_NEON(vqaddq_s8(p0, a2)); // clip(p0 + a2)*
725	oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1)); // clip(q0 - a1)*
726	op1 = FlipSignBack_NEON(vqaddq_s8(p1, a3)); // clip(p1 + a3)*
727	oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a3)); // clip(q1 - a3)*
728	}
729
730	static void DoFilter4_NEON(
731	const uint8x16_t p1, const uint8x16_t p0,
732	const uint8x16_t q0, const uint8x16_t q1,
733	const uint8x16_t mask, const uint8x16_t hev_mask,
734	uint8x16_t* const op1, uint8x16_t* const op0,
735	uint8x16_t* const oq0, uint8x16_t* const oq1) {
736	// This is a fused version of DoFilter2() calling ApplyFilter2 directly
737	const int8x16_t p1s = FlipSign_NEON(p1);
738	int8x16_t p0s = FlipSign_NEON(p0);
739	int8x16_t q0s = FlipSign_NEON(q0);
740	const int8x16_t q1s = FlipSign_NEON(q1);
741	const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
742
743	// do_filter2 part (simple loopfilter on pixels with hev)
744	{
745	const int8x16_t delta = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
746	const int8x16_t simple_lf_delta =
747	vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
748	ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
749	}
750
751	// do_filter4 part (complex loopfilter on pixels without hev)
752	{
753	const int8x16_t delta0 = GetBaseDelta0_NEON(p0s, q0s);
754	// we use: (mask & hev_mask) ^ mask = mask & !hev_mask
755	const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
756	const int8x16_t complex_lf_delta =
757	vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
758	ApplyFilter4_NEON(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
759	}
760	}
761
762	// 6-points filter
763
764	static void ApplyFilter6_NEON(
765	const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
766	const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
767	const int8x16_t delta,
768	uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
769	uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
770	// We have to compute: X = (9a+63) >> 7, Y = (18a+63)>>7, Z = (27a+63) >> 7*
771	// Turns out, there's a common sub-expression S=9 a - 1 that can be used*
772	// with the special vqrshrn_n_s16 rounding-shift-and-narrow instruction:
773	// X = (S + 64) >> 7, Y = (S + 32) >> 6, Z = (18 a + S + 64) >> 7*
774	const int8x8_t delta_lo = vget_low_s8(delta);
775	const int8x8_t delta_hi = vget_high_s8(delta);
776	const int8x8_t kCst9 = vdup_n_s8(`9`);
777	const int16x8_t kCstm1 = vdupq_n_s16(-`1`);
778	const int8x8_t kCst18 = vdup_n_s8(`18`);
779	const int16x8_t S_lo = vmlal_s8(kCstm1, kCst9, delta_lo); // S = 9 a - 1*
780	const int16x8_t S_hi = vmlal_s8(kCstm1, kCst9, delta_hi);
781	const int16x8_t Z_lo = vmlal_s8(S_lo, kCst18, delta_lo); // S + 18 a*
782	const int16x8_t Z_hi = vmlal_s8(S_hi, kCst18, delta_hi);
783	const int8x8_t a3_lo = vqrshrn_n_s16(S_lo, `7`); // (9 a + 63) >> 7*
784	const int8x8_t a3_hi = vqrshrn_n_s16(S_hi, `7`);
785	const int8x8_t a2_lo = vqrshrn_n_s16(S_lo, `6`); // (9 a + 31) >> 6*
786	const int8x8_t a2_hi = vqrshrn_n_s16(S_hi, `6`);
787	const int8x8_t a1_lo = vqrshrn_n_s16(Z_lo, `7`); // (27 a + 63) >> 7*
788	const int8x8_t a1_hi = vqrshrn_n_s16(Z_hi, `7`);
789	const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
790	const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
791	const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);
792
793	op0 = FlipSignBack_NEON(vqaddq_s8(p0, a1)); // clip(p0 + a1)*
794	oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1)); // clip(q0 - q1)*
795	oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a2)); // clip(q1 - a2)*
796	op1 = FlipSignBack_NEON(vqaddq_s8(p1, a2)); // clip(p1 + a2)*
797	oq2 = FlipSignBack_NEON(vqsubq_s8(q2, a3)); // clip(q2 - a3)*
798	op2 = FlipSignBack_NEON(vqaddq_s8(p2, a3)); // clip(p2 + a3)*
799	}
800
801	static void DoFilter6_NEON(
802	const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
803	const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
804	const uint8x16_t mask, const uint8x16_t hev_mask,
805	uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
806	uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
807	// This is a fused version of DoFilter2() calling ApplyFilter2 directly
808	const int8x16_t p2s = FlipSign_NEON(p2);
809	const int8x16_t p1s = FlipSign_NEON(p1);
810	int8x16_t p0s = FlipSign_NEON(p0);
811	int8x16_t q0s = FlipSign_NEON(q0);
812	const int8x16_t q1s = FlipSign_NEON(q1);
813	const int8x16_t q2s = FlipSign_NEON(q2);
814	const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
815	const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
816
817	// do_filter2 part (simple loopfilter on pixels with hev)
818	{
819	const int8x16_t simple_lf_delta =
820	vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
821	ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
822	}
823
824	// do_filter6 part (complex loopfilter on pixels without hev)
825	{
826	// we use: (mask & hev_mask) ^ mask = mask & !hev_mask
827	const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
828	const int8x16_t complex_lf_delta =
829	vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
830	ApplyFilter6_NEON(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
831	op2, op1, op0, oq0, oq1, oq2);
832	}
833	}
834
835	// on macroblock edges
836
837	static void VFilter16_NEON(uint8_t* p, int stride,
838	int thresh, int ithresh, int hev_thresh) {
839	uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
840	Load16x8_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
841	{
842	const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
843	ithresh, thresh);
844	const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
845	uint8x16_t op2, op1, op0, oq0, oq1, oq2;
846	DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
847	&op2, &op1, &op0, &oq0, &oq1, &oq2);
848	Store16x2_NEON(op2, op1, p - `2` * stride, stride);
849	Store16x2_NEON(op0, oq0, p + `0` * stride, stride);
850	Store16x2_NEON(oq1, oq2, p + `2` * stride, stride);
851	}
852	}
853
854	static void HFilter16_NEON(uint8_t* p, int stride,
855	int thresh, int ithresh, int hev_thresh) {
856	uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
857	Load8x16_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
858	{
859	const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
860	ithresh, thresh);
861	const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
862	uint8x16_t op2, op1, op0, oq0, oq1, oq2;
863	DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
864	&op2, &op1, &op0, &oq0, &oq1, &oq2);
865	Store2x16_NEON(op2, op1, p - `2`, stride);
866	Store2x16_NEON(op0, oq0, p + `0`, stride);
867	Store2x16_NEON(oq1, oq2, p + `2`, stride);
868	}
869	}
870
871	// on three inner edges
872	static void VFilter16i_NEON(uint8_t* p, int stride,
873	int thresh, int ithresh, int hev_thresh) {
874	uint32_t k;
875	uint8x16_t p3, p2, p1, p0;
876	Load16x4_NEON(p + `2` * stride, stride, &p3, &p2, &p1, &p0);
877	for (k = `3`; k != `0`; --k) {
878	uint8x16_t q0, q1, q2, q3;
879	p += `4` * stride;
880	Load16x4_NEON(p + `2` * stride, stride, &q0, &q1, &q2, &q3);
881	{
882	const uint8x16_t mask =
883	NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
884	const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
885	// p3 and p2 are not just temporary variables here: they will be
886	// re-used for next span. And q2/q3 will become p1/p0 accordingly.
887	DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
888	Store16x4_NEON(p1, p0, p3, p2, p, stride);
889	p1 = q2;
890	p0 = q3;
891	}
892	}
893	}
894
895	#if !defined(WORK_AROUND_GCC)
896	static void HFilter16i_NEON(uint8_t* p, int stride,
897	int thresh, int ithresh, int hev_thresh) {
898	uint32_t k;
899	uint8x16_t p3, p2, p1, p0;
900	Load4x16_NEON(p + `2`, stride, &p3, &p2, &p1, &p0);
901	for (k = `3`; k != `0`; --k) {
902	uint8x16_t q0, q1, q2, q3;
903	p += `4`;
904	Load4x16_NEON(p + `2`, stride, &q0, &q1, &q2, &q3);
905	{
906	const uint8x16_t mask =
907	NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
908	const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
909	DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
910	Store4x16_NEON(p1, p0, p3, p2, p, stride);
911	p1 = q2;
912	p0 = q3;
913	}
914	}
915	}
916	#endif // !WORK_AROUND_GCC
917
918	// 8-pixels wide variant, for chroma filtering
919	static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
920	int thresh, int ithresh, int hev_thresh) {
921	uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
922	Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
923	{
924	const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
925	ithresh, thresh);
926	const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
927	uint8x16_t op2, op1, op0, oq0, oq1, oq2;
928	DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
929	&op2, &op1, &op0, &oq0, &oq1, &oq2);
930	Store8x2x2_NEON(op2, op1, u - `2` * stride, v - `2` * stride, stride);
931	Store8x2x2_NEON(op0, oq0, u + `0` * stride, v + `0` * stride, stride);
932	Store8x2x2_NEON(oq1, oq2, u + `2` * stride, v + `2` * stride, stride);
933	}
934	}
935	static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
936	int thresh, int ithresh, int hev_thresh) {
937	uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
938	u += `4` * stride;
939	v += `4` * stride;
940	Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
941	{
942	const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
943	ithresh, thresh);
944	const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
945	uint8x16_t op1, op0, oq0, oq1;
946	DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
947	Store8x4x2_NEON(op1, op0, oq0, oq1, u, v, stride);
948	}
949	}
950
951	#if !defined(WORK_AROUND_GCC)
952	static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
953	int thresh, int ithresh, int hev_thresh) {
954	uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
955	Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
956	{
957	const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
958	ithresh, thresh);
959	const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
960	uint8x16_t op2, op1, op0, oq0, oq1, oq2;
961	DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
962	&op2, &op1, &op0, &oq0, &oq1, &oq2);
963	Store6x8x2_NEON(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
964	}
965	}
966
967	static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
968	int thresh, int ithresh, int hev_thresh) {
969	uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
970	u += `4`;
971	v += `4`;
972	Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
973	{
974	const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
975	ithresh, thresh);
976	const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
977	uint8x16_t op1, op0, oq0, oq1;
978	DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
979	Store4x8x2_NEON(op1, op0, oq0, oq1, u, v, stride);
980	}
981	}
982	#endif // !WORK_AROUND_GCC
983
984	//-----------------------------------------------------------------------------
985	// Inverse transforms (Paragraph 14.4)
986
987	// Technically these are unsigned but vqdmulh is only available in signed.
988	// vqdmulh returns high half (effectively >> 16) but also doubles the value,
989	// changing the >> 16 to >> 15 and requiring an additional >> 1.
990	// We use this to our advantage with kC2. The canonical value is 35468.
991	// However, the high bit is set so treating it as signed will give incorrect
992	// results. We avoid this by down shifting by 1 here to clear the highest bit.
993	// Combined with the doubling effect of vqdmulh we get >> 16.
994	// This can not be applied to kC1 because the lowest bit is set. Down shifting
995	// the constant would reduce precision.
996
// libwebp uses a trick to avoid some extra addition that libvpx does.
// Instead of:
//   temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
// libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
// same issue with kC1 and vqdmulh that we work around by down shifting kC2.
1002
// Fixed-point multipliers used with vqdmulh in the inverse transform.
// kC1 is the canonical 20091. kC2 stores half of the canonical 35468, so
// that it fits in a signed 16-bit lane; the doubling performed by vqdmulh
// compensates for the halving.
static const int16_t kC1 = 20091;
static const int16_t kC2 = 35468 / 2;
1005
1006	#if defined(WEBP_USE_INTRINSICS)
1007	static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
1008	const int16x8_t in1,
1009	int16x8x2_t* const out) {
1010	// a0 a1 a2 a3 \| b0 b1 b2 b3 => a0 b0 c0 d0 \| a1 b1 c1 d1
1011	// c0 c1 c2 c3 \| d0 d1 d2 d3 a2 b2 c2 d2 \| a3 b3 c3 d3
1012	const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ...
1013	// b0 d0 b1 d1 b2 d2 ...
1014	*out = vzipq_s16(tmp0.val[`0`], tmp0.val[`1`]);
1015	}
1016
1017	static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
1018	// {rows} = in0 \| in4
1019	// in8 \| in12
1020	// B1 = in4 \| in12
1021	const int16x8_t B1 =
1022	vcombine_s16(vget_high_s16(rows->val[`0`]), vget_high_s16(rows->val[`1`]));
1023	// C0 = kC1 in4 \| kC1 * in12*
1024	// C1 = kC2 in4 \| kC2 * in12*
1025	const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), `1`);
1026	const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
1027	const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[`0`]),
1028	vget_low_s16(rows->val[`1`])); // in0 + in8
1029	const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[`0`]),
1030	vget_low_s16(rows->val[`1`])); // in0 - in8
1031	// c = kC2 in4 - kC1 * in12*
1032	// d = kC1 in4 + kC2 * in12*
1033	const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
1034	const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
1035	const int16x8_t D0 = vcombine_s16(a, b); // D0 = a \| b
1036	const int16x8_t D1 = vcombine_s16(d, c); // D1 = d \| c
1037	const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d \| b+c
1038	const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d \| b-c
1039	const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
1040	Transpose8x2_NEON(E0, E1, rows);
1041	}
1042
1043	static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
1044	int16x8x2_t rows;
1045	INIT_VECTOR2(rows, vld1q_s16(in + `0`), vld1q_s16(in + `8`));
1046	TransformPass_NEON(&rows);
1047	TransformPass_NEON(&rows);
1048	Add4x4_NEON(rows.val[`0`], rows.val[`1`], dst);
1049	}
1050
1051	#else
1052
1053	static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
1054	const int kBPS = BPS;
1055	// kC1, kC2. Padded because vld1.16 loads 8 bytes
1056	const int16_t constants[`4`] = { kC1, kC2, `0`, `0` };
1057	/ Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm /
1058	__asm__ volatile (
1059	"vld1.16 {q1, q2}, [%[in]] \n"
1060	"vld1.16 {d0}, [%[constants]] \n"
1061
1062	/ d2: in[0]*
1063	* d3: in[8]
1064	* d4: in[4]
1065	* d5: in[12]
1066	*/
1067	"vswp d3, d4 \n"
1068
1069	/ q8 = {in[4], in[12]} * kC1 * 2 >> 16*
1070	* q9 = {in[4], in[12]} * kC2 >> 16
1071	*/
1072	"vqdmulh.s16 q8, q2, d0[0] \n"
1073	"vqdmulh.s16 q9, q2, d0[1] \n"
1074
1075	/ d22 = a = in[0] + in[8]*
1076	* d23 = b = in[0] - in[8]
1077	*/
1078	"vqadd.s16 d22, d2, d3 \n"
1079	"vqsub.s16 d23, d2, d3 \n"
1080
1081	/ The multiplication should be x * kC1 >> 16*
1082	* However, with vqdmulh we get x * kC1 * 2 >> 16
1083	* (multiply, double, return high half)
1084	* We avoided this in kC2 by pre-shifting the constant.
1085	* q8 = in[4]/[12] * kC1 >> 16
1086	*/
1087	"vshr.s16 q8, q8, #1 \n"
1088
1089	/ Add {in[4], in[12]} back after the multiplication. This is handled by*
1090	* adding 1 << 16 to kC1 in the libwebp C code.
1091	*/
1092	"vqadd.s16 q8, q2, q8 \n"
1093
1094	/ d20 = c = in[4]kC2 - in[12]kC1*
1095	* d21 = d = in[4]kC1 + in[12]kC2
1096	*/
1097	"vqsub.s16 d20, d18, d17 \n"
1098	"vqadd.s16 d21, d19, d16 \n"
1099
1100	/ d2 = tmp[0] = a + d*
1101	* d3 = tmp[1] = b + c
1102	* d4 = tmp[2] = b - c
1103	* d5 = tmp[3] = a - d
1104	*/
1105	"vqadd.s16 d2, d22, d21 \n"
1106	"vqadd.s16 d3, d23, d20 \n"
1107	"vqsub.s16 d4, d23, d20 \n"
1108	"vqsub.s16 d5, d22, d21 \n"
1109
1110	"vzip.16 q1, q2 \n"
1111	"vzip.16 q1, q2 \n"
1112
1113	"vswp d3, d4 \n"
1114
1115	/ q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16*
1116	* q9 = {tmp[4], tmp[12]} * kC2 >> 16
1117	*/
1118	"vqdmulh.s16 q8, q2, d0[0] \n"
1119	"vqdmulh.s16 q9, q2, d0[1] \n"
1120
1121	/ d22 = a = tmp[0] + tmp[8]*
1122	* d23 = b = tmp[0] - tmp[8]
1123	*/
1124	"vqadd.s16 d22, d2, d3 \n"
1125	"vqsub.s16 d23, d2, d3 \n"
1126
1127	/ See long winded explanations prior /
1128	"vshr.s16 q8, q8, #1 \n"
1129	"vqadd.s16 q8, q2, q8 \n"
1130
1131	/ d20 = c = in[4]kC2 - in[12]kC1*
1132	* d21 = d = in[4]kC1 + in[12]kC2
1133	*/
1134	"vqsub.s16 d20, d18, d17 \n"
1135	"vqadd.s16 d21, d19, d16 \n"
1136
1137	/ d2 = tmp[0] = a + d*
1138	* d3 = tmp[1] = b + c
1139	* d4 = tmp[2] = b - c
1140	* d5 = tmp[3] = a - d
1141	*/
1142	"vqadd.s16 d2, d22, d21 \n"
1143	"vqadd.s16 d3, d23, d20 \n"
1144	"vqsub.s16 d4, d23, d20 \n"
1145	"vqsub.s16 d5, d22, d21 \n"
1146
1147	"vld1.32 d6[0], [%[dst]], %[kBPS] \n"
1148	"vld1.32 d6[1], [%[dst]], %[kBPS] \n"
1149	"vld1.32 d7[0], [%[dst]], %[kBPS] \n"
1150	"vld1.32 d7[1], [%[dst]], %[kBPS] \n"
1151
1152	"sub %[dst], %[dst], %[kBPS], lsl #2 \n"
1153
1154	/ (val) + 4 >> 3 /
1155	"vrshr.s16 d2, d2, #3 \n"
1156	"vrshr.s16 d3, d3, #3 \n"
1157	"vrshr.s16 d4, d4, #3 \n"
1158	"vrshr.s16 d5, d5, #3 \n"
1159
1160	"vzip.16 q1, q2 \n"
1161	"vzip.16 q1, q2 \n"
1162
1163	/ Must accumulate before saturating /
1164	"vmovl.u8 q8, d6 \n"
1165	"vmovl.u8 q9, d7 \n"
1166
1167	"vqadd.s16 q1, q1, q8 \n"
1168	"vqadd.s16 q2, q2, q9 \n"
1169
1170	"vqmovun.s16 d0, q1 \n"
1171	"vqmovun.s16 d1, q2 \n"
1172
1173	"vst1.32 d0[0], [%[dst]], %[kBPS] \n"
1174	"vst1.32 d0[1], [%[dst]], %[kBPS] \n"
1175	"vst1.32 d1[0], [%[dst]], %[kBPS] \n"
1176	"vst1.32 d1[1], [%[dst]] \n"
1177
1178	: [in] "+r"(in), [dst] "+r"(dst) / modified registers /
1179	: [kBPS] "r"(kBPS), [constants] "r"(constants) / constants /
1180	: "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" / clobbered /
1181	);
1182	}
1183
1184	#endif // WEBP_USE_INTRINSICS
1185
// Transforms the block at (in, dst) and, when 'do_two' is non-zero, also the
// next one: 16 coefficients further in 'in', 4 pixels further in 'dst'.
static void TransformTwo_NEON(const int16_t* in, uint8_t* dst, int do_two) {
  TransformOne_NEON(in, dst);
  if (!do_two) return;
  TransformOne_NEON(in + 16, dst + 4);
}
1192
1193	static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
1194	const int16x8_t DC = vdupq_n_s16(in[`0`]);
1195	Add4x4_NEON(DC, DC, dst);
1196	}
1197
1198	//------------------------------------------------------------------------------
1199
// Writes lane 'col' of rows.val[0..3] to *dst, advancing 'dst' by 16
// elements after each store. 'dst' must be a modifiable pointer lvalue and
// 'col' a compile-time constant lane index.
#define STORE_WHT(dst, col, rows) do {                      \
  *(dst) = vgetq_lane_s32((rows).val[0], (col)); (dst) += 16; \
  *(dst) = vgetq_lane_s32((rows).val[1], (col)); (dst) += 16; \
  *(dst) = vgetq_lane_s32((rows).val[2], (col)); (dst) += 16; \
  *(dst) = vgetq_lane_s32((rows).val[3], (col)); (dst) += 16; \
} while (0)
1206
1207	static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
1208	int32x4x4_t tmp;
1209
1210	{
1211	// Load the source.
1212	const int16x4_t in00_03 = vld1_s16(in + `0`);
1213	const int16x4_t in04_07 = vld1_s16(in + `4`);
1214	const int16x4_t in08_11 = vld1_s16(in + `8`);
1215	const int16x4_t in12_15 = vld1_s16(in + `12`);
1216	const int32x4_t a0 = vaddl_s16(in00_03, in12_15); // in[0..3] + in[12..15]
1217	const int32x4_t a1 = vaddl_s16(in04_07, in08_11); // in[4..7] + in[8..11]
1218	const int32x4_t a2 = vsubl_s16(in04_07, in08_11); // in[4..7] - in[8..11]
1219	const int32x4_t a3 = vsubl_s16(in00_03, in12_15); // in[0..3] - in[12..15]
1220	tmp.val[`0`] = vaddq_s32(a0, a1);
1221	tmp.val[`1`] = vaddq_s32(a3, a2);
1222	tmp.val[`2`] = vsubq_s32(a0, a1);
1223	tmp.val[`3`] = vsubq_s32(a3, a2);
1224	// Arrange the temporary results column-wise.
1225	tmp = Transpose4x4_NEON(tmp);
1226	}
1227
1228	{
1229	const int32x4_t kCst3 = vdupq_n_s32(`3`);
1230	const int32x4_t dc = vaddq_s32(tmp.val[`0`], kCst3); // add rounder
1231	const int32x4_t a0 = vaddq_s32(dc, tmp.val[`3`]);
1232	const int32x4_t a1 = vaddq_s32(tmp.val[`1`], tmp.val[`2`]);
1233	const int32x4_t a2 = vsubq_s32(tmp.val[`1`], tmp.val[`2`]);
1234	const int32x4_t a3 = vsubq_s32(dc, tmp.val[`3`]);
1235
1236	tmp.val[`0`] = vaddq_s32(a0, a1);
1237	tmp.val[`1`] = vaddq_s32(a3, a2);
1238	tmp.val[`2`] = vsubq_s32(a0, a1);
1239	tmp.val[`3`] = vsubq_s32(a3, a2);
1240
1241	// right shift the results by 3.
1242	tmp.val[`0`] = vshrq_n_s32(tmp.val[`0`], `3`);
1243	tmp.val[`1`] = vshrq_n_s32(tmp.val[`1`], `3`);
1244	tmp.val[`2`] = vshrq_n_s32(tmp.val[`2`], `3`);
1245	tmp.val[`3`] = vshrq_n_s32(tmp.val[`3`], `3`);
1246
1247	STORE_WHT(out, `0`, tmp);
1248	STORE_WHT(out, `1`, tmp);
1249	STORE_WHT(out, `2`, tmp);
1250	STORE_WHT(out, `3`, tmp);
1251	}
1252	}
1253
1254	#undef STORE_WHT
1255
1256	//------------------------------------------------------------------------------
1257
1258	#define MUL(a, b) (((a) * (b)) >> 16)
1259	static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) {
1260	static const int kC1_full = `20091` + (`1` << `16`);
1261	static const int kC2_full = `35468`;
1262	const int16x4_t A = vld1_dup_s16(in);
1263	const int16x4_t c4 = vdup_n_s16(MUL(in[`4`], kC2_full));
1264	const int16x4_t d4 = vdup_n_s16(MUL(in[`4`], kC1_full));
1265	const int c1 = MUL(in[`1`], kC2_full);
1266	const int d1 = MUL(in[`1`], kC1_full);
1267	const uint64_t cd = (uint64_t)( d1 & `0xffff`) << `0` \|
1268	(uint64_t)( c1 & `0xffff`) << `16` \|
1269	(uint64_t)(-c1 & `0xffff`) << `32` \|
1270	(uint64_t)(-d1 & `0xffff`) << `48`;
1271	const int16x4_t CD = vcreate_s16(cd);
1272	const int16x4_t B = vqadd_s16(A, CD);
1273	const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));
1274	const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
1275	Add4x4_NEON(m0_m1, m2_m3, dst);
1276	}
1277	#undef MUL
1278
1279	//------------------------------------------------------------------------------
1280	// 4x4
1281
1282	static void DC4_NEON(uint8_t* dst) { // DC
1283	const uint8x8_t A = vld1_u8(dst - BPS); // top row
1284	const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
1285	const uint16x4_t p1 = vpadd_u16(p0, p0);
1286	const uint8x8_t L0 = vld1_u8(dst + `0` * BPS - `1`);
1287	const uint8x8_t L1 = vld1_u8(dst + `1` * BPS - `1`);
1288	const uint8x8_t L2 = vld1_u8(dst + `2` * BPS - `1`);
1289	const uint8x8_t L3 = vld1_u8(dst + `3` * BPS - `1`);
1290	const uint16x8_t s0 = vaddl_u8(L0, L1);
1291	const uint16x8_t s1 = vaddl_u8(L2, L3);
1292	const uint16x8_t s01 = vaddq_u16(s0, s1);
1293	const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
1294	const uint8x8_t dc0 = vrshrn_n_u16(sum, `3`); // (sum + 4) >> 3
1295	const uint8x8_t dc = vdup_lane_u8(dc0, `0`);
1296	int i;
1297	for (i = `0`; i < `4`; ++i) {
1298	vst1_lane_u32((uint32_t)(dst + i BPS), vreinterpret_u32_u8(dc), `0`);
1299	}
1300	}
1301
1302	// TrueMotion (4x4 + 8x8)
1303	static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) {
1304	const uint8x8_t TL = vld1_dup_u8(dst - BPS - `1`); // top-left pixel 'A[-1]'
1305	const uint8x8_t T = vld1_u8(dst - BPS); // top row 'A[0..3]'
1306	const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL)); // A[c] - A[-1]
1307	int y;
1308	for (y = `0`; y < size; y += `4`) {
1309	// left edge
1310	const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + `0` * BPS - `1`));
1311	const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + `1` * BPS - `1`));
1312	const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + `2` * BPS - `1`));
1313	const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + `3` * BPS - `1`));
1314	const int16x8_t r0 = vaddq_s16(L0, d); // L[r] + A[c] - A[-1]
1315	const int16x8_t r1 = vaddq_s16(L1, d);
1316	const int16x8_t r2 = vaddq_s16(L2, d);
1317	const int16x8_t r3 = vaddq_s16(L3, d);
1318	// Saturate and store the result.
1319	const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
1320	const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
1321	const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
1322	const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
1323	if (size == `4`) {
1324	vst1_lane_u32((uint32_t)(dst + `0` BPS), r0_u32, `0`);
1325	vst1_lane_u32((uint32_t)(dst + `1` BPS), r1_u32, `0`);
1326	vst1_lane_u32((uint32_t)(dst + `2` BPS), r2_u32, `0`);
1327	vst1_lane_u32((uint32_t)(dst + `3` BPS), r3_u32, `0`);
1328	} else {
1329	vst1_u32((uint32_t)(dst + `0` BPS), r0_u32);
1330	vst1_u32((uint32_t)(dst + `1` BPS), r1_u32);
1331	vst1_u32((uint32_t)(dst + `2` BPS), r2_u32);
1332	vst1_u32((uint32_t)(dst + `3` BPS), r3_u32);
1333	}
1334	dst += `4` * BPS;
1335	}
1336	}
1337
// 4x4 TrueMotion prediction.
static void TM4_NEON(uint8_t* dst) {
  TrueMotion_NEON(dst, 4);
}
1339
1340	static void VE4_NEON(uint8_t* dst) { // vertical
1341	// NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
1342	const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - `1`)); // top row
1343	const uint64x1_t A1 = vshr_n_u64(A0, `8`);
1344	const uint64x1_t A2 = vshr_n_u64(A0, `16`);
1345	const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
1346	const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
1347	const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
1348	const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00);
1349	const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0);
1350	int i;
1351	for (i = `0`; i < `4`; ++i) {
1352	vst1_lane_u32((uint32_t)(dst + i BPS), vreinterpret_u32_u8(avg), `0`);
1353	}
1354	}
1355
1356	static void RD4_NEON(uint8_t* dst) { // Down-right
1357	const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - `1`);
1358	const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
1359	const uint64x1_t ____XABC = vshl_n_u64(XABCD, `32`);
1360	const uint32_t I = dst[-`1` + `0` * BPS];
1361	const uint32_t J = dst[-`1` + `1` * BPS];
1362	const uint32_t K = dst[-`1` + `2` * BPS];
1363	const uint32_t L = dst[-`1` + `3` * BPS];
1364	const uint64x1_t LKJI____ =
1365	vcreate_u64((uint64_t)L \| (K << `8`) \| (J << `16`) \| (I << `24`));
1366	const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
1367	const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, `8`));
1368	const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, `16`));
1369	const uint8_t D = vget_lane_u8(XABCD_u8, `4`);
1370	const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, `6`);
1371	const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
1372	const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
1373	const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
1374	const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
1375	const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
1376	const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, `8`));
1377	const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, `16`));
1378	const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, `24`));
1379	vst1_lane_u32((uint32_t)(dst + `0` BPS), r0, `0`);
1380	vst1_lane_u32((uint32_t)(dst + `1` BPS), r1, `0`);
1381	vst1_lane_u32((uint32_t)(dst + `2` BPS), r2, `0`);
1382	vst1_lane_u32((uint32_t)(dst + `3` BPS), r3, `0`);
1383	}
1384
1385	static void LD4_NEON(uint8_t* dst) { // Down-left
1386	// Note using the same shift trick as VE4() is slower here.
1387	const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + `0`);
1388	const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + `1`);
1389	const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + `2`);
1390	const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + `7`], CDEFGH00, `6`);
1391	const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0);
1392	const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
1393	const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
1394	const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
1395	const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, `8`));
1396	const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, `16`));
1397	const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, `24`));
1398	vst1_lane_u32((uint32_t)(dst + `0` BPS), r0, `0`);
1399	vst1_lane_u32((uint32_t)(dst + `1` BPS), r1, `0`);
1400	vst1_lane_u32((uint32_t)(dst + `2` BPS), r2, `0`);
1401	vst1_lane_u32((uint32_t)(dst + `3` BPS), r3, `0`);
1402	}
1403
1404	//------------------------------------------------------------------------------
1405	// Chroma
1406
1407	static void VE8uv_NEON(uint8_t* dst) { // vertical
1408	const uint8x8_t top = vld1_u8(dst - BPS);
1409	int j;
1410	for (j = `0`; j < `8`; ++j) {
1411	vst1_u8(dst + j * BPS, top);
1412	}
1413	}
1414
1415	static void HE8uv_NEON(uint8_t* dst) { // horizontal
1416	int j;
1417	for (j = `0`; j < `8`; ++j) {
1418	const uint8x8_t left = vld1_dup_u8(dst - `1`);
1419	vst1_u8(dst, left);
1420	dst += BPS;
1421	}
1422	}
1423
1424	static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
1425	uint16x8_t sum_top;
1426	uint16x8_t sum_left;
1427	uint8x8_t dc0;
1428
1429	if (do_top) {
1430	const uint8x8_t A = vld1_u8(dst - BPS); // top row
1431	#if WEBP_AARCH64
1432	const uint16_t p2 = vaddlv_u8(A);
1433	sum_top = vdupq_n_u16(p2);
1434	#else
1435	const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
1436	const uint16x4_t p1 = vpadd_u16(p0, p0);
1437	const uint16x4_t p2 = vpadd_u16(p1, p1);
1438	sum_top = vcombine_u16(p2, p2);
1439	#endif
1440	}
1441
1442	if (do_left) {
1443	const uint8x8_t L0 = vld1_u8(dst + `0` * BPS - `1`);
1444	const uint8x8_t L1 = vld1_u8(dst + `1` * BPS - `1`);
1445	const uint8x8_t L2 = vld1_u8(dst + `2` * BPS - `1`);
1446	const uint8x8_t L3 = vld1_u8(dst + `3` * BPS - `1`);
1447	const uint8x8_t L4 = vld1_u8(dst + `4` * BPS - `1`);
1448	const uint8x8_t L5 = vld1_u8(dst + `5` * BPS - `1`);
1449	const uint8x8_t L6 = vld1_u8(dst + `6` * BPS - `1`);
1450	const uint8x8_t L7 = vld1_u8(dst + `7` * BPS - `1`);
1451	const uint16x8_t s0 = vaddl_u8(L0, L1);
1452	const uint16x8_t s1 = vaddl_u8(L2, L3);
1453	const uint16x8_t s2 = vaddl_u8(L4, L5);
1454	const uint16x8_t s3 = vaddl_u8(L6, L7);
1455	const uint16x8_t s01 = vaddq_u16(s0, s1);
1456	const uint16x8_t s23 = vaddq_u16(s2, s3);
1457	sum_left = vaddq_u16(s01, s23);
1458	}
1459
1460	if (do_top && do_left) {
1461	const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
1462	dc0 = vrshrn_n_u16(sum, `4`);
1463	} else if (do_top) {
1464	dc0 = vrshrn_n_u16(sum_top, `3`);
1465	} else if (do_left) {
1466	dc0 = vrshrn_n_u16(sum_left, `3`);
1467	} else {
1468	dc0 = vdup_n_u8(`0x80`);
1469	}
1470
1471	{
1472	const uint8x8_t dc = vdup_lane_u8(dc0, `0`);
1473	int i;
1474	for (i = `0`; i < `8`; ++i) {
1475	vst1_u32((uint32_t)(dst + i BPS), vreinterpret_u32_u8(dc));
1476	}
1477	}
1478	}
1479
// 8x8 DC prediction entry points, one per combination of available borders.
static void DC8uv_NEON(uint8_t* dst) {
  DC8_NEON(dst, /*do_top=*/1, /*do_left=*/1);
}
static void DC8uvNoTop_NEON(uint8_t* dst) {
  DC8_NEON(dst, /*do_top=*/0, /*do_left=*/1);
}
static void DC8uvNoLeft_NEON(uint8_t* dst) {
  DC8_NEON(dst, /*do_top=*/1, /*do_left=*/0);
}
static void DC8uvNoTopLeft_NEON(uint8_t* dst) {
  DC8_NEON(dst, /*do_top=*/0, /*do_left=*/0);
}
1484
// 8x8 TrueMotion prediction.
static void TM8uv_NEON(uint8_t* dst) {
  TrueMotion_NEON(dst, 8);
}
1486
1487	//------------------------------------------------------------------------------
1488	// 16x16
1489
1490	static void VE16_NEON(uint8_t* dst) { // vertical
1491	const uint8x16_t top = vld1q_u8(dst - BPS);
1492	int j;
1493	for (j = `0`; j < `16`; ++j) {
1494	vst1q_u8(dst + j * BPS, top);
1495	}
1496	}
1497
1498	static void HE16_NEON(uint8_t* dst) { // horizontal
1499	int j;
1500	for (j = `0`; j < `16`; ++j) {
1501	const uint8x16_t left = vld1q_dup_u8(dst - `1`);
1502	vst1q_u8(dst, left);
1503	dst += BPS;
1504	}
1505	}
1506
1507	static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
1508	uint16x8_t sum_top;
1509	uint16x8_t sum_left;
1510	uint8x8_t dc0;
1511
1512	if (do_top) {
1513	const uint8x16_t A = vld1q_u8(dst - BPS); // top row
1514	#if WEBP_AARCH64
1515	const uint16_t p3 = vaddlvq_u8(A);
1516	sum_top = vdupq_n_u16(p3);
1517	#else
1518	const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top
1519	const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
1520	const uint16x4_t p2 = vpadd_u16(p1, p1);
1521	const uint16x4_t p3 = vpadd_u16(p2, p2);
1522	sum_top = vcombine_u16(p3, p3);
1523	#endif
1524	}
1525
1526	if (do_left) {
1527	int i;
1528	sum_left = vdupq_n_u16(`0`);
1529	for (i = `0`; i < `16`; i += `8`) {
1530	const uint8x8_t L0 = vld1_u8(dst + (i + `0`) * BPS - `1`);
1531	const uint8x8_t L1 = vld1_u8(dst + (i + `1`) * BPS - `1`);
1532	const uint8x8_t L2 = vld1_u8(dst + (i + `2`) * BPS - `1`);
1533	const uint8x8_t L3 = vld1_u8(dst + (i + `3`) * BPS - `1`);
1534	const uint8x8_t L4 = vld1_u8(dst + (i + `4`) * BPS - `1`);
1535	const uint8x8_t L5 = vld1_u8(dst + (i + `5`) * BPS - `1`);
1536	const uint8x8_t L6 = vld1_u8(dst + (i + `6`) * BPS - `1`);
1537	const uint8x8_t L7 = vld1_u8(dst + (i + `7`) * BPS - `1`);
1538	const uint16x8_t s0 = vaddl_u8(L0, L1);
1539	const uint16x8_t s1 = vaddl_u8(L2, L3);
1540	const uint16x8_t s2 = vaddl_u8(L4, L5);
1541	const uint16x8_t s3 = vaddl_u8(L6, L7);
1542	const uint16x8_t s01 = vaddq_u16(s0, s1);
1543	const uint16x8_t s23 = vaddq_u16(s2, s3);
1544	const uint16x8_t sum = vaddq_u16(s01, s23);
1545	sum_left = vaddq_u16(sum_left, sum);
1546	}
1547	}
1548
1549	if (do_top && do_left) {
1550	const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
1551	dc0 = vrshrn_n_u16(sum, `5`);
1552	} else if (do_top) {
1553	dc0 = vrshrn_n_u16(sum_top, `4`);
1554	} else if (do_left) {
1555	dc0 = vrshrn_n_u16(sum_left, `4`);
1556	} else {
1557	dc0 = vdup_n_u8(`0x80`);
1558	}
1559
1560	{
1561	const uint8x16_t dc = vdupq_lane_u8(dc0, `0`);
1562	int i;
1563	for (i = `0`; i < `16`; ++i) {
1564	vst1q_u8(dst + i * BPS, dc);
1565	}
1566	}
1567	}
1568
// 16x16 DC prediction entry points, one per combination of available borders.
static void DC16TopLeft_NEON(uint8_t* dst) {
  DC16_NEON(dst, /*do_top=*/1, /*do_left=*/1);
}
static void DC16NoTop_NEON(uint8_t* dst) {
  DC16_NEON(dst, /*do_top=*/0, /*do_left=*/1);
}
static void DC16NoLeft_NEON(uint8_t* dst) {
  DC16_NEON(dst, /*do_top=*/1, /*do_left=*/0);
}
static void DC16NoTopLeft_NEON(uint8_t* dst) {
  DC16_NEON(dst, /*do_top=*/0, /*do_left=*/0);
}
1573
1574	static void TM16_NEON(uint8_t* dst) {
1575	const uint8x8_t TL = vld1_dup_u8(dst - BPS - `1`); // top-left pixel 'A[-1]'
1576	const uint8x16_t T = vld1q_u8(dst - BPS); // top row 'A[0..15]'
1577	// A[c] - A[-1]
1578	const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL));
1579	const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL));
1580	int y;
1581	for (y = `0`; y < `16`; y += `4`) {
1582	// left edge
1583	const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + `0` * BPS - `1`));
1584	const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + `1` * BPS - `1`));
1585	const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + `2` * BPS - `1`));
1586	const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + `3` * BPS - `1`));
1587	const int16x8_t r0_lo = vaddq_s16(L0, d_lo); // L[r] + A[c] - A[-1]
1588	const int16x8_t r1_lo = vaddq_s16(L1, d_lo);
1589	const int16x8_t r2_lo = vaddq_s16(L2, d_lo);
1590	const int16x8_t r3_lo = vaddq_s16(L3, d_lo);
1591	const int16x8_t r0_hi = vaddq_s16(L0, d_hi);
1592	const int16x8_t r1_hi = vaddq_s16(L1, d_hi);
1593	const int16x8_t r2_hi = vaddq_s16(L2, d_hi);
1594	const int16x8_t r3_hi = vaddq_s16(L3, d_hi);
1595	// Saturate and store the result.
1596	const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
1597	const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
1598	const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi));
1599	const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi));
1600	vst1q_u8(dst + `0` * BPS, row0);
1601	vst1q_u8(dst + `1` * BPS, row1);
1602	vst1q_u8(dst + `2` * BPS, row2);
1603	vst1q_u8(dst + `3` * BPS, row3);
1604	dst += `4` * BPS;
1605	}
1606	}
1607
1608	//------------------------------------------------------------------------------
1609	// Entry point
1610
1611	extern void VP8DspInitNEON(void);
1612
1613	WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
1614	VP8Transform = TransformTwo_NEON;
1615	VP8TransformAC3 = TransformAC3_NEON;
1616	VP8TransformDC = TransformDC_NEON;
1617	VP8TransformWHT = TransformWHT_NEON;
1618
1619	VP8VFilter16 = VFilter16_NEON;
1620	VP8VFilter16i = VFilter16i_NEON;
1621	VP8HFilter16 = HFilter16_NEON;
1622	#if !defined(WORK_AROUND_GCC)
1623	VP8HFilter16i = HFilter16i_NEON;
1624	#endif
1625	VP8VFilter8 = VFilter8_NEON;
1626	VP8VFilter8i = VFilter8i_NEON;
1627	#if !defined(WORK_AROUND_GCC)
1628	VP8HFilter8 = HFilter8_NEON;
1629	VP8HFilter8i = HFilter8i_NEON;
1630	#endif
1631	VP8SimpleVFilter16 = SimpleVFilter16_NEON;
1632	VP8SimpleHFilter16 = SimpleHFilter16_NEON;
1633	VP8SimpleVFilter16i = SimpleVFilter16i_NEON;
1634	VP8SimpleHFilter16i = SimpleHFilter16i_NEON;
1635
1636	VP8PredLuma4[`0`] = DC4_NEON;
1637	VP8PredLuma4[`1`] = TM4_NEON;
1638	VP8PredLuma4[`2`] = VE4_NEON;
1639	VP8PredLuma4[`4`] = RD4_NEON;
1640	VP8PredLuma4[`6`] = LD4_NEON;
1641
1642	VP8PredLuma16[`0`] = DC16TopLeft_NEON;
1643	VP8PredLuma16[`1`] = TM16_NEON;
1644	VP8PredLuma16[`2`] = VE16_NEON;
1645	VP8PredLuma16[`3`] = HE16_NEON;
1646	VP8PredLuma16[`4`] = DC16NoTop_NEON;
1647	VP8PredLuma16[`5`] = DC16NoLeft_NEON;
1648	VP8PredLuma16[`6`] = DC16NoTopLeft_NEON;
1649
1650	VP8PredChroma8[`0`] = DC8uv_NEON;
1651	VP8PredChroma8[`1`] = TM8uv_NEON;
1652	VP8PredChroma8[`2`] = VE8uv_NEON;
1653	VP8PredChroma8[`3`] = HE8uv_NEON;
1654	VP8PredChroma8[`4`] = DC8uvNoTop_NEON;
1655	VP8PredChroma8[`5`] = DC8uvNoLeft_NEON;
1656	VP8PredChroma8[`6`] = DC8uvNoTopLeft_NEON;
1657	}
1658
1659	#else // !WEBP_USE_NEON
1660
1661	WEBP_DSP_INIT_STUB(VP8DspInitNEON)
1662
1663	#endif // WEBP_USE_NEON
1664

// Browse the source code of Godot/thirdparty/libwebp/src/dsp/dec_neon.c