// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of dsp functions and loop filtering.
//
// Authors: Somnath Banerjee (somnath@google.com)
//          Johann Koenig (johannkoenig@google.com)

#include "./dsp.h"

#if defined(WEBP_USE_NEON)

#include "./neon.h"
#include "../dec/vp8i_dec.h"

//------------------------------------------------------------------------------
// NxM Loading functions

// Load/Store vertical edge
#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
  "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
  "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
  "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
  "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"

#define STORE8x2(c1, c2, p, stride)                        \
  "vst2.8 {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n"  \
  "vst2.8 {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n"  \
  "vst2.8 {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n"  \
  "vst2.8 {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n"  \
  "vst2.8 {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n"  \
  "vst2.8 {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n"  \
  "vst2.8 {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n"  \
  "vst2.8 {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"

#if !defined(WORK_AROUND_GCC)

// This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
// (register alloc, probably). The variants somewhat mitigate the problem, but
// not quite. HFilter16i() remains problematic.
static WEBP_INLINE uint8x8x4_t Load4x8(const uint8_t* const src, int stride) {
  const uint8x8_t zero = vdup_n_u8(0);
  uint8x8x4_t out;
  INIT_VECTOR4(out, zero, zero, zero, zero);
  out = vld4_lane_u8(src + 0 * stride, out, 0);
  out = vld4_lane_u8(src + 1 * stride, out, 1);
  out = vld4_lane_u8(src + 2 * stride, out, 2);
  out = vld4_lane_u8(src + 3 * stride, out, 3);
  out = vld4_lane_u8(src + 4 * stride, out, 4);
  out = vld4_lane_u8(src + 5 * stride, out, 5);
  out = vld4_lane_u8(src + 6 * stride, out, 6);
  out = vld4_lane_u8(src + 7 * stride, out, 7);
  return out;
}

static WEBP_INLINE void Load4x16(const uint8_t* const src, int stride,
                                 uint8x16_t* const p1, uint8x16_t* const p0,
                                 uint8x16_t* const q0, uint8x16_t* const q1) {
  // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
  // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
  const uint8x8x4_t row0 = Load4x8(src - 2 + 0 * stride, stride);
  const uint8x8x4_t row8 = Load4x8(src - 2 + 8 * stride, stride);
  *p1 = vcombine_u8(row0.val[0], row8.val[0]);
  *p0 = vcombine_u8(row0.val[1], row8.val[1]);
  *q0 = vcombine_u8(row0.val[2], row8.val[2]);
  *q1 = vcombine_u8(row0.val[3], row8.val[3]);
}

#else  // WORK_AROUND_GCC

#define LOADQ_LANE_32b(VALUE, LANE) do {                             \
  (VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE));   \
  src += stride;                                                     \
} while (0)

static WEBP_INLINE void Load4x16(const uint8_t* src, int stride,
                                 uint8x16_t* const p1, uint8x16_t* const p0,
                                 uint8x16_t* const q0, uint8x16_t* const q1) {
  const uint32x4_t zero = vdupq_n_u32(0);
  uint32x4x4_t in;
  INIT_VECTOR4(in, zero, zero, zero, zero);
  src -= 2;
  LOADQ_LANE_32b(in.val[0], 0);
  LOADQ_LANE_32b(in.val[1], 0);
  LOADQ_LANE_32b(in.val[2], 0);
  LOADQ_LANE_32b(in.val[3], 0);
  LOADQ_LANE_32b(in.val[0], 1);
  LOADQ_LANE_32b(in.val[1], 1);
  LOADQ_LANE_32b(in.val[2], 1);
  LOADQ_LANE_32b(in.val[3], 1);
  LOADQ_LANE_32b(in.val[0], 2);
  LOADQ_LANE_32b(in.val[1], 2);
  LOADQ_LANE_32b(in.val[2], 2);
  LOADQ_LANE_32b(in.val[3], 2);
  LOADQ_LANE_32b(in.val[0], 3);
  LOADQ_LANE_32b(in.val[1], 3);
  LOADQ_LANE_32b(in.val[2], 3);
  LOADQ_LANE_32b(in.val[3], 3);
  // Transpose four 4x4 parts:
  {
    const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),
                                        vreinterpretq_u8_u32(in.val[1]));
    const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),
                                        vreinterpretq_u8_u32(in.val[3]));
    const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
                                         vreinterpretq_u16_u8(row23.val[0]));
    const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
                                         vreinterpretq_u16_u8(row23.val[1]));
    *p1 = vreinterpretq_u8_u16(row02.val[0]);
    *p0 = vreinterpretq_u8_u16(row13.val[0]);
    *q0 = vreinterpretq_u8_u16(row02.val[1]);
    *q1 = vreinterpretq_u8_u16(row13.val[1]);
  }
}
#undef LOADQ_LANE_32b

#endif  // !WORK_AROUND_GCC

static WEBP_INLINE void Load8x16(const uint8_t* const src, int stride,
                                 uint8x16_t* const p3, uint8x16_t* const p2,
                                 uint8x16_t* const p1, uint8x16_t* const p0,
                                 uint8x16_t* const q0, uint8x16_t* const q1,
                                 uint8x16_t* const q2, uint8x16_t* const q3) {
  Load4x16(src - 2, stride, p3, p2, p1, p0);
  Load4x16(src + 2, stride, q0, q1, q2, q3);
}

static WEBP_INLINE void Load16x4(const uint8_t* const src, int stride,
                                 uint8x16_t* const p1, uint8x16_t* const p0,
                                 uint8x16_t* const q0, uint8x16_t* const q1) {
  *p1 = vld1q_u8(src - 2 * stride);
  *p0 = vld1q_u8(src - 1 * stride);
  *q0 = vld1q_u8(src + 0 * stride);
  *q1 = vld1q_u8(src + 1 * stride);
}

static WEBP_INLINE void Load16x8(const uint8_t* const src, int stride,
                                 uint8x16_t* const p3, uint8x16_t* const p2,
                                 uint8x16_t* const p1, uint8x16_t* const p0,
                                 uint8x16_t* const q0, uint8x16_t* const q1,
                                 uint8x16_t* const q2, uint8x16_t* const q3) {
  Load16x4(src - 2 * stride, stride, p3, p2, p1, p0);
  Load16x4(src + 2 * stride, stride, q0, q1, q2, q3);
}

static WEBP_INLINE void Load8x8x2(const uint8_t* const u,
                                  const uint8_t* const v,
                                  int stride,
                                  uint8x16_t* const p3, uint8x16_t* const p2,
                                  uint8x16_t* const p1, uint8x16_t* const p0,
                                  uint8x16_t* const q0, uint8x16_t* const q1,
                                  uint8x16_t* const q2, uint8x16_t* const q3) {
  // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
  // and the v-samples on the higher half.
  *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
  *p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));
  *p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));
  *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));
  *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));
  *q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));
  *q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));
  *q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));
}

#if !defined(WORK_AROUND_GCC)

#define LOAD_UV_8(ROW) \
  vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))

static WEBP_INLINE void Load8x8x2T(const uint8_t* const u,
                                   const uint8_t* const v,
                                   int stride,
                                   uint8x16_t* const p3, uint8x16_t* const p2,
                                   uint8x16_t* const p1, uint8x16_t* const p0,
                                   uint8x16_t* const q0, uint8x16_t* const q1,
                                   uint8x16_t* const q2, uint8x16_t* const q3) {
  // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
  // and the v-samples on the higher half.
  const uint8x16_t row0 = LOAD_UV_8(0);
  const uint8x16_t row1 = LOAD_UV_8(1);
  const uint8x16_t row2 = LOAD_UV_8(2);
  const uint8x16_t row3 = LOAD_UV_8(3);
  const uint8x16_t row4 = LOAD_UV_8(4);
  const uint8x16_t row5 = LOAD_UV_8(5);
  const uint8x16_t row6 = LOAD_UV_8(6);
  const uint8x16_t row7 = LOAD_UV_8(7);
  // Perform two side-by-side 8x8 transposes
  // u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07
  // u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ...
  // u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...
  // u30 u31 u32 u33 u34 u35 u36 u37 | ...
  // u40 u41 u42 u43 u44 u45 u46 u47 | ...
  // u50 u51 u52 u53 u54 u55 u56 u57 | ...
  // u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...
  // u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...
  const uint8x16x2_t row01 = vtrnq_u8(row0, row1);  // u00 u10 u02 u12 ...
                                                    // u01 u11 u03 u13 ...
  const uint8x16x2_t row23 = vtrnq_u8(row2, row3);  // u20 u30 u22 u32 ...
                                                    // u21 u31 u23 u33 ...
  const uint8x16x2_t row45 = vtrnq_u8(row4, row5);  // ...
  const uint8x16x2_t row67 = vtrnq_u8(row6, row7);  // ...
  const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
                                       vreinterpretq_u16_u8(row23.val[0]));
  const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
                                       vreinterpretq_u16_u8(row23.val[1]));
  const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),
                                       vreinterpretq_u16_u8(row67.val[0]));
  const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),
                                       vreinterpretq_u16_u8(row67.val[1]));
  const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),
                                       vreinterpretq_u32_u16(row46.val[0]));
  const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),
                                       vreinterpretq_u32_u16(row46.val[1]));
  const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),
                                       vreinterpretq_u32_u16(row57.val[0]));
  const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),
                                       vreinterpretq_u32_u16(row57.val[1]));
  *p3 = vreinterpretq_u8_u32(row04.val[0]);
  *p2 = vreinterpretq_u8_u32(row15.val[0]);
  *p1 = vreinterpretq_u8_u32(row26.val[0]);
  *p0 = vreinterpretq_u8_u32(row37.val[0]);
  *q0 = vreinterpretq_u8_u32(row04.val[1]);
  *q1 = vreinterpretq_u8_u32(row15.val[1]);
  *q2 = vreinterpretq_u8_u32(row26.val[1]);
  *q3 = vreinterpretq_u8_u32(row37.val[1]);
}
#undef LOAD_UV_8

#endif  // !WORK_AROUND_GCC

static WEBP_INLINE void Store2x8(const uint8x8x2_t v,
                                 uint8_t* const dst, int stride) {
  vst2_lane_u8(dst + 0 * stride, v, 0);
  vst2_lane_u8(dst + 1 * stride, v, 1);
  vst2_lane_u8(dst + 2 * stride, v, 2);
  vst2_lane_u8(dst + 3 * stride, v, 3);
  vst2_lane_u8(dst + 4 * stride, v, 4);
  vst2_lane_u8(dst + 5 * stride, v, 5);
  vst2_lane_u8(dst + 6 * stride, v, 6);
  vst2_lane_u8(dst + 7 * stride, v, 7);
}

static WEBP_INLINE void Store2x16(const uint8x16_t p0, const uint8x16_t q0,
                                  uint8_t* const dst, int stride) {
  uint8x8x2_t lo, hi;
  lo.val[0] = vget_low_u8(p0);
  lo.val[1] = vget_low_u8(q0);
  hi.val[0] = vget_high_u8(p0);
  hi.val[1] = vget_high_u8(q0);
  Store2x8(lo, dst - 1 + 0 * stride, stride);
  Store2x8(hi, dst - 1 + 8 * stride, stride);
}

#if !defined(WORK_AROUND_GCC)
static WEBP_INLINE void Store4x8(const uint8x8x4_t v,
                                 uint8_t* const dst, int stride) {
  vst4_lane_u8(dst + 0 * stride, v, 0);
  vst4_lane_u8(dst + 1 * stride, v, 1);
  vst4_lane_u8(dst + 2 * stride, v, 2);
  vst4_lane_u8(dst + 3 * stride, v, 3);
  vst4_lane_u8(dst + 4 * stride, v, 4);
  vst4_lane_u8(dst + 5 * stride, v, 5);
  vst4_lane_u8(dst + 6 * stride, v, 6);
  vst4_lane_u8(dst + 7 * stride, v, 7);
}

static WEBP_INLINE void Store4x16(const uint8x16_t p1, const uint8x16_t p0,
                                  const uint8x16_t q0, const uint8x16_t q1,
                                  uint8_t* const dst, int stride) {
  uint8x8x4_t lo, hi;
  INIT_VECTOR4(lo,
               vget_low_u8(p1), vget_low_u8(p0),
               vget_low_u8(q0), vget_low_u8(q1));
  INIT_VECTOR4(hi,
               vget_high_u8(p1), vget_high_u8(p0),
               vget_high_u8(q0), vget_high_u8(q1));
  Store4x8(lo, dst - 2 + 0 * stride, stride);
  Store4x8(hi, dst - 2 + 8 * stride, stride);
}
#endif  // !WORK_AROUND_GCC

static WEBP_INLINE void Store16x2(const uint8x16_t p0, const uint8x16_t q0,
                                  uint8_t* const dst, int stride) {
  vst1q_u8(dst - stride, p0);
  vst1q_u8(dst, q0);
}

static WEBP_INLINE void Store16x4(const uint8x16_t p1, const uint8x16_t p0,
                                  const uint8x16_t q0, const uint8x16_t q1,
                                  uint8_t* const dst, int stride) {
  Store16x2(p1, p0, dst - stride, stride);
  Store16x2(q0, q1, dst + stride, stride);
}

static WEBP_INLINE void Store8x2x2(const uint8x16_t p0, const uint8x16_t q0,
                                   uint8_t* const u, uint8_t* const v,
                                   int stride) {
  // p0 and q0 contain the u+v samples packed in low/high halves.
  vst1_u8(u - stride, vget_low_u8(p0));
  vst1_u8(u,          vget_low_u8(q0));
  vst1_u8(v - stride, vget_high_u8(p0));
  vst1_u8(v,          vget_high_u8(q0));
}

static WEBP_INLINE void Store8x4x2(const uint8x16_t p1, const uint8x16_t p0,
                                   const uint8x16_t q0, const uint8x16_t q1,
                                   uint8_t* const u, uint8_t* const v,
                                   int stride) {
  // The p1...q1 registers contain the u+v samples packed in low/high halves.
  Store8x2x2(p1, p0, u - stride, v - stride, stride);
  Store8x2x2(q0, q1, u + stride, v + stride, stride);
}

#if !defined(WORK_AROUND_GCC)

#define STORE6_LANE(DST, VAL0, VAL1, LANE) do {   \
  vst3_lane_u8((DST) - 3, (VAL0), (LANE));        \
  vst3_lane_u8((DST) + 0, (VAL1), (LANE));        \
  (DST) += stride;                                \
} while (0)

static WEBP_INLINE void Store6x8x2(const uint8x16_t p2, const uint8x16_t p1,
                                   const uint8x16_t p0, const uint8x16_t q0,
                                   const uint8x16_t q1, const uint8x16_t q2,
                                   uint8_t* u, uint8_t* v,
                                   int stride) {
  uint8x8x3_t u0, u1, v0, v1;
  INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
  INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
  INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));
  INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));
  STORE6_LANE(u, u0, u1, 0);
  STORE6_LANE(u, u0, u1, 1);
  STORE6_LANE(u, u0, u1, 2);
  STORE6_LANE(u, u0, u1, 3);
  STORE6_LANE(u, u0, u1, 4);
  STORE6_LANE(u, u0, u1, 5);
  STORE6_LANE(u, u0, u1, 6);
  STORE6_LANE(u, u0, u1, 7);
  STORE6_LANE(v, v0, v1, 0);
  STORE6_LANE(v, v0, v1, 1);
  STORE6_LANE(v, v0, v1, 2);
  STORE6_LANE(v, v0, v1, 3);
  STORE6_LANE(v, v0, v1, 4);
  STORE6_LANE(v, v0, v1, 5);
  STORE6_LANE(v, v0, v1, 6);
  STORE6_LANE(v, v0, v1, 7);
}
#undef STORE6_LANE

static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0,
                                   const uint8x16_t q0, const uint8x16_t q1,
                                   uint8_t* const u, uint8_t* const v,
                                   int stride) {
  uint8x8x4_t u0, v0;
  INIT_VECTOR4(u0,
               vget_low_u8(p1), vget_low_u8(p0),
               vget_low_u8(q0), vget_low_u8(q1));
  INIT_VECTOR4(v0,
               vget_high_u8(p1), vget_high_u8(p0),
               vget_high_u8(q0), vget_high_u8(q1));
  vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
  vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
  vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
  vst4_lane_u8(u - 2 + 3 * stride, u0, 3);
  vst4_lane_u8(u - 2 + 4 * stride, u0, 4);
  vst4_lane_u8(u - 2 + 5 * stride, u0, 5);
  vst4_lane_u8(u - 2 + 6 * stride, u0, 6);
  vst4_lane_u8(u - 2 + 7 * stride, u0, 7);
  vst4_lane_u8(v - 2 + 0 * stride, v0, 0);
  vst4_lane_u8(v - 2 + 1 * stride, v0, 1);
  vst4_lane_u8(v - 2 + 2 * stride, v0, 2);
  vst4_lane_u8(v - 2 + 3 * stride, v0, 3);
  vst4_lane_u8(v - 2 + 4 * stride, v0, 4);
  vst4_lane_u8(v - 2 + 5 * stride, v0, 5);
  vst4_lane_u8(v - 2 + 6 * stride, v0, 6);
  vst4_lane_u8(v - 2 + 7 * stride, v0, 7);
}

#endif  // !WORK_AROUND_GCC

// Zero extend 'v' to an int16x8_t.
static WEBP_INLINE int16x8_t ConvertU8ToS16(uint8x8_t v) {
  return vreinterpretq_s16_u16(vmovl_u8(v));
}

// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
// to the corresponding rows of 'dst'.
static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
                                            const int16x8_t dst01,
                                            const int16x8_t dst23) {
  // Unsigned saturate to 8b.
  const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
  const uint8x8_t dst23_u8 = vqmovun_s16(dst23);

  // Store the results.
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
}

static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
                               uint8_t* const dst) {
  uint32x2_t dst01 = vdup_n_u32(0);
  uint32x2_t dst23 = vdup_n_u32(0);

  // Load the source pixels.
  dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
  dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
  dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
  dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);

  {
    // Convert to 16b.
    const int16x8_t dst01_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst01));
    const int16x8_t dst23_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst23));

    // Descale with rounding.
    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
    // Add the inverse transform.
    SaturateAndStore4x4(dst, out01, out23);
  }
}

//-----------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)

static uint8x16_t NeedsFilter(const uint8x16_t p1, const uint8x16_t p0,
                              const uint8x16_t q0, const uint8x16_t q1,
                              int thresh) {
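  // Simple-filter decision from the VP8 spec (paragraph 15.2):
  // filter the edge iff 2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= thresh.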
  const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);
  const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0);               // abs(p0-q0)
  const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1);               // abs(p1-q1)
  const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0);  // 2 * abs(p0-q0)
  const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1);       // abs(p1-q1) / 2
  const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2);
  const uint8x16_t mask = vcgeq_u8(thresh_v, sum);
  return mask;
}

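// Samples are biased from [0..255] to [-128..127] by toggling the sign bit,
// so that the filter deltas below can use saturating signed arithmetic.
// FlipSignBack() undoes the bias.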
static int8x16_t FlipSign(const uint8x16_t v) {
  const uint8x16_t sign_bit = vdupq_n_u8(0x80);
  return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
}

static uint8x16_t FlipSignBack(const int8x16_t v) {
  const int8x16_t sign_bit = vdupq_n_s8(0x80);
  return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
}

static int8x16_t GetBaseDelta(const int8x16_t p1, const int8x16_t p0,
                              const int8x16_t q0, const int8x16_t q1) {
  const int8x16_t q0_p0 = vqsubq_s8(q0, p0);   // (q0-p0)
  const int8x16_t p1_q1 = vqsubq_s8(p1, q1);   // (p1-q1)
  const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0);  // (p1-q1) + 1 * (q0 - p0)
  const int8x16_t s2 = vqaddq_s8(q0_p0, s1);     // (p1-q1) + 2 * (q0 - p0)
  const int8x16_t s3 = vqaddq_s8(q0_p0, s2);     // (p1-q1) + 3 * (q0 - p0)
  return s3;
}

static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) {
  const int8x16_t q0_p0 = vqsubq_s8(q0, p0);     // (q0-p0)
  const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0);  // 2 * (q0 - p0)
  const int8x16_t s2 = vqaddq_s8(q0_p0, s1);     // 3 * (q0 - p0)
  return s2;
}

//------------------------------------------------------------------------------

static void ApplyFilter2NoFlip(const int8x16_t p0s, const int8x16_t q0s,
                               const int8x16_t delta,
                               int8x16_t* const op0, int8x16_t* const oq0) {
  const int8x16_t kCst3 = vdupq_n_s8(0x03);
  const int8x16_t kCst4 = vdupq_n_s8(0x04);
  const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
  const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
  const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
  const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
  *op0 = vqaddq_s8(p0s, delta3);
  *oq0 = vqsubq_s8(q0s, delta4);
}

#if defined(WEBP_USE_INTRINSICS)

static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s,
                         const int8x16_t delta,
                         uint8x16_t* const op0, uint8x16_t* const oq0) {
  const int8x16_t kCst3 = vdupq_n_s8(0x03);
  const int8x16_t kCst4 = vdupq_n_s8(0x04);
  const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
  const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
  const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
  const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
  const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
  const int8x16_t sq0 = vqsubq_s8(q0s, delta4);
  *op0 = FlipSignBack(sp0);
  *oq0 = FlipSignBack(sq0);
}

static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0,
                      const uint8x16_t q0, const uint8x16_t q1,
                      const uint8x16_t mask,
                      uint8x16_t* const op0, uint8x16_t* const oq0) {
  const int8x16_t p1s = FlipSign(p1);
  const int8x16_t p0s = FlipSign(p0);
  const int8x16_t q0s = FlipSign(q0);
  const int8x16_t q1s = FlipSign(q1);
  const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
  const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
  ApplyFilter2(p0s, q0s, delta1, op0, oq0);
}

static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
  uint8x16_t p1, p0, q0, q1, op0, oq0;
  Load16x4(p, stride, &p1, &p0, &q0, &q1);
  {
    const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
    DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
  }
  Store16x2(op0, oq0, p, stride);
}

static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
  uint8x16_t p1, p0, q0, q1, oq0, op0;
  Load4x16(p, stride, &p1, &p0, &q0, &q1);
  {
    const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
    DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
  }
  Store2x16(op0, oq0, p, stride);
}

#else

#define QRegs "q0", "q1", "q2", "q3",  \
              "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

#define FLIP_SIGN_BIT2(a, b, s)      \
  "veor " #a "," #a "," #s " \n"     \
  "veor " #b "," #b "," #s " \n"     \

#define FLIP_SIGN_BIT4(a, b, c, d, s)  \
  FLIP_SIGN_BIT2(a, b, s)              \
  FLIP_SIGN_BIT2(c, d, s)              \

#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask)                          \
  "vabd.u8 q15," #p0 "," #q0 " \n"   /* abs(p0 - q0) */                     \
  "vabd.u8 q14," #p1 "," #q1 " \n"   /* abs(p1 - q1) */                     \
  "vqadd.u8 q15, q15, q15 \n"        /* abs(p0 - q0) * 2 */                 \
  "vshr.u8 q14, q14, #1 \n"          /* abs(p1 - q1) / 2 */                 \
  "vqadd.u8 q15, q15, q14 \n"     /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
  "vdup.8 q14, " #thresh " \n"                                              \
  "vcge.u8 " #mask ", q14, q15 \n"   /* mask = (sum <= thresh) */

#define GET_BASE_DELTA(p1, p0, q0, q1, o)                                   \
  "vqsub.s8 q15," #q0 "," #p0 " \n"          /* (q0 - p0) */                \
  "vqsub.s8 " #o "," #p1 "," #q1 " \n"       /* (p1 - q1) */                \
  "vqadd.s8 " #o "," #o ", q15 \n"           /* (p1 - q1) + 1 * (q0 - p0) */\
  "vqadd.s8 " #o "," #o ", q15 \n"           /* (p1 - q1) + 2 * (q0 - p0) */\
  "vqadd.s8 " #o "," #o ", q15 \n"           /* (p1 - q1) + 3 * (q0 - p0) */

#define DO_SIMPLE_FILTER(p0, q0, fl)                                        \
  "vmov.i8 q15, #0x03 \n"                                                   \
  "vqadd.s8 q15, q15, " #fl " \n"            /* filter1 = filter + 3 */     \
  "vshr.s8 q15, q15, #3 \n"                  /* filter1 >> 3 */             \
  "vqadd.s8 " #p0 "," #p0 ", q15 \n"         /* p0 += filter1 */            \
                                                                            \
  "vmov.i8 q15, #0x04 \n"                                                   \
  "vqadd.s8 q15, q15, " #fl " \n"            /* filter2 = filter + 4 */     \
  "vshr.s8 q15, q15, #3 \n"                  /* filter2 >> 3 */             \
  "vqsub.s8 " #q0 "," #q0 ", q15 \n"         /* q0 -= filter2 */

// Applies filter on 2 pixels (p0 and q0)
#define DO_FILTER2(p1, p0, q0, q1, thresh)                                  \
  NEEDS_FILTER(p1, p0, q0, q1, thresh, q9)   /* filter mask in q9 */        \
  "vmov.i8 q10, #0x80 \n"                    /* sign bit */                 \
  FLIP_SIGN_BIT4(p1, p0, q0, q1, q10)        /* convert to signed value */  \
  GET_BASE_DELTA(p1, p0, q0, q1, q11)        /* get filter level */         \
  "vand q9, q9, q11 \n"                      /* apply filter mask */        \
  DO_SIMPLE_FILTER(p0, q0, q9)               /* apply filter */             \
  FLIP_SIGN_BIT2(p0, q0, q10)

static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
  __asm__ volatile (
    "sub %[p], %[p], %[stride], lsl #1 \n"  // p -= 2 * stride

    "vld1.u8 {q1}, [%[p]], %[stride] \n"    // p1
    "vld1.u8 {q2}, [%[p]], %[stride] \n"    // p0
    "vld1.u8 {q3}, [%[p]], %[stride] \n"    // q0
    "vld1.u8 {q12}, [%[p]] \n"              // q1

    DO_FILTER2(q1, q2, q3, q12, %[thresh])

    "sub %[p], %[p], %[stride], lsl #1 \n"  // p -= 2 * stride

    "vst1.u8 {q2}, [%[p]], %[stride] \n"    // store op0
    "vst1.u8 {q3}, [%[p]] \n"               // store oq0
    : [p] "+r"(p)
    : [stride] "r"(stride), [thresh] "r"(thresh)
    : "memory", QRegs
  );
}

static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
  __asm__ volatile (
    "sub r4, %[p], #2 \n"                   // base1 = p - 2
    "lsl r6, %[stride], #1 \n"              // r6 = 2 * stride
    "add r5, r4, %[stride] \n"              // base2 = base1 + stride

    LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
    LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
    "vswp d3, d24 \n"                       // p1:q1 p0:q3
    "vswp d5, d26 \n"                       // q0:q2 q1:q4
    "vswp q2, q12 \n"                       // p1:q1 p0:q2 q0:q3 q1:q4

    DO_FILTER2(q1, q2, q12, q13, %[thresh])

    "sub %[p], %[p], #1 \n"                 // p - 1

    "vswp d5, d24 \n"
    STORE8x2(d4, d5, [%[p]], %[stride])
    STORE8x2(d24, d25, [%[p]], %[stride])

    : [p] "+r"(p)
    : [stride] "r"(stride), [thresh] "r"(thresh)
    : "memory", "r4", "r5", "r6", QRegs
  );
}

#endif  // WEBP_USE_INTRINSICS

static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
  uint32_t k;
  for (k = 3; k != 0; --k) {
    p += 4 * stride;
    SimpleVFilter16(p, stride, thresh);
  }
}

static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
  uint32_t k;
  for (k = 3; k != 0; --k) {
    p += 4;
    SimpleHFilter16(p, stride, thresh);
  }
}

//------------------------------------------------------------------------------
// Complex In-loop filtering (Paragraph 15.3)

static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0,
                           const uint8x16_t q0, const uint8x16_t q1,
                           int hev_thresh) {
  const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);
  const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
  const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
  const uint8x16_t a_max = vmaxq_u8(a_p1_p0, a_q1_q0);
  const uint8x16_t mask = vcgtq_u8(a_max, hev_thresh_v);
  return mask;
}

static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2,
                               const uint8x16_t p1, const uint8x16_t p0,
                               const uint8x16_t q0, const uint8x16_t q1,
                               const uint8x16_t q2, const uint8x16_t q3,
                               int ithresh, int thresh) {
  const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);
  const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2);  // abs(p3 - p2)
  const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1);  // abs(p2 - p1)
  const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
  const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2);  // abs(q3 - q2)
  const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1);  // abs(q2 - q1)
  const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
  const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1);
  const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2);
  const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0);
  const uint8x16_t max12 = vmaxq_u8(max1, max2);
  const uint8x16_t max123 = vmaxq_u8(max12, max3);
  const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);
  const uint8x16_t mask1 = NeedsFilter(p1, p0, q0, q1, thresh);
  const uint8x16_t mask = vandq_u8(mask1, mask2);
  return mask;
}

// 4-points filter

static void ApplyFilter4(
    const int8x16_t p1, const int8x16_t p0,
    const int8x16_t q0, const int8x16_t q1,
    const int8x16_t delta0,
    uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1) {
  const int8x16_t kCst3 = vdupq_n_s8(0x03);
  const int8x16_t kCst4 = vdupq_n_s8(0x04);
  const int8x16_t delta1 = vqaddq_s8(delta0, kCst4);
  const int8x16_t delta2 = vqaddq_s8(delta0, kCst3);
  const int8x16_t a1 = vshrq_n_s8(delta1, 3);
  const int8x16_t a2 = vshrq_n_s8(delta2, 3);
  const int8x16_t a3 = vrshrq_n_s8(a1, 1);   // a3 = (a1 + 1) >> 1
  *op0 = FlipSignBack(vqaddq_s8(p0, a2));    // clip(p0 + a2)
  *oq0 = FlipSignBack(vqsubq_s8(q0, a1));    // clip(q0 - a1)
  *op1 = FlipSignBack(vqaddq_s8(p1, a3));    // clip(p1 + a3)
  *oq1 = FlipSignBack(vqsubq_s8(q1, a3));    // clip(q1 - a3)
}

static void DoFilter4(
    const uint8x16_t p1, const uint8x16_t p0,
    const uint8x16_t q0, const uint8x16_t q1,
    const uint8x16_t mask, const uint8x16_t hev_mask,
    uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1) {
  // This is a fused version of DoFilter2() calling ApplyFilter2 directly
  const int8x16_t p1s = FlipSign(p1);
  int8x16_t p0s = FlipSign(p0);
  int8x16_t q0s = FlipSign(q0);
  const int8x16_t q1s = FlipSign(q1);
  const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);

  // do_filter2 part (simple loopfilter on pixels with hev)
  {
    const int8x16_t delta = GetBaseDelta(p1s, p0s, q0s, q1s);
    const int8x16_t simple_lf_delta =
        vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
    ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s);
  }

  // do_filter4 part (complex loopfilter on pixels without hev)
  {
    const int8x16_t delta0 = GetBaseDelta0(p0s, q0s);
    // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
    const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
    const int8x16_t complex_lf_delta =
        vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
    ApplyFilter4(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
  }
}

// 6-points filter

static void ApplyFilter6(
    const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
    const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
    const int8x16_t delta,
    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
  // We have to compute: X = (9*a+63) >> 7, Y = (18*a+63)>>7, Z = (27*a+63) >> 7
  // Turns out, there's a common sub-expression S=9 * a - 1 that can be used
  // with the special vqrshrn_n_s16 rounding-shift-and-narrow instruction:
  // X = (S + 64) >> 7, Y = (S + 32) >> 6, Z = (18 * a + S + 64) >> 7
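  // (Note: (S + 32) >> 6 == (18 * a + 62) >> 7 == (18 * a + 63) >> 7,
  // since 18 * a is always even.)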
  const int8x8_t delta_lo = vget_low_s8(delta);
  const int8x8_t delta_hi = vget_high_s8(delta);
  const int8x8_t kCst9 = vdup_n_s8(9);
  const int16x8_t kCstm1 = vdupq_n_s16(-1);
  const int8x8_t kCst18 = vdup_n_s8(18);
  const int16x8_t S_lo = vmlal_s8(kCstm1, kCst9, delta_lo);  // S = 9 * a - 1
  const int16x8_t S_hi = vmlal_s8(kCstm1, kCst9, delta_hi);
  const int16x8_t Z_lo = vmlal_s8(S_lo, kCst18, delta_lo);   // S + 18 * a
  const int16x8_t Z_hi = vmlal_s8(S_hi, kCst18, delta_hi);
  const int8x8_t a3_lo = vqrshrn_n_s16(S_lo, 7);   // (9 * a + 63) >> 7
  const int8x8_t a3_hi = vqrshrn_n_s16(S_hi, 7);
  const int8x8_t a2_lo = vqrshrn_n_s16(S_lo, 6);   // (9 * a + 31) >> 6
  const int8x8_t a2_hi = vqrshrn_n_s16(S_hi, 6);
  const int8x8_t a1_lo = vqrshrn_n_s16(Z_lo, 7);   // (27 * a + 63) >> 7
  const int8x8_t a1_hi = vqrshrn_n_s16(Z_hi, 7);
  const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
  const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
  const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);

  *op0 = FlipSignBack(vqaddq_s8(p0, a1));  // clip(p0 + a1)
  *oq0 = FlipSignBack(vqsubq_s8(q0, a1));  // clip(q0 - a1)
  *oq1 = FlipSignBack(vqsubq_s8(q1, a2));  // clip(q1 - a2)
  *op1 = FlipSignBack(vqaddq_s8(p1, a2));  // clip(p1 + a2)
  *oq2 = FlipSignBack(vqsubq_s8(q2, a3));  // clip(q2 - a3)
  *op2 = FlipSignBack(vqaddq_s8(p2, a3));  // clip(p2 + a3)
}

static void DoFilter6(
    const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
    const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
    const uint8x16_t mask, const uint8x16_t hev_mask,
    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
  // This is a fused version of DoFilter2() calling ApplyFilter2 directly
  const int8x16_t p2s = FlipSign(p2);
  const int8x16_t p1s = FlipSign(p1);
  int8x16_t p0s = FlipSign(p0);
  int8x16_t q0s = FlipSign(q0);
  const int8x16_t q1s = FlipSign(q1);
  const int8x16_t q2s = FlipSign(q2);
  const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
  const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);

  // do_filter2 part (simple loopfilter on pixels with hev)
  {
    const int8x16_t simple_lf_delta =
        vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
    ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s);
  }

  // do_filter6 part (complex loopfilter on pixels without hev)
  {
    // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
    const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
    const int8x16_t complex_lf_delta =
        vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
    ApplyFilter6(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
                 op2, op1, op0, oq0, oq1, oq2);
  }
}

// on macroblock edges

static void VFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load16x8(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                         ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
              &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store16x2(op2, op1, p - 2 * stride, stride);
    Store16x2(op0, oq0, p + 0 * stride, stride);
    Store16x2(oq1, oq2, p + 2 * stride, stride);
  }
}

static void HFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x16(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                         ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
              &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store2x16(op2, op1, p - 2, stride);
    Store2x16(op0, oq0, p + 0, stride);
    Store2x16(oq1, oq2, p + 2, stride);
  }
}

// on three inner edges
static void VFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  uint32_t k;
  uint8x16_t p3, p2, p1, p0;
  Load16x4(p + 2 * stride, stride, &p3, &p2, &p1, &p0);
  for (k = 3; k != 0; --k) {
    uint8x16_t q0, q1, q2, q3;
    p += 4 * stride;
    Load16x4(p + 2 * stride, stride, &q0, &q1, &q2, &q3);
    {
      const uint8x16_t mask =
          NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
      const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
      // p3 and p2 are not just temporary variables here: they will be
      // re-used for the next span. And q2/q3 will become p1/p0 accordingly.
      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
      Store16x4(p1, p0, p3, p2, p, stride);
      p1 = q2;
      p0 = q3;
    }
  }
}

#if !defined(WORK_AROUND_GCC)
static void HFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  uint32_t k;
  uint8x16_t p3, p2, p1, p0;
  Load4x16(p + 2, stride, &p3, &p2, &p1, &p0);
  for (k = 3; k != 0; --k) {
    uint8x16_t q0, q1, q2, q3;
    p += 4;
    Load4x16(p + 2, stride, &q0, &q1, &q2, &q3);
    {
      const uint8x16_t mask =
          NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
      const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
      Store4x16(p1, p0, p3, p2, p, stride);
      p1 = q2;
      p0 = q3;
    }
  }
}
#endif  // !WORK_AROUND_GCC

// 8-pixels wide variant, for chroma filtering
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                         ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
              &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store8x2x2(op2, op1, u - 2 * stride, v - 2 * stride, stride);
    Store8x2x2(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
    Store8x2x2(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
  }
}

static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  u += 4 * stride;
  v += 4 * stride;
  Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                         ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op1, op0, oq0, oq1;
    DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
    Store8x4x2(op1, op0, oq0, oq1, u, v, stride);
  }
}

#if !defined(WORK_AROUND_GCC)
static void HFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                         ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
              &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store6x8x2(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
  }
}

static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  u += 4;
  v += 4;
  Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                         ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op1, op0, oq0, oq1;
    DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
    Store4x8x2(op1, op0, oq0, oq1, u, v, stride);
  }
}
#endif  // !WORK_AROUND_GCC

//-----------------------------------------------------------------------------
// Inverse transforms (Paragraph 14.4)

// Technically these are unsigned but vqdmulh is only available in signed.
// vqdmulh returns high half (effectively >> 16) but also doubles the value,
// changing the >> 16 to >> 15 and requiring an additional >> 1.
// We use this to our advantage with kC2. The canonical value is 35468.
// However, the high bit is set so treating it as signed will give incorrect
// results. We avoid this by down shifting by 1 here to clear the highest bit.
// Combined with the doubling effect of vqdmulh we get >> 16.
// This can not be applied to kC1 because the lowest bit is set. Down shifting
// the constant would reduce precision.
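// Concretely: kC2 below stores 35468 >> 1 = 17734, and
// vqdmulhq_n_s16(x, 17734) yields (2 * x * 17734) >> 16 == (x * 35468) >> 16.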

// libwebp uses a trick to avoid some extra addition that libvpx does.
// Instead of:
// temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
// libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
// same issue with kC1 and vqdmulh that we work around by down shifting kC2

static const int16_t kC1 = 20091;
static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.

#if defined(WEBP_USE_INTRINSICS)
static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
                                     int16x8x2_t* const out) {
  // a0 a1 a2 a3 | b0 b1 b2 b3   =>   a0 b0 c0 d0 | a1 b1 c1 d1
  // c0 c1 c2 c3 | d0 d1 d2 d3        a2 b2 c2 d2 | a3 b3 c3 d3
  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
                                                  // b0 d0 b1 d1 b2 d2 ...
  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
}

static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
  // {rows} = in0 | in4
  //          in8 | in12
  // B1 = in4 | in12
  const int16x8_t B1 =
      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
  // C0 = kC1 * in4 | kC1 * in12
  // C1 = kC2 * in4 | kC2 * in12
  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));  // in0 + in8
  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));  // in0 - in8
  // c = kC2 * in4 - kC1 * in12
  // d = kC1 * in4 + kC2 * in12
  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
  const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
  const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
  Transpose8x2(E0, E1, rows);
}

static void TransformOne(const int16_t* in, uint8_t* dst) {
  int16x8x2_t rows;
  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
  TransformPass(&rows);
  TransformPass(&rows);
  Add4x4(rows.val[0], rows.val[1], dst);
}

#else

static void TransformOne(const int16_t* in, uint8_t* dst) {
  const int kBPS = BPS;
  // kC1, kC2. Padded because vld1.16 loads 8 bytes
  const int16_t constants[4] = { kC1, kC2, 0, 0 };
  /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
  __asm__ volatile (
    "vld1.16 {q1, q2}, [%[in]] \n"
    "vld1.16 {d0}, [%[constants]] \n"

    /* d2: in[0]
     * d3: in[8]
     * d4: in[4]
     * d5: in[12]
     */
    "vswp d3, d4 \n"

    /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
     * q9 = {in[4], in[12]} * kC2 >> 16
     */
    "vqdmulh.s16 q8, q2, d0[0] \n"
    "vqdmulh.s16 q9, q2, d0[1] \n"

    /* d22 = a = in[0] + in[8]
     * d23 = b = in[0] - in[8]
     */
    "vqadd.s16 d22, d2, d3 \n"
    "vqsub.s16 d23, d2, d3 \n"

    /* The multiplication should be x * kC1 >> 16
     * However, with vqdmulh we get x * kC1 * 2 >> 16
     * (multiply, double, return high half)
     * We avoided this in kC2 by pre-shifting the constant.
     * q8 = in[4]/[12] * kC1 >> 16
     */
    "vshr.s16 q8, q8, #1 \n"

    /* Add {in[4], in[12]} back after the multiplication. This is handled by
     * adding 1 << 16 to kC1 in the libwebp C code.
     */
    "vqadd.s16 q8, q2, q8 \n"

    /* d20 = c = in[4]*kC2 - in[12]*kC1
     * d21 = d = in[4]*kC1 + in[12]*kC2
     */
    "vqsub.s16 d20, d18, d17 \n"
    "vqadd.s16 d21, d19, d16 \n"

    /* d2 = tmp[0] = a + d
     * d3 = tmp[1] = b + c
     * d4 = tmp[2] = b - c
     * d5 = tmp[3] = a - d
     */
    "vqadd.s16 d2, d22, d21 \n"
    "vqadd.s16 d3, d23, d20 \n"
    "vqsub.s16 d4, d23, d20 \n"
    "vqsub.s16 d5, d22, d21 \n"

    "vzip.16 q1, q2 \n"
    "vzip.16 q1, q2 \n"

    "vswp d3, d4 \n"

    /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
     * q9 = {tmp[4], tmp[12]} * kC2 >> 16
     */
    "vqdmulh.s16 q8, q2, d0[0] \n"
    "vqdmulh.s16 q9, q2, d0[1] \n"

    /* d22 = a = tmp[0] + tmp[8]
     * d23 = b = tmp[0] - tmp[8]
     */
    "vqadd.s16 d22, d2, d3 \n"
    "vqsub.s16 d23, d2, d3 \n"

    /* See long winded explanations prior */
    "vshr.s16 q8, q8, #1 \n"
    "vqadd.s16 q8, q2, q8 \n"

    /* d20 = c = in[4]*kC2 - in[12]*kC1
     * d21 = d = in[4]*kC1 + in[12]*kC2
     */
    "vqsub.s16 d20, d18, d17 \n"
    "vqadd.s16 d21, d19, d16 \n"

    /* d2 = tmp[0] = a + d
     * d3 = tmp[1] = b + c
     * d4 = tmp[2] = b - c
     * d5 = tmp[3] = a - d
     */
    "vqadd.s16 d2, d22, d21 \n"
    "vqadd.s16 d3, d23, d20 \n"
    "vqsub.s16 d4, d23, d20 \n"
    "vqsub.s16 d5, d22, d21 \n"

    "vld1.32 d6[0], [%[dst]], %[kBPS] \n"
    "vld1.32 d6[1], [%[dst]], %[kBPS] \n"
    "vld1.32 d7[0], [%[dst]], %[kBPS] \n"
    "vld1.32 d7[1], [%[dst]], %[kBPS] \n"

    "sub %[dst], %[dst], %[kBPS], lsl #2 \n"

    /* (val) + 4 >> 3 */
    "vrshr.s16 d2, d2, #3 \n"
    "vrshr.s16 d3, d3, #3 \n"
    "vrshr.s16 d4, d4, #3 \n"
    "vrshr.s16 d5, d5, #3 \n"

    "vzip.16 q1, q2 \n"
    "vzip.16 q1, q2 \n"

    /* Must accumulate before saturating */
    "vmovl.u8 q8, d6 \n"
    "vmovl.u8 q9, d7 \n"

    "vqadd.s16 q1, q1, q8 \n"
    "vqadd.s16 q2, q2, q9 \n"

    "vqmovun.s16 d0, q1 \n"
    "vqmovun.s16 d1, q2 \n"

    "vst1.32 d0[0], [%[dst]], %[kBPS] \n"
    "vst1.32 d0[1], [%[dst]], %[kBPS] \n"
    "vst1.32 d1[0], [%[dst]], %[kBPS] \n"
    "vst1.32 d1[1], [%[dst]] \n"

    : [in] "+r"(in), [dst] "+r"(dst)               /* modified registers */
    : [kBPS] "r"(kBPS), [constants] "r"(constants) /* constants */
    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  /* clobbered */
  );
}

#endif  // WEBP_USE_INTRINSICS

static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
  TransformOne(in, dst);
  if (do_two) {
    TransformOne(in + 16, dst + 4);
  }
}

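// DC-only case: when only in[0] is non-zero, the whole inverse transform
// reduces to adding the rounded value (in[0] + 4) >> 3 to each of the 4x4
// destination pixels.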
static void TransformDC(const int16_t* in, uint8_t* dst) {
  const int16x8_t DC = vdupq_n_s16(in[0]);
  Add4x4(DC, DC, dst);
}

//------------------------------------------------------------------------------

#define STORE_WHT(dst, col, rows) do {                  \
  *dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \
  *dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \
  *dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \
  *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
} while (0)

static void TransformWHT(const int16_t* in, int16_t* out) {
  int32x4x4_t tmp;

  {
    // Load the source.
    const int16x4_t in00_03 = vld1_s16(in + 0);
    const int16x4_t in04_07 = vld1_s16(in + 4);
    const int16x4_t in08_11 = vld1_s16(in + 8);
    const int16x4_t in12_15 = vld1_s16(in + 12);
    const int32x4_t a0 = vaddl_s16(in00_03, in12_15);  // in[0..3] + in[12..15]
    const int32x4_t a1 = vaddl_s16(in04_07, in08_11);  // in[4..7] + in[8..11]
    const int32x4_t a2 = vsubl_s16(in04_07, in08_11);  // in[4..7] - in[8..11]
    const int32x4_t a3 = vsubl_s16(in00_03, in12_15);  // in[0..3] - in[12..15]
    tmp.val[0] = vaddq_s32(a0, a1);
    tmp.val[1] = vaddq_s32(a3, a2);
    tmp.val[2] = vsubq_s32(a0, a1);
    tmp.val[3] = vsubq_s32(a3, a2);
    // Arrange the temporary results column-wise.
    tmp = Transpose4x4(tmp);
  }

  {
    const int32x4_t kCst3 = vdupq_n_s32(3);
    const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3);  // add rounder
    const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
    const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
    const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
    const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);

    tmp.val[0] = vaddq_s32(a0, a1);
    tmp.val[1] = vaddq_s32(a3, a2);
    tmp.val[2] = vsubq_s32(a0, a1);
    tmp.val[3] = vsubq_s32(a3, a2);

    // right shift the results by 3.
    tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
    tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
    tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
    tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);

    STORE_WHT(out, 0, tmp);
    STORE_WHT(out, 1, tmp);
    STORE_WHT(out, 2, tmp);
    STORE_WHT(out, 3, tmp);
  }
}

#undef STORE_WHT

//------------------------------------------------------------------------------

#define MUL(a, b) (((a) * (b)) >> 16)
static void TransformAC3(const int16_t* in, uint8_t* dst) {
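  // Special-cased transform, used when only the coefficients in[0], in[1] and
  // in[4] can be non-zero, so each pass reduces to the closed form below.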
  static const int kC1_full = 20091 + (1 << 16);
  static const int kC2_full = 35468;
  const int16x4_t A = vld1_dup_s16(in);
  const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full));
  const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full));
  const int c1 = MUL(in[1], kC2_full);
  const int d1 = MUL(in[1], kC1_full);
  const uint64_t cd = (uint64_t)( d1 & 0xffff) <<  0 |
                      (uint64_t)( c1 & 0xffff) << 16 |
                      (uint64_t)(-c1 & 0xffff) << 32 |
                      (uint64_t)(-d1 & 0xffff) << 48;
  const int16x4_t CD = vcreate_s16(cd);
  const int16x4_t B = vqadd_s16(A, CD);
  const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));
  const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
  Add4x4(m0_m1, m2_m3, dst);
}
#undef MUL

//------------------------------------------------------------------------------
// 4x4

static void DC4(uint8_t* dst) {    // DC
  const uint8x8_t A = vld1_u8(dst - BPS);  // top row
  const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
  const uint16x4_t p1 = vpadd_u16(p0, p0);
  const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
  const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
  const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
  const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
  const uint16x8_t s0 = vaddq_u16(L0, L1);
  const uint16x8_t s1 = vaddq_u16(L2, L3);
  const uint16x8_t s01 = vaddq_u16(s0, s1);
  const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);  // (sum + 4) >> 3
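  // Only lane 0 of 'sum' is meaningful: it holds the 4 left + 4 top pixels.
  // The other lanes are don't-care and are discarded by vdup_lane_u8() below.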
  const uint8x8_t dc = vdup_lane_u8(dc0, 0);
  int i;
  for (i = 0; i < 4; ++i) {
    vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0);
  }
}

// TrueMotion (4x4 + 8x8)
static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
  const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
  const uint8x8_t T = vld1_u8(dst - BPS);  // top row 'A[0..3]'
  const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL));  // A[c] - A[-1]
  int y;
  for (y = 0; y < size; y += 4) {
    // left edge
    const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1));
    const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1));
    const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1));
    const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1));
    const int16x8_t r0 = vaddq_s16(L0, d);  // L[r] + A[c] - A[-1]
    const int16x8_t r1 = vaddq_s16(L1, d);
    const int16x8_t r2 = vaddq_s16(L2, d);
    const int16x8_t r3 = vaddq_s16(L3, d);
    // Saturate and store the result.
    const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
    const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
    const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
    const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
    if (size == 4) {
      vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
      vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
      vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
      vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
    } else {
      vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);
      vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);
      vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);
      vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);
    }
    dst += 4 * BPS;
  }
}

static void TM4(uint8_t* dst) { TrueMotion(dst, 4); }

static void VE4(uint8_t* dst) {    // vertical
  // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1));  // top row
  const uint64x1_t A1 = vshr_n_u64(A0, 8);
  const uint64x1_t A2 = vshr_n_u64(A0, 16);
  const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
  const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00);
  const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0);
  int i;
  for (i = 0; i < 4; ++i) {
    vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0);
  }
}

static void RD4(uint8_t* dst) {    // Down-right
  const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
  const uint32_t I = dst[-1 + 0 * BPS];
  const uint32_t J = dst[-1 + 1 * BPS];
  const uint32_t K = dst[-1 + 2 * BPS];
  const uint32_t L = dst[-1 + 3 * BPS];
  const uint64x1_t LKJI____ = vcreate_u64(L | (K << 8) | (J << 16) | (I << 24));
  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
}

static void LD4(uint8_t* dst) {    // Down-left
  // Note using the same shift trick as VE4() is slower here.
  const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
  const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
  const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2);
  const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6);
  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0);
  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
}

1390//------------------------------------------------------------------------------
1391// Chroma
1392
1393static void VE8uv(uint8_t* dst) { // vertical
1394 const uint8x8_t top = vld1_u8(dst - BPS);
1395 int j;
1396 for (j = 0; j < 8; ++j) {
1397 vst1_u8(dst + j * BPS, top);
1398 }
1399}
1400
1401static void HE8uv(uint8_t* dst) { // horizontal
1402 int j;
1403 for (j = 0; j < 8; ++j) {
1404 const uint8x8_t left = vld1_dup_u8(dst - 1);
1405 vst1_u8(dst, left);
1406 dst += BPS;
1407 }
1408}
1409
static WEBP_INLINE void DC8(uint8_t* dst, int do_top, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_top) {
    const uint8x8_t A = vld1_u8(dst - BPS);  // top row
    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    sum_top = vcombine_u16(p2, p2);
  }

  if (do_left) {
    const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
    const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
    const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
    const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
    const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1));
    const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1));
    const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1));
    const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1));
    const uint16x8_t s0 = vaddq_u16(L0, L1);
    const uint16x8_t s1 = vaddq_u16(L2, L3);
    const uint16x8_t s2 = vaddq_u16(L4, L5);
    const uint16x8_t s3 = vaddq_u16(L6, L7);
    const uint16x8_t s01 = vaddq_u16(s0, s1);
    const uint16x8_t s23 = vaddq_u16(s2, s3);
    sum_left = vaddq_u16(s01, s23);
  }

  if (do_top && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 4);
  } else if (do_top) {
    dc0 = vrshrn_n_u16(sum_top, 3);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 3);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 8; ++i) {
      vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));
    }
  }
}

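// Variants of the chroma DC predictor, depending on which borders (top row,
// left column) are available for the current block.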
static void DC8uv(uint8_t* dst) { DC8(dst, 1, 1); }
static void DC8uvNoTop(uint8_t* dst) { DC8(dst, 0, 1); }
static void DC8uvNoLeft(uint8_t* dst) { DC8(dst, 1, 0); }
static void DC8uvNoTopLeft(uint8_t* dst) { DC8(dst, 0, 0); }

static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }

//------------------------------------------------------------------------------
// 16x16

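// The 16x16 luma predictors mirror the chroma ones above, widened to full
// 128-bit (uint8x16_t) registers.
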
static void VE16(uint8_t* dst) {    // vertical
  const uint8x16_t top = vld1q_u8(dst - BPS);
  int j;
  for (j = 0; j < 16; ++j) {
    vst1q_u8(dst + j * BPS, top);
  }
}

static void HE16(uint8_t* dst) {    // horizontal
  int j;
  for (j = 0; j < 16; ++j) {
    const uint8x16_t left = vld1q_dup_u8(dst - 1);
    vst1q_u8(dst, left);
    dst += BPS;
  }
}

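// DC16() follows the same scheme as DC8() above, but with 16 samples per
// border, hence the final rounding shifts of 5 (both borders) and 4 (one).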
static WEBP_INLINE void DC16(uint8_t* dst, int do_top, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_top) {
    const uint8x16_t A = vld1q_u8(dst - BPS);  // top row
    const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_top = vcombine_u16(p3, p3);
  }

  if (do_left) {
    int i;
    sum_left = vdupq_n_u16(0);
    for (i = 0; i < 16; i += 8) {
      const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1));
      const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1));
      const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1));
      const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1));
      const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1));
      const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1));
      const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1));
      const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1));
      const uint16x8_t s0 = vaddq_u16(L0, L1);
      const uint16x8_t s1 = vaddq_u16(L2, L3);
      const uint16x8_t s2 = vaddq_u16(L4, L5);
      const uint16x8_t s3 = vaddq_u16(L6, L7);
      const uint16x8_t s01 = vaddq_u16(s0, s1);
      const uint16x8_t s23 = vaddq_u16(s2, s3);
      const uint16x8_t sum = vaddq_u16(s01, s23);
      sum_left = vaddq_u16(sum_left, sum);
    }
  }

  if (do_top && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 5);
  } else if (do_top) {
    dc0 = vrshrn_n_u16(sum_top, 4);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 4);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 16; ++i) {
      vst1q_u8(dst + i * BPS, dc);
    }
  }
}

static void DC16TopLeft(uint8_t* dst) { DC16(dst, 1, 1); }
static void DC16NoTop(uint8_t* dst) { DC16(dst, 0, 1); }
static void DC16NoLeft(uint8_t* dst) { DC16(dst, 1, 0); }
static void DC16NoTopLeft(uint8_t* dst) { DC16(dst, 0, 0); }

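// TM16() below (like TM8uv()/TrueMotion() above) implements the VP8
// TrueMotion predictor: each output pixel is left + top - top_left, clamped
// to [0, 255]. The NEON code precomputes (top - top_left) once and adds the
// per-row left sample, relying on vqmovun_s16 for the clamping. A scalar
// sketch of the same formula (illustrative only, not compiled; TM_C_Sketch
// is not part of the library):
#if 0
static void TM_C_Sketch(uint8_t* dst, int size) {   // size = 8 or 16
  int x, y;
  for (y = 0; y < size; ++y) {
    for (x = 0; x < size; ++x) {
      const int v = dst[-1 + y * BPS] + dst[x - BPS] - dst[-1 - BPS];
      dst[x + y * BPS] = (v < 0) ? 0 : (v > 255) ? 255 : (uint8_t)v;
    }
  }
}
#endif
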
static void TM16(uint8_t* dst) {
  const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
  const uint8x16_t T = vld1q_u8(dst - BPS);  // top row 'A[0..15]'
  // A[c] - A[-1]
  const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL));
  const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL));
  int y;
  for (y = 0; y < 16; y += 4) {
    // left edge
    const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1));
    const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1));
    const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1));
    const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1));
    const int16x8_t r0_lo = vaddq_s16(L0, d_lo);  // L[r] + A[c] - A[-1]
    const int16x8_t r1_lo = vaddq_s16(L1, d_lo);
    const int16x8_t r2_lo = vaddq_s16(L2, d_lo);
    const int16x8_t r3_lo = vaddq_s16(L3, d_lo);
    const int16x8_t r0_hi = vaddq_s16(L0, d_hi);
    const int16x8_t r1_hi = vaddq_s16(L1, d_hi);
    const int16x8_t r2_hi = vaddq_s16(L2, d_hi);
    const int16x8_t r3_hi = vaddq_s16(L3, d_hi);
    // Saturate and store the result.
    const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
    const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
    const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi));
    const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi));
    vst1q_u8(dst + 0 * BPS, row0);
    vst1q_u8(dst + 1 * BPS, row1);
    vst1q_u8(dst + 2 * BPS, row2);
    vst1q_u8(dst + 3 * BPS, row3);
    dst += 4 * BPS;
  }
}

//------------------------------------------------------------------------------
// Entry point

extern void VP8DspInitNEON(void);

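// Installs the NEON implementations over the portable defaults. Only a
// subset of the 4x4 luma predictors (VP8PredLuma4) has a NEON version here;
// slots that are not assigned below keep whatever the generic initialization
// installed.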
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
  VP8Transform = TransformTwo;
  VP8TransformAC3 = TransformAC3;
  VP8TransformDC = TransformDC;
  VP8TransformWHT = TransformWHT;

  VP8VFilter16 = VFilter16;
  VP8VFilter16i = VFilter16i;
  VP8HFilter16 = HFilter16;
#if !defined(WORK_AROUND_GCC)
  VP8HFilter16i = HFilter16i;
#endif
  VP8VFilter8 = VFilter8;
  VP8VFilter8i = VFilter8i;
#if !defined(WORK_AROUND_GCC)
  VP8HFilter8 = HFilter8;
  VP8HFilter8i = HFilter8i;
#endif
  VP8SimpleVFilter16 = SimpleVFilter16;
  VP8SimpleHFilter16 = SimpleHFilter16;
  VP8SimpleVFilter16i = SimpleVFilter16i;
  VP8SimpleHFilter16i = SimpleHFilter16i;

  VP8PredLuma4[0] = DC4;
  VP8PredLuma4[1] = TM4;
  VP8PredLuma4[2] = VE4;
  VP8PredLuma4[4] = RD4;
  VP8PredLuma4[6] = LD4;

  VP8PredLuma16[0] = DC16TopLeft;
  VP8PredLuma16[1] = TM16;
  VP8PredLuma16[2] = VE16;
  VP8PredLuma16[3] = HE16;
  VP8PredLuma16[4] = DC16NoTop;
  VP8PredLuma16[5] = DC16NoLeft;
  VP8PredLuma16[6] = DC16NoTopLeft;

  VP8PredChroma8[0] = DC8uv;
  VP8PredChroma8[1] = TM8uv;
  VP8PredChroma8[2] = VE8uv;
  VP8PredChroma8[3] = HE8uv;
  VP8PredChroma8[4] = DC8uvNoTop;
  VP8PredChroma8[5] = DC8uvNoLeft;
  VP8PredChroma8[6] = DC8uvNoTopLeft;
}

#else  // !WEBP_USE_NEON

WEBP_DSP_INIT_STUB(VP8DspInitNEON)

#endif  // WEBP_USE_NEON
