enc_neon.c source code [Godot/thirdparty/libwebp/src/dsp/enc_neon.c]

1	// Copyright 2012 Google Inc. All Rights Reserved.
2	//
3	// Use of this source code is governed by a BSD-style license
4	// that can be found in the COPYING file in the root of the source
5	// tree. An additional intellectual property rights grant can be found
6	// in the file PATENTS. All contributing project authors may
7	// be found in the AUTHORS file in the root of the source tree.
8	// -----------------------------------------------------------------------------
9	//
10	// ARM NEON version of speed-critical encoding functions.
11	//
12	// adapted from libvpx (https://www.webmproject.org/code/)
13
14	#include "src/dsp/dsp.h"
15
16	#if defined(WEBP_USE_NEON)
17
18	#include <assert.h>
19
20	#include "src/dsp/neon.h"
21	#include "src/enc/vp8i_enc.h"
22
23	//------------------------------------------------------------------------------
24	// Transforms (Paragraph 14.4)
25
// Inverse transform.
// This code is pretty much the same as TransformOne in dec_neon.c, except
// for subtraction to ref. See the comments there for algorithmic explanations.
29
// Fixed-point multipliers of the inverse transform.
// kC1 is applied with a doubling multiply-high followed by a shift right by 1,
// and the input is then added back to the product.
static const int16_t kC1 = 20091;
// Half of the nominal constant (2 * 17734 = 35468): it is applied with a
// doubling multiply-high (vqdmulh), which supplies the missing factor of two.
static const int16_t kC2 = 17734;
32
// The intrinsics version below works but is *slower* than the inline-asm
// version further down when built with gcc-4.6, so it is only compiled when
// WEBP_USE_INTRINSICS is defined.
// With gcc-4.8, it is slightly faster than the inline assembly.
37	#if defined(WEBP_USE_INTRINSICS)
38
39	// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
40	static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) {
41	return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
42	}
43
44	// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
45	// to the corresponding rows of 'dst'.
46	static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
47	const int16x8_t dst01,
48	const int16x8_t dst23) {
49	// Unsigned saturate to 8b.
50	const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
51	const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
52
53	// Store the results.
54	vst1_lane_u32((uint32_t)(dst + `0` BPS), vreinterpret_u32_u8(dst01_u8), `0`);
55	vst1_lane_u32((uint32_t)(dst + `1` BPS), vreinterpret_u32_u8(dst01_u8), `1`);
56	vst1_lane_u32((uint32_t)(dst + `2` BPS), vreinterpret_u32_u8(dst23_u8), `0`);
57	vst1_lane_u32((uint32_t)(dst + `3` BPS), vreinterpret_u32_u8(dst23_u8), `1`);
58	}
59
60	static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
61	const int16x8_t row23,
62	const uint8_t* const ref,
63	uint8_t* const dst) {
64	uint32x2_t dst01 = vdup_n_u32(`0`);
65	uint32x2_t dst23 = vdup_n_u32(`0`);
66
67	// Load the source pixels.
68	dst01 = vld1_lane_u32((uint32_t)(ref + `0` BPS), dst01, `0`);
69	dst23 = vld1_lane_u32((uint32_t)(ref + `2` BPS), dst23, `0`);
70	dst01 = vld1_lane_u32((uint32_t)(ref + `1` BPS), dst01, `1`);
71	dst23 = vld1_lane_u32((uint32_t)(ref + `3` BPS), dst23, `1`);
72
73	{
74	// Convert to 16b.
75	const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01);
76	const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23);
77
78	// Descale with rounding.
79	const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, `3`);
80	const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, `3`);
81	// Add the inverse transform.
82	SaturateAndStore4x4_NEON(dst, out01, out23);
83	}
84	}
85
86	static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
87	const int16x8_t in1,
88	int16x8x2_t* const out) {
89	// a0 a1 a2 a3 \| b0 b1 b2 b3 => a0 b0 c0 d0 \| a1 b1 c1 d1
90	// c0 c1 c2 c3 \| d0 d1 d2 d3 a2 b2 c2 d2 \| a3 b3 c3 d3
91	const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ...
92	// b0 d0 b1 d1 b2 d2 ...
93	*out = vzipq_s16(tmp0.val[`0`], tmp0.val[`1`]);
94	}
95
96	static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
97	// {rows} = in0 \| in4
98	// in8 \| in12
99	// B1 = in4 \| in12
100	const int16x8_t B1 =
101	vcombine_s16(vget_high_s16(rows->val[`0`]), vget_high_s16(rows->val[`1`]));
102	// C0 = kC1 in4 \| kC1 * in12*
103	// C1 = kC2 in4 \| kC2 * in12*
104	const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), `1`);
105	const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
106	const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[`0`]),
107	vget_low_s16(rows->val[`1`])); // in0 + in8
108	const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[`0`]),
109	vget_low_s16(rows->val[`1`])); // in0 - in8
110	// c = kC2 in4 - kC1 * in12*
111	// d = kC1 in4 + kC2 * in12*
112	const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
113	const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
114	const int16x8_t D0 = vcombine_s16(a, b); // D0 = a \| b
115	const int16x8_t D1 = vcombine_s16(d, c); // D1 = d \| c
116	const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d \| b+c
117	const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d \| b-c
118	const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
119	Transpose8x2_NEON(E0, E1, rows);
120	}
121
122	static void ITransformOne_NEON(const uint8_t* ref,
123	const int16_t* in, uint8_t* dst) {
124	int16x8x2_t rows;
125	INIT_VECTOR2(rows, vld1q_s16(in + `0`), vld1q_s16(in + `8`));
126	TransformPass_NEON(&rows);
127	TransformPass_NEON(&rows);
128	Add4x4_NEON(rows.val[`0`], rows.val[`1`], ref, dst);
129	}
130
131	#else
132
133	static void ITransformOne_NEON(const uint8_t* ref,
134	const int16_t* in, uint8_t* dst) {
135	const int kBPS = BPS;
136	const int16_t kC1C2[] = { kC1, kC2, `0`, `0` };
137
138	__asm__ volatile (
139	"vld1.16 {q1, q2}, [%[in]] \n"
140	"vld1.16 {d0}, [%[kC1C2]] \n"
141
142	// d2: in[0]
143	// d3: in[8]
144	// d4: in[4]
145	// d5: in[12]
146	"vswp d3, d4 \n"
147
148	// q8 = {in[4], in[12]} kC1 * 2 >> 16*
149	// q9 = {in[4], in[12]} kC2 >> 16*
150	"vqdmulh.s16 q8, q2, d0[0] \n"
151	"vqdmulh.s16 q9, q2, d0[1] \n"
152
153	// d22 = a = in[0] + in[8]
154	// d23 = b = in[0] - in[8]
155	"vqadd.s16 d22, d2, d3 \n"
156	"vqsub.s16 d23, d2, d3 \n"
157
158	// q8 = in[4]/[12] kC1 >> 16*
159	"vshr.s16 q8, q8, #1 \n"
160
161	// Add {in[4], in[12]} back after the multiplication.
162	"vqadd.s16 q8, q2, q8 \n"
163
164	// d20 = c = in[4]kC2 - in[12]kC1
165	// d21 = d = in[4]kC1 + in[12]kC2
166	"vqsub.s16 d20, d18, d17 \n"
167	"vqadd.s16 d21, d19, d16 \n"
168
169	// d2 = tmp[0] = a + d
170	// d3 = tmp[1] = b + c
171	// d4 = tmp[2] = b - c
172	// d5 = tmp[3] = a - d
173	"vqadd.s16 d2, d22, d21 \n"
174	"vqadd.s16 d3, d23, d20 \n"
175	"vqsub.s16 d4, d23, d20 \n"
176	"vqsub.s16 d5, d22, d21 \n"
177
178	"vzip.16 q1, q2 \n"
179	"vzip.16 q1, q2 \n"
180
181	"vswp d3, d4 \n"
182
183	// q8 = {tmp[4], tmp[12]} kC1 * 2 >> 16*
184	// q9 = {tmp[4], tmp[12]} kC2 >> 16*
185	"vqdmulh.s16 q8, q2, d0[0] \n"
186	"vqdmulh.s16 q9, q2, d0[1] \n"
187
188	// d22 = a = tmp[0] + tmp[8]
189	// d23 = b = tmp[0] - tmp[8]
190	"vqadd.s16 d22, d2, d3 \n"
191	"vqsub.s16 d23, d2, d3 \n"
192
193	"vshr.s16 q8, q8, #1 \n"
194	"vqadd.s16 q8, q2, q8 \n"
195
196	// d20 = c = in[4]kC2 - in[12]kC1
197	// d21 = d = in[4]kC1 + in[12]kC2
198	"vqsub.s16 d20, d18, d17 \n"
199	"vqadd.s16 d21, d19, d16 \n"
200
201	// d2 = tmp[0] = a + d
202	// d3 = tmp[1] = b + c
203	// d4 = tmp[2] = b - c
204	// d5 = tmp[3] = a - d
205	"vqadd.s16 d2, d22, d21 \n"
206	"vqadd.s16 d3, d23, d20 \n"
207	"vqsub.s16 d4, d23, d20 \n"
208	"vqsub.s16 d5, d22, d21 \n"
209
210	"vld1.32 d6[0], [%[ref]], %[kBPS] \n"
211	"vld1.32 d6[1], [%[ref]], %[kBPS] \n"
212	"vld1.32 d7[0], [%[ref]], %[kBPS] \n"
213	"vld1.32 d7[1], [%[ref]], %[kBPS] \n"
214
215	"sub %[ref], %[ref], %[kBPS], lsl #2 \n"
216
217	// (val) + 4 >> 3
218	"vrshr.s16 d2, d2, #3 \n"
219	"vrshr.s16 d3, d3, #3 \n"
220	"vrshr.s16 d4, d4, #3 \n"
221	"vrshr.s16 d5, d5, #3 \n"
222
223	"vzip.16 q1, q2 \n"
224	"vzip.16 q1, q2 \n"
225
226	// Must accumulate before saturating
227	"vmovl.u8 q8, d6 \n"
228	"vmovl.u8 q9, d7 \n"
229
230	"vqadd.s16 q1, q1, q8 \n"
231	"vqadd.s16 q2, q2, q9 \n"
232
233	"vqmovun.s16 d0, q1 \n"
234	"vqmovun.s16 d1, q2 \n"
235
236	"vst1.32 d0[0], [%[dst]], %[kBPS] \n"
237	"vst1.32 d0[1], [%[dst]], %[kBPS] \n"
238	"vst1.32 d1[0], [%[dst]], %[kBPS] \n"
239	"vst1.32 d1[1], [%[dst]] \n"
240
241	: [in] "+r"(in), [dst] "+r"(dst) // modified registers
242	: [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref) // constants
243	: "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" // clobbered
244	);
245	}
246
247	#endif // WEBP_USE_INTRINSICS
248
// Applies the inverse transform to one 4x4 block, or to two horizontally
// adjacent blocks when 'do_two' is non-zero. The second block uses the next
// 16 coefficients of 'in' and starts 4 pixels to the right in 'ref'/'dst'.
static void ITransform_NEON(const uint8_t* ref,
                            const int16_t* in, uint8_t* dst, int do_two) {
  ITransformOne_NEON(ref, in, dst);
  if (do_two) {
    ITransformOne_NEON(ref + 4, in + 16, dst + 4);
  }
}
256
257	// Load all 4x4 pixels into a single uint8x16_t variable.
258	static uint8x16_t Load4x4_NEON(const uint8_t* src) {
259	uint32x4_t out = vdupq_n_u32(`0`);
260	out = vld1q_lane_u32((const uint32_t)(src + `0` BPS), out, `0`);
261	out = vld1q_lane_u32((const uint32_t)(src + `1` BPS), out, `1`);
262	out = vld1q_lane_u32((const uint32_t)(src + `2` BPS), out, `2`);
263	out = vld1q_lane_u32((const uint32_t)(src + `3` BPS), out, `3`);
264	return vreinterpretq_u8_u32(out);
265	}
266
267	// Forward transform.
268
269	#if defined(WEBP_USE_INTRINSICS)
270
271	static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
272	const int16x4_t B,
273	const int16x4_t C,
274	const int16x4_t D,
275	int16x8_t* const out01,
276	int16x8_t* const out32) {
277	const int16x4x2_t AB = vtrn_s16(A, B);
278	const int16x4x2_t CD = vtrn_s16(C, D);
279	const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[`0`]),
280	vreinterpret_s32_s16(CD.val[`0`]));
281	const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[`1`]),
282	vreinterpret_s32_s16(CD.val[`1`]));
283	*out01 = vreinterpretq_s16_s64(
284	vcombine_s64(vreinterpret_s64_s32(tmp02.val[`0`]),
285	vreinterpret_s64_s32(tmp13.val[`0`])));
286	*out32 = vreinterpretq_s16_s64(
287	vcombine_s64(vreinterpret_s64_s32(tmp13.val[`1`]),
288	vreinterpret_s64_s32(tmp02.val[`1`])));
289	}
290
291	static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
292	const uint8x8_t b) {
293	return vreinterpretq_s16_u16(vsubl_u8(a, b));
294	}
295
296	static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
297	int16_t* out) {
298	int16x8_t d0d1, d3d2; // working 4x4 int16 variables
299	{
300	const uint8x16_t S0 = Load4x4_NEON(src);
301	const uint8x16_t R0 = Load4x4_NEON(ref);
302	const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
303	const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
304	const int16x4_t D0 = vget_low_s16(D0D1);
305	const int16x4_t D1 = vget_high_s16(D0D1);
306	const int16x4_t D2 = vget_low_s16(D2D3);
307	const int16x4_t D3 = vget_high_s16(D2D3);
308	Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
309	}
310	{ // 1rst pass
311	const int32x4_t kCst937 = vdupq_n_s32(`937`);
312	const int32x4_t kCst1812 = vdupq_n_s32(`1812`);
313	const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 \| d1+d2 (=a0\|a1)
314	const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 \| d1-d2 (=a3\|a2)
315	const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, `3`);
316	const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
317	vget_high_s16(a0a1_2));
318	const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
319	vget_high_s16(a0a1_2));
320	const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), `2217`);
321	const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), `2217`);
322	const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), `5352`);
323	const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), `5352`);
324	const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), `9`);
325	const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), `9`);
326	Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
327	}
328	{ // 2nd pass
329	// the (1<<16) addition is for the replacement: a3!=0 <-> 1-(a3==0)
330	const int32x4_t kCst12000 = vdupq_n_s32(`12000` + (`1` << `16`));
331	const int32x4_t kCst51000 = vdupq_n_s32(`51000`);
332	const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 \| d1+d2 (=a0\|a1)
333	const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 \| d1-d2 (=a3\|a2)
334	const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(`7`));
335	const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), `4`);
336	const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), `4`);
337	const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), `2217`);
338	const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), `2217`);
339	const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), `5352`);
340	const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), `5352`);
341	const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
342	const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
343	const int16x4_t a3_eq_0 =
344	vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(`0`)));
345	const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
346	vst1_s16(out + `0`, out0);
347	vst1_s16(out + `4`, out1);
348	vst1_s16(out + `8`, out2);
349	vst1_s16(out + `12`, out3);
350	}
351	}
352
353	#else
354
// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
// Multipliers used by the forward transform, replicated over 4 lanes each.
static const int16_t kCoeff16[] = {
  5352,  5352,  5352, 5352, 2217,  2217,  2217, 2217
};
// Rounding constants of the forward transform, replicated over 4 lanes each:
// 1812 and 937 for the first pass, 12000 and 51000 for the second pass.
static const int32_t kCoeff32[] = {
   1812,  1812,  1812,  1812,
    937,   937,   937,   937,
  12000, 12000, 12000, 12000,
  51000, 51000, 51000, 51000
};
365
366	static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
367	int16_t* out) {
368	const int kBPS = BPS;
369	const uint8_t* src_ptr = src;
370	const uint8_t* ref_ptr = ref;
371	const int16_t* coeff16 = kCoeff16;
372	const int32_t* coeff32 = kCoeff32;
373
374	__asm__ volatile (
375	// load src into q4, q5 in high half
376	"vld1.8 {d8}, [%[src_ptr]], %[kBPS] \n"
377	"vld1.8 {d10}, [%[src_ptr]], %[kBPS] \n"
378	"vld1.8 {d9}, [%[src_ptr]], %[kBPS] \n"
379	"vld1.8 {d11}, [%[src_ptr]] \n"
380
381	// load ref into q6, q7 in high half
382	"vld1.8 {d12}, [%[ref_ptr]], %[kBPS] \n"
383	"vld1.8 {d14}, [%[ref_ptr]], %[kBPS] \n"
384	"vld1.8 {d13}, [%[ref_ptr]], %[kBPS] \n"
385	"vld1.8 {d15}, [%[ref_ptr]] \n"
386
387	// Pack the high values in to q4 and q6
388	"vtrn.32 q4, q5 \n"
389	"vtrn.32 q6, q7 \n"
390
391	// d[0-3] = src - ref
392	"vsubl.u8 q0, d8, d12 \n"
393	"vsubl.u8 q1, d9, d13 \n"
394
395	// load coeff16 into q8(d16=5352, d17=2217)
396	"vld1.16 {q8}, [%[coeff16]] \n"
397
398	// load coeff32 high half into q9 = 1812, q10 = 937
399	"vld1.32 {q9, q10}, [%[coeff32]]! \n"
400
401	// load coeff32 low half into q11=12000, q12=51000
402	"vld1.32 {q11,q12}, [%[coeff32]] \n"
403
404	// part 1
405	// Transpose. Register dN is the same as dN in C
406	"vtrn.32 d0, d2 \n"
407	"vtrn.32 d1, d3 \n"
408	"vtrn.16 d0, d1 \n"
409	"vtrn.16 d2, d3 \n"
410
411	"vadd.s16 d4, d0, d3 \n" // a0 = d0 + d3
412	"vadd.s16 d5, d1, d2 \n" // a1 = d1 + d2
413	"vsub.s16 d6, d1, d2 \n" // a2 = d1 - d2
414	"vsub.s16 d7, d0, d3 \n" // a3 = d0 - d3
415
416	"vadd.s16 d0, d4, d5 \n" // a0 + a1
417	"vshl.s16 d0, d0, #3 \n" // temp[0+i4] = (a0+a1) << 3*
418	"vsub.s16 d2, d4, d5 \n" // a0 - a1
419	"vshl.s16 d2, d2, #3 \n" // (temp[2+i4] = (a0-a1) << 3*
420
421	"vmlal.s16 q9, d7, d16 \n" // a35352 + 1812*
422	"vmlal.s16 q10, d7, d17 \n" // a32217 + 937*
423	"vmlal.s16 q9, d6, d17 \n" // a22217 + a35352 + 1812
424	"vmlsl.s16 q10, d6, d16 \n" // a32217 + 937 - a25352
425
426	// temp[1+i4] = (d22217 + d35352 + 1812) >> 9*
427	// temp[3+i4] = (d32217 + 937 - d25352) >> 9*
428	"vshrn.s32 d1, q9, #9 \n"
429	"vshrn.s32 d3, q10, #9 \n"
430
431	// part 2
432	// transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
433	"vtrn.32 d0, d2 \n"
434	"vtrn.32 d1, d3 \n"
435	"vtrn.16 d0, d1 \n"
436	"vtrn.16 d2, d3 \n"
437
438	"vmov.s16 d26, #7 \n"
439
440	"vadd.s16 d4, d0, d3 \n" // a1 = ip[0] + ip[12]
441	"vadd.s16 d5, d1, d2 \n" // b1 = ip[4] + ip[8]
442	"vsub.s16 d6, d1, d2 \n" // c1 = ip[4] - ip[8]
443	"vadd.s16 d4, d4, d26 \n" // a1 + 7
444	"vsub.s16 d7, d0, d3 \n" // d1 = ip[0] - ip[12]
445
446	"vadd.s16 d0, d4, d5 \n" // op[0] = a1 + b1 + 7
447	"vsub.s16 d2, d4, d5 \n" // op[8] = a1 - b1 + 7
448
449	"vmlal.s16 q11, d7, d16 \n" // d15352 + 12000*
450	"vmlal.s16 q12, d7, d17 \n" // d12217 + 51000*
451
452	"vceq.s16 d4, d7, #0 \n"
453
454	"vshr.s16 d0, d0, #4 \n"
455	"vshr.s16 d2, d2, #4 \n"
456
457	"vmlal.s16 q11, d6, d17 \n" // c12217 + d15352 + 12000
458	"vmlsl.s16 q12, d6, d16 \n" // d12217 - c15352 + 51000
459
460	"vmvn d4, d4 \n" // !(d1 == 0)
461	// op[4] = (c12217 + d15352 + 12000)>>16
462	"vshrn.s32 d1, q11, #16 \n"
463	// op[4] += (d1!=0)
464	"vsub.s16 d1, d1, d4 \n"
465	// op[12]= (d12217 - c15352 + 51000)>>16
466	"vshrn.s32 d3, q12, #16 \n"
467
468	// set result to out array
469	"vst1.16 {q0, q1}, [%[out]] \n"
470	: [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
471	[coeff32] "+r"(coeff32) // modified registers
472	: [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
473	[out] "r"(out) // constants
474	: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
475	"q10", "q11", "q12", "q13" // clobbered
476	);
477	}
478
479	#endif
480
481	#define LOAD_LANE_16b(VALUE, LANE) do { \
482	(VALUE) = vld1_lane_s16(src, (VALUE), (LANE)); \
483	src += stride; \
484	} while (0)
485
486	static void FTransformWHT_NEON(const int16_t* src, int16_t* out) {
487	const int stride = `16`;
488	const int16x4_t zero = vdup_n_s16(`0`);
489	int32x4x4_t tmp0;
490	int16x4x4_t in;
491	INIT_VECTOR4(in, zero, zero, zero, zero);
492	LOAD_LANE_16b(in.val[`0`], `0`);
493	LOAD_LANE_16b(in.val[`1`], `0`);
494	LOAD_LANE_16b(in.val[`2`], `0`);
495	LOAD_LANE_16b(in.val[`3`], `0`);
496	LOAD_LANE_16b(in.val[`0`], `1`);
497	LOAD_LANE_16b(in.val[`1`], `1`);
498	LOAD_LANE_16b(in.val[`2`], `1`);
499	LOAD_LANE_16b(in.val[`3`], `1`);
500	LOAD_LANE_16b(in.val[`0`], `2`);
501	LOAD_LANE_16b(in.val[`1`], `2`);
502	LOAD_LANE_16b(in.val[`2`], `2`);
503	LOAD_LANE_16b(in.val[`3`], `2`);
504	LOAD_LANE_16b(in.val[`0`], `3`);
505	LOAD_LANE_16b(in.val[`1`], `3`);
506	LOAD_LANE_16b(in.val[`2`], `3`);
507	LOAD_LANE_16b(in.val[`3`], `3`);
508
509	{
510	// a0 = in[0 16] + in[2 * 16]*
511	// a1 = in[1 16] + in[3 * 16]*
512	// a2 = in[1 16] - in[3 * 16]*
513	// a3 = in[0 16] - in[2 * 16]*
514	const int32x4_t a0 = vaddl_s16(in.val[`0`], in.val[`2`]);
515	const int32x4_t a1 = vaddl_s16(in.val[`1`], in.val[`3`]);
516	const int32x4_t a2 = vsubl_s16(in.val[`1`], in.val[`3`]);
517	const int32x4_t a3 = vsubl_s16(in.val[`0`], in.val[`2`]);
518	tmp0.val[`0`] = vaddq_s32(a0, a1);
519	tmp0.val[`1`] = vaddq_s32(a3, a2);
520	tmp0.val[`2`] = vsubq_s32(a3, a2);
521	tmp0.val[`3`] = vsubq_s32(a0, a1);
522	}
523	{
524	const int32x4x4_t tmp1 = Transpose4x4_NEON(tmp0);
525	// a0 = tmp[0 + i] + tmp[ 8 + i]
526	// a1 = tmp[4 + i] + tmp[12 + i]
527	// a2 = tmp[4 + i] - tmp[12 + i]
528	// a3 = tmp[0 + i] - tmp[ 8 + i]
529	const int32x4_t a0 = vaddq_s32(tmp1.val[`0`], tmp1.val[`2`]);
530	const int32x4_t a1 = vaddq_s32(tmp1.val[`1`], tmp1.val[`3`]);
531	const int32x4_t a2 = vsubq_s32(tmp1.val[`1`], tmp1.val[`3`]);
532	const int32x4_t a3 = vsubq_s32(tmp1.val[`0`], tmp1.val[`2`]);
533	const int32x4_t b0 = vhaddq_s32(a0, a1); // (a0 + a1) >> 1
534	const int32x4_t b1 = vhaddq_s32(a3, a2); // (a3 + a2) >> 1
535	const int32x4_t b2 = vhsubq_s32(a3, a2); // (a3 - a2) >> 1
536	const int32x4_t b3 = vhsubq_s32(a0, a1); // (a0 - a1) >> 1
537	const int16x4_t out0 = vmovn_s32(b0);
538	const int16x4_t out1 = vmovn_s32(b1);
539	const int16x4_t out2 = vmovn_s32(b2);
540	const int16x4_t out3 = vmovn_s32(b3);
541
542	vst1_s16(out + `0`, out0);
543	vst1_s16(out + `4`, out1);
544	vst1_s16(out + `8`, out2);
545	vst1_s16(out + `12`, out3);
546	}
547	}
548	#undef LOAD_LANE_16b
549
550	//------------------------------------------------------------------------------
551	// Texture distortion
552	//
553	// We try to match the spectral content (weighted) between source and
554	// reconstructed samples.
555
556	// a 0123, b 0123
557	// a 4567, b 4567
558	// a 89ab, b 89ab
559	// a cdef, b cdef
560	//
561	// transpose
562	//
563	// a 048c, b 048c
564	// a 159d, b 159d
565	// a 26ae, b 26ae
566	// a 37bf, b 37bf
567	//
568	static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16_NEON(int16x8x4_t q4_in) {
569	const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[`0`], q4_in.val[`1`]);
570	const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[`2`], q4_in.val[`3`]);
571	const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[`0`]),
572	vreinterpretq_s32_s16(q2_tmp1.val[`0`]));
573	const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[`1`]),
574	vreinterpretq_s32_s16(q2_tmp1.val[`1`]));
575	q4_in.val[`0`] = vreinterpretq_s16_s32(q2_tmp2.val[`0`]);
576	q4_in.val[`2`] = vreinterpretq_s16_s32(q2_tmp2.val[`1`]);
577	q4_in.val[`1`] = vreinterpretq_s16_s32(q2_tmp3.val[`0`]);
578	q4_in.val[`3`] = vreinterpretq_s16_s32(q2_tmp3.val[`1`]);
579	return q4_in;
580	}
581
582	static WEBP_INLINE int16x8x4_t DistoHorizontalPass_NEON(
583	const int16x8x4_t q4_in) {
584	// {a0, a1} = {in[0] + in[2], in[1] + in[3]}
585	// {a3, a2} = {in[0] - in[2], in[1] - in[3]}
586	const int16x8_t q_a0 = vaddq_s16(q4_in.val[`0`], q4_in.val[`2`]);
587	const int16x8_t q_a1 = vaddq_s16(q4_in.val[`1`], q4_in.val[`3`]);
588	const int16x8_t q_a3 = vsubq_s16(q4_in.val[`0`], q4_in.val[`2`]);
589	const int16x8_t q_a2 = vsubq_s16(q4_in.val[`1`], q4_in.val[`3`]);
590	int16x8x4_t q4_out;
591	// tmp[0] = a0 + a1
592	// tmp[1] = a3 + a2
593	// tmp[2] = a3 - a2
594	// tmp[3] = a0 - a1
595	INIT_VECTOR4(q4_out,
596	vabsq_s16(vaddq_s16(q_a0, q_a1)),
597	vabsq_s16(vaddq_s16(q_a3, q_a2)),
598	vabdq_s16(q_a3, q_a2), vabdq_s16(q_a0, q_a1));
599	return q4_out;
600	}
601
602	static WEBP_INLINE int16x8x4_t DistoVerticalPass_NEON(const uint8x8x4_t q4_in) {
603	const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[`0`],
604	q4_in.val[`2`]));
605	const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[`1`],
606	q4_in.val[`3`]));
607	const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[`1`],
608	q4_in.val[`3`]));
609	const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[`0`],
610	q4_in.val[`2`]));
611	int16x8x4_t q4_out;
612
613	INIT_VECTOR4(q4_out,
614	vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
615	vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
616	return q4_out;
617	}
618
619	static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
620	const uint16x8_t q_w07 = vld1q_u16(&w[`0`]);
621	const uint16x8_t q_w8f = vld1q_u16(&w[`8`]);
622	int16x4x4_t d4_w;
623	INIT_VECTOR4(d4_w,
624	vget_low_s16(vreinterpretq_s16_u16(q_w07)),
625	vget_high_s16(vreinterpretq_s16_u16(q_w07)),
626	vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
627	vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
628	return d4_w;
629	}
630
631	static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
632	const int16x4x4_t d4_w) {
633	int32x2_t d_sum;
634	// sum += w[ 0] abs(b0);*
635	// sum += w[ 4] abs(b1);*
636	// sum += w[ 8] abs(b2);*
637	// sum += w[12] abs(b3);*
638	int32x4_t q_sum0 = vmull_s16(d4_w.val[`0`], vget_low_s16(q4_in.val[`0`]));
639	int32x4_t q_sum1 = vmull_s16(d4_w.val[`1`], vget_low_s16(q4_in.val[`1`]));
640	int32x4_t q_sum2 = vmull_s16(d4_w.val[`2`], vget_low_s16(q4_in.val[`2`]));
641	int32x4_t q_sum3 = vmull_s16(d4_w.val[`3`], vget_low_s16(q4_in.val[`3`]));
642	q_sum0 = vmlsl_s16(q_sum0, d4_w.val[`0`], vget_high_s16(q4_in.val[`0`]));
643	q_sum1 = vmlsl_s16(q_sum1, d4_w.val[`1`], vget_high_s16(q4_in.val[`1`]));
644	q_sum2 = vmlsl_s16(q_sum2, d4_w.val[`2`], vget_high_s16(q4_in.val[`2`]));
645	q_sum3 = vmlsl_s16(q_sum3, d4_w.val[`3`], vget_high_s16(q4_in.val[`3`]));
646
647	q_sum0 = vaddq_s32(q_sum0, q_sum1);
648	q_sum2 = vaddq_s32(q_sum2, q_sum3);
649	q_sum2 = vaddq_s32(q_sum0, q_sum2);
650	d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
651	d_sum = vpadd_s32(d_sum, d_sum);
652	return d_sum;
653	}
654
655	#define LOAD_LANE_32b(src, VALUE, LANE) \
656	(VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
657
658	// Hadamard transform
659	// Returns the weighted sum of the absolute value of transformed coefficients.
660	// w[] contains a row-major 4 by 4 symmetric matrix.
661	static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
662	const uint16_t* const w) {
663	uint32x2_t d_in_ab_0123 = vdup_n_u32(`0`);
664	uint32x2_t d_in_ab_4567 = vdup_n_u32(`0`);
665	uint32x2_t d_in_ab_89ab = vdup_n_u32(`0`);
666	uint32x2_t d_in_ab_cdef = vdup_n_u32(`0`);
667	uint8x8x4_t d4_in;
668
669	// load data a, b
670	LOAD_LANE_32b(a + `0` * BPS, d_in_ab_0123, `0`);
671	LOAD_LANE_32b(a + `1` * BPS, d_in_ab_4567, `0`);
672	LOAD_LANE_32b(a + `2` * BPS, d_in_ab_89ab, `0`);
673	LOAD_LANE_32b(a + `3` * BPS, d_in_ab_cdef, `0`);
674	LOAD_LANE_32b(b + `0` * BPS, d_in_ab_0123, `1`);
675	LOAD_LANE_32b(b + `1` * BPS, d_in_ab_4567, `1`);
676	LOAD_LANE_32b(b + `2` * BPS, d_in_ab_89ab, `1`);
677	LOAD_LANE_32b(b + `3` * BPS, d_in_ab_cdef, `1`);
678	INIT_VECTOR4(d4_in,
679	vreinterpret_u8_u32(d_in_ab_0123),
680	vreinterpret_u8_u32(d_in_ab_4567),
681	vreinterpret_u8_u32(d_in_ab_89ab),
682	vreinterpret_u8_u32(d_in_ab_cdef));
683
684	{
685	// Vertical pass first to avoid a transpose (vertical and horizontal passes
686	// are commutative because w/kWeightY is symmetric) and subsequent
687	// transpose.
688	const int16x8x4_t q4_v = DistoVerticalPass_NEON(d4_in);
689	const int16x4x4_t d4_w = DistoLoadW_NEON(w);
690	// horizontal pass
691	const int16x8x4_t q4_t = DistoTranspose4x4S16_NEON(q4_v);
692	const int16x8x4_t q4_h = DistoHorizontalPass_NEON(q4_t);
693	int32x2_t d_sum = DistoSum_NEON(q4_h, d4_w);
694
695	// abs(sum2 - sum1) >> 5
696	d_sum = vabs_s32(d_sum);
697	d_sum = vshr_n_s32(d_sum, `5`);
698	return vget_lane_s32(d_sum, `0`);
699	}
700	}
701	#undef LOAD_LANE_32b
702
703	static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,
704	const uint16_t* const w) {
705	int D = `0`;
706	int x, y;
707	for (y = `0`; y < `16` * BPS; y += `4` * BPS) {
708	for (x = `0`; x < `16`; x += `4`) {
709	D += Disto4x4_NEON(a + x + y, b + x + y, w);
710	}
711	}
712	return D;
713	}
714
715	//------------------------------------------------------------------------------
716
717	static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,
718	int start_block, int end_block,
719	VP8Histogram* const histo) {
720	const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
721	int j;
722	int distribution[MAX_COEFF_THRESH + `1`] = { `0` };
723	for (j = start_block; j < end_block; ++j) {
724	int16_t out[`16`];
725	FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
726	{
727	int k;
728	const int16x8_t a0 = vld1q_s16(out + `0`);
729	const int16x8_t b0 = vld1q_s16(out + `8`);
730	const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
731	const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
732	const uint16x8_t a2 = vshrq_n_u16(a1, `3`);
733	const uint16x8_t b2 = vshrq_n_u16(b1, `3`);
734	const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh);
735	const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh);
736	vst1q_s16(out + `0`, vreinterpretq_s16_u16(a3));
737	vst1q_s16(out + `8`, vreinterpretq_s16_u16(b3));
738	// Convert coefficients to bin.
739	for (k = `0`; k < `16`; ++k) {
740	++distribution[out[k]];
741	}
742	}
743	}
744	VP8SetHistogramData(distribution, histo);
745	}
746
747	//------------------------------------------------------------------------------
748
749	static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
750	const uint8_t* const b,
751	uint32x4_t* const sum) {
752	const uint8x16_t a0 = vld1q_u8(a);
753	const uint8x16_t b0 = vld1q_u8(b);
754	const uint8x16_t abs_diff = vabdq_u8(a0, b0);
755	const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
756	vget_low_u8(abs_diff));
757	const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
758	vget_high_u8(abs_diff));
759	/ pair-wise adds and widen /
760	const uint32x4_t sum1 = vpaddlq_u16(prod1);
761	const uint32x4_t sum2 = vpaddlq_u16(prod2);
762	sum = vaddq_u32(sum, vaddq_u32(sum1, sum2));
763	}
764
765	// Horizontal sum of all four uint32_t values in 'sum'.
766	static int SumToInt_NEON(uint32x4_t sum) {
767	#if WEBP_AARCH64
768	return (int)vaddvq_u32(sum);
769	#else
770	const uint64x2_t sum2 = vpaddlq_u32(sum);
771	const uint32x2_t sum3 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum2)),
772	vreinterpret_u32_u64(vget_high_u64(sum2)));
773	return (int)vget_lane_u32(sum3, `0`);
774	#endif
775	}
776
777	static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
778	uint32x4_t sum = vdupq_n_u32(`0`);
779	int y;
780	for (y = `0`; y < `16`; ++y) {
781	AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
782	}
783	return SumToInt_NEON(sum);
784	}
785
786	static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
787	uint32x4_t sum = vdupq_n_u32(`0`);
788	int y;
789	for (y = `0`; y < `8`; ++y) {
790	AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
791	}
792	return SumToInt_NEON(sum);
793	}
794
795	static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
796	uint32x4_t sum = vdupq_n_u32(`0`);
797	int y;
798	for (y = `0`; y < `8`; ++y) {
799	const uint8x8_t a0 = vld1_u8(a + y * BPS);
800	const uint8x8_t b0 = vld1_u8(b + y * BPS);
801	const uint8x8_t abs_diff = vabd_u8(a0, b0);
802	const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
803	sum = vpadalq_u16(sum, prod);
804	}
805	return SumToInt_NEON(sum);
806	}
807
808	static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
809	const uint8x16_t a0 = Load4x4_NEON(a);
810	const uint8x16_t b0 = Load4x4_NEON(b);
811	const uint8x16_t abs_diff = vabdq_u8(a0, b0);
812	const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
813	vget_low_u8(abs_diff));
814	const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
815	vget_high_u8(abs_diff));
816	/ pair-wise adds and widen /
817	const uint32x4_t sum1 = vpaddlq_u16(prod1);
818	const uint32x4_t sum2 = vpaddlq_u16(prod2);
819	return SumToInt_NEON(vaddq_u32(sum1, sum2));
820	}
821
822	//------------------------------------------------------------------------------
823
824	// Compilation with gcc-4.6.x is problematic for now.
825	#if !defined(WORK_AROUND_GCC)
826
827	static int16x8_t Quantize_NEON(int16_t* const in,
828	const VP8Matrix* const mtx, int offset) {
829	const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
830	const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
831	const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
832	const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + `0`]);
833	const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + `4`]);
834
835	const int16x8_t a = vld1q_s16(in + offset); // in
836	const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a)); // coeff = abs(in)
837	const int16x8_t sign = vshrq_n_s16(a, `15`); // sign
838	const uint16x8_t c = vaddq_u16(b, sharp); // + sharpen
839	const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
840	const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
841	const uint32x4_t m2 = vhaddq_u32(m0, bias0);
842	const uint32x4_t m3 = vhaddq_u32(m1, bias1); // (coeff iQ + bias) >> 1*
843	const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, `16`),
844	vshrn_n_u32(m3, `16`)); // QFIX=17 = 16+1
845	const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
846	const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
847	const int16x8_t c3 = vsubq_s16(c2, sign); // restore sign
848	const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
849	vst1q_s16(in + offset, c4);
850	assert(QFIX == `17`); // this function can't work as is if QFIX != 16+1
851	return c3;
852	}
853
// Byte-level table-lookup indices implementing the zigzag reordering of 16
// int16 coefficients (32 bytes). Each consecutive pair (2k, 2k + 1) selects
// coefficient k, giving the order 0 1 4 8 5 2 3 6 9 12 13 10 7 11 14 15.
static const uint8_t kShuffles[4][8] = {
  { 0,   1,  2,  3,  8,  9, 16, 17 },
  { 10, 11,  4,  5,  6,  7, 12, 13 },
  { 18, 19, 24, 25, 26, 27, 20, 21 },
  { 14, 15, 22, 23, 28, 29, 30, 31 }
};
860
861	static int QuantizeBlock_NEON(int16_t in[`16`], int16_t out[`16`],
862	const VP8Matrix* const mtx) {
863	const int16x8_t out0 = Quantize_NEON(in, mtx, `0`);
864	const int16x8_t out1 = Quantize_NEON(in, mtx, `8`);
865	uint8x8x4_t shuffles;
866	// vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
867	// non-standard versions there.
868	#if defined(__APPLE__) && WEBP_AARCH64 && \
869	defined(__apple_build_version__) && (__apple_build_version__< 6020037)
870	uint8x16x2_t all_out;
871	INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
872	INIT_VECTOR4(shuffles,
873	vtbl2q_u8(all_out, vld1_u8(kShuffles[`0`])),
874	vtbl2q_u8(all_out, vld1_u8(kShuffles[`1`])),
875	vtbl2q_u8(all_out, vld1_u8(kShuffles[`2`])),
876	vtbl2q_u8(all_out, vld1_u8(kShuffles[`3`])));
877	#else
878	uint8x8x4_t all_out;
879	INIT_VECTOR4(all_out,
880	vreinterpret_u8_s16(vget_low_s16(out0)),
881	vreinterpret_u8_s16(vget_high_s16(out0)),
882	vreinterpret_u8_s16(vget_low_s16(out1)),
883	vreinterpret_u8_s16(vget_high_s16(out1)));
884	INIT_VECTOR4(shuffles,
885	vtbl4_u8(all_out, vld1_u8(kShuffles[`0`])),
886	vtbl4_u8(all_out, vld1_u8(kShuffles[`1`])),
887	vtbl4_u8(all_out, vld1_u8(kShuffles[`2`])),
888	vtbl4_u8(all_out, vld1_u8(kShuffles[`3`])));
889	#endif
890	// Zigzag reordering
891	vst1_u8((uint8_t*)(out + `0`), shuffles.val[`0`]);
892	vst1_u8((uint8_t*)(out + `4`), shuffles.val[`1`]);
893	vst1_u8((uint8_t*)(out + `8`), shuffles.val[`2`]);
894	vst1_u8((uint8_t*)(out + `12`), shuffles.val[`3`]);
895	// test zeros
896	if ((uint64_t)(out + `0`) != `0`) return `1`;
897	if ((uint64_t)(out + `4`) != `0`) return `1`;
898	if ((uint64_t)(out + `8`) != `0`) return `1`;
899	if ((uint64_t)(out + `12`) != `0`) return `1`;
900	return `0`;
901	}
902
903	static int Quantize2Blocks_NEON(int16_t in[`32`], int16_t out[`32`],
904	const VP8Matrix* const mtx) {
905	int nz;
906	nz = QuantizeBlock_NEON(in + `0` * `16`, out + `0` * `16`, mtx) << `0`;
907	nz \|= QuantizeBlock_NEON(in + `1` * `16`, out + `1` * `16`, mtx) << `1`;
908	return nz;
909	}
910
911	#endif // !WORK_AROUND_GCC
912
913	//------------------------------------------------------------------------------
914	// Entry point
915
916	extern void VP8EncDspInitNEON(void);
917
918	WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
919	VP8ITransform = ITransform_NEON;
920	VP8FTransform = FTransform_NEON;
921
922	VP8FTransformWHT = FTransformWHT_NEON;
923
924	VP8TDisto4x4 = Disto4x4_NEON;
925	VP8TDisto16x16 = Disto16x16_NEON;
926	VP8CollectHistogram = CollectHistogram_NEON;
927
928	VP8SSE16x16 = SSE16x16_NEON;
929	VP8SSE16x8 = SSE16x8_NEON;
930	VP8SSE8x8 = SSE8x8_NEON;
931	VP8SSE4x4 = SSE4x4_NEON;
932
933	#if !defined(WORK_AROUND_GCC)
934	VP8EncQuantizeBlock = QuantizeBlock_NEON;
935	VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
936	#endif
937	}
938
939	#else // !WEBP_USE_NEON
940
941	WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)
942
943	#endif // WEBP_USE_NEON
944

Browse the source code of Godot/thirdparty/libwebp/src/dsp/enc_neon.c