enc_neon.c source code [engine/third_party/libwebp/src/dsp/enc_neon.c]

1	// Copyright 2012 Google Inc. All Rights Reserved.
2	//
3	// Use of this source code is governed by a BSD-style license
4	// that can be found in the COPYING file in the root of the source
5	// tree. An additional intellectual property rights grant can be found
6	// in the file PATENTS. All contributing project authors may
7	// be found in the AUTHORS file in the root of the source tree.
8	// -----------------------------------------------------------------------------
9	//
10	// ARM NEON version of speed-critical encoding functions.
11	//
12	// adapted from libvpx (http://www.webmproject.org/code/)
13
14	#include "./dsp.h"
15
16	#if defined(WEBP_USE_NEON)
17
18	#include <assert.h>
19
20	#include "./neon.h"
21	#include "../enc/vp8i_enc.h"
22
23	//------------------------------------------------------------------------------
24	// Transforms (Paragraph 14.4)
25
// Inverse transform.
// This code is pretty much the same as TransformOne in the dec_neon.c, except
// for subtraction to ref. See the comments there for algorithmic explanations.

// Fixed-point multipliers shared by both passes of the inverse transform.
// Both are consumed by a doubling multiply-high (vqdmulh):
//  - the kC1 product is shifted right by one more bit and the multiplicand is
//    added back afterwards (see TransformPass / the inline asm below);
//  - kC2 is stored pre-halved so that the doubling multiply yields the full
//    product directly.
// NOTE(review): presumably these are the 20091 / 35468 constants of the VP8
// inverse DCT -- confirm against dec_neon.c.
static const int16_t kC1 = 20091;
static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.
32
// The intrinsics version below works, but with gcc-4.6 it is slower than the
// inlined-asm version further down, so it is only compiled when
// WEBP_USE_INTRINSICS is defined.
// With gcc-4.8, it is slightly faster than the inlined assembly.
37	#if defined(WEBP_USE_INTRINSICS)
38
39	// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
40	static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
41	return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
42	}
43
44	// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
45	// to the corresponding rows of 'dst'.
46	static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
47	const int16x8_t dst01,
48	const int16x8_t dst23) {
49	// Unsigned saturate to 8b.
50	const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
51	const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
52
53	// Store the results.
54	vst1_lane_u32((uint32_t)(dst + `0` BPS), vreinterpret_u32_u8(dst01_u8), `0`);
55	vst1_lane_u32((uint32_t)(dst + `1` BPS), vreinterpret_u32_u8(dst01_u8), `1`);
56	vst1_lane_u32((uint32_t)(dst + `2` BPS), vreinterpret_u32_u8(dst23_u8), `0`);
57	vst1_lane_u32((uint32_t)(dst + `3` BPS), vreinterpret_u32_u8(dst23_u8), `1`);
58	}
59
60	static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
61	const uint8_t* const ref, uint8_t* const dst) {
62	uint32x2_t dst01 = vdup_n_u32(`0`);
63	uint32x2_t dst23 = vdup_n_u32(`0`);
64
65	// Load the source pixels.
66	dst01 = vld1_lane_u32((uint32_t)(ref + `0` BPS), dst01, `0`);
67	dst23 = vld1_lane_u32((uint32_t)(ref + `2` BPS), dst23, `0`);
68	dst01 = vld1_lane_u32((uint32_t)(ref + `1` BPS), dst01, `1`);
69	dst23 = vld1_lane_u32((uint32_t)(ref + `3` BPS), dst23, `1`);
70
71	{
72	// Convert to 16b.
73	const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
74	const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);
75
76	// Descale with rounding.
77	const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, `3`);
78	const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, `3`);
79	// Add the inverse transform.
80	SaturateAndStore4x4(dst, out01, out23);
81	}
82	}
83
84	static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
85	int16x8x2_t* const out) {
86	// a0 a1 a2 a3 \| b0 b1 b2 b3 => a0 b0 c0 d0 \| a1 b1 c1 d1
87	// c0 c1 c2 c3 \| d0 d1 d2 d3 a2 b2 c2 d2 \| a3 b3 c3 d3
88	const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ...
89	// b0 d0 b1 d1 b2 d2 ...
90	*out = vzipq_s16(tmp0.val[`0`], tmp0.val[`1`]);
91	}
92
93	static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
94	// {rows} = in0 \| in4
95	// in8 \| in12
96	// B1 = in4 \| in12
97	const int16x8_t B1 =
98	vcombine_s16(vget_high_s16(rows->val[`0`]), vget_high_s16(rows->val[`1`]));
99	// C0 = kC1 in4 \| kC1 * in12*
100	// C1 = kC2 in4 \| kC2 * in12*
101	const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), `1`);
102	const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
103	const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[`0`]),
104	vget_low_s16(rows->val[`1`])); // in0 + in8
105	const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[`0`]),
106	vget_low_s16(rows->val[`1`])); // in0 - in8
107	// c = kC2 in4 - kC1 * in12*
108	// d = kC1 in4 + kC2 * in12*
109	const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
110	const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
111	const int16x8_t D0 = vcombine_s16(a, b); // D0 = a \| b
112	const int16x8_t D1 = vcombine_s16(d, c); // D1 = d \| c
113	const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d \| b+c
114	const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d \| b-c
115	const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
116	Transpose8x2(E0, E1, rows);
117	}
118
119	static void ITransformOne(const uint8_t* ref,
120	const int16_t* in, uint8_t* dst) {
121	int16x8x2_t rows;
122	INIT_VECTOR2(rows, vld1q_s16(in + `0`), vld1q_s16(in + `8`));
123	TransformPass(&rows);
124	TransformPass(&rows);
125	Add4x4(rows.val[`0`], rows.val[`1`], ref, dst);
126	}
127
128	#else
129
130	static void ITransformOne(const uint8_t* ref,
131	const int16_t* in, uint8_t* dst) {
132	const int kBPS = BPS;
133	const int16_t kC1C2[] = { kC1, kC2, `0`, `0` };
134
135	__asm__ volatile (
136	"vld1.16 {q1, q2}, [%[in]] \n"
137	"vld1.16 {d0}, [%[kC1C2]] \n"
138
139	// d2: in[0]
140	// d3: in[8]
141	// d4: in[4]
142	// d5: in[12]
143	"vswp d3, d4 \n"
144
145	// q8 = {in[4], in[12]} kC1 * 2 >> 16*
146	// q9 = {in[4], in[12]} kC2 >> 16*
147	"vqdmulh.s16 q8, q2, d0[0] \n"
148	"vqdmulh.s16 q9, q2, d0[1] \n"
149
150	// d22 = a = in[0] + in[8]
151	// d23 = b = in[0] - in[8]
152	"vqadd.s16 d22, d2, d3 \n"
153	"vqsub.s16 d23, d2, d3 \n"
154
155	// q8 = in[4]/[12] kC1 >> 16*
156	"vshr.s16 q8, q8, #1 \n"
157
158	// Add {in[4], in[12]} back after the multiplication.
159	"vqadd.s16 q8, q2, q8 \n"
160
161	// d20 = c = in[4]kC2 - in[12]kC1
162	// d21 = d = in[4]kC1 + in[12]kC2
163	"vqsub.s16 d20, d18, d17 \n"
164	"vqadd.s16 d21, d19, d16 \n"
165
166	// d2 = tmp[0] = a + d
167	// d3 = tmp[1] = b + c
168	// d4 = tmp[2] = b - c
169	// d5 = tmp[3] = a - d
170	"vqadd.s16 d2, d22, d21 \n"
171	"vqadd.s16 d3, d23, d20 \n"
172	"vqsub.s16 d4, d23, d20 \n"
173	"vqsub.s16 d5, d22, d21 \n"
174
175	"vzip.16 q1, q2 \n"
176	"vzip.16 q1, q2 \n"
177
178	"vswp d3, d4 \n"
179
180	// q8 = {tmp[4], tmp[12]} kC1 * 2 >> 16*
181	// q9 = {tmp[4], tmp[12]} kC2 >> 16*
182	"vqdmulh.s16 q8, q2, d0[0] \n"
183	"vqdmulh.s16 q9, q2, d0[1] \n"
184
185	// d22 = a = tmp[0] + tmp[8]
186	// d23 = b = tmp[0] - tmp[8]
187	"vqadd.s16 d22, d2, d3 \n"
188	"vqsub.s16 d23, d2, d3 \n"
189
190	"vshr.s16 q8, q8, #1 \n"
191	"vqadd.s16 q8, q2, q8 \n"
192
193	// d20 = c = in[4]kC2 - in[12]kC1
194	// d21 = d = in[4]kC1 + in[12]kC2
195	"vqsub.s16 d20, d18, d17 \n"
196	"vqadd.s16 d21, d19, d16 \n"
197
198	// d2 = tmp[0] = a + d
199	// d3 = tmp[1] = b + c
200	// d4 = tmp[2] = b - c
201	// d5 = tmp[3] = a - d
202	"vqadd.s16 d2, d22, d21 \n"
203	"vqadd.s16 d3, d23, d20 \n"
204	"vqsub.s16 d4, d23, d20 \n"
205	"vqsub.s16 d5, d22, d21 \n"
206
207	"vld1.32 d6[0], [%[ref]], %[kBPS] \n"
208	"vld1.32 d6[1], [%[ref]], %[kBPS] \n"
209	"vld1.32 d7[0], [%[ref]], %[kBPS] \n"
210	"vld1.32 d7[1], [%[ref]], %[kBPS] \n"
211
212	"sub %[ref], %[ref], %[kBPS], lsl #2 \n"
213
214	// (val) + 4 >> 3
215	"vrshr.s16 d2, d2, #3 \n"
216	"vrshr.s16 d3, d3, #3 \n"
217	"vrshr.s16 d4, d4, #3 \n"
218	"vrshr.s16 d5, d5, #3 \n"
219
220	"vzip.16 q1, q2 \n"
221	"vzip.16 q1, q2 \n"
222
223	// Must accumulate before saturating
224	"vmovl.u8 q8, d6 \n"
225	"vmovl.u8 q9, d7 \n"
226
227	"vqadd.s16 q1, q1, q8 \n"
228	"vqadd.s16 q2, q2, q9 \n"
229
230	"vqmovun.s16 d0, q1 \n"
231	"vqmovun.s16 d1, q2 \n"
232
233	"vst1.32 d0[0], [%[dst]], %[kBPS] \n"
234	"vst1.32 d0[1], [%[dst]], %[kBPS] \n"
235	"vst1.32 d1[0], [%[dst]], %[kBPS] \n"
236	"vst1.32 d1[1], [%[dst]] \n"
237
238	: [in] "+r"(in), [dst] "+r"(dst) // modified registers
239	: [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref) // constants
240	: "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" // clobbered
241	);
242	}
243
244	#endif // WEBP_USE_INTRINSICS
245
// Inverse-transforms one 4x4 block, or two horizontally adjacent ones when
// 'do_two' is non-zero. The second block uses the next 16 coefficients and
// sits 4 pixels to the right in both 'ref' and 'dst'.
static void ITransform(const uint8_t* ref,
                       const int16_t* in, uint8_t* dst, int do_two) {
  const int num_blocks = do_two ? 2 : 1;
  int i;
  for (i = 0; i < num_blocks; ++i) {
    ITransformOne(ref + 4 * i, in + 16 * i, dst + 4 * i);
  }
}
253
254	// Load all 4x4 pixels into a single uint8x16_t variable.
255	static uint8x16_t Load4x4(const uint8_t* src) {
256	uint32x4_t out = vdupq_n_u32(`0`);
257	out = vld1q_lane_u32((const uint32_t)(src + `0` BPS), out, `0`);
258	out = vld1q_lane_u32((const uint32_t)(src + `1` BPS), out, `1`);
259	out = vld1q_lane_u32((const uint32_t)(src + `2` BPS), out, `2`);
260	out = vld1q_lane_u32((const uint32_t)(src + `3` BPS), out, `3`);
261	return vreinterpretq_u8_u32(out);
262	}
263
264	// Forward transform.
265
266	#if defined(WEBP_USE_INTRINSICS)
267
268	static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
269	const int16x4_t C, const int16x4_t D,
270	int16x8_t* const out01,
271	int16x8_t* const out32) {
272	const int16x4x2_t AB = vtrn_s16(A, B);
273	const int16x4x2_t CD = vtrn_s16(C, D);
274	const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[`0`]),
275	vreinterpret_s32_s16(CD.val[`0`]));
276	const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[`1`]),
277	vreinterpret_s32_s16(CD.val[`1`]));
278	*out01 = vreinterpretq_s16_s64(
279	vcombine_s64(vreinterpret_s64_s32(tmp02.val[`0`]),
280	vreinterpret_s64_s32(tmp13.val[`0`])));
281	*out32 = vreinterpretq_s16_s64(
282	vcombine_s64(vreinterpret_s64_s32(tmp13.val[`1`]),
283	vreinterpret_s64_s32(tmp02.val[`1`])));
284	}
285
286	static WEBP_INLINE int16x8_t DiffU8ToS16(const uint8x8_t a,
287	const uint8x8_t b) {
288	return vreinterpretq_s16_u16(vsubl_u8(a, b));
289	}
290
291	static void FTransform(const uint8_t* src, const uint8_t* ref,
292	int16_t* out) {
293	int16x8_t d0d1, d3d2; // working 4x4 int16 variables
294	{
295	const uint8x16_t S0 = Load4x4(src);
296	const uint8x16_t R0 = Load4x4(ref);
297	const int16x8_t D0D1 = DiffU8ToS16(vget_low_u8(S0), vget_low_u8(R0));
298	const int16x8_t D2D3 = DiffU8ToS16(vget_high_u8(S0), vget_high_u8(R0));
299	const int16x4_t D0 = vget_low_s16(D0D1);
300	const int16x4_t D1 = vget_high_s16(D0D1);
301	const int16x4_t D2 = vget_low_s16(D2D3);
302	const int16x4_t D3 = vget_high_s16(D2D3);
303	Transpose4x4_S16(D0, D1, D2, D3, &d0d1, &d3d2);
304	}
305	{ // 1rst pass
306	const int32x4_t kCst937 = vdupq_n_s32(`937`);
307	const int32x4_t kCst1812 = vdupq_n_s32(`1812`);
308	const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 \| d1+d2 (=a0\|a1)
309	const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 \| d1-d2 (=a3\|a2)
310	const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, `3`);
311	const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
312	vget_high_s16(a0a1_2));
313	const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
314	vget_high_s16(a0a1_2));
315	const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), `2217`);
316	const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), `2217`);
317	const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), `5352`);
318	const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), `5352`);
319	const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), `9`);
320	const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), `9`);
321	Transpose4x4_S16(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
322	}
323	{ // 2nd pass
324	// the (1<<16) addition is for the replacement: a3!=0 <-> 1-(a3==0)
325	const int32x4_t kCst12000 = vdupq_n_s32(`12000` + (`1` << `16`));
326	const int32x4_t kCst51000 = vdupq_n_s32(`51000`);
327	const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 \| d1+d2 (=a0\|a1)
328	const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 \| d1-d2 (=a3\|a2)
329	const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(`7`));
330	const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), `4`);
331	const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), `4`);
332	const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), `2217`);
333	const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), `2217`);
334	const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), `5352`);
335	const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), `5352`);
336	const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
337	const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
338	const int16x4_t a3_eq_0 =
339	vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(`0`)));
340	const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
341	vst1_s16(out + `0`, out0);
342	vst1_s16(out + `4`, out1);
343	vst1_s16(out + `8`, out2);
344	vst1_s16(out + `12`, out3);
345	}
346	}
347
348	#else
349
350	// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
351	static const int16_t kCoeff16[] = {
352	`5352`, `5352`, `5352`, `5352`, `2217`, `2217`, `2217`, `2217`
353	};
354	static const int32_t kCoeff32[] = {
355	`1812`, `1812`, `1812`, `1812`,
356	`937`, `937`, `937`, `937`,
357	`12000`, `12000`, `12000`, `12000`,
358	`51000`, `51000`, `51000`, `51000`
359	};
360
361	static void FTransform(const uint8_t* src, const uint8_t* ref,
362	int16_t* out) {
363	const int kBPS = BPS;
364	const uint8_t* src_ptr = src;
365	const uint8_t* ref_ptr = ref;
366	const int16_t* coeff16 = kCoeff16;
367	const int32_t* coeff32 = kCoeff32;
368
369	__asm__ volatile (
370	// load src into q4, q5 in high half
371	"vld1.8 {d8}, [%[src_ptr]], %[kBPS] \n"
372	"vld1.8 {d10}, [%[src_ptr]], %[kBPS] \n"
373	"vld1.8 {d9}, [%[src_ptr]], %[kBPS] \n"
374	"vld1.8 {d11}, [%[src_ptr]] \n"
375
376	// load ref into q6, q7 in high half
377	"vld1.8 {d12}, [%[ref_ptr]], %[kBPS] \n"
378	"vld1.8 {d14}, [%[ref_ptr]], %[kBPS] \n"
379	"vld1.8 {d13}, [%[ref_ptr]], %[kBPS] \n"
380	"vld1.8 {d15}, [%[ref_ptr]] \n"
381
382	// Pack the high values in to q4 and q6
383	"vtrn.32 q4, q5 \n"
384	"vtrn.32 q6, q7 \n"
385
386	// d[0-3] = src - ref
387	"vsubl.u8 q0, d8, d12 \n"
388	"vsubl.u8 q1, d9, d13 \n"
389
390	// load coeff16 into q8(d16=5352, d17=2217)
391	"vld1.16 {q8}, [%[coeff16]] \n"
392
393	// load coeff32 high half into q9 = 1812, q10 = 937
394	"vld1.32 {q9, q10}, [%[coeff32]]! \n"
395
396	// load coeff32 low half into q11=12000, q12=51000
397	"vld1.32 {q11,q12}, [%[coeff32]] \n"
398
399	// part 1
400	// Transpose. Register dN is the same as dN in C
401	"vtrn.32 d0, d2 \n"
402	"vtrn.32 d1, d3 \n"
403	"vtrn.16 d0, d1 \n"
404	"vtrn.16 d2, d3 \n"
405
406	"vadd.s16 d4, d0, d3 \n" // a0 = d0 + d3
407	"vadd.s16 d5, d1, d2 \n" // a1 = d1 + d2
408	"vsub.s16 d6, d1, d2 \n" // a2 = d1 - d2
409	"vsub.s16 d7, d0, d3 \n" // a3 = d0 - d3
410
411	"vadd.s16 d0, d4, d5 \n" // a0 + a1
412	"vshl.s16 d0, d0, #3 \n" // temp[0+i4] = (a0+a1) << 3*
413	"vsub.s16 d2, d4, d5 \n" // a0 - a1
414	"vshl.s16 d2, d2, #3 \n" // (temp[2+i4] = (a0-a1) << 3*
415
416	"vmlal.s16 q9, d7, d16 \n" // a35352 + 1812*
417	"vmlal.s16 q10, d7, d17 \n" // a32217 + 937*
418	"vmlal.s16 q9, d6, d17 \n" // a22217 + a35352 + 1812
419	"vmlsl.s16 q10, d6, d16 \n" // a32217 + 937 - a25352
420
421	// temp[1+i4] = (d22217 + d35352 + 1812) >> 9*
422	// temp[3+i4] = (d32217 + 937 - d25352) >> 9*
423	"vshrn.s32 d1, q9, #9 \n"
424	"vshrn.s32 d3, q10, #9 \n"
425
426	// part 2
427	// transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
428	"vtrn.32 d0, d2 \n"
429	"vtrn.32 d1, d3 \n"
430	"vtrn.16 d0, d1 \n"
431	"vtrn.16 d2, d3 \n"
432
433	"vmov.s16 d26, #7 \n"
434
435	"vadd.s16 d4, d0, d3 \n" // a1 = ip[0] + ip[12]
436	"vadd.s16 d5, d1, d2 \n" // b1 = ip[4] + ip[8]
437	"vsub.s16 d6, d1, d2 \n" // c1 = ip[4] - ip[8]
438	"vadd.s16 d4, d4, d26 \n" // a1 + 7
439	"vsub.s16 d7, d0, d3 \n" // d1 = ip[0] - ip[12]
440
441	"vadd.s16 d0, d4, d5 \n" // op[0] = a1 + b1 + 7
442	"vsub.s16 d2, d4, d5 \n" // op[8] = a1 - b1 + 7
443
444	"vmlal.s16 q11, d7, d16 \n" // d15352 + 12000*
445	"vmlal.s16 q12, d7, d17 \n" // d12217 + 51000*
446
447	"vceq.s16 d4, d7, #0 \n"
448
449	"vshr.s16 d0, d0, #4 \n"
450	"vshr.s16 d2, d2, #4 \n"
451
452	"vmlal.s16 q11, d6, d17 \n" // c12217 + d15352 + 12000
453	"vmlsl.s16 q12, d6, d16 \n" // d12217 - c15352 + 51000
454
455	"vmvn d4, d4 \n" // !(d1 == 0)
456	// op[4] = (c12217 + d15352 + 12000)>>16
457	"vshrn.s32 d1, q11, #16 \n"
458	// op[4] += (d1!=0)
459	"vsub.s16 d1, d1, d4 \n"
460	// op[12]= (d12217 - c15352 + 51000)>>16
461	"vshrn.s32 d3, q12, #16 \n"
462
463	// set result to out array
464	"vst1.16 {q0, q1}, [%[out]] \n"
465	: [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
466	[coeff32] "+r"(coeff32) // modified registers
467	: [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
468	[out] "r"(out) // constants
469	: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
470	"q10", "q11", "q12", "q13" // clobbered
471	);
472	}
473
474	#endif
475
476	#define LOAD_LANE_16b(VALUE, LANE) do { \
477	(VALUE) = vld1_lane_s16(src, (VALUE), (LANE)); \
478	src += stride; \
479	} while (0)
480
481	static void FTransformWHT(const int16_t* src, int16_t* out) {
482	const int stride = `16`;
483	const int16x4_t zero = vdup_n_s16(`0`);
484	int32x4x4_t tmp0;
485	int16x4x4_t in;
486	INIT_VECTOR4(in, zero, zero, zero, zero);
487	LOAD_LANE_16b(in.val[`0`], `0`);
488	LOAD_LANE_16b(in.val[`1`], `0`);
489	LOAD_LANE_16b(in.val[`2`], `0`);
490	LOAD_LANE_16b(in.val[`3`], `0`);
491	LOAD_LANE_16b(in.val[`0`], `1`);
492	LOAD_LANE_16b(in.val[`1`], `1`);
493	LOAD_LANE_16b(in.val[`2`], `1`);
494	LOAD_LANE_16b(in.val[`3`], `1`);
495	LOAD_LANE_16b(in.val[`0`], `2`);
496	LOAD_LANE_16b(in.val[`1`], `2`);
497	LOAD_LANE_16b(in.val[`2`], `2`);
498	LOAD_LANE_16b(in.val[`3`], `2`);
499	LOAD_LANE_16b(in.val[`0`], `3`);
500	LOAD_LANE_16b(in.val[`1`], `3`);
501	LOAD_LANE_16b(in.val[`2`], `3`);
502	LOAD_LANE_16b(in.val[`3`], `3`);
503
504	{
505	// a0 = in[0 16] + in[2 * 16]*
506	// a1 = in[1 16] + in[3 * 16]*
507	// a2 = in[1 16] - in[3 * 16]*
508	// a3 = in[0 16] - in[2 * 16]*
509	const int32x4_t a0 = vaddl_s16(in.val[`0`], in.val[`2`]);
510	const int32x4_t a1 = vaddl_s16(in.val[`1`], in.val[`3`]);
511	const int32x4_t a2 = vsubl_s16(in.val[`1`], in.val[`3`]);
512	const int32x4_t a3 = vsubl_s16(in.val[`0`], in.val[`2`]);
513	tmp0.val[`0`] = vaddq_s32(a0, a1);
514	tmp0.val[`1`] = vaddq_s32(a3, a2);
515	tmp0.val[`2`] = vsubq_s32(a3, a2);
516	tmp0.val[`3`] = vsubq_s32(a0, a1);
517	}
518	{
519	const int32x4x4_t tmp1 = Transpose4x4(tmp0);
520	// a0 = tmp[0 + i] + tmp[ 8 + i]
521	// a1 = tmp[4 + i] + tmp[12 + i]
522	// a2 = tmp[4 + i] - tmp[12 + i]
523	// a3 = tmp[0 + i] - tmp[ 8 + i]
524	const int32x4_t a0 = vaddq_s32(tmp1.val[`0`], tmp1.val[`2`]);
525	const int32x4_t a1 = vaddq_s32(tmp1.val[`1`], tmp1.val[`3`]);
526	const int32x4_t a2 = vsubq_s32(tmp1.val[`1`], tmp1.val[`3`]);
527	const int32x4_t a3 = vsubq_s32(tmp1.val[`0`], tmp1.val[`2`]);
528	const int32x4_t b0 = vhaddq_s32(a0, a1); // (a0 + a1) >> 1
529	const int32x4_t b1 = vhaddq_s32(a3, a2); // (a3 + a2) >> 1
530	const int32x4_t b2 = vhsubq_s32(a3, a2); // (a3 - a2) >> 1
531	const int32x4_t b3 = vhsubq_s32(a0, a1); // (a0 - a1) >> 1
532	const int16x4_t out0 = vmovn_s32(b0);
533	const int16x4_t out1 = vmovn_s32(b1);
534	const int16x4_t out2 = vmovn_s32(b2);
535	const int16x4_t out3 = vmovn_s32(b3);
536
537	vst1_s16(out + `0`, out0);
538	vst1_s16(out + `4`, out1);
539	vst1_s16(out + `8`, out2);
540	vst1_s16(out + `12`, out3);
541	}
542	}
543	#undef LOAD_LANE_16b
544
545	//------------------------------------------------------------------------------
546	// Texture distortion
547	//
548	// We try to match the spectral content (weighted) between source and
549	// reconstructed samples.
550
551	// a 0123, b 0123
552	// a 4567, b 4567
553	// a 89ab, b 89ab
554	// a cdef, b cdef
555	//
556	// transpose
557	//
558	// a 048c, b 048c
559	// a 159d, b 159d
560	// a 26ae, b 26ae
561	// a 37bf, b 37bf
562	//
563	static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) {
564	const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[`0`], q4_in.val[`1`]);
565	const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[`2`], q4_in.val[`3`]);
566	const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[`0`]),
567	vreinterpretq_s32_s16(q2_tmp1.val[`0`]));
568	const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[`1`]),
569	vreinterpretq_s32_s16(q2_tmp1.val[`1`]));
570	q4_in.val[`0`] = vreinterpretq_s16_s32(q2_tmp2.val[`0`]);
571	q4_in.val[`2`] = vreinterpretq_s16_s32(q2_tmp2.val[`1`]);
572	q4_in.val[`1`] = vreinterpretq_s16_s32(q2_tmp3.val[`0`]);
573	q4_in.val[`3`] = vreinterpretq_s16_s32(q2_tmp3.val[`1`]);
574	return q4_in;
575	}
576
577	static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const int16x8x4_t q4_in) {
578	// {a0, a1} = {in[0] + in[2], in[1] + in[3]}
579	// {a3, a2} = {in[0] - in[2], in[1] - in[3]}
580	const int16x8_t q_a0 = vaddq_s16(q4_in.val[`0`], q4_in.val[`2`]);
581	const int16x8_t q_a1 = vaddq_s16(q4_in.val[`1`], q4_in.val[`3`]);
582	const int16x8_t q_a3 = vsubq_s16(q4_in.val[`0`], q4_in.val[`2`]);
583	const int16x8_t q_a2 = vsubq_s16(q4_in.val[`1`], q4_in.val[`3`]);
584	int16x8x4_t q4_out;
585	// tmp[0] = a0 + a1
586	// tmp[1] = a3 + a2
587	// tmp[2] = a3 - a2
588	// tmp[3] = a0 - a1
589	INIT_VECTOR4(q4_out,
590	vabsq_s16(vaddq_s16(q_a0, q_a1)),
591	vabsq_s16(vaddq_s16(q_a3, q_a2)),
592	vabdq_s16(q_a3, q_a2), vabdq_s16(q_a0, q_a1));
593	return q4_out;
594	}
595
596	static WEBP_INLINE int16x8x4_t DistoVerticalPass(const uint8x8x4_t q4_in) {
597	const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[`0`],
598	q4_in.val[`2`]));
599	const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[`1`],
600	q4_in.val[`3`]));
601	const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[`1`],
602	q4_in.val[`3`]));
603	const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[`0`],
604	q4_in.val[`2`]));
605	int16x8x4_t q4_out;
606
607	INIT_VECTOR4(q4_out,
608	vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
609	vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
610	return q4_out;
611	}
612
613	static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) {
614	const uint16x8_t q_w07 = vld1q_u16(&w[`0`]);
615	const uint16x8_t q_w8f = vld1q_u16(&w[`8`]);
616	int16x4x4_t d4_w;
617	INIT_VECTOR4(d4_w,
618	vget_low_s16(vreinterpretq_s16_u16(q_w07)),
619	vget_high_s16(vreinterpretq_s16_u16(q_w07)),
620	vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
621	vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
622	return d4_w;
623	}
624
625	static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in,
626	const int16x4x4_t d4_w) {
627	int32x2_t d_sum;
628	// sum += w[ 0] abs(b0);*
629	// sum += w[ 4] abs(b1);*
630	// sum += w[ 8] abs(b2);*
631	// sum += w[12] abs(b3);*
632	int32x4_t q_sum0 = vmull_s16(d4_w.val[`0`], vget_low_s16(q4_in.val[`0`]));
633	int32x4_t q_sum1 = vmull_s16(d4_w.val[`1`], vget_low_s16(q4_in.val[`1`]));
634	int32x4_t q_sum2 = vmull_s16(d4_w.val[`2`], vget_low_s16(q4_in.val[`2`]));
635	int32x4_t q_sum3 = vmull_s16(d4_w.val[`3`], vget_low_s16(q4_in.val[`3`]));
636	q_sum0 = vmlsl_s16(q_sum0, d4_w.val[`0`], vget_high_s16(q4_in.val[`0`]));
637	q_sum1 = vmlsl_s16(q_sum1, d4_w.val[`1`], vget_high_s16(q4_in.val[`1`]));
638	q_sum2 = vmlsl_s16(q_sum2, d4_w.val[`2`], vget_high_s16(q4_in.val[`2`]));
639	q_sum3 = vmlsl_s16(q_sum3, d4_w.val[`3`], vget_high_s16(q4_in.val[`3`]));
640
641	q_sum0 = vaddq_s32(q_sum0, q_sum1);
642	q_sum2 = vaddq_s32(q_sum2, q_sum3);
643	q_sum2 = vaddq_s32(q_sum0, q_sum2);
644	d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
645	d_sum = vpadd_s32(d_sum, d_sum);
646	return d_sum;
647	}
648
649	#define LOAD_LANE_32b(src, VALUE, LANE) \
650	(VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
651
652	// Hadamard transform
653	// Returns the weighted sum of the absolute value of transformed coefficients.
654	// w[] contains a row-major 4 by 4 symmetric matrix.
655	static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
656	const uint16_t* const w) {
657	uint32x2_t d_in_ab_0123 = vdup_n_u32(`0`);
658	uint32x2_t d_in_ab_4567 = vdup_n_u32(`0`);
659	uint32x2_t d_in_ab_89ab = vdup_n_u32(`0`);
660	uint32x2_t d_in_ab_cdef = vdup_n_u32(`0`);
661	uint8x8x4_t d4_in;
662
663	// load data a, b
664	LOAD_LANE_32b(a + `0` * BPS, d_in_ab_0123, `0`);
665	LOAD_LANE_32b(a + `1` * BPS, d_in_ab_4567, `0`);
666	LOAD_LANE_32b(a + `2` * BPS, d_in_ab_89ab, `0`);
667	LOAD_LANE_32b(a + `3` * BPS, d_in_ab_cdef, `0`);
668	LOAD_LANE_32b(b + `0` * BPS, d_in_ab_0123, `1`);
669	LOAD_LANE_32b(b + `1` * BPS, d_in_ab_4567, `1`);
670	LOAD_LANE_32b(b + `2` * BPS, d_in_ab_89ab, `1`);
671	LOAD_LANE_32b(b + `3` * BPS, d_in_ab_cdef, `1`);
672	INIT_VECTOR4(d4_in,
673	vreinterpret_u8_u32(d_in_ab_0123),
674	vreinterpret_u8_u32(d_in_ab_4567),
675	vreinterpret_u8_u32(d_in_ab_89ab),
676	vreinterpret_u8_u32(d_in_ab_cdef));
677
678	{
679	// Vertical pass first to avoid a transpose (vertical and horizontal passes
680	// are commutative because w/kWeightY is symmetric) and subsequent
681	// transpose.
682	const int16x8x4_t q4_v = DistoVerticalPass(d4_in);
683	const int16x4x4_t d4_w = DistoLoadW(w);
684	// horizontal pass
685	const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_v);
686	const int16x8x4_t q4_h = DistoHorizontalPass(q4_t);
687	int32x2_t d_sum = DistoSum(q4_h, d4_w);
688
689	// abs(sum2 - sum1) >> 5
690	d_sum = vabs_s32(d_sum);
691	d_sum = vshr_n_s32(d_sum, `5`);
692	return vget_lane_s32(d_sum, `0`);
693	}
694	}
695	#undef LOAD_LANE_32b
696
697	static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
698	const uint16_t* const w) {
699	int D = `0`;
700	int x, y;
701	for (y = `0`; y < `16` * BPS; y += `4` * BPS) {
702	for (x = `0`; x < `16`; x += `4`) {
703	D += Disto4x4(a + x + y, b + x + y, w);
704	}
705	}
706	return D;
707	}
708
709	//------------------------------------------------------------------------------
710
711	static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
712	int start_block, int end_block,
713	VP8Histogram* const histo) {
714	const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
715	int j;
716	int distribution[MAX_COEFF_THRESH + `1`] = { `0` };
717	for (j = start_block; j < end_block; ++j) {
718	int16_t out[`16`];
719	FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
720	{
721	int k;
722	const int16x8_t a0 = vld1q_s16(out + `0`);
723	const int16x8_t b0 = vld1q_s16(out + `8`);
724	const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
725	const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
726	const uint16x8_t a2 = vshrq_n_u16(a1, `3`);
727	const uint16x8_t b2 = vshrq_n_u16(b1, `3`);
728	const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh);
729	const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh);
730	vst1q_s16(out + `0`, vreinterpretq_s16_u16(a3));
731	vst1q_s16(out + `8`, vreinterpretq_s16_u16(b3));
732	// Convert coefficients to bin.
733	for (k = `0`; k < `16`; ++k) {
734	++distribution[out[k]];
735	}
736	}
737	}
738	VP8SetHistogramData(distribution, histo);
739	}
740
741	//------------------------------------------------------------------------------
742
743	static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
744	const uint8_t* const b,
745	uint32x4_t* const sum) {
746	const uint8x16_t a0 = vld1q_u8(a);
747	const uint8x16_t b0 = vld1q_u8(b);
748	const uint8x16_t abs_diff = vabdq_u8(a0, b0);
749	const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
750	vget_low_u8(abs_diff));
751	const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
752	vget_high_u8(abs_diff));
753	/ pair-wise adds and widen /
754	const uint32x4_t sum1 = vpaddlq_u16(prod1);
755	const uint32x4_t sum2 = vpaddlq_u16(prod2);
756	sum = vaddq_u32(sum, vaddq_u32(sum1, sum2));
757	}
758
759	// Horizontal sum of all four uint32_t values in 'sum'.
760	static int SumToInt(uint32x4_t sum) {
761	const uint64x2_t sum2 = vpaddlq_u32(sum);
762	const uint64_t sum3 = vgetq_lane_u64(sum2, `0`) + vgetq_lane_u64(sum2, `1`);
763	return (int)sum3;
764	}
765
766	static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
767	uint32x4_t sum = vdupq_n_u32(`0`);
768	int y;
769	for (y = `0`; y < `16`; ++y) {
770	AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
771	}
772	return SumToInt(sum);
773	}
774
775	static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
776	uint32x4_t sum = vdupq_n_u32(`0`);
777	int y;
778	for (y = `0`; y < `8`; ++y) {
779	AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
780	}
781	return SumToInt(sum);
782	}
783
784	static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
785	uint32x4_t sum = vdupq_n_u32(`0`);
786	int y;
787	for (y = `0`; y < `8`; ++y) {
788	const uint8x8_t a0 = vld1_u8(a + y * BPS);
789	const uint8x8_t b0 = vld1_u8(b + y * BPS);
790	const uint8x8_t abs_diff = vabd_u8(a0, b0);
791	const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
792	sum = vpadalq_u16(sum, prod);
793	}
794	return SumToInt(sum);
795	}
796
797	static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
798	const uint8x16_t a0 = Load4x4(a);
799	const uint8x16_t b0 = Load4x4(b);
800	const uint8x16_t abs_diff = vabdq_u8(a0, b0);
801	const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
802	vget_low_u8(abs_diff));
803	const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
804	vget_high_u8(abs_diff));
805	/ pair-wise adds and widen /
806	const uint32x4_t sum1 = vpaddlq_u16(prod1);
807	const uint32x4_t sum2 = vpaddlq_u16(prod2);
808	return SumToInt(vaddq_u32(sum1, sum2));
809	}
810
811	//------------------------------------------------------------------------------
812
813	// Compilation with gcc-4.6.x is problematic for now.
814	#if !defined(WORK_AROUND_GCC)
815
816	static int16x8_t Quantize(int16_t* const in,
817	const VP8Matrix* const mtx, int offset) {
818	const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
819	const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
820	const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
821	const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + `0`]);
822	const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + `4`]);
823
824	const int16x8_t a = vld1q_s16(in + offset); // in
825	const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a)); // coeff = abs(in)
826	const int16x8_t sign = vshrq_n_s16(a, `15`); // sign
827	const uint16x8_t c = vaddq_u16(b, sharp); // + sharpen
828	const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
829	const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
830	const uint32x4_t m2 = vhaddq_u32(m0, bias0);
831	const uint32x4_t m3 = vhaddq_u32(m1, bias1); // (coeff iQ + bias) >> 1*
832	const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, `16`),
833	vshrn_n_u32(m3, `16`)); // QFIX=17 = 16+1
834	const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
835	const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
836	const int16x8_t c3 = vsubq_s16(c2, sign); // restore sign
837	const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
838	vst1q_s16(in + offset, c4);
839	assert(QFIX == `17`); // this function can't work as is if QFIX != 16+1
840	return c3;
841	}
842
843	static const uint8_t kShuffles[`4`][`8`] = {
844	{ `0`, `1`, `2`, `3`, `8`, `9`, `16`, `17` },
845	{ `10`, `11`, `4`, `5`, `6`, `7`, `12`, `13` },
846	{ `18`, `19`, `24`, `25`, `26`, `27`, `20`, `21` },
847	{ `14`, `15`, `22`, `23`, `28`, `29`, `30`, `31` }
848	};
849
850	static int QuantizeBlock(int16_t in[`16`], int16_t out[`16`],
851	const VP8Matrix* const mtx) {
852	const int16x8_t out0 = Quantize(in, mtx, `0`);
853	const int16x8_t out1 = Quantize(in, mtx, `8`);
854	uint8x8x4_t shuffles;
855	// vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
856	// non-standard versions there.
857	#if defined(__APPLE__) && defined(__aarch64__) && \
858	defined(__apple_build_version__) && (__apple_build_version__< 6020037)
859	uint8x16x2_t all_out;
860	INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
861	INIT_VECTOR4(shuffles,
862	vtbl2q_u8(all_out, vld1_u8(kShuffles[`0`])),
863	vtbl2q_u8(all_out, vld1_u8(kShuffles[`1`])),
864	vtbl2q_u8(all_out, vld1_u8(kShuffles[`2`])),
865	vtbl2q_u8(all_out, vld1_u8(kShuffles[`3`])));
866	#else
867	uint8x8x4_t all_out;
868	INIT_VECTOR4(all_out,
869	vreinterpret_u8_s16(vget_low_s16(out0)),
870	vreinterpret_u8_s16(vget_high_s16(out0)),
871	vreinterpret_u8_s16(vget_low_s16(out1)),
872	vreinterpret_u8_s16(vget_high_s16(out1)));
873	INIT_VECTOR4(shuffles,
874	vtbl4_u8(all_out, vld1_u8(kShuffles[`0`])),
875	vtbl4_u8(all_out, vld1_u8(kShuffles[`1`])),
876	vtbl4_u8(all_out, vld1_u8(kShuffles[`2`])),
877	vtbl4_u8(all_out, vld1_u8(kShuffles[`3`])));
878	#endif
879	// Zigzag reordering
880	vst1_u8((uint8_t*)(out + `0`), shuffles.val[`0`]);
881	vst1_u8((uint8_t*)(out + `4`), shuffles.val[`1`]);
882	vst1_u8((uint8_t*)(out + `8`), shuffles.val[`2`]);
883	vst1_u8((uint8_t*)(out + `12`), shuffles.val[`3`]);
884	// test zeros
885	if ((uint64_t)(out + `0`) != `0`) return `1`;
886	if ((uint64_t)(out + `4`) != `0`) return `1`;
887	if ((uint64_t)(out + `8`) != `0`) return `1`;
888	if ((uint64_t)(out + `12`) != `0`) return `1`;
889	return `0`;
890	}
891
892	static int Quantize2Blocks(int16_t in[`32`], int16_t out[`32`],
893	const VP8Matrix* const mtx) {
894	int nz;
895	nz = QuantizeBlock(in + `0` * `16`, out + `0` * `16`, mtx) << `0`;
896	nz \|= QuantizeBlock(in + `1` * `16`, out + `1` * `16`, mtx) << `1`;
897	return nz;
898	}
899
900	#endif // !WORK_AROUND_GCC
901
902	//------------------------------------------------------------------------------
903	// Entry point
904
905	extern void VP8EncDspInitNEON(void);
906
907	WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
908	VP8ITransform = ITransform;
909	VP8FTransform = FTransform;
910
911	VP8FTransformWHT = FTransformWHT;
912
913	VP8TDisto4x4 = Disto4x4;
914	VP8TDisto16x16 = Disto16x16;
915	VP8CollectHistogram = CollectHistogram;
916
917	VP8SSE16x16 = SSE16x16_NEON;
918	VP8SSE16x8 = SSE16x8_NEON;
919	VP8SSE8x8 = SSE8x8_NEON;
920	VP8SSE4x4 = SSE4x4_NEON;
921
922	#if !defined(WORK_AROUND_GCC)
923	VP8EncQuantizeBlock = QuantizeBlock;
924	VP8EncQuantize2Blocks = Quantize2Blocks;
925	#endif
926	}
927
928	#else // !WEBP_USE_NEON
929
930	WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)
931
932	#endif // WEBP_USE_NEON
933

Browse the source code of engine/third_party/libwebp/src/dsp/enc_neon.c