1// Copyright 2012 Google Inc. All Rights Reserved.
2//
3// Use of this source code is governed by a BSD-style license
4// that can be found in the COPYING file in the root of the source
5// tree. An additional intellectual property rights grant can be found
6// in the file PATENTS. All contributing project authors may
7// be found in the AUTHORS file in the root of the source tree.
8// -----------------------------------------------------------------------------
9//
10// ARM NEON version of dsp functions and loop filtering.
11//
12// Authors: Somnath Banerjee (somnath@google.com)
13// Johann Koenig (johannkoenig@google.com)
14
15#include "src/dsp/dsp.h"
16
17#if defined(WEBP_USE_NEON)
18
19#include "src/dsp/neon.h"
20#include "src/dec/vp8i_dec.h"
21
22//------------------------------------------------------------------------------
23// NxM Loading functions
24
25#if !defined(WORK_AROUND_GCC)
26
27// This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
28// (register alloc, probably). The variants somewhat mitigate the problem, but
29// not quite. HFilter16i() remains problematic.
30static WEBP_INLINE uint8x8x4_t Load4x8_NEON(const uint8_t* const src,
31 int stride) {
32 const uint8x8_t zero = vdup_n_u8(0);
33 uint8x8x4_t out;
34 INIT_VECTOR4(out, zero, zero, zero, zero);
35 out = vld4_lane_u8(src + 0 * stride, out, 0);
36 out = vld4_lane_u8(src + 1 * stride, out, 1);
37 out = vld4_lane_u8(src + 2 * stride, out, 2);
38 out = vld4_lane_u8(src + 3 * stride, out, 3);
39 out = vld4_lane_u8(src + 4 * stride, out, 4);
40 out = vld4_lane_u8(src + 5 * stride, out, 5);
41 out = vld4_lane_u8(src + 6 * stride, out, 6);
42 out = vld4_lane_u8(src + 7 * stride, out, 7);
43 return out;
44}
45
46static WEBP_INLINE void Load4x16_NEON(const uint8_t* const src, int stride,
47 uint8x16_t* const p1,
48 uint8x16_t* const p0,
49 uint8x16_t* const q0,
50 uint8x16_t* const q1) {
51 // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
52 // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
53 const uint8x8x4_t row0 = Load4x8_NEON(src - 2 + 0 * stride, stride);
54 const uint8x8x4_t row8 = Load4x8_NEON(src - 2 + 8 * stride, stride);
55 *p1 = vcombine_u8(row0.val[0], row8.val[0]);
56 *p0 = vcombine_u8(row0.val[1], row8.val[1]);
57 *q0 = vcombine_u8(row0.val[2], row8.val[2]);
58 *q1 = vcombine_u8(row0.val[3], row8.val[3]);
59}
60
61#else // WORK_AROUND_GCC
62
63#define LOADQ_LANE_32b(VALUE, LANE) do { \
64 (VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE)); \
65 src += stride; \
66} while (0)
67
68static WEBP_INLINE void Load4x16_NEON(const uint8_t* src, int stride,
69 uint8x16_t* const p1,
70 uint8x16_t* const p0,
71 uint8x16_t* const q0,
72 uint8x16_t* const q1) {
73 const uint32x4_t zero = vdupq_n_u32(0);
74 uint32x4x4_t in;
75 INIT_VECTOR4(in, zero, zero, zero, zero);
76 src -= 2;
77 LOADQ_LANE_32b(in.val[0], 0);
78 LOADQ_LANE_32b(in.val[1], 0);
79 LOADQ_LANE_32b(in.val[2], 0);
80 LOADQ_LANE_32b(in.val[3], 0);
81 LOADQ_LANE_32b(in.val[0], 1);
82 LOADQ_LANE_32b(in.val[1], 1);
83 LOADQ_LANE_32b(in.val[2], 1);
84 LOADQ_LANE_32b(in.val[3], 1);
85 LOADQ_LANE_32b(in.val[0], 2);
86 LOADQ_LANE_32b(in.val[1], 2);
87 LOADQ_LANE_32b(in.val[2], 2);
88 LOADQ_LANE_32b(in.val[3], 2);
89 LOADQ_LANE_32b(in.val[0], 3);
90 LOADQ_LANE_32b(in.val[1], 3);
91 LOADQ_LANE_32b(in.val[2], 3);
92 LOADQ_LANE_32b(in.val[3], 3);
93 // Transpose four 4x4 parts:
94 {
95 const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),
96 vreinterpretq_u8_u32(in.val[1]));
97 const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),
98 vreinterpretq_u8_u32(in.val[3]));
99 const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
100 vreinterpretq_u16_u8(row23.val[0]));
101 const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
102 vreinterpretq_u16_u8(row23.val[1]));
103 *p1 = vreinterpretq_u8_u16(row02.val[0]);
104 *p0 = vreinterpretq_u8_u16(row13.val[0]);
105 *q0 = vreinterpretq_u8_u16(row02.val[1]);
106 *q1 = vreinterpretq_u8_u16(row13.val[1]);
107 }
108}
109#undef LOADQ_LANE_32b
110
111#endif // !WORK_AROUND_GCC
112
113static WEBP_INLINE void Load8x16_NEON(
114 const uint8_t* const src, int stride,
115 uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
116 uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
117 uint8x16_t* const q2, uint8x16_t* const q3) {
118 Load4x16_NEON(src - 2, stride, p3, p2, p1, p0);
119 Load4x16_NEON(src + 2, stride, q0, q1, q2, q3);
120}
121
122static WEBP_INLINE void Load16x4_NEON(const uint8_t* const src, int stride,
123 uint8x16_t* const p1,
124 uint8x16_t* const p0,
125 uint8x16_t* const q0,
126 uint8x16_t* const q1) {
127 *p1 = vld1q_u8(src - 2 * stride);
128 *p0 = vld1q_u8(src - 1 * stride);
129 *q0 = vld1q_u8(src + 0 * stride);
130 *q1 = vld1q_u8(src + 1 * stride);
131}
132
133static WEBP_INLINE void Load16x8_NEON(
134 const uint8_t* const src, int stride,
135 uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
136 uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
137 uint8x16_t* const q2, uint8x16_t* const q3) {
138 Load16x4_NEON(src - 2 * stride, stride, p3, p2, p1, p0);
139 Load16x4_NEON(src + 2 * stride, stride, q0, q1, q2, q3);
140}
141
142static WEBP_INLINE void Load8x8x2_NEON(
143 const uint8_t* const u, const uint8_t* const v, int stride,
144 uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
145 uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
146 uint8x16_t* const q2, uint8x16_t* const q3) {
147 // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
148 // and the v-samples on the higher half.
149 *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
150 *p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));
151 *p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));
152 *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));
153 *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));
154 *q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));
155 *q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));
156 *q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));
157}
158
159#if !defined(WORK_AROUND_GCC)
160
161#define LOAD_UV_8(ROW) \
162 vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))
163
164static WEBP_INLINE void Load8x8x2T_NEON(
165 const uint8_t* const u, const uint8_t* const v, int stride,
166 uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
167 uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
168 uint8x16_t* const q2, uint8x16_t* const q3) {
169 // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
170 // and the v-samples on the higher half.
171 const uint8x16_t row0 = LOAD_UV_8(0);
172 const uint8x16_t row1 = LOAD_UV_8(1);
173 const uint8x16_t row2 = LOAD_UV_8(2);
174 const uint8x16_t row3 = LOAD_UV_8(3);
175 const uint8x16_t row4 = LOAD_UV_8(4);
176 const uint8x16_t row5 = LOAD_UV_8(5);
177 const uint8x16_t row6 = LOAD_UV_8(6);
178 const uint8x16_t row7 = LOAD_UV_8(7);
179 // Perform two side-by-side 8x8 transposes
180 // u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07
181 // u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ...
182 // u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...
183 // u30 u31 u32 u33 u34 u35 u36 u37 | ...
184 // u40 u41 u42 u43 u44 u45 u46 u47 | ...
185 // u50 u51 u52 u53 u54 u55 u56 u57 | ...
186 // u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...
187 // u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...
188 const uint8x16x2_t row01 = vtrnq_u8(row0, row1); // u00 u10 u02 u12 ...
189 // u01 u11 u03 u13 ...
190 const uint8x16x2_t row23 = vtrnq_u8(row2, row3); // u20 u30 u22 u32 ...
191 // u21 u31 u23 u33 ...
192 const uint8x16x2_t row45 = vtrnq_u8(row4, row5); // ...
193 const uint8x16x2_t row67 = vtrnq_u8(row6, row7); // ...
194 const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
195 vreinterpretq_u16_u8(row23.val[0]));
196 const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
197 vreinterpretq_u16_u8(row23.val[1]));
198 const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),
199 vreinterpretq_u16_u8(row67.val[0]));
200 const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),
201 vreinterpretq_u16_u8(row67.val[1]));
202 const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),
203 vreinterpretq_u32_u16(row46.val[0]));
204 const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),
205 vreinterpretq_u32_u16(row46.val[1]));
206 const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),
207 vreinterpretq_u32_u16(row57.val[0]));
208 const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),
209 vreinterpretq_u32_u16(row57.val[1]));
210 *p3 = vreinterpretq_u8_u32(row04.val[0]);
211 *p2 = vreinterpretq_u8_u32(row15.val[0]);
212 *p1 = vreinterpretq_u8_u32(row26.val[0]);
213 *p0 = vreinterpretq_u8_u32(row37.val[0]);
214 *q0 = vreinterpretq_u8_u32(row04.val[1]);
215 *q1 = vreinterpretq_u8_u32(row15.val[1]);
216 *q2 = vreinterpretq_u8_u32(row26.val[1]);
217 *q3 = vreinterpretq_u8_u32(row37.val[1]);
218}
219#undef LOAD_UV_8
220
221#endif // !WORK_AROUND_GCC
222
223static WEBP_INLINE void Store2x8_NEON(const uint8x8x2_t v,
224 uint8_t* const dst, int stride) {
225 vst2_lane_u8(dst + 0 * stride, v, 0);
226 vst2_lane_u8(dst + 1 * stride, v, 1);
227 vst2_lane_u8(dst + 2 * stride, v, 2);
228 vst2_lane_u8(dst + 3 * stride, v, 3);
229 vst2_lane_u8(dst + 4 * stride, v, 4);
230 vst2_lane_u8(dst + 5 * stride, v, 5);
231 vst2_lane_u8(dst + 6 * stride, v, 6);
232 vst2_lane_u8(dst + 7 * stride, v, 7);
233}
234
235static WEBP_INLINE void Store2x16_NEON(const uint8x16_t p0, const uint8x16_t q0,
236 uint8_t* const dst, int stride) {
237 uint8x8x2_t lo, hi;
238 lo.val[0] = vget_low_u8(p0);
239 lo.val[1] = vget_low_u8(q0);
240 hi.val[0] = vget_high_u8(p0);
241 hi.val[1] = vget_high_u8(q0);
242 Store2x8_NEON(lo, dst - 1 + 0 * stride, stride);
243 Store2x8_NEON(hi, dst - 1 + 8 * stride, stride);
244}
245
246#if !defined(WORK_AROUND_GCC)
247static WEBP_INLINE void Store4x8_NEON(const uint8x8x4_t v,
248 uint8_t* const dst, int stride) {
249 vst4_lane_u8(dst + 0 * stride, v, 0);
250 vst4_lane_u8(dst + 1 * stride, v, 1);
251 vst4_lane_u8(dst + 2 * stride, v, 2);
252 vst4_lane_u8(dst + 3 * stride, v, 3);
253 vst4_lane_u8(dst + 4 * stride, v, 4);
254 vst4_lane_u8(dst + 5 * stride, v, 5);
255 vst4_lane_u8(dst + 6 * stride, v, 6);
256 vst4_lane_u8(dst + 7 * stride, v, 7);
257}
258
259static WEBP_INLINE void Store4x16_NEON(const uint8x16_t p1, const uint8x16_t p0,
260 const uint8x16_t q0, const uint8x16_t q1,
261 uint8_t* const dst, int stride) {
262 uint8x8x4_t lo, hi;
263 INIT_VECTOR4(lo,
264 vget_low_u8(p1), vget_low_u8(p0),
265 vget_low_u8(q0), vget_low_u8(q1));
266 INIT_VECTOR4(hi,
267 vget_high_u8(p1), vget_high_u8(p0),
268 vget_high_u8(q0), vget_high_u8(q1));
269 Store4x8_NEON(lo, dst - 2 + 0 * stride, stride);
270 Store4x8_NEON(hi, dst - 2 + 8 * stride, stride);
271}
272#endif // !WORK_AROUND_GCC
273
274static WEBP_INLINE void Store16x2_NEON(const uint8x16_t p0, const uint8x16_t q0,
275 uint8_t* const dst, int stride) {
276 vst1q_u8(dst - stride, p0);
277 vst1q_u8(dst, q0);
278}
279
280static WEBP_INLINE void Store16x4_NEON(const uint8x16_t p1, const uint8x16_t p0,
281 const uint8x16_t q0, const uint8x16_t q1,
282 uint8_t* const dst, int stride) {
283 Store16x2_NEON(p1, p0, dst - stride, stride);
284 Store16x2_NEON(q0, q1, dst + stride, stride);
285}
286
287static WEBP_INLINE void Store8x2x2_NEON(const uint8x16_t p0,
288 const uint8x16_t q0,
289 uint8_t* const u, uint8_t* const v,
290 int stride) {
291 // p0 and q0 contain the u+v samples packed in low/high halves.
292 vst1_u8(u - stride, vget_low_u8(p0));
293 vst1_u8(u, vget_low_u8(q0));
294 vst1_u8(v - stride, vget_high_u8(p0));
295 vst1_u8(v, vget_high_u8(q0));
296}
297
298static WEBP_INLINE void Store8x4x2_NEON(const uint8x16_t p1,
299 const uint8x16_t p0,
300 const uint8x16_t q0,
301 const uint8x16_t q1,
302 uint8_t* const u, uint8_t* const v,
303 int stride) {
304 // The p1...q1 registers contain the u+v samples packed in low/high halves.
305 Store8x2x2_NEON(p1, p0, u - stride, v - stride, stride);
306 Store8x2x2_NEON(q0, q1, u + stride, v + stride, stride);
307}
308
309#if !defined(WORK_AROUND_GCC)
310
311#define STORE6_LANE(DST, VAL0, VAL1, LANE) do { \
312 vst3_lane_u8((DST) - 3, (VAL0), (LANE)); \
313 vst3_lane_u8((DST) + 0, (VAL1), (LANE)); \
314 (DST) += stride; \
315} while (0)
316
317static WEBP_INLINE void Store6x8x2_NEON(
318 const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
319 const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
320 uint8_t* u, uint8_t* v, int stride) {
321 uint8x8x3_t u0, u1, v0, v1;
322 INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
323 INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
324 INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));
325 INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));
326 STORE6_LANE(u, u0, u1, 0);
327 STORE6_LANE(u, u0, u1, 1);
328 STORE6_LANE(u, u0, u1, 2);
329 STORE6_LANE(u, u0, u1, 3);
330 STORE6_LANE(u, u0, u1, 4);
331 STORE6_LANE(u, u0, u1, 5);
332 STORE6_LANE(u, u0, u1, 6);
333 STORE6_LANE(u, u0, u1, 7);
334 STORE6_LANE(v, v0, v1, 0);
335 STORE6_LANE(v, v0, v1, 1);
336 STORE6_LANE(v, v0, v1, 2);
337 STORE6_LANE(v, v0, v1, 3);
338 STORE6_LANE(v, v0, v1, 4);
339 STORE6_LANE(v, v0, v1, 5);
340 STORE6_LANE(v, v0, v1, 6);
341 STORE6_LANE(v, v0, v1, 7);
342}
343#undef STORE6_LANE
344
345static WEBP_INLINE void Store4x8x2_NEON(const uint8x16_t p1,
346 const uint8x16_t p0,
347 const uint8x16_t q0,
348 const uint8x16_t q1,
349 uint8_t* const u, uint8_t* const v,
350 int stride) {
351 uint8x8x4_t u0, v0;
352 INIT_VECTOR4(u0,
353 vget_low_u8(p1), vget_low_u8(p0),
354 vget_low_u8(q0), vget_low_u8(q1));
355 INIT_VECTOR4(v0,
356 vget_high_u8(p1), vget_high_u8(p0),
357 vget_high_u8(q0), vget_high_u8(q1));
358 vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
359 vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
360 vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
361 vst4_lane_u8(u - 2 + 3 * stride, u0, 3);
362 vst4_lane_u8(u - 2 + 4 * stride, u0, 4);
363 vst4_lane_u8(u - 2 + 5 * stride, u0, 5);
364 vst4_lane_u8(u - 2 + 6 * stride, u0, 6);
365 vst4_lane_u8(u - 2 + 7 * stride, u0, 7);
366 vst4_lane_u8(v - 2 + 0 * stride, v0, 0);
367 vst4_lane_u8(v - 2 + 1 * stride, v0, 1);
368 vst4_lane_u8(v - 2 + 2 * stride, v0, 2);
369 vst4_lane_u8(v - 2 + 3 * stride, v0, 3);
370 vst4_lane_u8(v - 2 + 4 * stride, v0, 4);
371 vst4_lane_u8(v - 2 + 5 * stride, v0, 5);
372 vst4_lane_u8(v - 2 + 6 * stride, v0, 6);
373 vst4_lane_u8(v - 2 + 7 * stride, v0, 7);
374}
375
376#endif // !WORK_AROUND_GCC
377
378// Zero extend 'v' to an int16x8_t.
379static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint8x8_t v) {
380 return vreinterpretq_s16_u16(vmovl_u8(v));
381}
382
383// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
384// to the corresponding rows of 'dst'.
385static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
386 const int16x8_t dst01,
387 const int16x8_t dst23) {
388 // Unsigned saturate to 8b.
389 const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
390 const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
391
392 // Store the results.
393 vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
394 vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
395 vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
396 vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
397}
398
399static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
400 const int16x8_t row23,
401 uint8_t* const dst) {
402 uint32x2_t dst01 = vdup_n_u32(0);
403 uint32x2_t dst23 = vdup_n_u32(0);
404
405 // Load the source pixels.
406 dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
407 dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
408 dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
409 dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);
410
411 {
412 // Convert to 16b.
413 const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst01));
414 const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst23));
415
416 // Descale with rounding.
417 const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
418 const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
419 // Add the inverse transform.
420 SaturateAndStore4x4_NEON(dst, out01, out23);
421 }
422}
423
424//-----------------------------------------------------------------------------
425// Simple In-loop filtering (Paragraph 15.2)
426
427static uint8x16_t NeedsFilter_NEON(const uint8x16_t p1, const uint8x16_t p0,
428 const uint8x16_t q0, const uint8x16_t q1,
429 int thresh) {
430 const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);
431 const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0); // abs(p0-q0)
432 const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1); // abs(p1-q1)
433 const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0); // 2 * abs(p0-q0)
434 const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1); // abs(p1-q1) / 2
435 const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2);
436 const uint8x16_t mask = vcgeq_u8(thresh_v, sum);
437 return mask;
438}
439
440static int8x16_t FlipSign_NEON(const uint8x16_t v) {
441 const uint8x16_t sign_bit = vdupq_n_u8(0x80);
442 return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
443}
444
445static uint8x16_t FlipSignBack_NEON(const int8x16_t v) {
446 const int8x16_t sign_bit = vdupq_n_s8(0x80);
447 return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
448}
449
450static int8x16_t GetBaseDelta_NEON(const int8x16_t p1, const int8x16_t p0,
451 const int8x16_t q0, const int8x16_t q1) {
452 const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0)
453 const int8x16_t p1_q1 = vqsubq_s8(p1, q1); // (p1-q1)
454 const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0); // (p1-q1) + 1 * (q0 - p0)
455 const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // (p1-q1) + 2 * (q0 - p0)
456 const int8x16_t s3 = vqaddq_s8(q0_p0, s2); // (p1-q1) + 3 * (q0 - p0)
457 return s3;
458}
459
460static int8x16_t GetBaseDelta0_NEON(const int8x16_t p0, const int8x16_t q0) {
461 const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0)
462 const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0); // 2 * (q0 - p0)
463 const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // 3 * (q0 - p0)
464 return s2;
465}
466
467//------------------------------------------------------------------------------
468
469static void ApplyFilter2NoFlip_NEON(const int8x16_t p0s, const int8x16_t q0s,
470 const int8x16_t delta,
471 int8x16_t* const op0,
472 int8x16_t* const oq0) {
473 const int8x16_t kCst3 = vdupq_n_s8(0x03);
474 const int8x16_t kCst4 = vdupq_n_s8(0x04);
475 const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
476 const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
477 const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
478 const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
479 *op0 = vqaddq_s8(p0s, delta3);
480 *oq0 = vqsubq_s8(q0s, delta4);
481}
482
483#if defined(WEBP_USE_INTRINSICS)
484
485static void ApplyFilter2_NEON(const int8x16_t p0s, const int8x16_t q0s,
486 const int8x16_t delta,
487 uint8x16_t* const op0, uint8x16_t* const oq0) {
488 const int8x16_t kCst3 = vdupq_n_s8(0x03);
489 const int8x16_t kCst4 = vdupq_n_s8(0x04);
490 const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
491 const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
492 const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
493 const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
494 const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
495 const int8x16_t sq0 = vqsubq_s8(q0s, delta4);
496 *op0 = FlipSignBack_NEON(sp0);
497 *oq0 = FlipSignBack_NEON(sq0);
498}
499
500static void DoFilter2_NEON(const uint8x16_t p1, const uint8x16_t p0,
501 const uint8x16_t q0, const uint8x16_t q1,
502 const uint8x16_t mask,
503 uint8x16_t* const op0, uint8x16_t* const oq0) {
504 const int8x16_t p1s = FlipSign_NEON(p1);
505 const int8x16_t p0s = FlipSign_NEON(p0);
506 const int8x16_t q0s = FlipSign_NEON(q0);
507 const int8x16_t q1s = FlipSign_NEON(q1);
508 const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
509 const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
510 ApplyFilter2_NEON(p0s, q0s, delta1, op0, oq0);
511}
512
513static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
514 uint8x16_t p1, p0, q0, q1, op0, oq0;
515 Load16x4_NEON(p, stride, &p1, &p0, &q0, &q1);
516 {
517 const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
518 DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
519 }
520 Store16x2_NEON(op0, oq0, p, stride);
521}
522
523static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
524 uint8x16_t p1, p0, q0, q1, oq0, op0;
525 Load4x16_NEON(p, stride, &p1, &p0, &q0, &q1);
526 {
527 const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
528 DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
529 }
530 Store2x16_NEON(op0, oq0, p, stride);
531}
532
533#else
534
535// Load/Store vertical edge
// Expands to the assembly text of eight single-lane "vld4.8" loads that fill
// lanes 0..7 of the D registers c1..c4 (4 consecutive bytes per load, one
// byte per register). Even lanes use the address operand 'b1', odd lanes use
// 'b2'; 'stride' is appended as the trailing register operand of every load,
// so each address register advances by that amount after its load.
// All arguments are stringified as-is.
536#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \
537 "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
538 "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
539 "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
540 "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
541 "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
542 "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
543 "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
544 "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"
545
// Expands to the assembly text of eight single-lane "vst2.8" stores writing
// lanes 0..7 of the D registers c1 and c2 (2 consecutive bytes per store)
// through the address operand 'p'; 'stride' is appended as the trailing
// register operand, so the address advances by that amount after each store.
// All arguments are stringified as-is.
546#define STORE8x2(c1, c2, p, stride) \
547 "vst2.8 {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n" \
548 "vst2.8 {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n" \
549 "vst2.8 {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n" \
550 "vst2.8 {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n" \
551 "vst2.8 {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n" \
552 "vst2.8 {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n" \
553 "vst2.8 {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n" \
554 "vst2.8 {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"
555
// List of NEON Q registers, meant to be pasted into the clobber list of the
// inline-assembly filters below (q4-q7 are not part of the list).
556#define QRegs "q0", "q1", "q2", "q3", \
557 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
558
// Assembly text XOR-ing registers 'a' and 'b', in place, with register 's'.
// As used by DO_FILTER2, 's' holds 0x80 in every byte, so this toggles the
// sign bit of each byte.
559#define FLIP_SIGN_BIT2(a, b, s) \
560 "veor " #a "," #a "," #s " \n" \
561 "veor " #b "," #b "," #s " \n" \
562
// Same as FLIP_SIGN_BIT2, applied to the four registers a, b, c and d.
563#define FLIP_SIGN_BIT4(a, b, c, d, s) \
564 FLIP_SIGN_BIT2(a, b, s) \
565 FLIP_SIGN_BIT2(c, d, s) \
566
// Assembly text computing the filter mask into register 'mask': each byte is
// all-ones where 2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= thresh (unsigned
// saturating sums), all-zeros otherwise. 'thresh' is a core register whose
// low byte is broadcast. Uses q14 and q15 as scratch registers.
567#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask) \
568 "vabd.u8 q15," #p0 "," #q0 " \n" /* abs(p0 - q0) */ \
569 "vabd.u8 q14," #p1 "," #q1 " \n" /* abs(p1 - q1) */ \
570 "vqadd.u8 q15, q15, q15 \n" /* abs(p0 - q0) * 2 */ \
571 "vshr.u8 q14, q14, #1 \n" /* abs(p1 - q1) / 2 */ \
572 "vqadd.u8 q15, q15, q14 \n" /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
573 "vdup.8 q14, " #thresh " \n" \
574 "vcge.u8 " #mask ", q14, q15 \n" /* mask <= thresh */
575
// Assembly text computing o = (p1 - q1) + 3 * (q0 - p0) on signed bytes,
// using a saturating subtraction followed by three saturating additions.
// Uses q15 as scratch register (it holds q0 - p0).
576#define GET_BASE_DELTA(p1, p0, q0, q1, o) \
577 "vqsub.s8 q15," #q0 "," #p0 " \n" /* (q0 - p0) */ \
578 "vqsub.s8 " #o "," #p1 "," #q1 " \n" /* (p1 - q1) */ \
579 "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 1 * (q0 - p0) */ \
580 "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 2 * (q0 - p0) */ \
581 "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 3 * (q0 - p0) */
582
// Assembly text applying the filter value 'fl' to the signed registers 'p0'
// and 'q0', in place:
//   p0 = clip(p0 + ((fl + 3) >> 3)),  q0 = clip(q0 - ((fl + 4) >> 3))
// with saturating add/sub and arithmetic shifts. Uses q15 as scratch.
583#define DO_SIMPLE_FILTER(p0, q0, fl) \
584 "vmov.i8 q15, #0x03 \n" \
585 "vqadd.s8 q15, q15, " #fl " \n" /* filter1 = filter + 3 */ \
586 "vshr.s8 q15, q15, #3 \n" /* filter1 >> 3 */ \
587 "vqadd.s8 " #p0 "," #p0 ", q15 \n" /* p0 += filter1 */ \
588 \
589 "vmov.i8 q15, #0x04 \n" \
590 "vqadd.s8 q15, q15, " #fl " \n" /* filter2 = filter + 4 */ \
591 "vshr.s8 q15, q15, #3 \n" /* filter2 >> 3 */ \
592 "vqsub.s8 " #q0 "," #q0 ", q15 \n" /* q0 -= filter2 */
593
594// Applies filter on 2 pixels (p0 and q0)
// Full two-pixel filter as assembly text. On entry p1, p0, q0, q1 name
// registers holding unsigned samples; on exit p0 and q0 hold the filtered
// unsigned samples, while p1 and q1 are left sign-flipped. Besides the
// scratch registers of the macros it expands (q14, q15), it overwrites
// q9 (mask, then masked delta), q10 (0x80 sign bits) and q11 (delta).
595#define DO_FILTER2(p1, p0, q0, q1, thresh) \
596 NEEDS_FILTER(p1, p0, q0, q1, thresh, q9) /* filter mask in q9 */ \
597 "vmov.i8 q10, #0x80 \n" /* sign bit */ \
598 FLIP_SIGN_BIT4(p1, p0, q0, q1, q10) /* convert to signed value */ \
599 GET_BASE_DELTA(p1, p0, q0, q1, q11) /* get filter level */ \
600 "vand q9, q9, q11 \n" /* apply filter mask */ \
601 DO_SIMPLE_FILTER(p0, q0, q9) /* apply filter */ \
602 FLIP_SIGN_BIT2(p0, q0, q10)
603
// Inline-assembly version of the simple filter across the boundary between
// the 16-byte row before 'p' and the row at 'p'. Loads the four rows
// p[-2 * stride .. stride] into q1, q2, q3 and q12, runs DO_FILTER2 on them
// and stores the two middle rows (q2, q3) back.
// 'p' is an in/out operand because the assembly modifies the register.
604static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
605 __asm__ volatile (
606 "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride
607
608 "vld1.u8 {q1}, [%[p]], %[stride] \n" // p1
609 "vld1.u8 {q2}, [%[p]], %[stride] \n" // p0
610 "vld1.u8 {q3}, [%[p]], %[stride] \n" // q0
611 "vld1.u8 {q12}, [%[p]] \n" // q1
612
613 DO_FILTER2(q1, q2, q3, q12, %[thresh])
614
615 "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride
616
617 "vst1.u8 {q2}, [%[p]], %[stride] \n" // store op0
618 "vst1.u8 {q3}, [%[p]] \n" // store oq0
619 : [p] "+r"(p)
620 : [stride] "r"(stride), [thresh] "r"(thresh)
621 : "memory", QRegs
622 );
623}
624
// Inline-assembly version of the simple filter across the boundary between
// the columns p[-1] and p[0], over 16 rows. Two LOAD8x4 expansions gather the
// four columns starting at p - 2 (8 rows each; r4/r5 address alternate rows
// and both advance by 2 * stride), the swaps regroup them into full 16-lane
// registers, DO_FILTER2 filters them, and the two STORE8x2 expansions write
// the two middle columns back starting at p - 1.
// r4, r5 and r6 are used as temporaries and listed as clobbered.
625static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
626 __asm__ volatile (
627 "sub r4, %[p], #2 \n" // base1 = p - 2
628 "lsl r6, %[stride], #1 \n" // r6 = 2 * stride
629 "add r5, r4, %[stride] \n" // base2 = base1 + stride
630
631 LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
632 LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
633 "vswp d3, d24 \n" // p1:q1 p0:q3
634 "vswp d5, d26 \n" // q0:q2 q1:q4
635 "vswp q2, q12 \n" // p1:q1 p0:q2 q0:q3 q1:q4
636
637 DO_FILTER2(q1, q2, q12, q13, %[thresh])
638
639 "sub %[p], %[p], #1 \n" // p - 1
640
641 "vswp d5, d24 \n"
642 STORE8x2(d4, d5, [%[p]], %[stride])
643 STORE8x2(d24, d25, [%[p]], %[stride])
644
645 : [p] "+r"(p)
646 : [stride] "r"(stride), [thresh] "r"(thresh)
647 : "memory", "r4", "r5", "r6", QRegs
648 );
649}
650
651#undef LOAD8x4
652#undef STORE8x2
653
654#endif // WEBP_USE_INTRINSICS
655
656static void SimpleVFilter16i_NEON(uint8_t* p, int stride, int thresh) {
657 uint32_t k;
658 for (k = 3; k != 0; --k) {
659 p += 4 * stride;
660 SimpleVFilter16_NEON(p, stride, thresh);
661 }
662}
663
664static void SimpleHFilter16i_NEON(uint8_t* p, int stride, int thresh) {
665 uint32_t k;
666 for (k = 3; k != 0; --k) {
667 p += 4;
668 SimpleHFilter16_NEON(p, stride, thresh);
669 }
670}
671
672//------------------------------------------------------------------------------
673// Complex In-loop filtering (Paragraph 15.3)
674
675static uint8x16_t NeedsHev_NEON(const uint8x16_t p1, const uint8x16_t p0,
676 const uint8x16_t q0, const uint8x16_t q1,
677 int hev_thresh) {
678 const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);
679 const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0)
680 const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0)
681 const uint8x16_t a_max = vmaxq_u8(a_p1_p0, a_q1_q0);
682 const uint8x16_t mask = vcgtq_u8(a_max, hev_thresh_v);
683 return mask;
684}
685
686static uint8x16_t NeedsFilter2_NEON(const uint8x16_t p3, const uint8x16_t p2,
687 const uint8x16_t p1, const uint8x16_t p0,
688 const uint8x16_t q0, const uint8x16_t q1,
689 const uint8x16_t q2, const uint8x16_t q3,
690 int ithresh, int thresh) {
691 const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);
692 const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2); // abs(p3 - p2)
693 const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1); // abs(p2 - p1)
694 const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0)
695 const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2); // abs(q3 - q2)
696 const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1); // abs(q2 - q1)
697 const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0)
698 const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1);
699 const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2);
700 const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0);
701 const uint8x16_t max12 = vmaxq_u8(max1, max2);
702 const uint8x16_t max123 = vmaxq_u8(max12, max3);
703 const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);
704 const uint8x16_t mask1 = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
705 const uint8x16_t mask = vandq_u8(mask1, mask2);
706 return mask;
707}
708
709// 4-points filter
710
711static void ApplyFilter4_NEON(
712 const int8x16_t p1, const int8x16_t p0,
713 const int8x16_t q0, const int8x16_t q1,
714 const int8x16_t delta0,
715 uint8x16_t* const op1, uint8x16_t* const op0,
716 uint8x16_t* const oq0, uint8x16_t* const oq1) {
717 const int8x16_t kCst3 = vdupq_n_s8(0x03);
718 const int8x16_t kCst4 = vdupq_n_s8(0x04);
719 const int8x16_t delta1 = vqaddq_s8(delta0, kCst4);
720 const int8x16_t delta2 = vqaddq_s8(delta0, kCst3);
721 const int8x16_t a1 = vshrq_n_s8(delta1, 3);
722 const int8x16_t a2 = vshrq_n_s8(delta2, 3);
723 const int8x16_t a3 = vrshrq_n_s8(a1, 1); // a3 = (a1 + 1) >> 1
724 *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a2)); // clip(p0 + a2)
725 *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1)); // clip(q0 - a1)
726 *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a3)); // clip(p1 + a3)
727 *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a3)); // clip(q1 - a3)
728}
729
730static void DoFilter4_NEON(
731 const uint8x16_t p1, const uint8x16_t p0,
732 const uint8x16_t q0, const uint8x16_t q1,
733 const uint8x16_t mask, const uint8x16_t hev_mask,
734 uint8x16_t* const op1, uint8x16_t* const op0,
735 uint8x16_t* const oq0, uint8x16_t* const oq1) {
736 // This is a fused version of DoFilter2() calling ApplyFilter2 directly
737 const int8x16_t p1s = FlipSign_NEON(p1);
738 int8x16_t p0s = FlipSign_NEON(p0);
739 int8x16_t q0s = FlipSign_NEON(q0);
740 const int8x16_t q1s = FlipSign_NEON(q1);
741 const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
742
743 // do_filter2 part (simple loopfilter on pixels with hev)
744 {
745 const int8x16_t delta = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
746 const int8x16_t simple_lf_delta =
747 vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
748 ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
749 }
750
751 // do_filter4 part (complex loopfilter on pixels without hev)
752 {
753 const int8x16_t delta0 = GetBaseDelta0_NEON(p0s, q0s);
754 // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
755 const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
756 const int8x16_t complex_lf_delta =
757 vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
758 ApplyFilter4_NEON(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
759 }
760}
761
762// 6-points filter
763
764static void ApplyFilter6_NEON(
765 const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
766 const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
767 const int8x16_t delta,
768 uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
769 uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
770 // We have to compute: X = (9*a+63) >> 7, Y = (18*a+63)>>7, Z = (27*a+63) >> 7
771 // Turns out, there's a common sub-expression S=9 * a - 1 that can be used
772 // with the special vqrshrn_n_s16 rounding-shift-and-narrow instruction:
773 // X = (S + 64) >> 7, Y = (S + 32) >> 6, Z = (18 * a + S + 64) >> 7
774 const int8x8_t delta_lo = vget_low_s8(delta);
775 const int8x8_t delta_hi = vget_high_s8(delta);
776 const int8x8_t kCst9 = vdup_n_s8(9);
777 const int16x8_t kCstm1 = vdupq_n_s16(-1);
778 const int8x8_t kCst18 = vdup_n_s8(18);
779 const int16x8_t S_lo = vmlal_s8(kCstm1, kCst9, delta_lo); // S = 9 * a - 1
780 const int16x8_t S_hi = vmlal_s8(kCstm1, kCst9, delta_hi);
781 const int16x8_t Z_lo = vmlal_s8(S_lo, kCst18, delta_lo); // S + 18 * a
782 const int16x8_t Z_hi = vmlal_s8(S_hi, kCst18, delta_hi);
783 const int8x8_t a3_lo = vqrshrn_n_s16(S_lo, 7); // (9 * a + 63) >> 7
784 const int8x8_t a3_hi = vqrshrn_n_s16(S_hi, 7);
785 const int8x8_t a2_lo = vqrshrn_n_s16(S_lo, 6); // (9 * a + 31) >> 6
786 const int8x8_t a2_hi = vqrshrn_n_s16(S_hi, 6);
787 const int8x8_t a1_lo = vqrshrn_n_s16(Z_lo, 7); // (27 * a + 63) >> 7
788 const int8x8_t a1_hi = vqrshrn_n_s16(Z_hi, 7);
789 const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
790 const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
791 const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);
792
793 *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a1)); // clip(p0 + a1)
794 *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1)); // clip(q0 - q1)
795 *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a2)); // clip(q1 - a2)
796 *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a2)); // clip(p1 + a2)
797 *oq2 = FlipSignBack_NEON(vqsubq_s8(q2, a3)); // clip(q2 - a3)
798 *op2 = FlipSignBack_NEON(vqaddq_s8(p2, a3)); // clip(p2 + a3)
799}
800
801static void DoFilter6_NEON(
802 const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
803 const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
804 const uint8x16_t mask, const uint8x16_t hev_mask,
805 uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
806 uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
807 // This is a fused version of DoFilter2() calling ApplyFilter2 directly
808 const int8x16_t p2s = FlipSign_NEON(p2);
809 const int8x16_t p1s = FlipSign_NEON(p1);
810 int8x16_t p0s = FlipSign_NEON(p0);
811 int8x16_t q0s = FlipSign_NEON(q0);
812 const int8x16_t q1s = FlipSign_NEON(q1);
813 const int8x16_t q2s = FlipSign_NEON(q2);
814 const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
815 const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
816
817 // do_filter2 part (simple loopfilter on pixels with hev)
818 {
819 const int8x16_t simple_lf_delta =
820 vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
821 ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
822 }
823
824 // do_filter6 part (complex loopfilter on pixels without hev)
825 {
826 // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
827 const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
828 const int8x16_t complex_lf_delta =
829 vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
830 ApplyFilter6_NEON(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
831 op2, op1, op0, oq0, oq1, oq2);
832 }
833}
834
835// on macroblock edges
836
837static void VFilter16_NEON(uint8_t* p, int stride,
838 int thresh, int ithresh, int hev_thresh) {
839 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
840 Load16x8_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
841 {
842 const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
843 ithresh, thresh);
844 const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
845 uint8x16_t op2, op1, op0, oq0, oq1, oq2;
846 DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
847 &op2, &op1, &op0, &oq0, &oq1, &oq2);
848 Store16x2_NEON(op2, op1, p - 2 * stride, stride);
849 Store16x2_NEON(op0, oq0, p + 0 * stride, stride);
850 Store16x2_NEON(oq1, oq2, p + 2 * stride, stride);
851 }
852}
853
854static void HFilter16_NEON(uint8_t* p, int stride,
855 int thresh, int ithresh, int hev_thresh) {
856 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
857 Load8x16_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
858 {
859 const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
860 ithresh, thresh);
861 const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
862 uint8x16_t op2, op1, op0, oq0, oq1, oq2;
863 DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
864 &op2, &op1, &op0, &oq0, &oq1, &oq2);
865 Store2x16_NEON(op2, op1, p - 2, stride);
866 Store2x16_NEON(op0, oq0, p + 0, stride);
867 Store2x16_NEON(oq1, oq2, p + 2, stride);
868 }
869}
870
871// on three inner edges
872static void VFilter16i_NEON(uint8_t* p, int stride,
873 int thresh, int ithresh, int hev_thresh) {
874 uint32_t k;
875 uint8x16_t p3, p2, p1, p0;
876 Load16x4_NEON(p + 2 * stride, stride, &p3, &p2, &p1, &p0);
877 for (k = 3; k != 0; --k) {
878 uint8x16_t q0, q1, q2, q3;
879 p += 4 * stride;
880 Load16x4_NEON(p + 2 * stride, stride, &q0, &q1, &q2, &q3);
881 {
882 const uint8x16_t mask =
883 NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
884 const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
885 // p3 and p2 are not just temporary variables here: they will be
886 // re-used for next span. And q2/q3 will become p1/p0 accordingly.
887 DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
888 Store16x4_NEON(p1, p0, p3, p2, p, stride);
889 p1 = q2;
890 p0 = q3;
891 }
892 }
893}
894
895#if !defined(WORK_AROUND_GCC)
896static void HFilter16i_NEON(uint8_t* p, int stride,
897 int thresh, int ithresh, int hev_thresh) {
898 uint32_t k;
899 uint8x16_t p3, p2, p1, p0;
900 Load4x16_NEON(p + 2, stride, &p3, &p2, &p1, &p0);
901 for (k = 3; k != 0; --k) {
902 uint8x16_t q0, q1, q2, q3;
903 p += 4;
904 Load4x16_NEON(p + 2, stride, &q0, &q1, &q2, &q3);
905 {
906 const uint8x16_t mask =
907 NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
908 const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
909 DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
910 Store4x16_NEON(p1, p0, p3, p2, p, stride);
911 p1 = q2;
912 p0 = q3;
913 }
914 }
915}
916#endif // !WORK_AROUND_GCC
917
918// 8-pixels wide variant, for chroma filtering
919static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
920 int thresh, int ithresh, int hev_thresh) {
921 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
922 Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
923 {
924 const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
925 ithresh, thresh);
926 const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
927 uint8x16_t op2, op1, op0, oq0, oq1, oq2;
928 DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
929 &op2, &op1, &op0, &oq0, &oq1, &oq2);
930 Store8x2x2_NEON(op2, op1, u - 2 * stride, v - 2 * stride, stride);
931 Store8x2x2_NEON(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
932 Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
933 }
934}
935static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
936 int thresh, int ithresh, int hev_thresh) {
937 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
938 u += 4 * stride;
939 v += 4 * stride;
940 Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
941 {
942 const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
943 ithresh, thresh);
944 const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
945 uint8x16_t op1, op0, oq0, oq1;
946 DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
947 Store8x4x2_NEON(op1, op0, oq0, oq1, u, v, stride);
948 }
949}
950
951#if !defined(WORK_AROUND_GCC)
952static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
953 int thresh, int ithresh, int hev_thresh) {
954 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
955 Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
956 {
957 const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
958 ithresh, thresh);
959 const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
960 uint8x16_t op2, op1, op0, oq0, oq1, oq2;
961 DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
962 &op2, &op1, &op0, &oq0, &oq1, &oq2);
963 Store6x8x2_NEON(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
964 }
965}
966
967static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
968 int thresh, int ithresh, int hev_thresh) {
969 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
970 u += 4;
971 v += 4;
972 Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
973 {
974 const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
975 ithresh, thresh);
976 const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
977 uint8x16_t op1, op0, oq0, oq1;
978 DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
979 Store4x8x2_NEON(op1, op0, oq0, oq1, u, v, stride);
980 }
981}
982#endif // !WORK_AROUND_GCC
983
984//-----------------------------------------------------------------------------
985// Inverse transforms (Paragraph 14.4)
986
987// Technically these are unsigned but vqdmulh is only available in signed.
988// vqdmulh returns high half (effectively >> 16) but also doubles the value,
989// changing the >> 16 to >> 15 and requiring an additional >> 1.
990// We use this to our advantage with kC2. The canonical value is 35468.
991// However, the high bit is set so treating it as signed will give incorrect
992// results. We avoid this by down shifting by 1 here to clear the highest bit.
993// Combined with the doubling effect of vqdmulh we get >> 16.
994// This can not be applied to kC1 because the lowest bit is set. Down shifting
995// the constant would reduce precision.
996
997// libwebp uses a trick to avoid some extra addition that libvpx does.
998// Instead of:
999// temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
1000// libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
1001// same issue with kC1 and vqdmulh that we work around by down shifting kC2
1002
// kC1 holds only the fractional part (20091); the "+ x" that the C code gets
// from adding 1 << 16 to the constant is re-added explicitly after the
// multiplication.
static const int16_t kC1 = 20091;
// kC2 is stored pre-halved (17734 = 35468 / 2) so that it fits in a signed
// 16-bit lane; the doubling performed by vqdmulh restores the full scale.
static const int16_t kC2 = 17734;
1005
1006#if defined(WEBP_USE_INTRINSICS)
1007static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
1008 const int16x8_t in1,
1009 int16x8x2_t* const out) {
1010 // a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1
1011 // c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3
1012 const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ...
1013 // b0 d0 b1 d1 b2 d2 ...
1014 *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
1015}
1016
1017static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
1018 // {rows} = in0 | in4
1019 // in8 | in12
1020 // B1 = in4 | in12
1021 const int16x8_t B1 =
1022 vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
1023 // C0 = kC1 * in4 | kC1 * in12
1024 // C1 = kC2 * in4 | kC2 * in12
1025 const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
1026 const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
1027 const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
1028 vget_low_s16(rows->val[1])); // in0 + in8
1029 const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
1030 vget_low_s16(rows->val[1])); // in0 - in8
1031 // c = kC2 * in4 - kC1 * in12
1032 // d = kC1 * in4 + kC2 * in12
1033 const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
1034 const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
1035 const int16x8_t D0 = vcombine_s16(a, b); // D0 = a | b
1036 const int16x8_t D1 = vcombine_s16(d, c); // D1 = d | c
1037 const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c
1038 const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c
1039 const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
1040 Transpose8x2_NEON(E0, E1, rows);
1041}
1042
1043static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
1044 int16x8x2_t rows;
1045 INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
1046 TransformPass_NEON(&rows);
1047 TransformPass_NEON(&rows);
1048 Add4x4_NEON(rows.val[0], rows.val[1], dst);
1049}
1050
1051#else
1052
1053static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
1054 const int kBPS = BPS;
1055 // kC1, kC2. Padded because vld1.16 loads 8 bytes
1056 const int16_t constants[4] = { kC1, kC2, 0, 0 };
1057 /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
1058 __asm__ volatile (
1059 "vld1.16 {q1, q2}, [%[in]] \n"
1060 "vld1.16 {d0}, [%[constants]] \n"
1061
1062 /* d2: in[0]
1063 * d3: in[8]
1064 * d4: in[4]
1065 * d5: in[12]
1066 */
1067 "vswp d3, d4 \n"
1068
1069 /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
1070 * q9 = {in[4], in[12]} * kC2 >> 16
1071 */
1072 "vqdmulh.s16 q8, q2, d0[0] \n"
1073 "vqdmulh.s16 q9, q2, d0[1] \n"
1074
1075 /* d22 = a = in[0] + in[8]
1076 * d23 = b = in[0] - in[8]
1077 */
1078 "vqadd.s16 d22, d2, d3 \n"
1079 "vqsub.s16 d23, d2, d3 \n"
1080
1081 /* The multiplication should be x * kC1 >> 16
1082 * However, with vqdmulh we get x * kC1 * 2 >> 16
1083 * (multiply, double, return high half)
1084 * We avoided this in kC2 by pre-shifting the constant.
1085 * q8 = in[4]/[12] * kC1 >> 16
1086 */
1087 "vshr.s16 q8, q8, #1 \n"
1088
1089 /* Add {in[4], in[12]} back after the multiplication. This is handled by
1090 * adding 1 << 16 to kC1 in the libwebp C code.
1091 */
1092 "vqadd.s16 q8, q2, q8 \n"
1093
1094 /* d20 = c = in[4]*kC2 - in[12]*kC1
1095 * d21 = d = in[4]*kC1 + in[12]*kC2
1096 */
1097 "vqsub.s16 d20, d18, d17 \n"
1098 "vqadd.s16 d21, d19, d16 \n"
1099
1100 /* d2 = tmp[0] = a + d
1101 * d3 = tmp[1] = b + c
1102 * d4 = tmp[2] = b - c
1103 * d5 = tmp[3] = a - d
1104 */
1105 "vqadd.s16 d2, d22, d21 \n"
1106 "vqadd.s16 d3, d23, d20 \n"
1107 "vqsub.s16 d4, d23, d20 \n"
1108 "vqsub.s16 d5, d22, d21 \n"
1109
1110 "vzip.16 q1, q2 \n"
1111 "vzip.16 q1, q2 \n"
1112
1113 "vswp d3, d4 \n"
1114
1115 /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
1116 * q9 = {tmp[4], tmp[12]} * kC2 >> 16
1117 */
1118 "vqdmulh.s16 q8, q2, d0[0] \n"
1119 "vqdmulh.s16 q9, q2, d0[1] \n"
1120
1121 /* d22 = a = tmp[0] + tmp[8]
1122 * d23 = b = tmp[0] - tmp[8]
1123 */
1124 "vqadd.s16 d22, d2, d3 \n"
1125 "vqsub.s16 d23, d2, d3 \n"
1126
1127 /* See long winded explanations prior */
1128 "vshr.s16 q8, q8, #1 \n"
1129 "vqadd.s16 q8, q2, q8 \n"
1130
1131 /* d20 = c = in[4]*kC2 - in[12]*kC1
1132 * d21 = d = in[4]*kC1 + in[12]*kC2
1133 */
1134 "vqsub.s16 d20, d18, d17 \n"
1135 "vqadd.s16 d21, d19, d16 \n"
1136
1137 /* d2 = tmp[0] = a + d
1138 * d3 = tmp[1] = b + c
1139 * d4 = tmp[2] = b - c
1140 * d5 = tmp[3] = a - d
1141 */
1142 "vqadd.s16 d2, d22, d21 \n"
1143 "vqadd.s16 d3, d23, d20 \n"
1144 "vqsub.s16 d4, d23, d20 \n"
1145 "vqsub.s16 d5, d22, d21 \n"
1146
1147 "vld1.32 d6[0], [%[dst]], %[kBPS] \n"
1148 "vld1.32 d6[1], [%[dst]], %[kBPS] \n"
1149 "vld1.32 d7[0], [%[dst]], %[kBPS] \n"
1150 "vld1.32 d7[1], [%[dst]], %[kBPS] \n"
1151
1152 "sub %[dst], %[dst], %[kBPS], lsl #2 \n"
1153
1154 /* (val) + 4 >> 3 */
1155 "vrshr.s16 d2, d2, #3 \n"
1156 "vrshr.s16 d3, d3, #3 \n"
1157 "vrshr.s16 d4, d4, #3 \n"
1158 "vrshr.s16 d5, d5, #3 \n"
1159
1160 "vzip.16 q1, q2 \n"
1161 "vzip.16 q1, q2 \n"
1162
1163 /* Must accumulate before saturating */
1164 "vmovl.u8 q8, d6 \n"
1165 "vmovl.u8 q9, d7 \n"
1166
1167 "vqadd.s16 q1, q1, q8 \n"
1168 "vqadd.s16 q2, q2, q9 \n"
1169
1170 "vqmovun.s16 d0, q1 \n"
1171 "vqmovun.s16 d1, q2 \n"
1172
1173 "vst1.32 d0[0], [%[dst]], %[kBPS] \n"
1174 "vst1.32 d0[1], [%[dst]], %[kBPS] \n"
1175 "vst1.32 d1[0], [%[dst]], %[kBPS] \n"
1176 "vst1.32 d1[1], [%[dst]] \n"
1177
1178 : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */
1179 : [kBPS] "r"(kBPS), [constants] "r"(constants) /* constants */
1180 : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" /* clobbered */
1181 );
1182}
1183
1184#endif // WEBP_USE_INTRINSICS
1185
// Transforms one 4x4 block, or two side-by-side blocks when 'do_two' is
// non-zero. The second block reads 16 coefficients further in 'in' and
// writes 4 pixels further in 'dst'.
static void TransformTwo_NEON(const int16_t* in, uint8_t* dst, int do_two) {
  const int num_blocks = do_two ? 2 : 1;
  int n;
  for (n = 0; n < num_blocks; ++n) {
    TransformOne_NEON(in + 16 * n, dst + 4 * n);
  }
}
1192
1193static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
1194 const int16x8_t DC = vdupq_n_s16(in[0]);
1195 Add4x4_NEON(DC, DC, dst);
1196}
1197
1198//------------------------------------------------------------------------------
1199
// Writes lane 'col' of rows.val[0..3] to dst[0], dst[16], dst[32] and
// dst[48], leaving 'dst' advanced by 64 elements.
// 'dst' must be a modifiable pointer lvalue and 'col' a constant lane index.
// All arguments are parenthesized so that compound expressions expand safely.
#define STORE_WHT(dst, col, rows) do {                        \
  *(dst) = vgetq_lane_s32((rows).val[0], (col)); (dst) += 16; \
  *(dst) = vgetq_lane_s32((rows).val[1], (col)); (dst) += 16; \
  *(dst) = vgetq_lane_s32((rows).val[2], (col)); (dst) += 16; \
  *(dst) = vgetq_lane_s32((rows).val[3], (col)); (dst) += 16; \
} while (0)
1206
1207static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
1208 int32x4x4_t tmp;
1209
1210 {
1211 // Load the source.
1212 const int16x4_t in00_03 = vld1_s16(in + 0);
1213 const int16x4_t in04_07 = vld1_s16(in + 4);
1214 const int16x4_t in08_11 = vld1_s16(in + 8);
1215 const int16x4_t in12_15 = vld1_s16(in + 12);
1216 const int32x4_t a0 = vaddl_s16(in00_03, in12_15); // in[0..3] + in[12..15]
1217 const int32x4_t a1 = vaddl_s16(in04_07, in08_11); // in[4..7] + in[8..11]
1218 const int32x4_t a2 = vsubl_s16(in04_07, in08_11); // in[4..7] - in[8..11]
1219 const int32x4_t a3 = vsubl_s16(in00_03, in12_15); // in[0..3] - in[12..15]
1220 tmp.val[0] = vaddq_s32(a0, a1);
1221 tmp.val[1] = vaddq_s32(a3, a2);
1222 tmp.val[2] = vsubq_s32(a0, a1);
1223 tmp.val[3] = vsubq_s32(a3, a2);
1224 // Arrange the temporary results column-wise.
1225 tmp = Transpose4x4_NEON(tmp);
1226 }
1227
1228 {
1229 const int32x4_t kCst3 = vdupq_n_s32(3);
1230 const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3); // add rounder
1231 const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
1232 const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
1233 const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
1234 const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);
1235
1236 tmp.val[0] = vaddq_s32(a0, a1);
1237 tmp.val[1] = vaddq_s32(a3, a2);
1238 tmp.val[2] = vsubq_s32(a0, a1);
1239 tmp.val[3] = vsubq_s32(a3, a2);
1240
1241 // right shift the results by 3.
1242 tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
1243 tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
1244 tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
1245 tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);
1246
1247 STORE_WHT(out, 0, tmp);
1248 STORE_WHT(out, 1, tmp);
1249 STORE_WHT(out, 2, tmp);
1250 STORE_WHT(out, 3, tmp);
1251 }
1252}
1253
1254#undef STORE_WHT
1255
1256//------------------------------------------------------------------------------
1257
1258#define MUL(a, b) (((a) * (b)) >> 16)
1259static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) {
1260 static const int kC1_full = 20091 + (1 << 16);
1261 static const int kC2_full = 35468;
1262 const int16x4_t A = vld1_dup_s16(in);
1263 const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full));
1264 const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full));
1265 const int c1 = MUL(in[1], kC2_full);
1266 const int d1 = MUL(in[1], kC1_full);
1267 const uint64_t cd = (uint64_t)( d1 & 0xffff) << 0 |
1268 (uint64_t)( c1 & 0xffff) << 16 |
1269 (uint64_t)(-c1 & 0xffff) << 32 |
1270 (uint64_t)(-d1 & 0xffff) << 48;
1271 const int16x4_t CD = vcreate_s16(cd);
1272 const int16x4_t B = vqadd_s16(A, CD);
1273 const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));
1274 const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
1275 Add4x4_NEON(m0_m1, m2_m3, dst);
1276}
1277#undef MUL
1278
1279//------------------------------------------------------------------------------
1280// 4x4
1281
1282static void DC4_NEON(uint8_t* dst) { // DC
1283 const uint8x8_t A = vld1_u8(dst - BPS); // top row
1284 const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
1285 const uint16x4_t p1 = vpadd_u16(p0, p0);
1286 const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
1287 const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
1288 const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
1289 const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
1290 const uint16x8_t s0 = vaddl_u8(L0, L1);
1291 const uint16x8_t s1 = vaddl_u8(L2, L3);
1292 const uint16x8_t s01 = vaddq_u16(s0, s1);
1293 const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
1294 const uint8x8_t dc0 = vrshrn_n_u16(sum, 3); // (sum + 4) >> 3
1295 const uint8x8_t dc = vdup_lane_u8(dc0, 0);
1296 int i;
1297 for (i = 0; i < 4; ++i) {
1298 vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0);
1299 }
1300}
1301
1302// TrueMotion (4x4 + 8x8)
1303static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) {
1304 const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]'
1305 const uint8x8_t T = vld1_u8(dst - BPS); // top row 'A[0..3]'
1306 const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL)); // A[c] - A[-1]
1307 int y;
1308 for (y = 0; y < size; y += 4) {
1309 // left edge
1310 const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
1311 const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
1312 const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
1313 const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
1314 const int16x8_t r0 = vaddq_s16(L0, d); // L[r] + A[c] - A[-1]
1315 const int16x8_t r1 = vaddq_s16(L1, d);
1316 const int16x8_t r2 = vaddq_s16(L2, d);
1317 const int16x8_t r3 = vaddq_s16(L3, d);
1318 // Saturate and store the result.
1319 const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
1320 const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
1321 const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
1322 const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
1323 if (size == 4) {
1324 vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
1325 vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
1326 vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
1327 vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
1328 } else {
1329 vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);
1330 vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);
1331 vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);
1332 vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);
1333 }
1334 dst += 4 * BPS;
1335 }
1336}
1337
// 4x4 TrueMotion predictor.
static void TM4_NEON(uint8_t* dst) {
  TrueMotion_NEON(dst, 4);
}
1339
1340static void VE4_NEON(uint8_t* dst) { // vertical
1341 // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
1342 const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1)); // top row
1343 const uint64x1_t A1 = vshr_n_u64(A0, 8);
1344 const uint64x1_t A2 = vshr_n_u64(A0, 16);
1345 const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
1346 const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
1347 const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
1348 const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00);
1349 const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0);
1350 int i;
1351 for (i = 0; i < 4; ++i) {
1352 vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0);
1353 }
1354}
1355
1356static void RD4_NEON(uint8_t* dst) { // Down-right
1357 const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
1358 const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
1359 const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
1360 const uint32_t I = dst[-1 + 0 * BPS];
1361 const uint32_t J = dst[-1 + 1 * BPS];
1362 const uint32_t K = dst[-1 + 2 * BPS];
1363 const uint32_t L = dst[-1 + 3 * BPS];
1364 const uint64x1_t LKJI____ =
1365 vcreate_u64((uint64_t)L | (K << 8) | (J << 16) | (I << 24));
1366 const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
1367 const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
1368 const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
1369 const uint8_t D = vget_lane_u8(XABCD_u8, 4);
1370 const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
1371 const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
1372 const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
1373 const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
1374 const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
1375 const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
1376 const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
1377 const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
1378 const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
1379 vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
1380 vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
1381 vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
1382 vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
1383}
1384
1385static void LD4_NEON(uint8_t* dst) { // Down-left
1386 // Note using the same shift trick as VE4() is slower here.
1387 const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
1388 const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
1389 const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2);
1390 const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6);
1391 const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0);
1392 const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
1393 const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
1394 const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
1395 const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
1396 const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
1397 const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
1398 vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
1399 vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
1400 vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
1401 vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
1402}
1403
1404//------------------------------------------------------------------------------
1405// Chroma
1406
1407static void VE8uv_NEON(uint8_t* dst) { // vertical
1408 const uint8x8_t top = vld1_u8(dst - BPS);
1409 int j;
1410 for (j = 0; j < 8; ++j) {
1411 vst1_u8(dst + j * BPS, top);
1412 }
1413}
1414
1415static void HE8uv_NEON(uint8_t* dst) { // horizontal
1416 int j;
1417 for (j = 0; j < 8; ++j) {
1418 const uint8x8_t left = vld1_dup_u8(dst - 1);
1419 vst1_u8(dst, left);
1420 dst += BPS;
1421 }
1422}
1423
1424static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
1425 uint16x8_t sum_top;
1426 uint16x8_t sum_left;
1427 uint8x8_t dc0;
1428
1429 if (do_top) {
1430 const uint8x8_t A = vld1_u8(dst - BPS); // top row
1431#if WEBP_AARCH64
1432 const uint16_t p2 = vaddlv_u8(A);
1433 sum_top = vdupq_n_u16(p2);
1434#else
1435 const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
1436 const uint16x4_t p1 = vpadd_u16(p0, p0);
1437 const uint16x4_t p2 = vpadd_u16(p1, p1);
1438 sum_top = vcombine_u16(p2, p2);
1439#endif
1440 }
1441
1442 if (do_left) {
1443 const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
1444 const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
1445 const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
1446 const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
1447 const uint8x8_t L4 = vld1_u8(dst + 4 * BPS - 1);
1448 const uint8x8_t L5 = vld1_u8(dst + 5 * BPS - 1);
1449 const uint8x8_t L6 = vld1_u8(dst + 6 * BPS - 1);
1450 const uint8x8_t L7 = vld1_u8(dst + 7 * BPS - 1);
1451 const uint16x8_t s0 = vaddl_u8(L0, L1);
1452 const uint16x8_t s1 = vaddl_u8(L2, L3);
1453 const uint16x8_t s2 = vaddl_u8(L4, L5);
1454 const uint16x8_t s3 = vaddl_u8(L6, L7);
1455 const uint16x8_t s01 = vaddq_u16(s0, s1);
1456 const uint16x8_t s23 = vaddq_u16(s2, s3);
1457 sum_left = vaddq_u16(s01, s23);
1458 }
1459
1460 if (do_top && do_left) {
1461 const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
1462 dc0 = vrshrn_n_u16(sum, 4);
1463 } else if (do_top) {
1464 dc0 = vrshrn_n_u16(sum_top, 3);
1465 } else if (do_left) {
1466 dc0 = vrshrn_n_u16(sum_left, 3);
1467 } else {
1468 dc0 = vdup_n_u8(0x80);
1469 }
1470
1471 {
1472 const uint8x8_t dc = vdup_lane_u8(dc0, 0);
1473 int i;
1474 for (i = 0; i < 8; ++i) {
1475 vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));
1476 }
1477 }
1478}
1479
// Chroma DC predictors: DC8_NEON() with the (do_top, do_left) combination
// indicated by each name.
static void DC8uv_NEON(uint8_t* dst) {
  DC8_NEON(dst, 1, 1);   // top and left
}
static void DC8uvNoTop_NEON(uint8_t* dst) {
  DC8_NEON(dst, 0, 1);   // left only
}
static void DC8uvNoLeft_NEON(uint8_t* dst) {
  DC8_NEON(dst, 1, 0);   // top only
}
static void DC8uvNoTopLeft_NEON(uint8_t* dst) {
  DC8_NEON(dst, 0, 0);   // neither
}
1484
// 8x8 chroma TrueMotion predictor.
static void TM8uv_NEON(uint8_t* dst) {
  TrueMotion_NEON(dst, 8);
}
1486
1487//------------------------------------------------------------------------------
1488// 16x16
1489
1490static void VE16_NEON(uint8_t* dst) { // vertical
1491 const uint8x16_t top = vld1q_u8(dst - BPS);
1492 int j;
1493 for (j = 0; j < 16; ++j) {
1494 vst1q_u8(dst + j * BPS, top);
1495 }
1496}
1497
1498static void HE16_NEON(uint8_t* dst) { // horizontal
1499 int j;
1500 for (j = 0; j < 16; ++j) {
1501 const uint8x16_t left = vld1q_dup_u8(dst - 1);
1502 vst1q_u8(dst, left);
1503 dst += BPS;
1504 }
1505}
1506
1507static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
1508 uint16x8_t sum_top;
1509 uint16x8_t sum_left;
1510 uint8x8_t dc0;
1511
1512 if (do_top) {
1513 const uint8x16_t A = vld1q_u8(dst - BPS); // top row
1514#if WEBP_AARCH64
1515 const uint16_t p3 = vaddlvq_u8(A);
1516 sum_top = vdupq_n_u16(p3);
1517#else
1518 const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top
1519 const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
1520 const uint16x4_t p2 = vpadd_u16(p1, p1);
1521 const uint16x4_t p3 = vpadd_u16(p2, p2);
1522 sum_top = vcombine_u16(p3, p3);
1523#endif
1524 }
1525
1526 if (do_left) {
1527 int i;
1528 sum_left = vdupq_n_u16(0);
1529 for (i = 0; i < 16; i += 8) {
1530 const uint8x8_t L0 = vld1_u8(dst + (i + 0) * BPS - 1);
1531 const uint8x8_t L1 = vld1_u8(dst + (i + 1) * BPS - 1);
1532 const uint8x8_t L2 = vld1_u8(dst + (i + 2) * BPS - 1);
1533 const uint8x8_t L3 = vld1_u8(dst + (i + 3) * BPS - 1);
1534 const uint8x8_t L4 = vld1_u8(dst + (i + 4) * BPS - 1);
1535 const uint8x8_t L5 = vld1_u8(dst + (i + 5) * BPS - 1);
1536 const uint8x8_t L6 = vld1_u8(dst + (i + 6) * BPS - 1);
1537 const uint8x8_t L7 = vld1_u8(dst + (i + 7) * BPS - 1);
1538 const uint16x8_t s0 = vaddl_u8(L0, L1);
1539 const uint16x8_t s1 = vaddl_u8(L2, L3);
1540 const uint16x8_t s2 = vaddl_u8(L4, L5);
1541 const uint16x8_t s3 = vaddl_u8(L6, L7);
1542 const uint16x8_t s01 = vaddq_u16(s0, s1);
1543 const uint16x8_t s23 = vaddq_u16(s2, s3);
1544 const uint16x8_t sum = vaddq_u16(s01, s23);
1545 sum_left = vaddq_u16(sum_left, sum);
1546 }
1547 }
1548
1549 if (do_top && do_left) {
1550 const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
1551 dc0 = vrshrn_n_u16(sum, 5);
1552 } else if (do_top) {
1553 dc0 = vrshrn_n_u16(sum_top, 4);
1554 } else if (do_left) {
1555 dc0 = vrshrn_n_u16(sum_left, 4);
1556 } else {
1557 dc0 = vdup_n_u8(0x80);
1558 }
1559
1560 {
1561 const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
1562 int i;
1563 for (i = 0; i < 16; ++i) {
1564 vst1q_u8(dst + i * BPS, dc);
1565 }
1566 }
1567}
1568
// Luma 16x16 DC predictors: DC16_NEON() with the (do_top, do_left)
// combination indicated by each name.
static void DC16TopLeft_NEON(uint8_t* dst) {
  DC16_NEON(dst, 1, 1);   // top and left
}
static void DC16NoTop_NEON(uint8_t* dst) {
  DC16_NEON(dst, 0, 1);   // left only
}
static void DC16NoLeft_NEON(uint8_t* dst) {
  DC16_NEON(dst, 1, 0);   // top only
}
static void DC16NoTopLeft_NEON(uint8_t* dst) {
  DC16_NEON(dst, 0, 0);   // neither
}
1573
1574static void TM16_NEON(uint8_t* dst) {
1575 const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]'
1576 const uint8x16_t T = vld1q_u8(dst - BPS); // top row 'A[0..15]'
1577 // A[c] - A[-1]
1578 const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL));
1579 const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL));
1580 int y;
1581 for (y = 0; y < 16; y += 4) {
1582 // left edge
1583 const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
1584 const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
1585 const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
1586 const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
1587 const int16x8_t r0_lo = vaddq_s16(L0, d_lo); // L[r] + A[c] - A[-1]
1588 const int16x8_t r1_lo = vaddq_s16(L1, d_lo);
1589 const int16x8_t r2_lo = vaddq_s16(L2, d_lo);
1590 const int16x8_t r3_lo = vaddq_s16(L3, d_lo);
1591 const int16x8_t r0_hi = vaddq_s16(L0, d_hi);
1592 const int16x8_t r1_hi = vaddq_s16(L1, d_hi);
1593 const int16x8_t r2_hi = vaddq_s16(L2, d_hi);
1594 const int16x8_t r3_hi = vaddq_s16(L3, d_hi);
1595 // Saturate and store the result.
1596 const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
1597 const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
1598 const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi));
1599 const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi));
1600 vst1q_u8(dst + 0 * BPS, row0);
1601 vst1q_u8(dst + 1 * BPS, row1);
1602 vst1q_u8(dst + 2 * BPS, row2);
1603 vst1q_u8(dst + 3 * BPS, row3);
1604 dst += 4 * BPS;
1605 }
1606}
1607
1608//------------------------------------------------------------------------------
1609// Entry point
1610
1611extern void VP8DspInitNEON(void);
1612
1613WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
1614 VP8Transform = TransformTwo_NEON;
1615 VP8TransformAC3 = TransformAC3_NEON;
1616 VP8TransformDC = TransformDC_NEON;
1617 VP8TransformWHT = TransformWHT_NEON;
1618
1619 VP8VFilter16 = VFilter16_NEON;
1620 VP8VFilter16i = VFilter16i_NEON;
1621 VP8HFilter16 = HFilter16_NEON;
1622#if !defined(WORK_AROUND_GCC)
1623 VP8HFilter16i = HFilter16i_NEON;
1624#endif
1625 VP8VFilter8 = VFilter8_NEON;
1626 VP8VFilter8i = VFilter8i_NEON;
1627#if !defined(WORK_AROUND_GCC)
1628 VP8HFilter8 = HFilter8_NEON;
1629 VP8HFilter8i = HFilter8i_NEON;
1630#endif
1631 VP8SimpleVFilter16 = SimpleVFilter16_NEON;
1632 VP8SimpleHFilter16 = SimpleHFilter16_NEON;
1633 VP8SimpleVFilter16i = SimpleVFilter16i_NEON;
1634 VP8SimpleHFilter16i = SimpleHFilter16i_NEON;
1635
1636 VP8PredLuma4[0] = DC4_NEON;
1637 VP8PredLuma4[1] = TM4_NEON;
1638 VP8PredLuma4[2] = VE4_NEON;
1639 VP8PredLuma4[4] = RD4_NEON;
1640 VP8PredLuma4[6] = LD4_NEON;
1641
1642 VP8PredLuma16[0] = DC16TopLeft_NEON;
1643 VP8PredLuma16[1] = TM16_NEON;
1644 VP8PredLuma16[2] = VE16_NEON;
1645 VP8PredLuma16[3] = HE16_NEON;
1646 VP8PredLuma16[4] = DC16NoTop_NEON;
1647 VP8PredLuma16[5] = DC16NoLeft_NEON;
1648 VP8PredLuma16[6] = DC16NoTopLeft_NEON;
1649
1650 VP8PredChroma8[0] = DC8uv_NEON;
1651 VP8PredChroma8[1] = TM8uv_NEON;
1652 VP8PredChroma8[2] = VE8uv_NEON;
1653 VP8PredChroma8[3] = HE8uv_NEON;
1654 VP8PredChroma8[4] = DC8uvNoTop_NEON;
1655 VP8PredChroma8[5] = DC8uvNoLeft_NEON;
1656 VP8PredChroma8[6] = DC8uvNoTopLeft_NEON;
1657}
1658
1659#else // !WEBP_USE_NEON
1660
// WEBP_USE_NEON is not defined for this build: VP8DspInitNEON is supplied by
// the WEBP_DSP_INIT_STUB macro instead (presumably an empty function -- see
// the macro's definition).
WEBP_DSP_INIT_STUB(VP8DspInitNEON)
1662
1663#endif // WEBP_USE_NEON
1664