// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of dsp functions and loop filtering.
//
// Authors: Somnath Banerjee (somnath@google.com)
//          Johann Koenig (johannkoenig@google.com)

#include "./dsp.h"

#if defined(WEBP_USE_NEON)

#include "./neon.h"
#include "../dec/vp8i_dec.h"

//------------------------------------------------------------------------------
// NxM Loading functions

// Load/Store vertical edge
#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
  "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
  "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
  "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
  "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"

#define STORE8x2(c1, c2, p, stride)                        \
  "vst2.8 {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n"  \
  "vst2.8 {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n"  \
  "vst2.8 {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n"  \
  "vst2.8 {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n"  \
  "vst2.8 {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n"  \
  "vst2.8 {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n"  \
  "vst2.8 {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n"  \
  "vst2.8 {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"

#if !defined(WORK_AROUND_GCC)

// This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
// (register alloc, probably). The variants somewhat mitigate the problem, but
// not quite. HFilter16i() remains problematic.
static WEBP_INLINE uint8x8x4_t Load4x8(const uint8_t* const src, int stride) {
  const uint8x8_t zero = vdup_n_u8(0);
  uint8x8x4_t out;
  INIT_VECTOR4(out, zero, zero, zero, zero);
  out = vld4_lane_u8(src + 0 * stride, out, 0);
  out = vld4_lane_u8(src + 1 * stride, out, 1);
  out = vld4_lane_u8(src + 2 * stride, out, 2);
  out = vld4_lane_u8(src + 3 * stride, out, 3);
  out = vld4_lane_u8(src + 4 * stride, out, 4);
  out = vld4_lane_u8(src + 5 * stride, out, 5);
  out = vld4_lane_u8(src + 6 * stride, out, 6);
  out = vld4_lane_u8(src + 7 * stride, out, 7);
  return out;
}

static WEBP_INLINE void Load4x16(const uint8_t* const src, int stride,
                                 uint8x16_t* const p1, uint8x16_t* const p0,
                                 uint8x16_t* const q0, uint8x16_t* const q1) {
  // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
  // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
  const uint8x8x4_t row0 = Load4x8(src - 2 + 0 * stride, stride);
  const uint8x8x4_t row8 = Load4x8(src - 2 + 8 * stride, stride);
  *p1 = vcombine_u8(row0.val[0], row8.val[0]);
  *p0 = vcombine_u8(row0.val[1], row8.val[1]);
  *q0 = vcombine_u8(row0.val[2], row8.val[2]);
  *q1 = vcombine_u8(row0.val[3], row8.val[3]);
}

#else  // WORK_AROUND_GCC

#define LOADQ_LANE_32b(VALUE, LANE) do {                             \
  (VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE));   \
  src += stride;                                                     \
} while (0)

static WEBP_INLINE void Load4x16(const uint8_t* src, int stride,
                                 uint8x16_t* const p1, uint8x16_t* const p0,
                                 uint8x16_t* const q0, uint8x16_t* const q1) {
  const uint32x4_t zero = vdupq_n_u32(0);
  uint32x4x4_t in;
  INIT_VECTOR4(in, zero, zero, zero, zero);
  src -= 2;
  LOADQ_LANE_32b(in.val[0], 0);
  LOADQ_LANE_32b(in.val[1], 0);
  LOADQ_LANE_32b(in.val[2], 0);
  LOADQ_LANE_32b(in.val[3], 0);
  LOADQ_LANE_32b(in.val[0], 1);
  LOADQ_LANE_32b(in.val[1], 1);
  LOADQ_LANE_32b(in.val[2], 1);
  LOADQ_LANE_32b(in.val[3], 1);
  LOADQ_LANE_32b(in.val[0], 2);
  LOADQ_LANE_32b(in.val[1], 2);
  LOADQ_LANE_32b(in.val[2], 2);
  LOADQ_LANE_32b(in.val[3], 2);
  LOADQ_LANE_32b(in.val[0], 3);
  LOADQ_LANE_32b(in.val[1], 3);
  LOADQ_LANE_32b(in.val[2], 3);
  LOADQ_LANE_32b(in.val[3], 3);
  // Transpose four 4x4 parts:
  {
    const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),
                                        vreinterpretq_u8_u32(in.val[1]));
    const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),
                                        vreinterpretq_u8_u32(in.val[3]));
    const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
                                         vreinterpretq_u16_u8(row23.val[0]));
    const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
                                         vreinterpretq_u16_u8(row23.val[1]));
    *p1 = vreinterpretq_u8_u16(row02.val[0]);
    *p0 = vreinterpretq_u8_u16(row13.val[0]);
    *q0 = vreinterpretq_u8_u16(row02.val[1]);
    *q1 = vreinterpretq_u8_u16(row13.val[1]);
  }
}
#undef LOADQ_LANE_32b

#endif  // !WORK_AROUND_GCC

static WEBP_INLINE void Load8x16(const uint8_t* const src, int stride,
                                 uint8x16_t* const p3, uint8x16_t* const p2,
                                 uint8x16_t* const p1, uint8x16_t* const p0,
                                 uint8x16_t* const q0, uint8x16_t* const q1,
                                 uint8x16_t* const q2, uint8x16_t* const q3) {
  Load4x16(src - 2, stride, p3, p2, p1, p0);
  Load4x16(src + 2, stride, q0, q1, q2, q3);
}

static WEBP_INLINE void Load16x4(const uint8_t* const src, int stride,
                                 uint8x16_t* const p1, uint8x16_t* const p0,
                                 uint8x16_t* const q0, uint8x16_t* const q1) {
  *p1 = vld1q_u8(src - 2 * stride);
  *p0 = vld1q_u8(src - 1 * stride);
  *q0 = vld1q_u8(src + 0 * stride);
  *q1 = vld1q_u8(src + 1 * stride);
}

static WEBP_INLINE void Load16x8(const uint8_t* const src, int stride,
                                 uint8x16_t* const p3, uint8x16_t* const p2,
                                 uint8x16_t* const p1, uint8x16_t* const p0,
                                 uint8x16_t* const q0, uint8x16_t* const q1,
                                 uint8x16_t* const q2, uint8x16_t* const q3) {
  Load16x4(src - 2 * stride, stride, p3, p2, p1, p0);
  Load16x4(src + 2 * stride, stride, q0, q1, q2, q3);
}

static WEBP_INLINE void Load8x8x2(const uint8_t* const u,
                                  const uint8_t* const v,
                                  int stride,
                                  uint8x16_t* const p3, uint8x16_t* const p2,
                                  uint8x16_t* const p1, uint8x16_t* const p0,
                                  uint8x16_t* const q0, uint8x16_t* const q1,
                                  uint8x16_t* const q2, uint8x16_t* const q3) {
  // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
  // and the v-samples on the higher half.
  *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
  *p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));
  *p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));
  *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));
  *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));
  *q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));
  *q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));
  *q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));
}

#if !defined(WORK_AROUND_GCC)

#define LOAD_UV_8(ROW) \
  vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))

static WEBP_INLINE void Load8x8x2T(const uint8_t* const u,
                                   const uint8_t* const v,
                                   int stride,
                                   uint8x16_t* const p3, uint8x16_t* const p2,
                                   uint8x16_t* const p1, uint8x16_t* const p0,
                                   uint8x16_t* const q0, uint8x16_t* const q1,
                                   uint8x16_t* const q2, uint8x16_t* const q3) {
  // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
  // and the v-samples on the higher half.
  const uint8x16_t row0 = LOAD_UV_8(0);
  const uint8x16_t row1 = LOAD_UV_8(1);
  const uint8x16_t row2 = LOAD_UV_8(2);
  const uint8x16_t row3 = LOAD_UV_8(3);
  const uint8x16_t row4 = LOAD_UV_8(4);
  const uint8x16_t row5 = LOAD_UV_8(5);
  const uint8x16_t row6 = LOAD_UV_8(6);
  const uint8x16_t row7 = LOAD_UV_8(7);
  // Perform two side-by-side 8x8 transposes
  // u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07
  // u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ...
  // u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...
  // u30 u31 u32 u33 u34 u35 u36 u37 | ...
  // u40 u41 u42 u43 u44 u45 u46 u47 | ...
  // u50 u51 u52 u53 u54 u55 u56 u57 | ...
  // u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...
  // u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...
  const uint8x16x2_t row01 = vtrnq_u8(row0, row1);  // u00 u10 u02 u12 ...
                                                    // u01 u11 u03 u13 ...
  const uint8x16x2_t row23 = vtrnq_u8(row2, row3);  // u20 u30 u22 u32 ...
                                                    // u21 u31 u23 u33 ...
  const uint8x16x2_t row45 = vtrnq_u8(row4, row5);  // ...
  const uint8x16x2_t row67 = vtrnq_u8(row6, row7);  // ...
  const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
                                       vreinterpretq_u16_u8(row23.val[0]));
  const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
                                       vreinterpretq_u16_u8(row23.val[1]));
  const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),
                                       vreinterpretq_u16_u8(row67.val[0]));
  const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),
                                       vreinterpretq_u16_u8(row67.val[1]));
  const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),
                                       vreinterpretq_u32_u16(row46.val[0]));
  const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),
                                       vreinterpretq_u32_u16(row46.val[1]));
  const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),
                                       vreinterpretq_u32_u16(row57.val[0]));
  const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),
                                       vreinterpretq_u32_u16(row57.val[1]));
  *p3 = vreinterpretq_u8_u32(row04.val[0]);
  *p2 = vreinterpretq_u8_u32(row15.val[0]);
  *p1 = vreinterpretq_u8_u32(row26.val[0]);
  *p0 = vreinterpretq_u8_u32(row37.val[0]);
  *q0 = vreinterpretq_u8_u32(row04.val[1]);
  *q1 = vreinterpretq_u8_u32(row15.val[1]);
  *q2 = vreinterpretq_u8_u32(row26.val[1]);
  *q3 = vreinterpretq_u8_u32(row37.val[1]);
}
#undef LOAD_UV_8

#endif  // !WORK_AROUND_GCC

static WEBP_INLINE void Store2x8(const uint8x8x2_t v,
                                 uint8_t* const dst, int stride) {
  vst2_lane_u8(dst + 0 * stride, v, 0);
  vst2_lane_u8(dst + 1 * stride, v, 1);
  vst2_lane_u8(dst + 2 * stride, v, 2);
  vst2_lane_u8(dst + 3 * stride, v, 3);
  vst2_lane_u8(dst + 4 * stride, v, 4);
  vst2_lane_u8(dst + 5 * stride, v, 5);
  vst2_lane_u8(dst + 6 * stride, v, 6);
  vst2_lane_u8(dst + 7 * stride, v, 7);
}

static WEBP_INLINE void Store2x16(const uint8x16_t p0, const uint8x16_t q0,
                                  uint8_t* const dst, int stride) {
  uint8x8x2_t lo, hi;
  lo.val[0] = vget_low_u8(p0);
  lo.val[1] = vget_low_u8(q0);
  hi.val[0] = vget_high_u8(p0);
  hi.val[1] = vget_high_u8(q0);
  Store2x8(lo, dst - 1 + 0 * stride, stride);
  Store2x8(hi, dst - 1 + 8 * stride, stride);
}

#if !defined(WORK_AROUND_GCC)
static WEBP_INLINE void Store4x8(const uint8x8x4_t v,
                                 uint8_t* const dst, int stride) {
  vst4_lane_u8(dst + 0 * stride, v, 0);
  vst4_lane_u8(dst + 1 * stride, v, 1);
  vst4_lane_u8(dst + 2 * stride, v, 2);
  vst4_lane_u8(dst + 3 * stride, v, 3);
  vst4_lane_u8(dst + 4 * stride, v, 4);
  vst4_lane_u8(dst + 5 * stride, v, 5);
  vst4_lane_u8(dst + 6 * stride, v, 6);
  vst4_lane_u8(dst + 7 * stride, v, 7);
}

static WEBP_INLINE void Store4x16(const uint8x16_t p1, const uint8x16_t p0,
                                  const uint8x16_t q0, const uint8x16_t q1,
                                  uint8_t* const dst, int stride) {
  uint8x8x4_t lo, hi;
  INIT_VECTOR4(lo,
               vget_low_u8(p1), vget_low_u8(p0),
               vget_low_u8(q0), vget_low_u8(q1));
  INIT_VECTOR4(hi,
               vget_high_u8(p1), vget_high_u8(p0),
               vget_high_u8(q0), vget_high_u8(q1));
  Store4x8(lo, dst - 2 + 0 * stride, stride);
  Store4x8(hi, dst - 2 + 8 * stride, stride);
}
#endif  // !WORK_AROUND_GCC

static WEBP_INLINE void Store16x2(const uint8x16_t p0, const uint8x16_t q0,
                                  uint8_t* const dst, int stride) {
  vst1q_u8(dst - stride, p0);
  vst1q_u8(dst, q0);
}

static WEBP_INLINE void Store16x4(const uint8x16_t p1, const uint8x16_t p0,
                                  const uint8x16_t q0, const uint8x16_t q1,
                                  uint8_t* const dst, int stride) {
  Store16x2(p1, p0, dst - stride, stride);
  Store16x2(q0, q1, dst + stride, stride);
}

static WEBP_INLINE void Store8x2x2(const uint8x16_t p0, const uint8x16_t q0,
                                   uint8_t* const u, uint8_t* const v,
                                   int stride) {
  // p0 and q0 contain the u+v samples packed in low/high halves.
  vst1_u8(u - stride, vget_low_u8(p0));
  vst1_u8(u,          vget_low_u8(q0));
  vst1_u8(v - stride, vget_high_u8(p0));
  vst1_u8(v,          vget_high_u8(q0));
}

static WEBP_INLINE void Store8x4x2(const uint8x16_t p1, const uint8x16_t p0,
                                   const uint8x16_t q0, const uint8x16_t q1,
                                   uint8_t* const u, uint8_t* const v,
                                   int stride) {
  // The p1...q1 registers contain the u+v samples packed in low/high halves.
  Store8x2x2(p1, p0, u - stride, v - stride, stride);
  Store8x2x2(q0, q1, u + stride, v + stride, stride);
}

#if !defined(WORK_AROUND_GCC)

#define STORE6_LANE(DST, VAL0, VAL1, LANE) do {   \
  vst3_lane_u8((DST) - 3, (VAL0), (LANE));        \
  vst3_lane_u8((DST) + 0, (VAL1), (LANE));        \
  (DST) += stride;                                \
} while (0)

static WEBP_INLINE void Store6x8x2(const uint8x16_t p2, const uint8x16_t p1,
                                   const uint8x16_t p0, const uint8x16_t q0,
                                   const uint8x16_t q1, const uint8x16_t q2,
                                   uint8_t* u, uint8_t* v,
                                   int stride) {
  uint8x8x3_t u0, u1, v0, v1;
  INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
  INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
  INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));
  INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));
  STORE6_LANE(u, u0, u1, 0);
  STORE6_LANE(u, u0, u1, 1);
  STORE6_LANE(u, u0, u1, 2);
  STORE6_LANE(u, u0, u1, 3);
  STORE6_LANE(u, u0, u1, 4);
  STORE6_LANE(u, u0, u1, 5);
  STORE6_LANE(u, u0, u1, 6);
  STORE6_LANE(u, u0, u1, 7);
  STORE6_LANE(v, v0, v1, 0);
  STORE6_LANE(v, v0, v1, 1);
  STORE6_LANE(v, v0, v1, 2);
  STORE6_LANE(v, v0, v1, 3);
  STORE6_LANE(v, v0, v1, 4);
  STORE6_LANE(v, v0, v1, 5);
  STORE6_LANE(v, v0, v1, 6);
  STORE6_LANE(v, v0, v1, 7);
}
#undef STORE6_LANE

static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0,
                                   const uint8x16_t q0, const uint8x16_t q1,
                                   uint8_t* const u, uint8_t* const v,
                                   int stride) {
  uint8x8x4_t u0, v0;
  INIT_VECTOR4(u0,
               vget_low_u8(p1), vget_low_u8(p0),
               vget_low_u8(q0), vget_low_u8(q1));
  INIT_VECTOR4(v0,
               vget_high_u8(p1), vget_high_u8(p0),
               vget_high_u8(q0), vget_high_u8(q1));
  vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
  vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
  vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
  vst4_lane_u8(u - 2 + 3 * stride, u0, 3);
  vst4_lane_u8(u - 2 + 4 * stride, u0, 4);
  vst4_lane_u8(u - 2 + 5 * stride, u0, 5);
  vst4_lane_u8(u - 2 + 6 * stride, u0, 6);
  vst4_lane_u8(u - 2 + 7 * stride, u0, 7);
  vst4_lane_u8(v - 2 + 0 * stride, v0, 0);
  vst4_lane_u8(v - 2 + 1 * stride, v0, 1);
  vst4_lane_u8(v - 2 + 2 * stride, v0, 2);
  vst4_lane_u8(v - 2 + 3 * stride, v0, 3);
  vst4_lane_u8(v - 2 + 4 * stride, v0, 4);
  vst4_lane_u8(v - 2 + 5 * stride, v0, 5);
  vst4_lane_u8(v - 2 + 6 * stride, v0, 6);
  vst4_lane_u8(v - 2 + 7 * stride, v0, 7);
}

#endif  // !WORK_AROUND_GCC

// Zero extend 'v' to an int16x8_t.
static WEBP_INLINE int16x8_t ConvertU8ToS16(uint8x8_t v) {
  return vreinterpretq_s16_u16(vmovl_u8(v));
}

// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
// to the corresponding rows of 'dst'.
static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
                                            const int16x8_t dst01,
                                            const int16x8_t dst23) {
  // Unsigned saturate to 8b.
  const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
  const uint8x8_t dst23_u8 = vqmovun_s16(dst23);

  // Store the results.
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
}

static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
                               uint8_t* const dst) {
  uint32x2_t dst01 = vdup_n_u32(0);
  uint32x2_t dst23 = vdup_n_u32(0);

  // Load the source pixels.
  dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
  dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
  dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
  dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);

  {
    // Convert to 16b.
    const int16x8_t dst01_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst01));
    const int16x8_t dst23_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst23));

    // Descale with rounding.
    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
    // Add the inverse transform.
    SaturateAndStore4x4(dst, out01, out23);
  }
}

//-----------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)

static uint8x16_t NeedsFilter(const uint8x16_t p1, const uint8x16_t p0,
                              const uint8x16_t q0, const uint8x16_t q1,
                              int thresh) {
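  // Simple-filter decision from the VP8 spec (paragraph 15.2):
  // filter the edge iff 2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= thresh.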
  const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);
  const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0);               // abs(p0-q0)
  const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1);               // abs(p1-q1)
  const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0);  // 2 * abs(p0-q0)
  const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1);       // abs(p1-q1) / 2
  const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2);
  const uint8x16_t mask = vcgeq_u8(thresh_v, sum);
  return mask;
}

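// Samples are biased from [0..255] to [-128..127] by toggling the sign bit,
// so that the filter deltas below can use saturating signed arithmetic.
// FlipSignBack() undoes the bias.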
static int8x16_t FlipSign(const uint8x16_t v) {
  const uint8x16_t sign_bit = vdupq_n_u8(0x80);
  return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
}

static uint8x16_t FlipSignBack(const int8x16_t v) {
  const int8x16_t sign_bit = vdupq_n_s8(0x80);
  return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
}

static int8x16_t GetBaseDelta(const int8x16_t p1, const int8x16_t p0,
                              const int8x16_t q0, const int8x16_t q1) {
  const int8x16_t q0_p0 = vqsubq_s8(q0, p0);   // (q0-p0)
  const int8x16_t p1_q1 = vqsubq_s8(p1, q1);   // (p1-q1)
  const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0);  // (p1-q1) + 1 * (q0 - p0)
  const int8x16_t s2 = vqaddq_s8(q0_p0, s1);     // (p1-q1) + 2 * (q0 - p0)
  const int8x16_t s3 = vqaddq_s8(q0_p0, s2);     // (p1-q1) + 3 * (q0 - p0)
  return s3;
}

static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) {
  const int8x16_t q0_p0 = vqsubq_s8(q0, p0);     // (q0-p0)
  const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0);  // 2 * (q0 - p0)
  const int8x16_t s2 = vqaddq_s8(q0_p0, s1);     // 3 * (q0 - p0)
  return s2;
}

//------------------------------------------------------------------------------

static void ApplyFilter2NoFlip(const int8x16_t p0s, const int8x16_t q0s,
                               const int8x16_t delta,
                               int8x16_t* const op0, int8x16_t* const oq0) {
  const int8x16_t kCst3 = vdupq_n_s8(0x03);
  const int8x16_t kCst4 = vdupq_n_s8(0x04);
  const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
  const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
  const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
  const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
  *op0 = vqaddq_s8(p0s, delta3);
  *oq0 = vqsubq_s8(q0s, delta4);
}

#if defined(WEBP_USE_INTRINSICS)

static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s,
                         const int8x16_t delta,
                         uint8x16_t* const op0, uint8x16_t* const oq0) {
  const int8x16_t kCst3 = vdupq_n_s8(0x03);
  const int8x16_t kCst4 = vdupq_n_s8(0x04);
  const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
  const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
  const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
  const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
  const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
  const int8x16_t sq0 = vqsubq_s8(q0s, delta4);
  *op0 = FlipSignBack(sp0);
  *oq0 = FlipSignBack(sq0);
}

static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0,
                      const uint8x16_t q0, const uint8x16_t q1,
                      const uint8x16_t mask,
                      uint8x16_t* const op0, uint8x16_t* const oq0) {
  const int8x16_t p1s = FlipSign(p1);
  const int8x16_t p0s = FlipSign(p0);
  const int8x16_t q0s = FlipSign(q0);
  const int8x16_t q1s = FlipSign(q1);
  const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
  const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
  ApplyFilter2(p0s, q0s, delta1, op0, oq0);
}

static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
  uint8x16_t p1, p0, q0, q1, op0, oq0;
  Load16x4(p, stride, &p1, &p0, &q0, &q1);
  {
    const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
    DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
  }
  Store16x2(op0, oq0, p, stride);
}

static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
  uint8x16_t p1, p0, q0, q1, oq0, op0;
  Load4x16(p, stride, &p1, &p0, &q0, &q1);
  {
    const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
    DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
  }
  Store2x16(op0, oq0, p, stride);
}

#else

#define QRegs "q0", "q1", "q2", "q3",  \
              "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

#define FLIP_SIGN_BIT2(a, b, s)      \
  "veor " #a "," #a "," #s " \n"     \
  "veor " #b "," #b "," #s " \n"     \

#define FLIP_SIGN_BIT4(a, b, c, d, s)  \
  FLIP_SIGN_BIT2(a, b, s)              \
  FLIP_SIGN_BIT2(c, d, s)              \

#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask)                          \
  "vabd.u8 q15," #p0 "," #q0 " \n"   /* abs(p0 - q0) */                     \
  "vabd.u8 q14," #p1 "," #q1 " \n"   /* abs(p1 - q1) */                     \
  "vqadd.u8 q15, q15, q15 \n"        /* abs(p0 - q0) * 2 */                 \
  "vshr.u8 q14, q14, #1 \n"          /* abs(p1 - q1) / 2 */                 \
  "vqadd.u8 q15, q15, q14 \n"     /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
  "vdup.8 q14, " #thresh " \n"                                              \
  "vcge.u8 " #mask ", q14, q15 \n"   /* mask = (sum <= thresh) */

#define GET_BASE_DELTA(p1, p0, q0, q1, o)                                   \
  "vqsub.s8 q15," #q0 "," #p0 " \n"          /* (q0 - p0) */                \
  "vqsub.s8 " #o "," #p1 "," #q1 " \n"       /* (p1 - q1) */                \
  "vqadd.s8 " #o "," #o ", q15 \n"           /* (p1 - q1) + 1 * (q0 - p0) */\
  "vqadd.s8 " #o "," #o ", q15 \n"           /* (p1 - q1) + 2 * (q0 - p0) */\
  "vqadd.s8 " #o "," #o ", q15 \n"           /* (p1 - q1) + 3 * (q0 - p0) */

#define DO_SIMPLE_FILTER(p0, q0, fl)                                        \
  "vmov.i8 q15, #0x03 \n"                                                   \
  "vqadd.s8 q15, q15, " #fl " \n"            /* filter1 = filter + 3 */     \
  "vshr.s8 q15, q15, #3 \n"                  /* filter1 >> 3 */             \
  "vqadd.s8 " #p0 "," #p0 ", q15 \n"         /* p0 += filter1 */            \
                                                                            \
  "vmov.i8 q15, #0x04 \n"                                                   \
  "vqadd.s8 q15, q15, " #fl " \n"            /* filter2 = filter + 4 */     \
  "vshr.s8 q15, q15, #3 \n"                  /* filter2 >> 3 */             \
  "vqsub.s8 " #q0 "," #q0 ", q15 \n"         /* q0 -= filter2 */

// Applies filter on 2 pixels (p0 and q0)
#define DO_FILTER2(p1, p0, q0, q1, thresh)                                  \
  NEEDS_FILTER(p1, p0, q0, q1, thresh, q9)   /* filter mask in q9 */        \
  "vmov.i8 q10, #0x80 \n"                    /* sign bit */                 \
  FLIP_SIGN_BIT4(p1, p0, q0, q1, q10)        /* convert to signed value */  \
  GET_BASE_DELTA(p1, p0, q0, q1, q11)        /* get filter level */         \
  "vand q9, q9, q11 \n"                      /* apply filter mask */        \
  DO_SIMPLE_FILTER(p0, q0, q9)               /* apply filter */             \
  FLIP_SIGN_BIT2(p0, q0, q10)

static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
  __asm__ volatile (
    "sub %[p], %[p], %[stride], lsl #1 \n"  // p -= 2 * stride

    "vld1.u8 {q1}, [%[p]], %[stride] \n"    // p1
    "vld1.u8 {q2}, [%[p]], %[stride] \n"    // p0
    "vld1.u8 {q3}, [%[p]], %[stride] \n"    // q0
    "vld1.u8 {q12}, [%[p]] \n"              // q1

    DO_FILTER2(q1, q2, q3, q12, %[thresh])

    "sub %[p], %[p], %[stride], lsl #1 \n"  // p -= 2 * stride

    "vst1.u8 {q2}, [%[p]], %[stride] \n"    // store op0
    "vst1.u8 {q3}, [%[p]] \n"               // store oq0
    : [p] "+r"(p)
    : [stride] "r"(stride), [thresh] "r"(thresh)
    : "memory", QRegs
  );
}

static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
  __asm__ volatile (
    "sub r4, %[p], #2 \n"                   // base1 = p - 2
    "lsl r6, %[stride], #1 \n"              // r6 = 2 * stride
    "add r5, r4, %[stride] \n"              // base2 = base1 + stride

    LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
    LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
    "vswp d3, d24 \n"                       // p1:q1 p0:q3
    "vswp d5, d26 \n"                       // q0:q2 q1:q4
    "vswp q2, q12 \n"                       // p1:q1 p0:q2 q0:q3 q1:q4

    DO_FILTER2(q1, q2, q12, q13, %[thresh])

    "sub %[p], %[p], #1 \n"                 // p - 1

    "vswp d5, d24 \n"
    STORE8x2(d4, d5, [%[p]], %[stride])
    STORE8x2(d24, d25, [%[p]], %[stride])

    : [p] "+r"(p)
    : [stride] "r"(stride), [thresh] "r"(thresh)
    : "memory", "r4", "r5", "r6", QRegs
  );
}

#endif  // WEBP_USE_INTRINSICS

static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
  uint32_t k;
  for (k = 3; k != 0; --k) {
    p += 4 * stride;
    SimpleVFilter16(p, stride, thresh);
  }
}

static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
  uint32_t k;
  for (k = 3; k != 0; --k) {
    p += 4;
    SimpleHFilter16(p, stride, thresh);
  }
}

//------------------------------------------------------------------------------
// Complex In-loop filtering (Paragraph 15.3)

static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0,
                           const uint8x16_t q0, const uint8x16_t q1,
                           int hev_thresh) {
  const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);
  const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
  const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
  const uint8x16_t a_max = vmaxq_u8(a_p1_p0, a_q1_q0);
  const uint8x16_t mask = vcgtq_u8(a_max, hev_thresh_v);
  return mask;
}

static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2,
                               const uint8x16_t p1, const uint8x16_t p0,
                               const uint8x16_t q0, const uint8x16_t q1,
                               const uint8x16_t q2, const uint8x16_t q3,
                               int ithresh, int thresh) {
  const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);
  const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2);  // abs(p3 - p2)
  const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1);  // abs(p2 - p1)
  const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
  const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2);  // abs(q3 - q2)
  const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1);  // abs(q2 - q1)
  const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
  const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1);
  const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2);
  const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0);
  const uint8x16_t max12 = vmaxq_u8(max1, max2);
  const uint8x16_t max123 = vmaxq_u8(max12, max3);
  const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);
  const uint8x16_t mask1 = NeedsFilter(p1, p0, q0, q1, thresh);
  const uint8x16_t mask = vandq_u8(mask1, mask2);
  return mask;
}

// 4-points filter

static void ApplyFilter4(
    const int8x16_t p1, const int8x16_t p0,
    const int8x16_t q0, const int8x16_t q1,
    const int8x16_t delta0,
    uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1) {
  const int8x16_t kCst3 = vdupq_n_s8(0x03);
  const int8x16_t kCst4 = vdupq_n_s8(0x04);
  const int8x16_t delta1 = vqaddq_s8(delta0, kCst4);
  const int8x16_t delta2 = vqaddq_s8(delta0, kCst3);
  const int8x16_t a1 = vshrq_n_s8(delta1, 3);
  const int8x16_t a2 = vshrq_n_s8(delta2, 3);
  const int8x16_t a3 = vrshrq_n_s8(a1, 1);   // a3 = (a1 + 1) >> 1
  *op0 = FlipSignBack(vqaddq_s8(p0, a2));    // clip(p0 + a2)
  *oq0 = FlipSignBack(vqsubq_s8(q0, a1));    // clip(q0 - a1)
  *op1 = FlipSignBack(vqaddq_s8(p1, a3));    // clip(p1 + a3)
  *oq1 = FlipSignBack(vqsubq_s8(q1, a3));    // clip(q1 - a3)
}

static void DoFilter4(
    const uint8x16_t p1, const uint8x16_t p0,
    const uint8x16_t q0, const uint8x16_t q1,
    const uint8x16_t mask, const uint8x16_t hev_mask,
    uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1) {
  // This is a fused version of DoFilter2() calling ApplyFilter2 directly
  const int8x16_t p1s = FlipSign(p1);
  int8x16_t p0s = FlipSign(p0);
  int8x16_t q0s = FlipSign(q0);
  const int8x16_t q1s = FlipSign(q1);
  const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);

  // do_filter2 part (simple loopfilter on pixels with hev)
  {
    const int8x16_t delta = GetBaseDelta(p1s, p0s, q0s, q1s);
    const int8x16_t simple_lf_delta =
        vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
    ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s);
  }

  // do_filter4 part (complex loopfilter on pixels without hev)
  {
    const int8x16_t delta0 = GetBaseDelta0(p0s, q0s);
    // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
    const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
    const int8x16_t complex_lf_delta =
        vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
    ApplyFilter4(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
  }
}

// 6-points filter

static void ApplyFilter6(
    const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
    const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
    const int8x16_t delta,
    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
  // We have to compute: X = (9*a+63) >> 7, Y = (18*a+63)>>7, Z = (27*a+63) >> 7
  // Turns out, there's a common sub-expression S=9 * a - 1 that can be used
  // with the special vqrshrn_n_s16 rounding-shift-and-narrow instruction:
  // X = (S + 64) >> 7, Y = (S + 32) >> 6, Z = (18 * a + S + 64) >> 7
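  // (Note: (S + 32) >> 6 == (18 * a + 62) >> 7 == (18 * a + 63) >> 7,
  // since 18 * a is always even.)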
  const int8x8_t delta_lo = vget_low_s8(delta);
  const int8x8_t delta_hi = vget_high_s8(delta);
  const int8x8_t kCst9 = vdup_n_s8(9);
  const int16x8_t kCstm1 = vdupq_n_s16(-1);
  const int8x8_t kCst18 = vdup_n_s8(18);
  const int16x8_t S_lo = vmlal_s8(kCstm1, kCst9, delta_lo);  // S = 9 * a - 1
  const int16x8_t S_hi = vmlal_s8(kCstm1, kCst9, delta_hi);
  const int16x8_t Z_lo = vmlal_s8(S_lo, kCst18, delta_lo);   // S + 18 * a
  const int16x8_t Z_hi = vmlal_s8(S_hi, kCst18, delta_hi);
  const int8x8_t a3_lo = vqrshrn_n_s16(S_lo, 7);   // (9 * a + 63) >> 7
  const int8x8_t a3_hi = vqrshrn_n_s16(S_hi, 7);
  const int8x8_t a2_lo = vqrshrn_n_s16(S_lo, 6);   // (9 * a + 31) >> 6
  const int8x8_t a2_hi = vqrshrn_n_s16(S_hi, 6);
  const int8x8_t a1_lo = vqrshrn_n_s16(Z_lo, 7);   // (27 * a + 63) >> 7
  const int8x8_t a1_hi = vqrshrn_n_s16(Z_hi, 7);
  const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
  const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
  const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);

  *op0 = FlipSignBack(vqaddq_s8(p0, a1));  // clip(p0 + a1)
  *oq0 = FlipSignBack(vqsubq_s8(q0, a1));  // clip(q0 - a1)
  *oq1 = FlipSignBack(vqsubq_s8(q1, a2));  // clip(q1 - a2)
  *op1 = FlipSignBack(vqaddq_s8(p1, a2));  // clip(p1 + a2)
  *oq2 = FlipSignBack(vqsubq_s8(q2, a3));  // clip(q2 - a3)
  *op2 = FlipSignBack(vqaddq_s8(p2, a3));  // clip(p2 + a3)
}

static void DoFilter6(
    const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
    const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
    const uint8x16_t mask, const uint8x16_t hev_mask,
    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
  // This is a fused version of DoFilter2() calling ApplyFilter2 directly
  const int8x16_t p2s = FlipSign(p2);
  const int8x16_t p1s = FlipSign(p1);
  int8x16_t p0s = FlipSign(p0);
  int8x16_t q0s = FlipSign(q0);
  const int8x16_t q1s = FlipSign(q1);
  const int8x16_t q2s = FlipSign(q2);
  const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
  const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);

  // do_filter2 part (simple loopfilter on pixels with hev)
  {
    const int8x16_t simple_lf_delta =
        vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
    ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s);
  }

  // do_filter6 part (complex loopfilter on pixels without hev)
  {
    // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
    const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
    const int8x16_t complex_lf_delta =
        vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
    ApplyFilter6(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
                 op2, op1, op0, oq0, oq1, oq2);
  }
}

// on macroblock edges

static void VFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load16x8(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                         ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
              &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store16x2(op2, op1, p - 2 * stride, stride);
    Store16x2(op0, oq0, p + 0 * stride, stride);
    Store16x2(oq1, oq2, p + 2 * stride, stride);
  }
}

static void HFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x16(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                         ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
              &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store2x16(op2, op1, p - 2, stride);
    Store2x16(op0, oq0, p + 0, stride);
    Store2x16(oq1, oq2, p + 2, stride);
  }
}

// on three inner edges
static void VFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  uint32_t k;
  uint8x16_t p3, p2, p1, p0;
  Load16x4(p + 2 * stride, stride, &p3, &p2, &p1, &p0);
  for (k = 3; k != 0; --k) {
    uint8x16_t q0, q1, q2, q3;
    p += 4 * stride;
    Load16x4(p + 2 * stride, stride, &q0, &q1, &q2, &q3);
    {
      const uint8x16_t mask =
          NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
      const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
      // p3 and p2 are not just temporary variables here: they will be
      // re-used for the next span. And q2/q3 will become p1/p0 accordingly.
      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
      Store16x4(p1, p0, p3, p2, p, stride);
      p1 = q2;
      p0 = q3;
    }
  }
}

#if !defined(WORK_AROUND_GCC)
static void HFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  uint32_t k;
  uint8x16_t p3, p2, p1, p0;
  Load4x16(p + 2, stride, &p3, &p2, &p1, &p0);
  for (k = 3; k != 0; --k) {
    uint8x16_t q0, q1, q2, q3;
    p += 4;
    Load4x16(p + 2, stride, &q0, &q1, &q2, &q3);
    {
      const uint8x16_t mask =
          NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
      const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
      Store4x16(p1, p0, p3, p2, p, stride);
      p1 = q2;
      p0 = q3;
    }
  }
}
#endif  // !WORK_AROUND_GCC

// 8-pixels wide variant, for chroma filtering
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                         ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
              &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store8x2x2(op2, op1, u - 2 * stride, v - 2 * stride, stride);
    Store8x2x2(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
    Store8x2x2(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
  }
}

static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  u += 4 * stride;
  v += 4 * stride;
  Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                         ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op1, op0, oq0, oq1;
    DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
    Store8x4x2(op1, op0, oq0, oq1, u, v, stride);
  }
}

#if !defined(WORK_AROUND_GCC)
static void HFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                         ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
              &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store6x8x2(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
  }
}

static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  u += 4;
  v += 4;
  Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                         ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op1, op0, oq0, oq1;
    DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
    Store4x8x2(op1, op0, oq0, oq1, u, v, stride);
  }
}
#endif  // !WORK_AROUND_GCC

//-----------------------------------------------------------------------------
// Inverse transforms (Paragraph 14.4)

// Technically these are unsigned but vqdmulh is only available in signed.
// vqdmulh returns high half (effectively >> 16) but also doubles the value,
// changing the >> 16 to >> 15 and requiring an additional >> 1.
// We use this to our advantage with kC2. The canonical value is 35468.
// However, the high bit is set so treating it as signed will give incorrect
// results. We avoid this by down shifting by 1 here to clear the highest bit.
// Combined with the doubling effect of vqdmulh we get >> 16.
// This can not be applied to kC1 because the lowest bit is set. Down shifting
// the constant would reduce precision.
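// Concretely: kC2 below stores 35468 >> 1 = 17734, and
// vqdmulhq_n_s16(x, 17734) yields (2 * x * 17734) >> 16 == (x * 35468) >> 16.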

// libwebp uses a trick to avoid some extra addition that libvpx does.
// Instead of:
// temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
// libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
// same issue with kC1 and vqdmulh that we work around by down shifting kC2

static const int16_t kC1 = 20091;
static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.

#if defined(WEBP_USE_INTRINSICS)
static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
                                     int16x8x2_t* const out) {
  // a0 a1 a2 a3 | b0 b1 b2 b3   =>   a0 b0 c0 d0 | a1 b1 c1 d1
  // c0 c1 c2 c3 | d0 d1 d2 d3        a2 b2 c2 d2 | a3 b3 c3 d3
  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
                                                  // b0 d0 b1 d1 b2 d2 ...
  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
}

static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
  // {rows} = in0 | in4
  //          in8 | in12
  // B1 = in4 | in12
  const int16x8_t B1 =
      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
  // C0 = kC1 * in4 | kC1 * in12
  // C1 = kC2 * in4 | kC2 * in12
  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));  // in0 + in8
  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));  // in0 - in8
  // c = kC2 * in4 - kC1 * in12
  // d = kC1 * in4 + kC2 * in12
  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
  const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
  const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
  Transpose8x2(E0, E1, rows);
}

static void TransformOne(const int16_t* in, uint8_t* dst) {
  int16x8x2_t rows;
  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
  TransformPass(&rows);
  TransformPass(&rows);
  Add4x4(rows.val[0], rows.val[1], dst);
}

#else

static void TransformOne(const int16_t* in, uint8_t* dst) {
  const int kBPS = BPS;
  // kC1, kC2. Padded because vld1.16 loads 8 bytes
  const int16_t constants[4] = { kC1, kC2, 0, 0 };
  /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
  __asm__ volatile (
    "vld1.16 {q1, q2}, [%[in]] \n"
    "vld1.16 {d0}, [%[constants]] \n"

    /* d2: in[0]
     * d3: in[8]
     * d4: in[4]
     * d5: in[12]
     */
    "vswp d3, d4 \n"

    /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
     * q9 = {in[4], in[12]} * kC2 >> 16
     */
    "vqdmulh.s16 q8, q2, d0[0] \n"
    "vqdmulh.s16 q9, q2, d0[1] \n"

    /* d22 = a = in[0] + in[8]
     * d23 = b = in[0] - in[8]
     */
    "vqadd.s16 d22, d2, d3 \n"
    "vqsub.s16 d23, d2, d3 \n"

    /* The multiplication should be x * kC1 >> 16
     * However, with vqdmulh we get x * kC1 * 2 >> 16
     * (multiply, double, return high half)
     * We avoided this in kC2 by pre-shifting the constant.
     * q8 = in[4]/[12] * kC1 >> 16
     */
    "vshr.s16 q8, q8, #1 \n"

    /* Add {in[4], in[12]} back after the multiplication. This is handled by
     * adding 1 << 16 to kC1 in the libwebp C code.
     */
    "vqadd.s16 q8, q2, q8 \n"

    /* d20 = c = in[4]*kC2 - in[12]*kC1
     * d21 = d = in[4]*kC1 + in[12]*kC2
     */
    "vqsub.s16 d20, d18, d17 \n"
    "vqadd.s16 d21, d19, d16 \n"

    /* d2 = tmp[0] = a + d
     * d3 = tmp[1] = b + c
     * d4 = tmp[2] = b - c
     * d5 = tmp[3] = a - d
     */
    "vqadd.s16 d2, d22, d21 \n"
    "vqadd.s16 d3, d23, d20 \n"
    "vqsub.s16 d4, d23, d20 \n"
    "vqsub.s16 d5, d22, d21 \n"

    "vzip.16 q1, q2 \n"
    "vzip.16 q1, q2 \n"

    "vswp d3, d4 \n"

    /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
     * q9 = {tmp[4], tmp[12]} * kC2 >> 16
     */
    "vqdmulh.s16 q8, q2, d0[0] \n"
    "vqdmulh.s16 q9, q2, d0[1] \n"

    /* d22 = a = tmp[0] + tmp[8]
     * d23 = b = tmp[0] - tmp[8]
     */
    "vqadd.s16 d22, d2, d3 \n"
    "vqsub.s16 d23, d2, d3 \n"

    /* See long winded explanations prior */
    "vshr.s16 q8, q8, #1 \n"
    "vqadd.s16 q8, q2, q8 \n"

    /* d20 = c = in[4]*kC2 - in[12]*kC1
     * d21 = d = in[4]*kC1 + in[12]*kC2
     */
    "vqsub.s16 d20, d18, d17 \n"
    "vqadd.s16 d21, d19, d16 \n"

    /* d2 = tmp[0] = a + d
     * d3 = tmp[1] = b + c
     * d4 = tmp[2] = b - c
     * d5 = tmp[3] = a - d
     */
    "vqadd.s16 d2, d22, d21 \n"
    "vqadd.s16 d3, d23, d20 \n"
    "vqsub.s16 d4, d23, d20 \n"
    "vqsub.s16 d5, d22, d21 \n"

    "vld1.32 d6[0], [%[dst]], %[kBPS] \n"
    "vld1.32 d6[1], [%[dst]], %[kBPS] \n"
    "vld1.32 d7[0], [%[dst]], %[kBPS] \n"
    "vld1.32 d7[1], [%[dst]], %[kBPS] \n"

    "sub %[dst], %[dst], %[kBPS], lsl #2 \n"

    /* (val) + 4 >> 3 */
    "vrshr.s16 d2, d2, #3 \n"
    "vrshr.s16 d3, d3, #3 \n"
    "vrshr.s16 d4, d4, #3 \n"
    "vrshr.s16 d5, d5, #3 \n"

    "vzip.16 q1, q2 \n"
    "vzip.16 q1, q2 \n"

    /* Must accumulate before saturating */
    "vmovl.u8 q8, d6 \n"
    "vmovl.u8 q9, d7 \n"

    "vqadd.s16 q1, q1, q8 \n"
    "vqadd.s16 q2, q2, q9 \n"

    "vqmovun.s16 d0, q1 \n"
    "vqmovun.s16 d1, q2 \n"

    "vst1.32 d0[0], [%[dst]], %[kBPS] \n"
    "vst1.32 d0[1], [%[dst]], %[kBPS] \n"
    "vst1.32 d1[0], [%[dst]], %[kBPS] \n"
    "vst1.32 d1[1], [%[dst]] \n"

    : [in] "+r"(in), [dst] "+r"(dst)               /* modified registers */
    : [kBPS] "r"(kBPS), [constants] "r"(constants) /* constants */
    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  /* clobbered */
  );
}

#endif  // WEBP_USE_INTRINSICS

static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
  TransformOne(in, dst);
  if (do_two) {
    TransformOne(in + 16, dst + 4);
  }
}

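// DC-only case: when only in[0] is non-zero, the whole inverse transform
// reduces to adding the rounded value (in[0] + 4) >> 3 to each of the 4x4
// destination pixels.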
static void TransformDC(const int16_t* in, uint8_t* dst) {
  const int16x8_t DC = vdupq_n_s16(in[0]);
  Add4x4(DC, DC, dst);
}

//------------------------------------------------------------------------------

#define STORE_WHT(dst, col, rows) do {                  \
  *dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \
  *dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \
  *dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \
  *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
} while (0)

static void TransformWHT(const int16_t* in, int16_t* out) {
  int32x4x4_t tmp;

  {
    // Load the source.
    const int16x4_t in00_03 = vld1_s16(in + 0);
    const int16x4_t in04_07 = vld1_s16(in + 4);
    const int16x4_t in08_11 = vld1_s16(in + 8);
    const int16x4_t in12_15 = vld1_s16(in + 12);
    const int32x4_t a0 = vaddl_s16(in00_03, in12_15);  // in[0..3] + in[12..15]
    const int32x4_t a1 = vaddl_s16(in04_07, in08_11);  // in[4..7] + in[8..11]
    const int32x4_t a2 = vsubl_s16(in04_07, in08_11);  // in[4..7] - in[8..11]
    const int32x4_t a3 = vsubl_s16(in00_03, in12_15);  // in[0..3] - in[12..15]
    tmp.val[0] = vaddq_s32(a0, a1);
    tmp.val[1] = vaddq_s32(a3, a2);
    tmp.val[2] = vsubq_s32(a0, a1);
    tmp.val[3] = vsubq_s32(a3, a2);
    // Arrange the temporary results column-wise.
    tmp = Transpose4x4(tmp);
  }

  {
    const int32x4_t kCst3 = vdupq_n_s32(3);
    const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3);  // add rounder
    const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
    const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
    const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
    const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);

    tmp.val[0] = vaddq_s32(a0, a1);
    tmp.val[1] = vaddq_s32(a3, a2);
    tmp.val[2] = vsubq_s32(a0, a1);
    tmp.val[3] = vsubq_s32(a3, a2);

    // right shift the results by 3.
    tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
    tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
    tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
    tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);

    STORE_WHT(out, 0, tmp);
    STORE_WHT(out, 1, tmp);
    STORE_WHT(out, 2, tmp);
    STORE_WHT(out, 3, tmp);
  }
}

#undef STORE_WHT

//------------------------------------------------------------------------------

#define MUL(a, b) (((a) * (b)) >> 16)
static void TransformAC3(const int16_t* in, uint8_t* dst) {
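  // Special-cased transform, used when only the coefficients in[0], in[1] and
  // in[4] can be non-zero, so each pass reduces to the closed form below.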
  static const int kC1_full = 20091 + (1 << 16);
  static const int kC2_full = 35468;
  const int16x4_t A = vld1_dup_s16(in);
  const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full));
  const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full));
  const int c1 = MUL(in[1], kC2_full);
  const int d1 = MUL(in[1], kC1_full);
  const uint64_t cd = (uint64_t)( d1 & 0xffff) <<  0 |
                      (uint64_t)( c1 & 0xffff) << 16 |
                      (uint64_t)(-c1 & 0xffff) << 32 |
                      (uint64_t)(-d1 & 0xffff) << 48;
  const int16x4_t CD = vcreate_s16(cd);
  const int16x4_t B = vqadd_s16(A, CD);
  const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));
  const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
  Add4x4(m0_m1, m2_m3, dst);
}
#undef MUL

//------------------------------------------------------------------------------
// 4x4

static void DC4(uint8_t* dst) {    // DC
  const uint8x8_t A = vld1_u8(dst - BPS);  // top row
  const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
  const uint16x4_t p1 = vpadd_u16(p0, p0);
  const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
  const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
  const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
  const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
  const uint16x8_t s0 = vaddq_u16(L0, L1);
  const uint16x8_t s1 = vaddq_u16(L2, L3);
  const uint16x8_t s01 = vaddq_u16(s0, s1);
  const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);  // (sum + 4) >> 3
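  // Only lane 0 of 'sum' is meaningful: it holds the 4 left + 4 top pixels.
  // The other lanes are don't-care and are discarded by vdup_lane_u8() below.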
  const uint8x8_t dc = vdup_lane_u8(dc0, 0);
  int i;
  for (i = 0; i < 4; ++i) {
    vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0);
  }
}

// TrueMotion (4x4 + 8x8)
static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
  const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
  const uint8x8_t T = vld1_u8(dst - BPS);  // top row 'A[0..3]'
  const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL));  // A[c] - A[-1]
  int y;
  for (y = 0; y < size; y += 4) {
    // left edge
    const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1));
    const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1));
    const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1));
    const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1));
    const int16x8_t r0 = vaddq_s16(L0, d);  // L[r] + A[c] - A[-1]
    const int16x8_t r1 = vaddq_s16(L1, d);
    const int16x8_t r2 = vaddq_s16(L2, d);
    const int16x8_t r3 = vaddq_s16(L3, d);
    // Saturate and store the result.
    const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
    const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
    const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
    const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
    if (size == 4) {
      vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
      vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
      vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
      vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
    } else {
      vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);
      vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);
      vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);
      vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);
    }
    dst += 4 * BPS;
  }
}

static void TM4(uint8_t* dst) { TrueMotion(dst, 4); }

static void VE4(uint8_t* dst) {    // vertical
  // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1));  // top row
  const uint64x1_t A1 = vshr_n_u64(A0, 8);
  const uint64x1_t A2 = vshr_n_u64(A0, 16);
  const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
  const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00);
  const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0);
  int i;
  for (i = 0; i < 4; ++i) {
    vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0);
  }
}

static void RD4(uint8_t* dst) {    // Down-right
  const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
  const uint32_t I = dst[-1 + 0 * BPS];
  const uint32_t J = dst[-1 + 1 * BPS];
  const uint32_t K = dst[-1 + 2 * BPS];
  const uint32_t L = dst[-1 + 3 * BPS];
  const uint64x1_t LKJI____ = vcreate_u64(L | (K << 8) | (J << 16) | (I << 24));
  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
}

static void LD4(uint8_t* dst) {    // Down-left
  // Note using the same shift trick as VE4() is slower here.
  const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
  const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
  const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2);
  const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6);
  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0);
  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
}

1390//------------------------------------------------------------------------------
1391// Chroma
1392
1393static void VE8uv(uint8_t* dst) { // vertical
1394 const uint8x8_t top = vld1_u8(dst - BPS);
1395 int j;
1396 for (j = 0; j < 8; ++j) {
1397 vst1_u8(dst + j * BPS, top);
1398 }
1399}
1400
1401static void HE8uv(uint8_t* dst) { // horizontal
1402 int j;
1403 for (j = 0; j < 8; ++j) {
1404 const uint8x8_t left = vld1_dup_u8(dst - 1);
1405 vst1_u8(dst, left);
1406 dst += BPS;
1407 }
1408}
1409
static WEBP_INLINE void DC8(uint8_t* dst, int do_top, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_top) {
    const uint8x8_t A = vld1_u8(dst - BPS);  // top row
    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    sum_top = vcombine_u16(p2, p2);
  }

  if (do_left) {
    const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
    const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
    const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
    const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
    const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1));
    const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1));
    const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1));
    const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1));
    const uint16x8_t s0 = vaddq_u16(L0, L1);
    const uint16x8_t s1 = vaddq_u16(L2, L3);
    const uint16x8_t s2 = vaddq_u16(L4, L5);
    const uint16x8_t s3 = vaddq_u16(L6, L7);
    const uint16x8_t s01 = vaddq_u16(s0, s1);
    const uint16x8_t s23 = vaddq_u16(s2, s3);
    sum_left = vaddq_u16(s01, s23);
  }

  if (do_top && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 4);
  } else if (do_top) {
    dc0 = vrshrn_n_u16(sum_top, 3);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 3);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 8; ++i) {
      vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));
    }
  }
}

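// Variants of the chroma DC predictor, depending on which borders (top row,
// left column) are available for the current block.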
static void DC8uv(uint8_t* dst) { DC8(dst, 1, 1); }
static void DC8uvNoTop(uint8_t* dst) { DC8(dst, 0, 1); }
static void DC8uvNoLeft(uint8_t* dst) { DC8(dst, 1, 0); }
static void DC8uvNoTopLeft(uint8_t* dst) { DC8(dst, 0, 0); }

static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }

//------------------------------------------------------------------------------
// 16x16

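// The 16x16 luma predictors mirror the chroma ones above, widened to full
// 128-bit (uint8x16_t) registers.
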
static void VE16(uint8_t* dst) {    // vertical
  const uint8x16_t top = vld1q_u8(dst - BPS);
  int j;
  for (j = 0; j < 16; ++j) {
    vst1q_u8(dst + j * BPS, top);
  }
}

static void HE16(uint8_t* dst) {    // horizontal
  int j;
  for (j = 0; j < 16; ++j) {
    const uint8x16_t left = vld1q_dup_u8(dst - 1);
    vst1q_u8(dst, left);
    dst += BPS;
  }
}

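// DC16() follows the same scheme as DC8() above, but with 16 samples per
// border, hence the final rounding shifts of 5 (both borders) and 4 (one).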
static WEBP_INLINE void DC16(uint8_t* dst, int do_top, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_top) {
    const uint8x16_t A = vld1q_u8(dst - BPS);  // top row
    const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_top = vcombine_u16(p3, p3);
  }

  if (do_left) {
    int i;
    sum_left = vdupq_n_u16(0);
    for (i = 0; i < 16; i += 8) {
      const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1));
      const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1));
      const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1));
      const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1));
      const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1));
      const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1));
      const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1));
      const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1));
      const uint16x8_t s0 = vaddq_u16(L0, L1);
      const uint16x8_t s1 = vaddq_u16(L2, L3);
      const uint16x8_t s2 = vaddq_u16(L4, L5);
      const uint16x8_t s3 = vaddq_u16(L6, L7);
      const uint16x8_t s01 = vaddq_u16(s0, s1);
      const uint16x8_t s23 = vaddq_u16(s2, s3);
      const uint16x8_t sum = vaddq_u16(s01, s23);
      sum_left = vaddq_u16(sum_left, sum);
    }
  }

  if (do_top && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 5);
  } else if (do_top) {
    dc0 = vrshrn_n_u16(sum_top, 4);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 4);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 16; ++i) {
      vst1q_u8(dst + i * BPS, dc);
    }
  }
}

static void DC16TopLeft(uint8_t* dst) { DC16(dst, 1, 1); }
static void DC16NoTop(uint8_t* dst) { DC16(dst, 0, 1); }
static void DC16NoLeft(uint8_t* dst) { DC16(dst, 1, 0); }
static void DC16NoTopLeft(uint8_t* dst) { DC16(dst, 0, 0); }

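// TM16() below (like TM8uv()/TrueMotion() above) implements the VP8
// TrueMotion predictor: each output pixel is left + top - top_left, clamped
// to [0, 255]. The NEON code precomputes (top - top_left) once and adds the
// per-row left sample, relying on vqmovun_s16 for the clamping. A scalar
// sketch of the same formula (illustrative only, not compiled; TM_C_Sketch
// is not part of the library):
#if 0
static void TM_C_Sketch(uint8_t* dst, int size) {   // size = 8 or 16
  int x, y;
  for (y = 0; y < size; ++y) {
    for (x = 0; x < size; ++x) {
      const int v = dst[-1 + y * BPS] + dst[x - BPS] - dst[-1 - BPS];
      dst[x + y * BPS] = (v < 0) ? 0 : (v > 255) ? 255 : (uint8_t)v;
    }
  }
}
#endif
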
static void TM16(uint8_t* dst) {
  const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
  const uint8x16_t T = vld1q_u8(dst - BPS);  // top row 'A[0..15]'
  // A[c] - A[-1]
  const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL));
  const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL));
  int y;
  for (y = 0; y < 16; y += 4) {
    // left edge
    const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1));
    const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1));
    const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1));
    const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1));
    const int16x8_t r0_lo = vaddq_s16(L0, d_lo);  // L[r] + A[c] - A[-1]
    const int16x8_t r1_lo = vaddq_s16(L1, d_lo);
    const int16x8_t r2_lo = vaddq_s16(L2, d_lo);
    const int16x8_t r3_lo = vaddq_s16(L3, d_lo);
    const int16x8_t r0_hi = vaddq_s16(L0, d_hi);
    const int16x8_t r1_hi = vaddq_s16(L1, d_hi);
    const int16x8_t r2_hi = vaddq_s16(L2, d_hi);
    const int16x8_t r3_hi = vaddq_s16(L3, d_hi);
    // Saturate and store the result.
    const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
    const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
    const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi));
    const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi));
    vst1q_u8(dst + 0 * BPS, row0);
    vst1q_u8(dst + 1 * BPS, row1);
    vst1q_u8(dst + 2 * BPS, row2);
    vst1q_u8(dst + 3 * BPS, row3);
    dst += 4 * BPS;
  }
}

//------------------------------------------------------------------------------
// Entry point

extern void VP8DspInitNEON(void);

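// Installs the NEON implementations over the portable defaults. Only a
// subset of the 4x4 luma predictors (VP8PredLuma4) has a NEON version here;
// slots that are not assigned below keep whatever the generic initialization
// installed.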
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
  VP8Transform = TransformTwo;
  VP8TransformAC3 = TransformAC3;
  VP8TransformDC = TransformDC;
  VP8TransformWHT = TransformWHT;

  VP8VFilter16 = VFilter16;
  VP8VFilter16i = VFilter16i;
  VP8HFilter16 = HFilter16;
#if !defined(WORK_AROUND_GCC)
  VP8HFilter16i = HFilter16i;
#endif
  VP8VFilter8 = VFilter8;
  VP8VFilter8i = VFilter8i;
#if !defined(WORK_AROUND_GCC)
  VP8HFilter8 = HFilter8;
  VP8HFilter8i = HFilter8i;
#endif
  VP8SimpleVFilter16 = SimpleVFilter16;
  VP8SimpleHFilter16 = SimpleHFilter16;
  VP8SimpleVFilter16i = SimpleVFilter16i;
  VP8SimpleHFilter16i = SimpleHFilter16i;

  VP8PredLuma4[0] = DC4;
  VP8PredLuma4[1] = TM4;
  VP8PredLuma4[2] = VE4;
  VP8PredLuma4[4] = RD4;
  VP8PredLuma4[6] = LD4;

  VP8PredLuma16[0] = DC16TopLeft;
  VP8PredLuma16[1] = TM16;
  VP8PredLuma16[2] = VE16;
  VP8PredLuma16[3] = HE16;
  VP8PredLuma16[4] = DC16NoTop;
  VP8PredLuma16[5] = DC16NoLeft;
  VP8PredLuma16[6] = DC16NoTopLeft;

  VP8PredChroma8[0] = DC8uv;
  VP8PredChroma8[1] = TM8uv;
  VP8PredChroma8[2] = VE8uv;
  VP8PredChroma8[3] = HE8uv;
  VP8PredChroma8[4] = DC8uvNoTop;
  VP8PredChroma8[5] = DC8uvNoLeft;
  VP8PredChroma8[6] = DC8uvNoTopLeft;
}

#else  // !WEBP_USE_NEON

WEBP_DSP_INIT_STUB(VP8DspInitNEON)

#endif  // WEBP_USE_NEON
