1
2/* filter_sse2_intrinsics.c - SSE2 optimized filter functions
3 *
4 * Copyright (c) 2018 Cosmin Truta
5 * Copyright (c) 2016-2017 Glenn Randers-Pehrson
6 * Written by Mike Klein and Matt Sarett
7 * Derived from arm/filter_neon_intrinsics.c
8 *
9 * This code is released under the libpng license.
10 * For conditions of distribution and use, see the disclaimer
11 * and license in png.h
12 */
13
14#include "../pngpriv.h"
15
16#ifdef PNG_READ_SUPPORTED
17
18#if PNG_INTEL_SSE_IMPLEMENTATION > 0
19
20#include <immintrin.h>
21
/* Functions in this file look at at most 3 pixels (a,b,c) to predict the
 * 4th (d).  They're positioned like this:
 *    prev:  c b
 *    row:   a d
 * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
 * whichever of a, b, or c is closest to p=a+b-c.
 */
29
/* Read sizeof(int) bytes from p (no alignment requirement, thanks to memcpy)
 * and return them in the low 32-bit lane of a vector; the remaining lanes
 * are zero.
 */
static __m128i load4(const void* p) {
   int word;

   memcpy(&word, p, sizeof word);
   return _mm_cvtsi32_si128(word);
}
35
/* Write the low 32-bit lane of v to p as sizeof(int) bytes.  memcpy is used
 * so that p does not have to be aligned.
 */
static void store4(void* p, __m128i v) {
   const int lane = _mm_cvtsi128_si32(v);

   memcpy(p, &lane, sizeof lane);
}
40
41static __m128i load3(const void* p) {
42 png_uint_32 tmp = 0;
43 memcpy(&tmp, p, 3);
44 return _mm_cvtsi32_si128(tmp);
45}
46
/* Write only the first 3 bytes of the low 32-bit lane of v to p; the byte
 * following them in memory is left untouched.
 */
static void store3(void* p, __m128i v) {
   const int lane = _mm_cvtsi128_si32(v);

   memcpy(p, &lane, 3);
}
51
52void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row,
53 png_const_bytep prev)
54{
55 /* The Sub filter predicts each pixel as the previous pixel, a.
56 * There is no pixel to the left of the first pixel. It's encoded directly.
57 * That works with our main loop if we just say that left pixel was zero.
58 */
59 size_t rb;
60
61 __m128i a, d = _mm_setzero_si128();
62
63 png_debug(1, "in png_read_filter_row_sub3_sse2");
64
65 rb = row_info->rowbytes;
66 while (rb >= 4) {
67 a = d; d = load4(row);
68 d = _mm_add_epi8(d, a);
69 store3(row, d);
70
71 row += 3;
72 rb -= 3;
73 }
74 if (rb > 0) {
75 a = d; d = load3(row);
76 d = _mm_add_epi8(d, a);
77 store3(row, d);
78
79 row += 3;
80 rb -= 3;
81 }
82 PNG_UNUSED(prev)
83}
84
85void png_read_filter_row_sub4_sse2(png_row_infop row_info, png_bytep row,
86 png_const_bytep prev)
87{
88 /* The Sub filter predicts each pixel as the previous pixel, a.
89 * There is no pixel to the left of the first pixel. It's encoded directly.
90 * That works with our main loop if we just say that left pixel was zero.
91 */
92 size_t rb;
93
94 __m128i a, d = _mm_setzero_si128();
95
96 png_debug(1, "in png_read_filter_row_sub4_sse2");
97
98 rb = row_info->rowbytes+4;
99 while (rb > 4) {
100 a = d; d = load4(row);
101 d = _mm_add_epi8(d, a);
102 store4(row, d);
103
104 row += 4;
105 rb -= 4;
106 }
107 PNG_UNUSED(prev)
108}
109
110void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row,
111 png_const_bytep prev)
112{
113 /* The Avg filter predicts each pixel as the (truncated) average of a and b.
114 * There's no pixel to the left of the first pixel. Luckily, it's
115 * predicted to be half of the pixel above it. So again, this works
116 * perfectly with our loop if we make sure a starts at zero.
117 */
118
119 size_t rb;
120
121 const __m128i zero = _mm_setzero_si128();
122
123 __m128i b;
124 __m128i a, d = zero;
125
126 png_debug(1, "in png_read_filter_row_avg3_sse2");
127 rb = row_info->rowbytes;
128 while (rb >= 4) {
129 __m128i avg;
130 b = load4(prev);
131 a = d; d = load4(row );
132
133 /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
134 avg = _mm_avg_epu8(a,b);
135 /* ...but we can fix it up by subtracting off 1 if it rounded up. */
136 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
137 _mm_set1_epi8(1)));
138 d = _mm_add_epi8(d, avg);
139 store3(row, d);
140
141 prev += 3;
142 row += 3;
143 rb -= 3;
144 }
145 if (rb > 0) {
146 __m128i avg;
147 b = load3(prev);
148 a = d; d = load3(row );
149
150 /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
151 avg = _mm_avg_epu8(a,b);
152 /* ...but we can fix it up by subtracting off 1 if it rounded up. */
153 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
154 _mm_set1_epi8(1)));
155
156 d = _mm_add_epi8(d, avg);
157 store3(row, d);
158
159 prev += 3;
160 row += 3;
161 rb -= 3;
162 }
163}
164
165void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row,
166 png_const_bytep prev)
167{
168 /* The Avg filter predicts each pixel as the (truncated) average of a and b.
169 * There's no pixel to the left of the first pixel. Luckily, it's
170 * predicted to be half of the pixel above it. So again, this works
171 * perfectly with our loop if we make sure a starts at zero.
172 */
173 size_t rb;
174 const __m128i zero = _mm_setzero_si128();
175 __m128i b;
176 __m128i a, d = zero;
177
178 png_debug(1, "in png_read_filter_row_avg4_sse2");
179
180 rb = row_info->rowbytes+4;
181 while (rb > 4) {
182 __m128i avg;
183 b = load4(prev);
184 a = d; d = load4(row );
185
186 /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
187 avg = _mm_avg_epu8(a,b);
188 /* ...but we can fix it up by subtracting off 1 if it rounded up. */
189 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
190 _mm_set1_epi8(1)));
191
192 d = _mm_add_epi8(d, avg);
193 store4(row, d);
194
195 prev += 4;
196 row += 4;
197 rb -= 4;
198 }
199}
200
/* Absolute value of each signed 16-bit lane of x.  A lane holding -32768
 * has no positive counterpart in 16 bits and comes back unchanged.
 */
static __m128i abs_i16(__m128i x) {
#if PNG_INTEL_SSE_IMPLEMENTATION >= 2
   return _mm_abs_epi16(x);
#else
   /* Two's complement negation is "invert every bit, then add one".  The
    * comparison yields all-ones (-1) in negative lanes and 0 elsewhere, so
    * XOR with it inverts only the negative lanes and subtracting it adds 1
    * only to those same lanes.
    */
   const __m128i sign = _mm_cmplt_epi16(x, _mm_setzero_si128());

   return _mm_sub_epi16(_mm_xor_si128(x, sign), sign);
#endif
}
219
/* Bytewise select: returns t where the mask c is set and e where it is
 * clear.  Callers are expected to pass masks whose bytes are all-ones or
 * all-zeros (as produced by the _mm_cmp* intrinsics).
 */
static __m128i if_then_else(__m128i c, __m128i t, __m128i e) {
#if PNG_INTEL_SSE_IMPLEMENTATION >= 3
   return _mm_blendv_epi8(e,t,c);
#else
   const __m128i from_t = _mm_and_si128(c, t);
   const __m128i from_e = _mm_andnot_si128(c, e);

   return _mm_or_si128(from_t, from_e);
#endif
}
228
229void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row,
230 png_const_bytep prev)
231{
232 /* Paeth tries to predict pixel d using the pixel to the left of it, a,
233 * and two pixels from the previous row, b and c:
234 * prev: c b
235 * row: a d
236 * The Paeth function predicts d to be whichever of a, b, or c is nearest to
237 * p=a+b-c.
238 *
239 * The first pixel has no left context, and so uses an Up filter, p = b.
240 * This works naturally with our main loop's p = a+b-c if we force a and c
241 * to zero.
242 * Here we zero b and d, which become c and a respectively at the start of
243 * the loop.
244 */
245 size_t rb;
246 const __m128i zero = _mm_setzero_si128();
247 __m128i c, b = zero,
248 a, d = zero;
249
250 png_debug(1, "in png_read_filter_row_paeth3_sse2");
251
252 rb = row_info->rowbytes;
253 while (rb >= 4) {
254 /* It's easiest to do this math (particularly, deal with pc) with 16-bit
255 * intermediates.
256 */
257 __m128i pa,pb,pc,smallest,nearest;
258 c = b; b = _mm_unpacklo_epi8(load4(prev), zero);
259 a = d; d = _mm_unpacklo_epi8(load4(row ), zero);
260
261 /* (p-a) == (a+b-c - a) == (b-c) */
262
263 pa = _mm_sub_epi16(b,c);
264
265 /* (p-b) == (a+b-c - b) == (a-c) */
266 pb = _mm_sub_epi16(a,c);
267
268 /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
269 pc = _mm_add_epi16(pa,pb);
270
271 pa = abs_i16(pa); /* |p-a| */
272 pb = abs_i16(pb); /* |p-b| */
273 pc = abs_i16(pc); /* |p-c| */
274
275 smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
276
277 /* Paeth breaks ties favoring a over b over c. */
278 nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
279 if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
280 c));
281
282 /* Note `_epi8`: we need addition to wrap modulo 255. */
283 d = _mm_add_epi8(d, nearest);
284 store3(row, _mm_packus_epi16(d,d));
285
286 prev += 3;
287 row += 3;
288 rb -= 3;
289 }
290 if (rb > 0) {
291 /* It's easiest to do this math (particularly, deal with pc) with 16-bit
292 * intermediates.
293 */
294 __m128i pa,pb,pc,smallest,nearest;
295 c = b; b = _mm_unpacklo_epi8(load3(prev), zero);
296 a = d; d = _mm_unpacklo_epi8(load3(row ), zero);
297
298 /* (p-a) == (a+b-c - a) == (b-c) */
299 pa = _mm_sub_epi16(b,c);
300
301 /* (p-b) == (a+b-c - b) == (a-c) */
302 pb = _mm_sub_epi16(a,c);
303
304 /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
305 pc = _mm_add_epi16(pa,pb);
306
307 pa = abs_i16(pa); /* |p-a| */
308 pb = abs_i16(pb); /* |p-b| */
309 pc = abs_i16(pc); /* |p-c| */
310
311 smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
312
313 /* Paeth breaks ties favoring a over b over c. */
314 nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
315 if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
316 c));
317
318 /* Note `_epi8`: we need addition to wrap modulo 255. */
319 d = _mm_add_epi8(d, nearest);
320 store3(row, _mm_packus_epi16(d,d));
321
322 prev += 3;
323 row += 3;
324 rb -= 3;
325 }
326}
327
328void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row,
329 png_const_bytep prev)
330{
331 /* Paeth tries to predict pixel d using the pixel to the left of it, a,
332 * and two pixels from the previous row, b and c:
333 * prev: c b
334 * row: a d
335 * The Paeth function predicts d to be whichever of a, b, or c is nearest to
336 * p=a+b-c.
337 *
338 * The first pixel has no left context, and so uses an Up filter, p = b.
339 * This works naturally with our main loop's p = a+b-c if we force a and c
340 * to zero.
341 * Here we zero b and d, which become c and a respectively at the start of
342 * the loop.
343 */
344 size_t rb;
345 const __m128i zero = _mm_setzero_si128();
346 __m128i pa,pb,pc,smallest,nearest;
347 __m128i c, b = zero,
348 a, d = zero;
349
350 png_debug(1, "in png_read_filter_row_paeth4_sse2");
351
352 rb = row_info->rowbytes+4;
353 while (rb > 4) {
354 /* It's easiest to do this math (particularly, deal with pc) with 16-bit
355 * intermediates.
356 */
357 c = b; b = _mm_unpacklo_epi8(load4(prev), zero);
358 a = d; d = _mm_unpacklo_epi8(load4(row ), zero);
359
360 /* (p-a) == (a+b-c - a) == (b-c) */
361 pa = _mm_sub_epi16(b,c);
362
363 /* (p-b) == (a+b-c - b) == (a-c) */
364 pb = _mm_sub_epi16(a,c);
365
366 /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
367 pc = _mm_add_epi16(pa,pb);
368
369 pa = abs_i16(pa); /* |p-a| */
370 pb = abs_i16(pb); /* |p-b| */
371 pc = abs_i16(pc); /* |p-c| */
372
373 smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
374
375 /* Paeth breaks ties favoring a over b over c. */
376 nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
377 if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
378 c));
379
380 /* Note `_epi8`: we need addition to wrap modulo 255. */
381 d = _mm_add_epi8(d, nearest);
382 store4(row, _mm_packus_epi16(d,d));
383
384 prev += 4;
385 row += 4;
386 rb -= 4;
387 }
388}
389
390#endif /* PNG_INTEL_SSE_IMPLEMENTATION > 0 */
391#endif /* READ */
392