filter_sse2_intrinsics.c source code [Godot/thirdparty/libpng/intel/filter_sse2_intrinsics.c]

1
/* filter_sse2_intrinsics.c - SSE2 optimized filter functions
 *
 * Copyright (c) 2018 Cosmin Truta
 * Copyright (c) 2016-2017 Glenn Randers-Pehrson
 * Written by Mike Klein and Matt Sarett
 * Derived from arm/filter_neon_intrinsics.c
 *
 * This code is released under the libpng license.
 * For conditions of distribution and use, see the disclaimer
 * and license in png.h
 */
13
14	#include "../pngpriv.h"
15
16	#ifdef PNG_READ_SUPPORTED
17
18	#if PNG_INTEL_SSE_IMPLEMENTATION > 0
19
20	#include <immintrin.h>
21
/* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
 * They're positioned like this:
 *    prev:  c b
 *    row:   a d
 * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
 * whichever of a, b, or c is closest to p=a+b-c.
 */
29
/* Load 4 bytes from p into the low 32 bits of an SSE register.
 *
 * The bytes are copied through a local int with memcpy, so p does not need
 * to be aligned.  _mm_cvtsi32_si128 clears the upper 96 bits of the result.
 */
static __m128i load4(const void* p) {
   int tmp;
   memcpy(&tmp, p, sizeof(tmp));
   return _mm_cvtsi32_si128(tmp);
}
35
/* Store the low 32 bits of v to the 4 bytes at p.
 *
 * The value is copied out through a local int with memcpy, so p does not
 * need to be aligned.  Exactly sizeof(tmp) bytes are written.
 */
static void store4(void* p, __m128i v) {
   int tmp = _mm_cvtsi128_si32(v);
   memcpy(p, &tmp, sizeof(tmp));
}
40
/* Load 3 bytes from p into the low bytes of an SSE register.
 *
 * Only 3 bytes are read from p.  The temporary starts at zero, so the byte
 * that memcpy does not overwrite stays zero.  The temporary is an int, like
 * the ones in load4/store3/store4, which is also the type that
 * _mm_cvtsi32_si128 takes.
 */
static __m128i load3(const void* p) {
   int tmp = 0;
   memcpy(&tmp, p, 3);
   return _mm_cvtsi32_si128(tmp);
}
46
/* Store the first 3 bytes of v's low 32-bit lane to p.
 *
 * Exactly 3 bytes are written; the byte at p[3] is left untouched.
 */
static void store3(void* p, __m128i v) {
   int tmp = _mm_cvtsi128_si32(v);
   memcpy(p, &tmp, 3);
}
51
52	void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row,
53	png_const_bytep prev)
54	{
55	/ The Sub filter predicts each pixel as the previous pixel, a.*
56	* There is no pixel to the left of the first pixel. It's encoded directly.
57	* That works with our main loop if we just say that left pixel was zero.
58	*/
59	size_t rb;
60
61	__m128i a, d = _mm_setzero_si128();
62
63	png_debug(`1`, "in png_read_filter_row_sub3_sse2");
64
65	rb = row_info->rowbytes;
66	while (rb >= `4`) {
67	a = d; d = load4(row);
68	d = _mm_add_epi8(d, a);
69	store3(row, d);
70
71	row += `3`;
72	rb -= `3`;
73	}
74	if (rb > `0`) {
75	a = d; d = load3(row);
76	d = _mm_add_epi8(d, a);
77	store3(row, d);
78
79	row += `3`;
80	rb -= `3`;
81	}
82	PNG_UNUSED(prev)
83	}
84
85	void png_read_filter_row_sub4_sse2(png_row_infop row_info, png_bytep row,
86	png_const_bytep prev)
87	{
88	/ The Sub filter predicts each pixel as the previous pixel, a.*
89	* There is no pixel to the left of the first pixel. It's encoded directly.
90	* That works with our main loop if we just say that left pixel was zero.
91	*/
92	size_t rb;
93
94	__m128i a, d = _mm_setzero_si128();
95
96	png_debug(`1`, "in png_read_filter_row_sub4_sse2");
97
98	rb = row_info->rowbytes+`4`;
99	while (rb > `4`) {
100	a = d; d = load4(row);
101	d = _mm_add_epi8(d, a);
102	store4(row, d);
103
104	row += `4`;
105	rb -= `4`;
106	}
107	PNG_UNUSED(prev)
108	}
109
110	void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row,
111	png_const_bytep prev)
112	{
113	/ The Avg filter predicts each pixel as the (truncated) average of a and b.*
114	* There's no pixel to the left of the first pixel. Luckily, it's
115	* predicted to be half of the pixel above it. So again, this works
116	* perfectly with our loop if we make sure a starts at zero.
117	*/
118
119	size_t rb;
120
121	const __m128i zero = _mm_setzero_si128();
122
123	__m128i b;
124	__m128i a, d = zero;
125
126	png_debug(`1`, "in png_read_filter_row_avg3_sse2");
127	rb = row_info->rowbytes;
128	while (rb >= `4`) {
129	__m128i avg;
130	b = load4(prev);
131	a = d; d = load4(row );
132
133	/ PNG requires a truncating average, so we can't just use _mm_avg_epu8 /
134	avg = _mm_avg_epu8(a,b);
135	/ ...but we can fix it up by subtracting off 1 if it rounded up. /
136	avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
137	_mm_set1_epi8(`1`)));
138	d = _mm_add_epi8(d, avg);
139	store3(row, d);
140
141	prev += `3`;
142	row += `3`;
143	rb -= `3`;
144	}
145	if (rb > `0`) {
146	__m128i avg;
147	b = load3(prev);
148	a = d; d = load3(row );
149
150	/ PNG requires a truncating average, so we can't just use _mm_avg_epu8 /
151	avg = _mm_avg_epu8(a,b);
152	/ ...but we can fix it up by subtracting off 1 if it rounded up. /
153	avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
154	_mm_set1_epi8(`1`)));
155
156	d = _mm_add_epi8(d, avg);
157	store3(row, d);
158
159	prev += `3`;
160	row += `3`;
161	rb -= `3`;
162	}
163	}
164
165	void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row,
166	png_const_bytep prev)
167	{
168	/ The Avg filter predicts each pixel as the (truncated) average of a and b.*
169	* There's no pixel to the left of the first pixel. Luckily, it's
170	* predicted to be half of the pixel above it. So again, this works
171	* perfectly with our loop if we make sure a starts at zero.
172	*/
173	size_t rb;
174	const __m128i zero = _mm_setzero_si128();
175	__m128i b;
176	__m128i a, d = zero;
177
178	png_debug(`1`, "in png_read_filter_row_avg4_sse2");
179
180	rb = row_info->rowbytes+`4`;
181	while (rb > `4`) {
182	__m128i avg;
183	b = load4(prev);
184	a = d; d = load4(row );
185
186	/ PNG requires a truncating average, so we can't just use _mm_avg_epu8 /
187	avg = _mm_avg_epu8(a,b);
188	/ ...but we can fix it up by subtracting off 1 if it rounded up. /
189	avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
190	_mm_set1_epi8(`1`)));
191
192	d = _mm_add_epi8(d, avg);
193	store4(row, d);
194
195	prev += `4`;
196	row += `4`;
197	rb -= `4`;
198	}
199	}
200
/* Returns |x| for 16-bit lanes.
 *
 * Uses _mm_abs_epi16 when PNG_INTEL_SSE_IMPLEMENTATION >= 2; otherwise the
 * absolute value is built from SSE2 compare/xor/subtract operations.
 */
static __m128i abs_i16(__m128i x) {
#if PNG_INTEL_SSE_IMPLEMENTATION >= 2
   return _mm_abs_epi16(x);
#else
   /* Read this all as, return x<0 ? -x : x.
    * To negate two's complement, you flip all the bits then add 1.
    */

   /* All-ones (i.e. -1) in lanes where x is negative, zero elsewhere. */
   __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128());

   /* Flip negative lanes. */
   x = _mm_xor_si128(x, is_negative);

   /* +1 to negative lanes, else +0 (subtracting -1 adds 1). */
   x = _mm_sub_epi16(x, is_negative);
   return x;
#endif
}
219
/* Bytewise c ? t : e.
 *
 * Uses _mm_blendv_epi8 when PNG_INTEL_SSE_IMPLEMENTATION >= 3; otherwise
 * the selection is the bitwise (c & t) | (~c & e).  Callers in this file pass
 * compare results for c, so each lane of c is either all ones or all zeros.
 */
static __m128i if_then_else(__m128i c, __m128i t, __m128i e) {
#if PNG_INTEL_SSE_IMPLEMENTATION >= 3
   return _mm_blendv_epi8(e,t,c);
#else
   return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e));
#endif
}
228
229	void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row,
230	png_const_bytep prev)
231	{
232	/ Paeth tries to predict pixel d using the pixel to the left of it, a,*
233	* and two pixels from the previous row, b and c:
234	* prev: c b
235	* row: a d
236	* The Paeth function predicts d to be whichever of a, b, or c is nearest to
237	* p=a+b-c.
238	*
239	* The first pixel has no left context, and so uses an Up filter, p = b.
240	* This works naturally with our main loop's p = a+b-c if we force a and c
241	* to zero.
242	* Here we zero b and d, which become c and a respectively at the start of
243	* the loop.
244	*/
245	size_t rb;
246	const __m128i zero = _mm_setzero_si128();
247	__m128i c, b = zero,
248	a, d = zero;
249
250	png_debug(`1`, "in png_read_filter_row_paeth3_sse2");
251
252	rb = row_info->rowbytes;
253	while (rb >= `4`) {
254	/ It's easiest to do this math (particularly, deal with pc) with 16-bit*
255	* intermediates.
256	*/
257	__m128i pa,pb,pc,smallest,nearest;
258	c = b; b = _mm_unpacklo_epi8(load4(prev), zero);
259	a = d; d = _mm_unpacklo_epi8(load4(row ), zero);
260
261	/ (p-a) == (a+b-c - a) == (b-c) /
262
263	pa = _mm_sub_epi16(b,c);
264
265	/ (p-b) == (a+b-c - b) == (a-c) /
266	pb = _mm_sub_epi16(a,c);
267
268	/ (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) /
269	pc = _mm_add_epi16(pa,pb);
270
271	pa = abs_i16(pa); / \|p-a\| /
272	pb = abs_i16(pb); / \|p-b\| /
273	pc = abs_i16(pc); / \|p-c\| /
274
275	smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
276
277	/ Paeth breaks ties favoring a over b over c. /
278	nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
279	if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
280	c));
281
282	/ Note `_epi8`: we need addition to wrap modulo 255. /
283	d = _mm_add_epi8(d, nearest);
284	store3(row, _mm_packus_epi16(d,d));
285
286	prev += `3`;
287	row += `3`;
288	rb -= `3`;
289	}
290	if (rb > `0`) {
291	/ It's easiest to do this math (particularly, deal with pc) with 16-bit*
292	* intermediates.
293	*/
294	__m128i pa,pb,pc,smallest,nearest;
295	c = b; b = _mm_unpacklo_epi8(load3(prev), zero);
296	a = d; d = _mm_unpacklo_epi8(load3(row ), zero);
297
298	/ (p-a) == (a+b-c - a) == (b-c) /
299	pa = _mm_sub_epi16(b,c);
300
301	/ (p-b) == (a+b-c - b) == (a-c) /
302	pb = _mm_sub_epi16(a,c);
303
304	/ (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) /
305	pc = _mm_add_epi16(pa,pb);
306
307	pa = abs_i16(pa); / \|p-a\| /
308	pb = abs_i16(pb); / \|p-b\| /
309	pc = abs_i16(pc); / \|p-c\| /
310
311	smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
312
313	/ Paeth breaks ties favoring a over b over c. /
314	nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
315	if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
316	c));
317
318	/ Note `_epi8`: we need addition to wrap modulo 255. /
319	d = _mm_add_epi8(d, nearest);
320	store3(row, _mm_packus_epi16(d,d));
321
322	prev += `3`;
323	row += `3`;
324	rb -= `3`;
325	}
326	}
327
328	void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row,
329	png_const_bytep prev)
330	{
331	/ Paeth tries to predict pixel d using the pixel to the left of it, a,*
332	* and two pixels from the previous row, b and c:
333	* prev: c b
334	* row: a d
335	* The Paeth function predicts d to be whichever of a, b, or c is nearest to
336	* p=a+b-c.
337	*
338	* The first pixel has no left context, and so uses an Up filter, p = b.
339	* This works naturally with our main loop's p = a+b-c if we force a and c
340	* to zero.
341	* Here we zero b and d, which become c and a respectively at the start of
342	* the loop.
343	*/
344	size_t rb;
345	const __m128i zero = _mm_setzero_si128();
346	__m128i pa,pb,pc,smallest,nearest;
347	__m128i c, b = zero,
348	a, d = zero;
349
350	png_debug(`1`, "in png_read_filter_row_paeth4_sse2");
351
352	rb = row_info->rowbytes+`4`;
353	while (rb > `4`) {
354	/ It's easiest to do this math (particularly, deal with pc) with 16-bit*
355	* intermediates.
356	*/
357	c = b; b = _mm_unpacklo_epi8(load4(prev), zero);
358	a = d; d = _mm_unpacklo_epi8(load4(row ), zero);
359
360	/ (p-a) == (a+b-c - a) == (b-c) /
361	pa = _mm_sub_epi16(b,c);
362
363	/ (p-b) == (a+b-c - b) == (a-c) /
364	pb = _mm_sub_epi16(a,c);
365
366	/ (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) /
367	pc = _mm_add_epi16(pa,pb);
368
369	pa = abs_i16(pa); / \|p-a\| /
370	pb = abs_i16(pb); / \|p-b\| /
371	pc = abs_i16(pc); / \|p-c\| /
372
373	smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
374
375	/ Paeth breaks ties favoring a over b over c. /
376	nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
377	if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
378	c));
379
380	/ Note `_epi8`: we need addition to wrap modulo 255. /
381	d = _mm_add_epi8(d, nearest);
382	store4(row, _mm_packus_epi16(d,d));
383
384	prev += `4`;
385	row += `4`;
386	rb -= `4`;
387	}
388	}
389
390	#endif /* PNG_INTEL_SSE_IMPLEMENTATION > 0 */
391	#endif /* READ */
392

Browse the source code of Godot/thirdparty/libpng/intel/filter_sse2_intrinsics.c