alpha_processing_sse2.c source code [Godot/thirdparty/libwebp/src/dsp/alpha_processing_sse2.c]

1	// Copyright 2014 Google Inc. All Rights Reserved.
2	//
3	// Use of this source code is governed by a BSD-style license
4	// that can be found in the COPYING file in the root of the source
5	// tree. An additional intellectual property rights grant can be found
6	// in the file PATENTS. All contributing project authors may
7	// be found in the AUTHORS file in the root of the source tree.
8	// -----------------------------------------------------------------------------
9	//
10	// Utilities for processing transparent channel.
11	//
12	// Author: Skal (pascal.massimino@gmail.com)
13
14	#include "src/dsp/dsp.h"
15
16	#if defined(WEBP_USE_SSE2)
17	#include <emmintrin.h>
18
19	//------------------------------------------------------------------------------
20
21	static int DispatchAlpha_SSE2(const uint8_t* WEBP_RESTRICT alpha,
22	int alpha_stride, int width, int height,
23	uint8_t* WEBP_RESTRICT dst, int dst_stride) {
24	// alpha_and stores an 'and' operation of all the alpha[] values. The final
25	// value is not 0xff if any of the alpha[] is not equal to 0xff.
26	uint32_t alpha_and = `0xff`;
27	int i, j;
28	const __m128i zero = _mm_setzero_si128();
29	const __m128i rgb_mask = _mm_set1_epi32((int)`0xffffff00`); // to preserve RGB
30	const __m128i all_0xff = _mm_set_epi32(`0`, `0`, ~`0`, ~`0`);
31	__m128i all_alphas = all_0xff;
32
33	// We must be able to access 3 extra bytes after the last written byte
34	// 'dst[4 width - 4]', because we don't know if alpha is the first or the*
35	// last byte of the quadruplet.
36	const int limit = (width - `1`) & ~`7`;
37
38	for (j = `0`; j < height; ++j) {
39	__m128i* out = (__m128i*)dst;
40	for (i = `0`; i < limit; i += `8`) {
41	// load 8 alpha bytes
42	const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[i]);
43	const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
44	const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
45	const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
46	// load 8 dst pixels (32 bytes)
47	const __m128i b0_lo = _mm_loadu_si128(out + `0`);
48	const __m128i b0_hi = _mm_loadu_si128(out + `1`);
49	// mask dst alpha values
50	const __m128i b1_lo = _mm_and_si128(b0_lo, rgb_mask);
51	const __m128i b1_hi = _mm_and_si128(b0_hi, rgb_mask);
52	// combine
53	const __m128i b2_lo = _mm_or_si128(b1_lo, a2_lo);
54	const __m128i b2_hi = _mm_or_si128(b1_hi, a2_hi);
55	// store
56	_mm_storeu_si128(out + `0`, b2_lo);
57	_mm_storeu_si128(out + `1`, b2_hi);
58	// accumulate eight alpha 'and' in parallel
59	all_alphas = _mm_and_si128(all_alphas, a0);
60	out += `2`;
61	}
62	for (; i < width; ++i) {
63	const uint32_t alpha_value = alpha[i];
64	dst[`4` * i] = alpha_value;
65	alpha_and &= alpha_value;
66	}
67	alpha += alpha_stride;
68	dst += dst_stride;
69	}
70	// Combine the eight alpha 'and' into a 8-bit mask.
71	alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
72	return (alpha_and != `0xff`);
73	}
74
75	static void DispatchAlphaToGreen_SSE2(const uint8_t* WEBP_RESTRICT alpha,
76	int alpha_stride, int width, int height,
77	uint32_t* WEBP_RESTRICT dst,
78	int dst_stride) {
79	int i, j;
80	const __m128i zero = _mm_setzero_si128();
81	const int limit = width & ~`15`;
82	for (j = `0`; j < height; ++j) {
83	for (i = `0`; i < limit; i += `16`) { // process 16 alpha bytes
84	const __m128i a0 = _mm_loadu_si128((const __m128i*)&alpha[i]);
85	const __m128i a1 = _mm_unpacklo_epi8(zero, a0); // note the 'zero' first!
86	const __m128i b1 = _mm_unpackhi_epi8(zero, a0);
87	const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
88	const __m128i b2_lo = _mm_unpacklo_epi16(b1, zero);
89	const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
90	const __m128i b2_hi = _mm_unpackhi_epi16(b1, zero);
91	_mm_storeu_si128((__m128i*)&dst[i + `0`], a2_lo);
92	_mm_storeu_si128((__m128i*)&dst[i + `4`], a2_hi);
93	_mm_storeu_si128((__m128i*)&dst[i + `8`], b2_lo);
94	_mm_storeu_si128((__m128i*)&dst[i + `12`], b2_hi);
95	}
96	for (; i < width; ++i) dst[i] = alpha[i] << `8`;
97	alpha += alpha_stride;
98	dst += dst_stride;
99	}
100	}
101
102	static int ExtractAlpha_SSE2(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
103	int width, int height,
104	uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
105	// alpha_and stores an 'and' operation of all the alpha[] values. The final
106	// value is not 0xff if any of the alpha[] is not equal to 0xff.
107	uint32_t alpha_and = `0xff`;
108	int i, j;
109	const __m128i a_mask = _mm_set1_epi32(`0xff`); // to preserve alpha
110	const __m128i all_0xff = _mm_set_epi32(`0`, `0`, ~`0`, ~`0`);
111	__m128i all_alphas = all_0xff;
112
113	// We must be able to access 3 extra bytes after the last written byte
114	// 'src[4 width - 4]', because we don't know if alpha is the first or the*
115	// last byte of the quadruplet.
116	const int limit = (width - `1`) & ~`7`;
117
118	for (j = `0`; j < height; ++j) {
119	const __m128i* src = (const __m128i*)argb;
120	for (i = `0`; i < limit; i += `8`) {
121	// load 32 argb bytes
122	const __m128i a0 = _mm_loadu_si128(src + `0`);
123	const __m128i a1 = _mm_loadu_si128(src + `1`);
124	const __m128i b0 = _mm_and_si128(a0, a_mask);
125	const __m128i b1 = _mm_and_si128(a1, a_mask);
126	const __m128i c0 = _mm_packs_epi32(b0, b1);
127	const __m128i d0 = _mm_packus_epi16(c0, c0);
128	// store
129	_mm_storel_epi64((__m128i*)&alpha[i], d0);
130	// accumulate eight alpha 'and' in parallel
131	all_alphas = _mm_and_si128(all_alphas, d0);
132	src += `2`;
133	}
134	for (; i < width; ++i) {
135	const uint32_t alpha_value = argb[`4` * i];
136	alpha[i] = alpha_value;
137	alpha_and &= alpha_value;
138	}
139	argb += argb_stride;
140	alpha += alpha_stride;
141	}
142	// Combine the eight alpha 'and' into a 8-bit mask.
143	alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
144	return (alpha_and == `0xff`);
145	}
146
147	//------------------------------------------------------------------------------
148	// Non-dither premultiplied modes
149
// Scalar helpers used for the left-over pixels:
// PREMULTIPLY(x, MULTIPLIER(a)) computes (x * a * 0x8081) >> 23.
#define MULTIPLIER(a)   ((a) * 0x8081)
#define PREMULTIPLY(x, m) (((x) * (m)) >> 23)

// We can't use a 'const int' for the SHUFFLE value, because it has to be an
// immediate in the _mm_shufflexx_epi16() instruction. We really need a macro.
// We use: v / 255 = (v * 0x8081) >> 23, where v = alpha * {r,g,b} is a 16bit
// value.
// The macro premultiplies, in place, the four pixels (16 bytes) starting at
// RGBX. It expects 'zero', 'kMask' and 'kMult' to be in scope at the call
// site. SHUFFLE must broadcast each pixel's alpha to its three color lanes
// and put the 0xff lane (supplied by kMask) at the alpha position, so that the
// alpha channel itself is left unchanged.
#define APPLY_ALPHA(RGBX, SHUFFLE) do {                              \
  const __m128i argb0 = _mm_loadu_si128((const __m128i*)&(RGBX));    \
  const __m128i argb1_lo = _mm_unpacklo_epi8(argb0, zero);           \
  const __m128i argb1_hi = _mm_unpackhi_epi8(argb0, zero);           \
  const __m128i alpha0_lo = _mm_or_si128(argb1_lo, kMask);           \
  const __m128i alpha0_hi = _mm_or_si128(argb1_hi, kMask);           \
  const __m128i alpha1_lo = _mm_shufflelo_epi16(alpha0_lo, SHUFFLE); \
  const __m128i alpha1_hi = _mm_shufflelo_epi16(alpha0_hi, SHUFFLE); \
  const __m128i alpha2_lo = _mm_shufflehi_epi16(alpha1_lo, SHUFFLE); \
  const __m128i alpha2_hi = _mm_shufflehi_epi16(alpha1_hi, SHUFFLE); \
  /* alpha2 = [ff a0 a0 a0][ff a1 a1 a1] */                          \
  const __m128i A0_lo = _mm_mullo_epi16(alpha2_lo, argb1_lo);        \
  const __m128i A0_hi = _mm_mullo_epi16(alpha2_hi, argb1_hi);        \
  const __m128i A1_lo = _mm_mulhi_epu16(A0_lo, kMult);               \
  const __m128i A1_hi = _mm_mulhi_epu16(A0_hi, kMult);               \
  const __m128i A2_lo = _mm_srli_epi16(A1_lo, 7);                    \
  const __m128i A2_hi = _mm_srli_epi16(A1_hi, 7);                    \
  const __m128i A3 = _mm_packus_epi16(A2_lo, A2_hi);                 \
  _mm_storeu_si128((__m128i*)&(RGBX), A3);                           \
} while (0)

// Premultiplies, in place, the three color bytes of each 4-byte pixel by the
// pixel's alpha byte: c = (c * a * 0x8081) >> 23. The alpha byte is byte 0 of
// the pixel when 'alpha_first' is non-zero, byte 3 otherwise, and is left
// unchanged. Processes 'h' rows of 'w' pixels; 'rgba' advances by 'stride'
// bytes after each row.
static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first,
                                    int w, int h, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i kMult = _mm_set1_epi16((short)0x8081);
  // 0xff in 16-bit lanes 1 and 2 of each pixel: the shuffles below pick one of
  // them as the multiplier for the alpha lane.
  const __m128i kMask = _mm_set_epi16(0, 0xff, 0xff, 0, 0, 0xff, 0xff, 0);
  const int kSpan = 4;   // pixels handled per APPLY_ALPHA
  while (h-- > 0) {
    uint32_t* const rgbx = (uint32_t*)rgba;
    int i;
    if (!alpha_first) {
      for (i = 0; i + kSpan <= w; i += kSpan) {
        APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(2, 3, 3, 3));
      }
    } else {
      for (i = 0; i + kSpan <= w; i += kSpan) {
        APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 0, 0, 1));
      }
    }
    // Finish with left-overs.
    for (; i < w; ++i) {
      uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
      const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
      const uint32_t a = alpha[4 * i];
      if (a != 0xff) {   // a == 0xff leaves the colors unchanged
        const uint32_t mult = MULTIPLIER(a);
        rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);
        rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);
        rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);
      }
    }
    rgba += stride;
  }
}
#undef MULTIPLIER
#undef PREMULTIPLY
213
214	//------------------------------------------------------------------------------
215	// Alpha detection
216
// Returns 1 if any of the 'length' bytes starting at 'src' differs from 0xff,
// 0 otherwise (including when 'length' is 0).
static int HasAlpha8b_SSE2(const uint8_t* src, int length) {
  const __m128i all_0xff = _mm_set1_epi8((char)0xff);
  int i = 0;
  // Compare 16 bytes at a time; a mask other than 0xffff means at least one
  // byte was not 0xff.
  for (; i + 16 <= length; i += 16) {
    const __m128i v = _mm_loadu_si128((const __m128i*)(src + i));
    const __m128i bits = _mm_cmpeq_epi8(v, all_0xff);
    const int mask = _mm_movemask_epi8(bits);
    if (mask != 0xffff) return 1;
  }
  // Scalar tail for the remaining (length % 16) bytes.
  for (; i < length; ++i) {
    if (src[i] != 0xff) return 1;
  }
  return 0;
}
229
// Inspects byte 0 of each of the 'length' 4-byte pixels starting at 'src'
// (i.e. src[4 * i]). Returns 1 if any of these bytes differs from 0xff,
// 0 otherwise (including when 'length' is 0).
static int HasAlpha32b_SSE2(const uint8_t* src, int length) {
  const __m128i alpha_mask = _mm_set1_epi32(0xff);
  const __m128i all_0xff = _mm_set1_epi8((char)0xff);
  int i = 0;
  // We don't know if we can access the last 3 bytes after the last alpha
  // value 'src[4 * length - 4]' (because we don't know if alpha is the first
  // or the last byte of the quadruplet). Hence the '-3' protection below.
  length = length * 4 - 3;   // size in bytes
  // 16 pixels (64 bytes) per iteration.
  for (; i + 64 <= length; i += 64) {
    const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i +  0));
    const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16));
    const __m128i a2 = _mm_loadu_si128((const __m128i*)(src + i + 32));
    const __m128i a3 = _mm_loadu_si128((const __m128i*)(src + i + 48));
    const __m128i b0 = _mm_and_si128(a0, alpha_mask);
    const __m128i b1 = _mm_and_si128(a1, alpha_mask);
    const __m128i b2 = _mm_and_si128(a2, alpha_mask);
    const __m128i b3 = _mm_and_si128(a3, alpha_mask);
    const __m128i c0 = _mm_packs_epi32(b0, b1);
    const __m128i c1 = _mm_packs_epi32(b2, b3);
    const __m128i d  = _mm_packus_epi16(c0, c1);
    const __m128i bits = _mm_cmpeq_epi8(d, all_0xff);
    const int mask = _mm_movemask_epi8(bits);
    if (mask != 0xffff) return 1;
  }
  // 8 pixels (32 bytes) per iteration; the packed alphas are duplicated into
  // both halves of 'd' so the same 0xffff test applies.
  for (; i + 32 <= length; i += 32) {
    const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i +  0));
    const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16));
    const __m128i b0 = _mm_and_si128(a0, alpha_mask);
    const __m128i b1 = _mm_and_si128(a1, alpha_mask);
    const __m128i c  = _mm_packs_epi32(b0, b1);
    const __m128i d  = _mm_packus_epi16(c, c);
    const __m128i bits = _mm_cmpeq_epi8(d, all_0xff);
    const int mask = _mm_movemask_epi8(bits);
    if (mask != 0xffff) return 1;
  }
  // Scalar tail, one pixel at a time.
  for (; i <= length; i += 4) {
    if (src[i] != 0xff) return 1;
  }
  return 0;
}
268
// Overwrites with 'color' every one of the 'length' 32-bit values in 'src'
// whose top byte (bits 24..31) is zero. Other values are left unchanged.
static void AlphaReplace_SSE2(uint32_t* src, int length, uint32_t color) {
  const __m128i m_color = _mm_set1_epi32((int)color);
  const __m128i zero = _mm_setzero_si128();
  int i = 0;
  for (; i + 8 <= length; i += 8) {
    const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i + 0));
    const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 4));
    // The arithmetic shift is fine here: the result is zero if and only if
    // the top byte is zero.
    const __m128i b0 = _mm_srai_epi32(a0, 24);
    const __m128i b1 = _mm_srai_epi32(a1, 24);
    // c = all-ones in the lanes to replace
    const __m128i c0 = _mm_cmpeq_epi32(b0, zero);
    const __m128i c1 = _mm_cmpeq_epi32(b1, zero);
    // select: (c & color) | (~c & original)
    const __m128i d0 = _mm_and_si128(c0, m_color);
    const __m128i d1 = _mm_and_si128(c1, m_color);
    const __m128i e0 = _mm_andnot_si128(c0, a0);
    const __m128i e1 = _mm_andnot_si128(c1, a1);
    _mm_storeu_si128((__m128i*)(src + i + 0), _mm_or_si128(d0, e0));
    _mm_storeu_si128((__m128i*)(src + i + 4), _mm_or_si128(d1, e1));
  }
  // Scalar tail for the remaining (length % 8) values.
  for (; i < length; ++i) {
    if ((src[i] >> 24) == 0) src[i] = color;
  }
}
289
290	// -----------------------------------------------------------------------------
291	// Apply alpha value to rows
292
// Multiplies, in place, the three low bytes of each of the 'width' 32-bit
// pixels in 'ptr' by the pixel's top byte (alpha), with rounding; the alpha
// byte itself is preserved. Only the '!inverse' case is vectorized (two
// pixels per iteration); any remaining pixels, and the whole row when
// 'inverse' is non-zero, are delegated to WebPMultARGBRow_C().
static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) {
  int x = 0;
  if (!inverse) {
    const int kSpan = 2;
    const __m128i zero = _mm_setzero_si128();
    const __m128i k128 = _mm_set1_epi16(128);
    const __m128i kMult = _mm_set1_epi16(0x0101);
    // 0xff in 16-bit lane 2 of each pixel; the shuffles below move it to the
    // alpha lane so that alpha is multiplied by 0xff (i.e. left unchanged).
    const __m128i kMask = _mm_set_epi16(0, 0xff, 0, 0, 0, 0xff, 0, 0);
    for (x = 0; x + kSpan <= width; x += kSpan) {
      // To compute 'result = (int)(a * x / 255. + .5)', we use:
      //   tmp = a * v + 128, result = (tmp * 0x0101u) >> 16
      const __m128i A0 = _mm_loadl_epi64((const __m128i*)&ptr[x]);
      const __m128i A1 = _mm_unpacklo_epi8(A0, zero);
      const __m128i A2 = _mm_or_si128(A1, kMask);
      const __m128i A3 = _mm_shufflelo_epi16(A2, _MM_SHUFFLE(2, 3, 3, 3));
      const __m128i A4 = _mm_shufflehi_epi16(A3, _MM_SHUFFLE(2, 3, 3, 3));
      // here, A4 = [ff a0 a0 a0][ff a1 a1 a1]
      const __m128i A5 = _mm_mullo_epi16(A4, A1);
      const __m128i A6 = _mm_add_epi16(A5, k128);
      const __m128i A7 = _mm_mulhi_epu16(A6, kMult);
      const __m128i A10 = _mm_packus_epi16(A7, zero);
      _mm_storel_epi64((__m128i*)&ptr[x], A10);
    }
  }
  // Hand whatever was not processed above to the C implementation.
  width -= x;
  if (width > 0) WebPMultARGBRow_C(ptr + x, width, inverse);
}
320
321	static void MultRow_SSE2(uint8_t* WEBP_RESTRICT const ptr,
322	const uint8_t* WEBP_RESTRICT const alpha,
323	int width, int inverse) {
324	int x = `0`;
325	if (!inverse) {
326	const __m128i zero = _mm_setzero_si128();
327	const __m128i k128 = _mm_set1_epi16(`128`);
328	const __m128i kMult = _mm_set1_epi16(`0x0101`);
329	for (x = `0`; x + `8` <= width; x += `8`) {
330	const __m128i v0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
331	const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[x]);
332	const __m128i v1 = _mm_unpacklo_epi8(v0, zero);
333	const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
334	const __m128i v2 = _mm_mullo_epi16(v1, a1);
335	const __m128i v3 = _mm_add_epi16(v2, k128);
336	const __m128i v4 = _mm_mulhi_epu16(v3, kMult);
337	const __m128i v5 = _mm_packus_epi16(v4, zero);
338	_mm_storel_epi64((__m128i*)&ptr[x], v5);
339	}
340	}
341	width -= x;
342	if (width > `0`) WebPMultRow_C(ptr + x, alpha + x, width, inverse);
343	}
344
345	//------------------------------------------------------------------------------
346	// Entry point
347
348	extern void WebPInitAlphaProcessingSSE2(void);
349
350	WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {
351	WebPMultARGBRow = MultARGBRow_SSE2;
352	WebPMultRow = MultRow_SSE2;
353	WebPApplyAlphaMultiply = ApplyAlphaMultiply_SSE2;
354	WebPDispatchAlpha = DispatchAlpha_SSE2;
355	WebPDispatchAlphaToGreen = DispatchAlphaToGreen_SSE2;
356	WebPExtractAlpha = ExtractAlpha_SSE2;
357
358	WebPHasAlpha8b = HasAlpha8b_SSE2;
359	WebPHasAlpha32b = HasAlpha32b_SSE2;
360	WebPAlphaReplace = AlphaReplace_SSE2;
361	}
362
363	#else // !WEBP_USE_SSE2
364
365	WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingSSE2)
366
367	#endif // WEBP_USE_SSE2
368

Browse the source code of Godot/thirdparty/libwebp/src/dsp/alpha_processing_sse2.c