SDL_blit_A.c source code [SDL/src/video/SDL_blit_A.c]

1	/*
2	Simple DirectMedia Layer
3	Copyright (C) 1997-2025 Sam Lantinga <slouken@libsdl.org>
4
5	This software is provided 'as-is', without any express or implied
6	warranty. In no event will the authors be held liable for any damages
7	arising from the use of this software.
8
9	Permission is granted to anyone to use this software for any purpose,
10	including commercial applications, and to alter it and redistribute it
11	freely, subject to the following restrictions:
12
13	1. The origin of this software must not be misrepresented; you must not
14	claim that you wrote the original software. If you use this software
15	in a product, an acknowledgment in the product documentation would be
16	appreciated but is not required.
17	2. Altered source versions must be plainly marked as such, and must not be
18	misrepresented as being the original software.
19	3. This notice may not be removed or altered from any source distribution.
20	*/
21	#include "SDL_internal.h"
22
23	#ifdef SDL_HAVE_BLIT_A
24
25	#include "SDL_surface_c.h"
26
27	// Functions to perform alpha blended blitting
28
29	// N->1 blending with per-surface alpha
30	static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
31	{
32	int width = info->dst_w;
33	int height = info->dst_h;
34	Uint8 *src = info->src;
35	int srcskip = info->src_skip;
36	Uint8 *dst = info->dst;
37	int dstskip = info->dst_skip;
38	Uint8 *palmap = info->table;
39	const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
40	const SDL_Color *dstpal = info->dst_pal->colors;
41	int srcbpp = srcfmt->bytes_per_pixel;
42	Uint32 Pixel;
43	unsigned sR, sG, sB;
44	unsigned dR, dG, dB;
45	const unsigned A = info->a;
46
47	while (height--) {
48	/ INDENT-OFF / // clang-format off
49	DUFFS_LOOP(
50	{
51	DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
52	dR = dstpal[*dst].r;
53	dG = dstpal[*dst].g;
54	dB = dstpal[*dst].b;
55	ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
56	dR &= `0xff`;
57	dG &= `0xff`;
58	dB &= `0xff`;
59	// Pack RGB into 8bit pixel
60	if ( palmap == NULL ) {
61	*dst = (Uint8)(((dR>>`5`)<<(`3`+`2`))\|((dG>>`5`)<<(`2`))\|((dB>>`6`)<<(`0`)));
62	} else {
63	*dst = palmap[((dR>>`5`)<<(`3`+`2`))\|((dG>>`5`)<<(`2`))\|((dB>>`6`)<<(`0`))];
64	}
65	dst++;
66	src += srcbpp;
67	},
68	width);
69	/ INDENT-ON / // clang-format on
70	src += srcskip;
71	dst += dstskip;
72	}
73	}
74
75	// N->1 blending with pixel alpha
76	static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
77	{
78	int width = info->dst_w;
79	int height = info->dst_h;
80	Uint8 *src = info->src;
81	int srcskip = info->src_skip;
82	Uint8 *dst = info->dst;
83	int dstskip = info->dst_skip;
84	Uint8 *palmap = info->table;
85	const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
86	const SDL_Color *dstpal = info->dst_pal->colors;
87	int srcbpp = srcfmt->bytes_per_pixel;
88	Uint32 Pixel;
89	unsigned sR, sG, sB, sA;
90	unsigned dR, dG, dB;
91
92	while (height--) {
93	/ INDENT-OFF / // clang-format off
94	DUFFS_LOOP(
95	{
96	DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
97	dR = dstpal[*dst].r;
98	dG = dstpal[*dst].g;
99	dB = dstpal[*dst].b;
100	ALPHA_BLEND_RGB(sR, sG, sB, sA, dR, dG, dB);
101	dR &= `0xff`;
102	dG &= `0xff`;
103	dB &= `0xff`;
104	// Pack RGB into 8bit pixel
105	if ( palmap == NULL ) {
106	*dst = (Uint8)(((dR>>`5`)<<(`3`+`2`))\|((dG>>`5`)<<(`2`))\|((dB>>`6`)<<(`0`)));
107	} else {
108	*dst = palmap[((dR>>`5`)<<(`3`+`2`))\|((dG>>`5`)<<(`2`))\|((dB>>`6`)<<(`0`))];
109	}
110	dst++;
111	src += srcbpp;
112	},
113	width);
114	/ INDENT-ON / // clang-format on
115	src += srcskip;
116	dst += dstskip;
117	}
118	}
119
120	// colorkeyed N->1 blending with per-surface alpha
121	static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
122	{
123	int width = info->dst_w;
124	int height = info->dst_h;
125	Uint8 *src = info->src;
126	int srcskip = info->src_skip;
127	Uint8 *dst = info->dst;
128	int dstskip = info->dst_skip;
129	Uint8 *palmap = info->table;
130	const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
131	const SDL_Color *dstpal = info->dst_pal->colors;
132	int srcbpp = srcfmt->bytes_per_pixel;
133	Uint32 ckey = info->colorkey;
134	Uint32 Pixel;
135	unsigned sR, sG, sB;
136	unsigned dR, dG, dB;
137	const unsigned A = info->a;
138
139	while (height--) {
140	/ INDENT-OFF / // clang-format off
141	DUFFS_LOOP(
142	{
143	DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
144	if ( Pixel != ckey ) {
145	dR = dstpal[*dst].r;
146	dG = dstpal[*dst].g;
147	dB = dstpal[*dst].b;
148	ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
149	dR &= `0xff`;
150	dG &= `0xff`;
151	dB &= `0xff`;
152	// Pack RGB into 8bit pixel
153	if ( palmap == NULL ) {
154	*dst = (Uint8)(((dR>>`5`)<<(`3`+`2`))\|((dG>>`5`)<<(`2`))\|((dB>>`6`)<<(`0`)));
155	} else {
156	*dst = palmap[((dR>>`5`)<<(`3`+`2`))\|((dG>>`5`)<<(`2`))\|((dB>>`6`)<<(`0`))];
157	}
158	}
159	dst++;
160	src += srcbpp;
161	},
162	width);
163	/ INDENT-ON / // clang-format on
164	src += srcskip;
165	dst += dstskip;
166	}
167	}
168
169	#ifdef SDL_SSE2_INTRINSICS
170
171	static void SDL_TARGETING("sse2") Blit888to888SurfaceAlphaSSE2(SDL_BlitInfo *info)
172	{
173	int width = info->dst_w;
174	int height = info->dst_h;
175	Uint8 *src = info->src;
176	int srcskip = info->src_skip;
177	Uint8 *dst = info->dst;
178	int dstskip = info->dst_skip;
179	Uint8 alpha = info->a;
180
181	const __m128i alpha_fill_mask = _mm_set1_epi32((int)`0xff000000`);
182	const __m128i srcA = _mm_set1_epi16(alpha);
183
184	while (height--) {
185	int i = `0`;
186
187	for (; i + `4` <= width; i += `4`) {
188	// Load 4 src pixels
189	__m128i src128 = _mm_loadu_si128((__m128i *)src);
190
191	// Load 4 dst pixels
192	__m128i dst128 = _mm_loadu_si128((__m128i *)dst);
193
194	__m128i src_lo = _mm_unpacklo_epi8(src128, _mm_setzero_si128());
195	__m128i src_hi = _mm_unpackhi_epi8(src128, _mm_setzero_si128());
196
197	__m128i dst_lo = _mm_unpacklo_epi8(dst128, _mm_setzero_si128());
198	__m128i dst_hi = _mm_unpackhi_epi8(dst128, _mm_setzero_si128());
199
200	// dst = ((src - dst) srcA) + ((dst << 8) - dst)*
201	dst_lo = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_lo, dst_lo), srcA),
202	_mm_sub_epi16(_mm_slli_epi16(dst_lo, `8`), dst_lo));
203	dst_hi = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_hi, dst_hi), srcA),
204	_mm_sub_epi16(_mm_slli_epi16(dst_hi, `8`), dst_hi));
205
206	// dst += 0x1U (use 0x80 to round instead of floor)
207	dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(`1`));
208	dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(`1`));
209
210	// dst = (dst + (dst >> 8)) >> 8
211	dst_lo = _mm_srli_epi16(_mm_add_epi16(dst_lo, _mm_srli_epi16(dst_lo, `8`)), `8`);
212	dst_hi = _mm_srli_epi16(_mm_add_epi16(dst_hi, _mm_srli_epi16(dst_hi, `8`)), `8`);
213
214	dst128 = _mm_packus_epi16(dst_lo, dst_hi);
215
216	// Set the alpha channels of dst to 255
217	dst128 = _mm_or_si128(dst128, alpha_fill_mask);
218
219	_mm_storeu_si128((__m128i *)dst, dst128);
220
221	src += `16`;
222	dst += `16`;
223	}
224
225	for (; i < width; ++i) {
226	Uint32 src32 = (Uint32 )src;
227	Uint32 dst32 = (Uint32 )dst;
228
229	FACTOR_BLEND_8888(src32, dst32, alpha);
230
231	*dst = dst32 \| `0xff000000`;
232
233	src += `4`;
234	dst += `4`;
235	}
236
237	src += srcskip;
238	dst += dstskip;
239	}
240	}
241
242	#endif
243
244	// fast RGB888->(A)RGB888 blending with surface alpha=128 special case
245	static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
246	{
247	int width = info->dst_w;
248	int height = info->dst_h;
249	Uint32 srcp = (Uint32 )info->src;
250	int srcskip = info->src_skip >> `2`;
251	Uint32 dstp = (Uint32 )info->dst;
252	int dstskip = info->dst_skip >> `2`;
253
254	while (height--) {
255	/ INDENT-OFF / // clang-format off
256	DUFFS_LOOP({
257	Uint32 s = *srcp++;
258	Uint32 d = *dstp;
259	*dstp++ = ((((s & `0x00fefefe`) + (d & `0x00fefefe`)) >> `1`)
260	+ (s & d & `0x00010101`)) \| `0xff000000`;
261	}, width);
262	/ INDENT-ON / // clang-format on
263	srcp += srcskip;
264	dstp += dstskip;
265	}
266	}
267
268	// fast RGB888->(A)RGB888 blending with surface alpha
269	static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
270	{
271	unsigned alpha = info->a;
272	if (alpha == `128`) {
273	BlitRGBtoRGBSurfaceAlpha128(info);
274	} else {
275	int width = info->dst_w;
276	int height = info->dst_h;
277	Uint32 srcp = (Uint32 )info->src;
278	int srcskip = info->src_skip >> `2`;
279	Uint32 dstp = (Uint32 )info->dst;
280	int dstskip = info->dst_skip >> `2`;
281	Uint32 s;
282	Uint32 d;
283
284	while (height--) {
285	/ INDENT-OFF / // clang-format off
286	DUFFS_LOOP({
287	s = *srcp;
288	d = *dstp;
289
290	FACTOR_BLEND_8888(s, d, alpha);
291
292	*dstp = d \| `0xff000000`;
293	++srcp;
294	++dstp;
295	}, width);
296	/ INDENT-ON / // clang-format on
297	srcp += srcskip;
298	dstp += dstskip;
299	}
300	}
301	}
302
// 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel
//
// `mask` selects every channel bit except the lowest bit of each channel
// (e.g. 0xf7de for RGB565, 0xfbde for RGB555). Masking lets the channel sums
// be halved with a single shift; the (s & d & ~mask) term adds back the carry
// lost when the low bit of a channel is set in both pixels.

// blend a single 16 bit pixel at 50%
#define BLEND16_50(d, s, mask) \
    (((((s) & (mask)) + ((d) & (mask))) >> 1) + ((s) & (d) & (~(mask) & 0xffff)))

// blend two 16 bit pixels at 50%
// The mask is widened to Uint32 before shifting so that a 16-bit mask with
// its top bit set is not shifted into the sign bit of a promoted int.
#define BLEND2x16_50(d, s, mask)                                      \
    ((((s) & ((mask) | ((Uint32)(mask) << 16))) >> 1) +               \
     (((d) & ((mask) | ((Uint32)(mask) << 16))) >> 1) +               \
     ((s) & (d) & (~((mask) | ((Uint32)(mask) << 16)))))
312
313	static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
314	{
315	int width = info->dst_w;
316	int height = info->dst_h;
317	Uint16 srcp = (Uint16 )info->src;
318	int srcskip = info->src_skip >> `1`;
319	Uint16 dstp = (Uint16 )info->dst;
320	int dstskip = info->dst_skip >> `1`;
321
322	while (height--) {
323	if (((uintptr_t)srcp ^ (uintptr_t)dstp) & `2`) {
324	/*
325	* Source and destination not aligned, pipeline it.
326	* This is mostly a win for big blits but no loss for
327	* small ones
328	*/
329	Uint32 prev_sw;
330	int w = width;
331
332	// handle odd destination
333	if ((uintptr_t)dstp & `2`) {
334	Uint16 d = dstp, s = srcp;
335	*dstp = BLEND16_50(d, s, mask);
336	dstp++;
337	srcp++;
338	w--;
339	}
340	srcp++; // srcp is now 32-bit aligned
341
342	// bootstrap pipeline with first halfword
343	prev_sw = ((Uint32 *)srcp)[-`1`];
344
345	while (w > `1`) {
346	Uint32 sw, dw, s;
347	sw = (Uint32 )srcp;
348	dw = (Uint32 )dstp;
349	#if SDL_BYTEORDER == SDL_BIG_ENDIAN
350	s = (prev_sw << `16`) + (sw >> `16`);
351	#else
352	s = (prev_sw >> `16`) + (sw << `16`);
353	#endif
354	prev_sw = sw;
355	(Uint32 )dstp = BLEND2x16_50(dw, s, mask);
356	dstp += `2`;
357	srcp += `2`;
358	w -= `2`;
359	}
360
361	// final pixel if any
362	if (w) {
363	Uint16 d = *dstp, s;
364	#if SDL_BYTEORDER == SDL_BIG_ENDIAN
365	s = (Uint16)prev_sw;
366	#else
367	s = (Uint16)(prev_sw >> `16`);
368	#endif
369	*dstp = BLEND16_50(d, s, mask);
370	srcp++;
371	dstp++;
372	}
373	srcp += srcskip - `1`;
374	dstp += dstskip;
375	} else {
376	// source and destination are aligned
377	int w = width;
378
379	// first odd pixel?
380	if ((uintptr_t)srcp & `2`) {
381	Uint16 d = dstp, s = srcp;
382	*dstp = BLEND16_50(d, s, mask);
383	srcp++;
384	dstp++;
385	w--;
386	}
387	// srcp and dstp are now 32-bit aligned
388
389	while (w > `1`) {
390	Uint32 sw = (Uint32 )srcp;
391	Uint32 dw = (Uint32 )dstp;
392	(Uint32 )dstp = BLEND2x16_50(dw, sw, mask);
393	srcp += `2`;
394	dstp += `2`;
395	w -= `2`;
396	}
397
398	// last odd pixel?
399	if (w) {
400	Uint16 d = dstp, s = srcp;
401	*dstp = BLEND16_50(d, s, mask);
402	srcp++;
403	dstp++;
404	}
405	srcp += srcskip;
406	dstp += dstskip;
407	}
408	}
409	}
410
411	#ifdef SDL_MMX_INTRINSICS
412
413	// fast RGB565->RGB565 blending with surface alpha
414	static void SDL_TARGETING("mmx") Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
415	{
416	unsigned alpha = info->a;
417	if (alpha == `128`) {
418	Blit16to16SurfaceAlpha128(info, `0xf7de`);
419	} else {
420	int width = info->dst_w;
421	int height = info->dst_h;
422	Uint16 srcp = (Uint16 )info->src;
423	int srcskip = info->src_skip >> `1`;
424	Uint16 dstp = (Uint16 )info->dst;
425	int dstskip = info->dst_skip >> `1`;
426	Uint32 s, d;
427
428	#ifdef USE_DUFFS_LOOP
429	__m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
430
431	alpha &= ~(`1` + `2` + `4`); // cut alpha to get the exact same behaviour
432	mm_alpha = _mm_set_pi32(`0`, alpha); // 0000000A -> mm_alpha
433	alpha >>= `3`; // downscale alpha to 5 bits
434
435	mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); // 00000A0A -> mm_alpha
436	mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); // 0A0A0A0A -> mm_alpha
437	/ position alpha to allow for mullo and mulhi on diff channels*
438	to reduce the number of operations /*
439	mm_alpha = _mm_slli_si64(mm_alpha, `3`);
440
441	// Setup the 565 color channel masks
442	gmask = _mm_set_pi32(`0x07E007E0`, `0x07E007E0`); // MASKGREEN -> gmask
443	bmask = _mm_set_pi32(`0x001F001F`, `0x001F001F`); // MASKBLUE -> bmask
444	#endif
445
446	while (height--) {
447	/ INDENT-OFF / // clang-format off
448	DUFFS_LOOP_124(
449	{
450	s = *srcp++;
451	d = *dstp;
452	/*
453	* shift out the middle component (green) to
454	* the high 16 bits, and process all three RGB
455	* components at the same time.
456	*/
457	s = (s \| s << `16`) & `0x07e0f81f`;
458	d = (d \| d << `16`) & `0x07e0f81f`;
459	d += (s - d) * alpha >> `5`;
460	d &= `0x07e0f81f`;
461	*dstp++ = (Uint16)(d \| d >> `16`);
462	},{
463	s = *srcp++;
464	d = *dstp;
465	/*
466	* shift out the middle component (green) to
467	* the high 16 bits, and process all three RGB
468	* components at the same time.
469	*/
470	s = (s \| s << `16`) & `0x07e0f81f`;
471	d = (d \| d << `16`) & `0x07e0f81f`;
472	d += (s - d) * alpha >> `5`;
473	d &= `0x07e0f81f`;
474	*dstp++ = (Uint16)(d \| d >> `16`);
475	s = *srcp++;
476	d = *dstp;
477	/*
478	* shift out the middle component (green) to
479	* the high 16 bits, and process all three RGB
480	* components at the same time.
481	*/
482	s = (s \| s << `16`) & `0x07e0f81f`;
483	d = (d \| d << `16`) & `0x07e0f81f`;
484	d += (s - d) * alpha >> `5`;
485	d &= `0x07e0f81f`;
486	*dstp++ = (Uint16)(d \| d >> `16`);
487	},{
488	src1 = (__m64)srcp; // 4 src pixels -> src1
489	dst1 = (__m64)dstp; // 4 dst pixels -> dst1
490
491	// red
492	src2 = src1;
493	src2 = _mm_srli_pi16(src2, `11`); // src2 >> 11 -> src2 [000r 000r 000r 000r]
494
495	dst2 = dst1;
496	dst2 = _mm_srli_pi16(dst2, `11`); // dst2 >> 11 -> dst2 [000r 000r 000r 000r]
497
498	// blend
499	src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
500	src2 = _mm_mullo_pi16(src2, mm_alpha); / src2 * alpha -> src2 /
501	src2 = _mm_srli_pi16(src2, `11`); // src2 >> 11 -> src2
502	dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2
503	dst2 = _mm_slli_pi16(dst2, `11`); // dst2 << 11 -> dst2
504
505	mm_res = dst2; // RED -> mm_res
506
507	// green -- process the bits in place
508	src2 = src1;
509	src2 = _mm_and_si64(src2, gmask); // src & MASKGREEN -> src2
510
511	dst2 = dst1;
512	dst2 = _mm_and_si64(dst2, gmask); // dst & MASKGREEN -> dst2
513
514	// blend
515	src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
516	src2 = _mm_mulhi_pi16(src2, mm_alpha); / src2 * alpha -> src2 /
517	src2 = _mm_slli_pi16(src2, `5`); // src2 << 5 -> src2
518	dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2
519
520	mm_res = _mm_or_si64(mm_res, dst2); // RED \| GREEN -> mm_res
521
522	// blue
523	src2 = src1;
524	src2 = _mm_and_si64(src2, bmask); // src & MASKBLUE -> src2[000b 000b 000b 000b]
525
526	dst2 = dst1;
527	dst2 = _mm_and_si64(dst2, bmask); // dst & MASKBLUE -> dst2[000b 000b 000b 000b]
528
529	// blend
530	src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
531	src2 = _mm_mullo_pi16(src2, mm_alpha); / src2 * alpha -> src2 /
532	src2 = _mm_srli_pi16(src2, `11`); // src2 >> 11 -> src2
533	dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2
534	dst2 = _mm_and_si64(dst2, bmask); // dst2 & MASKBLUE -> dst2
535
536	mm_res = _mm_or_si64(mm_res, dst2); // RED \| GREEN \| BLUE -> mm_res
537
538	(__m64)dstp = mm_res; // mm_res -> 4 dst pixels
539
540	srcp += `4`;
541	dstp += `4`;
542	}, width);
543	/ INDENT-ON / // clang-format on
544	srcp += srcskip;
545	dstp += dstskip;
546	}
547	_mm_empty();
548	}
549	}
550
551	// fast RGB555->RGB555 blending with surface alpha
552	static void SDL_TARGETING("mmx") Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
553	{
554	unsigned alpha = info->a;
555	if (alpha == `128`) {
556	Blit16to16SurfaceAlpha128(info, `0xfbde`);
557	} else {
558	int width = info->dst_w;
559	int height = info->dst_h;
560	Uint16 srcp = (Uint16 )info->src;
561	int srcskip = info->src_skip >> `1`;
562	Uint16 dstp = (Uint16 )info->dst;
563	int dstskip = info->dst_skip >> `1`;
564	Uint32 s, d;
565
566	#ifdef USE_DUFFS_LOOP
567	__m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
568
569	alpha &= ~(`1` + `2` + `4`); // cut alpha to get the exact same behaviour
570	mm_alpha = _mm_set_pi32(`0`, alpha); // 0000000A -> mm_alpha
571	alpha >>= `3`; // downscale alpha to 5 bits
572
573	mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); // 00000A0A -> mm_alpha
574	mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); // 0A0A0A0A -> mm_alpha
575	/ position alpha to allow for mullo and mulhi on diff channels*
576	to reduce the number of operations /*
577	mm_alpha = _mm_slli_si64(mm_alpha, `3`);
578
579	// Setup the 555 color channel masks
580	rmask = _mm_set_pi32(`0x7C007C00`, `0x7C007C00`); // MASKRED -> rmask
581	gmask = _mm_set_pi32(`0x03E003E0`, `0x03E003E0`); // MASKGREEN -> gmask
582	bmask = _mm_set_pi32(`0x001F001F`, `0x001F001F`); // MASKBLUE -> bmask
583	#endif
584	while (height--) {
585	/ INDENT-OFF / // clang-format off
586	DUFFS_LOOP_124(
587	{
588	s = *srcp++;
589	d = *dstp;
590	/*
591	* shift out the middle component (green) to
592	* the high 16 bits, and process all three RGB
593	* components at the same time.
594	*/
595	s = (s \| s << `16`) & `0x03e07c1f`;
596	d = (d \| d << `16`) & `0x03e07c1f`;
597	d += (s - d) * alpha >> `5`;
598	d &= `0x03e07c1f`;
599	*dstp++ = (Uint16)(d \| d >> `16`);
600	},{
601	s = *srcp++;
602	d = *dstp;
603	/*
604	* shift out the middle component (green) to
605	* the high 16 bits, and process all three RGB
606	* components at the same time.
607	*/
608	s = (s \| s << `16`) & `0x03e07c1f`;
609	d = (d \| d << `16`) & `0x03e07c1f`;
610	d += (s - d) * alpha >> `5`;
611	d &= `0x03e07c1f`;
612	*dstp++ = (Uint16)(d \| d >> `16`);
613	s = *srcp++;
614	d = *dstp;
615	/*
616	* shift out the middle component (green) to
617	* the high 16 bits, and process all three RGB
618	* components at the same time.
619	*/
620	s = (s \| s << `16`) & `0x03e07c1f`;
621	d = (d \| d << `16`) & `0x03e07c1f`;
622	d += (s - d) * alpha >> `5`;
623	d &= `0x03e07c1f`;
624	*dstp++ = (Uint16)(d \| d >> `16`);
625	},{
626	src1 = (__m64)srcp; // 4 src pixels -> src1
627	dst1 = (__m64)dstp; // 4 dst pixels -> dst1
628
629	// red -- process the bits in place
630	src2 = src1;
631	src2 = _mm_and_si64(src2, rmask); // src & MASKRED -> src2
632
633	dst2 = dst1;
634	dst2 = _mm_and_si64(dst2, rmask); // dst & MASKRED -> dst2
635
636	// blend
637	src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
638	src2 = _mm_mulhi_pi16(src2, mm_alpha); / src2 * alpha -> src2 /
639	src2 = _mm_slli_pi16(src2, `5`); // src2 << 5 -> src2
640	dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2
641	dst2 = _mm_and_si64(dst2, rmask); // dst2 & MASKRED -> dst2
642
643	mm_res = dst2; // RED -> mm_res
644
645	// green -- process the bits in place
646	src2 = src1;
647	src2 = _mm_and_si64(src2, gmask); // src & MASKGREEN -> src2
648
649	dst2 = dst1;
650	dst2 = _mm_and_si64(dst2, gmask); // dst & MASKGREEN -> dst2
651
652	// blend
653	src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
654	src2 = _mm_mulhi_pi16(src2, mm_alpha); / src2 * alpha -> src2 /
655	src2 = _mm_slli_pi16(src2, `5`); // src2 << 5 -> src2
656	dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2
657
658	mm_res = _mm_or_si64(mm_res, dst2); // RED \| GREEN -> mm_res
659
660	// blue
661	src2 = src1; // src -> src2
662	src2 = _mm_and_si64(src2, bmask); // src & MASKBLUE -> src2[000b 000b 000b 000b]
663
664	dst2 = dst1; // dst -> dst2
665	dst2 = _mm_and_si64(dst2, bmask); // dst & MASKBLUE -> dst2[000b 000b 000b 000b]
666
667	// blend
668	src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
669	src2 = _mm_mullo_pi16(src2, mm_alpha); / src2 * alpha -> src2 /
670	src2 = _mm_srli_pi16(src2, `11`); // src2 >> 11 -> src2
671	dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2
672	dst2 = _mm_and_si64(dst2, bmask); // dst2 & MASKBLUE -> dst2
673
674	mm_res = _mm_or_si64(mm_res, dst2); // RED \| GREEN \| BLUE -> mm_res
675
676	(__m64)dstp = mm_res; // mm_res -> 4 dst pixels
677
678	srcp += `4`;
679	dstp += `4`;
680	}, width);
681	/ INDENT-ON / // clang-format on
682	srcp += srcskip;
683	dstp += dstskip;
684	}
685	_mm_empty();
686	}
687	}
688
689	#endif // SDL_MMX_INTRINSICS
690
691	// fast RGB565->RGB565 blending with surface alpha
692	static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
693	{
694	unsigned alpha = info->a;
695	if (alpha == `128`) {
696	Blit16to16SurfaceAlpha128(info, `0xf7de`);
697	} else {
698	int width = info->dst_w;
699	int height = info->dst_h;
700	Uint16 srcp = (Uint16 )info->src;
701	int srcskip = info->src_skip >> `1`;
702	Uint16 dstp = (Uint16 )info->dst;
703	int dstskip = info->dst_skip >> `1`;
704	alpha >>= `3`; // downscale alpha to 5 bits
705
706	while (height--) {
707	/ INDENT-OFF / // clang-format off
708	DUFFS_LOOP({
709	Uint32 s = *srcp++;
710	Uint32 d = *dstp;
711	/*
712	* shift out the middle component (green) to
713	* the high 16 bits, and process all three RGB
714	* components at the same time.
715	*/
716	s = (s \| s << `16`) & `0x07e0f81f`;
717	d = (d \| d << `16`) & `0x07e0f81f`;
718	d += (s - d) * alpha >> `5`;
719	d &= `0x07e0f81f`;
720	*dstp++ = (Uint16)(d \| d >> `16`);
721	}, width);
722	/ INDENT-ON / // clang-format on
723	srcp += srcskip;
724	dstp += dstskip;
725	}
726	}
727	}
728
729	// fast RGB555->RGB555 blending with surface alpha
730	static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
731	{
732	unsigned alpha = info->a; // downscale alpha to 5 bits
733	if (alpha == `128`) {
734	Blit16to16SurfaceAlpha128(info, `0xfbde`);
735	} else {
736	int width = info->dst_w;
737	int height = info->dst_h;
738	Uint16 srcp = (Uint16 )info->src;
739	int srcskip = info->src_skip >> `1`;
740	Uint16 dstp = (Uint16 )info->dst;
741	int dstskip = info->dst_skip >> `1`;
742	alpha >>= `3`; // downscale alpha to 5 bits
743
744	while (height--) {
745	/ INDENT-OFF / // clang-format off
746	DUFFS_LOOP({
747	Uint32 s = *srcp++;
748	Uint32 d = *dstp;
749	/*
750	* shift out the middle component (green) to
751	* the high 16 bits, and process all three RGB
752	* components at the same time.
753	*/
754	s = (s \| s << `16`) & `0x03e07c1f`;
755	d = (d \| d << `16`) & `0x03e07c1f`;
756	d += (s - d) * alpha >> `5`;
757	d &= `0x03e07c1f`;
758	*dstp++ = (Uint16)(d \| d >> `16`);
759	}, width);
760	/ INDENT-ON / // clang-format on
761	srcp += srcskip;
762	dstp += dstskip;
763	}
764	}
765	}
766
767	// fast ARGB8888->RGB565 blending with pixel alpha
768	static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
769	{
770	int width = info->dst_w;
771	int height = info->dst_h;
772	Uint32 srcp = (Uint32 )info->src;
773	int srcskip = info->src_skip >> `2`;
774	Uint16 dstp = (Uint16 )info->dst;
775	int dstskip = info->dst_skip >> `1`;
776
777	while (height--) {
778	/ INDENT-OFF / // clang-format off
779	DUFFS_LOOP({
780	Uint32 s = *srcp;
781	unsigned alpha = s >> `27`; // downscale alpha to 5 bits
782	/ Here we special-case opaque alpha since the*
783	compositioning used (>>8 instead of /255) doesn't handle
784	it correctly. /*
785	if (alpha) {
786	if (alpha == (SDL_ALPHA_OPAQUE >> `3`)) {
787	*dstp = (Uint16)((s >> `8` & `0xf800`) + (s >> `5` & `0x7e0`) + (s >> `3` & `0x1f`));
788	} else {
789	Uint32 d = *dstp;
790	/*
791	* convert source and destination to G0RAB65565
792	* and blend all components at the same time
793	*/
794	s = ((s & `0xfc00`) << `11`) + (s >> `8` & `0xf800`) + (s >> `3` & `0x1f`);
795	d = (d \| d << `16`) & `0x07e0f81f`;
796	d += (s - d) * alpha >> `5`;
797	d &= `0x07e0f81f`;
798	*dstp = (Uint16)(d \| d >> `16`);
799	}
800	}
801	srcp++;
802	dstp++;
803	}, width);
804	/ INDENT-ON / // clang-format on
805	srcp += srcskip;
806	dstp += dstskip;
807	}
808	}
809
810	// fast ARGB8888->RGB555 blending with pixel alpha
811	static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
812	{
813	int width = info->dst_w;
814	int height = info->dst_h;
815	Uint32 srcp = (Uint32 )info->src;
816	int srcskip = info->src_skip >> `2`;
817	Uint16 dstp = (Uint16 )info->dst;
818	int dstskip = info->dst_skip >> `1`;
819
820	while (height--) {
821	/ INDENT-OFF / // clang-format off
822	DUFFS_LOOP({
823	unsigned alpha;
824	Uint32 s = *srcp;
825	alpha = s >> `27`; // downscale alpha to 5 bits
826	/ Here we special-case opaque alpha since the*
827	compositioning used (>>8 instead of /255) doesn't handle
828	it correctly. /*
829	if (alpha) {
830	if (alpha == (SDL_ALPHA_OPAQUE >> `3`)) {
831	*dstp = (Uint16)((s >> `9` & `0x7c00`) + (s >> `6` & `0x3e0`) + (s >> `3` & `0x1f`));
832	} else {
833	Uint32 d = *dstp;
834	/*
835	* convert source and destination to G0RAB55555
836	* and blend all components at the same time
837	*/
838	s = ((s & `0xf800`) << `10`) + (s >> `9` & `0x7c00`) + (s >> `3` & `0x1f`);
839	d = (d \| d << `16`) & `0x03e07c1f`;
840	d += (s - d) * alpha >> `5`;
841	d &= `0x03e07c1f`;
842	*dstp = (Uint16)(d \| d >> `16`);
843	}
844	}
845	srcp++;
846	dstp++;
847	}, width);
848	/ INDENT-ON / // clang-format on
849	srcp += srcskip;
850	dstp += dstskip;
851	}
852	}
853
854	// General (slow) N->N blending with per-surface alpha
855	static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
856	{
857	int width = info->dst_w;
858	int height = info->dst_h;
859	Uint8 *src = info->src;
860	int srcskip = info->src_skip;
861	Uint8 *dst = info->dst;
862	int dstskip = info->dst_skip;
863	const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
864	const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
865	int srcbpp = srcfmt->bytes_per_pixel;
866	int dstbpp = dstfmt->bytes_per_pixel;
867	Uint32 Pixel;
868	unsigned sR, sG, sB;
869	unsigned dR, dG, dB, dA;
870	const unsigned sA = info->a;
871
872	if (sA) {
873	while (height--) {
874	/ INDENT-OFF / // clang-format off
875	DUFFS_LOOP(
876	{
877	DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
878	DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
879	ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
880	ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
881	src += srcbpp;
882	dst += dstbpp;
883	},
884	width);
885	/ INDENT-ON / // clang-format on
886	src += srcskip;
887	dst += dstskip;
888	}
889	}
890	}
891
892	// General (slow) colorkeyed N->N blending with per-surface alpha
893	static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
894	{
895	int width = info->dst_w;
896	int height = info->dst_h;
897	Uint8 *src = info->src;
898	int srcskip = info->src_skip;
899	Uint8 *dst = info->dst;
900	int dstskip = info->dst_skip;
901	const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
902	const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
903	Uint32 ckey = info->colorkey;
904	int srcbpp = srcfmt->bytes_per_pixel;
905	int dstbpp = dstfmt->bytes_per_pixel;
906	Uint32 Pixel;
907	unsigned sR, sG, sB;
908	unsigned dR, dG, dB, dA;
909	const unsigned sA = info->a;
910
911	while (height--) {
912	/ INDENT-OFF / // clang-format off
913	DUFFS_LOOP(
914	{
915	RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
916	if (sA && Pixel != ckey) {
917	RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
918	DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
919	ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
920	ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
921	}
922	src += srcbpp;
923	dst += dstbpp;
924	},
925	width);
926	/ INDENT-ON / // clang-format on
927	src += srcskip;
928	dst += dstskip;
929	}
930	}
931
932	// Fast 32-bit RGBA->RGBA blending with pixel alpha
933	static void Blit8888to8888PixelAlpha(SDL_BlitInfo *info)
934	{
935	int width = info->dst_w;
936	int height = info->dst_h;
937	Uint8 *src = info->src;
938	int srcskip = info->src_skip;
939	Uint8 *dst = info->dst;
940	int dstskip = info->dst_skip;
941	const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
942
943	while (height--) {
944	int i = `0`;
945
946	for (; i < width; ++i) {
947	Uint32 src32 = (Uint32 )src;
948	Uint32 dst32 = (Uint32 )dst;
949	ALPHA_BLEND_8888(src32, dst32, srcfmt);
950	(Uint32 )dst = dst32;
951	src += `4`;
952	dst += `4`;
953	}
954
955	src += srcskip;
956	dst += dstskip;
957	}
958	}
959
960	// Fast 32-bit RGBA->RGB(A) blending with pixel alpha and src swizzling
961	static void Blit8888to8888PixelAlphaSwizzle(SDL_BlitInfo *info)
962	{
963	int width = info->dst_w;
964	int height = info->dst_h;
965	Uint8 *src = info->src;
966	int srcskip = info->src_skip;
967	Uint8 *dst = info->dst;
968	int dstskip = info->dst_skip;
969	const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
970	const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
971
972	while (height--) {
973	int i = `0`;
974
975	for (; i < width; ++i) {
976	Uint32 src32 = (Uint32 )src;
977	Uint32 dst32 = (Uint32 )dst;
978	ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt);
979	(Uint32 )dst = dst32;
980	src += `4`;
981	dst += `4`;
982	}
983
984	src += srcskip;
985	dst += dstskip;
986	}
987	}
988
989	#ifdef SDL_SSE4_1_INTRINSICS
990
991	static void SDL_TARGETING("sse4.1") Blit8888to8888PixelAlphaSwizzleSSE41(SDL_BlitInfo *info)
992	{
993	int width = info->dst_w;
994	int height = info->dst_h;
995	Uint8 *src = info->src;
996	int srcskip = info->src_skip;
997	Uint8 *dst = info->dst;
998	int dstskip = info->dst_skip;
999	const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
1000	const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
1001
1002	// The byte offsets for the start of each pixel
1003	const __m128i mask_offsets = _mm_set_epi8(
1004	`12`, `12`, `12`, `12`, `8`, `8`, `8`, `8`, `4`, `4`, `4`, `4`, `0`, `0`, `0`, `0`);
1005
1006	const __m128i convert_mask = _mm_add_epi32(
1007	_mm_set1_epi32(
1008	((srcfmt->Rshift >> `3`) << dstfmt->Rshift) \|
1009	((srcfmt->Gshift >> `3`) << dstfmt->Gshift) \|
1010	((srcfmt->Bshift >> `3`) << dstfmt->Bshift)),
1011	mask_offsets);
1012
1013	const __m128i alpha_splat_mask = _mm_add_epi8(_mm_set1_epi8(srcfmt->Ashift >> `3`), mask_offsets);
1014	const __m128i alpha_fill_mask = _mm_set1_epi32((int)dstfmt->Amask);
1015
1016	while (height--) {
1017	int i = `0`;
1018
1019	for (; i + `4` <= width; i += `4`) {
1020	// Load 4 src pixels
1021	__m128i src128 = _mm_loadu_si128((__m128i *)src);
1022
1023	// Load 4 dst pixels
1024	__m128i dst128 = _mm_loadu_si128((__m128i *)dst);
1025
1026	// Extract the alpha from each pixel and splat it into all the channels
1027	__m128i srcA = _mm_shuffle_epi8(src128, alpha_splat_mask);
1028
1029	// Convert to dst format
1030	src128 = _mm_shuffle_epi8(src128, convert_mask);
1031
1032	// Set the alpha channels of src to 255
1033	src128 = _mm_or_si128(src128, alpha_fill_mask);
1034
1035	// Duplicate each 8-bit alpha value into both bytes of 16-bit lanes
1036	__m128i srca_lo = _mm_unpacklo_epi8(srcA, srcA);
1037	__m128i srca_hi = _mm_unpackhi_epi8(srcA, srcA);
1038
1039	// Calculate 255-srcA in every second 8-bit lane (255-srcA = srcA^0xff)
1040	srca_lo = _mm_xor_si128(srca_lo, _mm_set1_epi16(`0xff00`));
1041	srca_hi = _mm_xor_si128(srca_hi, _mm_set1_epi16(`0xff00`));
1042
1043	// maddubs expects second argument to be signed, so subtract 128
1044	src128 = _mm_sub_epi8(src128, _mm_set1_epi8((Uint8)`128`));
1045	dst128 = _mm_sub_epi8(dst128, _mm_set1_epi8((Uint8)`128`));
1046
1047	// dst = srcA(src-128) + (255-srcA)(dst-128) = srcAsrc + (255-srcA)dst - 128255*
1048	__m128i dst_lo = _mm_maddubs_epi16(srca_lo, _mm_unpacklo_epi8(src128, dst128));
1049	__m128i dst_hi = _mm_maddubs_epi16(srca_hi, _mm_unpackhi_epi8(src128, dst128));
1050
1051	// dst += 0x1U (use 0x80 to round instead of floor) + 128255 (to fix maddubs result)*
1052	dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(`1` + `128`*`255`));
1053	dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(`1` + `128`*`255`));
1054
1055	// dst = (dst + (dst >> 8)) >> 8 = (dst 257) >> 16*
1056	dst_lo = _mm_mulhi_epu16(dst_lo, _mm_set1_epi16(`257`));
1057	dst_hi = _mm_mulhi_epu16(dst_hi, _mm_set1_epi16(`257`));
1058
1059	// Blend the pixels together and save the result
1060	_mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(dst_lo, dst_hi));
1061
1062	src += `16`;
1063	dst += `16`;
1064	}
1065
1066	for (; i < width; ++i) {
1067	Uint32 src32 = (Uint32 )src;
1068	Uint32 dst32 = (Uint32 )dst;
1069	ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt);
1070	(Uint32 )dst = dst32;
1071	src += `4`;
1072	dst += `4`;
1073	}
1074
1075	src += srcskip;
1076	dst += dstskip;
1077	}
1078	}
1079
1080	#endif
1081
#ifdef SDL_AVX2_INTRINSICS

/*
 * 32-bit -> 32-bit per-pixel alpha blend with channel reordering, AVX2 path.
 *
 * Blends info->src over info->dst, 8 pixels (32 bytes) per vector iteration,
 * converting the source channel order to the destination's with a byte
 * shuffle.  Any remaining pixels of a row (fewer than 8) go through the
 * ALPHA_BLEND_SWIZZLE_8888 macro one at a time.
 *
 * The shuffle masks are built from (shift >> 3), so every colour/alpha
 * channel of both formats is treated as occupying a whole byte.
 *
 * info: blit description; dst_w/dst_h give the size in pixels, src_skip and
 *       dst_skip are added to the pointers after each row.
 */
static void SDL_TARGETING("avx2") Blit8888to8888PixelAlphaSwizzleAVX2(SDL_BlitInfo *info)
{
    int width = info->dst_w;
    int height = info->dst_h;
    Uint8 *src = info->src;
    int srcskip = info->src_skip;
    Uint8 *dst = info->dst;
    int dstskip = info->dst_skip;
    const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
    const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;

    // The byte offsets for the start of each pixel
    const __m256i mask_offsets = _mm256_set_epi8(
        28, 28, 28, 28, 24, 24, 24, 24, 20, 20, 20, 20, 16, 16, 16, 16, 12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);

    // For every dst byte position, the index of the src byte that feeds it
    const __m256i convert_mask = _mm256_add_epi32(
        _mm256_set1_epi32(
            ((srcfmt->Rshift >> 3) << dstfmt->Rshift) |
            ((srcfmt->Gshift >> 3) << dstfmt->Gshift) |
            ((srcfmt->Bshift >> 3) << dstfmt->Bshift)),
        mask_offsets);

    // Selects the alpha byte of each src pixel for all four of its byte lanes
    const __m256i alpha_splat_mask = _mm256_add_epi8(_mm256_set1_epi8(srcfmt->Ashift >> 3), mask_offsets);
    const __m256i alpha_fill_mask = _mm256_set1_epi32((int)dstfmt->Amask);

    while (height--) {
        int i = 0;

        for (; i + 8 <= width; i += 8) {
            // Load 8 src pixels
            __m256i src256 = _mm256_loadu_si256((__m256i *)src);

            // Load 8 dst pixels
            __m256i dst256 = _mm256_loadu_si256((__m256i *)dst);

            // Extract the alpha from each pixel and splat it into all the channels
            __m256i srcA = _mm256_shuffle_epi8(src256, alpha_splat_mask);

            // Convert to dst format
            src256 = _mm256_shuffle_epi8(src256, convert_mask);

            // Set the alpha channels of src to 255
            src256 = _mm256_or_si256(src256, alpha_fill_mask);

            // Duplicate each 8-bit alpha value into both bytes of 16-bit lanes
            // (unpack and the final pack both work per 128-bit half, so pixel order is preserved)
            __m256i alpha_lo = _mm256_unpacklo_epi8(srcA, srcA);
            __m256i alpha_hi = _mm256_unpackhi_epi8(srcA, srcA);

            // Calculate 255-srcA in every second 8-bit lane (255-srcA = srcA^0xff)
            alpha_lo = _mm256_xor_si256(alpha_lo, _mm256_set1_epi16(0xff00));
            alpha_hi = _mm256_xor_si256(alpha_hi, _mm256_set1_epi16(0xff00));

            // maddubs expects second argument to be signed, so subtract 128
            src256 = _mm256_sub_epi8(src256, _mm256_set1_epi8((Uint8)128));
            dst256 = _mm256_sub_epi8(dst256, _mm256_set1_epi8((Uint8)128));

            // dst = srcA*(src-128) + (255-srcA)*(dst-128) = srcA*src + (255-srcA)*dst - 128*255
            __m256i dst_lo = _mm256_maddubs_epi16(alpha_lo, _mm256_unpacklo_epi8(src256, dst256));
            __m256i dst_hi = _mm256_maddubs_epi16(alpha_hi, _mm256_unpackhi_epi8(src256, dst256));

            // dst += 0x1U (use 0x80 to round instead of floor) + 128*255 (to fix maddubs result)
            dst_lo = _mm256_add_epi16(dst_lo, _mm256_set1_epi16(1 + 128 * 255));
            dst_hi = _mm256_add_epi16(dst_hi, _mm256_set1_epi16(1 + 128 * 255));

            // dst = (dst + (dst >> 8)) >> 8 = (dst * 257) >> 16
            dst_lo = _mm256_mulhi_epu16(dst_lo, _mm256_set1_epi16(257));
            dst_hi = _mm256_mulhi_epu16(dst_hi, _mm256_set1_epi16(257));

            // Blend the pixels together and save the result
            _mm256_storeu_si256((__m256i *)dst, _mm256_packus_epi16(dst_lo, dst_hi));

            src += 32;
            dst += 32;
        }

        // Remaining pixels of the row (at most 7), one at a time
        for (; i < width; ++i) {
            Uint32 src32 = *(Uint32 *)src;
            Uint32 dst32 = *(Uint32 *)dst;
            ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt);
            *(Uint32 *)dst = dst32;
            src += 4;
            dst += 4;
        }

        src += srcskip;
        dst += dstskip;
    }
}

#endif
1174
#if defined(SDL_NEON_INTRINSICS) && (__ARM_ARCH >= 8)

/*
 * 32-bit -> 32-bit per-pixel alpha blend with channel reordering, NEON path.
 *
 * Blends info->src over info->dst, 4 pixels (16 bytes) per vector iteration,
 * using a table lookup to convert the source channel order to the
 * destination's.  Any remaining pixels of a row (at most 3) are blended with
 * the same arithmetic on a 64-bit vector of which only the low 32 bits are
 * stored.
 *
 * The lookup masks are built from (shift >> 3), so every colour/alpha channel
 * of both formats is treated as occupying a whole byte.
 *
 * info: blit description; dst_w/dst_h give the size in pixels, src_skip and
 *       dst_skip are added to the pointers after each row.
 */
static void Blit8888to8888PixelAlphaSwizzleNEON(SDL_BlitInfo *info)
{
    int width = info->dst_w;
    int height = info->dst_h;
    Uint8 *src = info->src;
    int srcskip = info->src_skip;
    Uint8 *dst = info->dst;
    int dstskip = info->dst_skip;
    const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
    const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;

    // The byte offsets for the start of each pixel
    const uint8x16_t mask_offsets = vreinterpretq_u8_u64(vcombine_u64(
        vcreate_u64(0x0404040400000000), vcreate_u64(0x0c0c0c0c08080808)));

    // For every dst byte position, the index of the src byte that feeds it
    const uint8x16_t convert_mask = vreinterpretq_u8_u32(vaddq_u32(
        vreinterpretq_u32_u8(mask_offsets),
        vdupq_n_u32(
            ((srcfmt->Rshift >> 3) << dstfmt->Rshift) |
            ((srcfmt->Gshift >> 3) << dstfmt->Gshift) |
            ((srcfmt->Bshift >> 3) << dstfmt->Bshift))));

    // Selects the alpha byte of each src pixel for all four of its byte lanes
    const uint8x16_t alpha_splat_mask = vaddq_u8(vdupq_n_u8(srcfmt->Ashift >> 3), mask_offsets);
    const uint8x16_t alpha_fill_mask = vreinterpretq_u8_u32(vdupq_n_u32(dstfmt->Amask));

    while (height--) {
        int i = 0;

        for (; i + 4 <= width; i += 4) {
            // Load 4 src pixels
            uint8x16_t src128 = vld1q_u8(src);

            // Load 4 dst pixels
            uint8x16_t dst128 = vld1q_u8(dst);

            // Extract the alpha from each pixel and splat it into all the channels
            uint8x16_t srcA = vqtbl1q_u8(src128, alpha_splat_mask);

            // Convert to dst format
            src128 = vqtbl1q_u8(src128, convert_mask);

            // Set the alpha channels of src to 255
            src128 = vorrq_u8(src128, alpha_fill_mask);

            // 255 - srcA = ~srcA
            uint8x16_t srcInvA = vmvnq_u8(srcA);

            // Result initialized with 1, this is for truncated divide later
            uint16x8_t res_lo = vdupq_n_u16(1);
            uint16x8_t res_hi = vdupq_n_u16(1);

            // res = alpha * src + (255 - alpha) * dst
            res_lo = vmlal_u8(res_lo, vget_low_u8(srcA), vget_low_u8(src128));
            res_lo = vmlal_u8(res_lo, vget_low_u8(srcInvA), vget_low_u8(dst128));
            res_hi = vmlal_high_u8(res_hi, srcA, src128);
            res_hi = vmlal_high_u8(res_hi, srcInvA, dst128);

            // Now result has +1 already added for truncated division
            // dst = (res + (res >> 8)) >> 8
            uint8x8_t temp;
            temp = vaddhn_u16(res_lo, vshrq_n_u16(res_lo, 8));
            dst128 = vaddhn_high_u16(temp, res_hi, vshrq_n_u16(res_hi, 8));

            // For rounded division remove the constant 1 and change first two vmlal_u8 to vmull_u8
            // Then replace two previous lines with following code:
            // temp = vraddhn_u16(res_lo, vrshrq_n_u16(res_lo, 8));
            // dst128 = vraddhn_high_u16(temp, res_hi, vrshrq_n_u16(res_hi, 8));

            // Save the result
            vst1q_u8(dst, dst128);

            src += 16;
            dst += 16;
        }

        // Process 1 pixel per iteration, max 3 iterations, same calculations as above
        for (; i < width; ++i) {
            // The top 32 bits of src32 & dst32 are not used
            uint8x8_t src32 = vreinterpret_u8_u32(vld1_dup_u32((Uint32 *)src));
            uint8x8_t dst32 = vreinterpret_u8_u32(vld1_dup_u32((Uint32 *)dst));

            uint8x8_t srcA = vtbl1_u8(src32, vget_low_u8(alpha_splat_mask));
            src32 = vtbl1_u8(src32, vget_low_u8(convert_mask));
            src32 = vorr_u8(src32, vget_low_u8(alpha_fill_mask));
            uint8x8_t srcInvA = vmvn_u8(srcA);

            uint16x8_t res = vdupq_n_u16(1);
            res = vmlal_u8(res, srcA, src32);
            res = vmlal_u8(res, srcInvA, dst32);

            dst32 = vaddhn_u16(res, vshrq_n_u16(res, 8));

            // Save the result, only low 32-bits
            vst1_lane_u32((Uint32 *)dst, vreinterpret_u32_u8(dst32), 0);

            src += 4;
            dst += 4;
        }

        src += srcskip;
        dst += dstskip;
    }
}

#endif
1282
1283	// General (slow) N->N blending with pixel alpha
1284	static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
1285	{
1286	int width = info->dst_w;
1287	int height = info->dst_h;
1288	Uint8 *src = info->src;
1289	int srcskip = info->src_skip;
1290	Uint8 *dst = info->dst;
1291	int dstskip = info->dst_skip;
1292	const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
1293	const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
1294	int srcbpp;
1295	int dstbpp;
1296	Uint32 Pixel;
1297	unsigned sR, sG, sB, sA;
1298	unsigned dR, dG, dB, dA;
1299
1300	// Set up some basic variables
1301	srcbpp = srcfmt->bytes_per_pixel;
1302	dstbpp = dstfmt->bytes_per_pixel;
1303
1304	while (height--) {
1305	DUFFS_LOOP(
1306	{
1307	DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
1308	if (sA) {
1309	DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
1310	ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
1311	ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
1312	}
1313	src += srcbpp;
1314	dst += dstbpp;
1315	},
1316	width);
1317	/ INDENT-ON / // clang-format on
1318	src += srcskip;
1319	dst += dstskip;
1320	}
1321	}
1322
1323	SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface)
1324	{
1325	const SDL_PixelFormatDetails *sf = surface->fmt;
1326	const SDL_PixelFormatDetails *df = surface->map.info.dst_fmt;
1327
1328	switch (surface->map.info.flags & ~SDL_COPY_RLE_MASK) {
1329	case SDL_COPY_BLEND:
1330	// Per-pixel alpha blits
1331	switch (df->bytes_per_pixel) {
1332	case `1`:
1333	if (surface->map.info.dst_pal) {
1334	return BlitNto1PixelAlpha;
1335	} else {
1336	// RGB332 has no palette !
1337	return BlitNtoNPixelAlpha;
1338	}
1339
1340	case `2`:
1341	if (sf->bytes_per_pixel == `4` && sf->Amask == `0xff000000` && sf->Gmask == `0xff00` && ((sf->Rmask == `0xff` && df->Rmask == `0x1f`) \|\| (sf->Bmask == `0xff` && df->Bmask == `0x1f`))) {
1342	if (df->Gmask == `0x7e0`) {
1343	return BlitARGBto565PixelAlpha;
1344	} else if (df->Gmask == `0x3e0` && !df->Amask) {
1345	return BlitARGBto555PixelAlpha;
1346	}
1347	}
1348	return BlitNtoNPixelAlpha;
1349
1350	case `4`:
1351	if (SDL_PIXELLAYOUT(sf->format) == SDL_PACKEDLAYOUT_8888 && sf->Amask &&
1352	SDL_PIXELLAYOUT(df->format) == SDL_PACKEDLAYOUT_8888) {
1353	#ifdef SDL_AVX2_INTRINSICS
1354	if (SDL_HasAVX2()) {
1355	return Blit8888to8888PixelAlphaSwizzleAVX2;
1356	}
1357	#endif
1358	#ifdef SDL_SSE4_1_INTRINSICS
1359	if (SDL_HasSSE41()) {
1360	return Blit8888to8888PixelAlphaSwizzleSSE41;
1361	}
1362	#endif
1363	#if defined(SDL_NEON_INTRINSICS) && (__ARM_ARCH >= 8)
1364	// To prevent "unused function" compiler warnings/errors
1365	(void)Blit8888to8888PixelAlpha;
1366	(void)Blit8888to8888PixelAlphaSwizzle;
1367	return Blit8888to8888PixelAlphaSwizzleNEON;
1368	#else
1369	if (sf->format == df->format) {
1370	return Blit8888to8888PixelAlpha;
1371	} else {
1372	return Blit8888to8888PixelAlphaSwizzle;
1373	}
1374	#endif
1375	}
1376	return BlitNtoNPixelAlpha;
1377
1378	case `3`:
1379	default:
1380	break;
1381	}
1382	return BlitNtoNPixelAlpha;
1383
1384	case SDL_COPY_MODULATE_ALPHA \| SDL_COPY_BLEND:
1385	if (sf->Amask == `0`) {
1386	// Per-surface alpha blits
1387	switch (df->bytes_per_pixel) {
1388	case `1`:
1389	if (surface->map.info.dst_pal) {
1390	return BlitNto1SurfaceAlpha;
1391	} else {
1392	// RGB332 has no palette !
1393	return BlitNtoNSurfaceAlpha;
1394	}
1395
1396	case `2`:
1397	if (surface->map.identity) {
1398	if (df->Gmask == `0x7e0`) {
1399	#ifdef SDL_MMX_INTRINSICS
1400	if (SDL_HasMMX()) {
1401	return Blit565to565SurfaceAlphaMMX;
1402	} else
1403	#endif
1404	{
1405	return Blit565to565SurfaceAlpha;
1406	}
1407	} else if (df->Gmask == `0x3e0`) {
1408	#ifdef SDL_MMX_INTRINSICS
1409	if (SDL_HasMMX()) {
1410	return Blit555to555SurfaceAlphaMMX;
1411	} else
1412	#endif
1413	{
1414	return Blit555to555SurfaceAlpha;
1415	}
1416	}
1417	}
1418	return BlitNtoNSurfaceAlpha;
1419
1420	case `4`:
1421	if (sf->Rmask == df->Rmask && sf->Gmask == df->Gmask && sf->Bmask == df->Bmask && sf->bytes_per_pixel == `4`) {
1422	#ifdef SDL_SSE2_INTRINSICS
1423	if (sf->Rshift % `8` == `0` && sf->Gshift % `8` == `0` && sf->Bshift % `8` == `0` && SDL_HasSSE2()) {
1424	return Blit888to888SurfaceAlphaSSE2;
1425	}
1426	#endif
1427	if ((sf->Rmask \| sf->Gmask \| sf->Bmask) == `0xffffff`) {
1428	return BlitRGBtoRGBSurfaceAlpha;
1429	}
1430	}
1431	return BlitNtoNSurfaceAlpha;
1432
1433	case `3`:
1434	default:
1435	return BlitNtoNSurfaceAlpha;
1436	}
1437	}
1438	break;
1439
1440	case SDL_COPY_COLORKEY \| SDL_COPY_MODULATE_ALPHA \| SDL_COPY_BLEND:
1441	if (sf->Amask == `0`) {
1442	if (df->bytes_per_pixel == `1`) {
1443
1444	if (surface->map.info.dst_pal) {
1445	return BlitNto1SurfaceAlphaKey;
1446	} else {
1447	// RGB332 has no palette !
1448	return BlitNtoNSurfaceAlphaKey;
1449	}
1450	} else {
1451	return BlitNtoNSurfaceAlphaKey;
1452	}
1453	}
1454	break;
1455	}
1456
1457	return NULL;
1458	}
1459
1460	#endif // SDL_HAVE_BLIT_A
1461
1462

Browse the source code of SDL/src/video/SDL_blit_A.c