1/*
2 Simple DirectMedia Layer
3 Copyright (C) 1997-2025 Sam Lantinga <slouken@libsdl.org>
4
5 This software is provided 'as-is', without any express or implied
6 warranty. In no event will the authors be held liable for any damages
7 arising from the use of this software.
8
9 Permission is granted to anyone to use this software for any purpose,
10 including commercial applications, and to alter it and redistribute it
11 freely, subject to the following restrictions:
12
13 1. The origin of this software must not be misrepresented; you must not
14 claim that you wrote the original software. If you use this software
15 in a product, an acknowledgment in the product documentation would be
16 appreciated but is not required.
17 2. Altered source versions must be plainly marked as such, and must not be
18 misrepresented as being the original software.
19 3. This notice may not be removed or altered from any source distribution.
20*/
21#include "SDL_internal.h"
22
23#ifdef SDL_HAVE_BLIT_A
24
25#include "SDL_surface_c.h"
26
27// Functions to perform alpha blended blitting
28
29// N->1 blending with per-surface alpha
30static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
31{
32 int width = info->dst_w;
33 int height = info->dst_h;
34 Uint8 *src = info->src;
35 int srcskip = info->src_skip;
36 Uint8 *dst = info->dst;
37 int dstskip = info->dst_skip;
38 Uint8 *palmap = info->table;
39 const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
40 const SDL_Color *dstpal = info->dst_pal->colors;
41 int srcbpp = srcfmt->bytes_per_pixel;
42 Uint32 Pixel;
43 unsigned sR, sG, sB;
44 unsigned dR, dG, dB;
45 const unsigned A = info->a;
46
47 while (height--) {
48 /* *INDENT-OFF* */ // clang-format off
49 DUFFS_LOOP(
50 {
51 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
52 dR = dstpal[*dst].r;
53 dG = dstpal[*dst].g;
54 dB = dstpal[*dst].b;
55 ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
56 dR &= 0xff;
57 dG &= 0xff;
58 dB &= 0xff;
59 // Pack RGB into 8bit pixel
60 if ( palmap == NULL ) {
61 *dst = (Uint8)(((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0)));
62 } else {
63 *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
64 }
65 dst++;
66 src += srcbpp;
67 },
68 width);
69 /* *INDENT-ON* */ // clang-format on
70 src += srcskip;
71 dst += dstskip;
72 }
73}
74
75// N->1 blending with pixel alpha
76static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
77{
78 int width = info->dst_w;
79 int height = info->dst_h;
80 Uint8 *src = info->src;
81 int srcskip = info->src_skip;
82 Uint8 *dst = info->dst;
83 int dstskip = info->dst_skip;
84 Uint8 *palmap = info->table;
85 const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
86 const SDL_Color *dstpal = info->dst_pal->colors;
87 int srcbpp = srcfmt->bytes_per_pixel;
88 Uint32 Pixel;
89 unsigned sR, sG, sB, sA;
90 unsigned dR, dG, dB;
91
92 while (height--) {
93 /* *INDENT-OFF* */ // clang-format off
94 DUFFS_LOOP(
95 {
96 DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
97 dR = dstpal[*dst].r;
98 dG = dstpal[*dst].g;
99 dB = dstpal[*dst].b;
100 ALPHA_BLEND_RGB(sR, sG, sB, sA, dR, dG, dB);
101 dR &= 0xff;
102 dG &= 0xff;
103 dB &= 0xff;
104 // Pack RGB into 8bit pixel
105 if ( palmap == NULL ) {
106 *dst = (Uint8)(((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0)));
107 } else {
108 *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
109 }
110 dst++;
111 src += srcbpp;
112 },
113 width);
114 /* *INDENT-ON* */ // clang-format on
115 src += srcskip;
116 dst += dstskip;
117 }
118}
119
120// colorkeyed N->1 blending with per-surface alpha
121static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
122{
123 int width = info->dst_w;
124 int height = info->dst_h;
125 Uint8 *src = info->src;
126 int srcskip = info->src_skip;
127 Uint8 *dst = info->dst;
128 int dstskip = info->dst_skip;
129 Uint8 *palmap = info->table;
130 const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
131 const SDL_Color *dstpal = info->dst_pal->colors;
132 int srcbpp = srcfmt->bytes_per_pixel;
133 Uint32 ckey = info->colorkey;
134 Uint32 Pixel;
135 unsigned sR, sG, sB;
136 unsigned dR, dG, dB;
137 const unsigned A = info->a;
138
139 while (height--) {
140 /* *INDENT-OFF* */ // clang-format off
141 DUFFS_LOOP(
142 {
143 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
144 if ( Pixel != ckey ) {
145 dR = dstpal[*dst].r;
146 dG = dstpal[*dst].g;
147 dB = dstpal[*dst].b;
148 ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
149 dR &= 0xff;
150 dG &= 0xff;
151 dB &= 0xff;
152 // Pack RGB into 8bit pixel
153 if ( palmap == NULL ) {
154 *dst = (Uint8)(((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0)));
155 } else {
156 *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
157 }
158 }
159 dst++;
160 src += srcbpp;
161 },
162 width);
163 /* *INDENT-ON* */ // clang-format on
164 src += srcskip;
165 dst += dstskip;
166 }
167}
168
169#ifdef SDL_SSE2_INTRINSICS
170
171static void SDL_TARGETING("sse2") Blit888to888SurfaceAlphaSSE2(SDL_BlitInfo *info)
172{
173 int width = info->dst_w;
174 int height = info->dst_h;
175 Uint8 *src = info->src;
176 int srcskip = info->src_skip;
177 Uint8 *dst = info->dst;
178 int dstskip = info->dst_skip;
179 Uint8 alpha = info->a;
180
181 const __m128i alpha_fill_mask = _mm_set1_epi32((int)0xff000000);
182 const __m128i srcA = _mm_set1_epi16(alpha);
183
184 while (height--) {
185 int i = 0;
186
187 for (; i + 4 <= width; i += 4) {
188 // Load 4 src pixels
189 __m128i src128 = _mm_loadu_si128((__m128i *)src);
190
191 // Load 4 dst pixels
192 __m128i dst128 = _mm_loadu_si128((__m128i *)dst);
193
194 __m128i src_lo = _mm_unpacklo_epi8(src128, _mm_setzero_si128());
195 __m128i src_hi = _mm_unpackhi_epi8(src128, _mm_setzero_si128());
196
197 __m128i dst_lo = _mm_unpacklo_epi8(dst128, _mm_setzero_si128());
198 __m128i dst_hi = _mm_unpackhi_epi8(dst128, _mm_setzero_si128());
199
200 // dst = ((src - dst) * srcA) + ((dst << 8) - dst)
201 dst_lo = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_lo, dst_lo), srcA),
202 _mm_sub_epi16(_mm_slli_epi16(dst_lo, 8), dst_lo));
203 dst_hi = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_hi, dst_hi), srcA),
204 _mm_sub_epi16(_mm_slli_epi16(dst_hi, 8), dst_hi));
205
206 // dst += 0x1U (use 0x80 to round instead of floor)
207 dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1));
208 dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1));
209
210 // dst = (dst + (dst >> 8)) >> 8
211 dst_lo = _mm_srli_epi16(_mm_add_epi16(dst_lo, _mm_srli_epi16(dst_lo, 8)), 8);
212 dst_hi = _mm_srli_epi16(_mm_add_epi16(dst_hi, _mm_srli_epi16(dst_hi, 8)), 8);
213
214 dst128 = _mm_packus_epi16(dst_lo, dst_hi);
215
216 // Set the alpha channels of dst to 255
217 dst128 = _mm_or_si128(dst128, alpha_fill_mask);
218
219 _mm_storeu_si128((__m128i *)dst, dst128);
220
221 src += 16;
222 dst += 16;
223 }
224
225 for (; i < width; ++i) {
226 Uint32 src32 = *(Uint32 *)src;
227 Uint32 dst32 = *(Uint32 *)dst;
228
229 FACTOR_BLEND_8888(src32, dst32, alpha);
230
231 *dst = dst32 | 0xff000000;
232
233 src += 4;
234 dst += 4;
235 }
236
237 src += srcskip;
238 dst += dstskip;
239 }
240}
241
242#endif
243
244// fast RGB888->(A)RGB888 blending with surface alpha=128 special case
245static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
246{
247 int width = info->dst_w;
248 int height = info->dst_h;
249 Uint32 *srcp = (Uint32 *)info->src;
250 int srcskip = info->src_skip >> 2;
251 Uint32 *dstp = (Uint32 *)info->dst;
252 int dstskip = info->dst_skip >> 2;
253
254 while (height--) {
255 /* *INDENT-OFF* */ // clang-format off
256 DUFFS_LOOP({
257 Uint32 s = *srcp++;
258 Uint32 d = *dstp;
259 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
260 + (s & d & 0x00010101)) | 0xff000000;
261 }, width);
262 /* *INDENT-ON* */ // clang-format on
263 srcp += srcskip;
264 dstp += dstskip;
265 }
266}
267
268// fast RGB888->(A)RGB888 blending with surface alpha
269static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
270{
271 unsigned alpha = info->a;
272 if (alpha == 128) {
273 BlitRGBtoRGBSurfaceAlpha128(info);
274 } else {
275 int width = info->dst_w;
276 int height = info->dst_h;
277 Uint32 *srcp = (Uint32 *)info->src;
278 int srcskip = info->src_skip >> 2;
279 Uint32 *dstp = (Uint32 *)info->dst;
280 int dstskip = info->dst_skip >> 2;
281 Uint32 s;
282 Uint32 d;
283
284 while (height--) {
285 /* *INDENT-OFF* */ // clang-format off
286 DUFFS_LOOP({
287 s = *srcp;
288 d = *dstp;
289
290 FACTOR_BLEND_8888(s, d, alpha);
291
292 *dstp = d | 0xff000000;
293 ++srcp;
294 ++dstp;
295 }, width);
296 /* *INDENT-ON* */ // clang-format on
297 srcp += srcskip;
298 dstp += dstskip;
299 }
300 }
301}
302
303// 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel
304
/*
 * Blend a single 16 bit pixel at 50%.
 *
 * `mask` selects every bit except the lowest bit of each colour channel, so
 * the masked values can be summed and halved without one channel spilling
 * into the next; the bit that both inputs have set in a channel's lowest
 * position is then added back.
 */
#define BLEND16_50(d, s, mask) \
    (((((s) & (mask)) + ((d) & (mask))) >> 1) + ((s) & (d) & (~(mask) & 0xffff)))

/*
 * Blend two 16 bit pixels, packed in one 32-bit word, at 50%.
 *
 * The 16-bit mask is replicated into both halves. The shift is performed in
 * Uint32 because shifting a promoted (signed) int left by 16 is undefined
 * when bit 15 of the mask is set.
 */
#define BLEND2x16_50(d, s, mask)                                   \
    ((((s) & ((mask) | ((Uint32)(mask) << 16))) >> 1) +            \
     (((d) & ((mask) | ((Uint32)(mask) << 16))) >> 1) +            \
     ((s) & (d) & (~((mask) | ((Uint32)(mask) << 16)))))
312
/*
 * Blend a 16bpp source onto a 16bpp destination at exactly 50%.
 *
 * `mask` is passed straight to BLEND16_50 / BLEND2x16_50; the callers in this
 * file pass 0xf7de for RGB565 and 0xfbde for RGB555.
 *
 * Pixels are blended two at a time through 32-bit accesses to the
 * destination. Each row takes one of two paths depending on whether the
 * source and destination pointers share the same position within a 32-bit
 * word (bit 1 of the address).
 */
static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
{
    int width = info->dst_w;
    int height = info->dst_h;
    Uint16 *srcp = (Uint16 *)info->src;
    int srcskip = info->src_skip >> 1; // skips are in bytes; convert to 16-bit pixels
    Uint16 *dstp = (Uint16 *)info->dst;
    int dstskip = info->dst_skip >> 1;

    while (height--) {
        // Bit 1 differs => the two pointers sit in different halves of a 32-bit word
        if (((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
            /*
             * Source and destination not aligned, pipeline it.
             * This is mostly a win for big blits but no loss for
             * small ones
             */
            Uint32 prev_sw;
            int w = width;

            // handle odd destination
            if ((uintptr_t)dstp & 2) {
                Uint16 d = *dstp, s = *srcp;
                *dstp = BLEND16_50(d, s, mask);
                dstp++;
                srcp++;
                w--;
            }
            // From here on srcp points one pixel PAST the next pixel to blend
            srcp++; // srcp is now 32-bit aligned

            // bootstrap pipeline with first halfword
            // (the 32-bit word just below srcp holds the next source pixel)
            prev_sw = ((Uint32 *)srcp)[-1];

            while (w > 1) {
                Uint32 sw, dw, s;
                sw = *(Uint32 *)srcp;
                dw = *(Uint32 *)dstp;
                // Combine the pixel left over from the previous word with the
                // first pixel of the new word, in destination memory order
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
                s = (prev_sw << 16) + (sw >> 16);
#else
                s = (prev_sw >> 16) + (sw << 16);
#endif
                prev_sw = sw;
                *(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
                dstp += 2;
                srcp += 2;
                w -= 2;
            }

            // final pixel if any
            if (w) {
                Uint16 d = *dstp, s;
                // The last source pixel is still pending in prev_sw
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
                s = (Uint16)prev_sw;
#else
                s = (Uint16)(prev_sw >> 16);
#endif
                *dstp = BLEND16_50(d, s, mask);
                srcp++;
                dstp++;
            }
            // -1 undoes the extra srcp++ that aligned the source pointer above
            srcp += srcskip - 1;
            dstp += dstskip;
        } else {
            // source and destination are aligned
            int w = width;

            // first odd pixel?
            if ((uintptr_t)srcp & 2) {
                Uint16 d = *dstp, s = *srcp;
                *dstp = BLEND16_50(d, s, mask);
                srcp++;
                dstp++;
                w--;
            }
            // srcp and dstp are now 32-bit aligned

            while (w > 1) {
                Uint32 sw = *(Uint32 *)srcp;
                Uint32 dw = *(Uint32 *)dstp;
                *(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
                srcp += 2;
                dstp += 2;
                w -= 2;
            }

            // last odd pixel?
            if (w) {
                Uint16 d = *dstp, s = *srcp;
                *dstp = BLEND16_50(d, s, mask);
                srcp++;
                dstp++;
            }
            srcp += srcskip;
            dstp += dstskip;
        }
    }
}
410
411#ifdef SDL_MMX_INTRINSICS
412
/*
 * Fast RGB565->RGB565 blending with surface alpha (MMX version).
 *
 * An alpha of exactly 128 is delegated to Blit16to16SurfaceAlpha128() with
 * the 565 mask 0xf7de. Otherwise DUFFS_LOOP_124 is given three bodies: one
 * that blends a single pixel, one that blends two pixels, and an MMX body
 * that blends four pixels per iteration. The scalar bodies use the 5-bit
 * alpha (info->a >> 3).
 */
static void SDL_TARGETING("mmx") Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
{
    unsigned alpha = info->a;
    if (alpha == 128) {
        Blit16to16SurfaceAlpha128(info, 0xf7de);
    } else {
        int width = info->dst_w;
        int height = info->dst_h;
        Uint16 *srcp = (Uint16 *)info->src;
        int srcskip = info->src_skip >> 1; // bytes -> 16-bit pixels
        Uint16 *dstp = (Uint16 *)info->dst;
        int dstskip = info->dst_skip >> 1;
        Uint32 s, d;

        // NOTE(review): the MMX variables and the 5-bit alpha reduction below are
        // only set up when USE_DUFFS_LOOP is defined; presumably DUFFS_LOOP_124
        // drops the four-pixel body otherwise -- confirm against its definition.
#ifdef USE_DUFFS_LOOP
        __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;

        alpha &= ~(1 + 2 + 4);             // cut alpha to get the exact same behaviour
        mm_alpha = _mm_set_pi32(0, alpha); // 0000000A -> mm_alpha
        alpha >>= 3;                       // downscale alpha to 5 bits

        mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); // 00000A0A -> mm_alpha
        mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); // 0A0A0A0A -> mm_alpha
        /* position alpha to allow for mullo and mulhi on diff channels
           to reduce the number of operations */
        mm_alpha = _mm_slli_si64(mm_alpha, 3);

        // Setup the 565 color channel masks
        gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); // MASKGREEN -> gmask
        bmask = _mm_set_pi32(0x001F001F, 0x001F001F); // MASKBLUE -> bmask
#endif

        while (height--) {
            /* *INDENT-OFF* */ // clang-format off
            DUFFS_LOOP_124(
            {
                // body 1: blend one pixel
                s = *srcp++;
                d = *dstp;
                /*
                 * shift out the middle component (green) to
                 * the high 16 bits, and process all three RGB
                 * components at the same time.
                 */
                s = (s | s << 16) & 0x07e0f81f;
                d = (d | d << 16) & 0x07e0f81f;
                d += (s - d) * alpha >> 5;
                d &= 0x07e0f81f;
                *dstp++ = (Uint16)(d | d >> 16);
            },{
                // body 2: blend two pixels, one after the other
                s = *srcp++;
                d = *dstp;
                /*
                 * shift out the middle component (green) to
                 * the high 16 bits, and process all three RGB
                 * components at the same time.
                 */
                s = (s | s << 16) & 0x07e0f81f;
                d = (d | d << 16) & 0x07e0f81f;
                d += (s - d) * alpha >> 5;
                d &= 0x07e0f81f;
                *dstp++ = (Uint16)(d | d >> 16);
                s = *srcp++;
                d = *dstp;
                /*
                 * shift out the middle component (green) to
                 * the high 16 bits, and process all three RGB
                 * components at the same time.
                 */
                s = (s | s << 16) & 0x07e0f81f;
                d = (d | d << 16) & 0x07e0f81f;
                d += (s - d) * alpha >> 5;
                d &= 0x07e0f81f;
                *dstp++ = (Uint16)(d | d >> 16);
            },{
                // body 4: blend four pixels with MMX, one channel at a time
                src1 = *(__m64*)srcp; // 4 src pixels -> src1
                dst1 = *(__m64*)dstp; // 4 dst pixels -> dst1

                // red
                src2 = src1;
                src2 = _mm_srli_pi16(src2, 11); // src2 >> 11 -> src2 [000r 000r 000r 000r]

                dst2 = dst1;
                dst2 = _mm_srli_pi16(dst2, 11); // dst2 >> 11 -> dst2 [000r 000r 000r 000r]

                // blend
                src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
                src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
                src2 = _mm_srli_pi16(src2, 11); // src2 >> 11 -> src2
                dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2
                dst2 = _mm_slli_pi16(dst2, 11); // dst2 << 11 -> dst2

                mm_res = dst2; // RED -> mm_res

                // green -- process the bits in place
                src2 = src1;
                src2 = _mm_and_si64(src2, gmask); // src & MASKGREEN -> src2

                dst2 = dst1;
                dst2 = _mm_and_si64(dst2, gmask); // dst & MASKGREEN -> dst2

                // blend
                src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
                src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
                src2 = _mm_slli_pi16(src2, 5); // src2 << 5 -> src2
                dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2

                mm_res = _mm_or_si64(mm_res, dst2); // RED | GREEN -> mm_res

                // blue
                src2 = src1;
                src2 = _mm_and_si64(src2, bmask); // src & MASKBLUE -> src2[000b 000b 000b 000b]

                dst2 = dst1;
                dst2 = _mm_and_si64(dst2, bmask); // dst & MASKBLUE -> dst2[000b 000b 000b 000b]

                // blend
                src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
                src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
                src2 = _mm_srli_pi16(src2, 11); // src2 >> 11 -> src2
                dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2
                dst2 = _mm_and_si64(dst2, bmask); // dst2 & MASKBLUE -> dst2

                mm_res = _mm_or_si64(mm_res, dst2); // RED | GREEN | BLUE -> mm_res

                *(__m64*)dstp = mm_res; // mm_res -> 4 dst pixels

                srcp += 4;
                dstp += 4;
            }, width);
            /* *INDENT-ON* */ // clang-format on
            srcp += srcskip;
            dstp += dstskip;
        }
        // Clear the MMX state once all rows have been processed
        _mm_empty();
    }
}
550
/*
 * Fast RGB555->RGB555 blending with surface alpha (MMX version).
 *
 * An alpha of exactly 128 is delegated to Blit16to16SurfaceAlpha128() with
 * the 555 mask 0xfbde. Otherwise DUFFS_LOOP_124 is given three bodies: one
 * that blends a single pixel, one that blends two pixels, and an MMX body
 * that blends four pixels per iteration. The scalar bodies use the 5-bit
 * alpha (info->a >> 3).
 */
static void SDL_TARGETING("mmx") Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
{
    unsigned alpha = info->a;
    if (alpha == 128) {
        Blit16to16SurfaceAlpha128(info, 0xfbde);
    } else {
        int width = info->dst_w;
        int height = info->dst_h;
        Uint16 *srcp = (Uint16 *)info->src;
        int srcskip = info->src_skip >> 1; // bytes -> 16-bit pixels
        Uint16 *dstp = (Uint16 *)info->dst;
        int dstskip = info->dst_skip >> 1;
        Uint32 s, d;

        // NOTE(review): the MMX variables and the 5-bit alpha reduction below are
        // only set up when USE_DUFFS_LOOP is defined; presumably DUFFS_LOOP_124
        // drops the four-pixel body otherwise -- confirm against its definition.
#ifdef USE_DUFFS_LOOP
        __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;

        alpha &= ~(1 + 2 + 4);             // cut alpha to get the exact same behaviour
        mm_alpha = _mm_set_pi32(0, alpha); // 0000000A -> mm_alpha
        alpha >>= 3;                       // downscale alpha to 5 bits

        mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); // 00000A0A -> mm_alpha
        mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); // 0A0A0A0A -> mm_alpha
        /* position alpha to allow for mullo and mulhi on diff channels
           to reduce the number of operations */
        mm_alpha = _mm_slli_si64(mm_alpha, 3);

        // Setup the 555 color channel masks
        rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); // MASKRED -> rmask
        gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); // MASKGREEN -> gmask
        bmask = _mm_set_pi32(0x001F001F, 0x001F001F); // MASKBLUE -> bmask
#endif
        while (height--) {
            /* *INDENT-OFF* */ // clang-format off
            DUFFS_LOOP_124(
            {
                // body 1: blend one pixel
                s = *srcp++;
                d = *dstp;
                /*
                 * shift out the middle component (green) to
                 * the high 16 bits, and process all three RGB
                 * components at the same time.
                 */
                s = (s | s << 16) & 0x03e07c1f;
                d = (d | d << 16) & 0x03e07c1f;
                d += (s - d) * alpha >> 5;
                d &= 0x03e07c1f;
                *dstp++ = (Uint16)(d | d >> 16);
            },{
                // body 2: blend two pixels, one after the other
                s = *srcp++;
                d = *dstp;
                /*
                 * shift out the middle component (green) to
                 * the high 16 bits, and process all three RGB
                 * components at the same time.
                 */
                s = (s | s << 16) & 0x03e07c1f;
                d = (d | d << 16) & 0x03e07c1f;
                d += (s - d) * alpha >> 5;
                d &= 0x03e07c1f;
                *dstp++ = (Uint16)(d | d >> 16);
                s = *srcp++;
                d = *dstp;
                /*
                 * shift out the middle component (green) to
                 * the high 16 bits, and process all three RGB
                 * components at the same time.
                 */
                s = (s | s << 16) & 0x03e07c1f;
                d = (d | d << 16) & 0x03e07c1f;
                d += (s - d) * alpha >> 5;
                d &= 0x03e07c1f;
                *dstp++ = (Uint16)(d | d >> 16);
            },{
                // body 4: blend four pixels with MMX, one channel at a time
                src1 = *(__m64*)srcp; // 4 src pixels -> src1
                dst1 = *(__m64*)dstp; // 4 dst pixels -> dst1

                // red -- process the bits in place
                src2 = src1;
                src2 = _mm_and_si64(src2, rmask); // src & MASKRED -> src2

                dst2 = dst1;
                dst2 = _mm_and_si64(dst2, rmask); // dst & MASKRED -> dst2

                // blend
                src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
                src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
                src2 = _mm_slli_pi16(src2, 5); // src2 << 5 -> src2
                dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2
                dst2 = _mm_and_si64(dst2, rmask); // dst2 & MASKRED -> dst2

                mm_res = dst2; // RED -> mm_res

                // green -- process the bits in place
                src2 = src1;
                src2 = _mm_and_si64(src2, gmask); // src & MASKGREEN -> src2

                dst2 = dst1;
                dst2 = _mm_and_si64(dst2, gmask); // dst & MASKGREEN -> dst2

                // blend
                src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
                src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
                src2 = _mm_slli_pi16(src2, 5); // src2 << 5 -> src2
                dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2

                mm_res = _mm_or_si64(mm_res, dst2); // RED | GREEN -> mm_res

                // blue
                src2 = src1; // src -> src2
                src2 = _mm_and_si64(src2, bmask); // src & MASKBLUE -> src2[000b 000b 000b 000b]

                dst2 = dst1; // dst -> dst2
                dst2 = _mm_and_si64(dst2, bmask); // dst & MASKBLUE -> dst2[000b 000b 000b 000b]

                // blend
                src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
                src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
                src2 = _mm_srli_pi16(src2, 11); // src2 >> 11 -> src2
                dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2
                dst2 = _mm_and_si64(dst2, bmask); // dst2 & MASKBLUE -> dst2

                mm_res = _mm_or_si64(mm_res, dst2); // RED | GREEN | BLUE -> mm_res

                *(__m64*)dstp = mm_res; // mm_res -> 4 dst pixels

                srcp += 4;
                dstp += 4;
            }, width);
            /* *INDENT-ON* */ // clang-format on
            srcp += srcskip;
            dstp += dstskip;
        }
        // Clear the MMX state once all rows have been processed
        _mm_empty();
    }
}
688
689#endif // SDL_MMX_INTRINSICS
690
691// fast RGB565->RGB565 blending with surface alpha
692static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
693{
694 unsigned alpha = info->a;
695 if (alpha == 128) {
696 Blit16to16SurfaceAlpha128(info, 0xf7de);
697 } else {
698 int width = info->dst_w;
699 int height = info->dst_h;
700 Uint16 *srcp = (Uint16 *)info->src;
701 int srcskip = info->src_skip >> 1;
702 Uint16 *dstp = (Uint16 *)info->dst;
703 int dstskip = info->dst_skip >> 1;
704 alpha >>= 3; // downscale alpha to 5 bits
705
706 while (height--) {
707 /* *INDENT-OFF* */ // clang-format off
708 DUFFS_LOOP({
709 Uint32 s = *srcp++;
710 Uint32 d = *dstp;
711 /*
712 * shift out the middle component (green) to
713 * the high 16 bits, and process all three RGB
714 * components at the same time.
715 */
716 s = (s | s << 16) & 0x07e0f81f;
717 d = (d | d << 16) & 0x07e0f81f;
718 d += (s - d) * alpha >> 5;
719 d &= 0x07e0f81f;
720 *dstp++ = (Uint16)(d | d >> 16);
721 }, width);
722 /* *INDENT-ON* */ // clang-format on
723 srcp += srcskip;
724 dstp += dstskip;
725 }
726 }
727}
728
729// fast RGB555->RGB555 blending with surface alpha
730static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
731{
732 unsigned alpha = info->a; // downscale alpha to 5 bits
733 if (alpha == 128) {
734 Blit16to16SurfaceAlpha128(info, 0xfbde);
735 } else {
736 int width = info->dst_w;
737 int height = info->dst_h;
738 Uint16 *srcp = (Uint16 *)info->src;
739 int srcskip = info->src_skip >> 1;
740 Uint16 *dstp = (Uint16 *)info->dst;
741 int dstskip = info->dst_skip >> 1;
742 alpha >>= 3; // downscale alpha to 5 bits
743
744 while (height--) {
745 /* *INDENT-OFF* */ // clang-format off
746 DUFFS_LOOP({
747 Uint32 s = *srcp++;
748 Uint32 d = *dstp;
749 /*
750 * shift out the middle component (green) to
751 * the high 16 bits, and process all three RGB
752 * components at the same time.
753 */
754 s = (s | s << 16) & 0x03e07c1f;
755 d = (d | d << 16) & 0x03e07c1f;
756 d += (s - d) * alpha >> 5;
757 d &= 0x03e07c1f;
758 *dstp++ = (Uint16)(d | d >> 16);
759 }, width);
760 /* *INDENT-ON* */ // clang-format on
761 srcp += srcskip;
762 dstp += dstskip;
763 }
764 }
765}
766
767// fast ARGB8888->RGB565 blending with pixel alpha
768static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
769{
770 int width = info->dst_w;
771 int height = info->dst_h;
772 Uint32 *srcp = (Uint32 *)info->src;
773 int srcskip = info->src_skip >> 2;
774 Uint16 *dstp = (Uint16 *)info->dst;
775 int dstskip = info->dst_skip >> 1;
776
777 while (height--) {
778 /* *INDENT-OFF* */ // clang-format off
779 DUFFS_LOOP({
780 Uint32 s = *srcp;
781 unsigned alpha = s >> 27; // downscale alpha to 5 bits
782 /* Here we special-case opaque alpha since the
783 compositioning used (>>8 instead of /255) doesn't handle
784 it correctly. */
785 if (alpha) {
786 if (alpha == (SDL_ALPHA_OPAQUE >> 3)) {
787 *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3 & 0x1f));
788 } else {
789 Uint32 d = *dstp;
790 /*
791 * convert source and destination to G0RAB65565
792 * and blend all components at the same time
793 */
794 s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800) + (s >> 3 & 0x1f);
795 d = (d | d << 16) & 0x07e0f81f;
796 d += (s - d) * alpha >> 5;
797 d &= 0x07e0f81f;
798 *dstp = (Uint16)(d | d >> 16);
799 }
800 }
801 srcp++;
802 dstp++;
803 }, width);
804 /* *INDENT-ON* */ // clang-format on
805 srcp += srcskip;
806 dstp += dstskip;
807 }
808}
809
810// fast ARGB8888->RGB555 blending with pixel alpha
811static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
812{
813 int width = info->dst_w;
814 int height = info->dst_h;
815 Uint32 *srcp = (Uint32 *)info->src;
816 int srcskip = info->src_skip >> 2;
817 Uint16 *dstp = (Uint16 *)info->dst;
818 int dstskip = info->dst_skip >> 1;
819
820 while (height--) {
821 /* *INDENT-OFF* */ // clang-format off
822 DUFFS_LOOP({
823 unsigned alpha;
824 Uint32 s = *srcp;
825 alpha = s >> 27; // downscale alpha to 5 bits
826 /* Here we special-case opaque alpha since the
827 compositioning used (>>8 instead of /255) doesn't handle
828 it correctly. */
829 if (alpha) {
830 if (alpha == (SDL_ALPHA_OPAQUE >> 3)) {
831 *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3 & 0x1f));
832 } else {
833 Uint32 d = *dstp;
834 /*
835 * convert source and destination to G0RAB55555
836 * and blend all components at the same time
837 */
838 s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00) + (s >> 3 & 0x1f);
839 d = (d | d << 16) & 0x03e07c1f;
840 d += (s - d) * alpha >> 5;
841 d &= 0x03e07c1f;
842 *dstp = (Uint16)(d | d >> 16);
843 }
844 }
845 srcp++;
846 dstp++;
847 }, width);
848 /* *INDENT-ON* */ // clang-format on
849 srcp += srcskip;
850 dstp += dstskip;
851 }
852}
853
854// General (slow) N->N blending with per-surface alpha
855static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
856{
857 int width = info->dst_w;
858 int height = info->dst_h;
859 Uint8 *src = info->src;
860 int srcskip = info->src_skip;
861 Uint8 *dst = info->dst;
862 int dstskip = info->dst_skip;
863 const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
864 const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
865 int srcbpp = srcfmt->bytes_per_pixel;
866 int dstbpp = dstfmt->bytes_per_pixel;
867 Uint32 Pixel;
868 unsigned sR, sG, sB;
869 unsigned dR, dG, dB, dA;
870 const unsigned sA = info->a;
871
872 if (sA) {
873 while (height--) {
874 /* *INDENT-OFF* */ // clang-format off
875 DUFFS_LOOP(
876 {
877 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
878 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
879 ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
880 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
881 src += srcbpp;
882 dst += dstbpp;
883 },
884 width);
885 /* *INDENT-ON* */ // clang-format on
886 src += srcskip;
887 dst += dstskip;
888 }
889 }
890}
891
892// General (slow) colorkeyed N->N blending with per-surface alpha
893static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
894{
895 int width = info->dst_w;
896 int height = info->dst_h;
897 Uint8 *src = info->src;
898 int srcskip = info->src_skip;
899 Uint8 *dst = info->dst;
900 int dstskip = info->dst_skip;
901 const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
902 const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
903 Uint32 ckey = info->colorkey;
904 int srcbpp = srcfmt->bytes_per_pixel;
905 int dstbpp = dstfmt->bytes_per_pixel;
906 Uint32 Pixel;
907 unsigned sR, sG, sB;
908 unsigned dR, dG, dB, dA;
909 const unsigned sA = info->a;
910
911 while (height--) {
912 /* *INDENT-OFF* */ // clang-format off
913 DUFFS_LOOP(
914 {
915 RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
916 if (sA && Pixel != ckey) {
917 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
918 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
919 ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
920 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
921 }
922 src += srcbpp;
923 dst += dstbpp;
924 },
925 width);
926 /* *INDENT-ON* */ // clang-format on
927 src += srcskip;
928 dst += dstskip;
929 }
930}
931
932// Fast 32-bit RGBA->RGBA blending with pixel alpha
933static void Blit8888to8888PixelAlpha(SDL_BlitInfo *info)
934{
935 int width = info->dst_w;
936 int height = info->dst_h;
937 Uint8 *src = info->src;
938 int srcskip = info->src_skip;
939 Uint8 *dst = info->dst;
940 int dstskip = info->dst_skip;
941 const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
942
943 while (height--) {
944 int i = 0;
945
946 for (; i < width; ++i) {
947 Uint32 src32 = *(Uint32 *)src;
948 Uint32 dst32 = *(Uint32 *)dst;
949 ALPHA_BLEND_8888(src32, dst32, srcfmt);
950 *(Uint32 *)dst = dst32;
951 src += 4;
952 dst += 4;
953 }
954
955 src += srcskip;
956 dst += dstskip;
957 }
958}
959
960// Fast 32-bit RGBA->RGB(A) blending with pixel alpha and src swizzling
961static void Blit8888to8888PixelAlphaSwizzle(SDL_BlitInfo *info)
962{
963 int width = info->dst_w;
964 int height = info->dst_h;
965 Uint8 *src = info->src;
966 int srcskip = info->src_skip;
967 Uint8 *dst = info->dst;
968 int dstskip = info->dst_skip;
969 const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
970 const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
971
972 while (height--) {
973 int i = 0;
974
975 for (; i < width; ++i) {
976 Uint32 src32 = *(Uint32 *)src;
977 Uint32 dst32 = *(Uint32 *)dst;
978 ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt);
979 *(Uint32 *)dst = dst32;
980 src += 4;
981 dst += 4;
982 }
983
984 src += srcskip;
985 dst += dstskip;
986 }
987}
988
989#ifdef SDL_SSE4_1_INTRINSICS
990
/*
 * SSE4.1 version of the 32-bit per-pixel-alpha blend with source swizzling.
 *
 * Rows are processed four pixels (16 bytes) at a time with unaligned
 * loads/stores: the source is byte-shuffled into the destination channel
 * order, its alpha bytes are forced on via dstfmt->Amask, and source and
 * destination are blended with the per-pixel source alpha. Any remaining
 * 1-3 pixels of a row go through the scalar ALPHA_BLEND_SWIZZLE_8888 macro.
 */
static void SDL_TARGETING("sse4.1") Blit8888to8888PixelAlphaSwizzleSSE41(SDL_BlitInfo *info)
{
    int width = info->dst_w;
    int height = info->dst_h;
    Uint8 *src = info->src;
    int srcskip = info->src_skip;
    Uint8 *dst = info->dst;
    int dstskip = info->dst_skip;
    const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
    const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;

    // The byte offsets for the start of each pixel
    const __m128i mask_offsets = _mm_set_epi8(
        12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);

    // Shuffle control: for each destination channel byte, the index of the
    // source byte holding that channel (shift >> 3 turns a bit shift into a
    // byte index), offset to the pixel's position in the vector
    const __m128i convert_mask = _mm_add_epi32(
        _mm_set1_epi32(
            ((srcfmt->Rshift >> 3) << dstfmt->Rshift) |
            ((srcfmt->Gshift >> 3) << dstfmt->Gshift) |
            ((srcfmt->Bshift >> 3) << dstfmt->Bshift)),
        mask_offsets);

    // Shuffle control that copies each pixel's source alpha byte into all four of its bytes
    const __m128i alpha_splat_mask = _mm_add_epi8(_mm_set1_epi8(srcfmt->Ashift >> 3), mask_offsets);
    const __m128i alpha_fill_mask = _mm_set1_epi32((int)dstfmt->Amask);

    while (height--) {
        int i = 0;

        for (; i + 4 <= width; i += 4) {
            // Load 4 src pixels
            __m128i src128 = _mm_loadu_si128((__m128i *)src);

            // Load 4 dst pixels
            __m128i dst128 = _mm_loadu_si128((__m128i *)dst);

            // Extract the alpha from each pixel and splat it into all the channels
            __m128i srcA = _mm_shuffle_epi8(src128, alpha_splat_mask);

            // Convert to dst format
            src128 = _mm_shuffle_epi8(src128, convert_mask);

            // Set the alpha channels of src to 255
            src128 = _mm_or_si128(src128, alpha_fill_mask);

            // Duplicate each 8-bit alpha value into both bytes of 16-bit lanes
            __m128i srca_lo = _mm_unpacklo_epi8(srcA, srcA);
            __m128i srca_hi = _mm_unpackhi_epi8(srcA, srcA);

            // Calculate 255-srcA in every second 8-bit lane (255-srcA = srcA^0xff)
            srca_lo = _mm_xor_si128(srca_lo, _mm_set1_epi16(0xff00));
            srca_hi = _mm_xor_si128(srca_hi, _mm_set1_epi16(0xff00));

            // maddubs expects second argument to be signed, so subtract 128
            src128 = _mm_sub_epi8(src128, _mm_set1_epi8((Uint8)128));
            dst128 = _mm_sub_epi8(dst128, _mm_set1_epi8((Uint8)128));

            // dst = srcA*(src-128) + (255-srcA)*(dst-128) = srcA*src + (255-srcA)*dst - 128*255
            __m128i dst_lo = _mm_maddubs_epi16(srca_lo, _mm_unpacklo_epi8(src128, dst128));
            __m128i dst_hi = _mm_maddubs_epi16(srca_hi, _mm_unpackhi_epi8(src128, dst128));

            // dst += 0x1U (use 0x80 to round instead of floor) + 128*255 (to fix maddubs result)
            dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1 + 128*255));
            dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1 + 128*255));

            // dst = (dst + (dst >> 8)) >> 8 = (dst * 257) >> 16
            dst_lo = _mm_mulhi_epu16(dst_lo, _mm_set1_epi16(257));
            dst_hi = _mm_mulhi_epu16(dst_hi, _mm_set1_epi16(257));

            // Blend the pixels together and save the result
            _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(dst_lo, dst_hi));

            src += 16;
            dst += 16;
        }

        // Scalar tail: the 1-3 pixels that did not fill a whole vector
        for (; i < width; ++i) {
            Uint32 src32 = *(Uint32 *)src;
            Uint32 dst32 = *(Uint32 *)dst;
            ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt);
            *(Uint32 *)dst = dst32;
            src += 4;
            dst += 4;
        }

        // Step over the row padding
        src += srcskip;
        dst += dstskip;
    }
}
1079
1080#endif
1081
1082#ifdef SDL_AVX2_INTRINSICS
1083
1084static void SDL_TARGETING("avx2") Blit8888to8888PixelAlphaSwizzleAVX2(SDL_BlitInfo *info)
1085{
1086 int width = info->dst_w;
1087 int height = info->dst_h;
1088 Uint8 *src = info->src;
1089 int srcskip = info->src_skip;
1090 Uint8 *dst = info->dst;
1091 int dstskip = info->dst_skip;
1092 const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
1093 const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
1094
1095 // The byte offsets for the start of each pixel
1096 const __m256i mask_offsets = _mm256_set_epi8(
1097 28, 28, 28, 28, 24, 24, 24, 24, 20, 20, 20, 20, 16, 16, 16, 16, 12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);
1098
1099 const __m256i convert_mask = _mm256_add_epi32(
1100 _mm256_set1_epi32(
1101 ((srcfmt->Rshift >> 3) << dstfmt->Rshift) |
1102 ((srcfmt->Gshift >> 3) << dstfmt->Gshift) |
1103 ((srcfmt->Bshift >> 3) << dstfmt->Bshift)),
1104 mask_offsets);
1105
1106 const __m256i alpha_splat_mask = _mm256_add_epi8(_mm256_set1_epi8(srcfmt->Ashift >> 3), mask_offsets);
1107 const __m256i alpha_fill_mask = _mm256_set1_epi32((int)dstfmt->Amask);
1108
1109 while (height--) {
1110 int i = 0;
1111
1112 for (; i + 8 <= width; i += 8) {
1113 // Load 8 src pixels
1114 __m256i src256 = _mm256_loadu_si256((__m256i *)src);
1115
1116 // Load 8 dst pixels
1117 __m256i dst256 = _mm256_loadu_si256((__m256i *)dst);
1118
1119 // Extract the alpha from each pixel and splat it into all the channels
1120 __m256i srcA = _mm256_shuffle_epi8(src256, alpha_splat_mask);
1121
1122 // Convert to dst format
1123 src256 = _mm256_shuffle_epi8(src256, convert_mask);
1124
1125 // Set the alpha channels of src to 255
1126 src256 = _mm256_or_si256(src256, alpha_fill_mask);
1127
1128 // Duplicate each 8-bit alpha value into both bytes of 16-bit lanes
1129 __m256i alpha_lo = _mm256_unpacklo_epi8(srcA, srcA);
1130 __m256i alpha_hi = _mm256_unpackhi_epi8(srcA, srcA);
1131
1132 // Calculate 255-srcA in every second 8-bit lane (255-srcA = srcA^0xff)
1133 alpha_lo = _mm256_xor_si256(alpha_lo, _mm256_set1_epi16(0xff00));
1134 alpha_hi = _mm256_xor_si256(alpha_hi, _mm256_set1_epi16(0xff00));
1135
1136 // maddubs expects second argument to be signed, so subtract 128
1137 src256 = _mm256_sub_epi8(src256, _mm256_set1_epi8((Uint8)128));
1138 dst256 = _mm256_sub_epi8(dst256, _mm256_set1_epi8((Uint8)128));
1139
1140 // dst = srcA*(src-128) + (255-srcA)*(dst-128) = srcA*src + (255-srcA)*dst - 128*255
1141 __m256i dst_lo = _mm256_maddubs_epi16(alpha_lo, _mm256_unpacklo_epi8(src256, dst256));
1142 __m256i dst_hi = _mm256_maddubs_epi16(alpha_hi, _mm256_unpackhi_epi8(src256, dst256));
1143
1144 // dst += 0x1U (use 0x80 to round instead of floor) + 128*255 (to fix maddubs result)
1145 dst_lo = _mm256_add_epi16(dst_lo, _mm256_set1_epi16(1 + 128*255));
1146 dst_hi = _mm256_add_epi16(dst_hi, _mm256_set1_epi16(1 + 128*255));
1147
1148 // dst = (dst + (dst >> 8)) >> 8 = (dst * 257) >> 16
1149 dst_lo = _mm256_mulhi_epu16(dst_lo, _mm256_set1_epi16(257));
1150 dst_hi = _mm256_mulhi_epu16(dst_hi, _mm256_set1_epi16(257));
1151
1152 // Blend the pixels together and save the result
1153 _mm256_storeu_si256((__m256i *)dst, _mm256_packus_epi16(dst_lo, dst_hi));
1154
1155 src += 32;
1156 dst += 32;
1157 }
1158
1159 for (; i < width; ++i) {
1160 Uint32 src32 = *(Uint32 *)src;
1161 Uint32 dst32 = *(Uint32 *)dst;
1162 ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt);
1163 *(Uint32 *)dst = dst32;
1164 src += 4;
1165 dst += 4;
1166 }
1167
1168 src += srcskip;
1169 dst += dstskip;
1170 }
1171}
1172
1173#endif
1174
1175#if defined(SDL_NEON_INTRINSICS) && (__ARM_ARCH >= 8)
1176
1177static void Blit8888to8888PixelAlphaSwizzleNEON(SDL_BlitInfo *info)
1178{
1179 int width = info->dst_w;
1180 int height = info->dst_h;
1181 Uint8 *src = info->src;
1182 int srcskip = info->src_skip;
1183 Uint8 *dst = info->dst;
1184 int dstskip = info->dst_skip;
1185 const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
1186 const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
1187
1188 // The byte offsets for the start of each pixel
1189 const uint8x16_t mask_offsets = vreinterpretq_u8_u64(vcombine_u64(
1190 vcreate_u64(0x0404040400000000), vcreate_u64(0x0c0c0c0c08080808)));
1191
1192 const uint8x16_t convert_mask = vreinterpretq_u8_u32(vaddq_u32(
1193 vreinterpretq_u32_u8(mask_offsets),
1194 vdupq_n_u32(
1195 ((srcfmt->Rshift >> 3) << dstfmt->Rshift) |
1196 ((srcfmt->Gshift >> 3) << dstfmt->Gshift) |
1197 ((srcfmt->Bshift >> 3) << dstfmt->Bshift))));
1198
1199 const uint8x16_t alpha_splat_mask = vaddq_u8(vdupq_n_u8(srcfmt->Ashift >> 3), mask_offsets);
1200 const uint8x16_t alpha_fill_mask = vreinterpretq_u8_u32(vdupq_n_u32(dstfmt->Amask));
1201
1202 while (height--) {
1203 int i = 0;
1204
1205 for (; i + 4 <= width; i += 4) {
1206 // Load 4 src pixels
1207 uint8x16_t src128 = vld1q_u8(src);
1208
1209 // Load 4 dst pixels
1210 uint8x16_t dst128 = vld1q_u8(dst);
1211
1212 // Extract the alpha from each pixel and splat it into all the channels
1213 uint8x16_t srcA = vqtbl1q_u8(src128, alpha_splat_mask);
1214
1215 // Convert to dst format
1216 src128 = vqtbl1q_u8(src128, convert_mask);
1217
1218 // Set the alpha channels of src to 255
1219 src128 = vorrq_u8(src128, alpha_fill_mask);
1220
1221 // 255 - srcA = ~srcA
1222 uint8x16_t srcInvA = vmvnq_u8(srcA);
1223
1224 // Result initialized with 1, this is for truncated divide later
1225 uint16x8_t res_lo = vdupq_n_u16(1);
1226 uint16x8_t res_hi = vdupq_n_u16(1);
1227
1228 // res = alpha * src + (255 - alpha) * dst
1229 res_lo = vmlal_u8(res_lo, vget_low_u8(srcA), vget_low_u8(src128));
1230 res_lo = vmlal_u8(res_lo, vget_low_u8(srcInvA), vget_low_u8(dst128));
1231 res_hi = vmlal_high_u8(res_hi, srcA, src128);
1232 res_hi = vmlal_high_u8(res_hi, srcInvA, dst128);
1233
1234 // Now result has +1 already added for truncated division
1235 // dst = (res + (res >> 8)) >> 8
1236 uint8x8_t temp;
1237 temp = vaddhn_u16(res_lo, vshrq_n_u16(res_lo, 8));
1238 dst128 = vaddhn_high_u16(temp, res_hi, vshrq_n_u16(res_hi, 8));
1239
1240 // For rounded division remove the constant 1 and change first two vmlal_u8 to vmull_u8
1241 // Then replace two previous lines with following code:
1242 // temp = vraddhn_u16(res_lo, vrshrq_n_u16(res_lo, 8));
1243 // dst128 = vraddhn_high_u16(temp, res_hi, vrshrq_n_u16(res_hi, 8));
1244
1245 // Save the result
1246 vst1q_u8(dst, dst128);
1247
1248 src += 16;
1249 dst += 16;
1250 }
1251
1252 // Process 1 pixel per iteration, max 3 iterations, same calculations as above
1253 for (; i < width; ++i) {
1254 // Top 32-bits will be not used in src32 & dst32
1255 uint8x8_t src32 = vreinterpret_u8_u32(vld1_dup_u32((Uint32*)src));
1256 uint8x8_t dst32 = vreinterpret_u8_u32(vld1_dup_u32((Uint32*)dst));
1257
1258 uint8x8_t srcA = vtbl1_u8(src32, vget_low_u8(alpha_splat_mask));
1259 src32 = vtbl1_u8(src32, vget_low_u8(convert_mask));
1260 src32 = vorr_u8(src32, vget_low_u8(alpha_fill_mask));
1261 uint8x8_t srcInvA = vmvn_u8(srcA);
1262
1263 uint16x8_t res = vdupq_n_u16(1);
1264 res = vmlal_u8(res, srcA, src32);
1265 res = vmlal_u8(res, srcInvA, dst32);
1266
1267 dst32 = vaddhn_u16(res, vshrq_n_u16(res, 8));
1268
1269 // Save the result, only low 32-bits
1270 vst1_lane_u32((Uint32*)dst, vreinterpret_u32_u8(dst32), 0);
1271
1272 src += 4;
1273 dst += 4;
1274 }
1275
1276 src += srcskip;
1277 dst += dstskip;
1278 }
1279}
1280
1281#endif
1282
1283// General (slow) N->N blending with pixel alpha
1284static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
1285{
1286 int width = info->dst_w;
1287 int height = info->dst_h;
1288 Uint8 *src = info->src;
1289 int srcskip = info->src_skip;
1290 Uint8 *dst = info->dst;
1291 int dstskip = info->dst_skip;
1292 const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
1293 const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
1294 int srcbpp;
1295 int dstbpp;
1296 Uint32 Pixel;
1297 unsigned sR, sG, sB, sA;
1298 unsigned dR, dG, dB, dA;
1299
1300 // Set up some basic variables
1301 srcbpp = srcfmt->bytes_per_pixel;
1302 dstbpp = dstfmt->bytes_per_pixel;
1303
1304 while (height--) {
1305 DUFFS_LOOP(
1306 {
1307 DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
1308 if (sA) {
1309 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
1310 ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
1311 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
1312 }
1313 src += srcbpp;
1314 dst += dstbpp;
1315 },
1316 width);
1317 /* *INDENT-ON* */ // clang-format on
1318 src += srcskip;
1319 dst += dstskip;
1320 }
1321}
1322
1323SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface)
1324{
1325 const SDL_PixelFormatDetails *sf = surface->fmt;
1326 const SDL_PixelFormatDetails *df = surface->map.info.dst_fmt;
1327
1328 switch (surface->map.info.flags & ~SDL_COPY_RLE_MASK) {
1329 case SDL_COPY_BLEND:
1330 // Per-pixel alpha blits
1331 switch (df->bytes_per_pixel) {
1332 case 1:
1333 if (surface->map.info.dst_pal) {
1334 return BlitNto1PixelAlpha;
1335 } else {
1336 // RGB332 has no palette !
1337 return BlitNtoNPixelAlpha;
1338 }
1339
1340 case 2:
1341 if (sf->bytes_per_pixel == 4 && sf->Amask == 0xff000000 && sf->Gmask == 0xff00 && ((sf->Rmask == 0xff && df->Rmask == 0x1f) || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
1342 if (df->Gmask == 0x7e0) {
1343 return BlitARGBto565PixelAlpha;
1344 } else if (df->Gmask == 0x3e0 && !df->Amask) {
1345 return BlitARGBto555PixelAlpha;
1346 }
1347 }
1348 return BlitNtoNPixelAlpha;
1349
1350 case 4:
1351 if (SDL_PIXELLAYOUT(sf->format) == SDL_PACKEDLAYOUT_8888 && sf->Amask &&
1352 SDL_PIXELLAYOUT(df->format) == SDL_PACKEDLAYOUT_8888) {
1353#ifdef SDL_AVX2_INTRINSICS
1354 if (SDL_HasAVX2()) {
1355 return Blit8888to8888PixelAlphaSwizzleAVX2;
1356 }
1357#endif
1358#ifdef SDL_SSE4_1_INTRINSICS
1359 if (SDL_HasSSE41()) {
1360 return Blit8888to8888PixelAlphaSwizzleSSE41;
1361 }
1362#endif
1363#if defined(SDL_NEON_INTRINSICS) && (__ARM_ARCH >= 8)
1364 // To prevent "unused function" compiler warnings/errors
1365 (void)Blit8888to8888PixelAlpha;
1366 (void)Blit8888to8888PixelAlphaSwizzle;
1367 return Blit8888to8888PixelAlphaSwizzleNEON;
1368#else
1369 if (sf->format == df->format) {
1370 return Blit8888to8888PixelAlpha;
1371 } else {
1372 return Blit8888to8888PixelAlphaSwizzle;
1373 }
1374#endif
1375 }
1376 return BlitNtoNPixelAlpha;
1377
1378 case 3:
1379 default:
1380 break;
1381 }
1382 return BlitNtoNPixelAlpha;
1383
1384 case SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
1385 if (sf->Amask == 0) {
1386 // Per-surface alpha blits
1387 switch (df->bytes_per_pixel) {
1388 case 1:
1389 if (surface->map.info.dst_pal) {
1390 return BlitNto1SurfaceAlpha;
1391 } else {
1392 // RGB332 has no palette !
1393 return BlitNtoNSurfaceAlpha;
1394 }
1395
1396 case 2:
1397 if (surface->map.identity) {
1398 if (df->Gmask == 0x7e0) {
1399#ifdef SDL_MMX_INTRINSICS
1400 if (SDL_HasMMX()) {
1401 return Blit565to565SurfaceAlphaMMX;
1402 } else
1403#endif
1404 {
1405 return Blit565to565SurfaceAlpha;
1406 }
1407 } else if (df->Gmask == 0x3e0) {
1408#ifdef SDL_MMX_INTRINSICS
1409 if (SDL_HasMMX()) {
1410 return Blit555to555SurfaceAlphaMMX;
1411 } else
1412#endif
1413 {
1414 return Blit555to555SurfaceAlpha;
1415 }
1416 }
1417 }
1418 return BlitNtoNSurfaceAlpha;
1419
1420 case 4:
1421 if (sf->Rmask == df->Rmask && sf->Gmask == df->Gmask && sf->Bmask == df->Bmask && sf->bytes_per_pixel == 4) {
1422#ifdef SDL_SSE2_INTRINSICS
1423 if (sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0 && SDL_HasSSE2()) {
1424 return Blit888to888SurfaceAlphaSSE2;
1425 }
1426#endif
1427 if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
1428 return BlitRGBtoRGBSurfaceAlpha;
1429 }
1430 }
1431 return BlitNtoNSurfaceAlpha;
1432
1433 case 3:
1434 default:
1435 return BlitNtoNSurfaceAlpha;
1436 }
1437 }
1438 break;
1439
1440 case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
1441 if (sf->Amask == 0) {
1442 if (df->bytes_per_pixel == 1) {
1443
1444 if (surface->map.info.dst_pal) {
1445 return BlitNto1SurfaceAlphaKey;
1446 } else {
1447 // RGB332 has no palette !
1448 return BlitNtoNSurfaceAlphaKey;
1449 }
1450 } else {
1451 return BlitNtoNSurfaceAlphaKey;
1452 }
1453 }
1454 break;
1455 }
1456
1457 return NULL;
1458}
1459
1460#endif // SDL_HAVE_BLIT_A
1461
1462