1 | /* |
2 | Simple DirectMedia Layer |
3 | Copyright (C) 1997-2025 Sam Lantinga <slouken@libsdl.org> |
4 | |
5 | This software is provided 'as-is', without any express or implied |
6 | warranty. In no event will the authors be held liable for any damages |
7 | arising from the use of this software. |
8 | |
9 | Permission is granted to anyone to use this software for any purpose, |
10 | including commercial applications, and to alter it and redistribute it |
11 | freely, subject to the following restrictions: |
12 | |
13 | 1. The origin of this software must not be misrepresented; you must not |
14 | claim that you wrote the original software. If you use this software |
15 | in a product, an acknowledgment in the product documentation would be |
16 | appreciated but is not required. |
17 | 2. Altered source versions must be plainly marked as such, and must not be |
18 | misrepresented as being the original software. |
19 | 3. This notice may not be removed or altered from any source distribution. |
20 | */ |
21 | #include "SDL_internal.h" |
22 | |
23 | #ifdef SDL_HAVE_BLIT_A |
24 | |
25 | #include "SDL_surface_c.h" |
26 | |
27 | // Functions to perform alpha blended blitting |
28 | |
29 | // N->1 blending with per-surface alpha |
30 | static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info) |
31 | { |
32 | int width = info->dst_w; |
33 | int height = info->dst_h; |
34 | Uint8 *src = info->src; |
35 | int srcskip = info->src_skip; |
36 | Uint8 *dst = info->dst; |
37 | int dstskip = info->dst_skip; |
38 | Uint8 *palmap = info->table; |
39 | const SDL_PixelFormatDetails *srcfmt = info->src_fmt; |
40 | const SDL_Color *dstpal = info->dst_pal->colors; |
41 | int srcbpp = srcfmt->bytes_per_pixel; |
42 | Uint32 Pixel; |
43 | unsigned sR, sG, sB; |
44 | unsigned dR, dG, dB; |
45 | const unsigned A = info->a; |
46 | |
47 | while (height--) { |
48 | /* *INDENT-OFF* */ // clang-format off |
49 | DUFFS_LOOP( |
50 | { |
51 | DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB); |
52 | dR = dstpal[*dst].r; |
53 | dG = dstpal[*dst].g; |
54 | dB = dstpal[*dst].b; |
55 | ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB); |
56 | dR &= 0xff; |
57 | dG &= 0xff; |
58 | dB &= 0xff; |
59 | // Pack RGB into 8bit pixel |
60 | if ( palmap == NULL ) { |
61 | *dst = (Uint8)(((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))); |
62 | } else { |
63 | *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))]; |
64 | } |
65 | dst++; |
66 | src += srcbpp; |
67 | }, |
68 | width); |
69 | /* *INDENT-ON* */ // clang-format on |
70 | src += srcskip; |
71 | dst += dstskip; |
72 | } |
73 | } |
74 | |
75 | // N->1 blending with pixel alpha |
76 | static void BlitNto1PixelAlpha(SDL_BlitInfo *info) |
77 | { |
78 | int width = info->dst_w; |
79 | int height = info->dst_h; |
80 | Uint8 *src = info->src; |
81 | int srcskip = info->src_skip; |
82 | Uint8 *dst = info->dst; |
83 | int dstskip = info->dst_skip; |
84 | Uint8 *palmap = info->table; |
85 | const SDL_PixelFormatDetails *srcfmt = info->src_fmt; |
86 | const SDL_Color *dstpal = info->dst_pal->colors; |
87 | int srcbpp = srcfmt->bytes_per_pixel; |
88 | Uint32 Pixel; |
89 | unsigned sR, sG, sB, sA; |
90 | unsigned dR, dG, dB; |
91 | |
92 | while (height--) { |
93 | /* *INDENT-OFF* */ // clang-format off |
94 | DUFFS_LOOP( |
95 | { |
96 | DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA); |
97 | dR = dstpal[*dst].r; |
98 | dG = dstpal[*dst].g; |
99 | dB = dstpal[*dst].b; |
100 | ALPHA_BLEND_RGB(sR, sG, sB, sA, dR, dG, dB); |
101 | dR &= 0xff; |
102 | dG &= 0xff; |
103 | dB &= 0xff; |
104 | // Pack RGB into 8bit pixel |
105 | if ( palmap == NULL ) { |
106 | *dst = (Uint8)(((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))); |
107 | } else { |
108 | *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))]; |
109 | } |
110 | dst++; |
111 | src += srcbpp; |
112 | }, |
113 | width); |
114 | /* *INDENT-ON* */ // clang-format on |
115 | src += srcskip; |
116 | dst += dstskip; |
117 | } |
118 | } |
119 | |
120 | // colorkeyed N->1 blending with per-surface alpha |
121 | static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info) |
122 | { |
123 | int width = info->dst_w; |
124 | int height = info->dst_h; |
125 | Uint8 *src = info->src; |
126 | int srcskip = info->src_skip; |
127 | Uint8 *dst = info->dst; |
128 | int dstskip = info->dst_skip; |
129 | Uint8 *palmap = info->table; |
130 | const SDL_PixelFormatDetails *srcfmt = info->src_fmt; |
131 | const SDL_Color *dstpal = info->dst_pal->colors; |
132 | int srcbpp = srcfmt->bytes_per_pixel; |
133 | Uint32 ckey = info->colorkey; |
134 | Uint32 Pixel; |
135 | unsigned sR, sG, sB; |
136 | unsigned dR, dG, dB; |
137 | const unsigned A = info->a; |
138 | |
139 | while (height--) { |
140 | /* *INDENT-OFF* */ // clang-format off |
141 | DUFFS_LOOP( |
142 | { |
143 | DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB); |
144 | if ( Pixel != ckey ) { |
145 | dR = dstpal[*dst].r; |
146 | dG = dstpal[*dst].g; |
147 | dB = dstpal[*dst].b; |
148 | ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB); |
149 | dR &= 0xff; |
150 | dG &= 0xff; |
151 | dB &= 0xff; |
152 | // Pack RGB into 8bit pixel |
153 | if ( palmap == NULL ) { |
154 | *dst = (Uint8)(((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))); |
155 | } else { |
156 | *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))]; |
157 | } |
158 | } |
159 | dst++; |
160 | src += srcbpp; |
161 | }, |
162 | width); |
163 | /* *INDENT-ON* */ // clang-format on |
164 | src += srcskip; |
165 | dst += dstskip; |
166 | } |
167 | } |
168 | |
169 | #ifdef SDL_SSE2_INTRINSICS |
170 | |
171 | static void SDL_TARGETING("sse2" ) Blit888to888SurfaceAlphaSSE2(SDL_BlitInfo *info) |
172 | { |
173 | int width = info->dst_w; |
174 | int height = info->dst_h; |
175 | Uint8 *src = info->src; |
176 | int srcskip = info->src_skip; |
177 | Uint8 *dst = info->dst; |
178 | int dstskip = info->dst_skip; |
179 | Uint8 alpha = info->a; |
180 | |
181 | const __m128i alpha_fill_mask = _mm_set1_epi32((int)0xff000000); |
182 | const __m128i srcA = _mm_set1_epi16(alpha); |
183 | |
184 | while (height--) { |
185 | int i = 0; |
186 | |
187 | for (; i + 4 <= width; i += 4) { |
188 | // Load 4 src pixels |
189 | __m128i src128 = _mm_loadu_si128((__m128i *)src); |
190 | |
191 | // Load 4 dst pixels |
192 | __m128i dst128 = _mm_loadu_si128((__m128i *)dst); |
193 | |
194 | __m128i src_lo = _mm_unpacklo_epi8(src128, _mm_setzero_si128()); |
195 | __m128i src_hi = _mm_unpackhi_epi8(src128, _mm_setzero_si128()); |
196 | |
197 | __m128i dst_lo = _mm_unpacklo_epi8(dst128, _mm_setzero_si128()); |
198 | __m128i dst_hi = _mm_unpackhi_epi8(dst128, _mm_setzero_si128()); |
199 | |
200 | // dst = ((src - dst) * srcA) + ((dst << 8) - dst) |
201 | dst_lo = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_lo, dst_lo), srcA), |
202 | _mm_sub_epi16(_mm_slli_epi16(dst_lo, 8), dst_lo)); |
203 | dst_hi = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_hi, dst_hi), srcA), |
204 | _mm_sub_epi16(_mm_slli_epi16(dst_hi, 8), dst_hi)); |
205 | |
206 | // dst += 0x1U (use 0x80 to round instead of floor) |
207 | dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1)); |
208 | dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1)); |
209 | |
210 | // dst = (dst + (dst >> 8)) >> 8 |
211 | dst_lo = _mm_srli_epi16(_mm_add_epi16(dst_lo, _mm_srli_epi16(dst_lo, 8)), 8); |
212 | dst_hi = _mm_srli_epi16(_mm_add_epi16(dst_hi, _mm_srli_epi16(dst_hi, 8)), 8); |
213 | |
214 | dst128 = _mm_packus_epi16(dst_lo, dst_hi); |
215 | |
216 | // Set the alpha channels of dst to 255 |
217 | dst128 = _mm_or_si128(dst128, alpha_fill_mask); |
218 | |
219 | _mm_storeu_si128((__m128i *)dst, dst128); |
220 | |
221 | src += 16; |
222 | dst += 16; |
223 | } |
224 | |
225 | for (; i < width; ++i) { |
226 | Uint32 src32 = *(Uint32 *)src; |
227 | Uint32 dst32 = *(Uint32 *)dst; |
228 | |
229 | FACTOR_BLEND_8888(src32, dst32, alpha); |
230 | |
231 | *dst = dst32 | 0xff000000; |
232 | |
233 | src += 4; |
234 | dst += 4; |
235 | } |
236 | |
237 | src += srcskip; |
238 | dst += dstskip; |
239 | } |
240 | } |
241 | |
242 | #endif |
243 | |
244 | // fast RGB888->(A)RGB888 blending with surface alpha=128 special case |
245 | static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info) |
246 | { |
247 | int width = info->dst_w; |
248 | int height = info->dst_h; |
249 | Uint32 *srcp = (Uint32 *)info->src; |
250 | int srcskip = info->src_skip >> 2; |
251 | Uint32 *dstp = (Uint32 *)info->dst; |
252 | int dstskip = info->dst_skip >> 2; |
253 | |
254 | while (height--) { |
255 | /* *INDENT-OFF* */ // clang-format off |
256 | DUFFS_LOOP({ |
257 | Uint32 s = *srcp++; |
258 | Uint32 d = *dstp; |
259 | *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) |
260 | + (s & d & 0x00010101)) | 0xff000000; |
261 | }, width); |
262 | /* *INDENT-ON* */ // clang-format on |
263 | srcp += srcskip; |
264 | dstp += dstskip; |
265 | } |
266 | } |
267 | |
268 | // fast RGB888->(A)RGB888 blending with surface alpha |
269 | static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info) |
270 | { |
271 | unsigned alpha = info->a; |
272 | if (alpha == 128) { |
273 | BlitRGBtoRGBSurfaceAlpha128(info); |
274 | } else { |
275 | int width = info->dst_w; |
276 | int height = info->dst_h; |
277 | Uint32 *srcp = (Uint32 *)info->src; |
278 | int srcskip = info->src_skip >> 2; |
279 | Uint32 *dstp = (Uint32 *)info->dst; |
280 | int dstskip = info->dst_skip >> 2; |
281 | Uint32 s; |
282 | Uint32 d; |
283 | |
284 | while (height--) { |
285 | /* *INDENT-OFF* */ // clang-format off |
286 | DUFFS_LOOP({ |
287 | s = *srcp; |
288 | d = *dstp; |
289 | |
290 | FACTOR_BLEND_8888(s, d, alpha); |
291 | |
292 | *dstp = d | 0xff000000; |
293 | ++srcp; |
294 | ++dstp; |
295 | }, width); |
296 | /* *INDENT-ON* */ // clang-format on |
297 | srcp += srcskip; |
298 | dstp += dstskip; |
299 | } |
300 | } |
301 | } |
302 | |
303 | // 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel |
304 | |
/*
 * Blend a single 16 bit pixel at 50%.
 *
 * `mask` selects every channel bit except the lowest bit of each channel
 * (this file uses 0xf7de for RGB565 and 0xfbde for RGB555). The masked
 * values are added and halved, then the low channel bits that both inputs
 * have set are added back. Every argument is parenthesized so that
 * expressions can be passed safely; each argument is evaluated more than
 * once, so arguments must not have side effects.
 */
#define BLEND16_50(d, s, mask) \
    (((((s) & (mask)) + ((d) & (mask))) >> 1) + ((s) & (d) & (~(mask) & 0xffff)))

/*
 * Blend two 16 bit pixels, packed in one 32 bit word, at 50%.
 *
 * The 16 bit `mask` is replicated into both halves of a 32 bit word. It is
 * widened to Uint32 before the shift: a 16 bit mask would otherwise be
 * promoted to signed int, and shifting a value such as 0xf7de left by 16
 * does not fit in int. Here each input is masked and halved before the
 * addition so the sum cannot overflow 32 bits.
 */
#define BLEND2x16_50(d, s, mask)                                    \
    ((((s) & ((Uint32)(mask) | ((Uint32)(mask) << 16))) >> 1) +      \
     (((d) & ((Uint32)(mask) | ((Uint32)(mask) << 16))) >> 1) +      \
     ((s) & (d) & ~((Uint32)(mask) | ((Uint32)(mask) << 16))))
312 | |
/*
 * Blend a 16-bit source onto a 16-bit destination at 50%.
 *
 * `mask` is handed to BLEND16_50 / BLEND2x16_50; the callers in this file
 * pass 0xf7de for RGB565 and 0xfbde for RGB555.
 *
 * Pixels are blended two at a time through 32-bit loads and stores wherever
 * possible. Per row, one of two paths is taken depending on whether the
 * source and destination addresses agree in bit 1:
 *  - they differ: the destination is brought to a 32-bit boundary and the
 *    source is read as aligned 32-bit words, with each pair of source pixels
 *    stitched together from two consecutive words;
 *  - they agree: both pointers reach a 32-bit boundary together and pixel
 *    pairs are read and written directly.
 *
 * NOTE(review): the alignment reasoning presumes both pointers are at least
 * 2-byte aligned; that is not checked here.
 */
static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
{
    int width = info->dst_w;
    int height = info->dst_h;
    Uint16 *srcp = (Uint16 *)info->src;
    int srcskip = info->src_skip >> 1; // applied to Uint16 pointers, hence the shift by 1
    Uint16 *dstp = (Uint16 *)info->dst;
    int dstskip = info->dst_skip >> 1;

    while (height--) {
        if (((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
            /*
             * Source and destination not aligned, pipeline it.
             * This is mostly a win for big blits but no loss for
             * small ones
             */
            Uint32 prev_sw;
            int w = width;

            // handle odd destination
            if ((uintptr_t)dstp & 2) {
                Uint16 d = *dstp, s = *srcp;
                *dstp = BLEND16_50(d, s, mask);
                dstp++;
                srcp++;
                w--;
            }
            // srcp now runs one pixel ahead of the next pixel to blend;
            // this is undone by the "- 1" at the end of the row
            srcp++; // srcp is now 32-bit aligned

            // bootstrap pipeline with first halfword
            // (the aligned word just before srcp holds the next pixel to blend)
            prev_sw = ((Uint32 *)srcp)[-1];

            while (w > 1) {
                Uint32 sw, dw, s;
                sw = *(Uint32 *)srcp;
                dw = *(Uint32 *)dstp;
                // combine the pending pixel from the previous word with the
                // first pixel of the new word into one pair
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
                s = (prev_sw << 16) + (sw >> 16);
#else
                s = (prev_sw >> 16) + (sw << 16);
#endif
                prev_sw = sw;
                *(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
                dstp += 2;
                srcp += 2;
                w -= 2;
            }

            // final pixel if any
            if (w) {
                Uint16 d = *dstp, s;
                // the last source pixel is the pending half of prev_sw
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
                s = (Uint16)prev_sw;
#else
                s = (Uint16)(prev_sw >> 16);
#endif
                *dstp = BLEND16_50(d, s, mask);
                srcp++;
                dstp++;
            }
            // "- 1" compensates for the extra srcp++ done before the pipeline
            srcp += srcskip - 1;
            dstp += dstskip;
        } else {
            // source and destination are aligned
            int w = width;

            // first odd pixel?
            if ((uintptr_t)srcp & 2) {
                Uint16 d = *dstp, s = *srcp;
                *dstp = BLEND16_50(d, s, mask);
                srcp++;
                dstp++;
                w--;
            }
            // srcp and dstp are now 32-bit aligned

            // blend two pixels per iteration
            while (w > 1) {
                Uint32 sw = *(Uint32 *)srcp;
                Uint32 dw = *(Uint32 *)dstp;
                *(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
                srcp += 2;
                dstp += 2;
                w -= 2;
            }

            // last odd pixel?
            if (w) {
                Uint16 d = *dstp, s = *srcp;
                *dstp = BLEND16_50(d, s, mask);
                srcp++;
                dstp++;
            }
            srcp += srcskip;
            dstp += dstskip;
        }
    }
}
410 | |
411 | #ifdef SDL_MMX_INTRINSICS |
412 | |
/*
 * Fast RGB565->RGB565 blending with surface alpha (MMX variant).
 *
 * A surface alpha of exactly 128 is forwarded to Blit16to16SurfaceAlpha128()
 * with the 565 mask 0xf7de. For any other alpha, DUFFS_LOOP_124 is given
 * three bodies: one that blends a single pixel, one that blends two pixels
 * (both scalar), and one that blends four pixels at once with MMX.
 *
 * The scalar bodies spread a pixel into a 32-bit word with green in the high
 * half and red/blue in the low half (mask 0x07e0f81f), so one multiply by
 * the 5-bit alpha blends all three channels.
 *
 * NOTE(review): the MMX temporaries, the channel masks and the
 * `alpha >>= 3` downscale used by the scalar bodies are all set up inside
 * `#ifdef USE_DUFFS_LOOP`; this looks like it relies on USE_DUFFS_LOOP
 * always being defined — confirm.
 */
static void SDL_TARGETING("mmx" ) Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
{
    unsigned alpha = info->a;
    if (alpha == 128) {
        Blit16to16SurfaceAlpha128(info, 0xf7de);
    } else {
        int width = info->dst_w;
        int height = info->dst_h;
        Uint16 *srcp = (Uint16 *)info->src;
        int srcskip = info->src_skip >> 1; // applied to Uint16 pointers, hence the shift by 1
        Uint16 *dstp = (Uint16 *)info->dst;
        int dstskip = info->dst_skip >> 1;
        Uint32 s, d;

#ifdef USE_DUFFS_LOOP
        __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;

        alpha &= ~(1 + 2 + 4); // cut alpha to get the exact same behaviour
        mm_alpha = _mm_set_pi32(0, alpha); // 0000000A -> mm_alpha
        alpha >>= 3; // downscale alpha to 5 bits

        mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); // 00000A0A -> mm_alpha
        mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); // 0A0A0A0A -> mm_alpha
        /* position alpha to allow for mullo and mulhi on diff channels
           to reduce the number of operations */
        // every 16-bit lane now holds the truncated 8-bit alpha shifted left by 3
        mm_alpha = _mm_slli_si64(mm_alpha, 3);

        // Setup the 565 color channel masks
        gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); // MASKGREEN -> gmask
        bmask = _mm_set_pi32(0x001F001F, 0x001F001F); // MASKBLUE -> bmask
#endif

        while (height--) {
            /* *INDENT-OFF* */ // clang-format off
            DUFFS_LOOP_124(
            {
                // body 1: blend a single pixel (scalar)
                s = *srcp++;
                d = *dstp;
                /*
                 * shift out the middle component (green) to
                 * the high 16 bits, and process all three RGB
                 * components at the same time.
                 */
                s = (s | s << 16) & 0x07e0f81f;
                d = (d | d << 16) & 0x07e0f81f;
                d += (s - d) * alpha >> 5;
                d &= 0x07e0f81f;
                // fold green back down next to red and blue
                *dstp++ = (Uint16)(d | d >> 16);
            },{
                // body 2: blend two pixels, one after the other (scalar)
                s = *srcp++;
                d = *dstp;
                /*
                 * shift out the middle component (green) to
                 * the high 16 bits, and process all three RGB
                 * components at the same time.
                 */
                s = (s | s << 16) & 0x07e0f81f;
                d = (d | d << 16) & 0x07e0f81f;
                d += (s - d) * alpha >> 5;
                d &= 0x07e0f81f;
                *dstp++ = (Uint16)(d | d >> 16);
                s = *srcp++;
                d = *dstp;
                /*
                 * shift out the middle component (green) to
                 * the high 16 bits, and process all three RGB
                 * components at the same time.
                 */
                s = (s | s << 16) & 0x07e0f81f;
                d = (d | d << 16) & 0x07e0f81f;
                d += (s - d) * alpha >> 5;
                d &= 0x07e0f81f;
                *dstp++ = (Uint16)(d | d >> 16);
            },{
                // body 3: blend four pixels at once (MMX), one channel at a time
                src1 = *(__m64*)srcp; // 4 src pixels -> src1
                dst1 = *(__m64*)dstp; // 4 dst pixels -> dst1

                // red
                src2 = src1;
                src2 = _mm_srli_pi16(src2, 11); // src2 >> 11 -> src2 [000r 000r 000r 000r]

                dst2 = dst1;
                dst2 = _mm_srli_pi16(dst2, 11); // dst2 >> 11 -> dst2 [000r 000r 000r 000r]

                // blend
                src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
                src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
                src2 = _mm_srli_pi16(src2, 11); // src2 >> 11 -> src2
                dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2
                dst2 = _mm_slli_pi16(dst2, 11); // dst2 << 11 -> dst2

                mm_res = dst2; // RED -> mm_res

                // green -- process the bits in place
                src2 = src1;
                src2 = _mm_and_si64(src2, gmask); // src & MASKGREEN -> src2

                dst2 = dst1;
                dst2 = _mm_and_si64(dst2, gmask); // dst & MASKGREEN -> dst2

                // blend
                src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
                src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
                src2 = _mm_slli_pi16(src2, 5); // src2 << 5 -> src2
                dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2

                mm_res = _mm_or_si64(mm_res, dst2); // RED | GREEN -> mm_res

                // blue
                src2 = src1;
                src2 = _mm_and_si64(src2, bmask); // src & MASKBLUE -> src2[000b 000b 000b 000b]

                dst2 = dst1;
                dst2 = _mm_and_si64(dst2, bmask); // dst & MASKBLUE -> dst2[000b 000b 000b 000b]

                // blend
                src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
                src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
                src2 = _mm_srli_pi16(src2, 11); // src2 >> 11 -> src2
                dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2
                dst2 = _mm_and_si64(dst2, bmask); // dst2 & MASKBLUE -> dst2

                mm_res = _mm_or_si64(mm_res, dst2); // RED | GREEN | BLUE -> mm_res

                *(__m64*)dstp = mm_res; // mm_res -> 4 dst pixels

                srcp += 4;
                dstp += 4;
            }, width);
            /* *INDENT-ON* */ // clang-format on
            srcp += srcskip;
            dstp += dstskip;
        }
        // done with MMX registers for this blit
        _mm_empty();
    }
}
550 | |
/*
 * Fast RGB555->RGB555 blending with surface alpha (MMX variant).
 *
 * A surface alpha of exactly 128 is forwarded to Blit16to16SurfaceAlpha128()
 * with the 555 mask 0xfbde. For any other alpha, DUFFS_LOOP_124 is given
 * three bodies: one that blends a single pixel, one that blends two pixels
 * (both scalar), and one that blends four pixels at once with MMX.
 *
 * The scalar bodies spread a pixel into a 32-bit word with green in the high
 * half and red/blue in the low half (mask 0x03e07c1f), so one multiply by
 * the 5-bit alpha blends all three channels.
 *
 * NOTE(review): the MMX temporaries, the channel masks and the
 * `alpha >>= 3` downscale used by the scalar bodies are all set up inside
 * `#ifdef USE_DUFFS_LOOP`; this looks like it relies on USE_DUFFS_LOOP
 * always being defined — confirm.
 */
static void SDL_TARGETING("mmx" ) Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
{
    unsigned alpha = info->a;
    if (alpha == 128) {
        Blit16to16SurfaceAlpha128(info, 0xfbde);
    } else {
        int width = info->dst_w;
        int height = info->dst_h;
        Uint16 *srcp = (Uint16 *)info->src;
        int srcskip = info->src_skip >> 1; // applied to Uint16 pointers, hence the shift by 1
        Uint16 *dstp = (Uint16 *)info->dst;
        int dstskip = info->dst_skip >> 1;
        Uint32 s, d;

#ifdef USE_DUFFS_LOOP
        __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;

        alpha &= ~(1 + 2 + 4); // cut alpha to get the exact same behaviour
        mm_alpha = _mm_set_pi32(0, alpha); // 0000000A -> mm_alpha
        alpha >>= 3; // downscale alpha to 5 bits

        mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); // 00000A0A -> mm_alpha
        mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); // 0A0A0A0A -> mm_alpha
        /* position alpha to allow for mullo and mulhi on diff channels
           to reduce the number of operations */
        // every 16-bit lane now holds the truncated 8-bit alpha shifted left by 3
        mm_alpha = _mm_slli_si64(mm_alpha, 3);

        // Setup the 555 color channel masks
        rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); // MASKRED -> rmask
        gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); // MASKGREEN -> gmask
        bmask = _mm_set_pi32(0x001F001F, 0x001F001F); // MASKBLUE -> bmask
#endif
        while (height--) {
            /* *INDENT-OFF* */ // clang-format off
            DUFFS_LOOP_124(
            {
                // body 1: blend a single pixel (scalar)
                s = *srcp++;
                d = *dstp;
                /*
                 * shift out the middle component (green) to
                 * the high 16 bits, and process all three RGB
                 * components at the same time.
                 */
                s = (s | s << 16) & 0x03e07c1f;
                d = (d | d << 16) & 0x03e07c1f;
                d += (s - d) * alpha >> 5;
                d &= 0x03e07c1f;
                // fold green back down next to red and blue
                *dstp++ = (Uint16)(d | d >> 16);
            },{
                // body 2: blend two pixels, one after the other (scalar)
                s = *srcp++;
                d = *dstp;
                /*
                 * shift out the middle component (green) to
                 * the high 16 bits, and process all three RGB
                 * components at the same time.
                 */
                s = (s | s << 16) & 0x03e07c1f;
                d = (d | d << 16) & 0x03e07c1f;
                d += (s - d) * alpha >> 5;
                d &= 0x03e07c1f;
                *dstp++ = (Uint16)(d | d >> 16);
                s = *srcp++;
                d = *dstp;
                /*
                 * shift out the middle component (green) to
                 * the high 16 bits, and process all three RGB
                 * components at the same time.
                 */
                s = (s | s << 16) & 0x03e07c1f;
                d = (d | d << 16) & 0x03e07c1f;
                d += (s - d) * alpha >> 5;
                d &= 0x03e07c1f;
                *dstp++ = (Uint16)(d | d >> 16);
            },{
                // body 3: blend four pixels at once (MMX), one channel at a time
                src1 = *(__m64*)srcp; // 4 src pixels -> src1
                dst1 = *(__m64*)dstp; // 4 dst pixels -> dst1

                // red -- process the bits in place
                src2 = src1;
                src2 = _mm_and_si64(src2, rmask); // src & MASKRED -> src2

                dst2 = dst1;
                dst2 = _mm_and_si64(dst2, rmask); // dst & MASKRED -> dst2

                // blend
                src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
                src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
                src2 = _mm_slli_pi16(src2, 5); // src2 << 5 -> src2
                dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2
                dst2 = _mm_and_si64(dst2, rmask); // dst2 & MASKRED -> dst2

                mm_res = dst2; // RED -> mm_res

                // green -- process the bits in place
                src2 = src1;
                src2 = _mm_and_si64(src2, gmask); // src & MASKGREEN -> src2

                dst2 = dst1;
                dst2 = _mm_and_si64(dst2, gmask); // dst & MASKGREEN -> dst2

                // blend
                src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
                src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
                src2 = _mm_slli_pi16(src2, 5); // src2 << 5 -> src2
                dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2

                mm_res = _mm_or_si64(mm_res, dst2); // RED | GREEN -> mm_res

                // blue
                src2 = src1; // src -> src2
                src2 = _mm_and_si64(src2, bmask); // src & MASKBLUE -> src2[000b 000b 000b 000b]

                dst2 = dst1; // dst -> dst2
                dst2 = _mm_and_si64(dst2, bmask); // dst & MASKBLUE -> dst2[000b 000b 000b 000b]

                // blend
                src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
                src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
                src2 = _mm_srli_pi16(src2, 11); // src2 >> 11 -> src2
                dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2
                dst2 = _mm_and_si64(dst2, bmask); // dst2 & MASKBLUE -> dst2

                mm_res = _mm_or_si64(mm_res, dst2); // RED | GREEN | BLUE -> mm_res

                *(__m64*)dstp = mm_res; // mm_res -> 4 dst pixels

                srcp += 4;
                dstp += 4;
            }, width);
            /* *INDENT-ON* */ // clang-format on
            srcp += srcskip;
            dstp += dstskip;
        }
        // done with MMX registers for this blit
        _mm_empty();
    }
}
688 | |
689 | #endif // SDL_MMX_INTRINSICS |
690 | |
691 | // fast RGB565->RGB565 blending with surface alpha |
692 | static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info) |
693 | { |
694 | unsigned alpha = info->a; |
695 | if (alpha == 128) { |
696 | Blit16to16SurfaceAlpha128(info, 0xf7de); |
697 | } else { |
698 | int width = info->dst_w; |
699 | int height = info->dst_h; |
700 | Uint16 *srcp = (Uint16 *)info->src; |
701 | int srcskip = info->src_skip >> 1; |
702 | Uint16 *dstp = (Uint16 *)info->dst; |
703 | int dstskip = info->dst_skip >> 1; |
704 | alpha >>= 3; // downscale alpha to 5 bits |
705 | |
706 | while (height--) { |
707 | /* *INDENT-OFF* */ // clang-format off |
708 | DUFFS_LOOP({ |
709 | Uint32 s = *srcp++; |
710 | Uint32 d = *dstp; |
711 | /* |
712 | * shift out the middle component (green) to |
713 | * the high 16 bits, and process all three RGB |
714 | * components at the same time. |
715 | */ |
716 | s = (s | s << 16) & 0x07e0f81f; |
717 | d = (d | d << 16) & 0x07e0f81f; |
718 | d += (s - d) * alpha >> 5; |
719 | d &= 0x07e0f81f; |
720 | *dstp++ = (Uint16)(d | d >> 16); |
721 | }, width); |
722 | /* *INDENT-ON* */ // clang-format on |
723 | srcp += srcskip; |
724 | dstp += dstskip; |
725 | } |
726 | } |
727 | } |
728 | |
729 | // fast RGB555->RGB555 blending with surface alpha |
730 | static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info) |
731 | { |
732 | unsigned alpha = info->a; // downscale alpha to 5 bits |
733 | if (alpha == 128) { |
734 | Blit16to16SurfaceAlpha128(info, 0xfbde); |
735 | } else { |
736 | int width = info->dst_w; |
737 | int height = info->dst_h; |
738 | Uint16 *srcp = (Uint16 *)info->src; |
739 | int srcskip = info->src_skip >> 1; |
740 | Uint16 *dstp = (Uint16 *)info->dst; |
741 | int dstskip = info->dst_skip >> 1; |
742 | alpha >>= 3; // downscale alpha to 5 bits |
743 | |
744 | while (height--) { |
745 | /* *INDENT-OFF* */ // clang-format off |
746 | DUFFS_LOOP({ |
747 | Uint32 s = *srcp++; |
748 | Uint32 d = *dstp; |
749 | /* |
750 | * shift out the middle component (green) to |
751 | * the high 16 bits, and process all three RGB |
752 | * components at the same time. |
753 | */ |
754 | s = (s | s << 16) & 0x03e07c1f; |
755 | d = (d | d << 16) & 0x03e07c1f; |
756 | d += (s - d) * alpha >> 5; |
757 | d &= 0x03e07c1f; |
758 | *dstp++ = (Uint16)(d | d >> 16); |
759 | }, width); |
760 | /* *INDENT-ON* */ // clang-format on |
761 | srcp += srcskip; |
762 | dstp += dstskip; |
763 | } |
764 | } |
765 | } |
766 | |
767 | // fast ARGB8888->RGB565 blending with pixel alpha |
768 | static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info) |
769 | { |
770 | int width = info->dst_w; |
771 | int height = info->dst_h; |
772 | Uint32 *srcp = (Uint32 *)info->src; |
773 | int srcskip = info->src_skip >> 2; |
774 | Uint16 *dstp = (Uint16 *)info->dst; |
775 | int dstskip = info->dst_skip >> 1; |
776 | |
777 | while (height--) { |
778 | /* *INDENT-OFF* */ // clang-format off |
779 | DUFFS_LOOP({ |
780 | Uint32 s = *srcp; |
781 | unsigned alpha = s >> 27; // downscale alpha to 5 bits |
782 | /* Here we special-case opaque alpha since the |
783 | compositioning used (>>8 instead of /255) doesn't handle |
784 | it correctly. */ |
785 | if (alpha) { |
786 | if (alpha == (SDL_ALPHA_OPAQUE >> 3)) { |
787 | *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3 & 0x1f)); |
788 | } else { |
789 | Uint32 d = *dstp; |
790 | /* |
791 | * convert source and destination to G0RAB65565 |
792 | * and blend all components at the same time |
793 | */ |
794 | s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800) + (s >> 3 & 0x1f); |
795 | d = (d | d << 16) & 0x07e0f81f; |
796 | d += (s - d) * alpha >> 5; |
797 | d &= 0x07e0f81f; |
798 | *dstp = (Uint16)(d | d >> 16); |
799 | } |
800 | } |
801 | srcp++; |
802 | dstp++; |
803 | }, width); |
804 | /* *INDENT-ON* */ // clang-format on |
805 | srcp += srcskip; |
806 | dstp += dstskip; |
807 | } |
808 | } |
809 | |
810 | // fast ARGB8888->RGB555 blending with pixel alpha |
811 | static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info) |
812 | { |
813 | int width = info->dst_w; |
814 | int height = info->dst_h; |
815 | Uint32 *srcp = (Uint32 *)info->src; |
816 | int srcskip = info->src_skip >> 2; |
817 | Uint16 *dstp = (Uint16 *)info->dst; |
818 | int dstskip = info->dst_skip >> 1; |
819 | |
820 | while (height--) { |
821 | /* *INDENT-OFF* */ // clang-format off |
822 | DUFFS_LOOP({ |
823 | unsigned alpha; |
824 | Uint32 s = *srcp; |
825 | alpha = s >> 27; // downscale alpha to 5 bits |
826 | /* Here we special-case opaque alpha since the |
827 | compositioning used (>>8 instead of /255) doesn't handle |
828 | it correctly. */ |
829 | if (alpha) { |
830 | if (alpha == (SDL_ALPHA_OPAQUE >> 3)) { |
831 | *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3 & 0x1f)); |
832 | } else { |
833 | Uint32 d = *dstp; |
834 | /* |
835 | * convert source and destination to G0RAB55555 |
836 | * and blend all components at the same time |
837 | */ |
838 | s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00) + (s >> 3 & 0x1f); |
839 | d = (d | d << 16) & 0x03e07c1f; |
840 | d += (s - d) * alpha >> 5; |
841 | d &= 0x03e07c1f; |
842 | *dstp = (Uint16)(d | d >> 16); |
843 | } |
844 | } |
845 | srcp++; |
846 | dstp++; |
847 | }, width); |
848 | /* *INDENT-ON* */ // clang-format on |
849 | srcp += srcskip; |
850 | dstp += dstskip; |
851 | } |
852 | } |
853 | |
854 | // General (slow) N->N blending with per-surface alpha |
855 | static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info) |
856 | { |
857 | int width = info->dst_w; |
858 | int height = info->dst_h; |
859 | Uint8 *src = info->src; |
860 | int srcskip = info->src_skip; |
861 | Uint8 *dst = info->dst; |
862 | int dstskip = info->dst_skip; |
863 | const SDL_PixelFormatDetails *srcfmt = info->src_fmt; |
864 | const SDL_PixelFormatDetails *dstfmt = info->dst_fmt; |
865 | int srcbpp = srcfmt->bytes_per_pixel; |
866 | int dstbpp = dstfmt->bytes_per_pixel; |
867 | Uint32 Pixel; |
868 | unsigned sR, sG, sB; |
869 | unsigned dR, dG, dB, dA; |
870 | const unsigned sA = info->a; |
871 | |
872 | if (sA) { |
873 | while (height--) { |
874 | /* *INDENT-OFF* */ // clang-format off |
875 | DUFFS_LOOP( |
876 | { |
877 | DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB); |
878 | DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA); |
879 | ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA); |
880 | ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA); |
881 | src += srcbpp; |
882 | dst += dstbpp; |
883 | }, |
884 | width); |
885 | /* *INDENT-ON* */ // clang-format on |
886 | src += srcskip; |
887 | dst += dstskip; |
888 | } |
889 | } |
890 | } |
891 | |
892 | // General (slow) colorkeyed N->N blending with per-surface alpha |
893 | static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info) |
894 | { |
895 | int width = info->dst_w; |
896 | int height = info->dst_h; |
897 | Uint8 *src = info->src; |
898 | int srcskip = info->src_skip; |
899 | Uint8 *dst = info->dst; |
900 | int dstskip = info->dst_skip; |
901 | const SDL_PixelFormatDetails *srcfmt = info->src_fmt; |
902 | const SDL_PixelFormatDetails *dstfmt = info->dst_fmt; |
903 | Uint32 ckey = info->colorkey; |
904 | int srcbpp = srcfmt->bytes_per_pixel; |
905 | int dstbpp = dstfmt->bytes_per_pixel; |
906 | Uint32 Pixel; |
907 | unsigned sR, sG, sB; |
908 | unsigned dR, dG, dB, dA; |
909 | const unsigned sA = info->a; |
910 | |
911 | while (height--) { |
912 | /* *INDENT-OFF* */ // clang-format off |
913 | DUFFS_LOOP( |
914 | { |
915 | RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel); |
916 | if (sA && Pixel != ckey) { |
917 | RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); |
918 | DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA); |
919 | ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA); |
920 | ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA); |
921 | } |
922 | src += srcbpp; |
923 | dst += dstbpp; |
924 | }, |
925 | width); |
926 | /* *INDENT-ON* */ // clang-format on |
927 | src += srcskip; |
928 | dst += dstskip; |
929 | } |
930 | } |
931 | |
932 | // Fast 32-bit RGBA->RGBA blending with pixel alpha |
933 | static void Blit8888to8888PixelAlpha(SDL_BlitInfo *info) |
934 | { |
935 | int width = info->dst_w; |
936 | int height = info->dst_h; |
937 | Uint8 *src = info->src; |
938 | int srcskip = info->src_skip; |
939 | Uint8 *dst = info->dst; |
940 | int dstskip = info->dst_skip; |
941 | const SDL_PixelFormatDetails *srcfmt = info->src_fmt; |
942 | |
943 | while (height--) { |
944 | int i = 0; |
945 | |
946 | for (; i < width; ++i) { |
947 | Uint32 src32 = *(Uint32 *)src; |
948 | Uint32 dst32 = *(Uint32 *)dst; |
949 | ALPHA_BLEND_8888(src32, dst32, srcfmt); |
950 | *(Uint32 *)dst = dst32; |
951 | src += 4; |
952 | dst += 4; |
953 | } |
954 | |
955 | src += srcskip; |
956 | dst += dstskip; |
957 | } |
958 | } |
959 | |
960 | // Fast 32-bit RGBA->RGB(A) blending with pixel alpha and src swizzling |
961 | static void Blit8888to8888PixelAlphaSwizzle(SDL_BlitInfo *info) |
962 | { |
963 | int width = info->dst_w; |
964 | int height = info->dst_h; |
965 | Uint8 *src = info->src; |
966 | int srcskip = info->src_skip; |
967 | Uint8 *dst = info->dst; |
968 | int dstskip = info->dst_skip; |
969 | const SDL_PixelFormatDetails *srcfmt = info->src_fmt; |
970 | const SDL_PixelFormatDetails *dstfmt = info->dst_fmt; |
971 | |
972 | while (height--) { |
973 | int i = 0; |
974 | |
975 | for (; i < width; ++i) { |
976 | Uint32 src32 = *(Uint32 *)src; |
977 | Uint32 dst32 = *(Uint32 *)dst; |
978 | ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt); |
979 | *(Uint32 *)dst = dst32; |
980 | src += 4; |
981 | dst += 4; |
982 | } |
983 | |
984 | src += srcskip; |
985 | dst += dstskip; |
986 | } |
987 | } |
988 | |
989 | #ifdef SDL_SSE4_1_INTRINSICS |
990 | |
991 | static void SDL_TARGETING("sse4.1" ) Blit8888to8888PixelAlphaSwizzleSSE41(SDL_BlitInfo *info) |
992 | { |
993 | int width = info->dst_w; |
994 | int height = info->dst_h; |
995 | Uint8 *src = info->src; |
996 | int srcskip = info->src_skip; |
997 | Uint8 *dst = info->dst; |
998 | int dstskip = info->dst_skip; |
999 | const SDL_PixelFormatDetails *srcfmt = info->src_fmt; |
1000 | const SDL_PixelFormatDetails *dstfmt = info->dst_fmt; |
1001 | |
1002 | // The byte offsets for the start of each pixel |
1003 | const __m128i mask_offsets = _mm_set_epi8( |
1004 | 12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0); |
1005 | |
1006 | const __m128i convert_mask = _mm_add_epi32( |
1007 | _mm_set1_epi32( |
1008 | ((srcfmt->Rshift >> 3) << dstfmt->Rshift) | |
1009 | ((srcfmt->Gshift >> 3) << dstfmt->Gshift) | |
1010 | ((srcfmt->Bshift >> 3) << dstfmt->Bshift)), |
1011 | mask_offsets); |
1012 | |
1013 | const __m128i alpha_splat_mask = _mm_add_epi8(_mm_set1_epi8(srcfmt->Ashift >> 3), mask_offsets); |
1014 | const __m128i alpha_fill_mask = _mm_set1_epi32((int)dstfmt->Amask); |
1015 | |
1016 | while (height--) { |
1017 | int i = 0; |
1018 | |
1019 | for (; i + 4 <= width; i += 4) { |
1020 | // Load 4 src pixels |
1021 | __m128i src128 = _mm_loadu_si128((__m128i *)src); |
1022 | |
1023 | // Load 4 dst pixels |
1024 | __m128i dst128 = _mm_loadu_si128((__m128i *)dst); |
1025 | |
1026 | // Extract the alpha from each pixel and splat it into all the channels |
1027 | __m128i srcA = _mm_shuffle_epi8(src128, alpha_splat_mask); |
1028 | |
1029 | // Convert to dst format |
1030 | src128 = _mm_shuffle_epi8(src128, convert_mask); |
1031 | |
1032 | // Set the alpha channels of src to 255 |
1033 | src128 = _mm_or_si128(src128, alpha_fill_mask); |
1034 | |
1035 | // Duplicate each 8-bit alpha value into both bytes of 16-bit lanes |
1036 | __m128i srca_lo = _mm_unpacklo_epi8(srcA, srcA); |
1037 | __m128i srca_hi = _mm_unpackhi_epi8(srcA, srcA); |
1038 | |
1039 | // Calculate 255-srcA in every second 8-bit lane (255-srcA = srcA^0xff) |
1040 | srca_lo = _mm_xor_si128(srca_lo, _mm_set1_epi16(0xff00)); |
1041 | srca_hi = _mm_xor_si128(srca_hi, _mm_set1_epi16(0xff00)); |
1042 | |
1043 | // maddubs expects second argument to be signed, so subtract 128 |
1044 | src128 = _mm_sub_epi8(src128, _mm_set1_epi8((Uint8)128)); |
1045 | dst128 = _mm_sub_epi8(dst128, _mm_set1_epi8((Uint8)128)); |
1046 | |
1047 | // dst = srcA*(src-128) + (255-srcA)*(dst-128) = srcA*src + (255-srcA)*dst - 128*255 |
1048 | __m128i dst_lo = _mm_maddubs_epi16(srca_lo, _mm_unpacklo_epi8(src128, dst128)); |
1049 | __m128i dst_hi = _mm_maddubs_epi16(srca_hi, _mm_unpackhi_epi8(src128, dst128)); |
1050 | |
1051 | // dst += 0x1U (use 0x80 to round instead of floor) + 128*255 (to fix maddubs result) |
1052 | dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1 + 128*255)); |
1053 | dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1 + 128*255)); |
1054 | |
1055 | // dst = (dst + (dst >> 8)) >> 8 = (dst * 257) >> 16 |
1056 | dst_lo = _mm_mulhi_epu16(dst_lo, _mm_set1_epi16(257)); |
1057 | dst_hi = _mm_mulhi_epu16(dst_hi, _mm_set1_epi16(257)); |
1058 | |
1059 | // Blend the pixels together and save the result |
1060 | _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(dst_lo, dst_hi)); |
1061 | |
1062 | src += 16; |
1063 | dst += 16; |
1064 | } |
1065 | |
1066 | for (; i < width; ++i) { |
1067 | Uint32 src32 = *(Uint32 *)src; |
1068 | Uint32 dst32 = *(Uint32 *)dst; |
1069 | ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt); |
1070 | *(Uint32 *)dst = dst32; |
1071 | src += 4; |
1072 | dst += 4; |
1073 | } |
1074 | |
1075 | src += srcskip; |
1076 | dst += dstskip; |
1077 | } |
1078 | } |
1079 | |
1080 | #endif |
1081 | |
1082 | #ifdef SDL_AVX2_INTRINSICS |
1083 | |
1084 | static void SDL_TARGETING("avx2" ) Blit8888to8888PixelAlphaSwizzleAVX2(SDL_BlitInfo *info) |
1085 | { |
1086 | int width = info->dst_w; |
1087 | int height = info->dst_h; |
1088 | Uint8 *src = info->src; |
1089 | int srcskip = info->src_skip; |
1090 | Uint8 *dst = info->dst; |
1091 | int dstskip = info->dst_skip; |
1092 | const SDL_PixelFormatDetails *srcfmt = info->src_fmt; |
1093 | const SDL_PixelFormatDetails *dstfmt = info->dst_fmt; |
1094 | |
1095 | // The byte offsets for the start of each pixel |
1096 | const __m256i mask_offsets = _mm256_set_epi8( |
1097 | 28, 28, 28, 28, 24, 24, 24, 24, 20, 20, 20, 20, 16, 16, 16, 16, 12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0); |
1098 | |
1099 | const __m256i convert_mask = _mm256_add_epi32( |
1100 | _mm256_set1_epi32( |
1101 | ((srcfmt->Rshift >> 3) << dstfmt->Rshift) | |
1102 | ((srcfmt->Gshift >> 3) << dstfmt->Gshift) | |
1103 | ((srcfmt->Bshift >> 3) << dstfmt->Bshift)), |
1104 | mask_offsets); |
1105 | |
1106 | const __m256i alpha_splat_mask = _mm256_add_epi8(_mm256_set1_epi8(srcfmt->Ashift >> 3), mask_offsets); |
1107 | const __m256i alpha_fill_mask = _mm256_set1_epi32((int)dstfmt->Amask); |
1108 | |
1109 | while (height--) { |
1110 | int i = 0; |
1111 | |
1112 | for (; i + 8 <= width; i += 8) { |
1113 | // Load 8 src pixels |
1114 | __m256i src256 = _mm256_loadu_si256((__m256i *)src); |
1115 | |
1116 | // Load 8 dst pixels |
1117 | __m256i dst256 = _mm256_loadu_si256((__m256i *)dst); |
1118 | |
1119 | // Extract the alpha from each pixel and splat it into all the channels |
1120 | __m256i srcA = _mm256_shuffle_epi8(src256, alpha_splat_mask); |
1121 | |
1122 | // Convert to dst format |
1123 | src256 = _mm256_shuffle_epi8(src256, convert_mask); |
1124 | |
1125 | // Set the alpha channels of src to 255 |
1126 | src256 = _mm256_or_si256(src256, alpha_fill_mask); |
1127 | |
1128 | // Duplicate each 8-bit alpha value into both bytes of 16-bit lanes |
1129 | __m256i alpha_lo = _mm256_unpacklo_epi8(srcA, srcA); |
1130 | __m256i alpha_hi = _mm256_unpackhi_epi8(srcA, srcA); |
1131 | |
1132 | // Calculate 255-srcA in every second 8-bit lane (255-srcA = srcA^0xff) |
1133 | alpha_lo = _mm256_xor_si256(alpha_lo, _mm256_set1_epi16(0xff00)); |
1134 | alpha_hi = _mm256_xor_si256(alpha_hi, _mm256_set1_epi16(0xff00)); |
1135 | |
1136 | // maddubs expects second argument to be signed, so subtract 128 |
1137 | src256 = _mm256_sub_epi8(src256, _mm256_set1_epi8((Uint8)128)); |
1138 | dst256 = _mm256_sub_epi8(dst256, _mm256_set1_epi8((Uint8)128)); |
1139 | |
1140 | // dst = srcA*(src-128) + (255-srcA)*(dst-128) = srcA*src + (255-srcA)*dst - 128*255 |
1141 | __m256i dst_lo = _mm256_maddubs_epi16(alpha_lo, _mm256_unpacklo_epi8(src256, dst256)); |
1142 | __m256i dst_hi = _mm256_maddubs_epi16(alpha_hi, _mm256_unpackhi_epi8(src256, dst256)); |
1143 | |
1144 | // dst += 0x1U (use 0x80 to round instead of floor) + 128*255 (to fix maddubs result) |
1145 | dst_lo = _mm256_add_epi16(dst_lo, _mm256_set1_epi16(1 + 128*255)); |
1146 | dst_hi = _mm256_add_epi16(dst_hi, _mm256_set1_epi16(1 + 128*255)); |
1147 | |
1148 | // dst = (dst + (dst >> 8)) >> 8 = (dst * 257) >> 16 |
1149 | dst_lo = _mm256_mulhi_epu16(dst_lo, _mm256_set1_epi16(257)); |
1150 | dst_hi = _mm256_mulhi_epu16(dst_hi, _mm256_set1_epi16(257)); |
1151 | |
1152 | // Blend the pixels together and save the result |
1153 | _mm256_storeu_si256((__m256i *)dst, _mm256_packus_epi16(dst_lo, dst_hi)); |
1154 | |
1155 | src += 32; |
1156 | dst += 32; |
1157 | } |
1158 | |
1159 | for (; i < width; ++i) { |
1160 | Uint32 src32 = *(Uint32 *)src; |
1161 | Uint32 dst32 = *(Uint32 *)dst; |
1162 | ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt); |
1163 | *(Uint32 *)dst = dst32; |
1164 | src += 4; |
1165 | dst += 4; |
1166 | } |
1167 | |
1168 | src += srcskip; |
1169 | dst += dstskip; |
1170 | } |
1171 | } |
1172 | |
1173 | #endif |
1174 | |
1175 | #if defined(SDL_NEON_INTRINSICS) && (__ARM_ARCH >= 8) |
1176 | |
1177 | static void Blit8888to8888PixelAlphaSwizzleNEON(SDL_BlitInfo *info) |
1178 | { |
1179 | int width = info->dst_w; |
1180 | int height = info->dst_h; |
1181 | Uint8 *src = info->src; |
1182 | int srcskip = info->src_skip; |
1183 | Uint8 *dst = info->dst; |
1184 | int dstskip = info->dst_skip; |
1185 | const SDL_PixelFormatDetails *srcfmt = info->src_fmt; |
1186 | const SDL_PixelFormatDetails *dstfmt = info->dst_fmt; |
1187 | |
1188 | // The byte offsets for the start of each pixel |
1189 | const uint8x16_t mask_offsets = vreinterpretq_u8_u64(vcombine_u64( |
1190 | vcreate_u64(0x0404040400000000), vcreate_u64(0x0c0c0c0c08080808))); |
1191 | |
1192 | const uint8x16_t convert_mask = vreinterpretq_u8_u32(vaddq_u32( |
1193 | vreinterpretq_u32_u8(mask_offsets), |
1194 | vdupq_n_u32( |
1195 | ((srcfmt->Rshift >> 3) << dstfmt->Rshift) | |
1196 | ((srcfmt->Gshift >> 3) << dstfmt->Gshift) | |
1197 | ((srcfmt->Bshift >> 3) << dstfmt->Bshift)))); |
1198 | |
1199 | const uint8x16_t alpha_splat_mask = vaddq_u8(vdupq_n_u8(srcfmt->Ashift >> 3), mask_offsets); |
1200 | const uint8x16_t alpha_fill_mask = vreinterpretq_u8_u32(vdupq_n_u32(dstfmt->Amask)); |
1201 | |
1202 | while (height--) { |
1203 | int i = 0; |
1204 | |
1205 | for (; i + 4 <= width; i += 4) { |
1206 | // Load 4 src pixels |
1207 | uint8x16_t src128 = vld1q_u8(src); |
1208 | |
1209 | // Load 4 dst pixels |
1210 | uint8x16_t dst128 = vld1q_u8(dst); |
1211 | |
1212 | // Extract the alpha from each pixel and splat it into all the channels |
1213 | uint8x16_t srcA = vqtbl1q_u8(src128, alpha_splat_mask); |
1214 | |
1215 | // Convert to dst format |
1216 | src128 = vqtbl1q_u8(src128, convert_mask); |
1217 | |
1218 | // Set the alpha channels of src to 255 |
1219 | src128 = vorrq_u8(src128, alpha_fill_mask); |
1220 | |
1221 | // 255 - srcA = ~srcA |
1222 | uint8x16_t srcInvA = vmvnq_u8(srcA); |
1223 | |
1224 | // Result initialized with 1, this is for truncated divide later |
1225 | uint16x8_t res_lo = vdupq_n_u16(1); |
1226 | uint16x8_t res_hi = vdupq_n_u16(1); |
1227 | |
1228 | // res = alpha * src + (255 - alpha) * dst |
1229 | res_lo = vmlal_u8(res_lo, vget_low_u8(srcA), vget_low_u8(src128)); |
1230 | res_lo = vmlal_u8(res_lo, vget_low_u8(srcInvA), vget_low_u8(dst128)); |
1231 | res_hi = vmlal_high_u8(res_hi, srcA, src128); |
1232 | res_hi = vmlal_high_u8(res_hi, srcInvA, dst128); |
1233 | |
1234 | // Now result has +1 already added for truncated division |
1235 | // dst = (res + (res >> 8)) >> 8 |
1236 | uint8x8_t temp; |
1237 | temp = vaddhn_u16(res_lo, vshrq_n_u16(res_lo, 8)); |
1238 | dst128 = vaddhn_high_u16(temp, res_hi, vshrq_n_u16(res_hi, 8)); |
1239 | |
1240 | // For rounded division remove the constant 1 and change first two vmlal_u8 to vmull_u8 |
1241 | // Then replace two previous lines with following code: |
1242 | // temp = vraddhn_u16(res_lo, vrshrq_n_u16(res_lo, 8)); |
1243 | // dst128 = vraddhn_high_u16(temp, res_hi, vrshrq_n_u16(res_hi, 8)); |
1244 | |
1245 | // Save the result |
1246 | vst1q_u8(dst, dst128); |
1247 | |
1248 | src += 16; |
1249 | dst += 16; |
1250 | } |
1251 | |
1252 | // Process 1 pixel per iteration, max 3 iterations, same calculations as above |
1253 | for (; i < width; ++i) { |
1254 | // Top 32-bits will be not used in src32 & dst32 |
1255 | uint8x8_t src32 = vreinterpret_u8_u32(vld1_dup_u32((Uint32*)src)); |
1256 | uint8x8_t dst32 = vreinterpret_u8_u32(vld1_dup_u32((Uint32*)dst)); |
1257 | |
1258 | uint8x8_t srcA = vtbl1_u8(src32, vget_low_u8(alpha_splat_mask)); |
1259 | src32 = vtbl1_u8(src32, vget_low_u8(convert_mask)); |
1260 | src32 = vorr_u8(src32, vget_low_u8(alpha_fill_mask)); |
1261 | uint8x8_t srcInvA = vmvn_u8(srcA); |
1262 | |
1263 | uint16x8_t res = vdupq_n_u16(1); |
1264 | res = vmlal_u8(res, srcA, src32); |
1265 | res = vmlal_u8(res, srcInvA, dst32); |
1266 | |
1267 | dst32 = vaddhn_u16(res, vshrq_n_u16(res, 8)); |
1268 | |
1269 | // Save the result, only low 32-bits |
1270 | vst1_lane_u32((Uint32*)dst, vreinterpret_u32_u8(dst32), 0); |
1271 | |
1272 | src += 4; |
1273 | dst += 4; |
1274 | } |
1275 | |
1276 | src += srcskip; |
1277 | dst += dstskip; |
1278 | } |
1279 | } |
1280 | |
1281 | #endif |
1282 | |
1283 | // General (slow) N->N blending with pixel alpha |
1284 | static void BlitNtoNPixelAlpha(SDL_BlitInfo *info) |
1285 | { |
1286 | int width = info->dst_w; |
1287 | int height = info->dst_h; |
1288 | Uint8 *src = info->src; |
1289 | int srcskip = info->src_skip; |
1290 | Uint8 *dst = info->dst; |
1291 | int dstskip = info->dst_skip; |
1292 | const SDL_PixelFormatDetails *srcfmt = info->src_fmt; |
1293 | const SDL_PixelFormatDetails *dstfmt = info->dst_fmt; |
1294 | int srcbpp; |
1295 | int dstbpp; |
1296 | Uint32 Pixel; |
1297 | unsigned sR, sG, sB, sA; |
1298 | unsigned dR, dG, dB, dA; |
1299 | |
1300 | // Set up some basic variables |
1301 | srcbpp = srcfmt->bytes_per_pixel; |
1302 | dstbpp = dstfmt->bytes_per_pixel; |
1303 | |
1304 | while (height--) { |
1305 | DUFFS_LOOP( |
1306 | { |
1307 | DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA); |
1308 | if (sA) { |
1309 | DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA); |
1310 | ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA); |
1311 | ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA); |
1312 | } |
1313 | src += srcbpp; |
1314 | dst += dstbpp; |
1315 | }, |
1316 | width); |
1317 | /* *INDENT-ON* */ // clang-format on |
1318 | src += srcskip; |
1319 | dst += dstskip; |
1320 | } |
1321 | } |
1322 | |
1323 | SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface) |
1324 | { |
1325 | const SDL_PixelFormatDetails *sf = surface->fmt; |
1326 | const SDL_PixelFormatDetails *df = surface->map.info.dst_fmt; |
1327 | |
1328 | switch (surface->map.info.flags & ~SDL_COPY_RLE_MASK) { |
1329 | case SDL_COPY_BLEND: |
1330 | // Per-pixel alpha blits |
1331 | switch (df->bytes_per_pixel) { |
1332 | case 1: |
1333 | if (surface->map.info.dst_pal) { |
1334 | return BlitNto1PixelAlpha; |
1335 | } else { |
1336 | // RGB332 has no palette ! |
1337 | return BlitNtoNPixelAlpha; |
1338 | } |
1339 | |
1340 | case 2: |
1341 | if (sf->bytes_per_pixel == 4 && sf->Amask == 0xff000000 && sf->Gmask == 0xff00 && ((sf->Rmask == 0xff && df->Rmask == 0x1f) || (sf->Bmask == 0xff && df->Bmask == 0x1f))) { |
1342 | if (df->Gmask == 0x7e0) { |
1343 | return BlitARGBto565PixelAlpha; |
1344 | } else if (df->Gmask == 0x3e0 && !df->Amask) { |
1345 | return BlitARGBto555PixelAlpha; |
1346 | } |
1347 | } |
1348 | return BlitNtoNPixelAlpha; |
1349 | |
1350 | case 4: |
1351 | if (SDL_PIXELLAYOUT(sf->format) == SDL_PACKEDLAYOUT_8888 && sf->Amask && |
1352 | SDL_PIXELLAYOUT(df->format) == SDL_PACKEDLAYOUT_8888) { |
1353 | #ifdef SDL_AVX2_INTRINSICS |
1354 | if (SDL_HasAVX2()) { |
1355 | return Blit8888to8888PixelAlphaSwizzleAVX2; |
1356 | } |
1357 | #endif |
1358 | #ifdef SDL_SSE4_1_INTRINSICS |
1359 | if (SDL_HasSSE41()) { |
1360 | return Blit8888to8888PixelAlphaSwizzleSSE41; |
1361 | } |
1362 | #endif |
1363 | #if defined(SDL_NEON_INTRINSICS) && (__ARM_ARCH >= 8) |
1364 | // To prevent "unused function" compiler warnings/errors |
1365 | (void)Blit8888to8888PixelAlpha; |
1366 | (void)Blit8888to8888PixelAlphaSwizzle; |
1367 | return Blit8888to8888PixelAlphaSwizzleNEON; |
1368 | #else |
1369 | if (sf->format == df->format) { |
1370 | return Blit8888to8888PixelAlpha; |
1371 | } else { |
1372 | return Blit8888to8888PixelAlphaSwizzle; |
1373 | } |
1374 | #endif |
1375 | } |
1376 | return BlitNtoNPixelAlpha; |
1377 | |
1378 | case 3: |
1379 | default: |
1380 | break; |
1381 | } |
1382 | return BlitNtoNPixelAlpha; |
1383 | |
1384 | case SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND: |
1385 | if (sf->Amask == 0) { |
1386 | // Per-surface alpha blits |
1387 | switch (df->bytes_per_pixel) { |
1388 | case 1: |
1389 | if (surface->map.info.dst_pal) { |
1390 | return BlitNto1SurfaceAlpha; |
1391 | } else { |
1392 | // RGB332 has no palette ! |
1393 | return BlitNtoNSurfaceAlpha; |
1394 | } |
1395 | |
1396 | case 2: |
1397 | if (surface->map.identity) { |
1398 | if (df->Gmask == 0x7e0) { |
1399 | #ifdef SDL_MMX_INTRINSICS |
1400 | if (SDL_HasMMX()) { |
1401 | return Blit565to565SurfaceAlphaMMX; |
1402 | } else |
1403 | #endif |
1404 | { |
1405 | return Blit565to565SurfaceAlpha; |
1406 | } |
1407 | } else if (df->Gmask == 0x3e0) { |
1408 | #ifdef SDL_MMX_INTRINSICS |
1409 | if (SDL_HasMMX()) { |
1410 | return Blit555to555SurfaceAlphaMMX; |
1411 | } else |
1412 | #endif |
1413 | { |
1414 | return Blit555to555SurfaceAlpha; |
1415 | } |
1416 | } |
1417 | } |
1418 | return BlitNtoNSurfaceAlpha; |
1419 | |
1420 | case 4: |
1421 | if (sf->Rmask == df->Rmask && sf->Gmask == df->Gmask && sf->Bmask == df->Bmask && sf->bytes_per_pixel == 4) { |
1422 | #ifdef SDL_SSE2_INTRINSICS |
1423 | if (sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0 && SDL_HasSSE2()) { |
1424 | return Blit888to888SurfaceAlphaSSE2; |
1425 | } |
1426 | #endif |
1427 | if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) { |
1428 | return BlitRGBtoRGBSurfaceAlpha; |
1429 | } |
1430 | } |
1431 | return BlitNtoNSurfaceAlpha; |
1432 | |
1433 | case 3: |
1434 | default: |
1435 | return BlitNtoNSurfaceAlpha; |
1436 | } |
1437 | } |
1438 | break; |
1439 | |
1440 | case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND: |
1441 | if (sf->Amask == 0) { |
1442 | if (df->bytes_per_pixel == 1) { |
1443 | |
1444 | if (surface->map.info.dst_pal) { |
1445 | return BlitNto1SurfaceAlphaKey; |
1446 | } else { |
1447 | // RGB332 has no palette ! |
1448 | return BlitNtoNSurfaceAlphaKey; |
1449 | } |
1450 | } else { |
1451 | return BlitNtoNSurfaceAlphaKey; |
1452 | } |
1453 | } |
1454 | break; |
1455 | } |
1456 | |
1457 | return NULL; |
1458 | } |
1459 | |
1460 | #endif // SDL_HAVE_BLIT_A |
1461 | |
1462 | |