1/*
2 Simple DirectMedia Layer
3 Copyright (C) 1997-2021 Sam Lantinga <slouken@libsdl.org>
4
5 This software is provided 'as-is', without any express or implied
6 warranty. In no event will the authors be held liable for any damages
7 arising from the use of this software.
8
9 Permission is granted to anyone to use this software for any purpose,
10 including commercial applications, and to alter it and redistribute it
11 freely, subject to the following restrictions:
12
13 1. The origin of this software must not be misrepresented; you must not
14 claim that you wrote the original software. If you use this software
15 in a product, an acknowledgment in the product documentation would be
16 appreciated but is not required.
17 2. Altered source versions must be plainly marked as such, and must not be
18 misrepresented as being the original software.
19 3. This notice may not be removed or altered from any source distribution.
20*/
21#include "../SDL_internal.h"
22
23#if SDL_HAVE_BLIT_A
24
25#include "SDL_video.h"
26#include "SDL_blit.h"
27
28/* Functions to perform alpha blended blitting */
29
30/* N->1 blending with per-surface alpha */
31static void
32BlitNto1SurfaceAlpha(SDL_BlitInfo * info)
33{
34 int width = info->dst_w;
35 int height = info->dst_h;
36 Uint8 *src = info->src;
37 int srcskip = info->src_skip;
38 Uint8 *dst = info->dst;
39 int dstskip = info->dst_skip;
40 Uint8 *palmap = info->table;
41 SDL_PixelFormat *srcfmt = info->src_fmt;
42 SDL_PixelFormat *dstfmt = info->dst_fmt;
43 int srcbpp = srcfmt->BytesPerPixel;
44 Uint32 Pixel;
45 unsigned sR, sG, sB;
46 unsigned dR, dG, dB;
47 const unsigned A = info->a;
48
49 while (height--) {
50 /* *INDENT-OFF* */
51 DUFFS_LOOP4(
52 {
53 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
54 dR = dstfmt->palette->colors[*dst].r;
55 dG = dstfmt->palette->colors[*dst].g;
56 dB = dstfmt->palette->colors[*dst].b;
57 ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
58 dR &= 0xff;
59 dG &= 0xff;
60 dB &= 0xff;
61 /* Pack RGB into 8bit pixel */
62 if ( palmap == NULL ) {
63 *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
64 } else {
65 *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
66 }
67 dst++;
68 src += srcbpp;
69 },
70 width);
71 /* *INDENT-ON* */
72 src += srcskip;
73 dst += dstskip;
74 }
75}
76
77/* N->1 blending with pixel alpha */
78static void
79BlitNto1PixelAlpha(SDL_BlitInfo * info)
80{
81 int width = info->dst_w;
82 int height = info->dst_h;
83 Uint8 *src = info->src;
84 int srcskip = info->src_skip;
85 Uint8 *dst = info->dst;
86 int dstskip = info->dst_skip;
87 Uint8 *palmap = info->table;
88 SDL_PixelFormat *srcfmt = info->src_fmt;
89 SDL_PixelFormat *dstfmt = info->dst_fmt;
90 int srcbpp = srcfmt->BytesPerPixel;
91 Uint32 Pixel;
92 unsigned sR, sG, sB, sA;
93 unsigned dR, dG, dB;
94
95 while (height--) {
96 /* *INDENT-OFF* */
97 DUFFS_LOOP4(
98 {
99 DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
100 dR = dstfmt->palette->colors[*dst].r;
101 dG = dstfmt->palette->colors[*dst].g;
102 dB = dstfmt->palette->colors[*dst].b;
103 ALPHA_BLEND_RGB(sR, sG, sB, sA, dR, dG, dB);
104 dR &= 0xff;
105 dG &= 0xff;
106 dB &= 0xff;
107 /* Pack RGB into 8bit pixel */
108 if ( palmap == NULL ) {
109 *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
110 } else {
111 *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
112 }
113 dst++;
114 src += srcbpp;
115 },
116 width);
117 /* *INDENT-ON* */
118 src += srcskip;
119 dst += dstskip;
120 }
121}
122
123/* colorkeyed N->1 blending with per-surface alpha */
124static void
125BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info)
126{
127 int width = info->dst_w;
128 int height = info->dst_h;
129 Uint8 *src = info->src;
130 int srcskip = info->src_skip;
131 Uint8 *dst = info->dst;
132 int dstskip = info->dst_skip;
133 Uint8 *palmap = info->table;
134 SDL_PixelFormat *srcfmt = info->src_fmt;
135 SDL_PixelFormat *dstfmt = info->dst_fmt;
136 int srcbpp = srcfmt->BytesPerPixel;
137 Uint32 ckey = info->colorkey;
138 Uint32 Pixel;
139 unsigned sR, sG, sB;
140 unsigned dR, dG, dB;
141 const unsigned A = info->a;
142
143 while (height--) {
144 /* *INDENT-OFF* */
145 DUFFS_LOOP(
146 {
147 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
148 if ( Pixel != ckey ) {
149 dR = dstfmt->palette->colors[*dst].r;
150 dG = dstfmt->palette->colors[*dst].g;
151 dB = dstfmt->palette->colors[*dst].b;
152 ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
153 dR &= 0xff;
154 dG &= 0xff;
155 dB &= 0xff;
156 /* Pack RGB into 8bit pixel */
157 if ( palmap == NULL ) {
158 *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
159 } else {
160 *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
161 }
162 }
163 dst++;
164 src += srcbpp;
165 },
166 width);
167 /* *INDENT-ON* */
168 src += srcskip;
169 dst += dstskip;
170 }
171}
172
173#ifdef __MMX__
174
175/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
176static void
177BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
178{
179 int width = info->dst_w;
180 int height = info->dst_h;
181 Uint32 *srcp = (Uint32 *) info->src;
182 int srcskip = info->src_skip >> 2;
183 Uint32 *dstp = (Uint32 *) info->dst;
184 int dstskip = info->dst_skip >> 2;
185 Uint32 dalpha = info->dst_fmt->Amask;
186
187 __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
188
189 hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
190 lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
191 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
192
193 while (height--) {
194 int n = width;
195 if (n & 1) {
196 Uint32 s = *srcp++;
197 Uint32 d = *dstp;
198 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
199 + (s & d & 0x00010101)) | dalpha;
200 n--;
201 }
202
203 for (n >>= 1; n > 0; --n) {
204 dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
205 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
206
207 src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
208 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
209
210 dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
211 src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
212 src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
213 src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
214
215 dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
216 dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
217 dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
218 dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
219
220 *(__m64 *) dstp = dst1; /* dst1 -> 2 x dst pixels */
221 dstp += 2;
222 srcp += 2;
223 }
224
225 srcp += srcskip;
226 dstp += dstskip;
227 }
228 _mm_empty();
229}
230
231/* fast RGB888->(A)RGB888 blending with surface alpha */
232static void
233BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
234{
235 SDL_PixelFormat *df = info->dst_fmt;
236 Uint32 chanmask;
237 unsigned alpha = info->a;
238
239 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
240 /* only call a128 version when R,G,B occupy lower bits */
241 BlitRGBtoRGBSurfaceAlpha128MMX(info);
242 } else {
243 int width = info->dst_w;
244 int height = info->dst_h;
245 Uint32 *srcp = (Uint32 *) info->src;
246 int srcskip = info->src_skip >> 2;
247 Uint32 *dstp = (Uint32 *) info->dst;
248 int dstskip = info->dst_skip >> 2;
249 Uint32 dalpha = df->Amask;
250 Uint32 amult;
251
252 __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
253
254 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
255 /* form the alpha mult */
256 amult = alpha | (alpha << 8);
257 amult = amult | (amult << 16);
258 chanmask =
259 (0xff << df->Rshift) | (0xff << df->
260 Gshift) | (0xff << df->Bshift);
261 mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
262 mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
263 /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
264 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
265
266 while (height--) {
267 int n = width;
268 if (n & 1) {
269 /* One Pixel Blend */
270 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */
271 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
272
273 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
274 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
275
276 src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst2 -> src2 */
277 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
278 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
279 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
280
281 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
282 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
283 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
284
285 ++srcp;
286 ++dstp;
287
288 n--;
289 }
290
291 for (n >>= 1; n > 0; --n) {
292 /* Two Pixels Blend */
293 src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
294 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
295 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
296 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
297
298 dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
299 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
300 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
301 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
302
303 src1 = _mm_sub_pi16(src1, dst1); /* src1 - dst1 -> src1 */
304 src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
305 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
306 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
307
308 src2 = _mm_sub_pi16(src2, dst2); /* src2 - dst2 -> src2 */
309 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
310 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
311 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
312
313 dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
314 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
315
316 *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
317
318 srcp += 2;
319 dstp += 2;
320 }
321 srcp += srcskip;
322 dstp += dstskip;
323 }
324 _mm_empty();
325 }
326}
327
328/* fast ARGB888->(A)RGB888 blending with pixel alpha */
329static void
330BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
331{
332 int width = info->dst_w;
333 int height = info->dst_h;
334 Uint32 *srcp = (Uint32 *) info->src;
335 int srcskip = info->src_skip >> 2;
336 Uint32 *dstp = (Uint32 *) info->dst;
337 int dstskip = info->dst_skip >> 2;
338 SDL_PixelFormat *sf = info->src_fmt;
339 Uint32 amask = sf->Amask;
340 Uint32 ashift = sf->Ashift;
341 Uint64 multmask, multmask2;
342
343 __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
344
345 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
346 multmask = 0x00FF;
347 multmask <<= (ashift * 2);
348 multmask2 = 0x00FF00FF00FF00FFULL;
349
350 while (height--) {
351 /* *INDENT-OFF* */
352 DUFFS_LOOP4({
353 Uint32 alpha = *srcp & amask;
354 if (alpha == 0) {
355 /* do nothing */
356 } else if (alpha == amask) {
357 *dstp = *srcp;
358 } else {
359 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
360 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
361
362 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
363 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
364
365 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
366 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
367 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
368 mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
369 mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask); /* 0F0A0A0A -> mm_alpha */
370 mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2); /* 255 - mm_alpha -> mm_alpha */
371
372 /* blend */
373 src1 = _mm_mullo_pi16(src1, mm_alpha);
374 src1 = _mm_srli_pi16(src1, 8);
375 dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
376 dst1 = _mm_srli_pi16(dst1, 8);
377 dst1 = _mm_add_pi16(src1, dst1);
378 dst1 = _mm_packs_pu16(dst1, mm_zero);
379
380 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
381 }
382 ++srcp;
383 ++dstp;
384 }, width);
385 /* *INDENT-ON* */
386 srcp += srcskip;
387 dstp += dstskip;
388 }
389 _mm_empty();
390}
391
392#endif /* __MMX__ */
393
394#if SDL_ARM_SIMD_BLITTERS
395void BlitARGBto565PixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
396
397static void
398BlitARGBto565PixelAlphaARMSIMD(SDL_BlitInfo * info)
399{
400 int32_t width = info->dst_w;
401 int32_t height = info->dst_h;
402 uint16_t *dstp = (uint16_t *)info->dst;
403 int32_t dststride = width + (info->dst_skip >> 1);
404 uint32_t *srcp = (uint32_t *)info->src;
405 int32_t srcstride = width + (info->src_skip >> 2);
406
407 BlitARGBto565PixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
408}
409
410void BlitRGBtoRGBPixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
411
412static void
413BlitRGBtoRGBPixelAlphaARMSIMD(SDL_BlitInfo * info)
414{
415 int32_t width = info->dst_w;
416 int32_t height = info->dst_h;
417 uint32_t *dstp = (uint32_t *)info->dst;
418 int32_t dststride = width + (info->dst_skip >> 2);
419 uint32_t *srcp = (uint32_t *)info->src;
420 int32_t srcstride = width + (info->src_skip >> 2);
421
422 BlitRGBtoRGBPixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
423}
424#endif
425
426#if SDL_ARM_NEON_BLITTERS
427void BlitARGBto565PixelAlphaARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
428
429static void
430BlitARGBto565PixelAlphaARMNEON(SDL_BlitInfo * info)
431{
432 int32_t width = info->dst_w;
433 int32_t height = info->dst_h;
434 uint16_t *dstp = (uint16_t *)info->dst;
435 int32_t dststride = width + (info->dst_skip >> 1);
436 uint32_t *srcp = (uint32_t *)info->src;
437 int32_t srcstride = width + (info->src_skip >> 2);
438
439 BlitARGBto565PixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride);
440}
441
442void BlitRGBtoRGBPixelAlphaARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
443
444static void
445BlitRGBtoRGBPixelAlphaARMNEON(SDL_BlitInfo * info)
446{
447 int32_t width = info->dst_w;
448 int32_t height = info->dst_h;
449 uint32_t *dstp = (uint32_t *)info->dst;
450 int32_t dststride = width + (info->dst_skip >> 2);
451 uint32_t *srcp = (uint32_t *)info->src;
452 int32_t srcstride = width + (info->src_skip >> 2);
453
454 BlitRGBtoRGBPixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride);
455}
456#endif
457
458/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
459static void
460BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
461{
462 int width = info->dst_w;
463 int height = info->dst_h;
464 Uint32 *srcp = (Uint32 *) info->src;
465 int srcskip = info->src_skip >> 2;
466 Uint32 *dstp = (Uint32 *) info->dst;
467 int dstskip = info->dst_skip >> 2;
468
469 while (height--) {
470 /* *INDENT-OFF* */
471 DUFFS_LOOP4({
472 Uint32 s = *srcp++;
473 Uint32 d = *dstp;
474 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
475 + (s & d & 0x00010101)) | 0xff000000;
476 }, width);
477 /* *INDENT-ON* */
478 srcp += srcskip;
479 dstp += dstskip;
480 }
481}
482
483/* fast RGB888->(A)RGB888 blending with surface alpha */
484static void
485BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)
486{
487 unsigned alpha = info->a;
488 if (alpha == 128) {
489 BlitRGBtoRGBSurfaceAlpha128(info);
490 } else {
491 int width = info->dst_w;
492 int height = info->dst_h;
493 Uint32 *srcp = (Uint32 *) info->src;
494 int srcskip = info->src_skip >> 2;
495 Uint32 *dstp = (Uint32 *) info->dst;
496 int dstskip = info->dst_skip >> 2;
497 Uint32 s;
498 Uint32 d;
499 Uint32 s1;
500 Uint32 d1;
501
502 while (height--) {
503 /* *INDENT-OFF* */
504 DUFFS_LOOP4({
505 s = *srcp;
506 d = *dstp;
507 s1 = s & 0xff00ff;
508 d1 = d & 0xff00ff;
509 d1 = (d1 + ((s1 - d1) * alpha >> 8))
510 & 0xff00ff;
511 s &= 0xff00;
512 d &= 0xff00;
513 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
514 *dstp = d1 | d | 0xff000000;
515 ++srcp;
516 ++dstp;
517 }, width);
518 /* *INDENT-ON* */
519 srcp += srcskip;
520 dstp += dstskip;
521 }
522 }
523}
524
525/* fast ARGB888->(A)RGB888 blending with pixel alpha */
526static void
527BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
528{
529 int width = info->dst_w;
530 int height = info->dst_h;
531 Uint32 *srcp = (Uint32 *) info->src;
532 int srcskip = info->src_skip >> 2;
533 Uint32 *dstp = (Uint32 *) info->dst;
534 int dstskip = info->dst_skip >> 2;
535
536 while (height--) {
537 /* *INDENT-OFF* */
538 DUFFS_LOOP4({
539 Uint32 dalpha;
540 Uint32 d;
541 Uint32 s1;
542 Uint32 d1;
543 Uint32 s = *srcp;
544 Uint32 alpha = s >> 24;
545 /* FIXME: Here we special-case opaque alpha since the
546 compositioning used (>>8 instead of /255) doesn't handle
547 it correctly. Also special-case alpha=0 for speed?
548 Benchmark this! */
549 if (alpha) {
550 if (alpha == SDL_ALPHA_OPAQUE) {
551 *dstp = *srcp;
552 } else {
553 /*
554 * take out the middle component (green), and process
555 * the other two in parallel. One multiply less.
556 */
557 d = *dstp;
558 dalpha = d >> 24;
559 s1 = s & 0xff00ff;
560 d1 = d & 0xff00ff;
561 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
562 s &= 0xff00;
563 d &= 0xff00;
564 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
565 dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8);
566 *dstp = d1 | d | (dalpha << 24);
567 }
568 }
569 ++srcp;
570 ++dstp;
571 }, width);
572 /* *INDENT-ON* */
573 srcp += srcskip;
574 dstp += dstskip;
575 }
576}
577
578#ifdef __3dNOW__
/*
 * Fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha.
 *
 * Per pixel: source alpha bits of zero leave the destination alone, alpha
 * bits equal to the full source Amask copy the source pixel verbatim, and
 * anything in between is blended as
 *     (s * a >> 8) + (d * (255 - a) >> 8)
 * in 16-bit lanes.  Before each pixel a prefetch is issued for the address
 * 16 pixels ahead in both the source and the destination.
 */
static void
BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
{
    int width = info->dst_w;
    int height = info->dst_h;
    Uint32 *srcp = (Uint32 *) info->src;
    int srcskip = info->src_skip >> 2;  /* skip in 32-bit pixel units */
    Uint32 *dstp = (Uint32 *) info->dst;
    int dstskip = info->dst_skip >> 2;  /* skip in 32-bit pixel units */
    SDL_PixelFormat *sf = info->src_fmt;
    Uint32 amask = sf->Amask;
    Uint32 ashift = sf->Ashift;
    Uint64 multmask, multmask2;

    __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;

    mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
    /* 0xFF placed in the 16-bit lane that lines up with the alpha byte */
    multmask = 0x00FF;
    multmask <<= (ashift * 2);
    /* 0xFF in every 16-bit lane; XOR with it turns a into 255 - a */
    multmask2 = 0x00FF00FF00FF00FFULL;

    while (height--) {
        /* *INDENT-OFF* */
        DUFFS_LOOP4({
        Uint32 alpha;

        _m_prefetch(srcp + 16);
        _m_prefetch(dstp + 16);

        alpha = *srcp & amask;
        if (alpha == 0) {
            /* do nothing */
        } else if (alpha == amask) {
            /* all alpha bits set: plain copy */
            *dstp = *srcp;
        } else {
            src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
            src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */

            dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
            dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */

            mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
            mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
            mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
            mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
            mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);   /* 0F0A0A0A -> mm_alpha */
            mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);    /* 255 - alpha -> mm_alpha2 */


            /* blend: (src * a >> 8) + (dst * (255 - a) >> 8), then repack to bytes */
            src1 = _mm_mullo_pi16(src1, mm_alpha);
            src1 = _mm_srli_pi16(src1, 8);
            dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
            dst1 = _mm_srli_pi16(dst1, 8);
            dst1 = _mm_add_pi16(src1, dst1);
            dst1 = _mm_packs_pu16(dst1, mm_zero);

            *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
        }
        ++srcp;
        ++dstp;
        }, width);
        /* *INDENT-ON* */
        srcp += srcskip;
        dstp += dstskip;
    }
    _mm_empty();
}
648
649#endif /* __3dNOW__ */
650
651/* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
652
/*
 * blend a single 16 bit pixel at 50%
 *
 * `mask` selects every channel bit except the lowest bit of each channel, so
 * the shared >> 1 cannot carry between channels; the bit dropped that way is
 * added back when both inputs had it set.  Arguments are evaluated more than
 * once, so pass side-effect-free expressions.
 */
#define BLEND16_50(d, s, mask)                                          \
    (((((s) & (mask)) + ((d) & (mask))) >> 1) +                         \
     ((s) & (d) & (~(mask) & 0xffff)))

/*
 * blend two 16 bit pixels (packed in one 32 bit word) at 50%
 *
 * The 16-bit mask is widened to Uint32 before being replicated into the
 * upper half: shifting a promoted (signed int) mask such as 0xf7de left by
 * 16 would overflow and is undefined behaviour.
 */
#define BLEND2x16_50(d, s, mask)                                        \
    ((((s) & ((Uint32) (mask) | ((Uint32) (mask) << 16))) >> 1) +       \
     (((d) & ((Uint32) (mask) | ((Uint32) (mask) << 16))) >> 1) +       \
     ((s) & (d) & (~((Uint32) (mask) | ((Uint32) (mask) << 16)))))
661
/*
 * 50% blend of a 16bpp source onto a 16bpp destination, two pixels at a time
 * where possible.
 *
 * info: blit description (pointers, width/height, row skips).
 * mask: 16-bit mask handed to BLEND16_50 / BLEND2x16_50; the callers in this
 *       file pass 0xf7de for 565 and 0xfbde for 555.
 *
 * For every row the code checks whether source and destination pointers
 * agree in bit 1 of their address.  If they do, both can be brought to
 * 32-bit alignment together and pixel pairs are blended word-by-word.  If
 * they do not, the destination is aligned and the source words are read
 * aligned too, with each pair of source pixels assembled from the halves of
 * two consecutive words (prev_sw / sw).
 */
static void
Blit16to16SurfaceAlpha128(SDL_BlitInfo * info, Uint16 mask)
{
    int width = info->dst_w;
    int height = info->dst_h;
    Uint16 *srcp = (Uint16 *) info->src;
    int srcskip = info->src_skip >> 1;  /* skip in 16-bit pixel units */
    Uint16 *dstp = (Uint16 *) info->dst;
    int dstskip = info->dst_skip >> 1;  /* skip in 16-bit pixel units */

    while (height--) {
        if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) {
            /*
             * Source and destination not aligned, pipeline it.
             * This is mostly a win for big blits but no loss for
             * small ones
             */
            Uint32 prev_sw;
            int w = width;

            /* handle odd destination */
            if ((uintptr_t) dstp & 2) {
                Uint16 d = *dstp, s = *srcp;
                *dstp = BLEND16_50(d, s, mask);
                dstp++;
                srcp++;
                w--;
            }
            srcp++;             /* srcp is now 32-bit aligned */

            /*
             * bootstrap pipeline with first halfword
             * NOTE(review): this loads the whole aligned word that ends at
             * srcp, i.e. the next pixel to blend plus the halfword before
             * it; only the half holding the next pixel is used below.
             */
            prev_sw = ((Uint32 *) srcp)[-1];

            while (w > 1) {
                Uint32 sw, dw, s;
                sw = *(Uint32 *) srcp;
                dw = *(Uint32 *) dstp;
                /* s = second half of the previous word + first half of this one */
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
                s = (prev_sw << 16) + (sw >> 16);
#else
                s = (prev_sw >> 16) + (sw << 16);
#endif
                prev_sw = sw;
                *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
                dstp += 2;
                srcp += 2;
                w -= 2;
            }

            /* final pixel if any: it is already sitting in prev_sw */
            if (w) {
                Uint16 d = *dstp, s;
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
                s = (Uint16) prev_sw;
#else
                s = (Uint16) (prev_sw >> 16);
#endif
                *dstp = BLEND16_50(d, s, mask);
                srcp++;
                dstp++;
            }
            /* "- 1" undoes the extra srcp++ done above for alignment */
            srcp += srcskip - 1;
            dstp += dstskip;
        } else {
            /* source and destination are aligned */
            int w = width;

            /* first odd pixel? */
            if ((uintptr_t) srcp & 2) {
                Uint16 d = *dstp, s = *srcp;
                *dstp = BLEND16_50(d, s, mask);
                srcp++;
                dstp++;
                w--;
            }
            /* srcp and dstp are now 32-bit aligned */

            while (w > 1) {
                Uint32 sw = *(Uint32 *) srcp;
                Uint32 dw = *(Uint32 *) dstp;
                *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
                srcp += 2;
                dstp += 2;
                w -= 2;
            }

            /* last odd pixel? */
            if (w) {
                Uint16 d = *dstp, s = *srcp;
                *dstp = BLEND16_50(d, s, mask);
                srcp++;
                dstp++;
            }
            srcp += srcskip;
            dstp += dstskip;
        }
    }
}
760
761#ifdef __MMX__
762
/*
 * Fast RGB565->RGB565 blending with surface alpha (MMX).
 *
 * An alpha of exactly 128 is delegated to Blit16to16SurfaceAlpha128() with
 * the 565 mask 0xf7de.  Otherwise the three bodies handed to DUFFS_LOOP_124
 * blend one pixel (scalar), two pixels (scalar) and four pixels (MMX)
 * respectively.  The scalar path spreads green into the upper half of a
 * 32-bit word so all three channels are scaled by one multiply with a 5-bit
 * alpha; the MMX path handles red, green and blue of four pixels in turn.
 */
static void
Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
{
    unsigned alpha = info->a;
    if (alpha == 128) {
        Blit16to16SurfaceAlpha128(info, 0xf7de);
    } else {
        int width = info->dst_w;
        int height = info->dst_h;
        Uint16 *srcp = (Uint16 *) info->src;
        int srcskip = info->src_skip >> 1;      /* skip in 16-bit pixel units */
        Uint16 *dstp = (Uint16 *) info->dst;
        int dstskip = info->dst_skip >> 1;      /* skip in 16-bit pixel units */
        Uint32 s, d;

        __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;

        alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
        mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
        alpha >>= 3;            /* downscale alpha to 5 bits */

        mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
        mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
        /* position alpha to allow for mullo and mulhi on diff channels
           to reduce the number of operations */
        mm_alpha = _mm_slli_si64(mm_alpha, 3);

        /* Setup the 565 color channel masks */
        gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);   /* MASKGREEN -> gmask */
        bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */

        while (height--) {
            /* *INDENT-OFF* */
            DUFFS_LOOP_124(
            {
                /* one pixel, scalar */
                s = *srcp++;
                d = *dstp;
                /*
                 * shift out the middle component (green) to
                 * the high 16 bits, and process all three RGB
                 * components at the same time.
                 */
                s = (s | s << 16) & 0x07e0f81f;
                d = (d | d << 16) & 0x07e0f81f;
                d += (s - d) * alpha >> 5;
                d &= 0x07e0f81f;
                *dstp++ = (Uint16)(d | d >> 16);
            },{
                /* two pixels, scalar */
                s = *srcp++;
                d = *dstp;
                /*
                 * shift out the middle component (green) to
                 * the high 16 bits, and process all three RGB
                 * components at the same time.
                 */
                s = (s | s << 16) & 0x07e0f81f;
                d = (d | d << 16) & 0x07e0f81f;
                d += (s - d) * alpha >> 5;
                d &= 0x07e0f81f;
                *dstp++ = (Uint16)(d | d >> 16);
                s = *srcp++;
                d = *dstp;
                /*
                 * shift out the middle component (green) to
                 * the high 16 bits, and process all three RGB
                 * components at the same time.
                 */
                s = (s | s << 16) & 0x07e0f81f;
                d = (d | d << 16) & 0x07e0f81f;
                d += (s - d) * alpha >> 5;
                d &= 0x07e0f81f;
                *dstp++ = (Uint16)(d | d >> 16);
            },{
                /* four pixels, MMX */
                src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
                dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */

                /* red */
                src2 = src1;
                src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */

                dst2 = dst1;
                dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */

                /* blend */
                src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
                src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
                src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
                dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
                dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */

                mm_res = dst2; /* RED -> mm_res */

                /* green -- process the bits in place */
                src2 = src1;
                src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */

                dst2 = dst1;
                dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */

                /* blend */
                src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
                src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
                src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
                dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */

                mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */

                /* blue */
                src2 = src1;
                src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */

                dst2 = dst1;
                dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */

                /* blend */
                src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
                src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
                src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
                dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
                dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */

                mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */

                *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */

                srcp += 4;
                dstp += 4;
            }, width);
            /* *INDENT-ON* */
            srcp += srcskip;
            dstp += dstskip;
        }
        _mm_empty();
    }
}
899
/*
 * Fast RGB555->RGB555 blending with surface alpha (MMX).
 *
 * An alpha of exactly 128 is delegated to Blit16to16SurfaceAlpha128() with
 * the 555 mask 0xfbde.  Otherwise the three bodies handed to DUFFS_LOOP_124
 * blend one pixel (scalar), two pixels (scalar) and four pixels (MMX)
 * respectively.  The scalar path spreads green into the upper half of a
 * 32-bit word so all three channels are scaled by one multiply with a 5-bit
 * alpha; the MMX path handles red, green and blue of four pixels in turn,
 * with red and green processed in place under their masks.
 */
static void
Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
{
    unsigned alpha = info->a;
    if (alpha == 128) {
        Blit16to16SurfaceAlpha128(info, 0xfbde);
    } else {
        int width = info->dst_w;
        int height = info->dst_h;
        Uint16 *srcp = (Uint16 *) info->src;
        int srcskip = info->src_skip >> 1;      /* skip in 16-bit pixel units */
        Uint16 *dstp = (Uint16 *) info->dst;
        int dstskip = info->dst_skip >> 1;      /* skip in 16-bit pixel units */
        Uint32 s, d;

        __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;

        alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
        mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
        alpha >>= 3;            /* downscale alpha to 5 bits */

        mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
        mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
        /* position alpha to allow for mullo and mulhi on diff channels
           to reduce the number of operations */
        mm_alpha = _mm_slli_si64(mm_alpha, 3);

        /* Setup the 555 color channel masks */
        rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00);   /* MASKRED -> rmask */
        gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0);   /* MASKGREEN -> gmask */
        bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */

        while (height--) {
            /* *INDENT-OFF* */
            DUFFS_LOOP_124(
            {
                /* one pixel, scalar */
                s = *srcp++;
                d = *dstp;
                /*
                 * shift out the middle component (green) to
                 * the high 16 bits, and process all three RGB
                 * components at the same time.
                 */
                s = (s | s << 16) & 0x03e07c1f;
                d = (d | d << 16) & 0x03e07c1f;
                d += (s - d) * alpha >> 5;
                d &= 0x03e07c1f;
                *dstp++ = (Uint16)(d | d >> 16);
            },{
                /* two pixels, scalar */
                s = *srcp++;
                d = *dstp;
                /*
                 * shift out the middle component (green) to
                 * the high 16 bits, and process all three RGB
                 * components at the same time.
                 */
                s = (s | s << 16) & 0x03e07c1f;
                d = (d | d << 16) & 0x03e07c1f;
                d += (s - d) * alpha >> 5;
                d &= 0x03e07c1f;
                *dstp++ = (Uint16)(d | d >> 16);
                s = *srcp++;
                d = *dstp;
                /*
                 * shift out the middle component (green) to
                 * the high 16 bits, and process all three RGB
                 * components at the same time.
                 */
                s = (s | s << 16) & 0x03e07c1f;
                d = (d | d << 16) & 0x03e07c1f;
                d += (s - d) * alpha >> 5;
                d &= 0x03e07c1f;
                *dstp++ = (Uint16)(d | d >> 16);
            },{
                /* four pixels, MMX */
                src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
                dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */

                /* red -- process the bits in place */
                src2 = src1;
                src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */

                dst2 = dst1;
                dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */

                /* blend */
                src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
                src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
                src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
                dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
                dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */

                mm_res = dst2; /* RED -> mm_res */

                /* green -- process the bits in place */
                src2 = src1;
                src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */

                dst2 = dst1;
                dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */

                /* blend */
                src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
                src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
                src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
                dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */

                mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */

                /* blue */
                src2 = src1; /* src -> src2 */
                src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */

                dst2 = dst1; /* dst -> dst2 */
                dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */

                /* blend */
                src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
                src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
                src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
                dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
                dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */

                mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */

                *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */

                srcp += 4;
                dstp += 4;
            }, width);
            /* *INDENT-ON* */
            srcp += srcskip;
            dstp += dstskip;
        }
        _mm_empty();
    }
}
1037
1038#endif /* __MMX__ */
1039
1040/* fast RGB565->RGB565 blending with surface alpha */
1041static void
1042Blit565to565SurfaceAlpha(SDL_BlitInfo * info)
1043{
1044 unsigned alpha = info->a;
1045 if (alpha == 128) {
1046 Blit16to16SurfaceAlpha128(info, 0xf7de);
1047 } else {
1048 int width = info->dst_w;
1049 int height = info->dst_h;
1050 Uint16 *srcp = (Uint16 *) info->src;
1051 int srcskip = info->src_skip >> 1;
1052 Uint16 *dstp = (Uint16 *) info->dst;
1053 int dstskip = info->dst_skip >> 1;
1054 alpha >>= 3; /* downscale alpha to 5 bits */
1055
1056 while (height--) {
1057 /* *INDENT-OFF* */
1058 DUFFS_LOOP4({
1059 Uint32 s = *srcp++;
1060 Uint32 d = *dstp;
1061 /*
1062 * shift out the middle component (green) to
1063 * the high 16 bits, and process all three RGB
1064 * components at the same time.
1065 */
1066 s = (s | s << 16) & 0x07e0f81f;
1067 d = (d | d << 16) & 0x07e0f81f;
1068 d += (s - d) * alpha >> 5;
1069 d &= 0x07e0f81f;
1070 *dstp++ = (Uint16)(d | d >> 16);
1071 }, width);
1072 /* *INDENT-ON* */
1073 srcp += srcskip;
1074 dstp += dstskip;
1075 }
1076 }
1077}
1078
1079/* fast RGB555->RGB555 blending with surface alpha */
1080static void
1081Blit555to555SurfaceAlpha(SDL_BlitInfo * info)
1082{
1083 unsigned alpha = info->a; /* downscale alpha to 5 bits */
1084 if (alpha == 128) {
1085 Blit16to16SurfaceAlpha128(info, 0xfbde);
1086 } else {
1087 int width = info->dst_w;
1088 int height = info->dst_h;
1089 Uint16 *srcp = (Uint16 *) info->src;
1090 int srcskip = info->src_skip >> 1;
1091 Uint16 *dstp = (Uint16 *) info->dst;
1092 int dstskip = info->dst_skip >> 1;
1093 alpha >>= 3; /* downscale alpha to 5 bits */
1094
1095 while (height--) {
1096 /* *INDENT-OFF* */
1097 DUFFS_LOOP4({
1098 Uint32 s = *srcp++;
1099 Uint32 d = *dstp;
1100 /*
1101 * shift out the middle component (green) to
1102 * the high 16 bits, and process all three RGB
1103 * components at the same time.
1104 */
1105 s = (s | s << 16) & 0x03e07c1f;
1106 d = (d | d << 16) & 0x03e07c1f;
1107 d += (s - d) * alpha >> 5;
1108 d &= 0x03e07c1f;
1109 *dstp++ = (Uint16)(d | d >> 16);
1110 }, width);
1111 /* *INDENT-ON* */
1112 srcp += srcskip;
1113 dstp += dstskip;
1114 }
1115 }
1116}
1117
1118/* fast ARGB8888->RGB565 blending with pixel alpha */
1119static void
1120BlitARGBto565PixelAlpha(SDL_BlitInfo * info)
1121{
1122 int width = info->dst_w;
1123 int height = info->dst_h;
1124 Uint32 *srcp = (Uint32 *) info->src;
1125 int srcskip = info->src_skip >> 2;
1126 Uint16 *dstp = (Uint16 *) info->dst;
1127 int dstskip = info->dst_skip >> 1;
1128
1129 while (height--) {
1130 /* *INDENT-OFF* */
1131 DUFFS_LOOP4({
1132 Uint32 s = *srcp;
1133 unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
1134 /* FIXME: Here we special-case opaque alpha since the
1135 compositioning used (>>8 instead of /255) doesn't handle
1136 it correctly. Also special-case alpha=0 for speed?
1137 Benchmark this! */
1138 if(alpha) {
1139 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
1140 *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3 & 0x1f));
1141 } else {
1142 Uint32 d = *dstp;
1143 /*
1144 * convert source and destination to G0RAB65565
1145 * and blend all components at the same time
1146 */
1147 s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
1148 + (s >> 3 & 0x1f);
1149 d = (d | d << 16) & 0x07e0f81f;
1150 d += (s - d) * alpha >> 5;
1151 d &= 0x07e0f81f;
1152 *dstp = (Uint16)(d | d >> 16);
1153 }
1154 }
1155 srcp++;
1156 dstp++;
1157 }, width);
1158 /* *INDENT-ON* */
1159 srcp += srcskip;
1160 dstp += dstskip;
1161 }
1162}
1163
1164/* fast ARGB8888->RGB555 blending with pixel alpha */
1165static void
1166BlitARGBto555PixelAlpha(SDL_BlitInfo * info)
1167{
1168 int width = info->dst_w;
1169 int height = info->dst_h;
1170 Uint32 *srcp = (Uint32 *) info->src;
1171 int srcskip = info->src_skip >> 2;
1172 Uint16 *dstp = (Uint16 *) info->dst;
1173 int dstskip = info->dst_skip >> 1;
1174
1175 while (height--) {
1176 /* *INDENT-OFF* */
1177 DUFFS_LOOP4({
1178 unsigned alpha;
1179 Uint32 s = *srcp;
1180 alpha = s >> 27; /* downscale alpha to 5 bits */
1181 /* FIXME: Here we special-case opaque alpha since the
1182 compositioning used (>>8 instead of /255) doesn't handle
1183 it correctly. Also special-case alpha=0 for speed?
1184 Benchmark this! */
1185 if(alpha) {
1186 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
1187 *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3 & 0x1f));
1188 } else {
1189 Uint32 d = *dstp;
1190 /*
1191 * convert source and destination to G0RAB65565
1192 * and blend all components at the same time
1193 */
1194 s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
1195 + (s >> 3 & 0x1f);
1196 d = (d | d << 16) & 0x03e07c1f;
1197 d += (s - d) * alpha >> 5;
1198 d &= 0x03e07c1f;
1199 *dstp = (Uint16)(d | d >> 16);
1200 }
1201 }
1202 srcp++;
1203 dstp++;
1204 }, width);
1205 /* *INDENT-ON* */
1206 srcp += srcskip;
1207 dstp += dstskip;
1208 }
1209}
1210
1211/* General (slow) N->N blending with per-surface alpha */
1212static void
1213BlitNtoNSurfaceAlpha(SDL_BlitInfo * info)
1214{
1215 int width = info->dst_w;
1216 int height = info->dst_h;
1217 Uint8 *src = info->src;
1218 int srcskip = info->src_skip;
1219 Uint8 *dst = info->dst;
1220 int dstskip = info->dst_skip;
1221 SDL_PixelFormat *srcfmt = info->src_fmt;
1222 SDL_PixelFormat *dstfmt = info->dst_fmt;
1223 int srcbpp = srcfmt->BytesPerPixel;
1224 int dstbpp = dstfmt->BytesPerPixel;
1225 Uint32 Pixel;
1226 unsigned sR, sG, sB;
1227 unsigned dR, dG, dB, dA;
1228 const unsigned sA = info->a;
1229
1230 if (sA) {
1231 while (height--) {
1232 /* *INDENT-OFF* */
1233 DUFFS_LOOP4(
1234 {
1235 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
1236 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
1237 ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
1238 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
1239 src += srcbpp;
1240 dst += dstbpp;
1241 },
1242 width);
1243 /* *INDENT-ON* */
1244 src += srcskip;
1245 dst += dstskip;
1246 }
1247 }
1248}
1249
1250/* General (slow) colorkeyed N->N blending with per-surface alpha */
1251static void
1252BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info)
1253{
1254 int width = info->dst_w;
1255 int height = info->dst_h;
1256 Uint8 *src = info->src;
1257 int srcskip = info->src_skip;
1258 Uint8 *dst = info->dst;
1259 int dstskip = info->dst_skip;
1260 SDL_PixelFormat *srcfmt = info->src_fmt;
1261 SDL_PixelFormat *dstfmt = info->dst_fmt;
1262 Uint32 ckey = info->colorkey;
1263 int srcbpp = srcfmt->BytesPerPixel;
1264 int dstbpp = dstfmt->BytesPerPixel;
1265 Uint32 Pixel;
1266 unsigned sR, sG, sB;
1267 unsigned dR, dG, dB, dA;
1268 const unsigned sA = info->a;
1269
1270 while (height--) {
1271 /* *INDENT-OFF* */
1272 DUFFS_LOOP4(
1273 {
1274 RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
1275 if(sA && Pixel != ckey) {
1276 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
1277 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
1278 ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
1279 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
1280 }
1281 src += srcbpp;
1282 dst += dstbpp;
1283 },
1284 width);
1285 /* *INDENT-ON* */
1286 src += srcskip;
1287 dst += dstskip;
1288 }
1289}
1290
1291/* General (slow) N->N blending with pixel alpha */
1292static void
1293BlitNtoNPixelAlpha(SDL_BlitInfo * info)
1294{
1295 int width = info->dst_w;
1296 int height = info->dst_h;
1297 Uint8 *src = info->src;
1298 int srcskip = info->src_skip;
1299 Uint8 *dst = info->dst;
1300 int dstskip = info->dst_skip;
1301 SDL_PixelFormat *srcfmt = info->src_fmt;
1302 SDL_PixelFormat *dstfmt = info->dst_fmt;
1303 int srcbpp;
1304 int dstbpp;
1305 Uint32 Pixel;
1306 unsigned sR, sG, sB, sA;
1307 unsigned dR, dG, dB, dA;
1308
1309 /* Set up some basic variables */
1310 srcbpp = srcfmt->BytesPerPixel;
1311 dstbpp = dstfmt->BytesPerPixel;
1312
1313 while (height--) {
1314 /* *INDENT-OFF* */
1315 DUFFS_LOOP4(
1316 {
1317 DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
1318 if(sA) {
1319 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
1320 ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
1321 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
1322 }
1323 src += srcbpp;
1324 dst += dstbpp;
1325 },
1326 width);
1327 /* *INDENT-ON* */
1328 src += srcskip;
1329 dst += dstskip;
1330 }
1331}
1332
1333
1334SDL_BlitFunc
1335SDL_CalculateBlitA(SDL_Surface * surface)
1336{
1337 SDL_PixelFormat *sf = surface->format;
1338 SDL_PixelFormat *df = surface->map->dst->format;
1339
1340 switch (surface->map->info.flags & ~SDL_COPY_RLE_MASK) {
1341 case SDL_COPY_BLEND:
1342 /* Per-pixel alpha blits */
1343 switch (df->BytesPerPixel) {
1344 case 1:
1345 if (df->palette != NULL) {
1346 return BlitNto1PixelAlpha;
1347 } else {
1348 /* RGB332 has no palette ! */
1349 return BlitNtoNPixelAlpha;
1350 }
1351
1352 case 2:
1353#if SDL_ARM_NEON_BLITTERS || SDL_ARM_SIMD_BLITTERS
1354 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
1355 && sf->Gmask == 0xff00 && df->Gmask == 0x7e0
1356 && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
1357 || (sf->Bmask == 0xff && df->Bmask == 0x1f)))
1358 {
1359#if SDL_ARM_NEON_BLITTERS
1360 if (SDL_HasNEON())
1361 return BlitARGBto565PixelAlphaARMNEON;
1362#endif
1363#if SDL_ARM_SIMD_BLITTERS
1364 if (SDL_HasARMSIMD())
1365 return BlitARGBto565PixelAlphaARMSIMD;
1366#endif
1367 }
1368#endif
1369 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
1370 && sf->Gmask == 0xff00
1371 && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
1372 || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
1373 if (df->Gmask == 0x7e0)
1374 return BlitARGBto565PixelAlpha;
1375 else if (df->Gmask == 0x3e0)
1376 return BlitARGBto555PixelAlpha;
1377 }
1378 return BlitNtoNPixelAlpha;
1379
1380 case 4:
1381 if (sf->Rmask == df->Rmask
1382 && sf->Gmask == df->Gmask
1383 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
1384#if defined(__MMX__) || defined(__3dNOW__)
1385 if (sf->Rshift % 8 == 0
1386 && sf->Gshift % 8 == 0
1387 && sf->Bshift % 8 == 0
1388 && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
1389#ifdef __3dNOW__
1390 if (SDL_Has3DNow())
1391 return BlitRGBtoRGBPixelAlphaMMX3DNOW;
1392#endif
1393#ifdef __MMX__
1394 if (SDL_HasMMX())
1395 return BlitRGBtoRGBPixelAlphaMMX;
1396#endif
1397 }
1398#endif /* __MMX__ || __3dNOW__ */
1399 if (sf->Amask == 0xff000000) {
1400#if SDL_ARM_NEON_BLITTERS
1401 if (SDL_HasNEON())
1402 return BlitRGBtoRGBPixelAlphaARMNEON;
1403#endif
1404#if SDL_ARM_SIMD_BLITTERS
1405 if (SDL_HasARMSIMD())
1406 return BlitRGBtoRGBPixelAlphaARMSIMD;
1407#endif
1408 return BlitRGBtoRGBPixelAlpha;
1409 }
1410 }
1411 return BlitNtoNPixelAlpha;
1412
1413 case 3:
1414 default:
1415 break;
1416 }
1417 return BlitNtoNPixelAlpha;
1418
1419 case SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
1420 if (sf->Amask == 0) {
1421 /* Per-surface alpha blits */
1422 switch (df->BytesPerPixel) {
1423 case 1:
1424 if (df->palette != NULL) {
1425 return BlitNto1SurfaceAlpha;
1426 } else {
1427 /* RGB332 has no palette ! */
1428 return BlitNtoNSurfaceAlpha;
1429 }
1430
1431 case 2:
1432 if (surface->map->identity) {
1433 if (df->Gmask == 0x7e0) {
1434#ifdef __MMX__
1435 if (SDL_HasMMX())
1436 return Blit565to565SurfaceAlphaMMX;
1437 else
1438#endif
1439 return Blit565to565SurfaceAlpha;
1440 } else if (df->Gmask == 0x3e0) {
1441#ifdef __MMX__
1442 if (SDL_HasMMX())
1443 return Blit555to555SurfaceAlphaMMX;
1444 else
1445#endif
1446 return Blit555to555SurfaceAlpha;
1447 }
1448 }
1449 return BlitNtoNSurfaceAlpha;
1450
1451 case 4:
1452 if (sf->Rmask == df->Rmask
1453 && sf->Gmask == df->Gmask
1454 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
1455#ifdef __MMX__
1456 if (sf->Rshift % 8 == 0
1457 && sf->Gshift % 8 == 0
1458 && sf->Bshift % 8 == 0 && SDL_HasMMX())
1459 return BlitRGBtoRGBSurfaceAlphaMMX;
1460#endif
1461 if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
1462 return BlitRGBtoRGBSurfaceAlpha;
1463 }
1464 }
1465 return BlitNtoNSurfaceAlpha;
1466
1467 case 3:
1468 default:
1469 return BlitNtoNSurfaceAlpha;
1470 }
1471 }
1472 break;
1473
1474 case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
1475 if (sf->Amask == 0) {
1476 if (df->BytesPerPixel == 1) {
1477
1478 if (df->palette != NULL) {
1479 return BlitNto1SurfaceAlphaKey;
1480 } else {
1481 /* RGB332 has no palette ! */
1482 return BlitNtoNSurfaceAlphaKey;
1483 }
1484 } else {
1485 return BlitNtoNSurfaceAlphaKey;
1486 }
1487 }
1488 break;
1489 }
1490
1491 return NULL;
1492}
1493
1494#endif /* SDL_HAVE_BLIT_A */
1495
1496/* vi: set ts=4 sw=4 expandtab: */
1497