1 | /* |
2 | Simple DirectMedia Layer |
3 | Copyright (C) 1997-2021 Sam Lantinga <slouken@libsdl.org> |
4 | |
5 | This software is provided 'as-is', without any express or implied |
6 | warranty. In no event will the authors be held liable for any damages |
7 | arising from the use of this software. |
8 | |
9 | Permission is granted to anyone to use this software for any purpose, |
10 | including commercial applications, and to alter it and redistribute it |
11 | freely, subject to the following restrictions: |
12 | |
13 | 1. The origin of this software must not be misrepresented; you must not |
14 | claim that you wrote the original software. If you use this software |
15 | in a product, an acknowledgment in the product documentation would be |
16 | appreciated but is not required. |
17 | 2. Altered source versions must be plainly marked as such, and must not be |
18 | misrepresented as being the original software. |
19 | 3. This notice may not be removed or altered from any source distribution. |
20 | */ |
21 | #include "../SDL_internal.h" |
22 | |
23 | #if SDL_HAVE_BLIT_A |
24 | |
25 | #include "SDL_video.h" |
26 | #include "SDL_blit.h" |
27 | |
28 | /* Functions to perform alpha blended blitting */ |
29 | |
30 | /* N->1 blending with per-surface alpha */ |
31 | static void |
32 | BlitNto1SurfaceAlpha(SDL_BlitInfo * info) |
33 | { |
34 | int width = info->dst_w; |
35 | int height = info->dst_h; |
36 | Uint8 *src = info->src; |
37 | int srcskip = info->src_skip; |
38 | Uint8 *dst = info->dst; |
39 | int dstskip = info->dst_skip; |
40 | Uint8 *palmap = info->table; |
41 | SDL_PixelFormat *srcfmt = info->src_fmt; |
42 | SDL_PixelFormat *dstfmt = info->dst_fmt; |
43 | int srcbpp = srcfmt->BytesPerPixel; |
44 | Uint32 Pixel; |
45 | unsigned sR, sG, sB; |
46 | unsigned dR, dG, dB; |
47 | const unsigned A = info->a; |
48 | |
49 | while (height--) { |
50 | /* *INDENT-OFF* */ |
51 | DUFFS_LOOP4( |
52 | { |
53 | DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB); |
54 | dR = dstfmt->palette->colors[*dst].r; |
55 | dG = dstfmt->palette->colors[*dst].g; |
56 | dB = dstfmt->palette->colors[*dst].b; |
57 | ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB); |
58 | dR &= 0xff; |
59 | dG &= 0xff; |
60 | dB &= 0xff; |
61 | /* Pack RGB into 8bit pixel */ |
62 | if ( palmap == NULL ) { |
63 | *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0)); |
64 | } else { |
65 | *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))]; |
66 | } |
67 | dst++; |
68 | src += srcbpp; |
69 | }, |
70 | width); |
71 | /* *INDENT-ON* */ |
72 | src += srcskip; |
73 | dst += dstskip; |
74 | } |
75 | } |
76 | |
77 | /* N->1 blending with pixel alpha */ |
78 | static void |
79 | BlitNto1PixelAlpha(SDL_BlitInfo * info) |
80 | { |
81 | int width = info->dst_w; |
82 | int height = info->dst_h; |
83 | Uint8 *src = info->src; |
84 | int srcskip = info->src_skip; |
85 | Uint8 *dst = info->dst; |
86 | int dstskip = info->dst_skip; |
87 | Uint8 *palmap = info->table; |
88 | SDL_PixelFormat *srcfmt = info->src_fmt; |
89 | SDL_PixelFormat *dstfmt = info->dst_fmt; |
90 | int srcbpp = srcfmt->BytesPerPixel; |
91 | Uint32 Pixel; |
92 | unsigned sR, sG, sB, sA; |
93 | unsigned dR, dG, dB; |
94 | |
95 | while (height--) { |
96 | /* *INDENT-OFF* */ |
97 | DUFFS_LOOP4( |
98 | { |
99 | DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA); |
100 | dR = dstfmt->palette->colors[*dst].r; |
101 | dG = dstfmt->palette->colors[*dst].g; |
102 | dB = dstfmt->palette->colors[*dst].b; |
103 | ALPHA_BLEND_RGB(sR, sG, sB, sA, dR, dG, dB); |
104 | dR &= 0xff; |
105 | dG &= 0xff; |
106 | dB &= 0xff; |
107 | /* Pack RGB into 8bit pixel */ |
108 | if ( palmap == NULL ) { |
109 | *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0)); |
110 | } else { |
111 | *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))]; |
112 | } |
113 | dst++; |
114 | src += srcbpp; |
115 | }, |
116 | width); |
117 | /* *INDENT-ON* */ |
118 | src += srcskip; |
119 | dst += dstskip; |
120 | } |
121 | } |
122 | |
123 | /* colorkeyed N->1 blending with per-surface alpha */ |
124 | static void |
125 | BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info) |
126 | { |
127 | int width = info->dst_w; |
128 | int height = info->dst_h; |
129 | Uint8 *src = info->src; |
130 | int srcskip = info->src_skip; |
131 | Uint8 *dst = info->dst; |
132 | int dstskip = info->dst_skip; |
133 | Uint8 *palmap = info->table; |
134 | SDL_PixelFormat *srcfmt = info->src_fmt; |
135 | SDL_PixelFormat *dstfmt = info->dst_fmt; |
136 | int srcbpp = srcfmt->BytesPerPixel; |
137 | Uint32 ckey = info->colorkey; |
138 | Uint32 Pixel; |
139 | unsigned sR, sG, sB; |
140 | unsigned dR, dG, dB; |
141 | const unsigned A = info->a; |
142 | |
143 | while (height--) { |
144 | /* *INDENT-OFF* */ |
145 | DUFFS_LOOP( |
146 | { |
147 | DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB); |
148 | if ( Pixel != ckey ) { |
149 | dR = dstfmt->palette->colors[*dst].r; |
150 | dG = dstfmt->palette->colors[*dst].g; |
151 | dB = dstfmt->palette->colors[*dst].b; |
152 | ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB); |
153 | dR &= 0xff; |
154 | dG &= 0xff; |
155 | dB &= 0xff; |
156 | /* Pack RGB into 8bit pixel */ |
157 | if ( palmap == NULL ) { |
158 | *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0)); |
159 | } else { |
160 | *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))]; |
161 | } |
162 | } |
163 | dst++; |
164 | src += srcbpp; |
165 | }, |
166 | width); |
167 | /* *INDENT-ON* */ |
168 | src += srcskip; |
169 | dst += dstskip; |
170 | } |
171 | } |
172 | |
173 | #ifdef __MMX__ |
174 | |
175 | /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ |
176 | static void |
177 | BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info) |
178 | { |
179 | int width = info->dst_w; |
180 | int height = info->dst_h; |
181 | Uint32 *srcp = (Uint32 *) info->src; |
182 | int srcskip = info->src_skip >> 2; |
183 | Uint32 *dstp = (Uint32 *) info->dst; |
184 | int dstskip = info->dst_skip >> 2; |
185 | Uint32 dalpha = info->dst_fmt->Amask; |
186 | |
187 | __m64 src1, src2, dst1, dst2, lmask, hmask, dsta; |
188 | |
189 | hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */ |
190 | lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */ |
191 | dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */ |
192 | |
193 | while (height--) { |
194 | int n = width; |
195 | if (n & 1) { |
196 | Uint32 s = *srcp++; |
197 | Uint32 d = *dstp; |
198 | *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) |
199 | + (s & d & 0x00010101)) | dalpha; |
200 | n--; |
201 | } |
202 | |
203 | for (n >>= 1; n > 0; --n) { |
204 | dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */ |
205 | dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */ |
206 | |
207 | src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */ |
208 | src2 = src1; /* 2 x src -> src2(ARGBARGB) */ |
209 | |
210 | dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */ |
211 | src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */ |
212 | src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */ |
213 | src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */ |
214 | |
215 | dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */ |
216 | dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */ |
217 | dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */ |
218 | dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */ |
219 | |
220 | *(__m64 *) dstp = dst1; /* dst1 -> 2 x dst pixels */ |
221 | dstp += 2; |
222 | srcp += 2; |
223 | } |
224 | |
225 | srcp += srcskip; |
226 | dstp += dstskip; |
227 | } |
228 | _mm_empty(); |
229 | } |
230 | |
231 | /* fast RGB888->(A)RGB888 blending with surface alpha */ |
232 | static void |
233 | BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info) |
234 | { |
235 | SDL_PixelFormat *df = info->dst_fmt; |
236 | Uint32 chanmask; |
237 | unsigned alpha = info->a; |
238 | |
239 | if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) { |
240 | /* only call a128 version when R,G,B occupy lower bits */ |
241 | BlitRGBtoRGBSurfaceAlpha128MMX(info); |
242 | } else { |
243 | int width = info->dst_w; |
244 | int height = info->dst_h; |
245 | Uint32 *srcp = (Uint32 *) info->src; |
246 | int srcskip = info->src_skip >> 2; |
247 | Uint32 *dstp = (Uint32 *) info->dst; |
248 | int dstskip = info->dst_skip >> 2; |
249 | Uint32 dalpha = df->Amask; |
250 | Uint32 amult; |
251 | |
252 | __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta; |
253 | |
254 | mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */ |
255 | /* form the alpha mult */ |
256 | amult = alpha | (alpha << 8); |
257 | amult = amult | (amult << 16); |
258 | chanmask = |
259 | (0xff << df->Rshift) | (0xff << df-> |
260 | Gshift) | (0xff << df->Bshift); |
261 | mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */ |
262 | mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */ |
263 | /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */ |
264 | dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */ |
265 | |
266 | while (height--) { |
267 | int n = width; |
268 | if (n & 1) { |
269 | /* One Pixel Blend */ |
270 | src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */ |
271 | src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */ |
272 | |
273 | dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */ |
274 | dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */ |
275 | |
276 | src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst2 -> src2 */ |
277 | src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ |
278 | src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */ |
279 | dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */ |
280 | |
281 | dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */ |
282 | dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */ |
283 | *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */ |
284 | |
285 | ++srcp; |
286 | ++dstp; |
287 | |
288 | n--; |
289 | } |
290 | |
291 | for (n >>= 1; n > 0; --n) { |
292 | /* Two Pixels Blend */ |
293 | src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */ |
294 | src2 = src1; /* 2 x src -> src2(ARGBARGB) */ |
295 | src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */ |
296 | src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */ |
297 | |
298 | dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */ |
299 | dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */ |
300 | dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */ |
301 | dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */ |
302 | |
303 | src1 = _mm_sub_pi16(src1, dst1); /* src1 - dst1 -> src1 */ |
304 | src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */ |
305 | src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */ |
306 | dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */ |
307 | |
308 | src2 = _mm_sub_pi16(src2, dst2); /* src2 - dst2 -> src2 */ |
309 | src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ |
310 | src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */ |
311 | dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */ |
312 | |
313 | dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */ |
314 | dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */ |
315 | |
316 | *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */ |
317 | |
318 | srcp += 2; |
319 | dstp += 2; |
320 | } |
321 | srcp += srcskip; |
322 | dstp += dstskip; |
323 | } |
324 | _mm_empty(); |
325 | } |
326 | } |
327 | |
/*
 * fast ARGB888->(A)RGB888 blending with pixel alpha (MMX)
 *
 * For every pixel the source alpha (selected with sf->Amask / sf->Ashift)
 * decides the action: 0 leaves the destination untouched, fully set copies
 * the source pixel, anything else computes
 *     dst = (src * A >> 8) + (dst * (255 - A) >> 8)
 * on the three colour lanes.  In the alpha lane the source multiplier is
 * forced to 0xFF via multmask.
 */
static void
BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
{
    int width = info->dst_w;
    int height = info->dst_h;
    Uint32 *srcp = (Uint32 *) info->src;
    int srcskip = info->src_skip >> 2;
    Uint32 *dstp = (Uint32 *) info->dst;
    int dstskip = info->dst_skip >> 2;
    SDL_PixelFormat *sf = info->src_fmt;
    Uint32 amask = sf->Amask;
    Uint32 ashift = sf->Ashift;
    Uint64 multmask, multmask2;

    __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;

    mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
    /* 0xFF placed in the 16-bit lane that holds alpha once bytes are
       unpacked to words (byte offset ashift/8 -> bit offset ashift*2) */
    multmask = 0x00FF;
    multmask <<= (ashift * 2);
    /* XOR with 0x00FF in every lane turns A into 255 - A */
    multmask2 = 0x00FF00FF00FF00FFULL;

    while (height--) {
        /* *INDENT-OFF* */
        DUFFS_LOOP4({
        Uint32 alpha = *srcp & amask;
        if (alpha == 0) {
            /* fully transparent: do nothing */
        } else if (alpha == amask) {
            /* fully opaque: plain copy */
            *dstp = *srcp;
        } else {
            src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
            src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */

            dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
            dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */

            mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha (still at its bit position) -> mm_alpha */
            mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
            mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
            mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
            mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask); /* 0F0A0A0A -> mm_alpha (F in the alpha lane) */
            mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2); /* 255 - alpha per lane -> mm_alpha2 */

            /* blend: src * A + dst * (255 - A), each product scaled by >> 8 */
            src1 = _mm_mullo_pi16(src1, mm_alpha);
            src1 = _mm_srli_pi16(src1, 8);
            dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
            dst1 = _mm_srli_pi16(dst1, 8);
            dst1 = _mm_add_pi16(src1, dst1);
            dst1 = _mm_packs_pu16(dst1, mm_zero); /* words -> bytes with unsigned saturation */

            *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
        }
        ++srcp;
        ++dstp;
        }, width);
        /* *INDENT-ON* */
        srcp += srcskip;
        dstp += dstskip;
    }
    _mm_empty();
}
391 | |
392 | #endif /* __MMX__ */ |
393 | |
394 | #if SDL_ARM_SIMD_BLITTERS |
395 | void BlitARGBto565PixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride); |
396 | |
397 | static void |
398 | BlitARGBto565PixelAlphaARMSIMD(SDL_BlitInfo * info) |
399 | { |
400 | int32_t width = info->dst_w; |
401 | int32_t height = info->dst_h; |
402 | uint16_t *dstp = (uint16_t *)info->dst; |
403 | int32_t dststride = width + (info->dst_skip >> 1); |
404 | uint32_t *srcp = (uint32_t *)info->src; |
405 | int32_t srcstride = width + (info->src_skip >> 2); |
406 | |
407 | BlitARGBto565PixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride); |
408 | } |
409 | |
410 | void BlitRGBtoRGBPixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride); |
411 | |
412 | static void |
413 | BlitRGBtoRGBPixelAlphaARMSIMD(SDL_BlitInfo * info) |
414 | { |
415 | int32_t width = info->dst_w; |
416 | int32_t height = info->dst_h; |
417 | uint32_t *dstp = (uint32_t *)info->dst; |
418 | int32_t dststride = width + (info->dst_skip >> 2); |
419 | uint32_t *srcp = (uint32_t *)info->src; |
420 | int32_t srcstride = width + (info->src_skip >> 2); |
421 | |
422 | BlitRGBtoRGBPixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride); |
423 | } |
424 | #endif |
425 | |
426 | #if SDL_ARM_NEON_BLITTERS |
427 | void BlitARGBto565PixelAlphaARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride); |
428 | |
429 | static void |
430 | BlitARGBto565PixelAlphaARMNEON(SDL_BlitInfo * info) |
431 | { |
432 | int32_t width = info->dst_w; |
433 | int32_t height = info->dst_h; |
434 | uint16_t *dstp = (uint16_t *)info->dst; |
435 | int32_t dststride = width + (info->dst_skip >> 1); |
436 | uint32_t *srcp = (uint32_t *)info->src; |
437 | int32_t srcstride = width + (info->src_skip >> 2); |
438 | |
439 | BlitARGBto565PixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride); |
440 | } |
441 | |
442 | void BlitRGBtoRGBPixelAlphaARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride); |
443 | |
444 | static void |
445 | BlitRGBtoRGBPixelAlphaARMNEON(SDL_BlitInfo * info) |
446 | { |
447 | int32_t width = info->dst_w; |
448 | int32_t height = info->dst_h; |
449 | uint32_t *dstp = (uint32_t *)info->dst; |
450 | int32_t dststride = width + (info->dst_skip >> 2); |
451 | uint32_t *srcp = (uint32_t *)info->src; |
452 | int32_t srcstride = width + (info->src_skip >> 2); |
453 | |
454 | BlitRGBtoRGBPixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride); |
455 | } |
456 | #endif |
457 | |
458 | /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ |
459 | static void |
460 | BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info) |
461 | { |
462 | int width = info->dst_w; |
463 | int height = info->dst_h; |
464 | Uint32 *srcp = (Uint32 *) info->src; |
465 | int srcskip = info->src_skip >> 2; |
466 | Uint32 *dstp = (Uint32 *) info->dst; |
467 | int dstskip = info->dst_skip >> 2; |
468 | |
469 | while (height--) { |
470 | /* *INDENT-OFF* */ |
471 | DUFFS_LOOP4({ |
472 | Uint32 s = *srcp++; |
473 | Uint32 d = *dstp; |
474 | *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) |
475 | + (s & d & 0x00010101)) | 0xff000000; |
476 | }, width); |
477 | /* *INDENT-ON* */ |
478 | srcp += srcskip; |
479 | dstp += dstskip; |
480 | } |
481 | } |
482 | |
483 | /* fast RGB888->(A)RGB888 blending with surface alpha */ |
484 | static void |
485 | BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info) |
486 | { |
487 | unsigned alpha = info->a; |
488 | if (alpha == 128) { |
489 | BlitRGBtoRGBSurfaceAlpha128(info); |
490 | } else { |
491 | int width = info->dst_w; |
492 | int height = info->dst_h; |
493 | Uint32 *srcp = (Uint32 *) info->src; |
494 | int srcskip = info->src_skip >> 2; |
495 | Uint32 *dstp = (Uint32 *) info->dst; |
496 | int dstskip = info->dst_skip >> 2; |
497 | Uint32 s; |
498 | Uint32 d; |
499 | Uint32 s1; |
500 | Uint32 d1; |
501 | |
502 | while (height--) { |
503 | /* *INDENT-OFF* */ |
504 | DUFFS_LOOP4({ |
505 | s = *srcp; |
506 | d = *dstp; |
507 | s1 = s & 0xff00ff; |
508 | d1 = d & 0xff00ff; |
509 | d1 = (d1 + ((s1 - d1) * alpha >> 8)) |
510 | & 0xff00ff; |
511 | s &= 0xff00; |
512 | d &= 0xff00; |
513 | d = (d + ((s - d) * alpha >> 8)) & 0xff00; |
514 | *dstp = d1 | d | 0xff000000; |
515 | ++srcp; |
516 | ++dstp; |
517 | }, width); |
518 | /* *INDENT-ON* */ |
519 | srcp += srcskip; |
520 | dstp += dstskip; |
521 | } |
522 | } |
523 | } |
524 | |
525 | /* fast ARGB888->(A)RGB888 blending with pixel alpha */ |
526 | static void |
527 | BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info) |
528 | { |
529 | int width = info->dst_w; |
530 | int height = info->dst_h; |
531 | Uint32 *srcp = (Uint32 *) info->src; |
532 | int srcskip = info->src_skip >> 2; |
533 | Uint32 *dstp = (Uint32 *) info->dst; |
534 | int dstskip = info->dst_skip >> 2; |
535 | |
536 | while (height--) { |
537 | /* *INDENT-OFF* */ |
538 | DUFFS_LOOP4({ |
539 | Uint32 dalpha; |
540 | Uint32 d; |
541 | Uint32 s1; |
542 | Uint32 d1; |
543 | Uint32 s = *srcp; |
544 | Uint32 alpha = s >> 24; |
545 | /* FIXME: Here we special-case opaque alpha since the |
546 | compositioning used (>>8 instead of /255) doesn't handle |
547 | it correctly. Also special-case alpha=0 for speed? |
548 | Benchmark this! */ |
549 | if (alpha) { |
550 | if (alpha == SDL_ALPHA_OPAQUE) { |
551 | *dstp = *srcp; |
552 | } else { |
553 | /* |
554 | * take out the middle component (green), and process |
555 | * the other two in parallel. One multiply less. |
556 | */ |
557 | d = *dstp; |
558 | dalpha = d >> 24; |
559 | s1 = s & 0xff00ff; |
560 | d1 = d & 0xff00ff; |
561 | d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; |
562 | s &= 0xff00; |
563 | d &= 0xff00; |
564 | d = (d + ((s - d) * alpha >> 8)) & 0xff00; |
565 | dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8); |
566 | *dstp = d1 | d | (dalpha << 24); |
567 | } |
568 | } |
569 | ++srcp; |
570 | ++dstp; |
571 | }, width); |
572 | /* *INDENT-ON* */ |
573 | srcp += srcskip; |
574 | dstp += dstskip; |
575 | } |
576 | } |
577 | |
578 | #ifdef __3dNOW__ |
/*
 * fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha
 *
 * Same per-pixel logic as the plain MMX pixel-alpha blitter, with a
 * _m_prefetch() issued 16 pixels ahead on both source and destination
 * before each pixel is processed.  Source alpha 0 leaves the destination
 * untouched, fully set alpha copies the source pixel, anything else
 * computes dst = (src * A >> 8) + (dst * (255 - A) >> 8) on the colour
 * lanes (the alpha lane uses 0xFF as source multiplier via multmask).
 */
static void
BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
{
    int width = info->dst_w;
    int height = info->dst_h;
    Uint32 *srcp = (Uint32 *) info->src;
    int srcskip = info->src_skip >> 2;
    Uint32 *dstp = (Uint32 *) info->dst;
    int dstskip = info->dst_skip >> 2;
    SDL_PixelFormat *sf = info->src_fmt;
    Uint32 amask = sf->Amask;
    Uint32 ashift = sf->Ashift;
    Uint64 multmask, multmask2;

    __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;

    mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
    /* 0xFF placed in the 16-bit lane that holds alpha once bytes are
       unpacked to words (byte offset ashift/8 -> bit offset ashift*2) */
    multmask = 0x00FF;
    multmask <<= (ashift * 2);
    /* XOR with 0x00FF in every lane turns A into 255 - A */
    multmask2 = 0x00FF00FF00FF00FFULL;

    while (height--) {
        /* *INDENT-OFF* */
        DUFFS_LOOP4({
        Uint32 alpha;

        /* hint the cache about pixels 16 positions ahead */
        _m_prefetch(srcp + 16);
        _m_prefetch(dstp + 16);

        alpha = *srcp & amask;
        if (alpha == 0) {
            /* fully transparent: do nothing */
        } else if (alpha == amask) {
            /* fully opaque: plain copy */
            *dstp = *srcp;
        } else {
            src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
            src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */

            dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
            dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */

            mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha (still at its bit position) -> mm_alpha */
            mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
            mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
            mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
            mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask); /* 0F0A0A0A -> mm_alpha (F in the alpha lane) */
            mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2); /* 255 - alpha per lane -> mm_alpha2 */


            /* blend: src * A + dst * (255 - A), each product scaled by >> 8 */
            src1 = _mm_mullo_pi16(src1, mm_alpha);
            src1 = _mm_srli_pi16(src1, 8);
            dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
            dst1 = _mm_srli_pi16(dst1, 8);
            dst1 = _mm_add_pi16(src1, dst1);
            dst1 = _mm_packs_pu16(dst1, mm_zero); /* words -> bytes with unsigned saturation */

            *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
        }
        ++srcp;
        ++dstp;
        }, width);
        /* *INDENT-ON* */
        srcp += srcskip;
        dstp += dstskip;
    }
    _mm_empty();
}
648 | |
649 | #endif /* __3dNOW__ */ |
650 | |
651 | /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */ |
652 | |
/*
 * blend a single 16 bit pixel at 50%
 *
 * d, s: destination and source pixel; mask: the pixel bits that remain
 * after clearing the lowest bit of every colour channel (e.g. 0xf7de for
 * 565).  The masked values are summed and halved; the cleared low bits are
 * added back where they are set in both pixels.
 * Every argument is parenthesized so expressions can be passed safely.
 */
#define BLEND16_50(d, s, mask)                                          \
    (((((s) & (mask)) + ((d) & (mask))) >> 1) +                         \
     ((s) & (d) & (~(mask) & 0xffff)))

/*
 * blend two 16 bit pixels at 50%
 *
 * d, s: 32-bit words holding two pixels each; mask as for BLEND16_50.
 * The mask is widened to Uint32 before being replicated into the upper
 * half: a 16-bit mask promotes to signed int, and shifting a value such as
 * 0xf7de left by 16 would overflow it (undefined behaviour).
 */
#define BLEND2x16_50(d, s, mask)                                        \
    ((((s) & ((Uint32) (mask) | (Uint32) (mask) << 16)) >> 1) +         \
     (((d) & ((Uint32) (mask) | (Uint32) (mask) << 16)) >> 1) +         \
     ((s) & (d) & (~((Uint32) (mask) | (Uint32) (mask) << 16))))
661 | |
/*
 * 50% blend of a 16-bit source onto a 16-bit destination, processing two
 * pixels per 32-bit word where possible.
 *
 * info: blit description (pointers, width/height, row skips in bytes as
 *       implied by the >> 1 conversion to Uint16 units).
 * mask: passed through to BLEND16_50 / BLEND2x16_50.
 *
 * Per row, bit 1 of the source and destination addresses is compared: if
 * they differ, the two pointers can never be 32-bit aligned at the same
 * time and the "pipelined" path is used; otherwise the simple path runs.
 */
static void
Blit16to16SurfaceAlpha128(SDL_BlitInfo * info, Uint16 mask)
{
    int width = info->dst_w;
    int height = info->dst_h;
    Uint16 *srcp = (Uint16 *) info->src;
    int srcskip = info->src_skip >> 1;
    Uint16 *dstp = (Uint16 *) info->dst;
    int dstskip = info->dst_skip >> 1;

    while (height--) {
        if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) {
            /*
             * Source and destination not aligned, pipeline it.
             * This is mostly a win for big blits but no loss for
             * small ones
             */
            Uint32 prev_sw;
            int w = width;

            /* handle odd destination */
            if ((uintptr_t) dstp & 2) {
                Uint16 d = *dstp, s = *srcp;
                *dstp = BLEND16_50(d, s, mask);
                dstp++;
                srcp++;
                w--;
            }
            /* srcp now runs one pixel ahead of dstp; this is undone by the
               "- 1" at the end of the row */
            srcp++;             /* srcp is now 32-bit aligned */

            /* bootstrap pipeline with first halfword */
            /* NOTE(review): this 32-bit load also covers the halfword in
               front of the next source pixel; when no odd pixel was handled
               above that halfword lies before the row start.  Its value is
               never used, but the read happens - confirm the buffers
               tolerate it. */
            prev_sw = ((Uint32 *) srcp)[-1];

            while (w > 1) {
                Uint32 sw, dw, s;
                /* NOTE(review): on the last iteration the far half of sw
                   can lie past the last source pixel of the row; it is only
                   consumed by the "final pixel" step when w is odd. */
                sw = *(Uint32 *) srcp;
                dw = *(Uint32 *) dstp;
                /* stitch the pending halfword of the previous word to the
                   first halfword of the new word */
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
                s = (prev_sw << 16) + (sw >> 16);
#else
                s = (prev_sw >> 16) + (sw << 16);
#endif
                prev_sw = sw;
                *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
                dstp += 2;
                srcp += 2;
                w -= 2;
            }

            /* final pixel if any */
            if (w) {
                Uint16 d = *dstp, s;
                /* the pending source pixel is the unused half of prev_sw */
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
                s = (Uint16) prev_sw;
#else
                s = (Uint16) (prev_sw >> 16);
#endif
                *dstp = BLEND16_50(d, s, mask);
                srcp++;
                dstp++;
            }
            /* compensate for the look-ahead increment of srcp */
            srcp += srcskip - 1;
            dstp += dstskip;
        } else {
            /* source and destination are aligned */
            int w = width;

            /* first odd pixel? */
            if ((uintptr_t) srcp & 2) {
                Uint16 d = *dstp, s = *srcp;
                *dstp = BLEND16_50(d, s, mask);
                srcp++;
                dstp++;
                w--;
            }
            /* srcp and dstp are now 32-bit aligned */

            while (w > 1) {
                Uint32 sw = *(Uint32 *) srcp;
                Uint32 dw = *(Uint32 *) dstp;
                *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
                srcp += 2;
                dstp += 2;
                w -= 2;
            }

            /* last odd pixel? */
            if (w) {
                Uint16 d = *dstp, s = *srcp;
                *dstp = BLEND16_50(d, s, mask);
                srcp++;
                dstp++;
            }
            srcp += srcskip;
            dstp += dstskip;
        }
    }
}
760 | |
761 | #ifdef __MMX__ |
762 | |
/*
 * fast RGB565->RGB565 blending with surface alpha (MMX)
 *
 * alpha == 128 is delegated to Blit16to16SurfaceAlpha128() with the 565
 * mask 0xf7de.  Otherwise alpha is reduced to its top 5 bits and each
 * channel is blended as dst += ((src - dst) * alpha5) >> 5.  The three
 * bodies given to DUFFS_LOOP_124 blend one, two and four pixels; only the
 * four-pixel body uses MMX.
 */
static void
Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
{
    unsigned alpha = info->a;
    if (alpha == 128) {
        Blit16to16SurfaceAlpha128(info, 0xf7de);
    } else {
        int width = info->dst_w;
        int height = info->dst_h;
        Uint16 *srcp = (Uint16 *) info->src;
        int srcskip = info->src_skip >> 1;
        Uint16 *dstp = (Uint16 *) info->dst;
        int dstskip = info->dst_skip >> 1;
        Uint32 s, d;

        __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;

        alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
        mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
        alpha >>= 3;            /* downscale alpha to 5 bits */

        mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
        mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
        /* position alpha to allow for mullo and mulhi on diff channels
           to reduce the number of operations */
        /* each 16-bit lane now holds alpha5 << 6; alpha is at most 0xF8, so
           the 64-bit shift cannot spill into the neighbouring lane */
        mm_alpha = _mm_slli_si64(mm_alpha, 3);

        /* Setup the 565 color channel masks */
        gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);   /* MASKGREEN -> gmask */
        bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */

        while (height--) {
            /* *INDENT-OFF* */
            DUFFS_LOOP_124(
            {
                /* one pixel, scalar */
                s = *srcp++;
                d = *dstp;
                /*
                 * shift out the middle component (green) to
                 * the high 16 bits, and process all three RGB
                 * components at the same time.
                 */
                s = (s | s << 16) & 0x07e0f81f;
                d = (d | d << 16) & 0x07e0f81f;
                d += (s - d) * alpha >> 5;
                d &= 0x07e0f81f;
                *dstp++ = (Uint16)(d | d >> 16);
            },{
                /* two pixels, scalar (the one-pixel body twice) */
                s = *srcp++;
                d = *dstp;
                /*
                 * shift out the middle component (green) to
                 * the high 16 bits, and process all three RGB
                 * components at the same time.
                 */
                s = (s | s << 16) & 0x07e0f81f;
                d = (d | d << 16) & 0x07e0f81f;
                d += (s - d) * alpha >> 5;
                d &= 0x07e0f81f;
                *dstp++ = (Uint16)(d | d >> 16);
                s = *srcp++;
                d = *dstp;
                /*
                 * shift out the middle component (green) to
                 * the high 16 bits, and process all three RGB
                 * components at the same time.
                 */
                s = (s | s << 16) & 0x07e0f81f;
                d = (d | d << 16) & 0x07e0f81f;
                d += (s - d) * alpha >> 5;
                d &= 0x07e0f81f;
                *dstp++ = (Uint16)(d | d >> 16);
            },{
                /* four pixels, MMX: one 16-bit lane per pixel */
                src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
                dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */

                /* red */
                src2 = src1;
                src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */

                dst2 = dst1;
                dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */

                /* blend */
                src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
                src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
                src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
                dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
                dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 (also drops any overflow bits) */

                mm_res = dst2; /* RED -> mm_res */

                /* green -- process the bits in place */
                src2 = src1;
                src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */

                dst2 = dst1;
                dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */

                /* blend */
                src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
                src2 = _mm_mulhi_pi16(src2, mm_alpha); /* high word of src2 * alpha -> src2 */
                src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 (back to the green position) */
                dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */

                mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */

                /* blue */
                src2 = src1;
                src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */

                dst2 = dst1;
                dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */

                /* blend */
                src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
                src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
                src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
                dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
                dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */

                mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */

                *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */

                srcp += 4;
                dstp += 4;
            }, width);
            /* *INDENT-ON* */
            srcp += srcskip;
            dstp += dstskip;
        }
        _mm_empty();
    }
}
899 | |
900 | /* fast RGB555->RGB555 blending with surface alpha */ |
901 | static void |
902 | Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info) |
903 | { |
904 | unsigned alpha = info->a; |
905 | if (alpha == 128) { |
906 | Blit16to16SurfaceAlpha128(info, 0xfbde); |
907 | } else { |
908 | int width = info->dst_w; |
909 | int height = info->dst_h; |
910 | Uint16 *srcp = (Uint16 *) info->src; |
911 | int srcskip = info->src_skip >> 1; |
912 | Uint16 *dstp = (Uint16 *) info->dst; |
913 | int dstskip = info->dst_skip >> 1; |
914 | Uint32 s, d; |
915 | |
916 | __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha; |
917 | |
918 | alpha &= ~(1 + 2 + 4); /* cut alpha to get the exact same behaviour */ |
919 | mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */ |
920 | alpha >>= 3; /* downscale alpha to 5 bits */ |
921 | |
922 | mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */ |
923 | mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */ |
924 | /* position alpha to allow for mullo and mulhi on diff channels |
925 | to reduce the number of operations */ |
926 | mm_alpha = _mm_slli_si64(mm_alpha, 3); |
927 | |
928 | /* Setup the 555 color channel masks */ |
929 | rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */ |
930 | gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */ |
931 | bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */ |
932 | |
933 | while (height--) { |
934 | /* *INDENT-OFF* */ |
935 | DUFFS_LOOP_124( |
936 | { |
937 | s = *srcp++; |
938 | d = *dstp; |
939 | /* |
940 | * shift out the middle component (green) to |
941 | * the high 16 bits, and process all three RGB |
942 | * components at the same time. |
943 | */ |
944 | s = (s | s << 16) & 0x03e07c1f; |
945 | d = (d | d << 16) & 0x03e07c1f; |
946 | d += (s - d) * alpha >> 5; |
947 | d &= 0x03e07c1f; |
948 | *dstp++ = (Uint16)(d | d >> 16); |
949 | },{ |
950 | s = *srcp++; |
951 | d = *dstp; |
952 | /* |
953 | * shift out the middle component (green) to |
954 | * the high 16 bits, and process all three RGB |
955 | * components at the same time. |
956 | */ |
957 | s = (s | s << 16) & 0x03e07c1f; |
958 | d = (d | d << 16) & 0x03e07c1f; |
959 | d += (s - d) * alpha >> 5; |
960 | d &= 0x03e07c1f; |
961 | *dstp++ = (Uint16)(d | d >> 16); |
962 | s = *srcp++; |
963 | d = *dstp; |
964 | /* |
965 | * shift out the middle component (green) to |
966 | * the high 16 bits, and process all three RGB |
967 | * components at the same time. |
968 | */ |
969 | s = (s | s << 16) & 0x03e07c1f; |
970 | d = (d | d << 16) & 0x03e07c1f; |
971 | d += (s - d) * alpha >> 5; |
972 | d &= 0x03e07c1f; |
973 | *dstp++ = (Uint16)(d | d >> 16); |
974 | },{ |
975 | src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */ |
976 | dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */ |
977 | |
978 | /* red -- process the bits in place */ |
979 | src2 = src1; |
980 | src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */ |
981 | |
982 | dst2 = dst1; |
983 | dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */ |
984 | |
985 | /* blend */ |
986 | src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ |
987 | src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ |
988 | src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */ |
989 | dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ |
990 | dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */ |
991 | |
992 | mm_res = dst2; /* RED -> mm_res */ |
993 | |
994 | /* green -- process the bits in place */ |
995 | src2 = src1; |
996 | src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */ |
997 | |
998 | dst2 = dst1; |
999 | dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */ |
1000 | |
1001 | /* blend */ |
1002 | src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ |
1003 | src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ |
1004 | src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */ |
1005 | dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ |
1006 | |
1007 | mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */ |
1008 | |
1009 | /* blue */ |
1010 | src2 = src1; /* src -> src2 */ |
1011 | src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */ |
1012 | |
1013 | dst2 = dst1; /* dst -> dst2 */ |
1014 | dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */ |
1015 | |
1016 | /* blend */ |
1017 | src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ |
1018 | src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ |
1019 | src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */ |
1020 | dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ |
1021 | dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */ |
1022 | |
1023 | mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */ |
1024 | |
1025 | *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */ |
1026 | |
1027 | srcp += 4; |
1028 | dstp += 4; |
1029 | }, width); |
1030 | /* *INDENT-ON* */ |
1031 | srcp += srcskip; |
1032 | dstp += dstskip; |
1033 | } |
1034 | _mm_empty(); |
1035 | } |
1036 | } |
1037 | |
1038 | #endif /* __MMX__ */ |
1039 | |
1040 | /* fast RGB565->RGB565 blending with surface alpha */ |
1041 | static void |
1042 | Blit565to565SurfaceAlpha(SDL_BlitInfo * info) |
1043 | { |
1044 | unsigned alpha = info->a; |
1045 | if (alpha == 128) { |
1046 | Blit16to16SurfaceAlpha128(info, 0xf7de); |
1047 | } else { |
1048 | int width = info->dst_w; |
1049 | int height = info->dst_h; |
1050 | Uint16 *srcp = (Uint16 *) info->src; |
1051 | int srcskip = info->src_skip >> 1; |
1052 | Uint16 *dstp = (Uint16 *) info->dst; |
1053 | int dstskip = info->dst_skip >> 1; |
1054 | alpha >>= 3; /* downscale alpha to 5 bits */ |
1055 | |
1056 | while (height--) { |
1057 | /* *INDENT-OFF* */ |
1058 | DUFFS_LOOP4({ |
1059 | Uint32 s = *srcp++; |
1060 | Uint32 d = *dstp; |
1061 | /* |
1062 | * shift out the middle component (green) to |
1063 | * the high 16 bits, and process all three RGB |
1064 | * components at the same time. |
1065 | */ |
1066 | s = (s | s << 16) & 0x07e0f81f; |
1067 | d = (d | d << 16) & 0x07e0f81f; |
1068 | d += (s - d) * alpha >> 5; |
1069 | d &= 0x07e0f81f; |
1070 | *dstp++ = (Uint16)(d | d >> 16); |
1071 | }, width); |
1072 | /* *INDENT-ON* */ |
1073 | srcp += srcskip; |
1074 | dstp += dstskip; |
1075 | } |
1076 | } |
1077 | } |
1078 | |
1079 | /* fast RGB555->RGB555 blending with surface alpha */ |
1080 | static void |
1081 | Blit555to555SurfaceAlpha(SDL_BlitInfo * info) |
1082 | { |
1083 | unsigned alpha = info->a; /* downscale alpha to 5 bits */ |
1084 | if (alpha == 128) { |
1085 | Blit16to16SurfaceAlpha128(info, 0xfbde); |
1086 | } else { |
1087 | int width = info->dst_w; |
1088 | int height = info->dst_h; |
1089 | Uint16 *srcp = (Uint16 *) info->src; |
1090 | int srcskip = info->src_skip >> 1; |
1091 | Uint16 *dstp = (Uint16 *) info->dst; |
1092 | int dstskip = info->dst_skip >> 1; |
1093 | alpha >>= 3; /* downscale alpha to 5 bits */ |
1094 | |
1095 | while (height--) { |
1096 | /* *INDENT-OFF* */ |
1097 | DUFFS_LOOP4({ |
1098 | Uint32 s = *srcp++; |
1099 | Uint32 d = *dstp; |
1100 | /* |
1101 | * shift out the middle component (green) to |
1102 | * the high 16 bits, and process all three RGB |
1103 | * components at the same time. |
1104 | */ |
1105 | s = (s | s << 16) & 0x03e07c1f; |
1106 | d = (d | d << 16) & 0x03e07c1f; |
1107 | d += (s - d) * alpha >> 5; |
1108 | d &= 0x03e07c1f; |
1109 | *dstp++ = (Uint16)(d | d >> 16); |
1110 | }, width); |
1111 | /* *INDENT-ON* */ |
1112 | srcp += srcskip; |
1113 | dstp += dstskip; |
1114 | } |
1115 | } |
1116 | } |
1117 | |
1118 | /* fast ARGB8888->RGB565 blending with pixel alpha */ |
1119 | static void |
1120 | BlitARGBto565PixelAlpha(SDL_BlitInfo * info) |
1121 | { |
1122 | int width = info->dst_w; |
1123 | int height = info->dst_h; |
1124 | Uint32 *srcp = (Uint32 *) info->src; |
1125 | int srcskip = info->src_skip >> 2; |
1126 | Uint16 *dstp = (Uint16 *) info->dst; |
1127 | int dstskip = info->dst_skip >> 1; |
1128 | |
1129 | while (height--) { |
1130 | /* *INDENT-OFF* */ |
1131 | DUFFS_LOOP4({ |
1132 | Uint32 s = *srcp; |
1133 | unsigned alpha = s >> 27; /* downscale alpha to 5 bits */ |
1134 | /* FIXME: Here we special-case opaque alpha since the |
1135 | compositioning used (>>8 instead of /255) doesn't handle |
1136 | it correctly. Also special-case alpha=0 for speed? |
1137 | Benchmark this! */ |
1138 | if(alpha) { |
1139 | if(alpha == (SDL_ALPHA_OPAQUE >> 3)) { |
1140 | *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3 & 0x1f)); |
1141 | } else { |
1142 | Uint32 d = *dstp; |
1143 | /* |
1144 | * convert source and destination to G0RAB65565 |
1145 | * and blend all components at the same time |
1146 | */ |
1147 | s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800) |
1148 | + (s >> 3 & 0x1f); |
1149 | d = (d | d << 16) & 0x07e0f81f; |
1150 | d += (s - d) * alpha >> 5; |
1151 | d &= 0x07e0f81f; |
1152 | *dstp = (Uint16)(d | d >> 16); |
1153 | } |
1154 | } |
1155 | srcp++; |
1156 | dstp++; |
1157 | }, width); |
1158 | /* *INDENT-ON* */ |
1159 | srcp += srcskip; |
1160 | dstp += dstskip; |
1161 | } |
1162 | } |
1163 | |
1164 | /* fast ARGB8888->RGB555 blending with pixel alpha */ |
1165 | static void |
1166 | BlitARGBto555PixelAlpha(SDL_BlitInfo * info) |
1167 | { |
1168 | int width = info->dst_w; |
1169 | int height = info->dst_h; |
1170 | Uint32 *srcp = (Uint32 *) info->src; |
1171 | int srcskip = info->src_skip >> 2; |
1172 | Uint16 *dstp = (Uint16 *) info->dst; |
1173 | int dstskip = info->dst_skip >> 1; |
1174 | |
1175 | while (height--) { |
1176 | /* *INDENT-OFF* */ |
1177 | DUFFS_LOOP4({ |
1178 | unsigned alpha; |
1179 | Uint32 s = *srcp; |
1180 | alpha = s >> 27; /* downscale alpha to 5 bits */ |
1181 | /* FIXME: Here we special-case opaque alpha since the |
1182 | compositioning used (>>8 instead of /255) doesn't handle |
1183 | it correctly. Also special-case alpha=0 for speed? |
1184 | Benchmark this! */ |
1185 | if(alpha) { |
1186 | if(alpha == (SDL_ALPHA_OPAQUE >> 3)) { |
1187 | *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3 & 0x1f)); |
1188 | } else { |
1189 | Uint32 d = *dstp; |
1190 | /* |
1191 | * convert source and destination to G0RAB65565 |
1192 | * and blend all components at the same time |
1193 | */ |
1194 | s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00) |
1195 | + (s >> 3 & 0x1f); |
1196 | d = (d | d << 16) & 0x03e07c1f; |
1197 | d += (s - d) * alpha >> 5; |
1198 | d &= 0x03e07c1f; |
1199 | *dstp = (Uint16)(d | d >> 16); |
1200 | } |
1201 | } |
1202 | srcp++; |
1203 | dstp++; |
1204 | }, width); |
1205 | /* *INDENT-ON* */ |
1206 | srcp += srcskip; |
1207 | dstp += dstskip; |
1208 | } |
1209 | } |
1210 | |
1211 | /* General (slow) N->N blending with per-surface alpha */ |
1212 | static void |
1213 | BlitNtoNSurfaceAlpha(SDL_BlitInfo * info) |
1214 | { |
1215 | int width = info->dst_w; |
1216 | int height = info->dst_h; |
1217 | Uint8 *src = info->src; |
1218 | int srcskip = info->src_skip; |
1219 | Uint8 *dst = info->dst; |
1220 | int dstskip = info->dst_skip; |
1221 | SDL_PixelFormat *srcfmt = info->src_fmt; |
1222 | SDL_PixelFormat *dstfmt = info->dst_fmt; |
1223 | int srcbpp = srcfmt->BytesPerPixel; |
1224 | int dstbpp = dstfmt->BytesPerPixel; |
1225 | Uint32 Pixel; |
1226 | unsigned sR, sG, sB; |
1227 | unsigned dR, dG, dB, dA; |
1228 | const unsigned sA = info->a; |
1229 | |
1230 | if (sA) { |
1231 | while (height--) { |
1232 | /* *INDENT-OFF* */ |
1233 | DUFFS_LOOP4( |
1234 | { |
1235 | DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB); |
1236 | DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA); |
1237 | ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA); |
1238 | ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA); |
1239 | src += srcbpp; |
1240 | dst += dstbpp; |
1241 | }, |
1242 | width); |
1243 | /* *INDENT-ON* */ |
1244 | src += srcskip; |
1245 | dst += dstskip; |
1246 | } |
1247 | } |
1248 | } |
1249 | |
1250 | /* General (slow) colorkeyed N->N blending with per-surface alpha */ |
1251 | static void |
1252 | BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info) |
1253 | { |
1254 | int width = info->dst_w; |
1255 | int height = info->dst_h; |
1256 | Uint8 *src = info->src; |
1257 | int srcskip = info->src_skip; |
1258 | Uint8 *dst = info->dst; |
1259 | int dstskip = info->dst_skip; |
1260 | SDL_PixelFormat *srcfmt = info->src_fmt; |
1261 | SDL_PixelFormat *dstfmt = info->dst_fmt; |
1262 | Uint32 ckey = info->colorkey; |
1263 | int srcbpp = srcfmt->BytesPerPixel; |
1264 | int dstbpp = dstfmt->BytesPerPixel; |
1265 | Uint32 Pixel; |
1266 | unsigned sR, sG, sB; |
1267 | unsigned dR, dG, dB, dA; |
1268 | const unsigned sA = info->a; |
1269 | |
1270 | while (height--) { |
1271 | /* *INDENT-OFF* */ |
1272 | DUFFS_LOOP4( |
1273 | { |
1274 | RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel); |
1275 | if(sA && Pixel != ckey) { |
1276 | RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); |
1277 | DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA); |
1278 | ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA); |
1279 | ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA); |
1280 | } |
1281 | src += srcbpp; |
1282 | dst += dstbpp; |
1283 | }, |
1284 | width); |
1285 | /* *INDENT-ON* */ |
1286 | src += srcskip; |
1287 | dst += dstskip; |
1288 | } |
1289 | } |
1290 | |
1291 | /* General (slow) N->N blending with pixel alpha */ |
1292 | static void |
1293 | BlitNtoNPixelAlpha(SDL_BlitInfo * info) |
1294 | { |
1295 | int width = info->dst_w; |
1296 | int height = info->dst_h; |
1297 | Uint8 *src = info->src; |
1298 | int srcskip = info->src_skip; |
1299 | Uint8 *dst = info->dst; |
1300 | int dstskip = info->dst_skip; |
1301 | SDL_PixelFormat *srcfmt = info->src_fmt; |
1302 | SDL_PixelFormat *dstfmt = info->dst_fmt; |
1303 | int srcbpp; |
1304 | int dstbpp; |
1305 | Uint32 Pixel; |
1306 | unsigned sR, sG, sB, sA; |
1307 | unsigned dR, dG, dB, dA; |
1308 | |
1309 | /* Set up some basic variables */ |
1310 | srcbpp = srcfmt->BytesPerPixel; |
1311 | dstbpp = dstfmt->BytesPerPixel; |
1312 | |
1313 | while (height--) { |
1314 | /* *INDENT-OFF* */ |
1315 | DUFFS_LOOP4( |
1316 | { |
1317 | DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA); |
1318 | if(sA) { |
1319 | DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA); |
1320 | ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA); |
1321 | ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA); |
1322 | } |
1323 | src += srcbpp; |
1324 | dst += dstbpp; |
1325 | }, |
1326 | width); |
1327 | /* *INDENT-ON* */ |
1328 | src += srcskip; |
1329 | dst += dstskip; |
1330 | } |
1331 | } |
1332 | |
1333 | |
1334 | SDL_BlitFunc |
1335 | SDL_CalculateBlitA(SDL_Surface * surface) |
1336 | { |
1337 | SDL_PixelFormat *sf = surface->format; |
1338 | SDL_PixelFormat *df = surface->map->dst->format; |
1339 | |
1340 | switch (surface->map->info.flags & ~SDL_COPY_RLE_MASK) { |
1341 | case SDL_COPY_BLEND: |
1342 | /* Per-pixel alpha blits */ |
1343 | switch (df->BytesPerPixel) { |
1344 | case 1: |
1345 | if (df->palette != NULL) { |
1346 | return BlitNto1PixelAlpha; |
1347 | } else { |
1348 | /* RGB332 has no palette ! */ |
1349 | return BlitNtoNPixelAlpha; |
1350 | } |
1351 | |
1352 | case 2: |
1353 | #if SDL_ARM_NEON_BLITTERS || SDL_ARM_SIMD_BLITTERS |
1354 | if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000 |
1355 | && sf->Gmask == 0xff00 && df->Gmask == 0x7e0 |
1356 | && ((sf->Rmask == 0xff && df->Rmask == 0x1f) |
1357 | || (sf->Bmask == 0xff && df->Bmask == 0x1f))) |
1358 | { |
1359 | #if SDL_ARM_NEON_BLITTERS |
1360 | if (SDL_HasNEON()) |
1361 | return BlitARGBto565PixelAlphaARMNEON; |
1362 | #endif |
1363 | #if SDL_ARM_SIMD_BLITTERS |
1364 | if (SDL_HasARMSIMD()) |
1365 | return BlitARGBto565PixelAlphaARMSIMD; |
1366 | #endif |
1367 | } |
1368 | #endif |
1369 | if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000 |
1370 | && sf->Gmask == 0xff00 |
1371 | && ((sf->Rmask == 0xff && df->Rmask == 0x1f) |
1372 | || (sf->Bmask == 0xff && df->Bmask == 0x1f))) { |
1373 | if (df->Gmask == 0x7e0) |
1374 | return BlitARGBto565PixelAlpha; |
1375 | else if (df->Gmask == 0x3e0) |
1376 | return BlitARGBto555PixelAlpha; |
1377 | } |
1378 | return BlitNtoNPixelAlpha; |
1379 | |
1380 | case 4: |
1381 | if (sf->Rmask == df->Rmask |
1382 | && sf->Gmask == df->Gmask |
1383 | && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) { |
1384 | #if defined(__MMX__) || defined(__3dNOW__) |
1385 | if (sf->Rshift % 8 == 0 |
1386 | && sf->Gshift % 8 == 0 |
1387 | && sf->Bshift % 8 == 0 |
1388 | && sf->Ashift % 8 == 0 && sf->Aloss == 0) { |
1389 | #ifdef __3dNOW__ |
1390 | if (SDL_Has3DNow()) |
1391 | return BlitRGBtoRGBPixelAlphaMMX3DNOW; |
1392 | #endif |
1393 | #ifdef __MMX__ |
1394 | if (SDL_HasMMX()) |
1395 | return BlitRGBtoRGBPixelAlphaMMX; |
1396 | #endif |
1397 | } |
1398 | #endif /* __MMX__ || __3dNOW__ */ |
1399 | if (sf->Amask == 0xff000000) { |
1400 | #if SDL_ARM_NEON_BLITTERS |
1401 | if (SDL_HasNEON()) |
1402 | return BlitRGBtoRGBPixelAlphaARMNEON; |
1403 | #endif |
1404 | #if SDL_ARM_SIMD_BLITTERS |
1405 | if (SDL_HasARMSIMD()) |
1406 | return BlitRGBtoRGBPixelAlphaARMSIMD; |
1407 | #endif |
1408 | return BlitRGBtoRGBPixelAlpha; |
1409 | } |
1410 | } |
1411 | return BlitNtoNPixelAlpha; |
1412 | |
1413 | case 3: |
1414 | default: |
1415 | break; |
1416 | } |
1417 | return BlitNtoNPixelAlpha; |
1418 | |
1419 | case SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND: |
1420 | if (sf->Amask == 0) { |
1421 | /* Per-surface alpha blits */ |
1422 | switch (df->BytesPerPixel) { |
1423 | case 1: |
1424 | if (df->palette != NULL) { |
1425 | return BlitNto1SurfaceAlpha; |
1426 | } else { |
1427 | /* RGB332 has no palette ! */ |
1428 | return BlitNtoNSurfaceAlpha; |
1429 | } |
1430 | |
1431 | case 2: |
1432 | if (surface->map->identity) { |
1433 | if (df->Gmask == 0x7e0) { |
1434 | #ifdef __MMX__ |
1435 | if (SDL_HasMMX()) |
1436 | return Blit565to565SurfaceAlphaMMX; |
1437 | else |
1438 | #endif |
1439 | return Blit565to565SurfaceAlpha; |
1440 | } else if (df->Gmask == 0x3e0) { |
1441 | #ifdef __MMX__ |
1442 | if (SDL_HasMMX()) |
1443 | return Blit555to555SurfaceAlphaMMX; |
1444 | else |
1445 | #endif |
1446 | return Blit555to555SurfaceAlpha; |
1447 | } |
1448 | } |
1449 | return BlitNtoNSurfaceAlpha; |
1450 | |
1451 | case 4: |
1452 | if (sf->Rmask == df->Rmask |
1453 | && sf->Gmask == df->Gmask |
1454 | && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) { |
1455 | #ifdef __MMX__ |
1456 | if (sf->Rshift % 8 == 0 |
1457 | && sf->Gshift % 8 == 0 |
1458 | && sf->Bshift % 8 == 0 && SDL_HasMMX()) |
1459 | return BlitRGBtoRGBSurfaceAlphaMMX; |
1460 | #endif |
1461 | if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) { |
1462 | return BlitRGBtoRGBSurfaceAlpha; |
1463 | } |
1464 | } |
1465 | return BlitNtoNSurfaceAlpha; |
1466 | |
1467 | case 3: |
1468 | default: |
1469 | return BlitNtoNSurfaceAlpha; |
1470 | } |
1471 | } |
1472 | break; |
1473 | |
1474 | case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND: |
1475 | if (sf->Amask == 0) { |
1476 | if (df->BytesPerPixel == 1) { |
1477 | |
1478 | if (df->palette != NULL) { |
1479 | return BlitNto1SurfaceAlphaKey; |
1480 | } else { |
1481 | /* RGB332 has no palette ! */ |
1482 | return BlitNtoNSurfaceAlphaKey; |
1483 | } |
1484 | } else { |
1485 | return BlitNtoNSurfaceAlphaKey; |
1486 | } |
1487 | } |
1488 | break; |
1489 | } |
1490 | |
1491 | return NULL; |
1492 | } |
1493 | |
1494 | #endif /* SDL_HAVE_BLIT_A */ |
1495 | |
1496 | /* vi: set ts=4 sw=4 expandtab: */ |
1497 | |