/*
  Simple DirectMedia Layer
  Copyright (C) 1997-2025 Sam Lantinga <slouken@libsdl.org>

  This software is provided 'as-is', without any express or implied
  warranty.  In no event will the authors be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.
*/
#include "SDL_internal.h"

#include "SDL_surface_c.h"

static bool SDL_StretchSurfaceUncheckedNearest(SDL_Surface *src, const SDL_Rect *srcrect, SDL_Surface *dst, const SDL_Rect *dstrect);
static bool SDL_StretchSurfaceUncheckedLinear(SDL_Surface *src, const SDL_Rect *srcrect, SDL_Surface *dst, const SDL_Rect *dstrect);

bool SDL_StretchSurface(SDL_Surface *src, const SDL_Rect *srcrect, SDL_Surface *dst, const SDL_Rect *dstrect, SDL_ScaleMode scaleMode)
{
    bool result;
    int src_locked;
    int dst_locked;
    SDL_Rect full_src;
    SDL_Rect full_dst;

    if (!src) {
        return SDL_InvalidParamError("src");
    }
    if (!dst) {
        return SDL_InvalidParamError("dst");
    }

    if (src->format != dst->format) {
        // Slow!
        SDL_Surface *src_tmp = SDL_ConvertSurfaceAndColorspace(src, dst->format, dst->palette, dst->colorspace, dst->props);
        if (!src_tmp) {
            return false;
        }
        result = SDL_StretchSurface(src_tmp, srcrect, dst, dstrect, scaleMode);
        SDL_DestroySurface(src_tmp);
        return result;
    }

    if (SDL_ISPIXELFORMAT_FOURCC(src->format)) {
        // Slow!
        if (!dstrect) {
            full_dst.x = 0;
            full_dst.y = 0;
            full_dst.w = dst->w;
            full_dst.h = dst->h;
            dstrect = &full_dst;
        }

        SDL_Surface *src_tmp = SDL_ConvertSurface(src, SDL_PIXELFORMAT_XRGB8888);
        SDL_Surface *dst_tmp = SDL_CreateSurface(dstrect->w, dstrect->h, SDL_PIXELFORMAT_XRGB8888);
        if (src_tmp && dst_tmp) {
            result = SDL_StretchSurface(src_tmp, srcrect, dst_tmp, NULL, scaleMode);
            if (result) {
                result = SDL_ConvertPixelsAndColorspace(dstrect->w, dstrect->h,
                                                        dst_tmp->format, SDL_COLORSPACE_SRGB, 0,
                                                        dst_tmp->pixels, dst_tmp->pitch,
                                                        dst->format, dst->colorspace, SDL_GetSurfaceProperties(dst),
                                                        (Uint8 *)dst->pixels + dstrect->y * dst->pitch + dstrect->x * SDL_BYTESPERPIXEL(dst->format), dst->pitch);
            }
        } else {
            result = false;
        }
        SDL_DestroySurface(src_tmp);
        SDL_DestroySurface(dst_tmp);
        return result;
    }

    switch (scaleMode) {
    case SDL_SCALEMODE_NEAREST:
        break;
    case SDL_SCALEMODE_LINEAR:
        break;
    case SDL_SCALEMODE_PIXELART:
        scaleMode = SDL_SCALEMODE_NEAREST;
        break;
    default:
        return SDL_InvalidParamError("scaleMode");
    }

    if (scaleMode == SDL_SCALEMODE_LINEAR) {
        if (SDL_BYTESPERPIXEL(src->format) != 4 || src->format == SDL_PIXELFORMAT_ARGB2101010) {
            return SDL_SetError("Wrong format");
        }
    }
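
    // Note: the linear path blends four independent 8-bit channels, which is
    // why it is restricted above to 4-byte formats and why ARGB2101010,
    // whose 10-bit channels straddle byte boundaries, is excluded.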

    // Verify the blit rectangles
    if (srcrect) {
        if ((srcrect->x < 0) || (srcrect->y < 0) ||
            ((srcrect->x + srcrect->w) > src->w) ||
            ((srcrect->y + srcrect->h) > src->h)) {
            return SDL_SetError("Invalid source blit rectangle");
        }
    } else {
        full_src.x = 0;
        full_src.y = 0;
        full_src.w = src->w;
        full_src.h = src->h;
        srcrect = &full_src;
    }
    if (dstrect) {
        if ((dstrect->x < 0) || (dstrect->y < 0) ||
            ((dstrect->x + dstrect->w) > dst->w) ||
            ((dstrect->y + dstrect->h) > dst->h)) {
            return SDL_SetError("Invalid destination blit rectangle");
        }
    } else {
        full_dst.x = 0;
        full_dst.y = 0;
        full_dst.w = dst->w;
        full_dst.h = dst->h;
        dstrect = &full_dst;
    }

    if (dstrect->w <= 0 || dstrect->h <= 0) {
        return true;
    }

    if (srcrect->w > SDL_MAX_UINT16 || srcrect->h > SDL_MAX_UINT16 ||
        dstrect->w > SDL_MAX_UINT16 || dstrect->h > SDL_MAX_UINT16) {
        return SDL_SetError("Size too large for scaling");
    }

    // Lock the destination if it's in hardware
    dst_locked = 0;
    if (SDL_MUSTLOCK(dst)) {
        if (!SDL_LockSurface(dst)) {
            return SDL_SetError("Unable to lock destination surface");
        }
        dst_locked = 1;
    }
    // Lock the source if it's in hardware
    src_locked = 0;
    if (SDL_MUSTLOCK(src)) {
        if (!SDL_LockSurface(src)) {
            if (dst_locked) {
                SDL_UnlockSurface(dst);
            }
            return SDL_SetError("Unable to lock source surface");
        }
        src_locked = 1;
    }

    if (scaleMode == SDL_SCALEMODE_NEAREST) {
        result = SDL_StretchSurfaceUncheckedNearest(src, srcrect, dst, dstrect);
    } else {
        result = SDL_StretchSurfaceUncheckedLinear(src, srcrect, dst, dstrect);
    }

    // We need to unlock the surfaces if they're locked
    if (dst_locked) {
        SDL_UnlockSurface(dst);
    }
    if (src_locked) {
        SDL_UnlockSurface(src);
    }

    return result;
}
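
/* A minimal usage sketch of SDL_StretchSurface(), for illustration only;
   the sizes and format are arbitrary. Disabled, following the #if 0
   convention used for the other examples in this file. */
#if 0
static void stretch_example(void)
{
    SDL_Surface *src = SDL_CreateSurface(64, 64, SDL_PIXELFORMAT_ARGB8888);
    SDL_Surface *dst = SDL_CreateSurface(256, 256, SDL_PIXELFORMAT_ARGB8888);
    if (src && dst) {
        // NULL rects select the full source and destination areas
        SDL_StretchSurface(src, NULL, dst, NULL, SDL_SCALEMODE_LINEAR);
    }
    SDL_DestroySurface(src); // safe on NULL
    SDL_DestroySurface(dst);
}
#endif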

/* The bilinear interpolation precision must be < 8, because the SSE
   add-multiply (_mm_madd_epi16) operates on signed 16-bit integers: with
   8 bits of precision, intermediate values such as 0xb1...... would read
   as negative and falsify the result. The same probably applies to NEON. */
#define PRECISION 7

#define FIXED_POINT(i) ((Uint32)(i) << 16)
#define SRC_INDEX(fp) ((Uint32)(fp) >> 16)
#define INTEGER(fp) ((Uint32)(fp) >> PRECISION)
#define FRAC(fp) ((Uint32)((fp) >> (16 - PRECISION)) & ((1 << PRECISION) - 1))
#define FRAC_ZERO 0
#define FRAC_ONE (1 << PRECISION)
#define FP_ONE FIXED_POINT(1)
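
/* A worked example with PRECISION == 7, for illustration: the 16.16
   fixed-point value 0x34000 is 3.25, so SRC_INDEX(0x34000) == 3 (the
   source pixel index) and FRAC(0x34000) == 32, i.e. 0.25 scaled to the
   7-bit range where FRAC_ONE == 128 represents 1.0. */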

#define BILINEAR___START \
    int i; \
    Sint64 fp_sum_h; \
    int fp_step_h, left_pad_h, right_pad_h; \
    Sint64 fp_sum_w; \
    int fp_step_w, left_pad_w, right_pad_w; \
    Sint64 fp_sum_w_init; \
    int left_pad_w_init, right_pad_w_init, dst_gap, middle_init; \
    get_scaler_datas(src_h, dst_h, &fp_sum_h, &fp_step_h, &left_pad_h, &right_pad_h); \
    get_scaler_datas(src_w, dst_w, &fp_sum_w, &fp_step_w, &left_pad_w, &right_pad_w); \
    fp_sum_w_init = fp_sum_w + left_pad_w * fp_step_w; \
    left_pad_w_init = left_pad_w; \
    right_pad_w_init = right_pad_w; \
    dst_gap = dst_pitch - 4 * dst_w; \
    middle_init = dst_w - left_pad_w - right_pad_w;

#define BILINEAR___HEIGHT \
    int index_h, frac_h0, frac_h1, middle; \
    const Uint32 *src_h0, *src_h1; \
    int no_padding; \
    Uint64 incr_h0, incr_h1; \
\
    no_padding = !(i < left_pad_h || i > dst_h - 1 - right_pad_h); \
    index_h = SRC_INDEX(fp_sum_h); \
    frac_h0 = FRAC(fp_sum_h); \
\
    index_h = no_padding ? index_h : (i < left_pad_h ? 0 : src_h - 1); \
    frac_h0 = no_padding ? frac_h0 : 0; \
    incr_h1 = no_padding ? src_pitch : 0; \
    incr_h0 = (Uint64)index_h * src_pitch; \
\
    src_h0 = (const Uint32 *)((const Uint8 *)src + incr_h0); \
    src_h1 = (const Uint32 *)((const Uint8 *)src_h0 + incr_h1); \
\
    fp_sum_h += fp_step_h; \
\
    frac_h1 = FRAC_ONE - frac_h0; \
    fp_sum_w = fp_sum_w_init; \
    right_pad_w = right_pad_w_init; \
    left_pad_w = left_pad_w_init; \
    middle = middle_init;

#ifdef __clang__
// Prevent inlining of this function
// Compiler crash with clang 9.0.8 / android-ndk-r21d
// Compiler crash with clang 11.0.3 / Xcode
// OK with clang 11.0.5 / android-ndk-22
// OK with clang 12.0.0 / Xcode
__attribute__((noinline))
#endif
static void get_scaler_datas(int src_nb, int dst_nb, Sint64 *fp_start, int *fp_step, int *left_pad, int *right_pad)
{
    int step = FIXED_POINT(src_nb) / (dst_nb); // source step in fixed point
    int x0 = FP_ONE / 2;                       // dst first pixel center at 0.5 in fixed point
    Sint64 fp_sum;
    int i;
#if 0
    // scale to source coordinates
    x0 *= src_nb;
    x0 /= dst_nb; // x0 == step / 2
#else
    // Use this code for a perfect match with pixman
    Sint64 tmp[2];
    tmp[0] = (Sint64)step * (x0 >> 16);
    tmp[1] = (Sint64)step * (x0 & 0xFFFF);
    x0 = (int)(tmp[0] + ((tmp[1] + 0x8000) >> 16)); // x0 == (step + 1) / 2
#endif
    // -= 0.5, to get back to the pixel origin, in source coordinates
    x0 -= FP_ONE / 2;

    *fp_start = x0;
    *fp_step = step;
    *left_pad = 0;
    *right_pad = 0;

    fp_sum = x0;
    for (i = 0; i < dst_nb; i++) {
        if (fp_sum < 0) {
            *left_pad += 1;
        } else {
            int index = SRC_INDEX(fp_sum);
            if (index > src_nb - 2) {
                *right_pad += 1;
            }
        }
        fp_sum += step;
    }
    // SDL_Log("%d -> %d x0=%d step=%d left_pad=%d right_pad=%d", src_nb, dst_nb, *fp_start, *fp_step, *left_pad, *right_pad);
}
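
/* A worked example, for illustration: get_scaler_datas(2, 4, ...) gives
   step == 0x8000 (0.5 in 16.16 fixed point) and fp_start == -0x4000
   (-0.25), so the four destination pixel centers map to source positions
   -0.25, 0.25, 0.75 and 1.25. The first lies before the source and the
   last has no right-hand neighbor to blend with, hence left_pad == 1 and
   right_pad == 1; those columns are clamped to the source edges. */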

typedef struct color_t
{
    Uint8 a;
    Uint8 b;
    Uint8 c;
    Uint8 d;
} color_t;

#if 0
static void printf_64(const char *str, void *var)
{
    uint8_t *val = (uint8_t *)var;
    printf(" * %s: %02x %02x %02x %02x _ %02x %02x %02x %02x\n",
           str, val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]);
}
#endif

/* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */

static SDL_INLINE void INTERPOL(const Uint32 *src_x0, const Uint32 *src_x1, int frac0, int frac1, Uint32 *dst)
{
    const color_t *c0 = (const color_t *)src_x0;
    const color_t *c1 = (const color_t *)src_x1;
    color_t *cx = (color_t *)dst;
#if 0
    cx->a = c0->a + INTEGER(frac0 * (c1->a - c0->a));
    cx->b = c0->b + INTEGER(frac0 * (c1->b - c0->b));
    cx->c = c0->c + INTEGER(frac0 * (c1->c - c0->c));
    cx->d = c0->d + INTEGER(frac0 * (c1->d - c0->d));
#else
    cx->a = (Uint8)INTEGER(frac1 * c0->a + frac0 * c1->a);
    cx->b = (Uint8)INTEGER(frac1 * c0->b + frac0 * c1->b);
    cx->c = (Uint8)INTEGER(frac1 * c0->c + frac0 * c1->c);
    cx->d = (Uint8)INTEGER(frac1 * c0->d + frac0 * c1->d);
#endif
}
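
/* A worked example of INTERPOL(), for illustration, with PRECISION == 7:
   blending channel values c0 == 100 and c1 == 200 at frac0 == 32 (0.25,
   so frac1 == 96) gives INTEGER(96 * 100 + 32 * 200) == 16000 >> 7 == 125,
   i.e. one quarter of the way from 100 to 200. */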

static SDL_INLINE void INTERPOL_BILINEAR(const Uint32 *s0, const Uint32 *s1, int frac_w0, int frac_h0, int frac_h1, Uint32 *dst)
{
    Uint32 tmp[2];
    unsigned int frac_w1 = FRAC_ONE - frac_w0;

    // Vertical first, store to 'tmp'
    INTERPOL(s0, s1, frac_h0, frac_h1, tmp);
    INTERPOL(s0 + 1, s1 + 1, frac_h0, frac_h1, tmp + 1);

    // Horizontal, store to 'dst'
    INTERPOL(tmp, tmp + 1, frac_w0, frac_w1, dst);
}

static bool scale_mat(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    BILINEAR___START

    for (i = 0; i < dst_h; i++) {

        BILINEAR___HEIGHT

        while (left_pad_w--) {
            INTERPOL_BILINEAR(src_h0, src_h1, FRAC_ZERO, frac_h0, frac_h1, dst);
            dst += 1;
        }

        while (middle--) {
            const Uint32 *s_00_01;
            const Uint32 *s_10_11;
            int index_w = 4 * SRC_INDEX(fp_sum_w);
            int frac_w = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;

            /*
               x00 ... x0_ ..... x01
               .        .         .
               .        x         .
               .        .         .
               .        .         .
               x10 ... x1_ ..... x11
            */
            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);

            INTERPOL_BILINEAR(s_00_01, s_10_11, frac_w, frac_h0, frac_h1, dst);

            dst += 1;
        }

        while (right_pad_w--) {
            int index_w = 4 * (src_w - 2);
            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR(s_00_01, s_10_11, FRAC_ONE, frac_h0, frac_h1, dst);
            dst += 1;
        }
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}

#ifdef SDL_NEON_INTRINSICS
#define CAST_uint8x8_t  (uint8x8_t)
#define CAST_uint32x2_t (uint32x2_t)
#endif

#if defined(_MSC_VER)
#ifdef SDL_NEON_INTRINSICS
// MSVC does not accept C-style casts on its NEON vector types,
// so the casts are compiled out there
#undef CAST_uint8x8_t
#undef CAST_uint32x2_t
#define CAST_uint8x8_t
#define CAST_uint32x2_t
#endif
#endif

#ifdef SDL_SSE2_INTRINSICS

#if 0
static void SDL_TARGETING("sse2") printf_128(const char *str, __m128i var)
{
    uint16_t *val = (uint16_t *)&var;
    printf(" * %s: %04x %04x %04x %04x _ %04x %04x %04x %04x\n",
           str, val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]);
}
#endif

static SDL_INLINE int hasSSE2(void)
{
    static int val = -1;
    if (val != -1) {
        return val;
    }
    val = SDL_HasSSE2();
    return val;
}

static SDL_INLINE void SDL_TARGETING("sse2") INTERPOL_BILINEAR_SSE(const Uint32 *s0, const Uint32 *s1, int frac_w, __m128i v_frac_h0, __m128i v_frac_h1, Uint32 *dst, __m128i zero)
{
    __m128i x_00_01, x_10_11; /* Two pixels, as 4 Uint8 each, per row */
    __m128i v_frac_w0, k0, l0, d0, e0;

    int f, f2;
    f = frac_w;
    f2 = FRAC_ONE - frac_w;
    v_frac_w0 = _mm_set_epi16((short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2);

    x_00_01 = _mm_loadl_epi64((const __m128i *)s0); // Load x00 and x01
    x_10_11 = _mm_loadl_epi64((const __m128i *)s1);

    /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */

    // Vertical interpolation
    k0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_00_01, zero), v_frac_h1);
    l0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_10_11, zero), v_frac_h0);
    k0 = _mm_add_epi16(k0, l0);

    // For a perfect match, the fractional part can optionally be cleared here:
    /*
       k0 = _mm_srli_epi16(k0, PRECISION);
       k0 = _mm_slli_epi16(k0, PRECISION);
    */

    // Horizontal interpolation
    l0 = _mm_unpacklo_epi64(/* unused */ l0, k0);
    k0 = _mm_madd_epi16(_mm_unpackhi_epi16(l0, k0), v_frac_w0);

    // Store 1 pixel
    d0 = _mm_srli_epi32(k0, PRECISION * 2);
    e0 = _mm_packs_epi32(d0, d0);
    e0 = _mm_packus_epi16(e0, e0);
    *dst = _mm_cvtsi128_si32(e0);
}
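
/* A note on the horizontal step above, for illustration: after the vertical
   pass, k0 holds the two blended pixels as eight 16-bit channels
   { a0 b0 c0 d0 a1 b1 c1 d1 }. _mm_unpackhi_epi16(l0, k0) interleaves them
   into { a0 a1 b0 b1 c0 c1 d0 d1 }, so _mm_madd_epi16 with the repeated
   { 1 - frac, frac } factors yields one 32-bit sum per channel, i.e.
   channel0 * (1 - frac) + channel1 * frac, scaled by FRAC_ONE twice,
   hence the final shift by PRECISION * 2. */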

static bool SDL_TARGETING("sse2") scale_mat_SSE(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    BILINEAR___START

    for (i = 0; i < dst_h; i++) {
        int nb_block2;
        __m128i v_frac_h0;
        __m128i v_frac_h1;
        __m128i zero;

        BILINEAR___HEIGHT

        nb_block2 = middle / 2;

        v_frac_h0 = _mm_set_epi16((short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0);
        v_frac_h1 = _mm_set_epi16((short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1);
        zero = _mm_setzero_si128();

        while (left_pad_w--) {
            INTERPOL_BILINEAR_SSE(src_h0, src_h1, FRAC_ZERO, v_frac_h0, v_frac_h1, dst, zero);
            dst += 1;
        }

        while (nb_block2--) {
            int index_w_0, frac_w_0;
            int index_w_1, frac_w_1;

            const Uint32 *s_00_01, *s_02_03, *s_10_11, *s_12_13;

            __m128i x_00_01, x_10_11, x_02_03, x_12_13; /* Two pixels, as 4 Uint8 each, per row */
            __m128i v_frac_w0, k0, l0, d0, e0;
            __m128i v_frac_w1, k1, l1, d1, e1;

            int f, f2;
            index_w_0 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_0 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_1 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_1 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            /*
               x00............ x01   x02...........x03
               .               .     .             .
               j0   f0         j1    j2   f1       j3
               .               .     .             .
               .               .     .             .
               .               .     .             .
               x10............ x11   x12...........x13
            */
            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_0);
            s_02_03 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_1);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_0);
            s_12_13 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_1);

            f = frac_w_0;
            f2 = FRAC_ONE - frac_w_0;
            v_frac_w0 = _mm_set_epi16((short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2);

            f = frac_w_1;
            f2 = FRAC_ONE - frac_w_1;
            v_frac_w1 = _mm_set_epi16((short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2);

            x_00_01 = _mm_loadl_epi64((const __m128i *)s_00_01); // Load x00 and x01
            x_02_03 = _mm_loadl_epi64((const __m128i *)s_02_03);
            x_10_11 = _mm_loadl_epi64((const __m128i *)s_10_11);
            x_12_13 = _mm_loadl_epi64((const __m128i *)s_12_13);

            // Vertical interpolation
            k0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_00_01, zero), v_frac_h1);
            l0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_10_11, zero), v_frac_h0);
            k0 = _mm_add_epi16(k0, l0);
            k1 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_02_03, zero), v_frac_h1);
            l1 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_12_13, zero), v_frac_h0);
            k1 = _mm_add_epi16(k1, l1);

            // Horizontal interpolation
            l0 = _mm_unpacklo_epi64(/* unused */ l0, k0);
            k0 = _mm_madd_epi16(_mm_unpackhi_epi16(l0, k0), v_frac_w0);
            l1 = _mm_unpacklo_epi64(/* unused */ l1, k1);
            k1 = _mm_madd_epi16(_mm_unpackhi_epi16(l1, k1), v_frac_w1);

            // Store 1 pixel
            d0 = _mm_srli_epi32(k0, PRECISION * 2);
            e0 = _mm_packs_epi32(d0, d0);
            e0 = _mm_packus_epi16(e0, e0);
            *dst++ = _mm_cvtsi128_si32(e0);

            // Store 1 pixel
            d1 = _mm_srli_epi32(k1, PRECISION * 2);
            e1 = _mm_packs_epi32(d1, d1);
            e1 = _mm_packus_epi16(e1, e1);
            *dst++ = _mm_cvtsi128_si32(e1);
        }

        // Last point
        if (middle & 0x1) {
            const Uint32 *s_00_01;
            const Uint32 *s_10_11;
            int index_w = 4 * SRC_INDEX(fp_sum_w);
            int frac_w = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR_SSE(s_00_01, s_10_11, frac_w, v_frac_h0, v_frac_h1, dst, zero);
            dst += 1;
        }

        while (right_pad_w--) {
            int index_w = 4 * (src_w - 2);
            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR_SSE(s_00_01, s_10_11, FRAC_ONE, v_frac_h0, v_frac_h1, dst, zero);
            dst += 1;
        }
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}
#endif

#ifdef SDL_NEON_INTRINSICS

static SDL_INLINE int hasNEON(void)
{
    static int val = -1;
    if (val != -1) {
        return val;
    }
    val = SDL_HasNEON();
    return val;
}

static SDL_INLINE void INTERPOL_BILINEAR_NEON(const Uint32 *s0, const Uint32 *s1, int frac_w, uint8x8_t v_frac_h0, uint8x8_t v_frac_h1, Uint32 *dst)
{
    uint8x8_t x_00_01, x_10_11; /* Two pixels, as 4 Uint8 each, per row */
    uint16x8_t k0;
    uint32x4_t l0;
    uint16x8_t d0;
    uint8x8_t e0;

    x_00_01 = CAST_uint8x8_t vld1_u32(s0); // Load 2 pixels
    x_10_11 = CAST_uint8x8_t vld1_u32(s1);

    /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
    k0 = vmull_u8(x_00_01, v_frac_h1);     /* k0 := x0 * (1 - frac) */
    k0 = vmlal_u8(k0, x_10_11, v_frac_h0); /* k0 += x1 * frac */

    // k0 now contains 2 interpolated pixels { j0, j1 }
    l0 = vshll_n_u16(vget_low_u16(k0), PRECISION);
    l0 = vmlsl_n_u16(l0, vget_low_u16(k0), frac_w);
    l0 = vmlal_n_u16(l0, vget_high_u16(k0), frac_w);

    // Shift and narrow
    d0 = vcombine_u16(
        /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION),
        /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION));

    // Narrow again
    e0 = vmovn_u16(d0);

    // Store 1 pixel
    *dst = vget_lane_u32(CAST_uint32x2_t e0, 0);
}
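
/* A note on the horizontal step above, for illustration: with j0 and j1
   the vertically blended pixels held in k0, the vshll/vmlsl/vmlal sequence
   computes (j0 << PRECISION) - j0 * frac_w + j1 * frac_w, which is exactly
   j0 * (FRAC_ONE - frac_w) + j1 * frac_w, again scaled by FRAC_ONE twice,
   hence the narrowing shift by 2 * PRECISION. */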

static bool scale_mat_NEON(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    BILINEAR___START

    for (i = 0; i < dst_h; i++) {
        int nb_block4;
        uint8x8_t v_frac_h0, v_frac_h1;

        BILINEAR___HEIGHT

        nb_block4 = middle / 4;

        v_frac_h0 = vmov_n_u8(frac_h0);
        v_frac_h1 = vmov_n_u8(frac_h1);

        while (left_pad_w--) {
            INTERPOL_BILINEAR_NEON(src_h0, src_h1, FRAC_ZERO, v_frac_h0, v_frac_h1, dst);
            dst += 1;
        }

        while (nb_block4--) {
            int index_w_0, frac_w_0;
            int index_w_1, frac_w_1;
            int index_w_2, frac_w_2;
            int index_w_3, frac_w_3;

            const Uint32 *s_00_01, *s_02_03, *s_04_05, *s_06_07;
            const Uint32 *s_10_11, *s_12_13, *s_14_15, *s_16_17;

            uint8x8_t x_00_01, x_10_11, x_02_03, x_12_13; /* Two pixels, as 4 Uint8 each, per row */
            uint8x8_t x_04_05, x_14_15, x_06_07, x_16_17;

            uint16x8_t k0, k1, k2, k3;
            uint32x4_t l0, l1, l2, l3;
            uint16x8_t d0, d1;
            uint8x8_t e0, e1;
            uint32x4_t f0;

            index_w_0 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_0 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_1 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_1 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_2 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_2 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_3 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_3 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;

            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_0);
            s_02_03 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_1);
            s_04_05 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_2);
            s_06_07 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_3);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_0);
            s_12_13 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_1);
            s_14_15 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_2);
            s_16_17 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_3);

            // Vertical interpolation
            x_00_01 = CAST_uint8x8_t vld1_u32(s_00_01); // Load 2 pixels
            x_02_03 = CAST_uint8x8_t vld1_u32(s_02_03);
            x_04_05 = CAST_uint8x8_t vld1_u32(s_04_05);
            x_06_07 = CAST_uint8x8_t vld1_u32(s_06_07);
            x_10_11 = CAST_uint8x8_t vld1_u32(s_10_11);
            x_12_13 = CAST_uint8x8_t vld1_u32(s_12_13);
            x_14_15 = CAST_uint8x8_t vld1_u32(s_14_15);
            x_16_17 = CAST_uint8x8_t vld1_u32(s_16_17);

            /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
            k0 = vmull_u8(x_00_01, v_frac_h1);     /* k0 := x0 * (1 - frac) */
            k0 = vmlal_u8(k0, x_10_11, v_frac_h0); /* k0 += x1 * frac */

            k1 = vmull_u8(x_02_03, v_frac_h1);
            k1 = vmlal_u8(k1, x_12_13, v_frac_h0);

            k2 = vmull_u8(x_04_05, v_frac_h1);
            k2 = vmlal_u8(k2, x_14_15, v_frac_h0);

            k3 = vmull_u8(x_06_07, v_frac_h1);
            k3 = vmlal_u8(k3, x_16_17, v_frac_h0);

            // k0 now contains 2 interpolated pixels { j0, j1 }
            // k1 now contains 2 interpolated pixels { j2, j3 }
            // k2 now contains 2 interpolated pixels { j4, j5 }
            // k3 now contains 2 interpolated pixels { j6, j7 }

            l0 = vshll_n_u16(vget_low_u16(k0), PRECISION);
            l0 = vmlsl_n_u16(l0, vget_low_u16(k0), frac_w_0);
            l0 = vmlal_n_u16(l0, vget_high_u16(k0), frac_w_0);

            l1 = vshll_n_u16(vget_low_u16(k1), PRECISION);
            l1 = vmlsl_n_u16(l1, vget_low_u16(k1), frac_w_1);
            l1 = vmlal_n_u16(l1, vget_high_u16(k1), frac_w_1);

            l2 = vshll_n_u16(vget_low_u16(k2), PRECISION);
            l2 = vmlsl_n_u16(l2, vget_low_u16(k2), frac_w_2);
            l2 = vmlal_n_u16(l2, vget_high_u16(k2), frac_w_2);

            l3 = vshll_n_u16(vget_low_u16(k3), PRECISION);
            l3 = vmlsl_n_u16(l3, vget_low_u16(k3), frac_w_3);
            l3 = vmlal_n_u16(l3, vget_high_u16(k3), frac_w_3);

            // Shift and narrow
            d0 = vcombine_u16(
                /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION),
                /* uint16x4_t */ vshrn_n_u32(l1, 2 * PRECISION));
            // Narrow again
            e0 = vmovn_u16(d0);

            // Shift and narrow
            d1 = vcombine_u16(
                /* uint16x4_t */ vshrn_n_u32(l2, 2 * PRECISION),
                /* uint16x4_t */ vshrn_n_u32(l3, 2 * PRECISION));
            // Narrow again
            e1 = vmovn_u16(d1);

            f0 = vcombine_u32(CAST_uint32x2_t e0, CAST_uint32x2_t e1);
            // Store 4 pixels
            vst1q_u32(dst, f0);

            dst += 4;
        }

        if (middle & 0x2) {
            int index_w_0, frac_w_0;
            int index_w_1, frac_w_1;
            const Uint32 *s_00_01, *s_02_03;
            const Uint32 *s_10_11, *s_12_13;
            uint8x8_t x_00_01, x_10_11, x_02_03, x_12_13; /* Two pixels, as 4 Uint8 each, per row */
            uint16x8_t k0, k1;
            uint32x4_t l0, l1;
            uint16x8_t d0;
            uint8x8_t e0;

            index_w_0 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_0 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_1 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_1 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            /*
               x00............ x01   x02...........x03
               .               .     .             .
               j0   dest0      j1    j2   dest1    j3
               .               .     .             .
               .               .     .             .
               .               .     .             .
               x10............ x11   x12...........x13
            */
            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_0);
            s_02_03 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_1);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_0);
            s_12_13 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_1);

            // Vertical interpolation
            x_00_01 = CAST_uint8x8_t vld1_u32(s_00_01); // Load 2 pixels
            x_02_03 = CAST_uint8x8_t vld1_u32(s_02_03);
            x_10_11 = CAST_uint8x8_t vld1_u32(s_10_11);
            x_12_13 = CAST_uint8x8_t vld1_u32(s_12_13);

            /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
            k0 = vmull_u8(x_00_01, v_frac_h1);     /* k0 := x0 * (1 - frac) */
            k0 = vmlal_u8(k0, x_10_11, v_frac_h0); /* k0 += x1 * frac */

            k1 = vmull_u8(x_02_03, v_frac_h1);
            k1 = vmlal_u8(k1, x_12_13, v_frac_h0);

            // k0 now contains 2 interpolated pixels { j0, j1 }
            // k1 now contains 2 interpolated pixels { j2, j3 }

            l0 = vshll_n_u16(vget_low_u16(k0), PRECISION);
            l0 = vmlsl_n_u16(l0, vget_low_u16(k0), frac_w_0);
            l0 = vmlal_n_u16(l0, vget_high_u16(k0), frac_w_0);

            l1 = vshll_n_u16(vget_low_u16(k1), PRECISION);
            l1 = vmlsl_n_u16(l1, vget_low_u16(k1), frac_w_1);
            l1 = vmlal_n_u16(l1, vget_high_u16(k1), frac_w_1);

            // Shift and narrow
            d0 = vcombine_u16(
                /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION),
                /* uint16x4_t */ vshrn_n_u32(l1, 2 * PRECISION));

            // Narrow again
            e0 = vmovn_u16(d0);

            // Store 2 pixels
            vst1_u32(dst, CAST_uint32x2_t e0);
            dst += 2;
        }

        // Last point
        if (middle & 0x1) {
            int index_w = 4 * SRC_INDEX(fp_sum_w);
            int frac_w = FRAC(fp_sum_w);
            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR_NEON(s_00_01, s_10_11, frac_w, v_frac_h0, v_frac_h1, dst);
            dst += 1;
        }

        while (right_pad_w--) {
            int index_w = 4 * (src_w - 2);
            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR_NEON(s_00_01, s_10_11, FRAC_ONE, v_frac_h0, v_frac_h1, dst);
            dst += 1;
        }

        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}
#endif

static bool SDL_StretchSurfaceUncheckedLinear(SDL_Surface *s, const SDL_Rect *srcrect, SDL_Surface *d, const SDL_Rect *dstrect)
{
    bool result = false;
    int src_w = srcrect->w;
    int src_h = srcrect->h;
    int dst_w = dstrect->w;
    int dst_h = dstrect->h;
    int src_pitch = s->pitch;
    int dst_pitch = d->pitch;
    Uint32 *src = (Uint32 *)((Uint8 *)s->pixels + srcrect->x * 4 + srcrect->y * src_pitch);
    Uint32 *dst = (Uint32 *)((Uint8 *)d->pixels + dstrect->x * 4 + dstrect->y * dst_pitch);

#ifdef SDL_NEON_INTRINSICS
    if (!result && hasNEON()) {
        result = scale_mat_NEON(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    }
#endif

#ifdef SDL_SSE2_INTRINSICS
    if (!result && hasSSE2()) {
        result = scale_mat_SSE(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    }
#endif

    if (!result) {
        result = scale_mat(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    }

    return result;
}

#define SDL_SCALE_NEAREST__START \
    int i; \
    Uint64 posy, incy; \
    Uint64 posx, incx; \
    Uint64 srcy, srcx; \
    int dst_gap, n; \
    const Uint32 *src_h0; \
    incy = ((Uint64)src_h << 16) / dst_h; \
    incx = ((Uint64)src_w << 16) / dst_w; \
    dst_gap = dst_pitch - bpp * dst_w; \
    posy = incy / 2;

#define SDL_SCALE_NEAREST__HEIGHT \
    srcy = (posy >> 16); \
    src_h0 = (const Uint32 *)((const Uint8 *)src_ptr + srcy * src_pitch); \
    posy += incy; \
    posx = incx / 2; \
    n = dst_w;
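
/* A worked example of the 16.16 stepping above, for illustration: scaling
   a 3-pixel row to 6 pixels gives incx == 0x8000 (0.5) and a first center
   at posx == 0x4000 (0.25), so successive (posx >> 16) values select the
   source columns 0, 0, 1, 1, 2, 2. */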

static bool scale_mat_nearest_1(const Uint32 *src_ptr, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    Uint32 bpp = 1;
    SDL_SCALE_NEAREST__START
    for (i = 0; i < dst_h; i++) {
        SDL_SCALE_NEAREST__HEIGHT
        while (n--) {
            const Uint8 *src;
            srcx = bpp * (posx >> 16);
            posx += incx;
            src = (const Uint8 *)src_h0 + srcx;
            *(Uint8 *)dst = *src;
            dst = (Uint32 *)((Uint8 *)dst + bpp);
        }
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}

static bool scale_mat_nearest_2(const Uint32 *src_ptr, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    Uint32 bpp = 2;
    SDL_SCALE_NEAREST__START
    for (i = 0; i < dst_h; i++) {
        SDL_SCALE_NEAREST__HEIGHT
        while (n--) {
            const Uint16 *src;
            srcx = bpp * (posx >> 16);
            posx += incx;
            src = (const Uint16 *)((const Uint8 *)src_h0 + srcx);
            *(Uint16 *)dst = *src;
            dst = (Uint32 *)((Uint8 *)dst + bpp);
        }
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}

static bool scale_mat_nearest_3(const Uint32 *src_ptr, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    Uint32 bpp = 3;
    SDL_SCALE_NEAREST__START
    for (i = 0; i < dst_h; i++) {
        SDL_SCALE_NEAREST__HEIGHT
        while (n--) {
            const Uint8 *src;
            srcx = bpp * (posx >> 16);
            posx += incx;
            src = (const Uint8 *)src_h0 + srcx;
            ((Uint8 *)dst)[0] = src[0];
            ((Uint8 *)dst)[1] = src[1];
            ((Uint8 *)dst)[2] = src[2];
            dst = (Uint32 *)((Uint8 *)dst + bpp);
        }
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}

static bool scale_mat_nearest_4(const Uint32 *src_ptr, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    Uint32 bpp = 4;
    SDL_SCALE_NEAREST__START
    for (i = 0; i < dst_h; i++) {
        SDL_SCALE_NEAREST__HEIGHT
        while (n--) {
            const Uint32 *src;
            srcx = bpp * (posx >> 16);
            posx += incx;
            src = (const Uint32 *)((const Uint8 *)src_h0 + srcx);
            *dst = *src;
            dst = (Uint32 *)((Uint8 *)dst + bpp);
        }
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}

static bool SDL_StretchSurfaceUncheckedNearest(SDL_Surface *s, const SDL_Rect *srcrect, SDL_Surface *d, const SDL_Rect *dstrect)
{
    int src_w = srcrect->w;
    int src_h = srcrect->h;
    int dst_w = dstrect->w;
    int dst_h = dstrect->h;
    int src_pitch = s->pitch;
    int dst_pitch = d->pitch;
    int bpp = SDL_BYTESPERPIXEL(d->format);

    Uint32 *src = (Uint32 *)((Uint8 *)s->pixels + srcrect->x * bpp + srcrect->y * src_pitch);
    Uint32 *dst = (Uint32 *)((Uint8 *)d->pixels + dstrect->x * bpp + dstrect->y * dst_pitch);

    if (bpp == 4) {
        return scale_mat_nearest_4(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    } else if (bpp == 3) {
        return scale_mat_nearest_3(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    } else if (bpp == 2) {
        return scale_mat_nearest_2(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    } else {
        return scale_mat_nearest_1(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    }
}