1/*
2 Simple DirectMedia Layer
3 Copyright (C) 1997-2021 Sam Lantinga <slouken@libsdl.org>
4
5 This software is provided 'as-is', without any express or implied
6 warranty. In no event will the authors be held liable for any damages
7 arising from the use of this software.
8
9 Permission is granted to anyone to use this software for any purpose,
10 including commercial applications, and to alter it and redistribute it
11 freely, subject to the following restrictions:
12
13 1. The origin of this software must not be misrepresented; you must not
14 claim that you wrote the original software. If you use this software
15 in a product, an acknowledgment in the product documentation would be
16 appreciated but is not required.
17 2. Altered source versions must be plainly marked as such, and must not be
18 misrepresented as being the original software.
19 3. This notice may not be removed or altered from any source distribution.
20*/
21#include "../SDL_internal.h"
22
23#include "SDL_video.h"
24#include "SDL_blit.h"
25#include "SDL_render.h"
26
27static int SDL_LowerSoftStretchNearest(SDL_Surface *src, const SDL_Rect *srcrect, SDL_Surface *dst, const SDL_Rect *dstrect);
28static int SDL_LowerSoftStretchLinear(SDL_Surface *src, const SDL_Rect *srcrect, SDL_Surface *dst, const SDL_Rect *dstrect);
29static int SDL_UpperSoftStretch(SDL_Surface * src, const SDL_Rect * srcrect, SDL_Surface * dst, const SDL_Rect * dstrect, SDL_ScaleMode scaleMode);
30
/* Perform a fast, low quality, stretch blit between two surfaces of the
 * same pixel format, using nearest-neighbor sampling.
 * Returns 0 on success or a negative error code on failure. */
int
SDL_SoftStretch(SDL_Surface *src, const SDL_Rect *srcrect,
                SDL_Surface *dst, const SDL_Rect *dstrect)
{
    return SDL_UpperSoftStretch(src, srcrect, dst, dstrect, SDL_ScaleModeNearest);
}
37
/* Perform a bilinear (linear-filtered) stretch blit between two surfaces of
 * the same 4-bytes-per-pixel format.
 * Returns 0 on success or a negative error code on failure. */
int
SDL_SoftStretchLinear(SDL_Surface *src, const SDL_Rect *srcrect,
                      SDL_Surface *dst, const SDL_Rect *dstrect)
{
    return SDL_UpperSoftStretch(src, srcrect, dst, dstrect, SDL_ScaleModeLinear);
}
44
45static int
46SDL_UpperSoftStretch(SDL_Surface * src, const SDL_Rect * srcrect,
47 SDL_Surface * dst, const SDL_Rect * dstrect, SDL_ScaleMode scaleMode)
48{
49 int ret;
50 int src_locked;
51 int dst_locked;
52 SDL_Rect full_src;
53 SDL_Rect full_dst;
54
55 if (src->format->format != dst->format->format) {
56 return SDL_SetError("Only works with same format surfaces");
57 }
58
59 if (scaleMode != SDL_ScaleModeNearest) {
60 if (src->format->BytesPerPixel != 4 || src->format->format == SDL_PIXELFORMAT_ARGB2101010) {
61 return SDL_SetError("Wrong format");
62 }
63 }
64
65 /* Verify the blit rectangles */
66 if (srcrect) {
67 if ((srcrect->x < 0) || (srcrect->y < 0) ||
68 ((srcrect->x + srcrect->w) > src->w) ||
69 ((srcrect->y + srcrect->h) > src->h)) {
70 return SDL_SetError("Invalid source blit rectangle");
71 }
72 } else {
73 full_src.x = 0;
74 full_src.y = 0;
75 full_src.w = src->w;
76 full_src.h = src->h;
77 srcrect = &full_src;
78 }
79 if (dstrect) {
80 if ((dstrect->x < 0) || (dstrect->y < 0) ||
81 ((dstrect->x + dstrect->w) > dst->w) ||
82 ((dstrect->y + dstrect->h) > dst->h)) {
83 return SDL_SetError("Invalid destination blit rectangle");
84 }
85 } else {
86 full_dst.x = 0;
87 full_dst.y = 0;
88 full_dst.w = dst->w;
89 full_dst.h = dst->h;
90 dstrect = &full_dst;
91 }
92
93 if (dstrect->w <= 0 || dstrect->h <= 0) {
94 return 0;
95 }
96
97 if (srcrect->w > SDL_MAX_UINT16 || srcrect->h > SDL_MAX_UINT16 ||
98 dstrect->w > SDL_MAX_UINT16 || dstrect->h > SDL_MAX_UINT16) {
99 return SDL_SetError("Size too large for scaling");
100 }
101
102 /* Lock the destination if it's in hardware */
103 dst_locked = 0;
104 if (SDL_MUSTLOCK(dst)) {
105 if (SDL_LockSurface(dst) < 0) {
106 return SDL_SetError("Unable to lock destination surface");
107 }
108 dst_locked = 1;
109 }
110 /* Lock the source if it's in hardware */
111 src_locked = 0;
112 if (SDL_MUSTLOCK(src)) {
113 if (SDL_LockSurface(src) < 0) {
114 if (dst_locked) {
115 SDL_UnlockSurface(dst);
116 }
117 return SDL_SetError("Unable to lock source surface");
118 }
119 src_locked = 1;
120 }
121
122 if (scaleMode == SDL_ScaleModeNearest) {
123 ret = SDL_LowerSoftStretchNearest(src, srcrect, dst, dstrect);
124 } else {
125 ret = SDL_LowerSoftStretchLinear(src, srcrect, dst, dstrect);
126 }
127
128 /* We need to unlock the surfaces if they're locked */
129 if (dst_locked) {
130 SDL_UnlockSurface(dst);
131 }
132 if (src_locked) {
133 SDL_UnlockSurface(src);
134 }
135
136 return ret;
137}
138
/* bilinear interpolation precision must be < 8
   Because with SSE: add-multiply: _mm_madd_epi16 works with signed int
   so pixels 0xb1...... are negatives and false the result
   same in NEON probably */
#define PRECISION 7

/* Positions are tracked in 16.16 fixed point; blend fractions are reduced
   to PRECISION bits. */
#define FIXED_POINT(i) ((Uint32)(i) << 16)  /* int -> 16.16 fixed point */
#define SRC_INDEX(fp) ((Uint32)(fp) >> 16)  /* integer part (source pixel index) */
#define INTEGER(fp) ((Uint32)(fp) >> PRECISION)  /* drop a PRECISION-bit fraction */
#define FRAC(fp) ((Uint32)(fp >> (16 - PRECISION)) & ((1<<PRECISION) - 1))  /* fractional part, PRECISION bits */
#define FRAC_ZERO 0
#define FRAC_ONE (1 << PRECISION)
#define FP_ONE FIXED_POINT(1)
152
/* Shared locals and one-time setup for every bilinear scaler variant.
   Expects src_w/src_h/dst_w/dst_h/src_pitch/dst_pitch in scope; declares the
   fixed-point walking state consumed by BILINEAR___HEIGHT below. */
#define BILINEAR___START \
    int i; \
    int fp_sum_h, fp_step_h, left_pad_h, right_pad_h; \
    int fp_sum_w, fp_step_w, left_pad_w, right_pad_w; \
    int fp_sum_w_init, left_pad_w_init, right_pad_w_init, dst_gap, middle_init; \
    get_scaler_datas(src_h, dst_h, &fp_sum_h, &fp_step_h, &left_pad_h, &right_pad_h); \
    get_scaler_datas(src_w, dst_w, &fp_sum_w, &fp_step_w, &left_pad_w, &right_pad_w); \
    fp_sum_w_init = fp_sum_w + left_pad_w * fp_step_w; \
    left_pad_w_init = left_pad_w; \
    right_pad_w_init = right_pad_w; \
    dst_gap = dst_pitch - 4 * dst_w; \
    middle_init = dst_w - left_pad_w - right_pad_w; \

/* Per-output-row setup: select the two source rows (clamped at the top and
   bottom edges, where the second row increment collapses to 0), compute the
   vertical blend fractions frac_h0/frac_h1, and reset the horizontal state. */
#define BILINEAR___HEIGHT \
    int index_h, frac_h0, frac_h1, middle; \
    const Uint32 *src_h0, *src_h1; \
    int no_padding, incr_h0, incr_h1; \
    \
    no_padding = !(i < left_pad_h || i > dst_h - 1 - right_pad_h); \
    index_h = SRC_INDEX(fp_sum_h); \
    frac_h0 = FRAC(fp_sum_h); \
    \
    index_h = no_padding ? index_h : (i < left_pad_h ? 0 : src_h - 1); \
    frac_h0 = no_padding ? frac_h0 : 0; \
    incr_h1 = no_padding ? src_pitch : 0; \
    incr_h0 = index_h * src_pitch; \
    \
    src_h0 = (const Uint32 *)((const Uint8 *)src + incr_h0); \
    src_h1 = (const Uint32 *)((const Uint8 *)src_h0 + incr_h1); \
    \
    fp_sum_h += fp_step_h; \
    \
    frac_h1 = FRAC_ONE - frac_h0; \
    fp_sum_w = fp_sum_w_init; \
    right_pad_w = right_pad_w_init; \
    left_pad_w = left_pad_w_init; \
    middle = middle_init; \

191
192
/* Compute fixed-point sampling parameters for mapping dst_nb destination
 * pixels onto src_nb source pixels (one axis).
 *
 *  fp_start:  16.16 source coordinate of the first destination pixel center
 *             (may be negative).
 *  fp_step:   16.16 source step between consecutive destination pixels.
 *  left_pad:  count of leading destination pixels whose sample point falls
 *             before source pixel 0 (must be clamped to the first pixel).
 *  right_pad: count of trailing destination pixels whose 2-pixel
 *             interpolation window would run past the last source pixel.
 */
#if defined(__clang__)
// Remove inlining of this function
// Compiler crash with clang 9.0.8 / android-ndk-r21d
// Compiler crash with clang 11.0.3 / Xcode
// OK with clang 11.0.5 / android-ndk-22
// OK with clang 12.0.0 / Xcode
__attribute__((noinline))
#endif
static void
get_scaler_datas(int src_nb, int dst_nb, int *fp_start, int *fp_step, int *left_pad, int *right_pad)
{

    int step = FIXED_POINT(src_nb) / (dst_nb); /* source step in fixed point */
    int x0 = FP_ONE / 2; /* dst first pixel center at 0.5 in fixed point */
    int fp_sum;
    int i;
#if 0
    /* scale to source coordinates */
    x0 *= src_nb;
    x0 /= dst_nb; /* x0 == step / 2 */
#else
    /* Use this code for perfect match with pixman */
    Sint64 tmp[2];
    tmp[0] = (Sint64)step * (x0 >> 16);
    tmp[1] = (Sint64)step * (x0 & 0xFFFF);
    x0 = (int) (tmp[0] + ((tmp[1] + 0x8000) >> 16)); /* x0 == (step + 1) / 2 */
#endif
    /* -= 0.5, get back the pixel origin, in source coordinates */
    x0 -= FP_ONE / 2;

    *fp_start = x0;
    *fp_step = step;
    *left_pad = 0;
    *right_pad = 0;

    /* Walk every destination pixel once, counting how many fall outside the
       valid interpolation range on each side. */
    fp_sum = x0;
    for (i = 0; i < dst_nb; i++) {
        if (fp_sum < 0) {
            *left_pad += 1;
        } else {
            int index = SRC_INDEX(fp_sum);
            if (index > src_nb - 2) {
                *right_pad += 1;
            }
        }
        fp_sum += step;
    }
// SDL_Log("%d -> %d x0=%d step=%d left_pad=%d right_pad=%d", src_nb, dst_nb, *fp_start, *fp_step, *left_pad, *right_pad);
}
242
/* Byte-wise view of a 4-bytes-per-pixel value, in memory order. The actual
   channel meaning (RGBA, BGRA, ...) is irrelevant here: every channel is
   interpolated identically, so generic names a/b/c/d are used. */
typedef struct color_t {
    Uint8 a;
    Uint8 b;
    Uint8 c;
    Uint8 d;
} color_t;
249
#if 0
/* Debug helper (disabled): dump 8 bytes at 'var' as two groups of four. */
static void
printf_64(const char *str, void *var)
{
    uint8_t *val = (uint8_t*) var;
    printf(" * %s: %02x %02x %02x %02x _ %02x %02x %02x %02x\n",
           str, val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]);
}
#endif
259
260/* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
261
262static SDL_INLINE void
263INTERPOL(const Uint32 *src_x0, const Uint32 *src_x1, int frac0, int frac1, Uint32 *dst)
264{
265 const color_t *c0 = (const color_t *)src_x0;
266 const color_t *c1 = (const color_t *)src_x1;
267 color_t *cx = (color_t *)dst;
268#if 0
269 cx->a = c0->a + INTEGER(frac0 * (c1->a - c0->a));
270 cx->b = c0->b + INTEGER(frac0 * (c1->b - c0->b));
271 cx->c = c0->c + INTEGER(frac0 * (c1->c - c0->c));
272 cx->d = c0->d + INTEGER(frac0 * (c1->d - c0->d));
273#else
274 cx->a = INTEGER(frac1 * c0->a + frac0 * c1->a);
275 cx->b = INTEGER(frac1 * c0->b + frac0 * c1->b);
276 cx->c = INTEGER(frac1 * c0->c + frac0 * c1->c);
277 cx->d = INTEGER(frac1 * c0->d + frac0 * c1->d);
278#endif
279}
280
281static SDL_INLINE void
282INTERPOL_BILINEAR(const Uint32 *s0, const Uint32 *s1, int frac_w0, int frac_h0, int frac_h1, Uint32 *dst)
283{
284 Uint32 tmp[2];
285 unsigned int frac_w1 = FRAC_ONE - frac_w0;
286
287 /* Vertical first, store to 'tmp' */
288 INTERPOL(s0, s1, frac_h0, frac_h1, tmp);
289 INTERPOL(s0 + 1, s1 + 1, frac_h0, frac_h1, tmp + 1);
290
291 /* Horizontal, store to 'dst' */
292 INTERPOL(tmp, tmp + 1, frac_w0, frac_w1, dst);
293}
294
/* Bilinear scaler, portable C path (4 bytes per pixel).
 * src/dst point to the first pixel of the source/destination rectangles;
 * pitches are in bytes. Always returns 0. */
static int
scale_mat(const Uint32 *src, int src_w, int src_h, int src_pitch,
          Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    BILINEAR___START

    for (i = 0; i < dst_h; i++) {

        BILINEAR___HEIGHT

        /* Left edge: clamp to the first source column (horizontal frac 0) */
        while (left_pad_w--) {
            INTERPOL_BILINEAR(src_h0, src_h1, FRAC_ZERO, frac_h0, frac_h1, dst);
            dst += 1;
        }

        /* Middle: full 2x2 interpolation */
        while (middle--) {
            const Uint32 *s_00_01;
            const Uint32 *s_10_11;
            int index_w = 4 * SRC_INDEX(fp_sum_w);
            int frac_w = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;

/*
            x00 ... x0_ ..... x01
            .        .          .
            .        x          .
            .        .          .
            .        .          .
            x10 ... x1_ ..... x11
*/
            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);

            INTERPOL_BILINEAR(s_00_01, s_10_11, frac_w, frac_h0, frac_h1, dst);

            dst += 1;
        }

        /* Right edge: last two columns with horizontal frac FRAC_ONE,
           which selects the last column */
        while (right_pad_w--) {
            int index_w = 4 * (src_w - 2);
            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR(s_00_01, s_10_11, FRAC_ONE, frac_h0, frac_h1, dst);
            dst += 1;
        }
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return 0;
}
344
345#if defined(__SSE2__)
346# define HAVE_SSE2_INTRINSICS 1
347#endif
348
349#if defined(__ARM_NEON)
350# define HAVE_NEON_INTRINSICS 1
351# define CAST_uint8x8_t (uint8x8_t)
352# define CAST_uint32x2_t (uint32x2_t)
353#endif
354
355#if defined(__WINRT__) || defined(_MSC_VER)
356# if defined(HAVE_NEON_INTRINSICS)
357# undef CAST_uint8x8_t
358# undef CAST_uint32x2_t
359# define CAST_uint8x8_t
360# define CAST_uint32x2_t
361# endif
362#endif
363
364#if defined(HAVE_SSE2_INTRINSICS)
365
#if 0
/* Debug helper (disabled): dump an __m128i as eight 16-bit lanes. */
static void
printf_128(const char *str, __m128i var)
{
    uint16_t *val = (uint16_t*) &var;
    printf(" * %s: %04x %04x %04x %04x _ %04x %04x %04x %04x\n",
           str, val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]);
}
#endif
375
376static SDL_INLINE int
377hasSSE2()
378{
379 static int val = -1;
380 if (val != -1) {
381 return val;
382 }
383 val = SDL_HasSSE2();
384 return val;
385}
386
/* Compute one bilinearly interpolated output pixel with SSE2, from the 2x2
 * neighborhood starting at s0 (top row) and s1 (bottom row).
 * v_frac_h0/v_frac_h1 are the vertical blend fractions broadcast across all
 * eight 16-bit lanes; frac_w is the horizontal fraction (0..FRAC_ONE). */
static SDL_INLINE void
INTERPOL_BILINEAR_SSE(const Uint32 *s0, const Uint32 *s1, int frac_w, __m128i v_frac_h0, __m128i v_frac_h1, Uint32 *dst, __m128i zero)
{
    __m128i x_00_01, x_10_11; /* Pixels in 4*uint8 in row */
    __m128i v_frac_w0, k0, l0, d0, e0;

    int f, f2;
    f = frac_w;
    f2 = FRAC_ONE - frac_w;
    /* Interleave (1 - frac_w, frac_w) so _mm_madd_epi16 sums the pair */
    v_frac_w0 = _mm_set_epi16(f, f2, f, f2, f, f2, f, f2);


    x_00_01 = _mm_loadl_epi64((const __m128i *)s0); /* Load x00 and x01 */
    x_10_11 = _mm_loadl_epi64((const __m128i *)s1);

    /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */

    /* Interpolation vertical */
    k0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_00_01, zero), v_frac_h1);
    l0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_10_11, zero), v_frac_h0);
    k0 = _mm_add_epi16(k0, l0);

    /* For perfect match, clear the factionnal part eventually. */
    /*
    k0 = _mm_srli_epi16(k0, PRECISION);
    k0 = _mm_slli_epi16(k0, PRECISION);
    */

    /* Interpolation horizontal */
    l0 = _mm_unpacklo_epi64(/* unused */ l0, k0);
    k0 = _mm_madd_epi16(_mm_unpackhi_epi16(l0, k0), v_frac_w0);

    /* Store 1 pixel */
    d0 = _mm_srli_epi32(k0, PRECISION * 2); /* two blends applied: shift twice */
    e0 = _mm_packs_epi32(d0, d0);
    e0 = _mm_packus_epi16(e0, e0);
    *dst = _mm_cvtsi128_si32(e0);
}
425
/* Bilinear scaler, SSE2 path: processes two middle output pixels per loop
 * iteration; clamped left/right edge pixels and the odd trailing pixel are
 * handled one at a time via INTERPOL_BILINEAR_SSE. Always returns 0. */
static int
scale_mat_SSE(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    BILINEAR___START

    for (i = 0; i < dst_h; i++) {
        int nb_block2;
        __m128i v_frac_h0;
        __m128i v_frac_h1;
        __m128i zero;

        BILINEAR___HEIGHT

        nb_block2 = middle / 2;

        /* Broadcast the vertical blend fractions across all 16-bit lanes */
        v_frac_h0 = _mm_set_epi16(frac_h0, frac_h0, frac_h0, frac_h0, frac_h0, frac_h0, frac_h0, frac_h0);
        v_frac_h1 = _mm_set_epi16(frac_h1, frac_h1, frac_h1, frac_h1, frac_h1, frac_h1, frac_h1, frac_h1);
        zero = _mm_setzero_si128();

        /* Left edge: clamp to the first source column */
        while (left_pad_w--) {
            INTERPOL_BILINEAR_SSE(src_h0, src_h1, FRAC_ZERO, v_frac_h0, v_frac_h1, dst, zero);
            dst += 1;
        }

        /* Middle: two output pixels per iteration */
        while (nb_block2--) {
            int index_w_0, frac_w_0;
            int index_w_1, frac_w_1;

            const Uint32 *s_00_01, *s_02_03, *s_10_11, *s_12_13;

            __m128i x_00_01, x_10_11, x_02_03, x_12_13;/* Pixels in 4*uint8 in row */
            __m128i v_frac_w0, k0, l0, d0, e0;
            __m128i v_frac_w1, k1, l1, d1, e1;

            int f, f2;
            index_w_0 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_0 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_1 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_1 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
/*
            x00............ x01   x02...........x03
            .   .       .            .     .     .
            j0  f0      j1           j2    f1    j3
            .   .       .            .     .     .
            .   .       .            .     .     .
            .   .       .            .     .     .
            x10............ x11   x12...........x13
 */
            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_0);
            s_02_03 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_1);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_0);
            s_12_13 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_1);

            f = frac_w_0;
            f2 = FRAC_ONE - frac_w_0;
            v_frac_w0 = _mm_set_epi16(f, f2, f, f2, f, f2, f, f2);

            f = frac_w_1;
            f2 = FRAC_ONE - frac_w_1;
            v_frac_w1 = _mm_set_epi16(f, f2, f, f2, f, f2, f, f2);

            x_00_01 = _mm_loadl_epi64((const __m128i *)s_00_01); /* Load x00 and x01 */
            x_02_03 = _mm_loadl_epi64((const __m128i *)s_02_03);
            x_10_11 = _mm_loadl_epi64((const __m128i *)s_10_11);
            x_12_13 = _mm_loadl_epi64((const __m128i *)s_12_13);

            /* Interpolation vertical */
            k0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_00_01, zero), v_frac_h1);
            l0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_10_11, zero), v_frac_h0);
            k0 = _mm_add_epi16(k0, l0);
            k1 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_02_03, zero), v_frac_h1);
            l1 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_12_13, zero), v_frac_h0);
            k1 = _mm_add_epi16(k1, l1);

            /* Interpolation horizontal */
            l0 = _mm_unpacklo_epi64(/* unused */ l0, k0);
            k0 = _mm_madd_epi16(_mm_unpackhi_epi16(l0, k0), v_frac_w0);
            l1 = _mm_unpacklo_epi64(/* unused */ l1, k1);
            k1 = _mm_madd_epi16(_mm_unpackhi_epi16(l1, k1), v_frac_w1);

            /* Store 1 pixel */
            d0 = _mm_srli_epi32(k0, PRECISION * 2);
            e0 = _mm_packs_epi32(d0, d0);
            e0 = _mm_packus_epi16(e0, e0);
            *dst++ = _mm_cvtsi128_si32(e0);

            /* Store 1 pixel */
            d1 = _mm_srli_epi32(k1, PRECISION * 2);
            e1 = _mm_packs_epi32(d1, d1);
            e1 = _mm_packus_epi16(e1, e1);
            *dst++ = _mm_cvtsi128_si32(e1);
        }

        /* Last point */
        if (middle & 0x1) {
            const Uint32 *s_00_01;
            const Uint32 *s_10_11;
            int index_w = 4 * SRC_INDEX(fp_sum_w);
            int frac_w = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR_SSE(s_00_01, s_10_11, frac_w, v_frac_h0, v_frac_h1, dst, zero);
            dst += 1;
        }

        /* Right edge: last two columns with horizontal frac FRAC_ONE */
        while (right_pad_w--) {
            int index_w = 4 * (src_w - 2);
            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR_SSE(s_00_01, s_10_11, FRAC_ONE, v_frac_h0, v_frac_h1, dst, zero);
            dst += 1;
        }
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return 0;
}
545#endif
546
547#if defined(HAVE_NEON_INTRINSICS)
548
549static SDL_INLINE int
550hasNEON()
551{
552 static int val = -1;
553 if (val != -1) {
554 return val;
555 }
556 val = SDL_HasNEON();
557 return val;
558}
559
/* Compute one bilinearly interpolated output pixel with NEON, from the 2x2
 * neighborhood starting at s0 (top row) and s1 (bottom row).
 * v_frac_h0/v_frac_h1 are the vertical blend fractions broadcast across all
 * byte lanes; frac_w is the horizontal fraction (0..FRAC_ONE). */
static SDL_INLINE void
INTERPOL_BILINEAR_NEON(const Uint32 *s0, const Uint32 *s1, int frac_w, uint8x8_t v_frac_h0, uint8x8_t v_frac_h1, Uint32 *dst)
{
    uint8x8_t x_00_01, x_10_11; /* Pixels in 4*uint8 in row */
    uint16x8_t k0;
    uint32x4_t l0;
    uint16x8_t d0;
    uint8x8_t e0;

    x_00_01 = CAST_uint8x8_t vld1_u32(s0); /* Load 2 pixels */
    x_10_11 = CAST_uint8x8_t vld1_u32(s1);

    /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
    k0 = vmull_u8(x_00_01, v_frac_h1); /* k0 := x0 * (1 - frac) */
    k0 = vmlal_u8(k0, x_10_11, v_frac_h0); /* k0 += x1 * frac */

    /* k0 now contains 2 interpolated pixels { j0, j1 } */
    /* Horizontal blend: j0 * FRAC_ONE - j0 * frac_w + j1 * frac_w */
    l0 = vshll_n_u16(vget_low_u16(k0), PRECISION);
    l0 = vmlsl_n_u16(l0, vget_low_u16(k0), frac_w);
    l0 = vmlal_n_u16(l0, vget_high_u16(k0), frac_w);

    /* Shift and narrow (both halves from l0 on purpose: only lane 0 is
       stored below) */
    d0 = vcombine_u16(
            /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION),
            /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION)
    );

    /* Narrow again */
    e0 = vmovn_u16(d0);

    /* Store 1 pixel */
    *dst = vget_lane_u32(CAST_uint32x2_t e0, 0);
}
593
/* Bilinear scaler, NEON path: processes four middle output pixels per loop
 * iteration, then a possible pair, then a possible single pixel; clamped
 * left/right edge pixels go through INTERPOL_BILINEAR_NEON one at a time.
 * Always returns 0. */
static int
scale_mat_NEON(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    BILINEAR___START

    for (i = 0; i < dst_h; i++) {
        int nb_block4;
        uint8x8_t v_frac_h0, v_frac_h1;

        BILINEAR___HEIGHT

        nb_block4 = middle / 4;

        /* Broadcast the vertical blend fractions across all byte lanes */
        v_frac_h0 = vmov_n_u8(frac_h0);
        v_frac_h1 = vmov_n_u8(frac_h1);

        /* Left edge: clamp to the first source column */
        while (left_pad_w--) {
            INTERPOL_BILINEAR_NEON(src_h0, src_h1, FRAC_ZERO, v_frac_h0, v_frac_h1, dst);
            dst += 1;
        }

        /* Middle: four output pixels per iteration */
        while (nb_block4--) {
            int index_w_0, frac_w_0;
            int index_w_1, frac_w_1;
            int index_w_2, frac_w_2;
            int index_w_3, frac_w_3;

            const Uint32 *s_00_01, *s_02_03, *s_04_05, *s_06_07;
            const Uint32 *s_10_11, *s_12_13, *s_14_15, *s_16_17;

            uint8x8_t x_00_01, x_10_11, x_02_03, x_12_13;/* Pixels in 4*uint8 in row */
            uint8x8_t x_04_05, x_14_15, x_06_07, x_16_17;

            uint16x8_t k0, k1, k2, k3;
            uint32x4_t l0, l1, l2, l3;
            uint16x8_t d0, d1;
            uint8x8_t e0, e1;
            uint32x4_t f0;

            index_w_0 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_0 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_1 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_1 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_2 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_2 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_3 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_3 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;

            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_0);
            s_02_03 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_1);
            s_04_05 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_2);
            s_06_07 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_3);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_0);
            s_12_13 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_1);
            s_14_15 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_2);
            s_16_17 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_3);

            /* Interpolation vertical */
            x_00_01 = CAST_uint8x8_t vld1_u32(s_00_01); /* Load 2 pixels */
            x_02_03 = CAST_uint8x8_t vld1_u32(s_02_03);
            x_04_05 = CAST_uint8x8_t vld1_u32(s_04_05);
            x_06_07 = CAST_uint8x8_t vld1_u32(s_06_07);
            x_10_11 = CAST_uint8x8_t vld1_u32(s_10_11);
            x_12_13 = CAST_uint8x8_t vld1_u32(s_12_13);
            x_14_15 = CAST_uint8x8_t vld1_u32(s_14_15);
            x_16_17 = CAST_uint8x8_t vld1_u32(s_16_17);

            /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
            k0 = vmull_u8(x_00_01, v_frac_h1); /* k0 := x0 * (1 - frac) */
            k0 = vmlal_u8(k0, x_10_11, v_frac_h0); /* k0 += x1 * frac */

            k1 = vmull_u8(x_02_03, v_frac_h1);
            k1 = vmlal_u8(k1, x_12_13, v_frac_h0);

            k2 = vmull_u8(x_04_05, v_frac_h1);
            k2 = vmlal_u8(k2, x_14_15, v_frac_h0);

            k3 = vmull_u8(x_06_07, v_frac_h1);
            k3 = vmlal_u8(k3, x_16_17, v_frac_h0);

            /* k0 now contains 2 interpolated pixels { j0, j1 } */
            /* k1 now contains 2 interpolated pixels { j2, j3 } */
            /* k2 now contains 2 interpolated pixels { j4, j5 } */
            /* k3 now contains 2 interpolated pixels { j6, j7 } */

            /* Horizontal blends: jN * FRAC_ONE - jN * frac + jN+1 * frac */
            l0 = vshll_n_u16(vget_low_u16(k0), PRECISION);
            l0 = vmlsl_n_u16(l0, vget_low_u16(k0), frac_w_0);
            l0 = vmlal_n_u16(l0, vget_high_u16(k0), frac_w_0);

            l1 = vshll_n_u16(vget_low_u16(k1), PRECISION);
            l1 = vmlsl_n_u16(l1, vget_low_u16(k1), frac_w_1);
            l1 = vmlal_n_u16(l1, vget_high_u16(k1), frac_w_1);

            l2 = vshll_n_u16(vget_low_u16(k2), PRECISION);
            l2 = vmlsl_n_u16(l2, vget_low_u16(k2), frac_w_2);
            l2 = vmlal_n_u16(l2, vget_high_u16(k2), frac_w_2);

            l3 = vshll_n_u16(vget_low_u16(k3), PRECISION);
            l3 = vmlsl_n_u16(l3, vget_low_u16(k3), frac_w_3);
            l3 = vmlal_n_u16(l3, vget_high_u16(k3), frac_w_3);

            /* shift and narrow */
            d0 = vcombine_u16(
                    /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION),
                    /* uint16x4_t */ vshrn_n_u32(l1, 2 * PRECISION)
            );
            /* narrow again */
            e0 = vmovn_u16(d0);

            /* Shift and narrow */
            d1 = vcombine_u16(
                    /* uint16x4_t */ vshrn_n_u32(l2, 2 * PRECISION),
                    /* uint16x4_t */ vshrn_n_u32(l3, 2 * PRECISION)
            );
            /* Narrow again */
            e1 = vmovn_u16(d1);

            f0 = vcombine_u32(CAST_uint32x2_t e0, CAST_uint32x2_t e1);
            /* Store 4 pixels */
            vst1q_u32(dst, f0);

            dst += 4;
        }

        /* Two remaining middle pixels */
        if (middle & 0x2) {
            int index_w_0, frac_w_0;
            int index_w_1, frac_w_1;
            const Uint32 *s_00_01, *s_02_03;
            const Uint32 *s_10_11, *s_12_13;
            uint8x8_t x_00_01, x_10_11, x_02_03, x_12_13;/* Pixels in 4*uint8 in row */
            uint16x8_t k0, k1;
            uint32x4_t l0, l1;
            uint16x8_t d0;
            uint8x8_t e0;

            index_w_0 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_0 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_1 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_1 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
/*
            x00............ x01   x02...........x03
            .   .       .            .     .     .
            j0  dest0   j1           j2  dest1   j3
            .   .       .            .     .     .
            .   .       .            .     .     .
            .   .       .            .     .     .
            x10............ x11   x12...........x13
*/
            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_0);
            s_02_03 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_1);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_0);
            s_12_13 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_1);

            /* Interpolation vertical */
            x_00_01 = CAST_uint8x8_t vld1_u32(s_00_01);/* Load 2 pixels */
            x_02_03 = CAST_uint8x8_t vld1_u32(s_02_03);
            x_10_11 = CAST_uint8x8_t vld1_u32(s_10_11);
            x_12_13 = CAST_uint8x8_t vld1_u32(s_12_13);

            /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
            k0 = vmull_u8(x_00_01, v_frac_h1); /* k0 := x0 * (1 - frac) */
            k0 = vmlal_u8(k0, x_10_11, v_frac_h0); /* k0 += x1 * frac */

            k1 = vmull_u8(x_02_03, v_frac_h1);
            k1 = vmlal_u8(k1, x_12_13, v_frac_h0);

            /* k0 now contains 2 interpolated pixels { j0, j1 } */
            /* k1 now contains 2 interpolated pixels { j2, j3 } */

            l0 = vshll_n_u16(vget_low_u16(k0), PRECISION);
            l0 = vmlsl_n_u16(l0, vget_low_u16(k0), frac_w_0);
            l0 = vmlal_n_u16(l0, vget_high_u16(k0), frac_w_0);

            l1 = vshll_n_u16(vget_low_u16(k1), PRECISION);
            l1 = vmlsl_n_u16(l1, vget_low_u16(k1), frac_w_1);
            l1 = vmlal_n_u16(l1, vget_high_u16(k1), frac_w_1);

            /* Shift and narrow */

            d0 = vcombine_u16(
                    /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION),
                    /* uint16x4_t */ vshrn_n_u32(l1, 2 * PRECISION)
            );

            /* Narrow again */
            e0 = vmovn_u16(d0);

            /* Store 2 pixels */
            vst1_u32(dst, CAST_uint32x2_t e0);
            dst += 2;
        }

        /* Last point */
        if (middle & 0x1) {
            int index_w = 4 * SRC_INDEX(fp_sum_w);
            int frac_w = FRAC(fp_sum_w);
            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR_NEON(s_00_01, s_10_11, frac_w, v_frac_h0, v_frac_h1, dst);
            dst += 1;
        }

        /* Right edge: last two columns with horizontal frac FRAC_ONE */
        while (right_pad_w--) {
            int index_w = 4 * (src_w - 2);
            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR_NEON(s_00_01, s_10_11, FRAC_ONE, v_frac_h0, v_frac_h1, dst);
            dst += 1;
        }

        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return 0;
}
814#endif
815
816int
817SDL_LowerSoftStretchLinear(SDL_Surface *s, const SDL_Rect *srcrect,
818 SDL_Surface *d, const SDL_Rect *dstrect)
819{
820 int ret = -1;
821 int src_w = srcrect->w;
822 int src_h = srcrect->h;
823 int dst_w = dstrect->w;
824 int dst_h = dstrect->h;
825 int src_pitch = s->pitch;
826 int dst_pitch = d->pitch;
827 Uint32 *src = (Uint32 *) ((Uint8 *)s->pixels + srcrect->x * 4 + srcrect->y * src_pitch);
828 Uint32 *dst = (Uint32 *) ((Uint8 *)d->pixels + dstrect->x * 4 + dstrect->y * dst_pitch);
829
830#if defined(HAVE_NEON_INTRINSICS)
831 if (ret == -1 && hasNEON()) {
832 ret = scale_mat_NEON(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
833 }
834#endif
835
836#if defined(HAVE_SSE2_INTRINSICS)
837 if (ret == -1 && hasSSE2()) {
838 ret = scale_mat_SSE(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
839 }
840#endif
841
842 if (ret == -1) {
843 ret = scale_mat(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
844 }
845
846 return ret;
847}
848
849
/* Shared locals and setup for the nearest-neighbor scalers.
   Expects src_w/src_h/dst_w/dst_h/src_pitch/dst_pitch and bpp in scope.
   Positions advance in 16.16 fixed point, starting half a step in so that
   destination pixel centers map onto source pixel centers. */
#define SDL_SCALE_NEAREST__START \
    int i; \
    Uint32 posy, incy; \
    Uint32 posx, incx; \
    int dst_gap; \
    int srcy, n; \
    const Uint32 *src_h0; \
    /* Cast before shifting: src_h/src_w may be up to SDL_MAX_UINT16, and \
       (src_h << 16) performed in signed int would overflow, which is \
       undefined behavior. Unsigned arithmetic is well-defined here. */ \
    incy = ((Uint32)src_h << 16) / (Uint32)dst_h; \
    incx = ((Uint32)src_w << 16) / (Uint32)dst_w; \
    dst_gap = dst_pitch - bpp * dst_w; \
    posy = incy / 2; \

/* Per-output-row setup: select the source row and reset the horizontal
   position and the per-row pixel counter. */
#define SDL_SCALE_NEAREST__HEIGHT \
    srcy = (posy >> 16); \
    src_h0 = (const Uint32 *)((const Uint8 *)src_ptr + srcy * src_pitch); \
    posy += incy; \
    posx = incx / 2; \
    n = dst_w;
868
869
870static int
871scale_mat_nearest_1(const Uint32 *src_ptr, int src_w, int src_h, int src_pitch,
872 Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
873{
874 Uint32 bpp = 1;
875 SDL_SCALE_NEAREST__START
876 for (i = 0; i < dst_h; i++) {
877 SDL_SCALE_NEAREST__HEIGHT
878 while (n--) {
879 const Uint8 *src;
880 int srcx = bpp * (posx >> 16);
881 posx += incx;
882 src = (const Uint8 *)src_h0 + srcx;
883 *(Uint8*)dst = *src;
884 dst = (Uint32 *)((Uint8*)dst + bpp);
885 }
886 dst = (Uint32 *)((Uint8 *)dst + dst_gap);
887 }
888 return 0;
889}
890
891static int
892scale_mat_nearest_2(const Uint32 *src_ptr, int src_w, int src_h, int src_pitch,
893 Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
894{
895 Uint32 bpp = 2;
896 SDL_SCALE_NEAREST__START
897 for (i = 0; i < dst_h; i++) {
898 SDL_SCALE_NEAREST__HEIGHT
899 while (n--) {
900 const Uint16 *src;
901 int srcx = bpp * (posx >> 16);
902 posx += incx;
903 src = (const Uint16 *)((const Uint8 *)src_h0 + srcx);
904 *(Uint16*)dst = *src;
905 dst = (Uint32 *)((Uint8*)dst + bpp);
906 }
907 dst = (Uint32 *)((Uint8 *)dst + dst_gap);
908 }
909 return 0;
910}
911
912static int
913scale_mat_nearest_3(const Uint32 *src_ptr, int src_w, int src_h, int src_pitch,
914 Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
915{
916 Uint32 bpp = 3;
917 SDL_SCALE_NEAREST__START
918 for (i = 0; i < dst_h; i++) {
919 SDL_SCALE_NEAREST__HEIGHT
920 while (n--) {
921 const Uint8 *src;
922 int srcx = bpp * (posx >> 16);
923 posx += incx;
924 src = (const Uint8 *)src_h0 + srcx;
925 ((Uint8*)dst)[0] = src[0];
926 ((Uint8*)dst)[1] = src[1];
927 ((Uint8*)dst)[2] = src[2];
928 dst = (Uint32 *)((Uint8*)dst + bpp);
929 }
930 dst = (Uint32 *)((Uint8 *)dst + dst_gap);
931 }
932 return 0;
933}
934
935static int
936scale_mat_nearest_4(const Uint32 *src_ptr, int src_w, int src_h, int src_pitch,
937 Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
938{
939 Uint32 bpp = 4;
940 SDL_SCALE_NEAREST__START
941 for (i = 0; i < dst_h; i++) {
942 SDL_SCALE_NEAREST__HEIGHT
943 while (n--) {
944 const Uint32 *src;
945 int srcx = bpp * (posx >> 16);
946 posx += incx;
947 src = (const Uint32 *)((const Uint8 *)src_h0 + srcx);
948 *dst = *src;
949 dst = (Uint32 *)((Uint8*)dst + bpp);
950 }
951 dst = (Uint32 *)((Uint8 *)dst + dst_gap);
952 }
953 return 0;
954}
955
956int
957SDL_LowerSoftStretchNearest(SDL_Surface *s, const SDL_Rect *srcrect,
958 SDL_Surface *d, const SDL_Rect *dstrect)
959{
960 int src_w = srcrect->w;
961 int src_h = srcrect->h;
962 int dst_w = dstrect->w;
963 int dst_h = dstrect->h;
964 int src_pitch = s->pitch;
965 int dst_pitch = d->pitch;
966
967 const int bpp = d->format->BytesPerPixel;
968
969 Uint32 *src = (Uint32 *) ((Uint8 *)s->pixels + srcrect->x * bpp + srcrect->y * src_pitch);
970 Uint32 *dst = (Uint32 *) ((Uint8 *)d->pixels + dstrect->x * bpp + dstrect->y * dst_pitch);
971
972 if (bpp == 4) {
973 return scale_mat_nearest_4(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
974 } else if (bpp == 3) {
975 return scale_mat_nearest_3(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
976 } else if (bpp == 2) {
977 return scale_mat_nearest_2(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
978 } else {
979 return scale_mat_nearest_1(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
980 }
981}
982
983/* vi: set ts=4 sw=4 expandtab: */
984