/*
  Simple DirectMedia Layer
  Copyright (C) 1997-2025 Sam Lantinga <slouken@libsdl.org>

  This software is provided 'as-is', without any express or implied
  warranty.  In no event will the authors be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.
*/
#include "SDL_internal.h"

#include "SDL_surface_c.h"

static bool SDL_StretchSurfaceUncheckedNearest(SDL_Surface *src, const SDL_Rect *srcrect, SDL_Surface *dst, const SDL_Rect *dstrect);
static bool SDL_StretchSurfaceUncheckedLinear(SDL_Surface *src, const SDL_Rect *srcrect, SDL_Surface *dst, const SDL_Rect *dstrect);

bool SDL_StretchSurface(SDL_Surface *src, const SDL_Rect *srcrect, SDL_Surface *dst, const SDL_Rect *dstrect, SDL_ScaleMode scaleMode)
{
    bool result;
    int src_locked;
    int dst_locked;
    SDL_Rect full_src;
    SDL_Rect full_dst;

    if (!src) {
        return SDL_InvalidParamError("src");
    }
    if (!dst) {
        return SDL_InvalidParamError("dst");
    }

    if (src->format != dst->format) {
        // Slow!
        SDL_Surface *src_tmp = SDL_ConvertSurfaceAndColorspace(src, dst->format, dst->palette, dst->colorspace, dst->props);
        if (!src_tmp) {
            return false;
        }
        result = SDL_StretchSurface(src_tmp, srcrect, dst, dstrect, scaleMode);
        SDL_DestroySurface(src_tmp);
        return result;
    }

    if (SDL_ISPIXELFORMAT_FOURCC(src->format)) {
        // Slow!
        if (!dstrect) {
            full_dst.x = 0;
            full_dst.y = 0;
            full_dst.w = dst->w;
            full_dst.h = dst->h;
            dstrect = &full_dst;
        }

        SDL_Surface *src_tmp = SDL_ConvertSurface(src, SDL_PIXELFORMAT_XRGB8888);
        SDL_Surface *dst_tmp = SDL_CreateSurface(dstrect->w, dstrect->h, SDL_PIXELFORMAT_XRGB8888);
        if (src_tmp && dst_tmp) {
            result = SDL_StretchSurface(src_tmp, srcrect, dst_tmp, NULL, scaleMode);
            if (result) {
                result = SDL_ConvertPixelsAndColorspace(dstrect->w, dstrect->h,
                                                        dst_tmp->format, SDL_COLORSPACE_SRGB, 0,
                                                        dst_tmp->pixels, dst_tmp->pitch,
                                                        dst->format, dst->colorspace, SDL_GetSurfaceProperties(dst),
                                                        (Uint8 *)dst->pixels + dstrect->y * dst->pitch + dstrect->x * SDL_BYTESPERPIXEL(dst->format), dst->pitch);
            }
        } else {
            result = false;
        }
        SDL_DestroySurface(src_tmp);
        SDL_DestroySurface(dst_tmp);
        return result;
    }

    switch (scaleMode) {
    case SDL_SCALEMODE_NEAREST:
        break;
    case SDL_SCALEMODE_LINEAR:
        break;
    case SDL_SCALEMODE_PIXELART:
        scaleMode = SDL_SCALEMODE_NEAREST;
        break;
    default:
        return SDL_InvalidParamError("scaleMode");
    }

    if (scaleMode == SDL_SCALEMODE_LINEAR) {
        if (SDL_BYTESPERPIXEL(src->format) != 4 || src->format == SDL_PIXELFORMAT_ARGB2101010) {
            return SDL_SetError("Wrong format");
        }
    }
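
    // Note: the linear path blends four independent 8-bit channels, which is
    // why it is restricted above to 4-byte formats and why ARGB2101010,
    // whose 10-bit channels straddle byte boundaries, is excluded.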

    // Verify the blit rectangles
    if (srcrect) {
        if ((srcrect->x < 0) || (srcrect->y < 0) ||
            ((srcrect->x + srcrect->w) > src->w) ||
            ((srcrect->y + srcrect->h) > src->h)) {
            return SDL_SetError("Invalid source blit rectangle");
        }
    } else {
        full_src.x = 0;
        full_src.y = 0;
        full_src.w = src->w;
        full_src.h = src->h;
        srcrect = &full_src;
    }
    if (dstrect) {
        if ((dstrect->x < 0) || (dstrect->y < 0) ||
            ((dstrect->x + dstrect->w) > dst->w) ||
            ((dstrect->y + dstrect->h) > dst->h)) {
            return SDL_SetError("Invalid destination blit rectangle");
        }
    } else {
        full_dst.x = 0;
        full_dst.y = 0;
        full_dst.w = dst->w;
        full_dst.h = dst->h;
        dstrect = &full_dst;
    }

    if (dstrect->w <= 0 || dstrect->h <= 0) {
        return true;
    }

    if (srcrect->w > SDL_MAX_UINT16 || srcrect->h > SDL_MAX_UINT16 ||
        dstrect->w > SDL_MAX_UINT16 || dstrect->h > SDL_MAX_UINT16) {
        return SDL_SetError("Size too large for scaling");
    }

    // Lock the destination if it's in hardware
    dst_locked = 0;
    if (SDL_MUSTLOCK(dst)) {
        if (!SDL_LockSurface(dst)) {
            return SDL_SetError("Unable to lock destination surface");
        }
        dst_locked = 1;
    }
    // Lock the source if it's in hardware
    src_locked = 0;
    if (SDL_MUSTLOCK(src)) {
        if (!SDL_LockSurface(src)) {
            if (dst_locked) {
                SDL_UnlockSurface(dst);
            }
            return SDL_SetError("Unable to lock source surface");
        }
        src_locked = 1;
    }

    if (scaleMode == SDL_SCALEMODE_NEAREST) {
        result = SDL_StretchSurfaceUncheckedNearest(src, srcrect, dst, dstrect);
    } else {
        result = SDL_StretchSurfaceUncheckedLinear(src, srcrect, dst, dstrect);
    }

    // We need to unlock the surfaces if they're locked
    if (dst_locked) {
        SDL_UnlockSurface(dst);
    }
    if (src_locked) {
        SDL_UnlockSurface(src);
    }

    return result;
}
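
/* A minimal usage sketch of SDL_StretchSurface(), for illustration only;
   the sizes and format are arbitrary. Disabled, following the #if 0
   convention used for the other examples in this file. */
#if 0
static void stretch_example(void)
{
    SDL_Surface *src = SDL_CreateSurface(64, 64, SDL_PIXELFORMAT_ARGB8888);
    SDL_Surface *dst = SDL_CreateSurface(256, 256, SDL_PIXELFORMAT_ARGB8888);
    if (src && dst) {
        // NULL rects select the full source and destination areas
        SDL_StretchSurface(src, NULL, dst, NULL, SDL_SCALEMODE_LINEAR);
    }
    SDL_DestroySurface(src); // safe on NULL
    SDL_DestroySurface(dst);
}
#endif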

/* The bilinear interpolation precision must be < 8, because the SSE
   add-multiply (_mm_madd_epi16) operates on signed 16-bit integers: with
   8 bits of precision, intermediate values such as 0xb1...... would read
   as negative and falsify the result. The same probably applies to NEON. */
#define PRECISION 7

#define FIXED_POINT(i) ((Uint32)(i) << 16)
#define SRC_INDEX(fp) ((Uint32)(fp) >> 16)
#define INTEGER(fp) ((Uint32)(fp) >> PRECISION)
#define FRAC(fp) ((Uint32)((fp) >> (16 - PRECISION)) & ((1 << PRECISION) - 1))
#define FRAC_ZERO 0
#define FRAC_ONE (1 << PRECISION)
#define FP_ONE FIXED_POINT(1)
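
/* A worked example with PRECISION == 7, for illustration: the 16.16
   fixed-point value 0x34000 is 3.25, so SRC_INDEX(0x34000) == 3 (the
   source pixel index) and FRAC(0x34000) == 32, i.e. 0.25 scaled to the
   7-bit range where FRAC_ONE == 128 represents 1.0. */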

#define BILINEAR___START \
    int i; \
    Sint64 fp_sum_h; \
    int fp_step_h, left_pad_h, right_pad_h; \
    Sint64 fp_sum_w; \
    int fp_step_w, left_pad_w, right_pad_w; \
    Sint64 fp_sum_w_init; \
    int left_pad_w_init, right_pad_w_init, dst_gap, middle_init; \
    get_scaler_datas(src_h, dst_h, &fp_sum_h, &fp_step_h, &left_pad_h, &right_pad_h); \
    get_scaler_datas(src_w, dst_w, &fp_sum_w, &fp_step_w, &left_pad_w, &right_pad_w); \
    fp_sum_w_init = fp_sum_w + left_pad_w * fp_step_w; \
    left_pad_w_init = left_pad_w; \
    right_pad_w_init = right_pad_w; \
    dst_gap = dst_pitch - 4 * dst_w; \
    middle_init = dst_w - left_pad_w - right_pad_w;

#define BILINEAR___HEIGHT \
    int index_h, frac_h0, frac_h1, middle; \
    const Uint32 *src_h0, *src_h1; \
    int no_padding; \
    Uint64 incr_h0, incr_h1; \
\
    no_padding = !(i < left_pad_h || i > dst_h - 1 - right_pad_h); \
    index_h = SRC_INDEX(fp_sum_h); \
    frac_h0 = FRAC(fp_sum_h); \
\
    index_h = no_padding ? index_h : (i < left_pad_h ? 0 : src_h - 1); \
    frac_h0 = no_padding ? frac_h0 : 0; \
    incr_h1 = no_padding ? src_pitch : 0; \
    incr_h0 = (Uint64)index_h * src_pitch; \
\
    src_h0 = (const Uint32 *)((const Uint8 *)src + incr_h0); \
    src_h1 = (const Uint32 *)((const Uint8 *)src_h0 + incr_h1); \
\
    fp_sum_h += fp_step_h; \
\
    frac_h1 = FRAC_ONE - frac_h0; \
    fp_sum_w = fp_sum_w_init; \
    right_pad_w = right_pad_w_init; \
    left_pad_w = left_pad_w_init; \
    middle = middle_init;

#ifdef __clang__
// Prevent inlining of this function
// Compiler crash with clang 9.0.8 / android-ndk-r21d
// Compiler crash with clang 11.0.3 / Xcode
// OK with clang 11.0.5 / android-ndk-22
// OK with clang 12.0.0 / Xcode
__attribute__((noinline))
#endif
static void get_scaler_datas(int src_nb, int dst_nb, Sint64 *fp_start, int *fp_step, int *left_pad, int *right_pad)
{
    int step = FIXED_POINT(src_nb) / (dst_nb); // source step in fixed point
    int x0 = FP_ONE / 2;                       // dst first pixel center at 0.5 in fixed point
    Sint64 fp_sum;
    int i;
#if 0
    // scale to source coordinates
    x0 *= src_nb;
    x0 /= dst_nb; // x0 == step / 2
#else
    // Use this code for a perfect match with pixman
    Sint64 tmp[2];
    tmp[0] = (Sint64)step * (x0 >> 16);
    tmp[1] = (Sint64)step * (x0 & 0xFFFF);
    x0 = (int)(tmp[0] + ((tmp[1] + 0x8000) >> 16)); // x0 == (step + 1) / 2
#endif
    // -= 0.5, to get back to the pixel origin, in source coordinates
    x0 -= FP_ONE / 2;

    *fp_start = x0;
    *fp_step = step;
    *left_pad = 0;
    *right_pad = 0;

    fp_sum = x0;
    for (i = 0; i < dst_nb; i++) {
        if (fp_sum < 0) {
            *left_pad += 1;
        } else {
            int index = SRC_INDEX(fp_sum);
            if (index > src_nb - 2) {
                *right_pad += 1;
            }
        }
        fp_sum += step;
    }
    // SDL_Log("%d -> %d x0=%d step=%d left_pad=%d right_pad=%d", src_nb, dst_nb, *fp_start, *fp_step, *left_pad, *right_pad);
}
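
/* A worked example, for illustration: get_scaler_datas(2, 4, ...) gives
   step == 0x8000 (0.5 in 16.16 fixed point) and fp_start == -0x4000
   (-0.25), so the four destination pixel centers map to source positions
   -0.25, 0.25, 0.75 and 1.25. The first lies before the source and the
   last has no right-hand neighbor to blend with, hence left_pad == 1 and
   right_pad == 1; those columns are clamped to the source edges. */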

typedef struct color_t
{
    Uint8 a;
    Uint8 b;
    Uint8 c;
    Uint8 d;
} color_t;

#if 0
static void printf_64(const char *str, void *var)
{
    uint8_t *val = (uint8_t *)var;
    printf(" * %s: %02x %02x %02x %02x _ %02x %02x %02x %02x\n",
           str, val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]);
}
#endif

/* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */

static SDL_INLINE void INTERPOL(const Uint32 *src_x0, const Uint32 *src_x1, int frac0, int frac1, Uint32 *dst)
{
    const color_t *c0 = (const color_t *)src_x0;
    const color_t *c1 = (const color_t *)src_x1;
    color_t *cx = (color_t *)dst;
#if 0
    cx->a = c0->a + INTEGER(frac0 * (c1->a - c0->a));
    cx->b = c0->b + INTEGER(frac0 * (c1->b - c0->b));
    cx->c = c0->c + INTEGER(frac0 * (c1->c - c0->c));
    cx->d = c0->d + INTEGER(frac0 * (c1->d - c0->d));
#else
    cx->a = (Uint8)INTEGER(frac1 * c0->a + frac0 * c1->a);
    cx->b = (Uint8)INTEGER(frac1 * c0->b + frac0 * c1->b);
    cx->c = (Uint8)INTEGER(frac1 * c0->c + frac0 * c1->c);
    cx->d = (Uint8)INTEGER(frac1 * c0->d + frac0 * c1->d);
#endif
}
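
/* A worked example of INTERPOL(), for illustration, with PRECISION == 7:
   blending channel values c0 == 100 and c1 == 200 at frac0 == 32 (0.25,
   so frac1 == 96) gives INTEGER(96 * 100 + 32 * 200) == 16000 >> 7 == 125,
   i.e. one quarter of the way from 100 to 200. */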

static SDL_INLINE void INTERPOL_BILINEAR(const Uint32 *s0, const Uint32 *s1, int frac_w0, int frac_h0, int frac_h1, Uint32 *dst)
{
    Uint32 tmp[2];
    unsigned int frac_w1 = FRAC_ONE - frac_w0;

    // Vertical first, store to 'tmp'
    INTERPOL(s0, s1, frac_h0, frac_h1, tmp);
    INTERPOL(s0 + 1, s1 + 1, frac_h0, frac_h1, tmp + 1);

    // Horizontal, store to 'dst'
    INTERPOL(tmp, tmp + 1, frac_w0, frac_w1, dst);
}

static bool scale_mat(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    BILINEAR___START

    for (i = 0; i < dst_h; i++) {

        BILINEAR___HEIGHT

        while (left_pad_w--) {
            INTERPOL_BILINEAR(src_h0, src_h1, FRAC_ZERO, frac_h0, frac_h1, dst);
            dst += 1;
        }

        while (middle--) {
            const Uint32 *s_00_01;
            const Uint32 *s_10_11;
            int index_w = 4 * SRC_INDEX(fp_sum_w);
            int frac_w = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;

            /*
               x00 ... x0_ ..... x01
               .        .         .
               .        x         .
               .        .         .
               .        .         .
               x10 ... x1_ ..... x11
            */
            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);

            INTERPOL_BILINEAR(s_00_01, s_10_11, frac_w, frac_h0, frac_h1, dst);

            dst += 1;
        }

        while (right_pad_w--) {
            int index_w = 4 * (src_w - 2);
            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR(s_00_01, s_10_11, FRAC_ONE, frac_h0, frac_h1, dst);
            dst += 1;
        }
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}

#ifdef SDL_NEON_INTRINSICS
#define CAST_uint8x8_t  (uint8x8_t)
#define CAST_uint32x2_t (uint32x2_t)
#endif

#if defined(_MSC_VER)
#ifdef SDL_NEON_INTRINSICS
// MSVC does not accept C-style casts on its NEON vector types,
// so the casts are compiled out there
#undef CAST_uint8x8_t
#undef CAST_uint32x2_t
#define CAST_uint8x8_t
#define CAST_uint32x2_t
#endif
#endif

#ifdef SDL_SSE2_INTRINSICS

#if 0
static void SDL_TARGETING("sse2") printf_128(const char *str, __m128i var)
{
    uint16_t *val = (uint16_t *)&var;
    printf(" * %s: %04x %04x %04x %04x _ %04x %04x %04x %04x\n",
           str, val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]);
}
#endif

static SDL_INLINE int hasSSE2(void)
{
    static int val = -1;
    if (val != -1) {
        return val;
    }
    val = SDL_HasSSE2();
    return val;
}

static SDL_INLINE void SDL_TARGETING("sse2") INTERPOL_BILINEAR_SSE(const Uint32 *s0, const Uint32 *s1, int frac_w, __m128i v_frac_h0, __m128i v_frac_h1, Uint32 *dst, __m128i zero)
{
    __m128i x_00_01, x_10_11; /* Two pixels, as 4 Uint8 each, per row */
    __m128i v_frac_w0, k0, l0, d0, e0;

    int f, f2;
    f = frac_w;
    f2 = FRAC_ONE - frac_w;
    v_frac_w0 = _mm_set_epi16((short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2);

    x_00_01 = _mm_loadl_epi64((const __m128i *)s0); // Load x00 and x01
    x_10_11 = _mm_loadl_epi64((const __m128i *)s1);

    /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */

    // Vertical interpolation
    k0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_00_01, zero), v_frac_h1);
    l0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_10_11, zero), v_frac_h0);
    k0 = _mm_add_epi16(k0, l0);

    // For a perfect match, the fractional part can optionally be cleared here:
    /*
       k0 = _mm_srli_epi16(k0, PRECISION);
       k0 = _mm_slli_epi16(k0, PRECISION);
    */

    // Horizontal interpolation
    l0 = _mm_unpacklo_epi64(/* unused */ l0, k0);
    k0 = _mm_madd_epi16(_mm_unpackhi_epi16(l0, k0), v_frac_w0);

    // Store 1 pixel
    d0 = _mm_srli_epi32(k0, PRECISION * 2);
    e0 = _mm_packs_epi32(d0, d0);
    e0 = _mm_packus_epi16(e0, e0);
    *dst = _mm_cvtsi128_si32(e0);
}
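
/* A note on the horizontal step above, for illustration: after the vertical
   pass, k0 holds the two blended pixels as eight 16-bit channels
   { a0 b0 c0 d0 a1 b1 c1 d1 }. _mm_unpackhi_epi16(l0, k0) interleaves them
   into { a0 a1 b0 b1 c0 c1 d0 d1 }, so _mm_madd_epi16 with the repeated
   { 1 - frac, frac } factors yields one 32-bit sum per channel, i.e.
   channel0 * (1 - frac) + channel1 * frac, scaled by FRAC_ONE twice,
   hence the final shift by PRECISION * 2. */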

static bool SDL_TARGETING("sse2") scale_mat_SSE(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    BILINEAR___START

    for (i = 0; i < dst_h; i++) {
        int nb_block2;
        __m128i v_frac_h0;
        __m128i v_frac_h1;
        __m128i zero;

        BILINEAR___HEIGHT

        nb_block2 = middle / 2;

        v_frac_h0 = _mm_set_epi16((short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0);
        v_frac_h1 = _mm_set_epi16((short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1);
        zero = _mm_setzero_si128();

        while (left_pad_w--) {
            INTERPOL_BILINEAR_SSE(src_h0, src_h1, FRAC_ZERO, v_frac_h0, v_frac_h1, dst, zero);
            dst += 1;
        }

        while (nb_block2--) {
            int index_w_0, frac_w_0;
            int index_w_1, frac_w_1;

            const Uint32 *s_00_01, *s_02_03, *s_10_11, *s_12_13;

            __m128i x_00_01, x_10_11, x_02_03, x_12_13; /* Two pixels, as 4 Uint8 each, per row */
            __m128i v_frac_w0, k0, l0, d0, e0;
            __m128i v_frac_w1, k1, l1, d1, e1;

            int f, f2;
            index_w_0 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_0 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_1 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_1 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            /*
               x00............ x01   x02...........x03
               .               .     .             .
               j0   f0         j1    j2   f1       j3
               .               .     .             .
               .               .     .             .
               .               .     .             .
               x10............ x11   x12...........x13
            */
            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_0);
            s_02_03 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_1);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_0);
            s_12_13 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_1);

            f = frac_w_0;
            f2 = FRAC_ONE - frac_w_0;
            v_frac_w0 = _mm_set_epi16((short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2);

            f = frac_w_1;
            f2 = FRAC_ONE - frac_w_1;
            v_frac_w1 = _mm_set_epi16((short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2);

            x_00_01 = _mm_loadl_epi64((const __m128i *)s_00_01); // Load x00 and x01
            x_02_03 = _mm_loadl_epi64((const __m128i *)s_02_03);
            x_10_11 = _mm_loadl_epi64((const __m128i *)s_10_11);
            x_12_13 = _mm_loadl_epi64((const __m128i *)s_12_13);

            // Vertical interpolation
            k0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_00_01, zero), v_frac_h1);
            l0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_10_11, zero), v_frac_h0);
            k0 = _mm_add_epi16(k0, l0);
            k1 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_02_03, zero), v_frac_h1);
            l1 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_12_13, zero), v_frac_h0);
            k1 = _mm_add_epi16(k1, l1);

            // Horizontal interpolation
            l0 = _mm_unpacklo_epi64(/* unused */ l0, k0);
            k0 = _mm_madd_epi16(_mm_unpackhi_epi16(l0, k0), v_frac_w0);
            l1 = _mm_unpacklo_epi64(/* unused */ l1, k1);
            k1 = _mm_madd_epi16(_mm_unpackhi_epi16(l1, k1), v_frac_w1);

            // Store 1 pixel
            d0 = _mm_srli_epi32(k0, PRECISION * 2);
            e0 = _mm_packs_epi32(d0, d0);
            e0 = _mm_packus_epi16(e0, e0);
            *dst++ = _mm_cvtsi128_si32(e0);

            // Store 1 pixel
            d1 = _mm_srli_epi32(k1, PRECISION * 2);
            e1 = _mm_packs_epi32(d1, d1);
            e1 = _mm_packus_epi16(e1, e1);
            *dst++ = _mm_cvtsi128_si32(e1);
        }

        // Last point
        if (middle & 0x1) {
            const Uint32 *s_00_01;
            const Uint32 *s_10_11;
            int index_w = 4 * SRC_INDEX(fp_sum_w);
            int frac_w = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR_SSE(s_00_01, s_10_11, frac_w, v_frac_h0, v_frac_h1, dst, zero);
            dst += 1;
        }

        while (right_pad_w--) {
            int index_w = 4 * (src_w - 2);
            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR_SSE(s_00_01, s_10_11, FRAC_ONE, v_frac_h0, v_frac_h1, dst, zero);
            dst += 1;
        }
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}
#endif

#ifdef SDL_NEON_INTRINSICS

static SDL_INLINE int hasNEON(void)
{
    static int val = -1;
    if (val != -1) {
        return val;
    }
    val = SDL_HasNEON();
    return val;
}

static SDL_INLINE void INTERPOL_BILINEAR_NEON(const Uint32 *s0, const Uint32 *s1, int frac_w, uint8x8_t v_frac_h0, uint8x8_t v_frac_h1, Uint32 *dst)
{
    uint8x8_t x_00_01, x_10_11; /* Two pixels, as 4 Uint8 each, per row */
    uint16x8_t k0;
    uint32x4_t l0;
    uint16x8_t d0;
    uint8x8_t e0;

    x_00_01 = CAST_uint8x8_t vld1_u32(s0); // Load 2 pixels
    x_10_11 = CAST_uint8x8_t vld1_u32(s1);

    /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
    k0 = vmull_u8(x_00_01, v_frac_h1);     /* k0 := x0 * (1 - frac) */
    k0 = vmlal_u8(k0, x_10_11, v_frac_h0); /* k0 += x1 * frac */

    // k0 now contains 2 interpolated pixels { j0, j1 }
    l0 = vshll_n_u16(vget_low_u16(k0), PRECISION);
    l0 = vmlsl_n_u16(l0, vget_low_u16(k0), frac_w);
    l0 = vmlal_n_u16(l0, vget_high_u16(k0), frac_w);

    // Shift and narrow
    d0 = vcombine_u16(
        /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION),
        /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION));

    // Narrow again
    e0 = vmovn_u16(d0);

    // Store 1 pixel
    *dst = vget_lane_u32(CAST_uint32x2_t e0, 0);
}
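
/* A note on the horizontal step above, for illustration: with j0 and j1
   the vertically blended pixels held in k0, the vshll/vmlsl/vmlal sequence
   computes (j0 << PRECISION) - j0 * frac_w + j1 * frac_w, which is exactly
   j0 * (FRAC_ONE - frac_w) + j1 * frac_w, again scaled by FRAC_ONE twice,
   hence the narrowing shift by 2 * PRECISION. */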

static bool scale_mat_NEON(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    BILINEAR___START

    for (i = 0; i < dst_h; i++) {
        int nb_block4;
        uint8x8_t v_frac_h0, v_frac_h1;

        BILINEAR___HEIGHT

        nb_block4 = middle / 4;

        v_frac_h0 = vmov_n_u8(frac_h0);
        v_frac_h1 = vmov_n_u8(frac_h1);

        while (left_pad_w--) {
            INTERPOL_BILINEAR_NEON(src_h0, src_h1, FRAC_ZERO, v_frac_h0, v_frac_h1, dst);
            dst += 1;
        }

        while (nb_block4--) {
            int index_w_0, frac_w_0;
            int index_w_1, frac_w_1;
            int index_w_2, frac_w_2;
            int index_w_3, frac_w_3;

            const Uint32 *s_00_01, *s_02_03, *s_04_05, *s_06_07;
            const Uint32 *s_10_11, *s_12_13, *s_14_15, *s_16_17;

            uint8x8_t x_00_01, x_10_11, x_02_03, x_12_13; /* Two pixels, as 4 Uint8 each, per row */
            uint8x8_t x_04_05, x_14_15, x_06_07, x_16_17;

            uint16x8_t k0, k1, k2, k3;
            uint32x4_t l0, l1, l2, l3;
            uint16x8_t d0, d1;
            uint8x8_t e0, e1;
            uint32x4_t f0;

            index_w_0 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_0 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_1 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_1 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_2 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_2 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_3 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_3 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;

            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_0);
            s_02_03 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_1);
            s_04_05 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_2);
            s_06_07 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_3);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_0);
            s_12_13 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_1);
            s_14_15 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_2);
            s_16_17 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_3);

            // Vertical interpolation
            x_00_01 = CAST_uint8x8_t vld1_u32(s_00_01); // Load 2 pixels
            x_02_03 = CAST_uint8x8_t vld1_u32(s_02_03);
            x_04_05 = CAST_uint8x8_t vld1_u32(s_04_05);
            x_06_07 = CAST_uint8x8_t vld1_u32(s_06_07);
            x_10_11 = CAST_uint8x8_t vld1_u32(s_10_11);
            x_12_13 = CAST_uint8x8_t vld1_u32(s_12_13);
            x_14_15 = CAST_uint8x8_t vld1_u32(s_14_15);
            x_16_17 = CAST_uint8x8_t vld1_u32(s_16_17);

            /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
            k0 = vmull_u8(x_00_01, v_frac_h1);     /* k0 := x0 * (1 - frac) */
            k0 = vmlal_u8(k0, x_10_11, v_frac_h0); /* k0 += x1 * frac */

            k1 = vmull_u8(x_02_03, v_frac_h1);
            k1 = vmlal_u8(k1, x_12_13, v_frac_h0);

            k2 = vmull_u8(x_04_05, v_frac_h1);
            k2 = vmlal_u8(k2, x_14_15, v_frac_h0);

            k3 = vmull_u8(x_06_07, v_frac_h1);
            k3 = vmlal_u8(k3, x_16_17, v_frac_h0);

            // k0 now contains 2 interpolated pixels { j0, j1 }
            // k1 now contains 2 interpolated pixels { j2, j3 }
            // k2 now contains 2 interpolated pixels { j4, j5 }
            // k3 now contains 2 interpolated pixels { j6, j7 }

            l0 = vshll_n_u16(vget_low_u16(k0), PRECISION);
            l0 = vmlsl_n_u16(l0, vget_low_u16(k0), frac_w_0);
            l0 = vmlal_n_u16(l0, vget_high_u16(k0), frac_w_0);

            l1 = vshll_n_u16(vget_low_u16(k1), PRECISION);
            l1 = vmlsl_n_u16(l1, vget_low_u16(k1), frac_w_1);
            l1 = vmlal_n_u16(l1, vget_high_u16(k1), frac_w_1);

            l2 = vshll_n_u16(vget_low_u16(k2), PRECISION);
            l2 = vmlsl_n_u16(l2, vget_low_u16(k2), frac_w_2);
            l2 = vmlal_n_u16(l2, vget_high_u16(k2), frac_w_2);

            l3 = vshll_n_u16(vget_low_u16(k3), PRECISION);
            l3 = vmlsl_n_u16(l3, vget_low_u16(k3), frac_w_3);
            l3 = vmlal_n_u16(l3, vget_high_u16(k3), frac_w_3);

            // Shift and narrow
            d0 = vcombine_u16(
                /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION),
                /* uint16x4_t */ vshrn_n_u32(l1, 2 * PRECISION));
            // Narrow again
            e0 = vmovn_u16(d0);

            // Shift and narrow
            d1 = vcombine_u16(
                /* uint16x4_t */ vshrn_n_u32(l2, 2 * PRECISION),
                /* uint16x4_t */ vshrn_n_u32(l3, 2 * PRECISION));
            // Narrow again
            e1 = vmovn_u16(d1);

            f0 = vcombine_u32(CAST_uint32x2_t e0, CAST_uint32x2_t e1);
            // Store 4 pixels
            vst1q_u32(dst, f0);

            dst += 4;
        }

        if (middle & 0x2) {
            int index_w_0, frac_w_0;
            int index_w_1, frac_w_1;
            const Uint32 *s_00_01, *s_02_03;
            const Uint32 *s_10_11, *s_12_13;
            uint8x8_t x_00_01, x_10_11, x_02_03, x_12_13; /* Two pixels, as 4 Uint8 each, per row */
            uint16x8_t k0, k1;
            uint32x4_t l0, l1;
            uint16x8_t d0;
            uint8x8_t e0;

            index_w_0 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_0 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_1 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_1 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            /*
               x00............ x01   x02...........x03
               .               .     .             .
               j0   dest0      j1    j2   dest1    j3
               .               .     .             .
               .               .     .             .
               .               .     .             .
               x10............ x11   x12...........x13
            */
            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_0);
            s_02_03 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_1);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_0);
            s_12_13 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_1);

            // Vertical interpolation
            x_00_01 = CAST_uint8x8_t vld1_u32(s_00_01); // Load 2 pixels
            x_02_03 = CAST_uint8x8_t vld1_u32(s_02_03);
            x_10_11 = CAST_uint8x8_t vld1_u32(s_10_11);
            x_12_13 = CAST_uint8x8_t vld1_u32(s_12_13);

            /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
            k0 = vmull_u8(x_00_01, v_frac_h1);     /* k0 := x0 * (1 - frac) */
            k0 = vmlal_u8(k0, x_10_11, v_frac_h0); /* k0 += x1 * frac */

            k1 = vmull_u8(x_02_03, v_frac_h1);
            k1 = vmlal_u8(k1, x_12_13, v_frac_h0);

            // k0 now contains 2 interpolated pixels { j0, j1 }
            // k1 now contains 2 interpolated pixels { j2, j3 }

            l0 = vshll_n_u16(vget_low_u16(k0), PRECISION);
            l0 = vmlsl_n_u16(l0, vget_low_u16(k0), frac_w_0);
            l0 = vmlal_n_u16(l0, vget_high_u16(k0), frac_w_0);

            l1 = vshll_n_u16(vget_low_u16(k1), PRECISION);
            l1 = vmlsl_n_u16(l1, vget_low_u16(k1), frac_w_1);
            l1 = vmlal_n_u16(l1, vget_high_u16(k1), frac_w_1);

            // Shift and narrow
            d0 = vcombine_u16(
                /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION),
                /* uint16x4_t */ vshrn_n_u32(l1, 2 * PRECISION));

            // Narrow again
            e0 = vmovn_u16(d0);

            // Store 2 pixels
            vst1_u32(dst, CAST_uint32x2_t e0);
            dst += 2;
        }

        // Last point
        if (middle & 0x1) {
            int index_w = 4 * SRC_INDEX(fp_sum_w);
            int frac_w = FRAC(fp_sum_w);
            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR_NEON(s_00_01, s_10_11, frac_w, v_frac_h0, v_frac_h1, dst);
            dst += 1;
        }

        while (right_pad_w--) {
            int index_w = 4 * (src_w - 2);
            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR_NEON(s_00_01, s_10_11, FRAC_ONE, v_frac_h0, v_frac_h1, dst);
            dst += 1;
        }

        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}
#endif

static bool SDL_StretchSurfaceUncheckedLinear(SDL_Surface *s, const SDL_Rect *srcrect, SDL_Surface *d, const SDL_Rect *dstrect)
{
    bool result = false;
    int src_w = srcrect->w;
    int src_h = srcrect->h;
    int dst_w = dstrect->w;
    int dst_h = dstrect->h;
    int src_pitch = s->pitch;
    int dst_pitch = d->pitch;
    Uint32 *src = (Uint32 *)((Uint8 *)s->pixels + srcrect->x * 4 + srcrect->y * src_pitch);
    Uint32 *dst = (Uint32 *)((Uint8 *)d->pixels + dstrect->x * 4 + dstrect->y * dst_pitch);

#ifdef SDL_NEON_INTRINSICS
    if (!result && hasNEON()) {
        result = scale_mat_NEON(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    }
#endif

#ifdef SDL_SSE2_INTRINSICS
    if (!result && hasSSE2()) {
        result = scale_mat_SSE(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    }
#endif

    if (!result) {
        result = scale_mat(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    }

    return result;
}

#define SDL_SCALE_NEAREST__START \
    int i; \
    Uint64 posy, incy; \
    Uint64 posx, incx; \
    Uint64 srcy, srcx; \
    int dst_gap, n; \
    const Uint32 *src_h0; \
    incy = ((Uint64)src_h << 16) / dst_h; \
    incx = ((Uint64)src_w << 16) / dst_w; \
    dst_gap = dst_pitch - bpp * dst_w; \
    posy = incy / 2;

#define SDL_SCALE_NEAREST__HEIGHT \
    srcy = (posy >> 16); \
    src_h0 = (const Uint32 *)((const Uint8 *)src_ptr + srcy * src_pitch); \
    posy += incy; \
    posx = incx / 2; \
    n = dst_w;
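
/* A worked example of the 16.16 stepping above, for illustration: scaling
   a 3-pixel row to 6 pixels gives incx == 0x8000 (0.5) and a first center
   at posx == 0x4000 (0.25), so successive (posx >> 16) values select the
   source columns 0, 0, 1, 1, 2, 2. */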

static bool scale_mat_nearest_1(const Uint32 *src_ptr, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    Uint32 bpp = 1;
    SDL_SCALE_NEAREST__START
    for (i = 0; i < dst_h; i++) {
        SDL_SCALE_NEAREST__HEIGHT
        while (n--) {
            const Uint8 *src;
            srcx = bpp * (posx >> 16);
            posx += incx;
            src = (const Uint8 *)src_h0 + srcx;
            *(Uint8 *)dst = *src;
            dst = (Uint32 *)((Uint8 *)dst + bpp);
        }
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}

static bool scale_mat_nearest_2(const Uint32 *src_ptr, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    Uint32 bpp = 2;
    SDL_SCALE_NEAREST__START
    for (i = 0; i < dst_h; i++) {
        SDL_SCALE_NEAREST__HEIGHT
        while (n--) {
            const Uint16 *src;
            srcx = bpp * (posx >> 16);
            posx += incx;
            src = (const Uint16 *)((const Uint8 *)src_h0 + srcx);
            *(Uint16 *)dst = *src;
            dst = (Uint32 *)((Uint8 *)dst + bpp);
        }
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}

static bool scale_mat_nearest_3(const Uint32 *src_ptr, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    Uint32 bpp = 3;
    SDL_SCALE_NEAREST__START
    for (i = 0; i < dst_h; i++) {
        SDL_SCALE_NEAREST__HEIGHT
        while (n--) {
            const Uint8 *src;
            srcx = bpp * (posx >> 16);
            posx += incx;
            src = (const Uint8 *)src_h0 + srcx;
            ((Uint8 *)dst)[0] = src[0];
            ((Uint8 *)dst)[1] = src[1];
            ((Uint8 *)dst)[2] = src[2];
            dst = (Uint32 *)((Uint8 *)dst + bpp);
        }
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}

static bool scale_mat_nearest_4(const Uint32 *src_ptr, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    Uint32 bpp = 4;
    SDL_SCALE_NEAREST__START
    for (i = 0; i < dst_h; i++) {
        SDL_SCALE_NEAREST__HEIGHT
        while (n--) {
            const Uint32 *src;
            srcx = bpp * (posx >> 16);
            posx += incx;
            src = (const Uint32 *)((const Uint8 *)src_h0 + srcx);
            *dst = *src;
            dst = (Uint32 *)((Uint8 *)dst + bpp);
        }
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}

static bool SDL_StretchSurfaceUncheckedNearest(SDL_Surface *s, const SDL_Rect *srcrect, SDL_Surface *d, const SDL_Rect *dstrect)
{
    int src_w = srcrect->w;
    int src_h = srcrect->h;
    int dst_w = dstrect->w;
    int dst_h = dstrect->h;
    int src_pitch = s->pitch;
    int dst_pitch = d->pitch;
    int bpp = SDL_BYTESPERPIXEL(d->format);

    Uint32 *src = (Uint32 *)((Uint8 *)s->pixels + srcrect->x * bpp + srcrect->y * src_pitch);
    Uint32 *dst = (Uint32 *)((Uint8 *)d->pixels + dstrect->x * bpp + dstrect->y * dst_pitch);

    if (bpp == 4) {
        return scale_mat_nearest_4(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    } else if (bpp == 3) {
        return scale_mat_nearest_3(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    } else if (bpp == 2) {
        return scale_mat_nearest_2(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    } else {
        return scale_mat_nearest_1(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    }
}