/*
This code does smooth scaling of a pixmap.

This function returns a new pixmap representing the area starting at (0,0)
given by taking the source pixmap src, scaling it to width w and height h,
and then positioning it at (frac(x),frac(y)).

This is a cut-down version of draw_scale.c that only copes with filters
that return values strictly in the 0..1 range, and uses bytes for
intermediate results rather than ints.
*/

#include "mupdf/fitz.h"
#include "draw-imp.h"

#include <math.h>
#include <string.h>
#include <assert.h>
#include <limits.h>

/* Do we special case handling of single pixel high/wide images? The
 * 'purest' handling is given by not special casing them, but certain
 * files that use such images 'stack' them to give full images. Not
 * special casing them results in them appearing fainter, and gives
 * noticeable rounding errors.
 */
#define SINGLE_PIXEL_SPECIALS

/*
Consider a row of source samples, src, of width src_w, positioned at x,
scaled to width dst_w.

src[i] is centred at: x + (i + 0.5)*dst_w/src_w

Therefore the distance between the centre of the jth output pixel and
the centre of the ith source sample is:

dist[j,i] = j + 0.5 - (x + (i + 0.5)*dst_w/src_w)

(The formulas below measure dist in source samples rather than destination
pixels; the code accounts for this with the G factor in add_weight.)

When scaling up, therefore:

dst[j] = SUM(filter(dist[j,i]) * src[i])
	(for all ints i)

This can be simplified by noticing that filters are only non-zero within
a given filter width (henceforth called W). So:

dst[j] = SUM(filter(dist[j,i]) * src[i])
	(for ints i, s.t. (j*src_w/dst_w)-W < i < (j*src_w/dst_w)+W)

When scaling down, each filtered source sample is stretched to be wider
to avoid aliasing issues. This effectively reduces the distance between
centres.

dst[j] = SUM(filter(dist[j,i] * F) * F * src[i])
	(where F = dst_w/src_w)
	(for ints i, s.t. (j-W)/F < i < (j+W)/F)

*/
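
/*
As a worked example (illustrative, not part of the original description):
scaling a src_w = 2 row up to dst_w = 4 with the triangle filter (W = 1)
at x = 0, the source centres sit at 0.5 and 1.5 in source sample units,
and the centre of dst[1] maps to 0.75, so

	dst[1] = triangle(0.25)*src[0] + triangle(0.75)*src[1]
	       = 0.75*src[0] + 0.25*src[1]

The code below quantises such weights to 256ths (here 192 and 64).
*/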

typedef struct fz_scale_filter_s fz_scale_filter;

struct fz_scale_filter_s
{
	int width;
	float (*fn)(fz_scale_filter *, float);
};

/* Image scale filters */

static float
triangle(fz_scale_filter *filter, float f)
{
	if (f >= 1)
		return 0;
	return 1-f;
}

static float
box(fz_scale_filter *filter, float f)
{
	if (f >= 0.5f)
		return 0;
	return 1;
}

static float
simple(fz_scale_filter *filter, float x)
{
	if (x >= 1)
		return 0;
	return 1 + (2*x - 3)*x*x;
}

fz_scale_filter fz_scale_filter_box = { 1, box };
fz_scale_filter fz_scale_filter_triangle = { 1, triangle };
fz_scale_filter fz_scale_filter_simple = { 1, simple };
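
/* Note that simple(x) = 1 + (2x - 3)x^2 = (1 - x)^2 (1 + 2x), the classic
 * smoothstep falloff: it is 1 at x = 0, falls to 0 at x = 1, and has zero
 * gradient at both ends. For instance, simple(0.5) = 1 - 2*0.25 = 0.5. */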

/*
We build ourselves a set of tables to contain the precalculated weights
for a given set of scale settings.

The first dst_w entries in index are the index into index of the
sets of weights for each destination pixel.

Each of the sets of weights is a set of values consisting of:
	the minimum source pixel index used for this destination pixel
	the number of weights used for this destination pixel
	the weights themselves

So to calculate dst[i] we do the following:

	weights = &index[index[i]];
	min = *weights++;
	len = *weights++;
	dst[i] = 0;
	while (len-- > 0)
		dst[i] += src[min++] * *weights++

In addition, we guarantee that at the end of this process weights will now
point to the weight values for dst pixel i+1.

In the simplest version of this algorithm, we would scale the whole image
horizontally first into a temporary buffer, then scale that temporary
buffer again vertically to give us our result. Using such a simple
algorithm would mean that we could use the same style of weights for both
horizontal and vertical scaling.

Unfortunately, this would also require a large temporary buffer,
particularly in the case where we are scaling up.

We therefore modify the algorithm as follows: we scale scanlines from the
source image horizontally into a temporary buffer, until we have all the
contributors for a given output scanline. We then produce that output
scanline from the temporary buffer. In this way we restrict the height
of the temporary buffer to a small fraction of the final size.

Unfortunately, this means that the pseudo code for recombining a
scanline of fully scaled pixels is as follows:

	weights = &index[index[y]];
	min = *weights++;
	len = *weights++;
	for (x=0 to dst_w)
		min2 = min
		len2 = len
		weights2 = weights
		dst[x] = 0;
		while (len2-- > 0)
			dst[x] += temp[x][(min2++) % tmp_buf_height] * *weights2++

i.e. it requires a % operation for every source pixel - this is typically
expensive.

To avoid this, we alter the order in which vertical weights are stored,
so that they are ordered in the same order as the temporary buffer lines
would appear. This simplifies the algorithm to:

	weights = &index[index[y]];
	min = *weights++;
	len = *weights++;
	for (x=0 to dst_w)
		min2 = 0
		len2 = len
		weights2 = weights
		dst[x] = 0;
		while (len2-- > 0)
			dst[x] += temp[x][min2++] * *weights2++

This means that len may be larger than it needs to be (due to the
possible inclusion of a zero weight row or two), but in practice this
is only an increase of 1 or 2 at worst.

We implement this by generating the weights as normal (but ensuring we
leave enough space) and then reordering afterwards.

*/
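
/*
As a concrete (illustrative) sketch of the layout, for dst_w = 2 with two
weights per output pixel the index array would contain:

	index[0] = 2	(offset of the weight set for dst[0])
	index[1] = 6	(offset of the weight set for dst[1])
	index[2] = min0, index[3] = 2, index[4..5] = the weights for dst[0]
	index[6] = min1, index[7] = 2, index[8..9] = the weights for dst[1]
*/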

typedef struct fz_weights_s fz_weights;

/* This structure is accessed from ARM code - bear this in mind before
 * altering it! */
struct fz_weights_s
{
	int flip;	/* true if outputting reversed */
	int count;	/* number of output pixels we have records for in this table */
	int max_len;	/* Maximum number of weights for any one output pixel */
	int n;		/* number of components (src->n) */
	int new_line;	/* True if no weights for the current output pixel */
	int patch_l;	/* How many output pixels we skip over */
	int index[1];
};
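
/* In particular, the assembly later in this file assumes flip at byte
 * offset 0, count at offset 4, and index[] starting at offset 24. */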

struct fz_scale_cache_s
{
	int src_w;
	float x;
	float dst_w;
	fz_scale_filter *filter;
	int vertical;
	int dst_w_int;
	int patch_l;
	int patch_r;
	int n;
	int flip;
	fz_weights *weights;
};

static fz_weights *
new_weights(fz_context *ctx, fz_scale_filter *filter, int src_w, float dst_w, int patch_w, int n, int flip, int patch_l)
{
	int max_len;
	fz_weights *weights;

	if (src_w > dst_w)
	{
		/* Scaling down, so there will be a maximum of
		 * 2*filterwidth*src_w/dst_w src pixels
		 * contributing to each dst pixel. */
		max_len = (int)ceilf((2 * filter->width * src_w)/dst_w);
		if (max_len > src_w)
			max_len = src_w;
	}
	else
	{
		/* Scaling up, so there will be a maximum of
		 * 2*filterwidth src pixels contributing to each dst pixel.
		 */
		max_len = 2 * filter->width;
	}
	/* We need the size of the struct,
	 * plus patch_w*sizeof(int) for the index
	 * plus (2+max_len)*sizeof(int) for the weights
	 * plus room for an extra set of weights for reordering.
	 */
	weights = fz_malloc(ctx, sizeof(*weights)+(max_len+3)*(patch_w+1)*sizeof(int));
	if (!weights)
		return NULL;
	weights->count = -1;
	weights->max_len = max_len;
	weights->index[0] = patch_w;
	weights->n = n;
	weights->patch_l = patch_l;
	weights->flip = flip;
	return weights;
}
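
/* For example, halving an image (src_w = 200, dst_w = 100) with a filter of
 * width 1 gives max_len = ceil(2*1*200/100) = 4 source pixels per output
 * pixel; scaling up with the same filter gives max_len = 2. */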

/* j is destination pixel in the patch_l..patch_l+patch_w range */
static void
init_weights(fz_weights *weights, int j)
{
	int index;

	j -= weights->patch_l;
	assert(weights->count == j-1);
	weights->count++;
	weights->new_line = 1;
	if (j == 0)
		index = weights->index[0];
	else
	{
		index = weights->index[j-1];
		index += 2 + weights->index[index+1];
	}
	weights->index[j] = index; /* row pointer */
	weights->index[index] = 0; /* min */
	weights->index[index+1] = 0; /* len */
}

static void
add_weight(fz_weights *weights, int j, int i, fz_scale_filter *filter,
	float x, float F, float G, int src_w, float dst_w)
{
	float dist = j - x + 0.5f - ((i + 0.5f)*dst_w/src_w);
	float f;
	int min, len, index, weight;

	dist *= G;
	if (dist < 0)
		dist = -dist;
	f = filter->fn(filter, dist)*F;
	weight = (int)(256*f+0.5f);

	/* Ensure i is in range */
	if (i < 0 || i >= src_w)
		return;
	if (weight == 0)
	{
		/* We add a fudge factor here to allow for extreme downscales
		 * where all the weights round to 0. Ensure that at least one
		 * (arbitrarily the first one) is non zero. */
		if (weights->new_line && f > 0)
			weight = 1;
		else
			return;
	}

	/* Move j from patch_l..patch_l+patch_w range to 0..patch_w range */
	j -= weights->patch_l;
	if (weights->new_line)
	{
		/* New line */
		weights->new_line = 0;
		index = weights->index[j]; /* row pointer */
		weights->index[index] = i; /* min */
		weights->index[index+1] = 0; /* len */
	}
	index = weights->index[j];
	min = weights->index[index++];
	len = weights->index[index++];
	while (i < min)
	{
		/* This only happens in rare cases, but we need to insert
		 * one earlier. In exceedingly rare cases we may need to
		 * insert more than one earlier. */
		int k;

		for (k = len; k > 0; k--)
		{
			weights->index[index+k] = weights->index[index+k-1];
		}
		weights->index[index] = 0;
		min--;
		len++;
		weights->index[index-2] = min;
		weights->index[index-1] = len;
	}
	if (i-min >= len)
	{
		/* The usual case */
		while (i-min >= ++len)
		{
			weights->index[index+len-1] = 0;
		}
		assert(len-1 == i-min);
		weights->index[index+i-min] = weight;
		weights->index[index-1] = len;
		assert(len <= weights->max_len);
	}
	else
	{
		/* Infrequent case */
		weights->index[index+i-min] += weight;
	}
}
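
/* Weights are stored as 8.8 fixed point values (256 represents 1.0). The
 * accumulators in the scaling loops below are seeded with 128 (i.e. 0.5)
 * so that the final >>8 rounds to nearest rather than truncating. */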

static void
reorder_weights(fz_weights *weights, int j, int src_w)
{
	int idx = weights->index[j - weights->patch_l];
	int min = weights->index[idx++];
	int len = weights->index[idx++];
	int max = weights->max_len;
	int tmp = idx+max;
	int i, off;

	/* Copy into the temporary area */
	memcpy(&weights->index[tmp], &weights->index[idx], sizeof(int)*len);

	/* Pad out if required */
	assert(len <= max);
	assert(min+len <= src_w);
	off = 0;
	if (len < max)
	{
		memset(&weights->index[tmp+len], 0, sizeof(int)*(max-len));
		len = max;
		if (min + len > src_w)
		{
			off = min + len - src_w;
			min = src_w - len;
			weights->index[idx-2] = min;
		}
		weights->index[idx-1] = len;
	}

	/* Copy back into the proper places */
	for (i = 0; i < len; i++)
	{
		weights->index[idx+((min+i+off) % max)] = weights->index[tmp+i];
	}
}
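
/* For example, with max_len = 4, a row with min = 5 and len = 2 is padded
 * out to len = 4 and its weights land in slots (5+i) % 4 = 1, 2, 3, 0 -
 * exactly the lines that source rows 5..8 occupy in a 4 line rolling
 * temporary buffer. */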

/* Due to rounding and edge effects, the sums for the weights sometimes don't
 * add up to 256. This causes visible rendering effects. Therefore, we take
 * pains to ensure that they 1) never exceed 256, and 2) add up to exactly
 * 256 for all pixels that are completely covered. See bug #691629. */
static void
check_weights(fz_weights *weights, int j, int w, float x, float wf)
{
	int idx, len;
	int sum = 0;
	int max = -256;
	int maxidx = 0;
	int i;

	idx = weights->index[j - weights->patch_l];
	idx++; /* min */
	len = weights->index[idx++];

	for (i = 0; i < len; i++)
	{
		int v = weights->index[idx++];
		sum += v;
		if (v > max)
		{
			max = v;
			maxidx = idx;
		}
	}
	/* If we aren't the first or last pixel, OR if the sum is too big
	 * then adjust it. */
	if (((j != 0) && (j != w-1)) || (sum > 256))
		weights->index[maxidx-1] += 256-sum;
	/* Otherwise, if we are the first pixel, and it's fully covered, then
	 * adjust it. */
	else if ((j == 0) && (x < 0.0001f) && (sum != 256))
		weights->index[maxidx-1] += 256-sum;
	/* Finally, if we are the last pixel, and it's fully covered, then
	 * adjust it. */
	else if ((j == w-1) && (w - wf < 0.0001f) && (sum != 256))
		weights->index[maxidx-1] += 256-sum;
}
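
/* For example, if quantisation leaves an interior pixel with weights
 * 96 + 96 + 32 = 224, the largest weight is bumped by 32 so that the set
 * sums to exactly 256 and fully covered pixels keep their brightness. */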

static fz_weights *
make_weights(fz_context *ctx, int src_w, float x, float dst_w, fz_scale_filter *filter, int vertical, int dst_w_int, int patch_l, int patch_r, int n, int flip, fz_scale_cache *cache)
{
	fz_weights *weights;
	float F, G;
	float window;
	int j;

	if (cache)
	{
		if (cache->src_w == src_w && cache->x == x && cache->dst_w == dst_w &&
			cache->filter == filter && cache->vertical == vertical &&
			cache->dst_w_int == dst_w_int &&
			cache->patch_l == patch_l && cache->patch_r == patch_r &&
			cache->n == n && cache->flip == flip)
		{
			return cache->weights;
		}
		cache->src_w = src_w;
		cache->x = x;
		cache->dst_w = dst_w;
		cache->filter = filter;
		cache->vertical = vertical;
		cache->dst_w_int = dst_w_int;
		cache->patch_l = patch_l;
		cache->patch_r = patch_r;
		cache->n = n;
		cache->flip = flip;
		fz_free(ctx, cache->weights);
		cache->weights = NULL;
	}

	if (dst_w < src_w)
	{
		/* Scaling down */
		F = dst_w / src_w;
		G = 1;
	}
	else
	{
		/* Scaling up */
		F = 1;
		G = src_w / dst_w;
	}
	window = filter->width / F;
	weights = new_weights(ctx, filter, src_w, dst_w, patch_r-patch_l, n, flip, patch_l);
	if (!weights)
		return NULL;
	for (j = patch_l; j < patch_r; j++)
	{
		/* find the position of the centre of dst[j] in src space */
		float centre = (j - x + 0.5f)*src_w/dst_w - 0.5f;
		int l, r;
		l = ceilf(centre - window);
		r = floorf(centre + window);
		init_weights(weights, j);
		for (; l <= r; l++)
		{
			add_weight(weights, j, l, filter, x, F, G, src_w, dst_w);
		}
		check_weights(weights, j, dst_w_int, x, dst_w);
		if (vertical)
		{
			reorder_weights(weights, j, src_w);
		}
	}
	weights->count++; /* weights->count = patch_r - patch_l now */
	if (cache)
	{
		cache->weights = weights;
	}
	return weights;
}

static void
scale_row_to_temp(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights)
{
	const int *contrib = &weights->index[weights->index[0]];
	int len, i, j, n;
	const unsigned char *min;
	int tmp[FZ_MAX_COLORS];
	int *t = tmp;

	n = weights->n;
	for (j = 0; j < n; j++)
		tmp[j] = 128;
	if (weights->flip)
	{
		dst += (weights->count-1)*n;
		for (i=weights->count; i > 0; i--)
		{
			min = &src[n * *contrib++];
			len = *contrib++;
			while (len-- > 0)
			{
				for (j = n; j > 0; j--)
					*t++ += *min++ * *contrib;
				t -= n;
				contrib++;
			}
			for (j = n; j > 0; j--)
			{
				*dst++ = (unsigned char)(*t>>8);
				*t++ = 128;
			}
			t -= n;
			dst -= n*2;
		}
	}
	else
	{
		for (i=weights->count; i > 0; i--)
		{
			min = &src[n * *contrib++];
			len = *contrib++;
			while (len-- > 0)
			{
				for (j = n; j > 0; j--)
					*t++ += *min++ * *contrib;
				t -= n;
				contrib++;
			}
			for (j = n; j > 0; j--)
			{
				*dst++ = (unsigned char)(*t>>8);
				*t++ = 128;
			}
			t -= n;
		}
	}
}
#ifdef ARCH_ARM

static void
scale_row_to_temp1(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights)
__attribute__((naked));

static void
scale_row_to_temp2(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights)
__attribute__((naked));

static void
scale_row_to_temp3(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights)
__attribute__((naked));

static void
scale_row_to_temp4(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights)
__attribute__((naked));

static void
scale_row_from_temp(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights, int width, int n, int row)
__attribute__((naked));

static void
scale_row_from_temp_alpha(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights, int width, int n, int row)
__attribute__((naked));

static void
scale_row_to_temp1(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights)
{
	asm volatile(
	ENTER_ARM
	".syntax unified\n"
	"stmfd r13!,{r4-r7,r9,r14} \n"
	"@ r0 = dst \n"
	"@ r1 = src \n"
	"@ r2 = weights \n"
	"ldr r12,[r2],#4 @ r12= flip \n"
	"ldr r3, [r2],#20 @ r3 = count r2 = &index\n"
	"ldr r4, [r2] @ r4 = index[0] \n"
	"cmp r12,#0 @ if (flip) \n"
	"beq 5f @ { \n"
	"add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n"
	"add r0, r0, r3 @ dst += count \n"
	"1: \n"
	"ldr r4, [r2], #4 @ r4 = *contrib++ \n"
	"ldr r9, [r2], #4 @ r9 = len = *contrib++ \n"
	"mov r5, #128 @ r5 = a = 128 \n"
	"add r4, r1, r4 @ r4 = min = &src[r4] \n"
	"subs r9, r9, #1 @ len-- \n"
	"blt 3f @ while (len >= 0) \n"
	"2: @ { \n"
	"ldrgt r6, [r2], #4 @ r6 = *contrib++ \n"
	"ldrbgt r7, [r4], #1 @ r7 = *min++ \n"
	"ldr r12,[r2], #4 @ r12 = *contrib++ \n"
	"ldrb r14,[r4], #1 @ r14 = *min++ \n"
	"mlagt r5, r6, r7, r5 @ a += r6 * r7 \n"
	"subs r9, r9, #2 @ r9 = len -= 2 \n"
	"mla r5, r12,r14,r5 @ a += r14 * r12 \n"
	"bge 2b @ } \n"
	"3: \n"
	"mov r5, r5, lsr #8 @ a >>= 8 \n"
	"strb r5,[r0, #-1]! @ *--dst=a \n"
	"subs r3, r3, #1 @ i-- \n"
	"bgt 1b @ \n"
	"ldmfd r13!,{r4-r7,r9,PC} @ pop, return to thumb \n"
	"5:"
	"add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n"
	"6:"
	"ldr r4, [r2], #4 @ r4 = *contrib++ \n"
	"ldr r9, [r2], #4 @ r9 = len = *contrib++ \n"
	"mov r5, #128 @ r5 = a = 128 \n"
	"add r4, r1, r4 @ r4 = min = &src[r4] \n"
	"subs r9, r9, #1 @ len-- \n"
	"blt 9f @ while (len >= 0) \n"
	"7: @ { \n"
	"ldrgt r6, [r2], #4 @ r6 = *contrib++ \n"
	"ldrbgt r7, [r4], #1 @ r7 = *min++ \n"
	"ldr r12,[r2], #4 @ r12 = *contrib++ \n"
	"ldrb r14,[r4], #1 @ r14 = *min++ \n"
	"mlagt r5, r6,r7,r5 @ a += r6 * r7 \n"
	"subs r9, r9, #2 @ r9 = len -= 2 \n"
	"mla r5, r12,r14,r5 @ a += r14 * r12 \n"
	"bge 7b @ } \n"
	"9: \n"
	"mov r5, r5, LSR #8 @ a >>= 8 \n"
	"strb r5, [r0], #1 @ *dst++=a \n"
	"subs r3, r3, #1 @ i-- \n"
	"bgt 6b @ \n"
	"ldmfd r13!,{r4-r7,r9,PC} @ pop, return to thumb \n"
	ENTER_THUMB
	);
}

static void
scale_row_to_temp2(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights)
{
	asm volatile(
	ENTER_ARM
	"stmfd r13!,{r4-r6,r9-r11,r14} \n"
	"@ r0 = dst \n"
	"@ r1 = src \n"
	"@ r2 = weights \n"
	"ldr r12,[r2],#4 @ r12= flip \n"
	"ldr r3, [r2],#20 @ r3 = count r2 = &index\n"
	"ldr r4, [r2] @ r4 = index[0] \n"
	"cmp r12,#0 @ if (flip) \n"
	"beq 4f @ { \n"
	"add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n"
	"add r0, r0, r3, LSL #1 @ dst += 2*count \n"
	"1: \n"
	"ldr r4, [r2], #4 @ r4 = *contrib++ \n"
	"ldr r9, [r2], #4 @ r9 = len = *contrib++ \n"
	"mov r5, #128 @ r5 = g = 128 \n"
	"mov r6, #128 @ r6 = a = 128 \n"
	"add r4, r1, r4, LSL #1 @ r4 = min = &src[2*r4] \n"
	"cmp r9, #0 @ while (len-- > 0) \n"
	"beq 3f @ { \n"
	"2: \n"
	"ldr r14,[r2], #4 @ r14 = *contrib++ \n"
	"ldrb r11,[r4], #1 @ r11 = *min++ \n"
	"ldrb r12,[r4], #1 @ r12 = *min++ \n"
	"subs r9, r9, #1 @ r9 = len-- \n"
	"mla r5, r14,r11,r5 @ g += r11 * r14 \n"
	"mla r6, r14,r12,r6 @ a += r12 * r14 \n"
	"bgt 2b @ } \n"
	"3: \n"
	"mov r5, r5, lsr #8 @ g >>= 8 \n"
	"mov r6, r6, lsr #8 @ a >>= 8 \n"
	"strb r5, [r0, #-2]! @ dst -= 2; dst[0]=g \n"
	"strb r6, [r0, #1] @ dst[1]=a \n"
	"subs r3, r3, #1 @ i-- \n"
	"bgt 1b @ \n"
	"ldmfd r13!,{r4-r6,r9-r11,PC} @ pop, return to thumb \n"
	"4:"
	"add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n"
	"5:"
	"ldr r4, [r2], #4 @ r4 = *contrib++ \n"
	"ldr r9, [r2], #4 @ r9 = len = *contrib++ \n"
	"mov r5, #128 @ r5 = g = 128 \n"
	"mov r6, #128 @ r6 = a = 128 \n"
	"add r4, r1, r4, LSL #1 @ r4 = min = &src[2*r4] \n"
	"cmp r9, #0 @ while (len-- > 0) \n"
	"beq 7f @ { \n"
	"6: \n"
	"ldr r14,[r2], #4 @ r14 = *contrib++ \n"
	"ldrb r11,[r4], #1 @ r11 = *min++ \n"
	"ldrb r12,[r4], #1 @ r12 = *min++ \n"
	"subs r9, r9, #1 @ r9 = len-- \n"
	"mla r5, r14,r11,r5 @ g += r11 * r14 \n"
	"mla r6, r14,r12,r6 @ a += r12 * r14 \n"
	"bgt 6b @ } \n"
	"7: \n"
	"mov r5, r5, lsr #8 @ g >>= 8 \n"
	"mov r6, r6, lsr #8 @ a >>= 8 \n"
	"strb r5, [r0], #1 @ *dst++=g \n"
	"strb r6, [r0], #1 @ *dst++=a \n"
	"subs r3, r3, #1 @ i-- \n"
	"bgt 5b @ \n"
	"ldmfd r13!,{r4-r6,r9-r11,PC} @ pop, return to thumb \n"
	ENTER_THUMB
	);
}

static void
scale_row_to_temp3(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights)
{
	asm volatile(
	ENTER_ARM
	"stmfd r13!,{r4-r11,r14} \n"
	"@ r0 = dst \n"
	"@ r1 = src \n"
	"@ r2 = weights \n"
	"ldr r12,[r2],#4 @ r12= flip \n"
	"ldr r3, [r2],#20 @ r3 = count r2 = &index\n"
	"ldr r4, [r2] @ r4 = index[0] \n"
	"cmp r12,#0 @ if (flip) \n"
	"beq 4f @ { \n"
	"add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n"
	"add r0, r0, r3, LSL #1 @ \n"
	"add r0, r0, r3 @ dst += 3*count \n"
	"1: \n"
	"ldr r4, [r2], #4 @ r4 = *contrib++ \n"
	"ldr r9, [r2], #4 @ r9 = len = *contrib++ \n"
	"mov r5, #128 @ r5 = r = 128 \n"
	"mov r6, #128 @ r6 = g = 128 \n"
	"add r7, r1, r4, LSL #1 @ \n"
	"add r4, r7, r4 @ r4 = min = &src[3*r4] \n"
	"mov r7, #128 @ r7 = b = 128 \n"
	"cmp r9, #0 @ while (len-- > 0) \n"
	"beq 3f @ { \n"
	"2: \n"
	"ldr r14,[r2], #4 @ r14 = *contrib++ \n"
	"ldrb r8, [r4], #1 @ r8 = *min++ \n"
	"ldrb r11,[r4], #1 @ r11 = *min++ \n"
	"ldrb r12,[r4], #1 @ r12 = *min++ \n"
	"subs r9, r9, #1 @ r9 = len-- \n"
	"mla r5, r14,r8, r5 @ r += r8 * r14 \n"
	"mla r6, r14,r11,r6 @ g += r11 * r14 \n"
	"mla r7, r14,r12,r7 @ b += r12 * r14 \n"
	"bgt 2b @ } \n"
	"3: \n"
	"mov r5, r5, lsr #8 @ r >>= 8 \n"
	"mov r6, r6, lsr #8 @ g >>= 8 \n"
	"mov r7, r7, lsr #8 @ b >>= 8 \n"
	"strb r5, [r0, #-3]! @ dst -= 3; dst[0]=r \n"
	"strb r6, [r0, #1] @ dst[1]=g \n"
	"strb r7, [r0, #2] @ dst[2]=b \n"
	"subs r3, r3, #1 @ i-- \n"
	"bgt 1b @ \n"
	"ldmfd r13!,{r4-r11,PC} @ pop, return to thumb \n"
	"4:"
	"add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n"
	"5:"
	"ldr r4, [r2], #4 @ r4 = *contrib++ \n"
	"ldr r9, [r2], #4 @ r9 = len = *contrib++ \n"
	"mov r5, #128 @ r5 = r = 128 \n"
	"mov r6, #128 @ r6 = g = 128 \n"
	"add r7, r1, r4, LSL #1 @ r7 = &src[2*r4] \n"
	"add r4, r7, r4 @ r4 = min = &src[3*r4] \n"
	"mov r7, #128 @ r7 = b = 128 \n"
	"cmp r9, #0 @ while (len-- > 0) \n"
	"beq 7f @ { \n"
	"6: \n"
	"ldr r14,[r2], #4 @ r14 = *contrib++ \n"
	"ldrb r8, [r4], #1 @ r8 = *min++ \n"
	"ldrb r11,[r4], #1 @ r11 = *min++ \n"
	"ldrb r12,[r4], #1 @ r12 = *min++ \n"
	"subs r9, r9, #1 @ r9 = len-- \n"
	"mla r5, r14,r8, r5 @ r += r8 * r14 \n"
	"mla r6, r14,r11,r6 @ g += r11 * r14 \n"
	"mla r7, r14,r12,r7 @ b += r12 * r14 \n"
	"bgt 6b @ } \n"
	"7: \n"
	"mov r5, r5, lsr #8 @ r >>= 8 \n"
	"mov r6, r6, lsr #8 @ g >>= 8 \n"
	"mov r7, r7, lsr #8 @ b >>= 8 \n"
	"strb r5, [r0], #1 @ *dst++=r \n"
	"strb r6, [r0], #1 @ *dst++=g \n"
	"strb r7, [r0], #1 @ *dst++=b \n"
	"subs r3, r3, #1 @ i-- \n"
	"bgt 5b @ \n"
	"ldmfd r13!,{r4-r11,PC} @ pop, return to thumb \n"
	ENTER_THUMB
	);
}

static void
scale_row_to_temp4(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights)
{
	asm volatile(
	ENTER_ARM
	"stmfd r13!,{r4-r11,r14} \n"
	"@ r0 = dst \n"
	"@ r1 = src \n"
	"@ r2 = weights \n"
	"ldr r12,[r2],#4 @ r12= flip \n"
	"ldr r3, [r2],#20 @ r3 = count r2 = &index\n"
	"ldr r4, [r2] @ r4 = index[0] \n"
	"ldr r5,=0x00800080 @ r5 = rounding \n"
	"ldr r6,=0x00FF00FF @ r6 = 0x00FF00FF \n"
	"cmp r12,#0 @ if (flip) \n"
	"beq 4f @ { \n"
	"add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n"
	"add r0, r0, r3, LSL #2 @ dst += 4*count \n"
	"1: \n"
	"ldr r4, [r2], #4 @ r4 = *contrib++ \n"
	"ldr r9, [r2], #4 @ r9 = len = *contrib++ \n"
	"mov r7, r5 @ r7 = b = rounding \n"
	"mov r8, r5 @ r8 = a = rounding \n"
	"add r4, r1, r4, LSL #2 @ r4 = min = &src[4*r4] \n"
	"cmp r9, #0 @ while (len-- > 0) \n"
	"beq 3f @ { \n"
	"2: \n"
	"ldr r11,[r4], #4 @ r11 = *min++ \n"
	"ldr r10,[r2], #4 @ r10 = *contrib++ \n"
	"subs r9, r9, #1 @ r9 = len-- \n"
	"and r12,r6, r11 @ r12 = __22__00 \n"
	"and r11,r6, r11,LSR #8 @ r11 = __33__11 \n"
	"mla r7, r10,r12,r7 @ b += r12 * r10 \n"
	"mla r8, r10,r11,r8 @ a += r11 * r10 \n"
	"bgt 2b @ } \n"
	"3: \n"
	"and r7, r6, r7, lsr #8 @ r7 = __22__00 \n"
	"bic r8, r8, r6 @ r8 = 33__11__ \n"
	"orr r7, r7, r8 @ r7 = 33221100 \n"
	"str r7, [r0, #-4]! @ *--dst = 33221100 \n"
	"subs r3, r3, #1 @ i-- \n"
	"bgt 1b @ \n"
	"ldmfd r13!,{r4-r11,PC} @ pop, return to thumb \n"
	"4: \n"
	"add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n"
	"5: \n"
	"ldr r4, [r2], #4 @ r4 = *contrib++ \n"
	"ldr r9, [r2], #4 @ r9 = len = *contrib++ \n"
	"mov r7, r5 @ r7 = b = rounding \n"
	"mov r8, r5 @ r8 = a = rounding \n"
	"add r4, r1, r4, LSL #2 @ r4 = min = &src[4*r4] \n"
	"cmp r9, #0 @ while (len-- > 0) \n"
	"beq 7f @ { \n"
	"6: \n"
	"ldr r11,[r4], #4 @ r11 = *min++ \n"
	"ldr r10,[r2], #4 @ r10 = *contrib++ \n"
	"subs r9, r9, #1 @ r9 = len-- \n"
	"and r12,r6, r11 @ r12 = __22__00 \n"
	"and r11,r6, r11,LSR #8 @ r11 = __33__11 \n"
	"mla r7, r10,r12,r7 @ b += r12 * r10 \n"
	"mla r8, r10,r11,r8 @ a += r11 * r10 \n"
	"bgt 6b @ } \n"
	"7: \n"
	"and r7, r6, r7, lsr #8 @ r7 = __22__00 \n"
	"bic r8, r8, r6 @ r8 = 33__11__ \n"
	"orr r7, r7, r8 @ r7 = 33221100 \n"
	"str r7, [r0], #4 @ *dst++ = 33221100 \n"
	"subs r3, r3, #1 @ i-- \n"
	"bgt 5b @ \n"
	"ldmfd r13!,{r4-r11,PC} @ pop, return to thumb \n"
	ENTER_THUMB
	);
}

static void
scale_row_from_temp(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights, int width, int n, int row)
{
	asm volatile(
	ENTER_ARM
	"stmfd r13!,{r4-r11,r14} \n"
	"@ r0 = dst \n"
	"@ r1 = src \n"
	"@ r2 = &weights->index[0] \n"
	"@ r3 = width \n"
	"@ r12= row \n"
	"ldr r14,[r13,#4*9] @ r14= n \n"
	"ldr r12,[r13,#4*10] @ r12= row \n"
	"add r2, r2, #24 @ r2 = weights->index \n"
	"mul r3, r14, r3 @ r3 = width *= n \n"
	"ldr r4, [r2, r12, LSL #2] @ r4 = index[row] \n"
	"add r2, r2, #4 @ r2 = &index[1] \n"
	"subs r6, r3, #4 @ r6 = x = width-4 \n"
	"ldr r14,[r2, r4, LSL #2]! @ r2 = contrib = &index[index[row]+1]\n"
	" @ r14= len = *contrib \n"
	"blt 4f @ while (x >= 0) { \n"
#ifndef ARCH_UNALIGNED_OK
	"tst r3, #3 @ if ((r3 & 3) \n"
	"tsteq r1, #3 @ || (r1 & 3)) \n"
	"bne 4f @ can't do fast code \n"
#endif
	"ldr r9, =0x00FF00FF @ r9 = 0x00FF00FF \n"
	"1: \n"
	"ldr r7, =0x00800080 @ r7 = val1 = round \n"
	"stmfd r13!,{r1,r2,r7} @ stash r1,r2,r7 \n"
	" @ r1 = min = src \n"
	" @ r2 = contrib2-4 \n"
	"movs r8, r14 @ r8 = len2 = len \n"
	"mov r5, r7 @ r5 = val0 = round \n"
	"ble 3f @ while (len2-- > 0) { \n"
	"2: \n"
	"ldr r12,[r1], r3 @ r12 = *min; r1 = min += width\n"
	"ldr r10,[r2, #4]! @ r10 = *contrib2++ \n"
	"subs r8, r8, #1 @ len2-- \n"
	"and r11,r9, r12 @ r11= __22__00 \n"
	"and r12,r9, r12,LSR #8 @ r12= __33__11 \n"
	"mla r5, r10,r11,r5 @ r5 = val0 += r11 * r10\n"
	"mla r7, r10,r12,r7 @ r7 = val1 += r12 * r10\n"
	"bgt 2b @ } \n"
	"and r5, r9, r5, LSR #8 @ r5 = __22__00 \n"
	"and r7, r7, r9, LSL #8 @ r7 = 33__11__ \n"
	"orr r5, r5, r7 @ r5 = 33221100 \n"
	"3: \n"
	"ldmfd r13!,{r1,r2,r7} @ restore r1,r2,r7 \n"
	"subs r6, r6, #4 @ x -= 4 \n"
	"add r1, r1, #4 @ src++ \n"
	"str r5, [r0], #4 @ *dst++ = val \n"
	"bge 1b @ \n"
	"4: @ } (Less than 4 to go) \n"
	"adds r6, r6, #4 @ r6 = x += 4 \n"
	"beq 8f @ if (x == 0) done \n"
	"5: \n"
	"mov r5, r1 @ r5 = min = src \n"
	"mov r7, #128 @ r7 = val = 128 \n"
	"movs r8, r14 @ r8 = len2 = len \n"
	"add r9, r2, #4 @ r9 = contrib2 \n"
	"ble 7f @ while (len2-- > 0) { \n"
	"6: \n"
	"ldr r10,[r9], #4 @ r10 = *contrib2++ \n"
	"ldrb r12,[r5], r3 @ r12 = *min; r5 = min += width\n"
	"subs r8, r8, #1 @ len2-- \n"
	"@ stall r12 \n"
	"mla r7, r10,r12,r7 @ val += r12 * r10 \n"
	"bgt 6b @ } \n"
	"7: \n"
	"mov r7, r7, asr #8 @ r7 = val >>= 8 \n"
	"subs r6, r6, #1 @ x-- \n"
	"add r1, r1, #1 @ src++ \n"
	"strb r7, [r0], #1 @ *dst++ = val \n"
	"bgt 5b @ \n"
	"8: \n"
	"ldmfd r13!,{r4-r11,PC} @ pop, return to thumb \n"
	".ltorg \n"
	ENTER_THUMB
	);
}

static void
scale_row_from_temp_alpha(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights, int width, int n, int row)
{
	asm volatile(
	ENTER_ARM
	"stmfd r13!,{r4-r11,r14} \n"
	"mov r11,#255 @ r11= 255 \n"
	"ldr r12,[r13,#4*10] @ r12= row \n"
	"@ r0 = dst \n"
	"@ r1 = src \n"
	"@ r2 = &weights->index[0] \n"
	"@ r3 = width \n"
	"@ r11= 255 \n"
	"@ r12= row \n"
	"add r2, r2, #24 @ r2 = weights->index \n"
	"ldr r4, [r2, r12, LSL #2] @ r4 = index[row] \n"
	"add r2, r2, #4 @ r2 = &index[1] \n"
	"mov r6, r3 @ r6 = x = width \n"
	"ldr r14,[r2, r4, LSL #2]! @ r2 = contrib = &index[index[row]+1]\n"
	" @ r14= len = *contrib \n"
	"5: \n"
	"ldr r4,[r13,#4*9] @ r4 = nn = n \n"
	"1: \n"
	"mov r5, r1 @ r5 = min = src \n"
	"mov r7, #128 @ r7 = val = 128 \n"
	"movs r8, r14 @ r8 = len2 = len \n"
	"add r9, r2, #4 @ r9 = contrib2 \n"
	"ble 7f @ while (len2-- > 0) { \n"
	"6: \n"
	"ldr r10,[r9], #4 @ r10 = *contrib2++ \n"
	"ldrb r12,[r5], r3 @ r12 = *min; r5 = min += width\n"
	"subs r8, r8, #1 @ len2-- \n"
	"@ stall r12 \n"
	"mla r7, r10,r12,r7 @ val += r12 * r10 \n"
	"bgt 6b @ } \n"
	"7: \n"
	"mov r7, r7, asr #8 @ r7 = val >>= 8 \n"
	"subs r4, r4, #1 @ r4 = nn-- \n"
	"add r1, r1, #1 @ src++ \n"
	"strb r7, [r0], #1 @ *dst++ = val \n"
	"bgt 1b @ \n"
	"subs r6, r6, #1 @ x-- \n"
	"strb r11,[r0], #1 @ *dst++ = 255 \n"
	"bgt 5b @ \n"
	"ldmfd r13!,{r4-r11,PC} @ pop, return to thumb \n"
	".ltorg \n"
	ENTER_THUMB
	);
}
#else

static void
scale_row_to_temp1(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights)
{
	const int *contrib = &weights->index[weights->index[0]];
	int len, i;
	const unsigned char *min;

	assert(weights->n == 1);
	if (weights->flip)
	{
		dst += weights->count;
		for (i=weights->count; i > 0; i--)
		{
			int val = 128;
			min = &src[*contrib++];
			len = *contrib++;
			while (len-- > 0)
			{
				val += *min++ * *contrib++;
			}
			*--dst = (unsigned char)(val>>8);
		}
	}
	else
	{
		for (i=weights->count; i > 0; i--)
		{
			int val = 128;
			min = &src[*contrib++];
			len = *contrib++;
			while (len-- > 0)
			{
				val += *min++ * *contrib++;
			}
			*dst++ = (unsigned char)(val>>8);
		}
	}
}

static void
scale_row_to_temp2(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights)
{
	const int *contrib = &weights->index[weights->index[0]];
	int len, i;
	const unsigned char *min;

	assert(weights->n == 2);
	if (weights->flip)
	{
		dst += 2*weights->count;
		for (i=weights->count; i > 0; i--)
		{
			int c1 = 128;
			int c2 = 128;
			min = &src[2 * *contrib++];
			len = *contrib++;
			while (len-- > 0)
			{
				c1 += *min++ * *contrib;
				c2 += *min++ * *contrib++;
			}
			*--dst = (unsigned char)(c2>>8);
			*--dst = (unsigned char)(c1>>8);
		}
	}
	else
	{
		for (i=weights->count; i > 0; i--)
		{
			int c1 = 128;
			int c2 = 128;
			min = &src[2 * *contrib++];
			len = *contrib++;
			while (len-- > 0)
			{
				c1 += *min++ * *contrib;
				c2 += *min++ * *contrib++;
			}
			*dst++ = (unsigned char)(c1>>8);
			*dst++ = (unsigned char)(c2>>8);
		}
	}
}

static void
scale_row_to_temp3(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights)
{
	const int *contrib = &weights->index[weights->index[0]];
	int len, i;
	const unsigned char *min;

	assert(weights->n == 3);
	if (weights->flip)
	{
		dst += 3*weights->count;
		for (i=weights->count; i > 0; i--)
		{
			int c1 = 128;
			int c2 = 128;
			int c3 = 128;
			min = &src[3 * *contrib++];
			len = *contrib++;
			while (len-- > 0)
			{
				int c = *contrib++;
				c1 += *min++ * c;
				c2 += *min++ * c;
				c3 += *min++ * c;
			}
			*--dst = (unsigned char)(c3>>8);
			*--dst = (unsigned char)(c2>>8);
			*--dst = (unsigned char)(c1>>8);
		}
	}
	else
	{
		for (i=weights->count; i > 0; i--)
		{
			int c1 = 128;
			int c2 = 128;
			int c3 = 128;
			min = &src[3 * *contrib++];
			len = *contrib++;
			while (len-- > 0)
			{
				int c = *contrib++;
				c1 += *min++ * c;
				c2 += *min++ * c;
				c3 += *min++ * c;
			}
			*dst++ = (unsigned char)(c1>>8);
			*dst++ = (unsigned char)(c2>>8);
			*dst++ = (unsigned char)(c3>>8);
		}
	}
}

static void
scale_row_to_temp4(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights)
{
	const int *contrib = &weights->index[weights->index[0]];
	int len, i;
	const unsigned char *min;

	assert(weights->n == 4);
	if (weights->flip)
	{
		dst += 4*weights->count;
		for (i=weights->count; i > 0; i--)
		{
			int r = 128;
			int g = 128;
			int b = 128;
			int a = 128;
			min = &src[4 * *contrib++];
			len = *contrib++;
			while (len-- > 0)
			{
				r += *min++ * *contrib;
				g += *min++ * *contrib;
				b += *min++ * *contrib;
				a += *min++ * *contrib++;
			}
			*--dst = (unsigned char)(a>>8);
			*--dst = (unsigned char)(b>>8);
			*--dst = (unsigned char)(g>>8);
			*--dst = (unsigned char)(r>>8);
		}
	}
	else
	{
		for (i=weights->count; i > 0; i--)
		{
			int r = 128;
			int g = 128;
			int b = 128;
			int a = 128;
			min = &src[4 * *contrib++];
			len = *contrib++;
			while (len-- > 0)
			{
				r += *min++ * *contrib;
				g += *min++ * *contrib;
				b += *min++ * *contrib;
				a += *min++ * *contrib++;
			}
			*dst++ = (unsigned char)(r>>8);
			*dst++ = (unsigned char)(g>>8);
			*dst++ = (unsigned char)(b>>8);
			*dst++ = (unsigned char)(a>>8);
		}
	}
}

static void
scale_row_from_temp(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights, int w, int n, int row)
{
	const int *contrib = &weights->index[weights->index[row]];
	int len, x;
	int width = w * n;

	contrib++; /* Skip min */
	len = *contrib++;
	for (x=width; x > 0; x--)
	{
		const unsigned char *min = src;
		int val = 128;
		int len2 = len;
		const int *contrib2 = contrib;

		while (len2-- > 0)
		{
			val += *min * *contrib2++;
			min += width;
		}
		*dst++ = (unsigned char)(val>>8);
		src++;
	}
}

static void
scale_row_from_temp_alpha(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights, int w, int n, int row)
{
	const int *contrib = &weights->index[weights->index[row]];
	int len, x;
	int width = w * n;

	contrib++; /* Skip min */
	len = *contrib++;
	for (x=w; x > 0; x--)
	{
		int nn;
		for (nn = n; nn > 0; nn--)
		{
			const unsigned char *min = src;
			int val = 128;
			int len2 = len;
			const int *contrib2 = contrib;

			while (len2-- > 0)
			{
				val += *min * *contrib2++;
				min += width;
			}
			*dst++ = (unsigned char)(val>>8);
			src++;
		}
		*dst++ = 255;
	}
}
#endif

#ifdef SINGLE_PIXEL_SPECIALS
static void
duplicate_single_pixel(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, int n, int forcealpha, int w, int h, int stride)
{
	int i;

	for (i = n; i > 0; i--)
		*dst++ = *src++;
	if (forcealpha)
		*dst++ = 255;
	n += forcealpha;
	for (i = w-1; i > 0; i--)
	{
		memcpy(dst, dst-n, n);
		dst += n;
	}
	w *= n;
	dst -= w;
	h--;
	while (h--)
	{
		memcpy(dst+stride, dst, w);
		dst += stride;
	}
}
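
/* For example, a single RGB source pixel with forcealpha set expands to
 * r,g,b,255 repeated across the first destination row, and that row is
 * then copied down the remaining h-1 rows. */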

static void
scale_single_row(unsigned char * FZ_RESTRICT dst, int dstride, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights, int src_w, int h, int forcealpha)
{
	const int *contrib = &weights->index[weights->index[0]];
	int min, len, i, j, n, nf;
	int tmp[FZ_MAX_COLORS];

	n = weights->n;
	nf = n + forcealpha;
	/* Scale a single row */
	for (j = 0; j < nf; j++)
		tmp[j] = 128;
	if (weights->flip)
	{
		dst += (weights->count-1)*nf;
		for (i=weights->count; i > 0; i--)
		{
			min = *contrib++;
			len = *contrib++;
			min *= n;
			while (len-- > 0)
			{
				int c = *contrib++;
				for (j = 0; j < n; j++)
					tmp[j] += src[min++] * c;
				if (forcealpha)
					tmp[j] += 255 * c;
			}
			for (j = 0; j < nf; j++)
			{
				*dst++ = (unsigned char)(tmp[j]>>8);
				tmp[j] = 128;
			}
			dst -= 2*nf;
		}
		dst += nf + dstride;
	}
	else
	{
		for (i=weights->count; i > 0; i--)
		{
			min = *contrib++;
			len = *contrib++;
			min *= n;
			while (len-- > 0)
			{
				int c = *contrib++;
				for (j = 0; j < n; j++)
					tmp[j] += src[min++] * c;
				if (forcealpha)
					tmp[j] += 255 * c;
			}
			for (j = 0; j < nf; j++)
			{
				*dst++ = (unsigned char)(tmp[j]>>8);
				tmp[j] = 128;
			}
		}
		dst += dstride - weights->count * nf;
	}
	/* And then duplicate it h times */
	nf *= weights->count;
	while (--h > 0)
	{
		memcpy(dst, dst-dstride, nf);
		dst += dstride;
	}
}

static void
scale_single_col(unsigned char * FZ_RESTRICT dst, int dstride, const unsigned char * FZ_RESTRICT src, int sstride, const fz_weights * FZ_RESTRICT weights, int src_w, int n, int w, int forcealpha)
{
	const int *contrib = &weights->index[weights->index[0]];
	int min, len, i, j;
	int tmp[FZ_MAX_COLORS];
	int nf = n + forcealpha;

	for (j = 0; j < nf; j++)
		tmp[j] = 128;
	if (weights->flip)
	{
		src_w = (src_w-1)*sstride;
		for (i=weights->count; i > 0; i--)
		{
			/* Scale the next pixel in the column */
			min = *contrib++;
			len = *contrib++;
			min = src_w-min*sstride;
			while (len-- > 0)
			{
				int c = *contrib++;
				for (j = 0; j < n; j++)
					tmp[j] += src[min+j] * c;
				if (forcealpha)
					tmp[j] += 255 * c;
				min -= sstride;
			}
			for (j = 0; j < nf; j++)
			{
				*dst++ = (unsigned char)(tmp[j]>>8);
				tmp[j] = 128;
			}
			/* And then duplicate it across the row */
			for (j = (w-1)*nf; j > 0; j--)
			{
				*dst = dst[-nf];
				dst++;
			}
			dst += dstride - w*nf;
		}
	}
	else
	{
		for (i=weights->count; i > 0; i--)
		{
			/* Scale the next pixel in the column */
			min = *contrib++;
			len = *contrib++;
			min *= sstride;
			while (len-- > 0)
			{
				int c = *contrib++;
				for (j = 0; j < n; j++)
					tmp[j] += src[min+j] * c;
				if (forcealpha)
					tmp[j] += 255 * c;
				min += sstride;
			}
			for (j = 0; j < nf; j++)
			{
				*dst++ = (unsigned char)(tmp[j]>>8);
				tmp[j] = 128;
			}
			/* And then duplicate it across the row */
			for (j = (w-1)*nf; j > 0; j--)
			{
				*dst = dst[-nf];
				dst++;
			}
			dst += dstride - w*nf;
		}
	}
}
#endif /* SINGLE_PIXEL_SPECIALS */

static void
get_alpha_edge_values(const fz_weights * FZ_RESTRICT rows, int * FZ_RESTRICT tp, int * FZ_RESTRICT bp)
{
	const int *contrib = &rows->index[rows->index[0]];
	int len, i, t, b;

	/* Calculate the edge alpha values */
	contrib++; /* Skip min */
	len = *contrib++;
	t = 0;
	while (len--)
		t += *contrib++;
	for (i=rows->count-2; i > 0; i--)
	{
		contrib++; /* Skip min */
		len = *contrib++;
		contrib += len;
	}
	b = 0;
	if (i == 0)
	{
		contrib++;
		len = *contrib++;
		while (len--)
			b += *contrib++;
	}
	if (rows->flip && i == 0)
	{
		*tp = b;
		*bp = t;
	}
	else
	{
		*tp = t;
		*bp = b;
	}
}

static void
adjust_alpha_edges(fz_pixmap * FZ_RESTRICT pix, const fz_weights * FZ_RESTRICT rows, const fz_weights * FZ_RESTRICT cols)
{
	int t, l, r, b, tl, tr, bl, br, x, y;
	unsigned char *dp = pix->samples;
	int w = pix->w;
	int n = pix->n;
	int span = w >= 2 ? (w-1)*n : 0;
	int stride = pix->stride;

	get_alpha_edge_values(rows, &t, &b);
	get_alpha_edge_values(cols, &l, &r);

	l = (255 * l + 128)>>8;
	r = (255 * r + 128)>>8;
	tl = (l * t + 128)>>8;
	tr = (r * t + 128)>>8;
	bl = (l * b + 128)>>8;
	br = (r * b + 128)>>8;
	t = (255 * t + 128)>>8;
	b = (255 * b + 128)>>8;
	dp += n-1;
	*dp = tl;
	dp += n;
	for (x = w-2; x > 0; x--)
	{
		*dp = t;
		dp += n;
	}
	if (x == 0)
	{
		*dp = tr;
		dp += n;
	}
	dp += stride - w*n;
	for (y = pix->h-2; y > 0; y--)
	{
		dp[span] = r;
		*dp = l;
		dp += stride;
	}
	if (y == 0)
	{
		*dp = bl;
		dp += n;
		for (x = w-2; x > 0; x--)
		{
			*dp = b;
			dp += n;
		}
		if (x == 0)
		{
			*dp = br;
		}
	}
}

fz_pixmap *
fz_scale_pixmap(fz_context *ctx, fz_pixmap *src, float x, float y, float w, float h, const fz_irect *clip)
{
	return fz_scale_pixmap_cached(ctx, src, x, y, w, h, clip, NULL, NULL);
}
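
/* Typical use (illustrative): shrink src to 100x50 with its top left corner
 * at the origin, unclipped:
 *
 *	fz_pixmap *dst = fz_scale_pixmap(ctx, src, 0, 0, 100, 50, NULL);
 *
 * A NULL return indicates a degenerate request (an empty clipped patch or an
 * extreme scale), not an allocation failure, which is thrown as an error. */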

fz_pixmap *
fz_scale_pixmap_cached(fz_context *ctx, const fz_pixmap *src, float x, float y, float w, float h, const fz_irect *clip, fz_scale_cache *cache_x, fz_scale_cache *cache_y)
{
	fz_scale_filter *filter = &fz_scale_filter_simple;
	fz_weights *contrib_rows = NULL;
	fz_weights *contrib_cols = NULL;
	fz_pixmap *output = NULL;
	unsigned char *temp = NULL;
	int max_row, temp_span, temp_rows, row;
	int dst_w_int, dst_h_int, dst_x_int, dst_y_int;
	int flip_x, flip_y, forcealpha;
	fz_irect patch;

	fz_var(contrib_cols);
	fz_var(contrib_rows);

	/* Avoid extreme scales where overflows become problematic. */
	if (w > (1<<24) || h > (1<<24) || w < -(1<<24) || h < -(1<<24))
		return NULL;
	if (x > (1<<24) || y > (1<<24) || x < -(1<<24) || y < -(1<<24))
		return NULL;

	/* Clamp small ranges of w and h */
	if (w <= -1)
	{
	}
	else if (w < 0)
	{
		w = -1;
	}
	else if (w < 1)
	{
		w = 1;
	}
	if (h <= -1)
	{
	}
	else if (h < 0)
	{
		h = -1;
	}
	else if (h < 1)
	{
		h = 1;
	}

	/* If the src has an alpha, we'll make the dst have an alpha automatically.
	 * We also need to force the dst to have an alpha if x/y/w/h aren't ints. */
	forcealpha = !src->alpha && (x != (float)(int)x || y != (float)(int)y || w != (float)(int)w || h != (float)(int)h);

	/* Find the destination bbox, width/height, and sub pixel offset,
	 * allowing for whether we're flipping or not. */
	/* The (x,y) position given describes where the top left corner
	 * of the source image should be mapped to (i.e. where (0,0) in image
	 * space ends up). Also there are differences in the way we scale
	 * horizontally and vertically. When scaling rows horizontally, we
	 * always read forwards through the source, and store either forwards
	 * or in reverse as required. When scaling vertically, we always store
	 * out forwards, but may feed source rows in, in a different order.
	 *
	 * Consider the image rectangle 'r' to which the image is mapped,
	 * and the (possibly) larger rectangle 'R', given by expanding 'r' to
	 * complete pixels.
	 *
	 * x can either be r.xmin-R.xmin or R.xmax-r.xmax depending on whether
	 * the image is x flipped or not. Whatever happens 0 <= x < 1.
	 * y is always R.ymax - r.ymax.
	 */
	/* dst_x_int is calculated to be the left of the scaled image, and
	 * x (the sub pixel offset) is the distance in from either the left
	 * or right pixel expanded edge. */
	flip_x = (w < 0);
	if (flip_x)
	{
		float tmp;
		w = -w;
		dst_x_int = floorf(x-w);
		tmp = ceilf(x);
		dst_w_int = (int)tmp;
		x = tmp - x;
		dst_w_int -= dst_x_int;
	}
	else
	{
		dst_x_int = floorf(x);
		x -= dst_x_int;
		dst_w_int = (int)ceilf(x + w);
	}
	/* dst_y_int is calculated to be the top of the scaled image, and
	 * y (the sub pixel offset) is the distance in from either the top
	 * or bottom pixel expanded edge.
	 */
	flip_y = (h < 0);
	if (flip_y)
	{
		float tmp;
		h = -h;
		dst_y_int = floorf(y-h);
		tmp = ceilf(y);
		dst_h_int = (int)tmp;
		y = tmp - y;
		dst_h_int -= dst_y_int;
	}
	else
	{
		dst_y_int = floorf(y);
		y -= dst_y_int;
		dst_h_int = (int)ceilf(y + h);
	}

	fz_valgrind_pixmap(src);

	/* Step 0: Calculate the patch */
	patch.x0 = 0;
	patch.y0 = 0;
	patch.x1 = dst_w_int;
	patch.y1 = dst_h_int;
	if (clip)
	{
		if (flip_x)
		{
			if (dst_x_int + dst_w_int > clip->x1)
				patch.x0 = dst_x_int + dst_w_int - clip->x1;
			if (clip->x0 > dst_x_int)
			{
				patch.x1 = dst_w_int - (clip->x0 - dst_x_int);
				dst_x_int = clip->x0;
			}
		}
		else
		{
			if (dst_x_int + dst_w_int > clip->x1)
				patch.x1 = clip->x1 - dst_x_int;
			if (clip->x0 > dst_x_int)
			{
				patch.x0 = clip->x0 - dst_x_int;
				dst_x_int += patch.x0;
			}
		}

		if (flip_y)
		{
			if (dst_y_int + dst_h_int > clip->y1)
				patch.y1 = clip->y1 - dst_y_int;
			if (clip->y0 > dst_y_int)
			{
				patch.y0 = clip->y0 - dst_y_int;
				dst_y_int = clip->y0;
			}
		}
		else
		{
			if (dst_y_int + dst_h_int > clip->y1)
				patch.y1 = clip->y1 - dst_y_int;
			if (clip->y0 > dst_y_int)
			{
				patch.y0 = clip->y0 - dst_y_int;
				dst_y_int += patch.y0;
			}
		}
	}
	if (patch.x0 >= patch.x1 || patch.y0 >= patch.y1)
		return NULL;

	fz_try(ctx)
	{
		/* Step 1: Calculate the weights for columns and rows */
#ifdef SINGLE_PIXEL_SPECIALS
		if (src->w == 1)
			contrib_cols = NULL;
		else
#endif /* SINGLE_PIXEL_SPECIALS */
			contrib_cols = make_weights(ctx, src->w, x, w, filter, 0, dst_w_int, patch.x0, patch.x1, src->n, flip_x, cache_x);
#ifdef SINGLE_PIXEL_SPECIALS
		if (src->h == 1)
			contrib_rows = NULL;
		else
#endif /* SINGLE_PIXEL_SPECIALS */
			contrib_rows = make_weights(ctx, src->h, y, h, filter, 1, dst_h_int, patch.y0, patch.y1, src->n, flip_y, cache_y);

		output = fz_new_pixmap(ctx, src->colorspace, patch.x1 - patch.x0, patch.y1 - patch.y0, src->seps, src->alpha || forcealpha);
	}
	fz_catch(ctx)
	{
		if (!cache_x)
			fz_free(ctx, contrib_cols);
		if (!cache_y)
			fz_free(ctx, contrib_rows);
		fz_rethrow(ctx);
	}
	output->x = dst_x_int;
	output->y = dst_y_int;

	/* Step 2: Apply the weights */
#ifdef SINGLE_PIXEL_SPECIALS
	if (!contrib_rows)
	{
		/* Only 1 source pixel high. */
		if (!contrib_cols)
		{
			/* Only 1 pixel in the entire image! */
			duplicate_single_pixel(output->samples, src->samples, src->n, forcealpha, patch.x1-patch.x0, patch.y1-patch.y0, output->stride);
			fz_valgrind_pixmap(output);
		}
		else
		{
			/* Scale the row once, then copy it. */
			scale_single_row(output->samples, output->stride, src->samples, contrib_cols, src->w, patch.y1-patch.y0, forcealpha);
			fz_valgrind_pixmap(output);
		}
	}
	else if (!contrib_cols)
	{
		/* Only 1 source pixel wide. Scale the col and duplicate. */
		scale_single_col(output->samples, output->stride, src->samples, src->stride, contrib_rows, src->h, src->n, patch.x1-patch.x0, forcealpha);
		fz_valgrind_pixmap(output);
	}
	else
#endif /* SINGLE_PIXEL_SPECIALS */
	{
		void (*row_scale_in)(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights);
		void (*row_scale_out)(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights, int w, int n, int row);

		temp_span = contrib_cols->count * src->n;
		temp_rows = contrib_rows->max_len;
		if (temp_span <= 0 || temp_rows > INT_MAX / temp_span)
			goto cleanup;
		fz_try(ctx)
		{
			temp = fz_calloc(ctx, temp_span*temp_rows, sizeof(unsigned char));
		}
		fz_catch(ctx)
		{
			fz_drop_pixmap(ctx, output);
			if (!cache_x)
				fz_free(ctx, contrib_cols);
			if (!cache_y)
				fz_free(ctx, contrib_rows);
			fz_rethrow(ctx);
		}
		switch (src->n)
		{
		default:
			row_scale_in = scale_row_to_temp;
			break;
		case 1: /* Image mask case or Greyscale case */
			row_scale_in = scale_row_to_temp1;
			break;
		case 2: /* Greyscale with alpha case */
			row_scale_in = scale_row_to_temp2;
			break;
		case 3: /* RGB case */
			row_scale_in = scale_row_to_temp3;
			break;
		case 4: /* RGBA or CMYK case */
			row_scale_in = scale_row_to_temp4;
			break;
		}
		row_scale_out = forcealpha ? scale_row_from_temp_alpha : scale_row_from_temp;
		max_row = contrib_rows->index[contrib_rows->index[0]];
		for (row = 0; row < contrib_rows->count; row++)
		{
			/*
			Which source rows do we need to have scaled into the
			temporary buffer in order to be able to do the final
			scale?
			*/
			int row_index = contrib_rows->index[row];
			int row_min = contrib_rows->index[row_index++];
			int row_len = contrib_rows->index[row_index];
			while (max_row < row_min+row_len)
			{
				/* Scale another row */
				assert(max_row < src->h);
				(*row_scale_in)(&temp[temp_span*(max_row % temp_rows)], &src->samples[(flip_y ? (src->h-1-max_row) : max_row)*src->stride], contrib_cols);
				max_row++;
			}

			(*row_scale_out)(&output->samples[row*output->stride], temp, contrib_rows, contrib_cols->count, src->n, row);
		}
		fz_free(ctx, temp);

		if (forcealpha)
			adjust_alpha_edges(output, contrib_rows, contrib_cols);

		fz_valgrind_pixmap(output);
	}

cleanup:
	if (!cache_y)
		fz_free(ctx, contrib_rows);
	if (!cache_x)
		fz_free(ctx, contrib_cols);

	return output;
}

void
fz_drop_scale_cache(fz_context *ctx, fz_scale_cache *sc)
{
	if (!sc)
		return;
	fz_free(ctx, sc->weights);
	fz_free(ctx, sc);
}

fz_scale_cache *
fz_new_scale_cache(fz_context *ctx)
{
	return fz_malloc_struct(ctx, fz_scale_cache);
}