jfdctint.c source code [MuPDF/thirdparty/libjpeg/jfdctint.c]

1	/*
2	* jfdctint.c
3	*
4	* Copyright (C) 1991-1996, Thomas G. Lane.
5	* Modification developed 2003-2009 by Guido Vollbeding.
6	* This file is part of the Independent JPEG Group's software.
7	* For conditions of distribution and use, see the accompanying README file.
8	*
9	* This file contains a slow-but-accurate integer implementation of the
10	* forward DCT (Discrete Cosine Transform).
11	*
12	* A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
13	* on each column. Direct algorithms are also available, but they are
14	* much more complex and seem not to be any faster when reduced to code.
15	*
16	* This implementation is based on an algorithm described in
17	* C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
18	* Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
19	* Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
20	* The primary algorithm described there uses 11 multiplies and 29 adds.
21	* We use their alternate method with 12 multiplies and 32 adds.
22	* The advantage of this method is that no data path contains more than one
23	* multiplication; this allows a very simple and accurate implementation in
24	* scaled fixed-point arithmetic, with a minimal number of shifts.
25	*
26	* We also provide FDCT routines with various input sample block sizes for
27	* direct resolution reduction or enlargement and for direct resolving the
28	* common 2x1 and 1x2 subsampling cases without additional resampling: NxN
29	* (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 output DCT block.
30	*
31	* For N<8 we fill the remaining block coefficients with zero.
32	* For N>8 we apply a partial N-point FDCT on the input samples, computing
33	* just the lower 8 frequency coefficients and discarding the rest.
34	*
35	* We must scale the output coefficients of the N-point FDCT appropriately
36	* to the standard 8-point FDCT level by 8/N per 1-D pass. This scaling
37	* is folded into the constant multipliers (pass 2) and/or final/initial
38	* shifting.
39	*
40	* CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
41	* since there would be too many additional constants to pre-calculate.
42	*/
43
44	#define JPEG_INTERNALS
45	#include "jinclude.h"
46	#include "jpeglib.h"
47	#include "jdct.h" /* Private declarations for DCT subsystem */
48
49	#ifdef DCT_ISLOW_SUPPORTED
50
51
52	/*
53	* This module is specialized to the case DCTSIZE = 8.
54	*/
55
56	#if DCTSIZE != 8
57	Sorry, this code only copes with `8x8` DCT blocks. / deliberate syntax err /
58	#endif
59
60
61	/*
62	* The poop on this scaling stuff is as follows:
63	*
64	* Each 1-D DCT step produces outputs which are a factor of sqrt(N)
65	* larger than the true DCT outputs. The final outputs are therefore
66	* a factor of N larger than desired; since N=8 this can be cured by
67	* a simple right shift at the end of the algorithm. The advantage of
68	* this arrangement is that we save two multiplications per 1-D DCT,
69	* because the y0 and y4 outputs need not be divided by sqrt(N).
70	* In the IJG code, this factor of 8 is removed by the quantization step
71	* (in jcdctmgr.c), NOT in this module.
72	*
73	* We have to do addition and subtraction of the integer inputs, which
74	* is no problem, and multiplication by fractional constants, which is
75	* a problem to do in integer arithmetic. We multiply all the constants
76	* by CONST_SCALE and convert them to integer constants (thus retaining
77	* CONST_BITS bits of precision in the constants). After doing a
78	* multiplication we have to divide the product by CONST_SCALE, with proper
79	* rounding, to produce the correct output. This division can be done
80	* cheaply as a right shift of CONST_BITS bits. We postpone shifting
81	* as long as possible so that partial sums can be added together with
82	* full fractional precision.
83	*
84	* The outputs of the first pass are scaled up by PASS1_BITS bits so that
85	* they are represented to better-than-integral precision. These outputs
86	* require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
87	* with the recommended scaling. (For 12-bit sample data, the intermediate
88	* array is INT32 anyway.)
89	*
90	* To avoid overflow of the 32-bit intermediate results in pass 2, we must
91	* have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis
92	* shows that the values given below are the most effective.
93	*/
94
95	#if BITS_IN_JSAMPLE == 8
96	#define CONST_BITS 13
97	#define PASS1_BITS 2
98	#else
99	#define CONST_BITS 13
100	#define PASS1_BITS 1 /* lose a little precision to avoid overflow */
101	#endif
102
103	/ Some C compilers fail to reduce "FIX(constant)" at compile time, thus*
104	* causing a lot of useless floating-point operations at run time.
105	* To get around this we use the following pre-calculated constants.
106	* If you change CONST_BITS you may want to add appropriate values.
107	* (With a reasonable C compiler, you can just rely on the FIX() macro...)
108	*/
109
110	#if CONST_BITS == 13
111	#define FIX_0_298631336 ((INT32) 2446) /* FIX(0.298631336) */
112	#define FIX_0_390180644 ((INT32) 3196) /* FIX(0.390180644) */
113	#define FIX_0_541196100 ((INT32) 4433) /* FIX(0.541196100) */
114	#define FIX_0_765366865 ((INT32) 6270) /* FIX(0.765366865) */
115	#define FIX_0_899976223 ((INT32) 7373) /* FIX(0.899976223) */
116	#define FIX_1_175875602 ((INT32) 9633) /* FIX(1.175875602) */
117	#define FIX_1_501321110 ((INT32) 12299) /* FIX(1.501321110) */
118	#define FIX_1_847759065 ((INT32) 15137) /* FIX(1.847759065) */
119	#define FIX_1_961570560 ((INT32) 16069) /* FIX(1.961570560) */
120	#define FIX_2_053119869 ((INT32) 16819) /* FIX(2.053119869) */
121	#define FIX_2_562915447 ((INT32) 20995) /* FIX(2.562915447) */
122	#define FIX_3_072711026 ((INT32) 25172) /* FIX(3.072711026) */
123	#else
124	#define FIX_0_298631336 FIX(0.298631336)
125	#define FIX_0_390180644 FIX(0.390180644)
126	#define FIX_0_541196100 FIX(0.541196100)
127	#define FIX_0_765366865 FIX(0.765366865)
128	#define FIX_0_899976223 FIX(0.899976223)
129	#define FIX_1_175875602 FIX(1.175875602)
130	#define FIX_1_501321110 FIX(1.501321110)
131	#define FIX_1_847759065 FIX(1.847759065)
132	#define FIX_1_961570560 FIX(1.961570560)
133	#define FIX_2_053119869 FIX(2.053119869)
134	#define FIX_2_562915447 FIX(2.562915447)
135	#define FIX_3_072711026 FIX(3.072711026)
136	#endif
137
138
139	/ Multiply an INT32 variable by an INT32 constant to yield an INT32 result.*
140	* For 8-bit samples with the recommended scaling, all the variable
141	* and constant values involved are no more than 16 bits wide, so a
142	* 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
143	* For 12-bit samples, a full 32-bit multiplication will be needed.
144	*/
145
146	#if BITS_IN_JSAMPLE == 8
147	#define MULTIPLY(var,const) MULTIPLY16C16(var,const)
148	#else
149	#define MULTIPLY(var,const) ((var) * (const))
150	#endif
151
152
153	/*
154	* Perform the forward DCT on one block of samples.
155	*/
156
157	GLOBAL(void)
158	jpeg_fdct_islow (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
159	{
160	INT32 tmp0, tmp1, tmp2, tmp3;
161	INT32 tmp10, tmp11, tmp12, tmp13;
162	INT32 z1;
163	DCTELEM *dataptr;
164	JSAMPROW elemptr;
165	int ctr;
166	SHIFT_TEMPS
167
168	/ Pass 1: process rows. /
169	/ Note results are scaled up by sqrt(8) compared to a true DCT; /
170	/ furthermore, we scale the results by 2*PASS1_BITS. /*
171
172	dataptr = data;
173	for (ctr = `0`; ctr < DCTSIZE; ctr++) {
174	elemptr = sample_data[ctr] + start_col;
175
176	/ Even part per LL&M figure 1 --- note that published figure is faulty;*
177	* rotator "sqrt(2)c1" should be "sqrt(2)c6".
178	*/
179
180	tmp0 = GETJSAMPLE(elemptr[`0`]) + GETJSAMPLE(elemptr[`7`]);
181	tmp1 = GETJSAMPLE(elemptr[`1`]) + GETJSAMPLE(elemptr[`6`]);
182	tmp2 = GETJSAMPLE(elemptr[`2`]) + GETJSAMPLE(elemptr[`5`]);
183	tmp3 = GETJSAMPLE(elemptr[`3`]) + GETJSAMPLE(elemptr[`4`]);
184
185	tmp10 = tmp0 + tmp3;
186	tmp12 = tmp0 - tmp3;
187	tmp11 = tmp1 + tmp2;
188	tmp13 = tmp1 - tmp2;
189
190	tmp0 = GETJSAMPLE(elemptr[`0`]) - GETJSAMPLE(elemptr[`7`]);
191	tmp1 = GETJSAMPLE(elemptr[`1`]) - GETJSAMPLE(elemptr[`6`]);
192	tmp2 = GETJSAMPLE(elemptr[`2`]) - GETJSAMPLE(elemptr[`5`]);
193	tmp3 = GETJSAMPLE(elemptr[`3`]) - GETJSAMPLE(elemptr[`4`]);
194
195	/ Apply unsigned->signed conversion /
196	dataptr[`0`] = (DCTELEM) ((tmp10 + tmp11 - `8` * CENTERJSAMPLE) << PASS1_BITS);
197	dataptr[`4`] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
198
199	z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
200	/ Add fudge factor here for final descale. /
201	z1 += ONE << (CONST_BITS-PASS1_BITS-`1`);
202	dataptr[`2`] = (DCTELEM) RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865),
203	CONST_BITS-PASS1_BITS);
204	dataptr[`6`] = (DCTELEM) RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065),
205	CONST_BITS-PASS1_BITS);
206
207	/ Odd part per figure 8 --- note paper omits factor of sqrt(2).*
208	* cK represents sqrt(2) * cos(K*pi/16).
209	* i0..i3 in the paper are tmp0..tmp3 here.
210	*/
211
212	tmp10 = tmp0 + tmp3;
213	tmp11 = tmp1 + tmp2;
214	tmp12 = tmp0 + tmp2;
215	tmp13 = tmp1 + tmp3;
216	z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); / c3 /
217	/ Add fudge factor here for final descale. /
218	z1 += ONE << (CONST_BITS-PASS1_BITS-`1`);
219
220	tmp0 = MULTIPLY(tmp0, FIX_1_501321110); / c1+c3-c5-c7 /
221	tmp1 = MULTIPLY(tmp1, FIX_3_072711026); / c1+c3+c5-c7 /
222	tmp2 = MULTIPLY(tmp2, FIX_2_053119869); / c1+c3-c5+c7 /
223	tmp3 = MULTIPLY(tmp3, FIX_0_298631336); / -c1+c3+c5-c7 /
224	tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); / c7-c3 /
225	tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); / -c1-c3 /
226	tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); / c5-c3 /
227	tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); / -c3-c5 /
228
229	tmp12 += z1;
230	tmp13 += z1;
231
232	dataptr[`1`] = (DCTELEM)
233	RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS);
234	dataptr[`3`] = (DCTELEM)
235	RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS);
236	dataptr[`5`] = (DCTELEM)
237	RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS);
238	dataptr[`7`] = (DCTELEM)
239	RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS);
240
241	dataptr += DCTSIZE; / advance pointer to next row /
242	}
243
244	/ Pass 2: process columns.*
245	* We remove the PASS1_BITS scaling, but leave the results scaled up
246	* by an overall factor of 8.
247	*/
248
249	dataptr = data;
250	for (ctr = DCTSIZE-`1`; ctr >= `0`; ctr--) {
251	/ Even part per LL&M figure 1 --- note that published figure is faulty;*
252	* rotator "sqrt(2)c1" should be "sqrt(2)c6".
253	*/
254
255	tmp0 = dataptr[DCTSIZE`0`] + dataptr[DCTSIZE`7`];
256	tmp1 = dataptr[DCTSIZE`1`] + dataptr[DCTSIZE`6`];
257	tmp2 = dataptr[DCTSIZE`2`] + dataptr[DCTSIZE`5`];
258	tmp3 = dataptr[DCTSIZE`3`] + dataptr[DCTSIZE`4`];
259
260	/ Add fudge factor here for final descale. /
261	tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-`1`));
262	tmp12 = tmp0 - tmp3;
263	tmp11 = tmp1 + tmp2;
264	tmp13 = tmp1 - tmp2;
265
266	tmp0 = dataptr[DCTSIZE`0`] - dataptr[DCTSIZE`7`];
267	tmp1 = dataptr[DCTSIZE`1`] - dataptr[DCTSIZE`6`];
268	tmp2 = dataptr[DCTSIZE`2`] - dataptr[DCTSIZE`5`];
269	tmp3 = dataptr[DCTSIZE`3`] - dataptr[DCTSIZE`4`];
270
271	dataptr[DCTSIZE*`0`] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
272	dataptr[DCTSIZE*`4`] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
273
274	z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
275	/ Add fudge factor here for final descale. /
276	z1 += ONE << (CONST_BITS+PASS1_BITS-`1`);
277	dataptr[DCTSIZE*`2`] = (DCTELEM)
278	RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS+PASS1_BITS);
279	dataptr[DCTSIZE*`6`] = (DCTELEM)
280	RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), CONST_BITS+PASS1_BITS);
281
282	/ Odd part per figure 8 --- note paper omits factor of sqrt(2).*
283	* cK represents sqrt(2) * cos(K*pi/16).
284	* i0..i3 in the paper are tmp0..tmp3 here.
285	*/
286
287	tmp10 = tmp0 + tmp3;
288	tmp11 = tmp1 + tmp2;
289	tmp12 = tmp0 + tmp2;
290	tmp13 = tmp1 + tmp3;
291	z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); / c3 /
292	/ Add fudge factor here for final descale. /
293	z1 += ONE << (CONST_BITS+PASS1_BITS-`1`);
294
295	tmp0 = MULTIPLY(tmp0, FIX_1_501321110); / c1+c3-c5-c7 /
296	tmp1 = MULTIPLY(tmp1, FIX_3_072711026); / c1+c3+c5-c7 /
297	tmp2 = MULTIPLY(tmp2, FIX_2_053119869); / c1+c3-c5+c7 /
298	tmp3 = MULTIPLY(tmp3, FIX_0_298631336); / -c1+c3+c5-c7 /
299	tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); / c7-c3 /
300	tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); / -c1-c3 /
301	tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); / c5-c3 /
302	tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); / -c3-c5 /
303
304	tmp12 += z1;
305	tmp13 += z1;
306
307	dataptr[DCTSIZE*`1`] = (DCTELEM)
308	RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS+PASS1_BITS);
309	dataptr[DCTSIZE*`3`] = (DCTELEM)
310	RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS+PASS1_BITS);
311	dataptr[DCTSIZE*`5`] = (DCTELEM)
312	RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS+PASS1_BITS);
313	dataptr[DCTSIZE*`7`] = (DCTELEM)
314	RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS+PASS1_BITS);
315
316	dataptr++; / advance pointer to next column /
317	}
318	}
319
320	#ifdef DCT_SCALING_SUPPORTED
321
322
323	/*
324	* Perform the forward DCT on a 7x7 sample block.
325	*/
326
327	GLOBAL(void)
328	jpeg_fdct_7x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
329	{
330	INT32 tmp0, tmp1, tmp2, tmp3;
331	INT32 tmp10, tmp11, tmp12;
332	INT32 z1, z2, z3;
333	DCTELEM *dataptr;
334	JSAMPROW elemptr;
335	int ctr;
336	SHIFT_TEMPS
337
338	/ Pre-zero output coefficient block. /
339	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
340
341	/ Pass 1: process rows. /
342	/ Note results are scaled up by sqrt(8) compared to a true DCT; /
343	/ furthermore, we scale the results by 2*PASS1_BITS. /*
344	/ cK represents sqrt(2) * cos(Kpi/14). /*
345
346	dataptr = data;
347	for (ctr = `0`; ctr < `7`; ctr++) {
348	elemptr = sample_data[ctr] + start_col;
349
350	/ Even part /
351
352	tmp0 = GETJSAMPLE(elemptr[`0`]) + GETJSAMPLE(elemptr[`6`]);
353	tmp1 = GETJSAMPLE(elemptr[`1`]) + GETJSAMPLE(elemptr[`5`]);
354	tmp2 = GETJSAMPLE(elemptr[`2`]) + GETJSAMPLE(elemptr[`4`]);
355	tmp3 = GETJSAMPLE(elemptr[`3`]);
356
357	tmp10 = GETJSAMPLE(elemptr[`0`]) - GETJSAMPLE(elemptr[`6`]);
358	tmp11 = GETJSAMPLE(elemptr[`1`]) - GETJSAMPLE(elemptr[`5`]);
359	tmp12 = GETJSAMPLE(elemptr[`2`]) - GETJSAMPLE(elemptr[`4`]);
360
361	z1 = tmp0 + tmp2;
362	/ Apply unsigned->signed conversion /
363	dataptr[`0`] = (DCTELEM)
364	((z1 + tmp1 + tmp3 - `7` * CENTERJSAMPLE) << PASS1_BITS);
365	tmp3 += tmp3;
366	z1 -= tmp3;
367	z1 -= tmp3;
368	z1 = MULTIPLY(z1, FIX(`0.353553391`)); / (c2+c6-c4)/2 /
369	z2 = MULTIPLY(tmp0 - tmp2, FIX(`0.920609002`)); / (c2+c4-c6)/2 /
370	z3 = MULTIPLY(tmp1 - tmp2, FIX(`0.314692123`)); / c6 /
371	dataptr[`2`] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
372	z1 -= z2;
373	z2 = MULTIPLY(tmp0 - tmp1, FIX(`0.881747734`)); / c4 /
374	dataptr[`4`] = (DCTELEM)
375	DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(`0.707106781`)), / c2+c6-c4 /
376	CONST_BITS-PASS1_BITS);
377	dataptr[`6`] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
378
379	/ Odd part /
380
381	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(`0.935414347`)); / (c3+c1-c5)/2 /
382	tmp2 = MULTIPLY(tmp10 - tmp11, FIX(`0.170262339`)); / (c3+c5-c1)/2 /
383	tmp0 = tmp1 - tmp2;
384	tmp1 += tmp2;
385	tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(`1.378756276`)); / -c1 /
386	tmp1 += tmp2;
387	tmp3 = MULTIPLY(tmp10 + tmp12, FIX(`0.613604268`)); / c5 /
388	tmp0 += tmp3;
389	tmp2 += tmp3 + MULTIPLY(tmp12, FIX(`1.870828693`)); / c3+c1-c5 /
390
391	dataptr[`1`] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
392	dataptr[`3`] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
393	dataptr[`5`] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
394
395	dataptr += DCTSIZE; / advance pointer to next row /
396	}
397
398	/ Pass 2: process columns.*
399	* We remove the PASS1_BITS scaling, but leave the results scaled up
400	* by an overall factor of 8.
401	* We must also scale the output by (8/7)**2 = 64/49, which we fold
402	* into the constant multipliers:
403	* cK now represents sqrt(2) * cos(Kpi/14) 64/49.
404	*/
405
406	dataptr = data;
407	for (ctr = `0`; ctr < `7`; ctr++) {
408	/ Even part /
409
410	tmp0 = dataptr[DCTSIZE`0`] + dataptr[DCTSIZE`6`];
411	tmp1 = dataptr[DCTSIZE`1`] + dataptr[DCTSIZE`5`];
412	tmp2 = dataptr[DCTSIZE`2`] + dataptr[DCTSIZE`4`];
413	tmp3 = dataptr[DCTSIZE*`3`];
414
415	tmp10 = dataptr[DCTSIZE`0`] - dataptr[DCTSIZE`6`];
416	tmp11 = dataptr[DCTSIZE`1`] - dataptr[DCTSIZE`5`];
417	tmp12 = dataptr[DCTSIZE`2`] - dataptr[DCTSIZE`4`];
418
419	z1 = tmp0 + tmp2;
420	dataptr[DCTSIZE*`0`] = (DCTELEM)
421	DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(`1.306122449`)), / 64/49 /
422	CONST_BITS+PASS1_BITS);
423	tmp3 += tmp3;
424	z1 -= tmp3;
425	z1 -= tmp3;
426	z1 = MULTIPLY(z1, FIX(`0.461784020`)); / (c2+c6-c4)/2 /
427	z2 = MULTIPLY(tmp0 - tmp2, FIX(`1.202428084`)); / (c2+c4-c6)/2 /
428	z3 = MULTIPLY(tmp1 - tmp2, FIX(`0.411026446`)); / c6 /
429	dataptr[DCTSIZE*`2`] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS);
430	z1 -= z2;
431	z2 = MULTIPLY(tmp0 - tmp1, FIX(`1.151670509`)); / c4 /
432	dataptr[DCTSIZE*`4`] = (DCTELEM)
433	DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(`0.923568041`)), / c2+c6-c4 /
434	CONST_BITS+PASS1_BITS);
435	dataptr[DCTSIZE*`6`] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS);
436
437	/ Odd part /
438
439	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(`1.221765677`)); / (c3+c1-c5)/2 /
440	tmp2 = MULTIPLY(tmp10 - tmp11, FIX(`0.222383464`)); / (c3+c5-c1)/2 /
441	tmp0 = tmp1 - tmp2;
442	tmp1 += tmp2;
443	tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(`1.800824523`)); / -c1 /
444	tmp1 += tmp2;
445	tmp3 = MULTIPLY(tmp10 + tmp12, FIX(`0.801442310`)); / c5 /
446	tmp0 += tmp3;
447	tmp2 += tmp3 + MULTIPLY(tmp12, FIX(`2.443531355`)); / c3+c1-c5 /
448
449	dataptr[DCTSIZE*`1`] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS);
450	dataptr[DCTSIZE*`3`] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS);
451	dataptr[DCTSIZE*`5`] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS);
452
453	dataptr++; / advance pointer to next column /
454	}
455	}
456
457
458	/*
459	* Perform the forward DCT on a 6x6 sample block.
460	*/
461
462	GLOBAL(void)
463	jpeg_fdct_6x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
464	{
465	INT32 tmp0, tmp1, tmp2;
466	INT32 tmp10, tmp11, tmp12;
467	DCTELEM *dataptr;
468	JSAMPROW elemptr;
469	int ctr;
470	SHIFT_TEMPS
471
472	/ Pre-zero output coefficient block. /
473	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
474
475	/ Pass 1: process rows. /
476	/ Note results are scaled up by sqrt(8) compared to a true DCT; /
477	/ furthermore, we scale the results by 2*PASS1_BITS. /*
478	/ cK represents sqrt(2) * cos(Kpi/12). /*
479
480	dataptr = data;
481	for (ctr = `0`; ctr < `6`; ctr++) {
482	elemptr = sample_data[ctr] + start_col;
483
484	/ Even part /
485
486	tmp0 = GETJSAMPLE(elemptr[`0`]) + GETJSAMPLE(elemptr[`5`]);
487	tmp11 = GETJSAMPLE(elemptr[`1`]) + GETJSAMPLE(elemptr[`4`]);
488	tmp2 = GETJSAMPLE(elemptr[`2`]) + GETJSAMPLE(elemptr[`3`]);
489
490	tmp10 = tmp0 + tmp2;
491	tmp12 = tmp0 - tmp2;
492
493	tmp0 = GETJSAMPLE(elemptr[`0`]) - GETJSAMPLE(elemptr[`5`]);
494	tmp1 = GETJSAMPLE(elemptr[`1`]) - GETJSAMPLE(elemptr[`4`]);
495	tmp2 = GETJSAMPLE(elemptr[`2`]) - GETJSAMPLE(elemptr[`3`]);
496
497	/ Apply unsigned->signed conversion /
498	dataptr[`0`] = (DCTELEM)
499	((tmp10 + tmp11 - `6` * CENTERJSAMPLE) << PASS1_BITS);
500	dataptr[`2`] = (DCTELEM)
501	DESCALE(MULTIPLY(tmp12, FIX(`1.224744871`)), / c2 /
502	CONST_BITS-PASS1_BITS);
503	dataptr[`4`] = (DCTELEM)
504	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(`0.707106781`)), / c4 /
505	CONST_BITS-PASS1_BITS);
506
507	/ Odd part /
508
509	tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(`0.366025404`)), / c5 /
510	CONST_BITS-PASS1_BITS);
511
512	dataptr[`1`] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
513	dataptr[`3`] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
514	dataptr[`5`] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
515
516	dataptr += DCTSIZE; / advance pointer to next row /
517	}
518
519	/ Pass 2: process columns.*
520	* We remove the PASS1_BITS scaling, but leave the results scaled up
521	* by an overall factor of 8.
522	* We must also scale the output by (8/6)**2 = 16/9, which we fold
523	* into the constant multipliers:
524	* cK now represents sqrt(2) * cos(Kpi/12) 16/9.
525	*/
526
527	dataptr = data;
528	for (ctr = `0`; ctr < `6`; ctr++) {
529	/ Even part /
530
531	tmp0 = dataptr[DCTSIZE`0`] + dataptr[DCTSIZE`5`];
532	tmp11 = dataptr[DCTSIZE`1`] + dataptr[DCTSIZE`4`];
533	tmp2 = dataptr[DCTSIZE`2`] + dataptr[DCTSIZE`3`];
534
535	tmp10 = tmp0 + tmp2;
536	tmp12 = tmp0 - tmp2;
537
538	tmp0 = dataptr[DCTSIZE`0`] - dataptr[DCTSIZE`5`];
539	tmp1 = dataptr[DCTSIZE`1`] - dataptr[DCTSIZE`4`];
540	tmp2 = dataptr[DCTSIZE`2`] - dataptr[DCTSIZE`3`];
541
542	dataptr[DCTSIZE*`0`] = (DCTELEM)
543	DESCALE(MULTIPLY(tmp10 + tmp11, FIX(`1.777777778`)), / 16/9 /
544	CONST_BITS+PASS1_BITS);
545	dataptr[DCTSIZE*`2`] = (DCTELEM)
546	DESCALE(MULTIPLY(tmp12, FIX(`2.177324216`)), / c2 /
547	CONST_BITS+PASS1_BITS);
548	dataptr[DCTSIZE*`4`] = (DCTELEM)
549	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(`1.257078722`)), / c4 /
550	CONST_BITS+PASS1_BITS);
551
552	/ Odd part /
553
554	tmp10 = MULTIPLY(tmp0 + tmp2, FIX(`0.650711829`)); / c5 /
555
556	dataptr[DCTSIZE*`1`] = (DCTELEM)
557	DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(`1.777777778`)), / 16/9 /
558	CONST_BITS+PASS1_BITS);
559	dataptr[DCTSIZE*`3`] = (DCTELEM)
560	DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(`1.777777778`)), / 16/9 /
561	CONST_BITS+PASS1_BITS);
562	dataptr[DCTSIZE*`5`] = (DCTELEM)
563	DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(`1.777777778`)), / 16/9 /
564	CONST_BITS+PASS1_BITS);
565
566	dataptr++; / advance pointer to next column /
567	}
568	}
569
570
571	/*
572	* Perform the forward DCT on a 5x5 sample block.
573	*/
574
575	GLOBAL(void)
576	jpeg_fdct_5x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
577	{
578	INT32 tmp0, tmp1, tmp2;
579	INT32 tmp10, tmp11;
580	DCTELEM *dataptr;
581	JSAMPROW elemptr;
582	int ctr;
583	SHIFT_TEMPS
584
585	/ Pre-zero output coefficient block. /
586	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
587
588	/ Pass 1: process rows. /
589	/ Note results are scaled up by sqrt(8) compared to a true DCT; /
590	/ furthermore, we scale the results by 2*PASS1_BITS. /*
591	/ We scale the results further by 2 as part of output adaption /
592	/ scaling for different DCT size. /
593	/ cK represents sqrt(2) * cos(Kpi/10). /*
594
595	dataptr = data;
596	for (ctr = `0`; ctr < `5`; ctr++) {
597	elemptr = sample_data[ctr] + start_col;
598
599	/ Even part /
600
601	tmp0 = GETJSAMPLE(elemptr[`0`]) + GETJSAMPLE(elemptr[`4`]);
602	tmp1 = GETJSAMPLE(elemptr[`1`]) + GETJSAMPLE(elemptr[`3`]);
603	tmp2 = GETJSAMPLE(elemptr[`2`]);
604
605	tmp10 = tmp0 + tmp1;
606	tmp11 = tmp0 - tmp1;
607
608	tmp0 = GETJSAMPLE(elemptr[`0`]) - GETJSAMPLE(elemptr[`4`]);
609	tmp1 = GETJSAMPLE(elemptr[`1`]) - GETJSAMPLE(elemptr[`3`]);
610
611	/ Apply unsigned->signed conversion /
612	dataptr[`0`] = (DCTELEM)
613	((tmp10 + tmp2 - `5` * CENTERJSAMPLE) << (PASS1_BITS+`1`));
614	tmp11 = MULTIPLY(tmp11, FIX(`0.790569415`)); / (c2+c4)/2 /
615	tmp10 -= tmp2 << `2`;
616	tmp10 = MULTIPLY(tmp10, FIX(`0.353553391`)); / (c2-c4)/2 /
617	dataptr[`2`] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS-`1`);
618	dataptr[`4`] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS-`1`);
619
620	/ Odd part /
621
622	tmp10 = MULTIPLY(tmp0 + tmp1, FIX(`0.831253876`)); / c3 /
623
624	dataptr[`1`] = (DCTELEM)
625	DESCALE(tmp10 + MULTIPLY(tmp0, FIX(`0.513743148`)), / c1-c3 /
626	CONST_BITS-PASS1_BITS-`1`);
627	dataptr[`3`] = (DCTELEM)
628	DESCALE(tmp10 - MULTIPLY(tmp1, FIX(`2.176250899`)), / c1+c3 /
629	CONST_BITS-PASS1_BITS-`1`);
630
631	dataptr += DCTSIZE; / advance pointer to next row /
632	}
633
634	/ Pass 2: process columns.*
635	* We remove the PASS1_BITS scaling, but leave the results scaled up
636	* by an overall factor of 8.
637	* We must also scale the output by (8/5)**2 = 64/25, which we partially
638	* fold into the constant multipliers (other part was done in pass 1):
639	* cK now represents sqrt(2) * cos(Kpi/10) 32/25.
640	*/
641
642	dataptr = data;
643	for (ctr = `0`; ctr < `5`; ctr++) {
644	/ Even part /
645
646	tmp0 = dataptr[DCTSIZE`0`] + dataptr[DCTSIZE`4`];
647	tmp1 = dataptr[DCTSIZE`1`] + dataptr[DCTSIZE`3`];
648	tmp2 = dataptr[DCTSIZE*`2`];
649
650	tmp10 = tmp0 + tmp1;
651	tmp11 = tmp0 - tmp1;
652
653	tmp0 = dataptr[DCTSIZE`0`] - dataptr[DCTSIZE`4`];
654	tmp1 = dataptr[DCTSIZE`1`] - dataptr[DCTSIZE`3`];
655
656	dataptr[DCTSIZE*`0`] = (DCTELEM)
657	DESCALE(MULTIPLY(tmp10 + tmp2, FIX(`1.28`)), / 32/25 /
658	CONST_BITS+PASS1_BITS);
659	tmp11 = MULTIPLY(tmp11, FIX(`1.011928851`)); / (c2+c4)/2 /
660	tmp10 -= tmp2 << `2`;
661	tmp10 = MULTIPLY(tmp10, FIX(`0.452548340`)); / (c2-c4)/2 /
662	dataptr[DCTSIZE*`2`] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS);
663	dataptr[DCTSIZE*`4`] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS);
664
665	/ Odd part /
666
667	tmp10 = MULTIPLY(tmp0 + tmp1, FIX(`1.064004961`)); / c3 /
668
669	dataptr[DCTSIZE*`1`] = (DCTELEM)
670	DESCALE(tmp10 + MULTIPLY(tmp0, FIX(`0.657591230`)), / c1-c3 /
671	CONST_BITS+PASS1_BITS);
672	dataptr[DCTSIZE*`3`] = (DCTELEM)
673	DESCALE(tmp10 - MULTIPLY(tmp1, FIX(`2.785601151`)), / c1+c3 /
674	CONST_BITS+PASS1_BITS);
675
676	dataptr++; / advance pointer to next column /
677	}
678	}
679
680
681	/*
682	* Perform the forward DCT on a 4x4 sample block.
683	*/
684
685	GLOBAL(void)
686	jpeg_fdct_4x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
687	{
688	INT32 tmp0, tmp1;
689	INT32 tmp10, tmp11;
690	DCTELEM *dataptr;
691	JSAMPROW elemptr;
692	int ctr;
693	SHIFT_TEMPS
694
695	/ Pre-zero output coefficient block. /
696	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
697
698	/ Pass 1: process rows. /
699	/ Note results are scaled up by sqrt(8) compared to a true DCT; /
700	/ furthermore, we scale the results by 2*PASS1_BITS. /*
701	/ We must also scale the output by (8/4)2 = 22, which we add here. /
702	/ cK represents sqrt(2) * cos(Kpi/16) [refers to 8-point FDCT]. /*
703
704	dataptr = data;
705	for (ctr = `0`; ctr < `4`; ctr++) {
706	elemptr = sample_data[ctr] + start_col;
707
708	/ Even part /
709
710	tmp0 = GETJSAMPLE(elemptr[`0`]) + GETJSAMPLE(elemptr[`3`]);
711	tmp1 = GETJSAMPLE(elemptr[`1`]) + GETJSAMPLE(elemptr[`2`]);
712
713	tmp10 = GETJSAMPLE(elemptr[`0`]) - GETJSAMPLE(elemptr[`3`]);
714	tmp11 = GETJSAMPLE(elemptr[`1`]) - GETJSAMPLE(elemptr[`2`]);
715
716	/ Apply unsigned->signed conversion /
717	dataptr[`0`] = (DCTELEM)
718	((tmp0 + tmp1 - `4` * CENTERJSAMPLE) << (PASS1_BITS+`2`));
719	dataptr[`2`] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+`2`));
720
721	/ Odd part /
722
723	tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); / c6 /
724	/ Add fudge factor here for final descale. /
725	tmp0 += ONE << (CONST_BITS-PASS1_BITS-`3`);
726
727	dataptr[`1`] = (DCTELEM)
728	RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), / c2-c6 /
729	CONST_BITS-PASS1_BITS-`2`);
730	dataptr[`3`] = (DCTELEM)
731	RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), / c2+c6 /
732	CONST_BITS-PASS1_BITS-`2`);
733
734	dataptr += DCTSIZE; / advance pointer to next row /
735	}
736
737	/ Pass 2: process columns.*
738	* We remove the PASS1_BITS scaling, but leave the results scaled up
739	* by an overall factor of 8.
740	*/
741
742	dataptr = data;
743	for (ctr = `0`; ctr < `4`; ctr++) {
744	/ Even part /
745
746	/ Add fudge factor here for final descale. /
747	tmp0 = dataptr[DCTSIZE`0`] + dataptr[DCTSIZE`3`] + (ONE << (PASS1_BITS-`1`));
748	tmp1 = dataptr[DCTSIZE`1`] + dataptr[DCTSIZE`2`];
749
750	tmp10 = dataptr[DCTSIZE`0`] - dataptr[DCTSIZE`3`];
751	tmp11 = dataptr[DCTSIZE`1`] - dataptr[DCTSIZE`2`];
752
753	dataptr[DCTSIZE*`0`] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
754	dataptr[DCTSIZE*`2`] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
755
756	/ Odd part /
757
758	tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); / c6 /
759	/ Add fudge factor here for final descale. /
760	tmp0 += ONE << (CONST_BITS+PASS1_BITS-`1`);
761
762	dataptr[DCTSIZE*`1`] = (DCTELEM)
763	RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), / c2-c6 /
764	CONST_BITS+PASS1_BITS);
765	dataptr[DCTSIZE*`3`] = (DCTELEM)
766	RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), / c2+c6 /
767	CONST_BITS+PASS1_BITS);
768
769	dataptr++; / advance pointer to next column /
770	}
771	}
772
773
774	/*
775	* Perform the forward DCT on a 3x3 sample block.
776	*/
777
778	GLOBAL(void)
779	jpeg_fdct_3x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
780	{
781	INT32 tmp0, tmp1, tmp2;
782	DCTELEM *dataptr;
783	JSAMPROW elemptr;
784	int ctr;
785	SHIFT_TEMPS
786
787	/ Pre-zero output coefficient block. /
788	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
789
790	/ Pass 1: process rows. /
791	/ Note results are scaled up by sqrt(8) compared to a true DCT; /
792	/ furthermore, we scale the results by 2*PASS1_BITS. /*
793	/ We scale the results further by 2*2 as part of output adaption /*
794	/ scaling for different DCT size. /
795	/ cK represents sqrt(2) * cos(Kpi/6). /*
796
797	dataptr = data;
798	for (ctr = `0`; ctr < `3`; ctr++) {
799	elemptr = sample_data[ctr] + start_col;
800
801	/ Even part /
802
803	tmp0 = GETJSAMPLE(elemptr[`0`]) + GETJSAMPLE(elemptr[`2`]);
804	tmp1 = GETJSAMPLE(elemptr[`1`]);
805
806	tmp2 = GETJSAMPLE(elemptr[`0`]) - GETJSAMPLE(elemptr[`2`]);
807
808	/ Apply unsigned->signed conversion /
809	dataptr[`0`] = (DCTELEM)
810	((tmp0 + tmp1 - `3` * CENTERJSAMPLE) << (PASS1_BITS+`2`));
811	dataptr[`2`] = (DCTELEM)
812	DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(`0.707106781`)), / c2 /
813	CONST_BITS-PASS1_BITS-`2`);
814
815	/ Odd part /
816
817	dataptr[`1`] = (DCTELEM)
818	DESCALE(MULTIPLY(tmp2, FIX(`1.224744871`)), / c1 /
819	CONST_BITS-PASS1_BITS-`2`);
820
821	dataptr += DCTSIZE; / advance pointer to next row /
822	}
823
824	/ Pass 2: process columns.*
825	* We remove the PASS1_BITS scaling, but leave the results scaled up
826	* by an overall factor of 8.
827	* We must also scale the output by (8/3)**2 = 64/9, which we partially
828	* fold into the constant multipliers (other part was done in pass 1):
829	* cK now represents sqrt(2) * cos(Kpi/6) 16/9.
830	*/
831
832	dataptr = data;
833	for (ctr = `0`; ctr < `3`; ctr++) {
834	/ Even part /
835
836	tmp0 = dataptr[DCTSIZE`0`] + dataptr[DCTSIZE`2`];
837	tmp1 = dataptr[DCTSIZE*`1`];
838
839	tmp2 = dataptr[DCTSIZE`0`] - dataptr[DCTSIZE`2`];
840
841	dataptr[DCTSIZE*`0`] = (DCTELEM)
842	DESCALE(MULTIPLY(tmp0 + tmp1, FIX(`1.777777778`)), / 16/9 /
843	CONST_BITS+PASS1_BITS);
844	dataptr[DCTSIZE*`2`] = (DCTELEM)
845	DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(`1.257078722`)), / c2 /
846	CONST_BITS+PASS1_BITS);
847
848	/ Odd part /
849
850	dataptr[DCTSIZE*`1`] = (DCTELEM)
851	DESCALE(MULTIPLY(tmp2, FIX(`2.177324216`)), / c1 /
852	CONST_BITS+PASS1_BITS);
853
854	dataptr++; / advance pointer to next column /
855	}
856	}
857
858
859	/*
860	* Perform the forward DCT on a 2x2 sample block.
861	*/
862
863	GLOBAL(void)
864	jpeg_fdct_2x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
865	{
866	INT32 tmp0, tmp1, tmp2, tmp3;
867	JSAMPROW elemptr;
868
869	/ Pre-zero output coefficient block. /
870	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
871
872	/ Pass 1: process rows. /
873	/ Note results are scaled up by sqrt(8) compared to a true DCT. /
874
875	/ Row 0 /
876	elemptr = sample_data[`0`] + start_col;
877
878	tmp0 = GETJSAMPLE(elemptr[`0`]) + GETJSAMPLE(elemptr[`1`]);
879	tmp1 = GETJSAMPLE(elemptr[`0`]) - GETJSAMPLE(elemptr[`1`]);
880
881	/ Row 1 /
882	elemptr = sample_data[`1`] + start_col;
883
884	tmp2 = GETJSAMPLE(elemptr[`0`]) + GETJSAMPLE(elemptr[`1`]);
885	tmp3 = GETJSAMPLE(elemptr[`0`]) - GETJSAMPLE(elemptr[`1`]);
886
887	/ Pass 2: process columns.*
888	* We leave the results scaled up by an overall factor of 8.
889	* We must also scale the output by (8/2)2 = 24.
890	*/
891
892	/ Column 0 /
893	/ Apply unsigned->signed conversion /
894	data[DCTSIZE`0`] = (DCTELEM) ((tmp0 + tmp2 - `4` CENTERJSAMPLE) << `4`);
895	data[DCTSIZE*`1`] = (DCTELEM) ((tmp0 - tmp2) << `4`);
896
897	/ Column 1 /
898	data[DCTSIZE*`0`+`1`] = (DCTELEM) ((tmp1 + tmp3) << `4`);
899	data[DCTSIZE*`1`+`1`] = (DCTELEM) ((tmp1 - tmp3) << `4`);
900	}
901
902
903	/*
904	* Perform the forward DCT on a 1x1 sample block.
905	*/
906
907	GLOBAL(void)
908	jpeg_fdct_1x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
909	{
910	/ Pre-zero output coefficient block. /
911	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
912
913	/ We leave the result scaled up by an overall factor of 8. /
914	/ We must also scale the output by (8/1)2 = 26. /
915	/ Apply unsigned->signed conversion /
916	data[`0`] = (DCTELEM)
917	((GETJSAMPLE(sample_data[`0`][start_col]) - CENTERJSAMPLE) << `6`);
918	}
919
920
921	/*
922	* Perform the forward DCT on a 9x9 sample block.
923	*/
924
925	GLOBAL(void)
926	jpeg_fdct_9x9 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
927	{
928	INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
929	INT32 tmp10, tmp11, tmp12, tmp13;
930	INT32 z1, z2;
931	DCTELEM workspace[`8`];
932	DCTELEM *dataptr;
933	DCTELEM *wsptr;
934	JSAMPROW elemptr;
935	int ctr;
936	SHIFT_TEMPS
937
938	/ Pass 1: process rows. /
939	/ Note results are scaled up by sqrt(8) compared to a true DCT; /
940	/ we scale the results further by 2 as part of output adaption /
941	/ scaling for different DCT size. /
942	/ cK represents sqrt(2) * cos(Kpi/18). /*
943
944	dataptr = data;
945	ctr = `0`;
946	for (;;) {
947	elemptr = sample_data[ctr] + start_col;
948
949	/ Even part /
950
951	tmp0 = GETJSAMPLE(elemptr[`0`]) + GETJSAMPLE(elemptr[`8`]);
952	tmp1 = GETJSAMPLE(elemptr[`1`]) + GETJSAMPLE(elemptr[`7`]);
953	tmp2 = GETJSAMPLE(elemptr[`2`]) + GETJSAMPLE(elemptr[`6`]);
954	tmp3 = GETJSAMPLE(elemptr[`3`]) + GETJSAMPLE(elemptr[`5`]);
955	tmp4 = GETJSAMPLE(elemptr[`4`]);
956
957	tmp10 = GETJSAMPLE(elemptr[`0`]) - GETJSAMPLE(elemptr[`8`]);
958	tmp11 = GETJSAMPLE(elemptr[`1`]) - GETJSAMPLE(elemptr[`7`]);
959	tmp12 = GETJSAMPLE(elemptr[`2`]) - GETJSAMPLE(elemptr[`6`]);
960	tmp13 = GETJSAMPLE(elemptr[`3`]) - GETJSAMPLE(elemptr[`5`]);
961
962	z1 = tmp0 + tmp2 + tmp3;
963	z2 = tmp1 + tmp4;
964	/ Apply unsigned->signed conversion /
965	dataptr[`0`] = (DCTELEM) ((z1 + z2 - `9` * CENTERJSAMPLE) << `1`);
966	dataptr[`6`] = (DCTELEM)
967	DESCALE(MULTIPLY(z1 - z2 - z2, FIX(`0.707106781`)), / c6 /
968	CONST_BITS-`1`);
969	z1 = MULTIPLY(tmp0 - tmp2, FIX(`1.328926049`)); / c2 /
970	z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(`0.707106781`)); / c6 /
971	dataptr[`2`] = (DCTELEM)
972	DESCALE(MULTIPLY(tmp2 - tmp3, FIX(`1.083350441`)) / c4 /
973	+ z1 + z2, CONST_BITS-`1`);
974	dataptr[`4`] = (DCTELEM)
975	DESCALE(MULTIPLY(tmp3 - tmp0, FIX(`0.245575608`)) / c8 /
976	+ z1 - z2, CONST_BITS-`1`);
977
978	/ Odd part /
979
980	dataptr[`3`] = (DCTELEM)
981	DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(`1.224744871`)), / c3 /
982	CONST_BITS-`1`);
983
984	tmp11 = MULTIPLY(tmp11, FIX(`1.224744871`)); / c3 /
985	tmp0 = MULTIPLY(tmp10 + tmp12, FIX(`0.909038955`)); / c5 /
986	tmp1 = MULTIPLY(tmp10 + tmp13, FIX(`0.483689525`)); / c7 /
987
988	dataptr[`1`] = (DCTELEM) DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS-`1`);
989
990	tmp2 = MULTIPLY(tmp12 - tmp13, FIX(`1.392728481`)); / c1 /
991
992	dataptr[`5`] = (DCTELEM) DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS-`1`);
993	dataptr[`7`] = (DCTELEM) DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS-`1`);
994
995	ctr++;
996
997	if (ctr != DCTSIZE) {
998	if (ctr == `9`)
999	break; / Done. /
1000	dataptr += DCTSIZE; / advance pointer to next row /
1001	} else
1002	dataptr = workspace; / switch pointer to extended workspace /
1003	}
1004
1005	/ Pass 2: process columns.*
1006	* We leave the results scaled up by an overall factor of 8.
1007	* We must also scale the output by (8/9)**2 = 64/81, which we partially
1008	* fold into the constant multipliers and final/initial shifting:
1009	* cK now represents sqrt(2) * cos(Kpi/18) 128/81.
1010	*/
1011
1012	dataptr = data;
1013	wsptr = workspace;
1014	for (ctr = DCTSIZE-`1`; ctr >= `0`; ctr--) {
1015	/ Even part /
1016
1017	tmp0 = dataptr[DCTSIZE`0`] + wsptr[DCTSIZE`0`];
1018	tmp1 = dataptr[DCTSIZE`1`] + dataptr[DCTSIZE`7`];
1019	tmp2 = dataptr[DCTSIZE`2`] + dataptr[DCTSIZE`6`];
1020	tmp3 = dataptr[DCTSIZE`3`] + dataptr[DCTSIZE`5`];
1021	tmp4 = dataptr[DCTSIZE*`4`];
1022
1023	tmp10 = dataptr[DCTSIZE`0`] - wsptr[DCTSIZE`0`];
1024	tmp11 = dataptr[DCTSIZE`1`] - dataptr[DCTSIZE`7`];
1025	tmp12 = dataptr[DCTSIZE`2`] - dataptr[DCTSIZE`6`];
1026	tmp13 = dataptr[DCTSIZE`3`] - dataptr[DCTSIZE`5`];
1027
1028	z1 = tmp0 + tmp2 + tmp3;
1029	z2 = tmp1 + tmp4;
1030	dataptr[DCTSIZE*`0`] = (DCTELEM)
1031	DESCALE(MULTIPLY(z1 + z2, FIX(`1.580246914`)), / 128/81 /
1032	CONST_BITS+`2`);
1033	dataptr[DCTSIZE*`6`] = (DCTELEM)
1034	DESCALE(MULTIPLY(z1 - z2 - z2, FIX(`1.117403309`)), / c6 /
1035	CONST_BITS+`2`);
1036	z1 = MULTIPLY(tmp0 - tmp2, FIX(`2.100031287`)); / c2 /
1037	z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(`1.117403309`)); / c6 /
1038	dataptr[DCTSIZE*`2`] = (DCTELEM)
1039	DESCALE(MULTIPLY(tmp2 - tmp3, FIX(`1.711961190`)) / c4 /
1040	+ z1 + z2, CONST_BITS+`2`);
1041	dataptr[DCTSIZE*`4`] = (DCTELEM)
1042	DESCALE(MULTIPLY(tmp3 - tmp0, FIX(`0.388070096`)) / c8 /
1043	+ z1 - z2, CONST_BITS+`2`);
1044
1045	/ Odd part /
1046
1047	dataptr[DCTSIZE*`3`] = (DCTELEM)
1048	DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(`1.935399303`)), / c3 /
1049	CONST_BITS+`2`);
1050
1051	tmp11 = MULTIPLY(tmp11, FIX(`1.935399303`)); / c3 /
1052	tmp0 = MULTIPLY(tmp10 + tmp12, FIX(`1.436506004`)); / c5 /
1053	tmp1 = MULTIPLY(tmp10 + tmp13, FIX(`0.764348879`)); / c7 /
1054
1055	dataptr[DCTSIZE*`1`] = (DCTELEM)
1056	DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS+`2`);
1057
1058	tmp2 = MULTIPLY(tmp12 - tmp13, FIX(`2.200854883`)); / c1 /
1059
1060	dataptr[DCTSIZE*`5`] = (DCTELEM)
1061	DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS+`2`);
1062	dataptr[DCTSIZE*`7`] = (DCTELEM)
1063	DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS+`2`);
1064
1065	dataptr++; / advance pointer to next column /
1066	wsptr++; / advance pointer to next column /
1067	}
1068	}
1069
1070
1071	/*
1072	* Perform the forward DCT on a 10x10 sample block.
1073	*/
1074
1075	GLOBAL(void)
1076	jpeg_fdct_10x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1077	{
1078	INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
1079	INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1080	DCTELEM workspace[`8`*`2`];
1081	DCTELEM *dataptr;
1082	DCTELEM *wsptr;
1083	JSAMPROW elemptr;
1084	int ctr;
1085	SHIFT_TEMPS
1086
1087	/ Pass 1: process rows. /
1088	/ Note results are scaled up by sqrt(8) compared to a true DCT; /
1089	/ we scale the results further by 2 as part of output adaption /
1090	/ scaling for different DCT size. /
1091	/ cK represents sqrt(2) * cos(Kpi/20). /*
1092
1093	dataptr = data;
1094	ctr = `0`;
1095	for (;;) {
1096	elemptr = sample_data[ctr] + start_col;
1097
1098	/ Even part /
1099
1100	tmp0 = GETJSAMPLE(elemptr[`0`]) + GETJSAMPLE(elemptr[`9`]);
1101	tmp1 = GETJSAMPLE(elemptr[`1`]) + GETJSAMPLE(elemptr[`8`]);
1102	tmp12 = GETJSAMPLE(elemptr[`2`]) + GETJSAMPLE(elemptr[`7`]);
1103	tmp3 = GETJSAMPLE(elemptr[`3`]) + GETJSAMPLE(elemptr[`6`]);
1104	tmp4 = GETJSAMPLE(elemptr[`4`]) + GETJSAMPLE(elemptr[`5`]);
1105
1106	tmp10 = tmp0 + tmp4;
1107	tmp13 = tmp0 - tmp4;
1108	tmp11 = tmp1 + tmp3;
1109	tmp14 = tmp1 - tmp3;
1110
1111	tmp0 = GETJSAMPLE(elemptr[`0`]) - GETJSAMPLE(elemptr[`9`]);
1112	tmp1 = GETJSAMPLE(elemptr[`1`]) - GETJSAMPLE(elemptr[`8`]);
1113	tmp2 = GETJSAMPLE(elemptr[`2`]) - GETJSAMPLE(elemptr[`7`]);
1114	tmp3 = GETJSAMPLE(elemptr[`3`]) - GETJSAMPLE(elemptr[`6`]);
1115	tmp4 = GETJSAMPLE(elemptr[`4`]) - GETJSAMPLE(elemptr[`5`]);
1116
1117	/ Apply unsigned->signed conversion /
1118	dataptr[`0`] = (DCTELEM)
1119	((tmp10 + tmp11 + tmp12 - `10` * CENTERJSAMPLE) << `1`);
1120	tmp12 += tmp12;
1121	dataptr[`4`] = (DCTELEM)
1122	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(`1.144122806`)) - / c4 /
1123	MULTIPLY(tmp11 - tmp12, FIX(`0.437016024`)), / c8 /
1124	CONST_BITS-`1`);
1125	tmp10 = MULTIPLY(tmp13 + tmp14, FIX(`0.831253876`)); / c6 /
1126	dataptr[`2`] = (DCTELEM)
1127	DESCALE(tmp10 + MULTIPLY(tmp13, FIX(`0.513743148`)), / c2-c6 /
1128	CONST_BITS-`1`);
1129	dataptr[`6`] = (DCTELEM)
1130	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(`2.176250899`)), / c2+c6 /
1131	CONST_BITS-`1`);
1132
1133	/ Odd part /
1134
1135	tmp10 = tmp0 + tmp4;
1136	tmp11 = tmp1 - tmp3;
1137	dataptr[`5`] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << `1`);
1138	tmp2 <<= CONST_BITS;
1139	dataptr[`1`] = (DCTELEM)
1140	DESCALE(MULTIPLY(tmp0, FIX(`1.396802247`)) + / c1 /
1141	MULTIPLY(tmp1, FIX(`1.260073511`)) + tmp2 + / c3 /
1142	MULTIPLY(tmp3, FIX(`0.642039522`)) + / c7 /
1143	MULTIPLY(tmp4, FIX(`0.221231742`)), / c9 /
1144	CONST_BITS-`1`);
1145	tmp12 = MULTIPLY(tmp0 - tmp4, FIX(`0.951056516`)) - / (c3+c7)/2 /
1146	MULTIPLY(tmp1 + tmp3, FIX(`0.587785252`)); / (c1-c9)/2 /
1147	tmp13 = MULTIPLY(tmp10 + tmp11, FIX(`0.309016994`)) + / (c3-c7)/2 /
1148	(tmp11 << (CONST_BITS - `1`)) - tmp2;
1149	dataptr[`3`] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-`1`);
1150	dataptr[`7`] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-`1`);
1151
1152	ctr++;
1153
1154	if (ctr != DCTSIZE) {
1155	if (ctr == `10`)
1156	break; / Done. /
1157	dataptr += DCTSIZE; / advance pointer to next row /
1158	} else
1159	dataptr = workspace; / switch pointer to extended workspace /
1160	}
1161
1162	/ Pass 2: process columns.*
1163	* We leave the results scaled up by an overall factor of 8.
1164	* We must also scale the output by (8/10)**2 = 16/25, which we partially
1165	* fold into the constant multipliers and final/initial shifting:
1166	* cK now represents sqrt(2) * cos(Kpi/20) 32/25.
1167	*/
1168
1169	dataptr = data;
1170	wsptr = workspace;
1171	for (ctr = DCTSIZE-`1`; ctr >= `0`; ctr--) {
1172	/ Even part /
1173
1174	tmp0 = dataptr[DCTSIZE`0`] + wsptr[DCTSIZE`1`];
1175	tmp1 = dataptr[DCTSIZE`1`] + wsptr[DCTSIZE`0`];
1176	tmp12 = dataptr[DCTSIZE`2`] + dataptr[DCTSIZE`7`];
1177	tmp3 = dataptr[DCTSIZE`3`] + dataptr[DCTSIZE`6`];
1178	tmp4 = dataptr[DCTSIZE`4`] + dataptr[DCTSIZE`5`];
1179
1180	tmp10 = tmp0 + tmp4;
1181	tmp13 = tmp0 - tmp4;
1182	tmp11 = tmp1 + tmp3;
1183	tmp14 = tmp1 - tmp3;
1184
1185	tmp0 = dataptr[DCTSIZE`0`] - wsptr[DCTSIZE`1`];
1186	tmp1 = dataptr[DCTSIZE`1`] - wsptr[DCTSIZE`0`];
1187	tmp2 = dataptr[DCTSIZE`2`] - dataptr[DCTSIZE`7`];
1188	tmp3 = dataptr[DCTSIZE`3`] - dataptr[DCTSIZE`6`];
1189	tmp4 = dataptr[DCTSIZE`4`] - dataptr[DCTSIZE`5`];
1190
1191	dataptr[DCTSIZE*`0`] = (DCTELEM)
1192	DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(`1.28`)), / 32/25 /
1193	CONST_BITS+`2`);
1194	tmp12 += tmp12;
1195	dataptr[DCTSIZE*`4`] = (DCTELEM)
1196	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(`1.464477191`)) - / c4 /
1197	MULTIPLY(tmp11 - tmp12, FIX(`0.559380511`)), / c8 /
1198	CONST_BITS+`2`);
1199	tmp10 = MULTIPLY(tmp13 + tmp14, FIX(`1.064004961`)); / c6 /
1200	dataptr[DCTSIZE*`2`] = (DCTELEM)
1201	DESCALE(tmp10 + MULTIPLY(tmp13, FIX(`0.657591230`)), / c2-c6 /
1202	CONST_BITS+`2`);
1203	dataptr[DCTSIZE*`6`] = (DCTELEM)
1204	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(`2.785601151`)), / c2+c6 /
1205	CONST_BITS+`2`);
1206
1207	/ Odd part /
1208
1209	tmp10 = tmp0 + tmp4;
1210	tmp11 = tmp1 - tmp3;
1211	dataptr[DCTSIZE*`5`] = (DCTELEM)
1212	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(`1.28`)), / 32/25 /
1213	CONST_BITS+`2`);
1214	tmp2 = MULTIPLY(tmp2, FIX(`1.28`)); / 32/25 /
1215	dataptr[DCTSIZE*`1`] = (DCTELEM)
1216	DESCALE(MULTIPLY(tmp0, FIX(`1.787906876`)) + / c1 /
1217	MULTIPLY(tmp1, FIX(`1.612894094`)) + tmp2 + / c3 /
1218	MULTIPLY(tmp3, FIX(`0.821810588`)) + / c7 /
1219	MULTIPLY(tmp4, FIX(`0.283176630`)), / c9 /
1220	CONST_BITS+`2`);
1221	tmp12 = MULTIPLY(tmp0 - tmp4, FIX(`1.217352341`)) - / (c3+c7)/2 /
1222	MULTIPLY(tmp1 + tmp3, FIX(`0.752365123`)); / (c1-c9)/2 /
1223	tmp13 = MULTIPLY(tmp10 + tmp11, FIX(`0.395541753`)) + / (c3-c7)/2 /
1224	MULTIPLY(tmp11, FIX(`0.64`)) - tmp2; / 16/25 /
1225	dataptr[DCTSIZE*`3`] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+`2`);
1226	dataptr[DCTSIZE*`7`] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+`2`);
1227
1228	dataptr++; / advance pointer to next column /
1229	wsptr++; / advance pointer to next column /
1230	}
1231	}
1232
1233
1234	/*
1235	* Perform the forward DCT on an 11x11 sample block.
1236	*/
1237
1238	GLOBAL(void)
1239	jpeg_fdct_11x11 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1240	{
1241	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1242	INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1243	INT32 z1, z2, z3;
1244	DCTELEM workspace[`8`*`3`];
1245	DCTELEM *dataptr;
1246	DCTELEM *wsptr;
1247	JSAMPROW elemptr;
1248	int ctr;
1249	SHIFT_TEMPS
1250
1251	/ Pass 1: process rows. /
1252	/ Note results are scaled up by sqrt(8) compared to a true DCT; /
1253	/ we scale the results further by 2 as part of output adaption /
1254	/ scaling for different DCT size. /
1255	/ cK represents sqrt(2) * cos(Kpi/22). /*
1256
1257	dataptr = data;
1258	ctr = `0`;
1259	for (;;) {
1260	elemptr = sample_data[ctr] + start_col;
1261
1262	/ Even part /
1263
1264	tmp0 = GETJSAMPLE(elemptr[`0`]) + GETJSAMPLE(elemptr[`10`]);
1265	tmp1 = GETJSAMPLE(elemptr[`1`]) + GETJSAMPLE(elemptr[`9`]);
1266	tmp2 = GETJSAMPLE(elemptr[`2`]) + GETJSAMPLE(elemptr[`8`]);
1267	tmp3 = GETJSAMPLE(elemptr[`3`]) + GETJSAMPLE(elemptr[`7`]);
1268	tmp4 = GETJSAMPLE(elemptr[`4`]) + GETJSAMPLE(elemptr[`6`]);
1269	tmp5 = GETJSAMPLE(elemptr[`5`]);
1270
1271	tmp10 = GETJSAMPLE(elemptr[`0`]) - GETJSAMPLE(elemptr[`10`]);
1272	tmp11 = GETJSAMPLE(elemptr[`1`]) - GETJSAMPLE(elemptr[`9`]);
1273	tmp12 = GETJSAMPLE(elemptr[`2`]) - GETJSAMPLE(elemptr[`8`]);
1274	tmp13 = GETJSAMPLE(elemptr[`3`]) - GETJSAMPLE(elemptr[`7`]);
1275	tmp14 = GETJSAMPLE(elemptr[`4`]) - GETJSAMPLE(elemptr[`6`]);
1276
1277	/ Apply unsigned->signed conversion /
1278	dataptr[`0`] = (DCTELEM)
1279	((tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 - `11` * CENTERJSAMPLE) << `1`);
1280	tmp5 += tmp5;
1281	tmp0 -= tmp5;
1282	tmp1 -= tmp5;
1283	tmp2 -= tmp5;
1284	tmp3 -= tmp5;
1285	tmp4 -= tmp5;
1286	z1 = MULTIPLY(tmp0 + tmp3, FIX(`1.356927976`)) + / c2 /
1287	MULTIPLY(tmp2 + tmp4, FIX(`0.201263574`)); / c10 /
1288	z2 = MULTIPLY(tmp1 - tmp3, FIX(`0.926112931`)); / c6 /
1289	z3 = MULTIPLY(tmp0 - tmp1, FIX(`1.189712156`)); / c4 /
1290	dataptr[`2`] = (DCTELEM)
1291	DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(`1.018300590`)) / c2+c8-c6 /
1292	- MULTIPLY(tmp4, FIX(`1.390975730`)), / c4+c10 /
1293	CONST_BITS-`1`);
1294	dataptr[`4`] = (DCTELEM)
1295	DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(`0.062335650`)) / c4-c6-c10 /
1296	- MULTIPLY(tmp2, FIX(`1.356927976`)) / c2 /
1297	+ MULTIPLY(tmp4, FIX(`0.587485545`)), / c8 /
1298	CONST_BITS-`1`);
1299	dataptr[`6`] = (DCTELEM)
1300	DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(`1.620527200`)) / c2+c4-c6 /
1301	- MULTIPLY(tmp2, FIX(`0.788749120`)), / c8+c10 /
1302	CONST_BITS-`1`);
1303
1304	/ Odd part /
1305
1306	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(`1.286413905`)); / c3 /
1307	tmp2 = MULTIPLY(tmp10 + tmp12, FIX(`1.068791298`)); / c5 /
1308	tmp3 = MULTIPLY(tmp10 + tmp13, FIX(`0.764581576`)); / c7 /
1309	tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(`1.719967871`)) / c7+c5+c3-c1 /
1310	+ MULTIPLY(tmp14, FIX(`0.398430003`)); / c9 /
1311	tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(`0.764581576`)); / -c7 /
1312	tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(`1.399818907`)); / -c1 /
1313	tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(`1.276416582`)) / c9+c7+c1-c3 /
1314	- MULTIPLY(tmp14, FIX(`1.068791298`)); / c5 /
1315	tmp10 = MULTIPLY(tmp12 + tmp13, FIX(`0.398430003`)); / c9 /
1316	tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(`1.989053629`)) / c9+c5+c3-c7 /
1317	+ MULTIPLY(tmp14, FIX(`1.399818907`)); / c1 /
1318	tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(`1.305598626`)) / c1+c5-c9-c7 /
1319	- MULTIPLY(tmp14, FIX(`1.286413905`)); / c3 /
1320
1321	dataptr[`1`] = (DCTELEM) DESCALE(tmp0, CONST_BITS-`1`);
1322	dataptr[`3`] = (DCTELEM) DESCALE(tmp1, CONST_BITS-`1`);
1323	dataptr[`5`] = (DCTELEM) DESCALE(tmp2, CONST_BITS-`1`);
1324	dataptr[`7`] = (DCTELEM) DESCALE(tmp3, CONST_BITS-`1`);
1325
1326	ctr++;
1327
1328	if (ctr != DCTSIZE) {
1329	if (ctr == `11`)
1330	break; / Done. /
1331	dataptr += DCTSIZE; / advance pointer to next row /
1332	} else
1333	dataptr = workspace; / switch pointer to extended workspace /
1334	}
1335
1336	/ Pass 2: process columns.*
1337	* We leave the results scaled up by an overall factor of 8.
1338	* We must also scale the output by (8/11)**2 = 64/121, which we partially
1339	* fold into the constant multipliers and final/initial shifting:
1340	* cK now represents sqrt(2) * cos(Kpi/22) 128/121.
1341	*/
1342
1343	dataptr = data;
1344	wsptr = workspace;
1345	for (ctr = DCTSIZE-`1`; ctr >= `0`; ctr--) {
1346	/ Even part /
1347
1348	tmp0 = dataptr[DCTSIZE`0`] + wsptr[DCTSIZE`2`];
1349	tmp1 = dataptr[DCTSIZE`1`] + wsptr[DCTSIZE`1`];
1350	tmp2 = dataptr[DCTSIZE`2`] + wsptr[DCTSIZE`0`];
1351	tmp3 = dataptr[DCTSIZE`3`] + dataptr[DCTSIZE`7`];
1352	tmp4 = dataptr[DCTSIZE`4`] + dataptr[DCTSIZE`6`];
1353	tmp5 = dataptr[DCTSIZE*`5`];
1354
1355	tmp10 = dataptr[DCTSIZE`0`] - wsptr[DCTSIZE`2`];
1356	tmp11 = dataptr[DCTSIZE`1`] - wsptr[DCTSIZE`1`];
1357	tmp12 = dataptr[DCTSIZE`2`] - wsptr[DCTSIZE`0`];
1358	tmp13 = dataptr[DCTSIZE`3`] - dataptr[DCTSIZE`7`];
1359	tmp14 = dataptr[DCTSIZE`4`] - dataptr[DCTSIZE`6`];
1360
1361	dataptr[DCTSIZE*`0`] = (DCTELEM)
1362	DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5,
1363	FIX(`1.057851240`)), / 128/121 /
1364	CONST_BITS+`2`);
1365	tmp5 += tmp5;
1366	tmp0 -= tmp5;
1367	tmp1 -= tmp5;
1368	tmp2 -= tmp5;
1369	tmp3 -= tmp5;
1370	tmp4 -= tmp5;
1371	z1 = MULTIPLY(tmp0 + tmp3, FIX(`1.435427942`)) + / c2 /
1372	MULTIPLY(tmp2 + tmp4, FIX(`0.212906922`)); / c10 /
1373	z2 = MULTIPLY(tmp1 - tmp3, FIX(`0.979689713`)); / c6 /
1374	z3 = MULTIPLY(tmp0 - tmp1, FIX(`1.258538479`)); / c4 /
1375	dataptr[DCTSIZE*`2`] = (DCTELEM)
1376	DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(`1.077210542`)) / c2+c8-c6 /
1377	- MULTIPLY(tmp4, FIX(`1.471445400`)), / c4+c10 /
1378	CONST_BITS+`2`);
1379	dataptr[DCTSIZE*`4`] = (DCTELEM)
1380	DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(`0.065941844`)) / c4-c6-c10 /
1381	- MULTIPLY(tmp2, FIX(`1.435427942`)) / c2 /
1382	+ MULTIPLY(tmp4, FIX(`0.621472312`)), / c8 /
1383	CONST_BITS+`2`);
1384	dataptr[DCTSIZE*`6`] = (DCTELEM)
1385	DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(`1.714276708`)) / c2+c4-c6 /
1386	- MULTIPLY(tmp2, FIX(`0.834379234`)), / c8+c10 /
1387	CONST_BITS+`2`);
1388
1389	/ Odd part /
1390
1391	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(`1.360834544`)); / c3 /
1392	tmp2 = MULTIPLY(tmp10 + tmp12, FIX(`1.130622199`)); / c5 /
1393	tmp3 = MULTIPLY(tmp10 + tmp13, FIX(`0.808813568`)); / c7 /
1394	tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(`1.819470145`)) / c7+c5+c3-c1 /
1395	+ MULTIPLY(tmp14, FIX(`0.421479672`)); / c9 /
1396	tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(`0.808813568`)); / -c7 /
1397	tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(`1.480800167`)); / -c1 /
1398	tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(`1.350258864`)) / c9+c7+c1-c3 /
1399	- MULTIPLY(tmp14, FIX(`1.130622199`)); / c5 /
1400	tmp10 = MULTIPLY(tmp12 + tmp13, FIX(`0.421479672`)); / c9 /
1401	tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(`2.104122847`)) / c9+c5+c3-c7 /
1402	+ MULTIPLY(tmp14, FIX(`1.480800167`)); / c1 /
1403	tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(`1.381129125`)) / c1+c5-c9-c7 /
1404	- MULTIPLY(tmp14, FIX(`1.360834544`)); / c3 /
1405
1406	dataptr[DCTSIZE*`1`] = (DCTELEM) DESCALE(tmp0, CONST_BITS+`2`);
1407	dataptr[DCTSIZE*`3`] = (DCTELEM) DESCALE(tmp1, CONST_BITS+`2`);
1408	dataptr[DCTSIZE*`5`] = (DCTELEM) DESCALE(tmp2, CONST_BITS+`2`);
1409	dataptr[DCTSIZE*`7`] = (DCTELEM) DESCALE(tmp3, CONST_BITS+`2`);
1410
1411	dataptr++; / advance pointer to next column /
1412	wsptr++; / advance pointer to next column /
1413	}
1414	}
1415
1416
1417	/*
1418	* Perform the forward DCT on a 12x12 sample block.
1419	*/
1420
1421	GLOBAL(void)
1422	jpeg_fdct_12x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1423	{
1424	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1425	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1426	DCTELEM workspace[`8`*`4`];
1427	DCTELEM *dataptr;
1428	DCTELEM *wsptr;
1429	JSAMPROW elemptr;
1430	int ctr;
1431	SHIFT_TEMPS
1432
1433	/ Pass 1: process rows. /
1434	/ Note results are scaled up by sqrt(8) compared to a true DCT. /
1435	/ cK represents sqrt(2) * cos(Kpi/24). /*
1436
1437	dataptr = data;
1438	ctr = `0`;
1439	for (;;) {
1440	elemptr = sample_data[ctr] + start_col;
1441
1442	/ Even part /
1443
1444	tmp0 = GETJSAMPLE(elemptr[`0`]) + GETJSAMPLE(elemptr[`11`]);
1445	tmp1 = GETJSAMPLE(elemptr[`1`]) + GETJSAMPLE(elemptr[`10`]);
1446	tmp2 = GETJSAMPLE(elemptr[`2`]) + GETJSAMPLE(elemptr[`9`]);
1447	tmp3 = GETJSAMPLE(elemptr[`3`]) + GETJSAMPLE(elemptr[`8`]);
1448	tmp4 = GETJSAMPLE(elemptr[`4`]) + GETJSAMPLE(elemptr[`7`]);
1449	tmp5 = GETJSAMPLE(elemptr[`5`]) + GETJSAMPLE(elemptr[`6`]);
1450
1451	tmp10 = tmp0 + tmp5;
1452	tmp13 = tmp0 - tmp5;
1453	tmp11 = tmp1 + tmp4;
1454	tmp14 = tmp1 - tmp4;
1455	tmp12 = tmp2 + tmp3;
1456	tmp15 = tmp2 - tmp3;
1457
1458	tmp0 = GETJSAMPLE(elemptr[`0`]) - GETJSAMPLE(elemptr[`11`]);
1459	tmp1 = GETJSAMPLE(elemptr[`1`]) - GETJSAMPLE(elemptr[`10`]);
1460	tmp2 = GETJSAMPLE(elemptr[`2`]) - GETJSAMPLE(elemptr[`9`]);
1461	tmp3 = GETJSAMPLE(elemptr[`3`]) - GETJSAMPLE(elemptr[`8`]);
1462	tmp4 = GETJSAMPLE(elemptr[`4`]) - GETJSAMPLE(elemptr[`7`]);
1463	tmp5 = GETJSAMPLE(elemptr[`5`]) - GETJSAMPLE(elemptr[`6`]);
1464
1465	/ Apply unsigned->signed conversion /
1466	dataptr[`0`] = (DCTELEM) (tmp10 + tmp11 + tmp12 - `12` * CENTERJSAMPLE);
1467	dataptr[`6`] = (DCTELEM) (tmp13 - tmp14 - tmp15);
1468	dataptr[`4`] = (DCTELEM)
1469	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(`1.224744871`)), / c4 /
1470	CONST_BITS);
1471	dataptr[`2`] = (DCTELEM)
1472	DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(`1.366025404`)), / c2 /
1473	CONST_BITS);
1474
1475	/ Odd part /
1476
1477	tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100); / c9 /
1478	tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865); / c3-c9 /
1479	tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065); / c3+c9 /
1480	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(`1.121971054`)); / c5 /
1481	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(`0.860918669`)); / c7 /
1482	tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(`0.580774953`)) / c5+c7-c1 /
1483	+ MULTIPLY(tmp5, FIX(`0.184591911`)); / c11 /
1484	tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(`0.184591911`)); / -c11 /
1485	tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(`2.339493912`)) / c1+c5-c11 /
1486	+ MULTIPLY(tmp5, FIX(`0.860918669`)); / c7 /
1487	tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(`0.725788011`)) / c1+c11-c7 /
1488	- MULTIPLY(tmp5, FIX(`1.121971054`)); / c5 /
1489	tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(`1.306562965`)) / c3 /
1490	- MULTIPLY(tmp2 + tmp5, FIX_0_541196100); / c9 /
1491
1492	dataptr[`1`] = (DCTELEM) DESCALE(tmp10, CONST_BITS);
1493	dataptr[`3`] = (DCTELEM) DESCALE(tmp11, CONST_BITS);
1494	dataptr[`5`] = (DCTELEM) DESCALE(tmp12, CONST_BITS);
1495	dataptr[`7`] = (DCTELEM) DESCALE(tmp13, CONST_BITS);
1496
1497	ctr++;
1498
1499	if (ctr != DCTSIZE) {
1500	if (ctr == `12`)
1501	break; / Done. /
1502	dataptr += DCTSIZE; / advance pointer to next row /
1503	} else
1504	dataptr = workspace; / switch pointer to extended workspace /
1505	}
1506
1507	/ Pass 2: process columns.*
1508	* We leave the results scaled up by an overall factor of 8.
1509	* We must also scale the output by (8/12)**2 = 4/9, which we partially
1510	* fold into the constant multipliers and final shifting:
1511	* cK now represents sqrt(2) * cos(Kpi/24) 8/9.
1512	*/
1513
1514	dataptr = data;
1515	wsptr = workspace;
1516	for (ctr = DCTSIZE-`1`; ctr >= `0`; ctr--) {
1517	/ Even part /
1518
1519	tmp0 = dataptr[DCTSIZE`0`] + wsptr[DCTSIZE`3`];
1520	tmp1 = dataptr[DCTSIZE`1`] + wsptr[DCTSIZE`2`];
1521	tmp2 = dataptr[DCTSIZE`2`] + wsptr[DCTSIZE`1`];
1522	tmp3 = dataptr[DCTSIZE`3`] + wsptr[DCTSIZE`0`];
1523	tmp4 = dataptr[DCTSIZE`4`] + dataptr[DCTSIZE`7`];
1524	tmp5 = dataptr[DCTSIZE`5`] + dataptr[DCTSIZE`6`];
1525
1526	tmp10 = tmp0 + tmp5;
1527	tmp13 = tmp0 - tmp5;
1528	tmp11 = tmp1 + tmp4;
1529	tmp14 = tmp1 - tmp4;
1530	tmp12 = tmp2 + tmp3;
1531	tmp15 = tmp2 - tmp3;
1532
1533	tmp0 = dataptr[DCTSIZE`0`] - wsptr[DCTSIZE`3`];
1534	tmp1 = dataptr[DCTSIZE`1`] - wsptr[DCTSIZE`2`];
1535	tmp2 = dataptr[DCTSIZE`2`] - wsptr[DCTSIZE`1`];
1536	tmp3 = dataptr[DCTSIZE`3`] - wsptr[DCTSIZE`0`];
1537	tmp4 = dataptr[DCTSIZE`4`] - dataptr[DCTSIZE`7`];
1538	tmp5 = dataptr[DCTSIZE`5`] - dataptr[DCTSIZE`6`];
1539
1540	dataptr[DCTSIZE*`0`] = (DCTELEM)
1541	DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(`0.888888889`)), / 8/9 /
1542	CONST_BITS+`1`);
1543	dataptr[DCTSIZE*`6`] = (DCTELEM)
1544	DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(`0.888888889`)), / 8/9 /
1545	CONST_BITS+`1`);
1546	dataptr[DCTSIZE*`4`] = (DCTELEM)
1547	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(`1.088662108`)), / c4 /
1548	CONST_BITS+`1`);
1549	dataptr[DCTSIZE*`2`] = (DCTELEM)
1550	DESCALE(MULTIPLY(tmp14 - tmp15, FIX(`0.888888889`)) + / 8/9 /
1551	MULTIPLY(tmp13 + tmp15, FIX(`1.214244803`)), / c2 /
1552	CONST_BITS+`1`);
1553
1554	/ Odd part /
1555
1556	tmp10 = MULTIPLY(tmp1 + tmp4, FIX(`0.481063200`)); / c9 /
1557	tmp14 = tmp10 + MULTIPLY(tmp1, FIX(`0.680326102`)); / c3-c9 /
1558	tmp15 = tmp10 - MULTIPLY(tmp4, FIX(`1.642452502`)); / c3+c9 /
1559	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(`0.997307603`)); / c5 /
1560	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(`0.765261039`)); / c7 /
1561	tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(`0.516244403`)) / c5+c7-c1 /
1562	+ MULTIPLY(tmp5, FIX(`0.164081699`)); / c11 /
1563	tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(`0.164081699`)); / -c11 /
1564	tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(`2.079550144`)) / c1+c5-c11 /
1565	+ MULTIPLY(tmp5, FIX(`0.765261039`)); / c7 /
1566	tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(`0.645144899`)) / c1+c11-c7 /
1567	- MULTIPLY(tmp5, FIX(`0.997307603`)); / c5 /
1568	tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(`1.161389302`)) / c3 /
1569	- MULTIPLY(tmp2 + tmp5, FIX(`0.481063200`)); / c9 /
1570
1571	dataptr[DCTSIZE*`1`] = (DCTELEM) DESCALE(tmp10, CONST_BITS+`1`);
1572	dataptr[DCTSIZE*`3`] = (DCTELEM) DESCALE(tmp11, CONST_BITS+`1`);
1573	dataptr[DCTSIZE*`5`] = (DCTELEM) DESCALE(tmp12, CONST_BITS+`1`);
1574	dataptr[DCTSIZE*`7`] = (DCTELEM) DESCALE(tmp13, CONST_BITS+`1`);
1575
1576	dataptr++; / advance pointer to next column /
1577	wsptr++; / advance pointer to next column /
1578	}
1579	}
1580
1581
1582	/*
1583	* Perform the forward DCT on a 13x13 sample block.
1584	*/
1585
1586	GLOBAL(void)
1587	jpeg_fdct_13x13 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1588	{
1589	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1590	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1591	INT32 z1, z2;
1592	DCTELEM workspace[`8`*`5`];
1593	DCTELEM *dataptr;
1594	DCTELEM *wsptr;
1595	JSAMPROW elemptr;
1596	int ctr;
1597	SHIFT_TEMPS
1598
1599	/ Pass 1: process rows. /
1600	/ Note results are scaled up by sqrt(8) compared to a true DCT. /
1601	/ cK represents sqrt(2) * cos(Kpi/26). /*
1602
1603	dataptr = data;
1604	ctr = `0`;
1605	for (;;) {
1606	elemptr = sample_data[ctr] + start_col;
1607
1608	/ Even part /
1609
1610	tmp0 = GETJSAMPLE(elemptr[`0`]) + GETJSAMPLE(elemptr[`12`]);
1611	tmp1 = GETJSAMPLE(elemptr[`1`]) + GETJSAMPLE(elemptr[`11`]);
1612	tmp2 = GETJSAMPLE(elemptr[`2`]) + GETJSAMPLE(elemptr[`10`]);
1613	tmp3 = GETJSAMPLE(elemptr[`3`]) + GETJSAMPLE(elemptr[`9`]);
1614	tmp4 = GETJSAMPLE(elemptr[`4`]) + GETJSAMPLE(elemptr[`8`]);
1615	tmp5 = GETJSAMPLE(elemptr[`5`]) + GETJSAMPLE(elemptr[`7`]);
1616	tmp6 = GETJSAMPLE(elemptr[`6`]);
1617
1618	tmp10 = GETJSAMPLE(elemptr[`0`]) - GETJSAMPLE(elemptr[`12`]);
1619	tmp11 = GETJSAMPLE(elemptr[`1`]) - GETJSAMPLE(elemptr[`11`]);
1620	tmp12 = GETJSAMPLE(elemptr[`2`]) - GETJSAMPLE(elemptr[`10`]);
1621	tmp13 = GETJSAMPLE(elemptr[`3`]) - GETJSAMPLE(elemptr[`9`]);
1622	tmp14 = GETJSAMPLE(elemptr[`4`]) - GETJSAMPLE(elemptr[`8`]);
1623	tmp15 = GETJSAMPLE(elemptr[`5`]) - GETJSAMPLE(elemptr[`7`]);
1624
1625	/ Apply unsigned->signed conversion /
1626	dataptr[`0`] = (DCTELEM)
1627	(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6 - `13` * CENTERJSAMPLE);
1628	tmp6 += tmp6;
1629	tmp0 -= tmp6;
1630	tmp1 -= tmp6;
1631	tmp2 -= tmp6;
1632	tmp3 -= tmp6;
1633	tmp4 -= tmp6;
1634	tmp5 -= tmp6;
1635	dataptr[`2`] = (DCTELEM)
1636	DESCALE(MULTIPLY(tmp0, FIX(`1.373119086`)) + / c2 /
1637	MULTIPLY(tmp1, FIX(`1.058554052`)) + / c6 /
1638	MULTIPLY(tmp2, FIX(`0.501487041`)) - / c10 /
1639	MULTIPLY(tmp3, FIX(`0.170464608`)) - / c12 /
1640	MULTIPLY(tmp4, FIX(`0.803364869`)) - / c8 /
1641	MULTIPLY(tmp5, FIX(`1.252223920`)), / c4 /
1642	CONST_BITS);
1643	z1 = MULTIPLY(tmp0 - tmp2, FIX(`1.155388986`)) - / (c4+c6)/2 /
1644	MULTIPLY(tmp3 - tmp4, FIX(`0.435816023`)) - / (c2-c10)/2 /
1645	MULTIPLY(tmp1 - tmp5, FIX(`0.316450131`)); / (c8-c12)/2 /
1646	z2 = MULTIPLY(tmp0 + tmp2, FIX(`0.096834934`)) - / (c4-c6)/2 /
1647	MULTIPLY(tmp3 + tmp4, FIX(`0.937303064`)) + / (c2+c10)/2 /
1648	MULTIPLY(tmp1 + tmp5, FIX(`0.486914739`)); / (c8+c12)/2 /
1649
1650	dataptr[`4`] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS);
1651	dataptr[`6`] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS);
1652
1653	/ Odd part /
1654
1655	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(`1.322312651`)); / c3 /
1656	tmp2 = MULTIPLY(tmp10 + tmp12, FIX(`1.163874945`)); / c5 /
1657	tmp3 = MULTIPLY(tmp10 + tmp13, FIX(`0.937797057`)) + / c7 /
1658	MULTIPLY(tmp14 + tmp15, FIX(`0.338443458`)); / c11 /
1659	tmp0 = tmp1 + tmp2 + tmp3 -
1660	MULTIPLY(tmp10, FIX(`2.020082300`)) + / c3+c5+c7-c1 /
1661	MULTIPLY(tmp14, FIX(`0.318774355`)); / c9-c11 /
1662	tmp4 = MULTIPLY(tmp14 - tmp15, FIX(`0.937797057`)) - / c7 /
1663	MULTIPLY(tmp11 + tmp12, FIX(`0.338443458`)); / c11 /
1664	tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(`1.163874945`)); / -c5 /
1665	tmp1 += tmp4 + tmp5 +
1666	MULTIPLY(tmp11, FIX(`0.837223564`)) - / c5+c9+c11-c3 /
1667	MULTIPLY(tmp14, FIX(`2.341699410`)); / c1+c7 /
1668	tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(`0.657217813`)); / -c9 /
1669	tmp2 += tmp4 + tmp6 -
1670	MULTIPLY(tmp12, FIX(`1.572116027`)) + / c1+c5-c9-c11 /
1671	MULTIPLY(tmp15, FIX(`2.260109708`)); / c3+c7 /
1672	tmp3 += tmp5 + tmp6 +
1673	MULTIPLY(tmp13, FIX(`2.205608352`)) - / c3+c5+c9-c7 /
1674	MULTIPLY(tmp15, FIX(`1.742345811`)); / c1+c11 /
1675
1676	dataptr[`1`] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
1677	dataptr[`3`] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
1678	dataptr[`5`] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
1679	dataptr[`7`] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
1680
1681	ctr++;
1682
1683	if (ctr != DCTSIZE) {
1684	if (ctr == `13`)
1685	break; / Done. /
1686	dataptr += DCTSIZE; / advance pointer to next row /
1687	} else
1688	dataptr = workspace; / switch pointer to extended workspace /
1689	}
1690
1691	/ Pass 2: process columns.*
1692	* We leave the results scaled up by an overall factor of 8.
1693	* We must also scale the output by (8/13)**2 = 64/169, which we partially
1694	* fold into the constant multipliers and final shifting:
1695	* cK now represents sqrt(2) * cos(Kpi/26) 128/169.
1696	*/
1697
1698	dataptr = data;
1699	wsptr = workspace;
1700	for (ctr = DCTSIZE-`1`; ctr >= `0`; ctr--) {
1701	/ Even part /
1702
1703	tmp0 = dataptr[DCTSIZE`0`] + wsptr[DCTSIZE`4`];
1704	tmp1 = dataptr[DCTSIZE`1`] + wsptr[DCTSIZE`3`];
1705	tmp2 = dataptr[DCTSIZE`2`] + wsptr[DCTSIZE`2`];
1706	tmp3 = dataptr[DCTSIZE`3`] + wsptr[DCTSIZE`1`];
1707	tmp4 = dataptr[DCTSIZE`4`] + wsptr[DCTSIZE`0`];
1708	tmp5 = dataptr[DCTSIZE`5`] + dataptr[DCTSIZE`7`];
1709	tmp6 = dataptr[DCTSIZE*`6`];
1710
1711	tmp10 = dataptr[DCTSIZE`0`] - wsptr[DCTSIZE`4`];
1712	tmp11 = dataptr[DCTSIZE`1`] - wsptr[DCTSIZE`3`];
1713	tmp12 = dataptr[DCTSIZE`2`] - wsptr[DCTSIZE`2`];
1714	tmp13 = dataptr[DCTSIZE`3`] - wsptr[DCTSIZE`1`];
1715	tmp14 = dataptr[DCTSIZE`4`] - wsptr[DCTSIZE`0`];
1716	tmp15 = dataptr[DCTSIZE`5`] - dataptr[DCTSIZE`7`];
1717
1718	dataptr[DCTSIZE*`0`] = (DCTELEM)
1719	DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6,
1720	FIX(`0.757396450`)), / 128/169 /
1721	CONST_BITS+`1`);
1722	tmp6 += tmp6;
1723	tmp0 -= tmp6;
1724	tmp1 -= tmp6;
1725	tmp2 -= tmp6;
1726	tmp3 -= tmp6;
1727	tmp4 -= tmp6;
1728	tmp5 -= tmp6;
1729	dataptr[DCTSIZE*`2`] = (DCTELEM)
1730	DESCALE(MULTIPLY(tmp0, FIX(`1.039995521`)) + / c2 /
1731	MULTIPLY(tmp1, FIX(`0.801745081`)) + / c6 /
1732	MULTIPLY(tmp2, FIX(`0.379824504`)) - / c10 /
1733	MULTIPLY(tmp3, FIX(`0.129109289`)) - / c12 /
1734	MULTIPLY(tmp4, FIX(`0.608465700`)) - / c8 /
1735	MULTIPLY(tmp5, FIX(`0.948429952`)), / c4 /
1736	CONST_BITS+`1`);
1737	z1 = MULTIPLY(tmp0 - tmp2, FIX(`0.875087516`)) - / (c4+c6)/2 /
1738	MULTIPLY(tmp3 - tmp4, FIX(`0.330085509`)) - / (c2-c10)/2 /
1739	MULTIPLY(tmp1 - tmp5, FIX(`0.239678205`)); / (c8-c12)/2 /
1740	z2 = MULTIPLY(tmp0 + tmp2, FIX(`0.073342435`)) - / (c4-c6)/2 /
1741	MULTIPLY(tmp3 + tmp4, FIX(`0.709910013`)) + / (c2+c10)/2 /
1742	MULTIPLY(tmp1 + tmp5, FIX(`0.368787494`)); / (c8+c12)/2 /
1743
1744	dataptr[DCTSIZE*`4`] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+`1`);
1745	dataptr[DCTSIZE*`6`] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS+`1`);
1746
1747	/ Odd part /
1748
1749	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(`1.001514908`)); / c3 /
1750	tmp2 = MULTIPLY(tmp10 + tmp12, FIX(`0.881514751`)); / c5 /
1751	tmp3 = MULTIPLY(tmp10 + tmp13, FIX(`0.710284161`)) + / c7 /
1752	MULTIPLY(tmp14 + tmp15, FIX(`0.256335874`)); / c11 /
1753	tmp0 = tmp1 + tmp2 + tmp3 -
1754	MULTIPLY(tmp10, FIX(`1.530003162`)) + / c3+c5+c7-c1 /
1755	MULTIPLY(tmp14, FIX(`0.241438564`)); / c9-c11 /
1756	tmp4 = MULTIPLY(tmp14 - tmp15, FIX(`0.710284161`)) - / c7 /
1757	MULTIPLY(tmp11 + tmp12, FIX(`0.256335874`)); / c11 /
1758	tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(`0.881514751`)); / -c5 /
1759	tmp1 += tmp4 + tmp5 +
1760	MULTIPLY(tmp11, FIX(`0.634110155`)) - / c5+c9+c11-c3 /
1761	MULTIPLY(tmp14, FIX(`1.773594819`)); / c1+c7 /
1762	tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(`0.497774438`)); / -c9 /
1763	tmp2 += tmp4 + tmp6 -
1764	MULTIPLY(tmp12, FIX(`1.190715098`)) + / c1+c5-c9-c11 /
1765	MULTIPLY(tmp15, FIX(`1.711799069`)); / c3+c7 /
1766	tmp3 += tmp5 + tmp6 +
1767	MULTIPLY(tmp13, FIX(`1.670519935`)) - / c3+c5+c9-c7 /
1768	MULTIPLY(tmp15, FIX(`1.319646532`)); / c1+c11 /
1769
1770	dataptr[DCTSIZE*`1`] = (DCTELEM) DESCALE(tmp0, CONST_BITS+`1`);
1771	dataptr[DCTSIZE*`3`] = (DCTELEM) DESCALE(tmp1, CONST_BITS+`1`);
1772	dataptr[DCTSIZE*`5`] = (DCTELEM) DESCALE(tmp2, CONST_BITS+`1`);
1773	dataptr[DCTSIZE*`7`] = (DCTELEM) DESCALE(tmp3, CONST_BITS+`1`);
1774
1775	dataptr++; / advance pointer to next column /
1776	wsptr++; / advance pointer to next column /
1777	}
1778	}
1779
1780
1781	/*
1782	* Perform the forward DCT on a 14x14 sample block.
1783	*/
1784
1785	GLOBAL(void)
1786	jpeg_fdct_14x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1787	{
1788	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1789	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
1790	DCTELEM workspace[`8`*`6`];
1791	DCTELEM *dataptr;
1792	DCTELEM *wsptr;
1793	JSAMPROW elemptr;
1794	int ctr;
1795	SHIFT_TEMPS
1796
1797	/ Pass 1: process rows. /
1798	/ Note results are scaled up by sqrt(8) compared to a true DCT. /
1799	/ cK represents sqrt(2) * cos(Kpi/28). /*
1800
1801	dataptr = data;
1802	ctr = `0`;
1803	for (;;) {
1804	elemptr = sample_data[ctr] + start_col;
1805
1806	/ Even part /
1807
1808	tmp0 = GETJSAMPLE(elemptr[`0`]) + GETJSAMPLE(elemptr[`13`]);
1809	tmp1 = GETJSAMPLE(elemptr[`1`]) + GETJSAMPLE(elemptr[`12`]);
1810	tmp2 = GETJSAMPLE(elemptr[`2`]) + GETJSAMPLE(elemptr[`11`]);
1811	tmp13 = GETJSAMPLE(elemptr[`3`]) + GETJSAMPLE(elemptr[`10`]);
1812	tmp4 = GETJSAMPLE(elemptr[`4`]) + GETJSAMPLE(elemptr[`9`]);
1813	tmp5 = GETJSAMPLE(elemptr[`5`]) + GETJSAMPLE(elemptr[`8`]);
1814	tmp6 = GETJSAMPLE(elemptr[`6`]) + GETJSAMPLE(elemptr[`7`]);
1815
1816	tmp10 = tmp0 + tmp6;
1817	tmp14 = tmp0 - tmp6;
1818	tmp11 = tmp1 + tmp5;
1819	tmp15 = tmp1 - tmp5;
1820	tmp12 = tmp2 + tmp4;
1821	tmp16 = tmp2 - tmp4;
1822
1823	tmp0 = GETJSAMPLE(elemptr[`0`]) - GETJSAMPLE(elemptr[`13`]);
1824	tmp1 = GETJSAMPLE(elemptr[`1`]) - GETJSAMPLE(elemptr[`12`]);
1825	tmp2 = GETJSAMPLE(elemptr[`2`]) - GETJSAMPLE(elemptr[`11`]);
1826	tmp3 = GETJSAMPLE(elemptr[`3`]) - GETJSAMPLE(elemptr[`10`]);
1827	tmp4 = GETJSAMPLE(elemptr[`4`]) - GETJSAMPLE(elemptr[`9`]);
1828	tmp5 = GETJSAMPLE(elemptr[`5`]) - GETJSAMPLE(elemptr[`8`]);
1829	tmp6 = GETJSAMPLE(elemptr[`6`]) - GETJSAMPLE(elemptr[`7`]);
1830
1831	/ Apply unsigned->signed conversion /
1832	dataptr[`0`] = (DCTELEM)
1833	(tmp10 + tmp11 + tmp12 + tmp13 - `14` * CENTERJSAMPLE);
1834	tmp13 += tmp13;
1835	dataptr[`4`] = (DCTELEM)
1836	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(`1.274162392`)) + / c4 /
1837	MULTIPLY(tmp11 - tmp13, FIX(`0.314692123`)) - / c12 /
1838	MULTIPLY(tmp12 - tmp13, FIX(`0.881747734`)), / c8 /
1839	CONST_BITS);
1840
1841	tmp10 = MULTIPLY(tmp14 + tmp15, FIX(`1.105676686`)); / c6 /
1842
1843	dataptr[`2`] = (DCTELEM)
1844	DESCALE(tmp10 + MULTIPLY(tmp14, FIX(`0.273079590`)) / c2-c6 /
1845	+ MULTIPLY(tmp16, FIX(`0.613604268`)), / c10 /
1846	CONST_BITS);
1847	dataptr[`6`] = (DCTELEM)
1848	DESCALE(tmp10 - MULTIPLY(tmp15, FIX(`1.719280954`)) / c6+c10 /
1849	- MULTIPLY(tmp16, FIX(`1.378756276`)), / c2 /
1850	CONST_BITS);
1851
1852	/ Odd part /
1853
1854	tmp10 = tmp1 + tmp2;
1855	tmp11 = tmp5 - tmp4;
1856	dataptr[`7`] = (DCTELEM) (tmp0 - tmp10 + tmp3 - tmp11 - tmp6);
1857	tmp3 <<= CONST_BITS;
1858	tmp10 = MULTIPLY(tmp10, - FIX(`0.158341681`)); / -c13 /
1859	tmp11 = MULTIPLY(tmp11, FIX(`1.405321284`)); / c1 /
1860	tmp10 += tmp11 - tmp3;
1861	tmp11 = MULTIPLY(tmp0 + tmp2, FIX(`1.197448846`)) + / c5 /
1862	MULTIPLY(tmp4 + tmp6, FIX(`0.752406978`)); / c9 /
1863	dataptr[`5`] = (DCTELEM)
1864	DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(`2.373959773`)) / c3+c5-c13 /
1865	+ MULTIPLY(tmp4, FIX(`1.119999435`)), / c1+c11-c9 /
1866	CONST_BITS);
1867	tmp12 = MULTIPLY(tmp0 + tmp1, FIX(`1.334852607`)) + / c3 /
1868	MULTIPLY(tmp5 - tmp6, FIX(`0.467085129`)); / c11 /
1869	dataptr[`3`] = (DCTELEM)
1870	DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(`0.424103948`)) / c3-c9-c13 /
1871	- MULTIPLY(tmp5, FIX(`3.069855259`)), / c1+c5+c11 /
1872	CONST_BITS);
1873	dataptr[`1`] = (DCTELEM)
1874	DESCALE(tmp11 + tmp12 + tmp3 + tmp6 -
1875	MULTIPLY(tmp0 + tmp6, FIX(`1.126980169`)), / c3+c5-c1 /
1876	CONST_BITS);
1877
1878	ctr++;
1879
1880	if (ctr != DCTSIZE) {
1881	if (ctr == `14`)
1882	break; / Done. /
1883	dataptr += DCTSIZE; / advance pointer to next row /
1884	} else
1885	dataptr = workspace; / switch pointer to extended workspace /
1886	}
1887
1888	/ Pass 2: process columns.*
1889	* We leave the results scaled up by an overall factor of 8.
1890	* We must also scale the output by (8/14)**2 = 16/49, which we partially
1891	* fold into the constant multipliers and final shifting:
1892	* cK now represents sqrt(2) * cos(Kpi/28) 32/49.
1893	*/
1894
1895	dataptr = data;
1896	wsptr = workspace;
1897	for (ctr = DCTSIZE-`1`; ctr >= `0`; ctr--) {
1898	/ Even part /
1899
1900	tmp0 = dataptr[DCTSIZE`0`] + wsptr[DCTSIZE`5`];
1901	tmp1 = dataptr[DCTSIZE`1`] + wsptr[DCTSIZE`4`];
1902	tmp2 = dataptr[DCTSIZE`2`] + wsptr[DCTSIZE`3`];
1903	tmp13 = dataptr[DCTSIZE`3`] + wsptr[DCTSIZE`2`];
1904	tmp4 = dataptr[DCTSIZE`4`] + wsptr[DCTSIZE`1`];
1905	tmp5 = dataptr[DCTSIZE`5`] + wsptr[DCTSIZE`0`];
1906	tmp6 = dataptr[DCTSIZE`6`] + dataptr[DCTSIZE`7`];
1907
1908	tmp10 = tmp0 + tmp6;
1909	tmp14 = tmp0 - tmp6;
1910	tmp11 = tmp1 + tmp5;
1911	tmp15 = tmp1 - tmp5;
1912	tmp12 = tmp2 + tmp4;
1913	tmp16 = tmp2 - tmp4;
1914
1915	tmp0 = dataptr[DCTSIZE`0`] - wsptr[DCTSIZE`5`];
1916	tmp1 = dataptr[DCTSIZE`1`] - wsptr[DCTSIZE`4`];
1917	tmp2 = dataptr[DCTSIZE`2`] - wsptr[DCTSIZE`3`];
1918	tmp3 = dataptr[DCTSIZE`3`] - wsptr[DCTSIZE`2`];
1919	tmp4 = dataptr[DCTSIZE`4`] - wsptr[DCTSIZE`1`];
1920	tmp5 = dataptr[DCTSIZE`5`] - wsptr[DCTSIZE`0`];
1921	tmp6 = dataptr[DCTSIZE`6`] - dataptr[DCTSIZE`7`];
1922
1923	dataptr[DCTSIZE*`0`] = (DCTELEM)
1924	DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
1925	FIX(`0.653061224`)), / 32/49 /
1926	CONST_BITS+`1`);
1927	tmp13 += tmp13;
1928	dataptr[DCTSIZE*`4`] = (DCTELEM)
1929	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(`0.832106052`)) + / c4 /
1930	MULTIPLY(tmp11 - tmp13, FIX(`0.205513223`)) - / c12 /
1931	MULTIPLY(tmp12 - tmp13, FIX(`0.575835255`)), / c8 /
1932	CONST_BITS+`1`);
1933
1934	tmp10 = MULTIPLY(tmp14 + tmp15, FIX(`0.722074570`)); / c6 /
1935
1936	dataptr[DCTSIZE*`2`] = (DCTELEM)
1937	DESCALE(tmp10 + MULTIPLY(tmp14, FIX(`0.178337691`)) / c2-c6 /
1938	+ MULTIPLY(tmp16, FIX(`0.400721155`)), / c10 /
1939	CONST_BITS+`1`);
1940	dataptr[DCTSIZE*`6`] = (DCTELEM)
1941	DESCALE(tmp10 - MULTIPLY(tmp15, FIX(`1.122795725`)) / c6+c10 /
1942	- MULTIPLY(tmp16, FIX(`0.900412262`)), / c2 /
1943	CONST_BITS+`1`);
1944
1945	/ Odd part /
1946
1947	tmp10 = tmp1 + tmp2;
1948	tmp11 = tmp5 - tmp4;
1949	dataptr[DCTSIZE*`7`] = (DCTELEM)
1950	DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
1951	FIX(`0.653061224`)), / 32/49 /
1952	CONST_BITS+`1`);
1953	tmp3 = MULTIPLY(tmp3 , FIX(`0.653061224`)); / 32/49 /
1954	tmp10 = MULTIPLY(tmp10, - FIX(`0.103406812`)); / -c13 /
1955	tmp11 = MULTIPLY(tmp11, FIX(`0.917760839`)); / c1 /
1956	tmp10 += tmp11 - tmp3;
1957	tmp11 = MULTIPLY(tmp0 + tmp2, FIX(`0.782007410`)) + / c5 /
1958	MULTIPLY(tmp4 + tmp6, FIX(`0.491367823`)); / c9 /
1959	dataptr[DCTSIZE*`5`] = (DCTELEM)
1960	DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(`1.550341076`)) / c3+c5-c13 /
1961	+ MULTIPLY(tmp4, FIX(`0.731428202`)), / c1+c11-c9 /
1962	CONST_BITS+`1`);
1963	tmp12 = MULTIPLY(tmp0 + tmp1, FIX(`0.871740478`)) + / c3 /
1964	MULTIPLY(tmp5 - tmp6, FIX(`0.305035186`)); / c11 /
1965	dataptr[DCTSIZE*`3`] = (DCTELEM)
1966	DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(`0.276965844`)) / c3-c9-c13 /
1967	- MULTIPLY(tmp5, FIX(`2.004803435`)), / c1+c5+c11 /
1968	CONST_BITS+`1`);
1969	dataptr[DCTSIZE*`1`] = (DCTELEM)
1970	DESCALE(tmp11 + tmp12 + tmp3
1971	- MULTIPLY(tmp0, FIX(`0.735987049`)) / c3+c5-c1 /
1972	- MULTIPLY(tmp6, FIX(`0.082925825`)), / c9-c11-c13 /
1973	CONST_BITS+`1`);
1974
1975	dataptr++; / advance pointer to next column /
1976	wsptr++; / advance pointer to next column /
1977	}
1978	}
1979
1980
1981	/*
1982	* Perform the forward DCT on a 15x15 sample block.
1983	*/
1984
1985	GLOBAL(void)
1986	jpeg_fdct_15x15 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1987	{
1988	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1989	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
1990	INT32 z1, z2, z3;
1991	DCTELEM workspace[`8`*`7`];
1992	DCTELEM *dataptr;
1993	DCTELEM *wsptr;
1994	JSAMPROW elemptr;
1995	int ctr;
1996	SHIFT_TEMPS
1997
1998	/ Pass 1: process rows. /
1999	/ Note results are scaled up by sqrt(8) compared to a true DCT. /
2000	/ cK represents sqrt(2) * cos(Kpi/30). /*
2001
2002	dataptr = data;
2003	ctr = `0`;
2004	for (;;) {
2005	elemptr = sample_data[ctr] + start_col;
2006
2007	/ Even part /
2008
2009	tmp0 = GETJSAMPLE(elemptr[`0`]) + GETJSAMPLE(elemptr[`14`]);
2010	tmp1 = GETJSAMPLE(elemptr[`1`]) + GETJSAMPLE(elemptr[`13`]);
2011	tmp2 = GETJSAMPLE(elemptr[`2`]) + GETJSAMPLE(elemptr[`12`]);
2012	tmp3 = GETJSAMPLE(elemptr[`3`]) + GETJSAMPLE(elemptr[`11`]);
2013	tmp4 = GETJSAMPLE(elemptr[`4`]) + GETJSAMPLE(elemptr[`10`]);
2014	tmp5 = GETJSAMPLE(elemptr[`5`]) + GETJSAMPLE(elemptr[`9`]);
2015	tmp6 = GETJSAMPLE(elemptr[`6`]) + GETJSAMPLE(elemptr[`8`]);
2016	tmp7 = GETJSAMPLE(elemptr[`7`]);
2017
2018	tmp10 = GETJSAMPLE(elemptr[`0`]) - GETJSAMPLE(elemptr[`14`]);
2019	tmp11 = GETJSAMPLE(elemptr[`1`]) - GETJSAMPLE(elemptr[`13`]);
2020	tmp12 = GETJSAMPLE(elemptr[`2`]) - GETJSAMPLE(elemptr[`12`]);
2021	tmp13 = GETJSAMPLE(elemptr[`3`]) - GETJSAMPLE(elemptr[`11`]);
2022	tmp14 = GETJSAMPLE(elemptr[`4`]) - GETJSAMPLE(elemptr[`10`]);
2023	tmp15 = GETJSAMPLE(elemptr[`5`]) - GETJSAMPLE(elemptr[`9`]);
2024	tmp16 = GETJSAMPLE(elemptr[`6`]) - GETJSAMPLE(elemptr[`8`]);
2025
2026	z1 = tmp0 + tmp4 + tmp5;
2027	z2 = tmp1 + tmp3 + tmp6;
2028	z3 = tmp2 + tmp7;
2029	/ Apply unsigned->signed conversion /
2030	dataptr[`0`] = (DCTELEM) (z1 + z2 + z3 - `15` * CENTERJSAMPLE);
2031	z3 += z3;
2032	dataptr[`6`] = (DCTELEM)
2033	DESCALE(MULTIPLY(z1 - z3, FIX(`1.144122806`)) - / c6 /
2034	MULTIPLY(z2 - z3, FIX(`0.437016024`)), / c12 /
2035	CONST_BITS);
2036	tmp2 += ((tmp1 + tmp4) >> `1`) - tmp7 - tmp7;
2037	z1 = MULTIPLY(tmp3 - tmp2, FIX(`1.531135173`)) - / c2+c14 /
2038	MULTIPLY(tmp6 - tmp2, FIX(`2.238241955`)); / c4+c8 /
2039	z2 = MULTIPLY(tmp5 - tmp2, FIX(`0.798468008`)) - / c8-c14 /
2040	MULTIPLY(tmp0 - tmp2, FIX(`0.091361227`)); / c2-c4 /
2041	z3 = MULTIPLY(tmp0 - tmp3, FIX(`1.383309603`)) + / c2 /
2042	MULTIPLY(tmp6 - tmp5, FIX(`0.946293579`)) + / c8 /
2043	MULTIPLY(tmp1 - tmp4, FIX(`0.790569415`)); / (c6+c12)/2 /
2044
2045	dataptr[`2`] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS);
2046	dataptr[`4`] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS);
2047
2048	/ Odd part /
2049
2050	tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
2051	FIX(`1.224744871`)); / c5 /
2052	tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(`1.344997024`)) + / c3 /
2053	MULTIPLY(tmp11 - tmp13 - tmp16, FIX(`0.831253876`)); / c9 /
2054	tmp12 = MULTIPLY(tmp12, FIX(`1.224744871`)); / c5 /
2055	tmp4 = MULTIPLY(tmp10 - tmp16, FIX(`1.406466353`)) + / c1 /
2056	MULTIPLY(tmp11 + tmp14, FIX(`1.344997024`)) + / c3 /
2057	MULTIPLY(tmp13 + tmp15, FIX(`0.575212477`)); / c11 /
2058	tmp0 = MULTIPLY(tmp13, FIX(`0.475753014`)) - / c7-c11 /
2059	MULTIPLY(tmp14, FIX(`0.513743148`)) + / c3-c9 /
2060	MULTIPLY(tmp16, FIX(`1.700497885`)) + tmp4 + tmp12; / c1+c13 /
2061	tmp3 = MULTIPLY(tmp10, - FIX(`0.355500862`)) - / -(c1-c7) /
2062	MULTIPLY(tmp11, FIX(`2.176250899`)) - / c3+c9 /
2063	MULTIPLY(tmp15, FIX(`0.869244010`)) + tmp4 - tmp12; / c11+c13 /
2064
2065	dataptr[`1`] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
2066	dataptr[`3`] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
2067	dataptr[`5`] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
2068	dataptr[`7`] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
2069
2070	ctr++;
2071
2072	if (ctr != DCTSIZE) {
2073	if (ctr == `15`)
2074	break; / Done. /
2075	dataptr += DCTSIZE; / advance pointer to next row /
2076	} else
2077	dataptr = workspace; / switch pointer to extended workspace /
2078	}
2079
2080	/ Pass 2: process columns.*
2081	* We leave the results scaled up by an overall factor of 8.
2082	* We must also scale the output by (8/15)**2 = 64/225, which we partially
2083	* fold into the constant multipliers and final shifting:
2084	* cK now represents sqrt(2) * cos(Kpi/30) 256/225.
2085	*/
2086
2087	dataptr = data;
2088	wsptr = workspace;
2089	for (ctr = DCTSIZE-`1`; ctr >= `0`; ctr--) {
2090	/ Even part /
2091
2092	tmp0 = dataptr[DCTSIZE`0`] + wsptr[DCTSIZE`6`];
2093	tmp1 = dataptr[DCTSIZE`1`] + wsptr[DCTSIZE`5`];
2094	tmp2 = dataptr[DCTSIZE`2`] + wsptr[DCTSIZE`4`];
2095	tmp3 = dataptr[DCTSIZE`3`] + wsptr[DCTSIZE`3`];
2096	tmp4 = dataptr[DCTSIZE`4`] + wsptr[DCTSIZE`2`];
2097	tmp5 = dataptr[DCTSIZE`5`] + wsptr[DCTSIZE`1`];
2098	tmp6 = dataptr[DCTSIZE`6`] + wsptr[DCTSIZE`0`];
2099	tmp7 = dataptr[DCTSIZE*`7`];
2100
2101	tmp10 = dataptr[DCTSIZE`0`] - wsptr[DCTSIZE`6`];
2102	tmp11 = dataptr[DCTSIZE`1`] - wsptr[DCTSIZE`5`];
2103	tmp12 = dataptr[DCTSIZE`2`] - wsptr[DCTSIZE`4`];
2104	tmp13 = dataptr[DCTSIZE`3`] - wsptr[DCTSIZE`3`];
2105	tmp14 = dataptr[DCTSIZE`4`] - wsptr[DCTSIZE`2`];
2106	tmp15 = dataptr[DCTSIZE`5`] - wsptr[DCTSIZE`1`];
2107	tmp16 = dataptr[DCTSIZE`6`] - wsptr[DCTSIZE`0`];
2108
2109	z1 = tmp0 + tmp4 + tmp5;
2110	z2 = tmp1 + tmp3 + tmp6;
2111	z3 = tmp2 + tmp7;
2112	dataptr[DCTSIZE*`0`] = (DCTELEM)
2113	DESCALE(MULTIPLY(z1 + z2 + z3, FIX(`1.137777778`)), / 256/225 /
2114	CONST_BITS+`2`);
2115	z3 += z3;
2116	dataptr[DCTSIZE*`6`] = (DCTELEM)
2117	DESCALE(MULTIPLY(z1 - z3, FIX(`1.301757503`)) - / c6 /
2118	MULTIPLY(z2 - z3, FIX(`0.497227121`)), / c12 /
2119	CONST_BITS+`2`);
2120	tmp2 += ((tmp1 + tmp4) >> `1`) - tmp7 - tmp7;
2121	z1 = MULTIPLY(tmp3 - tmp2, FIX(`1.742091575`)) - / c2+c14 /
2122	MULTIPLY(tmp6 - tmp2, FIX(`2.546621957`)); / c4+c8 /
2123	z2 = MULTIPLY(tmp5 - tmp2, FIX(`0.908479156`)) - / c8-c14 /
2124	MULTIPLY(tmp0 - tmp2, FIX(`0.103948774`)); / c2-c4 /
2125	z3 = MULTIPLY(tmp0 - tmp3, FIX(`1.573898926`)) + / c2 /
2126	MULTIPLY(tmp6 - tmp5, FIX(`1.076671805`)) + / c8 /
2127	MULTIPLY(tmp1 - tmp4, FIX(`0.899492312`)); / (c6+c12)/2 /
2128
2129	dataptr[DCTSIZE*`2`] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS+`2`);
2130	dataptr[DCTSIZE*`4`] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS+`2`);
2131
2132	/ Odd part /
2133
2134	tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
2135	FIX(`1.393487498`)); / c5 /
2136	tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(`1.530307725`)) + / c3 /
2137	MULTIPLY(tmp11 - tmp13 - tmp16, FIX(`0.945782187`)); / c9 /
2138	tmp12 = MULTIPLY(tmp12, FIX(`1.393487498`)); / c5 /
2139	tmp4 = MULTIPLY(tmp10 - tmp16, FIX(`1.600246161`)) + / c1 /
2140	MULTIPLY(tmp11 + tmp14, FIX(`1.530307725`)) + / c3 /
2141	MULTIPLY(tmp13 + tmp15, FIX(`0.654463974`)); / c11 /
2142	tmp0 = MULTIPLY(tmp13, FIX(`0.541301207`)) - / c7-c11 /
2143	MULTIPLY(tmp14, FIX(`0.584525538`)) + / c3-c9 /
2144	MULTIPLY(tmp16, FIX(`1.934788705`)) + tmp4 + tmp12; / c1+c13 /
2145	tmp3 = MULTIPLY(tmp10, - FIX(`0.404480980`)) - / -(c1-c7) /
2146	MULTIPLY(tmp11, FIX(`2.476089912`)) - / c3+c9 /
2147	MULTIPLY(tmp15, FIX(`0.989006518`)) + tmp4 - tmp12; / c11+c13 /
2148
2149	dataptr[DCTSIZE*`1`] = (DCTELEM) DESCALE(tmp0, CONST_BITS+`2`);
2150	dataptr[DCTSIZE*`3`] = (DCTELEM) DESCALE(tmp1, CONST_BITS+`2`);
2151	dataptr[DCTSIZE*`5`] = (DCTELEM) DESCALE(tmp2, CONST_BITS+`2`);
2152	dataptr[DCTSIZE*`7`] = (DCTELEM) DESCALE(tmp3, CONST_BITS+`2`);
2153
2154	dataptr++; / advance pointer to next column /
2155	wsptr++; / advance pointer to next column /
2156	}
2157	}
2158
2159
2160	/*
2161	* Perform the forward DCT on a 16x16 sample block.
2162	*/
2163
2164	GLOBAL(void)
2165	jpeg_fdct_16x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2166	{
2167	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2168	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
2169	DCTELEM workspace[DCTSIZE2];
2170	DCTELEM *dataptr;
2171	DCTELEM *wsptr;
2172	JSAMPROW elemptr;
2173	int ctr;
2174	SHIFT_TEMPS
2175
2176	/ Pass 1: process rows. /
2177	/ Note results are scaled up by sqrt(8) compared to a true DCT; /
2178	/ furthermore, we scale the results by 2*PASS1_BITS. /*
2179	/ cK represents sqrt(2) * cos(Kpi/32). /*
2180
2181	dataptr = data;
2182	ctr = `0`;
2183	for (;;) {
2184	elemptr = sample_data[ctr] + start_col;
2185
2186	/ Even part /
2187
2188	tmp0 = GETJSAMPLE(elemptr[`0`]) + GETJSAMPLE(elemptr[`15`]);
2189	tmp1 = GETJSAMPLE(elemptr[`1`]) + GETJSAMPLE(elemptr[`14`]);
2190	tmp2 = GETJSAMPLE(elemptr[`2`]) + GETJSAMPLE(elemptr[`13`]);
2191	tmp3 = GETJSAMPLE(elemptr[`3`]) + GETJSAMPLE(elemptr[`12`]);
2192	tmp4 = GETJSAMPLE(elemptr[`4`]) + GETJSAMPLE(elemptr[`11`]);
2193	tmp5 = GETJSAMPLE(elemptr[`5`]) + GETJSAMPLE(elemptr[`10`]);
2194	tmp6 = GETJSAMPLE(elemptr[`6`]) + GETJSAMPLE(elemptr[`9`]);
2195	tmp7 = GETJSAMPLE(elemptr[`7`]) + GETJSAMPLE(elemptr[`8`]);
2196
2197	tmp10 = tmp0 + tmp7;
2198	tmp14 = tmp0 - tmp7;
2199	tmp11 = tmp1 + tmp6;
2200	tmp15 = tmp1 - tmp6;
2201	tmp12 = tmp2 + tmp5;
2202	tmp16 = tmp2 - tmp5;
2203	tmp13 = tmp3 + tmp4;
2204	tmp17 = tmp3 - tmp4;
2205
2206	tmp0 = GETJSAMPLE(elemptr[`0`]) - GETJSAMPLE(elemptr[`15`]);
2207	tmp1 = GETJSAMPLE(elemptr[`1`]) - GETJSAMPLE(elemptr[`14`]);
2208	tmp2 = GETJSAMPLE(elemptr[`2`]) - GETJSAMPLE(elemptr[`13`]);
2209	tmp3 = GETJSAMPLE(elemptr[`3`]) - GETJSAMPLE(elemptr[`12`]);
2210	tmp4 = GETJSAMPLE(elemptr[`4`]) - GETJSAMPLE(elemptr[`11`]);
2211	tmp5 = GETJSAMPLE(elemptr[`5`]) - GETJSAMPLE(elemptr[`10`]);
2212	tmp6 = GETJSAMPLE(elemptr[`6`]) - GETJSAMPLE(elemptr[`9`]);
2213	tmp7 = GETJSAMPLE(elemptr[`7`]) - GETJSAMPLE(elemptr[`8`]);
2214
2215	/ Apply unsigned->signed conversion /
2216	dataptr[`0`] = (DCTELEM)
2217	((tmp10 + tmp11 + tmp12 + tmp13 - `16` * CENTERJSAMPLE) << PASS1_BITS);
2218	dataptr[`4`] = (DCTELEM)
2219	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(`1.306562965`)) + / c4[16] = c2[8] /
2220	MULTIPLY(tmp11 - tmp12, FIX_0_541196100), / c12[16] = c6[8] /
2221	CONST_BITS-PASS1_BITS);
2222
2223	tmp10 = MULTIPLY(tmp17 - tmp15, FIX(`0.275899379`)) + / c14[16] = c7[8] /
2224	MULTIPLY(tmp14 - tmp16, FIX(`1.387039845`)); / c2[16] = c1[8] /
2225
2226	dataptr[`2`] = (DCTELEM)
2227	DESCALE(tmp10 + MULTIPLY(tmp15, FIX(`1.451774982`)) / c6+c14 /
2228	+ MULTIPLY(tmp16, FIX(`2.172734804`)), / c2+c10 /
2229	CONST_BITS-PASS1_BITS);
2230	dataptr[`6`] = (DCTELEM)
2231	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(`0.211164243`)) / c2-c6 /
2232	- MULTIPLY(tmp17, FIX(`1.061594338`)), / c10+c14 /
2233	CONST_BITS-PASS1_BITS);
2234
2235	/ Odd part /
2236
2237	tmp11 = MULTIPLY(tmp0 + tmp1, FIX(`1.353318001`)) + / c3 /
2238	MULTIPLY(tmp6 - tmp7, FIX(`0.410524528`)); / c13 /
2239	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(`1.247225013`)) + / c5 /
2240	MULTIPLY(tmp5 + tmp7, FIX(`0.666655658`)); / c11 /
2241	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(`1.093201867`)) + / c7 /
2242	MULTIPLY(tmp4 - tmp7, FIX(`0.897167586`)); / c9 /
2243	tmp14 = MULTIPLY(tmp1 + tmp2, FIX(`0.138617169`)) + / c15 /
2244	MULTIPLY(tmp6 - tmp5, FIX(`1.407403738`)); / c1 /
2245	tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(`0.666655658`)) + / -c11 /
2246	MULTIPLY(tmp4 + tmp6, - FIX(`1.247225013`)); / -c5 /
2247	tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(`1.353318001`)) + / -c3 /
2248	MULTIPLY(tmp5 - tmp4, FIX(`0.410524528`)); / c13 /
2249	tmp10 = tmp11 + tmp12 + tmp13 -
2250	MULTIPLY(tmp0, FIX(`2.286341144`)) + / c7+c5+c3-c1 /
2251	MULTIPLY(tmp7, FIX(`0.779653625`)); / c15+c13-c11+c9 /
2252	tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(`0.071888074`)) / c9-c3-c15+c11 /
2253	- MULTIPLY(tmp6, FIX(`1.663905119`)); / c7+c13+c1-c5 /
2254	tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(`1.125726048`)) / c7+c5+c15-c3 /
2255	+ MULTIPLY(tmp5, FIX(`1.227391138`)); / c9-c11+c1-c13 /
2256	tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(`1.065388962`)) / c15+c3+c11-c7 /
2257	+ MULTIPLY(tmp4, FIX(`2.167985692`)); / c1+c13+c5-c9 /
2258
2259	dataptr[`1`] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
2260	dataptr[`3`] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
2261	dataptr[`5`] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
2262	dataptr[`7`] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
2263
2264	ctr++;
2265
2266	if (ctr != DCTSIZE) {
2267	if (ctr == DCTSIZE * `2`)
2268	break; / Done. /
2269	dataptr += DCTSIZE; / advance pointer to next row /
2270	} else
2271	dataptr = workspace; / switch pointer to extended workspace /
2272	}
2273
2274	/ Pass 2: process columns.*
2275	* We remove the PASS1_BITS scaling, but leave the results scaled up
2276	* by an overall factor of 8.
2277	* We must also scale the output by (8/16)2 = 1/22.
2278	*/
2279
2280	dataptr = data;
2281	wsptr = workspace;
2282	for (ctr = DCTSIZE-`1`; ctr >= `0`; ctr--) {
2283	/ Even part /
2284
2285	tmp0 = dataptr[DCTSIZE`0`] + wsptr[DCTSIZE`7`];
2286	tmp1 = dataptr[DCTSIZE`1`] + wsptr[DCTSIZE`6`];
2287	tmp2 = dataptr[DCTSIZE`2`] + wsptr[DCTSIZE`5`];
2288	tmp3 = dataptr[DCTSIZE`3`] + wsptr[DCTSIZE`4`];
2289	tmp4 = dataptr[DCTSIZE`4`] + wsptr[DCTSIZE`3`];
2290	tmp5 = dataptr[DCTSIZE`5`] + wsptr[DCTSIZE`2`];
2291	tmp6 = dataptr[DCTSIZE`6`] + wsptr[DCTSIZE`1`];
2292	tmp7 = dataptr[DCTSIZE`7`] + wsptr[DCTSIZE`0`];
2293
2294	tmp10 = tmp0 + tmp7;
2295	tmp14 = tmp0 - tmp7;
2296	tmp11 = tmp1 + tmp6;
2297	tmp15 = tmp1 - tmp6;
2298	tmp12 = tmp2 + tmp5;
2299	tmp16 = tmp2 - tmp5;
2300	tmp13 = tmp3 + tmp4;
2301	tmp17 = tmp3 - tmp4;
2302
2303	tmp0 = dataptr[DCTSIZE`0`] - wsptr[DCTSIZE`7`];
2304	tmp1 = dataptr[DCTSIZE`1`] - wsptr[DCTSIZE`6`];
2305	tmp2 = dataptr[DCTSIZE`2`] - wsptr[DCTSIZE`5`];
2306	tmp3 = dataptr[DCTSIZE`3`] - wsptr[DCTSIZE`4`];
2307	tmp4 = dataptr[DCTSIZE`4`] - wsptr[DCTSIZE`3`];
2308	tmp5 = dataptr[DCTSIZE`5`] - wsptr[DCTSIZE`2`];
2309	tmp6 = dataptr[DCTSIZE`6`] - wsptr[DCTSIZE`1`];
2310	tmp7 = dataptr[DCTSIZE`7`] - wsptr[DCTSIZE`0`];
2311
2312	dataptr[DCTSIZE*`0`] = (DCTELEM)
2313	DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+`2`);
2314	dataptr[DCTSIZE*`4`] = (DCTELEM)
2315	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(`1.306562965`)) + / c4[16] = c2[8] /
2316	MULTIPLY(tmp11 - tmp12, FIX_0_541196100), / c12[16] = c6[8] /
2317	CONST_BITS+PASS1_BITS+`2`);
2318
2319	tmp10 = MULTIPLY(tmp17 - tmp15, FIX(`0.275899379`)) + / c14[16] = c7[8] /
2320	MULTIPLY(tmp14 - tmp16, FIX(`1.387039845`)); / c2[16] = c1[8] /
2321
2322	dataptr[DCTSIZE*`2`] = (DCTELEM)
2323	DESCALE(tmp10 + MULTIPLY(tmp15, FIX(`1.451774982`)) / c6+c14 /
2324	+ MULTIPLY(tmp16, FIX(`2.172734804`)), / c2+10 /
2325	CONST_BITS+PASS1_BITS+`2`);
2326	dataptr[DCTSIZE*`6`] = (DCTELEM)
2327	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(`0.211164243`)) / c2-c6 /
2328	- MULTIPLY(tmp17, FIX(`1.061594338`)), / c10+c14 /
2329	CONST_BITS+PASS1_BITS+`2`);
2330
2331	/ Odd part /
2332
2333	tmp11 = MULTIPLY(tmp0 + tmp1, FIX(`1.353318001`)) + / c3 /
2334	MULTIPLY(tmp6 - tmp7, FIX(`0.410524528`)); / c13 /
2335	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(`1.247225013`)) + / c5 /
2336	MULTIPLY(tmp5 + tmp7, FIX(`0.666655658`)); / c11 /
2337	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(`1.093201867`)) + / c7 /
2338	MULTIPLY(tmp4 - tmp7, FIX(`0.897167586`)); / c9 /
2339	tmp14 = MULTIPLY(tmp1 + tmp2, FIX(`0.138617169`)) + / c15 /
2340	MULTIPLY(tmp6 - tmp5, FIX(`1.407403738`)); / c1 /
2341	tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(`0.666655658`)) + / -c11 /
2342	MULTIPLY(tmp4 + tmp6, - FIX(`1.247225013`)); / -c5 /
2343	tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(`1.353318001`)) + / -c3 /
2344	MULTIPLY(tmp5 - tmp4, FIX(`0.410524528`)); / c13 /
2345	tmp10 = tmp11 + tmp12 + tmp13 -
2346	MULTIPLY(tmp0, FIX(`2.286341144`)) + / c7+c5+c3-c1 /
2347	MULTIPLY(tmp7, FIX(`0.779653625`)); / c15+c13-c11+c9 /
2348	tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(`0.071888074`)) / c9-c3-c15+c11 /
2349	- MULTIPLY(tmp6, FIX(`1.663905119`)); / c7+c13+c1-c5 /
2350	tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(`1.125726048`)) / c7+c5+c15-c3 /
2351	+ MULTIPLY(tmp5, FIX(`1.227391138`)); / c9-c11+c1-c13 /
2352	tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(`1.065388962`)) / c15+c3+c11-c7 /
2353	+ MULTIPLY(tmp4, FIX(`2.167985692`)); / c1+c13+c5-c9 /
2354
2355	dataptr[DCTSIZE*`1`] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+`2`);
2356	dataptr[DCTSIZE*`3`] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+`2`);
2357	dataptr[DCTSIZE*`5`] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+`2`);
2358	dataptr[DCTSIZE*`7`] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+`2`);
2359
2360	dataptr++; / advance pointer to next column /
2361	wsptr++; / advance pointer to next column /
2362	}
2363	}
2364
2365
2366	/*
2367	* Perform the forward DCT on a 16x8 sample block.
2368	*
2369	* 16-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
2370	*/
2371
2372	GLOBAL(void)
2373	jpeg_fdct_16x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2374	{
2375	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2376	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
2377	INT32 z1;
2378	DCTELEM *dataptr;
2379	JSAMPROW elemptr;
2380	int ctr;
2381	SHIFT_TEMPS
2382
2383	/ Pass 1: process rows. /
2384	/ Note results are scaled up by sqrt(8) compared to a true DCT; /
2385	/ furthermore, we scale the results by 2*PASS1_BITS. /*
2386	/ 16-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/32). /*
2387
2388	dataptr = data;
2389	ctr = `0`;
2390	for (ctr = `0`; ctr < DCTSIZE; ctr++) {
2391	elemptr = sample_data[ctr] + start_col;
2392
2393	/ Even part /
2394
2395	tmp0 = GETJSAMPLE(elemptr[`0`]) + GETJSAMPLE(elemptr[`15`]);
2396	tmp1 = GETJSAMPLE(elemptr[`1`]) + GETJSAMPLE(elemptr[`14`]);
2397	tmp2 = GETJSAMPLE(elemptr[`2`]) + GETJSAMPLE(elemptr[`13`]);
2398	tmp3 = GETJSAMPLE(elemptr[`3`]) + GETJSAMPLE(elemptr[`12`]);
2399	tmp4 = GETJSAMPLE(elemptr[`4`]) + GETJSAMPLE(elemptr[`11`]);
2400	tmp5 = GETJSAMPLE(elemptr[`5`]) + GETJSAMPLE(elemptr[`10`]);
2401	tmp6 = GETJSAMPLE(elemptr[`6`]) + GETJSAMPLE(elemptr[`9`]);
2402	tmp7 = GETJSAMPLE(elemptr[`7`]) + GETJSAMPLE(elemptr[`8`]);
2403
2404	tmp10 = tmp0 + tmp7;
2405	tmp14 = tmp0 - tmp7;
2406	tmp11 = tmp1 + tmp6;
2407	tmp15 = tmp1 - tmp6;
2408	tmp12 = tmp2 + tmp5;
2409	tmp16 = tmp2 - tmp5;
2410	tmp13 = tmp3 + tmp4;
2411	tmp17 = tmp3 - tmp4;
2412
2413	tmp0 = GETJSAMPLE(elemptr[`0`]) - GETJSAMPLE(elemptr[`15`]);
2414	tmp1 = GETJSAMPLE(elemptr[`1`]) - GETJSAMPLE(elemptr[`14`]);
2415	tmp2 = GETJSAMPLE(elemptr[`2`]) - GETJSAMPLE(elemptr[`13`]);
2416	tmp3 = GETJSAMPLE(elemptr[`3`]) - GETJSAMPLE(elemptr[`12`]);
2417	tmp4 = GETJSAMPLE(elemptr[`4`]) - GETJSAMPLE(elemptr[`11`]);
2418	tmp5 = GETJSAMPLE(elemptr[`5`]) - GETJSAMPLE(elemptr[`10`]);
2419	tmp6 = GETJSAMPLE(elemptr[`6`]) - GETJSAMPLE(elemptr[`9`]);
2420	tmp7 = GETJSAMPLE(elemptr[`7`]) - GETJSAMPLE(elemptr[`8`]);
2421
2422	/ Apply unsigned->signed conversion /
2423	dataptr[`0`] = (DCTELEM)
2424	((tmp10 + tmp11 + tmp12 + tmp13 - `16` * CENTERJSAMPLE) << PASS1_BITS);
2425	dataptr[`4`] = (DCTELEM)
2426	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(`1.306562965`)) + / c4[16] = c2[8] /
2427	MULTIPLY(tmp11 - tmp12, FIX_0_541196100), / c12[16] = c6[8] /
2428	CONST_BITS-PASS1_BITS);
2429
2430	tmp10 = MULTIPLY(tmp17 - tmp15, FIX(`0.275899379`)) + / c14[16] = c7[8] /
2431	MULTIPLY(tmp14 - tmp16, FIX(`1.387039845`)); / c2[16] = c1[8] /
2432
2433	dataptr[`2`] = (DCTELEM)
2434	DESCALE(tmp10 + MULTIPLY(tmp15, FIX(`1.451774982`)) / c6+c14 /
2435	+ MULTIPLY(tmp16, FIX(`2.172734804`)), / c2+c10 /
2436	CONST_BITS-PASS1_BITS);
2437	dataptr[`6`] = (DCTELEM)
2438	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(`0.211164243`)) / c2-c6 /
2439	- MULTIPLY(tmp17, FIX(`1.061594338`)), / c10+c14 /
2440	CONST_BITS-PASS1_BITS);
2441
2442	/ Odd part /
2443
2444	tmp11 = MULTIPLY(tmp0 + tmp1, FIX(`1.353318001`)) + / c3 /
2445	MULTIPLY(tmp6 - tmp7, FIX(`0.410524528`)); / c13 /
2446	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(`1.247225013`)) + / c5 /
2447	MULTIPLY(tmp5 + tmp7, FIX(`0.666655658`)); / c11 /
2448	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(`1.093201867`)) + / c7 /
2449	MULTIPLY(tmp4 - tmp7, FIX(`0.897167586`)); / c9 /
2450	tmp14 = MULTIPLY(tmp1 + tmp2, FIX(`0.138617169`)) + / c15 /
2451	MULTIPLY(tmp6 - tmp5, FIX(`1.407403738`)); / c1 /
2452	tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(`0.666655658`)) + / -c11 /
2453	MULTIPLY(tmp4 + tmp6, - FIX(`1.247225013`)); / -c5 /
2454	tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(`1.353318001`)) + / -c3 /
2455	MULTIPLY(tmp5 - tmp4, FIX(`0.410524528`)); / c13 /
2456	tmp10 = tmp11 + tmp12 + tmp13 -
2457	MULTIPLY(tmp0, FIX(`2.286341144`)) + / c7+c5+c3-c1 /
2458	MULTIPLY(tmp7, FIX(`0.779653625`)); / c15+c13-c11+c9 /
2459	tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(`0.071888074`)) / c9-c3-c15+c11 /
2460	- MULTIPLY(tmp6, FIX(`1.663905119`)); / c7+c13+c1-c5 /
2461	tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(`1.125726048`)) / c7+c5+c15-c3 /
2462	+ MULTIPLY(tmp5, FIX(`1.227391138`)); / c9-c11+c1-c13 /
2463	tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(`1.065388962`)) / c15+c3+c11-c7 /
2464	+ MULTIPLY(tmp4, FIX(`2.167985692`)); / c1+c13+c5-c9 /
2465
2466	dataptr[`1`] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
2467	dataptr[`3`] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
2468	dataptr[`5`] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
2469	dataptr[`7`] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
2470
2471	dataptr += DCTSIZE; / advance pointer to next row /
2472	}
2473
2474	/ Pass 2: process columns.*
2475	* We remove the PASS1_BITS scaling, but leave the results scaled up
2476	* by an overall factor of 8.
2477	* We must also scale the output by 8/16 = 1/2.
2478	*/
2479
2480	dataptr = data;
2481	for (ctr = DCTSIZE-`1`; ctr >= `0`; ctr--) {
2482	/ Even part per LL&M figure 1 --- note that published figure is faulty;*
2483	* rotator "sqrt(2)c1" should be "sqrt(2)c6".
2484	*/
2485
2486	tmp0 = dataptr[DCTSIZE`0`] + dataptr[DCTSIZE`7`];
2487	tmp1 = dataptr[DCTSIZE`1`] + dataptr[DCTSIZE`6`];
2488	tmp2 = dataptr[DCTSIZE`2`] + dataptr[DCTSIZE`5`];
2489	tmp3 = dataptr[DCTSIZE`3`] + dataptr[DCTSIZE`4`];
2490
2491	tmp10 = tmp0 + tmp3;
2492	tmp12 = tmp0 - tmp3;
2493	tmp11 = tmp1 + tmp2;
2494	tmp13 = tmp1 - tmp2;
2495
2496	tmp0 = dataptr[DCTSIZE`0`] - dataptr[DCTSIZE`7`];
2497	tmp1 = dataptr[DCTSIZE`1`] - dataptr[DCTSIZE`6`];
2498	tmp2 = dataptr[DCTSIZE`2`] - dataptr[DCTSIZE`5`];
2499	tmp3 = dataptr[DCTSIZE`3`] - dataptr[DCTSIZE`4`];
2500
2501	dataptr[DCTSIZE*`0`] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS+`1`);
2502	dataptr[DCTSIZE*`4`] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS+`1`);
2503
2504	z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
2505	dataptr[DCTSIZE*`2`] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865),
2506	CONST_BITS+PASS1_BITS+`1`);
2507	dataptr[DCTSIZE*`6`] = (DCTELEM) DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065),
2508	CONST_BITS+PASS1_BITS+`1`);
2509
2510	/ Odd part per figure 8 --- note paper omits factor of sqrt(2).*
2511	* 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
2512	* i0..i3 in the paper are tmp0..tmp3 here.
2513	*/
2514
2515	tmp10 = tmp0 + tmp3;
2516	tmp11 = tmp1 + tmp2;
2517	tmp12 = tmp0 + tmp2;
2518	tmp13 = tmp1 + tmp3;
2519	z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); / c3 /
2520
2521	tmp0 = MULTIPLY(tmp0, FIX_1_501321110); / c1+c3-c5-c7 /
2522	tmp1 = MULTIPLY(tmp1, FIX_3_072711026); / c1+c3+c5-c7 /
2523	tmp2 = MULTIPLY(tmp2, FIX_2_053119869); / c1+c3-c5+c7 /
2524	tmp3 = MULTIPLY(tmp3, FIX_0_298631336); / -c1+c3+c5-c7 /
2525	tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); / c7-c3 /
2526	tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); / -c1-c3 /
2527	tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); / c5-c3 /
2528	tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); / -c3-c5 /
2529
2530	tmp12 += z1;
2531	tmp13 += z1;
2532
2533	dataptr[DCTSIZE*`1`] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12,
2534	CONST_BITS+PASS1_BITS+`1`);
2535	dataptr[DCTSIZE*`3`] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13,
2536	CONST_BITS+PASS1_BITS+`1`);
2537	dataptr[DCTSIZE*`5`] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12,
2538	CONST_BITS+PASS1_BITS+`1`);
2539	dataptr[DCTSIZE*`7`] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13,
2540	CONST_BITS+PASS1_BITS+`1`);
2541
2542	dataptr++; / advance pointer to next column /
2543	}
2544	}
2545
2546
2547	/*
2548	* Perform the forward DCT on a 14x7 sample block.
2549	*
2550	* 14-point FDCT in pass 1 (rows), 7-point in pass 2 (columns).
2551	*/
2552
2553	GLOBAL(void)
2554	jpeg_fdct_14x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2555	{
2556	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
2557	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
2558	INT32 z1, z2, z3;
2559	DCTELEM *dataptr;
2560	JSAMPROW elemptr;
2561	int ctr;
2562	SHIFT_TEMPS
2563
2564	/ Zero bottom row of output coefficient block. /
2565	MEMZERO(&data[DCTSIZE`7`], SIZEOF(DCTELEM) DCTSIZE);
2566
2567	/ Pass 1: process rows. /
2568	/ Note results are scaled up by sqrt(8) compared to a true DCT; /
2569	/ furthermore, we scale the results by 2*PASS1_BITS. /*
2570	/ 14-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/28). /*
2571
2572	dataptr = data;
2573	for (ctr = `0`; ctr < `7`; ctr++) {
2574	elemptr = sample_data[ctr] + start_col;
2575
2576	/ Even part /
2577
2578	tmp0 = GETJSAMPLE(elemptr[`0`]) + GETJSAMPLE(elemptr[`13`]);
2579	tmp1 = GETJSAMPLE(elemptr[`1`]) + GETJSAMPLE(elemptr[`12`]);
2580	tmp2 = GETJSAMPLE(elemptr[`2`]) + GETJSAMPLE(elemptr[`11`]);
2581	tmp13 = GETJSAMPLE(elemptr[`3`]) + GETJSAMPLE(elemptr[`10`]);
2582	tmp4 = GETJSAMPLE(elemptr[`4`]) + GETJSAMPLE(elemptr[`9`]);
2583	tmp5 = GETJSAMPLE(elemptr[`5`]) + GETJSAMPLE(elemptr[`8`]);
2584	tmp6 = GETJSAMPLE(elemptr[`6`]) + GETJSAMPLE(elemptr[`7`]);
2585
2586	tmp10 = tmp0 + tmp6;
2587	tmp14 = tmp0 - tmp6;
2588	tmp11 = tmp1 + tmp5;
2589	tmp15 = tmp1 - tmp5;
2590	tmp12 = tmp2 + tmp4;
2591	tmp16 = tmp2 - tmp4;
2592
2593	tmp0 = GETJSAMPLE(elemptr[`0`]) - GETJSAMPLE(elemptr[`13`]);
2594	tmp1 = GETJSAMPLE(elemptr[`1`]) - GETJSAMPLE(elemptr[`12`]);
2595	tmp2 = GETJSAMPLE(elemptr[`2`]) - GETJSAMPLE(elemptr[`11`]);
2596	tmp3 = GETJSAMPLE(elemptr[`3`]) - GETJSAMPLE(elemptr[`10`]);
2597	tmp4 = GETJSAMPLE(elemptr[`4`]) - GETJSAMPLE(elemptr[`9`]);
2598	tmp5 = GETJSAMPLE(elemptr[`5`]) - GETJSAMPLE(elemptr[`8`]);
2599	tmp6 = GETJSAMPLE(elemptr[`6`]) - GETJSAMPLE(elemptr[`7`]);
2600
2601	/ Apply unsigned->signed conversion /
2602	dataptr[`0`] = (DCTELEM)
2603	((tmp10 + tmp11 + tmp12 + tmp13 - `14` * CENTERJSAMPLE) << PASS1_BITS);
2604	tmp13 += tmp13;
2605	dataptr[`4`] = (DCTELEM)
2606	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(`1.274162392`)) + / c4 /
2607	MULTIPLY(tmp11 - tmp13, FIX(`0.314692123`)) - / c12 /
2608	MULTIPLY(tmp12 - tmp13, FIX(`0.881747734`)), / c8 /
2609	CONST_BITS-PASS1_BITS);
2610
2611	tmp10 = MULTIPLY(tmp14 + tmp15, FIX(`1.105676686`)); / c6 /
2612
2613	dataptr[`2`] = (DCTELEM)
2614	DESCALE(tmp10 + MULTIPLY(tmp14, FIX(`0.273079590`)) / c2-c6 /
2615	+ MULTIPLY(tmp16, FIX(`0.613604268`)), / c10 /
2616	CONST_BITS-PASS1_BITS);
2617	dataptr[`6`] = (DCTELEM)
2618	DESCALE(tmp10 - MULTIPLY(tmp15, FIX(`1.719280954`)) / c6+c10 /
2619	- MULTIPLY(tmp16, FIX(`1.378756276`)), / c2 /
2620	CONST_BITS-PASS1_BITS);
2621
2622	/ Odd part /
2623
2624	tmp10 = tmp1 + tmp2;
2625	tmp11 = tmp5 - tmp4;
2626	dataptr[`7`] = (DCTELEM) ((tmp0 - tmp10 + tmp3 - tmp11 - tmp6) << PASS1_BITS);
2627	tmp3 <<= CONST_BITS;
2628	tmp10 = MULTIPLY(tmp10, - FIX(`0.158341681`)); / -c13 /
2629	tmp11 = MULTIPLY(tmp11, FIX(`1.405321284`)); / c1 /
2630	tmp10 += tmp11 - tmp3;
2631	tmp11 = MULTIPLY(tmp0 + tmp2, FIX(`1.197448846`)) + / c5 /
2632	MULTIPLY(tmp4 + tmp6, FIX(`0.752406978`)); / c9 /
2633	dataptr[`5`] = (DCTELEM)
2634	DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(`2.373959773`)) / c3+c5-c13 /
2635	+ MULTIPLY(tmp4, FIX(`1.119999435`)), / c1+c11-c9 /
2636	CONST_BITS-PASS1_BITS);
2637	tmp12 = MULTIPLY(tmp0 + tmp1, FIX(`1.334852607`)) + / c3 /
2638	MULTIPLY(tmp5 - tmp6, FIX(`0.467085129`)); / c11 /
2639	dataptr[`3`] = (DCTELEM)
2640	DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(`0.424103948`)) / c3-c9-c13 /
2641	- MULTIPLY(tmp5, FIX(`3.069855259`)), / c1+c5+c11 /
2642	CONST_BITS-PASS1_BITS);
2643	dataptr[`1`] = (DCTELEM)
2644	DESCALE(tmp11 + tmp12 + tmp3 + tmp6 -
2645	MULTIPLY(tmp0 + tmp6, FIX(`1.126980169`)), / c3+c5-c1 /
2646	CONST_BITS-PASS1_BITS);
2647
2648	dataptr += DCTSIZE; / advance pointer to next row /
2649	}
2650
2651	/ Pass 2: process columns.*
2652	* We remove the PASS1_BITS scaling, but leave the results scaled up
2653	* by an overall factor of 8.
2654	* We must also scale the output by (8/14)*(8/7) = 32/49, which we
2655	* partially fold into the constant multipliers and final shifting:
2656	* 7-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/14) 64/49.
2657	*/
2658
2659	dataptr = data;
2660	for (ctr = DCTSIZE-`1`; ctr >= `0`; ctr--) {
2661	/ Even part /
2662
2663	tmp0 = dataptr[DCTSIZE`0`] + dataptr[DCTSIZE`6`];
2664	tmp1 = dataptr[DCTSIZE`1`] + dataptr[DCTSIZE`5`];
2665	tmp2 = dataptr[DCTSIZE`2`] + dataptr[DCTSIZE`4`];
2666	tmp3 = dataptr[DCTSIZE*`3`];
2667
2668	tmp10 = dataptr[DCTSIZE`0`] - dataptr[DCTSIZE`6`];
2669	tmp11 = dataptr[DCTSIZE`1`] - dataptr[DCTSIZE`5`];
2670	tmp12 = dataptr[DCTSIZE`2`] - dataptr[DCTSIZE`4`];
2671
2672	z1 = tmp0 + tmp2;
2673	dataptr[DCTSIZE*`0`] = (DCTELEM)
2674	DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(`1.306122449`)), / 64/49 /
2675	CONST_BITS+PASS1_BITS+`1`);
2676	tmp3 += tmp3;
2677	z1 -= tmp3;
2678	z1 -= tmp3;
2679	z1 = MULTIPLY(z1, FIX(`0.461784020`)); / (c2+c6-c4)/2 /
2680	z2 = MULTIPLY(tmp0 - tmp2, FIX(`1.202428084`)); / (c2+c4-c6)/2 /
2681	z3 = MULTIPLY(tmp1 - tmp2, FIX(`0.411026446`)); / c6 /
2682	dataptr[DCTSIZE*`2`] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS+`1`);
2683	z1 -= z2;
2684	z2 = MULTIPLY(tmp0 - tmp1, FIX(`1.151670509`)); / c4 /
2685	dataptr[DCTSIZE*`4`] = (DCTELEM)
2686	DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(`0.923568041`)), / c2+c6-c4 /
2687	CONST_BITS+PASS1_BITS+`1`);
2688	dataptr[DCTSIZE*`6`] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS+`1`);
2689
2690	/ Odd part /
2691
2692	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(`1.221765677`)); / (c3+c1-c5)/2 /
2693	tmp2 = MULTIPLY(tmp10 - tmp11, FIX(`0.222383464`)); / (c3+c5-c1)/2 /
2694	tmp0 = tmp1 - tmp2;
2695	tmp1 += tmp2;
2696	tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(`1.800824523`)); / -c1 /
2697	tmp1 += tmp2;
2698	tmp3 = MULTIPLY(tmp10 + tmp12, FIX(`0.801442310`)); / c5 /
2699	tmp0 += tmp3;
2700	tmp2 += tmp3 + MULTIPLY(tmp12, FIX(`2.443531355`)); / c3+c1-c5 /
2701
2702	dataptr[DCTSIZE*`1`] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS+`1`);
2703	dataptr[DCTSIZE*`3`] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS+`1`);
2704	dataptr[DCTSIZE*`5`] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS+`1`);
2705
2706	dataptr++; / advance pointer to next column /
2707	}
2708	}
2709
2710
2711	/*
2712	* Perform the forward DCT on a 12x6 sample block.
2713	*
2714	* 12-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
2715	*/
2716
2717	GLOBAL(void)
2718	jpeg_fdct_12x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2719	{
2720	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
2721	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
2722	DCTELEM *dataptr;
2723	JSAMPROW elemptr;
2724	int ctr;
2725	SHIFT_TEMPS
2726
2727	/ Zero 2 bottom rows of output coefficient block. /
2728	MEMZERO(&data[DCTSIZE`6`], SIZEOF(DCTELEM) DCTSIZE * `2`);
2729
2730	/ Pass 1: process rows. /
2731	/ Note results are scaled up by sqrt(8) compared to a true DCT; /
2732	/ furthermore, we scale the results by 2*PASS1_BITS. /*
2733	/ 12-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/24). /*
2734
2735	dataptr = data;
2736	for (ctr = `0`; ctr < `6`; ctr++) {
2737	elemptr = sample_data[ctr] + start_col;
2738
2739	/ Even part /
2740
2741	tmp0 = GETJSAMPLE(elemptr[`0`]) + GETJSAMPLE(elemptr[`11`]);
2742	tmp1 = GETJSAMPLE(elemptr[`1`]) + GETJSAMPLE(elemptr[`10`]);
2743	tmp2 = GETJSAMPLE(elemptr[`2`]) + GETJSAMPLE(elemptr[`9`]);
2744	tmp3 = GETJSAMPLE(elemptr[`3`]) + GETJSAMPLE(elemptr[`8`]);
2745	tmp4 = GETJSAMPLE(elemptr[`4`]) + GETJSAMPLE(elemptr[`7`]);
2746	tmp5 = GETJSAMPLE(elemptr[`5`]) + GETJSAMPLE(elemptr[`6`]);
2747
2748	tmp10 = tmp0 + tmp5;
2749	tmp13 = tmp0 - tmp5;
2750	tmp11 = tmp1 + tmp4;
2751	tmp14 = tmp1 - tmp4;
2752	tmp12 = tmp2 + tmp3;
2753	tmp15 = tmp2 - tmp3;
2754
2755	tmp0 = GETJSAMPLE(elemptr[`0`]) - GETJSAMPLE(elemptr[`11`]);
2756	tmp1 = GETJSAMPLE(elemptr[`1`]) - GETJSAMPLE(elemptr[`10`]);
2757	tmp2 = GETJSAMPLE(elemptr[`2`]) - GETJSAMPLE(elemptr[`9`]);
2758	tmp3 = GETJSAMPLE(elemptr[`3`]) - GETJSAMPLE(elemptr[`8`]);
2759	tmp4 = GETJSAMPLE(elemptr[`4`]) - GETJSAMPLE(elemptr[`7`]);
2760	tmp5 = GETJSAMPLE(elemptr[`5`]) - GETJSAMPLE(elemptr[`6`]);
2761
2762	/ Apply unsigned->signed conversion /
2763	dataptr[`0`] = (DCTELEM)
2764	((tmp10 + tmp11 + tmp12 - `12` * CENTERJSAMPLE) << PASS1_BITS);
2765	dataptr[`6`] = (DCTELEM) ((tmp13 - tmp14 - tmp15) << PASS1_BITS);
2766	dataptr[`4`] = (DCTELEM)
2767	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(`1.224744871`)), / c4 /
2768	CONST_BITS-PASS1_BITS);
2769	dataptr[`2`] = (DCTELEM)
2770	DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(`1.366025404`)), / c2 /
2771	CONST_BITS-PASS1_BITS);
2772
2773	/ Odd part /
2774
2775	tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100); / c9 /
2776	tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865); / c3-c9 /
2777	tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065); / c3+c9 /
2778	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(`1.121971054`)); / c5 /
2779	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(`0.860918669`)); / c7 /
2780	tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(`0.580774953`)) / c5+c7-c1 /
2781	+ MULTIPLY(tmp5, FIX(`0.184591911`)); / c11 /
2782	tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(`0.184591911`)); / -c11 /
2783	tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(`2.339493912`)) / c1+c5-c11 /
2784	+ MULTIPLY(tmp5, FIX(`0.860918669`)); / c7 /
2785	tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(`0.725788011`)) / c1+c11-c7 /
2786	- MULTIPLY(tmp5, FIX(`1.121971054`)); / c5 /
2787	tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(`1.306562965`)) / c3 /
2788	- MULTIPLY(tmp2 + tmp5, FIX_0_541196100); / c9 /
2789
2790	dataptr[`1`] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
2791	dataptr[`3`] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
2792	dataptr[`5`] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
2793	dataptr[`7`] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
2794
2795	dataptr += DCTSIZE; / advance pointer to next row /
2796	}
2797
2798	/ Pass 2: process columns.*
2799	* We remove the PASS1_BITS scaling, but leave the results scaled up
2800	* by an overall factor of 8.
2801	* We must also scale the output by (8/12)*(8/6) = 8/9, which we
2802	* partially fold into the constant multipliers and final shifting:
2803	* 6-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/12) 16/9.
2804	*/
2805
2806	dataptr = data;
2807	for (ctr = DCTSIZE-`1`; ctr >= `0`; ctr--) {
2808	/ Even part /
2809
2810	tmp0 = dataptr[DCTSIZE`0`] + dataptr[DCTSIZE`5`];
2811	tmp11 = dataptr[DCTSIZE`1`] + dataptr[DCTSIZE`4`];
2812	tmp2 = dataptr[DCTSIZE`2`] + dataptr[DCTSIZE`3`];
2813
2814	tmp10 = tmp0 + tmp2;
2815	tmp12 = tmp0 - tmp2;
2816
2817	tmp0 = dataptr[DCTSIZE`0`] - dataptr[DCTSIZE`5`];
2818	tmp1 = dataptr[DCTSIZE`1`] - dataptr[DCTSIZE`4`];
2819	tmp2 = dataptr[DCTSIZE`2`] - dataptr[DCTSIZE`3`];
2820
2821	dataptr[DCTSIZE*`0`] = (DCTELEM)
2822	DESCALE(MULTIPLY(tmp10 + tmp11, FIX(`1.777777778`)), / 16/9 /
2823	CONST_BITS+PASS1_BITS+`1`);
2824	dataptr[DCTSIZE*`2`] = (DCTELEM)
2825	DESCALE(MULTIPLY(tmp12, FIX(`2.177324216`)), / c2 /
2826	CONST_BITS+PASS1_BITS+`1`);
2827	dataptr[DCTSIZE*`4`] = (DCTELEM)
2828	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(`1.257078722`)), / c4 /
2829	CONST_BITS+PASS1_BITS+`1`);
2830
2831	/ Odd part /
2832
2833	tmp10 = MULTIPLY(tmp0 + tmp2, FIX(`0.650711829`)); / c5 /
2834
2835	dataptr[DCTSIZE*`1`] = (DCTELEM)
2836	DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(`1.777777778`)), / 16/9 /
2837	CONST_BITS+PASS1_BITS+`1`);
2838	dataptr[DCTSIZE*`3`] = (DCTELEM)
2839	DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(`1.777777778`)), / 16/9 /
2840	CONST_BITS+PASS1_BITS+`1`);
2841	dataptr[DCTSIZE*`5`] = (DCTELEM)
2842	DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(`1.777777778`)), / 16/9 /
2843	CONST_BITS+PASS1_BITS+`1`);
2844
2845	dataptr++; / advance pointer to next column /
2846	}
2847	}
2848
2849
2850	/*
2851	* Perform the forward DCT on a 10x5 sample block.
2852	*
2853	* 10-point FDCT in pass 1 (rows), 5-point in pass 2 (columns).
2854	*/
2855
2856	GLOBAL(void)
2857	jpeg_fdct_10x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2858	{
2859	INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
2860	INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
2861	DCTELEM *dataptr;
2862	JSAMPROW elemptr;
2863	int ctr;
2864	SHIFT_TEMPS
2865
2866	/ Zero 3 bottom rows of output coefficient block. /
2867	MEMZERO(&data[DCTSIZE`5`], SIZEOF(DCTELEM) DCTSIZE * `3`);
2868
2869	/ Pass 1: process rows. /
2870	/ Note results are scaled up by sqrt(8) compared to a true DCT; /
2871	/ furthermore, we scale the results by 2*PASS1_BITS. /*
2872	/ 10-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/20). /*
2873
2874	dataptr = data;
2875	for (ctr = `0`; ctr < `5`; ctr++) {
2876	elemptr = sample_data[ctr] + start_col;
2877
2878	/ Even part /
2879
2880	tmp0 = GETJSAMPLE(elemptr[`0`]) + GETJSAMPLE(elemptr[`9`]);
2881	tmp1 = GETJSAMPLE(elemptr[`1`]) + GETJSAMPLE(elemptr[`8`]);
2882	tmp12 = GETJSAMPLE(elemptr[`2`]) + GETJSAMPLE(elemptr[`7`]);
2883	tmp3 = GETJSAMPLE(elemptr[`3`]) + GETJSAMPLE(elemptr[`6`]);
2884	tmp4 = GETJSAMPLE(elemptr[`4`]) + GETJSAMPLE(elemptr[`5`]);
2885
2886	tmp10 = tmp0 + tmp4;
2887	tmp13 = tmp0 - tmp4;
2888	tmp11 = tmp1 + tmp3;
2889	tmp14 = tmp1 - tmp3;
2890
2891	tmp0 = GETJSAMPLE(elemptr[`0`]) - GETJSAMPLE(elemptr[`9`]);
2892	tmp1 = GETJSAMPLE(elemptr[`1`]) - GETJSAMPLE(elemptr[`8`]);
2893	tmp2 = GETJSAMPLE(elemptr[`2`]) - GETJSAMPLE(elemptr[`7`]);
2894	tmp3 = GETJSAMPLE(elemptr[`3`]) - GETJSAMPLE(elemptr[`6`]);
2895	tmp4 = GETJSAMPLE(elemptr[`4`]) - GETJSAMPLE(elemptr[`5`]);
2896
2897	/ Apply unsigned->signed conversion /
2898	dataptr[`0`] = (DCTELEM)
2899	((tmp10 + tmp11 + tmp12 - `10` * CENTERJSAMPLE) << PASS1_BITS);
2900	tmp12 += tmp12;
2901	dataptr[`4`] = (DCTELEM)
2902	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(`1.144122806`)) - / c4 /
2903	MULTIPLY(tmp11 - tmp12, FIX(`0.437016024`)), / c8 /
2904	CONST_BITS-PASS1_BITS);
2905	tmp10 = MULTIPLY(tmp13 + tmp14, FIX(`0.831253876`)); / c6 /
2906	dataptr[`2`] = (DCTELEM)
2907	DESCALE(tmp10 + MULTIPLY(tmp13, FIX(`0.513743148`)), / c2-c6 /
2908	CONST_BITS-PASS1_BITS);
2909	dataptr[`6`] = (DCTELEM)
2910	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(`2.176250899`)), / c2+c6 /
2911	CONST_BITS-PASS1_BITS);
2912
2913	/ Odd part /
2914
2915	tmp10 = tmp0 + tmp4;
2916	tmp11 = tmp1 - tmp3;
2917	dataptr[`5`] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << PASS1_BITS);
2918	tmp2 <<= CONST_BITS;
2919	dataptr[`1`] = (DCTELEM)
2920	DESCALE(MULTIPLY(tmp0, FIX(`1.396802247`)) + / c1 /
2921	MULTIPLY(tmp1, FIX(`1.260073511`)) + tmp2 + / c3 /
2922	MULTIPLY(tmp3, FIX(`0.642039522`)) + / c7 /
2923	MULTIPLY(tmp4, FIX(`0.221231742`)), / c9 /
2924	CONST_BITS-PASS1_BITS);
2925	tmp12 = MULTIPLY(tmp0 - tmp4, FIX(`0.951056516`)) - / (c3+c7)/2 /
2926	MULTIPLY(tmp1 + tmp3, FIX(`0.587785252`)); / (c1-c9)/2 /
2927	tmp13 = MULTIPLY(tmp10 + tmp11, FIX(`0.309016994`)) + / (c3-c7)/2 /
2928	(tmp11 << (CONST_BITS - `1`)) - tmp2;
2929	dataptr[`3`] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-PASS1_BITS);
2930	dataptr[`7`] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-PASS1_BITS);
2931
2932	dataptr += DCTSIZE; / advance pointer to next row /
2933	}
2934
2935	/ Pass 2: process columns.*
2936	* We remove the PASS1_BITS scaling, but leave the results scaled up
2937	* by an overall factor of 8.
2938	* We must also scale the output by (8/10)*(8/5) = 32/25, which we
2939	* fold into the constant multipliers:
2940	* 5-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/10) 32/25.
2941	*/
2942
2943	dataptr = data;
2944	for (ctr = DCTSIZE-`1`; ctr >= `0`; ctr--) {
2945	/ Even part /
2946
2947	tmp0 = dataptr[DCTSIZE`0`] + dataptr[DCTSIZE`4`];
2948	tmp1 = dataptr[DCTSIZE`1`] + dataptr[DCTSIZE`3`];
2949	tmp2 = dataptr[DCTSIZE*`2`];
2950
2951	tmp10 = tmp0 + tmp1;
2952	tmp11 = tmp0 - tmp1;
2953
2954	tmp0 = dataptr[DCTSIZE`0`] - dataptr[DCTSIZE`4`];
2955	tmp1 = dataptr[DCTSIZE`1`] - dataptr[DCTSIZE`3`];
2956
2957	dataptr[DCTSIZE*`0`] = (DCTELEM)
2958	DESCALE(MULTIPLY(tmp10 + tmp2, FIX(`1.28`)), / 32/25 /
2959	CONST_BITS+PASS1_BITS);
2960	tmp11 = MULTIPLY(tmp11, FIX(`1.011928851`)); / (c2+c4)/2 /
2961	tmp10 -= tmp2 << `2`;
2962	tmp10 = MULTIPLY(tmp10, FIX(`0.452548340`)); / (c2-c4)/2 /
2963	dataptr[DCTSIZE*`2`] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS);
2964	dataptr[DCTSIZE*`4`] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS);
2965
2966	/ Odd part /
2967
2968	tmp10 = MULTIPLY(tmp0 + tmp1, FIX(`1.064004961`)); / c3 /
2969
2970	dataptr[DCTSIZE*`1`] = (DCTELEM)
2971	DESCALE(tmp10 + MULTIPLY(tmp0, FIX(`0.657591230`)), / c1-c3 /
2972	CONST_BITS+PASS1_BITS);
2973	dataptr[DCTSIZE*`3`] = (DCTELEM)
2974	DESCALE(tmp10 - MULTIPLY(tmp1, FIX(`2.785601151`)), / c1+c3 /
2975	CONST_BITS+PASS1_BITS);
2976
2977	dataptr++; / advance pointer to next column /
2978	}
2979	}
2980
2981
2982	/*
2983	* Perform the forward DCT on an 8x4 sample block.
2984	*
2985	* 8-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
2986	*/
2987
2988	GLOBAL(void)
2989	jpeg_fdct_8x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2990	{
2991	INT32 tmp0, tmp1, tmp2, tmp3;
2992	INT32 tmp10, tmp11, tmp12, tmp13;
2993	INT32 z1;
2994	DCTELEM *dataptr;
2995	JSAMPROW elemptr;
2996	int ctr;
2997	SHIFT_TEMPS
2998
2999	/ Zero 4 bottom rows of output coefficient block. /
3000	MEMZERO(&data[DCTSIZE`4`], SIZEOF(DCTELEM) DCTSIZE * `4`);
3001
3002	/ Pass 1: process rows. /
3003	/ Note results are scaled up by sqrt(8) compared to a true DCT; /
3004	/ furthermore, we scale the results by 2*PASS1_BITS. /*
3005	/ We must also scale the output by 8/4 = 2, which we add here. /
3006
3007	dataptr = data;
3008	for (ctr = `0`; ctr < `4`; ctr++) {
3009	elemptr = sample_data[ctr] + start_col;
3010
3011	/ Even part per LL&M figure 1 --- note that published figure is faulty;*
3012	* rotator "sqrt(2)c1" should be "sqrt(2)c6".
3013	*/
3014
3015	tmp0 = GETJSAMPLE(elemptr[`0`]) + GETJSAMPLE(elemptr[`7`]);
3016	tmp1 = GETJSAMPLE(elemptr[`1`]) + GETJSAMPLE(elemptr[`6`]);
3017	tmp2 = GETJSAMPLE(elemptr[`2`]) + GETJSAMPLE(elemptr[`5`]);
3018	tmp3 = GETJSAMPLE(elemptr[`3`]) + GETJSAMPLE(elemptr[`4`]);
3019
3020	tmp10 = tmp0 + tmp3;
3021	tmp12 = tmp0 - tmp3;
3022	tmp11 = tmp1 + tmp2;
3023	tmp13 = tmp1 - tmp2;
3024
3025	tmp0 = GETJSAMPLE(elemptr[`0`]) - GETJSAMPLE(elemptr[`7`]);
3026	tmp1 = GETJSAMPLE(elemptr[`1`]) - GETJSAMPLE(elemptr[`6`]);
3027	tmp2 = GETJSAMPLE(elemptr[`2`]) - GETJSAMPLE(elemptr[`5`]);
3028	tmp3 = GETJSAMPLE(elemptr[`3`]) - GETJSAMPLE(elemptr[`4`]);
3029
3030	/ Apply unsigned->signed conversion /
3031	dataptr[`0`] = (DCTELEM)
3032	((tmp10 + tmp11 - `8` * CENTERJSAMPLE) << (PASS1_BITS+`1`));
3033	dataptr[`4`] = (DCTELEM) ((tmp10 - tmp11) << (PASS1_BITS+`1`));
3034
3035	z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
3036	/ Add fudge factor here for final descale. /
3037	z1 += ONE << (CONST_BITS-PASS1_BITS-`2`);
3038	dataptr[`2`] = (DCTELEM) RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865),
3039	CONST_BITS-PASS1_BITS-`1`);
3040	dataptr[`6`] = (DCTELEM) RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065),
3041	CONST_BITS-PASS1_BITS-`1`);
3042
3043	/ Odd part per figure 8 --- note paper omits factor of sqrt(2).*
3044	* 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
3045	* i0..i3 in the paper are tmp0..tmp3 here.
3046	*/
3047
3048	tmp10 = tmp0 + tmp3;
3049	tmp11 = tmp1 + tmp2;
3050	tmp12 = tmp0 + tmp2;
3051	tmp13 = tmp1 + tmp3;
3052	z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); / c3 /
3053	/ Add fudge factor here for final descale. /
3054	z1 += ONE << (CONST_BITS-PASS1_BITS-`2`);
3055
3056	tmp0 = MULTIPLY(tmp0, FIX_1_501321110); / c1+c3-c5-c7 /
3057	tmp1 = MULTIPLY(tmp1, FIX_3_072711026); / c1+c3+c5-c7 /
3058	tmp2 = MULTIPLY(tmp2, FIX_2_053119869); / c1+c3-c5+c7 /
3059	tmp3 = MULTIPLY(tmp3, FIX_0_298631336); / -c1+c3+c5-c7 /
3060	tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); / c7-c3 /
3061	tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); / -c1-c3 /
3062	tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); / c5-c3 /
3063	tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); / -c3-c5 /
3064
3065	tmp12 += z1;
3066	tmp13 += z1;
3067
3068	dataptr[`1`] = (DCTELEM)
3069	RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS-`1`);
3070	dataptr[`3`] = (DCTELEM)
3071	RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS-`1`);
3072	dataptr[`5`] = (DCTELEM)
3073	RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS-`1`);
3074	dataptr[`7`] = (DCTELEM)
3075	RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS-`1`);
3076
3077	dataptr += DCTSIZE; / advance pointer to next row /
3078	}
3079
3080	/ Pass 2: process columns.*
3081	* We remove the PASS1_BITS scaling, but leave the results scaled up
3082	* by an overall factor of 8.
3083	* 4-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
3084	*/
3085
3086	dataptr = data;
3087	for (ctr = DCTSIZE-`1`; ctr >= `0`; ctr--) {
3088	/ Even part /
3089
3090	/ Add fudge factor here for final descale. /
3091	tmp0 = dataptr[DCTSIZE`0`] + dataptr[DCTSIZE`3`] + (ONE << (PASS1_BITS-`1`));
3092	tmp1 = dataptr[DCTSIZE`1`] + dataptr[DCTSIZE`2`];
3093
3094	tmp10 = dataptr[DCTSIZE`0`] - dataptr[DCTSIZE`3`];
3095	tmp11 = dataptr[DCTSIZE`1`] - dataptr[DCTSIZE`2`];
3096
3097	dataptr[DCTSIZE*`0`] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
3098	dataptr[DCTSIZE*`2`] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
3099
3100	/ Odd part /
3101
3102	tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); / c6 /
3103	/ Add fudge factor here for final descale. /
3104	tmp0 += ONE << (CONST_BITS+PASS1_BITS-`1`);
3105
3106	dataptr[DCTSIZE*`1`] = (DCTELEM)
3107	RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), / c2-c6 /
3108	CONST_BITS+PASS1_BITS);
3109	dataptr[DCTSIZE*`3`] = (DCTELEM)
3110	RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), / c2+c6 /
3111	CONST_BITS+PASS1_BITS);
3112
3113	dataptr++; / advance pointer to next column /
3114	}
3115	}
3116
3117
3118	/*
3119	* Perform the forward DCT on a 6x3 sample block.
3120	*
3121	* 6-point FDCT in pass 1 (rows), 3-point in pass 2 (columns).
3122	*/
3123
3124	GLOBAL(void)
3125	jpeg_fdct_6x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3126	{
3127	INT32 tmp0, tmp1, tmp2;
3128	INT32 tmp10, tmp11, tmp12;
3129	DCTELEM *dataptr;
3130	JSAMPROW elemptr;
3131	int ctr;
3132	SHIFT_TEMPS
3133
3134	/ Pre-zero output coefficient block. /
3135	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3136
3137	/ Pass 1: process rows. /
3138	/ Note results are scaled up by sqrt(8) compared to a true DCT; /
3139	/ furthermore, we scale the results by 2*PASS1_BITS. /*
3140	/ We scale the results further by 2 as part of output adaption /
3141	/ scaling for different DCT size. /
3142	/ 6-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/12). /*
3143
3144	dataptr = data;
3145	for (ctr = `0`; ctr < `3`; ctr++) {
3146	elemptr = sample_data[ctr] + start_col;
3147
3148	/ Even part /
3149
3150	tmp0 = GETJSAMPLE(elemptr[`0`]) + GETJSAMPLE(elemptr[`5`]);
3151	tmp11 = GETJSAMPLE(elemptr[`1`]) + GETJSAMPLE(elemptr[`4`]);
3152	tmp2 = GETJSAMPLE(elemptr[`2`]) + GETJSAMPLE(elemptr[`3`]);
3153
3154	tmp10 = tmp0 + tmp2;
3155	tmp12 = tmp0 - tmp2;
3156
3157	tmp0 = GETJSAMPLE(elemptr[`0`]) - GETJSAMPLE(elemptr[`5`]);
3158	tmp1 = GETJSAMPLE(elemptr[`1`]) - GETJSAMPLE(elemptr[`4`]);
3159	tmp2 = GETJSAMPLE(elemptr[`2`]) - GETJSAMPLE(elemptr[`3`]);
3160
3161	/ Apply unsigned->signed conversion /
3162	dataptr[`0`] = (DCTELEM)
3163	((tmp10 + tmp11 - `6` * CENTERJSAMPLE) << (PASS1_BITS+`1`));
3164	dataptr[`2`] = (DCTELEM)
3165	DESCALE(MULTIPLY(tmp12, FIX(`1.224744871`)), / c2 /
3166	CONST_BITS-PASS1_BITS-`1`);
3167	dataptr[`4`] = (DCTELEM)
3168	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(`0.707106781`)), / c4 /
3169	CONST_BITS-PASS1_BITS-`1`);
3170
3171	/ Odd part /
3172
3173	tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(`0.366025404`)), / c5 /
3174	CONST_BITS-PASS1_BITS-`1`);
3175
3176	dataptr[`1`] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << (PASS1_BITS+`1`)));
3177	dataptr[`3`] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << (PASS1_BITS+`1`));
3178	dataptr[`5`] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << (PASS1_BITS+`1`)));
3179
3180	dataptr += DCTSIZE; / advance pointer to next row /
3181	}
3182
3183	/ Pass 2: process columns.*
3184	* We remove the PASS1_BITS scaling, but leave the results scaled up
3185	* by an overall factor of 8.
3186	* We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
3187	* fold into the constant multipliers (other part was done in pass 1):
3188	* 3-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/6) 16/9.
3189	*/
3190
3191	dataptr = data;
3192	for (ctr = `0`; ctr < `6`; ctr++) {
3193	/ Even part /
3194
3195	tmp0 = dataptr[DCTSIZE`0`] + dataptr[DCTSIZE`2`];
3196	tmp1 = dataptr[DCTSIZE*`1`];
3197
3198	tmp2 = dataptr[DCTSIZE`0`] - dataptr[DCTSIZE`2`];
3199
3200	dataptr[DCTSIZE*`0`] = (DCTELEM)
3201	DESCALE(MULTIPLY(tmp0 + tmp1, FIX(`1.777777778`)), / 16/9 /
3202	CONST_BITS+PASS1_BITS);
3203	dataptr[DCTSIZE*`2`] = (DCTELEM)
3204	DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(`1.257078722`)), / c2 /
3205	CONST_BITS+PASS1_BITS);
3206
3207	/ Odd part /
3208
3209	dataptr[DCTSIZE*`1`] = (DCTELEM)
3210	DESCALE(MULTIPLY(tmp2, FIX(`2.177324216`)), / c1 /
3211	CONST_BITS+PASS1_BITS);
3212
3213	dataptr++; / advance pointer to next column /
3214	}
3215	}
3216
3217
3218	/*
3219	* Perform the forward DCT on a 4x2 sample block.
3220	*
3221	* 4-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
3222	*/
3223
3224	GLOBAL(void)
3225	jpeg_fdct_4x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3226	{
3227	INT32 tmp0, tmp1;
3228	INT32 tmp10, tmp11;
3229	DCTELEM *dataptr;
3230	JSAMPROW elemptr;
3231	int ctr;
3232	SHIFT_TEMPS
3233
3234	/ Pre-zero output coefficient block. /
3235	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3236
3237	/ Pass 1: process rows. /
3238	/ Note results are scaled up by sqrt(8) compared to a true DCT; /
3239	/ furthermore, we scale the results by 2*PASS1_BITS. /*
3240	/ We must also scale the output by (8/4)(8/2) = 23, which we add here. /*
3241	/ 4-point FDCT kernel, /
3242	/ cK represents sqrt(2) * cos(Kpi/16) [refers to 8-point FDCT]. /*
3243
3244	dataptr = data;
3245	for (ctr = `0`; ctr < `2`; ctr++) {
3246	elemptr = sample_data[ctr] + start_col;
3247
3248	/ Even part /
3249
3250	tmp0 = GETJSAMPLE(elemptr[`0`]) + GETJSAMPLE(elemptr[`3`]);
3251	tmp1 = GETJSAMPLE(elemptr[`1`]) + GETJSAMPLE(elemptr[`2`]);
3252
3253	tmp10 = GETJSAMPLE(elemptr[`0`]) - GETJSAMPLE(elemptr[`3`]);
3254	tmp11 = GETJSAMPLE(elemptr[`1`]) - GETJSAMPLE(elemptr[`2`]);
3255
3256	/ Apply unsigned->signed conversion /
3257	dataptr[`0`] = (DCTELEM)
3258	((tmp0 + tmp1 - `4` * CENTERJSAMPLE) << (PASS1_BITS+`3`));
3259	dataptr[`2`] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+`3`));
3260
3261	/ Odd part /
3262
3263	tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); / c6 /
3264	/ Add fudge factor here for final descale. /
3265	tmp0 += ONE << (CONST_BITS-PASS1_BITS-`4`);
3266
3267	dataptr[`1`] = (DCTELEM)
3268	RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), / c2-c6 /
3269	CONST_BITS-PASS1_BITS-`3`);
3270	dataptr[`3`] = (DCTELEM)
3271	RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), / c2+c6 /
3272	CONST_BITS-PASS1_BITS-`3`);
3273
3274	dataptr += DCTSIZE; / advance pointer to next row /
3275	}
3276
3277	/ Pass 2: process columns.*
3278	* We remove the PASS1_BITS scaling, but leave the results scaled up
3279	* by an overall factor of 8.
3280	*/
3281
3282	dataptr = data;
3283	for (ctr = `0`; ctr < `4`; ctr++) {
3284	/ Even part /
3285
3286	/ Add fudge factor here for final descale. /
3287	tmp0 = dataptr[DCTSIZE*`0`] + (ONE << (PASS1_BITS-`1`));
3288	tmp1 = dataptr[DCTSIZE*`1`];
3289
3290	dataptr[DCTSIZE*`0`] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
3291
3292	/ Odd part /
3293
3294	dataptr[DCTSIZE*`1`] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
3295
3296	dataptr++; / advance pointer to next column /
3297	}
3298	}
3299
3300
3301	/*
3302	* Perform the forward DCT on a 2x1 sample block.
3303	*
3304	* 2-point FDCT in pass 1 (rows), 1-point in pass 2 (columns).
3305	*/
3306
3307	GLOBAL(void)
3308	jpeg_fdct_2x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3309	{
3310	INT32 tmp0, tmp1;
3311	JSAMPROW elemptr;
3312
3313	/ Pre-zero output coefficient block. /
3314	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3315
3316	elemptr = sample_data[`0`] + start_col;
3317
3318	tmp0 = GETJSAMPLE(elemptr[`0`]);
3319	tmp1 = GETJSAMPLE(elemptr[`1`]);
3320
3321	/ We leave the results scaled up by an overall factor of 8.*
3322	* We must also scale the output by (8/2)(8/1) = 2*5.
3323	*/
3324
3325	/ Even part /
3326	/ Apply unsigned->signed conversion /
3327	data[`0`] = (DCTELEM) ((tmp0 + tmp1 - `2` * CENTERJSAMPLE) << `5`);
3328
3329	/ Odd part /
3330	data[`1`] = (DCTELEM) ((tmp0 - tmp1) << `5`);
3331	}
3332
3333
3334	/*
3335	* Perform the forward DCT on an 8x16 sample block.
3336	*
3337	* 8-point FDCT in pass 1 (rows), 16-point in pass 2 (columns).
3338	*/
3339
3340	GLOBAL(void)
3341	jpeg_fdct_8x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3342	{
3343	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3344	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
3345	INT32 z1;
3346	DCTELEM workspace[DCTSIZE2];
3347	DCTELEM *dataptr;
3348	DCTELEM *wsptr;
3349	JSAMPROW elemptr;
3350	int ctr;
3351	SHIFT_TEMPS
3352
3353	/ Pass 1: process rows. /
3354	/ Note results are scaled up by sqrt(8) compared to a true DCT; /
3355	/ furthermore, we scale the results by 2*PASS1_BITS. /*
3356
3357	dataptr = data;
3358	ctr = `0`;
3359	for (;;) {
3360	elemptr = sample_data[ctr] + start_col;
3361
3362	/ Even part per LL&M figure 1 --- note that published figure is faulty;*
3363	* rotator "sqrt(2)c1" should be "sqrt(2)c6".
3364	*/
3365
3366	tmp0 = GETJSAMPLE(elemptr[`0`]) + GETJSAMPLE(elemptr[`7`]);
3367	tmp1 = GETJSAMPLE(elemptr[`1`]) + GETJSAMPLE(elemptr[`6`]);
3368	tmp2 = GETJSAMPLE(elemptr[`2`]) + GETJSAMPLE(elemptr[`5`]);
3369	tmp3 = GETJSAMPLE(elemptr[`3`]) + GETJSAMPLE(elemptr[`4`]);
3370
3371	tmp10 = tmp0 + tmp3;
3372	tmp12 = tmp0 - tmp3;
3373	tmp11 = tmp1 + tmp2;
3374	tmp13 = tmp1 - tmp2;
3375
3376	tmp0 = GETJSAMPLE(elemptr[`0`]) - GETJSAMPLE(elemptr[`7`]);
3377	tmp1 = GETJSAMPLE(elemptr[`1`]) - GETJSAMPLE(elemptr[`6`]);
3378	tmp2 = GETJSAMPLE(elemptr[`2`]) - GETJSAMPLE(elemptr[`5`]);
3379	tmp3 = GETJSAMPLE(elemptr[`3`]) - GETJSAMPLE(elemptr[`4`]);
3380
3381	/ Apply unsigned->signed conversion /
3382	dataptr[`0`] = (DCTELEM) ((tmp10 + tmp11 - `8` * CENTERJSAMPLE) << PASS1_BITS);
3383	dataptr[`4`] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
3384
3385	z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
3386	dataptr[`2`] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865),
3387	CONST_BITS-PASS1_BITS);
3388	dataptr[`6`] = (DCTELEM) DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065),
3389	CONST_BITS-PASS1_BITS);
3390
3391	/ Odd part per figure 8 --- note paper omits factor of sqrt(2).*
3392	* 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
3393	* i0..i3 in the paper are tmp0..tmp3 here.
3394	*/
3395
3396	tmp10 = tmp0 + tmp3;
3397	tmp11 = tmp1 + tmp2;
3398	tmp12 = tmp0 + tmp2;
3399	tmp13 = tmp1 + tmp3;
3400	z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); / c3 /
3401
3402	tmp0 = MULTIPLY(tmp0, FIX_1_501321110); / c1+c3-c5-c7 /
3403	tmp1 = MULTIPLY(tmp1, FIX_3_072711026); / c1+c3+c5-c7 /
3404	tmp2 = MULTIPLY(tmp2, FIX_2_053119869); / c1+c3-c5+c7 /
3405	tmp3 = MULTIPLY(tmp3, FIX_0_298631336); / -c1+c3+c5-c7 /
3406	tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); / c7-c3 /
3407	tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); / -c1-c3 /
3408	tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); / c5-c3 /
3409	tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); / -c3-c5 /
3410
3411	tmp12 += z1;
3412	tmp13 += z1;
3413
3414	dataptr[`1`] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS);
3415	dataptr[`3`] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS);
3416	dataptr[`5`] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS);
3417	dataptr[`7`] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS);
3418
3419	ctr++;
3420
3421	if (ctr != DCTSIZE) {
3422	if (ctr == DCTSIZE * `2`)
3423	break; / Done. /
3424	dataptr += DCTSIZE; / advance pointer to next row /
3425	} else
3426	dataptr = workspace; / switch pointer to extended workspace /
3427	}
3428
3429	/ Pass 2: process columns.*
3430	* We remove the PASS1_BITS scaling, but leave the results scaled up
3431	* by an overall factor of 8.
3432	* We must also scale the output by 8/16 = 1/2.
3433	* 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
3434	*/
3435
3436	dataptr = data;
3437	wsptr = workspace;
3438	for (ctr = DCTSIZE-`1`; ctr >= `0`; ctr--) {
3439	/ Even part /
3440
3441	tmp0 = dataptr[DCTSIZE`0`] + wsptr[DCTSIZE`7`];
3442	tmp1 = dataptr[DCTSIZE`1`] + wsptr[DCTSIZE`6`];
3443	tmp2 = dataptr[DCTSIZE`2`] + wsptr[DCTSIZE`5`];
3444	tmp3 = dataptr[DCTSIZE`3`] + wsptr[DCTSIZE`4`];
3445	tmp4 = dataptr[DCTSIZE`4`] + wsptr[DCTSIZE`3`];
3446	tmp5 = dataptr[DCTSIZE`5`] + wsptr[DCTSIZE`2`];
3447	tmp6 = dataptr[DCTSIZE`6`] + wsptr[DCTSIZE`1`];
3448	tmp7 = dataptr[DCTSIZE`7`] + wsptr[DCTSIZE`0`];
3449
3450	tmp10 = tmp0 + tmp7;
3451	tmp14 = tmp0 - tmp7;
3452	tmp11 = tmp1 + tmp6;
3453	tmp15 = tmp1 - tmp6;
3454	tmp12 = tmp2 + tmp5;
3455	tmp16 = tmp2 - tmp5;
3456	tmp13 = tmp3 + tmp4;
3457	tmp17 = tmp3 - tmp4;
3458
3459	tmp0 = dataptr[DCTSIZE`0`] - wsptr[DCTSIZE`7`];
3460	tmp1 = dataptr[DCTSIZE`1`] - wsptr[DCTSIZE`6`];
3461	tmp2 = dataptr[DCTSIZE`2`] - wsptr[DCTSIZE`5`];
3462	tmp3 = dataptr[DCTSIZE`3`] - wsptr[DCTSIZE`4`];
3463	tmp4 = dataptr[DCTSIZE`4`] - wsptr[DCTSIZE`3`];
3464	tmp5 = dataptr[DCTSIZE`5`] - wsptr[DCTSIZE`2`];
3465	tmp6 = dataptr[DCTSIZE`6`] - wsptr[DCTSIZE`1`];
3466	tmp7 = dataptr[DCTSIZE`7`] - wsptr[DCTSIZE`0`];
3467
3468	dataptr[DCTSIZE*`0`] = (DCTELEM)
3469	DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+`1`);
3470	dataptr[DCTSIZE*`4`] = (DCTELEM)
3471	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(`1.306562965`)) + / c4[16] = c2[8] /
3472	MULTIPLY(tmp11 - tmp12, FIX_0_541196100), / c12[16] = c6[8] /
3473	CONST_BITS+PASS1_BITS+`1`);
3474
3475	tmp10 = MULTIPLY(tmp17 - tmp15, FIX(`0.275899379`)) + / c14[16] = c7[8] /
3476	MULTIPLY(tmp14 - tmp16, FIX(`1.387039845`)); / c2[16] = c1[8] /
3477
3478	dataptr[DCTSIZE*`2`] = (DCTELEM)
3479	DESCALE(tmp10 + MULTIPLY(tmp15, FIX(`1.451774982`)) / c6+c14 /
3480	+ MULTIPLY(tmp16, FIX(`2.172734804`)), / c2+c10 /
3481	CONST_BITS+PASS1_BITS+`1`);
3482	dataptr[DCTSIZE*`6`] = (DCTELEM)
3483	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(`0.211164243`)) / c2-c6 /
3484	- MULTIPLY(tmp17, FIX(`1.061594338`)), / c10+c14 /
3485	CONST_BITS+PASS1_BITS+`1`);
3486
3487	/ Odd part /
3488
3489	tmp11 = MULTIPLY(tmp0 + tmp1, FIX(`1.353318001`)) + / c3 /
3490	MULTIPLY(tmp6 - tmp7, FIX(`0.410524528`)); / c13 /
3491	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(`1.247225013`)) + / c5 /
3492	MULTIPLY(tmp5 + tmp7, FIX(`0.666655658`)); / c11 /
3493	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(`1.093201867`)) + / c7 /
3494	MULTIPLY(tmp4 - tmp7, FIX(`0.897167586`)); / c9 /
3495	tmp14 = MULTIPLY(tmp1 + tmp2, FIX(`0.138617169`)) + / c15 /
3496	MULTIPLY(tmp6 - tmp5, FIX(`1.407403738`)); / c1 /
3497	tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(`0.666655658`)) + / -c11 /
3498	MULTIPLY(tmp4 + tmp6, - FIX(`1.247225013`)); / -c5 /
3499	tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(`1.353318001`)) + / -c3 /
3500	MULTIPLY(tmp5 - tmp4, FIX(`0.410524528`)); / c13 /
3501	tmp10 = tmp11 + tmp12 + tmp13 -
3502	MULTIPLY(tmp0, FIX(`2.286341144`)) + / c7+c5+c3-c1 /
3503	MULTIPLY(tmp7, FIX(`0.779653625`)); / c15+c13-c11+c9 /
3504	tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(`0.071888074`)) / c9-c3-c15+c11 /
3505	- MULTIPLY(tmp6, FIX(`1.663905119`)); / c7+c13+c1-c5 /
3506	tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(`1.125726048`)) / c7+c5+c15-c3 /
3507	+ MULTIPLY(tmp5, FIX(`1.227391138`)); / c9-c11+c1-c13 /
3508	tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(`1.065388962`)) / c15+c3+c11-c7 /
3509	+ MULTIPLY(tmp4, FIX(`2.167985692`)); / c1+c13+c5-c9 /
3510
3511	dataptr[DCTSIZE*`1`] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+`1`);
3512	dataptr[DCTSIZE*`3`] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+`1`);
3513	dataptr[DCTSIZE*`5`] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+`1`);
3514	dataptr[DCTSIZE*`7`] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+`1`);
3515
3516	dataptr++; / advance pointer to next column /
3517	wsptr++; / advance pointer to next column /
3518	}
3519	}
3520
3521
3522	/*
3523	* Perform the forward DCT on a 7x14 sample block.
3524	*
3525	* 7-point FDCT in pass 1 (rows), 14-point in pass 2 (columns).
3526	*/
3527
3528	GLOBAL(void)
3529	jpeg_fdct_7x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3530	{
3531	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
3532	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
3533	INT32 z1, z2, z3;
3534	DCTELEM workspace[`8`*`6`];
3535	DCTELEM *dataptr;
3536	DCTELEM *wsptr;
3537	JSAMPROW elemptr;
3538	int ctr;
3539	SHIFT_TEMPS
3540
3541	/ Pre-zero output coefficient block. /
3542	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3543
3544	/ Pass 1: process rows. /
3545	/ Note results are scaled up by sqrt(8) compared to a true DCT; /
3546	/ furthermore, we scale the results by 2*PASS1_BITS. /*
3547	/ 7-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/14). /*
3548
3549	dataptr = data;
3550	ctr = `0`;
3551	for (;;) {
3552	elemptr = sample_data[ctr] + start_col;
3553
3554	/ Even part /
3555
3556	tmp0 = GETJSAMPLE(elemptr[`0`]) + GETJSAMPLE(elemptr[`6`]);
3557	tmp1 = GETJSAMPLE(elemptr[`1`]) + GETJSAMPLE(elemptr[`5`]);
3558	tmp2 = GETJSAMPLE(elemptr[`2`]) + GETJSAMPLE(elemptr[`4`]);
3559	tmp3 = GETJSAMPLE(elemptr[`3`]);
3560
3561	tmp10 = GETJSAMPLE(elemptr[`0`]) - GETJSAMPLE(elemptr[`6`]);
3562	tmp11 = GETJSAMPLE(elemptr[`1`]) - GETJSAMPLE(elemptr[`5`]);
3563	tmp12 = GETJSAMPLE(elemptr[`2`]) - GETJSAMPLE(elemptr[`4`]);
3564
3565	z1 = tmp0 + tmp2;
3566	/ Apply unsigned->signed conversion /
3567	dataptr[`0`] = (DCTELEM)
3568	((z1 + tmp1 + tmp3 - `7` * CENTERJSAMPLE) << PASS1_BITS);
3569	tmp3 += tmp3;
3570	z1 -= tmp3;
3571	z1 -= tmp3;
3572	z1 = MULTIPLY(z1, FIX(`0.353553391`)); / (c2+c6-c4)/2 /
3573	z2 = MULTIPLY(tmp0 - tmp2, FIX(`0.920609002`)); / (c2+c4-c6)/2 /
3574	z3 = MULTIPLY(tmp1 - tmp2, FIX(`0.314692123`)); / c6 /
3575	dataptr[`2`] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
3576	z1 -= z2;
3577	z2 = MULTIPLY(tmp0 - tmp1, FIX(`0.881747734`)); / c4 /
3578	dataptr[`4`] = (DCTELEM)
3579	DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(`0.707106781`)), / c2+c6-c4 /
3580	CONST_BITS-PASS1_BITS);
3581	dataptr[`6`] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
3582
3583	/ Odd part /
3584
3585	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(`0.935414347`)); / (c3+c1-c5)/2 /
3586	tmp2 = MULTIPLY(tmp10 - tmp11, FIX(`0.170262339`)); / (c3+c5-c1)/2 /
3587	tmp0 = tmp1 - tmp2;
3588	tmp1 += tmp2;
3589	tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(`1.378756276`)); / -c1 /
3590	tmp1 += tmp2;
3591	tmp3 = MULTIPLY(tmp10 + tmp12, FIX(`0.613604268`)); / c5 /
3592	tmp0 += tmp3;
3593	tmp2 += tmp3 + MULTIPLY(tmp12, FIX(`1.870828693`)); / c3+c1-c5 /
3594
3595	dataptr[`1`] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
3596	dataptr[`3`] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
3597	dataptr[`5`] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
3598
3599	ctr++;
3600
3601	if (ctr != DCTSIZE) {
3602	if (ctr == `14`)
3603	break; / Done. /
3604	dataptr += DCTSIZE; / advance pointer to next row /
3605	} else
3606	dataptr = workspace; / switch pointer to extended workspace /
3607	}
3608
3609	/ Pass 2: process columns.*
3610	* We remove the PASS1_BITS scaling, but leave the results scaled up
3611	* by an overall factor of 8.
3612	* We must also scale the output by (8/7)*(8/14) = 32/49, which we
3613	* fold into the constant multipliers:
3614	* 14-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/28) 32/49.
3615	*/
3616
3617	dataptr = data;
3618	wsptr = workspace;
3619	for (ctr = `0`; ctr < `7`; ctr++) {
3620	/ Even part /
3621
3622	tmp0 = dataptr[DCTSIZE`0`] + wsptr[DCTSIZE`5`];
3623	tmp1 = dataptr[DCTSIZE`1`] + wsptr[DCTSIZE`4`];
3624	tmp2 = dataptr[DCTSIZE`2`] + wsptr[DCTSIZE`3`];
3625	tmp13 = dataptr[DCTSIZE`3`] + wsptr[DCTSIZE`2`];
3626	tmp4 = dataptr[DCTSIZE`4`] + wsptr[DCTSIZE`1`];
3627	tmp5 = dataptr[DCTSIZE`5`] + wsptr[DCTSIZE`0`];
3628	tmp6 = dataptr[DCTSIZE`6`] + dataptr[DCTSIZE`7`];
3629
3630	tmp10 = tmp0 + tmp6;
3631	tmp14 = tmp0 - tmp6;
3632	tmp11 = tmp1 + tmp5;
3633	tmp15 = tmp1 - tmp5;
3634	tmp12 = tmp2 + tmp4;
3635	tmp16 = tmp2 - tmp4;
3636
3637	tmp0 = dataptr[DCTSIZE`0`] - wsptr[DCTSIZE`5`];
3638	tmp1 = dataptr[DCTSIZE`1`] - wsptr[DCTSIZE`4`];
3639	tmp2 = dataptr[DCTSIZE`2`] - wsptr[DCTSIZE`3`];
3640	tmp3 = dataptr[DCTSIZE`3`] - wsptr[DCTSIZE`2`];
3641	tmp4 = dataptr[DCTSIZE`4`] - wsptr[DCTSIZE`1`];
3642	tmp5 = dataptr[DCTSIZE`5`] - wsptr[DCTSIZE`0`];
3643	tmp6 = dataptr[DCTSIZE`6`] - dataptr[DCTSIZE`7`];
3644
3645	dataptr[DCTSIZE*`0`] = (DCTELEM)
3646	DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
3647	FIX(`0.653061224`)), / 32/49 /
3648	CONST_BITS+PASS1_BITS);
3649	tmp13 += tmp13;
3650	dataptr[DCTSIZE*`4`] = (DCTELEM)
3651	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(`0.832106052`)) + / c4 /
3652	MULTIPLY(tmp11 - tmp13, FIX(`0.205513223`)) - / c12 /
3653	MULTIPLY(tmp12 - tmp13, FIX(`0.575835255`)), / c8 /
3654	CONST_BITS+PASS1_BITS);
3655
3656	tmp10 = MULTIPLY(tmp14 + tmp15, FIX(`0.722074570`)); / c6 /
3657
3658	dataptr[DCTSIZE*`2`] = (DCTELEM)
3659	DESCALE(tmp10 + MULTIPLY(tmp14, FIX(`0.178337691`)) / c2-c6 /
3660	+ MULTIPLY(tmp16, FIX(`0.400721155`)), / c10 /
3661	CONST_BITS+PASS1_BITS);
3662	dataptr[DCTSIZE*`6`] = (DCTELEM)
3663	DESCALE(tmp10 - MULTIPLY(tmp15, FIX(`1.122795725`)) / c6+c10 /
3664	- MULTIPLY(tmp16, FIX(`0.900412262`)), / c2 /
3665	CONST_BITS+PASS1_BITS);
3666
3667	/ Odd part /
3668
3669	tmp10 = tmp1 + tmp2;
3670	tmp11 = tmp5 - tmp4;
3671	dataptr[DCTSIZE*`7`] = (DCTELEM)
3672	DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
3673	FIX(`0.653061224`)), / 32/49 /
3674	CONST_BITS+PASS1_BITS);
3675	tmp3 = MULTIPLY(tmp3 , FIX(`0.653061224`)); / 32/49 /
3676	tmp10 = MULTIPLY(tmp10, - FIX(`0.103406812`)); / -c13 /
3677	tmp11 = MULTIPLY(tmp11, FIX(`0.917760839`)); / c1 /
3678	tmp10 += tmp11 - tmp3;
3679	tmp11 = MULTIPLY(tmp0 + tmp2, FIX(`0.782007410`)) + / c5 /
3680	MULTIPLY(tmp4 + tmp6, FIX(`0.491367823`)); / c9 /
3681	dataptr[DCTSIZE*`5`] = (DCTELEM)
3682	DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(`1.550341076`)) / c3+c5-c13 /
3683	+ MULTIPLY(tmp4, FIX(`0.731428202`)), / c1+c11-c9 /
3684	CONST_BITS+PASS1_BITS);
3685	tmp12 = MULTIPLY(tmp0 + tmp1, FIX(`0.871740478`)) + / c3 /
3686	MULTIPLY(tmp5 - tmp6, FIX(`0.305035186`)); / c11 /
3687	dataptr[DCTSIZE*`3`] = (DCTELEM)
3688	DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(`0.276965844`)) / c3-c9-c13 /
3689	- MULTIPLY(tmp5, FIX(`2.004803435`)), / c1+c5+c11 /
3690	CONST_BITS+PASS1_BITS);
3691	dataptr[DCTSIZE*`1`] = (DCTELEM)
3692	DESCALE(tmp11 + tmp12 + tmp3
3693	- MULTIPLY(tmp0, FIX(`0.735987049`)) / c3+c5-c1 /
3694	- MULTIPLY(tmp6, FIX(`0.082925825`)), / c9-c11-c13 /
3695	CONST_BITS+PASS1_BITS);
3696
3697	dataptr++; / advance pointer to next column /
3698	wsptr++; / advance pointer to next column /
3699	}
3700	}
3701
3702
3703	/*
3704	* Perform the forward DCT on a 6x12 sample block.
3705	*
3706	* 6-point FDCT in pass 1 (rows), 12-point in pass 2 (columns).
3707	*/
3708
3709	GLOBAL(void)
3710	jpeg_fdct_6x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3711	{
3712	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3713	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
3714	DCTELEM workspace[`8`*`4`];
3715	DCTELEM *dataptr;
3716	DCTELEM *wsptr;
3717	JSAMPROW elemptr;
3718	int ctr;
3719	SHIFT_TEMPS
3720
3721	/ Pre-zero output coefficient block. /
3722	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3723
3724	/ Pass 1: process rows. /
3725	/ Note results are scaled up by sqrt(8) compared to a true DCT; /
3726	/ furthermore, we scale the results by 2*PASS1_BITS. /*
3727	/ 6-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/12). /*
3728
3729	dataptr = data;
3730	ctr = `0`;
3731	for (;;) {
3732	elemptr = sample_data[ctr] + start_col;
3733
3734	/ Even part /
3735
3736	tmp0 = GETJSAMPLE(elemptr[`0`]) + GETJSAMPLE(elemptr[`5`]);
3737	tmp11 = GETJSAMPLE(elemptr[`1`]) + GETJSAMPLE(elemptr[`4`]);
3738	tmp2 = GETJSAMPLE(elemptr[`2`]) + GETJSAMPLE(elemptr[`3`]);
3739
3740	tmp10 = tmp0 + tmp2;
3741	tmp12 = tmp0 - tmp2;
3742
3743	tmp0 = GETJSAMPLE(elemptr[`0`]) - GETJSAMPLE(elemptr[`5`]);
3744	tmp1 = GETJSAMPLE(elemptr[`1`]) - GETJSAMPLE(elemptr[`4`]);
3745	tmp2 = GETJSAMPLE(elemptr[`2`]) - GETJSAMPLE(elemptr[`3`]);
3746
3747	/ Apply unsigned->signed conversion /
3748	dataptr[`0`] = (DCTELEM)
3749	((tmp10 + tmp11 - `6` * CENTERJSAMPLE) << PASS1_BITS);
3750	dataptr[`2`] = (DCTELEM)
3751	DESCALE(MULTIPLY(tmp12, FIX(`1.224744871`)), / c2 /
3752	CONST_BITS-PASS1_BITS);
3753	dataptr[`4`] = (DCTELEM)
3754	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(`0.707106781`)), / c4 /
3755	CONST_BITS-PASS1_BITS);
3756
3757	/ Odd part /
3758
3759	tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(`0.366025404`)), / c5 /
3760	CONST_BITS-PASS1_BITS);
3761
3762	dataptr[`1`] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
3763	dataptr[`3`] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
3764	dataptr[`5`] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
3765
3766	ctr++;
3767
3768	if (ctr != DCTSIZE) {
3769	if (ctr == `12`)
3770	break; / Done. /
3771	dataptr += DCTSIZE; / advance pointer to next row /
3772	} else
3773	dataptr = workspace; / switch pointer to extended workspace /
3774	}
3775
3776	/ Pass 2: process columns.*
3777	* We remove the PASS1_BITS scaling, but leave the results scaled up
3778	* by an overall factor of 8.
3779	* We must also scale the output by (8/6)*(8/12) = 8/9, which we
3780	* fold into the constant multipliers:
3781	* 12-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/24) 8/9.
3782	*/
3783
3784	dataptr = data;
3785	wsptr = workspace;
3786	for (ctr = `0`; ctr < `6`; ctr++) {
3787	/ Even part /
3788
3789	tmp0 = dataptr[DCTSIZE`0`] + wsptr[DCTSIZE`3`];
3790	tmp1 = dataptr[DCTSIZE`1`] + wsptr[DCTSIZE`2`];
3791	tmp2 = dataptr[DCTSIZE`2`] + wsptr[DCTSIZE`1`];
3792	tmp3 = dataptr[DCTSIZE`3`] + wsptr[DCTSIZE`0`];
3793	tmp4 = dataptr[DCTSIZE`4`] + dataptr[DCTSIZE`7`];
3794	tmp5 = dataptr[DCTSIZE`5`] + dataptr[DCTSIZE`6`];
3795
3796	tmp10 = tmp0 + tmp5;
3797	tmp13 = tmp0 - tmp5;
3798	tmp11 = tmp1 + tmp4;
3799	tmp14 = tmp1 - tmp4;
3800	tmp12 = tmp2 + tmp3;
3801	tmp15 = tmp2 - tmp3;
3802
3803	tmp0 = dataptr[DCTSIZE`0`] - wsptr[DCTSIZE`3`];
3804	tmp1 = dataptr[DCTSIZE`1`] - wsptr[DCTSIZE`2`];
3805	tmp2 = dataptr[DCTSIZE`2`] - wsptr[DCTSIZE`1`];
3806	tmp3 = dataptr[DCTSIZE`3`] - wsptr[DCTSIZE`0`];
3807	tmp4 = dataptr[DCTSIZE`4`] - dataptr[DCTSIZE`7`];
3808	tmp5 = dataptr[DCTSIZE`5`] - dataptr[DCTSIZE`6`];
3809
3810	dataptr[DCTSIZE*`0`] = (DCTELEM)
3811	DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(`0.888888889`)), / 8/9 /
3812	CONST_BITS+PASS1_BITS);
3813	dataptr[DCTSIZE*`6`] = (DCTELEM)
3814	DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(`0.888888889`)), / 8/9 /
3815	CONST_BITS+PASS1_BITS);
3816	dataptr[DCTSIZE*`4`] = (DCTELEM)
3817	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(`1.088662108`)), / c4 /
3818	CONST_BITS+PASS1_BITS);
3819	dataptr[DCTSIZE*`2`] = (DCTELEM)
3820	DESCALE(MULTIPLY(tmp14 - tmp15, FIX(`0.888888889`)) + / 8/9 /
3821	MULTIPLY(tmp13 + tmp15, FIX(`1.214244803`)), / c2 /
3822	CONST_BITS+PASS1_BITS);
3823
3824	/ Odd part /
3825
3826	tmp10 = MULTIPLY(tmp1 + tmp4, FIX(`0.481063200`)); / c9 /
3827	tmp14 = tmp10 + MULTIPLY(tmp1, FIX(`0.680326102`)); / c3-c9 /
3828	tmp15 = tmp10 - MULTIPLY(tmp4, FIX(`1.642452502`)); / c3+c9 /
3829	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(`0.997307603`)); / c5 /
3830	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(`0.765261039`)); / c7 /
3831	tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(`0.516244403`)) / c5+c7-c1 /
3832	+ MULTIPLY(tmp5, FIX(`0.164081699`)); / c11 /
3833	tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(`0.164081699`)); / -c11 /
3834	tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(`2.079550144`)) / c1+c5-c11 /
3835	+ MULTIPLY(tmp5, FIX(`0.765261039`)); / c7 /
3836	tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(`0.645144899`)) / c1+c11-c7 /
3837	- MULTIPLY(tmp5, FIX(`0.997307603`)); / c5 /
3838	tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(`1.161389302`)) / c3 /
3839	- MULTIPLY(tmp2 + tmp5, FIX(`0.481063200`)); / c9 /
3840
3841	dataptr[DCTSIZE*`1`] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS);
3842	dataptr[DCTSIZE*`3`] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS);
3843	dataptr[DCTSIZE*`5`] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS);
3844	dataptr[DCTSIZE*`7`] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS);
3845
3846	dataptr++; / advance pointer to next column /
3847	wsptr++; / advance pointer to next column /
3848	}
3849	}
3850
3851
3852	/*
3853	* Perform the forward DCT on a 5x10 sample block.
3854	*
3855	* 5-point FDCT in pass 1 (rows), 10-point in pass 2 (columns).
3856	*/
3857
3858	GLOBAL(void)
3859	jpeg_fdct_5x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3860	{
3861	INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
3862	INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
3863	DCTELEM workspace[`8`*`2`];
3864	DCTELEM *dataptr;
3865	DCTELEM *wsptr;
3866	JSAMPROW elemptr;
3867	int ctr;
3868	SHIFT_TEMPS
3869
3870	/ Pre-zero output coefficient block. /
3871	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3872
3873	/ Pass 1: process rows. /
3874	/ Note results are scaled up by sqrt(8) compared to a true DCT; /
3875	/ furthermore, we scale the results by 2*PASS1_BITS. /*
3876	/ 5-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/10). /*
3877
3878	dataptr = data;
3879	ctr = `0`;
3880	for (;;) {
3881	elemptr = sample_data[ctr] + start_col;
3882
3883	/ Even part /
3884
3885	tmp0 = GETJSAMPLE(elemptr[`0`]) + GETJSAMPLE(elemptr[`4`]);
3886	tmp1 = GETJSAMPLE(elemptr[`1`]) + GETJSAMPLE(elemptr[`3`]);
3887	tmp2 = GETJSAMPLE(elemptr[`2`]);
3888
3889	tmp10 = tmp0 + tmp1;
3890	tmp11 = tmp0 - tmp1;
3891
3892	tmp0 = GETJSAMPLE(elemptr[`0`]) - GETJSAMPLE(elemptr[`4`]);
3893	tmp1 = GETJSAMPLE(elemptr[`1`]) - GETJSAMPLE(elemptr[`3`]);
3894
3895	/ Apply unsigned->signed conversion /
3896	dataptr[`0`] = (DCTELEM)
3897	((tmp10 + tmp2 - `5` * CENTERJSAMPLE) << PASS1_BITS);
3898	tmp11 = MULTIPLY(tmp11, FIX(`0.790569415`)); / (c2+c4)/2 /
3899	tmp10 -= tmp2 << `2`;
3900	tmp10 = MULTIPLY(tmp10, FIX(`0.353553391`)); / (c2-c4)/2 /
3901	dataptr[`2`] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS);
3902	dataptr[`4`] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS);
3903
3904	/ Odd part /
3905
3906	tmp10 = MULTIPLY(tmp0 + tmp1, FIX(`0.831253876`)); / c3 /
3907
3908	dataptr[`1`] = (DCTELEM)
3909	DESCALE(tmp10 + MULTIPLY(tmp0, FIX(`0.513743148`)), / c1-c3 /
3910	CONST_BITS-PASS1_BITS);
3911	dataptr[`3`] = (DCTELEM)
3912	DESCALE(tmp10 - MULTIPLY(tmp1, FIX(`2.176250899`)), / c1+c3 /
3913	CONST_BITS-PASS1_BITS);
3914
3915	ctr++;
3916
3917	if (ctr != DCTSIZE) {
3918	if (ctr == `10`)
3919	break; / Done. /
3920	dataptr += DCTSIZE; / advance pointer to next row /
3921	} else
3922	dataptr = workspace; / switch pointer to extended workspace /
3923	}
3924
3925	/ Pass 2: process columns.*
3926	* We remove the PASS1_BITS scaling, but leave the results scaled up
3927	* by an overall factor of 8.
3928	* We must also scale the output by (8/5)*(8/10) = 32/25, which we
3929	* fold into the constant multipliers:
3930	* 10-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/20) 32/25.
3931	*/
3932
3933	dataptr = data;
3934	wsptr = workspace;
3935	for (ctr = `0`; ctr < `5`; ctr++) {
3936	/ Even part /
3937
3938	tmp0 = dataptr[DCTSIZE`0`] + wsptr[DCTSIZE`1`];
3939	tmp1 = dataptr[DCTSIZE`1`] + wsptr[DCTSIZE`0`];
3940	tmp12 = dataptr[DCTSIZE`2`] + dataptr[DCTSIZE`7`];
3941	tmp3 = dataptr[DCTSIZE`3`] + dataptr[DCTSIZE`6`];
3942	tmp4 = dataptr[DCTSIZE`4`] + dataptr[DCTSIZE`5`];
3943
3944	tmp10 = tmp0 + tmp4;
3945	tmp13 = tmp0 - tmp4;
3946	tmp11 = tmp1 + tmp3;
3947	tmp14 = tmp1 - tmp3;
3948
3949	tmp0 = dataptr[DCTSIZE`0`] - wsptr[DCTSIZE`1`];
3950	tmp1 = dataptr[DCTSIZE`1`] - wsptr[DCTSIZE`0`];
3951	tmp2 = dataptr[DCTSIZE`2`] - dataptr[DCTSIZE`7`];
3952	tmp3 = dataptr[DCTSIZE`3`] - dataptr[DCTSIZE`6`];
3953	tmp4 = dataptr[DCTSIZE`4`] - dataptr[DCTSIZE`5`];
3954
3955	dataptr[DCTSIZE*`0`] = (DCTELEM)
3956	DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(`1.28`)), / 32/25 /
3957	CONST_BITS+PASS1_BITS);
3958	tmp12 += tmp12;
3959	dataptr[DCTSIZE*`4`] = (DCTELEM)
3960	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(`1.464477191`)) - / c4 /
3961	MULTIPLY(tmp11 - tmp12, FIX(`0.559380511`)), / c8 /
3962	CONST_BITS+PASS1_BITS);
3963	tmp10 = MULTIPLY(tmp13 + tmp14, FIX(`1.064004961`)); / c6 /
3964	dataptr[DCTSIZE*`2`] = (DCTELEM)
3965	DESCALE(tmp10 + MULTIPLY(tmp13, FIX(`0.657591230`)), / c2-c6 /
3966	CONST_BITS+PASS1_BITS);
3967	dataptr[DCTSIZE*`6`] = (DCTELEM)
3968	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(`2.785601151`)), / c2+c6 /
3969	CONST_BITS+PASS1_BITS);
3970
3971	/ Odd part /
3972
3973	tmp10 = tmp0 + tmp4;
3974	tmp11 = tmp1 - tmp3;
3975	dataptr[DCTSIZE*`5`] = (DCTELEM)
3976	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(`1.28`)), / 32/25 /
3977	CONST_BITS+PASS1_BITS);
3978	tmp2 = MULTIPLY(tmp2, FIX(`1.28`)); / 32/25 /
3979	dataptr[DCTSIZE*`1`] = (DCTELEM)
3980	DESCALE(MULTIPLY(tmp0, FIX(`1.787906876`)) + / c1 /
3981	MULTIPLY(tmp1, FIX(`1.612894094`)) + tmp2 + / c3 /
3982	MULTIPLY(tmp3, FIX(`0.821810588`)) + / c7 /
3983	MULTIPLY(tmp4, FIX(`0.283176630`)), / c9 /
3984	CONST_BITS+PASS1_BITS);
3985	tmp12 = MULTIPLY(tmp0 - tmp4, FIX(`1.217352341`)) - / (c3+c7)/2 /
3986	MULTIPLY(tmp1 + tmp3, FIX(`0.752365123`)); / (c1-c9)/2 /
3987	tmp13 = MULTIPLY(tmp10 + tmp11, FIX(`0.395541753`)) + / (c3-c7)/2 /
3988	MULTIPLY(tmp11, FIX(`0.64`)) - tmp2; / 16/25 /
3989	dataptr[DCTSIZE*`3`] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+PASS1_BITS);
3990	dataptr[DCTSIZE*`7`] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+PASS1_BITS);
3991
3992	dataptr++; / advance pointer to next column /
3993	wsptr++; / advance pointer to next column /
3994	}
3995	}
3996
3997
3998	/*
3999	* Perform the forward DCT on a 4x8 sample block.
4000	*
4001	* 4-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
4002	*/
4003
4004	GLOBAL(void)
4005	jpeg_fdct_4x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4006	{
4007	INT32 tmp0, tmp1, tmp2, tmp3;
4008	INT32 tmp10, tmp11, tmp12, tmp13;
4009	INT32 z1;
4010	DCTELEM *dataptr;
4011	JSAMPROW elemptr;
4012	int ctr;
4013	SHIFT_TEMPS
4014
4015	/ Pre-zero output coefficient block. /
4016	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4017
4018	/ Pass 1: process rows. /
4019	/ Note results are scaled up by sqrt(8) compared to a true DCT; /
4020	/ furthermore, we scale the results by 2*PASS1_BITS. /*
4021	/ We must also scale the output by 8/4 = 2, which we add here. /
4022	/ 4-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/16). /*
4023
4024	dataptr = data;
4025	for (ctr = `0`; ctr < DCTSIZE; ctr++) {
4026	elemptr = sample_data[ctr] + start_col;
4027
4028	/ Even part /
4029
4030	tmp0 = GETJSAMPLE(elemptr[`0`]) + GETJSAMPLE(elemptr[`3`]);
4031	tmp1 = GETJSAMPLE(elemptr[`1`]) + GETJSAMPLE(elemptr[`2`]);
4032
4033	tmp10 = GETJSAMPLE(elemptr[`0`]) - GETJSAMPLE(elemptr[`3`]);
4034	tmp11 = GETJSAMPLE(elemptr[`1`]) - GETJSAMPLE(elemptr[`2`]);
4035
4036	/ Apply unsigned->signed conversion /
4037	dataptr[`0`] = (DCTELEM)
4038	((tmp0 + tmp1 - `4` * CENTERJSAMPLE) << (PASS1_BITS+`1`));
4039	dataptr[`2`] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+`1`));
4040
4041	/ Odd part /
4042
4043	tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); / c6 /
4044	/ Add fudge factor here for final descale. /
4045	tmp0 += ONE << (CONST_BITS-PASS1_BITS-`2`);
4046
4047	dataptr[`1`] = (DCTELEM)
4048	RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), / c2-c6 /
4049	CONST_BITS-PASS1_BITS-`1`);
4050	dataptr[`3`] = (DCTELEM)
4051	RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), / c2+c6 /
4052	CONST_BITS-PASS1_BITS-`1`);
4053
4054	dataptr += DCTSIZE; / advance pointer to next row /
4055	}
4056
4057	/ Pass 2: process columns.*
4058	* We remove the PASS1_BITS scaling, but leave the results scaled up
4059	* by an overall factor of 8.
4060	*/
4061
4062	dataptr = data;
4063	for (ctr = `0`; ctr < `4`; ctr++) {
4064	/ Even part per LL&M figure 1 --- note that published figure is faulty;*
4065	* rotator "sqrt(2)c1" should be "sqrt(2)c6".
4066	*/
4067
4068	tmp0 = dataptr[DCTSIZE`0`] + dataptr[DCTSIZE`7`];
4069	tmp1 = dataptr[DCTSIZE`1`] + dataptr[DCTSIZE`6`];
4070	tmp2 = dataptr[DCTSIZE`2`] + dataptr[DCTSIZE`5`];
4071	tmp3 = dataptr[DCTSIZE`3`] + dataptr[DCTSIZE`4`];
4072
4073	/ Add fudge factor here for final descale. /
4074	tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-`1`));
4075	tmp12 = tmp0 - tmp3;
4076	tmp11 = tmp1 + tmp2;
4077	tmp13 = tmp1 - tmp2;
4078
4079	tmp0 = dataptr[DCTSIZE`0`] - dataptr[DCTSIZE`7`];
4080	tmp1 = dataptr[DCTSIZE`1`] - dataptr[DCTSIZE`6`];
4081	tmp2 = dataptr[DCTSIZE`2`] - dataptr[DCTSIZE`5`];
4082	tmp3 = dataptr[DCTSIZE`3`] - dataptr[DCTSIZE`4`];
4083
4084	dataptr[DCTSIZE*`0`] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
4085	dataptr[DCTSIZE*`4`] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
4086
4087	z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
4088	/ Add fudge factor here for final descale. /
4089	z1 += ONE << (CONST_BITS+PASS1_BITS-`1`);
4090	dataptr[DCTSIZE*`2`] = (DCTELEM)
4091	RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS+PASS1_BITS);
4092	dataptr[DCTSIZE*`6`] = (DCTELEM)
4093	RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), CONST_BITS+PASS1_BITS);
4094
4095	/ Odd part per figure 8 --- note paper omits factor of sqrt(2).*
4096	* 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
4097	* i0..i3 in the paper are tmp0..tmp3 here.
4098	*/
4099
4100	tmp10 = tmp0 + tmp3;
4101	tmp11 = tmp1 + tmp2;
4102	tmp12 = tmp0 + tmp2;
4103	tmp13 = tmp1 + tmp3;
4104	z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); / c3 /
4105	/ Add fudge factor here for final descale. /
4106	z1 += ONE << (CONST_BITS+PASS1_BITS-`1`);
4107
4108	tmp0 = MULTIPLY(tmp0, FIX_1_501321110); / c1+c3-c5-c7 /
4109	tmp1 = MULTIPLY(tmp1, FIX_3_072711026); / c1+c3+c5-c7 /
4110	tmp2 = MULTIPLY(tmp2, FIX_2_053119869); / c1+c3-c5+c7 /
4111	tmp3 = MULTIPLY(tmp3, FIX_0_298631336); / -c1+c3+c5-c7 /
4112	tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); / c7-c3 /
4113	tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); / -c1-c3 /
4114	tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); / c5-c3 /
4115	tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); / -c3-c5 /
4116
4117	tmp12 += z1;
4118	tmp13 += z1;
4119
4120	dataptr[DCTSIZE*`1`] = (DCTELEM)
4121	RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS+PASS1_BITS);
4122	dataptr[DCTSIZE*`3`] = (DCTELEM)
4123	RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS+PASS1_BITS);
4124	dataptr[DCTSIZE*`5`] = (DCTELEM)
4125	RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS+PASS1_BITS);
4126	dataptr[DCTSIZE*`7`] = (DCTELEM)
4127	RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS+PASS1_BITS);
4128
4129	dataptr++; / advance pointer to next column /
4130	}
4131	}
4132
4133
4134	/*
4135	* Perform the forward DCT on a 3x6 sample block.
4136	*
4137	* 3-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
4138	*/
4139
4140	GLOBAL(void)
4141	jpeg_fdct_3x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4142	{
4143	INT32 tmp0, tmp1, tmp2;
4144	INT32 tmp10, tmp11, tmp12;
4145	DCTELEM *dataptr;
4146	JSAMPROW elemptr;
4147	int ctr;
4148	SHIFT_TEMPS
4149
4150	/ Pre-zero output coefficient block. /
4151	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4152
4153	/ Pass 1: process rows. /
4154	/ Note results are scaled up by sqrt(8) compared to a true DCT; /
4155	/ furthermore, we scale the results by 2*PASS1_BITS. /*
4156	/ We scale the results further by 2 as part of output adaption /
4157	/ scaling for different DCT size. /
4158	/ 3-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/6). /*
4159
4160	dataptr = data;
4161	for (ctr = `0`; ctr < `6`; ctr++) {
4162	elemptr = sample_data[ctr] + start_col;
4163
4164	/ Even part /
4165
4166	tmp0 = GETJSAMPLE(elemptr[`0`]) + GETJSAMPLE(elemptr[`2`]);
4167	tmp1 = GETJSAMPLE(elemptr[`1`]);
4168
4169	tmp2 = GETJSAMPLE(elemptr[`0`]) - GETJSAMPLE(elemptr[`2`]);
4170
4171	/ Apply unsigned->signed conversion /
4172	dataptr[`0`] = (DCTELEM)
4173	((tmp0 + tmp1 - `3` * CENTERJSAMPLE) << (PASS1_BITS+`1`));
4174	dataptr[`2`] = (DCTELEM)
4175	DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(`0.707106781`)), / c2 /
4176	CONST_BITS-PASS1_BITS-`1`);
4177
4178	/ Odd part /
4179
4180	dataptr[`1`] = (DCTELEM)
4181	DESCALE(MULTIPLY(tmp2, FIX(`1.224744871`)), / c1 /
4182	CONST_BITS-PASS1_BITS-`1`);
4183
4184	dataptr += DCTSIZE; / advance pointer to next row /
4185	}
4186
4187	/ Pass 2: process columns.*
4188	* We remove the PASS1_BITS scaling, but leave the results scaled up
4189	* by an overall factor of 8.
4190	* We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
4191	* fold into the constant multipliers (other part was done in pass 1):
4192	* 6-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/12) 16/9.
4193	*/
4194
4195	dataptr = data;
4196	for (ctr = `0`; ctr < `3`; ctr++) {
4197	/ Even part /
4198
4199	tmp0 = dataptr[DCTSIZE`0`] + dataptr[DCTSIZE`5`];
4200	tmp11 = dataptr[DCTSIZE`1`] + dataptr[DCTSIZE`4`];
4201	tmp2 = dataptr[DCTSIZE`2`] + dataptr[DCTSIZE`3`];
4202
4203	tmp10 = tmp0 + tmp2;
4204	tmp12 = tmp0 - tmp2;
4205
4206	tmp0 = dataptr[DCTSIZE`0`] - dataptr[DCTSIZE`5`];
4207	tmp1 = dataptr[DCTSIZE`1`] - dataptr[DCTSIZE`4`];
4208	tmp2 = dataptr[DCTSIZE`2`] - dataptr[DCTSIZE`3`];
4209
4210	dataptr[DCTSIZE*`0`] = (DCTELEM)
4211	DESCALE(MULTIPLY(tmp10 + tmp11, FIX(`1.777777778`)), / 16/9 /
4212	CONST_BITS+PASS1_BITS);
4213	dataptr[DCTSIZE*`2`] = (DCTELEM)
4214	DESCALE(MULTIPLY(tmp12, FIX(`2.177324216`)), / c2 /
4215	CONST_BITS+PASS1_BITS);
4216	dataptr[DCTSIZE*`4`] = (DCTELEM)
4217	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(`1.257078722`)), / c4 /
4218	CONST_BITS+PASS1_BITS);
4219
4220	/ Odd part /
4221
4222	tmp10 = MULTIPLY(tmp0 + tmp2, FIX(`0.650711829`)); / c5 /
4223
4224	dataptr[DCTSIZE*`1`] = (DCTELEM)
4225	DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(`1.777777778`)), / 16/9 /
4226	CONST_BITS+PASS1_BITS);
4227	dataptr[DCTSIZE*`3`] = (DCTELEM)
4228	DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(`1.777777778`)), / 16/9 /
4229	CONST_BITS+PASS1_BITS);
4230	dataptr[DCTSIZE*`5`] = (DCTELEM)
4231	DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(`1.777777778`)), / 16/9 /
4232	CONST_BITS+PASS1_BITS);
4233
4234	dataptr++; / advance pointer to next column /
4235	}
4236	}
4237
4238
4239	/*
4240	* Perform the forward DCT on a 2x4 sample block.
4241	*
4242	* 2-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
4243	*/
4244
4245	GLOBAL(void)
4246	jpeg_fdct_2x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4247	{
4248	INT32 tmp0, tmp1;
4249	INT32 tmp10, tmp11;
4250	DCTELEM *dataptr;
4251	JSAMPROW elemptr;
4252	int ctr;
4253	SHIFT_TEMPS
4254
4255	/ Pre-zero output coefficient block. /
4256	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4257
4258	/ Pass 1: process rows. /
4259	/ Note results are scaled up by sqrt(8) compared to a true DCT. /
4260	/ We must also scale the output by (8/2)(8/4) = 23, which we add here. /*
4261
4262	dataptr = data;
4263	for (ctr = `0`; ctr < `4`; ctr++) {
4264	elemptr = sample_data[ctr] + start_col;
4265
4266	/ Even part /
4267
4268	tmp0 = GETJSAMPLE(elemptr[`0`]);
4269	tmp1 = GETJSAMPLE(elemptr[`1`]);
4270
4271	/ Apply unsigned->signed conversion /
4272	dataptr[`0`] = (DCTELEM) ((tmp0 + tmp1 - `2` * CENTERJSAMPLE) << `3`);
4273
4274	/ Odd part /
4275
4276	dataptr[`1`] = (DCTELEM) ((tmp0 - tmp1) << `3`);
4277
4278	dataptr += DCTSIZE; / advance pointer to next row /
4279	}
4280
4281	/ Pass 2: process columns.*
4282	* We leave the results scaled up by an overall factor of 8.
4283	* 4-point FDCT kernel,
4284	* cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
4285	*/
4286
4287	dataptr = data;
4288	for (ctr = `0`; ctr < `2`; ctr++) {
4289	/ Even part /
4290
4291	tmp0 = dataptr[DCTSIZE`0`] + dataptr[DCTSIZE`3`];
4292	tmp1 = dataptr[DCTSIZE`1`] + dataptr[DCTSIZE`2`];
4293
4294	tmp10 = dataptr[DCTSIZE`0`] - dataptr[DCTSIZE`3`];
4295	tmp11 = dataptr[DCTSIZE`1`] - dataptr[DCTSIZE`2`];
4296
4297	dataptr[DCTSIZE*`0`] = (DCTELEM) (tmp0 + tmp1);
4298	dataptr[DCTSIZE*`2`] = (DCTELEM) (tmp0 - tmp1);
4299
4300	/ Odd part /
4301
4302	tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); / c6 /
4303	/ Add fudge factor here for final descale. /
4304	tmp0 += ONE << (CONST_BITS-`1`);
4305
4306	dataptr[DCTSIZE*`1`] = (DCTELEM)
4307	RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), / c2-c6 /
4308	CONST_BITS);
4309	dataptr[DCTSIZE*`3`] = (DCTELEM)
4310	RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), / c2+c6 /
4311	CONST_BITS);
4312
4313	dataptr++; / advance pointer to next column /
4314	}
4315	}
4316
4317
4318	/*
4319	* Perform the forward DCT on a 1x2 sample block.
4320	*
4321	* 1-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
4322	*/
4323
4324	GLOBAL(void)
4325	jpeg_fdct_1x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4326	{
4327	INT32 tmp0, tmp1;
4328
4329	/ Pre-zero output coefficient block. /
4330	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4331
4332	tmp0 = GETJSAMPLE(sample_data[`0`][start_col]);
4333	tmp1 = GETJSAMPLE(sample_data[`1`][start_col]);
4334
4335	/ We leave the results scaled up by an overall factor of 8.*
4336	* We must also scale the output by (8/1)(8/2) = 2*5.
4337	*/
4338
4339	/ Even part /
4340	/ Apply unsigned->signed conversion /
4341	data[DCTSIZE`0`] = (DCTELEM) ((tmp0 + tmp1 - `2` CENTERJSAMPLE) << `5`);
4342
4343	/ Odd part /
4344	data[DCTSIZE*`1`] = (DCTELEM) ((tmp0 - tmp1) << `5`);
4345	}
4346
4347	#endif /* DCT_SCALING_SUPPORTED */
4348	#endif /* DCT_ISLOW_SUPPORTED */
4349

Browse the source code of MuPDF/thirdparty/libjpeg/jfdctint.c