1/********************************************************************
2 * *
3 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
4 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
5 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
7 * *
8 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
9 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
10 * *
11 ********************************************************************
12
13 function:
14 last mod: $Id$
15
16 ********************************************************************/
17
18#include <string.h>
19#include "internal.h"
20#include "dct.h"
21
22/*Performs an inverse 8 point Type-II DCT transform.
23 The output is scaled by a factor of 2 relative to the orthonormal version of
24 the transform.
25 _y: The buffer to store the result in.
26 Data will be placed in every 8th entry (e.g., in a column of an 8x8
27 block).
28 _x: The input coefficients.
29 The first 8 entries are used (e.g., from a row of an 8x8 block).*/
30static void idct8(ogg_int16_t *_y,const ogg_int16_t _x[8]){
31 ogg_int32_t t[8];
32 ogg_int32_t r;
33 /*Stage 1:*/
34 /*0-1 butterfly.*/
35 t[0]=OC_C4S4*(ogg_int16_t)(_x[0]+_x[4])>>16;
36 t[1]=OC_C4S4*(ogg_int16_t)(_x[0]-_x[4])>>16;
37 /*2-3 rotation by 6pi/16.*/
38 t[2]=(OC_C6S2*_x[2]>>16)-(OC_C2S6*_x[6]>>16);
39 t[3]=(OC_C2S6*_x[2]>>16)+(OC_C6S2*_x[6]>>16);
40 /*4-7 rotation by 7pi/16.*/
41 t[4]=(OC_C7S1*_x[1]>>16)-(OC_C1S7*_x[7]>>16);
42 /*5-6 rotation by 3pi/16.*/
43 t[5]=(OC_C3S5*_x[5]>>16)-(OC_C5S3*_x[3]>>16);
44 t[6]=(OC_C5S3*_x[5]>>16)+(OC_C3S5*_x[3]>>16);
45 t[7]=(OC_C1S7*_x[1]>>16)+(OC_C7S1*_x[7]>>16);
46 /*Stage 2:*/
47 /*4-5 butterfly.*/
48 r=t[4]+t[5];
49 t[5]=OC_C4S4*(ogg_int16_t)(t[4]-t[5])>>16;
50 t[4]=r;
51 /*7-6 butterfly.*/
52 r=t[7]+t[6];
53 t[6]=OC_C4S4*(ogg_int16_t)(t[7]-t[6])>>16;
54 t[7]=r;
55 /*Stage 3:*/
56 /*0-3 butterfly.*/
57 r=t[0]+t[3];
58 t[3]=t[0]-t[3];
59 t[0]=r;
60 /*1-2 butterfly.*/
61 r=t[1]+t[2];
62 t[2]=t[1]-t[2];
63 t[1]=r;
64 /*6-5 butterfly.*/
65 r=t[6]+t[5];
66 t[5]=t[6]-t[5];
67 t[6]=r;
68 /*Stage 4:*/
69 /*0-7 butterfly.*/
70 _y[0<<3]=(ogg_int16_t)(t[0]+t[7]);
71 /*1-6 butterfly.*/
72 _y[1<<3]=(ogg_int16_t)(t[1]+t[6]);
73 /*2-5 butterfly.*/
74 _y[2<<3]=(ogg_int16_t)(t[2]+t[5]);
75 /*3-4 butterfly.*/
76 _y[3<<3]=(ogg_int16_t)(t[3]+t[4]);
77 _y[4<<3]=(ogg_int16_t)(t[3]-t[4]);
78 _y[5<<3]=(ogg_int16_t)(t[2]-t[5]);
79 _y[6<<3]=(ogg_int16_t)(t[1]-t[6]);
80 _y[7<<3]=(ogg_int16_t)(t[0]-t[7]);
81}
82
83/*Performs an inverse 8 point Type-II DCT transform.
84 The output is scaled by a factor of 2 relative to the orthonormal version of
85 the transform.
86 _y: The buffer to store the result in.
87 Data will be placed in every 8th entry (e.g., in a column of an 8x8
88 block).
89 _x: The input coefficients.
90 Only the first 4 entries are used.
91 The other 4 are assumed to be 0.*/
92static void idct8_4(ogg_int16_t *_y,const ogg_int16_t _x[8]){
93 ogg_int32_t t[8];
94 ogg_int32_t r;
95 /*Stage 1:*/
96 t[0]=OC_C4S4*_x[0]>>16;
97 t[2]=OC_C6S2*_x[2]>>16;
98 t[3]=OC_C2S6*_x[2]>>16;
99 t[4]=OC_C7S1*_x[1]>>16;
100 t[5]=-(OC_C5S3*_x[3]>>16);
101 t[6]=OC_C3S5*_x[3]>>16;
102 t[7]=OC_C1S7*_x[1]>>16;
103 /*Stage 2:*/
104 r=t[4]+t[5];
105 t[5]=OC_C4S4*(ogg_int16_t)(t[4]-t[5])>>16;
106 t[4]=r;
107 r=t[7]+t[6];
108 t[6]=OC_C4S4*(ogg_int16_t)(t[7]-t[6])>>16;
109 t[7]=r;
110 /*Stage 3:*/
111 t[1]=t[0]+t[2];
112 t[2]=t[0]-t[2];
113 r=t[0]+t[3];
114 t[3]=t[0]-t[3];
115 t[0]=r;
116 r=t[6]+t[5];
117 t[5]=t[6]-t[5];
118 t[6]=r;
119 /*Stage 4:*/
120 _y[0<<3]=(ogg_int16_t)(t[0]+t[7]);
121 _y[1<<3]=(ogg_int16_t)(t[1]+t[6]);
122 _y[2<<3]=(ogg_int16_t)(t[2]+t[5]);
123 _y[3<<3]=(ogg_int16_t)(t[3]+t[4]);
124 _y[4<<3]=(ogg_int16_t)(t[3]-t[4]);
125 _y[5<<3]=(ogg_int16_t)(t[2]-t[5]);
126 _y[6<<3]=(ogg_int16_t)(t[1]-t[6]);
127 _y[7<<3]=(ogg_int16_t)(t[0]-t[7]);
128}
129
130/*Performs an inverse 8 point Type-II DCT transform.
131 The output is scaled by a factor of 2 relative to the orthonormal version of
132 the transform.
133 _y: The buffer to store the result in.
134 Data will be placed in every 8th entry (e.g., in a column of an 8x8
135 block).
136 _x: The input coefficients.
137 Only the first 3 entries are used.
138 The other 5 are assumed to be 0.*/
139static void idct8_3(ogg_int16_t *_y,const ogg_int16_t _x[8]){
140 ogg_int32_t t[8];
141 ogg_int32_t r;
142 /*Stage 1:*/
143 t[0]=OC_C4S4*_x[0]>>16;
144 t[2]=OC_C6S2*_x[2]>>16;
145 t[3]=OC_C2S6*_x[2]>>16;
146 t[4]=OC_C7S1*_x[1]>>16;
147 t[7]=OC_C1S7*_x[1]>>16;
148 /*Stage 2:*/
149 t[5]=OC_C4S4*t[4]>>16;
150 t[6]=OC_C4S4*t[7]>>16;
151 /*Stage 3:*/
152 t[1]=t[0]+t[2];
153 t[2]=t[0]-t[2];
154 r=t[0]+t[3];
155 t[3]=t[0]-t[3];
156 t[0]=r;
157 r=t[6]+t[5];
158 t[5]=t[6]-t[5];
159 t[6]=r;
160 /*Stage 4:*/
161 _y[0<<3]=(ogg_int16_t)(t[0]+t[7]);
162 _y[1<<3]=(ogg_int16_t)(t[1]+t[6]);
163 _y[2<<3]=(ogg_int16_t)(t[2]+t[5]);
164 _y[3<<3]=(ogg_int16_t)(t[3]+t[4]);
165 _y[4<<3]=(ogg_int16_t)(t[3]-t[4]);
166 _y[5<<3]=(ogg_int16_t)(t[2]-t[5]);
167 _y[6<<3]=(ogg_int16_t)(t[1]-t[6]);
168 _y[7<<3]=(ogg_int16_t)(t[0]-t[7]);
169}
170
171/*Performs an inverse 8 point Type-II DCT transform.
172 The output is scaled by a factor of 2 relative to the orthonormal version of
173 the transform.
174 _y: The buffer to store the result in.
175 Data will be placed in every 8th entry (e.g., in a column of an 8x8
176 block).
177 _x: The input coefficients.
178 Only the first 2 entries are used.
179 The other 6 are assumed to be 0.*/
180static void idct8_2(ogg_int16_t *_y,const ogg_int16_t _x[8]){
181 ogg_int32_t t[8];
182 ogg_int32_t r;
183 /*Stage 1:*/
184 t[0]=OC_C4S4*_x[0]>>16;
185 t[4]=OC_C7S1*_x[1]>>16;
186 t[7]=OC_C1S7*_x[1]>>16;
187 /*Stage 2:*/
188 t[5]=OC_C4S4*t[4]>>16;
189 t[6]=OC_C4S4*t[7]>>16;
190 /*Stage 3:*/
191 r=t[6]+t[5];
192 t[5]=t[6]-t[5];
193 t[6]=r;
194 /*Stage 4:*/
195 _y[0<<3]=(ogg_int16_t)(t[0]+t[7]);
196 _y[1<<3]=(ogg_int16_t)(t[0]+t[6]);
197 _y[2<<3]=(ogg_int16_t)(t[0]+t[5]);
198 _y[3<<3]=(ogg_int16_t)(t[0]+t[4]);
199 _y[4<<3]=(ogg_int16_t)(t[0]-t[4]);
200 _y[5<<3]=(ogg_int16_t)(t[0]-t[5]);
201 _y[6<<3]=(ogg_int16_t)(t[0]-t[6]);
202 _y[7<<3]=(ogg_int16_t)(t[0]-t[7]);
203}
204
205/*Performs an inverse 8 point Type-II DCT transform.
206 The output is scaled by a factor of 2 relative to the orthonormal version of
207 the transform.
208 _y: The buffer to store the result in.
209 Data will be placed in every 8th entry (e.g., in a column of an 8x8
210 block).
211 _x: The input coefficients.
212 Only the first entry is used.
213 The other 7 are assumed to be 0.*/
214static void idct8_1(ogg_int16_t *_y,const ogg_int16_t _x[1]){
215 _y[0<<3]=_y[1<<3]=_y[2<<3]=_y[3<<3]=
216 _y[4<<3]=_y[5<<3]=_y[6<<3]=_y[7<<3]=(ogg_int16_t)(OC_C4S4*_x[0]>>16);
217}
218
219/*Performs an inverse 8x8 Type-II DCT transform.
220 The input is assumed to be scaled by a factor of 4 relative to orthonormal
221 version of the transform.
222 All coefficients but the first 3 in zig-zag scan order are assumed to be 0:
223 x x 0 0 0 0 0 0
224 x 0 0 0 0 0 0 0
225 0 0 0 0 0 0 0 0
226 0 0 0 0 0 0 0 0
227 0 0 0 0 0 0 0 0
228 0 0 0 0 0 0 0 0
229 0 0 0 0 0 0 0 0
230 0 0 0 0 0 0 0 0
231 _y: The buffer to store the result in.
232 This may be the same as _x.
233 _x: The input coefficients.*/
234static void oc_idct8x8_3(ogg_int16_t _y[64],ogg_int16_t _x[64]){
235 ogg_int16_t w[64];
236 int i;
237 /*Transform rows of x into columns of w.*/
238 idct8_2(w,_x);
239 idct8_1(w+1,_x+8);
240 /*Transform rows of w into columns of y.*/
241 for(i=0;i<8;i++)idct8_2(_y+i,w+i*8);
242 /*Adjust for the scale factor.*/
243 for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
244 /*Clear input data for next block.*/
245 _x[0]=_x[1]=_x[8]=0;
246}
247
248/*Performs an inverse 8x8 Type-II DCT transform.
249 The input is assumed to be scaled by a factor of 4 relative to orthonormal
250 version of the transform.
251 All coefficients but the first 10 in zig-zag scan order are assumed to be 0:
252 x x x x 0 0 0 0
253 x x x 0 0 0 0 0
254 x x 0 0 0 0 0 0
255 x 0 0 0 0 0 0 0
256 0 0 0 0 0 0 0 0
257 0 0 0 0 0 0 0 0
258 0 0 0 0 0 0 0 0
259 0 0 0 0 0 0 0 0
260 _y: The buffer to store the result in.
261 This may be the same as _x.
262 _x: The input coefficients.*/
263static void oc_idct8x8_10(ogg_int16_t _y[64],ogg_int16_t _x[64]){
264 ogg_int16_t w[64];
265 int i;
266 /*Transform rows of x into columns of w.*/
267 idct8_4(w,_x);
268 idct8_3(w+1,_x+8);
269 idct8_2(w+2,_x+16);
270 idct8_1(w+3,_x+24);
271 /*Transform rows of w into columns of y.*/
272 for(i=0;i<8;i++)idct8_4(_y+i,w+i*8);
273 /*Adjust for the scale factor.*/
274 for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
275 /*Clear input data for next block.*/
276 _x[0]=_x[1]=_x[2]=_x[3]=_x[8]=_x[9]=_x[10]=_x[16]=_x[17]=_x[24]=0;
277}
278
279/*Performs an inverse 8x8 Type-II DCT transform.
280 The input is assumed to be scaled by a factor of 4 relative to orthonormal
281 version of the transform.
282 _y: The buffer to store the result in.
283 This may be the same as _x.
284 _x: The input coefficients.*/
285static void oc_idct8x8_slow(ogg_int16_t _y[64],ogg_int16_t _x[64]){
286 ogg_int16_t w[64];
287 int i;
288 /*Transform rows of x into columns of w.*/
289 for(i=0;i<8;i++)idct8(w+i,_x+i*8);
290 /*Transform rows of w into columns of y.*/
291 for(i=0;i<8;i++)idct8(_y+i,w+i*8);
292 /*Adjust for the scale factor.*/
293 for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
294 /*Clear input data for next block.*/
295 for(i=0;i<64;i++)_x[i]=0;
296}
297
298/*Performs an inverse 8x8 Type-II DCT transform.
299 The input is assumed to be scaled by a factor of 4 relative to orthonormal
300 version of the transform.*/
301void oc_idct8x8_c(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
302 /*_last_zzi is subtly different from an actual count of the number of
303 coefficients we decoded for this block.
304 It contains the value of zzi BEFORE the final token in the block was
305 decoded.
306 In most cases this is an EOB token (the continuation of an EOB run from a
307 previous block counts), and so this is the same as the coefficient count.
308 However, in the case that the last token was NOT an EOB token, but filled
309 the block up with exactly 64 coefficients, _last_zzi will be less than 64.
310 Provided the last token was not a pure zero run, the minimum value it can
311 be is 46, and so that doesn't affect any of the cases in this routine.
312 However, if the last token WAS a pure zero run of length 63, then _last_zzi
313 will be 1 while the number of coefficients decoded is 64.
314 Thus, we will trigger the following special case, where the real
315 coefficient count would not.
316 Note also that a zero run of length 64 will give _last_zzi a value of 0,
317 but we still process the DC coefficient, which might have a non-zero value
318 due to DC prediction.
319 Although convoluted, this is arguably the correct behavior: it allows us to
320 use a smaller transform when the block ends with a long zero run instead
321 of a normal EOB token.
322 It could be smarter... multiple separate zero runs at the end of a block
323 will fool it, but an encoder that generates these really deserves what it
324 gets.
325 Needless to say we inherited this approach from VP3.*/
326 /*Then perform the iDCT.*/
327 if(_last_zzi<=3)oc_idct8x8_3(_y,_x);
328 else if(_last_zzi<=10)oc_idct8x8_10(_y,_x);
329 else oc_idct8x8_slow(_y,_x);
330}
331