1 | /******************************************************************** |
2 | * * |
3 | * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * |
4 | * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * |
5 | * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * |
6 | * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * |
7 | * * |
8 | * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * |
9 | * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * |
10 | * * |
11 | ******************************************************************** |
12 | |
13 | function: |
14 | last mod: $Id$ |
15 | |
16 | ********************************************************************/ |
17 | |
18 | #include <string.h> |
19 | #include "internal.h" |
20 | #include "dct.h" |
21 | |
22 | /*Performs an inverse 8 point Type-II DCT transform. |
23 | The output is scaled by a factor of 2 relative to the orthonormal version of |
24 | the transform. |
25 | _y: The buffer to store the result in. |
26 | Data will be placed in every 8th entry (e.g., in a column of an 8x8 |
27 | block). |
28 | _x: The input coefficients. |
29 | The first 8 entries are used (e.g., from a row of an 8x8 block).*/ |
30 | static void idct8(ogg_int16_t *_y,const ogg_int16_t _x[8]){ |
31 | ogg_int32_t t[8]; |
32 | ogg_int32_t r; |
33 | /*Stage 1:*/ |
34 | /*0-1 butterfly.*/ |
35 | t[0]=OC_C4S4*(ogg_int16_t)(_x[0]+_x[4])>>16; |
36 | t[1]=OC_C4S4*(ogg_int16_t)(_x[0]-_x[4])>>16; |
37 | /*2-3 rotation by 6pi/16.*/ |
38 | t[2]=(OC_C6S2*_x[2]>>16)-(OC_C2S6*_x[6]>>16); |
39 | t[3]=(OC_C2S6*_x[2]>>16)+(OC_C6S2*_x[6]>>16); |
40 | /*4-7 rotation by 7pi/16.*/ |
41 | t[4]=(OC_C7S1*_x[1]>>16)-(OC_C1S7*_x[7]>>16); |
42 | /*5-6 rotation by 3pi/16.*/ |
43 | t[5]=(OC_C3S5*_x[5]>>16)-(OC_C5S3*_x[3]>>16); |
44 | t[6]=(OC_C5S3*_x[5]>>16)+(OC_C3S5*_x[3]>>16); |
45 | t[7]=(OC_C1S7*_x[1]>>16)+(OC_C7S1*_x[7]>>16); |
46 | /*Stage 2:*/ |
47 | /*4-5 butterfly.*/ |
48 | r=t[4]+t[5]; |
49 | t[5]=OC_C4S4*(ogg_int16_t)(t[4]-t[5])>>16; |
50 | t[4]=r; |
51 | /*7-6 butterfly.*/ |
52 | r=t[7]+t[6]; |
53 | t[6]=OC_C4S4*(ogg_int16_t)(t[7]-t[6])>>16; |
54 | t[7]=r; |
55 | /*Stage 3:*/ |
56 | /*0-3 butterfly.*/ |
57 | r=t[0]+t[3]; |
58 | t[3]=t[0]-t[3]; |
59 | t[0]=r; |
60 | /*1-2 butterfly.*/ |
61 | r=t[1]+t[2]; |
62 | t[2]=t[1]-t[2]; |
63 | t[1]=r; |
64 | /*6-5 butterfly.*/ |
65 | r=t[6]+t[5]; |
66 | t[5]=t[6]-t[5]; |
67 | t[6]=r; |
68 | /*Stage 4:*/ |
69 | /*0-7 butterfly.*/ |
70 | _y[0<<3]=(ogg_int16_t)(t[0]+t[7]); |
71 | /*1-6 butterfly.*/ |
72 | _y[1<<3]=(ogg_int16_t)(t[1]+t[6]); |
73 | /*2-5 butterfly.*/ |
74 | _y[2<<3]=(ogg_int16_t)(t[2]+t[5]); |
75 | /*3-4 butterfly.*/ |
76 | _y[3<<3]=(ogg_int16_t)(t[3]+t[4]); |
77 | _y[4<<3]=(ogg_int16_t)(t[3]-t[4]); |
78 | _y[5<<3]=(ogg_int16_t)(t[2]-t[5]); |
79 | _y[6<<3]=(ogg_int16_t)(t[1]-t[6]); |
80 | _y[7<<3]=(ogg_int16_t)(t[0]-t[7]); |
81 | } |
82 | |
83 | /*Performs an inverse 8 point Type-II DCT transform. |
84 | The output is scaled by a factor of 2 relative to the orthonormal version of |
85 | the transform. |
86 | _y: The buffer to store the result in. |
87 | Data will be placed in every 8th entry (e.g., in a column of an 8x8 |
88 | block). |
89 | _x: The input coefficients. |
90 | Only the first 4 entries are used. |
91 | The other 4 are assumed to be 0.*/ |
92 | static void idct8_4(ogg_int16_t *_y,const ogg_int16_t _x[8]){ |
93 | ogg_int32_t t[8]; |
94 | ogg_int32_t r; |
95 | /*Stage 1:*/ |
96 | t[0]=OC_C4S4*_x[0]>>16; |
97 | t[2]=OC_C6S2*_x[2]>>16; |
98 | t[3]=OC_C2S6*_x[2]>>16; |
99 | t[4]=OC_C7S1*_x[1]>>16; |
100 | t[5]=-(OC_C5S3*_x[3]>>16); |
101 | t[6]=OC_C3S5*_x[3]>>16; |
102 | t[7]=OC_C1S7*_x[1]>>16; |
103 | /*Stage 2:*/ |
104 | r=t[4]+t[5]; |
105 | t[5]=OC_C4S4*(ogg_int16_t)(t[4]-t[5])>>16; |
106 | t[4]=r; |
107 | r=t[7]+t[6]; |
108 | t[6]=OC_C4S4*(ogg_int16_t)(t[7]-t[6])>>16; |
109 | t[7]=r; |
110 | /*Stage 3:*/ |
111 | t[1]=t[0]+t[2]; |
112 | t[2]=t[0]-t[2]; |
113 | r=t[0]+t[3]; |
114 | t[3]=t[0]-t[3]; |
115 | t[0]=r; |
116 | r=t[6]+t[5]; |
117 | t[5]=t[6]-t[5]; |
118 | t[6]=r; |
119 | /*Stage 4:*/ |
120 | _y[0<<3]=(ogg_int16_t)(t[0]+t[7]); |
121 | _y[1<<3]=(ogg_int16_t)(t[1]+t[6]); |
122 | _y[2<<3]=(ogg_int16_t)(t[2]+t[5]); |
123 | _y[3<<3]=(ogg_int16_t)(t[3]+t[4]); |
124 | _y[4<<3]=(ogg_int16_t)(t[3]-t[4]); |
125 | _y[5<<3]=(ogg_int16_t)(t[2]-t[5]); |
126 | _y[6<<3]=(ogg_int16_t)(t[1]-t[6]); |
127 | _y[7<<3]=(ogg_int16_t)(t[0]-t[7]); |
128 | } |
129 | |
130 | /*Performs an inverse 8 point Type-II DCT transform. |
131 | The output is scaled by a factor of 2 relative to the orthonormal version of |
132 | the transform. |
133 | _y: The buffer to store the result in. |
134 | Data will be placed in every 8th entry (e.g., in a column of an 8x8 |
135 | block). |
136 | _x: The input coefficients. |
137 | Only the first 3 entries are used. |
138 | The other 5 are assumed to be 0.*/ |
139 | static void idct8_3(ogg_int16_t *_y,const ogg_int16_t _x[8]){ |
140 | ogg_int32_t t[8]; |
141 | ogg_int32_t r; |
142 | /*Stage 1:*/ |
143 | t[0]=OC_C4S4*_x[0]>>16; |
144 | t[2]=OC_C6S2*_x[2]>>16; |
145 | t[3]=OC_C2S6*_x[2]>>16; |
146 | t[4]=OC_C7S1*_x[1]>>16; |
147 | t[7]=OC_C1S7*_x[1]>>16; |
148 | /*Stage 2:*/ |
149 | t[5]=OC_C4S4*t[4]>>16; |
150 | t[6]=OC_C4S4*t[7]>>16; |
151 | /*Stage 3:*/ |
152 | t[1]=t[0]+t[2]; |
153 | t[2]=t[0]-t[2]; |
154 | r=t[0]+t[3]; |
155 | t[3]=t[0]-t[3]; |
156 | t[0]=r; |
157 | r=t[6]+t[5]; |
158 | t[5]=t[6]-t[5]; |
159 | t[6]=r; |
160 | /*Stage 4:*/ |
161 | _y[0<<3]=(ogg_int16_t)(t[0]+t[7]); |
162 | _y[1<<3]=(ogg_int16_t)(t[1]+t[6]); |
163 | _y[2<<3]=(ogg_int16_t)(t[2]+t[5]); |
164 | _y[3<<3]=(ogg_int16_t)(t[3]+t[4]); |
165 | _y[4<<3]=(ogg_int16_t)(t[3]-t[4]); |
166 | _y[5<<3]=(ogg_int16_t)(t[2]-t[5]); |
167 | _y[6<<3]=(ogg_int16_t)(t[1]-t[6]); |
168 | _y[7<<3]=(ogg_int16_t)(t[0]-t[7]); |
169 | } |
170 | |
171 | /*Performs an inverse 8 point Type-II DCT transform. |
172 | The output is scaled by a factor of 2 relative to the orthonormal version of |
173 | the transform. |
174 | _y: The buffer to store the result in. |
175 | Data will be placed in every 8th entry (e.g., in a column of an 8x8 |
176 | block). |
177 | _x: The input coefficients. |
178 | Only the first 2 entries are used. |
179 | The other 6 are assumed to be 0.*/ |
180 | static void idct8_2(ogg_int16_t *_y,const ogg_int16_t _x[8]){ |
181 | ogg_int32_t t[8]; |
182 | ogg_int32_t r; |
183 | /*Stage 1:*/ |
184 | t[0]=OC_C4S4*_x[0]>>16; |
185 | t[4]=OC_C7S1*_x[1]>>16; |
186 | t[7]=OC_C1S7*_x[1]>>16; |
187 | /*Stage 2:*/ |
188 | t[5]=OC_C4S4*t[4]>>16; |
189 | t[6]=OC_C4S4*t[7]>>16; |
190 | /*Stage 3:*/ |
191 | r=t[6]+t[5]; |
192 | t[5]=t[6]-t[5]; |
193 | t[6]=r; |
194 | /*Stage 4:*/ |
195 | _y[0<<3]=(ogg_int16_t)(t[0]+t[7]); |
196 | _y[1<<3]=(ogg_int16_t)(t[0]+t[6]); |
197 | _y[2<<3]=(ogg_int16_t)(t[0]+t[5]); |
198 | _y[3<<3]=(ogg_int16_t)(t[0]+t[4]); |
199 | _y[4<<3]=(ogg_int16_t)(t[0]-t[4]); |
200 | _y[5<<3]=(ogg_int16_t)(t[0]-t[5]); |
201 | _y[6<<3]=(ogg_int16_t)(t[0]-t[6]); |
202 | _y[7<<3]=(ogg_int16_t)(t[0]-t[7]); |
203 | } |
204 | |
205 | /*Performs an inverse 8 point Type-II DCT transform. |
206 | The output is scaled by a factor of 2 relative to the orthonormal version of |
207 | the transform. |
208 | _y: The buffer to store the result in. |
209 | Data will be placed in every 8th entry (e.g., in a column of an 8x8 |
210 | block). |
211 | _x: The input coefficients. |
212 | Only the first entry is used. |
213 | The other 7 are assumed to be 0.*/ |
214 | static void idct8_1(ogg_int16_t *_y,const ogg_int16_t _x[1]){ |
215 | _y[0<<3]=_y[1<<3]=_y[2<<3]=_y[3<<3]= |
216 | _y[4<<3]=_y[5<<3]=_y[6<<3]=_y[7<<3]=(ogg_int16_t)(OC_C4S4*_x[0]>>16); |
217 | } |
218 | |
219 | /*Performs an inverse 8x8 Type-II DCT transform. |
220 | The input is assumed to be scaled by a factor of 4 relative to orthonormal |
221 | version of the transform. |
222 | All coefficients but the first 3 in zig-zag scan order are assumed to be 0: |
223 | x x 0 0 0 0 0 0 |
224 | x 0 0 0 0 0 0 0 |
225 | 0 0 0 0 0 0 0 0 |
226 | 0 0 0 0 0 0 0 0 |
227 | 0 0 0 0 0 0 0 0 |
228 | 0 0 0 0 0 0 0 0 |
229 | 0 0 0 0 0 0 0 0 |
230 | 0 0 0 0 0 0 0 0 |
231 | _y: The buffer to store the result in. |
232 | This may be the same as _x. |
233 | _x: The input coefficients.*/ |
234 | static void oc_idct8x8_3(ogg_int16_t _y[64],ogg_int16_t _x[64]){ |
235 | ogg_int16_t w[64]; |
236 | int i; |
237 | /*Transform rows of x into columns of w.*/ |
238 | idct8_2(w,_x); |
239 | idct8_1(w+1,_x+8); |
240 | /*Transform rows of w into columns of y.*/ |
241 | for(i=0;i<8;i++)idct8_2(_y+i,w+i*8); |
242 | /*Adjust for the scale factor.*/ |
243 | for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4); |
244 | /*Clear input data for next block.*/ |
245 | _x[0]=_x[1]=_x[8]=0; |
246 | } |
247 | |
248 | /*Performs an inverse 8x8 Type-II DCT transform. |
249 | The input is assumed to be scaled by a factor of 4 relative to orthonormal |
250 | version of the transform. |
251 | All coefficients but the first 10 in zig-zag scan order are assumed to be 0: |
252 | x x x x 0 0 0 0 |
253 | x x x 0 0 0 0 0 |
254 | x x 0 0 0 0 0 0 |
255 | x 0 0 0 0 0 0 0 |
256 | 0 0 0 0 0 0 0 0 |
257 | 0 0 0 0 0 0 0 0 |
258 | 0 0 0 0 0 0 0 0 |
259 | 0 0 0 0 0 0 0 0 |
260 | _y: The buffer to store the result in. |
261 | This may be the same as _x. |
262 | _x: The input coefficients.*/ |
263 | static void oc_idct8x8_10(ogg_int16_t _y[64],ogg_int16_t _x[64]){ |
264 | ogg_int16_t w[64]; |
265 | int i; |
266 | /*Transform rows of x into columns of w.*/ |
267 | idct8_4(w,_x); |
268 | idct8_3(w+1,_x+8); |
269 | idct8_2(w+2,_x+16); |
270 | idct8_1(w+3,_x+24); |
271 | /*Transform rows of w into columns of y.*/ |
272 | for(i=0;i<8;i++)idct8_4(_y+i,w+i*8); |
273 | /*Adjust for the scale factor.*/ |
274 | for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4); |
275 | /*Clear input data for next block.*/ |
276 | _x[0]=_x[1]=_x[2]=_x[3]=_x[8]=_x[9]=_x[10]=_x[16]=_x[17]=_x[24]=0; |
277 | } |
278 | |
279 | /*Performs an inverse 8x8 Type-II DCT transform. |
280 | The input is assumed to be scaled by a factor of 4 relative to orthonormal |
281 | version of the transform. |
282 | _y: The buffer to store the result in. |
283 | This may be the same as _x. |
284 | _x: The input coefficients.*/ |
285 | static void oc_idct8x8_slow(ogg_int16_t _y[64],ogg_int16_t _x[64]){ |
286 | ogg_int16_t w[64]; |
287 | int i; |
288 | /*Transform rows of x into columns of w.*/ |
289 | for(i=0;i<8;i++)idct8(w+i,_x+i*8); |
290 | /*Transform rows of w into columns of y.*/ |
291 | for(i=0;i<8;i++)idct8(_y+i,w+i*8); |
292 | /*Adjust for the scale factor.*/ |
293 | for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4); |
294 | /*Clear input data for next block.*/ |
295 | for(i=0;i<64;i++)_x[i]=0; |
296 | } |
297 | |
298 | /*Performs an inverse 8x8 Type-II DCT transform. |
299 | The input is assumed to be scaled by a factor of 4 relative to orthonormal |
300 | version of the transform.*/ |
301 | void oc_idct8x8_c(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){ |
302 | /*_last_zzi is subtly different from an actual count of the number of |
303 | coefficients we decoded for this block. |
304 | It contains the value of zzi BEFORE the final token in the block was |
305 | decoded. |
306 | In most cases this is an EOB token (the continuation of an EOB run from a |
307 | previous block counts), and so this is the same as the coefficient count. |
308 | However, in the case that the last token was NOT an EOB token, but filled |
309 | the block up with exactly 64 coefficients, _last_zzi will be less than 64. |
310 | Provided the last token was not a pure zero run, the minimum value it can |
311 | be is 46, and so that doesn't affect any of the cases in this routine. |
312 | However, if the last token WAS a pure zero run of length 63, then _last_zzi |
313 | will be 1 while the number of coefficients decoded is 64. |
314 | Thus, we will trigger the following special case, where the real |
315 | coefficient count would not. |
316 | Note also that a zero run of length 64 will give _last_zzi a value of 0, |
317 | but we still process the DC coefficient, which might have a non-zero value |
318 | due to DC prediction. |
319 | Although convoluted, this is arguably the correct behavior: it allows us to |
320 | use a smaller transform when the block ends with a long zero run instead |
321 | of a normal EOB token. |
322 | It could be smarter... multiple separate zero runs at the end of a block |
323 | will fool it, but an encoder that generates these really deserves what it |
324 | gets. |
325 | Needless to say we inherited this approach from VP3.*/ |
326 | /*Then perform the iDCT.*/ |
327 | if(_last_zzi<=3)oc_idct8x8_3(_y,_x); |
328 | else if(_last_zzi<=10)oc_idct8x8_10(_y,_x); |
329 | else oc_idct8x8_slow(_y,_x); |
330 | } |
331 | |