1 | /******************************************************************** |
2 | * * |
3 | * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * |
4 | * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * |
5 | * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * |
6 | * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * |
7 | * * |
8 | * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * |
9 | * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * |
10 | * * |
11 | ******************************************************************** |
12 | |
13 | function: |
14 | last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $ |
15 | |
16 | ********************************************************************/ |
17 | |
18 | /*SSE2 acceleration of Theora's iDCT.*/ |
19 | #include "x86int.h" |
20 | #include "sse2trans.h" |
21 | #include "../dct.h" |
22 | |
23 | #if defined(OC_X86_ASM) |
24 | |
25 | /*A table of constants used by the MMX routines.*/ |
26 | const unsigned short __attribute__((aligned(16),used)) OC_IDCT_CONSTS[64]={ |
27 | 8, 8, 8, 8, 8, 8, 8, 8, |
28 | OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7, |
29 | OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6, |
30 | OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5, |
31 | OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4, |
32 | OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3, |
33 | OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2, |
34 | OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1 |
35 | }; |
36 | |
37 | |
/*Performs the first three stages of the iDCT on eight columns at once.
  xmm2, xmm6, xmm3, and xmm5 must contain the corresponding rows of the input
   (accessed in that order).
  The remaining rows (7, 1, 4, and 0, loaded in that order) must be in _x at
   their corresponding locations, i.e., at byte offset 0x10*row.
  Besides _x, the expansion refers to two asm operands by fixed name, so the
   enclosing asm statement must provide both:
   c:   The constant table; the rows at byte offsets 0x10 through 0x70 are
         read.
   buf: Scratch space; the 16-byte slots at offsets 0x00 and 0x10 hold t[2]
         and t[3] from the end of the 2-3 rotation until Stage 3 reloads them.
  Several of the pmulhw products below are followed by a paddw of the
   multiplicand.
  NOTE(review): presumably this compensates for constants of 0x8000 or more
   being treated as negative by the signed pmulhw; the constant values are
   defined in ../dct.h, confirm there.
  On output, xmm7 down to xmm4 contain t[0] through t[3], and xmm3 down to
   xmm0 contain t[4] through t[7], which is the layout the Stage 4 comments
   of OC_IDCT_8x8_D and OC_IDCT_8x8_D_STORE describe.
  All of xmm0 through xmm7 are overwritten.*/
#define OC_IDCT_8x8_ABC(_x) \
  "#OC_IDCT_8x8_ABC\n\t" \
  /*Stage 1:*/ \
  /*2-3 rotation by 6pi/16. \
    xmm4=xmm7=C6, xmm0=xmm1=C2, xmm2=X2, xmm6=X6.*/ \
  "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm1\n\t" \
  "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm4\n\t" \
  "movdqa %%xmm1,%%xmm0\n\t" \
  "pmulhw %%xmm2,%%xmm1\n\t" \
  "movdqa %%xmm4,%%xmm7\n\t" \
  "pmulhw %%xmm6,%%xmm0\n\t" \
  "pmulhw %%xmm2,%%xmm7\n\t" \
  "pmulhw %%xmm6,%%xmm4\n\t" \
  "paddw %%xmm6,%%xmm0\n\t" \
  "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm6\n\t" \
  "paddw %%xmm1,%%xmm2\n\t" \
  "psubw %%xmm0,%%xmm7\n\t" \
  /*Spill t[2]; it is reloaded into xmm6 at the end of Stage 2.*/ \
  "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
  "paddw %%xmm4,%%xmm2\n\t" \
  "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm4\n\t" \
  /*Spill t[3]; it is reloaded into xmm7 at the end of Stage 2.*/ \
  "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
  /*5-6 rotation by 3pi/16. \
    xmm4=xmm2=C5, xmm1=xmm6=C3, xmm3=X3, xmm5=X5.*/ \
  "movdqa %%xmm4,%%xmm2\n\t" \
  "movdqa %%xmm6,%%xmm1\n\t" \
  "pmulhw %%xmm3,%%xmm4\n\t" \
  "pmulhw %%xmm5,%%xmm1\n\t" \
  "pmulhw %%xmm3,%%xmm6\n\t" \
  "pmulhw %%xmm5,%%xmm2\n\t" \
  "paddw %%xmm3,%%xmm4\n\t" \
  "paddw %%xmm5,%%xmm3\n\t" \
  "paddw %%xmm6,%%xmm3\n\t" \
  /*The loads for the next rotation (X7, X1, C7, C1) are interleaved with \
     the tail of this one.*/ \
  "movdqa "OC_MEM_OFFS(0x70,_x)",%%xmm6\n\t" \
  "paddw %%xmm5,%%xmm1\n\t" \
  "movdqa "OC_MEM_OFFS(0x10,_x)",%%xmm5\n\t" \
  "paddw %%xmm3,%%xmm2\n\t" \
  "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
  "psubw %%xmm4,%%xmm1\n\t" \
  "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm4\n\t" \
  /*4-7 rotation by 7pi/16. \
    xmm4=xmm7=C1, xmm3=xmm0=C7, xmm5=X1, xmm6=X7.*/ \
  "movdqa %%xmm3,%%xmm0\n\t" \
  "movdqa %%xmm4,%%xmm7\n\t" \
  "pmulhw %%xmm5,%%xmm3\n\t" \
  "pmulhw %%xmm5,%%xmm7\n\t" \
  "pmulhw %%xmm6,%%xmm4\n\t" \
  "pmulhw %%xmm6,%%xmm0\n\t" \
  "paddw %%xmm6,%%xmm4\n\t" \
  "movdqa "OC_MEM_OFFS(0x40,_x)",%%xmm6\n\t" \
  "paddw %%xmm5,%%xmm7\n\t" \
  "psubw %%xmm4,%%xmm3\n\t" \
  "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
  "paddw %%xmm7,%%xmm0\n\t" \
  "movdqa "OC_MEM_OFFS(0x00,_x)",%%xmm7\n\t" \
  /*0-1 butterfly. \
    xmm4=xmm5=C4, xmm7=X0, xmm6=X4.*/ \
  "paddw %%xmm7,%%xmm6\n\t" \
  "movdqa %%xmm4,%%xmm5\n\t" \
  "pmulhw %%xmm6,%%xmm4\n\t" \
  /*X0-X4 is formed as 2*X0-(X0+X4), which avoids a register copy.*/ \
  "paddw %%xmm7,%%xmm7\n\t" \
  "psubw %%xmm6,%%xmm7\n\t" \
  "paddw %%xmm6,%%xmm4\n\t" \
  /*Stage 2:*/ \
  /*4-5 butterfly: xmm3=t[4], xmm1=t[5] \
    7-6 butterfly: xmm2=t[6], xmm0=t[7]*/ \
  "movdqa %%xmm3,%%xmm6\n\t" \
  "paddw %%xmm1,%%xmm3\n\t" \
  "psubw %%xmm1,%%xmm6\n\t" \
  "movdqa %%xmm5,%%xmm1\n\t" \
  "pmulhw %%xmm7,%%xmm5\n\t" \
  "paddw %%xmm7,%%xmm5\n\t" \
  "movdqa %%xmm0,%%xmm7\n\t" \
  "paddw %%xmm2,%%xmm0\n\t" \
  "psubw %%xmm2,%%xmm7\n\t" \
  "movdqa %%xmm1,%%xmm2\n\t" \
  "pmulhw %%xmm6,%%xmm1\n\t" \
  "pmulhw %%xmm7,%%xmm2\n\t" \
  "paddw %%xmm6,%%xmm1\n\t" \
  /*Reload the t[2] and t[3] values spilled during the 2-3 rotation.*/ \
  "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
  "paddw %%xmm7,%%xmm2\n\t" \
  "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
  /*Stage 3: \
    6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
    0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
    1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
  /*Each difference is formed as 2*a-(a+b) once the sum has been computed.*/ \
  "paddw %%xmm2,%%xmm1\n\t" \
  "paddw %%xmm5,%%xmm6\n\t" \
  "paddw %%xmm4,%%xmm7\n\t" \
  "paddw %%xmm2,%%xmm2\n\t" \
  "paddw %%xmm4,%%xmm4\n\t" \
  "paddw %%xmm5,%%xmm5\n\t" \
  "psubw %%xmm1,%%xmm2\n\t" \
  "psubw %%xmm7,%%xmm4\n\t" \
  "psubw %%xmm6,%%xmm5\n\t" \
138 | |
/*Performs the last stage of the iDCT, leaving the result in registers.
  On input, xmm7 down to xmm4 contain t[0] through t[3], and xmm3 down to
   xmm0 contain t[4] through t[7] (the layout OC_IDCT_8x8_ABC produces).
  On output, xmm0 through xmm7 contain the corresponding rows.
  No rounding bias or shift is applied here, and no memory is accessed.
  Each butterfly avoids a temporary: the difference a-b is formed first, then
   b is doubled and the difference is added back to give a+b.*/
#define OC_IDCT_8x8_D \
  "#OC_IDCT_8x8_D\n\t" \
  /*Stage 4: \
    0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
    1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
    2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
    3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
  /*Differences.*/ \
  "psubw %%xmm0,%%xmm7\n\t" \
  "psubw %%xmm1,%%xmm6\n\t" \
  "psubw %%xmm2,%%xmm5\n\t" \
  "psubw %%xmm3,%%xmm4\n\t" \
  /*Double the subtrahends...*/ \
  "paddw %%xmm0,%%xmm0\n\t" \
  "paddw %%xmm1,%%xmm1\n\t" \
  "paddw %%xmm2,%%xmm2\n\t" \
  "paddw %%xmm3,%%xmm3\n\t" \
  /*...and add the differences to obtain the sums.*/ \
  "paddw %%xmm7,%%xmm0\n\t" \
  "paddw %%xmm6,%%xmm1\n\t" \
  "paddw %%xmm5,%%xmm2\n\t" \
  "paddw %%xmm4,%%xmm3\n\t" \
162 | |
/*Performs the last stage of the iDCT, then rounds, scales, and stores the
   result.
  On input, xmm7 down to xmm4 contain t[0] through t[3], and xmm3 down to
   xmm0 contain t[4] through t[7] (the layout OC_IDCT_8x8_ABC and
   OC_IDCT_8x8_10_ABC produce).
  The enclosing asm statement must provide operands named c and y.
  The row at offset 0x00 of c is used as a rounding bias (both callers in
   this file bind c to OC_IDCT_CONSTS, whose first row is eight copies of 8).
  The bias is added to the four differences and reaches the four sums through
   the doubling step, so every output is (value+bias)>>4 with an arithmetic
   shift.
  Row i is stored at byte offset 0x10*i of y.
  The slot at offset 0x40 of y is also used as a temporary: it holds
   t[3]-t[4] while xmm4 is borrowed for the bias, and is overwritten with the
   finished row 4 at the end.
  On output, xmm0 through xmm7 contain the corresponding (rounded and
   shifted) rows, in addition to the copies stored in y.*/
#define OC_IDCT_8x8_D_STORE \
  "#OC_IDCT_8x8_D_STORE\n\t" \
  /*Stage 4: \
    0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
    1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
    2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
    3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
  "psubw %%xmm3,%%xmm4\n\t" \
  /*Park t[3]-t[4] in y so that xmm4 can hold the rounding bias.*/ \
  "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
  "movdqa "OC_MEM_OFFS(0x00,c)",%%xmm4\n\t" \
  "psubw %%xmm0,%%xmm7\n\t" \
  "psubw %%xmm1,%%xmm6\n\t" \
  "psubw %%xmm2,%%xmm5\n\t" \
  /*Add the bias to the differences.*/ \
  "paddw %%xmm4,%%xmm7\n\t" \
  "paddw %%xmm4,%%xmm6\n\t" \
  "paddw %%xmm4,%%xmm5\n\t" \
  /*xmm4 becomes the biased t[3]-t[4].*/ \
  "paddw "OC_MEM_OFFS(0x40,y)",%%xmm4\n\t" \
  "paddw %%xmm0,%%xmm0\n\t" \
  "paddw %%xmm1,%%xmm1\n\t" \
  "paddw %%xmm2,%%xmm2\n\t" \
  "paddw %%xmm3,%%xmm3\n\t" \
  /*The sums inherit the bias from the biased differences.*/ \
  "paddw %%xmm7,%%xmm0\n\t" \
  "paddw %%xmm6,%%xmm1\n\t" \
  "psraw $4,%%xmm0\n\t" \
  "paddw %%xmm5,%%xmm2\n\t" \
  "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" \
  "psraw $4,%%xmm1\n\t" \
  "paddw %%xmm4,%%xmm3\n\t" \
  "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" \
  "psraw $4,%%xmm2\n\t" \
  "movdqa %%xmm2,"OC_MEM_OFFS(0x20,y)"\n\t" \
  "psraw $4,%%xmm3\n\t" \
  "movdqa %%xmm3,"OC_MEM_OFFS(0x30,y)"\n\t" \
  "psraw $4,%%xmm4\n\t" \
  "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
  "psraw $4,%%xmm5\n\t" \
  "movdqa %%xmm5,"OC_MEM_OFFS(0x50,y)"\n\t" \
  "psraw $4,%%xmm6\n\t" \
  "movdqa %%xmm6,"OC_MEM_OFFS(0x60,y)"\n\t" \
  "psraw $4,%%xmm7\n\t" \
  "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" \
208 | |
209 | static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){ |
210 | OC_ALIGN16(ogg_int16_t buf[16]); |
211 | int i; |
212 | /*This routine accepts an 8x8 matrix pre-transposed.*/ |
213 | __asm__ __volatile__( |
214 | /*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/ |
215 | "movdqa " OC_MEM_OFFS(0x20,x)",%%xmm2\n\t" |
216 | "movdqa " OC_MEM_OFFS(0x60,x)",%%xmm6\n\t" |
217 | "movdqa " OC_MEM_OFFS(0x30,x)",%%xmm3\n\t" |
218 | "movdqa " OC_MEM_OFFS(0x50,x)",%%xmm5\n\t" |
219 | OC_IDCT_8x8_ABC(x) |
220 | OC_IDCT_8x8_D |
221 | OC_TRANSPOSE_8x8 |
222 | /*Clear out rows 0, 1, 4, and 7 for the first stage of the iDCT.*/ |
223 | "movdqa %%xmm7," OC_MEM_OFFS(0x70,y)"\n\t" |
224 | "movdqa %%xmm4," OC_MEM_OFFS(0x40,y)"\n\t" |
225 | "movdqa %%xmm1," OC_MEM_OFFS(0x10,y)"\n\t" |
226 | "movdqa %%xmm0," OC_MEM_OFFS(0x00,y)"\n\t" |
227 | OC_IDCT_8x8_ABC(y) |
228 | OC_IDCT_8x8_D_STORE |
229 | :[buf]"=m" (OC_ARRAY_OPERAND(ogg_int16_t,buf,16)), |
230 | [y]"=m" (OC_ARRAY_OPERAND(ogg_int16_t,_y,64)) |
231 | :[x]"m" (OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)), |
232 | [c]"m" (OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)) |
233 | ); |
234 | __asm__ __volatile__("pxor %%xmm0,%%xmm0\n\t" ::); |
235 | /*Clear input data for next block (decoder only).*/ |
236 | for(i=0;i<2;i++){ |
237 | __asm__ __volatile__( |
238 | "movdqa %%xmm0," OC_MEM_OFFS(0x00,x)"\n\t" |
239 | "movdqa %%xmm0," OC_MEM_OFFS(0x10,x)"\n\t" |
240 | "movdqa %%xmm0," OC_MEM_OFFS(0x20,x)"\n\t" |
241 | "movdqa %%xmm0," OC_MEM_OFFS(0x30,x)"\n\t" |
242 | :[x]"=m" (OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32)) |
243 | ); |
244 | } |
245 | } |
246 | |
/*For the first step of the 10-coefficient version of the 8x8 iDCT, we only
   need to work with four columns at a time.
  Doing this in MMX is faster on processors with a 64-bit data path.
  On input, mm0 through mm3 must contain X0 through X3 (four 16-bit values
   each).
  X4 through X7 are taken to be zero, so the products involving them are
   omitted.
  All four stages are performed here; on output, mm0 through mm7 contain the
   corresponding rows (see the Stage 4 comments).
  The enclosing asm statement must provide operands named c and buf.
  Only the first 8 bytes of each 16-byte constant row of c, and of the buf
   slots at offsets 0x00 and 0x10, are accessed.*/
#define OC_IDCT_8x8_10_MMX \
  "#OC_IDCT_8x8_10_MMX\n\t" \
  /*Stage 1:*/ \
  /*2-3 rotation by 6pi/16. \
    mm7=C6, mm6=C2, mm2=X2, X6=0.*/ \
  "movq "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
  "movq "OC_MEM_OFFS(0x20,c)",%%mm6\n\t" \
  "pmulhw %%mm2,%%mm6\n\t" \
  "pmulhw %%mm2,%%mm7\n\t" \
  "movq "OC_MEM_OFFS(0x50,c)",%%mm5\n\t" \
  "paddw %%mm6,%%mm2\n\t" \
  /*Spill t[3] and (two instructions later) t[2]; both are reloaded at the \
     end of Stage 2.*/ \
  "movq %%mm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
  "movq "OC_MEM_OFFS(0x30,c)",%%mm2\n\t" \
  "movq %%mm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
  /*5-6 rotation by 3pi/16. \
    mm5=C5, mm2=C3, mm3=X3, X5=0.*/ \
  "pmulhw %%mm3,%%mm5\n\t" \
  "pmulhw %%mm3,%%mm2\n\t" \
  "movq "OC_MEM_OFFS(0x10,c)",%%mm7\n\t" \
  "paddw %%mm3,%%mm5\n\t" \
  "paddw %%mm3,%%mm2\n\t" \
  "movq "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
  /*4-7 rotation by 7pi/16. \
    mm7=C1, mm3=C7, mm1=X1, X7=0.*/ \
  "pmulhw %%mm1,%%mm3\n\t" \
  "pmulhw %%mm1,%%mm7\n\t" \
  "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
  "movq %%mm3,%%mm6\n\t" \
  "paddw %%mm1,%%mm7\n\t" \
  /*0-1 butterfly. \
    mm4=C4, mm0=X0, X4=0.*/ \
  /*Stage 2:*/ \
  /*4-5 butterfly: mm3=t[4], mm5=t[5] \
    7-6 butterfly: mm2=t[6], mm7=t[7]*/ \
  /*mm5 holds the C5*X3 product, which is the negative of the t[5] that \
     OC_IDCT_8x8_ABC computes when X5=0, so the psubw and paddw below are \
     swapped relative to that macro.*/ \
  "psubw %%mm5,%%mm3\n\t" \
  "paddw %%mm5,%%mm6\n\t" \
  "movq %%mm4,%%mm1\n\t" \
  "pmulhw %%mm0,%%mm4\n\t" \
  "paddw %%mm0,%%mm4\n\t" \
  "movq %%mm7,%%mm0\n\t" \
  /*With X4=0, t[1] is the same value as t[0], so it is simply copied.*/ \
  "movq %%mm4,%%mm5\n\t" \
  "paddw %%mm2,%%mm0\n\t" \
  "psubw %%mm2,%%mm7\n\t" \
  "movq %%mm1,%%mm2\n\t" \
  "pmulhw %%mm6,%%mm1\n\t" \
  "pmulhw %%mm7,%%mm2\n\t" \
  "paddw %%mm6,%%mm1\n\t" \
  "movq "OC_MEM_OFFS(0x00,buf)",%%mm6\n\t" \
  "paddw %%mm7,%%mm2\n\t" \
  "movq "OC_MEM_OFFS(0x10,buf)",%%mm7\n\t" \
  /*Stage 3: \
    6-5 butterfly: mm1=t[5], mm2=t[6] -> mm1=t[6]+t[5], mm2=t[6]-t[5] \
    0-3 butterfly: mm4=t[0], mm7=t[3] -> mm7=t[0]+t[3], mm4=t[0]-t[3] \
    1-2 butterfly: mm5=t[1], mm6=t[2] -> mm6=t[1]+t[2], mm5=t[1]-t[2]*/ \
  "paddw %%mm2,%%mm1\n\t" \
  "paddw %%mm5,%%mm6\n\t" \
  "paddw %%mm4,%%mm7\n\t" \
  "paddw %%mm2,%%mm2\n\t" \
  "paddw %%mm4,%%mm4\n\t" \
  "paddw %%mm5,%%mm5\n\t" \
  "psubw %%mm1,%%mm2\n\t" \
  "psubw %%mm7,%%mm4\n\t" \
  "psubw %%mm6,%%mm5\n\t" \
  /*Stage 4: \
    0-7 butterfly: mm7=t[0], mm0=t[7] -> mm0=t[0]+t[7], mm7=t[0]-t[7] \
    1-6 butterfly: mm6=t[1], mm1=t[6] -> mm1=t[1]+t[6], mm6=t[1]-t[6] \
    2-5 butterfly: mm5=t[2], mm2=t[5] -> mm2=t[2]+t[5], mm5=t[2]-t[5] \
    3-4 butterfly: mm4=t[3], mm3=t[4] -> mm3=t[3]+t[4], mm4=t[3]-t[4]*/ \
  "psubw %%mm0,%%mm7\n\t" \
  "psubw %%mm1,%%mm6\n\t" \
  "psubw %%mm2,%%mm5\n\t" \
  "psubw %%mm3,%%mm4\n\t" \
  "paddw %%mm0,%%mm0\n\t" \
  "paddw %%mm1,%%mm1\n\t" \
  "paddw %%mm2,%%mm2\n\t" \
  "paddw %%mm3,%%mm3\n\t" \
  "paddw %%mm7,%%mm0\n\t" \
  "paddw %%mm6,%%mm1\n\t" \
  "paddw %%mm5,%%mm2\n\t" \
  "paddw %%mm4,%%mm3\n\t" \
330 | |
/*Performs the first three stages of the 10-coefficient version of the iDCT
   on eight columns at once.
  This is the SSE2 counterpart of Stages 1 through 3 of OC_IDCT_8x8_10_MMX,
   with the same instruction sequence applied to xmm registers.
  On input, xmm0 through xmm3 must contain X0 through X3; X4 through X7 are
   taken to be zero.
  NOTE(review): in oc_idct8x8_10_sse2() these inputs are presumably left in
   place by OC_TRANSPOSE_8x4_MMX2SSE (defined in sse2trans.h); confirm there.
  The enclosing asm statement must provide operands named c and buf; the buf
   slots at offsets 0x00 and 0x10 hold t[2] and t[3] until the end of
   Stage 2.
  On output, xmm7 down to xmm4 contain t[0] through t[3], and xmm3 down to
   xmm0 contain t[4] through t[7], as OC_IDCT_8x8_D_STORE expects.*/
#define OC_IDCT_8x8_10_ABC \
  "#OC_IDCT_8x8_10_ABC\n\t" \
  /*Stage 1:*/ \
  /*2-3 rotation by 6pi/16. \
    xmm7=C6, xmm6=C2, xmm2=X2, X6=0.*/ \
  "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm7\n\t" \
  "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm6\n\t" \
  "pmulhw %%xmm2,%%xmm6\n\t" \
  "pmulhw %%xmm2,%%xmm7\n\t" \
  "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm5\n\t" \
  "paddw %%xmm6,%%xmm2\n\t" \
  /*Spill t[3] and (two instructions later) t[2]; both are reloaded at the \
     end of Stage 2.*/ \
  "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
  "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm2\n\t" \
  "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
  /*5-6 rotation by 3pi/16. \
    xmm5=C5, xmm2=C3, xmm3=X3, X5=0.*/ \
  "pmulhw %%xmm3,%%xmm5\n\t" \
  "pmulhw %%xmm3,%%xmm2\n\t" \
  "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm7\n\t" \
  "paddw %%xmm3,%%xmm5\n\t" \
  "paddw %%xmm3,%%xmm2\n\t" \
  "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
  /*4-7 rotation by 7pi/16. \
    xmm7=C1, xmm3=C7, xmm1=X1, X7=0.*/ \
  "pmulhw %%xmm1,%%xmm3\n\t" \
  "pmulhw %%xmm1,%%xmm7\n\t" \
  "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
  "movdqa %%xmm3,%%xmm6\n\t" \
  "paddw %%xmm1,%%xmm7\n\t" \
  /*0-1 butterfly. \
    xmm4=C4, xmm0=X0, X4=0.*/ \
  /*Stage 2:*/ \
  /*4-5 butterfly: xmm3=t[4], xmm5=t[5] \
    7-6 butterfly: xmm2=t[6], xmm7=t[7]*/ \
  /*xmm5 holds the C5*X3 product, which is the negative of the t[5] that \
     OC_IDCT_8x8_ABC computes when X5=0, so the psubw and paddw below are \
     swapped relative to that macro.*/ \
  "psubw %%xmm5,%%xmm3\n\t" \
  "paddw %%xmm5,%%xmm6\n\t" \
  "movdqa %%xmm4,%%xmm1\n\t" \
  "pmulhw %%xmm0,%%xmm4\n\t" \
  "paddw %%xmm0,%%xmm4\n\t" \
  "movdqa %%xmm7,%%xmm0\n\t" \
  /*With X4=0, t[1] is the same value as t[0], so it is simply copied.*/ \
  "movdqa %%xmm4,%%xmm5\n\t" \
  "paddw %%xmm2,%%xmm0\n\t" \
  "psubw %%xmm2,%%xmm7\n\t" \
  "movdqa %%xmm1,%%xmm2\n\t" \
  "pmulhw %%xmm6,%%xmm1\n\t" \
  "pmulhw %%xmm7,%%xmm2\n\t" \
  "paddw %%xmm6,%%xmm1\n\t" \
  "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
  "paddw %%xmm7,%%xmm2\n\t" \
  "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
  /*Stage 3: \
    6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
    0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
    1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
  "paddw %%xmm2,%%xmm1\n\t" \
  "paddw %%xmm5,%%xmm6\n\t" \
  "paddw %%xmm4,%%xmm7\n\t" \
  "paddw %%xmm2,%%xmm2\n\t" \
  "paddw %%xmm4,%%xmm4\n\t" \
  "paddw %%xmm5,%%xmm5\n\t" \
  "psubw %%xmm1,%%xmm2\n\t" \
  "psubw %%xmm7,%%xmm4\n\t" \
  "psubw %%xmm6,%%xmm5\n\t" \
394 | |
395 | static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){ |
396 | OC_ALIGN16(ogg_int16_t buf[16]); |
397 | /*This routine accepts an 8x8 matrix pre-transposed.*/ |
398 | __asm__ __volatile__( |
399 | "movq " OC_MEM_OFFS(0x20,x)",%%mm2\n\t" |
400 | "movq " OC_MEM_OFFS(0x30,x)",%%mm3\n\t" |
401 | "movq " OC_MEM_OFFS(0x10,x)",%%mm1\n\t" |
402 | "movq " OC_MEM_OFFS(0x00,x)",%%mm0\n\t" |
403 | OC_IDCT_8x8_10_MMX |
404 | OC_TRANSPOSE_8x4_MMX2SSE |
405 | OC_IDCT_8x8_10_ABC |
406 | OC_IDCT_8x8_D_STORE |
407 | :[buf]"=m" (OC_ARRAY_OPERAND(short,buf,16)), |
408 | [y]"=m" (OC_ARRAY_OPERAND(ogg_int16_t,_y,64)) |
409 | :[x]"m" OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64), |
410 | [c]"m" (OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)) |
411 | ); |
412 | /*Clear input data for next block (decoder only).*/ |
413 | __asm__ __volatile__( |
414 | "pxor %%mm0,%%mm0\n\t" |
415 | "movq %%mm0," OC_MEM_OFFS(0x00,x)"\n\t" |
416 | "movq %%mm0," OC_MEM_OFFS(0x10,x)"\n\t" |
417 | "movq %%mm0," OC_MEM_OFFS(0x20,x)"\n\t" |
418 | "movq %%mm0," OC_MEM_OFFS(0x30,x)"\n\t" |
419 | :[x]"+m" (OC_ARRAY_OPERAND(ogg_int16_t,_x,28)) |
420 | ); |
421 | } |
422 | |
423 | /*Performs an inverse 8x8 Type-II DCT transform. |
424 | The input is assumed to be scaled by a factor of 4 relative to orthonormal |
425 | version of the transform.*/ |
426 | void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){ |
427 | /*_last_zzi is subtly different from an actual count of the number of |
428 | coefficients we decoded for this block. |
429 | It contains the value of zzi BEFORE the final token in the block was |
430 | decoded. |
431 | In most cases this is an EOB token (the continuation of an EOB run from a |
432 | previous block counts), and so this is the same as the coefficient count. |
433 | However, in the case that the last token was NOT an EOB token, but filled |
434 | the block up with exactly 64 coefficients, _last_zzi will be less than 64. |
435 | Provided the last token was not a pure zero run, the minimum value it can |
436 | be is 46, and so that doesn't affect any of the cases in this routine. |
437 | However, if the last token WAS a pure zero run of length 63, then _last_zzi |
438 | will be 1 while the number of coefficients decoded is 64. |
439 | Thus, we will trigger the following special case, where the real |
440 | coefficient count would not. |
441 | Note also that a zero run of length 64 will give _last_zzi a value of 0, |
442 | but we still process the DC coefficient, which might have a non-zero value |
443 | due to DC prediction. |
444 | Although convoluted, this is arguably the correct behavior: it allows us to |
445 | use a smaller transform when the block ends with a long zero run instead |
446 | of a normal EOB token. |
447 | It could be smarter... multiple separate zero runs at the end of a block |
448 | will fool it, but an encoder that generates these really deserves what it |
449 | gets. |
450 | Needless to say we inherited this approach from VP3.*/ |
451 | /*Then perform the iDCT.*/ |
452 | if(_last_zzi<=10)oc_idct8x8_10_sse2(_y,_x); |
453 | else oc_idct8x8_slow_sse2(_y,_x); |
454 | } |
455 | |
456 | #endif |
457 | |