| 1 | /******************************************************************** | 
| 2 |  *                                                                  * | 
| 3 |  * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   * | 
| 4 |  * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     * | 
| 5 |  * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * | 
| 6 |  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       * | 
| 7 |  *                                                                  * | 
| 8 |  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                * | 
| 9 |  * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * | 
| 10 |  *                                                                  * | 
| 11 |  ******************************************************************** | 
| 12 |  | 
| 13 |   function: | 
| 14 |     last mod: $Id$ | 
| 15 |  | 
| 16 |  ********************************************************************/ | 
| 17 |  | 
| 18 | /*MMX acceleration of Theora's iDCT. | 
| 19 |   Originally written by Rudolf Marek, based on code from On2's VP3.*/ | 
| 20 | #include "x86int.h" | 
| 21 | #include "../dct.h" | 
| 22 |  | 
| 23 | #if defined(OC_X86_ASM) | 
| 24 |  | 
| 25 | /*These are offsets into the table of constants; as far as this listing shows, no table is defined in this file, and the asm below reaches OC_IDCT_CONSTS through its [c] operand with literal byte offsets without referencing either macro. NOTE(review): that asm adds the entry at byte offset 0x00 right before each psraw $4 and multiplies by the entries at 0x10...0x70, which does not obviously agree with the values below; confirm against the table definition before relying on them.*/ | 
| 26 | /*7 rows of cosines, in order: pi/16 * (1 ... 7).*/ | 
| 27 | #define OC_COSINE_OFFSET (0) | 
| 28 | /*A row of 8's.*/ | 
| 29 | #define OC_EIGHT_OFFSET  (56) | 
| 30 |  | 
| 31 |  | 
| 32 |  | 
| 33 | /*38 cycles. Shared first stage of the 8-point iDCT, working on four 16-bit lanes at once. Reads I(0)...I(3) and J(4)...J(7) of _x, scales them with pmulhw by the constants at byte offsets 0x10...0x70 of c, and uses I(1,_y) and I(2,_y) as spill slots. In the notation of the comments in OC_ROW_IDCT and OC_COLUMN_IDCT it finishes with mm0=C', mm1=H', mm2=A''-H', mm4=E, mm5=B'', mm6=F', mm7=G and D' stored at I(2,_y); mm3 is free (both callers reload it from I(2,_y)). I(1,_x) and I(2,_x) are loaded before I(1,_y) and I(2,_y) are written, so _x and _y may name the same buffer.*/ | 
| 34 | #define OC_IDCT_BEGIN(_y,_x) \ | 
| 35 |   "#OC_IDCT_BEGIN\n\t" \ | 
| 36 |   "movq "OC_I(3,_x)",%%mm2\n\t" \ | 
| 37 |   "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \ | 
| 38 |   "movq %%mm2,%%mm4\n\t" \ | 
| 39 |   "movq "OC_J(5,_x)",%%mm7\n\t" \ | 
| 40 |   "pmulhw %%mm6,%%mm4\n\t" \ | 
| 41 |   "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \ | 
| 42 |   "pmulhw %%mm7,%%mm6\n\t" \ | 
| 43 |   "movq %%mm1,%%mm5\n\t" \ | 
| 44 |   "pmulhw %%mm2,%%mm1\n\t" \ | 
| 45 |   "movq "OC_I(1,_x)",%%mm3\n\t" \ | 
| 46 |   "pmulhw %%mm7,%%mm5\n\t" \ | 
| 47 |   "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \ | 
| 48 |   "paddw %%mm2,%%mm4\n\t" \ | 
| 49 |   "paddw %%mm7,%%mm6\n\t" \ | 
| 50 |   "paddw %%mm1,%%mm2\n\t" \ | 
| 51 |   "movq "OC_J(7,_x)",%%mm1\n\t" \ | 
| 52 |   "paddw %%mm5,%%mm7\n\t" \ | 
| 53 |   "movq %%mm0,%%mm5\n\t" \ | 
| 54 |   "pmulhw %%mm3,%%mm0\n\t" \ | 
| 55 |   "paddw %%mm7,%%mm4\n\t" \ | 
| 56 |   "pmulhw %%mm1,%%mm5\n\t" \ | 
| 57 |   "movq "OC_MEM_OFFS(0x70,c)",%%mm7\n\t" \ | 
| 58 |   "psubw %%mm2,%%mm6\n\t" \ | 
| 59 |   "paddw %%mm3,%%mm0\n\t" \ | 
| 60 |   "pmulhw %%mm7,%%mm3\n\t" \ | 
| 61 |   "movq "OC_I(2,_x)",%%mm2\n\t" \ | 
| 62 |   "pmulhw %%mm1,%%mm7\n\t" \ | 
| 63 |   "paddw %%mm1,%%mm5\n\t" \ | 
| 64 |   "movq %%mm2,%%mm1\n\t" \ | 
| 65 |   "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm2\n\t" \ | 
| 66 |   "psubw %%mm5,%%mm3\n\t" \ | 
| 67 |   "movq "OC_J(6,_x)",%%mm5\n\t" \ | 
| 68 |   "paddw %%mm7,%%mm0\n\t" \ | 
| 69 |   "movq %%mm5,%%mm7\n\t" \ | 
| 70 |   "psubw %%mm4,%%mm0\n\t" \ | 
| 71 |   "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \ | 
| 72 |   "paddw %%mm1,%%mm2\n\t" \ | 
| 73 |   "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \ | 
| 74 |   "paddw %%mm4,%%mm4\n\t" \ | 
| 75 |   "paddw %%mm0,%%mm4\n\t" \ | 
| 76 |   "psubw %%mm6,%%mm3\n\t" \ | 
| 77 |   "paddw %%mm7,%%mm5\n\t" \ | 
| 78 |   "paddw %%mm6,%%mm6\n\t" \ | 
| 79 |   "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \ | 
| 80 |   "paddw %%mm3,%%mm6\n\t" \ | 
| 81 |   /*Spill mm4 to I(1,_y); it is reloaded into mm0 at the end of this macro.*/ "movq %%mm4,"OC_I(1,_y)"\n\t" \ | 
| 82 |   "psubw %%mm5,%%mm1\n\t" \ | 
| 83 |   "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \ | 
| 84 |   "movq %%mm3,%%mm5\n\t" \ | 
| 85 |   "pmulhw %%mm4,%%mm3\n\t" \ | 
| 86 |   "paddw %%mm2,%%mm7\n\t" \ | 
| 87 |   /*Spill mm6 to I(2,_y); the macros that follow reload it into mm3.*/ "movq %%mm6,"OC_I(2,_y)"\n\t" \ | 
| 88 |   "movq %%mm0,%%mm2\n\t" \ | 
| 89 |   "movq "OC_I(0,_x)",%%mm6\n\t" \ | 
| 90 |   "pmulhw %%mm4,%%mm0\n\t" \ | 
| 91 |   "paddw %%mm3,%%mm5\n\t" \ | 
| 92 |   "movq "OC_J(4,_x)",%%mm3\n\t" \ | 
| 93 |   "psubw %%mm1,%%mm5\n\t" \ | 
| 94 |   "paddw %%mm0,%%mm2\n\t" \ | 
| 95 |   "psubw %%mm3,%%mm6\n\t" \ | 
| 96 |   "movq %%mm6,%%mm0\n\t" \ | 
| 97 |   "pmulhw %%mm4,%%mm6\n\t" \ | 
| 98 |   "paddw %%mm3,%%mm3\n\t" \ | 
| 99 |   "paddw %%mm1,%%mm1\n\t" \ | 
| 100 |   "paddw %%mm0,%%mm3\n\t" \ | 
| 101 |   "paddw %%mm5,%%mm1\n\t" \ | 
| 102 |   "pmulhw %%mm3,%%mm4\n\t" \ | 
| 103 |   "paddw %%mm0,%%mm6\n\t" \ | 
| 104 |   "psubw %%mm2,%%mm6\n\t" \ | 
| 105 |   "paddw %%mm2,%%mm2\n\t" \ | 
| 106 |   /*Reload the value spilled to I(1,_y) above.*/ "movq "OC_I(1,_y)",%%mm0\n\t" \ | 
| 107 |   "paddw %%mm6,%%mm2\n\t" \ | 
| 108 |   "paddw %%mm3,%%mm4\n\t" \ | 
| 109 |   "psubw %%mm1,%%mm2\n\t" \ | 
| 110 |   "#end OC_IDCT_BEGIN\n\t" \ | 
| 111 |  | 
| 112 | /*38+8=46 cycles. Row pass: runs OC_IDCT_BEGIN from _x into _y and then finishes the butterflies, with no rounding or shifting. R0 and R2...R7 are left in mm0 and mm2...mm7 and R1 is written to I(1,_y), which is the entry state OC_TRANSPOSE expects.*/ | 
| 113 | #define OC_ROW_IDCT(_y,_x) \ | 
| 114 |   "#OC_ROW_IDCT\n" \ | 
| 115 |   OC_IDCT_BEGIN(_y,_x) \ | 
| 116 |   /*r3=D'*/ \ | 
| 117 |   "movq "OC_I(2,_y)",%%mm3\n\t" \ | 
| 118 |   /*r4=E'=E-G*/ \ | 
| 119 |   "psubw %%mm7,%%mm4\n\t" \ | 
| 120 |   /*r1=H'+H'*/ \ | 
| 121 |   "paddw %%mm1,%%mm1\n\t" \ | 
| 122 |   /*r7=G+G*/ \ | 
| 123 |   "paddw %%mm7,%%mm7\n\t" \ | 
| 124 |   /*r1=R1=A''+H'*/ \ | 
| 125 |   "paddw %%mm2,%%mm1\n\t" \ | 
| 126 |   /*r7=G'=E+G*/ \ | 
| 127 |   "paddw %%mm4,%%mm7\n\t" \ | 
| 128 |   /*r4=R4=E'-D'*/ \ | 
| 129 |   "psubw %%mm3,%%mm4\n\t" \ | 
| 130 |   /*r3=D'+D'*/ "paddw %%mm3,%%mm3\n\t" \ | 
| 131 |   /*r6=R6=F'-B''*/ \ | 
| 132 |   "psubw %%mm5,%%mm6\n\t" \ | 
| 133 |   /*r5=B''+B''*/ "paddw %%mm5,%%mm5\n\t" \ | 
| 134 |   /*r3=R3=E'+D'*/ \ | 
| 135 |   "paddw %%mm4,%%mm3\n\t" \ | 
| 136 |   /*r5=R5=F'+B''*/ \ | 
| 137 |   "paddw %%mm6,%%mm5\n\t" \ | 
| 138 |   /*r7=R7=G'-C'*/ \ | 
| 139 |   "psubw %%mm0,%%mm7\n\t" \ | 
| 140 |   /*r0=C'+C'*/ "paddw %%mm0,%%mm0\n\t" \ | 
| 141 |   /*Save R1 to I(1), where OC_TRANSPOSE picks it up.*/ \ | 
| 142 |   "movq %%mm1,"OC_I(1,_y)"\n\t" \ | 
| 143 |   /*r0=R0=G'+C'*/ \ | 
| 144 |   "paddw %%mm7,%%mm0\n\t" \ | 
| 145 |   "#end OC_ROW_IDCT\n\t" \ | 
| 146 |  | 
| 147 | /*The following macro does two 4x4 transposes in place. | 
| 148 |   At entry, we assume: | 
| 149 |     r0 = a3 a2 a1 a0 | 
| 150 |   I(1) = b3 b2 b1 b0 | 
| 151 |     r2 = c3 c2 c1 c0 | 
| 152 |     r3 = d3 d2 d1 d0 | 
| 153 |  | 
| 154 |     r4 = e3 e2 e1 e0 | 
| 155 |     r5 = f3 f2 f1 f0 | 
| 156 |     r6 = g3 g2 g1 g0 | 
| 157 |     r7 = h3 h2 h1 h0 | 
| 158 |  | 
| 159 |   At exit, we have: | 
| 160 |   I(0) = d0 c0 b0 a0 | 
| 161 |   I(1) = d1 c1 b1 a1 | 
| 162 |   I(2) = d2 c2 b2 a2 | 
| 163 |   I(3) = d3 c3 b3 a3 | 
| 164 |  | 
| 165 |   J(4) = h0 g0 f0 e0 | 
| 166 |   J(5) = h1 g1 f1 e1 | 
| 167 |   J(6) = h2 g2 f2 e2 | 
| 168 |   J(7) = h3 g3 f3 e3 | 
| 169 |  | 
| 170 |   I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. | 
| 171 |   J(4) J(5) J(6) J(7) is the transpose of r4  r5  r6 r7. | 
| 172 |  | 
| 173 |   Since r1 is free at entry, we calculate the Js first.*/ | 
| 174 | /*19 cycles. _y only selects the buffer addressed through OC_I() and OC_J(); there is no separate input buffer. I(0) doubles as a spill slot for r0 while mm0 is used as scratch.*/ | 
| 175 | #define OC_TRANSPOSE(_y) \ | 
| 176 |   "#OC_TRANSPOSE\n\t" \ | 
| 177 |   "movq %%mm4,%%mm1\n\t" \ | 
| 178 |   "punpcklwd %%mm5,%%mm4\n\t" \ | 
| 179 |   /*Park r0 (a3 a2 a1 a0) in I(0) so that mm0 can be used as scratch.*/ "movq %%mm0,"OC_I(0,_y)"\n\t" \ | 
| 180 |   "punpckhwd %%mm5,%%mm1\n\t" \ | 
| 181 |   "movq %%mm6,%%mm0\n\t" \ | 
| 182 |   "punpcklwd %%mm7,%%mm6\n\t" \ | 
| 183 |   "movq %%mm4,%%mm5\n\t" \ | 
| 184 |   "punpckldq %%mm6,%%mm4\n\t" \ | 
| 185 |   "punpckhdq %%mm6,%%mm5\n\t" \ | 
| 186 |   "movq %%mm1,%%mm6\n\t" \ | 
| 187 |   /*J(4)=h0 g0 f0 e0*/ "movq %%mm4,"OC_J(4,_y)"\n\t" \ | 
| 188 |   "punpckhwd %%mm7,%%mm0\n\t" \ | 
| 189 |   /*J(5)=h1 g1 f1 e1*/ "movq %%mm5,"OC_J(5,_y)"\n\t" \ | 
| 190 |   "punpckhdq %%mm0,%%mm6\n\t" \ | 
| 191 |   /*r4=a3 a2 a1 a0, back from I(0).*/ "movq "OC_I(0,_y)",%%mm4\n\t" \ | 
| 192 |   "punpckldq %%mm0,%%mm1\n\t" \ | 
| 193 |   /*r5=b3 b2 b1 b0, from I(1).*/ "movq "OC_I(1,_y)",%%mm5\n\t" \ | 
| 194 |   "movq %%mm4,%%mm0\n\t" \ | 
| 195 |   /*J(7)=h3 g3 f3 e3*/ "movq %%mm6,"OC_J(7,_y)"\n\t" \ | 
| 196 |   "punpcklwd %%mm5,%%mm0\n\t" \ | 
| 197 |   /*J(6)=h2 g2 f2 e2*/ "movq %%mm1,"OC_J(6,_y)"\n\t" \ | 
| 198 |   "punpckhwd %%mm5,%%mm4\n\t" \ | 
| 199 |   "movq %%mm2,%%mm5\n\t" \ | 
| 200 |   "punpcklwd %%mm3,%%mm2\n\t" \ | 
| 201 |   "movq %%mm0,%%mm1\n\t" \ | 
| 202 |   "punpckldq %%mm2,%%mm0\n\t" \ | 
| 203 |   "punpckhdq %%mm2,%%mm1\n\t" \ | 
| 204 |   "movq %%mm4,%%mm2\n\t" \ | 
| 205 |   /*I(0)=d0 c0 b0 a0*/ "movq %%mm0,"OC_I(0,_y)"\n\t" \ | 
| 206 |   "punpckhwd %%mm3,%%mm5\n\t" \ | 
| 207 |   /*I(1)=d1 c1 b1 a1*/ "movq %%mm1,"OC_I(1,_y)"\n\t" \ | 
| 208 |   "punpckhdq %%mm5,%%mm4\n\t" \ | 
| 209 |   "punpckldq %%mm5,%%mm2\n\t" \ | 
| 210 |   /*I(3)=d3 c3 b3 a3*/ "movq %%mm4,"OC_I(3,_y)"\n\t" \ | 
| 211 |   /*I(2)=d2 c2 b2 a2*/ "movq %%mm2,"OC_I(2,_y)"\n\t" \ | 
| 212 |   "#end OC_TRANSPOSE\n\t" \ | 
| 213 |  | 
| 214 | /*38+19=57 cycles. Column pass, in place on _y: runs OC_IDCT_BEGIN(_y,_y), finishes the butterflies, adds the constant at byte offset 0x00 of c (presumably the rounding bias; confirm against the table), shifts every result right arithmetically by 4 and stores NR0...NR3 to I(0)...I(3) and NR4...NR7 to J(4)...J(7). The constant is added once per output pair, to the difference term, before the matching sum term is formed from it.*/ | 
| 215 | #define OC_COLUMN_IDCT(_y) \ | 
| 216 |   "#OC_COLUMN_IDCT\n" \ | 
| 217 |   OC_IDCT_BEGIN(_y,_y) \ | 
| 218 |   /*Add the offset-0x00 constant of c to R2; R1 is formed from this sum below.*/ "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \ | 
| 219 |   /*r1=H'+H'*/ \ | 
| 220 |   "paddw %%mm1,%%mm1\n\t" \ | 
| 221 |   /*r1=R1=A''+H'*/ \ | 
| 222 |   "paddw %%mm2,%%mm1\n\t" \ | 
| 223 |   /*r2=NR2*/ \ | 
| 224 |   "psraw $4,%%mm2\n\t" \ | 
| 225 |   /*r4=E'=E-G*/ \ | 
| 226 |   "psubw %%mm7,%%mm4\n\t" \ | 
| 227 |   /*r1=NR1*/ \ | 
| 228 |   "psraw $4,%%mm1\n\t" \ | 
| 229 |   /*r3=D'*/ \ | 
| 230 |   "movq "OC_I(2,_y)",%%mm3\n\t" \ | 
| 231 |   /*r7=G+G*/ \ | 
| 232 |   "paddw %%mm7,%%mm7\n\t" \ | 
| 233 |   /*Store NR2 at I(2).*/ \ | 
| 234 |   "movq %%mm2,"OC_I(2,_y)"\n\t" \ | 
| 235 |   /*r7=G'=E+G*/ \ | 
| 236 |   "paddw %%mm4,%%mm7\n\t" \ | 
| 237 |   /*Store NR1 at I(1).*/ \ | 
| 238 |   "movq %%mm1,"OC_I(1,_y)"\n\t" \ | 
| 239 |   /*r4=R4=E'-D'*/ \ | 
| 240 |   "psubw %%mm3,%%mm4\n\t" \ | 
| 241 |   /*Add the offset-0x00 constant of c to R4; R3 is formed from this sum below.*/ "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \ | 
| 242 |   /*r3=D'+D'*/ \ | 
| 243 |   "paddw %%mm3,%%mm3\n\t" \ | 
| 244 |   /*r3=R3=E'+D'*/ \ | 
| 245 |   "paddw %%mm4,%%mm3\n\t" \ | 
| 246 |   /*r4=NR4*/ \ | 
| 247 |   "psraw $4,%%mm4\n\t" \ | 
| 248 |   /*r6=R6=F'-B''*/ \ | 
| 249 |   "psubw %%mm5,%%mm6\n\t" \ | 
| 250 |   /*r3=NR3*/ \ | 
| 251 |   "psraw $4,%%mm3\n\t" \ | 
| 252 |   /*Add the offset-0x00 constant of c to R6; R5 is formed from this sum below.*/ "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \ | 
| 253 |   /*r5=B''+B''*/ \ | 
| 254 |   "paddw %%mm5,%%mm5\n\t" \ | 
| 255 |   /*r5=R5=F'+B''*/ \ | 
| 256 |   "paddw %%mm6,%%mm5\n\t" \ | 
| 257 |   /*r6=NR6*/ \ | 
| 258 |   "psraw $4,%%mm6\n\t" \ | 
| 259 |   /*Store NR4 at J(4).*/ \ | 
| 260 |   "movq %%mm4,"OC_J(4,_y)"\n\t" \ | 
| 261 |   /*r5=NR5*/ \ | 
| 262 |   "psraw $4,%%mm5\n\t" \ | 
| 263 |   /*Store NR3 at I(3).*/ \ | 
| 264 |   "movq %%mm3,"OC_I(3,_y)"\n\t" \ | 
| 265 |   /*r7=R7=G'-C'*/ \ | 
| 266 |   "psubw %%mm0,%%mm7\n\t" \ | 
| 267 |   /*Add the offset-0x00 constant of c to R7; R0 is formed from this sum below.*/ "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \ | 
| 268 |   /*r0=C'+C'*/ \ | 
| 269 |   "paddw %%mm0,%%mm0\n\t" \ | 
| 270 |   /*r0=R0=G'+C'*/ \ | 
| 271 |   "paddw %%mm7,%%mm0\n\t" \ | 
| 272 |   /*r7=NR7*/ \ | 
| 273 |   "psraw $4,%%mm7\n\t" \ | 
| 274 |   /*Store NR6 at J(6).*/ \ | 
| 275 |   "movq %%mm6,"OC_J(6,_y)"\n\t" \ | 
| 276 |   /*r0=NR0*/ \ | 
| 277 |   "psraw $4,%%mm0\n\t" \ | 
| 278 |   /*Store NR5 at J(5).*/ \ | 
| 279 |   "movq %%mm5,"OC_J(5,_y)"\n\t" \ | 
| 280 |   /*Store NR7 at J(7).*/ \ | 
| 281 |   "movq %%mm7,"OC_J(7,_y)"\n\t" \ | 
| 282 |   /*Store NR0 at I(0).*/ \ | 
| 283 |   "movq %%mm0,"OC_I(0,_y)"\n\t" \ | 
| 284 |   "#end OC_COLUMN_IDCT\n\t" \ | 
| 285 |  | 
| 286 | /*Full 8x8 iDCT: two row passes (each followed by a transpose) from _x into _y, two column passes in place on _y, and finally all 64 entries of _x are set to zero. Selected by oc_idct8x8_mmx when _last_zzi is greater than 10.*/ static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){ | 
| 287 |   /*Index of the 16-entry chunk of _x being cleared.*/ int i; | 
| 288 |   /*This routine accepts an 8x8 matrix, but in partially transposed form. | 
| 289 |     Every 4x4 block is transposed. OC_I() and OC_J() are redefined before each pass below so that the same macros address a different part of the buffers.*/ | 
| 290 |   __asm__ __volatile__( | 
| 291 | #define OC_I(_k,_y)   OC_MEM_OFFS((_k)*16,_y) | 
| 292 | #define OC_J(_k,_y)   OC_MEM_OFFS(((_k)-4)*16+8,_y) | 
| 293 |     /*Row pass over bytes 0...63 of x, written to the same offsets of y.*/ OC_ROW_IDCT(y,x) | 
| 294 |     OC_TRANSPOSE(y) | 
| 295 | #undef  OC_I | 
| 296 | #undef  OC_J | 
| 297 | #define OC_I(_k,_y)   OC_MEM_OFFS((_k)*16+64,_y) | 
| 298 | #define OC_J(_k,_y)   OC_MEM_OFFS(((_k)-4)*16+72,_y) | 
| 299 |     /*Row pass over bytes 64...127 of x, written to the same offsets of y.*/ OC_ROW_IDCT(y,x) | 
| 300 |     OC_TRANSPOSE(y) | 
| 301 | #undef  OC_I | 
| 302 | #undef  OC_J | 
| 303 | #define OC_I(_k,_y)   OC_MEM_OFFS((_k)*16,_y) | 
| 304 | #define OC_J(_k,_y)   OC_I(_k,_y) | 
| 305 |     /*Column pass over the first 8 bytes of each 16-byte row of y.*/ OC_COLUMN_IDCT(y) | 
| 306 | #undef  OC_I | 
| 307 | #undef  OC_J | 
| 308 | #define OC_I(_k,_y)   OC_MEM_OFFS((_k)*16+8,_y) | 
| 309 | #define OC_J(_k,_y)   OC_I(_k,_y) | 
| 310 |     /*Column pass over the second 8 bytes of each 16-byte row of y.*/ OC_COLUMN_IDCT(y) | 
| 311 | #undef  OC_I | 
| 312 | #undef  OC_J | 
| 313 |     :[y]"=m" OC_ARRAY_OPERAND(ogg_int16_t,_y,64) | 
| 314 |     :[x]"m" OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64), | 
| 315 |      [c]"m" OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128) | 
| 316 |   ); | 
| 317 |   /*Zero mm0 for the stores in the loop. NOTE(review): the loop depends on mm0 still being zero across separate asm statements, which nothing here expresses to the compiler; confirm this holds for the toolchains in use.*/ __asm__ __volatile__("pxor %%mm0,%%mm0\n\t" ::); | 
| 318 |   /*Each iteration clears 32 bytes (16 entries) of _x, so the four iterations clear all 64 entries.*/ for(i=0;i<4;i++){ | 
| 319 |     __asm__ __volatile__( | 
| 320 |       "movq %%mm0," OC_MEM_OFFS(0x00,x)"\n\t"  | 
| 321 |       "movq %%mm0," OC_MEM_OFFS(0x08,x)"\n\t"  | 
| 322 |       "movq %%mm0," OC_MEM_OFFS(0x10,x)"\n\t"  | 
| 323 |       "movq %%mm0," OC_MEM_OFFS(0x18,x)"\n\t"  | 
| 324 |       :[x]"=m" OC_ARRAY_OPERAND(ogg_int16_t,_x+16*i,16) | 
| 325 |     ); | 
| 326 |   } | 
| 327 | } | 
| 328 |  | 
| 329 | /*25 cycles. Reduced form of OC_IDCT_BEGIN that loads only I(0)...I(3) of _x and never reads a J() input (presumably because those inputs are known to be zero on this path; confirm with the caller). I(1,_y) and I(2,_y) serve as spill slots, and OC_ROW_IDCT_10 and OC_COLUMN_IDCT_10 continue from here with the same instruction sequence their full-size counterparts use after OC_IDCT_BEGIN. I(1,_x) and I(2,_x) are loaded before I(1,_y) and I(2,_y) are written, so _x and _y may name the same buffer.*/ | 
| 330 | #define OC_IDCT_BEGIN_10(_y,_x) \ | 
| 331 |  "#OC_IDCT_BEGIN_10\n\t" \ | 
| 332 |  "movq "OC_I(3,_x)",%%mm2\n\t" \ | 
| 333 |  /*No data effect; presumably scheduling padding.*/ "nop\n\t" \ | 
| 334 |  "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \ | 
| 335 |  "movq %%mm2,%%mm4\n\t" \ | 
| 336 |  "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \ | 
| 337 |  "pmulhw %%mm6,%%mm4\n\t" \ | 
| 338 |  "movq "OC_I(1,_x)",%%mm3\n\t" \ | 
| 339 |  "pmulhw %%mm2,%%mm1\n\t" \ | 
| 340 |  "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \ | 
| 341 |  "paddw %%mm2,%%mm4\n\t" \ | 
| 342 |  "pxor %%mm6,%%mm6\n\t" \ | 
| 343 |  "paddw %%mm1,%%mm2\n\t" \ | 
| 344 |  "movq "OC_I(2,_x)",%%mm5\n\t" \ | 
| 345 |  "pmulhw %%mm3,%%mm0\n\t" \ | 
| 346 |  "movq %%mm5,%%mm1\n\t" \ | 
| 347 |  "paddw %%mm3,%%mm0\n\t" \ | 
| 348 |  "pmulhw "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \ | 
| 349 |  "psubw %%mm2,%%mm6\n\t" \ | 
| 350 |  "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \ | 
| 351 |  "psubw %%mm4,%%mm0\n\t" \ | 
| 352 |  "movq "OC_I(2,_x)",%%mm7\n\t" \ | 
| 353 |  "paddw %%mm4,%%mm4\n\t" \ | 
| 354 |  "paddw %%mm5,%%mm7\n\t" \ | 
| 355 |  "paddw %%mm0,%%mm4\n\t" \ | 
| 356 |  "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \ | 
| 357 |  "psubw %%mm6,%%mm3\n\t" \ | 
| 358 |  /*Spill mm4 to I(1,_y); it is reloaded into mm0 at the end of this macro.*/ "movq %%mm4,"OC_I(1,_y)"\n\t" \ | 
| 359 |  "paddw %%mm6,%%mm6\n\t" \ | 
| 360 |  "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \ | 
| 361 |  "paddw %%mm3,%%mm6\n\t" \ | 
| 362 |  "movq %%mm3,%%mm5\n\t" \ | 
| 363 |  "pmulhw %%mm4,%%mm3\n\t" \ | 
| 364 |  /*Spill mm6 to I(2,_y); the macros that follow reload it into mm3.*/ "movq %%mm6,"OC_I(2,_y)"\n\t" \ | 
| 365 |  "movq %%mm0,%%mm2\n\t" \ | 
| 366 |  "movq "OC_I(0,_x)",%%mm6\n\t" \ | 
| 367 |  "pmulhw %%mm4,%%mm0\n\t" \ | 
| 368 |  "paddw %%mm3,%%mm5\n\t" \ | 
| 369 |  "paddw %%mm0,%%mm2\n\t" \ | 
| 370 |  "psubw %%mm1,%%mm5\n\t" \ | 
| 371 |  "pmulhw %%mm4,%%mm6\n\t" \ | 
| 372 |  "paddw "OC_I(0,_x)",%%mm6\n\t" \ | 
| 373 |  "paddw %%mm1,%%mm1\n\t" \ | 
| 374 |  "movq %%mm6,%%mm4\n\t" \ | 
| 375 |  "paddw %%mm5,%%mm1\n\t" \ | 
| 376 |  "psubw %%mm2,%%mm6\n\t" \ | 
| 377 |  "paddw %%mm2,%%mm2\n\t" \ | 
| 378 |  /*Reload the value spilled to I(1,_y) above.*/ "movq "OC_I(1,_y)",%%mm0\n\t" \ | 
| 379 |  "paddw %%mm6,%%mm2\n\t" \ | 
| 380 |  "psubw %%mm1,%%mm2\n\t" \ | 
| 381 |  /*No data effect; presumably scheduling padding.*/ "nop\n\t" \ | 
| 382 |  "#end OC_IDCT_BEGIN_10\n\t" \ | 
| 383 |  | 
| 384 | /*25+8=33 cycles. Row pass built on OC_IDCT_BEGIN_10: finishes the butterflies with no rounding or shifting, leaving R0 and R2...R7 in mm0 and mm2...mm7 and R1 at I(1,_y), which is the entry state OC_TRANSPOSE expects.*/ | 
| 385 | #define OC_ROW_IDCT_10(_y,_x) \ | 
| 386 |  "#OC_ROW_IDCT_10\n\t" \ | 
| 387 |  OC_IDCT_BEGIN_10(_y,_x) \ | 
| 388 |  /*r3=D'*/ \ | 
| 389 |  "movq "OC_I(2,_y)",%%mm3\n\t" \ | 
| 390 |  /*r4=E'=E-G*/ \ | 
| 391 |  "psubw %%mm7,%%mm4\n\t" \ | 
| 392 |  /*r1=H'+H'*/ \ | 
| 393 |  "paddw %%mm1,%%mm1\n\t" \ | 
| 394 |  /*r7=G+G*/ \ | 
| 395 |  "paddw %%mm7,%%mm7\n\t" \ | 
| 396 |  /*r1=R1=A''+H'*/ \ | 
| 397 |  "paddw %%mm2,%%mm1\n\t" \ | 
| 398 |  /*r7=G'=E+G*/ \ | 
| 399 |  "paddw %%mm4,%%mm7\n\t" \ | 
| 400 |  /*r4=R4=E'-D'*/ \ | 
| 401 |  "psubw %%mm3,%%mm4\n\t" \ | 
| 402 |  /*r3=D'+D'*/ "paddw %%mm3,%%mm3\n\t" \ | 
| 403 |  /*r6=R6=F'-B''*/ \ | 
| 404 |  "psubw %%mm5,%%mm6\n\t" \ | 
| 405 |  /*r5=B''+B''*/ "paddw %%mm5,%%mm5\n\t" \ | 
| 406 |  /*r3=R3=E'+D'*/ \ | 
| 407 |  "paddw %%mm4,%%mm3\n\t" \ | 
| 408 |  /*r5=R5=F'+B''*/ \ | 
| 409 |  "paddw %%mm6,%%mm5\n\t" \ | 
| 410 |  /*r7=R7=G'-C'*/ \ | 
| 411 |  "psubw %%mm0,%%mm7\n\t" \ | 
| 412 |  /*r0=C'+C'*/ "paddw %%mm0,%%mm0\n\t" \ | 
| 413 |  /*Save R1 to I(1), where OC_TRANSPOSE picks it up.*/ \ | 
| 414 |  "movq %%mm1,"OC_I(1,_y)"\n\t" \ | 
| 415 |  /*r0=R0=G'+C'*/ \ | 
| 416 |  "paddw %%mm7,%%mm0\n\t" \ | 
| 417 |  "#end OC_ROW_IDCT_10\n\t" \ | 
| 418 |  | 
| 419 | /*25+19=44 cycles. Column pass built on OC_IDCT_BEGIN_10, in place on _y: finishes the butterflies, adds the constant at byte offset 0x00 of c (presumably the rounding bias; confirm against the table), shifts every result right arithmetically by 4 and stores NR0...NR3 to I(0)...I(3) and NR4...NR7 to J(4)...J(7). The constant is added once per output pair, to the difference term, before the matching sum term is formed from it.*/ | 
| 420 | #define OC_COLUMN_IDCT_10(_y) \ | 
| 421 |  "#OC_COLUMN_IDCT_10\n\t" \ | 
| 422 |  OC_IDCT_BEGIN_10(_y,_y) \ | 
| 423 |  /*Add the offset-0x00 constant of c to R2; R1 is formed from this sum below.*/ "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \ | 
| 424 |  /*r1=H'+H'*/ \ | 
| 425 |  "paddw %%mm1,%%mm1\n\t" \ | 
| 426 |  /*r1=R1=A''+H'*/ \ | 
| 427 |  "paddw %%mm2,%%mm1\n\t" \ | 
| 428 |  /*r2=NR2*/ \ | 
| 429 |  "psraw $4,%%mm2\n\t" \ | 
| 430 |  /*r4=E'=E-G*/ \ | 
| 431 |  "psubw %%mm7,%%mm4\n\t" \ | 
| 432 |  /*r1=NR1*/ \ | 
| 433 |  "psraw $4,%%mm1\n\t" \ | 
| 434 |  /*r3=D'*/ \ | 
| 435 |  "movq "OC_I(2,_y)",%%mm3\n\t" \ | 
| 436 |  /*r7=G+G*/ \ | 
| 437 |  "paddw %%mm7,%%mm7\n\t" \ | 
| 438 |  /*Store NR2 at I(2).*/ \ | 
| 439 |  "movq %%mm2,"OC_I(2,_y)"\n\t" \ | 
| 440 |  /*r7=G'=E+G*/ \ | 
| 441 |  "paddw %%mm4,%%mm7\n\t" \ | 
| 442 |  /*Store NR1 at I(1).*/ \ | 
| 443 |  "movq %%mm1,"OC_I(1,_y)"\n\t" \ | 
| 444 |  /*r4=R4=E'-D'*/ \ | 
| 445 |  "psubw %%mm3,%%mm4\n\t" \ | 
| 446 |  /*Add the offset-0x00 constant of c to R4; R3 is formed from this sum below.*/ "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \ | 
| 447 |  /*r3=D'+D'*/ \ | 
| 448 |  "paddw %%mm3,%%mm3\n\t" \ | 
| 449 |  /*r3=R3=E'+D'*/ \ | 
| 450 |  "paddw %%mm4,%%mm3\n\t" \ | 
| 451 |  /*r4=NR4*/ \ | 
| 452 |  "psraw $4,%%mm4\n\t" \ | 
| 453 |  /*r6=R6=F'-B''*/ \ | 
| 454 |  "psubw %%mm5,%%mm6\n\t" \ | 
| 455 |  /*r3=NR3*/ \ | 
| 456 |  "psraw $4,%%mm3\n\t" \ | 
| 457 |  /*Add the offset-0x00 constant of c to R6; R5 is formed from this sum below.*/ "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \ | 
| 458 |  /*r5=B''+B''*/ \ | 
| 459 |  "paddw %%mm5,%%mm5\n\t" \ | 
| 460 |  /*r5=R5=F'+B''*/ \ | 
| 461 |  "paddw %%mm6,%%mm5\n\t" \ | 
| 462 |  /*r6=NR6*/ \ | 
| 463 |  "psraw $4,%%mm6\n\t" \ | 
| 464 |  /*Store NR4 at J(4).*/ \ | 
| 465 |  "movq %%mm4,"OC_J(4,_y)"\n\t" \ | 
| 466 |  /*r5=NR5*/ \ | 
| 467 |  "psraw $4,%%mm5\n\t" \ | 
| 468 |  /*Store NR3 at I(3).*/ \ | 
| 469 |  "movq %%mm3,"OC_I(3,_y)"\n\t" \ | 
| 470 |  /*r7=R7=G'-C'*/ \ | 
| 471 |  "psubw %%mm0,%%mm7\n\t" \ | 
| 472 |  /*Add the offset-0x00 constant of c to R7; R0 is formed from this sum below.*/ "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \ | 
| 473 |  /*r0=C'+C'*/ \ | 
| 474 |  "paddw %%mm0,%%mm0\n\t" \ | 
| 475 |  /*r0=R0=G'+C'*/ \ | 
| 476 |  "paddw %%mm7,%%mm0\n\t" \ | 
| 477 |  /*r7=NR7*/ \ | 
| 478 |  "psraw $4,%%mm7\n\t" \ | 
| 479 |  /*Store NR6 at J(6).*/ \ | 
| 480 |  "movq %%mm6,"OC_J(6,_y)"\n\t" \ | 
| 481 |  /*r0=NR0*/ \ | 
| 482 |  "psraw $4,%%mm0\n\t" \ | 
| 483 |  /*Store NR5 at J(5).*/ \ | 
| 484 |  "movq %%mm5,"OC_J(5,_y)"\n\t" \ | 
| 485 |  /*Store NR7 at J(7).*/ \ | 
| 486 |  "movq %%mm7,"OC_J(7,_y)"\n\t" \ | 
| 487 |  /*Store NR0 at I(0).*/ \ | 
| 488 |  "movq %%mm0,"OC_I(0,_y)"\n\t" \ | 
| 489 |  "#end OC_COLUMN_IDCT_10\n\t" \ | 
| 490 |  | 
| 491 | /*Reduced iDCT, selected by oc_idct8x8_mmx when _last_zzi is at most 10: one row pass and transpose from _x into _y, two column passes in place on _y, then part of _x is set to zero.*/ static void oc_idct8x8_10_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){ | 
| 492 |   __asm__ __volatile__( | 
| 493 | #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) | 
| 494 | #define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y) | 
| 495 |     /*Done with dequant, descramble, and partial transpose. | 
| 496 |       Now do the iDCT itself. With these offsets the single row pass reads I(0)...I(3) at bytes 0, 16, 32 and 48 of x, and the transpose writes bytes 0...63 of y.*/ | 
| 497 |     OC_ROW_IDCT_10(y,x) | 
| 498 |     OC_TRANSPOSE(y) | 
| 499 | #undef  OC_I | 
| 500 | #undef  OC_J | 
| 501 | #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) | 
| 502 | #define OC_J(_k,_y) OC_I(_k,_y) | 
| 503 |     /*Column pass over the first 8 bytes of each 16-byte row of y.*/ OC_COLUMN_IDCT_10(y) | 
| 504 | #undef  OC_I | 
| 505 | #undef  OC_J | 
| 506 | #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y) | 
| 507 | #define OC_J(_k,_y) OC_I(_k,_y) | 
| 508 |     /*Column pass over the second 8 bytes of each 16-byte row of y.*/ OC_COLUMN_IDCT_10(y) | 
| 509 | #undef  OC_I | 
| 510 | #undef  OC_J | 
| 511 |     :[y]"=m" OC_ARRAY_OPERAND(ogg_int16_t,_y,64) | 
| 512 |     :[x]"m" OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64), | 
| 513 |      [c]"m" OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128) | 
| 514 |   ); | 
| 515 |   /*Clear the 8 bytes at offsets 0x00, 0x10, 0x20 and 0x30 of _x, i.e. the first four entries of each of its first four 16-byte rows; the rest of _x is not written. The operand is declared as 28 entries (56 bytes) so that it spans all four stores.*/ __asm__ __volatile__( | 
| 516 |     "pxor %%mm0,%%mm0\n\t"  | 
| 517 |     "movq %%mm0," OC_MEM_OFFS(0x00,x)"\n\t"  | 
| 518 |     "movq %%mm0," OC_MEM_OFFS(0x10,x)"\n\t"  | 
| 519 |     "movq %%mm0," OC_MEM_OFFS(0x20,x)"\n\t"  | 
| 520 |     "movq %%mm0," OC_MEM_OFFS(0x30,x)"\n\t"  | 
| 521 |     :[x]"+m" OC_ARRAY_OPERAND(ogg_int16_t,_x,28) | 
| 522 |   ); | 
| 523 | } | 
| 524 |  | 
| 525 | /*Inverse 8x8 Type-II DCT, MMX version. | 
| 526 |   The coefficients in _x are expected to carry a scale factor of 4 relative to | 
| 527 |    the orthonormal form of the transform.*/ | 
| 528 | void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){ | 
| 529 |   /*_last_zzi is not quite the number of coefficients decoded for this block: | 
| 530 |      it is the value zzi had BEFORE the block's final token was decoded. | 
| 531 |     Usually that token is an EOB (a continuing EOB run from an earlier block | 
| 532 |      counts too), in which case the two numbers agree. | 
| 533 |     They differ when the final token is NOT an EOB but fills the block to | 
| 534 |      exactly 64 coefficients, leaving _last_zzi below 64. | 
| 535 |     If that token was not a pure zero run, _last_zzi is at least 46, which | 
| 536 |      does not change the choice made below. | 
| 537 |     If it WAS a pure zero run of length 63, _last_zzi is 1 even though 64 | 
| 538 |      coefficients were decoded, so the reduced transform is selected where a | 
| 539 |      true coefficient count would not select it. | 
| 540 |     A zero run of length 64 gives _last_zzi the value 0; the DC coefficient is | 
| 541 |      still processed, since DC prediction may have made it non-zero. | 
| 542 |     Convoluted as it is, this is arguably the right behavior: a block that | 
| 543 |      ends in a long zero run, rather than an ordinary EOB token, still gets | 
| 544 |      the smaller transform. | 
| 545 |     It could be smarter: several separate zero runs at the end of a block | 
| 546 |      will fool it, but an encoder that emits those really deserves what it | 
| 547 |      gets. | 
| 548 |     Needless to say, this approach was inherited from VP3.*/ | 
| 549 |   /*Pick the transform: more than 10 means the general 8x8 routine, anything | 
| 550 |      else the reduced one.*/ | 
| 551 |   if(_last_zzi>10){ | 
| 552 |     oc_idct8x8_slow_mmx(_y,_x); | 
| 553 |     return; | 
| 554 |   } | 
| 555 |   oc_idct8x8_10_mmx(_y,_x); | 
| 556 | } | 
| 557 |  | 
| 558 | #endif | 
| 559 |  |