| 1 | /******************************************************************** |
| 2 | * * |
| 3 | * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * |
| 4 | * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * |
| 5 | * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * |
| 6 | * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * |
| 7 | * * |
| 8 | * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * |
| 9 | * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * |
| 10 | * * |
| 11 | ******************************************************************** |
| 12 | |
| 13 | function: |
| 14 | last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $ |
| 15 | |
| 16 | ********************************************************************/ |
| 17 | |
| 18 | /*SSE2 acceleration of Theora's iDCT.*/ |
| 19 | #include "x86int.h" |
| 20 | #include "sse2trans.h" |
| 21 | #include "../dct.h" |
| 22 | |
| 23 | #if defined(OC_X86_ASM) |
| 24 | |
| 25 | /*A table of constants used by the MMX routines.*/ |
| 26 | const unsigned short __attribute__((aligned(16),used)) OC_IDCT_CONSTS[64]={ |
| 27 | 8, 8, 8, 8, 8, 8, 8, 8, |
| 28 | OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7, |
| 29 | OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6, |
| 30 | OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5, |
| 31 | OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4, |
| 32 | OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3, |
| 33 | OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2, |
| 34 | OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1 |
| 35 | }; |
| 36 | |
| 37 | |
/*Performs the first three stages of the iDCT on eight columns at once.
  This expands to a sequence of string literals meant to be pasted into an
   extended asm statement.
  That statement must provide memory operands named [c] (the constants
   table, one 16-byte row per constant) and [buf] (at least 32 bytes of
   16-byte aligned scratch space), in addition to the operand named by _x.
  _x: The name of the asm memory operand holding the input rows.
  On input, xmm2, xmm6, xmm3, and xmm5 must contain rows 2, 6, 3, and 5 of
   the input (accessed in that order).
  Rows 7, 1, 4, and 0 are loaded from _x at byte offsets 0x70, 0x10, 0x40,
   and 0x00.
  On output, xmm7 down to xmm4 contain the intermediate terms t[0] through
   t[3], and xmm0 up to xmm3 contain t[7] down to t[4], which is the pairing
   the stage 4 butterflies consume.
  All eight xmm registers and both 16-byte slots of buf are overwritten.
  Several pmulhw products are followed by a paddw of the multiplicand;
   presumably this corrects for constants that are negative when read as
   signed 16-bit values (confirm against the definitions in ../dct.h).*/
#define OC_IDCT_8x8_ABC(_x) \
  "#OC_IDCT_8x8_ABC\n\t" \
  /*Stage 1:*/ \
  /*2-3 rotation by 6pi/16. \
    xmm4=xmm7=C6, xmm0=xmm1=C2, xmm2=X2, xmm6=X6.*/ \
  "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm1\n\t" \
  "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm4\n\t" \
  "movdqa %%xmm1,%%xmm0\n\t" \
  "pmulhw %%xmm2,%%xmm1\n\t" \
  "movdqa %%xmm4,%%xmm7\n\t" \
  "pmulhw %%xmm6,%%xmm0\n\t" \
  "pmulhw %%xmm2,%%xmm7\n\t" \
  "pmulhw %%xmm6,%%xmm4\n\t" \
  "paddw %%xmm6,%%xmm0\n\t" \
  "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm6\n\t" \
  "paddw %%xmm1,%%xmm2\n\t" \
  "psubw %%xmm0,%%xmm7\n\t" \
  /*Spill t[2] to buf; it is reloaded at the end of stage 2.*/ \
  "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
  "paddw %%xmm4,%%xmm2\n\t" \
  "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm4\n\t" \
  /*Spill t[3] to buf; it is reloaded at the end of stage 2.*/ \
  "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
  /*5-6 rotation by 3pi/16. \
    xmm4=xmm2=C5, xmm1=xmm6=C3, xmm3=X3, xmm5=X5.*/ \
  "movdqa %%xmm4,%%xmm2\n\t" \
  "movdqa %%xmm6,%%xmm1\n\t" \
  "pmulhw %%xmm3,%%xmm4\n\t" \
  "pmulhw %%xmm5,%%xmm1\n\t" \
  "pmulhw %%xmm3,%%xmm6\n\t" \
  "pmulhw %%xmm5,%%xmm2\n\t" \
  "paddw %%xmm3,%%xmm4\n\t" \
  "paddw %%xmm5,%%xmm3\n\t" \
  "paddw %%xmm6,%%xmm3\n\t" \
  /*Rows 7 and 1 are fetched early, interleaved with the end of this \
     rotation.*/ \
  "movdqa "OC_MEM_OFFS(0x70,_x)",%%xmm6\n\t" \
  "paddw %%xmm5,%%xmm1\n\t" \
  "movdqa "OC_MEM_OFFS(0x10,_x)",%%xmm5\n\t" \
  "paddw %%xmm3,%%xmm2\n\t" \
  "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
  "psubw %%xmm4,%%xmm1\n\t" \
  "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm4\n\t" \
  /*4-7 rotation by 7pi/16. \
    xmm4=xmm7=C1, xmm3=xmm0=C7, xmm5=X1, xmm6=X7.*/ \
  "movdqa %%xmm3,%%xmm0\n\t" \
  "movdqa %%xmm4,%%xmm7\n\t" \
  "pmulhw %%xmm5,%%xmm3\n\t" \
  "pmulhw %%xmm5,%%xmm7\n\t" \
  "pmulhw %%xmm6,%%xmm4\n\t" \
  "pmulhw %%xmm6,%%xmm0\n\t" \
  "paddw %%xmm6,%%xmm4\n\t" \
  "movdqa "OC_MEM_OFFS(0x40,_x)",%%xmm6\n\t" \
  "paddw %%xmm5,%%xmm7\n\t" \
  "psubw %%xmm4,%%xmm3\n\t" \
  "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
  "paddw %%xmm7,%%xmm0\n\t" \
  "movdqa "OC_MEM_OFFS(0x00,_x)",%%xmm7\n\t" \
  /*0-1 butterfly. \
    xmm4=xmm5=C4, xmm7=X0, xmm6=X4. \
    X0-X4 is formed as 2*X0-(X0+X4), so no extra register is needed.*/ \
  "paddw %%xmm7,%%xmm6\n\t" \
  "movdqa %%xmm4,%%xmm5\n\t" \
  "pmulhw %%xmm6,%%xmm4\n\t" \
  "paddw %%xmm7,%%xmm7\n\t" \
  "psubw %%xmm6,%%xmm7\n\t" \
  "paddw %%xmm6,%%xmm4\n\t" \
  /*Stage 2:*/ \
  /*4-5 butterfly: xmm3=t[4], xmm1=t[5] \
    7-6 butterfly: xmm2=t[6], xmm0=t[7]*/ \
  "movdqa %%xmm3,%%xmm6\n\t" \
  "paddw %%xmm1,%%xmm3\n\t" \
  "psubw %%xmm1,%%xmm6\n\t" \
  "movdqa %%xmm5,%%xmm1\n\t" \
  "pmulhw %%xmm7,%%xmm5\n\t" \
  "paddw %%xmm7,%%xmm5\n\t" \
  "movdqa %%xmm0,%%xmm7\n\t" \
  "paddw %%xmm2,%%xmm0\n\t" \
  "psubw %%xmm2,%%xmm7\n\t" \
  "movdqa %%xmm1,%%xmm2\n\t" \
  "pmulhw %%xmm6,%%xmm1\n\t" \
  "pmulhw %%xmm7,%%xmm2\n\t" \
  "paddw %%xmm6,%%xmm1\n\t" \
  /*Reload the t[2] and t[3] values spilled during stage 1.*/ \
  "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
  "paddw %%xmm7,%%xmm2\n\t" \
  "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
  /*Stage 3: \
    6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
    0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
    1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2] \
    Each difference a-b is formed as 2*a-(a+b) from the sum just computed.*/ \
  "paddw %%xmm2,%%xmm1\n\t" \
  "paddw %%xmm5,%%xmm6\n\t" \
  "paddw %%xmm4,%%xmm7\n\t" \
  "paddw %%xmm2,%%xmm2\n\t" \
  "paddw %%xmm4,%%xmm4\n\t" \
  "paddw %%xmm5,%%xmm5\n\t" \
  "psubw %%xmm1,%%xmm2\n\t" \
  "psubw %%xmm7,%%xmm4\n\t" \
  "psubw %%xmm6,%%xmm5\n\t" \
| 138 | |
/*Performs the last stage of the iDCT, leaving the results in registers.
  No rounding, shifting, or memory access is done here.
  On input, xmm7 down to xmm4 contain t[0] through t[3], and xmm0 up to xmm3
   contain t[7] down to t[4].
  On output, xmm0 through xmm7 contain the corresponding rows: xmm0 through
   xmm3 hold the sums and xmm4 through xmm7 hold the differences listed
   below.*/
#define OC_IDCT_8x8_D \
  "#OC_IDCT_8x8_D\n\t" \
  /*Stage 4: \
    0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
    1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
    2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
    3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4] \
    Each sum a+b is formed as 2*b+(a-b) from the difference computed first, \
     so no scratch register is needed.*/ \
  "psubw %%xmm0,%%xmm7\n\t" \
  "psubw %%xmm1,%%xmm6\n\t" \
  "psubw %%xmm2,%%xmm5\n\t" \
  "psubw %%xmm3,%%xmm4\n\t" \
  "paddw %%xmm0,%%xmm0\n\t" \
  "paddw %%xmm1,%%xmm1\n\t" \
  "paddw %%xmm2,%%xmm2\n\t" \
  "paddw %%xmm3,%%xmm3\n\t" \
  "paddw %%xmm7,%%xmm0\n\t" \
  "paddw %%xmm6,%%xmm1\n\t" \
  "paddw %%xmm5,%%xmm2\n\t" \
  "paddw %%xmm4,%%xmm3\n\t" \
| 162 | |
/*Performs the last stage of the iDCT, then rounds, scales, and stores the
   result.
  In addition to the stage 4 butterflies, this adds the constant 8 (row 0 of
   the [c] table) to every output, shifts each right arithmetically by 4
   bits, and writes the eight rows to the [y] operand at byte offsets 0x00
   through 0x70.
  The enclosing asm statement must provide memory operands named [c] and [y].
  The row at offset 0x40 of y is used as a temporary spill slot before it
   receives its final value.
  On input, xmm7 down to xmm4 contain t[0] through t[3], and xmm0 up to xmm3
   contain t[7] down to t[4].
  On output, xmm0 through xmm7 contain the corresponding rows, as stored to
   y.*/
#define OC_IDCT_8x8_D_STORE \
  "#OC_IDCT_8x8_D_STORE\n\t" \
  /*Stage 4: \
    0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
    1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
    2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
    3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
  "psubw %%xmm3,%%xmm4\n\t" \
  /*Park t[3]-t[4] in y so xmm4 can hold the rounding constant.*/ \
  "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
  "movdqa "OC_MEM_OFFS(0x00,c)",%%xmm4\n\t" \
  "psubw %%xmm0,%%xmm7\n\t" \
  "psubw %%xmm1,%%xmm6\n\t" \
  "psubw %%xmm2,%%xmm5\n\t" \
  /*Add the rounding constant to the differences. \
    The sums pick it up when the differences are added back in below.*/ \
  "paddw %%xmm4,%%xmm7\n\t" \
  "paddw %%xmm4,%%xmm6\n\t" \
  "paddw %%xmm4,%%xmm5\n\t" \
  /*xmm4 becomes the parked t[3]-t[4] plus the rounding constant.*/ \
  "paddw "OC_MEM_OFFS(0x40,y)",%%xmm4\n\t" \
  "paddw %%xmm0,%%xmm0\n\t" \
  "paddw %%xmm1,%%xmm1\n\t" \
  "paddw %%xmm2,%%xmm2\n\t" \
  "paddw %%xmm3,%%xmm3\n\t" \
  "paddw %%xmm7,%%xmm0\n\t" \
  "paddw %%xmm6,%%xmm1\n\t" \
  /*Shift each row right by 4 and store it; the shifts and stores are \
     interleaved with the remaining additions.*/ \
  "psraw $4,%%xmm0\n\t" \
  "paddw %%xmm5,%%xmm2\n\t" \
  "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" \
  "psraw $4,%%xmm1\n\t" \
  "paddw %%xmm4,%%xmm3\n\t" \
  "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" \
  "psraw $4,%%xmm2\n\t" \
  "movdqa %%xmm2,"OC_MEM_OFFS(0x20,y)"\n\t" \
  "psraw $4,%%xmm3\n\t" \
  "movdqa %%xmm3,"OC_MEM_OFFS(0x30,y)"\n\t" \
  "psraw $4,%%xmm4\n\t" \
  "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
  "psraw $4,%%xmm5\n\t" \
  "movdqa %%xmm5,"OC_MEM_OFFS(0x50,y)"\n\t" \
  "psraw $4,%%xmm6\n\t" \
  "movdqa %%xmm6,"OC_MEM_OFFS(0x60,y)"\n\t" \
  "psraw $4,%%xmm7\n\t" \
  "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" \
| 208 | |
| 209 | static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){ |
| 210 | OC_ALIGN16(ogg_int16_t buf[16]); |
| 211 | int i; |
| 212 | /*This routine accepts an 8x8 matrix pre-transposed.*/ |
| 213 | __asm__ __volatile__( |
| 214 | /*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/ |
| 215 | "movdqa " OC_MEM_OFFS(0x20,x)",%%xmm2\n\t" |
| 216 | "movdqa " OC_MEM_OFFS(0x60,x)",%%xmm6\n\t" |
| 217 | "movdqa " OC_MEM_OFFS(0x30,x)",%%xmm3\n\t" |
| 218 | "movdqa " OC_MEM_OFFS(0x50,x)",%%xmm5\n\t" |
| 219 | OC_IDCT_8x8_ABC(x) |
| 220 | OC_IDCT_8x8_D |
| 221 | OC_TRANSPOSE_8x8 |
| 222 | /*Clear out rows 0, 1, 4, and 7 for the first stage of the iDCT.*/ |
| 223 | "movdqa %%xmm7," OC_MEM_OFFS(0x70,y)"\n\t" |
| 224 | "movdqa %%xmm4," OC_MEM_OFFS(0x40,y)"\n\t" |
| 225 | "movdqa %%xmm1," OC_MEM_OFFS(0x10,y)"\n\t" |
| 226 | "movdqa %%xmm0," OC_MEM_OFFS(0x00,y)"\n\t" |
| 227 | OC_IDCT_8x8_ABC(y) |
| 228 | OC_IDCT_8x8_D_STORE |
| 229 | :[buf]"=m" (OC_ARRAY_OPERAND(ogg_int16_t,buf,16)), |
| 230 | [y]"=m" (OC_ARRAY_OPERAND(ogg_int16_t,_y,64)) |
| 231 | :[x]"m" (OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)), |
| 232 | [c]"m" (OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)) |
| 233 | ); |
| 234 | __asm__ __volatile__("pxor %%xmm0,%%xmm0\n\t" ::); |
| 235 | /*Clear input data for next block (decoder only).*/ |
| 236 | for(i=0;i<2;i++){ |
| 237 | __asm__ __volatile__( |
| 238 | "movdqa %%xmm0," OC_MEM_OFFS(0x00,x)"\n\t" |
| 239 | "movdqa %%xmm0," OC_MEM_OFFS(0x10,x)"\n\t" |
| 240 | "movdqa %%xmm0," OC_MEM_OFFS(0x20,x)"\n\t" |
| 241 | "movdqa %%xmm0," OC_MEM_OFFS(0x30,x)"\n\t" |
| 242 | :[x]"=m" (OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32)) |
| 243 | ); |
| 244 | } |
| 245 | } |
| 246 | |
/*For the first step of the 10-coefficient version of the 8x8 iDCT, we only
   need to work with four columns at a time.
  Doing this in MMX is faster on processors with a 64-bit data path.
  This macro performs all four stages of the 1-D iDCT on four columns, with
   input rows 4 through 7 treated as zero.
  It expands to string literals for an extended asm statement that must
   provide memory operands named [c] and [buf]; only the low 8 bytes of each
   16-byte constants row and of each 16-byte buf slot are accessed.
  On input, mm0 through mm3 must contain the first four values of input rows
   0 through 3.
  On output, mm0 through mm3 contain the stage 4 sums and mm4 through mm7 the
   stage 4 differences, as listed in the stage 4 comment below.
  No rounding or shifting is applied.*/
#define OC_IDCT_8x8_10_MMX \
  "#OC_IDCT_8x8_10_MMX\n\t" \
  /*Stage 1:*/ \
  /*2-3 rotation by 6pi/16. \
    mm7=C6, mm6=C2, mm2=X2, X6=0.*/ \
  "movq "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
  "movq "OC_MEM_OFFS(0x20,c)",%%mm6\n\t" \
  "pmulhw %%mm2,%%mm6\n\t" \
  "pmulhw %%mm2,%%mm7\n\t" \
  "movq "OC_MEM_OFFS(0x50,c)",%%mm5\n\t" \
  "paddw %%mm6,%%mm2\n\t" \
  /*Spill both rotation outputs to buf; they are reloaded at the end of \
     stage 2.*/ \
  "movq %%mm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
  "movq "OC_MEM_OFFS(0x30,c)",%%mm2\n\t" \
  "movq %%mm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
  /*5-6 rotation by 3pi/16. \
    mm5=C5, mm2=C3, mm3=X3, X5=0.*/ \
  "pmulhw %%mm3,%%mm5\n\t" \
  "pmulhw %%mm3,%%mm2\n\t" \
  "movq "OC_MEM_OFFS(0x10,c)",%%mm7\n\t" \
  "paddw %%mm3,%%mm5\n\t" \
  "paddw %%mm3,%%mm2\n\t" \
  "movq "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
  /*4-7 rotation by 7pi/16. \
    mm7=C1, mm3=C7, mm1=X1, X7=0.*/ \
  "pmulhw %%mm1,%%mm3\n\t" \
  "pmulhw %%mm1,%%mm7\n\t" \
  "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
  "movq %%mm3,%%mm6\n\t" \
  "paddw %%mm1,%%mm7\n\t" \
  /*0-1 butterfly. \
    mm4=C4, mm0=X0, X4=0. \
    The multiply by C4 is interleaved with stage 2 below.*/ \
  /*Stage 2:*/ \
  /*4-5 butterfly: mm3=t[4], mm5=t[5] \
    7-6 butterfly: mm2=t[6], mm7=t[7]*/ \
  "psubw %%mm5,%%mm3\n\t" \
  "paddw %%mm5,%%mm6\n\t" \
  "movq %%mm4,%%mm1\n\t" \
  "pmulhw %%mm0,%%mm4\n\t" \
  "paddw %%mm0,%%mm4\n\t" \
  "movq %%mm7,%%mm0\n\t" \
  /*With X4=0 both outputs of the 0-1 butterfly are the same value, so it \
     is simply copied.*/ \
  "movq %%mm4,%%mm5\n\t" \
  "paddw %%mm2,%%mm0\n\t" \
  "psubw %%mm2,%%mm7\n\t" \
  "movq %%mm1,%%mm2\n\t" \
  "pmulhw %%mm6,%%mm1\n\t" \
  "pmulhw %%mm7,%%mm2\n\t" \
  "paddw %%mm6,%%mm1\n\t" \
  "movq "OC_MEM_OFFS(0x00,buf)",%%mm6\n\t" \
  "paddw %%mm7,%%mm2\n\t" \
  "movq "OC_MEM_OFFS(0x10,buf)",%%mm7\n\t" \
  /*Stage 3: \
    6-5 butterfly: mm1=t[5], mm2=t[6] -> mm1=t[6]+t[5], mm2=t[6]-t[5] \
    0-3 butterfly: mm4=t[0], mm7=t[3] -> mm7=t[0]+t[3], mm4=t[0]-t[3] \
    1-2 butterfly: mm5=t[1], mm6=t[2] -> mm6=t[1]+t[2], mm5=t[1]-t[2]*/ \
  "paddw %%mm2,%%mm1\n\t" \
  "paddw %%mm5,%%mm6\n\t" \
  "paddw %%mm4,%%mm7\n\t" \
  "paddw %%mm2,%%mm2\n\t" \
  "paddw %%mm4,%%mm4\n\t" \
  "paddw %%mm5,%%mm5\n\t" \
  "psubw %%mm1,%%mm2\n\t" \
  "psubw %%mm7,%%mm4\n\t" \
  "psubw %%mm6,%%mm5\n\t" \
  /*Stage 4: \
    0-7 butterfly: mm7=t[0], mm0=t[7] -> mm0=t[0]+t[7], mm7=t[0]-t[7] \
    1-6 butterfly: mm6=t[1], mm1=t[6] -> mm1=t[1]+t[6], mm6=t[1]-t[6] \
    2-5 butterfly: mm5=t[2], mm2=t[5] -> mm2=t[2]+t[5], mm5=t[2]-t[5] \
    3-4 butterfly: mm4=t[3], mm3=t[4] -> mm3=t[3]+t[4], mm4=t[3]-t[4]*/ \
  "psubw %%mm0,%%mm7\n\t" \
  "psubw %%mm1,%%mm6\n\t" \
  "psubw %%mm2,%%mm5\n\t" \
  "psubw %%mm3,%%mm4\n\t" \
  "paddw %%mm0,%%mm0\n\t" \
  "paddw %%mm1,%%mm1\n\t" \
  "paddw %%mm2,%%mm2\n\t" \
  "paddw %%mm3,%%mm3\n\t" \
  "paddw %%mm7,%%mm0\n\t" \
  "paddw %%mm6,%%mm1\n\t" \
  "paddw %%mm5,%%mm2\n\t" \
  "paddw %%mm4,%%mm3\n\t" \
| 330 | |
/*Performs the first three stages of the iDCT on eight columns at once, for
   input whose rows 4 through 7 are all zero.
  This is the SSE2 counterpart of the first three stages of
   OC_IDCT_8x8_10_MMX: the instruction sequence is the same, using xmm
   registers and 16-byte moves.
  It expands to string literals for an extended asm statement that must
   provide memory operands named [c] and [buf].
  On input, xmm0 through xmm3 must contain input rows 0 through 3.
  On output, xmm7 down to xmm4 contain t[0] through t[3], and xmm0 up to xmm3
   contain t[7] down to t[4], ready for the stage 4 butterflies.
  Both 16-byte slots of buf are overwritten.*/
#define OC_IDCT_8x8_10_ABC \
  "#OC_IDCT_8x8_10_ABC\n\t" \
  /*Stage 1:*/ \
  /*2-3 rotation by 6pi/16. \
    xmm7=C6, xmm6=C2, xmm2=X2, X6=0.*/ \
  "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm7\n\t" \
  "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm6\n\t" \
  "pmulhw %%xmm2,%%xmm6\n\t" \
  "pmulhw %%xmm2,%%xmm7\n\t" \
  "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm5\n\t" \
  "paddw %%xmm6,%%xmm2\n\t" \
  /*Spill both rotation outputs to buf; they are reloaded at the end of \
     stage 2.*/ \
  "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
  "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm2\n\t" \
  "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
  /*5-6 rotation by 3pi/16. \
    xmm5=C5, xmm2=C3, xmm3=X3, X5=0.*/ \
  "pmulhw %%xmm3,%%xmm5\n\t" \
  "pmulhw %%xmm3,%%xmm2\n\t" \
  "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm7\n\t" \
  "paddw %%xmm3,%%xmm5\n\t" \
  "paddw %%xmm3,%%xmm2\n\t" \
  "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
  /*4-7 rotation by 7pi/16. \
    xmm7=C1, xmm3=C7, xmm1=X1, X7=0.*/ \
  "pmulhw %%xmm1,%%xmm3\n\t" \
  "pmulhw %%xmm1,%%xmm7\n\t" \
  "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
  "movdqa %%xmm3,%%xmm6\n\t" \
  "paddw %%xmm1,%%xmm7\n\t" \
  /*0-1 butterfly. \
    xmm4=C4, xmm0=X0, X4=0. \
    The multiply by C4 is interleaved with stage 2 below.*/ \
  /*Stage 2:*/ \
  /*4-5 butterfly: xmm3=t[4], xmm5=t[5] \
    7-6 butterfly: xmm2=t[6], xmm7=t[7]*/ \
  "psubw %%xmm5,%%xmm3\n\t" \
  "paddw %%xmm5,%%xmm6\n\t" \
  "movdqa %%xmm4,%%xmm1\n\t" \
  "pmulhw %%xmm0,%%xmm4\n\t" \
  "paddw %%xmm0,%%xmm4\n\t" \
  "movdqa %%xmm7,%%xmm0\n\t" \
  /*With X4=0 both outputs of the 0-1 butterfly are the same value, so it \
     is simply copied.*/ \
  "movdqa %%xmm4,%%xmm5\n\t" \
  "paddw %%xmm2,%%xmm0\n\t" \
  "psubw %%xmm2,%%xmm7\n\t" \
  "movdqa %%xmm1,%%xmm2\n\t" \
  "pmulhw %%xmm6,%%xmm1\n\t" \
  "pmulhw %%xmm7,%%xmm2\n\t" \
  "paddw %%xmm6,%%xmm1\n\t" \
  "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
  "paddw %%xmm7,%%xmm2\n\t" \
  "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
  /*Stage 3: \
    6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
    0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
    1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
  "paddw %%xmm2,%%xmm1\n\t" \
  "paddw %%xmm5,%%xmm6\n\t" \
  "paddw %%xmm4,%%xmm7\n\t" \
  "paddw %%xmm2,%%xmm2\n\t" \
  "paddw %%xmm4,%%xmm4\n\t" \
  "paddw %%xmm5,%%xmm5\n\t" \
  "psubw %%xmm1,%%xmm2\n\t" \
  "psubw %%xmm7,%%xmm4\n\t" \
  "psubw %%xmm6,%%xmm5\n\t" \
| 394 | |
| 395 | static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){ |
| 396 | OC_ALIGN16(ogg_int16_t buf[16]); |
| 397 | /*This routine accepts an 8x8 matrix pre-transposed.*/ |
| 398 | __asm__ __volatile__( |
| 399 | "movq " OC_MEM_OFFS(0x20,x)",%%mm2\n\t" |
| 400 | "movq " OC_MEM_OFFS(0x30,x)",%%mm3\n\t" |
| 401 | "movq " OC_MEM_OFFS(0x10,x)",%%mm1\n\t" |
| 402 | "movq " OC_MEM_OFFS(0x00,x)",%%mm0\n\t" |
| 403 | OC_IDCT_8x8_10_MMX |
| 404 | OC_TRANSPOSE_8x4_MMX2SSE |
| 405 | OC_IDCT_8x8_10_ABC |
| 406 | OC_IDCT_8x8_D_STORE |
| 407 | :[buf]"=m" (OC_ARRAY_OPERAND(short,buf,16)), |
| 408 | [y]"=m" (OC_ARRAY_OPERAND(ogg_int16_t,_y,64)) |
| 409 | :[x]"m" OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64), |
| 410 | [c]"m" (OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)) |
| 411 | ); |
| 412 | /*Clear input data for next block (decoder only).*/ |
| 413 | __asm__ __volatile__( |
| 414 | "pxor %%mm0,%%mm0\n\t" |
| 415 | "movq %%mm0," OC_MEM_OFFS(0x00,x)"\n\t" |
| 416 | "movq %%mm0," OC_MEM_OFFS(0x10,x)"\n\t" |
| 417 | "movq %%mm0," OC_MEM_OFFS(0x20,x)"\n\t" |
| 418 | "movq %%mm0," OC_MEM_OFFS(0x30,x)"\n\t" |
| 419 | :[x]"+m" (OC_ARRAY_OPERAND(ogg_int16_t,_x,28)) |
| 420 | ); |
| 421 | } |
| 422 | |
| 423 | /*Performs an inverse 8x8 Type-II DCT transform. |
| 424 | The input is assumed to be scaled by a factor of 4 relative to orthonormal |
| 425 | version of the transform.*/ |
| 426 | void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){ |
| 427 | /*_last_zzi is subtly different from an actual count of the number of |
| 428 | coefficients we decoded for this block. |
| 429 | It contains the value of zzi BEFORE the final token in the block was |
| 430 | decoded. |
| 431 | In most cases this is an EOB token (the continuation of an EOB run from a |
| 432 | previous block counts), and so this is the same as the coefficient count. |
| 433 | However, in the case that the last token was NOT an EOB token, but filled |
| 434 | the block up with exactly 64 coefficients, _last_zzi will be less than 64. |
| 435 | Provided the last token was not a pure zero run, the minimum value it can |
| 436 | be is 46, and so that doesn't affect any of the cases in this routine. |
| 437 | However, if the last token WAS a pure zero run of length 63, then _last_zzi |
| 438 | will be 1 while the number of coefficients decoded is 64. |
| 439 | Thus, we will trigger the following special case, where the real |
| 440 | coefficient count would not. |
| 441 | Note also that a zero run of length 64 will give _last_zzi a value of 0, |
| 442 | but we still process the DC coefficient, which might have a non-zero value |
| 443 | due to DC prediction. |
| 444 | Although convoluted, this is arguably the correct behavior: it allows us to |
| 445 | use a smaller transform when the block ends with a long zero run instead |
| 446 | of a normal EOB token. |
| 447 | It could be smarter... multiple separate zero runs at the end of a block |
| 448 | will fool it, but an encoder that generates these really deserves what it |
| 449 | gets. |
| 450 | Needless to say we inherited this approach from VP3.*/ |
| 451 | /*Then perform the iDCT.*/ |
| 452 | if(_last_zzi<=10)oc_idct8x8_10_sse2(_y,_x); |
| 453 | else oc_idct8x8_slow_sse2(_y,_x); |
| 454 | } |
| 455 | |
| 456 | #endif |
| 457 | |