| 1 | /******************************************************************** |
| 2 | * * |
| 3 | * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * |
| 4 | * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * |
| 5 | * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * |
| 6 | * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * |
| 7 | * * |
| 8 | * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * |
| 9 | * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * |
| 10 | * * |
| 11 | ******************************************************************** |
| 12 | |
| 13 | function: |
| 14 | last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $ |
| 15 | |
| 16 | ********************************************************************/ |
| 17 | |
| 18 | #if !defined(_x86_sse2trans_H) |
| 19 | # define _x86_sse2trans_H (1) |
| 20 | # include "x86int.h" |
| 21 | |
| 22 | # if defined(OC_X86_64_ASM) |
| 23 | /*OC_TRANSPOSE_8x8 (x86-64 variant): transposes an 8x8 matrix of 16-bit words in-place without spilling registers. |
| 24 | On entry xmm0...xmm7 hold rows a...h; on exit xmmN holds column N (hN gN fN eN dN cN bN aN, highest word first). |
| 25 | By clever choices of the order to apply the butterflies and the order of their outputs, we can take the rows in order |
| 26 | and output the columns in order without any extra operations and using just one temporary register (xmm8).*/ |
| 27 | # define OC_TRANSPOSE_8x8 \ |
| 28 | "#OC_TRANSPOSE_8x8\n\t" \ |
| 29 | "movdqa %%xmm4,%%xmm8\n\t" \ |
| 30 | /*Stage 1 (interleave 16-bit words of adjacent rows), with row e copied to xmm8: xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \ |
| 31 | "punpcklwd %%xmm5,%%xmm4\n\t" \ |
| 32 | /*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \ |
| 33 | "punpckhwd %%xmm5,%%xmm8\n\t" \ |
| 34 | /*xmm5 is free; it takes a copy of row a.*/ \ |
| 35 | "movdqa %%xmm0,%%xmm5\n\t" \ |
| 36 | /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \ |
| 37 | "punpcklwd %%xmm1,%%xmm0\n\t" \ |
| 38 | /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \ |
| 39 | "punpckhwd %%xmm1,%%xmm5\n\t" \ |
| 40 | /*xmm1 is free; it takes a copy of row g.*/ \ |
| 41 | "movdqa %%xmm6,%%xmm1\n\t" \ |
| 42 | /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \ |
| 43 | "punpcklwd %%xmm7,%%xmm6\n\t" \ |
| 44 | /*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \ |
| 45 | "punpckhwd %%xmm7,%%xmm1\n\t" \ |
| 46 | /*xmm7 is free; it takes a copy of row c.*/ \ |
| 47 | "movdqa %%xmm2,%%xmm7\n\t" \ |
| 48 | /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \ |
| 49 | "punpckhwd %%xmm3,%%xmm2\n\t" \ |
| 50 | /*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \ |
| 51 | "punpcklwd %%xmm3,%%xmm7\n\t" \ |
| 52 | /*xmm3 is free. Stage 2 (interleave 32-bit pairs): it takes a copy of xmm0 (words 0...3 of rows a and b).*/ \ |
| 53 | "movdqa %%xmm0,%%xmm3\n\t" \ |
| 54 | /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \ |
| 55 | "punpckldq %%xmm7,%%xmm0\n\t" \ |
| 56 | /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \ |
| 57 | "punpckhdq %%xmm7,%%xmm3\n\t" \ |
| 58 | /*xmm7 is free; it takes a copy of xmm5 (words 4...7 of rows a and b).*/ \ |
| 59 | "movdqa %%xmm5,%%xmm7\n\t" \ |
| 60 | /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \ |
| 61 | "punpckldq %%xmm2,%%xmm5\n\t" \ |
| 62 | /*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \ |
| 63 | "punpckhdq %%xmm2,%%xmm7\n\t" \ |
| 64 | /*xmm2 is free; it takes a copy of xmm4 (words 0...3 of rows e and f).*/ \ |
| 65 | "movdqa %%xmm4,%%xmm2\n\t" \ |
| 66 | /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \ |
| 67 | "punpckhdq %%xmm6,%%xmm4\n\t" \ |
| 68 | /*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \ |
| 69 | "punpckldq %%xmm6,%%xmm2\n\t" \ |
| 70 | /*xmm6 is free; it takes a copy of xmm8 (words 4...7 of rows e and f).*/ \ |
| 71 | "movdqa %%xmm8,%%xmm6\n\t" \ |
| 72 | /*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \ |
| 73 | "punpckldq %%xmm1,%%xmm6\n\t" \ |
| 74 | /*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \ |
| 75 | "punpckhdq %%xmm1,%%xmm8\n\t" \ |
| 76 | /*xmm1 is free. Stage 3 (interleave 64-bit halves): it takes a copy of xmm0 (columns 0 and 1 of rows a...d).*/ \ |
| 77 | "movdqa %%xmm0,%%xmm1\n\t" \ |
| 78 | /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \ |
| 79 | "punpcklqdq %%xmm2,%%xmm0\n\t" \ |
| 80 | /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \ |
| 81 | "punpckhqdq %%xmm2,%%xmm1\n\t" \ |
| 82 | /*xmm2 is free; it takes a copy of xmm3 (columns 2 and 3 of rows a...d).*/ \ |
| 83 | "movdqa %%xmm3,%%xmm2\n\t" \ |
| 84 | /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \ |
| 85 | "punpckhqdq %%xmm4,%%xmm3\n\t" \ |
| 86 | /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \ |
| 87 | "punpcklqdq %%xmm4,%%xmm2\n\t" \ |
| 88 | /*xmm4 is free; it takes a copy of xmm5 (columns 4 and 5 of rows a...d).*/ \ |
| 89 | "movdqa %%xmm5,%%xmm4\n\t" \ |
| 90 | /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \ |
| 91 | "punpckhqdq %%xmm6,%%xmm5\n\t" \ |
| 92 | /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \ |
| 93 | "punpcklqdq %%xmm6,%%xmm4\n\t" \ |
| 94 | /*xmm6 is free; it takes a copy of xmm7 (columns 6 and 7 of rows a...d).*/ \ |
| 95 | "movdqa %%xmm7,%%xmm6\n\t" \ |
| 96 | /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \ |
| 97 | "punpckhqdq %%xmm8,%%xmm7\n\t" \ |
| 98 | /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \ |
| 99 | "punpcklqdq %%xmm8,%%xmm6\n\t" \ |
| 100 | /*xmm8 is free; xmm0...xmm7 now hold columns 0...7. (This last line still ends in a backslash: keep the next line blank.)*/ \ |
| 101 | |
| 102 | # else |
| 103 | /*OC_TRANSPOSE_8x8 (variant used when OC_X86_64_ASM is not defined): the same 8x8 transpose of 16-bit words, with rows |
| 104 | a...h in xmm0...xmm7 on entry and columns 0...7 in xmm0...xmm7 on exit. Only xmm0...xmm7 are used, so we need to spill some |
| 105 | values to %[buf] temporarily: two 16-byte slots (offsets 0x00 and 0x10, called buf[0] and buf[1] below) are overwritten. |
| 106 | Again, the butterflies are carefully arranged to get the columns to come out in order, minimizing register spills and |
| 106 | maximizing the delay between a load and when the value loaded is actually used. |
| 106 | NOTE(review): OC_MEM_OFFS is not defined in this file; presumably it expands to the memory operand at buf plus the given |
| 106 | offset, and movdqa requires that operand to be 16-byte aligned - confirm against its definition and the callers.*/ |
| 107 | # define OC_TRANSPOSE_8x8 \ |
| 108 | "#OC_TRANSPOSE_8x8\n\t" \ |
| 109 | /*Spill row a: buf[0] = a7 a6 a5 a4 a3 a2 a1 a0*/ \ |
| 110 | "movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \ |
| 111 | /*xmm0 is free. Stage 1 (interleave 16-bit words of adjacent rows): it takes a copy of row c.*/ \ |
| 112 | "movdqa %%xmm2,%%xmm0\n\t" \ |
| 113 | /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \ |
| 114 | "punpckhwd %%xmm3,%%xmm2\n\t" \ |
| 115 | /*xmm0 = d3 c3 d2 c2 d1 c1 d0 c0*/ \ |
| 116 | "punpcklwd %%xmm3,%%xmm0\n\t" \ |
| 117 | /*Reload row a from buf[0]: xmm3 = a7 a6 a5 a4 a3 a2 a1 a0*/ \ |
| 118 | "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm3\n\t" \ |
| 119 | /*Spill words 4...7 of rows c and d: buf[1] = d7 c7 d6 c6 d5 c5 d4 c4*/ \ |
| 120 | "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ |
| 121 | /*xmm2 is free; it takes a copy of row g.*/ \ |
| 122 | "movdqa %%xmm6,%%xmm2\n\t" \ |
| 123 | /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \ |
| 124 | "punpcklwd %%xmm7,%%xmm6\n\t" \ |
| 125 | /*xmm2 = h7 g7 h6 g6 h5 g5 h4 g4*/ \ |
| 126 | "punpckhwd %%xmm7,%%xmm2\n\t" \ |
| 127 | /*xmm7 is free; it takes a copy of row e.*/ \ |
| 128 | "movdqa %%xmm4,%%xmm7\n\t" \ |
| 129 | /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \ |
| 130 | "punpcklwd %%xmm5,%%xmm4\n\t" \ |
| 131 | /*xmm7 = f7 e7 f6 e6 f5 e5 f4 e4*/ \ |
| 132 | "punpckhwd %%xmm5,%%xmm7\n\t" \ |
| 133 | /*xmm5 is free; it takes a copy of xmm3 (row a).*/ \ |
| 134 | "movdqa %%xmm3,%%xmm5\n\t" \ |
| 135 | /*xmm3 = b3 a3 b2 a2 b1 a1 b0 a0*/ \ |
| 136 | "punpcklwd %%xmm1,%%xmm3\n\t" \ |
| 137 | /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \ |
| 138 | "punpckhwd %%xmm1,%%xmm5\n\t" \ |
| 139 | /*xmm1 is free. Stage 2 (interleave 32-bit pairs): it takes a copy of xmm7 (words 4...7 of rows e and f).*/ \ |
| 140 | "movdqa %%xmm7,%%xmm1\n\t" \ |
| 141 | /*xmm7 = h5 g5 f5 e5 h4 g4 f4 e4*/ \ |
| 142 | "punpckldq %%xmm2,%%xmm7\n\t" \ |
| 143 | /*xmm1 = h7 g7 f7 e7 h6 g6 f6 e6*/ \ |
| 144 | "punpckhdq %%xmm2,%%xmm1\n\t" \ |
| 145 | /*Reload from buf[1]: xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \ |
| 146 | "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm2\n\t" \ |
| 147 | /*Spill columns 6 and 7 of rows e...h: buf[0] = h7 g7 f7 e7 h6 g6 f6 e6*/ \ |
| 148 | "movdqa %%xmm1,"OC_MEM_OFFS(0x00,buf)"\n\t" \ |
| 149 | /*xmm1 is free; it takes a copy of xmm3 (words 0...3 of rows a and b).*/ \ |
| 150 | "movdqa %%xmm3,%%xmm1\n\t" \ |
| 151 | /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \ |
| 152 | "punpckhdq %%xmm0,%%xmm3\n\t" \ |
| 153 | /*xmm1 = d1 c1 b1 a1 d0 c0 b0 a0*/ \ |
| 154 | "punpckldq %%xmm0,%%xmm1\n\t" \ |
| 155 | /*xmm0 is free; it takes a copy of xmm4 (words 0...3 of rows e and f).*/ \ |
| 156 | "movdqa %%xmm4,%%xmm0\n\t" \ |
| 157 | /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \ |
| 158 | "punpckhdq %%xmm6,%%xmm4\n\t" \ |
| 159 | /*xmm0 = h1 g1 f1 e1 h0 g0 f0 e0*/ \ |
| 160 | "punpckldq %%xmm6,%%xmm0\n\t" \ |
| 161 | /*xmm6 is free; it takes a copy of xmm5 (words 4...7 of rows a and b).*/ \ |
| 162 | "movdqa %%xmm5,%%xmm6\n\t" \ |
| 163 | /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \ |
| 164 | "punpckldq %%xmm2,%%xmm5\n\t" \ |
| 165 | /*xmm6 = d7 c7 b7 a7 d6 c6 b6 a6*/ \ |
| 166 | "punpckhdq %%xmm2,%%xmm6\n\t" \ |
| 167 | /*xmm2 is free. Stage 3 (interleave 64-bit halves): it takes a copy of xmm1 (columns 0 and 1 of rows a...d).*/ \ |
| 168 | "movdqa %%xmm1,%%xmm2\n\t" \ |
| 169 | /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \ |
| 170 | "punpckhqdq %%xmm0,%%xmm1\n\t" \ |
| 171 | /*xmm2 = h0 g0 f0 e0 d0 c0 b0 a0*/ \ |
| 172 | "punpcklqdq %%xmm0,%%xmm2\n\t" \ |
| 173 | /*Reload from buf[0]: xmm0 = h7 g7 f7 e7 h6 g6 f6 e6*/ \ |
| 174 | "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \ |
| 175 | /*Spill column 0 until xmm0 is no longer needed: buf[1] = h0 g0 f0 e0 d0 c0 b0 a0*/ \ |
| 176 | "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ |
| 177 | /*xmm2 is free; it takes a copy of xmm3 (columns 2 and 3 of rows a...d).*/ \ |
| 178 | "movdqa %%xmm3,%%xmm2\n\t" \ |
| 179 | /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \ |
| 180 | "punpckhqdq %%xmm4,%%xmm3\n\t" \ |
| 181 | /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \ |
| 182 | "punpcklqdq %%xmm4,%%xmm2\n\t" \ |
| 183 | /*xmm4 is free; it takes a copy of xmm5 (columns 4 and 5 of rows a...d).*/ \ |
| 184 | "movdqa %%xmm5,%%xmm4\n\t" \ |
| 185 | /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \ |
| 186 | "punpckhqdq %%xmm7,%%xmm5\n\t" \ |
| 187 | /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \ |
| 188 | "punpcklqdq %%xmm7,%%xmm4\n\t" \ |
| 189 | /*xmm7 is free; it takes a copy of xmm6 (columns 6 and 7 of rows a...d).*/ \ |
| 190 | "movdqa %%xmm6,%%xmm7\n\t" \ |
| 191 | /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \ |
| 192 | "punpcklqdq %%xmm0,%%xmm6\n\t" \ |
| 193 | /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \ |
| 194 | "punpckhqdq %%xmm0,%%xmm7\n\t" \ |
| 195 | /*Reload column 0 from buf[1]: xmm0 = h0 g0 f0 e0 d0 c0 b0 a0. (The next line still ends in a backslash: keep the line after it blank.)*/ \ |
| 196 | "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm0\n\t" \ |
| 197 | |
| 198 | # endif |
| 199 | |
| 200 | /*OC_TRANSPOSE_8x4_MMX2SSE: transpose 4 16-bit values in each of the 8 MMX registers mm0...mm7 (rows a...h) into 8 values |
| 201 | in each of the first four SSE registers: xmmN = hN gN fN eN dN cN bN aN for N = 0...3. xmm4...xmm7 are used as scratch. |
| 202 | No need to be clever here; we have plenty of room.*/ |
| 203 | # define OC_TRANSPOSE_8x4_MMX2SSE \ |
| 204 | "#OC_TRANSPOSE_8x4_MMX2SSE\n\t" \ |
| 205 | "movq2dq %%mm0,%%xmm0\n\t" \ |
| 206 | "movq2dq %%mm1,%%xmm1\n\t" \ |
| 207 | /*Stage 1 (interleave 16-bit words of adjacent rows): xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \ |
| 208 | "punpcklwd %%xmm1,%%xmm0\n\t" \ |
| 209 | "movq2dq %%mm2,%%xmm3\n\t" \ |
| 210 | "movq2dq %%mm3,%%xmm2\n\t" \ |
| 211 | /*Rows c and d were loaded into xmm3 and xmm2 (swapped): xmm3 = d3 c3 d2 c2 d1 c1 d0 c0*/ \ |
| 212 | "punpcklwd %%xmm2,%%xmm3\n\t" \ |
| 213 | "movq2dq %%mm4,%%xmm4\n\t" \ |
| 214 | "movq2dq %%mm5,%%xmm5\n\t" \ |
| 215 | /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \ |
| 216 | "punpcklwd %%xmm5,%%xmm4\n\t" \ |
| 217 | "movq2dq %%mm6,%%xmm7\n\t" \ |
| 218 | "movq2dq %%mm7,%%xmm6\n\t" \ |
| 219 | /*Rows g and h were loaded into xmm7 and xmm6 (swapped): xmm7 = h3 g3 h2 g2 h1 g1 h0 g0*/ \ |
| 220 | "punpcklwd %%xmm6,%%xmm7\n\t" \ |
| 221 | "movdqa %%xmm0,%%xmm2\n\t" \ |
| 222 | /*Stage 2 (interleave 32-bit pairs), with xmm2 holding a copy of xmm0: xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \ |
| 223 | "punpckldq %%xmm3,%%xmm0\n\t" \ |
| 224 | /*xmm2 = d3 c3 b3 a3 d2 c2 b2 a2*/ \ |
| 225 | "punpckhdq %%xmm3,%%xmm2\n\t" \ |
| 226 | "movdqa %%xmm4,%%xmm5\n\t" \ |
| 227 | /*xmm5 holds a copy of xmm4. xmm4 = h1 g1 f1 e1 h0 g0 f0 e0*/ \ |
| 228 | "punpckldq %%xmm7,%%xmm4\n\t" \ |
| 229 | /*xmm5 = h3 g3 f3 e3 h2 g2 f2 e2*/ \ |
| 230 | "punpckhdq %%xmm7,%%xmm5\n\t" \ |
| 231 | "movdqa %%xmm0,%%xmm1\n\t" \ |
| 232 | /*Stage 3 (interleave 64-bit halves), with xmm1 holding a copy of xmm0: xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \ |
| 233 | "punpcklqdq %%xmm4,%%xmm0\n\t" \ |
| 234 | /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \ |
| 235 | "punpckhqdq %%xmm4,%%xmm1\n\t" \ |
| 236 | "movdqa %%xmm2,%%xmm3\n\t" \ |
| 237 | /*xmm3 holds a copy of xmm2. xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \ |
| 238 | "punpcklqdq %%xmm5,%%xmm2\n\t" \ |
| 239 | /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3. (The next line still ends in a backslash: keep the line after it blank.)*/ \ |
| 240 | "punpckhqdq %%xmm5,%%xmm3\n\t" \ |
| 241 | |
| 242 | #endif |
| 243 | |