1 | /******************************************************************** |
2 | * * |
3 | * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * |
4 | * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * |
5 | * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * |
6 | * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * |
7 | * * |
8 | * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * |
9 | * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * |
10 | * * |
11 | ******************************************************************** |
12 | |
13 | function: |
14 | last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $ |
15 | |
16 | ********************************************************************/ |
17 | |
18 | #if !defined(_x86_sse2trans_H) |
19 | # define _x86_sse2trans_H (1) |
20 | # include "x86int.h" |
21 | |
22 | # if defined(OC_X86_64_ASM) |
23 | /*On x86-64 we can transpose an 8x8 block of 16-bit values in-place without |
24 | spilling registers: xmm0...xmm7 hold rows a...h on entry and columns 0...7 on |
25 | exit. The butterflies and their outputs are ordered so that rows are taken and |
26 | columns produced in order with no extra operations; only xmm8 is clobbered.*/ |
27 | # define OC_TRANSPOSE_8x8 \ |
28 | "#OC_TRANSPOSE_8x8\n\t" \ |
29 | "movdqa %%xmm4,%%xmm8\n\t" \ |
30 | /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \ |
31 | "punpcklwd %%xmm5,%%xmm4\n\t" \ |
32 | /*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \ |
33 | "punpckhwd %%xmm5,%%xmm8\n\t" \ |
34 | /*xmm5 is free.*/ \ |
35 | "movdqa %%xmm0,%%xmm5\n\t" \ |
36 | /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \ |
37 | "punpcklwd %%xmm1,%%xmm0\n\t" \ |
38 | /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \ |
39 | "punpckhwd %%xmm1,%%xmm5\n\t" \ |
40 | /*xmm1 is free.*/ \ |
41 | "movdqa %%xmm6,%%xmm1\n\t" \ |
42 | /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \ |
43 | "punpcklwd %%xmm7,%%xmm6\n\t" \ |
44 | /*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \ |
45 | "punpckhwd %%xmm7,%%xmm1\n\t" \ |
46 | /*xmm7 is free.*/ \ |
47 | "movdqa %%xmm2,%%xmm7\n\t" \ |
48 | /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \ |
49 | "punpckhwd %%xmm3,%%xmm2\n\t" \ |
50 | /*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \ |
51 | "punpcklwd %%xmm3,%%xmm7\n\t" \ |
52 | /*xmm3 is free. Word interleaves are done; dword interleaves follow.*/ \ |
53 | "movdqa %%xmm0,%%xmm3\n\t" \ |
54 | /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \ |
55 | "punpckldq %%xmm7,%%xmm0\n\t" \ |
56 | /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \ |
57 | "punpckhdq %%xmm7,%%xmm3\n\t" \ |
58 | /*xmm7 is free.*/ \ |
59 | "movdqa %%xmm5,%%xmm7\n\t" \ |
60 | /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \ |
61 | "punpckldq %%xmm2,%%xmm5\n\t" \ |
62 | /*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \ |
63 | "punpckhdq %%xmm2,%%xmm7\n\t" \ |
64 | /*xmm2 is free.*/ \ |
65 | "movdqa %%xmm4,%%xmm2\n\t" \ |
66 | /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \ |
67 | "punpckhdq %%xmm6,%%xmm4\n\t" \ |
68 | /*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \ |
69 | "punpckldq %%xmm6,%%xmm2\n\t" \ |
70 | /*xmm6 is free.*/ \ |
71 | "movdqa %%xmm8,%%xmm6\n\t" \ |
72 | /*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \ |
73 | "punpckldq %%xmm1,%%xmm6\n\t" \ |
74 | /*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \ |
75 | "punpckhdq %%xmm1,%%xmm8\n\t" \ |
76 | /*xmm1 is free. Dword interleaves are done; qword interleaves follow.*/ \ |
77 | "movdqa %%xmm0,%%xmm1\n\t" \ |
78 | /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \ |
79 | "punpcklqdq %%xmm2,%%xmm0\n\t" \ |
80 | /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \ |
81 | "punpckhqdq %%xmm2,%%xmm1\n\t" \ |
82 | /*xmm2 is free.*/ \ |
83 | "movdqa %%xmm3,%%xmm2\n\t" \ |
84 | /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \ |
85 | "punpckhqdq %%xmm4,%%xmm3\n\t" \ |
86 | /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \ |
87 | "punpcklqdq %%xmm4,%%xmm2\n\t" \ |
88 | /*xmm4 is free.*/ \ |
89 | "movdqa %%xmm5,%%xmm4\n\t" \ |
90 | /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \ |
91 | "punpckhqdq %%xmm6,%%xmm5\n\t" \ |
92 | /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \ |
93 | "punpcklqdq %%xmm6,%%xmm4\n\t" \ |
94 | /*xmm6 is free.*/ \ |
95 | "movdqa %%xmm7,%%xmm6\n\t" \ |
96 | /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \ |
97 | "punpckhqdq %%xmm8,%%xmm7\n\t" \ |
98 | /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \ |
99 | "punpcklqdq %%xmm8,%%xmm6\n\t" \ |
100 | /*xmm8 is free; xmm0...xmm7 now hold columns 0...7 in order.*/ \ |
101 | |
102 | # else |
103 | /*Otherwise, we use only xmm0...xmm7 and spill some values to %[buf] temporarily |
104 | (two 16-byte slots at offsets 0x00 and 0x10, accessed with alignment-requiring movdqa). |
105 | Again, the butterflies are arranged so rows a...h in xmm0...xmm7 come out as columns |
106 | 0...7 in order, minimizing spills and maximizing the delay between a load and its use.*/ |
107 | # define OC_TRANSPOSE_8x8 \ |
108 | "#OC_TRANSPOSE_8x8\n\t" \ |
109 | /*buf[0] = a7 a6 a5 a4 a3 a2 a1 a0 (row a is spilled so xmm0 can be reused)*/ \ |
110 | "movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \ |
111 | /*xmm0 is free.*/ \ |
112 | "movdqa %%xmm2,%%xmm0\n\t" \ |
113 | /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \ |
114 | "punpckhwd %%xmm3,%%xmm2\n\t" \ |
115 | /*xmm0 = d3 c3 d2 c2 d1 c1 d0 c0*/ \ |
116 | "punpcklwd %%xmm3,%%xmm0\n\t" \ |
117 | /*xmm3 = a7 a6 a5 a4 a3 a2 a1 a0*/ \ |
118 | "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm3\n\t" \ |
119 | /*buf[1] = d7 c7 d6 c6 d5 c5 d4 c4*/ \ |
120 | "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ |
121 | /*xmm2 is free.*/ \ |
122 | "movdqa %%xmm6,%%xmm2\n\t" \ |
123 | /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \ |
124 | "punpcklwd %%xmm7,%%xmm6\n\t" \ |
125 | /*xmm2 = h7 g7 h6 g6 h5 g5 h4 g4*/ \ |
126 | "punpckhwd %%xmm7,%%xmm2\n\t" \ |
127 | /*xmm7 is free.*/ \ |
128 | "movdqa %%xmm4,%%xmm7\n\t" \ |
129 | /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \ |
130 | "punpcklwd %%xmm5,%%xmm4\n\t" \ |
131 | /*xmm7 = f7 e7 f6 e6 f5 e5 f4 e4*/ \ |
132 | "punpckhwd %%xmm5,%%xmm7\n\t" \ |
133 | /*xmm5 is free.*/ \ |
134 | "movdqa %%xmm3,%%xmm5\n\t" \ |
135 | /*xmm3 = b3 a3 b2 a2 b1 a1 b0 a0*/ \ |
136 | "punpcklwd %%xmm1,%%xmm3\n\t" \ |
137 | /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \ |
138 | "punpckhwd %%xmm1,%%xmm5\n\t" \ |
139 | /*xmm1 is free. Word interleaves are done; dword interleaves follow.*/ \ |
140 | "movdqa %%xmm7,%%xmm1\n\t" \ |
141 | /*xmm7 = h5 g5 f5 e5 h4 g4 f4 e4*/ \ |
142 | "punpckldq %%xmm2,%%xmm7\n\t" \ |
143 | /*xmm1 = h7 g7 f7 e7 h6 g6 f6 e6*/ \ |
144 | "punpckhdq %%xmm2,%%xmm1\n\t" \ |
145 | /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \ |
146 | "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm2\n\t" \ |
147 | /*buf[0] = h7 g7 f7 e7 h6 g6 f6 e6*/ \ |
148 | "movdqa %%xmm1,"OC_MEM_OFFS(0x00,buf)"\n\t" \ |
149 | /*xmm1 is free.*/ \ |
150 | "movdqa %%xmm3,%%xmm1\n\t" \ |
151 | /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \ |
152 | "punpckhdq %%xmm0,%%xmm3\n\t" \ |
153 | /*xmm1 = d1 c1 b1 a1 d0 c0 b0 a0*/ \ |
154 | "punpckldq %%xmm0,%%xmm1\n\t" \ |
155 | /*xmm0 is free.*/ \ |
156 | "movdqa %%xmm4,%%xmm0\n\t" \ |
157 | /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \ |
158 | "punpckhdq %%xmm6,%%xmm4\n\t" \ |
159 | /*xmm0 = h1 g1 f1 e1 h0 g0 f0 e0*/ \ |
160 | "punpckldq %%xmm6,%%xmm0\n\t" \ |
161 | /*xmm6 is free.*/ \ |
162 | "movdqa %%xmm5,%%xmm6\n\t" \ |
163 | /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \ |
164 | "punpckldq %%xmm2,%%xmm5\n\t" \ |
165 | /*xmm6 = d7 c7 b7 a7 d6 c6 b6 a6*/ \ |
166 | "punpckhdq %%xmm2,%%xmm6\n\t" \ |
167 | /*xmm2 is free. Dword interleaves are done; qword interleaves follow.*/ \ |
168 | "movdqa %%xmm1,%%xmm2\n\t" \ |
169 | /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \ |
170 | "punpckhqdq %%xmm0,%%xmm1\n\t" \ |
171 | /*xmm2 = h0 g0 f0 e0 d0 c0 b0 a0*/ \ |
172 | "punpcklqdq %%xmm0,%%xmm2\n\t" \ |
173 | /*xmm0 = h7 g7 f7 e7 h6 g6 f6 e6*/ \ |
174 | "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \ |
175 | /*buf[1] = h0 g0 f0 e0 d0 c0 b0 a0*/ \ |
176 | "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ |
177 | /*xmm2 is free.*/ \ |
178 | "movdqa %%xmm3,%%xmm2\n\t" \ |
179 | /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \ |
180 | "punpckhqdq %%xmm4,%%xmm3\n\t" \ |
181 | /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \ |
182 | "punpcklqdq %%xmm4,%%xmm2\n\t" \ |
183 | /*xmm4 is free.*/ \ |
184 | "movdqa %%xmm5,%%xmm4\n\t" \ |
185 | /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \ |
186 | "punpckhqdq %%xmm7,%%xmm5\n\t" \ |
187 | /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \ |
188 | "punpcklqdq %%xmm7,%%xmm4\n\t" \ |
189 | /*xmm7 is free.*/ \ |
190 | "movdqa %%xmm6,%%xmm7\n\t" \ |
191 | /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \ |
192 | "punpcklqdq %%xmm0,%%xmm6\n\t" \ |
193 | /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \ |
194 | "punpckhqdq %%xmm0,%%xmm7\n\t" \ |
195 | /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0 (column 0 is reloaded from buf[1] last)*/ \ |
196 | "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm0\n\t" \ |
197 | |
198 | # endif |
199 | |
200 | /*Transpose 4 values in each of 8 MMX registers into 8 values in the first |
201 | four SSE registers: mm0...mm7 hold rows a...h, xmm0...xmm3 receive columns 0...3. |
202 | No need to be clever here; we have plenty of room (xmm4...xmm7 are scratch).*/ |
203 | # define OC_TRANSPOSE_8x4_MMX2SSE \ |
204 | "#OC_TRANSPOSE_8x4_MMX2SSE\n\t" \ |
205 | "movq2dq %%mm0,%%xmm0\n\t" \ |
206 | "movq2dq %%mm1,%%xmm1\n\t" \ |
207 | /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \ |
208 | "punpcklwd %%xmm1,%%xmm0\n\t" \ |
209 | "movq2dq %%mm2,%%xmm3\n\t" \ |
210 | "movq2dq %%mm3,%%xmm2\n\t" \ |
211 | /*xmm3 = d3 c3 d2 c2 d1 c1 d0 c0*/ \ |
212 | "punpcklwd %%xmm2,%%xmm3\n\t" \ |
213 | "movq2dq %%mm4,%%xmm4\n\t" \ |
214 | "movq2dq %%mm5,%%xmm5\n\t" \ |
215 | /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \ |
216 | "punpcklwd %%xmm5,%%xmm4\n\t" \ |
217 | "movq2dq %%mm6,%%xmm7\n\t" \ |
218 | "movq2dq %%mm7,%%xmm6\n\t" \ |
219 | /*xmm7 = h3 g3 h2 g2 h1 g1 h0 g0*/ \ |
220 | "punpcklwd %%xmm6,%%xmm7\n\t" \ |
221 | "movdqa %%xmm0,%%xmm2\n\t" \ |
222 | /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \ |
223 | "punpckldq %%xmm3,%%xmm0\n\t" \ |
224 | /*xmm2 = d3 c3 b3 a3 d2 c2 b2 a2*/ \ |
225 | "punpckhdq %%xmm3,%%xmm2\n\t" \ |
226 | "movdqa %%xmm4,%%xmm5\n\t" \ |
227 | /*xmm4 = h1 g1 f1 e1 h0 g0 f0 e0*/ \ |
228 | "punpckldq %%xmm7,%%xmm4\n\t" \ |
229 | /*xmm5 = h3 g3 f3 e3 h2 g2 f2 e2*/ \ |
230 | "punpckhdq %%xmm7,%%xmm5\n\t" \ |
231 | "movdqa %%xmm0,%%xmm1\n\t" \ |
232 | /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \ |
233 | "punpcklqdq %%xmm4,%%xmm0\n\t" \ |
234 | /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \ |
235 | "punpckhqdq %%xmm4,%%xmm1\n\t" \ |
236 | "movdqa %%xmm2,%%xmm3\n\t" \ |
237 | /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \ |
238 | "punpcklqdq %%xmm5,%%xmm2\n\t" \ |
239 | /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \ |
240 | "punpckhqdq %%xmm5,%%xmm3\n\t" \ |
241 | |
242 | #endif |
243 | |