/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
 *                                                                  *
 ********************************************************************

  function:
    last mod: $Id$

 ********************************************************************/

/*MMX acceleration of fragment reconstruction for motion compensation.
  Originally written by Rudolf Marek.
  Additional optimization by Nils Pipenbrinck.
  Note: Loops are unrolled for best performance.
  The iteration each instruction belongs to is marked in the comments as #i.*/
#include <stddef.h>
#include "x86int.h"

#if defined(OC_X86_ASM)

/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.*/
# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
  do{ \
    const unsigned char *src; \
    unsigned char *dst; \
    ptrdiff_t ystride3; \
    src=(_src); \
    dst=(_dst); \
    __asm__ __volatile__( \
      /*src+0*ystride*/ \
      "movq (%[src]),%%mm0\n\t" \
      /*src+1*ystride*/ \
      "movq (%[src],%[ystride]),%%mm1\n\t" \
      /*ystride3=ystride*3*/ \
      "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
      /*src+2*ystride*/ \
      "movq (%[src],%[ystride],2),%%mm2\n\t" \
      /*src+3*ystride*/ \
      "movq (%[src],%[ystride3]),%%mm3\n\t" \
      /*dst+0*ystride*/ \
      "movq %%mm0,(%[dst])\n\t" \
      /*dst+1*ystride*/ \
      "movq %%mm1,(%[dst],%[ystride])\n\t" \
      /*Advance src to the next 4 rows.*/ \
      "lea (%[src],%[ystride],4),%[src]\n\t" \
      /*dst+2*ystride*/ \
      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
      /*dst+3*ystride*/ \
      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
      /*Advance dst to the next 4 rows.*/ \
      "lea (%[dst],%[ystride],4),%[dst]\n\t" \
      /*src+0*ystride*/ \
      "movq (%[src]),%%mm0\n\t" \
      /*src+1*ystride*/ \
      "movq (%[src],%[ystride]),%%mm1\n\t" \
      /*src+2*ystride*/ \
      "movq (%[src],%[ystride],2),%%mm2\n\t" \
      /*src+3*ystride*/ \
      "movq (%[src],%[ystride3]),%%mm3\n\t" \
      /*dst+0*ystride*/ \
      "movq %%mm0,(%[dst])\n\t" \
      /*dst+1*ystride*/ \
      "movq %%mm1,(%[dst],%[ystride])\n\t" \
      /*dst+2*ystride*/ \
      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
      /*dst+3*ystride*/ \
      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
      :[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \
      :[ystride]"r"((ptrdiff_t)(_ystride)) \
      :"memory" \
    ); \
  } \
  while(0)

/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.*/
void oc_frag_copy_mmx(unsigned char *_dst,
 const unsigned char *_src,int _ystride){
  OC_FRAG_COPY_MMX(_dst,_src,_ystride);
}
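
/*A portable scalar sketch of the same copy, for illustration only: the
   function name is hypothetical, the block is not compiled, and it is not
   part of the original source.*/
#if 0
static void oc_frag_copy_c_sketch(unsigned char *_dst,
 const unsigned char *_src,int _ystride){
  int i;
  int j;
  /*An 8x8 fragment is eight 8-byte rows spaced _ystride bytes apart.*/
  for(i=0;i<8;i++){
    for(j=0;j<8;j++)_dst[j]=_src[j];
    _dst+=_ystride;
    _src+=_ystride;
  }
}
#endif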

/*Copies the fragments specified by the lists of fragment indices from one
   frame to another.
  _dst_frame:     The reference frame to copy to.
  _src_frame:     The reference frame to copy from.
  _ystride:       The row stride of the reference frames.
  _fragis:        A pointer to a list of fragment indices.
  _nfragis:       The number of fragment indices to copy.
  _frag_buf_offs: The offsets of fragments in the reference frames.*/
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
 const unsigned char *_src_frame,int _ystride,
 const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
  ptrdiff_t fragii;
  for(fragii=0;fragii<_nfragis;fragii++){
    ptrdiff_t frag_buf_off;
    frag_buf_off=_frag_buf_offs[_fragis[fragii]];
    OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
     _src_frame+frag_buf_off,_ystride);
  }
}

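
/*Reconstructs an intra-coded fragment: biases each of the 64 residue values
   by 128 and packs the results to _dst with unsigned saturation, so each
   output pixel is clamp(residue+128,0,255).
  The 0x0080 bias words are built in mm0 without a memory load: pcmpeqw sets
   all ones, psllw $15 leaves 0x8000 in each word, and psrlw $8 leaves
   0x0080.*/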
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue){
  __asm__ __volatile__(
    /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
    "pcmpeqw %%mm0,%%mm0\n\t"
    /*#0 Load low residue.*/
    "movq 0*8(%[residue]),%%mm1\n\t"
    /*#0 Load high residue.*/
    "movq 1*8(%[residue]),%%mm2\n\t"
    /*Set mm0 to 0x8000800080008000.*/
    "psllw $15,%%mm0\n\t"
    /*#1 Load low residue.*/
    "movq 2*8(%[residue]),%%mm3\n\t"
    /*#1 Load high residue.*/
    "movq 3*8(%[residue]),%%mm4\n\t"
    /*Set mm0 to 0x0080008000800080.*/
    "psrlw $8,%%mm0\n\t"
    /*#2 Load low residue.*/
    "movq 4*8(%[residue]),%%mm5\n\t"
    /*#2 Load high residue.*/
    "movq 5*8(%[residue]),%%mm6\n\t"
    /*#0 Bias low residue.*/
    "paddsw %%mm0,%%mm1\n\t"
    /*#0 Bias high residue.*/
    "paddsw %%mm0,%%mm2\n\t"
    /*#0 Pack to byte.*/
    "packuswb %%mm2,%%mm1\n\t"
    /*#1 Bias low residue.*/
    "paddsw %%mm0,%%mm3\n\t"
    /*#1 Bias high residue.*/
    "paddsw %%mm0,%%mm4\n\t"
    /*#1 Pack to byte.*/
    "packuswb %%mm4,%%mm3\n\t"
    /*#2 Bias low residue.*/
    "paddsw %%mm0,%%mm5\n\t"
    /*#2 Bias high residue.*/
    "paddsw %%mm0,%%mm6\n\t"
    /*#2 Pack to byte.*/
    "packuswb %%mm6,%%mm5\n\t"
    /*#0 Write row.*/
    "movq %%mm1,(%[dst])\n\t"
    /*#1 Write row.*/
    "movq %%mm3,(%[dst],%[ystride])\n\t"
    /*#2 Write row.*/
    "movq %%mm5,(%[dst],%[ystride],2)\n\t"
    /*#3 Load low residue.*/
    "movq 6*8(%[residue]),%%mm1\n\t"
    /*#3 Load high residue.*/
    "movq 7*8(%[residue]),%%mm2\n\t"
    /*#4 Load low residue.*/
    "movq 8*8(%[residue]),%%mm3\n\t"
    /*#4 Load high residue.*/
    "movq 9*8(%[residue]),%%mm4\n\t"
    /*#5 Load low residue.*/
    "movq 10*8(%[residue]),%%mm5\n\t"
    /*#5 Load high residue.*/
    "movq 11*8(%[residue]),%%mm6\n\t"
    /*#3 Bias low residue.*/
    "paddsw %%mm0,%%mm1\n\t"
    /*#3 Bias high residue.*/
    "paddsw %%mm0,%%mm2\n\t"
    /*#3 Pack to byte.*/
    "packuswb %%mm2,%%mm1\n\t"
    /*#4 Bias low residue.*/
    "paddsw %%mm0,%%mm3\n\t"
    /*#4 Bias high residue.*/
    "paddsw %%mm0,%%mm4\n\t"
    /*#4 Pack to byte.*/
    "packuswb %%mm4,%%mm3\n\t"
    /*#5 Bias low residue.*/
    "paddsw %%mm0,%%mm5\n\t"
    /*#5 Bias high residue.*/
    "paddsw %%mm0,%%mm6\n\t"
    /*#5 Pack to byte.*/
    "packuswb %%mm6,%%mm5\n\t"
    /*#3 Write row.*/
    "movq %%mm1,(%[dst],%[ystride3])\n\t"
    /*#4 Write row.*/
    "movq %%mm3,(%[dst4])\n\t"
    /*#5 Write row.*/
    "movq %%mm5,(%[dst4],%[ystride])\n\t"
    /*#6 Load low residue.*/
    "movq 12*8(%[residue]),%%mm1\n\t"
    /*#6 Load high residue.*/
    "movq 13*8(%[residue]),%%mm2\n\t"
    /*#7 Load low residue.*/
    "movq 14*8(%[residue]),%%mm3\n\t"
    /*#7 Load high residue.*/
    "movq 15*8(%[residue]),%%mm4\n\t"
    /*#6 Bias low residue.*/
    "paddsw %%mm0,%%mm1\n\t"
    /*#6 Bias high residue.*/
    "paddsw %%mm0,%%mm2\n\t"
    /*#6 Pack to byte.*/
    "packuswb %%mm2,%%mm1\n\t"
    /*#7 Bias low residue.*/
    "paddsw %%mm0,%%mm3\n\t"
    /*#7 Bias high residue.*/
    "paddsw %%mm0,%%mm4\n\t"
    /*#7 Pack to byte.*/
    "packuswb %%mm4,%%mm3\n\t"
    /*#6 Write row.*/
    "movq %%mm1,(%[dst4],%[ystride],2)\n\t"
    /*#7 Write row.*/
    "movq %%mm3,(%[dst4],%[ystride3])\n\t"
    :
    :[residue]"r"(_residue),
     [dst]"r"(_dst),
     [dst4]"r"(_dst+(_ystride<<2)),
     [ystride]"r"((ptrdiff_t)_ystride),
     [ystride3]"r"((ptrdiff_t)_ystride*3)
    :"memory"
  );
}
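
/*A scalar sketch of the reconstruction above, for illustration only: the
   function name is hypothetical, the block is not compiled, and it is not
   part of the original source.*/
#if 0
static void oc_frag_recon_intra_c_sketch(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int p;
      /*Bias by 128 and clamp to [0,255], as the paddsw/packuswb pairs do
         above (paddsw's signed saturation is ignored here; in-range residues
         never trigger it).*/
      p=_residue[i*8+j]+128;
      _dst[j]=(unsigned char)(p<0?0:p>255?255:p);
    }
    _dst+=_ystride;
  }
}
#endif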
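/*Reconstructs an inter-coded fragment: adds the residue to the motion
   compensated prediction in _src and packs the result to _dst with unsigned
   saturation, so each output pixel is clamp(src+residue,0,255).
  Each iteration of the loop below processes two rows.*/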
void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
 int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm0.*/
  __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
  for(i=4;i-->0;){
    __asm__ __volatile__(
      /*#0 Load source.*/
      "movq (%[src]),%%mm3\n\t"
      /*#1 Load source.*/
      "movq (%[src],%[ystride]),%%mm7\n\t"
      /*#0 Get copy of src.*/
      "movq %%mm3,%%mm4\n\t"
      /*#0 Expand high source.*/
      "punpckhbw %%mm0,%%mm4\n\t"
      /*#0 Expand low source.*/
      "punpcklbw %%mm0,%%mm3\n\t"
      /*#0 Add residue high.*/
      "paddsw 8(%[residue]),%%mm4\n\t"
      /*#1 Get copy of src.*/
      "movq %%mm7,%%mm2\n\t"
      /*#0 Add residue low.*/
      "paddsw (%[residue]),%%mm3\n\t"
      /*#1 Expand high source.*/
      "punpckhbw %%mm0,%%mm2\n\t"
      /*#0 Pack final row pixels.*/
      "packuswb %%mm4,%%mm3\n\t"
      /*#1 Expand low source.*/
      "punpcklbw %%mm0,%%mm7\n\t"
      /*#1 Add residue low.*/
      "paddsw 16(%[residue]),%%mm7\n\t"
      /*#1 Add residue high.*/
      "paddsw 24(%[residue]),%%mm2\n\t"
      /*Advance residue.*/
      "lea 32(%[residue]),%[residue]\n\t"
      /*#1 Pack final row pixels.*/
      "packuswb %%mm2,%%mm7\n\t"
      /*Advance src.*/
      "lea (%[src],%[ystride],2),%[src]\n\t"
      /*#0 Write row.*/
      "movq %%mm3,(%[dst])\n\t"
      /*#1 Write row.*/
      "movq %%mm7,(%[dst],%[ystride])\n\t"
      /*Advance dst.*/
      "lea (%[dst],%[ystride],2),%[dst]\n\t"
      :[residue]"+r"(_residue),[dst]"+r"(_dst),[src]"+r"(_src)
      :[ystride]"r"((ptrdiff_t)_ystride)
      :"memory"
    );
  }
}
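
/*A scalar sketch of the reconstruction above, for illustration only: the
   function name is hypothetical, the block is not compiled, and it is not
   part of the original source.*/
#if 0
static void oc_frag_recon_inter_c_sketch(unsigned char *_dst,
 const unsigned char *_src,int _ystride,const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int p;
      /*Prediction plus residue, clamped to [0,255].*/
      p=_src[j]+_residue[i*8+j];
      _dst[j]=(unsigned char)(p<0?0:p>255?255:p);
    }
    _dst+=_ystride;
    _src+=_ystride;
  }
}
#endif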
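/*Reconstructs an inter-coded fragment predicted from two references:
   averages _src1 and _src2 with truncation, adds the residue, and packs the
   result to _dst with unsigned saturation, so each output pixel is
   clamp(((src1+src2)>>1)+residue,0,255).*/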
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
 const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm7.*/
  __asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::);
  for(i=4;i-->0;){
    __asm__ __volatile__(
      /*#0 Load src1.*/
      "movq (%[src1]),%%mm0\n\t"
      /*#0 Load src2.*/
      "movq (%[src2]),%%mm2\n\t"
      /*#0 Copy src1.*/
      "movq %%mm0,%%mm1\n\t"
      /*#0 Copy src2.*/
      "movq %%mm2,%%mm3\n\t"
      /*#1 Load src1.*/
      "movq (%[src1],%[ystride]),%%mm4\n\t"
      /*#0 Unpack lower src1.*/
      "punpcklbw %%mm7,%%mm0\n\t"
      /*#1 Load src2.*/
      "movq (%[src2],%[ystride]),%%mm5\n\t"
      /*#0 Unpack higher src1.*/
      "punpckhbw %%mm7,%%mm1\n\t"
      /*#0 Unpack lower src2.*/
      "punpcklbw %%mm7,%%mm2\n\t"
      /*#0 Unpack higher src2.*/
      "punpckhbw %%mm7,%%mm3\n\t"
      /*Advance src1 ptr.*/
      "lea (%[src1],%[ystride],2),%[src1]\n\t"
      /*Advance src2 ptr.*/
      "lea (%[src2],%[ystride],2),%[src2]\n\t"
      /*#0 Lower src1+src2.*/
      "paddsw %%mm2,%%mm0\n\t"
      /*#0 Higher src1+src2.*/
      "paddsw %%mm3,%%mm1\n\t"
      /*#1 Copy src1.*/
      "movq %%mm4,%%mm2\n\t"
      /*#0 Build lo average.*/
      "psraw $1,%%mm0\n\t"
      /*#1 Copy src2.*/
      "movq %%mm5,%%mm3\n\t"
      /*#1 Unpack lower src1.*/
      "punpcklbw %%mm7,%%mm4\n\t"
      /*#0 Build hi average.*/
      "psraw $1,%%mm1\n\t"
      /*#1 Unpack higher src1.*/
      "punpckhbw %%mm7,%%mm2\n\t"
      /*#0 low+=residue.*/
      "paddsw (%[residue]),%%mm0\n\t"
      /*#1 Unpack lower src2.*/
      "punpcklbw %%mm7,%%mm5\n\t"
      /*#0 high+=residue.*/
      "paddsw 8(%[residue]),%%mm1\n\t"
      /*#1 Unpack higher src2.*/
      "punpckhbw %%mm7,%%mm3\n\t"
      /*#1 Lower src1+src2.*/
      "paddsw %%mm4,%%mm5\n\t"
      /*#0 Pack and saturate.*/
      "packuswb %%mm1,%%mm0\n\t"
      /*#1 Higher src1+src2.*/
      "paddsw %%mm2,%%mm3\n\t"
      /*#0 Write row.*/
      "movq %%mm0,(%[dst])\n\t"
      /*#1 Build lo average.*/
      "psraw $1,%%mm5\n\t"
      /*#1 Build hi average.*/
      "psraw $1,%%mm3\n\t"
      /*#1 low+=residue.*/
      "paddsw 16(%[residue]),%%mm5\n\t"
      /*#1 high+=residue.*/
      "paddsw 24(%[residue]),%%mm3\n\t"
      /*#1 Pack and saturate.*/
      "packuswb %%mm3,%%mm5\n\t"
      /*#1 Write row.*/
      "movq %%mm5,(%[dst],%[ystride])\n\t"
      /*Advance residue ptr.*/
      "add $32,%[residue]\n\t"
      /*Advance dest ptr.*/
      "lea (%[dst],%[ystride],2),%[dst]\n\t"
      :[dst]"+r"(_dst),[residue]"+r"(_residue),
       [src1]"+r"(_src1),[src2]"+r"(_src2)
      :[ystride]"r"((ptrdiff_t)_ystride)
      :"memory"
    );
  }
}
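
/*A scalar sketch of the reconstruction above, for illustration only: the
   function name is hypothetical, the block is not compiled, and it is not
   part of the original source.*/
#if 0
static void oc_frag_recon_inter2_c_sketch(unsigned char *_dst,
 const unsigned char *_src1,const unsigned char *_src2,int _ystride,
 const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int p;
      /*Truncating average of the two predictors, as psraw $1 computes
         above, then the residue, clamped to [0,255].*/
      p=((_src1[j]+_src2[j])>>1)+_residue[i*8+j];
      _dst[j]=(unsigned char)(p<0?0:p>255?255:p);
    }
    _dst+=_ystride;
    _src1+=_ystride;
    _src2+=_ystride;
  }
}
#endif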
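/*Issues EMMS to clear the MMX state, so that subsequent x87 floating-point
   code sees an empty register stack and a valid tag word.*/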
void oc_restore_fpu_mmx(void){
  __asm__ __volatile__("emms\n\t");
}
#endif