1/********************************************************************
2 * *
3 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
4 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
5 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
7 * *
8 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
9 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
10 * *
11 ********************************************************************
12
13 function:
14 last mod: $Id$
15
16 ********************************************************************/
17
18/*MMX acceleration of complete fragment reconstruction algorithm.
19 Originally written by Rudolf Marek.*/
20#include <string.h>
21#include "x86int.h"
22#include "mmxloop.h"
23
24#if defined(OC_X86_ASM)
25
26void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
27 int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
28 unsigned char *dst;
29 ptrdiff_t frag_buf_off;
30 int ystride;
31 int refi;
32 /*Apply the inverse transform.*/
33 /*Special case only having a DC component.*/
34 if(_last_zzi<2){
35 /*Note that this value must be unsigned, to keep the __asm__ block from
36 sign-extending it when it puts it in a register.*/
37 ogg_uint16_t p;
38 int i;
39 /*We round this dequant product (and not any of the others) because there's
40 no iDCT rounding.*/
41 p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
42 /*Fill _dct_coeffs with p.*/
43 __asm__ __volatile__(
44 /*mm0=0000 0000 0000 AAAA*/
45 "movd %[p],%%mm0\n\t"
46 /*mm0=0000 0000 AAAA AAAA*/
47 "punpcklwd %%mm0,%%mm0\n\t"
48 /*mm0=AAAA AAAA AAAA AAAA*/
49 "punpckldq %%mm0,%%mm0\n\t"
50 :
51 :[p]"r"((unsigned)p)
52 );
53 for(i=0;i<4;i++){
54 __asm__ __volatile__(
55 "movq %%mm0,"OC_MEM_OFFS(0x00,y)"\n\t"
56 "movq %%mm0,"OC_MEM_OFFS(0x08,y)"\n\t"
57 "movq %%mm0,"OC_MEM_OFFS(0x10,y)"\n\t"
58 "movq %%mm0,"OC_MEM_OFFS(0x18,y)"\n\t"
59 :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_dct_coeffs+64+16*i,16)
60 );
61 }
62 }
63 else{
64 /*Dequantize the DC coefficient.*/
65 _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
66 oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi);
67 }
68 /*Fill in the target buffer.*/
69 frag_buf_off=_state->frag_buf_offs[_fragi];
70 refi=_state->frags[_fragi].refi;
71 ystride=_state->ref_ystride[_pli];
72 dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
73 if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
74 else{
75 const unsigned char *ref;
76 int mvoffsets[2];
77 ref=_state->ref_frame_data[refi]+frag_buf_off;
78 if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
79 _state->frag_mvs[_fragi])>1){
80 oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
81 _dct_coeffs+64);
82 }
83 else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
84 }
85}
86
/*We copy this entire function to inline the actual MMX routines so that we
   use only a single indirect call.*/
89
/*Initialize the bounding values used by the MMX loop filter.
  Only the first 8 bytes of _bv are written; each receives the low byte of
   _flimit.
  _bv:     The bounding values array to initialize.
  _flimit: The filter limit to replicate.*/
void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
  unsigned char fill;
  /*memset() stores its value as an unsigned char; make that explicit.*/
  fill=(unsigned char)_flimit;
  memset(_bv,fill,8);
}
93
94/*Apply the loop filter to a given set of fragment rows in the given plane.
95 The filter may be run on the bottom edge, affecting pixels in the next row of
96 fragments, so this row also needs to be available.
97 _bv: The bounding values array.
98 _refi: The index of the frame buffer to filter.
99 _pli: The color plane to filter.
100 _fragy0: The Y coordinate of the first fragment row to filter.
101 _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
102void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
103 signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
104 OC_ALIGN8(unsigned char ll[8]);
105 const oc_fragment_plane *fplane;
106 const oc_fragment *frags;
107 const ptrdiff_t *frag_buf_offs;
108 unsigned char *ref_frame_data;
109 ptrdiff_t fragi_top;
110 ptrdiff_t fragi_bot;
111 ptrdiff_t fragi0;
112 ptrdiff_t fragi0_end;
113 int ystride;
114 int nhfrags;
115 memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll));
116 fplane=_state->fplanes+_pli;
117 nhfrags=fplane->nhfrags;
118 fragi_top=fplane->froffset;
119 fragi_bot=fragi_top+fplane->nfrags;
120 fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
121 fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
122 ystride=_state->ref_ystride[_pli];
123 frags=_state->frags;
124 frag_buf_offs=_state->frag_buf_offs;
125 ref_frame_data=_state->ref_frame_data[_refi];
126 /*The following loops are constructed somewhat non-intuitively on purpose.
127 The main idea is: if a block boundary has at least one coded fragment on
128 it, the filter is applied to it.
129 However, the order that the filters are applied in matters, and VP3 chose
130 the somewhat strange ordering used below.*/
131 while(fragi0<fragi0_end){
132 ptrdiff_t fragi;
133 ptrdiff_t fragi_end;
134 fragi=fragi0;
135 fragi_end=fragi+nhfrags;
136 while(fragi<fragi_end){
137 if(frags[fragi].coded){
138 unsigned char *ref;
139 ref=ref_frame_data+frag_buf_offs[fragi];
140 if(fragi>fragi0){
141 OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
142 }
143 if(fragi0>fragi_top){
144 OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
145 }
146 if(fragi+1<fragi_end&&!frags[fragi+1].coded){
147 OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref+8,ystride,ll);
148 }
149 if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
150 OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref+(ystride<<3),ystride,ll);
151 }
152 }
153 fragi++;
154 }
155 fragi0+=nhfrags;
156 }
157}
158
/*Initialize the bounding values used by the MMXEXT loop filter.
  Only the first 8 bytes of _bv are written; each receives the low byte of the
   bitwise complement of twice _flimit.
  _bv:     The bounding values array to initialize.
  _flimit: The filter limit the stored value is derived from.*/
void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit){
  int doubled;
  doubled=_flimit<<1;
  memset(_bv,~doubled,8);
}
162
163/*Apply the loop filter to a given set of fragment rows in the given plane.
164 The filter may be run on the bottom edge, affecting pixels in the next row of
165 fragments, so this row also needs to be available.
166 _bv: The bounding values array.
167 _refi: The index of the frame buffer to filter.
168 _pli: The color plane to filter.
169 _fragy0: The Y coordinate of the first fragment row to filter.
170 _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
171void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
172 signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
173 const oc_fragment_plane *fplane;
174 const oc_fragment *frags;
175 const ptrdiff_t *frag_buf_offs;
176 unsigned char *ref_frame_data;
177 ptrdiff_t fragi_top;
178 ptrdiff_t fragi_bot;
179 ptrdiff_t fragi0;
180 ptrdiff_t fragi0_end;
181 int ystride;
182 int nhfrags;
183 fplane=_state->fplanes+_pli;
184 nhfrags=fplane->nhfrags;
185 fragi_top=fplane->froffset;
186 fragi_bot=fragi_top+fplane->nfrags;
187 fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
188 fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
189 ystride=_state->ref_ystride[_pli];
190 frags=_state->frags;
191 frag_buf_offs=_state->frag_buf_offs;
192 ref_frame_data=_state->ref_frame_data[_refi];
193 /*The following loops are constructed somewhat non-intuitively on purpose.
194 The main idea is: if a block boundary has at least one coded fragment on
195 it, the filter is applied to it.
196 However, the order that the filters are applied in matters, and VP3 chose
197 the somewhat strange ordering used below.*/
198 while(fragi0<fragi0_end){
199 ptrdiff_t fragi;
200 ptrdiff_t fragi_end;
201 fragi=fragi0;
202 fragi_end=fragi+nhfrags;
203 while(fragi<fragi_end){
204 if(frags[fragi].coded){
205 unsigned char *ref;
206 ref=ref_frame_data+frag_buf_offs[fragi];
207 if(fragi>fragi0){
208 OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
209 }
210 if(fragi0>fragi_top){
211 OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
212 }
213 if(fragi+1<fragi_end&&!frags[fragi+1].coded){
214 OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref+8,ystride,_bv);
215 }
216 if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
217 OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref+(ystride<<3),ystride,_bv);
218 }
219 }
220 fragi++;
221 }
222 fragi0+=nhfrags;
223 }
224}
225
226#endif
227