mmxloop.h source code [Godot/thirdparty/libtheora/x86/mmxloop.h]

1	#if !defined(_x86_mmxloop_H)
2	# define _x86_mmxloop_H (1)
3	# include <stddef.h>
4	# include "x86int.h"
5
6	#if defined(OC_X86_ASM)
7
/*Loop filter core for eight pixels at once, using only baseline MMX
   instructions.
  This macro expands to a sequence of string literals and is meant to be pasted
   into a GCC extended asm statement that supplies a named operand [ll].
  Eight bytes are loaded from (%[ll]); each byte is used as the limit L (the
   code below derives 255-2*L from it).
  On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}.
  On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
   mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0 and mm3 are clobbered.
  mm4 through mm7 are also overwritten as scratch.*/
#define OC_LOOP_FILTER8_MMX \
  "#OC_LOOP_FILTER8_MMX\n\t" \
  /*mm7=0*/ \
  "pxor %%mm7,%%mm7\n\t" \
  /*mm6:mm0={a0,...,a7}*/ \
  "movq %%mm0,%%mm6\n\t" \
  "punpcklbw %%mm7,%%mm0\n\t" \
  "punpckhbw %%mm7,%%mm6\n\t" \
  /*mm3:mm5={d0,...,d7}*/ \
  "movq %%mm3,%%mm5\n\t" \
  "punpcklbw %%mm7,%%mm3\n\t" \
  "punpckhbw %%mm7,%%mm5\n\t" \
  /*mm6:mm0={a0-d0,...,a7-d7}*/ \
  "psubw %%mm3,%%mm0\n\t" \
  "psubw %%mm5,%%mm6\n\t" \
  /*mm3:mm1={b0,...,b7}*/ \
  "movq %%mm1,%%mm3\n\t" \
  "punpcklbw %%mm7,%%mm1\n\t" \
  "movq %%mm2,%%mm4\n\t" \
  "punpckhbw %%mm7,%%mm3\n\t" \
  /*mm5:mm4={c0,...,c7}*/ \
  "movq %%mm2,%%mm5\n\t" \
  "punpcklbw %%mm7,%%mm4\n\t" \
  "punpckhbw %%mm7,%%mm5\n\t" \
  /*mm7={3}x4 \
    mm5:mm4={c0-b0,...,c7-b7}*/ \
  "pcmpeqw %%mm7,%%mm7\n\t" \
  "psubw %%mm1,%%mm4\n\t" \
  "psrlw $14,%%mm7\n\t" \
  "psubw %%mm3,%%mm5\n\t" \
  /*Scale by 3.*/ \
  "pmullw %%mm7,%%mm4\n\t" \
  "pmullw %%mm7,%%mm5\n\t" \
  /*mm7={4}x4 \
    mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \
  "psrlw $1,%%mm7\n\t" \
  "paddw %%mm0,%%mm4\n\t" \
  "psllw $2,%%mm7\n\t" \
  "movq (%[ll]),%%mm0\n\t" \
  "paddw %%mm6,%%mm5\n\t" \
  /*R_i has the range [-127,128], so we compute -R_i instead. \
    mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \
  "psubw %%mm7,%%mm4\n\t" \
  "psubw %%mm7,%%mm5\n\t" \
  "psraw $3,%%mm4\n\t" \
  "psraw $3,%%mm5\n\t" \
  "pcmpeqb %%mm7,%%mm7\n\t" \
  "packsswb %%mm5,%%mm4\n\t" \
  "pxor %%mm6,%%mm6\n\t" \
  "pxor %%mm7,%%mm4\n\t" \
  /*Re-pack the unmodified b values back into bytes in mm1.*/ \
  "packuswb %%mm3,%%mm1\n\t" \
  /*Now compute lflim of -mm4 cf. Section 7.10 of the spec.*/ \
  /*There's no unsigned byte+signed byte with unsigned saturation op code, so \
     we have to split things by sign (the other option is to work in 16 bits, \
     but working in 8 bits gives much better parallelism). \
    We compute abs(R_i), but save a mask of which terms were negative in mm6. \
    Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \
    Finally, we split mm4 into positive and negative pieces using the mask in \
     mm6, and add and subtract them as appropriate.*/ \
  /*mm4=abs(-R_i)*/ \
  /*mm7=255-2*L*/ \
  "pcmpgtb %%mm4,%%mm6\n\t" \
  "psubb %%mm0,%%mm7\n\t" \
  "pxor %%mm6,%%mm4\n\t" \
  "psubb %%mm0,%%mm7\n\t" \
  "psubb %%mm6,%%mm4\n\t" \
  /*mm7=255-max(2*L-abs(R_i),0)*/ \
  "paddusb %%mm4,%%mm7\n\t" \
  /*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \
  "paddusb %%mm7,%%mm4\n\t" \
  "psubusb %%mm7,%%mm4\n\t" \
  /*Now split mm4 by the original sign of -R_i.*/ \
  "movq %%mm4,%%mm5\n\t" \
  "pand %%mm6,%%mm4\n\t" \
  "pandn %%mm5,%%mm6\n\t" \
  /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
  /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
  "paddusb %%mm4,%%mm1\n\t" \
  "psubusb %%mm4,%%mm2\n\t" \
  "psubusb %%mm6,%%mm1\n\t" \
  "paddusb %%mm6,%%mm2\n\t"
92
/*Loop filter core for eight pixels at once, using the pavgb instruction so
   that all arithmetic stays in 8-bit lanes.
  This macro expands to a sequence of string literals and is meant to be pasted
   into a GCC extended asm statement that supplies a named operand [ll].
  Eight bytes are loaded from (%[ll]) and used directly as 255-2*L per byte
   (note this differs from OC_LOOP_FILTER8_MMX, which loads L itself).
  On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}.
  On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
   mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}.
  All other MMX registers are clobbered.*/
#define OC_LOOP_FILTER8_MMXEXT \
  "#OC_LOOP_FILTER8_MMXEXT\n\t" \
  /*R_i=(a_i-3*b_i+3*c_i-d_i+4>>3) has the range [-127,128], so we compute \
     -R_i=(-a_i+3*b_i-3*c_i+d_i+3>>3) instead.*/ \
  /*This first part is based on the transformation \
      f = -(3*(c-b)+a-d+4>>3) \
        = -(3*(c+255-b)+(a+255-d)+4-1020>>3) \
        = -(3*(c+~b)+(a+~d)-1016>>3) \
        = 127-(3*(c+~b)+(a+~d)>>3) \
        = 128+~(3*(c+~b)+(a+~d)>>3) (mod 256). \
    Although pavgb(a,b) = (a+b+1>>1) (biased up), we rely heavily on the \
     fact that ~pavgb(~a,~b) = (a+b>>1) (biased down). \
    Using this, the last expression above can be computed in 8 bits of working \
     precision via: \
      u = ~pavgb(~b,c); \
      v = pavgb(b,~c); \
      This mask is 0 or 0xFF, and controls whether t is biased up or down: \
      m = u-v; \
      t = m^pavgb(m^~a,m^d); \
      f = 128+pavgb(pavgb(t,u),v); \
    This required some careful analysis to ensure that carries are propagated \
     correctly in all cases, but has been checked exhaustively.*/ \
  /*The tuples in the comments below list the current contents of \
     (mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7).*/ \
  /*input (a, b, c, d, ., ., ., .)*/ \
  /*ff=0xFF; \
    u=b; \
    v=c; \
    ll=255-2*L;*/ \
  "pcmpeqb %%mm7,%%mm7\n\t" \
  "movq %%mm1,%%mm4\n\t" \
  "movq %%mm2,%%mm5\n\t" \
  "movq (%[ll]),%%mm6\n\t" \
  /*allocated u, v, ll, ff: (a, b, c, d, u, v, ll, ff)*/ \
  /*u^=ff; \
    v^=ff;*/ \
  "pxor %%mm7,%%mm4\n\t" \
  "pxor %%mm7,%%mm5\n\t" \
  /*allocated ll: (a, b, c, d, u, v, ll, ff)*/ \
  /*u=pavgb(u,c); \
    v=pavgb(v,b);*/ \
  "pavgb %%mm2,%%mm4\n\t" \
  "pavgb %%mm1,%%mm5\n\t" \
  /*u^=ff; \
    a^=ff;*/ \
  "pxor %%mm7,%%mm4\n\t" \
  "pxor %%mm7,%%mm0\n\t" \
  /*m=u-v;*/ \
  "psubb %%mm5,%%mm4\n\t" \
  /*freed u, allocated m: (a, b, c, d, m, v, ll, ff)*/ \
  /*a^=m; \
    d^=m;*/ \
  "pxor %%mm4,%%mm0\n\t" \
  "pxor %%mm4,%%mm3\n\t" \
  /*t=pavgb(a,d);*/ \
  "pavgb %%mm3,%%mm0\n\t" \
  /*Start building the 128 offset: each word of mm7 becomes 0xFF80.*/ \
  "psllw $7,%%mm7\n\t" \
  /*freed a, d, ff, allocated t, of: (t, b, c, ., m, v, ll, of)*/ \
  /*t^=m; \
    u=m+v;*/ \
  "pxor %%mm4,%%mm0\n\t" \
  "paddb %%mm5,%%mm4\n\t" \
  /*freed t, m, allocated f, u: (f, b, c, ., u, v, ll, of)*/ \
  /*f=pavgb(f,u); \
    of=128;*/ \
  "pavgb %%mm4,%%mm0\n\t" \
  "packsswb %%mm7,%%mm7\n\t" \
  /*freed u, ff, allocated ll: (f, b, c, ., ll, v, ll, of)*/ \
  /*f=pavgb(f,v);*/ \
  "pavgb %%mm5,%%mm0\n\t" \
  "movq %%mm7,%%mm3\n\t" \
  "movq %%mm6,%%mm4\n\t" \
  /*freed v, allocated of: (f, b, c, of, ll, ., ll, of)*/ \
  /*Now compute lflim of R_i=-(128+mm0) cf. Section 7.10 of the spec.*/ \
  /*There's no unsigned byte+signed byte with unsigned saturation op code, so \
     we have to split things by sign (the other option is to work in 16 bits, \
     but staying in 8 bits gives much better parallelism).*/ \
  /*Instead of adding the offset of 128 in mm3, we use it to split mm0. \
    This is the same number of instructions as computing a mask and splitting \
     after the lflim computation, but has shorter dependency chains.*/ \
  /*mm0=R_i<0?-R_i:0 (denoted abs(R_i<0)) \
    mm3=R_i>0?R_i:0 (denoted abs(R_i>0))*/ \
  "psubusb %%mm0,%%mm3\n\t" \
  "psubusb %%mm7,%%mm0\n\t" \
  /*mm6=255-max(2*L-abs(R_i<0),0) \
    mm4=255-max(2*L-abs(R_i>0),0)*/ \
  "paddusb %%mm3,%%mm4\n\t" \
  "paddusb %%mm0,%%mm6\n\t" \
  /*mm0=min(abs(R_i<0),max(2*L-abs(R_i<0),0)) \
    mm3=min(abs(R_i>0),max(2*L-abs(R_i>0),0))*/ \
  "paddusb %%mm4,%%mm3\n\t" \
  "paddusb %%mm6,%%mm0\n\t" \
  "psubusb %%mm4,%%mm3\n\t" \
  "psubusb %%mm6,%%mm0\n\t" \
  /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
  /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
  "paddusb %%mm3,%%mm1\n\t" \
  "psubusb %%mm3,%%mm2\n\t" \
  "psubusb %%mm0,%%mm1\n\t" \
  "paddusb %%mm0,%%mm2\n\t"
195
/*Runs an 8-pixel loop filter across four vertically adjacent rows.
  _filter:  A sequence of asm string literals (e.g. OC_LOOP_FILTER8_MMX) that
             takes rows a, b, c, d in mm0..mm3 and leaves the new b and c rows
             in mm1 and mm2.
  _pix:     Pointer to row c; rows a, b and d are read from _pix-2*_ystride,
             _pix-_ystride and _pix+_ystride.
  _ystride: Distance in bytes between rows.
  _ll:      Pointer made available to _filter as the asm operand [ll].
  Eight bytes are loaded from each of the four rows, and eight bytes are
   written back to each of rows b and c.
  No emms is issued here, and the MMX registers are not listed as clobbers.*/
#define OC_LOOP_FILTER_V(_filter,_pix,_ystride,_ll) \
  do{ \
    ptrdiff_t ystride3__; \
    __asm__ __volatile__( \
      /*mm0={a0,...,a7}*/ \
      "movq (%[pix]),%%mm0\n\t" \
      /*ystride3=_ystride*3*/ \
      "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
      /*mm3={d0,...,d7}*/ \
      "movq (%[pix],%[ystride3]),%%mm3\n\t" \
      /*mm1={b0,...,b7}*/ \
      "movq (%[pix],%[ystride]),%%mm1\n\t" \
      /*mm2={c0,...,c7}*/ \
      "movq (%[pix],%[ystride],2),%%mm2\n\t" \
      _filter \
      /*Write it back out.*/ \
      "movq %%mm1,(%[pix],%[ystride])\n\t" \
      "movq %%mm2,(%[pix],%[ystride],2)\n\t" \
      :[ystride3]"=&r"(ystride3__) \
      /*The macro arguments are parenthesized so that expression arguments \
         (e.g. a stride of s+t) still yield the address of row a.*/ \
      :[pix]"r"((_pix)-(_ystride)*2),[ystride]"r"((ptrdiff_t)(_ystride)), \
       [ll]"r"(_ll) \
      :"memory" \
    ); \
  } \
  while(0)
221
/*Runs an 8-pixel loop filter across four horizontally adjacent columns of
   eight consecutive rows.
  _filter:  A sequence of asm string literals (e.g. OC_LOOP_FILTER8_MMX) that
             takes columns a, b, c, d in mm0..mm3 and leaves the new b and c
             columns in mm1 and mm2.
  _pix:     Pointer to the c pixel of the first row; each row supplies the four
             bytes a, b, c, d starting at _pix-2.
  _ystride: Distance in bytes between rows.
  _ll:      Pointer made available to _filter as the asm operand [ll].
  The 8x4 block of pixels is transposed into registers, filtered, and then the
   b and c bytes of each row are written back as one 16-bit store at offset 1
   from the start of the row's four bytes.
  No emms is issued here, and the MMX registers are not listed as clobbers.*/
#define OC_LOOP_FILTER_H(_filter,_pix,_ystride,_ll) \
  do{ \
    unsigned char *pix__; \
    ptrdiff_t      ystride3__; \
    ptrdiff_t      d__; \
    pix__=(_pix)-2; \
    __asm__ __volatile__( \
      /*x x x x d0 c0 b0 a0*/ \
      "movd (%[pix]),%%mm0\n\t" \
      /*x x x x d1 c1 b1 a1*/ \
      "movd (%[pix],%[ystride]),%%mm1\n\t" \
      /*ystride3=_ystride*3*/ \
      "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
      /*x x x x d2 c2 b2 a2*/ \
      "movd (%[pix],%[ystride],2),%%mm2\n\t" \
      /*[d] temporarily holds the address of row 4.*/ \
      "lea (%[pix],%[ystride],4),%[d]\n\t" \
      /*x x x x d3 c3 b3 a3*/ \
      "movd (%[pix],%[ystride3]),%%mm3\n\t" \
      /*x x x x d4 c4 b4 a4*/ \
      "movd (%[d]),%%mm4\n\t" \
      /*x x x x d5 c5 b5 a5*/ \
      "movd (%[d],%[ystride]),%%mm5\n\t" \
      /*x x x x d6 c6 b6 a6*/ \
      "movd (%[d],%[ystride],2),%%mm6\n\t" \
      /*x x x x d7 c7 b7 a7*/ \
      "movd (%[d],%[ystride3]),%%mm7\n\t" \
      /*mm0=d1 d0 c1 c0 b1 b0 a1 a0*/ \
      "punpcklbw %%mm1,%%mm0\n\t" \
      /*mm2=d3 d2 c3 c2 b3 b2 a3 a2*/ \
      "punpcklbw %%mm3,%%mm2\n\t" \
      /*mm3=d1 d0 c1 c0 b1 b0 a1 a0*/ \
      "movq %%mm0,%%mm3\n\t" \
      /*mm0=b3 b2 b1 b0 a3 a2 a1 a0*/ \
      "punpcklwd %%mm2,%%mm0\n\t" \
      /*mm3=d3 d2 d1 d0 c3 c2 c1 c0*/ \
      "punpckhwd %%mm2,%%mm3\n\t" \
      /*mm1=b3 b2 b1 b0 a3 a2 a1 a0*/ \
      "movq %%mm0,%%mm1\n\t" \
      /*mm4=d5 d4 c5 c4 b5 b4 a5 a4*/ \
      "punpcklbw %%mm5,%%mm4\n\t" \
      /*mm6=d7 d6 c7 c6 b7 b6 a7 a6*/ \
      "punpcklbw %%mm7,%%mm6\n\t" \
      /*mm5=d5 d4 c5 c4 b5 b4 a5 a4*/ \
      "movq %%mm4,%%mm5\n\t" \
      /*mm4=b7 b6 b5 b4 a7 a6 a5 a4*/ \
      "punpcklwd %%mm6,%%mm4\n\t" \
      /*mm5=d7 d6 d5 d4 c7 c6 c5 c4*/ \
      "punpckhwd %%mm6,%%mm5\n\t" \
      /*mm2=d3 d2 d1 d0 c3 c2 c1 c0*/ \
      "movq %%mm3,%%mm2\n\t" \
      /*mm0=a7 a6 a5 a4 a3 a2 a1 a0*/ \
      "punpckldq %%mm4,%%mm0\n\t" \
      /*mm1=b7 b6 b5 b4 b3 b2 b1 b0*/ \
      "punpckhdq %%mm4,%%mm1\n\t" \
      /*mm2=c7 c6 c5 c4 c3 c2 c1 c0*/ \
      "punpckldq %%mm5,%%mm2\n\t" \
      /*mm3=d7 d6 d5 d4 d3 d2 d1 d0*/ \
      "punpckhdq %%mm5,%%mm3\n\t" \
      _filter \
      /*From here on, b and c denote the values _filter left in mm1 and mm2.*/ \
      /*mm0=b7 b6 b5 b4 b3 b2 b1 b0*/ \
      "movq %%mm1,%%mm0\n\t" \
      /*mm1=c3 b3 c2 b2 c1 b1 c0 b0*/ \
      "punpcklbw %%mm2,%%mm1\n\t" \
      /*mm0=c7 b7 c6 b6 c5 b5 c4 b4*/ \
      "punpckhbw %%mm2,%%mm0\n\t" \
      /*[d]=c1 b1 c0 b0*/ \
      "movd %%mm1,%[d]\n\t" \
      "movw %w[d],1(%[pix])\n\t" \
      "psrlq $32,%%mm1\n\t" \
      "shr $16,%[d]\n\t" \
      "movw %w[d],1(%[pix],%[ystride])\n\t" \
      /*[d]=c3 b3 c2 b2*/ \
      "movd %%mm1,%[d]\n\t" \
      "movw %w[d],1(%[pix],%[ystride],2)\n\t" \
      "shr $16,%[d]\n\t" \
      "movw %w[d],1(%[pix],%[ystride3])\n\t" \
      /*Advance [pix] to row 4 for the second half of the stores.*/ \
      "lea (%[pix],%[ystride],4),%[pix]\n\t" \
      /*[d]=c5 b5 c4 b4*/ \
      "movd %%mm0,%[d]\n\t" \
      "movw %w[d],1(%[pix])\n\t" \
      "psrlq $32,%%mm0\n\t" \
      "shr $16,%[d]\n\t" \
      "movw %w[d],1(%[pix],%[ystride])\n\t" \
      /*[d]=c7 b7 c6 b6*/ \
      "movd %%mm0,%[d]\n\t" \
      "movw %w[d],1(%[pix],%[ystride],2)\n\t" \
      "shr $16,%[d]\n\t" \
      "movw %w[d],1(%[pix],%[ystride3])\n\t" \
      :[pix]"+r"(pix__),[ystride3]"=&r"(ystride3__),[d]"=&r"(d__) \
      :[ystride]"r"((ptrdiff_t)(_ystride)),[ll]"r"(_ll) \
      :"memory" \
    ); \
  } \
  while(0)
316
317	# endif
318	#endif
319

Browse the source code of Godot/thirdparty/libtheora/x86/mmxloop.h