ShaderCore.cpp source code [engine/third_party/swiftshader/src/Pipeline/ShaderCore.cpp]

1	// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2	//
3	// Licensed under the Apache License, Version 2.0 (the "License");
4	// you may not use this file except in compliance with the License.
5	// You may obtain a copy of the License at
6	//
7	// http://www.apache.org/licenses/LICENSE-2.0
8	//
9	// Unless required by applicable law or agreed to in writing, software
10	// distributed under the License is distributed on an "AS IS" BASIS,
11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12	// See the License for the specific language governing permissions and
13	// limitations under the License.
14
15	#include "ShaderCore.hpp"
16
17	#include "Device/Renderer.hpp"
18	#include "Vulkan/VkDebug.hpp"
19
20	#include <limits.h>
21
22	namespace sw
23	{
24	Vector4s::Vector4s()
25	{
26	}
27
28	Vector4s::Vector4s(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
29	{
30	this->x = Short4 (x);
31	this->y = Short4 (y);
32	this->z = Short4 (z);
33	this->w = Short4 (w);
34	}
35
36	Vector4s::Vector4s(const Vector4s &rhs)
37	{
38	x = rhs.x;
39	y = rhs.y;
40	z = rhs.z;
41	w = rhs.w;
42	}
43
44	Vector4s &Vector4s::operator=(const Vector4s &rhs)
45	{
46	x = rhs.x;
47	y = rhs.y;
48	z = rhs.z;
49	w = rhs.w;
50
51	return *this;
52	}
53
54	Short4 &Vector4s::operator[](int i)
55	{
56	switch(i)
57	{
58	case `0`: return x;
59	case `1`: return y;
60	case `2`: return z;
61	case `3`: return w;
62	}
63
64	return x;
65	}
66
67	Vector4f::Vector4f()
68	{
69	}
70
71	Vector4f::Vector4f(float x, float y, float z, float w)
72	{
73	this->x = Float4 (x);
74	this->y = Float4 (y);
75	this->z = Float4 (z);
76	this->w = Float4 (w);
77	}
78
79	Vector4f::Vector4f(const Vector4f &rhs)
80	{
81	x = rhs.x;
82	y = rhs.y;
83	z = rhs.z;
84	w = rhs.w;
85	}
86
87	Vector4f &Vector4f::operator=(const Vector4f &rhs)
88	{
89	x = rhs.x;
90	y = rhs.y;
91	z = rhs.z;
92	w = rhs.w;
93
94	return *this;
95	}
96
97	Float4 &Vector4f::operator[](int i)
98	{
99	switch(i)
100	{
101	case `0`: return x;
102	case `1`: return y;
103	case `2`: return z;
104	case `3`: return w;
105	}
106
107	return x;
108	}
109
110	Float4 exponential2(RValue<Float4> x, bool pp)
111	{
112	// This implementation is based on 2^(i + f) = 2^i 2^f,*
113	// where i is the integer part of x and f is the fraction.
114
115	// For 2^i we can put the integer part directly in the exponent of
116	// the IEEE-754 floating-point number. Clamp to prevent overflow
117	// past the representation of infinity.
118	Float4 x0 = x;
119	x0 = Min(x0, As<Float4>(Int4 (`0x43010000`))); // 129.00000e+0f
120	x0 = Max(x0, As<Float4>(Int4 (`0xC2FDFFFF`))); // -126.99999e+0f
121
122	Int4 i = RoundInt(x0 - Float4 (`0.5f`));
123	Float4 ii = As<Float4>((i + Int4 (`127`)) << `23`); // Add single-precision bias, and shift into exponent.
124
125	// For the fractional part use a polynomial
126	// which approximates 2^f in the 0 to 1 range.
127	Float4 f = x0 - Float4 (i);
128	Float4 ff = As<Float4>(Int4 (`0x3AF61905`)); // 1.8775767e-3f
129	ff = ff * f + As<Float4>(Int4 (`0x3C134806`)); // 8.9893397e-3f
130	ff = ff * f + As<Float4>(Int4 (`0x3D64AA23`)); // 5.5826318e-2f
131	ff = ff * f + As<Float4>(Int4 (`0x3E75EAD4`)); // 2.4015361e-1f
132	ff = ff * f + As<Float4>(Int4 (`0x3F31727B`)); // 6.9315308e-1f
133	ff = ff * f + Float4 (`1.0f`);
134
135	return ii * ff;
136	}
137
138	Float4 logarithm2(RValue<Float4> x, bool pp)
139	{
140	Float4 x0;
141	Float4 x1;
142	Float4 x2;
143	Float4 x3;
144
145	x0 = x;
146
147	x1 = As<Float4>(As<Int4>(x0) & Int4 (`0x7F800000`));
148	x1 = As<Float4>(As<UInt4>(x1) >> `8`);
149	x1 = As<Float4>(As<Int4>(x1) \| As<Int4>(Float4 (`1.0f`)));
150	x1 = (x1 - Float4 (`1.4960938f`)) * Float4 (`256.0f`); // FIXME: (x1 - 1.4960938f) 256.0f;*
151	x0 = As<Float4>((As<Int4>(x0) & Int4 (`0x007FFFFF`)) \| As<Int4>(Float4 (`1.0f`)));
152
153	x2 = (Float4 (`9.5428179e-2f`) * x0 + Float4 (`4.7779095e-1f`)) * x0 + Float4 (`1.9782813e-1f`);
154	x3 = ((Float4 (`1.6618466e-2f`) * x0 + Float4 (`2.0350508e-1f`)) * x0 + Float4 (`2.7382900e-1f`)) * x0 + Float4 (`4.0496687e-2f`);
155	x2 /= x3;
156
157	x1 += (x0 - Float4 (`1.0f`)) * x2;
158
159	Int4 pos_inf_x = CmpEQ(As<Int4>(x), Int4 (`0x7F800000`));
160	return As<Float4>((pos_inf_x & As<Int4>(x)) \| (~pos_inf_x & As<Int4>(x1)));
161	}
162
163	Float4 exponential(RValue<Float4> x, bool pp)
164	{
165	// FIXME: Propagate the constant
166	return exponential2(Float4 (`1.44269504f`) * x, pp); // 1/ln(2)
167	}
168
169	Float4 logarithm(RValue<Float4> x, bool pp)
170	{
171	// FIXME: Propagate the constant
172	return Float4 (`6.93147181e-1f`) * logarithm2(x, pp); // ln(2)
173	}
174
175	Float4 power(RValue<Float4> x, RValue<Float4> y, bool pp)
176	{
177	Float4 log = logarithm2(x, pp);
178	log *= y;
179	return exponential2(log, pp);
180	}
181
182	Float4 reciprocal(RValue<Float4> x, bool pp, bool finite, bool exactAtPow2)
183	{
184	Float4 rcp = Rcp_pp(x, exactAtPow2);
185
186	if(!pp)
187	{
188	rcp = (rcp + rcp) - (x * rcp * rcp);
189	}
190
191	if(finite)
192	{
193	int big = `0x7F7FFFFF`;
194	rcp = Min(rcp, Float4 ((float&)big));
195	}
196
197	return rcp;
198	}
199
200	Float4 reciprocalSquareRoot(RValue<Float4> x, bool absolute, bool pp)
201	{
202	Float4 abs = x;
203
204	if(absolute)
205	{
206	abs = Abs(abs);
207	}
208
209	Float4 rsq;
210
211	if(!pp)
212	{
213	rsq = Float4 (`1.0f`) / Sqrt(abs);
214	}
215	else
216	{
217	rsq = RcpSqrt_pp(abs);
218
219	if(!pp)
220	{
221	rsq = rsq * (Float4 (`3.0f`) - rsq * rsq * abs) * Float4 (`0.5f`);
222	}
223
224	rsq = As<Float4>(CmpNEQ(As<Int4>(abs), Int4 (`0x7F800000`)) & As<Int4>(rsq));
225	}
226
227	return rsq;
228	}
229
230	Float4 modulo(RValue<Float4> x, RValue<Float4> y)
231	{
232	return x - y * Floor(x / y);
233	}
234
235	Float4 sine_pi(RValue<Float4> x, bool pp)
236	{
237	const Float4 A = Float4 (-`4.05284734e-1f`); // -4/pi^2
238	const Float4 B = Float4 (`1.27323954e+0f`); // 4/pi
239	const Float4 C = Float4 (`7.75160950e-1f`);
240	const Float4 D = Float4 (`2.24839049e-1f`);
241
242	// Parabola approximating sine
243	Float4 sin = x * (Abs(x) * A + B);
244
245	// Improve precision from 0.06 to 0.001
246	if(true)
247	{
248	sin = sin * (Abs(sin) * D + C);
249	}
250
251	return sin;
252	}
253
254	Float4 cosine_pi(RValue<Float4> x, bool pp)
255	{
256	// cos(x) = sin(x + pi/2)
257	Float4 y = x + Float4 (`1.57079632e+0f`);
258
259	// Wrap around
260	y -= As<Float4>(CmpNLT(y, Float4 (`3.14159265e+0f`)) & As<Int4>(Float4 (`6.28318530e+0f`)));
261
262	return sine_pi(y, pp);
263	}
264
265	Float4 sine(RValue<Float4> x, bool pp)
266	{
267	// Reduce to [-0.5, 0.5] range
268	Float4 y = x * Float4 (`1.59154943e-1f`); // 1/2pi
269	y = y - Round(y);
270
271	if(!pp)
272	{
273	// From the paper: "A Fast, Vectorizable Algorithm for Producing Single-Precision Sine-Cosine Pairs"
274	// This implementation passes OpenGL ES 3.0 precision requirements, at the cost of more operations:
275	// !pp : 17 mul, 7 add, 1 sub, 1 reciprocal
276	// pp : 4 mul, 2 add, 2 abs
277
278	Float4 y2 = y * y;
279	Float4 c1 = y2 * (y2 * (y2 * Float4 (-`0.0204391631f`) + Float4 (`0.2536086171f`)) + Float4 (-`1.2336977925f`)) + Float4 (`1.0f`);
280	Float4 s1 = y * (y2 * (y2 * (y2 * Float4 (-`0.0046075748f`) + Float4 (`0.0796819754f`)) + Float4 (-`0.645963615f`)) + Float4 (`1.5707963235f`));
281	Float4 c2 = (c1 * c1) - (s1 * s1);
282	Float4 s2 = Float4 (`2.0f`) * s1 * c1;
283	return Float4 (`2.0f`) * s2 * c2 * reciprocal(s2 * s2 + c2 * c2, pp, true);
284	}
285
286	const Float4 A = Float4 (-`16.0f`);
287	const Float4 B = Float4 (`8.0f`);
288	const Float4 C = Float4 (`7.75160950e-1f`);
289	const Float4 D = Float4 (`2.24839049e-1f`);
290
291	// Parabola approximating sine
292	Float4 sin = y * (Abs(y) * A + B);
293
294	// Improve precision from 0.06 to 0.001
295	if(true)
296	{
297	sin = sin * (Abs(sin) * D + C);
298	}
299
300	return sin;
301	}
302
303	Float4 cosine(RValue<Float4> x, bool pp)
304	{
305	// cos(x) = sin(x + pi/2)
306	Float4 y = x + Float4 (`1.57079632e+0f`);
307	return sine(y, pp);
308	}
309
310	Float4 tangent(RValue<Float4> x, bool pp)
311	{
312	return sine(x, pp) / cosine(x, pp);
313	}
314
315	Float4 arccos(RValue<Float4> x, bool pp)
316	{
317	// pi/2 - arcsin(x)
318	return Float4 (`1.57079632e+0f`) - arcsin(x);
319	}
320
321	Float4 arcsin(RValue<Float4> x, bool pp)
322	{
323	if(false) // Simpler implementation fails even lowp precision tests
324	{
325	// x(pi/2-sqrt(1-xx)pi/5)*
326	return x * (Float4 (`1.57079632e+0f`) - Sqrt(Float4 (`1.0f`) - x x) Float4 (`6.28318531e-1f`));
327	}
328	else
329	{
330	// From 4.4.45, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun
331	const Float4 half_pi(`1.57079632f`);
332	const Float4 a0(`1.5707288f`);
333	const Float4 a1(-`0.2121144f`);
334	const Float4 a2(`0.0742610f`);
335	const Float4 a3(-`0.0187293f`);
336	Float4 absx = Abs(x);
337	return As<Float4>(As<Int4>(half_pi - Sqrt(Float4 (`1.0f`) - absx) * (a0 + absx * (a1 + absx * (a2 + absx * a3)))) ^
338	(As<Int4>(x) & Int4 (`0x80000000`)));
339	}
340	}
341
342	// Approximation of atan in [0..1]
343	Float4 arctan_01(Float4 x, bool pp)
344	{
345	if(pp)
346	{
347	return x * (Float4 (-`0.27f`) * x + Float4 (`1.05539816f`));
348	}
349	else
350	{
351	// From 4.4.49, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun
352	const Float4 a2(-`0.3333314528f`);
353	const Float4 a4(`0.1999355085f`);
354	const Float4 a6(-`0.1420889944f`);
355	const Float4 a8(`0.1065626393f`);
356	const Float4 a10(-`0.0752896400f`);
357	const Float4 a12(`0.0429096138f`);
358	const Float4 a14(-`0.0161657367f`);
359	const Float4 a16(`0.0028662257f`);
360	Float4 x2 = x * x;
361	return (x + x * (x2 * (a2 + x2 * (a4 + x2 * (a6 + x2 * (a8 + x2 * (a10 + x2 * (a12 + x2 * (a14 + x2 * a16)))))))));
362	}
363	}
364
365	Float4 arctan(RValue<Float4> x, bool pp)
366	{
367	Float4 absx = Abs(x);
368	Int4 O = CmpNLT(absx, Float4 (`1.0f`));
369	Float4 y = As<Float4>((O & As<Int4>(Float4 (`1.0f`) / absx)) \| (~O & As<Int4>(absx))); // FIXME: Vector select
370
371	const Float4 half_pi(`1.57079632f`);
372	Float4 theta = arctan_01(y, pp);
373	return As<Float4>(((O & As<Int4>(half_pi - theta)) \| (~O & As<Int4>(theta))) ^ // FIXME: Vector select
374	(As<Int4>(x) & Int4 (`0x80000000`)));
375	}
376
377	Float4 arctan(RValue<Float4> y, RValue<Float4> x, bool pp)
378	{
379	const Float4 pi(`3.14159265f`); // pi
380	const Float4 minus_pi(-`3.14159265f`); // -pi
381	const Float4 half_pi(`1.57079632f`); // pi/2
382	const Float4 quarter_pi(`7.85398163e-1f`); // pi/4
383
384	// Rotate to upper semicircle when in lower semicircle
385	Int4 S = CmpLT(y, Float4 (`0.0f`));
386	Float4 theta = As<Float4>(S & As<Int4>(minus_pi));
387	Float4 x0 = As<Float4>((As<Int4>(y) & Int4 (`0x80000000`)) ^ As<Int4>(x));
388	Float4 y0 = Abs(y);
389
390	// Rotate to right quadrant when in left quadrant
391	Int4 Q = CmpLT(x0, Float4 (`0.0f`));
392	theta += As<Float4>(Q & As<Int4>(half_pi));
393	Float4 x1 = As<Float4>((Q & As<Int4>(y0)) \| (~Q & As<Int4>(x0))); // FIXME: Vector select
394	Float4 y1 = As<Float4>((Q & As<Int4>(-x0)) \| (~Q & As<Int4>(y0))); // FIXME: Vector select
395
396	// Mirror to first octant when in second octant
397	Int4 O = CmpNLT(y1, x1);
398	Float4 x2 = As<Float4>((O & As<Int4>(y1)) \| (~O & As<Int4>(x1))); // FIXME: Vector select
399	Float4 y2 = As<Float4>((O & As<Int4>(x1)) \| (~O & As<Int4>(y1))); // FIXME: Vector select
400
401	// Approximation of atan in [0..1]
402	Int4 zero_x = CmpEQ(x2, Float4 (`0.0f`));
403	Int4 inf_y = IsInf(y2); // Since x2 >= y2, this means x2 == y2 == inf, so we use 45 degrees or pi/4
404	Float4 atan2_theta = arctan_01(y2 / x2, pp);
405	theta += As<Float4>((~zero_x & ~inf_y & ((O & As<Int4>(half_pi - atan2_theta)) \| (~O & (As<Int4>(atan2_theta))))) \| // FIXME: Vector select
406	(inf_y & As<Int4>(quarter_pi)));
407
408	// Recover loss of precision for tiny theta angles
409	Int4 precision_loss = S & Q & O & ~inf_y; // This combination results in (-pi + half_pi + half_pi - atan2_theta) which is equivalent to -atan2_theta
410	return As<Float4>((precision_loss & As<Int4>(-atan2_theta)) \| (~precision_loss & As<Int4>(theta))); // FIXME: Vector select
411	}
412
413	Float4 sineh(RValue<Float4> x, bool pp)
414	{
415	return (exponential(x, pp) - exponential(-x, pp)) * Float4 (`0.5f`);
416	}
417
418	Float4 cosineh(RValue<Float4> x, bool pp)
419	{
420	return (exponential(x, pp) + exponential(-x, pp)) * Float4 (`0.5f`);
421	}
422
423	Float4 tangenth(RValue<Float4> x, bool pp)
424	{
425	Float4 e_x = exponential(x, pp);
426	Float4 e_minus_x = exponential(-x, pp);
427	return (e_x - e_minus_x) / (e_x + e_minus_x);
428	}
429
430	Float4 arccosh(RValue<Float4> x, bool pp)
431	{
432	return logarithm(x + Sqrt(x + Float4 (`1.0f`)) * Sqrt(x - Float4 (`1.0f`)), pp);
433	}
434
435	Float4 arcsinh(RValue<Float4> x, bool pp)
436	{
437	return logarithm(x + Sqrt(x * x + Float4 (`1.0f`)), pp);
438	}
439
440	Float4 arctanh(RValue<Float4> x, bool pp)
441	{
442	return logarithm((Float4 (`1.0f`) + x) / (Float4 (`1.0f`) - x), pp) * Float4 (`0.5f`);
443	}
444
445	Float4 dot2(const Vector4f &v0, const Vector4f &v1)
446	{
447	return v0.x * v1.x + v0.y * v1.y;
448	}
449
450	Float4 dot3(const Vector4f &v0, const Vector4f &v1)
451	{
452	return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z;
453	}
454
455	Float4 dot4(const Vector4f &v0, const Vector4f &v1)
456	{
457	return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z + v0.w * v1.w;
458	}
459
460	void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3)
461	{
462	Int2 tmp0 = UnpackHigh(row0, row1);
463	Int2 tmp1 = UnpackHigh(row2, row3);
464	Int2 tmp2 = UnpackLow(row0, row1);
465	Int2 tmp3 = UnpackLow(row2, row3);
466
467	row0 = UnpackLow(tmp2, tmp3);
468	row1 = UnpackHigh(tmp2, tmp3);
469	row2 = UnpackLow(tmp0, tmp1);
470	row3 = UnpackHigh(tmp0, tmp1);
471	}
472
473	void transpose4x3(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3)
474	{
475	Int2 tmp0 = UnpackHigh(row0, row1);
476	Int2 tmp1 = UnpackHigh(row2, row3);
477	Int2 tmp2 = UnpackLow(row0, row1);
478	Int2 tmp3 = UnpackLow(row2, row3);
479
480	row0 = UnpackLow(tmp2, tmp3);
481	row1 = UnpackHigh(tmp2, tmp3);
482	row2 = UnpackLow(tmp0, tmp1);
483	}
484
485	void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
486	{
487	Float4 tmp0 = UnpackLow(row0, row1);
488	Float4 tmp1 = UnpackLow(row2, row3);
489	Float4 tmp2 = UnpackHigh(row0, row1);
490	Float4 tmp3 = UnpackHigh(row2, row3);
491
492	row0 = Float4 (tmp0.xy, tmp1.xy);
493	row1 = Float4 (tmp0.zw, tmp1.zw);
494	row2 = Float4 (tmp2.xy, tmp3.xy);
495	row3 = Float4 (tmp2.zw, tmp3.zw);
496	}
497
498	void transpose4x3(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
499	{
500	Float4 tmp0 = UnpackLow(row0, row1);
501	Float4 tmp1 = UnpackLow(row2, row3);
502	Float4 tmp2 = UnpackHigh(row0, row1);
503	Float4 tmp3 = UnpackHigh(row2, row3);
504
505	row0 = Float4 (tmp0.xy, tmp1.xy);
506	row1 = Float4 (tmp0.zw, tmp1.zw);
507	row2 = Float4 (tmp2.xy, tmp3.xy);
508	}
509
510	void transpose4x2(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
511	{
512	Float4 tmp0 = UnpackLow(row0, row1);
513	Float4 tmp1 = UnpackLow(row2, row3);
514
515	row0 = Float4 (tmp0.xy, tmp1.xy);
516	row1 = Float4 (tmp0.zw, tmp1.zw);
517	}
518
519	void transpose4x1(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
520	{
521	Float4 tmp0 = UnpackLow(row0, row1);
522	Float4 tmp1 = UnpackLow(row2, row3);
523
524	row0 = Float4 (tmp0.xy, tmp1.xy);
525	}
526
527	void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
528	{
529	Float4 tmp01 = UnpackLow(row0, row1);
530	Float4 tmp23 = UnpackHigh(row0, row1);
531
532	row0 = tmp01;
533	row1 = Float4 (tmp01.zw, row1.zw);
534	row2 = tmp23;
535	row3 = Float4 (tmp23.zw, row3.zw);
536	}
537
538	void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N)
539	{
540	switch(N)
541	{
542	case `1`: transpose4x1(row0, row1, row2, row3); break;
543	case `2`: transpose4x2(row0, row1, row2, row3); break;
544	case `3`: transpose4x3(row0, row1, row2, row3); break;
545	case `4`: transpose4x4(row0, row1, row2, row3); break;
546	}
547	}
548
549	UInt4 halfToFloatBits(UInt4 halfBits)
550	{
551	auto magic = UInt4 (`126` << `23`);
552
553	auto sign16 = halfBits & UInt4 (`0x8000`);
554	auto man16 = halfBits & UInt4 (`0x3FF`);
555	auto exp16 = halfBits & UInt4 (`0x7C00`);
556
557	auto isDnormOrZero = CmpEQ(exp16, UInt4 (`0`));
558	auto isInfOrNaN = CmpEQ(exp16, UInt4 (`0x7C00`));
559
560	auto sign32 = sign16 << `16`;
561	auto man32 = man16 << `13`;
562	auto exp32 = (exp16 + UInt4 (`0x1C000`)) << `13`;
563	auto norm32 = (man32 \| exp32) \| (isInfOrNaN & UInt4 (`0x7F800000`));
564
565	auto denorm32 = As<UInt4>(As<Float4>(magic + man16) - As<Float4>(magic));
566
567	return sign32 \| (norm32 & ~isDnormOrZero) \| (denorm32 & isDnormOrZero);
568	}
569	}
570

Browse the source code of engine/third_party/swiftshader/src/Pipeline/ShaderCore.cpp