ShaderCore.cpp source code [engine/third_party/swiftshader/src/Shader/ShaderCore.cpp]

1	// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2	//
3	// Licensed under the Apache License, Version 2.0 (the "License");
4	// you may not use this file except in compliance with the License.
5	// You may obtain a copy of the License at
6	//
7	// http://www.apache.org/licenses/LICENSE-2.0
8	//
9	// Unless required by applicable law or agreed to in writing, software
10	// distributed under the License is distributed on an "AS IS" BASIS,
11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12	// See the License for the specific language governing permissions and
13	// limitations under the License.
14
15	#include "ShaderCore.hpp"
16
17	#include "Renderer/Renderer.hpp"
18	#include "Common/Debug.hpp"
19
20	#include <limits.h>
21
22	namespace sw
23	{
24	extern TranscendentalPrecision logPrecision;
25	extern TranscendentalPrecision expPrecision;
26	extern TranscendentalPrecision rcpPrecision;
27	extern TranscendentalPrecision rsqPrecision;
28
29	Vector4s::Vector4s()
30	{
31	}
32
33	Vector4s::Vector4s(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
34	{
35	this->x = Short4 (x);
36	this->y = Short4 (y);
37	this->z = Short4 (z);
38	this->w = Short4 (w);
39	}
40
41	Vector4s::Vector4s(const Vector4s &rhs)
42	{
43	x = rhs.x;
44	y = rhs.y;
45	z = rhs.z;
46	w = rhs.w;
47	}
48
49	Vector4s &Vector4s::operator=(const Vector4s &rhs)
50	{
51	x = rhs.x;
52	y = rhs.y;
53	z = rhs.z;
54	w = rhs.w;
55
56	return *this;
57	}
58
59	Short4 &Vector4s::operator[](int i)
60	{
61	switch(i)
62	{
63	case `0`: return x;
64	case `1`: return y;
65	case `2`: return z;
66	case `3`: return w;
67	}
68
69	return x;
70	}
71
72	Vector4f::Vector4f()
73	{
74	}
75
76	Vector4f::Vector4f(float x, float y, float z, float w)
77	{
78	this->x = Float4 (x);
79	this->y = Float4 (y);
80	this->z = Float4 (z);
81	this->w = Float4 (w);
82	}
83
84	Vector4f::Vector4f(const Vector4f &rhs)
85	{
86	x = rhs.x;
87	y = rhs.y;
88	z = rhs.z;
89	w = rhs.w;
90	}
91
92	Vector4f &Vector4f::operator=(const Vector4f &rhs)
93	{
94	x = rhs.x;
95	y = rhs.y;
96	z = rhs.z;
97	w = rhs.w;
98
99	return *this;
100	}
101
102	Float4 &Vector4f::operator[](int i)
103	{
104	switch(i)
105	{
106	case `0`: return x;
107	case `1`: return y;
108	case `2`: return z;
109	case `3`: return w;
110	}
111
112	return x;
113	}
114
115	Float4 exponential2(RValue<Float4> x, bool pp)
116	{
117	// This implementation is based on 2^(i + f) = 2^i 2^f,*
118	// where i is the integer part of x and f is the fraction.
119
120	// For 2^i we can put the integer part directly in the exponent of
121	// the IEEE-754 floating-point number. Clamp to prevent overflow
122	// past the representation of infinity.
123	Float4 x0 = x;
124	x0 = Min(x0, As<Float4>(Int4 (`0x43010000`))); // 129.00000e+0f
125	x0 = Max(x0, As<Float4>(Int4 (`0xC2FDFFFF`))); // -126.99999e+0f
126
127	Int4 i = RoundInt(x0 - Float4 (`0.5f`));
128	Float4 ii = As<Float4>((i + Int4 (`127`)) << `23`); // Add single-precision bias, and shift into exponent.
129
130	// For the fractional part use a polynomial
131	// which approximates 2^f in the 0 to 1 range.
132	Float4 f = x0 - Float4 (i);
133	Float4 ff = As<Float4>(Int4 (`0x3AF61905`)); // 1.8775767e-3f
134	ff = ff * f + As<Float4>(Int4 (`0x3C134806`)); // 8.9893397e-3f
135	ff = ff * f + As<Float4>(Int4 (`0x3D64AA23`)); // 5.5826318e-2f
136	ff = ff * f + As<Float4>(Int4 (`0x3E75EAD4`)); // 2.4015361e-1f
137	ff = ff * f + As<Float4>(Int4 (`0x3F31727B`)); // 6.9315308e-1f
138	ff = ff * f + Float4 (`1.0f`);
139
140	return ii * ff;
141	}
142
143	Float4 logarithm2(RValue<Float4> x, bool absolute, bool pp)
144	{
145	Float4 x0;
146	Float4 x1;
147	Float4 x2;
148	Float4 x3;
149
150	x0 = x;
151
152	x1 = As<Float4>(As<Int4>(x0) & Int4 (`0x7F800000`));
153	x1 = As<Float4>(As<UInt4>(x1) >> `8`);
154	x1 = As<Float4>(As<Int4>(x1) \| As<Int4>(Float4 (`1.0f`)));
155	x1 = (x1 - Float4 (`1.4960938f`)) * Float4 (`256.0f`); // FIXME: (x1 - 1.4960938f) 256.0f;*
156	x0 = As<Float4>((As<Int4>(x0) & Int4 (`0x007FFFFF`)) \| As<Int4>(Float4 (`1.0f`)));
157
158	x2 = (Float4 (`9.5428179e-2f`) * x0 + Float4 (`4.7779095e-1f`)) * x0 + Float4 (`1.9782813e-1f`);
159	x3 = ((Float4 (`1.6618466e-2f`) * x0 + Float4 (`2.0350508e-1f`)) * x0 + Float4 (`2.7382900e-1f`)) * x0 + Float4 (`4.0496687e-2f`);
160	x2 /= x3;
161
162	x1 += (x0 - Float4 (`1.0f`)) * x2;
163
164	Int4 pos_inf_x = CmpEQ(As<Int4>(x), Int4 (`0x7F800000`));
165	return As<Float4>((pos_inf_x & As<Int4>(x)) \| (~pos_inf_x & As<Int4>(x1)));
166	}
167
168	Float4 exponential(RValue<Float4> x, bool pp)
169	{
170	// FIXME: Propagate the constant
171	return exponential2(Float4 (`1.44269504f`) * x, pp); // 1/ln(2)
172	}
173
174	Float4 logarithm(RValue<Float4> x, bool absolute, bool pp)
175	{
176	// FIXME: Propagate the constant
177	return Float4 (`6.93147181e-1f`) * logarithm2(x, absolute, pp); // ln(2)
178	}
179
180	Float4 power(RValue<Float4> x, RValue<Float4> y, bool pp)
181	{
182	Float4 log = logarithm2(x, true, pp);
183	log *= y;
184	return exponential2(log, pp);
185	}
186
187	Float4 reciprocal(RValue<Float4> x, bool pp, bool finite, bool exactAtPow2)
188	{
189	Float4 rcp;
190
191	if(!pp && rcpPrecision >= WHQL)
192	{
193	rcp = Float4 (`1.0f`) / x;
194	}
195	else
196	{
197	rcp = Rcp_pp(x, exactAtPow2);
198
199	if(!pp)
200	{
201	rcp = (rcp + rcp) - (x * rcp * rcp);
202	}
203	}
204
205	if(finite)
206	{
207	int big = `0x7F7FFFFF`;
208	rcp = Min(rcp, Float4 ((float&)big));
209	}
210
211	return rcp;
212	}
213
214	Float4 reciprocalSquareRoot(RValue<Float4> x, bool absolute, bool pp)
215	{
216	Float4 abs = x;
217
218	if(absolute)
219	{
220	abs = Abs(abs);
221	}
222
223	Float4 rsq;
224
225	if(!pp)
226	{
227	rsq = Float4 (`1.0f`) / Sqrt(abs);
228	}
229	else
230	{
231	rsq = RcpSqrt_pp(abs);
232
233	if(!pp)
234	{
235	rsq = rsq * (Float4 (`3.0f`) - rsq * rsq * abs) * Float4 (`0.5f`);
236	}
237
238	rsq = As<Float4>(CmpNEQ(As<Int4>(abs), Int4 (`0x7F800000`)) & As<Int4>(rsq));
239	}
240
241	return rsq;
242	}
243
244	Float4 modulo(RValue<Float4> x, RValue<Float4> y)
245	{
246	return x - y * Floor(x / y);
247	}
248
249	Float4 sine_pi(RValue<Float4> x, bool pp)
250	{
251	const Float4 A = Float4 (-`4.05284734e-1f`); // -4/pi^2
252	const Float4 B = Float4 (`1.27323954e+0f`); // 4/pi
253	const Float4 C = Float4 (`7.75160950e-1f`);
254	const Float4 D = Float4 (`2.24839049e-1f`);
255
256	// Parabola approximating sine
257	Float4 sin = x * (Abs(x) * A + B);
258
259	// Improve precision from 0.06 to 0.001
260	if(true)
261	{
262	sin = sin * (Abs(sin) * D + C);
263	}
264
265	return sin;
266	}
267
268	Float4 cosine_pi(RValue<Float4> x, bool pp)
269	{
270	// cos(x) = sin(x + pi/2)
271	Float4 y = x + Float4 (`1.57079632e+0f`);
272
273	// Wrap around
274	y -= As<Float4>(CmpNLT(y, Float4 (`3.14159265e+0f`)) & As<Int4>(Float4 (`6.28318530e+0f`)));
275
276	return sine_pi(y, pp);
277	}
278
279	Float4 sine(RValue<Float4> x, bool pp)
280	{
281	// Reduce to [-0.5, 0.5] range
282	Float4 y = x * Float4 (`1.59154943e-1f`); // 1/2pi
283	y = y - Round(y);
284
285	if(!pp)
286	{
287	// From the paper: "A Fast, Vectorizable Algorithm for Producing Single-Precision Sine-Cosine Pairs"
288	// This implementation passes OpenGL ES 3.0 precision requirements, at the cost of more operations:
289	// !pp : 17 mul, 7 add, 1 sub, 1 reciprocal
290	// pp : 4 mul, 2 add, 2 abs
291
292	Float4 y2 = y * y;
293	Float4 c1 = y2 * (y2 * (y2 * Float4 (-`0.0204391631f`) + Float4 (`0.2536086171f`)) + Float4 (-`1.2336977925f`)) + Float4 (`1.0f`);
294	Float4 s1 = y * (y2 * (y2 * (y2 * Float4 (-`0.0046075748f`) + Float4 (`0.0796819754f`)) + Float4 (-`0.645963615f`)) + Float4 (`1.5707963235f`));
295	Float4 c2 = (c1 * c1) - (s1 * s1);
296	Float4 s2 = Float4 (`2.0f`) * s1 * c1;
297	return Float4 (`2.0f`) * s2 * c2 * reciprocal(s2 * s2 + c2 * c2, pp, true);
298	}
299
300	const Float4 A = Float4 (-`16.0f`);
301	const Float4 B = Float4 (`8.0f`);
302	const Float4 C = Float4 (`7.75160950e-1f`);
303	const Float4 D = Float4 (`2.24839049e-1f`);
304
305	// Parabola approximating sine
306	Float4 sin = y * (Abs(y) * A + B);
307
308	// Improve precision from 0.06 to 0.001
309	if(true)
310	{
311	sin = sin * (Abs(sin) * D + C);
312	}
313
314	return sin;
315	}
316
317	Float4 cosine(RValue<Float4> x, bool pp)
318	{
319	// cos(x) = sin(x + pi/2)
320	Float4 y = x + Float4 (`1.57079632e+0f`);
321	return sine(y, pp);
322	}
323
324	Float4 tangent(RValue<Float4> x, bool pp)
325	{
326	return sine(x, pp) / cosine(x, pp);
327	}
328
329	Float4 arccos(RValue<Float4> x, bool pp)
330	{
331	// pi/2 - arcsin(x)
332	return Float4 (`1.57079632e+0f`) - arcsin(x);
333	}
334
335	Float4 arcsin(RValue<Float4> x, bool pp)
336	{
337	if(false) // Simpler implementation fails even lowp precision tests
338	{
339	// x(pi/2-sqrt(1-xx)pi/5)*
340	return x * (Float4 (`1.57079632e+0f`) - Sqrt(Float4 (`1.0f`) - x x) Float4 (`6.28318531e-1f`));
341	}
342	else
343	{
344	// From 4.4.45, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun
345	const Float4 half_pi(`1.57079632f`);
346	const Float4 a0(`1.5707288f`);
347	const Float4 a1(-`0.2121144f`);
348	const Float4 a2(`0.0742610f`);
349	const Float4 a3(-`0.0187293f`);
350	Float4 absx = Abs(x);
351	return As<Float4>(As<Int4>(half_pi - Sqrt(Float4 (`1.0f`) - absx) * (a0 + absx * (a1 + absx * (a2 + absx * a3)))) ^
352	(As<Int4>(x) & Int4 (`0x80000000`)));
353	}
354	}
355
356	// Approximation of atan in [0..1]
357	Float4 arctan_01(Float4 x, bool pp)
358	{
359	if(pp)
360	{
361	return x * (Float4 (-`0.27f`) * x + Float4 (`1.05539816f`));
362	}
363	else
364	{
365	// From 4.4.49, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun
366	const Float4 a2(-`0.3333314528f`);
367	const Float4 a4(`0.1999355085f`);
368	const Float4 a6(-`0.1420889944f`);
369	const Float4 a8(`0.1065626393f`);
370	const Float4 a10(-`0.0752896400f`);
371	const Float4 a12(`0.0429096138f`);
372	const Float4 a14(-`0.0161657367f`);
373	const Float4 a16(`0.0028662257f`);
374	Float4 x2 = x * x;
375	return (x + x * (x2 * (a2 + x2 * (a4 + x2 * (a6 + x2 * (a8 + x2 * (a10 + x2 * (a12 + x2 * (a14 + x2 * a16)))))))));
376	}
377	}
378
379	Float4 arctan(RValue<Float4> x, bool pp)
380	{
381	Float4 absx = Abs(x);
382	Int4 O = CmpNLT(absx, Float4 (`1.0f`));
383	Float4 y = As<Float4>((O & As<Int4>(Float4 (`1.0f`) / absx)) \| (~O & As<Int4>(absx))); // FIXME: Vector select
384
385	const Float4 half_pi(`1.57079632f`);
386	Float4 theta = arctan_01(y, pp);
387	return As<Float4>(((O & As<Int4>(half_pi - theta)) \| (~O & As<Int4>(theta))) ^ // FIXME: Vector select
388	(As<Int4>(x) & Int4 (`0x80000000`)));
389	}
390
391	Float4 arctan(RValue<Float4> y, RValue<Float4> x, bool pp)
392	{
393	const Float4 pi(`3.14159265f`); // pi
394	const Float4 minus_pi(-`3.14159265f`); // -pi
395	const Float4 half_pi(`1.57079632f`); // pi/2
396	const Float4 quarter_pi(`7.85398163e-1f`); // pi/4
397
398	// Rotate to upper semicircle when in lower semicircle
399	Int4 S = CmpLT(y, Float4 (`0.0f`));
400	Float4 theta = As<Float4>(S & As<Int4>(minus_pi));
401	Float4 x0 = As<Float4>((As<Int4>(y) & Int4 (`0x80000000`)) ^ As<Int4>(x));
402	Float4 y0 = Abs(y);
403
404	// Rotate to right quadrant when in left quadrant
405	Int4 Q = CmpLT(x0, Float4 (`0.0f`));
406	theta += As<Float4>(Q & As<Int4>(half_pi));
407	Float4 x1 = As<Float4>((Q & As<Int4>(y0)) \| (~Q & As<Int4>(x0))); // FIXME: Vector select
408	Float4 y1 = As<Float4>((Q & As<Int4>(-x0)) \| (~Q & As<Int4>(y0))); // FIXME: Vector select
409
410	// Mirror to first octant when in second octant
411	Int4 O = CmpNLT(y1, x1);
412	Float4 x2 = As<Float4>((O & As<Int4>(y1)) \| (~O & As<Int4>(x1))); // FIXME: Vector select
413	Float4 y2 = As<Float4>((O & As<Int4>(x1)) \| (~O & As<Int4>(y1))); // FIXME: Vector select
414
415	// Approximation of atan in [0..1]
416	Int4 zero_x = CmpEQ(x2, Float4 (`0.0f`));
417	Int4 inf_y = IsInf(y2); // Since x2 >= y2, this means x2 == y2 == inf, so we use 45 degrees or pi/4
418	Float4 atan2_theta = arctan_01(y2 / x2, pp);
419	theta += As<Float4>((~zero_x & ~inf_y & ((O & As<Int4>(half_pi - atan2_theta)) \| (~O & (As<Int4>(atan2_theta))))) \| // FIXME: Vector select
420	(inf_y & As<Int4>(quarter_pi)));
421
422	// Recover loss of precision for tiny theta angles
423	Int4 precision_loss = S & Q & O & ~inf_y; // This combination results in (-pi + half_pi + half_pi - atan2_theta) which is equivalent to -atan2_theta
424	return As<Float4>((precision_loss & As<Int4>(-atan2_theta)) \| (~precision_loss & As<Int4>(theta))); // FIXME: Vector select
425	}
426
427	Float4 sineh(RValue<Float4> x, bool pp)
428	{
429	return (exponential(x, pp) - exponential(-x, pp)) * Float4 (`0.5f`);
430	}
431
432	Float4 cosineh(RValue<Float4> x, bool pp)
433	{
434	return (exponential(x, pp) + exponential(-x, pp)) * Float4 (`0.5f`);
435	}
436
437	Float4 tangenth(RValue<Float4> x, bool pp)
438	{
439	Float4 e_x = exponential(x, pp);
440	Float4 e_minus_x = exponential(-x, pp);
441	return (e_x - e_minus_x) / (e_x + e_minus_x);
442	}
443
444	Float4 arccosh(RValue<Float4> x, bool pp)
445	{
446	return logarithm(x + Sqrt(x + Float4 (`1.0f`)) * Sqrt(x - Float4 (`1.0f`)), pp);
447	}
448
449	Float4 arcsinh(RValue<Float4> x, bool pp)
450	{
451	return logarithm(x + Sqrt(x * x + Float4 (`1.0f`)), pp);
452	}
453
454	Float4 arctanh(RValue<Float4> x, bool pp)
455	{
456	return logarithm((Float4 (`1.0f`) + x) / (Float4 (`1.0f`) - x), pp) * Float4 (`0.5f`);
457	}
458
459	Float4 dot2(const Vector4f &v0, const Vector4f &v1)
460	{
461	return v0.x * v1.x + v0.y * v1.y;
462	}
463
464	Float4 dot3(const Vector4f &v0, const Vector4f &v1)
465	{
466	return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z;
467	}
468
469	Float4 dot4(const Vector4f &v0, const Vector4f &v1)
470	{
471	return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z + v0.w * v1.w;
472	}
473
474	void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3)
475	{
476	Int2 tmp0 = UnpackHigh(row0, row1);
477	Int2 tmp1 = UnpackHigh(row2, row3);
478	Int2 tmp2 = UnpackLow(row0, row1);
479	Int2 tmp3 = UnpackLow(row2, row3);
480
481	row0 = UnpackLow(tmp2, tmp3);
482	row1 = UnpackHigh(tmp2, tmp3);
483	row2 = UnpackLow(tmp0, tmp1);
484	row3 = UnpackHigh(tmp0, tmp1);
485	}
486
487	void transpose4x3(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3)
488	{
489	Int2 tmp0 = UnpackHigh(row0, row1);
490	Int2 tmp1 = UnpackHigh(row2, row3);
491	Int2 tmp2 = UnpackLow(row0, row1);
492	Int2 tmp3 = UnpackLow(row2, row3);
493
494	row0 = UnpackLow(tmp2, tmp3);
495	row1 = UnpackHigh(tmp2, tmp3);
496	row2 = UnpackLow(tmp0, tmp1);
497	}
498
499	void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
500	{
501	Float4 tmp0 = UnpackLow(row0, row1);
502	Float4 tmp1 = UnpackLow(row2, row3);
503	Float4 tmp2 = UnpackHigh(row0, row1);
504	Float4 tmp3 = UnpackHigh(row2, row3);
505
506	row0 = Float4 (tmp0.xy, tmp1.xy);
507	row1 = Float4 (tmp0.zw, tmp1.zw);
508	row2 = Float4 (tmp2.xy, tmp3.xy);
509	row3 = Float4 (tmp2.zw, tmp3.zw);
510	}
511
512	void transpose4x3(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
513	{
514	Float4 tmp0 = UnpackLow(row0, row1);
515	Float4 tmp1 = UnpackLow(row2, row3);
516	Float4 tmp2 = UnpackHigh(row0, row1);
517	Float4 tmp3 = UnpackHigh(row2, row3);
518
519	row0 = Float4 (tmp0.xy, tmp1.xy);
520	row1 = Float4 (tmp0.zw, tmp1.zw);
521	row2 = Float4 (tmp2.xy, tmp3.xy);
522	}
523
524	void transpose4x2(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
525	{
526	Float4 tmp0 = UnpackLow(row0, row1);
527	Float4 tmp1 = UnpackLow(row2, row3);
528
529	row0 = Float4 (tmp0.xy, tmp1.xy);
530	row1 = Float4 (tmp0.zw, tmp1.zw);
531	}
532
533	void transpose4x1(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
534	{
535	Float4 tmp0 = UnpackLow(row0, row1);
536	Float4 tmp1 = UnpackLow(row2, row3);
537
538	row0 = Float4 (tmp0.xy, tmp1.xy);
539	}
540
541	void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
542	{
543	Float4 tmp01 = UnpackLow(row0, row1);
544	Float4 tmp23 = UnpackHigh(row0, row1);
545
546	row0 = tmp01;
547	row1 = Float4 (tmp01.zw, row1.zw);
548	row2 = tmp23;
549	row3 = Float4 (tmp23.zw, row3.zw);
550	}
551
552	void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N)
553	{
554	switch(N)
555	{
556	case `1`: transpose4x1(row0, row1, row2, row3); break;
557	case `2`: transpose4x2(row0, row1, row2, row3); break;
558	case `3`: transpose4x3(row0, row1, row2, row3); break;
559	case `4`: transpose4x4(row0, row1, row2, row3); break;
560	}
561	}
562
563	const Vector4f RegisterFile::operator[](RValue<Int4> index)
564	{
565	ASSERT(indirectAddressable);
566
567	Int index0 = Extract(index, `0`);
568	Int index1 = Extract(index, `1`);
569	Int index2 = Extract(index, `2`);
570	Int index3 = Extract(index, `3`);
571
572	Vector4f r;
573
574	r.x.x = Extract(x[`0`][index0], `0`);
575	r.x.y = Extract(x[`0`][index1], `1`);
576	r.x.z = Extract(x[`0`][index2], `2`);
577	r.x.w = Extract(x[`0`][index3], `3`);
578
579	r.y.x = Extract(y[`0`][index0], `0`);
580	r.y.y = Extract(y[`0`][index1], `1`);
581	r.y.z = Extract(y[`0`][index2], `2`);
582	r.y.w = Extract(y[`0`][index3], `3`);
583
584	r.z.x = Extract(z[`0`][index0], `0`);
585	r.z.y = Extract(z[`0`][index1], `1`);
586	r.z.z = Extract(z[`0`][index2], `2`);
587	r.z.w = Extract(z[`0`][index3], `3`);
588
589	r.w.x = Extract(w[`0`][index0], `0`);
590	r.w.y = Extract(w[`0`][index1], `1`);
591	r.w.z = Extract(w[`0`][index2], `2`);
592	r.w.w = Extract(w[`0`][index3], `3`);
593
594	return r;
595	}
596
597	void RegisterFile::scatter_x(Int4 index, RValue<Float4> r)
598	{
599	ASSERT(indirectAddressable);
600
601	Int index0 = Extract(index, `0`);
602	Int index1 = Extract(index, `1`);
603	Int index2 = Extract(index, `2`);
604	Int index3 = Extract(index, `3`);
605
606	x[`0`][index0] = Insert(x[`0`][index0], Extract(r, `0`), `0`);
607	x[`0`][index1] = Insert(x[`0`][index1], Extract(r, `1`), `1`);
608	x[`0`][index2] = Insert(x[`0`][index2], Extract(r, `2`), `2`);
609	x[`0`][index3] = Insert(x[`0`][index3], Extract(r, `3`), `3`);
610	}
611
612	void RegisterFile::scatter_y(Int4 index, RValue<Float4> r)
613	{
614	ASSERT(indirectAddressable);
615
616	Int index0 = Extract(index, `0`);
617	Int index1 = Extract(index, `1`);
618	Int index2 = Extract(index, `2`);
619	Int index3 = Extract(index, `3`);
620
621	y[`0`][index0] = Insert(y[`0`][index0], Extract(r, `0`), `0`);
622	y[`0`][index1] = Insert(y[`0`][index1], Extract(r, `1`), `1`);
623	y[`0`][index2] = Insert(y[`0`][index2], Extract(r, `2`), `2`);
624	y[`0`][index3] = Insert(y[`0`][index3], Extract(r, `3`), `3`);
625	}
626
627	void RegisterFile::scatter_z(Int4 index, RValue<Float4> r)
628	{
629	ASSERT(indirectAddressable);
630
631	Int index0 = Extract(index, `0`);
632	Int index1 = Extract(index, `1`);
633	Int index2 = Extract(index, `2`);
634	Int index3 = Extract(index, `3`);
635
636	z[`0`][index0] = Insert(z[`0`][index0], Extract(r, `0`), `0`);
637	z[`0`][index1] = Insert(z[`0`][index1], Extract(r, `1`), `1`);
638	z[`0`][index2] = Insert(z[`0`][index2], Extract(r, `2`), `2`);
639	z[`0`][index3] = Insert(z[`0`][index3], Extract(r, `3`), `3`);
640	}
641
642	void RegisterFile::scatter_w(Int4 index, RValue<Float4> r)
643	{
644	ASSERT(indirectAddressable);
645
646	Int index0 = Extract(index, `0`);
647	Int index1 = Extract(index, `1`);
648	Int index2 = Extract(index, `2`);
649	Int index3 = Extract(index, `3`);
650
651	w[`0`][index0] = Insert(w[`0`][index0], Extract(r, `0`), `0`);
652	w[`0`][index1] = Insert(w[`0`][index1], Extract(r, `1`), `1`);
653	w[`0`][index2] = Insert(w[`0`][index2], Extract(r, `2`), `2`);
654	w[`0`][index3] = Insert(w[`0`][index3], Extract(r, `3`), `3`);
655	}
656
657	void ShaderCore::mov(Vector4f &dst, const Vector4f &src, bool integerDestination)
658	{
659	if(integerDestination)
660	{
661	dst.x = As<Float4>(RoundInt(src.x));
662	dst.y = As<Float4>(RoundInt(src.y));
663	dst.z = As<Float4>(RoundInt(src.z));
664	dst.w = As<Float4>(RoundInt(src.w));
665	}
666	else
667	{
668	dst = src;
669	}
670	}
671
672	void ShaderCore::neg(Vector4f &dst, const Vector4f &src)
673	{
674	dst.x = -src.x;
675	dst.y = -src.y;
676	dst.z = -src.z;
677	dst.w = -src.w;
678	}
679
680	void ShaderCore::ineg(Vector4f &dst, const Vector4f &src)
681	{
682	dst.x = As<Float4>(-As<Int4>(src.x));
683	dst.y = As<Float4>(-As<Int4>(src.y));
684	dst.z = As<Float4>(-As<Int4>(src.z));
685	dst.w = As<Float4>(-As<Int4>(src.w));
686	}
687
688	void ShaderCore::f2b(Vector4f &dst, const Vector4f &src)
689	{
690	dst.x = As<Float4>(CmpNEQ(src.x, Float4 (`0.0f`)));
691	dst.y = As<Float4>(CmpNEQ(src.y, Float4 (`0.0f`)));
692	dst.z = As<Float4>(CmpNEQ(src.z, Float4 (`0.0f`)));
693	dst.w = As<Float4>(CmpNEQ(src.w, Float4 (`0.0f`)));
694	}
695
696	void ShaderCore::b2f(Vector4f &dst, const Vector4f &src)
697	{
698	dst.x = As<Float4>(As<Int4>(src.x) & As<Int4>(Float4 (`1.0f`)));
699	dst.y = As<Float4>(As<Int4>(src.y) & As<Int4>(Float4 (`1.0f`)));
700	dst.z = As<Float4>(As<Int4>(src.z) & As<Int4>(Float4 (`1.0f`)));
701	dst.w = As<Float4>(As<Int4>(src.w) & As<Int4>(Float4 (`1.0f`)));
702	}
703
704	void ShaderCore::f2i(Vector4f &dst, const Vector4f &src)
705	{
706	dst.x = As<Float4>(Int4 (src.x));
707	dst.y = As<Float4>(Int4 (src.y));
708	dst.z = As<Float4>(Int4 (src.z));
709	dst.w = As<Float4>(Int4 (src.w));
710	}
711
712	void ShaderCore::i2f(Vector4f &dst, const Vector4f &src)
713	{
714	dst.x = Float4 (As<Int4>(src.x));
715	dst.y = Float4 (As<Int4>(src.y));
716	dst.z = Float4 (As<Int4>(src.z));
717	dst.w = Float4 (As<Int4>(src.w));
718	}
719
720	void ShaderCore::f2u(Vector4f &dst, const Vector4f &src)
721	{
722	dst.x = As<Float4>(UInt4 (src.x));
723	dst.y = As<Float4>(UInt4 (src.y));
724	dst.z = As<Float4>(UInt4 (src.z));
725	dst.w = As<Float4>(UInt4 (src.w));
726	}
727
728	void ShaderCore::u2f(Vector4f &dst, const Vector4f &src)
729	{
730	dst.x = Float4 (As<UInt4>(src.x));
731	dst.y = Float4 (As<UInt4>(src.y));
732	dst.z = Float4 (As<UInt4>(src.z));
733	dst.w = Float4 (As<UInt4>(src.w));
734	}
735
736	void ShaderCore::i2b(Vector4f &dst, const Vector4f &src)
737	{
738	dst.x = As<Float4>(CmpNEQ(As<Int4>(src.x), Int4 (`0`)));
739	dst.y = As<Float4>(CmpNEQ(As<Int4>(src.y), Int4 (`0`)));
740	dst.z = As<Float4>(CmpNEQ(As<Int4>(src.z), Int4 (`0`)));
741	dst.w = As<Float4>(CmpNEQ(As<Int4>(src.w), Int4 (`0`)));
742	}
743
744	void ShaderCore::b2i(Vector4f &dst, const Vector4f &src)
745	{
746	dst.x = As<Float4>(As<Int4>(src.x) & Int4 (`1`));
747	dst.y = As<Float4>(As<Int4>(src.y) & Int4 (`1`));
748	dst.z = As<Float4>(As<Int4>(src.z) & Int4 (`1`));
749	dst.w = As<Float4>(As<Int4>(src.w) & Int4 (`1`));
750	}
751
752	void ShaderCore::add(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
753	{
754	dst.x = src0.x + src1.x;
755	dst.y = src0.y + src1.y;
756	dst.z = src0.z + src1.z;
757	dst.w = src0.w + src1.w;
758	}
759
760	void ShaderCore::iadd(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
761	{
762	dst.x = As<Float4>(As<Int4>(src0.x) + As<Int4>(src1.x));
763	dst.y = As<Float4>(As<Int4>(src0.y) + As<Int4>(src1.y));
764	dst.z = As<Float4>(As<Int4>(src0.z) + As<Int4>(src1.z));
765	dst.w = As<Float4>(As<Int4>(src0.w) + As<Int4>(src1.w));
766	}
767
768	void ShaderCore::sub(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
769	{
770	dst.x = src0.x - src1.x;
771	dst.y = src0.y - src1.y;
772	dst.z = src0.z - src1.z;
773	dst.w = src0.w - src1.w;
774	}
775
776	void ShaderCore::isub(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
777	{
778	dst.x = As<Float4>(As<Int4>(src0.x) - As<Int4>(src1.x));
779	dst.y = As<Float4>(As<Int4>(src0.y) - As<Int4>(src1.y));
780	dst.z = As<Float4>(As<Int4>(src0.z) - As<Int4>(src1.z));
781	dst.w = As<Float4>(As<Int4>(src0.w) - As<Int4>(src1.w));
782	}
783
784	void ShaderCore::mad(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
785	{
786	dst.x = src0.x * src1.x + src2.x;
787	dst.y = src0.y * src1.y + src2.y;
788	dst.z = src0.z * src1.z + src2.z;
789	dst.w = src0.w * src1.w + src2.w;
790	}
791
792	void ShaderCore::imad(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
793	{
794	dst.x = As<Float4>(As<Int4>(src0.x) * As<Int4>(src1.x) + As<Int4>(src2.x));
795	dst.y = As<Float4>(As<Int4>(src0.y) * As<Int4>(src1.y) + As<Int4>(src2.y));
796	dst.z = As<Float4>(As<Int4>(src0.z) * As<Int4>(src1.z) + As<Int4>(src2.z));
797	dst.w = As<Float4>(As<Int4>(src0.w) * As<Int4>(src1.w) + As<Int4>(src2.w));
798	}
799
800	void ShaderCore::mul(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
801	{
802	dst.x = src0.x * src1.x;
803	dst.y = src0.y * src1.y;
804	dst.z = src0.z * src1.z;
805	dst.w = src0.w * src1.w;
806	}
807
808	void ShaderCore::imul(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
809	{
810	dst.x = As<Float4>(As<Int4>(src0.x) * As<Int4>(src1.x));
811	dst.y = As<Float4>(As<Int4>(src0.y) * As<Int4>(src1.y));
812	dst.z = As<Float4>(As<Int4>(src0.z) * As<Int4>(src1.z));
813	dst.w = As<Float4>(As<Int4>(src0.w) * As<Int4>(src1.w));
814	}
815
816	void ShaderCore::rcpx(Vector4f &dst, const Vector4f &src, bool pp)
817	{
818	Float4 rcp = reciprocal(src.x, pp, true, true);
819
820	dst.x = rcp;
821	dst.y = rcp;
822	dst.z = rcp;
823	dst.w = rcp;
824	}
825
826	void ShaderCore::div(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
827	{
828	dst.x = src0.x / src1.x;
829	dst.y = src0.y / src1.y;
830	dst.z = src0.z / src1.z;
831	dst.w = src0.w / src1.w;
832	}
833
834	void ShaderCore::idiv(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
835	{
836	Float4 intMax(As<Float4>(Int4 (INT_MAX)));
837	cmp0i(dst.x, src1.x, intMax, src1.x);
838	dst.x = As<Float4>(As<Int4>(src0.x) / As<Int4>(dst.x));
839	cmp0i(dst.y, src1.y, intMax, src1.y);
840	dst.y = As<Float4>(As<Int4>(src0.y) / As<Int4>(dst.y));
841	cmp0i(dst.z, src1.z, intMax, src1.z);
842	dst.z = As<Float4>(As<Int4>(src0.z) / As<Int4>(dst.z));
843	cmp0i(dst.w, src1.w, intMax, src1.w);
844	dst.w = As<Float4>(As<Int4>(src0.w) / As<Int4>(dst.w));
845	}
846
847	void ShaderCore::udiv(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
848	{
849	Float4 uintMax(As<Float4>(UInt4 (UINT_MAX)));
850	cmp0i(dst.x, src1.x, uintMax, src1.x);
851	dst.x = As<Float4>(As<UInt4>(src0.x) / As<UInt4>(dst.x));
852	cmp0i(dst.y, src1.y, uintMax, src1.y);
853	dst.y = As<Float4>(As<UInt4>(src0.y) / As<UInt4>(dst.y));
854	cmp0i(dst.z, src1.z, uintMax, src1.z);
855	dst.z = As<Float4>(As<UInt4>(src0.z) / As<UInt4>(dst.z));
856	cmp0i(dst.w, src1.w, uintMax, src1.w);
857	dst.w = As<Float4>(As<UInt4>(src0.w) / As<UInt4>(dst.w));
858	}
859
860	void ShaderCore::mod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
861	{
862	dst.x = modulo(src0.x, src1.x);
863	dst.y = modulo(src0.y, src1.y);
864	dst.z = modulo(src0.z, src1.z);
865	dst.w = modulo(src0.w, src1.w);
866	}
867
868	void ShaderCore::imod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
869	{
870	Float4 intMax(As<Float4>(Int4 (INT_MAX)));
871	cmp0i(dst.x, src1.x, intMax, src1.x);
872	dst.x = As<Float4>(As<Int4>(src0.x) % As<Int4>(dst.x));
873	cmp0i(dst.y, src1.y, intMax, src1.y);
874	dst.y = As<Float4>(As<Int4>(src0.y) % As<Int4>(dst.y));
875	cmp0i(dst.z, src1.z, intMax, src1.z);
876	dst.z = As<Float4>(As<Int4>(src0.z) % As<Int4>(dst.z));
877	cmp0i(dst.w, src1.w, intMax, src1.w);
878	dst.w = As<Float4>(As<Int4>(src0.w) % As<Int4>(dst.w));
879	}
880
881	void ShaderCore::umod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
882	{
883	Float4 uintMax(As<Float4>(UInt4 (UINT_MAX)));
884	cmp0i(dst.x, src1.x, uintMax, src1.x);
885	dst.x = As<Float4>(As<UInt4>(src0.x) % As<UInt4>(dst.x));
886	cmp0i(dst.y, src1.y, uintMax, src1.y);
887	dst.y = As<Float4>(As<UInt4>(src0.y) % As<UInt4>(dst.y));
888	cmp0i(dst.z, src1.z, uintMax, src1.z);
889	dst.z = As<Float4>(As<UInt4>(src0.z) % As<UInt4>(dst.z));
890	cmp0i(dst.w, src1.w, uintMax, src1.w);
891	dst.w = As<Float4>(As<UInt4>(src0.w) % As<UInt4>(dst.w));
892	}
893
894	void ShaderCore::shl(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
895	{
896	dst.x = As<Float4>(As<Int4>(src0.x) << As<Int4>(src1.x));
897	dst.y = As<Float4>(As<Int4>(src0.y) << As<Int4>(src1.y));
898	dst.z = As<Float4>(As<Int4>(src0.z) << As<Int4>(src1.z));
899	dst.w = As<Float4>(As<Int4>(src0.w) << As<Int4>(src1.w));
900	}
901
902	void ShaderCore::ishr(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
903	{
904	dst.x = As<Float4>(As<Int4>(src0.x) >> As<Int4>(src1.x));
905	dst.y = As<Float4>(As<Int4>(src0.y) >> As<Int4>(src1.y));
906	dst.z = As<Float4>(As<Int4>(src0.z) >> As<Int4>(src1.z));
907	dst.w = As<Float4>(As<Int4>(src0.w) >> As<Int4>(src1.w));
908	}
909
910	void ShaderCore::ushr(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
911	{
912	dst.x = As<Float4>(As<UInt4>(src0.x) >> As<UInt4>(src1.x));
913	dst.y = As<Float4>(As<UInt4>(src0.y) >> As<UInt4>(src1.y));
914	dst.z = As<Float4>(As<UInt4>(src0.z) >> As<UInt4>(src1.z));
915	dst.w = As<Float4>(As<UInt4>(src0.w) >> As<UInt4>(src1.w));
916	}
917
918	void ShaderCore::rsqx(Vector4f &dst, const Vector4f &src, bool pp)
919	{
920	Float4 rsq = reciprocalSquareRoot(src.x, true, pp);
921
922	dst.x = rsq;
923	dst.y = rsq;
924	dst.z = rsq;
925	dst.w = rsq;
926	}
927
928	void ShaderCore::sqrt(Vector4f &dst, const Vector4f &src, bool pp)
929	{
930	dst.x = Sqrt(src.x);
931	dst.y = Sqrt(src.y);
932	dst.z = Sqrt(src.z);
933	dst.w = Sqrt(src.w);
934	}
935
936	void ShaderCore::rsq(Vector4f &dst, const Vector4f &src, bool pp)
937	{
938	dst.x = reciprocalSquareRoot(src.x, false, pp);
939	dst.y = reciprocalSquareRoot(src.y, false, pp);
940	dst.z = reciprocalSquareRoot(src.z, false, pp);
941	dst.w = reciprocalSquareRoot(src.w, false, pp);
942	}
943
944	void ShaderCore::len2(Float4 &dst, const Vector4f &src, bool pp)
945	{
946	dst = Sqrt(dot2(src, src));
947	}
948
949	void ShaderCore::len3(Float4 &dst, const Vector4f &src, bool pp)
950	{
951	dst = Sqrt(dot3(src, src));
952	}
953
954	void ShaderCore::len4(Float4 &dst, const Vector4f &src, bool pp)
955	{
956	dst = Sqrt(dot4(src, src));
957	}
958
959	void ShaderCore::dist1(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
960	{
961	dst = Abs(src0.x - src1.x);
962	}
963
964	void ShaderCore::dist2(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
965	{
966	Float4 dx = src0.x - src1.x;
967	Float4 dy = src0.y - src1.y;
968	Float4 dot2 = dx * dx + dy * dy;
969	dst = Sqrt(dot2);
970	}
971
972	void ShaderCore::dist3(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
973	{
974	Float4 dx = src0.x - src1.x;
975	Float4 dy = src0.y - src1.y;
976	Float4 dz = src0.z - src1.z;
977	Float4 dot3 = dx * dx + dy * dy + dz * dz;
978	dst = Sqrt(dot3);
979	}
980
981	void ShaderCore::dist4(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
982	{
983	Float4 dx = src0.x - src1.x;
984	Float4 dy = src0.y - src1.y;
985	Float4 dz = src0.z - src1.z;
986	Float4 dw = src0.w - src1.w;
987	Float4 dot4 = dx * dx + dy * dy + dz * dz + dw * dw;
988	dst = Sqrt(dot4);
989	}
990
991	void ShaderCore::dp1(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
992	{
993	Float4 t = src0.x * src1.x;
994
995	dst.x = t;
996	dst.y = t;
997	dst.z = t;
998	dst.w = t;
999	}
1000
1001	void ShaderCore::dp2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1002	{
1003	Float4 t = dot2(src0, src1);
1004
1005	dst.x = t;
1006	dst.y = t;
1007	dst.z = t;
1008	dst.w = t;
1009	}
1010
1011	void ShaderCore::dp2add(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
1012	{
1013	Float4 t = dot2(src0, src1) + src2.x;
1014
1015	dst.x = t;
1016	dst.y = t;
1017	dst.z = t;
1018	dst.w = t;
1019	}
1020
1021	void ShaderCore::dp3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1022	{
1023	Float4 dot = dot3(src0, src1);
1024
1025	dst.x = dot;
1026	dst.y = dot;
1027	dst.z = dot;
1028	dst.w = dot;
1029	}
1030
1031	void ShaderCore::dp4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1032	{
1033	Float4 dot = dot4(src0, src1);
1034
1035	dst.x = dot;
1036	dst.y = dot;
1037	dst.z = dot;
1038	dst.w = dot;
1039	}
1040
1041	void ShaderCore::min(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1042	{
1043	dst.x = Min(src0.x, src1.x);
1044	dst.y = Min(src0.y, src1.y);
1045	dst.z = Min(src0.z, src1.z);
1046	dst.w = Min(src0.w, src1.w);
1047	}
1048
1049	void ShaderCore::imin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1050	{
1051	dst.x = As<Float4>(Min(As<Int4>(src0.x), As<Int4>(src1.x)));
1052	dst.y = As<Float4>(Min(As<Int4>(src0.y), As<Int4>(src1.y)));
1053	dst.z = As<Float4>(Min(As<Int4>(src0.z), As<Int4>(src1.z)));
1054	dst.w = As<Float4>(Min(As<Int4>(src0.w), As<Int4>(src1.w)));
1055	}
1056
1057	void ShaderCore::umin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1058	{
1059	dst.x = As<Float4>(Min(As<UInt4>(src0.x), As<UInt4>(src1.x)));
1060	dst.y = As<Float4>(Min(As<UInt4>(src0.y), As<UInt4>(src1.y)));
1061	dst.z = As<Float4>(Min(As<UInt4>(src0.z), As<UInt4>(src1.z)));
1062	dst.w = As<Float4>(Min(As<UInt4>(src0.w), As<UInt4>(src1.w)));
1063	}
1064
1065	void ShaderCore::max(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1066	{
1067	dst.x = Max(src0.x, src1.x);
1068	dst.y = Max(src0.y, src1.y);
1069	dst.z = Max(src0.z, src1.z);
1070	dst.w = Max(src0.w, src1.w);
1071	}
1072
1073	void ShaderCore::imax(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1074	{
1075	dst.x = As<Float4>(Max(As<Int4>(src0.x), As<Int4>(src1.x)));
1076	dst.y = As<Float4>(Max(As<Int4>(src0.y), As<Int4>(src1.y)));
1077	dst.z = As<Float4>(Max(As<Int4>(src0.z), As<Int4>(src1.z)));
1078	dst.w = As<Float4>(Max(As<Int4>(src0.w), As<Int4>(src1.w)));
1079	}
1080
1081	void ShaderCore::umax(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1082	{
1083	dst.x = As<Float4>(Max(As<Int4>(src0.x), As<Int4>(src1.x)));
1084	dst.y = As<Float4>(Max(As<Int4>(src0.y), As<Int4>(src1.y)));
1085	dst.z = As<Float4>(Max(As<Int4>(src0.z), As<Int4>(src1.z)));
1086	dst.w = As<Float4>(Max(As<Int4>(src0.w), As<Int4>(src1.w)));
1087	}
1088
1089	void ShaderCore::slt(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1090	{
1091	dst.x = As<Float4>(As<Int4>(CmpLT(src0.x, src1.x)) & As<Int4>(Float4 (`1.0f`)));
1092	dst.y = As<Float4>(As<Int4>(CmpLT(src0.y, src1.y)) & As<Int4>(Float4 (`1.0f`)));
1093	dst.z = As<Float4>(As<Int4>(CmpLT(src0.z, src1.z)) & As<Int4>(Float4 (`1.0f`)));
1094	dst.w = As<Float4>(As<Int4>(CmpLT(src0.w, src1.w)) & As<Int4>(Float4 (`1.0f`)));
1095	}
1096
1097	void ShaderCore::step(Vector4f &dst, const Vector4f &edge, const Vector4f &x)
1098	{
1099	dst.x = As<Float4>(CmpNLT(x.x, edge.x) & As<Int4>(Float4 (`1.0f`)));
1100	dst.y = As<Float4>(CmpNLT(x.y, edge.y) & As<Int4>(Float4 (`1.0f`)));
1101	dst.z = As<Float4>(CmpNLT(x.z, edge.z) & As<Int4>(Float4 (`1.0f`)));
1102	dst.w = As<Float4>(CmpNLT(x.w, edge.w) & As<Int4>(Float4 (`1.0f`)));
1103	}
1104
1105	void ShaderCore::exp2x(Vector4f &dst, const Vector4f &src, bool pp)
1106	{
1107	Float4 exp = exponential2(src.x, pp);
1108
1109	dst.x = exp;
1110	dst.y = exp;
1111	dst.z = exp;
1112	dst.w = exp;
1113	}
1114
1115	void ShaderCore::exp2(Vector4f &dst, const Vector4f &src, bool pp)
1116	{
1117	dst.x = exponential2(src.x, pp);
1118	dst.y = exponential2(src.y, pp);
1119	dst.z = exponential2(src.z, pp);
1120	dst.w = exponential2(src.w, pp);
1121	}
1122
1123	void ShaderCore::exp(Vector4f &dst, const Vector4f &src, bool pp)
1124	{
1125	dst.x = exponential(src.x, pp);
1126	dst.y = exponential(src.y, pp);
1127	dst.z = exponential(src.z, pp);
1128	dst.w = exponential(src.w, pp);
1129	}
1130
1131	void ShaderCore::log2x(Vector4f &dst, const Vector4f &src, bool pp)
1132	{
1133	Float4 log = logarithm2(src.x, true, pp);
1134
1135	dst.x = log;
1136	dst.y = log;
1137	dst.z = log;
1138	dst.w = log;
1139	}
1140
1141	void ShaderCore::log2(Vector4f &dst, const Vector4f &src, bool pp)
1142	{
1143	dst.x = logarithm2(src.x, false, pp);
1144	dst.y = logarithm2(src.y, false, pp);
1145	dst.z = logarithm2(src.z, false, pp);
1146	dst.w = logarithm2(src.w, false, pp);
1147	}
1148
1149	void ShaderCore::log(Vector4f &dst, const Vector4f &src, bool pp)
1150	{
1151	dst.x = logarithm(src.x, false, pp);
1152	dst.y = logarithm(src.y, false, pp);
1153	dst.z = logarithm(src.z, false, pp);
1154	dst.w = logarithm(src.w, false, pp);
1155	}
1156
1157	void ShaderCore::lit(Vector4f &dst, const Vector4f &src)
1158	{
1159	dst.x = Float4 (`1.0f`);
1160	dst.y = Max(src.x, Float4 (`0.0f`));
1161
1162	Float4 pow;
1163
1164	pow = src.w;
1165	pow = Min(pow, Float4 (`127.9961f`));
1166	pow = Max(pow, Float4 (-`127.9961f`));
1167
1168	dst.z = power(src.y, pow);
1169	dst.z = As<Float4>(As<Int4>(dst.z) & CmpNLT(src.x, Float4 (`0.0f`)));
1170	dst.z = As<Float4>(As<Int4>(dst.z) & CmpNLT(src.y, Float4 (`0.0f`)));
1171
1172	dst.w = Float4 (`1.0f`);
1173	}
1174
1175	void ShaderCore::att(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1176	{
1177	// Computes attenuation factors (1, d, d^2, 1/d) assuming src0 = d^2, src1 = 1/d
1178	dst.x = `1`;
1179	dst.y = src0.y * src1.y;
1180	dst.z = src0.z;
1181	dst.w = src1.w;
1182	}
1183
1184	void ShaderCore::lrp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
1185	{
1186	dst.x = src0.x * (src1.x - src2.x) + src2.x;
1187	dst.y = src0.y * (src1.y - src2.y) + src2.y;
1188	dst.z = src0.z * (src1.z - src2.z) + src2.z;
1189	dst.w = src0.w * (src1.w - src2.w) + src2.w;
1190	}
1191
1192	void ShaderCore::isinf(Vector4f &dst, const Vector4f &src)
1193	{
1194	dst.x = As<Float4>(IsInf(src.x));
1195	dst.y = As<Float4>(IsInf(src.y));
1196	dst.z = As<Float4>(IsInf(src.z));
1197	dst.w = As<Float4>(IsInf(src.w));
1198	}
1199
1200	void ShaderCore::isnan(Vector4f &dst, const Vector4f &src)
1201	{
1202	dst.x = As<Float4>(IsNan(src.x));
1203	dst.y = As<Float4>(IsNan(src.y));
1204	dst.z = As<Float4>(IsNan(src.z));
1205	dst.w = As<Float4>(IsNan(src.w));
1206	}
1207
1208	void ShaderCore::smooth(Vector4f &dst, const Vector4f &edge0, const Vector4f &edge1, const Vector4f &x)
1209	{
1210	Float4 tx = Min(Max((x.x - edge0.x) / (edge1.x - edge0.x), Float4 (`0.0f`)), Float4 (`1.0f`)); dst.x = tx * tx * (Float4 (`3.0f`) - Float4 (`2.0f`) * tx);
1211	Float4 ty = Min(Max((x.y - edge0.y) / (edge1.y - edge0.y), Float4 (`0.0f`)), Float4 (`1.0f`)); dst.y = ty * ty * (Float4 (`3.0f`) - Float4 (`2.0f`) * ty);
1212	Float4 tz = Min(Max((x.z - edge0.z) / (edge1.z - edge0.z), Float4 (`0.0f`)), Float4 (`1.0f`)); dst.z = tz * tz * (Float4 (`3.0f`) - Float4 (`2.0f`) * tz);
1213	Float4 tw = Min(Max((x.w - edge0.w) / (edge1.w - edge0.w), Float4 (`0.0f`)), Float4 (`1.0f`)); dst.w = tw * tw * (Float4 (`3.0f`) - Float4 (`2.0f`) * tw);
1214	}
1215
1216	void ShaderCore::floatToHalfBits(Float4& dst, const Float4& floatBits, bool storeInUpperBits)
1217	{
1218	static const uint32_t mask_sign = `0x80000000u`;
1219	static const uint32_t mask_round = ~`0xfffu`;
1220	static const uint32_t c_f32infty = `255` << `23`;
1221	static const uint32_t c_magic = `15` << `23`;
1222	static const uint32_t c_nanbit = `0x200`;
1223	static const uint32_t c_infty_as_fp16 = `0x7c00`;
1224	static const uint32_t c_clamp = (`31` << `23`) - `0x1000`;
1225
1226	UInt4 justsign = UInt4 (mask_sign) & As<UInt4>(floatBits);
1227	UInt4 absf = As<UInt4>(floatBits) ^ justsign;
1228	UInt4 b_isnormal = CmpNLE(UInt4 (c_f32infty), absf);
1229
1230	// Note: this version doesn't round to the nearest even in case of a tie as defined by IEEE 754-2008, it rounds to +inf
1231	// instead of nearest even, since that's fine for GLSL ES 3.0's needs (see section 2.1.1 Floating-Point Computation)
1232	UInt4 joined = ((((As<UInt4>(Min(As<Float4>(absf & UInt4 (mask_round)) * As<Float4>(UInt4 (c_magic)),
1233	As<Float4>(UInt4 (c_clamp))))) - UInt4 (mask_round)) >> `13`) & b_isnormal) \|
1234	((b_isnormal ^ UInt4 (`0xFFFFFFFF`)) & ((CmpNLE(absf, UInt4 (c_f32infty)) & UInt4 (c_nanbit)) \|
1235	UInt4 (c_infty_as_fp16)));
1236
1237	dst = As<Float4>(storeInUpperBits ? As<UInt4>(dst) \| ((joined << `16`) \| justsign) : joined \| (justsign >> `16`));
1238	}
1239
1240	void ShaderCore::halfToFloatBits(Float4& dst, const Float4& halfBits)
1241	{
1242	static const uint32_t mask_nosign = `0x7FFF`;
1243	static const uint32_t magic = (`254` - `15`) << `23`;
1244	static const uint32_t was_infnan = `0x7BFF`;
1245	static const uint32_t exp_infnan = `255` << `23`;
1246
1247	UInt4 expmant = As<UInt4>(halfBits) & UInt4 (mask_nosign);
1248	dst = As<Float4>(As<UInt4>(As<Float4>(expmant << `13`) * As<Float4>(UInt4 (magic))) \|
1249	((As<UInt4>(halfBits) ^ UInt4 (expmant)) << `16`) \|
1250	(CmpNLE(As<UInt4>(expmant), UInt4 (was_infnan)) & UInt4 (exp_infnan)));
1251	}
1252
1253	void ShaderCore::packHalf2x16(Vector4f &d, const Vector4f &s0)
1254	{
1255	// half2 \| half1
1256	floatToHalfBits(d.x, s0.x, false);
1257	floatToHalfBits(d.x, s0.y, true);
1258	}
1259
1260	void ShaderCore::unpackHalf2x16(Vector4f &dst, const Vector4f &s0)
1261	{
1262	// half2 \| half1
1263	halfToFloatBits(dst.x, As<Float4>(As<UInt4>(s0.x) & UInt4 (`0x0000FFFF`)));
1264	halfToFloatBits(dst.y, As<Float4>((As<UInt4>(s0.x) & UInt4 (`0xFFFF0000`)) >> `16`));
1265	}
1266
1267	void ShaderCore::packSnorm2x16(Vector4f &d, const Vector4f &s0)
1268	{
1269	// round(clamp(c, -1.0, 1.0) 32767.0)*
1270	d.x = As<Float4>((Int4 (Round(Min(Max(s0.x, Float4 (-`1.0f`)), Float4 (`1.0f`)) * Float4 (`32767.0f`))) & Int4 (`0xFFFF`)) \|
1271	((Int4 (Round(Min(Max(s0.y, Float4 (-`1.0f`)), Float4 (`1.0f`)) * Float4 (`32767.0f`))) & Int4 (`0xFFFF`)) << `16`));
1272	}
1273
1274	void ShaderCore::packUnorm2x16(Vector4f &d, const Vector4f &s0)
1275	{
1276	// round(clamp(c, 0.0, 1.0) 65535.0)*
1277	d.x = As<Float4>((Int4 (Round(Min(Max(s0.x, Float4 (`0.0f`)), Float4 (`1.0f`)) * Float4 (`65535.0f`))) & Int4 (`0xFFFF`)) \|
1278	((Int4 (Round(Min(Max(s0.y, Float4 (`0.0f`)), Float4 (`1.0f`)) * Float4 (`65535.0f`))) & Int4 (`0xFFFF`)) << `16`));
1279	}
1280
1281	void ShaderCore::unpackSnorm2x16(Vector4f &dst, const Vector4f &s0)
1282	{
1283	// clamp(f / 32727.0, -1.0, 1.0)
1284	dst.x = Min(Max(Float4 (As<Int4>((As<UInt4>(s0.x) & UInt4 (`0x0000FFFF`)) << `16`)) * Float4 (`1.0f` / float(`0x7FFF0000`)), Float4 (-`1.0f`)), Float4 (`1.0f`));
1285	dst.y = Min(Max(Float4 (As<Int4>(As<UInt4>(s0.x) & UInt4 (`0xFFFF0000`))) * Float4 (`1.0f` / float(`0x7FFF0000`)), Float4 (-`1.0f`)), Float4 (`1.0f`));
1286	}
1287
1288	void ShaderCore::unpackUnorm2x16(Vector4f &dst, const Vector4f &s0)
1289	{
1290	// f / 65535.0
1291	dst.x = Float4 ((As<UInt4>(s0.x) & UInt4 (`0x0000FFFF`)) << `16`) * Float4 (`1.0f` / float(`0xFFFF0000`));
1292	dst.y = Float4 (As<UInt4>(s0.x) & UInt4 (`0xFFFF0000`)) * Float4 (`1.0f` / float(`0xFFFF0000`));
1293	}
1294
1295	void ShaderCore::det2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1296	{
1297	dst.x = src0.x * src1.y - src0.y * src1.x;
1298	dst.y = dst.z = dst.w = dst.x;
1299	}
1300
1301	void ShaderCore::det3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
1302	{
1303	crs(dst, src1, src2);
1304	dp3(dst, dst, src0);
1305	}
1306
1307	void ShaderCore::det4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2, const Vector4f &src3)
1308	{
1309	dst.x = src2.z * src3.w - src2.w * src3.z;
1310	dst.y = src1.w * src3.z - src1.z * src3.w;
1311	dst.z = src1.z * src2.w - src1.w * src2.z;
1312	dst.x = src0.x * (src1.y * dst.x + src2.y * dst.y + src3.y * dst.z) -
1313	src0.y * (src1.x * dst.x + src2.x * dst.y + src3.x * dst.z) +
1314	src0.z * (src1.x * (src2.y * src3.w - src2.w * src3.y) +
1315	src2.x * (src1.w * src3.y - src1.y * src3.w) +
1316	src3.x * (src1.y * src2.w - src1.w * src2.y)) +
1317	src0.w * (src1.x * (src2.z * src3.y - src2.y * src3.z) +
1318	src2.x * (src1.y * src3.z - src1.z * src3.y) +
1319	src3.x * (src1.z * src2.y - src1.y * src2.z));
1320	dst.y = dst.z = dst.w = dst.x;
1321	}
1322
1323	void ShaderCore::frc(Vector4f &dst, const Vector4f &src)
1324	{
1325	dst.x = Frac(src.x);
1326	dst.y = Frac(src.y);
1327	dst.z = Frac(src.z);
1328	dst.w = Frac(src.w);
1329	}
1330
1331	void ShaderCore::trunc(Vector4f &dst, const Vector4f &src)
1332	{
1333	dst.x = Trunc(src.x);
1334	dst.y = Trunc(src.y);
1335	dst.z = Trunc(src.z);
1336	dst.w = Trunc(src.w);
1337	}
1338
1339	void ShaderCore::floor(Vector4f &dst, const Vector4f &src)
1340	{
1341	dst.x = Floor(src.x);
1342	dst.y = Floor(src.y);
1343	dst.z = Floor(src.z);
1344	dst.w = Floor(src.w);
1345	}
1346
1347	void ShaderCore::round(Vector4f &dst, const Vector4f &src)
1348	{
1349	dst.x = Round(src.x);
1350	dst.y = Round(src.y);
1351	dst.z = Round(src.z);
1352	dst.w = Round(src.w);
1353	}
1354
1355	void ShaderCore::roundEven(Vector4f &dst, const Vector4f &src)
1356	{
1357	// dst = round(src) + ((round(src) < src) 2 - 1) * (fract(src) == 0.5) * isOdd(round(src));*
1358	// ex.: 1.5: 2 + (0 2 - 1) * 1 * 0 = 2*
1359	// 2.5: 3 + (0 2 - 1) * 1 * 1 = 2*
1360	// -1.5: -2 + (1 2 - 1) * 1 * 0 = -2*
1361	// -2.5: -3 + (1 2 - 1) * 1 * 1 = -2*
1362	// Even if the round implementation rounds the other way:
1363	// 1.5: 1 + (1 2 - 1) * 1 * 1 = 2*
1364	// 2.5: 2 + (1 2 - 1) * 1 * 0 = 2*
1365	// -1.5: -1 + (0 2 - 1) * 1 * 1 = -2*
1366	// -2.5: -2 + (0 2 - 1) * 1 * 0 = -2*
1367	round(dst, src);
1368	dst.x += ((Float4 (CmpLT(dst.x, src.x) & Int4 (`1`)) * Float4 (`2.0f`)) - Float4 (`1.0f`)) * Float4 (CmpEQ(Frac(src.x), Float4 (`0.5f`)) & Int4 (`1`)) * Float4 (Int4 (dst.x) & Int4 (`1`));
1369	dst.y += ((Float4 (CmpLT(dst.y, src.y) & Int4 (`1`)) * Float4 (`2.0f`)) - Float4 (`1.0f`)) * Float4 (CmpEQ(Frac(src.y), Float4 (`0.5f`)) & Int4 (`1`)) * Float4 (Int4 (dst.y) & Int4 (`1`));
1370	dst.z += ((Float4 (CmpLT(dst.z, src.z) & Int4 (`1`)) * Float4 (`2.0f`)) - Float4 (`1.0f`)) * Float4 (CmpEQ(Frac(src.z), Float4 (`0.5f`)) & Int4 (`1`)) * Float4 (Int4 (dst.z) & Int4 (`1`));
1371	dst.w += ((Float4 (CmpLT(dst.w, src.w) & Int4 (`1`)) * Float4 (`2.0f`)) - Float4 (`1.0f`)) * Float4 (CmpEQ(Frac(src.w), Float4 (`0.5f`)) & Int4 (`1`)) * Float4 (Int4 (dst.w) & Int4 (`1`));
1372	}
1373
1374	void ShaderCore::ceil(Vector4f &dst, const Vector4f &src)
1375	{
1376	dst.x = Ceil(src.x);
1377	dst.y = Ceil(src.y);
1378	dst.z = Ceil(src.z);
1379	dst.w = Ceil(src.w);
1380	}
1381
1382	void ShaderCore::powx(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
1383	{
1384	Float4 pow = power(src0.x, src1.x, pp);
1385
1386	dst.x = pow;
1387	dst.y = pow;
1388	dst.z = pow;
1389	dst.w = pow;
1390	}
1391
1392	void ShaderCore::pow(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
1393	{
1394	dst.x = power(src0.x, src1.x, pp);
1395	dst.y = power(src0.y, src1.y, pp);
1396	dst.z = power(src0.z, src1.z, pp);
1397	dst.w = power(src0.w, src1.w, pp);
1398	}
1399
1400	void ShaderCore::crs(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1401	{
1402	dst.x = src0.y * src1.z - src0.z * src1.y;
1403	dst.y = src0.z * src1.x - src0.x * src1.z;
1404	dst.z = src0.x * src1.y - src0.y * src1.x;
1405	}
1406
1407	void ShaderCore::forward1(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref)
1408	{
1409	Int4 flip = CmpNLT(Nref.x * I.x, Float4 (`0.0f`)) & Int4 (`0x80000000`);
1410
1411	dst.x = As<Float4>(flip ^ As<Int4>(N.x));
1412	}
1413
1414	void ShaderCore::forward2(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref)
1415	{
1416	Int4 flip = CmpNLT(dot2(Nref, I), Float4 (`0.0f`)) & Int4 (`0x80000000`);
1417
1418	dst.x = As<Float4>(flip ^ As<Int4>(N.x));
1419	dst.y = As<Float4>(flip ^ As<Int4>(N.y));
1420	}
1421
1422	void ShaderCore::forward3(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref)
1423	{
1424	Int4 flip = CmpNLT(dot3(Nref, I), Float4 (`0.0f`)) & Int4 (`0x80000000`);
1425
1426	dst.x = As<Float4>(flip ^ As<Int4>(N.x));
1427	dst.y = As<Float4>(flip ^ As<Int4>(N.y));
1428	dst.z = As<Float4>(flip ^ As<Int4>(N.z));
1429	}
1430
1431	void ShaderCore::forward4(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref)
1432	{
1433	Int4 flip = CmpNLT(dot4(Nref, I), Float4 (`0.0f`)) & Int4 (`0x80000000`);
1434
1435	dst.x = As<Float4>(flip ^ As<Int4>(N.x));
1436	dst.y = As<Float4>(flip ^ As<Int4>(N.y));
1437	dst.z = As<Float4>(flip ^ As<Int4>(N.z));
1438	dst.w = As<Float4>(flip ^ As<Int4>(N.w));
1439	}
1440
1441	void ShaderCore::reflect1(Vector4f &dst, const Vector4f &I, const Vector4f &N)
1442	{
1443	Float4 d = N.x * I.x;
1444
1445	dst.x = I.x - Float4 (`2.0f`) * d * N.x;
1446	}
1447
1448	void ShaderCore::reflect2(Vector4f &dst, const Vector4f &I, const Vector4f &N)
1449	{
1450	Float4 d = dot2(N, I);
1451
1452	dst.x = I.x - Float4 (`2.0f`) * d * N.x;
1453	dst.y = I.y - Float4 (`2.0f`) * d * N.y;
1454	}
1455
1456	void ShaderCore::reflect3(Vector4f &dst, const Vector4f &I, const Vector4f &N)
1457	{
1458	Float4 d = dot3(N, I);
1459
1460	dst.x = I.x - Float4 (`2.0f`) * d * N.x;
1461	dst.y = I.y - Float4 (`2.0f`) * d * N.y;
1462	dst.z = I.z - Float4 (`2.0f`) * d * N.z;
1463	}
1464
1465	void ShaderCore::reflect4(Vector4f &dst, const Vector4f &I, const Vector4f &N)
1466	{
1467	Float4 d = dot4(N, I);
1468
1469	dst.x = I.x - Float4 (`2.0f`) * d * N.x;
1470	dst.y = I.y - Float4 (`2.0f`) * d * N.y;
1471	dst.z = I.z - Float4 (`2.0f`) * d * N.z;
1472	dst.w = I.w - Float4 (`2.0f`) * d * N.w;
1473	}
1474
1475	void ShaderCore::refract1(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta)
1476	{
1477	Float4 d = N.x * I.x;
1478	Float4 k = Float4 (`1.0f`) - eta * eta * (Float4 (`1.0f`) - d * d);
1479	Int4 pos = CmpNLT(k, Float4 (`0.0f`));
1480	Float4 t = (eta * d + Sqrt(k));
1481
1482	dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x));
1483	}
1484
1485	void ShaderCore::refract2(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta)
1486	{
1487	Float4 d = dot2(N, I);
1488	Float4 k = Float4 (`1.0f`) - eta * eta * (Float4 (`1.0f`) - d * d);
1489	Int4 pos = CmpNLT(k, Float4 (`0.0f`));
1490	Float4 t = (eta * d + Sqrt(k));
1491
1492	dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x));
1493	dst.y = As<Float4>(pos & As<Int4>(eta * I.y - t * N.y));
1494	}
1495
1496	void ShaderCore::refract3(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta)
1497	{
1498	Float4 d = dot3(N, I);
1499	Float4 k = Float4 (`1.0f`) - eta * eta * (Float4 (`1.0f`) - d * d);
1500	Int4 pos = CmpNLT(k, Float4 (`0.0f`));
1501	Float4 t = (eta * d + Sqrt(k));
1502
1503	dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x));
1504	dst.y = As<Float4>(pos & As<Int4>(eta * I.y - t * N.y));
1505	dst.z = As<Float4>(pos & As<Int4>(eta * I.z - t * N.z));
1506	}
1507
1508	void ShaderCore::refract4(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta)
1509	{
1510	Float4 d = dot4(N, I);
1511	Float4 k = Float4 (`1.0f`) - eta * eta * (Float4 (`1.0f`) - d * d);
1512	Int4 pos = CmpNLT(k, Float4 (`0.0f`));
1513	Float4 t = (eta * d + Sqrt(k));
1514
1515	dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x));
1516	dst.y = As<Float4>(pos & As<Int4>(eta * I.y - t * N.y));
1517	dst.z = As<Float4>(pos & As<Int4>(eta * I.z - t * N.z));
1518	dst.w = As<Float4>(pos & As<Int4>(eta * I.w - t * N.w));
1519	}
1520
1521	void ShaderCore::sgn(Vector4f &dst, const Vector4f &src)
1522	{
1523	sgn(dst.x, src.x);
1524	sgn(dst.y, src.y);
1525	sgn(dst.z, src.z);
1526	sgn(dst.w, src.w);
1527	}
1528
1529	void ShaderCore::isgn(Vector4f &dst, const Vector4f &src)
1530	{
1531	isgn(dst.x, src.x);
1532	isgn(dst.y, src.y);
1533	isgn(dst.z, src.z);
1534	isgn(dst.w, src.w);
1535	}
1536
1537	void ShaderCore::abs(Vector4f &dst, const Vector4f &src)
1538	{
1539	dst.x = Abs(src.x);
1540	dst.y = Abs(src.y);
1541	dst.z = Abs(src.z);
1542	dst.w = Abs(src.w);
1543	}
1544
1545	void ShaderCore::iabs(Vector4f &dst, const Vector4f &src)
1546	{
1547	dst.x = As<Float4>(Abs(As<Int4>(src.x)));
1548	dst.y = As<Float4>(Abs(As<Int4>(src.y)));
1549	dst.z = As<Float4>(Abs(As<Int4>(src.z)));
1550	dst.w = As<Float4>(Abs(As<Int4>(src.w)));
1551	}
1552
1553	void ShaderCore::nrm2(Vector4f &dst, const Vector4f &src, bool pp)
1554	{
1555	Float4 dot = dot2(src, src);
1556	Float4 rsq = reciprocalSquareRoot(dot, false, pp);
1557
1558	dst.x = src.x * rsq;
1559	dst.y = src.y * rsq;
1560	dst.z = src.z * rsq;
1561	dst.w = src.w * rsq;
1562	}
1563
1564	void ShaderCore::nrm3(Vector4f &dst, const Vector4f &src, bool pp)
1565	{
1566	Float4 dot = dot3(src, src);
1567	Float4 rsq = reciprocalSquareRoot(dot, false, pp);
1568
1569	dst.x = src.x * rsq;
1570	dst.y = src.y * rsq;
1571	dst.z = src.z * rsq;
1572	dst.w = src.w * rsq;
1573	}
1574
1575	void ShaderCore::nrm4(Vector4f &dst, const Vector4f &src, bool pp)
1576	{
1577	Float4 dot = dot4(src, src);
1578	Float4 rsq = reciprocalSquareRoot(dot, false, pp);
1579
1580	dst.x = src.x * rsq;
1581	dst.y = src.y * rsq;
1582	dst.z = src.z * rsq;
1583	dst.w = src.w * rsq;
1584	}
1585
1586	void ShaderCore::sincos(Vector4f &dst, const Vector4f &src, bool pp)
1587	{
1588	dst.x = cosine_pi(src.x, pp);
1589	dst.y = sine_pi(src.x, pp);
1590	}
1591
1592	void ShaderCore::cos(Vector4f &dst, const Vector4f &src, bool pp)
1593	{
1594	dst.x = cosine(src.x, pp);
1595	dst.y = cosine(src.y, pp);
1596	dst.z = cosine(src.z, pp);
1597	dst.w = cosine(src.w, pp);
1598	}
1599
1600	void ShaderCore::sin(Vector4f &dst, const Vector4f &src, bool pp)
1601	{
1602	dst.x = sine(src.x, pp);
1603	dst.y = sine(src.y, pp);
1604	dst.z = sine(src.z, pp);
1605	dst.w = sine(src.w, pp);
1606	}
1607
1608	void ShaderCore::tan(Vector4f &dst, const Vector4f &src, bool pp)
1609	{
1610	dst.x = tangent(src.x, pp);
1611	dst.y = tangent(src.y, pp);
1612	dst.z = tangent(src.z, pp);
1613	dst.w = tangent(src.w, pp);
1614	}
1615
1616	void ShaderCore::acos(Vector4f &dst, const Vector4f &src, bool pp)
1617	{
1618	dst.x = arccos(src.x, pp);
1619	dst.y = arccos(src.y, pp);
1620	dst.z = arccos(src.z, pp);
1621	dst.w = arccos(src.w, pp);
1622	}
1623
1624	void ShaderCore::asin(Vector4f &dst, const Vector4f &src, bool pp)
1625	{
1626	dst.x = arcsin(src.x, pp);
1627	dst.y = arcsin(src.y, pp);
1628	dst.z = arcsin(src.z, pp);
1629	dst.w = arcsin(src.w, pp);
1630	}
1631
1632	void ShaderCore::atan(Vector4f &dst, const Vector4f &src, bool pp)
1633	{
1634	dst.x = arctan(src.x, pp);
1635	dst.y = arctan(src.y, pp);
1636	dst.z = arctan(src.z, pp);
1637	dst.w = arctan(src.w, pp);
1638	}
1639
1640	void ShaderCore::atan2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
1641	{
1642	dst.x = arctan(src0.x, src1.x, pp);
1643	dst.y = arctan(src0.y, src1.y, pp);
1644	dst.z = arctan(src0.z, src1.z, pp);
1645	dst.w = arctan(src0.w, src1.w, pp);
1646	}
1647
1648	void ShaderCore::cosh(Vector4f &dst, const Vector4f &src, bool pp)
1649	{
1650	dst.x = cosineh(src.x, pp);
1651	dst.y = cosineh(src.y, pp);
1652	dst.z = cosineh(src.z, pp);
1653	dst.w = cosineh(src.w, pp);
1654	}
1655
1656	void ShaderCore::sinh(Vector4f &dst, const Vector4f &src, bool pp)
1657	{
1658	dst.x = sineh(src.x, pp);
1659	dst.y = sineh(src.y, pp);
1660	dst.z = sineh(src.z, pp);
1661	dst.w = sineh(src.w, pp);
1662	}
1663
1664	void ShaderCore::tanh(Vector4f &dst, const Vector4f &src, bool pp)
1665	{
1666	dst.x = tangenth(src.x, pp);
1667	dst.y = tangenth(src.y, pp);
1668	dst.z = tangenth(src.z, pp);
1669	dst.w = tangenth(src.w, pp);
1670	}
1671
1672	void ShaderCore::acosh(Vector4f &dst, const Vector4f &src, bool pp)
1673	{
1674	dst.x = arccosh(src.x, pp);
1675	dst.y = arccosh(src.y, pp);
1676	dst.z = arccosh(src.z, pp);
1677	dst.w = arccosh(src.w, pp);
1678	}
1679
1680	void ShaderCore::asinh(Vector4f &dst, const Vector4f &src, bool pp)
1681	{
1682	dst.x = arcsinh(src.x, pp);
1683	dst.y = arcsinh(src.y, pp);
1684	dst.z = arcsinh(src.z, pp);
1685	dst.w = arcsinh(src.w, pp);
1686	}
1687
1688	void ShaderCore::atanh(Vector4f &dst, const Vector4f &src, bool pp)
1689	{
1690	dst.x = arctanh(src.x, pp);
1691	dst.y = arctanh(src.y, pp);
1692	dst.z = arctanh(src.z, pp);
1693	dst.w = arctanh(src.w, pp);
1694	}
1695
1696	void ShaderCore::expp(Vector4f &dst, const Vector4f &src, unsigned short shaderModel)
1697	{
1698	if(shaderModel < `0x0200`)
1699	{
1700	Float4 frc = Frac(src.x);
1701	Float4 floor = src.x - frc;
1702
1703	dst.x = exponential2(floor, true);
1704	dst.y = frc;
1705	dst.z = exponential2(src.x, true);
1706	dst.w = Float4 (`1.0f`);
1707	}
1708	else // Version >= 2.0
1709	{
1710	exp2x(dst, src, true); // FIXME: 10-bit precision suffices
1711	}
1712	}
1713
1714	void ShaderCore::logp(Vector4f &dst, const Vector4f &src, unsigned short shaderModel)
1715	{
1716	if(shaderModel < `0x0200`)
1717	{
1718	Float4 tmp0;
1719	Float4 tmp1;
1720	Float4 t;
1721	Int4 r;
1722
1723	tmp0 = Abs(src.x);
1724	tmp1 = tmp0;
1725
1726	// X component
1727	r = As<Int4>(As<UInt4>(tmp0) >> `23`) - Int4 (`127`);
1728	dst.x = Float4 (r);
1729
1730	// Y component
1731	dst.y = As<Float4>((As<Int4>(tmp1) & Int4 (`0x007FFFFF`)) \| As<Int4>(Float4 (`1.0f`)));
1732
1733	// Z component
1734	dst.z = logarithm2(src.x, true, true);
1735
1736	// W component
1737	dst.w = `1.0f`;
1738	}
1739	else
1740	{
1741	log2x(dst, src, true);
1742	}
1743	}
1744
1745	void ShaderCore::cmp0(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
1746	{
1747	cmp0(dst.x, src0.x, src1.x, src2.x);
1748	cmp0(dst.y, src0.y, src1.y, src2.y);
1749	cmp0(dst.z, src0.z, src1.z, src2.z);
1750	cmp0(dst.w, src0.w, src1.w, src2.w);
1751	}
1752
1753	void ShaderCore::select(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
1754	{
1755	select(dst.x, As<Int4>(src0.x), src1.x, src2.x);
1756	select(dst.y, As<Int4>(src0.y), src1.y, src2.y);
1757	select(dst.z, As<Int4>(src0.z), src1.z, src2.z);
1758	select(dst.w, As<Int4>(src0.w), src1.w, src2.w);
1759	}
1760
1761	void ShaderCore::extract(Float4 &dst, const Vector4f &src0, const Float4 &src1)
1762	{
1763	select(dst, CmpEQ(As<Int4>(src1), Int4 (`1`)), src0.y, src0.x);
1764	select(dst, CmpEQ(As<Int4>(src1), Int4 (`2`)), src0.z, dst);
1765	select(dst, CmpEQ(As<Int4>(src1), Int4 (`3`)), src0.w, dst);
1766	}
1767
1768	void ShaderCore::insert(Vector4f &dst, const Vector4f &src, const Float4 &element, const Float4 &index)
1769	{
1770	select(dst.x, CmpEQ(As<Int4>(index), Int4 (`0`)), element, src.x);
1771	select(dst.y, CmpEQ(As<Int4>(index), Int4 (`1`)), element, src.y);
1772	select(dst.z, CmpEQ(As<Int4>(index), Int4 (`2`)), element, src.z);
1773	select(dst.w, CmpEQ(As<Int4>(index), Int4 (`3`)), element, src.w);
1774	}
1775
1776	void ShaderCore::sgn(Float4 &dst, const Float4 &src)
1777	{
1778	Int4 neg = As<Int4>(CmpLT(src, Float4 (-`0.0f`))) & As<Int4>(Float4 (-`1.0f`));
1779	Int4 pos = As<Int4>(CmpNLE(src, Float4 (+`0.0f`))) & As<Int4>(Float4 (`1.0f`));
1780	dst = As<Float4>(neg \| pos);
1781	}
1782
1783	void ShaderCore::isgn(Float4 &dst, const Float4 &src)
1784	{
1785	Int4 neg = CmpLT(As<Int4>(src), Int4 (`0`)) & Int4 (-`1`);
1786	Int4 pos = CmpNLE(As<Int4>(src), Int4 (`0`)) & Int4 (`1`);
1787	dst = As<Float4>(neg \| pos);
1788	}
1789
1790	void ShaderCore::cmp0(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2)
1791	{
1792	Int4 pos = CmpLE(Float4 (`0.0f`), src0);
1793	select(dst, pos, src1, src2);
1794	}
1795
1796	void ShaderCore::cmp0i(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2)
1797	{
1798	Int4 pos = CmpEQ(Int4 (`0`), As<Int4>(src0));
1799	select(dst, pos, src1, src2);
1800	}
1801
1802	void ShaderCore::select(Float4 &dst, RValue<Int4> src0, const Float4 &src1, const Float4 &src2)
1803	{
1804	// FIXME: LLVM vector select
1805	dst = As<Float4>((src0 & As<Int4>(src1)) \| (~src0 & As<Int4>(src2)));
1806	}
1807
1808	void ShaderCore::cmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control)
1809	{
1810	switch(control)
1811	{
1812	case Shader::CONTROL_GT:
1813	dst.x = As<Float4>(CmpNLE(src0.x, src1.x));
1814	dst.y = As<Float4>(CmpNLE(src0.y, src1.y));
1815	dst.z = As<Float4>(CmpNLE(src0.z, src1.z));
1816	dst.w = As<Float4>(CmpNLE(src0.w, src1.w));
1817	break;
1818	case Shader::CONTROL_EQ:
1819	dst.x = As<Float4>(CmpEQ(src0.x, src1.x));
1820	dst.y = As<Float4>(CmpEQ(src0.y, src1.y));
1821	dst.z = As<Float4>(CmpEQ(src0.z, src1.z));
1822	dst.w = As<Float4>(CmpEQ(src0.w, src1.w));
1823	break;
1824	case Shader::CONTROL_GE:
1825	dst.x = As<Float4>(CmpNLT(src0.x, src1.x));
1826	dst.y = As<Float4>(CmpNLT(src0.y, src1.y));
1827	dst.z = As<Float4>(CmpNLT(src0.z, src1.z));
1828	dst.w = As<Float4>(CmpNLT(src0.w, src1.w));
1829	break;
1830	case Shader::CONTROL_LT:
1831	dst.x = As<Float4>(CmpLT(src0.x, src1.x));
1832	dst.y = As<Float4>(CmpLT(src0.y, src1.y));
1833	dst.z = As<Float4>(CmpLT(src0.z, src1.z));
1834	dst.w = As<Float4>(CmpLT(src0.w, src1.w));
1835	break;
1836	case Shader::CONTROL_NE:
1837	dst.x = As<Float4>(CmpNEQ(src0.x, src1.x));
1838	dst.y = As<Float4>(CmpNEQ(src0.y, src1.y));
1839	dst.z = As<Float4>(CmpNEQ(src0.z, src1.z));
1840	dst.w = As<Float4>(CmpNEQ(src0.w, src1.w));
1841	break;
1842	case Shader::CONTROL_LE:
1843	dst.x = As<Float4>(CmpLE(src0.x, src1.x));
1844	dst.y = As<Float4>(CmpLE(src0.y, src1.y));
1845	dst.z = As<Float4>(CmpLE(src0.z, src1.z));
1846	dst.w = As<Float4>(CmpLE(src0.w, src1.w));
1847	break;
1848	default:
1849	ASSERT(false);
1850	}
1851	}
1852
1853	void ShaderCore::icmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control)
1854	{
1855	switch(control)
1856	{
1857	case Shader::CONTROL_GT:
1858	dst.x = As<Float4>(CmpNLE(As<Int4>(src0.x), As<Int4>(src1.x)));
1859	dst.y = As<Float4>(CmpNLE(As<Int4>(src0.y), As<Int4>(src1.y)));
1860	dst.z = As<Float4>(CmpNLE(As<Int4>(src0.z), As<Int4>(src1.z)));
1861	dst.w = As<Float4>(CmpNLE(As<Int4>(src0.w), As<Int4>(src1.w)));
1862	break;
1863	case Shader::CONTROL_EQ:
1864	dst.x = As<Float4>(CmpEQ(As<Int4>(src0.x), As<Int4>(src1.x)));
1865	dst.y = As<Float4>(CmpEQ(As<Int4>(src0.y), As<Int4>(src1.y)));
1866	dst.z = As<Float4>(CmpEQ(As<Int4>(src0.z), As<Int4>(src1.z)));
1867	dst.w = As<Float4>(CmpEQ(As<Int4>(src0.w), As<Int4>(src1.w)));
1868	break;
1869	case Shader::CONTROL_GE:
1870	dst.x = As<Float4>(CmpNLT(As<Int4>(src0.x), As<Int4>(src1.x)));
1871	dst.y = As<Float4>(CmpNLT(As<Int4>(src0.y), As<Int4>(src1.y)));
1872	dst.z = As<Float4>(CmpNLT(As<Int4>(src0.z), As<Int4>(src1.z)));
1873	dst.w = As<Float4>(CmpNLT(As<Int4>(src0.w), As<Int4>(src1.w)));
1874	break;
1875	case Shader::CONTROL_LT:
1876	dst.x = As<Float4>(CmpLT(As<Int4>(src0.x), As<Int4>(src1.x)));
1877	dst.y = As<Float4>(CmpLT(As<Int4>(src0.y), As<Int4>(src1.y)));
1878	dst.z = As<Float4>(CmpLT(As<Int4>(src0.z), As<Int4>(src1.z)));
1879	dst.w = As<Float4>(CmpLT(As<Int4>(src0.w), As<Int4>(src1.w)));
1880	break;
1881	case Shader::CONTROL_NE:
1882	dst.x = As<Float4>(CmpNEQ(As<Int4>(src0.x), As<Int4>(src1.x)));
1883	dst.y = As<Float4>(CmpNEQ(As<Int4>(src0.y), As<Int4>(src1.y)));
1884	dst.z = As<Float4>(CmpNEQ(As<Int4>(src0.z), As<Int4>(src1.z)));
1885	dst.w = As<Float4>(CmpNEQ(As<Int4>(src0.w), As<Int4>(src1.w)));
1886	break;
1887	case Shader::CONTROL_LE:
1888	dst.x = As<Float4>(CmpLE(As<Int4>(src0.x), As<Int4>(src1.x)));
1889	dst.y = As<Float4>(CmpLE(As<Int4>(src0.y), As<Int4>(src1.y)));
1890	dst.z = As<Float4>(CmpLE(As<Int4>(src0.z), As<Int4>(src1.z)));
1891	dst.w = As<Float4>(CmpLE(As<Int4>(src0.w), As<Int4>(src1.w)));
1892	break;
1893	default:
1894	ASSERT(false);
1895	}
1896	}
1897
1898	void ShaderCore::ucmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control)
1899	{
1900	switch(control)
1901	{
1902	case Shader::CONTROL_GT:
1903	dst.x = As<Float4>(CmpNLE(As<UInt4>(src0.x), As<UInt4>(src1.x)));
1904	dst.y = As<Float4>(CmpNLE(As<UInt4>(src0.y), As<UInt4>(src1.y)));
1905	dst.z = As<Float4>(CmpNLE(As<UInt4>(src0.z), As<UInt4>(src1.z)));
1906	dst.w = As<Float4>(CmpNLE(As<UInt4>(src0.w), As<UInt4>(src1.w)));
1907	break;
1908	case Shader::CONTROL_EQ:
1909	dst.x = As<Float4>(CmpEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)));
1910	dst.y = As<Float4>(CmpEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)));
1911	dst.z = As<Float4>(CmpEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)));
1912	dst.w = As<Float4>(CmpEQ(As<UInt4>(src0.w), As<UInt4>(src1.w)));
1913	break;
1914	case Shader::CONTROL_GE:
1915	dst.x = As<Float4>(CmpNLT(As<UInt4>(src0.x), As<UInt4>(src1.x)));
1916	dst.y = As<Float4>(CmpNLT(As<UInt4>(src0.y), As<UInt4>(src1.y)));
1917	dst.z = As<Float4>(CmpNLT(As<UInt4>(src0.z), As<UInt4>(src1.z)));
1918	dst.w = As<Float4>(CmpNLT(As<UInt4>(src0.w), As<UInt4>(src1.w)));
1919	break;
1920	case Shader::CONTROL_LT:
1921	dst.x = As<Float4>(CmpLT(As<UInt4>(src0.x), As<UInt4>(src1.x)));
1922	dst.y = As<Float4>(CmpLT(As<UInt4>(src0.y), As<UInt4>(src1.y)));
1923	dst.z = As<Float4>(CmpLT(As<UInt4>(src0.z), As<UInt4>(src1.z)));
1924	dst.w = As<Float4>(CmpLT(As<UInt4>(src0.w), As<UInt4>(src1.w)));
1925	break;
1926	case Shader::CONTROL_NE:
1927	dst.x = As<Float4>(CmpNEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)));
1928	dst.y = As<Float4>(CmpNEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)));
1929	dst.z = As<Float4>(CmpNEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)));
1930	dst.w = As<Float4>(CmpNEQ(As<UInt4>(src0.w), As<UInt4>(src1.w)));
1931	break;
1932	case Shader::CONTROL_LE:
1933	dst.x = As<Float4>(CmpLE(As<UInt4>(src0.x), As<UInt4>(src1.x)));
1934	dst.y = As<Float4>(CmpLE(As<UInt4>(src0.y), As<UInt4>(src1.y)));
1935	dst.z = As<Float4>(CmpLE(As<UInt4>(src0.z), As<UInt4>(src1.z)));
1936	dst.w = As<Float4>(CmpLE(As<UInt4>(src0.w), As<UInt4>(src1.w)));
1937	break;
1938	default:
1939	ASSERT(false);
1940	}
1941	}
1942
1943	void ShaderCore::all(Float4 &dst, const Vector4f &src)
1944	{
1945	dst = As<Float4>(As<Int4>(src.x) & As<Int4>(src.y) & As<Int4>(src.z) & As<Int4>(src.w));
1946	}
1947
1948	void ShaderCore::any(Float4 &dst, const Vector4f &src)
1949	{
1950	dst = As<Float4>(As<Int4>(src.x) \| As<Int4>(src.y) \| As<Int4>(src.z) \| As<Int4>(src.w));
1951	}
1952
1953	void ShaderCore::bitwise_not(Vector4f &dst, const Vector4f &src)
1954	{
1955	dst.x = As<Float4>(As<Int4>(src.x) ^ Int4 (`0xFFFFFFFF`));
1956	dst.y = As<Float4>(As<Int4>(src.y) ^ Int4 (`0xFFFFFFFF`));
1957	dst.z = As<Float4>(As<Int4>(src.z) ^ Int4 (`0xFFFFFFFF`));
1958	dst.w = As<Float4>(As<Int4>(src.w) ^ Int4 (`0xFFFFFFFF`));
1959	}
1960
1961	void ShaderCore::bitwise_or(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1962	{
1963	dst.x = As<Float4>(As<Int4>(src0.x) \| As<Int4>(src1.x));
1964	dst.y = As<Float4>(As<Int4>(src0.y) \| As<Int4>(src1.y));
1965	dst.z = As<Float4>(As<Int4>(src0.z) \| As<Int4>(src1.z));
1966	dst.w = As<Float4>(As<Int4>(src0.w) \| As<Int4>(src1.w));
1967	}
1968
1969	void ShaderCore::bitwise_xor(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1970	{
1971	dst.x = As<Float4>(As<Int4>(src0.x) ^ As<Int4>(src1.x));
1972	dst.y = As<Float4>(As<Int4>(src0.y) ^ As<Int4>(src1.y));
1973	dst.z = As<Float4>(As<Int4>(src0.z) ^ As<Int4>(src1.z));
1974	dst.w = As<Float4>(As<Int4>(src0.w) ^ As<Int4>(src1.w));
1975	}
1976
1977	void ShaderCore::bitwise_and(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1978	{
1979	dst.x = As<Float4>(As<Int4>(src0.x) & As<Int4>(src1.x));
1980	dst.y = As<Float4>(As<Int4>(src0.y) & As<Int4>(src1.y));
1981	dst.z = As<Float4>(As<Int4>(src0.z) & As<Int4>(src1.z));
1982	dst.w = As<Float4>(As<Int4>(src0.w) & As<Int4>(src1.w));
1983	}
1984
1985	void ShaderCore::equal(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1986	{
1987	dst.x = As<Float4>(CmpEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)) &
1988	CmpEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)) &
1989	CmpEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)) &
1990	CmpEQ(As<UInt4>(src0.w), As<UInt4>(src1.w)));
1991	dst.y = dst.x;
1992	dst.z = dst.x;
1993	dst.w = dst.x;
1994	}
1995
1996	void ShaderCore::notEqual(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1997	{
1998	dst.x = As<Float4>(CmpNEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)) \|
1999	CmpNEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)) \|
2000	CmpNEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)) \|
2001	CmpNEQ(As<UInt4>(src0.w), As<UInt4>(src1.w)));
2002	dst.y = dst.x;
2003	dst.z = dst.x;
2004	dst.w = dst.x;
2005	}
2006	}
2007

Browse the source code of engine/third_party/swiftshader/src/Shader/ShaderCore.cpp