1 | // Copyright 2016 The SwiftShader Authors. All Rights Reserved. |
2 | // |
3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | // you may not use this file except in compliance with the License. |
5 | // You may obtain a copy of the License at |
6 | // |
7 | // http://www.apache.org/licenses/LICENSE-2.0 |
8 | // |
9 | // Unless required by applicable law or agreed to in writing, software |
10 | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | // See the License for the specific language governing permissions and |
13 | // limitations under the License. |
14 | |
15 | #include "ShaderCore.hpp" |
16 | |
17 | #include "Renderer/Renderer.hpp" |
18 | #include "Common/Debug.hpp" |
19 | |
20 | #include <limits.h> |
21 | |
22 | namespace sw |
23 | { |
// Precision settings of type TranscendentalPrecision. They are only declared
// here (extern); their definitions live in another translation unit.
// rcpPrecision is compared against WHQL by reciprocal() below.
extern TranscendentalPrecision logPrecision;
extern TranscendentalPrecision expPrecision;
extern TranscendentalPrecision rcpPrecision;
extern TranscendentalPrecision rsqPrecision;
28 | |
// Default constructor: performs no explicit initialization of x, y, z and w.
Vector4s::Vector4s()
{
}
32 | |
33 | Vector4s::Vector4s(unsigned short x, unsigned short y, unsigned short z, unsigned short w) |
34 | { |
35 | this->x = Short4(x); |
36 | this->y = Short4(y); |
37 | this->z = Short4(z); |
38 | this->w = Short4(w); |
39 | } |
40 | |
41 | Vector4s::Vector4s(const Vector4s &rhs) |
42 | { |
43 | x = rhs.x; |
44 | y = rhs.y; |
45 | z = rhs.z; |
46 | w = rhs.w; |
47 | } |
48 | |
49 | Vector4s &Vector4s::operator=(const Vector4s &rhs) |
50 | { |
51 | x = rhs.x; |
52 | y = rhs.y; |
53 | z = rhs.z; |
54 | w = rhs.w; |
55 | |
56 | return *this; |
57 | } |
58 | |
59 | Short4 &Vector4s::operator[](int i) |
60 | { |
61 | switch(i) |
62 | { |
63 | case 0: return x; |
64 | case 1: return y; |
65 | case 2: return z; |
66 | case 3: return w; |
67 | } |
68 | |
69 | return x; |
70 | } |
71 | |
// Default constructor: performs no explicit initialization of x, y, z and w.
Vector4f::Vector4f()
{
}
75 | |
76 | Vector4f::Vector4f(float x, float y, float z, float w) |
77 | { |
78 | this->x = Float4(x); |
79 | this->y = Float4(y); |
80 | this->z = Float4(z); |
81 | this->w = Float4(w); |
82 | } |
83 | |
84 | Vector4f::Vector4f(const Vector4f &rhs) |
85 | { |
86 | x = rhs.x; |
87 | y = rhs.y; |
88 | z = rhs.z; |
89 | w = rhs.w; |
90 | } |
91 | |
92 | Vector4f &Vector4f::operator=(const Vector4f &rhs) |
93 | { |
94 | x = rhs.x; |
95 | y = rhs.y; |
96 | z = rhs.z; |
97 | w = rhs.w; |
98 | |
99 | return *this; |
100 | } |
101 | |
102 | Float4 &Vector4f::operator[](int i) |
103 | { |
104 | switch(i) |
105 | { |
106 | case 0: return x; |
107 | case 1: return y; |
108 | case 2: return z; |
109 | case 3: return w; |
110 | } |
111 | |
112 | return x; |
113 | } |
114 | |
// Approximates 2^x for each lane of x.
// The 'pp' argument is accepted but not read by this implementation.
Float4 exponential2(RValue<Float4> x, bool pp)
{
	// This implementation is based on 2^(i + f) = 2^i * 2^f,
	// where i is the integer part of x and f is the fraction.

	// For 2^i we can put the integer part directly in the exponent of
	// the IEEE-754 floating-point number. Clamp to prevent overflow
	// past the representation of infinity.
	Float4 x0 = x;
	x0 = Min(x0, As<Float4>(Int4(0x43010000)));   // 129.00000e+0f
	x0 = Max(x0, As<Float4>(Int4(0xC2FDFFFF)));   // -126.99999e+0f

	// Integer part i of the clamped input. The 0.5 bias before RoundInt is
	// presumably there to turn round-to-nearest into a floor - TODO confirm
	// against RoundInt's rounding mode.
	Int4 i = RoundInt(x0 - Float4(0.5f));
	Float4 ii = As<Float4>((i + Int4(127)) << 23);   // Add single-precision bias, and shift into exponent.

	// For the fractional part use a polynomial
	// which approximates 2^f in the 0 to 1 range.
	// The polynomial is evaluated in Horner form, highest-order coefficient first.
	Float4 f = x0 - Float4(i);
	Float4 ff = As<Float4>(Int4(0x3AF61905));     // 1.8775767e-3f
	ff = ff * f + As<Float4>(Int4(0x3C134806));   // 8.9893397e-3f
	ff = ff * f + As<Float4>(Int4(0x3D64AA23));   // 5.5826318e-2f
	ff = ff * f + As<Float4>(Int4(0x3E75EAD4));   // 2.4015361e-1f
	ff = ff * f + As<Float4>(Int4(0x3F31727B));   // 6.9315308e-1f
	ff = ff * f + Float4(1.0f);

	// 2^x = 2^i * 2^f
	return ii * ff;
}
142 | |
// Approximates log2(x) for each lane of x. Lanes whose bit pattern equals
// +infinity (0x7F800000) return x unchanged.
// The 'absolute' and 'pp' arguments are accepted but not read by this implementation.
Float4 logarithm2(RValue<Float4> x, bool absolute, bool pp)
{
	Float4 x0;
	Float4 x1;
	Float4 x2;
	Float4 x3;

	x0 = x;

	// Isolate the biased exponent field, move it into the top of the mantissa
	// field and give it the exponent of 1.0f, producing 1 + biasedExponent / 256.
	x1 = As<Float4>(As<Int4>(x0) & Int4(0x7F800000));
	x1 = As<Float4>(As<UInt4>(x1) >> 8);
	x1 = As<Float4>(As<Int4>(x1) | As<Int4>(Float4(1.0f)));
	// 1.4960938 is 1 + 127/256, so this yields the unbiased exponent as a float.
	x1 = (x1 - Float4(1.4960938f)) * Float4(256.0f);   // FIXME: (x1 - 1.4960938f) * 256.0f;
	// Keep only the mantissa bits and give them the exponent of 1.0f: x0 is now in [1, 2).
	x0 = As<Float4>((As<Int4>(x0) & Int4(0x007FFFFF)) | As<Int4>(Float4(1.0f)));

	// Ratio of two polynomials in the mantissa x0 (quadratic over cubic).
	x2 = (Float4(9.5428179e-2f) * x0 + Float4(4.7779095e-1f)) * x0 + Float4(1.9782813e-1f);
	x3 = ((Float4(1.6618466e-2f) * x0 + Float4(2.0350508e-1f)) * x0 + Float4(2.7382900e-1f)) * x0 + Float4(4.0496687e-2f);
	x2 /= x3;

	// Exponent plus the mantissa's contribution, (x0 - 1) * ratio.
	x1 += (x0 - Float4(1.0f)) * x2;

	// Select x itself where x is +infinity, the computed value elsewhere.
	Int4 pos_inf_x = CmpEQ(As<Int4>(x), Int4(0x7F800000));
	return As<Float4>((pos_inf_x & As<Int4>(x)) | (~pos_inf_x & As<Int4>(x1)));
}
167 | |
168 | Float4 exponential(RValue<Float4> x, bool pp) |
169 | { |
170 | // FIXME: Propagate the constant |
171 | return exponential2(Float4(1.44269504f) * x, pp); // 1/ln(2) |
172 | } |
173 | |
174 | Float4 logarithm(RValue<Float4> x, bool absolute, bool pp) |
175 | { |
176 | // FIXME: Propagate the constant |
177 | return Float4(6.93147181e-1f) * logarithm2(x, absolute, pp); // ln(2) |
178 | } |
179 | |
180 | Float4 power(RValue<Float4> x, RValue<Float4> y, bool pp) |
181 | { |
182 | Float4 log = logarithm2(x, true, pp); |
183 | log *= y; |
184 | return exponential2(log, pp); |
185 | } |
186 | |
187 | Float4 reciprocal(RValue<Float4> x, bool pp, bool finite, bool exactAtPow2) |
188 | { |
189 | Float4 rcp; |
190 | |
191 | if(!pp && rcpPrecision >= WHQL) |
192 | { |
193 | rcp = Float4(1.0f) / x; |
194 | } |
195 | else |
196 | { |
197 | rcp = Rcp_pp(x, exactAtPow2); |
198 | |
199 | if(!pp) |
200 | { |
201 | rcp = (rcp + rcp) - (x * rcp * rcp); |
202 | } |
203 | } |
204 | |
205 | if(finite) |
206 | { |
207 | int big = 0x7F7FFFFF; |
208 | rcp = Min(rcp, Float4((float&)big)); |
209 | } |
210 | |
211 | return rcp; |
212 | } |
213 | |
214 | Float4 reciprocalSquareRoot(RValue<Float4> x, bool absolute, bool pp) |
215 | { |
216 | Float4 abs = x; |
217 | |
218 | if(absolute) |
219 | { |
220 | abs = Abs(abs); |
221 | } |
222 | |
223 | Float4 rsq; |
224 | |
225 | if(!pp) |
226 | { |
227 | rsq = Float4(1.0f) / Sqrt(abs); |
228 | } |
229 | else |
230 | { |
231 | rsq = RcpSqrt_pp(abs); |
232 | |
233 | if(!pp) |
234 | { |
235 | rsq = rsq * (Float4(3.0f) - rsq * rsq * abs) * Float4(0.5f); |
236 | } |
237 | |
238 | rsq = As<Float4>(CmpNEQ(As<Int4>(abs), Int4(0x7F800000)) & As<Int4>(rsq)); |
239 | } |
240 | |
241 | return rsq; |
242 | } |
243 | |
244 | Float4 modulo(RValue<Float4> x, RValue<Float4> y) |
245 | { |
246 | return x - y * Floor(x / y); |
247 | } |
248 | |
249 | Float4 sine_pi(RValue<Float4> x, bool pp) |
250 | { |
251 | const Float4 A = Float4(-4.05284734e-1f); // -4/pi^2 |
252 | const Float4 B = Float4(1.27323954e+0f); // 4/pi |
253 | const Float4 C = Float4(7.75160950e-1f); |
254 | const Float4 D = Float4(2.24839049e-1f); |
255 | |
256 | // Parabola approximating sine |
257 | Float4 sin = x * (Abs(x) * A + B); |
258 | |
259 | // Improve precision from 0.06 to 0.001 |
260 | if(true) |
261 | { |
262 | sin = sin * (Abs(sin) * D + C); |
263 | } |
264 | |
265 | return sin; |
266 | } |
267 | |
268 | Float4 cosine_pi(RValue<Float4> x, bool pp) |
269 | { |
270 | // cos(x) = sin(x + pi/2) |
271 | Float4 y = x + Float4(1.57079632e+0f); |
272 | |
273 | // Wrap around |
274 | y -= As<Float4>(CmpNLT(y, Float4(3.14159265e+0f)) & As<Int4>(Float4(6.28318530e+0f))); |
275 | |
276 | return sine_pi(y, pp); |
277 | } |
278 | |
279 | Float4 sine(RValue<Float4> x, bool pp) |
280 | { |
281 | // Reduce to [-0.5, 0.5] range |
282 | Float4 y = x * Float4(1.59154943e-1f); // 1/2pi |
283 | y = y - Round(y); |
284 | |
285 | if(!pp) |
286 | { |
287 | // From the paper: "A Fast, Vectorizable Algorithm for Producing Single-Precision Sine-Cosine Pairs" |
288 | // This implementation passes OpenGL ES 3.0 precision requirements, at the cost of more operations: |
289 | // !pp : 17 mul, 7 add, 1 sub, 1 reciprocal |
290 | // pp : 4 mul, 2 add, 2 abs |
291 | |
292 | Float4 y2 = y * y; |
293 | Float4 c1 = y2 * (y2 * (y2 * Float4(-0.0204391631f) + Float4(0.2536086171f)) + Float4(-1.2336977925f)) + Float4(1.0f); |
294 | Float4 s1 = y * (y2 * (y2 * (y2 * Float4(-0.0046075748f) + Float4(0.0796819754f)) + Float4(-0.645963615f)) + Float4(1.5707963235f)); |
295 | Float4 c2 = (c1 * c1) - (s1 * s1); |
296 | Float4 s2 = Float4(2.0f) * s1 * c1; |
297 | return Float4(2.0f) * s2 * c2 * reciprocal(s2 * s2 + c2 * c2, pp, true); |
298 | } |
299 | |
300 | const Float4 A = Float4(-16.0f); |
301 | const Float4 B = Float4(8.0f); |
302 | const Float4 C = Float4(7.75160950e-1f); |
303 | const Float4 D = Float4(2.24839049e-1f); |
304 | |
305 | // Parabola approximating sine |
306 | Float4 sin = y * (Abs(y) * A + B); |
307 | |
308 | // Improve precision from 0.06 to 0.001 |
309 | if(true) |
310 | { |
311 | sin = sin * (Abs(sin) * D + C); |
312 | } |
313 | |
314 | return sin; |
315 | } |
316 | |
317 | Float4 cosine(RValue<Float4> x, bool pp) |
318 | { |
319 | // cos(x) = sin(x + pi/2) |
320 | Float4 y = x + Float4(1.57079632e+0f); |
321 | return sine(y, pp); |
322 | } |
323 | |
324 | Float4 tangent(RValue<Float4> x, bool pp) |
325 | { |
326 | return sine(x, pp) / cosine(x, pp); |
327 | } |
328 | |
329 | Float4 arccos(RValue<Float4> x, bool pp) |
330 | { |
331 | // pi/2 - arcsin(x) |
332 | return Float4(1.57079632e+0f) - arcsin(x); |
333 | } |
334 | |
335 | Float4 arcsin(RValue<Float4> x, bool pp) |
336 | { |
337 | if(false) // Simpler implementation fails even lowp precision tests |
338 | { |
339 | // x*(pi/2-sqrt(1-x*x)*pi/5) |
340 | return x * (Float4(1.57079632e+0f) - Sqrt(Float4(1.0f) - x*x) * Float4(6.28318531e-1f)); |
341 | } |
342 | else |
343 | { |
344 | // From 4.4.45, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun |
345 | const Float4 half_pi(1.57079632f); |
346 | const Float4 a0(1.5707288f); |
347 | const Float4 a1(-0.2121144f); |
348 | const Float4 a2(0.0742610f); |
349 | const Float4 a3(-0.0187293f); |
350 | Float4 absx = Abs(x); |
351 | return As<Float4>(As<Int4>(half_pi - Sqrt(Float4(1.0f) - absx) * (a0 + absx * (a1 + absx * (a2 + absx * a3)))) ^ |
352 | (As<Int4>(x) & Int4(0x80000000))); |
353 | } |
354 | } |
355 | |
356 | // Approximation of atan in [0..1] |
357 | Float4 arctan_01(Float4 x, bool pp) |
358 | { |
359 | if(pp) |
360 | { |
361 | return x * (Float4(-0.27f) * x + Float4(1.05539816f)); |
362 | } |
363 | else |
364 | { |
365 | // From 4.4.49, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun |
366 | const Float4 a2(-0.3333314528f); |
367 | const Float4 a4(0.1999355085f); |
368 | const Float4 a6(-0.1420889944f); |
369 | const Float4 a8(0.1065626393f); |
370 | const Float4 a10(-0.0752896400f); |
371 | const Float4 a12(0.0429096138f); |
372 | const Float4 a14(-0.0161657367f); |
373 | const Float4 a16(0.0028662257f); |
374 | Float4 x2 = x * x; |
375 | return (x + x * (x2 * (a2 + x2 * (a4 + x2 * (a6 + x2 * (a8 + x2 * (a10 + x2 * (a12 + x2 * (a14 + x2 * a16))))))))); |
376 | } |
377 | } |
378 | |
379 | Float4 arctan(RValue<Float4> x, bool pp) |
380 | { |
381 | Float4 absx = Abs(x); |
382 | Int4 O = CmpNLT(absx, Float4(1.0f)); |
383 | Float4 y = As<Float4>((O & As<Int4>(Float4(1.0f) / absx)) | (~O & As<Int4>(absx))); // FIXME: Vector select |
384 | |
385 | const Float4 half_pi(1.57079632f); |
386 | Float4 theta = arctan_01(y, pp); |
387 | return As<Float4>(((O & As<Int4>(half_pi - theta)) | (~O & As<Int4>(theta))) ^ // FIXME: Vector select |
388 | (As<Int4>(x) & Int4(0x80000000))); |
389 | } |
390 | |
// Two-argument arc tangent, atan2(y, x), per lane.
// The point (x, y) is folded into the first octant in three steps (half-plane,
// quadrant, octant); arctan_01() is evaluated there and the angle offsets
// accumulated during folding are added back.
Float4 arctan(RValue<Float4> y, RValue<Float4> x, bool pp)
{
	const Float4 pi(3.14159265f);              // pi
	const Float4 minus_pi(-3.14159265f);       // -pi
	const Float4 half_pi(1.57079632f);         // pi/2
	const Float4 quarter_pi(7.85398163e-1f);   // pi/4

	// Rotate to upper semicircle when in lower semicircle
	// S marks lanes with y < 0: theta starts at -pi there (0 elsewhere),
	// x0 is x with its sign bit flipped wherever y's sign bit is set, and y0 is |y|.
	Int4 S = CmpLT(y, Float4(0.0f));
	Float4 theta = As<Float4>(S & As<Int4>(minus_pi));
	Float4 x0 = As<Float4>((As<Int4>(y) & Int4(0x80000000)) ^ As<Int4>(x));
	Float4 y0 = Abs(y);

	// Rotate to right quadrant when in left quadrant
	// Q marks lanes with x0 < 0: add pi/2 and map (x0, y0) to (y0, -x0).
	Int4 Q = CmpLT(x0, Float4(0.0f));
	theta += As<Float4>(Q & As<Int4>(half_pi));
	Float4 x1 = As<Float4>((Q & As<Int4>(y0)) | (~Q & As<Int4>(x0)));    // FIXME: Vector select
	Float4 y1 = As<Float4>((Q & As<Int4>(-x0)) | (~Q & As<Int4>(y0)));   // FIXME: Vector select

	// Mirror to first octant when in second octant
	// O marks lanes where y1 is not less than x1: swap the coordinates there.
	Int4 O = CmpNLT(y1, x1);
	Float4 x2 = As<Float4>((O & As<Int4>(y1)) | (~O & As<Int4>(x1)));   // FIXME: Vector select
	Float4 y2 = As<Float4>((O & As<Int4>(x1)) | (~O & As<Int4>(y1)));   // FIXME: Vector select

	// Approximation of atan in [0..1]
	Int4 zero_x = CmpEQ(x2, Float4(0.0f));
	Int4 inf_y = IsInf(y2);   // Since x2 >= y2, this means x2 == y2 == inf, so we use 45 degrees or pi/4
	Float4 atan2_theta = arctan_01(y2 / x2, pp);
	// Add pi/2 - atan where the octant mirror was applied, atan otherwise;
	// add nothing when x2 is zero, and pi/4 when y2 is infinite.
	theta += As<Float4>((~zero_x & ~inf_y & ((O & As<Int4>(half_pi - atan2_theta)) | (~O & (As<Int4>(atan2_theta))))) |   // FIXME: Vector select
	                    (inf_y & As<Int4>(quarter_pi)));

	// Recover loss of precision for tiny theta angles
	Int4 precision_loss = S & Q & O & ~inf_y;   // This combination results in (-pi + half_pi + half_pi - atan2_theta) which is equivalent to -atan2_theta
	return As<Float4>((precision_loss & As<Int4>(-atan2_theta)) | (~precision_loss & As<Int4>(theta)));   // FIXME: Vector select
}
426 | |
427 | Float4 sineh(RValue<Float4> x, bool pp) |
428 | { |
429 | return (exponential(x, pp) - exponential(-x, pp)) * Float4(0.5f); |
430 | } |
431 | |
432 | Float4 cosineh(RValue<Float4> x, bool pp) |
433 | { |
434 | return (exponential(x, pp) + exponential(-x, pp)) * Float4(0.5f); |
435 | } |
436 | |
437 | Float4 tangenth(RValue<Float4> x, bool pp) |
438 | { |
439 | Float4 e_x = exponential(x, pp); |
440 | Float4 e_minus_x = exponential(-x, pp); |
441 | return (e_x - e_minus_x) / (e_x + e_minus_x); |
442 | } |
443 | |
444 | Float4 arccosh(RValue<Float4> x, bool pp) |
445 | { |
446 | return logarithm(x + Sqrt(x + Float4(1.0f)) * Sqrt(x - Float4(1.0f)), pp); |
447 | } |
448 | |
449 | Float4 arcsinh(RValue<Float4> x, bool pp) |
450 | { |
451 | return logarithm(x + Sqrt(x * x + Float4(1.0f)), pp); |
452 | } |
453 | |
454 | Float4 arctanh(RValue<Float4> x, bool pp) |
455 | { |
456 | return logarithm((Float4(1.0f) + x) / (Float4(1.0f) - x), pp) * Float4(0.5f); |
457 | } |
458 | |
459 | Float4 dot2(const Vector4f &v0, const Vector4f &v1) |
460 | { |
461 | return v0.x * v1.x + v0.y * v1.y; |
462 | } |
463 | |
464 | Float4 dot3(const Vector4f &v0, const Vector4f &v1) |
465 | { |
466 | return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z; |
467 | } |
468 | |
469 | Float4 dot4(const Vector4f &v0, const Vector4f &v1) |
470 | { |
471 | return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z + v0.w * v1.w; |
472 | } |
473 | |
474 | void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3) |
475 | { |
476 | Int2 tmp0 = UnpackHigh(row0, row1); |
477 | Int2 tmp1 = UnpackHigh(row2, row3); |
478 | Int2 tmp2 = UnpackLow(row0, row1); |
479 | Int2 tmp3 = UnpackLow(row2, row3); |
480 | |
481 | row0 = UnpackLow(tmp2, tmp3); |
482 | row1 = UnpackHigh(tmp2, tmp3); |
483 | row2 = UnpackLow(tmp0, tmp1); |
484 | row3 = UnpackHigh(tmp0, tmp1); |
485 | } |
486 | |
487 | void transpose4x3(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3) |
488 | { |
489 | Int2 tmp0 = UnpackHigh(row0, row1); |
490 | Int2 tmp1 = UnpackHigh(row2, row3); |
491 | Int2 tmp2 = UnpackLow(row0, row1); |
492 | Int2 tmp3 = UnpackLow(row2, row3); |
493 | |
494 | row0 = UnpackLow(tmp2, tmp3); |
495 | row1 = UnpackHigh(tmp2, tmp3); |
496 | row2 = UnpackLow(tmp0, tmp1); |
497 | } |
498 | |
499 | void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3) |
500 | { |
501 | Float4 tmp0 = UnpackLow(row0, row1); |
502 | Float4 tmp1 = UnpackLow(row2, row3); |
503 | Float4 tmp2 = UnpackHigh(row0, row1); |
504 | Float4 tmp3 = UnpackHigh(row2, row3); |
505 | |
506 | row0 = Float4(tmp0.xy, tmp1.xy); |
507 | row1 = Float4(tmp0.zw, tmp1.zw); |
508 | row2 = Float4(tmp2.xy, tmp3.xy); |
509 | row3 = Float4(tmp2.zw, tmp3.zw); |
510 | } |
511 | |
512 | void transpose4x3(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3) |
513 | { |
514 | Float4 tmp0 = UnpackLow(row0, row1); |
515 | Float4 tmp1 = UnpackLow(row2, row3); |
516 | Float4 tmp2 = UnpackHigh(row0, row1); |
517 | Float4 tmp3 = UnpackHigh(row2, row3); |
518 | |
519 | row0 = Float4(tmp0.xy, tmp1.xy); |
520 | row1 = Float4(tmp0.zw, tmp1.zw); |
521 | row2 = Float4(tmp2.xy, tmp3.xy); |
522 | } |
523 | |
524 | void transpose4x2(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3) |
525 | { |
526 | Float4 tmp0 = UnpackLow(row0, row1); |
527 | Float4 tmp1 = UnpackLow(row2, row3); |
528 | |
529 | row0 = Float4(tmp0.xy, tmp1.xy); |
530 | row1 = Float4(tmp0.zw, tmp1.zw); |
531 | } |
532 | |
533 | void transpose4x1(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3) |
534 | { |
535 | Float4 tmp0 = UnpackLow(row0, row1); |
536 | Float4 tmp1 = UnpackLow(row2, row3); |
537 | |
538 | row0 = Float4(tmp0.xy, tmp1.xy); |
539 | } |
540 | |
541 | void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3) |
542 | { |
543 | Float4 tmp01 = UnpackLow(row0, row1); |
544 | Float4 tmp23 = UnpackHigh(row0, row1); |
545 | |
546 | row0 = tmp01; |
547 | row1 = Float4(tmp01.zw, row1.zw); |
548 | row2 = tmp23; |
549 | row3 = Float4(tmp23.zw, row3.zw); |
550 | } |
551 | |
552 | void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N) |
553 | { |
554 | switch(N) |
555 | { |
556 | case 1: transpose4x1(row0, row1, row2, row3); break; |
557 | case 2: transpose4x2(row0, row1, row2, row3); break; |
558 | case 3: transpose4x3(row0, row1, row2, row3); break; |
559 | case 4: transpose4x4(row0, row1, row2, row3); break; |
560 | } |
561 | } |
562 | |
563 | const Vector4f RegisterFile::operator[](RValue<Int4> index) |
564 | { |
565 | ASSERT(indirectAddressable); |
566 | |
567 | Int index0 = Extract(index, 0); |
568 | Int index1 = Extract(index, 1); |
569 | Int index2 = Extract(index, 2); |
570 | Int index3 = Extract(index, 3); |
571 | |
572 | Vector4f r; |
573 | |
574 | r.x.x = Extract(x[0][index0], 0); |
575 | r.x.y = Extract(x[0][index1], 1); |
576 | r.x.z = Extract(x[0][index2], 2); |
577 | r.x.w = Extract(x[0][index3], 3); |
578 | |
579 | r.y.x = Extract(y[0][index0], 0); |
580 | r.y.y = Extract(y[0][index1], 1); |
581 | r.y.z = Extract(y[0][index2], 2); |
582 | r.y.w = Extract(y[0][index3], 3); |
583 | |
584 | r.z.x = Extract(z[0][index0], 0); |
585 | r.z.y = Extract(z[0][index1], 1); |
586 | r.z.z = Extract(z[0][index2], 2); |
587 | r.z.w = Extract(z[0][index3], 3); |
588 | |
589 | r.w.x = Extract(w[0][index0], 0); |
590 | r.w.y = Extract(w[0][index1], 1); |
591 | r.w.z = Extract(w[0][index2], 2); |
592 | r.w.w = Extract(w[0][index3], 3); |
593 | |
594 | return r; |
595 | } |
596 | |
597 | void RegisterFile::scatter_x(Int4 index, RValue<Float4> r) |
598 | { |
599 | ASSERT(indirectAddressable); |
600 | |
601 | Int index0 = Extract(index, 0); |
602 | Int index1 = Extract(index, 1); |
603 | Int index2 = Extract(index, 2); |
604 | Int index3 = Extract(index, 3); |
605 | |
606 | x[0][index0] = Insert(x[0][index0], Extract(r, 0), 0); |
607 | x[0][index1] = Insert(x[0][index1], Extract(r, 1), 1); |
608 | x[0][index2] = Insert(x[0][index2], Extract(r, 2), 2); |
609 | x[0][index3] = Insert(x[0][index3], Extract(r, 3), 3); |
610 | } |
611 | |
612 | void RegisterFile::scatter_y(Int4 index, RValue<Float4> r) |
613 | { |
614 | ASSERT(indirectAddressable); |
615 | |
616 | Int index0 = Extract(index, 0); |
617 | Int index1 = Extract(index, 1); |
618 | Int index2 = Extract(index, 2); |
619 | Int index3 = Extract(index, 3); |
620 | |
621 | y[0][index0] = Insert(y[0][index0], Extract(r, 0), 0); |
622 | y[0][index1] = Insert(y[0][index1], Extract(r, 1), 1); |
623 | y[0][index2] = Insert(y[0][index2], Extract(r, 2), 2); |
624 | y[0][index3] = Insert(y[0][index3], Extract(r, 3), 3); |
625 | } |
626 | |
627 | void RegisterFile::scatter_z(Int4 index, RValue<Float4> r) |
628 | { |
629 | ASSERT(indirectAddressable); |
630 | |
631 | Int index0 = Extract(index, 0); |
632 | Int index1 = Extract(index, 1); |
633 | Int index2 = Extract(index, 2); |
634 | Int index3 = Extract(index, 3); |
635 | |
636 | z[0][index0] = Insert(z[0][index0], Extract(r, 0), 0); |
637 | z[0][index1] = Insert(z[0][index1], Extract(r, 1), 1); |
638 | z[0][index2] = Insert(z[0][index2], Extract(r, 2), 2); |
639 | z[0][index3] = Insert(z[0][index3], Extract(r, 3), 3); |
640 | } |
641 | |
642 | void RegisterFile::scatter_w(Int4 index, RValue<Float4> r) |
643 | { |
644 | ASSERT(indirectAddressable); |
645 | |
646 | Int index0 = Extract(index, 0); |
647 | Int index1 = Extract(index, 1); |
648 | Int index2 = Extract(index, 2); |
649 | Int index3 = Extract(index, 3); |
650 | |
651 | w[0][index0] = Insert(w[0][index0], Extract(r, 0), 0); |
652 | w[0][index1] = Insert(w[0][index1], Extract(r, 1), 1); |
653 | w[0][index2] = Insert(w[0][index2], Extract(r, 2), 2); |
654 | w[0][index3] = Insert(w[0][index3], Extract(r, 3), 3); |
655 | } |
656 | |
657 | void ShaderCore::mov(Vector4f &dst, const Vector4f &src, bool integerDestination) |
658 | { |
659 | if(integerDestination) |
660 | { |
661 | dst.x = As<Float4>(RoundInt(src.x)); |
662 | dst.y = As<Float4>(RoundInt(src.y)); |
663 | dst.z = As<Float4>(RoundInt(src.z)); |
664 | dst.w = As<Float4>(RoundInt(src.w)); |
665 | } |
666 | else |
667 | { |
668 | dst = src; |
669 | } |
670 | } |
671 | |
672 | void ShaderCore::neg(Vector4f &dst, const Vector4f &src) |
673 | { |
674 | dst.x = -src.x; |
675 | dst.y = -src.y; |
676 | dst.z = -src.z; |
677 | dst.w = -src.w; |
678 | } |
679 | |
680 | void ShaderCore::ineg(Vector4f &dst, const Vector4f &src) |
681 | { |
682 | dst.x = As<Float4>(-As<Int4>(src.x)); |
683 | dst.y = As<Float4>(-As<Int4>(src.y)); |
684 | dst.z = As<Float4>(-As<Int4>(src.z)); |
685 | dst.w = As<Float4>(-As<Int4>(src.w)); |
686 | } |
687 | |
688 | void ShaderCore::f2b(Vector4f &dst, const Vector4f &src) |
689 | { |
690 | dst.x = As<Float4>(CmpNEQ(src.x, Float4(0.0f))); |
691 | dst.y = As<Float4>(CmpNEQ(src.y, Float4(0.0f))); |
692 | dst.z = As<Float4>(CmpNEQ(src.z, Float4(0.0f))); |
693 | dst.w = As<Float4>(CmpNEQ(src.w, Float4(0.0f))); |
694 | } |
695 | |
696 | void ShaderCore::b2f(Vector4f &dst, const Vector4f &src) |
697 | { |
698 | dst.x = As<Float4>(As<Int4>(src.x) & As<Int4>(Float4(1.0f))); |
699 | dst.y = As<Float4>(As<Int4>(src.y) & As<Int4>(Float4(1.0f))); |
700 | dst.z = As<Float4>(As<Int4>(src.z) & As<Int4>(Float4(1.0f))); |
701 | dst.w = As<Float4>(As<Int4>(src.w) & As<Int4>(Float4(1.0f))); |
702 | } |
703 | |
704 | void ShaderCore::f2i(Vector4f &dst, const Vector4f &src) |
705 | { |
706 | dst.x = As<Float4>(Int4(src.x)); |
707 | dst.y = As<Float4>(Int4(src.y)); |
708 | dst.z = As<Float4>(Int4(src.z)); |
709 | dst.w = As<Float4>(Int4(src.w)); |
710 | } |
711 | |
712 | void ShaderCore::i2f(Vector4f &dst, const Vector4f &src) |
713 | { |
714 | dst.x = Float4(As<Int4>(src.x)); |
715 | dst.y = Float4(As<Int4>(src.y)); |
716 | dst.z = Float4(As<Int4>(src.z)); |
717 | dst.w = Float4(As<Int4>(src.w)); |
718 | } |
719 | |
720 | void ShaderCore::f2u(Vector4f &dst, const Vector4f &src) |
721 | { |
722 | dst.x = As<Float4>(UInt4(src.x)); |
723 | dst.y = As<Float4>(UInt4(src.y)); |
724 | dst.z = As<Float4>(UInt4(src.z)); |
725 | dst.w = As<Float4>(UInt4(src.w)); |
726 | } |
727 | |
728 | void ShaderCore::u2f(Vector4f &dst, const Vector4f &src) |
729 | { |
730 | dst.x = Float4(As<UInt4>(src.x)); |
731 | dst.y = Float4(As<UInt4>(src.y)); |
732 | dst.z = Float4(As<UInt4>(src.z)); |
733 | dst.w = Float4(As<UInt4>(src.w)); |
734 | } |
735 | |
736 | void ShaderCore::i2b(Vector4f &dst, const Vector4f &src) |
737 | { |
738 | dst.x = As<Float4>(CmpNEQ(As<Int4>(src.x), Int4(0))); |
739 | dst.y = As<Float4>(CmpNEQ(As<Int4>(src.y), Int4(0))); |
740 | dst.z = As<Float4>(CmpNEQ(As<Int4>(src.z), Int4(0))); |
741 | dst.w = As<Float4>(CmpNEQ(As<Int4>(src.w), Int4(0))); |
742 | } |
743 | |
744 | void ShaderCore::b2i(Vector4f &dst, const Vector4f &src) |
745 | { |
746 | dst.x = As<Float4>(As<Int4>(src.x) & Int4(1)); |
747 | dst.y = As<Float4>(As<Int4>(src.y) & Int4(1)); |
748 | dst.z = As<Float4>(As<Int4>(src.z) & Int4(1)); |
749 | dst.w = As<Float4>(As<Int4>(src.w) & Int4(1)); |
750 | } |
751 | |
752 | void ShaderCore::add(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
753 | { |
754 | dst.x = src0.x + src1.x; |
755 | dst.y = src0.y + src1.y; |
756 | dst.z = src0.z + src1.z; |
757 | dst.w = src0.w + src1.w; |
758 | } |
759 | |
760 | void ShaderCore::iadd(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
761 | { |
762 | dst.x = As<Float4>(As<Int4>(src0.x) + As<Int4>(src1.x)); |
763 | dst.y = As<Float4>(As<Int4>(src0.y) + As<Int4>(src1.y)); |
764 | dst.z = As<Float4>(As<Int4>(src0.z) + As<Int4>(src1.z)); |
765 | dst.w = As<Float4>(As<Int4>(src0.w) + As<Int4>(src1.w)); |
766 | } |
767 | |
768 | void ShaderCore::sub(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
769 | { |
770 | dst.x = src0.x - src1.x; |
771 | dst.y = src0.y - src1.y; |
772 | dst.z = src0.z - src1.z; |
773 | dst.w = src0.w - src1.w; |
774 | } |
775 | |
776 | void ShaderCore::isub(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
777 | { |
778 | dst.x = As<Float4>(As<Int4>(src0.x) - As<Int4>(src1.x)); |
779 | dst.y = As<Float4>(As<Int4>(src0.y) - As<Int4>(src1.y)); |
780 | dst.z = As<Float4>(As<Int4>(src0.z) - As<Int4>(src1.z)); |
781 | dst.w = As<Float4>(As<Int4>(src0.w) - As<Int4>(src1.w)); |
782 | } |
783 | |
784 | void ShaderCore::mad(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2) |
785 | { |
786 | dst.x = src0.x * src1.x + src2.x; |
787 | dst.y = src0.y * src1.y + src2.y; |
788 | dst.z = src0.z * src1.z + src2.z; |
789 | dst.w = src0.w * src1.w + src2.w; |
790 | } |
791 | |
792 | void ShaderCore::imad(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2) |
793 | { |
794 | dst.x = As<Float4>(As<Int4>(src0.x) * As<Int4>(src1.x) + As<Int4>(src2.x)); |
795 | dst.y = As<Float4>(As<Int4>(src0.y) * As<Int4>(src1.y) + As<Int4>(src2.y)); |
796 | dst.z = As<Float4>(As<Int4>(src0.z) * As<Int4>(src1.z) + As<Int4>(src2.z)); |
797 | dst.w = As<Float4>(As<Int4>(src0.w) * As<Int4>(src1.w) + As<Int4>(src2.w)); |
798 | } |
799 | |
800 | void ShaderCore::mul(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
801 | { |
802 | dst.x = src0.x * src1.x; |
803 | dst.y = src0.y * src1.y; |
804 | dst.z = src0.z * src1.z; |
805 | dst.w = src0.w * src1.w; |
806 | } |
807 | |
808 | void ShaderCore::imul(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
809 | { |
810 | dst.x = As<Float4>(As<Int4>(src0.x) * As<Int4>(src1.x)); |
811 | dst.y = As<Float4>(As<Int4>(src0.y) * As<Int4>(src1.y)); |
812 | dst.z = As<Float4>(As<Int4>(src0.z) * As<Int4>(src1.z)); |
813 | dst.w = As<Float4>(As<Int4>(src0.w) * As<Int4>(src1.w)); |
814 | } |
815 | |
816 | void ShaderCore::rcpx(Vector4f &dst, const Vector4f &src, bool pp) |
817 | { |
818 | Float4 rcp = reciprocal(src.x, pp, true, true); |
819 | |
820 | dst.x = rcp; |
821 | dst.y = rcp; |
822 | dst.z = rcp; |
823 | dst.w = rcp; |
824 | } |
825 | |
826 | void ShaderCore::div(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
827 | { |
828 | dst.x = src0.x / src1.x; |
829 | dst.y = src0.y / src1.y; |
830 | dst.z = src0.z / src1.z; |
831 | dst.w = src0.w / src1.w; |
832 | } |
833 | |
834 | void ShaderCore::idiv(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
835 | { |
836 | Float4 intMax(As<Float4>(Int4(INT_MAX))); |
837 | cmp0i(dst.x, src1.x, intMax, src1.x); |
838 | dst.x = As<Float4>(As<Int4>(src0.x) / As<Int4>(dst.x)); |
839 | cmp0i(dst.y, src1.y, intMax, src1.y); |
840 | dst.y = As<Float4>(As<Int4>(src0.y) / As<Int4>(dst.y)); |
841 | cmp0i(dst.z, src1.z, intMax, src1.z); |
842 | dst.z = As<Float4>(As<Int4>(src0.z) / As<Int4>(dst.z)); |
843 | cmp0i(dst.w, src1.w, intMax, src1.w); |
844 | dst.w = As<Float4>(As<Int4>(src0.w) / As<Int4>(dst.w)); |
845 | } |
846 | |
847 | void ShaderCore::udiv(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
848 | { |
849 | Float4 uintMax(As<Float4>(UInt4(UINT_MAX))); |
850 | cmp0i(dst.x, src1.x, uintMax, src1.x); |
851 | dst.x = As<Float4>(As<UInt4>(src0.x) / As<UInt4>(dst.x)); |
852 | cmp0i(dst.y, src1.y, uintMax, src1.y); |
853 | dst.y = As<Float4>(As<UInt4>(src0.y) / As<UInt4>(dst.y)); |
854 | cmp0i(dst.z, src1.z, uintMax, src1.z); |
855 | dst.z = As<Float4>(As<UInt4>(src0.z) / As<UInt4>(dst.z)); |
856 | cmp0i(dst.w, src1.w, uintMax, src1.w); |
857 | dst.w = As<Float4>(As<UInt4>(src0.w) / As<UInt4>(dst.w)); |
858 | } |
859 | |
860 | void ShaderCore::mod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
861 | { |
862 | dst.x = modulo(src0.x, src1.x); |
863 | dst.y = modulo(src0.y, src1.y); |
864 | dst.z = modulo(src0.z, src1.z); |
865 | dst.w = modulo(src0.w, src1.w); |
866 | } |
867 | |
868 | void ShaderCore::imod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
869 | { |
870 | Float4 intMax(As<Float4>(Int4(INT_MAX))); |
871 | cmp0i(dst.x, src1.x, intMax, src1.x); |
872 | dst.x = As<Float4>(As<Int4>(src0.x) % As<Int4>(dst.x)); |
873 | cmp0i(dst.y, src1.y, intMax, src1.y); |
874 | dst.y = As<Float4>(As<Int4>(src0.y) % As<Int4>(dst.y)); |
875 | cmp0i(dst.z, src1.z, intMax, src1.z); |
876 | dst.z = As<Float4>(As<Int4>(src0.z) % As<Int4>(dst.z)); |
877 | cmp0i(dst.w, src1.w, intMax, src1.w); |
878 | dst.w = As<Float4>(As<Int4>(src0.w) % As<Int4>(dst.w)); |
879 | } |
880 | |
881 | void ShaderCore::umod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
882 | { |
883 | Float4 uintMax(As<Float4>(UInt4(UINT_MAX))); |
884 | cmp0i(dst.x, src1.x, uintMax, src1.x); |
885 | dst.x = As<Float4>(As<UInt4>(src0.x) % As<UInt4>(dst.x)); |
886 | cmp0i(dst.y, src1.y, uintMax, src1.y); |
887 | dst.y = As<Float4>(As<UInt4>(src0.y) % As<UInt4>(dst.y)); |
888 | cmp0i(dst.z, src1.z, uintMax, src1.z); |
889 | dst.z = As<Float4>(As<UInt4>(src0.z) % As<UInt4>(dst.z)); |
890 | cmp0i(dst.w, src1.w, uintMax, src1.w); |
891 | dst.w = As<Float4>(As<UInt4>(src0.w) % As<UInt4>(dst.w)); |
892 | } |
893 | |
894 | void ShaderCore::shl(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
895 | { |
896 | dst.x = As<Float4>(As<Int4>(src0.x) << As<Int4>(src1.x)); |
897 | dst.y = As<Float4>(As<Int4>(src0.y) << As<Int4>(src1.y)); |
898 | dst.z = As<Float4>(As<Int4>(src0.z) << As<Int4>(src1.z)); |
899 | dst.w = As<Float4>(As<Int4>(src0.w) << As<Int4>(src1.w)); |
900 | } |
901 | |
902 | void ShaderCore::ishr(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
903 | { |
904 | dst.x = As<Float4>(As<Int4>(src0.x) >> As<Int4>(src1.x)); |
905 | dst.y = As<Float4>(As<Int4>(src0.y) >> As<Int4>(src1.y)); |
906 | dst.z = As<Float4>(As<Int4>(src0.z) >> As<Int4>(src1.z)); |
907 | dst.w = As<Float4>(As<Int4>(src0.w) >> As<Int4>(src1.w)); |
908 | } |
909 | |
910 | void ShaderCore::ushr(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
911 | { |
912 | dst.x = As<Float4>(As<UInt4>(src0.x) >> As<UInt4>(src1.x)); |
913 | dst.y = As<Float4>(As<UInt4>(src0.y) >> As<UInt4>(src1.y)); |
914 | dst.z = As<Float4>(As<UInt4>(src0.z) >> As<UInt4>(src1.z)); |
915 | dst.w = As<Float4>(As<UInt4>(src0.w) >> As<UInt4>(src1.w)); |
916 | } |
917 | |
918 | void ShaderCore::rsqx(Vector4f &dst, const Vector4f &src, bool pp) |
919 | { |
920 | Float4 rsq = reciprocalSquareRoot(src.x, true, pp); |
921 | |
922 | dst.x = rsq; |
923 | dst.y = rsq; |
924 | dst.z = rsq; |
925 | dst.w = rsq; |
926 | } |
927 | |
928 | void ShaderCore::sqrt(Vector4f &dst, const Vector4f &src, bool pp) |
929 | { |
930 | dst.x = Sqrt(src.x); |
931 | dst.y = Sqrt(src.y); |
932 | dst.z = Sqrt(src.z); |
933 | dst.w = Sqrt(src.w); |
934 | } |
935 | |
936 | void ShaderCore::rsq(Vector4f &dst, const Vector4f &src, bool pp) |
937 | { |
938 | dst.x = reciprocalSquareRoot(src.x, false, pp); |
939 | dst.y = reciprocalSquareRoot(src.y, false, pp); |
940 | dst.z = reciprocalSquareRoot(src.z, false, pp); |
941 | dst.w = reciprocalSquareRoot(src.w, false, pp); |
942 | } |
943 | |
944 | void ShaderCore::len2(Float4 &dst, const Vector4f &src, bool pp) |
945 | { |
946 | dst = Sqrt(dot2(src, src)); |
947 | } |
948 | |
949 | void ShaderCore::len3(Float4 &dst, const Vector4f &src, bool pp) |
950 | { |
951 | dst = Sqrt(dot3(src, src)); |
952 | } |
953 | |
954 | void ShaderCore::len4(Float4 &dst, const Vector4f &src, bool pp) |
955 | { |
956 | dst = Sqrt(dot4(src, src)); |
957 | } |
958 | |
959 | void ShaderCore::dist1(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp) |
960 | { |
961 | dst = Abs(src0.x - src1.x); |
962 | } |
963 | |
964 | void ShaderCore::dist2(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp) |
965 | { |
966 | Float4 dx = src0.x - src1.x; |
967 | Float4 dy = src0.y - src1.y; |
968 | Float4 dot2 = dx * dx + dy * dy; |
969 | dst = Sqrt(dot2); |
970 | } |
971 | |
972 | void ShaderCore::dist3(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp) |
973 | { |
974 | Float4 dx = src0.x - src1.x; |
975 | Float4 dy = src0.y - src1.y; |
976 | Float4 dz = src0.z - src1.z; |
977 | Float4 dot3 = dx * dx + dy * dy + dz * dz; |
978 | dst = Sqrt(dot3); |
979 | } |
980 | |
981 | void ShaderCore::dist4(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp) |
982 | { |
983 | Float4 dx = src0.x - src1.x; |
984 | Float4 dy = src0.y - src1.y; |
985 | Float4 dz = src0.z - src1.z; |
986 | Float4 dw = src0.w - src1.w; |
987 | Float4 dot4 = dx * dx + dy * dy + dz * dz + dw * dw; |
988 | dst = Sqrt(dot4); |
989 | } |
990 | |
991 | void ShaderCore::dp1(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
992 | { |
993 | Float4 t = src0.x * src1.x; |
994 | |
995 | dst.x = t; |
996 | dst.y = t; |
997 | dst.z = t; |
998 | dst.w = t; |
999 | } |
1000 | |
1001 | void ShaderCore::dp2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
1002 | { |
1003 | Float4 t = dot2(src0, src1); |
1004 | |
1005 | dst.x = t; |
1006 | dst.y = t; |
1007 | dst.z = t; |
1008 | dst.w = t; |
1009 | } |
1010 | |
1011 | void ShaderCore::dp2add(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2) |
1012 | { |
1013 | Float4 t = dot2(src0, src1) + src2.x; |
1014 | |
1015 | dst.x = t; |
1016 | dst.y = t; |
1017 | dst.z = t; |
1018 | dst.w = t; |
1019 | } |
1020 | |
1021 | void ShaderCore::dp3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
1022 | { |
1023 | Float4 dot = dot3(src0, src1); |
1024 | |
1025 | dst.x = dot; |
1026 | dst.y = dot; |
1027 | dst.z = dot; |
1028 | dst.w = dot; |
1029 | } |
1030 | |
1031 | void ShaderCore::dp4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
1032 | { |
1033 | Float4 dot = dot4(src0, src1); |
1034 | |
1035 | dst.x = dot; |
1036 | dst.y = dot; |
1037 | dst.z = dot; |
1038 | dst.w = dot; |
1039 | } |
1040 | |
1041 | void ShaderCore::min(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
1042 | { |
1043 | dst.x = Min(src0.x, src1.x); |
1044 | dst.y = Min(src0.y, src1.y); |
1045 | dst.z = Min(src0.z, src1.z); |
1046 | dst.w = Min(src0.w, src1.w); |
1047 | } |
1048 | |
1049 | void ShaderCore::imin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
1050 | { |
1051 | dst.x = As<Float4>(Min(As<Int4>(src0.x), As<Int4>(src1.x))); |
1052 | dst.y = As<Float4>(Min(As<Int4>(src0.y), As<Int4>(src1.y))); |
1053 | dst.z = As<Float4>(Min(As<Int4>(src0.z), As<Int4>(src1.z))); |
1054 | dst.w = As<Float4>(Min(As<Int4>(src0.w), As<Int4>(src1.w))); |
1055 | } |
1056 | |
1057 | void ShaderCore::umin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
1058 | { |
1059 | dst.x = As<Float4>(Min(As<UInt4>(src0.x), As<UInt4>(src1.x))); |
1060 | dst.y = As<Float4>(Min(As<UInt4>(src0.y), As<UInt4>(src1.y))); |
1061 | dst.z = As<Float4>(Min(As<UInt4>(src0.z), As<UInt4>(src1.z))); |
1062 | dst.w = As<Float4>(Min(As<UInt4>(src0.w), As<UInt4>(src1.w))); |
1063 | } |
1064 | |
1065 | void ShaderCore::max(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
1066 | { |
1067 | dst.x = Max(src0.x, src1.x); |
1068 | dst.y = Max(src0.y, src1.y); |
1069 | dst.z = Max(src0.z, src1.z); |
1070 | dst.w = Max(src0.w, src1.w); |
1071 | } |
1072 | |
1073 | void ShaderCore::imax(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
1074 | { |
1075 | dst.x = As<Float4>(Max(As<Int4>(src0.x), As<Int4>(src1.x))); |
1076 | dst.y = As<Float4>(Max(As<Int4>(src0.y), As<Int4>(src1.y))); |
1077 | dst.z = As<Float4>(Max(As<Int4>(src0.z), As<Int4>(src1.z))); |
1078 | dst.w = As<Float4>(Max(As<Int4>(src0.w), As<Int4>(src1.w))); |
1079 | } |
1080 | |
1081 | void ShaderCore::umax(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
1082 | { |
1083 | dst.x = As<Float4>(Max(As<Int4>(src0.x), As<Int4>(src1.x))); |
1084 | dst.y = As<Float4>(Max(As<Int4>(src0.y), As<Int4>(src1.y))); |
1085 | dst.z = As<Float4>(Max(As<Int4>(src0.z), As<Int4>(src1.z))); |
1086 | dst.w = As<Float4>(Max(As<Int4>(src0.w), As<Int4>(src1.w))); |
1087 | } |
1088 | |
1089 | void ShaderCore::slt(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
1090 | { |
1091 | dst.x = As<Float4>(As<Int4>(CmpLT(src0.x, src1.x)) & As<Int4>(Float4(1.0f))); |
1092 | dst.y = As<Float4>(As<Int4>(CmpLT(src0.y, src1.y)) & As<Int4>(Float4(1.0f))); |
1093 | dst.z = As<Float4>(As<Int4>(CmpLT(src0.z, src1.z)) & As<Int4>(Float4(1.0f))); |
1094 | dst.w = As<Float4>(As<Int4>(CmpLT(src0.w, src1.w)) & As<Int4>(Float4(1.0f))); |
1095 | } |
1096 | |
1097 | void ShaderCore::step(Vector4f &dst, const Vector4f &edge, const Vector4f &x) |
1098 | { |
1099 | dst.x = As<Float4>(CmpNLT(x.x, edge.x) & As<Int4>(Float4(1.0f))); |
1100 | dst.y = As<Float4>(CmpNLT(x.y, edge.y) & As<Int4>(Float4(1.0f))); |
1101 | dst.z = As<Float4>(CmpNLT(x.z, edge.z) & As<Int4>(Float4(1.0f))); |
1102 | dst.w = As<Float4>(CmpNLT(x.w, edge.w) & As<Int4>(Float4(1.0f))); |
1103 | } |
1104 | |
1105 | void ShaderCore::exp2x(Vector4f &dst, const Vector4f &src, bool pp) |
1106 | { |
1107 | Float4 exp = exponential2(src.x, pp); |
1108 | |
1109 | dst.x = exp; |
1110 | dst.y = exp; |
1111 | dst.z = exp; |
1112 | dst.w = exp; |
1113 | } |
1114 | |
1115 | void ShaderCore::exp2(Vector4f &dst, const Vector4f &src, bool pp) |
1116 | { |
1117 | dst.x = exponential2(src.x, pp); |
1118 | dst.y = exponential2(src.y, pp); |
1119 | dst.z = exponential2(src.z, pp); |
1120 | dst.w = exponential2(src.w, pp); |
1121 | } |
1122 | |
1123 | void ShaderCore::exp(Vector4f &dst, const Vector4f &src, bool pp) |
1124 | { |
1125 | dst.x = exponential(src.x, pp); |
1126 | dst.y = exponential(src.y, pp); |
1127 | dst.z = exponential(src.z, pp); |
1128 | dst.w = exponential(src.w, pp); |
1129 | } |
1130 | |
1131 | void ShaderCore::log2x(Vector4f &dst, const Vector4f &src, bool pp) |
1132 | { |
1133 | Float4 log = logarithm2(src.x, true, pp); |
1134 | |
1135 | dst.x = log; |
1136 | dst.y = log; |
1137 | dst.z = log; |
1138 | dst.w = log; |
1139 | } |
1140 | |
1141 | void ShaderCore::log2(Vector4f &dst, const Vector4f &src, bool pp) |
1142 | { |
1143 | dst.x = logarithm2(src.x, false, pp); |
1144 | dst.y = logarithm2(src.y, false, pp); |
1145 | dst.z = logarithm2(src.z, false, pp); |
1146 | dst.w = logarithm2(src.w, false, pp); |
1147 | } |
1148 | |
1149 | void ShaderCore::log(Vector4f &dst, const Vector4f &src, bool pp) |
1150 | { |
1151 | dst.x = logarithm(src.x, false, pp); |
1152 | dst.y = logarithm(src.y, false, pp); |
1153 | dst.z = logarithm(src.z, false, pp); |
1154 | dst.w = logarithm(src.w, false, pp); |
1155 | } |
1156 | |
1157 | void ShaderCore::lit(Vector4f &dst, const Vector4f &src) |
1158 | { |
1159 | dst.x = Float4(1.0f); |
1160 | dst.y = Max(src.x, Float4(0.0f)); |
1161 | |
1162 | Float4 pow; |
1163 | |
1164 | pow = src.w; |
1165 | pow = Min(pow, Float4(127.9961f)); |
1166 | pow = Max(pow, Float4(-127.9961f)); |
1167 | |
1168 | dst.z = power(src.y, pow); |
1169 | dst.z = As<Float4>(As<Int4>(dst.z) & CmpNLT(src.x, Float4(0.0f))); |
1170 | dst.z = As<Float4>(As<Int4>(dst.z) & CmpNLT(src.y, Float4(0.0f))); |
1171 | |
1172 | dst.w = Float4(1.0f); |
1173 | } |
1174 | |
1175 | void ShaderCore::att(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
1176 | { |
1177 | // Computes attenuation factors (1, d, d^2, 1/d) assuming src0 = d^2, src1 = 1/d |
1178 | dst.x = 1; |
1179 | dst.y = src0.y * src1.y; |
1180 | dst.z = src0.z; |
1181 | dst.w = src1.w; |
1182 | } |
1183 | |
1184 | void ShaderCore::lrp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2) |
1185 | { |
1186 | dst.x = src0.x * (src1.x - src2.x) + src2.x; |
1187 | dst.y = src0.y * (src1.y - src2.y) + src2.y; |
1188 | dst.z = src0.z * (src1.z - src2.z) + src2.z; |
1189 | dst.w = src0.w * (src1.w - src2.w) + src2.w; |
1190 | } |
1191 | |
1192 | void ShaderCore::isinf(Vector4f &dst, const Vector4f &src) |
1193 | { |
1194 | dst.x = As<Float4>(IsInf(src.x)); |
1195 | dst.y = As<Float4>(IsInf(src.y)); |
1196 | dst.z = As<Float4>(IsInf(src.z)); |
1197 | dst.w = As<Float4>(IsInf(src.w)); |
1198 | } |
1199 | |
1200 | void ShaderCore::isnan(Vector4f &dst, const Vector4f &src) |
1201 | { |
1202 | dst.x = As<Float4>(IsNan(src.x)); |
1203 | dst.y = As<Float4>(IsNan(src.y)); |
1204 | dst.z = As<Float4>(IsNan(src.z)); |
1205 | dst.w = As<Float4>(IsNan(src.w)); |
1206 | } |
1207 | |
1208 | void ShaderCore::smooth(Vector4f &dst, const Vector4f &edge0, const Vector4f &edge1, const Vector4f &x) |
1209 | { |
1210 | Float4 tx = Min(Max((x.x - edge0.x) / (edge1.x - edge0.x), Float4(0.0f)), Float4(1.0f)); dst.x = tx * tx * (Float4(3.0f) - Float4(2.0f) * tx); |
1211 | Float4 ty = Min(Max((x.y - edge0.y) / (edge1.y - edge0.y), Float4(0.0f)), Float4(1.0f)); dst.y = ty * ty * (Float4(3.0f) - Float4(2.0f) * ty); |
1212 | Float4 tz = Min(Max((x.z - edge0.z) / (edge1.z - edge0.z), Float4(0.0f)), Float4(1.0f)); dst.z = tz * tz * (Float4(3.0f) - Float4(2.0f) * tz); |
1213 | Float4 tw = Min(Max((x.w - edge0.w) / (edge1.w - edge0.w), Float4(0.0f)), Float4(1.0f)); dst.w = tw * tw * (Float4(3.0f) - Float4(2.0f) * tw); |
1214 | } |
1215 | |
1216 | void ShaderCore::floatToHalfBits(Float4& dst, const Float4& floatBits, bool storeInUpperBits) |
1217 | { |
1218 | static const uint32_t mask_sign = 0x80000000u; |
1219 | static const uint32_t mask_round = ~0xfffu; |
1220 | static const uint32_t c_f32infty = 255 << 23; |
1221 | static const uint32_t c_magic = 15 << 23; |
1222 | static const uint32_t c_nanbit = 0x200; |
1223 | static const uint32_t c_infty_as_fp16 = 0x7c00; |
1224 | static const uint32_t c_clamp = (31 << 23) - 0x1000; |
1225 | |
1226 | UInt4 justsign = UInt4(mask_sign) & As<UInt4>(floatBits); |
1227 | UInt4 absf = As<UInt4>(floatBits) ^ justsign; |
1228 | UInt4 b_isnormal = CmpNLE(UInt4(c_f32infty), absf); |
1229 | |
1230 | // Note: this version doesn't round to the nearest even in case of a tie as defined by IEEE 754-2008, it rounds to +inf |
1231 | // instead of nearest even, since that's fine for GLSL ES 3.0's needs (see section 2.1.1 Floating-Point Computation) |
1232 | UInt4 joined = ((((As<UInt4>(Min(As<Float4>(absf & UInt4(mask_round)) * As<Float4>(UInt4(c_magic)), |
1233 | As<Float4>(UInt4(c_clamp))))) - UInt4(mask_round)) >> 13) & b_isnormal) | |
1234 | ((b_isnormal ^ UInt4(0xFFFFFFFF)) & ((CmpNLE(absf, UInt4(c_f32infty)) & UInt4(c_nanbit)) | |
1235 | UInt4(c_infty_as_fp16))); |
1236 | |
1237 | dst = As<Float4>(storeInUpperBits ? As<UInt4>(dst) | ((joined << 16) | justsign) : joined | (justsign >> 16)); |
1238 | } |
1239 | |
1240 | void ShaderCore::halfToFloatBits(Float4& dst, const Float4& halfBits) |
1241 | { |
1242 | static const uint32_t mask_nosign = 0x7FFF; |
1243 | static const uint32_t magic = (254 - 15) << 23; |
1244 | static const uint32_t was_infnan = 0x7BFF; |
1245 | static const uint32_t exp_infnan = 255 << 23; |
1246 | |
1247 | UInt4 expmant = As<UInt4>(halfBits) & UInt4(mask_nosign); |
1248 | dst = As<Float4>(As<UInt4>(As<Float4>(expmant << 13) * As<Float4>(UInt4(magic))) | |
1249 | ((As<UInt4>(halfBits) ^ UInt4(expmant)) << 16) | |
1250 | (CmpNLE(As<UInt4>(expmant), UInt4(was_infnan)) & UInt4(exp_infnan))); |
1251 | } |
1252 | |
1253 | void ShaderCore::packHalf2x16(Vector4f &d, const Vector4f &s0) |
1254 | { |
1255 | // half2 | half1 |
1256 | floatToHalfBits(d.x, s0.x, false); |
1257 | floatToHalfBits(d.x, s0.y, true); |
1258 | } |
1259 | |
1260 | void ShaderCore::unpackHalf2x16(Vector4f &dst, const Vector4f &s0) |
1261 | { |
1262 | // half2 | half1 |
1263 | halfToFloatBits(dst.x, As<Float4>(As<UInt4>(s0.x) & UInt4(0x0000FFFF))); |
1264 | halfToFloatBits(dst.y, As<Float4>((As<UInt4>(s0.x) & UInt4(0xFFFF0000)) >> 16)); |
1265 | } |
1266 | |
1267 | void ShaderCore::packSnorm2x16(Vector4f &d, const Vector4f &s0) |
1268 | { |
1269 | // round(clamp(c, -1.0, 1.0) * 32767.0) |
1270 | d.x = As<Float4>((Int4(Round(Min(Max(s0.x, Float4(-1.0f)), Float4(1.0f)) * Float4(32767.0f))) & Int4(0xFFFF)) | |
1271 | ((Int4(Round(Min(Max(s0.y, Float4(-1.0f)), Float4(1.0f)) * Float4(32767.0f))) & Int4(0xFFFF)) << 16)); |
1272 | } |
1273 | |
1274 | void ShaderCore::packUnorm2x16(Vector4f &d, const Vector4f &s0) |
1275 | { |
1276 | // round(clamp(c, 0.0, 1.0) * 65535.0) |
1277 | d.x = As<Float4>((Int4(Round(Min(Max(s0.x, Float4(0.0f)), Float4(1.0f)) * Float4(65535.0f))) & Int4(0xFFFF)) | |
1278 | ((Int4(Round(Min(Max(s0.y, Float4(0.0f)), Float4(1.0f)) * Float4(65535.0f))) & Int4(0xFFFF)) << 16)); |
1279 | } |
1280 | |
1281 | void ShaderCore::unpackSnorm2x16(Vector4f &dst, const Vector4f &s0) |
1282 | { |
1283 | // clamp(f / 32727.0, -1.0, 1.0) |
1284 | dst.x = Min(Max(Float4(As<Int4>((As<UInt4>(s0.x) & UInt4(0x0000FFFF)) << 16)) * Float4(1.0f / float(0x7FFF0000)), Float4(-1.0f)), Float4(1.0f)); |
1285 | dst.y = Min(Max(Float4(As<Int4>(As<UInt4>(s0.x) & UInt4(0xFFFF0000))) * Float4(1.0f / float(0x7FFF0000)), Float4(-1.0f)), Float4(1.0f)); |
1286 | } |
1287 | |
1288 | void ShaderCore::unpackUnorm2x16(Vector4f &dst, const Vector4f &s0) |
1289 | { |
1290 | // f / 65535.0 |
1291 | dst.x = Float4((As<UInt4>(s0.x) & UInt4(0x0000FFFF)) << 16) * Float4(1.0f / float(0xFFFF0000)); |
1292 | dst.y = Float4(As<UInt4>(s0.x) & UInt4(0xFFFF0000)) * Float4(1.0f / float(0xFFFF0000)); |
1293 | } |
1294 | |
1295 | void ShaderCore::det2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
1296 | { |
1297 | dst.x = src0.x * src1.y - src0.y * src1.x; |
1298 | dst.y = dst.z = dst.w = dst.x; |
1299 | } |
1300 | |
1301 | void ShaderCore::det3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2) |
1302 | { |
1303 | crs(dst, src1, src2); |
1304 | dp3(dst, dst, src0); |
1305 | } |
1306 | |
1307 | void ShaderCore::det4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2, const Vector4f &src3) |
1308 | { |
1309 | dst.x = src2.z * src3.w - src2.w * src3.z; |
1310 | dst.y = src1.w * src3.z - src1.z * src3.w; |
1311 | dst.z = src1.z * src2.w - src1.w * src2.z; |
1312 | dst.x = src0.x * (src1.y * dst.x + src2.y * dst.y + src3.y * dst.z) - |
1313 | src0.y * (src1.x * dst.x + src2.x * dst.y + src3.x * dst.z) + |
1314 | src0.z * (src1.x * (src2.y * src3.w - src2.w * src3.y) + |
1315 | src2.x * (src1.w * src3.y - src1.y * src3.w) + |
1316 | src3.x * (src1.y * src2.w - src1.w * src2.y)) + |
1317 | src0.w * (src1.x * (src2.z * src3.y - src2.y * src3.z) + |
1318 | src2.x * (src1.y * src3.z - src1.z * src3.y) + |
1319 | src3.x * (src1.z * src2.y - src1.y * src2.z)); |
1320 | dst.y = dst.z = dst.w = dst.x; |
1321 | } |
1322 | |
1323 | void ShaderCore::frc(Vector4f &dst, const Vector4f &src) |
1324 | { |
1325 | dst.x = Frac(src.x); |
1326 | dst.y = Frac(src.y); |
1327 | dst.z = Frac(src.z); |
1328 | dst.w = Frac(src.w); |
1329 | } |
1330 | |
1331 | void ShaderCore::trunc(Vector4f &dst, const Vector4f &src) |
1332 | { |
1333 | dst.x = Trunc(src.x); |
1334 | dst.y = Trunc(src.y); |
1335 | dst.z = Trunc(src.z); |
1336 | dst.w = Trunc(src.w); |
1337 | } |
1338 | |
1339 | void ShaderCore::floor(Vector4f &dst, const Vector4f &src) |
1340 | { |
1341 | dst.x = Floor(src.x); |
1342 | dst.y = Floor(src.y); |
1343 | dst.z = Floor(src.z); |
1344 | dst.w = Floor(src.w); |
1345 | } |
1346 | |
1347 | void ShaderCore::round(Vector4f &dst, const Vector4f &src) |
1348 | { |
1349 | dst.x = Round(src.x); |
1350 | dst.y = Round(src.y); |
1351 | dst.z = Round(src.z); |
1352 | dst.w = Round(src.w); |
1353 | } |
1354 | |
1355 | void ShaderCore::roundEven(Vector4f &dst, const Vector4f &src) |
1356 | { |
1357 | // dst = round(src) + ((round(src) < src) * 2 - 1) * (fract(src) == 0.5) * isOdd(round(src)); |
1358 | // ex.: 1.5: 2 + (0 * 2 - 1) * 1 * 0 = 2 |
1359 | // 2.5: 3 + (0 * 2 - 1) * 1 * 1 = 2 |
1360 | // -1.5: -2 + (1 * 2 - 1) * 1 * 0 = -2 |
1361 | // -2.5: -3 + (1 * 2 - 1) * 1 * 1 = -2 |
1362 | // Even if the round implementation rounds the other way: |
1363 | // 1.5: 1 + (1 * 2 - 1) * 1 * 1 = 2 |
1364 | // 2.5: 2 + (1 * 2 - 1) * 1 * 0 = 2 |
1365 | // -1.5: -1 + (0 * 2 - 1) * 1 * 1 = -2 |
1366 | // -2.5: -2 + (0 * 2 - 1) * 1 * 0 = -2 |
1367 | round(dst, src); |
1368 | dst.x += ((Float4(CmpLT(dst.x, src.x) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.x), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.x) & Int4(1)); |
1369 | dst.y += ((Float4(CmpLT(dst.y, src.y) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.y), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.y) & Int4(1)); |
1370 | dst.z += ((Float4(CmpLT(dst.z, src.z) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.z), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.z) & Int4(1)); |
1371 | dst.w += ((Float4(CmpLT(dst.w, src.w) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.w), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.w) & Int4(1)); |
1372 | } |
1373 | |
1374 | void ShaderCore::ceil(Vector4f &dst, const Vector4f &src) |
1375 | { |
1376 | dst.x = Ceil(src.x); |
1377 | dst.y = Ceil(src.y); |
1378 | dst.z = Ceil(src.z); |
1379 | dst.w = Ceil(src.w); |
1380 | } |
1381 | |
1382 | void ShaderCore::powx(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp) |
1383 | { |
1384 | Float4 pow = power(src0.x, src1.x, pp); |
1385 | |
1386 | dst.x = pow; |
1387 | dst.y = pow; |
1388 | dst.z = pow; |
1389 | dst.w = pow; |
1390 | } |
1391 | |
1392 | void ShaderCore::pow(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp) |
1393 | { |
1394 | dst.x = power(src0.x, src1.x, pp); |
1395 | dst.y = power(src0.y, src1.y, pp); |
1396 | dst.z = power(src0.z, src1.z, pp); |
1397 | dst.w = power(src0.w, src1.w, pp); |
1398 | } |
1399 | |
1400 | void ShaderCore::crs(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
1401 | { |
1402 | dst.x = src0.y * src1.z - src0.z * src1.y; |
1403 | dst.y = src0.z * src1.x - src0.x * src1.z; |
1404 | dst.z = src0.x * src1.y - src0.y * src1.x; |
1405 | } |
1406 | |
1407 | void ShaderCore::forward1(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref) |
1408 | { |
1409 | Int4 flip = CmpNLT(Nref.x * I.x, Float4(0.0f)) & Int4(0x80000000); |
1410 | |
1411 | dst.x = As<Float4>(flip ^ As<Int4>(N.x)); |
1412 | } |
1413 | |
1414 | void ShaderCore::forward2(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref) |
1415 | { |
1416 | Int4 flip = CmpNLT(dot2(Nref, I), Float4(0.0f)) & Int4(0x80000000); |
1417 | |
1418 | dst.x = As<Float4>(flip ^ As<Int4>(N.x)); |
1419 | dst.y = As<Float4>(flip ^ As<Int4>(N.y)); |
1420 | } |
1421 | |
1422 | void ShaderCore::forward3(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref) |
1423 | { |
1424 | Int4 flip = CmpNLT(dot3(Nref, I), Float4(0.0f)) & Int4(0x80000000); |
1425 | |
1426 | dst.x = As<Float4>(flip ^ As<Int4>(N.x)); |
1427 | dst.y = As<Float4>(flip ^ As<Int4>(N.y)); |
1428 | dst.z = As<Float4>(flip ^ As<Int4>(N.z)); |
1429 | } |
1430 | |
1431 | void ShaderCore::forward4(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref) |
1432 | { |
1433 | Int4 flip = CmpNLT(dot4(Nref, I), Float4(0.0f)) & Int4(0x80000000); |
1434 | |
1435 | dst.x = As<Float4>(flip ^ As<Int4>(N.x)); |
1436 | dst.y = As<Float4>(flip ^ As<Int4>(N.y)); |
1437 | dst.z = As<Float4>(flip ^ As<Int4>(N.z)); |
1438 | dst.w = As<Float4>(flip ^ As<Int4>(N.w)); |
1439 | } |
1440 | |
1441 | void ShaderCore::reflect1(Vector4f &dst, const Vector4f &I, const Vector4f &N) |
1442 | { |
1443 | Float4 d = N.x * I.x; |
1444 | |
1445 | dst.x = I.x - Float4(2.0f) * d * N.x; |
1446 | } |
1447 | |
1448 | void ShaderCore::reflect2(Vector4f &dst, const Vector4f &I, const Vector4f &N) |
1449 | { |
1450 | Float4 d = dot2(N, I); |
1451 | |
1452 | dst.x = I.x - Float4(2.0f) * d * N.x; |
1453 | dst.y = I.y - Float4(2.0f) * d * N.y; |
1454 | } |
1455 | |
1456 | void ShaderCore::reflect3(Vector4f &dst, const Vector4f &I, const Vector4f &N) |
1457 | { |
1458 | Float4 d = dot3(N, I); |
1459 | |
1460 | dst.x = I.x - Float4(2.0f) * d * N.x; |
1461 | dst.y = I.y - Float4(2.0f) * d * N.y; |
1462 | dst.z = I.z - Float4(2.0f) * d * N.z; |
1463 | } |
1464 | |
1465 | void ShaderCore::reflect4(Vector4f &dst, const Vector4f &I, const Vector4f &N) |
1466 | { |
1467 | Float4 d = dot4(N, I); |
1468 | |
1469 | dst.x = I.x - Float4(2.0f) * d * N.x; |
1470 | dst.y = I.y - Float4(2.0f) * d * N.y; |
1471 | dst.z = I.z - Float4(2.0f) * d * N.z; |
1472 | dst.w = I.w - Float4(2.0f) * d * N.w; |
1473 | } |
1474 | |
1475 | void ShaderCore::refract1(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta) |
1476 | { |
1477 | Float4 d = N.x * I.x; |
1478 | Float4 k = Float4(1.0f) - eta * eta * (Float4(1.0f) - d * d); |
1479 | Int4 pos = CmpNLT(k, Float4(0.0f)); |
1480 | Float4 t = (eta * d + Sqrt(k)); |
1481 | |
1482 | dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x)); |
1483 | } |
1484 | |
1485 | void ShaderCore::refract2(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta) |
1486 | { |
1487 | Float4 d = dot2(N, I); |
1488 | Float4 k = Float4(1.0f) - eta * eta * (Float4(1.0f) - d * d); |
1489 | Int4 pos = CmpNLT(k, Float4(0.0f)); |
1490 | Float4 t = (eta * d + Sqrt(k)); |
1491 | |
1492 | dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x)); |
1493 | dst.y = As<Float4>(pos & As<Int4>(eta * I.y - t * N.y)); |
1494 | } |
1495 | |
1496 | void ShaderCore::refract3(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta) |
1497 | { |
1498 | Float4 d = dot3(N, I); |
1499 | Float4 k = Float4(1.0f) - eta * eta * (Float4(1.0f) - d * d); |
1500 | Int4 pos = CmpNLT(k, Float4(0.0f)); |
1501 | Float4 t = (eta * d + Sqrt(k)); |
1502 | |
1503 | dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x)); |
1504 | dst.y = As<Float4>(pos & As<Int4>(eta * I.y - t * N.y)); |
1505 | dst.z = As<Float4>(pos & As<Int4>(eta * I.z - t * N.z)); |
1506 | } |
1507 | |
1508 | void ShaderCore::refract4(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta) |
1509 | { |
1510 | Float4 d = dot4(N, I); |
1511 | Float4 k = Float4(1.0f) - eta * eta * (Float4(1.0f) - d * d); |
1512 | Int4 pos = CmpNLT(k, Float4(0.0f)); |
1513 | Float4 t = (eta * d + Sqrt(k)); |
1514 | |
1515 | dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x)); |
1516 | dst.y = As<Float4>(pos & As<Int4>(eta * I.y - t * N.y)); |
1517 | dst.z = As<Float4>(pos & As<Int4>(eta * I.z - t * N.z)); |
1518 | dst.w = As<Float4>(pos & As<Int4>(eta * I.w - t * N.w)); |
1519 | } |
1520 | |
1521 | void ShaderCore::sgn(Vector4f &dst, const Vector4f &src) |
1522 | { |
1523 | sgn(dst.x, src.x); |
1524 | sgn(dst.y, src.y); |
1525 | sgn(dst.z, src.z); |
1526 | sgn(dst.w, src.w); |
1527 | } |
1528 | |
1529 | void ShaderCore::isgn(Vector4f &dst, const Vector4f &src) |
1530 | { |
1531 | isgn(dst.x, src.x); |
1532 | isgn(dst.y, src.y); |
1533 | isgn(dst.z, src.z); |
1534 | isgn(dst.w, src.w); |
1535 | } |
1536 | |
1537 | void ShaderCore::abs(Vector4f &dst, const Vector4f &src) |
1538 | { |
1539 | dst.x = Abs(src.x); |
1540 | dst.y = Abs(src.y); |
1541 | dst.z = Abs(src.z); |
1542 | dst.w = Abs(src.w); |
1543 | } |
1544 | |
1545 | void ShaderCore::iabs(Vector4f &dst, const Vector4f &src) |
1546 | { |
1547 | dst.x = As<Float4>(Abs(As<Int4>(src.x))); |
1548 | dst.y = As<Float4>(Abs(As<Int4>(src.y))); |
1549 | dst.z = As<Float4>(Abs(As<Int4>(src.z))); |
1550 | dst.w = As<Float4>(Abs(As<Int4>(src.w))); |
1551 | } |
1552 | |
1553 | void ShaderCore::nrm2(Vector4f &dst, const Vector4f &src, bool pp) |
1554 | { |
1555 | Float4 dot = dot2(src, src); |
1556 | Float4 rsq = reciprocalSquareRoot(dot, false, pp); |
1557 | |
1558 | dst.x = src.x * rsq; |
1559 | dst.y = src.y * rsq; |
1560 | dst.z = src.z * rsq; |
1561 | dst.w = src.w * rsq; |
1562 | } |
1563 | |
1564 | void ShaderCore::nrm3(Vector4f &dst, const Vector4f &src, bool pp) |
1565 | { |
1566 | Float4 dot = dot3(src, src); |
1567 | Float4 rsq = reciprocalSquareRoot(dot, false, pp); |
1568 | |
1569 | dst.x = src.x * rsq; |
1570 | dst.y = src.y * rsq; |
1571 | dst.z = src.z * rsq; |
1572 | dst.w = src.w * rsq; |
1573 | } |
1574 | |
1575 | void ShaderCore::nrm4(Vector4f &dst, const Vector4f &src, bool pp) |
1576 | { |
1577 | Float4 dot = dot4(src, src); |
1578 | Float4 rsq = reciprocalSquareRoot(dot, false, pp); |
1579 | |
1580 | dst.x = src.x * rsq; |
1581 | dst.y = src.y * rsq; |
1582 | dst.z = src.z * rsq; |
1583 | dst.w = src.w * rsq; |
1584 | } |
1585 | |
1586 | void ShaderCore::sincos(Vector4f &dst, const Vector4f &src, bool pp) |
1587 | { |
1588 | dst.x = cosine_pi(src.x, pp); |
1589 | dst.y = sine_pi(src.x, pp); |
1590 | } |
1591 | |
1592 | void ShaderCore::cos(Vector4f &dst, const Vector4f &src, bool pp) |
1593 | { |
1594 | dst.x = cosine(src.x, pp); |
1595 | dst.y = cosine(src.y, pp); |
1596 | dst.z = cosine(src.z, pp); |
1597 | dst.w = cosine(src.w, pp); |
1598 | } |
1599 | |
1600 | void ShaderCore::sin(Vector4f &dst, const Vector4f &src, bool pp) |
1601 | { |
1602 | dst.x = sine(src.x, pp); |
1603 | dst.y = sine(src.y, pp); |
1604 | dst.z = sine(src.z, pp); |
1605 | dst.w = sine(src.w, pp); |
1606 | } |
1607 | |
1608 | void ShaderCore::tan(Vector4f &dst, const Vector4f &src, bool pp) |
1609 | { |
1610 | dst.x = tangent(src.x, pp); |
1611 | dst.y = tangent(src.y, pp); |
1612 | dst.z = tangent(src.z, pp); |
1613 | dst.w = tangent(src.w, pp); |
1614 | } |
1615 | |
1616 | void ShaderCore::acos(Vector4f &dst, const Vector4f &src, bool pp) |
1617 | { |
1618 | dst.x = arccos(src.x, pp); |
1619 | dst.y = arccos(src.y, pp); |
1620 | dst.z = arccos(src.z, pp); |
1621 | dst.w = arccos(src.w, pp); |
1622 | } |
1623 | |
1624 | void ShaderCore::asin(Vector4f &dst, const Vector4f &src, bool pp) |
1625 | { |
1626 | dst.x = arcsin(src.x, pp); |
1627 | dst.y = arcsin(src.y, pp); |
1628 | dst.z = arcsin(src.z, pp); |
1629 | dst.w = arcsin(src.w, pp); |
1630 | } |
1631 | |
1632 | void ShaderCore::atan(Vector4f &dst, const Vector4f &src, bool pp) |
1633 | { |
1634 | dst.x = arctan(src.x, pp); |
1635 | dst.y = arctan(src.y, pp); |
1636 | dst.z = arctan(src.z, pp); |
1637 | dst.w = arctan(src.w, pp); |
1638 | } |
1639 | |
1640 | void ShaderCore::atan2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp) |
1641 | { |
1642 | dst.x = arctan(src0.x, src1.x, pp); |
1643 | dst.y = arctan(src0.y, src1.y, pp); |
1644 | dst.z = arctan(src0.z, src1.z, pp); |
1645 | dst.w = arctan(src0.w, src1.w, pp); |
1646 | } |
1647 | |
1648 | void ShaderCore::cosh(Vector4f &dst, const Vector4f &src, bool pp) |
1649 | { |
1650 | dst.x = cosineh(src.x, pp); |
1651 | dst.y = cosineh(src.y, pp); |
1652 | dst.z = cosineh(src.z, pp); |
1653 | dst.w = cosineh(src.w, pp); |
1654 | } |
1655 | |
1656 | void ShaderCore::sinh(Vector4f &dst, const Vector4f &src, bool pp) |
1657 | { |
1658 | dst.x = sineh(src.x, pp); |
1659 | dst.y = sineh(src.y, pp); |
1660 | dst.z = sineh(src.z, pp); |
1661 | dst.w = sineh(src.w, pp); |
1662 | } |
1663 | |
1664 | void ShaderCore::tanh(Vector4f &dst, const Vector4f &src, bool pp) |
1665 | { |
1666 | dst.x = tangenth(src.x, pp); |
1667 | dst.y = tangenth(src.y, pp); |
1668 | dst.z = tangenth(src.z, pp); |
1669 | dst.w = tangenth(src.w, pp); |
1670 | } |
1671 | |
1672 | void ShaderCore::acosh(Vector4f &dst, const Vector4f &src, bool pp) |
1673 | { |
1674 | dst.x = arccosh(src.x, pp); |
1675 | dst.y = arccosh(src.y, pp); |
1676 | dst.z = arccosh(src.z, pp); |
1677 | dst.w = arccosh(src.w, pp); |
1678 | } |
1679 | |
1680 | void ShaderCore::asinh(Vector4f &dst, const Vector4f &src, bool pp) |
1681 | { |
1682 | dst.x = arcsinh(src.x, pp); |
1683 | dst.y = arcsinh(src.y, pp); |
1684 | dst.z = arcsinh(src.z, pp); |
1685 | dst.w = arcsinh(src.w, pp); |
1686 | } |
1687 | |
1688 | void ShaderCore::atanh(Vector4f &dst, const Vector4f &src, bool pp) |
1689 | { |
1690 | dst.x = arctanh(src.x, pp); |
1691 | dst.y = arctanh(src.y, pp); |
1692 | dst.z = arctanh(src.z, pp); |
1693 | dst.w = arctanh(src.w, pp); |
1694 | } |
1695 | |
1696 | void ShaderCore::expp(Vector4f &dst, const Vector4f &src, unsigned short shaderModel) |
1697 | { |
1698 | if(shaderModel < 0x0200) |
1699 | { |
1700 | Float4 frc = Frac(src.x); |
1701 | Float4 floor = src.x - frc; |
1702 | |
1703 | dst.x = exponential2(floor, true); |
1704 | dst.y = frc; |
1705 | dst.z = exponential2(src.x, true); |
1706 | dst.w = Float4(1.0f); |
1707 | } |
1708 | else // Version >= 2.0 |
1709 | { |
1710 | exp2x(dst, src, true); // FIXME: 10-bit precision suffices |
1711 | } |
1712 | } |
1713 | |
1714 | void ShaderCore::logp(Vector4f &dst, const Vector4f &src, unsigned short shaderModel) |
1715 | { |
1716 | if(shaderModel < 0x0200) |
1717 | { |
1718 | Float4 tmp0; |
1719 | Float4 tmp1; |
1720 | Float4 t; |
1721 | Int4 r; |
1722 | |
1723 | tmp0 = Abs(src.x); |
1724 | tmp1 = tmp0; |
1725 | |
1726 | // X component |
1727 | r = As<Int4>(As<UInt4>(tmp0) >> 23) - Int4(127); |
1728 | dst.x = Float4(r); |
1729 | |
1730 | // Y component |
1731 | dst.y = As<Float4>((As<Int4>(tmp1) & Int4(0x007FFFFF)) | As<Int4>(Float4(1.0f))); |
1732 | |
1733 | // Z component |
1734 | dst.z = logarithm2(src.x, true, true); |
1735 | |
1736 | // W component |
1737 | dst.w = 1.0f; |
1738 | } |
1739 | else |
1740 | { |
1741 | log2x(dst, src, true); |
1742 | } |
1743 | } |
1744 | |
1745 | void ShaderCore::cmp0(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2) |
1746 | { |
1747 | cmp0(dst.x, src0.x, src1.x, src2.x); |
1748 | cmp0(dst.y, src0.y, src1.y, src2.y); |
1749 | cmp0(dst.z, src0.z, src1.z, src2.z); |
1750 | cmp0(dst.w, src0.w, src1.w, src2.w); |
1751 | } |
1752 | |
1753 | void ShaderCore::select(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2) |
1754 | { |
1755 | select(dst.x, As<Int4>(src0.x), src1.x, src2.x); |
1756 | select(dst.y, As<Int4>(src0.y), src1.y, src2.y); |
1757 | select(dst.z, As<Int4>(src0.z), src1.z, src2.z); |
1758 | select(dst.w, As<Int4>(src0.w), src1.w, src2.w); |
1759 | } |
1760 | |
1761 | void ShaderCore::(Float4 &dst, const Vector4f &src0, const Float4 &src1) |
1762 | { |
1763 | select(dst, CmpEQ(As<Int4>(src1), Int4(1)), src0.y, src0.x); |
1764 | select(dst, CmpEQ(As<Int4>(src1), Int4(2)), src0.z, dst); |
1765 | select(dst, CmpEQ(As<Int4>(src1), Int4(3)), src0.w, dst); |
1766 | } |
1767 | |
1768 | void ShaderCore::insert(Vector4f &dst, const Vector4f &src, const Float4 &element, const Float4 &index) |
1769 | { |
1770 | select(dst.x, CmpEQ(As<Int4>(index), Int4(0)), element, src.x); |
1771 | select(dst.y, CmpEQ(As<Int4>(index), Int4(1)), element, src.y); |
1772 | select(dst.z, CmpEQ(As<Int4>(index), Int4(2)), element, src.z); |
1773 | select(dst.w, CmpEQ(As<Int4>(index), Int4(3)), element, src.w); |
1774 | } |
1775 | |
1776 | void ShaderCore::sgn(Float4 &dst, const Float4 &src) |
1777 | { |
1778 | Int4 neg = As<Int4>(CmpLT(src, Float4(-0.0f))) & As<Int4>(Float4(-1.0f)); |
1779 | Int4 pos = As<Int4>(CmpNLE(src, Float4(+0.0f))) & As<Int4>(Float4(1.0f)); |
1780 | dst = As<Float4>(neg | pos); |
1781 | } |
1782 | |
1783 | void ShaderCore::isgn(Float4 &dst, const Float4 &src) |
1784 | { |
1785 | Int4 neg = CmpLT(As<Int4>(src), Int4(0)) & Int4(-1); |
1786 | Int4 pos = CmpNLE(As<Int4>(src), Int4(0)) & Int4(1); |
1787 | dst = As<Float4>(neg | pos); |
1788 | } |
1789 | |
1790 | void ShaderCore::cmp0(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2) |
1791 | { |
1792 | Int4 pos = CmpLE(Float4(0.0f), src0); |
1793 | select(dst, pos, src1, src2); |
1794 | } |
1795 | |
1796 | void ShaderCore::cmp0i(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2) |
1797 | { |
1798 | Int4 pos = CmpEQ(Int4(0), As<Int4>(src0)); |
1799 | select(dst, pos, src1, src2); |
1800 | } |
1801 | |
// Bitwise blend of two values under a mask: every bit that is set in src0 is
// taken from src1, every clear bit is taken from src2. Both inputs are read
// before dst is written, so dst may be the same object as src1 or src2.
void ShaderCore::select(Float4 &dst, RValue<Int4> src0, const Float4 &src1, const Float4 &src2)
{
	// FIXME: LLVM vector select
	dst = As<Float4>((src0 & As<Int4>(src1)) | (~src0 & As<Int4>(src2)));
}
1807 | |
1808 | void ShaderCore::cmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control) |
1809 | { |
1810 | switch(control) |
1811 | { |
1812 | case Shader::CONTROL_GT: |
1813 | dst.x = As<Float4>(CmpNLE(src0.x, src1.x)); |
1814 | dst.y = As<Float4>(CmpNLE(src0.y, src1.y)); |
1815 | dst.z = As<Float4>(CmpNLE(src0.z, src1.z)); |
1816 | dst.w = As<Float4>(CmpNLE(src0.w, src1.w)); |
1817 | break; |
1818 | case Shader::CONTROL_EQ: |
1819 | dst.x = As<Float4>(CmpEQ(src0.x, src1.x)); |
1820 | dst.y = As<Float4>(CmpEQ(src0.y, src1.y)); |
1821 | dst.z = As<Float4>(CmpEQ(src0.z, src1.z)); |
1822 | dst.w = As<Float4>(CmpEQ(src0.w, src1.w)); |
1823 | break; |
1824 | case Shader::CONTROL_GE: |
1825 | dst.x = As<Float4>(CmpNLT(src0.x, src1.x)); |
1826 | dst.y = As<Float4>(CmpNLT(src0.y, src1.y)); |
1827 | dst.z = As<Float4>(CmpNLT(src0.z, src1.z)); |
1828 | dst.w = As<Float4>(CmpNLT(src0.w, src1.w)); |
1829 | break; |
1830 | case Shader::CONTROL_LT: |
1831 | dst.x = As<Float4>(CmpLT(src0.x, src1.x)); |
1832 | dst.y = As<Float4>(CmpLT(src0.y, src1.y)); |
1833 | dst.z = As<Float4>(CmpLT(src0.z, src1.z)); |
1834 | dst.w = As<Float4>(CmpLT(src0.w, src1.w)); |
1835 | break; |
1836 | case Shader::CONTROL_NE: |
1837 | dst.x = As<Float4>(CmpNEQ(src0.x, src1.x)); |
1838 | dst.y = As<Float4>(CmpNEQ(src0.y, src1.y)); |
1839 | dst.z = As<Float4>(CmpNEQ(src0.z, src1.z)); |
1840 | dst.w = As<Float4>(CmpNEQ(src0.w, src1.w)); |
1841 | break; |
1842 | case Shader::CONTROL_LE: |
1843 | dst.x = As<Float4>(CmpLE(src0.x, src1.x)); |
1844 | dst.y = As<Float4>(CmpLE(src0.y, src1.y)); |
1845 | dst.z = As<Float4>(CmpLE(src0.z, src1.z)); |
1846 | dst.w = As<Float4>(CmpLE(src0.w, src1.w)); |
1847 | break; |
1848 | default: |
1849 | ASSERT(false); |
1850 | } |
1851 | } |
1852 | |
1853 | void ShaderCore::icmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control) |
1854 | { |
1855 | switch(control) |
1856 | { |
1857 | case Shader::CONTROL_GT: |
1858 | dst.x = As<Float4>(CmpNLE(As<Int4>(src0.x), As<Int4>(src1.x))); |
1859 | dst.y = As<Float4>(CmpNLE(As<Int4>(src0.y), As<Int4>(src1.y))); |
1860 | dst.z = As<Float4>(CmpNLE(As<Int4>(src0.z), As<Int4>(src1.z))); |
1861 | dst.w = As<Float4>(CmpNLE(As<Int4>(src0.w), As<Int4>(src1.w))); |
1862 | break; |
1863 | case Shader::CONTROL_EQ: |
1864 | dst.x = As<Float4>(CmpEQ(As<Int4>(src0.x), As<Int4>(src1.x))); |
1865 | dst.y = As<Float4>(CmpEQ(As<Int4>(src0.y), As<Int4>(src1.y))); |
1866 | dst.z = As<Float4>(CmpEQ(As<Int4>(src0.z), As<Int4>(src1.z))); |
1867 | dst.w = As<Float4>(CmpEQ(As<Int4>(src0.w), As<Int4>(src1.w))); |
1868 | break; |
1869 | case Shader::CONTROL_GE: |
1870 | dst.x = As<Float4>(CmpNLT(As<Int4>(src0.x), As<Int4>(src1.x))); |
1871 | dst.y = As<Float4>(CmpNLT(As<Int4>(src0.y), As<Int4>(src1.y))); |
1872 | dst.z = As<Float4>(CmpNLT(As<Int4>(src0.z), As<Int4>(src1.z))); |
1873 | dst.w = As<Float4>(CmpNLT(As<Int4>(src0.w), As<Int4>(src1.w))); |
1874 | break; |
1875 | case Shader::CONTROL_LT: |
1876 | dst.x = As<Float4>(CmpLT(As<Int4>(src0.x), As<Int4>(src1.x))); |
1877 | dst.y = As<Float4>(CmpLT(As<Int4>(src0.y), As<Int4>(src1.y))); |
1878 | dst.z = As<Float4>(CmpLT(As<Int4>(src0.z), As<Int4>(src1.z))); |
1879 | dst.w = As<Float4>(CmpLT(As<Int4>(src0.w), As<Int4>(src1.w))); |
1880 | break; |
1881 | case Shader::CONTROL_NE: |
1882 | dst.x = As<Float4>(CmpNEQ(As<Int4>(src0.x), As<Int4>(src1.x))); |
1883 | dst.y = As<Float4>(CmpNEQ(As<Int4>(src0.y), As<Int4>(src1.y))); |
1884 | dst.z = As<Float4>(CmpNEQ(As<Int4>(src0.z), As<Int4>(src1.z))); |
1885 | dst.w = As<Float4>(CmpNEQ(As<Int4>(src0.w), As<Int4>(src1.w))); |
1886 | break; |
1887 | case Shader::CONTROL_LE: |
1888 | dst.x = As<Float4>(CmpLE(As<Int4>(src0.x), As<Int4>(src1.x))); |
1889 | dst.y = As<Float4>(CmpLE(As<Int4>(src0.y), As<Int4>(src1.y))); |
1890 | dst.z = As<Float4>(CmpLE(As<Int4>(src0.z), As<Int4>(src1.z))); |
1891 | dst.w = As<Float4>(CmpLE(As<Int4>(src0.w), As<Int4>(src1.w))); |
1892 | break; |
1893 | default: |
1894 | ASSERT(false); |
1895 | } |
1896 | } |
1897 | |
1898 | void ShaderCore::ucmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control) |
1899 | { |
1900 | switch(control) |
1901 | { |
1902 | case Shader::CONTROL_GT: |
1903 | dst.x = As<Float4>(CmpNLE(As<UInt4>(src0.x), As<UInt4>(src1.x))); |
1904 | dst.y = As<Float4>(CmpNLE(As<UInt4>(src0.y), As<UInt4>(src1.y))); |
1905 | dst.z = As<Float4>(CmpNLE(As<UInt4>(src0.z), As<UInt4>(src1.z))); |
1906 | dst.w = As<Float4>(CmpNLE(As<UInt4>(src0.w), As<UInt4>(src1.w))); |
1907 | break; |
1908 | case Shader::CONTROL_EQ: |
1909 | dst.x = As<Float4>(CmpEQ(As<UInt4>(src0.x), As<UInt4>(src1.x))); |
1910 | dst.y = As<Float4>(CmpEQ(As<UInt4>(src0.y), As<UInt4>(src1.y))); |
1911 | dst.z = As<Float4>(CmpEQ(As<UInt4>(src0.z), As<UInt4>(src1.z))); |
1912 | dst.w = As<Float4>(CmpEQ(As<UInt4>(src0.w), As<UInt4>(src1.w))); |
1913 | break; |
1914 | case Shader::CONTROL_GE: |
1915 | dst.x = As<Float4>(CmpNLT(As<UInt4>(src0.x), As<UInt4>(src1.x))); |
1916 | dst.y = As<Float4>(CmpNLT(As<UInt4>(src0.y), As<UInt4>(src1.y))); |
1917 | dst.z = As<Float4>(CmpNLT(As<UInt4>(src0.z), As<UInt4>(src1.z))); |
1918 | dst.w = As<Float4>(CmpNLT(As<UInt4>(src0.w), As<UInt4>(src1.w))); |
1919 | break; |
1920 | case Shader::CONTROL_LT: |
1921 | dst.x = As<Float4>(CmpLT(As<UInt4>(src0.x), As<UInt4>(src1.x))); |
1922 | dst.y = As<Float4>(CmpLT(As<UInt4>(src0.y), As<UInt4>(src1.y))); |
1923 | dst.z = As<Float4>(CmpLT(As<UInt4>(src0.z), As<UInt4>(src1.z))); |
1924 | dst.w = As<Float4>(CmpLT(As<UInt4>(src0.w), As<UInt4>(src1.w))); |
1925 | break; |
1926 | case Shader::CONTROL_NE: |
1927 | dst.x = As<Float4>(CmpNEQ(As<UInt4>(src0.x), As<UInt4>(src1.x))); |
1928 | dst.y = As<Float4>(CmpNEQ(As<UInt4>(src0.y), As<UInt4>(src1.y))); |
1929 | dst.z = As<Float4>(CmpNEQ(As<UInt4>(src0.z), As<UInt4>(src1.z))); |
1930 | dst.w = As<Float4>(CmpNEQ(As<UInt4>(src0.w), As<UInt4>(src1.w))); |
1931 | break; |
1932 | case Shader::CONTROL_LE: |
1933 | dst.x = As<Float4>(CmpLE(As<UInt4>(src0.x), As<UInt4>(src1.x))); |
1934 | dst.y = As<Float4>(CmpLE(As<UInt4>(src0.y), As<UInt4>(src1.y))); |
1935 | dst.z = As<Float4>(CmpLE(As<UInt4>(src0.z), As<UInt4>(src1.z))); |
1936 | dst.w = As<Float4>(CmpLE(As<UInt4>(src0.w), As<UInt4>(src1.w))); |
1937 | break; |
1938 | default: |
1939 | ASSERT(false); |
1940 | } |
1941 | } |
1942 | |
1943 | void ShaderCore::all(Float4 &dst, const Vector4f &src) |
1944 | { |
1945 | dst = As<Float4>(As<Int4>(src.x) & As<Int4>(src.y) & As<Int4>(src.z) & As<Int4>(src.w)); |
1946 | } |
1947 | |
1948 | void ShaderCore::any(Float4 &dst, const Vector4f &src) |
1949 | { |
1950 | dst = As<Float4>(As<Int4>(src.x) | As<Int4>(src.y) | As<Int4>(src.z) | As<Int4>(src.w)); |
1951 | } |
1952 | |
1953 | void ShaderCore::bitwise_not(Vector4f &dst, const Vector4f &src) |
1954 | { |
1955 | dst.x = As<Float4>(As<Int4>(src.x) ^ Int4(0xFFFFFFFF)); |
1956 | dst.y = As<Float4>(As<Int4>(src.y) ^ Int4(0xFFFFFFFF)); |
1957 | dst.z = As<Float4>(As<Int4>(src.z) ^ Int4(0xFFFFFFFF)); |
1958 | dst.w = As<Float4>(As<Int4>(src.w) ^ Int4(0xFFFFFFFF)); |
1959 | } |
1960 | |
// Component-wise bitwise OR of the integer bit patterns of src0 and src1;
// the result is stored, as raw bits, in the matching component of dst.
void ShaderCore::bitwise_or(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
{
	dst.x = As<Float4>(As<Int4>(src0.x) | As<Int4>(src1.x));
	dst.y = As<Float4>(As<Int4>(src0.y) | As<Int4>(src1.y));
	dst.z = As<Float4>(As<Int4>(src0.z) | As<Int4>(src1.z));
	dst.w = As<Float4>(As<Int4>(src0.w) | As<Int4>(src1.w));
}
1968 | |
// Component-wise bitwise XOR of the integer bit patterns of src0 and src1;
// the result is stored, as raw bits, in the matching component of dst.
void ShaderCore::bitwise_xor(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
{
	dst.x = As<Float4>(As<Int4>(src0.x) ^ As<Int4>(src1.x));
	dst.y = As<Float4>(As<Int4>(src0.y) ^ As<Int4>(src1.y));
	dst.z = As<Float4>(As<Int4>(src0.z) ^ As<Int4>(src1.z));
	dst.w = As<Float4>(As<Int4>(src0.w) ^ As<Int4>(src1.w));
}
1976 | |
// Component-wise bitwise AND of the integer bit patterns of src0 and src1;
// the result is stored, as raw bits, in the matching component of dst.
void ShaderCore::bitwise_and(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
{
	dst.x = As<Float4>(As<Int4>(src0.x) & As<Int4>(src1.x));
	dst.y = As<Float4>(As<Int4>(src0.y) & As<Int4>(src1.y));
	dst.z = As<Float4>(As<Int4>(src0.z) & As<Int4>(src1.z));
	dst.w = As<Float4>(As<Int4>(src0.w) & As<Int4>(src1.w));
}
1984 | |
1985 | void ShaderCore::equal(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
1986 | { |
1987 | dst.x = As<Float4>(CmpEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)) & |
1988 | CmpEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)) & |
1989 | CmpEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)) & |
1990 | CmpEQ(As<UInt4>(src0.w), As<UInt4>(src1.w))); |
1991 | dst.y = dst.x; |
1992 | dst.z = dst.x; |
1993 | dst.w = dst.x; |
1994 | } |
1995 | |
1996 | void ShaderCore::notEqual(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) |
1997 | { |
1998 | dst.x = As<Float4>(CmpNEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)) | |
1999 | CmpNEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)) | |
2000 | CmpNEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)) | |
2001 | CmpNEQ(As<UInt4>(src0.w), As<UInt4>(src1.w))); |
2002 | dst.y = dst.x; |
2003 | dst.z = dst.x; |
2004 | dst.w = dst.x; |
2005 | } |
2006 | } |
2007 | |