1// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include "PixelRoutine.hpp"
16
17#include "SamplerCore.hpp"
18#include "Constants.hpp"
19#include "Renderer/Renderer.hpp"
20#include "Renderer/QuadRasterizer.hpp"
21#include "Renderer/Surface.hpp"
22#include "Renderer/Primitive.hpp"
23#include "Common/Debug.hpp"
24
25namespace sw
26{
27 extern bool complementaryDepthBuffer;
28 extern bool postBlendSRGB;
29 extern bool exactColorRounding;
30 extern bool forceClearRegisters;
31
32 PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader)
33 : QuadRasterizer(state, shader), v(shader && shader->indirectAddressableInput)
34 {
35 if(!shader || shader->getShaderModel() < 0x0200 || forceClearRegisters)
36 {
37 for(int i = 0; i < MAX_FRAGMENT_INPUTS; i++)
38 {
39 v[i].x = Float4(0.0f);
40 v[i].y = Float4(0.0f);
41 v[i].z = Float4(0.0f);
42 v[i].w = Float4(0.0f);
43 }
44 }
45 }
46
47 PixelRoutine::~PixelRoutine()
48 {
49 }
50
51 void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x)
52 {
53 #if PERF_PROFILE
54 Long pipeTime = Ticks();
55 #endif
56
57 const bool earlyDepthTest = !state.depthOverride && !state.alphaTestActive();
58
59 Int zMask[4]; // Depth mask
60 Int sMask[4]; // Stencil mask
61
62 for(unsigned int q = 0; q < state.multiSample; q++)
63 {
64 zMask[q] = cMask[q];
65 sMask[q] = cMask[q];
66 }
67
68 for(unsigned int q = 0; q < state.multiSample; q++)
69 {
70 stencilTest(sBuffer, q, x, sMask[q], cMask[q]);
71 }
72
73 Float4 f;
74 Float4 rhwCentroid;
75
76 Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16);
77
78 if(interpolateZ())
79 {
80 for(unsigned int q = 0; q < state.multiSample; q++)
81 {
82 Float4 x = xxxx;
83
84 if(state.multiSample > 1)
85 {
86 x -= *Pointer<Float4>(constants + OFFSET(Constants,X) + q * sizeof(float4));
87 }
88
89 z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive,z), false, false, state.depthClamp);
90 }
91 }
92
93 Bool depthPass = false;
94
95 if(earlyDepthTest)
96 {
97 for(unsigned int q = 0; q < state.multiSample; q++)
98 {
99 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
100 }
101 }
102
103 If(depthPass || Bool(!earlyDepthTest))
104 {
105 #if PERF_PROFILE
106 Long interpTime = Ticks();
107 #endif
108
109 Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16);
110
111 // Centroid locations
112 Float4 XXXX = Float4(0.0f);
113 Float4 YYYY = Float4(0.0f);
114
115 if(state.centroid)
116 {
117 Float4 WWWW(1.0e-9f);
118
119 for(unsigned int q = 0; q < state.multiSample; q++)
120 {
121 XXXX += *Pointer<Float4>(constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]);
122 YYYY += *Pointer<Float4>(constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]);
123 WWWW += *Pointer<Float4>(constants + OFFSET(Constants,weight) + 16 * cMask[q]);
124 }
125
126 WWWW = Rcp_pp(WWWW);
127 XXXX *= WWWW;
128 YYYY *= WWWW;
129
130 XXXX += xxxx;
131 YYYY += yyyy;
132 }
133
134 if(interpolateW())
135 {
136 w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive,w), false, false, false);
137 rhw = reciprocal(w, false, false, true);
138
139 if(state.centroid)
140 {
141 rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive,w), false, false));
142 }
143 }
144
145 for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
146 {
147 for(int component = 0; component < 4; component++)
148 {
149 if(state.interpolant[interpolant].component & (1 << component))
150 {
151 if(!state.interpolant[interpolant].centroid)
152 {
153 v[interpolant][component] = interpolate(xxxx, Dv[interpolant][component], rhw, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective, false);
154 }
155 else
156 {
157 v[interpolant][component] = interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
158 }
159 }
160 }
161
162 Float4 rcp;
163
164 switch(state.interpolant[interpolant].project)
165 {
166 case 0:
167 break;
168 case 1:
169 rcp = reciprocal(v[interpolant].y);
170 v[interpolant].x = v[interpolant].x * rcp;
171 break;
172 case 2:
173 rcp = reciprocal(v[interpolant].z);
174 v[interpolant].x = v[interpolant].x * rcp;
175 v[interpolant].y = v[interpolant].y * rcp;
176 break;
177 case 3:
178 rcp = reciprocal(v[interpolant].w);
179 v[interpolant].x = v[interpolant].x * rcp;
180 v[interpolant].y = v[interpolant].y * rcp;
181 v[interpolant].z = v[interpolant].z * rcp;
182 break;
183 }
184 }
185
186 if(state.fog.component)
187 {
188 f = interpolate(xxxx, Df, rhw, primitive + OFFSET(Primitive,f), state.fog.flat & 0x01, state.perspective, false);
189 }
190
191 setBuiltins(x, y, z, w);
192
193 #if PERF_PROFILE
194 cycles[PERF_INTERP] += Ticks() - interpTime;
195 #endif
196
197 Bool alphaPass = true;
198
199 if(colorUsed())
200 {
201 #if PERF_PROFILE
202 Long shaderTime = Ticks();
203 #endif
204
205 applyShader(cMask);
206
207 #if PERF_PROFILE
208 cycles[PERF_SHADER] += Ticks() - shaderTime;
209 #endif
210
211 alphaPass = alphaTest(cMask);
212
213 if((shader && shader->containsKill()) || state.alphaTestActive())
214 {
215 for(unsigned int q = 0; q < state.multiSample; q++)
216 {
217 zMask[q] &= cMask[q];
218 sMask[q] &= cMask[q];
219 }
220 }
221 }
222
223 If(alphaPass)
224 {
225 if(!earlyDepthTest)
226 {
227 for(unsigned int q = 0; q < state.multiSample; q++)
228 {
229 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
230 }
231 }
232
233 #if PERF_PROFILE
234 Long ropTime = Ticks();
235 #endif
236
237 If(depthPass || Bool(earlyDepthTest))
238 {
239 for(unsigned int q = 0; q < state.multiSample; q++)
240 {
241 if(state.multiSampleMask & (1 << q))
242 {
243 writeDepth(zBuffer, q, x, z[q], zMask[q]);
244
245 if(state.occlusionEnabled)
246 {
247 occlusion += *Pointer<UInt>(constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q]));
248 }
249 }
250 }
251
252 if(colorUsed())
253 {
254 #if PERF_PROFILE
255 AddAtomic(Pointer<Long>(&profiler.ropOperations), 4);
256 #endif
257
258 rasterOperation(f, cBuffer, x, sMask, zMask, cMask);
259 }
260 }
261
262 #if PERF_PROFILE
263 cycles[PERF_ROP] += Ticks() - ropTime;
264 #endif
265 }
266 }
267
268 for(unsigned int q = 0; q < state.multiSample; q++)
269 {
270 if(state.multiSampleMask & (1 << q))
271 {
272 writeStencil(sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
273 }
274 }
275
276 #if PERF_PROFILE
277 cycles[PERF_PIPE] += Ticks() - pipeTime;
278 #endif
279 }
280
281 Float4 PixelRoutine::interpolateCentroid(Float4 &x, Float4 &y, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective)
282 {
283 Float4 interpolant = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,C), 16);
284
285 if(!flat)
286 {
287 interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,A), 16) +
288 y * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,B), 16);
289
290 if(perspective)
291 {
292 interpolant *= rhw;
293 }
294 }
295
296 return interpolant;
297 }
298
299 void PixelRoutine::stencilTest(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask)
300 {
301 if(!state.stencilActive)
302 {
303 return;
304 }
305
306 // (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
307
308 Pointer<Byte> buffer = sBuffer + 2 * x;
309
310 if(q > 0)
311 {
312 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
313 }
314
315 Byte8 value = *Pointer<Byte8>(buffer);
316 Byte8 valueCCW = value;
317
318 if(!state.noStencilMask)
319 {
320 value &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].testMaskQ));
321 }
322
323 stencilTest(value, state.stencilCompareMode, false);
324
325 if(state.twoSidedStencil)
326 {
327 if(!state.noStencilMaskCCW)
328 {
329 valueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].testMaskQ));
330 }
331
332 stencilTest(valueCCW, state.stencilCompareModeCCW, true);
333
334 value &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
335 valueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
336 value |= valueCCW;
337 }
338
339 sMask = SignMask(value) & cMask;
340 }
341
342 void PixelRoutine::stencilTest(Byte8 &value, StencilCompareMode stencilCompareMode, bool CCW)
343 {
344 Byte8 equal;
345
346 switch(stencilCompareMode)
347 {
348 case STENCIL_ALWAYS:
349 value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
350 break;
351 case STENCIL_NEVER:
352 value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
353 break;
354 case STENCIL_LESS: // a < b ~ b > a
355 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
356 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
357 break;
358 case STENCIL_EQUAL:
359 value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
360 break;
361 case STENCIL_NOTEQUAL: // a != b ~ !(a == b)
362 value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
363 value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
364 break;
365 case STENCIL_LESSEQUAL: // a <= b ~ (b > a) || (a == b)
366 equal = value;
367 equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
368 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
369 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
370 value |= equal;
371 break;
372 case STENCIL_GREATER: // a > b
373 equal = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ));
374 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
375 equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
376 value = equal;
377 break;
378 case STENCIL_GREATEREQUAL: // a >= b ~ !(a < b) ~ !(b > a)
379 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
380 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
381 value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
382 break;
383 default:
384 ASSERT(false);
385 }
386 }
387
388 Bool PixelRoutine::depthTest(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask)
389 {
390 if(!state.depthTestActive)
391 {
392 return true;
393 }
394
395 Float4 Z = z;
396
397 if(shader && shader->depthOverride())
398 {
399 if(complementaryDepthBuffer)
400 {
401 Z = Float4(1.0f) - oDepth;
402 }
403 else
404 {
405 Z = oDepth;
406 }
407 }
408
409 Pointer<Byte> buffer;
410 Int pitch;
411
412 if(!state.quadLayoutDepthBuffer)
413 {
414 buffer = zBuffer + 4 * x;
415 pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
416 }
417 else
418 {
419 buffer = zBuffer + 8 * x;
420 }
421
422 if(q > 0)
423 {
424 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
425 }
426
427 Float4 zValue;
428
429 if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
430 {
431 if(!state.quadLayoutDepthBuffer)
432 {
433 // FIXME: Properly optimizes?
434 zValue.xy = *Pointer<Float4>(buffer);
435 zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
436 }
437 else
438 {
439 zValue = *Pointer<Float4>(buffer, 16);
440 }
441 }
442
443 Int4 zTest;
444
445 switch(state.depthCompareMode)
446 {
447 case DEPTH_ALWAYS:
448 // Optimized
449 break;
450 case DEPTH_NEVER:
451 // Optimized
452 break;
453 case DEPTH_EQUAL:
454 zTest = CmpEQ(zValue, Z);
455 break;
456 case DEPTH_NOTEQUAL:
457 zTest = CmpNEQ(zValue, Z);
458 break;
459 case DEPTH_LESS:
460 if(complementaryDepthBuffer)
461 {
462 zTest = CmpLT(zValue, Z);
463 }
464 else
465 {
466 zTest = CmpNLE(zValue, Z);
467 }
468 break;
469 case DEPTH_GREATEREQUAL:
470 if(complementaryDepthBuffer)
471 {
472 zTest = CmpNLT(zValue, Z);
473 }
474 else
475 {
476 zTest = CmpLE(zValue, Z);
477 }
478 break;
479 case DEPTH_LESSEQUAL:
480 if(complementaryDepthBuffer)
481 {
482 zTest = CmpLE(zValue, Z);
483 }
484 else
485 {
486 zTest = CmpNLT(zValue, Z);
487 }
488 break;
489 case DEPTH_GREATER:
490 if(complementaryDepthBuffer)
491 {
492 zTest = CmpNLE(zValue, Z);
493 }
494 else
495 {
496 zTest = CmpLT(zValue, Z);
497 }
498 break;
499 default:
500 ASSERT(false);
501 }
502
503 switch(state.depthCompareMode)
504 {
505 case DEPTH_ALWAYS:
506 zMask = cMask;
507 break;
508 case DEPTH_NEVER:
509 zMask = 0x0;
510 break;
511 default:
512 zMask = SignMask(zTest) & cMask;
513 break;
514 }
515
516 if(state.stencilActive)
517 {
518 zMask &= sMask;
519 }
520
521 return zMask != 0;
522 }
523
524 void PixelRoutine::alphaTest(Int &aMask, Short4 &alpha)
525 {
526 Short4 cmp;
527 Short4 equal;
528
529 switch(state.alphaCompareMode)
530 {
531 case ALPHA_ALWAYS:
532 aMask = 0xF;
533 break;
534 case ALPHA_NEVER:
535 aMask = 0x0;
536 break;
537 case ALPHA_EQUAL:
538 cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
539 aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
540 break;
541 case ALPHA_NOTEQUAL: // a != b ~ !(a == b)
542 cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu); // FIXME
543 aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
544 break;
545 case ALPHA_LESS: // a < b ~ b > a
546 cmp = CmpGT(*Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)), alpha);
547 aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
548 break;
549 case ALPHA_GREATEREQUAL: // a >= b ~ (a > b) || (a == b) ~ !(b > a) // TODO: Approximate
550 equal = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
551 cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
552 cmp |= equal;
553 aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
554 break;
555 case ALPHA_LESSEQUAL: // a <= b ~ !(a > b)
556 cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu); // FIXME
557 aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
558 break;
559 case ALPHA_GREATER: // a > b
560 cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
561 aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
562 break;
563 default:
564 ASSERT(false);
565 }
566 }
567
568 void PixelRoutine::alphaToCoverage(Int cMask[4], Float4 &alpha)
569 {
570 Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c0)));
571 Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c1)));
572 Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c2)));
573 Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c3)));
574
575 Int aMask0 = SignMask(coverage0);
576 Int aMask1 = SignMask(coverage1);
577 Int aMask2 = SignMask(coverage2);
578 Int aMask3 = SignMask(coverage3);
579
580 cMask[0] &= aMask0;
581 cMask[1] &= aMask1;
582 cMask[2] &= aMask2;
583 cMask[3] &= aMask3;
584 }
585
586 void PixelRoutine::fogBlend(Vector4f &c0, Float4 &fog)
587 {
588 if(!state.fogActive)
589 {
590 return;
591 }
592
593 if(state.pixelFogMode != FOG_NONE)
594 {
595 pixelFog(fog);
596
597 fog = Min(fog, Float4(1.0f));
598 fog = Max(fog, Float4(0.0f));
599 }
600
601 c0.x -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
602 c0.y -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
603 c0.z -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
604
605 c0.x *= fog;
606 c0.y *= fog;
607 c0.z *= fog;
608
609 c0.x += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
610 c0.y += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
611 c0.z += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
612 }
613
614 void PixelRoutine::pixelFog(Float4 &visibility)
615 {
616 Float4 &zw = visibility;
617
618 if(state.pixelFogMode != FOG_NONE)
619 {
620 if(state.wBasedFog)
621 {
622 zw = rhw;
623 }
624 else
625 {
626 if(complementaryDepthBuffer)
627 {
628 zw = Float4(1.0f) - z[0];
629 }
630 else
631 {
632 zw = z[0];
633 }
634 }
635 }
636
637 switch(state.pixelFogMode)
638 {
639 case FOG_NONE:
640 break;
641 case FOG_LINEAR:
642 zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.scale));
643 zw += *Pointer<Float4>(data + OFFSET(DrawData,fog.offset));
644 break;
645 case FOG_EXP:
646 zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.densityE));
647 zw = exponential2(zw, true);
648 break;
649 case FOG_EXP2:
650 zw *= zw;
651 zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.density2E));
652 zw = exponential2(zw, true);
653 break;
654 default:
655 ASSERT(false);
656 }
657 }
658
659 void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask)
660 {
661 if(!state.depthWriteEnable)
662 {
663 return;
664 }
665
666 Float4 Z = z;
667
668 if(shader && shader->depthOverride())
669 {
670 if(complementaryDepthBuffer)
671 {
672 Z = Float4(1.0f) - oDepth;
673 }
674 else
675 {
676 Z = oDepth;
677 }
678 }
679
680 Pointer<Byte> buffer;
681 Int pitch;
682
683 if(!state.quadLayoutDepthBuffer)
684 {
685 buffer = zBuffer + 4 * x;
686 pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
687 }
688 else
689 {
690 buffer = zBuffer + 8 * x;
691 }
692
693 if(q > 0)
694 {
695 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
696 }
697
698 Float4 zValue;
699
700 if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
701 {
702 if(!state.quadLayoutDepthBuffer)
703 {
704 // FIXME: Properly optimizes?
705 zValue.xy = *Pointer<Float4>(buffer);
706 zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
707 }
708 else
709 {
710 zValue = *Pointer<Float4>(buffer, 16);
711 }
712 }
713
714 Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + zMask * 16, 16));
715 zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16));
716 Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
717
718 if(!state.quadLayoutDepthBuffer)
719 {
720 // FIXME: Properly optimizes?
721 *Pointer<Float2>(buffer) = Float2(Z.xy);
722 *Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
723 }
724 else
725 {
726 *Pointer<Float4>(buffer, 16) = Z;
727 }
728 }
729
730 void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask)
731 {
732 if(!state.stencilActive)
733 {
734 return;
735 }
736
737 if(state.stencilPassOperation == OPERATION_KEEP && state.stencilZFailOperation == OPERATION_KEEP && state.stencilFailOperation == OPERATION_KEEP)
738 {
739 if(!state.twoSidedStencil || (state.stencilPassOperationCCW == OPERATION_KEEP && state.stencilZFailOperationCCW == OPERATION_KEEP && state.stencilFailOperationCCW == OPERATION_KEEP))
740 {
741 return;
742 }
743 }
744
745 if(state.stencilWriteMasked && (!state.twoSidedStencil || state.stencilWriteMaskedCCW))
746 {
747 return;
748 }
749
750 Pointer<Byte> buffer = sBuffer + 2 * x;
751
752 if(q > 0)
753 {
754 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
755 }
756
757 Byte8 bufferValue = *Pointer<Byte8>(buffer);
758
759 Byte8 newValue;
760 stencilOperation(newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask);
761
762 if(!state.noStencilWriteMask)
763 {
764 Byte8 maskedValue = bufferValue;
765 newValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].writeMaskQ));
766 maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].invWriteMaskQ));
767 newValue |= maskedValue;
768 }
769
770 if(state.twoSidedStencil)
771 {
772 Byte8 newValueCCW;
773
774 stencilOperation(newValueCCW, bufferValue, state.stencilPassOperationCCW, state.stencilZFailOperationCCW, state.stencilFailOperationCCW, true, zMask, sMask);
775
776 if(!state.noStencilWriteMaskCCW)
777 {
778 Byte8 maskedValue = bufferValue;
779 newValueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].writeMaskQ));
780 maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].invWriteMaskQ));
781 newValueCCW |= maskedValue;
782 }
783
784 newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
785 newValueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
786 newValue |= newValueCCW;
787 }
788
789 newValue &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * cMask);
790 bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask);
791 newValue |= bufferValue;
792
793 *Pointer<Byte4>(buffer) = Byte4(newValue);
794 }
795
796 void PixelRoutine::stencilOperation(Byte8 &newValue, Byte8 &bufferValue, StencilOperation stencilPassOperation, StencilOperation stencilZFailOperation, StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask)
797 {
798 Byte8 &pass = newValue;
799 Byte8 fail;
800 Byte8 zFail;
801
802 stencilOperation(pass, bufferValue, stencilPassOperation, CCW);
803
804 if(stencilZFailOperation != stencilPassOperation)
805 {
806 stencilOperation(zFail, bufferValue, stencilZFailOperation, CCW);
807 }
808
809 if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
810 {
811 stencilOperation(fail, bufferValue, stencilFailOperation, CCW);
812 }
813
814 if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
815 {
816 if(state.depthTestActive && stencilZFailOperation != stencilPassOperation) // zMask valid and values not the same
817 {
818 pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * zMask);
819 zFail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask);
820 pass |= zFail;
821 }
822
823 pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * sMask);
824 fail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask);
825 pass |= fail;
826 }
827 }
828
829 void PixelRoutine::stencilOperation(Byte8 &output, Byte8 &bufferValue, StencilOperation operation, bool CCW)
830 {
831 switch(operation)
832 {
833 case OPERATION_KEEP:
834 output = bufferValue;
835 break;
836 case OPERATION_ZERO:
837 output = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
838 break;
839 case OPERATION_REPLACE:
840 output = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceQ));
841 break;
842 case OPERATION_INCRSAT:
843 output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
844 break;
845 case OPERATION_DECRSAT:
846 output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
847 break;
848 case OPERATION_INVERT:
849 output = bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
850 break;
851 case OPERATION_INCR:
852 output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
853 break;
854 case OPERATION_DECR:
855 output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
856 break;
857 default:
858 ASSERT(false);
859 }
860 }
861
862 void PixelRoutine::blendFactor(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorActive)
863 {
864 switch(blendFactorActive)
865 {
866 case BLEND_ZERO:
867 // Optimized
868 break;
869 case BLEND_ONE:
870 // Optimized
871 break;
872 case BLEND_SOURCE:
873 blendFactor.x = current.x;
874 blendFactor.y = current.y;
875 blendFactor.z = current.z;
876 break;
877 case BLEND_INVSOURCE:
878 blendFactor.x = Short4(0xFFFFu) - current.x;
879 blendFactor.y = Short4(0xFFFFu) - current.y;
880 blendFactor.z = Short4(0xFFFFu) - current.z;
881 break;
882 case BLEND_DEST:
883 blendFactor.x = pixel.x;
884 blendFactor.y = pixel.y;
885 blendFactor.z = pixel.z;
886 break;
887 case BLEND_INVDEST:
888 blendFactor.x = Short4(0xFFFFu) - pixel.x;
889 blendFactor.y = Short4(0xFFFFu) - pixel.y;
890 blendFactor.z = Short4(0xFFFFu) - pixel.z;
891 break;
892 case BLEND_SOURCEALPHA:
893 blendFactor.x = current.w;
894 blendFactor.y = current.w;
895 blendFactor.z = current.w;
896 break;
897 case BLEND_INVSOURCEALPHA:
898 blendFactor.x = Short4(0xFFFFu) - current.w;
899 blendFactor.y = Short4(0xFFFFu) - current.w;
900 blendFactor.z = Short4(0xFFFFu) - current.w;
901 break;
902 case BLEND_DESTALPHA:
903 blendFactor.x = pixel.w;
904 blendFactor.y = pixel.w;
905 blendFactor.z = pixel.w;
906 break;
907 case BLEND_INVDESTALPHA:
908 blendFactor.x = Short4(0xFFFFu) - pixel.w;
909 blendFactor.y = Short4(0xFFFFu) - pixel.w;
910 blendFactor.z = Short4(0xFFFFu) - pixel.w;
911 break;
912 case BLEND_SRCALPHASAT:
913 blendFactor.x = Short4(0xFFFFu) - pixel.w;
914 blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w));
915 blendFactor.y = blendFactor.x;
916 blendFactor.z = blendFactor.x;
917 break;
918 case BLEND_CONSTANT:
919 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[0]));
920 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[1]));
921 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[2]));
922 break;
923 case BLEND_INVCONSTANT:
924 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
925 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
926 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
927 break;
928 case BLEND_CONSTANTALPHA:
929 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
930 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
931 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
932 break;
933 case BLEND_INVCONSTANTALPHA:
934 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
935 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
936 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
937 break;
938 default:
939 ASSERT(false);
940 }
941 }
942
943 void PixelRoutine::blendFactorAlpha(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorAlphaActive)
944 {
945 switch(blendFactorAlphaActive)
946 {
947 case BLEND_ZERO:
948 // Optimized
949 break;
950 case BLEND_ONE:
951 // Optimized
952 break;
953 case BLEND_SOURCE:
954 blendFactor.w = current.w;
955 break;
956 case BLEND_INVSOURCE:
957 blendFactor.w = Short4(0xFFFFu) - current.w;
958 break;
959 case BLEND_DEST:
960 blendFactor.w = pixel.w;
961 break;
962 case BLEND_INVDEST:
963 blendFactor.w = Short4(0xFFFFu) - pixel.w;
964 break;
965 case BLEND_SOURCEALPHA:
966 blendFactor.w = current.w;
967 break;
968 case BLEND_INVSOURCEALPHA:
969 blendFactor.w = Short4(0xFFFFu) - current.w;
970 break;
971 case BLEND_DESTALPHA:
972 blendFactor.w = pixel.w;
973 break;
974 case BLEND_INVDESTALPHA:
975 blendFactor.w = Short4(0xFFFFu) - pixel.w;
976 break;
977 case BLEND_SRCALPHASAT:
978 blendFactor.w = Short4(0xFFFFu);
979 break;
980 case BLEND_CONSTANT:
981 case BLEND_CONSTANTALPHA:
982 blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
983 break;
984 case BLEND_INVCONSTANT:
985 case BLEND_INVCONSTANTALPHA:
986 blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
987 break;
988 default:
989 ASSERT(false);
990 }
991 }
992
993 bool PixelRoutine::isSRGB(int index) const
994 {
995 return Surface::isSRGBformat(state.targetFormat[index]);
996 }
997
998 void PixelRoutine::readPixel(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &pixel)
999 {
1000 Short4 c01;
1001 Short4 c23;
1002 Pointer<Byte> buffer;
1003 Pointer<Byte> buffer2;
1004
1005 switch(state.targetFormat[index])
1006 {
1007 case FORMAT_R5G6B5:
1008 buffer = cBuffer + 2 * x;
1009 buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1010 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1011
1012 pixel.x = c01 & Short4(0xF800u);
1013 pixel.y = (c01 & Short4(0x07E0u)) << 5;
1014 pixel.z = (c01 & Short4(0x001Fu)) << 11;
1015 pixel.w = Short4(0xFFFFu);
1016 break;
1017 case FORMAT_A8R8G8B8:
1018 buffer = cBuffer + 4 * x;
1019 c01 = *Pointer<Short4>(buffer);
1020 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1021 c23 = *Pointer<Short4>(buffer);
1022 pixel.z = c01;
1023 pixel.y = c01;
1024 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1025 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1026 pixel.x = pixel.z;
1027 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1028 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1029 pixel.y = pixel.z;
1030 pixel.w = pixel.x;
1031 pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1032 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1033 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1034 pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1035 break;
1036 case FORMAT_A8B8G8R8:
1037 case FORMAT_SRGB8_A8:
1038 buffer = cBuffer + 4 * x;
1039 c01 = *Pointer<Short4>(buffer);
1040 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1041 c23 = *Pointer<Short4>(buffer);
1042 pixel.z = c01;
1043 pixel.y = c01;
1044 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1045 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1046 pixel.x = pixel.z;
1047 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1048 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1049 pixel.y = pixel.z;
1050 pixel.w = pixel.x;
1051 pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1052 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1053 pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1054 pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1055 break;
1056 case FORMAT_A8:
1057 buffer = cBuffer + 1 * x;
1058 pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 0);
1059 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1060 pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 1);
1061 pixel.w = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1062 pixel.x = Short4(0x0000);
1063 pixel.y = Short4(0x0000);
1064 pixel.z = Short4(0x0000);
1065 break;
1066 case FORMAT_R8:
1067 buffer = cBuffer + 1 * x;
1068 pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 0);
1069 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1070 pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 1);
1071 pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1072 pixel.y = Short4(0x0000);
1073 pixel.z = Short4(0x0000);
1074 pixel.w = Short4(0xFFFFu);
1075 break;
1076 case FORMAT_X8R8G8B8:
1077 buffer = cBuffer + 4 * x;
1078 c01 = *Pointer<Short4>(buffer);
1079 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1080 c23 = *Pointer<Short4>(buffer);
1081 pixel.z = c01;
1082 pixel.y = c01;
1083 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1084 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1085 pixel.x = pixel.z;
1086 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1087 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1088 pixel.y = pixel.z;
1089 pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1090 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1091 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1092 pixel.w = Short4(0xFFFFu);
1093 break;
1094 case FORMAT_G8R8:
1095 buffer = cBuffer + 2 * x;
1096 c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 0));
1097 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1098 c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 1));
1099 pixel.x = (c01 & Short4(0x00FFu)) | (c01 << 8);
1100 pixel.y = (c01 & Short4(0xFF00u)) | As<Short4>(As<UShort4>(c01) >> 8);
1101 pixel.z = Short4(0x0000u);
1102 pixel.w = Short4(0xFFFFu);
1103 break;
1104 case FORMAT_X8B8G8R8:
1105 case FORMAT_SRGB8_X8:
1106 buffer = cBuffer + 4 * x;
1107 c01 = *Pointer<Short4>(buffer);
1108 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1109 c23 = *Pointer<Short4>(buffer);
1110 pixel.z = c01;
1111 pixel.y = c01;
1112 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1113 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1114 pixel.x = pixel.z;
1115 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1116 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1117 pixel.y = pixel.z;
1118 pixel.w = pixel.x;
1119 pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1120 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1121 pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1122 pixel.w = Short4(0xFFFFu);
1123 break;
1124 case FORMAT_A8G8R8B8Q:
1125 UNIMPLEMENTED();
1126 // pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1127 // pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1128 // pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1129 // pixel.w = UnpackHigh(As<Byte8>(pixel.w), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1130 break;
1131 case FORMAT_X8G8R8B8Q:
1132 UNIMPLEMENTED();
1133 // pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1134 // pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1135 // pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1136 // pixel.w = Short4(0xFFFFu);
1137 break;
1138 case FORMAT_A16B16G16R16:
1139 buffer = cBuffer;
1140 pixel.x = *Pointer<Short4>(buffer + 8 * x);
1141 pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
1142 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1143 pixel.z = *Pointer<Short4>(buffer + 8 * x);
1144 pixel.w = *Pointer<Short4>(buffer + 8 * x + 8);
1145 transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
1146 break;
1147 case FORMAT_G16R16:
1148 buffer = cBuffer;
1149 pixel.x = *Pointer<Short4>(buffer + 4 * x);
1150 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1151 pixel.y = *Pointer<Short4>(buffer + 4 * x);
1152 pixel.z = pixel.x;
1153 pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
1154 pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y));
1155 pixel.y = pixel.z;
1156 pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z));
1157 pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z));
1158 pixel.z = Short4(0xFFFFu);
1159 pixel.w = Short4(0xFFFFu);
1160 break;
1161 default:
1162 ASSERT(false);
1163 }
1164
1165 if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
1166 {
1167 sRGBtoLinear16_12_16(pixel);
1168 }
1169 }
1170
1171 void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
1172 {
1173 if(!state.alphaBlendActive)
1174 {
1175 return;
1176 }
1177
1178 Vector4s pixel;
1179 readPixel(index, cBuffer, x, pixel);
1180
1181 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
1182 Vector4s sourceFactor;
1183 Vector4s destFactor;
1184
1185 blendFactor(sourceFactor, current, pixel, state.sourceBlendFactor);
1186 blendFactor(destFactor, current, pixel, state.destBlendFactor);
1187
1188 if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
1189 {
1190 current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x));
1191 current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
1192 current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
1193 }
1194
1195 if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
1196 {
1197 pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
1198 pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y));
1199 pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z));
1200 }
1201
1202 switch(state.blendOperation)
1203 {
1204 case BLENDOP_ADD:
1205 current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1206 current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1207 current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1208 break;
1209 case BLENDOP_SUB:
1210 current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1211 current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1212 current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1213 break;
1214 case BLENDOP_INVSUB:
1215 current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x));
1216 current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y));
1217 current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z));
1218 break;
1219 case BLENDOP_MIN:
1220 current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x));
1221 current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y));
1222 current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z));
1223 break;
1224 case BLENDOP_MAX:
1225 current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x));
1226 current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y));
1227 current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z));
1228 break;
1229 case BLENDOP_SOURCE:
1230 // No operation
1231 break;
1232 case BLENDOP_DEST:
1233 current.x = pixel.x;
1234 current.y = pixel.y;
1235 current.z = pixel.z;
1236 break;
1237 case BLENDOP_NULL:
1238 current.x = Short4(0x0000);
1239 current.y = Short4(0x0000);
1240 current.z = Short4(0x0000);
1241 break;
1242 default:
1243 ASSERT(false);
1244 }
1245
1246 blendFactorAlpha(sourceFactor, current, pixel, state.sourceBlendFactorAlpha);
1247 blendFactorAlpha(destFactor, current, pixel, state.destBlendFactorAlpha);
1248
1249 if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
1250 {
1251 current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
1252 }
1253
1254 if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
1255 {
1256 pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
1257 }
1258
1259 switch(state.blendOperationAlpha)
1260 {
1261 case BLENDOP_ADD:
1262 current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1263 break;
1264 case BLENDOP_SUB:
1265 current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1266 break;
1267 case BLENDOP_INVSUB:
1268 current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w));
1269 break;
1270 case BLENDOP_MIN:
1271 current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w));
1272 break;
1273 case BLENDOP_MAX:
1274 current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w));
1275 break;
1276 case BLENDOP_SOURCE:
1277 // No operation
1278 break;
1279 case BLENDOP_DEST:
1280 current.w = pixel.w;
1281 break;
1282 case BLENDOP_NULL:
1283 current.w = Short4(0x0000);
1284 break;
1285 default:
1286 ASSERT(false);
1287 }
1288 }
1289
1290 void PixelRoutine::logicOperation(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
1291 {
1292 if(state.logicalOperation == LOGICALOP_COPY)
1293 {
1294 return;
1295 }
1296
1297 Vector4s pixel;
1298 readPixel(index, cBuffer, x, pixel);
1299
1300 switch(state.logicalOperation)
1301 {
1302 case LOGICALOP_CLEAR:
1303 current.x = UShort4(0);
1304 current.y = UShort4(0);
1305 current.z = UShort4(0);
1306 break;
1307 case LOGICALOP_SET:
1308 current.x = UShort4(0xFFFFu);
1309 current.y = UShort4(0xFFFFu);
1310 current.z = UShort4(0xFFFFu);
1311 break;
1312 case LOGICALOP_COPY:
1313 ASSERT(false); // Optimized out
1314 break;
1315 case LOGICALOP_COPY_INVERTED:
1316 current.x = ~current.x;
1317 current.y = ~current.y;
1318 current.z = ~current.z;
1319 break;
1320 case LOGICALOP_NOOP:
1321 current.x = pixel.x;
1322 current.y = pixel.y;
1323 current.z = pixel.z;
1324 break;
1325 case LOGICALOP_INVERT:
1326 current.x = ~pixel.x;
1327 current.y = ~pixel.y;
1328 current.z = ~pixel.z;
1329 break;
1330 case LOGICALOP_AND:
1331 current.x = pixel.x & current.x;
1332 current.y = pixel.y & current.y;
1333 current.z = pixel.z & current.z;
1334 break;
1335 case LOGICALOP_NAND:
1336 current.x = ~(pixel.x & current.x);
1337 current.y = ~(pixel.y & current.y);
1338 current.z = ~(pixel.z & current.z);
1339 break;
1340 case LOGICALOP_OR:
1341 current.x = pixel.x | current.x;
1342 current.y = pixel.y | current.y;
1343 current.z = pixel.z | current.z;
1344 break;
1345 case LOGICALOP_NOR:
1346 current.x = ~(pixel.x | current.x);
1347 current.y = ~(pixel.y | current.y);
1348 current.z = ~(pixel.z | current.z);
1349 break;
1350 case LOGICALOP_XOR:
1351 current.x = pixel.x ^ current.x;
1352 current.y = pixel.y ^ current.y;
1353 current.z = pixel.z ^ current.z;
1354 break;
1355 case LOGICALOP_EQUIV:
1356 current.x = ~(pixel.x ^ current.x);
1357 current.y = ~(pixel.y ^ current.y);
1358 current.z = ~(pixel.z ^ current.z);
1359 break;
1360 case LOGICALOP_AND_REVERSE:
1361 current.x = ~pixel.x & current.x;
1362 current.y = ~pixel.y & current.y;
1363 current.z = ~pixel.z & current.z;
1364 break;
1365 case LOGICALOP_AND_INVERTED:
1366 current.x = pixel.x & ~current.x;
1367 current.y = pixel.y & ~current.y;
1368 current.z = pixel.z & ~current.z;
1369 break;
1370 case LOGICALOP_OR_REVERSE:
1371 current.x = ~pixel.x | current.x;
1372 current.y = ~pixel.y | current.y;
1373 current.z = ~pixel.z | current.z;
1374 break;
1375 case LOGICALOP_OR_INVERTED:
1376 current.x = pixel.x | ~current.x;
1377 current.y = pixel.y | ~current.y;
1378 current.z = pixel.z | ~current.z;
1379 break;
1380 default:
1381 ASSERT(false);
1382 }
1383 }
1384
1385 void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &current, Int &sMask, Int &zMask, Int &cMask)
1386 {
1387 if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
1388 {
1389 linearToSRGB16_12_16(current);
1390 }
1391
1392 if(exactColorRounding)
1393 {
1394 switch(state.targetFormat[index])
1395 {
1396 case FORMAT_R5G6B5:
1397 current.x = AddSat(As<UShort4>(current.x), UShort4(0x0400));
1398 current.y = AddSat(As<UShort4>(current.y), UShort4(0x0200));
1399 current.z = AddSat(As<UShort4>(current.z), UShort4(0x0400));
1400 break;
1401 case FORMAT_X8G8R8B8Q:
1402 case FORMAT_A8G8R8B8Q:
1403 case FORMAT_X8R8G8B8:
1404 case FORMAT_X8B8G8R8:
1405 case FORMAT_A8R8G8B8:
1406 case FORMAT_A8B8G8R8:
1407 case FORMAT_SRGB8_X8:
1408 case FORMAT_SRGB8_A8:
1409 case FORMAT_G8R8:
1410 case FORMAT_R8:
1411 current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080);
1412 current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080);
1413 current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080);
1414 current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080);
1415 break;
1416 default:
1417 break;
1418 }
1419 }
1420
1421 int rgbaWriteMask = state.colorWriteActive(index);
1422 int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
1423
1424 switch(state.targetFormat[index])
1425 {
1426 case FORMAT_R5G6B5:
1427 {
1428 current.x = current.x & Short4(0xF800u);
1429 current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
1430 current.z = As<UShort4>(current.z) >> 11;
1431
1432 current.x = current.x | current.y | current.z;
1433 }
1434 break;
1435 case FORMAT_X8G8R8B8Q:
1436 UNIMPLEMENTED();
1437 // current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1438 // current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1439 // current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1440
1441 // current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1442 // current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1443 break;
1444 case FORMAT_A8G8R8B8Q:
1445 UNIMPLEMENTED();
1446 // current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1447 // current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1448 // current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1449 // current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1450
1451 // current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1452 // current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1453 break;
1454 case FORMAT_X8R8G8B8:
1455 case FORMAT_A8R8G8B8:
1456 if(state.targetFormat[index] == FORMAT_X8R8G8B8 || rgbaWriteMask == 0x7)
1457 {
1458 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1459 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1460 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1461
1462 current.z = As<Short4>(PackUnsigned(current.z, current.x));
1463 current.y = As<Short4>(PackUnsigned(current.y, current.y));
1464
1465 current.x = current.z;
1466 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1467 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1468 current.y = current.z;
1469 current.z = As<Short4>(UnpackLow(current.z, current.x));
1470 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1471 }
1472 else
1473 {
1474 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1475 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1476 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1477 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1478
1479 current.z = As<Short4>(PackUnsigned(current.z, current.x));
1480 current.y = As<Short4>(PackUnsigned(current.y, current.w));
1481
1482 current.x = current.z;
1483 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1484 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1485 current.y = current.z;
1486 current.z = As<Short4>(UnpackLow(current.z, current.x));
1487 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1488 }
1489 break;
1490 case FORMAT_X8B8G8R8:
1491 case FORMAT_A8B8G8R8:
1492 case FORMAT_SRGB8_X8:
1493 case FORMAT_SRGB8_A8:
1494 if(state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8 || rgbaWriteMask == 0x7)
1495 {
1496 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1497 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1498 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1499
1500 current.z = As<Short4>(PackUnsigned(current.x, current.z));
1501 current.y = As<Short4>(PackUnsigned(current.y, current.y));
1502
1503 current.x = current.z;
1504 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1505 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1506 current.y = current.z;
1507 current.z = As<Short4>(UnpackLow(current.z, current.x));
1508 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1509 }
1510 else
1511 {
1512 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1513 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1514 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1515 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1516
1517 current.z = As<Short4>(PackUnsigned(current.x, current.z));
1518 current.y = As<Short4>(PackUnsigned(current.y, current.w));
1519
1520 current.x = current.z;
1521 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1522 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1523 current.y = current.z;
1524 current.z = As<Short4>(UnpackLow(current.z, current.x));
1525 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1526 }
1527 break;
1528 case FORMAT_G8R8:
1529 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1530 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1531 current.x = As<Short4>(PackUnsigned(current.x, current.x));
1532 current.y = As<Short4>(PackUnsigned(current.y, current.y));
1533 current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y));
1534 break;
1535 case FORMAT_R8:
1536 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1537 current.x = As<Short4>(PackUnsigned(current.x, current.x));
1538 break;
1539 case FORMAT_A8:
1540 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1541 current.w = As<Short4>(PackUnsigned(current.w, current.w));
1542 break;
1543 case FORMAT_G16R16:
1544 current.z = current.x;
1545 current.x = As<Short4>(UnpackLow(current.x, current.y));
1546 current.z = As<Short4>(UnpackHigh(current.z, current.y));
1547 current.y = current.z;
1548 break;
1549 case FORMAT_A16B16G16R16:
1550 transpose4x4(current.x, current.y, current.z, current.w);
1551 break;
1552 default:
1553 ASSERT(false);
1554 }
1555
1556 Short4 c01 = current.z;
1557 Short4 c23 = current.y;
1558
1559 Int xMask; // Combination of all masks
1560
1561 if(state.depthTestActive)
1562 {
1563 xMask = zMask;
1564 }
1565 else
1566 {
1567 xMask = cMask;
1568 }
1569
1570 if(state.stencilActive)
1571 {
1572 xMask &= sMask;
1573 }
1574
1575 switch(state.targetFormat[index])
1576 {
1577 case FORMAT_R5G6B5:
1578 {
1579 Pointer<Byte> buffer = cBuffer + 2 * x;
1580 Int value = *Pointer<Int>(buffer);
1581
1582 Int c01 = Extract(As<Int2>(current.x), 0);
1583
1584 if((bgraWriteMask & 0x00000007) != 0x00000007)
1585 {
1586 Int masked = value;
1587 c01 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1588 masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0]));
1589 c01 |= masked;
1590 }
1591
1592 c01 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
1593 value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][0]) + xMask * 8);
1594 c01 |= value;
1595 *Pointer<Int>(buffer) = c01;
1596
1597 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1598 value = *Pointer<Int>(buffer);
1599
1600 Int c23 = Extract(As<Int2>(current.x), 1);
1601
1602 if((bgraWriteMask & 0x00000007) != 0x00000007)
1603 {
1604 Int masked = value;
1605 c23 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1606 masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0]));
1607 c23 |= masked;
1608 }
1609
1610 c23 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
1611 value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][2]) + xMask * 8);
1612 c23 |= value;
1613 *Pointer<Int>(buffer) = c23;
1614 }
1615 break;
1616 case FORMAT_A8G8R8B8Q:
1617 case FORMAT_X8G8R8B8Q: // FIXME: Don't touch alpha?
1618 UNIMPLEMENTED();
1619 // value = *Pointer<Short4>(cBuffer + 8 * x + 0);
1620
1621 // if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1622 // ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1623 // (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh?
1624 // {
1625 // Short4 masked = value;
1626 // c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1627 // masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1628 // c01 |= masked;
1629 // }
1630
1631 // c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1632 // value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1633 // c01 |= value;
1634 // *Pointer<Short4>(cBuffer + 8 * x + 0) = c01;
1635
1636 // value = *Pointer<Short4>(cBuffer + 8 * x + 8);
1637
1638 // if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1639 // ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1640 // (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh?
1641 // {
1642 // Short4 masked = value;
1643 // c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1644 // masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1645 // c23 |= masked;
1646 // }
1647
1648 // c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1649 // value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1650 // c23 |= value;
1651 // *Pointer<Short4>(cBuffer + 8 * x + 8) = c23;
1652 break;
1653 case FORMAT_A8R8G8B8:
1654 case FORMAT_X8R8G8B8: // FIXME: Don't touch alpha?
1655 {
1656 Pointer<Byte> buffer = cBuffer + x * 4;
1657 Short4 value = *Pointer<Short4>(buffer);
1658
1659 if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1660 ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1661 (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh?
1662 {
1663 Short4 masked = value;
1664 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1665 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1666 c01 |= masked;
1667 }
1668
1669 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1670 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1671 c01 |= value;
1672 *Pointer<Short4>(buffer) = c01;
1673
1674 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1675 value = *Pointer<Short4>(buffer);
1676
1677 if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1678 ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1679 (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh?
1680 {
1681 Short4 masked = value;
1682 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1683 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1684 c23 |= masked;
1685 }
1686
1687#ifdef __APPLE__
1688 // On Mac we render directly to an IOSurface that isn't vertically padded. So we
1689 // only render the bottom half of quads when it won't overflow the buffer.
1690 If ((y + 1) < yMax)
1691#endif
1692 {
1693 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1694 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1695 c23 |= value;
1696 *Pointer<Short4>(buffer) = c23;
1697 }
1698 }
1699 break;
1700 case FORMAT_A8B8G8R8:
1701 case FORMAT_X8B8G8R8: // FIXME: Don't touch alpha?
1702 case FORMAT_SRGB8_X8:
1703 case FORMAT_SRGB8_A8:
1704 {
1705 Pointer<Byte> buffer = cBuffer + x * 4;
1706 Short4 value = *Pointer<Short4>(buffer);
1707
1708 bool masked = (((state.targetFormat[index] == FORMAT_A8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_A8) && rgbaWriteMask != 0x0000000F) ||
1709 (((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x00000007) &&
1710 ((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x0000000F))); // FIXME: Need for masking when XBGR && Fh?
1711
1712 if(masked)
1713 {
1714 Short4 masked = value;
1715 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1716 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1717 c01 |= masked;
1718 }
1719
1720 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1721 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1722 c01 |= value;
1723 *Pointer<Short4>(buffer) = c01;
1724
1725 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1726 value = *Pointer<Short4>(buffer);
1727
1728 if(masked)
1729 {
1730 Short4 masked = value;
1731 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1732 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1733 c23 |= masked;
1734 }
1735
1736 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1737 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1738 c23 |= value;
1739 *Pointer<Short4>(buffer) = c23;
1740 }
1741 break;
1742 case FORMAT_G8R8:
1743 if((rgbaWriteMask & 0x00000003) != 0x0)
1744 {
1745 Pointer<Byte> buffer = cBuffer + 2 * x;
1746 Int2 value;
1747 value = Insert(value, *Pointer<Int>(buffer), 0);
1748 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1749 value = Insert(value, *Pointer<Int>(buffer + pitch), 1);
1750
1751 Int2 packedCol = As<Int2>(current.x);
1752
1753 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
1754 if((rgbaWriteMask & 0x3) != 0x3)
1755 {
1756 Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
1757 UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
1758 mergedMask &= rgbaMask;
1759 }
1760
1761 packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask));
1762
1763 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
1764 *Pointer<UInt>(buffer + pitch) = As<UInt>(Extract(packedCol, 1));
1765 }
1766 break;
1767 case FORMAT_R8:
1768 if(rgbaWriteMask & 0x00000001)
1769 {
1770 Pointer<Byte> buffer = cBuffer + 1 * x;
1771 Short4 value;
1772 value = Insert(value, *Pointer<Short>(buffer), 0);
1773 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1774 value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
1775
1776 current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask);
1777 value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask);
1778 current.x |= value;
1779
1780 *Pointer<Short>(buffer) = Extract(current.x, 0);
1781 *Pointer<Short>(buffer + pitch) = Extract(current.x, 1);
1782 }
1783 break;
1784 case FORMAT_A8:
1785 if(rgbaWriteMask & 0x00000008)
1786 {
1787 Pointer<Byte> buffer = cBuffer + 1 * x;
1788 Short4 value;
1789 value = Insert(value, *Pointer<Short>(buffer), 0);
1790 Int pitch = *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1791 value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
1792
1793 current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q) + 8 * xMask);
1794 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q) + 8 * xMask);
1795 current.w |= value;
1796
1797 *Pointer<Short>(buffer) = Extract(current.w, 0);
1798 *Pointer<Short>(buffer + pitch) = Extract(current.w, 1);
1799 }
1800 break;
1801 case FORMAT_G16R16:
1802 {
1803 Pointer<Byte> buffer = cBuffer + 4 * x;
1804
1805 Short4 value = *Pointer<Short4>(buffer);
1806
1807 if((rgbaWriteMask & 0x00000003) != 0x00000003)
1808 {
1809 Short4 masked = value;
1810 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1811 masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
1812 current.x |= masked;
1813 }
1814
1815 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1816 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1817 current.x |= value;
1818 *Pointer<Short4>(buffer) = current.x;
1819
1820 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1821
1822 value = *Pointer<Short4>(buffer);
1823
1824 if((rgbaWriteMask & 0x00000003) != 0x00000003)
1825 {
1826 Short4 masked = value;
1827 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1828 masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
1829 current.y |= masked;
1830 }
1831
1832 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1833 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1834 current.y |= value;
1835 *Pointer<Short4>(buffer) = current.y;
1836 }
1837 break;
1838 case FORMAT_A16B16G16R16:
1839 {
1840 Pointer<Byte> buffer = cBuffer + 8 * x;
1841
1842 {
1843 Short4 value = *Pointer<Short4>(buffer);
1844
1845 if(rgbaWriteMask != 0x0000000F)
1846 {
1847 Short4 masked = value;
1848 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1849 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1850 current.x |= masked;
1851 }
1852
1853 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
1854 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
1855 current.x |= value;
1856 *Pointer<Short4>(buffer) = current.x;
1857 }
1858
1859 {
1860 Short4 value = *Pointer<Short4>(buffer + 8);
1861
1862 if(rgbaWriteMask != 0x0000000F)
1863 {
1864 Short4 masked = value;
1865 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1866 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1867 current.y |= masked;
1868 }
1869
1870 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
1871 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
1872 current.y |= value;
1873 *Pointer<Short4>(buffer + 8) = current.y;
1874 }
1875
1876 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1877
1878 {
1879 Short4 value = *Pointer<Short4>(buffer);
1880
1881 if(rgbaWriteMask != 0x0000000F)
1882 {
1883 Short4 masked = value;
1884 current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1885 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1886 current.z |= masked;
1887 }
1888
1889 current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
1890 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
1891 current.z |= value;
1892 *Pointer<Short4>(buffer) = current.z;
1893 }
1894
1895 {
1896 Short4 value = *Pointer<Short4>(buffer + 8);
1897
1898 if(rgbaWriteMask != 0x0000000F)
1899 {
1900 Short4 masked = value;
1901 current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1902 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1903 current.w |= masked;
1904 }
1905
1906 current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
1907 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
1908 current.w |= value;
1909 *Pointer<Short4>(buffer + 8) = current.w;
1910 }
1911 }
1912 break;
1913 default:
1914 ASSERT(false);
1915 }
1916 }
1917
1918 void PixelRoutine::blendFactor(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorActive)
1919 {
1920 switch(blendFactorActive)
1921 {
1922 case BLEND_ZERO:
1923 // Optimized
1924 break;
1925 case BLEND_ONE:
1926 // Optimized
1927 break;
1928 case BLEND_SOURCE:
1929 blendFactor.x = oC.x;
1930 blendFactor.y = oC.y;
1931 blendFactor.z = oC.z;
1932 break;
1933 case BLEND_INVSOURCE:
1934 blendFactor.x = Float4(1.0f) - oC.x;
1935 blendFactor.y = Float4(1.0f) - oC.y;
1936 blendFactor.z = Float4(1.0f) - oC.z;
1937 break;
1938 case BLEND_DEST:
1939 blendFactor.x = pixel.x;
1940 blendFactor.y = pixel.y;
1941 blendFactor.z = pixel.z;
1942 break;
1943 case BLEND_INVDEST:
1944 blendFactor.x = Float4(1.0f) - pixel.x;
1945 blendFactor.y = Float4(1.0f) - pixel.y;
1946 blendFactor.z = Float4(1.0f) - pixel.z;
1947 break;
1948 case BLEND_SOURCEALPHA:
1949 blendFactor.x = oC.w;
1950 blendFactor.y = oC.w;
1951 blendFactor.z = oC.w;
1952 break;
1953 case BLEND_INVSOURCEALPHA:
1954 blendFactor.x = Float4(1.0f) - oC.w;
1955 blendFactor.y = Float4(1.0f) - oC.w;
1956 blendFactor.z = Float4(1.0f) - oC.w;
1957 break;
1958 case BLEND_DESTALPHA:
1959 blendFactor.x = pixel.w;
1960 blendFactor.y = pixel.w;
1961 blendFactor.z = pixel.w;
1962 break;
1963 case BLEND_INVDESTALPHA:
1964 blendFactor.x = Float4(1.0f) - pixel.w;
1965 blendFactor.y = Float4(1.0f) - pixel.w;
1966 blendFactor.z = Float4(1.0f) - pixel.w;
1967 break;
1968 case BLEND_SRCALPHASAT:
1969 blendFactor.x = Float4(1.0f) - pixel.w;
1970 blendFactor.x = Min(blendFactor.x, oC.w);
1971 blendFactor.y = blendFactor.x;
1972 blendFactor.z = blendFactor.x;
1973 break;
1974 case BLEND_CONSTANT:
1975 blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[0]));
1976 blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[1]));
1977 blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[2]));
1978 break;
1979 case BLEND_INVCONSTANT:
1980 blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
1981 blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
1982 blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
1983 break;
1984 default:
1985 ASSERT(false);
1986 }
1987 }
1988
1989 void PixelRoutine::blendFactorAlpha(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorAlphaActive)
1990 {
1991 switch(blendFactorAlphaActive)
1992 {
1993 case BLEND_ZERO:
1994 // Optimized
1995 break;
1996 case BLEND_ONE:
1997 // Optimized
1998 break;
1999 case BLEND_SOURCE:
2000 blendFactor.w = oC.w;
2001 break;
2002 case BLEND_INVSOURCE:
2003 blendFactor.w = Float4(1.0f) - oC.w;
2004 break;
2005 case BLEND_DEST:
2006 blendFactor.w = pixel.w;
2007 break;
2008 case BLEND_INVDEST:
2009 blendFactor.w = Float4(1.0f) - pixel.w;
2010 break;
2011 case BLEND_SOURCEALPHA:
2012 blendFactor.w = oC.w;
2013 break;
2014 case BLEND_INVSOURCEALPHA:
2015 blendFactor.w = Float4(1.0f) - oC.w;
2016 break;
2017 case BLEND_DESTALPHA:
2018 blendFactor.w = pixel.w;
2019 break;
2020 case BLEND_INVDESTALPHA:
2021 blendFactor.w = Float4(1.0f) - pixel.w;
2022 break;
2023 case BLEND_SRCALPHASAT:
2024 blendFactor.w = Float4(1.0f);
2025 break;
2026 case BLEND_CONSTANT:
2027 blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
2028 break;
2029 case BLEND_INVCONSTANT:
2030 blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
2031 break;
2032 default:
2033 ASSERT(false);
2034 }
2035 }
2036
	// Blends the source color oC with the destination color currently stored in
	// render target 'index', for the 32-bit-per-component formats handled below.
	//   index   - render target index (selects state.targetFormat[] and colorPitchB[])
	//   cBuffer - base pointer of the color buffer rows being processed
	//   oC      - in: source color; out: blended color
	//   x       - horizontal pixel coordinate used to address the buffer
	// Does nothing when state.alphaBlendActive is false.
	void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x)
	{
		if(!state.alphaBlendActive)
		{
			return;
		}

		Pointer<Byte> buffer;
		Vector4f pixel;   // Destination color

		// NOTE(review): color, c01 and c23 are not referenced anywhere else in this function.
		Vector4s color;
		Short4 c01;
		Short4 c23;

		// Value substituted for components the target format does not store:
		// 1.0f for float formats, or the bit pattern 0xFFFFFFFF (unsigned) / 0x7FFFFFFF (signed)
		// for non-normalized integer formats. Left unassigned for any other format.
		Float4 one;
		if(Surface::isFloatFormat(state.targetFormat[index]))
		{
			one = Float4(1.0f);
		}
		else if(Surface::isNonNormalizedInteger(state.targetFormat[index]))
		{
			one = As<Float4>(Surface::isUnsignedComponent(state.targetFormat[index], 0) ? Int4(0xFFFFFFFF) : Int4(0x7FFFFFFF));
		}

		// Read the destination pixels: two adjacent pixels from the current row and
		// two from the row colorPitchB[index] bytes further on.
		switch(state.targetFormat[index])
		{
		case FORMAT_R32I:
		case FORMAT_R32UI:
		case FORMAT_R32F:
			buffer = cBuffer;
			// FIXME: movlps
			pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0);
			pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4);
			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
			// FIXME: movhps
			pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0);
			pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4);
			// Components not present in the format
			pixel.y = pixel.z = pixel.w = one;
			break;
		case FORMAT_G32R32I:
		case FORMAT_G32R32UI:
		case FORMAT_G32R32F:
			buffer = cBuffer;
			pixel.x = *Pointer<Float4>(buffer + 8 * x, 16);
			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
			pixel.y = *Pointer<Float4>(buffer + 8 * x, 16);
			// De-interleave the two rows into one vector per channel
			// (presumably 0x88 gathers the even lanes and 0xDD the odd lanes - confirm against ShuffleLowHigh)
			pixel.z = pixel.x;
			pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x88);
			pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0xDD);
			pixel.y = pixel.z;
			// Components not present in the format
			pixel.z = pixel.w = one;
			break;
		case FORMAT_X32B32G32R32F:
		case FORMAT_A32B32G32R32F:
		case FORMAT_X32B32G32R32F_UNSIGNED:
		case FORMAT_A32B32G32R32I:
		case FORMAT_A32B32G32R32UI:
			buffer = cBuffer;
			// One 16-byte pixel per load; the transpose turns four pixels into four channel vectors
			pixel.x = *Pointer<Float4>(buffer + 16 * x, 16);
			pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16);
			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
			pixel.z = *Pointer<Float4>(buffer + 16 * x, 16);
			pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16);
			transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
			if(state.targetFormat[index] == FORMAT_X32B32G32R32F ||
			   state.targetFormat[index] == FORMAT_X32B32G32R32F_UNSIGNED)
			{
				// The X formats ignore whatever is stored in the fourth component
				pixel.w = Float4(1.0f);
			}
			break;
		default:
			ASSERT(false);
		}

		if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
		{
			// NOTE(review): sRGBtoLinear() takes its argument by const reference and returns
			// the converted value; the results are discarded here, so pixel is left unchanged.
			// Confirm whether 'pixel.x = sRGBtoLinear(pixel.x)' (etc.) was intended.
			sRGBtoLinear(pixel.x);
			sRGBtoLinear(pixel.y);
			sRGBtoLinear(pixel.z);
		}

		// Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
		Vector4f sourceFactor;
		Vector4f destFactor;

		// Both color factors are computed before either color is modified
		blendFactor(sourceFactor, oC, pixel, state.sourceBlendFactor);
		blendFactor(destFactor, oC, pixel, state.destBlendFactor);

		// blendFactor() generates nothing for BLEND_ONE/BLEND_ZERO, so no multiply is emitted for them
		if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
		{
			oC.x *= sourceFactor.x;
			oC.y *= sourceFactor.y;
			oC.z *= sourceFactor.z;
		}

		if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
		{
			pixel.x *= destFactor.x;
			pixel.y *= destFactor.y;
			pixel.z *= destFactor.z;
		}

		// Combine the weighted source and destination colors
		switch(state.blendOperation)
		{
		case BLENDOP_ADD:
			oC.x += pixel.x;
			oC.y += pixel.y;
			oC.z += pixel.z;
			break;
		case BLENDOP_SUB:
			oC.x -= pixel.x;
			oC.y -= pixel.y;
			oC.z -= pixel.z;
			break;
		case BLENDOP_INVSUB:
			oC.x = pixel.x - oC.x;
			oC.y = pixel.y - oC.y;
			oC.z = pixel.z - oC.z;
			break;
		case BLENDOP_MIN:
			oC.x = Min(oC.x, pixel.x);
			oC.y = Min(oC.y, pixel.y);
			oC.z = Min(oC.z, pixel.z);
			break;
		case BLENDOP_MAX:
			oC.x = Max(oC.x, pixel.x);
			oC.y = Max(oC.y, pixel.y);
			oC.z = Max(oC.z, pixel.z);
			break;
		case BLENDOP_SOURCE:
			// No operation
			break;
		case BLENDOP_DEST:
			oC.x = pixel.x;
			oC.y = pixel.y;
			oC.z = pixel.z;
			break;
		case BLENDOP_NULL:
			oC.x = Float4(0.0f);
			oC.y = Float4(0.0f);
			oC.z = Float4(0.0f);
			break;
		default:
			ASSERT(false);
		}

		// The alpha factors only read oC.w and pixel.w, which the color stage above did not modify
		blendFactorAlpha(sourceFactor, oC, pixel, state.sourceBlendFactorAlpha);
		blendFactorAlpha(destFactor, oC, pixel, state.destBlendFactorAlpha);

		if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
		{
			oC.w *= sourceFactor.w;
		}

		if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
		{
			pixel.w *= destFactor.w;
		}

		// Combine the weighted source and destination alpha
		switch(state.blendOperationAlpha)
		{
		case BLENDOP_ADD:
			oC.w += pixel.w;
			break;
		case BLENDOP_SUB:
			oC.w -= pixel.w;
			break;
		case BLENDOP_INVSUB:
			pixel.w -= oC.w;
			oC.w = pixel.w;
			break;
		case BLENDOP_MIN:
			oC.w = Min(oC.w, pixel.w);
			break;
		case BLENDOP_MAX:
			oC.w = Max(oC.w, pixel.w);
			break;
		case BLENDOP_SOURCE:
			// No operation
			break;
		case BLENDOP_DEST:
			oC.w = pixel.w;
			break;
		case BLENDOP_NULL:
			oC.w = Float4(0.0f);
			break;
		default:
			ASSERT(false);
		}
	}
2227
	// Writes the color oC to render target 'index' for the float and non-normalized
	// integer formats handled below. Every store is a read-modify-write: the existing
	// buffer contents are kept where the combined pixel mask (xMask) or the color
	// write mask (rgbaWriteMask) excludes a pixel or component.
	//   index   - render target index (selects state.targetFormat[] and colorPitchB[])
	//   cBuffer - base pointer of the color buffer rows being processed
	//   x       - horizontal pixel coordinate used to address the buffer
	//   oC      - color to write, one vector per channel on entry; modified in place
	//   sMask   - stencil mask, applied only when state.stencilActive
	//   zMask   - depth mask, used when state.depthTestActive
	//   cMask   - coverage mask, used when depth testing is not active
	void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask)
	{
		// Rearrange oC from one vector per channel to the memory layout of the format
		switch(state.targetFormat[index])
		{
		case FORMAT_R32F:
		case FORMAT_R32I:
		case FORMAT_R32UI:
		case FORMAT_R16I:
		case FORMAT_R16UI:
		case FORMAT_R8I:
		case FORMAT_R8UI:
			// Single channel: oC.x is used as is
			break;
		case FORMAT_G32R32F:
		case FORMAT_G32R32I:
		case FORMAT_G32R32UI:
		case FORMAT_G16R16I:
		case FORMAT_G16R16UI:
		case FORMAT_G8R8I:
		case FORMAT_G8R8UI:
			// Two channels: interleave x and y; afterwards oC.x holds the low half
			// and oC.y the high half of the interleaved result
			oC.z = oC.x;
			oC.x = UnpackLow(oC.x, oC.y);
			oC.z = UnpackHigh(oC.z, oC.y);
			oC.y = oC.z;
			break;
		case FORMAT_X32B32G32R32F:
		case FORMAT_A32B32G32R32F:
		case FORMAT_X32B32G32R32F_UNSIGNED:
		case FORMAT_A32B32G32R32I:
		case FORMAT_A32B32G32R32UI:
		case FORMAT_A16B16G16R16I:
		case FORMAT_A16B16G16R16UI:
		case FORMAT_A8B8G8R8I:
		case FORMAT_A8B8G8R8UI:
			// Four channels: afterwards each of oC.x/y/z/w holds one complete pixel
			transpose4x4(oC.x, oC.y, oC.z, oC.w);
			break;
		default:
			ASSERT(false);
		}

		int rgbaWriteMask = state.colorWriteActive(index);

		Int xMask;   // Combination of all masks

		if(state.depthTestActive)
		{
			xMask = zMask;
		}
		else
		{
			xMask = cMask;
		}

		if(state.stencilActive)
		{
			xMask &= sMask;
		}

		Pointer<Byte> buffer;
		Float4 value;   // Existing buffer contents

		switch(state.targetFormat[index])
		{
		case FORMAT_R32F:
		case FORMAT_R32I:
		case FORMAT_R32UI:
			if(rgbaWriteMask & 0x00000001)
			{
				buffer = cBuffer + 4 * x;

				// FIXME: movlps
				value.x = *Pointer<Float>(buffer + 0);
				value.y = *Pointer<Float>(buffer + 4);

				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));

				// FIXME: movhps
				value.z = *Pointer<Float>(buffer + 0);
				value.w = *Pointer<Float>(buffer + 4);

				// Merge new and existing values according to the pixel mask
				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + xMask * 16, 16));
				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16));
				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));

				// buffer still points at the second row, so that row is stored first
				// FIXME: movhps
				*Pointer<Float>(buffer + 0) = oC.x.z;
				*Pointer<Float>(buffer + 4) = oC.x.w;

				buffer -= *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));

				// FIXME: movlps
				*Pointer<Float>(buffer + 0) = oC.x.x;
				*Pointer<Float>(buffer + 4) = oC.x.y;
			}
			break;
		case FORMAT_R16I:
		case FORMAT_R16UI:
			if(rgbaWriteMask & 0x00000001)
			{
				buffer = cBuffer + 2 * x;

				// Gather two 16-bit pixels from each of the two rows (one 32-bit load per row)
				UShort4 xyzw;
				xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));

				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));

				xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1));
				// Widen to 32 bits per pixel so the same masks as the 32-bit path can be used
				value = As<Float4>(Int4(xyzw));

				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));

				// Narrow each lane back to 16 bits; second row first, then the first row
				if(state.targetFormat[index] == FORMAT_R16I)
				{
					Float component = oC.x.z;
					*Pointer<Short>(buffer + 0) = Short(As<Int>(component));
					component = oC.x.w;
					*Pointer<Short>(buffer + 2) = Short(As<Int>(component));

					buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));

					component = oC.x.x;
					*Pointer<Short>(buffer + 0) = Short(As<Int>(component));
					component = oC.x.y;
					*Pointer<Short>(buffer + 2) = Short(As<Int>(component));
				}
				else // FORMAT_R16UI
				{
					Float component = oC.x.z;
					*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
					component = oC.x.w;
					*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));

					buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));

					component = oC.x.x;
					*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
					component = oC.x.y;
					*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
				}
			}
			break;
		case FORMAT_R8I:
		case FORMAT_R8UI:
			if(rgbaWriteMask & 0x00000001)
			{
				buffer = cBuffer + x;

				UInt xyzw, packedCol;

				// Two 8-bit pixels per row: first row in the low 16 bits, second row in the high 16 bits
				xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFF;
				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
				xyzw |= UInt(*Pointer<UShort>(buffer)) << 16;

				// Narrow the four 32-bit lanes to 8 bits each (signed or unsigned pack)
				Short4 tmpCol = Short4(As<Int4>(oC.x));
				if(state.targetFormat[index] == FORMAT_R8I)
				{
					tmpCol = As<Short4>(PackSigned(tmpCol, tmpCol));
				}
				else
				{
					tmpCol = As<Short4>(PackUnsigned(tmpCol, tmpCol));
				}
				packedCol = Extract(As<Int2>(tmpCol), 0);

				// Merge new and existing bytes according to the pixel mask
				packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) |
				            (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask));

				// Second row first (buffer still points there), then the first row
				*Pointer<UShort>(buffer) = UShort(packedCol >> 16);
				buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
				*Pointer<UShort>(buffer) = UShort(packedCol);
			}
			break;
		case FORMAT_G32R32F:
		case FORMAT_G32R32I:
		case FORMAT_G32R32UI:
			buffer = cBuffer + 8 * x;

			// First row: two pixels, held in oC.x
			value = *Pointer<Float4>(buffer);

			if((rgbaWriteMask & 0x00000003) != 0x00000003)
			{
				// Keep the existing value of the channel(s) disabled in the write mask
				Float4 masked = value;
				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
			}

			oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16));
			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16));
			oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
			*Pointer<Float4>(buffer) = oC.x;

			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));

			// Second row: two pixels, held in oC.y
			value = *Pointer<Float4>(buffer);

			if((rgbaWriteMask & 0x00000003) != 0x00000003)
			{
				Float4 masked;

				masked = value;
				oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
				oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
			}

			oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16));
			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16));
			oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
			*Pointer<Float4>(buffer) = oC.y;
			break;
		case FORMAT_G16R16I:
		case FORMAT_G16R16UI:
			if((rgbaWriteMask & 0x00000003) != 0x0)
			{
				buffer = cBuffer + 4 * x;

				// First row: two pixels (oC.x narrowed to 16 bits per component).
				// Note that this 'value' shadows the Float4 declared above.
				UInt2 rgbaMask;
				UShort4 packedCol = UShort4(As<Int4>(oC.x));
				UShort4 value = *Pointer<UShort4>(buffer);
				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
				if((rgbaWriteMask & 0x3) != 0x3)
				{
					// Replicate the per-channel write mask for both pixels and fold it into the pixel mask
					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
					rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
					mergedMask &= rgbaMask;
				}
				*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);

				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));

				// Second row: two pixels from oC.y; rgbaMask is reused under the same condition that set it
				packedCol = UShort4(As<Int4>(oC.y));
				value = *Pointer<UShort4>(buffer);
				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
				if((rgbaWriteMask & 0x3) != 0x3)
				{
					mergedMask &= rgbaMask;
				}
				*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
			}
			break;
		case FORMAT_G8R8I:
		case FORMAT_G8R8UI:
			if((rgbaWriteMask & 0x00000003) != 0x0)
			{
				buffer = cBuffer + 2 * x;

				Int2 xyzw, packedCol;

				// Existing contents: element 0 is the first row, element 1 the second row
				xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0);
				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
				xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);

				// Narrow all four pixels to 8 bits per component (signed or unsigned pack)
				if(state.targetFormat[index] == FORMAT_G8R8I)
				{
					packedCol = As<Int2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
				}
				else
				{
					packedCol = As<Int2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
				}

				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
				if((rgbaWriteMask & 0x3) != 0x3)
				{
					// NOTE(review): the 5 * (mask & 0x3) index presumably replicates the two-bit
					// channel mask into both pixels of a 32-bit group - confirm against Constants::maskB4Q
					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
					UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
					mergedMask &= rgbaMask;
				}

				packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask));

				// Second row first (buffer still points there), then the first row
				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1));
				buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
			}
			break;
		case FORMAT_X32B32G32R32F:
		case FORMAT_A32B32G32R32F:
		case FORMAT_X32B32G32R32F_UNSIGNED:
		case FORMAT_A32B32G32R32I:
		case FORMAT_A32B32G32R32UI:
			buffer = cBuffer + 16 * x;

			// One 16-byte pixel per block below: first row (oC.x, oC.y), then second row (oC.z, oC.w).
			// Each block first applies the per-channel write mask, then the per-pixel mask.
			{
				value = *Pointer<Float4>(buffer, 16);

				if(rgbaWriteMask != 0x0000000F)
				{
					Float4 masked = value;
					oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
					oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
				}

				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskX0X) + xMask * 16, 16));
				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16));
				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
				*Pointer<Float4>(buffer, 16) = oC.x;
			}

			{
				value = *Pointer<Float4>(buffer + 16, 16);

				if(rgbaWriteMask != 0x0000000F)
				{
					Float4 masked = value;
					oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
					oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
				}

				oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskX1X) + xMask * 16, 16));
				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16));
				oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
				*Pointer<Float4>(buffer + 16, 16) = oC.y;
			}

			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));

			{
				value = *Pointer<Float4>(buffer, 16);

				if(rgbaWriteMask != 0x0000000F)
				{
					Float4 masked = value;
					oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
					oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked));
				}

				oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskX2X) + xMask * 16, 16));
				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16));
				oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value));
				*Pointer<Float4>(buffer, 16) = oC.z;
			}

			{
				value = *Pointer<Float4>(buffer + 16, 16);

				if(rgbaWriteMask != 0x0000000F)
				{
					Float4 masked = value;
					oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
					oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked));
				}

				oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskX3X) + xMask * 16, 16));
				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16));
				oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value));
				*Pointer<Float4>(buffer + 16, 16) = oC.w;
			}
			break;
		case FORMAT_A16B16G16R16I:
		case FORMAT_A16B16G16R16UI:
			if((rgbaWriteMask & 0x0000000F) != 0x0)
			{
				buffer = cBuffer + 8 * x;

				// First row: two 8-byte pixels (oC.x and oC.y narrowed to 16 bits per component).
				// Note that this 'value' shadows the Float4 declared above.
				UInt4 rgbaMask;
				UShort8 value = *Pointer<UShort8>(buffer);
				UShort8 packedCol = UShort8(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y)));
				UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
				if((rgbaWriteMask & 0xF) != 0xF)
				{
					// Replicate the per-channel write mask for both pixels and fold it into the pixel mask
					UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
					rgbaMask = UInt4(tmpMask, tmpMask);
					mergedMask &= rgbaMask;
				}
				*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);

				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));

				// Second row: oC.z and oC.w; rgbaMask is reused under the same condition that set it
				value = *Pointer<UShort8>(buffer);
				packedCol = UShort8(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w)));
				mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
				if((rgbaWriteMask & 0xF) != 0xF)
				{
					mergedMask &= rgbaMask;
				}
				*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
			}
			break;
		case FORMAT_A8B8G8R8I:
		case FORMAT_A8B8G8R8UI:
			if((rgbaWriteMask & 0x0000000F) != 0x0)
			{
				// Note that this 'value' shadows the Float4 declared above
				UInt2 value, packedCol, mergedMask;

				buffer = cBuffer + 4 * x;

				// First row: two 4-byte pixels (oC.x and oC.y narrowed to 8 bits per component)
				if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
				{
					packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
				}
				else
				{
					packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
				}
				// NOTE(review): the load passes 16 as second Pointer<> argument (presumably an
				// alignment hint) for an 8-byte access, while the store below passes none - confirm
				value = *Pointer<UInt2>(buffer, 16);
				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
				if(rgbaWriteMask != 0xF)
				{
					mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
				}
				*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);

				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));

				// Second row: oC.z and oC.w
				if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
				{
					packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
				}
				else
				{
					packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
				}
				value = *Pointer<UInt2>(buffer, 16);
				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
				if(rgbaWriteMask != 0xF)
				{
					mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
				}
				*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
			}
			break;
		default:
			ASSERT(false);
		}
	}
2660
2661 UShort4 PixelRoutine::convertFixed16(Float4 &cf, bool saturate)
2662 {
2663 return UShort4(cf * Float4(0xFFFF), saturate);
2664 }
2665
2666 void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c)
2667 {
2668 Pointer<Byte> LUT = constants + OFFSET(Constants,sRGBtoLinear12_16);
2669
2670 c.x = As<UShort4>(c.x) >> 4;
2671 c.y = As<UShort4>(c.y) >> 4;
2672 c.z = As<UShort4>(c.z) >> 4;
2673
2674 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2675 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2676 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2677 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2678
2679 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2680 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2681 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2682 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2683
2684 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2685 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2686 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2687 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2688 }
2689
	// Shifts the x, y and z channels of c right by 4 bits (as unsigned 16-bit values)
	// and then maps them through linearToSRGB12_16(). c.w is not touched.
	void PixelRoutine::linearToSRGB16_12_16(Vector4s &c)
	{
		// Reduce every channel to the 12-bit index range expected by the table lookup
		c.x = As<UShort4>(c.x) >> 4;
		c.y = As<UShort4>(c.y) >> 4;
		c.z = As<UShort4>(c.z) >> 4;

		linearToSRGB12_16(c);
	}
2698
2699 void PixelRoutine::linearToSRGB12_16(Vector4s &c)
2700 {
2701 Pointer<Byte> LUT = constants + OFFSET(Constants,linearToSRGB12_16);
2702
2703 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2704 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2705 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2706 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2707
2708 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2709 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2710 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2711 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2712
2713 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2714 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2715 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2716 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2717 }
2718
2719 Float4 PixelRoutine::sRGBtoLinear(const Float4 &x) // Approximates x^2.2
2720 {
2721 Float4 linear = x * x;
2722 linear = linear * Float4(0.73f) + linear * x * Float4(0.27f);
2723
2724 return Min(Max(linear, Float4(0.0f)), Float4(1.0f));
2725 }
2726
2727 bool PixelRoutine::colorUsed()
2728 {
2729 return state.colorWriteMask || state.alphaTestActive() || state.shaderContainsKill;
2730 }
2731}
2732