1// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include "PixelRoutine.hpp"
16
17#include "SamplerCore.hpp"
18#include "Constants.hpp"
19#include "Device/Renderer.hpp"
20#include "Device/QuadRasterizer.hpp"
21#include "Device/Primitive.hpp"
22#include "Vulkan/VkDebug.hpp"
23#include "Vulkan/VkPipelineLayout.hpp"
24
25namespace sw
26{
27 PixelRoutine::PixelRoutine(
28 const PixelProcessor::State &state,
29 vk::PipelineLayout const *pipelineLayout,
30 SpirvShader const *spirvShader,
31 const vk::DescriptorSet::Bindings &descriptorSets)
32 : QuadRasterizer(state, spirvShader),
33 routine(pipelineLayout),
34 descriptorSets(descriptorSets)
35 {
36 if (spirvShader)
37 {
38 spirvShader->emitProlog(&routine);
39
40 // Clearing inputs to 0 is not demanded by the spec,
41 // but it makes the undefined behavior deterministic.
42 for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i++)
43 {
44 routine.inputs[i] = Float4(0.0f);
45 }
46 }
47 }
48
49 PixelRoutine::~PixelRoutine()
50 {
51 }
52
53 void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
54 {
55 // TODO: consider shader which modifies sample mask in general
56 const bool earlyDepthTest = !spirvShader || (spirvShader->getModes().EarlyFragmentTests && !spirvShader->getModes().DepthReplacing && !state.alphaToCoverage);
57
58 Int zMask[4]; // Depth mask
59 Int sMask[4]; // Stencil mask
60
61 for(unsigned int q = 0; q < state.multiSample; q++)
62 {
63 zMask[q] = cMask[q];
64 sMask[q] = cMask[q];
65 }
66
67 for(unsigned int q = 0; q < state.multiSample; q++)
68 {
69 stencilTest(sBuffer, q, x, sMask[q], cMask[q]);
70 }
71
72 Float4 f;
73 Float4 rhwCentroid;
74
75 Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16);
76
77 if(interpolateZ())
78 {
79 for(unsigned int q = 0; q < state.multiSample; q++)
80 {
81 Float4 x = xxxx;
82
83 if(state.multiSample > 1)
84 {
85 x -= *Pointer<Float4>(constants + OFFSET(Constants,X) + q * sizeof(float4));
86 }
87
88 z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive,z), false, false, state.depthClamp);
89 }
90 }
91
92 Bool depthPass = false;
93
94 if(earlyDepthTest)
95 {
96 for(unsigned int q = 0; q < state.multiSample; q++)
97 {
98 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
99 }
100 }
101
102 If(depthPass || Bool(!earlyDepthTest))
103 {
104 Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16);
105
106 // Centroid locations
107 Float4 XXXX = Float4(0.0f);
108 Float4 YYYY = Float4(0.0f);
109
110 if(state.centroid)
111 {
112 Float4 WWWW(1.0e-9f);
113
114 for(unsigned int q = 0; q < state.multiSample; q++)
115 {
116 XXXX += *Pointer<Float4>(constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]);
117 YYYY += *Pointer<Float4>(constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]);
118 WWWW += *Pointer<Float4>(constants + OFFSET(Constants,weight) + 16 * cMask[q]);
119 }
120
121 WWWW = Rcp_pp(WWWW);
122 XXXX *= WWWW;
123 YYYY *= WWWW;
124
125 XXXX += xxxx;
126 YYYY += yyyy;
127 }
128
129 if(interpolateW())
130 {
131 w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive,w), false, false, false);
132 rhw = reciprocal(w, false, false, true);
133
134 if(state.centroid)
135 {
136 rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive,w), false, false));
137 }
138 }
139
140 if (spirvShader)
141 {
142 for (int interpolant = 0; interpolant < MAX_INTERFACE_COMPONENTS; interpolant++)
143 {
144 auto const &input = spirvShader->inputs[interpolant];
145 if (input.Type != SpirvShader::ATTRIBTYPE_UNUSED)
146 {
147 if (input.Centroid && state.multiSample > 1)
148 {
149 routine.inputs[interpolant] =
150 interpolateCentroid(XXXX, YYYY, rhwCentroid,
151 primitive + OFFSET(Primitive, V[interpolant]),
152 input.Flat, !input.NoPerspective);
153 }
154 else
155 {
156 routine.inputs[interpolant] =
157 interpolate(xxxx, Dv[interpolant], rhw,
158 primitive + OFFSET(Primitive, V[interpolant]),
159 input.Flat, !input.NoPerspective, false);
160 }
161 }
162 }
163
164 setBuiltins(x, y, z, w, cMask);
165 }
166
167 Bool alphaPass = true;
168
169 if (spirvShader)
170 {
171 bool earlyFragTests = (spirvShader && spirvShader->getModes().EarlyFragmentTests);
172 applyShader(cMask, earlyFragTests ? sMask : cMask, earlyDepthTest ? zMask : cMask);
173 }
174
175 alphaPass = alphaTest(cMask);
176
177 if((spirvShader && spirvShader->getModes().ContainsKill) || state.alphaToCoverage)
178 {
179 for(unsigned int q = 0; q < state.multiSample; q++)
180 {
181 zMask[q] &= cMask[q];
182 sMask[q] &= cMask[q];
183 }
184 }
185
186 If(alphaPass)
187 {
188 if(!earlyDepthTest)
189 {
190 for(unsigned int q = 0; q < state.multiSample; q++)
191 {
192 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
193 }
194 }
195
196 If(depthPass || Bool(earlyDepthTest))
197 {
198 for(unsigned int q = 0; q < state.multiSample; q++)
199 {
200 if(state.multiSampleMask & (1 << q))
201 {
202 writeDepth(zBuffer, q, x, z[q], zMask[q]);
203
204 if(state.occlusionEnabled)
205 {
206 occlusion += *Pointer<UInt>(constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q]));
207 }
208 }
209 }
210
211 rasterOperation(cBuffer, x, sMask, zMask, cMask);
212 }
213 }
214 }
215
216 for(unsigned int q = 0; q < state.multiSample; q++)
217 {
218 if(state.multiSampleMask & (1 << q))
219 {
220 writeStencil(sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
221 }
222 }
223 }
224
225 Float4 PixelRoutine::interpolateCentroid(const Float4 &x, const Float4 &y, const Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective)
226 {
227 Float4 interpolant = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,C), 16);
228
229 if(!flat)
230 {
231 interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,A), 16) +
232 y * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,B), 16);
233
234 if(perspective)
235 {
236 interpolant *= rhw;
237 }
238 }
239
240 return interpolant;
241 }
242
243 void PixelRoutine::stencilTest(const Pointer<Byte> &sBuffer, int q, const Int &x, Int &sMask, const Int &cMask)
244 {
245 if(!state.stencilActive)
246 {
247 return;
248 }
249
250 // (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
251
252 Pointer<Byte> buffer = sBuffer + 2 * x;
253
254 if(q > 0)
255 {
256 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
257 }
258
259 Byte8 value = *Pointer<Byte8>(buffer);
260 Byte8 valueBack = value;
261
262 if(state.frontStencil.compareMask != 0xff)
263 {
264 value &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].testMaskQ));
265 }
266
267 stencilTest(value, state.frontStencil.compareOp, false);
268
269 if(state.backStencil.compareMask != 0xff)
270 {
271 valueBack &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].testMaskQ));
272 }
273
274 stencilTest(valueBack, state.backStencil.compareOp, true);
275
276 value &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
277 valueBack &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
278 value |= valueBack;
279
280 sMask = SignMask(value) & cMask;
281 }
282
283 void PixelRoutine::stencilTest(Byte8 &value, VkCompareOp stencilCompareMode, bool isBack)
284 {
285 Byte8 equal;
286
287 switch(stencilCompareMode)
288 {
289 case VK_COMPARE_OP_ALWAYS:
290 value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
291 break;
292 case VK_COMPARE_OP_NEVER:
293 value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
294 break;
295 case VK_COMPARE_OP_LESS: // a < b ~ b > a
296 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
297 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[isBack].referenceMaskedSignedQ)));
298 break;
299 case VK_COMPARE_OP_EQUAL:
300 value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[isBack].referenceMaskedQ)));
301 break;
302 case VK_COMPARE_OP_NOT_EQUAL: // a != b ~ !(a == b)
303 value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[isBack].referenceMaskedQ)));
304 value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
305 break;
306 case VK_COMPARE_OP_LESS_OR_EQUAL: // a <= b ~ (b > a) || (a == b)
307 equal = value;
308 equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[isBack].referenceMaskedQ)));
309 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
310 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[isBack].referenceMaskedSignedQ)));
311 value |= equal;
312 break;
313 case VK_COMPARE_OP_GREATER: // a > b
314 equal = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[isBack].referenceMaskedSignedQ));
315 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
316 equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
317 value = equal;
318 break;
319 case VK_COMPARE_OP_GREATER_OR_EQUAL: // a >= b ~ !(a < b) ~ !(b > a)
320 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
321 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[isBack].referenceMaskedSignedQ)));
322 value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
323 break;
324 default:
325 UNIMPLEMENTED("VkCompareOp: %d", int(stencilCompareMode));
326 }
327 }
328
329 Bool PixelRoutine::depthTest32F(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
330 {
331 Float4 Z = z;
332
333 if(spirvShader && spirvShader->getModes().DepthReplacing)
334 {
335 Z = oDepth;
336 }
337
338 Pointer<Byte> buffer;
339 Int pitch;
340
341 if(!state.quadLayoutDepthBuffer)
342 {
343 buffer = zBuffer + 4 * x;
344 pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
345 }
346 else
347 {
348 buffer = zBuffer + 8 * x;
349 }
350
351 if(q > 0)
352 {
353 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
354 }
355
356 Float4 zValue;
357
358 if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
359 {
360 if(!state.quadLayoutDepthBuffer)
361 {
362 // FIXME: Properly optimizes?
363 zValue.xy = *Pointer<Float4>(buffer);
364 zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
365 }
366 else
367 {
368 zValue = *Pointer<Float4>(buffer, 16);
369 }
370 }
371
372 Int4 zTest;
373
374 switch(state.depthCompareMode)
375 {
376 case VK_COMPARE_OP_ALWAYS:
377 // Optimized
378 break;
379 case VK_COMPARE_OP_NEVER:
380 // Optimized
381 break;
382 case VK_COMPARE_OP_EQUAL:
383 zTest = CmpEQ(zValue, Z);
384 break;
385 case VK_COMPARE_OP_NOT_EQUAL:
386 zTest = CmpNEQ(zValue, Z);
387 break;
388 case VK_COMPARE_OP_LESS:
389 zTest = CmpNLE(zValue, Z);
390 break;
391 case VK_COMPARE_OP_GREATER_OR_EQUAL:
392 zTest = CmpLE(zValue, Z);
393 break;
394 case VK_COMPARE_OP_LESS_OR_EQUAL:
395 zTest = CmpNLT(zValue, Z);
396 break;
397 case VK_COMPARE_OP_GREATER:
398 zTest = CmpLT(zValue, Z);
399 break;
400 default:
401 UNIMPLEMENTED("VkCompareOp: %d", int(state.depthCompareMode));
402 }
403
404 switch(state.depthCompareMode)
405 {
406 case VK_COMPARE_OP_ALWAYS:
407 zMask = cMask;
408 break;
409 case VK_COMPARE_OP_NEVER:
410 zMask = 0x0;
411 break;
412 default:
413 zMask = SignMask(zTest) & cMask;
414 break;
415 }
416
417 if(state.stencilActive)
418 {
419 zMask &= sMask;
420 }
421
422 return zMask != 0;
423 }
424
425 Bool PixelRoutine::depthTest16(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
426 {
427 Short4 Z = convertFixed16(z, true);
428
429 if(spirvShader && spirvShader->getModes().DepthReplacing)
430 {
431 Z = convertFixed16(oDepth, true);
432 }
433
434 Pointer<Byte> buffer;
435 Int pitch;
436
437 if(!state.quadLayoutDepthBuffer)
438 {
439 buffer = zBuffer + 2 * x;
440 pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
441 }
442 else
443 {
444 buffer = zBuffer + 4 * x;
445 }
446
447 if(q > 0)
448 {
449 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
450 }
451
452 Short4 zValue;
453
454 if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
455 {
456 if(!state.quadLayoutDepthBuffer)
457 {
458 // FIXME: Properly optimizes?
459 zValue = *Pointer<Short4>(buffer) & Short4(-1, -1, 0, 0);
460 zValue = zValue | (*Pointer<Short4>(buffer + pitch - 4) & Short4(0, 0, -1, -1));
461 }
462 else
463 {
464 zValue = *Pointer<Short4>(buffer, 8);
465 }
466 }
467
468 Int4 zTest;
469
470 // Bias values to make unsigned compares out of Reactor's (due SSE's) signed compares only
471 zValue = zValue - Short4(0x8000u);
472 Z = Z - Short4(0x8000u);
473
474 switch(state.depthCompareMode)
475 {
476 case VK_COMPARE_OP_ALWAYS:
477 // Optimized
478 break;
479 case VK_COMPARE_OP_NEVER:
480 // Optimized
481 break;
482 case VK_COMPARE_OP_EQUAL:
483 zTest = Int4(CmpEQ(zValue, Z));
484 break;
485 case VK_COMPARE_OP_NOT_EQUAL:
486 zTest = ~Int4(CmpEQ(zValue, Z));
487 break;
488 case VK_COMPARE_OP_LESS:
489 zTest = Int4(CmpGT(zValue, Z));
490 break;
491 case VK_COMPARE_OP_GREATER_OR_EQUAL:
492 zTest = ~Int4(CmpGT(zValue, Z));
493 break;
494 case VK_COMPARE_OP_LESS_OR_EQUAL:
495 zTest = ~Int4(CmpGT(Z, zValue));
496 break;
497 case VK_COMPARE_OP_GREATER:
498 zTest = Int4(CmpGT(Z, zValue));
499 break;
500 default:
501 UNIMPLEMENTED("VkCompareOp: %d", int(state.depthCompareMode));
502 }
503
504 switch(state.depthCompareMode)
505 {
506 case VK_COMPARE_OP_ALWAYS:
507 zMask = cMask;
508 break;
509 case VK_COMPARE_OP_NEVER:
510 zMask = 0x0;
511 break;
512 default:
513 zMask = SignMask(zTest) & cMask;
514 break;
515 }
516
517 if(state.stencilActive)
518 {
519 zMask &= sMask;
520 }
521
522 return zMask != 0;
523 }
524
525 Bool PixelRoutine::depthTest(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
526 {
527 if(!state.depthTestActive)
528 {
529 return true;
530 }
531
532 if (state.depthFormat == VK_FORMAT_D16_UNORM)
533 return depthTest16(zBuffer, q, x, z, sMask, zMask, cMask);
534 else
535 return depthTest32F(zBuffer, q, x, z, sMask, zMask, cMask);
536 }
537
538 void PixelRoutine::alphaToCoverage(Int cMask[4], const Float4 &alpha)
539 {
540 Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c0)));
541 Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c1)));
542 Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c2)));
543 Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c3)));
544
545 Int aMask0 = SignMask(coverage0);
546 Int aMask1 = SignMask(coverage1);
547 Int aMask2 = SignMask(coverage2);
548 Int aMask3 = SignMask(coverage3);
549
550 cMask[0] &= aMask0;
551 cMask[1] &= aMask1;
552 cMask[2] &= aMask2;
553 cMask[3] &= aMask3;
554 }
555
556 void PixelRoutine::writeDepth32F(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
557 {
558 Float4 Z = z;
559
560 if(spirvShader && spirvShader->getModes().DepthReplacing)
561 {
562 Z = oDepth;
563 }
564
565 Pointer<Byte> buffer;
566 Int pitch;
567
568 if(!state.quadLayoutDepthBuffer)
569 {
570 buffer = zBuffer + 4 * x;
571 pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
572 }
573 else
574 {
575 buffer = zBuffer + 8 * x;
576 }
577
578 if(q > 0)
579 {
580 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
581 }
582
583 Float4 zValue;
584
585 if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
586 {
587 if(!state.quadLayoutDepthBuffer)
588 {
589 // FIXME: Properly optimizes?
590 zValue.xy = *Pointer<Float4>(buffer);
591 zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
592 }
593 else
594 {
595 zValue = *Pointer<Float4>(buffer, 16);
596 }
597 }
598
599 Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + zMask * 16, 16));
600 zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16));
601 Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
602
603 if(!state.quadLayoutDepthBuffer)
604 {
605 // FIXME: Properly optimizes?
606 *Pointer<Float2>(buffer) = Float2(Z.xy);
607 *Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
608 }
609 else
610 {
611 *Pointer<Float4>(buffer, 16) = Z;
612 }
613 }
614
615 void PixelRoutine::writeDepth16(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
616 {
617 Short4 Z = As<Short4>(convertFixed16(z, true));
618
619 if(spirvShader && spirvShader->getModes().DepthReplacing)
620 {
621 Z = As<Short4>(convertFixed16(oDepth, true));
622 }
623
624 Pointer<Byte> buffer;
625 Int pitch;
626
627 if(!state.quadLayoutDepthBuffer)
628 {
629 buffer = zBuffer + 2 * x;
630 pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
631 }
632 else
633 {
634 buffer = zBuffer + 4 * x;
635 }
636
637 if(q > 0)
638 {
639 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
640 }
641
642 Short4 zValue;
643
644 if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
645 {
646 if(!state.quadLayoutDepthBuffer)
647 {
648 // FIXME: Properly optimizes?
649 zValue = *Pointer<Short4>(buffer) & Short4(-1, -1, 0, 0);
650 zValue = zValue | (*Pointer<Short4>(buffer + pitch - 4) & Short4(0, 0, -1, -1));
651 }
652 else
653 {
654 zValue = *Pointer<Short4>(buffer, 8);
655 }
656 }
657
658 Z = Z & *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q) + zMask * 8, 8);
659 zValue = zValue & *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q) + zMask * 8, 8);
660 Z = Z | zValue;
661
662 if(!state.quadLayoutDepthBuffer)
663 {
664 // FIXME: Properly optimizes?
665 *Pointer<Short>(buffer) = Extract(Z, 0);
666 *Pointer<Short>(buffer+2) = Extract(Z, 1);
667 *Pointer<Short>(buffer+pitch) = Extract(Z, 2);
668 *Pointer<Short>(buffer+pitch+2) = Extract(Z, 3);
669 }
670 else
671 {
672 *Pointer<Short4>(buffer, 8) = Z;
673 }
674 }
675
676 void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
677 {
678 if(!state.depthWriteEnable)
679 {
680 return;
681 }
682
683 if (state.depthFormat == VK_FORMAT_D16_UNORM)
684 writeDepth16(zBuffer, q, x, z, zMask);
685 else
686 writeDepth32F(zBuffer, q, x, z, zMask);
687 }
688
689 void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, int q, const Int &x, const Int &sMask, const Int &zMask, const Int &cMask)
690 {
691 if(!state.stencilActive)
692 {
693 return;
694 }
695
696 if(state.frontStencil.passOp == VK_STENCIL_OP_KEEP && state.frontStencil.depthFailOp == VK_STENCIL_OP_KEEP && state.frontStencil.failOp == VK_STENCIL_OP_KEEP)
697 {
698 if(state.backStencil.passOp == VK_STENCIL_OP_KEEP && state.backStencil.depthFailOp == VK_STENCIL_OP_KEEP && state.backStencil.failOp == VK_STENCIL_OP_KEEP)
699 {
700 return;
701 }
702 }
703
704 if((state.frontStencil.writeMask == 0) && (state.backStencil.writeMask == 0))
705 {
706 return;
707 }
708
709 Pointer<Byte> buffer = sBuffer + 2 * x;
710
711 if(q > 0)
712 {
713 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
714 }
715
716 Byte8 bufferValue = *Pointer<Byte8>(buffer);
717
718 Byte8 newValue;
719 stencilOperation(newValue, bufferValue, state.frontStencil, false, zMask, sMask);
720
721 if((state.frontStencil.writeMask & 0xFF) != 0xFF) // Assume 8-bit stencil buffer
722 {
723 Byte8 maskedValue = bufferValue;
724 newValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].writeMaskQ));
725 maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].invWriteMaskQ));
726 newValue |= maskedValue;
727 }
728
729 Byte8 newValueBack;
730
731 stencilOperation(newValueBack, bufferValue, state.backStencil, true, zMask, sMask);
732
733 if((state.backStencil.writeMask & 0xFF) != 0xFF) // Assume 8-bit stencil buffer
734 {
735 Byte8 maskedValue = bufferValue;
736 newValueBack &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].writeMaskQ));
737 maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].invWriteMaskQ));
738 newValueBack |= maskedValue;
739 }
740
741 newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
742 newValueBack &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
743 newValue |= newValueBack;
744
745 newValue &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * cMask);
746 bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask);
747 newValue |= bufferValue;
748
749 *Pointer<Byte4>(buffer) = Byte4(newValue);
750 }
751
752 void PixelRoutine::stencilOperation(Byte8 &newValue, const Byte8 &bufferValue, const PixelProcessor::States::StencilOpState &ops, bool isBack, const Int &zMask, const Int &sMask)
753 {
754 Byte8 &pass = newValue;
755 Byte8 fail;
756 Byte8 zFail;
757
758 stencilOperation(pass, bufferValue, ops.passOp, isBack);
759
760 if(ops.depthFailOp != ops.passOp)
761 {
762 stencilOperation(zFail, bufferValue, ops.depthFailOp, isBack);
763 }
764
765 if(ops.failOp != ops.passOp || ops.failOp != ops.depthFailOp)
766 {
767 stencilOperation(fail, bufferValue, ops.failOp, isBack);
768 }
769
770 if(ops.failOp != ops.passOp || ops.failOp != ops.depthFailOp)
771 {
772 if(state.depthTestActive && ops.depthFailOp != ops.passOp) // zMask valid and values not the same
773 {
774 pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * zMask);
775 zFail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask);
776 pass |= zFail;
777 }
778
779 pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * sMask);
780 fail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask);
781 pass |= fail;
782 }
783 }
784
785 void PixelRoutine::stencilOperation(Byte8 &output, const Byte8 &bufferValue, VkStencilOp operation, bool isBack)
786 {
787 switch(operation)
788 {
789 case VK_STENCIL_OP_KEEP:
790 output = bufferValue;
791 break;
792 case VK_STENCIL_OP_ZERO:
793 output = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
794 break;
795 case VK_STENCIL_OP_REPLACE:
796 output = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[isBack].referenceQ));
797 break;
798 case VK_STENCIL_OP_INCREMENT_AND_CLAMP:
799 output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
800 break;
801 case VK_STENCIL_OP_DECREMENT_AND_CLAMP:
802 output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
803 break;
804 case VK_STENCIL_OP_INVERT:
805 output = bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
806 break;
807 case VK_STENCIL_OP_INCREMENT_AND_WRAP:
808 output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
809 break;
810 case VK_STENCIL_OP_DECREMENT_AND_WRAP:
811 output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
812 break;
813 default:
814 UNIMPLEMENTED("VkStencilOp: %d", int(operation));
815 }
816 }
817
818 void PixelRoutine::blendFactor(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, VkBlendFactor blendFactorActive)
819 {
820 switch(blendFactorActive)
821 {
822 case VK_BLEND_FACTOR_ZERO:
823 // Optimized
824 break;
825 case VK_BLEND_FACTOR_ONE:
826 // Optimized
827 break;
828 case VK_BLEND_FACTOR_SRC_COLOR:
829 blendFactor.x = current.x;
830 blendFactor.y = current.y;
831 blendFactor.z = current.z;
832 break;
833 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
834 blendFactor.x = Short4(0xFFFFu) - current.x;
835 blendFactor.y = Short4(0xFFFFu) - current.y;
836 blendFactor.z = Short4(0xFFFFu) - current.z;
837 break;
838 case VK_BLEND_FACTOR_DST_COLOR:
839 blendFactor.x = pixel.x;
840 blendFactor.y = pixel.y;
841 blendFactor.z = pixel.z;
842 break;
843 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
844 blendFactor.x = Short4(0xFFFFu) - pixel.x;
845 blendFactor.y = Short4(0xFFFFu) - pixel.y;
846 blendFactor.z = Short4(0xFFFFu) - pixel.z;
847 break;
848 case VK_BLEND_FACTOR_SRC_ALPHA:
849 blendFactor.x = current.w;
850 blendFactor.y = current.w;
851 blendFactor.z = current.w;
852 break;
853 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
854 blendFactor.x = Short4(0xFFFFu) - current.w;
855 blendFactor.y = Short4(0xFFFFu) - current.w;
856 blendFactor.z = Short4(0xFFFFu) - current.w;
857 break;
858 case VK_BLEND_FACTOR_DST_ALPHA:
859 blendFactor.x = pixel.w;
860 blendFactor.y = pixel.w;
861 blendFactor.z = pixel.w;
862 break;
863 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
864 blendFactor.x = Short4(0xFFFFu) - pixel.w;
865 blendFactor.y = Short4(0xFFFFu) - pixel.w;
866 blendFactor.z = Short4(0xFFFFu) - pixel.w;
867 break;
868 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
869 blendFactor.x = Short4(0xFFFFu) - pixel.w;
870 blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w));
871 blendFactor.y = blendFactor.x;
872 blendFactor.z = blendFactor.x;
873 break;
874 case VK_BLEND_FACTOR_CONSTANT_COLOR:
875 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[0]));
876 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[1]));
877 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[2]));
878 break;
879 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
880 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
881 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
882 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
883 break;
884 case VK_BLEND_FACTOR_CONSTANT_ALPHA:
885 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
886 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
887 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
888 break;
889 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
890 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
891 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
892 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
893 break;
894 default:
895 UNIMPLEMENTED("VkBlendFactor: %d", int(blendFactorActive));
896 }
897 }
898
899 void PixelRoutine::blendFactorAlpha(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, VkBlendFactor blendFactorAlphaActive)
900 {
901 switch(blendFactorAlphaActive)
902 {
903 case VK_BLEND_FACTOR_ZERO:
904 // Optimized
905 break;
906 case VK_BLEND_FACTOR_ONE:
907 // Optimized
908 break;
909 case VK_BLEND_FACTOR_SRC_COLOR:
910 blendFactor.w = current.w;
911 break;
912 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
913 blendFactor.w = Short4(0xFFFFu) - current.w;
914 break;
915 case VK_BLEND_FACTOR_DST_COLOR:
916 blendFactor.w = pixel.w;
917 break;
918 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
919 blendFactor.w = Short4(0xFFFFu) - pixel.w;
920 break;
921 case VK_BLEND_FACTOR_SRC_ALPHA:
922 blendFactor.w = current.w;
923 break;
924 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
925 blendFactor.w = Short4(0xFFFFu) - current.w;
926 break;
927 case VK_BLEND_FACTOR_DST_ALPHA:
928 blendFactor.w = pixel.w;
929 break;
930 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
931 blendFactor.w = Short4(0xFFFFu) - pixel.w;
932 break;
933 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
934 blendFactor.w = Short4(0xFFFFu);
935 break;
936 case VK_BLEND_FACTOR_CONSTANT_COLOR:
937 case VK_BLEND_FACTOR_CONSTANT_ALPHA:
938 blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
939 break;
940 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
941 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
942 blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
943 break;
944 default:
945 UNIMPLEMENTED("VkBlendFactor: %d", int(blendFactorAlphaActive));
946 }
947 }
948
949 bool PixelRoutine::isSRGB(int index) const
950 {
951 return vk::Format(state.targetFormat[index]).isSRGBformat();
952 }
953
954 void PixelRoutine::readPixel(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &pixel)
955 {
956 Short4 c01;
957 Short4 c23;
958 Pointer<Byte> buffer;
959 Pointer<Byte> buffer2;
960
961 switch(state.targetFormat[index])
962 {
963 case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
964 buffer = cBuffer + 2 * x;
965 buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
966 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
967
968 pixel.x = (c01 & Short4(0x7C00u)) << 1;
969 pixel.y = (c01 & Short4(0x03E0u)) << 6;
970 pixel.z = (c01 & Short4(0x001Fu)) << 11;
971 pixel.w = (c01 & Short4(0x8000u)) >> 15;
972
973 // Expand to 16 bit range
974 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
975 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
976 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
977 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
978 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
979 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
980 break;
981 case VK_FORMAT_R5G6B5_UNORM_PACK16:
982 buffer = cBuffer + 2 * x;
983 buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
984 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
985
986 pixel.x = c01 & Short4(0xF800u);
987 pixel.y = (c01 & Short4(0x07E0u)) << 5;
988 pixel.z = (c01 & Short4(0x001Fu)) << 11;
989 pixel.w = Short4(0xFFFFu);
990
991 // Expand to 16 bit range
992 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
993 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
994 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 6);
995 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 12);
996 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
997 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
998 break;
999 case VK_FORMAT_B8G8R8A8_UNORM:
1000 case VK_FORMAT_B8G8R8A8_SRGB:
1001 buffer = cBuffer + 4 * x;
1002 c01 = *Pointer<Short4>(buffer);
1003 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1004 c23 = *Pointer<Short4>(buffer);
1005 pixel.z = c01;
1006 pixel.y = c01;
1007 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1008 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1009 pixel.x = pixel.z;
1010 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1011 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1012 pixel.y = pixel.z;
1013 pixel.w = pixel.x;
1014 pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1015 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1016 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1017 pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1018 break;
1019 case VK_FORMAT_R8G8B8A8_UNORM:
1020 case VK_FORMAT_R8G8B8A8_SRGB:
1021 buffer = cBuffer + 4 * x;
1022 c01 = *Pointer<Short4>(buffer);
1023 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1024 c23 = *Pointer<Short4>(buffer);
1025 pixel.z = c01;
1026 pixel.y = c01;
1027 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1028 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1029 pixel.x = pixel.z;
1030 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1031 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1032 pixel.y = pixel.z;
1033 pixel.w = pixel.x;
1034 pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1035 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1036 pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1037 pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1038 break;
1039 case VK_FORMAT_R8_UNORM:
1040 buffer = cBuffer + 1 * x;
1041 pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 0);
1042 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1043 pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 1);
1044 pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1045 pixel.y = Short4(0x0000);
1046 pixel.z = Short4(0x0000);
1047 pixel.w = Short4(0xFFFFu);
1048 break;
1049 case VK_FORMAT_R8G8_UNORM:
1050 buffer = cBuffer + 2 * x;
1051 c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 0));
1052 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1053 c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 1));
1054 pixel.x = (c01 & Short4(0x00FFu)) | (c01 << 8);
1055 pixel.y = (c01 & Short4(0xFF00u)) | As<Short4>(As<UShort4>(c01) >> 8);
1056 pixel.z = Short4(0x0000u);
1057 pixel.w = Short4(0xFFFFu);
1058 break;
1059 case VK_FORMAT_R16G16B16A16_UNORM:
1060 buffer = cBuffer;
1061 pixel.x = *Pointer<Short4>(buffer + 8 * x);
1062 pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
1063 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1064 pixel.z = *Pointer<Short4>(buffer + 8 * x);
1065 pixel.w = *Pointer<Short4>(buffer + 8 * x + 8);
1066 transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
1067 break;
1068 case VK_FORMAT_R16G16_UNORM:
1069 buffer = cBuffer;
1070 pixel.x = *Pointer<Short4>(buffer + 4 * x);
1071 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1072 pixel.y = *Pointer<Short4>(buffer + 4 * x);
1073 pixel.z = pixel.x;
1074 pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
1075 pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y));
1076 pixel.y = pixel.z;
1077 pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z));
1078 pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z));
1079 pixel.z = Short4(0xFFFFu);
1080 pixel.w = Short4(0xFFFFu);
1081 break;
1082 case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1083 {
1084 buffer = cBuffer;
1085 Int4 v = Int4(0);
1086 v = Insert(v, *Pointer<Int>(buffer + 4 * x), 0);
1087 v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 1);
1088 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1089 v = Insert(v, *Pointer<Int>(buffer + 4 * x), 2);
1090 v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 3);
1091
1092 pixel.x = Short4(v << 6) & Short4(0xFFC0u);
1093 pixel.y = Short4(v >> 4) & Short4(0xFFC0u);
1094 pixel.z = Short4(v >> 14) & Short4(0xFFC0u);
1095 pixel.w = Short4(v >> 16) & Short4(0xC000u);
1096 } break;
1097 default:
1098 UNIMPLEMENTED("VkFormat %d", state.targetFormat[index]);
1099 }
1100
1101 if(isSRGB(index))
1102 {
1103 sRGBtoLinear16_12_16(pixel);
1104 }
1105 }
1106
1107 void PixelRoutine::alphaBlend(int index, const Pointer<Byte> &cBuffer, Vector4s &current, const Int &x)
1108 {
1109 if(!state.blendState[index].alphaBlendEnable)
1110 {
1111 return;
1112 }
1113
1114 Vector4s pixel;
1115 readPixel(index, cBuffer, x, pixel);
1116
1117 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
1118 Vector4s sourceFactor;
1119 Vector4s destFactor;
1120
1121 blendFactor(sourceFactor, current, pixel, state.blendState[index].sourceBlendFactor);
1122 blendFactor(destFactor, current, pixel, state.blendState[index].destBlendFactor);
1123
1124 if(state.blendState[index].sourceBlendFactor != VK_BLEND_FACTOR_ONE && state.blendState[index].sourceBlendFactor != VK_BLEND_FACTOR_ZERO)
1125 {
1126 current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x));
1127 current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
1128 current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
1129 }
1130
1131 if(state.blendState[index].destBlendFactor != VK_BLEND_FACTOR_ONE && state.blendState[index].destBlendFactor != VK_BLEND_FACTOR_ZERO)
1132 {
1133 pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
1134 pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y));
1135 pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z));
1136 }
1137
1138 switch(state.blendState[index].blendOperation)
1139 {
1140 case VK_BLEND_OP_ADD:
1141 current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1142 current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1143 current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1144 break;
1145 case VK_BLEND_OP_SUBTRACT:
1146 current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1147 current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1148 current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1149 break;
1150 case VK_BLEND_OP_REVERSE_SUBTRACT:
1151 current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x));
1152 current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y));
1153 current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z));
1154 break;
1155 case VK_BLEND_OP_MIN:
1156 current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x));
1157 current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y));
1158 current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z));
1159 break;
1160 case VK_BLEND_OP_MAX:
1161 current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x));
1162 current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y));
1163 current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z));
1164 break;
1165 case VK_BLEND_OP_SRC_EXT:
1166 // No operation
1167 break;
1168 case VK_BLEND_OP_DST_EXT:
1169 current.x = pixel.x;
1170 current.y = pixel.y;
1171 current.z = pixel.z;
1172 break;
1173 case VK_BLEND_OP_ZERO_EXT:
1174 current.x = Short4(0x0000);
1175 current.y = Short4(0x0000);
1176 current.z = Short4(0x0000);
1177 break;
1178 default:
1179 UNIMPLEMENTED("VkBlendOp: %d", int(state.blendState[index].blendOperation));
1180 }
1181
1182 blendFactorAlpha(sourceFactor, current, pixel, state.blendState[index].sourceBlendFactorAlpha);
1183 blendFactorAlpha(destFactor, current, pixel, state.blendState[index].destBlendFactorAlpha);
1184
1185 if(state.blendState[index].sourceBlendFactorAlpha != VK_BLEND_FACTOR_ONE && state.blendState[index].sourceBlendFactorAlpha != VK_BLEND_FACTOR_ZERO)
1186 {
1187 current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
1188 }
1189
1190 if(state.blendState[index].destBlendFactorAlpha != VK_BLEND_FACTOR_ONE && state.blendState[index].destBlendFactorAlpha != VK_BLEND_FACTOR_ZERO)
1191 {
1192 pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
1193 }
1194
1195 switch(state.blendState[index].blendOperationAlpha)
1196 {
1197 case VK_BLEND_OP_ADD:
1198 current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1199 break;
1200 case VK_BLEND_OP_SUBTRACT:
1201 current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1202 break;
1203 case VK_BLEND_OP_REVERSE_SUBTRACT:
1204 current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w));
1205 break;
1206 case VK_BLEND_OP_MIN:
1207 current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w));
1208 break;
1209 case VK_BLEND_OP_MAX:
1210 current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w));
1211 break;
1212 case VK_BLEND_OP_SRC_EXT:
1213 // No operation
1214 break;
1215 case VK_BLEND_OP_DST_EXT:
1216 current.w = pixel.w;
1217 break;
1218 case VK_BLEND_OP_ZERO_EXT:
1219 current.w = Short4(0x0000);
1220 break;
1221 default:
1222 UNIMPLEMENTED("VkBlendOp: %d", int(state.blendState[index].blendOperationAlpha));
1223 }
1224 }
1225
1226 void PixelRoutine::writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &current, const Int &sMask, const Int &zMask, const Int &cMask)
1227 {
1228 if(isSRGB(index))
1229 {
1230 linearToSRGB16_12_16(current);
1231 }
1232
1233 switch(state.targetFormat[index])
1234 {
1235 case VK_FORMAT_B8G8R8A8_UNORM:
1236 case VK_FORMAT_B8G8R8A8_SRGB:
1237 case VK_FORMAT_R8G8B8A8_UNORM:
1238 case VK_FORMAT_R8G8B8A8_SRGB:
1239 case VK_FORMAT_R8G8_UNORM:
1240 case VK_FORMAT_R8_UNORM:
1241 case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
1242 case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
1243 current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080);
1244 current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080);
1245 current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080);
1246 current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080);
1247 break;
1248 default:
1249 break;
1250 }
1251
1252 int rgbaWriteMask = state.colorWriteActive(index);
1253 int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
1254
1255 switch(state.targetFormat[index])
1256 {
1257 case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1258 {
1259 current.w = current.w & Short4(0x8000u);
1260 current.x = As<UShort4>(current.x & Short4(0xF800)) >> 1;
1261 current.y = As<UShort4>(current.y & Short4(0xF800)) >> 6;
1262 current.z = As<UShort4>(current.z & Short4(0xF800)) >> 11;
1263
1264 current.x = current.x | current.y | current.z | current.w;
1265 }
1266 break;
1267 case VK_FORMAT_R5G6B5_UNORM_PACK16:
1268 {
1269 current.x = current.x & Short4(0xF800u);
1270 current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
1271 current.z = As<UShort4>(current.z) >> 11;
1272
1273 current.x = current.x | current.y | current.z;
1274 }
1275 break;
1276 case VK_FORMAT_B8G8R8A8_UNORM:
1277 case VK_FORMAT_B8G8R8A8_SRGB:
1278 if(rgbaWriteMask == 0x7)
1279 {
1280 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1281 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1282 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1283
1284 current.z = As<Short4>(PackUnsigned(current.z, current.x));
1285 current.y = As<Short4>(PackUnsigned(current.y, current.y));
1286
1287 current.x = current.z;
1288 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1289 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1290 current.y = current.z;
1291 current.z = As<Short4>(UnpackLow(current.z, current.x));
1292 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1293 }
1294 else
1295 {
1296 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1297 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1298 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1299 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1300
1301 current.z = As<Short4>(PackUnsigned(current.z, current.x));
1302 current.y = As<Short4>(PackUnsigned(current.y, current.w));
1303
1304 current.x = current.z;
1305 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1306 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1307 current.y = current.z;
1308 current.z = As<Short4>(UnpackLow(current.z, current.x));
1309 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1310 }
1311 break;
1312 case VK_FORMAT_R8G8B8A8_UNORM:
1313 case VK_FORMAT_R8G8B8A8_SRGB:
1314 case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
1315 case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
1316 if(rgbaWriteMask == 0x7)
1317 {
1318 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1319 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1320 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1321
1322 current.z = As<Short4>(PackUnsigned(current.x, current.z));
1323 current.y = As<Short4>(PackUnsigned(current.y, current.y));
1324
1325 current.x = current.z;
1326 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1327 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1328 current.y = current.z;
1329 current.z = As<Short4>(UnpackLow(current.z, current.x));
1330 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1331 }
1332 else
1333 {
1334 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1335 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1336 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1337 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1338
1339 current.z = As<Short4>(PackUnsigned(current.x, current.z));
1340 current.y = As<Short4>(PackUnsigned(current.y, current.w));
1341
1342 current.x = current.z;
1343 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1344 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1345 current.y = current.z;
1346 current.z = As<Short4>(UnpackLow(current.z, current.x));
1347 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1348 }
1349 break;
1350 case VK_FORMAT_R8G8_UNORM:
1351 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1352 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1353 current.x = As<Short4>(PackUnsigned(current.x, current.x));
1354 current.y = As<Short4>(PackUnsigned(current.y, current.y));
1355 current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y));
1356 break;
1357 case VK_FORMAT_R8_UNORM:
1358 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1359 current.x = As<Short4>(PackUnsigned(current.x, current.x));
1360 break;
1361 case VK_FORMAT_R16G16_UNORM:
1362 current.z = current.x;
1363 current.x = As<Short4>(UnpackLow(current.x, current.y));
1364 current.z = As<Short4>(UnpackHigh(current.z, current.y));
1365 current.y = current.z;
1366 break;
1367 case VK_FORMAT_R16G16B16A16_UNORM:
1368 transpose4x4(current.x, current.y, current.z, current.w);
1369 break;
1370 case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1371 {
1372 auto r = (Int4(current.x) >> 6) & Int4(0x3ff);
1373 auto g = (Int4(current.y) >> 6) & Int4(0x3ff);
1374 auto b = (Int4(current.z) >> 6) & Int4(0x3ff);
1375 auto a = (Int4(current.w) >> 14) & Int4(0x3);
1376 Int4 packed = (a << 30) | (b << 20) | (g << 10) | r;
1377 auto c02 = As<Int2>(Int4(packed.xzzz)); // TODO: auto c02 = packed.xz;
1378 auto c13 = As<Int2>(Int4(packed.ywww)); // TODO: auto c13 = packed.yw;
1379 current.x = UnpackLow(c02, c13);
1380 current.y = UnpackHigh(c02, c13);
1381 break;
1382 }
1383 default:
1384 UNIMPLEMENTED("VkFormat: %d", int(state.targetFormat[index]));
1385 }
1386
1387 Short4 c01 = current.z;
1388 Short4 c23 = current.y;
1389
1390 Int xMask; // Combination of all masks
1391
1392 if(state.depthTestActive)
1393 {
1394 xMask = zMask;
1395 }
1396 else
1397 {
1398 xMask = cMask;
1399 }
1400
1401 if(state.stencilActive)
1402 {
1403 xMask &= sMask;
1404 }
1405
1406 switch(state.targetFormat[index])
1407 {
1408 case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1409 {
1410 Pointer<Byte> buffer = cBuffer + 2 * x;
1411 Int value = *Pointer<Int>(buffer);
1412
1413 Int channelMask = *Pointer<Int>(constants + OFFSET(Constants,mask5551Q[bgraWriteMask & 0xF][0]));
1414
1415 Int c01 = Extract(As<Int2>(current.x), 0);
1416 Int mask01 = *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
1417 if(bgraWriteMask != 0x0000000F)
1418 {
1419 mask01 &= channelMask;
1420 }
1421 *Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
1422
1423 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1424 value = *Pointer<Int>(buffer);
1425
1426 Int c23 = Extract(As<Int2>(current.x), 1);
1427 Int mask23 = *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
1428 if(bgraWriteMask != 0x0000000F)
1429 {
1430 mask23 &= channelMask;
1431 }
1432 *Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
1433 }
1434 break;
1435 case VK_FORMAT_R5G6B5_UNORM_PACK16:
1436 {
1437 Pointer<Byte> buffer = cBuffer + 2 * x;
1438 Int value = *Pointer<Int>(buffer);
1439
1440 Int channelMask = *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1441
1442 Int c01 = Extract(As<Int2>(current.x), 0);
1443 Int mask01 = *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
1444 if((bgraWriteMask & 0x00000007) != 0x00000007)
1445 {
1446 mask01 &= channelMask;
1447 }
1448 *Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
1449
1450 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1451 value = *Pointer<Int>(buffer);
1452
1453 Int c23 = Extract(As<Int2>(current.x), 1);
1454 Int mask23 = *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
1455 if((bgraWriteMask & 0x00000007) != 0x00000007)
1456 {
1457 mask23 &= channelMask;
1458 }
1459 *Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
1460 }
1461 break;
1462 case VK_FORMAT_B8G8R8A8_UNORM:
1463 case VK_FORMAT_B8G8R8A8_SRGB:
1464 {
1465 Pointer<Byte> buffer = cBuffer + x * 4;
1466 Short4 value = *Pointer<Short4>(buffer);
1467 Short4 channelMask = *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1468
1469 Short4 mask01 = *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1470 if(bgraWriteMask != 0x0000000F)
1471 {
1472 mask01 &= channelMask;
1473 }
1474 *Pointer<Short4>(buffer) = (c01 & mask01) | (value & ~mask01);
1475
1476 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1477 value = *Pointer<Short4>(buffer);
1478
1479 Short4 mask23 = *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1480 if(bgraWriteMask != 0x0000000F)
1481 {
1482 mask23 &= channelMask;
1483 }
1484 *Pointer<Short4>(buffer) = (c23 & mask23) | (value & ~mask23);
1485 }
1486 break;
1487 case VK_FORMAT_R8G8B8A8_UNORM:
1488 case VK_FORMAT_R8G8B8A8_SRGB:
1489 case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
1490 case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
1491 {
1492 Pointer<Byte> buffer = cBuffer + x * 4;
1493 Short4 value = *Pointer<Short4>(buffer);
1494 Short4 channelMask = *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1495
1496 Short4 mask01 = *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1497 if(rgbaWriteMask != 0x0000000F)
1498 {
1499 mask01 &= channelMask;
1500 }
1501 *Pointer<Short4>(buffer) = (c01 & mask01) | (value & ~mask01);
1502
1503 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1504 value = *Pointer<Short4>(buffer);
1505
1506 Short4 mask23 = *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1507 if(rgbaWriteMask != 0x0000000F)
1508 {
1509 mask23 &= channelMask;
1510 }
1511 *Pointer<Short4>(buffer) = (c23 & mask23) | (value & ~mask23);
1512 }
1513 break;
1514 case VK_FORMAT_R8G8_UNORM:
1515 if((rgbaWriteMask & 0x00000003) != 0x0)
1516 {
1517 Pointer<Byte> buffer = cBuffer + 2 * x;
1518 Int2 value;
1519 value = Insert(value, *Pointer<Int>(buffer), 0);
1520 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1521 value = Insert(value, *Pointer<Int>(buffer + pitch), 1);
1522
1523 Int2 packedCol = As<Int2>(current.x);
1524
1525 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
1526 if((rgbaWriteMask & 0x3) != 0x3)
1527 {
1528 Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
1529 UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
1530 mergedMask &= rgbaMask;
1531 }
1532
1533 packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask));
1534
1535 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
1536 *Pointer<UInt>(buffer + pitch) = As<UInt>(Extract(packedCol, 1));
1537 }
1538 break;
1539 case VK_FORMAT_R8_UNORM:
1540 if(rgbaWriteMask & 0x00000001)
1541 {
1542 Pointer<Byte> buffer = cBuffer + 1 * x;
1543 Short4 value;
1544 value = Insert(value, *Pointer<Short>(buffer), 0);
1545 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1546 value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
1547
1548 current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask);
1549 value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask);
1550 current.x |= value;
1551
1552 *Pointer<Short>(buffer) = Extract(current.x, 0);
1553 *Pointer<Short>(buffer + pitch) = Extract(current.x, 1);
1554 }
1555 break;
1556 case VK_FORMAT_R16G16_UNORM:
1557 {
1558 Pointer<Byte> buffer = cBuffer + 4 * x;
1559
1560 Short4 value = *Pointer<Short4>(buffer);
1561
1562 if((rgbaWriteMask & 0x00000003) != 0x00000003)
1563 {
1564 Short4 masked = value;
1565 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1566 masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
1567 current.x |= masked;
1568 }
1569
1570 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1571 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1572 current.x |= value;
1573 *Pointer<Short4>(buffer) = current.x;
1574
1575 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1576
1577 value = *Pointer<Short4>(buffer);
1578
1579 if((rgbaWriteMask & 0x00000003) != 0x00000003)
1580 {
1581 Short4 masked = value;
1582 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1583 masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
1584 current.y |= masked;
1585 }
1586
1587 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1588 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1589 current.y |= value;
1590 *Pointer<Short4>(buffer) = current.y;
1591 }
1592 break;
1593 case VK_FORMAT_R16G16B16A16_UNORM:
1594 {
1595 Pointer<Byte> buffer = cBuffer + 8 * x;
1596
1597 {
1598 Short4 value = *Pointer<Short4>(buffer);
1599
1600 if(rgbaWriteMask != 0x0000000F)
1601 {
1602 Short4 masked = value;
1603 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1604 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1605 current.x |= masked;
1606 }
1607
1608 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
1609 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
1610 current.x |= value;
1611 *Pointer<Short4>(buffer) = current.x;
1612 }
1613
1614 {
1615 Short4 value = *Pointer<Short4>(buffer + 8);
1616
1617 if(rgbaWriteMask != 0x0000000F)
1618 {
1619 Short4 masked = value;
1620 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1621 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1622 current.y |= masked;
1623 }
1624
1625 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
1626 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
1627 current.y |= value;
1628 *Pointer<Short4>(buffer + 8) = current.y;
1629 }
1630
1631 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1632
1633 {
1634 Short4 value = *Pointer<Short4>(buffer);
1635
1636 if(rgbaWriteMask != 0x0000000F)
1637 {
1638 Short4 masked = value;
1639 current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1640 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1641 current.z |= masked;
1642 }
1643
1644 current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
1645 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
1646 current.z |= value;
1647 *Pointer<Short4>(buffer) = current.z;
1648 }
1649
1650 {
1651 Short4 value = *Pointer<Short4>(buffer + 8);
1652
1653 if(rgbaWriteMask != 0x0000000F)
1654 {
1655 Short4 masked = value;
1656 current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1657 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1658 current.w |= masked;
1659 }
1660
1661 current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
1662 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
1663 current.w |= value;
1664 *Pointer<Short4>(buffer + 8) = current.w;
1665 }
1666 }
1667 break;
1668 case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1669 {
1670 Pointer<Byte> buffer = cBuffer + 4 * x;
1671
1672 buffer = cBuffer + 4 * x;
1673 Int2 value = *Pointer<Int2>(buffer, 16);
1674 Int2 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
1675 if (rgbaWriteMask != 0xF)
1676 {
1677 mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
1678 }
1679 *Pointer<Int2>(buffer) = (As<Int2>(current.x) & mergedMask) | (value & ~mergedMask);
1680
1681 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1682
1683 value = *Pointer<Int2>(buffer, 16);
1684 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
1685 if (rgbaWriteMask != 0xF)
1686 {
1687 mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
1688 }
1689 *Pointer<Int2>(buffer) = (As<Int2>(current.y) & mergedMask) | (value & ~mergedMask);
1690 }
1691 break;
1692 default:
1693 UNIMPLEMENTED("VkFormat: %d", int(state.targetFormat[index]));
1694 }
1695 }
1696
1697 void PixelRoutine::blendFactor(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, VkBlendFactor blendFactorActive)
1698 {
1699 switch(blendFactorActive)
1700 {
1701 case VK_BLEND_FACTOR_ZERO:
1702 blendFactor.x = Float4(0);
1703 blendFactor.y = Float4(0);
1704 blendFactor.z = Float4(0);
1705 break;
1706 case VK_BLEND_FACTOR_ONE:
1707 blendFactor.x = Float4(1);
1708 blendFactor.y = Float4(1);
1709 blendFactor.z = Float4(1);
1710 break;
1711 case VK_BLEND_FACTOR_SRC_COLOR:
1712 blendFactor.x = oC.x;
1713 blendFactor.y = oC.y;
1714 blendFactor.z = oC.z;
1715 break;
1716 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
1717 blendFactor.x = Float4(1.0f) - oC.x;
1718 blendFactor.y = Float4(1.0f) - oC.y;
1719 blendFactor.z = Float4(1.0f) - oC.z;
1720 break;
1721 case VK_BLEND_FACTOR_DST_COLOR:
1722 blendFactor.x = pixel.x;
1723 blendFactor.y = pixel.y;
1724 blendFactor.z = pixel.z;
1725 break;
1726 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
1727 blendFactor.x = Float4(1.0f) - pixel.x;
1728 blendFactor.y = Float4(1.0f) - pixel.y;
1729 blendFactor.z = Float4(1.0f) - pixel.z;
1730 break;
1731 case VK_BLEND_FACTOR_SRC_ALPHA:
1732 blendFactor.x = oC.w;
1733 blendFactor.y = oC.w;
1734 blendFactor.z = oC.w;
1735 break;
1736 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
1737 blendFactor.x = Float4(1.0f) - oC.w;
1738 blendFactor.y = Float4(1.0f) - oC.w;
1739 blendFactor.z = Float4(1.0f) - oC.w;
1740 break;
1741 case VK_BLEND_FACTOR_DST_ALPHA:
1742 blendFactor.x = pixel.w;
1743 blendFactor.y = pixel.w;
1744 blendFactor.z = pixel.w;
1745 break;
1746 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
1747 blendFactor.x = Float4(1.0f) - pixel.w;
1748 blendFactor.y = Float4(1.0f) - pixel.w;
1749 blendFactor.z = Float4(1.0f) - pixel.w;
1750 break;
1751 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
1752 blendFactor.x = Float4(1.0f) - pixel.w;
1753 blendFactor.x = Min(blendFactor.x, oC.w);
1754 blendFactor.y = blendFactor.x;
1755 blendFactor.z = blendFactor.x;
1756 break;
1757 case VK_BLEND_FACTOR_CONSTANT_COLOR:
1758 blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[0]));
1759 blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[1]));
1760 blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[2]));
1761 break;
1762 case VK_BLEND_FACTOR_CONSTANT_ALPHA:
1763 blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
1764 blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
1765 blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
1766 break;
1767 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1768 blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
1769 blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
1770 blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
1771 break;
1772 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
1773 blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
1774 blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
1775 blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
1776 break;
1777
1778 default:
1779 UNIMPLEMENTED("VkBlendFactor: %d", int(blendFactorActive));
1780 }
1781 }
1782
1783 void PixelRoutine::blendFactorAlpha(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, VkBlendFactor blendFactorAlphaActive)
1784 {
1785 switch(blendFactorAlphaActive)
1786 {
1787 case VK_BLEND_FACTOR_ZERO:
1788 blendFactor.w = Float4(0);
1789 break;
1790 case VK_BLEND_FACTOR_ONE:
1791 blendFactor.w = Float4(1);
1792 break;
1793 case VK_BLEND_FACTOR_SRC_COLOR:
1794 blendFactor.w = oC.w;
1795 break;
1796 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
1797 blendFactor.w = Float4(1.0f) - oC.w;
1798 break;
1799 case VK_BLEND_FACTOR_DST_COLOR:
1800 blendFactor.w = pixel.w;
1801 break;
1802 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
1803 blendFactor.w = Float4(1.0f) - pixel.w;
1804 break;
1805 case VK_BLEND_FACTOR_SRC_ALPHA:
1806 blendFactor.w = oC.w;
1807 break;
1808 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
1809 blendFactor.w = Float4(1.0f) - oC.w;
1810 break;
1811 case VK_BLEND_FACTOR_DST_ALPHA:
1812 blendFactor.w = pixel.w;
1813 break;
1814 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
1815 blendFactor.w = Float4(1.0f) - pixel.w;
1816 break;
1817 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
1818 blendFactor.w = Float4(1.0f);
1819 break;
1820 case VK_BLEND_FACTOR_CONSTANT_COLOR:
1821 case VK_BLEND_FACTOR_CONSTANT_ALPHA:
1822 blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
1823 break;
1824 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1825 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
1826 blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
1827 break;
1828 default:
1829 UNIMPLEMENTED("VkBlendFactor: %d", int(blendFactorAlphaActive));
1830 }
1831 }
1832
1833 void PixelRoutine::alphaBlend(int index, const Pointer<Byte> &cBuffer, Vector4f &oC, const Int &x)
1834 {
1835 if(!state.blendState[index].alphaBlendEnable)
1836 {
1837 return;
1838 }
1839
1840 Pointer<Byte> buffer;
1841
1842 // pixel holds four texel color values.
1843 // Note: Despite the type being Vector4f, the colors may be stored as
1844 // integers. Half-floats are stored as full 32-bit floats.
1845 // Non-float and non-fixed point formats are not alpha blended.
1846 Vector4f pixel;
1847
1848 Vector4s color;
1849 Short4 c01;
1850 Short4 c23;
1851
1852 Float4 one;
1853 vk::Format format(state.targetFormat[index]);
1854 if(format.isFloatFormat())
1855 {
1856 one = Float4(1.0f);
1857 }
1858 else if(format.isNonNormalizedInteger())
1859 {
1860 one = As<Float4>(format.isUnsignedComponent(0) ? Int4(0xFFFFFFFF) : Int4(0x7FFFFFFF));
1861 }
1862
1863 switch(state.targetFormat[index])
1864 {
1865 case VK_FORMAT_R32_SINT:
1866 case VK_FORMAT_R32_UINT:
1867 case VK_FORMAT_R32_SFLOAT:
1868 buffer = cBuffer;
1869 // FIXME: movlps
1870 pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0);
1871 pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4);
1872 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1873 // FIXME: movhps
1874 pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0);
1875 pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4);
1876 pixel.y = pixel.z = pixel.w = one;
1877 break;
1878 case VK_FORMAT_R32G32_SINT:
1879 case VK_FORMAT_R32G32_UINT:
1880 case VK_FORMAT_R32G32_SFLOAT:
1881 buffer = cBuffer;
1882 pixel.x = *Pointer<Float4>(buffer + 8 * x, 16);
1883 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1884 pixel.y = *Pointer<Float4>(buffer + 8 * x, 16);
1885 pixel.z = pixel.x;
1886 pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x88);
1887 pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0xDD);
1888 pixel.y = pixel.z;
1889 pixel.z = pixel.w = one;
1890 break;
1891 case VK_FORMAT_R32G32B32A32_SFLOAT:
1892 case VK_FORMAT_R32G32B32A32_SINT:
1893 case VK_FORMAT_R32G32B32A32_UINT:
1894 buffer = cBuffer;
1895 pixel.x = *Pointer<Float4>(buffer + 16 * x, 16);
1896 pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16);
1897 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1898 pixel.z = *Pointer<Float4>(buffer + 16 * x, 16);
1899 pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16);
1900 transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
1901 break;
1902 case VK_FORMAT_R16_SFLOAT:
1903 buffer = cBuffer;
1904 pixel.x.x = Float(*Pointer<Half>(buffer + 2 * x + 0));
1905 pixel.x.y = Float(*Pointer<Half>(buffer + 2 * x + 2));
1906 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1907 pixel.x.z = Float(*Pointer<Half>(buffer + 2 * x + 0));
1908 pixel.x.w = Float(*Pointer<Half>(buffer + 2 * x + 2));
1909 pixel.y = pixel.z = pixel.w = one;
1910 break;
1911 case VK_FORMAT_R16G16_SFLOAT:
1912 buffer = cBuffer;
1913 pixel.x.x = Float(*Pointer<Half>(buffer + 4 * x + 0));
1914 pixel.y.x = Float(*Pointer<Half>(buffer + 4 * x + 2));
1915 pixel.x.y = Float(*Pointer<Half>(buffer + 4 * x + 4));
1916 pixel.y.y = Float(*Pointer<Half>(buffer + 4 * x + 6));
1917 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1918 pixel.x.z = Float(*Pointer<Half>(buffer + 4 * x + 0));
1919 pixel.y.z = Float(*Pointer<Half>(buffer + 4 * x + 2));
1920 pixel.x.w = Float(*Pointer<Half>(buffer + 4 * x + 4));
1921 pixel.y.w = Float(*Pointer<Half>(buffer + 4 * x + 6));
1922 pixel.z = pixel.w = one;
1923 break;
1924 case VK_FORMAT_R16G16B16A16_SFLOAT:
1925 buffer = cBuffer;
1926 pixel.x.x = Float(*Pointer<Half>(buffer + 8 * x + 0x0));
1927 pixel.y.x = Float(*Pointer<Half>(buffer + 8 * x + 0x2));
1928 pixel.z.x = Float(*Pointer<Half>(buffer + 8 * x + 0x4));
1929 pixel.w.x = Float(*Pointer<Half>(buffer + 8 * x + 0x6));
1930 pixel.x.y = Float(*Pointer<Half>(buffer + 8 * x + 0x8));
1931 pixel.y.y = Float(*Pointer<Half>(buffer + 8 * x + 0xa));
1932 pixel.z.y = Float(*Pointer<Half>(buffer + 8 * x + 0xc));
1933 pixel.w.y = Float(*Pointer<Half>(buffer + 8 * x + 0xe));
1934 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1935 pixel.x.z = Float(*Pointer<Half>(buffer + 8 * x + 0x0));
1936 pixel.y.z = Float(*Pointer<Half>(buffer + 8 * x + 0x2));
1937 pixel.z.z = Float(*Pointer<Half>(buffer + 8 * x + 0x4));
1938 pixel.w.z = Float(*Pointer<Half>(buffer + 8 * x + 0x6));
1939 pixel.x.w = Float(*Pointer<Half>(buffer + 8 * x + 0x8));
1940 pixel.y.w = Float(*Pointer<Half>(buffer + 8 * x + 0xa));
1941 pixel.z.w = Float(*Pointer<Half>(buffer + 8 * x + 0xc));
1942 pixel.w.w = Float(*Pointer<Half>(buffer + 8 * x + 0xe));
1943 break;
1944 default:
1945 UNIMPLEMENTED("VkFormat: %d", int(state.targetFormat[index]));
1946 }
1947
1948 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
1949 Vector4f sourceFactor;
1950 Vector4f destFactor;
1951
1952 blendFactor(sourceFactor, oC, pixel, state.blendState[index].sourceBlendFactor);
1953 blendFactor(destFactor, oC, pixel, state.blendState[index].destBlendFactor);
1954
1955 oC.x *= sourceFactor.x;
1956 oC.y *= sourceFactor.y;
1957 oC.z *= sourceFactor.z;
1958
1959 pixel.x *= destFactor.x;
1960 pixel.y *= destFactor.y;
1961 pixel.z *= destFactor.z;
1962
1963 switch(state.blendState[index].blendOperation)
1964 {
1965 case VK_BLEND_OP_ADD:
1966 oC.x += pixel.x;
1967 oC.y += pixel.y;
1968 oC.z += pixel.z;
1969 break;
1970 case VK_BLEND_OP_SUBTRACT:
1971 oC.x -= pixel.x;
1972 oC.y -= pixel.y;
1973 oC.z -= pixel.z;
1974 break;
1975 case VK_BLEND_OP_REVERSE_SUBTRACT:
1976 oC.x = pixel.x - oC.x;
1977 oC.y = pixel.y - oC.y;
1978 oC.z = pixel.z - oC.z;
1979 break;
1980 case VK_BLEND_OP_MIN:
1981 oC.x = Min(oC.x, pixel.x);
1982 oC.y = Min(oC.y, pixel.y);
1983 oC.z = Min(oC.z, pixel.z);
1984 break;
1985 case VK_BLEND_OP_MAX:
1986 oC.x = Max(oC.x, pixel.x);
1987 oC.y = Max(oC.y, pixel.y);
1988 oC.z = Max(oC.z, pixel.z);
1989 break;
1990 case VK_BLEND_OP_SRC_EXT:
1991 // No operation
1992 break;
1993 case VK_BLEND_OP_DST_EXT:
1994 oC.x = pixel.x;
1995 oC.y = pixel.y;
1996 oC.z = pixel.z;
1997 break;
1998 case VK_BLEND_OP_ZERO_EXT:
1999 oC.x = Float4(0.0f);
2000 oC.y = Float4(0.0f);
2001 oC.z = Float4(0.0f);
2002 break;
2003 default:
2004 UNIMPLEMENTED("VkBlendOp: %d", int(state.blendState[index].blendOperation));
2005 }
2006
2007 blendFactorAlpha(sourceFactor, oC, pixel, state.blendState[index].sourceBlendFactorAlpha);
2008 blendFactorAlpha(destFactor, oC, pixel, state.blendState[index].destBlendFactorAlpha);
2009
2010 oC.w *= sourceFactor.w;
2011 pixel.w *= destFactor.w;
2012
2013 switch(state.blendState[index].blendOperationAlpha)
2014 {
2015 case VK_BLEND_OP_ADD:
2016 oC.w += pixel.w;
2017 break;
2018 case VK_BLEND_OP_SUBTRACT:
2019 oC.w -= pixel.w;
2020 break;
2021 case VK_BLEND_OP_REVERSE_SUBTRACT:
2022 pixel.w -= oC.w;
2023 oC.w = pixel.w;
2024 break;
2025 case VK_BLEND_OP_MIN:
2026 oC.w = Min(oC.w, pixel.w);
2027 break;
2028 case VK_BLEND_OP_MAX:
2029 oC.w = Max(oC.w, pixel.w);
2030 break;
2031 case VK_BLEND_OP_SRC_EXT:
2032 // No operation
2033 break;
2034 case VK_BLEND_OP_DST_EXT:
2035 oC.w = pixel.w;
2036 break;
2037 case VK_BLEND_OP_ZERO_EXT:
2038 oC.w = Float4(0.0f);
2039 break;
2040 default:
2041 UNIMPLEMENTED("VkBlendOp: %d", int(state.blendState[index].blendOperationAlpha));
2042 }
2043 }
2044
// Stores the fragment colors in oC into color attachment 'index'.
//
//   index   - color attachment; selects the target format, the color write
//             mask and the row pitch (colorPitchB[index]).
//   cBuffer - pointer to the attachment data. Two pixels are written at
//             column x on this row and two on the row one pitch further on.
//   x       - column of the first pixel.
//   oC      - colors to write; modified in place (rearranged and masked).
//   sMask, zMask, cMask - stencil, depth and coverage masks, combined below
//             into the per-pixel mask xMask.
//
// Every format is handled as a read-modify-write: values already in the
// buffer are kept for pixels disabled in xMask and for components disabled
// in the color write mask. Formats not listed hit UNIMPLEMENTED.
void PixelRoutine::writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4f &oC, const Int &sMask, const Int &zMask, const Int &cMask)
{
	// Rearrange oC into the layout the store code below expects.
	// NOTE(review): this presumes oC arrives with one color component per
	// element and one pixel per lane - confirm against the callers.
	switch(state.targetFormat[index])
	{
	case VK_FORMAT_R16_SFLOAT:
	case VK_FORMAT_R32_SFLOAT:
	case VK_FORMAT_R32_SINT:
	case VK_FORMAT_R32_UINT:
	case VK_FORMAT_R16_SINT:
	case VK_FORMAT_R16_UINT:
	case VK_FORMAT_R8_SINT:
	case VK_FORMAT_R8_UINT:
	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
		// Used as is: only oC.x is stored, or (A2B10G10R10) the components
		// are packed lane by lane further down.
		break;
	case VK_FORMAT_R16G16_SFLOAT:
	case VK_FORMAT_R32G32_SFLOAT:
	case VK_FORMAT_R32G32_SINT:
	case VK_FORMAT_R32G32_UINT:
	case VK_FORMAT_R16G16_SINT:
	case VK_FORMAT_R16G16_UINT:
	case VK_FORMAT_R8G8_SINT:
	case VK_FORMAT_R8G8_UINT:
		// Interleave the first two components: oC.x receives the low halves
		// of x and y, oC.y the high halves.
		oC.z = oC.x;
		oC.x = UnpackLow(oC.x, oC.y);
		oC.z = UnpackHigh(oC.z, oC.y);
		oC.y = oC.z;
		break;
	case VK_FORMAT_R16G16B16A16_SFLOAT:
	case VK_FORMAT_R32G32B32A32_SFLOAT:
	case VK_FORMAT_R32G32B32A32_SINT:
	case VK_FORMAT_R32G32B32A32_UINT:
	case VK_FORMAT_R16G16B16A16_SINT:
	case VK_FORMAT_R16G16B16A16_UINT:
	case VK_FORMAT_R8G8B8A8_SINT:
	case VK_FORMAT_R8G8B8A8_UINT:
	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
		// After the transpose each oC element is stored as one pixel's data.
		transpose4x4(oC.x, oC.y, oC.z, oC.w);
		break;
	default:
		UNIMPLEMENTED("VkFormat: %d", int(state.targetFormat[index]));
	}

	// Per-component write mask; bit 0 is tested for the first component,
	// 0x3 for two-component formats and 0xF for four-component formats.
	int rgbaWriteMask = state.colorWriteActive(index);

	Int xMask;   // Combination of all masks

	// Start from the depth mask when depth testing is active, otherwise from
	// the coverage mask; then apply the stencil mask if stencil is active.
	if(state.depthTestActive)
	{
		xMask = zMask;
	}
	else
	{
		xMask = cMask;
	}

	if(state.stencilActive)
	{
		xMask &= sMask;
	}

	auto targetFormat = state.targetFormat[index];

	Pointer<Byte> buffer;
	Float4 value;   // Existing buffer contents, for the formats that load them as Float4

	switch(targetFormat)
	{
	case VK_FORMAT_R32_SFLOAT:
	case VK_FORMAT_R32_SINT:
	case VK_FORMAT_R32_UINT:
		if(rgbaWriteMask & 0x00000001)
		{
			buffer = cBuffer + 4 * x;

			// Load the four existing values: lanes x/y from the first row,
			// lanes z/w from the second row.
			// FIXME: movlps
			value.x = *Pointer<Float>(buffer + 0);
			value.y = *Pointer<Float>(buffer + 4);

			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));

			// FIXME: movhps
			value.z = *Pointer<Float>(buffer + 0);
			value.w = *Pointer<Float>(buffer + 4);

			// Select new vs. existing value per lane, using the 16-byte
			// table entries indexed by xMask.
			oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + xMask * 16, 16));
			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16));
			oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));

			// buffer still points at the second row, so it is stored first.
			// FIXME: movhps
			*Pointer<Float>(buffer + 0) = oC.x.z;
			*Pointer<Float>(buffer + 4) = oC.x.w;

			buffer -= *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));

			// FIXME: movlps
			*Pointer<Float>(buffer + 0) = oC.x.x;
			*Pointer<Float>(buffer + 4) = oC.x.y;
		}
		break;
	case VK_FORMAT_R16_SFLOAT:
		if(rgbaWriteMask & 0x00000001)
		{
			buffer = cBuffer + 2 * x;

			// Same scheme as the 32-bit case, with the existing half-floats
			// widened to float for the merge and narrowed again on store.
			value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 0);
			value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 1);

			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));

			value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 2);
			value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 3);

			oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
			oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));

			*Pointer<Half>(buffer + 0) = Half(oC.x.z);
			*Pointer<Half>(buffer + 2) = Half(oC.x.w);

			buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));

			*Pointer<Half>(buffer + 0) = Half(oC.x.x);
			*Pointer<Half>(buffer + 2) = Half(oC.x.y);
		}
		break;
	case VK_FORMAT_R16_SINT:
	case VK_FORMAT_R16_UINT:
		if(rgbaWriteMask & 0x00000001)
		{
			buffer = cBuffer + 2 * x;

			// Each 32-bit load fetches the two 16-bit pixels of one row.
			UShort4 xyzw;
			xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));

			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));

			xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1));
			// Widen to one 32-bit lane per pixel so the dword masks apply.
			value = As<Float4>(Int4(xyzw));

			oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
			oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));

			// The lanes hold integer bit patterns; reinterpret and narrow to
			// 16 bits. Second row first, then step back to the first row.
			if(targetFormat == VK_FORMAT_R16_SINT)
			{
				Float component = oC.x.z;
				*Pointer<Short>(buffer + 0) = Short(As<Int>(component));
				component = oC.x.w;
				*Pointer<Short>(buffer + 2) = Short(As<Int>(component));

				buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));

				component = oC.x.x;
				*Pointer<Short>(buffer + 0) = Short(As<Int>(component));
				component = oC.x.y;
				*Pointer<Short>(buffer + 2) = Short(As<Int>(component));
			}
			else // VK_FORMAT_R16_UINT
			{
				Float component = oC.x.z;
				*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
				component = oC.x.w;
				*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));

				buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));

				component = oC.x.x;
				*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
				component = oC.x.y;
				*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
			}
		}
		break;
	case VK_FORMAT_R8_SINT:
	case VK_FORMAT_R8_UINT:
		if(rgbaWriteMask & 0x00000001)
		{
			buffer = cBuffer + x;

			UInt xyzw, packedCol;

			// Gather the four existing bytes into one 32-bit value: first
			// row in the low 16 bits, second row in the high 16 bits.
			xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFF;
			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
			xyzw |= UInt(*Pointer<UShort>(buffer)) << 16;

			// Narrow the four new values to bytes with the pack matching
			// the format's signedness.
			Short4 tmpCol = Short4(As<Int4>(oC.x));
			if(targetFormat == VK_FORMAT_R8_SINT)
			{
				tmpCol = As<Short4>(PackSigned(tmpCol, tmpCol));
			}
			else
			{
				tmpCol = As<Short4>(PackUnsigned(tmpCol, tmpCol));
			}
			packedCol = Extract(As<Int2>(tmpCol), 0);

			// Merge new and existing bytes; the tables have 8-byte entries
			// indexed by xMask, of which the first 4 bytes are read.
			packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) |
			            (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask));

			*Pointer<UShort>(buffer) = UShort(packedCol >> 16);
			buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
			*Pointer<UShort>(buffer) = UShort(packedCol);
		}
		break;
	case VK_FORMAT_R32G32_SFLOAT:
	case VK_FORMAT_R32G32_SINT:
	case VK_FORMAT_R32G32_UINT:
		buffer = cBuffer + 8 * x;

		// First row: oC.x holds its two interleaved pixels.
		value = *Pointer<Float4>(buffer);

		// Partial color write mask: keep the existing value of the
		// disabled component.
		if((rgbaWriteMask & 0x00000003) != 0x00000003)
		{
			Float4 masked = value;
			oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
			masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
			oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
		}

		// Per-pixel merge for the first row.
		oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16));
		value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16));
		oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
		*Pointer<Float4>(buffer) = oC.x;

		buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));

		// Second row: same steps with oC.y and the Q23 masks.
		value = *Pointer<Float4>(buffer);

		if((rgbaWriteMask & 0x00000003) != 0x00000003)
		{
			Float4 masked;

			masked = value;
			oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
			masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
			oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
		}

		oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16));
		value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16));
		oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
		*Pointer<Float4>(buffer) = oC.y;
		break;
	case VK_FORMAT_R16G16_SFLOAT:
		if((rgbaWriteMask & 0x00000003) != 0x0)
		{
			buffer = cBuffer + 4 * x;

			// First row: convert to half and pack each pixel as (G << 16) | R.
			UInt2 rgbaMask;
			UInt2 packedCol;
			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.x.y))) << 16) | UInt(As<UShort>(Half(oC.x.x))), 0);
			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.x.w))) << 16) | UInt(As<UShort>(Half(oC.x.z))), 1);

			UShort4 value = *Pointer<UShort4>(buffer);
			UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
			if((rgbaWriteMask & 0x3) != 0x3)
			{
				// Replicate the component mask to both pixels and combine
				// it with the per-pixel mask. rgbaMask is reused for the
				// second row.
				Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
				rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
				mergedMask &= rgbaMask;
			}
			*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (As<UInt2>(value) & ~mergedMask);

			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));

			// Second row, from oC.y.
			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.y.y))) << 16) | UInt(As<UShort>(Half(oC.y.x))), 0);
			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.y.w))) << 16) | UInt(As<UShort>(Half(oC.y.z))), 1);
			value = *Pointer<UShort4>(buffer);
			mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
			if((rgbaWriteMask & 0x3) != 0x3)
			{
				mergedMask &= rgbaMask;
			}
			*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (As<UInt2>(value) & ~mergedMask);
		}
		break;
	case VK_FORMAT_R16G16_SINT:
	case VK_FORMAT_R16G16_UINT:
		if((rgbaWriteMask & 0x00000003) != 0x0)
		{
			buffer = cBuffer + 4 * x;

			// Same scheme as R16G16_SFLOAT, but the integer lanes are
			// narrowed to 16 bits instead of converted to half.
			UInt2 rgbaMask;
			UShort4 packedCol = UShort4(As<Int4>(oC.x));
			UShort4 value = *Pointer<UShort4>(buffer);
			UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
			if((rgbaWriteMask & 0x3) != 0x3)
			{
				Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
				rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
				mergedMask &= rgbaMask;
			}
			*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);

			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));

			packedCol = UShort4(As<Int4>(oC.y));
			value = *Pointer<UShort4>(buffer);
			mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
			if((rgbaWriteMask & 0x3) != 0x3)
			{
				mergedMask &= rgbaMask;
			}
			*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
		}
		break;
	case VK_FORMAT_R8G8_SINT:
	case VK_FORMAT_R8G8_UINT:
		if((rgbaWriteMask & 0x00000003) != 0x0)
		{
			buffer = cBuffer + 2 * x;

			Int2 xyzw, packedCol;

			// Existing data: element 0 holds the first row, element 1 the
			// second row (two 2-byte pixels each).
			xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0);
			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
			xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);

			// Narrow all eight component values to bytes in one pack.
			if(targetFormat == VK_FORMAT_R8G8_SINT)
			{
				packedCol = As<Int2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
			}
			else
			{
				packedCol = As<Int2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
			}

			// A pixel is one 16-bit word here, hence the word-granular table
			// for the per-pixel mask.
			UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
			if((rgbaWriteMask & 0x3) != 0x3)
			{
				// 5 * mask repeats the 2-bit component mask at bit positions
				// 0-1 and 2-3 (1 -> 0b0101, 2 -> 0b1010), giving the index
				// into the byte-granular table.
				Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
				UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
				mergedMask &= rgbaMask;
			}

			packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask));

			// Second row first (buffer points at it), then the first row.
			*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1));
			buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
			*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
		}
		break;
	case VK_FORMAT_R32G32B32A32_SFLOAT:
	case VK_FORMAT_R32G32B32A32_SINT:
	case VK_FORMAT_R32G32B32A32_UINT:
		buffer = cBuffer + 16 * x;

		// One 16-byte pixel per oC element. For each pixel: load the
		// existing value, apply the component mask if it is partial, apply
		// that pixel's mask (maskX0X..maskX3X), then store.

		// First row, first pixel (oC.x).
		{
			value = *Pointer<Float4>(buffer, 16);

			if(rgbaWriteMask != 0x0000000F)
			{
				Float4 masked = value;
				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
			}

			oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskX0X) + xMask * 16, 16));
			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16));
			oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
			*Pointer<Float4>(buffer, 16) = oC.x;
		}

		// First row, second pixel (oC.y).
		{
			value = *Pointer<Float4>(buffer + 16, 16);

			if(rgbaWriteMask != 0x0000000F)
			{
				Float4 masked = value;
				oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
				oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
			}

			oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskX1X) + xMask * 16, 16));
			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16));
			oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
			*Pointer<Float4>(buffer + 16, 16) = oC.y;
		}

		buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));

		// Second row, first pixel (oC.z).
		{
			value = *Pointer<Float4>(buffer, 16);

			if(rgbaWriteMask != 0x0000000F)
			{
				Float4 masked = value;
				oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
				oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked));
			}

			oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskX2X) + xMask * 16, 16));
			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16));
			oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value));
			*Pointer<Float4>(buffer, 16) = oC.z;
		}

		// Second row, second pixel (oC.w).
		{
			value = *Pointer<Float4>(buffer + 16, 16);

			if(rgbaWriteMask != 0x0000000F)
			{
				Float4 masked = value;
				oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
				oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked));
			}

			oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskX3X) + xMask * 16, 16));
			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16));
			oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value));
			*Pointer<Float4>(buffer + 16, 16) = oC.w;
		}
		break;
	case VK_FORMAT_R16G16B16A16_SFLOAT:
		if((rgbaWriteMask & 0x0000000F) != 0x0)
		{
			buffer = cBuffer + 8 * x;

			// First row: two 8-byte pixels. Each pixel is packed into two
			// 32-bit lanes, (G << 16) | R and (A << 16) | B, as half-floats.
			UInt4 rgbaMask;
			UInt4 value = *Pointer<UInt4>(buffer);
			UInt4 packedCol;
			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.x.y))) << 16) | UInt(As<UShort>(Half(oC.x.x))), 0);
			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.x.w))) << 16) | UInt(As<UShort>(Half(oC.x.z))), 1);
			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.y.y))) << 16) | UInt(As<UShort>(Half(oC.y.x))), 2);
			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.y.w))) << 16) | UInt(As<UShort>(Half(oC.y.z))), 3);
			UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
			if((rgbaWriteMask & 0xF) != 0xF)
			{
				// Replicate the 8-byte component mask to both pixels;
				// rgbaMask is reused for the second row.
				UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
				rgbaMask = UInt4(tmpMask, tmpMask);
				mergedMask &= rgbaMask;
			}
			*Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);

			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));

			// Second row, from oC.z and oC.w.
			value = *Pointer<UInt4>(buffer);
			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.z.y))) << 16) | UInt(As<UShort>(Half(oC.z.x))), 0);
			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.z.w))) << 16) | UInt(As<UShort>(Half(oC.z.z))), 1);
			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.w.y))) << 16) | UInt(As<UShort>(Half(oC.w.x))), 2);
			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.w.w))) << 16) | UInt(As<UShort>(Half(oC.w.z))), 3);
			mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
			if((rgbaWriteMask & 0xF) != 0xF)
			{
				mergedMask &= rgbaMask;
			}
			*Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);
		}
		break;
	case VK_FORMAT_R16G16B16A16_SINT:
	case VK_FORMAT_R16G16B16A16_UINT:
		if((rgbaWriteMask & 0x0000000F) != 0x0)
		{
			buffer = cBuffer + 8 * x;

			// Same scheme as R16G16B16A16_SFLOAT, with the integer lanes
			// narrowed to 16 bits instead of converted to half.
			UInt4 rgbaMask;
			UShort8 value = *Pointer<UShort8>(buffer);
			UShort8 packedCol = UShort8(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y)));
			UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
			if((rgbaWriteMask & 0xF) != 0xF)
			{
				UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
				rgbaMask = UInt4(tmpMask, tmpMask);
				mergedMask &= rgbaMask;
			}
			*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);

			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));

			value = *Pointer<UShort8>(buffer);
			packedCol = UShort8(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w)));
			mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
			if((rgbaWriteMask & 0xF) != 0xF)
			{
				mergedMask &= rgbaMask;
			}
			*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
		}
		break;
	case VK_FORMAT_R8G8B8A8_SINT:
	case VK_FORMAT_R8G8B8A8_UINT:
	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
		if((rgbaWriteMask & 0x0000000F) != 0x0)
		{
			UInt2 value, packedCol, mergedMask;

			buffer = cBuffer + 4 * x;

			bool isSigned = targetFormat == VK_FORMAT_R8G8B8A8_SINT || targetFormat == VK_FORMAT_A8B8G8R8_SINT_PACK32;

			// First row: narrow the two pixels in oC.x / oC.y to bytes.
			if(isSigned)
			{
				packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
			}
			else
			{
				packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
			}
			// NOTE(review): the load states 16-byte alignment while the
			// matching store states none; confirm 4 * x guarantees it.
			value = *Pointer<UInt2>(buffer, 16);
			mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
			if(rgbaWriteMask != 0xF)
			{
				mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
			}
			*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);

			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));

			// Second row: the two pixels in oC.z / oC.w.
			if(isSigned)
			{
				packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
			}
			else
			{
				packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
			}
			value = *Pointer<UInt2>(buffer, 16);
			mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
			if(rgbaWriteMask != 0xF)
			{
				mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
			}
			*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
		}
		break;
	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
		if ((rgbaWriteMask & 0x0000000F) != 0x0)
		{
			Int2 mergedMask, packedCol, value;
			// Pack lane by lane: oC.x in bits 0-9, oC.y in bits 10-19,
			// oC.z in bits 20-29 and oC.w in bits 30-31.
			Int4 packed = ((As<Int4>(oC.w) & Int4(0x3)) << 30) |
			              ((As<Int4>(oC.z) & Int4(0x3ff)) << 20) |
			              ((As<Int4>(oC.y) & Int4(0x3ff)) << 10) |
			              ((As<Int4>(oC.x) & Int4(0x3ff)));

			// First row: lanes 0 and 1 of 'packed'.
			buffer = cBuffer + 4 * x;
			value = *Pointer<Int2>(buffer, 16);
			mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
			if (rgbaWriteMask != 0xF)
			{
				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
			}
			*Pointer<Int2>(buffer) = (As<Int2>(packed) & mergedMask) | (value & ~mergedMask);

			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));

			// Second row: lanes 2 and 3, moved down by the zwww swizzle.
			value = *Pointer<Int2>(buffer, 16);
			mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
			if (rgbaWriteMask != 0xF)
			{
				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
			}
			*Pointer<Int2>(buffer) = (As<Int2>(Int4(packed.zwww)) & mergedMask) | (value & ~mergedMask);
		}
		break;
	default:
		UNIMPLEMENTED("VkFormat: %d", int(targetFormat));
	}
}
2609
2610 UShort4 PixelRoutine::convertFixed16(const Float4 &cf, bool saturate)
2611 {
2612 return UShort4(cf * Float4(0xFFFF), saturate);
2613 }
2614
2615 void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c)
2616 {
2617 Pointer<Byte> LUT = constants + OFFSET(Constants,sRGBtoLinear12_16);
2618
2619 c.x = As<UShort4>(c.x) >> 4;
2620 c.y = As<UShort4>(c.y) >> 4;
2621 c.z = As<UShort4>(c.z) >> 4;
2622
2623 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2624 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2625 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2626 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2627
2628 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2629 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2630 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2631 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2632
2633 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2634 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2635 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2636 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2637 }
2638
2639 void PixelRoutine::linearToSRGB16_12_16(Vector4s &c)
2640 {
2641 c.x = As<UShort4>(c.x) >> 4;
2642 c.y = As<UShort4>(c.y) >> 4;
2643 c.z = As<UShort4>(c.z) >> 4;
2644
2645 linearToSRGB12_16(c);
2646 }
2647
2648 void PixelRoutine::linearToSRGB12_16(Vector4s &c)
2649 {
2650 Pointer<Byte> LUT = constants + OFFSET(Constants,linearToSRGB12_16);
2651
2652 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2653 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2654 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2655 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2656
2657 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2658 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2659 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2660 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2661
2662 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2663 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2664 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2665 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2666 }
2667
2668 Float4 PixelRoutine::sRGBtoLinear(const Float4 &x) // Approximates x^2.2
2669 {
2670 Float4 linear = x * x;
2671 linear = linear * Float4(0.73f) + linear * x * Float4(0.27f);
2672
2673 return Min(Max(linear, Float4(0.0f)), Float4(1.0f));
2674 }
2675}
2676