1 | // Copyright 2016 The SwiftShader Authors. All Rights Reserved. |
2 | // |
3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | // you may not use this file except in compliance with the License. |
5 | // You may obtain a copy of the License at |
6 | // |
7 | // http://www.apache.org/licenses/LICENSE-2.0 |
8 | // |
9 | // Unless required by applicable law or agreed to in writing, software |
10 | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | // See the License for the specific language governing permissions and |
13 | // limitations under the License. |
14 | |
15 | #include "PixelRoutine.hpp" |
16 | |
17 | #include "SamplerCore.hpp" |
18 | #include "Constants.hpp" |
19 | #include "Renderer/Renderer.hpp" |
20 | #include "Renderer/QuadRasterizer.hpp" |
21 | #include "Renderer/Surface.hpp" |
22 | #include "Renderer/Primitive.hpp" |
23 | #include "Common/Debug.hpp" |
24 | |
25 | namespace sw |
26 | { |
// Global configuration flags; only declared here, their definitions live outside this file.
extern bool complementaryDepthBuffer;   // When set, depthTest()/writeDepth()/pixelFog() use (1 - depth) and depthTest() swaps the comparison direction
extern bool postBlendSRGB;              // NOTE(review): consumed further down / elsewhere; confirm semantics at the use site
extern bool exactColorRounding;         // NOTE(review): consumed further down / elsewhere; confirm semantics at the use site
extern bool forceClearRegisters;        // When set, the PixelRoutine constructor zero-fills every fragment input register
31 | |
32 | PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader) |
33 | : QuadRasterizer(state, shader), v(shader && shader->indirectAddressableInput) |
34 | { |
35 | if(!shader || shader->getShaderModel() < 0x0200 || forceClearRegisters) |
36 | { |
37 | for(int i = 0; i < MAX_FRAGMENT_INPUTS; i++) |
38 | { |
39 | v[i].x = Float4(0.0f); |
40 | v[i].y = Float4(0.0f); |
41 | v[i].z = Float4(0.0f); |
42 | v[i].w = Float4(0.0f); |
43 | } |
44 | } |
45 | } |
46 | |
// Nothing to release explicitly; members and the base class clean up after themselves.
PixelRoutine::~PixelRoutine()
{
}
50 | |
// Emits the processing of one quad at horizontal position x:
//   1. stencil test for every sample,
//   2. depth interpolation and (when possible) an early depth test,
//   3. interpolation of w, the fragment inputs and fog,
//   4. shader execution and alpha test,
//   5. late depth test (when the early one was not allowed),
//   6. depth write, occlusion counting and color raster operation,
//   7. stencil write.
//
// cBuffer - color buffer pointers, one per render target
// zBuffer - depth buffer pointer
// sBuffer - stencil buffer pointer
// cMask   - per-sample coverage masks; may be narrowed by the shader/alpha test
// x       - horizontal position of the quad
//
// NOTE(review): plain if() tests compile-time pipeline state, whereas the capitalized
// If() branches on generated Bool values; confirm against the Reactor documentation.
void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x)
{
	#if PERF_PROFILE
		Long pipeTime = Ticks();
	#endif

	// The depth test can only run before shading when the depth value is not
	// overridden and the alpha test cannot discard pixels afterwards.
	const bool earlyDepthTest = !state.depthOverride && !state.alphaTestActive();

	Int zMask[4];   // Depth mask
	Int sMask[4];   // Stencil mask

	// Both masks start out equal to the coverage mask of their sample.
	for(unsigned int q = 0; q < state.multiSample; q++)
	{
		zMask[q] = cMask[q];
		sMask[q] = cMask[q];
	}

	for(unsigned int q = 0; q < state.multiSample; q++)
	{
		stencilTest(sBuffer, q, x, sMask[q], cMask[q]);
	}

	Float4 f;
	Float4 rhwCentroid;

	// x replicated to four lanes plus the Float4 stored at Primitive::xQuad.
	Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16);

	if(interpolateZ())
	{
		for(unsigned int q = 0; q < state.multiSample; q++)
		{
			// Note: this local Float4 shadows the Int &x parameter inside the loop body.
			Float4 x = xxxx;

			if(state.multiSample > 1)
			{
				// Subtract the Constants::X table entry for sample q.
				x -= *Pointer<Float4>(constants + OFFSET(Constants,X) + q * sizeof(float4));
			}

			z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive,z), false, false, state.depthClamp);
		}
	}

	Bool depthPass = false;

	if(earlyDepthTest)
	{
		// The quad passes when any sample passes.
		for(unsigned int q = 0; q < state.multiSample; q++)
		{
			depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
		}
	}

	// Skip shading entirely when the early depth test rejected every sample.
	If(depthPass || Bool(!earlyDepthTest))
	{
		#if PERF_PROFILE
			Long interpTime = Ticks();
		#endif

		// y replicated to four lanes plus the Float4 stored at Primitive::yQuad.
		Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16);

		// Centroid locations
		Float4 XXXX = Float4(0.0f);
		Float4 YYYY = Float4(0.0f);

		if(state.centroid)
		{
			// NOTE(review): the tiny initial weight presumably keeps the reciprocal finite
			// when no sample contributes; confirm.
			Float4 WWWW(1.0e-9f);

			// Sum the sampleX/sampleY/weight table entries selected by each sample's coverage mask.
			for(unsigned int q = 0; q < state.multiSample; q++)
			{
				XXXX += *Pointer<Float4>(constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]);
				YYYY += *Pointer<Float4>(constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]);
				WWWW += *Pointer<Float4>(constants + OFFSET(Constants,weight) + 16 * cMask[q]);
			}

			// Normalize by the accumulated weight, then offset by the quad position.
			WWWW = Rcp_pp(WWWW);
			XXXX *= WWWW;
			YYYY *= WWWW;

			XXXX += xxxx;
			YYYY += yyyy;
		}

		if(interpolateW())
		{
			w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive,w), false, false, false);
			rhw = reciprocal(w, false, false, true);

			if(state.centroid)
			{
				rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive,w), false, false));
			}
		}

		for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
		{
			// Interpolate only the components flagged in the state's component bit mask.
			for(int component = 0; component < 4; component++)
			{
				if(state.interpolant[interpolant].component & (1 << component))
				{
					if(!state.interpolant[interpolant].centroid)
					{
						v[interpolant][component] = interpolate(xxxx, Dv[interpolant][component], rhw, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective, false);
					}
					else
					{
						v[interpolant][component] = interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
					}
				}
			}

			Float4 rcp;

			// Projection: divide the leading components by y, z or w respectively.
			switch(state.interpolant[interpolant].project)
			{
			case 0:
				break;
			case 1:
				rcp = reciprocal(v[interpolant].y);
				v[interpolant].x = v[interpolant].x * rcp;
				break;
			case 2:
				rcp = reciprocal(v[interpolant].z);
				v[interpolant].x = v[interpolant].x * rcp;
				v[interpolant].y = v[interpolant].y * rcp;
				break;
			case 3:
				rcp = reciprocal(v[interpolant].w);
				v[interpolant].x = v[interpolant].x * rcp;
				v[interpolant].y = v[interpolant].y * rcp;
				v[interpolant].z = v[interpolant].z * rcp;
				break;
			}
		}

		if(state.fog.component)
		{
			f = interpolate(xxxx, Df, rhw, primitive + OFFSET(Primitive,f), state.fog.flat & 0x01, state.perspective, false);
		}

		setBuiltins(x, y, z, w);

		#if PERF_PROFILE
			cycles[PERF_INTERP] += Ticks() - interpTime;
		#endif

		Bool alphaPass = true;

		if(colorUsed())
		{
			#if PERF_PROFILE
				Long shaderTime = Ticks();
			#endif

			applyShader(cMask);

			#if PERF_PROFILE
				cycles[PERF_SHADER] += Ticks() - shaderTime;
			#endif

			alphaPass = alphaTest(cMask);

			// When the shader can kill pixels or the alpha test is active, cMask may
			// have shrunk; fold it into the depth and stencil masks.
			if((shader && shader->containsKill()) || state.alphaTestActive())
			{
				for(unsigned int q = 0; q < state.multiSample; q++)
				{
					zMask[q] &= cMask[q];
					sMask[q] &= cMask[q];
				}
			}
		}

		If(alphaPass)
		{
			// Late depth test, for the cases where the early one was not permitted.
			if(!earlyDepthTest)
			{
				for(unsigned int q = 0; q < state.multiSample; q++)
				{
					depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
				}
			}

			#if PERF_PROFILE
				Long ropTime = Ticks();
			#endif

			If(depthPass || Bool(earlyDepthTest))
			{
				for(unsigned int q = 0; q < state.multiSample; q++)
				{
					// Only samples enabled in the multisample mask are written.
					if(state.multiSampleMask & (1 << q))
					{
						writeDepth(zBuffer, q, x, z[q], zMask[q]);

						if(state.occlusionEnabled)
						{
							// Add the Constants::occlusionCount table entry indexed by the
							// combined depth and stencil pass mask.
							occlusion += *Pointer<UInt>(constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q]));
						}
					}
				}

				if(colorUsed())
				{
					#if PERF_PROFILE
						AddAtomic(Pointer<Long>(&profiler.ropOperations), 4);
					#endif

					rasterOperation(f, cBuffer, x, sMask, zMask, cMask);
				}
			}

			#if PERF_PROFILE
				cycles[PERF_ROP] += Ticks() - ropTime;
			#endif
		}
	}

	// The stencil write is emitted outside the conditional blocks above, so it is
	// reached regardless of the depth/alpha results.
	for(unsigned int q = 0; q < state.multiSample; q++)
	{
		if(state.multiSampleMask & (1 << q))
		{
			writeStencil(sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
		}
	}

	#if PERF_PROFILE
		cycles[PERF_PIPE] += Ticks() - pipeTime;
	#endif
}
280 | |
281 | Float4 PixelRoutine::interpolateCentroid(Float4 &x, Float4 &y, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective) |
282 | { |
283 | Float4 interpolant = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,C), 16); |
284 | |
285 | if(!flat) |
286 | { |
287 | interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,A), 16) + |
288 | y * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,B), 16); |
289 | |
290 | if(perspective) |
291 | { |
292 | interpolant *= rhw; |
293 | } |
294 | } |
295 | |
296 | return interpolant; |
297 | } |
298 | |
299 | void PixelRoutine::stencilTest(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask) |
300 | { |
301 | if(!state.stencilActive) |
302 | { |
303 | return; |
304 | } |
305 | |
306 | // (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask) |
307 | |
308 | Pointer<Byte> buffer = sBuffer + 2 * x; |
309 | |
310 | if(q > 0) |
311 | { |
312 | buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB)); |
313 | } |
314 | |
315 | Byte8 value = *Pointer<Byte8>(buffer); |
316 | Byte8 valueCCW = value; |
317 | |
318 | if(!state.noStencilMask) |
319 | { |
320 | value &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].testMaskQ)); |
321 | } |
322 | |
323 | stencilTest(value, state.stencilCompareMode, false); |
324 | |
325 | if(state.twoSidedStencil) |
326 | { |
327 | if(!state.noStencilMaskCCW) |
328 | { |
329 | valueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].testMaskQ)); |
330 | } |
331 | |
332 | stencilTest(valueCCW, state.stencilCompareModeCCW, true); |
333 | |
334 | value &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask)); |
335 | valueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask)); |
336 | value |= valueCCW; |
337 | } |
338 | |
339 | sMask = SignMask(value) & cMask; |
340 | } |
341 | |
342 | void PixelRoutine::stencilTest(Byte8 &value, StencilCompareMode stencilCompareMode, bool CCW) |
343 | { |
344 | Byte8 equal; |
345 | |
346 | switch(stencilCompareMode) |
347 | { |
348 | case STENCIL_ALWAYS: |
349 | value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); |
350 | break; |
351 | case STENCIL_NEVER: |
352 | value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); |
353 | break; |
354 | case STENCIL_LESS: // a < b ~ b > a |
355 | value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); |
356 | value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ))); |
357 | break; |
358 | case STENCIL_EQUAL: |
359 | value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ))); |
360 | break; |
361 | case STENCIL_NOTEQUAL: // a != b ~ !(a == b) |
362 | value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ))); |
363 | value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); |
364 | break; |
365 | case STENCIL_LESSEQUAL: // a <= b ~ (b > a) || (a == b) |
366 | equal = value; |
367 | equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ))); |
368 | value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); |
369 | value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ))); |
370 | value |= equal; |
371 | break; |
372 | case STENCIL_GREATER: // a > b |
373 | equal = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)); |
374 | value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); |
375 | equal = CmpGT(As<SByte8>(equal), As<SByte8>(value)); |
376 | value = equal; |
377 | break; |
378 | case STENCIL_GREATEREQUAL: // a >= b ~ !(a < b) ~ !(b > a) |
379 | value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); |
380 | value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ))); |
381 | value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); |
382 | break; |
383 | default: |
384 | ASSERT(false); |
385 | } |
386 | } |
387 | |
388 | Bool PixelRoutine::depthTest(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask) |
389 | { |
390 | if(!state.depthTestActive) |
391 | { |
392 | return true; |
393 | } |
394 | |
395 | Float4 Z = z; |
396 | |
397 | if(shader && shader->depthOverride()) |
398 | { |
399 | if(complementaryDepthBuffer) |
400 | { |
401 | Z = Float4(1.0f) - oDepth; |
402 | } |
403 | else |
404 | { |
405 | Z = oDepth; |
406 | } |
407 | } |
408 | |
409 | Pointer<Byte> buffer; |
410 | Int pitch; |
411 | |
412 | if(!state.quadLayoutDepthBuffer) |
413 | { |
414 | buffer = zBuffer + 4 * x; |
415 | pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB)); |
416 | } |
417 | else |
418 | { |
419 | buffer = zBuffer + 8 * x; |
420 | } |
421 | |
422 | if(q > 0) |
423 | { |
424 | buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB)); |
425 | } |
426 | |
427 | Float4 zValue; |
428 | |
429 | if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable)) |
430 | { |
431 | if(!state.quadLayoutDepthBuffer) |
432 | { |
433 | // FIXME: Properly optimizes? |
434 | zValue.xy = *Pointer<Float4>(buffer); |
435 | zValue.zw = *Pointer<Float4>(buffer + pitch - 8); |
436 | } |
437 | else |
438 | { |
439 | zValue = *Pointer<Float4>(buffer, 16); |
440 | } |
441 | } |
442 | |
443 | Int4 zTest; |
444 | |
445 | switch(state.depthCompareMode) |
446 | { |
447 | case DEPTH_ALWAYS: |
448 | // Optimized |
449 | break; |
450 | case DEPTH_NEVER: |
451 | // Optimized |
452 | break; |
453 | case DEPTH_EQUAL: |
454 | zTest = CmpEQ(zValue, Z); |
455 | break; |
456 | case DEPTH_NOTEQUAL: |
457 | zTest = CmpNEQ(zValue, Z); |
458 | break; |
459 | case DEPTH_LESS: |
460 | if(complementaryDepthBuffer) |
461 | { |
462 | zTest = CmpLT(zValue, Z); |
463 | } |
464 | else |
465 | { |
466 | zTest = CmpNLE(zValue, Z); |
467 | } |
468 | break; |
469 | case DEPTH_GREATEREQUAL: |
470 | if(complementaryDepthBuffer) |
471 | { |
472 | zTest = CmpNLT(zValue, Z); |
473 | } |
474 | else |
475 | { |
476 | zTest = CmpLE(zValue, Z); |
477 | } |
478 | break; |
479 | case DEPTH_LESSEQUAL: |
480 | if(complementaryDepthBuffer) |
481 | { |
482 | zTest = CmpLE(zValue, Z); |
483 | } |
484 | else |
485 | { |
486 | zTest = CmpNLT(zValue, Z); |
487 | } |
488 | break; |
489 | case DEPTH_GREATER: |
490 | if(complementaryDepthBuffer) |
491 | { |
492 | zTest = CmpNLE(zValue, Z); |
493 | } |
494 | else |
495 | { |
496 | zTest = CmpLT(zValue, Z); |
497 | } |
498 | break; |
499 | default: |
500 | ASSERT(false); |
501 | } |
502 | |
503 | switch(state.depthCompareMode) |
504 | { |
505 | case DEPTH_ALWAYS: |
506 | zMask = cMask; |
507 | break; |
508 | case DEPTH_NEVER: |
509 | zMask = 0x0; |
510 | break; |
511 | default: |
512 | zMask = SignMask(zTest) & cMask; |
513 | break; |
514 | } |
515 | |
516 | if(state.stencilActive) |
517 | { |
518 | zMask &= sMask; |
519 | } |
520 | |
521 | return zMask != 0; |
522 | } |
523 | |
524 | void PixelRoutine::alphaTest(Int &aMask, Short4 &alpha) |
525 | { |
526 | Short4 cmp; |
527 | Short4 equal; |
528 | |
529 | switch(state.alphaCompareMode) |
530 | { |
531 | case ALPHA_ALWAYS: |
532 | aMask = 0xF; |
533 | break; |
534 | case ALPHA_NEVER: |
535 | aMask = 0x0; |
536 | break; |
537 | case ALPHA_EQUAL: |
538 | cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))); |
539 | aMask = SignMask(PackSigned(cmp, Short4(0x0000))); |
540 | break; |
541 | case ALPHA_NOTEQUAL: // a != b ~ !(a == b) |
542 | cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu); // FIXME |
543 | aMask = SignMask(PackSigned(cmp, Short4(0x0000))); |
544 | break; |
545 | case ALPHA_LESS: // a < b ~ b > a |
546 | cmp = CmpGT(*Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)), alpha); |
547 | aMask = SignMask(PackSigned(cmp, Short4(0x0000))); |
548 | break; |
549 | case ALPHA_GREATEREQUAL: // a >= b ~ (a > b) || (a == b) ~ !(b > a) // TODO: Approximate |
550 | equal = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))); |
551 | cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))); |
552 | cmp |= equal; |
553 | aMask = SignMask(PackSigned(cmp, Short4(0x0000))); |
554 | break; |
555 | case ALPHA_LESSEQUAL: // a <= b ~ !(a > b) |
556 | cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu); // FIXME |
557 | aMask = SignMask(PackSigned(cmp, Short4(0x0000))); |
558 | break; |
559 | case ALPHA_GREATER: // a > b |
560 | cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))); |
561 | aMask = SignMask(PackSigned(cmp, Short4(0x0000))); |
562 | break; |
563 | default: |
564 | ASSERT(false); |
565 | } |
566 | } |
567 | |
568 | void PixelRoutine::alphaToCoverage(Int cMask[4], Float4 &alpha) |
569 | { |
570 | Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c0))); |
571 | Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c1))); |
572 | Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c2))); |
573 | Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c3))); |
574 | |
575 | Int aMask0 = SignMask(coverage0); |
576 | Int aMask1 = SignMask(coverage1); |
577 | Int aMask2 = SignMask(coverage2); |
578 | Int aMask3 = SignMask(coverage3); |
579 | |
580 | cMask[0] &= aMask0; |
581 | cMask[1] &= aMask1; |
582 | cMask[2] &= aMask2; |
583 | cMask[3] &= aMask3; |
584 | } |
585 | |
586 | void PixelRoutine::fogBlend(Vector4f &c0, Float4 &fog) |
587 | { |
588 | if(!state.fogActive) |
589 | { |
590 | return; |
591 | } |
592 | |
593 | if(state.pixelFogMode != FOG_NONE) |
594 | { |
595 | pixelFog(fog); |
596 | |
597 | fog = Min(fog, Float4(1.0f)); |
598 | fog = Max(fog, Float4(0.0f)); |
599 | } |
600 | |
601 | c0.x -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0])); |
602 | c0.y -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1])); |
603 | c0.z -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2])); |
604 | |
605 | c0.x *= fog; |
606 | c0.y *= fog; |
607 | c0.z *= fog; |
608 | |
609 | c0.x += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0])); |
610 | c0.y += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1])); |
611 | c0.z += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2])); |
612 | } |
613 | |
614 | void PixelRoutine::pixelFog(Float4 &visibility) |
615 | { |
616 | Float4 &zw = visibility; |
617 | |
618 | if(state.pixelFogMode != FOG_NONE) |
619 | { |
620 | if(state.wBasedFog) |
621 | { |
622 | zw = rhw; |
623 | } |
624 | else |
625 | { |
626 | if(complementaryDepthBuffer) |
627 | { |
628 | zw = Float4(1.0f) - z[0]; |
629 | } |
630 | else |
631 | { |
632 | zw = z[0]; |
633 | } |
634 | } |
635 | } |
636 | |
637 | switch(state.pixelFogMode) |
638 | { |
639 | case FOG_NONE: |
640 | break; |
641 | case FOG_LINEAR: |
642 | zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.scale)); |
643 | zw += *Pointer<Float4>(data + OFFSET(DrawData,fog.offset)); |
644 | break; |
645 | case FOG_EXP: |
646 | zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.densityE)); |
647 | zw = exponential2(zw, true); |
648 | break; |
649 | case FOG_EXP2: |
650 | zw *= zw; |
651 | zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.density2E)); |
652 | zw = exponential2(zw, true); |
653 | break; |
654 | default: |
655 | ASSERT(false); |
656 | } |
657 | } |
658 | |
659 | void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask) |
660 | { |
661 | if(!state.depthWriteEnable) |
662 | { |
663 | return; |
664 | } |
665 | |
666 | Float4 Z = z; |
667 | |
668 | if(shader && shader->depthOverride()) |
669 | { |
670 | if(complementaryDepthBuffer) |
671 | { |
672 | Z = Float4(1.0f) - oDepth; |
673 | } |
674 | else |
675 | { |
676 | Z = oDepth; |
677 | } |
678 | } |
679 | |
680 | Pointer<Byte> buffer; |
681 | Int pitch; |
682 | |
683 | if(!state.quadLayoutDepthBuffer) |
684 | { |
685 | buffer = zBuffer + 4 * x; |
686 | pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB)); |
687 | } |
688 | else |
689 | { |
690 | buffer = zBuffer + 8 * x; |
691 | } |
692 | |
693 | if(q > 0) |
694 | { |
695 | buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB)); |
696 | } |
697 | |
698 | Float4 zValue; |
699 | |
700 | if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable)) |
701 | { |
702 | if(!state.quadLayoutDepthBuffer) |
703 | { |
704 | // FIXME: Properly optimizes? |
705 | zValue.xy = *Pointer<Float4>(buffer); |
706 | zValue.zw = *Pointer<Float4>(buffer + pitch - 8); |
707 | } |
708 | else |
709 | { |
710 | zValue = *Pointer<Float4>(buffer, 16); |
711 | } |
712 | } |
713 | |
714 | Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + zMask * 16, 16)); |
715 | zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16)); |
716 | Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue)); |
717 | |
718 | if(!state.quadLayoutDepthBuffer) |
719 | { |
720 | // FIXME: Properly optimizes? |
721 | *Pointer<Float2>(buffer) = Float2(Z.xy); |
722 | *Pointer<Float2>(buffer + pitch) = Float2(Z.zw); |
723 | } |
724 | else |
725 | { |
726 | *Pointer<Float4>(buffer, 16) = Z; |
727 | } |
728 | } |
729 | |
730 | void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask) |
731 | { |
732 | if(!state.stencilActive) |
733 | { |
734 | return; |
735 | } |
736 | |
737 | if(state.stencilPassOperation == OPERATION_KEEP && state.stencilZFailOperation == OPERATION_KEEP && state.stencilFailOperation == OPERATION_KEEP) |
738 | { |
739 | if(!state.twoSidedStencil || (state.stencilPassOperationCCW == OPERATION_KEEP && state.stencilZFailOperationCCW == OPERATION_KEEP && state.stencilFailOperationCCW == OPERATION_KEEP)) |
740 | { |
741 | return; |
742 | } |
743 | } |
744 | |
745 | if(state.stencilWriteMasked && (!state.twoSidedStencil || state.stencilWriteMaskedCCW)) |
746 | { |
747 | return; |
748 | } |
749 | |
750 | Pointer<Byte> buffer = sBuffer + 2 * x; |
751 | |
752 | if(q > 0) |
753 | { |
754 | buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB)); |
755 | } |
756 | |
757 | Byte8 bufferValue = *Pointer<Byte8>(buffer); |
758 | |
759 | Byte8 newValue; |
760 | stencilOperation(newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask); |
761 | |
762 | if(!state.noStencilWriteMask) |
763 | { |
764 | Byte8 maskedValue = bufferValue; |
765 | newValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].writeMaskQ)); |
766 | maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].invWriteMaskQ)); |
767 | newValue |= maskedValue; |
768 | } |
769 | |
770 | if(state.twoSidedStencil) |
771 | { |
772 | Byte8 newValueCCW; |
773 | |
774 | stencilOperation(newValueCCW, bufferValue, state.stencilPassOperationCCW, state.stencilZFailOperationCCW, state.stencilFailOperationCCW, true, zMask, sMask); |
775 | |
776 | if(!state.noStencilWriteMaskCCW) |
777 | { |
778 | Byte8 maskedValue = bufferValue; |
779 | newValueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].writeMaskQ)); |
780 | maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].invWriteMaskQ)); |
781 | newValueCCW |= maskedValue; |
782 | } |
783 | |
784 | newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask)); |
785 | newValueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask)); |
786 | newValue |= newValueCCW; |
787 | } |
788 | |
789 | newValue &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * cMask); |
790 | bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask); |
791 | newValue |= bufferValue; |
792 | |
793 | *Pointer<Byte4>(buffer) = Byte4(newValue); |
794 | } |
795 | |
796 | void PixelRoutine::stencilOperation(Byte8 &newValue, Byte8 &bufferValue, StencilOperation stencilPassOperation, StencilOperation stencilZFailOperation, StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask) |
797 | { |
798 | Byte8 &pass = newValue; |
799 | Byte8 fail; |
800 | Byte8 zFail; |
801 | |
802 | stencilOperation(pass, bufferValue, stencilPassOperation, CCW); |
803 | |
804 | if(stencilZFailOperation != stencilPassOperation) |
805 | { |
806 | stencilOperation(zFail, bufferValue, stencilZFailOperation, CCW); |
807 | } |
808 | |
809 | if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation) |
810 | { |
811 | stencilOperation(fail, bufferValue, stencilFailOperation, CCW); |
812 | } |
813 | |
814 | if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation) |
815 | { |
816 | if(state.depthTestActive && stencilZFailOperation != stencilPassOperation) // zMask valid and values not the same |
817 | { |
818 | pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * zMask); |
819 | zFail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask); |
820 | pass |= zFail; |
821 | } |
822 | |
823 | pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * sMask); |
824 | fail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask); |
825 | pass |= fail; |
826 | } |
827 | } |
828 | |
829 | void PixelRoutine::stencilOperation(Byte8 &output, Byte8 &bufferValue, StencilOperation operation, bool CCW) |
830 | { |
831 | switch(operation) |
832 | { |
833 | case OPERATION_KEEP: |
834 | output = bufferValue; |
835 | break; |
836 | case OPERATION_ZERO: |
837 | output = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); |
838 | break; |
839 | case OPERATION_REPLACE: |
840 | output = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceQ)); |
841 | break; |
842 | case OPERATION_INCRSAT: |
843 | output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1)); |
844 | break; |
845 | case OPERATION_DECRSAT: |
846 | output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1)); |
847 | break; |
848 | case OPERATION_INVERT: |
849 | output = bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); |
850 | break; |
851 | case OPERATION_INCR: |
852 | output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1); |
853 | break; |
854 | case OPERATION_DECR: |
855 | output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1); |
856 | break; |
857 | default: |
858 | ASSERT(false); |
859 | } |
860 | } |
861 | |
862 | void PixelRoutine::blendFactor(Vector4s &blendFactor, const Vector4s ¤t, const Vector4s &pixel, BlendFactor blendFactorActive) |
863 | { |
864 | switch(blendFactorActive) |
865 | { |
866 | case BLEND_ZERO: |
867 | // Optimized |
868 | break; |
869 | case BLEND_ONE: |
870 | // Optimized |
871 | break; |
872 | case BLEND_SOURCE: |
873 | blendFactor.x = current.x; |
874 | blendFactor.y = current.y; |
875 | blendFactor.z = current.z; |
876 | break; |
877 | case BLEND_INVSOURCE: |
878 | blendFactor.x = Short4(0xFFFFu) - current.x; |
879 | blendFactor.y = Short4(0xFFFFu) - current.y; |
880 | blendFactor.z = Short4(0xFFFFu) - current.z; |
881 | break; |
882 | case BLEND_DEST: |
883 | blendFactor.x = pixel.x; |
884 | blendFactor.y = pixel.y; |
885 | blendFactor.z = pixel.z; |
886 | break; |
887 | case BLEND_INVDEST: |
888 | blendFactor.x = Short4(0xFFFFu) - pixel.x; |
889 | blendFactor.y = Short4(0xFFFFu) - pixel.y; |
890 | blendFactor.z = Short4(0xFFFFu) - pixel.z; |
891 | break; |
892 | case BLEND_SOURCEALPHA: |
893 | blendFactor.x = current.w; |
894 | blendFactor.y = current.w; |
895 | blendFactor.z = current.w; |
896 | break; |
897 | case BLEND_INVSOURCEALPHA: |
898 | blendFactor.x = Short4(0xFFFFu) - current.w; |
899 | blendFactor.y = Short4(0xFFFFu) - current.w; |
900 | blendFactor.z = Short4(0xFFFFu) - current.w; |
901 | break; |
902 | case BLEND_DESTALPHA: |
903 | blendFactor.x = pixel.w; |
904 | blendFactor.y = pixel.w; |
905 | blendFactor.z = pixel.w; |
906 | break; |
907 | case BLEND_INVDESTALPHA: |
908 | blendFactor.x = Short4(0xFFFFu) - pixel.w; |
909 | blendFactor.y = Short4(0xFFFFu) - pixel.w; |
910 | blendFactor.z = Short4(0xFFFFu) - pixel.w; |
911 | break; |
912 | case BLEND_SRCALPHASAT: |
913 | blendFactor.x = Short4(0xFFFFu) - pixel.w; |
914 | blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w)); |
915 | blendFactor.y = blendFactor.x; |
916 | blendFactor.z = blendFactor.x; |
917 | break; |
918 | case BLEND_CONSTANT: |
919 | blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[0])); |
920 | blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[1])); |
921 | blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[2])); |
922 | break; |
923 | case BLEND_INVCONSTANT: |
924 | blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[0])); |
925 | blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[1])); |
926 | blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[2])); |
927 | break; |
928 | case BLEND_CONSTANTALPHA: |
929 | blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3])); |
930 | blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3])); |
931 | blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3])); |
932 | break; |
933 | case BLEND_INVCONSTANTALPHA: |
934 | blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3])); |
935 | blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3])); |
936 | blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3])); |
937 | break; |
938 | default: |
939 | ASSERT(false); |
940 | } |
941 | } |
942 | |
943 | void PixelRoutine::blendFactorAlpha(Vector4s &blendFactor, const Vector4s ¤t, const Vector4s &pixel, BlendFactor blendFactorAlphaActive) |
944 | { |
945 | switch(blendFactorAlphaActive) |
946 | { |
947 | case BLEND_ZERO: |
948 | // Optimized |
949 | break; |
950 | case BLEND_ONE: |
951 | // Optimized |
952 | break; |
953 | case BLEND_SOURCE: |
954 | blendFactor.w = current.w; |
955 | break; |
956 | case BLEND_INVSOURCE: |
957 | blendFactor.w = Short4(0xFFFFu) - current.w; |
958 | break; |
959 | case BLEND_DEST: |
960 | blendFactor.w = pixel.w; |
961 | break; |
962 | case BLEND_INVDEST: |
963 | blendFactor.w = Short4(0xFFFFu) - pixel.w; |
964 | break; |
965 | case BLEND_SOURCEALPHA: |
966 | blendFactor.w = current.w; |
967 | break; |
968 | case BLEND_INVSOURCEALPHA: |
969 | blendFactor.w = Short4(0xFFFFu) - current.w; |
970 | break; |
971 | case BLEND_DESTALPHA: |
972 | blendFactor.w = pixel.w; |
973 | break; |
974 | case BLEND_INVDESTALPHA: |
975 | blendFactor.w = Short4(0xFFFFu) - pixel.w; |
976 | break; |
977 | case BLEND_SRCALPHASAT: |
978 | blendFactor.w = Short4(0xFFFFu); |
979 | break; |
980 | case BLEND_CONSTANT: |
981 | case BLEND_CONSTANTALPHA: |
982 | blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3])); |
983 | break; |
984 | case BLEND_INVCONSTANT: |
985 | case BLEND_INVCONSTANTALPHA: |
986 | blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3])); |
987 | break; |
988 | default: |
989 | ASSERT(false); |
990 | } |
991 | } |
992 | |
// Reports whether the render target at the given index has a format that
// Surface::isSRGBformat() classifies as sRGB.
bool PixelRoutine::isSRGB(int index) const
{
	return Surface::isSRGBformat(state.targetFormat[index]);
}
997 | |
998 | void PixelRoutine::readPixel(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &pixel) |
999 | { |
1000 | Short4 c01; |
1001 | Short4 c23; |
1002 | Pointer<Byte> buffer; |
1003 | Pointer<Byte> buffer2; |
1004 | |
1005 | switch(state.targetFormat[index]) |
1006 | { |
1007 | case FORMAT_R5G6B5: |
1008 | buffer = cBuffer + 2 * x; |
1009 | buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); |
1010 | c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2))); |
1011 | |
1012 | pixel.x = c01 & Short4(0xF800u); |
1013 | pixel.y = (c01 & Short4(0x07E0u)) << 5; |
1014 | pixel.z = (c01 & Short4(0x001Fu)) << 11; |
1015 | pixel.w = Short4(0xFFFFu); |
1016 | break; |
1017 | case FORMAT_A8R8G8B8: |
1018 | buffer = cBuffer + 4 * x; |
1019 | c01 = *Pointer<Short4>(buffer); |
1020 | buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); |
1021 | c23 = *Pointer<Short4>(buffer); |
1022 | pixel.z = c01; |
1023 | pixel.y = c01; |
1024 | pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23)); |
1025 | pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23)); |
1026 | pixel.x = pixel.z; |
1027 | pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y)); |
1028 | pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y)); |
1029 | pixel.y = pixel.z; |
1030 | pixel.w = pixel.x; |
1031 | pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x)); |
1032 | pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y)); |
1033 | pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z)); |
1034 | pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w)); |
1035 | break; |
1036 | case FORMAT_A8B8G8R8: |
1037 | case FORMAT_SRGB8_A8: |
1038 | buffer = cBuffer + 4 * x; |
1039 | c01 = *Pointer<Short4>(buffer); |
1040 | buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); |
1041 | c23 = *Pointer<Short4>(buffer); |
1042 | pixel.z = c01; |
1043 | pixel.y = c01; |
1044 | pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23)); |
1045 | pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23)); |
1046 | pixel.x = pixel.z; |
1047 | pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y)); |
1048 | pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y)); |
1049 | pixel.y = pixel.z; |
1050 | pixel.w = pixel.x; |
1051 | pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z)); |
1052 | pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y)); |
1053 | pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w)); |
1054 | pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w)); |
1055 | break; |
1056 | case FORMAT_A8: |
1057 | buffer = cBuffer + 1 * x; |
1058 | pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 0); |
1059 | buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); |
1060 | pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 1); |
1061 | pixel.w = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w)); |
1062 | pixel.x = Short4(0x0000); |
1063 | pixel.y = Short4(0x0000); |
1064 | pixel.z = Short4(0x0000); |
1065 | break; |
1066 | case FORMAT_R8: |
1067 | buffer = cBuffer + 1 * x; |
1068 | pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 0); |
1069 | buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); |
1070 | pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 1); |
1071 | pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x)); |
1072 | pixel.y = Short4(0x0000); |
1073 | pixel.z = Short4(0x0000); |
1074 | pixel.w = Short4(0xFFFFu); |
1075 | break; |
1076 | case FORMAT_X8R8G8B8: |
1077 | buffer = cBuffer + 4 * x; |
1078 | c01 = *Pointer<Short4>(buffer); |
1079 | buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); |
1080 | c23 = *Pointer<Short4>(buffer); |
1081 | pixel.z = c01; |
1082 | pixel.y = c01; |
1083 | pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23)); |
1084 | pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23)); |
1085 | pixel.x = pixel.z; |
1086 | pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y)); |
1087 | pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y)); |
1088 | pixel.y = pixel.z; |
1089 | pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x)); |
1090 | pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y)); |
1091 | pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z)); |
1092 | pixel.w = Short4(0xFFFFu); |
1093 | break; |
1094 | case FORMAT_G8R8: |
1095 | buffer = cBuffer + 2 * x; |
1096 | c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 0)); |
1097 | buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); |
1098 | c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 1)); |
1099 | pixel.x = (c01 & Short4(0x00FFu)) | (c01 << 8); |
1100 | pixel.y = (c01 & Short4(0xFF00u)) | As<Short4>(As<UShort4>(c01) >> 8); |
1101 | pixel.z = Short4(0x0000u); |
1102 | pixel.w = Short4(0xFFFFu); |
1103 | break; |
1104 | case FORMAT_X8B8G8R8: |
1105 | case FORMAT_SRGB8_X8: |
1106 | buffer = cBuffer + 4 * x; |
1107 | c01 = *Pointer<Short4>(buffer); |
1108 | buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); |
1109 | c23 = *Pointer<Short4>(buffer); |
1110 | pixel.z = c01; |
1111 | pixel.y = c01; |
1112 | pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23)); |
1113 | pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23)); |
1114 | pixel.x = pixel.z; |
1115 | pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y)); |
1116 | pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y)); |
1117 | pixel.y = pixel.z; |
1118 | pixel.w = pixel.x; |
1119 | pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z)); |
1120 | pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y)); |
1121 | pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w)); |
1122 | pixel.w = Short4(0xFFFFu); |
1123 | break; |
1124 | case FORMAT_A8G8R8B8Q: |
1125 | UNIMPLEMENTED(); |
1126 | // pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0)); |
1127 | // pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0)); |
1128 | // pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8)); |
1129 | // pixel.w = UnpackHigh(As<Byte8>(pixel.w), *Pointer<Byte8>(cBuffer + 8 * x + 8)); |
1130 | break; |
1131 | case FORMAT_X8G8R8B8Q: |
1132 | UNIMPLEMENTED(); |
1133 | // pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0)); |
1134 | // pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0)); |
1135 | // pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8)); |
1136 | // pixel.w = Short4(0xFFFFu); |
1137 | break; |
1138 | case FORMAT_A16B16G16R16: |
1139 | buffer = cBuffer; |
1140 | pixel.x = *Pointer<Short4>(buffer + 8 * x); |
1141 | pixel.y = *Pointer<Short4>(buffer + 8 * x + 8); |
1142 | buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); |
1143 | pixel.z = *Pointer<Short4>(buffer + 8 * x); |
1144 | pixel.w = *Pointer<Short4>(buffer + 8 * x + 8); |
1145 | transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w); |
1146 | break; |
1147 | case FORMAT_G16R16: |
1148 | buffer = cBuffer; |
1149 | pixel.x = *Pointer<Short4>(buffer + 4 * x); |
1150 | buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); |
1151 | pixel.y = *Pointer<Short4>(buffer + 4 * x); |
1152 | pixel.z = pixel.x; |
1153 | pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y)); |
1154 | pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y)); |
1155 | pixel.y = pixel.z; |
1156 | pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z)); |
1157 | pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z)); |
1158 | pixel.z = Short4(0xFFFFu); |
1159 | pixel.w = Short4(0xFFFFu); |
1160 | break; |
1161 | default: |
1162 | ASSERT(false); |
1163 | } |
1164 | |
1165 | if((postBlendSRGB && state.writeSRGB) || isSRGB(index)) |
1166 | { |
1167 | sRGBtoLinear16_12_16(pixel); |
1168 | } |
1169 | } |
1170 | |
1171 | void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4s ¤t, Int &x) |
1172 | { |
1173 | if(!state.alphaBlendActive) |
1174 | { |
1175 | return; |
1176 | } |
1177 | |
1178 | Vector4s pixel; |
1179 | readPixel(index, cBuffer, x, pixel); |
1180 | |
1181 | // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor |
1182 | Vector4s sourceFactor; |
1183 | Vector4s destFactor; |
1184 | |
1185 | blendFactor(sourceFactor, current, pixel, state.sourceBlendFactor); |
1186 | blendFactor(destFactor, current, pixel, state.destBlendFactor); |
1187 | |
1188 | if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO) |
1189 | { |
1190 | current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x)); |
1191 | current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y)); |
1192 | current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z)); |
1193 | } |
1194 | |
1195 | if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO) |
1196 | { |
1197 | pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x)); |
1198 | pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y)); |
1199 | pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z)); |
1200 | } |
1201 | |
1202 | switch(state.blendOperation) |
1203 | { |
1204 | case BLENDOP_ADD: |
1205 | current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x)); |
1206 | current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y)); |
1207 | current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z)); |
1208 | break; |
1209 | case BLENDOP_SUB: |
1210 | current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x)); |
1211 | current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y)); |
1212 | current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z)); |
1213 | break; |
1214 | case BLENDOP_INVSUB: |
1215 | current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x)); |
1216 | current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y)); |
1217 | current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z)); |
1218 | break; |
1219 | case BLENDOP_MIN: |
1220 | current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x)); |
1221 | current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y)); |
1222 | current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z)); |
1223 | break; |
1224 | case BLENDOP_MAX: |
1225 | current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x)); |
1226 | current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y)); |
1227 | current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z)); |
1228 | break; |
1229 | case BLENDOP_SOURCE: |
1230 | // No operation |
1231 | break; |
1232 | case BLENDOP_DEST: |
1233 | current.x = pixel.x; |
1234 | current.y = pixel.y; |
1235 | current.z = pixel.z; |
1236 | break; |
1237 | case BLENDOP_NULL: |
1238 | current.x = Short4(0x0000); |
1239 | current.y = Short4(0x0000); |
1240 | current.z = Short4(0x0000); |
1241 | break; |
1242 | default: |
1243 | ASSERT(false); |
1244 | } |
1245 | |
1246 | blendFactorAlpha(sourceFactor, current, pixel, state.sourceBlendFactorAlpha); |
1247 | blendFactorAlpha(destFactor, current, pixel, state.destBlendFactorAlpha); |
1248 | |
1249 | if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO) |
1250 | { |
1251 | current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w)); |
1252 | } |
1253 | |
1254 | if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO) |
1255 | { |
1256 | pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w)); |
1257 | } |
1258 | |
1259 | switch(state.blendOperationAlpha) |
1260 | { |
1261 | case BLENDOP_ADD: |
1262 | current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w)); |
1263 | break; |
1264 | case BLENDOP_SUB: |
1265 | current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w)); |
1266 | break; |
1267 | case BLENDOP_INVSUB: |
1268 | current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w)); |
1269 | break; |
1270 | case BLENDOP_MIN: |
1271 | current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w)); |
1272 | break; |
1273 | case BLENDOP_MAX: |
1274 | current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w)); |
1275 | break; |
1276 | case BLENDOP_SOURCE: |
1277 | // No operation |
1278 | break; |
1279 | case BLENDOP_DEST: |
1280 | current.w = pixel.w; |
1281 | break; |
1282 | case BLENDOP_NULL: |
1283 | current.w = Short4(0x0000); |
1284 | break; |
1285 | default: |
1286 | ASSERT(false); |
1287 | } |
1288 | } |
1289 | |
1290 | void PixelRoutine::logicOperation(int index, Pointer<Byte> &cBuffer, Vector4s ¤t, Int &x) |
1291 | { |
1292 | if(state.logicalOperation == LOGICALOP_COPY) |
1293 | { |
1294 | return; |
1295 | } |
1296 | |
1297 | Vector4s pixel; |
1298 | readPixel(index, cBuffer, x, pixel); |
1299 | |
1300 | switch(state.logicalOperation) |
1301 | { |
1302 | case LOGICALOP_CLEAR: |
1303 | current.x = UShort4(0); |
1304 | current.y = UShort4(0); |
1305 | current.z = UShort4(0); |
1306 | break; |
1307 | case LOGICALOP_SET: |
1308 | current.x = UShort4(0xFFFFu); |
1309 | current.y = UShort4(0xFFFFu); |
1310 | current.z = UShort4(0xFFFFu); |
1311 | break; |
1312 | case LOGICALOP_COPY: |
1313 | ASSERT(false); // Optimized out |
1314 | break; |
1315 | case LOGICALOP_COPY_INVERTED: |
1316 | current.x = ~current.x; |
1317 | current.y = ~current.y; |
1318 | current.z = ~current.z; |
1319 | break; |
1320 | case LOGICALOP_NOOP: |
1321 | current.x = pixel.x; |
1322 | current.y = pixel.y; |
1323 | current.z = pixel.z; |
1324 | break; |
1325 | case LOGICALOP_INVERT: |
1326 | current.x = ~pixel.x; |
1327 | current.y = ~pixel.y; |
1328 | current.z = ~pixel.z; |
1329 | break; |
1330 | case LOGICALOP_AND: |
1331 | current.x = pixel.x & current.x; |
1332 | current.y = pixel.y & current.y; |
1333 | current.z = pixel.z & current.z; |
1334 | break; |
1335 | case LOGICALOP_NAND: |
1336 | current.x = ~(pixel.x & current.x); |
1337 | current.y = ~(pixel.y & current.y); |
1338 | current.z = ~(pixel.z & current.z); |
1339 | break; |
1340 | case LOGICALOP_OR: |
1341 | current.x = pixel.x | current.x; |
1342 | current.y = pixel.y | current.y; |
1343 | current.z = pixel.z | current.z; |
1344 | break; |
1345 | case LOGICALOP_NOR: |
1346 | current.x = ~(pixel.x | current.x); |
1347 | current.y = ~(pixel.y | current.y); |
1348 | current.z = ~(pixel.z | current.z); |
1349 | break; |
1350 | case LOGICALOP_XOR: |
1351 | current.x = pixel.x ^ current.x; |
1352 | current.y = pixel.y ^ current.y; |
1353 | current.z = pixel.z ^ current.z; |
1354 | break; |
1355 | case LOGICALOP_EQUIV: |
1356 | current.x = ~(pixel.x ^ current.x); |
1357 | current.y = ~(pixel.y ^ current.y); |
1358 | current.z = ~(pixel.z ^ current.z); |
1359 | break; |
1360 | case LOGICALOP_AND_REVERSE: |
1361 | current.x = ~pixel.x & current.x; |
1362 | current.y = ~pixel.y & current.y; |
1363 | current.z = ~pixel.z & current.z; |
1364 | break; |
1365 | case LOGICALOP_AND_INVERTED: |
1366 | current.x = pixel.x & ~current.x; |
1367 | current.y = pixel.y & ~current.y; |
1368 | current.z = pixel.z & ~current.z; |
1369 | break; |
1370 | case LOGICALOP_OR_REVERSE: |
1371 | current.x = ~pixel.x | current.x; |
1372 | current.y = ~pixel.y | current.y; |
1373 | current.z = ~pixel.z | current.z; |
1374 | break; |
1375 | case LOGICALOP_OR_INVERTED: |
1376 | current.x = pixel.x | ~current.x; |
1377 | current.y = pixel.y | ~current.y; |
1378 | current.z = pixel.z | ~current.z; |
1379 | break; |
1380 | default: |
1381 | ASSERT(false); |
1382 | } |
1383 | } |
1384 | |
1385 | void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s ¤t, Int &sMask, Int &zMask, Int &cMask) |
1386 | { |
1387 | if((postBlendSRGB && state.writeSRGB) || isSRGB(index)) |
1388 | { |
1389 | linearToSRGB16_12_16(current); |
1390 | } |
1391 | |
1392 | if(exactColorRounding) |
1393 | { |
1394 | switch(state.targetFormat[index]) |
1395 | { |
1396 | case FORMAT_R5G6B5: |
1397 | current.x = AddSat(As<UShort4>(current.x), UShort4(0x0400)); |
1398 | current.y = AddSat(As<UShort4>(current.y), UShort4(0x0200)); |
1399 | current.z = AddSat(As<UShort4>(current.z), UShort4(0x0400)); |
1400 | break; |
1401 | case FORMAT_X8G8R8B8Q: |
1402 | case FORMAT_A8G8R8B8Q: |
1403 | case FORMAT_X8R8G8B8: |
1404 | case FORMAT_X8B8G8R8: |
1405 | case FORMAT_A8R8G8B8: |
1406 | case FORMAT_A8B8G8R8: |
1407 | case FORMAT_SRGB8_X8: |
1408 | case FORMAT_SRGB8_A8: |
1409 | case FORMAT_G8R8: |
1410 | case FORMAT_R8: |
1411 | current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080); |
1412 | current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080); |
1413 | current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080); |
1414 | current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080); |
1415 | break; |
1416 | default: |
1417 | break; |
1418 | } |
1419 | } |
1420 | |
1421 | int rgbaWriteMask = state.colorWriteActive(index); |
1422 | int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2; |
1423 | |
1424 | switch(state.targetFormat[index]) |
1425 | { |
1426 | case FORMAT_R5G6B5: |
1427 | { |
1428 | current.x = current.x & Short4(0xF800u); |
1429 | current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5; |
1430 | current.z = As<UShort4>(current.z) >> 11; |
1431 | |
1432 | current.x = current.x | current.y | current.z; |
1433 | } |
1434 | break; |
1435 | case FORMAT_X8G8R8B8Q: |
1436 | UNIMPLEMENTED(); |
1437 | // current.x = As<Short4>(As<UShort4>(current.x) >> 8); |
1438 | // current.y = As<Short4>(As<UShort4>(current.y) >> 8); |
1439 | // current.z = As<Short4>(As<UShort4>(current.z) >> 8); |
1440 | |
1441 | // current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x))); |
1442 | // current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y))); |
1443 | break; |
1444 | case FORMAT_A8G8R8B8Q: |
1445 | UNIMPLEMENTED(); |
1446 | // current.x = As<Short4>(As<UShort4>(current.x) >> 8); |
1447 | // current.y = As<Short4>(As<UShort4>(current.y) >> 8); |
1448 | // current.z = As<Short4>(As<UShort4>(current.z) >> 8); |
1449 | // current.w = As<Short4>(As<UShort4>(current.w) >> 8); |
1450 | |
1451 | // current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x))); |
1452 | // current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w))); |
1453 | break; |
1454 | case FORMAT_X8R8G8B8: |
1455 | case FORMAT_A8R8G8B8: |
1456 | if(state.targetFormat[index] == FORMAT_X8R8G8B8 || rgbaWriteMask == 0x7) |
1457 | { |
1458 | current.x = As<Short4>(As<UShort4>(current.x) >> 8); |
1459 | current.y = As<Short4>(As<UShort4>(current.y) >> 8); |
1460 | current.z = As<Short4>(As<UShort4>(current.z) >> 8); |
1461 | |
1462 | current.z = As<Short4>(PackUnsigned(current.z, current.x)); |
1463 | current.y = As<Short4>(PackUnsigned(current.y, current.y)); |
1464 | |
1465 | current.x = current.z; |
1466 | current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y)); |
1467 | current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y)); |
1468 | current.y = current.z; |
1469 | current.z = As<Short4>(UnpackLow(current.z, current.x)); |
1470 | current.y = As<Short4>(UnpackHigh(current.y, current.x)); |
1471 | } |
1472 | else |
1473 | { |
1474 | current.x = As<Short4>(As<UShort4>(current.x) >> 8); |
1475 | current.y = As<Short4>(As<UShort4>(current.y) >> 8); |
1476 | current.z = As<Short4>(As<UShort4>(current.z) >> 8); |
1477 | current.w = As<Short4>(As<UShort4>(current.w) >> 8); |
1478 | |
1479 | current.z = As<Short4>(PackUnsigned(current.z, current.x)); |
1480 | current.y = As<Short4>(PackUnsigned(current.y, current.w)); |
1481 | |
1482 | current.x = current.z; |
1483 | current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y)); |
1484 | current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y)); |
1485 | current.y = current.z; |
1486 | current.z = As<Short4>(UnpackLow(current.z, current.x)); |
1487 | current.y = As<Short4>(UnpackHigh(current.y, current.x)); |
1488 | } |
1489 | break; |
1490 | case FORMAT_X8B8G8R8: |
1491 | case FORMAT_A8B8G8R8: |
1492 | case FORMAT_SRGB8_X8: |
1493 | case FORMAT_SRGB8_A8: |
1494 | if(state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8 || rgbaWriteMask == 0x7) |
1495 | { |
1496 | current.x = As<Short4>(As<UShort4>(current.x) >> 8); |
1497 | current.y = As<Short4>(As<UShort4>(current.y) >> 8); |
1498 | current.z = As<Short4>(As<UShort4>(current.z) >> 8); |
1499 | |
1500 | current.z = As<Short4>(PackUnsigned(current.x, current.z)); |
1501 | current.y = As<Short4>(PackUnsigned(current.y, current.y)); |
1502 | |
1503 | current.x = current.z; |
1504 | current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y)); |
1505 | current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y)); |
1506 | current.y = current.z; |
1507 | current.z = As<Short4>(UnpackLow(current.z, current.x)); |
1508 | current.y = As<Short4>(UnpackHigh(current.y, current.x)); |
1509 | } |
1510 | else |
1511 | { |
1512 | current.x = As<Short4>(As<UShort4>(current.x) >> 8); |
1513 | current.y = As<Short4>(As<UShort4>(current.y) >> 8); |
1514 | current.z = As<Short4>(As<UShort4>(current.z) >> 8); |
1515 | current.w = As<Short4>(As<UShort4>(current.w) >> 8); |
1516 | |
1517 | current.z = As<Short4>(PackUnsigned(current.x, current.z)); |
1518 | current.y = As<Short4>(PackUnsigned(current.y, current.w)); |
1519 | |
1520 | current.x = current.z; |
1521 | current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y)); |
1522 | current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y)); |
1523 | current.y = current.z; |
1524 | current.z = As<Short4>(UnpackLow(current.z, current.x)); |
1525 | current.y = As<Short4>(UnpackHigh(current.y, current.x)); |
1526 | } |
1527 | break; |
1528 | case FORMAT_G8R8: |
1529 | current.x = As<Short4>(As<UShort4>(current.x) >> 8); |
1530 | current.y = As<Short4>(As<UShort4>(current.y) >> 8); |
1531 | current.x = As<Short4>(PackUnsigned(current.x, current.x)); |
1532 | current.y = As<Short4>(PackUnsigned(current.y, current.y)); |
1533 | current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y)); |
1534 | break; |
1535 | case FORMAT_R8: |
1536 | current.x = As<Short4>(As<UShort4>(current.x) >> 8); |
1537 | current.x = As<Short4>(PackUnsigned(current.x, current.x)); |
1538 | break; |
1539 | case FORMAT_A8: |
1540 | current.w = As<Short4>(As<UShort4>(current.w) >> 8); |
1541 | current.w = As<Short4>(PackUnsigned(current.w, current.w)); |
1542 | break; |
1543 | case FORMAT_G16R16: |
1544 | current.z = current.x; |
1545 | current.x = As<Short4>(UnpackLow(current.x, current.y)); |
1546 | current.z = As<Short4>(UnpackHigh(current.z, current.y)); |
1547 | current.y = current.z; |
1548 | break; |
1549 | case FORMAT_A16B16G16R16: |
1550 | transpose4x4(current.x, current.y, current.z, current.w); |
1551 | break; |
1552 | default: |
1553 | ASSERT(false); |
1554 | } |
1555 | |
1556 | Short4 c01 = current.z; |
1557 | Short4 c23 = current.y; |
1558 | |
1559 | Int xMask; // Combination of all masks |
1560 | |
1561 | if(state.depthTestActive) |
1562 | { |
1563 | xMask = zMask; |
1564 | } |
1565 | else |
1566 | { |
1567 | xMask = cMask; |
1568 | } |
1569 | |
1570 | if(state.stencilActive) |
1571 | { |
1572 | xMask &= sMask; |
1573 | } |
1574 | |
1575 | switch(state.targetFormat[index]) |
1576 | { |
1577 | case FORMAT_R5G6B5: |
1578 | { |
1579 | Pointer<Byte> buffer = cBuffer + 2 * x; |
1580 | Int value = *Pointer<Int>(buffer); |
1581 | |
1582 | Int c01 = Extract(As<Int2>(current.x), 0); |
1583 | |
1584 | if((bgraWriteMask & 0x00000007) != 0x00000007) |
1585 | { |
1586 | Int masked = value; |
1587 | c01 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0])); |
1588 | masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0])); |
1589 | c01 |= masked; |
1590 | } |
1591 | |
1592 | c01 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8); |
1593 | value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][0]) + xMask * 8); |
1594 | c01 |= value; |
1595 | *Pointer<Int>(buffer) = c01; |
1596 | |
1597 | buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); |
1598 | value = *Pointer<Int>(buffer); |
1599 | |
1600 | Int c23 = Extract(As<Int2>(current.x), 1); |
1601 | |
1602 | if((bgraWriteMask & 0x00000007) != 0x00000007) |
1603 | { |
1604 | Int masked = value; |
1605 | c23 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0])); |
1606 | masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0])); |
1607 | c23 |= masked; |
1608 | } |
1609 | |
1610 | c23 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8); |
1611 | value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][2]) + xMask * 8); |
1612 | c23 |= value; |
1613 | *Pointer<Int>(buffer) = c23; |
1614 | } |
1615 | break; |
1616 | case FORMAT_A8G8R8B8Q: |
1617 | case FORMAT_X8G8R8B8Q: // FIXME: Don't touch alpha? |
1618 | UNIMPLEMENTED(); |
1619 | // value = *Pointer<Short4>(cBuffer + 8 * x + 0); |
1620 | |
1621 | // if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) || |
1622 | // ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) && |
1623 | // (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh? |
1624 | // { |
1625 | // Short4 masked = value; |
1626 | // c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0])); |
1627 | // masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0])); |
1628 | // c01 |= masked; |
1629 | // } |
1630 | |
1631 | // c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8); |
1632 | // value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8); |
1633 | // c01 |= value; |
1634 | // *Pointer<Short4>(cBuffer + 8 * x + 0) = c01; |
1635 | |
1636 | // value = *Pointer<Short4>(cBuffer + 8 * x + 8); |
1637 | |
1638 | // if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) || |
1639 | // ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) && |
1640 | // (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh? |
1641 | // { |
1642 | // Short4 masked = value; |
1643 | // c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0])); |
1644 | // masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0])); |
1645 | // c23 |= masked; |
1646 | // } |
1647 | |
1648 | // c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8); |
1649 | // value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8); |
1650 | // c23 |= value; |
1651 | // *Pointer<Short4>(cBuffer + 8 * x + 8) = c23; |
1652 | break; |
1653 | case FORMAT_A8R8G8B8: |
1654 | case FORMAT_X8R8G8B8: // FIXME: Don't touch alpha? |
1655 | { |
1656 | Pointer<Byte> buffer = cBuffer + x * 4; |
1657 | Short4 value = *Pointer<Short4>(buffer); |
1658 | |
1659 | if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) || |
1660 | ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) && |
1661 | (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh? |
1662 | { |
1663 | Short4 masked = value; |
1664 | c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0])); |
1665 | masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0])); |
1666 | c01 |= masked; |
1667 | } |
1668 | |
1669 | c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8); |
1670 | value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8); |
1671 | c01 |= value; |
1672 | *Pointer<Short4>(buffer) = c01; |
1673 | |
1674 | buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); |
1675 | value = *Pointer<Short4>(buffer); |
1676 | |
1677 | if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) || |
1678 | ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) && |
1679 | (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh? |
1680 | { |
1681 | Short4 masked = value; |
1682 | c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0])); |
1683 | masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0])); |
1684 | c23 |= masked; |
1685 | } |
1686 | |
1687 | #ifdef __APPLE__ |
1688 | // On Mac we render directly to an IOSurface that isn't vertically padded. So we |
1689 | // only render the bottom half of quads when it won't overflow the buffer. |
1690 | If ((y + 1) < yMax) |
1691 | #endif |
1692 | { |
1693 | c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8); |
1694 | value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8); |
1695 | c23 |= value; |
1696 | *Pointer<Short4>(buffer) = c23; |
1697 | } |
1698 | } |
1699 | break; |
1700 | case FORMAT_A8B8G8R8: |
1701 | case FORMAT_X8B8G8R8: // FIXME: Don't touch alpha? |
1702 | case FORMAT_SRGB8_X8: |
1703 | case FORMAT_SRGB8_A8: |
1704 | { |
1705 | Pointer<Byte> buffer = cBuffer + x * 4; |
1706 | Short4 value = *Pointer<Short4>(buffer); |
1707 | |
1708 | bool masked = (((state.targetFormat[index] == FORMAT_A8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_A8) && rgbaWriteMask != 0x0000000F) || |
1709 | (((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x00000007) && |
1710 | ((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x0000000F))); // FIXME: Need for masking when XBGR && Fh? |
1711 | |
1712 | if(masked) |
1713 | { |
1714 | Short4 masked = value; |
1715 | c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0])); |
1716 | masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0])); |
1717 | c01 |= masked; |
1718 | } |
1719 | |
1720 | c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8); |
1721 | value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8); |
1722 | c01 |= value; |
1723 | *Pointer<Short4>(buffer) = c01; |
1724 | |
1725 | buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); |
1726 | value = *Pointer<Short4>(buffer); |
1727 | |
1728 | if(masked) |
1729 | { |
1730 | Short4 masked = value; |
1731 | c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0])); |
1732 | masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0])); |
1733 | c23 |= masked; |
1734 | } |
1735 | |
1736 | c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8); |
1737 | value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8); |
1738 | c23 |= value; |
1739 | *Pointer<Short4>(buffer) = c23; |
1740 | } |
1741 | break; |
1742 | case FORMAT_G8R8: |
1743 | if((rgbaWriteMask & 0x00000003) != 0x0) |
1744 | { |
1745 | Pointer<Byte> buffer = cBuffer + 2 * x; |
1746 | Int2 value; |
1747 | value = Insert(value, *Pointer<Int>(buffer), 0); |
1748 | Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); |
1749 | value = Insert(value, *Pointer<Int>(buffer + pitch), 1); |
1750 | |
1751 | Int2 packedCol = As<Int2>(current.x); |
1752 | |
1753 | UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8); |
1754 | if((rgbaWriteMask & 0x3) != 0x3) |
1755 | { |
1756 | Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0])); |
1757 | UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask)); |
1758 | mergedMask &= rgbaMask; |
1759 | } |
1760 | |
1761 | packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask)); |
1762 | |
1763 | *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0)); |
1764 | *Pointer<UInt>(buffer + pitch) = As<UInt>(Extract(packedCol, 1)); |
1765 | } |
1766 | break; |
1767 | case FORMAT_R8: |
1768 | if(rgbaWriteMask & 0x00000001) |
1769 | { |
1770 | Pointer<Byte> buffer = cBuffer + 1 * x; |
1771 | Short4 value; |
1772 | value = Insert(value, *Pointer<Short>(buffer), 0); |
1773 | Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); |
1774 | value = Insert(value, *Pointer<Short>(buffer + pitch), 1); |
1775 | |
1776 | current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask); |
1777 | value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask); |
1778 | current.x |= value; |
1779 | |
1780 | *Pointer<Short>(buffer) = Extract(current.x, 0); |
1781 | *Pointer<Short>(buffer + pitch) = Extract(current.x, 1); |
1782 | } |
1783 | break; |
1784 | case FORMAT_A8: |
1785 | if(rgbaWriteMask & 0x00000008) |
1786 | { |
1787 | Pointer<Byte> buffer = cBuffer + 1 * x; |
1788 | Short4 value; |
1789 | value = Insert(value, *Pointer<Short>(buffer), 0); |
1790 | Int pitch = *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); |
1791 | value = Insert(value, *Pointer<Short>(buffer + pitch), 1); |
1792 | |
1793 | current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q) + 8 * xMask); |
1794 | value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q) + 8 * xMask); |
1795 | current.w |= value; |
1796 | |
1797 | *Pointer<Short>(buffer) = Extract(current.w, 0); |
1798 | *Pointer<Short>(buffer + pitch) = Extract(current.w, 1); |
1799 | } |
1800 | break; |
1801 | case FORMAT_G16R16: |
1802 | { |
1803 | Pointer<Byte> buffer = cBuffer + 4 * x; |
1804 | |
1805 | Short4 value = *Pointer<Short4>(buffer); |
1806 | |
1807 | if((rgbaWriteMask & 0x00000003) != 0x00000003) |
1808 | { |
1809 | Short4 masked = value; |
1810 | current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0])); |
1811 | masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0])); |
1812 | current.x |= masked; |
1813 | } |
1814 | |
1815 | current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8); |
1816 | value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8); |
1817 | current.x |= value; |
1818 | *Pointer<Short4>(buffer) = current.x; |
1819 | |
1820 | buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); |
1821 | |
1822 | value = *Pointer<Short4>(buffer); |
1823 | |
1824 | if((rgbaWriteMask & 0x00000003) != 0x00000003) |
1825 | { |
1826 | Short4 masked = value; |
1827 | current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0])); |
1828 | masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0])); |
1829 | current.y |= masked; |
1830 | } |
1831 | |
1832 | current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8); |
1833 | value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8); |
1834 | current.y |= value; |
1835 | *Pointer<Short4>(buffer) = current.y; |
1836 | } |
1837 | break; |
1838 | case FORMAT_A16B16G16R16: |
1839 | { |
1840 | Pointer<Byte> buffer = cBuffer + 8 * x; |
1841 | |
1842 | { |
1843 | Short4 value = *Pointer<Short4>(buffer); |
1844 | |
1845 | if(rgbaWriteMask != 0x0000000F) |
1846 | { |
1847 | Short4 masked = value; |
1848 | current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0])); |
1849 | masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0])); |
1850 | current.x |= masked; |
1851 | } |
1852 | |
1853 | current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ0Q) + xMask * 8); |
1854 | value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8); |
1855 | current.x |= value; |
1856 | *Pointer<Short4>(buffer) = current.x; |
1857 | } |
1858 | |
1859 | { |
1860 | Short4 value = *Pointer<Short4>(buffer + 8); |
1861 | |
1862 | if(rgbaWriteMask != 0x0000000F) |
1863 | { |
1864 | Short4 masked = value; |
1865 | current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0])); |
1866 | masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0])); |
1867 | current.y |= masked; |
1868 | } |
1869 | |
1870 | current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ1Q) + xMask * 8); |
1871 | value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8); |
1872 | current.y |= value; |
1873 | *Pointer<Short4>(buffer + 8) = current.y; |
1874 | } |
1875 | |
1876 | buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); |
1877 | |
1878 | { |
1879 | Short4 value = *Pointer<Short4>(buffer); |
1880 | |
1881 | if(rgbaWriteMask != 0x0000000F) |
1882 | { |
1883 | Short4 masked = value; |
1884 | current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0])); |
1885 | masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0])); |
1886 | current.z |= masked; |
1887 | } |
1888 | |
1889 | current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ2Q) + xMask * 8); |
1890 | value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8); |
1891 | current.z |= value; |
1892 | *Pointer<Short4>(buffer) = current.z; |
1893 | } |
1894 | |
1895 | { |
1896 | Short4 value = *Pointer<Short4>(buffer + 8); |
1897 | |
1898 | if(rgbaWriteMask != 0x0000000F) |
1899 | { |
1900 | Short4 masked = value; |
1901 | current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0])); |
1902 | masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0])); |
1903 | current.w |= masked; |
1904 | } |
1905 | |
1906 | current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ3Q) + xMask * 8); |
1907 | value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8); |
1908 | current.w |= value; |
1909 | *Pointer<Short4>(buffer + 8) = current.w; |
1910 | } |
1911 | } |
1912 | break; |
1913 | default: |
1914 | ASSERT(false); |
1915 | } |
1916 | } |
1917 | |
1918 | void PixelRoutine::blendFactor(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorActive) |
1919 | { |
1920 | switch(blendFactorActive) |
1921 | { |
1922 | case BLEND_ZERO: |
1923 | // Optimized |
1924 | break; |
1925 | case BLEND_ONE: |
1926 | // Optimized |
1927 | break; |
1928 | case BLEND_SOURCE: |
1929 | blendFactor.x = oC.x; |
1930 | blendFactor.y = oC.y; |
1931 | blendFactor.z = oC.z; |
1932 | break; |
1933 | case BLEND_INVSOURCE: |
1934 | blendFactor.x = Float4(1.0f) - oC.x; |
1935 | blendFactor.y = Float4(1.0f) - oC.y; |
1936 | blendFactor.z = Float4(1.0f) - oC.z; |
1937 | break; |
1938 | case BLEND_DEST: |
1939 | blendFactor.x = pixel.x; |
1940 | blendFactor.y = pixel.y; |
1941 | blendFactor.z = pixel.z; |
1942 | break; |
1943 | case BLEND_INVDEST: |
1944 | blendFactor.x = Float4(1.0f) - pixel.x; |
1945 | blendFactor.y = Float4(1.0f) - pixel.y; |
1946 | blendFactor.z = Float4(1.0f) - pixel.z; |
1947 | break; |
1948 | case BLEND_SOURCEALPHA: |
1949 | blendFactor.x = oC.w; |
1950 | blendFactor.y = oC.w; |
1951 | blendFactor.z = oC.w; |
1952 | break; |
1953 | case BLEND_INVSOURCEALPHA: |
1954 | blendFactor.x = Float4(1.0f) - oC.w; |
1955 | blendFactor.y = Float4(1.0f) - oC.w; |
1956 | blendFactor.z = Float4(1.0f) - oC.w; |
1957 | break; |
1958 | case BLEND_DESTALPHA: |
1959 | blendFactor.x = pixel.w; |
1960 | blendFactor.y = pixel.w; |
1961 | blendFactor.z = pixel.w; |
1962 | break; |
1963 | case BLEND_INVDESTALPHA: |
1964 | blendFactor.x = Float4(1.0f) - pixel.w; |
1965 | blendFactor.y = Float4(1.0f) - pixel.w; |
1966 | blendFactor.z = Float4(1.0f) - pixel.w; |
1967 | break; |
1968 | case BLEND_SRCALPHASAT: |
1969 | blendFactor.x = Float4(1.0f) - pixel.w; |
1970 | blendFactor.x = Min(blendFactor.x, oC.w); |
1971 | blendFactor.y = blendFactor.x; |
1972 | blendFactor.z = blendFactor.x; |
1973 | break; |
1974 | case BLEND_CONSTANT: |
1975 | blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[0])); |
1976 | blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[1])); |
1977 | blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[2])); |
1978 | break; |
1979 | case BLEND_INVCONSTANT: |
1980 | blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[0])); |
1981 | blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[1])); |
1982 | blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[2])); |
1983 | break; |
1984 | default: |
1985 | ASSERT(false); |
1986 | } |
1987 | } |
1988 | |
1989 | void PixelRoutine::blendFactorAlpha(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorAlphaActive) |
1990 | { |
1991 | switch(blendFactorAlphaActive) |
1992 | { |
1993 | case BLEND_ZERO: |
1994 | // Optimized |
1995 | break; |
1996 | case BLEND_ONE: |
1997 | // Optimized |
1998 | break; |
1999 | case BLEND_SOURCE: |
2000 | blendFactor.w = oC.w; |
2001 | break; |
2002 | case BLEND_INVSOURCE: |
2003 | blendFactor.w = Float4(1.0f) - oC.w; |
2004 | break; |
2005 | case BLEND_DEST: |
2006 | blendFactor.w = pixel.w; |
2007 | break; |
2008 | case BLEND_INVDEST: |
2009 | blendFactor.w = Float4(1.0f) - pixel.w; |
2010 | break; |
2011 | case BLEND_SOURCEALPHA: |
2012 | blendFactor.w = oC.w; |
2013 | break; |
2014 | case BLEND_INVSOURCEALPHA: |
2015 | blendFactor.w = Float4(1.0f) - oC.w; |
2016 | break; |
2017 | case BLEND_DESTALPHA: |
2018 | blendFactor.w = pixel.w; |
2019 | break; |
2020 | case BLEND_INVDESTALPHA: |
2021 | blendFactor.w = Float4(1.0f) - pixel.w; |
2022 | break; |
2023 | case BLEND_SRCALPHASAT: |
2024 | blendFactor.w = Float4(1.0f); |
2025 | break; |
2026 | case BLEND_CONSTANT: |
2027 | blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3])); |
2028 | break; |
2029 | case BLEND_INVCONSTANT: |
2030 | blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3])); |
2031 | break; |
2032 | default: |
2033 | ASSERT(false); |
2034 | } |
2035 | } |
2036 | |
2037 | void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x) |
2038 | { |
2039 | if(!state.alphaBlendActive) |
2040 | { |
2041 | return; |
2042 | } |
2043 | |
2044 | Pointer<Byte> buffer; |
2045 | Vector4f pixel; |
2046 | |
2047 | Vector4s color; |
2048 | Short4 c01; |
2049 | Short4 c23; |
2050 | |
2051 | Float4 one; |
2052 | if(Surface::isFloatFormat(state.targetFormat[index])) |
2053 | { |
2054 | one = Float4(1.0f); |
2055 | } |
2056 | else if(Surface::isNonNormalizedInteger(state.targetFormat[index])) |
2057 | { |
2058 | one = As<Float4>(Surface::isUnsignedComponent(state.targetFormat[index], 0) ? Int4(0xFFFFFFFF) : Int4(0x7FFFFFFF)); |
2059 | } |
2060 | |
2061 | switch(state.targetFormat[index]) |
2062 | { |
2063 | case FORMAT_R32I: |
2064 | case FORMAT_R32UI: |
2065 | case FORMAT_R32F: |
2066 | buffer = cBuffer; |
2067 | // FIXME: movlps |
2068 | pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0); |
2069 | pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4); |
2070 | buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); |
2071 | // FIXME: movhps |
2072 | pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0); |
2073 | pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4); |
2074 | pixel.y = pixel.z = pixel.w = one; |
2075 | break; |
2076 | case FORMAT_G32R32I: |
2077 | case FORMAT_G32R32UI: |
2078 | case FORMAT_G32R32F: |
2079 | buffer = cBuffer; |
2080 | pixel.x = *Pointer<Float4>(buffer + 8 * x, 16); |
2081 | buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); |
2082 | pixel.y = *Pointer<Float4>(buffer + 8 * x, 16); |
2083 | pixel.z = pixel.x; |
2084 | pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x88); |
2085 | pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0xDD); |
2086 | pixel.y = pixel.z; |
2087 | pixel.z = pixel.w = one; |
2088 | break; |
2089 | case FORMAT_X32B32G32R32F: |
2090 | case FORMAT_A32B32G32R32F: |
2091 | case FORMAT_X32B32G32R32F_UNSIGNED: |
2092 | case FORMAT_A32B32G32R32I: |
2093 | case FORMAT_A32B32G32R32UI: |
2094 | buffer = cBuffer; |
2095 | pixel.x = *Pointer<Float4>(buffer + 16 * x, 16); |
2096 | pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16); |
2097 | buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); |
2098 | pixel.z = *Pointer<Float4>(buffer + 16 * x, 16); |
2099 | pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16); |
2100 | transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w); |
2101 | if(state.targetFormat[index] == FORMAT_X32B32G32R32F || |
2102 | state.targetFormat[index] == FORMAT_X32B32G32R32F_UNSIGNED) |
2103 | { |
2104 | pixel.w = Float4(1.0f); |
2105 | } |
2106 | break; |
2107 | default: |
2108 | ASSERT(false); |
2109 | } |
2110 | |
2111 | if((postBlendSRGB && state.writeSRGB) || isSRGB(index)) |
2112 | { |
2113 | sRGBtoLinear(pixel.x); |
2114 | sRGBtoLinear(pixel.y); |
2115 | sRGBtoLinear(pixel.z); |
2116 | } |
2117 | |
2118 | // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor |
2119 | Vector4f sourceFactor; |
2120 | Vector4f destFactor; |
2121 | |
2122 | blendFactor(sourceFactor, oC, pixel, state.sourceBlendFactor); |
2123 | blendFactor(destFactor, oC, pixel, state.destBlendFactor); |
2124 | |
2125 | if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO) |
2126 | { |
2127 | oC.x *= sourceFactor.x; |
2128 | oC.y *= sourceFactor.y; |
2129 | oC.z *= sourceFactor.z; |
2130 | } |
2131 | |
2132 | if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO) |
2133 | { |
2134 | pixel.x *= destFactor.x; |
2135 | pixel.y *= destFactor.y; |
2136 | pixel.z *= destFactor.z; |
2137 | } |
2138 | |
2139 | switch(state.blendOperation) |
2140 | { |
2141 | case BLENDOP_ADD: |
2142 | oC.x += pixel.x; |
2143 | oC.y += pixel.y; |
2144 | oC.z += pixel.z; |
2145 | break; |
2146 | case BLENDOP_SUB: |
2147 | oC.x -= pixel.x; |
2148 | oC.y -= pixel.y; |
2149 | oC.z -= pixel.z; |
2150 | break; |
2151 | case BLENDOP_INVSUB: |
2152 | oC.x = pixel.x - oC.x; |
2153 | oC.y = pixel.y - oC.y; |
2154 | oC.z = pixel.z - oC.z; |
2155 | break; |
2156 | case BLENDOP_MIN: |
2157 | oC.x = Min(oC.x, pixel.x); |
2158 | oC.y = Min(oC.y, pixel.y); |
2159 | oC.z = Min(oC.z, pixel.z); |
2160 | break; |
2161 | case BLENDOP_MAX: |
2162 | oC.x = Max(oC.x, pixel.x); |
2163 | oC.y = Max(oC.y, pixel.y); |
2164 | oC.z = Max(oC.z, pixel.z); |
2165 | break; |
2166 | case BLENDOP_SOURCE: |
2167 | // No operation |
2168 | break; |
2169 | case BLENDOP_DEST: |
2170 | oC.x = pixel.x; |
2171 | oC.y = pixel.y; |
2172 | oC.z = pixel.z; |
2173 | break; |
2174 | case BLENDOP_NULL: |
2175 | oC.x = Float4(0.0f); |
2176 | oC.y = Float4(0.0f); |
2177 | oC.z = Float4(0.0f); |
2178 | break; |
2179 | default: |
2180 | ASSERT(false); |
2181 | } |
2182 | |
2183 | blendFactorAlpha(sourceFactor, oC, pixel, state.sourceBlendFactorAlpha); |
2184 | blendFactorAlpha(destFactor, oC, pixel, state.destBlendFactorAlpha); |
2185 | |
2186 | if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO) |
2187 | { |
2188 | oC.w *= sourceFactor.w; |
2189 | } |
2190 | |
2191 | if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO) |
2192 | { |
2193 | pixel.w *= destFactor.w; |
2194 | } |
2195 | |
2196 | switch(state.blendOperationAlpha) |
2197 | { |
2198 | case BLENDOP_ADD: |
2199 | oC.w += pixel.w; |
2200 | break; |
2201 | case BLENDOP_SUB: |
2202 | oC.w -= pixel.w; |
2203 | break; |
2204 | case BLENDOP_INVSUB: |
2205 | pixel.w -= oC.w; |
2206 | oC.w = pixel.w; |
2207 | break; |
2208 | case BLENDOP_MIN: |
2209 | oC.w = Min(oC.w, pixel.w); |
2210 | break; |
2211 | case BLENDOP_MAX: |
2212 | oC.w = Max(oC.w, pixel.w); |
2213 | break; |
2214 | case BLENDOP_SOURCE: |
2215 | // No operation |
2216 | break; |
2217 | case BLENDOP_DEST: |
2218 | oC.w = pixel.w; |
2219 | break; |
2220 | case BLENDOP_NULL: |
2221 | oC.w = Float4(0.0f); |
2222 | break; |
2223 | default: |
2224 | ASSERT(false); |
2225 | } |
2226 | } |
2227 | |
2228 | void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask) |
2229 | { |
2230 | switch(state.targetFormat[index]) |
2231 | { |
2232 | case FORMAT_R32F: |
2233 | case FORMAT_R32I: |
2234 | case FORMAT_R32UI: |
2235 | case FORMAT_R16I: |
2236 | case FORMAT_R16UI: |
2237 | case FORMAT_R8I: |
2238 | case FORMAT_R8UI: |
2239 | break; |
2240 | case FORMAT_G32R32F: |
2241 | case FORMAT_G32R32I: |
2242 | case FORMAT_G32R32UI: |
2243 | case FORMAT_G16R16I: |
2244 | case FORMAT_G16R16UI: |
2245 | case FORMAT_G8R8I: |
2246 | case FORMAT_G8R8UI: |
2247 | oC.z = oC.x; |
2248 | oC.x = UnpackLow(oC.x, oC.y); |
2249 | oC.z = UnpackHigh(oC.z, oC.y); |
2250 | oC.y = oC.z; |
2251 | break; |
2252 | case FORMAT_X32B32G32R32F: |
2253 | case FORMAT_A32B32G32R32F: |
2254 | case FORMAT_X32B32G32R32F_UNSIGNED: |
2255 | case FORMAT_A32B32G32R32I: |
2256 | case FORMAT_A32B32G32R32UI: |
2257 | case FORMAT_A16B16G16R16I: |
2258 | case FORMAT_A16B16G16R16UI: |
2259 | case FORMAT_A8B8G8R8I: |
2260 | case FORMAT_A8B8G8R8UI: |
2261 | transpose4x4(oC.x, oC.y, oC.z, oC.w); |
2262 | break; |
2263 | default: |
2264 | ASSERT(false); |
2265 | } |
2266 | |
2267 | int rgbaWriteMask = state.colorWriteActive(index); |
2268 | |
2269 | Int xMask; // Combination of all masks |
2270 | |
2271 | if(state.depthTestActive) |
2272 | { |
2273 | xMask = zMask; |
2274 | } |
2275 | else |
2276 | { |
2277 | xMask = cMask; |
2278 | } |
2279 | |
2280 | if(state.stencilActive) |
2281 | { |
2282 | xMask &= sMask; |
2283 | } |
2284 | |
2285 | Pointer<Byte> buffer; |
2286 | Float4 value; |
2287 | |
2288 | switch(state.targetFormat[index]) |
2289 | { |
2290 | case FORMAT_R32F: |
2291 | case FORMAT_R32I: |
2292 | case FORMAT_R32UI: |
2293 | if(rgbaWriteMask & 0x00000001) |
2294 | { |
2295 | buffer = cBuffer + 4 * x; |
2296 | |
2297 | // FIXME: movlps |
2298 | value.x = *Pointer<Float>(buffer + 0); |
2299 | value.y = *Pointer<Float>(buffer + 4); |
2300 | |
2301 | buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); |
2302 | |
2303 | // FIXME: movhps |
2304 | value.z = *Pointer<Float>(buffer + 0); |
2305 | value.w = *Pointer<Float>(buffer + 4); |
2306 | |
2307 | oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + xMask * 16, 16)); |
2308 | value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16)); |
2309 | oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value)); |
2310 | |
2311 | // FIXME: movhps |
2312 | *Pointer<Float>(buffer + 0) = oC.x.z; |
2313 | *Pointer<Float>(buffer + 4) = oC.x.w; |
2314 | |
2315 | buffer -= *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); |
2316 | |
2317 | // FIXME: movlps |
2318 | *Pointer<Float>(buffer + 0) = oC.x.x; |
2319 | *Pointer<Float>(buffer + 4) = oC.x.y; |
2320 | } |
2321 | break; |
2322 | case FORMAT_R16I: |
2323 | case FORMAT_R16UI: |
2324 | if(rgbaWriteMask & 0x00000001) |
2325 | { |
2326 | buffer = cBuffer + 2 * x; |
2327 | |
2328 | UShort4 xyzw; |
2329 | xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0)); |
2330 | |
2331 | buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); |
2332 | |
2333 | xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1)); |
2334 | value = As<Float4>(Int4(xyzw)); |
2335 | |
2336 | oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16)); |
2337 | value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16)); |
2338 | oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value)); |
2339 | |
2340 | if(state.targetFormat[index] == FORMAT_R16I) |
2341 | { |
2342 | Float component = oC.x.z; |
2343 | *Pointer<Short>(buffer + 0) = Short(As<Int>(component)); |
2344 | component = oC.x.w; |
2345 | *Pointer<Short>(buffer + 2) = Short(As<Int>(component)); |
2346 | |
2347 | buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); |
2348 | |
2349 | component = oC.x.x; |
2350 | *Pointer<Short>(buffer + 0) = Short(As<Int>(component)); |
2351 | component = oC.x.y; |
2352 | *Pointer<Short>(buffer + 2) = Short(As<Int>(component)); |
2353 | } |
2354 | else // FORMAT_R16UI |
2355 | { |
2356 | Float component = oC.x.z; |
2357 | *Pointer<UShort>(buffer + 0) = UShort(As<Int>(component)); |
2358 | component = oC.x.w; |
2359 | *Pointer<UShort>(buffer + 2) = UShort(As<Int>(component)); |
2360 | |
2361 | buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); |
2362 | |
2363 | component = oC.x.x; |
2364 | *Pointer<UShort>(buffer + 0) = UShort(As<Int>(component)); |
2365 | component = oC.x.y; |
2366 | *Pointer<UShort>(buffer + 2) = UShort(As<Int>(component)); |
2367 | } |
2368 | } |
2369 | break; |
2370 | case FORMAT_R8I: |
2371 | case FORMAT_R8UI: |
2372 | if(rgbaWriteMask & 0x00000001) |
2373 | { |
2374 | buffer = cBuffer + x; |
2375 | |
2376 | UInt xyzw, packedCol; |
2377 | |
2378 | xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFF; |
2379 | buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); |
2380 | xyzw |= UInt(*Pointer<UShort>(buffer)) << 16; |
2381 | |
2382 | Short4 tmpCol = Short4(As<Int4>(oC.x)); |
2383 | if(state.targetFormat[index] == FORMAT_R8I) |
2384 | { |
2385 | tmpCol = As<Short4>(PackSigned(tmpCol, tmpCol)); |
2386 | } |
2387 | else |
2388 | { |
2389 | tmpCol = As<Short4>(PackUnsigned(tmpCol, tmpCol)); |
2390 | } |
2391 | packedCol = Extract(As<Int2>(tmpCol), 0); |
2392 | |
2393 | packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) | |
2394 | (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask)); |
2395 | |
2396 | *Pointer<UShort>(buffer) = UShort(packedCol >> 16); |
2397 | buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); |
2398 | *Pointer<UShort>(buffer) = UShort(packedCol); |
2399 | } |
2400 | break; |
2401 | case FORMAT_G32R32F: |
2402 | case FORMAT_G32R32I: |
2403 | case FORMAT_G32R32UI: |
2404 | buffer = cBuffer + 8 * x; |
2405 | |
2406 | value = *Pointer<Float4>(buffer); |
2407 | |
2408 | if((rgbaWriteMask & 0x00000003) != 0x00000003) |
2409 | { |
2410 | Float4 masked = value; |
2411 | oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0]))); |
2412 | masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0]))); |
2413 | oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked)); |
2414 | } |
2415 | |
2416 | oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16)); |
2417 | value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16)); |
2418 | oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value)); |
2419 | *Pointer<Float4>(buffer) = oC.x; |
2420 | |
2421 | buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); |
2422 | |
2423 | value = *Pointer<Float4>(buffer); |
2424 | |
2425 | if((rgbaWriteMask & 0x00000003) != 0x00000003) |
2426 | { |
2427 | Float4 masked; |
2428 | |
2429 | masked = value; |
2430 | oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0]))); |
2431 | masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0]))); |
2432 | oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked)); |
2433 | } |
2434 | |
2435 | oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16)); |
2436 | value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16)); |
2437 | oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value)); |
2438 | *Pointer<Float4>(buffer) = oC.y; |
2439 | break; |
2440 | case FORMAT_G16R16I: |
2441 | case FORMAT_G16R16UI: |
2442 | if((rgbaWriteMask & 0x00000003) != 0x0) |
2443 | { |
2444 | buffer = cBuffer + 4 * x; |
2445 | |
2446 | UInt2 rgbaMask; |
2447 | UShort4 packedCol = UShort4(As<Int4>(oC.x)); |
2448 | UShort4 value = *Pointer<UShort4>(buffer); |
2449 | UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8); |
2450 | if((rgbaWriteMask & 0x3) != 0x3) |
2451 | { |
2452 | Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0])); |
2453 | rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask)); |
2454 | mergedMask &= rgbaMask; |
2455 | } |
2456 | *Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask); |
2457 | |
2458 | buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); |
2459 | |
2460 | packedCol = UShort4(As<Int4>(oC.y)); |
2461 | value = *Pointer<UShort4>(buffer); |
2462 | mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8); |
2463 | if((rgbaWriteMask & 0x3) != 0x3) |
2464 | { |
2465 | mergedMask &= rgbaMask; |
2466 | } |
2467 | *Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask); |
2468 | } |
2469 | break; |
2470 | case FORMAT_G8R8I: |
2471 | case FORMAT_G8R8UI: |
2472 | if((rgbaWriteMask & 0x00000003) != 0x0) |
2473 | { |
2474 | buffer = cBuffer + 2 * x; |
2475 | |
2476 | Int2 xyzw, packedCol; |
2477 | |
2478 | xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0); |
2479 | buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); |
2480 | xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1); |
2481 | |
2482 | if(state.targetFormat[index] == FORMAT_G8R8I) |
2483 | { |
2484 | packedCol = As<Int2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y)))); |
2485 | } |
2486 | else |
2487 | { |
2488 | packedCol = As<Int2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y)))); |
2489 | } |
2490 | |
2491 | UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8); |
2492 | if((rgbaWriteMask & 0x3) != 0x3) |
2493 | { |
2494 | Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0])); |
2495 | UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask)); |
2496 | mergedMask &= rgbaMask; |
2497 | } |
2498 | |
2499 | packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask)); |
2500 | |
2501 | *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1)); |
2502 | buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); |
2503 | *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0)); |
2504 | } |
2505 | break; |
2506 | case FORMAT_X32B32G32R32F: |
2507 | case FORMAT_A32B32G32R32F: |
2508 | case FORMAT_X32B32G32R32F_UNSIGNED: |
2509 | case FORMAT_A32B32G32R32I: |
2510 | case FORMAT_A32B32G32R32UI: |
2511 | buffer = cBuffer + 16 * x; |
2512 | |
2513 | { |
2514 | value = *Pointer<Float4>(buffer, 16); |
2515 | |
2516 | if(rgbaWriteMask != 0x0000000F) |
2517 | { |
2518 | Float4 masked = value; |
2519 | oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0]))); |
2520 | masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0]))); |
2521 | oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked)); |
2522 | } |
2523 | |
2524 | oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskX0X) + xMask * 16, 16)); |
2525 | value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16)); |
2526 | oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value)); |
2527 | *Pointer<Float4>(buffer, 16) = oC.x; |
2528 | } |
2529 | |
2530 | { |
2531 | value = *Pointer<Float4>(buffer + 16, 16); |
2532 | |
2533 | if(rgbaWriteMask != 0x0000000F) |
2534 | { |
2535 | Float4 masked = value; |
2536 | oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0]))); |
2537 | masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0]))); |
2538 | oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked)); |
2539 | } |
2540 | |
2541 | oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskX1X) + xMask * 16, 16)); |
2542 | value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16)); |
2543 | oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value)); |
2544 | *Pointer<Float4>(buffer + 16, 16) = oC.y; |
2545 | } |
2546 | |
2547 | buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); |
2548 | |
2549 | { |
2550 | value = *Pointer<Float4>(buffer, 16); |
2551 | |
2552 | if(rgbaWriteMask != 0x0000000F) |
2553 | { |
2554 | Float4 masked = value; |
2555 | oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0]))); |
2556 | masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0]))); |
2557 | oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked)); |
2558 | } |
2559 | |
2560 | oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskX2X) + xMask * 16, 16)); |
2561 | value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16)); |
2562 | oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value)); |
2563 | *Pointer<Float4>(buffer, 16) = oC.z; |
2564 | } |
2565 | |
2566 | { |
2567 | value = *Pointer<Float4>(buffer + 16, 16); |
2568 | |
2569 | if(rgbaWriteMask != 0x0000000F) |
2570 | { |
2571 | Float4 masked = value; |
2572 | oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0]))); |
2573 | masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0]))); |
2574 | oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked)); |
2575 | } |
2576 | |
2577 | oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskX3X) + xMask * 16, 16)); |
2578 | value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16)); |
2579 | oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value)); |
2580 | *Pointer<Float4>(buffer + 16, 16) = oC.w; |
2581 | } |
2582 | break; |
2583 | case FORMAT_A16B16G16R16I: |
2584 | case FORMAT_A16B16G16R16UI: |
2585 | if((rgbaWriteMask & 0x0000000F) != 0x0) |
2586 | { |
2587 | buffer = cBuffer + 8 * x; |
2588 | |
2589 | UInt4 rgbaMask; |
2590 | UShort8 value = *Pointer<UShort8>(buffer); |
2591 | UShort8 packedCol = UShort8(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y))); |
2592 | UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16); |
2593 | if((rgbaWriteMask & 0xF) != 0xF) |
2594 | { |
2595 | UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0])); |
2596 | rgbaMask = UInt4(tmpMask, tmpMask); |
2597 | mergedMask &= rgbaMask; |
2598 | } |
2599 | *Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask); |
2600 | |
2601 | buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); |
2602 | |
2603 | value = *Pointer<UShort8>(buffer); |
2604 | packedCol = UShort8(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w))); |
2605 | mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16); |
2606 | if((rgbaWriteMask & 0xF) != 0xF) |
2607 | { |
2608 | mergedMask &= rgbaMask; |
2609 | } |
2610 | *Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask); |
2611 | } |
2612 | break; |
2613 | case FORMAT_A8B8G8R8I: |
2614 | case FORMAT_A8B8G8R8UI: |
2615 | if((rgbaWriteMask & 0x0000000F) != 0x0) |
2616 | { |
2617 | UInt2 value, packedCol, mergedMask; |
2618 | |
2619 | buffer = cBuffer + 4 * x; |
2620 | |
2621 | if(state.targetFormat[index] == FORMAT_A8B8G8R8I) |
2622 | { |
2623 | packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y)))); |
2624 | } |
2625 | else |
2626 | { |
2627 | packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y)))); |
2628 | } |
2629 | value = *Pointer<UInt2>(buffer, 16); |
2630 | mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8); |
2631 | if(rgbaWriteMask != 0xF) |
2632 | { |
2633 | mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0])); |
2634 | } |
2635 | *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask); |
2636 | |
2637 | buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); |
2638 | |
2639 | if(state.targetFormat[index] == FORMAT_A8B8G8R8I) |
2640 | { |
2641 | packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w)))); |
2642 | } |
2643 | else |
2644 | { |
2645 | packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w)))); |
2646 | } |
2647 | value = *Pointer<UInt2>(buffer, 16); |
2648 | mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8); |
2649 | if(rgbaWriteMask != 0xF) |
2650 | { |
2651 | mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0])); |
2652 | } |
2653 | *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask); |
2654 | } |
2655 | break; |
2656 | default: |
2657 | ASSERT(false); |
2658 | } |
2659 | } |
2660 | |
2661 | UShort4 PixelRoutine::convertFixed16(Float4 &cf, bool saturate) |
2662 | { |
2663 | return UShort4(cf * Float4(0xFFFF), saturate); |
2664 | } |
2665 | |
2666 | void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c) |
2667 | { |
2668 | Pointer<Byte> LUT = constants + OFFSET(Constants,sRGBtoLinear12_16); |
2669 | |
2670 | c.x = As<UShort4>(c.x) >> 4; |
2671 | c.y = As<UShort4>(c.y) >> 4; |
2672 | c.z = As<UShort4>(c.z) >> 4; |
2673 | |
2674 | c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0); |
2675 | c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1); |
2676 | c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2); |
2677 | c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3); |
2678 | |
2679 | c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0); |
2680 | c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1); |
2681 | c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2); |
2682 | c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3); |
2683 | |
2684 | c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0); |
2685 | c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1); |
2686 | c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2); |
2687 | c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3); |
2688 | } |
2689 | |
2690 | void PixelRoutine::linearToSRGB16_12_16(Vector4s &c) |
2691 | { |
2692 | c.x = As<UShort4>(c.x) >> 4; |
2693 | c.y = As<UShort4>(c.y) >> 4; |
2694 | c.z = As<UShort4>(c.z) >> 4; |
2695 | |
2696 | linearToSRGB12_16(c); |
2697 | } |
2698 | |
2699 | void PixelRoutine::linearToSRGB12_16(Vector4s &c) |
2700 | { |
2701 | Pointer<Byte> LUT = constants + OFFSET(Constants,linearToSRGB12_16); |
2702 | |
2703 | c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0); |
2704 | c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1); |
2705 | c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2); |
2706 | c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3); |
2707 | |
2708 | c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0); |
2709 | c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1); |
2710 | c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2); |
2711 | c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3); |
2712 | |
2713 | c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0); |
2714 | c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1); |
2715 | c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2); |
2716 | c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3); |
2717 | } |
2718 | |
2719 | Float4 PixelRoutine::sRGBtoLinear(const Float4 &x) // Approximates x^2.2 |
2720 | { |
2721 | Float4 linear = x * x; |
2722 | linear = linear * Float4(0.73f) + linear * x * Float4(0.27f); |
2723 | |
2724 | return Min(Max(linear, Float4(0.0f)), Float4(1.0f)); |
2725 | } |
2726 | |
2727 | bool PixelRoutine::colorUsed() |
2728 | { |
2729 | return state.colorWriteMask || state.alphaTestActive() || state.shaderContainsKill; |
2730 | } |
2731 | } |
2732 | |