1 // SwiftShader Software Renderer
3 // Copyright(c) 2005-2013 TransGaming Inc.
5 // All rights reserved. No part of this software may be copied, distributed, transmitted,
6 // transcribed, stored in a retrieval system, translated into any human or computer
7 // language by any means, or disclosed to third parties without the explicit written
8 // agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express
9 // or implied, including but not limited to any patent rights, are granted to you.
12 #include "PixelRoutine.hpp"
14 #include "Renderer.hpp"
15 #include "QuadRasterizer.hpp"
16 #include "Surface.hpp"
17 #include "Primitive.hpp"
19 #include "SamplerCore.hpp"
20 #include "Constants.hpp"
// Global renderer configuration flags defined in another translation unit.
// Within the code visible in this file: complementaryDepthBuffer selects the
// "1 - depth" variants in depthTest(), writeDepth() and pixelFog();
// postBlendSRGB (together with state.writeSRGB) gates the sRGB-to-linear
// conversion in readPixel(); forceClearRegisters makes the Registers
// constructor zero the input registers.
25 extern bool complementaryDepthBuffer;
26 extern bool postBlendSRGB;
27 extern bool exactColorRounding;
28 extern bool forceClearRegisters;
// Builds the register set used by a pixel routine. The input register array
// 'v' is constructed with the shader's dynamicallyIndexedInput flag (false when
// no shader is supplied).
30 PixelRoutine::Registers::Registers(const PixelShader *shader) :
31 QuadRasterizer::Registers(),
32 v(shader && shader->dynamicallyIndexedInput)
// Zero all four components of the ten input registers when there is no shader,
// when the shader version is below 0x0200, or when forceClearRegisters is set.
34 if(!shader || shader->getVersion() < 0x0200 || forceClearRegisters)
36 for(int i = 0; i < 10; i++)
38 v[i].x = Float4(0.0f);
39 v[i].y = Float4(0.0f);
40 v[i].z = Float4(0.0f);
41 v[i].w = Float4(0.0f);
// Constructs the routine by forwarding the pixel processor state and the
// (possibly null) pixel shader to the QuadRasterizer base class.
46 PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader) : QuadRasterizer(state, shader)
// Destructor. Iterates over every texture image unit; the loop body is not
// visible in this listing.
// NOTE(review): presumably releases the SamplerCore objects allocated with
// 'new' in quad() - confirm against the full source.
50 PixelRoutine::~PixelRoutine()
52 for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
// Runs the pixel pipeline for one quad. From the statements visible here, in
// order: a SamplerCore is created per texture image unit; the stencil test runs
// per sample; depth is interpolated per sample and depth-tested; w, the input
// interpolants and fog are interpolated; the shader and the alpha test run;
// depth, the occlusion counter and colour are written; finally stencil is
// written. Elapsed ticks are accumulated into r.cycles[] for PERF_INTERP,
// PERF_SHADER, PERF_ROP and PERF_PIPE.
// NOTE(review): several enclosing conditions (e.g. around the two depthTest
// loops and the profiling statements) are not visible in this listing.
58 void PixelRoutine::quad(QuadRasterizer::Registers &rBase, Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
// View the base register block as the PixelRoutine-specific Registers type.
60 Registers& r = *static_cast<Registers*>(&rBase);
// Start timestamp for the PERF_PIPE counter updated at the end.
63 Long pipeTime = Ticks();
// One sampler core per texture image unit, built from that unit's sampler state.
66 for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
68 sampler[i] = new SamplerCore(r.constants, state.sampler[i]);
// earlyDepthTest is true only when depth is not overridden and the alpha test
// is inactive.
71 const bool earlyDepthTest = !state.depthOverride && !state.alphaTestActive();
73 Int zMask[4]; // Depth mask
74 Int sMask[4]; // Stencil mask
76 for(unsigned int q = 0; q < state.multiSample; q++)
// Stencil test for every sample; produces sMask[q] from the coverage mask.
82 for(unsigned int q = 0; q < state.multiSample; q++)
84 stencilTest(r, sBuffer, q, x, sMask[q], cMask[q]);
// Quad x position plus the primitive's xQuad offsets.
94 Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(r.primitive + OFFSET(Primitive,xQuad), 16);
// Per-sample depth interpolation; with multisampling the per-sample X constant
// is subtracted from the position first.
98 for(unsigned int q = 0; q < state.multiSample; q++)
102 if(state.multiSample > 1)
104 x -= *Pointer<Float4>(r.constants + OFFSET(Constants,X) + q * sizeof(float4));
107 z[q] = interpolate(x, r.Dz[q], z[q], r.primitive + OFFSET(Primitive,z), false, false);
111 Bool depthPass = false;
// depthPass becomes true if any sample passes the depth test.
115 for(unsigned int q = 0; q < state.multiSample; q++)
117 depthPass = depthPass || depthTest(r, zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
// Continue when a sample passed, or when earlyDepthTest is false.
121 If(depthPass || Bool(!earlyDepthTest))
124 Long interpTime = Ticks();
// Quad y position plus the primitive's yQuad offsets.
127 Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(r.primitive + OFFSET(Primitive,yQuad), 16);
129 // Centroid locations
130 Float4 XXXX = Float4(0.0f);
131 Float4 YYYY = Float4(0.0f);
// Weight accumulator starts at a tiny non-zero value.
// NOTE(review): presumably guards against a zero weight sum - confirm.
135 Float4 WWWW(1.0e-9f);
// Accumulate sample positions and weights, looked up by each sample's coverage mask.
137 for(unsigned int q = 0; q < state.multiSample; q++)
139 XXXX += *Pointer<Float4>(r.constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]);
140 YYYY += *Pointer<Float4>(r.constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]);
141 WWWW += *Pointer<Float4>(r.constants + OFFSET(Constants,weight) + 16 * cMask[q]);
// Interpolate w at the pixel positions, and its reciprocal at the centroid positions.
154 w = interpolate(xxxx, r.Dw, rhw, r.primitive + OFFSET(Primitive,w), false, false);
159 rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, r.primitive + OFFSET(Primitive,w), false, false));
// Interpolate every enabled component of the ten input interpolants, either at
// the pixel positions or at the centroid, honouring the per-component flat bit.
163 for(int interpolant = 0; interpolant < 10; interpolant++)
165 for(int component = 0; component < 4; component++)
167 if(state.interpolant[interpolant].component & (1 << component))
169 if(!state.interpolant[interpolant].centroid)
171 r.v[interpolant][component] = interpolate(xxxx, r.Dv[interpolant][component], rhw, r.primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
175 r.v[interpolant][component] = interpolateCentroid(XXXX, YYYY, rhwCentroid, r.primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
// Projection: divide the leading components by y, by z, or by w depending on
// the interpolant's 'project' setting (case labels not visible here).
182 switch(state.interpolant[interpolant].project)
187 rcp = reciprocal(r.v[interpolant].y);
188 r.v[interpolant].x = r.v[interpolant].x * rcp;
191 rcp = reciprocal(r.v[interpolant].z);
192 r.v[interpolant].x = r.v[interpolant].x * rcp;
193 r.v[interpolant].y = r.v[interpolant].y * rcp;
196 rcp = reciprocal(r.v[interpolant].w);
197 r.v[interpolant].x = r.v[interpolant].x * rcp;
198 r.v[interpolant].y = r.v[interpolant].y * rcp;
199 r.v[interpolant].z = r.v[interpolant].z * rcp;
// Fog factor is interpolated only when the fog component is enabled.
204 if(state.fog.component)
206 f = interpolate(xxxx, r.Df, rhw, r.primitive + OFFSET(Primitive,f), state.fog.flat & 0x01, state.perspective);
209 setBuiltins(r, x, y, z, w);
212 r.cycles[PERF_INTERP] += Ticks() - interpTime;
215 Bool alphaPass = true;
220 Long shaderTime = Ticks();
223 applyShader(r, cMask);
226 r.cycles[PERF_SHADER] += Ticks() - shaderTime;
229 alphaPass = alphaTest(r, cMask);
// When the shader can kill pixels or the alpha test is active, restrict the
// depth and stencil masks to the coverage that survived.
231 if((shader && shader->containsKill()) || state.alphaTestActive())
233 for(unsigned int q = 0; q < state.multiSample; q++)
235 zMask[q] &= cMask[q];
236 sMask[q] &= cMask[q];
// Second depth-test site, after the shader and alpha test.
// NOTE(review): presumably the late path used when earlyDepthTest is false - confirm.
245 for(unsigned int q = 0; q < state.multiSample; q++)
247 depthPass = depthPass || depthTest(r, zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
252 Long ropTime = Ticks();
255 If(depthPass || Bool(earlyDepthTest))
// For each sample enabled in multiSampleMask: write depth and, when occlusion
// queries are enabled, add the count looked up by (zMask & sMask).
257 for(unsigned int q = 0; q < state.multiSample; q++)
259 if(state.multiSampleMask & (1 << q))
261 writeDepth(r, zBuffer, q, x, z[q], zMask[q]);
263 if(state.occlusionEnabled)
265 r.occlusion += *Pointer<UInt>(r.constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q]));
273 AddAtomic(Pointer<Long>(&profiler.ropOperations), 4);
276 rasterOperation(r, f, cBuffer, x, sMask, zMask, cMask);
281 r.cycles[PERF_ROP] += Ticks() - ropTime;
// Stencil write for each sample enabled in multiSampleMask.
286 for(unsigned int q = 0; q < state.multiSample; q++)
288 if(state.multiSampleMask & (1 << q))
290 writeStencil(r, sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
295 r.cycles[PERF_PIPE] += Ticks() - pipeTime;
// Evaluates a plane equation at the given x/y positions: the value starts from
// the equation's C term and x * A + y * B is added to it.
// NOTE(review): how 'rhw', 'flat' and 'perspective' are applied, and the return
// statement, are not visible in this listing.
299 Float4 PixelRoutine::interpolateCentroid(Float4 &x, Float4 &y, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective)
301 Float4 interpolant = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,C), 16);
305 interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,A), 16) +
306 y * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,B), 16);
// Performs the stencil test for sample q of the quad at column x and stores the
// resulting per-pixel mask in sMask (sign mask of the comparison result ANDed
// with the coverage mask cMask). Checks state.stencilActive first (the body of
// that check is not visible here).
317 void PixelRoutine::stencilTest(Registers &r, Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask)
319 if(!state.stencilActive)
324 // (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
// Stencil values are addressed at 2 bytes per x step.
326 Pointer<Byte> buffer = sBuffer + 2 * x;
// Advance to the stencil slice of sample q.
330 buffer += q * *Pointer<Int>(r.data + OFFSET(DrawData,stencilSliceB));
// Load 32 bits of stencil data and keep an unmasked copy for the CCW test.
333 Byte8 value = As<Byte8>(Long1(*Pointer<UInt>(buffer)));
334 Byte8 valueCCW = value;
// Apply the front-face (index 0) test mask unless the state says there is none.
336 if(!state.noStencilMask)
338 value &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[0].testMaskQ));
341 stencilTest(r, value, state.stencilCompareMode, false);
// Two-sided stencil: run the test again with the index 1 (CCW) state, then keep
// each result only for pixels selected by the primitive's clockwise /
// inverse-clockwise masks.
343 if(state.twoSidedStencil)
345 if(!state.noStencilMaskCCW)
347 valueCCW &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[1].testMaskQ));
350 stencilTest(r, valueCCW, state.stencilCompareModeCCW, true);
352 value &= *Pointer<Byte8>(r.primitive + OFFSET(Primitive,clockwiseMask));
353 valueCCW &= *Pointer<Byte8>(r.primitive + OFFSET(Primitive,invClockwiseMask));
357 sMask = SignMask(value) & cMask;
// Replaces 'value' (masked stencil buffer bytes) with the per-byte result of
// comparing it against the masked reference value of the front (CCW == false)
// or back (CCW == true) stencil state, according to stencilCompareMode.
// The ordered comparisons add 0x80 to every byte and use the signed CmpGT
// against referenceMaskedSignedQ.
// NOTE(review): presumably referenceMaskedSignedQ holds the reference biased the
// same way, making the signed compare act as an unsigned one - confirm.
// Several case labels and statements are not visible in this listing.
360 void PixelRoutine::stencilTest(Registers &r, Byte8 &value, StencilCompareMode stencilCompareMode, bool CCW)
364 switch(stencilCompareMode)
// All-ones result (every byte passes).
367 value = Byte8(0xFFFFFFFFFFFFFFFF);
// All-zeros result (every byte fails).
370 value = Byte8(0x0000000000000000);
372 case STENCIL_LESS: // a < b ~ b > a
373 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
374 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
// Equality against the (unbiased) masked reference.
377 value = CmpEQ(value, *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
379 case STENCIL_NOTEQUAL: // a != b ~ !(a == b)
380 value = CmpEQ(value, *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
381 value ^= Byte8(0xFFFFFFFFFFFFFFFF);
383 case STENCIL_LESSEQUAL: // a <= b ~ (b > a) || (a == b)
385 equal = CmpEQ(equal, *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
386 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
387 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
390 case STENCIL_GREATER: // a > b
// Here 'equal' is reused as a temporary holding the biased reference.
391 equal = *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ));
392 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
393 equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
396 case STENCIL_GREATEREQUAL: // a >= b ~ !(a < b) ~ !(b > a)
397 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
398 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
399 value ^= Byte8(0xFFFFFFFFFFFFFFFF);
// Depth-tests sample q of the quad at column x. The incoming depth is either the
// interpolated z or, when the shader overrides depth, r.oDepth (as 1 - oDepth
// with a complementary depth buffer). The stored depth is read from zBuffer,
// compared according to state.depthCompareMode, and the resulting sign mask
// ANDed with cMask is stored in zMask.
// NOTE(review): the early-out bodies, several case labels, the stencil
// interaction and the return statement are not visible in this listing.
406 Bool PixelRoutine::depthTest(Registers &r, Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask)
408 if(!state.depthTestActive)
415 if(shader && shader->depthOverride())
417 if(complementaryDepthBuffer)
419 Z = Float4(1.0f) - r.oDepth;
427 Pointer<Byte> buffer;
// Linear layout: 4 bytes per x step, with the second row one pitch further.
// Quad layout: 8 bytes per x step.
430 if(!state.quadLayoutDepthBuffer)
432 buffer = zBuffer + 4 * x;
433 pitch = *Pointer<Int>(r.data + OFFSET(DrawData,depthPitchB));
437 buffer = zBuffer + 8 * x;
// Advance to the depth slice of sample q.
442 buffer += q * *Pointer<Int>(r.data + OFFSET(DrawData,depthSliceB));
447 if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
449 if(!state.quadLayoutDepthBuffer)
451 // FIXME: Properly optimizes?
// Linear layout: two floats come from this row (xy) and two from the next row
// (the zw lanes of a 16-byte load starting at buffer + pitch - 8).
452 zValue.xy = *Pointer<Float4>(buffer);
453 zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
// Quad layout: all four depth values are contiguous and 16-byte aligned.
457 zValue = *Pointer<Float4>(buffer, 16);
// Each ordered comparison below has two forms; the complementaryDepthBuffer
// form uses the opposite direction of comparison.
463 switch(state.depthCompareMode)
472 zTest = CmpEQ(zValue, Z);
475 zTest = CmpNEQ(zValue, Z);
478 if(complementaryDepthBuffer)
480 zTest = CmpLT(zValue, Z);
484 zTest = CmpNLE(zValue, Z);
487 case DEPTH_GREATEREQUAL:
488 if(complementaryDepthBuffer)
490 zTest = CmpNLT(zValue, Z);
494 zTest = CmpLE(zValue, Z);
497 case DEPTH_LESSEQUAL:
498 if(complementaryDepthBuffer)
500 zTest = CmpLE(zValue, Z);
504 zTest = CmpNLT(zValue, Z);
508 if(complementaryDepthBuffer)
510 zTest = CmpNLE(zValue, Z);
514 zTest = CmpLT(zValue, Z);
// Second switch converts the comparison result into zMask.
521 switch(state.depthCompareMode)
530 zMask = SignMask(zTest) & cMask;
534 if(state.stencilActive)
// Compares the four 16-bit alpha values against the alpha reference
// (DrawData::factor.alphaReference4) according to state.alphaCompareMode and
// stores the packed sign mask of the result in aMask.
// NOTE(review): some case labels, and the statement that combines 'equal' with
// 'cmp' for ALPHA_GREATEREQUAL, are not visible in this listing.
542 void PixelRoutine::alphaTest(Registers &r, Int &aMask, Short4 &alpha)
547 switch(state.alphaCompareMode)
// Equality comparison.
556 cmp = CmpEQ(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)));
557 aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
559 case ALPHA_NOTEQUAL: // a != b ~ !(a == b)
560 cmp = CmpEQ(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4((short)0xFFFF, (short)0xFFFF, (short)0xFFFF, (short)0xFFFF); // FIXME
561 aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
563 case ALPHA_LESS: // a < b ~ b > a
564 cmp = CmpGT(*Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)), alpha);
565 aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
567 case ALPHA_GREATEREQUAL: // a >= b ~ (a > b) || (a == b) ~ !(b > a) // TODO: Approximate
568 equal = CmpEQ(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)));
569 cmp = CmpGT(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)));
571 aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
573 case ALPHA_LESSEQUAL: // a <= b ~ !(a > b)
574 cmp = CmpGT(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4((short)0xFFFF, (short)0xFFFF, (short)0xFFFF, (short)0xFFFF); // FIXME
575 aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
577 case ALPHA_GREATER: // a > b
578 cmp = CmpGT(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)));
579 aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
// Alpha-to-coverage: compares alpha against the four thresholds a2c0..a2c3 from
// DrawData (lane is set where alpha is not less than the threshold) and reduces
// each comparison to a sign mask.
// NOTE(review): the statements applying aMask0..aMask3 to cMask[] are not
// visible in this listing.
586 void PixelRoutine::alphaToCoverage(Registers &r, Int cMask[4], Float4 &alpha)
588 Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(r.data + OFFSET(DrawData,a2c0)));
589 Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(r.data + OFFSET(DrawData,a2c1)));
590 Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(r.data + OFFSET(DrawData,a2c2)));
591 Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(r.data + OFFSET(DrawData,a2c3)));
593 Int aMask0 = SignMask(coverage0);
594 Int aMask1 = SignMask(coverage1);
595 Int aMask2 = SignMask(coverage2);
596 Int aMask3 = SignMask(coverage3);
// Blends the fog colour into the x/y/z channels of c0. The fog factor is
// clamped to [0, 1] when a pixel fog mode is active; the fog colour is then
// subtracted from c0 and later added back.
// NOTE(review): the statements between the subtraction and the addition are not
// visible here - presumably c0 is scaled by 'fog' in between - confirm.
604 void PixelRoutine::fogBlend(Registers &r, Vector4f &c0, Float4 &fog)
611 if(state.pixelFogMode != FOG_NONE)
615 fog = Min(fog, Float4(1.0f));
616 fog = Max(fog, Float4(0.0f));
619 c0.x -= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[0]));
620 c0.y -= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[1]));
621 c0.z -= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[2]));
627 c0.x += *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[0]));
628 c0.y += *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[1]));
629 c0.z += *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[2]));
// Computes the per-pixel fog factor into 'visibility' (aliased locally as zw)
// when a pixel fog mode is active. With a complementary depth buffer the depth
// input is 1 - r.z[0]. Depending on state.pixelFogMode the value is either
// scaled and offset (fog.scale / fog.offset), or multiplied by fog.densityE or
// fog.density2E and passed through exponential2().
// NOTE(review): case labels and some intermediate statements are not visible
// in this listing.
632 void PixelRoutine::pixelFog(Registers &r, Float4 &visibility)
634 Float4 &zw = visibility;
636 if(state.pixelFogMode != FOG_NONE)
644 if(complementaryDepthBuffer)
646 zw = Float4(1.0f) - r.z[0];
655 switch(state.pixelFogMode)
660 zw *= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.scale));
661 zw += *Pointer<Float4>(r.data + OFFSET(DrawData,fog.offset));
664 zw *= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.densityE));
665 zw = exponential2(zw, true);
669 zw *= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.density2E));
670 zw = exponential2(zw, true);
// Writes depth for sample q of the quad at column x. The new depth is the
// interpolated z or, when the shader overrides depth, r.oDepth (as 1 - oDepth
// with a complementary depth buffer). Using zMask, the new values are merged
// with the values already in the buffer so that only pixels selected by the
// mask change, and the result is stored back.
// Checks state.depthWriteEnable first (the body of that check is not visible).
677 void PixelRoutine::writeDepth(Registers &r, Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask)
679 if(!state.depthWriteEnable)
686 if(shader && shader->depthOverride())
688 if(complementaryDepthBuffer)
690 Z = Float4(1.0f) - r.oDepth;
698 Pointer<Byte> buffer;
// Linear layout: 4 bytes per x step, second row one pitch further.
// Quad layout: 8 bytes per x step.
701 if(!state.quadLayoutDepthBuffer)
703 buffer = zBuffer + 4 * x;
704 pitch = *Pointer<Int>(r.data + OFFSET(DrawData,depthPitchB));
708 buffer = zBuffer + 8 * x;
// Advance to the depth slice of sample q.
713 buffer += q * *Pointer<Int>(r.data + OFFSET(DrawData,depthSliceB));
// Same condition as used in depthTest() for loading the stored depth.
718 if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
720 if(!state.quadLayoutDepthBuffer)
722 // FIXME: Properly optimizes?
723 zValue.xy = *Pointer<Float4>(buffer);
724 zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
728 zValue = *Pointer<Float4>(buffer, 16);
// Keep new depth where the mask is set, old depth elsewhere, then combine.
732 Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X) + zMask * 16, 16));
733 zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16));
734 Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
// Linear layout stores two floats per row; quad layout stores all four at once.
736 if(!state.quadLayoutDepthBuffer)
738 // FIXME: Properly optimizes?
739 *Pointer<Float2>(buffer) = Float2(Z.xy);
740 *Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
744 *Pointer<Float4>(buffer, 16) = Z;
// Updates the stencil buffer for sample q of the quad at column x. The new
// value is computed by stencilOperation() from the pass / z-fail / fail
// operations, combined with the buffer value through the stencil write mask,
// optionally merged with the CCW-face result for two-sided stencil, restricted
// to covered pixels via cMask, and written back as 32 bits.
// NOTE(review): the bodies of the leading checks (stencil inactive, all
// operations KEEP, writes fully masked) are not visible - presumably early
// returns - confirm.
748 void PixelRoutine::writeStencil(Registers &r, Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask)
750 if(!state.stencilActive)
755 if(state.stencilPassOperation == OPERATION_KEEP && state.stencilZFailOperation == OPERATION_KEEP && state.stencilFailOperation == OPERATION_KEEP)
757 if(!state.twoSidedStencil || (state.stencilPassOperationCCW == OPERATION_KEEP && state.stencilZFailOperationCCW == OPERATION_KEEP && state.stencilFailOperationCCW == OPERATION_KEEP))
763 if(state.stencilWriteMasked && (!state.twoSidedStencil || state.stencilWriteMaskedCCW))
// Stencil values are addressed at 2 bytes per x step, one slice per sample.
768 Pointer<Byte> buffer = sBuffer + 2 * x;
772 buffer += q * *Pointer<Int>(r.data + OFFSET(DrawData,stencilSliceB));
775 Byte8 bufferValue = As<Byte8>(Long1(*Pointer<UInt>(buffer)));
778 stencilOperation(r, newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask);
// Write mask (front face): new bits where the mask allows, old bits elsewhere.
780 if(!state.noStencilWriteMask)
782 Byte8 maskedValue = bufferValue;
783 newValue &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[0].writeMaskQ));
784 maskedValue &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[0].invWriteMaskQ));
785 newValue |= maskedValue;
// Two-sided stencil: compute the CCW result the same way with the index 1
// state, then select per pixel using the primitive's clockwise masks.
788 if(state.twoSidedStencil)
792 stencilOperation(r, newValueCCW, bufferValue, state.stencilPassOperationCCW, state.stencilZFailOperationCCW, state.stencilFailOperationCCW, true, zMask, sMask);
794 if(!state.noStencilWriteMaskCCW)
796 Byte8 maskedValue = bufferValue;
797 newValueCCW &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[1].writeMaskQ));
798 maskedValue &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[1].invWriteMaskQ));
799 newValueCCW |= maskedValue;
802 newValue &= *Pointer<Byte8>(r.primitive + OFFSET(Primitive,clockwiseMask));
803 newValueCCW &= *Pointer<Byte8>(r.primitive + OFFSET(Primitive,invClockwiseMask));
804 newValue |= newValueCCW;
// Only covered pixels take the new value; the rest keep the buffer value.
807 newValue &= *Pointer<Byte8>(r.constants + OFFSET(Constants,maskB4Q) + 8 * cMask);
808 bufferValue &= *Pointer<Byte8>(r.constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask);
809 newValue |= bufferValue;
811 *Pointer<UInt>(buffer) = UInt(As<Long>(newValue));
// Computes the new stencil value for one face by evaluating the pass operation
// and, only when they differ from operations already evaluated, the z-fail and
// fail operations. Results are then selected per pixel: zMask chooses between
// 'pass' and 'zFail' (when depth testing is active and the two operations
// differ), and sMask chooses between 'pass' and 'fail'. 'pass' aliases the
// output newValue.
// NOTE(review): the statements that OR the masked values together are not
// visible in this listing.
814 void PixelRoutine::stencilOperation(Registers &r, Byte8 &newValue, Byte8 &bufferValue, StencilOperation stencilPassOperation, StencilOperation stencilZFailOperation, StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask)
816 Byte8 &pass = newValue;
820 stencilOperation(r, pass, bufferValue, stencilPassOperation, CCW);
822 if(stencilZFailOperation != stencilPassOperation)
824 stencilOperation(r, zFail, bufferValue, stencilZFailOperation, CCW);
827 if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
829 stencilOperation(r, fail, bufferValue, stencilFailOperation, CCW);
832 if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
834 if(state.depthTestActive && stencilZFailOperation != stencilPassOperation) // zMask valid and values not the same
836 pass &= *Pointer<Byte8>(r.constants + OFFSET(Constants,maskB4Q) + 8 * zMask);
837 zFail &= *Pointer<Byte8>(r.constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask);
841 pass &= *Pointer<Byte8>(r.constants + OFFSET(Constants,maskB4Q) + 8 * sMask);
842 fail &= *Pointer<Byte8>(r.constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask);
// Applies a single stencil operation to bufferValue and stores the result in
// 'output'. Visible results: copy of the buffer value, zero, the reference
// value of the selected face (REPLACE), saturating +1 / -1 (INCRSAT / DECRSAT),
// bitwise inversion (INVERT), and non-saturating +1 / -1.
// NOTE(review): the switch statement and several case labels are not visible;
// the last two are presumably the wrapping increment / decrement - confirm.
847 void PixelRoutine::stencilOperation(Registers &r, Byte8 &output, Byte8 &bufferValue, StencilOperation operation, bool CCW)
852 output = bufferValue;
855 output = Byte8(0x0000000000000000);
857 case OPERATION_REPLACE:
858 output = *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceQ));
860 case OPERATION_INCRSAT:
861 output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
863 case OPERATION_DECRSAT:
864 output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
866 case OPERATION_INVERT:
867 output = bufferValue ^ Byte8(0xFFFFFFFFFFFFFFFF);
870 output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
873 output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
// Fills the x/y/z channels of 'blendFactor' with the colour blend factor
// selected by blendFactorActive. 'current' is the incoming (source) colour and
// 'pixel' the colour read from the render target, both as 16-bit channels
// where 0xFFFF is used as "one" for the inverse factors.
// The w channel is not written here; see blendFactorAlpha().
// NOTE(review): several case labels are not visible in this listing; the
// unlabeled groups are annotated with what the assignments show.
880 void PixelRoutine::blendFactor(Registers &r, const Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorActive)
882 switch(blendFactorActive)
// Source colour.
891 blendFactor.x = current.x;
892 blendFactor.y = current.y;
893 blendFactor.z = current.z;
895 case BLEND_INVSOURCE:
896 blendFactor.x = Short4(0xFFFFu) - current.x;
897 blendFactor.y = Short4(0xFFFFu) - current.y;
898 blendFactor.z = Short4(0xFFFFu) - current.z;
// Destination colour.
901 blendFactor.x = pixel.x;
902 blendFactor.y = pixel.y;
903 blendFactor.z = pixel.z;
// Inverse destination colour.
906 blendFactor.x = Short4(0xFFFFu) - pixel.x;
907 blendFactor.y = Short4(0xFFFFu) - pixel.y;
908 blendFactor.z = Short4(0xFFFFu) - pixel.z;
910 case BLEND_SOURCEALPHA:
911 blendFactor.x = current.w;
912 blendFactor.y = current.w;
913 blendFactor.z = current.w;
915 case BLEND_INVSOURCEALPHA:
916 blendFactor.x = Short4(0xFFFFu) - current.w;
917 blendFactor.y = Short4(0xFFFFu) - current.w;
918 blendFactor.z = Short4(0xFFFFu) - current.w;
920 case BLEND_DESTALPHA:
921 blendFactor.x = pixel.w;
922 blendFactor.y = pixel.w;
923 blendFactor.z = pixel.w;
925 case BLEND_INVDESTALPHA:
926 blendFactor.x = Short4(0xFFFFu) - pixel.w;
927 blendFactor.y = Short4(0xFFFFu) - pixel.w;
928 blendFactor.z = Short4(0xFFFFu) - pixel.w;
930 case BLEND_SRCALPHASAT:
// min(1 - destination alpha, source alpha), compared as unsigned 16-bit values.
931 blendFactor.x = Short4(0xFFFFu) - pixel.w;
932 blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w));
933 blendFactor.y = blendFactor.x;
934 blendFactor.z = blendFactor.x;
// Constant blend colour, per channel.
937 blendFactor.x = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[0]));
938 blendFactor.y = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[1]));
939 blendFactor.z = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[2]));
941 case BLEND_INVCONSTANT:
942 blendFactor.x = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
943 blendFactor.y = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
944 blendFactor.z = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
946 case BLEND_CONSTANTALPHA:
947 blendFactor.x = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
948 blendFactor.y = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
949 blendFactor.z = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
951 case BLEND_INVCONSTANTALPHA:
952 blendFactor.x = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
953 blendFactor.y = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
954 blendFactor.z = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
// Fills the w channel of 'blendFactor' with the alpha blend factor selected by
// blendFactorAlphaActive. 'current' is the incoming (source) colour and 'pixel'
// the colour read from the render target. For the alpha channel the colour
// factors reduce to their alpha counterparts (e.g. BLEND_INVSOURCE uses
// 0xFFFF - current.w), BLEND_SRCALPHASAT yields 0xFFFF, and both inverse
// constant factors read invBlendConstant4W[3].
// NOTE(review): several case labels are not visible in this listing.
961 void PixelRoutine::blendFactorAlpha(Registers &r, const Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorAlphaActive)
963 switch(blendFactorAlphaActive)
// Source alpha.
972 blendFactor.w = current.w;
974 case BLEND_INVSOURCE:
975 blendFactor.w = Short4(0xFFFFu) - current.w;
// Destination alpha.
978 blendFactor.w = pixel.w;
// Inverse destination alpha.
981 blendFactor.w = Short4(0xFFFFu) - pixel.w;
983 case BLEND_SOURCEALPHA:
984 blendFactor.w = current.w;
986 case BLEND_INVSOURCEALPHA:
987 blendFactor.w = Short4(0xFFFFu) - current.w;
989 case BLEND_DESTALPHA:
990 blendFactor.w = pixel.w;
992 case BLEND_INVDESTALPHA:
993 blendFactor.w = Short4(0xFFFFu) - pixel.w;
995 case BLEND_SRCALPHASAT:
996 blendFactor.w = Short4(0xFFFFu);
999 case BLEND_CONSTANTALPHA:
1000 blendFactor.w = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
1002 case BLEND_INVCONSTANT:
1003 case BLEND_INVCONSTANTALPHA:
1004 blendFactor.w = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
// Reads the 2x2 quad at column x from render target 'index' and expands it
// into 'pixel' as four 16-bit channel vectors (x, y, z, w), according to
// state.targetFormat[index]. The second row of the quad is reached by adding
// DrawData::colorPitchB[index] to the buffer pointer. Formats without an alpha
// channel set pixel.w to 0xFFFF. When postBlendSRGB and state.writeSRGB are
// both set the result is converted with sRGBtoLinear16_12_16().
// NOTE(review): several case labels and intermediate statements are not
// visible in this listing; unlabeled groups are annotated with hedged guesses.
1011 void PixelRoutine::readPixel(Registers &r, int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &pixel)
1015 Pointer<Byte> buffer;
1016 Pointer<Byte> buffer2;
1018 switch(state.targetFormat[index])
// 2 bytes per pixel with 5/6/5-bit channel masks.
// NOTE(review): presumably FORMAT_R5G6B5 - label not visible, confirm.
1021 buffer = cBuffer + 2 * x;
1022 buffer2 = buffer + *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1023 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
// Shift each field so that it occupies the top bits of its 16-bit lane.
1025 pixel.x = c01 & Short4(0xF800u);
1026 pixel.y = (c01 & Short4(0x07E0u)) << 5;
1027 pixel.z = (c01 & Short4(0x001Fu)) << 11;
1028 pixel.w = Short4(0xFFFFu);
1030 case FORMAT_A8R8G8B8:
// 4 bytes per pixel: load two pixels from each row, then de-interleave the
// bytes with unpack operations.
1031 buffer = cBuffer + 4 * x;
1032 c01 = *Pointer<Short4>(buffer);
1033 buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1034 c23 = *Pointer<Short4>(buffer);
1037 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1038 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1040 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1041 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
// Unpacking a vector with itself duplicates each byte into both halves of a
// 16-bit lane, widening 8-bit channels to 16 bits.
1044 pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1045 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1046 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1047 pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1049 case FORMAT_A8B8G8R8:
1050 buffer = cBuffer + 4 * x;
1051 c01 = *Pointer<Short4>(buffer);
1052 buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1053 c23 = *Pointer<Short4>(buffer);
1056 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1057 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1059 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1060 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
// Compared with FORMAT_A8R8G8B8, x and z take their data from different
// intermediates here.
1063 pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1064 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1065 pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1066 pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
// 1 byte per pixel, loaded into w only; x/y/z are zeroed.
// NOTE(review): presumably an alpha-only format - label not visible, confirm.
1069 buffer = cBuffer + 1 * x;
1070 pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 0);
1071 buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1072 pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 1);
1073 pixel.w = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1074 pixel.x = Short4(0x0000);
1075 pixel.y = Short4(0x0000);
1076 pixel.z = Short4(0x0000);
1078 case FORMAT_X8R8G8B8:
1079 buffer = cBuffer + 4 * x;
1080 c01 = *Pointer<Short4>(buffer);
1081 buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1082 c23 = *Pointer<Short4>(buffer);
1085 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1086 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1088 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1089 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1091 pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1092 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1093 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1094 pixel.w = Short4(0xFFFFu);
1096 case FORMAT_X8B8G8R8:
1097 buffer = cBuffer + 4 * x;
1098 c01 = *Pointer<Short4>(buffer);
1099 buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1100 c23 = *Pointer<Short4>(buffer);
1103 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1104 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1106 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1107 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1110 pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1111 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1112 pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1113 pixel.w = Short4(0xFFFFu);
// The two quad-layout formats below only have commented-out code visible.
1115 case FORMAT_A8G8R8B8Q:
1117 // pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1118 // pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1119 // pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1120 // pixel.w = UnpackHigh(As<Byte8>(pixel.w), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1122 case FORMAT_X8G8R8B8Q:
1124 // pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1125 // pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1126 // pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1127 // pixel.w = Short4(0xFFFFu);
1129 case FORMAT_A16B16G16R16:
// 8 bytes per pixel: one pixel per Short4 load, two per row, then a 4x4
// transpose turns the per-pixel vectors into per-channel vectors.
1131 pixel.x = *Pointer<Short4>(buffer + 8 * x);
1132 pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
1133 buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1134 pixel.z = *Pointer<Short4>(buffer + 8 * x);
1135 pixel.w = *Pointer<Short4>(buffer + 8 * x + 8);
1136 transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
// 4 bytes per pixel holding two 16-bit channels; z and w are set to 0xFFFF.
// NOTE(review): presumably FORMAT_G16R16 - label not visible, confirm.
1140 pixel.x = *Pointer<Short4>(buffer + 4 * x);
1141 buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1142 pixel.y = *Pointer<Short4>(buffer + 4 * x);
1144 pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
1145 pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y));
1147 pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z));
1148 pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z));
1149 pixel.z = Short4(0xFFFFu);
1150 pixel.w = Short4(0xFFFFu);
// Convert the values just read from sRGB to linear when blending is done in
// linear space on an sRGB target.
1156 if(postBlendSRGB && state.writeSRGB)
1158 sRGBtoLinear16_12_16(r, pixel);
// Blends the incoming colour 'current' with the colour already stored in
// render target 'index' at column x and leaves the result in 'current'.
// The destination is read with readPixel(); source and destination factors are
// produced by blendFactor() for x/y/z and blendFactorAlpha() for w. Each side is
// multiplied by its factor with an unsigned high multiply, skipped when the
// factor is BLEND_ONE or BLEND_ZERO, and the two sides are then combined
// according to state.blendOperation (colour) and state.blendOperationAlpha
// (alpha).
// Checks state.alphaBlendActive first (the body of that check is not visible).
// NOTE(review): several case labels are not visible in this listing; unlabeled
// groups are annotated with what the statements show.
1162 void PixelRoutine::alphaBlend(Registers &r, int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
1164 if(!state.alphaBlendActive)
1170 readPixel(r, index, cBuffer, x, pixel);
1172 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
1173 Vector4s sourceFactor;
1174 Vector4s destFactor;
1176 blendFactor(r, sourceFactor, current, pixel, state.sourceBlendFactor);
1177 blendFactor(r, destFactor, current, pixel, state.destBlendFactor);
1179 if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
1181 current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x));
1182 current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
1183 current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
1186 if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
1188 pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
1189 pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y));
1190 pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z));
1193 switch(state.blendOperation)
// Saturating source + destination.
1196 current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1197 current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1198 current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
// Saturating source - destination.
1201 current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1202 current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1203 current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1205 case BLENDOP_INVSUB:
1206 current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x));
1207 current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y));
1208 current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z));
// Per-channel unsigned minimum.
1211 current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x));
1212 current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y));
1213 current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z));
// Per-channel unsigned maximum.
1216 current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x));
1217 current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y));
1218 current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z));
1220 case BLENDOP_SOURCE:
// Result is the destination colour.
1224 current.x = pixel.x;
1225 current.y = pixel.y;
1226 current.z = pixel.z;
// Result is zero.
1229 current.x = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1230 current.y = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1231 current.z = Short4(0x0000, 0x0000, 0x0000, 0x0000);
// Alpha channel: same scheme with the alpha factors and alpha operation.
1237 blendFactorAlpha(r, sourceFactor, current, pixel, state.sourceBlendFactorAlpha);
1238 blendFactorAlpha(r, destFactor, current, pixel, state.destBlendFactorAlpha);
1240 if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
1242 current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
1245 if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
1247 pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
1250 switch(state.blendOperationAlpha)
1253 current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1256 current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1258 case BLENDOP_INVSUB:
1259 current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w));
1262 current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w));
1265 current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w));
1267 case BLENDOP_SOURCE:
1271 current.w = pixel.w;
1274 current.w = Short4(0x0000, 0x0000, 0x0000, 0x0000);
// Applies the bitwise logical operation selected by state.logicalOperation to
// the x/y/z channels of the incoming colour 'current' and the colour read from
// render target 'index' at column x, leaving the result in 'current'.
// LOGICALOP_COPY is checked up front (the body of that check is not visible)
// and asserted unreachable inside the switch. None of the visible statements
// write current.w.
// NOTE(review): several case labels are not visible in this listing; unlabeled
// groups are annotated with what the statements show.
1281 void PixelRoutine::logicOperation(Registers &r, int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
1283 if(state.logicalOperation == LOGICALOP_COPY)
1289 readPixel(r, index, cBuffer, x, pixel);
1291 switch(state.logicalOperation)
1293 case LOGICALOP_CLEAR:
// All bits set.
1299 current.x = 0xFFFFu;
1300 current.y = 0xFFFFu;
1301 current.z = 0xFFFFu;
1303 case LOGICALOP_COPY:
1304 ASSERT(false); // Optimized out
1306 case LOGICALOP_COPY_INVERTED:
1307 current.x = ~current.x;
1308 current.y = ~current.y;
1309 current.z = ~current.z;
1311 case LOGICALOP_NOOP:
// Result is the destination colour.
1312 current.x = pixel.x;
1313 current.y = pixel.y;
1314 current.z = pixel.z;
1316 case LOGICALOP_INVERT:
1317 current.x = ~pixel.x;
1318 current.y = ~pixel.y;
1319 current.z = ~pixel.z;
// Bitwise AND of destination and source.
1322 current.x = pixel.x & current.x;
1323 current.y = pixel.y & current.y;
1324 current.z = pixel.z & current.z;
1326 case LOGICALOP_NAND:
1327 current.x = ~(pixel.x & current.x);
1328 current.y = ~(pixel.y & current.y);
1329 current.z = ~(pixel.z & current.z);
// Bitwise OR of destination and source.
1332 current.x = pixel.x | current.x;
1333 current.y = pixel.y | current.y;
1334 current.z = pixel.z | current.z;
// Inverted bitwise OR.
1337 current.x = ~(pixel.x | current.x);
1338 current.y = ~(pixel.y | current.y);
1339 current.z = ~(pixel.z | current.z);
// Bitwise XOR of destination and source.
1342 current.x = pixel.x ^ current.x;
1343 current.y = pixel.y ^ current.y;
1344 current.z = pixel.z ^ current.z;
1346 case LOGICALOP_EQUIV:
1347 current.x = ~(pixel.x ^ current.x);
1348 current.y = ~(pixel.y ^ current.y);
1349 current.z = ~(pixel.z ^ current.z);
1351 case LOGICALOP_AND_REVERSE:
1352 current.x = ~pixel.x & current.x;
1353 current.y = ~pixel.y & current.y;
1354 current.z = ~pixel.z & current.z;
1356 case LOGICALOP_AND_INVERTED:
1357 current.x = pixel.x & ~current.x;
1358 current.y = pixel.y & ~current.y;
1359 current.z = pixel.z & ~current.z;
1361 case LOGICALOP_OR_REVERSE:
1362 current.x = ~pixel.x | current.x;
1363 current.y = ~pixel.y | current.y;
1364 current.z = ~pixel.z | current.z;
1366 case LOGICALOP_OR_INVERTED:
1367 current.x = pixel.x | ~current.x;
1368 current.y = pixel.y | ~current.y;
1369 current.z = pixel.z | ~current.z;
// Writes the fixed-point colour `current` to render target `index` of cBuffer at column x.
// Visible steps, in order:
//   1. optional linear -> sRGB conversion (postBlendSRGB && state.writeSRGB);
//   2. optional rounding bias (exactColorRounding), per target format;
//   3. packing of the channels into the layout of state.targetFormat[index];
//   4. a masked read-modify-write of the buffer at cBuffer and at
//      cBuffer + colorPitchB[index], using the per-channel write mask and `xMask`.
// NOTE(review): this listing has lost its shortest lines (braces, `break;`, `else`,
// merges such as `c01 |= masked;`, some `case` labels, and the statements that build
// xMask -- presumably from cMask, zMask and sMask). Comments below describe only what is
// visible and mark guesses as such; confirm against the complete file.
1376 void PixelRoutine::writeColor(Registers &r, int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &current, Int &sMask, Int &zMask, Int &cMask)
// Convert x, y, z from linear to sRGB when sRGB writes are applied after blending.
1378 if(postBlendSRGB && state.writeSRGB)
1380 linearToSRGB16_12_16(r, current);
// Optional rounding bias, applied before the channels are reduced to the target precision.
1383 if(exactColorRounding)
1385 switch(state.targetFormat[index])
// Case label not visible. Adds 0x0400 to x and z and 0x0200 to y with unsigned saturation;
// these are half of the 0x0800 / 0x0400 steps kept by the 0xF800 / 0xFC00 masks of the
// 5-6-5 packing further down.
1388 current.x = AddSat(As<UShort4>(current.x), UShort4(0x0400));
1389 current.y = AddSat(As<UShort4>(current.y), UShort4(0x0200));
1390 current.z = AddSat(As<UShort4>(current.z), UShort4(0x0400));
1392 case FORMAT_X8G8R8B8Q:
1393 case FORMAT_A8G8R8B8Q:
1394 case FORMAT_X8R8G8B8:
1395 case FORMAT_X8B8G8R8:
1396 case FORMAT_A8R8G8B8:
1397 case FORMAT_A8B8G8R8:
// 8-bit-per-channel targets: c = c - (c >> 8) + 0x0080 on all four channels.
1398 current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
1399 current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
1400 current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
1401 current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
// Channel write mask for this render target (bit 3 / 0x8 gates the alpha-only write below).
1406 int rgbaWriteMask = state.colorWriteActive(index);
// Same mask with bits 0 and 2 exchanged; bits 1 and 3 stay in place.
1407 int bgraWriteMask = rgbaWriteMask & 0x0000000A | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
// Bit 3 kept, bits 0 and 1 moved up one position, bit 2 moved down to bit 0.
// NOTE(review): no line visible in this listing reads brgaWriteMask.
1408 int brgaWriteMask = rgbaWriteMask & 0x00000008 | (rgbaWriteMask & 0x00000001) << 1 | (rgbaWriteMask & 0x00000002) << 1 | (rgbaWriteMask & 0x00000004) >> 2;
// Pack the channels into the memory layout of the target format.
1410 switch(state.targetFormat[index])
// Case label not visible (presumably the 5-6-5 format). Keeps the top 5 bits of x, the top
// 6 bits of y shifted down by 5 and the top 5 bits of z shifted down by 11, then ORs them
// into one 16-bit value per lane in current.x.
1414 current.x = current.x & Short4(0xF800u);
1415 current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
1416 current.z = As<UShort4>(current.z) >> 11;
1418 current.x = current.x | current.y | current.z;
1421 case FORMAT_X8G8R8B8Q:
1423 //	current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1424 //	current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1425 //	current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1427 //	current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1428 //	current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1430 case FORMAT_A8G8R8B8Q:
1432 //	current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1433 //	current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1434 //	current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1435 //	current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1437 //	current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1438 //	current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1440 case FORMAT_X8R8G8B8:
1441 case FORMAT_A8R8G8B8:
// No alpha to store (X8 format, or write mask 0x7 whose alpha bit is clear): current.w is
// not read and y is packed with itself.
1442 if(state.targetFormat[index] == FORMAT_X8R8G8B8 || rgbaWriteMask == 0x7)
// Reduce each channel to its high 8 bits.
1444 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1445 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1446 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
// Pack z together with x, and y together with y.
1448 current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1449 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
// Interleave the packed values byte-wise, then word-wise; the results are left in
// current.z and current.y (copied into c01 / c23 after this switch).
1451 current.x = current.z;
1452 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1453 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1454 current.y = current.z;
1455 current.z = As<Short4>(UnpackLow(current.z, current.x));
1456 current.y = As<Short4>(UnpackHigh(current.y, current.x));
// Presumably the `else` branch (keyword line not visible): same sequence with alpha
// included, i.e. w is reduced to 8 bits and y is packed together with w.
1460 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1461 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1462 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1463 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1465 current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1466 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1468 current.x = current.z;
1469 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1470 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1471 current.y = current.z;
1472 current.z = As<Short4>(UnpackLow(current.z, current.x));
1473 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1476 case FORMAT_X8B8G8R8:
1477 case FORMAT_A8B8G8R8:
// Same scheme as the R8G8B8 formats above, but x and z trade places in the first Pack.
1478 if(state.targetFormat[index] == FORMAT_X8B8G8R8 || rgbaWriteMask == 0x7)
1480 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1481 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1482 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1484 current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
1485 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1487 current.x = current.z;
1488 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1489 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1490 current.y = current.z;
1491 current.z = As<Short4>(UnpackLow(current.z, current.x));
1492 current.y = As<Short4>(UnpackHigh(current.y, current.x));
// Presumably the `else` branch (keyword line not visible): alpha included.
1496 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1497 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1498 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1499 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1501 current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
1502 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1504 current.x = current.z;
1505 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1506 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1507 current.y = current.z;
1508 current.z = As<Short4>(UnpackLow(current.z, current.x));
1509 current.y = As<Short4>(UnpackHigh(current.y, current.x));
// Case label not visible (presumably the alpha-only format written under
// `rgbaWriteMask & 0x8` below): reduce w to 8 bits and pack it with itself.
1513 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1514 current.w = As<Short4>(Pack(As<UShort4>(current.w), As<UShort4>(current.w)));
// Case label not visible (presumably the two-channel 16-bit format written with
// `cBuffer + 4 * x` below): interleave x and y, low half into current.x, high half into
// current.z, which is then copied to current.y.
1517 current.z = current.x;
1518 current.x = As<Short4>(UnpackLow(current.x, current.y));
1519 current.z = As<Short4>(UnpackHigh(current.z, current.y));
1520 current.y = current.z;
1522 case FORMAT_A16B16G16R16:
// Turn the four per-channel vectors into four per-pixel vectors.
1523 transpose4x4(current.x, current.y, current.z, current.w);
// The two packed values used by the 8-bit-per-channel writes below.
1529 Short4 c01 = current.z;
1530 Short4 c23 = current.y;
1532 Int xMask; // Combination of all masks
// NOTE(review): the bodies of the next two conditionals (and any other statement that
// assigns xMask) are not visible in this listing.
1534 if(state.depthTestActive)
1543 if(state.stencilActive)
// Masked read-modify-write, per target format. In each path the new value is ANDed with a
// mask and the old buffer contents with the inverse mask; the lines that OR the two
// together are among the short lines missing from this listing.
1548 switch(state.targetFormat[index])
// Case label not visible (2 bytes per pixel; uses the 565 mask tables).
1552 Pointer<Byte> buffer = cBuffer + 2 * x;
1553 Int value = *Pointer<Int>(buffer);
// First 32 bits of the packed value.
1555 Int c01 = Extract(As<Int2>(current.x), 0);
// Apply the channel write mask only when not all three colour channels are enabled.
1557 if((bgraWriteMask & 0x00000007) != 0x00000007)
1560 c01 &= *Pointer<Int>(r.constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1561 masked &= *Pointer<Int>(r.constants + OFFSET(Constants,invMask565Q[bgraWriteMask & 0x7][0]));
// Apply the combined mask, looked up by xMask.
1565 c01 &= *Pointer<Int>(r.constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
1566 value &= *Pointer<Int>(r.constants + OFFSET(Constants,invMaskW4Q[0][0]) + xMask * 8);
1568 *Pointer<Int>(buffer) = c01;
// Advance by colorPitchB[index] (presumably to the next row) and repeat with the second
// 32 bits of the packed value.
1570 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1571 value = *Pointer<Int>(buffer);
1573 Int c23 = Extract(As<Int2>(current.x), 1);
1575 if((bgraWriteMask & 0x00000007) != 0x00000007)
1578 c23 &= *Pointer<Int>(r.constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1579 masked &= *Pointer<Int>(r.constants + OFFSET(Constants,invMask565Q[bgraWriteMask & 0x7][0]));
1583 c23 &= *Pointer<Int>(r.constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
1584 value &= *Pointer<Int>(r.constants + OFFSET(Constants,invMaskW4Q[0][2]) + xMask * 8);
1586 *Pointer<Int>(buffer) = c23;
1589 case FORMAT_A8G8R8B8Q:
1590 case FORMAT_X8G8R8B8Q: // FIXME: Don't touch alpha?
1592 //	value = *Pointer<Short4>(cBuffer + 8 * x + 0);
1594 //	if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1595 //	   ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1596 //	    (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh?
1598 //	Short4 masked = value;
1599 //	c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1600 //	masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1604 //	c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1605 //	value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1607 //	*Pointer<Short4>(cBuffer + 8 * x + 0) = c01;
1609 //	value = *Pointer<Short4>(cBuffer + 8 * x + 8);
1611 //	if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1612 //	   ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1613 //	    (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh?
1615 //	Short4 masked = value;
1616 //	c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1617 //	masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1621 //	c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1622 //	value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1624 //	*Pointer<Short4>(cBuffer + 8 * x + 8) = c23;
1626 case FORMAT_A8R8G8B8:
1627 case FORMAT_X8R8G8B8: // FIXME: Don't touch alpha?
// 4 bytes per pixel; each Short4 load/store covers 8 bytes.
1629 Pointer<Byte> buffer = cBuffer + x * 4;
1630 Short4 value = *Pointer<Short4>(buffer);
// Channel masking is needed when an A8R8G8B8 target does not write all four channels, or
// when an X8R8G8B8 target has a mask that is neither 0x7 nor 0xF.
1632 if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1633 ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1634 (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh?
1636 Short4 masked = value;
1637 c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1638 masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
// Combined mask for the first store, looked up by xMask.
1642 c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1643 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1645 *Pointer<Short4>(buffer) = c01;
// Second store, one colorPitchB[index] further on, using c23 and the D23 mask tables.
1647 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1648 value = *Pointer<Short4>(buffer);
1650 if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1651 ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1652 (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh?
1654 Short4 masked = value;
1655 c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1656 masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1660 c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1661 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1663 *Pointer<Short4>(buffer) = c23;
1666 case FORMAT_A8B8G8R8:
1667 case FORMAT_X8B8G8R8: // FIXME: Don't touch alpha?
// Same as the R8G8B8 path above, but indexed with rgbaWriteMask instead of bgraWriteMask.
1669 Pointer<Byte> buffer = cBuffer + x * 4;
1670 Short4 value = *Pointer<Short4>(buffer);
1672 if((state.targetFormat[index] == FORMAT_A8B8G8R8 && rgbaWriteMask != 0x0000000F) ||
1673 ((state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x00000007) &&
1674 (state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x0000000F))) // FIXME: Need for masking when XBGR && Fh?
1676 Short4 masked = value;
1677 c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1678 masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1682 c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1683 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1685 *Pointer<Short4>(buffer) = c01;
1687 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1688 value = *Pointer<Short4>(buffer);
1690 if((state.targetFormat[index] == FORMAT_A8B8G8R8 && rgbaWriteMask != 0x0000000F) ||
1691 ((state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x00000007) &&
1692 (state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x0000000F))) // FIXME: Need for masking when XBGR && Fh?
1694 Short4 masked = value;
1695 c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1696 masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1700 c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1701 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1703 *Pointer<Short4>(buffer) = c23;
// Case label not visible (1 byte per pixel). Nothing is written unless the alpha bit of
// the write mask is set.
1707 if(rgbaWriteMask & 0x00000008)
1709 Pointer<Byte> buffer = cBuffer + 1 * x;
// Gather one Short from each of the two addresses into lanes 0 and 1 of `value`, then
// duplicate its bytes with UnpackLow.
1711 Insert(value, *Pointer<Short>(buffer), 0);
1712 Int pitch = *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1713 Insert(value, *Pointer<Short>(buffer + pitch), 1);
1714 value = UnpackLow(As<Byte8>(value), As<Byte8>(value));
1716 current.w &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q) + 8 * xMask);
1717 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q) + 8 * xMask);
// Store lanes 0 and 1 of current.w back to the two addresses.
1720 *Pointer<Short>(buffer) = Extract(current.w, 0);
1721 *Pointer<Short>(buffer + pitch) = Extract(current.w, 1);
// Case label not visible (4 bytes per pixel, two channels selected by the low two bits of
// the write mask). current.x goes to the first address, current.y to the second.
1726 Pointer<Byte> buffer = cBuffer + 4 * x;
1728 Short4 value = *Pointer<Short4>(buffer);
1730 if((rgbaWriteMask & 0x00000003) != 0x00000003)
1732 Short4 masked = value;
1733 current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1734 masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0]));
1735 current.x |= masked;
1738 current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1739 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1741 *Pointer<Short4>(buffer) = current.x;
1743 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1745 value = *Pointer<Short4>(buffer);
1747 if((rgbaWriteMask & 0x00000003) != 0x00000003)
1749 Short4 masked = value;
1750 current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1751 masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0]));
1752 current.y |= masked;
1755 current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1756 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1758 *Pointer<Short4>(buffer) = current.y;
1761 case FORMAT_A16B16G16R16:
// 8 bytes per pixel. After the transpose above, current.x / .y are stored at buffer and
// buffer + 8, and current.z / .w at the same offsets one colorPitchB[index] further on.
// Each store uses its own xMask table (maskQ0Q .. maskQ3Q).
1763 Pointer<Byte> buffer = cBuffer + 8 * x;
1766 Short4 value = *Pointer<Short4>(buffer);
1768 if(rgbaWriteMask != 0x0000000F)
1770 Short4 masked = value;
1771 current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1772 masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1773 current.x |= masked;
1776 current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
1777 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
1779 *Pointer<Short4>(buffer) = current.x;
1783 Short4 value = *Pointer<Short4>(buffer + 8);
1785 if(rgbaWriteMask != 0x0000000F)
1787 Short4 masked = value;
1788 current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1789 masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1790 current.y |= masked;
1793 current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
1794 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
1796 *Pointer<Short4>(buffer + 8) = current.y;
1799 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1802 Short4 value = *Pointer<Short4>(buffer);
1804 if(rgbaWriteMask != 0x0000000F)
1806 Short4 masked = value;
1807 current.z &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1808 masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1809 current.z |= masked;
1812 current.z &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
1813 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
1815 *Pointer<Short4>(buffer) = current.z;
1819 Short4 value = *Pointer<Short4>(buffer + 8);
1821 if(rgbaWriteMask != 0x0000000F)
1823 Short4 masked = value;
1824 current.w &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1825 masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1826 current.w |= masked;
1829 current.w &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
1830 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
1832 *Pointer<Short4>(buffer + 8) = current.w;
// Fills the x, y and z components of `blendFactor` with the colour blend factor selected
// by `blendFactorActive`. Depending on the case, the factor comes from the source colour
// `oC`, the destination colour `pixel`, or the blend constants stored in DrawData.
// blendFactor.w is not written here.
// NOTE(review): `blendFactor` is declared `const Vector4f &` yet its members are assigned;
// this presumably relies on the component type's assignment being callable on const
// objects -- confirm.
// NOTE(review): this listing has lost its shortest lines (braces, `break;`, some `case`
// labels); groups whose label is not visible are marked as assumptions below.
1841 void PixelRoutine::blendFactor(Registers &r, const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorActive)
1843 switch(blendFactorActive)
// factor = source colour (case label not visible; presumably BLEND_SOURCE)
1852 blendFactor.x = oC.x;
1853 blendFactor.y = oC.y;
1854 blendFactor.z = oC.z;
1856 case BLEND_INVSOURCE:
// factor = 1 - source colour
1857 blendFactor.x = Float4(1.0f) - oC.x;
1858 blendFactor.y = Float4(1.0f) - oC.y;
1859 blendFactor.z = Float4(1.0f) - oC.z;
// factor = destination colour (case label not visible; presumably BLEND_DEST)
1862 blendFactor.x = pixel.x;
1863 blendFactor.y = pixel.y;
1864 blendFactor.z = pixel.z;
// factor = 1 - destination colour (case label not visible; presumably BLEND_INVDEST)
1867 blendFactor.x = Float4(1.0f) - pixel.x;
1868 blendFactor.y = Float4(1.0f) - pixel.y;
1869 blendFactor.z = Float4(1.0f) - pixel.z;
1871 case BLEND_SOURCEALPHA:
// factor = source alpha, replicated to all three channels
1872 blendFactor.x = oC.w;
1873 blendFactor.y = oC.w;
1874 blendFactor.z = oC.w;
1876 case BLEND_INVSOURCEALPHA:
// factor = 1 - source alpha
1877 blendFactor.x = Float4(1.0f) - oC.w;
1878 blendFactor.y = Float4(1.0f) - oC.w;
1879 blendFactor.z = Float4(1.0f) - oC.w;
1881 case BLEND_DESTALPHA:
// factor = destination alpha
1882 blendFactor.x = pixel.w;
1883 blendFactor.y = pixel.w;
1884 blendFactor.z = pixel.w;
1886 case BLEND_INVDESTALPHA:
// factor = 1 - destination alpha
1887 blendFactor.x = Float4(1.0f) - pixel.w;
1888 blendFactor.y = Float4(1.0f) - pixel.w;
1889 blendFactor.z = Float4(1.0f) - pixel.w;
1891 case BLEND_SRCALPHASAT:
// factor = min(1 - destination alpha, source alpha), replicated to all three channels
1892 blendFactor.x = Float4(1.0f) - pixel.w;
1893 blendFactor.x = Min(blendFactor.x, oC.w);
1894 blendFactor.y = blendFactor.x;
1895 blendFactor.z = blendFactor.x;
1897 case BLEND_CONSTANT:
// factor = elements 0..2 of DrawData::factor.blendConstant4F
1898 blendFactor.x = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[0]));
1899 blendFactor.y = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[1]));
1900 blendFactor.z = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[2]));
1902 case BLEND_INVCONSTANT:
// factor = elements 0..2 of DrawData::factor.invBlendConstant4F
1903 blendFactor.x = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
1904 blendFactor.y = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
1905 blendFactor.z = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
// Alpha counterpart of blendFactor(): fills only blendFactor.w with the factor selected by
// `blendFactorAlphaActive`, taken from the source alpha `oC.w`, the destination alpha
// `pixel.w`, the constant 1.0, or element 3 of the blend constants in DrawData.
// In the visible cases the colour-based selectors (e.g. BLEND_INVSOURCE) and their
// alpha-based twins (e.g. BLEND_INVSOURCEALPHA) produce the same w value.
// NOTE(review): this listing has lost its shortest lines (braces, `break;`, some `case`
// labels); groups whose label is not visible are marked as assumptions below.
1912 void PixelRoutine::blendFactorAlpha(Registers &r, const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorAlphaActive)
1914 switch(blendFactorAlphaActive)
// factor = source alpha (case label not visible; presumably BLEND_SOURCE)
1923 blendFactor.w = oC.w;
1925 case BLEND_INVSOURCE:
1926 blendFactor.w = Float4(1.0f) - oC.w;
// factor = destination alpha (case label not visible; presumably BLEND_DEST)
1929 blendFactor.w = pixel.w;
// factor = 1 - destination alpha (case label not visible; presumably BLEND_INVDEST)
1932 blendFactor.w = Float4(1.0f) - pixel.w;
1934 case BLEND_SOURCEALPHA:
1935 blendFactor.w = oC.w;
1937 case BLEND_INVSOURCEALPHA:
1938 blendFactor.w = Float4(1.0f) - oC.w;
1940 case BLEND_DESTALPHA:
1941 blendFactor.w = pixel.w;
1943 case BLEND_INVDESTALPHA:
1944 blendFactor.w = Float4(1.0f) - pixel.w;
1946 case BLEND_SRCALPHASAT:
// For the alpha channel this selector yields a constant factor of 1.
1947 blendFactor.w = Float4(1.0f);
1949 case BLEND_CONSTANT:
1950 blendFactor.w = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[3]));
1952 case BLEND_INVCONSTANT:
1953 blendFactor.w = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
// Floating-point alpha blending: reads the destination colour for render target `index`
// at column x, scales the source colour `oC` and the destination `pixel` by their blend
// factors, and combines them according to state.blendOperation (colour) and
// state.blendOperationAlpha (alpha). The result is left in oC.
// NOTE(review): this listing has lost its shortest lines (braces, `break;`, `return;`,
// some `case` labels and short statements such as `oC.x += pixel.x;`). Comments below
// describe only what is visible and mark guesses as such.
1960 void PixelRoutine::alphaBlend(Registers &r, int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x)
// Guard for disabled blending (body not visible; presumably an early return).
1962 if(!state.alphaBlendActive)
1967 Pointer<Byte> buffer;
// Read the destination colour; the layout depends on the target format.
// NOTE(review): the assignments that initialise `buffer` in each case are not visible.
1974 switch(state.targetFormat[index])
// Case label not visible (4 bytes per pixel, one float channel): two floats from the first
// address and two from colorPitchB[index] further on fill the lanes of pixel.x; the
// remaining channels are set to 1.
1979 pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0);
1980 pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4);
1981 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1983 pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0);
1984 pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4);
1985 pixel.y = Float4(1.0f);
1986 pixel.z = Float4(1.0f);
1987 pixel.w = Float4(1.0f);
1989 case FORMAT_G32R32F:
// 8 bytes per pixel: load one Float4 from each of the two addresses, then shuffle
// (masks 0x88 and 0xDD), presumably to separate the two interleaved channels.
// NOTE(review): the statements that copy into / out of pixel.z around the shuffles are
// not visible here.
1991 pixel.x = *Pointer<Float4>(buffer + 8 * x, 16);
1992 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1993 pixel.y = *Pointer<Float4>(buffer + 8 * x, 16);
1995 pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x88);
1996 pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0xDD);
1998 pixel.z = Float4(1.0f);
1999 pixel.w = Float4(1.0f);
2001 case FORMAT_A32B32G32R32F:
// 16 bytes per pixel: load four Float4 values (two per address) and transpose them so that
// each of pixel.x/y/z/w holds one channel.
2003 pixel.x = *Pointer<Float4>(buffer + 16 * x, 16);
2004 pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2005 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
2006 pixel.z = *Pointer<Float4>(buffer + 16 * x, 16);
2007 pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2008 transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
// NOTE(review): sRGBtoLinear() takes its argument by const reference and returns the
// converted value; the three calls below discard that return value, so as written they
// do not change `pixel`. Confirm whether `pixel.x = sRGBtoLinear(pixel.x);` was intended.
2014 if(postBlendSRGB && state.writeSRGB)
2016 sRGBtoLinear(pixel.x);
2017 sRGBtoLinear(pixel.y);
2018 sRGBtoLinear(pixel.z);
2021 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
2022 Vector4f sourceFactor;
2023 Vector4f destFactor;
// Colour factors (x, y, z).
2025 blendFactor(r, sourceFactor, oC, pixel, state.sourceBlendFactor);
2026 blendFactor(r, destFactor, oC, pixel, state.destBlendFactor);
// The multiplication is skipped for the ONE and ZERO factors.
2028 if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
2030 oC.x *= sourceFactor.x;
2031 oC.y *= sourceFactor.y;
2032 oC.z *= sourceFactor.z;
2035 if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
2037 pixel.x *= destFactor.x;
2038 pixel.y *= destFactor.y;
2039 pixel.z *= destFactor.z;
// Combine the scaled source and destination colours.
2042 switch(state.blendOperation)
2054 case BLENDOP_INVSUB:
// result = destination - source
2055 oC.x = pixel.x - oC.x;
2056 oC.y = pixel.y - oC.y;
2057 oC.z = pixel.z - oC.z;
// result = min(source, destination) (case label not visible; presumably BLENDOP_MIN)
2060 oC.x = Min(oC.x, pixel.x);
2061 oC.y = Min(oC.y, pixel.y);
2062 oC.z = Min(oC.z, pixel.z);
// result = max(source, destination) (case label not visible; presumably BLENDOP_MAX)
2065 oC.x = Max(oC.x, pixel.x);
2066 oC.y = Max(oC.y, pixel.y);
2067 oC.z = Max(oC.z, pixel.z);
2069 case BLENDOP_SOURCE:
// result = 0 (case label not visible)
2078 oC.x = Float4(0.0f);
2079 oC.y = Float4(0.0f);
2080 oC.z = Float4(0.0f);
// Alpha factors (w only), reusing sourceFactor / destFactor.
2086 blendFactorAlpha(r, sourceFactor, oC, pixel, state.sourceBlendFactorAlpha);
2087 blendFactorAlpha(r, destFactor, oC, pixel, state.destBlendFactorAlpha);
2089 if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
2091 oC.w *= sourceFactor.w;
2094 if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
2096 pixel.w *= destFactor.w;
// Combine the scaled source and destination alpha.
2099 switch(state.blendOperationAlpha)
2107 case BLENDOP_INVSUB:
// min / max of source and destination alpha (case labels not visible)
2112 oC.w = Min(oC.w, pixel.w);
2115 oC.w = Max(oC.w, pixel.w);
2117 case BLENDOP_SOURCE:
// result = 0 (case label not visible)
2124 oC.w = Float4(0.0f);
// Writes the floating-point colour `oC` to render target `index` of cBuffer at column x.
// Visible steps: rearrange the per-channel vectors into the target format's layout, then
// do a masked read-modify-write in which the new value is ANDed with a mask, the old
// buffer contents with the inverse mask, and the two are ORed together. Masks come from
// the channel write mask (rgbaWriteMask) and from the tables indexed by `xMask`.
// NOTE(review): this listing has lost its shortest lines (braces, `break;`, some `case`
// labels, `Float4 value;`, and the statements that build xMask -- presumably from cMask,
// zMask and sMask). Comments below describe only what is visible.
2131 void PixelRoutine::writeColor(Registers &r, int index, Pointer<Byte> &cBuffer, Int &x, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask)
// Rearrange the channels for the target layout.
2133 switch(state.targetFormat[index])
2137 case FORMAT_G32R32F:
// Interleave x and y via UnpackLow / UnpackHigh.
// NOTE(review): the copies into / out of oC.z around these two lines are not visible.
2139 oC.x = UnpackLow(oC.x, oC.y);
2140 oC.z = UnpackHigh(oC.z, oC.y);
2143 case FORMAT_A32B32G32R32F:
// Turn the four per-channel vectors into four per-pixel vectors.
2144 transpose4x4(oC.x, oC.y, oC.z, oC.w);
// Channel write mask for this render target.
2150 int rgbaWriteMask = state.colorWriteActive(index);
2152 Int xMask; // Combination of all masks
// NOTE(review): the bodies of the next two conditionals are not visible in this listing.
2154 if(state.depthTestActive)
2163 if(state.stencilActive)
2168 Pointer<Byte> buffer;
2171 switch(state.targetFormat[index])
// Case label not visible (4 bytes per pixel, single channel). Written only when bit 0 of
// the write mask is set.
2174 if(rgbaWriteMask & 0x00000001)
2176 buffer = cBuffer + 4 * x;
// Gather the existing values: lanes x, y from the first address, lanes z, w from
// colorPitchB[index] further on.
2179 value.x = *Pointer<Float>(buffer + 0);
2180 value.y = *Pointer<Float>(buffer + 4);
2182 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
2185 value.z = *Pointer<Float>(buffer + 0);
2186 value.w = *Pointer<Float>(buffer + 4);
// Select between new and old values per lane using the xMask tables.
2188 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X) + xMask * 16, 16));
2189 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16));
2190 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
// `buffer` still points at the second address: store lanes z, w there first, then step
// back by the pitch and store lanes x, y.
2193 *Pointer<Float>(buffer + 0) = oC.x.z;
2194 *Pointer<Float>(buffer + 4) = oC.x.w;
2196 buffer -= *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
2199 *Pointer<Float>(buffer + 0) = oC.x.x;
2200 *Pointer<Float>(buffer + 4) = oC.x.y;
2203 case FORMAT_G32R32F:
// 8 bytes per pixel: oC.x is stored at the first address, oC.y one pitch further on.
2204 buffer = cBuffer + 8 * x;
2206 value = *Pointer<Float4>(buffer);
// Channel masking is only needed when not both of the two low mask bits are set.
2208 if((rgbaWriteMask & 0x00000003) != 0x00000003)
2210 Float4 masked = value;
2211 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2212 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD01X[rgbaWriteMask & 0x3][0])));
2213 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2216 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16));
2217 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16));
2218 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2219 *Pointer<Float4>(buffer) = oC.x;
2221 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
2223 value = *Pointer<Float4>(buffer);
2225 if((rgbaWriteMask & 0x00000003) != 0x00000003)
2230 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2231 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD01X[rgbaWriteMask & 0x3][0])));
2232 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2235 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16));
2236 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16));
2237 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2238 *Pointer<Float4>(buffer) = oC.y;
2240 case FORMAT_A32B32G32R32F:
// 16 bytes per pixel. After the transpose above, oC.x / .y are stored at buffer and
// buffer + 16, and oC.z / .w at the same offsets one colorPitchB[index] further on.
// Each store uses its own xMask table (maskX0X .. maskX3X).
2241 buffer = cBuffer + 16 * x;
2244 value = *Pointer<Float4>(buffer, 16);
2246 if(rgbaWriteMask != 0x0000000F)
2248 Float4 masked = value;
2249 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2250 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2251 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2254 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskX0X) + xMask * 16, 16));
2255 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16));
2256 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2257 *Pointer<Float4>(buffer, 16) = oC.x;
2261 value = *Pointer<Float4>(buffer + 16, 16);
2263 if(rgbaWriteMask != 0x0000000F)
2265 Float4 masked = value;
2266 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2267 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2268 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2271 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskX1X) + xMask * 16, 16));
2272 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16));
2273 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2274 *Pointer<Float4>(buffer + 16, 16) = oC.y;
2277 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
2280 value = *Pointer<Float4>(buffer, 16);
2282 if(rgbaWriteMask != 0x0000000F)
2284 Float4 masked = value;
2285 oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2286 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2287 oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked));
2290 oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskX2X) + xMask * 16, 16));
2291 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16));
2292 oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value));
2293 *Pointer<Float4>(buffer, 16) = oC.z;
2297 value = *Pointer<Float4>(buffer + 16, 16);
2299 if(rgbaWriteMask != 0x0000000F)
2301 Float4 masked = value;
2302 oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2303 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2304 oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked));
2307 oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskX3X) + xMask * 16, 16));
2308 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16));
2309 oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value));
2310 *Pointer<Float4>(buffer + 16, 16) = oC.w;
// Converts the floating-point vector `cf` to UShort4 after scaling it by 0xFFFF (65535).
// `saturate` is forwarded unchanged to the UShort4 conversion.
2318 UShort4 PixelRoutine::convertFixed16(Float4 &cf, bool saturate)
2320 return UShort4(cf * Float4(0xFFFF), saturate);
// Converts the x, y and z channels of `c` from sRGB to linear in place: each channel is
// first shifted right by 4 as an unsigned value (16 bits down to the 12 bits used as a
// table index), then passed to sRGBtoLinear12_16(). c.w is not touched.
2323 void PixelRoutine::sRGBtoLinear16_12_16(Registers &r, Vector4s &c)
2325 c.x = As<UShort4>(c.x) >> 4;
2326 c.y = As<UShort4>(c.y) >> 4;
2327 c.z = As<UShort4>(c.z) >> 4;
2329 sRGBtoLinear12_16(r, c);
// Replaces each of the four lanes of c.x, c.y and c.z by the Short entry of the
// Constants::sRGBtoLinear12_16 table that the lane's current value indexes
// (entries are 2 bytes apart). c.w is not touched.
// NOTE(review): lane values are presumably 12-bit, as produced by sRGBtoLinear16_12_16();
// nothing visible here range-checks the index.
2332 void PixelRoutine::sRGBtoLinear12_16(Registers &r, Vector4s &c)
2334 Pointer<Byte> LUT = r.constants + OFFSET(Constants,sRGBtoLinear12_16);
// Lanes 0..3 of x
2336 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2337 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2338 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2339 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
// Lanes 0..3 of y
2341 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2342 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2343 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2344 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
// Lanes 0..3 of z
2346 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2347 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2348 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2349 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
// Converts the x, y and z channels of `c` from linear to sRGB in place: each channel is
// first shifted right by 4 as an unsigned value (16 bits down to the 12 bits used as a
// table index), then passed to linearToSRGB12_16(). c.w is not touched.
2352 void PixelRoutine::linearToSRGB16_12_16(Registers &r, Vector4s &c)
2354 c.x = As<UShort4>(c.x) >> 4;
2355 c.y = As<UShort4>(c.y) >> 4;
2356 c.z = As<UShort4>(c.z) >> 4;
2358 linearToSRGB12_16(r, c);
// Replaces each of the four lanes of c.x, c.y and c.z by the Short entry of the
// Constants::linearToSRGB12_16 table that the lane's current value indexes
// (entries are 2 bytes apart). c.w is not touched.
// NOTE(review): lane values are presumably 12-bit, as produced by linearToSRGB16_12_16();
// nothing visible here range-checks the index.
2361 void PixelRoutine::linearToSRGB12_16(Registers &r, Vector4s &c)
2363 Pointer<Byte> LUT = r.constants + OFFSET(Constants,linearToSRGB12_16);
// Lanes 0..3 of x
2365 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2366 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2367 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2368 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
// Lanes 0..3 of y
2370 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2371 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2372 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2373 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
// Lanes 0..3 of z
2375 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2376 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2377 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2378 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
// Returns an approximate sRGB -> linear conversion of `x`, computed as
// 0.73 * x^2 + 0.27 * x^3 and clamped to [0, 1]. The argument itself is not modified;
// callers must use the returned value.
2381 Float4 PixelRoutine::sRGBtoLinear(const Float4 &x) // Approximates x^2.2
2383 Float4 linear = x * x;
2384 linear = linear * Float4(0.73f) + linear * x * Float4(0.27f);
// Clamp the result to the [0, 1] range.
2386 return Min(Max(linear, Float4(0.0f)), Float4(1.0f));
// Returns true when any of the following holds: state.colorWriteMask is non-zero, the
// alpha test is active, or the shader contains a kill instruction
// (state.shaderContainsKill).
2389 bool PixelRoutine::colorUsed()
2391 return state.colorWriteMask || state.alphaTestActive() || state.shaderContainsKill;