1 // SwiftShader Software Renderer
3 // Copyright(c) 2005-2013 TransGaming Inc.
5 // All rights reserved. No part of this software may be copied, distributed, transmitted,
6 // transcribed, stored in a retrieval system, translated into any human or computer
7 // language by any means, or disclosed to third parties without the explicit written
8 // agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express
9 // or implied, including but not limited to any patent rights, are granted to you.
12 #include "PixelRoutine.hpp"
14 #include "Renderer.hpp"
15 #include "QuadRasterizer.hpp"
16 #include "Surface.hpp"
17 #include "Primitive.hpp"
19 #include "SamplerCore.hpp"
20 #include "Constants.hpp"
// Global configuration switches that are only declared here (defined elsewhere) and read by the routines below.
// Selects the complemented depth path (1.0 - oDepth, mirrored comparisons) in depthTest(), writeDepth() and pixelFog().
25 extern bool complementaryDepthBuffer;
// Together with state.writeSRGB, enables the sRGB/linear conversions in readPixel() and writeColor().
26 extern bool postBlendSRGB;
// Enables the per-format rounding adjustment performed at the start of writeColor().
27 extern bool exactColorRounding;
// Makes the constructor zero the input registers regardless of the shader version.
28 extern bool forceClearRegisters;
// Constructs the pixel routine on top of QuadRasterizer. The input register array `v` is constructed
// with a flag that is true only when a shader is present and it indexes its inputs dynamically.
30 PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader) : QuadRasterizer(state, shader), v(shader && shader->dynamicallyIndexedInput)
// Zero the input registers when there is no shader, the shader version is below 0x0200, or clearing is forced.
32 if(!shader || shader->getVersion() < 0x0200 || forceClearRegisters)
// NOTE(review): 10 is the same bound used by the interpolant loop in quad() — presumably the input register count; confirm.
34 for(int i = 0; i < 10; i++)
36 v[i].x = Float4(0.0f);
37 v[i].y = Float4(0.0f);
38 v[i].z = Float4(0.0f);
39 v[i].w = Float4(0.0f);
// Destructor. Iterates over all TEXTURE_IMAGE_UNITS slots.
// NOTE(review): the loop body is not visible in this listing — presumably it releases the
// SamplerCore objects that quad() allocates with `new`; confirm against the full file.
44 PixelRoutine::~PixelRoutine()
46 for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
// Emits the per-quad pixel pipeline: stencil test, depth interpolation, depth test (early or late),
// attribute interpolation, shading, alpha test, raster operations and the stencil write.
//   cBuffer - render target pointers, forwarded to rasterOperation()
//   zBuffer - depth buffer pointer used by depthTest() and writeDepth()
//   sBuffer - stencil buffer pointer used by stencilTest() and writeStencil()
//   cMask   - coverage masks, indexed by sample number q
//   x, y    - quad position; converted to Float and offset by the primitive's xQuad/yQuad values
// NOTE(review): this listing omits short lines (braces, else, local declarations), so several
// statements below sit under conditions that are not visible here.
52 void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
// Start timestamp for the whole pipeline; the elapsed ticks are added to cycles[PERF_PIPE] at the end.
55 Long pipeTime = Ticks();
// One SamplerCore per texture image unit, built from the matching sampler state.
58 for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
60 sampler[i] = new SamplerCore(constants, state.sampler[i]);
// The depth test can only run before shading when neither depth override nor alpha testing is active.
63 const bool earlyDepthTest = !state.depthOverride && !state.alphaTestActive();
65 Int zMask[4]; // Depth mask
66 Int sMask[4]; // Stencil mask
68 for(unsigned int q = 0; q < state.multiSample; q++)
// Stencil test for every sample; produces sMask[q] from cMask[q].
74 for(unsigned int q = 0; q < state.multiSample; q++)
76 stencilTest(sBuffer, q, x, sMask[q], cMask[q]);
// X coordinates of the quad's pixels.
82 Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16);
// Interpolate depth per sample.
86 for(unsigned int q = 0; q < state.multiSample; q++)
90 if(state.multiSample > 1)
// Shift by the q-th entry of the Constants::X table when multisampling.
// NOTE(review): `x` here is presumably a local Float4 declared on a line missing from this listing — confirm.
92 x -= *Pointer<Float4>(constants + OFFSET(Constants,X) + q * sizeof(float4));
95 z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive,z), false, false);
99 Bool depthPass = false;
// Depth test per sample; depthPass becomes true when any sample passes.
103 for(unsigned int q = 0; q < state.multiSample; q++)
105 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
// Continue when a sample passed the depth test, or unconditionally when the early depth test is disabled.
109 If(depthPass || Bool(!earlyDepthTest))
112 Long interpTime = Ticks();
// Y coordinates of the quad's pixels.
115 Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16);
117 // Centroid locations
118 Float4 XXXX = Float4(0.0f);
119 Float4 YYYY = Float4(0.0f);
// Weight accumulator starts at a tiny non-zero value (1.0e-9f) rather than zero.
123 Float4 WWWW(1.0e-9f);
// Accumulate per-sample X/Y positions and weights, looked up by each sample's coverage mask.
125 for(unsigned int q = 0; q < state.multiSample; q++)
127 XXXX += *Pointer<Float4>(constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]);
128 YYYY += *Pointer<Float4>(constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]);
129 WWWW += *Pointer<Float4>(constants + OFFSET(Constants,weight) + 16 * cMask[q]);
// Interpolate w and take its reciprocal for use by the attribute interpolation below.
142 w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive,w), false, false);
143 rhw = reciprocal(w, false, false, true);
// Same quantity evaluated at the centroid position.
147 rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive,w), false, false));
// Interpolate every enabled component of the 10 input registers.
151 for(int interpolant = 0; interpolant < 10; interpolant++)
153 for(int component = 0; component < 4; component++)
155 if(state.interpolant[interpolant].component & (1 << component))
// Regular interpolation uses xxxx/rhw; centroid interpolation uses XXXX/YYYY/rhwCentroid.
157 if(!state.interpolant[interpolant].centroid)
159 v[interpolant][component] = interpolate(xxxx, Dv[interpolant][component], rhw, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
163 v[interpolant][component] = interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
// Optional projection: divide the leading components by y, z or w (case labels not visible in this listing).
170 switch(state.interpolant[interpolant].project)
175 rcp = reciprocal(v[interpolant].y);
176 v[interpolant].x = v[interpolant].x * rcp;
179 rcp = reciprocal(v[interpolant].z);
180 v[interpolant].x = v[interpolant].x * rcp;
181 v[interpolant].y = v[interpolant].y * rcp;
184 rcp = reciprocal(v[interpolant].w);
185 v[interpolant].x = v[interpolant].x * rcp;
186 v[interpolant].y = v[interpolant].y * rcp;
187 v[interpolant].z = v[interpolant].z * rcp;
// Interpolate the fog factor when the fog component is enabled.
192 if(state.fog.component)
194 f = interpolate(xxxx, Df, rhw, primitive + OFFSET(Primitive,f), state.fog.flat & 0x01, state.perspective);
197 setBuiltins(x, y, z, w);
200 cycles[PERF_INTERP] += Ticks() - interpTime;
203 Bool alphaPass = true;
// Shader execution is timed separately (the call itself is on a line not visible in this listing).
208 Long shaderTime = Ticks();
214 cycles[PERF_SHADER] += Ticks() - shaderTime;
217 alphaPass = alphaTest(cMask);
// Pixels removed by a shader kill or by the alpha test must not update depth or stencil.
219 if((shader && shader->containsKill()) || state.alphaTestActive())
221 for(unsigned int q = 0; q < state.multiSample; q++)
223 zMask[q] &= cMask[q];
224 sMask[q] &= cMask[q];
// Second depth test loop.
// NOTE(review): presumably the late depth test, guarded by !earlyDepthTest on a missing line — confirm.
233 for(unsigned int q = 0; q < state.multiSample; q++)
235 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
240 Long ropTime = Ticks();
243 If(depthPass || Bool(earlyDepthTest))
// Write depth for each sample enabled in the multisample mask.
245 for(unsigned int q = 0; q < state.multiSample; q++)
247 if(state.multiSampleMask & (1 << q))
249 writeDepth(zBuffer, q, x, z[q], zMask[q]);
251 if(state.occlusionEnabled)
// The occlusion increment is looked up from a table indexed by the combined depth/stencil mask.
253 occlusion += *Pointer<UInt>(constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q]));
// Atomically adds 4 to the profiler's ROP operation counter.
261 AddAtomic(Pointer<Long>(&profiler.ropOperations), 4);
264 rasterOperation(f, cBuffer, x, sMask, zMask, cMask);
269 cycles[PERF_ROP] += Ticks() - ropTime;
// Write stencil for each sample enabled in the multisample mask.
274 for(unsigned int q = 0; q < state.multiSample; q++)
276 if(state.multiSampleMask & (1 << q))
278 writeStencil(sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
283 cycles[PERF_PIPE] += Ticks() - pipeTime;
// Evaluates a plane equation at the position (x, y): starts from the C term and adds x * A + y * B.
// NOTE(review): the handling of `flat`, `perspective` and `rhw`, and the return statement, are on
// lines not visible in this listing — confirm against the full file.
287 Float4 PixelRoutine::interpolateCentroid(Float4 &x, Float4 &y, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective)
289 Float4 interpolant = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,C), 16);
293 interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,A), 16) +
294 y * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,B), 16);
// Runs the stencil test for sample q of the quad at x and stores the resulting mask in sMask.
//   sBuffer - stencil buffer pointer; the quad is addressed at sBuffer + 2 * x
//   q       - sample index; offsets the address by q * DrawData::stencilSliceB
//   sMask   - output: sign mask of the comparison result, ANDed with cMask
//   cMask   - coverage mask of the sample
305 void PixelRoutine::stencilTest(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask)
// The code following this check is the active-stencil path (the inactive branch is not visible in this listing).
307 if(!state.stencilActive)
312 // (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
314 Pointer<Byte> buffer = sBuffer + 2 * x;
318 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
// Load 32 bits of stencil data and keep a separate copy for the counter-clockwise test.
321 Byte8 value = As<Byte8>(Long1(*Pointer<UInt>(buffer)));
322 Byte8 valueCCW = value;
// Apply the front-face test mask unless the state says no mask is needed.
324 if(!state.noStencilMask)
326 value &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].testMaskQ));
329 stencilTest(value, state.stencilCompareMode, false);
// Two-sided stencil: evaluate the CCW state on the copy, then select per primitive orientation.
331 if(state.twoSidedStencil)
333 if(!state.noStencilMaskCCW)
335 valueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].testMaskQ));
338 stencilTest(valueCCW, state.stencilCompareModeCCW, true);
340 value &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
341 valueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
345 sMask = SignMask(value) & cMask;
// Replaces `value` (masked stencil buffer bytes) with the per-byte result of comparing it against the
// reference value of the front (CCW == false) or back (CCW == true) stencil state.
// The ordered comparisons add 0x80 to every byte and then use the signed CmpGT against
// referenceMaskedSignedQ. NOTE(review): that field is presumably the reference with the same 0x80 bias — confirm.
// Several case labels, `break` statements and the `equal` declaration are on lines missing from this listing.
348 void PixelRoutine::stencilTest(Byte8 &value, StencilCompareMode stencilCompareMode, bool CCW)
352 switch(stencilCompareMode)
// All bits set.
355 value = Byte8(0xFFFFFFFFFFFFFFFF);
// All bits clear.
358 value = Byte8(0x0000000000000000);
360 case STENCIL_LESS: // a < b ~ b > a
361 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
362 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
// Equality needs no bias: compare directly with the masked reference.
365 value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
367 case STENCIL_NOTEQUAL: // a != b ~ !(a == b)
368 value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
369 value ^= Byte8(0xFFFFFFFFFFFFFFFF);
371 case STENCIL_LESSEQUAL: // a <= b ~ (b > a) || (a == b)
373 equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
374 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
375 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
378 case STENCIL_GREATER: // a > b
// Operands are swapped here: the reference is loaded into `equal` and compared against the biased buffer value.
379 equal = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ));
380 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
381 equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
384 case STENCIL_GREATEREQUAL: // a >= b ~ !(a < b) ~ !(b > a)
385 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
386 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
387 value ^= Byte8(0xFFFFFFFFFFFFFFFF);
// Depth-tests sample q of the quad at x and writes the resulting per-pixel mask to zMask.
//   zBuffer - depth buffer pointer
//   z       - interpolated depth of the quad (the shader's oDepth is used when it overrides depth)
//   sMask   - stencil mask, consulted when stencil is active (that code is not visible in this listing)
//   zMask   - output: sign mask of the comparison, ANDed with cMask
//   cMask   - coverage mask of the sample
// The returned Bool and several case labels/else branches are on lines missing from this listing.
394 Bool PixelRoutine::depthTest(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask)
396 if(!state.depthTestActive)
// Shader-written depth replaces the interpolated value; complemented when complementaryDepthBuffer is set.
403 if(shader && shader->depthOverride())
405 if(complementaryDepthBuffer)
407 Z = Float4(1.0f) - oDepth;
415 Pointer<Byte> buffer;
// Linear layout: 4 bytes per x step plus a row pitch; quad layout: 8 bytes per x step.
418 if(!state.quadLayoutDepthBuffer)
420 buffer = zBuffer + 4 * x;
421 pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
425 buffer = zBuffer + 8 * x;
// Select the sample's slice.
430 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
435 if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
437 if(!state.quadLayoutDepthBuffer)
439 // FIXME: Properly optimizes?
// Linear layout: the quad's two rows are loaded separately; the second load starts 8 bytes before
// buffer + pitch so that the wanted values land in the .zw lanes.
440 zValue.xy = *Pointer<Float4>(buffer);
441 zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
// Quad layout: all four values are contiguous and read with a 16-byte aligned load.
445 zValue = *Pointer<Float4>(buffer, 16);
// Each ordered comparison has two forms, chosen by complementaryDepthBuffer.
451 switch(state.depthCompareMode)
460 zTest = CmpEQ(zValue, Z);
463 zTest = CmpNEQ(zValue, Z);
466 if(complementaryDepthBuffer)
468 zTest = CmpLT(zValue, Z);
472 zTest = CmpNLE(zValue, Z);
475 case DEPTH_GREATEREQUAL:
476 if(complementaryDepthBuffer)
478 zTest = CmpNLT(zValue, Z);
482 zTest = CmpLE(zValue, Z);
485 case DEPTH_LESSEQUAL:
486 if(complementaryDepthBuffer)
488 zTest = CmpLE(zValue, Z);
492 zTest = CmpNLT(zValue, Z);
496 if(complementaryDepthBuffer)
498 zTest = CmpNLE(zValue, Z);
502 zTest = CmpLT(zValue, Z);
// Second switch converts the comparison into zMask (only the general case is visible in this listing).
509 switch(state.depthCompareMode)
518 zMask = SignMask(zTest) & cMask;
522 if(state.stencilActive)
// Compares the four 16-bit alpha values against DrawData::factor.alphaReference4 according to
// state.alphaCompareMode and stores the per-pixel pass mask in aMask.
// Each result is packed with a zero Short4 before SignMask extracts the mask bits.
// Some case labels and short statements are on lines missing from this listing.
530 void PixelRoutine::alphaTest(Int &aMask, Short4 &alpha)
535 switch(state.alphaCompareMode)
544 cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
545 aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
547 case ALPHA_NOTEQUAL: // a != b ~ !(a == b)
548 cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4((short)0xFFFF, (short)0xFFFF, (short)0xFFFF, (short)0xFFFF); // FIXME
549 aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
551 case ALPHA_LESS: // a < b ~ b > a
552 cmp = CmpGT(*Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)), alpha);
553 aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
555 case ALPHA_GREATEREQUAL: // a >= b ~ (a > b) || (a == b) ~ !(b > a) // TODO: Approximate
// Both the equality and greater-than results are computed (their combination is on a missing line).
556 equal = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
557 cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
559 aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
561 case ALPHA_LESSEQUAL: // a <= b ~ !(a > b)
562 cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4((short)0xFFFF, (short)0xFFFF, (short)0xFFFF, (short)0xFFFF); // FIXME
563 aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
565 case ALPHA_GREATER: // a > b
566 cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
567 aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
// Derives four coverage masks from alpha: one per threshold DrawData::a2c0..a2c3. A pixel is covered
// in a mask when its alpha is not less than that threshold (CmpNLT).
// NOTE(review): the lines that combine aMask0..aMask3 into cMask[] are not visible in this listing.
574 void PixelRoutine::alphaToCoverage(Int cMask[4], Float4 &alpha)
576 Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c0)));
577 Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c1)));
578 Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c2)));
579 Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c3)));
// Collapse each comparison to one bit per pixel.
581 Int aMask0 = SignMask(coverage0);
582 Int aMask1 = SignMask(coverage1);
583 Int aMask2 = SignMask(coverage2);
584 Int aMask3 = SignMask(coverage3);
// Blends the fog colour into the RGB channels of c0 using the factor `fog` (alpha is not touched in
// the visible lines). The fog colour is subtracted from c0 and later added back.
// NOTE(review): the scaling by `fog` between those two steps is on lines missing from this listing —
// presumably c0 = fogColor + fog * (c0 - fogColor); confirm.
592 void PixelRoutine::fogBlend(Vector4f &c0, Float4 &fog)
599 if(state.pixelFogMode != FOG_NONE)
// Clamp the fog factor to [0, 1].
603 fog = Min(fog, Float4(1.0f));
604 fog = Max(fog, Float4(0.0f));
607 c0.x -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
608 c0.y -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
609 c0.z -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
615 c0.x += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
616 c0.y += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
617 c0.z += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
// Computes the per-pixel fog factor into `visibility` when pixel fog is enabled.
// `zw` is an alias of the output and is updated in place.
// Case labels and some intermediate statements are on lines missing from this listing.
620 void PixelRoutine::pixelFog(Float4 &visibility)
622 Float4 &zw = visibility;
624 if(state.pixelFogMode != FOG_NONE)
// With a complementary depth buffer the stored depth is inverted back before use.
632 if(complementaryDepthBuffer)
634 zw = Float4(1.0f) - z[0];
643 switch(state.pixelFogMode)
// Scale-and-offset mode.
648 zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.scale));
649 zw += *Pointer<Float4>(data + OFFSET(DrawData,fog.offset));
// Exponential mode: multiply by densityE, then take the base-2 exponential.
652 zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.densityE));
653 zw = exponential2(zw, true);
// Second exponential mode using density2E (any step between the two visible lines is not shown here).
657 zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.density2E));
658 zw = exponential2(zw, true);
// Writes the depth of sample q for the quad at x, limited to the pixels selected by zMask.
//   zBuffer - depth buffer pointer
//   z       - interpolated depth (the shader's oDepth is used when it overrides depth)
//   zMask   - per-pixel mask; indexes the maskD4X / invMaskD4X tables used for the merge
665 void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask)
667 if(!state.depthWriteEnable)
674 if(shader && shader->depthOverride())
676 if(complementaryDepthBuffer)
678 Z = Float4(1.0f) - oDepth;
686 Pointer<Byte> buffer;
// Same addressing as depthTest(): 4 bytes per x step plus pitch (linear) or 8 bytes per x step (quad layout).
689 if(!state.quadLayoutDepthBuffer)
691 buffer = zBuffer + 4 * x;
692 pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
696 buffer = zBuffer + 8 * x;
701 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
// Read the existing depth values so masked-out pixels can be preserved.
706 if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
708 if(!state.quadLayoutDepthBuffer)
710 // FIXME: Properly optimizes?
711 zValue.xy = *Pointer<Float4>(buffer);
712 zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
716 zValue = *Pointer<Float4>(buffer, 16);
// Merge: new depth where the mask is set, existing depth elsewhere.
720 Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + zMask * 16, 16));
721 zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16));
722 Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
// Store: two Float2 rows for the linear layout, one aligned Float4 for the quad layout.
724 if(!state.quadLayoutDepthBuffer)
726 // FIXME: Properly optimizes?
727 *Pointer<Float2>(buffer) = Float2(Z.xy);
728 *Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
732 *Pointer<Float4>(buffer, 16) = Z;
// Applies the stencil operations for sample q of the quad at x and writes the result back.
//   sMask - stencil test result, zMask - depth test result, cMask - coverage of the sample
// The three guards at the top check for cases with nothing to write (their bodies, presumably
// early returns, are on lines missing from this listing).
736 void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask)
738 if(!state.stencilActive)
// All front-face operations are KEEP...
743 if(state.stencilPassOperation == OPERATION_KEEP && state.stencilZFailOperation == OPERATION_KEEP && state.stencilFailOperation == OPERATION_KEEP)
// ...and either stencil is one-sided or all back-face operations are KEEP as well.
745 if(!state.twoSidedStencil || (state.stencilPassOperationCCW == OPERATION_KEEP && state.stencilZFailOperationCCW == OPERATION_KEEP && state.stencilFailOperationCCW == OPERATION_KEEP))
751 if(state.stencilWriteMasked && (!state.twoSidedStencil || state.stencilWriteMaskedCCW))
// Same addressing as stencilTest().
756 Pointer<Byte> buffer = sBuffer + 2 * x;
760 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
763 Byte8 bufferValue = As<Byte8>(Long1(*Pointer<UInt>(buffer)));
// Front-face result.
766 stencilOperation(newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask);
// Apply the write mask: new bits where writable, existing bits elsewhere.
768 if(!state.noStencilWriteMask)
770 Byte8 maskedValue = bufferValue;
771 newValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].writeMaskQ));
772 maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].invWriteMaskQ));
773 newValue |= maskedValue;
// Back-face result, computed the same way with the stencil[1] state.
776 if(state.twoSidedStencil)
780 stencilOperation(newValueCCW, bufferValue, state.stencilPassOperationCCW, state.stencilZFailOperationCCW, state.stencilFailOperationCCW, true, zMask, sMask);
782 if(!state.noStencilWriteMaskCCW)
784 Byte8 maskedValue = bufferValue;
785 newValueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].writeMaskQ));
786 maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].invWriteMaskQ));
787 newValueCCW |= maskedValue;
// Select front or back result by the primitive's orientation masks.
790 newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
791 newValueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
792 newValue |= newValueCCW;
// Only covered pixels are updated; uncovered pixels keep the buffer value.
795 newValue &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * cMask);
796 bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask);
797 newValue |= bufferValue;
799 *Pointer<UInt>(buffer) = UInt(As<Long>(newValue));
// Computes the new stencil value by evaluating the pass, z-fail and fail operations and selecting
// between them per pixel with zMask (depth test result) and sMask (stencil test result).
// `pass` aliases the output newValue. The z-fail and fail values are only computed when their
// operation differs from the ones already evaluated.
// NOTE(review): the declarations of zFail/fail and the OR-merges are on lines missing from this listing.
802 void PixelRoutine::stencilOperation(Byte8 &newValue, Byte8 &bufferValue, StencilOperation stencilPassOperation, StencilOperation stencilZFailOperation, StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask)
804 Byte8 &pass = newValue;
808 stencilOperation(pass, bufferValue, stencilPassOperation, CCW);
810 if(stencilZFailOperation != stencilPassOperation)
812 stencilOperation(zFail, bufferValue, stencilZFailOperation, CCW);
815 if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
817 stencilOperation(fail, bufferValue, stencilFailOperation, CCW);
820 if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
822 if(state.depthTestActive && stencilZFailOperation != stencilPassOperation) // zMask valid and values not the same
// Pass value where the depth test passed, z-fail value elsewhere.
824 pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * zMask);
825 zFail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask);
// Keep the result where the stencil test passed, fail value elsewhere.
829 pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * sMask);
830 fail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask);
// Applies a single stencil operation to bufferValue and stores the result in output.
// CCW selects which stencil state (stencil[0] or stencil[1]) supplies the reference for REPLACE.
// The switch statement and several case labels are on lines missing from this listing.
835 void PixelRoutine::stencilOperation(Byte8 &output, Byte8 &bufferValue, StencilOperation operation, bool CCW)
// Keep the existing value.
840 output = bufferValue;
// Zero.
843 output = Byte8(0x0000000000000000);
845 case OPERATION_REPLACE:
846 output = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceQ));
// Saturating increment and decrement.
848 case OPERATION_INCRSAT:
849 output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
851 case OPERATION_DECRSAT:
852 output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
854 case OPERATION_INVERT:
855 output = bufferValue ^ Byte8(0xFFFFFFFFFFFFFFFF);
// Non-saturating (wrapping) increment and decrement.
858 output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
861 output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
// Computes the RGB blend factor selected by blendFactorActive into blendFactor.x/y/z.
//   blendFactor - output factor (only x, y, z are written here; see blendFactorAlpha() for w)
//   current     - source colour
//   pixel       - destination colour read from the render target
// Inverse factors are formed as 0xFFFF - value.
// Fix: the second parameter had been corrupted to "¤t" (the "&curren" HTML entity decoded inside
// "&current"); it is restored so the `current` references in the body resolve.
// Several short case labels and `break` statements are on lines missing from this listing.
868 void PixelRoutine::blendFactor(const Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorActive)
870 switch(blendFactorActive)
// Source colour.
879 blendFactor.x = current.x;
880 blendFactor.y = current.y;
881 blendFactor.z = current.z;
883 case BLEND_INVSOURCE:
884 blendFactor.x = Short4(0xFFFFu) - current.x;
885 blendFactor.y = Short4(0xFFFFu) - current.y;
886 blendFactor.z = Short4(0xFFFFu) - current.z;
// Destination colour.
889 blendFactor.x = pixel.x;
890 blendFactor.y = pixel.y;
891 blendFactor.z = pixel.z;
// Inverse destination colour.
894 blendFactor.x = Short4(0xFFFFu) - pixel.x;
895 blendFactor.y = Short4(0xFFFFu) - pixel.y;
896 blendFactor.z = Short4(0xFFFFu) - pixel.z;
898 case BLEND_SOURCEALPHA:
899 blendFactor.x = current.w;
900 blendFactor.y = current.w;
901 blendFactor.z = current.w;
903 case BLEND_INVSOURCEALPHA:
904 blendFactor.x = Short4(0xFFFFu) - current.w;
905 blendFactor.y = Short4(0xFFFFu) - current.w;
906 blendFactor.z = Short4(0xFFFFu) - current.w;
908 case BLEND_DESTALPHA:
909 blendFactor.x = pixel.w;
910 blendFactor.y = pixel.w;
911 blendFactor.z = pixel.w;
913 case BLEND_INVDESTALPHA:
914 blendFactor.x = Short4(0xFFFFu) - pixel.w;
915 blendFactor.y = Short4(0xFFFFu) - pixel.w;
916 blendFactor.z = Short4(0xFFFFu) - pixel.w;
918 case BLEND_SRCALPHASAT:
// min(source alpha, 1 - destination alpha), replicated to all three channels.
919 blendFactor.x = Short4(0xFFFFu) - pixel.w;
920 blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w));
921 blendFactor.y = blendFactor.x;
922 blendFactor.z = blendFactor.x;
// Constant blend colour.
925 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[0]));
926 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[1]));
927 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[2]));
929 case BLEND_INVCONSTANT:
930 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
931 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
932 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
934 case BLEND_CONSTANTALPHA:
935 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
936 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
937 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
939 case BLEND_INVCONSTANTALPHA:
940 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
941 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
942 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
// Computes the alpha blend factor selected by blendFactorAlphaActive into blendFactor.w.
//   blendFactor - output factor (only w is written here)
//   current     - source colour
//   pixel       - destination colour read from the render target
// Colour and alpha variants of a factor both resolve to the alpha channel here.
// Fix: the second parameter had been corrupted to "¤t" (the "&curren" HTML entity decoded inside
// "&current"); it is restored so the `current` references in the body resolve.
// Several short case labels and `break` statements are on lines missing from this listing.
949 void PixelRoutine::blendFactorAlpha(const Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorAlphaActive)
951 switch(blendFactorAlphaActive)
960 blendFactor.w = current.w;
962 case BLEND_INVSOURCE:
963 blendFactor.w = Short4(0xFFFFu) - current.w;
966 blendFactor.w = pixel.w;
969 blendFactor.w = Short4(0xFFFFu) - pixel.w;
971 case BLEND_SOURCEALPHA:
972 blendFactor.w = current.w;
974 case BLEND_INVSOURCEALPHA:
975 blendFactor.w = Short4(0xFFFFu) - current.w;
977 case BLEND_DESTALPHA:
978 blendFactor.w = pixel.w;
980 case BLEND_INVDESTALPHA:
981 blendFactor.w = Short4(0xFFFFu) - pixel.w;
983 case BLEND_SRCALPHASAT:
// The alpha channel uses a factor of one (0xFFFF) for source-alpha-saturate.
984 blendFactor.w = Short4(0xFFFFu);
987 case BLEND_CONSTANTALPHA:
988 blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
990 case BLEND_INVCONSTANT:
991 case BLEND_INVCONSTANTALPHA:
992 blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
// Reads the quad at x from render target `index` and expands it into 16-bit-per-channel
// planar form: pixel.x, pixel.y, pixel.z, pixel.w each hold one channel for the quad's pixels.
//   index   - render target index; selects state.targetFormat[index] and DrawData::colorPitchB[index]
//   cBuffer - pointer to the render target row
//   pixel   - output
// The second row of the quad is addressed by adding colorPitchB[index] to the first row's address.
// NOTE(review): short assignments (e.g. copies between pixel.* lanes, `buffer = cBuffer;`) and some
// case labels are on lines missing from this listing, so the shuffles below are not complete as shown.
999 void PixelRoutine::readPixel(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &pixel)
1003 Pointer<Byte> buffer;
1004 Pointer<Byte> buffer2;
1006 switch(state.targetFormat[index])
// 16-bit packed format, 2 bytes per pixel; the masks 0xF800 / 0x07E0 / 0x001F select 5, 6 and 5 bits
// and the shifts move each field to the top of its 16-bit lane (case label not visible in this listing).
1009 buffer = cBuffer + 2 * x;
1010 buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1011 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1013 pixel.x = c01 & Short4(0xF800u);
1014 pixel.y = (c01 & Short4(0x07E0u)) << 5;
1015 pixel.z = (c01 & Short4(0x001Fu)) << 11;
1016 pixel.w = Short4(0xFFFFu);
// 32-bit formats: load two pixels per row, de-interleave the bytes, then duplicate each byte
// (Unpack of a register with itself) to widen 8-bit channels to 16 bits.
1018 case FORMAT_A8R8G8B8:
1019 buffer = cBuffer + 4 * x;
1020 c01 = *Pointer<Short4>(buffer);
1021 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1022 c23 = *Pointer<Short4>(buffer);
1025 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1026 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1028 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1029 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1032 pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1033 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1034 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1035 pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
// Same as A8R8G8B8 but with different source registers in the final widening step, which
// exchanges the x and z outputs.
1037 case FORMAT_A8B8G8R8:
1038 buffer = cBuffer + 4 * x;
1039 c01 = *Pointer<Short4>(buffer);
1040 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1041 c23 = *Pointer<Short4>(buffer);
1044 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1045 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1047 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1048 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1051 pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1052 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1053 pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1054 pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
// 1 byte per pixel, loaded into alpha only; colour channels are zeroed
// (case label not visible in this listing — presumably an alpha-only format; confirm).
1057 buffer = cBuffer + 1 * x;
1058 pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 0);
1059 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1060 pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 1);
1061 pixel.w = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1062 pixel.x = Short4(0x0000);
1063 pixel.y = Short4(0x0000);
1064 pixel.z = Short4(0x0000);
// X8 variants: same unpacking as the A8 formats, but alpha is forced to 0xFFFF.
1066 case FORMAT_X8R8G8B8:
1067 buffer = cBuffer + 4 * x;
1068 c01 = *Pointer<Short4>(buffer);
1069 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1070 c23 = *Pointer<Short4>(buffer);
1073 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1074 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1076 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1077 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1079 pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1080 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1081 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1082 pixel.w = Short4(0xFFFFu);
1084 case FORMAT_X8B8G8R8:
1085 buffer = cBuffer + 4 * x;
1086 c01 = *Pointer<Short4>(buffer);
1087 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1088 c23 = *Pointer<Short4>(buffer);
1091 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1092 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1094 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1095 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1098 pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1099 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1100 pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1101 pixel.w = Short4(0xFFFFu);
// The two quad-layout formats below have their unpacking code commented out in the original.
1103 case FORMAT_A8G8R8B8Q:
1105 // pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1106 // pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1107 // pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1108 // pixel.w = UnpackHigh(As<Byte8>(pixel.w), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1110 case FORMAT_X8G8R8B8Q:
1112 // pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1113 // pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1114 // pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1115 // pixel.w = Short4(0xFFFFu);
// 64-bit format (8 bytes per pixel): load four pixels and transpose from interleaved to planar.
1117 case FORMAT_A16B16G16R16:
1119 pixel.x = *Pointer<Short4>(buffer + 8 * x);
1120 pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
1121 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1122 pixel.z = *Pointer<Short4>(buffer + 8 * x);
1123 pixel.w = *Pointer<Short4>(buffer + 8 * x + 8);
1124 transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
// 4 bytes per pixel with two 16-bit channels; z and w are set to 0xFFFF
// (case label not visible in this listing).
1128 pixel.x = *Pointer<Short4>(buffer + 4 * x);
1129 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1130 pixel.y = *Pointer<Short4>(buffer + 4 * x);
1132 pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
1133 pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y));
1135 pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z));
1136 pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z));
1137 pixel.z = Short4(0xFFFFu);
1138 pixel.w = Short4(0xFFFFu);
// When blending happens in linear space on an sRGB target, convert the value that was read to linear.
1144 if(postBlendSRGB && state.writeSRGB)
1146 sRGBtoLinear16_12_16(pixel);
// Blends the source colour `current` with the destination pixel of render target `index` and
// leaves the result in `current`. RGB and alpha are processed separately, each with its own
// source/destination factors and blend operation. Values are unsigned 16-bit fixed point.
//   index   - render target index
//   cBuffer - render target pointer passed to readPixel()
//   current - in: source colour, out: blended colour
//   x       - quad position
// Fix: the third parameter had been corrupted to "¤t" (the "&curren" HTML entity decoded inside
// "&current"); it is restored so the `current` references in the body resolve.
// Several short case labels, `break` statements and the `pixel` declaration are on lines missing from this listing.
1150 void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
1152 if(!state.alphaBlendActive)
1158 readPixel(index, cBuffer, x, pixel);
1160 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
1161 Vector4s sourceFactor;
1162 Vector4s destFactor;
1164 blendFactor(sourceFactor, current, pixel, state.sourceBlendFactor);
1165 blendFactor(destFactor, current, pixel, state.destBlendFactor);
// The multiplication is skipped for the trivial ONE and ZERO factors.
1167 if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
1169 current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x));
1170 current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
1171 current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
1174 if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
1176 pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
1177 pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y));
1178 pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z));
1181 switch(state.blendOperation)
// Saturating add.
1184 current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1185 current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1186 current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
// Saturating subtract: source - destination.
1189 current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1190 current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1191 current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1193 case BLENDOP_INVSUB:
1194 current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x));
1195 current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y));
1196 current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z));
// Minimum.
1199 current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x));
1200 current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y));
1201 current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z));
// Maximum.
1204 current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x));
1205 current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y));
1206 current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z));
1208 case BLENDOP_SOURCE:
// Result is the destination value.
1212 current.x = pixel.x;
1213 current.y = pixel.y;
1214 current.z = pixel.z;
// Result is zero.
1217 current.x = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1218 current.y = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1219 current.z = Short4(0x0000, 0x0000, 0x0000, 0x0000);
// Alpha channel: same sequence with the alpha-specific factors and operation.
1225 blendFactorAlpha(sourceFactor, current, pixel, state.sourceBlendFactorAlpha);
1226 blendFactorAlpha(destFactor, current, pixel, state.destBlendFactorAlpha);
1228 if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
1230 current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
1233 if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
1235 pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
1238 switch(state.blendOperationAlpha)
1241 current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1244 current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1246 case BLENDOP_INVSUB:
1247 current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w));
1250 current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w));
1253 current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w));
1255 case BLENDOP_SOURCE:
1259 current.w = pixel.w;
1262 current.w = Short4(0x0000, 0x0000, 0x0000, 0x0000);
// Combines the source colour `current` with the destination pixel of render target `index` using
// the bitwise operation in state.logicalOperation, leaving the result in `current`.
// Only the x, y and z channels are modified in the visible lines.
//   index   - render target index
//   cBuffer - render target pointer passed to readPixel()
//   current - in: source colour, out: combined colour
//   x       - quad position
// LOGICALOP_COPY is handled by the check at the top, so reaching its case asserts.
// Fix: the third parameter had been corrupted to "¤t" (the "&curren" HTML entity decoded inside
// "&current"); it is restored so the `current` references in the body resolve.
// Several short case labels, `break` statements and the `pixel` declaration are on lines missing from this listing.
1269 void PixelRoutine::logicOperation(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
1271 if(state.logicalOperation == LOGICALOP_COPY)
1277 readPixel(index, cBuffer, x, pixel);
1279 switch(state.logicalOperation)
1281 case LOGICALOP_CLEAR:
// All bits set.
1287 current.x = 0xFFFFu;
1288 current.y = 0xFFFFu;
1289 current.z = 0xFFFFu;
1291 case LOGICALOP_COPY:
1292 ASSERT(false); // Optimized out
1294 case LOGICALOP_COPY_INVERTED:
1295 current.x = ~current.x;
1296 current.y = ~current.y;
1297 current.z = ~current.z;
1299 case LOGICALOP_NOOP:
// Keep the destination value.
1300 current.x = pixel.x;
1301 current.y = pixel.y;
1302 current.z = pixel.z;
1304 case LOGICALOP_INVERT:
1305 current.x = ~pixel.x;
1306 current.y = ~pixel.y;
1307 current.z = ~pixel.z;
// AND.
1310 current.x = pixel.x & current.x;
1311 current.y = pixel.y & current.y;
1312 current.z = pixel.z & current.z;
1314 case LOGICALOP_NAND:
1315 current.x = ~(pixel.x & current.x);
1316 current.y = ~(pixel.y & current.y);
1317 current.z = ~(pixel.z & current.z);
// OR.
1320 current.x = pixel.x | current.x;
1321 current.y = pixel.y | current.y;
1322 current.z = pixel.z | current.z;
// NOR.
1325 current.x = ~(pixel.x | current.x);
1326 current.y = ~(pixel.y | current.y);
1327 current.z = ~(pixel.z | current.z);
// XOR.
1330 current.x = pixel.x ^ current.x;
1331 current.y = pixel.y ^ current.y;
1332 current.z = pixel.z ^ current.z;
1334 case LOGICALOP_EQUIV:
1335 current.x = ~(pixel.x ^ current.x);
1336 current.y = ~(pixel.y ^ current.y);
1337 current.z = ~(pixel.z ^ current.z);
// "Reverse" variants invert the destination; "inverted" variants invert the source.
1339 case LOGICALOP_AND_REVERSE:
1340 current.x = ~pixel.x & current.x;
1341 current.y = ~pixel.y & current.y;
1342 current.z = ~pixel.z & current.z;
1344 case LOGICALOP_AND_INVERTED:
1345 current.x = pixel.x & ~current.x;
1346 current.y = pixel.y & ~current.y;
1347 current.z = pixel.z & ~current.z;
1349 case LOGICALOP_OR_REVERSE:
1350 current.x = ~pixel.x | current.x;
1351 current.y = ~pixel.y | current.y;
1352 current.z = ~pixel.z | current.z;
1354 case LOGICALOP_OR_INVERTED:
1355 current.x = pixel.x | ~current.x;
1356 current.y = pixel.y | ~current.y;
1357 current.z = pixel.z | ~current.z;
// Fixed-point (Vector4s) color write for render target `index`.
// Steps visible in this listing:
//   1. optional linear -> sRGB conversion (postBlendSRGB && state.writeSRGB),
//   2. optional rounding bias per target format (exactColorRounding),
//   3. repacking of the channels into the memory layout of state.targetFormat[index],
//   4. a masked read-modify-write at cBuffer, and again after advancing the pointer by
//      colorPitchB[index] bytes, using the color write mask and the combined xMask.
// NOTE(review): the fourth parameter is spelled "¤t" in the signature while the body uses
// `current`; this looks like "&current" damaged by an HTML-entity decode -- confirm.
// NOTE(review): the embedded original line numbers skip values, so short lines (braces,
// `else`, `break`, some case labels, `... |= masked` / `... |= value` merges, the xMask
// assignments) are not visible here. sMask, zMask and cMask are not referenced in any
// visible line; presumably they are combined into xMask -- confirm against the full file.
1364 void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s ¤t, Int &sMask, Int &zMask, Int &cMask)
// Convert to sRGB after blending when the target is written as sRGB.
1366 if(postBlendSRGB && state.writeSRGB)
1368 linearToSRGB16_12_16(current);
// Rounding bias applied before the precision-reducing packing further down.
1371 if(exactColorRounding)
1373 switch(state.targetFormat[index])
// (case label not visible; presumably the 5-6-5 format packed below) saturating add of half
// of the step that the 0xF800 / 0xFC00 truncation discards.
1376 current.x = AddSat(As<UShort4>(current.x), UShort4(0x0400));
1377 current.y = AddSat(As<UShort4>(current.y), UShort4(0x0200));
1378 current.z = AddSat(As<UShort4>(current.z), UShort4(0x0400));
// 8-bit-per-channel targets: c - (c >> 8) + 0x0080, ahead of the later ">> 8" truncation.
1380 case FORMAT_X8G8R8B8Q:
1381 case FORMAT_A8G8R8B8Q:
1382 case FORMAT_X8R8G8B8:
1383 case FORMAT_X8B8G8R8:
1384 case FORMAT_A8R8G8B8:
1385 case FORMAT_A8B8G8R8:
1386 current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
1387 current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
1388 current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
1389 current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
// Channel write masks. bgraWriteMask is rgbaWriteMask with bit 0 and bit 2 exchanged.
// NOTE(review): brgaWriteMask is not referenced by any line visible in this block.
1396 int rgbaWriteMask = state.colorWriteActive(index);
1397 int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
1398 int brgaWriteMask = (rgbaWriteMask & 0x00000008) | (rgbaWriteMask & 0x00000001) << 1 | (rgbaWriteMask & 0x00000002) << 1 | (rgbaWriteMask & 0x00000004) >> 2;
// ---- Pack the channels into the layout of the target format ----
1400 switch(state.targetFormat[index])
// (case label not visible) 16-bit 5-6-5 packing: x keeps its top 5 bits, y its top 6 bits
// moved down by 5, z its top 5 bits moved to the bottom; the three are OR'ed into current.x.
1404 current.x = current.x & Short4(0xF800u);
1405 current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
1406 current.z = As<UShort4>(current.z) >> 11;
1408 current.x = current.x | current.y | current.z;
// The "Q" formats only contain commented-out code in this listing.
1411 case FORMAT_X8G8R8B8Q:
1413 // current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1414 // current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1415 // current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1417 // current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1418 // current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1420 case FORMAT_A8G8R8B8Q:
1422 // current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1423 // current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1424 // current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1425 // current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1427 // current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1428 // current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
// 8-bit targets with z packed ahead of x. When the format has no alpha or alpha is not
// written (rgbaWriteMask == 0x7), w is left out and y is packed with itself.
1430 case FORMAT_X8R8G8B8:
1431 case FORMAT_A8R8G8B8:
1432 if(state.targetFormat[index] == FORMAT_X8R8G8B8 || rgbaWriteMask == 0x7)
// Reduce each channel to its top 8 bits.
1434 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1435 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1436 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1438 current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1439 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
// Interleave the packed bytes; the results end up in current.z and current.y,
// which are copied to c01 / c23 after the switch.
1441 current.x = current.z;
1442 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1443 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1444 current.y = current.z;
1445 current.z = As<Short4>(UnpackLow(current.z, current.x));
1446 current.y = As<Short4>(UnpackHigh(current.y, current.x));
// (the `else` keyword is not visible) same sequence, with w included as the fourth channel.
1450 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1451 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1452 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1453 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1455 current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1456 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1458 current.x = current.z;
1459 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1460 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1461 current.y = current.z;
1462 current.z = As<Short4>(UnpackLow(current.z, current.x));
1463 current.y = As<Short4>(UnpackHigh(current.y, current.x));
// Same as the previous pair of formats, except that x is packed ahead of z.
1466 case FORMAT_X8B8G8R8:
1467 case FORMAT_A8B8G8R8:
1468 if(state.targetFormat[index] == FORMAT_X8B8G8R8 || rgbaWriteMask == 0x7)
1470 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1471 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1472 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1474 current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
1475 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1477 current.x = current.z;
1478 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1479 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1480 current.y = current.z;
1481 current.z = As<Short4>(UnpackLow(current.z, current.x));
1482 current.y = As<Short4>(UnpackHigh(current.y, current.x));
// (the `else` keyword is not visible) variant that also packs w.
1486 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1487 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1488 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1489 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1491 current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
1492 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1494 current.x = current.z;
1495 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1496 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1497 current.y = current.z;
1498 current.z = As<Short4>(UnpackLow(current.z, current.x));
1499 current.y = As<Short4>(UnpackHigh(current.y, current.x));
// (case label not visible; presumably an alpha-only 8-bit format) only w is reduced to
// 8 bits and packed with itself.
1503 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1504 current.w = As<Short4>(Pack(As<UShort4>(current.w), As<UShort4>(current.w)));
// (case label not visible; presumably a two-channel 16-bit format) interleave x and y:
// the low halves end up in current.x, the high halves in current.y.
1507 current.z = current.x;
1508 current.x = As<Short4>(UnpackLow(current.x, current.y));
1509 current.z = As<Short4>(UnpackHigh(current.z, current.y));
1510 current.y = current.z;
// Four 16-bit channels: transpose so that each register holds one pixel's channels.
1512 case FORMAT_A16B16G16R16:
1513 transpose4x4(current.x, current.y, current.z, current.w);
// Values used by the 8-bit-per-channel store paths below (one per store location).
1519 Short4 c01 = current.z;
1520 Short4 c23 = current.y;
// xMask's assignments are on lines that are not visible; only the conditions remain.
1522 Int xMask; // Combination of all masks
1524 if(state.depthTestActive)
1533 if(state.stencilActive)
// ---- Masked read-modify-write of the packed values ----
1538 switch(state.targetFormat[index])
// (case label not visible) 2 bytes per pixel: one 32-bit value per store location, taken
// from the two halves of current.x. This c01/c23 pair shadows the Short4 pair above.
1542 Pointer<Byte> buffer = cBuffer + 2 * x;
1543 Int value = *Pointer<Int>(buffer);
1545 Int c01 = Extract(As<Int2>(current.x), 0);
// Channel masking is only needed when not all three color channels are written.
1547 if((bgraWriteMask & 0x00000007) != 0x00000007)
1550 c01 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1551 masked &= *Pointer<Int>(constants + OFFSET(Constants,invMask565Q[bgraWriteMask & 0x7][0]));
// Coverage masking: keep new bits where xMask selects the pixel, old bits elsewhere.
1555 c01 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
1556 value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][0]) + xMask * 8);
1558 *Pointer<Int>(buffer) = c01;
// Advance by the color buffer pitch and repeat for the second store location.
1560 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1561 value = *Pointer<Int>(buffer);
1563 Int c23 = Extract(As<Int2>(current.x), 1);
1565 if((bgraWriteMask & 0x00000007) != 0x00000007)
1568 c23 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1569 masked &= *Pointer<Int>(constants + OFFSET(Constants,invMask565Q[bgraWriteMask & 0x7][0]));
1573 c23 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
1574 value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][2]) + xMask * 8);
1576 *Pointer<Int>(buffer) = c23;
// The "Q" formats only contain commented-out code in this listing.
1579 case FORMAT_A8G8R8B8Q:
1580 case FORMAT_X8G8R8B8Q: // FIXME: Don't touch alpha?
1582 // value = *Pointer<Short4>(cBuffer + 8 * x + 0);
1584 // if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1585 // ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1586 // (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh?
1588 // Short4 masked = value;
1589 // c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1590 // masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1594 // c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1595 // value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1597 // *Pointer<Short4>(cBuffer + 8 * x + 0) = c01;
1599 // value = *Pointer<Short4>(cBuffer + 8 * x + 8);
1601 // if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1602 // ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1603 // (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh?
1605 // Short4 masked = value;
1606 // c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1607 // masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1611 // c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1612 // value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1614 // *Pointer<Short4>(cBuffer + 8 * x + 8) = c23;
// 4 bytes per pixel, masks indexed with the BGRA-ordered write mask.
1616 case FORMAT_A8R8G8B8:
1617 case FORMAT_X8R8G8B8: // FIXME: Don't touch alpha?
1619 Pointer<Byte> buffer = cBuffer + x * 4;
1620 Short4 value = *Pointer<Short4>(buffer);
// Channel masking is applied when the A8 format does not write all four channels, or
// when the X8 format's mask is neither 0x7 nor 0xF (both X8 clauses must hold; see FIXME).
1622 if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1623 ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1624 (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh?
1626 Short4 masked = value;
1627 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1628 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
// Coverage masking for the first store location (maskD01Q / invMaskD01Q).
1632 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1633 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1635 *Pointer<Short4>(buffer) = c01;
// Second store location, one pitch further (maskD23Q / invMaskD23Q).
1637 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1638 value = *Pointer<Short4>(buffer);
1640 if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1641 ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1642 (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh?
1644 Short4 masked = value;
1645 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1646 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1650 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1651 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1653 *Pointer<Short4>(buffer) = c23;
// Same store sequence as above, but the masks are indexed with the RGBA-ordered write mask.
1656 case FORMAT_A8B8G8R8:
1657 case FORMAT_X8B8G8R8: // FIXME: Don't touch alpha?
1659 Pointer<Byte> buffer = cBuffer + x * 4;
1660 Short4 value = *Pointer<Short4>(buffer);
1662 if((state.targetFormat[index] == FORMAT_A8B8G8R8 && rgbaWriteMask != 0x0000000F) ||
1663 ((state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x00000007) &&
1664 (state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x0000000F))) // FIXME: Need for masking when XBGR && Fh?
1666 Short4 masked = value;
1667 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1668 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1672 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1673 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1675 *Pointer<Short4>(buffer) = c01;
1677 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1678 value = *Pointer<Short4>(buffer);
1680 if((state.targetFormat[index] == FORMAT_A8B8G8R8 && rgbaWriteMask != 0x0000000F) ||
1681 ((state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x00000007) &&
1682 (state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x0000000F))) // FIXME: Need for masking when XBGR && Fh?
1684 Short4 masked = value;
1685 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1686 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1690 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1691 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1693 *Pointer<Short4>(buffer) = c23;
// (case label not visible; presumably the alpha-only 8-bit format) 1 byte per pixel;
// nothing is written unless the alpha bit (0x8) of the write mask is set.
1697 if(rgbaWriteMask & 0x00000008)
1699 Pointer<Byte> buffer = cBuffer + 1 * x;
// Gather the two existing 16-bit pairs (this location and the one a pitch further).
// NOTE(review): the results of these Insert() calls are discarded, whereas
// sRGBtoLinear12_16() assigns Insert()'s return value -- confirm `value` is really updated.
1701 Insert(value, *Pointer<Short>(buffer), 0);
1702 Int pitch = *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1703 Insert(value, *Pointer<Short>(buffer + pitch), 1);
1704 value = UnpackLow(As<Byte8>(value), As<Byte8>(value));
1706 current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q) + 8 * xMask);
1707 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q) + 8 * xMask);
1710 *Pointer<Short>(buffer) = Extract(current.w, 0);
1711 *Pointer<Short>(buffer + pitch) = Extract(current.w, 1);
// (case label not visible; presumably the two-channel 16-bit format) 4 bytes per pixel;
// only the two low bits of the write mask are relevant.
1716 Pointer<Byte> buffer = cBuffer + 4 * x;
1718 Short4 value = *Pointer<Short4>(buffer);
1720 if((rgbaWriteMask & 0x00000003) != 0x00000003)
1722 Short4 masked = value;
1723 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1724 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0]));
1725 current.x |= masked;
1728 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1729 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1731 *Pointer<Short4>(buffer) = current.x;
// Second store location, one pitch further, written from current.y.
1733 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1735 value = *Pointer<Short4>(buffer);
1737 if((rgbaWriteMask & 0x00000003) != 0x00000003)
1739 Short4 masked = value;
1740 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1741 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0]));
1742 current.y |= masked;
1745 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1746 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1748 *Pointer<Short4>(buffer) = current.y;
// 8 bytes per pixel: after the transpose above, current.x/y/z/w are stored at
// buffer, buffer + 8, and (one pitch further) buffer, buffer + 8, each with its own
// coverage mask table (maskQ0Q .. maskQ3Q).
1751 case FORMAT_A16B16G16R16:
1753 Pointer<Byte> buffer = cBuffer + 8 * x;
1756 Short4 value = *Pointer<Short4>(buffer);
1758 if(rgbaWriteMask != 0x0000000F)
1760 Short4 masked = value;
1761 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1762 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1763 current.x |= masked;
1766 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
1767 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
1769 *Pointer<Short4>(buffer) = current.x;
1773 Short4 value = *Pointer<Short4>(buffer + 8);
1775 if(rgbaWriteMask != 0x0000000F)
1777 Short4 masked = value;
1778 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1779 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1780 current.y |= masked;
1783 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
1784 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
1786 *Pointer<Short4>(buffer + 8) = current.y;
1789 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1792 Short4 value = *Pointer<Short4>(buffer);
1794 if(rgbaWriteMask != 0x0000000F)
1796 Short4 masked = value;
1797 current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1798 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1799 current.z |= masked;
1802 current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
1803 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
1805 *Pointer<Short4>(buffer) = current.z;
1809 Short4 value = *Pointer<Short4>(buffer + 8);
1811 if(rgbaWriteMask != 0x0000000F)
1813 Short4 masked = value;
1814 current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1815 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1816 current.w |= masked;
1819 current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
1820 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
1822 *Pointer<Short4>(buffer + 8) = current.w;
// Computes the floating-point RGB blend factor selected by `blendFactorActive` and stores
// it in blendFactor.x/.y/.z. `oC` is the source (shader output) color and `pixel` the
// destination color. The w component is not written in any line visible here.
// NOTE(review): `blendFactor` is declared as a const reference yet its components are
// assigned; this presumably relies on the component type permitting assignment through
// const -- confirm against the Float4 declaration.
// NOTE(review): the embedded original line numbers skip values, so short lines (braces,
// `break`s, several case labels) are not visible in this listing.
1831 void PixelRoutine::blendFactor(const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorActive)
1833 switch(blendFactorActive)
// (case label not visible; presumably the "source color" factor) factor = source RGB
1842 blendFactor.x = oC.x;
1843 blendFactor.y = oC.y;
1844 blendFactor.z = oC.z;
// factor = 1 - source RGB
1846 case BLEND_INVSOURCE:
1847 blendFactor.x = Float4(1.0f) - oC.x;
1848 blendFactor.y = Float4(1.0f) - oC.y;
1849 blendFactor.z = Float4(1.0f) - oC.z;
// (case label not visible; presumably the "destination color" factor) factor = destination RGB
1852 blendFactor.x = pixel.x;
1853 blendFactor.y = pixel.y;
1854 blendFactor.z = pixel.z;
// (case label not visible; presumably the inverse destination color factor) factor = 1 - destination RGB
1857 blendFactor.x = Float4(1.0f) - pixel.x;
1858 blendFactor.y = Float4(1.0f) - pixel.y;
1859 blendFactor.z = Float4(1.0f) - pixel.z;
// factor = source alpha, replicated to all three channels
1861 case BLEND_SOURCEALPHA:
1862 blendFactor.x = oC.w;
1863 blendFactor.y = oC.w;
1864 blendFactor.z = oC.w;
// factor = 1 - source alpha
1866 case BLEND_INVSOURCEALPHA:
1867 blendFactor.x = Float4(1.0f) - oC.w;
1868 blendFactor.y = Float4(1.0f) - oC.w;
1869 blendFactor.z = Float4(1.0f) - oC.w;
// factor = destination alpha
1871 case BLEND_DESTALPHA:
1872 blendFactor.x = pixel.w;
1873 blendFactor.y = pixel.w;
1874 blendFactor.z = pixel.w;
// factor = 1 - destination alpha
1876 case BLEND_INVDESTALPHA:
1877 blendFactor.x = Float4(1.0f) - pixel.w;
1878 blendFactor.y = Float4(1.0f) - pixel.w;
1879 blendFactor.z = Float4(1.0f) - pixel.w;
// factor = min(source alpha, 1 - destination alpha), replicated to all three channels
1881 case BLEND_SRCALPHASAT:
1882 blendFactor.x = Float4(1.0f) - pixel.w;
1883 blendFactor.x = Min(blendFactor.x, oC.w);
1884 blendFactor.y = blendFactor.x;
1885 blendFactor.z = blendFactor.x;
// factor = values read from DrawData::factor.blendConstant4F[0..2]
1887 case BLEND_CONSTANT:
1888 blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[0]));
1889 blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[1]));
1890 blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[2]));
// factor = values read from DrawData::factor.invBlendConstant4F[0..2]
1892 case BLEND_INVCONSTANT:
1893 blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
1894 blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
1895 blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
// Computes the floating-point alpha blend factor selected by `blendFactorAlphaActive` and
// stores it in blendFactor.w. `oC` is the source (shader output) color and `pixel` the
// destination color; only their w components are read in the lines visible here.
// NOTE(review): `blendFactor` is declared as a const reference yet its w component is
// assigned; this presumably relies on the component type permitting assignment through
// const -- confirm against the Float4 declaration.
// NOTE(review): the embedded original line numbers skip values, so short lines (braces,
// `break`s, several case labels) are not visible in this listing.
1902 void PixelRoutine::blendFactorAlpha(const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorAlphaActive)
1904 switch(blendFactorAlphaActive)
// (case label not visible; presumably the "source" factor) factor = source alpha
1913 blendFactor.w = oC.w;
// factor = 1 - source alpha
1915 case BLEND_INVSOURCE:
1916 blendFactor.w = Float4(1.0f) - oC.w;
// (case label not visible; presumably the "destination" factor) factor = destination alpha
1919 blendFactor.w = pixel.w;
// (case label not visible; presumably the inverse destination factor) factor = 1 - destination alpha
1922 blendFactor.w = Float4(1.0f) - pixel.w;
// For the alpha channel the color-based and alpha-based selectors yield the same value.
1924 case BLEND_SOURCEALPHA:
1925 blendFactor.w = oC.w;
1927 case BLEND_INVSOURCEALPHA:
1928 blendFactor.w = Float4(1.0f) - oC.w;
1930 case BLEND_DESTALPHA:
1931 blendFactor.w = pixel.w;
1933 case BLEND_INVDESTALPHA:
1934 blendFactor.w = Float4(1.0f) - pixel.w;
// Unlike the RGB variant, the alpha factor for this mode is the constant 1.0.
1936 case BLEND_SRCALPHASAT:
1937 blendFactor.w = Float4(1.0f);
// factor = value read from DrawData::factor.blendConstant4F[3]
1939 case BLEND_CONSTANT:
1940 blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
// factor = value read from DrawData::factor.invBlendConstant4F[3]
1942 case BLEND_INVCONSTANT:
1943 blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
// Floating-point alpha blending for render target `index`: reads the destination color
// from the color buffer into `pixel`, scales source (`oC`) and destination by their blend
// factors, and combines them with state.blendOperation (RGB) and state.blendOperationAlpha
// (alpha). The blended result is left in `oC`.
// NOTE(review): the embedded original line numbers skip values, so short lines (braces,
// `break`/`return`, several case labels, short declarations such as `pixel` and `one`, and
// short statements such as the add/subtract blend operations) are not visible here.
1950 void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x)
// Nothing to do when blending is disabled (the guarded statement is not visible;
// presumably an early return).
1952 if(!state.alphaBlendActive)
1957 Pointer<Byte> buffer;
// `one` is the value used below to fill channels the target format does not store:
// the bit pattern 0x7FFFFFFF for signed integer formats and 0xFFFFFFFF for unsigned ones.
// The statement for the float formats is not visible.
1965 switch(state.targetFormat[index])
1968 case FORMAT_G32R32I:
1969 one = As<Float4>(Int4(0x7FFFFFFF));
1972 case FORMAT_G32R32UI:
1973 one = As<Float4>(Int4(0xFFFFFFFF));
1976 case FORMAT_G32R32F:
// ---- Read the destination color ----
1981 switch(state.targetFormat[index])
// (case labels not visible; presumably the single-channel 32-bit formats) two 32-bit
// values from this location and two from one pitch further fill the four lanes of
// pixel.x; the other channels are set to `one`.
1988 pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0);
1989 pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4);
1990 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1992 pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0);
1993 pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4);
1994 pixel.y = pixel.z = pixel.w = one;
// Two-channel 32-bit formats: load 16 bytes from each of the two locations, then
// de-interleave with shuffle masks 0x88 and 0xDD; z and w are set to `one`.
1996 case FORMAT_G32R32I:
1997 case FORMAT_G32R32UI:
1998 case FORMAT_G32R32F:
2000 pixel.x = *Pointer<Float4>(buffer + 8 * x, 16);
2001 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2002 pixel.y = *Pointer<Float4>(buffer + 8 * x, 16);
2004 pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x88);
2005 pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0xDD);
2007 pixel.z = pixel.w = one;
// Four-channel 32-bit formats: load four 16-byte vectors (two per location) and
// transpose them into per-channel registers.
2009 case FORMAT_X32B32G32R32F:
2010 case FORMAT_A32B32G32R32F:
2011 case FORMAT_A32B32G32R32I:
2012 case FORMAT_A32B32G32R32UI:
2014 pixel.x = *Pointer<Float4>(buffer + 16 * x, 16);
2015 pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2016 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2017 pixel.z = *Pointer<Float4>(buffer + 16 * x, 16);
2018 pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2019 transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
// The X format has no stored alpha: treat the destination alpha as 1.0.
2020 if(state.targetFormat[index] == FORMAT_X32B32G32R32F)
2022 pixel.w = Float4(1.0f);
// Destination is stored as sRGB: intended to be converted to linear before blending.
// NOTE(review): sRGBtoLinear() takes a const reference and returns the converted value,
// but the results of these three calls are discarded, so `pixel` appears to stay
// unconverted here -- confirm; `pixel.x = sRGBtoLinear(pixel.x);` etc. may be intended.
2029 if(postBlendSRGB && state.writeSRGB)
2031 sRGBtoLinear(pixel.x);
2032 sRGBtoLinear(pixel.y);
2033 sRGBtoLinear(pixel.z);
2036 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
2037 Vector4f sourceFactor;
2038 Vector4f destFactor;
// ---- RGB ----
2040 blendFactor(sourceFactor, oC, pixel, state.sourceBlendFactor);
2041 blendFactor(destFactor, oC, pixel, state.destBlendFactor);
// The multiplications are skipped for the ONE and ZERO factors.
2043 if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
2045 oC.x *= sourceFactor.x;
2046 oC.y *= sourceFactor.y;
2047 oC.z *= sourceFactor.z;
2050 if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
2052 pixel.x *= destFactor.x;
2053 pixel.y *= destFactor.y;
2054 pixel.z *= destFactor.z;
// Combine the two scaled terms (several cases of this switch are not visible).
2057 switch(state.blendOperation)
// destination - source
2069 case BLENDOP_INVSUB:
2070 oC.x = pixel.x - oC.x;
2071 oC.y = pixel.y - oC.y;
2072 oC.z = pixel.z - oC.z;
// (case label not visible) component-wise minimum
2075 oC.x = Min(oC.x, pixel.x);
2076 oC.y = Min(oC.y, pixel.y);
2077 oC.z = Min(oC.z, pixel.z);
// (case label not visible) component-wise maximum
2080 oC.x = Max(oC.x, pixel.x);
2081 oC.y = Max(oC.y, pixel.y);
2082 oC.z = Max(oC.z, pixel.z);
// (the statements of this case are not visible in this listing)
2084 case BLENDOP_SOURCE:
// (case label not visible) result forced to zero
2093 oC.x = Float4(0.0f);
2094 oC.y = Float4(0.0f);
2095 oC.z = Float4(0.0f);
// ---- Alpha: same scheme, using the separate alpha factors and operation ----
2101 blendFactorAlpha(sourceFactor, oC, pixel, state.sourceBlendFactorAlpha);
2102 blendFactorAlpha(destFactor, oC, pixel, state.destBlendFactorAlpha);
2104 if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
2106 oC.w *= sourceFactor.w;
2109 if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
2111 pixel.w *= destFactor.w;
2114 switch(state.blendOperationAlpha)
// (the statements of this case are not visible in this listing)
2122 case BLENDOP_INVSUB:
// (case label not visible) minimum
2127 oC.w = Min(oC.w, pixel.w);
// (case label not visible) maximum
2130 oC.w = Max(oC.w, pixel.w);
// (the statements of this case are not visible in this listing)
2132 case BLENDOP_SOURCE:
// (case label not visible) result forced to zero
2139 oC.w = Float4(0.0f);
// Floating-point (Vector4f) color write for render target `index`: rearranges `oC` into
// the memory layout of state.targetFormat[index], then read-modify-writes the color
// buffer at cBuffer and one pitch (colorPitchB[index] bytes) further, keeping the existing
// contents for channels excluded by the color write mask and for pixels excluded by xMask.
// NOTE(review): the embedded original line numbers skip values, so short lines (braces,
// `break`, some case labels, short declarations such as `value`, the xMask assignments)
// are not visible here. sMask, zMask and cMask are not referenced in any visible line;
// presumably they are combined into xMask -- confirm against the full file.
2146 void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask)
// ---- Rearrange the channels for the target layout ----
2148 switch(state.targetFormat[index])
// Two-channel formats: interleave x and y (adjacent short statements are not visible).
2154 case FORMAT_G32R32F:
2155 case FORMAT_G32R32I:
2156 case FORMAT_G32R32UI:
2158 oC.x = UnpackLow(oC.x, oC.y);
2159 oC.z = UnpackHigh(oC.z, oC.y);
// Four-channel formats: transpose so that each register holds one pixel's channels.
2162 case FORMAT_X32B32G32R32F:
2163 case FORMAT_A32B32G32R32F:
2164 case FORMAT_A32B32G32R32I:
2165 case FORMAT_A32B32G32R32UI:
2166 transpose4x4(oC.x, oC.y, oC.z, oC.w);
2172 int rgbaWriteMask = state.colorWriteActive(index);
// xMask's assignments are on lines that are not visible; only the conditions remain.
2174 Int xMask; // Combination of all masks
2176 if(state.depthTestActive)
2185 if(state.stencilActive)
2190 Pointer<Byte> buffer;
// ---- Masked read-modify-write ----
2193 switch(state.targetFormat[index])
// (case labels not visible; presumably the single-channel 32-bit formats) 4 bytes per
// pixel; nothing is written unless bit 0 of the write mask is set.
2198 if(rgbaWriteMask & 0x00000001)
2200 buffer = cBuffer + 4 * x;
// Gather the four existing values: two here and two one pitch further.
2203 value.x = *Pointer<Float>(buffer + 0);
2204 value.y = *Pointer<Float>(buffer + 4);
2206 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2209 value.z = *Pointer<Float>(buffer + 0);
2210 value.w = *Pointer<Float>(buffer + 4);
// Select new or existing bits per pixel according to xMask.
2212 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + xMask * 16, 16));
2213 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16));
2214 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
// `buffer` currently points one pitch further, so lanes z/w are stored first, then the
// pointer is moved back for lanes x/y.
2217 *Pointer<Float>(buffer + 0) = oC.x.z;
2218 *Pointer<Float>(buffer + 4) = oC.x.w;
2220 buffer -= *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2223 *Pointer<Float>(buffer + 0) = oC.x.x;
2224 *Pointer<Float>(buffer + 4) = oC.x.y;
// Two-channel formats: 8 bytes per pixel, one 16-byte vector per store location.
2227 case FORMAT_G32R32F:
2228 case FORMAT_G32R32I:
2229 case FORMAT_G32R32UI:
2230 buffer = cBuffer + 8 * x;
2232 value = *Pointer<Float4>(buffer);
// Channel masking is only needed when not both of the two low mask bits are set.
2234 if((rgbaWriteMask & 0x00000003) != 0x00000003)
2236 Float4 masked = value;
2237 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2238 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD01X[rgbaWriteMask & 0x3][0])));
2239 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
// Coverage masking for the first location (maskQ01X / invMaskQ01X), then store.
2242 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16));
2243 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16));
2244 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2245 *Pointer<Float4>(buffer) = oC.x;
// Second location, one pitch further, written from oC.y (maskQ23X / invMaskQ23X).
// The declaration/initialization of `masked` for this part is not visible.
2247 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2249 value = *Pointer<Float4>(buffer);
2251 if((rgbaWriteMask & 0x00000003) != 0x00000003)
2256 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2257 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD01X[rgbaWriteMask & 0x3][0])));
2258 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2261 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16));
2262 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16));
2263 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2264 *Pointer<Float4>(buffer) = oC.y;
// Four-channel formats: 16 bytes per pixel. After the transpose above, oC.x/y/z/w are
// stored at buffer, buffer + 16, and (one pitch further) buffer, buffer + 16, each with
// its own coverage mask table (maskX0X .. maskX3X).
2266 case FORMAT_X32B32G32R32F:
2267 case FORMAT_A32B32G32R32F:
2268 case FORMAT_A32B32G32R32I:
2269 case FORMAT_A32B32G32R32UI:
2270 buffer = cBuffer + 16 * x;
2273 value = *Pointer<Float4>(buffer, 16);
2275 if(rgbaWriteMask != 0x0000000F)
2277 Float4 masked = value;
2278 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2279 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2280 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2283 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskX0X) + xMask * 16, 16));
2284 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16));
2285 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2286 *Pointer<Float4>(buffer, 16) = oC.x;
2290 value = *Pointer<Float4>(buffer + 16, 16);
2292 if(rgbaWriteMask != 0x0000000F)
2294 Float4 masked = value;
2295 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2296 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2297 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2300 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskX1X) + xMask * 16, 16));
2301 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16));
2302 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2303 *Pointer<Float4>(buffer + 16, 16) = oC.y;
2306 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2309 value = *Pointer<Float4>(buffer, 16);
2311 if(rgbaWriteMask != 0x0000000F)
2313 Float4 masked = value;
2314 oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2315 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2316 oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked));
2319 oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskX2X) + xMask * 16, 16));
2320 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16));
2321 oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value));
2322 *Pointer<Float4>(buffer, 16) = oC.z;
// For the X format the "existing" value of the last vector is taken as 1.0 in every lane
// instead of being read back from the buffer.
2326 value = (state.targetFormat[index] == FORMAT_X32B32G32R32F) ? Float4(1.0f) : *Pointer<Float4>(buffer + 16, 16);
2328 if(rgbaWriteMask != 0x0000000F)
2330 Float4 masked = value;
2331 oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2332 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2333 oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked));
2336 oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskX3X) + xMask * 16, 16));
2337 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16));
2338 oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value));
2339 *Pointer<Float4>(buffer + 16, 16) = oC.w;
2347 UShort4 PixelRoutine::convertFixed16(Float4 &cf, bool saturate)
2349 return UShort4(cf * Float4(0xFFFF), saturate);
2352 void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c)
2354 c.x = As<UShort4>(c.x) >> 4;
2355 c.y = As<UShort4>(c.y) >> 4;
2356 c.z = As<UShort4>(c.z) >> 4;
2358 sRGBtoLinear12_16(c);
2361 void PixelRoutine::sRGBtoLinear12_16(Vector4s &c)
2363 Pointer<Byte> LUT = constants + OFFSET(Constants,sRGBtoLinear12_16);
2365 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2366 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2367 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2368 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2370 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2371 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2372 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2373 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2375 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2376 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2377 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2378 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2381 void PixelRoutine::linearToSRGB16_12_16(Vector4s &c)
2383 c.x = As<UShort4>(c.x) >> 4;
2384 c.y = As<UShort4>(c.y) >> 4;
2385 c.z = As<UShort4>(c.z) >> 4;
2387 linearToSRGB12_16(c);
2390 void PixelRoutine::linearToSRGB12_16(Vector4s &c)
2392 Pointer<Byte> LUT = constants + OFFSET(Constants,linearToSRGB12_16);
2394 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2395 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2396 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2397 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2399 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2400 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2401 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2402 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2404 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2405 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2406 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2407 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2410 Float4 PixelRoutine::sRGBtoLinear(const Float4 &x) // Approximates x^2.2
2412 Float4 linear = x * x;
2413 linear = linear * Float4(0.73f) + linear * x * Float4(0.27f);
2415 return Min(Max(linear, Float4(0.0f)), Float4(1.0f));
2418 bool PixelRoutine::colorUsed()
2420 return state.colorWriteMask || state.alphaTestActive() || state.shaderContainsKill;