1 // SwiftShader Software Renderer
3 // Copyright(c) 2005-2013 TransGaming Inc.
5 // All rights reserved. No part of this software may be copied, distributed, transmitted,
6 // transcribed, stored in a retrieval system, translated into any human or computer
7 // language by any means, or disclosed to third parties without the explicit written
8 // agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express
9 // or implied, including but not limited to any patent rights, are granted to you.
12 #include "PixelRoutine.hpp"
14 #include "Renderer.hpp"
15 #include "QuadRasterizer.hpp"
16 #include "Surface.hpp"
17 #include "Primitive.hpp"
19 #include "SamplerCore.hpp"
20 #include "Constants.hpp"
// Global renderer flags. They are only declared here (extern); their definitions live elsewhere.
// complementaryDepthBuffer: when set, the depth and fog code in this file works with 1.0 - depth
// and picks different comparison instructions for the ordered depth tests.
25 extern bool complementaryDepthBuffer;
// postBlendSRGB: together with state.writeSRGB, makes readPixel() run sRGBtoLinear16_12_16() on the fetched pixel.
26 extern bool postBlendSRGB;
// exactColorRounding: not referenced in the part of the file shown here.
27 extern bool exactColorRounding;
// forceClearRegisters: forces the Registers constructor to zero the vf input registers.
28 extern bool forceClearRegisters;
// Constructs the register set used while generating a pixel routine.
//   shader - pixel shader being compiled; may be null (every use below is null-checked).
// rf and vf are constructed from the shader's dynamicallyIndexedTemporaries /
// dynamicallyIndexedInput flags, or from false when no shader is given.
30 PixelRoutine::Registers::Registers(const PixelShader *shader) :
31 QuadRasterizer::Registers(),
32 rf(shader && shader->dynamicallyIndexedTemporaries),
33 vf(shader && shader->dynamicallyIndexedInput)
// Zero the inputs when there is no shader, the shader version is below 0x0200,
// or the global forceClearRegisters flag is set.
35 if(!shader || shader->getVersion() < 0x0200 || forceClearRegisters)
// Clear all four components of vf[0] through vf[9].
37 for(int i = 0; i < 10; i++)
39 vf[i].x = Float4(0.0f);
40 vf[i].y = Float4(0.0f);
41 vf[i].z = Float4(0.0f);
42 vf[i].w = Float4(0.0f);
// Constructor: passes the processor state and the (possibly null) pixel shader straight
// through to the QuadRasterizer base class.
47 PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader) : QuadRasterizer(state, shader)
// Destructor. Walks over every texture image unit.
// NOTE(review): presumably this releases the sampler[i] objects that quad() allocates
// with new - confirm against the loop body.
51 PixelRoutine::~PixelRoutine()
53 for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
// Generates the pixel pipeline for one quad. In order: stencil test, depth interpolation,
// depth test, interpolation of the shader inputs, shader execution, alpha test,
// a second depth test, then depth / color / stencil writes.
// Stage timings are accumulated into r.cycles[].
//   rBase   - register set; downcast below, so it must really be a PixelRoutine::Registers
//   cBuffer - color buffer pointers, handed to rasterOperation()
//   zBuffer - depth buffer pointer, used by depthTest() and writeDepth()
//   sBuffer - stencil buffer pointer, used by stencilTest() and writeStencil()
//   cMask   - per-sample coverage masks; applyShader() and alphaTest() also receive them
//   x, y    - integer position; combined with the primitive's xQuad / yQuad offsets below
59 void PixelRoutine::quad(QuadRasterizer::Registers &rBase, Pointer<Byte> cBuffer[4], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
61 Registers& r = *static_cast<Registers*>(&rBase);
// Start timestamp for the whole-pipeline cycle counter (see PERF_PIPE at the end).
64 Long pipeTime = Ticks();
// One SamplerCore is heap-allocated per texture image unit.
67 for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
69 sampler[i] = new SamplerCore(r.constants, state.sampler[i]);
// The depth test can run before shading only when state.depthOverride is clear
// and the alpha test is inactive.
72 const bool earlyDepthTest = !state.depthOverride && !state.alphaTestActive();
74 Int zMask[4]; // Depth mask
75 Int sMask[4]; // Stencil mask
77 for(unsigned int q = 0; q < state.multiSample; q++)
// Stencil test, once per sample.
83 for(unsigned int q = 0; q < state.multiSample; q++)
85 stencilTest(r, sBuffer, q, x, sMask[q], cMask[q]);
// Horizontal coordinates of the quad's pixels.
95 Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(r.primitive + OFFSET(Primitive,xQuad), 16);
// Interpolate depth for every sample; with multisampling the coordinate is first
// shifted by the per-sample entry of Constants::X.
99 for(unsigned int q = 0; q < state.multiSample; q++)
103 if(state.multiSample > 1)
105 x -= *Pointer<Float4>(r.constants + OFFSET(Constants,X) + q * sizeof(float4));
108 z[q] = interpolate(x, r.Dz[q], z[q], r.primitive + OFFSET(Primitive,z), false, false);
// depthPass becomes true as soon as any sample passes the depth test.
112 Bool depthPass = false;
116 for(unsigned int q = 0; q < state.multiSample; q++)
118 depthPass = depthPass || depthTest(r, zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
// Continue when some sample passed, or unconditionally when the early depth test is not in use.
// NOTE(review): the capitalised If(...) is presumably the code generator's run-time branch,
// unlike the plain if() tests on state - confirm.
122 If(depthPass || Bool(!earlyDepthTest))
125 Long interpTime = Ticks();
// Vertical coordinates of the quad's pixels.
128 Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(r.primitive + OFFSET(Primitive,yQuad), 16);
130 // Centroid locations
131 Float4 XXXX = Float4(0.0f);
132 Float4 YYYY = Float4(0.0f);
// The weight starts at a tiny non-zero value.
// NOTE(review): presumably to avoid a division by zero when no sample is covered - confirm.
136 Float4 WWWW(1.0e-9f);
// Sum the table entries selected by each sample's coverage mask.
138 for(unsigned int q = 0; q < state.multiSample; q++)
140 XXXX += *Pointer<Float4>(r.constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]);
141 YYYY += *Pointer<Float4>(r.constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]);
142 WWWW += *Pointer<Float4>(r.constants + OFFSET(Constants,weight) + 16 * cMask[q]);
// Interpolate w at the pixel positions, and the reciprocal of w at the centroid.
155 w = interpolate(xxxx, r.Dw, rhw, r.primitive + OFFSET(Primitive,w), false, false);
160 rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, r.primitive + OFFSET(Primitive,w), false, false));
// Interpolate every enabled component of the 10 input registers,
// at the pixel position or at the centroid depending on the interpolant's centroid flag.
164 for(int interpolant = 0; interpolant < 10; interpolant++)
166 for(int component = 0; component < 4; component++)
168 if(state.interpolant[interpolant].component & (1 << component))
170 if(!state.interpolant[interpolant].centroid)
172 r.vf[interpolant][component] = interpolate(xxxx, r.Dv[interpolant][component], rhw, r.primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
176 r.vf[interpolant][component] = interpolateCentroid(XXXX, YYYY, rhwCentroid, r.primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
// Optional projection: the leading components are multiplied by the reciprocal of
// the y, z or w component, depending on the interpolant's project setting.
183 switch(state.interpolant[interpolant].project)
188 rcp = reciprocal(r.vf[interpolant].y);
189 r.vf[interpolant].x = r.vf[interpolant].x * rcp;
192 rcp = reciprocal(r.vf[interpolant].z);
193 r.vf[interpolant].x = r.vf[interpolant].x * rcp;
194 r.vf[interpolant].y = r.vf[interpolant].y * rcp;
197 rcp = reciprocal(r.vf[interpolant].w);
198 r.vf[interpolant].x = r.vf[interpolant].x * rcp;
199 r.vf[interpolant].y = r.vf[interpolant].y * rcp;
200 r.vf[interpolant].z = r.vf[interpolant].z * rcp;
// The fog factor is interpolated only when the fog component is enabled.
205 if(state.fog.component)
207 f = interpolate(xxxx, r.Df, rhw, r.primitive + OFFSET(Primitive,f), state.fog.flat & 0x01, state.perspective);
210 setBuiltins(r, x, y, z, w);
213 r.cycles[PERF_INTERP] += Ticks() - interpTime;
216 Bool alphaPass = true;
221 Long shaderTime = Ticks();
// Run the pixel shader (or fixed-function equivalent) for the quad.
224 applyShader(r, cMask);
227 r.cycles[PERF_SHADER] += Ticks() - shaderTime;
230 alphaPass = alphaTest(r, cMask);
// When the shader can kill pixels or the alpha test is active, cMask may have changed,
// so fold it into the depth and stencil masks.
232 if((shader && shader->containsKill()) || state.alphaTestActive())
234 for(unsigned int q = 0; q < state.multiSample; q++)
236 zMask[q] &= cMask[q];
237 sMask[q] &= cMask[q];
// Depth test performed after shading.
246 for(unsigned int q = 0; q < state.multiSample; q++)
248 depthPass = depthPass || depthTest(r, zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
253 Long ropTime = Ticks();
256 If(depthPass || Bool(earlyDepthTest))
// Depth write and occlusion counting, only for samples enabled in multiSampleMask.
258 for(unsigned int q = 0; q < state.multiSample; q++)
260 if(state.multiSampleMask & (1 << q))
262 writeDepth(r, zBuffer, q, x, z[q], zMask[q]);
264 if(state.occlusionEnabled)
// The increment is looked up in the occlusionCount table, indexed by the combined depth and stencil mask.
266 r.occlusion += *Pointer<UInt>(r.constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q]));
// Profiler bookkeeping: counts 4 raster operations for this quad.
274 AddAtomic(Pointer<Long>(&profiler.ropOperations), 4);
277 rasterOperation(r, f, cBuffer, x, sMask, zMask, cMask);
282 r.cycles[PERF_ROP] += Ticks() - ropTime;
// Stencil write, only for samples enabled in multiSampleMask.
287 for(unsigned int q = 0; q < state.multiSample; q++)
289 if(state.multiSampleMask & (1 << q))
291 writeStencil(r, sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
296 r.cycles[PERF_PIPE] += Ticks() - pipeTime;
// Evaluates a plane equation at the position (x, y).
//   x, y          - sample position (quad() passes the centroid sums XXXX / YYYY)
//   rhw           - reciprocal-w value
//   planeEquation - pointer to a PlaneEquation holding the A, B and C coefficients
//   flat, perspective - interpolation modifiers
// NOTE(review): rhw, flat and perspective are not used by the statements visible here;
// presumably flat skips the gradient terms and perspective multiplies by rhw - confirm.
300 Float4 PixelRoutine::interpolateCentroid(Float4 &x, Float4 &y, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective)
// Start from the constant term C.
302 Float4 interpolant = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,C), 16);
// Add the gradient contribution x * A + y * B.
306 interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,A), 16) +
307 y * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,B), 16);
// Stencil-tests sample q of the current quad and stores the outcome in sMask.
//   r       - registers; r.data and r.primitive supply the masks read below
//   sBuffer - stencil buffer pointer
//   q       - sample index
//   x       - horizontal position; the buffer is addressed at sBuffer + 2 * x
//   sMask   - output: SignMask of the comparison result ANDed with cMask
//   cMask   - coverage mask of the sample
318 void PixelRoutine::stencilTest(Registers &r, Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask)
// Handles the case where stencil testing is inactive.
320 if(!state.stencilActive)
325 // (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
327 Pointer<Byte> buffer = sBuffer + 2 * x;
// Step to the stencil slice of sample q (slice size comes from DrawData::stencilSliceB).
331 buffer += q * *Pointer<Int>(r.data + OFFSET(DrawData,stencilSliceB));
// Load 32 bits of stencil data into a Byte8 and keep an unmasked copy for the second-face test.
334 Byte8 value = As<Byte8>(Long1(*Pointer<UInt>(buffer)));
335 Byte8 valueCCW = value;
// Apply the test mask of stencil state 0 unless the state says no mask is needed.
337 if(!state.noStencilMask)
339 value &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[0].testMaskQ));
342 stencilTest(r, value, state.stencilCompareMode, false);
// Two-sided stencil: repeat the test with stencil state 1, then keep each result
// only where the primitive's clockwiseMask / invClockwiseMask allows it.
344 if(state.twoSidedStencil)
346 if(!state.noStencilMaskCCW)
348 valueCCW &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[1].testMaskQ));
351 stencilTest(r, valueCCW, state.stencilCompareModeCCW, true);
353 value &= *Pointer<Byte8>(r.primitive + OFFSET(Primitive,clockwiseMask));
354 valueCCW &= *Pointer<Byte8>(r.primitive + OFFSET(Primitive,invClockwiseMask));
// Collapse the per-byte result into a bit mask and restrict it to covered pixels.
358 sMask = SignMask(value) & cMask;
// Replaces value (stencil buffer bytes, already masked by the caller) with the per-byte
// result of comparing it against the reference value of the selected stencil state.
//   r                  - registers; r.data supplies the reference values
//   value              - in: buffer values, out: comparison result
//   stencilCompareMode - comparison function to apply
//   CCW                - selects stencil[0] (false) or stencil[1] (true) in DrawData
// The ordered comparisons add 0x80 to every byte and then use a signed CmpGT against
// referenceMaskedSignedQ.
// NOTE(review): presumably the bias makes the signed compare order the bytes like an
// unsigned compare, with the reference pre-biased the same way - confirm.
361 void PixelRoutine::stencilTest(Registers &r, Byte8 &value, StencilCompareMode stencilCompareMode, bool CCW)
365 switch(stencilCompareMode)
// All bits set (presumably the always-pass mode).
368 value = Byte8(0xFFFFFFFFFFFFFFFF);
// All bits clear (presumably the never-pass mode).
371 value = Byte8(0x0000000000000000);
373 case STENCIL_LESS: // a < b ~ b > a
374 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
375 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
// Plain equality against the masked reference (presumably the equal mode).
378 value = CmpEQ(value, *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
380 case STENCIL_NOTEQUAL: // a != b ~ !(a == b)
381 value = CmpEQ(value, *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
// Invert the equality mask.
382 value ^= Byte8(0xFFFFFFFFFFFFFFFF);
384 case STENCIL_LESSEQUAL: // a <= b ~ (b > a) || (a == b)
386 equal = CmpEQ(equal, *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
387 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
388 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
391 case STENCIL_GREATER: // a > b
// Here the reference is the left operand of CmpGT; 'equal' is reused as a temporary for it.
392 equal = *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ));
393 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
394 equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
397 case STENCIL_GREATEREQUAL: // a >= b ~ !(a < b) ~ !(b > a)
398 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
399 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
// Invert the greater-than mask.
400 value ^= Byte8(0xFFFFFFFFFFFFFFFF);
// Depth-tests sample q of the current quad against the depth buffer.
//   r       - registers; r.data supplies pitch / slice sizes, r.oDepth the shader-written depth
//   zBuffer - depth buffer pointer
//   q       - sample index
//   x       - horizontal position
//   z       - interpolated depth of the quad
//   sMask   - stencil mask of the sample
//   zMask   - output: SignMask of the comparison result ANDed with cMask
//   cMask   - coverage mask of the sample
// NOTE(review): the return statement is not visible here; quad() ORs the result into
// depthPass, so it presumably reports whether any pixel passed - confirm.
407 Bool PixelRoutine::depthTest(Registers &r, Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask)
// Handles the case where depth testing is inactive.
409 if(!state.depthTestActive)
// When the shader overrides depth, the tested value comes from r.oDepth
// (as 1 - oDepth when a complementary depth buffer is in use).
416 if(shader && shader->depthOverride())
418 if(complementaryDepthBuffer)
420 Z = Float4(1.0f) - r.oDepth;
428 Pointer<Byte> buffer;
// Linear layout: 4 bytes per pixel plus a row pitch. Quad layout: 8 bytes per x step.
431 if(!state.quadLayoutDepthBuffer)
433 buffer = zBuffer + 4 * x;
434 pitch = *Pointer<Int>(r.data + OFFSET(DrawData,depthPitchB));
438 buffer = zBuffer + 8 * x;
// Step to the depth slice of sample q.
443 buffer += q * *Pointer<Int>(r.data + OFFSET(DrawData,depthSliceB));
// Read the current buffer values when the compare mode is not NEVER
// (or when it is not ALWAYS and depth writes are disabled).
448 if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
450 if(!state.quadLayoutDepthBuffer)
452 // FIXME: Properly optimizes?
// xy come from the current row; zw are the upper half of a Float4 loaded at
// buffer + pitch - 8, i.e. the two floats starting at buffer + pitch (the next row).
453 zValue.xy = *Pointer<Float4>(buffer);
454 zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
458 zValue = *Pointer<Float4>(buffer, 16);
// Comparison of buffer value against Z. The operands are (zValue, Z), and with a
// complementary depth buffer a different comparison instruction is chosen.
464 switch(state.depthCompareMode)
473 zTest = CmpEQ(zValue, Z);
476 zTest = CmpNEQ(zValue, Z);
479 if(complementaryDepthBuffer)
481 zTest = CmpLT(zValue, Z);
485 zTest = CmpNLE(zValue, Z);
488 case DEPTH_GREATEREQUAL:
489 if(complementaryDepthBuffer)
491 zTest = CmpNLT(zValue, Z);
495 zTest = CmpLE(zValue, Z);
498 case DEPTH_LESSEQUAL:
499 if(complementaryDepthBuffer)
501 zTest = CmpLE(zValue, Z);
505 zTest = CmpNLT(zValue, Z);
509 if(complementaryDepthBuffer)
511 zTest = CmpNLE(zValue, Z);
515 zTest = CmpLT(zValue, Z);
// Second switch on the compare mode; this is where zMask is derived from the test result.
522 switch(state.depthCompareMode)
531 zMask = SignMask(zTest) & cMask;
// Additional handling when the stencil test is active.
// NOTE(review): presumably zMask is combined with sMask here - confirm.
535 if(state.stencilActive)
// Compares the quad's alpha values against the alpha reference and writes the result to aMask.
//   r     - registers; r.data supplies factor.alphaReference4
//   aMask - output bit mask derived from the comparison
//   alpha - 16-bit alpha values of the quad
// Every visible case builds a Short4 comparison mask, packs it together with a zero
// vector and takes SignMask of the result.
// NOTE(review): presumably the packing narrows the lanes to bytes so that only the
// lanes coming from cmp can set bits in aMask - confirm.
543 void PixelRoutine::alphaTest(Registers &r, Int &aMask, Short4 &alpha)
548 switch(state.alphaCompareMode)
// Equality with the reference (presumably the equal mode).
557 cmp = CmpEQ(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)));
558 aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
560 case ALPHA_NOTEQUAL: // a != b ~ !(a == b)
561 cmp = CmpEQ(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4((short)0xFFFF, (short)0xFFFF, (short)0xFFFF, (short)0xFFFF); // FIXME
562 aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
564 case ALPHA_LESS: // a < b ~ b > a
565 cmp = CmpGT(*Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)), alpha);
566 aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
568 case ALPHA_GREATEREQUAL: // a >= b ~ (a > b) || (a == b) ~ !(b > a) // TODO: Approximate
// Both an equality mask and a greater-than mask are computed for this mode.
569 equal = CmpEQ(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)));
570 cmp = CmpGT(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)));
572 aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
574 case ALPHA_LESSEQUAL: // a <= b ~ !(a > b)
575 cmp = CmpGT(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4((short)0xFFFF, (short)0xFFFF, (short)0xFFFF, (short)0xFFFF); // FIXME
576 aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
578 case ALPHA_GREATER: // a > b
579 cmp = CmpGT(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)));
580 aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
// Derives per-sample masks from the quad's alpha values.
//   r     - registers; r.data supplies the four thresholds a2c0..a2c3
//   cMask - per-sample coverage masks
//   alpha - floating-point alpha values of the quad
// NOTE(review): the statements that use aMask0..aMask3 are not visible here;
// presumably each is ANDed into the matching cMask entry - confirm.
587 void PixelRoutine::alphaToCoverage(Registers &r, Int cMask[4], Float4 &alpha)
// A lane is set when alpha is not less than the threshold of that sample.
589 Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(r.data + OFFSET(DrawData,a2c0)));
590 Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(r.data + OFFSET(DrawData,a2c1)));
591 Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(r.data + OFFSET(DrawData,a2c2)));
592 Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(r.data + OFFSET(DrawData,a2c3)));
// Collapse every lane mask into a bit mask.
594 Int aMask0 = SignMask(coverage0);
595 Int aMask1 = SignMask(coverage1);
596 Int aMask2 = SignMask(coverage2);
597 Int aMask3 = SignMask(coverage3);
// Applies fog to the RGB channels of c0 (alpha is not touched by the visible statements).
//   r   - registers; r.data supplies the fog color
//   c0  - color to be fogged, modified in place
//   fog - fog factor; recomputed by pixelFog() when a pixel fog mode is selected
//   z   - depth values, forwarded to pixelFog()
//   rhw - reciprocal-w values, forwarded to pixelFog()
605 void PixelRoutine::fogBlend(Registers &r, Vector4f &c0, Float4 &fog, Float4 &z, Float4 &rhw)
612 if(state.pixelFogMode != FOG_NONE)
614 pixelFog(r, fog, z, rhw);
// Clamp the fog factor to the range [0, 1].
616 fog = Min(fog, Float4(1.0f));
617 fog = Max(fog, Float4(0.0f));
// Subtract the fog color ...
620 c0.x -= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[0]));
621 c0.y -= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[1]));
622 c0.z -= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[2]));
// ... and add it back.
// NOTE(review): presumably the difference is scaled by fog in between, giving
// fogColor + fog * (color - fogColor) - confirm.
628 c0.x += *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[0]));
629 c0.y += *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[1]));
630 c0.z += *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[2]));
// Computes the per-pixel fog factor and stores it in visibility.
//   r          - registers; r.data supplies scale, offset and density constants
//   visibility - output; aliased as zw and used as the working value
//   z          - depth values
//   rhw        - reciprocal-w values (not used by the statements visible here)
633 void PixelRoutine::pixelFog(Registers &r, Float4 &visibility, Float4 &z, Float4 &rhw)
635 Float4 &zw = visibility;
637 if(state.pixelFogMode != FOG_NONE)
// With a complementary depth buffer the working depth is 1 - z.
645 if(complementaryDepthBuffer)
647 zw = Float4(1.0f) - z;
656 switch(state.pixelFogMode)
// Scale and offset (presumably the linear fog mode).
661 zw *= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.scale));
662 zw += *Pointer<Float4>(r.data + OFFSET(DrawData,fog.offset));
// Multiply by densityE, then take a base-2 exponential (presumably the exponential fog mode).
665 zw *= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.densityE));
666 zw = exponential2(zw, true);
// Same shape with density2E (presumably the squared-exponential fog mode).
670 zw *= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.density2E));
671 zw = exponential2(zw, true);
// Writes the depth of sample q into the depth buffer for the pixels selected by zMask.
//   r       - registers; r.data supplies pitch / slice sizes, r.constants the lane masks
//   zBuffer - depth buffer pointer
//   q       - sample index
//   x       - horizontal position
//   z       - interpolated depth of the quad
//   zMask   - selects which pixels receive the new depth
678 void PixelRoutine::writeDepth(Registers &r, Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask)
// Handles the case where depth writes are disabled.
680 if(!state.depthWriteEnable)
// When the shader overrides depth, the written value comes from r.oDepth
// (as 1 - oDepth when a complementary depth buffer is in use).
687 if(shader && shader->depthOverride())
689 if(complementaryDepthBuffer)
691 Z = Float4(1.0f) - r.oDepth;
699 Pointer<Byte> buffer;
// Linear layout: 4 bytes per pixel plus a row pitch. Quad layout: 8 bytes per x step.
702 if(!state.quadLayoutDepthBuffer)
704 buffer = zBuffer + 4 * x;
705 pitch = *Pointer<Int>(r.data + OFFSET(DrawData,depthPitchB));
709 buffer = zBuffer + 8 * x;
// Step to the depth slice of sample q.
714 buffer += q * *Pointer<Int>(r.data + OFFSET(DrawData,depthSliceB));
// Read the existing buffer values so that unselected pixels can be preserved.
719 if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
721 if(!state.quadLayoutDepthBuffer)
723 // FIXME: Properly optimizes?
// xy come from the current row; zw are the upper half of a Float4 loaded at
// buffer + pitch - 8, i.e. the two floats starting at buffer + pitch (the next row).
724 zValue.xy = *Pointer<Float4>(buffer);
725 zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
729 zValue = *Pointer<Float4>(buffer, 16);
// Merge: keep the new depth in the lanes selected by maskD4X[zMask], keep the old
// buffer value in the lanes selected by invMaskD4X[zMask], then OR the two together.
733 Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X) + zMask * 16, 16));
734 zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16));
735 Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
// Store: two Float2 rows for the linear layout, one aligned Float4 for the quad layout.
737 if(!state.quadLayoutDepthBuffer)
739 // FIXME: Properly optimizes?
740 *Pointer<Float2>(buffer) = Float2(Z.xy);
741 *Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
745 *Pointer<Float4>(buffer, 16) = Z;
// Updates the stencil buffer of sample q according to the stencil operations in state.
//   r       - registers; r.data, r.primitive and r.constants supply the masks read below
//   sBuffer - stencil buffer pointer
//   q       - sample index
//   x       - horizontal position; the buffer is addressed at sBuffer + 2 * x
//   sMask   - stencil test result of the sample
//   zMask   - depth test result of the sample
//   cMask   - coverage mask; only covered pixels receive the new value
749 void PixelRoutine::writeStencil(Registers &r, Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask)
// Handles the case where stencil is inactive.
751 if(!state.stencilActive)
// Detects the case where every operation is KEEP for the front face and, with
// two-sided stencil, for the CCW face as well.
756 if(state.stencilPassOperation == OPERATION_KEEP && state.stencilZFailOperation == OPERATION_KEEP && state.stencilFailOperation == OPERATION_KEEP)
758 if(!state.twoSidedStencil || (state.stencilPassOperationCCW == OPERATION_KEEP && state.stencilZFailOperationCCW == OPERATION_KEEP && state.stencilFailOperationCCW == OPERATION_KEEP))
// Detects the case where the stencil write is masked for all faces in use.
764 if(state.stencilWriteMasked && (!state.twoSidedStencil || state.stencilWriteMaskedCCW))
769 Pointer<Byte> buffer = sBuffer + 2 * x;
// Step to the stencil slice of sample q.
773 buffer += q * *Pointer<Int>(r.data + OFFSET(DrawData,stencilSliceB));
// Load 32 bits of current stencil data.
776 Byte8 bufferValue = As<Byte8>(Long1(*Pointer<UInt>(buffer)));
// New value for stencil state 0.
779 stencilOperation(r, newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask);
// Write mask: take the new value where writeMaskQ is set and the old value elsewhere.
781 if(!state.noStencilWriteMask)
783 Byte8 maskedValue = bufferValue;
784 newValue &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[0].writeMaskQ));
785 maskedValue &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[0].invWriteMaskQ));
786 newValue |= maskedValue;
// Two-sided stencil: compute the value for stencil state 1 the same way, then
// select between the two with the primitive's clockwiseMask / invClockwiseMask.
789 if(state.twoSidedStencil)
793 stencilOperation(r, newValueCCW, bufferValue, state.stencilPassOperationCCW, state.stencilZFailOperationCCW, state.stencilFailOperationCCW, true, zMask, sMask);
795 if(!state.noStencilWriteMaskCCW)
797 Byte8 maskedValue = bufferValue;
798 newValueCCW &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[1].writeMaskQ));
799 maskedValue &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[1].invWriteMaskQ));
800 newValueCCW |= maskedValue;
803 newValue &= *Pointer<Byte8>(r.primitive + OFFSET(Primitive,clockwiseMask));
804 newValueCCW &= *Pointer<Byte8>(r.primitive + OFFSET(Primitive,invClockwiseMask));
805 newValue |= newValueCCW;
// Coverage: covered pixels take the new value, all others keep the buffer value.
808 newValue &= *Pointer<Byte8>(r.constants + OFFSET(Constants,maskB4Q) + 8 * cMask);
809 bufferValue &= *Pointer<Byte8>(r.constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask);
810 newValue |= bufferValue;
// Store the low 32 bits back to the stencil buffer.
812 *Pointer<UInt>(buffer) = UInt(As<Long>(newValue));
// Computes the new stencil value from the pass / z-fail / fail operations and the test masks.
//   r           - registers; r.constants supplies the byte masks indexed by zMask / sMask
//   newValue    - output; aliased as 'pass' and used for the pass-operation result
//   bufferValue - current stencil buffer values
//   stencilPassOperation, stencilZFailOperation, stencilFailOperation - operations to apply
//   CCW         - selects which stencil state the single-operation overload reads
//   zMask       - depth test result
//   sMask       - stencil test result
815 void PixelRoutine::stencilOperation(Registers &r, Byte8 &newValue, Byte8 &bufferValue, StencilOperation stencilPassOperation, StencilOperation stencilZFailOperation, StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask)
817 Byte8 &pass = newValue;
821 stencilOperation(r, pass, bufferValue, stencilPassOperation, CCW);
// The z-fail result is computed only when it can differ from the pass result.
823 if(stencilZFailOperation != stencilPassOperation)
825 stencilOperation(r, zFail, bufferValue, stencilZFailOperation, CCW);
// The fail result is computed only when it differs from at least one of the other two operations.
828 if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
830 stencilOperation(r, fail, bufferValue, stencilFailOperation, CCW);
833 if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
835 if(state.depthTestActive && stencilZFailOperation != stencilPassOperation) // zMask valid and values not the same
// Keep 'pass' where the depth test passed and 'zFail' where it failed.
837 pass &= *Pointer<Byte8>(r.constants + OFFSET(Constants,maskB4Q) + 8 * zMask);
838 zFail &= *Pointer<Byte8>(r.constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask);
// Keep 'pass' where the stencil test passed and 'fail' where it failed.
842 pass &= *Pointer<Byte8>(r.constants + OFFSET(Constants,maskB4Q) + 8 * sMask);
843 fail &= *Pointer<Byte8>(r.constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask);
// Applies a single stencil operation to bufferValue and stores the result in output.
//   r           - registers; r.data supplies the reference value for OPERATION_REPLACE
//   output      - result of the operation
//   bufferValue - current stencil buffer values
//   operation   - operation to apply
//   CCW         - selects stencil[0] (false) or stencil[1] (true) in DrawData
848 void PixelRoutine::stencilOperation(Registers &r, Byte8 &output, Byte8 &bufferValue, StencilOperation operation, bool CCW)
// Unchanged buffer value (presumably the keep operation).
853 output = bufferValue;
// All zero (presumably the zero operation).
856 output = Byte8(0x0000000000000000);
858 case OPERATION_REPLACE:
859 output = *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceQ));
861 case OPERATION_INCRSAT:
// Saturating add of 1 to every byte.
862 output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
864 case OPERATION_DECRSAT:
// Saturating subtract of 1 from every byte.
865 output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
867 case OPERATION_INVERT:
868 output = bufferValue ^ Byte8(0xFFFFFFFFFFFFFFFF);
// Non-saturating add of 1 (presumably the wrapping increment operation).
871 output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
// Non-saturating subtract of 1 (presumably the wrapping decrement operation).
874 output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
// Fills the x, y and z components of blendFactor with the 16-bit color blend factor
// selected by blendFactorActive (the w component is not written here).
//   r                 - registers; r.data supplies the constant blend factors
//   blendFactor       - output factor
//   current           - source color, i.e. the color about to be written
//   pixel             - destination color read from the render target
//   blendFactorActive - which factor to produce
// 0xFFFF is the value that "one minus x" is computed from in the inverse factors.
881 void PixelRoutine::blendFactor(Registers &r, const Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorActive)
883 switch(blendFactorActive)
// Source color (presumably the BLEND_SOURCE case).
892 blendFactor.x = current.x;
893 blendFactor.y = current.y;
894 blendFactor.z = current.z;
896 case BLEND_INVSOURCE:
897 blendFactor.x = Short4(0xFFFFu) - current.x;
898 blendFactor.y = Short4(0xFFFFu) - current.y;
899 blendFactor.z = Short4(0xFFFFu) - current.z;
// Destination color (presumably the BLEND_DEST case).
902 blendFactor.x = pixel.x;
903 blendFactor.y = pixel.y;
904 blendFactor.z = pixel.z;
// Inverse destination color (presumably the BLEND_INVDEST case).
907 blendFactor.x = Short4(0xFFFFu) - pixel.x;
908 blendFactor.y = Short4(0xFFFFu) - pixel.y;
909 blendFactor.z = Short4(0xFFFFu) - pixel.z;
911 case BLEND_SOURCEALPHA:
912 blendFactor.x = current.w;
913 blendFactor.y = current.w;
914 blendFactor.z = current.w;
916 case BLEND_INVSOURCEALPHA:
917 blendFactor.x = Short4(0xFFFFu) - current.w;
918 blendFactor.y = Short4(0xFFFFu) - current.w;
919 blendFactor.z = Short4(0xFFFFu) - current.w;
921 case BLEND_DESTALPHA:
922 blendFactor.x = pixel.w;
923 blendFactor.y = pixel.w;
924 blendFactor.z = pixel.w;
926 case BLEND_INVDESTALPHA:
927 blendFactor.x = Short4(0xFFFFu) - pixel.w;
928 blendFactor.y = Short4(0xFFFFu) - pixel.w;
929 blendFactor.z = Short4(0xFFFFu) - pixel.w;
931 case BLEND_SRCALPHASAT:
// min(source alpha, 0xFFFF - destination alpha), compared as unsigned 16-bit values.
932 blendFactor.x = Short4(0xFFFFu) - pixel.w;
933 blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w));
934 blendFactor.y = blendFactor.x;
935 blendFactor.z = blendFactor.x;
// Constant blend color from DrawData (presumably the BLEND_CONSTANT case).
938 blendFactor.x = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[0]));
939 blendFactor.y = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[1]));
940 blendFactor.z = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[2]));
942 case BLEND_INVCONSTANT:
943 blendFactor.x = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
944 blendFactor.y = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
945 blendFactor.z = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
947 case BLEND_CONSTANTALPHA:
// The alpha entry (index 3) of the constant is replicated to all three channels.
948 blendFactor.x = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
949 blendFactor.y = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
950 blendFactor.z = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
952 case BLEND_INVCONSTANTALPHA:
953 blendFactor.x = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
954 blendFactor.y = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
955 blendFactor.z = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
// Fills the w component of blendFactor with the 16-bit alpha blend factor selected by
// blendFactorAlphaActive (x, y and z are not written here).
//   r                      - registers; r.data supplies the constant blend factors
//   blendFactor            - output factor
//   current                - source color, i.e. the color about to be written
//   pixel                  - destination color read from the render target
//   blendFactorAlphaActive - which factor to produce
// For the alpha channel the color and alpha variants of a factor read the same
// w component, so several cases produce identical results.
962 void PixelRoutine::blendFactorAlpha(Registers &r, const Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorAlphaActive)
964 switch(blendFactorAlphaActive)
// Source alpha (presumably the BLEND_SOURCE case).
973 blendFactor.w = current.w;
975 case BLEND_INVSOURCE:
976 blendFactor.w = Short4(0xFFFFu) - current.w;
// Destination alpha (presumably the BLEND_DEST case).
979 blendFactor.w = pixel.w;
// Inverse destination alpha (presumably the BLEND_INVDEST case).
982 blendFactor.w = Short4(0xFFFFu) - pixel.w;
984 case BLEND_SOURCEALPHA:
985 blendFactor.w = current.w;
987 case BLEND_INVSOURCEALPHA:
988 blendFactor.w = Short4(0xFFFFu) - current.w;
990 case BLEND_DESTALPHA:
991 blendFactor.w = pixel.w;
993 case BLEND_INVDESTALPHA:
994 blendFactor.w = Short4(0xFFFFu) - pixel.w;
996 case BLEND_SRCALPHASAT:
// The alpha factor for this mode is the maximum value (0xFFFF).
997 blendFactor.w = Short4(0xFFFFu);
1000 case BLEND_CONSTANTALPHA:
1001 blendFactor.w = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
// Both inverse-constant modes read the same inverse constant alpha entry.
1003 case BLEND_INVCONSTANT:
1004 case BLEND_INVCONSTANTALPHA:
1005 blendFactor.w = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
// Reads the 2x2 quad at horizontal position x from render target 'index' and expands it
// into 16-bit-per-channel vectors in pixel.
//   r       - registers; r.data supplies the row pitch colorPitchB[index]
//   index   - render target index
//   cBuffer - color buffer pointer of that render target
//   x       - horizontal position
//   pixel   - output color of the quad
// Every format reads from the current row and, after adding colorPitchB[index],
// from the next row. Formats without an alpha channel set pixel.w to 0xFFFF.
1012 void PixelRoutine::readPixel(Registers &r, int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &pixel)
1016 Pointer<Byte> buffer;
1018 switch(state.targetFormat[index])
// 16-bit format, 2 bytes per pixel: two pixels are loaded from each row, then the
// channels are isolated with the masks 0xF800 / 0x07E0 / 0x001F and shifted up to the top bits.
1021 buffer = cBuffer + 2 * x;
1022 c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 0));
1023 buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1024 c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 1));
1026 pixel.x = c01 & Short4(0xF800u);
1027 pixel.y = (c01 & Short4(0x07E0u)) << 5;
1028 pixel.z = (c01 & Short4(0x001Fu)) << 11;
1029 pixel.w = Short4(0xFFFFu);
1031 case FORMAT_A8R8G8B8:
// 4 bytes per pixel: c01 holds the two pixels of the first row, c23 those of the second.
1032 buffer = cBuffer + 4 * x;
1033 c01 = *Pointer<Short4>(buffer);
1034 buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1035 c23 = *Pointer<Short4>(buffer);
// Byte interleaving that gathers each channel of the four pixels into one vector.
1038 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1039 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1041 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1042 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
// Unpacking a vector with itself duplicates every byte, widening 8-bit values to 16 bits.
1045 pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1046 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1047 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1048 pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1050 case FORMAT_A8B8G8R8:
1051 buffer = cBuffer + 4 * x;
1052 c01 = *Pointer<Short4>(buffer);
1053 buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1054 c23 = *Pointer<Short4>(buffer);
1057 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1058 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1060 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1061 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
// Same widening as above, but x is taken from z and z from w: red and blue are swapped
// relative to the A8R8G8B8 case.
1064 pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1065 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1066 pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1067 pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
// 1 byte per pixel, alpha only: two pixels per row go into pixel.w, the color channels are zeroed.
1070 buffer = cBuffer + 1 * x;
1071 pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 0);
1072 buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1073 pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 1);
1074 pixel.w = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1075 pixel.x = Short4(0x0000);
1076 pixel.y = Short4(0x0000);
1077 pixel.z = Short4(0x0000);
1079 case FORMAT_X8R8G8B8:
1080 buffer = cBuffer + 4 * x;
1081 c01 = *Pointer<Short4>(buffer);
1082 buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1083 c23 = *Pointer<Short4>(buffer);
1086 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1087 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1089 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1090 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1092 pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1093 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1094 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1095 pixel.w = Short4(0xFFFFu);
1097 case FORMAT_X8B8G8R8:
1098 buffer = cBuffer + 4 * x;
1099 c01 = *Pointer<Short4>(buffer);
1100 buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1101 c23 = *Pointer<Short4>(buffer);
1104 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1105 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1107 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1108 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1111 pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1112 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1113 pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1114 pixel.w = Short4(0xFFFFu);
// The two quad-layout formats below only contain commented-out code.
1116 case FORMAT_A8G8R8B8Q:
1118 // pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1119 // pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1120 // pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1121 // pixel.w = UnpackHigh(As<Byte8>(pixel.w), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1123 case FORMAT_X8G8R8B8Q:
1125 // pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1126 // pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1127 // pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1128 // pixel.w = Short4(0xFFFFu);
1130 case FORMAT_A16B16G16R16:
// 8 bytes per pixel: each of the four loads fetches one whole pixel; the transpose
// then regroups the data so that every vector holds one channel of all four pixels.
1132 pixel.x = *Pointer<Short4>(buffer + 8 * x);
1133 pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
1134 buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1135 pixel.z = *Pointer<Short4>(buffer + 8 * x);
1136 pixel.w = *Pointer<Short4>(buffer + 8 * x + 8);
1137 transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
// 4 bytes per pixel with two 16-bit channels: x and y are de-interleaved from the two rows,
// while z and w are set to 0xFFFF.
1141 pixel.x = *Pointer<Short4>(buffer + 4 * x);
1142 buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1143 pixel.y = *Pointer<Short4>(buffer + 4 * x);
1145 pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
1146 pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y));
1148 pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z));
1149 pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z));
1150 pixel.z = Short4(0xFFFFu);
1151 pixel.w = Short4(0xFFFFu);
// With post-blend sRGB writes enabled, the value read back is converted with
// sRGBtoLinear16_12_16() before it is used.
1157 if(postBlendSRGB && state.writeSRGB)
1159 sRGBtoLinear16_12_16(r, pixel);
// Blends the color about to be written (current) with the color already in render
// target 'index', using the blend factors and operations from state. The result replaces current.
//   r       - registers, forwarded to readPixel() and the blend factor helpers
//   index   - render target index
//   cBuffer - color buffer pointer of that render target
//   current - in: source color, out: blended color
//   x       - horizontal position
// Color (x, y, z) and alpha (w) are handled separately, each with its own factors and
// operation. All arithmetic treats the channels as unsigned 16-bit values.
1163 void PixelRoutine::alphaBlend(Registers &r, int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
// Handles the case where blending is disabled.
1165 if(!state.alphaBlendActive)
// Fetch the destination color.
1171 readPixel(r, index, cBuffer, x, pixel);
1173 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
1174 Vector4s sourceFactor;
1175 Vector4s destFactor;
1177 blendFactor(r, sourceFactor, current, pixel, state.sourceBlendFactor);
1178 blendFactor(r, destFactor, current, pixel, state.destBlendFactor);
// The multiplication is skipped for the ONE and ZERO factors.
1180 if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
1182 current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x));
1183 current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
1184 current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
1187 if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
1189 pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
1190 pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y));
1191 pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z));
// Combine the weighted source and destination colors.
1194 switch(state.blendOperation)
// Saturating source + destination (presumably the add operation).
1197 current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1198 current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1199 current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
// Saturating source - destination (presumably the subtract operation).
1202 current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1203 current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1204 current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1206 case BLENDOP_INVSUB:
// Saturating destination - source.
1207 current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x));
1208 current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y));
1209 current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z));
// Per-channel minimum (presumably the min operation).
1212 current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x));
1213 current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y));
1214 current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z));
// Per-channel maximum (presumably the max operation).
1217 current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x));
1218 current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y));
1219 current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z));
1221 case BLENDOP_SOURCE:
// Take the destination color unchanged (presumably the destination-only operation).
1225 current.x = pixel.x;
1226 current.y = pixel.y;
1227 current.z = pixel.z;
// Result is zero (presumably the null operation).
1230 current.x = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1231 current.y = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1232 current.z = Short4(0x0000, 0x0000, 0x0000, 0x0000);
// Alpha channel: same sequence with the alpha factors and the alpha blend operation.
1238 blendFactorAlpha(r, sourceFactor, current, pixel, state.sourceBlendFactorAlpha);
1239 blendFactorAlpha(r, destFactor, current, pixel, state.destBlendFactorAlpha);
1241 if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
1243 current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
1246 if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
1248 pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
1251 switch(state.blendOperationAlpha)
1254 current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1257 current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1259 case BLENDOP_INVSUB:
1260 current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w));
1263 current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w));
1266 current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w));
1268 case BLENDOP_SOURCE:
1272 current.w = pixel.w;
1275 current.w = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1282 void PixelRoutine::logicOperation(Registers &r, int index, Pointer<Byte> &cBuffer, Vector4s ¤t, Int &x)
1284 if(state.logicalOperation == LOGICALOP_COPY)
1290 readPixel(r, index, cBuffer, x, pixel);
1292 switch(state.logicalOperation)
1294 case LOGICALOP_CLEAR:
1300 current.x = 0xFFFFu;
1301 current.y = 0xFFFFu;
1302 current.z = 0xFFFFu;
1304 case LOGICALOP_COPY:
1305 ASSERT(false); // Optimized out
1307 case LOGICALOP_COPY_INVERTED:
1308 current.x = ~current.x;
1309 current.y = ~current.y;
1310 current.z = ~current.z;
1312 case LOGICALOP_NOOP:
1313 current.x = pixel.x;
1314 current.y = pixel.y;
1315 current.z = pixel.z;
1317 case LOGICALOP_INVERT:
1318 current.x = ~pixel.x;
1319 current.y = ~pixel.y;
1320 current.z = ~pixel.z;
1323 current.x = pixel.x & current.x;
1324 current.y = pixel.y & current.y;
1325 current.z = pixel.z & current.z;
1327 case LOGICALOP_NAND:
1328 current.x = ~(pixel.x & current.x);
1329 current.y = ~(pixel.y & current.y);
1330 current.z = ~(pixel.z & current.z);
1333 current.x = pixel.x | current.x;
1334 current.y = pixel.y | current.y;
1335 current.z = pixel.z | current.z;
1338 current.x = ~(pixel.x | current.x);
1339 current.y = ~(pixel.y | current.y);
1340 current.z = ~(pixel.z | current.z);
1343 current.x = pixel.x ^ current.x;
1344 current.y = pixel.y ^ current.y;
1345 current.z = pixel.z ^ current.z;
1347 case LOGICALOP_EQUIV:
1348 current.x = ~(pixel.x ^ current.x);
1349 current.y = ~(pixel.y ^ current.y);
1350 current.z = ~(pixel.z ^ current.z);
1352 case LOGICALOP_AND_REVERSE:
1353 current.x = ~pixel.x & current.x;
1354 current.y = ~pixel.y & current.y;
1355 current.z = ~pixel.z & current.z;
1357 case LOGICALOP_AND_INVERTED:
1358 current.x = pixel.x & ~current.x;
1359 current.y = pixel.y & ~current.y;
1360 current.z = pixel.z & ~current.z;
1362 case LOGICALOP_OR_REVERSE:
1363 current.x = ~pixel.x | current.x;
1364 current.y = ~pixel.y | current.y;
1365 current.z = ~pixel.z | current.z;
1367 case LOGICALOP_OR_INVERTED:
1368 current.x = pixel.x | ~current.x;
1369 current.y = pixel.y | ~current.y;
1370 current.z = pixel.z | ~current.z;
// Fixed-point color write: stores the 16-bit-per-channel color in 'current' to render
// target 'index'. Visible stages: optional linear->sRGB conversion, optional rounding
// adjustment, packing into the layout of state.targetFormat[index], then masked
// read-modify-write stores to the row at cBuffer and to the row colorPitchB[index]
// bytes further on.
// NOTE(review): the parameter spelled "Vector4s ¤t" looks like a mis-encoded
// "Vector4s &current" (the body refers to 'current' throughout) - confirm against the
// pristine file.
// NOTE(review): short lines (braces, 'break', 'else', some 'case' labels and short
// statements) are not present in this listing, so branch structure is only partly visible.
1377 void PixelRoutine::writeColor(Registers &r, int index, Pointer<Byte> &cBuffer, Int &x, Vector4s ¤t, Int &sMask, Int &zMask, Int &cMask)
// Convert x/y/z to sRGB when post-blend sRGB writes are enabled
1379 if(postBlendSRGB && state.writeSRGB)
1381 linearToSRGB16_12_16(r, current);
// Optional rounding adjustment applied before precision is reduced below
1384 if(exactColorRounding)
1386 switch(state.targetFormat[index])
// Saturating add of 0x0400 / 0x0200 / 0x0400 (case label for this group not visible here)
1389 current.x = AddSat(As<UShort4>(current.x), UShort4(0x0400));
1390 current.y = AddSat(As<UShort4>(current.y), UShort4(0x0200));
1391 current.z = AddSat(As<UShort4>(current.z), UShort4(0x0400));
// 8-bit-per-channel targets: subtract the value's top byte and add 0x0080 ahead of the later >> 8
1393 case FORMAT_X8G8R8B8Q:
1394 case FORMAT_A8G8R8B8Q:
1395 case FORMAT_X8R8G8B8:
1396 case FORMAT_X8B8G8R8:
1397 case FORMAT_A8R8G8B8:
1398 case FORMAT_A8B8G8R8:
1399 current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
1400 current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
1401 current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
1402 current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
// Channel write mask, plus two re-ordered copies of it:
//   bgraWriteMask keeps bits 1 and 3 and swaps bits 0 and 2;
//   brgaWriteMask keeps bit 3 and moves bit 0 -> 1, bit 1 -> 2, bit 2 -> 0.
1407 int rgbaWriteMask = state.colorWriteActive(index);
1408 int bgraWriteMask = rgbaWriteMask & 0x0000000A | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
1409 int brgaWriteMask = rgbaWriteMask & 0x00000008 | (rgbaWriteMask & 0x00000001) << 1 | (rgbaWriteMask & 0x00000002) << 1 | (rgbaWriteMask & 0x00000004) >> 2;
// Pack the channels into the memory layout of the target format
1411 switch(state.targetFormat[index])
// Keep the top 5 / 6 / 5 bits of x / y / z and merge them into one 16-bit value per pixel
1415 current.x = current.x & Short4(0xF800u);
1416 current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
1417 current.z = As<UShort4>(current.z) >> 11;
1419 current.x = current.x | current.y | current.z;
// The two "Q" formats below only have commented-out packing code
1422 case FORMAT_X8G8R8B8Q:
1424 // current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1425 // current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1426 // current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1428 // current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1429 // current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1431 case FORMAT_A8G8R8B8Q:
1433 // current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1434 // current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1435 // current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1436 // current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1438 // current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1439 // current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1441 case FORMAT_X8R8G8B8:
1442 case FORMAT_A8R8G8B8:
// When the format is X8R8G8B8 or the write mask is exactly 0x7, only x, y and z are
// reduced to 8 bits; y is packed with itself instead of with w
1443 if(state.targetFormat[index] == FORMAT_X8R8G8B8 || rgbaWriteMask == 0x7)
1445 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1446 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1447 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1449 current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1450 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
// Interleave the packed bytes; the results end up in current.z and current.y
1452 current.x = current.z;
1453 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1454 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1455 current.y = current.z;
1456 current.z = As<Short4>(UnpackLow(current.z, current.x));
1457 current.y = As<Short4>(UnpackHigh(current.y, current.x));
// Same sequence with w included (y is packed with w)
1461 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1462 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1463 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1464 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1466 current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1467 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1469 current.x = current.z;
1470 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1471 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1472 current.y = current.z;
1473 current.z = As<Short4>(UnpackLow(current.z, current.x));
1474 current.y = As<Short4>(UnpackHigh(current.y, current.x));
// B8G8R8 layouts: identical to the R8G8B8 handling except that x and z swap places in the first Pack
1477 case FORMAT_X8B8G8R8:
1478 case FORMAT_A8B8G8R8:
1479 if(state.targetFormat[index] == FORMAT_X8B8G8R8 || rgbaWriteMask == 0x7)
1481 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1482 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1483 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1485 current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
1486 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1488 current.x = current.z;
1489 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1490 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1491 current.y = current.z;
1492 current.z = As<Short4>(UnpackLow(current.z, current.x));
1493 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1497 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1498 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1499 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1500 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1502 current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
1503 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1505 current.x = current.z;
1506 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1507 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1508 current.y = current.z;
1509 current.z = As<Short4>(UnpackLow(current.z, current.x));
1510 current.y = As<Short4>(UnpackHigh(current.y, current.x));
// w only: reduce to 8 bits and pack (case label not visible here)
1514 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1515 current.w = As<Short4>(Pack(As<UShort4>(current.w), As<UShort4>(current.w)));
// Interleave x and y 16-bit values (case label not visible here)
1518 current.z = current.x;
1519 current.x = As<Short4>(UnpackLow(current.x, current.y));
1520 current.z = As<Short4>(UnpackHigh(current.z, current.y));
1521 current.y = current.z;
1523 case FORMAT_A16B16G16R16:
1524 transpose4x4(current.x, current.y, current.z, current.w);
// Packed data for the 8-bit-per-channel stores: c01 goes to the first row, c23 to the second
1530 Short4 c01 = current.z;
1531 Short4 c23 = current.y;
// NOTE(review): the assignments to xMask sit on lines not visible here; presumably it is
// built from zMask / cMask / sMask depending on the two conditions below - confirm.
1533 Int xMask; // Combination of all masks
1535 if(state.depthTestActive)
1544 if(state.stencilActive)
// Masked read-modify-write per target format
1549 switch(state.targetFormat[index])
// 2 bytes per pixel: one Int covers two horizontally adjacent pixels of a row
1553 Pointer<Byte> buffer = cBuffer + 2 * x;
1554 Int value = *Pointer<Int>(buffer);
1556 Int c01 = Extract(As<Int2>(current.x), 0);
// Partial channel mask: select channel bits via the mask565Q / invMask565Q tables
1558 if((bgraWriteMask & 0x00000007) != 0x00000007)
1561 c01 &= *Pointer<Int>(r.constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1562 masked &= *Pointer<Int>(r.constants + OFFSET(Constants,invMask565Q[bgraWriteMask & 0x7][0]));
// Per-pixel mask selected by xMask
1566 c01 &= *Pointer<Int>(r.constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
1567 value &= *Pointer<Int>(r.constants + OFFSET(Constants,invMaskW4Q[0][0]) + xMask * 8);
1569 *Pointer<Int>(buffer) = c01;
// Advance to the second row of the quad
1571 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1572 value = *Pointer<Int>(buffer);
1574 Int c23 = Extract(As<Int2>(current.x), 1);
1576 if((bgraWriteMask & 0x00000007) != 0x00000007)
1579 c23 &= *Pointer<Int>(r.constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1580 masked &= *Pointer<Int>(r.constants + OFFSET(Constants,invMask565Q[bgraWriteMask & 0x7][0]));
1584 c23 &= *Pointer<Int>(r.constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
1585 value &= *Pointer<Int>(r.constants + OFFSET(Constants,invMaskW4Q[0][2]) + xMask * 8);
1587 *Pointer<Int>(buffer) = c23;
// The "Q" formats only have commented-out store code
1590 case FORMAT_A8G8R8B8Q:
1591 case FORMAT_X8G8R8B8Q: // FIXME: Don't touch alpha?
1593 // value = *Pointer<Short4>(cBuffer + 8 * x + 0);
1595 // if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1596 // ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1597 // (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh?
1599 // Short4 masked = value;
1600 // c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1601 // masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1605 // c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1606 // value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1608 // *Pointer<Short4>(cBuffer + 8 * x + 0) = c01;
1610 // value = *Pointer<Short4>(cBuffer + 8 * x + 8);
1612 // if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1613 // ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1614 // (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh?
1616 // Short4 masked = value;
1617 // c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1618 // masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1622 // c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1623 // value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1625 // *Pointer<Short4>(cBuffer + 8 * x + 8) = c23;
// 4 bytes per pixel, BGRA-ordered mask: a Short4 covers two adjacent pixels of a row
1627 case FORMAT_A8R8G8B8:
1628 case FORMAT_X8R8G8B8: // FIXME: Don't touch alpha?
1630 Pointer<Byte> buffer = cBuffer + x * 4;
1631 Short4 value = *Pointer<Short4>(buffer);
// Channel masking is needed for A8R8G8B8 unless the mask is 0xF, and for X8R8G8B8
// unless the mask is 0x7 or 0xF
1633 if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1634 ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1635 (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh?
1637 Short4 masked = value;
1638 c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1639 masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
// Per-pixel mask for the first row
1643 c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1644 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1646 *Pointer<Short4>(buffer) = c01;
// Second row
1648 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1649 value = *Pointer<Short4>(buffer);
1651 if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1652 ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1653 (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh?
1655 Short4 masked = value;
1656 c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1657 masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1661 c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1662 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1664 *Pointer<Short4>(buffer) = c23;
// Same store sequence for the B8G8R8 layouts, using the unswizzled rgbaWriteMask
1667 case FORMAT_A8B8G8R8:
1668 case FORMAT_X8B8G8R8: // FIXME: Don't touch alpha?
1670 Pointer<Byte> buffer = cBuffer + x * 4;
1671 Short4 value = *Pointer<Short4>(buffer);
1673 if((state.targetFormat[index] == FORMAT_A8B8G8R8 && rgbaWriteMask != 0x0000000F) ||
1674 ((state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x00000007) &&
1675 (state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x0000000F))) // FIXME: Need for masking when XBGR && Fh?
1677 Short4 masked = value;
1678 c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1679 masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1683 c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1684 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1686 *Pointer<Short4>(buffer) = c01;
1688 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1689 value = *Pointer<Short4>(buffer);
1691 if((state.targetFormat[index] == FORMAT_A8B8G8R8 && rgbaWriteMask != 0x0000000F) ||
1692 ((state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x00000007) &&
1693 (state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x0000000F))) // FIXME: Need for masking when XBGR && Fh?
1695 Short4 masked = value;
1696 c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1697 masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1701 c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1702 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1704 *Pointer<Short4>(buffer) = c23;
// 1 byte per pixel, written only when bit 3 of the write mask is set: gathers a Short
// (two pixels) from each of the two rows, merges under xMask and writes both back
// (case label not visible here)
1708 if(rgbaWriteMask & 0x00000008)
1710 Pointer<Byte> buffer = cBuffer + 1 * x;
1712 Insert(value, *Pointer<Short>(buffer), 0);
1713 Int pitch = *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1714 Insert(value, *Pointer<Short>(buffer + pitch), 1);
1715 value = UnpackLow(As<Byte8>(value), As<Byte8>(value));
1717 current.w &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q) + 8 * xMask);
1718 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q) + 8 * xMask);
1721 *Pointer<Short>(buffer) = Extract(current.w, 0);
1722 *Pointer<Short>(buffer + pitch) = Extract(current.w, 1);
// 4 bytes per pixel with two 16-bit channels; only the low two mask bits apply
// (case label not visible here). current.x is the first row, current.y the second.
1727 Pointer<Byte> buffer = cBuffer + 4 * x;
1729 Short4 value = *Pointer<Short4>(buffer);
1731 if((rgbaWriteMask & 0x00000003) != 0x00000003)
1733 Short4 masked = value;
1734 current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1735 masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0]));
1736 current.x |= masked;
1739 current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1740 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1742 *Pointer<Short4>(buffer) = current.x;
1744 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1746 value = *Pointer<Short4>(buffer);
1748 if((rgbaWriteMask & 0x00000003) != 0x00000003)
1750 Short4 masked = value;
1751 current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1752 masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0]));
1753 current.y |= masked;
1756 current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1757 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1759 *Pointer<Short4>(buffer) = current.y;
// 8 bytes per pixel: after the transpose, x / y / z / w each hold one whole pixel.
// x and y go to the first row (offsets 0 and 8), z and w to the second row.
1762 case FORMAT_A16B16G16R16:
1764 Pointer<Byte> buffer = cBuffer + 8 * x;
1767 Short4 value = *Pointer<Short4>(buffer);
1769 if(rgbaWriteMask != 0x0000000F)
1771 Short4 masked = value;
1772 current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1773 masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1774 current.x |= masked;
1777 current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
1778 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
1780 *Pointer<Short4>(buffer) = current.x;
1784 Short4 value = *Pointer<Short4>(buffer + 8);
1786 if(rgbaWriteMask != 0x0000000F)
1788 Short4 masked = value;
1789 current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1790 masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1791 current.y |= masked;
1794 current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
1795 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
1797 *Pointer<Short4>(buffer + 8) = current.y;
// Second row
1800 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1803 Short4 value = *Pointer<Short4>(buffer);
1805 if(rgbaWriteMask != 0x0000000F)
1807 Short4 masked = value;
1808 current.z &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1809 masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1810 current.z |= masked;
1813 current.z &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
1814 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
1816 *Pointer<Short4>(buffer) = current.z;
1820 Short4 value = *Pointer<Short4>(buffer + 8);
1822 if(rgbaWriteMask != 0x0000000F)
1824 Short4 masked = value;
1825 current.w &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1826 masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1827 current.w |= masked;
1830 current.w &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
1831 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
1833 *Pointer<Short4>(buffer + 8) = current.w;
// Produces the x / y / z components of a floating-point blend factor.
//   r                 - register set; r.data supplies the blend constants.
//   blendFactor       - receives the factor in .x/.y/.z (.w is not written by this function).
//   oC                - source color (multiplied by the source factor in alphaBlend()).
//   pixel             - destination color read from the render target.
//   blendFactorActive - selects which factor is produced.
// NOTE(review): several short 'case' labels and all 'break' lines are not present in this
// listing; groups without a visible label are described by what they assign.
1842 void PixelRoutine::blendFactor(Registers &r, const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorActive)
1844 switch(blendFactorActive)
// Factor = source color (label not visible here)
1853 blendFactor.x = oC.x;
1854 blendFactor.y = oC.y;
1855 blendFactor.z = oC.z;
// Factor = 1 - source color
1857 case BLEND_INVSOURCE:
1858 blendFactor.x = Float4(1.0f) - oC.x;
1859 blendFactor.y = Float4(1.0f) - oC.y;
1860 blendFactor.z = Float4(1.0f) - oC.z;
// Factor = destination color (label not visible here)
1863 blendFactor.x = pixel.x;
1864 blendFactor.y = pixel.y;
1865 blendFactor.z = pixel.z;
// Factor = 1 - destination color (label not visible here)
1868 blendFactor.x = Float4(1.0f) - pixel.x;
1869 blendFactor.y = Float4(1.0f) - pixel.y;
1870 blendFactor.z = Float4(1.0f) - pixel.z;
// Factor = source alpha, replicated to all three channels
1872 case BLEND_SOURCEALPHA:
1873 blendFactor.x = oC.w;
1874 blendFactor.y = oC.w;
1875 blendFactor.z = oC.w;
1877 case BLEND_INVSOURCEALPHA:
1878 blendFactor.x = Float4(1.0f) - oC.w;
1879 blendFactor.y = Float4(1.0f) - oC.w;
1880 blendFactor.z = Float4(1.0f) - oC.w;
// Factor = destination alpha, replicated to all three channels
1882 case BLEND_DESTALPHA:
1883 blendFactor.x = pixel.w;
1884 blendFactor.y = pixel.w;
1885 blendFactor.z = pixel.w;
1887 case BLEND_INVDESTALPHA:
1888 blendFactor.x = Float4(1.0f) - pixel.w;
1889 blendFactor.y = Float4(1.0f) - pixel.w;
1890 blendFactor.z = Float4(1.0f) - pixel.w;
// Factor = min(source alpha, 1 - destination alpha), replicated to all three channels
1892 case BLEND_SRCALPHASAT:
1893 blendFactor.x = Float4(1.0f) - pixel.w;
1894 blendFactor.x = Min(blendFactor.x, oC.w);
1895 blendFactor.y = blendFactor.x;
1896 blendFactor.z = blendFactor.x;
// Factor = blend constant, loaded per channel from the draw data
1898 case BLEND_CONSTANT:
1899 blendFactor.x = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[0]));
1900 blendFactor.y = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[1]));
1901 blendFactor.z = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[2]));
// Factor = inverse blend constant, loaded from a separate table in the draw data
1903 case BLEND_INVCONSTANT:
1904 blendFactor.x = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
1905 blendFactor.y = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
1906 blendFactor.z = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
// Produces the w (alpha) component of a floating-point blend factor.
//   r                      - register set; r.data supplies the blend constants.
//   blendFactor            - receives the factor in .w only.
//   oC                     - source color; only oC.w is read.
//   pixel                  - destination color; only pixel.w is read.
//   blendFactorAlphaActive - selects which factor is produced.
// Color and alpha variants of the same factor read the same w input here, e.g.
// BLEND_INVSOURCE and BLEND_INVSOURCEALPHA both yield 1 - oC.w.
// NOTE(review): several short 'case' labels and all 'break' lines are not present in this
// listing; groups without a visible label are described by what they assign.
1913 void PixelRoutine::blendFactorAlpha(Registers &r, const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorAlphaActive)
1915 switch(blendFactorAlphaActive)
// Factor = source alpha (label not visible here)
1924 blendFactor.w = oC.w;
1926 case BLEND_INVSOURCE:
1927 blendFactor.w = Float4(1.0f) - oC.w;
// Factor = destination alpha (label not visible here)
1930 blendFactor.w = pixel.w;
// Factor = 1 - destination alpha (label not visible here)
1933 blendFactor.w = Float4(1.0f) - pixel.w;
1935 case BLEND_SOURCEALPHA:
1936 blendFactor.w = oC.w;
1938 case BLEND_INVSOURCEALPHA:
1939 blendFactor.w = Float4(1.0f) - oC.w;
1941 case BLEND_DESTALPHA:
1942 blendFactor.w = pixel.w;
1944 case BLEND_INVDESTALPHA:
1945 blendFactor.w = Float4(1.0f) - pixel.w;
// The alpha factor for source-alpha-saturate is the constant 1.0
1947 case BLEND_SRCALPHASAT:
1948 blendFactor.w = Float4(1.0f);
1950 case BLEND_CONSTANT:
1951 blendFactor.w = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[3]));
1953 case BLEND_INVCONSTANT:
1954 blendFactor.w = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
// Floating-point alpha blending: reads the destination color of the quad from render
// target 'index', scales source (oC) and destination (pixel) by their blend factors and
// combines them with state.blendOperation (x/y/z) and state.blendOperationAlpha (w).
// The result is left in oC; nothing is written to the color buffer here.
// NOTE(review): short lines (braces, 'break', 'return', some 'case' labels and short
// statements such as the add/subtract bodies) are not present in this listing.
1961 void PixelRoutine::alphaBlend(Registers &r, int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x)
// Nothing to do when blending is disabled
1963 if(!state.alphaBlendActive)
1968 Pointer<Byte> buffer;
// Read the destination color, laid out per target format
1975 switch(state.targetFormat[index])
// One 32-bit float per pixel (label not visible here): gather two pixels from each of
// the two rows into pixel.x; the remaining channels are set to 1.0
1980 pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0);
1981 pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4);
1982 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1984 pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0);
1985 pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4);
1986 pixel.y = Float4(1.0f);
1987 pixel.z = Float4(1.0f);
1988 pixel.w = Float4(1.0f);
// Two floats per pixel: load one Float4 per row, then shuffle to separate the channels
1990 case FORMAT_G32R32F:
1992 pixel.x = *Pointer<Float4>(buffer + 8 * x, 16);
1993 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1994 pixel.y = *Pointer<Float4>(buffer + 8 * x, 16);
1996 pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x88);
1997 pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0xDD);
1999 pixel.z = Float4(1.0f);
2000 pixel.w = Float4(1.0f);
// Four floats per pixel: load the four pixels, then transpose into per-channel vectors
2002 case FORMAT_A32B32G32R32F:
2004 pixel.x = *Pointer<Float4>(buffer + 16 * x, 16);
2005 pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2006 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
2007 pixel.z = *Pointer<Float4>(buffer + 16 * x, 16);
2008 pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2009 transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
// NOTE(review): sRGBtoLinear() takes its argument by const reference and returns the
// converted value; the three calls below discard that result, so pixel.x/y/z appear to
// stay unconverted. Confirm, and consider assigning the return values back.
2015 if(postBlendSRGB && state.writeSRGB)
2017 sRGBtoLinear(pixel.x);
2018 sRGBtoLinear(pixel.y);
2019 sRGBtoLinear(pixel.z);
2022 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
2023 Vector4f sourceFactor;
2024 Vector4f destFactor;
2026 blendFactor(r, sourceFactor, oC, pixel, state.sourceBlendFactor);
2027 blendFactor(r, destFactor, oC, pixel, state.destBlendFactor);
// Multiplication is skipped for the ONE and ZERO factors
2029 if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
2031 oC.x *= sourceFactor.x;
2032 oC.y *= sourceFactor.y;
2033 oC.z *= sourceFactor.z;
2036 if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
2038 pixel.x *= destFactor.x;
2039 pixel.y *= destFactor.y;
2040 pixel.z *= destFactor.z;
// Combine the scaled source and destination colors (x/y/z)
2043 switch(state.blendOperation)
// Reverse subtract: destination - source
2055 case BLENDOP_INVSUB:
2056 oC.x = pixel.x - oC.x;
2057 oC.y = pixel.y - oC.y;
2058 oC.z = pixel.z - oC.z;
// Component-wise minimum (label not visible here)
2061 oC.x = Min(oC.x, pixel.x);
2062 oC.y = Min(oC.y, pixel.y);
2063 oC.z = Min(oC.z, pixel.z);
// Component-wise maximum (label not visible here)
2066 oC.x = Max(oC.x, pixel.x);
2067 oC.y = Max(oC.y, pixel.y);
2068 oC.z = Max(oC.z, pixel.z);
2070 case BLENDOP_SOURCE:
// Result forced to zero (label not visible here)
2079 oC.x = Float4(0.0f);
2080 oC.y = Float4(0.0f);
2081 oC.z = Float4(0.0f);
// Alpha channel: same scheme with the separate alpha factors and operation
2087 blendFactorAlpha(r, sourceFactor, oC, pixel, state.sourceBlendFactorAlpha);
2088 blendFactorAlpha(r, destFactor, oC, pixel, state.destBlendFactorAlpha);
2090 if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
2092 oC.w *= sourceFactor.w;
2095 if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
2097 pixel.w *= destFactor.w;
2100 switch(state.blendOperationAlpha)
2108 case BLENDOP_INVSUB:
2113 oC.w = Min(oC.w, pixel.w);
2116 oC.w = Max(oC.w, pixel.w);
2118 case BLENDOP_SOURCE:
2125 oC.w = Float4(0.0f);
// Floating-point color write: stores oC to render target 'index' for one quad.
// The channels are first re-arranged into the memory layout of the target format, then
// merged with the existing buffer contents under the channel write mask (rgbaWriteMask)
// and the per-pixel mask (xMask), and written to the row at cBuffer and to the row
// colorPitchB[index] bytes further on.
// NOTE(review): short lines (braces, 'break', 'else', some 'case' labels and short
// statements, including the assignments to xMask and some declarations) are not present
// in this listing.
2132 void PixelRoutine::writeColor(Registers &r, int index, Pointer<Byte> &cBuffer, Int &x, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask)
// Re-arrange channels into the target layout
2134 switch(state.targetFormat[index])
// Two-channel format: interleave x and y
2138 case FORMAT_G32R32F:
2140 oC.x = UnpackLow(oC.x, oC.y);
2141 oC.z = UnpackHigh(oC.z, oC.y);
// Four-channel format: transpose so that each of x / y / z / w holds one whole pixel
2144 case FORMAT_A32B32G32R32F:
2145 transpose4x4(oC.x, oC.y, oC.z, oC.w);
2151 int rgbaWriteMask = state.colorWriteActive(index);
2153 Int xMask; // Combination of all masks
2155 if(state.depthTestActive)
2164 if(state.stencilActive)
2169 Pointer<Byte> buffer;
// Masked read-modify-write per target format
2172 switch(state.targetFormat[index])
// One float per pixel (label not visible here); written only when mask bit 0 is set.
// Both rows are read into 'value', merged with oC.x under xMask, then written back
// second row first, followed by the first row after stepping 'buffer' back.
2175 if(rgbaWriteMask & 0x00000001)
2177 buffer = cBuffer + 4 * x;
2180 value.x = *Pointer<Float>(buffer + 0);
2181 value.y = *Pointer<Float>(buffer + 4);
2183 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
2186 value.z = *Pointer<Float>(buffer + 0);
2187 value.w = *Pointer<Float>(buffer + 4);
2189 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X) + xMask * 16, 16));
2190 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16));
2191 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2194 *Pointer<Float>(buffer + 0) = oC.x.z;
2195 *Pointer<Float>(buffer + 4) = oC.x.w;
2197 buffer -= *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
2200 *Pointer<Float>(buffer + 0) = oC.x.x;
2201 *Pointer<Float>(buffer + 4) = oC.x.y;
// Two floats per pixel: oC.x is the first row, oC.y the second; only the low two
// bits of the channel mask apply
2204 case FORMAT_G32R32F:
2205 buffer = cBuffer + 8 * x;
2207 value = *Pointer<Float4>(buffer);
2209 if((rgbaWriteMask & 0x00000003) != 0x00000003)
2211 Float4 masked = value;
2212 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2213 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD01X[rgbaWriteMask & 0x3][0])));
2214 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2217 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16));
2218 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16));
2219 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2220 *Pointer<Float4>(buffer) = oC.x;
// Second row
2222 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
2224 value = *Pointer<Float4>(buffer);
2226 if((rgbaWriteMask & 0x00000003) != 0x00000003)
2231 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2232 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD01X[rgbaWriteMask & 0x3][0])));
2233 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2236 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16));
2237 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16));
2238 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2239 *Pointer<Float4>(buffer) = oC.y;
// Four floats per pixel: x and y go to the first row (offsets 0 and 16), z and w to the
// second row. Each pixel is merged under the channel mask, then under its xMask entry.
2241 case FORMAT_A32B32G32R32F:
2242 buffer = cBuffer + 16 * x;
2245 value = *Pointer<Float4>(buffer, 16);
2247 if(rgbaWriteMask != 0x0000000F)
2249 Float4 masked = value;
2250 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2251 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2252 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2255 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskX0X) + xMask * 16, 16));
2256 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16));
2257 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2258 *Pointer<Float4>(buffer, 16) = oC.x;
2262 value = *Pointer<Float4>(buffer + 16, 16);
2264 if(rgbaWriteMask != 0x0000000F)
2266 Float4 masked = value;
2267 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2268 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2269 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2272 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskX1X) + xMask * 16, 16));
2273 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16));
2274 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2275 *Pointer<Float4>(buffer + 16, 16) = oC.y;
// Second row
2278 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
2281 value = *Pointer<Float4>(buffer, 16);
2283 if(rgbaWriteMask != 0x0000000F)
2285 Float4 masked = value;
2286 oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2287 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2288 oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked));
2291 oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskX2X) + xMask * 16, 16));
2292 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16));
2293 oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value));
2294 *Pointer<Float4>(buffer, 16) = oC.z;
2298 value = *Pointer<Float4>(buffer + 16, 16);
2300 if(rgbaWriteMask != 0x0000000F)
2302 Float4 masked = value;
2303 oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2304 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2305 oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked));
2308 oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskX3X) + xMask * 16, 16));
2309 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16));
2310 oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value));
2311 *Pointer<Float4>(buffer + 16, 16) = oC.w;
2319 UShort4 PixelRoutine::convertFixed16(Float4 &cf, bool saturate)
2321 return UShort4(cf * Float4(0xFFFF), saturate);
2324 void PixelRoutine::sRGBtoLinear16_12_16(Registers &r, Vector4s &c)
2326 c.x = As<UShort4>(c.x) >> 4;
2327 c.y = As<UShort4>(c.y) >> 4;
2328 c.z = As<UShort4>(c.z) >> 4;
2330 sRGBtoLinear12_16(r, c);
2333 void PixelRoutine::sRGBtoLinear12_16(Registers &r, Vector4s &c)
2335 Pointer<Byte> LUT = r.constants + OFFSET(Constants,sRGBtoLinear12_16);
2337 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2338 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2339 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2340 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2342 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2343 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2344 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2345 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2347 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2348 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2349 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2350 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2353 void PixelRoutine::linearToSRGB16_12_16(Registers &r, Vector4s &c)
2355 c.x = As<UShort4>(c.x) >> 4;
2356 c.y = As<UShort4>(c.y) >> 4;
2357 c.z = As<UShort4>(c.z) >> 4;
2359 linearToSRGB12_16(r, c);
2362 void PixelRoutine::linearToSRGB12_16(Registers &r, Vector4s &c)
2364 Pointer<Byte> LUT = r.constants + OFFSET(Constants,linearToSRGB12_16);
2366 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2367 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2368 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2369 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2371 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2372 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2373 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2374 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2376 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2377 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2378 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2379 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2382 Float4 PixelRoutine::sRGBtoLinear(const Float4 &x) // Approximates x^2.2
2384 Float4 linear = x * x;
2385 linear = linear * Float4(0.73f) + linear * x * Float4(0.27f);
2387 return Min(Max(linear, Float4(0.0f)), Float4(1.0f));
2390 bool PixelRoutine::colorUsed()
2392 return state.colorWriteMask || state.alphaTestActive() || state.shaderContainsKill;