1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 #include "PixelRoutine.hpp"
17 #include "Renderer.hpp"
18 #include "QuadRasterizer.hpp"
19 #include "Surface.hpp"
20 #include "Primitive.hpp"
21 #include "SamplerCore.hpp"
22 #include "Constants.hpp"
// Global configuration switches, declared here and defined in another translation unit.
// Within this listing: complementaryDepthBuffer is read by depthTest(), pixelFog() and writeDepth();
// postBlendSRGB by readPixel() and writeColor(); exactColorRounding by writeColor();
// forceClearRegisters by the PixelRoutine constructor.
// NOTE(review): the line numbers embedded in this listing skip values (e.g. 22 -> 27), so blank and
// brace-only lines appear to have been dropped during extraction — confirm against the real file.
27 extern bool complementaryDepthBuffer;
28 extern bool postBlendSRGB;
29 extern bool exactColorRounding;
30 extern bool forceClearRegisters;
// Constructs the routine on top of QuadRasterizer(state, shader). The input register file 'v' is
// constructed with a flag that is true only when a shader is present and it reports
// dynamicallyIndexedInput.
32 PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader) : QuadRasterizer(state, shader), v(shader && shader->dynamicallyIndexedInput)
// Clear the inputs when there is no shader, the shader version is below 0x0200,
// or the global forceClearRegisters switch is set.
34 if(!shader || shader->getVersion() < 0x0200 || forceClearRegisters)
36 for(int i = 0; i < MAX_FRAGMENT_INPUTS; i++)
// All four components of every input are set to 0.0f.
38 v[i].x = Float4(0.0f);
39 v[i].y = Float4(0.0f);
40 v[i].z = Float4(0.0f);
41 v[i].w = Float4(0.0f);
// Destructor. Iterates i over [0, TEXTURE_IMAGE_UNITS).
// NOTE(review): presumably releases the sampler[i] objects that quad() allocates with 'new';
// the loop body is not visible in this listing — confirm.
46 PixelRoutine::~PixelRoutine()
48 for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
// Emits the per-quad pixel pipeline: stencil test, depth interpolation and depth test,
// interpolation of the shader inputs, alpha test, then depth/color/stencil writes.
// cBuffer/zBuffer/sBuffer are the color, depth and stencil buffer pointers, cMask holds one
// coverage mask per sample (indexed by q below), and x/y are the quad coordinates.
54 void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
// Start of the whole-pipeline timing; the elapsed ticks are added to cycles[PERF_PIPE] at the end.
57 Long pipeTime = Ticks();
// One SamplerCore is allocated per texture image unit, configured from state.sampler[i].
60 for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
62 sampler[i] = new SamplerCore(constants, state.sampler[i]);
// True only when neither a depth override nor the alpha test is active.
65 const bool earlyDepthTest = !state.depthOverride && !state.alphaTestActive();
67 Int zMask[4]; // Depth mask
68 Int sMask[4]; // Stencil mask
70 for(unsigned int q = 0; q < state.multiSample; q++)
// Per-sample stencil test: fills sMask[q] from the stencil buffer and cMask[q].
76 for(unsigned int q = 0; q < state.multiSample; q++)
78 stencilTest(sBuffer, q, x, sMask[q], cMask[q]);
// Quad x coordinates: the scalar x broadcast plus the primitive's xQuad offsets.
84 Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16);
// Per-sample depth interpolation.
88 for(unsigned int q = 0; q < state.multiSample; q++)
// With multisampling, a per-sample entry of the Constants::X table is subtracted before interpolating.
92 if(state.multiSample > 1)
94 x -= *Pointer<Float4>(constants + OFFSET(Constants,X) + q * sizeof(float4));
97 z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive,z), false, false);
101 Bool depthPass = false;
// depthPass becomes true if the depth test passes for any sample.
105 for(unsigned int q = 0; q < state.multiSample; q++)
107 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
// Continue when a sample passed the depth test, or unconditionally when early depth testing is not used.
111 If(depthPass || Bool(!earlyDepthTest))
114 Long interpTime = Ticks();
// Quad y coordinates: the scalar y broadcast plus the primitive's yQuad offsets.
117 Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16);
119 // Centroid locations
120 Float4 XXXX = Float4(0.0f);
121 Float4 YYYY = Float4(0.0f);
// Weight accumulator starts at a tiny non-zero value
// (presumably so a later division cannot be by zero — the division is not visible here, confirm).
125 Float4 WWWW(1.0e-9f);
// Accumulate per-sample table entries selected by each sample's coverage mask.
127 for(unsigned int q = 0; q < state.multiSample; q++)
129 XXXX += *Pointer<Float4>(constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]);
130 YYYY += *Pointer<Float4>(constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]);
131 WWWW += *Pointer<Float4>(constants + OFFSET(Constants,weight) + 16 * cMask[q]);
// Interpolate w from the primitive's w plane equation, then take its reciprocal as rhw.
144 w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive,w), false, false);
145 rhw = reciprocal(w, false, false, true);
// Same quantity evaluated at the centroid location.
149 rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive,w), false, false));
// Interpolate every input component whose bit is set in state.interpolant[].component.
153 for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
155 for(int component = 0; component < 4; component++)
157 if(state.interpolant[interpolant].component & (1 << component))
// Non-centroid inputs are evaluated at the quad position, centroid inputs at (XXXX, YYYY).
// The per-component 'flat' bit and state.perspective are forwarded to the interpolator.
159 if(!state.interpolant[interpolant].centroid)
161 v[interpolant][component] = interpolate(xxxx, Dv[interpolant][component], rhw, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
165 v[interpolant][component] = interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
// Optional projection: the leading components are multiplied by the reciprocal of y, z or w
// (the case labels selecting between the three variants are not visible in this listing).
172 switch(state.interpolant[interpolant].project)
177 rcp = reciprocal(v[interpolant].y);
178 v[interpolant].x = v[interpolant].x * rcp;
181 rcp = reciprocal(v[interpolant].z);
182 v[interpolant].x = v[interpolant].x * rcp;
183 v[interpolant].y = v[interpolant].y * rcp;
186 rcp = reciprocal(v[interpolant].w);
187 v[interpolant].x = v[interpolant].x * rcp;
188 v[interpolant].y = v[interpolant].y * rcp;
189 v[interpolant].z = v[interpolant].z * rcp;
// Interpolate the fog factor f when the fog component is enabled.
194 if(state.fog.component)
196 f = interpolate(xxxx, Df, rhw, primitive + OFFSET(Primitive,f), state.fog.flat & 0x01, state.perspective);
199 setBuiltins(x, y, z, w);
202 cycles[PERF_INTERP] += Ticks() - interpTime;
205 Bool alphaPass = true;
210 Long shaderTime = Ticks();
216 cycles[PERF_SHADER] += Ticks() - shaderTime;
219 alphaPass = alphaTest(cMask);
// If the shader can kill pixels or the alpha test is active, restrict the depth and stencil
// masks to the coverage mask.
221 if((shader && shader->containsKill()) || state.alphaTestActive())
223 for(unsigned int q = 0; q < state.multiSample; q++)
225 zMask[q] &= cMask[q];
226 sMask[q] &= cMask[q];
// Second depth-test site (the condition guarding it is not visible in this listing).
235 for(unsigned int q = 0; q < state.multiSample; q++)
237 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
242 Long ropTime = Ticks();
245 If(depthPass || Bool(earlyDepthTest))
// Depth write and occlusion counting for every sample enabled in state.multiSampleMask.
247 for(unsigned int q = 0; q < state.multiSample; q++)
249 if(state.multiSampleMask & (1 << q))
251 writeDepth(zBuffer, q, x, z[q], zMask[q]);
253 if(state.occlusionEnabled)
// Adds a table entry indexed by the combined depth & stencil mask to the occlusion counter.
255 occlusion += *Pointer<UInt>(constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q]));
263 AddAtomic(Pointer<Long>(&profiler.ropOperations), 4);
266 rasterOperation(f, cBuffer, x, sMask, zMask, cMask);
271 cycles[PERF_ROP] += Ticks() - ropTime;
// Stencil write for every sample enabled in state.multiSampleMask.
276 for(unsigned int q = 0; q < state.multiSample; q++)
278 if(state.multiSampleMask & (1 << q))
280 writeStencil(sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
285 cycles[PERF_PIPE] += Ticks() - pipeTime;
// Evaluates a plane equation at the given (x, y) position: starts from the C term and adds
// x * A + y * B. How 'flat', 'perspective' and 'rhw' affect the result is not visible in this listing.
289 Float4 PixelRoutine::interpolateCentroid(Float4 &x, Float4 &y, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective)
291 Float4 interpolant = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,C), 16);
295 interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,A), 16) +
296 y * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,B), 16);
// Stencil test for sample q of the quad at column x. Reads the stencil buffer, compares it using
// the configured compare mode(s) and writes the resulting mask, restricted to cMask, into sMask.
307 void PixelRoutine::stencilTest(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask)
// The handling for an inactive stencil test is not visible in this listing.
309 if(!state.stencilActive)
314 // (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
// Address of the quad's stencil values: 2 bytes per x step.
316 Pointer<Byte> buffer = sBuffer + 2 * x;
// Sample q is offset by q times the stencil slice size in bytes.
320 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
323 Byte8 value = *Pointer<Byte8>(buffer);
// Unmodified copy of the buffer contents for the second face (stencil[1]).
324 Byte8 valueCCW = value;
// Apply the test mask of stencil[0] unless the state says no mask is needed.
326 if(!state.noStencilMask)
328 value &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].testMaskQ));
331 stencilTest(value, state.stencilCompareMode, false);
// Two-sided stencil: run the same test with the stencil[1] parameters, then keep each
// result only where the primitive's clockwiseMask / invClockwiseMask selects it.
333 if(state.twoSidedStencil)
335 if(!state.noStencilMaskCCW)
337 valueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].testMaskQ));
340 stencilTest(valueCCW, state.stencilCompareModeCCW, true);
342 value &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
343 valueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
// Collapse the per-byte result to a bit mask and restrict it to the covered pixels.
347 sMask = SignMask(value) & cMask;
// Replaces 'value' in place with the per-byte result of comparing it against the reference of
// stencil[CCW] using stencilCompareMode. Several case labels and the 'equal' declaration are
// not visible in this listing.
// The ordered comparisons add 0x80 to each byte and then use a signed compare against
// referenceMaskedSignedQ (presumably the reference biased the same way — confirm against DrawData).
350 void PixelRoutine::stencilTest(Byte8 &value, StencilCompareMode stencilCompareMode, bool CCW)
354 switch(stencilCompareMode)
// All bytes set.
357 value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
// All bytes cleared.
360 value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
362 case STENCIL_LESS: // a < b ~ b > a
363 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
364 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
// Equality compare against the masked reference.
367 value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
369 case STENCIL_NOTEQUAL: // a != b ~ !(a == b)
370 value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
// Invert the equality result.
371 value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
373 case STENCIL_LESSEQUAL: // a <= b ~ (b > a) || (a == b)
375 equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
376 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
377 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
380 case STENCIL_GREATER: // a > b
// Here the reference is the left operand of the signed greater-than compare.
381 equal = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ));
382 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
383 equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
386 case STENCIL_GREATEREQUAL: // a >= b ~ !(a < b) ~ !(b > a)
387 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
388 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
389 value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
// Depth test for sample q of the quad at column x. Compares the candidate depth against the
// depth buffer using state.depthCompareMode and stores the passing mask, restricted to cMask,
// in zMask. Returns a Bool (the return statements are not visible in this listing).
396 Bool PixelRoutine::depthTest(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask)
// The handling for an inactive depth test is not visible in this listing.
398 if(!state.depthTestActive)
// When the shader overrides depth, the candidate comes from oDepth; with a complementary
// depth buffer it is stored as 1 - oDepth.
405 if(shader && shader->depthOverride())
407 if(complementaryDepthBuffer)
409 Z = Float4(1.0f) - oDepth;
417 Pointer<Byte> buffer;
// Row layout: 4 bytes per x step, with the row pitch read from DrawData::depthPitchB.
// Quad layout: 8 bytes per x step.
420 if(!state.quadLayoutDepthBuffer)
422 buffer = zBuffer + 4 * x;
423 pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
427 buffer = zBuffer + 8 * x;
// Sample q is offset by q times the depth slice size in bytes.
432 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
// NOTE(review): as written this condition is false only for DEPTH_NEVER with depth writes
// enabled; every other mode, including DEPTH_ALWAYS, loads the buffer. Confirm that is intended.
437 if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
439 if(!state.quadLayoutDepthBuffer)
441 // FIXME: Properly optimizes?
// First row goes into .xy; loading 16 bytes from (buffer + pitch - 8) places the two floats
// at buffer + pitch (the second row) into .zw.
442 zValue.xy = *Pointer<Float4>(buffer);
443 zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
// Quad layout: all four values are contiguous and read with a 16-byte aligned load.
447 zValue = *Pointer<Float4>(buffer, 16);
// Each ordered compare has two forms; the complementaryDepthBuffer branch uses the comparison
// with the opposite direction. Some case labels are not visible in this listing.
453 switch(state.depthCompareMode)
462 zTest = CmpEQ(zValue, Z);
465 zTest = CmpNEQ(zValue, Z);
468 if(complementaryDepthBuffer)
470 zTest = CmpLT(zValue, Z);
474 zTest = CmpNLE(zValue, Z);
477 case DEPTH_GREATEREQUAL:
478 if(complementaryDepthBuffer)
480 zTest = CmpNLT(zValue, Z);
484 zTest = CmpLE(zValue, Z);
487 case DEPTH_LESSEQUAL:
488 if(complementaryDepthBuffer)
490 zTest = CmpLE(zValue, Z);
494 zTest = CmpNLT(zValue, Z);
498 if(complementaryDepthBuffer)
500 zTest = CmpNLE(zValue, Z);
504 zTest = CmpLT(zValue, Z);
511 switch(state.depthCompareMode)
// Collapse the compare result to a bit mask and restrict it to the covered pixels.
520 zMask = SignMask(zTest) & cMask;
// The stencil-dependent adjustment that follows is not visible in this listing.
524 if(state.stencilActive)
// Alpha test on 16-bit alpha values: compares 'alpha' against DrawData::factor.alphaReference4
// according to state.alphaCompareMode and writes the resulting bit mask to aMask.
// Every visible case builds the mask as SignMask(Pack(cmp, 0)).
532 void PixelRoutine::alphaTest(Int &aMask, Short4 &alpha)
537 switch(state.alphaCompareMode)
// Equality compare (case label not visible in this listing).
546 cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
547 aMask = SignMask(Pack(cmp, Short4(0x0000)));
549 case ALPHA_NOTEQUAL: // a != b ~ !(a == b)
550 cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu); // FIXME
551 aMask = SignMask(Pack(cmp, Short4(0x0000)));
553 case ALPHA_LESS: // a < b ~ b > a
554 cmp = CmpGT(*Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)), alpha);
555 aMask = SignMask(Pack(cmp, Short4(0x0000)));
557 case ALPHA_GREATEREQUAL: // a >= b ~ (a > b) || (a == b) ~ !(b > a) // TODO: Approximate
// Both the equality and the greater-than results are computed; the line combining them
// is not visible in this listing.
558 equal = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
559 cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
561 aMask = SignMask(Pack(cmp, Short4(0x0000)));
563 case ALPHA_LESSEQUAL: // a <= b ~ !(a > b)
564 cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu); // FIXME
565 aMask = SignMask(Pack(cmp, Short4(0x0000)));
567 case ALPHA_GREATER: // a > b
568 cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
569 aMask = SignMask(Pack(cmp, Short4(0x0000)));
// Alpha-to-coverage: compares 'alpha' against four thresholds (DrawData::a2c0 .. a2c3) using
// "not less than" and converts each result to a bit mask. How the masks are applied to cMask
// is not visible in this listing.
576 void PixelRoutine::alphaToCoverage(Int cMask[4], Float4 &alpha)
578 Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c0)));
579 Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c1)));
580 Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c2)));
581 Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c3)));
// One bit mask per threshold.
583 Int aMask0 = SignMask(coverage0);
584 Int aMask1 = SignMask(coverage1);
585 Int aMask2 = SignMask(coverage2);
586 Int aMask3 = SignMask(coverage3);
// Applies fog to the RGB components of c0 using the fog factor 'fog' and the fog color in
// DrawData::fog.colorF. The alpha component c0.w is not touched by the visible lines.
594 void PixelRoutine::fogBlend(Vector4f &c0, Float4 &fog)
// With per-pixel fog enabled the factor is clamped to [0, 1].
601 if(state.pixelFogMode != FOG_NONE)
605 fog = Min(fog, Float4(1.0f));
606 fog = Max(fog, Float4(0.0f));
// The fog color is subtracted here and added back below; the lines between the two steps
// are not visible in this listing.
609 c0.x -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
610 c0.y -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
611 c0.z -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
617 c0.x += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
618 c0.y += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
619 c0.z += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
// Computes the per-pixel fog factor into 'visibility' according to state.pixelFogMode.
622 void PixelRoutine::pixelFog(Float4 &visibility)
// 'zw' is an alias for the output, so every update below writes 'visibility' directly.
624 Float4 &zw = visibility;
626 if(state.pixelFogMode != FOG_NONE)
// With a complementary depth buffer the input is 1 - z[0].
634 if(complementaryDepthBuffer)
636 zw = Float4(1.0f) - z[0];
// The case labels of this switch are not visible in this listing. The three visible variants:
// scale and offset; multiply by fog.densityE then exponential2; multiply by fog.density2E
// then exponential2.
645 switch(state.pixelFogMode)
650 zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.scale));
651 zw += *Pointer<Float4>(data + OFFSET(DrawData,fog.offset));
654 zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.densityE));
655 zw = exponential2(zw, true);
659 zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.density2E));
660 zw = exponential2(zw, true);
// Writes the depth of sample q for the quad at column x, updating only the pixels selected by
// zMask and preserving the existing buffer contents for the others.
667 void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask)
// The handling for disabled depth writes is not visible in this listing.
669 if(!state.depthWriteEnable)
// When the shader overrides depth, the value comes from oDepth (1 - oDepth for a
// complementary depth buffer).
676 if(shader && shader->depthOverride())
678 if(complementaryDepthBuffer)
680 Z = Float4(1.0f) - oDepth;
688 Pointer<Byte> buffer;
// Row layout: 4 bytes per x step plus the pitch from DrawData::depthPitchB.
// Quad layout: 8 bytes per x step.
691 if(!state.quadLayoutDepthBuffer)
693 buffer = zBuffer + 4 * x;
694 pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
698 buffer = zBuffer + 8 * x;
// Sample q is offset by q times the depth slice size in bytes.
703 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
// NOTE(review): same condition as in depthTest(); zValue is consumed by the masked merge below,
// so confirm it is always loaded on the paths that reach that merge.
708 if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
710 if(!state.quadLayoutDepthBuffer)
712 // FIXME: Properly optimizes?
713 zValue.xy = *Pointer<Float4>(buffer);
714 zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
718 zValue = *Pointer<Float4>(buffer, 16);
// Masked merge: keep the new Z where maskD4X[zMask] selects it, keep the old buffer value where
// invMaskD4X[zMask] selects it, then combine both.
722 Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + zMask * 16, 16));
723 zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16));
724 Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
// Store: two floats per row for the row layout, one aligned 16-byte store for the quad layout.
726 if(!state.quadLayoutDepthBuffer)
728 // FIXME: Properly optimizes?
729 *Pointer<Float2>(buffer) = Float2(Z.xy);
730 *Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
734 *Pointer<Float4>(buffer, 16) = Z;
// Updates the stencil buffer for sample q of the quad at column x. Applies the configured stencil
// operations (per face when two-sided), honours the write masks and only modifies the pixels
// selected by cMask.
738 void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask)
// The bodies of the following guard conditions are not visible in this listing; they test for
// an inactive stencil, for all operations being OPERATION_KEEP, and for fully masked writes
// (presumably early returns — confirm).
740 if(!state.stencilActive)
745 if(state.stencilPassOperation == OPERATION_KEEP && state.stencilZFailOperation == OPERATION_KEEP && state.stencilFailOperation == OPERATION_KEEP)
747 if(!state.twoSidedStencil || (state.stencilPassOperationCCW == OPERATION_KEEP && state.stencilZFailOperationCCW == OPERATION_KEEP && state.stencilFailOperationCCW == OPERATION_KEEP))
753 if(state.stencilWriteMasked && (!state.twoSidedStencil || state.stencilWriteMaskedCCW))
// Same addressing as stencilTest(): 2 bytes per x step plus q stencil slices.
758 Pointer<Byte> buffer = sBuffer + 2 * x;
762 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
765 Byte8 bufferValue = *Pointer<Byte8>(buffer);
// Compute the new value using the stencil[0] operations.
768 stencilOperation(newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask);
// Write mask: take the new bits selected by writeMaskQ and the old bits selected by invWriteMaskQ.
770 if(!state.noStencilWriteMask)
772 Byte8 maskedValue = bufferValue;
773 newValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].writeMaskQ));
774 maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].invWriteMaskQ));
775 newValue |= maskedValue;
// Two-sided stencil: repeat with the stencil[1] operations and masks, then pick per pixel
// using the primitive's clockwiseMask / invClockwiseMask.
778 if(state.twoSidedStencil)
782 stencilOperation(newValueCCW, bufferValue, state.stencilPassOperationCCW, state.stencilZFailOperationCCW, state.stencilFailOperationCCW, true, zMask, sMask);
784 if(!state.noStencilWriteMaskCCW)
786 Byte8 maskedValue = bufferValue;
787 newValueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].writeMaskQ));
788 maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].invWriteMaskQ));
789 newValueCCW |= maskedValue;
792 newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
793 newValueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
794 newValue |= newValueCCW;
// Coverage merge: new value where maskB4Q[cMask] selects it, old buffer value elsewhere.
797 newValue &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * cMask);
798 bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask);
799 newValue |= bufferValue;
// Only 4 bytes are written back.
801 *Pointer<Byte4>(buffer) = Byte4(newValue);
// Computes the new stencil value for one face by evaluating the pass, depth-fail and stencil-fail
// operations and selecting between them with zMask and sMask. The result is left in newValue.
804 void PixelRoutine::stencilOperation(Byte8 &newValue, Byte8 &bufferValue, StencilOperation stencilPassOperation, StencilOperation stencilZFailOperation, StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask)
// 'pass' aliases the output, so the pass result is written straight into newValue.
806 Byte8 &pass = newValue;
810 stencilOperation(pass, bufferValue, stencilPassOperation, CCW);
// The depth-fail result is only computed when it differs from the pass operation.
812 if(stencilZFailOperation != stencilPassOperation)
814 stencilOperation(zFail, bufferValue, stencilZFailOperation, CCW);
// The stencil-fail result is only computed when it differs from at least one of the other two.
817 if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
819 stencilOperation(fail, bufferValue, stencilFailOperation, CCW);
822 if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
824 if(state.depthTestActive && stencilZFailOperation != stencilPassOperation) // zMask valid and values not the same
// Keep 'pass' where the depth test passed and 'zFail' where it failed
// (the line combining the two is not visible in this listing).
826 pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * zMask);
827 zFail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask);
// Keep 'pass' where the stencil test passed and 'fail' where it failed
// (the line combining the two is not visible in this listing).
831 pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * sMask);
832 fail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask);
// Applies a single stencil operation to bufferValue and stores the result in 'output'.
// Several case labels of the switch are not visible in this listing.
837 void PixelRoutine::stencilOperation(Byte8 &output, Byte8 &bufferValue, StencilOperation operation, bool CCW)
// Buffer value unchanged.
842 output = bufferValue;
// All bytes zero.
845 output = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
847 case OPERATION_REPLACE:
// Reference value of the selected face.
848 output = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceQ));
850 case OPERATION_INCRSAT:
851 output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
853 case OPERATION_DECRSAT:
854 output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
856 case OPERATION_INVERT:
857 output = bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
// Non-saturating increment (plain byte addition).
860 output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
// Non-saturating decrement (plain byte subtraction).
863 output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
// Fills the x, y and z components of 'blendFactor' with the 16-bit color blend factor selected
// by blendFactorActive. 'current' is the incoming (source) color and 'pixel' the color already
// in the render target. The w component is not written here.
// Fix: the second parameter had been corrupted to "¤t" (a mis-decoded "&curren" HTML entity);
// it is restored to "&current", the name the body uses.
// Several case labels of the switch are not visible in this listing.
870 void PixelRoutine::blendFactor(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorActive)
872 switch(blendFactorActive)
// Source color.
881 blendFactor.x = current.x;
882 blendFactor.y = current.y;
883 blendFactor.z = current.z;
885 case BLEND_INVSOURCE:
// 0xFFFF - value is the 16-bit complement used for every "inverse" factor below.
886 blendFactor.x = Short4(0xFFFFu) - current.x;
887 blendFactor.y = Short4(0xFFFFu) - current.y;
888 blendFactor.z = Short4(0xFFFFu) - current.z;
// Destination color.
891 blendFactor.x = pixel.x;
892 blendFactor.y = pixel.y;
893 blendFactor.z = pixel.z;
// Inverse destination color.
896 blendFactor.x = Short4(0xFFFFu) - pixel.x;
897 blendFactor.y = Short4(0xFFFFu) - pixel.y;
898 blendFactor.z = Short4(0xFFFFu) - pixel.z;
900 case BLEND_SOURCEALPHA:
901 blendFactor.x = current.w;
902 blendFactor.y = current.w;
903 blendFactor.z = current.w;
905 case BLEND_INVSOURCEALPHA:
906 blendFactor.x = Short4(0xFFFFu) - current.w;
907 blendFactor.y = Short4(0xFFFFu) - current.w;
908 blendFactor.z = Short4(0xFFFFu) - current.w;
910 case BLEND_DESTALPHA:
911 blendFactor.x = pixel.w;
912 blendFactor.y = pixel.w;
913 blendFactor.z = pixel.w;
915 case BLEND_INVDESTALPHA:
916 blendFactor.x = Short4(0xFFFFu) - pixel.w;
917 blendFactor.y = Short4(0xFFFFu) - pixel.w;
918 blendFactor.z = Short4(0xFFFFu) - pixel.w;
920 case BLEND_SRCALPHASAT:
// min(source alpha, 1 - destination alpha), compared as unsigned 16-bit values.
921 blendFactor.x = Short4(0xFFFFu) - pixel.w;
922 blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w));
923 blendFactor.y = blendFactor.x;
924 blendFactor.z = blendFactor.x;
// Constant blend color, per channel.
927 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[0]));
928 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[1]));
929 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[2]));
931 case BLEND_INVCONSTANT:
932 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
933 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
934 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
936 case BLEND_CONSTANTALPHA:
// Element [3] of the constant is used for all three color channels.
937 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
938 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
939 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
941 case BLEND_INVCONSTANTALPHA:
942 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
943 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
944 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
// Fills the w component of 'blendFactor' with the 16-bit alpha blend factor selected by
// blendFactorAlphaActive. 'current' is the incoming (source) color and 'pixel' the color
// already in the render target. Only the alpha channels of both are read.
// Fix: the second parameter had been corrupted to "¤t" (a mis-decoded "&curren" HTML entity);
// it is restored to "&current", the name the body uses.
// Several case labels of the switch are not visible in this listing.
951 void PixelRoutine::blendFactorAlpha(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorAlphaActive)
953 switch(blendFactorAlphaActive)
// Source alpha.
962 blendFactor.w = current.w;
964 case BLEND_INVSOURCE:
965 blendFactor.w = Short4(0xFFFFu) - current.w;
// Destination alpha.
968 blendFactor.w = pixel.w;
// Inverse destination alpha.
971 blendFactor.w = Short4(0xFFFFu) - pixel.w;
973 case BLEND_SOURCEALPHA:
974 blendFactor.w = current.w;
976 case BLEND_INVSOURCEALPHA:
977 blendFactor.w = Short4(0xFFFFu) - current.w;
979 case BLEND_DESTALPHA:
980 blendFactor.w = pixel.w;
982 case BLEND_INVDESTALPHA:
983 blendFactor.w = Short4(0xFFFFu) - pixel.w;
985 case BLEND_SRCALPHASAT:
// For the alpha channel this factor is the maximum value.
986 blendFactor.w = Short4(0xFFFFu);
989 case BLEND_CONSTANTALPHA:
990 blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
992 case BLEND_INVCONSTANT:
993 case BLEND_INVCONSTANTALPHA:
// Both inverse-constant factors use the inverse of constant element [3] for alpha.
994 blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
// Returns true when render target 'index' uses one of the two sRGB formats
// (FORMAT_SRGB8_A8 or FORMAT_SRGB8_X8).
1001 bool PixelRoutine::isSRGB(int index) const
1003 return state.targetFormat[index] == FORMAT_SRGB8_A8 || state.targetFormat[index] == FORMAT_SRGB8_X8;
// Reads the quad at column x from render target 'index' and expands it to 16 bits per channel
// in 'pixel'. The quad spans two rows, with the second row located colorPitchB[index] bytes
// after the first. sRGB targets are converted to linear at the end.
// Several case labels and intermediate assignments are not visible in this listing.
1006 void PixelRoutine::readPixel(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &pixel)
1010 Pointer<Byte> buffer;
1011 Pointer<Byte> buffer2;
1013 switch(state.targetFormat[index])
// 16-bit pixels (2 bytes per x step) split with 5:6:5 bit masks; each field is shifted to the
// top of its 16-bit lane and alpha is set to the maximum value. Case label not visible.
1016 buffer = cBuffer + 2 * x;
1017 buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1018 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1020 pixel.x = c01 & Short4(0xF800u);
1021 pixel.y = (c01 & Short4(0x07E0u)) << 5;
1022 pixel.z = (c01 & Short4(0x001Fu)) << 11;
1023 pixel.w = Short4(0xFFFFu);
1025 case FORMAT_A8R8G8B8:
// 4 bytes per pixel: load two pixels from each row, then deinterleave the bytes with unpacks.
1026 buffer = cBuffer + 4 * x;
1027 c01 = *Pointer<Short4>(buffer);
1028 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1029 c23 = *Pointer<Short4>(buffer);
1032 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1033 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1035 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1036 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
// Unpacking a register with itself duplicates each byte into both halves of a 16-bit lane.
1039 pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1040 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1041 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1042 pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1044 case FORMAT_A8B8G8R8:
1045 case FORMAT_SRGB8_A8:
// Same loads as FORMAT_A8R8G8B8; the final expansion takes x from z and z from w instead.
1046 buffer = cBuffer + 4 * x;
1047 c01 = *Pointer<Short4>(buffer);
1048 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1049 c23 = *Pointer<Short4>(buffer);
1052 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1053 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1055 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1056 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1059 pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1060 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1061 pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1062 pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
// 1 byte per pixel: two bytes are read from each row into w and widened; x, y and z are zeroed.
// Case label not visible.
1065 buffer = cBuffer + 1 * x;
1066 pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 0);
1067 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1068 pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 1);
1069 pixel.w = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1070 pixel.x = Short4(0x0000);
1071 pixel.y = Short4(0x0000);
1072 pixel.z = Short4(0x0000);
1074 case FORMAT_X8R8G8B8:
// Like FORMAT_A8R8G8B8, but alpha is forced to the maximum value.
1075 buffer = cBuffer + 4 * x;
1076 c01 = *Pointer<Short4>(buffer);
1077 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1078 c23 = *Pointer<Short4>(buffer);
1081 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1082 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1084 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1085 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1087 pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1088 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1089 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1090 pixel.w = Short4(0xFFFFu);
1092 case FORMAT_X8B8G8R8:
1093 case FORMAT_SRGB8_X8:
// Like FORMAT_A8B8G8R8, but alpha is forced to the maximum value.
1094 buffer = cBuffer + 4 * x;
1095 c01 = *Pointer<Short4>(buffer);
1096 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1097 c23 = *Pointer<Short4>(buffer);
1100 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1101 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1103 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1104 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1107 pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1108 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1109 pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1110 pixel.w = Short4(0xFFFFu);
// The two quad-layout formats below only contain commented-out code in this listing.
1112 case FORMAT_A8G8R8B8Q:
1114 // pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1115 // pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1116 // pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1117 // pixel.w = UnpackHigh(As<Byte8>(pixel.w), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1119 case FORMAT_X8G8R8B8Q:
1121 // pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1122 // pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1123 // pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1124 // pixel.w = Short4(0xFFFFu);
1126 case FORMAT_A16B16G16R16:
// 8 bytes per pixel: one pixel per register from each row, then transposed so that each
// register holds a single channel.
1128 pixel.x = *Pointer<Short4>(buffer + 8 * x);
1129 pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
1130 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1131 pixel.z = *Pointer<Short4>(buffer + 8 * x);
1132 pixel.w = *Pointer<Short4>(buffer + 8 * x + 8);
1133 transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
// 4 bytes per pixel with 16-bit channels: x and y are deinterleaved from the two rows, z and w
// are set to the maximum value. Case label not visible.
1137 pixel.x = *Pointer<Short4>(buffer + 4 * x);
1138 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1139 pixel.y = *Pointer<Short4>(buffer + 4 * x);
1141 pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
1142 pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y));
1144 pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z));
1145 pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z));
1146 pixel.z = Short4(0xFFFFu);
1147 pixel.w = Short4(0xFFFFu);
// Convert to linear when post-blend sRGB writing is enabled or the target format is sRGB.
1153 if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
1155 sRGBtoLinear16_12_16(pixel);
// Blends the incoming color 'current' with the color already stored in render target 'index'
// at column x, using 16-bit fixed-point arithmetic. The result is left in 'current'.
// Color (x, y, z) and alpha (w) use separate factors and blend operations.
// Fix: the third parameter had been corrupted to "¤t" (a mis-decoded "&curren" HTML entity);
// it is restored to "&current", the name the body uses.
// Several case labels of the two switches are not visible in this listing.
1159 void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
// The handling for disabled blending is not visible in this listing.
1161 if(!state.alphaBlendActive)
// Fetch the destination color.
1167 readPixel(index, cBuffer, x, pixel);
1169 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
1170 Vector4s sourceFactor;
1171 Vector4s destFactor;
1173 blendFactor(sourceFactor, current, pixel, state.sourceBlendFactor);
1174 blendFactor(destFactor, current, pixel, state.destBlendFactor);
// The multiplication is skipped for BLEND_ONE and BLEND_ZERO factors.
1176 if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
// MulHigh on unsigned 16-bit values keeps the upper half of the product.
1178 current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x));
1179 current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
1180 current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
1183 if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
1185 pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
1186 pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y));
1187 pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z));
// Color blend operation.
1190 switch(state.blendOperation)
// Saturating add: source + destination.
1193 current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1194 current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1195 current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
// Saturating subtract: source - destination.
1198 current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1199 current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1200 current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1202 case BLENDOP_INVSUB:
// Saturating subtract with the operands swapped: destination - source.
1203 current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x));
1204 current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y));
1205 current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z));
// Per-channel unsigned minimum.
1208 current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x));
1209 current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y));
1210 current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z));
// Per-channel unsigned maximum.
1213 current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x));
1214 current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y));
1215 current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z));
1217 case BLENDOP_SOURCE:
// Result is the destination color.
1221 current.x = pixel.x;
1222 current.y = pixel.y;
1223 current.z = pixel.z;
// Result is zero.
1226 current.x = Short4(0x0000);
1227 current.y = Short4(0x0000);
1228 current.z = Short4(0x0000);
// Alpha channel: same sequence with the alpha factors and the alpha blend operation.
// Only the w components of sourceFactor and destFactor are written by blendFactorAlpha().
1234 blendFactorAlpha(sourceFactor, current, pixel, state.sourceBlendFactorAlpha);
1235 blendFactorAlpha(destFactor, current, pixel, state.destBlendFactorAlpha);
1237 if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
1239 current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
1242 if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
1244 pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
1247 switch(state.blendOperationAlpha)
1250 current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1253 current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1255 case BLENDOP_INVSUB:
1256 current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w));
1259 current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w));
1262 current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w));
1264 case BLENDOP_SOURCE:
1268 current.w = pixel.w;
1271 current.w = Short4(0x0000);
// Applies the bitwise logical operation state.logicalOperation between the incoming color
// 'current' and the color already stored in render target 'index' at column x.
// The visible lines only modify the x, y and z components of 'current'.
// Fix: the third parameter had been corrupted to "¤t" (a mis-decoded "&curren" HTML entity);
// it is restored to "&current", the name the body uses.
// Several case labels of the switch are not visible in this listing.
1278 void PixelRoutine::logicOperation(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
// LOGICALOP_COPY is tested before the destination is read; its handling is not visible here,
// but the ASSERT in the switch below marks that case as "Optimized out".
1280 if(state.logicalOperation == LOGICALOP_COPY)
// Fetch the destination color.
1286 readPixel(index, cBuffer, x, pixel);
1288 switch(state.logicalOperation)
1290 case LOGICALOP_CLEAR:
1291 current.x = UShort4(0);
1292 current.y = UShort4(0);
1293 current.z = UShort4(0);
// All bits set.
1296 current.x = UShort4(0xFFFFu);
1297 current.y = UShort4(0xFFFFu);
1298 current.z = UShort4(0xFFFFu);
1300 case LOGICALOP_COPY:
1301 ASSERT(false); // Optimized out
1303 case LOGICALOP_COPY_INVERTED:
1304 current.x = ~current.x;
1305 current.y = ~current.y;
1306 current.z = ~current.z;
1308 case LOGICALOP_NOOP:
// Result is the destination color.
1309 current.x = pixel.x;
1310 current.y = pixel.y;
1311 current.z = pixel.z;
1313 case LOGICALOP_INVERT:
// Result is the inverted destination color.
1314 current.x = ~pixel.x;
1315 current.y = ~pixel.y;
1316 current.z = ~pixel.z;
// destination AND source.
1319 current.x = pixel.x & current.x;
1320 current.y = pixel.y & current.y;
1321 current.z = pixel.z & current.z;
1323 case LOGICALOP_NAND:
1324 current.x = ~(pixel.x & current.x);
1325 current.y = ~(pixel.y & current.y);
1326 current.z = ~(pixel.z & current.z);
// destination OR source.
1329 current.x = pixel.x | current.x;
1330 current.y = pixel.y | current.y;
1331 current.z = pixel.z | current.z;
// NOT (destination OR source).
1334 current.x = ~(pixel.x | current.x);
1335 current.y = ~(pixel.y | current.y);
1336 current.z = ~(pixel.z | current.z);
// destination XOR source.
1339 current.x = pixel.x ^ current.x;
1340 current.y = pixel.y ^ current.y;
1341 current.z = pixel.z ^ current.z;
1343 case LOGICALOP_EQUIV:
1344 current.x = ~(pixel.x ^ current.x);
1345 current.y = ~(pixel.y ^ current.y);
1346 current.z = ~(pixel.z ^ current.z);
1348 case LOGICALOP_AND_REVERSE:
// source AND NOT destination.
1349 current.x = ~pixel.x & current.x;
1350 current.y = ~pixel.y & current.y;
1351 current.z = ~pixel.z & current.z;
1353 case LOGICALOP_AND_INVERTED:
// NOT source AND destination.
1354 current.x = pixel.x & ~current.x;
1355 current.y = pixel.y & ~current.y;
1356 current.z = pixel.z & ~current.z;
1358 case LOGICALOP_OR_REVERSE:
// source OR NOT destination.
1359 current.x = ~pixel.x | current.x;
1360 current.y = ~pixel.y | current.y;
1361 current.z = ~pixel.z | current.z;
1363 case LOGICALOP_OR_INVERTED:
// NOT source OR destination.
1364 current.x = pixel.x | ~current.x;
1365 current.y = pixel.y | ~current.y;
1366 current.z = pixel.z | ~current.z;
// Vector4s overload: converts the Short4-per-channel colour in `current` to
// the layout of state.targetFormat[index] and stores it to cBuffer. Every
// visible store path touches two rows: one at the computed address and one
// offset by DrawData::colorPitchB[index].
//
// Visible stages:
//   1. Optional linear -> sRGB conversion.
//   2. Optional rounding bias when exactColorRounding is set.
//   3. Per-format packing of the channels into their stored layout.
//   4. Per-format store: the new colour is ANDed with a write mask and the
//      previously stored value with the complementary mask.
//
// Parameters:
//   index   - render target index (selects format, write mask and pitch).
//   cBuffer - base pointer of the colour buffer.
//   x       - horizontal position, scaled by the bytes per pixel of the format.
//   current - colour to write; clobbered by the packing steps.
//   sMask, zMask, cMask - mask inputs; the lines that fold them into xMask
//                         are not visible in this listing.
//
// NOTE(review): the embedded line numbers have gaps, so braces, `break`
// statements, several `case` labels, some local declarations (e.g. `value`,
// `masked`) and the statements that merge the masked values are not shown.
1373 void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &current, Int &sMask, Int &zMask, Int &cMask)
// Stage 1: convert to sRGB when requested globally or by the target format.
1375 if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
1377 linearToSRGB16_12_16(current);
// Stage 2: rounding bias applied before the truncating shifts further down.
1380 if(exactColorRounding)
1382 switch(state.targetFormat[index])
// Adds 0x0400 / 0x0200 / 0x0400 with saturation, i.e. half of a 5-bit, 6-bit
// and 5-bit quantisation step (case label not visible; presumably the 5-6-5
// format - confirm).
1385 current.x = AddSat(As<UShort4>(current.x), UShort4(0x0400));
1386 current.y = AddSat(As<UShort4>(current.y), UShort4(0x0200));
1387 current.z = AddSat(As<UShort4>(current.z), UShort4(0x0400));
1389 case FORMAT_X8G8R8B8Q:
1390 case FORMAT_A8G8R8B8Q:
1391 case FORMAT_X8R8G8B8:
1392 case FORMAT_X8B8G8R8:
1393 case FORMAT_A8R8G8B8:
1394 case FORMAT_A8B8G8R8:
1395 case FORMAT_SRGB8_X8:
1396 case FORMAT_SRGB8_A8:
// c - (c >> 8) + 0x80 on every channel; presumably so that the later
// truncating ">> 8" rounds to nearest - confirm.
1399 current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080);
1400 current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080);
1401 current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080);
1402 current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080);
1409 int rgbaWriteMask = state.colorWriteActive(index);
// Same mask with bit 0 and bit 2 exchanged (bits 1 and 3 kept), for formats
// whose first and third channels are stored in the opposite order.
1410 int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
// Stage 3: pack the channels into the stored layout.
1412 switch(state.targetFormat[index])
// 16-bit 5-6-5 packing: top 5 bits of x stay in place, top 6 bits of y are
// shifted down by 5, top 5 bits of z by 11, then all three are ORed into x
// (case label not visible in this listing).
1416 current.x = current.x & Short4(0xF800u);
1417 current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
1418 current.z = As<UShort4>(current.z) >> 11;
1420 current.x = current.x | current.y | current.z;
1423 case FORMAT_X8G8R8B8Q:
1425 // current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1426 // current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1427 // current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1429 // current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1430 // current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1432 case FORMAT_A8G8R8B8Q:
1434 // current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1435 // current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1436 // current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1437 // current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1439 // current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1440 // current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1442 case FORMAT_X8R8G8B8:
1443 case FORMAT_A8R8G8B8:
// When the alpha channel is not written (X8 format, or write mask 0x7) w is
// left untouched and y is packed with itself; otherwise w is reduced to
// 8 bits as well and packed alongside y.
1444 if(state.targetFormat[index] == FORMAT_X8R8G8B8 || rgbaWriteMask == 0x7)
1446 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1447 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1448 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1450 current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1451 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
// Byte/word interleave; the packed pixels end up in current.z and current.y,
// which are copied to c01 and c23 after this switch.
1453 current.x = current.z;
1454 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1455 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1456 current.y = current.z;
1457 current.z = As<Short4>(UnpackLow(current.z, current.x));
1458 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1462 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1463 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1464 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1465 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1467 current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1468 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1470 current.x = current.z;
1471 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1472 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1473 current.y = current.z;
1474 current.z = As<Short4>(UnpackLow(current.z, current.x));
1475 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1478 case FORMAT_X8B8G8R8:
1479 case FORMAT_A8B8G8R8:
1480 case FORMAT_SRGB8_X8:
1481 case FORMAT_SRGB8_A8:
// Same scheme as the X8R8G8B8/A8R8G8B8 case, except that x and z trade places
// as operands of the first Pack.
1482 if(state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8 || rgbaWriteMask == 0x7)
1484 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1485 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1486 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1488 current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
1489 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1491 current.x = current.z;
1492 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1493 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1494 current.y = current.z;
1495 current.z = As<Short4>(UnpackLow(current.z, current.x));
1496 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1500 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1501 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1502 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1503 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1505 current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
1506 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1508 current.x = current.z;
1509 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1510 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1511 current.y = current.z;
1512 current.z = As<Short4>(UnpackLow(current.z, current.x));
1513 current.y = As<Short4>(UnpackHigh(current.y, current.x));
// Two channels (x and y) reduced to 8 bits each and byte-interleaved into
// current.x (case label not visible in this listing).
1517 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1518 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1519 current.x = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.x)));
1520 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1521 current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y));
// Single channel x reduced to 8 bits (case label not visible).
1524 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1525 current.x = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.x)));
// Single channel w reduced to 8 bits (case label not visible).
1528 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1529 current.w = As<Short4>(Pack(As<UShort4>(current.w), As<UShort4>(current.w)));
// Two 16-bit channels: x and y are word-interleaved; the low half goes to
// current.x and the high half to current.y (case label not visible).
1532 current.z = current.x;
1533 current.x = As<Short4>(UnpackLow(current.x, current.y));
1534 current.z = As<Short4>(UnpackHigh(current.z, current.y));
1535 current.y = current.z;
1537 case FORMAT_A16B16G16R16:
1538 transpose4x4(current.x, current.y, current.z, current.w);
// Snapshot of the packed pixels used by the 32-bit-per-pixel store paths.
1544 Short4 c01 = current.z;
1545 Short4 c23 = current.y;
1547 Int xMask; // Combination of all masks
// NOTE(review): the statements controlled by these two conditions are not
// visible in this listing.
1549 if(state.depthTestActive)
1558 if(state.stencilActive)
// Stage 4: per-format masked store.
1563 switch(state.targetFormat[index])
// 2 bytes per pixel, one Int (two pixels) per row; uses the mask565Q tables
// for the channel write mask (case label not visible in this listing).
1567 Pointer<Byte> buffer = cBuffer + 2 * x;
1568 Int value = *Pointer<Int>(buffer);
1570 Int c01 = Extract(As<Int2>(current.x), 0);
1572 if((bgraWriteMask & 0x00000007) != 0x00000007)
1575 c01 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1576 masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0]));
1580 c01 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
1581 value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][0]) + xMask * 8);
1583 *Pointer<Int>(buffer) = c01;
// Second row.
1585 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1586 value = *Pointer<Int>(buffer);
1588 Int c23 = Extract(As<Int2>(current.x), 1);
1590 if((bgraWriteMask & 0x00000007) != 0x00000007)
1593 c23 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1594 masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0]));
1598 c23 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
1599 value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][2]) + xMask * 8);
1601 *Pointer<Int>(buffer) = c23;
1604 case FORMAT_A8G8R8B8Q:
1605 case FORMAT_X8G8R8B8Q: // FIXME: Don't touch alpha?
1607 // value = *Pointer<Short4>(cBuffer + 8 * x + 0);
1609 // if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1610 // ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1611 // (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh?
1613 // Short4 masked = value;
1614 // c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1615 // masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1619 // c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1620 // value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1622 // *Pointer<Short4>(cBuffer + 8 * x + 0) = c01;
1624 // value = *Pointer<Short4>(cBuffer + 8 * x + 8);
1626 // if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1627 // ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1628 // (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh?
1630 // Short4 masked = value;
1631 // c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1632 // masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1636 // c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1637 // value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1639 // *Pointer<Short4>(cBuffer + 8 * x + 8) = c23;
// 4 bytes per pixel, one Short4 (two pixels) per row; channel masking uses
// bgraWriteMask, coverage masking uses maskD01Q / maskD23Q indexed by xMask.
1641 case FORMAT_A8R8G8B8:
1642 case FORMAT_X8R8G8B8: // FIXME: Don't touch alpha?
1644 Pointer<Byte> buffer = cBuffer + x * 4;
1645 Short4 value = *Pointer<Short4>(buffer);
1647 if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1648 ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1649 (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh?
1651 Short4 masked = value;
1652 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1653 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1657 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1658 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1660 *Pointer<Short4>(buffer) = c01;
// Second row.
1662 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1663 value = *Pointer<Short4>(buffer);
1665 if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1666 ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1667 (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh?
1669 Short4 masked = value;
1670 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1671 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1675 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1676 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1678 *Pointer<Short4>(buffer) = c23;
// Same store scheme as above, but channel masking uses rgbaWriteMask directly.
1681 case FORMAT_A8B8G8R8:
1682 case FORMAT_X8B8G8R8: // FIXME: Don't touch alpha?
1683 case FORMAT_SRGB8_X8:
1684 case FORMAT_SRGB8_A8:
1686 Pointer<Byte> buffer = cBuffer + x * 4;
1687 Short4 value = *Pointer<Short4>(buffer);
// Compile-time flag: true when the channel write mask does not cover every
// stored channel. Note the Short4 named `masked` declared further down
// re-uses this name in what is presumably a nested scope.
1689 bool masked = (((state.targetFormat[index] == FORMAT_A8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_A8) && rgbaWriteMask != 0x0000000F) ||
1690 (((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x00000007) &&
1691 ((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x0000000F))); // FIXME: Need for masking when XBGR && Fh?
1695 Short4 masked = value;
1696 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1697 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1701 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1702 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1704 *Pointer<Short4>(buffer) = c01;
// Second row.
1706 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1707 value = *Pointer<Short4>(buffer);
1711 Short4 masked = value;
1712 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1713 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1717 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1718 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1720 *Pointer<Short4>(buffer) = c23;
// 2 bytes per pixel, two 8-bit channels; skipped entirely when neither of the
// two low write-mask bits is set (case label not visible in this listing).
1724 if((rgbaWriteMask & 0x00000003) != 0x0)
1726 Pointer<Byte> buffer = cBuffer + 2 * x;
1728 value = Insert(value, *Pointer<Int>(buffer), 0);
1729 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1730 value = Insert(value, *Pointer<Int>(buffer + pitch), 1);
1732 Int2 packedCol = As<Int2>(current.x);
1734 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
1735 if((rgbaWriteMask & 0x3) != 0x3)
// 5 * m equals m | (m << 2) for a two-bit m, i.e. the channel mask repeated
// for bit pairs 0-1 and 2-3 of the maskB4Q index.
1737 Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
1738 UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
1739 mergedMask &= rgbaMask;
// Select new bits where the merged mask is set, old bits elsewhere.
1742 packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask));
1744 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
1745 *Pointer<UInt>(buffer + pitch) = As<UInt>(Extract(packedCol, 1));
// 1 byte per pixel, taken from current.x; written only when write-mask bit 0
// is set (case label not visible in this listing).
1749 if(rgbaWriteMask & 0x00000001)
1751 Pointer<Byte> buffer = cBuffer + 1 * x;
1753 value = Insert(value, *Pointer<Short>(buffer), 0);
1754 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1755 value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
1756 value = UnpackLow(As<Byte8>(value), As<Byte8>(value));
1758 current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask);
1759 value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask);
1762 *Pointer<Short>(buffer) = Extract(current.x, 0);
1763 *Pointer<Short>(buffer + pitch) = Extract(current.x, 1);
// 1 byte per pixel, taken from current.w; written only when write-mask bit 3
// is set (case label not visible in this listing).
1767 if(rgbaWriteMask & 0x00000008)
1769 Pointer<Byte> buffer = cBuffer + 1 * x;
1771 value = Insert(value, *Pointer<Short>(buffer), 0);
1772 Int pitch = *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1773 value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
1774 value = UnpackLow(As<Byte8>(value), As<Byte8>(value));
1776 current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q) + 8 * xMask);
1777 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q) + 8 * xMask);
1780 *Pointer<Short>(buffer) = Extract(current.w, 0);
1781 *Pointer<Short>(buffer + pitch) = Extract(current.w, 1);
// 4 bytes per pixel, two 16-bit channels: current.x holds the first row and
// current.y the second (case label not visible in this listing).
1786 Pointer<Byte> buffer = cBuffer + 4 * x;
1788 Short4 value = *Pointer<Short4>(buffer);
1790 if((rgbaWriteMask & 0x00000003) != 0x00000003)
1792 Short4 masked = value;
1793 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1794 masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
1795 current.x |= masked;
1798 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1799 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1801 *Pointer<Short4>(buffer) = current.x;
// Second row.
1803 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1805 value = *Pointer<Short4>(buffer);
1807 if((rgbaWriteMask & 0x00000003) != 0x00000003)
1809 Short4 masked = value;
1810 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1811 masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
1812 current.y |= masked;
1815 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1816 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1818 *Pointer<Short4>(buffer) = current.y;
// 8 bytes per pixel: after the transpose above, current.x/y/z/w each hold one
// pixel. x and y go to the first row (offsets 0 and 8), z and w to the second,
// with coverage masks maskQ0Q..maskQ3Q respectively.
1821 case FORMAT_A16B16G16R16:
1823 Pointer<Byte> buffer = cBuffer + 8 * x;
1826 Short4 value = *Pointer<Short4>(buffer);
1828 if(rgbaWriteMask != 0x0000000F)
1830 Short4 masked = value;
1831 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1832 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1833 current.x |= masked;
1836 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
1837 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
1839 *Pointer<Short4>(buffer) = current.x;
1843 Short4 value = *Pointer<Short4>(buffer + 8);
1845 if(rgbaWriteMask != 0x0000000F)
1847 Short4 masked = value;
1848 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1849 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1850 current.y |= masked;
1853 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
1854 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
1856 *Pointer<Short4>(buffer + 8) = current.y;
// Second row.
1859 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1862 Short4 value = *Pointer<Short4>(buffer);
1864 if(rgbaWriteMask != 0x0000000F)
1866 Short4 masked = value;
1867 current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1868 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1869 current.z |= masked;
1872 current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
1873 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
1875 *Pointer<Short4>(buffer) = current.z;
1879 Short4 value = *Pointer<Short4>(buffer + 8);
1881 if(rgbaWriteMask != 0x0000000F)
1883 Short4 masked = value;
1884 current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1885 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1886 current.w |= masked;
1889 current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
1890 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
1892 *Pointer<Short4>(buffer + 8) = current.w;
// Vector4f overload: fills the x, y and z components of `blendFactor` with the
// colour blend factor selected by `blendFactorActive`. The w component is not
// written by any visible case.
//
// Parameters:
//   blendFactor       - output; x/y/z receive the factor.
//   oC                - shader output colour (the "source" operand).
//   pixel             - second operand; alphaBlend() passes the value it
//                       loaded from the colour buffer.
//   blendFactorActive - which factor to produce.
//
// NOTE(review): the embedded line numbers have gaps; braces, `break`
// statements and some `case` labels (including whatever handles the factors
// that need no multiplication) are not visible in this listing.
1901 void PixelRoutine::blendFactor(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorActive)
1903 switch(blendFactorActive)
// Factor = source colour (case label not visible in this listing).
1912 blendFactor.x = oC.x;
1913 blendFactor.y = oC.y;
1914 blendFactor.z = oC.z;
// Factor = 1 - source colour.
1916 case BLEND_INVSOURCE:
1917 blendFactor.x = Float4(1.0f) - oC.x;
1918 blendFactor.y = Float4(1.0f) - oC.y;
1919 blendFactor.z = Float4(1.0f) - oC.z;
// Factor = pixel colour (case label not visible in this listing).
1922 blendFactor.x = pixel.x;
1923 blendFactor.y = pixel.y;
1924 blendFactor.z = pixel.z;
// Factor = 1 - pixel colour (case label not visible in this listing).
1927 blendFactor.x = Float4(1.0f) - pixel.x;
1928 blendFactor.y = Float4(1.0f) - pixel.y;
1929 blendFactor.z = Float4(1.0f) - pixel.z;
// Factor = source alpha, replicated to all three colour channels.
1931 case BLEND_SOURCEALPHA:
1932 blendFactor.x = oC.w;
1933 blendFactor.y = oC.w;
1934 blendFactor.z = oC.w;
// Factor = 1 - source alpha.
1936 case BLEND_INVSOURCEALPHA:
1937 blendFactor.x = Float4(1.0f) - oC.w;
1938 blendFactor.y = Float4(1.0f) - oC.w;
1939 blendFactor.z = Float4(1.0f) - oC.w;
// Factor = pixel alpha.
1941 case BLEND_DESTALPHA:
1942 blendFactor.x = pixel.w;
1943 blendFactor.y = pixel.w;
1944 blendFactor.z = pixel.w;
// Factor = 1 - pixel alpha.
1946 case BLEND_INVDESTALPHA:
1947 blendFactor.x = Float4(1.0f) - pixel.w;
1948 blendFactor.y = Float4(1.0f) - pixel.w;
1949 blendFactor.z = Float4(1.0f) - pixel.w;
// Factor = min(1 - pixel alpha, source alpha), replicated to x, y and z.
1951 case BLEND_SRCALPHASAT:
1952 blendFactor.x = Float4(1.0f) - pixel.w;
1953 blendFactor.x = Min(blendFactor.x, oC.w);
1954 blendFactor.y = blendFactor.x;
1955 blendFactor.z = blendFactor.x;
// Factor = per-draw blend constant, loaded from DrawData.
1957 case BLEND_CONSTANT:
1958 blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[0]));
1959 blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[1]));
1960 blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[2]));
// Factor = DrawData's invBlendConstant4F entries (read as stored; no
// subtraction is performed here).
1962 case BLEND_INVCONSTANT:
1963 blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
1964 blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
1965 blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
// Vector4f overload: fills only the w component of `blendFactor` with the
// alpha blend factor selected by `blendFactorAlphaActive`. For the alpha
// channel the colour-based and alpha-based variants of a factor produce the
// same value (e.g. BLEND_INVSOURCE and BLEND_INVSOURCEALPHA both give
// 1 - oC.w).
//
// Parameters:
//   blendFactor            - output; only w is written by the visible cases.
//   oC                     - shader output colour (the "source" operand).
//   pixel                  - second operand; alphaBlend() passes the value it
//                            loaded from the colour buffer.
//   blendFactorAlphaActive - which factor to produce.
//
// NOTE(review): the embedded line numbers have gaps; braces, `break`
// statements and some `case` labels are not visible in this listing.
1972 void PixelRoutine::blendFactorAlpha(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorAlphaActive)
1974 switch(blendFactorAlphaActive)
// Factor = source alpha (case label not visible in this listing).
1983 blendFactor.w = oC.w;
1985 case BLEND_INVSOURCE:
1986 blendFactor.w = Float4(1.0f) - oC.w;
// Factor = pixel alpha (case label not visible in this listing).
1989 blendFactor.w = pixel.w;
// Factor = 1 - pixel alpha (case label not visible in this listing).
1992 blendFactor.w = Float4(1.0f) - pixel.w;
1994 case BLEND_SOURCEALPHA:
1995 blendFactor.w = oC.w;
1997 case BLEND_INVSOURCEALPHA:
1998 blendFactor.w = Float4(1.0f) - oC.w;
2000 case BLEND_DESTALPHA:
2001 blendFactor.w = pixel.w;
2003 case BLEND_INVDESTALPHA:
2004 blendFactor.w = Float4(1.0f) - pixel.w;
// Unlike the colour version, the alpha factor here is the constant 1.0.
2006 case BLEND_SRCALPHASAT:
2007 blendFactor.w = Float4(1.0f);
2009 case BLEND_CONSTANT:
2010 blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
2012 case BLEND_INVCONSTANT:
2013 blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
// Vector4f overload: blends the shader output `oC` with a value loaded from
// the colour buffer into `pixel`, for the float / non-normalised integer
// formats handled in the switch below. The result is left in `oC`.
//
// Visible stages:
//   1. Check state.alphaBlendActive.
//   2. Load the pixels from two rows of the colour buffer (the second row is
//      reached by adding DrawData::colorPitchB[index]) and rearrange them into
//      per-channel vectors; channels the format does not store are set to `one`.
//   3. Optional sRGB -> linear conversion of the loaded colour.
//   4. Colour factors and colour blend operation (x, y, z).
//   5. Alpha factors and alpha blend operation (w).
//
// Parameters:
//   index   - render target index (selects format and pitch).
//   cBuffer - colour buffer pointer (the assignment to `buffer` is not visible
//             in this listing; presumably derived from cBuffer - confirm).
//   oC      - in: shader output colour; out: blended colour.
//   x       - horizontal position, scaled by the bytes per pixel of the format.
//
// NOTE(review): the embedded line numbers have gaps; braces, `break`
// statements, several declarations and `case` labels, and the bodies of some
// blend operations are not visible in this listing.
2020 void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x)
2022 if(!state.alphaBlendActive)
2027 Pointer<Byte> buffer;
// Choice of the value `one` used for channels missing from the format. For
// non-normalised integer formats it is the bit pattern 0xFFFFFFFF (unsigned
// first component) or 0x7FFFFFFF (signed), reinterpreted as Float4.
2035 if(Surface::isFloatFormat(state.targetFormat[index]))
2039 else if(Surface::isNonNormalizedInteger(state.targetFormat[index]))
2041 one = As<Float4>(Surface::isUnsignedComponent(state.targetFormat[index], 0) ? Int4(0xFFFFFFFF) : Int4(0x7FFFFFFF));
2044 switch(state.targetFormat[index])
// One 32-bit channel per pixel: two pixels from each of two rows go into
// pixel.x; y, z and w are set to `one` (case labels not visible).
2051 pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0);
2052 pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4);
2053 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2055 pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0);
2056 pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4);
2057 pixel.y = pixel.z = pixel.w = one;
// Two 32-bit channels per pixel: one Float4 (two pixels) per row, then
// shuffled into per-channel vectors; z and w are set to `one`.
2059 case FORMAT_G32R32I:
2060 case FORMAT_G32R32UI:
2061 case FORMAT_G32R32F:
2063 pixel.x = *Pointer<Float4>(buffer + 8 * x, 16);
2064 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2065 pixel.y = *Pointer<Float4>(buffer + 8 * x, 16);
2067 pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x88);
2068 pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0xDD);
2070 pixel.z = pixel.w = one;
// Four 32-bit channels per pixel: two Float4 loads per row, then a 4x4
// transpose turns the four pixels into per-channel vectors.
2072 case FORMAT_X32B32G32R32F:
2073 case FORMAT_A32B32G32R32F:
2074 case FORMAT_A32B32G32R32I:
2075 case FORMAT_A32B32G32R32UI:
2077 pixel.x = *Pointer<Float4>(buffer + 16 * x, 16);
2078 pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2079 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2080 pixel.z = *Pointer<Float4>(buffer + 16 * x, 16);
2081 pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2082 transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
// The X variant's fourth channel is replaced by 1.0 after loading.
2083 if(state.targetFormat[index] == FORMAT_X32B32G32R32F)
2085 pixel.w = Float4(1.0f);
// Convert the loaded colour channels (not alpha) to linear space.
2092 if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
2094 sRGBtoLinear(pixel.x);
2095 sRGBtoLinear(pixel.y);
2096 sRGBtoLinear(pixel.z);
2099 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
2100 Vector4f sourceFactor;
2101 Vector4f destFactor;
2103 blendFactor(sourceFactor, oC, pixel, state.sourceBlendFactor);
2104 blendFactor(destFactor, oC, pixel, state.destBlendFactor);
// The multiplication is skipped for BLEND_ONE and BLEND_ZERO.
2106 if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
2108 oC.x *= sourceFactor.x;
2109 oC.y *= sourceFactor.y;
2110 oC.z *= sourceFactor.z;
2113 if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
2115 pixel.x *= destFactor.x;
2116 pixel.y *= destFactor.y;
2117 pixel.z *= destFactor.z;
// Colour blend operation; several cases and their labels are not visible.
2120 switch(state.blendOperation)
2132 case BLENDOP_INVSUB:
2133 oC.x = pixel.x - oC.x;
2134 oC.y = pixel.y - oC.y;
2135 oC.z = pixel.z - oC.z;
// Component-wise minimum (case label not visible in this listing).
2138 oC.x = Min(oC.x, pixel.x);
2139 oC.y = Min(oC.y, pixel.y);
2140 oC.z = Min(oC.z, pixel.z);
// Component-wise maximum (case label not visible in this listing).
2143 oC.x = Max(oC.x, pixel.x);
2144 oC.y = Max(oC.y, pixel.y);
2145 oC.z = Max(oC.z, pixel.z);
2147 case BLENDOP_SOURCE:
// Result forced to zero (case label not visible in this listing).
2156 oC.x = Float4(0.0f);
2157 oC.y = Float4(0.0f);
2158 oC.z = Float4(0.0f);
// Alpha channel: same sequence as for colour, using the alpha factor state
// and writing only the w components of sourceFactor / destFactor.
2164 blendFactorAlpha(sourceFactor, oC, pixel, state.sourceBlendFactorAlpha);
2165 blendFactorAlpha(destFactor, oC, pixel, state.destBlendFactorAlpha);
2167 if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
2169 oC.w *= sourceFactor.w;
2172 if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
2174 pixel.w *= destFactor.w;
// Alpha blend operation; several cases and their labels are not visible.
2177 switch(state.blendOperationAlpha)
2185 case BLENDOP_INVSUB:
2190 oC.w = Min(oC.w, pixel.w);
2193 oC.w = Max(oC.w, pixel.w);
2195 case BLENDOP_SOURCE:
2202 oC.w = Float4(0.0f);
2209 void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask)
2211 switch(state.targetFormat[index])
2221 case FORMAT_G32R32F:
2222 case FORMAT_G32R32I:
2223 case FORMAT_G32R32UI:
2224 case FORMAT_G16R16I:
2225 case FORMAT_G16R16UI:
2229 oC.x = UnpackLow(oC.x, oC.y);
2230 oC.z = UnpackHigh(oC.z, oC.y);
2233 case FORMAT_X32B32G32R32F:
2234 case FORMAT_A32B32G32R32F:
2235 case FORMAT_A32B32G32R32I:
2236 case FORMAT_A32B32G32R32UI:
2237 case FORMAT_A16B16G16R16I:
2238 case FORMAT_A16B16G16R16UI:
2239 case FORMAT_A8B8G8R8I:
2240 case FORMAT_A8B8G8R8UI:
2241 transpose4x4(oC.x, oC.y, oC.z, oC.w);
2247 int rgbaWriteMask = state.colorWriteActive(index);
2249 Int xMask; // Combination of all masks
2251 if(state.depthTestActive)
2260 if(state.stencilActive)
2265 Pointer<Byte> buffer;
2268 switch(state.targetFormat[index])
2273 if(rgbaWriteMask & 0x00000001)
2275 buffer = cBuffer + 4 * x;
2278 value.x = *Pointer<Float>(buffer + 0);
2279 value.y = *Pointer<Float>(buffer + 4);
2281 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2284 value.z = *Pointer<Float>(buffer + 0);
2285 value.w = *Pointer<Float>(buffer + 4);
2287 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + xMask * 16, 16));
2288 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16));
2289 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2292 *Pointer<Float>(buffer + 0) = oC.x.z;
2293 *Pointer<Float>(buffer + 4) = oC.x.w;
2295 buffer -= *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2298 *Pointer<Float>(buffer + 0) = oC.x.x;
2299 *Pointer<Float>(buffer + 4) = oC.x.y;
2304 if(rgbaWriteMask & 0x00000001)
2306 buffer = cBuffer + 2 * x;
2309 xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));
2311 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2313 xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1));
2314 value = As<Float4>(Int4(xyzw));
2316 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2317 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2318 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2320 if(state.targetFormat[index] == FORMAT_R16I)
2322 Float component = oC.x.z;
2323 *Pointer<Short>(buffer + 0) = Short(As<Int>(component));
2325 *Pointer<Short>(buffer + 2) = Short(As<Int>(component));
2327 buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2330 *Pointer<Short>(buffer + 0) = Short(As<Int>(component));
2332 *Pointer<Short>(buffer + 2) = Short(As<Int>(component));
2334 else // FORMAT_R16UI
2336 Float component = oC.x.z;
2337 *Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2339 *Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2341 buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2344 *Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2346 *Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2352 if(rgbaWriteMask & 0x00000001)
2354 buffer = cBuffer + x;
2356 UInt xyzw, packedCol;
2358 xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFF;
2359 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2360 xyzw |= UInt(*Pointer<UShort>(buffer)) << 16;
2362 Short4 tmpCol = Short4(As<Int4>(oC.x));
2363 if(state.targetFormat[index] == FORMAT_R8I)
2365 tmpCol = As<Short4>(Pack(tmpCol, tmpCol));
2369 tmpCol = As<Short4>(Pack(As<UShort4>(tmpCol), As<UShort4>(tmpCol)));
2371 packedCol = Extract(As<Int2>(tmpCol), 0);
2373 packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) |
2374 (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask));
2376 *Pointer<UShort>(buffer) = UShort(packedCol >> 16);
2377 buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2378 *Pointer<UShort>(buffer) = UShort(packedCol);
2381 case FORMAT_G32R32F:
2382 case FORMAT_G32R32I:
2383 case FORMAT_G32R32UI:
2384 buffer = cBuffer + 8 * x;
2386 value = *Pointer<Float4>(buffer);
2388 if((rgbaWriteMask & 0x00000003) != 0x00000003)
2390 Float4 masked = value;
2391 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2392 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
2393 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2396 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16));
2397 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16));
2398 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2399 *Pointer<Float4>(buffer) = oC.x;
2401 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2403 value = *Pointer<Float4>(buffer);
2405 if((rgbaWriteMask & 0x00000003) != 0x00000003)
2410 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2411 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
2412 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2415 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16));
2416 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16));
2417 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2418 *Pointer<Float4>(buffer) = oC.y;
2420 case FORMAT_G16R16I:
2421 case FORMAT_G16R16UI:
2422 if((rgbaWriteMask & 0x00000003) != 0x0)
2424 buffer = cBuffer + 4 * x;
2427 UShort4 packedCol = UShort4(As<Int4>(oC.x));
2428 UShort4 value = *Pointer<UShort4>(buffer);
2429 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2430 if((rgbaWriteMask & 0x3) != 0x3)
2432 Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
2433 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2434 mergedMask &= rgbaMask;
2436 *Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2438 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2440 packedCol = UShort4(As<Int4>(oC.y));
2441 value = *Pointer<UShort4>(buffer);
2442 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2443 if((rgbaWriteMask & 0x3) != 0x3)
2445 mergedMask &= rgbaMask;
2447 *Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2452 if((rgbaWriteMask & 0x00000003) != 0x0)
2454 buffer = cBuffer + 2 * x;
2456 Int2 xyzw, packedCol;
2458 xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0);
2459 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2460 xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);
2462 if(state.targetFormat[index] == FORMAT_G8R8I)
2464 packedCol = As<Int2>(Pack(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
2468 packedCol = As<Int2>(Pack(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y))));
2471 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
2472 if((rgbaWriteMask & 0x3) != 0x3)
2474 Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
2475 UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2476 mergedMask &= rgbaMask;
2479 packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask));
2481 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1));
2482 buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2483 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
2486 case FORMAT_X32B32G32R32F:
2487 case FORMAT_A32B32G32R32F:
2488 case FORMAT_A32B32G32R32I:
2489 case FORMAT_A32B32G32R32UI:
2490 buffer = cBuffer + 16 * x;
2493 value = *Pointer<Float4>(buffer, 16);
2495 if(rgbaWriteMask != 0x0000000F)
2497 Float4 masked = value;
2498 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2499 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2500 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2503 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskX0X) + xMask * 16, 16));
2504 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16));
2505 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2506 *Pointer<Float4>(buffer, 16) = oC.x;
2510 value = *Pointer<Float4>(buffer + 16, 16);
2512 if(rgbaWriteMask != 0x0000000F)
2514 Float4 masked = value;
2515 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2516 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2517 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2520 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskX1X) + xMask * 16, 16));
2521 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16));
2522 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2523 *Pointer<Float4>(buffer + 16, 16) = oC.y;
2526 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2529 value = *Pointer<Float4>(buffer, 16);
2531 if(rgbaWriteMask != 0x0000000F)
2533 Float4 masked = value;
2534 oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2535 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2536 oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked));
2539 oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskX2X) + xMask * 16, 16));
2540 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16));
2541 oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value));
2542 *Pointer<Float4>(buffer, 16) = oC.z;
2546 value = *Pointer<Float4>(buffer + 16, 16);
2548 if(rgbaWriteMask != 0x0000000F)
2550 Float4 masked = value;
2551 oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2552 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2553 oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked));
2556 oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskX3X) + xMask * 16, 16));
2557 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16));
2558 oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value));
2559 *Pointer<Float4>(buffer + 16, 16) = oC.w;
2562 case FORMAT_A16B16G16R16I:
2563 case FORMAT_A16B16G16R16UI:
2564 if((rgbaWriteMask & 0x0000000F) != 0x0)
2566 buffer = cBuffer + 8 * x;
2569 UShort8 value = *Pointer<UShort8>(buffer);
2570 UShort8 packedCol = UShort8(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y)));
2571 UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
2572 if((rgbaWriteMask & 0xF) != 0xF)
2574 UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
2575 rgbaMask = UInt4(tmpMask, tmpMask);
2576 mergedMask &= rgbaMask;
2578 *Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2580 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2582 value = *Pointer<UShort8>(buffer);
2583 packedCol = UShort8(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w)));
2584 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
2585 if((rgbaWriteMask & 0xF) != 0xF)
2587 mergedMask &= rgbaMask;
2589 *Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2592 case FORMAT_A8B8G8R8I:
2593 case FORMAT_A8B8G8R8UI:
2594 if((rgbaWriteMask & 0x0000000F) != 0x0)
2596 UInt2 value, packedCol, mergedMask;
2598 buffer = cBuffer + 4 * x;
2600 if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
2602 packedCol = As<UInt2>(Pack(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
2606 packedCol = As<UInt2>(Pack(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y))));
2608 value = *Pointer<UInt2>(buffer, 16);
2609 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2610 if(rgbaWriteMask != 0xF)
2612 mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
2614 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
2616 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2618 if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
2620 packedCol = As<UInt2>(Pack(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
2624 packedCol = As<UInt2>(Pack(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w))));
2626 value = *Pointer<UInt2>(buffer, 16);
2627 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2628 if(rgbaWriteMask != 0xF)
2630 mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
2632 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
2640 UShort4 PixelRoutine::convertFixed16(Float4 &cf, bool saturate)
2642 return UShort4(cf * Float4(0xFFFF), saturate);
2645 void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c)
2647 c.x = As<UShort4>(c.x) >> 4;
2648 c.y = As<UShort4>(c.y) >> 4;
2649 c.z = As<UShort4>(c.z) >> 4;
2651 sRGBtoLinear12_16(c);
2654 void PixelRoutine::sRGBtoLinear12_16(Vector4s &c)
2656 Pointer<Byte> LUT = constants + OFFSET(Constants,sRGBtoLinear12_16);
2658 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2659 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2660 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2661 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2663 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2664 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2665 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2666 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2668 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2669 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2670 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2671 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2674 void PixelRoutine::linearToSRGB16_12_16(Vector4s &c)
2676 c.x = As<UShort4>(c.x) >> 4;
2677 c.y = As<UShort4>(c.y) >> 4;
2678 c.z = As<UShort4>(c.z) >> 4;
2680 linearToSRGB12_16(c);
2683 void PixelRoutine::linearToSRGB12_16(Vector4s &c)
2685 Pointer<Byte> LUT = constants + OFFSET(Constants,linearToSRGB12_16);
2687 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2688 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2689 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2690 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2692 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2693 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2694 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2695 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2697 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2698 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2699 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2700 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2703 Float4 PixelRoutine::sRGBtoLinear(const Float4 &x) // Approximates x^2.2
2705 Float4 linear = x * x;
2706 linear = linear * Float4(0.73f) + linear * x * Float4(0.27f);
2708 return Min(Max(linear, Float4(0.0f)), Float4(1.0f));
2711 bool PixelRoutine::colorUsed()
2713 return state.colorWriteMask || state.alphaTestActive() || state.shaderContainsKill;