1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 #include "PixelRoutine.hpp"
17 #include "Renderer.hpp"
18 #include "QuadRasterizer.hpp"
19 #include "Surface.hpp"
20 #include "Primitive.hpp"
22 #include "SamplerCore.hpp"
23 #include "Constants.hpp"
// Global configuration flags, declared extern here (their definitions and
// default values are not part of this excerpt).
// complementaryDepthBuffer: when set, shader-written depth is used as 1 - oDepth
// and the ordering comparisons in depthTest are reversed.
28 extern bool complementaryDepthBuffer;
// postBlendSRGB: combined with state.writeSRGB to enable the sRGB <-> linear
// conversions in readPixel and writeColor.
29 extern bool postBlendSRGB;
// exactColorRounding: gates the per-format rounding adjustment in writeColor.
30 extern bool exactColorRounding;
// forceClearRegisters: makes the constructor zero all fragment inputs.
31 extern bool forceClearRegisters;
// Builds the pixel routine for the given pipeline state and optional pixel shader.
// Both arguments are forwarded to the QuadRasterizer base; `v` is constructed with a
// flag that is true only when a shader exists and reports dynamicallyIndexedInput.
33 PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader) : QuadRasterizer(state, shader), v(shader && shader->dynamicallyIndexedInput)
// Clear the inputs when there is no shader, the shader version is below 0x0200,
// or the global forceClearRegisters flag is set.
35 if(!shader || shader->getVersion() < 0x0200 || forceClearRegisters)
37 for(int i = 0; i < MAX_FRAGMENT_INPUTS; i++)
// All four components of every fragment input start at 0.0f.
39 v[i].x = Float4(0.0f);
40 v[i].y = Float4(0.0f);
41 v[i].z = Float4(0.0f);
42 v[i].w = Float4(0.0f);
// Destructor. Visits every texture image unit.
// NOTE(review): the loop body is not visible in this excerpt; presumably it releases
// the SamplerCore objects that quad() allocates with `new` - confirm.
47 PixelRoutine::~PixelRoutine()
49 for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
// Emits the per-quad pixel pipeline: stencil test, depth interpolation and test,
// attribute interpolation, alpha test, depth/colour output and stencil write.
//   cBuffer, zBuffer, sBuffer - colour, depth and stencil target pointers
//   cMask                     - one coverage mask per sample
//   x, y                      - quad coordinates
// NOTE(review): several statements of this function are not visible in this excerpt;
// the comments below describe only the lines shown.
55 void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
// Start of the whole-pipeline cycle measurement (added to cycles[PERF_PIPE] at the end).
58 Long pipeTime = Ticks();
// One SamplerCore per texture image unit, built from that unit's sampler state.
61 for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
63 sampler[i] = new SamplerCore(constants, state.sampler[i]);
// Depth may be tested before shading only if depth is not overridden and no alpha test is active.
66 const bool earlyDepthTest = !state.depthOverride && !state.alphaTestActive();
68 Int zMask[4]; // Depth mask
69 Int sMask[4]; // Stencil mask
// First per-sample loop; its body is not visible in this excerpt.
71 for(unsigned int q = 0; q < state.multiSample; q++)
// Stencil test for every sample; writes sMask[q].
77 for(unsigned int q = 0; q < state.multiSample; q++)
79 stencilTest(sBuffer, q, x, sMask[q], cMask[q]);
// Per-pixel x positions: the scalar x broadcast plus the primitive's xQuad offsets.
85 Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16);
// Interpolate depth for each sample.
89 for(unsigned int q = 0; q < state.multiSample; q++)
// NOTE(review): the `x` used below takes a Float4 operand, so it is presumably a Float4
// local declared on a line not visible here, not the Int parameter - confirm.
// With multisampling it is shifted by the per-sample entry of Constants::X.
93 if(state.multiSample > 1)
95 x -= *Pointer<Float4>(constants + OFFSET(Constants,X) + q * sizeof(float4));
98 z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive,z), false, false);
102 Bool depthPass = false;
// depthPass becomes true as soon as any sample passes the depth test.
106 for(unsigned int q = 0; q < state.multiSample; q++)
108 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
// Continue if some sample passed, or if the depth test was not performed early.
112 If(depthPass || Bool(!earlyDepthTest))
// Start of the interpolation cycle measurement.
115 Long interpTime = Ticks();
// Per-pixel y positions, built the same way as xxxx.
118 Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16);
120 // Centroid locations
121 Float4 XXXX = Float4(0.0f);
122 Float4 YYYY = Float4(0.0f);
// Weight accumulator starts at a tiny non-zero value (presumably to avoid a later
// division by zero - the use of WWWW is not visible here).
126 Float4 WWWW(1.0e-9f);
// Sum per-sample position/weight table entries, selected by each sample's coverage mask
// (16 bytes per table entry).
128 for(unsigned int q = 0; q < state.multiSample; q++)
130 XXXX += *Pointer<Float4>(constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]);
131 YYYY += *Pointer<Float4>(constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]);
132 WWWW += *Pointer<Float4>(constants + OFFSET(Constants,weight) + 16 * cMask[q]);
// Interpolate the Primitive::w plane, then take its reciprocal into rhw.
145 w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive,w), false, false);
146 rhw = reciprocal(w, false, false, true);
// Same quantity evaluated at the centroid location.
150 rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive,w), false, false));
// Interpolate every enabled component of every fragment input.
154 for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
156 for(int component = 0; component < 4; component++)
// Only components whose bit is set in the interpolant's component mask are produced.
158 if(state.interpolant[interpolant].component & (1 << component))
// Non-centroid inputs use xxxx and rhw; the flat flag is taken per component.
160 if(!state.interpolant[interpolant].centroid)
162 v[interpolant][component] = interpolate(xxxx, Dv[interpolant][component], rhw, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
// Centroid inputs are evaluated at (XXXX, YYYY) with rhwCentroid instead.
166 v[interpolant][component] = interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
// Optional projection: the lower components are multiplied by the reciprocal of
// y, z or w (case labels are not visible in this excerpt).
173 switch(state.interpolant[interpolant].project)
178 rcp = reciprocal(v[interpolant].y);
179 v[interpolant].x = v[interpolant].x * rcp;
182 rcp = reciprocal(v[interpolant].z);
183 v[interpolant].x = v[interpolant].x * rcp;
184 v[interpolant].y = v[interpolant].y * rcp;
187 rcp = reciprocal(v[interpolant].w);
188 v[interpolant].x = v[interpolant].x * rcp;
189 v[interpolant].y = v[interpolant].y * rcp;
190 v[interpolant].z = v[interpolant].z * rcp;
// Interpolate the fog factor when the fog component is enabled.
195 if(state.fog.component)
197 f = interpolate(xxxx, Df, rhw, primitive + OFFSET(Primitive,f), state.fog.flat & 0x01, state.perspective);
200 setBuiltins(x, y, z, w);
// Add the elapsed interpolation time to the profile counters.
203 cycles[PERF_INTERP] += Ticks() - interpTime;
206 Bool alphaPass = true;
// Shader timing brackets lines that are not visible in this excerpt.
211 Long shaderTime = Ticks();
217 cycles[PERF_SHADER] += Ticks() - shaderTime;
// The alpha test may update the coverage masks.
220 alphaPass = alphaTest(cMask);
// If the shader can kill pixels or the alpha test is active, restrict the depth and
// stencil masks to the coverage that remains.
222 if((shader && shader->containsKill()) || state.alphaTestActive())
224 for(unsigned int q = 0; q < state.multiSample; q++)
226 zMask[q] &= cMask[q];
227 sMask[q] &= cMask[q];
// Second depth-test loop, after shading (its enclosing condition is not visible here).
236 for(unsigned int q = 0; q < state.multiSample; q++)
238 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
// Start of the raster-operation cycle measurement.
243 Long ropTime = Ticks();
246 If(depthPass || Bool(earlyDepthTest))
248 for(unsigned int q = 0; q < state.multiSample; q++)
// Only samples enabled in the multisample mask are written.
250 if(state.multiSampleMask & (1 << q))
252 writeDepth(zBuffer, q, x, z[q], zMask[q]);
// Occlusion counting: add the table entry indexed by the combined
// depth-and-stencil mask (4 bytes per entry).
254 if(state.occlusionEnabled)
256 occlusion += *Pointer<UInt>(constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q]));
// Profiler bookkeeping: 4 is added to ropOperations atomically.
264 AddAtomic(Pointer<Long>(&profiler.ropOperations), 4);
// Colour output for all render targets.
267 rasterOperation(f, cBuffer, x, sMask, zMask, cMask);
272 cycles[PERF_ROP] += Ticks() - ropTime;
// Stencil write for every sample enabled in the multisample mask.
277 for(unsigned int q = 0; q < state.multiSample; q++)
279 if(state.multiSampleMask & (1 << q))
281 writeStencil(sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
// Add the elapsed whole-pipeline time to the profile counters.
286 cycles[PERF_PIPE] += Ticks() - pipeTime;
// Evaluates a plane equation at the given (x, y) centroid positions.
// The visible lines load the constant term C and add x * A + y * B.
// NOTE(review): the lines that use `flat`, `perspective` and `rhw`, and the return,
// are not visible in this excerpt.
290 Float4 PixelRoutine::interpolateCentroid(Float4 &x, Float4 &y, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective)
// Start from the plane's C coefficient.
292 Float4 interpolant = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,C), 16);
// Add the x and y gradient terms.
296 interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,A), 16) +
297 y * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,B), 16);
// Runs the stencil test for sample q of the quad at column x and stores the result,
// restricted to the coverage mask cMask, in sMask.
308 void PixelRoutine::stencilTest(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask)
// Nothing to test when stencil is inactive (the body of this branch is not visible here).
310 if(!state.stencilActive)
315 // (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
// Stencil data for this quad starts 2 * x bytes into the buffer.
317 Pointer<Byte> buffer = sBuffer + 2 * x;
// Step to the slice of sample q (the guarding condition is not visible in this excerpt).
321 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
// Keep an untouched copy for the second (CCW) compare.
324 Byte8 value = *Pointer<Byte8>(buffer);
325 Byte8 valueCCW = value;
// Apply the stencil[0] test mask unless the state says no mask is needed.
327 if(!state.noStencilMask)
329 value &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].testMaskQ));
332 stencilTest(value, state.stencilCompareMode, false);
// Two-sided stencil: repeat with the stencil[1] mask and the CCW compare mode.
334 if(state.twoSidedStencil)
336 if(!state.noStencilMaskCCW)
338 valueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].testMaskQ));
341 stencilTest(valueCCW, state.stencilCompareModeCCW, true);
// Mask each result with the primitive's clockwise / inverse-clockwise mask
// (the statement that merges the two results is not visible here).
343 value &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
344 valueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
// Collapse the per-byte result to a bit mask and restrict it to covered pixels.
348 sMask = SignMask(value) & cMask;
// Replaces `value` (stencil buffer bytes) with the per-byte result of comparing it
// against the reference stored in DrawData::stencil[CCW].
// The existing case comments use a for the reference and b for the buffer value.
// NOTE(review): the 0x80 additions pair with the As<SByte8> CmpGT calls and the
// referenceMaskedSignedQ field; presumably the bias makes a signed compare produce
// unsigned ordering - confirm. Some case labels and statements are not visible here.
351 void PixelRoutine::stencilTest(Byte8 &value, StencilCompareMode stencilCompareMode, bool CCW)
355 switch(stencilCompareMode)
// All bytes set (case label not visible).
358 value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
// All bytes cleared (case label not visible).
361 value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
363 case STENCIL_LESS: // a < b ~ b > a
364 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
365 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
// Equality compare against the unbiased masked reference (case label not visible).
368 value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
370 case STENCIL_NOTEQUAL: // a != b ~ !(a == b)
371 value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
// Invert the equality mask.
372 value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
374 case STENCIL_LESSEQUAL: // a <= b ~ (b > a) || (a == b)
// `equal` is set up on a line not visible here; the merge of `equal` into `value`
// is also not visible.
376 equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
377 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
378 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
381 case STENCIL_GREATER: // a > b
// Here the reference is the left operand of CmpGT, so `equal` is reused as scratch.
382 equal = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ));
383 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
384 equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
387 case STENCIL_GREATEREQUAL: // a >= b ~ !(a < b) ~ !(b > a)
388 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
389 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
// Invert the greater-than mask.
390 value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
// Depth test for sample q of the quad at column x. Compares the incoming depth with
// the depth buffer contents and writes the passing pixels, restricted to cMask, to zMask.
// NOTE(review): the return statements, several declarations and most case labels are
// not visible in this excerpt.
397 Bool PixelRoutine::depthTest(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask)
// Early-out path when depth testing is disabled (body not visible here).
399 if(!state.depthTestActive)
// When the shader overrides depth, the shader output oDepth is the source of Z;
// with a complementary depth buffer it is used as 1 - oDepth.
406 if(shader && shader->depthOverride())
408 if(complementaryDepthBuffer)
410 Z = Float4(1.0f) - oDepth;
418 Pointer<Byte> buffer;
// Linear layout: 4 bytes per x step, and the row pitch is loaded for the second row.
421 if(!state.quadLayoutDepthBuffer)
423 buffer = zBuffer + 4 * x;
424 pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
// Quad layout: 8 bytes per x step.
428 buffer = zBuffer + 8 * x;
// Step to the slice of sample q (the guarding condition is not visible in this excerpt).
433 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
// Decide whether the existing depth values need to be loaded.
438 if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
440 if(!state.quadLayoutDepthBuffer)
442 // FIXME: Properly optimizes?
// Linear layout: .xy comes from the first row; the second load starts 8 bytes before
// buffer + pitch so that its .zw lanes are the values at buffer + pitch.
443 zValue.xy = *Pointer<Float4>(buffer);
444 zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
// Quad layout: a single aligned 16-byte load.
448 zValue = *Pointer<Float4>(buffer, 16);
// Per-lane comparison of buffer depth against Z. For the ordering modes the
// complementary-depth variant uses the reversed comparison.
454 switch(state.depthCompareMode)
463 zTest = CmpEQ(zValue, Z);
466 zTest = CmpNEQ(zValue, Z);
469 if(complementaryDepthBuffer)
471 zTest = CmpLT(zValue, Z);
475 zTest = CmpNLE(zValue, Z);
478 case DEPTH_GREATEREQUAL:
479 if(complementaryDepthBuffer)
481 zTest = CmpNLT(zValue, Z);
485 zTest = CmpLE(zValue, Z);
488 case DEPTH_LESSEQUAL:
489 if(complementaryDepthBuffer)
491 zTest = CmpLE(zValue, Z);
495 zTest = CmpNLT(zValue, Z);
499 if(complementaryDepthBuffer)
501 zTest = CmpNLE(zValue, Z);
505 zTest = CmpLT(zValue, Z);
// Second switch turns the comparison into zMask; only its default-looking path is visible.
512 switch(state.depthCompareMode)
// Collapse the per-lane result to a bit mask and restrict it to covered pixels.
521 zMask = SignMask(zTest) & cMask;
// Additional handling when stencil is active (body not visible in this excerpt).
525 if(state.stencilActive)
// Compares the 16-bit alpha values against DrawData::factor.alphaReference4 using
// state.alphaCompareMode and writes the passing pixels to aMask as a bit mask.
// Each visible case builds a per-lane compare result, packs it against zero and
// takes the sign mask.
// NOTE(review): some case labels and statements are not visible in this excerpt.
533 void PixelRoutine::alphaTest(Int &aMask, Short4 &alpha)
538 switch(state.alphaCompareMode)
// Equality compare (case label not visible).
547 cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
548 aMask = SignMask(Pack(cmp, Short4(0x0000)));
550 case ALPHA_NOTEQUAL: // a != b ~ !(a == b)
551 cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu); // FIXME
552 aMask = SignMask(Pack(cmp, Short4(0x0000)));
554 case ALPHA_LESS: // a < b ~ b > a
555 cmp = CmpGT(*Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)), alpha);
556 aMask = SignMask(Pack(cmp, Short4(0x0000)));
558 case ALPHA_GREATEREQUAL: // a >= b ~ (a > b) || (a == b) ~ !(b > a) // TODO: Approximate
// Both the equal and the greater-than masks are computed; the statement that
// combines them is not visible here.
559 equal = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
560 cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
562 aMask = SignMask(Pack(cmp, Short4(0x0000)));
564 case ALPHA_LESSEQUAL: // a <= b ~ !(a > b)
565 cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu); // FIXME
566 aMask = SignMask(Pack(cmp, Short4(0x0000)));
568 case ALPHA_GREATER: // a > b
569 cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
570 aMask = SignMask(Pack(cmp, Short4(0x0000)));
// Alpha-to-coverage: compares alpha against four thresholds (DrawData a2c0..a2c3)
// and converts each result to a bit mask.
// NOTE(review): the statements that apply aMask0..aMask3 to cMask are not visible
// in this excerpt.
577 void PixelRoutine::alphaToCoverage(Int cMask[4], Float4 &alpha)
// CmpNLT is "not less than", so a lane is set when alpha >= the threshold.
579 Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c0)));
580 Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c1)));
581 Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c2)));
582 Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c3)));
// One bit per lane for each threshold.
584 Int aMask0 = SignMask(coverage0);
585 Int aMask1 = SignMask(coverage1);
586 Int aMask2 = SignMask(coverage2);
587 Int aMask3 = SignMask(coverage3);
// Blends the RGB channels of c0 with the fog colour from DrawData::fog.colorF.
// The alpha channel (c0.w) is not touched by the visible lines.
// NOTE(review): the statements between the subtraction and the addition are not
// visible in this excerpt; presumably c0 is scaled by `fog` there - confirm.
595 void PixelRoutine::fogBlend(Vector4f &c0, Float4 &fog)
// Clamp the fog factor to [0, 1] when pixel fog is enabled.
602 if(state.pixelFogMode != FOG_NONE)
606 fog = Min(fog, Float4(1.0f));
607 fog = Max(fog, Float4(0.0f));
// Subtract the fog colour from each RGB channel...
610 c0.x -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
611 c0.y -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
612 c0.z -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
// ...and add it back afterwards.
618 c0.x += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
619 c0.y += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
620 c0.z += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
// Computes the per-pixel fog factor into `visibility` according to state.pixelFogMode.
// NOTE(review): case labels and several statements are not visible in this excerpt.
623 void PixelRoutine::pixelFog(Float4 &visibility)
// zw is an alias: everything written to it ends up in the output argument.
625 Float4 &zw = visibility;
627 if(state.pixelFogMode != FOG_NONE)
// With a complementary depth buffer the input is 1 - z[0].
635 if(complementaryDepthBuffer)
637 zw = Float4(1.0f) - z[0];
646 switch(state.pixelFogMode)
// Scale-and-offset path (case label not visible).
651 zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.scale));
652 zw += *Pointer<Float4>(data + OFFSET(DrawData,fog.offset));
// Exponential path: scale by fog.densityE, then base-2 exponential (case label not visible).
655 zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.densityE));
656 zw = exponential2(zw, true);
// Second exponential path using fog.density2E; the statements before this scale are
// not visible here.
660 zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.density2E));
661 zw = exponential2(zw, true);
// Writes the depth of sample q for the quad at column x, updating only the pixels
// selected by zMask and keeping the existing buffer value elsewhere.
// NOTE(review): some declarations and statements are not visible in this excerpt.
668 void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask)
// Nothing to do when depth writes are disabled (body not visible here).
670 if(!state.depthWriteEnable)
// When the shader overrides depth, oDepth is the source; with a complementary depth
// buffer it is used as 1 - oDepth.
677 if(shader && shader->depthOverride())
679 if(complementaryDepthBuffer)
681 Z = Float4(1.0f) - oDepth;
689 Pointer<Byte> buffer;
// Linear layout: 4 bytes per x step, and the row pitch is loaded for the second row.
692 if(!state.quadLayoutDepthBuffer)
694 buffer = zBuffer + 4 * x;
695 pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
// Quad layout: 8 bytes per x step.
699 buffer = zBuffer + 8 * x;
// Step to the slice of sample q (the guarding condition is not visible in this excerpt).
704 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
// Same load condition as in depthTest.
709 if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
711 if(!state.quadLayoutDepthBuffer)
713 // FIXME: Properly optimizes?
// Linear layout: .xy from the first row; the second load starts 8 bytes before
// buffer + pitch so that its .zw lanes are the values at buffer + pitch.
714 zValue.xy = *Pointer<Float4>(buffer);
715 zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
719 zValue = *Pointer<Float4>(buffer, 16);
// Select new depth where zMask is set and old depth elsewhere, using the
// maskD4X / invMaskD4X tables (16 bytes per entry, indexed by zMask).
723 Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + zMask * 16, 16));
724 zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16));
725 Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
// Store: two Float2 writes (one per row) for the linear layout, one aligned Float4
// write for the quad layout.
727 if(!state.quadLayoutDepthBuffer)
729 // FIXME: Properly optimizes?
730 *Pointer<Float2>(buffer) = Float2(Z.xy);
731 *Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
735 *Pointer<Float4>(buffer, 16) = Z;
// Updates the stencil buffer for sample q of the quad at column x. The new value is
// chosen per pixel from the pass / z-fail / fail operations, filtered through the
// write masks, and written only to pixels covered by cMask.
// NOTE(review): the bodies of the early-out branches and some declarations are not
// visible in this excerpt.
739 void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask)
// Early-out checks: stencil inactive, ...
741 if(!state.stencilActive)
// ...all operations are KEEP for the first face and, if two-sided, for the CCW face, ...
746 if(state.stencilPassOperation == OPERATION_KEEP && state.stencilZFailOperation == OPERATION_KEEP && state.stencilFailOperation == OPERATION_KEEP)
748 if(!state.twoSidedStencil || (state.stencilPassOperationCCW == OPERATION_KEEP && state.stencilZFailOperationCCW == OPERATION_KEEP && state.stencilFailOperationCCW == OPERATION_KEEP))
// ...or stencil writes are fully masked for every face in use.
754 if(state.stencilWriteMasked && (!state.twoSidedStencil || state.stencilWriteMaskedCCW))
// Stencil data for this quad starts 2 * x bytes into the buffer.
759 Pointer<Byte> buffer = sBuffer + 2 * x;
// Step to the slice of sample q (the guarding condition is not visible in this excerpt).
763 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
766 Byte8 bufferValue = *Pointer<Byte8>(buffer);
// Compute the new value for the first face.
769 stencilOperation(newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask);
// Apply the write mask: new bits where the mask is set, old bits elsewhere.
771 if(!state.noStencilWriteMask)
773 Byte8 maskedValue = bufferValue;
774 newValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].writeMaskQ));
775 maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].invWriteMaskQ));
776 newValue |= maskedValue;
// Two-sided stencil: repeat for the CCW face with the stencil[1] masks.
779 if(state.twoSidedStencil)
783 stencilOperation(newValueCCW, bufferValue, state.stencilPassOperationCCW, state.stencilZFailOperationCCW, state.stencilFailOperationCCW, true, zMask, sMask);
785 if(!state.noStencilWriteMaskCCW)
787 Byte8 maskedValue = bufferValue;
788 newValueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].writeMaskQ));
789 maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].invWriteMaskQ));
790 newValueCCW |= maskedValue;
// Pick the result matching the primitive's winding via the clockwise masks.
793 newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
794 newValueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
795 newValue |= newValueCCW;
// Keep the old buffer value for pixels outside the coverage mask
// (maskB4Q / invMaskB4Q tables, 8 bytes per entry, indexed by cMask).
798 newValue &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * cMask);
799 bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask);
800 newValue |= bufferValue;
// Only the low four bytes are stored.
802 *Pointer<Byte4>(buffer) = Byte4(newValue);
// Computes the new stencil value per pixel by evaluating the pass, z-fail and fail
// operations and selecting between them with zMask (depth result) and sMask
// (stencil result). Operations identical to the pass operation are not re-evaluated.
// NOTE(review): the declarations of zFail/fail and the statements that merge the
// masked values are not visible in this excerpt.
805 void PixelRoutine::stencilOperation(Byte8 &newValue, Byte8 &bufferValue, StencilOperation stencilPassOperation, StencilOperation stencilZFailOperation, StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask)
// The pass result is built directly in the output argument.
807 Byte8 &pass = newValue;
811 stencilOperation(pass, bufferValue, stencilPassOperation, CCW);
// Only evaluate the z-fail operation when it differs from the pass operation.
813 if(stencilZFailOperation != stencilPassOperation)
815 stencilOperation(zFail, bufferValue, stencilZFailOperation, CCW);
// Only evaluate the fail operation when it differs from at least one of the others.
818 if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
820 stencilOperation(fail, bufferValue, stencilFailOperation, CCW);
823 if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
825 if(state.depthTestActive && stencilZFailOperation != stencilPassOperation) // zMask valid and values not the same
// Keep `pass` where the depth test passed and `zFail` where it did not.
827 pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * zMask);
828 zFail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask);
// Keep `pass` where the stencil test passed and `fail` where it did not.
832 pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * sMask);
833 fail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask);
// Applies a single stencil operation to bufferValue and stores the result in output.
// CCW selects which DrawData::stencil[] entry supplies the reference value.
// NOTE(review): the switch statement and some case labels are not visible in this excerpt.
838 void PixelRoutine::stencilOperation(Byte8 &output, Byte8 &bufferValue, StencilOperation operation, bool CCW)
// Keep the buffer value unchanged (case label not visible).
843 output = bufferValue;
// Set to zero (case label not visible).
846 output = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
848 case OPERATION_REPLACE:
849 output = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceQ));
850 // Saturating increment / decrement.
851 case OPERATION_INCRSAT:
852 output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
854 case OPERATION_DECRSAT:
855 output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
857 case OPERATION_INVERT:
858 output = bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
// Non-saturating increment / decrement (case labels not visible).
861 output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
864 output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
// Computes the RGB blend factor selected by blendFactorActive and stores it in
// blendFactor.x/.y/.z (the alpha factor is handled by blendFactorAlpha).
//   current - source colour, pixel - destination colour, both as 16-bit channels.
// "Inverse" factors are formed as 0xFFFF - value; constant factors are read from
// DrawData::factor.
// Fix: the second parameter was mis-encoded as `¤t`; it is `&current`, the name
// the body reads from.
// NOTE(review): several case labels and statements are not visible in this excerpt.
871 void PixelRoutine::blendFactor(const Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorActive)
873 switch(blendFactorActive)
// Source colour (case label not visible).
882 blendFactor.x = current.x;
883 blendFactor.y = current.y;
884 blendFactor.z = current.z;
886 case BLEND_INVSOURCE:
887 blendFactor.x = Short4(0xFFFFu) - current.x;
888 blendFactor.y = Short4(0xFFFFu) - current.y;
889 blendFactor.z = Short4(0xFFFFu) - current.z;
// Destination colour (case label not visible).
892 blendFactor.x = pixel.x;
893 blendFactor.y = pixel.y;
894 blendFactor.z = pixel.z;
// Inverse destination colour (case label not visible).
897 blendFactor.x = Short4(0xFFFFu) - pixel.x;
898 blendFactor.y = Short4(0xFFFFu) - pixel.y;
899 blendFactor.z = Short4(0xFFFFu) - pixel.z;
901 case BLEND_SOURCEALPHA:
902 blendFactor.x = current.w;
903 blendFactor.y = current.w;
904 blendFactor.z = current.w;
906 case BLEND_INVSOURCEALPHA:
907 blendFactor.x = Short4(0xFFFFu) - current.w;
908 blendFactor.y = Short4(0xFFFFu) - current.w;
909 blendFactor.z = Short4(0xFFFFu) - current.w;
911 case BLEND_DESTALPHA:
912 blendFactor.x = pixel.w;
913 blendFactor.y = pixel.w;
914 blendFactor.z = pixel.w;
916 case BLEND_INVDESTALPHA:
917 blendFactor.x = Short4(0xFFFFu) - pixel.w;
918 blendFactor.y = Short4(0xFFFFu) - pixel.w;
919 blendFactor.z = Short4(0xFFFFu) - pixel.w;
921 case BLEND_SRCALPHASAT:
// min(source alpha, 1 - destination alpha), compared as unsigned 16-bit values.
922 blendFactor.x = Short4(0xFFFFu) - pixel.w;
923 blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w));
924 blendFactor.y = blendFactor.x;
925 blendFactor.z = blendFactor.x;
// Constant colour (case label not visible).
928 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[0]));
929 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[1]));
930 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[2]));
932 case BLEND_INVCONSTANT:
933 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
934 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
935 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
937 case BLEND_CONSTANTALPHA:
// The constant's alpha entry (index 3) is used for all three channels.
938 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
939 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
940 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
942 case BLEND_INVCONSTANTALPHA:
943 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
944 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
945 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
// Computes the alpha blend factor selected by blendFactorAlphaActive and stores it
// in blendFactor.w. For the alpha channel the colour and alpha variants of a factor
// read the same .w value.
//   current - source colour, pixel - destination colour, both as 16-bit channels.
// Fix: the second parameter was mis-encoded as `¤t`; it is `&current`, the name
// the body reads from.
// NOTE(review): several case labels and statements are not visible in this excerpt.
952 void PixelRoutine::blendFactorAlpha(const Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorAlphaActive)
954 switch(blendFactorAlphaActive)
// Source alpha (case label not visible).
963 blendFactor.w = current.w;
965 case BLEND_INVSOURCE:
966 blendFactor.w = Short4(0xFFFFu) - current.w;
// Destination alpha (case label not visible).
969 blendFactor.w = pixel.w;
// Inverse destination alpha (case label not visible).
972 blendFactor.w = Short4(0xFFFFu) - pixel.w;
974 case BLEND_SOURCEALPHA:
975 blendFactor.w = current.w;
977 case BLEND_INVSOURCEALPHA:
978 blendFactor.w = Short4(0xFFFFu) - current.w;
980 case BLEND_DESTALPHA:
981 blendFactor.w = pixel.w;
983 case BLEND_INVDESTALPHA:
984 blendFactor.w = Short4(0xFFFFu) - pixel.w;
986 case BLEND_SRCALPHASAT:
// The alpha factor for this mode is the maximum 16-bit value.
987 blendFactor.w = Short4(0xFFFFu);
990 case BLEND_CONSTANTALPHA:
991 blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
993 case BLEND_INVCONSTANT:
994 case BLEND_INVCONSTANTALPHA:
995 blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
// Returns true when render target `index` uses one of the two sRGB formats
// (FORMAT_SRGB8_A8 or FORMAT_SRGB8_X8).
1002 bool PixelRoutine::isSRGB(int index) const
1004 return state.targetFormat[index] == FORMAT_SRGB8_A8 || state.targetFormat[index] == FORMAT_SRGB8_X8;
// Reads the current contents of render target `index` for the quad at column x and
// unpacks them into `pixel` as 16-bit-per-channel vectors (x, y, z, w).
// Each format path reads from two rows: the row at cBuffer and the row one
// DrawData::colorPitchB[index] further on. 8-bit channels are widened to 16 bits by
// interleaving a register with itself; channels a format lacks are set to a constant.
// NOTE(review): several case labels and statements are not visible in this excerpt,
// so the per-line notes describe only what is shown.
1007 void PixelRoutine::readPixel(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &pixel)
1011 Pointer<Byte> buffer;
1012 Pointer<Byte> buffer2;
1014 switch(state.targetFormat[index])
// 2-bytes-per-pixel path (case label not visible; the masks suggest a 5:6:5 layout).
// One Int is read from each of the two rows.
1017 buffer = cBuffer + 2 * x;
1018 buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1019 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
// Mask out each field and shift it to the top of its 16-bit lane; alpha is the maximum value.
1021 pixel.x = c01 & Short4(0xF800u);
1022 pixel.y = (c01 & Short4(0x07E0u)) << 5;
1023 pixel.z = (c01 & Short4(0x001Fu)) << 11;
1024 pixel.w = Short4(0xFFFFu);
1026 case FORMAT_A8R8G8B8:
// 4 bytes per pixel: one Short4 load per row.
1027 buffer = cBuffer + 4 * x;
1028 c01 = *Pointer<Short4>(buffer);
1029 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1030 c23 = *Pointer<Short4>(buffer);
// Byte interleaves that separate the channels (intermediate copies are not visible here).
1033 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1034 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1036 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1037 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
// Widen each 8-bit channel to 16 bits by interleaving the register with itself.
1040 pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1041 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1042 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1043 pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1045 case FORMAT_A8B8G8R8:
1046 case FORMAT_SRGB8_A8:
1047 buffer = cBuffer + 4 * x;
1048 c01 = *Pointer<Short4>(buffer);
1049 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1050 c23 = *Pointer<Short4>(buffer);
1053 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1054 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1056 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1057 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
// Compared with the A8R8G8B8 path, x and z take their data from different registers
// here (x from pixel.z, z from pixel.w).
1060 pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1061 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1062 pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1063 pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
// 1-byte-per-pixel path (case label not visible): one Short from each row goes into
// pixel.w, and the colour channels are zero.
1066 buffer = cBuffer + 1 * x;
1067 pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 0);
1068 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1069 pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 1);
1070 pixel.w = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1071 pixel.x = Short4(0x0000);
1072 pixel.y = Short4(0x0000);
1073 pixel.z = Short4(0x0000);
1075 case FORMAT_X8R8G8B8:
// Same unpacking as FORMAT_A8R8G8B8, but alpha is forced to the maximum value.
1076 buffer = cBuffer + 4 * x;
1077 c01 = *Pointer<Short4>(buffer);
1078 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1079 c23 = *Pointer<Short4>(buffer);
1082 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1083 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1085 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1086 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1088 pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1089 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1090 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1091 pixel.w = Short4(0xFFFFu);
1093 case FORMAT_X8B8G8R8:
1094 case FORMAT_SRGB8_X8:
// Same unpacking as FORMAT_A8B8G8R8, but alpha is forced to the maximum value.
1095 buffer = cBuffer + 4 * x;
1096 c01 = *Pointer<Short4>(buffer);
1097 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1098 c23 = *Pointer<Short4>(buffer);
1101 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1102 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1104 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1105 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1108 pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1109 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1110 pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1111 pixel.w = Short4(0xFFFFu);
// The two Q formats below only show commented-out code in this excerpt; their live
// statements, if any, are not visible.
1113 case FORMAT_A8G8R8B8Q:
1115 // pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1116 // pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1117 // pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1118 // pixel.w = UnpackHigh(As<Byte8>(pixel.w), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1120 case FORMAT_X8G8R8B8Q:
1122 // pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1123 // pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1124 // pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1125 // pixel.w = Short4(0xFFFFu);
1127 case FORMAT_A16B16G16R16:
// 8 bytes per pixel: two Short4 loads per row, then a 4x4 transpose so that each of
// x/y/z/w holds one channel. The initial assignment of `buffer` is not visible here.
1129 pixel.x = *Pointer<Short4>(buffer + 8 * x);
1130 pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
1131 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1132 pixel.z = *Pointer<Short4>(buffer + 8 * x);
1133 pixel.w = *Pointer<Short4>(buffer + 8 * x + 8);
1134 transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
// Two-channel 16-bit path, 4 bytes per pixel (case label not visible): one Short4 per
// row, de-interleaved into x and y; z and w are set to the maximum value.
1138 pixel.x = *Pointer<Short4>(buffer + 4 * x);
1139 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1140 pixel.y = *Pointer<Short4>(buffer + 4 * x);
1142 pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
1143 pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y));
1145 pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z));
1146 pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z));
1147 pixel.z = Short4(0xFFFFu);
1148 pixel.w = Short4(0xFFFFu);
// Convert to linear when sRGB writes are handled post-blend or the target is sRGB.
1154 if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
1156 sRGBtoLinear16_12_16(pixel);
// Blends the source colour `current` with the contents of render target `index` at
// column x and leaves the result in `current`. Colour (x/y/z) and alpha (w) are
// handled separately, each with its own factors and blend operation. All arithmetic
// is on unsigned 16-bit channels.
// Fix: the third parameter was mis-encoded as `¤t`; it is `&current`, the name
// the body reads and writes.
// NOTE(review): several case labels and statements are not visible in this excerpt.
1160 void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
// Nothing to do when blending is disabled (body of this branch is not visible here).
1162 if(!state.alphaBlendActive)
// Fetch the destination colour.
1168 readPixel(index, cBuffer, x, pixel);
1170 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
1171 Vector4s sourceFactor;
1172 Vector4s destFactor;
1174 blendFactor(sourceFactor, current, pixel, state.sourceBlendFactor);
1175 blendFactor(destFactor, current, pixel, state.destBlendFactor);
// The multiply is skipped for BLEND_ONE and BLEND_ZERO source factors.
1177 if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
1179 current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x));
1180 current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
1181 current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
// Likewise for the destination factor.
1184 if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
1186 pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
1187 pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y));
1188 pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z));
// Combine the weighted colours.
1191 switch(state.blendOperation)
// Saturating add (case label not visible).
1194 current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1195 current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1196 current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
// Saturating source - destination (case label not visible).
1199 current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1200 current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1201 current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1203 case BLENDOP_INVSUB:
// Saturating destination - source.
1204 current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x));
1205 current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y));
1206 current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z));
// Per-channel minimum (case label not visible).
1209 current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x));
1210 current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y));
1211 current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z));
// Per-channel maximum (case label not visible).
1214 current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x));
1215 current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y));
1216 current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z));
1218 case BLENDOP_SOURCE:
// Result is the destination colour (case label not visible).
1222 current.x = pixel.x;
1223 current.y = pixel.y;
1224 current.z = pixel.z;
// Result is zero (case label not visible).
1227 current.x = Short4(0x0000);
1228 current.y = Short4(0x0000);
1229 current.z = Short4(0x0000);
// Alpha channel: same scheme with the alpha-specific factors and operation.
1235 blendFactorAlpha(sourceFactor, current, pixel, state.sourceBlendFactorAlpha);
1236 blendFactorAlpha(destFactor, current, pixel, state.destBlendFactorAlpha);
1238 if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
1240 current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
1243 if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
1245 pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
1248 switch(state.blendOperationAlpha)
// Saturating add (case label not visible).
1251 current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
// Saturating source - destination (case label not visible).
1254 current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1256 case BLENDOP_INVSUB:
1257 current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w));
// Minimum (case label not visible).
1260 current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w));
// Maximum (case label not visible).
1263 current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w));
1265 case BLENDOP_SOURCE:
// Result is the destination alpha (case label not visible).
1269 current.w = pixel.w;
// Result is zero (case label not visible).
1272 current.w = Short4(0x0000);
// Applies state.logicalOperation bitwise between the source colour `current` and the
// contents of render target `index` at column x, leaving the result in `current`.
// Only the x, y and z channels are modified by the visible lines.
// Fix: the third parameter was mis-encoded as `¤t`; it is `&current`, the name
// the body reads and writes.
// NOTE(review): several case labels and statements are not visible in this excerpt.
1279 void PixelRoutine::logicOperation(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
// COPY leaves the source unchanged, so it is handled before reading the destination
// (the body of this branch is not visible here).
1281 if(state.logicalOperation == LOGICALOP_COPY)
// Fetch the destination colour.
1287 readPixel(index, cBuffer, x, pixel);
1289 switch(state.logicalOperation)
1291 case LOGICALOP_CLEAR:
1292 current.x = UShort4(0);
1293 current.y = UShort4(0);
1294 current.z = UShort4(0);
// All bits set (case label not visible).
1297 current.x = UShort4(0xFFFFu);
1298 current.y = UShort4(0xFFFFu);
1299 current.z = UShort4(0xFFFFu);
1301 case LOGICALOP_COPY:
1302 ASSERT(false); // Optimized out
1304 case LOGICALOP_COPY_INVERTED:
// ~source
1305 current.x = ~current.x;
1306 current.y = ~current.y;
1307 current.z = ~current.z;
1309 case LOGICALOP_NOOP:
// destination
1310 current.x = pixel.x;
1311 current.y = pixel.y;
1312 current.z = pixel.z;
1314 case LOGICALOP_INVERT:
// ~destination
1315 current.x = ~pixel.x;
1316 current.y = ~pixel.y;
1317 current.z = ~pixel.z;
// destination & source (case label not visible).
1320 current.x = pixel.x & current.x;
1321 current.y = pixel.y & current.y;
1322 current.z = pixel.z & current.z;
1324 case LOGICALOP_NAND:
1325 current.x = ~(pixel.x & current.x);
1326 current.y = ~(pixel.y & current.y);
1327 current.z = ~(pixel.z & current.z);
// destination | source (case label not visible).
1330 current.x = pixel.x | current.x;
1331 current.y = pixel.y | current.y;
1332 current.z = pixel.z | current.z;
// ~(destination | source) (case label not visible).
1335 current.x = ~(pixel.x | current.x);
1336 current.y = ~(pixel.y | current.y);
1337 current.z = ~(pixel.z | current.z);
// destination ^ source (case label not visible).
1340 current.x = pixel.x ^ current.x;
1341 current.y = pixel.y ^ current.y;
1342 current.z = pixel.z ^ current.z;
1344 case LOGICALOP_EQUIV:
1345 current.x = ~(pixel.x ^ current.x);
1346 current.y = ~(pixel.y ^ current.y);
1347 current.z = ~(pixel.z ^ current.z);
1349 case LOGICALOP_AND_REVERSE:
// source & ~destination
1350 current.x = ~pixel.x & current.x;
1351 current.y = ~pixel.y & current.y;
1352 current.z = ~pixel.z & current.z;
1354 case LOGICALOP_AND_INVERTED:
// ~source & destination
1355 current.x = pixel.x & ~current.x;
1356 current.y = pixel.y & ~current.y;
1357 current.z = pixel.z & ~current.z;
1359 case LOGICALOP_OR_REVERSE:
// source | ~destination
1360 current.x = ~pixel.x | current.x;
1361 current.y = ~pixel.y | current.y;
1362 current.z = ~pixel.z | current.z;
1364 case LOGICALOP_OR_INVERTED:
// ~source | destination
1365 current.x = pixel.x | ~current.x;
1366 current.y = pixel.y | ~current.y;
1367 current.z = pixel.z | ~current.z;
// Fixed-point colour write. Converts the 16-bit-per-channel colour in
// 'current' to the layout of state.targetFormat[index] and merges it into
// cBuffer at column x, on the addressed row and on the row colorPitchB[index]
// bytes after it. Two masks are applied to every store: the colour write mask
// from state.colorWriteActive(index) and the runtime mask xMask.
// The function has three stages: optional sRGB conversion and rounding bias,
// per-format packing of the channels, and the masked read-modify-write stores.
// NOTE(review): every line of this listing carries its original file line
// number as a prefix and the numbering has gaps, so braces, 'else', 'break',
// several 'case' labels and short statements (including every statement that
// assigns xMask, and the ones that merge 'value'/'masked' back into the
// colour) are not visible in this view.
// NOTE(review): "Vector4s ¤t" looks like a mis-encoded "Vector4s &current"
// (the body uses 'current' throughout) - confirm against the upstream file.
1374 void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s ¤t, Int &sMask, Int &zMask, Int &cMask)
// Convert to sRGB when requested globally for this draw or when the target
// itself reports an sRGB format.
1376 if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
1378 linearToSRGB16_12_16(current);
// Optional rounding bias, added before the channels are reduced in width.
1381 if(exactColorRounding)
1383 switch(state.targetFormat[index])
// Case label not visible in this listing. Saturating add of 0x0400 to x and z
// and 0x0200 to y; presumably half of the lowest bit kept by the 5/6/5-bit
// packing (0xF800 / 0xFC00 masks) further down - confirm.
1386 current.x = AddSat(As<UShort4>(current.x), UShort4(0x0400));
1387 current.y = AddSat(As<UShort4>(current.y), UShort4(0x0200));
1388 current.z = AddSat(As<UShort4>(current.z), UShort4(0x0400));
1390 case FORMAT_X8G8R8B8Q:
1391 case FORMAT_A8G8R8B8Q:
1392 case FORMAT_X8R8G8B8:
1393 case FORMAT_X8B8G8R8:
1394 case FORMAT_A8R8G8B8:
1395 case FORMAT_A8B8G8R8:
1396 case FORMAT_SRGB8_X8:
1397 case FORMAT_SRGB8_A8:
// c - (c >> 8) + 0x0080 on all four channels, ahead of the '>> 8' reductions
// performed by the packing code below.
1400 current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080);
1401 current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080);
1402 current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080);
1403 current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080);
// rgbaWriteMask: one bit per channel (0x1, 0x2, 0x4, 0x8 as tested below).
// bgraWriteMask: the same mask with bits 0 and 2 exchanged, bits 1 and 3 kept.
1410 int rgbaWriteMask = state.colorWriteActive(index);
1411 int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
// Stage 2: pack the channels into the memory layout of the target format.
1413 switch(state.targetFormat[index])
// Case label not visible in this listing. Keeps the top 5 bits of x in place,
// moves the top 6 bits of y down by 5 and the top 5 bits of z down by 11,
// then ORs the three fields into current.x.
1417 current.x = current.x & Short4(0xF800u);
1418 current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
1419 current.z = As<UShort4>(current.z) >> 11;
1421 current.x = current.x | current.y | current.z;
// The two *Q formats only carry disabled (commented-out) code.
1424 case FORMAT_X8G8R8B8Q:
1426 // current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1427 // current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1428 // current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1430 // current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1431 // current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1433 case FORMAT_A8G8R8B8Q:
1435 // current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1436 // current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1437 // current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1438 // current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1440 // current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1441 // current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1443 case FORMAT_X8R8G8B8:
1444 case FORMAT_A8R8G8B8:
// First sequence: no alpha to write (X8R8G8B8, or write mask exactly 0x7), so
// current.w is not reduced and y is packed with itself. The second sequence
// (presumably the 'else' branch, which is not shown) also reduces w and packs
// it together with y.
1445 if(state.targetFormat[index] == FORMAT_X8R8G8B8 || rgbaWriteMask == 0x7)
// Reduce each channel to its upper byte.
1447 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1448 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1449 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
// Pack to bytes, z before x.
1451 current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1452 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
// Byte interleave followed by 16-bit interleave. The results that are used
// afterwards are current.z and current.y (see c01 / c23 after this switch).
1454 current.x = current.z;
1455 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1456 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1457 current.y = current.z;
1458 current.z = As<Short4>(UnpackLow(current.z, current.x));
1459 current.y = As<Short4>(UnpackHigh(current.y, current.x));
// Second sequence: same steps with the w channel included.
1463 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1464 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1465 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1466 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1468 current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1469 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1471 current.x = current.z;
1472 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1473 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1474 current.y = current.z;
1475 current.z = As<Short4>(UnpackLow(current.z, current.x));
1476 current.y = As<Short4>(UnpackHigh(current.y, current.x));
// Same two sequences as above, except that Pack takes x before z.
1479 case FORMAT_X8B8G8R8:
1480 case FORMAT_A8B8G8R8:
1481 case FORMAT_SRGB8_X8:
1482 case FORMAT_SRGB8_A8:
1483 if(state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8 || rgbaWriteMask == 0x7)
1485 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1486 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1487 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1489 current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
1490 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1492 current.x = current.z;
1493 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1494 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1495 current.y = current.z;
1496 current.z = As<Short4>(UnpackLow(current.z, current.x));
1497 current.y = As<Short4>(UnpackHigh(current.y, current.x));
// Variant that includes w (presumably the 'else' branch, which is not shown).
1501 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1502 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1503 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1504 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1506 current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
1507 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1509 current.x = current.z;
1510 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1511 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1512 current.y = current.z;
1513 current.z = As<Short4>(UnpackLow(current.z, current.x));
1514 current.y = As<Short4>(UnpackHigh(current.y, current.x));
// Case label not visible in this listing. x and y are reduced to 8 bits,
// packed, and byte-interleaved into current.x.
1518 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1519 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1520 current.x = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.x)));
1521 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1522 current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y));
// Case label not visible in this listing. Only x is reduced to 8 bits and packed.
1525 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1526 current.x = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.x)));
// Case label not visible in this listing. Only w is reduced to 8 bits and packed.
1529 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1530 current.w = As<Short4>(Pack(As<UShort4>(current.w), As<UShort4>(current.w)));
// Case label not visible in this listing. 16-bit interleave of x and y: the
// low half ends up in current.x, the high half in current.z and then current.y.
1533 current.z = current.x;
1534 current.x = As<Short4>(UnpackLow(current.x, current.y));
1535 current.z = As<Short4>(UnpackHigh(current.z, current.y));
1536 current.y = current.z;
1538 case FORMAT_A16B16G16R16:
1539 transpose4x4(current.x, current.y, current.z, current.w);
// Packed data used by the 32-bit-per-pixel stores below: c01 goes to the
// first row written, c23 to the row one pitch further.
1545 Short4 c01 = current.z;
1546 Short4 c23 = current.y;
1548 Int xMask; // Combination of all masks
// The bodies of the next two conditions, and any other statement that assigns
// xMask, are not visible in this listing. Below, xMask is only used scaled by
// 8 as a byte offset into mask tables in 'constants'.
1550 if(state.depthTestActive)
1559 if(state.stencilActive)
// Stage 3: masked read-modify-write of the two rows.
1564 switch(state.targetFormat[index])
// Case label not visible in this listing. 2 bytes per pixel; each Int access
// covers two pixels of one row.
1568 Pointer<Byte> buffer = cBuffer + 2 * x;
1569 Int value = *Pointer<Int>(buffer);
// First row comes from the lower 32 bits of current.x.
1571 Int c01 = Extract(As<Int2>(current.x), 0);
// Channel masking is only needed when not all of the three colour bits are set.
// 'masked' is declared on a line that is not visible here.
1573 if((bgraWriteMask & 0x00000007) != 0x00000007)
1576 c01 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1577 masked &= *Pointer<Int>(constants + OFFSET(Constants,invMask565Q[bgraWriteMask & 0x7][0]));
// Runtime mask: keep new colour where xMask selects it, old value elsewhere.
1581 c01 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
1582 value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][0]) + xMask * 8);
1584 *Pointer<Int>(buffer) = c01;
// Second row: advance by the colour pitch; data comes from the upper 32 bits
// of current.x and the mask tables are read starting at element [0][2].
1586 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1587 value = *Pointer<Int>(buffer);
1589 Int c23 = Extract(As<Int2>(current.x), 1);
1591 if((bgraWriteMask & 0x00000007) != 0x00000007)
1594 c23 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1595 masked &= *Pointer<Int>(constants + OFFSET(Constants,invMask565Q[bgraWriteMask & 0x7][0]));
1599 c23 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
1600 value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][2]) + xMask * 8);
1602 *Pointer<Int>(buffer) = c23;
// The two *Q formats only carry disabled (commented-out) store code.
1605 case FORMAT_A8G8R8B8Q:
1606 case FORMAT_X8G8R8B8Q: // FIXME: Don't touch alpha?
1608 // value = *Pointer<Short4>(cBuffer + 8 * x + 0);
1610 // if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1611 // ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1612 // (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh?
1614 // Short4 masked = value;
1615 // c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1616 // masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1620 // c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1621 // value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1623 // *Pointer<Short4>(cBuffer + 8 * x + 0) = c01;
1625 // value = *Pointer<Short4>(cBuffer + 8 * x + 8);
1627 // if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1628 // ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1629 // (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh?
1631 // Short4 masked = value;
1632 // c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1633 // masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1637 // c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1638 // value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1640 // *Pointer<Short4>(cBuffer + 8 * x + 8) = c23;
1642 case FORMAT_A8R8G8B8:
1643 case FORMAT_X8R8G8B8: // FIXME: Don't touch alpha?
// 4 bytes per pixel; each Short4 access covers two pixels of one row.
1645 Pointer<Byte> buffer = cBuffer + x * 4;
1646 Short4 value = *Pointer<Short4>(buffer);
// Channel masking is needed for A8R8G8B8 unless all four bits are set, and
// for X8R8G8B8 unless the mask is 0x7 or 0xF (the last two clauses combined).
1648 if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1649 ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1650 (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh?
1652 Short4 masked = value;
1653 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1654 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
// Runtime mask for the first row (maskD01Q table).
1658 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1659 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1661 *Pointer<Short4>(buffer) = c01;
// Second row: same steps with c23 and the maskD23Q table.
1663 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1664 value = *Pointer<Short4>(buffer);
1666 if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1667 ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1668 (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh?
1670 Short4 masked = value;
1671 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1672 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1676 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1677 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1679 *Pointer<Short4>(buffer) = c23;
1682 case FORMAT_A8B8G8R8:
1683 case FORMAT_X8B8G8R8: // FIXME: Don't touch alpha?
1684 case FORMAT_SRGB8_X8:
1685 case FORMAT_SRGB8_A8:
1687 Pointer<Byte> buffer = cBuffer + x * 4;
1688 Short4 value = *Pointer<Short4>(buffer);
// Same decision as for the BGRA formats above, evaluated once into a C++ bool
// and using rgbaWriteMask (no channel swap).
1690 bool masked = (((state.targetFormat[index] == FORMAT_A8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_A8) && rgbaWriteMask != 0x0000000F) ||
1691 (((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x00000007) &&
1692 ((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x0000000F))); // FIXME: Need for masking when XBGR && Fh?
// NOTE(review): this Short4 'masked' has the same name as the bool above;
// presumably it lives in a nested scope guarded by that bool (not shown).
1696 Short4 masked = value;
1697 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1698 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1702 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1703 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1705 *Pointer<Short4>(buffer) = c01;
// Second row.
1707 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1708 value = *Pointer<Short4>(buffer);
1712 Short4 masked = value;
1713 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1714 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1718 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1719 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1721 *Pointer<Short4>(buffer) = c23;
// Case label not visible in this listing. 2 bytes per pixel, two channels
// (write-mask bits 0 and 1); skipped entirely when neither bit is set.
// 'value' is declared on a line that is not visible here.
1725 if((rgbaWriteMask & 0x00000003) != 0x0)
1727 Pointer<Byte> buffer = cBuffer + 2 * x;
// Gather both rows: first row in lane 0, next row in lane 1.
1729 value = Insert(value, *Pointer<Int>(buffer), 0);
1730 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1731 value = Insert(value, *Pointer<Int>(buffer + pitch), 1);
1733 Int2 packedCol = As<Int2>(current.x);
// Combine the runtime mask with the channel mask. 5 * m maps the two-bit
// channel mask m = 1, 2, 3 to table rows 5 (0101b), 10 (1010b) and 15 (1111b).
1735 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
1736 if((rgbaWriteMask & 0x3) != 0x3)
1738 Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
1739 UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
1740 mergedMask &= rgbaMask;
// Select new colour where the mask is set, old contents elsewhere.
1743 packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask));
1745 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
1746 *Pointer<UInt>(buffer + pitch) = As<UInt>(Extract(packedCol, 1));
// Case label not visible in this listing. 1 byte per pixel holding the x
// channel; written only when write-mask bit 0 is set.
1750 if(rgbaWriteMask & 0x00000001)
1752 Pointer<Byte> buffer = cBuffer + 1 * x;
// Two pixels (one Short) from each row, then 'value' is byte-interleaved
// with itself.
1754 value = Insert(value, *Pointer<Short>(buffer), 0);
1755 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1756 value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
1757 value = UnpackLow(As<Byte8>(value), As<Byte8>(value));
1759 current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask);
1760 value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask);
1763 *Pointer<Short>(buffer) = Extract(current.x, 0);
1764 *Pointer<Short>(buffer + pitch) = Extract(current.x, 1);
// Case label not visible in this listing. Same as the previous group, but for
// the w channel and write-mask bit 3.
1768 if(rgbaWriteMask & 0x00000008)
1770 Pointer<Byte> buffer = cBuffer + 1 * x;
1772 value = Insert(value, *Pointer<Short>(buffer), 0);
1773 Int pitch = *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1774 value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
1775 value = UnpackLow(As<Byte8>(value), As<Byte8>(value));
1777 current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q) + 8 * xMask);
1778 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q) + 8 * xMask);
1781 *Pointer<Short>(buffer) = Extract(current.w, 0);
1782 *Pointer<Short>(buffer + pitch) = Extract(current.w, 1);
// Case label not visible in this listing. 4 bytes per pixel, two 16-bit
// channels: current.x is stored to the first row, current.y to the next.
1787 Pointer<Byte> buffer = cBuffer + 4 * x;
1789 Short4 value = *Pointer<Short4>(buffer);
1791 if((rgbaWriteMask & 0x00000003) != 0x00000003)
1793 Short4 masked = value;
1794 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1795 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0]));
1796 current.x |= masked;
1799 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1800 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1802 *Pointer<Short4>(buffer) = current.x;
1804 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1806 value = *Pointer<Short4>(buffer);
1808 if((rgbaWriteMask & 0x00000003) != 0x00000003)
1810 Short4 masked = value;
1811 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1812 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0]));
1813 current.y |= masked;
1816 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1817 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1819 *Pointer<Short4>(buffer) = current.y;
// 8 bytes per pixel, one Short4 per pixel. After the transpose4x4 above,
// current.x/.y go to the first row (offsets 0 and 8) and current.z/.w to the
// next row; the runtime mask uses one table per pixel (maskQ0Q .. maskQ3Q).
1822 case FORMAT_A16B16G16R16:
1824 Pointer<Byte> buffer = cBuffer + 8 * x;
1827 Short4 value = *Pointer<Short4>(buffer);
1829 if(rgbaWriteMask != 0x0000000F)
1831 Short4 masked = value;
1832 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1833 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1834 current.x |= masked;
1837 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
1838 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
1840 *Pointer<Short4>(buffer) = current.x;
1844 Short4 value = *Pointer<Short4>(buffer + 8);
1846 if(rgbaWriteMask != 0x0000000F)
1848 Short4 masked = value;
1849 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1850 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1851 current.y |= masked;
1854 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
1855 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
1857 *Pointer<Short4>(buffer + 8) = current.y;
// Second row.
1860 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1863 Short4 value = *Pointer<Short4>(buffer);
1865 if(rgbaWriteMask != 0x0000000F)
1867 Short4 masked = value;
1868 current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1869 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1870 current.z |= masked;
1873 current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
1874 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
1876 *Pointer<Short4>(buffer) = current.z;
1880 Short4 value = *Pointer<Short4>(buffer + 8);
1882 if(rgbaWriteMask != 0x0000000F)
1884 Short4 masked = value;
1885 current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1886 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1887 current.w |= masked;
1890 current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
1891 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
1893 *Pointer<Short4>(buffer + 8) = current.w;
// Floating-point colour blend factor. Fills blendFactor.x/.y/.z according to
// blendFactorActive, using the source colour oC, the destination colour
// 'pixel', or the blend constants stored in DrawData. blendFactor.w is not
// written here.
// NOTE(review): every line of this listing carries its original file line
// number as a prefix and the numbering has gaps (1905-1912, 1916, 1921-1922,
// ...), so braces, 'break' statements and several 'case' labels are not
// visible. The first hidden range presumably includes the cases that need no
// factor at all - confirm against the upstream file.
// NOTE(review): 'blendFactor' is taken by const reference yet its members are
// assigned below; this presumably relies on the member type's assignment
// being callable on a const object - confirm in the Vector4f/Float4 headers.
1902 void PixelRoutine::blendFactor(const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorActive)
1904 switch(blendFactorActive)
// Case label not visible in this listing. Factor = source colour.
1913 blendFactor.x = oC.x;
1914 blendFactor.y = oC.y;
1915 blendFactor.z = oC.z;
// Factor = 1 - source colour.
1917 case BLEND_INVSOURCE:
1918 blendFactor.x = Float4(1.0f) - oC.x;
1919 blendFactor.y = Float4(1.0f) - oC.y;
1920 blendFactor.z = Float4(1.0f) - oC.z;
// Case label not visible in this listing. Factor = destination colour.
1923 blendFactor.x = pixel.x;
1924 blendFactor.y = pixel.y;
1925 blendFactor.z = pixel.z;
// Case label not visible in this listing. Factor = 1 - destination colour.
1928 blendFactor.x = Float4(1.0f) - pixel.x;
1929 blendFactor.y = Float4(1.0f) - pixel.y;
1930 blendFactor.z = Float4(1.0f) - pixel.z;
// Factor = source alpha, replicated to all three channels.
1932 case BLEND_SOURCEALPHA:
1933 blendFactor.x = oC.w;
1934 blendFactor.y = oC.w;
1935 blendFactor.z = oC.w;
// Factor = 1 - source alpha.
1937 case BLEND_INVSOURCEALPHA:
1938 blendFactor.x = Float4(1.0f) - oC.w;
1939 blendFactor.y = Float4(1.0f) - oC.w;
1940 blendFactor.z = Float4(1.0f) - oC.w;
// Factor = destination alpha.
1942 case BLEND_DESTALPHA:
1943 blendFactor.x = pixel.w;
1944 blendFactor.y = pixel.w;
1945 blendFactor.z = pixel.w;
// Factor = 1 - destination alpha.
1947 case BLEND_INVDESTALPHA:
1948 blendFactor.x = Float4(1.0f) - pixel.w;
1949 blendFactor.y = Float4(1.0f) - pixel.w;
1950 blendFactor.z = Float4(1.0f) - pixel.w;
// Factor = min(source alpha, 1 - destination alpha), computed in .x and then
// copied to .y and .z.
1952 case BLEND_SRCALPHASAT:
1953 blendFactor.x = Float4(1.0f) - pixel.w;
1954 blendFactor.x = Min(blendFactor.x, oC.w);
1955 blendFactor.y = blendFactor.x;
1956 blendFactor.z = blendFactor.x;
// Factor = blend constant, read per channel from DrawData.
1958 case BLEND_CONSTANT:
1959 blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[0]));
1960 blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[1]));
1961 blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[2]));
// Factor read from the invBlendConstant4F array in DrawData.
1963 case BLEND_INVCONSTANT:
1964 blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
1965 blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
1966 blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
// Floating-point alpha blend factor. Fills blendFactor.w according to
// blendFactorAlphaActive, using the source alpha oC.w, the destination alpha
// pixel.w, or element [3] of the blend constants in DrawData. Only the w
// member is written.
// NOTE(review): every line of this listing carries its original file line
// number as a prefix and the numbering has gaps (1976-1983, 1985, 1988-1989,
// ...), so braces, 'break' statements and several 'case' labels are not
// visible in this view.
// NOTE(review): 'blendFactor' is taken by const reference yet its w member is
// assigned below; this presumably relies on the member type's assignment
// being callable on a const object - confirm in the Vector4f/Float4 headers.
1973 void PixelRoutine::blendFactorAlpha(const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorAlphaActive)
1975 switch(blendFactorAlphaActive)
// Case label not visible in this listing. Factor = source alpha.
1984 blendFactor.w = oC.w;
// For alpha, the colour-named factors use the alpha channel too.
1986 case BLEND_INVSOURCE:
1987 blendFactor.w = Float4(1.0f) - oC.w;
// Case label not visible in this listing. Factor = destination alpha.
1990 blendFactor.w = pixel.w;
// Case label not visible in this listing. Factor = 1 - destination alpha.
1993 blendFactor.w = Float4(1.0f) - pixel.w;
1995 case BLEND_SOURCEALPHA:
1996 blendFactor.w = oC.w;
1998 case BLEND_INVSOURCEALPHA:
1999 blendFactor.w = Float4(1.0f) - oC.w;
2001 case BLEND_DESTALPHA:
2002 blendFactor.w = pixel.w;
2004 case BLEND_INVDESTALPHA:
2005 blendFactor.w = Float4(1.0f) - pixel.w;
// The alpha factor for SRCALPHASAT is the constant 1.
2007 case BLEND_SRCALPHASAT:
2008 blendFactor.w = Float4(1.0f);
2010 case BLEND_CONSTANT:
2011 blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
2013 case BLEND_INVCONSTANT:
2014 blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
// Floating-point alpha blending. When blending is active, the function:
//  1. loads the destination colour into 'pixel', one Float4 per channel, from
//     two consecutive rows of the colour buffer;
//  2. optionally converts its x/y/z channels from sRGB to linear;
//  3. scales the source colour oC and 'pixel' by their blend factors;
//  4. combines them according to state.blendOperation (x/y/z) and
//     state.blendOperationAlpha (w), leaving the result in oC.
// NOTE(review): every line of this listing carries its original file line
// number as a prefix and the numbering has gaps, so braces, 'break'/'return'
// statements, several 'case' labels and short statements are not visible.
// This includes the declarations of 'pixel' and 'one', the statements that
// set 'buffer', and most bodies of the two blend-operation switches.
2021 void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x)
// Nothing to do without blending; the statement under this 'if' is not shown
// (presumably an early return).
2023 if(!state.alphaBlendActive)
2028 Pointer<Byte> buffer;
// 'one' is the value used below for channels the target format does not
// store. The float-format branch body is not visible; for non-normalized
// integer formats it is the bit pattern 0xFFFFFFFF (unsigned component 0) or
// 0x7FFFFFFF, reinterpreted as Float4.
2036 if(Surface::isFloatFormat(state.targetFormat[index]))
2040 else if(Surface::isNonNormalizedInteger(state.targetFormat[index]))
2042 one = As<Float4>(Surface::isUnsignedComponent(state.targetFormat[index], 0) ? Int4(0xFFFFFFFF) : Int4(0x7FFFFFFF));
2045 switch(state.targetFormat[index])
// Case labels not visible in this listing. One 32-bit channel per pixel: two
// values from the current row and two from the next row fill the four lanes
// of pixel.x; the other channels are set to 'one'.
2052 pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0);
2053 pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4);
2054 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2056 pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0);
2057 pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4);
2058 pixel.y = pixel.z = pixel.w = one;
// Two 32-bit channels per pixel: one Float4 (two pixels) per row.
2060 case FORMAT_G32R32I:
2061 case FORMAT_G32R32UI:
2062 case FORMAT_G32R32F:
2064 pixel.x = *Pointer<Float4>(buffer + 8 * x, 16);
2065 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2066 pixel.y = *Pointer<Float4>(buffer + 8 * x, 16);
// De-interleave the two rows. Presumably 0x88 gathers the even elements and
// 0xDD the odd elements of both operands - confirm ShuffleLowHigh semantics.
// pixel.z is read here; its assignment (and the later copy into pixel.y) is
// on lines that are not visible in this listing.
2068 pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x88);
2069 pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0xDD);
2071 pixel.z = pixel.w = one;
// Four 32-bit channels per pixel: one Float4 per pixel, two pixels per row,
// transposed into one Float4 per channel.
2073 case FORMAT_X32B32G32R32F:
2074 case FORMAT_A32B32G32R32F:
2075 case FORMAT_A32B32G32R32I:
2076 case FORMAT_A32B32G32R32UI:
2078 pixel.x = *Pointer<Float4>(buffer + 16 * x, 16);
2079 pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2080 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2081 pixel.z = *Pointer<Float4>(buffer + 16 * x, 16);
2082 pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2083 transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
// The X format carries no alpha: use 1.0 instead of the stored value.
2084 if(state.targetFormat[index] == FORMAT_X32B32G32R32F)
2086 pixel.w = Float4(1.0f);
// Blend in linear space: convert the destination colour (not alpha).
2093 if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
2095 sRGBtoLinear(pixel.x);
2096 sRGBtoLinear(pixel.y);
2097 sRGBtoLinear(pixel.z);
2100 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
2101 Vector4f sourceFactor;
2102 Vector4f destFactor;
2104 blendFactor(sourceFactor, oC, pixel, state.sourceBlendFactor);
2105 blendFactor(destFactor, oC, pixel, state.destBlendFactor);
// The multiplication is skipped for the ONE and ZERO factors.
2107 if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
2109 oC.x *= sourceFactor.x;
2110 oC.y *= sourceFactor.y;
2111 oC.z *= sourceFactor.z;
2114 if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
2116 pixel.x *= destFactor.x;
2117 pixel.y *= destFactor.y;
2118 pixel.z *= destFactor.z;
// Colour combine. Several labels and bodies are not visible in this listing;
// the visible ones are shown as-is.
2121 switch(state.blendOperation)
// Result: pixel - oC
2133 case BLENDOP_INVSUB:
2134 oC.x = pixel.x - oC.x;
2135 oC.y = pixel.y - oC.y;
2136 oC.z = pixel.z - oC.z;
// Case label not visible in this listing. Result: min(oC, pixel)
2139 oC.x = Min(oC.x, pixel.x);
2140 oC.y = Min(oC.y, pixel.y);
2141 oC.z = Min(oC.z, pixel.z);
// Case label not visible in this listing. Result: max(oC, pixel)
2144 oC.x = Max(oC.x, pixel.x);
2145 oC.y = Max(oC.y, pixel.y);
2146 oC.z = Max(oC.z, pixel.z);
// Body not visible in this listing.
2148 case BLENDOP_SOURCE:
// Case label not visible in this listing. Result: zero.
2157 oC.x = Float4(0.0f);
2158 oC.y = Float4(0.0f);
2159 oC.z = Float4(0.0f);
// Alpha channel: same scheme with its own factors and operation.
2165 blendFactorAlpha(sourceFactor, oC, pixel, state.sourceBlendFactorAlpha);
2166 blendFactorAlpha(destFactor, oC, pixel, state.destBlendFactorAlpha);
2168 if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
2170 oC.w *= sourceFactor.w;
2173 if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
2175 pixel.w *= destFactor.w;
2178 switch(state.blendOperationAlpha)
// Body not visible in this listing.
2186 case BLENDOP_INVSUB:
// Case label not visible in this listing. Result: min(oC.w, pixel.w)
2191 oC.w = Min(oC.w, pixel.w);
// Case label not visible in this listing. Result: max(oC.w, pixel.w)
2194 oC.w = Max(oC.w, pixel.w);
// Body not visible in this listing.
2196 case BLENDOP_SOURCE:
// Case label not visible in this listing. Result: zero.
2203 oC.w = Float4(0.0f);
2210 void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask)
2212 switch(state.targetFormat[index])
2222 case FORMAT_G32R32F:
2223 case FORMAT_G32R32I:
2224 case FORMAT_G32R32UI:
2225 case FORMAT_G16R16I:
2226 case FORMAT_G16R16UI:
2230 oC.x = UnpackLow(oC.x, oC.y);
2231 oC.z = UnpackHigh(oC.z, oC.y);
2234 case FORMAT_X32B32G32R32F:
2235 case FORMAT_A32B32G32R32F:
2236 case FORMAT_A32B32G32R32I:
2237 case FORMAT_A32B32G32R32UI:
2238 case FORMAT_A16B16G16R16I:
2239 case FORMAT_A16B16G16R16UI:
2240 case FORMAT_A8B8G8R8I:
2241 case FORMAT_A8B8G8R8UI:
2242 transpose4x4(oC.x, oC.y, oC.z, oC.w);
2248 int rgbaWriteMask = state.colorWriteActive(index);
2250 Int xMask; // Combination of all masks
2252 if(state.depthTestActive)
2261 if(state.stencilActive)
2266 Pointer<Byte> buffer;
2269 switch(state.targetFormat[index])
2274 if(rgbaWriteMask & 0x00000001)
2276 buffer = cBuffer + 4 * x;
2279 value.x = *Pointer<Float>(buffer + 0);
2280 value.y = *Pointer<Float>(buffer + 4);
2282 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2285 value.z = *Pointer<Float>(buffer + 0);
2286 value.w = *Pointer<Float>(buffer + 4);
2288 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + xMask * 16, 16));
2289 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16));
2290 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2293 *Pointer<Float>(buffer + 0) = oC.x.z;
2294 *Pointer<Float>(buffer + 4) = oC.x.w;
2296 buffer -= *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2299 *Pointer<Float>(buffer + 0) = oC.x.x;
2300 *Pointer<Float>(buffer + 4) = oC.x.y;
2305 if(rgbaWriteMask & 0x00000001)
2307 buffer = cBuffer + 2 * x;
2310 xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));
2312 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2314 xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1));
2315 value = As<Float4>(Int4(xyzw));
2317 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2318 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2319 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2321 if(state.targetFormat[index] == FORMAT_R16I)
2323 Float component = oC.x.z;
2324 *Pointer<Short>(buffer + 0) = Short(As<Int>(component));
2326 *Pointer<Short>(buffer + 2) = Short(As<Int>(component));
2328 buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2331 *Pointer<Short>(buffer + 0) = Short(As<Int>(component));
2333 *Pointer<Short>(buffer + 2) = Short(As<Int>(component));
2335 else // FORMAT_R16UI
2337 Float component = oC.x.z;
2338 *Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2340 *Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2342 buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2345 *Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2347 *Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2353 if(rgbaWriteMask & 0x00000001)
2355 buffer = cBuffer + x;
2357 UInt xyzw, packedCol;
2359 xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFF;
2360 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2361 xyzw |= UInt(*Pointer<UShort>(buffer)) << 16;
2363 Short4 tmpCol = Short4(As<Int4>(oC.x));
2364 if(state.targetFormat[index] == FORMAT_R8I)
2366 tmpCol = As<Short4>(Pack(tmpCol, tmpCol));
2370 tmpCol = As<Short4>(Pack(As<UShort4>(tmpCol), As<UShort4>(tmpCol)));
2372 packedCol = Extract(As<Int2>(tmpCol), 0);
2374 packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) |
2375 (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask));
2377 *Pointer<UShort>(buffer) = UShort(packedCol >> 16);
2378 buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2379 *Pointer<UShort>(buffer) = UShort(packedCol);
2382 case FORMAT_G32R32F:
2383 case FORMAT_G32R32I:
2384 case FORMAT_G32R32UI:
2385 buffer = cBuffer + 8 * x;
2387 value = *Pointer<Float4>(buffer);
2389 if((rgbaWriteMask & 0x00000003) != 0x00000003)
2391 Float4 masked = value;
2392 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2393 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD01X[rgbaWriteMask & 0x3][0])));
2394 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2397 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16));
2398 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16));
2399 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2400 *Pointer<Float4>(buffer) = oC.x;
2402 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2404 value = *Pointer<Float4>(buffer);
2406 if((rgbaWriteMask & 0x00000003) != 0x00000003)
2411 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2412 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD01X[rgbaWriteMask & 0x3][0])));
2413 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2416 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16));
2417 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16));
2418 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2419 *Pointer<Float4>(buffer) = oC.y;
2421 case FORMAT_G16R16I:
2422 case FORMAT_G16R16UI:
2423 if((rgbaWriteMask & 0x00000003) != 0x0)
2425 buffer = cBuffer + 4 * x;
2428 UShort4 packedCol = UShort4(As<Int4>(oC.x));
2429 UShort4 value = *Pointer<UShort4>(buffer);
2430 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2431 if((rgbaWriteMask & 0x3) != 0x3)
2433 Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
2434 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2435 mergedMask &= rgbaMask;
2437 *Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2439 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2441 packedCol = UShort4(As<Int4>(oC.y));
2442 value = *Pointer<UShort4>(buffer);
2443 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2444 if((rgbaWriteMask & 0x3) != 0x3)
2446 mergedMask &= rgbaMask;
2448 *Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2453 if((rgbaWriteMask & 0x00000003) != 0x0)
2455 buffer = cBuffer + 2 * x;
2457 Int2 xyzw, packedCol;
2459 xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0);
2460 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2461 xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);
2463 if(state.targetFormat[index] == FORMAT_G8R8I)
2465 packedCol = As<Int2>(Pack(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
2469 packedCol = As<Int2>(Pack(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y))));
2472 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
2473 if((rgbaWriteMask & 0x3) != 0x3)
2475 Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
2476 UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2477 mergedMask &= rgbaMask;
2480 packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask));
2482 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1));
2483 buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2484 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
2487 case FORMAT_X32B32G32R32F:
2488 case FORMAT_A32B32G32R32F:
2489 case FORMAT_A32B32G32R32I:
2490 case FORMAT_A32B32G32R32UI:
2491 buffer = cBuffer + 16 * x;
2494 value = *Pointer<Float4>(buffer, 16);
2496 if(rgbaWriteMask != 0x0000000F)
2498 Float4 masked = value;
2499 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2500 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2501 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2504 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskX0X) + xMask * 16, 16));
2505 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16));
2506 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2507 *Pointer<Float4>(buffer, 16) = oC.x;
2511 value = *Pointer<Float4>(buffer + 16, 16);
2513 if(rgbaWriteMask != 0x0000000F)
2515 Float4 masked = value;
2516 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2517 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2518 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2521 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskX1X) + xMask * 16, 16));
2522 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16));
2523 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2524 *Pointer<Float4>(buffer + 16, 16) = oC.y;
2527 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2530 value = *Pointer<Float4>(buffer, 16);
2532 if(rgbaWriteMask != 0x0000000F)
2534 Float4 masked = value;
2535 oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2536 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2537 oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked));
2540 oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskX2X) + xMask * 16, 16));
2541 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16));
2542 oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value));
2543 *Pointer<Float4>(buffer, 16) = oC.z;
2547 value = (state.targetFormat[index] == FORMAT_X32B32G32R32F) ? Float4(1.0f) : *Pointer<Float4>(buffer + 16, 16);
2549 if(rgbaWriteMask != 0x0000000F)
2551 Float4 masked = value;
2552 oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2553 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2554 oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked));
2557 oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskX3X) + xMask * 16, 16));
2558 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16));
2559 oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value));
2560 *Pointer<Float4>(buffer + 16, 16) = oC.w;
2563 case FORMAT_A16B16G16R16I:
2564 case FORMAT_A16B16G16R16UI:
2565 if((rgbaWriteMask & 0x0000000F) != 0x0)
2567 buffer = cBuffer + 8 * x;
2570 UShort8 value = *Pointer<UShort8>(buffer);
2571 UShort8 packedCol = UShort8(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y)));
2572 UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
2573 if((rgbaWriteMask & 0xF) != 0xF)
2575 UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
2576 rgbaMask = UInt4(tmpMask, tmpMask);
2577 mergedMask &= rgbaMask;
2579 *Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2581 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2583 value = *Pointer<UShort8>(buffer);
2584 packedCol = UShort8(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w)));
2585 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
2586 if((rgbaWriteMask & 0xF) != 0xF)
2588 mergedMask &= rgbaMask;
2590 *Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2593 case FORMAT_A8B8G8R8I:
2594 case FORMAT_A8B8G8R8UI:
2595 if((rgbaWriteMask & 0x0000000F) != 0x0)
2597 UInt2 value, packedCol, mergedMask;
2599 buffer = cBuffer + 4 * x;
2601 if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
2603 packedCol = As<UInt2>(Pack(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
2607 packedCol = As<UInt2>(Pack(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y))));
2609 value = *Pointer<UInt2>(buffer, 16);
2610 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2611 if(rgbaWriteMask != 0xF)
2613 mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
2615 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
2617 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2619 if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
2621 packedCol = As<UInt2>(Pack(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
2625 packedCol = As<UInt2>(Pack(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w))));
2627 value = *Pointer<UInt2>(buffer, 16);
2628 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2629 if(rgbaWriteMask != 0xF)
2631 mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
2633 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
2641 UShort4 PixelRoutine::convertFixed16(Float4 &cf, bool saturate)
2643 return UShort4(cf * Float4(0xFFFF), saturate);
2646 void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c)
2648 c.x = As<UShort4>(c.x) >> 4;
2649 c.y = As<UShort4>(c.y) >> 4;
2650 c.z = As<UShort4>(c.z) >> 4;
2652 sRGBtoLinear12_16(c);
2655 void PixelRoutine::sRGBtoLinear12_16(Vector4s &c)
2657 Pointer<Byte> LUT = constants + OFFSET(Constants,sRGBtoLinear12_16);
2659 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2660 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2661 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2662 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2664 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2665 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2666 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2667 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2669 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2670 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2671 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2672 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2675 void PixelRoutine::linearToSRGB16_12_16(Vector4s &c)
2677 c.x = As<UShort4>(c.x) >> 4;
2678 c.y = As<UShort4>(c.y) >> 4;
2679 c.z = As<UShort4>(c.z) >> 4;
2681 linearToSRGB12_16(c);
2684 void PixelRoutine::linearToSRGB12_16(Vector4s &c)
2686 Pointer<Byte> LUT = constants + OFFSET(Constants,linearToSRGB12_16);
2688 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2689 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2690 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2691 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2693 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2694 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2695 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2696 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2698 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2699 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2700 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2701 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2704 Float4 PixelRoutine::sRGBtoLinear(const Float4 &x) // Approximates x^2.2
2706 Float4 linear = x * x;
2707 linear = linear * Float4(0.73f) + linear * x * Float4(0.27f);
2709 return Min(Max(linear, Float4(0.0f)), Float4(1.0f));
2712 bool PixelRoutine::colorUsed()
2714 return state.colorWriteMask || state.alphaTestActive() || state.shaderContainsKill;