OSDN Git Service

Eliminate unnecessary inverse masks.
[android-x86/external-swiftshader.git] / src / Shader / PixelRoutine.cpp
index c313eb4..44fafd3 100644 (file)
@@ -1,13 +1,16 @@
-// SwiftShader Software Renderer
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
 //
-// Copyright(c) 2005-2013 TransGaming Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
 //
-// All rights reserved. No part of this software may be copied, distributed, transmitted,
-// transcribed, stored in a retrieval system, translated into any human or computer
-// language by any means, or disclosed to third parties without the explicit written
-// agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express
-// or implied, including but not limited to any patent rights, are granted to you.
+//    http://www.apache.org/licenses/LICENSE-2.0
 //
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 
 #include "PixelRoutine.hpp"
 
@@ -15,7 +18,6 @@
 #include "QuadRasterizer.hpp"
 #include "Surface.hpp"
 #include "Primitive.hpp"
-#include "CPUID.hpp"
 #include "SamplerCore.hpp"
 #include "Constants.hpp"
 #include "Debug.hpp"
@@ -27,13 +29,11 @@ namespace sw
        extern bool exactColorRounding;
        extern bool forceClearRegisters;
 
-       PixelRoutine::Registers::Registers(const PixelShader *shader) :
-               QuadRasterizer::Registers(),
-               v(shader && shader->dynamicallyIndexedInput)
+       PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader) : QuadRasterizer(state, shader), v(shader && shader->dynamicallyIndexedInput)
        {
                if(!shader || shader->getVersion() < 0x0200 || forceClearRegisters)
                {
-                       for(int i = 0; i < 10; i++)
+                       for(int i = 0; i < MAX_FRAGMENT_INPUTS; i++)
                        {
                                v[i].x = Float4(0.0f);
                                v[i].y = Float4(0.0f);
@@ -43,10 +43,6 @@ namespace sw
                }
        }
 
-       PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader) : QuadRasterizer(state, shader)
-       {
-       }
-
        PixelRoutine::~PixelRoutine()
        {
                for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
@@ -55,17 +51,15 @@ namespace sw
                }
        }
 
-       void PixelRoutine::quad(QuadRasterizer::Registers &rBase, Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
+       void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
        {
-               Registers& r = *static_cast<Registers*>(&rBase);
-
                #if PERF_PROFILE
                        Long pipeTime = Ticks();
                #endif
 
                for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
                {
-                       sampler[i] = new SamplerCore(r.constants, state.sampler[i]);
+                       sampler[i] = new SamplerCore(constants, state.sampler[i]);
                }
 
                const bool earlyDepthTest = !state.depthOverride && !state.alphaTestActive();
@@ -81,30 +75,26 @@ namespace sw
 
                for(unsigned int q = 0; q < state.multiSample; q++)
                {
-                       stencilTest(r, sBuffer, q, x, sMask[q], cMask[q]);
+                       stencilTest(sBuffer, q, x, sMask[q], cMask[q]);
                }
 
                Float4 f;
-
-               Float4 (&z)[4] = r.z;
-               Float4 &w = r.w;
-               Float4 &rhw = r.rhw;
                Float4 rhwCentroid;
 
-               Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(r.primitive + OFFSET(Primitive,xQuad), 16);
+               Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16);
 
                if(interpolateZ())
                {
                        for(unsigned int q = 0; q < state.multiSample; q++)
                        {
                                Float4 x = xxxx;
-                       
+
                                if(state.multiSample > 1)
                                {
-                                       x -= *Pointer<Float4>(r.constants + OFFSET(Constants,X) + q * sizeof(float4));
+                                       x -= *Pointer<Float4>(constants + OFFSET(Constants,X) + q * sizeof(float4));
                                }
 
-                               z[q] = interpolate(x, r.Dz[q], z[q], r.primitive + OFFSET(Primitive,z), false, false);
+                               z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive,z), false, false);
                        }
                }
 
@@ -114,7 +104,7 @@ namespace sw
                {
                        for(unsigned int q = 0; q < state.multiSample; q++)
                        {
-                               depthPass = depthPass || depthTest(r, zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
+                               depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
                        }
                }
 
@@ -124,7 +114,7 @@ namespace sw
                                Long interpTime = Ticks();
                        #endif
 
-                       Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(r.primitive + OFFSET(Primitive,yQuad), 16);
+                       Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16);
 
                        // Centroid locations
                        Float4 XXXX = Float4(0.0f);
@@ -136,9 +126,9 @@ namespace sw
 
                                for(unsigned int q = 0; q < state.multiSample; q++)
                                {
-                                       XXXX += *Pointer<Float4>(r.constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]);
-                                       YYYY += *Pointer<Float4>(r.constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]);
-                                       WWWW += *Pointer<Float4>(r.constants + OFFSET(Constants,weight) + 16 * cMask[q]);
+                                       XXXX += *Pointer<Float4>(constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]);
+                                       YYYY += *Pointer<Float4>(constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]);
+                                       WWWW += *Pointer<Float4>(constants + OFFSET(Constants,weight) + 16 * cMask[q]);
                                }
 
                                WWWW = Rcp_pp(WWWW);
@@ -151,16 +141,16 @@ namespace sw
 
                        if(interpolateW())
                        {
-                               w = interpolate(xxxx, r.Dw, rhw, r.primitive + OFFSET(Primitive,w), false, false);
-                               rhw = reciprocal(w);
+                               w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive,w), false, false);
+                               rhw = reciprocal(w, false, false, true);
 
                                if(state.centroid)
                                {
-                                       rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, r.primitive + OFFSET(Primitive,w), false, false));
+                                       rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive,w), false, false));
                                }
                        }
 
-                       for(int interpolant = 0; interpolant < 10; interpolant++)
+                       for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
                        {
                                for(int component = 0; component < 4; component++)
                                {
@@ -168,11 +158,11 @@ namespace sw
                                        {
                                                if(!state.interpolant[interpolant].centroid)
                                                {
-                                                       r.v[interpolant][component] = interpolate(xxxx, r.Dv[interpolant][component], rhw, r.primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
+                                                       v[interpolant][component] = interpolate(xxxx, Dv[interpolant][component], rhw, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
                                                }
                                                else
                                                {
-                                                       r.v[interpolant][component] = interpolateCentroid(XXXX, YYYY, rhwCentroid, r.primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
+                                                       v[interpolant][component] = interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
                                                }
                                        }
                                }
@@ -184,32 +174,32 @@ namespace sw
                                case 0:
                                        break;
                                case 1:
-                                       rcp = reciprocal(r.v[interpolant].y);
-                                       r.v[interpolant].x = r.v[interpolant].x * rcp;
+                                       rcp = reciprocal(v[interpolant].y);
+                                       v[interpolant].x = v[interpolant].x * rcp;
                                        break;
                                case 2:
-                                       rcp = reciprocal(r.v[interpolant].z);
-                                       r.v[interpolant].x = r.v[interpolant].x * rcp;
-                                       r.v[interpolant].y = r.v[interpolant].y * rcp;
+                                       rcp = reciprocal(v[interpolant].z);
+                                       v[interpolant].x = v[interpolant].x * rcp;
+                                       v[interpolant].y = v[interpolant].y * rcp;
                                        break;
                                case 3:
-                                       rcp = reciprocal(r.v[interpolant].w);
-                                       r.v[interpolant].x = r.v[interpolant].x * rcp;
-                                       r.v[interpolant].y = r.v[interpolant].y * rcp;
-                                       r.v[interpolant].z = r.v[interpolant].z * rcp;
+                                       rcp = reciprocal(v[interpolant].w);
+                                       v[interpolant].x = v[interpolant].x * rcp;
+                                       v[interpolant].y = v[interpolant].y * rcp;
+                                       v[interpolant].z = v[interpolant].z * rcp;
                                        break;
                                }
                        }
 
                        if(state.fog.component)
                        {
-                               f = interpolate(xxxx, r.Df, rhw, r.primitive + OFFSET(Primitive,f), state.fog.flat & 0x01, state.perspective);
+                               f = interpolate(xxxx, Df, rhw, primitive + OFFSET(Primitive,f), state.fog.flat & 0x01, state.perspective);
                        }
 
-                       setBuiltins(r, x, y, z, w);
+                       setBuiltins(x, y, z, w);
 
                        #if PERF_PROFILE
-                               r.cycles[PERF_INTERP] += Ticks() - interpTime;
+                               cycles[PERF_INTERP] += Ticks() - interpTime;
                        #endif
 
                        Bool alphaPass = true;
@@ -220,13 +210,13 @@ namespace sw
                                        Long shaderTime = Ticks();
                                #endif
 
-                               applyShader(r, cMask);
+                               applyShader(cMask);
 
                                #if PERF_PROFILE
-                                       r.cycles[PERF_SHADER] += Ticks() - shaderTime;
+                                       cycles[PERF_SHADER] += Ticks() - shaderTime;
                                #endif
 
-                               alphaPass = alphaTest(r, cMask);
+                               alphaPass = alphaTest(cMask);
 
                                if((shader && shader->containsKill()) || state.alphaTestActive())
                                {
@@ -244,7 +234,7 @@ namespace sw
                                {
                                        for(unsigned int q = 0; q < state.multiSample; q++)
                                        {
-                                               depthPass = depthPass || depthTest(r, zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
+                                               depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
                                        }
                                }
 
@@ -258,11 +248,11 @@ namespace sw
                                        {
                                                if(state.multiSampleMask & (1 << q))
                                                {
-                                                       writeDepth(r, zBuffer, q, x, z[q], zMask[q]);
+                                                       writeDepth(zBuffer, q, x, z[q], zMask[q]);
 
                                                        if(state.occlusionEnabled)
                                                        {
-                                                               r.occlusion += *Pointer<UInt>(r.constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q]));
+                                                               occlusion += *Pointer<UInt>(constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q]));
                                                        }
                                                }
                                        }
@@ -273,12 +263,12 @@ namespace sw
                                                        AddAtomic(Pointer<Long>(&profiler.ropOperations), 4);
                                                #endif
 
-                                               rasterOperation(r, f, cBuffer, x, sMask, zMask, cMask);
+                                               rasterOperation(f, cBuffer, x, sMask, zMask, cMask);
                                        }
                                }
 
                                #if PERF_PROFILE
-                                       r.cycles[PERF_ROP] += Ticks() - ropTime;
+                                       cycles[PERF_ROP] += Ticks() - ropTime;
                                #endif
                        }
                }
@@ -287,12 +277,12 @@ namespace sw
                {
                        if(state.multiSampleMask & (1 << q))
                        {
-                               writeStencil(r, sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
+                               writeStencil(sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
                        }
                }
 
                #if PERF_PROFILE
-                       r.cycles[PERF_PIPE] += Ticks() - pipeTime;
+                       cycles[PERF_PIPE] += Ticks() - pipeTime;
                #endif
        }
 
@@ -314,7 +304,7 @@ namespace sw
                return interpolant;
        }
 
-       void PixelRoutine::stencilTest(Registers &r, Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask)
+       void PixelRoutine::stencilTest(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask)
        {
                if(!state.stencilActive)
                {
@@ -327,83 +317,83 @@ namespace sw
 
                if(q > 0)
                {
-                       buffer += q * *Pointer<Int>(r.data + OFFSET(DrawData,stencilSliceB));
+                       buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
                }
 
-               Byte8 value = As<Byte8>(Long1(*Pointer<UInt>(buffer)));
+               Byte8 value = *Pointer<Byte8>(buffer);
                Byte8 valueCCW = value;
 
                if(!state.noStencilMask)
                {
-                       value &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[0].testMaskQ));
+                       value &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].testMaskQ));
                }
 
-               stencilTest(r, value, state.stencilCompareMode, false);
+               stencilTest(value, state.stencilCompareMode, false);
 
                if(state.twoSidedStencil)
                {
                        if(!state.noStencilMaskCCW)
                        {
-                               valueCCW &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[1].testMaskQ));
+                               valueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].testMaskQ));
                        }
 
-                       stencilTest(r, valueCCW, state.stencilCompareModeCCW, true);
+                       stencilTest(valueCCW, state.stencilCompareModeCCW, true);
 
-                       value &= *Pointer<Byte8>(r.primitive + OFFSET(Primitive,clockwiseMask));
-                       valueCCW &= *Pointer<Byte8>(r.primitive + OFFSET(Primitive,invClockwiseMask));
+                       value &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
+                       valueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
                        value |= valueCCW;
                }
 
                sMask = SignMask(value) & cMask;
        }
 
-       void PixelRoutine::stencilTest(Registers &r, Byte8 &value, StencilCompareMode stencilCompareMode, bool CCW)
+       void PixelRoutine::stencilTest(Byte8 &value, StencilCompareMode stencilCompareMode, bool CCW)
        {
                Byte8 equal;
 
                switch(stencilCompareMode)
                {
                case STENCIL_ALWAYS:
-                       value = Byte8(0xFFFFFFFFFFFFFFFF);
+                       value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
                        break;
                case STENCIL_NEVER:
-                       value = Byte8(0x0000000000000000);
+                       value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
                        break;
                case STENCIL_LESS:                      // a < b ~ b > a
                        value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-                       value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
+                       value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
                        break;
                case STENCIL_EQUAL:
-                       value = CmpEQ(value, *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
+                       value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
                        break;
                case STENCIL_NOTEQUAL:          // a != b ~ !(a == b)
-                       value = CmpEQ(value, *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
-                       value ^= Byte8(0xFFFFFFFFFFFFFFFF);
+                       value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
+                       value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
                        break;
                case STENCIL_LESSEQUAL: // a <= b ~ (b > a) || (a == b)
                        equal = value;
-                       equal = CmpEQ(equal, *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
+                       equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
                        value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-                       value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
+                       value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
                        value |= equal;
                        break;
                case STENCIL_GREATER:           // a > b
-                       equal = *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ));
+                       equal = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ));
                        value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
                        equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
                        value = equal;
                        break;
                case STENCIL_GREATEREQUAL:      // a >= b ~ !(a < b) ~ !(b > a)
                        value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-                       value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
-                       value ^= Byte8(0xFFFFFFFFFFFFFFFF);
+                       value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
+                       value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
                        break;
                default:
                        ASSERT(false);
                }
        }
 
-       Bool PixelRoutine::depthTest(Registers &r, Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask)
+       Bool PixelRoutine::depthTest(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask)
        {
                if(!state.depthTestActive)
                {
@@ -416,11 +406,11 @@ namespace sw
                {
                        if(complementaryDepthBuffer)
                        {
-                               Z = Float4(1.0f) - r.oDepth;
+                               Z = Float4(1.0f) - oDepth;
                        }
                        else
                        {
-                               Z = r.oDepth;
+                               Z = oDepth;
                        }
                }
 
@@ -430,7 +420,7 @@ namespace sw
                if(!state.quadLayoutDepthBuffer)
                {
                        buffer = zBuffer + 4 * x;
-                       pitch = *Pointer<Int>(r.data + OFFSET(DrawData,depthPitchB));
+                       pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
                }
                else
                {
@@ -439,7 +429,7 @@ namespace sw
 
                if(q > 0)
                {
-                       buffer += q * *Pointer<Int>(r.data + OFFSET(DrawData,depthSliceB));
+                       buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
                }
 
                Float4 zValue;
@@ -530,7 +520,7 @@ namespace sw
                        zMask = SignMask(zTest) & cMask;
                        break;
                }
-               
+
                if(state.stencilActive)
                {
                        zMask &= sMask;
@@ -539,7 +529,7 @@ namespace sw
                return zMask != 0;
        }
 
-       void PixelRoutine::alphaTest(Registers &r, Int &aMask, Short4 &alpha)
+       void PixelRoutine::alphaTest(Int &aMask, Short4 &alpha)
        {
                Short4 cmp;
                Short4 equal;
@@ -553,42 +543,42 @@ namespace sw
                        aMask = 0x0;
                        break;
                case ALPHA_EQUAL:
-                       cmp = CmpEQ(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)));
-                       aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
+                       cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
+                       aMask = SignMask(Pack(cmp, Short4(0x0000)));
                        break;
-               case ALPHA_NOTEQUAL:            // a != b ~ !(a == b)
-                       cmp = CmpEQ(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4((short)0xFFFF, (short)0xFFFF, (short)0xFFFF, (short)0xFFFF);   // FIXME
-                       aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
+               case ALPHA_NOTEQUAL:       // a != b ~ !(a == b)
+                       cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu);   // FIXME
+                       aMask = SignMask(Pack(cmp, Short4(0x0000)));
                        break;
-               case ALPHA_LESS:                        // a < b ~ b > a
-                       cmp = CmpGT(*Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)), alpha);
-                       aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
+               case ALPHA_LESS:           // a < b ~ b > a
+                       cmp = CmpGT(*Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)), alpha);
+                       aMask = SignMask(Pack(cmp, Short4(0x0000)));
                        break;
-               case ALPHA_GREATEREQUAL:        // a >= b ~ (a > b) || (a == b) ~ !(b > a)   // TODO: Approximate
-                       equal = CmpEQ(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)));
-                       cmp = CmpGT(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)));
+               case ALPHA_GREATEREQUAL:   // a >= b ~ (a > b) || (a == b) ~ !(b > a)   // TODO: Approximate
+                       equal = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
+                       cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
                        cmp |= equal;
-                       aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
+                       aMask = SignMask(Pack(cmp, Short4(0x0000)));
                        break;
-               case ALPHA_LESSEQUAL:           // a <= b ~ !(a > b)
-                       cmp = CmpGT(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4((short)0xFFFF, (short)0xFFFF, (short)0xFFFF, (short)0xFFFF);   // FIXME
-                       aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
+               case ALPHA_LESSEQUAL:      // a <= b ~ !(a > b)
+                       cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu);   // FIXME
+                       aMask = SignMask(Pack(cmp, Short4(0x0000)));
                        break;
-               case ALPHA_GREATER:                     // a > b
-                       cmp = CmpGT(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)));
-                       aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
+               case ALPHA_GREATER:        // a > b
+                       cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
+                       aMask = SignMask(Pack(cmp, Short4(0x0000)));
                        break;
                default:
                        ASSERT(false);
                }
        }
 
-       void PixelRoutine::alphaToCoverage(Registers &r, Int cMask[4], Float4 &alpha)
+       void PixelRoutine::alphaToCoverage(Int cMask[4], Float4 &alpha)
        {
-               Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(r.data + OFFSET(DrawData,a2c0)));
-               Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(r.data + OFFSET(DrawData,a2c1)));
-               Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(r.data + OFFSET(DrawData,a2c2)));
-               Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(r.data + OFFSET(DrawData,a2c3)));
+               Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c0)));
+               Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c1)));
+               Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c2)));
+               Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c3)));
 
                Int aMask0 = SignMask(coverage0);
                Int aMask1 = SignMask(coverage1);
@@ -601,7 +591,7 @@ namespace sw
                cMask[3] &= aMask3;
        }
 
-       void PixelRoutine::fogBlend(Registers &r, Vector4f &c0, Float4 &fog)
+       void PixelRoutine::fogBlend(Vector4f &c0, Float4 &fog)
        {
                if(!state.fogActive)
                {
@@ -610,26 +600,26 @@ namespace sw
 
                if(state.pixelFogMode != FOG_NONE)
                {
-                       pixelFog(r, fog);
+                       pixelFog(fog);
 
                        fog = Min(fog, Float4(1.0f));
                        fog = Max(fog, Float4(0.0f));
                }
 
-               c0.x -= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[0]));
-               c0.y -= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[1]));
-               c0.z -= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[2]));
+               c0.x -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
+               c0.y -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
+               c0.z -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
 
                c0.x *= fog;
                c0.y *= fog;
                c0.z *= fog;
 
-               c0.x += *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[0]));
-               c0.y += *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[1]));
-               c0.z += *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[2]));
+               c0.x += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
+               c0.y += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
+               c0.z += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
        }
 
-       void PixelRoutine::pixelFog(Registers &r, Float4 &visibility)
+       void PixelRoutine::pixelFog(Float4 &visibility)
        {
                Float4 &zw = visibility;
 
@@ -637,17 +627,17 @@ namespace sw
                {
                        if(state.wBasedFog)
                        {
-                               zw = r.rhw;
+                               zw = rhw;
                        }
                        else
                        {
                                if(complementaryDepthBuffer)
                                {
-                                       zw = Float4(1.0f) - r.z[0];
+                                       zw = Float4(1.0f) - z[0];
                                }
                                else
                                {
-                                       zw = r.z[0];
+                                       zw = z[0];
                                }
                        }
                }
@@ -657,16 +647,16 @@ namespace sw
                case FOG_NONE:
                        break;
                case FOG_LINEAR:
-                       zw *= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.scale));
-                       zw += *Pointer<Float4>(r.data + OFFSET(DrawData,fog.offset));
+                       zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.scale));
+                       zw += *Pointer<Float4>(data + OFFSET(DrawData,fog.offset));
                        break;
                case FOG_EXP:
-                       zw *= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.densityE));
+                       zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.densityE));
                        zw = exponential2(zw, true);
                        break;
                case FOG_EXP2:
                        zw *= zw;
-                       zw *= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.density2E));
+                       zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.density2E));
                        zw = exponential2(zw, true);
                        break;
                default:
@@ -674,7 +664,7 @@ namespace sw
                }
        }
 
-       void PixelRoutine::writeDepth(Registers &r, Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask)
+       void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask)
        {
                if(!state.depthWriteEnable)
                {
@@ -687,11 +677,11 @@ namespace sw
                {
                        if(complementaryDepthBuffer)
                        {
-                               Z = Float4(1.0f) - r.oDepth;
+                               Z = Float4(1.0f) - oDepth;
                        }
                        else
                        {
-                               Z = r.oDepth;
+                               Z = oDepth;
                        }
                }
 
@@ -699,18 +689,18 @@ namespace sw
                Int pitch;
 
                if(!state.quadLayoutDepthBuffer)
-               {       
+               {
                        buffer = zBuffer + 4 * x;
-                       pitch = *Pointer<Int>(r.data + OFFSET(DrawData,depthPitchB));
+                       pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
                }
                else
-               {       
+               {
                        buffer = zBuffer + 8 * x;
                }
 
                if(q > 0)
                {
-                       buffer += q * *Pointer<Int>(r.data + OFFSET(DrawData,depthSliceB));
+                       buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
                }
 
                Float4 zValue;
@@ -729,8 +719,8 @@ namespace sw
                        }
                }
 
-               Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X) + zMask * 16, 16));
-               zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16));
+               Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + zMask * 16, 16));
+               zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16));
                Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
 
                if(!state.quadLayoutDepthBuffer)
@@ -745,7 +735,7 @@ namespace sw
                }
        }
 
-       void PixelRoutine::writeStencil(Registers &r, Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask)
+       void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask)
        {
                if(!state.stencilActive)
                {
@@ -769,19 +759,19 @@ namespace sw
 
                if(q > 0)
                {
-                       buffer += q * *Pointer<Int>(r.data + OFFSET(DrawData,stencilSliceB));
+                       buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
                }
 
-               Byte8 bufferValue = As<Byte8>(Long1(*Pointer<UInt>(buffer)));
-       
+               Byte8 bufferValue = *Pointer<Byte8>(buffer);
+
                Byte8 newValue;
-               stencilOperation(r, newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask);
+               stencilOperation(newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask);
 
                if(!state.noStencilWriteMask)
                {
                        Byte8 maskedValue = bufferValue;
-                       newValue &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[0].writeMaskQ));
-                       maskedValue &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[0].invWriteMaskQ));
+                       newValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].writeMaskQ));
+                       maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].invWriteMaskQ));
                        newValue |= maskedValue;
                }
 
@@ -789,62 +779,62 @@ namespace sw
                {
                        Byte8 newValueCCW;
 
-                       stencilOperation(r, newValueCCW, bufferValue, state.stencilPassOperationCCW, state.stencilZFailOperationCCW, state.stencilFailOperationCCW, true, zMask, sMask);
+                       stencilOperation(newValueCCW, bufferValue, state.stencilPassOperationCCW, state.stencilZFailOperationCCW, state.stencilFailOperationCCW, true, zMask, sMask);
 
                        if(!state.noStencilWriteMaskCCW)
                        {
                                Byte8 maskedValue = bufferValue;
-                               newValueCCW &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[1].writeMaskQ));
-                               maskedValue &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[1].invWriteMaskQ));
+                               newValueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].writeMaskQ));
+                               maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].invWriteMaskQ));
                                newValueCCW |= maskedValue;
                        }
 
-                       newValue &= *Pointer<Byte8>(r.primitive + OFFSET(Primitive,clockwiseMask));
-                       newValueCCW &= *Pointer<Byte8>(r.primitive + OFFSET(Primitive,invClockwiseMask));
+                       newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
+                       newValueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
                        newValue |= newValueCCW;
                }
 
-               newValue &= *Pointer<Byte8>(r.constants + OFFSET(Constants,maskB4Q) + 8 * cMask);
-               bufferValue &= *Pointer<Byte8>(r.constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask);
+               newValue &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * cMask);
+               bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask);
                newValue |= bufferValue;
 
-               *Pointer<UInt>(buffer) = UInt(As<Long>(newValue));
+               *Pointer<Byte4>(buffer) = Byte4(newValue);
        }
 
-       void PixelRoutine::stencilOperation(Registers &r, Byte8 &newValue, Byte8 &bufferValue, StencilOperation stencilPassOperation, StencilOperation stencilZFailOperation, StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask)
+       void PixelRoutine::stencilOperation(Byte8 &newValue, Byte8 &bufferValue, StencilOperation stencilPassOperation, StencilOperation stencilZFailOperation, StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask)   // Fills newValue with the pass / z-fail / fail results derived from bufferValue, merged through the zMask- and sMask-indexed mask tables.
        {
                Byte8 &pass = newValue;
                Byte8 fail;
                Byte8 zFail;
 
-               stencilOperation(r, pass, bufferValue, stencilPassOperation, CCW);
+               stencilOperation(pass, bufferValue, stencilPassOperation, CCW);   // 'pass' aliases newValue, so the pass result is written straight into the output.
 
                if(stencilZFailOperation != stencilPassOperation)
                {
-                       stencilOperation(r, zFail, bufferValue, stencilZFailOperation, CCW);
+                       stencilOperation(zFail, bufferValue, stencilZFailOperation, CCW);   // Only evaluated when it differs from the pass operation.
                }
 
                if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
                {
-                       stencilOperation(r, fail, bufferValue, stencilFailOperation, CCW);
+                       stencilOperation(fail, bufferValue, stencilFailOperation, CCW);   // Evaluated when the fail operation differs from at least one of the other two.
                }
 
                if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
                {
                        if(state.depthTestActive && stencilZFailOperation != stencilPassOperation)   // zMask valid and values not the same
                        {
-                               pass &= *Pointer<Byte8>(r.constants + OFFSET(Constants,maskB4Q) + 8 * zMask);
-                               zFail &= *Pointer<Byte8>(r.constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask);
+                               pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * zMask);   // Keep 'pass' bits selected by the maskB4Q entry for zMask (8 bytes per entry).
+                               zFail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask);   // Keep 'zFail' bits selected by the invMaskB4Q entry; presumably the complement of maskB4Q - table contents not visible here, TODO confirm.
                                pass |= zFail;
                        }
 
-                       pass &= *Pointer<Byte8>(r.constants + OFFSET(Constants,maskB4Q) + 8 * sMask);
-                       fail &= *Pointer<Byte8>(r.constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask);
+                       pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * sMask);   // Same merge as above, now indexed by sMask: keep the running result where the mask entry selects it.
+                       fail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask);   // Take 'fail' bits from the invMaskB4Q entry for sMask before OR-ing into the result.
                        pass |= fail;
                }
        }
 
-       void PixelRoutine::stencilOperation(Registers &r, Byte8 &output, Byte8 &bufferValue, StencilOperation operation, bool CCW)
+       void PixelRoutine::stencilOperation(Byte8 &output, Byte8 &bufferValue, StencilOperation operation, bool CCW)
        {
                switch(operation)
                {
@@ -852,10 +842,10 @@ namespace sw
                        output = bufferValue;
                        break;
                case OPERATION_ZERO:
-                       output = Byte8(0x0000000000000000);
+                       output = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
                        break;
                case OPERATION_REPLACE:
-                       output = *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceQ));
+                       output = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceQ));
                        break;
                case OPERATION_INCRSAT:
                        output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
@@ -864,7 +854,7 @@ namespace sw
                        output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
                        break;
                case OPERATION_INVERT:
-                       output = bufferValue ^ Byte8(0xFFFFFFFFFFFFFFFF);
+                       output = bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
                        break;
                case OPERATION_INCR:
                        output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
@@ -877,7 +867,7 @@ namespace sw
                }
        }
 
-       void PixelRoutine::blendFactor(Registers &r, const Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorActive)
+       void PixelRoutine::blendFactor(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorActive)
        {
                switch(blendFactorActive)
                {
@@ -934,31 +924,31 @@ namespace sw
                        blendFactor.z = blendFactor.x;
                        break;
                case BLEND_CONSTANT:
-                       blendFactor.x = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[0]));
-                       blendFactor.y = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[1]));
-                       blendFactor.z = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[2]));
+                       blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[0]));
+                       blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[1]));
+                       blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[2]));
                        break;
                case BLEND_INVCONSTANT:
-                       blendFactor.x = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
-                       blendFactor.y = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
-                       blendFactor.z = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
+                       blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
+                       blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
+                       blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
                        break;
                case BLEND_CONSTANTALPHA:
-                       blendFactor.x = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
-                       blendFactor.y = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
-                       blendFactor.z = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
+                       blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
+                       blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
+                       blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
                        break;
                case BLEND_INVCONSTANTALPHA:
-                       blendFactor.x = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
-                       blendFactor.y = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
-                       blendFactor.z = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
+                       blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
+                       blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
+                       blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
                        break;
                default:
                        ASSERT(false);
                }
        }
-       
-       void PixelRoutine::blendFactorAlpha(Registers &r, const Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorAlphaActive)
+
+       void PixelRoutine::blendFactorAlpha(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorAlphaActive)
        {
                switch(blendFactorAlphaActive)
                {
@@ -997,18 +987,23 @@ namespace sw
                        break;
                case BLEND_CONSTANT:
                case BLEND_CONSTANTALPHA:
-                       blendFactor.w = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
+                       blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
                        break;
                case BLEND_INVCONSTANT:
                case BLEND_INVCONSTANTALPHA:
-                       blendFactor.w = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
+                       blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
                        break;
                default:
                        ASSERT(false);
                }
        }
 
-       void PixelRoutine::readPixel(Registers &r, int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &pixel)
+       bool PixelRoutine::isSRGB(int index) const   // True when state.targetFormat[index] is FORMAT_SRGB8_A8 or FORMAT_SRGB8_X8.
+       {
+               return state.targetFormat[index] == FORMAT_SRGB8_A8 || state.targetFormat[index] == FORMAT_SRGB8_X8;
+       }
+
+       void PixelRoutine::readPixel(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &pixel)
        {
                Short4 c01;
                Short4 c23;
@@ -1019,7 +1014,7 @@ namespace sw
                {
                case FORMAT_R5G6B5:
                        buffer = cBuffer + 2 * x;
-                       buffer2 = buffer + *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
+                       buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
                        c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
 
                        pixel.x = c01 & Short4(0xF800u);
@@ -1030,7 +1025,7 @@ namespace sw
                case FORMAT_A8R8G8B8:
                        buffer = cBuffer + 4 * x;
                        c01 = *Pointer<Short4>(buffer);
-                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
+                       buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
                        c23 = *Pointer<Short4>(buffer);
                        pixel.z = c01;
                        pixel.y = c01;
@@ -1047,9 +1042,10 @@ namespace sw
                        pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
                        break;
                case FORMAT_A8B8G8R8:
+               case FORMAT_SRGB8_A8:
                        buffer = cBuffer + 4 * x;
                        c01 = *Pointer<Short4>(buffer);
-                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
+                       buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
                        c23 = *Pointer<Short4>(buffer);
                        pixel.z = c01;
                        pixel.y = c01;
@@ -1068,7 +1064,7 @@ namespace sw
                case FORMAT_A8:
                        buffer = cBuffer + 1 * x;
                        pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 0);
-                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
+                       buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
                        pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 1);
                        pixel.w = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
                        pixel.x = Short4(0x0000);
@@ -1078,7 +1074,7 @@ namespace sw
                case FORMAT_X8R8G8B8:
                        buffer = cBuffer + 4 * x;
                        c01 = *Pointer<Short4>(buffer);
-                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
+                       buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
                        c23 = *Pointer<Short4>(buffer);
                        pixel.z = c01;
                        pixel.y = c01;
@@ -1094,9 +1090,10 @@ namespace sw
                        pixel.w = Short4(0xFFFFu);
                        break;
                case FORMAT_X8B8G8R8:
+               case FORMAT_SRGB8_X8:
                        buffer = cBuffer + 4 * x;
                        c01 = *Pointer<Short4>(buffer);
-                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
+                       buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
                        c23 = *Pointer<Short4>(buffer);
                        pixel.z = c01;
                        pixel.y = c01;
@@ -1130,7 +1127,7 @@ namespace sw
                        buffer = cBuffer;
                        pixel.x = *Pointer<Short4>(buffer + 8 * x);
                        pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
-                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
+                       buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
                        pixel.z = *Pointer<Short4>(buffer + 8 * x);
                        pixel.w = *Pointer<Short4>(buffer + 8 * x + 8);
                        transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
@@ -1138,7 +1135,7 @@ namespace sw
                case FORMAT_G16R16:
                        buffer = cBuffer;
                        pixel.x = *Pointer<Short4>(buffer + 4 * x);
-                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
+                       buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
                        pixel.y = *Pointer<Short4>(buffer + 4 * x);
                        pixel.z = pixel.x;
                        pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
@@ -1153,13 +1150,13 @@ namespace sw
                        ASSERT(false);
                }
 
-               if(postBlendSRGB && state.writeSRGB)
+               if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
                {
-                       sRGBtoLinear16_12_16(r, pixel);
+                       sRGBtoLinear16_12_16(pixel);
                }
        }
 
-       void PixelRoutine::alphaBlend(Registers &r, int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
+       void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
        {
                if(!state.alphaBlendActive)
                {
@@ -1167,14 +1164,14 @@ namespace sw
                }
 
                Vector4s pixel;
-               readPixel(r, index, cBuffer, x, pixel);
+               readPixel(index, cBuffer, x, pixel);
 
                // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
                Vector4s sourceFactor;
                Vector4s destFactor;
 
-               blendFactor(r, sourceFactor, current, pixel, state.sourceBlendFactor);
-               blendFactor(r, destFactor, current, pixel, state.destBlendFactor);
+               blendFactor(sourceFactor, current, pixel, state.sourceBlendFactor);
+               blendFactor(destFactor, current, pixel, state.destBlendFactor);
 
                if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
                {
@@ -1182,7 +1179,7 @@ namespace sw
                        current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
                        current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
                }
-       
+
                if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
                {
                        pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
@@ -1226,22 +1223,22 @@ namespace sw
                        current.z = pixel.z;
                        break;
                case BLENDOP_NULL:
-                       current.x = Short4(0x0000, 0x0000, 0x0000, 0x0000);
-                       current.y = Short4(0x0000, 0x0000, 0x0000, 0x0000);
-                       current.z = Short4(0x0000, 0x0000, 0x0000, 0x0000);
+                       current.x = Short4(0x0000);
+                       current.y = Short4(0x0000);
+                       current.z = Short4(0x0000);
                        break;
                default:
                        ASSERT(false);
                }
 
-               blendFactorAlpha(r, sourceFactor, current, pixel, state.sourceBlendFactorAlpha);
-               blendFactorAlpha(r, destFactor, current, pixel, state.destBlendFactorAlpha);
+               blendFactorAlpha(sourceFactor, current, pixel, state.sourceBlendFactorAlpha);
+               blendFactorAlpha(destFactor, current, pixel, state.destBlendFactorAlpha);
 
                if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
                {
                        current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
                }
-       
+
                if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
                {
                        pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
@@ -1271,14 +1268,14 @@ namespace sw
                        current.w = pixel.w;
                        break;
                case BLENDOP_NULL:
-                       current.w = Short4(0x0000, 0x0000, 0x0000, 0x0000);
+                       current.w = Short4(0x0000);
                        break;
                default:
                        ASSERT(false);
                }
        }
 
-       void PixelRoutine::logicOperation(Registers &r, int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
+       void PixelRoutine::logicOperation(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
        {
                if(state.logicalOperation == LOGICALOP_COPY)
                {
@@ -1286,19 +1283,19 @@ namespace sw
                }
 
                Vector4s pixel;
-               readPixel(r, index, cBuffer, x, pixel);
+               readPixel(index, cBuffer, x, pixel);
 
                switch(state.logicalOperation)
                {
                case LOGICALOP_CLEAR:
-                       current.x = 0;
-                       current.y = 0;
-                       current.z = 0;
+                       current.x = UShort4(0);
+                       current.y = UShort4(0);
+                       current.z = UShort4(0);
                        break;
                case LOGICALOP_SET:
-                       current.x = 0xFFFFu;
-                       current.y = 0xFFFFu;
-                       current.z = 0xFFFFu;
+                       current.x = UShort4(0xFFFFu);
+                       current.y = UShort4(0xFFFFu);
+                       current.z = UShort4(0xFFFFu);
                        break;
                case LOGICALOP_COPY:
                        ASSERT(false);   // Optimized out
@@ -1373,11 +1370,11 @@ namespace sw
                }
        }
 
-       void PixelRoutine::writeColor(Registers &r, int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &current, Int &sMask, Int &zMask, Int &cMask)
+       void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &current, Int &sMask, Int &zMask, Int &cMask)
        {
-               if(postBlendSRGB && state.writeSRGB)
+               if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
                {
-                       linearToSRGB16_12_16(r, current);
+                       linearToSRGB16_12_16(current);
                }
 
                if(exactColorRounding)
@@ -1395,17 +1392,22 @@ namespace sw
                        case FORMAT_X8B8G8R8:
                        case FORMAT_A8R8G8B8:
                        case FORMAT_A8B8G8R8:
-                               current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
-                               current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
-                               current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
-                               current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
+                       case FORMAT_SRGB8_X8:
+                       case FORMAT_SRGB8_A8:
+                       case FORMAT_G8R8:
+                       case FORMAT_R8:
+                               current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080);
+                               current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080);
+                               current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080);
+                               current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080);
+                               break;
+                       default:
                                break;
                        }
                }
 
                int rgbaWriteMask = state.colorWriteActive(index);
-               int bgraWriteMask = rgbaWriteMask & 0x0000000A | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
-               int brgaWriteMask = rgbaWriteMask & 0x00000008 | (rgbaWriteMask & 0x00000001) << 1 | (rgbaWriteMask & 0x00000002) << 1 | (rgbaWriteMask & 0x00000004) >> 2;
+               int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
 
                switch(state.targetFormat[index])
                {
@@ -1475,7 +1477,9 @@ namespace sw
                        break;
                case FORMAT_X8B8G8R8:
                case FORMAT_A8B8G8R8:
-                       if(state.targetFormat[index] == FORMAT_X8B8G8R8 || rgbaWriteMask == 0x7)
+               case FORMAT_SRGB8_X8:
+               case FORMAT_SRGB8_A8:
+                       if(state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8 || rgbaWriteMask == 0x7)
                        {
                                current.x = As<Short4>(As<UShort4>(current.x) >> 8);
                                current.y = As<Short4>(As<UShort4>(current.y) >> 8);
@@ -1509,6 +1513,17 @@ namespace sw
                                current.y = As<Short4>(UnpackHigh(current.y, current.x));
                        }
                        break;
+               case FORMAT_G8R8:
+                       current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+                       current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+                       current.x = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.x)));
+                       current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
+                       current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y));
+                       break;
+               case FORMAT_R8:
+                       current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+                       current.x = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.x)));
+                       break;
                case FORMAT_A8:
                        current.w = As<Short4>(As<UShort4>(current.w) >> 8);
                        current.w = As<Short4>(Pack(As<UShort4>(current.w), As<UShort4>(current.w)));
@@ -1557,17 +1572,17 @@ namespace sw
                                if((bgraWriteMask & 0x00000007) != 0x00000007)
                                {
                                        Int masked = value;
-                                       c01 &= *Pointer<Int>(r.constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
-                                       masked &= *Pointer<Int>(r.constants + OFFSET(Constants,invMask565Q[bgraWriteMask & 0x7][0]));
+                                       c01 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
+                                       masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0]));
                                        c01 |= masked;
                                }
 
-                               c01 &= *Pointer<Int>(r.constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
-                               value &= *Pointer<Int>(r.constants + OFFSET(Constants,invMaskW4Q[0][0]) + xMask * 8);
+                               c01 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
+                               value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][0]) + xMask * 8);
                                c01 |= value;
                                *Pointer<Int>(buffer) = c01;
 
-                               buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
+                               buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
                                value = *Pointer<Int>(buffer);
 
                                Int c23 = Extract(As<Int2>(current.x), 1);
@@ -1575,13 +1590,13 @@ namespace sw
                                if((bgraWriteMask & 0x00000007) != 0x00000007)
                                {
                                        Int masked = value;
-                                       c23 &= *Pointer<Int>(r.constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
-                                       masked &= *Pointer<Int>(r.constants + OFFSET(Constants,invMask565Q[bgraWriteMask & 0x7][0]));
+                                       c23 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
+                                       masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0]));
                                        c23 |= masked;
                                }
 
-                               c23 &= *Pointer<Int>(r.constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
-                               value &= *Pointer<Int>(r.constants + OFFSET(Constants,invMaskW4Q[0][2]) + xMask * 8);
+                               c23 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
+                               value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][2]) + xMask * 8);
                                c23 |= value;
                                *Pointer<Int>(buffer) = c23;
                        }
@@ -1596,13 +1611,13 @@ namespace sw
                //          (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
                //      {
                //              Short4 masked = value;
-               //              c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
-               //              masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
+               //              c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
+               //              masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
                //              c01 |= masked;
                //      }
 
-               //      c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD01Q) + xMask * 8);
-               //      value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
+               //      c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
+               //      value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
                //      c01 |= value;
                //      *Pointer<Short4>(cBuffer + 8 * x + 0) = c01;
 
@@ -1613,13 +1628,13 @@ namespace sw
                //          (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
                //      {
                //              Short4 masked = value;
-               //              c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
-               //              masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
+               //              c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
+               //              masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
                //              c23 |= masked;
                //      }
 
-               //      c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD23Q) + xMask * 8);
-               //      value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
+               //      c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
+               //      value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
                //      c23 |= value;
                //      *Pointer<Short4>(cBuffer + 8 * x + 8) = c23;
                        break;
@@ -1634,17 +1649,17 @@ namespace sw
                                        (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
                                {
                                        Short4 masked = value;
-                                       c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
-                                       masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
+                                       c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
+                                       masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
                                        c01 |= masked;
                                }
 
-                               c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD01Q) + xMask * 8);
-                               value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
+                               c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
+                               value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
                                c01 |= value;
                                *Pointer<Short4>(buffer) = c01;
 
-                               buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
+                               buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
                                value = *Pointer<Short4>(buffer);
 
                                if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
@@ -1652,69 +1667,114 @@ namespace sw
                                        (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
                                {
                                        Short4 masked = value;
-                                       c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
-                                       masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
+                                       c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
+                                       masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
                                        c23 |= masked;
                                }
 
-                               c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD23Q) + xMask * 8);
-                               value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
+                               c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
+                               value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
                                c23 |= value;
                                *Pointer<Short4>(buffer) = c23;
                        }
                        break;
                case FORMAT_A8B8G8R8:
                case FORMAT_X8B8G8R8:   // FIXME: Don't touch alpha?
+               case FORMAT_SRGB8_X8:
+               case FORMAT_SRGB8_A8:
                        {
                                Pointer<Byte> buffer = cBuffer + x * 4;
                                Short4 value = *Pointer<Short4>(buffer);
 
-                               if((state.targetFormat[index] == FORMAT_A8B8G8R8 && rgbaWriteMask != 0x0000000F) ||
-                                  ((state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x00000007) &&
-                                       (state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x0000000F)))   // FIXME: Need for masking when XBGR && Fh?
+                               bool masked = (((state.targetFormat[index] == FORMAT_A8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_A8) && rgbaWriteMask != 0x0000000F) ||
+                                             (((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x00000007) &&
+                                              ((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x0000000F))); // FIXME: Need for masking when XBGR && Fh?
+
+                               if(masked)
                                {
                                        Short4 masked = value;
-                                       c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
-                                       masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
+                                       c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
+                                       masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
                                        c01 |= masked;
                                }
 
-                               c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD01Q) + xMask * 8);
-                               value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
+                               c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
+                               value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
                                c01 |= value;
                                *Pointer<Short4>(buffer) = c01;
 
-                               buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
+                               buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
                                value = *Pointer<Short4>(buffer);
 
-                               if((state.targetFormat[index] == FORMAT_A8B8G8R8 && rgbaWriteMask != 0x0000000F) ||
-                                  ((state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x00000007) &&
-                                       (state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x0000000F)))   // FIXME: Need for masking when XBGR && Fh?
+                               if(masked)
                                {
                                        Short4 masked = value;
-                                       c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
-                                       masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
+                                       c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
+                                       masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
                                        c23 |= masked;
                                }
 
-                               c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD23Q) + xMask * 8);
-                               value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
+                               c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
+                               value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
                                c23 |= value;
                                *Pointer<Short4>(buffer) = c23;
                        }
                        break;
+               case FORMAT_G8R8:
+                       if((rgbaWriteMask & 0x00000003) != 0x0)
+                       {
+                               Pointer<Byte> buffer = cBuffer + 2 * x;
+                               Int2 value;
+                               value = Insert(value, *Pointer<Int>(buffer), 0);
+                               Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+                               value = Insert(value, *Pointer<Int>(buffer + pitch), 1);
+
+                               Int2 packedCol = As<Int2>(current.x);
+
+                               UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
+                               if((rgbaWriteMask & 0x3) != 0x3)
+                               {
+                                       Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
+                                       UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
+                                       mergedMask &= rgbaMask;
+                               }
+
+                               packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask));
+
+                               *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
+                               *Pointer<UInt>(buffer + pitch) = As<UInt>(Extract(packedCol, 1));
+                       }
+                       break;
+               case FORMAT_R8:
+                       if(rgbaWriteMask & 0x00000001)
+                       {
+                               Pointer<Byte> buffer = cBuffer + 1 * x;
+                               Short4 value;
+                               value = Insert(value, *Pointer<Short>(buffer), 0);
+                               Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+                               value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
+                               value = UnpackLow(As<Byte8>(value), As<Byte8>(value));
+
+                               current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask);
+                               value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask);
+                               current.x |= value;
+
+                               *Pointer<Short>(buffer) = Extract(current.x, 0);
+                               *Pointer<Short>(buffer + pitch) = Extract(current.x, 1);
+                       }
+                       break;
                case FORMAT_A8:
                        if(rgbaWriteMask & 0x00000008)
                        {
                                Pointer<Byte> buffer = cBuffer + 1 * x;
                                Short4 value;
-                               Insert(value, *Pointer<Short>(buffer), 0);
-                               Int pitch = *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
-                               Insert(value, *Pointer<Short>(buffer + pitch), 1);
+                               value = Insert(value, *Pointer<Short>(buffer), 0);
+                               Int pitch = *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+                               value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
                                value = UnpackLow(As<Byte8>(value), As<Byte8>(value));
 
-                               current.w &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q) + 8 * xMask);
-                               value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q) + 8 * xMask);
+                               current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q) + 8 * xMask);
+                               value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q) + 8 * xMask);
                                current.w |= value;
 
                                *Pointer<Short>(buffer) = Extract(current.w, 0);
@@ -1730,30 +1790,30 @@ namespace sw
                                if((rgbaWriteMask & 0x00000003) != 0x00000003)
                                {
                                        Short4 masked = value;
-                                       current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
-                                       masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0]));
+                                       current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
+                                       masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
                                        current.x |= masked;
                                }
 
-                               current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD01Q) + xMask * 8);
-                               value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
+                               current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
+                               value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
                                current.x |= value;
                                *Pointer<Short4>(buffer) = current.x;
 
-                               buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
+                               buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
 
                                value = *Pointer<Short4>(buffer);
 
                                if((rgbaWriteMask & 0x00000003) != 0x00000003)
                                {
                                        Short4 masked = value;
-                                       current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
-                                       masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0]));
+                                       current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
+                                       masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
                                        current.y |= masked;
                                }
 
-                               current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD23Q) + xMask * 8);
-                               value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
+                               current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
+                               value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
                                current.y |= value;
                                *Pointer<Short4>(buffer) = current.y;
                        }
@@ -1768,13 +1828,13 @@ namespace sw
                                        if(rgbaWriteMask != 0x0000000F)
                                        {
                                                Short4 masked = value;
-                                               current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
-                                               masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
+                                               current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
+                                               masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
                                                current.x |= masked;
                                        }
 
-                                       current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
-                                       value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
+                                       current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
+                                       value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
                                        current.x |= value;
                                        *Pointer<Short4>(buffer) = current.x;
                                }
@@ -1785,18 +1845,18 @@ namespace sw
                                        if(rgbaWriteMask != 0x0000000F)
                                        {
                                                Short4 masked = value;
-                                               current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
-                                               masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
+                                               current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
+                                               masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
                                                current.y |= masked;
                                        }
 
-                                       current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
-                                       value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
+                                       current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
+                                       value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
                                        current.y |= value;
                                        *Pointer<Short4>(buffer + 8) = current.y;
                                }
 
-                               buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
+                               buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
 
                                {
                                        Short4 value = *Pointer<Short4>(buffer);
@@ -1804,13 +1864,13 @@ namespace sw
                                        if(rgbaWriteMask != 0x0000000F)
                                        {
                                                Short4 masked = value;
-                                               current.z &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
-                                               masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
+                                               current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
+                                               masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
                                                current.z |= masked;
                                        }
 
-                                       current.z &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
-                                       value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
+                                       current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
+                                       value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
                                        current.z |= value;
                                        *Pointer<Short4>(buffer) = current.z;
                                }
@@ -1821,13 +1881,13 @@ namespace sw
                                        if(rgbaWriteMask != 0x0000000F)
                                        {
                                                Short4 masked = value;
-                                               current.w &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
-                                               masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
+                                               current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
+                                               masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
                                                current.w |= masked;
                                        }
 
-                                       current.w &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
-                                       value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
+                                       current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
+                                       value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
                                        current.w |= value;
                                        *Pointer<Short4>(buffer + 8) = current.w;
                                }
@@ -1838,7 +1898,7 @@ namespace sw
                }
        }
 
-       void PixelRoutine::blendFactor(Registers &r, const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorActive) 
+       void PixelRoutine::blendFactor(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorActive)
        {
                switch(blendFactorActive)
                {
@@ -1895,21 +1955,21 @@ namespace sw
                        blendFactor.z = blendFactor.x;
                        break;
                case BLEND_CONSTANT:
-                       blendFactor.x = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[0]));
-                       blendFactor.y = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[1]));
-                       blendFactor.z = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[2]));
+                       blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[0]));
+                       blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[1]));
+                       blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[2]));
                        break;
                case BLEND_INVCONSTANT:
-                       blendFactor.x = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
-                       blendFactor.y = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
-                       blendFactor.z = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
+                       blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
+                       blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
+                       blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
                        break;
                default:
                        ASSERT(false);
                }
        }
 
-       void PixelRoutine::blendFactorAlpha(Registers &r, const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorAlphaActive) 
+       void PixelRoutine::blendFactorAlpha(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorAlphaActive)
        {
                switch(blendFactorAlphaActive)
                {
@@ -1947,17 +2007,17 @@ namespace sw
                        blendFactor.w = Float4(1.0f);
                        break;
                case BLEND_CONSTANT:
-                       blendFactor.w = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[3]));
+                       blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
                        break;
                case BLEND_INVCONSTANT:
-                       blendFactor.w = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
+                       blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
                        break;
                default:
                        ASSERT(false);
                }
        }
 
-       void PixelRoutine::alphaBlend(Registers &r, int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x)
+       void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x)
        {
                if(!state.alphaBlendActive)
                {
@@ -1971,47 +2031,65 @@ namespace sw
                Short4 c01;
                Short4 c23;
 
+               Float4 one;
+               if(Surface::isFloatFormat(state.targetFormat[index]))
+               {
+                       one = Float4(1.0f);
+               }
+               else if(Surface::isNonNormalizedInteger(state.targetFormat[index]))
+               {
+                       one = As<Float4>(Surface::isUnsignedComponent(state.targetFormat[index], 0) ? Int4(0xFFFFFFFF) : Int4(0x7FFFFFFF));
+               }
+
                switch(state.targetFormat[index])
                {
+               case FORMAT_R32I:
+               case FORMAT_R32UI:
                case FORMAT_R32F:
                        buffer = cBuffer;
                        // FIXME: movlps
                        pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0);
                        pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4);
-                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
+                       buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
                        // FIXME: movhps
                        pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0);
                        pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4);
-                       pixel.y = Float4(1.0f);
-                       pixel.z = Float4(1.0f);
-                       pixel.w = Float4(1.0f);
+                       pixel.y = pixel.z = pixel.w = one;
                        break;
+               case FORMAT_G32R32I:
+               case FORMAT_G32R32UI:
                case FORMAT_G32R32F:
                        buffer = cBuffer;
                        pixel.x = *Pointer<Float4>(buffer + 8 * x, 16);
-                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
+                       buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
                        pixel.y = *Pointer<Float4>(buffer + 8 * x, 16);
                        pixel.z = pixel.x;
                        pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x88);
                        pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0xDD);
                        pixel.y = pixel.z;
-                       pixel.z = Float4(1.0f);
-                       pixel.w = Float4(1.0f);
+                       pixel.z = pixel.w = one;
                        break;
+               case FORMAT_X32B32G32R32F:
                case FORMAT_A32B32G32R32F:
+               case FORMAT_A32B32G32R32I:
+               case FORMAT_A32B32G32R32UI:
                        buffer = cBuffer;
                        pixel.x = *Pointer<Float4>(buffer + 16 * x, 16);
                        pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16);
-                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
+                       buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
                        pixel.z = *Pointer<Float4>(buffer + 16 * x, 16);
                        pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16);
                        transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
+                       if(state.targetFormat[index] == FORMAT_X32B32G32R32F)
+                       {
+                               pixel.w = Float4(1.0f);
+                       }
                        break;
                default:
                        ASSERT(false);
                }
 
-               if(postBlendSRGB && state.writeSRGB)
+               if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
                {
                        sRGBtoLinear(pixel.x);
                        sRGBtoLinear(pixel.y);
@@ -2022,8 +2100,8 @@ namespace sw
                Vector4f sourceFactor;
                Vector4f destFactor;
 
-               blendFactor(r, sourceFactor, oC, pixel, state.sourceBlendFactor);
-               blendFactor(r, destFactor, oC, pixel, state.destBlendFactor);
+               blendFactor(sourceFactor, oC, pixel, state.sourceBlendFactor);
+               blendFactor(destFactor, oC, pixel, state.destBlendFactor);
 
                if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
                {
@@ -2031,7 +2109,7 @@ namespace sw
                        oC.y *= sourceFactor.y;
                        oC.z *= sourceFactor.z;
                }
-       
+
                if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
                {
                        pixel.x *= destFactor.x;
@@ -2083,14 +2161,14 @@ namespace sw
                        ASSERT(false);
                }
 
-               blendFactorAlpha(r, sourceFactor, oC, pixel, state.sourceBlendFactorAlpha);
-               blendFactorAlpha(r, destFactor, oC, pixel, state.destBlendFactorAlpha);
+               blendFactorAlpha(sourceFactor, oC, pixel, state.sourceBlendFactorAlpha);
+               blendFactorAlpha(destFactor, oC, pixel, state.destBlendFactorAlpha);
 
                if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
                {
                        oC.w *= sourceFactor.w;
                }
-       
+
                if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
                {
                        pixel.w *= destFactor.w;
@@ -2108,10 +2186,10 @@ namespace sw
                        pixel.w -= oC.w;
                        oC.w = pixel.w;
                        break;
-               case BLENDOP_MIN:       
+               case BLENDOP_MIN:
                        oC.w = Min(oC.w, pixel.w);
                        break;
-               case BLENDOP_MAX:       
+               case BLENDOP_MAX:
                        oC.w = Max(oC.w, pixel.w);
                        break;
                case BLENDOP_SOURCE:
@@ -2128,19 +2206,38 @@ namespace sw
                }
        }
 
-       void PixelRoutine::writeColor(Registers &r, int index, Pointer<Byte> &cBuffer, Int &x, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask)
+       void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask)
        {
                switch(state.targetFormat[index])
                {
                case FORMAT_R32F:
+               case FORMAT_R32I:
+               case FORMAT_R32UI:
+               case FORMAT_R16I:
+               case FORMAT_R16UI:
+               case FORMAT_R8I:
+               case FORMAT_R8UI:
                        break;
                case FORMAT_G32R32F:
+               case FORMAT_G32R32I:
+               case FORMAT_G32R32UI:
+               case FORMAT_G16R16I:
+               case FORMAT_G16R16UI:
+               case FORMAT_G8R8I:
+               case FORMAT_G8R8UI:
                        oC.z = oC.x;
                        oC.x = UnpackLow(oC.x, oC.y);
                        oC.z = UnpackHigh(oC.z, oC.y);
                        oC.y = oC.z;
                        break;
+               case FORMAT_X32B32G32R32F:
                case FORMAT_A32B32G32R32F:
+               case FORMAT_A32B32G32R32I:
+               case FORMAT_A32B32G32R32UI:
+               case FORMAT_A16B16G16R16I:
+               case FORMAT_A16B16G16R16UI:
+               case FORMAT_A8B8G8R8I:
+               case FORMAT_A8B8G8R8UI:
                        transpose4x4(oC.x, oC.y, oC.z, oC.w);
                        break;
                default:
@@ -2171,6 +2268,8 @@ namespace sw
                switch(state.targetFormat[index])
                {
                case FORMAT_R32F:
+               case FORMAT_R32I:
+               case FORMAT_R32UI:
                        if(rgbaWriteMask & 0x00000001)
                        {
                                buffer = cBuffer + 4 * x;
@@ -2179,28 +2278,109 @@ namespace sw
                                value.x = *Pointer<Float>(buffer + 0);
                                value.y = *Pointer<Float>(buffer + 4);
 
-                               buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
+                               buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
 
                                // FIXME: movhps
                                value.z = *Pointer<Float>(buffer + 0);
                                value.w = *Pointer<Float>(buffer + 4);
 
-                               oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X) + xMask * 16, 16));
-                               value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16));
+                               oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + xMask * 16, 16));
+                               value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16));
                                oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
 
                                // FIXME: movhps
                                *Pointer<Float>(buffer + 0) = oC.x.z;
                                *Pointer<Float>(buffer + 4) = oC.x.w;
 
-                               buffer -= *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
+                               buffer -= *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
 
                                // FIXME: movlps
                                *Pointer<Float>(buffer + 0) = oC.x.x;
                                *Pointer<Float>(buffer + 4) = oC.x.y;
                        }
                        break;
+               case FORMAT_R16I:
+               case FORMAT_R16UI:
+                       if(rgbaWriteMask & 0x00000001)
+                       {
+                               buffer = cBuffer + 2 * x;
+
+                               UShort4 xyzw;
+                               xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));
+
+                               buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+
+                               xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1));
+                               value = As<Float4>(Int4(xyzw));
+
+                               oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
+                               value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
+                               oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
+
+                               if(state.targetFormat[index] == FORMAT_R16I)
+                               {
+                                       Float component = oC.x.z;
+                                       *Pointer<Short>(buffer + 0) = Short(As<Int>(component));
+                                       component = oC.x.w;
+                                       *Pointer<Short>(buffer + 2) = Short(As<Int>(component));
+
+                                       buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+
+                                       component = oC.x.x;
+                                       *Pointer<Short>(buffer + 0) = Short(As<Int>(component));
+                                       component = oC.x.y;
+                                       *Pointer<Short>(buffer + 2) = Short(As<Int>(component));
+                               }
+                               else // FORMAT_R16UI
+                               {
+                                       Float component = oC.x.z;
+                                       *Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
+                                       component = oC.x.w;
+                                       *Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
+
+                                       buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+
+                                       component = oC.x.x;
+                                       *Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
+                                       component = oC.x.y;
+                                       *Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
+                               }
+                       }
+                       break;
+               case FORMAT_R8I:
+               case FORMAT_R8UI:
+                       if(rgbaWriteMask & 0x00000001)
+                       {
+                               buffer = cBuffer + x;
+
+                               UInt xyzw, packedCol;
+
+                               xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFF;
+                               buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+                               xyzw |= UInt(*Pointer<UShort>(buffer)) << 16;
+
+                               Short4 tmpCol = Short4(As<Int4>(oC.x));
+                               if(state.targetFormat[index] == FORMAT_R8I)
+                               {
+                                       tmpCol = As<Short4>(Pack(tmpCol, tmpCol));
+                               }
+                               else
+                               {
+                                       tmpCol = As<Short4>(Pack(As<UShort4>(tmpCol), As<UShort4>(tmpCol)));
+                               }
+                               packedCol = Extract(As<Int2>(tmpCol), 0);
+
+                               packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) |
+                                           (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask));
+
+                               *Pointer<UShort>(buffer) = UShort(packedCol >> 16);
+                               buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+                               *Pointer<UShort>(buffer) = UShort(packedCol);
+                       }
+                       break;
                case FORMAT_G32R32F:
+               case FORMAT_G32R32I:
+               case FORMAT_G32R32UI:
                        buffer = cBuffer + 8 * x;
 
                        value = *Pointer<Float4>(buffer);
@@ -2208,17 +2388,17 @@ namespace sw
                        if((rgbaWriteMask & 0x00000003) != 0x00000003)
                        {
                                Float4 masked = value;
-                               oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
-                               masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD01X[rgbaWriteMask & 0x3][0])));
+                               oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
+                               masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
                                oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
                        }
 
-                       oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16));
-                       value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16));
+                       oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16));
+                       value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16));
                        oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
                        *Pointer<Float4>(buffer) = oC.x;
 
-                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
+                       buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
 
                        value = *Pointer<Float4>(buffer);
 
@@ -2227,17 +2407,86 @@ namespace sw
                                Float4 masked;
 
                                masked = value;
-                               oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
-                               masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD01X[rgbaWriteMask & 0x3][0])));
+                               oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
+                               masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
                                oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
                        }
 
-                       oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16));
-                       value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16));
+                       oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16));
+                       value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16));
                        oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
                        *Pointer<Float4>(buffer) = oC.y;
                        break;
+               case FORMAT_G16R16I:
+               case FORMAT_G16R16UI:
+                       if((rgbaWriteMask & 0x00000003) != 0x0)
+                       {
+                               buffer = cBuffer + 4 * x;
+
+                               UInt2 rgbaMask;
+                               UShort4 packedCol = UShort4(As<Int4>(oC.x));
+                               UShort4 value = *Pointer<UShort4>(buffer);
+                               UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
+                               if((rgbaWriteMask & 0x3) != 0x3)
+                               {
+                                       Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
+                                       rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
+                                       mergedMask &= rgbaMask;
+                               }
+                               *Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
+
+                               buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+
+                               packedCol = UShort4(As<Int4>(oC.y));
+                               value = *Pointer<UShort4>(buffer);
+                               mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
+                               if((rgbaWriteMask & 0x3) != 0x3)
+                               {
+                                       mergedMask &= rgbaMask;
+                               }
+                               *Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
+                       }
+                       break;
+               case FORMAT_G8R8I:
+               case FORMAT_G8R8UI:
+                       if((rgbaWriteMask & 0x00000003) != 0x0)
+                       {
+                               buffer = cBuffer + 2 * x;
+
+                               Int2 xyzw, packedCol;
+
+                               xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0);
+                               buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+                               xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);
+
+                               if(state.targetFormat[index] == FORMAT_G8R8I)
+                               {
+                                       packedCol = As<Int2>(Pack(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
+                               }
+                               else
+                               {
+                                       packedCol = As<Int2>(Pack(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y))));
+                               }
+
+                               UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
+                               if((rgbaWriteMask & 0x3) != 0x3)
+                               {
+                                       Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
+                                       UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
+                                       mergedMask &= rgbaMask;
+                               }
+
+                               packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask));
+
+                               *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1));
+                               buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+                               *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
+                       }
+                       break;
+               case FORMAT_X32B32G32R32F:
                case FORMAT_A32B32G32R32F:
+               case FORMAT_A32B32G32R32I:
+               case FORMAT_A32B32G32R32UI:
                        buffer = cBuffer + 16 * x;
 
                        {
@@ -2246,13 +2495,13 @@ namespace sw
                                if(rgbaWriteMask != 0x0000000F)
                                {
                                        Float4 masked = value;
-                                       oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
-                                       masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
+                                       oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
+                                       masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
                                        oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
                                }
-                               
-                               oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskX0X) + xMask * 16, 16));
-                               value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16));
+
+                               oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskX0X) + xMask * 16, 16));
+                               value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16));
                                oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
                                *Pointer<Float4>(buffer, 16) = oC.x;
                        }
@@ -2261,20 +2510,20 @@ namespace sw
                                value = *Pointer<Float4>(buffer + 16, 16);
 
                                if(rgbaWriteMask != 0x0000000F)
-                               {       
+                               {
                                        Float4 masked = value;
-                                       oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
-                                       masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
+                                       oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
+                                       masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
                                        oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
                                }
 
-                               oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskX1X) + xMask * 16, 16));
-                               value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16));
+                               oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskX1X) + xMask * 16, 16));
+                               value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16));
                                oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
                                *Pointer<Float4>(buffer + 16, 16) = oC.y;
                        }
 
-                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
+                       buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
 
                        {
                                value = *Pointer<Float4>(buffer, 16);
@@ -2282,13 +2531,13 @@ namespace sw
                                if(rgbaWriteMask != 0x0000000F)
                                {
                                        Float4 masked = value;
-                                       oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
-                                       masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
+                                       oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
+                                       masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
                                        oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked));
                                }
 
-                               oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskX2X) + xMask * 16, 16));
-                               value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16));
+                               oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskX2X) + xMask * 16, 16));
+                               value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16));
                                oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value));
                                *Pointer<Float4>(buffer, 16) = oC.z;
                        }
@@ -2299,17 +2548,90 @@ namespace sw
                                if(rgbaWriteMask != 0x0000000F)
                                {
                                        Float4 masked = value;
-                                       oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
-                                       masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
+                                       oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
+                                       masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
                                        oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked));
                                }
 
-                               oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskX3X) + xMask * 16, 16));
-                               value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16));
+                               oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskX3X) + xMask * 16, 16));
+                               value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16));
                                oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value));
                                *Pointer<Float4>(buffer + 16, 16) = oC.w;
                        }
                        break;
+               case FORMAT_A16B16G16R16I:
+               case FORMAT_A16B16G16R16UI:
+                       if((rgbaWriteMask & 0x0000000F) != 0x0)
+                       {
+                               buffer = cBuffer + 8 * x;
+
+                               UInt4 rgbaMask;
+                               UShort8 value = *Pointer<UShort8>(buffer);
+                               UShort8 packedCol = UShort8(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y)));
+                               UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
+                               if((rgbaWriteMask & 0xF) != 0xF)
+                               {
+                                       UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
+                                       rgbaMask = UInt4(tmpMask, tmpMask);
+                                       mergedMask &= rgbaMask;
+                               }
+                               *Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
+
+                               buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+
+                               value = *Pointer<UShort8>(buffer);
+                               packedCol = UShort8(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w)));
+                               mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
+                               if((rgbaWriteMask & 0xF) != 0xF)
+                               {
+                                       mergedMask &= rgbaMask;
+                               }
+                               *Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
+                       }
+                       break;
+               case FORMAT_A8B8G8R8I:
+               case FORMAT_A8B8G8R8UI:
+                       if((rgbaWriteMask & 0x0000000F) != 0x0)
+                       {
+                               UInt2 value, packedCol, mergedMask;
+
+                               buffer = cBuffer + 4 * x;
+
+                               if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
+                               {
+                                       packedCol = As<UInt2>(Pack(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
+                               }
+                               else
+                               {
+                                       packedCol = As<UInt2>(Pack(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y))));
+                               }
+                               value = *Pointer<UInt2>(buffer, 16);
+                               mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
+                               if(rgbaWriteMask != 0xF)
+                               {
+                                       mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
+                               }
+                               *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
+
+                               buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+
+                               if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
+                               {
+                                       packedCol = As<UInt2>(Pack(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
+                               }
+                               else
+                               {
+                                       packedCol = As<UInt2>(Pack(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w))));
+                               }
+                               value = *Pointer<UInt2>(buffer, 16);
+                               mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
+                               if(rgbaWriteMask != 0xF)
+                               {
+                                       mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
+                               }
+                               *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
+                       }
+                       break;
                default:
                        ASSERT(false);
                }
@@ -2320,18 +2642,18 @@ namespace sw
                return UShort4(cf * Float4(0xFFFF), saturate);
        }
 
-       void PixelRoutine::sRGBtoLinear16_12_16(Registers &r, Vector4s &c)
+       void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c)
        {
                c.x = As<UShort4>(c.x) >> 4;
                c.y = As<UShort4>(c.y) >> 4;
                c.z = As<UShort4>(c.z) >> 4;
 
-               sRGBtoLinear12_16(r, c);
+               sRGBtoLinear12_16(c);
        }
 
-       void PixelRoutine::sRGBtoLinear12_16(Registers &r, Vector4s &c)
+       void PixelRoutine::sRGBtoLinear12_16(Vector4s &c)
        {
-               Pointer<Byte> LUT = r.constants + OFFSET(Constants,sRGBtoLinear12_16);
+               Pointer<Byte> LUT = constants + OFFSET(Constants,sRGBtoLinear12_16);
 
                c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
                c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
@@ -2349,18 +2671,18 @@ namespace sw
                c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
        }
 
-       void PixelRoutine::linearToSRGB16_12_16(Registers &r, Vector4s &c)
+       void PixelRoutine::linearToSRGB16_12_16(Vector4s &c)
        {
                c.x = As<UShort4>(c.x) >> 4;
                c.y = As<UShort4>(c.y) >> 4;
                c.z = As<UShort4>(c.z) >> 4;
 
-               linearToSRGB12_16(r, c);
+               linearToSRGB12_16(c);
        }
 
-       void PixelRoutine::linearToSRGB12_16(Registers &r, Vector4s &c)
+       void PixelRoutine::linearToSRGB12_16(Vector4s &c)
        {
-               Pointer<Byte> LUT = r.constants + OFFSET(Constants,linearToSRGB12_16);
+               Pointer<Byte> LUT = constants + OFFSET(Constants,linearToSRGB12_16);
 
                c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
                c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);