OSDN Git Service

fwidth fix
[android-x86/external-swiftshader.git] / src / Shader / PixelRoutine.cpp
index 1951b14..ac27a4b 100644 (file)
 #include "Constants.hpp"
 #include "Debug.hpp"
 
-#include <assert.h>
-
-extern bool localShaderConstants;
-
 namespace sw
 {
        extern bool complementaryDepthBuffer;
@@ -53,7 +49,7 @@ namespace sw
 
        PixelRoutine::~PixelRoutine()
        {
-               for(int i = 0; i < 16; i++)
+               for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
                {
                        delete sampler[i];
                }
@@ -65,7 +61,7 @@ namespace sw
                        Long pipeTime = Ticks();
                #endif
 
-               for(int i = 0; i < 16; i++)
+               for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
                {
                        sampler[i] = new SamplerCore(r.constants, state.sampler[i]);
                }
@@ -127,7 +123,7 @@ namespace sw
                                Long interpTime = Ticks();
                        #endif
 
-                       Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(r.primitive + OFFSET(Primitive, yQuad), 16);
+                       Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(r.primitive + OFFSET(Primitive,yQuad), 16);
 
                        // Centroid locations
                        Float4 XXXX = Float4(0.0f);
@@ -283,7 +279,7 @@ namespace sw
                                else
                                {
                                        r.current = r.diffuse;
-                                       Vector4i temp(0x0000, 0x0000, 0x0000, 0x0000);
+                                       Vector4s temp(0x0000, 0x0000, 0x0000, 0x0000);
 
                                        for(int stage = 0; stage < 8; stage++)
                                        {
@@ -292,7 +288,7 @@ namespace sw
                                                        break;
                                                }
 
-                                               Vector4i texture;
+                                               Vector4s texture;
 
                                                if(state.textureStage[stage].usesTexture)
                                                {
@@ -459,7 +455,7 @@ namespace sw
                        value &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[0].testMaskQ));
                }
 
-               stencilTest(r, value, (Context::StencilCompareMode)state.stencilCompareMode, false);
+               stencilTest(r, value, state.stencilCompareMode, false);
 
                if(state.twoSidedStencil)
                {
@@ -468,7 +464,7 @@ namespace sw
                                valueCCW &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[1].testMaskQ));
                        }
 
-                       stencilTest(r, valueCCW, (Context::StencilCompareMode)state.stencilCompareModeCCW, true);
+                       stencilTest(r, valueCCW, state.stencilCompareModeCCW, true);
 
                        value &= *Pointer<Byte8>(r.primitive + OFFSET(Primitive,clockwiseMask));
                        valueCCW &= *Pointer<Byte8>(r.primitive + OFFSET(Primitive,invClockwiseMask));
@@ -478,43 +474,43 @@ namespace sw
                sMask = SignMask(value) & cMask;
        }
 
-       void PixelRoutine::stencilTest(Registers &r, Byte8 &value, Context::StencilCompareMode stencilCompareMode, bool CCW)
+       void PixelRoutine::stencilTest(Registers &r, Byte8 &value, StencilCompareMode stencilCompareMode, bool CCW)
        {
                Byte8 equal;
 
                switch(stencilCompareMode)
                {
-               case Context::STENCIL_ALWAYS:
+               case STENCIL_ALWAYS:
                        value = Byte8(0xFFFFFFFFFFFFFFFF);
                        break;
-               case Context::STENCIL_NEVER:
+               case STENCIL_NEVER:
                        value = Byte8(0x0000000000000000);
                        break;
-               case Context::STENCIL_LESS:                     // a < b ~ b > a
+               case STENCIL_LESS:                      // a < b ~ b > a
                        value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
                        value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
                        break;
-               case Context::STENCIL_EQUAL:
+               case STENCIL_EQUAL:
                        value = CmpEQ(value, *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
                        break;
-               case Context::STENCIL_NOTEQUAL:         // a != b ~ !(a == b)
+               case STENCIL_NOTEQUAL:          // a != b ~ !(a == b)
                        value = CmpEQ(value, *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
                        value ^= Byte8(0xFFFFFFFFFFFFFFFF);
                        break;
-               case Context::STENCIL_LESSEQUAL:        // a <= b ~ (b > a) || (a == b)
+               case STENCIL_LESSEQUAL: // a <= b ~ (b > a) || (a == b)
                        equal = value;
                        equal = CmpEQ(equal, *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
                        value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
                        value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
                        value |= equal;
                        break;
-               case Context::STENCIL_GREATER:          // a > b
+               case STENCIL_GREATER:           // a > b
                        equal = *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ));
                        value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
                        equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
                        value = equal;
                        break;
-               case Context::STENCIL_GREATEREQUAL:     // a >= b ~ !(a < b) ~ !(b > a)
+               case STENCIL_GREATEREQUAL:      // a >= b ~ !(a < b) ~ !(b > a)
                        value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
                        value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
                        value ^= Byte8(0xFFFFFFFFFFFFFFFF);
@@ -565,7 +561,7 @@ namespace sw
 
                Float4 zValue;
 
-               if(state.depthCompareMode != Context::DEPTH_NEVER || (state.depthCompareMode != Context::DEPTH_ALWAYS && !state.depthWriteEnable))
+               if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
                {
                        if(!state.quadLayoutDepthBuffer)
                        {
@@ -583,19 +579,19 @@ namespace sw
 
                switch(state.depthCompareMode)
                {
-               case Context::DEPTH_ALWAYS:
+               case DEPTH_ALWAYS:
                        // Optimized
                        break;
-               case Context::DEPTH_NEVER:
+               case DEPTH_NEVER:
                        // Optimized
                        break;
-               case Context::DEPTH_EQUAL:
+               case DEPTH_EQUAL:
                        zTest = CmpEQ(zValue, Z);
                        break;
-               case Context::DEPTH_NOTEQUAL:
+               case DEPTH_NOTEQUAL:
                        zTest = CmpNEQ(zValue, Z);
                        break;
-               case Context::DEPTH_LESS:
+               case DEPTH_LESS:
                        if(complementaryDepthBuffer)
                        {
                                zTest = CmpLT(zValue, Z);
@@ -605,7 +601,7 @@ namespace sw
                                zTest = CmpNLE(zValue, Z);
                        }
                        break;
-               case Context::DEPTH_GREATEREQUAL:
+               case DEPTH_GREATEREQUAL:
                        if(complementaryDepthBuffer)
                        {
                                zTest = CmpNLT(zValue, Z);
@@ -615,7 +611,7 @@ namespace sw
                                zTest = CmpLE(zValue, Z);
                        }
                        break;
-               case Context::DEPTH_LESSEQUAL:
+               case DEPTH_LESSEQUAL:
                        if(complementaryDepthBuffer)
                        {
                                zTest = CmpLE(zValue, Z);
@@ -625,7 +621,7 @@ namespace sw
                                zTest = CmpNLT(zValue, Z);
                        }
                        break;
-               case Context::DEPTH_GREATER:
+               case DEPTH_GREATER:
                        if(complementaryDepthBuffer)
                        {
                                zTest = CmpNLE(zValue, Z);
@@ -641,10 +637,10 @@ namespace sw
 
                switch(state.depthCompareMode)
                {
-               case Context::DEPTH_ALWAYS:
+               case DEPTH_ALWAYS:
                        zMask = cMask;
                        break;
-               case Context::DEPTH_NEVER:
+               case DEPTH_NEVER:
                        zMask = 0x0;
                        break;
                default:
@@ -660,15 +656,15 @@ namespace sw
                return zMask != 0;
        }
 
-       void PixelRoutine::blendTexture(Registers &r, Vector4i &temp, Vector4i &texture, int stage)
+       void PixelRoutine::blendTexture(Registers &r, Vector4s &temp, Vector4s &texture, int stage)
        {
-               Vector4i *arg1;
-               Vector4i *arg2;
-               Vector4i *arg3;
-               Vector4i res;
+               Vector4s *arg1;
+               Vector4s *arg2;
+               Vector4s *arg3;
+               Vector4s res;
 
-               Vector4i constant;
-               Vector4i tfactor;
+               Vector4s constant;
+               Vector4s tfactor;
 
                const TextureStage::State &textureStage = state.textureStage[stage];
 
@@ -762,9 +758,9 @@ namespace sw
                        ASSERT(false);
                }
 
-               Vector4i mod1;
-               Vector4i mod2;
-               Vector4i mod3;
+               Vector4s mod1;
+               Vector4s mod2;
+               Vector4s mod3;
 
                switch(textureStage.firstModifier)
                {
@@ -1573,35 +1569,35 @@ namespace sw
 
                switch(state.alphaCompareMode)
                {
-               case Context::ALPHA_ALWAYS:
+               case ALPHA_ALWAYS:
                        aMask = 0xF;
                        break;
-               case Context::ALPHA_NEVER:
+               case ALPHA_NEVER:
                        aMask = 0x0;
                        break;
-               case Context::ALPHA_EQUAL:
+               case ALPHA_EQUAL:
                        cmp = CmpEQ(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)));
                        aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
                        break;
-               case Context::ALPHA_NOTEQUAL:           // a != b ~ !(a == b)
+               case ALPHA_NOTEQUAL:            // a != b ~ !(a == b)
                        cmp = CmpEQ(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4((short)0xFFFF, (short)0xFFFF, (short)0xFFFF, (short)0xFFFF);   // FIXME
                        aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
                        break;
-               case Context::ALPHA_LESS:                       // a < b ~ b > a
+               case ALPHA_LESS:                        // a < b ~ b > a
                        cmp = CmpGT(*Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)), alpha);
                        aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
                        break;
-               case Context::ALPHA_GREATEREQUAL:       // a >= b ~ (a > b) || (a == b) ~ !(b > a)   // TODO: Approximate
+               case ALPHA_GREATEREQUAL:        // a >= b ~ (a > b) || (a == b) ~ !(b > a)   // TODO: Approximate
                        equal = CmpEQ(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)));
                        cmp = CmpGT(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)));
                        cmp |= equal;
                        aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
                        break;
-               case Context::ALPHA_LESSEQUAL:          // a <= b ~ !(a > b)
+               case ALPHA_LESSEQUAL:           // a <= b ~ !(a > b)
                        cmp = CmpGT(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4((short)0xFFFF, (short)0xFFFF, (short)0xFFFF, (short)0xFFFF);   // FIXME
                        aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
                        break;
-               case Context::ALPHA_GREATER:                    // a > b
+               case ALPHA_GREATER:                     // a > b
                        cmp = CmpGT(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)));
                        aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
                        break;
@@ -1628,7 +1624,7 @@ namespace sw
                cMask[3] &= aMask3;
        }
 
-       Bool PixelRoutine::alphaTest(Registers &r, Int cMask[4], Vector4i &current)
+       Bool PixelRoutine::alphaTest(Registers &r, Int cMask[4], Vector4s &current)
        {
                if(!state.alphaTestActive())
                {
@@ -1637,7 +1633,7 @@ namespace sw
 
                Int aMask;
 
-               if(state.transparencyAntialiasing == Context::TRANSPARENCY_NONE)
+               if(state.transparencyAntialiasing == TRANSPARENCY_NONE)
                {
                        alphaTest(r, aMask, current.w);
 
@@ -1646,7 +1642,7 @@ namespace sw
                                cMask[q] &= aMask;
                        }
                }
-               else if(state.transparencyAntialiasing == Context::TRANSPARENCY_ALPHA_TO_COVERAGE)
+               else if(state.transparencyAntialiasing == TRANSPARENCY_ALPHA_TO_COVERAGE)
                {
                        Float4 alpha = Float4(current.w) * Float4(1.0f / 0x1000);
 
@@ -1673,7 +1669,7 @@ namespace sw
 
                Int aMask;
 
-               if(state.transparencyAntialiasing == Context::TRANSPARENCY_NONE)
+               if(state.transparencyAntialiasing == TRANSPARENCY_NONE)
                {
                        Short4 alpha = RoundShort4(c0.w * Float4(0x1000));
 
@@ -1684,7 +1680,7 @@ namespace sw
                                cMask[q] &= aMask;
                        }
                }
-               else if(state.transparencyAntialiasing == Context::TRANSPARENCY_ALPHA_TO_COVERAGE)
+               else if(state.transparencyAntialiasing == TRANSPARENCY_ALPHA_TO_COVERAGE)
                {
                        alphaToCoverage(r, cMask, c0.w);
                }
@@ -1700,14 +1696,14 @@ namespace sw
                return pass != 0x0;
        }
 
-       void PixelRoutine::fogBlend(Registers &r, Vector4i &current, Float4 &f, Float4 &z, Float4 &rhw)
+       void PixelRoutine::fogBlend(Registers &r, Vector4s &current, Float4 &f, Float4 &z, Float4 &rhw)
        {
                if(!state.fogActive)
                {
                        return;
                }
 
-               if(state.pixelFogMode != Context::FOG_NONE)
+               if(state.pixelFogMode != FOG_NONE)
                {
                        pixelFog(r, f, z, rhw);
                }
@@ -1732,7 +1728,7 @@ namespace sw
                        return;
                }
 
-               if(state.pixelFogMode != Context::FOG_NONE)
+               if(state.pixelFogMode != FOG_NONE)
                {
                        pixelFog(r, fog, z, rhw);
 
@@ -1757,7 +1753,7 @@ namespace sw
        {
                Float4 &zw = visibility;
 
-               if(state.pixelFogMode != Context::FOG_NONE)
+               if(state.pixelFogMode != FOG_NONE)
                {
                        if(state.wBasedFog)
                        {
@@ -1778,17 +1774,17 @@ namespace sw
 
                switch(state.pixelFogMode)
                {
-               case Context::FOG_NONE:
+               case FOG_NONE:
                        break;
-               case Context::FOG_LINEAR:
+               case FOG_LINEAR:
                        zw *= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.scale));
                        zw += *Pointer<Float4>(r.data + OFFSET(DrawData,fog.offset));
                        break;
-               case Context::FOG_EXP:
+               case FOG_EXP:
                        zw *= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.densityE));
                        zw = exponential2(zw, true);
                        break;
-               case Context::FOG_EXP2:
+               case FOG_EXP2:
                        zw *= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.densityE2));
                        zw *= zw;
                        zw = exponential2(zw, true);
@@ -1799,7 +1795,7 @@ namespace sw
                }
        }
 
-       void PixelRoutine::specularPixel(Vector4i &current, Vector4i &specular)
+       void PixelRoutine::specularPixel(Vector4s &current, Vector4s &specular)
        {
                if(!state.specularAdd)
                {
@@ -1852,7 +1848,7 @@ namespace sw
 
                Float4 zValue;
 
-               if(state.depthCompareMode != Context::DEPTH_NEVER || (state.depthCompareMode != Context::DEPTH_ALWAYS && !state.depthWriteEnable))
+               if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
                {
                        if(!state.quadLayoutDepthBuffer)
                        {
@@ -1889,9 +1885,9 @@ namespace sw
                        return;
                }
 
-               if(state.stencilPassOperation == Context::OPERATION_KEEP && state.stencilZFailOperation == Context::OPERATION_KEEP && state.stencilFailOperation == Context::OPERATION_KEEP)
+               if(state.stencilPassOperation == OPERATION_KEEP && state.stencilZFailOperation == OPERATION_KEEP && state.stencilFailOperation == OPERATION_KEEP)
                {
-                       if(!state.twoSidedStencil || (state.stencilPassOperationCCW == Context::OPERATION_KEEP && state.stencilZFailOperationCCW == Context::OPERATION_KEEP && state.stencilFailOperationCCW == Context::OPERATION_KEEP))
+                       if(!state.twoSidedStencil || (state.stencilPassOperationCCW == OPERATION_KEEP && state.stencilZFailOperationCCW == OPERATION_KEEP && state.stencilFailOperationCCW == OPERATION_KEEP))
                        {
                                return;
                        }
@@ -1912,7 +1908,7 @@ namespace sw
                Byte8 bufferValue = As<Byte8>(Long1(*Pointer<UInt>(buffer)));
        
                Byte8 newValue;
-               stencilOperation(r, newValue, bufferValue, (Context::StencilOperation)state.stencilPassOperation, (Context::StencilOperation)state.stencilZFailOperation, (Context::StencilOperation)state.stencilFailOperation, false, zMask, sMask);
+               stencilOperation(r, newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask);
 
                if(!state.noStencilWriteMask)
                {
@@ -1926,7 +1922,7 @@ namespace sw
                {
                        Byte8 newValueCCW;
 
-                       stencilOperation(r, newValueCCW, bufferValue, (Context::StencilOperation)state.stencilPassOperationCCW, (Context::StencilOperation)state.stencilZFailOperationCCW, (Context::StencilOperation)state.stencilFailOperationCCW, true, zMask, sMask);
+                       stencilOperation(r, newValueCCW, bufferValue, state.stencilPassOperationCCW, state.stencilZFailOperationCCW, state.stencilFailOperationCCW, true, zMask, sMask);
 
                        if(!state.noStencilWriteMaskCCW)
                        {
@@ -1948,7 +1944,7 @@ namespace sw
                *Pointer<UInt>(buffer) = UInt(As<Long>(newValue));
        }
 
-       void PixelRoutine::stencilOperation(Registers &r, Byte8 &newValue, Byte8 &bufferValue, Context::StencilOperation stencilPassOperation, Context::StencilOperation stencilZFailOperation, Context::StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask)
+       void PixelRoutine::stencilOperation(Registers &r, Byte8 &newValue, Byte8 &bufferValue, StencilOperation stencilPassOperation, StencilOperation stencilZFailOperation, StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask)
        {
                Byte8 &pass = newValue;
                Byte8 fail;
@@ -1981,32 +1977,32 @@ namespace sw
                }
        }
 
-       void PixelRoutine::stencilOperation(Registers &r, Byte8 &output, Byte8 &bufferValue, Context::StencilOperation operation, bool CCW)
+       void PixelRoutine::stencilOperation(Registers &r, Byte8 &output, Byte8 &bufferValue, StencilOperation operation, bool CCW)
        {
                switch(operation)
                {
-               case Context::OPERATION_KEEP:
+               case OPERATION_KEEP:
                        output = bufferValue;
                        break;
-               case Context::OPERATION_ZERO:
+               case OPERATION_ZERO:
                        output = Byte8(0x0000000000000000);
                        break;
-               case Context::OPERATION_REPLACE:
+               case OPERATION_REPLACE:
                        output = *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceQ));
                        break;
-               case Context::OPERATION_INCRSAT:
+               case OPERATION_INCRSAT:
                        output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
                        break;
-               case Context::OPERATION_DECRSAT:
+               case OPERATION_DECRSAT:
                        output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
                        break;
-               case Context::OPERATION_INVERT:
+               case OPERATION_INVERT:
                        output = bufferValue ^ Byte8(0xFFFFFFFFFFFFFFFF);
                        break;
-               case Context::OPERATION_INCR:
+               case OPERATION_INCR:
                        output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
                        break;
-               case Context::OPERATION_DECR:
+               case OPERATION_DECR:
                        output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
                        break;
                default:
@@ -2014,7 +2010,7 @@ namespace sw
                }
        }
 
-       void PixelRoutine::sampleTexture(Registers &r, Vector4i &c, int coordinates, int stage, bool project)
+       void PixelRoutine::sampleTexture(Registers &r, Vector4s &c, int coordinates, int stage, bool project)
        {
                Float4 u = r.vf[2 + coordinates].x;
                Float4 v = r.vf[2 + coordinates].y;
@@ -2032,15 +2028,15 @@ namespace sw
                sampleTexture(r, c, stage, u, v, w, q, project);
        }
 
-       void PixelRoutine::sampleTexture(Registers &r, Vector4i &c, int stage, Float4 &u, Float4 &v, Float4 &w, Float4 &q, bool project, bool bias, bool fixed12)
+       void PixelRoutine::sampleTexture(Registers &r, Vector4s &c, int stage, Float4 &u, Float4 &v, Float4 &w, Float4 &q, bool project, bool bias)
        {
                Vector4f dsx;
                Vector4f dsy;
 
-               sampleTexture(r, c, stage, u, v, w, q, dsx, dsy, project, bias, fixed12, false);
+               sampleTexture(r, c, stage, u, v, w, q, dsx, dsy, project, bias, false);
        }
 
-       void PixelRoutine::sampleTexture(Registers &r, Vector4i &c, int stage, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &dsx, Vector4f &dsy, bool project, bool bias, bool fixed12, bool gradients, bool lodProvided)
+       void PixelRoutine::sampleTexture(Registers &r, Vector4s &c, int stage, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &dsx, Vector4f &dsy, bool project, bool bias, bool gradients, bool lodProvided)
        {
                #if PERF_PROFILE
                        Long texTime = Ticks();
@@ -2050,7 +2046,7 @@ namespace sw
 
                if(!project)
                {
-                       sampler[stage]->sampleTexture(texture, c, u, v, w, q, dsx, dsy, bias, fixed12, gradients, lodProvided);
+                       sampler[stage]->sampleTexture(texture, c, u, v, w, q, dsx, dsy, bias, gradients, lodProvided);
                }
                else
                {
@@ -2060,7 +2056,7 @@ namespace sw
                        Float4 v_q = v * rq;
                        Float4 w_q = w * rq;
 
-                       sampler[stage]->sampleTexture(texture, c, u_q, v_q, w_q, q, dsx, dsy, bias, fixed12, gradients, lodProvided);
+                       sampler[stage]->sampleTexture(texture, c, u_q, v_q, w_q, q, dsx, dsy, bias, gradients, lodProvided);
                }
 
                #if PERF_PROFILE
@@ -2076,9 +2072,9 @@ namespace sw
                }
                else
                {
-                       Int index = As<Int>(Float(reg(r, sampler).x.x));
+                       Int index = As<Int>(Float(fetchRegisterF(r, sampler).x.x));
 
-                       for(int i = 0; i < 16; i++)
+                       for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
                        {
                                if(shader->usesSampler(i))
                                {
@@ -2133,11 +2129,14 @@ namespace sw
                        {
                        case FORMAT_NULL:
                                break;
-                       case FORMAT_A16B16G16R16:
+                       case FORMAT_R5G6B5:
                        case FORMAT_A8R8G8B8:
+                       case FORMAT_A8B8G8R8:
                        case FORMAT_X8R8G8B8:
+                       case FORMAT_X8B8G8R8:
                        case FORMAT_A8:
                        case FORMAT_G16R16:
+                       case FORMAT_A16B16G16R16:
                                oC[index].x = Max(oC[index].x, Float4(0.0f)); oC[index].x = Min(oC[index].x, Float4(1.0f));
                                oC[index].y = Max(oC[index].y, Float4(0.0f)); oC[index].y = Min(oC[index].y, Float4(1.0f));
                                oC[index].z = Max(oC[index].z, Float4(0.0f)); oC[index].z = Min(oC[index].z, Float4(1.0f));
@@ -2153,7 +2152,7 @@ namespace sw
                }
        }
 
-       void PixelRoutine::rasterOperation(Vector4i &current, Registers &r, Float4 &fog, Pointer<Byte> &cBuffer, Int &x, Int sMask[4], Int zMask[4], Int cMask[4])
+       void PixelRoutine::rasterOperation(Vector4s &current, Registers &r, Float4 &fog, Pointer<Byte> &cBuffer, Int &x, Int sMask[4], Int zMask[4], Int cMask[4])
        {
                if(!state.colorWriteActive(0))
                {
@@ -2164,8 +2163,11 @@ namespace sw
 
                switch(state.targetFormat[0])
                {
+               case FORMAT_R5G6B5:
                case FORMAT_X8R8G8B8:
+               case FORMAT_X8B8G8R8:
                case FORMAT_A8R8G8B8:
+               case FORMAT_A8B8G8R8:
                case FORMAT_A8:
                case FORMAT_G16R16:
                case FORMAT_A16B16G16R16:
@@ -2181,16 +2183,24 @@ namespace sw
                                current.w <<= 4;
                        }
 
+                       if(state.targetFormat[0] == FORMAT_R5G6B5)
+                       {
+                               current.x &= Short4(0xF800u);
+                               current.y &= Short4(0xFC00u);
+                               current.z &= Short4(0xF800u);
+                       }
+
                        fogBlend(r, current, fog, r.z[0], r.rhw);
 
                        for(unsigned int q = 0; q < state.multiSample; q++)
                        {
                                Pointer<Byte> buffer = cBuffer + q * *Pointer<Int>(r.data + OFFSET(DrawData,colorSliceB[0]));
-                               Vector4i color = current;
+                               Vector4s color = current;
 
                                if(state.multiSampleMask & (1 << q))
                                {
                                        alphaBlend(r, 0, buffer, color, x);
+                                       logicOperation(r, 0, buffer, color, x);
                                        writeColor(r, 0, buffer, x, color, sMask[q], zMask[q], cMask[q]);
                                }
                        }
@@ -2241,15 +2251,18 @@ namespace sw
 
                        switch(state.targetFormat[index])
                        {
+                       case FORMAT_R5G6B5:
                        case FORMAT_X8R8G8B8:
+                       case FORMAT_X8B8G8R8:
                        case FORMAT_A8R8G8B8:
+                       case FORMAT_A8B8G8R8:
                        case FORMAT_A8:
                        case FORMAT_G16R16:
                        case FORMAT_A16B16G16R16:
                                for(unsigned int q = 0; q < state.multiSample; q++)
                                {
                                        Pointer<Byte> buffer = cBuffer[index] + q * *Pointer<Int>(r.data + OFFSET(DrawData,colorSliceB[index]));
-                                       Vector4i color;
+                                       Vector4s color;
 
                                        color.x = convertFixed16(oC[index].x, false);
                                        color.y = convertFixed16(oC[index].y, false);
@@ -2259,6 +2272,7 @@ namespace sw
                                        if(state.multiSampleMask & (1 << q))
                                        {
                                                alphaBlend(r, index, buffer, color, x);
+                                               logicOperation(r, index, buffer, color, x);
                                                writeColor(r, index, buffer, x, color, sMask[q], zMask[q], cMask[q]);
                                        }
                                }
@@ -2284,78 +2298,78 @@ namespace sw
                }
        }
 
-       void PixelRoutine::blendFactor(Registers &r, const Vector4i &blendFactor, const Vector4i &current, const Vector4i &pixel, Context::BlendFactor blendFactorActive)
+       void PixelRoutine::blendFactor(Registers &r, const Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorActive)   // Writes blendFactor.x, y, z for the mode blendFactorActive; no visible case writes .w. NOTE(review): the output is taken by const reference - presumably the Reactor types allow assignment through it; confirm.
        {
                switch(blendFactorActive)
                {
-               case Context::BLEND_ZERO:
+               case BLEND_ZERO:   // blendFactor is not written for this mode
                        // Optimized
                        break;
-               case Context::BLEND_ONE:
+               case BLEND_ONE:   // blendFactor is not written for this mode
                        // Optimized
                        break;
-               case Context::BLEND_SOURCE:
+               case BLEND_SOURCE:   // factor = current, per channel
                        blendFactor.x = current.x;
                        blendFactor.y = current.y;
                        blendFactor.z = current.z;
                        break;
-               case Context::BLEND_INVSOURCE:
+               case BLEND_INVSOURCE:   // factor = 0xFFFF - current, per channel
                        blendFactor.x = Short4(0xFFFFu) - current.x;
                        blendFactor.y = Short4(0xFFFFu) - current.y;
                        blendFactor.z = Short4(0xFFFFu) - current.z;
                        break;
-               case Context::BLEND_DEST:
+               case BLEND_DEST:   // factor = pixel, per channel
                        blendFactor.x = pixel.x;
                        blendFactor.y = pixel.y;
                        blendFactor.z = pixel.z;
                        break;
-               case Context::BLEND_INVDEST:
+               case BLEND_INVDEST:   // factor = 0xFFFF - pixel, per channel
                        blendFactor.x = Short4(0xFFFFu) - pixel.x;
                        blendFactor.y = Short4(0xFFFFu) - pixel.y;
                        blendFactor.z = Short4(0xFFFFu) - pixel.z;
                        break;
-               case Context::BLEND_SOURCEALPHA:
+               case BLEND_SOURCEALPHA:   // factor = current.w in all three channels
                        blendFactor.x = current.w;
                        blendFactor.y = current.w;
                        blendFactor.z = current.w;
                        break;
-               case Context::BLEND_INVSOURCEALPHA:
+               case BLEND_INVSOURCEALPHA:   // factor = 0xFFFF - current.w in all three channels
                        blendFactor.x = Short4(0xFFFFu) - current.w;
                        blendFactor.y = Short4(0xFFFFu) - current.w;
                        blendFactor.z = Short4(0xFFFFu) - current.w;
                        break;
-               case Context::BLEND_DESTALPHA:
+               case BLEND_DESTALPHA:   // factor = pixel.w in all three channels
                        blendFactor.x = pixel.w;
                        blendFactor.y = pixel.w;
                        blendFactor.z = pixel.w;
                        break;
-               case Context::BLEND_INVDESTALPHA:
+               case BLEND_INVDESTALPHA:   // factor = 0xFFFF - pixel.w in all three channels
                        blendFactor.x = Short4(0xFFFFu) - pixel.w;
                        blendFactor.y = Short4(0xFFFFu) - pixel.w;
                        blendFactor.z = Short4(0xFFFFu) - pixel.w;
                        break;
-               case Context::BLEND_SRCALPHASAT:
+               case BLEND_SRCALPHASAT:   // factor = unsigned min(current.w, 0xFFFF - pixel.w), copied to x, y, z
                        blendFactor.x = Short4(0xFFFFu) - pixel.w;
                        blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w));
                        blendFactor.y = blendFactor.x;
                        blendFactor.z = blendFactor.x;
                        break;
-               case Context::BLEND_CONSTANT:
+               case BLEND_CONSTANT:   // factor = DrawData factor.blendConstant4W[0..2]
                        blendFactor.x = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[0]));
                        blendFactor.y = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[1]));
                        blendFactor.z = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[2]));
                        break;
-               case Context::BLEND_INVCONSTANT:
+               case BLEND_INVCONSTANT:   // factor = DrawData factor.invBlendConstant4W[0..2]
                        blendFactor.x = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
                        blendFactor.y = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
                        blendFactor.z = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
                        break;
-               case Context::BLEND_CONSTANTALPHA:
+               case BLEND_CONSTANTALPHA:   // factor = factor.blendConstant4W[3] in all three channels
                        blendFactor.x = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
                        blendFactor.y = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
                        blendFactor.z = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
                        break;
-               case Context::BLEND_INVCONSTANTALPHA:
+               case BLEND_INVCONSTANTALPHA:   // factor = factor.invBlendConstant4W[3] in all three channels
                        blendFactor.x = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
                        blendFactor.y = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
                        blendFactor.z = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
@@ -2365,49 +2379,49 @@ namespace sw
                }
        }
        
-       void PixelRoutine::blendFactorAlpha(Registers &r, const Vector4i &blendFactor, const Vector4i &current, const Vector4i &pixel, Context::BlendFactor blendFactorAlphaActive)
+       void PixelRoutine::blendFactorAlpha(Registers &r, const Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorAlphaActive)   // Writes only blendFactor.w for the mode blendFactorAlphaActive. NOTE(review): the output is taken by const reference - presumably the Reactor types allow assignment through it; confirm.
        {
                switch(blendFactorAlphaActive)
                {
-               case Context::BLEND_ZERO:
+               case BLEND_ZERO:   // blendFactor.w is not written for this mode
                        // Optimized
                        break;
-               case Context::BLEND_ONE:
+               case BLEND_ONE:   // blendFactor.w is not written for this mode
                        // Optimized
                        break;
-               case Context::BLEND_SOURCE:
+               case BLEND_SOURCE:   // same result as BLEND_SOURCEALPHA: w = current.w
                        blendFactor.w = current.w;
                        break;
-               case Context::BLEND_INVSOURCE:
+               case BLEND_INVSOURCE:   // same result as BLEND_INVSOURCEALPHA: w = 0xFFFF - current.w
                        blendFactor.w = Short4(0xFFFFu) - current.w;
                        break;
-               case Context::BLEND_DEST:
+               case BLEND_DEST:   // same result as BLEND_DESTALPHA: w = pixel.w
                        blendFactor.w = pixel.w;
                        break;
-               case Context::BLEND_INVDEST:
+               case BLEND_INVDEST:   // same result as BLEND_INVDESTALPHA: w = 0xFFFF - pixel.w
                        blendFactor.w = Short4(0xFFFFu) - pixel.w;
                        break;
-               case Context::BLEND_SOURCEALPHA:
+               case BLEND_SOURCEALPHA:   // w = current.w
                        blendFactor.w = current.w;
                        break;
-               case Context::BLEND_INVSOURCEALPHA:
+               case BLEND_INVSOURCEALPHA:   // w = 0xFFFF - current.w
                        blendFactor.w = Short4(0xFFFFu) - current.w;
                        break;
-               case Context::BLEND_DESTALPHA:
+               case BLEND_DESTALPHA:   // w = pixel.w
                        blendFactor.w = pixel.w;
                        break;
-               case Context::BLEND_INVDESTALPHA:
+               case BLEND_INVDESTALPHA:   // w = 0xFFFF - pixel.w
                        blendFactor.w = Short4(0xFFFFu) - pixel.w;
                        break;
-               case Context::BLEND_SRCALPHASAT:
+               case BLEND_SRCALPHASAT:   // w = 0xFFFF, unlike the min() used for x, y, z in blendFactor()
                        blendFactor.w = Short4(0xFFFFu);
                        break;
-               case Context::BLEND_CONSTANT:
-               case Context::BLEND_CONSTANTALPHA:
+               case BLEND_CONSTANT:   // shares the BLEND_CONSTANTALPHA code below
+               case BLEND_CONSTANTALPHA:   // w = DrawData factor.blendConstant4W[3]
                        blendFactor.w = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
                        break;
-               case Context::BLEND_INVCONSTANT:
-               case Context::BLEND_INVCONSTANTALPHA:
+               case BLEND_INVCONSTANT:   // shares the BLEND_INVCONSTANTALPHA code below
+               case BLEND_INVCONSTANTALPHA:   // w = DrawData factor.invBlendConstant4W[3]
                        blendFactor.w = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
                        break;
                default:
@@ -2415,26 +2429,29 @@ namespace sw
                }
        }
 
-       void PixelRoutine::alphaBlend(Registers &r, int index, Pointer<Byte> &cBuffer, Vector4i &current, Int &x)
+       void PixelRoutine::readPixel(Registers &r, int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x, Vector4s &pixel)   // Loads the existing color data at x from cBuffer into pixel, decoded according to state.targetFormat[index]; the implemented cases shown load from two addresses colorPitchB[index] bytes apart. NOTE(review): current is not referenced in the visible hunks - confirm whether the parameter is needed.
        {
-               if(!state.alphaBlendActive)
-               {
-                       return;
-               }
-                
-               Pointer<Byte> buffer;
-
-               Vector4i pixel;
                Short4 c01;
                Short4 c23;
+               Pointer<Byte> buffer;   // read cursor, derived from cBuffer in each case
 
-               // Read pixel
                switch(state.targetFormat[index])
                {
+               case FORMAT_R5G6B5:   // 2 bytes per pixel (offset 2 * x); both Int loads are packed into c01
+                       buffer = cBuffer + 2 * x;
+                       c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 0));   // first load goes into Int2 element 0
+                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
+                       c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 1));   // load at +colorPitchB goes into Int2 element 1
+
+                       pixel.x = c01 & Short4(0xF800u);   // bits 15..11 stay in place
+                       pixel.y = (c01 & Short4(0x07E0u)) << 5;   // bits 10..5 shifted up to 15..10
+                       pixel.z = (c01 & Short4(0x001Fu)) << 11;   // bits 4..0 shifted up to 15..11
+                       pixel.w = Short4(0xFFFFu);   // w is forced to 0xFFFF
+                       break;
                case FORMAT_A8R8G8B8:
                        buffer = cBuffer + 4 * x;
                        c01 = *Pointer<Short4>(buffer);
-                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
+                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
                        c23 = *Pointer<Short4>(buffer);
                        pixel.z = c01;
                        pixel.y = c01;
@@ -2450,10 +2467,29 @@ namespace sw
                        pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
                        pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
                        break;
+               case FORMAT_A8B8G8R8:   // 4 bytes per pixel (offset 4 * x). NOTE(review): the Unpack sequence presumably transposes the bytes and widens them to 16 bits - confirm against Reactor UnpackLow/UnpackHigh
+                       buffer = cBuffer + 4 * x;
+                       c01 = *Pointer<Short4>(buffer);
+                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
+                       c23 = *Pointer<Short4>(buffer);
+                       pixel.z = c01;
+                       pixel.y = c01;
+                       pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
+                       pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
+                       pixel.x = pixel.z;
+                       pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
+                       pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
+                       pixel.y = pixel.z;
+                       pixel.w = pixel.x;
+                       pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));   // x is built from the register held in z
+                       pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
+                       pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));   // z is built from the register held in w
+                       pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));   // w is unpacked from the loaded data
+                       break;
                case FORMAT_A8:
                        buffer = cBuffer + 1 * x;
                        pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 0);
-                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
+                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
                        pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 1);
                        pixel.w = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
                        pixel.x = Short4(0x0000);
@@ -2463,7 +2499,7 @@ namespace sw
                case FORMAT_X8R8G8B8:
                        buffer = cBuffer + 4 * x;
                        c01 = *Pointer<Short4>(buffer);
-                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
+                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
                        c23 = *Pointer<Short4>(buffer);
                        pixel.z = c01;
                        pixel.y = c01;
@@ -2478,34 +2514,53 @@ namespace sw
                        pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
                        pixel.w = Short4(0xFFFFu);
                        break;
+               case FORMAT_X8B8G8R8:   // same statements as the FORMAT_A8B8G8R8 case except for the final write to w
+                       buffer = cBuffer + 4 * x;
+                       c01 = *Pointer<Short4>(buffer);
+                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
+                       c23 = *Pointer<Short4>(buffer);
+                       pixel.z = c01;
+                       pixel.y = c01;
+                       pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
+                       pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
+                       pixel.x = pixel.z;
+                       pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
+                       pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
+                       pixel.y = pixel.z;
+                       pixel.w = pixel.x;
+                       pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
+                       pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
+                       pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
+                       pixel.w = Short4(0xFFFFu);   // w is forced to 0xFFFF instead of being unpacked
+                       break;
                case FORMAT_A8G8R8B8Q:
                        UNIMPLEMENTED();
-               //      pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
-               //      pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
-               //      pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
-               //      pixel.w = UnpackHigh(As<Byte8>(pixel.w), *Pointer<Byte8>(cBuffer + 8 * x + 8));
+                       //      pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
+                       //      pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
+                       //      pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
+                       //      pixel.w = UnpackHigh(As<Byte8>(pixel.w), *Pointer<Byte8>(cBuffer + 8 * x + 8));
                        break;
                case FORMAT_X8G8R8B8Q:
                        UNIMPLEMENTED();
-               //      pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
-               //      pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
-               //      pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
-               //      pixel.w = Short4(0xFFFFu);
+                       //      pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
+                       //      pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
+                       //      pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
+                       //      pixel.w = Short4(0xFFFFu);
                        break;
                case FORMAT_A16B16G16R16:
-                       buffer  = cBuffer;
+                       buffer = cBuffer;
                        pixel.x = *Pointer<Short4>(buffer + 8 * x);
                        pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
-                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
+                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
                        pixel.z = *Pointer<Short4>(buffer + 8 * x);
                        pixel.w = *Pointer<Short4>(buffer + 8 * x + 8);
                        transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
                        break;
                case FORMAT_G16R16:
                        buffer = cBuffer;
-                       pixel.x = *Pointer<Short4>(buffer  + 4 * x);
-                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
-                       pixel.y = *Pointer<Short4>(buffer  + 4 * x);
+                       pixel.x = *Pointer<Short4>(buffer + 4 * x);
+                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
+                       pixel.y = *Pointer<Short4>(buffer + 4 * x);
                        pixel.z = pixel.x;
                        pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
                        pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y));
@@ -2521,24 +2576,38 @@ namespace sw
 
                if(postBlendSRGB && state.writeSRGB)
                {
-                       sRGBtoLinear16_16(r, pixel);    
+                       sRGBtoLinear16_12_16(r, pixel);   // runs only when postBlendSRGB and state.writeSRGB are both set
+               }
+       }
+
+       void PixelRoutine::alphaBlend(Registers &r, int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)   // Blends current in place with the data readPixel() loads from cBuffer; x, y, z and w use separate factors and operations.
+       {
+               if(!state.alphaBlendActive)   // current is left unchanged when blending is not active
+               {
+                       return;
                }
 
+               Vector4s pixel;   // filled in by readPixel() below
+               Short4 c01;   // NOTE(review): c01 and c23 are not referenced in the visible part of this function - likely leftovers from moving the read into readPixel(); confirm and remove
+               Short4 c23;
+
+               readPixel(r, index, cBuffer, current, x, pixel);
+
                // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
-               Vector4i sourceFactor;
-               Vector4i destFactor;
+               Vector4s sourceFactor;   // multiplied into current
+               Vector4s destFactor;   // multiplied into pixel
 
-               blendFactor(r, sourceFactor, current, pixel, (Context::BlendFactor)state.sourceBlendFactor);
-               blendFactor(r, destFactor, current, pixel, (Context::BlendFactor)state.destBlendFactor);
+               blendFactor(r, sourceFactor, current, pixel, state.sourceBlendFactor);
+               blendFactor(r, destFactor, current, pixel, state.destBlendFactor);
 
-               if(state.sourceBlendFactor != Context::BLEND_ONE && state.sourceBlendFactor != Context::BLEND_ZERO)
+               if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)   // the multiply is skipped for BLEND_ONE and BLEND_ZERO
                {
                        current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x));
                        current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
                        current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
                }
        
-               if(state.destBlendFactor != Context::BLEND_ONE && state.destBlendFactor != Context::BLEND_ZERO)
+               if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)   // the multiply is skipped for BLEND_ONE and BLEND_ZERO
                {
                        pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
                        pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y));
@@ -2547,40 +2616,40 @@ namespace sw
 
                switch(state.blendOperation)
                {
-               case Context::BLENDOP_ADD:
+               case BLENDOP_ADD:   // current + pixel via AddSat on unsigned lanes
                        current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
                        current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
                        current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
                        break;
-               case Context::BLENDOP_SUB:
+               case BLENDOP_SUB:   // current - pixel via SubSat on unsigned lanes
                        current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
                        current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
                        current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
                        break;
-               case Context::BLENDOP_INVSUB:
+               case BLENDOP_INVSUB:   // pixel - current via SubSat on unsigned lanes
                        current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x));
                        current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y));
                        current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z));
                        break;
-               case Context::BLENDOP_MIN:
+               case BLENDOP_MIN:   // unsigned minimum of current and pixel
                        current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x));
                        current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y));
                        current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z));
                        break;
-               case Context::BLENDOP_MAX:
+               case BLENDOP_MAX:   // unsigned maximum of current and pixel
                        current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x));
                        current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y));
                        current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z));
                        break;
-               case Context::BLENDOP_SOURCE:
+               case BLENDOP_SOURCE:   // current is kept as is
                        // No operation
                        break;
-               case Context::BLENDOP_DEST:
+               case BLENDOP_DEST:   // current is replaced by pixel
                        current.x = pixel.x;
                        current.y = pixel.y;
                        current.z = pixel.z;
                        break;
-               case Context::BLENDOP_NULL:
+               case BLENDOP_NULL:   // current is set to zero
                        current.x = Short4(0x0000, 0x0000, 0x0000, 0x0000);
                        current.y = Short4(0x0000, 0x0000, 0x0000, 0x0000);
                        current.z = Short4(0x0000, 0x0000, 0x0000, 0x0000);
@@ -2589,43 +2658,43 @@ namespace sw
                        ASSERT(false);
                }
 
-               blendFactorAlpha(r, sourceFactor, current, pixel, (Context::BlendFactor)state.sourceBlendFactorAlpha);
-               blendFactorAlpha(r, destFactor, current, pixel, (Context::BlendFactor)state.destBlendFactorAlpha);
+               blendFactorAlpha(r, sourceFactor, current, pixel, state.sourceBlendFactorAlpha);   // reuses sourceFactor; only its w is consumed below
+               blendFactorAlpha(r, destFactor, current, pixel, state.destBlendFactorAlpha);   // reuses destFactor; only its w is consumed below
 
-               if(state.sourceBlendFactorAlpha != Context::BLEND_ONE && state.sourceBlendFactorAlpha != Context::BLEND_ZERO)
+               if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)   // the multiply is skipped for BLEND_ONE and BLEND_ZERO
                {
                        current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
                }
        
-               if(state.destBlendFactorAlpha != Context::BLEND_ONE && state.destBlendFactorAlpha != Context::BLEND_ZERO)
+               if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)   // the multiply is skipped for BLEND_ONE and BLEND_ZERO
                {
                        pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
                }
 
                switch(state.blendOperationAlpha)
                {
-               case Context::BLENDOP_ADD:
+               case BLENDOP_ADD:   // current.w + pixel.w via AddSat
                        current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
                        break;
-               case Context::BLENDOP_SUB:
+               case BLENDOP_SUB:   // current.w - pixel.w via SubSat
                        current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
                        break;
-               case Context::BLENDOP_INVSUB:
+               case BLENDOP_INVSUB:   // pixel.w - current.w via SubSat
                        current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w));
                        break;
-               case Context::BLENDOP_MIN:
+               case BLENDOP_MIN:   // unsigned minimum
                        current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w));
                        break;
-               case Context::BLENDOP_MAX:
+               case BLENDOP_MAX:   // unsigned maximum
                        current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w));
                        break;
-               case Context::BLENDOP_SOURCE:
+               case BLENDOP_SOURCE:   // current.w is kept as is
                        // No operation
                        break;
-               case Context::BLENDOP_DEST:
+               case BLENDOP_DEST:   // current.w is replaced by pixel.w
                        current.w = pixel.w;
                        break;
-               case Context::BLENDOP_NULL:
+               case BLENDOP_NULL:   // current.w is set to zero
                        current.w = Short4(0x0000, 0x0000, 0x0000, 0x0000);
                        break;
                default:
@@ -2633,26 +2702,123 @@ namespace sw
                }
        }
 
-       void PixelRoutine::writeColor(Registers &r, int index, Pointer<Byte> &cBuffer, Int &x, Vector4i &current, Int &sMask, Int &zMask, Int &cMask)
+       void PixelRoutine::logicOperation(Registers &r, int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)   // Combines current in place with the data readPixel() loads from cBuffer, using the bitwise rule in state.logicalOperation; only x, y, z are modified, w is never written.
        {
-               if(!state.colorWriteActive(index))
+               if(state.logicalOperation == LogicalOperation::LOGICALOP_COPY)   // COPY keeps current as is, so the read is skipped. NOTE(review): only use of the LogicalOperation:: qualifier; the switch below uses unqualified names
                {
                        return;
                }
 
+               Vector4s pixel;   // filled in by readPixel() below
+
+               // Read pixel
+               readPixel(r, index, cBuffer, current, x, pixel);
+
+               switch(state.logicalOperation)
+               {
+               case LOGICALOP_CLEAR:   // result = 0
+                       current.x = 0;
+                       current.y = 0;
+                       current.z = 0;
+                       break;
+               case LOGICALOP_SET:   // result = 0xFFFF
+                       current.x = 0xFFFF;
+                       current.y = 0xFFFF;
+                       current.z = 0xFFFF;
+                       break;
+               case LOGICALOP_COPY:
+                       ASSERT(false);   // Optimized out
+                       break;
+               case LOGICALOP_COPY_INVERTED:   // result = ~current
+                       current.x = ~current.x;
+                       current.y = ~current.y;
+                       current.z = ~current.z;
+                       break;
+               case LOGICALOP_NOOP:   // result = pixel
+                       current.x = pixel.x;
+                       current.y = pixel.y;
+                       current.z = pixel.z;
+                       break;
+               case LOGICALOP_INVERT:   // result = ~pixel
+                       current.x = ~pixel.x;
+                       current.y = ~pixel.y;
+                       current.z = ~pixel.z;
+                       break;
+               case LOGICALOP_AND:   // result = pixel & current
+                       current.x = pixel.x & current.x;
+                       current.y = pixel.y & current.y;
+                       current.z = pixel.z & current.z;
+                       break;
+               case LOGICALOP_NAND:   // result = ~(pixel & current)
+                       current.x = ~(pixel.x & current.x);
+                       current.y = ~(pixel.y & current.y);
+                       current.z = ~(pixel.z & current.z);
+                       break;
+               case LOGICALOP_OR:   // result = pixel | current
+                       current.x = pixel.x | current.x;
+                       current.y = pixel.y | current.y;
+                       current.z = pixel.z | current.z;
+                       break;
+               case LOGICALOP_NOR:   // result = ~(pixel | current)
+                       current.x = ~(pixel.x | current.x);
+                       current.y = ~(pixel.y | current.y);
+                       current.z = ~(pixel.z | current.z);
+                       break;
+               case LOGICALOP_XOR:   // result = pixel ^ current
+                       current.x = pixel.x ^ current.x;
+                       current.y = pixel.y ^ current.y;
+                       current.z = pixel.z ^ current.z;
+                       break;
+               case LOGICALOP_EQUIV:   // result = ~(pixel ^ current)
+                       current.x = ~(pixel.x ^ current.x);
+                       current.y = ~(pixel.y ^ current.y);
+                       current.z = ~(pixel.z ^ current.z);
+                       break;
+               case LOGICALOP_AND_REVERSE:   // result = current & ~pixel
+                       current.x = ~pixel.x & current.x;
+                       current.y = ~pixel.y & current.y;
+                       current.z = ~pixel.z & current.z;
+                       break;
+               case LOGICALOP_AND_INVERTED:   // result = ~current & pixel
+                       current.x = pixel.x & ~current.x;
+                       current.y = pixel.y & ~current.y;
+                       current.z = pixel.z & ~current.z;
+                       break;
+               case LOGICALOP_OR_REVERSE:   // result = current | ~pixel
+                       current.x = ~pixel.x | current.x;
+                       current.y = ~pixel.y | current.y;
+                       current.z = ~pixel.z | current.z;
+                       break;
+               case LOGICALOP_OR_INVERTED:   // result = ~current | pixel
+                       current.x = pixel.x | ~current.x;
+                       current.y = pixel.y | ~current.y;
+                       current.z = pixel.z | ~current.z;
+                       break;
+               default:
+                       ASSERT(false);
+               }
+       }
+
+       void PixelRoutine::writeColor(Registers &r, int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &current, Int &sMask, Int &zMask, Int &cMask)
+       {
                if(postBlendSRGB && state.writeSRGB)
                {
-                       linearToSRGB16_16(r, current);
+                       linearToSRGB16_12_16(r, current);
                }
 
                if(exactColorRounding)
                {
                        switch(state.targetFormat[index])
                        {
+                       case FORMAT_R5G6B5:
+                       //      UNIMPLEMENTED();   // FIXME
+                               break;
                        case FORMAT_X8G8R8B8Q:
                        case FORMAT_A8G8R8B8Q:
                        case FORMAT_X8R8G8B8:
+                       case FORMAT_X8B8G8R8:
                        case FORMAT_A8R8G8B8:
+                       case FORMAT_A8B8G8R8:
                                {
                                        current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
                                        current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
@@ -2669,6 +2835,15 @@ namespace sw
 
                switch(state.targetFormat[index])
                {
+               case FORMAT_R5G6B5:
+                       {
+                               current.x = current.x & Short4(0xF800u);
+                               current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
+                               current.z = As<UShort4>(current.z) >> 11;
+
+                               current.x = current.x | current.y | current.z;
+                       }
+                       break;
                case FORMAT_X8G8R8B8Q:
                        UNIMPLEMENTED();
                //      current.x = As<Short4>(As<UShort4>(current.x) >> 8);
@@ -2724,6 +2899,42 @@ namespace sw
                                current.y = As<Short4>(UnpackHigh(current.y, current.x));
                        }
                        break;
+               case FORMAT_X8B8G8R8:
+               case FORMAT_A8B8G8R8:
+                       if(state.targetFormat[index] == FORMAT_X8B8G8R8 || rgbaWriteMask == 0x7)
+                       {
+                               current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+                               current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+                               current.z = As<Short4>(As<UShort4>(current.z) >> 8);
+
+                               current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
+                               current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
+
+                               current.x = current.z;
+                               current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
+                               current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
+                               current.y = current.z;
+                               current.z = As<Short4>(UnpackLow(current.z, current.x));
+                               current.y = As<Short4>(UnpackHigh(current.y, current.x));
+                       }
+                       else
+                       {
+                               current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+                               current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+                               current.z = As<Short4>(As<UShort4>(current.z) >> 8);
+                               current.w = As<Short4>(As<UShort4>(current.w) >> 8);
+
+                               current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
+                               current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
+
+                               current.x = current.z;
+                               current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
+                               current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
+                               current.y = current.z;
+                               current.z = As<Short4>(UnpackLow(current.z, current.x));
+                               current.y = As<Short4>(UnpackHigh(current.y, current.x));
+                       }
+                       break;
                case FORMAT_A8:
                        current.w = As<Short4>(As<UShort4>(current.w) >> 8);
                        current.w = As<Short4>(Pack(As<UShort4>(current.w), As<UShort4>(current.w)));
@@ -2737,20 +2948,6 @@ namespace sw
                case FORMAT_A16B16G16R16:
                        transpose4x4(current.x, current.y, current.z, current.w);
                        break;
-               case FORMAT_R32F:
-               case FORMAT_G32R32F:
-               case FORMAT_A32B32G32R32F:
-                       {
-                               Vector4f oC;
-
-                               oC.x = convertUnsigned16(UShort4(current.x));
-                               oC.y = convertUnsigned16(UShort4(current.y));
-                               oC.z = convertUnsigned16(UShort4(current.z));
-                               oC.w = convertUnsigned16(UShort4(current.w));
-
-                               writeColor(r, index, cBuffer, x, oC, sMask, zMask, cMask);
-                       }
-                       return;
                default:
                        ASSERT(false);
                }
@@ -2774,11 +2971,47 @@ namespace sw
                        xMask &= sMask;
                }
 
-               Pointer<Byte> buffer;
-               Short4 value;
-
                switch(state.targetFormat[index])
                {
+               case FORMAT_R5G6B5:
+                       {
+                               Pointer<Byte> buffer = cBuffer + 2 * x;
+                               Int value = *Pointer<Int>(buffer);
+
+                               Int c01 = Extract(As<Int2>(current.x), 0);
+
+                               if((bgraWriteMask & 0x00000007) != 0x00000007)
+                               {
+                                       Int masked = value;
+                                       c01 &= *Pointer<Int>(r.constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
+                                       masked &= *Pointer<Int>(r.constants + OFFSET(Constants,invMask565Q[bgraWriteMask & 0x7][0]));
+                                       c01 |= masked;
+                               }
+
+                               c01 &= *Pointer<Int>(r.constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
+                               value &= *Pointer<Int>(r.constants + OFFSET(Constants,invMaskW4Q[0][0]) + xMask * 8);
+                               c01 |= value;
+                               *Pointer<Int>(buffer) = c01;
+
+                               buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
+                               value = *Pointer<Int>(buffer);
+
+                               Int c23 = Extract(As<Int2>(current.x), 1);
+
+                               if((bgraWriteMask & 0x00000007) != 0x00000007)
+                               {
+                                       Int masked = value;
+                                       c23 &= *Pointer<Int>(r.constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
+                                       masked &= *Pointer<Int>(r.constants + OFFSET(Constants,invMask565Q[bgraWriteMask & 0x7][0]));
+                                       c23 |= masked;
+                               }
+
+                               c23 &= *Pointer<Int>(r.constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
+                               value &= *Pointer<Int>(r.constants + OFFSET(Constants,invMaskW4Q[0][2]) + xMask * 8);
+                               c23 |= value;
+                               *Pointer<Int>(buffer) = c23;
+                       }
+                       break;
                case FORMAT_A8G8R8B8Q:
                case FORMAT_X8G8R8B8Q:   // FIXME: Don't touch alpha?
                        UNIMPLEMENTED();
@@ -2818,46 +3051,89 @@ namespace sw
                        break;
                case FORMAT_A8R8G8B8:
                case FORMAT_X8R8G8B8:   // FIXME: Don't touch alpha?
-                       buffer = cBuffer + x * 4;
-                       value = *Pointer<Short4>(buffer);
-
-                       if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
-                          ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
-                           (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
                        {
-                               Short4 masked = value;
-                               c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
-                               masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
-                               c01 |= masked;
-                       }
+                               Pointer<Byte> buffer = cBuffer + x * 4;
+                               Short4 value = *Pointer<Short4>(buffer);
+
+                               if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
+                                  ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
+                                       (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
+                               {
+                                       Short4 masked = value;
+                                       c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
+                                       masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
+                                       c01 |= masked;
+                               }
 
-                       c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD01Q) + xMask * 8);
-                       value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
-                       c01 |= value;
-                       *Pointer<Short4>(buffer) = c01;
+                               c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD01Q) + xMask * 8);
+                               value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
+                               c01 |= value;
+                               *Pointer<Short4>(buffer) = c01;
 
-                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
-                       value = *Pointer<Short4>(buffer);
+                               buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
+                               value = *Pointer<Short4>(buffer);
 
-                       if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
-                          ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
-                           (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
-                       {
-                               Short4 masked = value;
-                               c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
-                               masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
-                               c23 |= masked;
+                               if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
+                                  ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
+                                       (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
+                               {
+                                       Short4 masked = value;
+                                       c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
+                                       masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
+                                       c23 |= masked;
+                               }
+
+                               c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD23Q) + xMask * 8);
+                               value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
+                               c23 |= value;
+                               *Pointer<Short4>(buffer) = c23;
                        }
+                       break;
+               case FORMAT_A8B8G8R8:
+               case FORMAT_X8B8G8R8:   // FIXME: Don't touch alpha?
+                       {
+                               Pointer<Byte> buffer = cBuffer + x * 4;
+                               Short4 value = *Pointer<Short4>(buffer);
+
+                               if((state.targetFormat[index] == FORMAT_A8B8G8R8 && rgbaWriteMask != 0x0000000F) ||
+                                  ((state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x00000007) &&
+                                       (state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x0000000F)))   // FIXME: Need for masking when XBGR && Fh?
+                               {
+                                       Short4 masked = value;
+                                       c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
+                                       masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
+                                       c01 |= masked;
+                               }
+
+                               c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD01Q) + xMask * 8);
+                               value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
+                               c01 |= value;
+                               *Pointer<Short4>(buffer) = c01;
+
+                               buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
+                               value = *Pointer<Short4>(buffer);
 
-                       c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD23Q) + xMask * 8);
-                       value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
-                       c23 |= value;
-                       *Pointer<Short4>(buffer) = c23;
+                               if((state.targetFormat[index] == FORMAT_A8B8G8R8 && rgbaWriteMask != 0x0000000F) ||
+                                  ((state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x00000007) &&
+                                       (state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x0000000F)))   // FIXME: Need for masking when XBGR && Fh?
+                               {
+                                       Short4 masked = value;
+                                       c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
+                                       masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
+                                       c23 |= masked;
+                               }
+
+                               c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD23Q) + xMask * 8);
+                               value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
+                               c23 |= value;
+                               *Pointer<Short4>(buffer) = c23;
+                       }
                        break;
                case FORMAT_A8:
                        if(rgbaWriteMask & 0x00000008)
                        {
-                               buffer = cBuffer + 1 * x;
+                               Pointer<Byte> buffer = cBuffer + 1 * x;
+                               Short4 value;
                                Insert(value, *Pointer<Short>(buffer), 0);
                                Int pitch = *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
                                Insert(value, *Pointer<Short>(buffer + pitch), 1);
@@ -2872,111 +3148,115 @@ namespace sw
                        }
                        break;
                case FORMAT_G16R16:
-                       buffer = cBuffer + 4 * x;
-
-                       value = *Pointer<Short4>(buffer);
-
-                       if((rgbaWriteMask & 0x00000003) != 0x00000003)
-                       {
-                               Short4 masked = value;
-                               current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
-                               masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0]));
-                               current.x |= masked;
-                       }
-
-                       current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD01Q) + xMask * 8);
-                       value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
-                       current.x |= value;
-                       *Pointer<Short4>(buffer) = current.x;
-
-                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
-
-                       value = *Pointer<Short4>(buffer);
-
-                       if((rgbaWriteMask & 0x00000003) != 0x00000003)
                        {
-                               Short4 masked = value;
-                               current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
-                               masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0]));
-                               current.y |= masked;
-                       }
+                               Pointer<Byte> buffer = cBuffer + 4 * x;
 
-                       current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD23Q) + xMask * 8);
-                       value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
-                       current.y |= value;
-                       *Pointer<Short4>(buffer) = current.y;
-                       break;
-               case FORMAT_A16B16G16R16:
-                       buffer = cBuffer + 8 * x;
+                               Short4 value = *Pointer<Short4>(buffer);
 
-                       {
-                               value = *Pointer<Short4>(buffer);
-
-                               if(rgbaWriteMask != 0x0000000F)
+                               if((rgbaWriteMask & 0x00000003) != 0x00000003)
                                {
                                        Short4 masked = value;
-                                       current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
-                                       masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
+                                       current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
+                                       masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0]));
                                        current.x |= masked;
                                }
 
-                               current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
-                               value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
+                               current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD01Q) + xMask * 8);
+                               value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
                                current.x |= value;
                                *Pointer<Short4>(buffer) = current.x;
-                       }
 
-                       {
-                               value = *Pointer<Short4>(buffer + 8);
+                               buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
 
-                               if(rgbaWriteMask != 0x0000000F)
+                               value = *Pointer<Short4>(buffer);
+
+                               if((rgbaWriteMask & 0x00000003) != 0x00000003)
                                {
                                        Short4 masked = value;
-                                       current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
-                                       masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
+                                       current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
+                                       masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0]));
                                        current.y |= masked;
                                }
 
-                               current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
-                               value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
+                               current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD23Q) + xMask * 8);
+                               value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
                                current.y |= value;
-                               *Pointer<Short4>(buffer + 8) = current.y;
+                               *Pointer<Short4>(buffer) = current.y;
                        }
-
-                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
-
+                       break;
+               case FORMAT_A16B16G16R16:
                        {
-                               value = *Pointer<Short4>(buffer);
+                               Pointer<Byte> buffer = cBuffer + 8 * x;
 
-                               if(rgbaWriteMask != 0x0000000F)
                                {
-                                       Short4 masked = value;
-                                       current.z &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
-                                       masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
-                                       current.z |= masked;
+                                       Short4 value = *Pointer<Short4>(buffer);
+
+                                       if(rgbaWriteMask != 0x0000000F)
+                                       {
+                                               Short4 masked = value;
+                                               current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
+                                               masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
+                                               current.x |= masked;
+                                       }
+
+                                       current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
+                                       value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
+                                       current.x |= value;
+                                       *Pointer<Short4>(buffer) = current.x;
                                }
 
-                               current.z &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
-                               value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
-                               current.z |= value;
-                               *Pointer<Short4>(buffer) = current.z;
-                       }
+                               {
+                                       Short4 value = *Pointer<Short4>(buffer + 8);
 
-                       {
-                               value = *Pointer<Short4>(buffer + 8);
+                                       if(rgbaWriteMask != 0x0000000F)
+                                       {
+                                               Short4 masked = value;
+                                               current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
+                                               masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
+                                               current.y |= masked;
+                                       }
+
+                                       current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
+                                       value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
+                                       current.y |= value;
+                                       *Pointer<Short4>(buffer + 8) = current.y;
+                               }
+
+                               buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
 
-                               if(rgbaWriteMask != 0x0000000F)
                                {
-                                       Short4 masked = value;
-                                       current.w &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
-                                       masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
-                                       current.w |= masked;
+                                       Short4 value = *Pointer<Short4>(buffer);
+
+                                       if(rgbaWriteMask != 0x0000000F)
+                                       {
+                                               Short4 masked = value;
+                                               current.z &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
+                                               masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
+                                               current.z |= masked;
+                                       }
+
+                                       current.z &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
+                                       value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
+                                       current.z |= value;
+                                       *Pointer<Short4>(buffer) = current.z;
                                }
 
-                               current.w &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
-                               value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
-                               current.w |= value;
-                               *Pointer<Short4>(buffer + 8) = current.w;
+                               {
+                                       Short4 value = *Pointer<Short4>(buffer + 8);
+
+                                       if(rgbaWriteMask != 0x0000000F)
+                                       {
+                                               Short4 masked = value;
+                                               current.w &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
+                                               masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
+                                               current.w |= masked;
+                                       }
+
+                                       current.w &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
+                                       value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
+                                       current.w |= value;
+                                       *Pointer<Short4>(buffer + 8) = current.w;
+                               }
                        }
                        break;
                default:
@@ -2984,68 +3264,68 @@ namespace sw
                }
        }
 
-       void PixelRoutine::blendFactor(Registers &r, const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, Context::BlendFactor blendFactorActive) 
+       void PixelRoutine::blendFactor(Registers &r, const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorActive) 
        {
                switch(blendFactorActive)
                {
-               case Context::BLEND_ZERO:
+               case BLEND_ZERO:
                        // Optimized
                        break;
-               case Context::BLEND_ONE:
+               case BLEND_ONE:
                        // Optimized
                        break;
-               case Context::BLEND_SOURCE:
+               case BLEND_SOURCE:
                        blendFactor.x = oC.x;
                        blendFactor.y = oC.y;
                        blendFactor.z = oC.z;
                        break;
-               case Context::BLEND_INVSOURCE:
+               case BLEND_INVSOURCE:
                        blendFactor.x = Float4(1.0f) - oC.x;
                        blendFactor.y = Float4(1.0f) - oC.y;
                        blendFactor.z = Float4(1.0f) - oC.z;
                        break;
-               case Context::BLEND_DEST:
+               case BLEND_DEST:
                        blendFactor.x = pixel.x;
                        blendFactor.y = pixel.y;
                        blendFactor.z = pixel.z;
                        break;
-               case Context::BLEND_INVDEST:
+               case BLEND_INVDEST:
                        blendFactor.x = Float4(1.0f) - pixel.x;
                        blendFactor.y = Float4(1.0f) - pixel.y;
                        blendFactor.z = Float4(1.0f) - pixel.z;
                        break;
-               case Context::BLEND_SOURCEALPHA:
+               case BLEND_SOURCEALPHA:
                        blendFactor.x = oC.w;
                        blendFactor.y = oC.w;
                        blendFactor.z = oC.w;
                        break;
-               case Context::BLEND_INVSOURCEALPHA:
+               case BLEND_INVSOURCEALPHA:
                        blendFactor.x = Float4(1.0f) - oC.w;
                        blendFactor.y = Float4(1.0f) - oC.w;
                        blendFactor.z = Float4(1.0f) - oC.w;
                        break;
-               case Context::BLEND_DESTALPHA:
+               case BLEND_DESTALPHA:
                        blendFactor.x = pixel.w;
                        blendFactor.y = pixel.w;
                        blendFactor.z = pixel.w;
                        break;
-               case Context::BLEND_INVDESTALPHA:
+               case BLEND_INVDESTALPHA:
                        blendFactor.x = Float4(1.0f) - pixel.w;
                        blendFactor.y = Float4(1.0f) - pixel.w;
                        blendFactor.z = Float4(1.0f) - pixel.w;
                        break;
-               case Context::BLEND_SRCALPHASAT:
+               case BLEND_SRCALPHASAT:
                        blendFactor.x = Float4(1.0f) - pixel.w;
                        blendFactor.x = Min(blendFactor.x, oC.w);
                        blendFactor.y = blendFactor.x;
                        blendFactor.z = blendFactor.x;
                        break;
-               case Context::BLEND_CONSTANT:
+               case BLEND_CONSTANT:
                        blendFactor.x = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[0]));
                        blendFactor.y = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[1]));
                        blendFactor.z = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[2]));
                        break;
-               case Context::BLEND_INVCONSTANT:
+               case BLEND_INVCONSTANT:
                        blendFactor.x = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
                        blendFactor.y = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
                        blendFactor.z = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
@@ -3055,47 +3335,47 @@ namespace sw
                }
        }
 
-       void PixelRoutine::blendFactorAlpha(Registers &r, const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, Context::BlendFactor blendFactorAlphaActive) 
+       void PixelRoutine::blendFactorAlpha(Registers &r, const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorAlphaActive) 
        {
                switch(blendFactorAlphaActive)
                {
-               case Context::BLEND_ZERO:
+               case BLEND_ZERO:
                        // Optimized
                        break;
-               case Context::BLEND_ONE:
+               case BLEND_ONE:
                        // Optimized
                        break;
-               case Context::BLEND_SOURCE:
+               case BLEND_SOURCE:
                        blendFactor.w = oC.w;
                        break;
-               case Context::BLEND_INVSOURCE:
+               case BLEND_INVSOURCE:
                        blendFactor.w = Float4(1.0f) - oC.w;
                        break;
-               case Context::BLEND_DEST:
+               case BLEND_DEST:
                        blendFactor.w = pixel.w;
                        break;
-               case Context::BLEND_INVDEST:
+               case BLEND_INVDEST:
                        blendFactor.w = Float4(1.0f) - pixel.w;
                        break;
-               case Context::BLEND_SOURCEALPHA:
+               case BLEND_SOURCEALPHA:
                        blendFactor.w = oC.w;
                        break;
-               case Context::BLEND_INVSOURCEALPHA:
+               case BLEND_INVSOURCEALPHA:
                        blendFactor.w = Float4(1.0f) - oC.w;
                        break;
-               case Context::BLEND_DESTALPHA:
+               case BLEND_DESTALPHA:
                        blendFactor.w = pixel.w;
                        break;
-               case Context::BLEND_INVDESTALPHA:
+               case BLEND_INVDESTALPHA:
                        blendFactor.w = Float4(1.0f) - pixel.w;
                        break;
-               case Context::BLEND_SRCALPHASAT:
+               case BLEND_SRCALPHASAT:
                        blendFactor.w = Float4(1.0f);
                        break;
-               case Context::BLEND_CONSTANT:
+               case BLEND_CONSTANT:
                        blendFactor.w = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[3]));
                        break;
-               case Context::BLEND_INVCONSTANT:
+               case BLEND_INVCONSTANT:
                        blendFactor.w = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
                        break;
                default:
@@ -3113,115 +3393,12 @@ namespace sw
                Pointer<Byte> buffer;
                Vector4f pixel;
 
-               Vector4i color;
+               Vector4s color;
                Short4 c01;
                Short4 c23;
 
-               // Read pixel
                switch(state.targetFormat[index])
                {
-               case FORMAT_A8R8G8B8:
-                       buffer = cBuffer + 4 * x;
-                       c01 = *Pointer<Short4>(buffer);
-                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
-                       c23 = *Pointer<Short4>(buffer);
-                       color.z = c01;
-                       color.y = c01;
-                       color.z = UnpackLow(As<Byte8>(color.z), As<Byte8>(c23));
-                       color.y = UnpackHigh(As<Byte8>(color.y), As<Byte8>(c23));
-                       color.x = color.z;
-                       color.z = UnpackLow(As<Byte8>(color.z), As<Byte8>(color.y));
-                       color.x = UnpackHigh(As<Byte8>(color.x), As<Byte8>(color.y));
-                       color.y = color.z;
-                       color.w = color.x;
-                       color.x = UnpackLow(As<Byte8>(color.x), As<Byte8>(color.x));
-                       color.y = UnpackHigh(As<Byte8>(color.y), As<Byte8>(color.y));
-                       color.z = UnpackLow(As<Byte8>(color.z), As<Byte8>(color.z));
-                       color.w = UnpackHigh(As<Byte8>(color.w), As<Byte8>(color.w));
-
-                       pixel.x = convertUnsigned16(As<UShort4>(color.x));
-                       pixel.y = convertUnsigned16(As<UShort4>(color.y));
-                       pixel.z = convertUnsigned16(As<UShort4>(color.z));
-                       pixel.w = convertUnsigned16(As<UShort4>(color.w));
-                       break;
-               case FORMAT_X8R8G8B8:
-                       buffer = cBuffer + 4 * x;
-                       c01 = *Pointer<Short4>(buffer);
-                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
-                       c23 = *Pointer<Short4>(buffer);
-                       color.z = c01;
-                       color.y = c01;
-                       color.z = UnpackLow(As<Byte8>(color.z), As<Byte8>(c23));
-                       color.y = UnpackHigh(As<Byte8>(color.y), As<Byte8>(c23));
-                       color.x = color.z;
-                       color.z = UnpackLow(As<Byte8>(color.z), As<Byte8>(color.y));
-                       color.x = UnpackHigh(As<Byte8>(color.x), As<Byte8>(color.y));
-                       color.y = color.z;
-                       color.x = UnpackLow(As<Byte8>(color.x), As<Byte8>(color.x));
-                       color.y = UnpackHigh(As<Byte8>(color.y), As<Byte8>(color.y));
-                       color.z = UnpackLow(As<Byte8>(color.z), As<Byte8>(color.z));
-
-                       pixel.x = convertUnsigned16(As<UShort4>(color.x));
-                       pixel.y = convertUnsigned16(As<UShort4>(color.y));
-                       pixel.z = convertUnsigned16(As<UShort4>(color.z));
-                       pixel.w = Float4(1.0f);
-                       break;
-               case FORMAT_A8:
-                       buffer = cBuffer + 1 * x;
-                       c01 = Insert(c01, *Pointer<Short>(buffer), 0);
-                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
-                       c01 = Insert(c01, *Pointer<Short>(buffer), 1);
-                       pixel.w = convertUnsigned16(As<UShort4>(UnpackLow(As<Byte8>(c01), As<Byte8>(c01))));
-                       pixel.x = Float4(0.0f);
-                       pixel.y = Float4(0.0f);
-                       pixel.z = Float4(0.0f);
-                       break;
-               case FORMAT_A8G8R8B8Q:
-                       UNIMPLEMENTED();
-               //      UnpackLow(pixel.z, qword_ptr [cBuffer+8*x+0]);
-               //      UnpackHigh(pixel.x, qword_ptr [cBuffer+8*x+0]);
-               //      UnpackLow(pixel.y, qword_ptr [cBuffer+8*x+8]);
-               //      UnpackHigh(pixel.w, qword_ptr [cBuffer+8*x+8]);
-                       break;
-               case FORMAT_X8G8R8B8Q:
-                       UNIMPLEMENTED();
-               //      UnpackLow(pixel.z, qword_ptr [cBuffer+8*x+0]);
-               //      UnpackHigh(pixel.x, qword_ptr [cBuffer+8*x+0]);
-               //      UnpackLow(pixel.y, qword_ptr [cBuffer+8*x+8]);
-               //      pixel.w = Short4(0xFFFFu);
-                       break;
-               case FORMAT_A16B16G16R16:
-                       buffer  = cBuffer;
-                       color.x = *Pointer<Short4>(buffer + 8 * x);
-                       color.y = *Pointer<Short4>(buffer + 8 * x + 8);
-                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
-                       color.z = *Pointer<Short4>(buffer + 8 * x);
-                       color.w = *Pointer<Short4>(buffer + 8 * x + 8);
-                       
-                       transpose4x4(color.x, color.y, color.z, color.w);
-
-                       pixel.x = convertUnsigned16(As<UShort4>(color.x));
-                       pixel.y = convertUnsigned16(As<UShort4>(color.y));
-                       pixel.z = convertUnsigned16(As<UShort4>(color.z));
-                       pixel.w = convertUnsigned16(As<UShort4>(color.w));
-                       break;
-               case FORMAT_G16R16:
-                       buffer = cBuffer;
-                       color.x = *Pointer<Short4>(buffer  + 4 * x);
-                       buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
-                       color.y = *Pointer<Short4>(buffer  + 4 * x);
-                       color.z = color.x;
-                       color.x = As<Short4>(UnpackLow(color.x, color.y));
-                       color.z = As<Short4>(UnpackHigh(color.z, color.y));
-                       color.y = color.z;
-                       color.x = As<Short4>(UnpackLow(color.x, color.z));
-                       color.y = As<Short4>(UnpackHigh(color.y, color.z));
-                       
-                       pixel.x = convertUnsigned16(As<UShort4>(color.x));
-                       pixel.y = convertUnsigned16(As<UShort4>(color.y));
-                       pixel.z = Float4(1.0f);
-                       pixel.w = Float4(1.0f);
-                       break;
                case FORMAT_R32F:
                        buffer = cBuffer;
                        // FIXME: movlps
@@ -3271,17 +3448,17 @@ namespace sw
                Vector4f sourceFactor;
                Vector4f destFactor;
 
-               blendFactor(r, sourceFactor, oC, pixel, (Context::BlendFactor)state.sourceBlendFactor);
-               blendFactor(r, destFactor, oC, pixel, (Context::BlendFactor)state.destBlendFactor);
+               blendFactor(r, sourceFactor, oC, pixel, state.sourceBlendFactor);
+               blendFactor(r, destFactor, oC, pixel, state.destBlendFactor);
 
-               if(state.sourceBlendFactor != Context::BLEND_ONE && state.sourceBlendFactor != Context::BLEND_ZERO)
+               if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
                {
                        oC.x *= sourceFactor.x;
                        oC.y *= sourceFactor.y;
                        oC.z *= sourceFactor.z;
                }
        
-               if(state.destBlendFactor != Context::BLEND_ONE && state.destBlendFactor != Context::BLEND_ZERO)
+               if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
                {
                        pixel.x *= destFactor.x;
                        pixel.y *= destFactor.y;
@@ -3290,40 +3467,40 @@ namespace sw
 
                switch(state.blendOperation)
                {
-               case Context::BLENDOP_ADD:
+               case BLENDOP_ADD:
                        oC.x += pixel.x;
                        oC.y += pixel.y;
                        oC.z += pixel.z;
                        break;
-               case Context::BLENDOP_SUB:
+               case BLENDOP_SUB:
                        oC.x -= pixel.x;
                        oC.y -= pixel.y;
                        oC.z -= pixel.z;
                        break;
-               case Context::BLENDOP_INVSUB:
+               case BLENDOP_INVSUB:
                        oC.x = pixel.x - oC.x;
                        oC.y = pixel.y - oC.y;
                        oC.z = pixel.z - oC.z;
                        break;
-               case Context::BLENDOP_MIN:
+               case BLENDOP_MIN:
                        oC.x = Min(oC.x, pixel.x);
                        oC.y = Min(oC.y, pixel.y);
                        oC.z = Min(oC.z, pixel.z);
                        break;
-               case Context::BLENDOP_MAX:
+               case BLENDOP_MAX:
                        oC.x = Max(oC.x, pixel.x);
                        oC.y = Max(oC.y, pixel.y);
                        oC.z = Max(oC.z, pixel.z);
                        break;
-               case Context::BLENDOP_SOURCE:
+               case BLENDOP_SOURCE:
                        // No operation
                        break;
-               case Context::BLENDOP_DEST:
+               case BLENDOP_DEST:
                        oC.x = pixel.x;
                        oC.y = pixel.y;
                        oC.z = pixel.z;
                        break;
-               case Context::BLENDOP_NULL:
+               case BLENDOP_NULL:
                        oC.x = Float4(0.0f);
                        oC.y = Float4(0.0f);
                        oC.z = Float4(0.0f);
@@ -3332,44 +3509,44 @@ namespace sw
                        ASSERT(false);
                }
 
-               blendFactorAlpha(r, sourceFactor, oC, pixel, (Context::BlendFactor)state.sourceBlendFactorAlpha);
-               blendFactorAlpha(r, destFactor, oC, pixel, (Context::BlendFactor)state.destBlendFactorAlpha);
+               blendFactorAlpha(r, sourceFactor, oC, pixel, state.sourceBlendFactorAlpha);
+               blendFactorAlpha(r, destFactor, oC, pixel, state.destBlendFactorAlpha);
 
-               if(state.sourceBlendFactorAlpha != Context::BLEND_ONE && state.sourceBlendFactorAlpha != Context::BLEND_ZERO)
+               if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
                {
                        oC.w *= sourceFactor.w;
                }
        
-               if(state.destBlendFactorAlpha != Context::BLEND_ONE && state.destBlendFactorAlpha != Context::BLEND_ZERO)
+               if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
                {
                        pixel.w *= destFactor.w;
                }
 
                switch(state.blendOperationAlpha)
                {
-               case Context::BLENDOP_ADD:
+               case BLENDOP_ADD:
                        oC.w += pixel.w;
                        break;
-               case Context::BLENDOP_SUB:
+               case BLENDOP_SUB:
                        oC.w -= pixel.w;
                        break;
-               case Context::BLENDOP_INVSUB:
+               case BLENDOP_INVSUB:
                        pixel.w -= oC.w;
                        oC.w = pixel.w;
                        break;
-               case Context::BLENDOP_MIN:      
+               case BLENDOP_MIN:       
                        oC.w = Min(oC.w, pixel.w);
                        break;
-               case Context::BLENDOP_MAX:      
+               case BLENDOP_MAX:       
                        oC.w = Max(oC.w, pixel.w);
                        break;
-               case Context::BLENDOP_SOURCE:
+               case BLENDOP_SOURCE:
                        // No operation
                        break;
-               case Context::BLENDOP_DEST:
+               case BLENDOP_DEST:
                        oC.w = pixel.w;
                        break;
-               case Context::BLENDOP_NULL:
+               case BLENDOP_NULL:
                        oC.w = Float4(0.0f);
                        break;
                default:
@@ -3379,23 +3556,8 @@ namespace sw
 
        void PixelRoutine::writeColor(Registers &r, int index, Pointer<Byte> &cBuffer, Int &x, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask)
        {
-               if(!state.colorWriteActive(index))
-               {
-                       return;
-               }
-
-               Vector4i color;
-
                switch(state.targetFormat[index])
                {
-               case FORMAT_X8R8G8B8:
-               case FORMAT_A8R8G8B8:
-               case FORMAT_A8:
-               case FORMAT_G16R16:
-               case FORMAT_A16B16G16R16:
-                       convertFixed16(color, oC, true);
-                       writeColor(r, index, cBuffer, x, color, sMask, zMask, cMask);
-                       return;
                case FORMAT_R32F:
                        break;
                case FORMAT_G32R32F:
@@ -3582,9 +3744,9 @@ namespace sw
        void PixelRoutine::ps_1_x(Registers &r, Int cMask[4])
        {
                int pad = 0;        // Count number of texm3x3pad instructions
-               Vector4i dPairing;   // Destination for first pairing instruction
+               Vector4s dPairing;   // Destination for first pairing instruction
 
-               for(int i = 0; i < shader->getLength(); i++)
+               for(size_t i = 0; i < shader->getLength(); i++)
                {
                        const Shader::Instruction *instruction = shader->getInstruction(i);
                        Shader::Opcode opcode = instruction->opcode;
@@ -3607,14 +3769,14 @@ namespace sw
                        bool pairing = i + 1 < shader->getLength() && shader->getInstruction(i + 1)->coissue;   // First instruction of pair
                        bool coissue = instruction->coissue;                                                              // Second instruction of pair
 
-                       Vector4i d;
-                       Vector4i s0;
-                       Vector4i s1;
-                       Vector4i s2;
+                       Vector4s d;
+                       Vector4s s0;
+                       Vector4s s1;
+                       Vector4s s2;
 
-                       if(src0.type != Shader::PARAMETER_VOID) s0 = regi(r, src0);
-                       if(src1.type != Shader::PARAMETER_VOID) s1 = regi(r, src1);
-                       if(src2.type != Shader::PARAMETER_VOID) s2 = regi(r, src2);
+                       if(src0.type != Shader::PARAMETER_VOID) s0 = fetchRegisterS(r, src0);
+                       if(src1.type != Shader::PARAMETER_VOID) s1 = fetchRegisterS(r, src1);
+                       if(src2.type != Shader::PARAMETER_VOID) s2 = fetchRegisterS(r, src2);
 
                        Float4 u = version < 0x0104 ? r.vf[2 + dst.index].x : r.vf[2 + src0.index].x;
                        Float4 v = version < 0x0104 ? r.vf[2 + dst.index].y : r.vf[2 + src0.index].y;
@@ -3670,7 +3832,7 @@ namespace sw
                                        }
                                        else
                                        {
-                                               TEXKILL(cMask, r.ri[dst.index]);
+                                               TEXKILL(cMask, r.rs[dst.index]);
                                        }
                                }
                                else ASSERT(false);
@@ -3786,7 +3948,7 @@ namespace sw
                bool out[4][4] = {false};
 
                // Create all call site return blocks up front
-               for(int i = 0; i < shader->getLength(); i++)
+               for(size_t i = 0; i < shader->getLength(); i++)
                {
                        const Shader::Instruction *instruction = shader->getInstruction(i);
                        Shader::Opcode opcode = instruction->opcode;
@@ -3800,7 +3962,7 @@ namespace sw
                        }
                }
                
-               for(int i = 0; i < shader->getLength(); i++)
+               for(size_t i = 0; i < shader->getLength(); i++)
                {
                        const Shader::Instruction *instruction = shader->getInstruction(i);
                        Shader::Opcode opcode = instruction->opcode;
@@ -3843,10 +4005,10 @@ namespace sw
                                }
                        }
 
-                       if(src0.type != Shader::PARAMETER_VOID) s0 = reg(r, src0);
-                       if(src1.type != Shader::PARAMETER_VOID) s1 = reg(r, src1);
-                       if(src2.type != Shader::PARAMETER_VOID) s2 = reg(r, src2);
-                       if(src3.type != Shader::PARAMETER_VOID) s3 = reg(r, src3);
+                       if(src0.type != Shader::PARAMETER_VOID) s0 = fetchRegisterF(r, src0);
+                       if(src1.type != Shader::PARAMETER_VOID) s1 = fetchRegisterF(r, src1);
+                       if(src2.type != Shader::PARAMETER_VOID) s2 = fetchRegisterF(r, src2);
+                       if(src3.type != Shader::PARAMETER_VOID) s3 = fetchRegisterF(r, src3);
 
                        switch(opcode)
                        {
@@ -3876,6 +4038,8 @@ namespace sw
                        case Shader::OPCODE_FRC:                frc(d, s0);                                                                             break;
                        case Shader::OPCODE_TRUNC:      trunc(d, s0);                                   break;
                        case Shader::OPCODE_FLOOR:      floor(d, s0);                                   break;
+                       case Shader::OPCODE_ROUND:              round(d, s0);                                   break;
+                       case Shader::OPCODE_ROUNDEVEN:  roundEven(d, s0);                               break;
                        case Shader::OPCODE_CEIL:       ceil(d, s0);                                    break;
                        case Shader::OPCODE_EXP2X:              exp2x(d, s0, pp);                                                               break;
                        case Shader::OPCODE_EXP2:               exp2(d, s0, pp);                                                                break;
@@ -3929,6 +4093,12 @@ namespace sw
                        case Shader::OPCODE_ASIN:               asin(d, s0, pp);                                                                break;
                        case Shader::OPCODE_ATAN:               atan(d, s0, pp);                                                                break;
                        case Shader::OPCODE_ATAN2:              atan2(d, s0, s1, pp);                                                   break;
+                       case Shader::OPCODE_COSH:               cosh(d, s0, pp);                                                                break;
+                       case Shader::OPCODE_SINH:               sinh(d, s0, pp);                                                                break;
+                       case Shader::OPCODE_TANH:               tanh(d, s0, pp);                                                                break;
+                       case Shader::OPCODE_ACOSH:              acosh(d, s0, pp);                                                               break;
+                       case Shader::OPCODE_ASINH:              asinh(d, s0, pp);                                                               break;
+                       case Shader::OPCODE_ATANH:              atanh(d, s0, pp);                                                               break;
                        case Shader::OPCODE_M4X4:               M4X4(r, d, s0, src1);                                                   break;
                        case Shader::OPCODE_M4X3:               M4X3(r, d, s0, src1);                                                   break;
                        case Shader::OPCODE_M3X4:               M3X4(r, d, s0, src1);                                                   break;
@@ -4140,7 +4310,7 @@ namespace sw
 
                for(int i = 0; i < 4; i++)
                {
-                       if((Format)state.targetFormat[i] != FORMAT_NULL)
+                       if(state.targetFormat[i] != FORMAT_NULL)
                        {
                                if(!out[i][0]) r.oC[i].x = Float4(0.0f);
                                if(!out[i][1]) r.oC[i].y = Float4(0.0f);
@@ -4155,12 +4325,12 @@ namespace sw
                return RoundShort4(cf * Float4(0x1000));
        }
 
-       void PixelRoutine::convertFixed12(Vector4i &ci, Vector4f &cf)
+       void PixelRoutine::convertFixed12(Vector4s &cs, Vector4f &cf)
        {
                // Vector overload: converts the x, y, z and w components of cf one at a
                // time through the single-component convertFixed12 overload and stores
                // each result in the matching component of the destination vector.
                // This change only renames the destination (Vector4i ci -> Vector4s cs);
                // the per-component conversion calls are otherwise unchanged.
-               ci.x = convertFixed12(cf.x);
-               ci.y = convertFixed12(cf.y);
-               ci.z = convertFixed12(cf.z);
-               ci.w = convertFixed12(cf.w);
+               cs.x = convertFixed12(cf.x);
+               cs.y = convertFixed12(cf.y);
+               cs.z = convertFixed12(cf.z);
+               cs.w = convertFixed12(cf.w);
        }
 
        UShort4 PixelRoutine::convertFixed16(Float4 &cf, bool saturate)
@@ -4168,33 +4338,33 @@ namespace sw
                return UShort4(cf * Float4(0xFFFF), saturate);
        }
 
-       void PixelRoutine::convertFixed16(Vector4i &ci, Vector4f &cf, bool saturate)
+       void PixelRoutine::convertFixed16(Vector4s &cs, Vector4f &cf, bool saturate)
        {
                // Vector overload: applies the single-component convertFixed16 overload
                // to each of the x, y, z and w components of cf, forwarding the
                // 'saturate' flag unchanged, and writes the results to the destination.
                // This change only renames the destination (Vector4i ci -> Vector4s cs).
-               ci.x = convertFixed16(cf.x, saturate);
-               ci.y = convertFixed16(cf.y, saturate);
-               ci.z = convertFixed16(cf.z, saturate);
-               ci.w = convertFixed16(cf.w, saturate);
+               cs.x = convertFixed16(cf.x, saturate);
+               cs.y = convertFixed16(cf.y, saturate);
+               cs.z = convertFixed16(cf.z, saturate);
+               cs.w = convertFixed16(cf.w, saturate);
        }
 
-       Float4 PixelRoutine::convertSigned12(Short4 &ci)
+       Float4 PixelRoutine::convertSigned12(Short4 &cs)
        {
                // Converts four 16-bit lanes to floating point and scales them by
                // 1 / 0x0FFE (1 / 4094).
                // NOTE(review): the float-to-fixed direction above scales by 0x1000,
                // so the two conversions are not exact inverses; presumably this is
                // intentional - confirm before changing either constant.
-               return Float4(ci) * Float4(1.0f / 0x0FFE);
+               return Float4(cs) * Float4(1.0f / 0x0FFE);
        }
 
-       void PixelRoutine::convertSigned12(Vector4f &cf, Vector4i &ci)
+       void PixelRoutine::convertSigned12(Vector4f &cf, Vector4s &cs)
        {
                // Vector overload: converts each component of the fixed-point source
                // through the single-component convertSigned12 overload and stores the
                // floating-point result in the matching component of cf.
                // Note the argument order: destination (cf) first, source second.
                // This change only renames the source (Vector4i ci -> Vector4s cs).
-               cf.x = convertSigned12(ci.x);
-               cf.y = convertSigned12(ci.y);
-               cf.z = convertSigned12(ci.z);
-               cf.w = convertSigned12(ci.w);
+               cf.x = convertSigned12(cs.x);
+               cf.y = convertSigned12(cs.y);
+               cf.z = convertSigned12(cs.z);
+               cf.w = convertSigned12(cs.w);
        }
 
-       Float4 PixelRoutine::convertUnsigned16(UShort4 ci)
+       Float4 PixelRoutine::convertUnsigned16(UShort4 cs)
        {
                // Converts four unsigned 16-bit lanes to floating point and scales them
                // by 1 / 0xFFFF, so a lane value of 0xFFFF corresponds to 1.0.
                // Unlike the neighbouring conversions, the argument is taken by value.
-               return Float4(ci) * Float4(1.0f / 0xFFFF);
+               return Float4(cs) * Float4(1.0f / 0xFFFF);
        }
 
-       void PixelRoutine::sRGBtoLinear16_16(Registers &r, Vector4i &c)
+       void PixelRoutine::sRGBtoLinear16_12_16(Registers &r, Vector4s &c)
        {
                c.x = As<UShort4>(c.x) >> 4;
                c.y = As<UShort4>(c.y) >> 4;
@@ -4203,9 +4373,9 @@ namespace sw
                sRGBtoLinear12_16(r, c);
        }
 
-       void PixelRoutine::sRGBtoLinear12_16(Registers &r, Vector4i &c)
+       void PixelRoutine::sRGBtoLinear12_16(Registers &r, Vector4s &c)
        {
-               Pointer<Byte> LUT = r.constants + OFFSET(Constants,sRGBtoLin12_16);
+               Pointer<Byte> LUT = r.constants + OFFSET(Constants,sRGBtoLinear12_16);
 
                c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
                c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
@@ -4223,7 +4393,7 @@ namespace sw
                c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
        }
 
-       void PixelRoutine::linearToSRGB16_16(Registers &r, Vector4i &c)
+       void PixelRoutine::linearToSRGB16_12_16(Registers &r, Vector4s &c)
        {
                c.x = As<UShort4>(c.x) >> 4;
                c.y = As<UShort4>(c.y) >> 4;
@@ -4232,9 +4402,9 @@ namespace sw
                linearToSRGB12_16(r, c);
        }
 
-       void PixelRoutine::linearToSRGB12_16(Registers &r, Vector4i &c)
+       void PixelRoutine::linearToSRGB12_16(Registers &r, Vector4s &c)
        {
-               Pointer<Byte> LUT = r.constants + OFFSET(Constants,linToSRGB12_16);
+               Pointer<Byte> LUT = r.constants + OFFSET(Constants,linearToSRGB12_16);
 
                c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
                c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
@@ -4268,7 +4438,7 @@ namespace sw
                return Min(Max(linear, Float4(0.0f)), Float4(1.0f));
        }
 
-       void PixelRoutine::MOV(Vector4i &dst, Vector4i &src0)
+       void PixelRoutine::MOV(Vector4s &dst, Vector4s &src0)
        {
                dst.x = src0.x;
                dst.y = src0.y;
@@ -4276,7 +4446,7 @@ namespace sw
                dst.w = src0.w;
        }
 
-       void PixelRoutine::ADD(Vector4i &dst, Vector4i &src0, Vector4i &src1)
+       void PixelRoutine::ADD(Vector4s &dst, Vector4s &src0, Vector4s &src1)
        {
                dst.x = AddSat(src0.x, src1.x);
                dst.y = AddSat(src0.y, src1.y);
@@ -4284,7 +4454,7 @@ namespace sw
                dst.w = AddSat(src0.w, src1.w);
        }
 
-       void PixelRoutine::SUB(Vector4i &dst, Vector4i &src0, Vector4i &src1)
+       void PixelRoutine::SUB(Vector4s &dst, Vector4s &src0, Vector4s &src1)
        {
                dst.x = SubSat(src0.x, src1.x);
                dst.y = SubSat(src0.y, src1.y);
@@ -4292,7 +4462,7 @@ namespace sw
                dst.w = SubSat(src0.w, src1.w);
        }
 
-       void PixelRoutine::MAD(Vector4i &dst, Vector4i &src0, Vector4i &src1, Vector4i &src2)
+       void PixelRoutine::MAD(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2)
        {
                // FIXME: Long fixed-point multiply fixup
                {dst.x = MulHigh(src0.x, src1.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, src2.x);}
@@ -4301,7 +4471,7 @@ namespace sw
                {dst.w = MulHigh(src0.w, src1.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, src2.w);}
        }
 
-       void PixelRoutine::MUL(Vector4i &dst, Vector4i &src0, Vector4i &src1)
+       void PixelRoutine::MUL(Vector4s &dst, Vector4s &src0, Vector4s &src1)
        {
                // FIXME: Long fixed-point multiply fixup
                {dst.x = MulHigh(src0.x, src1.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x);}
@@ -4310,7 +4480,7 @@ namespace sw
                {dst.w = MulHigh(src0.w, src1.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w);}
        }
 
-       void PixelRoutine::DP3(Vector4i &dst, Vector4i &src0, Vector4i &src1)
+       void PixelRoutine::DP3(Vector4s &dst, Vector4s &src0, Vector4s &src1)
        {
                Short4 t0;
                Short4 t1;
@@ -4328,7 +4498,7 @@ namespace sw
                dst.w = t0;
        }
 
-       void PixelRoutine::DP4(Vector4i &dst, Vector4i &src0, Vector4i &src1)
+       void PixelRoutine::DP4(Vector4s &dst, Vector4s &src0, Vector4s &src1)
        {
                Short4 t0;
                Short4 t1;
@@ -4348,7 +4518,7 @@ namespace sw
                dst.w = t0;
        }
 
-       void PixelRoutine::LRP(Vector4i &dst, Vector4i &src0, Vector4i &src1, Vector4i &src2)
+       void PixelRoutine::LRP(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2)
        {
                // FIXME: Long fixed-point multiply fixup
                {dst.x = SubSat(src1.x, src2.x); dst.x = MulHigh(dst.x, src0.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, src2.x);}
@@ -4357,7 +4527,7 @@ namespace sw
                {dst.w = SubSat(src1.w, src2.w); dst.w = MulHigh(dst.w, src0.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, src2.w);}
        }
 
-       void PixelRoutine::TEXCOORD(Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, int coordinate)
+       void PixelRoutine::TEXCOORD(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int coordinate)
        {
                Float4 uw;
                Float4 vw;
@@ -4399,7 +4569,7 @@ namespace sw
                dst.w = Short4(0x1000);
        }
 
-       void PixelRoutine::TEXCRD(Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, int coordinate, bool project)
+       void PixelRoutine::TEXCRD(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int coordinate, bool project)
        {
                Float4 uw = u;
                Float4 vw = v;
@@ -4448,7 +4618,7 @@ namespace sw
                }
        }
 
-       void PixelRoutine::TEXDP3(Registers &r, Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, Vector4i &src)
+       void PixelRoutine::TEXDP3(Registers &r, Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, Vector4s &src)
        {
                TEXM3X3PAD(r, u, v, s, src, 0, false);
 
@@ -4460,7 +4630,7 @@ namespace sw
                dst.w = t0;
        }
 
-       void PixelRoutine::TEXDP3TEX(Registers &r, Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4i &src0)
+       void PixelRoutine::TEXDP3TEX(Registers &r, Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0)
        {
                TEXM3X3PAD(r, u, v, s, src0, 0, false);
 
@@ -4482,7 +4652,7 @@ namespace sw
                }
        }
 
-       void PixelRoutine::TEXKILL(Int cMask[4], Vector4i &src)
+       void PixelRoutine::TEXKILL(Int cMask[4], Vector4s &src)
        {
                Short4 test = src.x | src.y | src.z;
                Int kill = SignMask(Pack(test, test)) ^ 0x0000000F;
@@ -4493,12 +4663,12 @@ namespace sw
                }
        }
 
-       void PixelRoutine::TEX(Registers &r, Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, int sampler, bool project)
+       void PixelRoutine::TEX(Registers &r, Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int sampler, bool project)
        {
                sampleTexture(r, dst, sampler, u, v, s, s, project);
        }
 
-       void PixelRoutine::TEXLD(Registers &r, Vector4i &dst, Vector4i &src, int sampler, bool project)
+       void PixelRoutine::TEXLD(Registers &r, Vector4s &dst, Vector4s &src, int sampler, bool project)
        {
                Float4 u = Float4(src.x) * Float4(1.0f / 0x0FFE);
                Float4 v = Float4(src.y) * Float4(1.0f / 0x0FFE);
@@ -4507,7 +4677,7 @@ namespace sw
                sampleTexture(r, dst, sampler, u, v, s, s, project);
        }
 
-       void PixelRoutine::TEXBEM(Registers &r, Vector4i &dst, Vector4i &src, Float4 &u, Float4 &v, Float4 &s, int stage)
+       void PixelRoutine::TEXBEM(Registers &r, Vector4s &dst, Vector4s &src, Float4 &u, Float4 &v, Float4 &s, int stage)
        {
                Float4 du = Float4(src.x) * Float4(1.0f / 0x0FFE);
                Float4 dv = Float4(src.y) * Float4(1.0f / 0x0FFE);
@@ -4528,7 +4698,7 @@ namespace sw
                sampleTexture(r, dst, stage, u_, v_, s, s);
        }
 
-       void PixelRoutine::TEXBEML(Registers &r, Vector4i &dst, Vector4i &src, Float4 &u, Float4 &v, Float4 &s, int stage)
+       void PixelRoutine::TEXBEML(Registers &r, Vector4s &dst, Vector4s &src, Float4 &u, Float4 &v, Float4 &s, int stage)
        {
                Float4 du = Float4(src.x) * Float4(1.0f / 0x0FFE);
                Float4 dv = Float4(src.y) * Float4(1.0f / 0x0FFE);
@@ -4562,7 +4732,7 @@ namespace sw
                dst.z = MulHigh(dst.z, L); dst.z = dst.z << 4;
        }
 
-       void PixelRoutine::TEXREG2AR(Registers &r, Vector4i &dst, Vector4i &src0, int stage)
+       void PixelRoutine::TEXREG2AR(Registers &r, Vector4s &dst, Vector4s &src0, int stage)
        {
                Float4 u = Float4(src0.w) * Float4(1.0f / 0x0FFE);
                Float4 v = Float4(src0.x) * Float4(1.0f / 0x0FFE);
@@ -4571,7 +4741,7 @@ namespace sw
                sampleTexture(r, dst, stage, u, v, s, s);
        }
 
-       void PixelRoutine::TEXREG2GB(Registers &r, Vector4i &dst, Vector4i &src0, int stage)
+       void PixelRoutine::TEXREG2GB(Registers &r, Vector4s &dst, Vector4s &src0, int stage)
        {
                Float4 u = Float4(src0.y) * Float4(1.0f / 0x0FFE);
                Float4 v = Float4(src0.z) * Float4(1.0f / 0x0FFE);
@@ -4580,7 +4750,7 @@ namespace sw
                sampleTexture(r, dst, stage, u, v, s, s);
        }
 
-       void PixelRoutine::TEXREG2RGB(Registers &r, Vector4i &dst, Vector4i &src0, int stage)
+       void PixelRoutine::TEXREG2RGB(Registers &r, Vector4s &dst, Vector4s &src0, int stage)
        {
                Float4 u = Float4(src0.x) * Float4(1.0f / 0x0FFE);
                Float4 v = Float4(src0.y) * Float4(1.0f / 0x0FFE);
@@ -4589,7 +4759,7 @@ namespace sw
                sampleTexture(r, dst, stage, u, v, s, s);
        }
 
-       void PixelRoutine::TEXM3X2DEPTH(Registers &r, Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, Vector4i &src, bool signedScaling)
+       void PixelRoutine::TEXM3X2DEPTH(Registers &r, Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, Vector4s &src, bool signedScaling)
        {
                TEXM3X2PAD(r, u, v, s, src, 1, signedScaling);
 
@@ -4599,12 +4769,12 @@ namespace sw
                r.oDepth = r.u_;
        }
 
-       void PixelRoutine::TEXM3X2PAD(Registers &r, Float4 &u, Float4 &v, Float4 &s, Vector4i &src0, int component, bool signedScaling)
+       void PixelRoutine::TEXM3X2PAD(Registers &r, Float4 &u, Float4 &v, Float4 &s, Vector4s &src0, int component, bool signedScaling)
        {
                TEXM3X3PAD(r, u, v, s, src0, component, signedScaling);
        }
 
-       void PixelRoutine::TEXM3X2TEX(Registers &r, Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4i &src0, bool signedScaling)
+       void PixelRoutine::TEXM3X2TEX(Registers &r, Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0, bool signedScaling)
        {
                TEXM3X2PAD(r, u, v, s, src0, 1, signedScaling);
 
@@ -4613,7 +4783,7 @@ namespace sw
                sampleTexture(r, dst, stage, r.u_, r.v_, r.w_, r.w_);
        }
 
-       void PixelRoutine::TEXM3X3(Registers &r, Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, Vector4i &src0, bool signedScaling)
+       void PixelRoutine::TEXM3X3(Registers &r, Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, Vector4s &src0, bool signedScaling)
        {
                TEXM3X3PAD(r, u, v, s, src0, 2, signedScaling);
 
@@ -4623,7 +4793,7 @@ namespace sw
                dst.w = Short4(0x1000);
        }
 
-       void PixelRoutine::TEXM3X3PAD(Registers &r, Float4 &u, Float4 &v, Float4 &s, Vector4i &src0, int component, bool signedScaling)
+       void PixelRoutine::TEXM3X3PAD(Registers &r, Float4 &u, Float4 &v, Float4 &s, Vector4s &src0, int component, bool signedScaling)
        {
                if(component == 0 || previousScaling != signedScaling)   // FIXME: Other source modifiers?
                {
@@ -4647,7 +4817,7 @@ namespace sw
                }
        }
 
-       void PixelRoutine::TEXM3X3SPEC(Registers &r, Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4i &src0, Vector4i &src1)
+       void PixelRoutine::TEXM3X3SPEC(Registers &r, Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0, Vector4s &src1)
        {
                TEXM3X3PAD(r, u, v, s, src0, 2, false);
 
@@ -4684,14 +4854,14 @@ namespace sw
                sampleTexture(r, dst, stage,  u__, v__, w__, w__);
        }
 
-       void PixelRoutine::TEXM3X3TEX(Registers &r, Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4i &src0, bool signedScaling)
+       void PixelRoutine::TEXM3X3TEX(Registers &r, Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0, bool signedScaling)
        {
                TEXM3X3PAD(r, u, v, s, src0, 2, signedScaling);
 
                sampleTexture(r, dst, stage, r.u_, r.v_, r.w_, r.w_);
        }
 
-       void PixelRoutine::TEXM3X3VSPEC(Registers &r, Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4i &src0)
+       void PixelRoutine::TEXM3X3VSPEC(Registers &r, Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0)
        {
                TEXM3X3PAD(r, u, v, s, src0, 2, false);
 
@@ -4730,8 +4900,8 @@ namespace sw
 
        void PixelRoutine::TEXDEPTH(Registers &r)
        {
-               r.u_ = Float4(r.ri[5].x);
-               r.v_ = Float4(r.ri[5].y);
+               r.u_ = Float4(r.rs[5].x);
+               r.v_ = Float4(r.rs[5].y);
 
                // z / w
                r.u_ *= Rcp_pp(r.v_);   // FIXME: Set result to 1.0 when division by zero
@@ -4739,7 +4909,7 @@ namespace sw
                r.oDepth = r.u_;
        }
 
-       void PixelRoutine::CND(Vector4i &dst, Vector4i &src0, Vector4i &src1, Vector4i &src2)
+       void PixelRoutine::CND(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2)
        {
                {Short4 t0; t0 = src0.x; t0 = CmpGT(t0, Short4(0x0800, 0x0800, 0x0800, 0x0800)); Short4 t1; t1 = src1.x; t1 = t1 & t0; t0 = ~t0 & src2.x; t0 = t0 | t1; dst.x = t0;};
                {Short4 t0; t0 = src0.y; t0 = CmpGT(t0, Short4(0x0800, 0x0800, 0x0800, 0x0800)); Short4 t1; t1 = src1.y; t1 = t1 & t0; t0 = ~t0 & src2.y; t0 = t0 | t1; dst.y = t0;};
@@ -4747,7 +4917,7 @@ namespace sw
                {Short4 t0; t0 = src0.w; t0 = CmpGT(t0, Short4(0x0800, 0x0800, 0x0800, 0x0800)); Short4 t1; t1 = src1.w; t1 = t1 & t0; t0 = ~t0 & src2.w; t0 = t0 | t1; dst.w = t0;};
        }
 
-       void PixelRoutine::CMP(Vector4i &dst, Vector4i &src0, Vector4i &src1, Vector4i &src2)
+       void PixelRoutine::CMP(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2)
        {
                {Short4 t0 = CmpGT(Short4(0x0000, 0x0000, 0x0000, 0x0000), src0.x); Short4 t1; t1 = src2.x; t1 &= t0; t0 = ~t0 & src1.x; t0 |= t1; dst.x = t0;};
                {Short4 t0 = CmpGT(Short4(0x0000, 0x0000, 0x0000, 0x0000), src0.y); Short4 t1; t1 = src2.y; t1 &= t0; t0 = ~t0 & src1.y; t0 |= t1; dst.y = t0;};
@@ -4755,7 +4925,7 @@ namespace sw
                {Short4 t0 = CmpGT(Short4(0x0000, 0x0000, 0x0000, 0x0000), src0.w); Short4 t1; t1 = src2.w; t1 &= t0; t0 = ~t0 & src1.w; t0 |= t1; dst.w = t0;};
        }
 
-       void PixelRoutine::BEM(Registers &r, Vector4i &dst, Vector4i &src0, Vector4i &src1, int stage)
+       void PixelRoutine::BEM(Registers &r, Vector4s &dst, Vector4s &src0, Vector4s &src1, int stage)
        {
                Short4 t0;
                Short4 t1;
@@ -4777,8 +4947,8 @@ namespace sw
 
        void PixelRoutine::M3X2(Registers &r, Vector4f &dst, Vector4f &src0, const Src &src1)
        {
-               Vector4f row0 = reg(r, src1, 0);
-               Vector4f row1 = reg(r, src1, 1);
+               Vector4f row0 = fetchRegisterF(r, src1, 0);
+               Vector4f row1 = fetchRegisterF(r, src1, 1);
 
                dst.x = dot3(src0, row0);
                dst.y = dot3(src0, row1);
@@ -4786,9 +4956,9 @@ namespace sw
 
        void PixelRoutine::M3X3(Registers &r, Vector4f &dst, Vector4f &src0, const Src &src1)
        {
-               Vector4f row0 = reg(r, src1, 0);
-               Vector4f row1 = reg(r, src1, 1);
-               Vector4f row2 = reg(r, src1, 2);
+               Vector4f row0 = fetchRegisterF(r, src1, 0);
+               Vector4f row1 = fetchRegisterF(r, src1, 1);
+               Vector4f row2 = fetchRegisterF(r, src1, 2);
 
                dst.x = dot3(src0, row0);
                dst.y = dot3(src0, row1);
@@ -4797,10 +4967,10 @@ namespace sw
 
        void PixelRoutine::M3X4(Registers &r, Vector4f &dst, Vector4f &src0, const Src &src1)
        {
-               Vector4f row0 = reg(r, src1, 0);
-               Vector4f row1 = reg(r, src1, 1);
-               Vector4f row2 = reg(r, src1, 2);
-               Vector4f row3 = reg(r, src1, 3);
+               Vector4f row0 = fetchRegisterF(r, src1, 0);
+               Vector4f row1 = fetchRegisterF(r, src1, 1);
+               Vector4f row2 = fetchRegisterF(r, src1, 2);
+               Vector4f row3 = fetchRegisterF(r, src1, 3);
 
                dst.x = dot3(src0, row0);
                dst.y = dot3(src0, row1);
@@ -4810,9 +4980,9 @@ namespace sw
 
        void PixelRoutine::M4X3(Registers &r, Vector4f &dst, Vector4f &src0, const Src &src1)
        {
-               Vector4f row0 = reg(r, src1, 0);
-               Vector4f row1 = reg(r, src1, 1);
-               Vector4f row2 = reg(r, src1, 2);
+               Vector4f row0 = fetchRegisterF(r, src1, 0);
+               Vector4f row1 = fetchRegisterF(r, src1, 1);
+               Vector4f row2 = fetchRegisterF(r, src1, 2);
 
                dst.x = dot4(src0, row0);
                dst.y = dot4(src0, row1);
@@ -4821,10 +4991,10 @@ namespace sw
 
        void PixelRoutine::M4X4(Registers &r, Vector4f &dst, Vector4f &src0, const Src &src1)
        {
-               Vector4f row0 = reg(r, src1, 0);
-               Vector4f row1 = reg(r, src1, 1);
-               Vector4f row2 = reg(r, src1, 2);
-               Vector4f row3 = reg(r, src1, 3);
+               Vector4f row0 = fetchRegisterF(r, src1, 0);
+               Vector4f row1 = fetchRegisterF(r, src1, 1);
+               Vector4f row2 = fetchRegisterF(r, src1, 2);
+               Vector4f row3 = fetchRegisterF(r, src1, 3);
 
                dst.x = dot4(src0, row0);
                dst.y = dot4(src0, row1);
@@ -4925,9 +5095,9 @@ namespace sw
        {
                // abs(dFdx(src)) + abs(dFdy(src));
                dst.x = Abs(src.x.yyww - src.x.xxzz) + Abs(src.x.zwzw - src.x.xyxy);
-               dst.y = Abs(src.y.yyww - src.x.xxzz) + Abs(src.y.zwzw - src.y.xyxy);
-               dst.z = Abs(src.z.yyww - src.x.xxzz) + Abs(src.z.zwzw - src.z.xyxy);
-               dst.w = Abs(src.w.yyww - src.x.xxzz) + Abs(src.w.zwzw - src.w.xyxy);
+               dst.y = Abs(src.y.yyww - src.y.xxzz) + Abs(src.y.zwzw - src.y.xyxy);
+               dst.z = Abs(src.z.yyww - src.z.xxzz) + Abs(src.z.zwzw - src.z.xyxy);
+               dst.w = Abs(src.w.yyww - src.w.xxzz) + Abs(src.w.zwzw - src.w.xyxy);
        }
 
        void PixelRoutine::BREAK(Registers &r)
@@ -5204,7 +5374,7 @@ namespace sw
                }
                else
                {
-                       Int4 condition = As<Int4>(reg(r, src).x);
+                       Int4 condition = As<Int4>(fetchRegisterF(r, src).x);
                        IF(r, condition);
                }
        }
@@ -5373,7 +5543,7 @@ namespace sw
                Nucleus::setInsertBlock(testBlock);
                r.enableContinue = restoreContinue;
 
-               const Vector4f &src = reg(r, temporaryRegister);
+               const Vector4f &src = fetchRegisterF(r, temporaryRegister);
                Int4 condition = As<Int4>(src.x);
                condition &= r.enableStack[r.enableIndex - 1];
                r.enableStack[r.enableIndex] = condition;
@@ -5436,46 +5606,46 @@ namespace sw
                // FIXME: Use enableLeave in other control-flow constructs
        }
        
-       void PixelRoutine::writeDestination(Registers &r, Vector4i &d, const Dst &dst)
+       void PixelRoutine::writeDestination(Registers &r, Vector4s &d, const Dst &dst)
        {
                switch(dst.type)
                {
                case Shader::PARAMETER_TEMP:
-                       if(dst.mask & 0x1) r.ri[dst.index].x = d.x;
-                       if(dst.mask & 0x2) r.ri[dst.index].y = d.y;
-                       if(dst.mask & 0x4) r.ri[dst.index].z = d.z;
-                       if(dst.mask & 0x8) r.ri[dst.index].w = d.w;
+                       if(dst.mask & 0x1) r.rs[dst.index].x = d.x;
+                       if(dst.mask & 0x2) r.rs[dst.index].y = d.y;
+                       if(dst.mask & 0x4) r.rs[dst.index].z = d.z;
+                       if(dst.mask & 0x8) r.rs[dst.index].w = d.w;
                        break;
                case Shader::PARAMETER_INPUT:
-                       if(dst.mask & 0x1) r.vi[dst.index].x = d.x;
-                       if(dst.mask & 0x2) r.vi[dst.index].y = d.y;
-                       if(dst.mask & 0x4) r.vi[dst.index].z = d.z;
-                       if(dst.mask & 0x8) r.vi[dst.index].w = d.w;
+                       if(dst.mask & 0x1) r.vs[dst.index].x = d.x;
+                       if(dst.mask & 0x2) r.vs[dst.index].y = d.y;
+                       if(dst.mask & 0x4) r.vs[dst.index].z = d.z;
+                       if(dst.mask & 0x8) r.vs[dst.index].w = d.w;
                        break;
                case Shader::PARAMETER_CONST:                   ASSERT(false);  break;
                case Shader::PARAMETER_TEXTURE:
-                       if(dst.mask & 0x1) r.ti[dst.index].x = d.x;
-                       if(dst.mask & 0x2) r.ti[dst.index].y = d.y;
-                       if(dst.mask & 0x4) r.ti[dst.index].z = d.z;
-                       if(dst.mask & 0x8) r.ti[dst.index].w = d.w;
+                       if(dst.mask & 0x1) r.ts[dst.index].x = d.x;
+                       if(dst.mask & 0x2) r.ts[dst.index].y = d.y;
+                       if(dst.mask & 0x4) r.ts[dst.index].z = d.z;
+                       if(dst.mask & 0x8) r.ts[dst.index].w = d.w;
                        break;
                case Shader::PARAMETER_COLOROUT:
-                       if(dst.mask & 0x1) r.vi[dst.index].x = d.x;
-                       if(dst.mask & 0x2) r.vi[dst.index].y = d.y;
-                       if(dst.mask & 0x4) r.vi[dst.index].z = d.z;
-                       if(dst.mask & 0x8) r.vi[dst.index].w = d.w;
+                       if(dst.mask & 0x1) r.vs[dst.index].x = d.x;
+                       if(dst.mask & 0x2) r.vs[dst.index].y = d.y;
+                       if(dst.mask & 0x4) r.vs[dst.index].z = d.z;
+                       if(dst.mask & 0x8) r.vs[dst.index].w = d.w;
                        break;
                default:
                        ASSERT(false);
                }
        }
 
-       Vector4i PixelRoutine::regi(Registers &r, const Src &src)
+       Vector4s PixelRoutine::fetchRegisterS(Registers &r, const Src &src)
        {
-               Vector4i *reg;
+               Vector4s *reg;
                int i = src.index;
 
-               Vector4i c;
+               Vector4s c;
 
                if(src.type == Shader::PARAMETER_CONST)
                {
@@ -5487,12 +5657,12 @@ namespace sw
 
                switch(src.type)
                {
-               case Shader::PARAMETER_TEMP:          reg = &r.ri[i]; break;
-               case Shader::PARAMETER_INPUT:         reg = &r.vi[i]; break;
+               case Shader::PARAMETER_TEMP:          reg = &r.rs[i]; break;
+               case Shader::PARAMETER_INPUT:         reg = &r.vs[i]; break;
                case Shader::PARAMETER_CONST:         reg = &c;       break;
-               case Shader::PARAMETER_TEXTURE:       reg = &r.ti[i]; break;
-               case Shader::PARAMETER_VOID:          return r.ri[0]; // Dummy
-               case Shader::PARAMETER_FLOAT4LITERAL: return r.ri[0]; // Dummy
+               case Shader::PARAMETER_TEXTURE:       reg = &r.ts[i]; break;
+               case Shader::PARAMETER_VOID:          return r.rs[0]; // Dummy
+               case Shader::PARAMETER_FLOAT4LITERAL: return r.rs[0]; // Dummy
                default:
                        ASSERT(false);
                }
@@ -5502,7 +5672,7 @@ namespace sw
                const Short4 &z = (*reg)[(src.swizzle >> 4) & 0x3];
                const Short4 &w = (*reg)[(src.swizzle >> 6) & 0x3];
 
-               Vector4i mod;
+               Vector4s mod;
 
                switch(src.modifier)
                {
@@ -5597,7 +5767,7 @@ namespace sw
                return mod;
        }
 
-       Vector4f PixelRoutine::reg(Registers &r, const Src &src, int offset)
+       Vector4f PixelRoutine::fetchRegisterF(Registers &r, const Src &src, int offset)
        {
                Vector4f reg;
                int i = src.index + offset;
@@ -5738,9 +5908,9 @@ namespace sw
                        c.z = c.z.zzzz;
                        c.w = c.w.wwww;
 
-                       if(localShaderConstants)   // Constant may be known at compile time
+                       if(shader->containsDefineInstruction())   // Constant may be known at compile time
                        {
-                               for(int j = 0; j < shader->getLength(); j++)
+                               for(size_t j = 0; j < shader->getLength(); j++)
                                {
                                        const Shader::Instruction &instruction = *shader->getInstruction(j);