OSDN Git Service

gl_VertexID implementation
[android-x86/external-swiftshader.git] / src / Shader / VertexRoutine.cpp
index 60b193d..0f1ccdf 100644 (file)
@@ -1,13 +1,16 @@
-// SwiftShader Software Renderer
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
 //
-// Copyright(c) 2005-2012 TransGaming Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
 //
-// All rights reserved. No part of this software may be copied, distributed, transmitted,
-// transcribed, stored in a retrieval system, translated into any human or computer
-// language by any means, or disclosed to third parties without the explicit written
-// agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express
-// or implied, including but not limited to any patent rights, are granted to you.
+//    http://www.apache.org/licenses/LICENSE-2.0
 //
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 
 #include "VertexRoutine.hpp"
 
@@ -23,9 +26,11 @@ namespace sw
        extern bool halfIntegerCoordinates;     // Pixel centers are not at integer coordinates
        extern bool symmetricNormalizedDepth;   // [-1, 1] instead of [0, 1]
 
-       VertexRoutine::VertexRoutine(const VertexProcessor::State &state, const VertexShader *shader) : state(state), shader(shader)
+       VertexRoutine::VertexRoutine(const VertexProcessor::State &state, const VertexShader *shader)
+               : v(shader && shader->dynamicallyIndexedInput),
+                 o(shader && shader->dynamicallyIndexedOutput),
+                 state(state)
        {
-               routine = 0;
        }
 
        VertexRoutine::~VertexRoutine()
@@ -34,130 +39,116 @@ namespace sw
 
        void VertexRoutine::generate()
        {
-               Function<Void(Pointer<Byte>, Pointer<Byte>, Pointer<Byte>, Pointer<Byte>)> function;
-               {
-                       Pointer<Byte> vertex(function.arg(0));
-                       Pointer<Byte> batch(function.arg(1));
-                       Pointer<Byte> task(function.arg(2));
-                       Pointer<Byte> data(function.arg(3));
+               const bool textureSampling = state.textureSampling;
 
-                       const bool texldl = state.shaderContainsTexldl;
+               Pointer<Byte> cache = task + OFFSET(VertexTask,vertexCache);
+               Pointer<Byte> vertexCache = cache + OFFSET(VertexCache,vertex);
+               Pointer<Byte> tagCache = cache + OFFSET(VertexCache,tag);
 
-                       Pointer<Byte> cache = task + OFFSET(VertexTask,vertexCache);
-                       Pointer<Byte> vertexCache = cache + OFFSET(VertexCache,vertex);
-                       Pointer<Byte> tagCache = cache + OFFSET(VertexCache,tag);
+               UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask,vertexCount));
+               UInt primitiveNumber = *Pointer<UInt>(task + OFFSET(VertexTask, primitiveStart));
+               UInt indexInPrimitive = 0;
 
-                       UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask,vertexCount));
+               constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,constants));
 
-                       Registers r(shader);
-                       r.data = data;
-                       r.constants = *Pointer<Pointer<Byte> >(data + OFFSET(DrawData,constants));
-                       if(shader && shader->instanceIdDeclared)
-                       {
-                               r.instanceID = *Pointer<Int>(data + OFFSET(DrawData, instanceID));
-                       }
+               Do
+               {
+                       UInt index = *Pointer<UInt>(batch);
+                       UInt tagIndex = index & 0x0000003C;
+                       UInt indexQ = !textureSampling ? UInt(index & 0xFFFFFFFC) : index;   // FIXME: TEXLDL hack to have independent LODs, hurts performance.
 
-                       Do
+                       If(*Pointer<UInt>(tagCache + tagIndex) != indexQ)
                        {
-                               UInt index = *Pointer<UInt>(batch);
-                               UInt tagIndex = index & 0x0000003C;
-                               UInt indexQ = !texldl ? UInt(index & 0xFFFFFFFC) : index;   // FIXME: TEXLDL hack to have independent LODs, hurts performance.
+                               *Pointer<UInt>(tagCache + tagIndex) = indexQ;
 
-                               If(*Pointer<UInt>(tagCache + tagIndex) != indexQ)
-                               {
-                                       *Pointer<UInt>(tagCache + tagIndex) = indexQ;
+                               readInput(indexQ);
+                               pipeline(indexQ);
+                               postTransform();
+                               computeClipFlags();
 
-                                       readInput(r, indexQ);
-                                       pipeline(r);
-                                       postTransform(r);
-                                       computeClipFlags(r);
+                               Pointer<Byte> cacheLine0 = vertexCache + tagIndex * UInt((int)sizeof(Vertex));
+                               writeCache(cacheLine0);
+                       }
 
-                                       Pointer<Byte> cacheLine0 = vertexCache + tagIndex * UInt((int)sizeof(Vertex));
-                                       writeCache(cacheLine0, r);
-                               }
+                       UInt cacheIndex = index & 0x0000003F;
+                       Pointer<Byte> cacheLine = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
+                       writeVertex(vertex, cacheLine);
 
-                               UInt cacheIndex = index & 0x0000003F;
-                               Pointer<Byte> cacheLine = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
-                               writeVertex(vertex, cacheLine);
+                       if(state.transformFeedbackEnabled != 0)
+                       {
+                               transformFeedback(vertex, primitiveNumber, indexInPrimitive);
 
-                               vertex += sizeof(Vertex);
-                               batch += sizeof(unsigned int);
-                               vertexCount--;
+                               indexInPrimitive++;
+                               If(indexInPrimitive == 3)
+                               {
+                                       primitiveNumber++;
+                                       indexInPrimitive = 0;
+                               }
                        }
-                       Until(vertexCount == 0)
 
-                       Return();
+                       vertex += sizeof(Vertex);
+                       batch += sizeof(unsigned int);
+                       vertexCount--;
                }
+               Until(vertexCount == 0)
 
-               routine = function(L"VertexRoutine_%0.8X", state.shaderID);
+               Return();
        }
 
-       Routine *VertexRoutine::getRoutine()
+       void VertexRoutine::readInput(UInt &index)
        {
-               return routine;
-       }
-
-       void VertexRoutine::readInput(Registers &r, UInt &index)
-       {
-               for(int i = 0; i < VERTEX_ATTRIBUTES; i++)
+               for(int i = 0; i < MAX_VERTEX_INPUTS; i++)
                {
-                       Pointer<Byte> input = *Pointer<Pointer<Byte> >(r.data + OFFSET(DrawData,input) + sizeof(void*) * i);
-                       UInt stride = *Pointer<UInt>(r.data + OFFSET(DrawData,stride) + sizeof(unsigned int) * i);
+                       Pointer<Byte> input = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,input) + sizeof(void*) * i);
+                       UInt stride = *Pointer<UInt>(data + OFFSET(DrawData,stride) + sizeof(unsigned int) * i);
 
-                       r.v[i] = readStream(r, input, stride, state.input[i], index);
+                       v[i] = readStream(input, stride, state.input[i], index);
                }
        }
 
-       void VertexRoutine::computeClipFlags(Registers &r)
+       void VertexRoutine::computeClipFlags()
        {
                int pos = state.positionRegister;
 
-               Int4 maxX = CmpLT(r.o[pos].w, r.o[pos].x);
-               Int4 maxY = CmpLT(r.o[pos].w, r.o[pos].y);
-               Int4 maxZ = CmpLT(r.o[pos].w, r.o[pos].z);
-
-               Int4 minX = CmpNLE(-r.o[pos].w, r.o[pos].x);
-               Int4 minY = CmpNLE(-r.o[pos].w, r.o[pos].y);
-               Int4 minZ = CmpNLE(Float4(0.0f), r.o[pos].z);
-
-               Int flags;
+               Int4 maxX = CmpLT(o[pos].w, o[pos].x);
+               Int4 maxY = CmpLT(o[pos].w, o[pos].y);
+               Int4 maxZ = CmpLT(o[pos].w, o[pos].z);
+               Int4 minX = CmpNLE(-o[pos].w, o[pos].x);
+               Int4 minY = CmpNLE(-o[pos].w, o[pos].y);
+               Int4 minZ = symmetricNormalizedDepth ? CmpNLE(-o[pos].w, o[pos].z) : CmpNLE(Float4(0.0f), o[pos].z);
 
-               flags = SignMask(maxX);
-               r.clipFlags = *Pointer<Int>(r.constants + OFFSET(Constants,maxX) + flags * 4);   // FIXME: Array indexing
-               flags = SignMask(maxY);
-               r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,maxY) + flags * 4);
-               flags = SignMask(maxZ);
-               r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,maxZ) + flags * 4);
-               flags = SignMask(minX);
-               r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,minX) + flags * 4);
-               flags = SignMask(minY);
-               r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,minY) + flags * 4);
-               flags = SignMask(minZ);
-               r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,minZ) + flags * 4);
+               clipFlags = *Pointer<Int>(constants + OFFSET(Constants,maxX) + SignMask(maxX) * 4);   // FIXME: Array indexing
+               clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,maxY) + SignMask(maxY) * 4);
+               clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,maxZ) + SignMask(maxZ) * 4);
+               clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minX) + SignMask(minX) * 4);
+               clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minY) + SignMask(minY) * 4);
+               clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minZ) + SignMask(minZ) * 4);
 
-               Int4 finiteX = CmpLE(Abs(r.o[pos].x), *Pointer<Float4>(r.constants + OFFSET(Constants,maxPos)));
-               Int4 finiteY = CmpLE(Abs(r.o[pos].y), *Pointer<Float4>(r.constants + OFFSET(Constants,maxPos)));
-               Int4 finiteZ = CmpLE(Abs(r.o[pos].z), *Pointer<Float4>(r.constants + OFFSET(Constants,maxPos)));
+               Int4 finiteX = CmpLE(Abs(o[pos].x), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
+               Int4 finiteY = CmpLE(Abs(o[pos].y), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
+               Int4 finiteZ = CmpLE(Abs(o[pos].z), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
 
-               flags = SignMask(finiteX & finiteY & finiteZ);
-               r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,fini) + flags * 4);
+               Int4 finiteXYZ = finiteX & finiteY & finiteZ;
+               clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,fini) + SignMask(finiteXYZ) * 4);
 
                if(state.preTransformed)
                {
-                       r.clipFlags &= 0xFBFBFBFB;   // Don't clip against far clip plane
+                       clipFlags &= 0xFBFBFBFB;   // Don't clip against far clip plane
                }
        }
 
-       Vector4f VertexRoutine::readStream(Registers &r, Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index)
+       Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index)
        {
-               const bool texldl = state.shaderContainsTexldl;
+               const bool textureSampling = state.textureSampling;
 
                Vector4f v;
 
                Pointer<Byte> source0 = buffer + index * stride;
-               Pointer<Byte> source1 = source0 + (!texldl ? stride : 0);
-               Pointer<Byte> source2 = source1 + (!texldl ? stride : 0);
-               Pointer<Byte> source3 = source2 + (!texldl ? stride : 0);
+               Pointer<Byte> source1 = source0 + (!textureSampling ? stride : 0);
+               Pointer<Byte> source2 = source1 + (!textureSampling ? stride : 0);
+               Pointer<Byte> source3 = source2 + (!textureSampling ? stride : 0);
+
+               bool isNativeFloatAttrib = (stream.attribType == VertexShader::ATTRIBTYPE_FLOAT) || stream.normalized;
 
                switch(stream.type)
                {
@@ -167,25 +158,47 @@ namespace sw
                                {
                                        // Null stream, all default components
                                }
-                               else if(stream.count == 1)
-                               {
-                                       v.x.x = *Pointer<Float>(source0);
-                                       v.x.y = *Pointer<Float>(source1);
-                                       v.x.z = *Pointer<Float>(source2);
-                                       v.x.w = *Pointer<Float>(source3);
-                               }
                                else
                                {
-                                       v.x = *Pointer<Float4>(source0);
-                                       v.y = *Pointer<Float4>(source1);
-                                       v.z = *Pointer<Float4>(source2);
-                                       v.w = *Pointer<Float4>(source3);
+                                       if(stream.count == 1)
+                                       {
+                                               v.x.x = *Pointer<Float>(source0);
+                                               v.x.y = *Pointer<Float>(source1);
+                                               v.x.z = *Pointer<Float>(source2);
+                                               v.x.w = *Pointer<Float>(source3);
+                                       }
+                                       else
+                                       {
+                                               v.x = *Pointer<Float4>(source0);
+                                               v.y = *Pointer<Float4>(source1);
+                                               v.z = *Pointer<Float4>(source2);
+                                               v.w = *Pointer<Float4>(source3);
 
-                                       transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+                                               transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+                                       }
+
+                                       switch(stream.attribType)
+                                       {
+                                       case VertexShader::ATTRIBTYPE_INT:   // Convert each loaded float component to Int4, kept as raw bits in the Float4 register
+                                               if(stream.count >= 1) v.x = As<Float4>(Int4(v.x));
+                                               if(stream.count >= 2) v.y = As<Float4>(Int4(v.y));
+                                               if(stream.count >= 3) v.z = As<Float4>(Int4(v.z));
+                                               if(stream.count >= 4) v.w = As<Float4>(Int4(v.w));
+                                               break;
+                                       case VertexShader::ATTRIBTYPE_UINT:   // Same per-component conversion, through UInt4
+                                               if(stream.count >= 1) v.x = As<Float4>(UInt4(v.x));
+                                               if(stream.count >= 2) v.y = As<Float4>(UInt4(v.y));
+                                               if(stream.count >= 3) v.z = As<Float4>(UInt4(v.z));
+                                               if(stream.count >= 4) v.w = As<Float4>(UInt4(v.w));
+                                               break;
+                                       default:   // Any other attribute type keeps the float values as loaded
+                                               break;
+                                       }
                                }
                        }
                        break;
                case STREAMTYPE_BYTE:
+                       if(isNativeFloatAttrib) // Stream: UByte, Shader attrib: Float
                        {
                                v.x = Float4(*Pointer<Byte4>(source0));
                                v.y = Float4(*Pointer<Byte4>(source1));
@@ -196,14 +209,24 @@ namespace sw
 
                                if(stream.normalized)
                                {
-                                       if(stream.count >= 1) v.x *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
-                                       if(stream.count >= 2) v.y *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
-                                       if(stream.count >= 3) v.z *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
-                                       if(stream.count >= 4) v.w *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
+                                       if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+                                       if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+                                       if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+                                       if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
                                }
                        }
+                       else // Stream: UByte, Shader attrib: Int / UInt
+                       {
+                               v.x = As<Float4>(Int4(*Pointer<Byte4>(source0)));
+                               v.y = As<Float4>(Int4(*Pointer<Byte4>(source1)));
+                               v.z = As<Float4>(Int4(*Pointer<Byte4>(source2)));
+                               v.w = As<Float4>(Int4(*Pointer<Byte4>(source3)));
+
+                               transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+                       }
                        break;
                case STREAMTYPE_SBYTE:
+                       if(isNativeFloatAttrib) // Stream: SByte, Shader attrib: Float
                        {
                                v.x = Float4(*Pointer<SByte4>(source0));
                                v.y = Float4(*Pointer<SByte4>(source1));
@@ -214,19 +237,28 @@ namespace sw
 
                                if(stream.normalized)
                                {
-                                       if(stream.count >= 1) v.x *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleSByte));
-                                       if(stream.count >= 2) v.y *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleSByte));
-                                       if(stream.count >= 3) v.z *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleSByte));
-                                       if(stream.count >= 4) v.w *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleSByte));
+                                       if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
+                                       if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
+                                       if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
+                                       if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
                                }
                        }
+                       else // Stream: SByte, Shader attrib: Int / UInt
+                       {
+                               v.x = As<Float4>(Int4(*Pointer<SByte4>(source0)));
+                               v.y = As<Float4>(Int4(*Pointer<SByte4>(source1)));
+                               v.z = As<Float4>(Int4(*Pointer<SByte4>(source2)));
+                               v.w = As<Float4>(Int4(*Pointer<SByte4>(source3)));
+
+                               transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+                       }
                        break;
                case STREAMTYPE_COLOR:
                        {
-                               v.x = Float4(*Pointer<Byte4>(source0)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
-                               v.y = Float4(*Pointer<Byte4>(source1)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
-                               v.z = Float4(*Pointer<Byte4>(source2)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
-                               v.w = Float4(*Pointer<Byte4>(source3)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
+                               v.x = Float4(*Pointer<Byte4>(source0)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+                               v.y = Float4(*Pointer<Byte4>(source1)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+                               v.z = Float4(*Pointer<Byte4>(source2)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+                               v.w = Float4(*Pointer<Byte4>(source3)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
 
                                transpose4x4(v.x, v.y, v.z, v.w);
 
@@ -237,47 +269,123 @@ namespace sw
                        }
                        break;
                case STREAMTYPE_SHORT:
+                       if(isNativeFloatAttrib) // Stream: Short, Shader attrib: Float
                        {
                                v.x = Float4(*Pointer<Short4>(source0));
                                v.y = Float4(*Pointer<Short4>(source1));
                                v.z = Float4(*Pointer<Short4>(source2));
                                v.w = Float4(*Pointer<Short4>(source3));
-                       
+
                                transpose4xN(v.x, v.y, v.z, v.w, stream.count);
 
                                if(stream.normalized)
                                {
-                                       if(stream.count >= 1) v.x *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleShort));
-                                       if(stream.count >= 2) v.y *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleShort));
-                                       if(stream.count >= 3) v.z *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleShort));
-                                       if(stream.count >= 4) v.w *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleShort));
-                               }                       
+                                       if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
+                                       if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
+                                       if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
+                                       if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
+                               }
+                       }
+                       else // Stream: Short, Shader attrib: Int/UInt, widened to Int4 without float conversion
+                       {
+                               v.x = As<Float4>(Int4(*Pointer<Short4>(source0)));
+                               v.y = As<Float4>(Int4(*Pointer<Short4>(source1)));
+                               v.z = As<Float4>(Int4(*Pointer<Short4>(source2)));
+                               v.w = As<Float4>(Int4(*Pointer<Short4>(source3)));
+
+                               transpose4xN(v.x, v.y, v.z, v.w, stream.count);
                        }
                        break;
                case STREAMTYPE_USHORT:
+                       if(isNativeFloatAttrib) // Stream: UShort, Shader attrib: Float
                        {
                                v.x = Float4(*Pointer<UShort4>(source0));
                                v.y = Float4(*Pointer<UShort4>(source1));
                                v.z = Float4(*Pointer<UShort4>(source2));
                                v.w = Float4(*Pointer<UShort4>(source3));
-                       
+
                                transpose4xN(v.x, v.y, v.z, v.w, stream.count);
 
                                if(stream.normalized)
                                {
-                                       if(stream.count >= 1) v.x *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleUShort));
-                                       if(stream.count >= 2) v.y *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleUShort));
-                                       if(stream.count >= 3) v.z *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleUShort));
-                                       if(stream.count >= 4) v.w *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleUShort));
+                                       if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
+                                       if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
+                                       if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
+                                       if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
                                }
                        }
+                       else // Stream: UShort, Shader attrib: Int/UInt, widened to Int4 without float conversion
+                       {
+                               v.x = As<Float4>(Int4(*Pointer<UShort4>(source0)));
+                               v.y = As<Float4>(Int4(*Pointer<UShort4>(source1)));
+                               v.z = As<Float4>(Int4(*Pointer<UShort4>(source2)));
+                               v.w = As<Float4>(Int4(*Pointer<UShort4>(source3)));
+
+                               transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+                       }
+                       break;
+               case STREAMTYPE_INT:
+                       if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
+                       {
+                               v.x = Float4(*Pointer<Int4>(source0));
+                               v.y = Float4(*Pointer<Int4>(source1));
+                               v.z = Float4(*Pointer<Int4>(source2));
+                               v.w = Float4(*Pointer<Int4>(source3));
+
+                               transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+
+                               if(stream.normalized)
+                               {
+                                       if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
+                                       if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
+                                       if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
+                                       if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
+                               }
+                       }
+                       else // Stream: Int, Shader attrib: Int/UInt, no type conversion
+                       {
+                               v.x = *Pointer<Float4>(source0);
+                               v.y = *Pointer<Float4>(source1);
+                               v.z = *Pointer<Float4>(source2);
+                               v.w = *Pointer<Float4>(source3);
+
+                               transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+                       }
+                       break;
+               case STREAMTYPE_UINT:
+                       if(isNativeFloatAttrib) // Stream: UInt, Shader attrib: Float
+                       {
+                               v.x = Float4(*Pointer<UInt4>(source0));
+                               v.y = Float4(*Pointer<UInt4>(source1));
+                               v.z = Float4(*Pointer<UInt4>(source2));
+                               v.w = Float4(*Pointer<UInt4>(source3));
+
+                               transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+
+                               if(stream.normalized)
+                               {
+                                       if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
+                                       if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
+                                       if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
+                                       if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
+                               }
+                       }
+                       else // Stream: UInt, Shader attrib: Int/UInt, no type conversion
+                       {
+                               v.x = *Pointer<Float4>(source0);
+                               v.y = *Pointer<Float4>(source1);
+                               v.z = *Pointer<Float4>(source2);
+                               v.w = *Pointer<Float4>(source3);
+
+                               transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+                       }
                        break;
                case STREAMTYPE_UDEC3:
                        {
                                // FIXME: Vectorize
                                {
                                        Int x, y, z;
-                                       
+
                                        x = y = z = *Pointer<Int>(source0);
 
                                        v.x.x = Float(x & 0x000003FF);
@@ -287,7 +395,7 @@ namespace sw
 
                                {
                                        Int x, y, z;
-                                       
+
                                        x = y = z = *Pointer<Int>(source1);
 
                                        v.y.x = Float(x & 0x000003FF);
@@ -297,7 +405,7 @@ namespace sw
 
                                {
                                        Int x, y, z;
-                                       
+
                                        x = y = z = *Pointer<Int>(source2);
 
                                        v.z.x = Float(x & 0x000003FF);
@@ -307,7 +415,7 @@ namespace sw
 
                                {
                                        Int x, y, z;
-                                       
+
                                        x = y = z = *Pointer<Int>(source3);
 
                                        v.w.x = Float(x & 0x000003FF);
@@ -326,7 +434,7 @@ namespace sw
                                // FIXME: Vectorize
                                {
                                        Int x, y, z;
-                                       
+
                                        x = y = z = *Pointer<Int>(source0);
 
                                        v.x.x = Float((x << 22) & 0xFFC00000);
@@ -336,7 +444,7 @@ namespace sw
 
                                {
                                        Int x, y, z;
-                                       
+
                                        x = y = z = *Pointer<Int>(source1);
 
                                        v.y.x = Float((x << 22) & 0xFFC00000);
@@ -346,7 +454,7 @@ namespace sw
 
                                {
                                        Int x, y, z;
-                                       
+
                                        x = y = z = *Pointer<Int>(source2);
 
                                        v.z.x = Float((x << 22) & 0xFFC00000);
@@ -356,7 +464,7 @@ namespace sw
 
                                {
                                        Int x, y, z;
-                                       
+
                                        x = y = z = *Pointer<Int>(source3);
 
                                        v.w.x = Float((x << 22) & 0xFFC00000);
@@ -373,10 +481,10 @@ namespace sw
                        break;
                case STREAMTYPE_FIXED:
                        {
-                               v.x = Float4(*Pointer<Int4>(source0)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleFixed));
-                               v.y = Float4(*Pointer<Int4>(source1)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleFixed));
-                               v.z = Float4(*Pointer<Int4>(source2)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleFixed));
-                               v.w = Float4(*Pointer<Int4>(source3)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleFixed));
+                               v.x = Float4(*Pointer<Int4>(source0)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
+                               v.y = Float4(*Pointer<Int4>(source1)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
+                               v.z = Float4(*Pointer<Int4>(source2)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
+                               v.w = Float4(*Pointer<Int4>(source3)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
 
                                transpose4xN(v.x, v.y, v.z, v.w, stream.count);
                        }
@@ -390,10 +498,10 @@ namespace sw
                                        UShort x2 = *Pointer<UShort>(source2 + 0);
                                        UShort x3 = *Pointer<UShort>(source3 + 0);
 
-                                       v.x.x = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(x0) * 4);
-                                       v.x.y = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(x1) * 4);
-                                       v.x.z = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(x2) * 4);
-                                       v.x.w = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(x3) * 4);
+                                       v.x.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x0) * 4);
+                                       v.x.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x1) * 4);
+                                       v.x.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x2) * 4);
+                                       v.x.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x3) * 4);
                                }
 
                                if(stream.count >= 2)
@@ -403,10 +511,10 @@ namespace sw
                                        UShort y2 = *Pointer<UShort>(source2 + 2);
                                        UShort y3 = *Pointer<UShort>(source3 + 2);
 
-                                       v.y.x = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(y0) * 4);
-                                       v.y.y = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(y1) * 4);
-                                       v.y.z = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(y2) * 4);
-                                       v.y.w = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(y3) * 4);
+                                       v.y.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y0) * 4);
+                                       v.y.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y1) * 4);
+                                       v.y.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y2) * 4);
+                                       v.y.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y3) * 4);
                                }
 
                                if(stream.count >= 3)
@@ -416,10 +524,10 @@ namespace sw
                                        UShort z2 = *Pointer<UShort>(source2 + 4);
                                        UShort z3 = *Pointer<UShort>(source3 + 4);
 
-                                       v.z.x = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(z0) * 4);
-                                       v.z.y = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(z1) * 4);
-                                       v.z.z = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(z2) * 4);
-                                       v.z.w = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(z3) * 4);
+                                       v.z.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z0) * 4);
+                                       v.z.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z1) * 4);
+                                       v.z.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z2) * 4);
+                                       v.z.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z3) * 4);
                                }
 
                                if(stream.count >= 4)
@@ -429,10 +537,10 @@ namespace sw
                                        UShort w2 = *Pointer<UShort>(source2 + 6);
                                        UShort w3 = *Pointer<UShort>(source3 + 6);
 
-                                       v.w.x = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(w0) * 4);
-                                       v.w.y = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(w1) * 4);
-                                       v.w.z = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(w2) * 4);
-                                       v.w.w = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(w3) * 4);
+                                       v.w.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w0) * 4);
+                                       v.w.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w1) * 4);
+                                       v.w.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w2) * 4);
+                                       v.w.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w3) * 4);
                                }
                        }
                        break;
@@ -444,6 +552,50 @@ namespace sw
                                v.x.w = *Pointer<Float>(source3);
                        }
                        break;
+               case STREAMTYPE_2_10_10_10_INT:
+                       {
+                               Int4 src;
+                               src = Insert(src, *Pointer<Int>(source0), 0);
+                               src = Insert(src, *Pointer<Int>(source1), 1);
+                               src = Insert(src, *Pointer<Int>(source2), 2);
+                               src = Insert(src, *Pointer<Int>(source3), 3);
+
+                               v.x = Float4((src << 22) >> 22);
+                               v.y = Float4((src << 12) >> 22);
+                               v.z = Float4((src << 02) >> 22);
+                               v.w = Float4(src >> 30);
+
+                               if(stream.normalized)
+                               {
+                                       v.x = Max(v.x * Float4(1.0f / 0x1FF), Float4(-1.0f));
+                                       v.y = Max(v.y * Float4(1.0f / 0x1FF), Float4(-1.0f));
+                                       v.z = Max(v.z * Float4(1.0f / 0x1FF), Float4(-1.0f));
+                                       v.w = Max(v.w, Float4(-1.0f));
+                               }
+                       }
+                       break;
+               case STREAMTYPE_2_10_10_10_UINT:
+                       {
+                               Int4 src;
+                               src = Insert(src, *Pointer<Int>(source0), 0);
+                               src = Insert(src, *Pointer<Int>(source1), 1);
+                               src = Insert(src, *Pointer<Int>(source2), 2);
+                               src = Insert(src, *Pointer<Int>(source3), 3);
+
+                               v.x = Float4(src & Int4(0x3FF));
+                               v.y = Float4((src >> 10) & Int4(0x3FF));
+                               v.z = Float4((src >> 20) & Int4(0x3FF));
+                               v.w = Float4((src >> 30) & Int4(0x3));
+
+                               if(stream.normalized)
+                               {
+                                       v.x *= Float4(1.0f / 0x3FF);
+                                       v.y *= Float4(1.0f / 0x3FF);
+                                       v.z *= Float4(1.0f / 0x3FF);
+                                       v.w *= Float4(1.0f / 0x3);
+                               }
+                       }
+                       break;
                default:
                        ASSERT(false);
                }
@@ -451,61 +603,56 @@ namespace sw
                if(stream.count < 1) v.x = Float4(0.0f);
                if(stream.count < 2) v.y = Float4(0.0f);
                if(stream.count < 3) v.z = Float4(0.0f);
-               if(stream.count < 4) v.w = Float4(1.0f);
+               if(stream.count < 4) v.w = isNativeFloatAttrib ? As<Float4>(Float4(1.0f)) : As<Float4>(Int4(0));
 
                return v;
        }
 
-       void VertexRoutine::postTransform(Registers &r)
+       void VertexRoutine::postTransform()
        {
                int pos = state.positionRegister;
 
                // Backtransform
                if(state.preTransformed)
                {
-                       Float4 rhw = Float4(1.0f) / r.o[pos].w;
+                       Float4 rhw = Float4(1.0f) / o[pos].w;
 
-                       Float4 W = *Pointer<Float4>(r.data + OFFSET(DrawData,Wx16)) * Float4(1.0f / 16.0f);
-                       Float4 H = *Pointer<Float4>(r.data + OFFSET(DrawData,Hx16)) * Float4(1.0f / 16.0f);
-                       Float4 L = *Pointer<Float4>(r.data + OFFSET(DrawData,X0x16)) * Float4(1.0f / 16.0f);
-                       Float4 T = *Pointer<Float4>(r.data + OFFSET(DrawData,Y0x16)) * Float4(1.0f / 16.0f);
+                       Float4 W = *Pointer<Float4>(data + OFFSET(DrawData,Wx16)) * Float4(1.0f / 16.0f);
+                       Float4 H = *Pointer<Float4>(data + OFFSET(DrawData,Hx16)) * Float4(1.0f / 16.0f);
+                       Float4 L = *Pointer<Float4>(data + OFFSET(DrawData,X0x16)) * Float4(1.0f / 16.0f);
+                       Float4 T = *Pointer<Float4>(data + OFFSET(DrawData,Y0x16)) * Float4(1.0f / 16.0f);
 
-                       r.o[pos].x = (r.o[pos].x - L) / W * rhw;
-                       r.o[pos].y = (r.o[pos].y - T) / H * rhw;
-                       r.o[pos].z = r.o[pos].z * rhw;
-                       r.o[pos].w = rhw;
+                       o[pos].x = (o[pos].x - L) / W * rhw;
+                       o[pos].y = (o[pos].y - T) / H * rhw;
+                       o[pos].z = o[pos].z * rhw;
+                       o[pos].w = rhw;
                }
 
                if(!halfIntegerCoordinates && !state.preTransformed)
                {
-                       r.o[pos].x = r.o[pos].x + *Pointer<Float4>(r.data + OFFSET(DrawData,halfPixelX)) * r.o[pos].w;
-                       r.o[pos].y = r.o[pos].y + *Pointer<Float4>(r.data + OFFSET(DrawData,halfPixelY)) * r.o[pos].w;
+                       o[pos].x = o[pos].x + *Pointer<Float4>(data + OFFSET(DrawData,halfPixelX)) * o[pos].w;
+                       o[pos].y = o[pos].y + *Pointer<Float4>(data + OFFSET(DrawData,halfPixelY)) * o[pos].w;
                }
 
                if(state.superSampling)
                {
-                       r.o[pos].x = r.o[pos].x + *Pointer<Float4>(r.data + OFFSET(DrawData,XXXX)) * r.o[pos].w;
-                       r.o[pos].y = r.o[pos].y + *Pointer<Float4>(r.data + OFFSET(DrawData,YYYY)) * r.o[pos].w;
-               }
-
-               if(symmetricNormalizedDepth && !state.fixedFunction)
-               {
-                       r.o[pos].z = (r.o[pos].z + r.o[pos].w) * Float4(0.5f);
+                       o[pos].x = o[pos].x + *Pointer<Float4>(data + OFFSET(DrawData,XXXX)) * o[pos].w;
+                       o[pos].y = o[pos].y + *Pointer<Float4>(data + OFFSET(DrawData,YYYY)) * o[pos].w;
                }
        }
 
-       void VertexRoutine::writeCache(Pointer<Byte> &cacheLine, Registers &r)
+       void VertexRoutine::writeCache(Pointer<Byte> &cacheLine)
        {
                Vector4f v;
 
-               for(int i = 0; i < 12; i++)
+               for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
                {
                        if(state.output[i].write)
                        {
-                               v.x = r.o[i].x;
-                               v.y = r.o[i].y;
-                               v.z = r.o[i].z;
-                               v.w = r.o[i].w;
+                               v.x = o[i].x;
+                               v.y = o[i].y;
+                               v.z = o[i].z;
+                               v.w = o[i].w;
 
                                if(state.output[i].xClamp)
                                {
@@ -540,7 +687,7 @@ namespace sw
                                }
                                else
                                {
-                                       if(state.output[i].write == 0x02)
+                                       if(state.output[i].write == 0x03)
                                        {
                                                transpose2x4(v.x, v.y, v.z, v.w);
                                        }
@@ -557,23 +704,29 @@ namespace sw
                        }
                }
 
-               *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 0) = (r.clipFlags >> 0)  & 0x0000000FF;   // FIXME: unsigned char Vertex::clipFlags
-               *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 1) = (r.clipFlags >> 8)  & 0x0000000FF;
-               *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 2) = (r.clipFlags >> 16) & 0x0000000FF;
-               *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 3) = (r.clipFlags >> 24) & 0x0000000FF;
+               *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 0) = (clipFlags >> 0)  & 0x0000000FF;
+               *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 1) = (clipFlags >> 8)  & 0x0000000FF;
+               *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 2) = (clipFlags >> 16) & 0x0000000FF;
+               *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 3) = (clipFlags >> 24) & 0x0000000FF;
 
+               // Viewport transform
                int pos = state.positionRegister;
 
-               v.x = r.o[pos].x;
-               v.y = r.o[pos].y;
-               v.z = r.o[pos].z;
-               v.w = r.o[pos].w;
+               v.x = o[pos].x;
+               v.y = o[pos].y;
+               v.z = o[pos].z;
+               v.w = o[pos].w;
+
+               if(symmetricNormalizedDepth)
+               {
+                       v.z = (v.z + v.w) * Float4(0.5f);   // [-1, 1] -> [0, 1]
+               }
 
                Float4 w = As<Float4>(As<Int4>(v.w) | (As<Int4>(CmpEQ(v.w, Float4(0.0f))) & As<Int4>(Float4(1.0f))));
                Float4 rhw = Float4(1.0f) / w;
 
-               v.x = As<Float4>(RoundInt(*Pointer<Float4>(r.data + OFFSET(DrawData,X0x16)) + v.x * rhw * *Pointer<Float4>(r.data + OFFSET(DrawData,Wx16))));
-               v.y = As<Float4>(RoundInt(*Pointer<Float4>(r.data + OFFSET(DrawData,Y0x16)) + v.y * rhw * *Pointer<Float4>(r.data + OFFSET(DrawData,Hx16))));
+               v.x = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,X0x16)) + v.x * rhw * *Pointer<Float4>(data + OFFSET(DrawData,Wx16))));
+               v.y = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,Y0x16)) + v.y * rhw * *Pointer<Float4>(data + OFFSET(DrawData,Hx16))));
                v.z = v.z * rhw;
                v.w = rhw;
 
@@ -585,17 +738,51 @@ namespace sw
                *Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 3, 16) = v.w;
        }
 
-       void VertexRoutine::writeVertex(Pointer<Byte> &vertex, Pointer<Byte> &cache)
+       void VertexRoutine::writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cache)
        {
-               for(int i = 0; i < 12; i++)
+               for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
                {
                        if(state.output[i].write)
                        {
-                               *Pointer<Float4>(vertex + OFFSET(Vertex,v[i])) = *Pointer<Float4>(cache + OFFSET(Vertex,v[i]));
+                               *Pointer<Int4>(vertex + OFFSET(Vertex,v[i]), 16) = *Pointer<Int4>(cache + OFFSET(Vertex,v[i]), 16);
                        }
                }
 
+               *Pointer<Int4>(vertex + OFFSET(Vertex,X)) = *Pointer<Int4>(cache + OFFSET(Vertex,X));
                *Pointer<Int>(vertex + OFFSET(Vertex,clipFlags)) = *Pointer<Int>(cache + OFFSET(Vertex,clipFlags));
-               *Pointer<Float4>(vertex + OFFSET(Vertex,X)) = *Pointer<Float4>(cache + OFFSET(Vertex,X));
+       }
+
+       void VertexRoutine::transformFeedback(const Pointer<Byte> &vertex, const UInt &primitiveNumber, const UInt &indexInPrimitive)
+       {
+               If(indexInPrimitive < state.verticesPerPrimitive)
+               {
+                       UInt tOffset = primitiveNumber * state.verticesPerPrimitive + indexInPrimitive;
+
+                       for(int i = 0; i < MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS; i++)
+                       {
+                               if(state.transformFeedbackEnabled & (1ULL << i))
+                               {
+                                       UInt reg = *Pointer<UInt>(data + OFFSET(DrawData, vs.reg[i]));
+                                       UInt row = *Pointer<UInt>(data + OFFSET(DrawData, vs.row[i]));
+                                       UInt col = *Pointer<UInt>(data + OFFSET(DrawData, vs.col[i]));
+                                       UInt str = *Pointer<UInt>(data + OFFSET(DrawData, vs.str[i]));
+
+                                       Pointer<Byte> t = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, vs.t[i])) + (tOffset * str * sizeof(float));
+                                       Pointer<Byte> v = vertex + OFFSET(Vertex, v) + reg * sizeof(float);
+
+                                       For(UInt r = 0, r < row, r++)
+                                       {
+                                               UInt rOffsetX = r * col * sizeof(float);
+                                               UInt rOffset4 = r * sizeof(float4);
+
+                                               For(UInt c = 0, c < col, c++)
+                                               {
+                                                       UInt cOffset = c * sizeof(float);
+                                                       *Pointer<Float>(t + rOffsetX + cOffset) = *Pointer<Float>(v + rOffset4 + cOffset);
+                                               }
+                                       }
+                               }
+                       }
+               }
        }
 }