OSDN Git Service

gl_VertexID implementation
[android-x86/external-swiftshader.git] / src / Shader / VertexRoutine.cpp
1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "VertexRoutine.hpp"
16
17 #include "VertexShader.hpp"
18 #include "Vertex.hpp"
19 #include "Half.hpp"
20 #include "Renderer.hpp"
21 #include "Constants.hpp"
22 #include "Debug.hpp"
23
24 namespace sw
25 {
26         extern bool halfIntegerCoordinates;     // Pixel centers are not at integer coordinates (declaration only; the definition is not in this view)
27         extern bool symmetricNormalizedDepth;   // Clip-space depth is [-1, 1] instead of [0, 1]; computeClipFlags() uses it to choose the near-plane test
28
29         VertexRoutine::VertexRoutine(const VertexProcessor::State &state, const VertexShader *shader)
30                 : v(shader && shader->dynamicallyIndexedInput),
31                   o(shader && shader->dynamicallyIndexedOutput),
32                   state(state)
33         {
34         }
35
36         VertexRoutine::~VertexRoutine()
37         {
38         }
39
40         void VertexRoutine::generate()
41         {
42                 const bool textureSampling = state.textureSampling;
43
44                 Pointer<Byte> cache = task + OFFSET(VertexTask,vertexCache);
45                 Pointer<Byte> vertexCache = cache + OFFSET(VertexCache,vertex);
46                 Pointer<Byte> tagCache = cache + OFFSET(VertexCache,tag);
47
48                 UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask,vertexCount));
49                 UInt primitiveNumber = *Pointer<UInt>(task + OFFSET(VertexTask, primitiveStart));
50                 UInt indexInPrimitive = 0;
51
52                 constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,constants));
53
54                 Do
55                 {
56                         UInt index = *Pointer<UInt>(batch);
57                         UInt tagIndex = index & 0x0000003C;
58                         UInt indexQ = !textureSampling ? UInt(index & 0xFFFFFFFC) : index;   // FIXME: TEXLDL hack to have independent LODs, hurts performance.
59
60                         If(*Pointer<UInt>(tagCache + tagIndex) != indexQ)
61                         {
62                                 *Pointer<UInt>(tagCache + tagIndex) = indexQ;
63
64                                 readInput(indexQ);
65                                 pipeline(indexQ);
66                                 postTransform();
67                                 computeClipFlags();
68
69                                 Pointer<Byte> cacheLine0 = vertexCache + tagIndex * UInt((int)sizeof(Vertex));
70                                 writeCache(cacheLine0);
71                         }
72
73                         UInt cacheIndex = index & 0x0000003F;
74                         Pointer<Byte> cacheLine = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
75                         writeVertex(vertex, cacheLine);
76
77                         if(state.transformFeedbackEnabled != 0)
78                         {
79                                 transformFeedback(vertex, primitiveNumber, indexInPrimitive);
80
81                                 indexInPrimitive++;
82                                 If(indexInPrimitive == 3)
83                                 {
84                                         primitiveNumber++;
85                                         indexInPrimitive = 0;
86                                 }
87                         }
88
89                         vertex += sizeof(Vertex);
90                         batch += sizeof(unsigned int);
91                         vertexCount--;
92                 }
93                 Until(vertexCount == 0)
94
95                 Return();
96         }
97
98         void VertexRoutine::readInput(UInt &index)
99         {
100                 for(int i = 0; i < MAX_VERTEX_INPUTS; i++)
101                 {
102                         Pointer<Byte> input = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,input) + sizeof(void*) * i);
103                         UInt stride = *Pointer<UInt>(data + OFFSET(DrawData,stride) + sizeof(unsigned int) * i);
104
105                         v[i] = readStream(input, stride, state.input[i], index);
106                 }
107         }
108
109         void VertexRoutine::computeClipFlags()
110         {
111                 int pos = state.positionRegister;
112
113                 Int4 maxX = CmpLT(o[pos].w, o[pos].x);
114                 Int4 maxY = CmpLT(o[pos].w, o[pos].y);
115                 Int4 maxZ = CmpLT(o[pos].w, o[pos].z);
116                 Int4 minX = CmpNLE(-o[pos].w, o[pos].x);
117                 Int4 minY = CmpNLE(-o[pos].w, o[pos].y);
118                 Int4 minZ = symmetricNormalizedDepth ? CmpNLE(-o[pos].w, o[pos].z) : CmpNLE(Float4(0.0f), o[pos].z);
119
120                 clipFlags = *Pointer<Int>(constants + OFFSET(Constants,maxX) + SignMask(maxX) * 4);   // FIXME: Array indexing
121                 clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,maxY) + SignMask(maxY) * 4);
122                 clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,maxZ) + SignMask(maxZ) * 4);
123                 clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minX) + SignMask(minX) * 4);
124                 clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minY) + SignMask(minY) * 4);
125                 clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minZ) + SignMask(minZ) * 4);
126
127                 Int4 finiteX = CmpLE(Abs(o[pos].x), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
128                 Int4 finiteY = CmpLE(Abs(o[pos].y), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
129                 Int4 finiteZ = CmpLE(Abs(o[pos].z), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
130
131                 Int4 finiteXYZ = finiteX & finiteY & finiteZ;
132                 clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,fini) + SignMask(finiteXYZ) * 4);
133
134                 if(state.preTransformed)
135                 {
136                         clipFlags &= 0xFBFBFBFB;   // Don't clip against far clip plane
137                 }
138         }
139
140         Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index)
141         {
142                 const bool textureSampling = state.textureSampling;
143
144                 Vector4f v;
145
146                 Pointer<Byte> source0 = buffer + index * stride;
147                 Pointer<Byte> source1 = source0 + (!textureSampling ? stride : 0);
148                 Pointer<Byte> source2 = source1 + (!textureSampling ? stride : 0);
149                 Pointer<Byte> source3 = source2 + (!textureSampling ? stride : 0);
150
151                 bool isNativeFloatAttrib = (stream.attribType == VertexShader::ATTRIBTYPE_FLOAT) || stream.normalized;
152
153                 switch(stream.type)
154                 {
155                 case STREAMTYPE_FLOAT:
156                         {
157                                 if(stream.count == 0)
158                                 {
159                                         // Null stream, all default components
160                                 }
161                                 else
162                                 {
163                                         if(stream.count == 1)
164                                         {
165                                                 v.x.x = *Pointer<Float>(source0);
166                                                 v.x.y = *Pointer<Float>(source1);
167                                                 v.x.z = *Pointer<Float>(source2);
168                                                 v.x.w = *Pointer<Float>(source3);
169                                         }
170                                         else
171                                         {
172                                                 v.x = *Pointer<Float4>(source0);
173                                                 v.y = *Pointer<Float4>(source1);
174                                                 v.z = *Pointer<Float4>(source2);
175                                                 v.w = *Pointer<Float4>(source3);
176
177                                                 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
178                                         }
179
180                                         switch(stream.attribType)
181                                         {
182                                         case VertexShader::ATTRIBTYPE_INT:
183                                                 if(stream.count >= 1) v.x = As<Float4>(Int4(v.x));
184                                                 if(stream.count >= 2) v.x = As<Float4>(Int4(v.y));
185                                                 if(stream.count >= 3) v.x = As<Float4>(Int4(v.z));
186                                                 if(stream.count >= 4) v.x = As<Float4>(Int4(v.w));
187                                                 break;
188                                         case VertexShader::ATTRIBTYPE_UINT:
189                                                 if(stream.count >= 1) v.x = As<Float4>(UInt4(v.x));
190                                                 if(stream.count >= 2) v.x = As<Float4>(UInt4(v.y));
191                                                 if(stream.count >= 3) v.x = As<Float4>(UInt4(v.z));
192                                                 if(stream.count >= 4) v.x = As<Float4>(UInt4(v.w));
193                                                 break;
194                                         default:
195                                                 break;
196                                         }
197                                 }
198                         }
199                         break;
200                 case STREAMTYPE_BYTE:
201                         if(isNativeFloatAttrib) // Stream: UByte, Shader attrib: Float
202                         {
203                                 v.x = Float4(*Pointer<Byte4>(source0));
204                                 v.y = Float4(*Pointer<Byte4>(source1));
205                                 v.z = Float4(*Pointer<Byte4>(source2));
206                                 v.w = Float4(*Pointer<Byte4>(source3));
207
208                                 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
209
210                                 if(stream.normalized)
211                                 {
212                                         if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
213                                         if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
214                                         if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
215                                         if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
216                                 }
217                         }
218                         else // Stream: UByte, Shader attrib: Int / UInt
219                         {
220                                 v.x = As<Float4>(Int4(*Pointer<Byte4>(source0)));
221                                 v.y = As<Float4>(Int4(*Pointer<Byte4>(source1)));
222                                 v.z = As<Float4>(Int4(*Pointer<Byte4>(source2)));
223                                 v.w = As<Float4>(Int4(*Pointer<Byte4>(source3)));
224
225                                 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
226                         }
227                         break;
228                 case STREAMTYPE_SBYTE:
229                         if(isNativeFloatAttrib) // Stream: SByte, Shader attrib: Float
230                         {
231                                 v.x = Float4(*Pointer<SByte4>(source0));
232                                 v.y = Float4(*Pointer<SByte4>(source1));
233                                 v.z = Float4(*Pointer<SByte4>(source2));
234                                 v.w = Float4(*Pointer<SByte4>(source3));
235
236                                 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
237
238                                 if(stream.normalized)
239                                 {
240                                         if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
241                                         if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
242                                         if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
243                                         if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
244                                 }
245                         }
246                         else // Stream: SByte, Shader attrib: Int / UInt
247                         {
248                                 v.x = As<Float4>(Int4(*Pointer<SByte4>(source0)));
249                                 v.y = As<Float4>(Int4(*Pointer<SByte4>(source1)));
250                                 v.z = As<Float4>(Int4(*Pointer<SByte4>(source2)));
251                                 v.w = As<Float4>(Int4(*Pointer<SByte4>(source3)));
252
253                                 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
254                         }
255                         break;
256                 case STREAMTYPE_COLOR:
257                         {
258                                 v.x = Float4(*Pointer<Byte4>(source0)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
259                                 v.y = Float4(*Pointer<Byte4>(source1)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
260                                 v.z = Float4(*Pointer<Byte4>(source2)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
261                                 v.w = Float4(*Pointer<Byte4>(source3)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
262
263                                 transpose4x4(v.x, v.y, v.z, v.w);
264
265                                 // Swap red and blue
266                                 Float4 t = v.x;
267                                 v.x = v.z;
268                                 v.z = t;
269                         }
270                         break;
271                 case STREAMTYPE_SHORT:
272                         if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
273                         {
274                                 v.x = Float4(*Pointer<Short4>(source0));
275                                 v.y = Float4(*Pointer<Short4>(source1));
276                                 v.z = Float4(*Pointer<Short4>(source2));
277                                 v.w = Float4(*Pointer<Short4>(source3));
278
279                                 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
280
281                                 if(stream.normalized)
282                                 {
283                                         if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
284                                         if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
285                                         if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
286                                         if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
287                                 }
288                         }
289                         else // Stream: Short, Shader attrib: Int/UInt, no type conversion
290                         {
291                                 v.x = As<Float4>(Int4(*Pointer<Short4>(source0)));
292                                 v.y = As<Float4>(Int4(*Pointer<Short4>(source1)));
293                                 v.z = As<Float4>(Int4(*Pointer<Short4>(source2)));
294                                 v.w = As<Float4>(Int4(*Pointer<Short4>(source3)));
295
296                                 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
297                         }
298                         break;
299                 case STREAMTYPE_USHORT:
300                         if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
301                         {
302                                 v.x = Float4(*Pointer<UShort4>(source0));
303                                 v.y = Float4(*Pointer<UShort4>(source1));
304                                 v.z = Float4(*Pointer<UShort4>(source2));
305                                 v.w = Float4(*Pointer<UShort4>(source3));
306
307                                 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
308
309                                 if(stream.normalized)
310                                 {
311                                         if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
312                                         if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
313                                         if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
314                                         if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
315                                 }
316                         }
317                         else // Stream: UShort, Shader attrib: Int/UInt, no type conversion
318                         {
319                                 v.x = As<Float4>(Int4(*Pointer<UShort4>(source0)));
320                                 v.y = As<Float4>(Int4(*Pointer<UShort4>(source1)));
321                                 v.z = As<Float4>(Int4(*Pointer<UShort4>(source2)));
322                                 v.w = As<Float4>(Int4(*Pointer<UShort4>(source3)));
323
324                                 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
325                         }
326                         break;
327                 case STREAMTYPE_INT:
328                         if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
329                         {
330                                 v.x = Float4(*Pointer<Int4>(source0));
331                                 v.y = Float4(*Pointer<Int4>(source1));
332                                 v.z = Float4(*Pointer<Int4>(source2));
333                                 v.w = Float4(*Pointer<Int4>(source3));
334
335                                 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
336
337                                 if(stream.normalized)
338                                 {
339                                         if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
340                                         if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
341                                         if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
342                                         if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
343                                 }
344                         }
345                         else // Stream: Int, Shader attrib: Int/UInt, no type conversion
346                         {
347                                 v.x = *Pointer<Float4>(source0);
348                                 v.y = *Pointer<Float4>(source1);
349                                 v.z = *Pointer<Float4>(source2);
350                                 v.w = *Pointer<Float4>(source3);
351
352                                 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
353                         }
354                         break;
355                 case STREAMTYPE_UINT:
356                         if(isNativeFloatAttrib) // Stream: UInt, Shader attrib: Float
357                         {
358                                 v.x = Float4(*Pointer<UInt4>(source0));
359                                 v.y = Float4(*Pointer<UInt4>(source1));
360                                 v.z = Float4(*Pointer<UInt4>(source2));
361                                 v.w = Float4(*Pointer<UInt4>(source3));
362
363                                 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
364
365                                 if(stream.normalized)
366                                 {
367                                         if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
368                                         if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
369                                         if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
370                                         if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
371                                 }
372                         }
373                         else // Stream: UInt, Shader attrib: Int/UInt, no type conversion
374                         {
375                                 v.x = *Pointer<Float4>(source0);
376                                 v.y = *Pointer<Float4>(source1);
377                                 v.z = *Pointer<Float4>(source2);
378                                 v.w = *Pointer<Float4>(source3);
379
380                                 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
381                         }
382                         break;
383                 case STREAMTYPE_UDEC3:
384                         {
385                                 // FIXME: Vectorize
386                                 {
387                                         Int x, y, z;
388
389                                         x = y = z = *Pointer<Int>(source0);
390
391                                         v.x.x = Float(x & 0x000003FF);
392                                         v.x.y = Float(y & 0x000FFC00);
393                                         v.x.z = Float(z & 0x3FF00000);
394                                 }
395
396                                 {
397                                         Int x, y, z;
398
399                                         x = y = z = *Pointer<Int>(source1);
400
401                                         v.y.x = Float(x & 0x000003FF);
402                                         v.y.y = Float(y & 0x000FFC00);
403                                         v.y.z = Float(z & 0x3FF00000);
404                                 }
405
406                                 {
407                                         Int x, y, z;
408
409                                         x = y = z = *Pointer<Int>(source2);
410
411                                         v.z.x = Float(x & 0x000003FF);
412                                         v.z.y = Float(y & 0x000FFC00);
413                                         v.z.z = Float(z & 0x3FF00000);
414                                 }
415
416                                 {
417                                         Int x, y, z;
418
419                                         x = y = z = *Pointer<Int>(source3);
420
421                                         v.w.x = Float(x & 0x000003FF);
422                                         v.w.y = Float(y & 0x000FFC00);
423                                         v.w.z = Float(z & 0x3FF00000);
424                                 }
425
426                                 transpose4x3(v.x, v.y, v.z, v.w);
427
428                                 v.y *= Float4(1.0f / 0x00000400);
429                                 v.z *= Float4(1.0f / 0x00100000);
430                         }
431                         break;
432                 case STREAMTYPE_DEC3N:
433                         {
434                                 // FIXME: Vectorize
435                                 {
436                                         Int x, y, z;
437
438                                         x = y = z = *Pointer<Int>(source0);
439
440                                         v.x.x = Float((x << 22) & 0xFFC00000);
441                                         v.x.y = Float((y << 12) & 0xFFC00000);
442                                         v.x.z = Float((z << 2)  & 0xFFC00000);
443                                 }
444
445                                 {
446                                         Int x, y, z;
447
448                                         x = y = z = *Pointer<Int>(source1);
449
450                                         v.y.x = Float((x << 22) & 0xFFC00000);
451                                         v.y.y = Float((y << 12) & 0xFFC00000);
452                                         v.y.z = Float((z << 2)  & 0xFFC00000);
453                                 }
454
455                                 {
456                                         Int x, y, z;
457
458                                         x = y = z = *Pointer<Int>(source2);
459
460                                         v.z.x = Float((x << 22) & 0xFFC00000);
461                                         v.z.y = Float((y << 12) & 0xFFC00000);
462                                         v.z.z = Float((z << 2)  & 0xFFC00000);
463                                 }
464
465                                 {
466                                         Int x, y, z;
467
468                                         x = y = z = *Pointer<Int>(source3);
469
470                                         v.w.x = Float((x << 22) & 0xFFC00000);
471                                         v.w.y = Float((y << 12) & 0xFFC00000);
472                                         v.w.z = Float((z << 2)  & 0xFFC00000);
473                                 }
474
475                                 transpose4x3(v.x, v.y, v.z, v.w);
476
477                                 v.x *= Float4(1.0f / 0x00400000 / 511.0f);
478                                 v.y *= Float4(1.0f / 0x00400000 / 511.0f);
479                                 v.z *= Float4(1.0f / 0x00400000 / 511.0f);
480                         }
481                         break;
482                 case STREAMTYPE_FIXED:
483                         {
484                                 v.x = Float4(*Pointer<Int4>(source0)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
485                                 v.y = Float4(*Pointer<Int4>(source1)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
486                                 v.z = Float4(*Pointer<Int4>(source2)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
487                                 v.w = Float4(*Pointer<Int4>(source3)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
488
489                                 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
490                         }
491                         break;
492                 case STREAMTYPE_HALF:
493                         {
494                                 if(stream.count >= 1)
495                                 {
496                                         UShort x0 = *Pointer<UShort>(source0 + 0);
497                                         UShort x1 = *Pointer<UShort>(source1 + 0);
498                                         UShort x2 = *Pointer<UShort>(source2 + 0);
499                                         UShort x3 = *Pointer<UShort>(source3 + 0);
500
501                                         v.x.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x0) * 4);
502                                         v.x.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x1) * 4);
503                                         v.x.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x2) * 4);
504                                         v.x.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x3) * 4);
505                                 }
506
507                                 if(stream.count >= 2)
508                                 {
509                                         UShort y0 = *Pointer<UShort>(source0 + 2);
510                                         UShort y1 = *Pointer<UShort>(source1 + 2);
511                                         UShort y2 = *Pointer<UShort>(source2 + 2);
512                                         UShort y3 = *Pointer<UShort>(source3 + 2);
513
514                                         v.y.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y0) * 4);
515                                         v.y.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y1) * 4);
516                                         v.y.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y2) * 4);
517                                         v.y.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y3) * 4);
518                                 }
519
520                                 if(stream.count >= 3)
521                                 {
522                                         UShort z0 = *Pointer<UShort>(source0 + 4);
523                                         UShort z1 = *Pointer<UShort>(source1 + 4);
524                                         UShort z2 = *Pointer<UShort>(source2 + 4);
525                                         UShort z3 = *Pointer<UShort>(source3 + 4);
526
527                                         v.z.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z0) * 4);
528                                         v.z.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z1) * 4);
529                                         v.z.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z2) * 4);
530                                         v.z.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z3) * 4);
531                                 }
532
533                                 if(stream.count >= 4)
534                                 {
535                                         UShort w0 = *Pointer<UShort>(source0 + 6);
536                                         UShort w1 = *Pointer<UShort>(source1 + 6);
537                                         UShort w2 = *Pointer<UShort>(source2 + 6);
538                                         UShort w3 = *Pointer<UShort>(source3 + 6);
539
540                                         v.w.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w0) * 4);
541                                         v.w.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w1) * 4);
542                                         v.w.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w2) * 4);
543                                         v.w.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w3) * 4);
544                                 }
545                         }
546                         break;
547                 case STREAMTYPE_INDICES:
548                         {
549                                 v.x.x = *Pointer<Float>(source0);
550                                 v.x.y = *Pointer<Float>(source1);
551                                 v.x.z = *Pointer<Float>(source2);
552                                 v.x.w = *Pointer<Float>(source3);
553                         }
554                         break;
555                 case STREAMTYPE_2_10_10_10_INT:
556                         {
557                                 Int4 src;
558                                 src = Insert(src, *Pointer<Int>(source0), 0);
559                                 src = Insert(src, *Pointer<Int>(source1), 1);
560                                 src = Insert(src, *Pointer<Int>(source2), 2);
561                                 src = Insert(src, *Pointer<Int>(source3), 3);
562
563                                 v.x = Float4((src << 22) >> 22);
564                                 v.y = Float4((src << 12) >> 22);
565                                 v.z = Float4((src << 02) >> 22);
566                                 v.w = Float4(src >> 30);
567
568                                 if(stream.normalized)
569                                 {
570                                         v.x = Max(v.x * Float4(1.0f / 0x1FF), Float4(-1.0f));
571                                         v.y = Max(v.y * Float4(1.0f / 0x1FF), Float4(-1.0f));
572                                         v.z = Max(v.z * Float4(1.0f / 0x1FF), Float4(-1.0f));
573                                         v.w = Max(v.w, Float4(-1.0f));
574                                 }
575                         }
576                         break;
577                 case STREAMTYPE_2_10_10_10_UINT:
578                         {
579                                 Int4 src;
580                                 src = Insert(src, *Pointer<Int>(source0), 0);
581                                 src = Insert(src, *Pointer<Int>(source1), 1);
582                                 src = Insert(src, *Pointer<Int>(source2), 2);
583                                 src = Insert(src, *Pointer<Int>(source3), 3);
584
585                                 v.x = Float4(src & Int4(0x3FF));
586                                 v.y = Float4((src >> 10) & Int4(0x3FF));
587                                 v.z = Float4((src >> 20) & Int4(0x3FF));
588                                 v.w = Float4((src >> 30) & Int4(0x3));
589
590                                 if(stream.normalized)
591                                 {
592                                         v.x *= Float4(1.0f / 0x3FF);
593                                         v.y *= Float4(1.0f / 0x3FF);
594                                         v.z *= Float4(1.0f / 0x3FF);
595                                         v.w *= Float4(1.0f / 0x3);
596                                 }
597                         }
598                         break;
599                 default:
600                         ASSERT(false);
601                 }
602
603                 if(stream.count < 1) v.x = Float4(0.0f);
604                 if(stream.count < 2) v.y = Float4(0.0f);
605                 if(stream.count < 3) v.z = Float4(0.0f);
606                 if(stream.count < 4) v.w = isNativeFloatAttrib ? As<Float4>(Float4(1.0f)) : As<Float4>(Int4(0));
607
608                 return v;
609         }
610
611         void VertexRoutine::postTransform()
612         {
613                 int pos = state.positionRegister;
614
615                 // Backtransform
616                 if(state.preTransformed)
617                 {
618                         Float4 rhw = Float4(1.0f) / o[pos].w;
619
620                         Float4 W = *Pointer<Float4>(data + OFFSET(DrawData,Wx16)) * Float4(1.0f / 16.0f);
621                         Float4 H = *Pointer<Float4>(data + OFFSET(DrawData,Hx16)) * Float4(1.0f / 16.0f);
622                         Float4 L = *Pointer<Float4>(data + OFFSET(DrawData,X0x16)) * Float4(1.0f / 16.0f);
623                         Float4 T = *Pointer<Float4>(data + OFFSET(DrawData,Y0x16)) * Float4(1.0f / 16.0f);
624
625                         o[pos].x = (o[pos].x - L) / W * rhw;
626                         o[pos].y = (o[pos].y - T) / H * rhw;
627                         o[pos].z = o[pos].z * rhw;
628                         o[pos].w = rhw;
629                 }
630
631                 if(!halfIntegerCoordinates && !state.preTransformed)
632                 {
633                         o[pos].x = o[pos].x + *Pointer<Float4>(data + OFFSET(DrawData,halfPixelX)) * o[pos].w;
634                         o[pos].y = o[pos].y + *Pointer<Float4>(data + OFFSET(DrawData,halfPixelY)) * o[pos].w;
635                 }
636
637                 if(state.superSampling)
638                 {
639                         o[pos].x = o[pos].x + *Pointer<Float4>(data + OFFSET(DrawData,XXXX)) * o[pos].w;
640                         o[pos].y = o[pos].y + *Pointer<Float4>(data + OFFSET(DrawData,YYYY)) * o[pos].w;
641                 }
642         }
643
644         void VertexRoutine::writeCache(Pointer<Byte> &cacheLine)
645         {
646                 Vector4f v;
647
648                 for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
649                 {
650                         if(state.output[i].write)
651                         {
652                                 v.x = o[i].x;
653                                 v.y = o[i].y;
654                                 v.z = o[i].z;
655                                 v.w = o[i].w;
656
657                                 if(state.output[i].xClamp)
658                                 {
659                                         v.x = Max(v.x, Float4(0.0f));
660                                         v.x = Min(v.x, Float4(1.0f));
661                                 }
662
663                                 if(state.output[i].yClamp)
664                                 {
665                                         v.y = Max(v.y, Float4(0.0f));
666                                         v.y = Min(v.y, Float4(1.0f));
667                                 }
668
669                                 if(state.output[i].zClamp)
670                                 {
671                                         v.z = Max(v.z, Float4(0.0f));
672                                         v.z = Min(v.z, Float4(1.0f));
673                                 }
674
675                                 if(state.output[i].wClamp)
676                                 {
677                                         v.w = Max(v.w, Float4(0.0f));
678                                         v.w = Min(v.w, Float4(1.0f));
679                                 }
680
681                                 if(state.output[i].write == 0x01)
682                                 {
683                                         *Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0) = v.x.x;
684                                         *Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1) = v.x.y;
685                                         *Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2) = v.x.z;
686                                         *Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3) = v.x.w;
687                                 }
688                                 else
689                                 {
690                                         if(state.output[i].write == 0x03)
691                                         {
692                                                 transpose2x4(v.x, v.y, v.z, v.w);
693                                         }
694                                         else
695                                         {
696                                                 transpose4x4(v.x, v.y, v.z, v.w);
697                                         }
698
699                                         *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0, 16) = v.x;
700                                         *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1, 16) = v.y;
701                                         *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2, 16) = v.z;
702                                         *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3, 16) = v.w;
703                                 }
704                         }
705                 }
706
707                 *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 0) = (clipFlags >> 0)  & 0x0000000FF;
708                 *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 1) = (clipFlags >> 8)  & 0x0000000FF;
709                 *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 2) = (clipFlags >> 16) & 0x0000000FF;
710                 *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 3) = (clipFlags >> 24) & 0x0000000FF;
711
712                 // Viewport transform
713                 int pos = state.positionRegister;
714
715                 v.x = o[pos].x;
716                 v.y = o[pos].y;
717                 v.z = o[pos].z;
718                 v.w = o[pos].w;
719
720                 if(symmetricNormalizedDepth)
721                 {
722                         v.z = (v.z + v.w) * Float4(0.5f);   // [-1, 1] -> [0, 1]
723                 }
724
725                 Float4 w = As<Float4>(As<Int4>(v.w) | (As<Int4>(CmpEQ(v.w, Float4(0.0f))) & As<Int4>(Float4(1.0f))));
726                 Float4 rhw = Float4(1.0f) / w;
727
728                 v.x = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,X0x16)) + v.x * rhw * *Pointer<Float4>(data + OFFSET(DrawData,Wx16))));
729                 v.y = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,Y0x16)) + v.y * rhw * *Pointer<Float4>(data + OFFSET(DrawData,Hx16))));
730                 v.z = v.z * rhw;
731                 v.w = rhw;
732
733                 transpose4x4(v.x, v.y, v.z, v.w);
734
735                 *Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 0, 16) = v.x;
736                 *Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 1, 16) = v.y;
737                 *Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 2, 16) = v.z;
738                 *Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 3, 16) = v.w;
739         }
740
741         void VertexRoutine::writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cache)
742         {
743                 for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
744                 {
745                         if(state.output[i].write)
746                         {
747                                 *Pointer<Int4>(vertex + OFFSET(Vertex,v[i]), 16) = *Pointer<Int4>(cache + OFFSET(Vertex,v[i]), 16);
748                         }
749                 }
750
751                 *Pointer<Int4>(vertex + OFFSET(Vertex,X)) = *Pointer<Int4>(cache + OFFSET(Vertex,X));
752                 *Pointer<Int>(vertex + OFFSET(Vertex,clipFlags)) = *Pointer<Int>(cache + OFFSET(Vertex,clipFlags));
753         }
754
755         void VertexRoutine::transformFeedback(const Pointer<Byte> &vertex, const UInt &primitiveNumber, const UInt &indexInPrimitive)
756         {
757                 If(indexInPrimitive < state.verticesPerPrimitive)
758                 {
759                         UInt tOffset = primitiveNumber * state.verticesPerPrimitive + indexInPrimitive;
760
761                         for(int i = 0; i < MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS; i++)
762                         {
763                                 if(state.transformFeedbackEnabled & (1ULL << i))
764                                 {
765                                         UInt reg = *Pointer<UInt>(data + OFFSET(DrawData, vs.reg[i]));
766                                         UInt row = *Pointer<UInt>(data + OFFSET(DrawData, vs.row[i]));
767                                         UInt col = *Pointer<UInt>(data + OFFSET(DrawData, vs.col[i]));
768                                         UInt str = *Pointer<UInt>(data + OFFSET(DrawData, vs.str[i]));
769
770                                         Pointer<Byte> t = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, vs.t[i])) + (tOffset * str * sizeof(float));
771                                         Pointer<Byte> v = vertex + OFFSET(Vertex, v) + reg * sizeof(float);
772
773                                         For(UInt r = 0, r < row, r++)
774                                         {
775                                                 UInt rOffsetX = r * col * sizeof(float);
776                                                 UInt rOffset4 = r * sizeof(float4);
777
778                                                 For(UInt c = 0, c < col, c++)
779                                                 {
780                                                         UInt cOffset = c * sizeof(float);
781                                                         *Pointer<Float>(t + rOffsetX + cOffset) = *Pointer<Float>(v + rOffset4 + cOffset);
782                                                 }
783                                         }
784                                 }
785                         }
786                 }
787         }
788 }