OSDN Git Service

Make the number of vertex inputs configurable.
[android-x86/external-swiftshader.git] / src / Shader / VertexRoutine.cpp
1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "VertexRoutine.hpp"
16
17 #include "VertexShader.hpp"
18 #include "Vertex.hpp"
19 #include "Half.hpp"
20 #include "Renderer.hpp"
21 #include "Constants.hpp"
22 #include "Debug.hpp"
23
24 namespace sw
25 {
26         extern bool halfIntegerCoordinates;     // Pixel centers are not at integer coordinates
27         extern bool symmetricNormalizedDepth;   // [-1, 1] instead of [0, 1]
28
29         VertexRoutine::VertexRoutine(const VertexProcessor::State &state, const VertexShader *shader)
30                 : v(shader && shader->dynamicallyIndexedInput),
31                   o(shader && shader->dynamicallyIndexedOutput),
32                   state(state)
33         {
34         }
35
36         VertexRoutine::~VertexRoutine()
37         {
38         }
39
40         void VertexRoutine::generate()
41         {
42                 const bool textureSampling = state.textureSampling;
43
44                 Pointer<Byte> cache = task + OFFSET(VertexTask,vertexCache);
45                 Pointer<Byte> vertexCache = cache + OFFSET(VertexCache,vertex);
46                 Pointer<Byte> tagCache = cache + OFFSET(VertexCache,tag);
47
48                 UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask,vertexCount));
49
50                 constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,constants));
51
52                 Do
53                 {
54                         UInt index = *Pointer<UInt>(batch);
55                         UInt tagIndex = index & 0x0000003C;
56                         UInt indexQ = !textureSampling ? UInt(index & 0xFFFFFFFC) : index;   // FIXME: TEXLDL hack to have independent LODs, hurts performance.
57
58                         If(*Pointer<UInt>(tagCache + tagIndex) != indexQ)
59                         {
60                                 *Pointer<UInt>(tagCache + tagIndex) = indexQ;
61
62                                 readInput(indexQ);
63                                 pipeline();
64                                 postTransform();
65                                 computeClipFlags();
66
67                                 Pointer<Byte> cacheLine0 = vertexCache + tagIndex * UInt((int)sizeof(Vertex));
68                                 writeCache(cacheLine0);
69                         }
70
71                         UInt cacheIndex = index & 0x0000003F;
72                         Pointer<Byte> cacheLine = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
73                         writeVertex(vertex, cacheLine);
74
75                         vertex += sizeof(Vertex);
76                         batch += sizeof(unsigned int);
77                         vertexCount--;
78                 }
79                 Until(vertexCount == 0)
80
81                 Return();
82         }
83
84         void VertexRoutine::readInput(UInt &index)
85         {
86                 for(int i = 0; i < MAX_VERTEX_INPUTS; i++)
87                 {
88                         Pointer<Byte> input = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,input) + sizeof(void*) * i);
89                         UInt stride = *Pointer<UInt>(data + OFFSET(DrawData,stride) + sizeof(unsigned int) * i);
90
91                         v[i] = readStream(input, stride, state.input[i], index);
92                 }
93         }
94
95         void VertexRoutine::computeClipFlags()
96         {
97                 int pos = state.positionRegister;
98
99                 Int4 maxX = CmpLT(o[pos].w, o[pos].x);
100                 Int4 maxY = CmpLT(o[pos].w, o[pos].y);
101                 Int4 maxZ = CmpLT(o[pos].w, o[pos].z);
102
103                 Int4 minX = CmpNLE(-o[pos].w, o[pos].x);
104                 Int4 minY = CmpNLE(-o[pos].w, o[pos].y);
105                 Int4 minZ = CmpNLE(Float4(0.0f), o[pos].z);
106
107                 Int flags;
108
109                 flags = SignMask(maxX);
110                 clipFlags = *Pointer<Int>(constants + OFFSET(Constants,maxX) + flags * 4);   // FIXME: Array indexing
111                 flags = SignMask(maxY);
112                 clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,maxY) + flags * 4);
113                 flags = SignMask(maxZ);
114                 clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,maxZ) + flags * 4);
115                 flags = SignMask(minX);
116                 clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minX) + flags * 4);
117                 flags = SignMask(minY);
118                 clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minY) + flags * 4);
119                 flags = SignMask(minZ);
120                 clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minZ) + flags * 4);
121
122                 Int4 finiteX = CmpLE(Abs(o[pos].x), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
123                 Int4 finiteY = CmpLE(Abs(o[pos].y), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
124                 Int4 finiteZ = CmpLE(Abs(o[pos].z), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
125
126                 flags = SignMask(finiteX & finiteY & finiteZ);
127                 clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,fini) + flags * 4);
128
129                 if(state.preTransformed)
130                 {
131                         clipFlags &= 0xFBFBFBFB;   // Don't clip against far clip plane
132                 }
133         }
134
135         Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index)
136         {
137                 const bool textureSampling = state.textureSampling;
138
139                 Vector4f v;
140
141                 Pointer<Byte> source0 = buffer + index * stride;
142                 Pointer<Byte> source1 = source0 + (!textureSampling ? stride : 0);
143                 Pointer<Byte> source2 = source1 + (!textureSampling ? stride : 0);
144                 Pointer<Byte> source3 = source2 + (!textureSampling ? stride : 0);
145
146                 switch(stream.type)
147                 {
148                 case STREAMTYPE_FLOAT:
149                         {
150                                 if(stream.count == 0)
151                                 {
152                                         // Null stream, all default components
153                                 }
154                                 else if(stream.count == 1)
155                                 {
156                                         v.x.x = *Pointer<Float>(source0);
157                                         v.x.y = *Pointer<Float>(source1);
158                                         v.x.z = *Pointer<Float>(source2);
159                                         v.x.w = *Pointer<Float>(source3);
160                                 }
161                                 else
162                                 {
163                                         v.x = *Pointer<Float4>(source0);
164                                         v.y = *Pointer<Float4>(source1);
165                                         v.z = *Pointer<Float4>(source2);
166                                         v.w = *Pointer<Float4>(source3);
167
168                                         transpose4xN(v.x, v.y, v.z, v.w, stream.count);
169                                 }
170                         }
171                         break;
172                 case STREAMTYPE_BYTE:
173                         {
174                                 v.x = Float4(*Pointer<Byte4>(source0));
175                                 v.y = Float4(*Pointer<Byte4>(source1));
176                                 v.z = Float4(*Pointer<Byte4>(source2));
177                                 v.w = Float4(*Pointer<Byte4>(source3));
178
179                                 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
180
181                                 if(stream.normalized)
182                                 {
183                                         if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
184                                         if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
185                                         if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
186                                         if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
187                                 }
188                         }
189                         break;
190                 case STREAMTYPE_SBYTE:
191                         {
192                                 v.x = Float4(*Pointer<SByte4>(source0));
193                                 v.y = Float4(*Pointer<SByte4>(source1));
194                                 v.z = Float4(*Pointer<SByte4>(source2));
195                                 v.w = Float4(*Pointer<SByte4>(source3));
196
197                                 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
198
199                                 if(stream.normalized)
200                                 {
201                                         if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
202                                         if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
203                                         if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
204                                         if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
205                                 }
206                         }
207                         break;
208                 case STREAMTYPE_COLOR:
209                         {
210                                 v.x = Float4(*Pointer<Byte4>(source0)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
211                                 v.y = Float4(*Pointer<Byte4>(source1)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
212                                 v.z = Float4(*Pointer<Byte4>(source2)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
213                                 v.w = Float4(*Pointer<Byte4>(source3)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
214
215                                 transpose4x4(v.x, v.y, v.z, v.w);
216
217                                 // Swap red and blue
218                                 Float4 t = v.x;
219                                 v.x = v.z;
220                                 v.z = t;
221                         }
222                         break;
223                 case STREAMTYPE_SHORT:
224                         {
225                                 v.x = Float4(*Pointer<Short4>(source0));
226                                 v.y = Float4(*Pointer<Short4>(source1));
227                                 v.z = Float4(*Pointer<Short4>(source2));
228                                 v.w = Float4(*Pointer<Short4>(source3));
229
230                                 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
231
232                                 if(stream.normalized)
233                                 {
234                                         if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
235                                         if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
236                                         if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
237                                         if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
238                                 }
239                         }
240                         break;
241                 case STREAMTYPE_USHORT:
242                         {
243                                 v.x = Float4(*Pointer<UShort4>(source0));
244                                 v.y = Float4(*Pointer<UShort4>(source1));
245                                 v.z = Float4(*Pointer<UShort4>(source2));
246                                 v.w = Float4(*Pointer<UShort4>(source3));
247
248                                 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
249
250                                 if(stream.normalized)
251                                 {
252                                         if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
253                                         if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
254                                         if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
255                                         if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
256                                 }
257                         }
258                         break;
259                 case STREAMTYPE_INT:
260                         {
261                                 if(stream.normalized)
262                                 {
263                                         v.x = Float4(*Pointer<Int4>(source0));
264                                         v.y = Float4(*Pointer<Int4>(source1));
265                                         v.z = Float4(*Pointer<Int4>(source2));
266                                         v.w = Float4(*Pointer<Int4>(source3));
267
268                                         transpose4xN(v.x, v.y, v.z, v.w, stream.count);
269
270                                         if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
271                                         if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
272                                         if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
273                                         if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
274                                 }
275                                 else
276                                 {
277                                         v.x = As<Float4>(*Pointer<Int4>(source0));
278                                         v.y = As<Float4>(*Pointer<Int4>(source1));
279                                         v.z = As<Float4>(*Pointer<Int4>(source2));
280                                         v.w = As<Float4>(*Pointer<Int4>(source3));
281
282                                         transpose4xN(v.x, v.y, v.z, v.w, stream.count);
283                                 }
284                         }
285                         break;
286                 case STREAMTYPE_UINT:
287                         {
288                                 if(stream.normalized)
289                                 {
290                                         v.x = Float4(*Pointer<UInt4>(source0));
291                                         v.y = Float4(*Pointer<UInt4>(source1));
292                                         v.z = Float4(*Pointer<UInt4>(source2));
293                                         v.w = Float4(*Pointer<UInt4>(source3));
294
295                                         transpose4xN(v.x, v.y, v.z, v.w, stream.count);
296
297                                         if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
298                                         if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
299                                         if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
300                                         if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
301                                 }
302                                 else
303                                 {
304                                         v.x = As<Float4>(*Pointer<UInt4>(source0));
305                                         v.y = As<Float4>(*Pointer<UInt4>(source1));
306                                         v.z = As<Float4>(*Pointer<UInt4>(source2));
307                                         v.w = As<Float4>(*Pointer<UInt4>(source3));
308
309                                         transpose4xN(v.x, v.y, v.z, v.w, stream.count);
310                                 }
311                         }
312                         break;
313                 case STREAMTYPE_UDEC3:
314                         {
315                                 // FIXME: Vectorize
316                                 {
317                                         Int x, y, z;
318
319                                         x = y = z = *Pointer<Int>(source0);
320
321                                         v.x.x = Float(x & 0x000003FF);
322                                         v.x.y = Float(y & 0x000FFC00);
323                                         v.x.z = Float(z & 0x3FF00000);
324                                 }
325
326                                 {
327                                         Int x, y, z;
328
329                                         x = y = z = *Pointer<Int>(source1);
330
331                                         v.y.x = Float(x & 0x000003FF);
332                                         v.y.y = Float(y & 0x000FFC00);
333                                         v.y.z = Float(z & 0x3FF00000);
334                                 }
335
336                                 {
337                                         Int x, y, z;
338
339                                         x = y = z = *Pointer<Int>(source2);
340
341                                         v.z.x = Float(x & 0x000003FF);
342                                         v.z.y = Float(y & 0x000FFC00);
343                                         v.z.z = Float(z & 0x3FF00000);
344                                 }
345
346                                 {
347                                         Int x, y, z;
348
349                                         x = y = z = *Pointer<Int>(source3);
350
351                                         v.w.x = Float(x & 0x000003FF);
352                                         v.w.y = Float(y & 0x000FFC00);
353                                         v.w.z = Float(z & 0x3FF00000);
354                                 }
355
356                                 transpose4x3(v.x, v.y, v.z, v.w);
357
358                                 v.y *= Float4(1.0f / 0x00000400);
359                                 v.z *= Float4(1.0f / 0x00100000);
360                         }
361                         break;
362                 case STREAMTYPE_DEC3N:
363                         {
364                                 // FIXME: Vectorize
365                                 {
366                                         Int x, y, z;
367
368                                         x = y = z = *Pointer<Int>(source0);
369
370                                         v.x.x = Float((x << 22) & 0xFFC00000);
371                                         v.x.y = Float((y << 12) & 0xFFC00000);
372                                         v.x.z = Float((z << 2)  & 0xFFC00000);
373                                 }
374
375                                 {
376                                         Int x, y, z;
377
378                                         x = y = z = *Pointer<Int>(source1);
379
380                                         v.y.x = Float((x << 22) & 0xFFC00000);
381                                         v.y.y = Float((y << 12) & 0xFFC00000);
382                                         v.y.z = Float((z << 2)  & 0xFFC00000);
383                                 }
384
385                                 {
386                                         Int x, y, z;
387
388                                         x = y = z = *Pointer<Int>(source2);
389
390                                         v.z.x = Float((x << 22) & 0xFFC00000);
391                                         v.z.y = Float((y << 12) & 0xFFC00000);
392                                         v.z.z = Float((z << 2)  & 0xFFC00000);
393                                 }
394
395                                 {
396                                         Int x, y, z;
397
398                                         x = y = z = *Pointer<Int>(source3);
399
400                                         v.w.x = Float((x << 22) & 0xFFC00000);
401                                         v.w.y = Float((y << 12) & 0xFFC00000);
402                                         v.w.z = Float((z << 2)  & 0xFFC00000);
403                                 }
404
405                                 transpose4x3(v.x, v.y, v.z, v.w);
406
407                                 v.x *= Float4(1.0f / 0x00400000 / 511.0f);
408                                 v.y *= Float4(1.0f / 0x00400000 / 511.0f);
409                                 v.z *= Float4(1.0f / 0x00400000 / 511.0f);
410                         }
411                         break;
412                 case STREAMTYPE_FIXED:
413                         {
414                                 v.x = Float4(*Pointer<Int4>(source0)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
415                                 v.y = Float4(*Pointer<Int4>(source1)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
416                                 v.z = Float4(*Pointer<Int4>(source2)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
417                                 v.w = Float4(*Pointer<Int4>(source3)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
418
419                                 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
420                         }
421                         break;
422                 case STREAMTYPE_HALF:
423                         {
424                                 if(stream.count >= 1)
425                                 {
426                                         UShort x0 = *Pointer<UShort>(source0 + 0);
427                                         UShort x1 = *Pointer<UShort>(source1 + 0);
428                                         UShort x2 = *Pointer<UShort>(source2 + 0);
429                                         UShort x3 = *Pointer<UShort>(source3 + 0);
430
431                                         v.x.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x0) * 4);
432                                         v.x.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x1) * 4);
433                                         v.x.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x2) * 4);
434                                         v.x.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x3) * 4);
435                                 }
436
437                                 if(stream.count >= 2)
438                                 {
439                                         UShort y0 = *Pointer<UShort>(source0 + 2);
440                                         UShort y1 = *Pointer<UShort>(source1 + 2);
441                                         UShort y2 = *Pointer<UShort>(source2 + 2);
442                                         UShort y3 = *Pointer<UShort>(source3 + 2);
443
444                                         v.y.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y0) * 4);
445                                         v.y.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y1) * 4);
446                                         v.y.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y2) * 4);
447                                         v.y.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y3) * 4);
448                                 }
449
450                                 if(stream.count >= 3)
451                                 {
452                                         UShort z0 = *Pointer<UShort>(source0 + 4);
453                                         UShort z1 = *Pointer<UShort>(source1 + 4);
454                                         UShort z2 = *Pointer<UShort>(source2 + 4);
455                                         UShort z3 = *Pointer<UShort>(source3 + 4);
456
457                                         v.z.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z0) * 4);
458                                         v.z.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z1) * 4);
459                                         v.z.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z2) * 4);
460                                         v.z.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z3) * 4);
461                                 }
462
463                                 if(stream.count >= 4)
464                                 {
465                                         UShort w0 = *Pointer<UShort>(source0 + 6);
466                                         UShort w1 = *Pointer<UShort>(source1 + 6);
467                                         UShort w2 = *Pointer<UShort>(source2 + 6);
468                                         UShort w3 = *Pointer<UShort>(source3 + 6);
469
470                                         v.w.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w0) * 4);
471                                         v.w.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w1) * 4);
472                                         v.w.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w2) * 4);
473                                         v.w.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w3) * 4);
474                                 }
475                         }
476                         break;
477                 case STREAMTYPE_INDICES:
478                         {
479                                 v.x.x = *Pointer<Float>(source0);
480                                 v.x.y = *Pointer<Float>(source1);
481                                 v.x.z = *Pointer<Float>(source2);
482                                 v.x.w = *Pointer<Float>(source3);
483                         }
484                         break;
485                 case STREAMTYPE_2_10_10_10_INT:
486                         {
487                                 Int4 src;
488                                 src = Insert(src, *Pointer<Int>(source0), 0);
489                                 src = Insert(src, *Pointer<Int>(source1), 1);
490                                 src = Insert(src, *Pointer<Int>(source2), 2);
491                                 src = Insert(src, *Pointer<Int>(source3), 3);
492
493                                 v.x = Float4((src << 22) >> 22);
494                                 v.y = Float4((src << 12) >> 22);
495                                 v.z = Float4((src << 02) >> 22);
496                                 v.w = Float4(src >> 30);
497
498                                 if(stream.normalized)
499                                 {
500                                         v.x = Max(v.x * Float4(1.0f / 0x1FF), Float4(-1.0f));
501                                         v.y = Max(v.y * Float4(1.0f / 0x1FF), Float4(-1.0f));
502                                         v.z = Max(v.z * Float4(1.0f / 0x1FF), Float4(-1.0f));
503                                         v.w = Max(v.w, Float4(-1.0f));
504                                 }
505                         }
506                         break;
507                 case STREAMTYPE_2_10_10_10_UINT:
508                         {
509                                 Int4 src;
510                                 src = Insert(src, *Pointer<Int>(source0), 0);
511                                 src = Insert(src, *Pointer<Int>(source1), 1);
512                                 src = Insert(src, *Pointer<Int>(source2), 2);
513                                 src = Insert(src, *Pointer<Int>(source3), 3);
514
515                                 v.x = Float4(src & Int4(0x3FF));
516                                 v.y = Float4((src >> 10) & Int4(0x3FF));
517                                 v.z = Float4((src >> 20) & Int4(0x3FF));
518                                 v.w = Float4((src >> 30) & Int4(0x3));
519
520                                 if(stream.normalized)
521                                 {
522                                         v.x *= Float4(1.0f / 0x3FF);
523                                         v.y *= Float4(1.0f / 0x3FF);
524                                         v.z *= Float4(1.0f / 0x3FF);
525                                         v.w *= Float4(1.0f / 0x3);
526                                 }
527                         }
528                         break;
529                 default:
530                         ASSERT(false);
531                 }
532
533                 if(stream.count < 1) v.x = Float4(0.0f);
534                 if(stream.count < 2) v.y = Float4(0.0f);
535                 if(stream.count < 3) v.z = Float4(0.0f);
536                 if(stream.count < 4) v.w = Float4(1.0f);
537
538                 return v;
539         }
540
541         void VertexRoutine::postTransform()
542         {
                    // Adjusts the position output register, o[state.positionRegister], in place.
                    // Which adjustments are emitted is decided by the state flags and the two
                    // global settings tested below; every other output register is left untouched.
543                 int pos = state.positionRegister;
544
545                 // Backtransform
546                 if(state.preTransformed)
547                 {
                            // Reciprocal of the incoming w. It must be taken before o[pos].w is
                            // overwritten at the end of this branch.
548                         Float4 rhw = Float4(1.0f) / o[pos].w;
549
                            // The DrawData values are loaded and scaled by 1/16.
                            // NOTE(review): the "x16" suffix suggests they are stored pre-multiplied
                            // by 16 (width, height, left, top of the viewport) - confirm against DrawData.
550                         Float4 W = *Pointer<Float4>(data + OFFSET(DrawData,Wx16)) * Float4(1.0f / 16.0f);
551                         Float4 H = *Pointer<Float4>(data + OFFSET(DrawData,Hx16)) * Float4(1.0f / 16.0f);
552                         Float4 L = *Pointer<Float4>(data + OFFSET(DrawData,X0x16)) * Float4(1.0f / 16.0f);
553                         Float4 T = *Pointer<Float4>(data + OFFSET(DrawData,Y0x16)) * Float4(1.0f / 16.0f);
554
                            // x and y: subtract the origin, divide by the extent, then scale by rhw.
                            // z is scaled by rhw, and w itself is replaced by rhw.
555                         o[pos].x = (o[pos].x - L) / W * rhw;
556                         o[pos].y = (o[pos].y - T) / H * rhw;
557                         o[pos].z = o[pos].z * rhw;
558                         o[pos].w = rhw;
559                 }
560
                    // Only when pixel centers are at integer coordinates (halfIntegerCoordinates
                    // is false) and the position was not pre-transformed: offset x and y by
                    // halfPixelX / halfPixelY, each multiplied by w.
561                 if(!halfIntegerCoordinates && !state.preTransformed)
562                 {
563                         o[pos].x = o[pos].x + *Pointer<Float4>(data + OFFSET(DrawData,halfPixelX)) * o[pos].w;
564                         o[pos].y = o[pos].y + *Pointer<Float4>(data + OFFSET(DrawData,halfPixelY)) * o[pos].w;
565                 }
566
                    // With super-sampling, x and y get a further offset (XXXX / YYYY), again
                    // multiplied by w.
                    // NOTE(review): presumably a per-pass sub-sample offset - confirm against DrawData.
567                 if(state.superSampling)
568                 {
569                         o[pos].x = o[pos].x + *Pointer<Float4>(data + OFFSET(DrawData,XXXX)) * o[pos].w;
570                         o[pos].y = o[pos].y + *Pointer<Float4>(data + OFFSET(DrawData,YYYY)) * o[pos].w;
571                 }
572
                    // When depth is normalized to [-1, 1] and a shader (not fixed-function)
                    // produced the position: z = (z + w) / 2, which takes z from [-w, w] to [0, w].
573                 if(symmetricNormalizedDepth && !state.fixedFunction)
574                 {
575                         o[pos].z = (o[pos].z + o[pos].w) * Float4(0.5f);
576                 }
577         }
578
579         void VertexRoutine::writeCache(Pointer<Byte> &cacheLine)
580         {
                    // Stores the current outputs into four consecutive Vertex-sized slots that
                    // start at cacheLine (offsets sizeof(Vertex) * 0..3). Three things are written:
                    //   1. every output register whose state.output[i].write mask is non-zero,
                    //      after optional per-component clamping to [0, 1];
                    //   2. one byte of clipFlags per slot;
                    //   3. the position register, divided by w and mapped through the
                    //      X0x16/Wx16 and Y0x16/Hx16 values from DrawData, at Vertex::X.
581                 Vector4f v;
582
583                 for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
584                 {
585                         if(state.output[i].write)
586                         {
                                    // Work on a local copy so that clamping and transposing do
                                    // not modify o[i].
587                                 v.x = o[i].x;
588                                 v.y = o[i].y;
589                                 v.z = o[i].z;
590                                 v.w = o[i].w;
591
                                    // Each component is clamped to [0, 1] only if its flag is set.
592                                 if(state.output[i].xClamp)
593                                 {
594                                         v.x = Max(v.x, Float4(0.0f));
595                                         v.x = Min(v.x, Float4(1.0f));
596                                 }
597
598                                 if(state.output[i].yClamp)
599                                 {
600                                         v.y = Max(v.y, Float4(0.0f));
601                                         v.y = Min(v.y, Float4(1.0f));
602                                 }
603
604                                 if(state.output[i].zClamp)
605                                 {
606                                         v.z = Max(v.z, Float4(0.0f));
607                                         v.z = Min(v.z, Float4(1.0f));
608                                 }
609
610                                 if(state.output[i].wClamp)
611                                 {
612                                         v.w = Max(v.w, Float4(0.0f));
613                                         v.w = Min(v.w, Float4(1.0f));
614                                 }
615
                                    // Mask 0x01: only the x component is stored, as one scalar per
                                    // slot. Lanes x, y, z, w of v.x go to slots 0, 1, 2, 3.
616                                 if(state.output[i].write == 0x01)
617                                 {
618                                         *Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0) = v.x.x;
619                                         *Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1) = v.x.y;
620                                         *Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2) = v.x.z;
621                                         *Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3) = v.x.w;
622                                 }
623                                 else
624                                 {
                                            // Mask 0x02 uses transpose2x4, every other mask uses
                                            // transpose4x4.
                                            // NOTE(review): presumably this converts one-register-per-
                                            // component into one-register-per-slot; the helpers are
                                            // defined elsewhere - confirm.
625                                         if(state.output[i].write == 0x02)
626                                         {
627                                                 transpose2x4(v.x, v.y, v.z, v.w);
628                                         }
629                                         else
630                                         {
631                                                 transpose4x4(v.x, v.y, v.z, v.w);
632                                         }
633
                                            // After the transpose, v.x..v.w are stored to slots 0..3.
                                            // NOTE(review): the trailing 16 is presumably the pointer
                                            // alignment in bytes - confirm against Reactor's Pointer<>.
634                                         *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0, 16) = v.x;
635                                         *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1, 16) = v.y;
636                                         *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2, 16) = v.z;
637                                         *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3, 16) = v.w;
638                                 }
639                         }
640                 }
641
                    // clipFlags is split one byte per slot: bits 0-7 go to slot 0, 8-15 to
                    // slot 1, 16-23 to slot 2 and 24-31 to slot 3.
642                 *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 0) = (clipFlags >> 0)  & 0x0000000FF;
643                 *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 1) = (clipFlags >> 8)  & 0x0000000FF;
644                 *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 2) = (clipFlags >> 16) & 0x0000000FF;
645                 *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 3) = (clipFlags >> 24) & 0x0000000FF;
646
647                 int pos = state.positionRegister;
648
649                 v.x = o[pos].x;
650                 v.y = o[pos].y;
651                 v.z = o[pos].z;
652                 v.w = o[pos].w;
653
                    // Guard the divide below: in lanes where w compares equal to 0.0f, the bit
                    // pattern of 1.0f is OR-ed into w. All other lanes are unchanged.
654                 Float4 w = As<Float4>(As<Int4>(v.w) | (As<Int4>(CmpEQ(v.w, Float4(0.0f))) & As<Int4>(Float4(1.0f))));
655                 Float4 rhw = Float4(1.0f) / w;
656
                    // x and y become X0x16 + x/w * Wx16 (and the Y/H equivalent), rounded to
                    // integers. As<Float4> reinterprets those integer bits; it does not convert
                    // them back to floating point.
                    // NOTE(review): the "x16" suffix and the 1/16 scaling in postTransform suggest
                    // the result is in 1/16-pixel units - confirm.
657                 v.x = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,X0x16)) + v.x * rhw * *Pointer<Float4>(data + OFFSET(DrawData,Wx16))));
658                 v.y = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,Y0x16)) + v.y * rhw * *Pointer<Float4>(data + OFFSET(DrawData,Hx16))));
659                 v.z = v.z * rhw;
660                 v.w = rhw;
661
662                 transpose4x4(v.x, v.y, v.z, v.w);
663
                    // After the transpose, v.x..v.w are stored to Vertex::X of slots 0..3.
664                 *Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 0, 16) = v.x;
665                 *Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 1, 16) = v.y;
666                 *Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 2, 16) = v.z;
667                 *Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 3, 16) = v.w;
668         }
669
670         void VertexRoutine::writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cache)
671         {
                    // Copies a single Vertex from `cache` (source) to `vertex` (destination).
                    // Only fields that were actually produced are copied: the output registers
                    // with a non-zero write mask, the position at Vertex::X, and clipFlags.
672                 for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
673                 {
674                         if(state.output[i].write)
675                         {
                                    // All four components (one Int4) are copied, whatever the
                                    // write mask value is.
676                                 *Pointer<Int4>(vertex + OFFSET(Vertex,v[i]), 16) = *Pointer<Int4>(cache + OFFSET(Vertex,v[i]), 16);
677                         }
678                 }
679
                    // NOTE(review): unlike the output copies above, these two accesses pass no
                    // explicit 16 argument to Pointer<> - confirm whether that is intentional.
680                 *Pointer<Int4>(vertex + OFFSET(Vertex,X)) = *Pointer<Int4>(cache + OFFSET(Vertex,X));
681                 *Pointer<Int>(vertex + OFFSET(Vertex,clipFlags)) = *Pointer<Int>(cache + OFFSET(Vertex,clipFlags));
682         }
683 }