OSDN Git Service

Add support for quad primitives.
[android-x86/external-swiftshader.git] / src / Shader / VertexRoutine.cpp
1 // SwiftShader Software Renderer
2 //
3 // Copyright(c) 2005-2012 TransGaming Inc.
4 //
5 // All rights reserved. No part of this software may be copied, distributed, transmitted,
6 // transcribed, stored in a retrieval system, translated into any human or computer
7 // language by any means, or disclosed to third parties without the explicit written
8 // agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express
9 // or implied, including but not limited to any patent rights, are granted to you.
10 //
11
12 #include "VertexRoutine.hpp"
13
14 #include "VertexShader.hpp"
15 #include "Vertex.hpp"
16 #include "Half.hpp"
17 #include "Renderer.hpp"
18 #include "Constants.hpp"
19 #include "Debug.hpp"
20
21 namespace sw
22 {
23         extern bool halfIntegerCoordinates;     // Pixel centers are not at integer coordinates
24         extern bool symmetricNormalizedDepth;   // [-1, 1] instead of [0, 1]
25
26         VertexRoutine::VertexRoutine(const VertexProcessor::State &state, const VertexShader *shader) : state(state), shader(shader)
27         {
28                 routine = 0;
29         }
30
31         VertexRoutine::~VertexRoutine()
32         {
33         }
34
35         void VertexRoutine::generate()
36         {
37                 Function<Void, Pointer<Byte>, Pointer<Byte>, Pointer<Byte>, Pointer<Byte> > function;
38                 {
39                         Pointer<Byte> vertex(function.arg(0));
40                         Pointer<Byte> batch(function.arg(1));
41                         Pointer<Byte> task(function.arg(2));
42                         Pointer<Byte> data(function.arg(3));
43
44                         const bool texldl = state.shaderContainsTexldl;
45
46                         Pointer<Byte> cache = task + OFFSET(VertexTask,vertexCache);
47                         Pointer<Byte> vertexCache = cache + OFFSET(VertexCache,vertex);
48                         Pointer<Byte> tagCache = cache + OFFSET(VertexCache,tag);
49
50                         UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask,vertexCount));
51
52                         Registers r(shader);
53                         r.data = data;
54                         r.constants = *Pointer<Pointer<Byte> >(data + OFFSET(DrawData,constants));
55
56                         Do
57                         {
58                                 UInt index = *Pointer<UInt>(batch);
59                                 UInt tagIndex = index & 0x0000003C;
60                                 UInt indexQ = !texldl ? UInt(index & 0xFFFFFFFC) : index;   // FIXME: TEXLDL hack to have independent LODs, hurts performance.
61
62                                 If(*Pointer<UInt>(tagCache + tagIndex) != indexQ)
63                                 {
64                                         *Pointer<UInt>(tagCache + tagIndex) = indexQ;
65
66                                         readInput(r, indexQ);
67                                         pipeline(r);
68                                         postTransform(r);
69                                         computeClipFlags(r);
70
71                                         Pointer<Byte> cacheLine0 = vertexCache + tagIndex * UInt((int)sizeof(Vertex));
72                                         writeCache(cacheLine0, r);
73                                 }
74
75                                 UInt cacheIndex = index & 0x0000003F;
76                                 Pointer<Byte> cacheLine = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
77                                 writeVertex(vertex, cacheLine);
78
79                                 vertex += sizeof(Vertex);
80                                 batch += sizeof(unsigned int);
81                                 vertexCount--;
82                         }
83                         Until(vertexCount == 0)
84
85                         Return();
86                 }
87
88                 routine = function(L"VertexRoutine_%0.8X", state.shaderID);
89         }
90
91         Routine *VertexRoutine::getRoutine()
92         {
93                 return routine;
94         }
95
96         void VertexRoutine::readInput(Registers &r, UInt &index)
97         {
98                 for(int i = 0; i < 16; i++)
99                 {
100                         Pointer<Byte> input = *Pointer<Pointer<Byte> >(r.data + OFFSET(DrawData,input) + sizeof(void*) * i);
101                         UInt stride = *Pointer<UInt>(r.data + OFFSET(DrawData,stride) + sizeof(unsigned int) * i);
102
103                         r.v[i] = readStream(r, input, stride, state.input[i], index);
104                 }
105         }
106
107         void VertexRoutine::computeClipFlags(Registers &r)
108         {
109                 int pos = state.positionRegister;
110
111                 // Backtransform
112                 if(state.preTransformed)
113                 {
114                         Float4 rhw = Float4(1.0f) / r.o[pos].w;
115
116                         Float4 W = *Pointer<Float4>(r.data + OFFSET(DrawData,Wx16)) * Float4(1.0f / 16.0f);
117                         Float4 H = *Pointer<Float4>(r.data + OFFSET(DrawData,Hx16)) * Float4(1.0f / 16.0f);
118                         Float4 L = *Pointer<Float4>(r.data + OFFSET(DrawData,X0x16)) * Float4(1.0f / 16.0f);
119                         Float4 T = *Pointer<Float4>(r.data + OFFSET(DrawData,Y0x16)) * Float4(1.0f / 16.0f);
120
121                         r.o[pos].x = (r.o[pos].x - L) / W * rhw;
122                         r.o[pos].y = (r.o[pos].y - T) / H * rhw;
123                         r.o[pos].z = r.o[pos].z * rhw;
124                         r.o[pos].w = rhw;
125                 }
126
127                 if(state.superSampling)
128                 {
129                         r.o[pos].x = r.o[pos].x + *Pointer<Float4>(r.data + OFFSET(DrawData,XXXX)) * r.o[pos].w;
130                         r.o[pos].y = r.o[pos].y + *Pointer<Float4>(r.data + OFFSET(DrawData,YYYY)) * r.o[pos].w;
131                 }
132
133                 Float4 clipX = r.o[pos].x;
134                 Float4 clipY = r.o[pos].y;
135
136                 if(state.multiSampling)   // Clip at pixel edges instead of pixel centers
137                 {
138                         clipX += *Pointer<Float4>(r.data + OFFSET(DrawData,halfPixelX)) * r.o[pos].w;
139                         clipY += *Pointer<Float4>(r.data + OFFSET(DrawData,halfPixelY)) * r.o[pos].w;
140                 }
141
142                 Int4 maxX = CmpLT(r.o[pos].w, clipX);
143                 Int4 maxY = CmpLT(r.o[pos].w, clipY);
144                 Int4 maxZ = CmpLT(r.o[pos].w, r.o[pos].z);
145
146                 Int4 minX = CmpNLE(-r.o[pos].w, clipX);
147                 Int4 minY = CmpNLE(-r.o[pos].w, clipY);
148                 Int4 minZ = CmpNLE(Float4(0.0f), r.o[pos].z);
149
150                 Int flags;
151
152                 flags = SignMask(maxX);
153                 r.clipFlags = *Pointer<Int>(r.constants + OFFSET(Constants,maxX) + flags * 4);   // FIXME: Array indexing
154                 flags = SignMask(maxY);
155                 r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,maxY) + flags * 4);
156                 flags = SignMask(maxZ);
157                 r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,maxZ) + flags * 4);
158                 flags = SignMask(minX);
159                 r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,minX) + flags * 4);
160                 flags = SignMask(minY);
161                 r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,minY) + flags * 4);
162                 flags = SignMask(minZ);
163                 r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,minZ) + flags * 4);
164
165                 Int4 finiteX = CmpLE(Abs(r.o[pos].x), *Pointer<Float4>(r.constants + OFFSET(Constants,maxPos)));
166                 Int4 finiteY = CmpLE(Abs(r.o[pos].y), *Pointer<Float4>(r.constants + OFFSET(Constants,maxPos)));
167                 Int4 finiteZ = CmpLE(Abs(r.o[pos].z), *Pointer<Float4>(r.constants + OFFSET(Constants,maxPos)));
168
169                 flags = SignMask(finiteX & finiteY & finiteZ);
170                 r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,fini) + flags * 4);
171
172                 if(state.preTransformed)
173                 {
174                         r.clipFlags &= 0xFBFBFBFB;   // Don't clip against far clip plane
175                 }
176         }
177
178         Vector4f VertexRoutine::readStream(Registers &r, Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index)
179         {
180                 const bool texldl = state.shaderContainsTexldl;
181
182                 Vector4f v;
183
184                 Pointer<Byte> source0 = buffer + index * stride;
185                 Pointer<Byte> source1 = source0 + (!texldl ? stride : 0);
186                 Pointer<Byte> source2 = source1 + (!texldl ? stride : 0);
187                 Pointer<Byte> source3 = source2 + (!texldl ? stride : 0);
188
189                 switch(stream.type)
190                 {
191                 case STREAMTYPE_FLOAT:
192                         {
193                                 if(stream.count == 0)
194                                 {
195                                         // Null stream, all default components
196                                 }
197                                 else if(stream.count == 1)
198                                 {
199                                         v.x.x = *Pointer<Float>(source0);
200                                         v.x.y = *Pointer<Float>(source1);
201                                         v.x.z = *Pointer<Float>(source2);
202                                         v.x.w = *Pointer<Float>(source3);
203                                 }
204                                 else
205                                 {
206                                         v.x = *Pointer<Float4>(source0);
207                                         v.y = *Pointer<Float4>(source1);
208                                         v.z = *Pointer<Float4>(source2);
209                                         v.w = *Pointer<Float4>(source3);
210
211                                         transpose4xN(v.x, v.y, v.z, v.w, stream.count);
212                                 }
213                         }
214                         break;
215                 case STREAMTYPE_BYTE:
216                         {
217                                 v.x = Float4(*Pointer<Byte4>(source0));
218                                 v.y = Float4(*Pointer<Byte4>(source1));
219                                 v.z = Float4(*Pointer<Byte4>(source2));
220                                 v.w = Float4(*Pointer<Byte4>(source3));
221
222                                 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
223
224                                 if(stream.normalized)
225                                 {
226                                         if(stream.count >= 1) v.x *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
227                                         if(stream.count >= 2) v.y *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
228                                         if(stream.count >= 3) v.z *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
229                                         if(stream.count >= 4) v.w *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
230                                 }
231                         }
232                         break;
233                 case STREAMTYPE_SBYTE:
234                         {
235                                 v.x = Float4(*Pointer<SByte4>(source0));
236                                 v.y = Float4(*Pointer<SByte4>(source1));
237                                 v.z = Float4(*Pointer<SByte4>(source2));
238                                 v.w = Float4(*Pointer<SByte4>(source3));
239
240                                 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
241
242                                 if(stream.normalized)
243                                 {
244                                         if(stream.count >= 1) v.x *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleSByte));
245                                         if(stream.count >= 2) v.y *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleSByte));
246                                         if(stream.count >= 3) v.z *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleSByte));
247                                         if(stream.count >= 4) v.w *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleSByte));
248                                 }
249                         }
250                         break;
251                 case STREAMTYPE_COLOR:
252                         {
253                                 v.x = Float4(*Pointer<Byte4>(source0)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
254                                 v.y = Float4(*Pointer<Byte4>(source1)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
255                                 v.z = Float4(*Pointer<Byte4>(source2)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
256                                 v.w = Float4(*Pointer<Byte4>(source3)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
257
258                                 transpose4x4(v.x, v.y, v.z, v.w);
259
260                                 // Swap red and blue
261                                 Float4 t = v.x;
262                                 v.x = v.z;
263                                 v.z = t;
264                         }
265                         break;
266                 case STREAMTYPE_SHORT:
267                         {
268                                 v.x = Float4(*Pointer<Short4>(source0));
269                                 v.y = Float4(*Pointer<Short4>(source1));
270                                 v.z = Float4(*Pointer<Short4>(source2));
271                                 v.w = Float4(*Pointer<Short4>(source3));
272                         
273                                 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
274
275                                 if(stream.normalized)
276                                 {
277                                         if(stream.count >= 1) v.x *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleShort));
278                                         if(stream.count >= 2) v.y *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleShort));
279                                         if(stream.count >= 3) v.z *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleShort));
280                                         if(stream.count >= 4) v.w *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleShort));
281                                 }                       
282                         }
283                         break;
284                 case STREAMTYPE_USHORT:
285                         {
286                                 v.x = Float4(*Pointer<UShort4>(source0));
287                                 v.y = Float4(*Pointer<UShort4>(source1));
288                                 v.z = Float4(*Pointer<UShort4>(source2));
289                                 v.w = Float4(*Pointer<UShort4>(source3));
290                         
291                                 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
292
293                                 if(stream.normalized)
294                                 {
295                                         if(stream.count >= 1) v.x *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleUShort));
296                                         if(stream.count >= 2) v.y *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleUShort));
297                                         if(stream.count >= 3) v.z *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleUShort));
298                                         if(stream.count >= 4) v.w *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleUShort));
299                                 }
300                         }
301                         break;
302                 case STREAMTYPE_UDEC3:
303                         {
304                                 // FIXME: Vectorize
305                                 {
306                                         Int x, y, z;
307                                         
308                                         x = y = z = *Pointer<Int>(source0);
309
310                                         v.x.x = Float(x & 0x000003FF);
311                                         v.x.y = Float(y & 0x000FFC00);
312                                         v.x.z = Float(z & 0x3FF00000);
313                                 }
314
315                                 {
316                                         Int x, y, z;
317                                         
318                                         x = y = z = *Pointer<Int>(source1);
319
320                                         v.y.x = Float(x & 0x000003FF);
321                                         v.y.y = Float(y & 0x000FFC00);
322                                         v.y.z = Float(z & 0x3FF00000);
323                                 }
324
325                                 {
326                                         Int x, y, z;
327                                         
328                                         x = y = z = *Pointer<Int>(source2);
329
330                                         v.z.x = Float(x & 0x000003FF);
331                                         v.z.y = Float(y & 0x000FFC00);
332                                         v.z.z = Float(z & 0x3FF00000);
333                                 }
334
335                                 {
336                                         Int x, y, z;
337                                         
338                                         x = y = z = *Pointer<Int>(source3);
339
340                                         v.w.x = Float(x & 0x000003FF);
341                                         v.w.y = Float(y & 0x000FFC00);
342                                         v.w.z = Float(z & 0x3FF00000);
343                                 }
344
345                                 transpose4x3(v.x, v.y, v.z, v.w);
346
347                                 v.y *= Float4(1.0f / 0x00000400);
348                                 v.z *= Float4(1.0f / 0x00100000);
349                         }
350                         break;
351                 case STREAMTYPE_DEC3N:
352                         {
353                                 // FIXME: Vectorize
354                                 {
355                                         Int x, y, z;
356                                         
357                                         x = y = z = *Pointer<Int>(source0);
358
359                                         v.x.x = Float((x << 22) & 0xFFC00000);
360                                         v.x.y = Float((y << 12) & 0xFFC00000);
361                                         v.x.z = Float((z << 2)  & 0xFFC00000);
362                                 }
363
364                                 {
365                                         Int x, y, z;
366                                         
367                                         x = y = z = *Pointer<Int>(source1);
368
369                                         v.y.x = Float((x << 22) & 0xFFC00000);
370                                         v.y.y = Float((y << 12) & 0xFFC00000);
371                                         v.y.z = Float((z << 2)  & 0xFFC00000);
372                                 }
373
374                                 {
375                                         Int x, y, z;
376                                         
377                                         x = y = z = *Pointer<Int>(source2);
378
379                                         v.z.x = Float((x << 22) & 0xFFC00000);
380                                         v.z.y = Float((y << 12) & 0xFFC00000);
381                                         v.z.z = Float((z << 2)  & 0xFFC00000);
382                                 }
383
384                                 {
385                                         Int x, y, z;
386                                         
387                                         x = y = z = *Pointer<Int>(source3);
388
389                                         v.w.x = Float((x << 22) & 0xFFC00000);
390                                         v.w.y = Float((y << 12) & 0xFFC00000);
391                                         v.w.z = Float((z << 2)  & 0xFFC00000);
392                                 }
393
394                                 transpose4x3(v.x, v.y, v.z, v.w);
395
396                                 v.x *= Float4(1.0f / 0x00400000 / 511.0f);
397                                 v.y *= Float4(1.0f / 0x00400000 / 511.0f);
398                                 v.z *= Float4(1.0f / 0x00400000 / 511.0f);
399                         }
400                         break;
401                 case STREAMTYPE_FIXED:
402                         {
403                                 v.x = Float4(*Pointer<Int4>(source0)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleFixed));
404                                 v.y = Float4(*Pointer<Int4>(source1)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleFixed));
405                                 v.z = Float4(*Pointer<Int4>(source2)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleFixed));
406                                 v.w = Float4(*Pointer<Int4>(source3)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleFixed));
407
408                                 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
409                         }
410                         break;
411                 case STREAMTYPE_HALF:
412                         {
413                                 if(stream.count >= 1)
414                                 {
415                                         UShort x0 = *Pointer<UShort>(source0 + 0);
416                                         UShort x1 = *Pointer<UShort>(source1 + 0);
417                                         UShort x2 = *Pointer<UShort>(source2 + 0);
418                                         UShort x3 = *Pointer<UShort>(source3 + 0);
419
420                                         v.x.x = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(x0) * 4);
421                                         v.x.y = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(x1) * 4);
422                                         v.x.z = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(x2) * 4);
423                                         v.x.w = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(x3) * 4);
424                                 }
425
426                                 if(stream.count >= 2)
427                                 {
428                                         UShort y0 = *Pointer<UShort>(source0 + 2);
429                                         UShort y1 = *Pointer<UShort>(source1 + 2);
430                                         UShort y2 = *Pointer<UShort>(source2 + 2);
431                                         UShort y3 = *Pointer<UShort>(source3 + 2);
432
433                                         v.y.x = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(y0) * 4);
434                                         v.y.y = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(y1) * 4);
435                                         v.y.z = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(y2) * 4);
436                                         v.y.w = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(y3) * 4);
437                                 }
438
439                                 if(stream.count >= 3)
440                                 {
441                                         UShort z0 = *Pointer<UShort>(source0 + 4);
442                                         UShort z1 = *Pointer<UShort>(source1 + 4);
443                                         UShort z2 = *Pointer<UShort>(source2 + 4);
444                                         UShort z3 = *Pointer<UShort>(source3 + 4);
445
446                                         v.z.x = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(z0) * 4);
447                                         v.z.y = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(z1) * 4);
448                                         v.z.z = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(z2) * 4);
449                                         v.z.w = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(z3) * 4);
450                                 }
451
452                                 if(stream.count >= 4)
453                                 {
454                                         UShort w0 = *Pointer<UShort>(source0 + 6);
455                                         UShort w1 = *Pointer<UShort>(source1 + 6);
456                                         UShort w2 = *Pointer<UShort>(source2 + 6);
457                                         UShort w3 = *Pointer<UShort>(source3 + 6);
458
459                                         v.w.x = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(w0) * 4);
460                                         v.w.y = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(w1) * 4);
461                                         v.w.z = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(w2) * 4);
462                                         v.w.w = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(w3) * 4);
463                                 }
464                         }
465                         break;
466                 case STREAMTYPE_INDICES:
467                         {
468                                 v.x.x = *Pointer<Float>(source0);
469                                 v.x.y = *Pointer<Float>(source1);
470                                 v.x.z = *Pointer<Float>(source2);
471                                 v.x.w = *Pointer<Float>(source3);
472                         }
473                         break;
474                 default:
475                         ASSERT(false);
476                 }
477
478                 if(stream.count < 1) v.x = Float4(0.0f);
479                 if(stream.count < 2) v.y = Float4(0.0f);
480                 if(stream.count < 3) v.z = Float4(0.0f);
481                 if(stream.count < 4) v.w = Float4(1.0f);
482
483                 return v;
484         }
485
486         void VertexRoutine::postTransform(Registers &r)
487         {
488                 int pos = state.positionRegister;
489
490                 if(halfIntegerCoordinates)
491                 {
492                         r.o[pos].x = r.o[pos].x - *Pointer<Float4>(r.data + OFFSET(DrawData,halfPixelX)) * r.o[pos].w;
493                         r.o[pos].y = r.o[pos].y - *Pointer<Float4>(r.data + OFFSET(DrawData,halfPixelY)) * r.o[pos].w;
494                 }
495
496                 if(symmetricNormalizedDepth)
497                 {
498                         r.o[pos].z = (r.o[pos].z + r.o[pos].w) * Float4(0.5f);
499                 }
500         }
501
502         void VertexRoutine::writeCache(Pointer<Byte> &cacheLine, Registers &r)
503         {
504                 Vector4f v;
505
506                 for(int i = 0; i < 12; i++)
507                 {
508                         if(state.output[i].write)
509                         {
510                                 v.x = r.o[i].x;
511                                 v.y = r.o[i].y;
512                                 v.z = r.o[i].z;
513                                 v.w = r.o[i].w;
514
515                                 if(state.output[i].xClamp)
516                                 {
517                                         v.x = Max(v.x, Float4(0.0f));
518                                         v.x = Min(v.x, Float4(1.0f));
519                                 }
520
521                                 if(state.output[i].yClamp)
522                                 {
523                                         v.y = Max(v.y, Float4(0.0f));
524                                         v.y = Min(v.y, Float4(1.0f));
525                                 }
526
527                                 if(state.output[i].zClamp)
528                                 {
529                                         v.z = Max(v.z, Float4(0.0f));
530                                         v.z = Min(v.z, Float4(1.0f));
531                                 }
532
533                                 if(state.output[i].wClamp)
534                                 {
535                                         v.w = Max(v.w, Float4(0.0f));
536                                         v.w = Min(v.w, Float4(1.0f));
537                                 }
538
539                                 if(state.output[i].write == 0x01)
540                                 {
541                                         *Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0) = v.x.x;
542                                         *Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1) = v.x.y;
543                                         *Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2) = v.x.z;
544                                         *Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3) = v.x.w;
545                                 }
546                                 else
547                                 {
548                                         if(state.output[i].write == 0x02)
549                                         {
550                                                 transpose2x4(v.x, v.y, v.z, v.w);
551                                         }
552                                         else
553                                         {
554                                                 transpose4x4(v.x, v.y, v.z, v.w);
555                                         }
556
557                                         *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0, 16) = v.x;
558                                         *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1, 16) = v.y;
559                                         *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2, 16) = v.z;
560                                         *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3, 16) = v.w;
561                                 }
562                         }
563                 }
564
565                 *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 0) = (r.clipFlags >> 0)  & 0x0000000FF;   // FIXME: unsigned char Vertex::clipFlags
566                 *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 1) = (r.clipFlags >> 8)  & 0x0000000FF;
567                 *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 2) = (r.clipFlags >> 16) & 0x0000000FF;
568                 *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 3) = (r.clipFlags >> 24) & 0x0000000FF;
569
570                 int pos = state.positionRegister;
571
572                 v.x = r.o[pos].x;
573                 v.y = r.o[pos].y;
574                 v.z = r.o[pos].z;
575                 v.w = r.o[pos].w;
576
577                 Float4 w = As<Float4>(As<Int4>(v.w) | (As<Int4>(CmpEQ(v.w, Float4(0.0f))) & As<Int4>(Float4(1.0f))));
578                 Float4 rhw = Float4(1.0f) / w;
579
580                 v.x = As<Float4>(RoundInt(*Pointer<Float4>(r.data + OFFSET(DrawData,X0x16)) + v.x * rhw * *Pointer<Float4>(r.data + OFFSET(DrawData,Wx16))));
581                 v.y = As<Float4>(RoundInt(*Pointer<Float4>(r.data + OFFSET(DrawData,Y0x16)) + v.y * rhw * *Pointer<Float4>(r.data + OFFSET(DrawData,Hx16))));
582                 v.z = v.z * rhw;
583                 v.w = rhw;
584
585                 transpose4x4(v.x, v.y, v.z, v.w);
586
587                 *Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 0, 16) = v.x;
588                 *Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 1, 16) = v.y;
589                 *Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 2, 16) = v.z;
590                 *Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 3, 16) = v.w;
591         }
592
593         void VertexRoutine::writeVertex(Pointer<Byte> &vertex, Pointer<Byte> &cache)
594         {
595                 for(int i = 0; i < 12; i++)
596                 {
597                         if(state.output[i].write)
598                         {
599                                 *Pointer<Float4>(vertex + OFFSET(Vertex,v[i])) = *Pointer<Float4>(cache + OFFSET(Vertex,v[i]));
600                         }
601                 }
602
603                 *Pointer<Int>(vertex + OFFSET(Vertex,clipFlags)) = *Pointer<Int>(cache + OFFSET(Vertex,clipFlags));
604                 *Pointer<Float4>(vertex + OFFSET(Vertex,X)) = *Pointer<Float4>(cache + OFFSET(Vertex,X));
605         }
606 }