OSDN Git Service

Make Function variadic and take a function signature.
[android-x86/external-swiftshader.git] / src / Shader / VertexRoutine.cpp
1 // SwiftShader Software Renderer
2 //
3 // Copyright(c) 2005-2012 TransGaming Inc.
4 //
5 // All rights reserved. No part of this software may be copied, distributed, transmitted,
6 // transcribed, stored in a retrieval system, translated into any human or computer
7 // language by any means, or disclosed to third parties without the explicit written
8 // agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express
9 // or implied, including but not limited to any patent rights, are granted to you.
10 //
11
12 #include "VertexRoutine.hpp"
13
14 #include "VertexShader.hpp"
15 #include "Vertex.hpp"
16 #include "Half.hpp"
17 #include "Renderer.hpp"
18 #include "Constants.hpp"
19 #include "Debug.hpp"
20
21 namespace sw
22 {
23         extern bool halfIntegerCoordinates;     // Pixel centers are not at integer coordinates
24         extern bool symmetricNormalizedDepth;   // [-1, 1] instead of [0, 1]
25
26         VertexRoutine::VertexRoutine(const VertexProcessor::State &state, const VertexShader *shader) : state(state), shader(shader)
27         {
28                 routine = 0;
29         }
30
31         VertexRoutine::~VertexRoutine()
32         {
33         }
34
35         void VertexRoutine::generate()
36         {
37                 Function<Void(Pointer<Byte>, Pointer<Byte>, Pointer<Byte>, Pointer<Byte>)> function;
38                 {
39                         Pointer<Byte> vertex(function.arg(0));
40                         Pointer<Byte> batch(function.arg(1));
41                         Pointer<Byte> task(function.arg(2));
42                         Pointer<Byte> data(function.arg(3));
43
44                         const bool texldl = state.shaderContainsTexldl;
45
46                         Pointer<Byte> cache = task + OFFSET(VertexTask,vertexCache);
47                         Pointer<Byte> vertexCache = cache + OFFSET(VertexCache,vertex);
48                         Pointer<Byte> tagCache = cache + OFFSET(VertexCache,tag);
49
50                         UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask,vertexCount));
51
52                         Registers r(shader);
53                         r.data = data;
54                         r.constants = *Pointer<Pointer<Byte> >(data + OFFSET(DrawData,constants));
55                         if(shader && shader->instanceIdDeclared)
56                         {
57                                 r.instanceID = *Pointer<Int>(data + OFFSET(DrawData, instanceID));
58                         }
59
60                         Do
61                         {
62                                 UInt index = *Pointer<UInt>(batch);
63                                 UInt tagIndex = index & 0x0000003C;
64                                 UInt indexQ = !texldl ? UInt(index & 0xFFFFFFFC) : index;   // FIXME: TEXLDL hack to have independent LODs, hurts performance.
65
66                                 If(*Pointer<UInt>(tagCache + tagIndex) != indexQ)
67                                 {
68                                         *Pointer<UInt>(tagCache + tagIndex) = indexQ;
69
70                                         readInput(r, indexQ);
71                                         pipeline(r);
72                                         postTransform(r);
73                                         computeClipFlags(r);
74
75                                         Pointer<Byte> cacheLine0 = vertexCache + tagIndex * UInt((int)sizeof(Vertex));
76                                         writeCache(cacheLine0, r);
77                                 }
78
79                                 UInt cacheIndex = index & 0x0000003F;
80                                 Pointer<Byte> cacheLine = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
81                                 writeVertex(vertex, cacheLine);
82
83                                 vertex += sizeof(Vertex);
84                                 batch += sizeof(unsigned int);
85                                 vertexCount--;
86                         }
87                         Until(vertexCount == 0)
88
89                         Return();
90                 }
91
92                 routine = function(L"VertexRoutine_%0.8X", state.shaderID);
93         }
94
95         Routine *VertexRoutine::getRoutine()
96         {
97                 return routine;
98         }
99
100         void VertexRoutine::readInput(Registers &r, UInt &index)
101         {
102                 for(int i = 0; i < VERTEX_ATTRIBUTES; i++)
103                 {
104                         Pointer<Byte> input = *Pointer<Pointer<Byte> >(r.data + OFFSET(DrawData,input) + sizeof(void*) * i);
105                         UInt stride = *Pointer<UInt>(r.data + OFFSET(DrawData,stride) + sizeof(unsigned int) * i);
106
107                         r.v[i] = readStream(r, input, stride, state.input[i], index);
108                 }
109         }
110
111         void VertexRoutine::computeClipFlags(Registers &r)
112         {
113                 int pos = state.positionRegister;
114
115                 Int4 maxX = CmpLT(r.o[pos].w, r.o[pos].x);
116                 Int4 maxY = CmpLT(r.o[pos].w, r.o[pos].y);
117                 Int4 maxZ = CmpLT(r.o[pos].w, r.o[pos].z);
118
119                 Int4 minX = CmpNLE(-r.o[pos].w, r.o[pos].x);
120                 Int4 minY = CmpNLE(-r.o[pos].w, r.o[pos].y);
121                 Int4 minZ = CmpNLE(Float4(0.0f), r.o[pos].z);
122
123                 Int flags;
124
125                 flags = SignMask(maxX);
126                 r.clipFlags = *Pointer<Int>(r.constants + OFFSET(Constants,maxX) + flags * 4);   // FIXME: Array indexing
127                 flags = SignMask(maxY);
128                 r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,maxY) + flags * 4);
129                 flags = SignMask(maxZ);
130                 r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,maxZ) + flags * 4);
131                 flags = SignMask(minX);
132                 r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,minX) + flags * 4);
133                 flags = SignMask(minY);
134                 r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,minY) + flags * 4);
135                 flags = SignMask(minZ);
136                 r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,minZ) + flags * 4);
137
138                 Int4 finiteX = CmpLE(Abs(r.o[pos].x), *Pointer<Float4>(r.constants + OFFSET(Constants,maxPos)));
139                 Int4 finiteY = CmpLE(Abs(r.o[pos].y), *Pointer<Float4>(r.constants + OFFSET(Constants,maxPos)));
140                 Int4 finiteZ = CmpLE(Abs(r.o[pos].z), *Pointer<Float4>(r.constants + OFFSET(Constants,maxPos)));
141
142                 flags = SignMask(finiteX & finiteY & finiteZ);
143                 r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,fini) + flags * 4);
144
145                 if(state.preTransformed)
146                 {
147                         r.clipFlags &= 0xFBFBFBFB;   // Don't clip against far clip plane
148                 }
149         }
150
151         Vector4f VertexRoutine::readStream(Registers &r, Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index)
152         {
153                 const bool texldl = state.shaderContainsTexldl;
154
155                 Vector4f v;
156
157                 Pointer<Byte> source0 = buffer + index * stride;
158                 Pointer<Byte> source1 = source0 + (!texldl ? stride : 0);
159                 Pointer<Byte> source2 = source1 + (!texldl ? stride : 0);
160                 Pointer<Byte> source3 = source2 + (!texldl ? stride : 0);
161
162                 switch(stream.type)
163                 {
164                 case STREAMTYPE_FLOAT:
165                         {
166                                 if(stream.count == 0)
167                                 {
168                                         // Null stream, all default components
169                                 }
170                                 else if(stream.count == 1)
171                                 {
172                                         v.x.x = *Pointer<Float>(source0);
173                                         v.x.y = *Pointer<Float>(source1);
174                                         v.x.z = *Pointer<Float>(source2);
175                                         v.x.w = *Pointer<Float>(source3);
176                                 }
177                                 else
178                                 {
179                                         v.x = *Pointer<Float4>(source0);
180                                         v.y = *Pointer<Float4>(source1);
181                                         v.z = *Pointer<Float4>(source2);
182                                         v.w = *Pointer<Float4>(source3);
183
184                                         transpose4xN(v.x, v.y, v.z, v.w, stream.count);
185                                 }
186                         }
187                         break;
188                 case STREAMTYPE_BYTE:
189                         {
190                                 v.x = Float4(*Pointer<Byte4>(source0));
191                                 v.y = Float4(*Pointer<Byte4>(source1));
192                                 v.z = Float4(*Pointer<Byte4>(source2));
193                                 v.w = Float4(*Pointer<Byte4>(source3));
194
195                                 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
196
197                                 if(stream.normalized)
198                                 {
199                                         if(stream.count >= 1) v.x *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
200                                         if(stream.count >= 2) v.y *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
201                                         if(stream.count >= 3) v.z *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
202                                         if(stream.count >= 4) v.w *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
203                                 }
204                         }
205                         break;
206                 case STREAMTYPE_SBYTE:
207                         {
208                                 v.x = Float4(*Pointer<SByte4>(source0));
209                                 v.y = Float4(*Pointer<SByte4>(source1));
210                                 v.z = Float4(*Pointer<SByte4>(source2));
211                                 v.w = Float4(*Pointer<SByte4>(source3));
212
213                                 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
214
215                                 if(stream.normalized)
216                                 {
217                                         if(stream.count >= 1) v.x *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleSByte));
218                                         if(stream.count >= 2) v.y *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleSByte));
219                                         if(stream.count >= 3) v.z *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleSByte));
220                                         if(stream.count >= 4) v.w *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleSByte));
221                                 }
222                         }
223                         break;
224                 case STREAMTYPE_COLOR:
225                         {
226                                 v.x = Float4(*Pointer<Byte4>(source0)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
227                                 v.y = Float4(*Pointer<Byte4>(source1)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
228                                 v.z = Float4(*Pointer<Byte4>(source2)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
229                                 v.w = Float4(*Pointer<Byte4>(source3)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
230
231                                 transpose4x4(v.x, v.y, v.z, v.w);
232
233                                 // Swap red and blue
234                                 Float4 t = v.x;
235                                 v.x = v.z;
236                                 v.z = t;
237                         }
238                         break;
239                 case STREAMTYPE_SHORT:
240                         {
241                                 v.x = Float4(*Pointer<Short4>(source0));
242                                 v.y = Float4(*Pointer<Short4>(source1));
243                                 v.z = Float4(*Pointer<Short4>(source2));
244                                 v.w = Float4(*Pointer<Short4>(source3));
245                         
246                                 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
247
248                                 if(stream.normalized)
249                                 {
250                                         if(stream.count >= 1) v.x *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleShort));
251                                         if(stream.count >= 2) v.y *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleShort));
252                                         if(stream.count >= 3) v.z *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleShort));
253                                         if(stream.count >= 4) v.w *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleShort));
254                                 }                       
255                         }
256                         break;
257                 case STREAMTYPE_USHORT:
258                         {
259                                 v.x = Float4(*Pointer<UShort4>(source0));
260                                 v.y = Float4(*Pointer<UShort4>(source1));
261                                 v.z = Float4(*Pointer<UShort4>(source2));
262                                 v.w = Float4(*Pointer<UShort4>(source3));
263                         
264                                 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
265
266                                 if(stream.normalized)
267                                 {
268                                         if(stream.count >= 1) v.x *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleUShort));
269                                         if(stream.count >= 2) v.y *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleUShort));
270                                         if(stream.count >= 3) v.z *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleUShort));
271                                         if(stream.count >= 4) v.w *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleUShort));
272                                 }
273                         }
274                         break;
275                 case STREAMTYPE_UDEC3:
276                         {
277                                 // FIXME: Vectorize
278                                 {
279                                         Int x, y, z;
280                                         
281                                         x = y = z = *Pointer<Int>(source0);
282
283                                         v.x.x = Float(x & 0x000003FF);
284                                         v.x.y = Float(y & 0x000FFC00);
285                                         v.x.z = Float(z & 0x3FF00000);
286                                 }
287
288                                 {
289                                         Int x, y, z;
290                                         
291                                         x = y = z = *Pointer<Int>(source1);
292
293                                         v.y.x = Float(x & 0x000003FF);
294                                         v.y.y = Float(y & 0x000FFC00);
295                                         v.y.z = Float(z & 0x3FF00000);
296                                 }
297
298                                 {
299                                         Int x, y, z;
300                                         
301                                         x = y = z = *Pointer<Int>(source2);
302
303                                         v.z.x = Float(x & 0x000003FF);
304                                         v.z.y = Float(y & 0x000FFC00);
305                                         v.z.z = Float(z & 0x3FF00000);
306                                 }
307
308                                 {
309                                         Int x, y, z;
310                                         
311                                         x = y = z = *Pointer<Int>(source3);
312
313                                         v.w.x = Float(x & 0x000003FF);
314                                         v.w.y = Float(y & 0x000FFC00);
315                                         v.w.z = Float(z & 0x3FF00000);
316                                 }
317
318                                 transpose4x3(v.x, v.y, v.z, v.w);
319
320                                 v.y *= Float4(1.0f / 0x00000400);
321                                 v.z *= Float4(1.0f / 0x00100000);
322                         }
323                         break;
324                 case STREAMTYPE_DEC3N:
325                         {
326                                 // FIXME: Vectorize
327                                 {
328                                         Int x, y, z;
329                                         
330                                         x = y = z = *Pointer<Int>(source0);
331
332                                         v.x.x = Float((x << 22) & 0xFFC00000);
333                                         v.x.y = Float((y << 12) & 0xFFC00000);
334                                         v.x.z = Float((z << 2)  & 0xFFC00000);
335                                 }
336
337                                 {
338                                         Int x, y, z;
339                                         
340                                         x = y = z = *Pointer<Int>(source1);
341
342                                         v.y.x = Float((x << 22) & 0xFFC00000);
343                                         v.y.y = Float((y << 12) & 0xFFC00000);
344                                         v.y.z = Float((z << 2)  & 0xFFC00000);
345                                 }
346
347                                 {
348                                         Int x, y, z;
349                                         
350                                         x = y = z = *Pointer<Int>(source2);
351
352                                         v.z.x = Float((x << 22) & 0xFFC00000);
353                                         v.z.y = Float((y << 12) & 0xFFC00000);
354                                         v.z.z = Float((z << 2)  & 0xFFC00000);
355                                 }
356
357                                 {
358                                         Int x, y, z;
359                                         
360                                         x = y = z = *Pointer<Int>(source3);
361
362                                         v.w.x = Float((x << 22) & 0xFFC00000);
363                                         v.w.y = Float((y << 12) & 0xFFC00000);
364                                         v.w.z = Float((z << 2)  & 0xFFC00000);
365                                 }
366
367                                 transpose4x3(v.x, v.y, v.z, v.w);
368
369                                 v.x *= Float4(1.0f / 0x00400000 / 511.0f);
370                                 v.y *= Float4(1.0f / 0x00400000 / 511.0f);
371                                 v.z *= Float4(1.0f / 0x00400000 / 511.0f);
372                         }
373                         break;
374                 case STREAMTYPE_FIXED:
375                         {
376                                 v.x = Float4(*Pointer<Int4>(source0)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleFixed));
377                                 v.y = Float4(*Pointer<Int4>(source1)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleFixed));
378                                 v.z = Float4(*Pointer<Int4>(source2)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleFixed));
379                                 v.w = Float4(*Pointer<Int4>(source3)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleFixed));
380
381                                 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
382                         }
383                         break;
384                 case STREAMTYPE_HALF:
385                         {
386                                 if(stream.count >= 1)
387                                 {
388                                         UShort x0 = *Pointer<UShort>(source0 + 0);
389                                         UShort x1 = *Pointer<UShort>(source1 + 0);
390                                         UShort x2 = *Pointer<UShort>(source2 + 0);
391                                         UShort x3 = *Pointer<UShort>(source3 + 0);
392
393                                         v.x.x = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(x0) * 4);
394                                         v.x.y = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(x1) * 4);
395                                         v.x.z = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(x2) * 4);
396                                         v.x.w = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(x3) * 4);
397                                 }
398
399                                 if(stream.count >= 2)
400                                 {
401                                         UShort y0 = *Pointer<UShort>(source0 + 2);
402                                         UShort y1 = *Pointer<UShort>(source1 + 2);
403                                         UShort y2 = *Pointer<UShort>(source2 + 2);
404                                         UShort y3 = *Pointer<UShort>(source3 + 2);
405
406                                         v.y.x = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(y0) * 4);
407                                         v.y.y = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(y1) * 4);
408                                         v.y.z = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(y2) * 4);
409                                         v.y.w = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(y3) * 4);
410                                 }
411
412                                 if(stream.count >= 3)
413                                 {
414                                         UShort z0 = *Pointer<UShort>(source0 + 4);
415                                         UShort z1 = *Pointer<UShort>(source1 + 4);
416                                         UShort z2 = *Pointer<UShort>(source2 + 4);
417                                         UShort z3 = *Pointer<UShort>(source3 + 4);
418
419                                         v.z.x = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(z0) * 4);
420                                         v.z.y = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(z1) * 4);
421                                         v.z.z = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(z2) * 4);
422                                         v.z.w = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(z3) * 4);
423                                 }
424
425                                 if(stream.count >= 4)
426                                 {
427                                         UShort w0 = *Pointer<UShort>(source0 + 6);
428                                         UShort w1 = *Pointer<UShort>(source1 + 6);
429                                         UShort w2 = *Pointer<UShort>(source2 + 6);
430                                         UShort w3 = *Pointer<UShort>(source3 + 6);
431
432                                         v.w.x = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(w0) * 4);
433                                         v.w.y = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(w1) * 4);
434                                         v.w.z = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(w2) * 4);
435                                         v.w.w = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(w3) * 4);
436                                 }
437                         }
438                         break;
439                 case STREAMTYPE_INDICES:
440                         {
441                                 v.x.x = *Pointer<Float>(source0);
442                                 v.x.y = *Pointer<Float>(source1);
443                                 v.x.z = *Pointer<Float>(source2);
444                                 v.x.w = *Pointer<Float>(source3);
445                         }
446                         break;
447                 default:
448                         ASSERT(false);
449                 }
450
451                 if(stream.count < 1) v.x = Float4(0.0f);
452                 if(stream.count < 2) v.y = Float4(0.0f);
453                 if(stream.count < 3) v.z = Float4(0.0f);
454                 if(stream.count < 4) v.w = Float4(1.0f);
455
456                 return v;
457         }
458
459         void VertexRoutine::postTransform(Registers &r)
460         {
461                 int pos = state.positionRegister;
462
463                 // Backtransform
464                 if(state.preTransformed)
465                 {
466                         Float4 rhw = Float4(1.0f) / r.o[pos].w;
467
468                         Float4 W = *Pointer<Float4>(r.data + OFFSET(DrawData,Wx16)) * Float4(1.0f / 16.0f);
469                         Float4 H = *Pointer<Float4>(r.data + OFFSET(DrawData,Hx16)) * Float4(1.0f / 16.0f);
470                         Float4 L = *Pointer<Float4>(r.data + OFFSET(DrawData,X0x16)) * Float4(1.0f / 16.0f);
471                         Float4 T = *Pointer<Float4>(r.data + OFFSET(DrawData,Y0x16)) * Float4(1.0f / 16.0f);
472
473                         r.o[pos].x = (r.o[pos].x - L) / W * rhw;
474                         r.o[pos].y = (r.o[pos].y - T) / H * rhw;
475                         r.o[pos].z = r.o[pos].z * rhw;
476                         r.o[pos].w = rhw;
477                 }
478
479                 if(!halfIntegerCoordinates && !state.preTransformed)
480                 {
481                         r.o[pos].x = r.o[pos].x + *Pointer<Float4>(r.data + OFFSET(DrawData,halfPixelX)) * r.o[pos].w;
482                         r.o[pos].y = r.o[pos].y + *Pointer<Float4>(r.data + OFFSET(DrawData,halfPixelY)) * r.o[pos].w;
483                 }
484
485                 if(state.superSampling)
486                 {
487                         r.o[pos].x = r.o[pos].x + *Pointer<Float4>(r.data + OFFSET(DrawData,XXXX)) * r.o[pos].w;
488                         r.o[pos].y = r.o[pos].y + *Pointer<Float4>(r.data + OFFSET(DrawData,YYYY)) * r.o[pos].w;
489                 }
490
491                 if(symmetricNormalizedDepth && !state.fixedFunction)
492                 {
493                         r.o[pos].z = (r.o[pos].z + r.o[pos].w) * Float4(0.5f);
494                 }
495         }
496
497         void VertexRoutine::writeCache(Pointer<Byte> &cacheLine, Registers &r)
498         {
499                 Vector4f v;
500
501                 for(int i = 0; i < 12; i++)
502                 {
503                         if(state.output[i].write)
504                         {
505                                 v.x = r.o[i].x;
506                                 v.y = r.o[i].y;
507                                 v.z = r.o[i].z;
508                                 v.w = r.o[i].w;
509
510                                 if(state.output[i].xClamp)
511                                 {
512                                         v.x = Max(v.x, Float4(0.0f));
513                                         v.x = Min(v.x, Float4(1.0f));
514                                 }
515
516                                 if(state.output[i].yClamp)
517                                 {
518                                         v.y = Max(v.y, Float4(0.0f));
519                                         v.y = Min(v.y, Float4(1.0f));
520                                 }
521
522                                 if(state.output[i].zClamp)
523                                 {
524                                         v.z = Max(v.z, Float4(0.0f));
525                                         v.z = Min(v.z, Float4(1.0f));
526                                 }
527
528                                 if(state.output[i].wClamp)
529                                 {
530                                         v.w = Max(v.w, Float4(0.0f));
531                                         v.w = Min(v.w, Float4(1.0f));
532                                 }
533
534                                 if(state.output[i].write == 0x01)
535                                 {
536                                         *Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0) = v.x.x;
537                                         *Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1) = v.x.y;
538                                         *Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2) = v.x.z;
539                                         *Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3) = v.x.w;
540                                 }
541                                 else
542                                 {
543                                         if(state.output[i].write == 0x02)
544                                         {
545                                                 transpose2x4(v.x, v.y, v.z, v.w);
546                                         }
547                                         else
548                                         {
549                                                 transpose4x4(v.x, v.y, v.z, v.w);
550                                         }
551
552                                         *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0, 16) = v.x;
553                                         *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1, 16) = v.y;
554                                         *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2, 16) = v.z;
555                                         *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3, 16) = v.w;
556                                 }
557                         }
558                 }
559
560                 *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 0) = (r.clipFlags >> 0)  & 0x0000000FF;   // FIXME: unsigned char Vertex::clipFlags
561                 *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 1) = (r.clipFlags >> 8)  & 0x0000000FF;
562                 *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 2) = (r.clipFlags >> 16) & 0x0000000FF;
563                 *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 3) = (r.clipFlags >> 24) & 0x0000000FF;
564
565                 int pos = state.positionRegister;
566
567                 v.x = r.o[pos].x;
568                 v.y = r.o[pos].y;
569                 v.z = r.o[pos].z;
570                 v.w = r.o[pos].w;
571
572                 Float4 w = As<Float4>(As<Int4>(v.w) | (As<Int4>(CmpEQ(v.w, Float4(0.0f))) & As<Int4>(Float4(1.0f))));
573                 Float4 rhw = Float4(1.0f) / w;
574
575                 v.x = As<Float4>(RoundInt(*Pointer<Float4>(r.data + OFFSET(DrawData,X0x16)) + v.x * rhw * *Pointer<Float4>(r.data + OFFSET(DrawData,Wx16))));
576                 v.y = As<Float4>(RoundInt(*Pointer<Float4>(r.data + OFFSET(DrawData,Y0x16)) + v.y * rhw * *Pointer<Float4>(r.data + OFFSET(DrawData,Hx16))));
577                 v.z = v.z * rhw;
578                 v.w = rhw;
579
580                 transpose4x4(v.x, v.y, v.z, v.w);
581
582                 *Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 0, 16) = v.x;
583                 *Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 1, 16) = v.y;
584                 *Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 2, 16) = v.z;
585                 *Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 3, 16) = v.w;
586         }
587
588         void VertexRoutine::writeVertex(Pointer<Byte> &vertex, Pointer<Byte> &cache)
589         {
590                 for(int i = 0; i < 12; i++)
591                 {
592                         if(state.output[i].write)
593                         {
594                                 *Pointer<Float4>(vertex + OFFSET(Vertex,v[i])) = *Pointer<Float4>(cache + OFFSET(Vertex,v[i]));
595                         }
596                 }
597
598                 *Pointer<Int>(vertex + OFFSET(Vertex,clipFlags)) = *Pointer<Int>(cache + OFFSET(Vertex,clipFlags));
599                 *Pointer<Float4>(vertex + OFFSET(Vertex,X)) = *Pointer<Float4>(cache + OFFSET(Vertex,X));
600         }
601 }