1 // SwiftShader Software Renderer
3 // Copyright(c) 2005-2012 TransGaming Inc.
5 // All rights reserved. No part of this software may be copied, distributed, transmitted,
6 // transcribed, stored in a retrieval system, translated into any human or computer
7 // language by any means, or disclosed to third parties without the explicit written
8 // agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express
9 // or implied, including but not limited to any patent rights, are granted to you.
12 #include "VertexRoutine.hpp"
14 #include "VertexShader.hpp"
17 #include "Renderer.hpp"
18 #include "Constants.hpp"
// Global switches declared here and defined in another translation unit.
// Within this file both are only read, in VertexRoutine::postTransform().
23 extern bool halfIntegerCoordinates; // Pixel centers are not at integer coordinates
24 extern bool symmetricNormalizedDepth; // [-1, 1] instead of [0, 1]
// Constructor: stores the vertex processor state and the shader pointer in the
// members of the same name. The shader pointer may be null; generate() tests it
// before dereferencing.
26 VertexRoutine::VertexRoutine(const VertexProcessor::State &state, const VertexShader *shader) : state(state), shader(shader)
// Destructor.
// NOTE(review): the body is not visible in this listing - confirm whether it releases anything.
31 VertexRoutine::~VertexRoutine()
// Builds the vertex processing function and stores the result in 'routine'.
//
// The generated function takes four byte pointers:
//   arg 0 (vertex) - output cursor, advanced by sizeof(Vertex) per processed index
//   arg 1 (batch)  - input cursor over 32-bit vertex indices
//   arg 2 (task)   - VertexTask; supplies vertexCount and the vertex cache
//   arg 3 (data)   - DrawData; supplies the constants pointer and the instance ID
//
// For every index the tag cache is consulted; on a mismatch the tag is updated and
// the cache line is rewritten through writeCache(). The requested vertex is then
// copied from the cache to the output through writeVertex().
//
// NOTE(review): this listing omits brace-only and other short lines, so the exact
// extent of the If(...) body and of the loop ending in Until(...) is inferred from
// statement order - confirm against the full file.
35 void VertexRoutine::generate()
37 Function<Void(Pointer<Byte>, Pointer<Byte>, Pointer<Byte>, Pointer<Byte>)> function;
39 Pointer<Byte> vertex(function.arg(0));
40 Pointer<Byte> batch(function.arg(1));
41 Pointer<Byte> task(function.arg(2));
42 Pointer<Byte> data(function.arg(3));
// Known when the routine is built, so it selects which code gets generated.
44 const bool texldl = state.shaderContainsTexldl;
// The vertex cache lives inside the task: an array of Vertex plus an array of tags.
46 Pointer<Byte> cache = task + OFFSET(VertexTask,vertexCache);
47 Pointer<Byte> vertexCache = cache + OFFSET(VertexCache,vertex);
48 Pointer<Byte> tagCache = cache + OFFSET(VertexCache,tag);
50 UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask,vertexCount));
54 r.constants = *Pointer<Pointer<Byte> >(data + OFFSET(DrawData,constants));
// The instance ID is only loaded when a shader is present and declares it.
55 if(shader && shader->instanceIdDeclared)
57 r.instanceID = *Pointer<Int>(data + OFFSET(DrawData, instanceID));
62 UInt index = *Pointer<UInt>(batch);
// Bits 2..5 of the index. The value is used both as a byte offset into the 32-bit
// tag array and as the Vertex index of the first entry of a group of four.
63 UInt tagIndex = index & 0x0000003C;
// Tag value to compare against: the index with its low two bits cleared, so four
// consecutive indices share one tag. With texldl the full index is used instead.
64 UInt indexQ = !texldl ? UInt(index & 0xFFFFFFFC) : index; // FIXME: TEXLDL hack to have independent LODs, hurts performance.
66 If(*Pointer<UInt>(tagCache + tagIndex) != indexQ)
68 *Pointer<UInt>(tagCache + tagIndex) = indexQ;
// writeCache() stores four consecutive Vertex entries starting at this address.
75 Pointer<Byte> cacheLine0 = vertexCache + tagIndex * UInt((int)sizeof(Vertex));
76 writeCache(cacheLine0, r);
// Low six bits of the index select the individual cached vertex to copy out.
79 UInt cacheIndex = index & 0x0000003F;
80 Pointer<Byte> cacheLine = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
81 writeVertex(vertex, cacheLine);
// Advance to the next output vertex and the next input index.
83 vertex += sizeof(Vertex);
84 batch += sizeof(unsigned int);
87 Until(vertexCount == 0)
// Finalize the function; its name embeds the shader ID as eight hex digits.
92 routine = function(L"VertexRoutine_%0.8X", state.shaderID);
// Accessor returning a Routine pointer.
// NOTE(review): presumably returns the 'routine' member assigned at the end of
// generate(); the body is not visible in this listing - confirm.
95 Routine *VertexRoutine::getRoutine()
// Fetches every vertex attribute for the given index into the input registers.
//
// r     - register set; r.data is read for the stream tables and r.v[i] is written
// index - vertex index passed on to readStream()
100 void VertexRoutine::readInput(Registers &r, UInt &index)
102 for(int i = 0; i < VERTEX_ATTRIBUTES; i++)
// DrawData holds one buffer pointer and one 32-bit stride per attribute.
104 Pointer<Byte> input = *Pointer<Pointer<Byte> >(r.data + OFFSET(DrawData,input) + sizeof(void*) * i);
105 UInt stride = *Pointer<UInt>(r.data + OFFSET(DrawData,stride) + sizeof(unsigned int) * i);
// The stream description (type, count, normalized) comes from the build-time state.
107 r.v[i] = readStream(r, input, stride, state.input[i], index);
// Computes r.clipFlags from the position output register.
//
// Each of x, y and z is compared against w (upper bound) and against -w, or 0 for z
// (lower bound). Every comparison mask is reduced to an integer with SignMask() and
// used to index a table of 32-bit entries in Constants; the looked-up values are
// OR-ed together. A further table lookup flags positions whose |x|, |y| or |z|
// exceed Constants::maxPos.
//
// NOTE(review): the declaration of 'flags' is not visible in this listing.
111 void VertexRoutine::computeClipFlags(Registers &r)
113 int pos = state.positionRegister;
// Upper bounds: mask is set where w < component.
115 Int4 maxX = CmpLT(r.o[pos].w, r.o[pos].x);
116 Int4 maxY = CmpLT(r.o[pos].w, r.o[pos].y);
117 Int4 maxZ = CmpLT(r.o[pos].w, r.o[pos].z);
// Lower bounds: presumably "not less-or-equal", i.e. mask set where the component is
// below -w (x, y) or below 0 (z) - confirm the CmpNLE semantics for NaN inputs.
119 Int4 minX = CmpNLE(-r.o[pos].w, r.o[pos].x);
120 Int4 minY = CmpNLE(-r.o[pos].w, r.o[pos].y);
121 Int4 minZ = CmpNLE(Float4(0.0f), r.o[pos].z);
// Each table entry is 4 bytes wide, hence the 'flags * 4' byte offset.
125 flags = SignMask(maxX);
126 r.clipFlags = *Pointer<Int>(r.constants + OFFSET(Constants,maxX) + flags * 4); // FIXME: Array indexing
127 flags = SignMask(maxY);
128 r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,maxY) + flags * 4);
129 flags = SignMask(maxZ);
130 r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,maxZ) + flags * 4);
131 flags = SignMask(minX);
132 r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,minX) + flags * 4);
133 flags = SignMask(minY);
134 r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,minY) + flags * 4);
135 flags = SignMask(minZ);
136 r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,minZ) + flags * 4);
// Range check: mask is set where the magnitude of the component is <= maxPos.
138 Int4 finiteX = CmpLE(Abs(r.o[pos].x), *Pointer<Float4>(r.constants + OFFSET(Constants,maxPos)));
139 Int4 finiteY = CmpLE(Abs(r.o[pos].y), *Pointer<Float4>(r.constants + OFFSET(Constants,maxPos)));
140 Int4 finiteZ = CmpLE(Abs(r.o[pos].z), *Pointer<Float4>(r.constants + OFFSET(Constants,maxPos)));
142 flags = SignMask(finiteX & finiteY & finiteZ);
143 r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,fini) + flags * 4);
// For pre-transformed vertices, bit 2 of each of the four flag bytes is cleared.
145 if(state.preTransformed)
147 r.clipFlags &= 0xFBFBFBFB; // Don't clip against far clip plane
// Reads one attribute for four vertices and converts it to floating point.
//
// r      - register set; r.constants supplies the scale factors and the half-float table
// buffer - base address of the attribute stream
// stride - byte distance between consecutive vertices in the stream
// stream - description of the stream: type, component count, normalized flag
// index  - index of the first vertex to read
//
// The result is built in 'v' with one Float4 per component (x, y, z, w), each
// holding that component for the four vertices. Components beyond stream.count
// are filled with defaults at the end: 0 for x, y, z and 1 for w.
//
// NOTE(review): this listing omits short lines (the declaration of 'v', the switch
// statement, 'else', 'break', 'default' and the return), so the case boundaries
// below are inferred from the 'case' labels - confirm against the full file.
151 Vector4f VertexRoutine::readStream(Registers &r, Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index)
153 const bool texldl = state.shaderContainsTexldl;
// Addresses of the four vertices. With texldl the increment is 0, so all four
// pointers refer to the same vertex.
157 Pointer<Byte> source0 = buffer + index * stride;
158 Pointer<Byte> source1 = source0 + (!texldl ? stride : 0);
159 Pointer<Byte> source2 = source1 + (!texldl ? stride : 0);
160 Pointer<Byte> source3 = source2 + (!texldl ? stride : 0);
// 32-bit floats: no conversion needed.
164 case STREAMTYPE_FLOAT:
166 if(stream.count == 0)
168 // Null stream, all default components
170 else if(stream.count == 1)
// Single component: one scalar per vertex goes straight into the lanes of v.x.
172 v.x.x = *Pointer<Float>(source0);
173 v.x.y = *Pointer<Float>(source1);
174 v.x.z = *Pointer<Float>(source2);
175 v.x.w = *Pointer<Float>(source3);
// Otherwise load a Float4 per vertex and transpose from per-vertex rows to
// per-component vectors.
179 v.x = *Pointer<Float4>(source0);
180 v.y = *Pointer<Float4>(source1);
181 v.z = *Pointer<Float4>(source2);
182 v.w = *Pointer<Float4>(source3);
184 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
// Unsigned bytes: converted to float, optionally scaled by Constants::unscaleByte.
188 case STREAMTYPE_BYTE:
190 v.x = Float4(*Pointer<Byte4>(source0));
191 v.y = Float4(*Pointer<Byte4>(source1));
192 v.z = Float4(*Pointer<Byte4>(source2));
193 v.w = Float4(*Pointer<Byte4>(source3));
195 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
197 if(stream.normalized)
199 if(stream.count >= 1) v.x *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
200 if(stream.count >= 2) v.y *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
201 if(stream.count >= 3) v.z *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
202 if(stream.count >= 4) v.w *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
// Signed bytes: converted to float, optionally scaled by Constants::unscaleSByte.
206 case STREAMTYPE_SBYTE:
208 v.x = Float4(*Pointer<SByte4>(source0));
209 v.y = Float4(*Pointer<SByte4>(source1));
210 v.z = Float4(*Pointer<SByte4>(source2));
211 v.w = Float4(*Pointer<SByte4>(source3));
213 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
215 if(stream.normalized)
217 if(stream.count >= 1) v.x *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleSByte));
218 if(stream.count >= 2) v.y *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleSByte));
219 if(stream.count >= 3) v.z *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleSByte));
220 if(stream.count >= 4) v.w *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleSByte));
// Color: four unsigned bytes, always scaled by Constants::unscaleByte and always
// transposed as a full 4x4.
224 case STREAMTYPE_COLOR:
226 v.x = Float4(*Pointer<Byte4>(source0)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
227 v.y = Float4(*Pointer<Byte4>(source1)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
228 v.z = Float4(*Pointer<Byte4>(source2)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
229 v.w = Float4(*Pointer<Byte4>(source3)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
231 transpose4x4(v.x, v.y, v.z, v.w);
// Signed 16-bit: converted to float, optionally scaled by Constants::unscaleShort.
239 case STREAMTYPE_SHORT:
241 v.x = Float4(*Pointer<Short4>(source0));
242 v.y = Float4(*Pointer<Short4>(source1));
243 v.z = Float4(*Pointer<Short4>(source2));
244 v.w = Float4(*Pointer<Short4>(source3));
246 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
248 if(stream.normalized)
250 if(stream.count >= 1) v.x *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleShort));
251 if(stream.count >= 2) v.y *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleShort));
252 if(stream.count >= 3) v.z *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleShort));
253 if(stream.count >= 4) v.w *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleShort));
// Unsigned 16-bit: converted to float, optionally scaled by Constants::unscaleUShort.
257 case STREAMTYPE_USHORT:
259 v.x = Float4(*Pointer<UShort4>(source0));
260 v.y = Float4(*Pointer<UShort4>(source1));
261 v.z = Float4(*Pointer<UShort4>(source2));
262 v.w = Float4(*Pointer<UShort4>(source3));
264 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
266 if(stream.normalized)
268 if(stream.count >= 1) v.x *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleUShort));
269 if(stream.count >= 2) v.y *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleUShort));
270 if(stream.count >= 3) v.z *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleUShort));
271 if(stream.count >= 4) v.w *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleUShort));
// Three 10-bit fields packed in one 32-bit word (bits 0-9, 10-19, 20-29). The
// fields are masked in place and converted while still shifted; the y and z
// results are scaled down by 2^10 and 2^20 after the transpose.
275 case STREAMTYPE_UDEC3:
281 x = y = z = *Pointer<Int>(source0);
283 v.x.x = Float(x & 0x000003FF);
284 v.x.y = Float(y & 0x000FFC00);
285 v.x.z = Float(z & 0x3FF00000);
291 x = y = z = *Pointer<Int>(source1);
293 v.y.x = Float(x & 0x000003FF);
294 v.y.y = Float(y & 0x000FFC00);
295 v.y.z = Float(z & 0x3FF00000);
301 x = y = z = *Pointer<Int>(source2);
303 v.z.x = Float(x & 0x000003FF);
304 v.z.y = Float(y & 0x000FFC00);
305 v.z.z = Float(z & 0x3FF00000);
311 x = y = z = *Pointer<Int>(source3);
313 v.w.x = Float(x & 0x000003FF);
314 v.w.y = Float(y & 0x000FFC00);
315 v.w.z = Float(z & 0x3FF00000);
318 transpose4x3(v.x, v.y, v.z, v.w);
320 v.y *= Float4(1.0f / 0x00000400);
321 v.z *= Float4(1.0f / 0x00100000);
// Three 10-bit fields, each shifted into the top ten bits of the word so that the
// field's highest bit becomes the integer sign bit. The converted value is then
// divided by 2^22 (undoing the shift) and by 511.
324 case STREAMTYPE_DEC3N:
330 x = y = z = *Pointer<Int>(source0);
332 v.x.x = Float((x << 22) & 0xFFC00000);
333 v.x.y = Float((y << 12) & 0xFFC00000);
334 v.x.z = Float((z << 2) & 0xFFC00000);
340 x = y = z = *Pointer<Int>(source1);
342 v.y.x = Float((x << 22) & 0xFFC00000);
343 v.y.y = Float((y << 12) & 0xFFC00000);
344 v.y.z = Float((z << 2) & 0xFFC00000);
350 x = y = z = *Pointer<Int>(source2);
352 v.z.x = Float((x << 22) & 0xFFC00000);
353 v.z.y = Float((y << 12) & 0xFFC00000);
354 v.z.z = Float((z << 2) & 0xFFC00000);
360 x = y = z = *Pointer<Int>(source3);
362 v.w.x = Float((x << 22) & 0xFFC00000);
363 v.w.y = Float((y << 12) & 0xFFC00000);
364 v.w.z = Float((z << 2) & 0xFFC00000);
367 transpose4x3(v.x, v.y, v.z, v.w);
369 v.x *= Float4(1.0f / 0x00400000 / 511.0f);
370 v.y *= Float4(1.0f / 0x00400000 / 511.0f);
371 v.z *= Float4(1.0f / 0x00400000 / 511.0f);
// 32-bit fixed point: converted to float and scaled by Constants::unscaleFixed.
374 case STREAMTYPE_FIXED:
376 v.x = Float4(*Pointer<Int4>(source0)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleFixed));
377 v.y = Float4(*Pointer<Int4>(source1)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleFixed));
378 v.z = Float4(*Pointer<Int4>(source2)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleFixed));
379 v.w = Float4(*Pointer<Int4>(source3)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleFixed));
381 transpose4xN(v.x, v.y, v.z, v.w, stream.count);
// 16-bit halves: each raw 16-bit value indexes the Constants::half2float table of
// 32-bit floats (hence '* 4'). Component k is read at byte offset 2 * k.
384 case STREAMTYPE_HALF:
386 if(stream.count >= 1)
388 UShort x0 = *Pointer<UShort>(source0 + 0);
389 UShort x1 = *Pointer<UShort>(source1 + 0);
390 UShort x2 = *Pointer<UShort>(source2 + 0);
391 UShort x3 = *Pointer<UShort>(source3 + 0);
393 v.x.x = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(x0) * 4);
394 v.x.y = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(x1) * 4);
395 v.x.z = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(x2) * 4);
396 v.x.w = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(x3) * 4);
399 if(stream.count >= 2)
401 UShort y0 = *Pointer<UShort>(source0 + 2);
402 UShort y1 = *Pointer<UShort>(source1 + 2);
403 UShort y2 = *Pointer<UShort>(source2 + 2);
404 UShort y3 = *Pointer<UShort>(source3 + 2);
406 v.y.x = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(y0) * 4);
407 v.y.y = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(y1) * 4);
408 v.y.z = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(y2) * 4);
409 v.y.w = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(y3) * 4);
412 if(stream.count >= 3)
414 UShort z0 = *Pointer<UShort>(source0 + 4);
415 UShort z1 = *Pointer<UShort>(source1 + 4);
416 UShort z2 = *Pointer<UShort>(source2 + 4);
417 UShort z3 = *Pointer<UShort>(source3 + 4);
419 v.z.x = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(z0) * 4);
420 v.z.y = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(z1) * 4);
421 v.z.z = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(z2) * 4);
422 v.z.w = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(z3) * 4);
425 if(stream.count >= 4)
427 UShort w0 = *Pointer<UShort>(source0 + 6);
428 UShort w1 = *Pointer<UShort>(source1 + 6);
429 UShort w2 = *Pointer<UShort>(source2 + 6);
430 UShort w3 = *Pointer<UShort>(source3 + 6);
432 v.w.x = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(w0) * 4);
433 v.w.y = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(w1) * 4);
434 v.w.z = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(w2) * 4);
435 v.w.w = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(w3) * 4);
// Indices: one 32-bit value per vertex, loaded as a Float with no conversion.
439 case STREAMTYPE_INDICES:
441 v.x.x = *Pointer<Float>(source0);
442 v.x.y = *Pointer<Float>(source1);
443 v.x.z = *Pointer<Float>(source2);
444 v.x.w = *Pointer<Float>(source3);
// Defaults for components the stream does not provide: (0, 0, 0, 1).
451 if(stream.count < 1) v.x = Float4(0.0f);
452 if(stream.count < 2) v.y = Float4(0.0f);
453 if(stream.count < 3) v.z = Float4(0.0f);
454 if(stream.count < 4) v.w = Float4(1.0f);
// Adjusts the position output register after the vertex pipeline has run.
//
// Up to four independent adjustments are generated, selected by build-time state
// and by the two global flags:
//   - pre-transformed input: map x, y with the DrawData viewport values and scale
//     x, y, z by 1/w
//   - half-pixel offset, when pixel centers are not at half-integer coordinates
//   - supersampling offset
//   - depth remap z = (z + w) / 2 for symmetric normalized depth
459 void VertexRoutine::postTransform(Registers &r)
461 int pos = state.positionRegister;
464 if(state.preTransformed)
466 Float4 rhw = Float4(1.0f) / r.o[pos].w;
// The DrawData values carry a factor of 16 (per their 'x16' names), removed here.
468 Float4 W = *Pointer<Float4>(r.data + OFFSET(DrawData,Wx16)) * Float4(1.0f / 16.0f);
469 Float4 H = *Pointer<Float4>(r.data + OFFSET(DrawData,Hx16)) * Float4(1.0f / 16.0f);
470 Float4 L = *Pointer<Float4>(r.data + OFFSET(DrawData,X0x16)) * Float4(1.0f / 16.0f);
471 Float4 T = *Pointer<Float4>(r.data + OFFSET(DrawData,Y0x16)) * Float4(1.0f / 16.0f);
// Subtract the origin, divide by the extent, then multiply by 1/w.
473 r.o[pos].x = (r.o[pos].x - L) / W * rhw;
474 r.o[pos].y = (r.o[pos].y - T) / H * rhw;
475 r.o[pos].z = r.o[pos].z * rhw;
// Offsets are multiplied by w because x and y have not been divided by w yet.
479 if(!halfIntegerCoordinates && !state.preTransformed)
481 r.o[pos].x = r.o[pos].x + *Pointer<Float4>(r.data + OFFSET(DrawData,halfPixelX)) * r.o[pos].w;
482 r.o[pos].y = r.o[pos].y + *Pointer<Float4>(r.data + OFFSET(DrawData,halfPixelY)) * r.o[pos].w;
485 if(state.superSampling)
487 r.o[pos].x = r.o[pos].x + *Pointer<Float4>(r.data + OFFSET(DrawData,XXXX)) * r.o[pos].w;
488 r.o[pos].y = r.o[pos].y + *Pointer<Float4>(r.data + OFFSET(DrawData,YYYY)) * r.o[pos].w;
// Remaps z from [-w, w] to [0, w]; not generated for the fixed-function pipeline.
491 if(symmetricNormalizedDepth && !state.fixedFunction)
493 r.o[pos].z = (r.o[pos].z + r.o[pos].w) * Float4(0.5f);
// Stores the results for four vertices into four consecutive Vertex cache entries.
//
// cacheLine - address of the first of the four Vertex entries
// r         - register set providing the outputs, clip flags and DrawData pointer
//
// Three groups of data are written per vertex: the enabled output registers
// (optionally clamped to [0, 1]), one byte of clip flags, and the projected
// position stored at Vertex::X.
//
// NOTE(review): this listing omits short lines (the declaration and loading of
// 'v', 'else', and some assignments to v.z / v.w), so the comments below only
// describe the statements that are visible - confirm against the full file.
497 void VertexRoutine::writeCache(Pointer<Byte> &cacheLine, Registers &r)
501 for(int i = 0; i < 12; i++)
503 if(state.output[i].write)
// Per-component saturation to the range [0, 1].
510 if(state.output[i].xClamp)
512 v.x = Max(v.x, Float4(0.0f));
513 v.x = Min(v.x, Float4(1.0f));
516 if(state.output[i].yClamp)
518 v.y = Max(v.y, Float4(0.0f));
519 v.y = Min(v.y, Float4(1.0f));
522 if(state.output[i].zClamp)
524 v.z = Max(v.z, Float4(0.0f));
525 v.z = Min(v.z, Float4(1.0f));
528 if(state.output[i].wClamp)
530 v.w = Max(v.w, Float4(0.0f));
531 v.w = Min(v.w, Float4(1.0f));
// Write mask 0x01: only one scalar is stored per vertex, no transpose required.
534 if(state.output[i].write == 0x01)
536 *Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0) = v.x.x;
537 *Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1) = v.x.y;
538 *Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2) = v.x.z;
539 *Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3) = v.x.w;
// Write mask 0x02 uses the 2x4 transpose; presumably every other mask takes the
// full 4x4 transpose below (the 'else' lines are not visible here).
543 if(state.output[i].write == 0x02)
545 transpose2x4(v.x, v.y, v.z, v.w);
549 transpose4x4(v.x, v.y, v.z, v.w);
// After the transpose each Float4 holds one vertex; the stores pass 16 as the alignment argument.
552 *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0, 16) = v.x;
553 *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1, 16) = v.y;
554 *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2, 16) = v.z;
555 *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3, 16) = v.w;
// r.clipFlags packs one byte per vertex; each byte is extracted and stored as a
// full 32-bit Int in the corresponding Vertex entry.
560 *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 0) = (r.clipFlags >> 0) & 0x0000000FF; // FIXME: unsigned char Vertex::clipFlags
561 *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 1) = (r.clipFlags >> 8) & 0x0000000FF;
562 *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 2) = (r.clipFlags >> 16) & 0x0000000FF;
563 *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 3) = (r.clipFlags >> 24) & 0x0000000FF;
565 int pos = state.positionRegister;
// Lanes where w compares equal to zero get the bit pattern of 1.0f OR-ed in, so the
// reciprocal below never divides by zero.
572 Float4 w = As<Float4>(As<Int4>(v.w) | (As<Int4>(CmpEQ(v.w, Float4(0.0f))) & As<Int4>(Float4(1.0f))));
573 Float4 rhw = Float4(1.0f) / w;
// x and y are projected, scaled and offset with the x16 DrawData values, rounded
// to integers, and the integer bits are kept in the float registers.
575 v.x = As<Float4>(RoundInt(*Pointer<Float4>(r.data + OFFSET(DrawData,X0x16)) + v.x * rhw * *Pointer<Float4>(r.data + OFFSET(DrawData,Wx16))));
576 v.y = As<Float4>(RoundInt(*Pointer<Float4>(r.data + OFFSET(DrawData,Y0x16)) + v.y * rhw * *Pointer<Float4>(r.data + OFFSET(DrawData,Hx16))));
580 transpose4x4(v.x, v.y, v.z, v.w);
// One 16-byte store per vertex, starting at Vertex::X.
582 *Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 0, 16) = v.x;
583 *Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 1, 16) = v.y;
584 *Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 2, 16) = v.z;
585 *Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 3, 16) = v.w;
// Copies a single vertex from the vertex cache to the output vertex.
//
// vertex - destination Vertex
// cache  - source Vertex entry inside the cache
//
// Only the output registers enabled in the build-time state are copied, followed
// by the clip flags and the 16 bytes starting at Vertex::X.
588 void VertexRoutine::writeVertex(Pointer<Byte> &vertex, Pointer<Byte> &cache)
590 for(int i = 0; i < 12; i++)
592 if(state.output[i].write)
// The whole Float4 is copied regardless of which components the write mask enables.
594 *Pointer<Float4>(vertex + OFFSET(Vertex,v[i])) = *Pointer<Float4>(cache + OFFSET(Vertex,v[i]));
598 *Pointer<Int>(vertex + OFFSET(Vertex,clipFlags)) = *Pointer<Int>(cache + OFFSET(Vertex,clipFlags));
599 *Pointer<Float4>(vertex + OFFSET(Vertex,X)) = *Pointer<Float4>(cache + OFFSET(Vertex,X));