OSDN Git Service

Fixed Windows warnings
[android-x86/external-swiftshader.git] / src / Shader / PixelPipeline.cpp
1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "PixelPipeline.hpp"
16 #include "Renderer.hpp"
17 #include "SamplerCore.hpp"
18
19 namespace sw
20 {
21         extern bool postBlendSRGB;
22
23         void PixelPipeline::setBuiltins(Int &x, Int &y, Float4(&z)[4], Float4 &w)
24         {
25                 if(state.color[0].component & 0x1) diffuse.x = convertFixed12(v[0].x); else diffuse.x = Short4(0x1000);
26                 if(state.color[0].component & 0x2) diffuse.y = convertFixed12(v[0].y); else diffuse.y = Short4(0x1000);
27                 if(state.color[0].component & 0x4) diffuse.z = convertFixed12(v[0].z); else diffuse.z = Short4(0x1000);
28                 if(state.color[0].component & 0x8) diffuse.w = convertFixed12(v[0].w); else diffuse.w = Short4(0x1000);
29
30                 if(state.color[1].component & 0x1) specular.x = convertFixed12(v[1].x); else specular.x = Short4(0x0000, 0x0000, 0x0000, 0x0000);
31                 if(state.color[1].component & 0x2) specular.y = convertFixed12(v[1].y); else specular.y = Short4(0x0000, 0x0000, 0x0000, 0x0000);
32                 if(state.color[1].component & 0x4) specular.z = convertFixed12(v[1].z); else specular.z = Short4(0x0000, 0x0000, 0x0000, 0x0000);
33                 if(state.color[1].component & 0x8) specular.w = convertFixed12(v[1].w); else specular.w = Short4(0x0000, 0x0000, 0x0000, 0x0000);
34         }
35
36         void PixelPipeline::fixedFunction()
37         {
38                 current = diffuse;
39                 Vector4s temp(0x0000, 0x0000, 0x0000, 0x0000);
40
41                 for(int stage = 0; stage < 8; stage++)
42                 {
43                         if(state.textureStage[stage].stageOperation == TextureStage::STAGE_DISABLE)
44                         {
45                                 break;
46                         }
47
48                         Vector4s texture;
49
50                         if(state.textureStage[stage].usesTexture)
51                         {
52                                 sampleTexture(texture, stage, stage);
53                         }
54
55                         blendTexture(temp, texture, stage);
56                 }
57
58                 specularPixel(current, specular);
59         }
60
61         void PixelPipeline::applyShader(Int cMask[4])
62         {
63                 if(!shader)
64                 {
65                         fixedFunction();
66                         return;
67                 }
68
69                 int pad = 0;        // Count number of texm3x3pad instructions
70                 Vector4s dPairing;   // Destination for first pairing instruction
71
72                 for(size_t i = 0; i < shader->getLength(); i++)
73                 {
74                         const Shader::Instruction *instruction = shader->getInstruction(i);
75                         Shader::Opcode opcode = instruction->opcode;
76
77                         //      #ifndef NDEBUG   // FIXME: Centralize debug output control
78                         //              shader->printInstruction(i, "debug.txt");
79                         //      #endif
80
81                         if(opcode == Shader::OPCODE_DCL || opcode == Shader::OPCODE_DEF || opcode == Shader::OPCODE_DEFI || opcode == Shader::OPCODE_DEFB)
82                         {
83                                 continue;
84                         }
85
86                         const Dst &dst = instruction->dst;
87                         const Src &src0 = instruction->src[0];
88                         const Src &src1 = instruction->src[1];
89                         const Src &src2 = instruction->src[2];
90
91                         unsigned short version = shader->getVersion();
92                         bool pairing = i + 1 < shader->getLength() && shader->getInstruction(i + 1)->coissue;   // First instruction of pair
93                         bool coissue = instruction->coissue;                                                              // Second instruction of pair
94
95                         Vector4s d;
96                         Vector4s s0;
97                         Vector4s s1;
98                         Vector4s s2;
99
100                         if(src0.type != Shader::PARAMETER_VOID) s0 = fetchRegister(src0);
101                         if(src1.type != Shader::PARAMETER_VOID) s1 = fetchRegister(src1);
102                         if(src2.type != Shader::PARAMETER_VOID) s2 = fetchRegister(src2);
103
104                         Float4 x = version < 0x0104 ? v[2 + dst.index].x : v[2 + src0.index].x;
105                         Float4 y = version < 0x0104 ? v[2 + dst.index].y : v[2 + src0.index].y;
106                         Float4 z = version < 0x0104 ? v[2 + dst.index].z : v[2 + src0.index].z;
107                         Float4 w = version < 0x0104 ? v[2 + dst.index].w : v[2 + src0.index].w;
108
109                         switch(opcode)
110                         {
111                         case Shader::OPCODE_PS_1_0: break;
112                         case Shader::OPCODE_PS_1_1: break;
113                         case Shader::OPCODE_PS_1_2: break;
114                         case Shader::OPCODE_PS_1_3: break;
115                         case Shader::OPCODE_PS_1_4: break;
116
117                         case Shader::OPCODE_DEF:    break;
118
119                         case Shader::OPCODE_NOP:    break;
120                         case Shader::OPCODE_MOV: MOV(d, s0);         break;
121                         case Shader::OPCODE_ADD: ADD(d, s0, s1);     break;
122                         case Shader::OPCODE_SUB: SUB(d, s0, s1);     break;
123                         case Shader::OPCODE_MAD: MAD(d, s0, s1, s2); break;
124                         case Shader::OPCODE_MUL: MUL(d, s0, s1);     break;
125                         case Shader::OPCODE_DP3: DP3(d, s0, s1);     break;
126                         case Shader::OPCODE_DP4: DP4(d, s0, s1);     break;
127                         case Shader::OPCODE_LRP: LRP(d, s0, s1, s2); break;
128                         case Shader::OPCODE_TEXCOORD:
129                                 if(version < 0x0104)
130                                 {
131                                         TEXCOORD(d, x, y, z, dst.index);
132                         }
133                                 else
134                                 {
135                                         if((src0.swizzle & 0x30) == 0x20)   // .xyz
136                                         {
137                                                 TEXCRD(d, x, y, z, src0.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
138                                         }
139                                         else   // .xwy
140                                         {
141                                                 TEXCRD(d, x, y, w, src0.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
142                                         }
143                                 }
144                                 break;
145                         case Shader::OPCODE_TEXKILL:
146                                 if(version < 0x0104)
147                                 {
148                                         TEXKILL(cMask, x, y, z);
149                                 }
150                                 else if(version == 0x0104)
151                                 {
152                                         if(dst.type == Shader::PARAMETER_TEXTURE)
153                                         {
154                                                 TEXKILL(cMask, x, y, z);
155                                         }
156                                         else
157                                         {
158                                                 TEXKILL(cMask, rs[dst.index]);
159                                         }
160                                 }
161                                 else ASSERT(false);
162                                 break;
163                         case Shader::OPCODE_TEX:
164                                 if(version < 0x0104)
165                                 {
166                                         TEX(d, x, y, z, dst.index, false);
167                                 }
168                                 else if(version == 0x0104)
169                                 {
170                                         if(src0.type == Shader::PARAMETER_TEXTURE)
171                                         {
172                                                 if((src0.swizzle & 0x30) == 0x20)   // .xyz
173                                                 {
174                                                         TEX(d, x, y, z, dst.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
175                                                 }
176                                                 else   // .xyw
177                                                 {
178                                                         TEX(d, x, y, w, dst.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
179                                                 }
180                                         }
181                                         else
182                                         {
183                                                 TEXLD(d, s0, dst.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
184                                         }
185                                 }
186                                 else ASSERT(false);
187                                 break;
188                         case Shader::OPCODE_TEXBEM:       TEXBEM(d, s0, x, y, z, dst.index);                                             break;
189                         case Shader::OPCODE_TEXBEML:      TEXBEML(d, s0, x, y, z, dst.index);                                            break;
190                         case Shader::OPCODE_TEXREG2AR:    TEXREG2AR(d, s0, dst.index);                                                   break;
191                         case Shader::OPCODE_TEXREG2GB:    TEXREG2GB(d, s0, dst.index);                                                   break;
192                         case Shader::OPCODE_TEXM3X2PAD:   TEXM3X2PAD(x, y, z, s0, 0, src0.modifier == Shader::MODIFIER_SIGN);            break;
193                         case Shader::OPCODE_TEXM3X2TEX:   TEXM3X2TEX(d, x, y, z, dst.index, s0, src0.modifier == Shader::MODIFIER_SIGN); break;
194                         case Shader::OPCODE_TEXM3X3PAD:   TEXM3X3PAD(x, y, z, s0, pad++ % 2, src0.modifier == Shader::MODIFIER_SIGN);    break;
195                         case Shader::OPCODE_TEXM3X3TEX:   TEXM3X3TEX(d, x, y, z, dst.index, s0, src0.modifier == Shader::MODIFIER_SIGN); break;
196                         case Shader::OPCODE_TEXM3X3SPEC:  TEXM3X3SPEC(d, x, y, z, dst.index, s0, s1);                                    break;
197                         case Shader::OPCODE_TEXM3X3VSPEC: TEXM3X3VSPEC(d, x, y, z, dst.index, s0);                                       break;
198                         case Shader::OPCODE_CND:          CND(d, s0, s1, s2);                                                            break;
199                         case Shader::OPCODE_TEXREG2RGB:   TEXREG2RGB(d, s0, dst.index);                                                  break;
200                         case Shader::OPCODE_TEXDP3TEX:    TEXDP3TEX(d, x, y, z, dst.index, s0);                                          break;
201                         case Shader::OPCODE_TEXM3X2DEPTH: TEXM3X2DEPTH(d, x, y, z, s0, src0.modifier == Shader::MODIFIER_SIGN);          break;
202                         case Shader::OPCODE_TEXDP3:       TEXDP3(d, x, y, z, s0);                                                        break;
203                         case Shader::OPCODE_TEXM3X3:      TEXM3X3(d, x, y, z, s0, src0.modifier == Shader::MODIFIER_SIGN);               break;
204                         case Shader::OPCODE_TEXDEPTH:     TEXDEPTH();                                                                    break;
205                         case Shader::OPCODE_CMP0:         CMP(d, s0, s1, s2);                                                            break;
206                         case Shader::OPCODE_BEM:          BEM(d, s0, s1, dst.index);                                                     break;
207                         case Shader::OPCODE_PHASE:                                                                                       break;
208                         case Shader::OPCODE_END:                                                                                         break;
209                         default:
210                                 ASSERT(false);
211                         }
212
213                         if(dst.type != Shader::PARAMETER_VOID && opcode != Shader::OPCODE_TEXKILL)
214                         {
215                                 if(dst.shift > 0)
216                                 {
217                                         if(dst.mask & 0x1) { d.x = AddSat(d.x, d.x); if(dst.shift > 1) d.x = AddSat(d.x, d.x); if(dst.shift > 2) d.x = AddSat(d.x, d.x); }
218                                         if(dst.mask & 0x2) { d.y = AddSat(d.y, d.y); if(dst.shift > 1) d.y = AddSat(d.y, d.y); if(dst.shift > 2) d.y = AddSat(d.y, d.y); }
219                                         if(dst.mask & 0x4) { d.z = AddSat(d.z, d.z); if(dst.shift > 1) d.z = AddSat(d.z, d.z); if(dst.shift > 2) d.z = AddSat(d.z, d.z); }
220                                         if(dst.mask & 0x8) { d.w = AddSat(d.w, d.w); if(dst.shift > 1) d.w = AddSat(d.w, d.w); if(dst.shift > 2) d.w = AddSat(d.w, d.w); }
221                                 }
222                                 else if(dst.shift < 0)
223                                 {
224                                         if(dst.mask & 0x1) d.x = d.x >> -dst.shift;
225                                         if(dst.mask & 0x2) d.y = d.y >> -dst.shift;
226                                         if(dst.mask & 0x4) d.z = d.z >> -dst.shift;
227                                         if(dst.mask & 0x8) d.w = d.w >> -dst.shift;
228                                 }
229
230                                 if(dst.saturate)
231                                 {
232                                         if(dst.mask & 0x1) { d.x = Min(d.x, Short4(0x1000)); d.x = Max(d.x, Short4(0x0000, 0x0000, 0x0000, 0x0000)); }
233                                         if(dst.mask & 0x2) { d.y = Min(d.y, Short4(0x1000)); d.y = Max(d.y, Short4(0x0000, 0x0000, 0x0000, 0x0000)); }
234                                         if(dst.mask & 0x4) { d.z = Min(d.z, Short4(0x1000)); d.z = Max(d.z, Short4(0x0000, 0x0000, 0x0000, 0x0000)); }
235                                         if(dst.mask & 0x8) { d.w = Min(d.w, Short4(0x1000)); d.w = Max(d.w, Short4(0x0000, 0x0000, 0x0000, 0x0000)); }
236                                 }
237
238                                 if(pairing)
239                                 {
240                                         if(dst.mask & 0x1) dPairing.x = d.x;
241                                         if(dst.mask & 0x2) dPairing.y = d.y;
242                                         if(dst.mask & 0x4) dPairing.z = d.z;
243                                         if(dst.mask & 0x8) dPairing.w = d.w;
244                                 }
245
246                                 if(coissue)
247                                 {
248                                         const Dst &dst = shader->getInstruction(i - 1)->dst;
249
250                                         writeDestination(dPairing, dst);
251                                 }
252
253                                 if(!pairing)
254                                 {
255                                         writeDestination(d, dst);
256                                 }
257                         }
258                 }
259         }
260
261         Bool PixelPipeline::alphaTest(Int cMask[4])
262         {
263                 current.x = Min(current.x, Short4(0x0FFF, 0x0FFF, 0x0FFF, 0x0FFF)); current.x = Max(current.x, Short4(0x0000, 0x0000, 0x0000, 0x0000));
264                 current.y = Min(current.y, Short4(0x0FFF, 0x0FFF, 0x0FFF, 0x0FFF)); current.y = Max(current.y, Short4(0x0000, 0x0000, 0x0000, 0x0000));
265                 current.z = Min(current.z, Short4(0x0FFF, 0x0FFF, 0x0FFF, 0x0FFF)); current.z = Max(current.z, Short4(0x0000, 0x0000, 0x0000, 0x0000));
266                 current.w = Min(current.w, Short4(0x0FFF, 0x0FFF, 0x0FFF, 0x0FFF)); current.w = Max(current.w, Short4(0x0000, 0x0000, 0x0000, 0x0000));
267
268                 if(!state.alphaTestActive())
269                 {
270                         return true;
271                 }
272
273                 Int aMask;
274
275                 if(state.transparencyAntialiasing == TRANSPARENCY_NONE)
276                 {
277                         PixelRoutine::alphaTest(aMask, current.w);
278
279                         for(unsigned int q = 0; q < state.multiSample; q++)
280                         {
281                                 cMask[q] &= aMask;
282                         }
283                 }
284                 else if(state.transparencyAntialiasing == TRANSPARENCY_ALPHA_TO_COVERAGE)
285                 {
286                         Float4 alpha = Float4(current.w) * Float4(1.0f / 0x1000);
287
288                         alphaToCoverage(cMask, alpha);
289                 }
290                 else ASSERT(false);
291
292                 Int pass = cMask[0];
293
294                 for(unsigned int q = 1; q < state.multiSample; q++)
295                 {
296                         pass = pass | cMask[q];
297                 }
298
299                 return pass != 0x0;
300         }
301
302         void PixelPipeline::rasterOperation(Float4 &fog, Pointer<Byte> cBuffer[4], Int &x, Int sMask[4], Int zMask[4], Int cMask[4])
303         {
304                 if(!state.colorWriteActive(0))
305                 {
306                         return;
307                 }
308
309                 Vector4f oC;
310
311                 switch(state.targetFormat[0])
312                 {
313                 case FORMAT_R5G6B5:
314                 case FORMAT_X8R8G8B8:
315                 case FORMAT_X8B8G8R8:
316                 case FORMAT_A8R8G8B8:
317                 case FORMAT_A8B8G8R8:
318                 case FORMAT_A8:
319                 case FORMAT_G16R16:
320                 case FORMAT_A16B16G16R16:
321                         if(!postBlendSRGB && state.writeSRGB)
322                         {
323                                 linearToSRGB12_16(current);
324                         }
325                         else
326                         {
327                                 current.x <<= 4;
328                                 current.y <<= 4;
329                                 current.z <<= 4;
330                                 current.w <<= 4;
331                         }
332
333                         if(state.targetFormat[0] == FORMAT_R5G6B5)
334                         {
335                                 current.x &= Short4(0xF800u);
336                                 current.y &= Short4(0xFC00u);
337                                 current.z &= Short4(0xF800u);
338                         }
339
340                         fogBlend(current, fog);
341
342                         for(unsigned int q = 0; q < state.multiSample; q++)
343                         {
344                                 Pointer<Byte> buffer = cBuffer[0] + q * *Pointer<Int>(data + OFFSET(DrawData, colorSliceB[0]));
345                                 Vector4s color = current;
346
347                                 if(state.multiSampleMask & (1 << q))
348                                 {
349                                         alphaBlend(0, buffer, color, x);
350                                         logicOperation(0, buffer, color, x);
351                                         writeColor(0, buffer, x, color, sMask[q], zMask[q], cMask[q]);
352                                 }
353                         }
354                         break;
355                 case FORMAT_R32F:
356                 case FORMAT_G32R32F:
357                 case FORMAT_X32B32G32R32F:
358                 case FORMAT_A32B32G32R32F:
359                         convertSigned12(oC, current);
360                         PixelRoutine::fogBlend(oC, fog);
361
362                         for(unsigned int q = 0; q < state.multiSample; q++)
363                         {
364                                 Pointer<Byte> buffer = cBuffer[0] + q * *Pointer<Int>(data + OFFSET(DrawData, colorSliceB[0]));
365                                 Vector4f color = oC;
366
367                                 if(state.multiSampleMask & (1 << q))
368                                 {
369                                         alphaBlend(0, buffer, color, x);
370                                         writeColor(0, buffer, x, color, sMask[q], zMask[q], cMask[q]);
371                                 }
372                         }
373                         break;
374                 default:
375                         ASSERT(false);
376                 }
377         }
378
379         void PixelPipeline::blendTexture(Vector4s &temp, Vector4s &texture, int stage)
380         {
381                 Vector4s *arg1 = nullptr;
382                 Vector4s *arg2 = nullptr;
383                 Vector4s *arg3 = nullptr;
384                 Vector4s res;
385
386                 Vector4s constant;
387                 Vector4s tfactor;
388
389                 const TextureStage::State &textureStage = state.textureStage[stage];
390
391                 if(textureStage.firstArgument == TextureStage::SOURCE_CONSTANT ||
392                    textureStage.firstArgumentAlpha == TextureStage::SOURCE_CONSTANT ||
393                    textureStage.secondArgument == TextureStage::SOURCE_CONSTANT ||
394                    textureStage.secondArgumentAlpha == TextureStage::SOURCE_CONSTANT ||
395                    textureStage.thirdArgument == TextureStage::SOURCE_CONSTANT ||
396                    textureStage.thirdArgumentAlpha == TextureStage::SOURCE_CONSTANT)
397                 {
398                         constant.x = *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].constantColor4[0]));
399                         constant.y = *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].constantColor4[1]));
400                         constant.z = *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].constantColor4[2]));
401                         constant.w = *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].constantColor4[3]));
402                 }
403
404                 if(textureStage.firstArgument == TextureStage::SOURCE_TFACTOR ||
405                    textureStage.firstArgumentAlpha == TextureStage::SOURCE_TFACTOR ||
406                    textureStage.secondArgument == TextureStage::SOURCE_TFACTOR ||
407                    textureStage.secondArgumentAlpha == TextureStage::SOURCE_TFACTOR ||
408                    textureStage.thirdArgument == TextureStage::SOURCE_TFACTOR ||
409                    textureStage.thirdArgumentAlpha == TextureStage::SOURCE_TFACTOR)
410                 {
411                         tfactor.x = *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[0]));
412                         tfactor.y = *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[1]));
413                         tfactor.z = *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[2]));
414                         tfactor.w = *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[3]));
415                 }
416
417                 // Premodulate
418                 if(stage > 0 && textureStage.usesTexture)
419                 {
420                         if(state.textureStage[stage - 1].stageOperation == TextureStage::STAGE_PREMODULATE)
421                         {
422                                 current.x = MulHigh(current.x, texture.x) << 4;
423                                 current.y = MulHigh(current.y, texture.y) << 4;
424                                 current.z = MulHigh(current.z, texture.z) << 4;
425                         }
426
427                         if(state.textureStage[stage - 1].stageOperationAlpha == TextureStage::STAGE_PREMODULATE)
428                         {
429                                 current.w = MulHigh(current.w, texture.w) << 4;
430                         }
431                 }
432
433                 if(luminance)
434                 {
435                         texture.x = MulHigh(texture.x, L) << 4;
436                         texture.y = MulHigh(texture.y, L) << 4;
437                         texture.z = MulHigh(texture.z, L) << 4;
438
439                         luminance = false;
440                 }
441
442                 switch(textureStage.firstArgument)
443                 {
444                 case TextureStage::SOURCE_TEXTURE:      arg1 = &texture;    break;
445                 case TextureStage::SOURCE_CONSTANT:     arg1 = &constant;   break;
446                 case TextureStage::SOURCE_CURRENT:      arg1 = &current;  break;
447                 case TextureStage::SOURCE_DIFFUSE:      arg1 = &diffuse;  break;
448                 case TextureStage::SOURCE_SPECULAR:     arg1 = &specular; break;
449                 case TextureStage::SOURCE_TEMP:         arg1 = &temp;       break;
450                 case TextureStage::SOURCE_TFACTOR:      arg1 = &tfactor;    break;
451                 default:
452                         ASSERT(false);
453                 }
454
455                 switch(textureStage.secondArgument)
456                 {
457                 case TextureStage::SOURCE_TEXTURE:      arg2 = &texture;    break;
458                 case TextureStage::SOURCE_CONSTANT:     arg2 = &constant;   break;
459                 case TextureStage::SOURCE_CURRENT:      arg2 = &current;  break;
460                 case TextureStage::SOURCE_DIFFUSE:      arg2 = &diffuse;  break;
461                 case TextureStage::SOURCE_SPECULAR:     arg2 = &specular; break;
462                 case TextureStage::SOURCE_TEMP:         arg2 = &temp;       break;
463                 case TextureStage::SOURCE_TFACTOR:      arg2 = &tfactor;    break;
464                 default:
465                         ASSERT(false);
466                 }
467
468                 switch(textureStage.thirdArgument)
469                 {
470                 case TextureStage::SOURCE_TEXTURE:      arg3 = &texture;    break;
471                 case TextureStage::SOURCE_CONSTANT:     arg3 = &constant;   break;
472                 case TextureStage::SOURCE_CURRENT:      arg3 = &current;  break;
473                 case TextureStage::SOURCE_DIFFUSE:      arg3 = &diffuse;  break;
474                 case TextureStage::SOURCE_SPECULAR:     arg3 = &specular; break;
475                 case TextureStage::SOURCE_TEMP:         arg3 = &temp;       break;
476                 case TextureStage::SOURCE_TFACTOR:      arg3 = &tfactor;    break;
477                 default:
478                         ASSERT(false);
479                 }
480
481                 Vector4s mod1;
482                 Vector4s mod2;
483                 Vector4s mod3;
484
485                 switch(textureStage.firstModifier)
486                 {
487                 case TextureStage::MODIFIER_COLOR:
488                         break;
489                 case TextureStage::MODIFIER_INVCOLOR:
490                         mod1.x = SubSat(Short4(0x1000), arg1->x);
491                         mod1.y = SubSat(Short4(0x1000), arg1->y);
492                         mod1.z = SubSat(Short4(0x1000), arg1->z);
493                         mod1.w = SubSat(Short4(0x1000), arg1->w);
494
495                         arg1 = &mod1;
496                         break;
497                 case TextureStage::MODIFIER_ALPHA:
498                         mod1.x = arg1->w;
499                         mod1.y = arg1->w;
500                         mod1.z = arg1->w;
501                         mod1.w = arg1->w;
502
503                         arg1 = &mod1;
504                         break;
505                 case TextureStage::MODIFIER_INVALPHA:
506                         mod1.x = SubSat(Short4(0x1000), arg1->w);
507                         mod1.y = SubSat(Short4(0x1000), arg1->w);
508                         mod1.z = SubSat(Short4(0x1000), arg1->w);
509                         mod1.w = SubSat(Short4(0x1000), arg1->w);
510
511                         arg1 = &mod1;
512                         break;
513                 default:
514                         ASSERT(false);
515                 }
516
517                 switch(textureStage.secondModifier)
518                 {
519                 case TextureStage::MODIFIER_COLOR:
520                         break;
521                 case TextureStage::MODIFIER_INVCOLOR:
522                         mod2.x = SubSat(Short4(0x1000), arg2->x);
523                         mod2.y = SubSat(Short4(0x1000), arg2->y);
524                         mod2.z = SubSat(Short4(0x1000), arg2->z);
525                         mod2.w = SubSat(Short4(0x1000), arg2->w);
526
527                         arg2 = &mod2;
528                         break;
529                 case TextureStage::MODIFIER_ALPHA:
530                         mod2.x = arg2->w;
531                         mod2.y = arg2->w;
532                         mod2.z = arg2->w;
533                         mod2.w = arg2->w;
534
535                         arg2 = &mod2;
536                         break;
537                 case TextureStage::MODIFIER_INVALPHA:
538                         mod2.x = SubSat(Short4(0x1000), arg2->w);
539                         mod2.y = SubSat(Short4(0x1000), arg2->w);
540                         mod2.z = SubSat(Short4(0x1000), arg2->w);
541                         mod2.w = SubSat(Short4(0x1000), arg2->w);
542
543                         arg2 = &mod2;
544                         break;
545                 default:
546                         ASSERT(false);
547                 }
548
549                 switch(textureStage.thirdModifier)
550                 {
551                 case TextureStage::MODIFIER_COLOR:
552                         break;
553                 case TextureStage::MODIFIER_INVCOLOR:
554                         mod3.x = SubSat(Short4(0x1000), arg3->x);
555                         mod3.y = SubSat(Short4(0x1000), arg3->y);
556                         mod3.z = SubSat(Short4(0x1000), arg3->z);
557                         mod3.w = SubSat(Short4(0x1000), arg3->w);
558
559                         arg3 = &mod3;
560                         break;
561                 case TextureStage::MODIFIER_ALPHA:
562                         mod3.x = arg3->w;
563                         mod3.y = arg3->w;
564                         mod3.z = arg3->w;
565                         mod3.w = arg3->w;
566
567                         arg3 = &mod3;
568                         break;
569                 case TextureStage::MODIFIER_INVALPHA:
570                         mod3.x = SubSat(Short4(0x1000), arg3->w);
571                         mod3.y = SubSat(Short4(0x1000), arg3->w);
572                         mod3.z = SubSat(Short4(0x1000), arg3->w);
573                         mod3.w = SubSat(Short4(0x1000), arg3->w);
574
575                         arg3 = &mod3;
576                         break;
577                 default:
578                         ASSERT(false);
579                 }
580
581                 switch(textureStage.stageOperation)
582                 {
583                 case TextureStage::STAGE_DISABLE:
584                         break;
585                 case TextureStage::STAGE_SELECTARG1: // Arg1
586                         res.x = arg1->x;
587                         res.y = arg1->y;
588                         res.z = arg1->z;
589                         break;
590                 case TextureStage::STAGE_SELECTARG2: // Arg2
591                         res.x = arg2->x;
592                         res.y = arg2->y;
593                         res.z = arg2->z;
594                         break;
595                 case TextureStage::STAGE_SELECTARG3: // Arg3
596                         res.x = arg3->x;
597                         res.y = arg3->y;
598                         res.z = arg3->z;
599                         break;
600                 case TextureStage::STAGE_MODULATE: // Arg1 * Arg2
601                         res.x = MulHigh(arg1->x, arg2->x) << 4;
602                         res.y = MulHigh(arg1->y, arg2->y) << 4;
603                         res.z = MulHigh(arg1->z, arg2->z) << 4;
604                         break;
605                 case TextureStage::STAGE_MODULATE2X: // Arg1 * Arg2 * 2
606                         res.x = MulHigh(arg1->x, arg2->x) << 5;
607                         res.y = MulHigh(arg1->y, arg2->y) << 5;
608                         res.z = MulHigh(arg1->z, arg2->z) << 5;
609                         break;
610                 case TextureStage::STAGE_MODULATE4X: // Arg1 * Arg2 * 4
611                         res.x = MulHigh(arg1->x, arg2->x) << 6;
612                         res.y = MulHigh(arg1->y, arg2->y) << 6;
613                         res.z = MulHigh(arg1->z, arg2->z) << 6;
614                         break;
615                 case TextureStage::STAGE_ADD: // Arg1 + Arg2
616                         res.x = AddSat(arg1->x, arg2->x);
617                         res.y = AddSat(arg1->y, arg2->y);
618                         res.z = AddSat(arg1->z, arg2->z);
619                         break;
620                 case TextureStage::STAGE_ADDSIGNED: // Arg1 + Arg2 - 0.5
621                         res.x = AddSat(arg1->x, arg2->x);
622                         res.y = AddSat(arg1->y, arg2->y);
623                         res.z = AddSat(arg1->z, arg2->z);
624
625                         res.x = SubSat(res.x, Short4(0x0800, 0x0800, 0x0800, 0x0800));
626                         res.y = SubSat(res.y, Short4(0x0800, 0x0800, 0x0800, 0x0800));
627                         res.z = SubSat(res.z, Short4(0x0800, 0x0800, 0x0800, 0x0800));
628                         break;
629                 case TextureStage::STAGE_ADDSIGNED2X: // (Arg1 + Arg2 - 0.5) << 1
630                         res.x = AddSat(arg1->x, arg2->x);
631                         res.y = AddSat(arg1->y, arg2->y);
632                         res.z = AddSat(arg1->z, arg2->z);
633
634                         res.x = SubSat(res.x, Short4(0x0800, 0x0800, 0x0800, 0x0800));
635                         res.y = SubSat(res.y, Short4(0x0800, 0x0800, 0x0800, 0x0800));
636                         res.z = SubSat(res.z, Short4(0x0800, 0x0800, 0x0800, 0x0800));
637
638                         res.x = AddSat(res.x, res.x);
639                         res.y = AddSat(res.y, res.y);
640                         res.z = AddSat(res.z, res.z);
641                         break;
642                 case TextureStage::STAGE_SUBTRACT: // Arg1 - Arg2
643                         res.x = SubSat(arg1->x, arg2->x);
644                         res.y = SubSat(arg1->y, arg2->y);
645                         res.z = SubSat(arg1->z, arg2->z);
646                         break;
647                 case TextureStage::STAGE_ADDSMOOTH: // Arg1 + Arg2 - Arg1 * Arg2
648                         {
649                                 Short4 tmp;
650
651                                 tmp = MulHigh(arg1->x, arg2->x) << 4; res.x = AddSat(arg1->x, arg2->x); res.x = SubSat(res.x, tmp);
652                                 tmp = MulHigh(arg1->y, arg2->y) << 4; res.y = AddSat(arg1->y, arg2->y); res.y = SubSat(res.y, tmp);
653                                 tmp = MulHigh(arg1->z, arg2->z) << 4; res.z = AddSat(arg1->z, arg2->z); res.z = SubSat(res.z, tmp);
654                         }
655                         break;
656                 case TextureStage::STAGE_MULTIPLYADD: // Arg3 + Arg1 * Arg2
657                         res.x = MulHigh(arg1->x, arg2->x) << 4; res.x = AddSat(res.x, arg3->x);
658                         res.y = MulHigh(arg1->y, arg2->y) << 4; res.y = AddSat(res.y, arg3->y);
659                         res.z = MulHigh(arg1->z, arg2->z) << 4; res.z = AddSat(res.z, arg3->z);
660                         break;
661                 case TextureStage::STAGE_LERP: // Arg3 * (Arg1 - Arg2) + Arg2
662                         res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, arg3->x) << 4; res.x = AddSat(res.x, arg2->x);
663                         res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, arg3->y) << 4; res.y = AddSat(res.y, arg2->y);
664                         res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, arg3->z) << 4; res.z = AddSat(res.z, arg2->z);
665                         break;
666                 case TextureStage::STAGE_DOT3: // 2 * (Arg1.x - 0.5) * 2 * (Arg2.x - 0.5) + 2 * (Arg1.y - 0.5) * 2 * (Arg2.y - 0.5) + 2 * (Arg1.z - 0.5) * 2 * (Arg2.z - 0.5)
667                         {
668                                 Short4 tmp;
669
670                                 res.x = SubSat(arg1->x, Short4(0x0800, 0x0800, 0x0800, 0x0800)); tmp = SubSat(arg2->x, Short4(0x0800, 0x0800, 0x0800, 0x0800)); res.x = MulHigh(res.x, tmp);
671                                 res.y = SubSat(arg1->y, Short4(0x0800, 0x0800, 0x0800, 0x0800)); tmp = SubSat(arg2->y, Short4(0x0800, 0x0800, 0x0800, 0x0800)); res.y = MulHigh(res.y, tmp);
672                                 res.z = SubSat(arg1->z, Short4(0x0800, 0x0800, 0x0800, 0x0800)); tmp = SubSat(arg2->z, Short4(0x0800, 0x0800, 0x0800, 0x0800)); res.z = MulHigh(res.z, tmp);
673
674                                 res.x = res.x << 6;
675                                 res.y = res.y << 6;
676                                 res.z = res.z << 6;
677
678                                 res.x = AddSat(res.x, res.y);
679                                 res.x = AddSat(res.x, res.z);
680
681                                 // Clamp to [0, 1]
682                                 res.x = Max(res.x, Short4(0x0000, 0x0000, 0x0000, 0x0000));
683                                 res.x = Min(res.x, Short4(0x1000));
684
685                                 res.y = res.x;
686                                 res.z = res.x;
687                                 res.w = res.x;
688                         }
689                         break;
690                 case TextureStage::STAGE_BLENDCURRENTALPHA: // Alpha * (Arg1 - Arg2) + Arg2
691                         res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, current.w) << 4; res.x = AddSat(res.x, arg2->x);
692                         res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, current.w) << 4; res.y = AddSat(res.y, arg2->y);
693                         res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, current.w) << 4; res.z = AddSat(res.z, arg2->z);
694                         break;
695                 case TextureStage::STAGE_BLENDDIFFUSEALPHA: // Alpha * (Arg1 - Arg2) + Arg2
696                         res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, diffuse.w) << 4; res.x = AddSat(res.x, arg2->x);
697                         res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, diffuse.w) << 4; res.y = AddSat(res.y, arg2->y);
698                         res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, diffuse.w) << 4; res.z = AddSat(res.z, arg2->z);
699                         break;
700                 case TextureStage::STAGE_BLENDFACTORALPHA: // Alpha * (Arg1 - Arg2) + Arg2
701                         res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[3]))) << 4; res.x = AddSat(res.x, arg2->x);
702                         res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[3]))) << 4; res.y = AddSat(res.y, arg2->y);
703                         res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[3]))) << 4; res.z = AddSat(res.z, arg2->z);
704                         break;
705                 case TextureStage::STAGE_BLENDTEXTUREALPHA: // Alpha * (Arg1 - Arg2) + Arg2
706                         res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, texture.w) << 4; res.x = AddSat(res.x, arg2->x);
707                         res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, texture.w) << 4; res.y = AddSat(res.y, arg2->y);
708                         res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, texture.w) << 4; res.z = AddSat(res.z, arg2->z);
709                         break;
710                 case TextureStage::STAGE_BLENDTEXTUREALPHAPM: // Arg1 + Arg2 * (1 - Alpha)
711                         res.x = SubSat(Short4(0x1000), texture.w); res.x = MulHigh(res.x, arg2->x) << 4; res.x = AddSat(res.x, arg1->x);
712                         res.y = SubSat(Short4(0x1000), texture.w); res.y = MulHigh(res.y, arg2->y) << 4; res.y = AddSat(res.y, arg1->y);
713                         res.z = SubSat(Short4(0x1000), texture.w); res.z = MulHigh(res.z, arg2->z) << 4; res.z = AddSat(res.z, arg1->z);
714                         break;
715                 case TextureStage::STAGE_PREMODULATE:
716                         res.x = arg1->x;
717                         res.y = arg1->y;
718                         res.z = arg1->z;
719                         break;
720                 case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR: // Arg1 + Arg1.w * Arg2
721                         res.x = MulHigh(arg1->w, arg2->x) << 4; res.x = AddSat(res.x, arg1->x);
722                         res.y = MulHigh(arg1->w, arg2->y) << 4; res.y = AddSat(res.y, arg1->y);
723                         res.z = MulHigh(arg1->w, arg2->z) << 4; res.z = AddSat(res.z, arg1->z);
724                         break;
725                 case TextureStage::STAGE_MODULATECOLOR_ADDALPHA: // Arg1 * Arg2 + Arg1.w
726                         res.x = MulHigh(arg1->x, arg2->x) << 4; res.x = AddSat(res.x, arg1->w);
727                         res.y = MulHigh(arg1->y, arg2->y) << 4; res.y = AddSat(res.y, arg1->w);
728                         res.z = MulHigh(arg1->z, arg2->z) << 4; res.z = AddSat(res.z, arg1->w);
729                         break;
730                 case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR: // (1 - Arg1.w) * Arg2 + Arg1
731                         {
732                                 Short4 tmp;
733
734                                 res.x = AddSat(arg1->x, arg2->x); tmp = MulHigh(arg1->w, arg2->x) << 4; res.x = SubSat(res.x, tmp);
735                                 res.y = AddSat(arg1->y, arg2->y); tmp = MulHigh(arg1->w, arg2->y) << 4; res.y = SubSat(res.y, tmp);
736                                 res.z = AddSat(arg1->z, arg2->z); tmp = MulHigh(arg1->w, arg2->z) << 4; res.z = SubSat(res.z, tmp);
737                         }
738                         break;
739                 case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA: // (1 - Arg1) * Arg2 + Arg1.w
740                         {
741                                 Short4 tmp;
742
743                                 res.x = AddSat(arg1->w, arg2->x); tmp = MulHigh(arg1->x, arg2->x) << 4; res.x = SubSat(res.x, tmp);
744                                 res.y = AddSat(arg1->w, arg2->y); tmp = MulHigh(arg1->y, arg2->y) << 4; res.y = SubSat(res.y, tmp);
745                                 res.z = AddSat(arg1->w, arg2->z); tmp = MulHigh(arg1->z, arg2->z) << 4; res.z = SubSat(res.z, tmp);
746                         }
747                         break;
748                 case TextureStage::STAGE_BUMPENVMAP:
749                         {
750                                 du = Float4(texture.x) * Float4(1.0f / 0x0FE0);
751                                 dv = Float4(texture.y) * Float4(1.0f / 0x0FE0);
752
753                                 Float4 du2;
754                                 Float4 dv2;
755
756                                 du2 = du;
757                                 dv2 = dv;
758                                 du *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][0]));
759                                 dv2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][0]));
760                                 du += dv2;
761                                 dv *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][1]));
762                                 du2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][1]));
763                                 dv += du2;
764
765                                 perturbate = true;
766
767                                 res.x = current.x;
768                                 res.y = current.y;
769                                 res.z = current.z;
770                                 res.w = current.w;
771                         }
772                         break;
773                 case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
774                         {
775                                 du = Float4(texture.x) * Float4(1.0f / 0x0FE0);
776                                 dv = Float4(texture.y) * Float4(1.0f / 0x0FE0);
777
778                                 Float4 du2;
779                                 Float4 dv2;
780
781                                 du2 = du;
782                                 dv2 = dv;
783
784                                 du *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][0]));
785                                 dv2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][0]));
786                                 du += dv2;
787                                 dv *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][1]));
788                                 du2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][1]));
789                                 dv += du2;
790
791                                 perturbate = true;
792
793                                 L = texture.z;
794                                 L = MulHigh(L, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].luminanceScale4)));
795                                 L = L << 4;
796                                 L = AddSat(L, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].luminanceOffset4)));
797                                 L = Max(L, Short4(0x0000, 0x0000, 0x0000, 0x0000));
798                                 L = Min(L, Short4(0x1000));
799
800                                 luminance = true;
801
802                                 res.x = current.x;
803                                 res.y = current.y;
804                                 res.z = current.z;
805                                 res.w = current.w;
806                         }
807                         break;
808                 default:
809                         ASSERT(false);
810                 }
811
812                 if(textureStage.stageOperation != TextureStage::STAGE_DOT3)
813                 {
814                         switch(textureStage.firstArgumentAlpha)
815                         {
816                         case TextureStage::SOURCE_TEXTURE:      arg1 = &texture;                break;
817                         case TextureStage::SOURCE_CONSTANT:     arg1 = &constant;               break;
818                         case TextureStage::SOURCE_CURRENT:      arg1 = &current;                break;
819                         case TextureStage::SOURCE_DIFFUSE:      arg1 = &diffuse;                break;
820                         case TextureStage::SOURCE_SPECULAR:     arg1 = &specular;               break;
821                         case TextureStage::SOURCE_TEMP:         arg1 = &temp;                   break;
822                         case TextureStage::SOURCE_TFACTOR:      arg1 = &tfactor;                break;
823                         default:
824                                 ASSERT(false);
825                         }
826
827                         switch(textureStage.secondArgumentAlpha)
828                         {
829                         case TextureStage::SOURCE_TEXTURE:      arg2 = &texture;                break;
830                         case TextureStage::SOURCE_CONSTANT:     arg2 = &constant;               break;
831                         case TextureStage::SOURCE_CURRENT:      arg2 = &current;                break;
832                         case TextureStage::SOURCE_DIFFUSE:      arg2 = &diffuse;                break;
833                         case TextureStage::SOURCE_SPECULAR:     arg2 = &specular;               break;
834                         case TextureStage::SOURCE_TEMP:         arg2 = &temp;                   break;
835                         case TextureStage::SOURCE_TFACTOR:      arg2 = &tfactor;                break;
836                         default:
837                                 ASSERT(false);
838                         }
839
840                         switch(textureStage.thirdArgumentAlpha)
841                         {
842                         case TextureStage::SOURCE_TEXTURE:      arg3 = &texture;                break;
843                         case TextureStage::SOURCE_CONSTANT:     arg3 = &constant;               break;
844                         case TextureStage::SOURCE_CURRENT:      arg3 = &current;                break;
845                         case TextureStage::SOURCE_DIFFUSE:      arg3 = &diffuse;                break;
846                         case TextureStage::SOURCE_SPECULAR:     arg3 = &specular;               break;
847                         case TextureStage::SOURCE_TEMP:         arg3 = &temp;                   break;
848                         case TextureStage::SOURCE_TFACTOR:      arg3 = &tfactor;                break;
849                         default:
850                                 ASSERT(false);
851                         }
852
853                         switch(textureStage.firstModifierAlpha)   // FIXME: Check if actually used
854                         {
855                         case TextureStage::MODIFIER_COLOR:
856                                 break;
857                         case TextureStage::MODIFIER_INVCOLOR:
858                                 mod1.w = SubSat(Short4(0x1000), arg1->w);
859
860                                 arg1 = &mod1;
861                                 break;
862                         case TextureStage::MODIFIER_ALPHA:
863                                 // Redundant
864                                 break;
865                         case TextureStage::MODIFIER_INVALPHA:
866                                 mod1.w = SubSat(Short4(0x1000), arg1->w);
867
868                                 arg1 = &mod1;
869                                 break;
870                         default:
871                                 ASSERT(false);
872                         }
873
874                         switch(textureStage.secondModifierAlpha)   // FIXME: Check if actually used
875                         {
876                         case TextureStage::MODIFIER_COLOR:
877                                 break;
878                         case TextureStage::MODIFIER_INVCOLOR:
879                                 mod2.w = SubSat(Short4(0x1000), arg2->w);
880
881                                 arg2 = &mod2;
882                                 break;
883                         case TextureStage::MODIFIER_ALPHA:
884                                 // Redundant
885                                 break;
886                         case TextureStage::MODIFIER_INVALPHA:
887                                 mod2.w = SubSat(Short4(0x1000), arg2->w);
888
889                                 arg2 = &mod2;
890                                 break;
891                         default:
892                                 ASSERT(false);
893                         }
894
895                         switch(textureStage.thirdModifierAlpha)   // FIXME: Check if actually used
896                         {
897                         case TextureStage::MODIFIER_COLOR:
898                                 break;
899                         case TextureStage::MODIFIER_INVCOLOR:
900                                 mod3.w = SubSat(Short4(0x1000), arg3->w);
901
902                                 arg3 = &mod3;
903                                 break;
904                         case TextureStage::MODIFIER_ALPHA:
905                                 // Redundant
906                                 break;
907                         case TextureStage::MODIFIER_INVALPHA:
908                                 mod3.w = SubSat(Short4(0x1000), arg3->w);
909
910                                 arg3 = &mod3;
911                                 break;
912                         default:
913                                 ASSERT(false);
914                         }
915
916                         switch(textureStage.stageOperationAlpha)
917                         {
918                         case TextureStage::STAGE_DISABLE:
919                                 break;
920                         case TextureStage::STAGE_SELECTARG1: // Arg1
921                                 res.w = arg1->w;
922                                 break;
923                         case TextureStage::STAGE_SELECTARG2: // Arg2
924                                 res.w = arg2->w;
925                                 break;
926                         case TextureStage::STAGE_SELECTARG3: // Arg3
927                                 res.w = arg3->w;
928                                 break;
929                         case TextureStage::STAGE_MODULATE: // Arg1 * Arg2
930                                 res.w = MulHigh(arg1->w, arg2->w) << 4;
931                                 break;
932                         case TextureStage::STAGE_MODULATE2X: // Arg1 * Arg2 * 2
933                                 res.w = MulHigh(arg1->w, arg2->w) << 5;
934                                 break;
935                         case TextureStage::STAGE_MODULATE4X: // Arg1 * Arg2 * 4
936                                 res.w = MulHigh(arg1->w, arg2->w) << 6;
937                                 break;
938                         case TextureStage::STAGE_ADD: // Arg1 + Arg2
939                                 res.w = AddSat(arg1->w, arg2->w);
940                                 break;
941                         case TextureStage::STAGE_ADDSIGNED: // Arg1 + Arg2 - 0.5
942                                 res.w = AddSat(arg1->w, arg2->w);
943                                 res.w = SubSat(res.w, Short4(0x0800, 0x0800, 0x0800, 0x0800));
944                                 break;
945                         case TextureStage::STAGE_ADDSIGNED2X: // (Arg1 + Arg2 - 0.5) << 1
946                                 res.w = AddSat(arg1->w, arg2->w);
947                                 res.w = SubSat(res.w, Short4(0x0800, 0x0800, 0x0800, 0x0800));
948                                 res.w = AddSat(res.w, res.w);
949                                 break;
950                         case TextureStage::STAGE_SUBTRACT: // Arg1 - Arg2
951                                 res.w = SubSat(arg1->w, arg2->w);
952                                 break;
953                         case TextureStage::STAGE_ADDSMOOTH: // Arg1 + Arg2 - Arg1 * Arg2
954                                 {
955                                         Short4 tmp;
956
957                                         tmp = MulHigh(arg1->w, arg2->w) << 4; res.w = AddSat(arg1->w, arg2->w); res.w = SubSat(res.w, tmp);
958                                 }
959                                 break;
960                         case TextureStage::STAGE_MULTIPLYADD: // Arg3 + Arg1 * Arg2
961                                 res.w = MulHigh(arg1->w, arg2->w) << 4; res.w = AddSat(res.w, arg3->w);
962                                 break;
963                         case TextureStage::STAGE_LERP: // Arg3 * (Arg1 - Arg2) + Arg2
964                                 res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, arg3->w) << 4; res.w = AddSat(res.w, arg2->w);
965                                 break;
966                         case TextureStage::STAGE_DOT3:
967                                 break;   // Already computed in color channel
968                         case TextureStage::STAGE_BLENDCURRENTALPHA: // Alpha * (Arg1 - Arg2) + Arg2
969                                 res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, current.w) << 4; res.w = AddSat(res.w, arg2->w);
970                                 break;
971                         case TextureStage::STAGE_BLENDDIFFUSEALPHA: // Arg1 * (Alpha) + Arg2 * (1 - Alpha)
972                                 res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, diffuse.w) << 4; res.w = AddSat(res.w, arg2->w);
973                                 break;
974                         case TextureStage::STAGE_BLENDFACTORALPHA:
975                                 res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[3]))) << 4; res.w = AddSat(res.w, arg2->w);
976                                 break;
977                         case TextureStage::STAGE_BLENDTEXTUREALPHA: // Arg1 * (Alpha) + Arg2 * (1 - Alpha)
978                                 res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, texture.w) << 4; res.w = AddSat(res.w, arg2->w);
979                                 break;
980                         case TextureStage::STAGE_BLENDTEXTUREALPHAPM: // Arg1 + Arg2 * (1 - Alpha)
981                                 res.w = SubSat(Short4(0x1000), texture.w); res.w = MulHigh(res.w, arg2->w) << 4; res.w = AddSat(res.w, arg1->w);
982                                 break;
983                         case TextureStage::STAGE_PREMODULATE:
984                                 res.w = arg1->w;
985                                 break;
986                         case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:
987                         case TextureStage::STAGE_MODULATECOLOR_ADDALPHA:
988                         case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR:
989                         case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA:
990                         case TextureStage::STAGE_BUMPENVMAP:
991                         case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
992                                 break;   // Invalid alpha operations
993                         default:
994                                 ASSERT(false);
995                         }
996                 }
997
998                 // Clamp result to [0, 1]
999
1000                 switch(textureStage.stageOperation)
1001                 {
1002                 case TextureStage::STAGE_DISABLE:
1003                 case TextureStage::STAGE_SELECTARG1:
1004                 case TextureStage::STAGE_SELECTARG2:
1005                 case TextureStage::STAGE_SELECTARG3:
1006                 case TextureStage::STAGE_MODULATE:
1007                 case TextureStage::STAGE_MODULATE2X:
1008                 case TextureStage::STAGE_MODULATE4X:
1009                 case TextureStage::STAGE_ADD:
1010                 case TextureStage::STAGE_MULTIPLYADD:
1011                 case TextureStage::STAGE_LERP:
1012                 case TextureStage::STAGE_BLENDCURRENTALPHA:
1013                 case TextureStage::STAGE_BLENDDIFFUSEALPHA:
1014                 case TextureStage::STAGE_BLENDFACTORALPHA:
1015                 case TextureStage::STAGE_BLENDTEXTUREALPHA:
1016                 case TextureStage::STAGE_BLENDTEXTUREALPHAPM:
1017                 case TextureStage::STAGE_DOT3:   // Already clamped
1018                 case TextureStage::STAGE_PREMODULATE:
1019                 case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:
1020                 case TextureStage::STAGE_MODULATECOLOR_ADDALPHA:
1021                 case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR:
1022                 case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA:
1023                 case TextureStage::STAGE_BUMPENVMAP:
1024                 case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
1025                         if(state.textureStage[stage].cantUnderflow)
1026                         {
1027                                 break;   // Can't go below zero
1028                         }
1029                 case TextureStage::STAGE_ADDSIGNED:
1030                 case TextureStage::STAGE_ADDSIGNED2X:
1031                 case TextureStage::STAGE_SUBTRACT:
1032                 case TextureStage::STAGE_ADDSMOOTH:
1033                         res.x = Max(res.x, Short4(0x0000, 0x0000, 0x0000, 0x0000));
1034                         res.y = Max(res.y, Short4(0x0000, 0x0000, 0x0000, 0x0000));
1035                         res.z = Max(res.z, Short4(0x0000, 0x0000, 0x0000, 0x0000));
1036                         break;
1037                 default:
1038                         ASSERT(false);
1039                 }
1040
1041                 switch(textureStage.stageOperationAlpha)
1042                 {
1043                 case TextureStage::STAGE_DISABLE:
1044                 case TextureStage::STAGE_SELECTARG1:
1045                 case TextureStage::STAGE_SELECTARG2:
1046                 case TextureStage::STAGE_SELECTARG3:
1047                 case TextureStage::STAGE_MODULATE:
1048                 case TextureStage::STAGE_MODULATE2X:
1049                 case TextureStage::STAGE_MODULATE4X:
1050                 case TextureStage::STAGE_ADD:
1051                 case TextureStage::STAGE_MULTIPLYADD:
1052                 case TextureStage::STAGE_LERP:
1053                 case TextureStage::STAGE_BLENDCURRENTALPHA:
1054                 case TextureStage::STAGE_BLENDDIFFUSEALPHA:
1055                 case TextureStage::STAGE_BLENDFACTORALPHA:
1056                 case TextureStage::STAGE_BLENDTEXTUREALPHA:
1057                 case TextureStage::STAGE_BLENDTEXTUREALPHAPM:
1058                 case TextureStage::STAGE_DOT3:   // Already clamped
1059                 case TextureStage::STAGE_PREMODULATE:
1060                 case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:
1061                 case TextureStage::STAGE_MODULATECOLOR_ADDALPHA:
1062                 case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR:
1063                 case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA:
1064                 case TextureStage::STAGE_BUMPENVMAP:
1065                 case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
1066                         if(state.textureStage[stage].cantUnderflow)
1067                         {
1068                                 break;   // Can't go below zero
1069                         }
1070                 case TextureStage::STAGE_ADDSIGNED:
1071                 case TextureStage::STAGE_ADDSIGNED2X:
1072                 case TextureStage::STAGE_SUBTRACT:
1073                 case TextureStage::STAGE_ADDSMOOTH:
1074                         res.w = Max(res.w, Short4(0x0000, 0x0000, 0x0000, 0x0000));
1075                         break;
1076                 default:
1077                         ASSERT(false);
1078                 }
1079
1080                 switch(textureStage.stageOperation)
1081                 {
1082                 case TextureStage::STAGE_DISABLE:
1083                 case TextureStage::STAGE_SELECTARG1:
1084                 case TextureStage::STAGE_SELECTARG2:
1085                 case TextureStage::STAGE_SELECTARG3:
1086                 case TextureStage::STAGE_MODULATE:
1087                 case TextureStage::STAGE_SUBTRACT:
1088                 case TextureStage::STAGE_ADDSMOOTH:
1089                 case TextureStage::STAGE_LERP:
1090                 case TextureStage::STAGE_BLENDCURRENTALPHA:
1091                 case TextureStage::STAGE_BLENDDIFFUSEALPHA:
1092                 case TextureStage::STAGE_BLENDFACTORALPHA:
1093                 case TextureStage::STAGE_BLENDTEXTUREALPHA:
1094                 case TextureStage::STAGE_DOT3:   // Already clamped
1095                 case TextureStage::STAGE_PREMODULATE:
1096                 case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR:
1097                 case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA:
1098                 case TextureStage::STAGE_BUMPENVMAP:
1099                 case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
1100                         break;   // Can't go above one
1101                 case TextureStage::STAGE_MODULATE2X:
1102                 case TextureStage::STAGE_MODULATE4X:
1103                 case TextureStage::STAGE_ADD:
1104                 case TextureStage::STAGE_ADDSIGNED:
1105                 case TextureStage::STAGE_ADDSIGNED2X:
1106                 case TextureStage::STAGE_MULTIPLYADD:
1107                 case TextureStage::STAGE_BLENDTEXTUREALPHAPM:
1108                 case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:
1109                 case TextureStage::STAGE_MODULATECOLOR_ADDALPHA:
1110                         res.x = Min(res.x, Short4(0x1000));
1111                         res.y = Min(res.y, Short4(0x1000));
1112                         res.z = Min(res.z, Short4(0x1000));
1113                         break;
1114                 default:
1115                         ASSERT(false);
1116                 }
1117
1118                 switch(textureStage.stageOperationAlpha)
1119                 {
1120                 case TextureStage::STAGE_DISABLE:
1121                 case TextureStage::STAGE_SELECTARG1:
1122                 case TextureStage::STAGE_SELECTARG2:
1123                 case TextureStage::STAGE_SELECTARG3:
1124                 case TextureStage::STAGE_MODULATE:
1125                 case TextureStage::STAGE_SUBTRACT:
1126                 case TextureStage::STAGE_ADDSMOOTH:
1127                 case TextureStage::STAGE_LERP:
1128                 case TextureStage::STAGE_BLENDCURRENTALPHA:
1129                 case TextureStage::STAGE_BLENDDIFFUSEALPHA:
1130                 case TextureStage::STAGE_BLENDFACTORALPHA:
1131                 case TextureStage::STAGE_BLENDTEXTUREALPHA:
1132                 case TextureStage::STAGE_DOT3:   // Already clamped
1133                 case TextureStage::STAGE_PREMODULATE:
1134                 case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR:
1135                 case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA:
1136                 case TextureStage::STAGE_BUMPENVMAP:
1137                 case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
1138                         break;   // Can't go above one
1139                 case TextureStage::STAGE_MODULATE2X:
1140                 case TextureStage::STAGE_MODULATE4X:
1141                 case TextureStage::STAGE_ADD:
1142                 case TextureStage::STAGE_ADDSIGNED:
1143                 case TextureStage::STAGE_ADDSIGNED2X:
1144                 case TextureStage::STAGE_MULTIPLYADD:
1145                 case TextureStage::STAGE_BLENDTEXTUREALPHAPM:
1146                 case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:
1147                 case TextureStage::STAGE_MODULATECOLOR_ADDALPHA:
1148                         res.w = Min(res.w, Short4(0x1000));
1149                         break;
1150                 default:
1151                         ASSERT(false);
1152                 }
1153
1154                 switch(textureStage.destinationArgument)
1155                 {
1156                 case TextureStage::DESTINATION_CURRENT:
1157                         current.x = res.x;
1158                         current.y = res.y;
1159                         current.z = res.z;
1160                         current.w = res.w;
1161                         break;
1162                 case TextureStage::DESTINATION_TEMP:
1163                         temp.x = res.x;
1164                         temp.y = res.y;
1165                         temp.z = res.z;
1166                         temp.w = res.w;
1167                         break;
1168                 default:
1169                         ASSERT(false);
1170                 }
1171         }
1172
1173         void PixelPipeline::fogBlend(Vector4s &current, Float4 &f)
1174         {
1175                 if(!state.fogActive)
1176                 {
1177                         return;
1178                 }
1179
1180                 if(state.pixelFogMode != FOG_NONE)
1181                 {
1182                         pixelFog(f);
1183                 }
1184
1185                 UShort4 fog = convertFixed16(f, true);
1186
1187                 current.x = As<Short4>(MulHigh(As<UShort4>(current.x), fog));
1188                 current.y = As<Short4>(MulHigh(As<UShort4>(current.y), fog));
1189                 current.z = As<Short4>(MulHigh(As<UShort4>(current.z), fog));
1190
1191                 UShort4 invFog = UShort4(0xFFFFu) - fog;
1192
1193                 current.x += As<Short4>(MulHigh(invFog, *Pointer<UShort4>(data + OFFSET(DrawData, fog.color4[0]))));
1194                 current.y += As<Short4>(MulHigh(invFog, *Pointer<UShort4>(data + OFFSET(DrawData, fog.color4[1]))));
1195                 current.z += As<Short4>(MulHigh(invFog, *Pointer<UShort4>(data + OFFSET(DrawData, fog.color4[2]))));
1196         }
1197
1198         void PixelPipeline::specularPixel(Vector4s &current, Vector4s &specular)
1199         {
1200                 if(!state.specularAdd)
1201                 {
1202                         return;
1203                 }
1204
1205                 current.x = AddSat(current.x, specular.x);
1206                 current.y = AddSat(current.y, specular.y);
1207                 current.z = AddSat(current.z, specular.z);
1208         }
1209
1210         void PixelPipeline::sampleTexture(Vector4s &c, int coordinates, int stage, bool project)
1211         {
1212                 Float4 x = v[2 + coordinates].x;
1213                 Float4 y = v[2 + coordinates].y;
1214                 Float4 z = v[2 + coordinates].z;
1215                 Float4 w = v[2 + coordinates].w;
1216
1217                 if(perturbate)
1218                 {
1219                         x += du;
1220                         y += dv;
1221
1222                         perturbate = false;
1223                 }
1224
1225                 sampleTexture(c, stage, x, y, z, w, project);
1226         }
1227
1228         void PixelPipeline::sampleTexture(Vector4s &c, int stage, Float4 &u, Float4 &v, Float4 &w, Float4 &q, bool project)
1229         {
1230                 #if PERF_PROFILE
1231                         Long texTime = Ticks();
1232                 #endif
1233
1234                 Vector4f dsx;
1235                 Vector4f dsy;
1236
1237                 Pointer<Byte> texture = data + OFFSET(DrawData, mipmap) + stage * sizeof(Texture);
1238
1239                 if(!project)
1240                 {
1241                         sampler[stage]->sampleTexture(texture, c, u, v, w, q, dsx, dsy);
1242                 }
1243                 else
1244                 {
1245                         Float4 rq = reciprocal(q);
1246
1247                         Float4 u_q = u * rq;
1248                         Float4 v_q = v * rq;
1249                         Float4 w_q = w * rq;
1250
1251                         sampler[stage]->sampleTexture(texture, c, u_q, v_q, w_q, q, dsx, dsy);
1252                 }
1253
1254                 #if PERF_PROFILE
1255                         cycles[PERF_TEX] += Ticks() - texTime;
1256                 #endif
1257         }
1258
1259         Short4 PixelPipeline::convertFixed12(RValue<Float4> cf)
1260         {
1261                 return RoundShort4(cf * Float4(0x1000));
1262         }
1263
1264         void PixelPipeline::convertFixed12(Vector4s &cs, Vector4f &cf)
1265         {
1266                 cs.x = convertFixed12(cf.x);
1267                 cs.y = convertFixed12(cf.y);
1268                 cs.z = convertFixed12(cf.z);
1269                 cs.w = convertFixed12(cf.w);
1270         }
1271
1272         Float4 PixelPipeline::convertSigned12(Short4 &cs)
1273         {
1274                 return Float4(cs) * Float4(1.0f / 0x0FFE);
1275         }
1276
1277         void PixelPipeline::convertSigned12(Vector4f &cf, Vector4s &cs)
1278         {
1279                 cf.x = convertSigned12(cs.x);
1280                 cf.y = convertSigned12(cs.y);
1281                 cf.z = convertSigned12(cs.z);
1282                 cf.w = convertSigned12(cs.w);
1283         }
1284
1285         void PixelPipeline::writeDestination(Vector4s &d, const Dst &dst)
1286         {
1287                 switch(dst.type)
1288                 {
1289                 case Shader::PARAMETER_TEMP:
1290                         if(dst.mask & 0x1) rs[dst.index].x = d.x;
1291                         if(dst.mask & 0x2) rs[dst.index].y = d.y;
1292                         if(dst.mask & 0x4) rs[dst.index].z = d.z;
1293                         if(dst.mask & 0x8) rs[dst.index].w = d.w;
1294                         break;
1295                 case Shader::PARAMETER_INPUT:
1296                         if(dst.mask & 0x1) vs[dst.index].x = d.x;
1297                         if(dst.mask & 0x2) vs[dst.index].y = d.y;
1298                         if(dst.mask & 0x4) vs[dst.index].z = d.z;
1299                         if(dst.mask & 0x8) vs[dst.index].w = d.w;
1300                         break;
1301                 case Shader::PARAMETER_CONST: ASSERT(false); break;
1302                 case Shader::PARAMETER_TEXTURE:
1303                         if(dst.mask & 0x1) ts[dst.index].x = d.x;
1304                         if(dst.mask & 0x2) ts[dst.index].y = d.y;
1305                         if(dst.mask & 0x4) ts[dst.index].z = d.z;
1306                         if(dst.mask & 0x8) ts[dst.index].w = d.w;
1307                         break;
1308                 case Shader::PARAMETER_COLOROUT:
1309                         if(dst.mask & 0x1) vs[dst.index].x = d.x;
1310                         if(dst.mask & 0x2) vs[dst.index].y = d.y;
1311                         if(dst.mask & 0x4) vs[dst.index].z = d.z;
1312                         if(dst.mask & 0x8) vs[dst.index].w = d.w;
1313                         break;
1314                 default:
1315                         ASSERT(false);
1316                 }
1317         }
1318
1319         Vector4s PixelPipeline::fetchRegister(const Src &src)
1320         {
1321                 Vector4s *reg;
1322                 int i = src.index;
1323
1324                 Vector4s c;
1325
1326                 if(src.type == Shader::PARAMETER_CONST)
1327                 {
1328                         c.x = *Pointer<Short4>(data + OFFSET(DrawData, ps.cW[i][0]));
1329                         c.y = *Pointer<Short4>(data + OFFSET(DrawData, ps.cW[i][1]));
1330                         c.z = *Pointer<Short4>(data + OFFSET(DrawData, ps.cW[i][2]));
1331                         c.w = *Pointer<Short4>(data + OFFSET(DrawData, ps.cW[i][3]));
1332                 }
1333
1334                 switch(src.type)
1335                 {
1336                 case Shader::PARAMETER_TEMP:          reg = &rs[i]; break;
1337                 case Shader::PARAMETER_INPUT:         reg = &vs[i]; break;
1338                 case Shader::PARAMETER_CONST:         reg = &c;       break;
1339                 case Shader::PARAMETER_TEXTURE:       reg = &ts[i]; break;
1340                 case Shader::PARAMETER_VOID:          return rs[0]; // Dummy
1341                 case Shader::PARAMETER_FLOAT4LITERAL: return rs[0]; // Dummy
1342                 default: ASSERT(false); return rs[0];
1343                 }
1344
1345                 const Short4 &x = (*reg)[(src.swizzle >> 0) & 0x3];
1346                 const Short4 &y = (*reg)[(src.swizzle >> 2) & 0x3];
1347                 const Short4 &z = (*reg)[(src.swizzle >> 4) & 0x3];
1348                 const Short4 &w = (*reg)[(src.swizzle >> 6) & 0x3];
1349
1350                 Vector4s mod;
1351
1352                 switch(src.modifier)
1353                 {
1354                 case Shader::MODIFIER_NONE:
1355                         mod.x = x;
1356                         mod.y = y;
1357                         mod.z = z;
1358                         mod.w = w;
1359                         break;
1360                 case Shader::MODIFIER_BIAS:
1361                         mod.x = SubSat(x, Short4(0x0800, 0x0800, 0x0800, 0x0800));
1362                         mod.y = SubSat(y, Short4(0x0800, 0x0800, 0x0800, 0x0800));
1363                         mod.z = SubSat(z, Short4(0x0800, 0x0800, 0x0800, 0x0800));
1364                         mod.w = SubSat(w, Short4(0x0800, 0x0800, 0x0800, 0x0800));
1365                         break;
1366                 case Shader::MODIFIER_BIAS_NEGATE:
1367                         mod.x = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), x);
1368                         mod.y = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), y);
1369                         mod.z = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), z);
1370                         mod.w = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), w);
1371                         break;
1372                 case Shader::MODIFIER_COMPLEMENT:
1373                         mod.x = SubSat(Short4(0x1000), x);
1374                         mod.y = SubSat(Short4(0x1000), y);
1375                         mod.z = SubSat(Short4(0x1000), z);
1376                         mod.w = SubSat(Short4(0x1000), w);
1377                         break;
1378                 case Shader::MODIFIER_NEGATE:
1379                         mod.x = -x;
1380                         mod.y = -y;
1381                         mod.z = -z;
1382                         mod.w = -w;
1383                         break;
1384                 case Shader::MODIFIER_X2:
1385                         mod.x = AddSat(x, x);
1386                         mod.y = AddSat(y, y);
1387                         mod.z = AddSat(z, z);
1388                         mod.w = AddSat(w, w);
1389                         break;
1390                 case Shader::MODIFIER_X2_NEGATE:
1391                         mod.x = -AddSat(x, x);
1392                         mod.y = -AddSat(y, y);
1393                         mod.z = -AddSat(z, z);
1394                         mod.w = -AddSat(w, w);
1395                         break;
1396                 case Shader::MODIFIER_SIGN:
1397                         mod.x = SubSat(x, Short4(0x0800, 0x0800, 0x0800, 0x0800));
1398                         mod.y = SubSat(y, Short4(0x0800, 0x0800, 0x0800, 0x0800));
1399                         mod.z = SubSat(z, Short4(0x0800, 0x0800, 0x0800, 0x0800));
1400                         mod.w = SubSat(w, Short4(0x0800, 0x0800, 0x0800, 0x0800));
1401                         mod.x = AddSat(mod.x, mod.x);
1402                         mod.y = AddSat(mod.y, mod.y);
1403                         mod.z = AddSat(mod.z, mod.z);
1404                         mod.w = AddSat(mod.w, mod.w);
1405                         break;
1406                 case Shader::MODIFIER_SIGN_NEGATE:
1407                         mod.x = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), x);
1408                         mod.y = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), y);
1409                         mod.z = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), z);
1410                         mod.w = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), w);
1411                         mod.x = AddSat(mod.x, mod.x);
1412                         mod.y = AddSat(mod.y, mod.y);
1413                         mod.z = AddSat(mod.z, mod.z);
1414                         mod.w = AddSat(mod.w, mod.w);
1415                         break;
1416                 case Shader::MODIFIER_DZ:
1417                         mod.x = x;
1418                         mod.y = y;
1419                         mod.z = z;
1420                         mod.w = w;
1421                         // Projection performed by texture sampler
1422                         break;
1423                 case Shader::MODIFIER_DW:
1424                         mod.x = x;
1425                         mod.y = y;
1426                         mod.z = z;
1427                         mod.w = w;
1428                         // Projection performed by texture sampler
1429                         break;
1430                 default:
1431                         ASSERT(false);
1432                 }
1433
1434                 if(src.type == Shader::PARAMETER_CONST && (src.modifier == Shader::MODIFIER_X2 || src.modifier == Shader::MODIFIER_X2_NEGATE))
1435                 {
1436                         mod.x = Min(mod.x, Short4(0x1000)); mod.x = Max(mod.x, Short4(-0x1000, -0x1000, -0x1000, -0x1000));
1437                         mod.y = Min(mod.y, Short4(0x1000)); mod.y = Max(mod.y, Short4(-0x1000, -0x1000, -0x1000, -0x1000));
1438                         mod.z = Min(mod.z, Short4(0x1000)); mod.z = Max(mod.z, Short4(-0x1000, -0x1000, -0x1000, -0x1000));
1439                         mod.w = Min(mod.w, Short4(0x1000)); mod.w = Max(mod.w, Short4(-0x1000, -0x1000, -0x1000, -0x1000));
1440                 }
1441
1442                 return mod;
1443         }
1444
1445         void PixelPipeline::MOV(Vector4s &dst, Vector4s &src0)
1446         {
1447                 dst.x = src0.x;
1448                 dst.y = src0.y;
1449                 dst.z = src0.z;
1450                 dst.w = src0.w;
1451         }
1452
1453         void PixelPipeline::ADD(Vector4s &dst, Vector4s &src0, Vector4s &src1)
1454         {
1455                 dst.x = AddSat(src0.x, src1.x);
1456                 dst.y = AddSat(src0.y, src1.y);
1457                 dst.z = AddSat(src0.z, src1.z);
1458                 dst.w = AddSat(src0.w, src1.w);
1459         }
1460
1461         void PixelPipeline::SUB(Vector4s &dst, Vector4s &src0, Vector4s &src1)
1462         {
1463                 dst.x = SubSat(src0.x, src1.x);
1464                 dst.y = SubSat(src0.y, src1.y);
1465                 dst.z = SubSat(src0.z, src1.z);
1466                 dst.w = SubSat(src0.w, src1.w);
1467         }
1468
1469         void PixelPipeline::MAD(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2)
1470         {
1471                 // FIXME: Long fixed-point multiply fixup
1472                 { dst.x = MulHigh(src0.x, src1.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, src2.x); }
1473                 {
1474                 dst.y = MulHigh(src0.y, src1.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, src2.y);
1475         }
1476                 {dst.z = MulHigh(src0.z, src1.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, src2.z); }
1477                 {dst.w = MulHigh(src0.w, src1.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, src2.w); }
1478         }
1479
1480         void PixelPipeline::MUL(Vector4s &dst, Vector4s &src0, Vector4s &src1)
1481         {
1482                 // FIXME: Long fixed-point multiply fixup
1483                 { dst.x = MulHigh(src0.x, src1.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); }
1484                 {
1485                 dst.y = MulHigh(src0.y, src1.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, dst.y);
1486         }
1487                 {dst.z = MulHigh(src0.z, src1.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, dst.z); }
1488                 {dst.w = MulHigh(src0.w, src1.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); }
1489         }
1490
1491         void PixelPipeline::DP3(Vector4s &dst, Vector4s &src0, Vector4s &src1)
1492         {
1493                 Short4 t0;
1494                 Short4 t1;
1495
1496                 // FIXME: Long fixed-point multiply fixup
1497                 t0 = MulHigh(src0.x, src1.x); t0 = AddSat(t0, t0); t0 = AddSat(t0, t0); t0 = AddSat(t0, t0); t0 = AddSat(t0, t0);
1498                 t1 = MulHigh(src0.y, src1.y); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1);
1499                 t0 = AddSat(t0, t1);
1500                 t1 = MulHigh(src0.z, src1.z); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1);
1501                 t0 = AddSat(t0, t1);
1502
1503                 dst.x = t0;
1504                 dst.y = t0;
1505                 dst.z = t0;
1506                 dst.w = t0;
1507         }
1508
1509         void PixelPipeline::DP4(Vector4s &dst, Vector4s &src0, Vector4s &src1)
1510         {
1511                 Short4 t0;
1512                 Short4 t1;
1513
1514                 // FIXME: Long fixed-point multiply fixup
1515                 t0 = MulHigh(src0.x, src1.x); t0 = AddSat(t0, t0); t0 = AddSat(t0, t0); t0 = AddSat(t0, t0); t0 = AddSat(t0, t0);
1516                 t1 = MulHigh(src0.y, src1.y); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1);
1517                 t0 = AddSat(t0, t1);
1518                 t1 = MulHigh(src0.z, src1.z); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1);
1519                 t0 = AddSat(t0, t1);
1520                 t1 = MulHigh(src0.w, src1.w); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1);
1521                 t0 = AddSat(t0, t1);
1522
1523                 dst.x = t0;
1524                 dst.y = t0;
1525                 dst.z = t0;
1526                 dst.w = t0;
1527         }
1528
1529         void PixelPipeline::LRP(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2)
1530         {
1531                 // FIXME: Long fixed-point multiply fixup
1532                 { dst.x = SubSat(src1.x, src2.x); dst.x = MulHigh(dst.x, src0.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, src2.x); }
1533                 {
1534                 dst.y = SubSat(src1.y, src2.y); dst.y = MulHigh(dst.y, src0.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, src2.y);
1535         }
1536                 {dst.z = SubSat(src1.z, src2.z); dst.z = MulHigh(dst.z, src0.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, src2.z); }
1537                 {dst.w = SubSat(src1.w, src2.w); dst.w = MulHigh(dst.w, src0.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, src2.w); }
1538         }
1539
1540         void PixelPipeline::TEXCOORD(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int coordinate)
1541         {
1542                 Float4 uw;
1543                 Float4 vw;
1544                 Float4 sw;
1545
1546                 if(state.interpolant[2 + coordinate].component & 0x01)
1547                 {
1548                         uw = Max(u, Float4(0.0f));
1549                         uw = Min(uw, Float4(1.0f));
1550                         dst.x = convertFixed12(uw);
1551                 }
1552                 else
1553                 {
1554                         dst.x = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1555                 }
1556
1557                 if(state.interpolant[2 + coordinate].component & 0x02)
1558                 {
1559                         vw = Max(v, Float4(0.0f));
1560                         vw = Min(vw, Float4(1.0f));
1561                         dst.y = convertFixed12(vw);
1562                 }
1563                 else
1564                 {
1565                         dst.y = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1566                 }
1567
1568                 if(state.interpolant[2 + coordinate].component & 0x04)
1569                 {
1570                         sw = Max(s, Float4(0.0f));
1571                         sw = Min(sw, Float4(1.0f));
1572                         dst.z = convertFixed12(sw);
1573                 }
1574                 else
1575                 {
1576                         dst.z = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1577                 }
1578
1579                 dst.w = Short4(0x1000);
1580         }
1581
1582         void PixelPipeline::TEXCRD(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int coordinate, bool project)
1583         {
1584                 Float4 uw = u;
1585                 Float4 vw = v;
1586                 Float4 sw = s;
1587
1588                 if(project)
1589                 {
1590                         uw *= Rcp_pp(s);
1591                         vw *= Rcp_pp(s);
1592                 }
1593
1594                 if(state.interpolant[2 + coordinate].component & 0x01)
1595                 {
1596                         uw *= Float4(0x1000);
1597                         uw = Max(uw, Float4(-0x8000));
1598                         uw = Min(uw, Float4(0x7FFF));
1599                         dst.x = RoundShort4(uw);
1600                 }
1601                 else
1602                 {
1603                         dst.x = Short4(0x0000);
1604                 }
1605
1606                 if(state.interpolant[2 + coordinate].component & 0x02)
1607                 {
1608                         vw *= Float4(0x1000);
1609                         vw = Max(vw, Float4(-0x8000));
1610                         vw = Min(vw, Float4(0x7FFF));
1611                         dst.y = RoundShort4(vw);
1612                 }
1613                 else
1614                 {
1615                         dst.y = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1616                 }
1617
1618                 if(state.interpolant[2 + coordinate].component & 0x04)
1619                 {
1620                         sw *= Float4(0x1000);
1621                         sw = Max(sw, Float4(-0x8000));
1622                         sw = Min(sw, Float4(0x7FFF));
1623                         dst.z = RoundShort4(sw);
1624                 }
1625                 else
1626                 {
1627                         dst.z = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1628                 }
1629         }
1630
1631         void PixelPipeline::TEXDP3(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, Vector4s &src)
1632         {
1633                 TEXM3X3PAD(u, v, s, src, 0, false);
1634
1635                 Short4 t0 = RoundShort4(u_ * Float4(0x1000));
1636
1637                 dst.x = t0;
1638                 dst.y = t0;
1639                 dst.z = t0;
1640                 dst.w = t0;
1641         }
1642
1643         void PixelPipeline::TEXDP3TEX(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0)
1644         {
1645                 TEXM3X3PAD(u, v, s, src0, 0, false);
1646
1647                 v_ = Float4(0.0f);
1648                 w_ = Float4(0.0f);
1649
1650                 sampleTexture(dst, stage, u_, v_, w_, w_);
1651         }
1652
1653         void PixelPipeline::TEXKILL(Int cMask[4], Float4 &u, Float4 &v, Float4 &s)
1654         {
1655                 Int kill = SignMask(CmpNLT(u, Float4(0.0f))) &
1656                         SignMask(CmpNLT(v, Float4(0.0f))) &
1657                         SignMask(CmpNLT(s, Float4(0.0f)));
1658
1659                 for(unsigned int q = 0; q < state.multiSample; q++)
1660                 {
1661                         cMask[q] &= kill;
1662                 }
1663         }
1664
1665         void PixelPipeline::TEXKILL(Int cMask[4], Vector4s &src)
1666         {
1667                 Short4 test = src.x | src.y | src.z;
1668                 Int kill = SignMask(Pack(test, test)) ^ 0x0000000F;
1669
1670                 for(unsigned int q = 0; q < state.multiSample; q++)
1671                 {
1672                         cMask[q] &= kill;
1673                 }
1674         }
1675
1676         void PixelPipeline::TEX(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int sampler, bool project)
1677         {
1678                 sampleTexture(dst, sampler, u, v, s, s, project);
1679         }
1680
1681         void PixelPipeline::TEXLD(Vector4s &dst, Vector4s &src, int sampler, bool project)
1682         {
1683                 Float4 u = Float4(src.x) * Float4(1.0f / 0x0FFE);
1684                 Float4 v = Float4(src.y) * Float4(1.0f / 0x0FFE);
1685                 Float4 s = Float4(src.z) * Float4(1.0f / 0x0FFE);
1686
1687                 sampleTexture(dst, sampler, u, v, s, s, project);
1688         }
1689
1690         void PixelPipeline::TEXBEM(Vector4s &dst, Vector4s &src, Float4 &u, Float4 &v, Float4 &s, int stage)
1691         {
1692                 Float4 du = Float4(src.x) * Float4(1.0f / 0x0FFE);
1693                 Float4 dv = Float4(src.y) * Float4(1.0f / 0x0FFE);
1694
1695                 Float4 du2 = du;
1696                 Float4 dv2 = dv;
1697
1698                 du *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][0]));
1699                 dv2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][0]));
1700                 du += dv2;
1701                 dv *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][1]));
1702                 du2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][1]));
1703                 dv += du2;
1704
1705                 Float4 u_ = u + du;
1706                 Float4 v_ = v + dv;
1707
1708                 sampleTexture(dst, stage, u_, v_, s, s);
1709         }
1710
1711         void PixelPipeline::TEXBEML(Vector4s &dst, Vector4s &src, Float4 &u, Float4 &v, Float4 &s, int stage)
1712         {
1713                 Float4 du = Float4(src.x) * Float4(1.0f / 0x0FFE);
1714                 Float4 dv = Float4(src.y) * Float4(1.0f / 0x0FFE);
1715
1716                 Float4 du2 = du;
1717                 Float4 dv2 = dv;
1718
1719                 du *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][0]));
1720                 dv2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][0]));
1721                 du += dv2;
1722                 dv *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][1]));
1723                 du2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][1]));
1724                 dv += du2;
1725
1726                 Float4 u_ = u + du;
1727                 Float4 v_ = v + dv;
1728
1729                 sampleTexture(dst, stage, u_, v_, s, s);
1730
1731                 Short4 L;
1732
1733                 L = src.z;
1734                 L = MulHigh(L, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].luminanceScale4)));
1735                 L = L << 4;
1736                 L = AddSat(L, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].luminanceOffset4)));
1737                 L = Max(L, Short4(0x0000, 0x0000, 0x0000, 0x0000));
1738                 L = Min(L, Short4(0x1000));
1739
1740                 dst.x = MulHigh(dst.x, L); dst.x = dst.x << 4;
1741                 dst.y = MulHigh(dst.y, L); dst.y = dst.y << 4;
1742                 dst.z = MulHigh(dst.z, L); dst.z = dst.z << 4;
1743         }
1744
1745         void PixelPipeline::TEXREG2AR(Vector4s &dst, Vector4s &src0, int stage)
1746         {
1747                 Float4 u = Float4(src0.w) * Float4(1.0f / 0x0FFE);
1748                 Float4 v = Float4(src0.x) * Float4(1.0f / 0x0FFE);
1749                 Float4 s = Float4(src0.z) * Float4(1.0f / 0x0FFE);
1750
1751                 sampleTexture(dst, stage, u, v, s, s);
1752         }
1753
1754         void PixelPipeline::TEXREG2GB(Vector4s &dst, Vector4s &src0, int stage)
1755         {
1756                 Float4 u = Float4(src0.y) * Float4(1.0f / 0x0FFE);
1757                 Float4 v = Float4(src0.z) * Float4(1.0f / 0x0FFE);
1758                 Float4 s = v;
1759
1760                 sampleTexture(dst, stage, u, v, s, s);
1761         }
1762
1763         void PixelPipeline::TEXREG2RGB(Vector4s &dst, Vector4s &src0, int stage)
1764         {
1765                 Float4 u = Float4(src0.x) * Float4(1.0f / 0x0FFE);
1766                 Float4 v = Float4(src0.y) * Float4(1.0f / 0x0FFE);
1767                 Float4 s = Float4(src0.z) * Float4(1.0f / 0x0FFE);
1768
1769                 sampleTexture(dst, stage, u, v, s, s);
1770         }
1771
1772         void PixelPipeline::TEXM3X2DEPTH(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, Vector4s &src, bool signedScaling)
1773         {
1774                 TEXM3X2PAD(u, v, s, src, 1, signedScaling);
1775
1776                 // z / w
1777                 u_ *= Rcp_pp(v_);   // FIXME: Set result to 1.0 when division by zero
1778
1779                 oDepth = u_;
1780         }
1781
1782         void PixelPipeline::TEXM3X2PAD(Float4 &u, Float4 &v, Float4 &s, Vector4s &src0, int component, bool signedScaling)
1783         {
1784                 TEXM3X3PAD(u, v, s, src0, component, signedScaling);
1785         }
1786
1787         void PixelPipeline::TEXM3X2TEX(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0, bool signedScaling)
1788         {
1789                 TEXM3X2PAD(u, v, s, src0, 1, signedScaling);
1790
1791                 w_ = Float4(0.0f);
1792
1793                 sampleTexture(dst, stage, u_, v_, w_, w_);
1794         }
1795
1796         void PixelPipeline::TEXM3X3(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, Vector4s &src0, bool signedScaling)
1797         {
1798                 TEXM3X3PAD(u, v, s, src0, 2, signedScaling);
1799
1800                 dst.x = RoundShort4(u_ * Float4(0x1000));
1801                 dst.y = RoundShort4(v_ * Float4(0x1000));
1802                 dst.z = RoundShort4(w_ * Float4(0x1000));
1803                 dst.w = Short4(0x1000);
1804         }
1805
1806         void PixelPipeline::TEXM3X3PAD(Float4 &u, Float4 &v, Float4 &s, Vector4s &src0, int component, bool signedScaling)
1807         {
1808                 if(component == 0 || previousScaling != signedScaling)   // FIXME: Other source modifiers?
1809                 {
1810                         U = Float4(src0.x);
1811                         V = Float4(src0.y);
1812                         W = Float4(src0.z);
1813
1814                         previousScaling = signedScaling;
1815                 }
1816
1817                 Float4 x = U * u + V * v + W * s;
1818
1819                 x *= Float4(1.0f / 0x1000);
1820
1821                 switch(component)
1822                 {
1823                 case 0: u_ = x; break;
1824                 case 1: v_ = x; break;
1825                 case 2: w_ = x; break;
1826                 default: ASSERT(false);
1827                 }
1828         }
1829
1830         void PixelPipeline::TEXM3X3SPEC(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0, Vector4s &src1)
1831         {
1832                 TEXM3X3PAD(u, v, s, src0, 2, false);
1833
1834                 Float4 E[3];   // Eye vector
1835
1836                 E[0] = Float4(src1.x) * Float4(1.0f / 0x0FFE);
1837                 E[1] = Float4(src1.y) * Float4(1.0f / 0x0FFE);
1838                 E[2] = Float4(src1.z) * Float4(1.0f / 0x0FFE);
1839
1840                 // Reflection
1841                 Float4 u__;
1842                 Float4 v__;
1843                 Float4 w__;
1844
1845                 // (u'', v'', w'') = 2 * (N . E) * N - E * (N . N)
1846                 u__ = u_ * E[0];
1847                 v__ = v_ * E[1];
1848                 w__ = w_ * E[2];
1849                 u__ += v__ + w__;
1850                 u__ += u__;
1851                 v__ = u__;
1852                 w__ = u__;
1853                 u__ *= u_;
1854                 v__ *= v_;
1855                 w__ *= w_;
1856                 u_ *= u_;
1857                 v_ *= v_;
1858                 w_ *= w_;
1859                 u_ += v_ + w_;
1860                 u__ -= E[0] * u_;
1861                 v__ -= E[1] * u_;
1862                 w__ -= E[2] * u_;
1863
1864                 sampleTexture(dst, stage, u__, v__, w__, w__);
1865         }
1866
1867         void PixelPipeline::TEXM3X3TEX(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0, bool signedScaling)
1868         {
1869                 TEXM3X3PAD(u, v, s, src0, 2, signedScaling);
1870
1871                 sampleTexture(dst, stage, u_, v_, w_, w_);
1872         }
1873
1874         void PixelPipeline::TEXM3X3VSPEC(Vector4s &dst, Float4 &x, Float4 &y, Float4 &z, int stage, Vector4s &src0)
1875         {
1876                 TEXM3X3PAD(x, y, z, src0, 2, false);
1877
1878                 Float4 E[3];   // Eye vector
1879
1880                 E[0] = v[2 + stage - 2].w;
1881                 E[1] = v[2 + stage - 1].w;
1882                 E[2] = v[2 + stage - 0].w;
1883
1884                 // Reflection
1885                 Float4 u__;
1886                 Float4 v__;
1887                 Float4 w__;
1888
1889                 // (u'', v'', w'') = 2 * (N . E) * N - E * (N . N)
1890                 u__ = u_ * E[0];
1891                 v__ = v_ * E[1];
1892                 w__ = w_ * E[2];
1893                 u__ += v__ + w__;
1894                 u__ += u__;
1895                 v__ = u__;
1896                 w__ = u__;
1897                 u__ *= u_;
1898                 v__ *= v_;
1899                 w__ *= w_;
1900                 u_ *= u_;
1901                 v_ *= v_;
1902                 w_ *= w_;
1903                 u_ += v_ + w_;
1904                 u__ -= E[0] * u_;
1905                 v__ -= E[1] * u_;
1906                 w__ -= E[2] * u_;
1907
1908                 sampleTexture(dst, stage, u__, v__, w__, w__);
1909         }
1910
1911         void PixelPipeline::TEXDEPTH()
1912         {
1913                 u_ = Float4(rs[5].x);
1914                 v_ = Float4(rs[5].y);
1915
1916                 // z / w
1917                 u_ *= Rcp_pp(v_);   // FIXME: Set result to 1.0 when division by zero
1918
1919                 oDepth = u_;
1920         }
1921
1922         void PixelPipeline::CND(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2)
1923         {
1924                 { Short4 t0; t0 = src0.x; t0 = CmpGT(t0, Short4(0x0800, 0x0800, 0x0800, 0x0800)); Short4 t1; t1 = src1.x; t1 = t1 & t0; t0 = ~t0 & src2.x; t0 = t0 | t1; dst.x = t0; };
1925                 {Short4 t0; t0 = src0.y; t0 = CmpGT(t0, Short4(0x0800, 0x0800, 0x0800, 0x0800)); Short4 t1; t1 = src1.y; t1 = t1 & t0; t0 = ~t0 & src2.y; t0 = t0 | t1; dst.y = t0; };
1926                 {Short4 t0; t0 = src0.z; t0 = CmpGT(t0, Short4(0x0800, 0x0800, 0x0800, 0x0800)); Short4 t1; t1 = src1.z; t1 = t1 & t0; t0 = ~t0 & src2.z; t0 = t0 | t1; dst.z = t0; };
1927                 {Short4 t0; t0 = src0.w; t0 = CmpGT(t0, Short4(0x0800, 0x0800, 0x0800, 0x0800)); Short4 t1; t1 = src1.w; t1 = t1 & t0; t0 = ~t0 & src2.w; t0 = t0 | t1; dst.w = t0; };
1928         }
1929
1930         void PixelPipeline::CMP(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2)
1931         {
1932                 { Short4 t0 = CmpGT(Short4(0x0000, 0x0000, 0x0000, 0x0000), src0.x); Short4 t1; t1 = src2.x; t1 &= t0; t0 = ~t0 & src1.x; t0 |= t1; dst.x = t0; };
1933                 {Short4 t0 = CmpGT(Short4(0x0000, 0x0000, 0x0000, 0x0000), src0.y); Short4 t1; t1 = src2.y; t1 &= t0; t0 = ~t0 & src1.y; t0 |= t1; dst.y = t0; };
1934                 {Short4 t0 = CmpGT(Short4(0x0000, 0x0000, 0x0000, 0x0000), src0.z); Short4 t1; t1 = src2.z; t1 &= t0; t0 = ~t0 & src1.z; t0 |= t1; dst.z = t0; };
1935                 {Short4 t0 = CmpGT(Short4(0x0000, 0x0000, 0x0000, 0x0000), src0.w); Short4 t1; t1 = src2.w; t1 &= t0; t0 = ~t0 & src1.w; t0 |= t1; dst.w = t0; };
1936         }
1937
1938         void PixelPipeline::BEM(Vector4s &dst, Vector4s &src0, Vector4s &src1, int stage)
1939         {
1940                 Short4 t0;
1941                 Short4 t1;
1942
1943                 // dst.x = src0.x + BUMPENVMAT00(stage) * src1.x + BUMPENVMAT10(stage) * src1.y
1944                 t0 = MulHigh(src1.x, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4W[0][0]))); t0 = t0 << 4;   // FIXME: Matrix components range? Overflow hazard.
1945                 t1 = MulHigh(src1.y, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4W[1][0]))); t1 = t1 << 4;   // FIXME: Matrix components range? Overflow hazard.
1946                 t0 = AddSat(t0, t1);
1947                 t0 = AddSat(t0, src0.x);
1948                 dst.x = t0;
1949
1950                 // dst.y = src0.y + BUMPENVMAT01(stage) * src1.x + BUMPENVMAT11(stage) * src1.y
1951                 t0 = MulHigh(src1.x, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4W[0][1]))); t0 = t0 << 4;   // FIXME: Matrix components range? Overflow hazard.
1952                 t1 = MulHigh(src1.y, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4W[1][1]))); t1 = t1 << 4;   // FIXME: Matrix components range? Overflow hazard.
1953                 t0 = AddSat(t0, t1);
1954                 t0 = AddSat(t0, src0.y);
1955                 dst.y = t0;
1956         }
1957 }
1958