OSDN Git Service

Optimize Int2 construction.
[android-x86/external-swiftshader.git] / src / Shader / PixelRoutine.cpp
1 // SwiftShader Software Renderer
2 //
3 // Copyright(c) 2005-2013 TransGaming Inc.
4 //
5 // All rights reserved. No part of this software may be copied, distributed, transmitted,
6 // transcribed, stored in a retrieval system, translated into any human or computer
7 // language by any means, or disclosed to third parties without the explicit written
8 // agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express
9 // or implied, including but not limited to any patent rights, are granted to you.
10 //
11
12 #include "PixelRoutine.hpp"
13
14 #include "Renderer.hpp"
15 #include "QuadRasterizer.hpp"
16 #include "Surface.hpp"
17 #include "Primitive.hpp"
18 #include "CPUID.hpp"
19 #include "SamplerCore.hpp"
20 #include "Constants.hpp"
21 #include "Debug.hpp"
22
23 namespace sw
24 {
25         extern bool complementaryDepthBuffer;
26         extern bool postBlendSRGB;
27         extern bool exactColorRounding;
28         extern bool forceClearRegisters;
29
30         PixelRoutine::Registers::Registers(const PixelShader *shader) :
31                 QuadRasterizer::Registers(),
32                 v(shader && shader->dynamicallyIndexedInput)
33         {
34                 if(!shader || shader->getVersion() < 0x0200 || forceClearRegisters)
35                 {
36                         for(int i = 0; i < 10; i++)
37                         {
38                                 v[i].x = Float4(0.0f);
39                                 v[i].y = Float4(0.0f);
40                                 v[i].z = Float4(0.0f);
41                                 v[i].w = Float4(0.0f);
42                         }
43                 }
44         }
45
// Forwards the pipeline state and the (possibly null) shader to QuadRasterizer.
46         PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader)
47                 : QuadRasterizer(state, shader)
48         {}
49
50         PixelRoutine::~PixelRoutine()
51         {
52                 for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
53                 {
54                         delete sampler[i];
55                 }
56         }
57
// Emits the complete pixel pipeline for one quad: stencil test, depth
// interpolation and (when allowed) early depth test, varying / fog
// interpolation, shader execution, alpha test, late depth test, depth /
// occlusion / color output, and finally the stencil write.
//
// Plain C++ if/for statements in this function branch on `state` and `shader`;
// conditions of type Bool go through If(...).
//
//   rBase   - register set; downcast to PixelRoutine::Registers.
//   cBuffer - render-target buffers, forwarded to rasterOperation().
//   zBuffer - depth buffer, forwarded to depthTest() / writeDepth().
//   sBuffer - stencil buffer, forwarded to stencilTest() / writeStencil().
//   cMask   - one mask per sample (index q < state.multiSample); seeds the depth
//             and stencil masks, which are AND-ed with it again after shading
//             when the shader contains a kill or the alpha test is active.
//   x, y    - quad position; converted to float and offset by Primitive::xQuad /
//             Primitive::yQuad for interpolation.
58         void PixelRoutine::quad(QuadRasterizer::Registers &rBase, Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
59         {
60                 Registers& r = *static_cast<Registers*>(&rBase);
61
62                 #if PERF_PROFILE
63                         Long pipeTime = Ticks();
64                 #endif
65
                   // One SamplerCore per texture image unit; deleted in ~PixelRoutine().
66                 for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
67                 {
68                         sampler[i] = new SamplerCore(r.constants, state.sampler[i]);
69                 }
70
                   // Depth is tested before shading only when depth is not overridden
                   // and the alpha test is inactive.
71                 const bool earlyDepthTest = !state.depthOverride && !state.alphaTestActive();
72
73                 Int zMask[4];   // Depth mask
74                 Int sMask[4];   // Stencil mask
75
                   // Both masks start out as the incoming per-sample mask.
76                 for(unsigned int q = 0; q < state.multiSample; q++)
77                 {
78                         zMask[q] = cMask[q];
79                         sMask[q] = cMask[q];
80                 }
81
82                 for(unsigned int q = 0; q < state.multiSample; q++)
83                 {
84                         stencilTest(r, sBuffer, q, x, sMask[q], cMask[q]);
85                 }
86
87                 Float4 f;
88
89                 Float4 (&z)[4] = r.z;
90                 Float4 &w = r.w;
91                 Float4 &rhw = r.rhw;
92                 Float4 rhwCentroid;
93
                   // Float x of the quad plus the per-primitive xQuad offsets.
94                 Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(r.primitive + OFFSET(Primitive,xQuad), 16);
95
96                 if(interpolateZ())
97                 {
98                         for(unsigned int q = 0; q < state.multiSample; q++)
99                         {
                                    // NOTE: this local shadows the Int &x parameter inside the loop body.
100                                 Float4 x = xxxx;
101                         
102                                 if(state.multiSample > 1)
103                                 {
                                            // Subtract the per-sample entry q of Constants::X.
104                                         x -= *Pointer<Float4>(r.constants + OFFSET(Constants,X) + q * sizeof(float4));
105                                 }
106
107                                 z[q] = interpolate(x, r.Dz[q], z[q], r.primitive + OFFSET(Primitive,z), false, false);
108                         }
109                 }
110
111                 Bool depthPass = false;
112
113                 if(earlyDepthTest)
114                 {
                            // The quad passes if any sample passes its depth test.
115                         for(unsigned int q = 0; q < state.multiSample; q++)
116                         {
117                                 depthPass = depthPass || depthTest(r, zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
118                         }
119                 }
120
                    // Without an early depth test the body below is always entered.
121                 If(depthPass || Bool(!earlyDepthTest))
122                 {
123                         #if PERF_PROFILE
124                                 Long interpTime = Ticks();
125                         #endif
126
127                         Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(r.primitive + OFFSET(Primitive,yQuad), 16);
128
129                         // Centroid locations
130                         Float4 XXXX = Float4(0.0f);
131                         Float4 YYYY = Float4(0.0f);
132
133                         if(state.centroid)
134                         {
                                    // Small non-zero start value; presumably keeps Rcp_pp away from a
                                    // zero denominator when no sample is covered - TODO confirm.
135                                 Float4 WWWW(1.0e-9f);
136
                                    // Sum the table entries selected by each sample's mask value.
137                                 for(unsigned int q = 0; q < state.multiSample; q++)
138                                 {
139                                         XXXX += *Pointer<Float4>(r.constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]);
140                                         YYYY += *Pointer<Float4>(r.constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]);
141                                         WWWW += *Pointer<Float4>(r.constants + OFFSET(Constants,weight) + 16 * cMask[q]);
142                                 }
143
                                    // Normalize by the summed weight, then add the quad position.
144                                 WWWW = Rcp_pp(WWWW);
145                                 XXXX *= WWWW;
146                                 YYYY *= WWWW;
147
148                                 XXXX += xxxx;
149                                 YYYY += yyyy;
150                         }
151
152                         if(interpolateW())
153                         {
154                                 w = interpolate(xxxx, r.Dw, rhw, r.primitive + OFFSET(Primitive,w), false, false);
155                                 rhw = reciprocal(w);
156
157                                 if(state.centroid)
158                                 {
159                                         rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, r.primitive + OFFSET(Primitive,w), false, false));
160                                 }
161                         }
162
                            // Interpolate every enabled component of the 10 interpolants, at the
                            // centroid position when the interpolant requests it.
163                         for(int interpolant = 0; interpolant < 10; interpolant++)
164                         {
165                                 for(int component = 0; component < 4; component++)
166                                 {
167                                         if(state.interpolant[interpolant].component & (1 << component))
168                                         {
169                                                 if(!state.interpolant[interpolant].centroid)
170                                                 {
171                                                         r.v[interpolant][component] = interpolate(xxxx, r.Dv[interpolant][component], rhw, r.primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
172                                                 }
173                                                 else
174                                                 {
175                                                         r.v[interpolant][component] = interpolateCentroid(XXXX, YYYY, rhwCentroid, r.primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
176                                                 }
177                                         }
178                                 }
179
180                                 Float4 rcp;
181
                                    // Optional projection: scale the leading components by the
                                    // reciprocal of y (1), z (2) or w (3).
182                                 switch(state.interpolant[interpolant].project)
183                                 {
184                                 case 0:
185                                         break;
186                                 case 1:
187                                         rcp = reciprocal(r.v[interpolant].y);
188                                         r.v[interpolant].x = r.v[interpolant].x * rcp;
189                                         break;
190                                 case 2:
191                                         rcp = reciprocal(r.v[interpolant].z);
192                                         r.v[interpolant].x = r.v[interpolant].x * rcp;
193                                         r.v[interpolant].y = r.v[interpolant].y * rcp;
194                                         break;
195                                 case 3:
196                                         rcp = reciprocal(r.v[interpolant].w);
197                                         r.v[interpolant].x = r.v[interpolant].x * rcp;
198                                         r.v[interpolant].y = r.v[interpolant].y * rcp;
199                                         r.v[interpolant].z = r.v[interpolant].z * rcp;
200                                         break;
201                                 }
202                         }
203
204                         if(state.fog.component)
205                         {
206                                 f = interpolate(xxxx, r.Df, rhw, r.primitive + OFFSET(Primitive,f), state.fog.flat & 0x01, state.perspective);
207                         }
208
209                         setBuiltins(r, x, y, z, w);
210
211                         #if PERF_PROFILE
212                                 r.cycles[PERF_INTERP] += Ticks() - interpTime;
213                         #endif
214
215                         Bool alphaPass = true;
216
217                         if(colorUsed())
218                         {
219                                 #if PERF_PROFILE
220                                         Long shaderTime = Ticks();
221                                 #endif
222
223                                 applyShader(r, cMask);
224
225                                 #if PERF_PROFILE
226                                         r.cycles[PERF_SHADER] += Ticks() - shaderTime;
227                                 #endif
228
229                                 alphaPass = alphaTest(r, cMask);
230
                                    // Re-apply cMask to the depth and stencil masks when a shader
                                    // kill or the alpha test is in play.
231                                 if((shader && shader->containsKill()) || state.alphaTestActive())
232                                 {
233                                         for(unsigned int q = 0; q < state.multiSample; q++)
234                                         {
235                                                 zMask[q] &= cMask[q];
236                                                 sMask[q] &= cMask[q];
237                                         }
238                                 }
239                         }
240
241                         If(alphaPass)
242                         {
                                    // Late depth test, for the case where it was not done before shading.
243                                 if(!earlyDepthTest)
244                                 {
245                                         for(unsigned int q = 0; q < state.multiSample; q++)
246                                         {
247                                                 depthPass = depthPass || depthTest(r, zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
248                                         }
249                                 }
250
251                                 #if PERF_PROFILE
252                                         Long ropTime = Ticks();
253                                 #endif
254
                                    // With an early depth test, depthPass was already required to get here.
255                                 If(depthPass || Bool(earlyDepthTest))
256                                 {
257                                         for(unsigned int q = 0; q < state.multiSample; q++)
258                                         {
259                                                 if(state.multiSampleMask & (1 << q))
260                                                 {
261                                                         writeDepth(r, zBuffer, q, x, z[q], zMask[q]);
262
263                                                         if(state.occlusionEnabled)
264                                                         {
                                                                    // Adds the occlusionCount table entry indexed by the
                                                                    // combined depth and stencil mask.
265                                                                 r.occlusion += *Pointer<UInt>(r.constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q]));
266                                                         }
267                                                 }
268                                         }
269
270                                         if(colorUsed())
271                                         {
272                                                 #if PERF_PROFILE
273                                                         AddAtomic(Pointer<Long>(&profiler.ropOperations), 4);
274                                                 #endif
275
276                                                 rasterOperation(r, f, cBuffer, x, sMask, zMask, cMask);
277                                         }
278                                 }
279
280                                 #if PERF_PROFILE
281                                         r.cycles[PERF_ROP] += Ticks() - ropTime;
282                                 #endif
283                         }
284                 }
285
                    // The stencil write sits outside the depth/alpha conditionals and is
                    // emitted for every sample enabled in state.multiSampleMask.
286                 for(unsigned int q = 0; q < state.multiSample; q++)
287                 {
288                         if(state.multiSampleMask & (1 << q))
289                         {
290                                 writeStencil(r, sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
291                         }
292                 }
293
294                 #if PERF_PROFILE
295                         r.cycles[PERF_PIPE] += Ticks() - pipeTime;
296                 #endif
297         }
298
299         Float4 PixelRoutine::interpolateCentroid(Float4 &x, Float4 &y, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective)
300         {
301                 Float4 interpolant = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,C), 16);
302
303                 if(!flat)
304                 {
305                         interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,A), 16) +
306                                        y * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,B), 16);
307
308                         if(perspective)
309                         {
310                                 interpolant *= rhw;
311                         }
312                 }
313
314                 return interpolant;
315         }
316
// Performs the stencil test for sample q and stores the resulting mask in
// sMask. Returns immediately, leaving sMask untouched, when stencil is inactive.
//
//   sBuffer - base of the stencil buffer for this quad row.
//   q       - sample index; samples beyond 0 are offset by DrawData::stencilSliceB.
//   x       - quad x position.
//   sMask   - output: SignMask of the comparison result, AND-ed with cMask.
//   cMask   - mask for this sample.
317         void PixelRoutine::stencilTest(Registers &r, Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask)
318         {
319                 if(!state.stencilActive)
320                 {
321                         return;
322                 }
323
324                 // (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
325
                    // NOTE(review): 2 bytes per x step - presumably the stencil buffer stores
                    // two rows of a quad side by side; confirm against the surface layout.
326                 Pointer<Byte> buffer = sBuffer + 2 * x;
327
328                 if(q > 0)
329                 {
330                         buffer += q * *Pointer<Int>(r.data + OFFSET(DrawData,stencilSliceB));
331                 }
332
                    // Load one UInt (32 bits) of stencil data and view it as Byte8.
333                 Byte8 value = As<Byte8>(Long1(*Pointer<UInt>(buffer)));
                    // Keep an unmasked copy for the counter-clockwise test.
334                 Byte8 valueCCW = value;
335
336                 if(!state.noStencilMask)
337                 {
338                         value &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[0].testMaskQ));
339                 }
340
341                 stencilTest(r, value, state.stencilCompareMode, false);
342
343                 if(state.twoSidedStencil)
344                 {
345                         if(!state.noStencilMaskCCW)
346                         {
347                                 valueCCW &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[1].testMaskQ));
348                         }
349
350                         stencilTest(r, valueCCW, state.stencilCompareModeCCW, true);
351
                            // Combine both results through the primitive's clockwiseMask and
                            // invClockwiseMask.
352                         value &= *Pointer<Byte8>(r.primitive + OFFSET(Primitive,clockwiseMask));
353                         valueCCW &= *Pointer<Byte8>(r.primitive + OFFSET(Primitive,invClockwiseMask));
354                         value |= valueCCW;
355                 }
356
357                 sMask = SignMask(value) & cMask;
358         }
359
// Replaces `value` (masked stencil buffer bytes) with the result of comparing
// it against the stencil reference, according to stencilCompareMode.
// STENCIL_ALWAYS yields all ones and STENCIL_NEVER all zeros; the other modes
// yield whatever CmpEQ / CmpGT produce (the caller reads the result through
// SignMask).
//
//   CCW - selects DrawData::stencil[1] (true) or stencil[0] (false) as the
//         source of the reference values.
//
// NOTE(review): adding 0x80 to each byte before the signed CmpGT presumably
// turns it into an unsigned comparison, assuming referenceMaskedSignedQ holds
// the reference with the same bias - confirm where DrawData is filled in.
360         void PixelRoutine::stencilTest(Registers &r, Byte8 &value, StencilCompareMode stencilCompareMode, bool CCW)
361         {
                    // Scratch value, used only by the LESSEQUAL and GREATER cases.
362                 Byte8 equal;
363
364                 switch(stencilCompareMode)
365                 {
366                 case STENCIL_ALWAYS:
367                         value = Byte8(0xFFFFFFFFFFFFFFFF);
368                         break;
369                 case STENCIL_NEVER:
370                         value = Byte8(0x0000000000000000);
371                         break;
372                 case STENCIL_LESS:                      // a < b ~ b > a
373                         value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
374                         value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
375                         break;
376                 case STENCIL_EQUAL:
377                         value = CmpEQ(value, *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
378                         break;
379                 case STENCIL_NOTEQUAL:          // a != b ~ !(a == b)
380                         value = CmpEQ(value, *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
                            // XOR with all ones inverts the equality result.
381                         value ^= Byte8(0xFFFFFFFFFFFFFFFF);
382                         break;
383                 case STENCIL_LESSEQUAL: // a <= b ~ (b > a) || (a == b)
                            // Equality is computed on the unbiased value, before 0x80 is added.
384                         equal = value;
385                         equal = CmpEQ(equal, *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
386                         value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
387                         value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
388                         value |= equal;
389                         break;
390                 case STENCIL_GREATER:           // a > b
                            // Here `equal` temporarily holds the signed reference, not an equality mask.
391                         equal = *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ));
392                         value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
393                         equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
394                         value = equal;
395                         break;
396                 case STENCIL_GREATEREQUAL:      // a >= b ~ !(a < b) ~ !(b > a)
397                         value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
398                         value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
399                         value ^= Byte8(0xFFFFFFFFFFFFFFFF);
400                         break;
401                 default:
402                         ASSERT(false);
403                 }
404         }
405
// Performs the depth comparison for sample q. Writes the resulting mask to
// zMask and returns `zMask != 0`. When depth testing is inactive it returns
// true immediately and leaves zMask untouched.
//
//   zBuffer - base of the depth buffer for this quad row.
//   q       - sample index; samples beyond 0 are offset by DrawData::depthSliceB.
//   x       - quad x position.
//   z       - interpolated depth; replaced by r.oDepth (or 1 - r.oDepth with a
//             complementary depth buffer) when the shader overrides depth.
//   sMask   - stencil mask; AND-ed into zMask when stencil is active.
//   zMask   - output depth mask.
//   cMask   - mask for this sample.
406         Bool PixelRoutine::depthTest(Registers &r, Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask)
407         {
408                 if(!state.depthTestActive)
409                 {
410                         return true;
411                 }
412
413                 Float4 Z = z;
414
415                 if(shader && shader->depthOverride())
416                 {
417                         if(complementaryDepthBuffer)
418                         {
419                                 Z = Float4(1.0f) - r.oDepth;
420                         }
421                         else
422                         {
423                                 Z = r.oDepth;
424                         }
425                 }
426
427                 Pointer<Byte> buffer;
                    // Only assigned, and only read, on the non-quad-layout path.
428                 Int pitch;
429
430                 if(!state.quadLayoutDepthBuffer)
431                 {
432                         buffer = zBuffer + 4 * x;
433                         pitch = *Pointer<Int>(r.data + OFFSET(DrawData,depthPitchB));
434                 }
435                 else
436                 {
437                         buffer = zBuffer + 8 * x;
438                 }
439
440                 if(q > 0)
441                 {
442                         buffer += q * *Pointer<Int>(r.data + OFFSET(DrawData,depthSliceB));
443                 }
444
445                 Float4 zValue;
446
                    // NOTE(review): as written this condition is also true for DEPTH_ALWAYS,
                    // whose comparison below never reads zValue.
447                 if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
448                 {
449                         if(!state.quadLayoutDepthBuffer)
450                         {
451                                 // FIXME: Properly optimizes?
                                    // The second load starts 8 bytes before the row at buffer + pitch,
                                    // presumably so that its z/w lanes hold the first two floats of
                                    // that row - TODO confirm swizzle-assignment semantics.
452                                 zValue.xy = *Pointer<Float4>(buffer);
453                                 zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
454                         }
455                         else
456                         {
457                                 zValue = *Pointer<Float4>(buffer, 16);
458                         }
459                 }
460
461                 Int4 zTest;
462
                    // Each ordered comparison uses a different operator depending on
                    // complementaryDepthBuffer.
463                 switch(state.depthCompareMode)
464                 {
465                 case DEPTH_ALWAYS:
466                         // Optimized
467                         break;
468                 case DEPTH_NEVER:
469                         // Optimized
470                         break;
471                 case DEPTH_EQUAL:
472                         zTest = CmpEQ(zValue, Z);
473                         break;
474                 case DEPTH_NOTEQUAL:
475                         zTest = CmpNEQ(zValue, Z);
476                         break;
477                 case DEPTH_LESS:
478                         if(complementaryDepthBuffer)
479                         {
480                                 zTest = CmpLT(zValue, Z);
481                         }
482                         else
483                         {
484                                 zTest = CmpNLE(zValue, Z);
485                         }
486                         break;
487                 case DEPTH_GREATEREQUAL:
488                         if(complementaryDepthBuffer)
489                         {
490                                 zTest = CmpNLT(zValue, Z);
491                         }
492                         else
493                         {
494                                 zTest = CmpLE(zValue, Z);
495                         }
496                         break;
497                 case DEPTH_LESSEQUAL:
498                         if(complementaryDepthBuffer)
499                         {
500                                 zTest = CmpLE(zValue, Z);
501                         }
502                         else
503                         {
504                                 zTest = CmpNLT(zValue, Z);
505                         }
506                         break;
507                 case DEPTH_GREATER:
508                         if(complementaryDepthBuffer)
509                         {
510                                 zTest = CmpNLE(zValue, Z);
511                         }
512                         else
513                         {
514                                 zTest = CmpLT(zValue, Z);
515                         }
516                         break;
517                 default:
518                         ASSERT(false);
519                 }
520
                    // ALWAYS and NEVER set the mask directly, without reading zTest.
521                 switch(state.depthCompareMode)
522                 {
523                 case DEPTH_ALWAYS:
524                         zMask = cMask;
525                         break;
526                 case DEPTH_NEVER:
527                         zMask = 0x0;
528                         break;
529                 default:
530                         zMask = SignMask(zTest) & cMask;
531                         break;
532                 }
533                 
534                 if(state.stencilActive)
535                 {
536                         zMask &= sMask;
537                 }
538
539                 return zMask != 0;
540         }
541
// Computes aMask by comparing `alpha` against DrawData::factor.alphaReference4
// according to state.alphaCompareMode. ALPHA_ALWAYS yields 0xF and ALPHA_NEVER
// yields 0x0. The other modes pack the 16-bit comparison result together with
// zeros and take its SignMask - presumably one bit per pixel in the low four
// bits; confirm against Pack / SignMask.
542         void PixelRoutine::alphaTest(Registers &r, Int &aMask, Short4 &alpha)
543         {
544                 Short4 cmp;
                    // Used only by ALPHA_GREATEREQUAL.
545                 Short4 equal;
546
547                 switch(state.alphaCompareMode)
548                 {
549                 case ALPHA_ALWAYS:
550                         aMask = 0xF;
551                         break;
552                 case ALPHA_NEVER:
553                         aMask = 0x0;
554                         break;
555                 case ALPHA_EQUAL:
556                         cmp = CmpEQ(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)));
557                         aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
558                         break;
559                 case ALPHA_NOTEQUAL:            // a != b ~ !(a == b)
                            // XOR with 0xFFFF in every lane inverts the equality result.
560                         cmp = CmpEQ(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4((short)0xFFFF, (short)0xFFFF, (short)0xFFFF, (short)0xFFFF);   // FIXME
561                         aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
562                         break;
563                 case ALPHA_LESS:                        // a < b ~ b > a
564                         cmp = CmpGT(*Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)), alpha);
565                         aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
566                         break;
567                 case ALPHA_GREATEREQUAL:        // a >= b ~ (a > b) || (a == b) ~ !(b > a)   // TODO: Approximate
568                         equal = CmpEQ(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)));
569                         cmp = CmpGT(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)));
570                         cmp |= equal;
571                         aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
572                         break;
573                 case ALPHA_LESSEQUAL:           // a <= b ~ !(a > b)
574                         cmp = CmpGT(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4((short)0xFFFF, (short)0xFFFF, (short)0xFFFF, (short)0xFFFF);   // FIXME
575                         aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
576                         break;
577                 case ALPHA_GREATER:                     // a > b
578                         cmp = CmpGT(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)));
579                         aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
580                         break;
581                 default:
582                         ASSERT(false);
583                 }
584         }
585
586         void PixelRoutine::alphaToCoverage(Registers &r, Int cMask[4], Float4 &alpha)
587         {
588                 Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(r.data + OFFSET(DrawData,a2c0)));
589                 Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(r.data + OFFSET(DrawData,a2c1)));
590                 Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(r.data + OFFSET(DrawData,a2c2)));
591                 Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(r.data + OFFSET(DrawData,a2c3)));
592
593                 Int aMask0 = SignMask(coverage0);
594                 Int aMask1 = SignMask(coverage1);
595                 Int aMask2 = SignMask(coverage2);
596                 Int aMask3 = SignMask(coverage3);
597
598                 cMask[0] &= aMask0;
599                 cMask[1] &= aMask1;
600                 cMask[2] &= aMask2;
601                 cMask[3] &= aMask3;
602         }
603
604         void PixelRoutine::fogBlend(Registers &r, Vector4f &c0, Float4 &fog)
605         {
606                 if(!state.fogActive)
607                 {
608                         return;
609                 }
610
611                 if(state.pixelFogMode != FOG_NONE)
612                 {
613                         pixelFog(r, fog);
614
615                         fog = Min(fog, Float4(1.0f));
616                         fog = Max(fog, Float4(0.0f));
617                 }
618
619                 c0.x -= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[0]));
620                 c0.y -= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[1]));
621                 c0.z -= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[2]));
622
623                 c0.x *= fog;
624                 c0.y *= fog;
625                 c0.z *= fog;
626
627                 c0.x += *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[0]));
628                 c0.y += *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[1]));
629                 c0.z += *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[2]));
630         }
631
632         void PixelRoutine::pixelFog(Registers &r, Float4 &visibility)
633         {
634                 Float4 &zw = visibility;
635
636                 if(state.pixelFogMode != FOG_NONE)
637                 {
638                         if(state.wBasedFog)
639                         {
640                                 zw = r.rhw;
641                         }
642                         else
643                         {
644                                 if(complementaryDepthBuffer)
645                                 {
646                                         zw = Float4(1.0f) - r.z[0];
647                                 }
648                                 else
649                                 {
650                                         zw = r.z[0];
651                                 }
652                         }
653                 }
654
655                 switch(state.pixelFogMode)
656                 {
657                 case FOG_NONE:
658                         break;
659                 case FOG_LINEAR:
660                         zw *= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.scale));
661                         zw += *Pointer<Float4>(r.data + OFFSET(DrawData,fog.offset));
662                         break;
663                 case FOG_EXP:
664                         zw *= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.densityE));
665                         zw = exponential2(zw, true);
666                         break;
667                 case FOG_EXP2:
668                         zw *= zw;
669                         zw *= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.density2E));
670                         zw = exponential2(zw, true);
671                         break;
672                 default:
673                         ASSERT(false);
674                 }
675         }
676
677         void PixelRoutine::writeDepth(Registers &r, Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask)
678         {
                    // Stores four depth values (one Float4) into slice 'q' of the depth buffer at
                    // position 'x'. The values are merged lane by lane with the current buffer
                    // contents using the maskD4X / invMaskD4X constant tables indexed by 'zMask'.
                    // Returns without doing anything when depth writes are disabled.
679                 if(!state.depthWriteEnable)
680                 {
681                         return;
682                 }
683
                    // Value to store: the incoming 'z', unless the shader overrides depth below.
684                 Float4 Z = z;
685
686                 if(shader && shader->depthOverride())
687                 {
                            // The shader-written depth (r.oDepth) replaces 'z'; it is stored as
                            // 1 - oDepth when the complementary depth buffer is in use.
688                         if(complementaryDepthBuffer)
689                         {
690                                 Z = Float4(1.0f) - r.oDepth;
691                         }
692                         else
693                         {
694                                 Z = r.oDepth;
695                         }
696                 }
697
698                 Pointer<Byte> buffer;
699                 Int pitch;
700
                    // Non-quad layout: 4 bytes per unit of x, and the pitch (DrawData::depthPitchB)
                    // is loaded because the second pair of values lives at 'buffer + pitch'.
                    // Quad layout: 8 bytes per unit of x; 'pitch' is never assigned and never read
                    // on that path.
701                 if(!state.quadLayoutDepthBuffer)
702                 {       
703                         buffer = zBuffer + 4 * x;
704                         pitch = *Pointer<Int>(r.data + OFFSET(DrawData,depthPitchB));
705                 }
706                 else
707                 {       
708                         buffer = zBuffer + 8 * x;
709                 }
710
                    // Slice 0 needs no offset; other slices are q * depthSliceB bytes further on.
711                 if(q > 0)
712                 {
713                         buffer += q * *Pointer<Int>(r.data + OFFSET(DrawData,depthSliceB));
714                 }
715
716                 Float4 zValue;
717
                    // Load the current buffer contents so that masked-out lanes can be preserved.
                    // NOTE(review): depthWriteEnable is known to be true here (early return above),
                    // so the second clause is always false and the test reduces to
                    // depthCompareMode != DEPTH_NEVER. With DEPTH_NEVER 'zValue' is not loaded
                    // before it is masked below - confirm that path is never reached in practice.
718                 if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
719                 {
720                         if(!state.quadLayoutDepthBuffer)
721                         {
                                    // xy come from a Float4 load at 'buffer'; zw come from a Float4 load
                                    // at 'buffer + pitch - 8'. Presumably the -8 lines up the two floats
                                    // at 'buffer + pitch' with the zw lanes - matches the stores below.
722                                 // FIXME: Properly optimizes?
723                                 zValue.xy = *Pointer<Float4>(buffer);
724                                 zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
725                         }
726                         else
727                         {
728                                 zValue = *Pointer<Float4>(buffer, 16);
729                         }
730                 }
731
                    // Bitwise select: new depth where maskD4X[zMask] is set, existing depth where
                    // invMaskD4X[zMask] is set. Each table entry is 16 bytes (zMask * 16).
732                 Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X) + zMask * 16, 16));
733                 zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16));
734                 Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
735
                    // Write back: two values at 'buffer' and two at 'buffer + pitch' for the
                    // non-quad layout, or all four with a single store for the quad layout.
736                 if(!state.quadLayoutDepthBuffer)
737                 {
738                         // FIXME: Properly optimizes?
739                         *Pointer<Float2>(buffer) = Float2(Z.xy);
740                         *Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
741                 }
742                 else
743                 {
744                         *Pointer<Float4>(buffer, 16) = Z;
745                 }
746         }
747
748         void PixelRoutine::writeStencil(Registers &r, Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask)
749         {
                    // Updates the stencil values at position 'x' in slice 'q' of the stencil buffer.
                    // The new value is chosen per lane from the pass / z-fail / fail operations
                    // (via sMask and zMask), filtered through the stencil write mask, and finally
                    // merged with the existing buffer contents using cMask.
                    // Returns early when stencilling is inactive, when every operation is KEEP,
                    // or when stencil writes are fully masked.
750                 if(!state.stencilActive)
751                 {
752                         return;
753                 }
754
                    // All three operations KEEP: nothing can change, unless two-sided stencil is on
                    // and at least one CCW operation is not KEEP.
755                 if(state.stencilPassOperation == OPERATION_KEEP && state.stencilZFailOperation == OPERATION_KEEP && state.stencilFailOperation == OPERATION_KEEP)
756                 {
757                         if(!state.twoSidedStencil || (state.stencilPassOperationCCW == OPERATION_KEEP && state.stencilZFailOperationCCW == OPERATION_KEEP && state.stencilFailOperationCCW == OPERATION_KEEP))
758                         {
759                                 return;
760                         }
761                 }
762
                    // Writes masked for the non-CCW state, and either single-sided or also masked
                    // for the CCW state: nothing to write.
763                 if(state.stencilWriteMasked && (!state.twoSidedStencil || state.stencilWriteMaskedCCW))
764                 {
765                         return;
766                 }
767
                    // 2 bytes per unit of x; slices other than 0 are offset by q * stencilSliceB.
768                 Pointer<Byte> buffer = sBuffer + 2 * x;
769
770                 if(q > 0)
771                 {
772                         buffer += q * *Pointer<Int>(r.data + OFFSET(DrawData,stencilSliceB));
773                 }
774
                    // Read 32 bits of stencil data and carry them in a Byte8.
775                 Byte8 bufferValue = As<Byte8>(Long1(*Pointer<UInt>(buffer)));
776         
                    // Result of the stencil operations for the non-CCW state (CCW = false).
777                 Byte8 newValue;
778                 stencilOperation(r, newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask);
779
                    // Apply the write mask: new bits where writeMaskQ is set, old bits elsewhere.
780                 if(!state.noStencilWriteMask)
781                 {
782                         Byte8 maskedValue = bufferValue;
783                         newValue &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[0].writeMaskQ));
784                         maskedValue &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[0].invWriteMaskQ));
785                         newValue |= maskedValue;
786                 }
787
788                 if(state.twoSidedStencil)
789                 {
                            // Same computation with the CCW operations and stencil[1] masks.
790                         Byte8 newValueCCW;
791
792                         stencilOperation(r, newValueCCW, bufferValue, state.stencilPassOperationCCW, state.stencilZFailOperationCCW, state.stencilFailOperationCCW, true, zMask, sMask);
793
794                         if(!state.noStencilWriteMaskCCW)
795                         {
796                                 Byte8 maskedValue = bufferValue;
797                                 newValueCCW &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[1].writeMaskQ));
798                                 maskedValue &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[1].invWriteMaskQ));
799                                 newValueCCW |= maskedValue;
800                         }
801
                            // Pick one of the two results using the primitive's clockwiseMask /
                            // invClockwiseMask.
802                         newValue &= *Pointer<Byte8>(r.primitive + OFFSET(Primitive,clockwiseMask));
803                         newValueCCW &= *Pointer<Byte8>(r.primitive + OFFSET(Primitive,invClockwiseMask));
804                         newValue |= newValueCCW;
805                 }
806
                    // Final select by cMask (8-byte table entries): new value where maskB4Q[cMask]
                    // is set, untouched buffer contents where invMaskB4Q[cMask] is set.
807                 newValue &= *Pointer<Byte8>(r.constants + OFFSET(Constants,maskB4Q) + 8 * cMask);
808                 bufferValue &= *Pointer<Byte8>(r.constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask);
809                 newValue |= bufferValue;
810
                    // Only 32 bits are written back, matching the 32-bit read above.
811                 *Pointer<UInt>(buffer) = UInt(As<Long>(newValue));
812         }
813
814         void PixelRoutine::stencilOperation(Registers &r, Byte8 &newValue, Byte8 &bufferValue, StencilOperation stencilPassOperation, StencilOperation stencilZFailOperation, StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask)
815         {
                    // Computes the stencil result for each of the pass, z-fail and fail operations
                    // and combines them into 'newValue' using zMask and sMask. Operations that are
                    // identical to another one are not evaluated a second time.
                    // 'CCW' selects which stencil state the single-operation overload reads from.

                    // 'pass' aliases the output, so the pass result is the default outcome.
816                 Byte8 &pass = newValue;
817                 Byte8 fail;
818                 Byte8 zFail;
819
820                 stencilOperation(r, pass, bufferValue, stencilPassOperation, CCW);
821
                    // The z-fail result is only needed when it can differ from the pass result.
822                 if(stencilZFailOperation != stencilPassOperation)
823                 {
824                         stencilOperation(r, zFail, bufferValue, stencilZFailOperation, CCW);
825                 }
826
                    // True unless the fail operation equals both of the other operations,
                    // i.e. false only when all three operations are the same.
827                 if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
828                 {
829                         stencilOperation(r, fail, bufferValue, stencilFailOperation, CCW);
830                 }
831
                    // Same condition as above: when all three operations are equal there is
                    // nothing to select between and 'pass' is returned as is.
832                 if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
833                 {
                            // Depth select: pass result where maskB4Q[zMask] is set, z-fail result
                            // where invMaskB4Q[zMask] is set.
834                         if(state.depthTestActive && stencilZFailOperation != stencilPassOperation)   // zMask valid and values not the same
835                         {
836                                 pass &= *Pointer<Byte8>(r.constants + OFFSET(Constants,maskB4Q) + 8 * zMask);
837                                 zFail &= *Pointer<Byte8>(r.constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask);
838                                 pass |= zFail;
839                         }
840
                            // Stencil select: the value so far where maskB4Q[sMask] is set, the
                            // fail result where invMaskB4Q[sMask] is set.
841                         pass &= *Pointer<Byte8>(r.constants + OFFSET(Constants,maskB4Q) + 8 * sMask);
842                         fail &= *Pointer<Byte8>(r.constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask);
843                         pass |= fail;
844                 }
845         }
846
847         void PixelRoutine::stencilOperation(Registers &r, Byte8 &output, Byte8 &bufferValue, StencilOperation operation, bool CCW)
848         {
                    // Applies a single stencil operation to the eight bytes of 'bufferValue' and
                    // stores the result in 'output'. 'CCW' is used as the index into
                    // DrawData::stencil[] for the reference value (false -> 0, true -> 1).
                    // Asserts on an unknown operation.
849                 switch(operation)
850                 {
851                 case OPERATION_KEEP:
852                         output = bufferValue;
853                         break;
854                 case OPERATION_ZERO:
855                         output = Byte8(0x0000000000000000);
856                         break;
857                 case OPERATION_REPLACE:
                            // Replace with the reference value of the selected stencil state.
858                         output = *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceQ));
859                         break;
860                 case OPERATION_INCRSAT:
                            // Saturating increment of every byte.
861                         output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
862                         break;
863                 case OPERATION_DECRSAT:
                            // Saturating decrement of every byte.
864                         output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
865                         break;
866                 case OPERATION_INVERT:
                            // Flip all bits.
867                         output = bufferValue ^ Byte8(0xFFFFFFFFFFFFFFFF);
868                         break;
869                 case OPERATION_INCR:
                            // Plain (non-saturating) increment; presumably wraps on overflow.
870                         output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
871                         break;
872                 case OPERATION_DECR:
                            // Plain (non-saturating) decrement; presumably wraps on underflow.
873                         output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
874                         break;
875                 default:
876                         ASSERT(false);
877                 }
878         }
879
880         void PixelRoutine::blendFactor(Registers &r, const Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorActive)
881         {
                    // Fills the x, y and z components of 'blendFactor' with the color blend factor
                    // selected by 'blendFactorActive'. 'current' is the incoming color and 'pixel'
                    // the color read from the render target; the w component is never written here.
                    // Inverse factors are formed as 0xFFFF minus the value.
                    // NOTE(review): 'blendFactor' is a const reference yet its components are
                    // assigned below, so the component type must allow assignment through a const
                    // object - confirm against the Vector4s / Short4 declarations.
882                 switch(blendFactorActive)
883                 {
                    // ZERO and ONE leave 'blendFactor' untouched; the visible caller (alphaBlend)
                    // skips the multiplication for these two factors.
884                 case BLEND_ZERO:
885                         // Optimized
886                         break;
887                 case BLEND_ONE:
888                         // Optimized
889                         break;
890                 case BLEND_SOURCE:
891                         blendFactor.x = current.x;
892                         blendFactor.y = current.y;
893                         blendFactor.z = current.z;
894                         break;
895                 case BLEND_INVSOURCE:
896                         blendFactor.x = Short4(0xFFFFu) - current.x;
897                         blendFactor.y = Short4(0xFFFFu) - current.y;
898                         blendFactor.z = Short4(0xFFFFu) - current.z;
899                         break;
900                 case BLEND_DEST:
901                         blendFactor.x = pixel.x;
902                         blendFactor.y = pixel.y;
903                         blendFactor.z = pixel.z;
904                         break;
905                 case BLEND_INVDEST:
906                         blendFactor.x = Short4(0xFFFFu) - pixel.x;
907                         blendFactor.y = Short4(0xFFFFu) - pixel.y;
908                         blendFactor.z = Short4(0xFFFFu) - pixel.z;
909                         break;
                    // The alpha-based factors broadcast the w component to x, y and z.
910                 case BLEND_SOURCEALPHA:
911                         blendFactor.x = current.w;
912                         blendFactor.y = current.w;
913                         blendFactor.z = current.w;
914                         break;
915                 case BLEND_INVSOURCEALPHA:
916                         blendFactor.x = Short4(0xFFFFu) - current.w;
917                         blendFactor.y = Short4(0xFFFFu) - current.w;
918                         blendFactor.z = Short4(0xFFFFu) - current.w;
919                         break;
920                 case BLEND_DESTALPHA:
921                         blendFactor.x = pixel.w;
922                         blendFactor.y = pixel.w;
923                         blendFactor.z = pixel.w;
924                         break;
925                 case BLEND_INVDESTALPHA:
926                         blendFactor.x = Short4(0xFFFFu) - pixel.w;
927                         blendFactor.y = Short4(0xFFFFu) - pixel.w;
928                         blendFactor.z = Short4(0xFFFFu) - pixel.w;
929                         break;
930                 case BLEND_SRCALPHASAT:
                            // min(0xFFFF - pixel.w, current.w), compared as unsigned 16-bit values.
931                         blendFactor.x = Short4(0xFFFFu) - pixel.w;
932                         blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w));
933                         blendFactor.y = blendFactor.x;
934                         blendFactor.z = blendFactor.x;
935                         break;
                    // Constant factors are read from DrawData::factor; elements 0..2 of the 4W
                    // arrays are used for x, y, z and element 3 for the *ALPHA variants.
936                 case BLEND_CONSTANT:
937                         blendFactor.x = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[0]));
938                         blendFactor.y = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[1]));
939                         blendFactor.z = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[2]));
940                         break;
941                 case BLEND_INVCONSTANT:
942                         blendFactor.x = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
943                         blendFactor.y = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
944                         blendFactor.z = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
945                         break;
946                 case BLEND_CONSTANTALPHA:
947                         blendFactor.x = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
948                         blendFactor.y = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
949                         blendFactor.z = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
950                         break;
951                 case BLEND_INVCONSTANTALPHA:
952                         blendFactor.x = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
953                         blendFactor.y = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
954                         blendFactor.z = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
955                         break;
956                 default:
957                         ASSERT(false);
958                 }
959         }
960         
961         void PixelRoutine::blendFactorAlpha(Registers &r, const Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorAlphaActive)
962         {
                    // Alpha-channel counterpart of blendFactor(): writes only the w component of
                    // 'blendFactor' according to 'blendFactorAlphaActive'. Because only alpha is
                    // involved, the color and alpha variants of a factor give the same result
                    // (SOURCE == SOURCEALPHA, DEST == DESTALPHA, CONSTANT == CONSTANTALPHA, ...).
963                 switch(blendFactorAlphaActive)
964                 {
                    // ZERO and ONE leave 'blendFactor' untouched; the visible caller (alphaBlend)
                    // skips the multiplication for these two factors.
965                 case BLEND_ZERO:
966                         // Optimized
967                         break;
968                 case BLEND_ONE:
969                         // Optimized
970                         break;
971                 case BLEND_SOURCE:
972                         blendFactor.w = current.w;
973                         break;
974                 case BLEND_INVSOURCE:
975                         blendFactor.w = Short4(0xFFFFu) - current.w;
976                         break;
977                 case BLEND_DEST:
978                         blendFactor.w = pixel.w;
979                         break;
980                 case BLEND_INVDEST:
981                         blendFactor.w = Short4(0xFFFFu) - pixel.w;
982                         break;
983                 case BLEND_SOURCEALPHA:
984                         blendFactor.w = current.w;
985                         break;
986                 case BLEND_INVSOURCEALPHA:
987                         blendFactor.w = Short4(0xFFFFu) - current.w;
988                         break;
989                 case BLEND_DESTALPHA:
990                         blendFactor.w = pixel.w;
991                         break;
992                 case BLEND_INVDESTALPHA:
993                         blendFactor.w = Short4(0xFFFFu) - pixel.w;
994                         break;
995                 case BLEND_SRCALPHASAT:
                            // For the alpha channel this factor is the constant 0xFFFF.
996                         blendFactor.w = Short4(0xFFFFu);
997                         break;
                    // Both constant variants read element 3 of the 4W constant arrays.
998                 case BLEND_CONSTANT:
999                 case BLEND_CONSTANTALPHA:
1000                         blendFactor.w = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
1001                         break;
1002                 case BLEND_INVCONSTANT:
1003                 case BLEND_INVCONSTANTALPHA:
1004                         blendFactor.w = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
1005                         break;
1006                 default:
1007                         ASSERT(false);
1008                 }
1009         }
1010
1011         void PixelRoutine::readPixel(Registers &r, int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &pixel)
1012         {
                     // Reads pixels from render target 'index' at position 'x' and expands them
                     // into 'pixel' as four 16-bit channel vectors (x, y, z, w). Data is fetched
                     // from 'cBuffer' and from the location one pitch (DrawData::colorPitchB[index])
                     // further on. The memory layout is chosen by state.targetFormat[index];
                     // formats without a stored alpha get w = 0xFFFF. When postBlendSRGB and
                     // state.writeSRGB are both set the result is passed through
                     // sRGBtoLinear16_12_16(). Asserts on an unhandled format.
                     // NOTE(review): the descriptions of the unpack sequences below assume that
                     // UnpackLow / UnpackHigh interleave the low / high halves of their operands
                     // (punpckl / punpckh style) - confirm against the Reactor implementation.
1013                 Short4 c01;
1014                 Short4 c23;
1015                 Pointer<Byte> buffer;
1016                 Pointer<Byte> buffer2;
1017
1018                 switch(state.targetFormat[index])
1019                 {
1020                 case FORMAT_R5G6B5:
                             // 2 bytes per unit of x. One Int is read from each of the two
                             // locations and the pair is reinterpreted as four 16-bit values.
1021                         buffer = cBuffer + 2 * x;
1022                         buffer2 = buffer + *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1023                         c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1024
                             // Isolate each bit field and shift it up to the top of its 16-bit lane.
1025                         pixel.x = c01 & Short4(0xF800u);
1026                         pixel.y = (c01 & Short4(0x07E0u)) << 5;
1027                         pixel.z = (c01 & Short4(0x001Fu)) << 11;
1028                         pixel.w = Short4(0xFFFFu);
1029                         break;
1030                 case FORMAT_A8R8G8B8:
                             // 4 bytes per unit of x; 8 bytes are read from each of the two locations.
1031                         buffer = cBuffer + 4 * x;
1032                         c01 = *Pointer<Short4>(buffer);
1033                         buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1034                         c23 = *Pointer<Short4>(buffer);
                             // Two rounds of byte interleaving gather the bytes of each channel
                             // together (first two channels in pixel.z, last two in pixel.x).
1035                         pixel.z = c01;
1036                         pixel.y = c01;
1037                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1038                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1039                         pixel.x = pixel.z;
1040                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1041                         pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1042                         pixel.y = pixel.z;
1043                         pixel.w = pixel.x;
                             // Unpacking a register with itself duplicates every byte, widening
                             // each 8-bit channel value to 16 bits.
1044                         pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1045                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1046                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1047                         pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1048                         break;
1049                 case FORMAT_A8B8G8R8:
                             // Same load and interleave as FORMAT_A8R8G8B8; only the final widening
                             // step differs, taking x from pixel.z and z from pixel.w so that the
                             // first and third channels end up swapped.
1050                         buffer = cBuffer + 4 * x;
1051                         c01 = *Pointer<Short4>(buffer);
1052                         buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1053                         c23 = *Pointer<Short4>(buffer);
1054                         pixel.z = c01;
1055                         pixel.y = c01;
1056                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1057                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1058                         pixel.x = pixel.z;
1059                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1060                         pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1061                         pixel.y = pixel.z;
1062                         pixel.w = pixel.x;
1063                         pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1064                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1065                         pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1066                         pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1067                         break;
1068                 case FORMAT_A8:
                             // 1 byte per unit of x. One Short (two bytes) from each location is
                             // inserted into lanes 0 and 1 of pixel.w, then every byte is duplicated.
                             // pixel.w is passed to Insert() before this function has assigned it;
                             // only lanes 0 and 1 are set explicitly here.
1069                         buffer = cBuffer + 1 * x;
1070                         pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 0);
1071                         buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1072                         pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 1);
1073                         pixel.w = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
                             // Alpha-only format: the color channels read as zero.
1074                         pixel.x = Short4(0x0000);
1075                         pixel.y = Short4(0x0000);
1076                         pixel.z = Short4(0x0000);
1077                         break;
1078                 case FORMAT_X8R8G8B8:
                             // As FORMAT_A8R8G8B8, except that the fourth byte is ignored and w is
                             // forced to 0xFFFF.
1079                         buffer = cBuffer + 4 * x;
1080                         c01 = *Pointer<Short4>(buffer);
1081                         buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1082                         c23 = *Pointer<Short4>(buffer);
1083                         pixel.z = c01;
1084                         pixel.y = c01;
1085                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1086                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1087                         pixel.x = pixel.z;
1088                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1089                         pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1090                         pixel.y = pixel.z;
1091                         pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1092                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1093                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1094                         pixel.w = Short4(0xFFFFu);
1095                         break;
1096                 case FORMAT_X8B8G8R8:
                             // As FORMAT_A8B8G8R8, except that w is forced to 0xFFFF. pixel.w is
                             // still used as a temporary before it is overwritten on the last line.
1097                         buffer = cBuffer + 4 * x;
1098                         c01 = *Pointer<Short4>(buffer);
1099                         buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1100                         c23 = *Pointer<Short4>(buffer);
1101                         pixel.z = c01;
1102                         pixel.y = c01;
1103                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1104                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1105                         pixel.x = pixel.z;
1106                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1107                         pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1108                         pixel.y = pixel.z;
1109                         pixel.w = pixel.x;
1110                         pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1111                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1112                         pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1113                         pixel.w = Short4(0xFFFFu);
1114                         break;
1115                 case FORMAT_A8G8R8B8Q:
                             // Not supported: reports UNIMPLEMENTED() and leaves 'pixel' unwritten.
1116                         UNIMPLEMENTED();
1117                 //      pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1118                 //      pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1119                 //      pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1120                 //      pixel.w = UnpackHigh(As<Byte8>(pixel.w), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1121                         break;
1122                 case FORMAT_X8G8R8B8Q:
                             // Not supported: reports UNIMPLEMENTED() and leaves 'pixel' unwritten.
1123                         UNIMPLEMENTED();
1124                 //      pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1125                 //      pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1126                 //      pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1127                 //      pixel.w = Short4(0xFFFFu);
1128                         break;
1129                 case FORMAT_A16B16G16R16:
                             // 8 bytes per unit of x. Two Short4 loads per location fill x, y, z, w
                             // one per register; the 4x4 transpose then regroups them so that each
                             // register holds one channel.
1130                         buffer = cBuffer;
1131                         pixel.x = *Pointer<Short4>(buffer + 8 * x);
1132                         pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
1133                         buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1134                         pixel.z = *Pointer<Short4>(buffer + 8 * x);
1135                         pixel.w = *Pointer<Short4>(buffer + 8 * x + 8);
1136                         transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
1137                         break;
1138                 case FORMAT_G16R16:
                             // 4 bytes per unit of x. Two rounds of 16-bit interleaving separate
                             // the two stored channels into x and y; z and w are set to 0xFFFF.
1139                         buffer = cBuffer;
1140                         pixel.x = *Pointer<Short4>(buffer + 4 * x);
1141                         buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1142                         pixel.y = *Pointer<Short4>(buffer + 4 * x);
1143                         pixel.z = pixel.x;
1144                         pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
1145                         pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y));
1146                         pixel.y = pixel.z;
1147                         pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z));
1148                         pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z));
1149                         pixel.z = Short4(0xFFFFu);
1150                         pixel.w = Short4(0xFFFFu);
1151                         break;
1152                 default:
1153                         ASSERT(false);
1154                 }
1155
                     // Optional conversion of the values just read, applied only when both the
                     // global postBlendSRGB switch and the per-draw writeSRGB state are set.
1156                 if(postBlendSRGB && state.writeSRGB)
1157                 {
1158                         sRGBtoLinear16_12_16(r, pixel);
1159                 }
1160         }
1161
1162         void PixelRoutine::alphaBlend(Registers &r, int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
1163         {
                     // Blends the incoming color 'current' with the color already stored in render
                     // target 'index' at position 'x', leaving the result in 'current'.
                     // Color (x, y, z) and alpha (w) are handled separately, each with its own
                     // source factor, destination factor and blend operation taken from 'state'.
                     // Does nothing when alpha blending is not active.
1164                 if(!state.alphaBlendActive)
1165                 {
1166                         return;
1167                 }
1168
                     // Fetch the destination color in the same 16-bit-per-channel form as 'current'.
1169                 Vector4s pixel;
1170                 readPixel(r, index, cBuffer, x, pixel);
1171
1172                 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
1173                 Vector4s sourceFactor;
1174                 Vector4s destFactor;
1175
1176                 blendFactor(r, sourceFactor, current, pixel, state.sourceBlendFactor);
1177                 blendFactor(r, destFactor, current, pixel, state.destBlendFactor);
1178
                     // Scale the operands by their factors. ONE and ZERO skip the multiply (the
                     // factor vectors are not filled in for them).
                     // NOTE(review): with BLEND_ZERO the operand is left unscaled rather than
                     // zeroed; presumably state.blendOperation has already been reduced upstream
                     // (e.g. to BLENDOP_SOURCE / DEST / NULL) to account for that - confirm.
                     // MulHigh on UShort4 presumably keeps the upper 16 bits of each product.
1179                 if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
1180                 {
1181                         current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x));
1182                         current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
1183                         current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
1184                 }
1185         
1186                 if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
1187                 {
1188                         pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
1189                         pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y));
1190                         pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z));
1191                 }
1192
                     // Combine the scaled source and destination colors; all arithmetic treats
                     // the channels as unsigned 16-bit values and add / subtract saturate.
1193                 switch(state.blendOperation)
1194                 {
1195                 case BLENDOP_ADD:
1196                         current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1197                         current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1198                         current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1199                         break;
1200                 case BLENDOP_SUB:
                             // source - destination
1201                         current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1202                         current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1203                         current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1204                         break;
1205                 case BLENDOP_INVSUB:
                             // destination - source
1206                         current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x));
1207                         current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y));
1208                         current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z));
1209                         break;
1210                 case BLENDOP_MIN:
1211                         current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x));
1212                         current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y));
1213                         current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z));
1214                         break;
1215                 case BLENDOP_MAX:
1216                         current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x));
1217                         current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y));
1218                         current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z));
1219                         break;
1220                 case BLENDOP_SOURCE:
1221                         // No operation
1222                         break;
1223                 case BLENDOP_DEST:
                             // Result is the destination term alone.
1224                         current.x = pixel.x;
1225                         current.y = pixel.y;
1226                         current.z = pixel.z;
1227                         break;
1228                 case BLENDOP_NULL:
                             // Result is zero.
1229                         current.x = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1230                         current.y = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1231                         current.z = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1232                         break;
1233                 default:
1234                         ASSERT(false);
1235                 }
1236
                     // Alpha channel: same scheme using the *Alpha factors and operation.
                     // The factor vectors are reused; only their w component is written and read.
1237                 blendFactorAlpha(r, sourceFactor, current, pixel, state.sourceBlendFactorAlpha);
1238                 blendFactorAlpha(r, destFactor, current, pixel, state.destBlendFactorAlpha);
1239
1240                 if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
1241                 {
1242                         current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
1243                 }
1244         
1245                 if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
1246                 {
1247                         pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
1248                 }
1249
1250                 switch(state.blendOperationAlpha)
1251                 {
1252                 case BLENDOP_ADD:
1253                         current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1254                         break;
1255                 case BLENDOP_SUB:
1256                         current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1257                         break;
1258                 case BLENDOP_INVSUB:
1259                         current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w));
1260                         break;
1261                 case BLENDOP_MIN:
1262                         current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w));
1263                         break;
1264                 case BLENDOP_MAX:
1265                         current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w));
1266                         break;
1267                 case BLENDOP_SOURCE:
1268                         // No operation
1269                         break;
1270                 case BLENDOP_DEST:
1271                         current.w = pixel.w;
1272                         break;
1273                 case BLENDOP_NULL:
1274                         current.w = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1275                         break;
1276                 default:
1277                         ASSERT(false);
1278                 }
1279         }
1280
1281         void PixelRoutine::logicOperation(Registers &r, int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
1282         {
1283                 if(state.logicalOperation == LOGICALOP_COPY)
1284                 {
1285                         return;
1286                 }
1287
1288                 Vector4s pixel;
1289                 readPixel(r, index, cBuffer, x, pixel);
1290
1291                 switch(state.logicalOperation)
1292                 {
1293                 case LOGICALOP_CLEAR:
1294                         current.x = 0;
1295                         current.y = 0;
1296                         current.z = 0;
1297                         break;
1298                 case LOGICALOP_SET:
1299                         current.x = 0xFFFFu;
1300                         current.y = 0xFFFFu;
1301                         current.z = 0xFFFFu;
1302                         break;
1303                 case LOGICALOP_COPY:
1304                         ASSERT(false);   // Optimized out
1305                         break;
1306                 case LOGICALOP_COPY_INVERTED:
1307                         current.x = ~current.x;
1308                         current.y = ~current.y;
1309                         current.z = ~current.z;
1310                         break;
1311                 case LOGICALOP_NOOP:
1312                         current.x = pixel.x;
1313                         current.y = pixel.y;
1314                         current.z = pixel.z;
1315                         break;
1316                 case LOGICALOP_INVERT:
1317                         current.x = ~pixel.x;
1318                         current.y = ~pixel.y;
1319                         current.z = ~pixel.z;
1320                         break;
1321                 case LOGICALOP_AND:
1322                         current.x = pixel.x & current.x;
1323                         current.y = pixel.y & current.y;
1324                         current.z = pixel.z & current.z;
1325                         break;
1326                 case LOGICALOP_NAND:
1327                         current.x = ~(pixel.x & current.x);
1328                         current.y = ~(pixel.y & current.y);
1329                         current.z = ~(pixel.z & current.z);
1330                         break;
1331                 case LOGICALOP_OR:
1332                         current.x = pixel.x | current.x;
1333                         current.y = pixel.y | current.y;
1334                         current.z = pixel.z | current.z;
1335                         break;
1336                 case LOGICALOP_NOR:
1337                         current.x = ~(pixel.x | current.x);
1338                         current.y = ~(pixel.y | current.y);
1339                         current.z = ~(pixel.z | current.z);
1340                         break;
1341                 case LOGICALOP_XOR:
1342                         current.x = pixel.x ^ current.x;
1343                         current.y = pixel.y ^ current.y;
1344                         current.z = pixel.z ^ current.z;
1345                         break;
1346                 case LOGICALOP_EQUIV:
1347                         current.x = ~(pixel.x ^ current.x);
1348                         current.y = ~(pixel.y ^ current.y);
1349                         current.z = ~(pixel.z ^ current.z);
1350                         break;
1351                 case LOGICALOP_AND_REVERSE:
1352                         current.x = ~pixel.x & current.x;
1353                         current.y = ~pixel.y & current.y;
1354                         current.z = ~pixel.z & current.z;
1355                         break;
1356                 case LOGICALOP_AND_INVERTED:
1357                         current.x = pixel.x & ~current.x;
1358                         current.y = pixel.y & ~current.y;
1359                         current.z = pixel.z & ~current.z;
1360                         break;
1361                 case LOGICALOP_OR_REVERSE:
1362                         current.x = ~pixel.x | current.x;
1363                         current.y = ~pixel.y | current.y;
1364                         current.z = ~pixel.z | current.z;
1365                         break;
1366                 case LOGICALOP_OR_INVERTED:
1367                         current.x = pixel.x | ~current.x;
1368                         current.y = pixel.y | ~current.y;
1369                         current.z = pixel.z | ~current.z;
1370                         break;
1371                 default:
1372                         ASSERT(false);
1373                 }
1374         }
1375
1376         void PixelRoutine::writeColor(Registers &r, int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &current, Int &sMask, Int &zMask, Int &cMask)
1377         {
1378                 if(postBlendSRGB && state.writeSRGB)
1379                 {
1380                         linearToSRGB16_12_16(r, current);
1381                 }
1382
1383                 if(exactColorRounding)
1384                 {
1385                         switch(state.targetFormat[index])
1386                         {
1387                         case FORMAT_R5G6B5:
1388                                 current.x = AddSat(As<UShort4>(current.x), UShort4(0x0400));
1389                                 current.y = AddSat(As<UShort4>(current.y), UShort4(0x0200));
1390                                 current.z = AddSat(As<UShort4>(current.z), UShort4(0x0400));
1391                                 break;
1392                         case FORMAT_X8G8R8B8Q:
1393                         case FORMAT_A8G8R8B8Q:
1394                         case FORMAT_X8R8G8B8:
1395                         case FORMAT_X8B8G8R8:
1396                         case FORMAT_A8R8G8B8:
1397                         case FORMAT_A8B8G8R8:
1398                                 current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
1399                                 current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
1400                                 current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
1401                                 current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
1402                                 break;
1403                         }
1404                 }
1405
1406                 int rgbaWriteMask = state.colorWriteActive(index);
1407                 int bgraWriteMask = rgbaWriteMask & 0x0000000A | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
1408                 int brgaWriteMask = rgbaWriteMask & 0x00000008 | (rgbaWriteMask & 0x00000001) << 1 | (rgbaWriteMask & 0x00000002) << 1 | (rgbaWriteMask & 0x00000004) >> 2;
1409
1410                 switch(state.targetFormat[index])
1411                 {
1412                 case FORMAT_R5G6B5:
1413                         {
1414                                 current.x = current.x & Short4(0xF800u);
1415                                 current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
1416                                 current.z = As<UShort4>(current.z) >> 11;
1417
1418                                 current.x = current.x | current.y | current.z;
1419                         }
1420                         break;
1421                 case FORMAT_X8G8R8B8Q:
1422                         UNIMPLEMENTED();
1423                 //      current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1424                 //      current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1425                 //      current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1426
1427                 //      current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1428                 //      current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1429                         break;
1430                 case FORMAT_A8G8R8B8Q:
1431                         UNIMPLEMENTED();
1432                 //      current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1433                 //      current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1434                 //      current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1435                 //      current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1436
1437                 //      current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1438                 //      current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1439                         break;
1440                 case FORMAT_X8R8G8B8:
1441                 case FORMAT_A8R8G8B8:
1442                         if(state.targetFormat[index] == FORMAT_X8R8G8B8 || rgbaWriteMask == 0x7)
1443                         {
1444                                 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1445                                 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1446                                 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1447
1448                                 current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1449                                 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1450
1451                                 current.x = current.z;
1452                                 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1453                                 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1454                                 current.y = current.z;
1455                                 current.z = As<Short4>(UnpackLow(current.z, current.x));
1456                                 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1457                         }
1458                         else
1459                         {
1460                                 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1461                                 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1462                                 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1463                                 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1464
1465                                 current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1466                                 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1467
1468                                 current.x = current.z;
1469                                 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1470                                 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1471                                 current.y = current.z;
1472                                 current.z = As<Short4>(UnpackLow(current.z, current.x));
1473                                 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1474                         }
1475                         break;
1476                 case FORMAT_X8B8G8R8:
1477                 case FORMAT_A8B8G8R8:
1478                         if(state.targetFormat[index] == FORMAT_X8B8G8R8 || rgbaWriteMask == 0x7)
1479                         {
1480                                 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1481                                 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1482                                 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1483
1484                                 current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
1485                                 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1486
1487                                 current.x = current.z;
1488                                 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1489                                 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1490                                 current.y = current.z;
1491                                 current.z = As<Short4>(UnpackLow(current.z, current.x));
1492                                 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1493                         }
1494                         else
1495                         {
1496                                 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1497                                 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1498                                 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1499                                 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1500
1501                                 current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
1502                                 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1503
1504                                 current.x = current.z;
1505                                 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1506                                 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1507                                 current.y = current.z;
1508                                 current.z = As<Short4>(UnpackLow(current.z, current.x));
1509                                 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1510                         }
1511                         break;
1512                 case FORMAT_A8:
1513                         current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1514                         current.w = As<Short4>(Pack(As<UShort4>(current.w), As<UShort4>(current.w)));
1515                         break;
1516                 case FORMAT_G16R16:
1517                         current.z = current.x;
1518                         current.x = As<Short4>(UnpackLow(current.x, current.y));
1519                         current.z = As<Short4>(UnpackHigh(current.z, current.y));
1520                         current.y = current.z;
1521                         break;
1522                 case FORMAT_A16B16G16R16:
1523                         transpose4x4(current.x, current.y, current.z, current.w);
1524                         break;
1525                 default:
1526                         ASSERT(false);
1527                 }
1528
1529                 Short4 c01 = current.z;
1530                 Short4 c23 = current.y;
1531
1532                 Int xMask;   // Combination of all masks
1533
1534                 if(state.depthTestActive)
1535                 {
1536                         xMask = zMask;
1537                 }
1538                 else
1539                 {
1540                         xMask = cMask;
1541                 }
1542
1543                 if(state.stencilActive)
1544                 {
1545                         xMask &= sMask;
1546                 }
1547
1548                 switch(state.targetFormat[index])
1549                 {
1550                 case FORMAT_R5G6B5:
1551                         {
1552                                 Pointer<Byte> buffer = cBuffer + 2 * x;
1553                                 Int value = *Pointer<Int>(buffer);
1554
1555                                 Int c01 = Extract(As<Int2>(current.x), 0);
1556
1557                                 if((bgraWriteMask & 0x00000007) != 0x00000007)
1558                                 {
1559                                         Int masked = value;
1560                                         c01 &= *Pointer<Int>(r.constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1561                                         masked &= *Pointer<Int>(r.constants + OFFSET(Constants,invMask565Q[bgraWriteMask & 0x7][0]));
1562                                         c01 |= masked;
1563                                 }
1564
1565                                 c01 &= *Pointer<Int>(r.constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
1566                                 value &= *Pointer<Int>(r.constants + OFFSET(Constants,invMaskW4Q[0][0]) + xMask * 8);
1567                                 c01 |= value;
1568                                 *Pointer<Int>(buffer) = c01;
1569
1570                                 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1571                                 value = *Pointer<Int>(buffer);
1572
1573                                 Int c23 = Extract(As<Int2>(current.x), 1);
1574
1575                                 if((bgraWriteMask & 0x00000007) != 0x00000007)
1576                                 {
1577                                         Int masked = value;
1578                                         c23 &= *Pointer<Int>(r.constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1579                                         masked &= *Pointer<Int>(r.constants + OFFSET(Constants,invMask565Q[bgraWriteMask & 0x7][0]));
1580                                         c23 |= masked;
1581                                 }
1582
1583                                 c23 &= *Pointer<Int>(r.constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
1584                                 value &= *Pointer<Int>(r.constants + OFFSET(Constants,invMaskW4Q[0][2]) + xMask * 8);
1585                                 c23 |= value;
1586                                 *Pointer<Int>(buffer) = c23;
1587                         }
1588                         break;
1589                 case FORMAT_A8G8R8B8Q:
1590                 case FORMAT_X8G8R8B8Q:   // FIXME: Don't touch alpha?
1591                         UNIMPLEMENTED();
1592                 //      value = *Pointer<Short4>(cBuffer + 8 * x + 0);
1593
1594                 //      if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1595                 //         ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1596                 //          (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1597                 //      {
1598                 //              Short4 masked = value;
1599                 //              c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1600                 //              masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1601                 //              c01 |= masked;
1602                 //      }
1603
1604                 //      c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1605                 //      value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1606                 //      c01 |= value;
1607                 //      *Pointer<Short4>(cBuffer + 8 * x + 0) = c01;
1608
1609                 //      value = *Pointer<Short4>(cBuffer + 8 * x + 8);
1610
1611                 //      if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1612                 //         ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1613                 //          (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1614                 //      {
1615                 //              Short4 masked = value;
1616                 //              c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1617                 //              masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1618                 //              c23 |= masked;
1619                 //      }
1620
1621                 //      c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1622                 //      value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1623                 //      c23 |= value;
1624                 //      *Pointer<Short4>(cBuffer + 8 * x + 8) = c23;
1625                         break;
1626                 case FORMAT_A8R8G8B8:
1627                 case FORMAT_X8R8G8B8:   // FIXME: Don't touch alpha?
1628                         {
1629                                 Pointer<Byte> buffer = cBuffer + x * 4;
1630                                 Short4 value = *Pointer<Short4>(buffer);
1631
1632                                 if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1633                                    ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1634                                         (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1635                                 {
1636                                         Short4 masked = value;
1637                                         c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1638                                         masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1639                                         c01 |= masked;
1640                                 }
1641
1642                                 c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1643                                 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1644                                 c01 |= value;
1645                                 *Pointer<Short4>(buffer) = c01;
1646
1647                                 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1648                                 value = *Pointer<Short4>(buffer);
1649
1650                                 if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1651                                    ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1652                                         (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1653                                 {
1654                                         Short4 masked = value;
1655                                         c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1656                                         masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1657                                         c23 |= masked;
1658                                 }
1659
1660                                 c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1661                                 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1662                                 c23 |= value;
1663                                 *Pointer<Short4>(buffer) = c23;
1664                         }
1665                         break;
1666                 case FORMAT_A8B8G8R8:
1667                 case FORMAT_X8B8G8R8:   // FIXME: Don't touch alpha?
1668                         {
1669                                 Pointer<Byte> buffer = cBuffer + x * 4;
1670                                 Short4 value = *Pointer<Short4>(buffer);
1671
1672                                 if((state.targetFormat[index] == FORMAT_A8B8G8R8 && rgbaWriteMask != 0x0000000F) ||
1673                                    ((state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x00000007) &&
1674                                         (state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x0000000F)))   // FIXME: Need for masking when XBGR && Fh?
1675                                 {
1676                                         Short4 masked = value;
1677                                         c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1678                                         masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1679                                         c01 |= masked;
1680                                 }
1681
1682                                 c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1683                                 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1684                                 c01 |= value;
1685                                 *Pointer<Short4>(buffer) = c01;
1686
1687                                 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1688                                 value = *Pointer<Short4>(buffer);
1689
1690                                 if((state.targetFormat[index] == FORMAT_A8B8G8R8 && rgbaWriteMask != 0x0000000F) ||
1691                                    ((state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x00000007) &&
1692                                         (state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x0000000F)))   // FIXME: Need for masking when XBGR && Fh?
1693                                 {
1694                                         Short4 masked = value;
1695                                         c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1696                                         masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1697                                         c23 |= masked;
1698                                 }
1699
1700                                 c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1701                                 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1702                                 c23 |= value;
1703                                 *Pointer<Short4>(buffer) = c23;
1704                         }
1705                         break;
1706                 case FORMAT_A8:
1707                         if(rgbaWriteMask & 0x00000008)
1708                         {
1709                                 Pointer<Byte> buffer = cBuffer + 1 * x;
1710                                 Short4 value;
1711                                 Insert(value, *Pointer<Short>(buffer), 0);
1712                                 Int pitch = *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1713                                 Insert(value, *Pointer<Short>(buffer + pitch), 1);
1714                                 value = UnpackLow(As<Byte8>(value), As<Byte8>(value));
1715
1716                                 current.w &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q) + 8 * xMask);
1717                                 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q) + 8 * xMask);
1718                                 current.w |= value;
1719
1720                                 *Pointer<Short>(buffer) = Extract(current.w, 0);
1721                                 *Pointer<Short>(buffer + pitch) = Extract(current.w, 1);
1722                         }
1723                         break;
1724                 case FORMAT_G16R16:
1725                         {
1726                                 Pointer<Byte> buffer = cBuffer + 4 * x;
1727
1728                                 Short4 value = *Pointer<Short4>(buffer);
1729
1730                                 if((rgbaWriteMask & 0x00000003) != 0x00000003)
1731                                 {
1732                                         Short4 masked = value;
1733                                         current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1734                                         masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0]));
1735                                         current.x |= masked;
1736                                 }
1737
1738                                 current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1739                                 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1740                                 current.x |= value;
1741                                 *Pointer<Short4>(buffer) = current.x;
1742
1743                                 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1744
1745                                 value = *Pointer<Short4>(buffer);
1746
1747                                 if((rgbaWriteMask & 0x00000003) != 0x00000003)
1748                                 {
1749                                         Short4 masked = value;
1750                                         current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1751                                         masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0]));
1752                                         current.y |= masked;
1753                                 }
1754
1755                                 current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1756                                 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1757                                 current.y |= value;
1758                                 *Pointer<Short4>(buffer) = current.y;
1759                         }
1760                         break;
1761                 case FORMAT_A16B16G16R16:
1762                         {
1763                                 Pointer<Byte> buffer = cBuffer + 8 * x;
1764
1765                                 {
1766                                         Short4 value = *Pointer<Short4>(buffer);
1767
1768                                         if(rgbaWriteMask != 0x0000000F)
1769                                         {
1770                                                 Short4 masked = value;
1771                                                 current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1772                                                 masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1773                                                 current.x |= masked;
1774                                         }
1775
1776                                         current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
1777                                         value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
1778                                         current.x |= value;
1779                                         *Pointer<Short4>(buffer) = current.x;
1780                                 }
1781
1782                                 {
1783                                         Short4 value = *Pointer<Short4>(buffer + 8);
1784
1785                                         if(rgbaWriteMask != 0x0000000F)
1786                                         {
1787                                                 Short4 masked = value;
1788                                                 current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1789                                                 masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1790                                                 current.y |= masked;
1791                                         }
1792
1793                                         current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
1794                                         value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
1795                                         current.y |= value;
1796                                         *Pointer<Short4>(buffer + 8) = current.y;
1797                                 }
1798
1799                                 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1800
1801                                 {
1802                                         Short4 value = *Pointer<Short4>(buffer);
1803
1804                                         if(rgbaWriteMask != 0x0000000F)
1805                                         {
1806                                                 Short4 masked = value;
1807                                                 current.z &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1808                                                 masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1809                                                 current.z |= masked;
1810                                         }
1811
1812                                         current.z &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
1813                                         value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
1814                                         current.z |= value;
1815                                         *Pointer<Short4>(buffer) = current.z;
1816                                 }
1817
1818                                 {
1819                                         Short4 value = *Pointer<Short4>(buffer + 8);
1820
1821                                         if(rgbaWriteMask != 0x0000000F)
1822                                         {
1823                                                 Short4 masked = value;
1824                                                 current.w &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1825                                                 masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1826                                                 current.w |= masked;
1827                                         }
1828
1829                                         current.w &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
1830                                         value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
1831                                         current.w |= value;
1832                                         *Pointer<Short4>(buffer + 8) = current.w;
1833                                 }
1834                         }
1835                         break;
1836                 default:
1837                         ASSERT(false);
1838                 }
1839         }
1840
1841         void PixelRoutine::blendFactor(Registers &r, const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorActive) 
1842         {
1843                 switch(blendFactorActive)
1844                 {
1845                 case BLEND_ZERO:
1846                         // Optimized
1847                         break;
1848                 case BLEND_ONE:
1849                         // Optimized
1850                         break;
1851                 case BLEND_SOURCE:
1852                         blendFactor.x = oC.x;
1853                         blendFactor.y = oC.y;
1854                         blendFactor.z = oC.z;
1855                         break;
1856                 case BLEND_INVSOURCE:
1857                         blendFactor.x = Float4(1.0f) - oC.x;
1858                         blendFactor.y = Float4(1.0f) - oC.y;
1859                         blendFactor.z = Float4(1.0f) - oC.z;
1860                         break;
1861                 case BLEND_DEST:
1862                         blendFactor.x = pixel.x;
1863                         blendFactor.y = pixel.y;
1864                         blendFactor.z = pixel.z;
1865                         break;
1866                 case BLEND_INVDEST:
1867                         blendFactor.x = Float4(1.0f) - pixel.x;
1868                         blendFactor.y = Float4(1.0f) - pixel.y;
1869                         blendFactor.z = Float4(1.0f) - pixel.z;
1870                         break;
1871                 case BLEND_SOURCEALPHA:
1872                         blendFactor.x = oC.w;
1873                         blendFactor.y = oC.w;
1874                         blendFactor.z = oC.w;
1875                         break;
1876                 case BLEND_INVSOURCEALPHA:
1877                         blendFactor.x = Float4(1.0f) - oC.w;
1878                         blendFactor.y = Float4(1.0f) - oC.w;
1879                         blendFactor.z = Float4(1.0f) - oC.w;
1880                         break;
1881                 case BLEND_DESTALPHA:
1882                         blendFactor.x = pixel.w;
1883                         blendFactor.y = pixel.w;
1884                         blendFactor.z = pixel.w;
1885                         break;
1886                 case BLEND_INVDESTALPHA:
1887                         blendFactor.x = Float4(1.0f) - pixel.w;
1888                         blendFactor.y = Float4(1.0f) - pixel.w;
1889                         blendFactor.z = Float4(1.0f) - pixel.w;
1890                         break;
1891                 case BLEND_SRCALPHASAT:
1892                         blendFactor.x = Float4(1.0f) - pixel.w;
1893                         blendFactor.x = Min(blendFactor.x, oC.w);
1894                         blendFactor.y = blendFactor.x;
1895                         blendFactor.z = blendFactor.x;
1896                         break;
1897                 case BLEND_CONSTANT:
1898                         blendFactor.x = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[0]));
1899                         blendFactor.y = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[1]));
1900                         blendFactor.z = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[2]));
1901                         break;
1902                 case BLEND_INVCONSTANT:
1903                         blendFactor.x = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
1904                         blendFactor.y = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
1905                         blendFactor.z = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
1906                         break;
1907                 default:
1908                         ASSERT(false);
1909                 }
1910         }
1911
1912         void PixelRoutine::blendFactorAlpha(Registers &r, const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorAlphaActive) 
1913         {
1914                 switch(blendFactorAlphaActive)
1915                 {
1916                 case BLEND_ZERO:
1917                         // Optimized
1918                         break;
1919                 case BLEND_ONE:
1920                         // Optimized
1921                         break;
1922                 case BLEND_SOURCE:
1923                         blendFactor.w = oC.w;
1924                         break;
1925                 case BLEND_INVSOURCE:
1926                         blendFactor.w = Float4(1.0f) - oC.w;
1927                         break;
1928                 case BLEND_DEST:
1929                         blendFactor.w = pixel.w;
1930                         break;
1931                 case BLEND_INVDEST:
1932                         blendFactor.w = Float4(1.0f) - pixel.w;
1933                         break;
1934                 case BLEND_SOURCEALPHA:
1935                         blendFactor.w = oC.w;
1936                         break;
1937                 case BLEND_INVSOURCEALPHA:
1938                         blendFactor.w = Float4(1.0f) - oC.w;
1939                         break;
1940                 case BLEND_DESTALPHA:
1941                         blendFactor.w = pixel.w;
1942                         break;
1943                 case BLEND_INVDESTALPHA:
1944                         blendFactor.w = Float4(1.0f) - pixel.w;
1945                         break;
1946                 case BLEND_SRCALPHASAT:
1947                         blendFactor.w = Float4(1.0f);
1948                         break;
1949                 case BLEND_CONSTANT:
1950                         blendFactor.w = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[3]));
1951                         break;
1952                 case BLEND_INVCONSTANT:
1953                         blendFactor.w = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
1954                         break;
1955                 default:
1956                         ASSERT(false);
1957                 }
1958         }
1959
1960         void PixelRoutine::alphaBlend(Registers &r, int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x)
1961         {
1962                 if(!state.alphaBlendActive)
1963                 {
1964                         return;
1965                 }
1966
1967                 Pointer<Byte> buffer;
1968                 Vector4f pixel;
1969
1970                 Vector4s color;
1971                 Short4 c01;
1972                 Short4 c23;
1973
1974                 switch(state.targetFormat[index])
1975                 {
1976                 case FORMAT_R32F:
1977                         buffer = cBuffer;
1978                         // FIXME: movlps
1979                         pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0);
1980                         pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4);
1981                         buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1982                         // FIXME: movhps
1983                         pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0);
1984                         pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4);
1985                         pixel.y = Float4(1.0f);
1986                         pixel.z = Float4(1.0f);
1987                         pixel.w = Float4(1.0f);
1988                         break;
1989                 case FORMAT_G32R32F:
1990                         buffer = cBuffer;
1991                         pixel.x = *Pointer<Float4>(buffer + 8 * x, 16);
1992                         buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1993                         pixel.y = *Pointer<Float4>(buffer + 8 * x, 16);
1994                         pixel.z = pixel.x;
1995                         pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x88);
1996                         pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0xDD);
1997                         pixel.y = pixel.z;
1998                         pixel.z = Float4(1.0f);
1999                         pixel.w = Float4(1.0f);
2000                         break;
2001                 case FORMAT_A32B32G32R32F:
2002                         buffer = cBuffer;
2003                         pixel.x = *Pointer<Float4>(buffer + 16 * x, 16);
2004                         pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2005                         buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
2006                         pixel.z = *Pointer<Float4>(buffer + 16 * x, 16);
2007                         pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2008                         transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
2009                         break;
2010                 default:
2011                         ASSERT(false);
2012                 }
2013
2014                 if(postBlendSRGB && state.writeSRGB)
2015                 {
2016                         sRGBtoLinear(pixel.x);
2017                         sRGBtoLinear(pixel.y);
2018                         sRGBtoLinear(pixel.z);
2019                 }
2020
2021                 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
2022                 Vector4f sourceFactor;
2023                 Vector4f destFactor;
2024
2025                 blendFactor(r, sourceFactor, oC, pixel, state.sourceBlendFactor);
2026                 blendFactor(r, destFactor, oC, pixel, state.destBlendFactor);
2027
2028                 if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
2029                 {
2030                         oC.x *= sourceFactor.x;
2031                         oC.y *= sourceFactor.y;
2032                         oC.z *= sourceFactor.z;
2033                 }
2034         
2035                 if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
2036                 {
2037                         pixel.x *= destFactor.x;
2038                         pixel.y *= destFactor.y;
2039                         pixel.z *= destFactor.z;
2040                 }
2041
2042                 switch(state.blendOperation)
2043                 {
2044                 case BLENDOP_ADD:
2045                         oC.x += pixel.x;
2046                         oC.y += pixel.y;
2047                         oC.z += pixel.z;
2048                         break;
2049                 case BLENDOP_SUB:
2050                         oC.x -= pixel.x;
2051                         oC.y -= pixel.y;
2052                         oC.z -= pixel.z;
2053                         break;
2054                 case BLENDOP_INVSUB:
2055                         oC.x = pixel.x - oC.x;
2056                         oC.y = pixel.y - oC.y;
2057                         oC.z = pixel.z - oC.z;
2058                         break;
2059                 case BLENDOP_MIN:
2060                         oC.x = Min(oC.x, pixel.x);
2061                         oC.y = Min(oC.y, pixel.y);
2062                         oC.z = Min(oC.z, pixel.z);
2063                         break;
2064                 case BLENDOP_MAX:
2065                         oC.x = Max(oC.x, pixel.x);
2066                         oC.y = Max(oC.y, pixel.y);
2067                         oC.z = Max(oC.z, pixel.z);
2068                         break;
2069                 case BLENDOP_SOURCE:
2070                         // No operation
2071                         break;
2072                 case BLENDOP_DEST:
2073                         oC.x = pixel.x;
2074                         oC.y = pixel.y;
2075                         oC.z = pixel.z;
2076                         break;
2077                 case BLENDOP_NULL:
2078                         oC.x = Float4(0.0f);
2079                         oC.y = Float4(0.0f);
2080                         oC.z = Float4(0.0f);
2081                         break;
2082                 default:
2083                         ASSERT(false);
2084                 }
2085
2086                 blendFactorAlpha(r, sourceFactor, oC, pixel, state.sourceBlendFactorAlpha);
2087                 blendFactorAlpha(r, destFactor, oC, pixel, state.destBlendFactorAlpha);
2088
2089                 if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
2090                 {
2091                         oC.w *= sourceFactor.w;
2092                 }
2093         
2094                 if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
2095                 {
2096                         pixel.w *= destFactor.w;
2097                 }
2098
2099                 switch(state.blendOperationAlpha)
2100                 {
2101                 case BLENDOP_ADD:
2102                         oC.w += pixel.w;
2103                         break;
2104                 case BLENDOP_SUB:
2105                         oC.w -= pixel.w;
2106                         break;
2107                 case BLENDOP_INVSUB:
2108                         pixel.w -= oC.w;
2109                         oC.w = pixel.w;
2110                         break;
2111                 case BLENDOP_MIN:       
2112                         oC.w = Min(oC.w, pixel.w);
2113                         break;
2114                 case BLENDOP_MAX:       
2115                         oC.w = Max(oC.w, pixel.w);
2116                         break;
2117                 case BLENDOP_SOURCE:
2118                         // No operation
2119                         break;
2120                 case BLENDOP_DEST:
2121                         oC.w = pixel.w;
2122                         break;
2123                 case BLENDOP_NULL:
2124                         oC.w = Float4(0.0f);
2125                         break;
2126                 default:
2127                         ASSERT(false);
2128                 }
2129         }
2130
2131         void PixelRoutine::writeColor(Registers &r, int index, Pointer<Byte> &cBuffer, Int &x, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask)
2132         {
2133                 switch(state.targetFormat[index])
2134                 {
2135                 case FORMAT_R32F:
2136                         break;
2137                 case FORMAT_G32R32F:
2138                         oC.z = oC.x;
2139                         oC.x = UnpackLow(oC.x, oC.y);
2140                         oC.z = UnpackHigh(oC.z, oC.y);
2141                         oC.y = oC.z;
2142                         break;
2143                 case FORMAT_A32B32G32R32F:
2144                         transpose4x4(oC.x, oC.y, oC.z, oC.w);
2145                         break;
2146                 default:
2147                         ASSERT(false);
2148                 }
2149
2150                 int rgbaWriteMask = state.colorWriteActive(index);
2151
2152                 Int xMask;   // Combination of all masks
2153
2154                 if(state.depthTestActive)
2155                 {
2156                         xMask = zMask;
2157                 }
2158                 else
2159                 {
2160                         xMask = cMask;
2161                 }
2162
2163                 if(state.stencilActive)
2164                 {
2165                         xMask &= sMask;
2166                 }
2167
2168                 Pointer<Byte> buffer;
2169                 Float4 value;
2170
2171                 switch(state.targetFormat[index])
2172                 {
2173                 case FORMAT_R32F:
2174                         if(rgbaWriteMask & 0x00000001)
2175                         {
2176                                 buffer = cBuffer + 4 * x;
2177
2178                                 // FIXME: movlps
2179                                 value.x = *Pointer<Float>(buffer + 0);
2180                                 value.y = *Pointer<Float>(buffer + 4);
2181
2182                                 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
2183
2184                                 // FIXME: movhps
2185                                 value.z = *Pointer<Float>(buffer + 0);
2186                                 value.w = *Pointer<Float>(buffer + 4);
2187
2188                                 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X) + xMask * 16, 16));
2189                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16));
2190                                 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2191
2192                                 // FIXME: movhps
2193                                 *Pointer<Float>(buffer + 0) = oC.x.z;
2194                                 *Pointer<Float>(buffer + 4) = oC.x.w;
2195
2196                                 buffer -= *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
2197
2198                                 // FIXME: movlps
2199                                 *Pointer<Float>(buffer + 0) = oC.x.x;
2200                                 *Pointer<Float>(buffer + 4) = oC.x.y;
2201                         }
2202                         break;
2203                 case FORMAT_G32R32F:
2204                         buffer = cBuffer + 8 * x;
2205
2206                         value = *Pointer<Float4>(buffer);
2207
2208                         if((rgbaWriteMask & 0x00000003) != 0x00000003)
2209                         {
2210                                 Float4 masked = value;
2211                                 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2212                                 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD01X[rgbaWriteMask & 0x3][0])));
2213                                 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2214                         }
2215
2216                         oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16));
2217                         value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16));
2218                         oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2219                         *Pointer<Float4>(buffer) = oC.x;
2220
2221                         buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
2222
2223                         value = *Pointer<Float4>(buffer);
2224
2225                         if((rgbaWriteMask & 0x00000003) != 0x00000003)
2226                         {
2227                                 Float4 masked;
2228
2229                                 masked = value;
2230                                 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2231                                 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD01X[rgbaWriteMask & 0x3][0])));
2232                                 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2233                         }
2234
2235                         oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16));
2236                         value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16));
2237                         oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2238                         *Pointer<Float4>(buffer) = oC.y;
2239                         break;
2240                 case FORMAT_A32B32G32R32F:
2241                         buffer = cBuffer + 16 * x;
2242
2243                         {
2244                                 value = *Pointer<Float4>(buffer, 16);
2245
2246                                 if(rgbaWriteMask != 0x0000000F)
2247                                 {
2248                                         Float4 masked = value;
2249                                         oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2250                                         masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2251                                         oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2252                                 }
2253                                 
2254                                 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskX0X) + xMask * 16, 16));
2255                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16));
2256                                 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2257                                 *Pointer<Float4>(buffer, 16) = oC.x;
2258                         }
2259
2260                         {
2261                                 value = *Pointer<Float4>(buffer + 16, 16);
2262
2263                                 if(rgbaWriteMask != 0x0000000F)
2264                                 {       
2265                                         Float4 masked = value;
2266                                         oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2267                                         masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2268                                         oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2269                                 }
2270
2271                                 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskX1X) + xMask * 16, 16));
2272                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16));
2273                                 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2274                                 *Pointer<Float4>(buffer + 16, 16) = oC.y;
2275                         }
2276
2277                         buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
2278
2279                         {
2280                                 value = *Pointer<Float4>(buffer, 16);
2281
2282                                 if(rgbaWriteMask != 0x0000000F)
2283                                 {
2284                                         Float4 masked = value;
2285                                         oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2286                                         masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2287                                         oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked));
2288                                 }
2289
2290                                 oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskX2X) + xMask * 16, 16));
2291                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16));
2292                                 oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value));
2293                                 *Pointer<Float4>(buffer, 16) = oC.z;
2294                         }
2295
2296                         {
2297                                 value = *Pointer<Float4>(buffer + 16, 16);
2298
2299                                 if(rgbaWriteMask != 0x0000000F)
2300                                 {
2301                                         Float4 masked = value;
2302                                         oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2303                                         masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2304                                         oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked));
2305                                 }
2306
2307                                 oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskX3X) + xMask * 16, 16));
2308                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16));
2309                                 oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value));
2310                                 *Pointer<Float4>(buffer + 16, 16) = oC.w;
2311                         }
2312                         break;
2313                 default:
2314                         ASSERT(false);
2315                 }
2316         }
2317
2318         UShort4 PixelRoutine::convertFixed16(Float4 &cf, bool saturate)
2319         {
2320                 return UShort4(cf * Float4(0xFFFF), saturate);
2321         }
2322
2323         void PixelRoutine::sRGBtoLinear16_12_16(Registers &r, Vector4s &c)
2324         {
2325                 c.x = As<UShort4>(c.x) >> 4;
2326                 c.y = As<UShort4>(c.y) >> 4;
2327                 c.z = As<UShort4>(c.z) >> 4;
2328
2329                 sRGBtoLinear12_16(r, c);
2330         }
2331
2332         void PixelRoutine::sRGBtoLinear12_16(Registers &r, Vector4s &c)
2333         {
2334                 Pointer<Byte> LUT = r.constants + OFFSET(Constants,sRGBtoLinear12_16);
2335
2336                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2337                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2338                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2339                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2340
2341                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2342                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2343                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2344                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2345
2346                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2347                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2348                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2349                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2350         }
2351
2352         void PixelRoutine::linearToSRGB16_12_16(Registers &r, Vector4s &c)
2353         {
2354                 c.x = As<UShort4>(c.x) >> 4;
2355                 c.y = As<UShort4>(c.y) >> 4;
2356                 c.z = As<UShort4>(c.z) >> 4;
2357
2358                 linearToSRGB12_16(r, c);
2359         }
2360
2361         void PixelRoutine::linearToSRGB12_16(Registers &r, Vector4s &c)
2362         {
2363                 Pointer<Byte> LUT = r.constants + OFFSET(Constants,linearToSRGB12_16);
2364
2365                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2366                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2367                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2368                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2369
2370                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2371                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2372                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2373                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2374
2375                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2376                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2377                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2378                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2379         }
2380
2381         Float4 PixelRoutine::sRGBtoLinear(const Float4 &x)   // Approximates x^2.2
2382         {
2383                 Float4 linear = x * x;
2384                 linear = linear * Float4(0.73f) + linear * x * Float4(0.27f);
2385
2386                 return Min(Max(linear, Float4(0.0f)), Float4(1.0f));
2387         }
2388
2389         bool PixelRoutine::colorUsed()
2390         {
2391                 return state.colorWriteMask || state.alphaTestActive() || state.shaderContainsKill;
2392         }
2393 }