OSDN Git Service

Remove unused readPixel() parameter.
[android-x86/external-swiftshader.git] / src / Shader / PixelRoutine.cpp
1 // SwiftShader Software Renderer
2 //
3 // Copyright(c) 2005-2013 TransGaming Inc.
4 //
5 // All rights reserved. No part of this software may be copied, distributed, transmitted,
6 // transcribed, stored in a retrieval system, translated into any human or computer
7 // language by any means, or disclosed to third parties without the explicit written
8 // agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express
9 // or implied, including but not limited to any patent rights, are granted to you.
10 //
11
12 #include "PixelRoutine.hpp"
13
14 #include "Renderer.hpp"
15 #include "QuadRasterizer.hpp"
16 #include "Surface.hpp"
17 #include "Primitive.hpp"
18 #include "CPUID.hpp"
19 #include "SamplerCore.hpp"
20 #include "Constants.hpp"
21 #include "Debug.hpp"
22
23 namespace sw
24 {
25         extern bool complementaryDepthBuffer;
26         extern bool postBlendSRGB;
27         extern bool exactColorRounding;
28         extern bool forceClearRegisters;
29
30         PixelRoutine::Registers::Registers(const PixelShader *shader) :
31                 QuadRasterizer::Registers(),
32                 rf(shader && shader->dynamicallyIndexedTemporaries),
33                 vf(shader && shader->dynamicallyIndexedInput)
34         {
35                 if(!shader || shader->getVersion() < 0x0200 || forceClearRegisters)
36                 {
37                         for(int i = 0; i < 10; i++)
38                         {
39                                 vf[i].x = Float4(0.0f);
40                                 vf[i].y = Float4(0.0f);
41                                 vf[i].z = Float4(0.0f);
42                                 vf[i].w = Float4(0.0f);
43                         }
44                 }
45         }
46
47         PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader) : QuadRasterizer(state, shader)
48         {
49         }
50
51         PixelRoutine::~PixelRoutine()
52         {
53                 for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
54                 {
55                         delete sampler[i];
56                 }
57         }
58
59         void PixelRoutine::quad(QuadRasterizer::Registers &rBase, Pointer<Byte> cBuffer[4], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
60         {
           // Emits the pixel pipeline for one quad, in this order: stencil test, depth
           // interpolation, optional early depth test, interpolant setup, shader, alpha test,
           // optional late depth test, depth and color writes, and finally the stencil write.
           //
           // rBase    register file; it is downcast to PixelRoutine::Registers below.
           // cBuffer  color buffer pointers, handed to rasterOperation().
           // zBuffer  depth buffer pointer, handed to depthTest() and writeDepth().
           // sBuffer  stencil buffer pointer, handed to stencilTest() and writeStencil().
           // cMask    one coverage mask per sample (indexed 0 .. state.multiSample - 1).
           // x, y     quad position, used to build the interpolation coordinates.
61                 Registers& r = *static_cast<Registers*>(&rBase);
62
63                 #if PERF_PROFILE
64                         Long pipeTime = Ticks();
65                 #endif
66
                   // One SamplerCore per texture image unit; ~PixelRoutine() deletes them.
                   // NOTE(review): a second call to quad() on the same object would overwrite
                   // these pointers without freeing them - confirm quad() runs once per routine.
67                 for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
68                 {
69                         sampler[i] = new SamplerCore(r.constants, state.sampler[i]);
70                 }
71
                   // The depth test is performed before shading only when the state's depth
                   // override flag is clear and alpha testing is inactive.
72                 const bool earlyDepthTest = !state.depthOverride && !state.alphaTestActive();
73
74                 Int zMask[4];   // Depth mask
75                 Int sMask[4];   // Stencil mask
76
                   // Both per-sample masks start out equal to the coverage mask.
77                 for(unsigned int q = 0; q < state.multiSample; q++)
78                 {
79                         zMask[q] = cMask[q];
80                         sMask[q] = cMask[q];
81                 }
82
83                 for(unsigned int q = 0; q < state.multiSample; q++)
84                 {
85                         stencilTest(r, sBuffer, q, x, sMask[q], cMask[q]);
86                 }
87
                   // Fog value; only assigned further down when state.fog.component is set.
88                 Float4 f;
89
90                 Float4 (&z)[4] = r.z;
91                 Float4 &w = r.w;
92                 Float4 &rhw = r.rhw;
93                 Float4 rhwCentroid;
94
                   // x coordinate of the quad plus the primitive's xQuad offsets.
95                 Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(r.primitive + OFFSET(Primitive,xQuad), 16);
96
97                 if(interpolateZ())
98                 {
99                         for(unsigned int q = 0; q < state.multiSample; q++)
100                         {
                            // This Float4 x shadows the Int parameter x inside the loop body.
101                                 Float4 x = xxxx;
102                         
                            // With multisampling, subtract the per-sample entry of Constants::X.
103                                 if(state.multiSample > 1)
104                                 {
105                                         x -= *Pointer<Float4>(r.constants + OFFSET(Constants,X) + q * sizeof(float4));
106                                 }
107
108                                 z[q] = interpolate(x, r.Dz[q], z[q], r.primitive + OFFSET(Primitive,z), false, false);
109                         }
110                 }
111
                    // Becomes true as soon as the depth test reports a pass for any sample.
112                 Bool depthPass = false;
113
114                 if(earlyDepthTest)
115                 {
116                         for(unsigned int q = 0; q < state.multiSample; q++)
117                         {
118                                 depthPass = depthPass || depthTest(r, zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
119                         }
120                 }
121
                    // Shade when a sample passed the early depth test, or always when the depth
                    // test is deferred until after the shader.
122                 If(depthPass || Bool(!earlyDepthTest))
123                 {
124                         #if PERF_PROFILE
125                                 Long interpTime = Ticks();
126                         #endif
127
                            // y coordinate of the quad plus the primitive's yQuad offsets.
128                         Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(r.primitive + OFFSET(Primitive,yQuad), 16);
129
130                         // Centroid locations
131                         Float4 XXXX = Float4(0.0f);
132                         Float4 YYYY = Float4(0.0f);
133
134                         if(state.centroid)
135                         {
                                    // Sum of weights, seeded with 1.0e-9f.
                                    // NOTE(review): presumably the seed keeps the reciprocal below
                                    // finite when no sample contributes - confirm.
136                                 Float4 WWWW(1.0e-9f);
137
                                    // Accumulate table entries selected by each sample's coverage mask.
138                                 for(unsigned int q = 0; q < state.multiSample; q++)
139                                 {
140                                         XXXX += *Pointer<Float4>(r.constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]);
141                                         YYYY += *Pointer<Float4>(r.constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]);
142                                         WWWW += *Pointer<Float4>(r.constants + OFFSET(Constants,weight) + 16 * cMask[q]);
143                                 }
144
                                    // Normalize the sums by the accumulated weight, then offset them
                                    // by the quad coordinates.
145                                 WWWW = Rcp_pp(WWWW);
146                                 XXXX *= WWWW;
147                                 YYYY *= WWWW;
148
149                                 XXXX += xxxx;
150                                 YYYY += yyyy;
151                         }
152
153                         if(interpolateW())
154                         {
                                    // w holds the interpolated plane value; rhw is its reciprocal.
155                                 w = interpolate(xxxx, r.Dw, rhw, r.primitive + OFFSET(Primitive,w), false, false);
156                                 rhw = reciprocal(w);
157
158                                 if(state.centroid)
159                                 {
                                            // rhwCentroid is passed in uninitialized here; interpolateCentroid()
                                            // only reads it when its perspective argument is true, which it is not.
160                                         rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, r.primitive + OFFSET(Primitive,w), false, false));
161                                 }
162                         }
163
                            // Interpolate every enabled component of the ten input registers, at the
                            // quad coordinates or at the centroid depending on the interpolant's flag.
164                         for(int interpolant = 0; interpolant < 10; interpolant++)
165                         {
166                                 for(int component = 0; component < 4; component++)
167                                 {
168                                         if(state.interpolant[interpolant].component & (1 << component))
169                                         {
170                                                 if(!state.interpolant[interpolant].centroid)
171                                                 {
172                                                         r.vf[interpolant][component] = interpolate(xxxx, r.Dv[interpolant][component], rhw, r.primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
173                                                 }
174                                                 else
175                                                 {
176                                                         r.vf[interpolant][component] = interpolateCentroid(XXXX, YYYY, rhwCentroid, r.primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
177                                                 }
178                                         }
179                                 }
180
181                                 Float4 rcp;
182
                                    // Projection: project == 1, 2 or 3 multiplies the components before
                                    // y, z or w respectively by the reciprocal of that component.
183                                 switch(state.interpolant[interpolant].project)
184                                 {
185                                 case 0:
186                                         break;
187                                 case 1:
188                                         rcp = reciprocal(r.vf[interpolant].y);
189                                         r.vf[interpolant].x = r.vf[interpolant].x * rcp;
190                                         break;
191                                 case 2:
192                                         rcp = reciprocal(r.vf[interpolant].z);
193                                         r.vf[interpolant].x = r.vf[interpolant].x * rcp;
194                                         r.vf[interpolant].y = r.vf[interpolant].y * rcp;
195                                         break;
196                                 case 3:
197                                         rcp = reciprocal(r.vf[interpolant].w);
198                                         r.vf[interpolant].x = r.vf[interpolant].x * rcp;
199                                         r.vf[interpolant].y = r.vf[interpolant].y * rcp;
200                                         r.vf[interpolant].z = r.vf[interpolant].z * rcp;
201                                         break;
202                                 }
203                         }
204
205                         if(state.fog.component)
206                         {
207                                 f = interpolate(xxxx, r.Df, rhw, r.primitive + OFFSET(Primitive,f), state.fog.flat & 0x01, state.perspective);
208                         }
209
210                         setBuiltins(r, x, y, z, w);
211
212                         #if PERF_PROFILE
213                                 r.cycles[PERF_INTERP] += Ticks() - interpTime;
214                         #endif
215
                            // Stays true when no color is used, so depth-only rendering still
                            // reaches the write stage below.
216                         Bool alphaPass = true;
217
218                         if(colorUsed())
219                         {
220                                 #if PERF_PROFILE
221                                         Long shaderTime = Ticks();
222                                 #endif
223
224                                 applyShader(r, cMask);
225
226                                 #if PERF_PROFILE
227                                         r.cycles[PERF_SHADER] += Ticks() - shaderTime;
228                                 #endif
229
230                                 alphaPass = alphaTest(r, cMask);
231
                                    // Fold the coverage mask back into the depth and stencil masks.
                                    // NOTE(review): presumably applyShader()/alphaTest() cleared bits in
                                    // cMask for killed or rejected pixels - confirm against those functions.
232                                 if((shader && shader->containsKill()) || state.alphaTestActive())
233                                 {
234                                         for(unsigned int q = 0; q < state.multiSample; q++)
235                                         {
236                                                 zMask[q] &= cMask[q];
237                                                 sMask[q] &= cMask[q];
238                                         }
239                                 }
240                         }
241
242                         If(alphaPass)
243                         {
                                    // Late depth test, used when the early test was not allowed.
244                                 if(!earlyDepthTest)
245                                 {
246                                         for(unsigned int q = 0; q < state.multiSample; q++)
247                                         {
248                                                 depthPass = depthPass || depthTest(r, zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
249                                         }
250                                 }
251
252                                 #if PERF_PROFILE
253                                         Long ropTime = Ticks();
254                                 #endif
255
                                    // With an early depth test this block was only entered after a
                                    // pass, so the condition is forced true in that case.
256                                 If(depthPass || Bool(earlyDepthTest))
257                                 {
258                                         for(unsigned int q = 0; q < state.multiSample; q++)
259                                         {
                                                    // Only samples enabled in state.multiSampleMask are written.
260                                                 if(state.multiSampleMask & (1 << q))
261                                                 {
262                                                         writeDepth(r, zBuffer, q, x, z[q], zMask[q]);
263
264                                                         if(state.occlusionEnabled)
265                                                         {
                                                                    // Table lookup indexed by the combined depth and stencil mask.
266                                                                 r.occlusion += *Pointer<UInt>(r.constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q]));
267                                                         }
268                                                 }
269                                         }
270
271                                         if(colorUsed())
272                                         {
273                                                 #if PERF_PROFILE
274                                                         AddAtomic(Pointer<Long>(&profiler.ropOperations), 4);
275                                                 #endif
276
277                                                 rasterOperation(r, f, cBuffer, x, sMask, zMask, cMask);
278                                         }
279                                 }
280
281                                 #if PERF_PROFILE
282                                         r.cycles[PERF_ROP] += Ticks() - ropTime;
283                                 #endif
284                         }
285                 }
286
                    // The stencil write sits outside the If blocks above, so it is emitted
                    // regardless of the depth and alpha test outcome.
287                 for(unsigned int q = 0; q < state.multiSample; q++)
288                 {
289                         if(state.multiSampleMask & (1 << q))
290                         {
291                                 writeStencil(r, sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
292                         }
293                 }
294
295                 #if PERF_PROFILE
296                         r.cycles[PERF_PIPE] += Ticks() - pipeTime;
297                 #endif
298         }
299
300         Float4 PixelRoutine::interpolateCentroid(Float4 &x, Float4 &y, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective)
301         {
302                 Float4 interpolant = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,C), 16);
303
304                 if(!flat)
305                 {
306                         interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,A), 16) +
307                                        y * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,B), 16);
308
309                         if(perspective)
310                         {
311                                 interpolant *= rhw;
312                         }
313                 }
314
315                 return interpolant;
316         }
317
318         void PixelRoutine::stencilTest(Registers &r, Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask)
319         {
320                 if(!state.stencilActive)
321                 {
322                         return;
323                 }
324
325                 // (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
326
327                 Pointer<Byte> buffer = sBuffer + 2 * x;
328
329                 if(q > 0)
330                 {
331                         buffer += q * *Pointer<Int>(r.data + OFFSET(DrawData,stencilSliceB));
332                 }
333
334                 Byte8 value = As<Byte8>(Long1(*Pointer<UInt>(buffer)));
335                 Byte8 valueCCW = value;
336
337                 if(!state.noStencilMask)
338                 {
339                         value &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[0].testMaskQ));
340                 }
341
342                 stencilTest(r, value, state.stencilCompareMode, false);
343
344                 if(state.twoSidedStencil)
345                 {
346                         if(!state.noStencilMaskCCW)
347                         {
348                                 valueCCW &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[1].testMaskQ));
349                         }
350
351                         stencilTest(r, valueCCW, state.stencilCompareModeCCW, true);
352
353                         value &= *Pointer<Byte8>(r.primitive + OFFSET(Primitive,clockwiseMask));
354                         valueCCW &= *Pointer<Byte8>(r.primitive + OFFSET(Primitive,invClockwiseMask));
355                         value |= valueCCW;
356                 }
357
358                 sMask = SignMask(value) & cMask;
359         }
360
361         void PixelRoutine::stencilTest(Registers &r, Byte8 &value, StencilCompareMode stencilCompareMode, bool CCW)
362         {
363                 Byte8 equal;
364
365                 switch(stencilCompareMode)
366                 {
367                 case STENCIL_ALWAYS:
368                         value = Byte8(0xFFFFFFFFFFFFFFFF);
369                         break;
370                 case STENCIL_NEVER:
371                         value = Byte8(0x0000000000000000);
372                         break;
373                 case STENCIL_LESS:                      // a < b ~ b > a
374                         value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
375                         value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
376                         break;
377                 case STENCIL_EQUAL:
378                         value = CmpEQ(value, *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
379                         break;
380                 case STENCIL_NOTEQUAL:          // a != b ~ !(a == b)
381                         value = CmpEQ(value, *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
382                         value ^= Byte8(0xFFFFFFFFFFFFFFFF);
383                         break;
384                 case STENCIL_LESSEQUAL: // a <= b ~ (b > a) || (a == b)
385                         equal = value;
386                         equal = CmpEQ(equal, *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
387                         value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
388                         value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
389                         value |= equal;
390                         break;
391                 case STENCIL_GREATER:           // a > b
392                         equal = *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ));
393                         value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
394                         equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
395                         value = equal;
396                         break;
397                 case STENCIL_GREATEREQUAL:      // a >= b ~ !(a < b) ~ !(b > a)
398                         value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
399                         value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
400                         value ^= Byte8(0xFFFFFFFFFFFFFFFF);
401                         break;
402                 default:
403                         ASSERT(false);
404                 }
405         }
406
407         Bool PixelRoutine::depthTest(Registers &r, Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask)
408         {
409                 if(!state.depthTestActive)
410                 {
411                         return true;
412                 }
413
414                 Float4 Z = z;
415
416                 if(shader && shader->depthOverride())
417                 {
418                         if(complementaryDepthBuffer)
419                         {
420                                 Z = Float4(1.0f) - r.oDepth;
421                         }
422                         else
423                         {
424                                 Z = r.oDepth;
425                         }
426                 }
427
428                 Pointer<Byte> buffer;
429                 Int pitch;
430
431                 if(!state.quadLayoutDepthBuffer)
432                 {
433                         buffer = zBuffer + 4 * x;
434                         pitch = *Pointer<Int>(r.data + OFFSET(DrawData,depthPitchB));
435                 }
436                 else
437                 {
438                         buffer = zBuffer + 8 * x;
439                 }
440
441                 if(q > 0)
442                 {
443                         buffer += q * *Pointer<Int>(r.data + OFFSET(DrawData,depthSliceB));
444                 }
445
446                 Float4 zValue;
447
448                 if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
449                 {
450                         if(!state.quadLayoutDepthBuffer)
451                         {
452                                 // FIXME: Properly optimizes?
453                                 zValue.xy = *Pointer<Float4>(buffer);
454                                 zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
455                         }
456                         else
457                         {
458                                 zValue = *Pointer<Float4>(buffer, 16);
459                         }
460                 }
461
462                 Int4 zTest;
463
464                 switch(state.depthCompareMode)
465                 {
466                 case DEPTH_ALWAYS:
467                         // Optimized
468                         break;
469                 case DEPTH_NEVER:
470                         // Optimized
471                         break;
472                 case DEPTH_EQUAL:
473                         zTest = CmpEQ(zValue, Z);
474                         break;
475                 case DEPTH_NOTEQUAL:
476                         zTest = CmpNEQ(zValue, Z);
477                         break;
478                 case DEPTH_LESS:
479                         if(complementaryDepthBuffer)
480                         {
481                                 zTest = CmpLT(zValue, Z);
482                         }
483                         else
484                         {
485                                 zTest = CmpNLE(zValue, Z);
486                         }
487                         break;
488                 case DEPTH_GREATEREQUAL:
489                         if(complementaryDepthBuffer)
490                         {
491                                 zTest = CmpNLT(zValue, Z);
492                         }
493                         else
494                         {
495                                 zTest = CmpLE(zValue, Z);
496                         }
497                         break;
498                 case DEPTH_LESSEQUAL:
499                         if(complementaryDepthBuffer)
500                         {
501                                 zTest = CmpLE(zValue, Z);
502                         }
503                         else
504                         {
505                                 zTest = CmpNLT(zValue, Z);
506                         }
507                         break;
508                 case DEPTH_GREATER:
509                         if(complementaryDepthBuffer)
510                         {
511                                 zTest = CmpNLE(zValue, Z);
512                         }
513                         else
514                         {
515                                 zTest = CmpLT(zValue, Z);
516                         }
517                         break;
518                 default:
519                         ASSERT(false);
520                 }
521
522                 switch(state.depthCompareMode)
523                 {
524                 case DEPTH_ALWAYS:
525                         zMask = cMask;
526                         break;
527                 case DEPTH_NEVER:
528                         zMask = 0x0;
529                         break;
530                 default:
531                         zMask = SignMask(zTest) & cMask;
532                         break;
533                 }
534                 
535                 if(state.stencilActive)
536                 {
537                         zMask &= sMask;
538                 }
539
540                 return zMask != 0;
541         }
542
543         void PixelRoutine::alphaTest(Registers &r, Int &aMask, Short4 &alpha)
544         {
545                 Short4 cmp;
546                 Short4 equal;
547
548                 switch(state.alphaCompareMode)
549                 {
550                 case ALPHA_ALWAYS:
551                         aMask = 0xF;
552                         break;
553                 case ALPHA_NEVER:
554                         aMask = 0x0;
555                         break;
556                 case ALPHA_EQUAL:
557                         cmp = CmpEQ(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)));
558                         aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
559                         break;
560                 case ALPHA_NOTEQUAL:            // a != b ~ !(a == b)
561                         cmp = CmpEQ(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4((short)0xFFFF, (short)0xFFFF, (short)0xFFFF, (short)0xFFFF);   // FIXME
562                         aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
563                         break;
564                 case ALPHA_LESS:                        // a < b ~ b > a
565                         cmp = CmpGT(*Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)), alpha);
566                         aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
567                         break;
568                 case ALPHA_GREATEREQUAL:        // a >= b ~ (a > b) || (a == b) ~ !(b > a)   // TODO: Approximate
569                         equal = CmpEQ(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)));
570                         cmp = CmpGT(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)));
571                         cmp |= equal;
572                         aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
573                         break;
574                 case ALPHA_LESSEQUAL:           // a <= b ~ !(a > b)
575                         cmp = CmpGT(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4((short)0xFFFF, (short)0xFFFF, (short)0xFFFF, (short)0xFFFF);   // FIXME
576                         aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
577                         break;
578                 case ALPHA_GREATER:                     // a > b
579                         cmp = CmpGT(alpha, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.alphaReference4)));
580                         aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
581                         break;
582                 default:
583                         ASSERT(false);
584                 }
585         }
586
587         void PixelRoutine::alphaToCoverage(Registers &r, Int cMask[4], Float4 &alpha)
588         {
589                 Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(r.data + OFFSET(DrawData,a2c0)));
590                 Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(r.data + OFFSET(DrawData,a2c1)));
591                 Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(r.data + OFFSET(DrawData,a2c2)));
592                 Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(r.data + OFFSET(DrawData,a2c3)));
593
594                 Int aMask0 = SignMask(coverage0);
595                 Int aMask1 = SignMask(coverage1);
596                 Int aMask2 = SignMask(coverage2);
597                 Int aMask3 = SignMask(coverage3);
598
599                 cMask[0] &= aMask0;
600                 cMask[1] &= aMask1;
601                 cMask[2] &= aMask2;
602                 cMask[3] &= aMask3;
603         }
604
605         void PixelRoutine::fogBlend(Registers &r, Vector4f &c0, Float4 &fog, Float4 &z, Float4 &rhw)
606         {
607                 if(!state.fogActive)
608                 {
609                         return;
610                 }
611
612                 if(state.pixelFogMode != FOG_NONE)
613                 {
614                         pixelFog(r, fog, z, rhw);
615
616                         fog = Min(fog, Float4(1.0f));
617                         fog = Max(fog, Float4(0.0f));
618                 }
619
620                 c0.x -= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[0]));
621                 c0.y -= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[1]));
622                 c0.z -= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[2]));
623
624                 c0.x *= fog;
625                 c0.y *= fog;
626                 c0.z *= fog;
627
628                 c0.x += *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[0]));
629                 c0.y += *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[1]));
630                 c0.z += *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[2]));
631         }
632
633         void PixelRoutine::pixelFog(Registers &r, Float4 &visibility, Float4 &z, Float4 &rhw)
634         {
635                 Float4 &zw = visibility;
636
637                 if(state.pixelFogMode != FOG_NONE)
638                 {
639                         if(state.wBasedFog)
640                         {
641                                 zw = rhw;
642                         }
643                         else
644                         {
645                                 if(complementaryDepthBuffer)
646                                 {
647                                         zw = Float4(1.0f) - z;
648                                 }
649                                 else
650                                 {
651                                         zw = z;
652                                 }
653                         }
654                 }
655
656                 switch(state.pixelFogMode)
657                 {
658                 case FOG_NONE:
659                         break;
660                 case FOG_LINEAR:
661                         zw *= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.scale));
662                         zw += *Pointer<Float4>(r.data + OFFSET(DrawData,fog.offset));
663                         break;
664                 case FOG_EXP:
665                         zw *= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.densityE));
666                         zw = exponential2(zw, true);
667                         break;
668                 case FOG_EXP2:
669                         zw *= zw;
670                         zw *= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.density2E));
671                         zw = exponential2(zw, true);
672                         break;
673                 default:
674                         ASSERT(false);
675                 }
676         }
677
678         void PixelRoutine::writeDepth(Registers &r, Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask)
679         {
680                 if(!state.depthWriteEnable)
681                 {
682                         return;
683                 }
684
685                 Float4 Z = z;
686
687                 if(shader && shader->depthOverride())
688                 {
689                         if(complementaryDepthBuffer)
690                         {
691                                 Z = Float4(1.0f) - r.oDepth;
692                         }
693                         else
694                         {
695                                 Z = r.oDepth;
696                         }
697                 }
698
699                 Pointer<Byte> buffer;
700                 Int pitch;
701
702                 if(!state.quadLayoutDepthBuffer)
703                 {       
704                         buffer = zBuffer + 4 * x;
705                         pitch = *Pointer<Int>(r.data + OFFSET(DrawData,depthPitchB));
706                 }
707                 else
708                 {       
709                         buffer = zBuffer + 8 * x;
710                 }
711
712                 if(q > 0)
713                 {
714                         buffer += q * *Pointer<Int>(r.data + OFFSET(DrawData,depthSliceB));
715                 }
716
717                 Float4 zValue;
718
719                 if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
720                 {
721                         if(!state.quadLayoutDepthBuffer)
722                         {
723                                 // FIXME: Properly optimizes?
724                                 zValue.xy = *Pointer<Float4>(buffer);
725                                 zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
726                         }
727                         else
728                         {
729                                 zValue = *Pointer<Float4>(buffer, 16);
730                         }
731                 }
732
733                 Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X) + zMask * 16, 16));
734                 zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16));
735                 Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
736
737                 if(!state.quadLayoutDepthBuffer)
738                 {
739                         // FIXME: Properly optimizes?
740                         *Pointer<Float2>(buffer) = Float2(Z.xy);
741                         *Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
742                 }
743                 else
744                 {
745                         *Pointer<Float4>(buffer, 16) = Z;
746                 }
747         }
748
749         void PixelRoutine::writeStencil(Registers &r, Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask)
750         {
751                 if(!state.stencilActive)
752                 {
753                         return;
754                 }
755
756                 if(state.stencilPassOperation == OPERATION_KEEP && state.stencilZFailOperation == OPERATION_KEEP && state.stencilFailOperation == OPERATION_KEEP)
757                 {
758                         if(!state.twoSidedStencil || (state.stencilPassOperationCCW == OPERATION_KEEP && state.stencilZFailOperationCCW == OPERATION_KEEP && state.stencilFailOperationCCW == OPERATION_KEEP))
759                         {
760                                 return;
761                         }
762                 }
763
764                 if(state.stencilWriteMasked && (!state.twoSidedStencil || state.stencilWriteMaskedCCW))
765                 {
766                         return;
767                 }
768
769                 Pointer<Byte> buffer = sBuffer + 2 * x;
770
771                 if(q > 0)
772                 {
773                         buffer += q * *Pointer<Int>(r.data + OFFSET(DrawData,stencilSliceB));
774                 }
775
776                 Byte8 bufferValue = As<Byte8>(Long1(*Pointer<UInt>(buffer)));
777         
778                 Byte8 newValue;
779                 stencilOperation(r, newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask);
780
781                 if(!state.noStencilWriteMask)
782                 {
783                         Byte8 maskedValue = bufferValue;
784                         newValue &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[0].writeMaskQ));
785                         maskedValue &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[0].invWriteMaskQ));
786                         newValue |= maskedValue;
787                 }
788
789                 if(state.twoSidedStencil)
790                 {
791                         Byte8 newValueCCW;
792
793                         stencilOperation(r, newValueCCW, bufferValue, state.stencilPassOperationCCW, state.stencilZFailOperationCCW, state.stencilFailOperationCCW, true, zMask, sMask);
794
795                         if(!state.noStencilWriteMaskCCW)
796                         {
797                                 Byte8 maskedValue = bufferValue;
798                                 newValueCCW &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[1].writeMaskQ));
799                                 maskedValue &= *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[1].invWriteMaskQ));
800                                 newValueCCW |= maskedValue;
801                         }
802
803                         newValue &= *Pointer<Byte8>(r.primitive + OFFSET(Primitive,clockwiseMask));
804                         newValueCCW &= *Pointer<Byte8>(r.primitive + OFFSET(Primitive,invClockwiseMask));
805                         newValue |= newValueCCW;
806                 }
807
808                 newValue &= *Pointer<Byte8>(r.constants + OFFSET(Constants,maskB4Q) + 8 * cMask);
809                 bufferValue &= *Pointer<Byte8>(r.constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask);
810                 newValue |= bufferValue;
811
812                 *Pointer<UInt>(buffer) = UInt(As<Long>(newValue));
813         }
814
815         void PixelRoutine::stencilOperation(Registers &r, Byte8 &newValue, Byte8 &bufferValue, StencilOperation stencilPassOperation, StencilOperation stencilZFailOperation, StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask)
816         {
817                 Byte8 &pass = newValue;
818                 Byte8 fail;
819                 Byte8 zFail;
820
821                 stencilOperation(r, pass, bufferValue, stencilPassOperation, CCW);
822
823                 if(stencilZFailOperation != stencilPassOperation)
824                 {
825                         stencilOperation(r, zFail, bufferValue, stencilZFailOperation, CCW);
826                 }
827
828                 if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
829                 {
830                         stencilOperation(r, fail, bufferValue, stencilFailOperation, CCW);
831                 }
832
833                 if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
834                 {
835                         if(state.depthTestActive && stencilZFailOperation != stencilPassOperation)   // zMask valid and values not the same
836                         {
837                                 pass &= *Pointer<Byte8>(r.constants + OFFSET(Constants,maskB4Q) + 8 * zMask);
838                                 zFail &= *Pointer<Byte8>(r.constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask);
839                                 pass |= zFail;
840                         }
841
842                         pass &= *Pointer<Byte8>(r.constants + OFFSET(Constants,maskB4Q) + 8 * sMask);
843                         fail &= *Pointer<Byte8>(r.constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask);
844                         pass |= fail;
845                 }
846         }
847
848         void PixelRoutine::stencilOperation(Registers &r, Byte8 &output, Byte8 &bufferValue, StencilOperation operation, bool CCW)
849         {
850                 switch(operation)
851                 {
852                 case OPERATION_KEEP:
853                         output = bufferValue;
854                         break;
855                 case OPERATION_ZERO:
856                         output = Byte8(0x0000000000000000);
857                         break;
858                 case OPERATION_REPLACE:
859                         output = *Pointer<Byte8>(r.data + OFFSET(DrawData,stencil[CCW].referenceQ));
860                         break;
861                 case OPERATION_INCRSAT:
862                         output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
863                         break;
864                 case OPERATION_DECRSAT:
865                         output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
866                         break;
867                 case OPERATION_INVERT:
868                         output = bufferValue ^ Byte8(0xFFFFFFFFFFFFFFFF);
869                         break;
870                 case OPERATION_INCR:
871                         output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
872                         break;
873                 case OPERATION_DECR:
874                         output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
875                         break;
876                 default:
877                         ASSERT(false);
878                 }
879         }
880
881         void PixelRoutine::blendFactor(Registers &r, const Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorActive)
882         {
883                 switch(blendFactorActive)
884                 {
885                 case BLEND_ZERO:
886                         // Optimized
887                         break;
888                 case BLEND_ONE:
889                         // Optimized
890                         break;
891                 case BLEND_SOURCE:
892                         blendFactor.x = current.x;
893                         blendFactor.y = current.y;
894                         blendFactor.z = current.z;
895                         break;
896                 case BLEND_INVSOURCE:
897                         blendFactor.x = Short4(0xFFFFu) - current.x;
898                         blendFactor.y = Short4(0xFFFFu) - current.y;
899                         blendFactor.z = Short4(0xFFFFu) - current.z;
900                         break;
901                 case BLEND_DEST:
902                         blendFactor.x = pixel.x;
903                         blendFactor.y = pixel.y;
904                         blendFactor.z = pixel.z;
905                         break;
906                 case BLEND_INVDEST:
907                         blendFactor.x = Short4(0xFFFFu) - pixel.x;
908                         blendFactor.y = Short4(0xFFFFu) - pixel.y;
909                         blendFactor.z = Short4(0xFFFFu) - pixel.z;
910                         break;
911                 case BLEND_SOURCEALPHA:
912                         blendFactor.x = current.w;
913                         blendFactor.y = current.w;
914                         blendFactor.z = current.w;
915                         break;
916                 case BLEND_INVSOURCEALPHA:
917                         blendFactor.x = Short4(0xFFFFu) - current.w;
918                         blendFactor.y = Short4(0xFFFFu) - current.w;
919                         blendFactor.z = Short4(0xFFFFu) - current.w;
920                         break;
921                 case BLEND_DESTALPHA:
922                         blendFactor.x = pixel.w;
923                         blendFactor.y = pixel.w;
924                         blendFactor.z = pixel.w;
925                         break;
926                 case BLEND_INVDESTALPHA:
927                         blendFactor.x = Short4(0xFFFFu) - pixel.w;
928                         blendFactor.y = Short4(0xFFFFu) - pixel.w;
929                         blendFactor.z = Short4(0xFFFFu) - pixel.w;
930                         break;
931                 case BLEND_SRCALPHASAT:
932                         blendFactor.x = Short4(0xFFFFu) - pixel.w;
933                         blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w));
934                         blendFactor.y = blendFactor.x;
935                         blendFactor.z = blendFactor.x;
936                         break;
937                 case BLEND_CONSTANT:
938                         blendFactor.x = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[0]));
939                         blendFactor.y = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[1]));
940                         blendFactor.z = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[2]));
941                         break;
942                 case BLEND_INVCONSTANT:
943                         blendFactor.x = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
944                         blendFactor.y = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
945                         blendFactor.z = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
946                         break;
947                 case BLEND_CONSTANTALPHA:
948                         blendFactor.x = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
949                         blendFactor.y = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
950                         blendFactor.z = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
951                         break;
952                 case BLEND_INVCONSTANTALPHA:
953                         blendFactor.x = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
954                         blendFactor.y = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
955                         blendFactor.z = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
956                         break;
957                 default:
958                         ASSERT(false);
959                 }
960         }
961         
962         void PixelRoutine::blendFactorAlpha(Registers &r, const Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorAlphaActive)
963         {
964                 switch(blendFactorAlphaActive)
965                 {
966                 case BLEND_ZERO:
967                         // Optimized
968                         break;
969                 case BLEND_ONE:
970                         // Optimized
971                         break;
972                 case BLEND_SOURCE:
973                         blendFactor.w = current.w;
974                         break;
975                 case BLEND_INVSOURCE:
976                         blendFactor.w = Short4(0xFFFFu) - current.w;
977                         break;
978                 case BLEND_DEST:
979                         blendFactor.w = pixel.w;
980                         break;
981                 case BLEND_INVDEST:
982                         blendFactor.w = Short4(0xFFFFu) - pixel.w;
983                         break;
984                 case BLEND_SOURCEALPHA:
985                         blendFactor.w = current.w;
986                         break;
987                 case BLEND_INVSOURCEALPHA:
988                         blendFactor.w = Short4(0xFFFFu) - current.w;
989                         break;
990                 case BLEND_DESTALPHA:
991                         blendFactor.w = pixel.w;
992                         break;
993                 case BLEND_INVDESTALPHA:
994                         blendFactor.w = Short4(0xFFFFu) - pixel.w;
995                         break;
996                 case BLEND_SRCALPHASAT:
997                         blendFactor.w = Short4(0xFFFFu);
998                         break;
999                 case BLEND_CONSTANT:
1000                 case BLEND_CONSTANTALPHA:
1001                         blendFactor.w = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
1002                         break;
1003                 case BLEND_INVCONSTANT:
1004                 case BLEND_INVCONSTANTALPHA:
1005                         blendFactor.w = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
1006                         break;
1007                 default:
1008                         ASSERT(false);
1009                 }
1010         }
1011
1012         void PixelRoutine::readPixel(Registers &r, int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &pixel)
1013         {
1014                 Short4 c01;
1015                 Short4 c23;
1016                 Pointer<Byte> buffer;
1017
1018                 switch(state.targetFormat[index])
1019                 {
1020                 case FORMAT_R5G6B5:
1021                         buffer = cBuffer + 2 * x;
1022                         c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 0));
1023                         buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1024                         c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 1));
1025
1026                         pixel.x = c01 & Short4(0xF800u);
1027                         pixel.y = (c01 & Short4(0x07E0u)) << 5;
1028                         pixel.z = (c01 & Short4(0x001Fu)) << 11;
1029                         pixel.w = Short4(0xFFFFu);
1030                         break;
1031                 case FORMAT_A8R8G8B8:
1032                         buffer = cBuffer + 4 * x;
1033                         c01 = *Pointer<Short4>(buffer);
1034                         buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1035                         c23 = *Pointer<Short4>(buffer);
1036                         pixel.z = c01;
1037                         pixel.y = c01;
1038                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1039                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1040                         pixel.x = pixel.z;
1041                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1042                         pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1043                         pixel.y = pixel.z;
1044                         pixel.w = pixel.x;
1045                         pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1046                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1047                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1048                         pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1049                         break;
1050                 case FORMAT_A8B8G8R8:
1051                         buffer = cBuffer + 4 * x;
1052                         c01 = *Pointer<Short4>(buffer);
1053                         buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1054                         c23 = *Pointer<Short4>(buffer);
1055                         pixel.z = c01;
1056                         pixel.y = c01;
1057                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1058                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1059                         pixel.x = pixel.z;
1060                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1061                         pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1062                         pixel.y = pixel.z;
1063                         pixel.w = pixel.x;
1064                         pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1065                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1066                         pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1067                         pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1068                         break;
1069                 case FORMAT_A8:
1070                         buffer = cBuffer + 1 * x;
1071                         pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 0);
1072                         buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1073                         pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 1);
1074                         pixel.w = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1075                         pixel.x = Short4(0x0000);
1076                         pixel.y = Short4(0x0000);
1077                         pixel.z = Short4(0x0000);
1078                         break;
1079                 case FORMAT_X8R8G8B8:
1080                         buffer = cBuffer + 4 * x;
1081                         c01 = *Pointer<Short4>(buffer);
1082                         buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1083                         c23 = *Pointer<Short4>(buffer);
1084                         pixel.z = c01;
1085                         pixel.y = c01;
1086                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1087                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1088                         pixel.x = pixel.z;
1089                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1090                         pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1091                         pixel.y = pixel.z;
1092                         pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1093                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1094                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1095                         pixel.w = Short4(0xFFFFu);
1096                         break;
1097                 case FORMAT_X8B8G8R8:
1098                         buffer = cBuffer + 4 * x;
1099                         c01 = *Pointer<Short4>(buffer);
1100                         buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1101                         c23 = *Pointer<Short4>(buffer);
1102                         pixel.z = c01;
1103                         pixel.y = c01;
1104                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1105                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1106                         pixel.x = pixel.z;
1107                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1108                         pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1109                         pixel.y = pixel.z;
1110                         pixel.w = pixel.x;
1111                         pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1112                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1113                         pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1114                         pixel.w = Short4(0xFFFFu);
1115                         break;
1116                 case FORMAT_A8G8R8B8Q:
1117                         UNIMPLEMENTED();
1118                 //      pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1119                 //      pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1120                 //      pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1121                 //      pixel.w = UnpackHigh(As<Byte8>(pixel.w), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1122                         break;
1123                 case FORMAT_X8G8R8B8Q:
1124                         UNIMPLEMENTED();
1125                 //      pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1126                 //      pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1127                 //      pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1128                 //      pixel.w = Short4(0xFFFFu);
1129                         break;
1130                 case FORMAT_A16B16G16R16:
1131                         buffer = cBuffer;
1132                         pixel.x = *Pointer<Short4>(buffer + 8 * x);
1133                         pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
1134                         buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1135                         pixel.z = *Pointer<Short4>(buffer + 8 * x);
1136                         pixel.w = *Pointer<Short4>(buffer + 8 * x + 8);
1137                         transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
1138                         break;
1139                 case FORMAT_G16R16:
1140                         buffer = cBuffer;
1141                         pixel.x = *Pointer<Short4>(buffer + 4 * x);
1142                         buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
1143                         pixel.y = *Pointer<Short4>(buffer + 4 * x);
1144                         pixel.z = pixel.x;
1145                         pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
1146                         pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y));
1147                         pixel.y = pixel.z;
1148                         pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z));
1149                         pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z));
1150                         pixel.z = Short4(0xFFFFu);
1151                         pixel.w = Short4(0xFFFFu);
1152                         break;
1153                 default:
1154                         ASSERT(false);
1155                 }
1156
1157                 if(postBlendSRGB && state.writeSRGB)
1158                 {
1159                         sRGBtoLinear16_12_16(r, pixel);
1160                 }
1161         }
1162
1163         void PixelRoutine::alphaBlend(Registers &r, int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
1164         {
1165                 if(!state.alphaBlendActive)
1166                 {
1167                         return;
1168                 }
1169
1170                 Vector4s pixel;
1171                 readPixel(r, index, cBuffer, x, pixel);
1172
1173                 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
1174                 Vector4s sourceFactor;
1175                 Vector4s destFactor;
1176
1177                 blendFactor(r, sourceFactor, current, pixel, state.sourceBlendFactor);
1178                 blendFactor(r, destFactor, current, pixel, state.destBlendFactor);
1179
1180                 if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
1181                 {
1182                         current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x));
1183                         current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
1184                         current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
1185                 }
1186         
1187                 if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
1188                 {
1189                         pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
1190                         pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y));
1191                         pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z));
1192                 }
1193
1194                 switch(state.blendOperation)
1195                 {
1196                 case BLENDOP_ADD:
1197                         current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1198                         current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1199                         current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1200                         break;
1201                 case BLENDOP_SUB:
1202                         current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1203                         current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1204                         current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1205                         break;
1206                 case BLENDOP_INVSUB:
1207                         current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x));
1208                         current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y));
1209                         current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z));
1210                         break;
1211                 case BLENDOP_MIN:
1212                         current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x));
1213                         current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y));
1214                         current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z));
1215                         break;
1216                 case BLENDOP_MAX:
1217                         current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x));
1218                         current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y));
1219                         current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z));
1220                         break;
1221                 case BLENDOP_SOURCE:
1222                         // No operation
1223                         break;
1224                 case BLENDOP_DEST:
1225                         current.x = pixel.x;
1226                         current.y = pixel.y;
1227                         current.z = pixel.z;
1228                         break;
1229                 case BLENDOP_NULL:
1230                         current.x = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1231                         current.y = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1232                         current.z = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1233                         break;
1234                 default:
1235                         ASSERT(false);
1236                 }
1237
1238                 blendFactorAlpha(r, sourceFactor, current, pixel, state.sourceBlendFactorAlpha);
1239                 blendFactorAlpha(r, destFactor, current, pixel, state.destBlendFactorAlpha);
1240
1241                 if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
1242                 {
1243                         current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
1244                 }
1245         
1246                 if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
1247                 {
1248                         pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
1249                 }
1250
1251                 switch(state.blendOperationAlpha)
1252                 {
1253                 case BLENDOP_ADD:
1254                         current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1255                         break;
1256                 case BLENDOP_SUB:
1257                         current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1258                         break;
1259                 case BLENDOP_INVSUB:
1260                         current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w));
1261                         break;
1262                 case BLENDOP_MIN:
1263                         current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w));
1264                         break;
1265                 case BLENDOP_MAX:
1266                         current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w));
1267                         break;
1268                 case BLENDOP_SOURCE:
1269                         // No operation
1270                         break;
1271                 case BLENDOP_DEST:
1272                         current.w = pixel.w;
1273                         break;
1274                 case BLENDOP_NULL:
1275                         current.w = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1276                         break;
1277                 default:
1278                         ASSERT(false);
1279                 }
1280         }
1281
1282         void PixelRoutine::logicOperation(Registers &r, int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
1283         {
1284                 if(state.logicalOperation == LOGICALOP_COPY)
1285                 {
1286                         return;
1287                 }
1288
1289                 Vector4s pixel;
1290                 readPixel(r, index, cBuffer, x, pixel);
1291
1292                 switch(state.logicalOperation)
1293                 {
1294                 case LOGICALOP_CLEAR:
1295                         current.x = 0;
1296                         current.y = 0;
1297                         current.z = 0;
1298                         break;
1299                 case LOGICALOP_SET:
1300                         current.x = 0xFFFFu;
1301                         current.y = 0xFFFFu;
1302                         current.z = 0xFFFFu;
1303                         break;
1304                 case LOGICALOP_COPY:
1305                         ASSERT(false);   // Optimized out
1306                         break;
1307                 case LOGICALOP_COPY_INVERTED:
1308                         current.x = ~current.x;
1309                         current.y = ~current.y;
1310                         current.z = ~current.z;
1311                         break;
1312                 case LOGICALOP_NOOP:
1313                         current.x = pixel.x;
1314                         current.y = pixel.y;
1315                         current.z = pixel.z;
1316                         break;
1317                 case LOGICALOP_INVERT:
1318                         current.x = ~pixel.x;
1319                         current.y = ~pixel.y;
1320                         current.z = ~pixel.z;
1321                         break;
1322                 case LOGICALOP_AND:
1323                         current.x = pixel.x & current.x;
1324                         current.y = pixel.y & current.y;
1325                         current.z = pixel.z & current.z;
1326                         break;
1327                 case LOGICALOP_NAND:
1328                         current.x = ~(pixel.x & current.x);
1329                         current.y = ~(pixel.y & current.y);
1330                         current.z = ~(pixel.z & current.z);
1331                         break;
1332                 case LOGICALOP_OR:
1333                         current.x = pixel.x | current.x;
1334                         current.y = pixel.y | current.y;
1335                         current.z = pixel.z | current.z;
1336                         break;
1337                 case LOGICALOP_NOR:
1338                         current.x = ~(pixel.x | current.x);
1339                         current.y = ~(pixel.y | current.y);
1340                         current.z = ~(pixel.z | current.z);
1341                         break;
1342                 case LOGICALOP_XOR:
1343                         current.x = pixel.x ^ current.x;
1344                         current.y = pixel.y ^ current.y;
1345                         current.z = pixel.z ^ current.z;
1346                         break;
1347                 case LOGICALOP_EQUIV:
1348                         current.x = ~(pixel.x ^ current.x);
1349                         current.y = ~(pixel.y ^ current.y);
1350                         current.z = ~(pixel.z ^ current.z);
1351                         break;
1352                 case LOGICALOP_AND_REVERSE:
1353                         current.x = ~pixel.x & current.x;
1354                         current.y = ~pixel.y & current.y;
1355                         current.z = ~pixel.z & current.z;
1356                         break;
1357                 case LOGICALOP_AND_INVERTED:
1358                         current.x = pixel.x & ~current.x;
1359                         current.y = pixel.y & ~current.y;
1360                         current.z = pixel.z & ~current.z;
1361                         break;
1362                 case LOGICALOP_OR_REVERSE:
1363                         current.x = ~pixel.x | current.x;
1364                         current.y = ~pixel.y | current.y;
1365                         current.z = ~pixel.z | current.z;
1366                         break;
1367                 case LOGICALOP_OR_INVERTED:
1368                         current.x = pixel.x | ~current.x;
1369                         current.y = pixel.y | ~current.y;
1370                         current.z = pixel.z | ~current.z;
1371                         break;
1372                 default:
1373                         ASSERT(false);
1374                 }
1375         }
1376
1377         void PixelRoutine::writeColor(Registers &r, int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &current, Int &sMask, Int &zMask, Int &cMask)
1378         {
1379                 if(postBlendSRGB && state.writeSRGB)
1380                 {
1381                         linearToSRGB16_12_16(r, current);
1382                 }
1383
1384                 if(exactColorRounding)
1385                 {
1386                         switch(state.targetFormat[index])
1387                         {
1388                         case FORMAT_R5G6B5:
1389                                 current.x = AddSat(As<UShort4>(current.x), UShort4(0x0400));
1390                                 current.y = AddSat(As<UShort4>(current.y), UShort4(0x0200));
1391                                 current.z = AddSat(As<UShort4>(current.z), UShort4(0x0400));
1392                                 break;
1393                         case FORMAT_X8G8R8B8Q:
1394                         case FORMAT_A8G8R8B8Q:
1395                         case FORMAT_X8R8G8B8:
1396                         case FORMAT_X8B8G8R8:
1397                         case FORMAT_A8R8G8B8:
1398                         case FORMAT_A8B8G8R8:
1399                                 current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
1400                                 current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
1401                                 current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
1402                                 current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
1403                                 break;
1404                         }
1405                 }
1406
1407                 int rgbaWriteMask = state.colorWriteActive(index);
1408                 int bgraWriteMask = rgbaWriteMask & 0x0000000A | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
1409                 int brgaWriteMask = rgbaWriteMask & 0x00000008 | (rgbaWriteMask & 0x00000001) << 1 | (rgbaWriteMask & 0x00000002) << 1 | (rgbaWriteMask & 0x00000004) >> 2;
1410
1411                 switch(state.targetFormat[index])
1412                 {
1413                 case FORMAT_R5G6B5:
1414                         {
1415                                 current.x = current.x & Short4(0xF800u);
1416                                 current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
1417                                 current.z = As<UShort4>(current.z) >> 11;
1418
1419                                 current.x = current.x | current.y | current.z;
1420                         }
1421                         break;
1422                 case FORMAT_X8G8R8B8Q:
1423                         UNIMPLEMENTED();
1424                 //      current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1425                 //      current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1426                 //      current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1427
1428                 //      current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1429                 //      current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1430                         break;
1431                 case FORMAT_A8G8R8B8Q:
1432                         UNIMPLEMENTED();
1433                 //      current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1434                 //      current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1435                 //      current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1436                 //      current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1437
1438                 //      current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1439                 //      current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1440                         break;
1441                 case FORMAT_X8R8G8B8:
1442                 case FORMAT_A8R8G8B8:
1443                         if(state.targetFormat[index] == FORMAT_X8R8G8B8 || rgbaWriteMask == 0x7)
1444                         {
1445                                 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1446                                 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1447                                 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1448
1449                                 current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1450                                 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1451
1452                                 current.x = current.z;
1453                                 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1454                                 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1455                                 current.y = current.z;
1456                                 current.z = As<Short4>(UnpackLow(current.z, current.x));
1457                                 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1458                         }
1459                         else
1460                         {
1461                                 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1462                                 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1463                                 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1464                                 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1465
1466                                 current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1467                                 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1468
1469                                 current.x = current.z;
1470                                 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1471                                 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1472                                 current.y = current.z;
1473                                 current.z = As<Short4>(UnpackLow(current.z, current.x));
1474                                 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1475                         }
1476                         break;
1477                 case FORMAT_X8B8G8R8:
1478                 case FORMAT_A8B8G8R8:
1479                         if(state.targetFormat[index] == FORMAT_X8B8G8R8 || rgbaWriteMask == 0x7)
1480                         {
1481                                 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1482                                 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1483                                 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1484
1485                                 current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
1486                                 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1487
1488                                 current.x = current.z;
1489                                 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1490                                 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1491                                 current.y = current.z;
1492                                 current.z = As<Short4>(UnpackLow(current.z, current.x));
1493                                 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1494                         }
1495                         else
1496                         {
1497                                 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1498                                 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1499                                 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1500                                 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1501
1502                                 current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
1503                                 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1504
1505                                 current.x = current.z;
1506                                 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1507                                 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1508                                 current.y = current.z;
1509                                 current.z = As<Short4>(UnpackLow(current.z, current.x));
1510                                 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1511                         }
1512                         break;
1513                 case FORMAT_A8:
1514                         current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1515                         current.w = As<Short4>(Pack(As<UShort4>(current.w), As<UShort4>(current.w)));
1516                         break;
1517                 case FORMAT_G16R16:
1518                         current.z = current.x;
1519                         current.x = As<Short4>(UnpackLow(current.x, current.y));
1520                         current.z = As<Short4>(UnpackHigh(current.z, current.y));
1521                         current.y = current.z;
1522                         break;
1523                 case FORMAT_A16B16G16R16:
1524                         transpose4x4(current.x, current.y, current.z, current.w);
1525                         break;
1526                 default:
1527                         ASSERT(false);
1528                 }
1529
1530                 Short4 c01 = current.z;
1531                 Short4 c23 = current.y;
1532
1533                 Int xMask;   // Combination of all masks
1534
1535                 if(state.depthTestActive)
1536                 {
1537                         xMask = zMask;
1538                 }
1539                 else
1540                 {
1541                         xMask = cMask;
1542                 }
1543
1544                 if(state.stencilActive)
1545                 {
1546                         xMask &= sMask;
1547                 }
1548
1549                 switch(state.targetFormat[index])
1550                 {
1551                 case FORMAT_R5G6B5:
1552                         {
1553                                 Pointer<Byte> buffer = cBuffer + 2 * x;
1554                                 Int value = *Pointer<Int>(buffer);
1555
1556                                 Int c01 = Extract(As<Int2>(current.x), 0);
1557
1558                                 if((bgraWriteMask & 0x00000007) != 0x00000007)
1559                                 {
1560                                         Int masked = value;
1561                                         c01 &= *Pointer<Int>(r.constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1562                                         masked &= *Pointer<Int>(r.constants + OFFSET(Constants,invMask565Q[bgraWriteMask & 0x7][0]));
1563                                         c01 |= masked;
1564                                 }
1565
1566                                 c01 &= *Pointer<Int>(r.constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
1567                                 value &= *Pointer<Int>(r.constants + OFFSET(Constants,invMaskW4Q[0][0]) + xMask * 8);
1568                                 c01 |= value;
1569                                 *Pointer<Int>(buffer) = c01;
1570
1571                                 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1572                                 value = *Pointer<Int>(buffer);
1573
1574                                 Int c23 = Extract(As<Int2>(current.x), 1);
1575
1576                                 if((bgraWriteMask & 0x00000007) != 0x00000007)
1577                                 {
1578                                         Int masked = value;
1579                                         c23 &= *Pointer<Int>(r.constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1580                                         masked &= *Pointer<Int>(r.constants + OFFSET(Constants,invMask565Q[bgraWriteMask & 0x7][0]));
1581                                         c23 |= masked;
1582                                 }
1583
1584                                 c23 &= *Pointer<Int>(r.constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
1585                                 value &= *Pointer<Int>(r.constants + OFFSET(Constants,invMaskW4Q[0][2]) + xMask * 8);
1586                                 c23 |= value;
1587                                 *Pointer<Int>(buffer) = c23;
1588                         }
1589                         break;
1590                 case FORMAT_A8G8R8B8Q:
1591                 case FORMAT_X8G8R8B8Q:   // FIXME: Don't touch alpha?
1592                         UNIMPLEMENTED();
1593                 //      value = *Pointer<Short4>(cBuffer + 8 * x + 0);
1594
1595                 //      if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1596                 //         ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1597                 //          (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1598                 //      {
1599                 //              Short4 masked = value;
1600                 //              c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1601                 //              masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1602                 //              c01 |= masked;
1603                 //      }
1604
1605                 //      c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1606                 //      value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1607                 //      c01 |= value;
1608                 //      *Pointer<Short4>(cBuffer + 8 * x + 0) = c01;
1609
1610                 //      value = *Pointer<Short4>(cBuffer + 8 * x + 8);
1611
1612                 //      if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1613                 //         ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1614                 //          (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1615                 //      {
1616                 //              Short4 masked = value;
1617                 //              c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1618                 //              masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1619                 //              c23 |= masked;
1620                 //      }
1621
1622                 //      c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1623                 //      value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1624                 //      c23 |= value;
1625                 //      *Pointer<Short4>(cBuffer + 8 * x + 8) = c23;
1626                         break;
1627                 case FORMAT_A8R8G8B8:
1628                 case FORMAT_X8R8G8B8:   // FIXME: Don't touch alpha?
1629                         {
1630                                 Pointer<Byte> buffer = cBuffer + x * 4;
1631                                 Short4 value = *Pointer<Short4>(buffer);
1632
1633                                 if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1634                                    ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1635                                         (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1636                                 {
1637                                         Short4 masked = value;
1638                                         c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1639                                         masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1640                                         c01 |= masked;
1641                                 }
1642
1643                                 c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1644                                 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1645                                 c01 |= value;
1646                                 *Pointer<Short4>(buffer) = c01;
1647
1648                                 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1649                                 value = *Pointer<Short4>(buffer);
1650
1651                                 if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1652                                    ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1653                                         (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1654                                 {
1655                                         Short4 masked = value;
1656                                         c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1657                                         masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1658                                         c23 |= masked;
1659                                 }
1660
1661                                 c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1662                                 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1663                                 c23 |= value;
1664                                 *Pointer<Short4>(buffer) = c23;
1665                         }
1666                         break;
1667                 case FORMAT_A8B8G8R8:
1668                 case FORMAT_X8B8G8R8:   // FIXME: Don't touch alpha?
1669                         {
1670                                 Pointer<Byte> buffer = cBuffer + x * 4;
1671                                 Short4 value = *Pointer<Short4>(buffer);
1672
1673                                 if((state.targetFormat[index] == FORMAT_A8B8G8R8 && rgbaWriteMask != 0x0000000F) ||
1674                                    ((state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x00000007) &&
1675                                         (state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x0000000F)))   // FIXME: Need for masking when XBGR && Fh?
1676                                 {
1677                                         Short4 masked = value;
1678                                         c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1679                                         masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1680                                         c01 |= masked;
1681                                 }
1682
1683                                 c01 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1684                                 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1685                                 c01 |= value;
1686                                 *Pointer<Short4>(buffer) = c01;
1687
1688                                 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1689                                 value = *Pointer<Short4>(buffer);
1690
1691                                 if((state.targetFormat[index] == FORMAT_A8B8G8R8 && rgbaWriteMask != 0x0000000F) ||
1692                                    ((state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x00000007) &&
1693                                         (state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x0000000F)))   // FIXME: Need for masking when XBGR && Fh?
1694                                 {
1695                                         Short4 masked = value;
1696                                         c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1697                                         masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1698                                         c23 |= masked;
1699                                 }
1700
1701                                 c23 &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1702                                 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1703                                 c23 |= value;
1704                                 *Pointer<Short4>(buffer) = c23;
1705                         }
1706                         break;
1707                 case FORMAT_A8:
1708                         if(rgbaWriteMask & 0x00000008)
1709                         {
1710                                 Pointer<Byte> buffer = cBuffer + 1 * x;
1711                                 Short4 value;
1712                                 Insert(value, *Pointer<Short>(buffer), 0);
1713                                 Int pitch = *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1714                                 Insert(value, *Pointer<Short>(buffer + pitch), 1);
1715                                 value = UnpackLow(As<Byte8>(value), As<Byte8>(value));
1716
1717                                 current.w &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskB4Q) + 8 * xMask);
1718                                 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskB4Q) + 8 * xMask);
1719                                 current.w |= value;
1720
1721                                 *Pointer<Short>(buffer) = Extract(current.w, 0);
1722                                 *Pointer<Short>(buffer + pitch) = Extract(current.w, 1);
1723                         }
1724                         break;
1725                 case FORMAT_G16R16:
1726                         {
1727                                 Pointer<Byte> buffer = cBuffer + 4 * x;
1728
1729                                 Short4 value = *Pointer<Short4>(buffer);
1730
1731                                 if((rgbaWriteMask & 0x00000003) != 0x00000003)
1732                                 {
1733                                         Short4 masked = value;
1734                                         current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1735                                         masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0]));
1736                                         current.x |= masked;
1737                                 }
1738
1739                                 current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1740                                 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1741                                 current.x |= value;
1742                                 *Pointer<Short4>(buffer) = current.x;
1743
1744                                 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1745
1746                                 value = *Pointer<Short4>(buffer);
1747
1748                                 if((rgbaWriteMask & 0x00000003) != 0x00000003)
1749                                 {
1750                                         Short4 masked = value;
1751                                         current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1752                                         masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0]));
1753                                         current.y |= masked;
1754                                 }
1755
1756                                 current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1757                                 value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1758                                 current.y |= value;
1759                                 *Pointer<Short4>(buffer) = current.y;
1760                         }
1761                         break;
1762                 case FORMAT_A16B16G16R16:
1763                         {
1764                                 Pointer<Byte> buffer = cBuffer + 8 * x;
1765
1766                                 {
1767                                         Short4 value = *Pointer<Short4>(buffer);
1768
1769                                         if(rgbaWriteMask != 0x0000000F)
1770                                         {
1771                                                 Short4 masked = value;
1772                                                 current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1773                                                 masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1774                                                 current.x |= masked;
1775                                         }
1776
1777                                         current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
1778                                         value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
1779                                         current.x |= value;
1780                                         *Pointer<Short4>(buffer) = current.x;
1781                                 }
1782
1783                                 {
1784                                         Short4 value = *Pointer<Short4>(buffer + 8);
1785
1786                                         if(rgbaWriteMask != 0x0000000F)
1787                                         {
1788                                                 Short4 masked = value;
1789                                                 current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1790                                                 masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1791                                                 current.y |= masked;
1792                                         }
1793
1794                                         current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
1795                                         value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
1796                                         current.y |= value;
1797                                         *Pointer<Short4>(buffer + 8) = current.y;
1798                                 }
1799
1800                                 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1801
1802                                 {
1803                                         Short4 value = *Pointer<Short4>(buffer);
1804
1805                                         if(rgbaWriteMask != 0x0000000F)
1806                                         {
1807                                                 Short4 masked = value;
1808                                                 current.z &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1809                                                 masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1810                                                 current.z |= masked;
1811                                         }
1812
1813                                         current.z &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
1814                                         value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
1815                                         current.z |= value;
1816                                         *Pointer<Short4>(buffer) = current.z;
1817                                 }
1818
1819                                 {
1820                                         Short4 value = *Pointer<Short4>(buffer + 8);
1821
1822                                         if(rgbaWriteMask != 0x0000000F)
1823                                         {
1824                                                 Short4 masked = value;
1825                                                 current.w &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1826                                                 masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1827                                                 current.w |= masked;
1828                                         }
1829
1830                                         current.w &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
1831                                         value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
1832                                         current.w |= value;
1833                                         *Pointer<Short4>(buffer + 8) = current.w;
1834                                 }
1835                         }
1836                         break;
1837                 default:
1838                         ASSERT(false);
1839                 }
1840         }
1841
1842         void PixelRoutine::blendFactor(Registers &r, const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorActive) 
1843         {
1844                 switch(blendFactorActive)
1845                 {
1846                 case BLEND_ZERO:
1847                         // Optimized
1848                         break;
1849                 case BLEND_ONE:
1850                         // Optimized
1851                         break;
1852                 case BLEND_SOURCE:
1853                         blendFactor.x = oC.x;
1854                         blendFactor.y = oC.y;
1855                         blendFactor.z = oC.z;
1856                         break;
1857                 case BLEND_INVSOURCE:
1858                         blendFactor.x = Float4(1.0f) - oC.x;
1859                         blendFactor.y = Float4(1.0f) - oC.y;
1860                         blendFactor.z = Float4(1.0f) - oC.z;
1861                         break;
1862                 case BLEND_DEST:
1863                         blendFactor.x = pixel.x;
1864                         blendFactor.y = pixel.y;
1865                         blendFactor.z = pixel.z;
1866                         break;
1867                 case BLEND_INVDEST:
1868                         blendFactor.x = Float4(1.0f) - pixel.x;
1869                         blendFactor.y = Float4(1.0f) - pixel.y;
1870                         blendFactor.z = Float4(1.0f) - pixel.z;
1871                         break;
1872                 case BLEND_SOURCEALPHA:
1873                         blendFactor.x = oC.w;
1874                         blendFactor.y = oC.w;
1875                         blendFactor.z = oC.w;
1876                         break;
1877                 case BLEND_INVSOURCEALPHA:
1878                         blendFactor.x = Float4(1.0f) - oC.w;
1879                         blendFactor.y = Float4(1.0f) - oC.w;
1880                         blendFactor.z = Float4(1.0f) - oC.w;
1881                         break;
1882                 case BLEND_DESTALPHA:
1883                         blendFactor.x = pixel.w;
1884                         blendFactor.y = pixel.w;
1885                         blendFactor.z = pixel.w;
1886                         break;
1887                 case BLEND_INVDESTALPHA:
1888                         blendFactor.x = Float4(1.0f) - pixel.w;
1889                         blendFactor.y = Float4(1.0f) - pixel.w;
1890                         blendFactor.z = Float4(1.0f) - pixel.w;
1891                         break;
1892                 case BLEND_SRCALPHASAT:
1893                         blendFactor.x = Float4(1.0f) - pixel.w;
1894                         blendFactor.x = Min(blendFactor.x, oC.w);
1895                         blendFactor.y = blendFactor.x;
1896                         blendFactor.z = blendFactor.x;
1897                         break;
1898                 case BLEND_CONSTANT:
1899                         blendFactor.x = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[0]));
1900                         blendFactor.y = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[1]));
1901                         blendFactor.z = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[2]));
1902                         break;
1903                 case BLEND_INVCONSTANT:
1904                         blendFactor.x = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
1905                         blendFactor.y = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
1906                         blendFactor.z = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
1907                         break;
1908                 default:
1909                         ASSERT(false);
1910                 }
1911         }
1912
1913         void PixelRoutine::blendFactorAlpha(Registers &r, const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorAlphaActive) 
1914         {
1915                 switch(blendFactorAlphaActive)
1916                 {
1917                 case BLEND_ZERO:
1918                         // Optimized
1919                         break;
1920                 case BLEND_ONE:
1921                         // Optimized
1922                         break;
1923                 case BLEND_SOURCE:
1924                         blendFactor.w = oC.w;
1925                         break;
1926                 case BLEND_INVSOURCE:
1927                         blendFactor.w = Float4(1.0f) - oC.w;
1928                         break;
1929                 case BLEND_DEST:
1930                         blendFactor.w = pixel.w;
1931                         break;
1932                 case BLEND_INVDEST:
1933                         blendFactor.w = Float4(1.0f) - pixel.w;
1934                         break;
1935                 case BLEND_SOURCEALPHA:
1936                         blendFactor.w = oC.w;
1937                         break;
1938                 case BLEND_INVSOURCEALPHA:
1939                         blendFactor.w = Float4(1.0f) - oC.w;
1940                         break;
1941                 case BLEND_DESTALPHA:
1942                         blendFactor.w = pixel.w;
1943                         break;
1944                 case BLEND_INVDESTALPHA:
1945                         blendFactor.w = Float4(1.0f) - pixel.w;
1946                         break;
1947                 case BLEND_SRCALPHASAT:
1948                         blendFactor.w = Float4(1.0f);
1949                         break;
1950                 case BLEND_CONSTANT:
1951                         blendFactor.w = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[3]));
1952                         break;
1953                 case BLEND_INVCONSTANT:
1954                         blendFactor.w = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
1955                         break;
1956                 default:
1957                         ASSERT(false);
1958                 }
1959         }
1960
1961         void PixelRoutine::alphaBlend(Registers &r, int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x)
1962         {
1963                 if(!state.alphaBlendActive)
1964                 {
1965                         return;
1966                 }
1967
1968                 Pointer<Byte> buffer;
1969                 Vector4f pixel;
1970
1971                 Vector4s color;
1972                 Short4 c01;
1973                 Short4 c23;
1974
1975                 switch(state.targetFormat[index])
1976                 {
1977                 case FORMAT_R32F:
1978                         buffer = cBuffer;
1979                         // FIXME: movlps
1980                         pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0);
1981                         pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4);
1982                         buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1983                         // FIXME: movhps
1984                         pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0);
1985                         pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4);
1986                         pixel.y = Float4(1.0f);
1987                         pixel.z = Float4(1.0f);
1988                         pixel.w = Float4(1.0f);
1989                         break;
1990                 case FORMAT_G32R32F:
1991                         buffer = cBuffer;
1992                         pixel.x = *Pointer<Float4>(buffer + 8 * x, 16);
1993                         buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
1994                         pixel.y = *Pointer<Float4>(buffer + 8 * x, 16);
1995                         pixel.z = pixel.x;
1996                         pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x88);
1997                         pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0xDD);
1998                         pixel.y = pixel.z;
1999                         pixel.z = Float4(1.0f);
2000                         pixel.w = Float4(1.0f);
2001                         break;
2002                 case FORMAT_A32B32G32R32F:
2003                         buffer = cBuffer;
2004                         pixel.x = *Pointer<Float4>(buffer + 16 * x, 16);
2005                         pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2006                         buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
2007                         pixel.z = *Pointer<Float4>(buffer + 16 * x, 16);
2008                         pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2009                         transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
2010                         break;
2011                 default:
2012                         ASSERT(false);
2013                 }
2014
2015                 if(postBlendSRGB && state.writeSRGB)
2016                 {
2017                         sRGBtoLinear(pixel.x);
2018                         sRGBtoLinear(pixel.y);
2019                         sRGBtoLinear(pixel.z);
2020                 }
2021
2022                 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
2023                 Vector4f sourceFactor;
2024                 Vector4f destFactor;
2025
2026                 blendFactor(r, sourceFactor, oC, pixel, state.sourceBlendFactor);
2027                 blendFactor(r, destFactor, oC, pixel, state.destBlendFactor);
2028
2029                 if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
2030                 {
2031                         oC.x *= sourceFactor.x;
2032                         oC.y *= sourceFactor.y;
2033                         oC.z *= sourceFactor.z;
2034                 }
2035         
2036                 if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
2037                 {
2038                         pixel.x *= destFactor.x;
2039                         pixel.y *= destFactor.y;
2040                         pixel.z *= destFactor.z;
2041                 }
2042
2043                 switch(state.blendOperation)
2044                 {
2045                 case BLENDOP_ADD:
2046                         oC.x += pixel.x;
2047                         oC.y += pixel.y;
2048                         oC.z += pixel.z;
2049                         break;
2050                 case BLENDOP_SUB:
2051                         oC.x -= pixel.x;
2052                         oC.y -= pixel.y;
2053                         oC.z -= pixel.z;
2054                         break;
2055                 case BLENDOP_INVSUB:
2056                         oC.x = pixel.x - oC.x;
2057                         oC.y = pixel.y - oC.y;
2058                         oC.z = pixel.z - oC.z;
2059                         break;
2060                 case BLENDOP_MIN:
2061                         oC.x = Min(oC.x, pixel.x);
2062                         oC.y = Min(oC.y, pixel.y);
2063                         oC.z = Min(oC.z, pixel.z);
2064                         break;
2065                 case BLENDOP_MAX:
2066                         oC.x = Max(oC.x, pixel.x);
2067                         oC.y = Max(oC.y, pixel.y);
2068                         oC.z = Max(oC.z, pixel.z);
2069                         break;
2070                 case BLENDOP_SOURCE:
2071                         // No operation
2072                         break;
2073                 case BLENDOP_DEST:
2074                         oC.x = pixel.x;
2075                         oC.y = pixel.y;
2076                         oC.z = pixel.z;
2077                         break;
2078                 case BLENDOP_NULL:
2079                         oC.x = Float4(0.0f);
2080                         oC.y = Float4(0.0f);
2081                         oC.z = Float4(0.0f);
2082                         break;
2083                 default:
2084                         ASSERT(false);
2085                 }
2086
2087                 blendFactorAlpha(r, sourceFactor, oC, pixel, state.sourceBlendFactorAlpha);
2088                 blendFactorAlpha(r, destFactor, oC, pixel, state.destBlendFactorAlpha);
2089
2090                 if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
2091                 {
2092                         oC.w *= sourceFactor.w;
2093                 }
2094         
2095                 if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
2096                 {
2097                         pixel.w *= destFactor.w;
2098                 }
2099
2100                 switch(state.blendOperationAlpha)
2101                 {
2102                 case BLENDOP_ADD:
2103                         oC.w += pixel.w;
2104                         break;
2105                 case BLENDOP_SUB:
2106                         oC.w -= pixel.w;
2107                         break;
2108                 case BLENDOP_INVSUB:
2109                         pixel.w -= oC.w;
2110                         oC.w = pixel.w;
2111                         break;
2112                 case BLENDOP_MIN:       
2113                         oC.w = Min(oC.w, pixel.w);
2114                         break;
2115                 case BLENDOP_MAX:       
2116                         oC.w = Max(oC.w, pixel.w);
2117                         break;
2118                 case BLENDOP_SOURCE:
2119                         // No operation
2120                         break;
2121                 case BLENDOP_DEST:
2122                         oC.w = pixel.w;
2123                         break;
2124                 case BLENDOP_NULL:
2125                         oC.w = Float4(0.0f);
2126                         break;
2127                 default:
2128                         ASSERT(false);
2129                 }
2130         }
2131
2132         void PixelRoutine::writeColor(Registers &r, int index, Pointer<Byte> &cBuffer, Int &x, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask)
2133         {
2134                 switch(state.targetFormat[index])
2135                 {
2136                 case FORMAT_R32F:
2137                         break;
2138                 case FORMAT_G32R32F:
2139                         oC.z = oC.x;
2140                         oC.x = UnpackLow(oC.x, oC.y);
2141                         oC.z = UnpackHigh(oC.z, oC.y);
2142                         oC.y = oC.z;
2143                         break;
2144                 case FORMAT_A32B32G32R32F:
2145                         transpose4x4(oC.x, oC.y, oC.z, oC.w);
2146                         break;
2147                 default:
2148                         ASSERT(false);
2149                 }
2150
2151                 int rgbaWriteMask = state.colorWriteActive(index);
2152
2153                 Int xMask;   // Combination of all masks
2154
2155                 if(state.depthTestActive)
2156                 {
2157                         xMask = zMask;
2158                 }
2159                 else
2160                 {
2161                         xMask = cMask;
2162                 }
2163
2164                 if(state.stencilActive)
2165                 {
2166                         xMask &= sMask;
2167                 }
2168
2169                 Pointer<Byte> buffer;
2170                 Float4 value;
2171
2172                 switch(state.targetFormat[index])
2173                 {
2174                 case FORMAT_R32F:
2175                         if(rgbaWriteMask & 0x00000001)
2176                         {
2177                                 buffer = cBuffer + 4 * x;
2178
2179                                 // FIXME: movlps
2180                                 value.x = *Pointer<Float>(buffer + 0);
2181                                 value.y = *Pointer<Float>(buffer + 4);
2182
2183                                 buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
2184
2185                                 // FIXME: movhps
2186                                 value.z = *Pointer<Float>(buffer + 0);
2187                                 value.w = *Pointer<Float>(buffer + 4);
2188
2189                                 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X) + xMask * 16, 16));
2190                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16));
2191                                 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2192
2193                                 // FIXME: movhps
2194                                 *Pointer<Float>(buffer + 0) = oC.x.z;
2195                                 *Pointer<Float>(buffer + 4) = oC.x.w;
2196
2197                                 buffer -= *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
2198
2199                                 // FIXME: movlps
2200                                 *Pointer<Float>(buffer + 0) = oC.x.x;
2201                                 *Pointer<Float>(buffer + 4) = oC.x.y;
2202                         }
2203                         break;
2204                 case FORMAT_G32R32F:
2205                         buffer = cBuffer + 8 * x;
2206
2207                         value = *Pointer<Float4>(buffer);
2208
2209                         if((rgbaWriteMask & 0x00000003) != 0x00000003)
2210                         {
2211                                 Float4 masked = value;
2212                                 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2213                                 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD01X[rgbaWriteMask & 0x3][0])));
2214                                 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2215                         }
2216
2217                         oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16));
2218                         value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16));
2219                         oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2220                         *Pointer<Float4>(buffer) = oC.x;
2221
2222                         buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
2223
2224                         value = *Pointer<Float4>(buffer);
2225
2226                         if((rgbaWriteMask & 0x00000003) != 0x00000003)
2227                         {
2228                                 Float4 masked;
2229
2230                                 masked = value;
2231                                 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2232                                 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD01X[rgbaWriteMask & 0x3][0])));
2233                                 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2234                         }
2235
2236                         oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16));
2237                         value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16));
2238                         oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2239                         *Pointer<Float4>(buffer) = oC.y;
2240                         break;
2241                 case FORMAT_A32B32G32R32F:
2242                         buffer = cBuffer + 16 * x;
2243
2244                         {
2245                                 value = *Pointer<Float4>(buffer, 16);
2246
2247                                 if(rgbaWriteMask != 0x0000000F)
2248                                 {
2249                                         Float4 masked = value;
2250                                         oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2251                                         masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2252                                         oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2253                                 }
2254                                 
2255                                 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskX0X) + xMask * 16, 16));
2256                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16));
2257                                 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2258                                 *Pointer<Float4>(buffer, 16) = oC.x;
2259                         }
2260
2261                         {
2262                                 value = *Pointer<Float4>(buffer + 16, 16);
2263
2264                                 if(rgbaWriteMask != 0x0000000F)
2265                                 {       
2266                                         Float4 masked = value;
2267                                         oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2268                                         masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2269                                         oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2270                                 }
2271
2272                                 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskX1X) + xMask * 16, 16));
2273                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16));
2274                                 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2275                                 *Pointer<Float4>(buffer + 16, 16) = oC.y;
2276                         }
2277
2278                         buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
2279
2280                         {
2281                                 value = *Pointer<Float4>(buffer, 16);
2282
2283                                 if(rgbaWriteMask != 0x0000000F)
2284                                 {
2285                                         Float4 masked = value;
2286                                         oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2287                                         masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2288                                         oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked));
2289                                 }
2290
2291                                 oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskX2X) + xMask * 16, 16));
2292                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16));
2293                                 oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value));
2294                                 *Pointer<Float4>(buffer, 16) = oC.z;
2295                         }
2296
2297                         {
2298                                 value = *Pointer<Float4>(buffer + 16, 16);
2299
2300                                 if(rgbaWriteMask != 0x0000000F)
2301                                 {
2302                                         Float4 masked = value;
2303                                         oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2304                                         masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2305                                         oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked));
2306                                 }
2307
2308                                 oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskX3X) + xMask * 16, 16));
2309                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16));
2310                                 oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value));
2311                                 *Pointer<Float4>(buffer + 16, 16) = oC.w;
2312                         }
2313                         break;
2314                 default:
2315                         ASSERT(false);
2316                 }
2317         }
2318
2319         UShort4 PixelRoutine::convertFixed16(Float4 &cf, bool saturate)
2320         {
2321                 return UShort4(cf * Float4(0xFFFF), saturate);
2322         }
2323
2324         void PixelRoutine::sRGBtoLinear16_12_16(Registers &r, Vector4s &c)
2325         {
2326                 c.x = As<UShort4>(c.x) >> 4;
2327                 c.y = As<UShort4>(c.y) >> 4;
2328                 c.z = As<UShort4>(c.z) >> 4;
2329
2330                 sRGBtoLinear12_16(r, c);
2331         }
2332
2333         void PixelRoutine::sRGBtoLinear12_16(Registers &r, Vector4s &c)
2334         {
2335                 Pointer<Byte> LUT = r.constants + OFFSET(Constants,sRGBtoLinear12_16);
2336
2337                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2338                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2339                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2340                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2341
2342                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2343                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2344                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2345                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2346
2347                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2348                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2349                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2350                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2351         }
2352
2353         void PixelRoutine::linearToSRGB16_12_16(Registers &r, Vector4s &c)
2354         {
2355                 c.x = As<UShort4>(c.x) >> 4;
2356                 c.y = As<UShort4>(c.y) >> 4;
2357                 c.z = As<UShort4>(c.z) >> 4;
2358
2359                 linearToSRGB12_16(r, c);
2360         }
2361
2362         void PixelRoutine::linearToSRGB12_16(Registers &r, Vector4s &c)
2363         {
2364                 Pointer<Byte> LUT = r.constants + OFFSET(Constants,linearToSRGB12_16);
2365
2366                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2367                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2368                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2369                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2370
2371                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2372                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2373                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2374                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2375
2376                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2377                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2378                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2379                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2380         }
2381
2382         Float4 PixelRoutine::sRGBtoLinear(const Float4 &x)   // Approximates x^2.2
2383         {
2384                 Float4 linear = x * x;
2385                 linear = linear * Float4(0.73f) + linear * x * Float4(0.27f);
2386
2387                 return Min(Max(linear, Float4(0.0f)), Float4(1.0f));
2388         }
2389
2390         bool PixelRoutine::colorUsed()
2391         {
2392                 return state.colorWriteMask || state.alphaTestActive() || state.shaderContainsKill;
2393         }
2394 }