OSDN Git Service

Added FORMAT_X32B32G32R32F as a renderable format
[android-x86/external-swiftshader.git] / src / Shader / PixelRoutine.cpp
1 // SwiftShader Software Renderer
2 //
3 // Copyright(c) 2005-2013 TransGaming Inc.
4 //
5 // All rights reserved. No part of this software may be copied, distributed, transmitted,
6 // transcribed, stored in a retrieval system, translated into any human or computer
7 // language by any means, or disclosed to third parties without the explicit written
8 // agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express
9 // or implied, including but not limited to any patent rights, are granted to you.
10 //
11
12 #include "PixelRoutine.hpp"
13
14 #include "Renderer.hpp"
15 #include "QuadRasterizer.hpp"
16 #include "Surface.hpp"
17 #include "Primitive.hpp"
18 #include "CPUID.hpp"
19 #include "SamplerCore.hpp"
20 #include "Constants.hpp"
21 #include "Debug.hpp"
22
23 namespace sw
24 {
// Renderer-wide switches; only their declarations are visible here.
//  - complementaryDepthBuffer: when set, depthTest()/writeDepth()/pixelFog() use
//    1 - depth and the ordered depth comparisons are mirrored.
//  - forceClearRegisters: when set, the constructor zeroes the input registers
//    regardless of the shader version.
//  - postBlendSRGB / exactColorRounding: not referenced in the portion of the
//    file reviewed here.
25         extern bool complementaryDepthBuffer;
26         extern bool postBlendSRGB;
27         extern bool exactColorRounding;
28         extern bool forceClearRegisters;
29
30         PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader) : QuadRasterizer(state, shader), v(shader && shader->dynamicallyIndexedInput)
31         {
32                 if(!shader || shader->getVersion() < 0x0200 || forceClearRegisters)
33                 {
34                         for(int i = 0; i < 10; i++)
35                         {
36                                 v[i].x = Float4(0.0f);
37                                 v[i].y = Float4(0.0f);
38                                 v[i].z = Float4(0.0f);
39                                 v[i].w = Float4(0.0f);
40                         }
41                 }
42         }
43
44         PixelRoutine::~PixelRoutine()
45         {
46                 for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
47                 {
48                         delete sampler[i];
49                 }
50         }
51
// Runs the pixel pipeline for the quad at (x, y): stencil test, depth
// interpolation, early or late depth test, attribute interpolation, shader
// execution, alpha test, and finally the depth, colour and stencil writes.
//
// cBuffer - colour buffer pointers, one per render target.
// zBuffer - depth buffer pointer.
// sBuffer - stencil buffer pointer.
// cMask   - coverage mask per sample; narrowed by the shader / alpha test.
// x, y    - quad position.
//
// Plain if/for statements are evaluated while this function runs; If(...) with
// a capital I takes a Bool built from Reactor-style values and is presumably a
// branch inside the generated routine - TODO confirm against the Reactor docs.
52         void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
53         {
54                 #if PERF_PROFILE
55                         Long pipeTime = Ticks();
56                 #endif
57
                   // NOTE(review): the samplers are allocated here and released only in
                   // the destructor, so a second call to quad() on the same object would
                   // overwrite (and leak) the previous allocations.
58                 for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
59                 {
60                         sampler[i] = new SamplerCore(constants, state.sampler[i]);
61                 }
62
                   // The depth test can run before shading only when neither a depth
                   // override nor the alpha test is active.
63                 const bool earlyDepthTest = !state.depthOverride && !state.alphaTestActive();
64
65                 Int zMask[4];   // Depth mask
66                 Int sMask[4];   // Stencil mask
67
                   // Both masks start out equal to the coverage mask of each sample.
68                 for(unsigned int q = 0; q < state.multiSample; q++)
69                 {
70                         zMask[q] = cMask[q];
71                         sMask[q] = cMask[q];
72                 }
73
74                 for(unsigned int q = 0; q < state.multiSample; q++)
75                 {
76                         stencilTest(sBuffer, q, x, sMask[q], cMask[q]);
77                 }
78
79                 Float4 f;
80                 Float4 rhwCentroid;
81
82                 Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16);
83
84                 if(interpolateZ())
85                 {
86                         for(unsigned int q = 0; q < state.multiSample; q++)
87                         {
                                   // Note: this local shadows the Int &x parameter inside the loop.
88                                 Float4 x = xxxx;
89
                                   // With multisampling, shift by the per-sample X entry of the constants table.
90                                 if(state.multiSample > 1)
91                                 {
92                                         x -= *Pointer<Float4>(constants + OFFSET(Constants,X) + q * sizeof(float4));
93                                 }
94
95                                 z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive,z), false, false);
96                         }
97                 }
98
99                 Bool depthPass = false;
100
101                 if(earlyDepthTest)
102                 {
                            // The quad passes if any sample passes.
103                         for(unsigned int q = 0; q < state.multiSample; q++)
104                         {
105                                 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
106                         }
107                 }
108
                    // Skip interpolation and shading when the early depth test rejected every sample.
109                 If(depthPass || Bool(!earlyDepthTest))
110                 {
111                         #if PERF_PROFILE
112                                 Long interpTime = Ticks();
113                         #endif
114
115                         Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16);
116
117                         // Centroid locations
118                         Float4 XXXX = Float4(0.0f);
119                         Float4 YYYY = Float4(0.0f);
120
121                         if(state.centroid)
122                         {
                                    // Starts from a tiny non-zero weight; presumably keeps the
                                    // reciprocal below finite when no sample is covered.
123                                 Float4 WWWW(1.0e-9f);
124
                                    // Accumulate sample offsets and weights, looked up by coverage mask.
125                                 for(unsigned int q = 0; q < state.multiSample; q++)
126                                 {
127                                         XXXX += *Pointer<Float4>(constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]);
128                                         YYYY += *Pointer<Float4>(constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]);
129                                         WWWW += *Pointer<Float4>(constants + OFFSET(Constants,weight) + 16 * cMask[q]);
130                                 }
131
132                                 WWWW = Rcp_pp(WWWW);
133                                 XXXX *= WWWW;
134                                 YYYY *= WWWW;
135
136                                 XXXX += xxxx;
137                                 YYYY += yyyy;
138                         }
139
140                         if(interpolateW())
141                         {
142                                 w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive,w), false, false);
143                                 rhw = reciprocal(w, false, false, true);
144
145                                 if(state.centroid)
146                                 {
147                                         rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive,w), false, false));
148                                 }
149                         }
150
                            // Interpolate every enabled component of the ten input registers,
                            // at the quad position or at the centroid position.
151                         for(int interpolant = 0; interpolant < 10; interpolant++)
152                         {
153                                 for(int component = 0; component < 4; component++)
154                                 {
155                                         if(state.interpolant[interpolant].component & (1 << component))
156                                         {
157                                                 if(!state.interpolant[interpolant].centroid)
158                                                 {
159                                                         v[interpolant][component] = interpolate(xxxx, Dv[interpolant][component], rhw, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
160                                                 }
161                                                 else
162                                                 {
163                                                         v[interpolant][component] = interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
164                                                 }
165                                         }
166                                 }
167
168                                 Float4 rcp;
169
                                    // Optional projection: divide the leading components by y, z or w.
170                                 switch(state.interpolant[interpolant].project)
171                                 {
172                                 case 0:
173                                         break;
174                                 case 1:
175                                         rcp = reciprocal(v[interpolant].y);
176                                         v[interpolant].x = v[interpolant].x * rcp;
177                                         break;
178                                 case 2:
179                                         rcp = reciprocal(v[interpolant].z);
180                                         v[interpolant].x = v[interpolant].x * rcp;
181                                         v[interpolant].y = v[interpolant].y * rcp;
182                                         break;
183                                 case 3:
184                                         rcp = reciprocal(v[interpolant].w);
185                                         v[interpolant].x = v[interpolant].x * rcp;
186                                         v[interpolant].y = v[interpolant].y * rcp;
187                                         v[interpolant].z = v[interpolant].z * rcp;
188                                         break;
189                                 }
190                         }
191
192                         if(state.fog.component)
193                         {
194                                 f = interpolate(xxxx, Df, rhw, primitive + OFFSET(Primitive,f), state.fog.flat & 0x01, state.perspective);
195                         }
196
197                         setBuiltins(x, y, z, w);
198
199                         #if PERF_PROFILE
200                                 cycles[PERF_INTERP] += Ticks() - interpTime;
201                         #endif
202
203                         Bool alphaPass = true;
204
205                         if(colorUsed())
206                         {
207                                 #if PERF_PROFILE
208                                         Long shaderTime = Ticks();
209                                 #endif
210
211                                 applyShader(cMask);
212
213                                 #if PERF_PROFILE
214                                         cycles[PERF_SHADER] += Ticks() - shaderTime;
215                                 #endif
216
217                                 alphaPass = alphaTest(cMask);
218
                                    // Pixels removed from cMask by a kill or the alpha test must
                                    // not write depth or stencil either.
219                                 if((shader && shader->containsKill()) || state.alphaTestActive())
220                                 {
221                                         for(unsigned int q = 0; q < state.multiSample; q++)
222                                         {
223                                                 zMask[q] &= cMask[q];
224                                                 sMask[q] &= cMask[q];
225                                         }
226                                 }
227                         }
228
229                         If(alphaPass)
230                         {
                                    // Late depth test, for the cases where it could not run before shading.
231                                 if(!earlyDepthTest)
232                                 {
233                                         for(unsigned int q = 0; q < state.multiSample; q++)
234                                         {
235                                                 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
236                                         }
237                                 }
238
239                                 #if PERF_PROFILE
240                                         Long ropTime = Ticks();
241                                 #endif
242
243                                 If(depthPass || Bool(earlyDepthTest))
244                                 {
245                                         for(unsigned int q = 0; q < state.multiSample; q++)
246                                         {
247                                                 if(state.multiSampleMask & (1 << q))
248                                                 {
249                                                         writeDepth(zBuffer, q, x, z[q], zMask[q]);
250
251                                                         if(state.occlusionEnabled)
252                                                         {
                                                                    // Adds the occlusionCount table entry selected by the combined depth/stencil mask.
253                                                                 occlusion += *Pointer<UInt>(constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q]));
254                                                         }
255                                                 }
256                                         }
257
258                                         if(colorUsed())
259                                         {
260                                                 #if PERF_PROFILE
261                                                         AddAtomic(Pointer<Long>(&profiler.ropOperations), 4);
262                                                 #endif
263
264                                                 rasterOperation(f, cBuffer, x, sMask, zMask, cMask);
265                                         }
266                                 }
267
268                                 #if PERF_PROFILE
269                                         cycles[PERF_ROP] += Ticks() - ropTime;
270                                 #endif
271                         }
272                 }
273
                    // The stencil write happens outside the depth/alpha branches, so it
                    // also runs for quads that failed those tests.
274                 for(unsigned int q = 0; q < state.multiSample; q++)
275                 {
276                         if(state.multiSampleMask & (1 << q))
277                         {
278                                 writeStencil(sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
279                         }
280                 }
281
282                 #if PERF_PROFILE
283                         cycles[PERF_PIPE] += Ticks() - pipeTime;
284                 #endif
285         }
286
287         Float4 PixelRoutine::interpolateCentroid(Float4 &x, Float4 &y, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective)
288         {
289                 Float4 interpolant = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,C), 16);
290
291                 if(!flat)
292                 {
293                         interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,A), 16) +
294                                        y * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,B), 16);
295
296                         if(perspective)
297                         {
298                                 interpolant *= rhw;
299                         }
300                 }
301
302                 return interpolant;
303         }
304
// Performs the stencil test for sample q of the quad at column x and stores
// the resulting mask in sMask. When stencilling is inactive the function
// returns immediately and leaves sMask untouched.
//
// sBuffer - stencil buffer pointer.
// q       - sample index; selects the stencil slice for q > 0.
// x       - quad column.
// sMask   - receives the pixels that pass, restricted to cMask.
// cMask   - coverage mask of the sample.
305         void PixelRoutine::stencilTest(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask)
306         {
307                 if(!state.stencilActive)
308                 {
309                         return;
310                 }
311
312                 // (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
313
                    // Two bytes of stencil per column step.
314                 Pointer<Byte> buffer = sBuffer + 2 * x;
315
316                 if(q > 0)
317                 {
318                         buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
319                 }
320
                    // A single 32-bit load, widened to a Byte8.
321                 Byte8 value = As<Byte8>(Long1(*Pointer<UInt>(buffer)));
                    // Unmasked copy kept for the counter-clockwise test.
322                 Byte8 valueCCW = value;
323
324                 if(!state.noStencilMask)
325                 {
326                         value &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].testMaskQ));
327                 }
328
329                 stencilTest(value, state.stencilCompareMode, false);
330
331                 if(state.twoSidedStencil)
332                 {
333                         if(!state.noStencilMaskCCW)
334                         {
335                                 valueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].testMaskQ));
336                         }
337
338                         stencilTest(valueCCW, state.stencilCompareModeCCW, true);
339
                            // Keep each result only under the primitive's matching winding
                            // mask, then merge the two.
340                         value &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
341                         valueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
342                         value |= valueCCW;
343                 }
344
345                 sMask = SignMask(value) & cMask;
346         }
347
// Replaces value (the masked stencil buffer bytes) with the per-byte result of
// comparing the stencil reference against it.
//
// value              - in: masked stencil bytes; out: comparison result.
// stencilCompareMode - comparison to apply.
// CCW                - selects stencil[1] (true) or stencil[0] (false) in DrawData.
//
// In the inline notes, a is the reference and b is the buffer value, matching
// "(StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)".
// The ordered comparisons add 0x80 to every byte and then compare as signed
// bytes against referenceMaskedSignedQ - presumably the reference is stored
// with the same bias so that unsigned ordering is preserved; TODO confirm
// where DrawData is filled in.
348         void PixelRoutine::stencilTest(Byte8 &value, StencilCompareMode stencilCompareMode, bool CCW)
349         {
350                 Byte8 equal;
351
352                 switch(stencilCompareMode)
353                 {
354                 case STENCIL_ALWAYS:
355                         value = Byte8(0xFFFFFFFFFFFFFFFF);
356                         break;
357                 case STENCIL_NEVER:
358                         value = Byte8(0x0000000000000000);
359                         break;
360                 case STENCIL_LESS:                      // a < b ~ b > a
361                         value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
362                         value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
363                         break;
364                 case STENCIL_EQUAL:
365                         value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
366                         break;
367                 case STENCIL_NOTEQUAL:          // a != b ~ !(a == b)
368                         value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
369                         value ^= Byte8(0xFFFFFFFFFFFFFFFF);
370                         break;
371                 case STENCIL_LESSEQUAL: // a <= b ~ (b > a) || (a == b)
372                         equal = value;
373                         equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
374                         value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
375                         value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
376                         value |= equal;
377                         break;
378                 case STENCIL_GREATER:           // a > b
                            // Here 'equal' is reused as scratch space for the biased reference.
379                         equal = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ));
380                         value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
381                         equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
382                         value = equal;
383                         break;
384                 case STENCIL_GREATEREQUAL:      // a >= b ~ !(a < b) ~ !(b > a)
385                         value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
386                         value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
387                         value ^= Byte8(0xFFFFFFFFFFFFFFFF);
388                         break;
389                 default:
390                         ASSERT(false);
391                 }
392         }
393
// Depth-tests sample q of the quad at column x and stores the surviving
// pixels in zMask.
//
// zBuffer - depth buffer pointer.
// q       - sample index; selects the depth slice for q > 0.
// x       - quad column.
// z       - interpolated depth; replaced by the shader's oDepth when the
//           shader overrides depth.
// sMask   - stencil mask, ANDed into zMask when stencilling is active.
// zMask   - receives the pixels that pass.
// cMask   - coverage mask of the sample.
//
// Returns true when depth testing is inactive (zMask is then left untouched),
// otherwise whether any pixel of the sample passed.
394         Bool PixelRoutine::depthTest(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask)
395         {
396                 if(!state.depthTestActive)
397                 {
398                         return true;
399                 }
400
401                 Float4 Z = z;
402
403                 if(shader && shader->depthOverride())
404                 {
405                         if(complementaryDepthBuffer)
406                         {
407                                 Z = Float4(1.0f) - oDepth;
408                         }
409                         else
410                         {
411                                 Z = oDepth;
412                         }
413                 }
414
415                 Pointer<Byte> buffer;
                    // pitch is only assigned, and only used, for the non-quad layout.
416                 Int pitch;
417
418                 if(!state.quadLayoutDepthBuffer)
419                 {
420                         buffer = zBuffer + 4 * x;
421                         pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
422                 }
423                 else
424                 {
425                         buffer = zBuffer + 8 * x;
426                 }
427
428                 if(q > 0)
429                 {
430                         buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
431                 }
432
433                 Float4 zValue;
434
                    // NOTE(review): this condition is false only for DEPTH_NEVER with depth
                    // writes enabled, so the buffer is also loaded for DEPTH_ALWAYS even
                    // though zValue is not used by that mode below.
435                 if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
436                 {
437                         if(!state.quadLayoutDepthBuffer)
438                         {
                                    // Two pixels from this row and two from the next; reading at
                                    // buffer + pitch - 8 presumably lines the next row's pair up
                                    // with the zw lanes - TODO confirm swizzle-assignment semantics.
439                                 // FIXME: Properly optimizes?
440                                 zValue.xy = *Pointer<Float4>(buffer);
441                                 zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
442                         }
443                         else
444                         {
445                                 zValue = *Pointer<Float4>(buffer, 16);
446                         }
447                 }
448
449                 Int4 zTest;
450
                    // Ordered comparisons are mirrored when the depth buffer is complementary.
451                 switch(state.depthCompareMode)
452                 {
453                 case DEPTH_ALWAYS:
454                         // Optimized
455                         break;
456                 case DEPTH_NEVER:
457                         // Optimized
458                         break;
459                 case DEPTH_EQUAL:
460                         zTest = CmpEQ(zValue, Z);
461                         break;
462                 case DEPTH_NOTEQUAL:
463                         zTest = CmpNEQ(zValue, Z);
464                         break;
465                 case DEPTH_LESS:
466                         if(complementaryDepthBuffer)
467                         {
468                                 zTest = CmpLT(zValue, Z);
469                         }
470                         else
471                         {
472                                 zTest = CmpNLE(zValue, Z);
473                         }
474                         break;
475                 case DEPTH_GREATEREQUAL:
476                         if(complementaryDepthBuffer)
477                         {
478                                 zTest = CmpNLT(zValue, Z);
479                         }
480                         else
481                         {
482                                 zTest = CmpLE(zValue, Z);
483                         }
484                         break;
485                 case DEPTH_LESSEQUAL:
486                         if(complementaryDepthBuffer)
487                         {
488                                 zTest = CmpLE(zValue, Z);
489                         }
490                         else
491                         {
492                                 zTest = CmpNLT(zValue, Z);
493                         }
494                         break;
495                 case DEPTH_GREATER:
496                         if(complementaryDepthBuffer)
497                         {
498                                 zTest = CmpNLE(zValue, Z);
499                         }
500                         else
501                         {
502                                 zTest = CmpLT(zValue, Z);
503                         }
504                         break;
505                 default:
506                         ASSERT(false);
507                 }
508
                    // ALWAYS and NEVER need no comparison result; every other mode takes
                    // the sign bits of zTest.
509                 switch(state.depthCompareMode)
510                 {
511                 case DEPTH_ALWAYS:
512                         zMask = cMask;
513                         break;
514                 case DEPTH_NEVER:
515                         zMask = 0x0;
516                         break;
517                 default:
518                         zMask = SignMask(zTest) & cMask;
519                         break;
520                 }
521
522                 if(state.stencilActive)
523                 {
524                         zMask &= sMask;
525                 }
526
527                 return zMask != 0;
528         }
529
// Computes the alpha-test mask for four alpha values.
//
// aMask - receives the result: 0xF for ALPHA_ALWAYS, 0x0 for ALPHA_NEVER,
//         otherwise the sign bits of the packed per-lane comparison of alpha
//         against DrawData::factor.alphaReference4.
// alpha - the four alpha values, as 16-bit lanes.
//
// In the inline notes, a is alpha and b is the reference.
530         void PixelRoutine::alphaTest(Int &aMask, Short4 &alpha)
531         {
532                 Short4 cmp;
533                 Short4 equal;
534
535                 switch(state.alphaCompareMode)
536                 {
537                 case ALPHA_ALWAYS:
538                         aMask = 0xF;
539                         break;
540                 case ALPHA_NEVER:
541                         aMask = 0x0;
542                         break;
543                 case ALPHA_EQUAL:
544                         cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
                            // Pack with a zero vector so only the comparison lanes contribute sign bits.
545                         aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
546                         break;
547                 case ALPHA_NOTEQUAL:            // a != b ~ !(a == b)
548                         cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4((short)0xFFFF, (short)0xFFFF, (short)0xFFFF, (short)0xFFFF);   // FIXME
549                         aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
550                         break;
551                 case ALPHA_LESS:                        // a < b ~ b > a
552                         cmp = CmpGT(*Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)), alpha);
553                         aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
554                         break;
555                 case ALPHA_GREATEREQUAL:        // a >= b ~ (a > b) || (a == b) ~ !(b > a)   // TODO: Approximate
556                         equal = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
557                         cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
558                         cmp |= equal;
559                         aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
560                         break;
561                 case ALPHA_LESSEQUAL:           // a <= b ~ !(a > b)
562                         cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4((short)0xFFFF, (short)0xFFFF, (short)0xFFFF, (short)0xFFFF);   // FIXME
563                         aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
564                         break;
565                 case ALPHA_GREATER:                     // a > b
566                         cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
567                         aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
568                         break;
569                 default:
570                         ASSERT(false);
571                 }
572         }
573
574         void PixelRoutine::alphaToCoverage(Int cMask[4], Float4 &alpha)
575         {
576                 Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c0)));
577                 Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c1)));
578                 Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c2)));
579                 Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c3)));
580
581                 Int aMask0 = SignMask(coverage0);
582                 Int aMask1 = SignMask(coverage1);
583                 Int aMask2 = SignMask(coverage2);
584                 Int aMask3 = SignMask(coverage3);
585
586                 cMask[0] &= aMask0;
587                 cMask[1] &= aMask1;
588                 cMask[2] &= aMask2;
589                 cMask[3] &= aMask3;
590         }
591
592         void PixelRoutine::fogBlend(Vector4f &c0, Float4 &fog)
593         {
594                 if(!state.fogActive)
595                 {
596                         return;
597                 }
598
599                 if(state.pixelFogMode != FOG_NONE)
600                 {
601                         pixelFog(fog);
602
603                         fog = Min(fog, Float4(1.0f));
604                         fog = Max(fog, Float4(0.0f));
605                 }
606
607                 c0.x -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
608                 c0.y -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
609                 c0.z -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
610
611                 c0.x *= fog;
612                 c0.y *= fog;
613                 c0.z *= fog;
614
615                 c0.x += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
616                 c0.y += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
617                 c0.z += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
618         }
619
620         void PixelRoutine::pixelFog(Float4 &visibility)
621         {
622                 Float4 &zw = visibility;
623
624                 if(state.pixelFogMode != FOG_NONE)
625                 {
626                         if(state.wBasedFog)
627                         {
628                                 zw = rhw;
629                         }
630                         else
631                         {
632                                 if(complementaryDepthBuffer)
633                                 {
634                                         zw = Float4(1.0f) - z[0];
635                                 }
636                                 else
637                                 {
638                                         zw = z[0];
639                                 }
640                         }
641                 }
642
643                 switch(state.pixelFogMode)
644                 {
645                 case FOG_NONE:
646                         break;
647                 case FOG_LINEAR:
648                         zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.scale));
649                         zw += *Pointer<Float4>(data + OFFSET(DrawData,fog.offset));
650                         break;
651                 case FOG_EXP:
652                         zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.densityE));
653                         zw = exponential2(zw, true);
654                         break;
655                 case FOG_EXP2:
656                         zw *= zw;
657                         zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.density2E));
658                         zw = exponential2(zw, true);
659                         break;
660                 default:
661                         ASSERT(false);
662                 }
663         }
664
// Writes the depth of the pixels selected by zMask for sample q of the quad at
// column x; pixels outside the mask keep the value read from the buffer.
// Does nothing when depth writes are disabled.
//
// zBuffer - depth buffer pointer.
// q       - sample index; selects the depth slice for q > 0.
// x       - quad column.
// z       - interpolated depth; replaced by the shader's oDepth when the
//           shader overrides depth.
// zMask   - pixels to write; indexes the maskD4X / invMaskD4X constant tables.
665         void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask)
666         {
667                 if(!state.depthWriteEnable)
668                 {
669                         return;
670                 }
671
672                 Float4 Z = z;
673
674                 if(shader && shader->depthOverride())
675                 {
676                         if(complementaryDepthBuffer)
677                         {
678                                 Z = Float4(1.0f) - oDepth;
679                         }
680                         else
681                         {
682                                 Z = oDepth;
683                         }
684                 }
685
686                 Pointer<Byte> buffer;
                    // pitch is only assigned, and only used, for the non-quad layout.
687                 Int pitch;
688
689                 if(!state.quadLayoutDepthBuffer)
690                 {
691                         buffer = zBuffer + 4 * x;
692                         pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
693                 }
694                 else
695                 {
696                         buffer = zBuffer + 8 * x;
697                 }
698
699                 if(q > 0)
700                 {
701                         buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
702                 }
703
704                 Float4 zValue;
705
                    // NOTE(review): depthWriteEnable is known to be true here, so this
                    // reduces to depthCompareMode != DEPTH_NEVER. With DEPTH_NEVER the load
                    // is skipped but zValue is still merged below - confirm that this path
                    // cannot be reached with a non-trivial zMask.
706                 if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
707                 {
708                         if(!state.quadLayoutDepthBuffer)
709                         {
710                                 // FIXME: Properly optimizes?
711                                 zValue.xy = *Pointer<Float4>(buffer);
712                                 zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
713                         }
714                         else
715                         {
716                                 zValue = *Pointer<Float4>(buffer, 16);
717                         }
718                 }
719
                    // Bitwise select: new depth under the mask, previous buffer contents elsewhere.
720                 Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + zMask * 16, 16));
721                 zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16));
722                 Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
723
724                 if(!state.quadLayoutDepthBuffer)
725                 {
                            // Two pixels go to this row and two to the row one pitch further on.
726                         // FIXME: Properly optimizes?
727                         *Pointer<Float2>(buffer) = Float2(Z.xy);
728                         *Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
729                 }
730                 else
731                 {
732                         *Pointer<Float4>(buffer, 16) = Z;
733                 }
734         }
735
736         void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask)
737         {
738                 if(!state.stencilActive)
739                 {
740                         return;
741                 }
742
743                 if(state.stencilPassOperation == OPERATION_KEEP && state.stencilZFailOperation == OPERATION_KEEP && state.stencilFailOperation == OPERATION_KEEP)
744                 {
745                         if(!state.twoSidedStencil || (state.stencilPassOperationCCW == OPERATION_KEEP && state.stencilZFailOperationCCW == OPERATION_KEEP && state.stencilFailOperationCCW == OPERATION_KEEP))
746                         {
747                                 return;
748                         }
749                 }
750
751                 if(state.stencilWriteMasked && (!state.twoSidedStencil || state.stencilWriteMaskedCCW))
752                 {
753                         return;
754                 }
755
756                 Pointer<Byte> buffer = sBuffer + 2 * x;
757
758                 if(q > 0)
759                 {
760                         buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
761                 }
762
763                 Byte8 bufferValue = As<Byte8>(Long1(*Pointer<UInt>(buffer)));
764
765                 Byte8 newValue;
766                 stencilOperation(newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask);
767
768                 if(!state.noStencilWriteMask)
769                 {
770                         Byte8 maskedValue = bufferValue;
771                         newValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].writeMaskQ));
772                         maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].invWriteMaskQ));
773                         newValue |= maskedValue;
774                 }
775
776                 if(state.twoSidedStencil)
777                 {
778                         Byte8 newValueCCW;
779
780                         stencilOperation(newValueCCW, bufferValue, state.stencilPassOperationCCW, state.stencilZFailOperationCCW, state.stencilFailOperationCCW, true, zMask, sMask);
781
782                         if(!state.noStencilWriteMaskCCW)
783                         {
784                                 Byte8 maskedValue = bufferValue;
785                                 newValueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].writeMaskQ));
786                                 maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].invWriteMaskQ));
787                                 newValueCCW |= maskedValue;
788                         }
789
790                         newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
791                         newValueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
792                         newValue |= newValueCCW;
793                 }
794
795                 newValue &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * cMask);
796                 bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask);
797                 newValue |= bufferValue;
798
799                 *Pointer<UInt>(buffer) = UInt(As<Long>(newValue));
800         }
801
802         void PixelRoutine::stencilOperation(Byte8 &newValue, Byte8 &bufferValue, StencilOperation stencilPassOperation, StencilOperation stencilZFailOperation, StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask)
803         {
804                 Byte8 &pass = newValue;
805                 Byte8 fail;
806                 Byte8 zFail;
807
808                 stencilOperation(pass, bufferValue, stencilPassOperation, CCW);
809
810                 if(stencilZFailOperation != stencilPassOperation)
811                 {
812                         stencilOperation(zFail, bufferValue, stencilZFailOperation, CCW);
813                 }
814
815                 if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
816                 {
817                         stencilOperation(fail, bufferValue, stencilFailOperation, CCW);
818                 }
819
820                 if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
821                 {
822                         if(state.depthTestActive && stencilZFailOperation != stencilPassOperation)   // zMask valid and values not the same
823                         {
824                                 pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * zMask);
825                                 zFail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask);
826                                 pass |= zFail;
827                         }
828
829                         pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * sMask);
830                         fail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask);
831                         pass |= fail;
832                 }
833         }
834
835         void PixelRoutine::stencilOperation(Byte8 &output, Byte8 &bufferValue, StencilOperation operation, bool CCW)
836         {
837                 switch(operation)
838                 {
839                 case OPERATION_KEEP:
840                         output = bufferValue;
841                         break;
842                 case OPERATION_ZERO:
843                         output = Byte8(0x0000000000000000);
844                         break;
845                 case OPERATION_REPLACE:
846                         output = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceQ));
847                         break;
848                 case OPERATION_INCRSAT:
849                         output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
850                         break;
851                 case OPERATION_DECRSAT:
852                         output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
853                         break;
854                 case OPERATION_INVERT:
855                         output = bufferValue ^ Byte8(0xFFFFFFFFFFFFFFFF);
856                         break;
857                 case OPERATION_INCR:
858                         output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
859                         break;
860                 case OPERATION_DECR:
861                         output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
862                         break;
863                 default:
864                         ASSERT(false);
865                 }
866         }
867
868         void PixelRoutine::blendFactor(const Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorActive)
869         {
870                 switch(blendFactorActive)
871                 {
872                 case BLEND_ZERO:
873                         // Optimized
874                         break;
875                 case BLEND_ONE:
876                         // Optimized
877                         break;
878                 case BLEND_SOURCE:
879                         blendFactor.x = current.x;
880                         blendFactor.y = current.y;
881                         blendFactor.z = current.z;
882                         break;
883                 case BLEND_INVSOURCE:
884                         blendFactor.x = Short4(0xFFFFu) - current.x;
885                         blendFactor.y = Short4(0xFFFFu) - current.y;
886                         blendFactor.z = Short4(0xFFFFu) - current.z;
887                         break;
888                 case BLEND_DEST:
889                         blendFactor.x = pixel.x;
890                         blendFactor.y = pixel.y;
891                         blendFactor.z = pixel.z;
892                         break;
893                 case BLEND_INVDEST:
894                         blendFactor.x = Short4(0xFFFFu) - pixel.x;
895                         blendFactor.y = Short4(0xFFFFu) - pixel.y;
896                         blendFactor.z = Short4(0xFFFFu) - pixel.z;
897                         break;
898                 case BLEND_SOURCEALPHA:
899                         blendFactor.x = current.w;
900                         blendFactor.y = current.w;
901                         blendFactor.z = current.w;
902                         break;
903                 case BLEND_INVSOURCEALPHA:
904                         blendFactor.x = Short4(0xFFFFu) - current.w;
905                         blendFactor.y = Short4(0xFFFFu) - current.w;
906                         blendFactor.z = Short4(0xFFFFu) - current.w;
907                         break;
908                 case BLEND_DESTALPHA:
909                         blendFactor.x = pixel.w;
910                         blendFactor.y = pixel.w;
911                         blendFactor.z = pixel.w;
912                         break;
913                 case BLEND_INVDESTALPHA:
914                         blendFactor.x = Short4(0xFFFFu) - pixel.w;
915                         blendFactor.y = Short4(0xFFFFu) - pixel.w;
916                         blendFactor.z = Short4(0xFFFFu) - pixel.w;
917                         break;
918                 case BLEND_SRCALPHASAT:
919                         blendFactor.x = Short4(0xFFFFu) - pixel.w;
920                         blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w));
921                         blendFactor.y = blendFactor.x;
922                         blendFactor.z = blendFactor.x;
923                         break;
924                 case BLEND_CONSTANT:
925                         blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[0]));
926                         blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[1]));
927                         blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[2]));
928                         break;
929                 case BLEND_INVCONSTANT:
930                         blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
931                         blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
932                         blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
933                         break;
934                 case BLEND_CONSTANTALPHA:
935                         blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
936                         blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
937                         blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
938                         break;
939                 case BLEND_INVCONSTANTALPHA:
940                         blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
941                         blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
942                         blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
943                         break;
944                 default:
945                         ASSERT(false);
946                 }
947         }
948
949         void PixelRoutine::blendFactorAlpha(const Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorAlphaActive)
950         {
951                 switch(blendFactorAlphaActive)
952                 {
953                 case BLEND_ZERO:
954                         // Optimized
955                         break;
956                 case BLEND_ONE:
957                         // Optimized
958                         break;
959                 case BLEND_SOURCE:
960                         blendFactor.w = current.w;
961                         break;
962                 case BLEND_INVSOURCE:
963                         blendFactor.w = Short4(0xFFFFu) - current.w;
964                         break;
965                 case BLEND_DEST:
966                         blendFactor.w = pixel.w;
967                         break;
968                 case BLEND_INVDEST:
969                         blendFactor.w = Short4(0xFFFFu) - pixel.w;
970                         break;
971                 case BLEND_SOURCEALPHA:
972                         blendFactor.w = current.w;
973                         break;
974                 case BLEND_INVSOURCEALPHA:
975                         blendFactor.w = Short4(0xFFFFu) - current.w;
976                         break;
977                 case BLEND_DESTALPHA:
978                         blendFactor.w = pixel.w;
979                         break;
980                 case BLEND_INVDESTALPHA:
981                         blendFactor.w = Short4(0xFFFFu) - pixel.w;
982                         break;
983                 case BLEND_SRCALPHASAT:
984                         blendFactor.w = Short4(0xFFFFu);
985                         break;
986                 case BLEND_CONSTANT:
987                 case BLEND_CONSTANTALPHA:
988                         blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
989                         break;
990                 case BLEND_INVCONSTANT:
991                 case BLEND_INVCONSTANTALPHA:
992                         blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
993                         break;
994                 default:
995                         ASSERT(false);
996                 }
997         }
998
999         void PixelRoutine::readPixel(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &pixel)
1000         {
1001                 Short4 c01;
1002                 Short4 c23;
1003                 Pointer<Byte> buffer;
1004                 Pointer<Byte> buffer2;
1005
1006                 switch(state.targetFormat[index])
1007                 {
1008                 case FORMAT_R5G6B5:
1009                         buffer = cBuffer + 2 * x;
1010                         buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1011                         c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1012
1013                         pixel.x = c01 & Short4(0xF800u);
1014                         pixel.y = (c01 & Short4(0x07E0u)) << 5;
1015                         pixel.z = (c01 & Short4(0x001Fu)) << 11;
1016                         pixel.w = Short4(0xFFFFu);
1017                         break;
1018                 case FORMAT_A8R8G8B8:
1019                         buffer = cBuffer + 4 * x;
1020                         c01 = *Pointer<Short4>(buffer);
1021                         buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1022                         c23 = *Pointer<Short4>(buffer);
1023                         pixel.z = c01;
1024                         pixel.y = c01;
1025                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1026                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1027                         pixel.x = pixel.z;
1028                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1029                         pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1030                         pixel.y = pixel.z;
1031                         pixel.w = pixel.x;
1032                         pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1033                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1034                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1035                         pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1036                         break;
1037                 case FORMAT_A8B8G8R8:
1038                         buffer = cBuffer + 4 * x;
1039                         c01 = *Pointer<Short4>(buffer);
1040                         buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1041                         c23 = *Pointer<Short4>(buffer);
1042                         pixel.z = c01;
1043                         pixel.y = c01;
1044                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1045                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1046                         pixel.x = pixel.z;
1047                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1048                         pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1049                         pixel.y = pixel.z;
1050                         pixel.w = pixel.x;
1051                         pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1052                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1053                         pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1054                         pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1055                         break;
1056                 case FORMAT_A8:
1057                         buffer = cBuffer + 1 * x;
1058                         pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 0);
1059                         buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1060                         pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 1);
1061                         pixel.w = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1062                         pixel.x = Short4(0x0000);
1063                         pixel.y = Short4(0x0000);
1064                         pixel.z = Short4(0x0000);
1065                         break;
1066                 case FORMAT_X8R8G8B8:
1067                         buffer = cBuffer + 4 * x;
1068                         c01 = *Pointer<Short4>(buffer);
1069                         buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1070                         c23 = *Pointer<Short4>(buffer);
1071                         pixel.z = c01;
1072                         pixel.y = c01;
1073                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1074                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1075                         pixel.x = pixel.z;
1076                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1077                         pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1078                         pixel.y = pixel.z;
1079                         pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1080                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1081                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1082                         pixel.w = Short4(0xFFFFu);
1083                         break;
1084                 case FORMAT_X8B8G8R8:
1085                         buffer = cBuffer + 4 * x;
1086                         c01 = *Pointer<Short4>(buffer);
1087                         buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1088                         c23 = *Pointer<Short4>(buffer);
1089                         pixel.z = c01;
1090                         pixel.y = c01;
1091                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1092                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1093                         pixel.x = pixel.z;
1094                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1095                         pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1096                         pixel.y = pixel.z;
1097                         pixel.w = pixel.x;
1098                         pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1099                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1100                         pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1101                         pixel.w = Short4(0xFFFFu);
1102                         break;
1103                 case FORMAT_A8G8R8B8Q:
1104                         UNIMPLEMENTED();
1105                 //      pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1106                 //      pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1107                 //      pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1108                 //      pixel.w = UnpackHigh(As<Byte8>(pixel.w), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1109                         break;
1110                 case FORMAT_X8G8R8B8Q:
1111                         UNIMPLEMENTED();
1112                 //      pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1113                 //      pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1114                 //      pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1115                 //      pixel.w = Short4(0xFFFFu);
1116                         break;
1117                 case FORMAT_A16B16G16R16:
1118                         buffer = cBuffer;
1119                         pixel.x = *Pointer<Short4>(buffer + 8 * x);
1120                         pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
1121                         buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1122                         pixel.z = *Pointer<Short4>(buffer + 8 * x);
1123                         pixel.w = *Pointer<Short4>(buffer + 8 * x + 8);
1124                         transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
1125                         break;
1126                 case FORMAT_G16R16:
1127                         buffer = cBuffer;
1128                         pixel.x = *Pointer<Short4>(buffer + 4 * x);
1129                         buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1130                         pixel.y = *Pointer<Short4>(buffer + 4 * x);
1131                         pixel.z = pixel.x;
1132                         pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
1133                         pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y));
1134                         pixel.y = pixel.z;
1135                         pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z));
1136                         pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z));
1137                         pixel.z = Short4(0xFFFFu);
1138                         pixel.w = Short4(0xFFFFu);
1139                         break;
1140                 default:
1141                         ASSERT(false);
1142                 }
1143
1144                 if(postBlendSRGB && state.writeSRGB)
1145                 {
1146                         sRGBtoLinear16_12_16(pixel);
1147                 }
1148         }
1149
1150         void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
1151         {
1152                 if(!state.alphaBlendActive)
1153                 {
1154                         return;
1155                 }
1156
1157                 Vector4s pixel;
1158                 readPixel(index, cBuffer, x, pixel);
1159
1160                 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
1161                 Vector4s sourceFactor;
1162                 Vector4s destFactor;
1163
1164                 blendFactor(sourceFactor, current, pixel, state.sourceBlendFactor);
1165                 blendFactor(destFactor, current, pixel, state.destBlendFactor);
1166
1167                 if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
1168                 {
1169                         current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x));
1170                         current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
1171                         current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
1172                 }
1173
1174                 if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
1175                 {
1176                         pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
1177                         pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y));
1178                         pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z));
1179                 }
1180
1181                 switch(state.blendOperation)
1182                 {
1183                 case BLENDOP_ADD:
1184                         current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1185                         current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1186                         current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1187                         break;
1188                 case BLENDOP_SUB:
1189                         current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1190                         current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1191                         current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1192                         break;
1193                 case BLENDOP_INVSUB:
1194                         current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x));
1195                         current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y));
1196                         current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z));
1197                         break;
1198                 case BLENDOP_MIN:
1199                         current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x));
1200                         current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y));
1201                         current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z));
1202                         break;
1203                 case BLENDOP_MAX:
1204                         current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x));
1205                         current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y));
1206                         current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z));
1207                         break;
1208                 case BLENDOP_SOURCE:
1209                         // No operation
1210                         break;
1211                 case BLENDOP_DEST:
1212                         current.x = pixel.x;
1213                         current.y = pixel.y;
1214                         current.z = pixel.z;
1215                         break;
1216                 case BLENDOP_NULL:
1217                         current.x = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1218                         current.y = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1219                         current.z = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1220                         break;
1221                 default:
1222                         ASSERT(false);
1223                 }
1224
1225                 blendFactorAlpha(sourceFactor, current, pixel, state.sourceBlendFactorAlpha);
1226                 blendFactorAlpha(destFactor, current, pixel, state.destBlendFactorAlpha);
1227
1228                 if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
1229                 {
1230                         current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
1231                 }
1232
1233                 if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
1234                 {
1235                         pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
1236                 }
1237
1238                 switch(state.blendOperationAlpha)
1239                 {
1240                 case BLENDOP_ADD:
1241                         current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1242                         break;
1243                 case BLENDOP_SUB:
1244                         current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1245                         break;
1246                 case BLENDOP_INVSUB:
1247                         current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w));
1248                         break;
1249                 case BLENDOP_MIN:
1250                         current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w));
1251                         break;
1252                 case BLENDOP_MAX:
1253                         current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w));
1254                         break;
1255                 case BLENDOP_SOURCE:
1256                         // No operation
1257                         break;
1258                 case BLENDOP_DEST:
1259                         current.w = pixel.w;
1260                         break;
1261                 case BLENDOP_NULL:
1262                         current.w = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1263                         break;
1264                 default:
1265                         ASSERT(false);
1266                 }
1267         }
1268
1269         void PixelRoutine::logicOperation(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
1270         {
1271                 if(state.logicalOperation == LOGICALOP_COPY)
1272                 {
1273                         return;
1274                 }
1275
1276                 Vector4s pixel;
1277                 readPixel(index, cBuffer, x, pixel);
1278
1279                 switch(state.logicalOperation)
1280                 {
1281                 case LOGICALOP_CLEAR:
1282                         current.x = 0;
1283                         current.y = 0;
1284                         current.z = 0;
1285                         break;
1286                 case LOGICALOP_SET:
1287                         current.x = 0xFFFFu;
1288                         current.y = 0xFFFFu;
1289                         current.z = 0xFFFFu;
1290                         break;
1291                 case LOGICALOP_COPY:
1292                         ASSERT(false);   // Optimized out
1293                         break;
1294                 case LOGICALOP_COPY_INVERTED:
1295                         current.x = ~current.x;
1296                         current.y = ~current.y;
1297                         current.z = ~current.z;
1298                         break;
1299                 case LOGICALOP_NOOP:
1300                         current.x = pixel.x;
1301                         current.y = pixel.y;
1302                         current.z = pixel.z;
1303                         break;
1304                 case LOGICALOP_INVERT:
1305                         current.x = ~pixel.x;
1306                         current.y = ~pixel.y;
1307                         current.z = ~pixel.z;
1308                         break;
1309                 case LOGICALOP_AND:
1310                         current.x = pixel.x & current.x;
1311                         current.y = pixel.y & current.y;
1312                         current.z = pixel.z & current.z;
1313                         break;
1314                 case LOGICALOP_NAND:
1315                         current.x = ~(pixel.x & current.x);
1316                         current.y = ~(pixel.y & current.y);
1317                         current.z = ~(pixel.z & current.z);
1318                         break;
1319                 case LOGICALOP_OR:
1320                         current.x = pixel.x | current.x;
1321                         current.y = pixel.y | current.y;
1322                         current.z = pixel.z | current.z;
1323                         break;
1324                 case LOGICALOP_NOR:
1325                         current.x = ~(pixel.x | current.x);
1326                         current.y = ~(pixel.y | current.y);
1327                         current.z = ~(pixel.z | current.z);
1328                         break;
1329                 case LOGICALOP_XOR:
1330                         current.x = pixel.x ^ current.x;
1331                         current.y = pixel.y ^ current.y;
1332                         current.z = pixel.z ^ current.z;
1333                         break;
1334                 case LOGICALOP_EQUIV:
1335                         current.x = ~(pixel.x ^ current.x);
1336                         current.y = ~(pixel.y ^ current.y);
1337                         current.z = ~(pixel.z ^ current.z);
1338                         break;
1339                 case LOGICALOP_AND_REVERSE:
1340                         current.x = ~pixel.x & current.x;
1341                         current.y = ~pixel.y & current.y;
1342                         current.z = ~pixel.z & current.z;
1343                         break;
1344                 case LOGICALOP_AND_INVERTED:
1345                         current.x = pixel.x & ~current.x;
1346                         current.y = pixel.y & ~current.y;
1347                         current.z = pixel.z & ~current.z;
1348                         break;
1349                 case LOGICALOP_OR_REVERSE:
1350                         current.x = ~pixel.x | current.x;
1351                         current.y = ~pixel.y | current.y;
1352                         current.z = ~pixel.z | current.z;
1353                         break;
1354                 case LOGICALOP_OR_INVERTED:
1355                         current.x = pixel.x | ~current.x;
1356                         current.y = pixel.y | ~current.y;
1357                         current.z = pixel.z | ~current.z;
1358                         break;
1359                 default:
1360                         ASSERT(false);
1361                 }
1362         }
1363
1364         void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &current, Int &sMask, Int &zMask, Int &cMask)
1365         {
1366                 if(postBlendSRGB && state.writeSRGB)
1367                 {
1368                         linearToSRGB16_12_16(current);
1369                 }
1370
1371                 if(exactColorRounding)
1372                 {
1373                         switch(state.targetFormat[index])
1374                         {
1375                         case FORMAT_R5G6B5:
1376                                 current.x = AddSat(As<UShort4>(current.x), UShort4(0x0400));
1377                                 current.y = AddSat(As<UShort4>(current.y), UShort4(0x0200));
1378                                 current.z = AddSat(As<UShort4>(current.z), UShort4(0x0400));
1379                                 break;
1380                         case FORMAT_X8G8R8B8Q:
1381                         case FORMAT_A8G8R8B8Q:
1382                         case FORMAT_X8R8G8B8:
1383                         case FORMAT_X8B8G8R8:
1384                         case FORMAT_A8R8G8B8:
1385                         case FORMAT_A8B8G8R8:
1386                                 current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
1387                                 current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
1388                                 current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
1389                                 current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
1390                                 break;
1391                         default:
1392                                 break;
1393                         }
1394                 }
1395
1396                 int rgbaWriteMask = state.colorWriteActive(index);
1397                 int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
1398                 int brgaWriteMask = (rgbaWriteMask & 0x00000008) | (rgbaWriteMask & 0x00000001) << 1 | (rgbaWriteMask & 0x00000002) << 1 | (rgbaWriteMask & 0x00000004) >> 2;
1399
1400                 switch(state.targetFormat[index])
1401                 {
1402                 case FORMAT_R5G6B5:
1403                         {
1404                                 current.x = current.x & Short4(0xF800u);
1405                                 current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
1406                                 current.z = As<UShort4>(current.z) >> 11;
1407
1408                                 current.x = current.x | current.y | current.z;
1409                         }
1410                         break;
1411                 case FORMAT_X8G8R8B8Q:
1412                         UNIMPLEMENTED();
1413                 //      current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1414                 //      current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1415                 //      current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1416
1417                 //      current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1418                 //      current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1419                         break;
1420                 case FORMAT_A8G8R8B8Q:
1421                         UNIMPLEMENTED();
1422                 //      current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1423                 //      current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1424                 //      current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1425                 //      current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1426
1427                 //      current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1428                 //      current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1429                         break;
1430                 case FORMAT_X8R8G8B8:
1431                 case FORMAT_A8R8G8B8:
1432                         if(state.targetFormat[index] == FORMAT_X8R8G8B8 || rgbaWriteMask == 0x7)
1433                         {
1434                                 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1435                                 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1436                                 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1437
1438                                 current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1439                                 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1440
1441                                 current.x = current.z;
1442                                 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1443                                 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1444                                 current.y = current.z;
1445                                 current.z = As<Short4>(UnpackLow(current.z, current.x));
1446                                 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1447                         }
1448                         else
1449                         {
1450                                 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1451                                 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1452                                 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1453                                 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1454
1455                                 current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1456                                 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1457
1458                                 current.x = current.z;
1459                                 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1460                                 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1461                                 current.y = current.z;
1462                                 current.z = As<Short4>(UnpackLow(current.z, current.x));
1463                                 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1464                         }
1465                         break;
1466                 case FORMAT_X8B8G8R8:
1467                 case FORMAT_A8B8G8R8:
1468                         if(state.targetFormat[index] == FORMAT_X8B8G8R8 || rgbaWriteMask == 0x7)
1469                         {
1470                                 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1471                                 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1472                                 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1473
1474                                 current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
1475                                 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1476
1477                                 current.x = current.z;
1478                                 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1479                                 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1480                                 current.y = current.z;
1481                                 current.z = As<Short4>(UnpackLow(current.z, current.x));
1482                                 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1483                         }
1484                         else
1485                         {
1486                                 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1487                                 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1488                                 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1489                                 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1490
1491                                 current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
1492                                 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1493
1494                                 current.x = current.z;
1495                                 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1496                                 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1497                                 current.y = current.z;
1498                                 current.z = As<Short4>(UnpackLow(current.z, current.x));
1499                                 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1500                         }
1501                         break;
1502                 case FORMAT_A8:
1503                         current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1504                         current.w = As<Short4>(Pack(As<UShort4>(current.w), As<UShort4>(current.w)));
1505                         break;
1506                 case FORMAT_G16R16:
1507                         current.z = current.x;
1508                         current.x = As<Short4>(UnpackLow(current.x, current.y));
1509                         current.z = As<Short4>(UnpackHigh(current.z, current.y));
1510                         current.y = current.z;
1511                         break;
1512                 case FORMAT_A16B16G16R16:
1513                         transpose4x4(current.x, current.y, current.z, current.w);
1514                         break;
1515                 default:
1516                         ASSERT(false);
1517                 }
1518
1519                 Short4 c01 = current.z;
1520                 Short4 c23 = current.y;
1521
1522                 Int xMask;   // Combination of all masks
1523
1524                 if(state.depthTestActive)
1525                 {
1526                         xMask = zMask;
1527                 }
1528                 else
1529                 {
1530                         xMask = cMask;
1531                 }
1532
1533                 if(state.stencilActive)
1534                 {
1535                         xMask &= sMask;
1536                 }
1537
1538                 switch(state.targetFormat[index])
1539                 {
1540                 case FORMAT_R5G6B5:
1541                         {
1542                                 Pointer<Byte> buffer = cBuffer + 2 * x;
1543                                 Int value = *Pointer<Int>(buffer);
1544
1545                                 Int c01 = Extract(As<Int2>(current.x), 0);
1546
1547                                 if((bgraWriteMask & 0x00000007) != 0x00000007)
1548                                 {
1549                                         Int masked = value;
1550                                         c01 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1551                                         masked &= *Pointer<Int>(constants + OFFSET(Constants,invMask565Q[bgraWriteMask & 0x7][0]));
1552                                         c01 |= masked;
1553                                 }
1554
1555                                 c01 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
1556                                 value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][0]) + xMask * 8);
1557                                 c01 |= value;
1558                                 *Pointer<Int>(buffer) = c01;
1559
1560                                 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1561                                 value = *Pointer<Int>(buffer);
1562
1563                                 Int c23 = Extract(As<Int2>(current.x), 1);
1564
1565                                 if((bgraWriteMask & 0x00000007) != 0x00000007)
1566                                 {
1567                                         Int masked = value;
1568                                         c23 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1569                                         masked &= *Pointer<Int>(constants + OFFSET(Constants,invMask565Q[bgraWriteMask & 0x7][0]));
1570                                         c23 |= masked;
1571                                 }
1572
1573                                 c23 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
1574                                 value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][2]) + xMask * 8);
1575                                 c23 |= value;
1576                                 *Pointer<Int>(buffer) = c23;
1577                         }
1578                         break;
1579                 case FORMAT_A8G8R8B8Q:
1580                 case FORMAT_X8G8R8B8Q:   // FIXME: Don't touch alpha?
1581                         UNIMPLEMENTED();
1582                 //      value = *Pointer<Short4>(cBuffer + 8 * x + 0);
1583
1584                 //      if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1585                 //         ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1586                 //          (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1587                 //      {
1588                 //              Short4 masked = value;
1589                 //              c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1590                 //              masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1591                 //              c01 |= masked;
1592                 //      }
1593
1594                 //      c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1595                 //      value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1596                 //      c01 |= value;
1597                 //      *Pointer<Short4>(cBuffer + 8 * x + 0) = c01;
1598
1599                 //      value = *Pointer<Short4>(cBuffer + 8 * x + 8);
1600
1601                 //      if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1602                 //         ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1603                 //          (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1604                 //      {
1605                 //              Short4 masked = value;
1606                 //              c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1607                 //              masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1608                 //              c23 |= masked;
1609                 //      }
1610
1611                 //      c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1612                 //      value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1613                 //      c23 |= value;
1614                 //      *Pointer<Short4>(cBuffer + 8 * x + 8) = c23;
1615                         break;
1616                 case FORMAT_A8R8G8B8:
1617                 case FORMAT_X8R8G8B8:   // FIXME: Don't touch alpha?
1618                         {
1619                                 Pointer<Byte> buffer = cBuffer + x * 4;
1620                                 Short4 value = *Pointer<Short4>(buffer);
1621
1622                                 if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1623                                    ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1624                                         (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1625                                 {
1626                                         Short4 masked = value;
1627                                         c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1628                                         masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1629                                         c01 |= masked;
1630                                 }
1631
1632                                 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1633                                 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1634                                 c01 |= value;
1635                                 *Pointer<Short4>(buffer) = c01;
1636
1637                                 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1638                                 value = *Pointer<Short4>(buffer);
1639
1640                                 if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1641                                    ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1642                                         (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1643                                 {
1644                                         Short4 masked = value;
1645                                         c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1646                                         masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1647                                         c23 |= masked;
1648                                 }
1649
1650                                 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1651                                 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1652                                 c23 |= value;
1653                                 *Pointer<Short4>(buffer) = c23;
1654                         }
1655                         break;
1656                 case FORMAT_A8B8G8R8:
1657                 case FORMAT_X8B8G8R8:   // FIXME: Don't touch alpha?
1658                         {
1659                                 Pointer<Byte> buffer = cBuffer + x * 4;
1660                                 Short4 value = *Pointer<Short4>(buffer);
1661
1662                                 if((state.targetFormat[index] == FORMAT_A8B8G8R8 && rgbaWriteMask != 0x0000000F) ||
1663                                    ((state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x00000007) &&
1664                                         (state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x0000000F)))   // FIXME: Need for masking when XBGR && Fh?
1665                                 {
1666                                         Short4 masked = value;
1667                                         c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1668                                         masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1669                                         c01 |= masked;
1670                                 }
1671
1672                                 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1673                                 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1674                                 c01 |= value;
1675                                 *Pointer<Short4>(buffer) = c01;
1676
1677                                 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1678                                 value = *Pointer<Short4>(buffer);
1679
1680                                 if((state.targetFormat[index] == FORMAT_A8B8G8R8 && rgbaWriteMask != 0x0000000F) ||
1681                                    ((state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x00000007) &&
1682                                         (state.targetFormat[index] == FORMAT_X8B8G8R8 && rgbaWriteMask != 0x0000000F)))   // FIXME: Need for masking when XBGR && Fh?
1683                                 {
1684                                         Short4 masked = value;
1685                                         c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1686                                         masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1687                                         c23 |= masked;
1688                                 }
1689
1690                                 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1691                                 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1692                                 c23 |= value;
1693                                 *Pointer<Short4>(buffer) = c23;
1694                         }
1695                         break;
1696                 case FORMAT_A8:
1697                         if(rgbaWriteMask & 0x00000008)
1698                         {
1699                                 Pointer<Byte> buffer = cBuffer + 1 * x;
1700                                 Short4 value;
1701                                 Insert(value, *Pointer<Short>(buffer), 0);
1702                                 Int pitch = *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1703                                 Insert(value, *Pointer<Short>(buffer + pitch), 1);
1704                                 value = UnpackLow(As<Byte8>(value), As<Byte8>(value));
1705
1706                                 current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q) + 8 * xMask);
1707                                 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q) + 8 * xMask);
1708                                 current.w |= value;
1709
1710                                 *Pointer<Short>(buffer) = Extract(current.w, 0);
1711                                 *Pointer<Short>(buffer + pitch) = Extract(current.w, 1);
1712                         }
1713                         break;
1714                 case FORMAT_G16R16:
1715                         {
1716                                 Pointer<Byte> buffer = cBuffer + 4 * x;
1717
1718                                 Short4 value = *Pointer<Short4>(buffer);
1719
1720                                 if((rgbaWriteMask & 0x00000003) != 0x00000003)
1721                                 {
1722                                         Short4 masked = value;
1723                                         current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1724                                         masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0]));
1725                                         current.x |= masked;
1726                                 }
1727
1728                                 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1729                                 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1730                                 current.x |= value;
1731                                 *Pointer<Short4>(buffer) = current.x;
1732
1733                                 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1734
1735                                 value = *Pointer<Short4>(buffer);
1736
1737                                 if((rgbaWriteMask & 0x00000003) != 0x00000003)
1738                                 {
1739                                         Short4 masked = value;
1740                                         current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1741                                         masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0]));
1742                                         current.y |= masked;
1743                                 }
1744
1745                                 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1746                                 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1747                                 current.y |= value;
1748                                 *Pointer<Short4>(buffer) = current.y;
1749                         }
1750                         break;
1751                 case FORMAT_A16B16G16R16:
1752                         {
1753                                 Pointer<Byte> buffer = cBuffer + 8 * x;
1754
1755                                 {
1756                                         Short4 value = *Pointer<Short4>(buffer);
1757
1758                                         if(rgbaWriteMask != 0x0000000F)
1759                                         {
1760                                                 Short4 masked = value;
1761                                                 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1762                                                 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1763                                                 current.x |= masked;
1764                                         }
1765
1766                                         current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
1767                                         value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
1768                                         current.x |= value;
1769                                         *Pointer<Short4>(buffer) = current.x;
1770                                 }
1771
1772                                 {
1773                                         Short4 value = *Pointer<Short4>(buffer + 8);
1774
1775                                         if(rgbaWriteMask != 0x0000000F)
1776                                         {
1777                                                 Short4 masked = value;
1778                                                 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1779                                                 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1780                                                 current.y |= masked;
1781                                         }
1782
1783                                         current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
1784                                         value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
1785                                         current.y |= value;
1786                                         *Pointer<Short4>(buffer + 8) = current.y;
1787                                 }
1788
1789                                 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1790
1791                                 {
1792                                         Short4 value = *Pointer<Short4>(buffer);
1793
1794                                         if(rgbaWriteMask != 0x0000000F)
1795                                         {
1796                                                 Short4 masked = value;
1797                                                 current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1798                                                 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1799                                                 current.z |= masked;
1800                                         }
1801
1802                                         current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
1803                                         value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
1804                                         current.z |= value;
1805                                         *Pointer<Short4>(buffer) = current.z;
1806                                 }
1807
1808                                 {
1809                                         Short4 value = *Pointer<Short4>(buffer + 8);
1810
1811                                         if(rgbaWriteMask != 0x0000000F)
1812                                         {
1813                                                 Short4 masked = value;
1814                                                 current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1815                                                 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1816                                                 current.w |= masked;
1817                                         }
1818
1819                                         current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
1820                                         value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
1821                                         current.w |= value;
1822                                         *Pointer<Short4>(buffer + 8) = current.w;
1823                                 }
1824                         }
1825                         break;
1826                 default:
1827                         ASSERT(false);
1828                 }
1829         }
1830
1831         void PixelRoutine::blendFactor(const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorActive)
1832         {
1833                 switch(blendFactorActive)
1834                 {
1835                 case BLEND_ZERO:
1836                         // Optimized
1837                         break;
1838                 case BLEND_ONE:
1839                         // Optimized
1840                         break;
1841                 case BLEND_SOURCE:
1842                         blendFactor.x = oC.x;
1843                         blendFactor.y = oC.y;
1844                         blendFactor.z = oC.z;
1845                         break;
1846                 case BLEND_INVSOURCE:
1847                         blendFactor.x = Float4(1.0f) - oC.x;
1848                         blendFactor.y = Float4(1.0f) - oC.y;
1849                         blendFactor.z = Float4(1.0f) - oC.z;
1850                         break;
1851                 case BLEND_DEST:
1852                         blendFactor.x = pixel.x;
1853                         blendFactor.y = pixel.y;
1854                         blendFactor.z = pixel.z;
1855                         break;
1856                 case BLEND_INVDEST:
1857                         blendFactor.x = Float4(1.0f) - pixel.x;
1858                         blendFactor.y = Float4(1.0f) - pixel.y;
1859                         blendFactor.z = Float4(1.0f) - pixel.z;
1860                         break;
1861                 case BLEND_SOURCEALPHA:
1862                         blendFactor.x = oC.w;
1863                         blendFactor.y = oC.w;
1864                         blendFactor.z = oC.w;
1865                         break;
1866                 case BLEND_INVSOURCEALPHA:
1867                         blendFactor.x = Float4(1.0f) - oC.w;
1868                         blendFactor.y = Float4(1.0f) - oC.w;
1869                         blendFactor.z = Float4(1.0f) - oC.w;
1870                         break;
1871                 case BLEND_DESTALPHA:
1872                         blendFactor.x = pixel.w;
1873                         blendFactor.y = pixel.w;
1874                         blendFactor.z = pixel.w;
1875                         break;
1876                 case BLEND_INVDESTALPHA:
1877                         blendFactor.x = Float4(1.0f) - pixel.w;
1878                         blendFactor.y = Float4(1.0f) - pixel.w;
1879                         blendFactor.z = Float4(1.0f) - pixel.w;
1880                         break;
1881                 case BLEND_SRCALPHASAT:
1882                         blendFactor.x = Float4(1.0f) - pixel.w;
1883                         blendFactor.x = Min(blendFactor.x, oC.w);
1884                         blendFactor.y = blendFactor.x;
1885                         blendFactor.z = blendFactor.x;
1886                         break;
1887                 case BLEND_CONSTANT:
1888                         blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[0]));
1889                         blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[1]));
1890                         blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[2]));
1891                         break;
1892                 case BLEND_INVCONSTANT:
1893                         blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
1894                         blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
1895                         blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
1896                         break;
1897                 default:
1898                         ASSERT(false);
1899                 }
1900         }
1901
1902         void PixelRoutine::blendFactorAlpha(const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorAlphaActive)
1903         {
1904                 switch(blendFactorAlphaActive)
1905                 {
1906                 case BLEND_ZERO:
1907                         // Optimized
1908                         break;
1909                 case BLEND_ONE:
1910                         // Optimized
1911                         break;
1912                 case BLEND_SOURCE:
1913                         blendFactor.w = oC.w;
1914                         break;
1915                 case BLEND_INVSOURCE:
1916                         blendFactor.w = Float4(1.0f) - oC.w;
1917                         break;
1918                 case BLEND_DEST:
1919                         blendFactor.w = pixel.w;
1920                         break;
1921                 case BLEND_INVDEST:
1922                         blendFactor.w = Float4(1.0f) - pixel.w;
1923                         break;
1924                 case BLEND_SOURCEALPHA:
1925                         blendFactor.w = oC.w;
1926                         break;
1927                 case BLEND_INVSOURCEALPHA:
1928                         blendFactor.w = Float4(1.0f) - oC.w;
1929                         break;
1930                 case BLEND_DESTALPHA:
1931                         blendFactor.w = pixel.w;
1932                         break;
1933                 case BLEND_INVDESTALPHA:
1934                         blendFactor.w = Float4(1.0f) - pixel.w;
1935                         break;
1936                 case BLEND_SRCALPHASAT:
1937                         blendFactor.w = Float4(1.0f);
1938                         break;
1939                 case BLEND_CONSTANT:
1940                         blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
1941                         break;
1942                 case BLEND_INVCONSTANT:
1943                         blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
1944                         break;
1945                 default:
1946                         ASSERT(false);
1947                 }
1948         }
1949
1950         void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x)
1951         {
1952                 if(!state.alphaBlendActive)
1953                 {
1954                         return;
1955                 }
1956
1957                 Pointer<Byte> buffer;
1958                 Vector4f pixel;
1959
1960                 Vector4s color;
1961                 Short4 c01;
1962                 Short4 c23;
1963
1964                 Float4 one;
1965                 switch(state.targetFormat[index])
1966                 {
1967                 case FORMAT_R32I:
1968                 case FORMAT_G32R32I:
1969                         one = As<Float4>(Int4(0x7FFFFFFF));
1970                         break;
1971                 case FORMAT_R32UI:
1972                 case FORMAT_G32R32UI:
1973                         one = As<Float4>(Int4(0xFFFFFFFF));
1974                         break;
1975                 case FORMAT_R32F:
1976                 case FORMAT_G32R32F:
1977                         one = Float4(1.0f);
1978                         break;
1979                 }
1980
1981                 switch(state.targetFormat[index])
1982                 {
1983                 case FORMAT_R32I:
1984                 case FORMAT_R32UI:
1985                 case FORMAT_R32F:
1986                         buffer = cBuffer;
1987                         // FIXME: movlps
1988                         pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0);
1989                         pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4);
1990                         buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1991                         // FIXME: movhps
1992                         pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0);
1993                         pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4);
1994                         pixel.y = pixel.z = pixel.w = one;
1995                         break;
1996                 case FORMAT_G32R32I:
1997                 case FORMAT_G32R32UI:
1998                 case FORMAT_G32R32F:
1999                         buffer = cBuffer;
2000                         pixel.x = *Pointer<Float4>(buffer + 8 * x, 16);
2001                         buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2002                         pixel.y = *Pointer<Float4>(buffer + 8 * x, 16);
2003                         pixel.z = pixel.x;
2004                         pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x88);
2005                         pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0xDD);
2006                         pixel.y = pixel.z;
2007                         pixel.z = pixel.w = one;
2008                         break;
2009                 case FORMAT_X32B32G32R32F:
2010                 case FORMAT_A32B32G32R32F:
2011                 case FORMAT_A32B32G32R32I:
2012                 case FORMAT_A32B32G32R32UI:
2013                         buffer = cBuffer;
2014                         pixel.x = *Pointer<Float4>(buffer + 16 * x, 16);
2015                         pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2016                         buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2017                         pixel.z = *Pointer<Float4>(buffer + 16 * x, 16);
2018                         pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2019                         transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
2020                         if(state.targetFormat[index] == FORMAT_X32B32G32R32F)
2021                         {
2022                                 pixel.w = Float4(1.0f);
2023                         }
2024                         break;
2025                 default:
2026                         ASSERT(false);
2027                 }
2028
2029                 if(postBlendSRGB && state.writeSRGB)
2030                 {
2031                         sRGBtoLinear(pixel.x);
2032                         sRGBtoLinear(pixel.y);
2033                         sRGBtoLinear(pixel.z);
2034                 }
2035
2036                 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
2037                 Vector4f sourceFactor;
2038                 Vector4f destFactor;
2039
2040                 blendFactor(sourceFactor, oC, pixel, state.sourceBlendFactor);
2041                 blendFactor(destFactor, oC, pixel, state.destBlendFactor);
2042
2043                 if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
2044                 {
2045                         oC.x *= sourceFactor.x;
2046                         oC.y *= sourceFactor.y;
2047                         oC.z *= sourceFactor.z;
2048                 }
2049
2050                 if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
2051                 {
2052                         pixel.x *= destFactor.x;
2053                         pixel.y *= destFactor.y;
2054                         pixel.z *= destFactor.z;
2055                 }
2056
2057                 switch(state.blendOperation)
2058                 {
2059                 case BLENDOP_ADD:
2060                         oC.x += pixel.x;
2061                         oC.y += pixel.y;
2062                         oC.z += pixel.z;
2063                         break;
2064                 case BLENDOP_SUB:
2065                         oC.x -= pixel.x;
2066                         oC.y -= pixel.y;
2067                         oC.z -= pixel.z;
2068                         break;
2069                 case BLENDOP_INVSUB:
2070                         oC.x = pixel.x - oC.x;
2071                         oC.y = pixel.y - oC.y;
2072                         oC.z = pixel.z - oC.z;
2073                         break;
2074                 case BLENDOP_MIN:
2075                         oC.x = Min(oC.x, pixel.x);
2076                         oC.y = Min(oC.y, pixel.y);
2077                         oC.z = Min(oC.z, pixel.z);
2078                         break;
2079                 case BLENDOP_MAX:
2080                         oC.x = Max(oC.x, pixel.x);
2081                         oC.y = Max(oC.y, pixel.y);
2082                         oC.z = Max(oC.z, pixel.z);
2083                         break;
2084                 case BLENDOP_SOURCE:
2085                         // No operation
2086                         break;
2087                 case BLENDOP_DEST:
2088                         oC.x = pixel.x;
2089                         oC.y = pixel.y;
2090                         oC.z = pixel.z;
2091                         break;
2092                 case BLENDOP_NULL:
2093                         oC.x = Float4(0.0f);
2094                         oC.y = Float4(0.0f);
2095                         oC.z = Float4(0.0f);
2096                         break;
2097                 default:
2098                         ASSERT(false);
2099                 }
2100
2101                 blendFactorAlpha(sourceFactor, oC, pixel, state.sourceBlendFactorAlpha);
2102                 blendFactorAlpha(destFactor, oC, pixel, state.destBlendFactorAlpha);
2103
2104                 if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
2105                 {
2106                         oC.w *= sourceFactor.w;
2107                 }
2108
2109                 if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
2110                 {
2111                         pixel.w *= destFactor.w;
2112                 }
2113
2114                 switch(state.blendOperationAlpha)
2115                 {
2116                 case BLENDOP_ADD:
2117                         oC.w += pixel.w;
2118                         break;
2119                 case BLENDOP_SUB:
2120                         oC.w -= pixel.w;
2121                         break;
2122                 case BLENDOP_INVSUB:
2123                         pixel.w -= oC.w;
2124                         oC.w = pixel.w;
2125                         break;
2126                 case BLENDOP_MIN:
2127                         oC.w = Min(oC.w, pixel.w);
2128                         break;
2129                 case BLENDOP_MAX:
2130                         oC.w = Max(oC.w, pixel.w);
2131                         break;
2132                 case BLENDOP_SOURCE:
2133                         // No operation
2134                         break;
2135                 case BLENDOP_DEST:
2136                         oC.w = pixel.w;
2137                         break;
2138                 case BLENDOP_NULL:
2139                         oC.w = Float4(0.0f);
2140                         break;
2141                 default:
2142                         ASSERT(false);
2143                 }
2144         }
2145
2146         void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask)
2147         {
2148                 switch(state.targetFormat[index])
2149                 {
2150                 case FORMAT_R32F:
2151                 case FORMAT_R32I:
2152                 case FORMAT_R32UI:
2153                         break;
2154                 case FORMAT_G32R32F:
2155                 case FORMAT_G32R32I:
2156                 case FORMAT_G32R32UI:
2157                         oC.z = oC.x;
2158                         oC.x = UnpackLow(oC.x, oC.y);
2159                         oC.z = UnpackHigh(oC.z, oC.y);
2160                         oC.y = oC.z;
2161                         break;
2162                 case FORMAT_X32B32G32R32F:
2163                 case FORMAT_A32B32G32R32F:
2164                 case FORMAT_A32B32G32R32I:
2165                 case FORMAT_A32B32G32R32UI:
2166                         transpose4x4(oC.x, oC.y, oC.z, oC.w);
2167                         break;
2168                 default:
2169                         ASSERT(false);
2170                 }
2171
2172                 int rgbaWriteMask = state.colorWriteActive(index);
2173
2174                 Int xMask;   // Combination of all masks
2175
2176                 if(state.depthTestActive)
2177                 {
2178                         xMask = zMask;
2179                 }
2180                 else
2181                 {
2182                         xMask = cMask;
2183                 }
2184
2185                 if(state.stencilActive)
2186                 {
2187                         xMask &= sMask;
2188                 }
2189
2190                 Pointer<Byte> buffer;
2191                 Float4 value;
2192
2193                 switch(state.targetFormat[index])
2194                 {
2195                 case FORMAT_R32F:
2196                 case FORMAT_R32I:
2197                 case FORMAT_R32UI:
2198                         if(rgbaWriteMask & 0x00000001)
2199                         {
2200                                 buffer = cBuffer + 4 * x;
2201
2202                                 // FIXME: movlps
2203                                 value.x = *Pointer<Float>(buffer + 0);
2204                                 value.y = *Pointer<Float>(buffer + 4);
2205
2206                                 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2207
2208                                 // FIXME: movhps
2209                                 value.z = *Pointer<Float>(buffer + 0);
2210                                 value.w = *Pointer<Float>(buffer + 4);
2211
2212                                 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + xMask * 16, 16));
2213                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16));
2214                                 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2215
2216                                 // FIXME: movhps
2217                                 *Pointer<Float>(buffer + 0) = oC.x.z;
2218                                 *Pointer<Float>(buffer + 4) = oC.x.w;
2219
2220                                 buffer -= *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2221
2222                                 // FIXME: movlps
2223                                 *Pointer<Float>(buffer + 0) = oC.x.x;
2224                                 *Pointer<Float>(buffer + 4) = oC.x.y;
2225                         }
2226                         break;
2227                 case FORMAT_G32R32F:
2228                 case FORMAT_G32R32I:
2229                 case FORMAT_G32R32UI:
2230                         buffer = cBuffer + 8 * x;
2231
2232                         value = *Pointer<Float4>(buffer);
2233
2234                         if((rgbaWriteMask & 0x00000003) != 0x00000003)
2235                         {
2236                                 Float4 masked = value;
2237                                 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2238                                 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD01X[rgbaWriteMask & 0x3][0])));
2239                                 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2240                         }
2241
2242                         oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16));
2243                         value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16));
2244                         oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2245                         *Pointer<Float4>(buffer) = oC.x;
2246
2247                         buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2248
2249                         value = *Pointer<Float4>(buffer);
2250
2251                         if((rgbaWriteMask & 0x00000003) != 0x00000003)
2252                         {
2253                                 Float4 masked;
2254
2255                                 masked = value;
2256                                 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2257                                 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD01X[rgbaWriteMask & 0x3][0])));
2258                                 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2259                         }
2260
2261                         oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16));
2262                         value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16));
2263                         oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2264                         *Pointer<Float4>(buffer) = oC.y;
2265                         break;
2266                 case FORMAT_X32B32G32R32F:
2267                 case FORMAT_A32B32G32R32F:
2268                 case FORMAT_A32B32G32R32I:
2269                 case FORMAT_A32B32G32R32UI:
2270                         buffer = cBuffer + 16 * x;
2271
2272                         {
2273                                 value = *Pointer<Float4>(buffer, 16);
2274
2275                                 if(rgbaWriteMask != 0x0000000F)
2276                                 {
2277                                         Float4 masked = value;
2278                                         oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2279                                         masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2280                                         oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2281                                 }
2282
2283                                 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskX0X) + xMask * 16, 16));
2284                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16));
2285                                 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2286                                 *Pointer<Float4>(buffer, 16) = oC.x;
2287                         }
2288
2289                         {
2290                                 value = *Pointer<Float4>(buffer + 16, 16);
2291
2292                                 if(rgbaWriteMask != 0x0000000F)
2293                                 {
2294                                         Float4 masked = value;
2295                                         oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2296                                         masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2297                                         oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2298                                 }
2299
2300                                 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskX1X) + xMask * 16, 16));
2301                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16));
2302                                 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2303                                 *Pointer<Float4>(buffer + 16, 16) = oC.y;
2304                         }
2305
2306                         buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2307
2308                         {
2309                                 value = *Pointer<Float4>(buffer, 16);
2310
2311                                 if(rgbaWriteMask != 0x0000000F)
2312                                 {
2313                                         Float4 masked = value;
2314                                         oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2315                                         masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2316                                         oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked));
2317                                 }
2318
2319                                 oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskX2X) + xMask * 16, 16));
2320                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16));
2321                                 oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value));
2322                                 *Pointer<Float4>(buffer, 16) = oC.z;
2323                         }
2324
2325                         {
2326                                 value = (state.targetFormat[index] == FORMAT_X32B32G32R32F) ? Float4(1.0f) : *Pointer<Float4>(buffer + 16, 16);
2327
2328                                 if(rgbaWriteMask != 0x0000000F)
2329                                 {
2330                                         Float4 masked = value;
2331                                         oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2332                                         masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2333                                         oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked));
2334                                 }
2335
2336                                 oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskX3X) + xMask * 16, 16));
2337                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16));
2338                                 oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value));
2339                                 *Pointer<Float4>(buffer + 16, 16) = oC.w;
2340                         }
2341                         break;
2342                 default:
2343                         ASSERT(false);
2344                 }
2345         }
2346
2347         UShort4 PixelRoutine::convertFixed16(Float4 &cf, bool saturate)
2348         {
2349                 return UShort4(cf * Float4(0xFFFF), saturate);
2350         }
2351
2352         void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c)
2353         {
2354                 c.x = As<UShort4>(c.x) >> 4;
2355                 c.y = As<UShort4>(c.y) >> 4;
2356                 c.z = As<UShort4>(c.z) >> 4;
2357
2358                 sRGBtoLinear12_16(c);
2359         }
2360
2361         void PixelRoutine::sRGBtoLinear12_16(Vector4s &c)
2362         {
2363                 Pointer<Byte> LUT = constants + OFFSET(Constants,sRGBtoLinear12_16);
2364
2365                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2366                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2367                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2368                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2369
2370                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2371                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2372                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2373                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2374
2375                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2376                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2377                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2378                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2379         }
2380
2381         void PixelRoutine::linearToSRGB16_12_16(Vector4s &c)
2382         {
2383                 c.x = As<UShort4>(c.x) >> 4;
2384                 c.y = As<UShort4>(c.y) >> 4;
2385                 c.z = As<UShort4>(c.z) >> 4;
2386
2387                 linearToSRGB12_16(c);
2388         }
2389
2390         void PixelRoutine::linearToSRGB12_16(Vector4s &c)
2391         {
2392                 Pointer<Byte> LUT = constants + OFFSET(Constants,linearToSRGB12_16);
2393
2394                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2395                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2396                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2397                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2398
2399                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2400                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2401                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2402                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2403
2404                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2405                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2406                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2407                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2408         }
2409
2410         Float4 PixelRoutine::sRGBtoLinear(const Float4 &x)   // Approximates x^2.2
2411         {
2412                 Float4 linear = x * x;
2413                 linear = linear * Float4(0.73f) + linear * x * Float4(0.27f);
2414
2415                 return Min(Max(linear, Float4(0.0f)), Float4(1.0f));
2416         }
2417
2418         bool PixelRoutine::colorUsed()
2419         {
2420                 return state.colorWriteMask || state.alphaTestActive() || state.shaderContainsKill;
2421         }
2422 }